%PDF- %PDF-
| Direktori : /www/varak.net/wiki.varak.net/extensions/CirrusSearch/vendor/wikimedia/textcat/ |
| Current File : //www/varak.net/wiki.varak.net/extensions/CirrusSearch/vendor/wikimedia/textcat/catus.php |
<?php
/**
* Classify texts using ngrams. See help below for options.
*/
require_once __DIR__.'/TextCat.php';
// Parse the command line. In the getopt() spec a letter followed by ':' takes
// a value; 'h' is a bare flag.
$options = getopt( 'a:c:d:f:t:u:l:h' );
// -h: print the usage text and exit successfully without classifying anything.
if ( isset( $options['h'] ) ) {
	// The synopsis lists every option accepted above, including -c.
	$help = <<<HELP
{$argv[0]} [-d Dir] [-c Lang,Lang,...] [-a Int] [-f Int] [-l Text] [-t Int] [-u Float]
-a NUM the program returns the best-scoring language together
with all languages which are <N times worse (set by option -u).
If the number of languages to be printed is larger than the value
of this option then no language is returned, but
instead a message that the input is of an unknown language is
printed. Default: 10.
-c LANG,LANG,...
lists the candidate languages. Only languages listed will be
considered for detection.
-d DIR indicates in which directory the language models are
located (files ending in .lm). Currently only a single
directory is supported. Default: ./LM .
-f NUM Before sorting is performed the Ngrams which occur this number
of times or less are removed. This can be used to speed up
the program for longer inputs. For short inputs you should use
the default or -f 0. Default: 0.
-l TEXT indicates that input is given as an argument on the command line,
e.g. {$argv[0]} -l "this is english text"
If this option is not given, the input is stdin.
-t NUM indicates the topmost number of ngrams that should be used.
Default: 3000
-u NUM determines how much worse result must be in order not to be
mentioned as an alternative. Typical value: 1.05 or 1.1.
Default: 1.05.
HELP;
	// A heredoc does not include the line break before its closing marker, so
	// terminate the output explicitly, like every other message of this script.
	echo $help . "\n";
	exit( 0 );
}
// Language-model directory: the -d value when given and non-empty, otherwise
// the "LM" directory next to this script.
$dir = empty( $options['d'] ) ? __DIR__ . "/LM" : $options['d'];
$cat = new TextCat( $dir );
// -t: integer value handed to setMaxNgrams(). An absent, empty or "0" value
// leaves the classifier's own setting untouched.
if ( !empty( $options['t'] ) ) {
	$cat->setMaxNgrams( (int)$options['t'] );
}
// -f: integer value handed to setMinFreq(), with the same "skip when empty" rule.
if ( !empty( $options['f'] ) ) {
	$cat->setMinFreq( (int)$options['f'] );
}
// Text to classify: the -l argument whenever it is present (even if empty),
// otherwise everything available on standard input.
if ( isset( $options['l'] ) ) {
	$input = $options['l'];
} else {
	$input = file_get_contents( "php://stdin" );
}
// With -c, pass the comma-separated language list as the second argument to
// classify(); without it, call classify() with the text alone.
$result = empty( $options['c'] )
	? $cat->classify( $input )
	: $cat->classify( $input, explode( ",", $options['c'] ) );
// Nothing came back from the classifier: report it and exit with a failure status.
if ( !$result ) {
	echo "No match found.\n";
	exit( 1 );
}
// getopt() delivers option values as raw strings (or arrays when an option is
// repeated). Validate and convert -u and -a before using them in arithmetic
// and comparisons, the same way -t and -f are converted with intval(); a
// non-numeric value falls back to the documented default instead of raising
// an error or silently changing the comparison semantics.
$ratio = 1.05;
if ( !empty( $options['u'] ) ) {
	if ( is_numeric( $options['u'] ) ) {
		$ratio = floatval( $options['u'] );
	} else {
		fwrite( STDERR, "Ignoring non-numeric value for -u, using 1.05.\n" );
	}
}
$top = 10;
if ( !empty( $options['a'] ) ) {
	if ( is_numeric( $options['a'] ) ) {
		$top = intval( $options['a'] );
	} else {
		fwrite( STDERR, "Ignoring non-numeric value for -a, using 10.\n" );
	}
}
// The first entry of $result is treated as the best score; only entries whose
// score is strictly below best * ratio are kept as alternatives.
// NOTE(review): this relies on classify() returning its results best-first - confirm.
$max = reset( $result ) * $ratio;
$result = array_filter( $result, function ( $res ) use ( $max ) {
	return $res < $max;
} );
// Print the surviving language codes when there is at least one and no more
// than the -a limit; otherwise report that the language is undetermined.
if ( $result && count( $result ) <= $top ) {
	echo implode( " OR ", array_keys( $result ) ) . "\n";
	exit( 0 );
} else {
	echo "Cannot determine language.\n";
	exit( 1 );
}