%PDF- %PDF-
Direktori : /www/varak.net/wiki.varak.net/extensions/CirrusSearch/includes/LanguageDetector/ |
Current File : /www/varak.net/wiki.varak.net/extensions/CirrusSearch/includes/LanguageDetector/ElasticSearch.php |
<?php

namespace CirrusSearch\LanguageDetector;

use CirrusSearch;
use Elastica\Request;
use Elastica\Exception\ResponseException;
use MediaWiki\Logger\LoggerFactory;

/**
 * Try to detect language using langdetect plugin
 * See: https://github.com/jprante/elasticsearch-langdetect
 */
class ElasticSearch implements Detector {
	/**
	 * Detect the language of a piece of text by POSTing it to the
	 * "_langdetect" endpoint of the Elasticsearch client.
	 *
	 * Decision rules applied to the "languages" list of the response:
	 *  - exactly one candidate: that candidate's language is returned;
	 *  - exactly two candidates: the first one is returned only when its
	 *    probability is more than twice the probability of the second;
	 *  - anything else (no candidates, three or more, non-OK response,
	 *    ResponseException while requesting): null.
	 *
	 * @param CirrusSearch $cirrus Searching class
	 * @param string $text Text to detect language
	 * @return string|null Preferred language, or null if none found
	 */
	public function detect( CirrusSearch $cirrus, $text ) {
		$client = $cirrus->getConnection()->getClient();

		try {
			$response = $this->request( $client, $text );
		} catch ( ResponseException $e ) {
			// This happens when language detection is not configured
			LoggerFactory::getInstance( 'CirrusSearch' )->warning(
				"Could not connect to language detector: {exception}",
				[ "exception" => $e ]
			);
			return null;
		}

		if ( !$response->isOk() ) {
			return null;
		}

		$value = $response->getData();
		if ( !$value || empty( $value['languages'] ) ) {
			return null;
		}

		$langs = $value['languages'];
		$numLangs = count( $langs );

		if ( $numLangs === 1 ) {
			// TODO: add minimal threshold
			return $langs[0]['language'];
		}

		// FIXME: here I'm just winging it, should be something
		// that makes sense for multiple languages
		// NOTE(review): this presumes the candidates arrive ordered by
		// descending probability - confirm against the plugin's response.
		if ( $numLangs === 2
			&& $langs[0]['probability'] > 2 * $langs[1]['probability']
		) {
			return $langs[0]['language'];
		}

		return null;
	}

	/**
	 * Send the raw text to the "_langdetect" endpoint using a POST request.
	 *
	 * @param \Elastica\Client $client
	 * @param string $text
	 * @return \Elastica\Response
	 * @suppress PhanTypeMismatchArgument The third parameter is typically
	 *  an array, but langdetect is special and takes a string instead.
	 */
	private function request( \Elastica\Client $client, $text ) {
		return $client->request( "_langdetect", Request::POST, $text );
	}
}