%PDF- %PDF-
Direktori : /www/varak.net/wiki.varak.net/extensions/CirrusSearch/includes/ |
Current File : /www/varak.net/wiki.varak.net/extensions/CirrusSearch/includes/CompletionSuggester.php |
<?php

namespace CirrusSearch;

use Elastica;
use Elastica\Request;
use CirrusSearch;
use CirrusSearch\BuildDocument\SuggestBuilder;
use CirrusSearch\Search\SearchContext;
use MediaWiki\MediaWikiServices;
use MediaWiki\Logger\LoggerFactory;
use SearchSuggestion;
use SearchSuggestionSet;
use Status;
use UsageException;
use User;

/**
 * Performs search as you type queries using Completion Suggester.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 * http://www.gnu.org/copyleft/gpl.html
 */

/**
 * Completion Suggester Searcher
 *
 * NOTES:
 * The CompletionSuggester is built on top of the ElasticSearch Completion
 * Suggester.
 * (https://www.elastic.co/guide/en/elasticsearch/reference/current/search-suggesters-completion.html).
 *
 * This class is used at query time, see
 * CirrusSearch\BuildDocument\SuggestBuilder for index time logic.
 *
 * Document model: Cirrus documents are indexed with 2 suggestions:
 *
 * 1. The title suggestion (and close redirects).
 * This helps to avoid displaying redirects with typos (e.g. Albert Enstein,
 * Unietd States) where we make the assumption that if the redirect is close
 * enough it's likely a typo and it's preferable to display the canonical title.
 * This decision is made at index-time in SuggestBuilder::extractTitleAndSimilarRedirects.
 *
 * 2. The redirect suggestions
 * Because the same canonical title can be returned twice we support fetch_limit_factor
 * in suggest profiles to fetch more than what the user asked. Because the list of redirects
 * can be very large we cannot store all of them in the index (see limitations). We run a second
 * pass query on the main cirrus index to fetch them, then we try to detect which one is the closest
 * to the user query (see Util::chooseBestRedirect).
 *
 * LIMITATIONS:
 * A number of hacks are required in Cirrus to workaround some limitations in
 * the elasticsearch completion suggester implementation:
 * - It is a _suggest API, unlike classic "query then fetch" there is no fetch
 *   phase here.
 * - Payloads are stored in memory within the FST: we try to avoid them, but
 *   this forces us to implement a second pass query to fetch redirect titles
 *   from the cirrus main index.
 * - Fuzzy suggestions are ranked by index-time score: we allow to set
 *   'discount' param in the suggest profile (profiles/SuggestProfiles.php). The
 *   default profile includes a fuzzy and non-fuzzy suggestion query. This is to
 *   avoid having fuzzy suggestions ranked higher than exact suggestion.
 * - The suggestion string cannot be expanded to more than 255 strings at
 *   index time: we limit the number of generated tokens in the analysis config
 *   (see includes/Maintenance/SuggesterAnalysisConfigBuilder.php) but we can't
 *   workaround this problem for geosuggestion (suggestions will be prepended by
 *   geohash prefixes, one per precision step)
 *
 * @todo: investigate new features in elasticsearch completion suggester v2 to remove
 * some workarounds (https://github.com/elastic/elasticsearch/issues/10746).
 */
class CompletionSuggester extends ElasticsearchIntermediary {
	/**
	 * Base extra discount applied to variant profiles; handleVariants()
	 * divides it by the 1-based variant index.
	 */
	const VARIANT_EXTRA_DISCOUNT = 0.0001;

	/**
	 * @var string term to search.
	 */
	private $term;

	/**
	 * @var string[]|null search variants
	 */
	private $variants;

	/**
	 * Currently very limited (see LIMITATIONS) and only works
	 * for geo context
	 * @var array|null context for contextualized suggestions
	 */
	private $context;

	/**
	 * @var integer maximum number of result
	 */
	private $limit;

	/**
	 * @var integer offset
	 */
	private $offset;

	/**
	 * @var string index base name to use
	 */
	private $indexBaseName;

	/**
	 * Search environment configuration
	 * @var SearchConfig
	 */
	private $config;

	/**
	 * @var string Query type (comp_suggest_geo or comp_suggest)
	 */
	public $queryType;

	/**
	 * @var SearchContext
	 */
	private $searchContext;

	/**
	 * Suggest profiles read from the 'CirrusSearchCompletionProfiles' config
	 * element for the selected profile name; iterated as name => profile.
	 * @var array
	 */
	private $settings;

	/**
	 * Constructor
	 * @param Connection $conn
	 * @param int $limit Limit the results to this many
	 * @param int $offset the offset
	 * @param SearchConfig $config Configuration settings
	 * @param int[]|null $namespaces Array of namespace numbers to search or null to search all namespaces.
	 * @param User|null $user user for which this search is being performed. Attached to slow request logs.
	 * @param string|boolean $index Base name for index to search from, defaults to $wgCirrusSearchIndexBaseName
	 * @param string|null $profileName name of the completion profile to use
	 *  (see profiles/SuggestProfiles.php); when null the value of the
	 *  'CirrusSearchCompletionSettings' config is used
	 * @throws \ConfigException
	 */
	public function __construct( Connection $conn, $limit, $offset = 0, SearchConfig $config = null, array $namespaces = null,
		User $user = null, $index = false, $profileName = null ) {

		if ( is_null( $config ) ) {
			// @todo connection has an embedded config ... reuse that? somehow should
			// at least ensure they are the same.
			$config = MediaWikiServices::getInstance()
				->getConfigFactory()
				->makeConfig( 'CirrusSearch' );
		}

		parent::__construct( $conn, $user, $config->get( 'CirrusSearchSlowSearch' ) );
		$this->config = $config;
		$this->limit = $limit;
		$this->offset = $offset;
		$this->indexBaseName = $index ?: $config->get( SearchConfig::INDEX_BASE_NAME );
		$this->searchContext = new SearchContext( $this->config, $namespaces );

		// Loose comparison kept on purpose: an empty profile name also
		// falls back to the configured default.
		if ( $profileName == null ) {
			$profileName = $this->config->get( 'CirrusSearchCompletionSettings' );
		}
		$this->settings = $this->config->getElement( 'CirrusSearchCompletionProfiles', $profileName );
	}

	/**
	 * Reject search strings longer than Searcher::MAX_TITLE_SEARCH characters.
	 *
	 * @param string $search
	 * @throws UsageException when the string is too long
	 */
	private function checkRequestLength( $search ) {
		$requestLength = mb_strlen( $search );
		if ( $requestLength > Searcher::MAX_TITLE_SEARCH ) {
			throw new UsageException( 'Prefix search request was longer than the maximum allowed length.' .
				" ($requestLength > " . Searcher::MAX_TITLE_SEARCH . ')', 'request_too_long', 400 );
		}
	}

	/**
	 * Produce a set of completion suggestions for text using _suggest
	 * See https://www.elastic.co/guide/en/elasticsearch/reference/1.6/search-suggesters-completion.html
	 *
	 * WARNING: experimental API
	 *
	 * @param string $text Search term
	 * @param string[]|null $variants Search term variants
	 * (usually issued from $wgContLang->autoConvertToAllVariants( $text ) )
	 * @param array $context
	 * @return Status
	 * @throws UsageException when $text is longer than the allowed maximum
	 */
	public function suggest( $text, $variants = null, $context = null ) {
		// If the offset requested is greater than the hard limit
		// allowed we will always return an empty set so let's do it
		// asap.
		if ( $this->offset >= $this->getHardLimit() ) {
			return Status::newGood( SearchSuggestionSet::emptySuggestionSet() );
		}

		$this->checkRequestLength( $text );
		$this->setTermAndVariants( $text, $variants );
		$this->context = $context;

		list( $profiles, $suggest ) = $this->buildQuery();

		$queryOptions = [
			'timeout' => $this->config->getElement( 'CirrusSearchSearchShardTimeout', 'default' ),
		];
		$this->connection->setTimeout( $queryOptions['timeout'] );

		$index = $this->connection->getIndex( $this->indexBaseName, Connection::TITLE_SUGGEST_TYPE );
		$logContext = [
			'query' => $text,
			'queryType' => $this->queryType,
		];

		return Util::doPoolCounterWork(
			'CirrusSearch-Completion',
			$this->user,
			function () use ( $index, $suggest, $logContext, $queryOptions, $profiles ) {
				$description = "{queryType} search for '{query}'";
				$this->start( $description, $logContext );
				try {
					$result = $index->request( "_suggest", Request::POST, $suggest, $queryOptions );
					if ( $result->isOk() ) {
						$result = $this->postProcessSuggest( $result, $profiles );
						return $this->success( $result );
					}
					// NOTE(review): a non-OK response is returned as-is (an Elastica
					// response, not a Status) - confirm callers cope with that.
					return $result;
				} catch ( \Elastica\Exception\ExceptionInterface $e ) {
					return $this->failure( $e );
				}
			}
		);
	}

	/**
	 * Store the search term and its de-duplicated variants.
	 * Variants equal to the term itself are dropped; an empty list is
	 * normalized to null.
	 *
	 * protected for tests
	 *
	 * @param string $term
	 * @param string[]|null $variants
	 */
	protected function setTermAndVariants( $term, array $variants = null ) {
		$this->term = $term;
		if ( empty( $variants ) ) {
			$this->variants = null;
			return;
		}
		$variants = array_diff( array_unique( $variants ), [ $term ] );
		$this->variants = empty( $variants ) ? null : $variants;
	}

	/**
	 * Builds the suggest queries and profiles.
	 * Use with list( $profiles, $suggest ).
	 * @return array the profiles and suggest queries
	 */
	protected function buildQuery() {
		if ( mb_strlen( $this->term ) > SuggestBuilder::MAX_INPUT_LENGTH ) {
			// Trim the query otherwise we won't find results
			$this->term = mb_substr( $this->term, 0, SuggestBuilder::MAX_INPUT_LENGTH );
		}

		$queryLen = mb_strlen( trim( $this->term ) ); // Avoid cheating with spaces
		$this->queryType = "comp_suggest";

		$profiles = $this->settings;
		if ( $this->context != null && isset( $this->context['geo']['lat'] )
			&& isset( $this->context['geo']['lon'] ) && is_numeric( $this->context['geo']['lat'] )
			&& is_numeric( $this->context['geo']['lon'] )
		) {
			$profiles = $this->prepareGeoContextSuggestProfiles();
			$this->queryType = "comp_suggest_geo";
		}

		$suggest = $this->buildSuggestQueries( $profiles, $this->term, $queryLen );

		// Handle variants, update the set of profiles and suggest queries
		if ( !empty( $this->variants ) ) {
			list( $addProfiles, $addSuggest ) = $this->handleVariants( $profiles, $queryLen );
			$profiles += $addProfiles;
			$suggest += $addSuggest;
		}
		return [ $profiles, $suggest ];
	}

	/**
	 * Builds a set of suggest query by reading the list of profiles.
	 * Profiles whose min/max query length does not match are skipped.
	 * @param array $profiles
	 * @param string $query
	 * @param int $queryLen the length to use when checking min/max_query_len
	 * @return array a set of suggest queries ready to for elastic
	 */
	protected function buildSuggestQueries( array $profiles, $query, $queryLen ) {
		$suggest = [];
		foreach ( $profiles as $name => $config ) {
			$sugg = $this->buildSuggestQuery( $config, $query, $queryLen );
			if ( !$sugg ) {
				continue;
			}
			$suggest[$name] = $sugg;
		}
		return $suggest;
	}

	/**
	 * Builds a suggest query from a profile
	 * @param array $config Profile
	 * @param string $query
	 * @param int $queryLen the length to use when checking min/max_query_len
	 * @return array|null suggest query ready to for elastic or null when
	 *  $queryLen is outside of the profile's min_query_len/max_query_len
	 */
	protected function buildSuggestQuery( array $config, $query, $queryLen ) {
		// Do not remove spaces at the end, the user might tell us he finished writing a word
		$query = ltrim( $query );
		if ( $config['min_query_len'] > $queryLen ) {
			return null;
		}
		if ( isset( $config['max_query_len'] ) && $queryLen > $config['max_query_len'] ) {
			return null;
		}
		$field = $config['field'];
		$limit = $this->getHardLimit();
		$suggest = [
			'text' => $query,
			'completion' => [
				'field' => $field,
				'size' => $limit * $config['fetch_limit_factor']
			]
		];
		if ( isset( $config['fuzzy'] ) ) {
			$suggest['completion']['fuzzy'] = $config['fuzzy'];
		}
		if ( isset( $config['context'] ) ) {
			$suggest['completion']['context'] = $config['context'];
		}
		return $suggest;
	}

	/**
	 * Build the additional profiles and suggest queries used for variants.
	 * Each variant profile is flagged with the 'fallback' key and gets a
	 * discount factor = originalDiscount * 0.0001/(variantIndex+1).
	 * Use with list( $profiles, $suggest ).
	 *
	 * A variant profile is always returned, but its suggest query is only
	 * included when the query length matches the profile (same rule as
	 * buildSuggestQueries), so that no null query is sent to elastic.
	 *
	 * @param array $profiles the default profiles
	 * @param int $queryLen the original query length
	 * @return array the variant profiles and the variant suggest queries,
	 *  both keyed by "<profile name>-variant-<index>"
	 */
	protected function handleVariants( array $profiles, $queryLen ) {
		$variantIndex = 0;
		$allVariantProfiles = [];
		$allSuggestions = [];
		foreach ( $this->variants as $variant ) {
			$variantIndex++;
			foreach ( $profiles as $name => $profile ) {
				$variantProfName = $name . '-variant-' . $variantIndex;
				$variantProfile = $this->buildVariantProfile(
					$profile, self::VARIANT_EXTRA_DISCOUNT / $variantIndex
				);
				$allVariantProfiles[$variantProfName] = $variantProfile;

				$variantQuery = $this->buildSuggestQuery( $variantProfile, $variant, $queryLen );
				if ( $variantQuery === null ) {
					// Query length does not match this profile: nothing to send.
					continue;
				}
				$allSuggestions[$variantProfName] = $variantQuery;
			}
		}
		return [ $allVariantProfiles, $allSuggestions ];
	}

	/**
	 * Derive a variant profile from $profile: the returned copy is flagged
	 * as a fallback query and its discount is multiplied by $extraDiscount.
	 * The profile passed in is not modified (arrays are passed by value).
	 * @param array $profile profile to copy
	 * @param float $extraDiscount extra discount factor to rank variant suggestion lower.
	 * @return array
	 */
	protected function buildVariantProfile( array $profile, $extraDiscount = 0.0001 ) {
		// mark the profile as a fallback query
		$profile['fallback'] = true;
		$profile['discount'] *= $extraDiscount;
		return $profile;
	}

	/**
	 * prepare the list of suggest requests used for geo context suggestions
	 * This method will merge completion settings with
	 * $this->config->get( 'CirrusSearchCompletionGeoContextSettings' )
	 * @return array of suggest request profiles
	 */
	private function prepareGeoContextSuggestProfiles() {
		$profiles = [];
		foreach ( $this->config->get( 'CirrusSearchCompletionGeoContextSettings' ) as $geoname => $geoprof ) {
			foreach ( $this->settings as $sugname => $sugprof ) {
				if ( !in_array( $sugname, $geoprof['with'] ) ) {
					continue;
				}
				$profile = $sugprof;
				$profile['field'] .= $geoprof['field_suffix'];
				$profile['discount'] *= $geoprof['discount'];
				$profile['context'] = [
					'location' => [
						'lat' => $this->context['geo']['lat'],
						'lon' => $this->context['geo']['lon'],
						'precision' => $geoprof['precision']
					]
				];
				$profiles["$sugname-$geoname"] = $profile;
			}
		}
		return $profiles;
	}

	/**
	 * Merge the top level multi-queries into a single list of suggestions
	 * (best score per document), apply offset/limit, fetch the redirect
	 * text of redirect suggestions and fill the request log context.
	 *
	 * WARNING: experimental API
	 *
	 * @param \Elastica\Response $response Response from elasticsearch _suggest api
	 * @param array $profiles the suggestion profiles
	 * @return SearchSuggestionSet a set of Suggestions
	 */
	protected function postProcessSuggest( \Elastica\Response $response, $profiles ) {
		$this->logContext['elasticTookMs'] = intval( $response->getQueryTime() * 1000 );
		$data = $response->getData();
		unset( $data['_shards'] );

		$limit = $this->getHardLimit();
		$suggestionsByDocId = [];
		$suggestionProfileByDocId = [];
		foreach ( $data as $name => $results ) {
			$discount = $profiles[$name]['discount'];
			foreach ( $results as $suggested ) {
				foreach ( $suggested['options'] as $suggest ) {
					$output = SuggestBuilder::decodeOutput( $suggest['text'] );
					if ( $output === null ) {
						// Ignore broken output
						continue;
					}
					$docId = $output['docId'];
					$type = $output['type'];

					$score = $discount * $suggest['score'];
					// Keep only the best scored suggestion for each document
					if ( !isset( $suggestionsByDocId[$docId] ) ||
						$score > $suggestionsByDocId[$docId]->getScore()
					) {
						$pageId = $this->config->makePageId( $docId );
						$suggestion = new SearchSuggestion( $score, null, null, $pageId );
						// If it's a title suggestion we have the text
						if ( $type === SuggestBuilder::TITLE_SUGGESTION ) {
							$suggestion->setText( $output['text'] );
						}
						$suggestionsByDocId[$docId] = $suggestion;
						$suggestionProfileByDocId[$docId] = $name;
					}
				}
			}
		}

		// Sort by descending score. The comparator must return an integer:
		// PHP casts its result to int, so returning the raw float difference
		// would make every pair of scores less than 1 apart compare as equal.
		uasort( $suggestionsByDocId, function ( SearchSuggestion $a, SearchSuggestion $b ) {
			$scoreA = $a->getScore();
			$scoreB = $b->getScore();
			if ( $scoreA == $scoreB ) {
				return 0;
			}
			return $scoreB > $scoreA ? 1 : -1;
		} );

		$this->logContext['hitsTotal'] = count( $suggestionsByDocId );

		$suggestionsByDocId = $this->offset < $limit
			? array_slice( $suggestionsByDocId, $this->offset, $limit - $this->offset, true )
			: [];

		$this->logContext['hitsReturned'] = count( $suggestionsByDocId );
		$this->logContext['hitsOffset'] = $this->offset;

		// we must fetch redirect data for redirect suggestions
		$missingTextDocIds = [];
		foreach ( $suggestionsByDocId as $docId => $suggestion ) {
			if ( $suggestion->getText() === null ) {
				$missingTextDocIds[] = $docId;
			}
		}

		if ( !empty( $missingTextDocIds ) ) {
			// Experimental.
			//
			// Second pass query to fetch redirects.
			// It's not clear if it's the best option, this will slowdown the whole query
			// when we hit a redirect suggestion.
			// Other option would be to encode redirects as a payload resulting in a
			// very big index...

			// XXX: we support only the content index
			$type = $this->connection->getPageType( $this->indexBaseName, Connection::CONTENT_INDEX_TYPE );
			// NOTE: we are already in a poolCounterWork
			// Multi get is not supported by elastica
			try {
				$redirResponse = $type->request( '_mget', 'GET',
					[ 'ids' => $missingTextDocIds ],
					[ '_source_include' => 'redirect' ] );
				if ( $redirResponse->isOk() ) {
					$this->logContext['elasticTook2PassMs'] = intval( $redirResponse->getQueryTime() * 1000 );
					$docs = $redirResponse->getData();
					foreach ( $docs['docs'] as $doc ) {
						if ( empty( $doc['_source']['redirect'] ) ) {
							continue;
						}
						// We use the original query, we should maybe use the variant that generated this result?
						$text = Util::chooseBestRedirect( $this->term, $doc['_source']['redirect'] );
						if ( !empty( $suggestionsByDocId[$doc['_id']] ) ) {
							$suggestionsByDocId[$doc['_id']]->setText( $text );
						}
					}
				} else {
					LoggerFactory::getInstance( 'CirrusSearch' )->warning(
						'Unable to fetch redirects for suggestion {query} with results {ids} : {error}',
						[
							'query' => $this->term,
							'ids' => serialize( $missingTextDocIds ),
							'error' => $redirResponse->getError()
						]
					);
				}
			} catch ( \Elastica\Exception\ExceptionInterface $e ) {
				$error = self::extractFullError( $e );
				LoggerFactory::getInstance( 'CirrusSearch' )->warning(
					'Unable to fetch redirects for suggestion {query} with results {ids}. {error_type}: {error_reason}',
					[
						'query' => $this->term,
						'ids' => serialize( $missingTextDocIds ),
						'error_type' => $error['type'],
						'error_reason' => $error['reason'],
					]
				);
			}
		}

		$finalResults = array_filter(
			$suggestionsByDocId,
			function ( SearchSuggestion $suggestion ) {
				// text should be not empty for suggestions
				return $suggestion->getText() != null;
			}
		);

		$this->logContext['hits'] = [];
		$indexName = $this->connection->getIndex( $this->indexBaseName, Connection::TITLE_SUGGEST_TYPE )->getName();
		$maxScore = 0;
		foreach ( $finalResults as $docId => $suggestion ) {
			$title = $suggestion->getSuggestedTitle();
			$pageId = $suggestion->getSuggestedTitleID() ?: -1;
			$maxScore = max( $maxScore, $suggestion->getScore() );
			$this->logContext['hits'][] = [
				// This *must* match the names and types of the CirrusSearchHit
				// record in the CirrusSearchRequestSet logging channel avro schema.
				'title' => $title ? (string) $title : $suggestion->getText(),
				'index' => $indexName,
				'pageId' => (int) $pageId,
				'profileName' => isset( $suggestionProfileByDocId[$docId] )
					? $suggestionProfileByDocId[$docId]
					: "",
				'score' => $suggestion->getScore(),
			];
		}
		$this->logContext['maxScore'] = $maxScore;

		return new SearchSuggestionSet( $finalResults );
	}

	/**
	 * Set the max number of results to extract.
	 * @param int $limit
	 */
	public function setLimit( $limit ) {
		$this->limit = $limit;
	}

	/**
	 * Set the offset
	 * @param int $offset
	 */
	public function setOffset( $offset ) {
		$this->offset = $offset;
	}

	/**
	 * Get the hard limit
	 * The completion api does not supports offset we have to add a hack
	 * here to work around this limitation.
	 * To avoid ridiculously large queries we set also a hard limit
	 * ('CirrusSearchCompletionSuggesterHardLimit', 50 when not configured).
	 * Note that this limit will be changed by fetch_limit_factor set to 2 or 1.5
	 * depending on the profile.
	 * @return int the number of results to fetch from elastic
	 */
	private function getHardLimit() {
		$limit = $this->limit + $this->offset;
		$hardLimit = $this->config->get( 'CirrusSearchCompletionSuggesterHardLimit' );
		if ( $hardLimit === null ) {
			$hardLimit = 50;
		}
		if ( $limit > $hardLimit ) {
			return $hardLimit;
		}
		return $limit;
	}
}