%PDF- %PDF-
| Direktori : /www/varak.net/wiki.varak.net/extensions/CirrusSearch/includes/ |
| Current File : /www/varak.net/wiki.varak.net/extensions/CirrusSearch/includes/CompletionSuggester.php |
<?php
namespace CirrusSearch;
use Elastica;
use Elastica\Request;
use CirrusSearch;
use CirrusSearch\BuildDocument\SuggestBuilder;
use CirrusSearch\Search\SearchContext;
use MediaWiki\MediaWikiServices;
use MediaWiki\Logger\LoggerFactory;
use SearchSuggestion;
use SearchSuggestionSet;
use Status;
use UsageException;
use User;
/**
* Performs search as you type queries using Completion Suggester.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
* http://www.gnu.org/copyleft/gpl.html
*/
/**
* Completion Suggester Searcher
*
* NOTES:
* The CompletionSuggester is built on top of the ElasticSearch Completion
* Suggester.
* (https://www.elastic.co/guide/en/elasticsearch/reference/current/search-suggesters-completion.html).
*
* This class is used at query time, see
* CirrusSearch\BuildDocument\SuggestBuilder for index time logic.
*
* Document model: Cirrus documents are indexed with 2 suggestions:
*
* 1. The title suggestion (and close redirects).
* This helps to avoid displaying redirects with typos (e.g. Albert Enstein,
* Unietd States) where we make the assumption that if the redirect is close
* enough it's likely a typo and it's preferable to display the canonical title.
* This decision is made at index-time in SuggestBuilder::extractTitleAndSimilarRedirects.
*
* 2. The redirect suggestions
* Because the same canonical title can be returned twice we support fetch_limit_factor
* in suggest profiles to fetch more than what the user asked. Because the list of redirects
* can be very large we cannot store all of them in the index (see limitations). We run a second
* pass query on the main cirrus index to fetch them, then we try to detect which one is the closest
* to the user query (see Util::chooseBestRedirect).
*
* LIMITATIONS:
* A number of hacks are required in Cirrus to work around some limitations in
* the elasticsearch completion suggester implementation:
* - It is a _suggest API, unlike classic "query then fetch" there is no fetch
* phase here.
* - Payloads are stored in memory within the FST: we try to avoid them, but
* this forces us to implement a second pass query to fetch redirect titles
* from the cirrus main index.
* - Fuzzy suggestions are ranked by index-time score: we allow to set
* 'discount' param in the suggest profile (profiles/SuggestProfiles.php). The
* default profile includes a fuzzy and non-fuzzy suggestion query. This is to
* avoid having fuzzy suggestions ranked higher than exact suggestion.
* - The suggestion string cannot be expanded to more than 255 strings at
* index time: we limit the number of generated tokens in the analysis config
* (see includes/Maintenance/SuggesterAnalysisConfigBuilder.php) but we can't
* work around this problem for geosuggestion (suggestions will be prepended by
* geohash prefixes, one per precision step)
*
* @todo: investigate new features in elasticsearch completion suggester v2 to remove
* some workarounds (https://github.com/elastic/elasticsearch/issues/10746).
*/
class CompletionSuggester extends ElasticsearchIntermediary {
const VARIANT_EXTRA_DISCOUNT = 0.0001;
/**
* @var string term to search.
*/
private $term;
/**
* @var string[]|null search variants
*/
private $variants;
/**
* Currently very limited (see LIMITATIONS) and only works
* for geo context
* @var array|null context for contextualized suggestions
*/
private $context;
/**
* @var integer maximum number of result
*/
private $limit;
/**
* @var integer offset
*/
private $offset;
/**
* @var string index base name to use
*/
private $indexBaseName;
/**
* Search environment configuration
* @var SearchConfig
*/
private $config;
/**
* @var string Query type (comp_suggest_geo or comp_suggest)
*/
public $queryType;
/**
* @var SearchContext
*/
private $searchContext;
private $settings;
/**
 * Constructor
 *
 * @param Connection $conn
 * @param int $limit Limit the results to this many
 * @param int $offset the offset
 * @param SearchConfig|null $config Configuration settings; when null the
 *  'CirrusSearch' config is obtained from the MediaWiki config factory
 * @param int[]|null $namespaces Array of namespace numbers to search or null to search all namespaces.
 * @param User|null $user user for which this search is being performed. Attached to slow request logs.
 * @param string|boolean $index Base name for index to search from; a falsy value
 *  falls back to the SearchConfig::INDEX_BASE_NAME setting
 * @param string|null $profileName name of the completion profile to read from
 *  'CirrusSearchCompletionProfiles' (see profiles/SuggestProfiles.php); when null
 *  the 'CirrusSearchCompletionSettings' setting is used
 * @throws \ConfigException
 */
public function __construct( Connection $conn, $limit, $offset = 0, SearchConfig $config = null, array $namespaces = null,
	User $user = null, $index = false, $profileName = null ) {
	if ( $config === null ) {
		// @todo connection has an embedded config ... reuse that? somehow should
		// at least ensure they are the same.
		$configFactory = MediaWikiServices::getInstance()->getConfigFactory();
		$config = $configFactory->makeConfig( 'CirrusSearch' );
	}

	parent::__construct( $conn, $user, $config->get( 'CirrusSearchSlowSearch' ) );

	$this->config = $config;
	$this->limit = $limit;
	$this->offset = $offset;
	$this->indexBaseName = $index ? $index : $config->get( SearchConfig::INDEX_BASE_NAME );
	$this->searchContext = new SearchContext( $config, $namespaces );

	// Loose comparison kept on purpose: any value equal to null selects the default profile
	if ( $profileName == null ) {
		$profileName = $config->get( 'CirrusSearchCompletionSettings' );
	}
	$this->settings = $config->getElement( 'CirrusSearchCompletionProfiles', $profileName );
}
/**
 * Reject search strings longer than Searcher::MAX_TITLE_SEARCH characters.
 *
 * The length is measured with mb_strlen().
 *
 * @param string $search
 * @throws UsageException when the search string is too long
 */
private function checkRequestLength( $search ) {
	$maxLength = Searcher::MAX_TITLE_SEARCH;
	$requestLength = mb_strlen( $search );
	if ( $requestLength <= $maxLength ) {
		return;
	}

	throw new UsageException(
		'Prefix search request was longer than the maximum allowed length.' .
			" ($requestLength > " . $maxLength . ')',
		'request_too_long',
		400
	);
}
/**
 * Produce a set of completion suggestions for text using _suggest
 * See https://www.elastic.co/guide/en/elasticsearch/reference/1.6/search-suggesters-completion.html
 *
 * WARNING: experimental API
 *
 * The request is run through Util::doPoolCounterWork() under the
 * 'CirrusSearch-Completion' key. When elasticsearch answers with a non-OK
 * response, that response object is returned as-is by the worker.
 *
 * @param string $text Search term
 * @param string[]|null $variants Search term variants
 * (usually issued from $wgContLang->autoConvertToAllVariants( $text ) )
 * @param array $context
 * @return Status
 */
public function suggest( $text, $variants = null, $context = null ) {
	// An offset at or beyond the hard limit can only ever produce an empty
	// set, so answer right away without querying elasticsearch.
	if ( $this->offset >= $this->getHardLimit() ) {
		return Status::newGood( SearchSuggestionSet::emptySuggestionSet() );
	}

	$this->checkRequestLength( $text );
	$this->setTermAndVariants( $text, $variants );
	$this->context = $context;

	// buildQuery() also sets $this->queryType, which is read below for logging
	list( $profiles, $suggest ) = $this->buildQuery();

	$timeout = $this->config->getElement( 'CirrusSearchSearchShardTimeout', 'default' );
	$queryOptions = [ 'timeout' => $timeout ];
	$this->connection->setTimeout( $timeout );

	$index = $this->connection->getIndex( $this->indexBaseName, Connection::TITLE_SUGGEST_TYPE );
	$logContext = [
		'query' => $text,
		'queryType' => $this->queryType,
	];

	$work = function () use ( $index, $suggest, $logContext, $queryOptions, $profiles ) {
		$this->start( "{queryType} search for '{query}'", $logContext );
		try {
			$response = $index->request( "_suggest", Request::POST, $suggest, $queryOptions );
			if ( !$response->isOk() ) {
				return $response;
			}
			return $this->success( $this->postProcessSuggest( $response, $profiles ) );
		} catch ( \Elastica\Exception\ExceptionInterface $e ) {
			return $this->failure( $e );
		}
	};

	return Util::doPoolCounterWork( 'CirrusSearch-Completion', $this->user, $work );
}
/**
 * Store the search term and its variants.
 *
 * Duplicate variants and variants identical to the term are dropped; when
 * nothing is left the variants are stored as null.
 *
 * protected for tests
 *
 * @param string $term
 * @param string[]|null $variants
 */
protected function setTermAndVariants( $term, array $variants = null ) {
	$this->term = $term;
	$this->variants = null;
	if ( empty( $variants ) ) {
		return;
	}

	$distinct = array_diff( array_unique( $variants ), [ $term ] );
	if ( !empty( $distinct ) ) {
		$this->variants = $distinct;
	}
}
/**
 * Builds the suggest queries and profiles.
 * Use with list( $profiles, $suggest ).
 *
 * Side effects: truncates $this->term to SuggestBuilder::MAX_INPUT_LENGTH
 * characters and sets $this->queryType to "comp_suggest" or, when a numeric
 * geo context (lat/lon) is available, to "comp_suggest_geo".
 *
 * @return array the profiles and suggest queries
 */
protected function buildQuery() {
	$maxInput = SuggestBuilder::MAX_INPUT_LENGTH;
	if ( mb_strlen( $this->term ) > $maxInput ) {
		// Longer inputs would not match anything, so cut the term down
		$this->term = mb_substr( $this->term, 0, $maxInput );
	}

	// Length is measured on the trimmed term so that padding with spaces
	// cannot be used to get past the min/max_query_len checks
	$queryLen = mb_strlen( trim( $this->term ) );

	$this->queryType = "comp_suggest";
	$profiles = $this->settings;

	$ctx = $this->context;
	$hasGeoContext = $ctx != null
		&& isset( $ctx['geo']['lat'], $ctx['geo']['lon'] )
		&& is_numeric( $ctx['geo']['lat'] )
		&& is_numeric( $ctx['geo']['lon'] );
	if ( $hasGeoContext ) {
		$profiles = $this->prepareGeoContextSuggestProfiles();
		$this->queryType = "comp_suggest_geo";
	}

	$suggest = $this->buildSuggestQueries( $profiles, $this->term, $queryLen );

	// Variants contribute extra profiles and suggest queries; entries that
	// already exist under the same name are kept (array union)
	if ( !empty( $this->variants ) ) {
		list( $variantProfiles, $variantSuggest ) = $this->handleVariants( $profiles, $queryLen );
		$profiles = $profiles + $variantProfiles;
		$suggest = $suggest + $variantSuggest;
	}

	return [ $profiles, $suggest ];
}
/**
 * Builds one suggest query per profile, keyed by profile name.
 *
 * Profiles for which buildSuggestQuery() yields nothing are left out.
 *
 * @param array $profiles
 * @param string $query
 * @param int $queryLen the length to use when checking min/max_query_len
 * @return array a set of suggest queries ready for elastic
 */
protected function buildSuggestQueries( array $profiles, $query, $queryLen ) {
	$suggest = [];
	foreach ( $profiles as $name => $config ) {
		$built = $this->buildSuggestQuery( $config, $query, $queryLen );
		if ( $built ) {
			$suggest[$name] = $built;
		}
	}
	return $suggest;
}
/**
 * Builds a suggest query from a profile
 *
 * Returns null when $queryLen is below the profile's min_query_len or above
 * its (optional) max_query_len. The requested size is the hard limit
 * multiplied by the profile's fetch_limit_factor.
 *
 * @param array $config Profile
 * @param string $query
 * @param int $queryLen the length to use when checking min/max_query_len
 * @return array|null suggest query ready for elastic or null
 */
protected function buildSuggestQuery( array $config, $query, $queryLen ) {
	if ( $queryLen < $config['min_query_len'] ) {
		return null;
	}
	if ( isset( $config['max_query_len'] ) && $queryLen > $config['max_query_len'] ) {
		return null;
	}

	$completion = [
		'field' => $config['field'],
		'size' => $this->getHardLimit() * $config['fetch_limit_factor']
	];
	// Optional settings are copied through untouched when the profile defines them
	foreach ( [ 'fuzzy', 'context' ] as $optional ) {
		if ( isset( $config[$optional] ) ) {
			$completion[$optional] = $config[$optional];
		}
	}

	return [
		// Only leading spaces are stripped: a trailing space may mean the
		// user has finished typing a word
		'text' => ltrim( $query ),
		'completion' => $completion,
	];
}
/**
 * Build the additional profiles and suggest queries needed for the search
 * term variants.
 *
 * Each variant gets a copy of every profile, named "<profile>-variant-<N>"
 * (N starting at 1), flagged with the 'fallback' key and with a discount
 * factor = originalDiscount * 0.0001/N.
 *
 * A variant profile whose suggest query cannot be built (the query length is
 * outside the profile's min/max_query_len) is still returned in the profile
 * list but gets no entry in the suggest queries, so that no null entry is
 * ever sent to elasticsearch.
 *
 * @param array $profiles the default profiles
 * @param int $queryLen the original query length
 * @return array a two-element array: the variant profiles and the variant
 *  suggest queries, both keyed by variant profile name
 */
protected function handleVariants( array $profiles, $queryLen ) {
	$allVariantProfiles = [];
	$allSuggestions = [];
	$variantIndex = 0;
	foreach ( $this->variants as $variant ) {
		$variantIndex++;
		$extraDiscount = self::VARIANT_EXTRA_DISCOUNT / $variantIndex;
		foreach ( $profiles as $name => $profile ) {
			$variantProfName = $name . '-variant-' . $variantIndex;
			$variantProfile = $this->buildVariantProfile( $profile, $extraDiscount );
			$allVariantProfiles[$variantProfName] = $variantProfile;

			$suggestQuery = $this->buildSuggestQuery( $variantProfile, $variant, $queryLen );
			if ( !$suggestQuery ) {
				// Same rule as buildSuggestQueries(): skip profiles that
				// do not apply to this query length
				continue;
			}
			$allSuggestions[$variantProfName] = $suggestQuery;
		}
	}
	return [ $allVariantProfiles, $allSuggestions ];
}
/**
 * Derive a variant profile from an existing profile.
 *
 * The returned copy is flagged with 'fallback' => true and has its
 * 'discount' multiplied by $extraDiscount. The array passed in is not
 * modified (it is received by value).
 *
 * @param array $profile profile to copy
 * @param float $extraDiscount extra discount factor to rank variant suggestion lower.
 * @return array the variant profile
 */
protected function buildVariantProfile( array $profile, $extraDiscount = 0.0001 ) {
	// Flag first, then discount: this is a fallback query that must rank lower
	$profile['fallback'] = true;
	$profile['discount'] = $profile['discount'] * $extraDiscount;
	return $profile;
}
/**
 * Prepare the list of suggest request profiles used for geo context suggestions.
 *
 * Every entry of $this->config->get( 'CirrusSearchCompletionGeoContextSettings' )
 * is combined with each completion profile listed in its 'with' key: the
 * field name gets the geo 'field_suffix' appended, the discount is multiplied
 * by the geo 'discount', and a 'location' context is built from the lat/lon
 * of $this->context with the geo 'precision'. Results are keyed
 * "<profile name>-<geo name>".
 *
 * @return array of suggest request profiles
 */
private function prepareGeoContextSuggestProfiles() {
	$geoSettings = $this->config->get( 'CirrusSearchCompletionGeoContextSettings' );
	$profiles = [];
	foreach ( $geoSettings as $geoname => $geoprof ) {
		foreach ( $this->settings as $sugname => $sugprof ) {
			if ( in_array( $sugname, $geoprof['with'] ) ) {
				// $sugprof is a per-iteration copy, so it can be altered freely
				$sugprof['field'] = $sugprof['field'] . $geoprof['field_suffix'];
				$sugprof['discount'] = $sugprof['discount'] * $geoprof['discount'];
				$sugprof['context'] = [
					'location' => [
						'lat' => $this->context['geo']['lat'],
						'lon' => $this->context['geo']['lon'],
						'precision' => $geoprof['precision']
					]
				];
				$profiles[$sugname . '-' . $geoname] = $sugprof;
			}
		}
	}
	return $profiles;
}
/**
 * Merge top level multi-queries and resolve returned pageIds into Title objects.
 *
 * Steps:
 *  - every option returned by every named suggest query is decoded; for each
 *    docId only the option with the highest discounted score is kept,
 *  - suggestions are sorted by descending score and sliced with the
 *    offset / hard limit,
 *  - suggestions lacking a text (redirect suggestions) trigger a second pass
 *    _mget request on the content index to pick the best redirect title,
 *  - suggestions that still have no text are dropped,
 *  - request statistics and hits are recorded in $this->logContext.
 *
 * WARNING: experimental API
 *
 * @param \Elastica\Response $response Response from elasticsearch _suggest api
 * @param array $profiles the suggestion profiles
 * @return SearchSuggestionSet a set of Suggestions
 */
protected function postProcessSuggest( \Elastica\Response $response, $profiles ) {
	$this->logContext['elasticTookMs'] = intval( $response->getQueryTime() * 1000 );
	$data = $response->getData();
	unset( $data['_shards'] );

	$limit = $this->getHardLimit();
	$suggestionsByDocId = [];
	$suggestionProfileByDocId = [];
	foreach ( $data as $name => $results ) {
		$discount = $profiles[$name]['discount'];
		foreach ( $results as $suggested ) {
			foreach ( $suggested['options'] as $suggest ) {
				$output = SuggestBuilder::decodeOutput( $suggest['text'] );
				if ( $output === null ) {
					// Ignore broken output
					continue;
				}
				$docId = $output['docId'];
				$type = $output['type'];
				$score = $discount * $suggest['score'];
				if ( !isset( $suggestionsByDocId[$docId] ) ||
					$score > $suggestionsByDocId[$docId]->getScore()
				) {
					$pageId = $this->config->makePageId( $docId );
					$suggestion = new SearchSuggestion( $score, null, null, $pageId );
					// If it's a title suggestion we have the text
					if ( $type === SuggestBuilder::TITLE_SUGGESTION ) {
						$suggestion->setText( $output['text'] );
					}
					$suggestionsByDocId[$docId] = $suggestion;
					$suggestionProfileByDocId[$docId] = $name;
				}
			}
		}
	}

	// Sort by descending score. The comparator must return an integer:
	// a float difference is cast to int by PHP, so any score gap smaller
	// than 1 would be seen as "equal" and leave the order undefined.
	uasort( $suggestionsByDocId, function ( SearchSuggestion $a, SearchSuggestion $b ) {
		$scoreA = $a->getScore();
		$scoreB = $b->getScore();
		if ( $scoreA == $scoreB ) {
			return 0;
		}
		return $scoreB > $scoreA ? 1 : -1;
	} );

	$this->logContext['hitsTotal'] = count( $suggestionsByDocId );

	$suggestionsByDocId = $this->offset < $limit
		? array_slice( $suggestionsByDocId, $this->offset, $limit - $this->offset, true )
		: [];

	$this->logContext['hitsReturned'] = count( $suggestionsByDocId );
	$this->logContext['hitsOffset'] = $this->offset;

	// we must fetch redirect data for redirect suggestions
	$missingTextDocIds = [];
	foreach ( $suggestionsByDocId as $docId => $suggestion ) {
		if ( $suggestion->getText() === null ) {
			$missingTextDocIds[] = $docId;
		}
	}

	if ( !empty( $missingTextDocIds ) ) {
		// Experimental.
		//
		// Second pass query to fetch redirects.
		// It's not clear if it's the best option, this will slowdown the whole query
		// when we hit a redirect suggestion.
		// Other option would be to encode redirects as a payload resulting in a
		// very big index...

		// XXX: we support only the content index
		$type = $this->connection->getPageType( $this->indexBaseName, Connection::CONTENT_INDEX_TYPE );
		// NOTE: we are already in a poolCounterWork
		// Multi get is not supported by elastica
		$redirResponse = null;
		try {
			$redirResponse = $type->request( '_mget', 'GET',
				[ 'ids' => $missingTextDocIds ],
				[ '_source_include' => 'redirect' ] );
			if ( $redirResponse->isOk() ) {
				$this->logContext['elasticTook2PassMs'] = intval( $redirResponse->getQueryTime() * 1000 );
				$docs = $redirResponse->getData();
				foreach ( $docs['docs'] as $doc ) {
					if ( empty( $doc['_source']['redirect'] ) ) {
						continue;
					}
					// We use the original query, we should maybe use the variant that generated this result?
					$text = Util::chooseBestRedirect( $this->term, $doc['_source']['redirect'] );
					if ( !empty( $suggestionsByDocId[$doc['_id']] ) ) {
						$suggestionsByDocId[$doc['_id']]->setText( $text );
					}
				}
			} else {
				LoggerFactory::getInstance( 'CirrusSearch' )->warning(
					'Unable to fetch redirects for suggestion {query} with results {ids} : {error}',
					[
						'query' => $this->term,
						'ids' => serialize( $missingTextDocIds ),
						'error' => $redirResponse->getError()
					]
				);
			}
		} catch ( \Elastica\Exception\ExceptionInterface $e ) {
			// Best effort: a failed second pass only loses the redirect
			// suggestions, it is logged and the remaining results are returned
			$error = self::extractFullError( $e );
			LoggerFactory::getInstance( 'CirrusSearch' )->warning(
				'Unable to fetch redirects for suggestion {query} with results {ids}. {error_type}: {error_reason}',
				[
					'query' => $this->term,
					'ids' => serialize( $missingTextDocIds ),
					'error_type' => $error['type'],
					'error_reason' => $error['reason'],
				]
			);
		}
	}

	$finalResults = array_filter(
		$suggestionsByDocId,
		function ( SearchSuggestion $suggestion ) {
			// text should be not empty for suggestions
			return $suggestion->getText() != null;
		}
	);

	$this->logContext['hits'] = [];
	$indexName = $this->connection->getIndex( $this->indexBaseName, Connection::TITLE_SUGGEST_TYPE )->getName();
	$maxScore = 0;
	foreach ( $finalResults as $docId => $suggestion ) {
		$title = $suggestion->getSuggestedTitle();
		$pageId = $suggestion->getSuggestedTitleID() ?: -1;
		$maxScore = max( $maxScore, $suggestion->getScore() );
		$this->logContext['hits'][] = [
			// This *must* match the names and types of the CirrusSearchHit
			// record in the CirrusSearchRequestSet logging channel avro schema.
			'title' => $title ? (string)$title : $suggestion->getText(),
			'index' => $indexName,
			'pageId' => (int)$pageId,
			'profileName' => isset( $suggestionProfileByDocId[$docId] )
				? $suggestionProfileByDocId[$docId]
				: "",
			'score' => $suggestion->getScore(),
		];
	}
	$this->logContext['maxScore'] = $maxScore;

	return new SearchSuggestionSet( $finalResults );
}
/**
 * Change the maximum number of results to return.
 *
 * The value is combined with the offset by getHardLimit() to decide how
 * many results are requested.
 *
 * @param int $limit
 */
public function setLimit( $limit ) {
	$this->limit = $limit;
}
/**
 * Change the offset of the first result to return.
 *
 * @param int $offset
 */
public function setOffset( $offset ) {
	$this->offset = $offset;
}
/**
 * Get the hard limit
 *
 * The completion api does not support offsets, so limit + offset results
 * are requested and the offset is applied afterwards. To avoid ridiculously
 * large queries that sum is capped by the
 * 'CirrusSearchCompletionSuggesterHardLimit' setting (50 when the setting
 * is null).
 * Note that buildSuggestQuery() further multiplies this value by the
 * profile's fetch_limit_factor.
 *
 * @return int the number of results to fetch from elastic
 */
private function getHardLimit() {
	$hardLimit = $this->config->get( 'CirrusSearchCompletionSuggesterHardLimit' );
	if ( $hardLimit === null ) {
		$hardLimit = 50;
	}

	$requested = $this->limit + $this->offset;
	return $requested > $hardLimit ? $hardLimit : $requested;
}
}