%PDF- %PDF-
| Direktori : /www/varak.net/wiki.varak.net/extensions/CirrusSearch/includes/ |
| Current File : //www/varak.net/wiki.varak.net/extensions/CirrusSearch/includes/ElasticsearchIntermediary.php |
<?php
namespace CirrusSearch;
use DeferredUpdates;
use Elastica\Client;
use Elastica\Exception\PartialShardFailureException;
use Elastica\Exception\ResponseException;
use FormatJson;
use MediaWiki\Logger\LoggerFactory;
use MediaWiki\MediaWikiServices;
use RequestContext;
use SearchResultSet;
use Status;
use Title;
use User;
use UIDGenerator;
/**
* Base class with useful functions for communicating with Elasticsearch.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
* http://www.gnu.org/copyleft/gpl.html
*/
class ElasticsearchIntermediary {
/**
* @const int max number of results to store in CirrusSearchRequestSet logs (per request)
*/
const LOG_MAX_RESULTS = 50;
/**
* @var Connection
*/
protected $connection;
/**
* @var User|null user for which we're performing this search or null in the case of
* requests kicked off by jobs
*/
protected $user;
/**
* @var UserTesting Reports on this requests participation in tests
*/
protected $ut;
/**
* @var float|null start time of current request or null if none is running
*/
private $requestStart = null;
/**
* @var string|null description of the next request to be sent to Elasticsearch or null if not yet decided
*/
private $description = null;
/**
* @var array map of search request stats to log about the current search request
*/
protected $logContext = [];
/**
* @var int how many millis a request through this intermediary needs to take before it counts as slow.
* 0 means none count as slow.
*/
private $slowMillis;
/**
* @var array Metrics about a completed search
*/
private $searchMetrics = [];
/**
* @var string Id identifying this php execution
*/
static private $executionId;
/**
* @var array[] Result of self::getLogContext for each request in this process
*/
static private $logContexts = [];
/**
* @var array[string] Result page ids that were returned to user
*/
static private $resultTitleStrings = [];
/**
* @var int artificial extra backend latency in micro seconds
*/
private $extraBackendLatency;
/**
* Constructor.
*
* @param Connection $connection
* @param User|null $user user for which this search is being performed. Attached to slow request logs. Note that
* null isn't for anonymous users - those are still User objects and should be provided if possible. Null is for
* when the action is being performed in some context where the user that caused it isn't available. Like when an
* action is being performed during a job.
* @param float $slowSeconds how many seconds a request through this intermediary needs to take before it counts as
* slow. 0 means none count as slow.
* @param float $extraBackendLatency artificial backend latency.
*/
protected function __construct( Connection $connection, User $user = null, $slowSeconds, $extraBackendLatency = 0 ) {
$this->connection = $connection;
if ( is_null( $user ) ) {
$user = RequestContext::getMain()->getUser();
}
$this->user = $user;
$this->slowMillis = (int) ( 1000 * $slowSeconds );
$this->extraBackendLatency = $extraBackendLatency;
$this->ut = UserTesting::getInstance();
}
/**
* Identifies a specific execution of php. That might be one web
* request, or multiple jobs run in the same executor. An execution id
* is valid over a brief timespan, perhaps a minute or two for some jobs.
*
* @return string unique identifier
*/
private static function getExecutionId() {
if ( self::$executionId === null ) {
self::$executionId = mt_rand();
}
return self::$executionId;
}
/**
* Unit tests only
*/
public static function resetExecutionId() {
self::$executionId = null;
}
/**
* Summarizes all the requests made in this process and reports
* them along with the test they belong to.
*/
private static function reportLogContexts() {
if ( !self::$logContexts ) {
return;
}
self::buildRequestSetLog();
self::buildUserTestingLog();
self::$logContexts = [];
}
/**
* Builds and ships a log context that is serialized to an avro
* schema. Avro is very specific that all fields must be defined,
* even if they have a default, and that types must match exactly.
* "5" is not an int as much as php would like it to be.
*
* Avro will happily ignore fields that are present but not used. To
* add new fields to the schema they must first be added here and
* deployed. Then the schema can be updated. Removing goes in reverse,
* adjust the schema to ignore the column, then deploy code no longer
* providing it.
*/
private static function buildRequestSetLog() {
global $wgRequest;
// for the moment these are still created in the old format to serve
// the old log formats, so here we transform the context into the new
// request format. At some point the context should just be created in
// the correct format.
$requests = [];
$allCached = true;
$allHits = [];
foreach ( self::$logContexts as $context ) {
$request = [
'query' => isset( $context['query'] ) ? (string) $context['query'] : '',
'queryType' => isset( $context['queryType'] ) ? (string) $context['queryType'] : '',
// populated below
'indices' => [],
'tookMs' => isset( $context['tookMs'] ) ? (int) $context['tookMs'] : -1,
'elasticTookMs' => isset( $context['elasticTookMs'] ) ? (int) $context['elasticTookMs'] : -1,
'limit' => isset( $context['limit'] ) ? (int) $context['limit'] : -1,
'hitsTotal' => isset( $context['hitsTotal'] ) ? (int) $context['hitsTotal'] : -1,
'hitsReturned' => isset( $context['hitsReturned'] ) ? (int) $context['hitsReturned'] : -1,
'hitsOffset' => isset( $context['hitsOffset'] ) ? (int) $context['hitsOffset'] : -1,
// populated below
'namespaces' => [],
'suggestion' => isset( $context['suggestion'] ) ? (string) $context['suggestion'] : '',
'suggestionRequested' => isset( $context['suggestion'] ),
'maxScore' => isset( $context['maxScore'] ) ? $context['maxScore'] : -1,
'payload' => [],
'hits' => isset( $context['hits'] ) ? array_slice( $context['hits'], 0, self::LOG_MAX_RESULTS ) : [],
];
if ( isset( $context['hits'] ) ) {
$allHits = array_merge( $allHits, $context['hits'] );
}
if ( isset( $context['index'] ) ) {
$request['indices'][] = $context['index'];
}
if ( isset( $context['namespaces'] ) ) {
foreach ( $context['namespaces'] as $nsId ) {
$request['namespaces'][] = (int) $nsId;
}
}
if ( !empty( $context['langdetect' ] ) ) {
$request['payload']['langdetect'] = (string) $context['langdetect'];
}
if ( isset( $context['cached'] ) && $context['cached'] ) {
$request['payload']['cached'] = 'true';
} else {
$allCached = false;
}
$requests[] = $request;
}
// Note that this is only accurate for hhvm and php-fpm
// since they close the request to the user before running
// deferred updates.
$timing = \RequestContext::getMain()->getTiming();
$startMark = $timing->getEntryByName( 'requestStart' );
$endMark = $timing->getEntryByName( 'requestShutdown' );
if ( $startMark && $endMark ) {
// should always work, but Timing can return null so
// fallbacks are provided.
$tookS = $endMark['startTime'] - $startMark['startTime'];
} elseif( isset( $_SERVER['REQUEST_TIME_FLOAT'] ) ) {
// php >= 5.4
$tookS = microtime( true ) - $_SERVER['REQUEST_TIME_FLOAT'];
} else {
// php 5.3
$tookS = microtime( true ) - $_SERVER['REQUEST_TIME'];
}
// Reindex allHits by page title's. It's maybe not perfect, but it's
// hopefully a "close enough" representation of where our final result
// set came from. maybe :(
$allHitsByTitle = [];
foreach ( $allHits as $hit ) {
$allHitsByTitle[$hit['title']] = $hit;
}
$resultHits = [];
// FIXME: temporary hack to investigate why SpecialSearch can display results
// that do not come from cirrus.
$bogusResult = null;
foreach ( self::$resultTitleStrings as $titleString ) {
// Track only the first missing title.
if ( $bogusResult === null && !isset( $allHitsByTitle[$titleString] ) ) {
$bogusResult = $titleString;
}
$hit = isset( $allHitsByTitle[$titleString] ) ? $allHitsByTitle[$titleString] : [];
// Apply defaults to ensure all properties are accounted for.
$resultHits[] = $hit + [
'title' => $titleString,
'index' => "",
'pageId' => -1,
'score' => -1,
'profileName' => ""
];
}
$requestSet = [
'id' => self::getRequestSetToken(),
'ts' => time(),
'wikiId' => wfWikiID(),
'source' => self::getExecutionContext(),
'identity' => self::generateIdentToken(),
'ip' => $wgRequest->getIP() ?: '',
'userAgent' => $wgRequest->getHeader( 'User-Agent') ?: '',
'backendUserTests' => UserTesting::getInstance()->getActiveTestNamesWithBucket(),
'tookMs' => 1000 * $tookS,
'hits' => array_slice( $resultHits, 0, self::LOG_MAX_RESULTS ),
'payload' => [
// useful while we are testing accept-lang based interwiki
'acceptLang' => (string) ($wgRequest->getHeader( 'Accept-Language' ) ?: ''),
// Helps to track down what actually caused the request. Will promote to full
// param if it proves useful
'queryString' => http_build_query( $_GET ),
],
'requests' => $requests,
];
if ( $bogusResult !== null ) {
if ( is_string( $bogusResult ) ) {
$requestSet['payload']['bogusResult'] = $bogusResult;
} else {
$requestSet['payload']['bogusResult'] = 'NOT_A_STRING?: ' . gettype( $bogusResult );
}
}
if ( $allCached ) {
$requestSet['payload']['cached'] = 'true';
}
LoggerFactory::getInstance( 'CirrusSearchRequestSet' )->debug( '', $requestSet );
}
/**
* This is set externally because we don't have complete control, from the
* SearchEngine interface, of what is actually sent to the user. Instead hooks
* receive the final results that will be sent to the user and set them here.
*
* Accepts two result sets because some places (Special:Search) perform multiple
* searches. This can be called multiple times, but only that last call wins. For
* API's that is correct, for Special:Search a hook catches the final results and
* sets them here.
*
* @param array[Search\ResultSet|null] $matches
*/
public static function setResultPages( array $matches ) {
$titleStrings = [];
foreach ( $matches as $resultSet ) {
if ( $resultSet !== null ) {
$titleStrings = array_merge( $titleStrings, self::extractTitleStrings( $resultSet ) );
}
}
self::$resultTitleStrings = $titleStrings;
}
private static function extractTitleStrings( SearchResultSet $matches ) {
$strings = [];
$result = $matches->next();
while ( $result ) {
$strings[] = (string) $result->getTitle();
$result = $matches->next();
}
$matches->rewind();
return $strings;
}
/**
* Get a token that (hopefully) uniquely identifies this search. It will be
* added to the search result page js config vars, and put into the url with
* history.replaceState(). This means click through's from supported browsers
* will record this token as part of the referrer.
*
* @return string
*/
public static function getRequestSetToken() {
static $token;
if ( $token === null ) {
// random UID, 70B tokens have a collision probability of 4*10^-16
// so should work for marking unique queries.
$uuid = UIDGenerator::newUUIDv4();
// make it a little shorter by using straight base36
$hex = substr( $uuid, 0, 8 ) . substr( $uuid, 9, 4 ) .
substr( $uuid, 14, 4 ) . substr( $uuid, 19, 4) .
substr( $uuid, 24 );
$token = \Wikimedia\base_convert( $hex, 16, 36 );
}
return $token;
}
private static function buildUserTestingLog() {
global $wgRequest;
$ut = UserTesting::getInstance();
if ( !$ut->getActiveTestNames() ) {
return;
}
$queries = [];
$parameters = [
'index' => [],
'queryType' => [],
'acceptLang' => $wgRequest->getHeader( 'Accept-Language' ),
];
$elasticTook = 0;
$hits = 0;
foreach ( self::$logContexts as $context ) {
$hits += isset( $context['hitsTotal'] ) ? $context['hitsTotal'] : 0;
if ( isset( $context['query'] ) ) {
$queries[] = $context['query'];
}
if ( isset( $context['elasticTookMs'] ) ) {
$elasticTook += $context['elasticTookMs'];
}
if ( isset( $context['index'] ) ) {
$parameters['index'][] = $context['index'];
}
if ( isset( $context['queryType'] ) ) {
$parameters['queryType'][] = $context['queryType'];
}
if ( !empty( $context['langdetect' ] ) ) {
$parameters['langdetect'] = $context['langdetect'];
}
}
foreach ( [ 'index', 'queryType' ] as $key ) {
$parameters[$key] = array_values( array_unique( $parameters[$key] ) );
}
$message = [
wfWikiID(),
'',
FormatJson::encode( $queries ),
$hits,
self::getExecutionContext(),
$elasticTook,
$wgRequest->getIP(),
preg_replace( "/[\t\"']/", "", $wgRequest->getHeader( 'User-Agent') ),
FormatJson::encode( $parameters ),
self::generateIdentToken(),
];
$logger = LoggerFactory::getInstance( 'CirrusSearchUserTesting' );
foreach ( $ut->getActiveTestNames() as $test ) {
$bucket = $ut->getBucket( $test );
$message[1] = "{$test}-{$bucket}";
$logger->debug( implode( "\t", $message ) );
}
}
/**
* Report the types of queries that were issued
* within the current request.
*
* @return string[]
*/
public static function getQueryTypesUsed() {
$types = [];
foreach ( self::$logContexts as $context ) {
if ( isset( $context['queryType'] ) ) {
$types[] = $context['queryType'];
}
}
return array_unique( $types );
}
/**
* Mark the start of a request to Elasticsearch. Public so it can be called from pool counter methods.
*
* @param string $description name of the action being started
* @param array $logContext Contextual variables for generating log messages
*/
public function start( $description, array $logContext = [] ) {
$this->description = $description;
$this->logContext = $logContext;
$this->requestStart = microtime( true );
if ( $this->extraBackendLatency ) {
usleep( $this->extraBackendLatency );
}
}
/**
* Log a successful request and return the provided result in a good Status. If you don't need the status
* just ignore the return. Public so it can be called from pool counter methods.
*
* @param mixed $result result of the request. defaults to null in case the request doesn't have a result
* @return Status wrapping $result
*/
public function success( $result = null ) {
$this->finishRequest();
return Status::newGood( $result );
}
/**
* Log a successful request when the response comes from a cache outside elasticsearch.
* @param string $description name of the action being started
* @param array $logContext Contextual variables for generating log messages
*/
public function successViaCache( $description, array $logContext = [] ) {
global $wgCirrusSearchLogElasticRequests;
$this->description = $description;
$logContext['cached'] = true;
$this->logContext = $logContext;
$logContext = $this->buildLogContext( -1, null );
if ( $wgCirrusSearchLogElasticRequests ) {
$logMessage = $this->buildLogMessage( $logContext );
LoggerFactory::getInstance( 'CirrusSearchRequests' )->debug( $logMessage, $logContext );
}
$this->requestStart = null;
}
/**
* Log a failure and return an appropriate status. Public so it can be called from pool counter methods.
*
* @param \Elastica\Exception\ExceptionInterface|null $exception if the request failed
* @return Status representing a backend failure
*/
public function failure( \Elastica\Exception\ExceptionInterface $exception = null ) {
$context = $this->logContext;
$context['took'] = $this->finishRequest();
list( $status, $message ) = $this->extractMessageAndStatus( $exception );
$context['message'] = $message;
$stats = MediaWikiServices::getInstance()->getStatsdDataFactory();
$type = self::classifyError( $exception );
$clusterName = $this->connection->getClusterName();
$stats->increment( "CirrusSearch.$clusterName.backend_failure.$type" );
LoggerFactory::getInstance( 'CirrusSearch' )->warning(
"Search backend error during {$this->description} after {took}: {message}",
$context
);
return $status;
}
/**
* Broadly classify the error message into failures where
* we decided to not serve the query, and failures where
* we just failed to answer
*
* @param \Elastica\Exception\ExceptionInterface|null $exception
* @return string Either 'rejected', 'failed' or 'unknown'
*/
static public function classifyError( \Elastica\Exception\ExceptionInterface $exception = null ) {
if ( $exception === null ) {
return 'unknown';
}
$error = self::extractFullError( $exception );
if ( isset( $error['root_cause'][0]['type'] ) ) {
$error = reset( $error['root_cause'] );
} else if ( ! ( isset( $error['type'] ) && isset( $error['reason'] ) ) ) {
return 'unknown';
}
$heuristics = [
'rejected' => [
'type_regexes' => [
'(^|_)regex_',
'^too_complex_to_determinize_exception$',
'^elasticsearch_parse_exception$',
'^search_parse_exception$',
'^query_parsing_exception$',
'^illegal_argument_exception$',
'^too_many_clauses$'
],
'msg_regexes' => [],
],
'failed' => [
'type_regexes' => [
'^es_rejected_execution_exception$',
'^remote_transport_exception$',
'^search_context_missing_exception$',
'^null_pointer_exception$',
'^elasticsearch_timeout_exception$'
],
// These are exceptions thrown by elastica itself
'msg_regexes' => [
'^Couldn\'t connect to host',
'^No enabled connection',
'^Operation timed out',
],
],
];
foreach( $heuristics as $type => $heuristic ) {
$regex = implode( '|', $heuristic['type_regexes'] );
if ( $regex && preg_match( "/$regex/", $error['type'] ) ) {
return $type;
}
$regex = implode( '|', $heuristic['msg_regexes'] );
if ( $regex && preg_match( "/$regex/", $error['reason'] ) ) {
return $type;
}
}
return "unknown";
}
/**
* Get the search metrics we have
* @return array
*/
public function getSearchMetrics() {
return $this->searchMetrics;
}
/**
* Extract an error message from an exception thrown by Elastica.
* @param \Elastica\Exception\ExceptionInterface $exception exception from which to extract a message
* @return array structuerd error from the exception
* @suppress PhanUndeclaredMethod ExceptionInterface doesn't declare any methods
* so we have to suppress those warnings.
*/
public static function extractFullError( \Elastica\Exception\ExceptionInterface $exception ) {
if ( !( $exception instanceof ResponseException ) ) {
// simulate the basic full error structure
return [
'type' => 'unknown',
'reason' => $exception->getMessage()
];
}
if ( $exception instanceof PartialShardFailureException ) {
// @todo still needs to be fixed, need a way to trigger this
// failure
$shardStats = $exception->getResponse()->getShardsStatistics();
$message = [];
$type = null;
foreach ( $shardStats[ 'failures' ] as $failure ) {
$message[] = $failure['reason']['reason'];
if ( $type === null ) {
$type = $failure['reason']['type'];
}
}
return [
'type' => $type,
'reason' => 'Partial failure: ' . implode( ',', $message ),
'partial' => true
];
}
return $exception->getResponse()->getFullError();
}
/**
* @param Elastica\Exception\ExceptionInterface $exception
* @return string
*/
public static function extractMessage( \Elastica\Exception\ExceptionInterface $exception ) {
$error = self::extractFullError( $exception );
return $error['type'] . ': ' .$error['reason'];
}
/**
* Does this status represent an Elasticsearch parse error?
* @param Status $status Status to check
* @return boolean is this a parse error?
*/
protected function isParseError( $status ) {
/** @suppress PhanDeprecatedFunction No good replacements for getErrorsArray */
foreach ( $status->getErrorsArray() as $errorMessage ) {
if ( $errorMessage[ 0 ] === 'cirrussearch-parse-error' ) {
return true;
}
}
return false;
}
/**
* Log the completion of a request to Elasticsearch.
* @return int|null number of milliseconds it took to complete the request
*/
private function finishRequest() {
global $wgCirrusSearchLogElasticRequests;
if ( !$this->requestStart ) {
LoggerFactory::getInstance( 'CirrusSearch' )->warning(
'finishRequest called without staring a request'
);
return null;
}
$endTime = microtime( true );
$took = (int) ( ( $endTime - $this->requestStart ) * 1000 );
$clusterName = $this->connection->getClusterName();
$stats = MediaWikiServices::getInstance()->getStatsdDataFactory();
$stats->timing( "CirrusSearch.$clusterName.requestTime", $took );
$this->searchMetrics['wgCirrusStartTime'] = $this->requestStart;
$this->searchMetrics['wgCirrusEndTime'] = $endTime;
$logContext = $this->buildLogContext( $took, $this->connection->getClient() );
$type = isset( $logContext['queryType'] ) ? $logContext['queryType'] : 'unknown';
$stats->timing( "CirrusSearch.$clusterName.requestTimeMs.$type", $took );
if ( isset( $logContext['elasticTookMs'] ) ) {
$this->searchMetrics['wgCirrusElasticTime'] = $logContext['elasticTookMs'];
}
if ( $wgCirrusSearchLogElasticRequests ) {
$logMessage = $this->buildLogMessage( $logContext );
LoggerFactory::getInstance( 'CirrusSearchRequests' )->debug( $logMessage, $logContext );
if ( $this->slowMillis && $took >= $this->slowMillis ) {
if ( $this->user ) {
$logContext['user'] = $this->user->getName();
$logMessage .= ' for {user}';
}
LoggerFactory::getInstance( 'CirrusSearchSlowRequests' )->info( $logMessage, $logContext );
}
}
$this->requestStart = null;
return $took;
}
/**
* @param array $context Request specific log variables from self::buildLogContext()
* @return string a PSR-3 compliant message describing $context
*/
private function buildLogMessage( array $context ) {
// No need to check description because it must be set by $this->start.
$message = $this->description;
$message .= " against {index} took {tookMs} millis";
if ( isset( $context['elasticTookMs'] ) ) {
$message .= " and {elasticTookMs} Elasticsearch millis";
if ( isset( $context['elasticTook2PassMs'] ) ) {
$message .= " (with 2nd pass: {elasticTook2PassMs} ms)";
}
}
if ( isset( $context['hitsTotal'] ) ){
$message .= ". Found {hitsTotal} total results";
$message .= " and returned {hitsReturned} of them starting at {hitsOffset}";
}
if ( isset( $context['namespaces'] ) ) {
$namespaces = implode( ', ', $context['namespaces'] );
$message .= " within these namespaces: $namespaces";
}
if ( isset( $context['suggestion'] ) && strlen( $context['suggestion'] ) > 0 ) {
$message .= " and suggested '{suggestion}'";
}
$message .= ". Requested via {source} for {identity} by executor {executor}";
return $message;
}
/**
* These values end up serialized into Avro which has strict typing
* requirements. float !== int !== string.
*
* Note that this really only handles the "standard" search response
* format from elasticsearch. The completion suggester is a bit of a
* special snowflake in that it has a completely different response
* format than other searches. The CirrusSearch\CompletionSuggester
* class is responsible for providing any useful logging data by adding
* directly to $this->logContext.
*
* @param float $took Number of milliseconds the request took
* @param Client|null $client
* @return array
*/
private function buildLogContext( $took, Client $client = null ) {
global $wgCirrusSearchLogElasticRequests;
if ( $client ) {
$query = $client->getLastRequest();
$result = $client->getLastResponse();
} else {
$query = null;
$result = null;
}
$params = $this->logContext;
$this->logContext = [];
$params += [
'tookMs' => intval( $took ),
'source' => self::getExecutionContext(),
'executor' => self::getExecutionId(),
'identity' => self::generateIdentToken(),
];
if ( $result ) {
$queryData = $query->getData();
$resultData = $result->getData();
$index = explode( '/', $query->getPath() );
$params['index'] = $index[0];
if ( isset( $resultData[ 'took' ] ) ) {
$elasticTook = $resultData[ 'took' ];
$params['elasticTookMs'] = intval( $elasticTook );
}
if ( isset( $resultData['hits']['total'] ) ) {
$params['hitsTotal'] = intval( $resultData['hits']['total'] );
}
if ( isset( $resultData['hits']['max_score'] ) ) {
$params['maxScore'] = $resultData['hits']['max_score'];
}
if ( isset( $resultData['hits']['hits'] ) ) {
$num = count( $resultData['hits']['hits'] );
$offset = isset( $queryData['from'] ) ? $queryData['from'] : 0;
$params['hitsReturned'] = $num;
$params['hitsOffset'] = intval( $offset );
$params['hits'] = [];
foreach ( $resultData['hits']['hits'] as $hit ) {
if ( !isset( $hit['_source']['namespace'] )
|| !isset( $hit['_source']['title'] )
) {
// This is probably a query that does not return pages
// like geo or namespace queries
continue;
}
// duplication of work ... this happens in the transformation
// stage but we can't see that here...Perhaps we instead attach
// this data at a later stage like CompletionSuggester?
$title = Title::makeTitle( $hit['_source']['namespace'], $hit['_source']['title'] );
$params['hits'][] = [
// This *must* match the names and types of the CirrusSearchHit
// record in the CirrusSearchRequestSet logging channel avro schema.
'title' => (string) $title,
'index' => isset( $hit['_index'] ) ? $hit['_index'] : "",
'pageId' => isset( $hit['_id'] ) ? (int) $hit['_id'] : -1,
'score' => isset( $hit['_score'] ) ? (float) $hit['_score'] : -1,
// only comp_suggest has profileName, and that is handled
// elsewhere
'profileName' => "",
];
}
}
if ( $this->_isset( $queryData, [ 'query', 'filtered', 'filter', 'terms', 'namespace' ] ) ) {
$namespaces = $queryData['query']['filtered']['filter']['terms']['namespace'];
$params['namespaces'] = array_map( 'intval', $namespaces );
}
if ( isset( $resultData['suggest']['suggest'][0]['options'][0]['text'] ) ) {
$params['suggestion'] = $resultData['suggest']['suggest'][0]['options'][0]['text'];
}
}
if ( $wgCirrusSearchLogElasticRequests ) {
if ( count( self::$logContexts ) === 0 ) {
DeferredUpdates::addCallableUpdate( function () {
ElasticsearchIntermediary::reportLogContexts();
} );
}
self::$logContexts[] = $params;
}
return $params;
}
/**
* @param array $values
*/
static public function appendLastLogContext( array $values ) {
$idx = count( self::$logContexts ) - 1;
if ( $idx >= 0 ) {
self::$logContexts[$idx] += $values;
}
}
/**
* @return string The context the request is in. Either cli, api or web.
*/
static public function getExecutionContext() {
if ( php_sapi_name() === 'cli' ) {
return 'cli';
} elseif ( defined( 'MW_API' ) ) {
return 'api';
} else {
return 'web';
}
}
/**
* @param \Elastica\Exception\ExceptionInterface|null $exception
* @return array Two elements, first is Status object, second is string.
*/
private function extractMessageAndStatus( \Elastica\Exception\ExceptionInterface $exception = null ) {
if ( !$exception ) {
return [ Status::newFatal( 'cirrussearch-backend-error' ), '' ];
}
// Lots of times these are the same as getFullError(), but sometimes
// they're not. I'm looking at you PartialShardFailureException.
$error = self::extractFullError( $exception );
// These can be top level errors, or exceptions that don't extend from
// ResponseException like PartialShardFailureException or errors
// contacting the cluster.
if ( !isset( $error['root_cause'][0]['type'] ) ) {
return [
Status::newFatal( 'cirrussearch-backend-error' ),
$error['type'] . ': ' . $error['reason']
];
}
// We can have multiple root causes if the error is not the
// same on different shards. Errors will be deduplicated based
// on their type. Currently we display only the first one if
// it happens.
$cause = reset( $error['root_cause'] );
if ( $cause['type'] === 'query_parsing_exception' ) {
// The important part of the parse error message is embedded a few levels down
// and comes before the next new line so lets slurp it up and log it rather than
// the huge clump of error.
$shardFailure = reset( $error['failed_shards'] );
$message = $shardFailure['reason']['caused_by']['reason'];
$end = strpos( $message, "\n", 0 );
$parseError = substr( $message, 0, $end );
return [
Status::newFatal( 'cirrussearch-parse-error' ),
'Parse error on ' . $parseError
];
}
if ( $cause['type'] === 'too_complex_to_determinize_exception' ) {
return [ Status::newFatal(
'cirrussearch-regex-too-complex-error' ),
$cause['reason']
];
}
if ( preg_match( '/(^|_)regex_/', $cause['type'] ) ) {
$syntaxError = $cause['reason'];
$errorMessage = 'unknown';
$position = 'unknown';
// Note: we support only error coming from the extra plugin
// In the case Cirrus is installed without the plugin and
// is using the Groovy script to do regex then a generic backend error
// will be displayed.
$matches = [];
// In some cases elastic will serialize the exception by adding
// an extra message prefix with the exception type.
// If the exception is serialized through Transport:
// invalid_regex_exception: expected ']' at position 2
// Or if the exception is thrown locally by the node receiving the query:
// expected ']' at position 2
if ( preg_match( '/(?:[a-z_]+: )?(.+) at position (\d+)/', $syntaxError, $matches ) ) {
$errorMessage = $matches[ 1 ];
$position = $matches[ 2 ];
} else if ( $syntaxError === 'unexpected end-of-string' ) {
$errorMessage = 'regex too short to be correct';
}
$status = Status::newFatal( 'cirrussearch-regex-syntax-error', $errorMessage, $position );
return [ $status, 'Regex syntax error: ' . $syntaxError ];
}
return [
Status::newFatal( 'cirrussearch-backend-error' ),
$cause['type'] . ': ' . $cause['reason']
];
}
/**
* @param string $extraData Extra information to mix into the hash
* @return string A token that identifies the source of the request
*/
public static function generateIdentToken( $extraData = '' ) {
$request = \RequestContext::getMain()->getRequest();
return md5( implode( ':', [
$extraData,
$request->getIP(),
$request->getHeader( 'X-Forwarded-For' ),
$request->getHeader( 'User-Agent' ),
] ) );
}
/**
* Like isset, but wont fatal when one of the expected array keys in a
* multi-dimensional array is a string.
*
* Temporary hack required only for php 5.3. Can be removed when 5.4 is no
* longer a requirement. See T99871 for more details.
*
* @param array $array
* @param array $path
* @return bool
*/
private function _isset( $array, $path ) {
while( true ) {
$step = array_shift( $path );
if ( !isset( $array[$step] ) ) {
// next step of the path is non-existent
return false;
} elseif( !$path ) {
// reached the end of our path
return true;
} elseif ( !is_array( $array[$step] ) ) {
// more steps exist in the path, but we don't have an array
return false;
} else {
// keep looking
$array = $array[$step];
}
}
}
}