%PDF- %PDF-
Direktori : /www/varak.net/wiki.varak.net/extensions/CirrusSearch/includes/Query/ |
Current File : /www/varak.net/wiki.varak.net/extensions/CirrusSearch/includes/Query/RegexInSourceFeature.php |
<?php

namespace CirrusSearch\Query;

use CirrusSearch\Extra\Query\SourceRegex;
use CirrusSearch\SearchConfig;
use CirrusSearch\Search\SearchContext;
use Elastica\Query\AbstractQuery;

/**
 * Implements an insource: keyword supporting regular expression matching
 * against wikitext source. Works best when combined with the wikimedia-extra
 * plugin for elasticsearch, but can also fall back to a groovy based
 * implementation. Can be really expensive, but mostly ok if you have the extra
 * plugin enabled.
 *
 * Examples:
 *   insource:/abc?/
 *   insource:/abc?/i    (case insensitive)
 *   -insource:/abc?/    (negated)
 */
class RegexInSourceFeature implements KeywordFeature {
	/**
	 * @var bool Is this feature enabled?
	 */
	private $enabled;

	/**
	 * @var string Locale used for case conversions. It's important that this
	 *  matches the locale used for lowercasing in the ngram index.
	 */
	private $languageCode;

	/**
	 * @var array|null Configuration for the regex plugin: a mix of string
	 *  flags (e.g. 'use') and keyed options (e.g. 'max_inspect' => 10000).
	 *  Read from the 'regex' element of CirrusSearchWikimediaExtraPlugin.
	 */
	private $regexPlugin;

	/**
	 * @var int The maximum number of automaton states that Lucene's regex
	 *  compilation can expand to (even temporarily). Provides protection
	 *  against overloading the search cluster. Only works when using the
	 *  extra plugin, groovy based execution is unbounded.
	 */
	private $maxDeterminizedStates;

	/**
	 * @param SearchConfig $config Source of the CirrusSearchEnableRegex,
	 *  LanguageCode, CirrusSearchWikimediaExtraPlugin and
	 *  CirrusSearchRegexMaxDeterminizedStates settings.
	 */
	public function __construct( SearchConfig $config ) {
		$this->enabled = $config->get( 'CirrusSearchEnableRegex' );
		$this->languageCode = $config->get( 'LanguageCode' );
		$this->regexPlugin = $config->getElement( 'CirrusSearchWikimediaExtraPlugin', 'regex' );
		$this->maxDeterminizedStates = $config->get( 'CirrusSearchRegexMaxDeterminizedStates' );
	}

	/**
	 * Looks for insource:/regex/ clauses in the term and registers the
	 * matching filter (or negated filter) on the search context.
	 *
	 * The matched syntax is: an optional leading "-" (negation), the literal
	 * "insource:/", a pattern made of escaped characters or any character
	 * other than "/" and "\", the closing "/", an optional "i" flag and an
	 * optional single trailing space.
	 *
	 * @param SearchContext $context
	 * @param string $term
	 * @return string Whatever QueryHelper::extractSpecialSyntaxFromTerm()
	 *  returns for the term (presumably the term with the handled clauses
	 *  rewritten - confirm against QueryHelper).
	 */
	public function apply( SearchContext $context, $term ) {
		return QueryHelper::extractSpecialSyntaxFromTerm(
			$context,
			$term,
			'/(?<not>-)?insource:\/(?<pattern>(?:[^\\\\\/]|\\\\.)+)\/(?<insensitive>i)? ?/',
			function ( $matches ) use ( $context ) {
				if ( !$this->enabled ) {
					return '';
				}

				$context->addSyntaxUsed( 'regex' );
				$context->setSearchType( 'regex' );
				$insensitive = !empty( $matches['insensitive'] );

				// Strict comparison is required here: the plugin config mixes
				// string flags with keyed numeric/boolean options, and a loose
				// in_array() would treat e.g. `true` as equal to 'use'.
				$usePlugin = is_array( $this->regexPlugin )
					&& in_array( 'use', $this->regexPlugin, true );

				$filter = $usePlugin
					? $this->buildRegexWithPlugin( $matches['pattern'], $insensitive )
					: $this->buildRegexWithGroovy( $matches['pattern'], $insensitive );

				if ( empty( $matches['not'] ) ) {
					$context->addFilter( $filter );
					// Only positive matches are registered for highlighting.
					$context->addHighlightSource( [
						'pattern' => $matches['pattern'],
						'locale' => $this->languageCode,
						'insensitive' => $insensitive,
					] );
				} else {
					$context->addNotFilter( $filter );
				}
			}
		);
	}

	/**
	 * Builds a regular expression query using the wikimedia-extra plugin.
	 *
	 * Reads the optional 'max_inspect', 'max_ngrams_extracted' and
	 * 'max_ngram_clauses' keys of the plugin configuration without modifying
	 * the stored configuration.
	 *
	 * @param string $pattern The regular expression to match
	 * @param bool $insensitive Should the match be case insensitive?
	 * @return AbstractQuery Regular expression query
	 */
	private function buildRegexWithPlugin( $pattern, $insensitive ) {
		$filter = new SourceRegex( $pattern, 'source_text', 'source_text.trigram' );

		// Fall back to 10000 when max_inspect is not configured (or is null).
		$maxInspect = isset( $this->regexPlugin['max_inspect'] )
			? $this->regexPlugin['max_inspect']
			: 10000;
		$filter->setMaxInspect( $maxInspect );
		$filter->setMaxDeterminizedStates( $this->maxDeterminizedStates );

		if ( isset( $this->regexPlugin['max_ngrams_extracted'] ) ) {
			$filter->setMaxNgramsExtracted( $this->regexPlugin['max_ngrams_extracted'] );
		}
		if (
			isset( $this->regexPlugin['max_ngram_clauses'] )
			&& is_numeric( $this->regexPlugin['max_ngram_clauses'] )
		) {
			$filter->setMaxNgramClauses( (int)$this->regexPlugin['max_ngram_clauses'] );
		}

		$filter->setCaseSensitive( !$insensitive );
		$filter->setLocale( $this->languageCode );

		return $filter;
	}

	/**
	 * Builds a regular expression query using groovy. It's significantly less
	 * good than the wikimedia-extra plugin, but it's something.
	 *
	 * The script wraps the pattern as ".*(pattern).*" so it can match anywhere
	 * in the source text, and lowercases both pattern and text with the
	 * configured language's locale when the match is case insensitive.
	 *
	 * @param string $pattern The regular expression to match
	 * @param bool $insensitive Should the match be case insensitive?
	 * @return AbstractQuery Regular expression query
	 */
	private function buildRegexWithGroovy( $pattern, $insensitive ) {
		$script = <<<GROOVY
import org.apache.lucene.util.automaton.*;
sourceText = _source.get("source_text");
if (sourceText == null) {
    false;
} else {
    if (automaton == null) {
        if (insensitive) {
            locale = new Locale(language);
            pattern = pattern.toLowerCase(locale);
        }
        regexp = new RegExp(pattern, RegExp.ALL ^ RegExp.AUTOMATON);
        automaton = new CharacterRunAutomaton(regexp.toAutomaton());
    }
    if (insensitive) {
        sourceText = sourceText.toLowerCase(locale);
    }
    automaton.run(sourceText);
}

GROOVY;

		return new \Elastica\Query\Script( new \Elastica\Script\Script(
			$script,
			[
				'pattern' => '.*(' . $pattern . ').*',
				'insensitive' => $insensitive,
				'language' => $this->languageCode,
				// The null here creates a slot in which the script will shove
				// an automaton while executing.
				'automaton' => null,
				'locale' => null,
			],
			'groovy'
		) );
	}
}