%PDF- %PDF-
Direktori : /www/varak.net/wiki.varak.net/extensions/CirrusSearch/includes/Maintenance/ |
Current File : /www/varak.net/wiki.varak.net/extensions/CirrusSearch/includes/Maintenance/MappingConfigBuilder.php |
<?php

namespace CirrusSearch\Maintenance;

use CirrusSearch\Search\CirrusIndexField;
use CirrusSearch\Search\IntegerIndexField;
use CirrusSearch\Search\KeywordIndexField;
use CirrusSearch\SearchConfig;
use CirrusSearch\Search\TextIndexField;
use Hooks;
use MediaWiki\MediaWikiServices;
use SearchIndexField;

/**
 * Builds elasticsearch mapping configuration arrays.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 * http://www.gnu.org/copyleft/gpl.html
 */
class MappingConfigBuilder {
	// Bit field parameters for buildConfig
	const PREFIX_START_WITH_ANY = 1;
	const PHRASE_SUGGEST_USE_TEXT = 2;
	const OPTIMIZE_FOR_EXPERIMENTAL_HIGHLIGHTER = 4;

	/**
	 * Version number for the core analysis. Increment the major
	 * version when the analysis changes in an incompatible way,
	 * and change the minor version when it changes but isn't
	 * incompatible
	 */
	const VERSION = '1.9';

	/**
	 * @var bool should the index be optimized for the experimental highlighter?
	 */
	private $optimizeForExperimentalHighlighter;

	/**
	 * @var SearchConfig
	 */
	private $config;

	/**
	 * @var \CirrusSearch
	 */
	private $engine;

	/**
	 * Constructor
	 * @param bool $optimizeForExperimentalHighlighter should the index be optimized
	 *  for the experimental highlighter?
	 * @param SearchConfig|null $config configuration to build the mapping from; when
	 *  omitted the 'CirrusSearch' config is fetched from the MediaWiki config factory
	 */
	public function __construct( $optimizeForExperimentalHighlighter, SearchConfig $config = null ) {
		$this->optimizeForExperimentalHighlighter = $optimizeForExperimentalHighlighter;
		if ( $config === null ) {
			$config = MediaWikiServices::getInstance()->getConfigFactory()->makeConfig( 'CirrusSearch' );
		}
		$this->config = $config;
		// The engine is handed to every field's getMapping() call below.
		$this->engine = new \CirrusSearch();
		$this->engine->setConfig( $config );
	}

	/**
	 * Get definitions for default index fields.
	 * These fields are always present in the index.
	 * @param int $flags bit field of the class constants (PREFIX_START_WITH_ANY, ...)
	 * @return array mapping for the page type ('dynamic', '_all' and 'properties' keys)
	 */
	private function getDefaultFields( $flags ) {
		global $wgCirrusSearchWikimediaExtraPlugin;

		// Note never to set something as type='object' here because that isn't returned by elasticsearch
		// and is inferred anyway.
		$titleExtraAnalyzers = [
			[
				'analyzer' => 'prefix',
				'search_analyzer' => 'near_match',
				'index_options' => 'positions',
				'norms' => [ 'enabled' => false ]
			],
			[
				'analyzer' => 'prefix_asciifolding',
				'search_analyzer' => 'near_match_asciifolding',
				'index_options' => 'positions',
				'norms' => [ 'enabled' => false ]
			],
			[
				'analyzer' => 'near_match',
				'index_options' => 'positions',
				'norms' => [ 'enabled' => false ]
			],
			[
				'analyzer' => 'near_match_asciifolding',
				'index_options' => 'positions',
				'norms' => [ 'enabled' => false ]
			],
			[
				'analyzer' => 'keyword',
				'index_options' => 'positions',
				'norms' => [ 'enabled' => false ]
			],
		];
		if ( $flags & self::PREFIX_START_WITH_ANY ) {
			$titleExtraAnalyzers[] = [
				'analyzer' => 'word_prefix',
				'search_analyzer' => 'plain_search',
				'index_options' => 'positions'
			];
		}

		// The trigram sub-field is only added when the regex feature is configured with 'build'.
		$sourceExtraAnalyzers = [];
		if ( isset( $wgCirrusSearchWikimediaExtraPlugin[ 'regex' ] ) &&
			in_array( 'build', $wgCirrusSearchWikimediaExtraPlugin[ 'regex' ], true )
		) {
			$sourceExtraAnalyzers[] = [
				'analyzer' => 'trigram',
				'index_options' => 'positions',
			];
		}

		$suggestField = [
			'type' => 'string',
			'similarity' => TextIndexField::getSimilarity( $this->config, 'suggest' ),
			'index_options' => 'positions',
			'analyzer' => 'suggest',
		];
		if ( $this->config->getElement( 'CirrusSearchPhraseSuggestReverseField', 'build' ) ) {
			$suggestField['fields'] = [
				'reverse' => [
					'type' => 'string',
					'similarity' => TextIndexField::getSimilarity( $this->config, 'suggest', 'reverse' ),
					'index_options' => 'positions',
					'analyzer' => 'suggest_reverse',
				],
			];
		}

		$page = [
			'dynamic' => false,
			'_all' => [ 'enabled' => false ],
			'properties' => [
				'timestamp' => [
					'type' => 'date',
					'format' => 'dateOptionalTime',
				],
				'wiki' => $this->buildKeywordField( 'wiki' )->getMapping( $this->engine ),
				'namespace' => $this->buildLongField( 'namespace' )->getMapping( $this->engine ),
				'namespace_text' => $this->buildKeywordField( 'namespace_text' )
					->getMapping( $this->engine ),
				'title' => $this->buildStringField( 'title',
					TextIndexField::ENABLE_NORMS | TextIndexField::COPY_TO_SUGGEST,
					$titleExtraAnalyzers )->setMappingFlags( $flags )->getMapping( $this->engine ),
				// NOTE(review): every other caller passes $extra as a list of analyzer
				// definitions, whereas this passes a single flat definition - confirm
				// that TextIndexField handles this shape as intended.
				'text' => array_merge_recursive(
					$this->buildStringField( 'text', null,
						( $flags & self::PHRASE_SUGGEST_USE_TEXT ) ? [ 'analyzer' => 'suggest' ] : [ ] )
						->setMappingFlags( $flags )->getMapping( $this->engine ),
					[
						'fields' => [
							'word_count' => [
								'type' => 'token_count',
								'store' => true,
								'analyzer' => 'plain',
							]
						]
					]
				),
				'text_bytes' => $this->buildLongField( 'text_bytes' )
					->setFlag( SearchIndexField::FLAG_NO_INDEX )
					->getMapping( $this->engine ),
				'source_text' => $this->buildStringField( 'source_text', 0, $sourceExtraAnalyzers )
					->setMappingFlags( $flags )->getMapping( $this->engine ),
				'redirect' => [
					'dynamic' => false,
					'properties' => [
						'namespace' => $this->buildLongField( 'namespace' )
							->getMapping( $this->engine ),
						'title' => $this->buildStringField( 'redirect.title',
							TextIndexField::ENABLE_NORMS
							| TextIndexField::SPEED_UP_HIGHLIGHTING
							| TextIndexField::COPY_TO_SUGGEST,
							$titleExtraAnalyzers )
							->setMappingFlags( $flags )
							->getMapping( $this->engine ),
					]
				],
				'incoming_links' => $this->buildLongField( 'incoming_links' )
					->getMapping( $this->engine ),
				'local_sites_with_dupe' => $this->buildKeywordField( 'local_sites_with_dupe' )
					->setFlag( SearchIndexField::FLAG_CASEFOLD )
					->getMapping( $this->engine ),
				'suggest' => $suggestField,
				// FIXME: this should be moved to Wikibase Client
				'wikibase_item' => $this->buildKeywordField( 'wikibase_item' )
					->getMapping( $this->engine ),
			]
		];

		return $page;
	}

	/**
	 * Build the mapping config.
	 * @param int $flags Flags for building the configuration
	 * @return array the mapping config, keyed by mapping type ('page', 'namespace');
	 *  CirrusSearchMappingConfig hook handlers may alter it before it is returned
	 */
	public function buildConfig( $flags = 0 ) {
		global $wgCirrusSearchAllFields, $wgCirrusSearchWeights;

		if ( $this->optimizeForExperimentalHighlighter ) {
			$flags |= self::OPTIMIZE_FOR_EXPERIMENTAL_HIGHLIGHTER;
		}
		$page = $this->getDefaultFields( $flags );

		$fields = $this->engine->getSearchIndexFields();
		foreach ( $fields as $fieldName => $field ) {
			if ( $field instanceof CirrusIndexField ) {
				$field->setMappingFlags( $flags );
			}
			// Kept in its own variable on purpose: reusing the variable that later holds
			// the result would leak the last field's mapping keys into the returned config.
			$fieldMapping = $field->getMapping( $this->engine );
			if ( $fieldMapping ) {
				$page['properties'][$fieldName] = $fieldMapping;
			}
		}

		// !empty() rather than a bare index read so an unset 'build' key raises no notice.
		if ( !empty( $wgCirrusSearchAllFields[ 'build' ] ) ) {
			// Now layer all the fields into the all field once per weight. Querying it isn't strictly the
			// same as querying each field - in some ways it is better! In others it is worse....

			// Better because theoretically tf/idf based scoring works better this way.
			// Worse because we have to analyze each field multiple times.... Bleh!
			// This field can't be used for the fvh/experimental highlighter for several reasons:
			// 1. It is built with copy_to and not stored.
			// 2. The term frequency information is all whoppy compared to the "real" source text.
			$allField = $this->buildStringField( 'all', TextIndexField::ENABLE_NORMS );
			$page['properties']['all'] =
				$allField->setMappingFlags( $flags )->getMapping( $this->engine );
			$page = $this->setupCopyTo( $page, $wgCirrusSearchWeights, 'all' );

			// Now repeat for near_match fields. The same considerations above apply except near_match
			// is never used in phrase queries or highlighting.
			$page[ 'properties' ][ 'all_near_match' ] = [
				'type' => 'string',
				'analyzer' => 'near_match',
				'index_options' => 'positions',
				'position_increment_gap' => TextIndexField::POSITION_INCREMENT_GAP,
				'norms' => [ 'enabled' => false ],
				'similarity' => TextIndexField::getSimilarity( $this->config, 'all_near_match' ),
				'fields' => [
					'asciifolding' => [
						'type' => 'string',
						'analyzer' => 'near_match_asciifolding',
						'index_options' => 'positions',
						'position_increment_gap' => TextIndexField::POSITION_INCREMENT_GAP,
						'norms' => [ 'enabled' => false ],
						'similarity' => TextIndexField::getSimilarity(
							$this->config, 'all_near_match', 'asciifolding' ),
					],
				],
			];
			$nearMatchFields = [
				'title' => $wgCirrusSearchWeights[ 'title' ],
				'redirect' => $wgCirrusSearchWeights[ 'redirect' ],
			];
			$page = $this->setupCopyTo( $page, $nearMatchFields, 'all_near_match' );
		}

		// Start from a fresh array so the result holds exactly the mapping types defined here.
		$config = [
			'page' => $page,
			'namespace' => [
				'dynamic' => false,
				'_all' => [ 'enabled' => false ],
				'properties' => [
					'name' => [
						'type' => 'string',
						'analyzer' => 'near_match_asciifolding',
						'norms' => [ 'enabled' => false ],
						'index_options' => 'positions',
						'ignore_above' => KeywordIndexField::KEYWORD_IGNORE_ABOVE,
					],
					'wiki' => $this->buildKeywordField( 'wiki' )->getMapping( $this->engine ),
				],
			],
		];

		Hooks::run( 'CirrusSearchMappingConfig', [ &$config, $this ] );
		return $config;
	}

	/**
	 * Setup copy_to for some fields to $destination.
	 * @param array $config to modify
	 * @param array $fields field name to number of times copied
	 * @param string $destination destination of the copy
	 * @return array $config modified with the copy_to setup
	 */
	private function setupCopyTo( $config, $fields, $destination ) {
		foreach ( $fields as $field => $weight ) {
			// Note that this loop causes weights that are not whole numbers to be rounded up
			// (1.5 yields two copies). We're ok with that because we don't have a choice.
			for ( $r = 0; $r < $weight; $r++ ) {
				if ( $field === 'redirect' ) {
					// Redirect is in a funky place: its text lives in the nested title sub-property.
					$config[ 'properties' ][ 'redirect' ][ 'properties' ][ 'title' ][ 'copy_to' ][] =
						$destination;
				} else {
					$config[ 'properties' ][ $field ][ 'copy_to' ][] = $destination;
				}
			}
		}

		return $config;
	}

	/**
	 * Build a string field that does standard analysis for the language.
	 * @param string $fieldName the field name
	 * @param int|null $options Field options:
	 *   ENABLE_NORMS: Enable norms on the field.  Good for text you search against but bad for array fields and useless
	 *     for fields that don't get involved in the score.
	 *   COPY_TO_SUGGEST: Copy the contents of this field to the suggest field for "Did you mean".
	 *   SPEED_UP_HIGHLIGHTING: Store extra data in the field to speed up highlighting.  This is important for long
	 *     strings or fields with many values.
	 * @param array $extra Extra analyzers for this field beyond the basic text and plain.
	 * @return TextIndexField definition of the field
	 */
	protected function buildStringField( $fieldName, $options = null, $extra = [] ) {
		$field = new TextIndexField( $fieldName, SearchIndexField::INDEX_TYPE_TEXT, $this->config, $extra );
		$field->setTextOptions( $options );
		return $field;
	}

	/**
	 * Create a long field.
	 * @param string $name Field name
	 * @return IntegerIndexField
	 */
	protected function buildLongField( $name ) {
		return new IntegerIndexField( $name, SearchIndexField::INDEX_TYPE_INTEGER, $this->config );
	}

	/**
	 * Create a keyword field.
	 * @param string $name Field name
	 * @return KeywordIndexField
	 */
	protected function buildKeywordField( $name ) {
		return new KeywordIndexField( $name, SearchIndexField::INDEX_TYPE_KEYWORD, $this->config );
	}
}