%PDF- %PDF-
| Direktori : /www/varak.net/wiki.varak.net/extensions/CirrusSearch/includes/Maintenance/ |
| Current File : /www/varak.net/wiki.varak.net/extensions/CirrusSearch/includes/Maintenance/MappingConfigBuilder.php |
<?php
namespace CirrusSearch\Maintenance;
use CirrusSearch\Search\CirrusIndexField;
use CirrusSearch\Search\IntegerIndexField;
use CirrusSearch\Search\KeywordIndexField;
use CirrusSearch\SearchConfig;
use CirrusSearch\Search\TextIndexField;
use Hooks;
use MediaWiki\MediaWikiServices;
use SearchIndexField;
/**
* Builds elasticsearch mapping configuration arrays.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
* http://www.gnu.org/copyleft/gpl.html
*/
class MappingConfigBuilder {
// Bit field parameters for buildConfig(); the flags are tested with bitwise AND,
// so several of them can be combined with bitwise OR.
// When set, the title sub-fields additionally get a 'word_prefix' analyzer
// (searched with 'plain_search').
const PREFIX_START_WITH_ANY = 1;
// When set, the 'text' field is built with an extra 'suggest' analyzer.
const PHRASE_SUGGEST_USE_TEXT = 2;
// Added by buildConfig() itself when the builder was constructed with
// $optimizeForExperimentalHighlighter set; handed to the fields through
// setMappingFlags().
const OPTIMIZE_FOR_EXPERIMENTAL_HIGHLIGHTER = 4;
/**
 * Version number for the core analysis. Increment the major
 * version when the analysis changes in an incompatible way,
 * and increment the minor version when it changes in a way
 * that stays compatible.
 */
const VERSION = '1.9';
/**
 * @var bool should the index be optimized for the experimental highlighter?
 *  When true, buildConfig() always adds OPTIMIZE_FOR_EXPERIMENTAL_HIGHLIGHTER
 *  to the flags it was given.
 */
private $optimizeForExperimentalHighlighter;
/**
 * @var SearchConfig configuration handed to every field this class builds
 *  and used for the similarity lookups
 */
private $config;
/**
 * @var \CirrusSearch engine created in the constructor with $config applied;
 *  it is passed to each field's getMapping() and asked for the search index fields
 */
private $engine;
/**
 * Prepare the builder together with the engine used to render field mappings.
 *
 * @param bool $optimizeForExperimentalHighlighter whether buildConfig() should
 *  always add the OPTIMIZE_FOR_EXPERIMENTAL_HIGHLIGHTER flag
 * @param SearchConfig|null $config configuration to build with; when omitted
 *  the 'CirrusSearch' config is requested from the MediaWiki config factory
 */
public function __construct( $optimizeForExperimentalHighlighter, SearchConfig $config = null ) {
	if ( $config === null ) {
		// No explicit configuration: fall back to the one registered for the extension.
		$config = MediaWikiServices::getInstance()
			->getConfigFactory()
			->makeConfig( 'CirrusSearch' );
	}

	// The engine must carry the same configuration as the fields built later on.
	$engine = new \CirrusSearch();
	$engine->setConfig( $config );

	$this->optimizeForExperimentalHighlighter = $optimizeForExperimentalHighlighter;
	$this->config = $config;
	$this->engine = $engine;
}
/**
 * Build the mapping for the fields that are always present in the index.
 *
 * @param int $flags bit field of the class flag constants
 *  (PREFIX_START_WITH_ANY, PHRASE_SUGGEST_USE_TEXT, ...)
 * @return array mapping with the keys 'dynamic', '_all' and 'properties'
 */
private function getDefaultFields( $flags ) {
	global $wgCirrusSearchWikimediaExtraPlugin;

	$engine = $this->engine;

	// Never declare anything as type='object' in this method: elasticsearch
	// does not return it and infers it anyway.

	// Options shared by every always-present title sub-field.
	$positionsNoNorms = [
		'index_options' => 'positions',
		'norms' => [ 'enabled' => false ],
	];
	$titleExtraAnalyzers = [
		[ 'analyzer' => 'prefix', 'search_analyzer' => 'near_match' ] + $positionsNoNorms,
		[
			'analyzer' => 'prefix_asciifolding',
			'search_analyzer' => 'near_match_asciifolding',
		] + $positionsNoNorms,
		[ 'analyzer' => 'near_match' ] + $positionsNoNorms,
		[ 'analyzer' => 'near_match_asciifolding' ] + $positionsNoNorms,
		[ 'analyzer' => 'keyword' ] + $positionsNoNorms,
	];
	if ( ( $flags & self::PREFIX_START_WITH_ANY ) !== 0 ) {
		// Unlike the entries above this sub-field does not override norms.
		$titleExtraAnalyzers[] = [
			'analyzer' => 'word_prefix',
			'search_analyzer' => 'plain_search',
			'index_options' => 'positions'
		];
	}

	// The trigram sub-field is only added when the regex feature lists 'build'.
	$buildTrigrams = isset( $wgCirrusSearchWikimediaExtraPlugin[ 'regex' ] ) &&
		in_array( 'build', $wgCirrusSearchWikimediaExtraPlugin[ 'regex' ] );
	$sourceExtraAnalyzers = $buildTrigrams
		? [ [ 'analyzer' => 'trigram', 'index_options' => 'positions' ] ]
		: [];

	$suggestField = [
		'type' => 'string',
		'similarity' => TextIndexField::getSimilarity( $this->config, 'suggest' ),
		'index_options' => 'positions',
		'analyzer' => 'suggest',
	];
	if ( $this->config->getElement( 'CirrusSearchPhraseSuggestReverseField', 'build' ) ) {
		$reverseField = [
			'type' => 'string',
			'similarity' => TextIndexField::getSimilarity( $this->config, 'suggest', 'reverse' ),
			'index_options' => 'positions',
			'analyzer' => 'suggest_reverse',
		];
		$suggestField['fields'] = [ 'reverse' => $reverseField ];
	}

	$titleOptions = TextIndexField::ENABLE_NORMS | TextIndexField::COPY_TO_SUGGEST;
	$redirectTitleOptions = $titleOptions | TextIndexField::SPEED_UP_HIGHLIGHTING;

	// NOTE(review): unlike the title/source_text extras this is a single flat
	// analyzer definition rather than a list of definitions - confirm that
	// TextIndexField accepts this shape.
	$textExtra = ( $flags & self::PHRASE_SUGGEST_USE_TEXT ) ? [ 'analyzer' => 'suggest' ] : [];
	// Merged into the generated 'text' mapping as an additional sub-field.
	$wordCountField = [
		'type' => 'token_count',
		'store' => true,
		'analyzer' => 'plain',
	];

	$properties = [
		'timestamp' => [
			'type' => 'date',
			'format' => 'dateOptionalTime',
		],
		'wiki' => $this->buildKeywordField( 'wiki' )->getMapping( $engine ),
		'namespace' => $this->buildLongField( 'namespace' )->getMapping( $engine ),
		'namespace_text' => $this->buildKeywordField( 'namespace_text' )->getMapping( $engine ),
		'title' => $this->buildStringField( 'title', $titleOptions, $titleExtraAnalyzers )
			->setMappingFlags( $flags )
			->getMapping( $engine ),
		'text' => array_merge_recursive(
			$this->buildStringField( 'text', null, $textExtra )
				->setMappingFlags( $flags )
				->getMapping( $engine ),
			[ 'fields' => [ 'word_count' => $wordCountField ] ]
		),
		'text_bytes' => $this->buildLongField( 'text_bytes' )
			->setFlag( SearchIndexField::FLAG_NO_INDEX )
			->getMapping( $engine ),
		'source_text' => $this->buildStringField( 'source_text', 0, $sourceExtraAnalyzers )
			->setMappingFlags( $flags )
			->getMapping( $engine ),
		'redirect' => [
			'dynamic' => false,
			'properties' => [
				'namespace' => $this->buildLongField( 'namespace' )->getMapping( $engine ),
				'title' => $this->buildStringField(
					'redirect.title', $redirectTitleOptions, $titleExtraAnalyzers
				)->setMappingFlags( $flags )->getMapping( $engine ),
			]
		],
		'incoming_links' => $this->buildLongField( 'incoming_links' )->getMapping( $engine ),
		'local_sites_with_dupe' => $this->buildKeywordField( 'local_sites_with_dupe' )
			->setFlag( SearchIndexField::FLAG_CASEFOLD )
			->getMapping( $engine ),
		'suggest' => $suggestField,
		// FIXME: this should be moved to Wikibase Client
		'wikibase_item' => $this->buildKeywordField( 'wikibase_item' )->getMapping( $engine ),
	];

	return [
		'dynamic' => false,
		'_all' => [ 'enabled' => false ],
		'properties' => $properties,
	];
}
/**
 * Build the mapping config.
 *
 * The 'page' mapping starts from the default fields and is extended with every
 * field reported by the search engine. When $wgCirrusSearchAllFields['build']
 * is truthy the 'all' and 'all_near_match' fields are added and fed through
 * copy_to. The finished config is handed to the CirrusSearchMappingConfig hook
 * (by reference) before it is returned.
 *
 * @param int $flags Flags for building the configuration (bit field of the
 *  class flag constants)
 * @return array the mapping config with a 'page' and a 'namespace' entry
 */
public function buildConfig( $flags = 0 ) {
	global $wgCirrusSearchAllFields,
		$wgCirrusSearchWeights;

	if ( $this->optimizeForExperimentalHighlighter ) {
		$flags |= self::OPTIMIZE_FOR_EXPERIMENTAL_HIGHLIGHTER;
	}
	$page = $this->getDefaultFields( $flags );

	$fields = $this->engine->getSearchIndexFields();
	foreach ( $fields as $fieldName => $field ) {
		if ( $field instanceof CirrusIndexField ) {
			$field->setMappingFlags( $flags );
		}
		// Use a dedicated variable here: the per-field mapping must never end
		// up in the config array that is returned at the end of this method.
		$fieldMapping = $field->getMapping( $this->engine );
		if ( $fieldMapping ) {
			$page['properties'][$fieldName] = $fieldMapping;
		}
	}

	if ( $wgCirrusSearchAllFields[ 'build' ] ) {
		// Now layer all the fields into the all field once per weight. Querying it isn't strictly
		// the same as querying each field - in some ways it is better, in others it is worse.
		// Better because theoretically tf/idf based scoring works better this way.
		// Worse because we have to analyze each field multiple times.
		// This field can't be used for the fvh/experimental highlighter for several reasons:
		// 1. It is built with copy_to and not stored.
		// 2. The term frequency information is skewed compared to the "real" source text.
		$allField = $this->buildStringField( 'all', TextIndexField::ENABLE_NORMS );
		$page['properties']['all'] =
			$allField->setMappingFlags( $flags )->getMapping( $this->engine );
		$page = $this->setupCopyTo( $page, $wgCirrusSearchWeights, 'all' );

		// Now repeat for near_match fields. The same considerations as above apply, except that
		// near_match is never used in phrase queries or highlighting.
		$page[ 'properties' ][ 'all_near_match' ] = [
			'type' => 'string',
			'analyzer' => 'near_match',
			'index_options' => 'positions',
			'position_increment_gap' => TextIndexField::POSITION_INCREMENT_GAP,
			'norms' => [ 'enabled' => false ],
			'similarity' => TextIndexField::getSimilarity( $this->config, 'all_near_match' ),
			'fields' => [
				'asciifolding' => [
					'type' => 'string',
					'analyzer' => 'near_match_asciifolding',
					'index_options' => 'positions',
					'position_increment_gap' => TextIndexField::POSITION_INCREMENT_GAP,
					'norms' => [ 'enabled' => false ],
					'similarity' => TextIndexField::getSimilarity(
						$this->config, 'all_near_match', 'asciifolding'
					),
				],
			],
		];
		// Only titles and redirect titles feed the near match field.
		$nearMatchFields = [
			'title' => $wgCirrusSearchWeights[ 'title' ],
			'redirect' => $wgCirrusSearchWeights[ 'redirect' ],
		];
		$page = $this->setupCopyTo( $page, $nearMatchFields, 'all_near_match' );
	}

	// Start from a fresh array so that only the two mapping types are present.
	$config = [
		'page' => $page,
		'namespace' => [
			'dynamic' => false,
			'_all' => [ 'enabled' => false ],
			'properties' => [
				'name' => [
					'type' => 'string',
					'analyzer' => 'near_match_asciifolding',
					'norms' => [ 'enabled' => false ],
					'index_options' => 'positions',
					'ignore_above' => KeywordIndexField::KEYWORD_IGNORE_ABOVE,
				],
				'wiki' => $this->buildKeywordField( 'wiki' )->getMapping( $this->engine ),
			],
		],
	];

	// Hook handlers receive $config by reference and may therefore alter it.
	Hooks::run( 'CirrusSearchMappingConfig', [ &$config, $this ] );
	return $config;
}
/**
 * Register $destination as a copy_to target of the given fields.
 *
 * The destination is appended once per unit of weight, so a field with
 * weight 3 is copied three times.
 *
 * @param array $config mapping to extend (expects a 'properties' key)
 * @param array $fields field name to number of times copied
 * @param string $destination destination of the copy
 * @return array $config with the copy_to entries appended
 */
private function setupCopyTo( $config, $fields, $destination ) {
	foreach ( $fields as $name => $copies ) {
		$isRedirect = ( $name === 'redirect' );
		// Weights that are not whole numbers are effectively rounded up, because
		// the loop keeps running while the counter is below the weight. That is
		// accepted: a field cannot be copied a fractional number of times.
		for ( $done = 0; $done < $copies; ++$done ) {
			if ( !$isRedirect ) {
				$config[ 'properties' ][ $name ][ 'copy_to' ][] = $destination;
				continue;
			}
			// Redirects are nested: the text to copy lives in redirect.title.
			$config[ 'properties' ][ 'redirect' ][ 'properties' ][ 'title' ][ 'copy_to' ][] =
				$destination;
		}
	}

	return $config;
}
/**
 * Create a text field that gets the standard analysis for the language.
 *
 * @param string $fieldName the field name
 * @param int|null $options Field options, a combination of the TextIndexField constants:
 *   ENABLE_NORMS: Enable norms on the field. Good for text you search against, but bad
 *     for array fields and useless for fields that don't get involved in the score.
 *   COPY_TO_SUGGEST: Copy the contents of this field to the suggest field for "Did you mean".
 *   SPEED_UP_HIGHLIGHTING: Store extra data in the field to speed up highlighting. This is
 *     important for long strings or fields with many values.
 * @param array $extra Extra analyzers for this field beyond the basic text and plain.
 * @return TextIndexField definition of the field
 */
protected function buildStringField( $fieldName, $options = null, $extra = [] ) {
	$textField = new TextIndexField(
		$fieldName,
		SearchIndexField::INDEX_TYPE_TEXT,
		$this->config,
		$extra
	);
	// The options are applied after construction, through the field's setter.
	$textField->setTextOptions( $options );

	return $textField;
}
/**
 * Create an integer (long) field bound to this builder's configuration.
 *
 * @param string $name Field name
 * @return IntegerIndexField
 */
protected function buildLongField( $name ) {
	$longField = new IntegerIndexField(
		$name,
		SearchIndexField::INDEX_TYPE_INTEGER,
		$this->config
	);

	return $longField;
}
/**
 * Create a keyword field bound to this builder's configuration.
 *
 * @param string $name Field name
 * @return KeywordIndexField
 */
protected function buildKeywordField( $name ) {
	return new KeywordIndexField( $name, SearchIndexField::INDEX_TYPE_KEYWORD, $this->config );
}
}