%PDF- %PDF-
| Direktori : /www/varak.net/wiki.varak.net/extensions/CirrusSearch/ |
| Current File : //www/varak.net/wiki.varak.net/extensions/CirrusSearch/CirrusSearch.php |
<?php
/**
* CirrusSearch - Searching for MediaWiki with Elasticsearch.
*
* Set $wgSearchType to 'CirrusSearch'
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
* http://www.gnu.org/copyleft/gpl.html
*/
require_once __DIR__ . "/profiles/SuggestProfiles.php";
require_once __DIR__ . "/profiles/PhraseSuggesterProfiles.config.php";
require_once __DIR__ . "/profiles/RescoreProfiles.config.php";
require_once __DIR__ . "/profiles/SimilarityProfiles.php";
require_once __DIR__ . "/profiles/SaneitizeProfiles.php";
require_once __DIR__ . "/profiles/FullTextQueryBuilderProfiles.config.php";
// Append this extension's metadata (name, authors, description message key,
// URL, version and license) to the 'other' group of $wgExtensionCredits.
$wgExtensionCredits['other'][] = [
'path' => __FILE__,
'name' => 'CirrusSearch',
'author' => [ 'Nik Everett', 'Chad Horohoe', 'Erik Bernhardson' ],
'descriptionmsg' => 'cirrussearch-desc',
'url' => 'https://www.mediawiki.org/wiki/Extension:CirrusSearch',
'version' => '0.2',
'license-name' => 'GPL-2.0+'
];
/**
* Configuration
*/
// Default cluster for read operations. This is an array key
// mapping into $wgCirrusSearchClusters. When running multiple
// clusters this should be pointed to the closest cluster, and
// can be pointed at an alternate cluster during downtime.
//
// As a form of backwards compatibility the existence of
// $wgCirrusSearchServers will override all cluster configuration.
$wgCirrusSearchDefaultCluster = 'default';
// Each key is the name of an elasticsearch cluster. The value is
// a list of addresses to connect to. If no port is specified it
// defaults to 9200.
//
// All writes will be processed in all configured clusters by the
// ElasticaWrite job, unless $wgCirrusSearchWriteClusters is
// configured (see below).
//
// $wgCirrusSearchClusters = array(
// 'eqiad' => array( 'es01.eqiad.wmnet', 'es02.eqiad.wmnet' ),
// 'codfw' => array( 'es01.codfw.wmnet', 'es02.codfw.wmnet' ),
// );
$wgCirrusSearchClusters = [
'default' => [ 'localhost' ],
];
// List of clusters that can be used for writing. Must be a subset of keys
// from $wgCirrusSearchClusters.
// By default or when set to null, all keys of $wgCirrusSearchClusters are
// available for writing.
$wgCirrusSearchWriteClusters = null;
// How many times to attempt connecting to a given server
// If you're behind LVS and everything looks like one server,
// you may want to reattempt 2 or 3 times.
$wgCirrusSearchConnectionAttempts = 1;
// Number of shards for each index
// You can also set this setting for each cluster:
// $wgCirrusSearchShardCount = array(
// 'cluster1' => array( 'content' => 2, 'general' => 2 ),
// 'cluster2' => array( 'content' => 3, 'general' => 3 ),
//);
$wgCirrusSearchShardCount = [ 'content' => 4, 'general' => 4, 'titlesuggest' => 4 ];
// Number of replicas Elasticsearch can expand or contract to. This allows for
// easy development and deployment to a single node (0 replicas) to scale up to
// higher levels of replication. If you need more redundancy you could
// adjust this to '0-10' or '0-all' or even 'false' (string, not boolean) to
// disable the behavior entirely. The default should be fine for most people.
// You can also set this setting for each cluster:
// $wgCirrusSearchReplicas = array(
// 'cluster1' => array( 'content' => '0-1', 'general' => '0-2' ),
// 'cluster2' => array( 'content' => '0-2', 'general' => '0-3' ),
//);
$wgCirrusSearchReplicas = '0-2';
// You can also specify this as an array of index type to replica count. If you
// do then you must specify all index types. For example:
// $wgCirrusSearchReplicas = array( 'content' => '0-3', 'general' => '0-2' );
// Number of shards allowed on the same elasticsearch node. Set this to 1 to
// prevent two shards from the same high traffic index from being allocated
// onto the same node.
$wgCirrusSearchMaxShardsPerNode = [];
// Example: $wgCirrusSearchMaxShardsPerNode[ 'content' ] = 1;
// How many seconds must a search of Elasticsearch be before we consider it
// slow? Default value is 10 seconds which should be fine for catching the rare
// truly abusive queries. Use Elasticsearch's own query logging if you need more
// granular logs that don't contain user information.
$wgCirrusSearchSlowSearch = 10.0;
// Should CirrusSearch attempt to use the "experimental" highlighter. It is an
// Elasticsearch plugin that should produce better snippets for search results.
// Installation instructions are here:
// https://github.com/wikimedia/search-highlighter
// If you have the highlighter installed you can switch this on and off so long
// as you don't rebuild the index while
// $wgCirrusSearchOptimizeIndexForExperimentalHighlighter is true. Setting it
// to true without the highlighter installed will break search.
$wgCirrusSearchUseExperimentalHighlighter = false;
// Should CirrusSearch optimize the index for the experimental highlighter.
// This will speed up indexing, save a ton of space, and speed up highlighting
// slightly. This only takes effect if you rebuild the index. The downside is
// that you can no longer switch $wgCirrusSearchUseExperimentalHighlighter on
// and off - it has to stay on.
$wgCirrusSearchOptimizeIndexForExperimentalHighlighter = false;
// Should CirrusSearch try to use the wikimedia/extra plugin? An empty array
// means don't use it at all.
//
// Here is an example to enable faster regex matching:
// $wgCirrusSearchWikimediaExtraPlugin[ 'regex' ] =
// array( 'build', 'use', 'max_inspect' => 10000 );
// The 'build' value instructs Cirrus to build the index required to speed up
// regex queries. The 'use' value instructs Cirrus to use it to power regular
// expression queries. If 'use' is added before the index is rebuilt with
// 'build' in the array then regex will fail to find anything. The value of
// the 'max_inspect' key is the maximum number of pages to recheck the regex
// against. It's optional and defaults to 10000 which seems like a reasonable
// compromise to keep regexes fast while still producing good results.
//
// This turns on noop-detection for updates and is compatible with
// wikimedia-extra versions 1.3.1, 1.4.2, 1.5.0, and greater:
// $wgCirrusSearchWikimediaExtraPlugin[ 'super_detect_noop' ] = true;
//
// This allows forking on reindexing and is compatible with wikimedia-extra
// versions 1.3.1, 1.4.2, 1.5.0, and greater:
// $wgCirrusSearchWikimediaExtraPlugin[ 'id_hash_mod_filter' ] = true;
$wgCirrusSearchWikimediaExtraPlugin = [];
// Should CirrusSearch try to support regular expressions with insource:?
// These can be really expensive, but mostly ok, especially if you have the
// extra plugin installed. Sometimes they still cause issues though.
$wgCirrusSearchEnableRegex = true;
// Maximum complexity of regexes. Raising this will allow more complex
// regexes use the memory that they need to compile in Elasticsearch. The
// default allows reasonably complex regexes and doesn't use _too_ much memory.
$wgCirrusSearchRegexMaxDeterminizedStates = 20000;
// Maximum complexity of wildcard queries. Raising this value will allow
// more wildcards in search terms. 500 will allow about 20 wildcards.
// Setting a high value here can cause the cluster to consume a lot of memory
// when compiling complex wildcards queries.
// This setting requires elasticsearch 1.4+. Comment to disable.
// With elasticsearch 1.4+ if this setting is disabled the default value is
// 10000.
// With elasticsearch 1.3 this setting must be disabled.
// $wgCirrusSearchQueryStringMaxDeterminizedStates = 500;
$wgCirrusSearchQueryStringMaxDeterminizedStates = null;
// By default, Cirrus will organize pages into one of two indexes (general or
// content) based on whether a page is in a content namespace. This should
// suffice for most wikis. This setting allows individual namespaces to be
// mapped to specific index suffixes. The keys are the namespace number, and
// the value is a string name of what index suffix to use. Changing this setting
// requires a full reindex (not in-place) of the wiki. If this setting contains
// any values then the index names must also exist in $wgCirrusSearchShardCount.
$wgCirrusSearchNamespaceMappings = [];
// Extra indexes (if any) you want to search, and for what namespaces?
// The key should be the local namespace, with the value being an array of one
// or more indexes that should be searched as well for that namespace.
//
// NOTE: This setting makes no attempts to ensure compatibility across
// multiple indexes, and basically assumes everyone's using a CirrusSearch
// index that's more or less the same. Most notably, we can't guarantee
// that namespaces match up; so you should only use this for core namespaces
// or other times you can be sure that namespace IDs match 1-to-1.
//
// NOTE Part Two: Adding an index here will cause Cirrus to spawn jobs to
// update that other index, trying to set the local_sites_with_dupe field. This
// is used to filter duplicates that appear on the remote index. This is always
// done by a job, even when run from forceSearchIndex.php. If you add an image
// to your wiki but after it is in the extra search index you'll see duplicate
// results until the job is done.
$wgCirrusSearchExtraIndexes = [];
// Shard timeout for index operations. This is the amount of time
// Elasticsearch will wait around for an offline primary shard. Currently this
// is just used in page updates and not deletes. It is defined in
// Elasticsearch's time format which is a string containing a number and then a
// unit which is one of d (days), m (minutes), h (hours), ms (milliseconds) or
// w (weeks). Cirrus defaults to a very tiny value to prevent job executors
// from waiting around a long time for Elasticsearch. Instead, the job will
// fail and be retried later.
$wgCirrusSearchUpdateShardTimeout = '1ms';
// Client side timeout for non-maintenance index and delete operations,
// in seconds. Set it long enough to account for operations that may be
// delayed on the Elasticsearch node.
$wgCirrusSearchClientSideUpdateTimeout = 120;
// Client side timeout when initializing connections.
// Useful to fail fast if elasticsearch is unreachable.
// Set to 0 to use Elastica defaults (300 sec)
// You can also set this setting for each cluster:
// $wgCirrusSearchClientSideConnectTimeout = array(
// 'cluster1' => 10,
// 'cluster2' => 5,
// )
$wgCirrusSearchClientSideConnectTimeout = 5;
// The amount of time Elasticsearch will wait for search shard actions before
// giving up on them and returning the results from the other shards. Defaults
// to 20s for regular searches which is about twice the slowest queries we see.
// Some shard actions are capable of returning partial results and others are
// just ignored. Regexes default to 120 seconds because they are known to be
// slow at this point.
$wgCirrusSearchSearchShardTimeout = [
'default' => '20s',
'regex' => '120s',
];
// Client side timeout for searches in seconds. Best to keep this double the
// shard timeout to give Elasticsearch a chance to timeout the shards and return
// partial results.
$wgCirrusSearchClientSideSearchTimeout = [
'default' => 40,
'regex' => 240,
];
// Client side timeout for maintenance operations. We can't disable the timeout
// all together so we set it to one hour for really long running operations
// like optimize.
$wgCirrusSearchMaintenanceTimeout = 3600;
// Is it ok if the prefix starts on any word in the title or just the first word?
// Defaults to false (first word only) because that is the Wikipedia behavior and so
// what we expect users to expect. Does not affect the prefix: search filter or
// url parameter - that always starts with the first word. false -> true will break
// prefix searching until an in place reindex is complete. true -> false is fine
// any time and you can then go false -> true if you haven't run an in place reindex
// since the change.
$wgCirrusSearchPrefixSearchStartsWithAnyWord = false;
// Phrase slop is how many words not searched for can be in the phrase and it'll still
// match. If I search for "like yellow candy" then phraseSlop of 0 won't match "like
// brownish yellow candy" but phraseSlop of 1 will. The 'precise' key is for matching
// quoted text. The 'default' key is for matching quoted text that ends in a ~.
// The 'boost' key is used for the phrase rescore that boosts phrase matches on queries
// that don't already contain phrases.
$wgCirrusSearchPhraseSlop = [ 'precise' => 0, 'default' => 0, 'boost' => 1 ];
// If the search doesn't include any phrases (delimited by quotes) then we try wrapping
// the whole thing in quotes because sometimes that can turn up better results. This is
// the boost that we give such matches. Set this less than or equal to 1.0 to turn off
// this feature.
$wgCirrusSearchPhraseRescoreBoost = 10.0;
// Number of documents per shard for which automatic phrase matches are performed if it
// is enabled.
$wgCirrusSearchPhraseRescoreWindowSize = 512;
// Number of documents per shard for which function scoring is applied. This is stuff
// like incoming links boost, prefer-recent decay, and boost-templates.
$wgCirrusSearchFunctionRescoreWindowSize = 8192;
// If true CirrusSearch asks Elasticsearch to perform searches using a mode that should
// produce more accurate results at the cost of performance. See this for more info:
// http://www.elasticsearch.org/blog/understanding-query-then-fetch-vs-dfs-query-then-fetch/
$wgCirrusSearchMoreAccurateScoringMode = true;
/**
* Should the phrase suggester (did you mean) be enabled?
*/
$wgCirrusSearchEnablePhraseSuggest = true;
// NOTE: This setting is deprecated: update or create your own PhraseSuggester profile.
// Maximum number of terms that we ask phrase suggest to correct.
// See max_errors on http://www.elasticsearch.org/guide/reference/api/search/suggest/
// $wgCirrusSearchPhraseSuggestMaxErrors = 2;
// NOTE: This setting is deprecated: update or create your own PhraseSuggester profile.
// Confidence level required to suggest new phrases.
// See confidence on http://www.elasticsearch.org/guide/reference/api/search/suggest/
// $wgCirrusSearchPhraseSuggestConfidence = 2.0;
// Set the hard limit for $wgCirrusSearchPhraseSuggestMaxErrors. This prevents customizing
// this setting in a way that could hurt the system performances.
$wgCirrusSearchPhraseSuggestMaxErrorsHardLimit = 2;
// Set the hard limit for $wgCirrusSearchPhraseSuggestMaxTermFreq. This prevents customizing
// this setting in a way that could hurt system performance.
$wgCirrusSearchPhraseSuggestMaxTermFreqHardLimit = 0.6;
// List of allowed values for the suggest mode
$wgCirrusSearchPhraseSuggestAllowedMode = [ 'missing', 'popular', 'always' ];
// List of allowed smoothing models
$wgCirrusSearchPhraseSuggestAllowedSmoothingModel = [ 'stupid_backoff', 'laplace', 'linear' ];
// Set the hard limit for $wgCirrusSearchPhraseSuggestPrefixLength. This prevents customizing
// this setting in a way that could hurt the system performances.
// (This is the minimal value)
$wgCirrusSearchPhraseSuggestPrefixLengthHardLimit = 2;
// Set the Phrase suggester settings using the default profile.
// see profiles/PhraseSuggesterProfiles.config.php
$wgCirrusSearchPhraseSuggestSettings = $wgCirrusSearchPhraseSuggestProfiles['default'];
// Use a reverse field to build the did you mean suggestions.
// This is useful to work around the prefix length limitation: by working with a reverse
// field we can suggest typo corrections that appear in the first 2 characters of the word.
// i.e. Suggesting "search" if the user types "saerch" is possible with the reverse field.
// Set build to true and reindex before setting use to true.
$wgCirrusSearchPhraseSuggestReverseField = [
'build' => false,
'use' => false,
];
// Look for suggestions in the article text?
// An inplace reindex is needed after any changes to this value.
$wgCirrusSearchPhraseSuggestUseText = false;
// Look for suggestions in the article opening text?
// An inplace reindex is needed after any changes to this value.
$wgCirrusSearchPhraseSuggestUseOpeningText = false;
// Allow leading wildcard queries.
// Searching for terms that have a leading ? or * can be very slow. Turn this off to
// disable it. Terms with leading wildcards will have the wildcard escaped.
$wgCirrusSearchAllowLeadingWildcard = true;
// Maximum number of redirects per target page to index.
$wgCirrusSearchIndexedRedirects = 1024;
// Maximum number of newly linked articles to update when an article changes.
$wgCirrusSearchLinkedArticlesToUpdate = 25;
// Maximum number of newly unlinked articles to update when an article changes.
$wgCirrusSearchUnlinkedArticlesToUpdate = 25;
// Configure the similarity module
// see profiles/SimilarityProfiles.php for more details
$wgCirrusSearchSimilarityProfile = $wgCirrusSearchSimilarityProfiles['default'];
// Weight of fields. Must be integers not decimals. If $wgCirrusSearchAllFields['use']
// is false this can be changed on the fly. If it is true then changes to this require
// an in place reindex to take effect.
// NOTE(review): the defaults for 'auxiliary_text' and 'file_text' below are 0.5, which
// contradicts the "must be integers" rule above - confirm which one is intended.
$wgCirrusSearchWeights = [
'title' => 20,
'redirect' => 15,
'category' => 8,
'heading' => 5,
'opening_text' => 3,
'text' => 1,
'auxiliary_text' => 0.5,
'file_text' => 0.5,
];
// Weight of fields in prefix search. It is safe to change these at any time.
$wgCirrusSearchPrefixWeights = [
'title' => 10,
'redirect' => 1,
'title_asciifolding' => 7,
'redirect_asciifolding' => 0.7,
];
// Enable building and using of "all" fields that contain multiple copies of other fields
// for weighting. These all fields exist entirely to speed up the full_text query type by
// baking the weights above into a single field. This is useful because it drastically
// reduces the random io to power the query from 14 term queries per term in the query
// string to 2. Each term query is potentially one or two disk random io actions. The
// reduction isn't strictly 7:1 because we skip file_text in non file namespace (now 6:1)
// and the near match fields (title and redirect) also kick it, but only once per query.
// Also don't forget the io from the phrase rescore - this helps with that, but it's even
// more muddy how much.
// Note setting 'use' to true without having set 'build' to true and performing an in place
// reindex will cause all searches to find nothing.
$wgCirrusSearchAllFields = [ 'build' => true, 'use' => true ];
// Should Cirrus use the weighted all fields for the phrase rescore if it is using them
// for the regular query?
$wgCirrusSearchAllFieldsForRescore = true;
// The method Cirrus will use to extract the opening section of the text. Valid values are:
// * first_heading - Wikipedia style. Grab the text before the first heading (h1-h6) tag.
// * none - Do not extract opening text and do not search it.
$wgCirrusSearchBoostOpening = 'first_heading';
// Weight of fields that match via "near_match" which is ordered.
$wgCirrusSearchNearMatchWeight = 2;
// Weight of stemmed fields relative to unstemmed. Meaning if searching for <used>, <use> is only
// worth this much while <used> is worth 1. Searching for <"used"> will still only find exact
// matches.
$wgCirrusSearchStemmedWeight = 0.5;
// Weight of each namespace relative to NS_MAIN. If not specified non-talk namespaces default to
// $wgCirrusSearchDefaultNamespaceWeight. If not specified talk namespaces default to:
// $wgCirrusSearchTalkNamespaceWeight * weightOfCorrespondingNonTalkNamespace
// The default values below inspired by the configuration used for lsearchd. Note that _technically_
// NS_MAIN can be overridden with this then 1 just represents what NS_MAIN would have been....
// If you override NS_MAIN here then NS_TALK will still default to:
// $wgCirrusSearchNamespaceWeights[ NS_MAIN ] * wgCirrusSearchTalkNamespaceWeight
// You can specify namespace by number or string. Strings are converted to numbers using the
// content language including aliases.
$wgCirrusSearchNamespaceWeights = [
NS_USER => 0.05,
NS_PROJECT => 0.1,
NS_MEDIAWIKI => 0.05,
NS_TEMPLATE => 0.005,
NS_HELP => 0.1,
];
// Default weight of non-talks namespaces
$wgCirrusSearchDefaultNamespaceWeight = 0.2;
// Default weight of a talk namespace relative to its corresponding non-talk namespace.
$wgCirrusSearchTalkNamespaceWeight = 0.25;
// Default weight of language field for multilingual wikis.
// 'user' is the weight given to the user's language
// 'wiki' is the weight given to the wiki's content language
// If your wiki is only one language you can leave these at 0, otherwise try setting it
// to something like 5.0 for 'user' and 2.5 for 'wiki'
$wgCirrusSearchLanguageWeight = [
'user' => 0.0,
'wiki' => 0.0,
];
// Portion of an article's score that decays with time since its last update. Defaults to 0
// meaning don't decay the score at all unless prefer-recent: prefixes the query.
$wgCirrusSearchPreferRecentDefaultDecayPortion = 0;
// Portion of an article's score that decays with time if prefer-recent: prefixes the query but
// doesn't specify a portion. Defaults to .6 because that approximates the behavior that
// wikinews has been using for years. An article 160 days old is worth about 70% of its new score.
$wgCirrusSearchPreferRecentUnspecifiedDecayPortion = .6;
// Default number of days it takes the portion of an article's score that decays with time since
// last update to half way decay to use if prefer-recent: prefixes query and doesn't specify a
// half life or $wgCirrusSearchPreferRecentDefaultDecayPortion is non 0. Default to 160 because
// that approximates the behavior that wikinews has been using for years.
$wgCirrusSearchPreferRecentDefaultHalfLife = 160;
// Configuration parameters passed to more_like_this queries.
// Note: these values can be configured at runtime by editing the system
// message cirrussearch-morelikethis-settings.
$wgCirrusSearchMoreLikeThisConfig = [
// Minimum number of documents (per shard) that need a term for it to be considered.
'min_doc_freq' => 2,
// Maximum number of documents (per shard) that have a term for it to be considered.
// Setting a sufficiently high value can be useful to exclude stop words, but it depends
// on the wiki size.
'max_doc_freq' => null,
// This is the maximum number of terms collected from the input data to build the query.
// This value cannot exceed $wgCirrusSearchMoreLikeThisMaxQueryTermsLimit.
'max_query_terms' => 25,
// Minimum TF (number of times the term appears in the input text) for a term to be considered.
// For small fields (title) tf is usually 1, so setting it to 2 will exclude all terms.
// For large fields (text) this value can help to exclude words that are not related to the subject.
'min_term_freq' => 2,
// Minimum length for a word to be considered.
// Small words tend to be stop words.
'min_word_len' => 0,
// Maximum length for a word to be considered.
// Very long "words" tend to be uncommon; excluding them can help recall but it
// is highly dependent on the language.
'max_word_len' => 0,
// Percent of terms to match.
// A high value will increase precision but can prevent small docs from matching against large ones.
'minimum_should_match' => '30%',
];
// Hard limit for the max_query_terms parameter of more like this queries.
// This prevents running overly large queries.
$wgCirrusSearchMoreLikeThisMaxQueryTermsLimit = 100;
// Set the default field used by the More Like This algorithm
$wgCirrusSearchMoreLikeThisFields = [ 'text' ];
// List of fields allowed for the more like this queries.
// NOTE(review): this list uses 'headings' (plural) while $wgCirrusSearchWeights
// uses the key 'heading' - confirm which field name is the intended one here.
$wgCirrusSearchMoreLikeThisAllowedFields = [
'title',
'text',
'auxiliary_text',
'opening_text',
'headings',
'all'
];
// When set to false cirrus will use the text content to build the query
// and search on the field listed in $wgCirrusSearchMoreLikeThisFields
// Set to true if you want to use field data as input text to build the initial
// query.
// Note that if the all field is used then this setting will be forced to true.
// This is because the all field is not part of the _source and its content cannot
// be retrieved by elasticsearch.
$wgCirrusSearchMoreLikeThisUseFields = false;
// More like this is a very expensive query. This allows redirecting queries
// to a separate cluster configured in $wgCirrusSearchClusters. When set
// to a falsy value $wgCirrusSearchDefaultCluster is used.
$wgCirrusSearchMoreLikeThisCluster = null;
// More like this queries can be quite expensive. Set this to > 0 to cache the
// results for the specified # of seconds into ObjectCache (memcache, redis, or
// whatever is configured).
$wgCirrusSearchMoreLikeThisTTL = 0;
// Show the notification about this wiki using CirrusSearch on the search page.
$wgCirrusSearchShowNowUsing = false;
// CirrusSearch interwiki searching
// Keys are the interwiki prefix, values are the index to search
// Results are cached.
$wgCirrusSearchInterwikiSources = [];
// How long to cache interwiki search results for (in seconds)
$wgCirrusSearchInterwikiCacheTime = 7200;
// The seconds Elasticsearch will wait to batch index changes before making
// them available for search. Lower values make search more real time but put
// more load on Elasticsearch. Defaults to 1 second because that is the default
// in Elasticsearch. Changing this will immediately affect wait time on
// secondary (links) update if those allow waiting (basically if you use Redis
// for the job queue). For it to affect Elasticsearch you'll have to rebuild
// the index.
$wgCirrusSearchRefreshInterval = 1;
// Delay between when the job is queued for a change and when the job can be
// unqueued. The idea is to let the job queue deduplication logic take care
// of preventing multiple updates for frequently changed pages and to combine
// many of the secondary changes from template edits into a single update.
// Note that this does not work with every job queue implementation. It works
// with JobQueueRedis but is ignored with JobQueueDB.
$wgCirrusSearchUpdateDelay = [
'prioritized' => 0,
'default' => 0,
];
// List of plugins that Cirrus should ignore when it scans for plugins. This
// will cause the plugin not to be used by updateSearchIndexConfig.php and
// friends.
$wgCirrusSearchBannedPlugins = [];
// Number of times to instruct Elasticsearch to retry updates that fail on
// version conflicts. While we do have a version for each page in mediawiki
// (the revision timestamp) using it for versioning is a bit tricky because
// Cirrus uses two pass indexing the first time and sometimes needs to force
// updates. This is simpler but theoretically will put more load on
// Elasticsearch. At this point, though, we believe the load not to be
// substantial.
$wgCirrusSearchUpdateConflictRetryCount = 5;
// Number of characters to include in article fragments.
$wgCirrusSearchFragmentSize = 150;
// Should we add a cache warmer that searches for the main page to the content
// namespace?
// @see http://www.elasticsearch.org/guide/en/elasticsearch/reference/current/indices-warmers.html
$wgCirrusSearchMainPageCacheWarmer = true;
// Other cache warmers. Form is index name => array(searches). See examples
// commented out below.
$wgCirrusSearchCacheWarmers = [];
// $wgCirrusSearchCacheWarmers[ 'content' ][] = 'foo bar';
// $wgCirrusSearchCacheWarmers[ 'content' ][] = 'batman';
// $wgCirrusSearchCacheWarmers[ 'general' ][] = 'template:noble pipe';
// Whether to boost searches based on link counts. Default is true
// which most wikis will want. Edge cases will want to turn this off.
$wgCirrusSearchBoostLinks = true;
// Shard allocation settings. The include/exclude/require top level keys are
// the type of rule to use, the names should be self explanatory. The values
// are an array of keys and values of different rules to apply to an index.
//
// For example: if you wanted to make sure this index was only allocated to
// servers matching a specific IP block, you'd do this:
// $wgCirrusSearchIndexAllocation['require'] = array( '_ip' => '192.168.1.*' );
// Or let's say you want to keep an index off a given host:
// $wgCirrusSearchIndexAllocation['exclude'] = array( '_host' => 'badserver01' );
//
// Note that if you use anything other than the magic values of _ip, _name, _id
// or _host it requires you to configure the host keys/values on your server(s)
//
// http://www.elasticsearch.org/guide/en/elasticsearch/reference/current/index-modules-allocation.html
$wgCirrusSearchIndexAllocation = [
'include' => [],
'exclude' => [],
'require' => [],
];
// Dumpable config parameters. These are known not to include any private
// information and thus safe to include in the config dump. Each entry is the
// name of a CirrusSearch setting with the "$wgCirrusSearch" prefix dropped and
// the first letter lower-cased (e.g. 'shardCount' for
// $wgCirrusSearchShardCount), so an entry must be spelled exactly like the
// setting it refers to.
// To disable the config dump entirely add this to your configuration after
// including CirrusSearch.php:
// $wgApiModules['cirrus-config-dump'] = 'ApiDisabled';
$wgCirrusSearchConfigDumpWhiteList = [
'servers',
'connectionAttempts',
'shardCount',
'replicas',
'slowSearch',
'useExperimentalHighlighter',
'optimizeIndexForExperimentalHighlighter',
'namespaceMappings',
'extraIndexes',
'updateShardTimeout',
'clientSideUpdateTimeout',
'searchShardTimeout',
// Was misspelled 'clientSizeSearchTimeout', which matched no setting.
'clientSideSearchTimeout',
'maintenanceTimeout',
'prefixSearchStartsWithAnyWord',
'phraseSlop',
'phraseRescoreBoost',
'phraseRescoreWindowSize',
'functionRescoreWindowSize',
'moreAccurateScoringMode',
'phraseSuggestMaxErrors',
'phraseSuggestConfidence',
'phraseSuggestUseText',
'phraseSuggestUseOpeningText',
'indexedRedirects',
'linkedArticlesToUpdate',
// Was misspelled 'unlikedArticlesToUpdate', which matched no setting.
'unlinkedArticlesToUpdate',
'weights',
'allFields',
'boostOpening',
'nearMatchWeight',
'stemmedWeight',
'namespaceWeights',
'defaultNamespaceWeight',
'talkNamespaceWeight',
'languageWeight',
'preferRecentDefaultDecayPortion',
'preferRecentUnspecifiedDecayPortion',
'preferRecentDefaultHalfLife',
'moreLikeThisConfig',
'showNowUsing',
'interwikiSources',
'interwikiCacheTime',
'refreshInterval',
'bannedPlugins',
'updateConflictRetryCount',
'fragmentSize',
'mainPageCacheWarmer',
'cacheWarmers',
'boostLinks',
'indexAllocation',
];
// Pool Counter key. If you use the PoolCounter extension, this can help segment your wiki's
// traffic into separate queues. This has no effect in vanilla MediaWiki and most people can
// just leave this as it is.
$wgCirrusSearchPoolCounterKey = '_elasticsearch';
/**
* Allow failures of the per-user Pool Counter to continue through. This
* still runs the error callbacks to trigger logging of failures, but does
* not prevent the search from running. Used to tune the per-user pool counter
* settings before enabling it fully and blocking queries.
*/
$wgCirrusSearchBypassPerUserFailure = false;
/**
* List of CIDR a.b.c.d/n ranges for which the per-user pool counter is
* always active, regardless of wgCirrusSearchBypassPerUserFailure setting.
*/
$wgCirrusSearchForcePerUserPoolCounter = [];
// Merge configuration for the indices. See
// http://www.elasticsearch.org/guide/en/elasticsearch/reference/current/index-modules-merge.html
// for the meanings.
$wgCirrusSearchMergeSettings = [
'content' => [
// Aggressive settings to try to keep the content index more optimized
// because it is searched more frequently.
'max_merge_at_once' => 5,
'segments_per_tier' => 5,
'reclaim_deletes_weight' => 3.0,
'max_merged_segment' => '25g',
],
'general' => [
// The Elasticsearch defaults for this less frequently searched index.
'max_merge_at_once' => 10,
'segments_per_tier' => 10,
'reclaim_deletes_weight' => 2.0,
'max_merged_segment' => '5g',
],
];
/**
* Whether search events should be logged on the client side.
*/
$wgCirrusSearchEnableSearchLogging = false;
/**
* Whether elasticsearch queries should be logged on the server side.
*/
$wgCirrusSearchLogElasticRequests = true;
/**
* Secret used to turn off request logging for a single request. When this is
* set to a truthy value and that same value is passed as the
* cirrusLogElasticRequests query variable, $wgCirrusSearchLogElasticRequests
* will be set to false for that request.
*/
$wgCirrusSearchLogElasticRequestsSecret = false;
/**
* The maximum number of incategory:a|b|c items to OR together.
*/
$wgCirrusSearchMaxIncategoryOptions = 100;
/**
* The URL of a "Give us your feedback" link to append to search results or
* something falsy if you don't want to show the link.
*/
$wgCirrusSearchFeedbackLink = false;
/**
* The maximum amount of time, in seconds, that jobs delayed due to frozen
* indexes can remain in the job queue.
*/
$wgCirrusSearchDropDelayedJobsAfter = 60 * 60 * 24 * 2; // 2 days
/**
* The initial exponent used when backing off ElasticaWrite jobs. On the first
* failure the backoff will be either 2^exp or 2^(exp+1). This exponent will
* be increased to a maximum of exp+4 on repeated failures to run the job.
*/
$wgCirrusSearchWriteBackoffExponent = 6;
/**
* Configuration of the individual A/B tests being run. See CirrusSearch\UserTesting
* for more information.
*/
$wgCirrusSearchUserTesting = [];
/**
* Profile for search-as-you-type suggestions (completion suggester)
* (see profiles/SuggestProfiles.php for more details.)
*/
$wgCirrusSearchCompletionSettings = 'fuzzy';
/**
* Enable ICU Folding instead of the default ASCII Folding.
* It covers a wider range of characters when squashing diacritics.
* see https://www.elastic.co/guide/en/elasticsearch/plugins/current/analysis-icu-folding.html
* Currently this setting is only used by the CompletionSuggester.
* Requires the ICU plugin to be installed.
* Set to true to enable, false to use the default ASCII Folding
* NOTE: Experimental
*/
$wgCirrusSearchUseIcuFolding = false;
/**
* Set the default scoring function to be used by maintenance/updateSuggesterIndex.php
* @see includes/BuildDocument/SuggestScoring.php for more details about scoring functions
* NOTE: if you change the scoring method you'll have to rebuild the suggester index.
*/
$wgCirrusSearchCompletionDefaultScore = 'quality';
/**
* Use the completion suggester as the default implementation for searchSuggestions.
* You have to build the completion suggester index with the maintenance script
* updateSuggesterIndex.php. The suggester only supports queries to the main
* namespace. PrefixSearch will be used in all other cases.
* Valid values, all unknown values map to 'no':
* yes - Use completion suggester as the default
* beta - Allow users to enable completion suggester as a BetaFeature
* no - Don't use completion suggester
*/
$wgCirrusSearchUseCompletionSuggester = 'no';
/**
* Maximum number of results to ask for from the elasticsearch completion
* API. Note that this value will be multiplied by the fetch_limit_factor
* set in the Completion profiles (defaults to 2).
*/
$wgCirrusSearchCompletionSuggesterHardLimit = 50;
/**
* Try to recycle the completion suggester index. If the wiki is small
* it's certainly better to not re-create the index from scratch
* since index creation is costly. Recycling the index will prevent
* elasticsearch from rebalancing shards.
* On large wikis it may be better to create a new index because
* documents are indexed and optimised with replication disabled,
* reducing the number of disk operations to primary shards only.
*/
$wgCirrusSearchRecycleCompletionSuggesterIndex = true;
/**
* Profile for geo context search-as-you-type suggestions (completion suggester)
* (see profiles/SuggestProfiles.php for more details.)
*
* NOTE: This is an experimental API
*
* NOTE(review): unlike $wgCirrusSearchCompletionSettings above, this is assigned
* the 'default' entry of $wgCirrusSearchCompletionGeoContextProfiles rather than
* a profile name, so that array must already be defined when this line runs
* (presumably by profiles/SuggestProfiles.php, required at the top of this
* file - confirm).
*/
$wgCirrusSearchCompletionGeoContextSettings = $wgCirrusSearchCompletionGeoContextProfiles['default'];
/**
* Enable alternative language search.
*/
$wgCirrusSearchEnableAltLanguage = false;
/**
* Map of alternative languages and wikis, for search re-try.
* No defaults since we don't know how people call their other language wikis.
* Example:
* $wgCirrusSearchLanguageToWikiMap = [
* 'ro' => 'ro',
* 'de' => 'de',
* 'ru' => 'ru',
* ];
* The key is the language name, the value is the interwiki link.
* You will also need to set:
* $wgCirrusSearchWikiToNameMap['ru'] = 'ruwiki';
* to link the interwiki to the wiki DB name.
*/
$wgCirrusSearchLanguageToWikiMap = [];
/**
* Map of interwiki link -> wiki name
* e.g. $wgCirrusSearchWikiToNameMap['ru'] = 'ruwiki';
* FIXME: we really should already have this information, also we're possibly
* duplicating $wgCirrusSearchInterwikiSources. This needs to be fixed.
*/
$wgCirrusSearchWikiToNameMap = [];
/**
* If set to a non-empty string, interwiki results will have a ?wprov=XYZ parameter added.
*/
$wgCirrusSearchInterwikiProv = false;
/**
* Name of the rescore profile to use by default.
* See profiles/RescoreProfiles.config.php for more info.
*/
$wgCirrusSearchRescoreProfile = 'classic';
/**
* If the current wiki has fewer than this number of results, try to search other language wikis.
*/
$wgCirrusSearchInterwikiThreshold = 3;
/**
* List of classes to be used as language detectors, implementing the
* CirrusSearch\LanguageDetector\Detector interface.
* Detectors will be called in the order given until one
* returns a non-null result. The array key will, currently, only be logged to the
* UserTesting logs. This is intended to be added to the CirrusSearchRequestSet payload
* as well once schema migration is complete.
*
* Three options are built in:
*
* CirrusSearch\LanguageDetector\HttpAccept - uses the first language in the
* Accept-Language header that is not the current content language.
* CirrusSearch\LanguageDetector\ElasticSearch - uses the elasticsearch lang-detect plugin
* CirrusSearch\LanguageDetector\TextCat - uses the TextCat library
*/
$wgCirrusSearchLanguageDetectors = [];
/**
* Directory where the TextCat detector should look for its language model.
*/
$wgCirrusSearchTextcatModel = false;
/**
* Limit the set of languages detected by Textcat.
* Useful when some languages in the model have very bad precision, e.g.:
* $wgCirrusSearchTextcatLanguages = [ 'ar', 'it', 'de' ];
*
* The default is declared explicitly so the global is always defined, like
* every other setting in this file.
* NOTE(review): null is presumably treated as "no restriction" by the TextCat
* detector - confirm against CirrusSearch\LanguageDetector\TextCat.
*/
$wgCirrusSearchTextcatLanguages = null;
/**
* Overrides the master timeout on cluster wide actions, such as mapping updates.
* It may be necessary to increase this on clusters that support a large number
* of wikis.
*/
$wgCirrusSearchMasterTimeout = '30s';
/**
* Activate/Deactivate continuous sanity check.
* The process will scan and check discrepancies between mysql and
* elasticsearch for all possible ids in the database.
* Settings will be automatically chosen according to wiki size (see
* profiles/SaneitizeProfiles.php)
* The script responsible for pushing sanitization jobs is saneitizeJobs.php.
* It needs to be scheduled by cron, default settings provided are suited
* for a bi-hourly schedule (--refresh-freq=7200).
* Setting $wgCirrusSearchSanityCheck to false will prevent the script from
* pushing new jobs even if it's still scheduled by cron.
*/
$wgCirrusSearchSanityCheck = true;
/**
* The base name of indexes used on this wiki. This value must be
* unique across all wikis sharing an elasticsearch cluster unless
* $wgCirrusSearchMultiWikiIndices is set to true.
*/
$wgCirrusSearchIndexBaseName = wfWikiID();
/**
* Treat question marks in simple queries as question marks, not
* wildcard characters, especially at the end of a query. If the
* query doesn't use insource: and there is no escape character,
* remove ? from the end of the query, before a word boundary, or
* everywhere; also de-escape all escaped question marks.
*
* Valid values, all unknown values map to 'no':
* final - only strip trailing question marks and white space
* break - strip non-final question marks followed by a word boundary
* all - strip all question marks (and replace them with spaces)
* no - don't strip question marks
*/
$wgCirrusSearchStripQuestionMarks = 'all';
/**
* Elasticsearch QueryBuilder to use when building
* FullText queries
*/
$wgCirrusSearchFullTextQueryBuilderProfile = 'default';
/**
* Transitionary flag for converting between older style
* doc ids (page ids) to the newer style ids (wikiid|pageid).
* Changing this from false to true requires first turning
* this on, then performing an in-place reindex. There may
* be some duplicate/outdated results while the in-place
* reindex is running.
*/
$wgCirrusSearchPrefixIds = false;
/**
* Adds an artificial backend latency in microseconds.
* Only useful for testing.
*/
$wgCirrusSearchExtraBackendLatency = 0;
/**
* Configure default boost-templates
* Can be overridden on wiki and System messages.
*
* $wgCirrusSearchBoostTemplates = [
* 'Template:Featured article' => 2.0,
* ];
*/
$wgCirrusSearchBoostTemplates = [];
/**
* Disable customization of boost templates on wiki
* Set to true to disable onwiki config.
*/
$wgCirrusSearchIgnoreOnWikiBoostTemplates = false;
// Paths to the extension's source directories.
// NOTE(review): nothing in the visible part of this file reads these variables.
// They are kept because autoload.php is included below in the same scope and
// could reference them - confirm before removing.
$includes = __DIR__ . "/includes/";
$apiDir = $includes . 'Api/';
$buildDocument = $includes . 'BuildDocument/';
$extraFilterDir = $includes . 'Extra/Filter/';
$jobsDir = $includes . 'Job/';
$maintenanceDir = $includes . 'Maintenance/';
$sanity = $includes . 'Sanity/';
$search = $includes . 'Search/';
/**
* Classes
*
* The extension's own autoloader is always loaded; the vendor autoloader is
* only loaded when a vendor/autoload.php file exists next to this file.
*/
require_once __DIR__ . '/autoload.php';
if ( file_exists( __DIR__ . '/vendor/autoload.php' ) ) {
require_once __DIR__ . '/vendor/autoload.php';
}
/**
* Hook handlers
*
* Every handler is given as a 'Class::method' string and appended to the
* list of handlers for its hook.
*/
$wgHooks['CirrusSearchBuildDocumentLinks'][] = 'CirrusSearch\BuildDocument\RedirectsAndIncomingLinks::buildDocument';
$wgHooks['AfterImportPage'][] = 'CirrusSearch\Hooks::onAfterImportPage';
$wgHooks['ApiBeforeMain'][] = 'CirrusSearch\Hooks::onApiBeforeMain';
$wgHooks['ArticleDelete'][] = 'CirrusSearch\Hooks::onArticleDelete';
$wgHooks['ArticleDeleteComplete'][] = 'CirrusSearch\Hooks::onArticleDeleteComplete';
$wgHooks['ArticleRevisionVisibilitySet'][] = 'CirrusSearch\Hooks::onRevisionDelete';
$wgHooks['BeforeInitialize'][] = 'CirrusSearch\Hooks::onBeforeInitialize';
$wgHooks['LinksUpdateComplete'][] = 'CirrusSearch\Hooks::onLinksUpdateCompleted';
$wgHooks['ResourceLoaderGetConfigVars'][] = 'CirrusSearch\Hooks::onResourceLoaderGetConfigVars';
$wgHooks['SoftwareInfo'][] = 'CirrusSearch\Hooks::onSoftwareInfo';
$wgHooks['SpecialSearchResultsPrepend'][] = 'CirrusSearch\Hooks::onSpecialSearchResultsPrepend';
$wgHooks['SpecialSearchResultsAppend'][] = 'CirrusSearch\Hooks::onSpecialSearchResultsAppend';
$wgHooks['TitleMove'][] = 'CirrusSearch\Hooks::onTitleMove';
$wgHooks['TitleMoveComplete'][] = 'CirrusSearch\Hooks::onTitleMoveComplete';
$wgHooks['UnitTestsList'][] = 'CirrusSearch\Hooks::onUnitTestsList';
$wgHooks['ShowSearchHitTitle'][] = 'CirrusSearch\Hooks::onShowSearchHitTitle';
$wgHooks['GetBetaFeaturePreferences'][] = 'CirrusSearch\Hooks::getBetaFeaturePreferences';
$wgHooks['APIAfterExecute'][] = 'CirrusSearch\Hooks::onAPIAfterExecute';
$wgHooks['SpecialSearchResults'][] = 'CirrusSearch\Hooks::onSpecialSearchResults';

/**
* i18n: directory holding the extension's message files.
*/
$wgMessagesDirs['CirrusSearch'] = __DIR__ . '/i18n';

/**
* Jobs: job type => implementing class. Both the regular and the
* prioritized links update job types map to the same class.
*/
$wgJobClasses['cirrusSearchDeletePages'] = 'CirrusSearch\Job\DeletePages';
$wgJobClasses['cirrusSearchIncomingLinkCount'] = 'CirrusSearch\Job\IncomingLinkCount';
$wgJobClasses['cirrusSearchLinksUpdate'] = 'CirrusSearch\Job\LinksUpdate';
$wgJobClasses['cirrusSearchLinksUpdatePrioritized'] = 'CirrusSearch\Job\LinksUpdate';
$wgJobClasses['cirrusSearchMassIndex'] = 'CirrusSearch\Job\MassIndex';
$wgJobClasses['cirrusSearchOtherIndex'] = 'CirrusSearch\Job\OtherIndex';
$wgJobClasses['cirrusSearchElasticaWrite'] = 'CirrusSearch\Job\ElasticaWrite';
$wgJobClasses['cirrusSearchCheckerJob'] = 'CirrusSearch\Job\CheckerJob';

/**
* Actions: action name => implementing class.
*/
$wgActions['cirrusdump'] = 'CirrusSearch\Dump';

/**
* API: module name => implementing class.
*/
$wgAPIModules['cirrus-config-dump'] = 'CirrusSearch\Api\ConfigDump';
$wgAPIModules['cirrus-mapping-dump'] = 'CirrusSearch\Api\MappingDump';
$wgAPIModules['cirrus-settings-dump'] = 'CirrusSearch\Api\SettingsDump';

/**
* Configs: factory callback registered under the 'CirrusSearch' name.
*/
$wgConfigRegistry['CirrusSearch'] = 'CirrusSearch\SearchConfig::newFromGlobals';
/**
* JavaScript served to all search engine result pages (SERPs).
*
* The array union (+=) only adds the module when no module with the same
* name has been registered already; an existing entry is left untouched.
*/
$wgResourceModules += [
	'ext.cirrus.serp' => [
		'scripts' => [ 'resources/ext.cirrus.serp.js' ],
		'dependencies' => [ 'mediawiki.Uri' ],
		'styles' => [],
		'messages' => [],
		'remoteExtPath' => 'CirrusSearch',
		'localBasePath' => __DIR__,
	],
];
/**
* Mapping of index field types to CirrusSearch classes.
* Keys are the SearchIndexField::INDEX_TYPE_* constants, values are the names
* of the CirrusSearch\Search field classes used for fields of that type.
*/
$wgCirrusSearchFieldTypes = [
SearchIndexField::INDEX_TYPE_TEXT => \CirrusSearch\Search\TextIndexField::class,
SearchIndexField::INDEX_TYPE_KEYWORD => \CirrusSearch\Search\KeywordIndexField::class,
SearchIndexField::INDEX_TYPE_INTEGER => \CirrusSearch\Search\IntegerIndexField::class,
SearchIndexField::INDEX_TYPE_NUMBER => \CirrusSearch\Search\NumberIndexField::class,
SearchIndexField::INDEX_TYPE_DATETIME => \CirrusSearch\Search\DatetimeIndexField::class,
SearchIndexField::INDEX_TYPE_BOOL => \CirrusSearch\Search\BooleanIndexField::class,
SearchIndexField::INDEX_TYPE_NESTED => \CirrusSearch\Search\NestedIndexField::class,
];
/**
* Customize certain fields with a specific implementation.
* Useful to apply CirrusSearch specific config to fields
* controlled by MediaWiki core.
* Keys are field names, values are the class to use for that field.
*/
$wgCirrusSearchFieldTypeOverrides = [
'opening_text' => \CirrusSearch\Search\OpeningTextIndexField::class,
];
/**
* Jenkins configuration required to get all the browser tests passing cleanly.
*
* @todo re-enable the code below if/when browser tests are enabled again
* on Jenkins for Cirrus, and ensure the job name check is specific to
* CirrusSearch and the entry point is not included for all extension
* browser tests that happen to have CirrusSearch as a dependency, but
* not all the other things that the below entry point requires.
*
* For now, browser tests are run via Cindy the browser test bot which
* already directly includes the entry point vs using the check below.
*
* Tests are also run for CirrusSearch on beta, but those don't use
* or need the entry point below.
if ( isset( $wgWikimediaJenkinsCI ) && $wgWikimediaJenkinsCI === true && (
PHP_SAPI !== 'cli' && // If we're not in the CLI then this is certainly a browser test
strpos( getenv( 'JOB_NAME' ), 'browsertests-CirrusSearch' ) !== false ) ) {
require( __DIR__ . '/tests/jenkins/Jenkins.php' );
}
*/