%PDF- %PDF-
Direktori : /www/varak.net/wiki.varak.net/extensions/CirrusSearch/ |
Current File : /www/varak.net/wiki.varak.net/extensions/CirrusSearch/CirrusSearch.php |
<?php
/**
 * CirrusSearch - Searching for MediaWiki with Elasticsearch.
 *
 * Set $wgSearchType to 'CirrusSearch'
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 * http://www.gnu.org/copyleft/gpl.html
 */

// Profile definitions. These files populate the $wgCirrusSearch*Profiles
// arrays that some of the defaults in this file read from.
require_once __DIR__ . "/profiles/SuggestProfiles.php";
require_once __DIR__ . "/profiles/PhraseSuggesterProfiles.config.php";
require_once __DIR__ . "/profiles/RescoreProfiles.config.php";
require_once __DIR__ . "/profiles/SimilarityProfiles.php";
require_once __DIR__ . "/profiles/SaneitizeProfiles.php";
require_once __DIR__ . "/profiles/FullTextQueryBuilderProfiles.config.php";

// Extension credits entry for this extension.
$wgExtensionCredits['other'][] = [
	'path' => __FILE__,
	'name' => 'CirrusSearch',
	'author' => [
		'Nik Everett',
		'Chad Horohoe',
		'Erik Bernhardson'
	],
	'descriptionmsg' => 'cirrussearch-desc',
	'url' => 'https://www.mediawiki.org/wiki/Extension:CirrusSearch',
	'version' => '0.2',
	'license-name' => 'GPL-2.0+'
];

/**
 * Configuration
 */

// Default cluster for read operations. This is an array key
// mapping into $wgCirrusSearchClusters. When running multiple
// clusters this should be pointed to the closest cluster, and
// can be pointed at an alternate cluster during downtime.
//
// As a form of backwards compatibility the existence of
// $wgCirrusSearchServers will override all cluster configuration.
$wgCirrusSearchDefaultCluster = 'default';

// Each key is the name of an elasticsearch cluster. The value is
// a list of addresses to connect to. If no port is specified it
// defaults to 9200.
//
// All writes will be processed in all configured clusters by the
// ElasticaWrite job, unless $wgCirrusSearchWriteClusters is
// configured (see below).
//
// $wgCirrusSearchClusters = array(
//   'eqiad' => array( 'es01.eqiad.wmnet', 'es02.eqiad.wmnet' ),
//   'codfw' => array( 'es01.codfw.wmnet', 'es02.codfw.wmnet' ),
// );
$wgCirrusSearchClusters = [
	'default' => [ 'localhost' ],
];

// List of clusters that can be used for writing. Must be a subset of keys
// from $wgCirrusSearchClusters.
// By default or when set to null, all keys of $wgCirrusSearchClusters are
// available for writing.
$wgCirrusSearchWriteClusters = null;

// How many times to attempt connecting to a given server
// If you're behind LVS and everything looks like one server,
// you may want to reattempt 2 or 3 times.
$wgCirrusSearchConnectionAttempts = 1;

// Number of shards for each index
// You can also set this setting for each cluster:
// $wgCirrusSearchShardCount = array(
//  'cluster1' => array( 'content' => 2, 'general' => 2 ),
//  'cluster2' => array( 'content' => 3, 'general' => 3 ),
// );
$wgCirrusSearchShardCount = [ 'content' => 4, 'general' => 4, 'titlesuggest' => 4 ];

// Number of replicas Elasticsearch can expand or contract to. This allows for
// easy development and deployment to a single node (0 replicas) to scale up to
// higher levels of replication. If you need more redundancy you could
// adjust this to '0-10' or '0-all' or even 'false' (string, not boolean) to
// disable the behavior entirely. The default should be fine for most people.
// You can also set this setting for each cluster:
// $wgCirrusSearchReplicas = array(
//  'cluster1' => array( 'content' => '0-1', 'general' => '0-2' ),
//  'cluster2' => array( 'content' => '0-2', 'general' => '0-3' ),
// );
$wgCirrusSearchReplicas = '0-2';
// You can also specify this as an array of index type to replica count. If you
// do then you must specify all index types. For example:
// $wgCirrusSearchReplicas = array( 'content' => '0-3', 'general' => '0-2' );

// Number of shards allowed on the same elasticsearch node. Set this to 1 to
// prevent two shards from the same high traffic index from being allocated
// onto the same node.
$wgCirrusSearchMaxShardsPerNode = [];
// Example: $wgCirrusSearchMaxShardsPerNode[ 'content' ] = 1;

// How many seconds must a search of Elasticsearch be before we consider it
// slow? Default value is 10 seconds which should be fine for catching the rare
// truly abusive queries. Use Elasticsearch's own query logs for more granular
// logs that don't contain user information.
$wgCirrusSearchSlowSearch = 10.0;

// Should CirrusSearch attempt to use the "experimental" highlighter. It is an
// Elasticsearch plugin that should produce better snippets for search results.
// Installation instructions are here:
// https://github.com/wikimedia/search-highlighter
// If you have the highlighter installed you can switch this on and off so long
// as you don't rebuild the index while
// $wgCirrusSearchOptimizeIndexForExperimentalHighlighter is true. Setting it
// to true without the highlighter installed will break search.
$wgCirrusSearchUseExperimentalHighlighter = false;

// Should CirrusSearch optimize the index for the experimental highlighter.
// This will speed up indexing, save a ton of space, and speed up highlighting
// slightly. This only takes effect if you rebuild the index. The downside is
// that you can no longer switch $wgCirrusSearchUseExperimentalHighlighter on
// and off - it has to stay on.
$wgCirrusSearchOptimizeIndexForExperimentalHighlighter = false;

// Should CirrusSearch try to use the wikimedia/extra plugin? An empty array
// means don't use it at all.
//
// Here is an example to enable faster regex matching:
// $wgCirrusSearchWikimediaExtraPlugin[ 'regex' ] =
//     array( 'build', 'use', 'max_inspect' => 10000 );
// The 'build' value instructs Cirrus to build the index required to speed up
// regex queries. The 'use' value instructs Cirrus to use it to power regular
// expression queries. If 'use' is added before the index is rebuilt with
// 'build' in the array then regex will fail to find anything. The value of
// the 'max_inspect' key is the maximum number of pages to recheck the regex
// against. It's optional and defaults to 10000 which seems like a reasonable
// compromise to keep regexes fast while still producing good results.
//
// This turns on noop-detection for updates and is compatible with
// wikimedia-extra versions 1.3.1, 1.4.2, 1.5.0, and greater:
// $wgCirrusSearchWikimediaExtraPlugin[ 'super_detect_noop' ] = true;
//
// This allows forking on reindexing and is compatible with wikimedia-extra
// versions 1.3.1, 1.4.2, 1.5.0, and greater:
// $wgCirrusSearchWikimediaExtraPlugin[ 'id_hash_mod_filter' ] = true;
$wgCirrusSearchWikimediaExtraPlugin = [];

// Should CirrusSearch try to support regular expressions with insource:?
// These can be really expensive, but mostly ok, especially if you have the
// extra plugin installed. Sometimes they still cause issues though.
$wgCirrusSearchEnableRegex = true;

// Maximum complexity of regexes. Raising this will allow more complex
// regexes use the memory that they need to compile in Elasticsearch. The
// default allows reasonably complex regexes and doesn't use _too_ much memory.
$wgCirrusSearchRegexMaxDeterminizedStates = 20000;

// Maximum complexity of wildcard queries. Raising this value will allow
// more wildcards in search terms. 500 will allow about 20 wildcards.
// Setting a high value here can cause the cluster to consume a lot of memory
// when compiling complex wildcards queries.
// This setting requires elasticsearch 1.4+. Comment to disable.
// With elasticsearch 1.4+ if this setting is disabled the default value is
// 10000.
// With elasticsearch 1.3 this setting must be disabled.
// $wgCirrusSearchQueryStringMaxDeterminizedStates = 500;
$wgCirrusSearchQueryStringMaxDeterminizedStates = null;

// By default, Cirrus will organize pages into one of two indexes (general or
// content) based on whether a page is in a content namespace. This should
// suffice for most wikis. This setting allows individual namespaces to be
// mapped to specific index suffixes. The keys are the namespace number, and
// the value is a string name of what index suffix to use. Changing this setting
// requires a full reindex (not in-place) of the wiki. If this setting contains
// any values then the index names must also exist in $wgCirrusSearchShardCount.
$wgCirrusSearchNamespaceMappings = [];

// Extra indexes (if any) you want to search, and for what namespaces?
// The key should be the local namespace, with the value being an array of one
// or more indexes that should be searched as well for that namespace.
//
// NOTE: This setting makes no attempts to ensure compatibility across
// multiple indexes, and basically assumes everyone's using a CirrusSearch
// index that's more or less the same. Most notably, we can't guarantee
// that namespaces match up; so you should only use this for core namespaces
// or other times you can be sure that namespace IDs match 1-to-1.
//
// NOTE Part Two: Adding an index here will cause Cirrus to spawn jobs to
// update that other index, trying to set the local_sites_with_dupe field. This
// is used to filter duplicates that appear on the remote index. This is always
// done by a job, even when run from forceSearchIndex.php.
// If you add an image
// to your wiki but after it is in the extra search index you'll see duplicate
// results until the job is done.
$wgCirrusSearchExtraIndexes = [];

// Shard timeout for index operations. This is the amount of time
// Elasticsearch will wait around for an offline primary shard. Currently this
// is just used in page updates and not deletes. It is defined in
// Elasticsearch's time format which is a string containing a number and then a
// unit which is one of d (days), m (minutes), h (hours), ms (milliseconds) or
// w (weeks). Cirrus defaults to a very tiny value to prevent job executors
// from waiting around a long time for Elasticsearch. Instead, the job will
// fail and be retried later.
$wgCirrusSearchUpdateShardTimeout = '1ms';

// Client side timeout for non-maintenance index and delete operations,
// in seconds. Set it long enough to account for operations that may be
// delayed on the Elasticsearch node.
$wgCirrusSearchClientSideUpdateTimeout = 120;

// Client side timeout when initializing connections.
// Useful to fail fast if elasticsearch is unreachable.
// Set to 0 to use Elastica defaults (300 sec)
// You can also set this setting for each cluster:
// $wgCirrusSearchClientSideConnectTimeout = array(
//   'cluster1' => 10,
//   'cluster2' => 5,
// )
$wgCirrusSearchClientSideConnectTimeout = 5;

// The amount of time Elasticsearch will wait for search shard actions before
// giving up on them and returning the results from the other shards. Defaults
// to 20s for regular searches which is about twice the slowest queries we see.
// Some shard actions are capable of returning partial results and others are
// just ignored. Regexes default to 120 seconds because they are known to be
// slow at this point.
$wgCirrusSearchSearchShardTimeout = [
	'default' => '20s',
	'regex' => '120s',
];

// Client side timeout for searches in seconds.
// Best to keep this double the
// shard timeout to give Elasticsearch a chance to timeout the shards and return
// partial results.
$wgCirrusSearchClientSideSearchTimeout = [
	'default' => 40,
	'regex' => 240,
];

// Client side timeout for maintenance operations. We can't disable the timeout
// altogether so we set it to one hour for really long running operations
// like optimize.
$wgCirrusSearchMaintenanceTimeout = 3600;

// Is it ok if the prefix starts on any word in the title or just the first word?
// Defaults to false (first word only) because that is the Wikipedia behavior and so
// what we expect users to expect. Does not affect the prefix: search filter or
// url parameter - that always starts with the first word. false -> true will break
// prefix searching until an in place reindex is complete. true -> false is fine
// any time and you can then go false -> true if you haven't run an in place reindex
// since the change.
$wgCirrusSearchPrefixSearchStartsWithAnyWord = false;

// Phrase slop is how many words not searched for can be in the phrase and it'll still
// match. If I search for "like yellow candy" then phraseSlop of 0 won't match "like
// brownish yellow candy" but phraseSlop of 1 will. The 'precise' key is for matching
// quoted text. The 'default' key is for matching quoted text that ends in a ~.
// The 'boost' key is used for the phrase rescore that boosts phrase matches on queries
// that don't already contain phrases.
$wgCirrusSearchPhraseSlop = [ 'precise' => 0, 'default' => 0, 'boost' => 1 ];

// If the search doesn't include any phrases (delimited by quotes) then we try wrapping
// the whole thing in quotes because sometimes that can turn up better results. This is
// the boost that we give such matches. Set this less than or equal to 1.0 to turn off
// this feature.
$wgCirrusSearchPhraseRescoreBoost = 10.0;

// Number of documents per shard for which automatic phrase matches are performed if it
// is enabled.
$wgCirrusSearchPhraseRescoreWindowSize = 512;

// Number of documents per shard for which function scoring is applied. This is stuff
// like incoming links boost, prefer-recent decay, and boost-templates.
$wgCirrusSearchFunctionRescoreWindowSize = 8192;

// If true CirrusSearch asks Elasticsearch to perform searches using a mode that should
// produce more accurate results at the cost of performance. See this for more info:
// http://www.elasticsearch.org/blog/understanding-query-then-fetch-vs-dfs-query-then-fetch/
$wgCirrusSearchMoreAccurateScoringMode = true;

/**
 * Should the phrase suggester (did you mean) be enabled?
 */
$wgCirrusSearchEnablePhraseSuggest = true;

// NOTE: This setting is deprecated: update or create your own PhraseSuggester profile.
// Maximum number of terms that we ask phrase suggest to correct.
// See max_errors on http://www.elasticsearch.org/guide/reference/api/search/suggest/
// $wgCirrusSearchPhraseSuggestMaxErrors = 2;

// NOTE: This setting is deprecated: update or create your own PhraseSuggester profile.
// Confidence level required to suggest new phrases.
// See confidence on http://www.elasticsearch.org/guide/reference/api/search/suggest/
// $wgCirrusSearchPhraseSuggestConfidence = 2.0;

// Set the hard limit for $wgCirrusSearchPhraseSuggestMaxErrors. This prevents customizing
// this setting in a way that could hurt the system performance.
$wgCirrusSearchPhraseSuggestMaxErrorsHardLimit = 2;

// Set the hard limit for $wgCirrusSearchPhraseMaxTermFreq. This prevents customizing
// this setting in a way that could hurt the system performance.
$wgCirrusSearchPhraseSuggestMaxTermFreqHardLimit = 0.6;

// List of allowed values for the suggest mode
$wgCirrusSearchPhraseSuggestAllowedMode = [ 'missing', 'popular', 'always' ];

// List of allowed smoothing models
$wgCirrusSearchPhraseSuggestAllowedSmoothingModel = [ 'stupid_backoff', 'laplace', 'linear' ];

// Set the hard limit for $wgCirrusSearchPhraseSuggestPrefixLength.
// This prevents customizing
// this setting in a way that could hurt the system performance.
// (This is the minimal value)
$wgCirrusSearchPhraseSuggestPrefixLengthHardLimit = 2;

// Set the Phrase suggester settings using the default profile.
// see profiles/PhraseSuggesterProfiles.config.php
$wgCirrusSearchPhraseSuggestSettings = $wgCirrusSearchPhraseSuggestProfiles['default'];

// Use a reverse field to build the did you mean suggestions.
// This is useful to work around the prefix length limitation: by working with a reverse
// field we can suggest typo corrections that appear in the first 2 characters of the word.
// i.e. Suggesting "search" if the user types "saerch" is possible with the reverse field.
// Set build to true and reindex before setting use to true
$wgCirrusSearchPhraseSuggestReverseField = [
	'build' => false,
	'use' => false,
];

// Look for suggestions in the article text?
// An inplace reindex is needed after any changes to this value.
$wgCirrusSearchPhraseSuggestUseText = false;

// Look for suggestions in the article opening text?
// An inplace reindex is needed after any changes to this value.
$wgCirrusSearchPhraseSuggestUseOpeningText = false;

// Allow leading wildcard queries.
// Searching for terms that have a leading ? or * can be very slow. Turn this off to
// disable it. Terms with leading wildcards will have the wildcard escaped.
$wgCirrusSearchAllowLeadingWildcard = true;

// Maximum number of redirects per target page to index.
$wgCirrusSearchIndexedRedirects = 1024;

// Maximum number of newly linked articles to update when an article changes.
$wgCirrusSearchLinkedArticlesToUpdate = 25;

// Maximum number of newly unlinked articles to update when an article changes.
$wgCirrusSearchUnlinkedArticlesToUpdate = 25;

// Configure the similarity module
// see profiles/SimilarityProfiles.php for more details
$wgCirrusSearchSimilarityProfile = $wgCirrusSearchSimilarityProfiles['default'];

// Weight of fields. Must be integers not decimals.
// If $wgCirrusSearchAllFields['use']
// is false this can be changed on the fly. If it is true then changes to this require
// an in place reindex to take effect.
$wgCirrusSearchWeights = [
	'title' => 20,
	'redirect' => 15,
	'category' => 8,
	'heading' => 5,
	'opening_text' => 3,
	'text' => 1,
	'auxiliary_text' => 0.5,
	'file_text' => 0.5,
];

// Weight of fields in prefix search. It is safe to change these at any time.
$wgCirrusSearchPrefixWeights = [
	'title' => 10,
	'redirect' => 1,
	'title_asciifolding' => 7,
	'redirect_asciifolding' => 0.7,
];

// Enable building and using of "all" fields that contain multiple copies of other fields
// for weighting. These all fields exist entirely to speed up the full_text query type by
// baking the weights above into a single field. This is useful because it drastically
// reduces the random io to power the query from 14 term queries per term in the query
// string to 2. Each term query is potentially one or two disk random io actions. The
// reduction isn't strictly 7:1 because we skip file_text in non file namespace (now 6:1)
// and the near match fields (title and redirect) also kick it, but only once per query.
// Also don't forget the io from the phrase rescore - this helps with that, but its even
// more muddy how much.
// Note setting 'use' to true without having set 'build' to true and performing an in place
// reindex will cause all searches to find nothing.
$wgCirrusSearchAllFields = [ 'build' => true, 'use' => true ];

// Should Cirrus use the weighted all fields for the phrase rescore if it is using them
// for the regular query?
$wgCirrusSearchAllFieldsForRescore = true;

// The method Cirrus will use to extract the opening section of the text. Valid values are:
// * first_heading - Wikipedia style. Grab the text before the first heading (h1-h6) tag.
// * none - Do not extract opening text and do not search it.
$wgCirrusSearchBoostOpening = 'first_heading';

// Weight of fields that match via "near_match" which is ordered.
$wgCirrusSearchNearMatchWeight = 2;

// Weight of stemmed fields relative to unstemmed. Meaning if searching for <used>, <use> is only
// worth this much while <used> is worth 1. Searching for <"used"> will still only find exact
// matches.
$wgCirrusSearchStemmedWeight = 0.5;

// Weight of each namespace relative to NS_MAIN. If not specified non-talk namespaces default to
// $wgCirrusSearchDefaultNamespaceWeight. If not specified talk namespaces default to:
//   $wgCirrusSearchTalkNamespaceWeight * weightOfCorrespondingNonTalkNamespace
// The default values below inspired by the configuration used for lsearchd. Note that _technically_
// NS_MAIN can be overridden with this then 1 just represents what NS_MAIN would have been....
// If you override NS_MAIN here then NS_TALK will still default to:
//   $wgCirrusSearchNamespaceWeights[ NS_MAIN ] * $wgCirrusSearchTalkNamespaceWeight
// You can specify namespace by number or string. Strings are converted to numbers using the
// content language including aliases.
$wgCirrusSearchNamespaceWeights = [
	NS_USER => 0.05,
	NS_PROJECT => 0.1,
	NS_MEDIAWIKI => 0.05,
	NS_TEMPLATE => 0.005,
	NS_HELP => 0.1,
];

// Default weight of non-talk namespaces
$wgCirrusSearchDefaultNamespaceWeight = 0.2;

// Default weight of a talk namespace relative to its corresponding non-talk namespace.
$wgCirrusSearchTalkNamespaceWeight = 0.25;

// Default weight of language field for multilingual wikis.
// 'user' is the weight given to the user's language
// 'wiki' is the weight given to the wiki's content language
// If your wiki is only one language you can leave these at 0, otherwise try setting it
// to something like 5.0 for 'user' and 2.5 for 'wiki'
$wgCirrusSearchLanguageWeight = [
	'user' => 0.0,
	'wiki' => 0.0,
];

// Portion of an article's score that decays with time since its last update. Defaults to 0
// meaning don't decay the score at all unless prefer-recent: prefixes the query.
$wgCirrusSearchPreferRecentDefaultDecayPortion = 0;

// Portion of an article's score that decays with time if prefer-recent: prefixes the query but
// doesn't specify a portion. Defaults to .6 because that approximates the behavior that
// wikinews has been using for years. An article 160 days old is worth about 70% of its new score.
$wgCirrusSearchPreferRecentUnspecifiedDecayPortion = .6;

// Default number of days it takes the portion of an article's score that decays with time since
// last update to half way decay to use if prefer-recent: prefixes query and doesn't specify a
// half life or $wgCirrusSearchPreferRecentDefaultDecayPortion is non 0. Default to 160 because
// that approximates the behavior that wikinews has been using for years.
$wgCirrusSearchPreferRecentDefaultHalfLife = 160;

// Configuration parameters passed to more_like_this queries.
// Note: these values can be configured at runtime by editing the System
// message cirrussearch-morelikethis-settings
$wgCirrusSearchMoreLikeThisConfig = [
	// Minimum number of documents (per shard) that need a term for it to be considered
	'min_doc_freq' => 2,

	// Maximum number of documents (per shard) that have a term for it to be considered
	// Setting a sufficient high value can be useful to exclude stop words but it depends on the wiki size.
	'max_doc_freq' => null,

	// This is the max number it will collect from input data to build the query
	// This value cannot exceed $wgCirrusSearchMoreLikeThisMaxQueryTermsLimit .
	'max_query_terms' => 25,

	// Minimum TF (number of times the term appears in the input text) for a term to be considered
	// for small fields (title) tf is usually 1 so setting it to 2 will exclude all terms.
	// for large fields (text) this value can help to exclude words that are not related to the subject.
	'min_term_freq' => 2,

	// Minimum length for a word to be considered
	// small words tend to be stop words.
	'min_word_len' => 0,

	// Maximum length for a word to be considered
	// Very long "words" tend to be uncommon, excluding them can help recall but it
	// is highly dependent on the language.
	'max_word_len' => 0,

	// Percent of terms to match
	// High value will increase precision but can prevent small docs to match against large ones
	'minimum_should_match' => '30%',
];

// Hard limit to the max_query_terms parameter of more like this queries.
// This prevents running overly large queries.
$wgCirrusSearchMoreLikeThisMaxQueryTermsLimit = 100;

// Set the default field used by the More Like This algorithm
$wgCirrusSearchMoreLikeThisFields = [ 'text' ];

// List of fields allowed for the more like this queries.
$wgCirrusSearchMoreLikeThisAllowedFields = [
	'title',
	'text',
	'auxiliary_text',
	'opening_text',
	'headings',
	'all'
];

// When set to false cirrus will use the text content to build the query
// and search on the field listed in $wgCirrusSearchMoreLikeThisFields
// Set to true if you want to use field data as input text to build the initial
// query.
// Note that if the all field is used then this setting will be forced to true.
// This is because the all field is not part of the _source and its content cannot
// be retrieved by elasticsearch.
$wgCirrusSearchMoreLikeThisUseFields = false;

// More like this is a very expensive query. This allows redirecting queries
// to a separate cluster configured in $wgCirrusSearchClusters. When set
// to a falsy value $wgCirrusSearchDefaultCluster is used.
$wgCirrusSearchMoreLikeThisCluster = null;

// More like this queries can be quite expensive. Set this to > 0 to cache the
// results for the specified # of seconds into ObjectCache (memcache, redis, or
// whatever is configured).
$wgCirrusSearchMoreLikeThisTTL = 0;

// Show the notification about this wiki using CirrusSearch on the search page.
$wgCirrusSearchShowNowUsing = false;

// CirrusSearch interwiki searching
// Keys are the interwiki prefix, values are the index to search
// Results are cached.
$wgCirrusSearchInterwikiSources = [];

// How long to cache interwiki search results for (in seconds)
$wgCirrusSearchInterwikiCacheTime = 7200;

// The seconds Elasticsearch will wait to batch index changes before making
// them available for search. Lower values make search more real time but put
// more load on Elasticsearch. Defaults to 1 second because that is the default
// in Elasticsearch. Changing this will immediately affect wait time on
// secondary (links) update if those allow waiting (basically if you use Redis
// for the job queue). For it to affect Elasticsearch you'll have to rebuild
// the index.
$wgCirrusSearchRefreshInterval = 1;

// Delay between when the job is queued for a change and when the job can be
// unqueued. The idea is to let the job queue deduplication logic take care
// of preventing multiple updates for frequently changed pages and to combine
// many of the secondary changes from template edits into a single update.
// Note that this does not work with every job queue implementation. It works
// with JobQueueRedis but is ignored with JobQueueDB.
$wgCirrusSearchUpdateDelay = [
	'prioritized' => 0,
	'default' => 0,
];

// List of plugins that Cirrus should ignore when it scans for plugins. This
// will cause the plugin not to be used by updateSearchIndexConfig.php and
// friends.
$wgCirrusSearchBannedPlugins = [];

// Number of times to instruct Elasticsearch to retry updates that fail on
// version conflicts. While we do have a version for each page in mediawiki
// (the revision timestamp) using it for versioning is a bit tricky because
// Cirrus uses two pass indexing the first time and sometimes needs to force
// updates. This is simpler but theoretically will put more load on
// Elasticsearch. At this point, though, we believe the load not to be
// substantial.
$wgCirrusSearchUpdateConflictRetryCount = 5;

// Number of characters to include in article fragments.
$wgCirrusSearchFragmentSize = 150;

// Should we add a cache warmer that searches for the main page to the content
// namespace?
// @see http://www.elasticsearch.org/guide/en/elasticsearch/reference/current/indices-warmers.html
$wgCirrusSearchMainPageCacheWarmer = true;

// Other cache warmers. Form is index name => array(searches). See examples
// commented out below.
$wgCirrusSearchCacheWarmers = [];
// $wgCirrusSearchCacheWarmers[ 'content' ][] = 'foo bar';
// $wgCirrusSearchCacheWarmers[ 'content' ][] = 'batman';
// $wgCirrusSearchCacheWarmers[ 'general' ][] = 'template:noble pipe';

// Whether to boost searches based on link counts. Default is true
// which most wikis will want. Edge cases will want to turn this off.
$wgCirrusSearchBoostLinks = true;

// Shard allocation settings. The include/exclude/require top level keys are
// the type of rule to use, the names should be self explanatory. The values
// are an array of keys and values of different rules to apply to an index.
//
// For example: if you wanted to make sure this index was only allocated to
// servers matching a specific IP block, you'd do this:
//     $wgCirrusSearchIndexAllocation['require'] = array( '_ip' => '192.168.1.*' );
// Or let's say you want to keep an index off a given host:
//     $wgCirrusSearchIndexAllocation['exclude'] = array( '_host' => 'badserver01' );
//
// Note that if you use anything other than the magic values of _ip, _name, _id
// or _host it requires you to configure the host keys/values on your server(s)
//
// http://www.elasticsearch.org/guide/en/elasticsearch/reference/current/index-modules-allocation.html
$wgCirrusSearchIndexAllocation = [
	'include' => [],
	'exclude' => [],
	'require' => [],
];

// Dumpable config parameters. These are known not to include any private
// information and thus safe to include in the config dump.
To disable the // config dump entirely add this to your configuration after including: // CirrusSearch.php: // $wgApiModules['cirrus-config-dump'] = 'ApiDisabled'; $wgCirrusSearchConfigDumpWhiteList = [ 'servers', 'connectionAttempts', 'shardCount', 'replicas', 'slowSearch', 'useExperimentalHighlighter', 'optimizeIndexForExperimentalHighlighter', 'namespaceMappings', 'extraIndexes', 'updateShardTimeout', 'clientSideUpdateTimeout', 'searchShardTimeout', 'clientSizeSearchTimeout', 'maintenanceTimeout', 'prefixSearchStartsWithAnyWord', 'phraseSlop', 'phraseRescoreBoost', 'phraseRescoreWindowSize', 'functionRescoreWindowSize', 'moreAccurateScoringMode', 'phraseSuggestMaxErrors', 'phraseSuggestConfidence', 'phraseSuggestUseText', 'phraseSuggestUseOpeningText', 'indexedRedirects', 'linkedArticlesToUpdate', 'unlikedArticlesToUpdate', 'weights', 'allFields', 'boostOpening', 'nearMatchWeight', 'stemmedWeight', 'namespaceWeights', 'defaultNamespaceWeight', 'talkNamespaceWeight', 'languageWeight', 'preferRecentDefaultDecayPortion', 'preferRecentUnspecifiedDecayPortion', 'preferRecentDefaultHalfLife', 'moreLikeThisConfig', 'showNowUsing', 'interwikiSources', 'interwikiCacheTime', 'refreshInterval', 'bannedPlugins', 'updateConflictRetryCount', 'fragmentSize', 'mainPageCacheWarmer', 'cacheWarmers', 'boostLinks', 'indexAllocation', ]; // Pool Counter key. If you use the PoolCounter extension, this can help segment your wiki's // traffic into separate queues. This has no effect in vanilla MediaWiki and most people can // just leave this as it is. $wgCirrusSearchPoolCounterKey = '_elasticsearch'; /** * Allow failures of the per-user Pool Counter to continue through. This * still runs the error callbacks to trigger logging of failures, but does * not prevent the search from running. Used to tune the per-user pool counter * settings before enabling it fully and blocking queries. 
*/ $wgCirrusSearchBypassPerUserFailure = false; /** * List of CIDR a.b.c.d/n ranges for which the per-user pool counter is * always active, regardless of wgCirrusSearchBypassPerUserFailure setting. */ $wgCirrusSearchForcePerUserPoolCounter = []; // Merge configuration for the indices. See // http://www.elasticsearch.org/guide/en/elasticsearch/reference/current/index-modules-merge.html // for the meanings. $wgCirrusSearchMergeSettings = [ 'content' => [ // Aggressive settings to try to keep the content index more optimized // because it is searched more frequently. 'max_merge_at_once' => 5, 'segments_per_tier' => 5, 'reclaim_deletes_weight' => 3.0, 'max_merged_segment' => '25g', ], 'general' => [ // The Elasticsearch defaults for this less frequently searched index. 'max_merge_at_once' => 10, 'segments_per_tier' => 10, 'reclaim_deletes_weight' => 2.0, 'max_merged_segment' => '5g', ], ]; /** * Whether search events should be logged in the client side. */ $wgCirrusSearchEnableSearchLogging = false; /** * Whether elasticsearch queries should be logged on the server side. */ $wgCirrusSearchLogElasticRequests = true; /** * When truthy and this value is passed as the cirrusLogElasticRequests query * variable $wgCirrusSearchLogElasticRequests will be set to false for that * request. */ $wgCirrusSearchLogElasticRequestsSecret = false; // The maximum number of incategory:a|b|c items to OR together. $wgCirrusSearchMaxIncategoryOptions = 100; /** * The URL of a "Give us your feedback" link to append to search results or * something falsy if you don't want to show the link. */ $wgCirrusSearchFeedbackLink = false; /** * The maximum amount of time jobs delayed due to frozen indexes can remain * in the job queue. */ $wgCirrusSearchDropDelayedJobsAfter = 60 * 60 * 24 * 2; // 2 days /** * The initial exponent used when backing off ElasticaWrite jobs. On the first * failure the backoff will be either 2^exp or 2^(exp+1). 
This exponent will
 * be increased to a maximum of exp+4 on repeated failures to run the job.
 */
$wgCirrusSearchWriteBackoffExponent = 6;

/**
 * Configuration of the individual a/b tests being run.
 * See CirrusSearch\UserTesting for more information.
 */
$wgCirrusSearchUserTesting = [];

/**
 * Profile used for search-as-you-type suggestions (completion suggestions).
 * See profiles/SuggestProfiles.php for more details.
 */
$wgCirrusSearchCompletionSettings = 'fuzzy';

/**
 * Enable ICU Folding instead of the default ASCII Folding.
 * ICU Folding covers a wider range of characters when squashing diacritics, see
 * https://www.elastic.co/guide/en/elasticsearch/plugins/current/analysis-icu-folding.html
 * Currently this setting is only used by the CompletionSuggester.
 * Requires the ICU plugin to be installed.
 * Set to true to enable, false to use the default ASCII Folding.
 * NOTE: Experimental
 */
$wgCirrusSearchUseIcuFolding = false;

/**
 * Default scoring function used by maintenance/updateSuggesterIndex.php.
 * @see includes/BuildDocument/SuggestScoring.php for more details about scoring functions
 * NOTE: if you change the scoring method you'll have to rebuild the suggester index.
 */
$wgCirrusSearchCompletionDefaultScore = 'quality';

/**
 * Use the completion suggester as the default implementation for searchSuggestions.
 * You have to build the completion suggester index with the maintenance script
 * updateSuggesterIndex.php. The suggester only supports queries to the main
 * namespace. PrefixSearch will be used in all other cases.
* Valid values, all unknown values map to 'no':
 *   yes  - Use completion suggester as the default
 *   beta - Allow users to enable completion suggester as a BetaFeature
 *   no   - Don't use completion suggester
 */
$wgCirrusSearchUseCompletionSuggester = 'no';

/**
 * Maximum number of results to request from the elasticsearch completion
 * api. Note that this value will be multiplied by the fetch_limit_factor
 * set in the Completion profiles (defaults to 2).
 */
$wgCirrusSearchCompletionSuggesterHardLimit = 50;

/**
 * Try to recycle the completion suggester index. On a small wiki it is
 * certainly better not to re-create the index from scratch, since index
 * creation is costly. Recycling the index prevents elasticsearch from
 * rebalancing shards.
 * On large wikis it may be better to create a new index, because documents
 * are then indexed and optimised with replication disabled, which limits
 * disk operations to the primary shards only.
 */
$wgCirrusSearchRecycleCompletionSuggesterIndex = true;

/**
 * Profile used for geo context search-as-you-type suggestions (completion
 * suggestions). See profiles/SuggestProfiles.php for more details.
 *
 * NOTE: This is an experimental API
 */
$wgCirrusSearchCompletionGeoContextSettings = $wgCirrusSearchCompletionGeoContextProfiles['default'];

/**
 * Enable alternative language search.
 */
$wgCirrusSearchEnableAltLanguage = false;

/**
 * Map of alternative languages and wikis, for search re-try.
 * No defaults since we don't know how people call their other language wikis.
 * Example:
 *  $wgCirrusSearchLanguageToWikiMap = [
 *      'ro' => 'ro',
 *      'de' => 'de',
 *      'ru' => 'ru',
 *  ];
 * The key is the language name, the value is the interwiki link.
 * You will also need to set:
 *  $wgCirrusSearchWikiToNameMap['ru'] = 'ruwiki';
 * to link the interwiki to the wiki DB name.
 */
$wgCirrusSearchLanguageToWikiMap = [];

/**
 * Map of interwiki link -> wiki name
 * e.g.
$wgCirrusSearchWikiToNameMap['ru'] = 'ruwiki';
 * FIXME: we really should already have this information, also we're possibly
 * duplicating $wgCirrusSearchInterwikiSources. This needs to be fixed.
 */
$wgCirrusSearchWikiToNameMap = [];

/**
 * When set to a non-empty string, interwiki results get a ?wprov=XYZ
 * parameter added.
 */
$wgCirrusSearchInterwikiProv = false;

/**
 * Name of the rescore profile used by default.
 * See profiles/RescoreProfiles.config.php for more info.
 */
$wgCirrusSearchRescoreProfile = 'classic';

/**
 * If the current wiki has fewer than this number of results, try to search
 * other language wikis.
 */
$wgCirrusSearchInterwikiThreshold = 3;

/**
 * List of classes to be used as language detectors, implementing the
 * CirrusSearch\LanguageDetector\Detector interface.
 * Detectors are called in the order given until one returns a non-null
 * result. The array key is, currently, only logged to the UserTesting logs.
 * It is intended to be added to the CirrusSearchRequestSet payload as well
 * once schema migration is complete.
 *
 * The following detectors are built in:
 *
 *  CirrusSearch\LanguageDetector\HttpAccept - uses the first language in the
 *   Accept-Language header that is not the current content language.
 *  CirrusSearch\LanguageDetector\ElasticSearch - uses the elasticsearch lang-detect plugin
 *  CirrusSearch\LanguageDetector\TextCat - uses TextCat library
 */
$wgCirrusSearchLanguageDetectors = [];

/**
 * Directory where the TextCat detector should look for its language model.
 */
$wgCirrusSearchTextcatModel = false;

/**
 * Limit the set of languages detected by Textcat.
 * Useful when some languages in the model have very bad precision, e.g.:
 * $wgCirrusSearchTextcatLanguages = array( 'ar', 'it', 'de' );
 */

/**
 * Overrides the master timeout on cluster wide actions, such as mapping updates.
 * It may be necessary to increase this on clusters that support a large number
 * of wikis.
 */
$wgCirrusSearchMasterTimeout = '30s';

/**
 * Activate/Deactivate continuous sanity check.
* The process will scan and check discrepancies between mysql and
 * elasticsearch for all possible ids in the database.
 * Settings will be automatically chosen according to wiki size (see
 * profiles/SaneitizeProfiles.php)
 * The script responsible for pushing sanitization jobs is saneitizeJobs.php.
 * It needs to be scheduled by cron, default settings provided are suited
 * for a bi-hourly schedule (--refresh-freq=7200).
 * Setting $wgCirrusSearchSanityCheck to false will prevent the script from
 * pushing new jobs even if it's still scheduled by cron.
 */
$wgCirrusSearchSanityCheck = true;

/**
 * Base name of the indexes used on this wiki. The value must be unique
 * across all wikis sharing an elasticsearch cluster, unless
 * $wgCirrusSearchMultiWikiIndices is set to true.
 */
$wgCirrusSearchIndexBaseName = wfWikiID();

/**
 * Treat question marks in simple queries as question marks, not as
 * wildcard characters, especially at the end of a query. If the query
 * doesn't use insource: and there is no escape character, remove ? from
 * the end of the query, before a word boundary, or everywhere; also
 * de-escape all escaped question marks.
 *
 * Valid values, all unknown values map to 'no':
 *   final - only strip trailing question marks and white space
 *   break - strip non-final question marks followed by a word boundary
 *   all   - strip all question marks (and replace them with spaces)
 *   no    - don't strip question marks
 */
$wgCirrusSearchStripQuestionMarks = 'all';

/**
 * Elasticsearch QueryBuilder profile to use when building FullText queries.
 */
$wgCirrusSearchFullTextQueryBuilderProfile = 'default';

/**
 * Transitionary flag for converting between older style
 * doc ids (page ids) to the newer style ids (wikiid|pageid).
 * Changing this from false to true requires first turning
 * this on, then performing an in-place reindex. There may
 * be some duplicate/outdated results while the inplace
 * reindex is running.
*/
$wgCirrusSearchPrefixIds = false;

/**
 * Adds an artificial backend latency in microseconds.
 * Only useful for testing.
 */
$wgCirrusSearchExtraBackendLatency = 0;

/**
 * Default boost-templates configuration.
 * Can be overridden on wiki and by System messages.
 *
 * $wgCirrusSearchBoostTemplates = [
 *   'Template:Featured article' => 2.0,
 * ];
 */
$wgCirrusSearchBoostTemplates = [];

/**
 * Disable customization of boost templates on wiki.
 * Set to true to disable the onwiki config.
 */
$wgCirrusSearchIgnoreOnWikiBoostTemplates = false;

// Paths to the source sub-directories. These are plain file-scope variables;
// nothing in the visible part of this file reads them after this point.
$includes = __DIR__ . "/includes/";
$apiDir = $includes . 'Api/';
$buildDocument = $includes . 'BuildDocument/';
$extraFilterDir = $includes . 'Extra/Filter/';
$jobsDir = $includes . 'Job/';
$maintenanceDir = $includes . 'Maintenance/';
$sanity = $includes . 'Sanity/';
$search = $includes . 'Search/';

/**
 * Classes
 */
require_once __DIR__ . '/autoload.php';
// The vendor autoloader is optional and only loaded when the file exists.
if ( file_exists( __DIR__ . '/vendor/autoload.php' ) ) {
	require_once __DIR__ . '/vendor/autoload.php';
}

/**
 * Hooks
 */
$wgHooks[ 'CirrusSearchBuildDocumentLinks' ][] =
	'CirrusSearch\BuildDocument\RedirectsAndIncomingLinks::buildDocument';
$wgHooks[ 'AfterImportPage' ][] = 'CirrusSearch\Hooks::onAfterImportPage';
$wgHooks[ 'ApiBeforeMain' ][] = 'CirrusSearch\Hooks::onApiBeforeMain';
$wgHooks[ 'ArticleDelete' ][] = 'CirrusSearch\Hooks::onArticleDelete';
$wgHooks[ 'ArticleDeleteComplete' ][] = 'CirrusSearch\Hooks::onArticleDeleteComplete';
$wgHooks[ 'ArticleRevisionVisibilitySet' ][] = 'CirrusSearch\Hooks::onRevisionDelete';
$wgHooks[ 'BeforeInitialize' ][] = 'CirrusSearch\Hooks::onBeforeInitialize';
$wgHooks[ 'LinksUpdateComplete' ][] = 'CirrusSearch\Hooks::onLinksUpdateCompleted';
$wgHooks[ 'ResourceLoaderGetConfigVars' ][] = 'CirrusSearch\Hooks::onResourceLoaderGetConfigVars';
$wgHooks[ 'SoftwareInfo' ][] = 'CirrusSearch\Hooks::onSoftwareInfo';
$wgHooks[ 'SpecialSearchResultsPrepend' ][] = 'CirrusSearch\Hooks::onSpecialSearchResultsPrepend';
$wgHooks[ 'SpecialSearchResultsAppend' ][] =
'CirrusSearch\Hooks::onSpecialSearchResultsAppend';
$wgHooks[ 'TitleMove' ][] = 'CirrusSearch\Hooks::onTitleMove';
$wgHooks[ 'TitleMoveComplete' ][] = 'CirrusSearch\Hooks::onTitleMoveComplete';
$wgHooks[ 'UnitTestsList' ][] = 'CirrusSearch\Hooks::onUnitTestsList';
$wgHooks[ 'ShowSearchHitTitle' ][] = 'CirrusSearch\Hooks::onShowSearchHitTitle';
$wgHooks[ 'GetBetaFeaturePreferences' ][] = 'CirrusSearch\Hooks::getBetaFeaturePreferences';
$wgHooks[ 'APIAfterExecute' ][] = 'CirrusSearch\Hooks::onAPIAfterExecute';
$wgHooks[ 'SpecialSearchResults' ][] = 'CirrusSearch\Hooks::onSpecialSearchResults';

/**
 * i18n
 */
$wgMessagesDirs['CirrusSearch'] = __DIR__ . '/i18n';

/**
 * Jobs
 *
 * Both cirrusSearchLinksUpdate and cirrusSearchLinksUpdatePrioritized are
 * served by the same job class.
 */
$wgJobClasses[ 'cirrusSearchDeletePages' ] = 'CirrusSearch\Job\DeletePages';
$wgJobClasses[ 'cirrusSearchIncomingLinkCount' ] = 'CirrusSearch\Job\IncomingLinkCount';
$wgJobClasses[ 'cirrusSearchLinksUpdate' ] = 'CirrusSearch\Job\LinksUpdate';
$wgJobClasses[ 'cirrusSearchLinksUpdatePrioritized' ] = 'CirrusSearch\Job\LinksUpdate';
$wgJobClasses[ 'cirrusSearchMassIndex' ] = 'CirrusSearch\Job\MassIndex';
$wgJobClasses[ 'cirrusSearchOtherIndex' ] = 'CirrusSearch\Job\OtherIndex';
$wgJobClasses[ 'cirrusSearchElasticaWrite' ] = 'CirrusSearch\Job\ElasticaWrite';
$wgJobClasses[ 'cirrusSearchCheckerJob' ] = 'CirrusSearch\Job\CheckerJob';

/**
 * Actions
 */
$wgActions[ 'cirrusdump' ] = 'CirrusSearch\Dump';

/**
 * API
 */
$wgAPIModules['cirrus-config-dump'] = 'CirrusSearch\Api\ConfigDump';
$wgAPIModules['cirrus-mapping-dump'] = 'CirrusSearch\Api\MappingDump';
$wgAPIModules['cirrus-settings-dump'] = 'CirrusSearch\Api\SettingsDump';

/**
 * Configs
 */
$wgConfigRegistry['CirrusSearch'] = 'CirrusSearch\SearchConfig::newFromGlobals';

/**
 * JavaScript served to all SERP's
 *
 * The array union (+=) keeps any module already registered under the same name.
 */
$wgResourceModules += [
	'ext.cirrus.serp' => [
		'scripts' => [
			'resources/ext.cirrus.serp.js',
		],
		'dependencies' => [
			'mediawiki.Uri'
		],
		'styles' => [],
		'messages' => [],
		'remoteExtPath' => 'CirrusSearch',
		'localBasePath' => __DIR__,
	],
];

/**
 * Mapping
of result types to CirrusSearch classes.
 */
$wgCirrusSearchFieldTypes = [
	SearchIndexField::INDEX_TYPE_TEXT     => \CirrusSearch\Search\TextIndexField::class,
	SearchIndexField::INDEX_TYPE_KEYWORD  => \CirrusSearch\Search\KeywordIndexField::class,
	SearchIndexField::INDEX_TYPE_INTEGER  => \CirrusSearch\Search\IntegerIndexField::class,
	SearchIndexField::INDEX_TYPE_NUMBER   => \CirrusSearch\Search\NumberIndexField::class,
	SearchIndexField::INDEX_TYPE_DATETIME => \CirrusSearch\Search\DatetimeIndexField::class,
	SearchIndexField::INDEX_TYPE_BOOL     => \CirrusSearch\Search\BooleanIndexField::class,
	SearchIndexField::INDEX_TYPE_NESTED   => \CirrusSearch\Search\NestedIndexField::class,
];

/**
 * Per-field implementation overrides, keyed by field name.
 * Useful to apply CirrusSearch specific config to fields that are
 * controlled by MediaWiki core.
 */
$wgCirrusSearchFieldTypeOverrides = [
	'opening_text' => \CirrusSearch\Search\OpeningTextIndexField::class,
];

/**
 * Jenkins configuration required to get all the browser tests passing cleanly.
 *
 * @todo re-enable the code below if/when browser tests are enabled again
 * on Jenkins for Cirrus, and ensure the job name check is specific to
 * CirrusSearch and the entry point is not included for all extension
 * browser tests that happen to have CirrusSearch as a dependency, but
 * not all the other things that the below entry point requires.
 *
 * For now, browser tests are run via Cindy the browser test bot which
 * already directly includes the entry point vs using the check below.
 *
 * Tests are also run for CirrusSearch on beta, but those don't use
 * or need the entry point below.

if ( isset( $wgWikimediaJenkinsCI ) && $wgWikimediaJenkinsCI === true && (
		PHP_SAPI !== 'cli' &&    // If we're not in the CLI then this is certainly a browser test
		strpos( getenv( 'JOB_NAME' ), 'browsertests-CirrusSearch' ) !== false ) ) {
	require( __DIR__ . '/tests/jenkins/Jenkins.php' );
}
 */