<?php

namespace CirrusSearch;

use Hooks as MWHooks;
use MediaWiki\Logger\LoggerFactory;
use ParserCache;
use ParserOutput;
use TextContent;
use Title;
use WikiPage;

/**
 * Performs updates and deletes on the Elasticsearch index. Called by
 * CirrusSearch.php (our SearchEngine implementation), forceSearchIndex
 * (for bulk updates), and CirrusSearch's jobs.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 * http://www.gnu.org/copyleft/gpl.html
 */
class Updater extends ElasticsearchIntermediary {
	// Bit field parameters for updatePages et al.
	const INDEX_EVERYTHING = 0;
	const INDEX_ON_SKIP = 1;
	const SKIP_PARSE = 2;
	const SKIP_LINKS = 4;
	const FORCE_PARSE = 8;

	/**
	 * Full title text of pages updated in this process. Used for deduplication
	 * of updates.
	 * @var string[]
	 */
	private $updated = [];

	/**
	 * @var string|null Name of cluster to write to, or null if none
	 */
	protected $writeToClusterName;

	/**
	 * @var SearchConfig
	 */
	protected $searchConfig;

	/**
	 * @param Connection $conn
	 * @param SearchConfig $config
	 * @param string[] $flags
	 */
	public function __construct( Connection $conn, SearchConfig $config, array $flags = [] ) {
		parent::__construct( $conn, null, 0 );
		$this->searchConfig = $config;
		if ( in_array( 'same-cluster', $flags ) ) {
			$this->writeToClusterName = $this->connection->getClusterName();
		}
	}

	/**
	 * Update a single page.
	 * @param Title $title
	 * @return bool True if the page (and any redirects pointing at it) was
	 *  updated, false if the update failed
	 */
	public function updateFromTitle( $title ) {
		list( $page, $redirects ) = $this->traceRedirects( $title );
		if ( $page ) {
			$updatedCount = $this->updatePages( [ $page ], self::INDEX_EVERYTHING );
			if ( $updatedCount < 0 ) {
				return false;
			}
		}

		if ( count( $redirects ) === 0 ) {
			return true;
		}
		$redirectDocIds = [];
		foreach ( $redirects as $redirect ) {
			$redirectDocIds[] = $this->searchConfig->makeId( $redirect->getId() );
		}
		return $this->deletePages( [], $redirectDocIds );
	}
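	// Illustrative call site (a sketch, not quoted from this codebase): a hook
	// handler or maintenance script would wire an Updater up roughly like this.
	// The Connection/SearchConfig construction here is an assumption; the exact
	// factory APIs vary between CirrusSearch branches.
	//
	//   $config = MediaWikiServices::getInstance()->getConfigFactory()
	//       ->makeConfig( 'CirrusSearch' );
	//   $updater = new Updater( new Connection( $config ), $config );
	//   $updater->updateFromTitle( Title::newFromText( 'Main Page' ) );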
	/**
	 * Trace redirects from the title to the destination. Also registers the title in the
	 * memory of titles updated and detects special pages.
	 *
	 * @param Title $title title to trace
	 * @return array( target, redirects )
	 *  - target is WikiPage|null: the page if $title either isn't a redirect or resolves
	 *    to an updatable page that hasn't been updated yet. Null if the page has been
	 *    updated, is a special page, or the redirects enter a loop.
	 *  - redirects is an array of WikiPages, one per redirect in the chain. If $title
	 *    isn't a redirect then this will be an empty array.
	 */
	public function traceRedirects( $title ) {
		// Loop through redirects until we get to the ultimate target
		$redirects = [];
		while ( true ) {
			$titleText = $title->getFullText();
			if ( in_array( $titleText, $this->updated ) ) {
				// Already indexed this article in this process. This is mostly useful
				// to catch self redirects but has a storied history of catching strange
				// behavior.
				return [ null, $redirects ];
			}

			// Never. Ever. Index. Negative. Namespaces.
			if ( $title->getNamespace() < 0 ) {
				return [ null, $redirects ];
			}

			$page = WikiPage::factory( $title );
			$logger = LoggerFactory::getInstance( 'CirrusSearch' );
			if ( !$page->exists() ) {
				$logger->debug( "Ignoring an update for a nonexistent page: $titleText" );
				return [ null, $redirects ];
			}
			$content = $page->getContent();
			if ( is_string( $content ) ) {
				$content = new TextContent( (string)$content );
			}
			// In the event that the content is _still_ not usable, we have to give up.
			if ( !is_object( $content ) ) {
				return [ null, $redirects ];
			}

			// Add the page to the list of updated pages before we start trying to
			// update, to catch redirect loops.
			$this->updated[] = $titleText;
			if ( $content->isRedirect() ) {
				$redirects[] = $page;
				$target = $content->getUltimateRedirectTarget();
				if ( $target->equals( $page->getTitle() ) ) {
					// This doesn't warn about redirect loops longer than one but we'll
					// catch those anyway.
					$logger->info( "Title redirecting to itself. Skip indexing" );
					return [ null, $redirects ];
				}
				$title = $target;
				continue;
			} else {
				return [ $page, $redirects ];
			}
		}
	}

	/**
	 * Update pages in Elasticsearch.
	 *
	 * $flags includes:
	 *   INDEX_EVERYTHING Cirrus will parse the page and count the links and send the
	 *     document to Elasticsearch as an index, so if it doesn't exist it'll be created.
	 *   SKIP_PARSE Cirrus will skip parsing the page when building the document. It makes
	 *     sense to do this when you know the page hasn't changed, like when it is newly
	 *     linked from another page.
	 *   SKIP_LINKS Cirrus will skip collecting links information. It makes sense to do
	 *     this when you know the link counts aren't yet available, like during the first
	 *     phase of the two phase index build.
	 *   INDEX_ON_SKIP Cirrus will send an update if SKIP_PARSE or SKIP_LINKS is set,
	 *     rather than an index. Indexing with any portion of the document skipped is
	 *     dangerous because it can put half created pages in the index. This is only a
	 *     good idea during the first half of the two phase index build.
	 *
	 * @param WikiPage[] $pages pages to update
	 * @param int $flags Bit field containing instructions about how the document should
	 *  be built and sent to Elasticsearch.
	 * @return int Number of documents updated, or -1 if there was an error
	 */
	public function updatePages( $pages, $flags ) {
		global $wgCirrusSearchWikimediaExtraPlugin;

		// Don't update the same page twice. We shouldn't, but meh.
		$pageIds = [];
		$pages = array_filter( $pages, function ( WikiPage $page ) use ( &$pageIds ) {
			if ( !in_array( $page->getId(), $pageIds ) ) {
				$pageIds[] = $page->getId();
				return true;
			}
			return false;
		} );

		$titles = $this->pagesToTitles( $pages );
		Job\OtherIndex::queueIfRequired( $titles, $this->writeToClusterName );

		$allData = array_fill_keys( $this->connection->getAllIndexTypes(), [] );
		foreach ( $this->buildDocumentsForPages( $pages, $flags ) as $document ) {
			$suffix = $this->connection->getIndexSuffixForNamespace( $document->get( 'namespace' ) );
			if ( isset( $wgCirrusSearchWikimediaExtraPlugin['super_detect_noop'] ) &&
				$wgCirrusSearchWikimediaExtraPlugin['super_detect_noop']
			) {
				$document = $this->docToSuperDetectNoopScript( $document );
			}
			$allData[$suffix][] = $document;
		}

		$count = 0;
		foreach ( $allData as $indexType => $data ) {
			// Elasticsearch has a bulk queue capacity of 50, so if $data contains that
			// many pages it could bump up against the max. Chunk it and send the chunks
			// sequentially.
			foreach ( array_chunk( $data, 10 ) as $chunked ) {
				$job = new Job\ElasticaWrite(
					reset( $titles ),
					[
						'method' => 'sendData',
						'arguments' => [ $indexType, $chunked ],
						'cluster' => $this->writeToClusterName,
					]
				);
				// This job type will insert itself into the job queue
				// with a delay if writes to ES are currently unavailable.
				$job->run();
			}
			$count += count( $data );
		}
		return $count;
	}
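	// Flag usage sketch (derived from the docblock above; the call sites are
	// illustrative, not quoted from this codebase):
	//
	//   // Normal edit: full parse, full link counts, upsert allowed.
	//   $updater->updatePages( $pages, Updater::INDEX_EVERYTHING );
	//
	//   // First phase of the two phase index build: no link counts yet, but
	//   // documents must still be created, so combine SKIP_LINKS with INDEX_ON_SKIP.
	//   $updater->updatePages( $pages, Updater::SKIP_LINKS | Updater::INDEX_ON_SKIP );
	//
	//   // Page newly linked from elsewhere: content unchanged, refresh link data only.
	//   $updater->updatePages( $pages, Updater::SKIP_PARSE );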
	/**
	 * Delete pages from the Elasticsearch index. $titles and $docIds must point to the
	 * same pages and should point to them in the same order.
	 *
	 * @param Title[] $titles List of titles to delete. If empty then other index
	 *  maintenance is skipped.
	 * @param string[] $docIds List of Elasticsearch document ids to delete
	 * @param string|null $indexType index from which to delete
	 * @return bool True if nothing happened or the delete was successfully queued
	 */
	public function deletePages( $titles, $docIds, $indexType = null ) {
		Job\OtherIndex::queueIfRequired( $titles, $this->writeToClusterName );
		$job = new Job\ElasticaWrite(
			$titles ? reset( $titles ) : Title::makeTitle( 0, "" ),
			[
				'method' => 'sendDeletes',
				'arguments' => [ $docIds, $indexType ],
				'cluster' => $this->writeToClusterName,
			]
		);
		// This job type will insert itself into the job queue
		// with a delay if writes to ES are currently paused.
		$job->run();
		return true;
	}
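	// Shape of the base document built below (a sketch; the field values are
	// illustrative, and the ContentHandler plus hooks add more fields on top):
	//
	//   {
	//     "version": 12345,
	//     "version_type": "external",
	//     "wiki": "mywiki",
	//     "namespace": 0,
	//     "namespace_text": "",
	//     "title": "Main Page",
	//     "timestamp": "2001-01-15T14:56:00Z"
	//   }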
	/**
	 * @param \WikiPage[] $pages
	 * @param int $flags
	 * @return \Elastica\Document[]
	 */
	private function buildDocumentsForPages( $pages, $flags ) {
		global $wgCirrusSearchUpdateConflictRetryCount;

		$indexOnSkip = $flags & self::INDEX_ON_SKIP;
		$skipParse = $flags & self::SKIP_PARSE;
		$skipLinks = $flags & self::SKIP_LINKS;
		$forceParse = $flags & self::FORCE_PARSE;
		$fullDocument = !( $skipParse || $skipLinks );

		$documents = [];
		$engine = new \CirrusSearch();
		foreach ( $pages as $page ) {
			$title = $page->getTitle();
			if ( !$page->exists() ) {
				LoggerFactory::getInstance( 'CirrusSearch' )->warning(
					'Attempted to build a document for a page that doesn\'t exist. This should be caught ' .
					"earlier but wasn't. Page: {title}",
					[ 'title' => $title ]
				);
				continue;
			}

			$doc = new \Elastica\Document(
				$this->searchConfig->makeId( $page->getId() ),
				[
					'version' => $page->getLatest(),
					'version_type' => 'external',
					'wiki' => wfWikiID(),
					'namespace' => $title->getNamespace(),
					'namespace_text' => Util::getNamespaceText( $title ),
					'title' => $title->getText(),
					'timestamp' => wfTimestamp( TS_ISO_8601, $page->getTimestamp() ),
				]
			);
			// Everything is sent as an update to prevent overwriting fields maintained
			// in other processes like OtherIndex::updateOtherIndex.
			// But we need a way to index documents that don't already exist. We're
			// willing to upsert any full documents, or any documents that we've been
			// explicitly told it is ok to index when they aren't full. This is typically
			// just done during the first phase of the initial index build.
			// A quick note about docAsUpsert's merging behavior: it overwrites all
			// fields provided by doc unless they are objects in both doc and the indexed
			// source. We're ok with this because all of our fields are either regular
			// types or lists of objects, and lists are overwritten.
			$doc->setDocAsUpsert( $fullDocument || $indexOnSkip );
			$doc->setRetryOnConflict( $wgCirrusSearchUpdateConflictRetryCount );

			if ( !$skipParse ) {
				$contentHandler = $page->getContentHandler();
				$output = $contentHandler->getParserOutputForIndexing(
					$page,
					$forceParse ? null : ParserCache::singleton()
				);
				foreach ( $contentHandler->getDataForSearchIndex( $page, $output, $engine ) as $field => $fieldData ) {
					$doc->set( $field, $fieldData );
				}
				// Then let hooks have a go.
				MWHooks::run( 'CirrusSearchBuildDocumentParse', [
					$doc,
					$title,
					$page->getContent(),
					$output,
					$this->connection,
				] );
			}

			if ( !$skipLinks ) {
				MWHooks::run( 'CirrusSearchBuildDocumentLinks', [ $doc, $title, $this->connection ] );
			}

			$documents[] = $doc;
		}
		return $documents;
	}

	/**
	 * Converts a document into a call to super_detect_noop from the wikimedia-extra plugin.
	 * @param \Elastica\Document $doc
	 * @return \Elastica\Script\Script
	 */
	private function docToSuperDetectNoopScript( $doc ) {
		$params = $doc->getParams();
		$params['source'] = $doc->getData();
		$params['detectors'] = [
			'incoming_links' => 'within 20%',
		];

		$script = new \Elastica\Script\Script( 'super_detect_noop', $params, 'native' );
		if ( $doc->getDocAsUpsert() ) {
			$script->setUpsert( $doc );
		}

		return $script;
	}

	/**
	 * Update the search index for newly linked or unlinked articles.
	 * @param Title[] $titles titles to update
	 * @return bool Were all pages updated?
	 */
	public function updateLinkedArticles( $titles ) {
		$pages = [];
		foreach ( $titles as $title ) {
			// Special pages don't get updated.
			if ( !$title || $title->getNamespace() < 0 ) {
				continue;
			}

			$page = WikiPage::factory( $title );
			if ( $page === null || !$page->exists() ) {
				// Skip links to nonexistent pages.
				continue;
			}
			// Resolve one level of redirects because only one level of redirects is scored.
			if ( $page->isRedirect() ) {
				$target = $page->getRedirectTarget();
				$page = new WikiPage( $target );
				if ( !$page->exists() ) {
					// Skip redirects to nonexistent pages.
					continue;
				}
			}
			if ( $page->isRedirect() ) {
				// This is a redirect to a redirect, which doesn't count in the search
				// score anyway.
				continue;
			}
			if ( in_array( $title->getFullText(), $this->updated ) ) {
				// We've already updated this page in this process so there is no need
				// to update it again.
				continue;
			}
			// Note that we don't add this page to the list of updated pages because this
			// update isn't a full update (just link counts).
			$pages[] = $page;
		}

		$updatedCount = $this->updatePages( $pages, self::SKIP_PARSE );
		return $updatedCount >= 0;
	}

	/**
	 * Convert an array of pages to an array of their titles.
	 *
	 * @param WikiPage[] $pages
	 * @return Title[]
	 */
	private function pagesToTitles( $pages ) {
		$titles = [];
		foreach ( $pages as $page ) {
			$titles[] = $page->getTitle();
		}
		return $titles;
	}
}
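// What docToSuperDetectNoopScript() produces, roughly (a sketch of the wire
// format; the exact JSON is assembled by Elastica and may differ in detail
// between Elasticsearch versions): instead of a plain partial-document update,
// Elasticsearch receives a native script invocation like
//
//   {
//     "script": "super_detect_noop",
//     "lang": "native",
//     "params": {
//       "source": { ...the document fields... },
//       "detectors": { "incoming_links": "within 20%" }
//     },
//     "upsert": { ...the document fields... }
//   }
//
// which lets the wikimedia-extra plugin drop writes whose only change is an
// incoming_links count within 20% of the stored value.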