<?php

namespace CirrusSearch;

use BatchRowIterator;
use CirrusSearch;
use CirrusSearch\Iterator\CallbackIterator;
use CirrusSearch\Maintenance\Maintenance;
use IDatabase;
use JobQueueGroup;
use MediaWiki\Logger\LoggerFactory;
use MediaWiki\MediaWikiServices;
use MWException;
use MWTimestamp;
use Title;
use WikiPage;

/**
 * Force reindexing change to the wiki.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 * http://www.gnu.org/copyleft/gpl.html
 */

// Locate the MediaWiki core install.  MW_INSTALL_PATH wins when set;
// otherwise assume the standard extension layout three directories up.
$IP = getenv( 'MW_INSTALL_PATH' );
if ( $IP === false ) {
	$IP = __DIR__ . '/../../..';
}
// Pull in core's Maintenance base class, then CirrusSearch's own subclass.
require_once "$IP/maintenance/Maintenance.php";
require_once __DIR__ . '/../includes/Maintenance/Maintenance.php';

class ForceSearchIndex extends Maintenance {
	const SECONDS_BETWEEN_JOB_QUEUE_LENGTH_CHECKS = 3;

	/** @var MWTimestamp|null Exclusive lower bound when indexing by date (null when indexing by id) */
	public $fromDate = null;
	/** @var MWTimestamp|null Inclusive upper bound when indexing by date (null when indexing by id) */
	public $toDate = null;
	/** @var int|string|null Inclusive upper page_id bound when indexing by id */
	public $toId = null;
	/** @var bool True to index updates/creates, false to index deletes */
	public $indexUpdates;
	/** @var int|string|null Stop after processing this many pages; null means unlimited */
	public $limit;
	/** @var mixed Truthy to push work onto the job queue instead of indexing in-process */
	public $queue;
	/** @var int|null Pause queueing when more than this many index jobs are pending */
	public $maxJobs;
	/** @var int|null Once paused, resume when fewer than this many jobs remain */
	public $pauseForJobs;
	/** @var int|null Restrict indexing to this namespace; null means all namespaces */
	public $namespace;
	/** @var string[] Content models to exclude from indexing */
	public $excludeContentTypes;
	/** @var float Unix time (with microseconds) of the last job queue length check */
	public $lastJobQueueCheckTime = 0;

	/**
	 * @var boolean true if the script is run with --ids
	 */
	private $runWithIds = false;

	/**
	 * @var int[] list of page ids to reindex when --ids is used
	 */
	private $pageIds;

	public function __construct() {
		parent::__construct();
		$this->mDescription = "Force indexing some pages.  Setting --from or --to will switch from page id based indexing to "
			. "date based indexing which uses less efficient queries and follows redirects.\n\n"
			. "Note: All froms are _exclusive_ and all tos are _inclusive_.\n"
			. "Note 2: Setting fromId and toId use the efficient query so those are ok.\n"
			. "Note 3: Operates on all clusters unless --cluster is provided.\n";
		$this->setBatchSize( 10 );
		$this->addOption( 'from', 'Start date of reindex in YYYY-mm-ddTHH:mm:ssZ (exclusive).  Defaults to 0 epoch.', false, true );
		$this->addOption( 'to', 'Stop date of reindex in YYYY-mm-ddTHH:mm:ssZ.  Defaults to now.', false, true );
		$this->addOption( 'fromId', 'Start indexing at a specific page_id.  Not useful with --deletes.', false, true );
		$this->addOption( 'toId', 'Stop indexing at a specific page_id.  Not useful with --deletes or --from or --to.', false, true );
		$this->addOption( 'ids', 'List of page ids (comma separated) to reindex. Not allowed with deletes/from/to/fromId/toId/limit.', false, true );
		$this->addOption( 'deletes', 'If this is set then just index deletes, not updates or creates.', false );
		$this->addOption( 'limit', 'Maximum number of pages to process before exiting the script. Default to unlimited.', false, true );
		$this->addOption( 'buildChunks', 'Instead of running the script spit out commands that can be farmed out to ' .
			'different processes or machines to rebuild the index.  Works with fromId and toId, not from and to.  ' .
			'If specified as a number then chunks no larger than that size are spat out.  If specified as a number ' .
			'followed by the word "total" without a space between them then that many chunks will be spat out sized to ' .
			'cover the entire wiki.' , false, true );
		$this->addOption( 'queue', 'Rather than perform the indexes in process add them to the job queue.  Ignored for delete.' );
		$this->addOption( 'maxJobs', 'If there are more than this many index jobs in the queue then pause before adding ' .
			'more.  This is only checked every ' . self::SECONDS_BETWEEN_JOB_QUEUE_LENGTH_CHECKS . ' seconds.  Not meaningful ' .
			'without --queue.', false, true );
		$this->addOption( 'pauseForJobs', 'If paused adding jobs then wait for there to be less than this many before ' .
			'starting again.  Defaults to the value specified for --maxJobs.  Not meaningful without --queue.', false, true );
		$this->addOption( 'indexOnSkip', 'When skipping either parsing or links send the document as an index.  ' .
			'This replaces the contents of the index for that entry with the entry built from a skipped process.' .
			'Without this if the entry does not exist then it will be skipped entirely.  Only set this when running ' .
			'the first pass of building the index.  Otherwise, don\'t tempt fate by indexing half complete documents.' );
		$this->addOption( 'forceParse', 'Bypass ParserCache and do a fresh parse of pages from the Content.' );
		$this->addOption( 'skipParse', 'Skip parsing the page.  This is really only good for running the second half ' .
			'of the two phase index build.  If this is specified then the default batch size is actually 50.' );
		$this->addOption( 'skipLinks', 'Skip looking for links to the page (counting and finding redirects).  Use ' .
			'this with --indexOnSkip for the first half of the two phase index build.' );
		$this->addOption( 'namespace', 'Only index pages in this given namespace', false, true );
		$this->addOption( 'excludeContentTypes', 'Exclude pages of the specified content types. These must be a comma separated list of strings such as "wikitext" or "json" matching the CONTENT_MODEL_* constants.', false, true, false );
	}

	/**
	 * Entry point: validate options, pick the appropriate page iterator
	 * (ids / id range / date range / deletes) and feed each batch either
	 * to the job queue or directly to an Updater.
	 */
	public function execute() {
		$this->disablePoolCountersAndLogging();
		$wiki = sprintf( "[%20s]", wfWikiID() );

		// Make sure we've actually got indices to populate
		if ( !$this->simpleCheckIndexes() ) {
			$this->error( "$wiki index(es) do not exist. Did you forget to run updateSearchIndexConfig?", 1 );
		}

		// We need to check ids options early otherwise hasOption may return
		// true even if the user did not set the option on the commandline
		if ( $this->hasOption( 'ids' ) ) {
			$this->runWithIds = true;
			$this->pageIds = $this->buildPageIdBatches();
		}

		if ( !is_null( $this->getOption( 'from' ) ) || !is_null( $this->getOption( 'to' ) ) ) {
			// 0 is falsy so MWTimestamp makes that `now`.  '00' is epoch 0.
			$this->fromDate = new MWTimestamp( $this->getOption( 'from', '00' )  );
			$this->toDate = new MWTimestamp( $this->getOption( 'to', false ) );
		}
		$this->toId = $this->getOption( 'toId' );
		$this->indexUpdates = !$this->getOption( 'deletes', false );
		$this->limit = $this->getOption( 'limit' );
		$buildChunks = $this->getOption( 'buildChunks' );
		if ( $buildChunks !== null ) {
			// --buildChunks only prints commands for other processes; no indexing here.
			$this->buildChunks( $buildChunks );
			return;
		}
		$this->queue = $this->getOption( 'queue' );
		$this->maxJobs = $this->getOption( 'maxJobs' ) ? intval( $this->getOption( 'maxJobs' ) ) : null;
		$this->pauseForJobs = $this->getOption( 'pauseForJobs' ) ?
			intval( $this->getOption( 'pauseForJobs' ) ) : $this->maxJobs;
		$updateFlags = $this->buildUpdateFlags();

		// Queued and delete work is cheap per page, so use a larger default batch
		// unless the user picked one explicitly.
		if ( !$this->getOption( 'batch-size' ) &&
			( $this->getOption( 'queue' ) || $this->getOption( 'deletes' ) )
		) {
			$this->setBatchSize( 100 );
		}

		$this->namespace = $this->hasOption( 'namespace' ) ?
			intval( $this->getOption( 'namespace' ) ) : null;

		$this->excludeContentTypes = array_filter( array_map(
			'trim',
			explode( ',', $this->getOption( 'excludeContentTypes', '' ) )
		) );

		$operationName = $this->indexUpdates
			? ( $this->queue ? 'Queued' : 'Indexed' )
			: 'Deleted';

		$operationStartTime = microtime( true );
		$completed = 0;
		$rate = 0;

		if ( $this->runWithIds ) {
			$it = $this->getIdsIterator();
		} elseif ( $this->indexUpdates && $this->fromDate === null ) {
			$it = $this->getUpdatesByIdIterator();
		} elseif ( $this->indexUpdates ) {
			$it = $this->getUpdatesByDateIterator();
		} else {
			$it = $this->getDeletesIterator();
		}

		foreach ( $it as $batch ) {
			if ( $this->indexUpdates ) {
				$size = count( $batch['updates'] );
				// decidePage() may have returned nulls for unindexable pages; drop them.
				$updates = array_filter( $batch['updates'] );
				if ( $this->queue ) {
					$this->waitForQueueToShrink( $wiki );
					JobQueueGroup::singleton()->push(
						Job\MassIndex::build( $updates, $updateFlags, $this->getOption( 'cluster' ) )
					);
				} else {
					// Update size with the actual number of updated documents.
					$updater = $this->createUpdater();
					$size = $updater->updatePages( $updates, $updateFlags );
				}
			} else {
				$size = count( $batch['titlesToDelete'] );
				$updater = $this->createUpdater();
				$updater->deletePages( $batch['titlesToDelete'], $batch['docIdsToDelete'] );
			}

			$completed += $size;
			$rate = $this->calculateIndexingRate( $completed, $operationStartTime );

			$this->output( "$wiki $operationName $size pages ending at {$batch['endingAt']} at $rate/second\n" );
			if ( !is_null( $this->limit ) && $completed > $this->limit ) {
				break;
			}
		}
		$this->output( "$operationName a total of {$completed} pages at $rate/second\n" );
		$this->waitForQueueToDrain( $wiki );
	}

	/**
	 * Parse and validate the --ids option.
	 *
	 * Exits with an error if --ids is combined with an incompatible option or
	 * contains anything that is not a positive integer.
	 *
	 * @return int[] unique page ids to reindex
	 */
	private function buildPageIdBatches() {
		if ( $this->getOption( 'deletes' ) || $this->hasOption( 'limit' )
			|| $this->hasOption( 'from' ) || $this->hasOption( 'to' )
			|| $this->hasOption( 'fromId' ) || $this->hasOption( 'toId' )
		) {
			$this->error( '--ids cannot be used with deletes/from/to/fromId/toId/limit', 1 );
		}

		$pageIds = array_map( function( $pageId ) {
				$pageId = trim( $pageId );
				if ( !ctype_digit( $pageId ) ) {
					$this->error( "Invalid page id provided in --ids, got '$pageId', expected a positive integer", 1 );
				}
				return intval( $pageId );
			},
			explode( ',', $this->getOption( 'ids' ) ) );
		return array_unique( $pageIds, SORT_REGULAR );
	}

	/**
	 * Translate the skip/force options into an Updater flags bitfield.
	 *
	 * Side effect: --skipParse bumps the default batch size to 50 when the
	 * user did not choose one.
	 *
	 * @return int bitwise OR of Updater::* flag constants
	 */
	private function buildUpdateFlags() {
		$updateFlags = 0;
		if ( $this->getOption( 'indexOnSkip' ) ) {
			$updateFlags |= Updater::INDEX_ON_SKIP;
		}
		if ( $this->getOption( 'skipParse' ) ) {
			$updateFlags |= Updater::SKIP_PARSE;
			if ( !$this->getOption( 'batch-size' ) ) {
				$this->setBatchSize( 50 );
			}
		}
		if ( $this->getOption( 'skipLinks' ) ) {
			$updateFlags |= Updater::SKIP_LINKS;
		}

		if ( $this->getOption( 'forceParse' ) ) {
			$updateFlags |= Updater::FORCE_PARSE;
		}

		return $updateFlags;
	}

	/**
	 * Block while the mass-index job queue is longer than --maxJobs, polling
	 * every SECONDS_BETWEEN_JOB_QUEUE_LENGTH_CHECKS seconds until it drops
	 * below --pauseForJobs.  No-op when --maxJobs is unset or it is too soon
	 * since the last check.
	 *
	 * @param string $wiki formatted wiki id prefix for log output
	 */
	private function waitForQueueToShrink( $wiki ) {
		$now = microtime( true );
		if ( $now - $this->lastJobQueueCheckTime <= self::SECONDS_BETWEEN_JOB_QUEUE_LENGTH_CHECKS ) {
			return;
		}

		$this->lastJobQueueCheckTime = $now;
		$queueSize = $this->getUpdatesInQueue();
		if ( $this->maxJobs === null || $this->maxJobs >= $queueSize )  {
			return;
		}

		do {
			$this->output( "$wiki Waiting while job queue shrinks: $this->pauseForJobs > $queueSize\n" );
			usleep( self::SECONDS_BETWEEN_JOB_QUEUE_LENGTH_CHECKS * 1000000 );
			$queueSize = $this->getUpdatesInQueue();
		} while ( $this->pauseForJobs < $queueSize );
	}

	/**
	 * After queueing everything, poll until the mass-index queue is empty.
	 * Gives up (returns) if the queue size ever grows, which means another
	 * script is adding jobs and will do its own waiting.  No-op without
	 * --queue.
	 *
	 * @param string $wiki formatted wiki id prefix for log output
	 */
	private function waitForQueueToDrain( $wiki ) {
		if ( !$this->queue ) {
			return;
		}

		$lastQueueSizeForOurJob = PHP_INT_MAX;
		$waitStartTime = microtime( true );
		$this->output( "Waiting for jobs to drain from the queue\n" );
		while ( true ) {
			$queueSizeForOurJob = $this->getUpdatesInQueue();
			if ( $queueSizeForOurJob === 0 ) {
				return;
			}
			// If the queue grew since our last sample then someone else is
			// adding jobs; bail out and let that script wait instead.
			if ( $queueSizeForOurJob > $lastQueueSizeForOurJob ) {
				$this->output( "Queue size went up.  Another script is likely adding jobs " .
					"and it'll wait for them to empty.\n" );
				return;
			}
			if ( microtime( true ) - $waitStartTime > 120 ) {
				// Wait at least two full minutes before we check if the job count went down.
				// Less than that and we might be seeing lag from redis's counts.
				$lastQueueSizeForOurJob = $queueSizeForOurJob;
			}
			$this->output( "$wiki $queueSizeForOurJob jobs left on the queue.\n" );
			usleep( self::SECONDS_BETWEEN_JOB_QUEUE_LENGTH_CHECKS * 1000000 );
		}
	}

	/**
	 * Pages processed per second since the operation started, rounded to one
	 * decimal below 1/s and to an integer above.
	 *
	 * @param int $completed
	 * @param double $operationStartTime
	 *
	 * @return double
	 */
	private function calculateIndexingRate( $completed, $operationStartTime ) {
		$rate = $completed / ( microtime( true ) - $operationStartTime );

		if ( $rate < 1 ) {
			return round( $rate, 1 );
		}

		return round( $rate );
	}

	/**
	 * Do some simple sanity checking to make sure we've got indexes to populate.
	 * Note this isn't nearly as robust as updateSearchIndexConfig is, but it's
	 * not designed to be.
	 *
	 * @return bool
	 */
	private function simpleCheckIndexes() {
		$indexBaseName = $this->getSearchConfig()->get( SearchConfig::INDEX_BASE_NAME );

		// Top-level alias needs to exist
		if ( !$this->getConnection()->getIndex( $indexBaseName )->exists() ) {
			return false;
		}

		// Now check all index types to see if they exist
		foreach ( $this->getConnection()->getAllIndexTypes() as $indexType ) {
			// If the alias for this type doesn't exist, fail
			if ( !$this->getConnection()->getIndex( $indexBaseName, $indexType )->exists() ) {
				return false;
			}
		}

		return true;
	}

	/**
	 * Iterate batches of deleted pages from the archive table, honoring the
	 * timestamp/namespace/content-model filters.
	 *
	 * @return CallbackIterator yielding [ 'titlesToDelete', 'docIdsToDelete', 'endingAt' ]
	 */
	protected function getDeletesIterator() {
		$dbr = $this->getDB( DB_SLAVE );
		$it = new BatchRowIterator(
			$dbr,
			'archive',
			[ 'ar_timestamp', 'ar_namespace', 'ar_title' ],
			$this->mBatchSize
		);

		$this->attachPageConditions( $dbr, $it, 'ar' );
		$this->attachTimestampConditions( $dbr, $it, 'ar' );

		$it->setFetchColumns( [ 'ar_timestamp', 'ar_namespace', 'ar_title', 'ar_page_id' ] );

		return new CallbackIterator( $it, function ( $batch ) {
			$titlesToDelete = [];
			$docIdsToDelete = [];
			foreach ( $batch as $row ) {
				$titlesToDelete[] = Title::makeTitle( $row->ar_namespace, $row->ar_title );
				$docIdsToDelete[] = $this->getSearchConfig()->makeId( $row->ar_page_id );
			}

			return [
				'titlesToDelete' => $titlesToDelete,
				'docIdsToDelete' => $docIdsToDelete,
				// $row still holds the last row of the batch after the loop.
				'endingAt' => isset( $row )
					? ( new MWTimestamp( $row->ar_timestamp ) )->getTimestamp( TS_ISO_8601 )
					: 'unknown',
			];
		} );
	}

	/**
	 * Iterate batches of the explicit page ids given via --ids.
	 *
	 * @return CallbackIterator yielding [ 'updates', 'endingAt' ]
	 */
	protected function getIdsIterator() {
		$dbr = $this->getDB( DB_SLAVE );
		$it = new BatchRowIterator( $dbr, 'page', 'page_id', $this->mBatchSize );
		$it->addConditions( [
			'page_id in (' . $dbr->makeList( $this->pageIds, LIST_COMMA ) . ')',
		] );
		$this->attachPageConditions( $dbr, $it, 'page' );

		return $this->wrapDecodeResults( $it, 'page_id' );
	}

	/**
	 * Iterate batches of pages whose latest revision falls in the
	 * --from/--to window, ordered by revision timestamp.
	 *
	 * @return CallbackIterator yielding [ 'updates', 'endingAt' ]
	 */
	protected function getUpdatesByDateIterator() {
		$dbr = $this->getDB( DB_SLAVE );
		$it = new BatchRowIterator(
			$dbr,
			[ 'page', 'revision' ],
			[ 'rev_timestamp', 'page_id' ],
			$this->mBatchSize
		);
		$it->addConditions( [
			'rev_page = page_id',
			'rev_id = page_latest',
		] );

		$this->attachTimestampConditions( $dbr, $it, 'rev' );
		$this->attachPageConditions( $dbr, $it, 'page' );

		return $this->wrapDecodeResults( $it, 'rev_timestamp' );
	}

	/**
	 * Iterate batches of pages by page_id, bounded by --fromId/--toId when
	 * given.  This is the efficient default indexing path.
	 *
	 * @return CallbackIterator yielding [ 'updates', 'endingAt' ]
	 */
	protected function getUpdatesByIdIterator() {
		$dbr = $this->getDB( DB_SLAVE );
		$it = new BatchRowIterator( $dbr, 'page', 'page_id', $this->mBatchSize );
		$fromId = $this->getOption( 'fromId', 0 );
		if ( $fromId > 0 ) {
			$it->addConditions( [
				'page_id >= ' . $dbr->addQuotes( $fromId ),
			] );
		}
		if ( $this->toId ) {
			$it->addConditions( [
				'page_id <= ' . $dbr->addQuotes( $this->toId ),
			] );
		}

		$this->attachPageConditions( $dbr, $it, 'page' );

		return $this->wrapDecodeResults( $it, 'page_id' );
	}

	/**
	 * Add the --from/--to window to the iterator's WHERE clause.
	 *
	 * @param IDatabase $dbr
	 * @param BatchRowIterator $it
	 * @param string $columnPrefix table column prefix, e.g. 'rev' or 'ar'
	 */
	private function attachTimestampConditions( IDatabase $dbr, BatchRowIterator $it, $columnPrefix ) {
		// When initializing we guarantee that if either fromDate or toDate are provided
		// the other has a sane default value.
		if ( $this->fromDate ) {
			$it->addConditions( [
				"{$columnPrefix}_timestamp >= " . $dbr->addQuotes( $dbr->timestamp( $this->fromDate ) ),
				"{$columnPrefix}_timestamp <= " . $dbr->addQuotes( $dbr->timestamp( $this->toDate ) ),
			] );
		}
	}

	/**
	 * Add the --namespace and --excludeContentTypes filters to the iterator,
	 * and select the full WikiPage column set when reading the page table.
	 *
	 * @param IDatabase $dbr
	 * @param BatchRowIterator $it
	 * @param string $columnPrefix table column prefix, e.g. 'page' or 'ar'
	 */
	private function attachPageConditions( IDatabase $dbr, BatchRowIterator $it, $columnPrefix ) {
		if ( $columnPrefix === 'page' ) {
			$it->setFetchColumns( WikiPage::selectFields() );
		}
		// Compare against null explicitly: namespace 0 (NS_MAIN) is a valid,
		// falsy value and must still be filtered on.
		if ( $this->namespace !== null ) {
			$it->addConditions( [
				"{$columnPrefix}_namespace" => $this->namespace,
			] );
		}
		if ( $this->excludeContentTypes ) {
			$list = $dbr->makeList( $this->excludeContentTypes, LIST_COMMA );
			$it->addConditions( [
				"{$columnPrefix}_content_model NOT IN ($list)",
			] );
		}
	}

	/**
	 * Wrap a page-table row iterator so each batch is decoded into WikiPage
	 * objects (via decidePage) plus an 'endingAt' progress marker.
	 *
	 * @param BatchRowIterator $it
	 * @param string $endingAtColumn 'rev_timestamp' or 'page_id'
	 * @return CallbackIterator
	 */
	private function wrapDecodeResults( BatchRowIterator $it, $endingAtColumn ) {
		return new CallbackIterator( $it, function ( $batch ) use ( $endingAtColumn ) {
			// Build the updater outside the loop because it stores the redirects it hits.  Don't build it at the top
			// level so those are stored when it is freed.
			$updater = $this->createUpdater();

			$pages = [];
			foreach ( $batch as $row ) {
				// No need to call Updater::traceRedirects here because we know this is a valid page because
				// it is in the database.
				$page = WikiPage::newFromRow( $row, WikiPage::READ_LATEST );

				// null pages still get attached to keep the counts the same. They will be filtered
				// later on.
				$pages[] = $this->decidePage( $updater, $page );
			}

			if ( isset( $row ) ) {
				if ( $endingAtColumn === 'rev_timestamp' ) {
					$ts = new MWTimestamp( $row->rev_timestamp );
					$endingAt = $ts->getTimestamp( TS_ISO_8601 );
				} elseif ( $endingAtColumn === 'page_id' ) {
					$endingAt = $row->page_id;
				} else {
					throw new \MWException( 'Unknown $endingAtColumn: ' . $endingAtColumn );
				}
			} else {
				$endingAt = 'unknown';
			}

			return [
				'updates' => $pages,
				'endingAt' => $endingAt,
			];
		} );
	}

	/**
	 * Determine the actual page in the index that needs to be updated, based on a
	 * source page.
	 *
	 * @param Updater $updater
	 * @param WikiPage $page
	 * @return WikiPage|null WikiPage to be updated, or null if none.
	 */
	private function decidePage( Updater $updater, WikiPage $page ) {
		try {
			$content = $page->getContent();
		} catch ( MWException $ex ) {
			LoggerFactory::getInstance( 'CirrusSearch' )->warning(
				"Error deserializing content, skipping page: {pageId}",
				[ 'pageId' => $page->getTitle()->getArticleID() ]
			);
			return null;
		}

		if ( $content === null ) {
			// Skip pages without content.  Pages have no content because their latest revision
			// as loaded by the query above doesn't exist.
			$this->output( 'Skipping page with no content: ' . $page->getTitle()->getArticleID() . "\n" );
			return null;
		}

		if ( !$content->isRedirect() ) {
			return $page;
		}

		if ( $this->toDate === null ) {
			// Looks like we accidentally picked up a redirect when we were indexing by id and thus trying to
			// ignore redirects!  Just ignore it!  We would filter them out at the db level but that is slow
			// for large wikis.
			return null;
		}

		// We found a redirect.  Great.  Since we can't index special pages and redirects to special pages
		// are totally possible, as well as fun stuff like redirect loops, we need to use
		// Updater's redirect tracing logic which is very complete.  Also, it returns null on
		// self redirects.  Great!
		list( $page, ) = $updater->traceRedirects( $page->getTitle() );

		return $page;
	}

	/**
	 * @param string|int $buildChunks If specified as a number then chunks no
	 *  larger than that size are spat out.  If specified as a number followed
	 *  by the word "total" without a space between them then that many chunks
	 *  will be spat out sized to cover the entire wiki.
	 */
	private function buildChunks( $buildChunks ) {
		$dbr = $this->getDB( DB_SLAVE );
		if ( $this->toId === null ) {
			$this->toId = $dbr->selectField( 'page', 'MAX(page_id)' );
			if ( $this->toId === false ) {
				$this->error( "Couldn't find any pages to index.  toId = $this->toId.", 1 );
			}
		}
		$fromId = $this->getOption( 'fromId' );
		if ( $fromId === null ) {
			// fromId is exclusive, so start one below the smallest page id.
			$fromId = $dbr->selectField( 'page', 'MIN(page_id) - 1' );
			if ( $fromId === false ) {
				$this->error( "Couldn't find any pages to index.  fromId = $fromId.", 1 );
			}
		}
		if ( $fromId === $this->toId ) {
			$this->error( "Couldn't find any pages to index.  fromId = $fromId = $this->toId = toId.", 1 );
		}
		$builder = new \CirrusSearch\Maintenance\ChunkBuilder();
		$builder->build( $this->mSelf, $this->mOptions, $buildChunks, $fromId, $this->toId );
	}

	/**
	 * Get the number of cirrusSearchMassIndex jobs in the queue.
	 * @return int length
	 */
	private function getUpdatesInQueue() {
		return JobQueueGroup::singleton()->get( 'cirrusSearchMassIndex' )->getSize();
	}

	/**
	 * Build an Updater, restricted to the current cluster when --cluster
	 * was given.
	 *
	 * @return Updater
	 */
	private function createUpdater() {
		$flags = [];
		if ( $this->hasOption( 'cluster' ) ) {
			$flags[] = 'same-cluster';
		}
		return new Updater( $this->getConnection(), $this->getSearchConfig(), $flags );
	}
}

// Register the maintenance class; core runs it when this script is invoked
// directly from the command line.
$maintClass = ForceSearchIndex::class;
require_once RUN_MAINTENANCE_IF_MAIN;
