<?php
namespace CirrusSearch;

use BatchRowIterator;
use CirrusSearch\Iterator\CallbackIterator;
use CirrusSearch\Maintenance\Maintenance;
use IDatabase;
use JobQueueGroup;
use MediaWiki\Logger\LoggerFactory;
use MWException;
use MWTimestamp;
use Title;
use WikiPage;

/**
 * Force reindexing changes to the wiki.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
* http://www.gnu.org/copyleft/gpl.html
*/
$IP = getenv( 'MW_INSTALL_PATH' );
if ( $IP === false ) {
	$IP = __DIR__ . '/../../..';
}
require_once "$IP/maintenance/Maintenance.php";
require_once __DIR__ . '/../includes/Maintenance/Maintenance.php';
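
// Typical invocations (illustrative only; adjust paths and values to your install):
//   php extensions/CirrusSearch/maintenance/forceSearchIndex.php
//   php extensions/CirrusSearch/maintenance/forceSearchIndex.php --from 2015-01-01T00:00:00Z --to 2015-02-01T00:00:00Z
//   php extensions/CirrusSearch/maintenance/forceSearchIndex.php --ids 1,7,42
//   php extensions/CirrusSearch/maintenance/forceSearchIndex.php --queue --maxJobs 10000 --pauseForJobs 1000
//   php extensions/CirrusSearch/maintenance/forceSearchIndex.php --deletes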
class ForceSearchIndex extends Maintenance {
const SECONDS_BETWEEN_JOB_QUEUE_LENGTH_CHECKS = 3;
public $fromDate = null;
public $toDate = null;
public $toId = null;
public $indexUpdates;
public $limit;
public $queue;
public $maxJobs;
public $pauseForJobs;
public $namespace;
public $excludeContentTypes;
public $lastJobQueueCheckTime = 0;
/**
	 * @var bool true if the script is run with --ids
*/
private $runWithIds;
/**
* @var int[] list of page ids to reindex when --ids is used
*/
private $pageIds;
public function __construct() {
parent::__construct();
		$this->mDescription = "Force indexing some pages. Setting --from or --to will switch from page id based indexing to "
			. "date based indexing, which uses less efficient queries and follows redirects.\n\n"
			. "Note: All froms are _exclusive_ and all tos are _inclusive_.\n"
			. "Note 2: Setting --fromId and --toId uses the efficient query, so those are fine.\n"
			. "Note 3: Operates on all clusters unless --cluster is provided.\n";
$this->setBatchSize( 10 );
		$this->addOption( 'from', 'Start date of reindex in YYYY-mm-ddTHH:mm:ssZ (exclusive). Defaults to 0 epoch.', false, true );
$this->addOption( 'to', 'Stop date of reindex in YYYY-mm-ddTHH:mm:ssZ. Defaults to now.', false, true );
$this->addOption( 'fromId', 'Start indexing at a specific page_id. Not useful with --deletes.', false, true );
$this->addOption( 'toId', 'Stop indexing at a specific page_id. Not useful with --deletes or --from or --to.', false, true );
$this->addOption( 'ids', 'List of page ids (comma separated) to reindex. Not allowed with deletes/from/to/fromId/toId/limit.', false, true );
$this->addOption( 'deletes', 'If this is set then just index deletes, not updates or creates.', false );
		$this->addOption( 'limit', 'Maximum number of pages to process before exiting the script. Defaults to unlimited.', false, true );
		$this->addOption( 'buildChunks', 'Instead of running the script, spit out commands that can be farmed out to ' .
			'different processes or machines to rebuild the index. Works with fromId and toId, not from and to. ' .
			'If specified as a number then chunks no larger than that size are spat out. If specified as a number ' .
			'followed by the word "total" without a space between them then that many chunks will be spat out, sized to ' .
			'cover the entire wiki.', false, true );
		$this->addOption( 'queue', 'Rather than performing the indexing in process, add it to the job queue. Ignored for deletes.' );
$this->addOption( 'maxJobs', 'If there are more than this many index jobs in the queue then pause before adding ' .
'more. This is only checked every ' . self::SECONDS_BETWEEN_JOB_QUEUE_LENGTH_CHECKS . ' seconds. Not meaningful ' .
'without --queue.', false, true );
		$this->addOption( 'pauseForJobs', 'If adding jobs has been paused, wait until there are fewer than this many ' .
			'before starting again. Defaults to the value specified for --maxJobs. Not meaningful without --queue.', false, true );
		$this->addOption( 'indexOnSkip', 'When skipping either parsing or links send the document as an index. ' .
			'This replaces the contents of the index for that entry with the entry built from a skipped process. ' .
			'Without this, if the entry does not exist then it will be skipped entirely. Only set this when running ' .
			'the first pass of building the index. Otherwise, don\'t tempt fate by indexing half complete documents.' );
$this->addOption( 'forceParse', 'Bypass ParserCache and do a fresh parse of pages from the Content.' );
		$this->addOption( 'skipParse', 'Skip parsing the page. This is really only good for running the second half ' .
			'of the two-phase index build. If this is specified then the default batch size is actually 50.' );
		$this->addOption( 'skipLinks', 'Skip looking for links to the page (counting and finding redirects). Use ' .
			'this with --indexOnSkip for the first half of the two-phase index build.' );
		$this->addOption( 'namespace', 'Only index pages in the given namespace', false, true );
		$this->addOption( 'excludeContentTypes', 'Exclude pages of the specified content types. This must be a ' .
			'comma separated list of content models such as "wikitext" or "json", matching the CONTENT_MODEL_* ' .
			'constants.', false, true, false );
}
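	/**
	 * Entry point: choose the batch iterator that matches the options given,
	 * then index (or queue) updates or send deletes for each batch.
	 */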
public function execute() {
$this->disablePoolCountersAndLogging();
$wiki = sprintf( "[%20s]", wfWikiID() );
// Make sure we've actually got indices to populate
if ( !$this->simpleCheckIndexes() ) {
$this->error( "$wiki index(es) do not exist. Did you forget to run updateSearchIndexConfig?", 1 );
}
		// We need to check the ids option early, otherwise hasOption may
		// return true even if the user did not set the option on the command line.
if ( $this->hasOption( 'ids' ) ) {
$this->runWithIds = true;
$this->pageIds = $this->buildPageIdBatches();
}
if ( !is_null( $this->getOption( 'from' ) ) || !is_null( $this->getOption( 'to' ) ) ) {
// 0 is falsy so MWTimestamp makes that `now`. '00' is epoch 0.
$this->fromDate = new MWTimestamp( $this->getOption( 'from', '00' ) );
$this->toDate = new MWTimestamp( $this->getOption( 'to', false ) );
}
$this->toId = $this->getOption( 'toId' );
$this->indexUpdates = !$this->getOption( 'deletes', false );
$this->limit = $this->getOption( 'limit' );
$buildChunks = $this->getOption( 'buildChunks' );
if ( $buildChunks !== null ) {
$this->buildChunks( $buildChunks );
return;
}
$this->queue = $this->getOption( 'queue' );
$this->maxJobs = $this->getOption( 'maxJobs' ) ? intval( $this->getOption( 'maxJobs' ) ) : null;
$this->pauseForJobs = $this->getOption( 'pauseForJobs' ) ?
intval( $this->getOption( 'pauseForJobs' ) ) : $this->maxJobs;
$updateFlags = $this->buildUpdateFlags();
if ( !$this->getOption( 'batch-size' ) &&
( $this->getOption( 'queue' ) || $this->getOption( 'deletes' ) )
) {
$this->setBatchSize( 100 );
}
$this->namespace = $this->hasOption( 'namespace' ) ?
intval( $this->getOption( 'namespace' ) ) : null;
$this->excludeContentTypes = array_filter( array_map(
'trim',
explode( ',', $this->getOption( 'excludeContentTypes', '' ) )
) );
$operationName = $this->indexUpdates
? ( $this->queue ? 'Queued' : 'Indexed' )
: 'Deleted';
$operationStartTime = microtime( true );
$completed = 0;
$rate = 0;
if ( $this->runWithIds ) {
$it = $this->getIdsIterator();
		} elseif ( $this->indexUpdates && $this->fromDate === null ) {
$it = $this->getUpdatesByIdIterator();
} elseif ( $this->indexUpdates ) {
$it = $this->getUpdatesByDateIterator();
} else {
$it = $this->getDeletesIterator();
}
foreach ( $it as $batch ) {
if ( $this->indexUpdates ) {
$size = count( $batch['updates'] );
$updates = array_filter( $batch['updates'] );
if ( $this->queue ) {
$this->waitForQueueToShrink( $wiki );
JobQueueGroup::singleton()->push(
Job\MassIndex::build( $updates, $updateFlags, $this->getOption( 'cluster' ) )
);
} else {
// Update size with the actual number of updated documents.
$updater = $this->createUpdater();
$size = $updater->updatePages( $updates, $updateFlags );
}
} else {
$size = count( $batch['titlesToDelete'] );
$updater = $this->createUpdater();
$updater->deletePages( $batch['titlesToDelete'], $batch['docIdsToDelete'] );
}
$completed += $size;
$rate = $this->calculateIndexingRate( $completed, $operationStartTime );
$this->output( "$wiki $operationName $size pages ending at {$batch['endingAt']} at $rate/second\n" );
			if ( !is_null( $this->limit ) && $completed >= $this->limit ) {
break;
}
}
$this->output( "$operationName a total of {$completed} pages at $rate/second\n" );
$this->waitForQueueToDrain( $wiki );
}
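	/**
	 * Parse the --ids option into a unique list of page ids.
	 *
	 * @return int[]
	 */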
private function buildPageIdBatches() {
if ( $this->getOption( 'deletes' ) || $this->hasOption( 'limit' )
|| $this->hasOption( 'from' ) || $this->hasOption( 'to' )
|| $this->hasOption( 'fromId' ) || $this->hasOption( 'toId' )
) {
$this->error( '--ids cannot be used with deletes/from/to/fromId/toId/limit', 1 );
}
$pageIds = array_map( function( $pageId ) {
$pageId = trim( $pageId );
if ( !ctype_digit( $pageId ) ) {
$this->error( "Invalid page id provided in --ids, got '$pageId', expected a positive integer", 1 );
}
return intval( $pageId );
},
explode( ',', $this->getOption( 'ids' ) ) );
return array_unique( $pageIds, SORT_REGULAR );
}
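	/**
	 * @return int Bitmask of Updater::INDEX_ON_SKIP, Updater::SKIP_PARSE,
	 *  Updater::SKIP_LINKS and Updater::FORCE_PARSE flags
	 */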
private function buildUpdateFlags() {
$updateFlags = 0;
if ( $this->getOption( 'indexOnSkip' ) ) {
$updateFlags |= Updater::INDEX_ON_SKIP;
}
if ( $this->getOption( 'skipParse' ) ) {
$updateFlags |= Updater::SKIP_PARSE;
if ( !$this->getOption( 'batch-size' ) ) {
$this->setBatchSize( 50 );
}
}
if ( $this->getOption( 'skipLinks' ) ) {
$updateFlags |= Updater::SKIP_LINKS;
}
if ( $this->getOption( 'forceParse' ) ) {
$updateFlags |= Updater::FORCE_PARSE;
}
return $updateFlags;
}
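	/**
	 * If the job queue has grown past --maxJobs, sleep until it has drained
	 * back down below --pauseForJobs.
	 *
	 * @param string $wiki Wiki id prefix used in output
	 */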
private function waitForQueueToShrink( $wiki ) {
$now = microtime( true );
if ( $now - $this->lastJobQueueCheckTime <= self::SECONDS_BETWEEN_JOB_QUEUE_LENGTH_CHECKS ) {
return;
}
$this->lastJobQueueCheckTime = $now;
$queueSize = $this->getUpdatesInQueue();
if ( $this->maxJobs === null || $this->maxJobs >= $queueSize ) {
return;
}
do {
$this->output( "$wiki Waiting while job queue shrinks: $this->pauseForJobs > $queueSize\n" );
usleep( self::SECONDS_BETWEEN_JOB_QUEUE_LENGTH_CHECKS * 1000000 );
$queueSize = $this->getUpdatesInQueue();
} while ( $this->pauseForJobs < $queueSize );
}
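	/**
	 * When queueing, block until the remaining cirrusSearchMassIndex jobs have
	 * been processed, or until another script appears to be adding jobs.
	 *
	 * @param string $wiki Wiki id prefix used in output
	 */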
private function waitForQueueToDrain( $wiki ) {
if ( !$this->queue ) {
return;
}
$lastQueueSizeForOurJob = PHP_INT_MAX;
$waitStartTime = microtime( true );
$this->output( "Waiting for jobs to drain from the queue\n" );
while ( true ) {
$queueSizeForOurJob = $this->getUpdatesInQueue();
if ( $queueSizeForOurJob === 0 ) {
return;
}
			// Some jobs may be added by deletes while we wait, so only bail
			// out if the queue is actually growing.
			if ( $queueSizeForOurJob > $lastQueueSizeForOurJob ) {
				$this->output( "Queue size went up. Another script is likely adding jobs " .
					"and will wait for them to drain.\n" );
return;
}
if ( microtime( true ) - $waitStartTime > 120 ) {
// Wait at least two full minutes before we check if the job count went down.
				// Less than that and we might be seeing lag from Redis's counts.
$lastQueueSizeForOurJob = $queueSizeForOurJob;
}
$this->output( "$wiki $queueSizeForOurJob jobs left on the queue.\n" );
usleep( self::SECONDS_BETWEEN_JOB_QUEUE_LENGTH_CHECKS * 1000000 );
}
}
	/**
	 * @param int $completed
	 * @param float $operationStartTime
	 *
	 * @return float
	 */
private function calculateIndexingRate( $completed, $operationStartTime ) {
$rate = $completed / ( microtime( true ) - $operationStartTime );
if ( $rate < 1 ) {
return round( $rate, 1 );
}
return round( $rate );
}
/**
* Do some simple sanity checking to make sure we've got indexes to populate.
* Note this isn't nearly as robust as updateSearchIndexConfig is, but it's
* not designed to be.
*
* @return bool
*/
private function simpleCheckIndexes() {
$indexBaseName = $this->getSearchConfig()->get( SearchConfig::INDEX_BASE_NAME );
// Top-level alias needs to exist
if ( !$this->getConnection()->getIndex( $indexBaseName )->exists() ) {
return false;
}
// Now check all index types to see if they exist
foreach ( $this->getConnection()->getAllIndexTypes() as $indexType ) {
// If the alias for this type doesn't exist, fail
if ( !$this->getConnection()->getIndex( $indexBaseName, $indexType )->exists() ) {
return false;
}
}
return true;
}
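	/**
	 * @return CallbackIterator Batches of deleted pages from the archive
	 *  table, as titles and search document ids to delete
	 */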
protected function getDeletesIterator() {
$dbr = $this->getDB( DB_SLAVE );
$it = new BatchRowIterator(
$dbr,
'archive',
[ 'ar_timestamp', 'ar_namespace', 'ar_title' ],
$this->mBatchSize
);
$this->attachPageConditions( $dbr, $it, 'ar' );
$this->attachTimestampConditions( $dbr, $it, 'ar' );
$it->setFetchColumns( [ 'ar_timestamp', 'ar_namespace', 'ar_title', 'ar_page_id' ] );
return new CallbackIterator( $it, function ( $batch ) {
$titlesToDelete = [];
$docIdsToDelete = [];
foreach ( $batch as $row ) {
$titlesToDelete[] = Title::makeTitle( $row->ar_namespace, $row->ar_title );
$docIdsToDelete[] = $this->getSearchConfig()->makeId( $row->ar_page_id );
}
return [
'titlesToDelete' => $titlesToDelete,
'docIdsToDelete' => $docIdsToDelete,
'endingAt' => isset( $row )
? ( new MWTimestamp( $row->ar_timestamp ) )->getTimestamp( TS_ISO_8601 )
: 'unknown',
];
} );
}
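	/**
	 * @return CallbackIterator Batches of the pages requested with --ids
	 */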
protected function getIdsIterator() {
$dbr = $this->getDB( DB_SLAVE );
$it = new BatchRowIterator( $dbr, 'page', 'page_id', $this->mBatchSize );
$it->addConditions( [
'page_id in (' . $dbr->makeList( $this->pageIds, LIST_COMMA ) . ')',
] );
$this->attachPageConditions( $dbr, $it, 'page' );
return $this->wrapDecodeResults( $it, 'page_id' );
}
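	/**
	 * @return CallbackIterator Batches of pages whose latest revision falls
	 *  inside the --from/--to date range
	 */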
protected function getUpdatesByDateIterator() {
$dbr = $this->getDB( DB_SLAVE );
$it = new BatchRowIterator(
$dbr,
[ 'page', 'revision' ],
[ 'rev_timestamp', 'page_id' ],
$this->mBatchSize
);
$it->addConditions( [
'rev_page = page_id',
'rev_id = page_latest',
] );
$this->attachTimestampConditions( $dbr, $it, 'rev' );
$this->attachPageConditions( $dbr, $it, 'page' );
return $this->wrapDecodeResults( $it, 'rev_timestamp' );
}
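	/**
	 * @return CallbackIterator Batches of pages in the optional
	 *  --fromId/--toId page_id range
	 */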
protected function getUpdatesByIdIterator() {
$dbr = $this->getDB( DB_SLAVE );
$it = new BatchRowIterator( $dbr, 'page', 'page_id', $this->mBatchSize );
$fromId = $this->getOption( 'fromId', 0 );
if ( $fromId > 0 ) {
$it->addConditions( [
'page_id >= ' . $dbr->addQuotes( $fromId ),
] );
}
if ( $this->toId ) {
$it->addConditions( [
'page_id <= ' . $dbr->addQuotes( $this->toId ),
] );
}
$this->attachPageConditions( $dbr, $it, 'page' );
return $this->wrapDecodeResults( $it, 'page_id' );
}
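	/**
	 * @param IDatabase $dbr
	 * @param BatchRowIterator $it
	 * @param string $columnPrefix Column prefix, e.g. 'rev' or 'ar'
	 */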
private function attachTimestampConditions( IDatabase $dbr, BatchRowIterator $it, $columnPrefix ) {
		// When initializing we guarantee that if either fromDate or toDate is
		// provided, the other has a sane default value.
if ( $this->fromDate ) {
$it->addConditions( [
"{$columnPrefix}_timestamp >= " . $dbr->addQuotes( $dbr->timestamp( $this->fromDate ) ),
"{$columnPrefix}_timestamp <= " . $dbr->addQuotes( $dbr->timestamp( $this->toDate ) ),
] );
}
}
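	/**
	 * @param IDatabase $dbr
	 * @param BatchRowIterator $it
	 * @param string $columnPrefix Column prefix, e.g. 'page' or 'ar'
	 */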
private function attachPageConditions( IDatabase $dbr, BatchRowIterator $it, $columnPrefix ) {
if ( $columnPrefix === 'page' ) {
$it->setFetchColumns( WikiPage::selectFields() );
}
		// Compare against null so --namespace 0 (the main namespace) is honored.
		if ( $this->namespace !== null ) {
$it->addConditions( [
"{$columnPrefix}_namespace" => $this->namespace,
] );
}
if ( $this->excludeContentTypes ) {
$list = $dbr->makeList( $this->excludeContentTypes, LIST_COMMA );
$it->addConditions( [
"{$columnPrefix}_content_model NOT IN ($list)",
] );
}
}
	/**
	 * @param BatchRowIterator $it
	 * @param string $endingAtColumn Column to report as each batch's ending position
	 * @return CallbackIterator
	 */
private function wrapDecodeResults( BatchRowIterator $it, $endingAtColumn ) {
return new CallbackIterator( $it, function ( $batch ) use ( $endingAtColumn ) {
			// Build the updater outside the loop because it caches the redirects it hits. Don't build it
			// at the top level so that the cache is freed when each batch completes.
$updater = $this->createUpdater();
$pages = [];
foreach ( $batch as $row ) {
				// No need to call Updater::traceRedirects here because we know this is a valid page;
				// it came straight from the database.
$page = WikiPage::newFromRow( $row, WikiPage::READ_LATEST );
// null pages still get attached to keep the counts the same. They will be filtered
// later on.
$pages[] = $this->decidePage( $updater, $page );
}
if ( isset( $row ) ) {
if ( $endingAtColumn === 'rev_timestamp' ) {
$ts = new MWTimestamp( $row->rev_timestamp );
$endingAt = $ts->getTimestamp( TS_ISO_8601 );
} elseif ( $endingAtColumn === 'page_id' ) {
$endingAt = $row->page_id;
} else {
					throw new MWException( 'Unknown $endingAtColumn: ' . $endingAtColumn );
}
} else {
$endingAt = 'unknown';
}
return [
'updates' => $pages,
'endingAt' => $endingAt,
];
} );
}
/**
* Determine the actual page in the index that needs to be updated, based on a
* source page.
*
* @param Updater $updater
* @param WikiPage $page
* @return WikiPage|null WikiPage to be updated, or null if none.
*/
private function decidePage( Updater $updater, WikiPage $page ) {
try {
$content = $page->getContent();
} catch ( MWException $ex ) {
LoggerFactory::getInstance( 'CirrusSearch' )->warning(
"Error deserializing content, skipping page: {pageId}",
[ 'pageId' => $page->getTitle()->getArticleID() ]
);
return null;
}
if ( $content === null ) {
			// Skip pages without content. A page has no content when the latest revision
			// loaded by the query above no longer exists.
$this->output( 'Skipping page with no content: ' . $page->getTitle()->getArticleID() . "\n" );
return null;
}
if ( !$content->isRedirect() ) {
return $page;
}
if ( $this->toDate === null ) {
// Looks like we accidentally picked up a redirect when we were indexing by id and thus trying to
// ignore redirects! Just ignore it! We would filter them out at the db level but that is slow
// for large wikis.
return null;
}
// We found a redirect. Great. Since we can't index special pages and redirects to special pages
// are totally possible, as well as fun stuff like redirect loops, we need to use
// Updater's redirect tracing logic which is very complete. Also, it returns null on
// self redirects. Great!
list( $page, ) = $updater->traceRedirects( $page->getTitle() );
return $page;
}
/**
* @param string|int $buildChunks If specified as a number then chunks no
* larger than that size are spat out. If specified as a number followed
* by the word "total" without a space between them then that many chunks
* will be spat out sized to cover the entire wiki.
*/
private function buildChunks( $buildChunks ) {
$dbr = $this->getDB( DB_SLAVE );
if ( $this->toId === null ) {
$this->toId = $dbr->selectField( 'page', 'MAX(page_id)' );
if ( $this->toId === false ) {
$this->error( "Couldn't find any pages to index. toId = $this->toId.", 1 );
}
}
$fromId = $this->getOption( 'fromId' );
if ( $fromId === null ) {
$fromId = $dbr->selectField( 'page', 'MIN(page_id) - 1' );
if ( $fromId === false ) {
$this->error( "Couldn't find any pages to index. fromId = $fromId.", 1 );
}
}
if ( $fromId === $this->toId ) {
$this->error( "Couldn't find any pages to index. fromId = $fromId = $this->toId = toId.", 1 );
}
$builder = new \CirrusSearch\Maintenance\ChunkBuilder();
$builder->build( $this->mSelf, $this->mOptions, $buildChunks, $fromId, $this->toId );
}
/**
* Get the number of cirrusSearchMassIndex jobs in the queue.
* @return int length
*/
private function getUpdatesInQueue() {
return JobQueueGroup::singleton()->get( 'cirrusSearchMassIndex' )->getSize();
}
/**
* @return Updater
*/
private function createUpdater() {
$flags = [];
if ( $this->hasOption( 'cluster' ) ) {
$flags[] = 'same-cluster';
}
return new Updater( $this->getConnection(), $this->getSearchConfig(), $flags );
}
}
$maintClass = ForceSearchIndex::class;
require_once RUN_MAINTENANCE_IF_MAIN;