%PDF- %PDF-
Direktori : /www/varak.net/wiki.varak.net/extensions/CirrusSearch/maintenance/ |
Current File : /www/varak.net/wiki.varak.net/extensions/CirrusSearch/maintenance/saneitizeJobs.php |
<?php

namespace CirrusSearch\Maintenance;

use CirrusSearch\Connection;
use CirrusSearch\SearchConfig;
use CirrusSearch\Job\CheckerJob;
use CirrusSearch\Maintenance\Maintenance;
use CirrusSearch\Sanity\NoopRemediator;
use CirrusSearch\Sanity\PrintingRemediator;
use CirrusSearch\Sanity\QueueingRemediator;
use MediaWiki\MediaWikiServices;
use JobQueueGroup;

/**
 * Push some sanitize jobs to the JobQueue
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 * http://www.gnu.org/copyleft/gpl.html
 */

$IP = getenv( 'MW_INSTALL_PATH' );
if( $IP === false ) {
	$IP = __DIR__ . '/../../..';
}
require_once( "$IP/maintenance/Maintenance.php" );
require_once( __DIR__ . '/../includes/Maintenance/Maintenance.php' );

/**
 * Maintenance script managing the "sanitize" job: it pushes CheckerJob
 * instances covering successive ranges of page ids, and stores its progress
 * (current id offset, counters, timestamps) as a document in the metastore
 * of every cluster it works on.
 *
 * Three actions are available: --push, --show and --delete-job.
 */
class SaneitizeJobs extends Maintenance {
	/**
	 * @var MetaStoreIndex[] all metastores for write clusters, keyed by cluster name
	 */
	private $metaStores;

	/**
	 * @var int min page id (from db)
	 */
	private $minId;

	/**
	 * @var int max page id (from db)
	 */
	private $maxId;

	/**
	 * @var SearchConfig
	 */
	private $config;

	/**
	 * @var string|null profile name, null until init() has selected one
	 */
	private $profileName;

	public function __construct() {
		parent::__construct();
		$this->mDescription = 'Manage sanitize jobs (CheckerJob). This ' .
			'script operates on all writable clusters by default. ' .
			'Add --cluster to work on a single cluster. Note that ' .
			'once a job has been pushed to a particular cluster the ' .
			'script will fail if you try to run the same job with ' .
			'different cluster options.';
		$this->addOption( 'push', 'Push some jobs to the job queue.' );
		$this->addOption( 'show', 'Display job info.' );
		$this->addOption( 'delete-job', 'Delete the job.' );
		$this->addOption( 'refresh-freq', 'Refresh rate in seconds this ' .
			'script is run from your crontab. This will be '.
			'used to spread jobs over time. Defaults to 7200 (2 ' .
			'hours).', false, true );
		$this->addOption( 'job-name', 'Tells the script the name of the ' .
			'sanitize job only useful to run multiple sanitize jobs. ' .
			'Defaults to "default".', false, true );
	}

	/**
	 * Entry point: dispatch to the action selected on the command line.
	 * When several actions are given, --show wins over --push, which wins
	 * over --delete-job. With no action the help is displayed.
	 */
	public function execute() {
		$this->init();
		if ( $this->hasOption( 'show' ) ) {
			$this->showJobDetail();
		} elseif ( $this->hasOption( 'push' ) ) {
			$this->pushJobs();
		} elseif ( $this->hasOption( 'delete-job' ) ) {
			$this->deleteJob();
		} else {
			$this->maybeHelp( true );
		}
	}

	/**
	 * Read the page id range from the database and select the first
	 * sanitization profile (ordered by max_wiki_size) whose max_wiki_size
	 * is greater than that range. Errors out if no profile matches.
	 */
	private function init() {
		$res = $this->getDB( DB_SLAVE )->select( 'page',
			[ 'MIN(page_id) as min_id', 'MAX(page_id) as max_id' ] );
		// fetchObject() returns the first row of a fresh result set; next()
		// is not used because newer result wrappers do not return the row.
		$row = $res->fetchObject();
		// Cast to int: the database layer may hand back strings, or null
		// when the page table is empty.
		/** @suppress PhanUndeclaredProperty */
		$this->minId = (int)$row->min_id;
		/** @suppress PhanUndeclaredProperty */
		$this->maxId = (int)$row->max_id;

		$profiles = $this->getSearchConfig()->get( 'CirrusSearchSanitizationProfiles' );
		uasort( $profiles, function( $a, $b ) {
			if ( $a['max_wiki_size'] == $b['max_wiki_size'] ) {
				return 0;
			}
			return $a['max_wiki_size'] < $b['max_wiki_size'] ? -1 : 1;
		} );

		$wikiSize = $this->maxId - $this->minId;
		foreach( $profiles as $name => $settings ) {
			if ( $settings['max_wiki_size'] > $wikiSize ) {
				$this->profileName = $name;
				$this->log( "Detected $wikiSize ids to check, selecting profile $name\n" );
				break;
			}
		}

		// Strict null check so that a profile whose name is falsy (e.g. "0")
		// is not mistaken for "no profile".
		if ( $this->profileName === null ) {
			$this->error( "No profile found for $wikiSize ids, please check sanitization profiles", 1 );
		}
	}

	/**
	 * Delete the stored job document from every metastore this script
	 * is configured to work on.
	 */
	private function deleteJob() {
		$jobName = $this->getOption( 'job-name', 'default' );
		$this->initMetaStores();
		$jobInfo = $this->getJobInfo( $jobName );
		if ( $jobInfo === null ) {
			$this->error( "Unknown job $jobName\n", 1 );
		}
		foreach( $this->metaStores as $cluster => $store ) {
			try {
				$store->sanitizeType()->deleteDocument( $jobInfo );
				$this->log( "Deleted job $jobName from $cluster.\n" );
			} catch( \Elastica\Exception\NotFoundException $e ) {
				// getJobInfo() accepts that the job may be missing from some
				// clusters; do not let one of those abort the remaining deletes.
				$this->log( "Job $jobName not found on $cluster, nothing to delete.\n" );
			}
		}
	}

	/**
	 * Basically we support two modes:
	 * - all writable cluster, cluster = null
	 * - single cluster, cluster = 'clusterName'
	 * If we detect a mismatch here we fail.
	 * @param \Elastica\Document $jobInfo check if the stored job match
	 * cluster config used by this script, will die if clusters mismatch
	 */
	private function checkJobClusterMismatch( \Elastica\Document $jobInfo ) {
		$jobCluster = $jobInfo->get( 'sanitize_job_cluster' );
		$scriptCluster = $this->getOption( 'cluster' );
		// Loose comparison is kept on purpose: "no cluster" must match
		// however it was stored.
		if ( $jobCluster != $scriptCluster ) {
			$jobCluster = $jobCluster != null ? $jobCluster : "all writable clusters";
			$scriptCluster = $scriptCluster != null ? $scriptCluster : "all writable clusters";
			$this->error( "Job cluster mismatch, stored job is configured to work on $jobCluster " .
				"but the script is configured to run on $scriptCluster.\n", 1 );
		}
	}

	/**
	 * Print the stored state of the job together with derived statistics:
	 * queue pressure, job/id rates for the current loop and since creation,
	 * remaining ids and an estimated end time for the current loop.
	 */
	private function showJobDetail() {
		if ( !MetaStoreIndex::cirrusReady( $this->getConnection() ) ) {
			$this->error( "Metastore unavailable, please index some data first.\n", 1 );
		}
		$profile = $this->getSearchConfig()
			->getElement( 'CirrusSearchSanitizationProfiles', $this->profileName );
		$minLoopDuration = $profile['min_loop_duration'];
		$maxJobs = $profile['max_checker_jobs'];
		$maxUpdates = $profile['update_jobs_max_pressure'];

		$this->initMetaStores();
		$jobName = $this->getOption( 'job-name', 'default' );
		$jobInfo = $this->getJobInfo( $jobName );
		if ( $jobInfo === null ) {
			$this->error( "Unknown job $jobName, push some jobs first.\n", 1 );
		}

		$fmt = 'Y-m-d H:i:s';
		$now = time();
		$cluster = $jobInfo->get( 'sanitize_job_cluster' ) ?: 'All writable clusters';
		$createdTs = (int)$jobInfo->get( 'sanitize_job_created' );
		// last_loop is null for a job that was created but never completed a push
		$lastLoop = $jobInfo->get( 'sanitize_job_last_loop' );

		$created = date( $fmt, $createdTs );
		$updated = date( $fmt, (int)$jobInfo->get( 'sanitize_job_updated' ) );
		$loopStart = $lastLoop !== null ? date( $fmt, (int)$lastLoop ) : 'never';

		$idsSent = $jobInfo->get( 'sanitize_job_ids_sent' );
		$idsSentTotal = $jobInfo->get( 'sanitize_job_ids_sent_total' );
		$jobsSent = $jobInfo->get( 'sanitize_job_jobs_sent' );
		$jobsSentTotal = $jobInfo->get( 'sanitize_job_jobs_sent_total' );
		$updatePressure = CheckerJob::getPressure();

		// Elapsed times are clamped to 1 second so the rates below never
		// divide by zero when the loop/job started within the current second.
		$loopTime = max( 1, $now - (int)$lastLoop );
		$totalTime = max( 1, $now - $createdTs );

		$jobsRate = $jobsSent / $loopTime;
		$jobsPerHour = round( $jobsRate * 3600, 2 );
		$jobsPerDay = round( $jobsRate * 3600 * 24, 2 );
		$jobsRateTotal = $jobsSentTotal / $totalTime;
		$jobsTotalPerHour = round( $jobsRateTotal * 3600, 2 );
		$jobsTotalPerDay = round( $jobsRateTotal * 3600 * 24, 2 );

		$idsRate = $idsSent / $loopTime;
		$idsPerHour = round( $idsRate * 3600, 2 );
		$idsPerDay = round( $idsRate * 3600 * 24, 2 );
		$idsRateTotal = $idsSentTotal / $totalTime;
		$idsTotalPerHour = round( $idsRateTotal * 3600, 2 );
		$idsTotalPerDay = round( $idsRateTotal * 3600 * 24, 2 );

		$idsTodo = $this->maxId - $jobInfo->get( 'sanitize_job_id_offset' );
		// Remaining seconds = remaining ids / ids per second. No estimate can
		// be given while nothing has been sent in the current loop.
		$loopEta = $idsRate > 0
			? date( $fmt, $now + (int)ceil( $idsTodo / $idsRate ) )
			: 'unknown';
		$loopRestartMinTime = $lastLoop !== null
			? date( $fmt, (int)$lastLoop + $minLoopDuration )
			: 'n/a';

		$this->output( <<<EOD
JobDetail for {$jobName}
	Target Wiki: 	{$jobInfo->get( 'sanitize_job_wiki' )}
	Cluster: 	{$cluster}
	Created: 	{$created}
	Updated: 	{$updated}
	Loop start:	{$loopStart}
	Current id:	{$jobInfo->get( 'sanitize_job_id_offset' )}
	Ids sent:	{$idsSent} ({$idsSentTotal} total)
	Jobs sent:	{$jobsSent} ({$jobsSentTotal} total)
	Pressure (CheckerJobs):
		Cur:	{$this->getPressure()} jobs
		Max:	{$maxJobs} jobs
	Pressure (Updates):
		Cur:	{$updatePressure} jobs
		Max:	{$maxUpdates} jobs
	Jobs rate:
		Loop:	{$jobsPerHour} jobs/hour, {$jobsPerDay} jobs/day
		Total:	{$jobsTotalPerHour} jobs/hour, {$jobsTotalPerDay} jobs/day
	Ids rate :
		Loop:	{$idsPerHour} ids/hour, {$idsPerDay} ids/day
		Total:	{$idsTotalPerHour} ids/hour, {$idsTotalPerDay} ids/day
	Loop:
		Todo:	{$idsTodo} ids
		ETA:	{$loopEta}
	Loop restart min time: {$loopRestartMinTime}

EOD
		);
	}

	/**
	 * Push up to max_checker_jobs CheckerJob instances, each covering
	 * jobs_chunk_size page ids starting at the stored offset, then persist
	 * the new offset and counters. When the end of the id range is reached
	 * the offset wraps to the min id, and a new loop only starts once
	 * min_loop_duration has elapsed since the previous loop started.
	 */
	private function pushJobs() {
		// Option values come from the command line as strings; normalise to a
		// non-negative int because it is used as the upper bound of mt_rand().
		$pushJobFreq = max( 0, (int)$this->getOption( 'refresh-freq', 2 * 3600 ) );
		if ( !$this->getSearchConfig()->get( 'CirrusSearchSanityCheck' ) ) {
			$this->error( "Sanity check disabled, abandonning...\n", 1 );
		}
		$profile = $this->getSearchConfig()
			->getElement( 'CirrusSearchSanitizationProfiles', $this->profileName );
		$chunkSize = $profile['jobs_chunk_size'];
		$maxJobs = $profile['max_checker_jobs'];
		if ( !$maxJobs || $maxJobs <= 0 ) {
			$this->error( "max_checker_jobs invalid abandonning.\n", 1 );
		}
		$minLoopDuration = $profile['min_loop_duration'];

		$pressure = $this->getPressure();
		if ( $pressure >= $maxJobs ) {
			$this->error( "Too many CheckerJob: $pressure in the queue, $maxJobs allowed.\n", 1 );
		}
		$this->log( "$pressure checker job(s) in the queue.\n" );

		$this->disablePoolCountersAndLogging();
		$this->initMetaStores();

		$jobName = $this->getOption( 'job-name', 'default' );
		$jobInfo = $this->getJobInfo( $jobName );
		if ( $jobInfo === null ) {
			$jobInfo = $this->createNewJob( $jobName );
		}

		// @var int
		$from = $jobInfo->get( 'sanitize_job_id_offset' );
		$lastLoop = $jobInfo->get( 'sanitize_job_last_loop' );
		if ( $from <= $this->minId ) {
			// Avoid sending too many CheckerJob for very small wikis
			if ( !$this->checkMinLoopDuration( $lastLoop, $minLoopDuration ) ) {
				return;
			}
			$lastLoop = time();
		}

		$jobsSent = $jobInfo->get( 'sanitize_job_jobs_sent' );
		$jobsSentTotal = $jobInfo->get( 'sanitize_job_jobs_sent_total' );
		$idsSent = $jobInfo->get( 'sanitize_job_ids_sent' );
		$idsSentTotal = $jobInfo->get( 'sanitize_job_ids_sent_total' );
		for ( $i = 0; $i < $maxJobs; $i++ ) {
			$to = min( $from + $chunkSize - 1, $this->maxId );
			$this->sendJob( $from, $to, $pushJobFreq, $jobInfo->get( 'sanitize_job_cluster' ) );
			$jobsSent++;
			$jobsSentTotal++;
			// [from, to] is inclusive (the next chunk starts at to + 1),
			// so it holds to - from + 1 ids.
			$chunkIds = $to - $from + 1;
			$idsSent += $chunkIds;
			$idsSentTotal += $chunkIds;
			$from = $to;
			if ( $from >= $this->maxId ) {
				// End of the id range: wrap around and reset per-loop counters
				$from = $this->minId;
				$idsSent = 0;
				$jobsSent = 0;
				if ( !$this->checkMinLoopDuration( $lastLoop, $minLoopDuration ) ) {
					break;
				}
				$lastLoop = time();
			} else {
				$from++;
			}
		}
		$this->log( "Sent $jobsSent jobs, setting from offset to $from.\n" );
		$jobInfo->set( 'sanitize_job_last_loop', $lastLoop );
		$jobInfo->set( 'sanitize_job_id_offset', $from );
		$jobInfo->set( 'sanitize_job_jobs_sent', $jobsSent );
		$jobInfo->set( 'sanitize_job_jobs_sent_total', $jobsSentTotal );
		$jobInfo->set( 'sanitize_job_ids_sent', $idsSent );
		$jobInfo->set( 'sanitize_job_ids_sent_total', $idsSentTotal );
		$this->updateJob( $jobInfo );
	}

	/**
	 * Push one CheckerJob for the given id range, with a random delay
	 * between 0 and $refreshRate so jobs are spread over time.
	 *
	 * @param int $from
	 * @param int $to
	 * @param int $refreshRate
	 * @param string|null $cluster
	 */
	private function sendJob( $from, $to, $refreshRate, $cluster ) {
		$delay = mt_rand( 0, $refreshRate );
		$this->log( "Pushing CheckerJob( $from, $to, $delay, $cluster )\n");
		JobQueueGroup::singleton()->push(
			CheckerJob::build( $from, $to, $delay, $this->profileName, $cluster )
		);
	}

	/**
	 * Tell whether a new loop may start now.
	 *
	 * @param int|null $lastLoop last loop start time
	 * @param int $minLoopDuration minimal duration of a loop
	 * @return bool true if there was no previous loop or if $minLoopDuration
	 *  has elapsed since $lastLoop; false (after logging when the next loop
	 *  becomes possible) if it is still too early
	 */
	private function checkMinLoopDuration( $lastLoop, $minLoopDuration ) {
		if ( $lastLoop !== null && ( time() - $lastLoop ) < $minLoopDuration ) {
			$date = date( 'Y-m-d H:i:s', $lastLoop );
			$newLoop = date( 'Y-m-d H:i:s', $lastLoop + $minLoopDuration );
			$this->log( "Last loop ended at $date, new jobs will be sent when min_loop_duration is reached at $newLoop\n" );
			return false;
		}
		return true;
	}

	/**
	 * Populate $this->metaStores with one MetaStoreIndex per target cluster:
	 * the cluster given with --cluster (which must exist and be writable),
	 * or every writable cluster otherwise. Each metastore is created or
	 * upgraded if necessary.
	 */
	private function initMetaStores() {
		$connections = [];
		if ( $this->hasOption( 'cluster' ) ) {
			$cluster = $this->getOption( 'cluster' );
			if ( !$this->getSearchConfig()->clusterExists( $cluster ) ) {
				$this->error( "Unknown cluster $cluster\n", 1 );
			}
			// Refuse clusters we can NOT write to (job state is stored there)
			if ( !$this->getSearchConfig()->canWriteToCluster( $cluster ) ) {
				$this->error( "$cluster is not writable\n", 1 );
			}
			$connections[$cluster] = Connection::getPool( $this->getSearchConfig(), $cluster );
		} else {
			$connections = Connection::getWritableClusterConnections( $this->getSearchConfig() );
		}

		if ( empty( $connections ) ) {
			$this->error( "No writable cluster found.", 1 );
		}

		$this->metaStores = [];
		foreach ( $connections as $cluster => $connection ) {
			$store = new MetaStoreIndex( $connection, $this );
			$store->createOrUpgradeIfNecessary();
			$this->metaStores[$cluster] = $store;
		}
	}

	/**
	 * Fetch the job document, keeping the most recently updated copy when
	 * several metastores hold one.
	 *
	 * @param string $jobName job name.
	 * @return \Elastica\Document|null null if no metastore knows this job
	 */
	private function getJobInfo( $jobName ) {
		$latest = null;
		// Fetch the latest jobInfo from the metastore. Ideally all
		// jobInfo should be the same but in the case a cluster has
		// been decommissioned and re-added its job info may be outdated
		foreach ( $this->metaStores as $metastore ) {
			try {
				// Try to fetch the JobInfo from one of the metastore
				$current = $metastore->sanitizeType()->getDocument( $this->jobId( $jobName ) );
				$this->checkJobClusterMismatch( $current );
				if ( $latest === null ) {
					$latest = $current;
				} elseif ( $current->get( 'sanitize_job_updated' ) > $latest->get( 'sanitize_job_updated' ) ) {
					$latest = $current;
				}
			} catch( \Elastica\Exception\NotFoundException $e ) {
				// This metastore does not have the job; try the next one.
			}
		}
		return $latest;
	}

	/**
	 * @param string $jobName
	 * @return string the job id
	 */
	private function jobId( $jobName ) {
		return 'sanitize-job-' . wfWikiID() . '-' . $jobName;
	}

	/**
	 * Stamp the job document with the current time (used both as the
	 * sanitize_job_updated field and as the external document version)
	 * and write it to every metastore.
	 *
	 * @param \Elastica\Document $jobInfo
	 */
	private function updateJob( \Elastica\Document $jobInfo ) {
		$version = time();
		$jobInfo->set( 'sanitize_job_updated', $version );
		$jobInfo->setVersion( $version );
		// @todo: remove this suppress (https://github.com/ruflin/Elastica/pull/1134)
		/** @suppress PhanTypeMismatchArgument this method is improperly annotated */
		$jobInfo->setVersionType( 'external' );
		foreach( $this->metaStores as $store ) {
			$store->sanitizeType()->addDocument( $jobInfo );
		}
	}

	/**
	 * Build a fresh job document starting at the min page id with all
	 * counters at zero, and store it in every metastore.
	 *
	 * @param string $jobName
	 * @return \Elastica\Document
	 */
	private function createNewJob( $jobName ) {
		$now = time();
		$job = new \Elastica\Document(
			$this->jobId( $jobName ),
			[
				'sanitize_job_wiki' => wfWikiID(),
				'sanitize_job_created' => $now,
				'sanitize_job_updated' => $now,
				'sanitize_job_last_loop' => null,
				'sanitize_job_cluster' => $this->getOption( 'cluster' ),
				'sanitize_job_id_offset' => $this->minId,
				'sanitize_job_ids_sent' => 0,
				'sanitize_job_ids_sent_total' => 0,
				'sanitize_job_jobs_sent' => 0,
				'sanitize_job_jobs_sent_total' => 0
			]
		);
		foreach( $this->metaStores as $store ) {
			$store->sanitizeType()->addDocument( $job );
		}
		return $job;
	}

	/**
	 * @return int the number of jobs in the CheckerJob queue
	 */
	private function getPressure() {
		$queue = JobQueueGroup::singleton()->get( 'cirrusSearchCheckerJob' );
		return $queue->getSize() + $queue->getDelayedCount();
	}

	/**
	 * Output a message prefixed with the current date and time.
	 *
	 * @param string $msg
	 * @param string|null $channel passed through to output()
	 */
	private function log( $msg, $channel = null ) {
		$date = new \DateTime();
		$this->output( $date->format('Y-m-d H:i:s') . " " . $msg, $channel );
	}

	/**
	 * Report an error prefixed with the current date and time.
	 *
	 * @param string $msg The error to display
	 * @param int $die If > 0, go ahead and die out using this int as the code
	 */
	public function error( $msg, $die = 0 ) {
		$date = new \DateTime();
		parent::error( $date->format('Y-m-d H:i:s') . " " . $msg, $die );
	}
}

$maintClass = "CirrusSearch\Maintenance\SaneitizeJobs";
require_once RUN_MAINTENANCE_IF_MAIN;