%PDF- %PDF-
Mini Shell

Mini Shell

Direktori : /www/varak.net/wiki.varak.net/extensions/CirrusSearch/maintenance/
Upload File :
Create Path :
Current File : /www/varak.net/wiki.varak.net/extensions/CirrusSearch/maintenance/saneitizeJobs.php

<?php

namespace CirrusSearch\Maintenance;

use CirrusSearch\Connection;
use CirrusSearch\SearchConfig;
use CirrusSearch\Job\CheckerJob;
use CirrusSearch\Maintenance\Maintenance;
use CirrusSearch\Sanity\NoopRemediator;
use CirrusSearch\Sanity\PrintingRemediator;
use CirrusSearch\Sanity\QueueingRemediator;
use MediaWiki\MediaWikiServices;

use JobQueueGroup;

/**
 * Push some sanitize jobs to the JobQueue
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 * http://www.gnu.org/copyleft/gpl.html
 */

// Resolve the MediaWiki install path: use MW_INSTALL_PATH when the
// environment defines it, otherwise fall back to three directories above
// this file.
$IP = getenv( 'MW_INSTALL_PATH' );
$IP = ( $IP === false ) ? __DIR__ . '/../../..' : $IP;

// Pull in the Maintenance.php under the install path first, then the
// CirrusSearch one that this script's class extends.
require_once "$IP/maintenance/Maintenance.php";
require_once __DIR__ . '/../includes/Maintenance/Maintenance.php';

class SaneitizeJobs extends Maintenance {
	/**
	 * @var MetaStoreIndex[] all metastores for write clusters, keyed by
	 * cluster name. Filled by initMetaStores().
	 */
	private $metaStores;

	/**
	 * @var int min page id (from db), read from MIN(page_id) in init()
	 */
	private $minId;

	/**
	 * @var int max page id (from db), read from MAX(page_id) in init()
	 */
	private $maxId;

	/**
	 * @var SearchConfig
	 * NOTE(review): never assigned or read in this class; the methods use
	 * getSearchConfig() instead.
	 */
	private $config;

	/**
	 * @var string profile name, the key of the entry selected by init()
	 * from CirrusSearchSanitizationProfiles
	 */
	private $profileName;

	/**
	 * Register the script description and its command line options.
	 */
	public function __construct() {
		parent::__construct();

		$this->mDescription = 'Manage sanitize jobs (CheckerJob). '
			. 'This script operates on all writable clusters by default. '
			. 'Add --cluster to work on a single cluster. '
			. 'Note that once a job has been pushed to a particular cluster '
			. 'the script will fail if you try to run the same job with '
			. 'different cluster options.';

		// Action flags, they take no value.
		$this->addOption( 'push', 'Push some jobs to the job queue.' );
		$this->addOption( 'show', 'Display job info.' );
		$this->addOption( 'delete-job', 'Delete the job.' );

		// Optional settings, each one takes an argument.
		$refreshFreqHelp = 'Refresh rate in seconds this script is run from your crontab. '
			. 'This will be used to spread jobs over time. '
			. 'Defaults to 7200 (2 hours).';
		$this->addOption( 'refresh-freq', $refreshFreqHelp, false, true );

		$jobNameHelp = 'Tells the script the name of the sanitize job '
			. 'only useful to run multiple sanitize jobs. '
			. 'Defaults to "default".';
		$this->addOption( 'job-name', $jobNameHelp, false, true );
	}

	/**
	 * Entry point: initialise the script state, then run the first
	 * requested action in the order show, push, delete-job. When none of
	 * them is given, fall through to the help output.
	 */
	public function execute() {
		$this->init();

		if ( $this->hasOption( 'show' ) ) {
			$this->showJobDetail();
			return;
		}
		if ( $this->hasOption( 'push' ) ) {
			$this->pushJobs();
			return;
		}
		if ( $this->hasOption( 'delete-job' ) ) {
			$this->deleteJob();
			return;
		}
		$this->maybeHelp( true );
	}

	/**
	 * Read the page id range from the database and pick the sanitization
	 * profile matching the wiki size.
	 *
	 * Sets $this->minId, $this->maxId and $this->profileName. The profile
	 * chosen is the one with the smallest max_wiki_size that is strictly
	 * greater than (maxId - minId). Exits with an error when no profile
	 * qualifies.
	 */
	private function init() {
		$res = $this->getDB( DB_SLAVE )->select(
			'page',
			[ 'MIN(page_id) as min_id', 'MAX(page_id) as max_id' ],
			[],
			__METHOD__
		);
		$row = $res->fetchObject();
		// Normalise to int so later comparisons and arithmetic are done on
		// integers. An empty page table (NULL aggregates) or a missing row
		// becomes the range 0..0.
		/** @suppress PhanUndeclaredProperty */
		$this->minId = $row ? (int)$row->min_id : 0;
		/** @suppress PhanUndeclaredProperty */
		$this->maxId = $row ? (int)$row->max_id : 0;

		$profiles = $this->getSearchConfig()->get( 'CirrusSearchSanitizationProfiles' );
		// Sort by ascending max_wiki_size so the first match below is the
		// tightest profile. Equal sizes must compare as 0 to keep the
		// comparator consistent.
		uasort( $profiles, function ( $a, $b ) {
			if ( $a['max_wiki_size'] == $b['max_wiki_size'] ) {
				return 0;
			}
			return $a['max_wiki_size'] < $b['max_wiki_size'] ? -1 : 1;
		} );

		$wikiSize = $this->maxId - $this->minId;
		foreach ( $profiles as $name => $settings ) {
			if ( $settings['max_wiki_size'] > $wikiSize ) {
				$this->profileName = $name;
				$this->log( "Detected $wikiSize ids to check, selecting profile $name\n" );
				break;
			}
		}
		// Compare against null explicitly: a falsy profile key such as "0"
		// is still a valid selection.
		if ( $this->profileName === null ) {
			$this->error( "No profile found for $wikiSize ids, please check sanitization profiles", 1 );
		}
	}

	/**
	 * Delete the stored job info (option --job-name, "default" when not
	 * given) from every metastore this script works on.
	 *
	 * Exits with an error when no metastore knows the job. A cluster that
	 * has no copy of the document is reported and skipped, so the job is
	 * still removed from the remaining clusters.
	 */
	private function deleteJob() {
		$jobName = $this->getOption( 'job-name', 'default' );
		$this->initMetaStores();
		$jobInfo = $this->getJobInfo( $jobName );
		if ( $jobInfo === null ) {
			$this->error( "Unknown job $jobName\n", 1 );
		}
		foreach ( $this->metaStores as $cluster => $store ) {
			try {
				$store->sanitizeType()->deleteDocument( $jobInfo );
				$this->log( "Deleted job $jobName from $cluster.\n" );
			} catch ( \Elastica\Exception\NotFoundException $e ) {
				// getJobInfo() accepts the job being absent from some
				// clusters, so a missing document here must not abort
				// the deletion on the other clusters.
				$this->log( "Job $jobName not found on $cluster, nothing to delete.\n" );
			}
		}
	}

	/**
	 * Verify that a stored job targets the same cluster(s) as this run.
	 *
	 * Two modes are supported:
	 *   - all writable clusters: no cluster recorded / no --cluster option
	 *   - a single cluster: the cluster name
	 * When the stored mode and the script option differ, the script stops
	 * with an error.
	 *
	 * @param \Elastica\Document $jobInfo stored job whose
	 *  sanitize_job_cluster field is compared with the --cluster option
	 */
	private function checkJobClusterMismatch( \Elastica\Document $jobInfo ) {
		$stored = $jobInfo->get( 'sanitize_job_cluster' );
		$requested = $this->getOption( 'cluster' );
		if ( $stored == $requested ) {
			// Same mode on both sides, nothing to report.
			return;
		}

		$storedLabel = $stored != null ? $stored : "all writable clusters";
		$requestedLabel = $requested != null ? $requested : "all writable clusters";
		$this->error( "Job cluster mismatch, stored job is configured to work on $storedLabel " .
			"but the script is configured to run on $requestedLabel.\n", 1 );
	}

	/**
	 * Print a report about the stored job (option --job-name, "default"
	 * when not given): timestamps, counters, queue pressure, sending rates
	 * and an estimate of when the current loop will finish.
	 *
	 * Exits with an error when the metastore is not ready or the job is
	 * unknown.
	 */
	private function showJobDetail() {
		if ( !MetaStoreIndex::cirrusReady( $this->getConnection() ) ) {
			$this->error( "Metastore unavailable, please index some data first.\n", 1 );
		}
		$profile = $this->getSearchConfig()->getElement( 'CirrusSearchSanitizationProfiles', $this->profileName );
		$minLoopDuration = $profile['min_loop_duration'];
		$maxJobs = $profile['max_checker_jobs'];
		$maxUpdates = $profile['update_jobs_max_pressure'];

		$this->initMetaStores();
		$jobName = $this->getOption( 'job-name', 'default' );
		$jobInfo = $this->getJobInfo( $jobName );
		if ( $jobInfo === null ) {
			$this->error( "Unknown job $jobName, push some jobs first.\n", 1 );
		}
		$fmt = 'Y-m-d H:i:s';
		$cluster = $jobInfo->get( 'sanitize_job_cluster' ) ?: 'All writable clusters';

		$created = date( $fmt, $jobInfo->get( 'sanitize_job_created' ) );
		$updated = date( $fmt, $jobInfo->get( 'sanitize_job_updated' ) );
		$loopStart = date( $fmt, $jobInfo->get( 'sanitize_job_last_loop' ) );

		$idsSent = $jobInfo->get( 'sanitize_job_ids_sent' );
		$idsSentTotal = $jobInfo->get( 'sanitize_job_ids_sent_total' );

		$jobsSent = $jobInfo->get( 'sanitize_job_jobs_sent' );
		$jobsSentTotal = $jobInfo->get( 'sanitize_job_jobs_sent_total' );

		$updatePressure = CheckerJob::getPressure();

		// Elapsed times are clamped to at least one second: a job created
		// (or a loop started) within the current second would otherwise
		// make the rate computations below divide by zero.
		$now = time();
		$loopTime = max( 1, $now - $jobInfo->get( 'sanitize_job_last_loop' ) );
		$totalTime = max( 1, $now - $jobInfo->get( 'sanitize_job_created' ) );

		// Rates are expressed per second, then scaled for display.
		$jobsRate = $jobsSent / $loopTime;
		$jobsPerHour = round( $jobsRate * 3600, 2 );
		$jobsPerDay = round( $jobsRate * 3600 * 24, 2 );
		$jobsRateTotal = $jobsSentTotal / $totalTime;
		$jobsTotalPerHour = round( $jobsRateTotal * 3600, 2 );
		$jobsTotalPerDay = round( $jobsRateTotal * 3600 * 24, 2 );

		$idsRate = $idsSent / $loopTime;
		$idsPerHour = round( $idsRate * 3600, 2 );
		$idsPerDay = round( $idsRate * 3600 * 24, 2 );
		$idsRateTotal = $idsSentTotal / $totalTime;
		$idsTotalPerHour = round( $idsRateTotal * 3600, 2 );
		$idsTotalPerDay = round( $idsRateTotal * 3600 * 24, 2 );

		$idsTodo = $this->maxId - $jobInfo->get( 'sanitize_job_id_offset' );
		// Remaining seconds = remaining ids / (ids per second). With no id
		// sent yet in this loop there is no rate to extrapolate from.
		if ( $idsRate > 0 ) {
			$loopEta = date( $fmt, $now + (int)ceil( max( 0, $idsTodo ) / $idsRate ) );
		} else {
			$loopEta = 'unknown';
		}
		$loopRestartMinTime = date( $fmt, $jobInfo->get( 'sanitize_job_last_loop' ) + $minLoopDuration );

		$this->output( <<<EOD
JobDetail for {$jobName}
	Target Wiki: 	{$jobInfo->get( 'sanitize_job_wiki' )}
	Cluster: 	{$cluster}
	Created: 	{$created}
	Updated: 	{$updated}
	Loop start:	{$loopStart}
	Current id:	{$jobInfo->get( 'sanitize_job_id_offset' )}
	Ids sent:	{$idsSent} ({$idsSentTotal} total)
	Jobs sent:	{$jobsSent} ({$jobsSentTotal} total)
	Pressure (CheckerJobs):
		Cur:	{$this->getPressure()} jobs
		Max:	{$maxJobs} jobs
	Pressure (Updates):
		Cur:	{$updatePressure} jobs
		Max:	{$maxUpdates} jobs
	Jobs rate:
		Loop:	{$jobsPerHour} jobs/hour, {$jobsPerDay} jobs/day
		Total:	{$jobsTotalPerHour} jobs/hour, {$jobsTotalPerDay} jobs/day
	Ids rate :
		Loop:	{$idsPerHour} ids/hour, {$idsPerDay} ids/day
		Total:	{$idsTotalPerHour} ids/hour, {$idsTotalPerDay} ids/day
	Loop:
		Todo:	{$idsTodo} ids
		ETA:	{$loopEta}
	Loop restart min time: {$loopRestartMinTime}

EOD
		);

	}

	/**
	 * Push a batch of CheckerJob to the job queue and record the progress
	 * in the metastore(s).
	 *
	 * Starting at the stored id offset, up to max_checker_jobs jobs are
	 * sent, each covering the inclusive id range [from, to] of at most
	 * jobs_chunk_size ids. When the end of the id range is reached the
	 * offset wraps to the min page id and the per-loop counters are reset.
	 * A new loop only starts once min_loop_duration has elapsed since the
	 * previous loop start.
	 *
	 * Exits with an error when sanity checking is disabled, when
	 * max_checker_jobs is invalid or when the CheckerJob queue already
	 * holds max_checker_jobs jobs or more.
	 */
	private function pushJobs() {
		$pushJobFreq = $this->getOption( 'refresh-freq', 2*3600 );
		if ( !$this->getSearchConfig()->get( 'CirrusSearchSanityCheck' ) ) {
			$this->error( "Sanity check disabled, abandonning...\n", 1 );
		}
		$profile = $this->getSearchConfig()->getElement( 'CirrusSearchSanitizationProfiles', $this->profileName );
		$chunkSize = $profile['jobs_chunk_size'];
		$maxJobs = $profile['max_checker_jobs'];
		if ( !$maxJobs || $maxJobs <= 0 ) {
			$this->error( "max_checker_jobs invalid abandonning.\n", 1 );
		}
		$minLoopDuration = $profile['min_loop_duration'];

		$pressure = $this->getPressure();
		if ( $pressure >= $maxJobs ) {
			$this->error( "Too many CheckerJob: $pressure in the queue, $maxJobs allowed.\n", 1 );
		}
		$this->log( "$pressure checker job(s) in the queue.\n" );

		$this->disablePoolCountersAndLogging();
		$this->initMetaStores();

		$jobName = $this->getOption( 'job-name', 'default' );
		$jobInfo = $this->getJobInfo( $jobName );
		if ( $jobInfo === null ) {
			$jobInfo = $this->createNewJob( $jobName );
		}
		// @var int
		$from = $jobInfo->get( 'sanitize_job_id_offset' );
		$lastLoop = $jobInfo->get( 'sanitize_job_last_loop' );
		if ( $from <= $this->minId ) {
			// Avoid sending too many CheckerJob for very small wikis
			if ( !$this->checkMinLoopDuration( $lastLoop,  $minLoopDuration ) ) {
				return;
			}
			$lastLoop = time();
		}
		$jobsSent = $jobInfo->get( 'sanitize_job_jobs_sent' );
		$jobsSentTotal = $jobInfo->get( 'sanitize_job_jobs_sent_total' );
		$idsSent = $jobInfo->get( 'sanitize_job_ids_sent' );
		$idsSentTotal = $jobInfo->get( 'sanitize_job_ids_sent_total' );
		// Jobs pushed by this invocation. Kept apart from $jobsSent, which
		// is the per-loop counter and is reset to 0 when the loop wraps.
		$pushedNow = 0;
		for ( $i = 0; $i < $maxJobs; $i++ ) {
			$to = min( $from + $chunkSize - 1, $this->maxId );
			$this->sendJob( $from, $to, $pushJobFreq, $jobInfo->get( 'sanitize_job_cluster' ) );
			$pushedNow++;
			$jobsSent++;
			$jobsSentTotal++;
			// [from, to] is inclusive: the next chunk starts at $to + 1.
			$chunkIds = max( 0, $to - $from + 1 );
			$idsSent += $chunkIds;
			$idsSentTotal += $chunkIds;
			$from = $to;
			if ( $from >= $this->maxId ) {
				// End of the id range: wrap and start a new loop, unless
				// the previous one started too recently.
				$from = $this->minId;
				$idsSent = 0;
				$jobsSent = 0;
				if ( !$this->checkMinLoopDuration( $lastLoop, $minLoopDuration ) ) {
					break;
				}
				$lastLoop = time();
			} else {
				$from++;
			}
		}
		$this->log( "Sent $pushedNow jobs, setting from offset to $from.\n" );
		$jobInfo->set( 'sanitize_job_last_loop', $lastLoop );
		$jobInfo->set( 'sanitize_job_id_offset', $from );
		$jobInfo->set( 'sanitize_job_jobs_sent', $jobsSent );
		$jobInfo->set( 'sanitize_job_jobs_sent_total', $jobsSentTotal );
		$jobInfo->set( 'sanitize_job_ids_sent', $idsSent );
		$jobInfo->set( 'sanitize_job_ids_sent_total', $idsSentTotal );
		$this->updateJob( $jobInfo );
	}

	/**
	 * Build one CheckerJob for the given id range and push it to the job
	 * queue with a random delay between 0 and $refreshRate.
	 *
	 * @param int $from first page id of the range
	 * @param int $to last page id of the range
	 * @param int $refreshRate upper bound of the random delay
	 * @param string|null $cluster cluster passed on to the job
	 */
	private function sendJob( $from, $to, $refreshRate, $cluster ) {
		$delay = mt_rand( 0, $refreshRate );
		$this->log( sprintf( "Pushing CheckerJob( %s, %s, %s, %s )\n", $from, $to, $delay, $cluster ) );

		$queueGroup = JobQueueGroup::singleton();
		$checkerJob = CheckerJob::build( $from, $to, $delay, $this->profileName, $cluster );
		$queueGroup->push( $checkerJob );
	}

	/**
	 * Tell whether a new loop may start now.
	 *
	 * @param int|null $lastLoop last loop start time, null when no loop
	 *  was recorded
	 * @param int $minLoopDuration minimal duration of a loop
	 * @return bool true when a new loop may start ($lastLoop is null or at
	 *  least $minLoopDuration has elapsed since $lastLoop), false when it
	 *  is still too early (a message is logged in that case)
	 */
	private function checkMinLoopDuration( $lastLoop, $minLoopDuration ) {
		if ( $lastLoop === null ) {
			return true;
		}

		$elapsed = time() - $lastLoop;
		if ( $elapsed >= $minLoopDuration ) {
			return true;
		}

		$stampFormat = 'Y-m-d H:i:s';
		$date = date( $stampFormat, $lastLoop );
		$newLoop = date( $stampFormat, $lastLoop + $minLoopDuration );
		$this->log( "Last loop ended at $date, new jobs will be sent when min_loop_duration is reached at $newLoop\n" );
		return false;
	}


	/**
	 * Build $this->metaStores, one MetaStoreIndex per target cluster and
	 * keyed by cluster name, creating or upgrading each metastore when
	 * necessary.
	 *
	 * With --cluster only that cluster is used; it must exist and be
	 * writable. Without it every writable cluster is used. Exits with an
	 * error when no usable cluster is found.
	 */
	private function initMetaStores() {
		$connections = [];
		if ( $this->hasOption( 'cluster' ) ) {
			$cluster = $this->getOption( 'cluster' );
			if ( !$this->getSearchConfig()->clusterExists( $cluster ) ) {
				$this->error( "Unknown cluster $cluster\n", 1 );
			}
			// Reject the cluster when it can NOT be written to. The check
			// used to be inverted and refused every writable cluster.
			if ( !$this->getSearchConfig()->canWriteToCluster( $cluster ) ) {
				$this->error( "$cluster is not writable\n", 1 );
			}
			$connections[$cluster] = Connection::getPool( $this->getSearchConfig(), $cluster );
		} else {
			$connections = Connection::getWritableClusterConnections( $this->getSearchConfig() );
		}

		if ( empty( $connections ) ) {
			$this->error( "No writable cluster found.", 1 );
		}

		$this->metaStores = [];
		foreach ( $connections as $cluster => $connection ) {
			$store = new MetaStoreIndex( $connection, $this );
			$store->createOrUpgradeIfNecessary();
			$this->metaStores[$cluster] = $store;
		}
	}

	/**
	 * Look the job up in every metastore and return the most recently
	 * updated copy.
	 *
	 * All copies should normally be identical, but a cluster that was
	 * decommissioned and added back may hold an outdated one. Each copy
	 * found is also checked against the --cluster option.
	 *
	 * @param string $jobName job name.
	 * @return \Elastica\Document|null the copy with the highest
	 *  sanitize_job_updated value, null when no metastore has the job
	 */
	private function getJobInfo( $jobName ) {
		$docId = $this->jobId( $jobName );
		$newest = null;
		foreach ( $this->metaStores as $store ) {
			try {
				$candidate = $store->sanitizeType()->getDocument( $docId );
				$this->checkJobClusterMismatch( $candidate );
				/** @suppress PhanNonClassMethodCall $candidate cannot be null */
				$isNewer = $newest == null
					|| $candidate->get( 'sanitize_job_updated' ) > $newest->get( 'sanitize_job_updated' );
				if ( $isNewer ) {
					$newest = $candidate;
				}
			} catch ( \Elastica\Exception\NotFoundException $e ) {
				// This metastore has no copy of the job, try the next one.
			}
		}
		return $newest;
	}

	/**
	 * Build the metastore document id of a job: the fixed prefix
	 * "sanitize-job-", the wiki id and the job name, joined with "-".
	 *
	 * @param string $jobName
	 * @return string the job id
	 */
	private function jobId( $jobName ) {
		return sprintf( 'sanitize-job-%s-%s', wfWikiID(), $jobName );
	}

	/**
	 * Stamp the job with the current time and write it to every metastore.
	 *
	 * The same timestamp is stored in sanitize_job_updated and used as the
	 * document version, with version type 'external'.
	 *
	 * @param \Elastica\Document $jobInfo the job to save
	 */
	private function updateJob( \Elastica\Document $jobInfo ) {
		$now = time();

		$jobInfo->setVersion( $now );
		// @todo: remove this suppress (https://github.com/ruflin/Elastica/pull/1134)
		/** @suppress PhanTypeMismatchArgument this method is improperly annotated */
		$jobInfo->setVersionType( 'external' );
		$jobInfo->set( 'sanitize_job_updated', $now );

		foreach ( $this->metaStores as $metaStore ) {
			$metaStore->sanitizeType()->addDocument( $jobInfo );
		}
	}

	/**
	 * Create a fresh job document and store it in every metastore.
	 *
	 * The new job starts at the min page id with all counters at zero and
	 * no recorded loop start. Its cluster is the --cluster option (null
	 * when the option is not given).
	 *
	 * @param string $jobName
	 * @return \Elastica\Document the document that was stored
	 */
	private function createNewJob( $jobName ) {
		reset( $this->metaStores );

		$fields = [
			'sanitize_job_wiki' => wfWikiID(),
			'sanitize_job_created' => time(),
			'sanitize_job_updated' => time(),
			'sanitize_job_last_loop' => null,
			'sanitize_job_cluster' => $this->getOption( 'cluster' ),
			'sanitize_job_id_offset' => $this->minId,
			'sanitize_job_ids_sent' => 0,
			'sanitize_job_ids_sent_total' => 0,
			'sanitize_job_jobs_sent' => 0,
			'sanitize_job_jobs_sent_total' => 0
		];
		$newJob = new \Elastica\Document( $this->jobId( $jobName ), $fields );

		foreach ( $this->metaStores as $metaStore ) {
			$metaStore->sanitizeType()->addDocument( $newJob );
		}
		return $newJob;
	}

	/**
	 * Count the jobs waiting in the 'cirrusSearchCheckerJob' queue.
	 *
	 * @return int the queue size plus its delayed job count
	 */
	private function getPressure() {
		$checkerQueue = JobQueueGroup::singleton()->get( 'cirrusSearchCheckerJob' );
		$queued = $checkerQueue->getSize();
		$delayed = $checkerQueue->getDelayedCount();
		return $queued + $delayed;
	}

	/**
	 * Output a message prefixed with the current date and time.
	 *
	 * @param string $msg text to print
	 * @param mixed $channel passed through to output()
	 */
	private function log( $msg, $channel = null ) {
		$stamp = ( new \DateTime() )->format( 'Y-m-d H:i:s' );
		$this->output( "$stamp $msg", $channel );
	}

	/**
	 * Report an error prefixed with the current date and time, delegating
	 * to the parent implementation.
	 *
	 * @param string $msg The error to display
	 * @param int $die If > 0, go ahead and die out using this int as the code
	 */
	public function error( $msg, $die = 0 ) {
		$stamp = ( new \DateTime() )->format( 'Y-m-d H:i:s' );
		parent::error( "$stamp $msg", $die );
	}
}

// Fully qualified name of the maintenance class defined in this file,
// followed by the include named by the RUN_MAINTENANCE_IF_MAIN constant.
// NOTE(review): presumably that include instantiates $maintClass and calls
// execute() when this file is the entry script - confirm against
// maintenance/Maintenance.php.
$maintClass = "CirrusSearch\Maintenance\SaneitizeJobs";
require_once RUN_MAINTENANCE_IF_MAIN;

Zerion Mini Shell 1.0