%PDF- %PDF-
Mini Shell

Mini Shell

Direktori : /www/varak.net/wiki.varak.net/extensions/CirrusSearch/includes/Sanity/
Upload File :
Create Path :
Current File : /www/varak.net/wiki.varak.net/extensions/CirrusSearch/includes/Sanity/Checker.php

<?php

namespace CirrusSearch\Sanity;

use ArrayObject;
use CirrusSearch\Connection;
use CirrusSearch\SearchConfig;
use CirrusSearch\Searcher;
use MediaWiki\MediaWikiServices;
use Status;
use Title;
use WikiPage;

/**
 * Checks if a WikiPage's representation in search index is sane.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 * http://www.gnu.org/copyleft/gpl.html
 */

class Checker {
	/**
	 * @var SearchConfig
	 */
	private $searchConfig;

	/**
	 * @var Connection
	 */
	private $connection;

	/**
	 * @var Searcher Used for fetching data, so we can check the content.
	 */
	private $searcher;

	/**
	 * @var Remediator Do something with the problems we found
	 */
	private $remediator;

	/**
	 * @var bool Should we log id's that are found to have no problems
	 */
	private $logSane;

	/**
	 * @var bool inspect WikiPage::isRedirect() instead of WikiPage::getContent()->isRedirect()
	 * Faster since it does not need to fetch the content but inconsistent in some cases.
	 */
	private $fastRedirectCheck;

	/**
	 * A cache for pages loaded with loadPagesFromDB( $pageIds ). This is only
	 * useful when multiple Checker are run to check different elastic clusters.
	 * @var ArrayObject|null
	 */
	private $pageCache;

	/**
	 * Build the checker.
	 * @param Connection $connection
	 * @param Remediator $remediator the remediator to which to send titles
	 *   that are insane
	 * @param Searcher $searcher searcher to use for fetches
	 * @param bool $logSane should we log sane ids
	 * @param bool $fastRedirectCheck fast but inconsistent redirect check
	 * @param ArrayObject|null $pageCache cache for WikiPage loaded from db
	 */
	public function __construct( SearchConfig $config, Connection $connection, Remediator $remediator, Searcher $searcher, $logSane, $fastRedirectCheck, ArrayObject $pageCache = null ) {
		$this->searchConfig = $config;
		$this->connection = $connection;
		$this->remediator = $remediator;
		$this->searcher = $searcher;
		$this->logSane = $logSane;
		$this->fastRedirectCheck = $fastRedirectCheck;
		$this->pageCache = $pageCache;
	}

	/**
	 * Check if a title is insane.
	 *
	 * @param int[] $pageIds page to check
	 * @return int the number of pages updated
	 */
	public function check( array $pageIds ) {
		$docIds = array_map( [ $this->searchConfig, 'makeId' ], $pageIds );

		$pagesFromDb = $this->loadPagesFromDB( $pageIds );
		$pagesFromIndex = $this->loadPagesFromIndex( $docIds );
		$nbPagesFixed = 0;
		foreach( array_combine( $pageIds, $docIds ) as $pageId => $docId ) {
			$fromIndex = [];
			if ( isset( $pagesFromIndex[$docId] ) ) {
				$fromIndex = $pagesFromIndex[$docId];
			}

			$updated = false;
			if ( isset ( $pagesFromDb[$pageId] ) ) {
				$page = $pagesFromDb[$pageId];
				$updated = $this->checkExisitingPage( $docId, $pageId, $page, $fromIndex );
			} else {
				$updated = $this->checkInexistentPage( $docId, $pageId, $fromIndex );
			}
			if( $updated ) {
				$nbPagesFixed++;
			}
		}
		$clusterName = $this->connection->getClusterName();
		$stats = MediaWikiServices::getInstance()->getStatsdDataFactory();
		$stats->updateCount( "CirrusSearch.$clusterName.sanitization.fixed", $nbPagesFixed );
		$stats->updateCount( "CirrusSearch.$clusterName.sanitization.checked", count( $pageIds ) );
		return $nbPagesFixed;
	}

	/**
	 * Check that an existing page is properly indexed:
	 * - index it if missing in the index
	 * - delete it if it's a redirect
	 * - verify it if found in the index
	 *
	 * @param string $docId
	 * @param int $pageId
	 * @param WikiPage $page
	 * @param \Elastica\Result[] $fromIndex
	 * @return bool true if a modification was needed
	 */
	private function checkExisitingPage( $docId, $pageId, $page, $fromIndex ) {
		$inIndex = count( $fromIndex ) > 0;
		if ( $this->checkIfRedirect( $page ) ) {
			if ( $inIndex ) {
				$this->remediator->redirectInIndex( $page );
				return true;
			}
			$this->sane( $pageId, 'Redirect not in index' );
			return false;
		}
		if ( $inIndex ) {
			return $this->checkPageInIndex( $docId, $pageId, $page, $fromIndex );
		}
		$this->remediator->pageNotInIndex( $page );
		return true;
	}

	/**
	 * Check if the page is a redirect
	 * @param WikiPage $page the page
	 * @return bool true if $page is a redirect
	 */
	private function checkIfRedirect( $page ) {
		if ( $this->fastRedirectCheck ) {
			return $page->isRedirect();
		}

		$content = $page->getContent();
		if ( $content == null ) {
			return false;
		}
		if( is_object ( $content ) ) {
			return $content->isRedirect();
		}
		return false;
	}

	/**
	 * Check that an inexistent page is not present in the index
	 * and delete it if found
	 *
	 * @param string $docId
	 * @param int $pageId
	 * @param WikiPage $page
	 * @param \Elastica\Result[] $fromIndex
	 * @return bool true if a modification was needed
	 */
	private function checkInexistentPage( $docId, $pageId, $fromIndex ) {
		$inIndex = count( $fromIndex ) > 0;
		if ( $inIndex ) {
			foreach( $fromIndex as $r ) {
				$title = Title::makeTitle( $r->namespace, $r->title );
				$this->remediator->ghostPageInIndex( $docId, $title );
			}
			return true;
		}
		$this->sane( $pageId, 'No ghost' );
		return false;
	}

	/**
	 * Check that a page present in the db and in the index
	 * is in the correct index with the latest version.
	 *
	 * @param string $docId
	 * @param int $pageId
	 * @param WikiPage $page
	 * @param \Elastica\Result[] $fromIndex
	 * @return bool true if a modification was needed
	 */
	private function checkPageInIndex( $docId, $pageId, WikiPage $page, array $fromIndex ) {
		$insane = $this->checkIndexMismatch( $docId, $pageId, $page, $fromIndex );
		if ( !$insane ) {
			$insane = $this->checkIndexedVersion( $docId, $pageId, $page, $fromIndex );
		}

		if ( !$insane ) {
			$this->sane( $pageId, 'Page in index with latest version' );
		}

		return $insane;
	}

	/**
	 * Check that a page present in the db and in the index
	 * is properly indexed to the appropriate index by checking its
	 * namespace.
	 *
	 * @param string $docId
	 * @param int $pageId
	 * @param WikiPage $page
	 * @param \Elastica\Result[] $fromIndex
	 * @return bool true if a modification was needed
	 */
	private function checkIndexMismatch( $docId, $pageId, WikiPage $page, array $fromIndex ) {
		$foundInsanityInIndex = false;
		$expectedType = $this->connection->getIndexSuffixForNamespace( $page->getTitle()->getNamespace() );
		foreach ( $fromIndex as $indexInfo ) {
			$type = $this->connection->extractIndexSuffix( $indexInfo->getIndex() );
			if ( $type !== $expectedType ) {
				// Got to grab the index type from the index name....
				$this->remediator->pageInWrongIndex( $docId, $page, $type );
				$foundInsanityInIndex = true;
			}
		}

		if ( $foundInsanityInIndex ) {
			return true;
		}

		return false;
	}


	/**
	 * Check that the indexed version of the page is the
	 * latest version in the database.
	 *
	 * @param string $docId
	 * @param int $pageId
	 * @param WikiPage $page
	 * @param \Elastica\Result[] $fromIndex
	 * @return bool true if a modification was needed
	 */
	private function checkIndexedVersion( $docId, $pageId, WikiPage $page, array $fromIndex ) {
		$latest = $page->getLatest();
		$foundInsanityInIndex = false;
		foreach ( $fromIndex as $indexInfo ) {
			$version = $indexInfo->getSource()['version'];
			if ( $version < $latest ) {
				$type = $this->connection->extractIndexSuffix( $indexInfo->getIndex() );
				$this->remediator->oldVersionInIndex( $docId, $page, $type );

				$foundInsanityInIndex = true;
			}
		}

		return $foundInsanityInIndex;
	}

	/**
	 * @param int[] $pageIds page ids
	 * @return WikiPage[] the list of wiki pages indexed in page id
	 */
	private function loadPagesFromDB( array $pageIds ) {
		// If no cache object is constructed we build a new one.
		// Building it in the constructor would cause memleaks because
		// there is no automatic prunning of old entries. If a cache
		// object is provided the owner of this Checker instance must take
		// care of the cleaning.
		$cache = $this->pageCache ?: new ArrayObject();
		$pageIds = array_diff( $pageIds, array_keys( $cache->getArrayCopy() ) );
		if ( empty( $pageIds ) ) {
			return $cache->getArrayCopy();
		}
		$dbr = wfGetDB( DB_SLAVE );
		$where = 'page_id IN (' . $dbr->makeList( $pageIds ) . ')';
		$res = $dbr->select(
			[ 'page' ],
			WikiPage::selectFields(),
			$where,
			__METHOD__
		);
		foreach ( $res as $row ) {
			$page = WikiPage::newFromRow( $row );
			$cache->offsetSet( $page->getId(), $page );
		}
		return $cache->getArrayCopy();
	}

	/**
	 * @param string[] $docIds document ids
	 * @return \Elastica\Result[][] search results indexed by page id
	 * @throws \Exception if an error occurred
	 */
	private function loadPagesFromIndex( array $docIds ) {
		$status = $this->searcher->get( $docIds, [ 'namespace', 'title', 'version' ] );
		if ( !$status->isOK() ) {
			throw new \Exception( 'Cannot fetch ids from index' );
		}
		/** @var \Elastica\ResultSet $dataFromIndex */
		$dataFromIndex = $status->getValue();

		$indexedPages = [];
		foreach ( $dataFromIndex as $indexInfo ) {
			$indexedPages[$indexInfo->getId()][] = $indexInfo;
		}
		return $indexedPages;
	}

	private function sane( $pageId, $reason ) {
		if ( $this->logSane ) {
			printf( "%30s %10d\n", $reason, $pageId );
		}
	}
}

Zerion Mini Shell 1.0