<?php
namespace CirrusSearch\Maintenance;
use CirrusSearch\Connection;
use CirrusSearch\ElasticsearchIntermediary;
use CirrusSearch\SearchConfig;
use CirrusSearch\Util;
use Elastica\Document;
use Elastica\Exception\Connection\HttpException;
use Elastica\Exception\ExceptionInterface;
use Elastica\Index;
use Elastica\Query;
use Elastica\Type;
use ForkController;
use MediaWiki\Logger\LoggerFactory;
use MWElasticUtils;
/**
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
* http://www.gnu.org/copyleft/gpl.html
*/
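/**
 * Copies every document from a live index into a newly built one, e.g. after mapping or
 * analysis configuration changes. Handles forking into multiple child processes, retrying
 * failed batches, verifying document counts, and restoring the index settings that are
 * relaxed for the duration of the bulk reindex.
 */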
class Reindexer {
/**
* @var SearchConfig
*/
private $searchConfig;
/*** "From" portion ***/
/**
* @var Index
*/
private $oldIndex;
/**
* @var Connection
*/
private $oldConnection;
/*** "To" portion ***/
/**
* @var Index
*/
private $index;
/**
* @var Connection
*/
private $connection;
/**
* @var Type[]
*/
private $types;
/**
* @var Type[]
*/
private $oldTypes;
/**
* @var int
*/
private $shardCount;
/**
* @var string
*/
private $replicaCount;
/**
* @var array
*/
private $mergeSettings;
/**
* @var array
*/
private $mappingConfig;
/**
* @var Maintenance|null
*/
private $out;
/**
* @param SearchConfig $searchConfig
* @param Connection $source
* @param Connection $target
* @param Type[] $types
* @param Type[] $oldTypes
* @param int $shardCount
* @param string $replicaCount
* @param array $mergeSettings
* @param array $mappingConfig
* @param Maintenance|null $out
* @throws \Exception
*/
public function __construct(
	SearchConfig $searchConfig,
	Connection $source,
	Connection $target,
	array $types,
	array $oldTypes,
	$shardCount,
	$replicaCount,
	array $mergeSettings,
	array $mappingConfig,
	Maintenance $out = null
) {
// @todo: this constructor has too many arguments - refactor!
$this->searchConfig = $searchConfig;
$this->oldConnection = $source;
$this->connection = $target;
$this->types = $types;
$this->oldTypes = $oldTypes;
$this->shardCount = $shardCount;
$this->replicaCount = $replicaCount;
$this->mergeSettings = $mergeSettings;
$this->mappingConfig = $mappingConfig;
$this->out = $out;
if ( empty( $types ) || empty( $oldTypes ) ) {
throw new \Exception( "Types list should be non-empty" );
}
$this->index = $types[0]->getIndex();
$this->oldIndex = $oldTypes[0]->getIndex();
}
/**
* Dump everything from the live index into the one being worked on.
*
* @param int $processes number of child processes to fork for the reindex
* @param int $refreshInterval refresh_interval to restore on the new index afterwards, in seconds
* @param int $retryAttempts number of times to retry failed fetches and bulk writes
* @param int $chunkSize number of documents to fetch per scroll request
* @param float $acceptableCountDeviation maximum allowed relative difference between old and new document counts
*/
public function reindex(
	$processes = 1,
	$refreshInterval = 1,
	$retryAttempts = 5,
	$chunkSize = 100,
	$acceptableCountDeviation = .05
) {
global $wgCirrusSearchWikimediaExtraPlugin;
// Set some settings that should help I/O load during bulk indexing. We'll have to
// optimize afterwards to consolidate down to a proper number of segments, but that
// is worth the price. total_shards_per_node will help make sure that each shard
// has as few neighbors as possible.
$this->setConnectionTimeout();
$settings = $this->index->getSettings();
$maxShardsPerNode = $this->decideMaxShardsPerNodeForReindex();
$settings->set( [
'refresh_interval' => -1,
'merge.policy.segments_per_tier' => 40,
'merge.policy.max_merge_at_once' => 40,
'routing.allocation.total_shards_per_node' => $maxShardsPerNode,
] );
if ( $processes > 1 ) {
if ( !isset( $wgCirrusSearchWikimediaExtraPlugin[ 'id_hash_mod_filter' ] ) ||
!$wgCirrusSearchWikimediaExtraPlugin[ 'id_hash_mod_filter' ] ) {
$this->error( "Can't use multiple processes without \$wgCirrusSearchWikimediaExtraPlugin[ 'id_hash_mod_filter' ] = true", 1 );
}
$fork = new ForkController( $processes );
$forkResult = $fork->start();
// We don't want to share sockets between forks, so destroy the clients.
$this->destroyClients();
switch ( $forkResult ) {
case 'child':
foreach ( $this->types as $i => $type ) {
$oldType = $this->oldTypes[$i];
$this->reindexInternal( $type, $oldType, $processes, $fork->getChildNumber(), $chunkSize, $retryAttempts );
}
die( 0 );
case 'done':
break;
default:
$this->error( "Unexpected result while forking: $forkResult", 1 );
}
$this->outputIndented( "Verifying counts..." );
// We can't verify counts are exactly equal because they won't be - we still push updates into
// the old index while reindexing the new one.
foreach ( $this->types as $i => $type ) {
$oldType = $this->oldTypes[$i];
$oldCount = (float) $oldType->count();
$this->index->refresh();
$newCount = (float) $type->count();
$difference = $oldCount > 0 ? abs( $oldCount - $newCount ) / $oldCount : 0;
if ( $difference > $acceptableCountDeviation ) {
$this->output( "Not close enough! old=$oldCount new=$newCount difference=$difference\n" );
$this->error( 'Failed to load index - counts not close enough. ' .
"old=$oldCount new=$newCount difference=$difference. " .
'Check for warnings above.', 1 );
}
}
$this->output( "done\n" );
} else {
foreach ( $this->types as $i => $type ) {
$oldType = $this->oldTypes[$i];
$this->reindexInternal( $type, $oldType, 1, 1, $chunkSize, $retryAttempts );
}
}
// Revert settings changed just for reindexing
$settings->set( [
'refresh_interval' => $refreshInterval . 's',
'merge.policy' => $this->mergeSettings,
] );
}
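/**
 * Optimize (force merge) the new index down to a small number of segments before it goes live.
 */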
public function optimize() {
// Optimize the index so it'll be more compact for replication. Not required
// but should be helpful.
$this->outputIndented( "\tOptimizing..." );
try {
// Reset the timeout just in case we lost it somewhere along the line
$this->setConnectionTimeout();
$this->index->optimize( [ 'max_num_segments' => 5 ] );
$this->output( "Done\n" );
} catch ( HttpException $e ) {
if ( $e->getMessage() === 'Operation timed out' ) {
$this->output( "Timed out...Continuing any way\n" );
// To continue without blowing up we need to reset the connection.
$this->destroyClients();
} else {
throw $e;
}
}
}
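/**
 * Wait until all expected primary and replica shards of the new index are active, based on
 * the configured replica count range.
 */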
public function waitForShards() {
if ( !$this->replicaCount || $this->replicaCount === "false" ) {
$this->outputIndented( "\tNo replicas, skipping.\n" );
return;
}
$this->outputIndented( "\tWaiting for all shards to start...\n" );
list( $lower, $upper ) = explode( '-', $this->replicaCount );
$each = 0;
while ( true ) {
$health = $this->getHealth();
$active = $health[ 'active_shards' ];
$relocating = $health[ 'relocating_shards' ];
$initializing = $health[ 'initializing_shards' ];
$unassigned = $health[ 'unassigned_shards' ];
$nodes = $health[ 'number_of_nodes' ];
if ( $nodes < $lower ) {
$this->error( "Require $lower replicas but only have $nodes nodes. "
. "This is almost always due to misconfiguration, aborting.", 1 );
}
// If the upper bound of the range is 'all', treat it as the number of nodes minus one
if ( $upper === 'all' ) {
$upper = $nodes - 1;
}
$expectedReplicas = min( max( $nodes - 1, $lower ), $upper );
$expectedActive = $this->shardCount * ( 1 + $expectedReplicas );
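// Print a status line on the first pass, then once every 20 seconds, and whenever all expected shards are active.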
if ( $each === 0 || $active === $expectedActive ) {
$this->outputIndented( "\t\tactive:$active/$expectedActive relocating:$relocating " .
"initializing:$initializing unassigned:$unassigned\n" );
if ( $active === $expectedActive ) {
break;
}
}
$each = ( $each + 1 ) % 20;
sleep( 1 );
}
}
/**
* @param Type $type
* @param Type $oldType
* @param int $children
* @param int $childNumber
* @param int|string $chunkSize
* @param int $retryAttempts
*/
private function reindexInternal( Type $type, Type $oldType, $children, $childNumber, $chunkSize, $retryAttempts ) {
$filter = null;
$messagePrefix = "";
if ( $childNumber === 1 && $children === 1 ) {
$this->outputIndented( "\t\tStarting single process reindex\n" );
} else {
if ( $childNumber >= $children ) {
$this->error( "Invalid parameters - childNumber >= children ($childNumber >= $children) ", 1 );
}
$messagePrefix = "\t\t[$childNumber] ";
$this->outputIndented( $messagePrefix . "Starting child process reindex\n" );
// Note that it is not ok to abs(_uid.hashCode) because hashCode(Integer.MIN_VALUE) == Integer.MIN_VALUE
$filter = new \CirrusSearch\Extra\Query\IdHashMod( $children, $childNumber );
}
$properties = $this->mappingConfig[$oldType->getName()]['properties'];
try {
$query = new Query();
$query->setFields( [ '_id', '_source' ] );
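// Only the id and source are needed; the documents for the new index are rebuilt from the source below.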
if ( $filter ) {
$bool = new \Elastica\Query\BoolQuery();
$bool->addFilter( $filter );
$query->setQuery( $bool );
}
// Note here we dump from the current index (using the alias) so we can use Connection::getPageType
$result = $oldType
->search( $query, [
'search_type' => 'scan',
'scroll' => '1h',
'size' => $chunkSize,
] );
$totalDocsToReindex = $result->getResponse()->getData();
$totalDocsToReindex = $totalDocsToReindex['hits']['total'];
$this->outputIndented( $messagePrefix . "About to reindex $totalDocsToReindex documents\n" );
$operationStartTime = microtime( true );
$completed = 0;
MWElasticUtils::iterateOverScroll( $this->oldIndex, $result->getResponse()->getScrollId(), '1h',
function( $results ) use ( $properties, $retryAttempts, $messagePrefix, $type,
&$completed, $totalDocsToReindex, $operationStartTime ) {
$documents = [];
foreach ( $results as $result ) {
$documents[] = $this->buildNewDocument( $result, $properties );
}
$this->withRetry( $retryAttempts, $messagePrefix, 'retrying as singles',
function() use ( $type, $messagePrefix, $documents ) {
$this->sendDocuments( $type, $messagePrefix, $documents );
} );
$completed += count( $results );
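// Report cumulative throughput so long-running reindexes can be monitored.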
$rate = round( $completed / ( microtime( true ) - $operationStartTime ) );
$this->outputIndented( $messagePrefix .
"Reindexed $completed/$totalDocsToReindex documents at $rate/second\n");
}, 0, $retryAttempts,
function( $e, $errors ) use ( $messagePrefix ) {
$this->sleepOnRetry( $e, $errors, $messagePrefix, 'fetching documents to reindex' );
} );
$this->outputIndented( $messagePrefix . "All done\n" );
} catch ( ExceptionInterface $e ) {
// Note that we can't fail the master here; the master has to check how many documents made it into the new index.
$type = get_class( $e );
$error = ElasticsearchIntermediary::extractFullError( $e );
LoggerFactory::getInstance( 'CirrusSearch' )->warning(
"Search backend error during reindex. Error type is '{type}' ({error_type}) and message is: {error_reason}",
[
'type' => $type,
'error_type' => $error['type'],
'error_reason' => $error['reason'],
]
);
die( 1 );
}
}
/**
* Build the new document so it contains only keys which have a mapping in the new properties,
* cleaning out any old fields that we no longer use.
*
* @param \Elastica\Result $result original document retrieved from a search
* @param array $properties mapping properties
* @return Document
*/
private function buildNewDocument( \Elastica\Result $result, array $properties ) {
$data = Util::cleanUnusedFields( $result->getSource(), $properties );
// This field was added in July 2016. For the first reindex that occurs after it was added it will
// not exist in the documents, so add it here.
if ( !isset( $data['wiki'] ) ) {
$data['wiki'] = $this->searchConfig->getWikiId();
}
// Maybe instead the reindexer should know if we are converting from the old
// style numeric page ids to the new style prefixed ids. This probably
// works though.
$docId = $this->searchConfig->maybeMakeId( $result->getId() );
// Note that while setting the opType to create might improve performance slightly, it can cause
// trouble if the scroll returns the same id twice. It can do that if the document is updated
// during the scroll process. It's unclear whether it will always do that, so you still have to
// perform the date-based catch-up after the reindex.
return new Document( $docId, $data );
}
/**
* Get health information about the index
*
* @return array Response data array
*/
private function getHealth() {
while ( true ) {
$indexName = $this->index->getName();
$path = "_cluster/health/$indexName";
$response = $this->index->getClient()->request( $path );
if ( $response->hasError() ) {
$this->error( 'Error fetching index health but going to retry. Message: ' . $response->getError() );
sleep( 1 );
continue;
}
return $response->getData();
}
}
/**
* @return int maximum number of this index's shards (primaries plus replicas) to allow on a single node while reindexing
*/
private function decideMaxShardsPerNodeForReindex() {
$health = $this->getHealth();
$totalNodes = $health[ 'number_of_nodes' ];
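// Spread this index's primary and replica shards as evenly as possible across the nodes.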
$totalShards = $this->shardCount * ( $this->getMaxReplicaCount() + 1 );
return (int) ceil( 1.0 * $totalShards / $totalNodes );
}
/**
* @return int upper bound of the configured replica count range
*/
private function getMaxReplicaCount() {
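// The replica count is configured as a range (e.g. "0-2"); the last element is the maximum.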
$replica = explode( '-', $this->replicaCount );
return (int) $replica[ count( $replica ) - 1 ];
}
/**
* @param int $attempts
* @param string $messagePrefix
* @param string $description
* @param callable $func
* @return mixed
*/
private function withRetry( $attempts, $messagePrefix, $description, $func ) {
return MWElasticUtils::withRetry( $attempts, $func,
function( $e, $errors ) use ( $messagePrefix, $description ) {
$this->sleepOnRetry( $e, $errors, $messagePrefix, $description );
} );
}
/**
* @param ExceptionInterface $e exception caught
* @param int $errors number of errors
* @param string $messagePrefix
* @param string $description
*/
private function sleepOnRetry( ExceptionInterface $e, $errors, $messagePrefix, $description ) {
$type = get_class( $e );
$seconds = MWElasticUtils::backoffDelay( $errors );
$message = ElasticsearchIntermediary::extractMessage( $e );
$this->outputIndented( $messagePrefix . "Caught an error $description. " .
"Backing off for $seconds and retrying. Error type is '$type' and message is: $message\n" );
sleep( $seconds );
}
/**
* Send documents to the type, falling back to indexing them one at a time if the bulk request fails.
*
* @param Type $type
* @param string $messagePrefix
* @param \Elastica\Document[] $documents
*/
private function sendDocuments( Type $type, $messagePrefix, array $documents ) {
try {
$type->addDocuments( $documents );
} catch ( ExceptionInterface $e ) {
$errorType = get_class( $e );
$message = ElasticsearchIntermediary::extractMessage( $e );
$this->outputIndented( $messagePrefix . "Error adding documents in bulk. Retrying as singles. Error type is '$errorType' and message is: $message" );
foreach ( $documents as $document ) {
// Continue using the bulk API, one document at a time, because we're used to it.
$type->addDocuments( [ $document ] );
}
}
}
/**
* Reset connection timeouts
*/
private function setConnectionTimeout() {
$timeout = $this->searchConfig->get( 'CirrusSearchMaintenanceTimeout' );
$this->connection->setTimeout( $timeout );
$this->oldConnection->setTimeout( $timeout );
}
/**
* Destroy client connections
*/
private function destroyClients() {
$this->connection->destroyClient();
$this->oldConnection->destroyClient();
// Destroying connections resets timeouts, so we have to reinstate them
$this->setConnectionTimeout();
}
/**
* @param string $message
* @param mixed $channel
*/
protected function output( $message, $channel = null ) {
if ( $this->out ) {
$this->out->output( $message, $channel );
}
}
/**
* @param string $message
*/
private function outputIndented( $message ) {
if ( $this->out ) {
$this->out->outputIndented( $message );
}
}
/**
* @param string $message
* @param int $die
*/
private function error( $message, $die = 0 ) {
// @todo: I'll want to get rid of this method, but this patch will be big enough already
// @todo: I'll probably want to throw exceptions and/or return Status objects instead, later
if ( $this->out ) {
$this->out->error( $message, $die );
}
$die = intval( $die );
if ( $die > 0 ) {
die( $die );
}
}
}