%PDF- %PDF-
| Direktori : /www/varak.net/wiki.varak.net/extensions/CirrusSearch/includes/BuildDocument/ |
| Current File : //www/varak.net/wiki.varak.net/extensions/CirrusSearch/includes/BuildDocument/SuggestBuilder.php |
<?php
namespace CirrusSearch\BuildDocument;
use Title;
use LinkBatch;
/**
* Build a doc ready for the titlesuggest index.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
* http://www.gnu.org/copyleft/gpl.html
*/
/**
* Builder used to create suggester docs
* NOTE: Experimental
*/
class SuggestBuilder {
/**
* Maximum number of characters kept for a suggestion input.
* prepareInput() truncates anything longer with mb_substr().
*/
const MAX_INPUT_LENGTH = 50;
/**
* Maximum edit distance at which a redirect is still grouped with the
* title (or with one of its already accepted variants), see extractSimilars().
*/
const GROUP_ACCEPTABLE_DISTANCE = 2;
/**
* Factor applied to the score of redirect based suggestions
* (see buildRedirectsSuggestion()).
*/
const REDIRECT_DISCOUNT = 0.1;
/**
* Factor applied to the score of suggestions built from cross namespace
* redirects (see build()).
*/
const CROSSNS_DISCOUNT = 0.005;
/**
* Marker of a redirect suggestion, used in the doc id and in the encoded output.
*/
const REDIRECT_SUGGESTION = 'r';
/**
* Marker of a title suggestion, used in the doc id and in the encoded output.
*/
const TITLE_SUGGESTION = 't';
/**
* Number of common prefix chars a redirect must share with the title to be
* promoted as a title suggestion.
* This is useful not to promote Eraq as a title suggestion for Iraq.
* Less than 3 can lead to weird results like oba => Osama Bin Laden.
* @todo: to avoid displaying typos (if the typo is in the first 3 chars)
* we could re-work Utils::chooseBestRedirect and display the title
* if the chosen redirect is close enough to the title.
*/
const REDIRECT_COMMON_PREFIX_LEN = 3;
/**
* @var SuggestScoringMethod the scoring function, used to weight every suggestion
*/
private $scoringMethod;
/**
* @var integer batch id, set to time() by the constructor and stored in
* every built doc under 'batch_id'
*/
private $batchId;
/**
* @var boolean when true, geo contextualized suggestions ('suggest-geo',
* 'suggest-stop-geo') are added to docs that have a location
*/
private $withGeo;
/**
* NOTE: Currently a fixed value because the completion suggester does not support
* multi namespace suggestion.
*
* @var int $targetNamespace the only namespace redirects are accepted from/to
*/
private $targetNamespace = NS_MAIN;
/**
 * Creates a builder bound to a scoring method.
 *
 * The batch id is taken from the current unix timestamp, so every doc
 * produced by this instance carries the same 'batch_id'.
 *
 * @param SuggestScoringMethod $scoringMethod the scoring function to use
 * @param bool $withGeo whether geo contextualized suggestions are built
 */
public function __construct( SuggestScoringMethod $scoringMethod, $withGeo = true ) {
	$this->batchId = time();
	$this->withGeo = $withGeo;
	$this->scoringMethod = $scoringMethod;
}
/**
 * Builds the suggest documents for a batch of source documents.
 *
 * Docs in NS_MAIN produce the "normal" title/redirect suggestions.
 * Docs in any other namespace only contribute their redirects that live
 * in the target namespace ("cross namespace" redirects); those are emitted
 * as title suggestions with a score discounted by CROSSNS_DISCOUNT.
 *
 * Docs without a 'namespace', main namespace docs without a 'title' and
 * other docs without a 'redirect' field are silently skipped.
 *
 * @param array[] $inputDocs a batch of docs to build, each one with the
 *  keys 'id' (doc id) and 'source' (the indexed fields)
 * @return \Elastica\Document[] a set of suggest documents
 */
public function build( $inputDocs ) {
	// Cross namespace redirects, keyed by redirect title text. A later
	// redirect with the same text replaces an earlier one.
	$crossNsTitles = [];
	$docs = [];
	foreach ( $inputDocs as $sourceDoc ) {
		$inputDoc = $sourceDoc['source'];
		$docId = $sourceDoc['id'];
		if ( !isset( $inputDoc['namespace'] ) ) {
			// Bad doc, nothing to do here.
			continue;
		}
		if ( $inputDoc['namespace'] == NS_MAIN ) {
			if ( !isset( $inputDoc['title'] ) ) {
				// Bad doc, nothing to do here.
				continue;
			}
			// Append in place: array_merge() here would copy the whole
			// accumulated list for every input doc.
			foreach ( $this->buildNormalSuggestions( $docId, $inputDoc ) as $suggestDoc ) {
				$docs[] = $suggestDoc;
			}
			continue;
		}
		if ( !isset( $inputDoc['redirect'] ) ) {
			// Bad doc, nothing to do here.
			continue;
		}
		// Score and location depend on the doc only, not on the redirect:
		// compute them once, and only if a redirect is actually accepted.
		$score = null;
		$location = null;
		foreach ( $inputDoc['redirect'] as $redir ) {
			if ( !isset( $redir['namespace'] ) || !isset( $redir['title'] ) ) {
				continue;
			}
			if ( $redir['namespace'] != $this->targetNamespace ) {
				continue;
			}
			if ( $score === null ) {
				// Discount the score of these suggestions.
				$score = (int)( $this->scoringMethod->score( $inputDoc ) * self::CROSSNS_DISCOUNT );
				// We support only earth and the primary/first coordinates...
				$location = $this->findPrimaryCoordinates( $inputDoc );
			}
			$crossNsTitles[$redir['title']] = [
				'title' => Title::makeTitle( $redir['namespace'], $redir['title'] ),
				'score' => $score,
				'location' => $location
			];
		}
	}

	// Build cross ns suggestions
	if ( !empty( $crossNsTitles ) ) {
		$titles = [];
		foreach ( $crossNsTitles as $data ) {
			$titles[] = $data['title'];
		}
		// Load the page ids of all the titles in one batch so that
		// getArticleID() below does not have to look them up one by one.
		$lb = new LinkBatch( $titles );
		$lb->setCaller( __METHOD__ );
		$lb->execute();
		// This is far from perfect:
		// - we won't try to group similar redirects since we don't know which one
		//   is the official one
		// - we will certainly suggest multiple times the same pages
		// - we must not run a second pass at query time: no redirect suggestion
		// NOTE(review): titles that do not resolve to a page presumably get
		// article id 0 and would then share the same doc id - confirm.
		foreach ( $crossNsTitles as $text => $data ) {
			$suggestion = [
				// PHP casts integer-like string keys ("1984") to int, cast
				// back so the suggestion text is always a string.
				'text' => (string)$text,
				'variants' => []
			];
			$docs[] = $this->buildTitleSuggestion(
				$data['title']->getArticleID(),
				$suggestion,
				$data['location'],
				$data['score']
			);
		}
	}
	return $docs;
}
/**
 * Builds the suggestions of a regular page: always one title suggestion
 * (title + redirects close to it) and, when some redirects are left over,
 * one redirects suggestion.
 *
 * @param string $docId the doc id
 * @param array $inputDoc the source fields of the doc
 * @return \Elastica\Document[] a set of suggest documents, empty if the
 *  doc has no title
 */
private function buildNormalSuggestions( $docId, array $inputDoc ) {
	if ( !isset( $inputDoc['title'] ) ) {
		// Bad doc, nothing to do here.
		return [];
	}

	$weight = $this->scoringMethod->score( $inputDoc );
	// We support only earth and the primary/first coordinates...
	$coordinates = $this->findPrimaryCoordinates( $inputDoc );
	$extracted = $this->extractTitleAndSimilarRedirects( $inputDoc );

	$built = [
		$this->buildTitleSuggestion( $docId, $extracted['group'], $coordinates, $weight )
	];
	if ( !empty( $extracted['candidates'] ) ) {
		$built[] = $this->buildRedirectsSuggestion(
			$docId,
			$extracted['candidates'],
			$coordinates,
			$weight
		);
	}
	return $built;
}
/**
 * Lists the source fields needed to build and score documents: the fields
 * required by the scoring method, followed by 'title', 'redirect',
 * 'namespace' and, when geo suggestions are enabled, 'coordinates'.
 *
 * @return string[] the list of fields
 */
public function getRequiredFields() {
	$builderFields = [ 'title', 'redirect', 'namespace' ];
	if ( $this->withGeo ) {
		$builderFields[] = 'coordinates';
	}
	return array_merge( $this->scoringMethod->getRequiredFields(), $builderFields );
}
/**
 * Picks the coordinates to attach to a suggestion.
 *
 * Only entries of the 'coordinates' field whose globe is 'earth' and that
 * carry a 'coord' value are considered. The first of them flagged as
 * 'primary' wins; if none is primary the first one considered is returned.
 *
 * @param array $inputDoc the input doc
 * @return array|null the 'coord' value of the chosen entry, or null when
 *  the doc has no usable coordinates
 */
public function findPrimaryCoordinates( array $inputDoc ) {
	$hasCoordinates = isset( $inputDoc['coordinates'] ) && is_array( $inputDoc['coordinates'] );
	if ( !$hasCoordinates ) {
		return null;
	}

	$fallback = null;
	foreach ( $inputDoc['coordinates'] as $entry ) {
		if ( !isset( $entry['globe'] ) || $entry['globe'] != 'earth' || !isset( $entry['coord'] ) ) {
			// Not on earth or no position: ignore.
			continue;
		}
		if ( !empty( $entry['primary'] ) ) {
			return $entry['coord'];
		}
		if ( $fallback === null ) {
			// Remember the first usable entry in case nothing is primary.
			$fallback = $entry['coord'];
		}
	}
	return $fallback;
}
/**
 * Builds the 'title' suggestion.
 * The output is encoded as pageId:t:Title and the doc id is 't' followed
 * by the page id.
 * NOTE: the client will be able to display Title encoded in the output when searching.
 *
 * @param string $docId the page id
 * @param array $title the title in 'text' and an array of similar redirects in 'variants'
 * @param array|null $location the geo coordinates or null if unavailable
 * @param int $score the weight of the suggestion
 * @return \Elastica\Document the suggestion document
 */
private function buildTitleSuggestion( $docId, array $title, array $location = null, $score ) {
	return $this->buildSuggestion(
		self::TITLE_SUGGESTION . $docId,
		self::encodeTitleOutput( $docId, $title['text'] ),
		// Same "text + variants" preparation as the public helper, reuse
		// it rather than keeping a second copy of the loop here.
		$this->buildInputs( $title ),
		$location,
		$score
	);
}
/**
 * Builds the 'redirects' suggestion.
 * The output is encoded as pageId:r and the doc id is 'r' followed by the
 * page id.
 * The score will be discounted by the REDIRECT_DISCOUNT factor.
 * NOTE: the client will have to fetch the doc redirects when searching
 * and choose the best one to display. This is because we are unable
 * to make this decision at index time.
 *
 * @param string $docId the elasticsearch document id
 * @param string[] $redirects the redirect titles used as inputs
 * @param array|null $location the geo coordinates or null if unavailable
 * @param int $score the weight of the suggestion, before discount
 * @return \Elastica\Document the suggestion document
 */
private function buildRedirectsSuggestion( $docId, array $redirects, array $location = null, $score ) {
	$inputs = [];
	foreach ( $redirects as $redirect ) {
		$inputs[] = $this->prepareInput( $redirect );
	}
	return $this->buildSuggestion(
		self::REDIRECT_SUGGESTION . $docId,
		// Use the shared encoder so that the format stays in sync with
		// decodeOutput(), like the title suggestion does.
		self::encodeRedirectOutput( $docId ),
		$inputs,
		$location,
		(int)( $score * self::REDIRECT_DISCOUNT )
	);
}
/**
 * Assembles a suggestion document.
 *
 * The same input/output/weight triple is stored in the 'suggest' and
 * 'suggest-stop' fields. When geo suggestions are enabled and a location
 * is known, it is also stored, together with a 'location' context, in
 * 'suggest-geo' and 'suggest-stop-geo'.
 *
 * @param string $docId The document id
 * @param string $output the suggestion output
 * @param string[] $inputs the suggestion inputs
 * @param array|null $location the geo coordinates or null if unavailable
 * @param int $score the weight of the suggestion
 * @return \Elastica\Document a doc ready to be indexed in the completion suggester
 */
private function buildSuggestion( $docId, $output, array $inputs, array $location = null, $score ) {
	$suggest = [
		'input' => $inputs,
		'output' => $output,
		'weight' => $score
	];
	$fields = [
		'batch_id' => $this->batchId,
		'suggest' => $suggest,
		'suggest-stop' => $suggest
	];

	if ( $this->withGeo && $location !== null ) {
		// Same payload plus the geo context, appended after 'weight'.
		$geoSuggest = $suggest + [ 'context' => [ 'location' => $location ] ];
		$fields['suggest-geo'] = $geoSuggest;
		$fields['suggest-stop-geo'] = $geoSuggest;
	}

	return new \Elastica\Document( $docId, $fields );
}
/**
 * Prepares the inputs of a suggestion: the main text first, then every
 * variant, each one passed through prepareInput().
 *
 * @param array $input Document to build inputs for, with the main text in
 *  'text' and the alternative spellings in 'variants'
 * @return array list of prepared suggestions that should
 *  resolve to the document.
 */
public function buildInputs( array $input ) {
	$prepared = [];
	$prepared[] = $this->prepareInput( $input['text'] );
	foreach ( $input['variants'] as $alternative ) {
		$prepared[] = $this->prepareInput( $alternative );
	}
	return $prepared;
}
/**
 * Truncates a title to at most MAX_INPUT_LENGTH characters (not bytes).
 * Shorter values are returned untouched.
 *
 * @param string $input A page title
 * @return string A page title short enough to not cause indexing
 *  issues.
 */
public function prepareInput( $input ) {
	return mb_strlen( $input ) > self::MAX_INPUT_LENGTH
		? mb_substr( $input, 0, self::MAX_INPUT_LENGTH )
		: $input;
}
/**
 * Extracts title with redirects that are very close.
 * It will allow to make one suggestion with title as the
 * output and title + similar redirects as the inputs.
 * It can be useful to avoid displaying redirects created
 * to handle typos.
 *
 * e.g. :
 *   title: Giraffe
 *   redirects: Girafe, Girraffe, Mating Giraffes
 * will output
 *   - 'group' : { 'text': 'Giraffe', 'variants': ['Girafe', 'Girraffe'] }
 *   - 'candidates' : ['Mating Giraffes']
 *
 * It would be nice to do this for redirects but we have no way to decide
 * which redirect is a typo and this technique would simply take the first
 * redirect in the list.
 *
 * Only redirects located in the target namespace are considered; redirect
 * entries without a 'namespace' or a 'title' are ignored.
 *
 * @param array $doc the source doc, 'title' is required and 'redirect' is
 *  an optional list of [ 'namespace' => int, 'title' => string ]
 * @return array mixed 'group' key contains the group with the
 *  lead and its variants and 'candidates' contains the remaining
 *  candidates that were not close enough to the title.
 */
public function extractTitleAndSimilarRedirects( array $doc ) {
	$redirects = [];
	if ( isset( $doc['redirect'] ) ) {
		foreach ( $doc['redirect'] as $redir ) {
			// Skip malformed entries, like build() does. Without this a
			// missing namespace (null) loosely equals NS_MAIN and the
			// entry would be accepted as a main namespace redirect.
			if ( !isset( $redir['namespace'] ) || !isset( $redir['title'] ) ) {
				continue;
			}
			// Avoid suggesting/displaying non existent titles
			// in the target namespace
			if ( $redir['namespace'] == $this->targetNamespace ) {
				$redirects[] = $redir['title'];
			}
		}
	}
	return $this->extractSimilars( $doc['title'], $redirects, true );
}
/**
 * Splits $candidates between those that are "similar" to $groupHead and
 * the others.
 *
 * A candidate is similar when its distance to the head is at most
 * GROUP_ACCEPTABLE_DISTANCE. With $checkVariants, a candidate that is too
 * far from the head is also accepted if it is close enough to one of the
 * candidates accepted before it. Input order is kept in both lists.
 *
 * @param string $groupHead string the group "head"
 * @param string[] $candidates array of string the candidates
 * @param boolean $checkVariants if the candidate does not match the groupHead try to match a variant
 * @return array 'group' key contains the group with the
 *  head and its variants and 'candidates' contains the remaining
 *  candidates that were not close enough to $groupHead.
 */
private function extractSimilars( $groupHead, array $candidates, $checkVariants = false ) {
	$accepted = [];
	$rejected = [];

	foreach ( $candidates as $candidate ) {
		$isSimilar = $this->distance( $groupHead, $candidate ) <= self::GROUP_ACCEPTABLE_DISTANCE;

		if ( !$isSimilar && $checkVariants ) {
			// Second pass: compare with the variants accepted so far and
			// stop at the first one that is close enough.
			foreach ( $accepted as $variant ) {
				if ( $this->distance( $variant, $candidate ) <= self::GROUP_ACCEPTABLE_DISTANCE ) {
					$isSimilar = true;
					break;
				}
			}
		}

		if ( $isSimilar ) {
			$accepted[] = $candidate;
		} else {
			$rejected[] = $candidate;
		}
	}

	return [
		'group' => [
			'text' => $groupHead,
			'variants' => $accepted
		],
		'candidates' => $rejected
	];
}
/**
 * Computes the edit distance between $a and $b.
 *
 * Both strings are first truncated with prepareInput() and lowercased.
 * If they do not share their first REDIRECT_COMMON_PREFIX_LEN characters
 * (or the whole of the shorter string when it is shorter than that)
 * PHP_INT_MAX is returned so that they are never grouped.
 *
 * The distance is counted in characters: replacing 'e' by 'é' is one edit.
 *
 * @param string $a
 * @param string $b
 * @return integer the edit distance between a and b
 */
private function distance( $a, $b ) {
	$a = mb_strtolower( $this->prepareInput( $a ) );
	$b = mb_strtolower( $this->prepareInput( $b ) );
	$aLength = mb_strlen( $a );
	$bLength = mb_strlen( $b );

	$commonPrefixLen = min( self::REDIRECT_COMMON_PREFIX_LEN, $aLength, $bLength );
	// check the common prefix
	if ( mb_substr( $a, 0, $commonPrefixLen ) != mb_substr( $b, 0, $commonPrefixLen ) ) {
		return PHP_INT_MAX;
	}

	// TODO: switch to a ratio instead of raw distance would help to group
	// longer strings
	if ( $aLength === strlen( $a ) && $bLength === strlen( $b ) ) {
		// Single byte characters only: the native byte based
		// implementation gives the right answer.
		return levenshtein( $a, $b );
	}
	return $this->multibyteLevenshtein( $a, $b );
}

/**
 * Levenshtein distance counted in unicode code points.
 *
 * levenshtein() compares bytes, which makes a single substituted
 * non-ASCII character count for 2 or more edits.
 *
 * @param string $a
 * @param string $b
 * @return integer the number of inserted, deleted or replaced characters
 */
private function multibyteLevenshtein( $a, $b ) {
	$aChars = preg_split( '//u', $a, -1, PREG_SPLIT_NO_EMPTY );
	$bChars = preg_split( '//u', $b, -1, PREG_SPLIT_NO_EMPTY );
	if ( $aChars === false || $bChars === false ) {
		// Not valid UTF-8, the best we can do is to compare bytes.
		return levenshtein( $a, $b );
	}

	$aCount = count( $aChars );
	$bCount = count( $bChars );
	if ( $aCount === 0 ) {
		return $bCount;
	}
	if ( $bCount === 0 ) {
		return $aCount;
	}

	// Classic two rows dynamic programming: $previous[$j] is the distance
	// between the first $i - 1 chars of $a and the first $j chars of $b.
	$previous = range( 0, $bCount );
	for ( $i = 1; $i <= $aCount; $i++ ) {
		$current = [ $i ];
		for ( $j = 1; $j <= $bCount; $j++ ) {
			$cost = $aChars[$i - 1] === $bChars[$j - 1] ? 0 : 1;
			$current[$j] = min(
				$previous[$j] + 1,
				$current[$j - 1] + 1,
				$previous[$j - 1] + $cost
			);
		}
		$previous = $current;
	}
	return $previous[$bCount];
}
/**
 * Encodes the output of a title suggestion as docId:t:title.
 *
 * @param string $docId elasticsearch document id
 * @param string $title the title text to display
 * @return string the encoded output
 */
public static function encodeTitleOutput( $docId, $title ) {
	return implode( ':', [ $docId, self::TITLE_SUGGESTION, $title ] );
}
/**
 * Encodes the output of a redirect suggestion as docId:r.
 *
 * @param string $docId elasticsearch document id
 * @return string the encoded output
 */
public static function encodeRedirectOutput( $docId ) {
	return implode( ':', [ $docId, self::REDIRECT_SUGGESTION ] );
}
/**
 * Decodes a suggestion output built by encodeTitleOutput() or
 * encodeRedirectOutput().
 * The result is an array with the following keys:
 *  docId: the part before the first ':'
 *  type: either REDIRECT_SUGGESTION or TITLE_SUGGESTION
 *  text (only for TITLE_SUGGESTION): the title text, which may itself
 *   contain ':'
 *
 * @param string $output text value returned by a suggest query
 * @return string[]|null array of strings, or null if the output is not properly encoded
 */
public static function decodeOutput( $output ) {
	if ( $output == null ) {
		return null;
	}

	// At most 3 parts so that ':' inside the title text is preserved.
	$parts = explode( ':', $output, 3 );
	$partCount = count( $parts );
	if ( $partCount < 2 ) {
		// Ignore broken output
		return null;
	}

	list( $docId, $type ) = $parts;
	if ( $type === self::REDIRECT_SUGGESTION ) {
		return [
			'docId' => $docId,
			'type' => self::REDIRECT_SUGGESTION,
		];
	}
	if ( $type === self::TITLE_SUGGESTION && $partCount >= 3 ) {
		return [
			'docId' => $docId,
			'type' => self::TITLE_SUGGESTION,
			'text' => $parts[2]
		];
	}
	// Unknown type, or a title suggestion without its text.
	return null;
}
/**
* Returns the batch id of this builder. It is set once by the constructor
* (from time()) and written to every built doc under 'batch_id'.
*
* @return int the batchId
*/
public function getBatchId() {
return $this->batchId;
}
}