%PDF- %PDF-
| Direktori : /www/varak.net/wiki.varak.net/extensions/CirrusSearch/includes/BuildDocument/ |
| Current File : //www/varak.net/wiki.varak.net/extensions/CirrusSearch/includes/BuildDocument/SuggestScoring.php |
<?php
namespace CirrusSearch\BuildDocument;
use CirrusSearch\Util;
/**
* Scoring methods used by the completion suggester
*
* Set $wgSearchType to 'CirrusSearch'
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
* http://www.gnu.org/copyleft/gpl.html
*/
class SuggestScoringMethodFactory {
	/**
	 * Build the scoring method registered under the given name.
	 *
	 * Names are matched with strict comparison. A switch statement compares
	 * loosely, so a non-string argument such as true would match the first
	 * case and silently select a scoring method instead of being rejected.
	 *
	 * @param string $scoringMethod the name of the scoring method, one of
	 *  'incomingLinks', 'quality' or 'popqual'
	 * @return SuggestScoringMethod
	 * @throws \Exception if the name does not match a known scoring method
	 */
	public static function getScoringMethod( $scoringMethod ) {
		if ( $scoringMethod === 'incomingLinks' ) {
			return new IncomingLinksScoringMethod();
		}
		if ( $scoringMethod === 'quality' ) {
			return new QualityScore();
		}
		if ( $scoringMethod === 'popqual' ) {
			return new PQScore();
		}
		throw new \Exception( 'Unknown scoring method ' . $scoringMethod );
	}
}
/**
* Contract implemented by the scoring methods that
* SuggestScoringMethodFactory::getScoringMethod() returns.
*/
interface SuggestScoringMethod {
/**
* Compute the weight of a single document.
*
* @param array $doc A document from the PAGE type
* @return int the weight of the document
*/
public function score( array $doc );
/**
* The list of fields needed to compute the score.
*
* @return string[] the list of required fields
*/
public function getRequiredFields();
/**
* This method will be called by the indexer script.
* Some scoring methods may want to normalize values based on the index size.
*
* @param int $maxDocs the total number of docs in the index
*/
public function setMaxDocs( $maxDocs );
}
/**
* Very simple scoring method based on incoming links
*/
class IncomingLinksScoringMethod implements SuggestScoringMethod {
	/**
	 * Use the document's incoming links count directly as its weight.
	 *
	 * @param array $doc A document from the PAGE type
	 * @return int the value of the 'incoming_links' field, or 0 when the
	 *  field is missing or null
	 */
	public function score( array $doc ) {
		if ( !isset( $doc['incoming_links'] ) ) {
			return 0;
		}

		return $doc['incoming_links'];
	}

	/**
	 * @return string[] only 'incoming_links' is needed by this method
	 */
	public function getRequiredFields() {
		$fields = [ 'incoming_links' ];

		return $fields;
	}

	/**
	 * No-op: this scoring method does not normalize against the index size.
	 *
	 * @param int $maxDocs
	 */
	public function setMaxDocs( $maxDocs ) {
	}
}
/**
* Score that tries to reflect the quality of a page.
* NOTE: Experimental
*
* This score makes the assumption that bigger is better.
*
* Small cities/villages which have a high number of incoming links because they
* link to each other ( see https://en.wikipedia.org/wiki/Villefort,_Loz%C3%A8re )
* will be discounted correctly because the other variables are very low.
*
* On the other hand some pages, such as lists, will sometimes get a very high but
* unjustified score.
*
* The boost templates feature might help but it's a System message that is not necessarily
* configured by wiki admins.
*/
class QualityScore implements SuggestScoringMethod {
	// TODO: move these constants into a cirrus profile

	// Incoming links saturate at this fraction of the index size, see setMaxDocs()
	const INCOMING_LINKS_MAX_DOCS_FACTOR = 0.1;

	// Saturation points: a signal above its norm is scored as 1
	const EXTERNAL_LINKS_NORM = 20;
	const PAGE_SIZE_NORM = 50000;
	const HEADING_NORM = 20;
	const REDIRECT_NORM = 30;

	// Relative weights of each signal; intermediateScore() divides by their sum
	const INCOMING_LINKS_WEIGHT = 0.6;
	const EXTERNAL_LINKS_WEIGHT = 0.1;
	const PAGE_SIZE_WEIGHT = 0.1;
	const HEADING_WEIGHT = 0.2;
	const REDIRECT_WEIGHT = 0.1;

	// The final score will be in the range [0, SCORE_RANGE]
	const SCORE_RANGE = 10000000;

	/**
	 * Template boosts configured by the mediawiki admin.
	 *
	 * @var float[] array of key values, key is the template and value is a float
	 */
	private $boostTemplates;

	/**
	 * @var int the number of docs in the index
	 */
	protected $maxDocs;

	/**
	 * Defaults to 1, the same floor setMaxDocs() applies to very small wikis,
	 * so that scoring a document before setMaxDocs() has been called does not
	 * divide by an unset norm.
	 *
	 * @var int normalisation factor for incoming links
	 */
	private $incomingLinksNorm = 1;

	/**
	 * @param float[]|null $boostTemplates Array of key values, key is the template name, value the boost factor.
	 *  Defaults to Util::getDefaultBoostTemplates(); note that any empty value
	 *  (including an empty array) also falls back to the defaults.
	 */
	public function __construct( $boostTemplates = null ) {
		$this->boostTemplates = $boostTemplates ?: Util::getDefaultBoostTemplates();
	}

	/**
	 * Scale the composite quality score to an integer weight.
	 *
	 * @param array $doc A document from the PAGE type
	 * @return int the weight of the document, intermediateScore() multiplied
	 *  by SCORE_RANGE and truncated to an integer
	 */
	public function score( array $doc ) {
		return intval( $this->intermediateScore( $doc ) * self::SCORE_RANGE );
	}

	/**
	 * Combine the normalized signals into a weighted average and apply the
	 * template boosts.
	 *
	 * Missing fields are treated as 0. Incoming links and page size use the
	 * logarithmic normalization, the count based signals the linear one.
	 *
	 * @param array $doc A document from the PAGE type
	 * @return float the boosted composite score
	 */
	protected function intermediateScore( array $doc ) {
		$incLinks = $this->scoreNormL2(
			isset( $doc['incoming_links'] ) ? $doc['incoming_links'] : 0,
			$this->incomingLinksNorm
		);
		$pageSize = $this->scoreNormL2(
			isset( $doc['text_bytes'] ) ? $doc['text_bytes'] : 0,
			self::PAGE_SIZE_NORM
		);
		$extLinks = $this->scoreNorm(
			isset( $doc['external_link'] ) ? count( $doc['external_link'] ) : 0,
			self::EXTERNAL_LINKS_NORM
		);
		$headings = $this->scoreNorm(
			isset( $doc['heading'] ) ? count( $doc['heading'] ) : 0,
			self::HEADING_NORM
		);
		$redirects = $this->scoreNorm(
			isset( $doc['redirect'] ) ? count( $doc['redirect'] ) : 0,
			self::REDIRECT_NORM
		);

		$score = $incLinks * self::INCOMING_LINKS_WEIGHT;
		$score += $extLinks * self::EXTERNAL_LINKS_WEIGHT;
		$score += $pageSize * self::PAGE_SIZE_WEIGHT;
		$score += $headings * self::HEADING_WEIGHT;
		$score += $redirects * self::REDIRECT_WEIGHT;

		// We have a standardized composite score between 0 and 1
		$score /= self::INCOMING_LINKS_WEIGHT + self::EXTERNAL_LINKS_WEIGHT
			+ self::PAGE_SIZE_WEIGHT + self::HEADING_WEIGHT + self::REDIRECT_WEIGHT;

		return $this->boostTemplates( $doc, $score );
	}

	/**
	 * log2( ( value / norm ) + 1 ) => [0-1]
	 *
	 * Values above the norm saturate at 1. A norm that is unset, zero or
	 * negative cannot be divided by: any positive value is then treated as
	 * saturated and everything else scores 0.
	 *
	 * @param float $value
	 * @param float $norm
	 * @return float between 0 and 1
	 */
	public function scoreNormL2( $value, $norm ) {
		if ( $norm <= 0 ) {
			// Avoid a division by zero (a DivisionByZeroError on PHP 8)
			return $value > 0 ? 1.0 : 0.0;
		}
		return log( $value > $norm ? 2 : ( $value / $norm ) + 1, 2 );
	}

	/**
	 * value / norm => [0-1]
	 *
	 * Values above the norm saturate at 1. A norm that is unset, zero or
	 * negative cannot be divided by: any positive value is then treated as
	 * saturated and everything else scores 0.
	 *
	 * @param float $value
	 * @param float $norm
	 * @return float between 0 and 1
	 */
	public function scoreNorm( $value, $norm ) {
		if ( $norm <= 0 ) {
			// Avoid a division by zero (a DivisionByZeroError on PHP 8)
			return $value > 0 ? 1 : 0;
		}
		return $value > $norm ? 1 : $value / $norm;
	}

	/**
	 * Modify an existing score based on templates contained
	 * by the document.
	 *
	 * The factors of every configured template found in the document are
	 * multiplied together and the product is applied once through boost().
	 *
	 * @param array $doc Document score is generated for
	 * @param float $score Current score between 0 and 1
	 * @return float Score after boosting templates
	 */
	public function boostTemplates( array $doc, $score ) {
		if ( !isset( $doc['template'] ) || !$this->boostTemplates ) {
			return $score;
		}

		// compute the global boost
		$boost = 1;
		foreach ( $this->boostTemplates as $template => $factor ) {
			// Loose comparison is kept on purpose: PHP casts numeric string
			// array keys to integers, and a strict check would never match
			// such a key against the template names stored as strings.
			if ( in_array( $template, $doc['template'] ) ) {
				$boost *= $factor;
			}
		}

		if ( $boost != 1 ) {
			return $this->boost( $score, $boost );
		}
		return $score;
	}

	/**
	 * Boost the score :
	 *   boost value lower than 1 will decrease the score
	 *   boost value set to 1 will keep the score unchanged
	 *   boost value greater than 1 will increase the score
	 *
	 * score = 0.5, boost = 0.5 result is 0.375
	 * score = 0.1, boost = 2 result is 0.325
	 *
	 * @param float $score
	 * @param float $boost
	 * @return float adjusted score
	 */
	public function boost( $score, $boost ) {
		if ( $boost == 1 ) {
			return $score;
		}

		// Transform the boost to a value between -1 and 1
		$boost = $boost > 1 ? 1 - ( 1 / $boost ) : - ( 1 - $boost );

		// @todo: the 0.5 ratio is hardcoded we could maybe allow customization
		// here, this would be a way to increase the impact of template boost
		if ( $boost > 0 ) {
			// Move the score up by at most half of the remaining headroom
			return $score + ( ( ( 1 - $score ) / 2 ) * $boost );
		}
		// Move the score down by at most half of its current value
		return $score + ( ( $score / 2 ) * $boost );
	}

	/**
	 * @return string[] the document fields read by intermediateScore() and
	 *  boostTemplates()
	 */
	public function getRequiredFields() {
		return [ 'incoming_links', 'external_link', 'text_bytes', 'heading', 'redirect', 'template' ];
	}

	/**
	 * Record the index size and derive the incoming links norm from it.
	 *
	 * @param int $maxDocs the total number of docs in the index
	 */
	public function setMaxDocs( $maxDocs ) {
		$this->maxDocs = $maxDocs;
		// We normalize incoming links according to the size of the index;
		// on a very small wiki the norm is forced to 1
		$this->incomingLinksNorm = max( 1, (int)( $maxDocs * self::INCOMING_LINKS_MAX_DOCS_FACTOR ) );
	}
}
/**
* Score that combines QualityScore and the pageviews statistics (popularity)
*/
class PQScore extends QualityScore {
	// Relative weights of the quality score and of the popularity
	const QSCORE_WEIGHT = 1;
	const POPULARITY_WEIGHT = 0.4;
	// 0.04% of the total page views is the max we accept
	// @todo: tested on enwiki values only
	const POPULARITY_MAX = 0.0004;

	/**
	 * @return string[] the parent's fields plus 'popularity_score'
	 */
	public function getRequiredFields() {
		$fields = parent::getRequiredFields();

		return array_merge( $fields, [ 'popularity_score' ] );
	}

	/**
	 * Weighted average of the quality score and the normalized popularity,
	 * scaled to SCORE_RANGE.
	 *
	 * @param array $doc
	 * @return int
	 */
	public function score( array $doc ) {
		$quality = $this->intermediateScore( $doc ) * self::QSCORE_WEIGHT;

		$rawPopularity = 0;
		if ( isset( $doc['popularity_score'] ) ) {
			$rawPopularity = $doc['popularity_score'];
		}

		$combined = $quality + $this->normalizePopularity( $rawPopularity ) * self::POPULARITY_WEIGHT;
		$combined /= self::QSCORE_WEIGHT + self::POPULARITY_WEIGHT;

		return intval( $combined * self::SCORE_RANGE );
	}

	/**
	 * Map a raw popularity value onto a rough log scale.
	 *
	 * @param float $popularity the document's 'popularity_score' (0 if missing)
	 * @return float|int 1 above POPULARITY_MAX, 0 when no usable log base
	 *  can be derived from maxDocs, the log scaled value otherwise
	 */
	private function normalizePopularity( $popularity ) {
		if ( $popularity > self::POPULARITY_MAX ) {
			return 1;
		}

		$base = 1 + self::POPULARITY_MAX * $this->maxDocs;
		// log₁(x) is undefined
		if ( !( $base > 1 ) ) {
			return 0;
		}

		// @fixme: rough log scale by using maxDocs...
		return log( 1 + ( $popularity * $this->maxDocs ), $base );
	}
}