%PDF- %PDF-
Mini Shell

Mini Shell

Direktori : /www/varak.net/wiki.varak.net/extensions/CirrusSearch/includes/Search/
Upload File :
Create Path :
Current File : /www/varak.net/wiki.varak.net/extensions/CirrusSearch/includes/Search/Escaper.php

<?php

namespace CirrusSearch\Search;

/**
 * Escapes queries.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 * http://www.gnu.org/copyleft/gpl.html
 */
class Escaper {

	/**
	 * @var string MediaWiki language code
	 */
	private $language;

	/**
	 * Allow leading wildcards?
	 * @var bool
	 */
	private $allowLeadingWildcard;

	/**
	 * @param string $language MediaWiki language code
	 * @param bool $allowLeadingWildcard
	 */
	public function __construct( $language, $allowLeadingWildcard = true ) {
		$this->language = $language;
		$this->allowLeadingWildcard = $allowLeadingWildcard;
	}

	/**
	 * @param string $text
	 * @return string
	 */
	public function escapeQuotes( $text ) {
		if ( $this->language === 'he' ) {
			// Hebrew uses the double quote (") character as a standin for quotation marks (“”)
			// which delineate phrases.  It also uses double quotes as a standin for another
			// character (״), call a Gershayim, which mark acronyms.  Here we guess if the intent
			// was to mark a phrase, in which case we leave the quotes alone, or to mark an
			// acronym, in which case we escape them.
			return preg_replace( '/(\S+)"(\S)/u', '\1\\"\2', $text );
		}
		return $text;
	}

	/**
	 * Make sure the the query string part is well formed by escaping some syntax that we don't
	 * want users to get direct access to and making sure quotes are balanced.
	 * These special characters _aren't_ escaped:
	 * * and ?: Do a wildcard search against the stemmed text which isn't strictly a good
	 * idea but this is so rarely used that adding extra code to flip prefix searches into
	 * real prefix searches isn't really worth it.
	 * ~: Do a fuzzy match against the stemmed text which isn't strictly a good idea but it
	 * gets the job done and fuzzy matches are a really rarely used feature to be creating an
	 * extra index for.
	 * ": Perform a phrase search for the quoted term.  If the "s aren't balanced we insert one
	 * at the end of the term to make sure elasticsearch doesn't barf at us.
	 *
	 * @param string $string
	 * @return string
	 */
	public function fixupQueryStringPart( $string ) {
		// Escape characters that can be escaped with \\
		$string = preg_replace( '/(
				\(|     (?# no user supplied groupings)
				\)|
				\{|     (?# no exclusive range queries)
				}|
				\[|     (?# no inclusive range queries either)
				]|
				\^|     (?# no user supplied boosts at this point, though I cant think why)
				:|		(?# no specifying your own fields)
				\\\(?!") (?# the only acceptable escaping is for quotes)
			)/x', '\\\$1', $string );
		// Forward slash escaping doesn't work properly in all environments so we just eat them.   Nom.
		$string = str_replace( '/', ' ', $string );

		// Elasticsearch's query strings can't abide unbalanced quotes
		return $this->balanceQuotes( $string );
	}

	/**
	 * Make sure that all operators and lucene syntax is used correctly in the query string
	 * and store if this is a fuzzy query.
	 * If it isn't then the syntax escaped so it becomes part of the query text.
	 *
	 * @param string $string
	 * @return array(string, boolean) (fixed up query string, is this a fuzzy query?)
	 */
	public function fixupWholeQueryString( $string ) {
		$escapeBadSyntax = [ self::class, 'escapeBadSyntax' ];

		// Be careful when editing this method because the ordering of the replacements matters.

		// Escape ~ that don't follow a term or a quote
		$string = preg_replace_callback( '/(?<![\w"])~/u', $escapeBadSyntax, $string );

		// When allow leading wildcard is disabled elasticsearch will report an
		// error if these are unescaped. Escape ? and * that don't follow a term.
		if ( !$this->allowLeadingWildcard ) {
			$string = preg_replace_callback( '/(?<![\w])([?*])/u', $escapeBadSyntax, $string );
		}

		// Reduce token ranges to bare tokens without the < or >
		$string = preg_replace( '/(?:<|>)+([^\s])/u', '$1', $string );

		// Turn bad fuzzy searches into searches that contain a ~ and set $this->fuzzyQuery for good ones.
		$fuzzyQuery = false;
		$string = preg_replace_callback( '/(?<leading>\w)~(?<trailing>\S*)/u',
			function ( $matches ) use ( &$fuzzyQuery ) {
				if ( preg_match( '/^(|[0-2])$/', $matches[ 'trailing' ] ) ) {
					$fuzzyQuery = true;
					return $matches[ 0 ];
				} else {
					return $matches[ 'leading' ] . '\\~' .
						preg_replace( '/(?<!\\\\)~/', '\~', $matches[ 'trailing' ] );
				}
			}, $string );

		// Turn bad proximity searches into searches that contain a ~
		$string = preg_replace_callback( '/"~(?<trailing>\S*)/u', function ( $matches ) {
			if ( preg_match( '/\d+/', $matches[ 'trailing' ] ) ) {
				return $matches[ 0 ];
			} else {
				return '"\\~' . $matches[ 'trailing' ];
			}
		}, $string );

		// Escape +, -, and ! when not immediately followed by a term or when immediately
		// prefixed with a term.  Catches "foo-bar", "foo- bar", "foo - bar".  The only
		// acceptable use is "foo -bar" and "-bar foo".
		$string = preg_replace_callback( '/[+\-!]+(?!\w)/u', $escapeBadSyntax, $string );
		$string = preg_replace_callback( '/(?<!^|[ \\\\])[+\-!]+/u', $escapeBadSyntax, $string );

		// Escape || when not between terms
		$string = preg_replace_callback( '/^\s*\|\|/u', $escapeBadSyntax, $string );
		$string = preg_replace_callback( '/\|\|\s*$/u', $escapeBadSyntax, $string );

		// Lowercase AND and OR when not surrounded on both sides by a term.
		// Lowercase NOT when it doesn't have a term after it.
		$string = preg_replace_callback( '/^|(AND|OR|NOT)\s*(?:AND|OR)/u',
			[ self::class, 'lowercaseMatched' ], $string );
		$string = preg_replace_callback( '/(?:AND|OR|NOT)\s*$/u',
			[ self::class, 'lowercaseMatched' ], $string );

		return [ $string, $fuzzyQuery ];
	}

	/**
	 * @param string[] $matches
	 * @return string
	 */
	private static function escapeBadSyntax( $matches ) {
		return "\\" . implode( "\\", str_split( $matches[ 0 ] ) );
	}

	/**
	 * @param string[] $matches
	 * @return string
	 */
	private static function lowercaseMatched( $matches ) {
		return strtolower( $matches[ 0 ] );
	}

	/**
	 * @param string $text
	 * @return string
	 */
	public function balanceQuotes( $text ) {
		$inQuote = false;
		$inEscape = false;
		$len = strlen( $text );
		for ( $i = 0; $i < $len; $i++ ) {
			if ( $inEscape ) {
				$inEscape = false;
				continue;
			}
			switch ( $text[ $i ] ) {
			case '"':
				$inQuote = !$inQuote;
				break;
			case '\\':
				$inEscape = true;
			}
		}
		if ( $inQuote ) {
			$text = $text . '"';
		}
		return $text;
	}

	/**
	 * Is leading wildcard allowed?
	 *
	 * @return bool
	 */
	public function getAllowLeadingWildcard() {
		return $this->allowLeadingWildcard;
	}
}

Zerion Mini Shell 1.0