%PDF- %PDF-
Mini Shell

Mini Shell

Direktori : /lib/calibre/calibre/utils/
Upload File :
Create Path :
Current File : //lib/calibre/calibre/utils/search_query_parser.py

#!/usr/bin/env python3


__license__   = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en'

'''
A parser for search queries with a syntax very similar to that used by
the Google search engine.

For details on the search query syntax see :class:`SearchQueryParser`.
To use the parser, subclass :class:`SearchQueryParser` and implement the
methods :method:`SearchQueryParser.universal_set` and
:method:`SearchQueryParser.get_matches`. See for example :class:`Tester`.

If this module is run, it will perform a series of unit tests.
'''

import weakref, re

from calibre.constants import preferred_encoding
from calibre.utils.icu import sort_key
from calibre import prints
from polyglot.builtins import codepoint_to_chr


'''
This class manages access to the preference holding the saved search queries.
It exists to ensure that unicode is used throughout, and also to permit
adding other fields, such as whether the search is a 'favorite'
'''


class SavedSearchQueries:
    '''
    Holds the saved search queries as a mapping of name to query text and
    writes the mapping back to a database preference after every change.

    The database is held only through a weak reference. When no database is
    supplied the queries are kept in memory and never persisted.
    '''
    queries = {}
    opt_name = ''

    def __init__(self, db, _opt_name):
        '''
        :param db: Object exposing ``new_api``, or None for an in-memory store.
        :param _opt_name: Name of the preference the queries are stored under.
        '''
        self.opt_name = _opt_name
        if db is not None:
            db = db.new_api
            self._db = weakref.ref(db)
            self.queries = db.pref(self.opt_name, {})
        else:
            self.queries = {}
            # Mimic a dead weak reference so that the db property returns None
            self._db = lambda : None

    @property
    def db(self):
        '''The database object, or None if there is none (any more).'''
        return self._db()

    def save_queries(self):
        '''Store the current queries in the database preference, if a
        database is available.'''
        db = self.db
        if db is not None:
            db.set_pref(self.opt_name, self.queries)

    def force_unicode(self, x):
        '''Return ``x`` as a str. Anything that is not already a str is
        decoded with the preferred encoding, replacing undecodable bytes.'''
        if not isinstance(x, str):
            x = x.decode(preferred_encoding, 'replace')
        return x

    def add(self, name, value):
        '''Store ``value`` (stripped of surrounding whitespace) under
        ``name``, replacing any existing query of that exact name.'''
        self.queries[self.force_unicode(name)] = self.force_unicode(value).strip()
        self.save_queries()

    def lookup(self, name):
        '''Return the query whose name matches ``name`` ignoring case, or
        None if there is no such query.'''
        sn = self.force_unicode(name).lower()
        for n, q in self.queries.items():
            if sn == n.lower():
                return q
        return None

    def delete(self, name):
        '''Remove the query called ``name``. Unknown names are ignored.'''
        self.queries.pop(self.force_unicode(name), False)
        self.save_queries()

    def rename(self, old_name, new_name):
        '''
        Move the query stored under ``old_name`` to ``new_name``.

        Nothing is changed when ``old_name`` does not exist or when both
        names are the same. Previously the first case created an entry with
        the value None and the second case deleted the query.
        '''
        old_key = self.force_unicode(old_name)
        new_key = self.force_unicode(new_name)
        if old_key != new_key and old_key in self.queries:
            self.queries[new_key] = self.queries.pop(old_key)
        self.save_queries()

    def set_all(self, smap):
        '''Replace all queries with the mapping ``smap``.'''
        self.queries = smap
        self.save_queries()

    def names(self):
        '''Return the query names ordered by ``sort_key``.'''
        return sorted(self.queries.keys(),key=sort_key)


'''
Create a global instance of the saved searches. It is global so that the searches
are common across all instances of the parser (devices, library, etc).
'''
# Start with an empty store that has no database attached;
# set_saved_searches() replaces it with a new instance.
ss = SavedSearchQueries(None, None)


def set_saved_searches(db, opt_name):
    '''
    Replace the module-wide saved search store with a new one built from
    ``db`` and the preference name ``opt_name``.
    '''
    global ss
    ss = SavedSearchQueries(db=db, _opt_name=opt_name)


def saved_searches():
    '''Return the module-wide saved search store.'''
    # Reading a module global needs no ``global`` declaration
    return ss


def global_lookup_saved_search(name):
    '''
    Look up ``name`` in the module-wide saved search store and return
    whatever its ``lookup`` method returns.
    '''
    store = ss
    return store.lookup(name)


'''
Parse a search expression into a series of potentially recursive operations.

Note that the interpreter wants binary operators, not n-ary ops. This is why we
recurse instead of iterating when building sequences of the same op.

The syntax is more than a bit twisted. In particular, the handling of colons
in the base token requires semantic analysis.

Also note that the query string is lowercased before analysis. This is OK because
calibre's searches are all case-insensitive.

Grammar:

prog ::= or_expression

or_expression ::= and_expression [ 'or' or_expression ]

and_expression ::= not_expression [ [ 'and' ] and_expression ]

not_expression ::= [ 'not' ] location_expression

location_expression ::= base_token | ( '(' or_expression ')' )

base_token ::= a sequence of letters and colons, perhaps quoted
'''


class Parser:
    '''
    Recursive descent parser that converts a search expression into a tree
    of nested lists.

    The nodes are ``['or', lhs, rhs]``, ``['and', lhs, rhs]``,
    ``['not', operand]`` and ``['token', location, text]``.
    '''

    # Type codes stored as the first item of every token
    OPCODE = 1
    WORD = 2
    QUOTED_WORD = 3
    EOF = 4
    # Pairs of (backslash escaped character, placeholder). The escapable
    # characters are \ " ( ) and the placeholders come from
    # codepoint_to_chr(1) to codepoint_to_chr(4).
    REPLACEMENTS = tuple(('\\' + x, codepoint_to_chr(i + 1)) for i, x in enumerate('\\"()'))

    # Lexer rules, tried in order: a parenthesis, a word starting with @
    # that contains a colon, any other word, a quoted string (the quotes are
    # dropped) and whitespace (skipped).
    lex_scanner = re.Scanner([
            (r'[()]', lambda scanner, text: (Parser.OPCODE, text)),
            (r'@.+?:[^")\s]+', lambda scanner, text: (Parser.WORD, str(text))),
            (r'[^"()\s]+', lambda scanner, text: (Parser.WORD, str(text))),
            (r'".*?((?<!\\)")', lambda scanner, text: (Parser.QUOTED_WORD, text[1:-1])),
            (r'\s+',              None)
    ], flags=re.DOTALL)

    def __init__(self):
        self.tokens = None
        self.current_token = 0

    def token(self, advance=False):
        '''Return the text of the current token, or None at the end of the
        input. If ``advance`` is true, also move on to the next token.'''
        if self.is_eof():
            return None
        position = self.current_token
        text = self.tokens[position][1]
        if advance:
            self.current_token = position + 1
        return text

    def lcase_token(self, advance=False):
        '''Same as :meth:`token`, but the text is passed through
        ``icu_lower`` before it is returned.'''
        if self.is_eof():
            return None
        position = self.current_token
        text = self.tokens[position][1]
        if advance:
            self.current_token = position + 1
        return icu_lower(text)

    def token_type(self):
        '''Return the type code of the current token, or ``EOF`` when all
        tokens have been consumed.'''
        return self.EOF if self.is_eof() else self.tokens[self.current_token][0]

    def is_eof(self):
        '''Return True when there are no tokens left.'''
        return len(self.tokens) <= self.current_token

    def advance(self):
        '''Move on to the next token.'''
        self.current_token = self.current_token + 1

    def tokenize(self, expr):
        '''
        Split ``expr`` into a list of ``(type, text)`` tuples.

        Backslash escaped backslashes, quotes and parentheses are swapped
        for placeholder characters before scanning so that the scanner does
        not treat them as syntax. Afterwards the placeholders in words and
        quoted words are turned into the plain (unescaped) characters.
        Whatever the scanner could not match is discarded.
        '''
        for escaped, placeholder in self.REPLACEMENTS:
            expr = expr.replace(escaped, placeholder)
        scanned = self.lex_scanner.scan(expr)[0]

        def unescape(text):
            for escaped, placeholder in self.REPLACEMENTS:
                # escaped[1:] is the character without its backslash
                text = text.replace(placeholder, escaped[1:])
            return text

        textual = (self.WORD, self.QUOTED_WORD)
        result = []
        for kind, value in scanned:
            if kind in textual:
                value = unescape(value)
            result.append((kind, value))
        return result

    def parse(self, expr, locations):
        '''
        Parse ``expr`` and return its tree.

        ``locations`` is the collection of names accepted as the location
        part of a ``location:text`` word. A ParseException is raised if
        tokens remain after a complete expression has been read.
        '''
        self.locations = locations
        self.tokens = self.tokenize(expr)
        self.current_token = 0
        tree = self.or_expression()
        if self.is_eof():
            return tree
        raise ParseException(_('Extra characters at end of search'))

    def or_expression(self):
        '''or_expression ::= and_expression [ 'or' or_expression ]'''
        left = self.and_expression()
        if self.lcase_token() != 'or':
            return left
        self.advance()
        return ['or', left, self.or_expression()]

    def and_expression(self):
        '''and_expression ::= not_expression [ [ 'and' ] and_expression ]'''
        left = self.not_expression()
        keyword = self.lcase_token()
        if keyword == 'and':
            self.advance()
        else:
            # The 'and' is optional: anything that can start an operand,
            # other than the word 'or', continues the conjunction
            starts_operand = (
                self.token_type() in [self.WORD, self.QUOTED_WORD] or self.token() == '(')
            if not starts_operand or keyword == 'or':
                return left
        return ['and', left, self.and_expression()]

    def not_expression(self):
        '''not_expression ::= [ 'not' ] location_expression

        Repeated 'not' words are accepted and nest.'''
        if self.lcase_token() != 'not':
            return self.location_expression()
        self.advance()
        return ['not', self.not_expression()]

    def location_expression(self):
        '''location_expression ::= base_token | ( '(' or_expression ')' )'''
        kind = self.token_type()
        if kind == self.OPCODE and self.token() == '(':
            self.advance()
            inner = self.or_expression()
            # The token is consumed whenever it is an opcode, matching or not
            closed = self.token_type() == self.OPCODE and self.token(advance=True) == ')'
            if not closed:
                raise ParseException(_('missing )'))
            return inner
        if kind in (self.WORD, self.QUOTED_WORD):
            return self.base_token()
        raise ParseException(_('Invalid syntax. Expected a lookup name or a word'))

    def base_token(self):
        '''
        Read one word or quoted word and return a ``['token', location, text]``
        node. The location is 'all' unless the word starts with a known
        location followed by a colon.
        '''
        if self.token_type() == self.QUOTED_WORD:
            return ['token', 'all', self.token(advance=True)]

        parts = self.token(advance=True).split(':')
        prefix = parts[0].lower()

        # The search text may itself contain colons, so the word only names a
        # location when there is more than one part and the first part is a
        # known location. Otherwise the word is put back together unchanged.
        # This gives a "wrong" answer for something like 'author: "foo"',
        # which is read as 'author:"foo"'. That is accepted; such a search
        # should be written as '"author:" foo'.
        if len(parts) < 2 or prefix not in self.locations:
            return ['token', 'all', ':'.join(parts)]

        value_parts = parts[1:]
        # A location can be followed by a quoted string, which arrives as
        # the next token and is then used as the search text
        if len(value_parts) == 1 and self.token_type() == self.QUOTED_WORD:
            return ['token', prefix, self.token(advance=True)]
        return ['token', icu_lower(prefix), ':'.join(value_parts)]


class ParseException(Exception):
    '''Raised when a search expression cannot be parsed or evaluated.'''

    @property
    def msg(self):
        '''The first constructor argument, or an empty string if the
        exception was created without arguments.'''
        return self.args[0] if self.args else ""


class SearchQueryParser:
    '''
    Parses a search query.

    A search query consists of tokens. The tokens can be combined using
    the `or`, `and` and `not` operators as well as grouped using parentheses.
    When no operator is specified between two tokens, `and` is assumed.

    Each token is a string of the form `location:query`. `location` is a string
    from :member:`DEFAULT_LOCATIONS`. It is optional. If it is omitted, it is assumed to
    be `all`. `query` is an arbitrary string that must not contain parentheses.
    If it contains whitespace, it should be quoted by enclosing it in `"` marks.

    Examples::

      * `Asimov` [search for the string "Asimov" in location `all`]
      * `comments:"This is a good book"` [search for "This is a good book" in `comments`]
      * `author:Asimov tag:unread` [search for books by Asimov that have been tagged as unread]
      * `author:Asimov or author:Hardy` [search for books by Asimov or Hardy]
      * `(author:Asimov or author:Hardy) and not tag:read` [search for unread books by Asimov or Hardy]

    Subclasses supply the data by overriding :meth:`universal_set` and
    :meth:`get_matches`.
    '''

    @staticmethod
    def run_tests(parser, result, tests):
        '''
        Run every ``(query, expected)`` pair in ``tests`` through
        ``parser.parseString`` and compare the ``result`` entry of the
        outcome with ``expected``.

        :return: The list of queries whose outcome differed.
        '''
        failed = []
        for test in tests:
            prints('\tTesting:', test[0], end=' ')
            res = parser.parseString(test[0])
            got = list(res.get(result, None))
            if got == test[1]:
                print('OK')
            else:
                print('FAILED:', 'Expected:', test[1], 'Got:', got)
                failed.append(test[0])
        return failed

    def __init__(self, locations, test=False, optimize=False, lookup_saved_search=None, parse_cache=None):
        '''
        :param locations: Names accepted as the location part of a token.
        :param test: Accepted and passed on, but not used by this class.
        :param optimize: If true, :meth:`get_matches` is given the current
            candidates so that it can restrict its work to them.
        :param lookup_saved_search: Callable mapping a saved search name to
            its query text (or None). Defaults to the module-wide lookup.
        :param parse_cache: Optional mapping from query text to parse tree.
        '''
        self.sqp_initialize(locations, test=test, optimize=optimize)
        self.parser = Parser()
        self.lookup_saved_search = global_lookup_saved_search if lookup_saved_search is None else lookup_saved_search
        self.sqp_parse_cache = parse_cache

    def sqp_change_locations(self, locations):
        '''Switch to a new set of locations and empty the parse cache, as
        cached trees were built with the old locations.'''
        self.sqp_initialize(locations, optimize=self.optimize)
        if self.sqp_parse_cache is not None:
            self.sqp_parse_cache.clear()

    def sqp_initialize(self, locations, test=False, optimize=False):
        '''(Re)set the locations and the optimize flag. ``test`` is unused.'''
        self.locations = locations
        self._tests_failed = False
        self.optimize = optimize

    def get_queried_fields(self, query):
        '''Yield a ``(location, text)`` pair for every token in ``query``,
        with references to saved searches replaced by the tokens of the
        saved search.'''
        # empty the list of searches used for recursion testing
        self.recurse_level = 0
        self.searches_seen = set()
        tree = self._get_tree(query)
        yield from self._walk_expr(tree)

    def _walk_expr(self, tree):
        '''Yield the ``(location, text)`` pairs of all tokens below
        ``tree``, descending into saved searches.'''
        if tree[0] in ('or', 'and'):
            yield from self._walk_expr(tree[1])
            yield from self._walk_expr(tree[2])
        elif tree[0] == 'not':
            yield from self._walk_expr(tree[1])
        else:
            if tree[1] == 'search':
                yield from self._walk_expr(self._get_tree(
                                          self._get_saved_search_text(tree[2])))
            else:
                yield tree[1], tree[2]

    def parse(self, query, candidates=None):
        '''
        Evaluate ``query`` and return the set of matches.

        NOTE(review): the ``candidates`` argument is accepted but ignored,
        the search always starts from :meth:`universal_set`. This has been
        left alone because callers may rely on it -- confirm the intent.
        '''
        # empty the list of searches used for recursion testing
        self.recurse_level = 0
        self.searches_seen = set()
        candidates = self.universal_set()
        return self._parse(query, candidates=candidates)

    def _get_tree(self, query):
        '''
        Return the parse tree for ``query``, using the parse cache when one
        was supplied.

        :raises ParseException: If the parser hits the recursion limit.
        '''
        self.recurse_level += 1
        cache = self.sqp_parse_cache
        # Test for a missing cache explicitly. Catching AttributeError for
        # this would also hide attribute errors raised inside the cache.
        if cache is not None:
            res = cache.get(query, None)
            if res is not None:
                return res
        try:
            res = self.parser.parse(query, self.locations)
        except RuntimeError as err:
            raise ParseException(_('Failed to parse query, recursion limit reached: %s')%repr(query)) from err
        if cache is not None:
            cache[query] = res
        return res

    # this parse is used internally because it doesn't clear the
    # recursive search test list. However, we permit seeing the
    # same search a few times because the search might appear within
    # another search.
    def _parse(self, query, candidates=None):
        '''
        Evaluate ``query`` against ``candidates`` (the universal set when
        None) without resetting the recursion bookkeeping.

        NOTE(review): recurse_level is incremented here and again in
        _get_tree but decremented only once, so it grows by one for every
        call that completes -- confirm whether that is intended.
        '''
        self.recurse_level += 1
        tree = self._get_tree(query)
        if candidates is None:
            candidates = self.universal_set()
        t = self.evaluate(tree, candidates)
        self.recurse_level -= 1
        return t

    def method(self, group_name):
        '''Return the ``evaluate_<group_name>`` method of this object.'''
        return getattr(self, 'evaluate_'+group_name)

    def evaluate(self, parse_result, candidates):
        '''Evaluate a parse tree node by dispatching on its first item; the
        remaining items are handed to the handler as its argument.'''
        return self.method(parse_result[0])(parse_result[1:], candidates)

    def evaluate_and(self, argument, candidates):
        '''Return the items matched by both operands.'''
        # The right-hand side only needs to look at what the left-hand side
        # matched: RHmatches(LHmatches(c))
        l = self.evaluate(argument[0], candidates)
        return l.intersection(self.evaluate(argument[1], l))

    def evaluate_or(self, argument, candidates):
        '''Return the items matched by either operand.'''
        # The right-hand side only needs to look at what the left-hand side
        # did not match: LHmatches(c) + RHmatches(c - LHmatches(c))
        l = self.evaluate(argument[0], candidates)
        return l.union(self.evaluate(argument[1], candidates.difference(l)))

    def evaluate_not(self, argument, candidates):
        '''Return the candidates not matched by the operand.'''
        # c - matches(c)
        return candidates.difference(self.evaluate(argument[0], candidates))

    def _get_saved_search_text(self, query):
        '''
        Return the query text of the saved search named ``query``. A
        leading ``=`` in the name is dropped.

        Once recurse_level exceeds 10 the names looked up are remembered,
        and looking up a remembered name again is reported as recursion.

        :raises ParseException: For a recursive or unknown saved search, or
            when the lookup fails with any other ordinary exception.
        '''
        if query.startswith('='):
            query = query[1:]
        try:
            if query.lower() in self.searches_seen:
                raise ParseException(_('Recursive saved search: {0}').format(query))
            if self.recurse_level > 10:
                self.searches_seen.add(query.lower())
            text = self.lookup_saved_search(query)
            if text is None:
                raise ParseException(_('Unknown saved search: {}').format(query))
            return text
        except ParseException:
            raise
        except Exception as err:
            # Convert ordinary failures (e.g. a missing key) to a parse
            # error. KeyboardInterrupt and SystemExit are deliberately not
            # caught so that they can still stop the program.
            import traceback
            traceback.print_exc()
            raise ParseException(_('Unknown error in saved search: {0}').format(query)) from err

    def evaluate_token(self, argument, candidates):
        '''Evaluate a ``location, text`` pair: the location ``search``
        runs the named saved search, anything else is looked up with
        :meth:`get_matches`.'''
        location = argument[0]
        query = argument[1]
        if location.lower() == 'search':
            return self._parse(self._get_saved_search_text(query), candidates)
        return self._get_matches(location, query, candidates)

    def _get_matches(self, location, query, candidates):
        '''Call :meth:`get_matches`, passing ``candidates`` on only when
        optimization is enabled.'''
        if self.optimize:
            return self.get_matches(location, query, candidates=candidates)
        else:
            return self.get_matches(location, query)

    def get_matches(self, location, query, candidates=None):
        '''
        Should return the set of matches for :param:`location` and :param:`query`.

        The search must be performed over all entries if :param:`candidates` is
        None otherwise only over the items in candidates.

        :param:`location` is one of the items in :member:`SearchQueryParser.DEFAULT_LOCATIONS`.
        :param:`query` is a string literal.
        :return: None or a subset of the set returned by :meth:`universal_set`.
        '''
        return set()

    def universal_set(self):
        '''
        Should return the set of all matches.
        '''
        return set()

Zerion Mini Shell 1.0