%PDF- %PDF-
Mini Shell

Mini Shell

Direktori : /lib/calibre/calibre/utils/
Upload File :
Create Path :
Current File : //lib/calibre/calibre/utils/matcher.py

#!/usr/bin/env python3


__license__ = 'GPL v3'
__copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'

import atexit, os, sys
from math import ceil
from unicodedata import normalize
from threading import Thread, Lock
from operator import itemgetter
from collections import OrderedDict
from itertools import islice

from calibre import detect_ncpus as cpu_count, as_unicode
from calibre.constants import filesystem_encoding
from calibre.utils.icu import primary_sort_key, primary_find, primary_collator
from polyglot.builtins import iteritems, itervalues
from polyglot.queue import Queue

# Default character classes consulted by calc_score_for_char(): a matched
# character scores higher when the character just before it belongs to one
# of these classes. level1 gives the largest factor (0.9), then level2 (0.8),
# then level3 (0.7).
DEFAULT_LEVEL1 = '/'
DEFAULT_LEVEL2 = '-_ 0123456789'
DEFAULT_LEVEL3 = '.'


class PluginFailed(RuntimeError):
    '''
    Error caught by default_scorer() to fall back from the native scorer to
    the pure Python one.
    '''


class Worker(Thread):
    '''
    Background thread that takes scoring jobs from a request queue and puts
    the outcome on a result queue.

    Each request is a 3-tuple ``(i, scorer, query)``. A successful job puts
    ``(True, (i, scores))`` on the result queue, where ``scores`` is a tuple
    built from ``scorer(query)``. A failed job puts ``(False, message)``.
    A request of ``None`` makes the thread leave its loop.
    '''

    # Class attribute overriding Thread.daemon; intended to mark workers as
    # daemon threads so they do not keep the process alive.
    daemon = True

    def __init__(self, requests, results):
        '''
        :param requests: queue from which jobs (or the ``None`` sentinel) are read
        :param results: queue onto which ``(ok, payload)`` tuples are written
        '''
        Thread.__init__(self)
        self.requests, self.results = requests, results
        # At interpreter exit, enqueue one sentinel per worker so that each
        # worker can leave its loop.
        atexit.register(lambda: requests.put(None))

    def run(self):
        '''Process jobs until the ``None`` sentinel is received.'''
        while True:
            x = self.requests.get()
            if x is None:
                break
            try:
                i, scorer, query = x
                # The scorers are generator functions, so calling one does no
                # work until it is iterated. Consume the generator here so that
                # scoring runs in this worker thread and any error it raises is
                # reported through the (False, message) path below.
                scores = tuple(scorer(query))
                self.results.put((True, (i, scores)))
            except Exception as e:
                self.results.put((False, as_unicode(e)))
                # Uncomment for debugging:
                # import traceback
                # traceback.print_exc()


# Module-wide worker pool shared by every Matcher instance. wlock guards
# creation of the pool and is held by Matcher.__call__ for the whole of a
# query, because all workers share a single request/result queue pair.
wlock = Lock()
workers = []


def split(tasks, pool_size):
    '''
    Partition a sequence into consecutive chunks, producing at most
    pool_size chunks. Every chunk is a list of ``(i, x)`` 2-tuples, x being
    an element of the input sequence and i its index in that sequence.
    An empty input yields an empty list.
    '''
    total = len(tasks)
    chunk_size = int(ceil(total / pool_size))
    sections = []
    start = 0
    while start < total:
        stop = min(start + chunk_size, total)
        # enumerate(..., start) numbers the elements with their original index
        sections.append(list(enumerate(tasks[start:stop], start)))
        start = stop
    return sections


def default_scorer(*args, **kwargs):
    '''
    Build the preferred scorer for the given arguments.

    Tries the native CScorer first and falls back to the pure Python
    PyScorer when the native implementation is unavailable. All arguments
    are passed unchanged to whichever scorer class is used.
    '''
    try:
        return CScorer(*args, **kwargs)
    except (PluginFailed, ImportError):
        # CScorer loads the native module with a plain import statement, so a
        # missing extension surfaces as ImportError rather than PluginFailed.
        return PyScorer(*args, **kwargs)


class Matcher:
    '''
    Score a fixed collection of strings against queries, using the shared
    pool of Worker threads.

    The items are normalized to NFC, falsy items are dropped, and the rest
    is split into one section per worker. One scorer object is created per
    section.

    :param items: iterable of objects; each is converted with ``str()``
    :param level1: characters passed to the scorer as its ``level1`` argument
    :param level2: characters passed to the scorer as its ``level2`` argument
    :param level3: characters passed to the scorer as its ``level3`` argument
    :param scorer: callable invoked as ``scorer(items, level1, level2, level3)``
        that returns a callable taking a query and producing one
        ``(score, positions)`` pair per item. Defaults to default_scorer.
    '''

    def __init__(
        self,
        items,
        level1=DEFAULT_LEVEL1,
        level2=DEFAULT_LEVEL2,
        level3=DEFAULT_LEVEL3,
        scorer=None
    ):
        with wlock:
            # Lazily create the module-wide worker pool on first use
            if not workers:
                requests, results = Queue(), Queue()
                pool = [Worker(requests, results) for _ in range(max(1, cpu_count()))]
                for worker in pool:
                    worker.start()
                workers.extend(pool)
        self.items = items = tuple(normalize('NFC', str(x)) for x in items if x)
        tasks = split(items, len(workers))
        # For every section: index within the section -> index in self.items
        self.task_maps = [{j: i for j, (i, _) in enumerate(task)} for task in tasks]
        scorer = scorer or default_scorer
        # Forward the level characters so that non-default values actually
        # take effect in the scorers.
        self.scorers = [
            scorer(tuple(map(itemgetter(1), task_items)), level1, level2, level3)
            for task_items in tasks
        ]
        self.sort_keys = None

    def __call__(self, query, limit=None):
        '''
        Score every item against query.

        :param query: the text to search for; converted with ``str()`` and
            normalized to NFC
        :param limit: if not None, the score-sorted list is truncated to this
            many entries before zero-score entries are dropped
        :return: an OrderedDict mapping item -> positions, ordered by
            decreasing score and containing only items with a non-zero score
        :raises Exception: if scoring any section failed
        '''
        query = normalize('NFC', str(query))
        with wlock:
            requests, results = workers[0].requests, workers[0].results
            for i, scorer in enumerate(self.scorers):
                requests.put((i, scorer, query))
            # Computed once, while the workers are busy, and cached on the
            # instance. Not consulted by the sorting below.
            if self.sort_keys is None:
                self.sort_keys = {
                    i: primary_sort_key(x)
                    for i, x in enumerate(self.items)
                }
            scores, positions = {}, {}
            error = None
            # Always collect exactly one result per submitted job, even after
            # a failure, so nothing is left in the shared result queue for
            # the next query to pick up.
            for _ in range(len(self.task_maps)):
                ok, x = results.get()
                if not ok:
                    error = x
                    continue
                task_num, vals = x
                task_map = self.task_maps[task_num]
                try:
                    for i, (score, pos) in enumerate(vals):
                        item = task_map[i]
                        scores[item] = score
                        positions[item] = pos
                except Exception as e:
                    # vals may be a lazy iterable that only fails when consumed
                    error = as_unicode(e)

        if error is not None:
            raise Exception('Failed to score items: %s' % error)
        # Scores are negated so that an ascending sort puts the best first
        items = sorted(((-scores[i], item, positions[i])
                        for i, item in enumerate(self.items)),
                       key=itemgetter(0))
        if limit is not None:
            del items[limit:]
        return OrderedDict(x[1:] for x in filter(itemgetter(0), items))


def get_items_from_dir(basedir, acceptq=lambda x: True):
    '''
    Generate the path of every file below basedir, relative to basedir and
    using ``/`` as the separator.

    :param basedir: directory to walk; bytes are decoded with the filesystem
        encoding
    :param acceptq: predicate called with the joined (not yet relative) path
        of each file; only files for which it returns a true value are yielded
    '''
    if isinstance(basedir, bytes):
        basedir = basedir.decode(filesystem_encoding)
    convert_sep = os.sep != '/'
    for root, _subdirs, names in os.walk(basedir):
        for name in names:
            full_path = os.path.join(root, name)
            if not acceptq(full_path):
                continue
            rel_path = os.path.relpath(full_path, basedir)
            yield rel_path.replace(os.sep, '/') if convert_sep else rel_path


class FilesystemMatcher(Matcher):
    '''
    A Matcher whose items are the relative paths of all files found below a
    directory, as produced by get_items_from_dir().
    '''

    def __init__(self, basedir, *args, **kwargs):
        # Remaining arguments are passed through to Matcher unchanged
        super().__init__(get_items_from_dir(basedir), *args, **kwargs)


# Python implementation of the scoring algorithm {{{


def calc_score_for_char(ctx, prev, current, distance):
    '''
    Return the score for a matched character, based on the character that
    precedes it in the haystack.

    :param ctx: object providing max_score_per_char, level1, level2 and level3
    :param prev: the haystack character just before the matched one
    :param current: the matched haystack character
    :param distance: distance from the previous match; only used when no
        other rule applies
    '''
    base = ctx.max_score_per_char

    if prev in ctx.level1:
        return base * 0.9

    # A level2 predecessor, or prev being unchanged by lowercasing while
    # current is unchanged by uppercasing (e.g. a camelCase boundary).
    after_level2_or_case_change = prev in ctx.level2 or (
        icu_lower(prev) == prev and icu_upper(current) == current
    )
    if after_level2_or_case_change:
        return base * 0.8

    if prev in ctx.level3:
        return base * 0.7

    # No special predecessor: the score decays with the distance
    return base * ((1.0 / distance) * 0.75)


def process_item(ctx, haystack, needle):
    '''
    Search for the best scoring way of matching the characters of needle, in
    order, against characters of haystack.

    :param ctx: object providing ``memory`` (a dict used as a cache for the
        states visited while scoring this haystack), ``max_score_per_char``
        and the attributes read by calc_score_for_char()
    :param haystack: the string being searched
    :param needle: the query string
    :return: ``(score, positions)`` where positions is a list with one entry
        per needle character holding the haystack index it was matched at.
        When no complete match was found the score is 0 and the positions
        are all -1.
    '''
    # non-recursive implementation using a stack
    # Each stack entry is a search state:
    # (haystack index to search from, needle index to continue at,
    #  haystack index of the previous match, score so far, positions so far)
    stack = [(0, 0, 0, 0, [-1] * len(needle))]
    # Start with the "no match" result: score 0 and all positions -1
    final_score, final_positions = stack[0][-2:]
    push, pop = stack.append, stack.pop
    while stack:
        hidx, nidx, last_idx, score, positions = pop()
        key = (hidx, nidx, last_idx)
        mem = ctx.memory.get(key, None)
        if mem is None:
            for i in range(nidx, len(needle)):
                n = needle[i]
                # Fewer haystack characters left than needle characters:
                # a complete match is impossible from this state
                if (len(haystack) - hidx < len(needle) - i):
                    score = 0
                    break
                # First element of primary_find()'s result is the offset of n
                # within the searched slice, -1 meaning not found
                pos = primary_find(n, haystack[hidx:])[0]
                if pos == -1:
                    score = 0
                    break
                # Convert the offset within the slice to a haystack index
                pos += hidx

                distance = pos - last_idx
                # A match directly after the previous one gets the full score
                score_for_char = ctx.max_score_per_char if distance <= 1 else calc_score_for_char(
                    ctx, haystack[pos - 1], haystack[pos], distance
                )
                hidx = pos + 1
                # Remember the alternative of matching this same needle
                # character later in the haystack, with the score and a copy
                # of the positions as they were before this match
                push((hidx, i, last_idx, score, list(positions)))
                last_idx = positions[i] = pos
                score += score_for_char
            # Cache the outcome for this state (also when the loop was left
            # early with a score of 0)
            ctx.memory[key] = (score, positions)
        else:
            score, positions = mem
        # Keep the best result seen over all explored states
        if score > final_score:
            final_score = score
            final_positions = positions
    return final_score, final_positions


class PyScorer:
    '''
    Pure Python scorer. Holds a collection of items and, when called with a
    query, yields one ``(score, positions)`` pair per item, computed by
    process_item().

    An instance also serves as the ``ctx`` object passed to process_item():
    it carries the level characters, the per-item ``max_score_per_char`` and
    the per-item ``memory`` cache.
    '''
    __slots__ = (
        'level1', 'level2', 'level3', 'max_score_per_char', 'items', 'memory'
    )

    def __init__(
        self,
        items,
        level1=DEFAULT_LEVEL1,
        level2=DEFAULT_LEVEL2,
        level3=DEFAULT_LEVEL3
    ):
        '''
        :param items: iterable of strings to score; iterated on every call
        :param level1: characters checked first by calc_score_for_char()
        :param level2: characters checked second by calc_score_for_char()
        :param level3: characters checked third by calc_score_for_char()
        '''
        self.level1, self.level2, self.level3 = level1, level2, level3
        self.max_score_per_char = 0
        self.items = items

    def __call__(self, needle):
        '''
        Lazily yield ``(score, positions)`` for every item, in item order.

        An empty needle or an empty item cannot match anything; it yields a
        score of 0 with all positions set to -1 instead of dividing by zero.
        '''
        needle_len = len(needle)
        for item in self.items:
            if not needle_len or not item:
                # Same result process_item() gives when nothing matches
                yield 0, [-1] * needle_len
                continue
            self.max_score_per_char = (1.0 / len(item) + 1.0 / needle_len) / 2.0
            # The cache is only valid for a single haystack
            self.memory = {}
            yield process_item(self, item, needle)


# }}}


class CScorer:
    '''
    Scorer backed by the native ``calibre_extensions.matcher`` module. When
    called with a query it yields one ``(score, positions)`` pair per item.
    '''

    def __init__(
        self,
        items,
        level1=DEFAULT_LEVEL1,
        level2=DEFAULT_LEVEL2,
        level3=DEFAULT_LEVEL3
    ):
        # Imported here so that the module can be loaded without the
        # native extension being present
        from calibre_extensions.matcher import Matcher as NativeMatcher
        collator_capsule = primary_collator().capsule
        levels = (str(level1), str(level2), str(level3))
        self.m = NativeMatcher(items, collator_capsule, *levels)

    def __call__(self, query):
        scores, positions = self.m.calculate_scores(query)
        for pair in zip(scores, positions):
            yield pair


def test(return_tests=False):
    '''
    Build the unit tests for the native scorer.

    :param return_tests: when true, return a test suite loaded from the test
        case instead of running it. Otherwise the tests are run via a
        ``unittest.main`` subclass.
    '''
    # The leak test is skipped when LD_PRELOAD mentions libasan
    is_sanitized = 'libasan' in os.environ.get('LD_PRELOAD', '')
    import unittest

    class Test(unittest.TestCase):

        @unittest.skipIf(is_sanitized, 'Sanitizer enabled can\'t check for leaks')
        def test_mem_leaks(self):
            # Compare the memory growth of 10 runs with that of 100 runs
            import gc
            from calibre.utils.mem import get_memory as memory
            # Presumably a warm-up, so that one-time allocations (e.g. the
            # worker pool) are not counted in the measurements below
            m = Matcher(['a'], scorer=CScorer)
            m('a')

            def doit(c):
                m = Matcher([
                    c + 'im/one.gif',
                    c + 'im/two.gif',
                    c + 'text/one.html',
                ],
                            scorer=CScorer)
                m('one')

            start = memory()
            for i in range(10):
                doit(str(i))
            gc.collect()
            used10 = memory() - start
            start = memory()
            for i in range(100):
                doit(str(i))
            gc.collect()
            used100 = memory() - start
            # Only checked when both measurements show growth: ten times the
            # work must not use more than twice the memory
            if used100 > 0 and used10 > 0:
                self.assertLessEqual(used100, 2 * used10)

        def test_non_bmp(self):
            # The middle character lies outside the Basic Multilingual Plane;
            # matching the string against itself must report positions 0, 1, 2
            raw = '_\U0001f431-'
            m = Matcher([raw], scorer=CScorer)
            positions = next(itervalues(m(raw)))
            self.assertEqual(
                positions, (0, 1, 2)
            )

    if return_tests:
        return unittest.TestLoader().loadTestsFromTestCase(Test)

    class TestRunner(unittest.main):

        # Run only the Test case defined above
        def createTests(self):
            tl = unittest.TestLoader()
            self.test = tl.loadTestsFromTestCase(Test)

    TestRunner(verbosity=4)


def get_char(string, pos):
    '''Return the element of string found at index pos.'''
    ch = string[pos]
    return ch


def input_unicode(prompt):
    '''
    Prompt the user with input() and return the reply as text, decoding it
    with the encoding of stdin if it is a bytes object.
    '''
    reply = input(prompt)
    return reply.decode(sys.stdin.encoding) if isinstance(reply, bytes) else reply


def main(basedir=None, query=None):
    '''
    Interactive demo: match queries against the files below a directory and
    print up to ten results per query, with the matched characters
    highlighted.

    :param basedir: directory to scan; prompted for when None, with the
        current directory as the default answer
    :param query: first query to run; prompted for when None. Later queries
        are always prompted for. An empty answer, EOF or Ctrl-C ends the loop.
    '''
    from calibre import prints
    from calibre.utils.terminal import ColoredStream
    if basedir is None:
        cwd = os.getcwd()
        try:
            answer = input_unicode('Enter directory to scan [%s]: ' % cwd)
        except (EOFError, KeyboardInterrupt):
            return
        basedir = answer.strip() or cwd
    m = FilesystemMatcher(basedir)
    emph = ColoredStream(sys.stdout, fg='red', bold=True)
    while True:
        if query is None:
            try:
                query = input_unicode('Enter query: ')
            except (EOFError, KeyboardInterrupt):
                break
            if not query:
                break
        for path, positions in islice(iteritems(m(query)), 0, 10):
            cursor = 0  # index in path of the first character not yet printed
            for pos in positions:
                if pos == -1:
                    continue
                # Plain text up to the match, then the match itself emphasized
                prints(path[cursor:pos], end='')
                ch = get_char(path, pos)
                with emph:
                    prints(ch, end='')
                cursor = pos + len(ch)
            prints(path[cursor:])
        # Ask for a new query on the next iteration
        query = None


# When run as a script, start the interactive demo. The commented-out lines
# are alternative entry points for development.
if __name__ == '__main__':
    # main(basedir='/t', query='ns')
    # test()
    main()

Zerion Mini Shell 1.0