%PDF- %PDF-
Direktori : /lib/calibre/calibre/utils/ |
Current File : //lib/calibre/calibre/utils/matcher.py |
#!/usr/bin/env python3
'''
Fuzzy sub-sequence matching of a query string against a collection of items.

Scoring is done either by the compiled ``calibre_extensions.matcher`` extension
(:class:`CScorer`) or by the pure Python implementation (:class:`PyScorer`).
'''

__license__ = 'GPL v3'
__copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'

import atexit, os, sys
from math import ceil
from unicodedata import normalize
from threading import Thread, Lock
from operator import itemgetter
from collections import OrderedDict
from itertools import islice

from calibre import detect_ncpus as cpu_count, as_unicode
from calibre.constants import filesystem_encoding
from calibre.utils.icu import primary_sort_key, primary_find, primary_collator
from polyglot.builtins import iteritems, itervalues
from polyglot.queue import Queue

# Characters that, when they immediately precede a matched character, make
# that match score higher (see calc_score_for_char()).
DEFAULT_LEVEL1 = '/'
DEFAULT_LEVEL2 = '-_ 0123456789'
DEFAULT_LEVEL3 = '.'


class PluginFailed(RuntimeError):
    ''' Raised to signal that the compiled scorer is unavailable. '''
    pass


class Worker(Thread):
    '''
    Daemon thread that takes ``(task_index, scorer, query)`` requests off a
    queue and puts ``(ok, payload)`` tuples on the results queue. A ``None``
    request terminates the thread.
    '''

    daemon = True

    def __init__(self, requests, results):
        Thread.__init__(self)
        self.requests, self.results = requests, results
        # Wake the thread up with the stop sentinel at interpreter exit.
        atexit.register(lambda: requests.put(None))

    def run(self):
        while True:
            x = self.requests.get()
            if x is None:
                break
            try:
                i, scorer, query = x
                # NOTE(review): PyScorer.__call__ and CScorer.__call__ are both
                # generator functions, so scorer(query) only creates a
                # generator here; the scoring itself runs when the consumer
                # iterates it, and errors raised at that point do not go
                # through the (False, message) path below.
                self.results.put((True, (i, scorer(query))))
            except Exception as e:
                self.results.put((False, as_unicode(e)))
                # import traceback
                # traceback.print_exc()


# Guards creation of, and all communication with, the shared worker pool.
wlock = Lock()
workers = []


def split(tasks, pool_size):
    '''
    Split a list into a list of sub lists, with the number of sub lists being
    no more than pool_size. Each sublist contains 2-tuples of the form (i, x)
    where x is an element from the original list and i is the index of the
    element x in the original list.
    '''
    ans, count = [], 0
    delta = int(ceil(len(tasks) / pool_size))
    while tasks:
        section = [(count + i, task) for i, task in enumerate(tasks[:delta])]
        tasks = tasks[delta:]
        count += len(section)
        ans.append(section)
    return ans


def default_scorer(*args, **kwargs):
    '''
    Return a :class:`CScorer` for the given arguments, falling back to
    :class:`PyScorer` when the compiled implementation cannot be loaded.
    '''
    try:
        return CScorer(*args, **kwargs)
    except (PluginFailed, ImportError):
        # CScorer loads the extension with a plain import statement, so a
        # missing extension surfaces as ImportError rather than PluginFailed.
        return PyScorer(*args, **kwargs)


class Matcher:
    '''
    Match a query against a fixed collection of items.

    :param items: Iterable of items. Falsy items are dropped, the rest are
        converted to NFC normalized strings.
    :param level1, level2, level3: Characters that boost the score of a match
        that immediately follows them. Passed to the scorer when they differ
        from the module defaults.
    :param scorer: Callable that builds a scorer from a tuple of items.
        Defaults to :func:`default_scorer`.
    '''

    def __init__(
        self,
        items,
        level1=DEFAULT_LEVEL1,
        level2=DEFAULT_LEVEL2,
        level3=DEFAULT_LEVEL3,
        scorer=None
    ):
        with wlock:
            if not workers:
                requests, results = Queue(), Queue()
                w = [Worker(requests, results) for i in range(max(1, cpu_count()))]
                for x in w:
                    x.start()
                workers.extend(w)
        self.items = items = tuple(
            normalize('NFC', str(x)) for x in items if x)
        tasks = split(items, len(workers))
        # For every task: position inside the task -> index into self.items
        self.task_maps = [{j: i for j, (i, _) in enumerate(task)} for task in tasks]
        scorer = scorer or default_scorer
        # Only forward the levels that were actually customised, so that a
        # custom scorer factory accepting nothing but the items keeps working
        # when the defaults are in use.
        level_kwargs = {
            name: value
            for name, value, default in (
                ('level1', level1, DEFAULT_LEVEL1),
                ('level2', level2, DEFAULT_LEVEL2),
                ('level3', level3, DEFAULT_LEVEL3),
            )
            if value != default
        }
        self.scorers = [
            scorer(tuple(map(itemgetter(1), task_items)), **level_kwargs)
            for task_items in tasks
        ]
        self.sort_keys = None

    def __call__(self, query, limit=None):
        '''
        Score all items against ``query``.

        :param query: The text to search for, converted to an NFC string.
        :param limit: If not None, only the ``limit`` best scored items are
            considered.
        :return: An OrderedDict mapping item to the positions reported by the
            scorer, best score first. Items with a score of zero are left out.
        :raises Exception: If a worker reported a failure.
        '''
        query = normalize('NFC', str(query))
        # The request/result queues are shared by every Matcher, so the lock
        # is held from submitting the requests until all results are read.
        with wlock:
            for i, scorer in enumerate(self.scorers):
                workers[0].requests.put((i, scorer, query))
            if self.sort_keys is None:
                # NOTE(review): populated here but not read anywhere in this
                # file; kept because it is a public attribute.
                self.sort_keys = {
                    i: primary_sort_key(x)
                    for i, x in enumerate(self.items)
                }
            num = len(self.task_maps)
            scores, positions = {}, {}
            error = None
            while num > 0:
                ok, x = workers[0].results.get()
                num -= 1
                if ok:
                    task_num, vals = x
                    task_map = self.task_maps[task_num]
                    for i, (score, pos) in enumerate(vals):
                        item = task_map[i]
                        scores[item] = score
                        positions[item] = pos
                else:
                    error = x

        if error is not None:
            raise Exception('Failed to score items: %s' % error)
        # Negated scores make an ascending sort yield the best match first
        items = sorted(
            ((-scores[i], item, positions[i]) for i, item in enumerate(self.items)),
            key=itemgetter(0))
        if limit is not None:
            del items[limit:]
        return OrderedDict(x[1:] for x in filter(itemgetter(0), items))


def get_items_from_dir(basedir, acceptq=lambda x: True):
    '''
    Yield the paths of all files under ``basedir``, relative to ``basedir`` and
    using ``/`` as the separator.

    :param basedir: Directory to walk, bytes are decoded using the filesystem
        encoding.
    :param acceptq: Called with the joined path of every file, only files for
        which it returns a true value are yielded.
    '''
    if isinstance(basedir, bytes):
        basedir = basedir.decode(filesystem_encoding)
    relsep = os.sep != '/'
    for dirpath, dirnames, filenames in os.walk(basedir):
        for f in filenames:
            x = os.path.join(dirpath, f)
            if acceptq(x):
                x = os.path.relpath(x, basedir)
                if relsep:
                    x = x.replace(os.sep, '/')
                yield x


class FilesystemMatcher(Matcher):
    ''' A :class:`Matcher` whose items are the files found under ``basedir``. '''

    def __init__(self, basedir, *args, **kwargs):
        Matcher.__init__(self, get_items_from_dir(basedir), *args, **kwargs)

# Python implementation of the scoring algorithm {{{


def calc_score_for_char(ctx, prev, current, distance):
    '''
    Return the score for a matched character that is more than one position
    away from the previous match.

    :param ctx: Object providing level1, level2, level3 and max_score_per_char.
    :param prev: The character immediately before the matched character.
    :param current: The matched character.
    :param distance: Distance between this match and the previous one.
    '''
    # icu_lower/icu_upper are not imported in this file; presumably they are
    # made available as builtins elsewhere in calibre - TODO confirm.
    ans = ctx.max_score_per_char
    if prev in ctx.level1:
        factor = 0.9
    elif prev in ctx.level2 or (
        icu_lower(prev) == prev and icu_upper(current) == current
    ):
        factor = 0.8
    elif prev in ctx.level3:
        factor = 0.7
    else:
        factor = (1.0 / distance) * 0.75
    return ans * factor


def process_item(ctx, haystack, needle):
    '''
    Find the best scoring way of matching the characters of ``needle``, in
    order, inside ``haystack``.

    :param ctx: Object providing memory (a dict used for memoization),
        max_score_per_char and the level strings.
    :return: ``(score, positions)``; the score is 0 when there is no match.
    '''
    # non-recursive implementation using a stack
    stack = [(0, 0, 0, 0, [-1] * len(needle))]
    final_score, final_positions = stack[0][-2:]
    push, pop = stack.append, stack.pop
    while stack:
        hidx, nidx, last_idx, score, positions = pop()
        key = (hidx, nidx, last_idx)
        mem = ctx.memory.get(key, None)
        if mem is None:
            for i in range(nidx, len(needle)):
                n = needle[i]
                # Not enough haystack left for the rest of the needle
                if (len(haystack) - hidx < len(needle) - i):
                    score = 0
                    break
                pos = primary_find(n, haystack[hidx:])[0]
                if pos == -1:
                    score = 0
                    break
                pos += hidx

                distance = pos - last_idx
                score_for_char = ctx.max_score_per_char if distance <= 1 else calc_score_for_char(
                    ctx, haystack[pos - 1], haystack[pos], distance
                )
                hidx = pos + 1
                # Revisit this needle character later, starting the search
                # after the occurrence that was just used.
                push((hidx, i, last_idx, score, list(positions)))
                last_idx = positions[i] = pos
                score += score_for_char
            ctx.memory[key] = (score, positions)
        else:
            score, positions = mem
        if score > final_score:
            final_score = score
            final_positions = positions
    return final_score, final_positions


class PyScorer:
    '''
    Pure Python scorer. Calling it with a query yields one
    ``(score, positions)`` pair per item, in item order.
    '''

    __slots__ = (
        'level1', 'level2', 'level3', 'max_score_per_char', 'items', 'memory'
    )

    def __init__(
        self,
        items,
        level1=DEFAULT_LEVEL1,
        level2=DEFAULT_LEVEL2,
        level3=DEFAULT_LEVEL3
    ):
        self.level1, self.level2, self.level3 = level1, level2, level3
        self.max_score_per_char = 0
        self.items = items

    def __call__(self, needle):
        needle_len = len(needle)
        for item in self.items:
            if not needle_len or not item:
                # Nothing can match, and the formula below would divide by
                # zero. This is the result process_item() gives for a miss.
                yield 0, [-1] * needle_len
                continue
            self.max_score_per_char = (1.0 / len(item) + 1.0 / needle_len) / 2.0
            # The memoization cache is only valid for a single item
            self.memory = {}
            yield process_item(self, item, needle)

# }}}


class CScorer:
    '''
    Scorer backed by ``calibre_extensions.matcher``. Calling it with a query
    yields one ``(score, positions)`` pair per item.
    '''

    def __init__(
        self,
        items,
        level1=DEFAULT_LEVEL1,
        level2=DEFAULT_LEVEL2,
        level3=DEFAULT_LEVEL3
    ):
        from calibre_extensions.matcher import Matcher
        self.m = Matcher(
            items,
            primary_collator().capsule, str(level1), str(level2), str(level3)
        )

    def __call__(self, query):
        scores, positions = self.m.calculate_scores(query)
        yield from zip(scores, positions)


def test(return_tests=False):
    '''
    Run the tests for this module, or return them as a test suite when
    ``return_tests`` is true.
    '''
    is_sanitized = 'libasan' in os.environ.get('LD_PRELOAD', '')
    import unittest

    class Test(unittest.TestCase):

        @unittest.skipIf(is_sanitized, 'Sanitizer enabled can\'t check for leaks')
        def test_mem_leaks(self):
            import gc
            from calibre.utils.mem import get_memory as memory
            m = Matcher(['a'], scorer=CScorer)
            m('a')

            def doit(c):
                m = Matcher([
                    c + 'im/one.gif',
                    c + 'im/two.gif',
                    c + 'text/one.html',
                ], scorer=CScorer)
                m('one')

            start = memory()
            for i in range(10):
                doit(str(i))
            gc.collect()
            used10 = memory() - start
            start = memory()
            for i in range(100):
                doit(str(i))
            gc.collect()
            used100 = memory() - start
            if used100 > 0 and used10 > 0:
                self.assertLessEqual(used100, 2 * used10)

        def test_non_bmp(self):
            raw = '_\U0001f431-'
            m = Matcher([raw], scorer=CScorer)
            positions = next(itervalues(m(raw)))
            self.assertEqual(
                positions, (0, 1, 2)
            )

    if return_tests:
        return unittest.TestLoader().loadTestsFromTestCase(Test)

    class TestRunner(unittest.main):

        def createTests(self):
            tl = unittest.TestLoader()
            self.test = tl.loadTestsFromTestCase(Test)

    TestRunner(verbosity=4)


def get_char(string, pos):
    ''' Return the character of ``string`` at index ``pos``. '''
    return string[pos]


def input_unicode(prompt):
    ''' Read a line from the user, decoding it if input() returned bytes. '''
    ans = input(prompt)
    if isinstance(ans, bytes):
        ans = ans.decode(sys.stdin.encoding)
    return ans


def main(basedir=None, query=None):
    '''
    Interactive demo: repeatedly ask for a query and print the ten best
    matching files under ``basedir`` with the matched characters highlighted.

    :param basedir: Directory to scan, the user is asked for it when None.
    :param query: Optional first query, used instead of prompting.
    '''
    from calibre import prints
    from calibre.utils.terminal import ColoredStream
    if basedir is None:
        try:
            basedir = input_unicode(
                'Enter directory to scan [%s]: ' % os.getcwd()
            ).strip() or os.getcwd()
        except (EOFError, KeyboardInterrupt):
            return
    m = FilesystemMatcher(basedir)
    emph = ColoredStream(sys.stdout, fg='red', bold=True)
    while True:
        if query is None:
            try:
                query = input_unicode('Enter query: ')
            except (EOFError, KeyboardInterrupt):
                break
            if not query:
                break
        for path, positions in islice(iteritems(m(query)), 0, 10):
            p = 0
            for pos in positions:
                if pos == -1:
                    continue
                prints(path[p:pos], end='')
                ch = get_char(path, pos)
                with emph:
                    prints(ch, end='')
                p = pos + len(ch)
            prints(path[p:])
        query = None


if __name__ == '__main__':
    # main(basedir='/t', query='ns')
    # test()
    main()