%PDF- %PDF-
Direktori : /usr/lib/calibre/calibre/ebooks/metadata/ |
Current File : //usr/lib/calibre/calibre/ebooks/metadata/author_mapper.py |
#!/usr/bin/env python3 # License: GPLv3 Copyright: 2015, Kovid Goyal <kovid at kovidgoyal.net> import re from collections import deque from calibre.utils.icu import capitalize, lower, upper def cap_author_token(token): lt = lower(token) if lt in ('von', 'de', 'el', 'van', 'le'): return lt # no digits no spez. characters if re.match(r'([^\d\W]\.){2,}$', lt, re.UNICODE) is not None: # Normalize tokens of the form J.K. to J. K. parts = token.split('.') return '. '.join(map(capitalize, parts)).strip() scots_name = None for x in ('mc', 'mac'): if (token.lower().startswith(x) and len(token) > len(x) and ( token[len(x)] == upper(token[len(x)]) or lt == token )): scots_name = len(x) break ans = capitalize(token) if scots_name is not None: ans = ans[:scots_name] + upper(ans[scots_name]) + ans[scots_name+1:] for x in ('-', "'"): idx = ans.find(x) if idx > -1 and len(ans) > idx+2: ans = ans[:idx+1] + upper(ans[idx+1]) + ans[idx+2:] return ans def compile_pat(pat): import regex REGEX_FLAGS = regex.VERSION1 | regex.WORD | regex.FULLCASE | regex.IGNORECASE | regex.UNICODE return regex.compile(pat, flags=REGEX_FLAGS) def matcher(rule): mt = rule['match_type'] if mt == 'one_of': authors = {icu_lower(x.strip()) for x in rule['query'].split('&')} return lambda x: x in authors if mt == 'not_one_of': authors = {icu_lower(x.strip()) for x in rule['query'].split('&')} return lambda x: x not in authors if mt == 'matches': pat = compile_pat(rule['query']) return lambda x: pat.match(x) is not None if mt == 'not_matches': pat = compile_pat(rule['query']) return lambda x: pat.match(x) is None if mt == 'has': s = icu_lower(rule['query']) return lambda x: s in x return lambda x: False def apply_rules(author, rules): ans = [] authors = deque() authors.append(author) maxiter = 20 while authors and maxiter > 0: author = authors.popleft() lauthor = icu_lower(author) maxiter -= 1 for rule, matches in rules: ac = rule['action'] if matches(lauthor): if ac == 'replace': if 'matches' in rule['match_type']: author = compile_pat(rule['query']).sub(rule['replace'], author) else: author = rule['replace'] if '&' in author: replacement_authors = [] self_added = False for rauthor in (x.strip() for x in author.split('&')): if icu_lower(rauthor) == lauthor: if not self_added: ans.append(rauthor) self_added = True else: replacement_authors.append(rauthor) authors.extendleft(reversed(replacement_authors)) else: if icu_lower(author) == lauthor: # Case change or self replacement ans.append(author) break authors.appendleft(author) break if ac == 'capitalize': ans.append(' '.join(map(cap_author_token, author.split()))) break if ac == 'lower': ans.append(icu_lower(author)) break if ac == 'upper': ans.append(icu_upper(author)) break else: # no rule matched, default keep ans.append(author) ans.extend(authors) return ans def uniq(vals, kmap=icu_lower): ''' Remove all duplicates from vals, while preserving order. kmap must be a callable that returns a hashable value for every item in vals ''' vals = vals or () lvals = (kmap(x) for x in vals) seen = set() seen_add = seen.add return list(x for x, k in zip(vals, lvals) if k not in seen and not seen_add(k)) def compile_rules(rules): return tuple((r, matcher(r)) for r in rules) def map_authors(authors, rules=()): if not authors: return [] if not rules: return list(authors) ans = [] for a in authors: ans.extend(apply_rules(a, rules)) return uniq(list(filter(None, ans))) def find_tests(): import unittest class TestAuthorMapper(unittest.TestCase): def test_author_mapper(self): def rule(action, query, replace=None, match_type='one_of'): ans = {'action':action, 'query': query, 'match_type':match_type} if replace is not None: ans['replace'] = replace return ans def run(rules, authors, expected): if isinstance(rules, dict): rules = [rules] if isinstance(authors, str): authors = [x.strip() for x in authors.split('&')] if isinstance(expected, str): expected = [x.strip() for x in expected.split('&')] ans = map_authors(authors, compile_rules(rules)) self.assertEqual(ans, expected) run(rule('capitalize', 't1&t2'), 't1&x1', 'T1&x1') run(rule('upper', 'ta&t2'), 'ta&x1', 'TA&x1') run(rule('lower', 'ta&x1'), 'TA&X1', 'ta&x1') run(rule('replace', 't1', 't2'), 't1&x1', 't2&x1') run(rule('replace', '(.)1', r'\g<1>2', 'matches'), 't1&x1', 't2&x2') run(rule('replace', '(.)1', r'\g<1>2&3', 'matches'), 't1&x1', 't2&3&x2') run(rule('replace', 't1', 't2 & t3'), 't1&x1', 't2&t3&x1') run(rule('replace', 't1', 't1'), 't1&x1', 't1&x1') run([rule('replace', 't1', 't2'), rule('replace', 't2', 't1')], 't1&t2', 't1&t2') run(rule('replace', 'a', 'A'), 'a&b', 'A&b') run(rule('replace', 'a&b', 'A&B'), 'a&b', 'A&B') run(rule('replace', 'L', 'T', 'has'), 'L', 'T') return unittest.defaultTestLoader.loadTestsFromTestCase(TestAuthorMapper) if __name__ == '__main__': from calibre.utils.run_tests import run_cli run_cli(find_tests())