%PDF- %PDF-
Direktori : /lib/calibre/calibre/spell/ |
Current File : //lib/calibre/calibre/spell/dictionary.py |
#!/usr/bin/env python3
# License: GPLv3 Copyright: 2014, Kovid Goyal <kovid at kovidgoyal.net>

import glob
import os
import re
import shutil
import sys
from calibre_extensions import hunspell
from collections import defaultdict, namedtuple
from functools import partial
from itertools import chain

from calibre import prints
from calibre.constants import config_dir, filesystem_encoding, iswindows
from calibre.spell import parse_lang_code
from calibre.utils.config import JSONConfig
from calibre.utils.icu import capitalize
from calibre.utils.localization import get_lang, get_system_locale
from polyglot.builtins import iteritems, itervalues

# Description of a dictionary on disk (paths to its .dic/.aff files).
Dictionary = namedtuple('Dictionary', 'primary_locale locales dicpath affpath builtin name id')
# Same metadata, but carrying the loaded spell-checker object instead of paths.
LoadedDictionary = namedtuple('Dictionary', 'primary_locale locales obj builtin name id')

# Persistent spell-check preferences and their defaults.
dprefs = JSONConfig('dictionaries/prefs.json')
dprefs.defaults['preferred_dictionaries'] = {}
dprefs.defaults['preferred_locales'] = {}
dprefs.defaults['user_dictionaries'] = [{'name':_('Default'), 'is_active':True, 'words':[]}]

# Sentinel used to tell "no cache entry" apart from a cached None.
not_present = object()


class UserDictionary:
    """A named set of ``(word, langcode)`` pairs that can be active or inactive."""

    __slots__ = ('name', 'is_active', 'words')

    def __init__(self, **kwargs):
        """Build the dictionary from its serialized form (see :meth:`serialize`)."""
        self.name = kwargs['name']
        self.is_active = kwargs['is_active']
        pairs = set()
        for text, lang in kwargs['words']:
            pairs.add((text, lang))
        self.words = pairs

    def serialize(self):
        """Return a plain dict with the keys ``name``, ``is_active`` and ``words``."""
        data = {'name': self.name, 'is_active': self.is_active}
        data['words'] = [(text, lang) for text, lang in self.words]
        return data


# Lazily populated caches of the builtin and user-installed dictionaries.
_builtins = _custom = None


def builtin_dictionaries():
    """Return the frozenset of builtin dictionaries, scanning the disk only once.

    Every ``*/locales`` file below the builtin ``dictionaries`` resource folder
    describes one dictionary: its first non-empty line is the primary locale
    (which also names the ``.dic``/``.aff`` files) and all non-empty lines
    together are the locales the dictionary covers.
    """
    global _builtins
    if _builtins is not None:
        return _builtins

    pattern = os.path.join(P('dictionaries', allow_user_override=False), '*/locales')
    found = []
    for locales_file in glob.glob(pattern):
        with open(locales_file, 'rb') as stream:
            raw = stream.read().decode('utf-8')
        locales = [line for line in raw.splitlines() if line]
        primary = locales[0]
        folder = os.path.dirname(locales_file)
        found.append(Dictionary(
            primary_locale=parse_lang_code(primary),
            locales=frozenset(map(parse_lang_code, locales)),
            dicpath=os.path.join(folder, '%s.dic' % primary),
            affpath=os.path.join(folder, '%s.aff' % primary),
            builtin=True,
            name=None,
            id=None,
        ))
    _builtins = frozenset(found)
    return _builtins
def custom_dictionaries(reread=False):
    """Return the frozenset of user-installed dictionaries.

    Scans ``<config_dir>/dictionaries/*/locales``. In each file the first
    non-empty line is the dictionary's display name, the second is its primary
    locale and the remaining lines are additional locales. Files with fewer
    than two non-empty lines, or whose primary locale has no country code, are
    skipped. The result is cached at module level; pass ``reread=True`` to
    rescan.
    """
    global _custom
    if _custom is None or reread:
        dics = []
        for lc in glob.glob(os.path.join(config_dir, 'dictionaries', '*/locales')):
            with open(lc, 'rb') as cdf:
                locales = list(filter(None, cdf.read().decode('utf-8').splitlines()))
            try:
                name, locale, locales = locales[0], locales[1], locales[1:]
            except IndexError:
                continue
            base = os.path.dirname(lc)
            ploc = parse_lang_code(locale)
            if ploc.countrycode is None:
                continue
            dics.append(Dictionary(
                ploc, frozenset(filter(lambda x:x.countrycode is not None, map(parse_lang_code, locales))), os.path.join(base, '%s.dic' % locale),
                os.path.join(base, '%s.aff' % locale), False, name, os.path.basename(base)))
        _custom = frozenset(dics)
    return _custom


# Use the system locale to pick the English variant when it is one of the
# listed countries; otherwise fall back to en-US.
default_en_locale = 'en-US'
try:
    ul = parse_lang_code(get_system_locale() or 'en-US')
except ValueError:
    ul = None
if ul is not None and ul.langcode == 'eng' and ul.countrycode in 'GB BS BZ GH IE IN JM NZ TT'.split():
    default_en_locale = 'en-' + ul.countrycode
default_preferred_locales = {'eng':default_en_locale, 'deu':'de-DE', 'spa':'es-ES', 'fra':'fr-FR'}


def best_locale_for_language(langcode):
    """Return the preferred parsed locale for ``langcode``, or None if there is none.

    The user's ``preferred_locales`` preference wins over the builtin defaults.
    """
    best_locale = dprefs['preferred_locales'].get(langcode, default_preferred_locales.get(langcode, None))
    if best_locale is not None:
        return parse_lang_code(best_locale)


def preferred_dictionary(locale):
    """Return the id of the dictionary the user prefers for ``locale``, or None."""
    return {parse_lang_code(k):v for k, v in iteritems(dprefs['preferred_dictionaries'])}.get(locale, None)


def remove_dictionary(dictionary):
    """Delete a user-installed dictionary from disk and drop preferences pointing at it.

    Raises:
        ValueError: if ``dictionary`` is a builtin dictionary.
    """
    if dictionary.builtin:
        raise ValueError('Cannot remove builtin dictionaries')
    base = os.path.dirname(dictionary.dicpath)
    shutil.rmtree(base)
    dprefs['preferred_dictionaries'] = {k:v for k, v in iteritems(dprefs['preferred_dictionaries']) if v != dictionary.id}


def rename_dictionary(dictionary, name):
    """Rewrite the first line of the dictionary's ``locales`` file to ``name``.

    The cache of user-installed dictionaries is refreshed afterwards.
    """
    lf = os.path.join(os.path.dirname(dictionary.dicpath), 'locales')
    with open(lf, 'r+b') as f:
        lines = f.read().splitlines()
        lines[:1] = [name.encode('utf-8')]
        f.seek(0)
        f.truncate()
        f.write(b'\n'.join(lines))
    custom_dictionaries(reread=True)


def get_dictionary(locale, exact_match=False):
    """Find the best dictionary for ``locale``.

    Exact locale matches are tried first. Unless ``exact_match`` is true, the
    search then falls back to the preferred locale for the language and
    finally to any dictionary for that language. Returns None when nothing
    matches.
    """
    preferred = preferred_dictionary(locale)
    # First find all dictionaries that match locale exactly
    exact_matches = {}
    for collection in (custom_dictionaries(), builtin_dictionaries()):
        for d in collection:
            if d.primary_locale == locale:
                exact_matches[d.id] = d
        for d in collection:
            for q in d.locales:
                if q == locale and d.id not in exact_matches:
                    exact_matches[d.id] = d

    # If the user has specified a preferred dictionary for this locale, use it,
    # otherwise, if a builtin dictionary exists, use that
    if preferred in exact_matches:
        return exact_matches[preferred]
    # Return one of the exactly matching dictionaries, preferring user
    # installed to builtin ones
    for k in sorted(exact_matches, key=lambda x: (1, None) if x is None else (0, x)):
        return exact_matches[k]

    if exact_match:
        return

    # No dictionary matched the locale exactly, we will now fallback to
    # matching only on language. First see if a dictionary matching the
    # preferred locale for the language exists.
    best_locale = best_locale_for_language(locale.langcode)
    if best_locale is not None:
        ans = get_dictionary(best_locale, exact_match=True)
        if ans is not None:
            return ans

    # Now just return any dictionary that matches the language, preferring user
    # installed ones to builtin ones
    for collection in (custom_dictionaries(), builtin_dictionaries()):
        for d in sorted(collection, key=lambda d: d.name or ''):
            if d.primary_locale.langcode == locale.langcode:
                return d


def load_dictionary(dictionary):
    """Load the hunspell object for ``dictionary`` and return a LoadedDictionary."""

    def fix_path(path):
        # Decode bytes paths and make them absolute. On Windows the path also
        # gets the \\?\ prefix (presumably so long paths can be opened).
        if isinstance(path, bytes):
            path = path.decode(filesystem_encoding)
        path = os.path.abspath(path)
        if iswindows:
            path = fr'\\?\{path}'
        return path

    obj = hunspell.Dictionary(fix_path(dictionary.dicpath), fix_path(dictionary.affpath))
    return LoadedDictionary(dictionary.primary_locale, dictionary.locales, obj, dictionary.builtin, dictionary.name, dictionary.id)


class Dictionaries:
    """Spell-checking front end combining hunspell dictionaries, user
    dictionaries and a per-session set of ignored words.

    :meth:`initialize` must be called before any method that consults the
    user dictionaries.
    """

    def __init__(self):
        self.remove_hyphenation = re.compile('[\u2010-]+')
        self.negative_pat = re.compile(r'-[.\d+]')
        self.fix_punctuation_pat = re.compile(r'''[:.]''')
        self.dictionaries = {}  # locale -> LoadedDictionary or None
        self.word_cache = {}  # (word, locale) -> bool
        self.ignored_words = set()  # (word, langcode) pairs
        self.added_user_words = {}
        try:
            self.default_locale = parse_lang_code(get_lang())
        except ValueError:
            self.default_locale = parse_lang_code('en-US')
        self.ui_locale = self.default_locale

    def initialize(self, force=False):
        """Read the user dictionaries from preferences, once unless ``force`` is true."""
        if force or not hasattr(self, 'active_user_dictionaries'):
            self.read_user_dictionaries()

    def clear_caches(self):
        """Forget all loaded dictionaries and cached recognition results."""
        self.dictionaries.clear()
        self.word_cache.clear()

    def clear_ignored(self):
        """Forget all ignored words."""
        self.ignored_words.clear()

    def dictionary_for_locale(self, locale):
        """Return the loaded dictionary for ``locale`` (or None), loading and caching it on first use.

        Words from the active user dictionaries with a matching language code
        are added to a freshly loaded dictionary.
        """
        ans = self.dictionaries.get(locale, not_present)
        if ans is not_present:
            ans = get_dictionary(locale)
            if ans is not None:
                ans = load_dictionary(ans)
                for ud in self.active_user_dictionaries:
                    for word, langcode in ud.words:
                        if langcode == locale.langcode:
                            try:
                                ans.obj.add(word)
                            except Exception:
                                # not critical since all it means is that the word won't show up in suggestions
                                prints(f'Failed to add the word {word!r} to the dictionary for {locale}', file=sys.stderr)
            # A None result is cached too, so a failed lookup is not repeated.
            self.dictionaries[locale] = ans
        return ans

    def ignore_word(self, word, locale):
        """Treat ``word`` as correctly spelled for the language of ``locale``."""
        self.ignored_words.add((word, locale.langcode))
        self.word_cache[(word, locale)] = True

    def unignore_word(self, word, locale):
        """Undo :meth:`ignore_word` and drop the cached result for the word."""
        self.ignored_words.discard((word, locale.langcode))
        self.word_cache.pop((word, locale), None)

    def is_word_ignored(self, word, locale):
        """Return True if ``word`` is ignored for the language of ``locale``."""
        return (word, locale.langcode) in self.ignored_words

    @property
    def all_user_dictionaries(self):
        """Iterator over the active, then the inactive, user dictionaries."""
        return chain(self.active_user_dictionaries, self.inactive_user_dictionaries)

    def user_dictionary(self, name):
        """Return the user dictionary called ``name``, or None if there is none."""
        for ud in self.all_user_dictionaries:
            if ud.name == name:
                return ud

    def read_user_dictionaries(self):
        """(Re)load the user dictionaries from preferences, split by active state."""
        self.active_user_dictionaries = []
        self.inactive_user_dictionaries = []
        for d in dprefs['user_dictionaries'] or dprefs.defaults['user_dictionaries']:
            d = UserDictionary(**d)
            (self.active_user_dictionaries if d.is_active else self.inactive_user_dictionaries).append(d)

    def mark_user_dictionary_as_active(self, name, is_active=True):
        """Set the active flag of the named dictionary and save; return False if it is unknown."""
        d = self.user_dictionary(name)
        if d is not None:
            d.is_active = is_active
            self.save_user_dictionaries()
            return True
        return False

    def save_user_dictionaries(self):
        """Write all user dictionaries back to preferences."""
        dprefs['user_dictionaries'] = [d.serialize() for d in self.all_user_dictionaries]

    def add_user_words(self, words, langcode):
        """Add plain ``words`` to every loaded dictionary whose primary language is ``langcode``."""
        for d in itervalues(self.dictionaries):
            if d and getattr(d.primary_locale, 'langcode', None) == langcode:
                for word in words:
                    d.obj.add(word)

    def remove_user_words(self, words, langcode):
        """Remove plain ``words`` from every loaded dictionary whose primary language is ``langcode``."""
        for d in itervalues(self.dictionaries):
            if d and d.primary_locale.langcode == langcode:
                for word in words:
                    d.obj.remove(word)

    def add_to_user_dictionary(self, name, word, locale):
        """Add a word, or a set of ``(word, langcode)`` pairs, to the named user dictionary.

        Returns True if the dictionary grew (it is then saved), else False.

        Raises:
            ValueError: if no user dictionary called ``name`` exists.
        """
        ud = self.user_dictionary(name)
        if ud is None:
            raise ValueError('Cannot add to the dictionary named: %s as no such dictionary exists' % name)
        wl = len(ud.words)
        is_bulk = isinstance(word, (set, frozenset))
        if is_bulk:
            ud.words |= word
            # The set holds (word, langcode) pairs like ud.words does, but the
            # loaded spell-checker objects are fed plain words everywhere else.
            self.add_user_words([entry[0] for entry in word], locale.langcode)
        else:
            ud.words.add((word, locale.langcode))
            self.add_user_words((word,), locale.langcode)
        if len(ud.words) > wl:
            self.save_user_dictionaries()
            # Drop stale results so the new words are re-evaluated.
            if is_bulk:
                for entry in word:
                    self.word_cache.pop((entry[0], locale), None)
            else:
                self.word_cache.pop((word, locale), None)
            return True
        return False

    def remove_from_user_dictionaries(self, word, locale):
        """Remove ``word`` from every active user dictionary; return True if any contained it."""
        key = (word, locale.langcode)
        changed = False
        for ud in self.active_user_dictionaries:
            if key in ud.words:
                changed = True
                ud.words.discard(key)
        if changed:
            self.word_cache.pop((word, locale), None)
            self.save_user_dictionaries()
            self.remove_user_words((word,), locale.langcode)
        return changed

    def remove_from_user_dictionary(self, name, words):
        """Remove ``(word, locale)`` pairs from the named user dictionary; return True if any were removed."""
        changed = False
        removals = defaultdict(set)  # langcode -> words actually removed
        keys = [(w, l.langcode) for w, l in words]
        for d in self.all_user_dictionaries:
            if d.name == name:
                for key in keys:
                    if key in d.words:
                        d.words.discard(key)
                        removals[key[1]].add(key[0])
                        changed = True
        if changed:
            for key in words:
                self.word_cache.pop(key, None)
            for langcode, removed in iteritems(removals):
                self.remove_user_words(removed, langcode)
            self.save_user_dictionaries()
        return changed

    def word_in_user_dictionary(self, word, locale):
        """Return the name of the first active user dictionary containing ``word``, or None."""
        key = (word, locale.langcode)
        for ud in self.active_user_dictionaries:
            if key in ud.words:
                return ud.name

    def create_user_dictionary(self, name):
        """Create and save a new, empty, active user dictionary.

        Raises:
            ValueError: if a user dictionary called ``name`` already exists.
        """
        if name in {d.name for d in self.all_user_dictionaries}:
            raise ValueError('A dictionary named %s already exists' % name)
        d = UserDictionary(name=name, is_active=True, words=())
        self.active_user_dictionaries.append(d)
        self.save_user_dictionaries()

    def remove_user_dictionary(self, name):
        """Delete the named user dictionary; return True if one was removed."""
        changed = False
        for x in (self.active_user_dictionaries, self.inactive_user_dictionaries):
            # Iterate over a copy because the list is mutated in the loop.
            for d in tuple(x):
                if d.name == name:
                    x.remove(d)
                    changed = True
        if changed:
            self.save_user_dictionaries()
            self.clear_caches()
        return changed

    def rename_user_dictionary(self, name, new_name):
        """Rename the named user dictionary; return True if one was renamed."""
        changed = False
        for d in self.all_user_dictionaries:
            if d.name == name:
                d.name = new_name
                changed = True
        if changed:
            self.save_user_dictionaries()
        return changed

    def recognized(self, word, locale=None):
        """Return True if ``word`` counts as correctly spelled for ``locale``.

        A word is accepted if it is ignored, is in an active user dictionary,
        is recognized by the locale's dictionary, or matches ``negative_pat``.
        When no dictionary exists for the locale every word is accepted.
        Results are cached per ``(word, locale)``.
        """
        locale = locale or self.default_locale
        key = (word, locale)
        ans = self.word_cache.get(key, None)
        if ans is None:
            lkey = (word, locale.langcode)
            ans = False
            if lkey in self.ignored_words:
                ans = True
            else:
                for ud in self.active_user_dictionaries:
                    if lkey in ud.words:
                        ans = True
                        break
                else:
                    d = self.dictionary_for_locale(locale)
                    if d is not None:
                        try:
                            ans = d.obj.recognized(word.replace('\u2010', '-'))
                        except ValueError:
                            pass
                    else:
                        ans = True
            if ans is False and self.negative_pat.match(word) is not None:
                ans = True
            self.word_cache[key] = ans
        return ans

    def suggestions(self, word, locale=None):
        """Return a sequence of spelling suggestions for ``word`` in ``locale``.

        Besides the dictionary's own suggestions, the de-hyphenated word is put
        first when it is recognized; otherwise, when the word is two recognized
        words joined by ``:`` or ``.``, a variant with a space after the
        punctuation is suggested. Returns an empty tuple when the locale has no
        dictionary.
        """
        locale = locale or self.default_locale
        d = self.dictionary_for_locale(locale)
        has_unicode_hyphen = '\u2010' in word
        ans = ()

        def add_suggestion(w, ans):
            # Put w first, removing any later duplicate of it.
            return (w,) + tuple(x for x in ans if x != w)

        if d is not None:
            try:
                ans = d.obj.suggest(str(word).replace('\u2010', '-'))
            except ValueError:
                pass
            else:
                dehyphenated_word = self.remove_hyphenation.sub('', word)
                if len(dehyphenated_word) != len(word) and self.recognized(dehyphenated_word, locale):
                    # Ensure the de-hyphenated word is present and is the first suggestion
                    ans = add_suggestion(dehyphenated_word, ans)
                else:
                    m = self.fix_punctuation_pat.search(word)
                    if m is not None:
                        w1, w2 = word[:m.start()], word[m.end():]
                        # Check both halves against the requested locale, not
                        # the default one.
                        if self.recognized(w1, locale) and self.recognized(w2, locale):
                            fw = w1 + m.group() + ' ' + w2
                            ans = add_suggestion(fw, ans)
                            if capitalize(w2) != w2:
                                fw = w1 + m.group() + ' ' + capitalize(w2)
                                ans = add_suggestion(fw, ans)
        if has_unicode_hyphen:
            # Hand back the same hyphen character the caller used.
            ans = tuple(w.replace('-', '\u2010') for w in ans)
        return ans


def build_test():
    """Smoke test: the English dictionary must recognize the word 'recognized'."""
    dictionaries = Dictionaries()
    dictionaries.initialize()
    eng = parse_lang_code('en')
    if not dictionaries.recognized('recognized', locale=eng):
        raise AssertionError('The word recognized was not recognized')


def find_tests():
    """Return a unittest suite exercising recognition and suggestions."""
    import unittest

    class TestDictionaries(unittest.TestCase):

        def setUp(self):
            dictionaries = Dictionaries()
            dictionaries.initialize()
            eng = parse_lang_code('en-GB')
            self.recognized = partial(dictionaries.recognized, locale=eng)
            self.suggestions = partial(dictionaries.suggestions, locale=eng)

        def ar(self, w):
            if not self.recognized(w):
                raise AssertionError('The word %r was not recognized' % w)

        def test_dictionaries(self):
            for w in 'recognized one-half one\u2010half'.split():
                self.ar(w)
            d = load_dictionary(get_dictionary(parse_lang_code('es-ES'))).obj
            self.assertTrue(d.recognized('Ahí'))
            self.assertIn('one\u2010half', self.suggestions('oone\u2010half'))
            self.assertIn('adequately', self.suggestions('ade-quately'))
            self.assertIn('magic. Wand', self.suggestions('magic.wand'))
            self.assertIn('List', self.suggestions('Lis𝑘t'))

    return unittest.TestLoader().loadTestsFromTestCase(TestDictionaries)