%PDF- %PDF-
Direktori : /lib/calibre/calibre/spell/ |
Current File : //lib/calibre/calibre/spell/import_from.py |
#!/usr/bin/env python3

__license__ = 'GPL v3'
__copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'

import sys, glob, os, tempfile, re, codecs

from lxml import etree

from calibre.constants import config_dir
from calibre.utils.xml_parse import safe_xml_fromstring
from calibre.utils.zipfile import ZipFile
from polyglot.builtins import iteritems

# Namespace prefixes used by the XPath expressions in this module.
NS_MAP = {
    'oor': "http://openoffice.org/2001/registry",
    'xs': "http://www.w3.org/2001/XMLSchema",
    'manifest': 'http://openoffice.org/2001/manifest',
}
# Compile an XPath expression with NS_MAP pre-bound.
XPath = lambda x: etree.XPath(x, namespaces=NS_MAP)
# Locales that import_from_libreoffice_source_tree() must find a dictionary for.
BUILTIN_LOCALES = {'en-US', 'en-GB', 'es-ES'}


def parse_xcu(raw, origin='%origin%'):
    '''
    Get the dictionary and affix file names as well as supported locales for
    each dictionary.

    :param raw: The contents of a dictionaries.xcu file, handed unchanged to
        safe_xml_fromstring().
    :param origin: Text substituted for every ``%origin%`` placeholder found in
        the file locations. The default leaves the placeholder as it is.
    :return: A dict mapping ``(dic_path, aff_path)`` to the list of locale
        names declared for that dictionary.
    :raises IndexError: If a DICT_SPELL entry has no Locations value.
    :raises ValueError: If a DICT_SPELL entry does not list exactly two paths.
    '''
    ans = {}
    root = safe_xml_fromstring(raw)

    # Select every entry whose Format property is DICT_SPELL.
    for node in XPath('//prop[@oor:name="Format"]/value[text()="DICT_SPELL"]/../..')(root):
        value = XPath('descendant::prop[@oor:name="Locations"]/value')(node)
        # len() of an element is its number of child elements
        if len(value[0]) == 0:
            # The value node has no children, use its text
            paths = ''.join(XPath('descendant::prop[@oor:name="Locations"]/value/text()')(node)).replace('%origin%', origin).split()
        else:
            # Use the text of the value nodes children
            paths = [c.text.replace('%origin%', origin) for v in value for c in v.iterchildren('*') if c.text]
        # The two paths can be listed in either order, normalise to (aff, dic)
        aff, dic = paths if paths[0].endswith('.aff') else reversed(paths)
        locales = ''.join(XPath('descendant::prop[@oor:name="Locales"]/value/text()')(node)).split()
        ans[(dic, aff)] = locales
    return ans


def convert_to_utf8(dic_data, aff_data, errors='strict'):
    '''
    Transcode a dictionary/affix pair to UTF-8.

    The encoding is taken from the ``SET <encoding>`` line, which is searched
    for in the first 2048 bytes of ``aff_data``. If that line is missing,
    already names UTF-8, or names an encoding Python does not know, both
    inputs are returned unchanged.

    :param dic_data: Contents of the .dic file, as bytes.
    :param aff_data: Contents of the .aff file, as bytes.
    :param errors: Error handling scheme passed to ``bytes.decode()``.
    :return: The tuple ``(dic_data, aff_data)``, both as bytes.
    '''
    # With re.MULTILINE ``$`` matches only before ``\n``, so a CRLF terminated
    # SET line needs the explicit optional ``\r`` to be recognised at all.
    m = re.search(br'^SET\s+(\S+)[ \t]*\r?$', aff_data[:2048], flags=re.MULTILINE)
    if m is None:
        return dic_data, aff_data
    enc = m.group(1).decode('ascii', 'replace')
    if enc.upper() in ('UTF-8', 'UTF8'):
        return dic_data, aff_data
    try:
        codecs.lookup(enc)
    except LookupError:
        # Unknown encoding, nothing sensible can be done, pass the data through
        return dic_data, aff_data
    # Rewrite the declaration before transcoding. Only the text up to the end
    # of the encoding name is replaced so that the original line terminator
    # (and any trailing blanks) are preserved.
    aff_data = aff_data[:m.start()] + b'SET UTF-8' + aff_data[m.end(1):]
    aff_data = aff_data.decode(enc, errors).encode('utf-8')
    dic_data = dic_data.decode(enc, errors).encode('utf-8')
    return dic_data, aff_data


def import_from_libreoffice_source_tree(source_path):
    '''
    Install the dictionaries for BUILTIN_LOCALES from a LibreOffice
    dictionaries source tree.

    Every ``<source_path>/*/dictionaries.xcu`` file is parsed. For each wanted
    locale the first matching dictionary is converted to UTF-8 and written to
    ``<base>/<locale>/`` as ``<locale>.dic``, ``<locale>.aff`` and a
    ``locales`` file, where ``base`` is whatever ``P('dictionaries', ...)``
    returns.

    :param source_path: Directory whose immediate sub-directories contain the
        dictionaries.xcu files.
    :raises Exception: If no dictionary was found for some wanted locale.
    '''
    dictionaries = {}
    for xcu_path in glob.glob(os.path.join(source_path, '*', 'dictionaries.xcu')):
        origin = os.path.dirname(xcu_path)
        with open(xcu_path, 'rb') as f:
            dictionaries.update(parse_xcu(f.read(), origin))
    # NOTE(review): P is not defined in this module, presumably it is the
    # resource path lookup calibre installs into builtins -- confirm.
    base = P('dictionaries', allow_user_override=False)
    want_locales = set(BUILTIN_LOCALES)

    for (dic, aff), locales in iteritems(dictionaries):
        matched = set(locales) & want_locales
        if not matched:
            continue
        # Only one wanted locale is consumed per dictionary
        locale = tuple(matched)[0]
        want_locales.discard(locale)
        dest = os.path.join(base, locale)
        os.makedirs(dest, exist_ok=True)
        with open(dic, 'rb') as df, open(aff, 'rb') as af:
            dd, ad = convert_to_utf8(df.read(), af.read())
        # Files are renamed to <locale><original extension>
        for src, raw in ((dic, dd), (aff, ad)):
            with open(os.path.join(dest, locale + os.path.splitext(src)[1]), 'wb') as out:
                out.write(raw)
        with open(os.path.join(dest, 'locales'), 'wb') as f:
            # The locale the files are named after goes first, the rest are
            # sorted alphabetically
            locales.sort(key=lambda x: (0, x) if x == locale else (1, x))
            f.write(('\n'.join(locales)).encode('utf-8'))

    if want_locales:
        raise Exception('Failed to find dictionaries for some wanted locales: %s' % want_locales)


def fill_country_code(x):
    ''' Return ``lt_LT`` for the bare locale ``lt``; any other value is
    returned unchanged. '''
    return {'lt':'lt_LT'}.get(x, x)


def uniq(vals, kmap=lambda x:x):
    ''' Remove all duplicates from vals, while preserving order.
    kmap must be a callable that returns a hashable value for every item in vals '''
    vals = vals or ()
    lvals = (kmap(x) for x in vals)
    seen = set()
    seen_add = seen.add
    # seen_add() returns None, so ``not seen_add(k)`` is always True and only
    # serves to record k the first time it is encountered.
    return tuple(x for x, k in zip(vals, lvals) if k not in seen and not seen_add(k))


def import_from_oxt(source_path, name, dest_dir=None, prefix='dic-'):
    '''
    Import the spelling dictionaries contained in an OXT extension file.

    Each dictionary that declares at least one locale with a country code is
    converted to UTF-8 and written to a freshly created directory inside
    ``dest_dir``. The directory gets ``<locale>.dic``, ``<locale>.aff`` (named
    after the first usable locale) and a ``locales`` file holding ``name``
    followed by the usable locales, one per line.

    :param source_path: The OXT (zip) file, passed to ZipFile().
    :param name: Display name written as the first line of the locales file.
    :param dest_dir: Directory to create the dictionary directories in,
        defaults to the ``dictionaries`` folder inside ``config_dir``.
    :param prefix: Prefix for the names of the created directories.
    :return: The number of dictionaries imported.
    '''
    from calibre.spell.dictionary import parse_lang_code
    dest_dir = dest_dir or os.path.join(config_dir, 'dictionaries')
    os.makedirs(dest_dir, exist_ok=True)
    num = 0
    with ZipFile(source_path) as zf:

        def read_file(key):
            try:
                return zf.open(key).read()
            except KeyError:
                # Some dictionaries apparently put the xcu in a sub-directory
                # and incorrectly make paths relative to that directory instead
                # of the root, for example:
                # http://extensions.libreoffice.org/extension-center/italian-dictionary-thesaurus-hyphenation-patterns/releases/4.1/dict-it.oxt
                while key.startswith('../'):
                    key = key[3:]
                return zf.open(key.lstrip('/')).read()

        # The manifest says which archive member is the configuration (xcu) file
        root = safe_xml_fromstring(zf.open('META-INF/manifest.xml').read())
        xcu = XPath('//manifest:file-entry[@manifest:media-type="application/vnd.sun.star.configuration-data"]')(root)[0].get(
            '{%s}full-path' % NS_MAP['manifest'])
        for (dic, aff), locales in iteritems(parse_xcu(zf.open(xcu).read(), origin='')):
            dic, aff = dic.lstrip('/'), aff.lstrip('/')
            locales = uniq([x for x in map(fill_country_code, locales) if parse_lang_code(x).countrycode])
            if not locales:
                continue
            # Read and convert before creating the output directory, so that a
            # skipped or unreadable dictionary does not leave an empty
            # directory behind in dest_dir.
            dd, ad = convert_to_utf8(read_file(dic), read_file(aff))
            d = tempfile.mkdtemp(prefix=prefix, dir=dest_dir)
            metadata = [name] + list(locales)
            with open(os.path.join(d, 'locales'), 'wb') as f:
                f.write(('\n'.join(metadata)).encode('utf-8'))
            with open(os.path.join(d, '%s.dic' % locales[0]), 'wb') as f:
                f.write(dd)
            with open(os.path.join(d, '%s.aff' % locales[0]), 'wb') as f:
                f.write(ad)
            num += 1
    return num


if __name__ == '__main__':
    import_from_libreoffice_source_tree(sys.argv[-1])