#!/usr/bin/env python3
__license__ = 'GPL v3'
__copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'
import sys
from collections import defaultdict, Counter
from calibre import replace_entities
from calibre.spell.break_iterator import split_into_words, index_of
from calibre.spell.dictionary import parse_lang_code
from calibre.ebooks.oeb.base import barename
from calibre.ebooks.oeb.polish.container import OPF_NAMESPACES, get_container
from calibre.ebooks.oeb.polish.parsing import parse
from calibre.ebooks.oeb.polish.toc import find_existing_ncx_toc, find_existing_nav_toc
from calibre.utils.icu import ord_string
from polyglot.builtins import iteritems
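
# The compiled regex patterns are created lazily and shared module-wide.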
_patterns = None
class Patterns:
__slots__ = ('sanitize_invisible_pat', 'split_pat', 'digit_pat', 'fr_elision_pat')
def __init__(self):
import regex
# Remove soft hyphens/zero width spaces/control codes
self.sanitize_invisible_pat = regex.compile(
r'[\u00ad\u200b\u200c\u200d\ufeff\0-\x08\x0b\x0c\x0e-\x1f\x7f]', regex.VERSION1 | regex.UNICODE)
self.split_pat = regex.compile(
r'\W+', flags=regex.VERSION1 | regex.WORD | regex.FULLCASE | regex.UNICODE)
self.digit_pat = regex.compile(
r'^\d+$', flags=regex.VERSION1 | regex.WORD | regex.UNICODE)
# French words with prefixes are reduced to the stem word, so that the
# words appear only once in the word list
self.fr_elision_pat = regex.compile(
"^(?:l|d|m|t|s|j|c|ç|lorsqu|puisqu|quoiqu|qu)['’]", flags=regex.UNICODE | regex.VERSION1 | regex.IGNORECASE)
def patterns():
global _patterns
if _patterns is None:
_patterns = Patterns()
return _patterns
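
# Counts occurrences of each code point and remembers which files it occurs in.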
class CharCounter:
def __init__(self):
self.counter = Counter()
self.chars = defaultdict(set)
self.update = self.counter.update
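
# A single occurrence of a word: the file, source line and lxml node it was
# found in, plus any elided prefix stripped from it, so that the word can be
# located again and replaced in place.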
class Location:
__slots__ = ('file_name', 'sourceline', 'original_word', 'location_node', 'node_item', 'elided_prefix')
def __init__(self, file_name=None, elided_prefix='', original_word=None, location_node=None, node_item=(None, None)):
self.file_name, self.elided_prefix, self.original_word = file_name, elided_prefix, original_word
self.location_node, self.node_item, self.sourceline = location_node, node_item, location_node.sourceline
def __repr__(self):
return f'{self.original_word} @ {self.file_name}:{self.sourceline}'
__str__ = __repr__
def replace(self, new_word):
self.original_word = self.elided_prefix + new_word
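
# Drop empty strings and purely numeric tokens from the word list.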
def filter_words(word):
if not word:
return False
p = patterns()
if p.digit_pat.match(word) is not None:
return False
return True
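
# Split text into words with the language-aware break iterator.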
def get_words(text, lang):
try:
ans = split_into_words(str(text), lang)
except (TypeError, ValueError):
return ()
return list(filter(filter_words, ans))
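
# Record every word in text under its (word, locale) key, stripping invisible
# characters and, for French, any elided prefix such as l' or qu', and bump
# the running total stored under the None key.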
def add_words(text, node, words, file_name, locale, node_item):
candidates = get_words(text, locale.langcode)
if candidates:
p = patterns()
is_fr = locale.langcode == 'fra'
for word in candidates:
sword = p.sanitize_invisible_pat.sub('', word).strip()
elided_prefix = ''
if is_fr:
m = p.fr_elision_pat.match(sword)
                    # Only elide when a stem remains after the prefix
                    if m is not None and len(m.group()) < len(sword):
elided_prefix = m.group()
sword = sword[len(elided_prefix):]
loc = Location(file_name, elided_prefix, word, node, node_item)
words[(sword, locale)].append(loc)
words[None] += 1
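
# Count code points in text, decoding bytes as UTF-8 (ignoring errors) first.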
def add_chars(text, counter, file_name):
if text:
if isinstance(text, bytes):
text = text.decode('utf-8', 'ignore')
counts = Counter(ord_string(text))
counter.update(counts)
for codepoint in counts:
counter.chars[codepoint].add(file_name)
def add_words_from_attr(node, attr, words, file_name, locale):
text = node.get(attr, None)
if text:
add_words(text, node, words, file_name, locale, (True, attr))
def count_chars_in_attr(node, attr, counter, file_name, locale):
text = node.get(attr, None)
if text:
add_chars(text, counter, file_name)
def add_words_from_text(node, attr, words, file_name, locale):
add_words(getattr(node, attr), node, words, file_name, locale, (False, attr))
def count_chars_in_text(node, attr, counter, file_name, locale):
add_chars(getattr(node, attr), counter, file_name)
def add_words_from_escaped_html(text, words, file_name, node, attr, locale):
text = replace_entities(text)
root = parse('<html><body><div>%s</div></body></html>' % text, decoder=lambda x:x.decode('utf-8'))
ewords = defaultdict(list)
ewords[None] = 0
read_words_from_html(root, ewords, file_name, locale)
words[None] += ewords.pop(None)
for k, locs in iteritems(ewords):
for loc in locs:
loc.location_node, loc.node_item = node, (False, attr)
words[k].extend(locs)
def count_chars_in_escaped_html(text, counter, file_name, node, attr, locale):
text = replace_entities(text)
root = parse('<html><body><div>%s</div></body></html>' % text, decoder=lambda x:x.decode('utf-8'))
count_chars_in_html(root, counter, file_name, locale)
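
# The opf:file-as attribute carries the sortable form of titles and names.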
_opf_file_as = '{%s}file-as' % OPF_NAMESPACES['opf']
opf_spell_tags = {'title', 'creator', 'subject', 'description', 'publisher'}
# We can only use barename() for tag names and simple attribute checks so that
# this code matches up with the syntax highlighter base spell checking
def read_words_from_opf(root, words, file_name, book_locale):
for tag in root.iterdescendants('*'):
if tag.text is not None and barename(tag.tag) in opf_spell_tags:
if barename(tag.tag) == 'description':
add_words_from_escaped_html(tag.text, words, file_name, tag, 'text', book_locale)
else:
add_words_from_text(tag, 'text', words, file_name, book_locale)
add_words_from_attr(tag, _opf_file_as, words, file_name, book_locale)
def count_chars_in_opf(root, counter, file_name, book_locale):
for tag in root.iterdescendants('*'):
if tag.text is not None and barename(tag.tag) in opf_spell_tags:
if barename(tag.tag) == 'description':
count_chars_in_escaped_html(tag.text, counter, file_name, tag, 'text', book_locale)
else:
count_chars_in_text(tag, 'text', counter, file_name, book_locale)
count_chars_in_attr(tag, _opf_file_as, counter, file_name, book_locale)
ncx_spell_tags = {'text'}
xml_spell_tags = opf_spell_tags | ncx_spell_tags
def read_words_from_ncx(root, words, file_name, book_locale):
for tag in root.xpath('//*[local-name()="text"]'):
if tag.text is not None:
add_words_from_text(tag, 'text', words, file_name, book_locale)
def count_chars_in_ncx(root, counter, file_name, book_locale):
for tag in root.xpath('//*[local-name()="text"]'):
if tag.text is not None:
count_chars_in_text(tag, 'text', counter, file_name, book_locale)
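
# Tags whose text content is never spell checked.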
html_spell_tags = {'script', 'style', 'link'}
def read_words_from_html_tag(tag, words, file_name, parent_locale, locale):
if tag.text is not None and barename(tag.tag) not in html_spell_tags:
add_words_from_text(tag, 'text', words, file_name, locale)
for attr in {'alt', 'title'}:
add_words_from_attr(tag, attr, words, file_name, locale)
if tag.tail is not None and tag.getparent() is not None and barename(tag.getparent().tag) not in html_spell_tags:
add_words_from_text(tag, 'tail', words, file_name, parent_locale)
def count_chars_in_html_tag(tag, counter, file_name, parent_locale, locale):
if tag.text is not None and barename(tag.tag) not in html_spell_tags:
count_chars_in_text(tag, 'text', counter, file_name, locale)
for attr in {'alt', 'title'}:
count_chars_in_attr(tag, attr, counter, file_name, locale)
if tag.tail is not None and tag.getparent() is not None and barename(tag.getparent().tag) not in html_spell_tags:
count_chars_in_text(tag, 'tail', counter, file_name, parent_locale)
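
# Return the locale declared via lang or xml:lang on this tag, if either parses.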
def locale_from_tag(tag):
if 'lang' in tag.attrib:
try:
loc = parse_lang_code(tag.get('lang'))
except ValueError:
loc = None
if loc is not None:
return loc
if '{http://www.w3.org/XML/1998/namespace}lang' in tag.attrib:
try:
loc = parse_lang_code(tag.get('{http://www.w3.org/XML/1998/namespace}lang'))
except ValueError:
loc = None
if loc is not None:
return loc
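
# Walk the tree iteratively, so that a lang/xml:lang attribute on any element
# overrides the inherited locale for its entire subtree.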
def read_words_from_html(root, words, file_name, book_locale):
stack = [(root, book_locale)]
while stack:
parent, parent_locale = stack.pop()
locale = locale_from_tag(parent) or parent_locale
read_words_from_html_tag(parent, words, file_name, parent_locale, locale)
stack.extend((tag, locale) for tag in parent.iterchildren('*'))
def count_chars_in_html(root, counter, file_name, book_locale):
stack = [(root, book_locale)]
while stack:
parent, parent_locale = stack.pop()
locale = locale_from_tag(parent) or parent_locale
count_chars_in_html_tag(parent, counter, file_name, parent_locale, locale)
stack.extend((tag, locale) for tag in parent.iterchildren('*'))
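
# Order locations by first appearance of their file, then by source line.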
def group_sort(locations):
order = {}
for loc in locations:
if loc.file_name not in order:
order[loc.file_name] = len(order)
return sorted(locations, key=lambda l:(order[l.file_name], l.sourceline))
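
# Spell check covers the spine files, the OPF and any existing NCX/NAV
# tables of contents.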
def get_checkable_file_names(container):
file_names = [name for name, linear in container.spine_names] + [container.opf_name]
ncx_toc = find_existing_ncx_toc(container)
if ncx_toc is not None and container.exists(ncx_toc) and ncx_toc not in file_names:
file_names.append(ncx_toc)
else:
ncx_toc = None
toc = find_existing_nav_toc(container)
if toc is not None and container.exists(toc) and toc not in file_names:
file_names.append(toc)
return file_names, ncx_toc
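
# A file whose root has a child (typically an XML comment) reading
# calibre-no-spell-check is skipped entirely.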
def root_is_excluded_from_spell_check(root):
for child in root:
q = (getattr(child, 'text', '') or '').strip().lower()
if q == 'calibre-no-spell-check':
return True
return False
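
# Map (word, locale) -> group-sorted list of Locations across all checkable
# files; with get_word_count=True the total word count is returned as well.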
def get_all_words(container, book_locale, get_word_count=False, excluded_files=()):
words = defaultdict(list)
words[None] = 0
file_names, ncx_toc = get_checkable_file_names(container)
for file_name in file_names:
if not container.exists(file_name) or file_name in excluded_files:
continue
root = container.parsed(file_name)
if root_is_excluded_from_spell_check(root):
continue
if file_name == container.opf_name:
read_words_from_opf(root, words, file_name, book_locale)
elif file_name == ncx_toc:
read_words_from_ncx(root, words, file_name, book_locale)
elif hasattr(root, 'xpath'):
read_words_from_html(root, words, file_name, book_locale)
count = words.pop(None)
ans = {k:group_sort(v) for k, v in iteritems(words)}
if get_word_count:
return count, ans
return ans
def count_all_chars(container, book_locale):
ans = CharCounter()
file_names, ncx_toc = get_checkable_file_names(container)
for file_name in file_names:
if not container.exists(file_name):
continue
root = container.parsed(file_name)
if file_name == container.opf_name:
count_chars_in_opf(root, ans, file_name, book_locale)
elif file_name == ncx_toc:
count_chars_in_ncx(root, ans, file_name, book_locale)
elif hasattr(root, 'xpath'):
count_chars_in_html(root, ans, file_name, book_locale)
return ans
def merge_locations(locs1, locs2):
return group_sort(locs1 + locs2)
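
# Replace all locale-aware occurrences of original_word in text, returning the
# new text and whether any replacement was made.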
def replace(text, original_word, new_word, lang):
indices = []
original_word, new_word, text = str(original_word), str(new_word), str(text)
q = text
offset = 0
while True:
idx = index_of(original_word, q, lang=lang)
if idx == -1:
break
indices.append(offset + idx)
offset += idx + len(original_word)
q = text[offset:]
for idx in reversed(indices):
text = text[:idx] + new_word + text[idx+len(original_word):]
return text, bool(indices)
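
# Apply a correction at every recorded location, optionally remembering the
# original texts in undo_cache so the change can be reverted.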
def replace_word(container, new_word, locations, locale, undo_cache=None):
changed = set()
for loc in locations:
node = loc.location_node
is_attr, attr = loc.node_item
if is_attr:
text = node.get(attr)
else:
text = getattr(node, attr)
replacement = loc.elided_prefix + new_word
rtext, replaced = replace(text, loc.original_word, replacement, locale.langcode)
if replaced:
if undo_cache is not None:
undo_cache[(loc.file_name, node, is_attr, attr)] = text
if is_attr:
node.set(attr, rtext)
else:
setattr(node, attr, rtext)
container.replace(loc.file_name, node.getroottree().getroot())
changed.add(loc.file_name)
return changed
def undo_replace_word(container, undo_cache):
changed = set()
for (file_name, node, is_attr, attr), text in iteritems(undo_cache):
        if is_attr:
            node.set(attr, text)
        else:
            setattr(node, attr, text)
container.replace(file_name, node.getroottree().getroot())
changed.add(file_name)
return changed
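
# Manual test: print all words found in the book passed on the command line.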
if __name__ == '__main__':
import pprint
from calibre.gui2.tweak_book import set_book_locale, dictionaries
container = get_container(sys.argv[-1], tweak_mode=True)
set_book_locale(container.mi.language)
pprint.pprint(get_all_words(container, dictionaries.default_locale))