%PDF- %PDF-
Direktori : /usr/lib/calibre/calibre/ebooks/oeb/polish/ |
Current File : //usr/lib/calibre/calibre/ebooks/oeb/polish/report.py |
#!/usr/bin/env python3


__license__ = 'GPL v3'
__copyright__ = '2015, Kovid Goyal <kovid at kovidgoyal.net>'

import posixpath, os, time, types
from collections import namedtuple, defaultdict
from itertools import chain

from calibre import prepare_string_for_xml, force_unicode
from calibre.ebooks.oeb.base import XPath, xml2text
from calibre.ebooks.oeb.polish.container import OEB_DOCS, OEB_STYLES, OEB_FONTS
from calibre.ebooks.oeb.polish.spell import get_all_words, count_all_chars
from calibre.utils.icu import numeric_sort_key, safe_chr
from calibre.utils.imghdr import identify
from css_selectors import Select, SelectorError
from polyglot.builtins import iteritems

# One record per file in the container, as produced by files_data().
File = namedtuple('File', 'name dir basename size category')


def get_category(name, mt):
    '''
    Classify a file into a report category.

    :param name: The file name; only its extension is inspected.
    :param mt: The mimetype string declared for the file.
    :return: One of 'image', 'font', 'style', 'text', 'opf', 'toc' or 'misc'.

    The extension checks run after the mimetype checks and override them.
    '''
    category = 'misc'
    if mt.startswith('image/'):
        category = 'image'
    elif mt in OEB_FONTS:
        category = 'font'
    elif mt in OEB_STYLES:
        category = 'style'
    elif mt in OEB_DOCS:
        category = 'text'
    ext = name.rpartition('.')[-1].lower()
    if ext in {'ttf', 'otf', 'woff'}:
        # Probably wrong mimetype in the OPF
        category = 'font'
    elif ext == 'opf':
        category = 'opf'
    elif ext == 'ncx':
        category = 'toc'
    return category


def safe_size(container, name):
    ''' Return the on-disk size of ``name`` in bytes, or 0 if it cannot be
    determined for any reason. '''
    try:
        return os.path.getsize(container.name_to_abspath(name))
    except Exception:
        # Deliberately best-effort: a missing/unreadable file must not abort the report
        return 0


def safe_img_data(container, name, mt):
    ''' Return ``(width, height)`` for the image ``name``. Returns ``(0, 0)``
    for anything whose mimetype contains 'svg' and for images that cannot be
    identified. '''
    if 'svg' in mt:
        return 0, 0
    try:
        _fmt, width, height = identify(container.name_to_abspath(name))
    except Exception:
        # Deliberately best-effort, same as safe_size()
        width = height = 0
    return width, height


def files_data(container, *args):
    ''' Yield a :class:`File` record for every name in
    ``container.name_path_map``. Extra positional arguments are ignored; they
    exist so that all ``*_data`` functions can be called uniformly by
    gather_data(). '''
    for name, path in iteritems(container.name_path_map):
        yield File(name, posixpath.dirname(name), posixpath.basename(name), safe_size(container, name),
                   get_category(name, container.mime_map.get(name, '')))


Image = namedtuple('Image', 'name mime_type usage size basename id width height')

LinkLocation = namedtuple('LinkLocation', 'name line_number text_on_line')


def sort_locations(container, locations):
    ''' Return ``locations`` as a list sorted by: position of the file in
    ``container.spine_names`` (files not in the spine sort last), then file
    name in numeric sort order, then line number. '''
    nmap = {n: i for i, (n, _) in enumerate(container.spine_names)}

    def sort_key(loc):
        return (nmap.get(loc.name, len(nmap)), numeric_sort_key(loc.name), loc.line_number)

    return sorted(locations, key=sort_key)


def safe_href_to_name(container, href, base):
    ''' Like ``container.href_to_name(href, base)`` but returns None instead
    of raising ValueError. '''
    try:
        return container.href_to_name(href, base)
    except ValueError:
        pass  # Absolute path on windows


def images_data(container, *args):
    '''
    Return a tuple of :class:`Image` records, one per existing file in the
    container whose mimetype starts with ``image/``.

    The ``usage`` field of each record is the sorted list of
    :class:`LinkLocation` entries for links to that image found in
    stylesheets and HTML documents. The ``id`` field is the index of the
    record in the returned tuple.
    '''
    image_usage = defaultdict(set)
    link_sources = OEB_STYLES | OEB_DOCS
    for name, mt in iteritems(container.mime_map):
        if mt in link_sources:
            for href, line_number, offset in container.iterlinks(name):
                target = safe_href_to_name(container, href, name)
                if target and container.exists(target):
                    # Use a separate name so the loop variable mt is not clobbered
                    target_mt = container.mime_map.get(target)
                    if target_mt and target_mt.startswith('image/'):
                        image_usage[target].add(LinkLocation(name, line_number, href))

    image_data = []
    for name, mt in iteritems(container.mime_map):
        if mt.startswith('image/') and container.exists(name):
            image_data.append(Image(
                name, mt, sort_locations(container, image_usage.get(name, set())), safe_size(container, name),
                posixpath.basename(name), len(image_data), *safe_img_data(container, name, mt)))
    return tuple(image_data)


def description_for_anchor(elem):
    '''
    Return a short (at most 30 characters) human readable description for
    ``elem`` or None if no suitable text is found.

    Candidates are tried in order: the ``title`` attribute, the element's own
    text, the text of its first child and finally, only for elements with at
    most six descendant elements, the full text of the element.
    '''
    def check(x, min_len=4):
        # Returns the stripped, truncated text if it is long enough, otherwise None
        if x:
            x = x.strip()
            if len(x) >= min_len:
                return x[:30]

    desc = check(elem.get('title'))
    if desc is not None:
        return desc
    desc = check(elem.text)
    if desc is not None:
        return desc
    if len(elem) > 0:
        desc = check(elem[0].text)
        if desc is not None:
            return desc
    # Get full text for tags that have only a few descendants
    for i, x in enumerate(elem.iterdescendants('*')):
        if i > 5:
            break
    else:
        # Loop finished without break, so there are few enough descendants
        desc = check(xml2text(elem), min_len=1)
        if desc is not None:
            return desc


def create_anchor_map(root, pat, name):
    ''' Map every anchor (``id`` attribute, falling back to ``name``
    attribute) on the elements selected by ``pat(root)`` to a
    ``(LinkLocation, description)`` tuple. When an anchor occurs more than
    once, the first occurrence wins. '''
    ans = {}
    for elem in pat(root):
        anchor = elem.get('id') or elem.get('name')
        if anchor and anchor not in ans:
            ans[anchor] = (LinkLocation(name, elem.sourceline, anchor), description_for_anchor(elem))
    return ans


Anchor = namedtuple('Anchor', 'id location text')
L = namedtuple('Link', 'location text is_external href path_ok anchor_ok anchor ok')


def Link(location, text, is_external, href, path_ok, anchor_ok, anchor):
    ''' Build a Link record, computing its ``ok`` field: None for external
    links, otherwise ``path_ok and anchor_ok``. '''
    if is_external:
        ok = None
    else:
        ok = path_ok and anchor_ok
    return L(location, text, is_external, href, path_ok, anchor_ok, anchor, ok)


def links_data(container, *args):
    '''
    Yield a Link record for every ``<a href>`` element in the HTML documents
    of the container.

    Links whose destination cannot be resolved to a name in the container are
    reported as external. For the others, ``path_ok`` tells whether the
    destination is one of the parsed HTML documents and ``anchor_ok`` whether
    the fragment (if any) exists in that document.
    '''
    anchor_map = {}
    links = []
    anchor_pat = XPath('//*[@id or @name]')
    link_pat = XPath('//h:a[@href]')
    for name, mt in iteritems(container.mime_map):
        if mt in OEB_DOCS:
            root = container.parsed(name)
            anchor_map[name] = create_anchor_map(root, anchor_pat, name)
            for a in link_pat(root):
                href = a.get('href')
                text = description_for_anchor(a)
                # Must be computed before branching on href: the empty-href
                # branch below needs it as well
                location = LinkLocation(name, a.sourceline, href)
                if href:
                    base, frag = href.partition('#')[0::2]
                    if frag and not base:
                        # Pure fragment link, points into the current file
                        dest = name
                    else:
                        dest = safe_href_to_name(container, href, name)
                    links.append((base, frag, dest, location, text))
                else:
                    links.append(('', '', None, location, text))

    # Links are resolved only after all documents have been scanned, so that
    # anchor_map is complete
    for base, frag, dest, location, text in links:
        if dest is None:
            link = Link(location, text, True, base, True, True, Anchor(frag, None, None))
        else:
            if dest in anchor_map:
                loc = LinkLocation(dest, None, None)
                if frag:
                    anchor = anchor_map[dest].get(frag)
                    if anchor is None:
                        link = Link(location, text, False, dest, True, False, Anchor(frag, loc, None))
                    else:
                        link = Link(location, text, False, dest, True, True, Anchor(frag, *anchor))
                else:
                    link = Link(location, text, False, dest, True, True, Anchor(None, loc, None))
            else:
                link = Link(location, text, False, dest, False, False, Anchor(frag, None, None))
        yield link


Word = namedtuple('Word', 'id word locale usage')


def words_data(container, book_locale, *args):
    ''' Return ``(count, words)`` where ``count`` is the word count reported
    by get_all_words() and ``words`` is a tuple of :class:`Word` records whose
    ``id`` is their index in the tuple. '''
    count, words = get_all_words(container, book_locale, get_word_count=True)
    return (count, tuple(Word(i, word, locale, v) for i, ((word, locale), v) in enumerate(iteritems(words))))


Char = namedtuple('Char', 'id char codepoint usage count')


def chars_data(container, book_locale, *args):
    ''' Yield a :class:`Char` record for every codepoint reported by
    count_all_chars(). The ``usage`` field is sorted by spine position and
    then by numeric sort order of the name. '''
    cc = count_all_chars(container, book_locale)
    nmap = {n: i for i, (n, _) in enumerate(container.spine_names)}

    def sort_key(name):
        return nmap.get(name, len(nmap)), numeric_sort_key(name)

    for i, (codepoint, usage) in enumerate(iteritems(cc.chars)):
        yield Char(i, safe_chr(codepoint), codepoint, sorted(usage, key=sort_key), cc.counter[codepoint])


CSSRule = namedtuple('CSSRule', 'selector location')
RuleLocation = namedtuple('RuleLocation', 'file_name line column')
MatchLocation = namedtuple('MatchLocation', 'tag sourceline')
CSSEntry = namedtuple('CSSEntry', 'rule count matched_files sort_key')
CSSFileMatch = namedtuple('CSSFileMatch', 'file_name locations sort_key')

ClassEntry = namedtuple('ClassEntry', 'cls num_of_matches matched_files sort_key')
ClassFileMatch = namedtuple('ClassFileMatch', 'file_name class_elements sort_key')
ClassElement = namedtuple('ClassElement', 'name line_number text_on_line tag matched_rules')


def css_data(container, book_locale, result_data, *args):
    '''
    Collect usage information for CSS rules and CSS classes.

    :param result_data: Mutable mapping; a list of :class:`ClassEntry`
        records is stored into it under the key ``'classes'``.
    :return: A list of :class:`CSSEntry` records, one per CSS rule, each
        listing the elements matched by the rule's selector in the HTML
        files that are in the spine.
    '''
    import tinycss
    from tinycss.css21 import RuleSet, ImportRule

    def css_rules(file_name, rules, sourceline=0):
        # Flatten parsed rules into a list of CSSRule objects and, for
        # resolvable @import rules, the name of the imported sheet
        ans = []
        for rule in rules:
            if isinstance(rule, RuleSet):
                selector = rule.selector.as_css()
                ans.append(CSSRule(selector, RuleLocation(file_name, sourceline + rule.line, rule.column)))
            elif isinstance(rule, ImportRule):
                import_name = safe_href_to_name(container, rule.uri, file_name)
                if import_name and container.exists(import_name):
                    ans.append(import_name)
            elif getattr(rule, 'rules', False):
                # Rule that contains nested rules
                ans.extend(css_rules(file_name, rule.rules, sourceline))
        return ans

    parser = tinycss.make_full_parser()
    importable_sheets = {}
    html_sheets = {}
    spine_names = {name for name, is_linear in container.spine_names}
    style_path, link_path = XPath('//h:style'), XPath('//h:link/@href')

    for name, mt in iteritems(container.mime_map):
        if mt in OEB_STYLES:
            importable_sheets[name] = css_rules(name, parser.parse_stylesheet(container.raw_data(name)).rules)
        elif mt in OEB_DOCS and name in spine_names:
            html_sheets[name] = []
            for style in style_path(container.parsed(name)):
                if style.get('type', 'text/css') == 'text/css' and style.text:
                    # Offset rule line numbers by the position of the <style> tag
                    html_sheets[name].append(
                        css_rules(name, parser.parse_stylesheet(force_unicode(style.text, 'utf-8')).rules, style.sourceline - 1))

    rule_map = defaultdict(lambda: defaultdict(list))

    def rules_in_sheet(sheet):
        for rule in sheet:
            if isinstance(rule, CSSRule):
                yield rule
            else:  # @import rule
                isheet = importable_sheets.get(rule)
                if isheet is not None:
                    yield from rules_in_sheet(isheet)

    def sheets_for_html(name, root):
        for href in link_path(root):
            tname = safe_href_to_name(container, href, name)
            sheet = importable_sheets.get(tname)
            if sheet is not None:
                yield sheet

    tt_cache = {}

    def tag_text(elem):
        # Text of the opening tag of elem, cached per element. Always
        # returns a string, with or without attributes.
        ans = tt_cache.get(elem)
        if ans is None:
            tag = elem.tag.rpartition('}')[-1]
            if elem.attrib:
                attribs = ' '.join('{}="{}"'.format(k, prepare_string_for_xml(elem.get(k, ''), True)) for k in elem.keys())
                ans = f'<{tag} {attribs}>'
            else:
                ans = '<%s>' % tag
            tt_cache[elem] = ans
        return ans

    def matches_for_selector(selector, select, class_map, rule):
        lsel = selector.lower()
        try:
            matches = tuple(select(selector))
        except SelectorError:
            return ()
        seen = set()

        def get_elem_and_ancestors(elem):
            # Yield elem and its ancestors, each at most once per selector
            p = elem
            while p is not None:
                if p not in seen:
                    yield p
                    seen.add(p)
                p = p.getparent()

        for e in matches:
            for elem in get_elem_and_ancestors(e):
                for cls in elem.get('class', '').split():
                    # Substring test of ".classname" against the lowercased selector
                    if '.' + cls.lower() in lsel:
                        class_map[cls][elem].append(rule)

        return (MatchLocation(tag_text(elem), elem.sourceline) for elem in matches)

    class_map = defaultdict(lambda: defaultdict(list))

    for name, inline_sheets in iteritems(html_sheets):
        root = container.parsed(name)
        cmap = defaultdict(lambda: defaultdict(list))
        # Pre-populate so that classes matched by no rule are still reported
        for elem in root.xpath('//*[@class]'):
            for cls in elem.get('class', '').split():
                cmap[cls][elem] = []
        select = Select(root, ignore_inappropriate_pseudo_classes=True)
        for sheet in chain(sheets_for_html(name, root), inline_sheets):
            for rule in rules_in_sheet(sheet):
                rule_map[rule][name].extend(matches_for_selector(rule.selector, select, cmap, rule))
        for cls, elem_map in iteritems(cmap):
            class_elements = class_map[cls][name]
            for elem, usage in iteritems(elem_map):
                class_elements.append(
                    ClassElement(name, elem.sourceline, elem.get('class'), tag_text(elem), tuple(usage)))

    result_data['classes'] = ans = []
    for cls, name_map in iteritems(class_map):
        la = tuple(ClassFileMatch(name, tuple(class_elements), numeric_sort_key(name))
                   for name, class_elements in iteritems(name_map) if class_elements)
        num_of_matches = sum(sum(len(ce.matched_rules) for ce in cfm.class_elements) for cfm in la)
        ans.append(ClassEntry(cls, num_of_matches, la, numeric_sort_key(cls)))

    ans = []
    for rule, loc_map in iteritems(rule_map):
        la = tuple(CSSFileMatch(name, tuple(locations), numeric_sort_key(name))
                   for name, locations in iteritems(loc_map) if locations)
        count = sum(len(fm.locations) for fm in la)
        ans.append(CSSEntry(rule, count, la, numeric_sort_key(rule.selector)))

    return ans


def gather_data(container, book_locale):
    '''
    Run all the ``*_data`` functions of this module.

    :return: ``(data, timing)``. ``data`` maps each of 'files', 'chars',
        'images', 'links', 'words' and 'css' to the result of the
        corresponding function, with generators converted to tuples;
        css_data() additionally stores a 'classes' entry into it. ``timing``
        maps the same six names to the elapsed time in seconds.
    '''
    timing = {}
    data = {}
    for x in 'files chars images links words css'.split():
        # monotonic clock: elapsed time must not be affected by system clock changes
        st = time.monotonic()
        data[x] = globals()[x + '_data'](container, book_locale, data)
        if isinstance(data[x], types.GeneratorType):
            # Consume generators here so that their run time is included in the timing
            data[x] = tuple(data[x])
        timing[x] = time.monotonic() - st
    return data, timing