%PDF- %PDF-
Mini Shell

Mini Shell

Direktori : /usr/lib/calibre/calibre/ebooks/oeb/polish/
Upload File :
Create Path :
Current File : //usr/lib/calibre/calibre/ebooks/oeb/polish/report.py

#!/usr/bin/env python3


__license__ = 'GPL v3'
__copyright__ = '2015, Kovid Goyal <kovid at kovidgoyal.net>'

import posixpath, os, time, types
from collections import namedtuple, defaultdict
from itertools import chain

from calibre import prepare_string_for_xml, force_unicode
from calibre.ebooks.oeb.base import XPath, xml2text
from calibre.ebooks.oeb.polish.container import OEB_DOCS, OEB_STYLES, OEB_FONTS
from calibre.ebooks.oeb.polish.spell import get_all_words, count_all_chars
from calibre.utils.icu import numeric_sort_key, safe_chr
from calibre.utils.imghdr import identify
from css_selectors import Select, SelectorError
from polyglot.builtins import iteritems

# One record per file in the book, as yielded by files_data()
File = namedtuple('File', ('name', 'dir', 'basename', 'size', 'category'))


def get_category(name, mt):
    """Classify a book file into a report category.

    The mimetype ``mt`` picks the initial category ('image', 'font',
    'style', 'text', or 'misc' when nothing matches). The lower-cased file
    extension of ``name`` then overrides that choice for a handful of
    well-known extensions.
    """
    if mt.startswith('image/'):
        by_mime = 'image'
    elif mt in OEB_FONTS:
        by_mime = 'font'
    elif mt in OEB_STYLES:
        by_mime = 'style'
    elif mt in OEB_DOCS:
        by_mime = 'text'
    else:
        by_mime = 'misc'

    by_extension = {
        # Font extensions win: probably wrong mimetype in the OPF
        'ttf': 'font',
        'otf': 'font',
        'woff': 'font',
        'opf': 'opf',
        'ncx': 'toc',
    }
    extension = name.rpartition('.')[-1].lower()
    return by_extension.get(extension, by_mime)


def safe_size(container, name):
    """Return the on-disk size in bytes of ``name``, or 0 on any failure.

    Both resolving the name to an absolute path and querying the file size
    are covered by the same catch-all, so this never raises an ``Exception``.
    """
    try:
        abspath = container.name_to_abspath(name)
        size = os.path.getsize(abspath)
    except Exception:
        size = 0
    return size


def safe_img_data(container, name, mt):
    """Return ``(width, height)`` for the image ``name``.

    Any mimetype containing 'svg' is reported as ``(0, 0)`` without
    inspecting the file. ``(0, 0)`` is also returned if identifying the
    image fails for any reason.
    """
    if 'svg' in mt:
        return 0, 0
    try:
        # The detected format is not needed by callers, only the dimensions
        _fmt, width, height = identify(container.name_to_abspath(name))
    except Exception:
        return 0, 0
    return width, height


def files_data(container, *args):
    """Yield a File record for every entry in ``container.name_path_map``.

    Extra positional arguments are accepted and ignored. A name missing
    from ``container.mime_map`` is categorised using an empty mimetype.
    """
    for name, path in iteritems(container.name_path_map):
        mime_type = container.mime_map.get(name, '')
        yield File(
            name,
            posixpath.dirname(name),
            posixpath.basename(name),
            safe_size(container, name),
            get_category(name, mime_type),
        )


# One record per image in the book, as built by images_data()
Image = namedtuple('Image', ('name', 'mime_type', 'usage', 'size', 'basename', 'id', 'width', 'height'))

# A position inside a book file: the file name, a line number and some text for that line
LinkLocation = namedtuple('LinkLocation', ('name', 'line_number', 'text_on_line'))


def sort_locations(container, locations):
    """Return ``locations`` as a new list ordered by spine position.

    Locations are ordered by the index of their file in
    ``container.spine_names`` (files not in the spine sort after all spine
    files), then by numeric-aware file name, then by line number.
    """
    spine_index = {
        spine_name: position
        for position, (spine_name, _linear) in enumerate(container.spine_names)
    }
    after_spine = len(spine_index)

    def ordering(location):
        position = spine_index.get(location.name, after_spine)
        return (position, numeric_sort_key(location.name), location.line_number)

    ordered = list(locations)
    ordered.sort(key=ordering)
    return ordered


def safe_href_to_name(container, href, base):
    """Resolve ``href`` relative to ``base`` into a container name.

    Returns ``None`` when ``container.href_to_name`` raises ``ValueError``
    (for example, an absolute path on windows).
    """
    try:
        resolved = container.href_to_name(href, base)
    except ValueError:
        return None
    return resolved


def images_data(container, *args):
    """Return a tuple of Image records, one per existing image in the book.

    First, every link found in stylesheet and document files is resolved;
    links that point at an existing file with an ``image/*`` mimetype are
    recorded as usages of that image. Then each existing image in
    ``container.mime_map`` is turned into an Image carrying its sorted
    usages, size, dimensions and a sequential id.
    """
    usage_by_image = defaultdict(set)
    linking_types = OEB_STYLES | OEB_DOCS
    for source_name, source_mt in iteritems(container.mime_map):
        if source_mt not in linking_types:
            continue
        for href, line_number, offset in container.iterlinks(source_name):
            target = safe_href_to_name(container, href, source_name)
            if not target or not container.exists(target):
                continue
            target_mt = container.mime_map.get(target)
            if target_mt and target_mt.startswith('image/'):
                usage_by_image[target].add(LinkLocation(source_name, line_number, href))

    records = []
    for name, mt in iteritems(container.mime_map):
        if not (mt.startswith('image/') and container.exists(name)):
            continue
        usage = sort_locations(container, usage_by_image.get(name, set()))
        size = safe_size(container, name)
        # The id is simply the position of the record in the result
        image_id = len(records)
        width, height = safe_img_data(container, name, mt)
        records.append(Image(name, mt, usage, size, posixpath.basename(name), image_id, width, height))
    return tuple(records)


def description_for_anchor(elem):
    """Return a short (at most 30 characters) description of ``elem``.

    Candidates are tried in order: the ``title`` attribute, the element's
    own text, and the text of its first child. A candidate is used only if
    it has at least 4 characters after stripping whitespace. As a last
    resort, for elements with only a few descendants, the full text of the
    element is used if it is non-empty after stripping. Returns ``None``
    when nothing suitable is found.
    """
    def usable(raw, min_len=4):
        if not raw:
            return None
        stripped = raw.strip()
        return stripped[:30] if len(stripped) >= min_len else None

    label = usable(elem.get('title'))
    if label is None:
        label = usable(elem.text)
    if label is None and len(elem) > 0:
        label = usable(elem[0].text)
    if label is not None:
        return label

    # Get full text for tags that have only a few descendants
    has_few_descendants = True
    for index, _descendant in enumerate(elem.iterdescendants('*')):
        if index > 5:
            has_few_descendants = False
            break
    if has_few_descendants:
        return usable(xml2text(elem), min_len=1)
    return None


def create_anchor_map(root, pat, name):
    """Map each anchor in ``root`` to ``(LinkLocation, description)``.

    ``pat`` is called with ``root`` and must return the candidate elements.
    The anchor is the element's ``id`` attribute, falling back to ``name``.
    When the same anchor occurs more than once, the first occurrence wins.
    """
    anchors = {}
    for elem in pat(root):
        anchor = elem.get('id') or elem.get('name')
        if not anchor or anchor in anchors:
            continue
        location = LinkLocation(name, elem.sourceline, anchor)
        anchors[anchor] = (location, description_for_anchor(elem))
    return anchors


# The target of a link: the fragment id, where it is defined, and descriptive text
Anchor = namedtuple('Anchor', ('id', 'location', 'text'))
# Raw record type behind Link(); use the Link() factory so that ``ok`` is derived consistently
L = namedtuple('Link', ('location', 'text', 'is_external', 'href', 'path_ok', 'anchor_ok', 'anchor', 'ok'))


def Link(location, text, is_external, href, path_ok, anchor_ok, anchor):
    """Build a link record, deriving its ``ok`` field.

    ``ok`` is ``None`` for external links; for internal links it is
    ``path_ok and anchor_ok``.
    """
    ok = None if is_external else (path_ok and anchor_ok)
    return L(location, text, is_external, href, path_ok, anchor_ok, anchor, ok)


def links_data(container, *args):
    """Yield a Link record for every ``<a href>`` in the book's documents.

    The book is scanned in two passes. The first pass walks every file in
    ``container.mime_map`` whose mimetype is in OEB_DOCS, building a map of
    the anchors each file defines and collecting the raw links. The second
    pass classifies each collected link:

    * links that could not be resolved to a file in the book (including
      links with an empty ``href``) are reported as external;
    * links to a scanned document are checked against that document's
      anchors when they carry a fragment;
    * links that resolve to anything else are reported with ``path_ok``
      and ``anchor_ok`` both False.

    Extra positional arguments are accepted and ignored.
    """
    anchor_map = {}
    links = []
    anchor_pat = XPath('//*[@id or @name]')
    link_pat = XPath('//h:a[@href]')
    for name, mt in iteritems(container.mime_map):
        if mt in OEB_DOCS:
            root = container.parsed(name)
            anchor_map[name] = create_anchor_map(root, anchor_pat, name)
            for a in link_pat(root):
                href = a.get('href')
                text = description_for_anchor(a)
                # Build the location before branching: an <a href=""> matches
                # link_pat too, and the empty-href branch below needs the
                # location of *this* element, not a stale or unbound one.
                location = LinkLocation(name, a.sourceline, href)
                if href:
                    base, frag = href.partition('#')[0::2]
                    if frag and not base:
                        # Pure fragment link, it points into the current file
                        dest = name
                    else:
                        dest = safe_href_to_name(container, href, name)
                    links.append((base, frag, dest, location, text))
                else:
                    links.append(('', '', None, location, text))

    for base, frag, dest, location, text in links:
        if dest is None:
            link = Link(location, text, True, base, True, True, Anchor(frag, None, None))
        else:
            if dest in anchor_map:
                loc = LinkLocation(dest, None, None)
                if frag:
                    anchor = anchor_map[dest].get(frag)
                    if anchor is None:
                        # The file exists but does not define the requested anchor
                        link = Link(location, text, False, dest, True, False, Anchor(frag, loc, None))
                    else:
                        link = Link(location, text, False, dest, True, True, Anchor(frag, *anchor))
                else:
                    link = Link(location, text, False, dest, True, True, Anchor(None, loc, None))
            else:
                link = Link(location, text, False, dest, False, False, Anchor(frag, None, None))
        yield link


# One record per distinct (word, locale) pair, as built by words_data()
Word = namedtuple('Word', ('id', 'word', 'locale', 'usage'))


def words_data(container, book_locale, *args):
    """Return ``(count, words)`` for the book.

    ``count`` is the word count reported by ``get_all_words`` and ``words``
    is a tuple of Word records, one per ``(word, locale)`` key of the
    mapping it returns, with ids assigned in iteration order.
    """
    count, words = get_all_words(container, book_locale, get_word_count=True)
    records = []
    for word_id, ((word, locale), usage) in enumerate(iteritems(words)):
        records.append(Word(word_id, word, locale, usage))
    return count, tuple(records)


# One record per distinct character (codepoint), as yielded by chars_data()
Char = namedtuple('Char', ('id', 'char', 'codepoint', 'usage', 'count'))


def chars_data(container, book_locale, *args):
    """Yield a Char record for every codepoint counted in the book.

    The usage of each character is sorted with files in the spine first
    (in spine order), and otherwise by numeric-aware file name.
    """
    counts = count_all_chars(container, book_locale)
    spine_index = {
        spine_name: position
        for position, (spine_name, _linear) in enumerate(container.spine_names)
    }

    def spine_then_name(name):
        # Files that are not in the spine sort after all spine files
        return spine_index.get(name, len(spine_index)), numeric_sort_key(name)

    for char_id, (codepoint, usage) in enumerate(iteritems(counts.chars)):
        files = sorted(usage, key=spine_then_name)
        yield Char(char_id, safe_chr(codepoint), codepoint, files, counts.counter[codepoint])


# Records describing CSS rules and the elements they match (see css_data())
CSSRule = namedtuple('CSSRule', ('selector', 'location'))
RuleLocation = namedtuple('RuleLocation', ('file_name', 'line', 'column'))
MatchLocation = namedtuple('MatchLocation', ('tag', 'sourceline'))
CSSEntry = namedtuple('CSSEntry', ('rule', 'count', 'matched_files', 'sort_key'))
CSSFileMatch = namedtuple('CSSFileMatch', ('file_name', 'locations', 'sort_key'))

# Records describing class names and the elements that carry them (see css_data())
ClassEntry = namedtuple('ClassEntry', ('cls', 'num_of_matches', 'matched_files', 'sort_key'))
ClassFileMatch = namedtuple('ClassFileMatch', ('file_name', 'class_elements', 'sort_key'))
ClassElement = namedtuple('ClassElement', ('name', 'line_number', 'text_on_line', 'tag', 'matched_rules'))


def css_data(container, book_locale, result_data, *args):
    """Collect usage data for the CSS rules and classes in the book.

    Every stylesheet file is parsed, as are the ``<style>`` elements of the
    documents in the spine. For each spine document, every rule from its
    linked and inline sheets (following ``@import``) is matched against the
    document.

    :param container: The book container
    :param book_locale: Unused here
    :param result_data: A mapping; the per-class data (a list of ClassEntry)
        is stored in it under the key ``'classes'``
    :return: A list of CSSEntry, one per rule that was looked at
    """
    import tinycss
    from tinycss.css21 import RuleSet, ImportRule

    def css_rules(file_name, rules, sourceline=0):
        # Flatten parsed rules into a list of CSSRule records. An @import
        # of an existing file is represented by the name of that file, so
        # that it can be expanded later by rules_in_sheet().
        ans = []
        for rule in rules:
            if isinstance(rule, RuleSet):
                selector = rule.selector.as_css()
                ans.append(CSSRule(selector, RuleLocation(file_name, sourceline + rule.line, rule.column)))
            elif isinstance(rule, ImportRule):
                import_name = safe_href_to_name(container, rule.uri, file_name)
                if import_name and container.exists(import_name):
                    ans.append(import_name)
            elif getattr(rule, 'rules', False):
                # A rule that contains nested rules, recurse into it
                ans.extend(css_rules(file_name, rule.rules, sourceline))
        return ans

    parser = tinycss.make_full_parser()
    importable_sheets = {}
    html_sheets = {}
    spine_names = {name for name, is_linear in container.spine_names}
    style_path, link_path = XPath('//h:style'), XPath('//h:link/@href')

    for name, mt in iteritems(container.mime_map):
        if mt in OEB_STYLES:
            importable_sheets[name] = css_rules(name, parser.parse_stylesheet(container.raw_data(name)).rules)
        elif mt in OEB_DOCS and name in spine_names:
            html_sheets[name] = []
            for style in style_path(container.parsed(name)):
                if style.get('type', 'text/css') == 'text/css' and style.text:
                    # Rule line numbers are offset by the line of the <style> tag
                    html_sheets[name].append(
                        css_rules(name, parser.parse_stylesheet(force_unicode(style.text, 'utf-8')).rules, style.sourceline - 1))

    rule_map = defaultdict(lambda : defaultdict(list))

    def rules_in_sheet(sheet):
        # Yield all the CSSRule records of a sheet, expanding @import entries
        for rule in sheet:
            if isinstance(rule, CSSRule):
                yield rule
            else:  # @import rule
                isheet = importable_sheets.get(rule)
                if isheet is not None:
                    yield from rules_in_sheet(isheet)

    def sheets_for_html(name, root):
        # Yield the parsed sheets that a document pulls in via <link href>
        for href in link_path(root):
            tname = safe_href_to_name(container, href, name)
            sheet = importable_sheets.get(tname)
            if sheet is not None:
                yield sheet

    tt_cache = {}

    def tag_text(elem):
        # Return the opening tag of elem as text, for example <p class="x">.
        # The result must be returned on every path (cache hit, element with
        # attributes, element without attributes), otherwise callers would
        # record None as the tag.
        ans = tt_cache.get(elem)
        if ans is None:
            tag = elem.tag.rpartition('}')[-1]
            if elem.attrib:
                attribs = ' '.join('{}="{}"'.format(k, prepare_string_for_xml(elem.get(k, ''), True)) for k in elem.keys())
                ans = f'<{tag} {attribs}>'
            else:
                ans = '<%s>' % tag
            tt_cache[elem] = ans
        return ans

    def matches_for_selector(selector, select, class_map, rule):
        # Return the locations matched by selector. As a side effect, rule is
        # recorded in class_map against every class, of a matched element or
        # one of its ancestors, that appears in the selector.
        lsel = selector.lower()
        try:
            matches = tuple(select(selector))
        except SelectorError:
            return ()
        seen = set()

        def get_elem_and_ancestors(elem):
            p = elem
            while p is not None:
                if p not in seen:
                    yield p
                    seen.add(p)
                p = p.getparent()

        for e in matches:
            for elem in get_elem_and_ancestors(e):
                for cls in elem.get('class', '').split():
                    if '.' + cls.lower() in lsel:
                        class_map[cls][elem].append(rule)

        return (MatchLocation(tag_text(elem), elem.sourceline) for elem in matches)

    class_map = defaultdict(lambda : defaultdict(list))

    for name, inline_sheets in iteritems(html_sheets):
        root = container.parsed(name)
        cmap = defaultdict(lambda : defaultdict(list))
        # Pre-populate with all classes in use, so that classes that are not
        # matched by any rule are reported as well
        for elem in root.xpath('//*[@class]'):
            for cls in elem.get('class', '').split():
                cmap[cls][elem] = []
        select = Select(root, ignore_inappropriate_pseudo_classes=True)
        for sheet in chain(sheets_for_html(name, root), inline_sheets):
            for rule in rules_in_sheet(sheet):
                rule_map[rule][name].extend(matches_for_selector(rule.selector, select, cmap, rule))
        for cls, elem_map in iteritems(cmap):
            class_elements = class_map[cls][name]
            for elem, usage in iteritems(elem_map):
                class_elements.append(
                    ClassElement(name, elem.sourceline, elem.get('class'), tag_text(elem), tuple(usage)))

    result_data['classes'] = ans = []
    for cls, name_map in iteritems(class_map):
        la = tuple(ClassFileMatch(name, tuple(class_elements), numeric_sort_key(name)) for name, class_elements in iteritems(name_map) if class_elements)
        num_of_matches = sum(sum(len(ce.matched_rules) for ce in cfm.class_elements) for cfm in la)
        ans.append(ClassEntry(cls, num_of_matches, la, numeric_sort_key(cls)))

    ans = []
    for rule, loc_map in iteritems(rule_map):
        la = tuple(CSSFileMatch(name, tuple(locations), numeric_sort_key(name)) for name, locations in iteritems(loc_map) if locations)
        count = sum(len(fm.locations) for fm in la)
        ans.append(CSSEntry(rule, count, la, numeric_sort_key(rule.selector)))

    return ans


def gather_data(container, book_locale):
    """Run every report collector and return ``(data, timing)``.

    For each section name, the module-level function ``<section>_data`` is
    called with the container, the book locale and the ``data`` dict built
    so far. Generator results are materialised into tuples. ``timing`` maps
    each section name to the wall-clock seconds its collection took.
    """
    data, timing = {}, {}
    module_namespace = globals()
    for section in ('files', 'chars', 'images', 'links', 'words', 'css'):
        started = time.time()
        collector = module_namespace[section + '_data']
        result = collector(container, book_locale, data)
        if isinstance(result, types.GeneratorType):
            # Consume the generator here so its cost is included in the timing
            result = tuple(result)
        data[section] = result
        timing[section] = time.time() - started
    return data, timing

Zerion Mini Shell 1.0