%PDF- %PDF-
Mini Shell

Mini Shell

Direktori : /lib/calibre/calibre/ebooks/oeb/iterator/
Upload File :
Create Path :
Current File : //lib/calibre/calibre/ebooks/oeb/iterator/spine.py

#!/usr/bin/env python3


__license__   = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

import re, os
from functools import partial
from operator import attrgetter
from collections import namedtuple

from calibre import guess_type, replace_entities
from calibre.ebooks.chardet import xml_to_unicode


def character_count(html):
    '''
    Count the "significant" text characters in a string of HTML.

    Only text sitting between a ``>`` and the next ``<`` is considered, and
    every run of whitespace inside such a chunk is counted as one character.
    '''
    collapse_whitespace = re.compile(r'\s+').sub
    # Each chunk still carries its delimiting '>' and '<', hence the -2.
    return sum(
        len(collapse_whitespace(' ', chunk)) - 2
        for chunk in re.findall(r'>[^<]+<', html))


def anchor_map(html):
    '''
    Map every anchor name (the value of an ``id`` or ``name`` attribute) to
    the offset in html at which that attribute starts. When a name occurs
    more than once, the offset of its first occurrence is kept.
    '''
    offsets = {}
    attribute = re.compile(r'''(?:id|name)\s*=\s*['"]([^'"]+)['"]''')
    for found in attribute.finditer(html):
        # setdefault leaves an already recorded (earlier) offset untouched
        offsets.setdefault(found.group(1), found.start())
    return offsets


def all_links(html):
    '''
    Return the set of href values found on <a> tags in html, each passed
    through replace_entities().
    '''
    link = re.compile(
        r'''<\s*[Aa]\s+.*?[hH][Rr][Ee][Ff]\s*=\s*(['"])(.+?)\1''',
        re.MULTILINE|re.DOTALL)
    # Group 1 is the opening quote character, group 2 the href value itself.
    return {replace_entities(found.group(2)) for found in link.finditer(html)}


class SpineItem(str):
    '''
    The path to a spine file, as a string, carrying data read from that file.

    Creating an instance reads the file at path and stores on the new
    string: ``encoding``, ``character_count``, ``anchor_map``, ``all_links``
    and ``mime_type``, plus the placeholders ``verified_links``,
    ``start_page``, ``pages``, ``max_page``, ``index_entries`` and
    ``is_single_page``.

    :param path: Path of the file. A trailing ``#fragment`` is dropped when
        only the path without it exists on disk.
    :param mime_type: MIME type to record; guessed via guess_type() when None.
    :param read_anchor_map: If False, ``anchor_map`` is left as an empty dict.
    :param run_char_count: If False, ``character_count`` is set to 10000
        instead of being computed.
    :param from_epub: If True, try decoding as UTF-8 before falling back to
        xml_to_unicode().
    :param read_links: If False, ``all_links`` is left as an empty set.
    '''

    def __new__(cls, path, mime_type=None, read_anchor_map=True,
            run_char_count=True, from_epub=False, read_links=True):
        without_fragment = path.partition('#')[0]
        if not os.path.exists(path):
            if os.path.exists(without_fragment):
                path = without_fragment
        item = super().__new__(cls, path)

        with lopen(path, 'rb') as stream:
            data = stream.read()

        if from_epub:
            # The EPUB specs require HTML to be encoded in utf-8 or utf-16,
            # and there are EPUB files in circulation whose utf-8 encoded
            # HTML carries an incorrect encoding declaration. See
            # http://www.idpf.org/epub/20/spec/OPS_2.0.1_draft.htm#Section1.4.1.2
            # http://www.idpf.org/epub/30/spec/epub30-publications.html#confreq-xml-enc
            # https://bugs.launchpad.net/bugs/1188843
            # So utf-8 is tried first and xml_to_unicode is used only when
            # that fails. This is the same algorithm as the conversion
            # pipeline uses (modulo some BOM based detection).
            try:
                decoded = data.decode('utf-8'), 'utf-8'
            except UnicodeDecodeError:
                decoded = xml_to_unicode(data)
        else:
            decoded = xml_to_unicode(data)
        text, item.encoding = decoded

        if run_char_count:
            item.character_count = character_count(text)
        else:
            item.character_count = 10000
        if read_anchor_map:
            item.anchor_map = anchor_map(text)
        else:
            item.anchor_map = {}
        if read_links:
            item.all_links = all_links(text)
        else:
            item.all_links = set()

        item.verified_links = set()
        item.start_page = item.pages = item.max_page = -1
        item.index_entries = []
        item.mime_type = guess_type(item)[0] if mime_type is None else mime_type
        item.is_single_page = None
        return item


class IndexEntry:
    '''
    Where a single table of contents entry sits within the spine.

    Attributes set on construction: ``num``, ``text``, ``key``, ``anchor``,
    ``start_anchor``, ``spine_pos`` (-1 when the entry's file is not in the
    spine), ``anchor_pos``, ``depth``, ``sort_key`` and ``spine_count``.
    find_end() adds ``end_spine_pos`` and ``end_anchor``.
    '''

    def __init__(self, spine, toc_entry, num):
        '''
        :param spine: Sequence of spine items; looked up by ``toc_entry.abspath``
        :param toc_entry: Object providing ``text``, ``abspath``, ``fragment``
            and ``parent``
        :param num: Number stored as ``self.num``
        '''
        self.num = num
        self.text = toc_entry.text or _('Unknown')
        self.key = toc_entry.abspath
        self.anchor = self.start_anchor = toc_entry.fragment or None

        try:
            position = spine.index(self.key)
        except ValueError:
            position = -1
        self.spine_pos = position
        if position < 0:
            self.anchor_pos = 0
        else:
            # Anchors missing from the file's anchor map count as offset 0
            self.anchor_pos = spine[position].anchor_map.get(self.anchor, 0)

        # Depth is the number of ancestors above the entry
        depth, ancestor = 0, toc_entry.parent
        while ancestor is not None:
            depth += 1
            ancestor = ancestor.parent
        self.depth = depth

        self.sort_key = (self.spine_pos, self.anchor_pos)
        self.spine_count = len(spine)

    def find_end(self, all_entries):
        '''
        Set ``end_spine_pos`` and ``end_anchor`` from the first entry in
        all_entries that is no deeper than this one and lies after it. With
        no such entry, the end is the last spine position and no anchor.

        all_entries is expected to be sorted by (spine_pos, anchor_pos).
        '''
        def terminates(other):
            if other.depth > self.depth:
                return False
            if other.spine_pos == self.spine_pos:
                return other.anchor_pos > self.anchor_pos
            return other.spine_pos > self.spine_pos

        ender = next(filter(terminates, all_entries), None)
        if ender is None:
            self.end_spine_pos = self.spine_count - 1
            self.end_anchor = None
        else:
            self.end_spine_pos = ender.spine_pos
            self.end_anchor = ender.anchor


def create_indexing_data(spine, toc):
    '''
    Fill the ``index_entries`` list of every item in spine with one record
    per table of contents entry that touches that item.

    Each record is a namedtuple ``(entry, start_anchor, end_anchor)``. The
    two anchors are only filled in for the spine item in which the entry
    starts; for every other item the entry touches they are None.

    Does nothing when toc is empty or None.
    '''
    if not toc:
        return

    # flat() includes toc itself (skipped here); entries are numbered by
    # their position in flat() minus one.
    entries = [
        IndexEntry(spine, node, position - 1)
        for position, node in enumerate(toc.flat())
        if node is not toc
    ]
    index_entries = sorted(entries, key=attrgetter('sort_key'))
    for entry in index_entries:
        entry.find_end(index_entries)

    ie = namedtuple('IndexEntry', 'entry start_anchor end_anchor')

    for position, item in enumerate(spine):
        for entry in index_entries:
            if entry.spine_pos <= position <= entry.end_spine_pos:
                if entry.spine_pos == position:
                    start, end = entry.anchor, entry.end_anchor
                else:
                    start = end = None
                item.index_entries.append(ie(entry, start, end))

Zerion Mini Shell 1.0