%PDF- %PDF-
Mini Shell

Mini Shell

Direktori : /usr/lib/calibre/calibre/ebooks/oeb/polish/check/
Upload File :
Create Path :
Current File : //usr/lib/calibre/calibre/ebooks/oeb/polish/check/links.py

#!/usr/bin/env python3


__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'

import os
from collections import defaultdict
from threading import Thread

from calibre import browser
from calibre.ebooks.oeb.base import OEB_DOCS, OEB_STYLES, urlunquote, XHTML_MIME
from calibre.ebooks.oeb.polish.container import OEB_FONTS
from calibre.ebooks.oeb.polish.parsing import parse_html5
from calibre.ebooks.oeb.polish.replace import remove_links_to
from calibre.ebooks.oeb.polish.cover import get_raster_cover_name
from calibre.ebooks.oeb.polish.utils import guess_type, actual_case_for_name, corrected_case_for_name
from calibre.ebooks.oeb.polish.check.base import BaseError, WARN, INFO
from polyglot.builtins import iteritems, itervalues
from polyglot.urllib import urlparse
from polyglot.queue import Queue, Empty


class BadLink(BaseError):
    """Warning-level problem with a link.

    The other link related problems defined in this module derive from this
    class and override the help text and/or provide an automatic fix.
    """

    level = WARN
    HELP = _(
        'The resource pointed to by this link does not exist.'
        ' You should either fix, or remove the link.'
    )


class InvalidCharInLink(BadLink):
    """Reported for a link whose path contains a ``:`` character."""

    HELP = _(
        'Windows computers do not allow the : character in filenames.'
        ' For maximum compatibility it is best to not use these in'
        ' filenames/links to files.'
    )


class CaseMismatch(BadLink):
    """A link whose case differs from the case of the file it points to.

    Calling the instance with a container rewrites every link in the
    containing file that is exactly equal to the offending href so that it
    points at the correctly cased file name.
    """

    def __init__(self, href, corrected_name, name, lnum, col):
        message = _('The linked to resource {0} does not exist').format(href)
        BadLink.__init__(self, message, name, line=lnum, col=col)
        self.href = href
        self.corrected_name = corrected_name
        self.INDIVIDUAL_FIX = _('Change the case of the link to match the actual file')
        help_template = _(
            'The case of the link {0} and the case of the actual file it points'
            ' to {1} do not agree. You should change either the case of the link'
            ' or rename the file.'
        )
        self.HELP = help_template.format(href, corrected_name)

    def __call__(self, container):
        """Apply the fix; return True if at least one link was rewritten."""
        fragment = urlparse(self.href).fragment
        replacement = container.name_to_href(self.corrected_name, self.name)
        if fragment:
            # Keep the original anchor on the corrected link
            replacement = replacement + '#' + fragment
        offending = self.href

        class LinkReplacer:
            # Flipped to True on the instance the first time a link is swapped
            replaced = False

            def __call__(self, url):
                if url == offending:
                    self.replaced = True
                    return replacement
                return url

        swapper = LinkReplacer()
        container.replace_links(self.name, swapper)
        return swapper.replaced


class BadDestinationType(BaseError):
    """Warning for a link whose destination is not a text (HTML) document.

    :param link_source: Name of the file that contains the link.
    :param link_dest: Name of the file the link points to.
    :param link_elem: The element carrying the link. Its ``sourceline`` is
        used as the error line and its ``href`` (or, failing that, ``src``)
        attribute is reported as the offending link.
    """

    level = WARN

    def __init__(self, link_source, link_dest, link_elem):
        BaseError.__init__(self, _('Link points to a file that is not a text document'), link_source, line=link_elem.sourceline)
        # Links coming from NCX <content> elements live in the "src"
        # attribute, not "href". Fall back to it so that the message and
        # bad_href do not end up as None for those links.
        href = link_elem.get('href')
        if href is None:
            href = link_elem.get('src')
        self.HELP = _('The link "{0}" points to a file <i>{1}</i> that is not a text (HTML) document.'
                      ' Many e-book readers will be unable to follow such a link. You should'
                      ' either remove the link or change it to point to a text document.'
                      ' For example, if it points to an image, you can create small wrapper'
                      ' document that contains the image and change the link to point to that.').format(
                          href, link_dest)
        self.bad_href = href


class BadDestinationFragment(BaseError):
    """Warning for a link whose fragment is not present in the target file.

    :param link_source: Name of the file that contains the link.
    :param link_dest: Name of the file the link points to.
    :param link_elem: The element carrying the link. Its ``sourceline`` is
        used as the error line and its ``href`` (or, failing that, ``src``)
        attribute is reported as the offending link.
    :param fragment: The fragment (the part after ``#``) that was not found.
    """

    level = WARN

    def __init__(self, link_source, link_dest, link_elem, fragment):
        BaseError.__init__(self, _('Link points to a location not present in the target file'), link_source, line=link_elem.sourceline)
        # Links coming from NCX <content> elements live in the "src"
        # attribute, not "href". Fall back to it so that the message and
        # bad_href do not end up as None for those links.
        href = link_elem.get('href')
        if href is None:
            href = link_elem.get('src')
        self.bad_href = href
        self.HELP = _('The link "{0}" points to a location <i>{1}</i> in the file {2} that does not exist.'
                      ' You should either remove the location so that the link points to the top of the file,'
                      ' or change the link to point to the correct location.').format(
                          self.bad_href, fragment, link_dest)


class FileLink(BadLink):
    """Reported for a link that uses the ``file://`` URL scheme."""

    HELP = _(
        'This link uses the file:// URL scheme.'
        ' This does not work with many e-book readers.'
        ' Remove the file:// prefix and make sure the link points to a file'
        ' inside the book.'
    )


class LocalLink(BadLink):
    """Reported for a link that points to a file outside the book."""

    HELP = _(
        'This link points to a file outside the book.'
        ' It will not work if the book is read on any computer other than the'
        ' one it was created on. Either fix or remove the link.'
    )


class EmptyLink(BadLink):
    """Reported for a link that has no destination at all."""

    HELP = _(
        'This link is empty. This is almost always a mistake.'
        ' Either fill in the link destination or remove the link tag.'
    )


class UnreferencedResource(BadLink):
    """Reported for a file in the book that no spine document refers to."""

    # The double space after "You should" is part of the original message and
    # is kept so that the translatable string stays unchanged.
    HELP = _(
        'This file is included in the book but not referred to by any document'
        ' in the spine. This means that the file will not be viewable on most'
        ' e-book readers. You should  probably remove this file from the book'
        ' or add a link to it somewhere.'
    )

    def __init__(self, name):
        message = _('The file %s is not referenced') % name
        BadLink.__init__(self, message, name)


class UnreferencedDoc(UnreferencedResource):
    """Reported for a content document that is not part of the spine.

    The automatic fix appends an ``itemref`` for the file to the OPF spine,
    adding the file to the manifest first if it is not already listed.
    """

    INDIVIDUAL_FIX = _('Append this file to the spine')
    HELP = _(
        'This file is not in the book spine.'
        ' All content documents must be in the spine.'
        ' You should probably add it to the spine.'
    )

    def __call__(self, container):
        """Append this file to the spine of *container*. Always returns True."""
        from calibre.ebooks.oeb.base import OPF
        # Invert the id -> name mapping so the manifest id can be looked up by name
        id_for_name = dict((item_name, item_id) for item_id, item_name in iteritems(container.manifest_id_map))
        try:
            manifest_id = id_for_name[self.name]
        except KeyError:
            manifest_id = container.add_name_to_manifest(self.name)
        spine = container.opf_xpath('//opf:spine')[0]
        itemref = spine.makeelement(OPF('itemref'), idref=manifest_id)
        container.insert_into_xml(spine, itemref)
        container.dirty(container.opf_name)
        return True


class Unmanifested(BadLink):
    """Reported for a file that is not listed in the book manifest.

    :param name: Name of the unmanifested file.
    :param unreferenced: ``None`` means no automatic fix is offered. A true
        value offers removing the file from the book, a false (non-None)
        value offers adding it to the manifest.
    """

    HELP = _(
        'This file is not listed in the book manifest.'
        ' While not strictly necessary it is good practice to list all files in'
        ' the manifest. Either list this file in the manifest or remove it from'
        ' the book if it is an unnecessary file.'
    )

    def __init__(self, name, unreferenced=None):
        message = _('The file %s is not listed in the manifest') % name
        BadLink.__init__(self, message, name)
        self.file_action = None
        if unreferenced is None:
            return
        if unreferenced:
            self.INDIVIDUAL_FIX = _('Remove %s from the book') % name
            self.file_action = 'remove'
        else:
            self.INDIVIDUAL_FIX = _('Add %s to the manifest') % name
            self.file_action = 'add'

    def __call__(self, container):
        """Remove the file or add it to the manifest. Always returns True."""
        if self.file_action == 'remove':
            container.remove_item(self.name)
            return True
        # Any other action: make sure the file is listed in the manifest
        manifested = set(itervalues(container.manifest_id_map))
        if self.name not in manifested:
            container.add_name_to_manifest(self.name)
        return True


class DanglingLink(BadLink):
    """A link to a resource that does not exist in the book.

    The automatic fix removes the links to ``target_name`` using
    ``remove_links_to``.
    """

    def __init__(self, text, target_name, name, lnum, col):
        BadLink.__init__(self, text, name, lnum, col)
        self.target_name = target_name
        self.INDIVIDUAL_FIX = _('Remove all references to %s from the HTML and CSS in the book') % target_name

    def __call__(self, container):
        """Remove links to the missing target; return True if anything changed."""

        def is_target(name, *a):
            # Extra positional arguments passed by remove_links_to are ignored
            return name == self.target_name

        changed = remove_links_to(container, is_target)
        return bool(changed)


class Bookmarks(BadLink):
    """Informational notice that the calibre viewer bookmarks file is present.

    The automatic fix removes the file from the book.
    """

    level = INFO
    INDIVIDUAL_FIX = _('Remove this file')
    HELP = _(
        'This file stores the bookmarks and last opened information from the'
        ' calibre E-book viewer. You can remove it if you do not need that'
        " information, or don't want to share it with other people you send"
        ' this book to.'
    )

    def __init__(self, name):
        message = _('The bookmarks file used by the calibre E-book viewer is present')
        BadLink.__init__(self, message, name)

    def __call__(self, container):
        """Remove the bookmarks file from *container*. Always returns True."""
        container.remove_item(self.name)
        return True


class MimetypeMismatch(BaseError):
    """Warning for a file whose MIME type in the OPF disagrees with its extension.

    :param container: The book container; the error is attached to its OPF file.
    :param name: Name of the file with the mismatched MIME type.
    :param opf_mt: MIME type declared for the file in the OPF.
    :param ext_mt: MIME type recommended for the file's extension.

    The automatic fix either renames the file to ``.xhtml`` (when the declared
    type is an OEB document type and the file is in the spine) or changes the
    declared MIME type in the OPF to ``ext_mt``.
    """

    level = WARN

    @staticmethod
    def _split_ext(name):
        """Return ``(stem, extension)`` for *name*; extension has no leading dot.

        Only the last ``/`` separated component is examined, so a dot in a
        folder name is never mistaken for the start of an extension, and a
        name without an extension yields ``(name, '')``.
        """
        import posixpath
        stem, ext = posixpath.splitext(name)
        return stem, ext[1:]

    def __init__(self, container, name, opf_mt, ext_mt):
        self.opf_mt, self.ext_mt = opf_mt, ext_mt
        self.file_name = name
        BaseError.__init__(self, _('The file %s has a MIME type that does not match its extension') % name, container.opf_name)
        ext = self._split_ext(name)[1]
        self.HELP = _('The file {0} has its MIME type specified as {1} in the OPF file.'
                      ' The recommended MIME type for files with the extension "{2}" is {3}.'
                      ' You should change either the file extension or the MIME type in the OPF.').format(
                          name, opf_mt, ext, ext_mt)
        if opf_mt in OEB_DOCS and name in {n for n, l in container.spine_names}:
            self.INDIVIDUAL_FIX = _('Change the file extension to .xhtml')
            self.change_ext_to = 'xhtml'
        else:
            self.INDIVIDUAL_FIX = _('Change the MIME type for this file in the OPF to %s') % ext_mt
            self.change_ext_to = None

    def __call__(self, container):
        """Apply the fix to *container*; return True if anything was changed."""
        changed = False
        if self.change_ext_to is not None:
            from calibre.ebooks.oeb.polish.replace import rename_files
            # Splitting on the last '.' of the whole name would turn an
            # extension-less file into ".xhtml" and would truncate names whose
            # only dot is in a folder component, so split the basename only.
            stem = self._split_ext(self.file_name)[0]
            new_name = stem + '.' + self.change_ext_to
            c = 0
            # Append a counter until the new name does not clash with an existing file
            while container.has_name(new_name):
                c += 1
                new_name = stem + ('%d.' % c) + self.change_ext_to
            rename_files(container, {self.file_name:new_name})
            changed = True
        else:
            for item in container.opf_xpath('//opf:manifest/opf:item[@href and @media-type="%s"]' % self.opf_mt):
                name = container.href_to_name(item.get('href'), container.opf_name)
                if name == self.file_name:
                    changed = True
                    item.set('media-type', self.ext_mt)
                    container.mime_map[name] = self.ext_mt
            if changed:
                container.dirty(container.opf_name)
        return changed


def check_mimetypes(container):
    """Return a MimetypeMismatch for every file whose MIME type in
    ``container.mime_map`` differs from ``container.guess_type(name)``.

    Page map files (``application/oebps-page-map+xml``) with a ``.xml``
    extension are not reported.
    """
    mismatches = []
    for name, declared in iteritems(container.mime_map):
        guessed = container.guess_type(name)
        if declared == guessed:
            continue
        if declared == 'application/oebps-page-map+xml' and name.lower().endswith('.xml'):
            continue
        mismatches.append(MimetypeMismatch(container, name, declared, guessed))
    return mismatches


def check_link_destination(container, dest_map, name, href, a, errors):
    """Check that a single link points to a location inside an HTML file.

    :param container: The book container.
    :param dest_map: Cache mapping file names to the set of ``id``/``name``
        attribute values found in that file. Filled in lazily by this function.
    :param name: Name of the file that contains the link.
    :param href: The link itself.
    :param a: The element carrying the link, passed on to the error objects.
    :param errors: List to which BadDestinationType/BadDestinationFragment
        errors are appended.

    Links that cannot be resolved to a file in ``container.mime_map`` are
    silently ignored here.
    """
    if href.startswith('#'):
        tname = name
    else:
        try:
            tname = container.href_to_name(href, name)
        except ValueError:
            tname = None  # Absolute links to files on another drive in windows cause this
    if not tname or tname not in container.mime_map:
        return
    if container.mime_map[tname] not in OEB_DOCS:
        errors.append(BadDestinationType(name, tname, a))
        return
    root = container.parsed(tname)
    if not hasattr(root, 'xpath'):
        # The target claims to be a document but did not parse into a tree
        errors.append(BadDestinationType(name, tname, a))
        return
    if tname not in dest_map:
        dest_map[tname] = set(root.xpath('//*/@id|//*/@name'))
    fragment = urlparse(href).fragment
    if not fragment or fragment in dest_map[tname]:
        return
    # A fragment may be percent-encoded in the link (e.g. "caf%C3%A9" for an
    # id of "café"), so also try the decoded form before reporting it missing.
    try:
        decoded = urlunquote(fragment)
    except ValueError:
        # Presumably malformed escapes that cannot be decoded; only the raw
        # form can match, and that was already tried above.
        decoded = fragment
    if decoded not in dest_map[tname]:
        errors.append(BadDestinationFragment(name, tname, a, fragment))


def check_link_destinations(container):
    """Check destinations of links that point to HTML files.

    Examines ``<a href>`` elements in HTML documents, ``<reference href>``
    entries in the OPF and ``<content src>`` elements in the NCX, and returns
    the list of errors found.
    """
    errors = []
    dest_map = {}
    opf_type, ncx_type = guess_type('a.opf'), guess_type('a.ncx')
    cover_reference_types = {'cover', 'other.ms-coverimage-standard', 'other.ms-coverimage'}

    def verify(source_name, elem, attr):
        # attr names the attribute that holds the link on this kind of element
        check_link_destination(container, dest_map, source_name, elem.get(attr), elem, errors)

    for name, mt in iteritems(container.mime_map):
        if mt in OEB_DOCS:
            for elem in container.parsed(name).xpath('//*[local-name()="a" and @href]'):
                verify(name, elem, 'href')
        elif mt == opf_type:
            for elem in container.opf_xpath('//opf:reference[@href]'):
                # Cover references in AZW3 books are not checked
                if container.book_type == 'azw3' and elem.get('type') in cover_reference_types:
                    continue
                verify(name, elem, 'href')
        elif mt == ncx_type:
            for elem in container.parsed(name).xpath('//*[local-name() = "content" and @src]'):
                verify(name, elem, 'src')

    return errors


def check_links(container):
    """Check the links inside the book and the files they refer to.

    Walks the links of every HTML document, stylesheet, OPF and NCX file in
    ``container.mime_map`` and reports empty links, links to folders, case
    mismatches, dangling links, ``file://`` links, links to absolute local
    paths and links containing a ``:``. It then reports files that are not
    referenced from the spine, files missing from the manifest and the
    calibre viewer bookmarks file.

    :param container: The book container to check.
    :return: List of error objects, in the order they were detected.
    """
    # Maps the name of a file to the set of (manifested) names it links to
    links_map = defaultdict(set)
    xml_types = {guess_type('a.opf'), guess_type('a.ncx')}
    errors = []
    a = errors.append

    def fl(x):
        # repr() of x for use in messages, without a leading "u" prefix
        x = repr(x)
        if x.startswith('u'):
            x = x[1:]
        return x

    for name, mt in iteritems(container.mime_map):
        if mt in OEB_DOCS or mt in OEB_STYLES or mt in xml_types:
            for href, lnum, col in container.iterlinks(name):
                if not href:
                    a(EmptyLink(_('The link is empty'), name, lnum, col))
                # NOTE(review): an empty href is still processed below;
                # presumably href_to_name() returns None for it so nothing
                # further is reported -- confirm against href_to_name().
                try:
                    tname = container.href_to_name(href, name)
                except ValueError:
                    tname = None  # Absolute paths to files on another drive in windows cause this
                if tname is not None:
                    if container.exists(tname):
                        if tname in container.mime_map:
                            links_map[name].add(tname)
                        else:
                            # Filesystem says the file exists, but it is not in
                            # the mime_map, so either there is a case mismatch
                            # or the link is a directory
                            apath = container.name_to_abspath(tname)
                            if os.path.isdir(apath):
                                a(BadLink(_('The linked resource %s is a folder') % fl(href), name, lnum, col))
                            else:
                                a(CaseMismatch(href, actual_case_for_name(container, tname), name, lnum, col))
                    else:
                        # Target does not exist: offer a case correction if one
                        # is found, otherwise report the link as dangling
                        cname = corrected_case_for_name(container, tname)
                        if cname is not None:
                            a(CaseMismatch(href, cname, name, lnum, col))
                        else:
                            a(DanglingLink(_('The linked resource %s does not exist') % fl(href), tname, name, lnum, col))
                else:
                    # The href could not be mapped to a name in the book
                    purl = urlparse(href)
                    if purl.scheme == 'file':
                        a(FileLink(_('The link %s is a file:// URL') % fl(href), name, lnum, col))
                    elif purl.path and purl.path.startswith('/') and purl.scheme in {'', 'file'}:
                        a(LocalLink(_('The link %s points to a file outside the book') % fl(href), name, lnum, col))
                    elif purl.path and purl.scheme in {'', 'file'} and ':' in urlunquote(purl.path):
                        a(InvalidCharInLink(_('The link %s contains a : character, this will cause errors on Windows computers') % fl(href), name, lnum, col))

    spine_docs = {name for name, linear in container.spine_names}
    # Stylesheets linked directly from spine documents
    spine_styles = {tname for name in spine_docs for tname in links_map[name] if container.mime_map.get(tname, None) in OEB_STYLES}
    num = -1
    # Keep expanding until no new stylesheets are discovered
    while len(spine_styles) > num:
        # Handle import rules in stylesheets
        num = len(spine_styles)
        spine_styles |= {tname for name in spine_styles for tname in links_map[name] if container.mime_map.get(tname, None) in OEB_STYLES}
    seen = set(OEB_DOCS) | set(OEB_STYLES)
    # Everything else (neither document nor stylesheet) linked from spine documents/styles
    spine_resources = {tname for name in spine_docs | spine_styles for tname in links_map[name] if container.mime_map[tname] not in seen}
    # Names reported as unreferenced; used below to pick the fix offered for unmanifested files
    unreferenced = set()

    cover_name = container.guide_type_map.get('cover', None)
    nav_items = frozenset(container.manifest_items_with_property('nav'))

    for name, mt in iteritems(container.mime_map):
        if mt in OEB_STYLES and name not in spine_styles:
            a(UnreferencedResource(name))
        elif mt in OEB_DOCS and name not in spine_docs and name not in nav_items:
            a(UnreferencedDoc(name))
        elif (mt in OEB_FONTS or mt.partition('/')[0] in {'image', 'audio', 'video'}) and name not in spine_resources and name != cover_name:
            # The raster cover image is allowed to be unreferenced
            if mt.partition('/')[0] == 'image' and name == get_raster_cover_name(container):
                continue
            a(UnreferencedResource(name))
        else:
            continue
        unreferenced.add(name)

    manifest_names = set(itervalues(container.manifest_id_map))
    for name in container.mime_map:
        if name not in manifest_names and not container.ok_to_be_unmanifested(name):
            a(Unmanifested(name, unreferenced=name in unreferenced))
        if name == 'META-INF/calibre_bookmarks.txt':
            a(Bookmarks(name))

    return errors


def get_html_ids(raw_data):
    """Parse *raw_data* as HTML and return the set of anchor names in it.

    The result contains every ``id`` attribute found on ``<body>`` or its
    descendants and every ``name`` attribute of ``<a>`` elements inside
    ``<body>``.
    """
    anchors = set()
    tree = parse_html5(raw_data, discard_namespaces=True, line_numbers=False, fix_newlines=False)
    for body in tree.xpath('//body'):
        for expression in ('descendant-or-self::*/@id', 'descendant::a/@name'):
            anchors.update(body.xpath(expression))
    return anchors


def check_external_links(container, progress_callback=(lambda num, total:None), check_anchors=True):
    """Try to open every http/https link found in the book's HTML and CSS.

    :param container: The book container whose links are checked.
    :param progress_callback: Callable taking ``(num, total)``. Called with
        ``(0, 0)`` at the start, with ``(0, total)`` once the links have been
        collected and, from the worker threads, after each link is processed.
    :param check_anchors: If True, for links with a fragment that resolve to
        an HTML page, also verify that the fragment exists on that page.
    :return: List of ``(locations, error, full_href)`` tuples, one per failed
        link, where ``locations`` is the list of ``(name, href, lnum, col)``
        places the link occurs in the book. Empty if no external links exist.
    """
    progress_callback(0, 0)
    external_links = defaultdict(list)
    for name, mt in iteritems(container.mime_map):
        if mt in OEB_DOCS or mt in OEB_STYLES:
            for href, lnum, col in container.iterlinks(name):
                purl = urlparse(href)
                if purl.scheme in ('http', 'https'):
                    external_links[href].append((name, href, lnum, col))
    if not external_links:
        return []
    items = Queue()
    ans = []
    for el in iteritems(external_links):
        items.put(el)
    progress_callback(0, len(external_links))
    done = []
    # Cache of page URL (without fragment) -> ids found on that page
    downloaded_html_ids = {}

    def check_links():
        # Worker: drain the queue, recording failures in ans
        br = browser(honor_time=False, verify_ssl_certificates=False)
        while True:
            try:
                full_href, locations = items.get_nowait()
            except Empty:
                return
            href, frag = full_href.partition('#')[::2]
            try:
                res = br.open(href, timeout=10)
            except Exception as e:
                ans.append((locations, e, full_href))
            else:
                # Close the response even if inspecting it fails, so the
                # connection is not leaked
                try:
                    if frag and check_anchors:
                        ct = res.info().get('Content-Type')
                        # strip() tolerates whitespace around the media type,
                        # e.g. "text/html ; charset=utf-8"
                        if ct and ct.split(';')[0].strip().lower() in {'text/html', XHTML_MIME}:
                            ids = downloaded_html_ids.get(href)
                            if ids is None:
                                try:
                                    ids = downloaded_html_ids[href] = get_html_ids(res.read())
                                except Exception:
                                    # Unreadable/unparseable page: treat as having no anchors
                                    ids = downloaded_html_ids[href] = frozenset()
                            if frag not in ids:
                                ans.append((locations, ValueError(f'HTML anchor {frag} not found on the page'), full_href))
                finally:
                    res.close()
            finally:
                done.append(None)
                progress_callback(len(done), len(external_links))

    # At most ten links are checked concurrently
    workers = [Thread(name="CheckLinks", target=check_links) for i in range(min(10, len(external_links)))]
    for w in workers:
        w.daemon = True
        w.start()

    for w in workers:
        w.join()
    return ans

Zerion Mini Shell 1.0