%PDF- %PDF-
Direktori : /usr/lib/calibre/calibre/ebooks/oeb/polish/check/ |
Current File : //usr/lib/calibre/calibre/ebooks/oeb/polish/check/links.py |
#!/usr/bin/env python3
"""Link related checks for e-book containers (calibre ``polish.check``).

Defines the error classes reported for broken, suspicious or unreferenced
links and resources, and the ``check_*`` functions that scan a container and
return lists of those errors.

NOTE(review): ``_`` (the translation function) is not imported here; it is
expected to be available as a builtin at import time.
"""
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'

import os
import posixpath
from collections import defaultdict
from threading import Thread

from calibre import browser
from calibre.ebooks.oeb.base import OEB_DOCS, OEB_STYLES, urlunquote, XHTML_MIME
from calibre.ebooks.oeb.polish.container import OEB_FONTS
from calibre.ebooks.oeb.polish.parsing import parse_html5
from calibre.ebooks.oeb.polish.replace import remove_links_to
from calibre.ebooks.oeb.polish.cover import get_raster_cover_name
from calibre.ebooks.oeb.polish.utils import guess_type, actual_case_for_name, corrected_case_for_name
from calibre.ebooks.oeb.polish.check.base import BaseError, WARN, INFO
from polyglot.builtins import iteritems, itervalues
from polyglot.urllib import urlparse
from polyglot.queue import Queue, Empty


class BadLink(BaseError):
    """Base class for link problems; reported at WARN level."""

    HELP = _('The resource pointed to by this link does not exist. You should'
             ' either fix, or remove the link.')
    level = WARN


class InvalidCharInLink(BadLink):
    """A local link whose path contains a ``:`` character."""

    HELP = _('Windows computers do not allow the : character in filenames. For maximum'
             ' compatibility it is best to not use these in filenames/links to files.')


class CaseMismatch(BadLink):
    """A link that differs from the file it points to only by case.

    Calling the instance with a container rewrites the link in ``self.name``
    so that it uses ``corrected_name``.
    """

    def __init__(self, href, corrected_name, name, lnum, col):
        """Record the offending ``href``, found in file ``name`` at
        ``lnum``/``col``, and the ``corrected_name`` it should refer to."""
        BadLink.__init__(self, _('The linked to resource {0} does not exist').format(href), name, line=lnum, col=col)
        self.HELP = _('The case of the link {0} and the case of the actual file it points to {1}'
                      ' do not agree. You should change either the case of the link or rename the file.').format(
                          href, corrected_name)
        self.INDIVIDUAL_FIX = _('Change the case of the link to match the actual file')
        self.corrected_name = corrected_name
        self.href = href

    def __call__(self, container):
        """Replace the bad href with one built from ``corrected_name``.

        Returns True if at least one link in ``self.name`` was replaced.
        """
        frag = urlparse(self.href).fragment
        nhref = container.name_to_href(self.corrected_name, self.name)
        if frag:
            # Keep the original fragment on the corrected link
            nhref += '#' + frag
        orig_href = self.href

        class LinkReplacer:
            # Callable handed to container.replace_links(); remembers
            # whether it actually substituted anything.
            replaced = False

            def __call__(self, url):
                if url != orig_href:
                    return url
                self.replaced = True
                return nhref

        replacer = LinkReplacer()
        container.replace_links(self.name, replacer)
        return replacer.replaced


class BadDestinationType(BaseError):
    """A link whose target file is not a text (HTML) document."""

    level = WARN

    def __init__(self, link_source, link_dest, link_elem):
        """``link_source`` is the file containing ``link_elem``; ``link_dest``
        is the name of the file the link resolves to."""
        BaseError.__init__(self, _('Link points to a file that is not a text document'), link_source, line=link_elem.sourceline)
        self.HELP = _('The link "{0}" points to a file <i>{1}</i> that is not a text (HTML) document.'
                      ' Many e-book readers will be unable to follow such a link. You should'
                      ' either remove the link or change it to point to a text document.'
                      ' For example, if it points to an image, you can create small wrapper'
                      ' document that contains the image and change the link to point to that.').format(
                          link_elem.get('href'), link_dest)
        self.bad_href = link_elem.get('href')


class BadDestinationFragment(BaseError):
    """A link whose fragment does not match any id/name in the target file."""

    level = WARN

    def __init__(self, link_source, link_dest, link_elem, fragment):
        """``fragment`` is the missing location inside ``link_dest``."""
        BaseError.__init__(self, _('Link points to a location not present in the target file'), link_source, line=link_elem.sourceline)
        self.bad_href = link_elem.get('href')
        self.HELP = _('The link "{0}" points to a location <i>{1}</i> in the file {2} that does not exist.'
                      ' You should either remove the location so that the link points to the top of the file,'
                      ' or change the link to point to the correct location.').format(
                          self.bad_href, fragment, link_dest)


class FileLink(BadLink):
    """A link that uses the ``file://`` URL scheme."""

    HELP = _('This link uses the file:// URL scheme. This does not work with many e-book readers.'
             ' Remove the file:// prefix and make sure the link points to a file inside the book.')


class LocalLink(BadLink):
    """A link with an absolute local path, i.e. pointing outside the book."""

    HELP = _('This link points to a file outside the book. It will not work if the'
             ' book is read on any computer other than the one it was created on.'
             ' Either fix or remove the link.')


class EmptyLink(BadLink):
    """A link with an empty destination."""

    HELP = _('This link is empty. This is almost always a mistake. Either fill in the link destination or remove the link tag.')


class UnreferencedResource(BadLink):
    """A file in the book that nothing reachable from the spine links to."""

    HELP = _('This file is included in the book but not referred to by any document in the spine.'
             ' This means that the file will not be viewable on most e-book readers. You should '
             ' probably remove this file from the book or add a link to it somewhere.')

    def __init__(self, name):
        BadLink.__init__(self, _(
            'The file %s is not referenced') % name, name)


class UnreferencedDoc(UnreferencedResource):
    """A content document that is not in the spine.

    Calling the instance appends the document to the spine, adding it to the
    manifest first if needed.
    """

    HELP = _('This file is not in the book spine. All content documents must be in the spine.'
             ' You should probably add it to the spine.')
    INDIVIDUAL_FIX = _('Append this file to the spine')

    def __call__(self, container):
        """Append ``self.name`` to the OPF spine. Always returns True."""
        from calibre.ebooks.oeb.base import OPF
        # Reverse map: file name -> manifest id
        rmap = {v: k for k, v in iteritems(container.manifest_id_map)}
        if self.name in rmap:
            manifest_id = rmap[self.name]
        else:
            manifest_id = container.add_name_to_manifest(self.name)
        spine = container.opf_xpath('//opf:spine')[0]
        si = spine.makeelement(OPF('itemref'), idref=manifest_id)
        container.insert_into_xml(spine, si)
        container.dirty(container.opf_name)
        return True


class Unmanifested(BadLink):
    """A file present in the book but missing from the manifest.

    The fix is to remove the file when it is also unreferenced, otherwise to
    add it to the manifest.
    """

    HELP = _('This file is not listed in the book manifest. While not strictly necessary'
             ' it is good practice to list all files in the manifest. Either list this'
             ' file in the manifest or remove it from the book if it is an unnecessary file.')

    def __init__(self, name, unreferenced=None):
        """``unreferenced`` selects the fix: True -> remove the file,
        False -> add it to the manifest, None -> no fix text is set."""
        BadLink.__init__(self, _(
            'The file %s is not listed in the manifest') % name, name)
        self.file_action = None
        if unreferenced is not None:
            if unreferenced:
                self.INDIVIDUAL_FIX = _('Remove %s from the book') % name
                self.file_action = 'remove'
            else:
                self.INDIVIDUAL_FIX = _('Add %s to the manifest') % name
                self.file_action = 'add'

    def __call__(self, container):
        """Remove the file or add it to the manifest. Always returns True."""
        if self.file_action == 'remove':
            container.remove_item(self.name)
        else:
            rmap = {v: k for k, v in iteritems(container.manifest_id_map)}
            if self.name not in rmap:
                container.add_name_to_manifest(self.name)
        return True


class DanglingLink(BadLink):
    """A link to a file that does not exist in the book.

    Calling the instance removes every reference to the missing target.
    """

    def __init__(self, text, target_name, name, lnum, col):
        """``text`` is the message; ``target_name`` is the missing file that
        the link in ``name`` (at ``lnum``/``col``) resolves to."""
        BadLink.__init__(self, text, name, lnum, col)
        self.INDIVIDUAL_FIX = _('Remove all references to %s from the HTML and CSS in the book') % target_name
        self.target_name = target_name

    def __call__(self, container):
        """Return True if remove_links_to() reported any change."""
        return bool(remove_links_to(container, lambda name, *a: name == self.target_name))


class Bookmarks(BadLink):
    """The calibre viewer bookmarks file is present (INFO level)."""

    HELP = _(
        'This file stores the bookmarks and last opened information from'
        ' the calibre E-book viewer. You can remove it if you do not'
        ' need that information, or don\'t want to share it with'
        ' other people you send this book to.')
    INDIVIDUAL_FIX = _('Remove this file')
    level = INFO

    def __init__(self, name):
        BadLink.__init__(self, _(
            'The bookmarks file used by the calibre E-book viewer is present'), name)

    def __call__(self, container):
        """Remove the bookmarks file. Always returns True."""
        container.remove_item(self.name)
        return True


class MimetypeMismatch(BaseError):
    """The MIME type declared in the OPF disagrees with the file extension.

    For spine documents declared as one of OEB_DOCS the fix renames the file
    to ``.xhtml``; otherwise the fix rewrites the media-type in the OPF.
    """

    level = WARN

    def __init__(self, container, name, opf_mt, ext_mt):
        """``opf_mt`` is the type declared in the OPF, ``ext_mt`` the type
        guessed from the name of file ``name``."""
        self.opf_mt, self.ext_mt = opf_mt, ext_mt
        self.file_name = name
        BaseError.__init__(self, _('The file %s has a MIME type that does not match its extension') % name, container.opf_name)
        # splitext only looks at the last path component, so a dot in a
        # directory name is never mistaken for the extension separator.
        ext = posixpath.splitext(name)[1][1:]
        self.HELP = _('The file {0} has its MIME type specified as {1} in the OPF file.'
                      ' The recommended MIME type for files with the extension "{2}" is {3}.'
                      ' You should change either the file extension or the MIME type in the OPF.').format(
                          name, opf_mt, ext, ext_mt)
        if opf_mt in OEB_DOCS and name in {n for n, l in container.spine_names}:
            self.INDIVIDUAL_FIX = _('Change the file extension to .xhtml')
            self.change_ext_to = 'xhtml'
        else:
            self.INDIVIDUAL_FIX = _('Change the MIME type for this file in the OPF to %s') % ext_mt
            self.change_ext_to = None

    def __call__(self, container):
        """Apply the fix chosen in ``__init__``.

        Returns True if the file was renamed or the OPF was modified.
        """
        changed = False
        if self.change_ext_to is not None:
            from calibre.ebooks.oeb.polish.replace import rename_files
            # Keep the full stem even when the name has no extension or a
            # directory component contains a dot.
            stem = posixpath.splitext(self.file_name)[0]
            new_name = stem + '.' + self.change_ext_to
            c = 0
            # Append a counter until the new name does not clash
            while container.has_name(new_name):
                c += 1
                new_name = stem + ('%d.' % c) + self.change_ext_to
            rename_files(container, {self.file_name: new_name})
            changed = True
        else:
            for item in container.opf_xpath('//opf:manifest/opf:item[@href and @media-type="%s"]' % self.opf_mt):
                name = container.href_to_name(item.get('href'), container.opf_name)
                if name == self.file_name:
                    changed = True
                    item.set('media-type', self.ext_mt)
                    container.mime_map[name] = self.ext_mt
            if changed:
                container.dirty(container.opf_name)
        return changed


def check_mimetypes(container):
    """Return a MimetypeMismatch for every file whose declared MIME type
    differs from ``container.guess_type(name)``.

    Page-map files declared as ``application/oebps-page-map+xml`` with a
    ``.xml`` extension are exempt.
    """
    errors = []
    a = errors.append
    for name, mt in iteritems(container.mime_map):
        gt = container.guess_type(name)
        if mt != gt:
            if mt == 'application/oebps-page-map+xml' and name.lower().endswith('.xml'):
                continue
            a(MimetypeMismatch(container, name, mt, gt))
    return errors


def check_link_destination(container, dest_map, name, href, a, errors):
    """Check that ``href`` (found on element ``a`` in file ``name``) points to
    a text document and, if it has a fragment, to an existing location.

    ``dest_map`` caches, per target file, the set of ``id``/``name``
    attribute values; it is filled in lazily. Problems are appended to
    ``errors``. Links that do not resolve to a file in the book are ignored.
    """
    if href.startswith('#'):
        tname = name
    else:
        try:
            tname = container.href_to_name(href, name)
        except ValueError:
            tname = None  # Absolute links to files on another drive in windows cause this
    if tname and tname in container.mime_map:
        if container.mime_map[tname] not in OEB_DOCS:
            errors.append(BadDestinationType(name, tname, a))
        else:
            root = container.parsed(tname)
            if hasattr(root, 'xpath'):
                if tname not in dest_map:
                    dest_map[tname] = set(root.xpath('//*/@id|//*/@name'))
                purl = urlparse(href)
                if purl.fragment and purl.fragment not in dest_map[tname]:
                    errors.append(BadDestinationFragment(name, tname, a, purl.fragment))
            else:
                # Declared as a document but the parsed object has no xpath()
                errors.append(BadDestinationType(name, tname, a))


def check_link_destinations(container):
    """Check the destinations of links in HTML files, OPF guide references
    and NCX content entries. Returns a list of errors."""
    errors = []
    dest_map = {}
    opf_type = guess_type('a.opf')
    ncx_type = guess_type('a.ncx')
    for name, mt in iteritems(container.mime_map):
        if mt in OEB_DOCS:
            for a in container.parsed(name).xpath('//*[local-name()="a" and @href]'):
                href = a.get('href')
                check_link_destination(container, dest_map, name, href, a, errors)
        elif mt == opf_type:
            for a in container.opf_xpath('//opf:reference[@href]'):
                # Cover references are skipped for azw3 books
                if container.book_type == 'azw3' and a.get('type') in {'cover', 'other.ms-coverimage-standard', 'other.ms-coverimage'}:
                    continue
                href = a.get('href')
                check_link_destination(container, dest_map, name, href, a, errors)
        elif mt == ncx_type:
            for a in container.parsed(name).xpath('//*[local-name() = "content" and @src]'):
                href = a.get('src')
                check_link_destination(container, dest_map, name, href, a, errors)
    return errors


def check_links(container):
    """Check every link in the book's documents, stylesheets, OPF and NCX.

    Reports empty, dangling, case-mismatched, ``file://``, absolute-path and
    ``:``-containing links, then unreferenced resources, unmanifested files
    and the calibre bookmarks file. Returns the list of errors.
    """
    links_map = defaultdict(set)  # file name -> names of existing files it links to
    xml_types = {guess_type('a.opf'), guess_type('a.ncx')}
    errors = []
    a = errors.append

    def fl(x):
        # repr() of the link for display, minus any leading 'u' prefix
        x = repr(x)
        if x.startswith('u'):
            x = x[1:]
        return x

    for name, mt in iteritems(container.mime_map):
        if mt in OEB_DOCS or mt in OEB_STYLES or mt in xml_types:
            for href, lnum, col in container.iterlinks(name):
                if not href:
                    a(EmptyLink(_('The link is empty'), name, lnum, col))
                try:
                    tname = container.href_to_name(href, name)
                except ValueError:
                    tname = None  # Absolute paths to files on another drive in windows cause this
                if tname is not None:
                    if container.exists(tname):
                        if tname in container.mime_map:
                            links_map[name].add(tname)
                        else:
                            # Filesystem says the file exists, but it is not in
                            # the mime_map, so either there is a case mismatch
                            # or the link is a directory
                            apath = container.name_to_abspath(tname)
                            if os.path.isdir(apath):
                                a(BadLink(_('The linked resource %s is a folder') % fl(href), name, lnum, col))
                            else:
                                a(CaseMismatch(href, actual_case_for_name(container, tname), name, lnum, col))
                    else:
                        cname = corrected_case_for_name(container, tname)
                        if cname is not None:
                            a(CaseMismatch(href, cname, name, lnum, col))
                        else:
                            a(DanglingLink(_('The linked resource %s does not exist') % fl(href), tname, name, lnum, col))
                else:
                    purl = urlparse(href)
                    if purl.scheme == 'file':
                        a(FileLink(_('The link %s is a file:// URL') % fl(href), name, lnum, col))
                    elif purl.path and purl.path.startswith('/') and purl.scheme in {'', 'file'}:
                        a(LocalLink(_('The link %s points to a file outside the book') % fl(href), name, lnum, col))
                    elif purl.path and purl.scheme in {'', 'file'} and ':' in urlunquote(purl.path):
                        a(InvalidCharInLink(
                            _('The link %s contains a : character, this will cause errors on Windows computers') % fl(href),
                            name, lnum, col))

    spine_docs = {name for name, linear in container.spine_names}
    spine_styles = {
        tname for name in spine_docs for tname in links_map[name]
        if container.mime_map.get(tname, None) in OEB_STYLES}
    num = -1
    while len(spine_styles) > num:
        # Handle import rules in stylesheets
        num = len(spine_styles)
        spine_styles |= {
            tname for name in spine_styles for tname in links_map[name]
            if container.mime_map.get(tname, None) in OEB_STYLES}
    seen = set(OEB_DOCS) | set(OEB_STYLES)
    spine_resources = {
        tname for name in spine_docs | spine_styles for tname in links_map[name]
        if container.mime_map[tname] not in seen}
    unreferenced = set()

    cover_name = container.guide_type_map.get('cover', None)
    nav_items = frozenset(container.manifest_items_with_property('nav'))

    for name, mt in iteritems(container.mime_map):
        if mt in OEB_STYLES and name not in spine_styles:
            a(UnreferencedResource(name))
        elif mt in OEB_DOCS and name not in spine_docs and name not in nav_items:
            a(UnreferencedDoc(name))
        elif (mt in OEB_FONTS or mt.partition('/')[0] in {'image', 'audio', 'video'}) and (
                name not in spine_resources and name != cover_name):
            if mt.partition('/')[0] == 'image' and name == get_raster_cover_name(container):
                continue
            a(UnreferencedResource(name))
        else:
            continue
        # Only reached when one of the branches above reported the file
        unreferenced.add(name)

    manifest_names = set(itervalues(container.manifest_id_map))
    for name in container.mime_map:
        if name not in manifest_names and not container.ok_to_be_unmanifested(name):
            a(Unmanifested(name, unreferenced=name in unreferenced))
        if name == 'META-INF/calibre_bookmarks.txt':
            a(Bookmarks(name))

    return errors


def get_html_ids(raw_data):
    """Parse ``raw_data`` as HTML and return the set of all ``id`` values in
    ``<body>`` plus the ``name`` values of ``<a>`` elements inside it."""
    ans = set()
    root = parse_html5(raw_data, discard_namespaces=True, line_numbers=False, fix_newlines=False)
    for body in root.xpath('//body'):
        ans.update(set(body.xpath('descendant-or-self::*/@id')))
        ans.update(set(body.xpath('descendant::a/@name')))
    return ans


def check_external_links(container, progress_callback=(lambda num, total: None), check_anchors=True):
    """Try to open every http/https link found in the book's documents and
    stylesheets, using up to 10 worker threads.

    ``progress_callback(num, total)`` is called with ``(0, 0)`` at the start,
    ``(0, total)`` once the links are collected, and from the worker threads
    after each link is processed. When ``check_anchors`` is true, fragments
    of HTML/XHTML responses are checked against the ids on the page.

    Returns a list of ``(locations, exception, full_href)`` tuples, where
    ``locations`` is a list of ``(name, href, lnum, col)``. The list is empty
    if there are no external links or none of them failed.
    """
    progress_callback(0, 0)
    external_links = defaultdict(list)
    for name, mt in iteritems(container.mime_map):
        if mt in OEB_DOCS or mt in OEB_STYLES:
            for href, lnum, col in container.iterlinks(name):
                purl = urlparse(href)
                if purl.scheme in ('http', 'https'):
                    external_links[href].append((name, href, lnum, col))
    if not external_links:
        return []
    items = Queue()
    ans = []
    for el in iteritems(external_links):
        items.put(el)
    progress_callback(0, len(external_links))
    done = []
    downloaded_html_ids = {}  # url without fragment -> ids found on that page

    def check_worker():
        # Thread body: drain the queue, recording failures in ``ans``.
        br = browser(honor_time=False, verify_ssl_certificates=False)
        while True:
            try:
                full_href, locations = items.get_nowait()
            except Empty:
                return
            href, frag = full_href.partition('#')[::2]
            try:
                res = br.open(href, timeout=10)
            except Exception as e:
                ans.append((locations, e, full_href))
            else:
                try:
                    if frag and check_anchors:
                        ct = res.info().get('Content-Type')
                        if ct and ct.split(';')[0].lower() in {'text/html', XHTML_MIME}:
                            ids = downloaded_html_ids.get(href)
                            if ids is None:
                                try:
                                    ids = downloaded_html_ids[href] = get_html_ids(res.read())
                                except Exception:
                                    # Unreadable/unparsable page: treat as having no anchors
                                    ids = downloaded_html_ids[href] = frozenset()
                            if frag not in ids:
                                ans.append((locations, ValueError(f'HTML anchor {frag} not found on the page'), full_href))
                finally:
                    # Always release the response, even if inspecting it failed
                    res.close()
            finally:
                done.append(None)
                progress_callback(len(done), len(external_links))

    workers = [Thread(name="CheckLinks", target=check_worker) for _ in range(min(10, len(external_links)))]
    for w in workers:
        w.daemon = True
        w.start()

    for w in workers:
        w.join()
    return ans