%PDF- %PDF-
Direktori : /lib/calibre/calibre/ebooks/oeb/ |
Current File : //lib/calibre/calibre/ebooks/oeb/reader.py |
""" Container-/OPF-based input OEBBook reader. """ __license__ = 'GPL v3' __copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>' import sys, os, uuid, copy, re, io from collections import defaultdict from lxml import etree from calibre.ebooks.oeb.base import OPF1_NS, OPF2_NS, OPF2_NSMAP, DC11_NS, \ DC_NSES, OPF, xml2text, XHTML_MIME from calibre.ebooks.oeb.base import OEB_DOCS, OEB_STYLES, OEB_IMAGES, \ PAGE_MAP_MIME, JPEG_MIME, NCX_MIME, SVG_MIME from calibre.ebooks.oeb.base import XMLDECL_RE, COLLAPSE_RE, \ MS_COVER_TYPE, iterlinks from calibre.ebooks.oeb.base import namespace, barename, XPath, xpath, \ urlnormalize, BINARY_MIME, \ OEBError, OEBBook, DirContainer from calibre.ebooks.oeb.writer import OEBWriter from calibre.utils.xml_parse import safe_xml_fromstring from calibre.utils.cleantext import clean_xml_chars from calibre.utils.localization import get_lang from calibre.ptempfile import TemporaryDirectory from calibre.constants import __appname__, __version__ from calibre import guess_type, xml_replace_entities from polyglot.urllib import unquote, urldefrag, urlparse __all__ = ['OEBReader'] class OEBReader: """Read an OEBPS 1.x or OPF/OPS 2.0 file collection.""" COVER_SVG_XP = XPath('h:body//svg:svg[position() = 1]') COVER_OBJECT_XP = XPath('h:body//h:object[@data][position() = 1]') Container = DirContainer """Container type used to access book files. Override in sub-classes.""" DEFAULT_PROFILE = 'PRS505' """Default renderer profile for content read with this Reader.""" TRANSFORMS = [] """List of transforms to apply to content read with this Reader.""" @classmethod def config(cls, cfg): """Add any book-reading options to the :class:`Config` object :param:`cfg`. """ return @classmethod def generate(cls, opts): """Generate a Reader instance from command-line options.""" return cls() def __call__(self, oeb, path): """Read the book at :param:`path` into the :class:`OEBBook` object :param:`oeb`. """ self.oeb = oeb self.logger = self.log = oeb.logger oeb.container = self.Container(path, self.logger) oeb.container.log = oeb.log opf = self._read_opf() self._all_from_opf(opf) return oeb def _clean_opf(self, opf): nsmap = {} for elem in opf.iter(tag=etree.Element): nsmap.update(elem.nsmap) for elem in opf.iter(tag=etree.Element): if namespace(elem.tag) in ('', OPF1_NS) and ':' not in barename(elem.tag): elem.tag = OPF(barename(elem.tag)) nsmap.update(OPF2_NSMAP) attrib = dict(opf.attrib) nroot = etree.Element(OPF('package'), nsmap={None: OPF2_NS}, attrib=attrib) metadata = etree.SubElement(nroot, OPF('metadata'), nsmap=nsmap) ignored = (OPF('dc-metadata'), OPF('x-metadata')) for elem in xpath(opf, 'o2:metadata//*'): if elem.tag in ignored: continue if namespace(elem.tag) in DC_NSES: tag = barename(elem.tag).lower() elem.tag = f'{{{DC11_NS}}}{tag}' if elem.tag.startswith('dc:'): tag = elem.tag.partition(':')[-1].lower() elem.tag = f'{{{DC11_NS}}}{tag}' metadata.append(elem) for element in xpath(opf, 'o2:metadata//o2:meta'): metadata.append(element) for tag in ('o2:manifest', 'o2:spine', 'o2:tours', 'o2:guide'): for element in xpath(opf, tag): nroot.append(element) return nroot def _read_opf(self): data = self.oeb.container.read(None) data = self.oeb.decode(data) data = XMLDECL_RE.sub('', data) data = re.sub(r'http://openebook.org/namespaces/oeb-package/1.0(/*)', OPF1_NS, data) try: opf = safe_xml_fromstring(data) except etree.XMLSyntaxError: data = xml_replace_entities(clean_xml_chars(data), encoding=None) try: opf = safe_xml_fromstring(data) self.logger.warn('OPF contains invalid HTML named entities') except etree.XMLSyntaxError: data = re.sub(r'(?is)<tours>.+</tours>', '', data) data = data.replace('<dc-metadata>', '<dc-metadata xmlns:dc="http://purl.org/metadata/dublin_core">') opf = safe_xml_fromstring(data) self.logger.warn('OPF contains invalid tours section') ns = namespace(opf.tag) if ns not in ('', OPF1_NS, OPF2_NS): raise OEBError('Invalid namespace %r for OPF document' % ns) opf = self._clean_opf(opf) return opf def _metadata_from_opf(self, opf): from calibre.ebooks.metadata.opf2 import OPF from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata stream = io.BytesIO(etree.tostring(opf, xml_declaration=True, encoding='utf-8')) o = OPF(stream) pwm = o.primary_writing_mode if pwm: self.oeb.metadata.primary_writing_mode = pwm mi = o.to_book_metadata() if not mi.language: mi.language = get_lang().replace('_', '-') self.oeb.metadata.add('language', mi.language) if not mi.book_producer: mi.book_producer = '%(a)s (%(v)s) [http://%(a)s-ebook.com]'%\ dict(a=__appname__, v=__version__) meta_info_to_oeb_metadata(mi, self.oeb.metadata, self.logger) m = self.oeb.metadata m.add('identifier', str(uuid.uuid4()), id='uuid_id', scheme='uuid') self.oeb.uid = self.oeb.metadata.identifier[-1] if not m.title: m.add('title', self.oeb.translate(__('Unknown'))) has_aut = False for x in m.creator: if getattr(x, 'role', '').lower() in ('', 'aut'): has_aut = True break if not has_aut: m.add('creator', self.oeb.translate(__('Unknown')), role='aut') def _manifest_prune_invalid(self): ''' Remove items from manifest that contain invalid data. This prevents catastrophic conversion failure, when a few files contain corrupted data. ''' bad = [] check = OEB_DOCS.union(OEB_STYLES) for item in list(self.oeb.manifest.values()): if item.media_type in check: try: item.data except KeyboardInterrupt: raise except: self.logger.exception('Failed to parse content in %s'% item.href) bad.append(item) self.oeb.manifest.remove(item) return bad def _manifest_add_missing(self, invalid): import css_parser manifest = self.oeb.manifest known = set(manifest.hrefs) unchecked = set(manifest.values()) cdoc = OEB_DOCS|OEB_STYLES invalid = set() while unchecked: new = set() for item in unchecked: data = None if (item.media_type in cdoc or item.media_type[-4:] in ('/xml', '+xml')): try: data = item.data except: self.oeb.log.exception('Failed to read from manifest ' 'entry with id: %s, ignoring'%item.id) invalid.add(item) continue if data is None: continue if (item.media_type in OEB_DOCS or item.media_type[-4:] in ('/xml', '+xml')): hrefs = [r[2] for r in iterlinks(data)] for href in hrefs: if isinstance(href, bytes): href = href.decode('utf-8') href, _ = urldefrag(href) if not href: continue try: href = item.abshref(urlnormalize(href)) scheme = urlparse(href).scheme except: self.oeb.log.exception( 'Skipping invalid href: %r'%href) continue if not scheme and href not in known: new.add(href) elif item.media_type in OEB_STYLES: try: urls = list(css_parser.getUrls(data)) except: urls = [] for url in urls: href, _ = urldefrag(url) href = item.abshref(urlnormalize(href)) scheme = urlparse(href).scheme if not scheme and href not in known: new.add(href) unchecked.clear() warned = set() for href in new: known.add(href) is_invalid = False for item in invalid: if href == item.abshref(urlnormalize(href)): is_invalid = True break if is_invalid: continue if not self.oeb.container.exists(href): if href not in warned: self.logger.warn('Referenced file %r not found' % href) warned.add(href) continue if href not in warned: self.logger.warn('Referenced file %r not in manifest' % href) warned.add(href) id, _ = manifest.generate(id='added') guessed = guess_type(href)[0] media_type = guessed or BINARY_MIME added = manifest.add(id, href, media_type) unchecked.add(added) for item in invalid: self.oeb.manifest.remove(item) def _manifest_from_opf(self, opf): manifest = self.oeb.manifest for elem in xpath(opf, '/o2:package/o2:manifest/o2:item'): id = elem.get('id') href = elem.get('href') media_type = elem.get('media-type', None) if media_type is None: media_type = elem.get('mediatype', None) if not media_type or media_type == 'text/xml': guessed = guess_type(href)[0] media_type = guessed or media_type or BINARY_MIME if hasattr(media_type, 'lower'): media_type = media_type.lower() fallback = elem.get('fallback') if href in manifest.hrefs: self.logger.warn('Duplicate manifest entry for %r' % href) continue if not self.oeb.container.exists(href): self.logger.warn('Manifest item %r not found' % href) continue if id in manifest.ids: self.logger.warn('Duplicate manifest id %r' % id) id, href = manifest.generate(id, href) manifest.add(id, href, media_type, fallback) invalid = self._manifest_prune_invalid() self._manifest_add_missing(invalid) def _spine_add_extra(self): manifest = self.oeb.manifest spine = self.oeb.spine unchecked = set(spine) selector = XPath('h:body//h:a/@href') extras = set() while unchecked: new = set() for item in unchecked: if item.media_type not in OEB_DOCS: # TODO: handle fallback chains continue for href in selector(item.data): href, _ = urldefrag(href) if not href: continue try: href = item.abshref(urlnormalize(href)) except ValueError: # Malformed URL continue if href not in manifest.hrefs: continue found = manifest.hrefs[href] if found.media_type not in OEB_DOCS or \ found in spine or found in extras: continue new.add(found) extras.update(new) unchecked = new version = int(self.oeb.version[0]) removed_items_to_ignore = getattr(self.oeb, 'removed_items_to_ignore', ()) for item in extras: if item.href in removed_items_to_ignore: continue if version >= 2: self.logger.warn( 'Spine-referenced file %r not in spine' % item.href) spine.add(item, linear=False) def _spine_from_opf(self, opf): spine = self.oeb.spine manifest = self.oeb.manifest for elem in xpath(opf, '/o2:package/o2:spine/o2:itemref'): idref = elem.get('idref') if idref not in manifest.ids: self.logger.warn('Spine item %r not found' % idref) continue item = manifest.ids[idref] if item.media_type.lower() in OEB_DOCS and hasattr(item.data, 'xpath') and not getattr(item.data, 'tag', '').endswith('}ncx'): spine.add(item, elem.get('linear')) else: if hasattr(item.data, 'tag') and item.data.tag and item.data.tag.endswith('}html'): item.media_type = XHTML_MIME spine.add(item, elem.get('linear')) else: self.oeb.log.warn('The item %s is not a XML document.' ' Removing it from spine.'%item.href) if len(spine) == 0: raise OEBError("Spine is empty") self._spine_add_extra() for val in xpath(opf, '/o2:package/o2:spine/@page-progression-direction'): if val in {'ltr', 'rtl'}: spine.page_progression_direction = val def _guide_from_opf(self, opf): guide = self.oeb.guide manifest = self.oeb.manifest for elem in xpath(opf, '/o2:package/o2:guide/o2:reference'): ref_href = elem.get('href') path = urlnormalize(urldefrag(ref_href)[0]) if path not in manifest.hrefs: corrected_href = None for href in manifest.hrefs: if href.lower() == path.lower(): corrected_href = href break if corrected_href is None: self.logger.warn('Guide reference %r not found' % ref_href) continue ref_href = corrected_href typ = elem.get('type') if typ not in guide: guide.add(typ, elem.get('title'), ref_href) def _find_ncx(self, opf): result = xpath(opf, '/o2:package/o2:spine/@toc') if result: id = result[0] if id not in self.oeb.manifest.ids: return None item = self.oeb.manifest.ids[id] self.oeb.manifest.remove(item) return item for item in self.oeb.manifest.values(): if item.media_type == NCX_MIME: self.oeb.manifest.remove(item) return item return None def _toc_from_navpoint(self, item, toc, navpoint): children = xpath(navpoint, 'ncx:navPoint') for child in children: title = ''.join(xpath(child, 'ncx:navLabel/ncx:text/text()')) title = COLLAPSE_RE.sub(' ', title.strip()) href = xpath(child, 'ncx:content/@src') if not title: self._toc_from_navpoint(item, toc, child) continue if (not href or not href[0]) and not xpath(child, 'ncx:navPoint'): # This node is useless continue href = item.abshref(urlnormalize(href[0])) if href and href[0] else '' path, _ = urldefrag(href) if path and path not in self.oeb.manifest.hrefs: path = urlnormalize(path) if href and path not in self.oeb.manifest.hrefs: self.logger.warn('TOC reference %r not found' % href) gc = xpath(child, 'ncx:navPoint') if not gc: # This node is useless continue id = child.get('id') klass = child.get('class', 'chapter') try: po = int(child.get('playOrder', self.oeb.toc.next_play_order())) except: po = self.oeb.toc.next_play_order() authorElement = xpath(child, 'descendant::calibre:meta[@name = "author"]') if authorElement: author = authorElement[0].text else: author = None descriptionElement = xpath(child, 'descendant::calibre:meta[@name = "description"]') if descriptionElement: description = etree.tostring(descriptionElement[0], method='text', encoding='unicode').strip() if not description: description = None else: description = None index_image = xpath(child, 'descendant::calibre:meta[@name = "toc_thumbnail"]') toc_thumbnail = (index_image[0].text if index_image else None) if not toc_thumbnail or not toc_thumbnail.strip(): toc_thumbnail = None node = toc.add(title, href, id=id, klass=klass, play_order=po, description=description, author=author, toc_thumbnail=toc_thumbnail) self._toc_from_navpoint(item, node, child) def _toc_from_ncx(self, item): if (item is None) or (item.data is None): return False self.log.debug('Reading TOC from NCX...') ncx = item.data title = ''.join(xpath(ncx, 'ncx:docTitle/ncx:text/text()')) title = COLLAPSE_RE.sub(' ', title.strip()) title = title or str(self.oeb.metadata.title[0]) toc = self.oeb.toc toc.title = title navmaps = xpath(ncx, 'ncx:navMap') for navmap in navmaps: self._toc_from_navpoint(item, toc, navmap) return True def _toc_from_tour(self, opf): result = xpath(opf, 'o2:tours/o2:tour') if not result: return False self.log.debug('Reading TOC from tour...') tour = result[0] toc = self.oeb.toc toc.title = tour.get('title') sites = xpath(tour, 'o2:site') for site in sites: title = site.get('title') href = site.get('href') if not title or not href: continue path, _ = urldefrag(urlnormalize(href)) if path not in self.oeb.manifest.hrefs: self.logger.warn('TOC reference %r not found' % href) continue id = site.get('id') toc.add(title, href, id=id) return True def _toc_from_html(self, opf): if 'toc' not in self.oeb.guide: return False self.log.debug('Reading TOC from HTML...') itempath, frag = urldefrag(self.oeb.guide['toc'].href) item = self.oeb.manifest.hrefs[itempath] html = item.data if frag: elems = xpath(html, './/*[@id="%s"]' % frag) if not elems: elems = xpath(html, './/*[@name="%s"]' % frag) elem = elems[0] if elems else html while elem != html and not xpath(elem, './/h:a[@href]'): elem = elem.getparent() html = elem titles = defaultdict(list) order = [] for anchor in xpath(html, './/h:a[@href]'): href = anchor.attrib['href'] href = item.abshref(urlnormalize(href)) path, frag = urldefrag(href) if path not in self.oeb.manifest.hrefs: continue title = xml2text(anchor) title = COLLAPSE_RE.sub(' ', title.strip()) if href not in titles: order.append(href) titles[href].append(title) toc = self.oeb.toc for href in order: toc.add(' '.join(titles[href]), href) return True def _toc_from_spine(self, opf): self.log.warn('Generating default TOC from spine...') toc = self.oeb.toc titles = [] headers = [] for item in self.oeb.spine: if not item.linear: continue html = item.data title = ''.join(xpath(html, '/h:html/h:head/h:title/text()')) title = COLLAPSE_RE.sub(' ', title.strip()) if title: titles.append(title) headers.append('(unlabled)') for tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'strong'): expr = '/h:html/h:body//h:%s[position()=1]/text()' header = ''.join(xpath(html, expr % tag)) header = COLLAPSE_RE.sub(' ', header.strip()) if header: headers[-1] = header break use = titles if len(titles) > len(set(titles)): use = headers for title, item in zip(use, self.oeb.spine): if not item.linear: continue toc.add(title, item.href) return True def _toc_from_opf(self, opf, item): self.oeb.auto_generated_toc = False if self._toc_from_ncx(item): return # Prefer HTML to tour based TOC, since several LIT files # have good HTML TOCs but bad tour based TOCs if self._toc_from_html(opf): return if self._toc_from_tour(opf): return self._toc_from_spine(opf) self.oeb.auto_generated_toc = True def _pages_from_ncx(self, opf, item): if item is None: return False ncx = item.data if ncx is None: return False ptargets = xpath(ncx, 'ncx:pageList/ncx:pageTarget') if not ptargets: return False pages = self.oeb.pages for ptarget in ptargets: name = ''.join(xpath(ptarget, 'ncx:navLabel/ncx:text/text()')) name = COLLAPSE_RE.sub(' ', name.strip()) href = xpath(ptarget, 'ncx:content/@src') if not href: continue href = item.abshref(urlnormalize(href[0])) id = ptarget.get('id') type = ptarget.get('type', 'normal') klass = ptarget.get('class') pages.add(name, href, type=type, id=id, klass=klass) return True def _find_page_map(self, opf): result = xpath(opf, '/o2:package/o2:spine/@page-map') if result: id = result[0] if id not in self.oeb.manifest.ids: return None item = self.oeb.manifest.ids[id] self.oeb.manifest.remove(item) return item for item in self.oeb.manifest.values(): if item.media_type == PAGE_MAP_MIME: self.oeb.manifest.remove(item) return item return None def _pages_from_page_map(self, opf): item = self._find_page_map(opf) if item is None: return False pmap = item.data pages = self.oeb.pages for page in xpath(pmap, 'o2:page'): name = page.get('name', '') href = page.get('href') if not href: continue name = COLLAPSE_RE.sub(' ', name.strip()) href = item.abshref(urlnormalize(href)) type = 'normal' if not name: type = 'special' elif name.lower().strip('ivxlcdm') == '': type = 'front' pages.add(name, href, type=type) return True def _pages_from_opf(self, opf, item): if self._pages_from_ncx(opf, item): return if self._pages_from_page_map(opf): return return def _cover_from_html(self, hcover): from calibre.ebooks import render_html_svg_workaround with TemporaryDirectory('_html_cover') as tdir: writer = OEBWriter() writer(self.oeb, tdir) path = os.path.join(tdir, unquote(hcover.href)) data = render_html_svg_workaround(path, self.logger) if not data: data = b'' id, href = self.oeb.manifest.generate('cover', 'cover.jpg') item = self.oeb.manifest.add(id, href, JPEG_MIME, data=data) return item def _locate_cover_image(self): if self.oeb.metadata.cover: id = str(self.oeb.metadata.cover[0]) item = self.oeb.manifest.ids.get(id, None) if item is not None and item.media_type in OEB_IMAGES: return item else: self.logger.warn('Invalid cover image @id %r' % id) hcover = self.oeb.spine[0] if 'cover' in self.oeb.guide: href = self.oeb.guide['cover'].href item = self.oeb.manifest.hrefs[href] media_type = item.media_type if media_type in OEB_IMAGES: return item elif media_type in OEB_DOCS: hcover = item html = hcover.data if MS_COVER_TYPE in self.oeb.guide: href = self.oeb.guide[MS_COVER_TYPE].href item = self.oeb.manifest.hrefs.get(href, None) if item is not None and item.media_type in OEB_IMAGES: return item if self.COVER_SVG_XP(html): svg = copy.deepcopy(self.COVER_SVG_XP(html)[0]) href = os.path.splitext(hcover.href)[0] + '.svg' id, href = self.oeb.manifest.generate(hcover.id, href) item = self.oeb.manifest.add(id, href, SVG_MIME, data=svg) return item if self.COVER_OBJECT_XP(html): object = self.COVER_OBJECT_XP(html)[0] href = hcover.abshref(object.get('data')) item = self.oeb.manifest.hrefs.get(href, None) if item is not None and item.media_type in OEB_IMAGES: return item return self._cover_from_html(hcover) def _ensure_cover_image(self): cover = self._locate_cover_image() if self.oeb.metadata.cover: self.oeb.metadata.cover[0].value = cover.id return self.oeb.metadata.add('cover', cover.id) def _manifest_remove_duplicates(self): seen = set() dups = set() for item in self.oeb.manifest: if item.href in seen: dups.add(item.href) seen.add(item.href) for href in dups: items = [x for x in self.oeb.manifest if x.href == href] for x in items: if x not in self.oeb.spine: self.oeb.log.warn('Removing duplicate manifest item with id:', x.id) self.oeb.manifest.remove_duplicate_item(x) def _all_from_opf(self, opf): self.oeb.version = opf.get('version', '1.2') self._metadata_from_opf(opf) self._manifest_from_opf(opf) self._spine_from_opf(opf) self._manifest_remove_duplicates() self._guide_from_opf(opf) item = self._find_ncx(opf) self._toc_from_opf(opf, item) self._pages_from_opf(opf, item) # self._ensure_cover_image() def main(argv=sys.argv): reader = OEBReader() for arg in argv[1:]: oeb = reader(OEBBook(), arg) for name, doc in oeb.to_opf1().values(): print(etree.tostring(doc, pretty_print=True)) for name, doc in oeb.to_opf2(page_map=True).values(): print(etree.tostring(doc, pretty_print=True)) return 0 if __name__ == '__main__': sys.exit(main())