# %PDF- %PDF-
# Direktori : /usr/lib/calibre/calibre/ebooks/conversion/plugins/ |
# Current File : //usr/lib/calibre/calibre/ebooks/conversion/plugins/epub_input.py |
__license__ = 'GPL 3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

import os
import posixpath
import re
from itertools import cycle

from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation

# Algorithm URIs recognised in META-INF/encryption.xml as font obfuscation
# (anything else is treated as real encryption by process_encryption()).
ADOBE_OBFUSCATION = 'http://ns.adobe.com/pdf/enc#RC'
IDPF_OBFUSCATION = 'http://www.idpf.org/2008/embedding'


def decrypt_font_data(key, data, algorithm):
    '''
    Return ``data`` with its obfuscated prefix XOR-ed against ``key``.

    The first 1024 bytes are processed for the Adobe algorithm, the first
    1040 bytes for any other algorithm; the rest of ``data`` is returned
    unchanged. ``key`` is repeated cyclically over the prefix.
    '''
    crypt_len = 1024 if algorithm == ADOBE_OBFUSCATION else 1040
    obfuscated = bytearray(data[:crypt_len])
    key_stream = cycle(bytearray(key))
    clear = bytes(bytearray(b ^ next(key_stream) for b in obfuscated))
    return clear + data[crypt_len:]


def decrypt_font(key, path, algorithm):
    ''' De-obfuscate the font file at ``path`` in place. '''
    with lopen(path, 'r+b') as f:
        data = decrypt_font_data(key, f.read(), algorithm)
        f.seek(0)
        f.truncate()
        f.write(data)


class EPUBInput(InputFormatPlugin):
    ''' Input plugin that unpacks an EPUB and prepares its OPF for conversion. '''

    name = 'EPUB Input'
    author = 'Kovid Goyal'
    description = _('Convert EPUB files (.epub) to HTML')
    file_types = {'epub'}
    output_encoding = None
    commit_name = 'epub_input'

    recommendations = {('page_breaks_before', '/', OptionRecommendation.MED)}

    def process_encryption(self, encfile, opf, log):
        '''
        De-obfuscate fonts listed in ``encfile`` (META-INF/encryption.xml).

        Returns True if every EncryptionMethod uses one of the two known font
        obfuscation algorithms, False if any other algorithm is found or if
        parsing/decryption fails. URIs of processed fonts are appended to
        ``self._encrypted_font_uris``.
        '''
        from lxml import etree
        import uuid
        import hashlib

        # Key for the IDPF algorithm: SHA-1 of the unique identifier with
        # space, tab, CR and LF characters removed.
        idpf_key = opf.raw_unique_identifier
        if idpf_key:
            idpf_key = re.sub('[\u0020\u0009\u000d\u000a]', '', idpf_key)
            idpf_key = hashlib.sha1(idpf_key.encode('utf-8')).digest()

        # Key for the Adobe algorithm: raw bytes of a UUID identifier.
        key = None
        for item in opf.identifier_iter():
            scheme = None
            for xkey in item.attrib.keys():
                if xkey.endswith('scheme'):
                    scheme = item.get(xkey)
            if (scheme and scheme.lower() == 'uuid') or \
                    (item.text and item.text.startswith('urn:uuid:')):
                try:
                    key = item.text.rpartition(':')[-1]
                    key = uuid.UUID(key).bytes
                except Exception:
                    import traceback
                    traceback.print_exc()
                    key = None

        try:
            root = etree.parse(encfile)
            for em in root.xpath('descendant::*[contains(name(), "EncryptionMethod")]'):
                algorithm = em.get('Algorithm', '')
                if algorithm not in {ADOBE_OBFUSCATION, IDPF_OBFUSCATION}:
                    return False
                cr = em.getparent().xpath('descendant::*[contains(name(), "CipherReference")]')[0]
                uri = cr.get('URI')
                # URIs are resolved against the parent of the directory
                # containing encfile.
                path = os.path.abspath(os.path.join(os.path.dirname(encfile), '..', *uri.split('/')))
                tkey = key if algorithm == ADOBE_OBFUSCATION else idpf_key
                if tkey and os.path.exists(path):
                    self._encrypted_font_uris.append(uri)
                    decrypt_font(tkey, path, algorithm)
            return True
        except Exception:
            import traceback
            traceback.print_exc()
        return False

    def set_guide_type(self, opf, gtype, href=None, title=''):
        '''
        Set the specified guide entry.

        Removes every existing guide entry of type ``gtype``. If ``href`` is
        not None a new entry is created and appended to the first <guide>
        child of the OPF root (creating the <guide> if needed) and returned;
        otherwise None is returned.
        '''
        for elem in list(opf.iterguide()):
            if elem.get('type', '').lower() == gtype:
                elem.getparent().remove(elem)

        if href is None:
            return None

        t = opf.create_guide_item(gtype, title, href)
        guides = opf.root.xpath('./*[local-name()="guide"]')
        if guides:
            guide = guides[0]
        else:
            guide = opf.create_guide_element()
            opf.root.append(guide)
        guide.append(t)
        return t

    def rationalize_cover3(self, opf, log):
        ''' If there is a reference to the cover/titlepage via manifest
        properties, convert to entries in the <guide> so that the rest of the
        pipeline picks it up.

        Returns the href of the titlepage removed from the spine, or None if
        nothing was removed. '''
        from calibre.ebooks.metadata.opf3 import items_with_property
        removed = guide_titlepage_href = guide_titlepage_id = None

        # Look for titlepages incorrectly marked in the <guide> as covers
        guide_cover, guide_elem = None, None
        for guide_elem in opf.iterguide():
            if guide_elem.get('type', '').lower() == 'cover':
                guide_cover = guide_elem.get('href', '').partition('#')[0]
                break
        if guide_cover:
            spine = list(opf.iterspine())
            if spine:
                idref = spine[0].get('idref', '')
                for x in opf.itermanifest():
                    if x.get('id') == idref and x.get('href') == guide_cover:
                        guide_titlepage_href = guide_cover
                        guide_titlepage_id = idref
                        break

        raster_cover_href = opf.epub3_raster_cover or opf.raster_cover
        if raster_cover_href:
            self.set_guide_type(opf, 'cover', raster_cover_href, 'Cover Image')

        titlepage_id = titlepage_href = None
        for item in items_with_property(opf.root, 'calibre:title-page'):
            tid, href = item.get('id'), item.get('href')
            if href and tid:
                titlepage_id, titlepage_href = tid, href.partition('#')[0]
                break
        if titlepage_href is None:
            # Fall back to the guide "cover" that is really the first spine item
            titlepage_href, titlepage_id = guide_titlepage_href, guide_titlepage_id
        if titlepage_href is not None:
            self.set_guide_type(opf, 'titlepage', titlepage_href, 'Title page')
            spine = list(opf.iterspine())
            if len(spine) > 1:
                for item in spine:
                    if item.get('idref') == titlepage_id:
                        log('Found HTML cover', titlepage_href)
                        if self.for_viewer:
                            item.attrib.pop('linear', None)
                        else:
                            item.getparent().remove(item)
                            removed = titlepage_href
                        return removed
        return None

    def rationalize_cover2(self, opf, log):
        ''' Ensure that the cover information in the guide is correct. That
        means, at most one entry with type="cover" that points to a raster
        cover and at most one entry with type="titlepage" that points to an
        HTML titlepage.

        Returns the href of the HTML cover removed from the spine, or None. '''
        from calibre.ebooks.oeb.base import OPF
        from lxml import etree
        removed = None

        guide_cover, guide_elem = None, None
        for guide_elem in opf.iterguide():
            if guide_elem.get('type', '').lower() == 'cover':
                guide_cover = guide_elem.get('href', '').partition('#')[0]
                break
        if not guide_cover:
            raster_cover = opf.raster_cover
            if raster_cover:
                if guide_elem is None:
                    g = opf.root.makeelement(OPF('guide'))
                    opf.root.append(g)
                else:
                    g = guide_elem.getparent()
                guide_cover = raster_cover
                guide_elem = g.makeelement(OPF('reference'), attrib={'href': raster_cover, 'type': 'cover'})
                g.append(guide_elem)
            return None

        spine = list(opf.iterspine())
        if not spine:
            return None
        # Check if the cover specified in the guide is also
        # the first element in spine
        idref = spine[0].get('idref', '')
        manifest = list(opf.itermanifest())
        if not manifest:
            return None
        elem = [x for x in manifest if x.get('id', '') == idref]
        if not elem or elem[0].get('href', None) != guide_cover:
            return None
        log('Found HTML cover', guide_cover)

        # Remove from spine as covers must be treated
        # specially
        if not self.for_viewer:
            if len(spine) == 1:
                log.warn('There is only a single spine item and it is marked as the cover. Removing cover marking.')
                for guide_elem in tuple(opf.iterguide()):
                    if guide_elem.get('type', '').lower() == 'cover':
                        guide_elem.getparent().remove(guide_elem)
                return None
            spine[0].getparent().remove(spine[0])
            removed = guide_cover
        else:
            # Ensure the cover is displayed as the first item in the book, some
            # epub files have it set with linear='no' which causes the cover to
            # display in the end
            spine[0].attrib.pop('linear', None)
            opf.spine[0].is_linear = True

        # Ensure that the guide has a cover entry pointing to a raster cover
        # and a titlepage entry pointing to the html titlepage. The titlepage
        # entry will be used by the epub output plugin, the raster cover entry
        # by other output plugins.

        # Search for a raster cover identified in the OPF
        raster_cover = opf.raster_cover

        # Set the cover guide entry
        if raster_cover is not None:
            guide_elem.set('href', raster_cover)
        else:
            # Render the titlepage to create a raster cover
            from calibre.ebooks import render_html_svg_workaround
            guide_elem.set('href', 'calibre_raster_cover.jpg')
            t = etree.SubElement(
                elem[0].getparent(), OPF('item'),
                href=guide_elem.get('href'), id='calibre_raster_cover')
            t.set('media-type', 'image/jpeg')
            if os.path.exists(guide_cover):
                renderer = render_html_svg_workaround(guide_cover, log)
                if renderer is not None:
                    with lopen('calibre_raster_cover.jpg', 'wb') as f:
                        f.write(renderer)

        # Set the titlepage guide entry
        self.set_guide_type(opf, 'titlepage', guide_cover, 'Title page')
        return removed

    def find_opf(self):
        '''
        Locate the OPF via META-INF/container.xml in the current directory.

        Returns the path of the first rootfile with the OEBPS package media
        type that exists on disk, or None if there is none or the container
        cannot be read (the error is printed, not raised).
        '''
        from calibre.utils.xml_parse import safe_xml_fromstring

        def attr(n, attr):
            # Match on the attribute name's suffix so that namespaced
            # attribute names are found too.
            for k, v in n.attrib.items():
                if k.endswith(attr):
                    return v

        try:
            with lopen('META-INF/container.xml', 'rb') as f:
                root = safe_xml_fromstring(f.read())
            for r in root.xpath('//*[local-name()="rootfile"]'):
                if attr(r, 'media-type') != "application/oebps-package+xml":
                    continue
                path = attr(r, 'full-path')
                if not path:
                    continue
                path = os.path.join(os.getcwd(), *path.split('/'))
                if os.path.exists(path):
                    return path
        except Exception:
            import traceback
            traceback.print_exc()
        return None

    def convert(self, stream, options, file_ext, log, accelerators):
        '''
        Extract the EPUB in ``stream`` into the current directory, normalise
        its OPF and write it out as ``content.opf``.

        Returns the absolute path of the written ``content.opf``.

        Raises ValueError if no OPF is found, if the book uses DTBook markup
        or if the spine ends up empty, and DRMError if encryption.xml exists
        and process_encryption() reports failure.
        '''
        from calibre.utils.zipfile import ZipFile
        from calibre import walk
        from calibre.ebooks import DRMError
        from calibre.ebooks.metadata.opf2 import OPF
        try:
            zf = ZipFile(stream)
            zf.extractall(os.getcwd())
        except Exception:
            log.exception('EPUB appears to be invalid ZIP file, trying a'
                    ' more forgiving ZIP parser')
            from calibre.utils.localunzip import extractall
            stream.seek(0)
            extractall(stream)
        encfile = os.path.abspath(os.path.join('META-INF', 'encryption.xml'))
        opf = self.find_opf()
        if opf is None:
            # No usable container.xml entry: fall back to the first .opf file
            # found on disk, skipping __MACOSX paths and dot-files.
            for f in walk('.'):
                if f.lower().endswith('.opf') and '__MACOSX' not in f and \
                        not os.path.basename(f).startswith('.'):
                    opf = os.path.abspath(f)
                    break
        path = getattr(stream, 'name', 'stream')

        if opf is None:
            raise ValueError('%s is not a valid EPUB file (could not find opf)'%path)

        opf = os.path.relpath(opf, os.getcwd())
        parts = os.path.split(opf)
        opf = OPF(opf, os.path.dirname(os.path.abspath(opf)))

        self._encrypted_font_uris = []
        if os.path.exists(encfile):
            if not self.process_encryption(encfile, opf, log):
                raise DRMError(os.path.basename(path))
        self.encrypted_fonts = self._encrypted_font_uris

        if len(parts) > 1 and parts[0]:
            # The OPF is in a sub-directory: prefix manifest and guide hrefs
            # with that directory, since content.opf is written to the
            # current directory below.
            delta = '/'.join(parts[:-1])+'/'

            def normpath(href):
                return posixpath.normpath(delta + href)

            for elem in opf.itermanifest():
                elem.set('href', normpath(elem.get('href')))
            for elem in opf.iterguide():
                elem.set('href', normpath(elem.get('href')))

        f = self.rationalize_cover3 if opf.package_version >= 3.0 else self.rationalize_cover2
        self.removed_cover = f(opf, log)
        if self.removed_cover:
            self.removed_items_to_ignore = (self.removed_cover,)
        epub3_nav = opf.epub3_nav
        if epub3_nav is not None:
            self.convert_epub3_nav(epub3_nav, opf, log, options)

        for x in opf.itermanifest():
            if x.get('media-type', '') == 'application/x-dtbook+xml':
                raise ValueError(
                    'EPUB files with DTBook markup are not supported')

        not_for_spine = set()
        for y in opf.itermanifest():
            id_ = y.get('id', None)
            if id_:
                mt = y.get('media-type', None)
                if mt in {
                        'application/vnd.adobe-page-template+xml',
                        'application/vnd.adobe.page-template+xml',
                        'application/adobe-page-template+xml',
                        'application/adobe.page-template+xml',
                        'application/text'
                }:
                    not_for_spine.add(id_)
                ext = y.get('href', '').rpartition('.')[-1].lower()
                if mt == 'text/plain' and ext in {'otf', 'ttf'}:
                    # some epub authoring software sets font mime types to
                    # text/plain
                    not_for_spine.add(id_)
                    y.set('media-type', 'application/font')

        # Drop spine entries that have no idref, point at excluded manifest
        # items, or repeat an earlier idref.
        seen = set()
        for x in list(opf.iterspine()):
            ref = x.get('idref', None)
            if not ref or ref in not_for_spine or ref in seen:
                x.getparent().remove(x)
                continue
            seen.add(ref)

        if not list(opf.iterspine()):
            raise ValueError('No valid entries in the spine of this EPUB')

        with lopen('content.opf', 'wb') as nopf:
            nopf.write(opf.render())

        return os.path.abspath('content.opf')

    def convert_epub3_nav(self, nav_path, opf, log, opts):
        '''
        Build an NCX file from the EPUB 3 navigation document at ``nav_path``.

        If the document has a ``toc`` nav containing an <ol>, the NCX is
        written next to it, added to the manifest and set as the spine's
        ``toc``; otherwise the method returns without changes. Also stores
        the nav href and parsed tree on ``opts`` and, when a cover was
        removed, marks nav links to it with ``data-calibre-removed-titlepage``.
        '''
        from lxml import etree
        from calibre.ebooks.chardet import xml_to_unicode
        from calibre.ebooks.oeb.polish.parsing import parse
        from calibre.ebooks.oeb.base import EPUB_NS, XHTML, NCX_MIME, NCX, urlnormalize, urlunquote, serialize
        from calibre.ebooks.oeb.polish.toc import first_child
        from calibre.utils.xml_parse import safe_xml_fromstring
        from tempfile import NamedTemporaryFile
        with lopen(nav_path, 'rb') as f:
            raw = f.read()
        raw = xml_to_unicode(raw, strip_encoding_pats=True, assume_utf8=True)[0]
        root = parse(raw, log=log)
        ncx = safe_xml_fromstring('<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1" xml:lang="eng"><navMap/></ncx>')
        navmap = ncx[0]
        et = '{%s}type' % EPUB_NS
        bn = os.path.basename(nav_path)

        def add_from_li(li, parent):
            # Create a navPoint under ``parent`` from the first <a>/<span>
            # child of ``li`` (label text, falling back to title attributes).
            href = text = None
            for x in li.iterchildren(XHTML('a'), XHTML('span')):
                text = etree.tostring(
                    x, method='text', encoding='unicode', with_tail=False).strip() or ' '.join(
                        x.xpath('descendant-or-self::*/@title')).strip()
                href = x.get('href')
                if href:
                    if href.startswith('#'):
                        # Fragment-only link: prefix the nav document's name
                        href = bn + href
                break
            np = parent.makeelement(NCX('navPoint'))
            parent.append(np)
            np.append(np.makeelement(NCX('navLabel')))
            np[0].append(np.makeelement(NCX('text')))
            np[0][0].text = text
            if href:
                np.append(np.makeelement(NCX('content'), attrib={'src': href}))
            return np

        def process_nav_node(node, toc_parent):
            # Recursively mirror the nested <ol>/<li> structure as navPoints
            for li in node.iterchildren(XHTML('li')):
                child = add_from_li(li, toc_parent)
                ol = first_child(li, XHTML('ol'))
                if child is not None and ol is not None:
                    process_nav_node(ol, child)

        for nav in root.iterdescendants(XHTML('nav')):
            if nav.get(et) == 'toc':
                ol = first_child(nav, XHTML('ol'))
                if ol is not None:
                    process_nav_node(ol, navmap)
                    break
        else:
            # No toc nav with an <ol> was found
            return

        with NamedTemporaryFile(suffix='.ncx', dir=os.path.dirname(nav_path), delete=False) as f:
            f.write(etree.tostring(ncx, encoding='utf-8'))
        ncx_href = os.path.relpath(f.name, os.getcwd()).replace(os.sep, '/')
        ncx_id = opf.create_manifest_item(ncx_href, NCX_MIME, append=True).get('id')
        for spine in opf.root.xpath('//*[local-name()="spine"]'):
            spine.set('toc', ncx_id)
        opts.epub3_nav_href = urlnormalize(os.path.relpath(nav_path).replace(os.sep, '/'))
        opts.epub3_nav_parsed = root
        if getattr(self, 'removed_cover', None):
            changed = False
            base_path = os.path.dirname(nav_path)
            for elem in root.xpath('//*[@href]'):
                href = elem.get('href').partition('#')[0]
                link_path = os.path.relpath(os.path.join(base_path, urlunquote(href)), base_path)
                abs_href = urlnormalize(link_path)
                if abs_href == self.removed_cover:
                    changed = True
                    elem.set('data-calibre-removed-titlepage', '1')
            if changed:
                with lopen(nav_path, 'wb') as f:
                    f.write(serialize(root, 'application/xhtml+xml'))

    def postprocess_book(self, oeb, opts, log):
        '''
        Record on ``oeb.toc`` the first TOC item that points at the cover
        removed during convert(), if there is one.
        '''
        rc = getattr(self, 'removed_cover', None)
        if rc:
            cover_toc_item = None
            for item in oeb.toc.iterdescendants():
                if item.href and item.href.partition('#')[0] == rc:
                    cover_toc_item = item
                    break
            spine = {x.href for x in oeb.spine}
            # NOTE(review): this tests the TOC node itself against a set of
            # spine hrefs; possibly cover_toc_item.href was intended. Left
            # unchanged to preserve behaviour - confirm.
            if (cover_toc_item is not None and cover_toc_item not in spine):
                oeb.toc.item_that_refers_to_cover = cover_toc_item