%PDF- %PDF-
Direktori : /lib/calibre/calibre/ebooks/oeb/iterator/ |
Current File : //lib/calibre/calibre/ebooks/oeb/iterator/book.py |
#!/usr/bin/env python3

__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

'''
Iterate over the HTML files in an ebook. Useful for writing viewers.
'''

import re
import os
import math
from functools import partial

from calibre.ebooks.metadata.opf2 import OPF
from calibre.ptempfile import PersistentTemporaryDirectory, remove_dir
from calibre.utils.config import DynamicConfig
from calibre.utils.logging import default_log
from calibre.utils.tdir_in_cache import tdir_in_cache
from calibre import guess_type, prepare_string_for_xml
from calibre.ebooks.oeb.transforms.cover import CoverManager
from calibre.ebooks.oeb.iterator.spine import (SpineItem, create_indexing_data)
from calibre.ebooks.oeb.iterator.bookmarks import BookmarksMixin
from calibre.ebooks.oeb.base import urlparse, urlunquote

# SVG cover-page template with a fixed 600x800 viewbox. It is later
# %-formatted with the (XML-escaped) relative path of the cover image.
TITLEPAGE = CoverManager.SVG_TEMPLATE.replace(
    '__ar__', 'none').replace('__viewbox__', '0 0 600 800'
    ).replace('__width__', '600').replace('__height__', '800')

# Local tag names whose text content is never searched.
_UNSEARCHABLE_TAGS = frozenset({'script', 'style', 'del'})


class FakeOpts:
    # Plain holder of option values; not referenced elsewhere in this module.
    verbose = 0
    breadth_first = False
    max_levels = 5
    input_encoding = None


def write_oebbook(oeb, path):
    '''
    Write ``oeb`` into the directory ``path`` using ``OEBWriter``.

    :return: The first file under ``path`` whose name ends with ``.opf``,
             or ``None`` if no such file was written.
    '''
    from calibre.ebooks.oeb.writer import OEBWriter
    from calibre import walk
    w = OEBWriter()
    w(oeb, path)
    for f in walk(path):
        if f.endswith('.opf'):
            return f
    return None


def extract_book(pathtoebook, tdir, log=None, view_kepub=False, processed=False, only_input_plugin=False):
    '''
    Run the conversion input plugin on ``pathtoebook``, exploding it into
    ``tdir``.

    :param pathtoebook: Path to the book file to extract.
    :param tdir: Directory that receives the extracted files.
    :param log: Logger to use; falls back to ``default_log``.
    :param view_kepub: Passed through to ``Plumber``.
    :param processed: If True, always run ``create_oebbook`` on the plugin
                      output. Otherwise that only happens for PDB, PDF and
                      RB input that did not already yield an object with a
                      ``manifest`` attribute.
    :param only_input_plugin: If True, skip the ``create_oebbook`` step
                              entirely.
    :return: A tuple ``(book_format, pathtoopf, input_fmt)``.
    '''
    from calibre.ebooks.conversion.plumber import Plumber, create_oebbook
    from calibre.utils.logging import default_log
    log = log or default_log
    plumber = Plumber(pathtoebook, tdir, log, view_kepub=view_kepub)
    plumber.setup_options()
    if pathtoebook.lower().endswith('.opf'):
        plumber.opts.dont_package = True
    if hasattr(plumber.opts, 'no_process'):
        plumber.opts.no_process = True
    plumber.input_plugin.for_viewer = True

    with plumber.input_plugin, open(plumber.input, 'rb') as inf:
        # The plugin result is either a path to an OPF file or an object
        # with a ``manifest`` attribute (presumably an in-memory OEB book),
        # hence the hasattr() checks below.
        pathtoopf = plumber.input_plugin(
            inf, plumber.opts, plumber.input_fmt, log, {}, tdir)

        if not only_input_plugin:
            # Run the HTML preprocess/parsing from the conversion pipeline as
            # well
            needs_processing = processed or (
                plumber.input_fmt.lower() in {'pdb', 'pdf', 'rb'}
                and not hasattr(pathtoopf, 'manifest'))
            if needs_processing:
                if hasattr(pathtoopf, 'manifest'):
                    pathtoopf = write_oebbook(pathtoopf, tdir)
                pathtoopf = create_oebbook(log, pathtoopf, plumber.opts)

        # Callers need a path on disk, so serialize any remaining book object.
        if hasattr(pathtoopf, 'manifest'):
            pathtoopf = write_oebbook(pathtoopf, tdir)

    book_format = os.path.splitext(pathtoebook)[1][1:].upper()
    if getattr(plumber.input_plugin, 'is_kf8', False):
        fs = ':joint' if getattr(plumber.input_plugin, 'mobi_is_joint', False) else ''
        book_format = 'KF8' + fs
    return book_format, pathtoopf, plumber.input_fmt


def run_extract_book(*args, **kwargs):
    '''
    Run :func:`extract_book` via ``fork_job`` and return its result.

    All arguments are forwarded unchanged to :func:`extract_book`.
    '''
    from calibre.utils.ipc.simple_worker import fork_job
    ans = fork_job('calibre.ebooks.oeb.iterator.book', 'extract_book',
                   args=args, kwargs=kwargs, timeout=3000, no_output=True)
    return ans['result']


def _collect_search_text(elem, fragments):
    '''
    Append the lower-cased text inside ``elem`` to ``fragments`` in document
    order: the element's own text, then each child (recursively) followed by
    that child's tail. The tail of ``elem`` itself is not included.

    Children whose tag is not string-like, or whose local tag name is in
    ``_UNSEARCHABLE_TAGS``, contribute only their tail.
    '''
    if elem.text:
        fragments.append(elem.text.lower())
    for child in elem.iterchildren():
        tag = getattr(child, 'tag', None)
        # Only string-like tags have rpartition(); this skips nodes such as
        # comments (presumably, for lxml trees) as well as excluded elements.
        if hasattr(tag, 'rpartition') and tag.rpartition('}')[-1] not in _UNSEARCHABLE_TAGS:
            _collect_search_text(child, fragments)
        # The tail follows the child's whole subtree in the document, so it
        # must be emitted after the recursion, not before it.
        if getattr(child, 'tail', None):
            fragments.append(child.tail.lower())


class EbookIterator(BookmarksMixin):
    '''
    Context manager that extracts an ebook into a temporary directory and
    exposes its spine, table of contents, metadata and page numbering.
    '''

    # Number of characters counted as one "page" when paginating the spine.
    CHARACTERS_PER_PAGE = 1000

    def __init__(self, pathtoebook, log=None, copy_bookmarks_to_file=True, use_tdir_in_cache=False):
        '''
        :param pathtoebook: Path to the book; surrounding whitespace is
                            stripped and the path is made absolute.
        :param log: Logger to use; falls back to ``default_log``.
        :param copy_bookmarks_to_file: Forwarded to ``BookmarksMixin``.
        :param use_tdir_in_cache: If True, the working directory is created
                                  with ``tdir_in_cache`` instead of
                                  ``PersistentTemporaryDirectory``.
        '''
        BookmarksMixin.__init__(self, copy_bookmarks_to_file=copy_bookmarks_to_file)
        self.use_tdir_in_cache = use_tdir_in_cache
        self.log = log or default_log
        pathtoebook = pathtoebook.strip()
        self.pathtoebook = os.path.abspath(pathtoebook)
        self.config = DynamicConfig(name='iterator')
        ext = os.path.splitext(pathtoebook)[1].replace('.', '').lower()
        # Normalise htm/html/xhtm/xhtml to a single "html" extension.
        ext = re.sub(r'(x{0,1})htm(l{0,1})', 'html', ext)
        self.ebook_ext = ext.replace('original_', '')

    def search(self, text, index, backwards=False):
        '''
        Find the nearest spine item containing ``text`` (case-insensitive).

        :param text: The text to look for.
        :param index: Spine index to start from; this item itself is never
                      examined.
        :param backwards: Search items before ``index`` (nearest first)
                          instead of items after it.
        :return: The index of the first matching spine item, or ``None``.
        '''
        from calibre.ebooks.oeb.polish.parsing import parse
        pmap = list(enumerate(self.spine))
        if backwards:
            pmap.reverse()
        q = text.lower()
        for i, path in pmap:
            in_range = i < index if backwards else i > index
            if not in_range:
                continue
            with open(path, 'rb') as f:
                raw = f.read().decode(path.encoding)
            root = parse(raw)
            fragments = []
            for body in root.xpath('//*[local-name() = "body"]'):
                _collect_search_text(body, fragments)
            if q in ''.join(fragments):
                return i
        return None

    def __enter__(self, processed=False, only_input_plugin=False,
                  run_char_count=True, read_anchor_map=True, view_kepub=False, read_links=True):
        '''
        Convert an ebook file into an exploded OEB book suitable
        for display in viewers/preprocessing etc.

        Populates ``base``, ``book_format``, ``pathtoopf``, ``opf``, ``mi``,
        ``language``, ``spine``, ``pages`` and ``toc``, then verifies links
        and reads bookmarks.

        :return: ``self``
        '''
        self.delete_on_exit = []
        if self.use_tdir_in_cache:
            self._tdir = tdir_in_cache('ev')
        else:
            self._tdir = PersistentTemporaryDirectory('_ebook_iter')
        self.base = os.path.realpath(self._tdir)
        self.book_format, self.pathtoopf, input_fmt = run_extract_book(
            self.pathtoebook, self.base, only_input_plugin=only_input_plugin,
            view_kepub=view_kepub, processed=processed)
        self.opf = OPF(self.pathtoopf, os.path.dirname(self.pathtoopf))
        self.mi = self.opf.to_book_metadata()
        self.language = None
        if self.mi.languages:
            self.language = self.mi.languages[0].lower()

        self.spine = []
        Spiny = partial(SpineItem, read_anchor_map=read_anchor_map, read_links=read_links,
                        run_char_count=run_char_count, from_epub=self.book_format == 'EPUB')
        if input_fmt.lower() == 'htmlz':
            # HTMLZ input: the spine is the single index.html next to the OPF.
            self.spine.append(Spiny(
                os.path.join(os.path.dirname(self.pathtoopf), 'index.html'),
                mime_type='text/html'))
        else:
            # Linear items first, then non-linear ones, each group in spine
            # order.
            ordered = [i for i in self.opf.spine if i.is_linear] + \
                      [i for i in self.opf.spine if not i.is_linear]
            is_comic = input_fmt.lower() in {'cbc', 'cbz', 'cbr', 'cb7'}
            for i in ordered:
                spath = i.path
                mt = None
                if i.idref is not None:
                    mt = self.opf.manifest.type_for_id(i.idref)
                if mt is None:
                    mt = guess_type(spath)[0]
                try:
                    self.spine.append(Spiny(spath, mime_type=mt))
                    if is_comic:
                        self.spine[-1].is_single_page = True
                except Exception:
                    # An item that cannot be loaded is skipped, not fatal.
                    self.log.warn('Missing spine item:', repr(spath))

        cover = self.opf.cover
        if cover and self.ebook_ext in {'lit', 'mobi', 'prc', 'opf', 'fb2',
                                        'azw', 'azw3', 'docx', 'htmlz'}:
            # For these formats, generate a cover page and put it at the
            # front of the spine.
            cfile = os.path.join(self.base, 'calibre_iterator_cover.html')
            rcpath = os.path.relpath(cover, self.base).replace(os.sep, '/')
            chtml = (TITLEPAGE%prepare_string_for_xml(rcpath, True)).encode('utf-8')
            with open(cfile, 'wb') as f:
                f.write(chtml)
            self.spine[0:0] = [Spiny(cfile, mime_type='application/xhtml+xml')]
            self.delete_on_exit.append(cfile)

        if self.opf.path_to_html_toc is not None and \
                self.opf.path_to_html_toc not in self.spine:
            try:
                self.spine.append(Spiny(self.opf.path_to_html_toc))
            except Exception:
                import traceback
                traceback.print_exc()

        # Assign each spine item a page count and an inclusive page range,
        # numbering pages from 1 across the whole book.
        self.pages = [math.ceil(item.character_count/float(self.CHARACTERS_PER_PAGE))
                      for item in self.spine]
        start = 1
        for item, pages in zip(self.spine, self.pages):
            item.pages = pages
            item.start_page = start
            item.max_page = start + pages - 1
            start += pages

        self.toc = self.opf.toc
        if read_anchor_map:
            create_indexing_data(self.spine, self.toc)

        self.verify_links()
        self.read_bookmarks()
        return self

    def verify_links(self):
        '''
        Record, for every spine item, which of its links resolve to a spine
        item and (when the link has a fragment) to an anchor known in that
        item. Matches are added to the item's ``verified_links`` set as
        ``(target_item, fragment)`` tuples.
        '''
        # Maps a path to the spine item object for that path, so that a
        # plain path string can be swapped for the item carrying anchor_map.
        spine_paths = {s: s for s in self.spine}
        for item in self.spine:
            base = os.path.dirname(item)
            for link in item.all_links:
                try:
                    p = urlparse(urlunquote(link))
                except Exception:
                    continue
                if p.scheme or p.netloc:
                    # Not a link local to the book.
                    continue
                # A link with no path refers to the containing item itself.
                path = os.path.abspath(os.path.join(base, p.path)) if p.path else item
                target = spine_paths.get(path)
                if target is None:
                    continue
                if not p.fragment or p.fragment in target.anchor_map:
                    item.verified_links.add((target, p.fragment))

    def __exit__(self, *args):
        '''
        Remove the temporary directory and any files registered in
        ``delete_on_exit``. Cleanup is best-effort.
        '''
        remove_dir(self._tdir)
        for x in self.delete_on_exit:
            try:
                os.remove(x)
            except OSError:
                # Typically already gone along with the temporary directory.
                pass