%PDF- %PDF-
Direktori : /usr/lib/calibre/calibre/ebooks/conversion/plugins/ |
Current File : //usr/lib/calibre/calibre/ebooks/conversion/plugins/chm_input.py |
''' CHM File decoding support ''' __license__ = 'GPL v3' __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>,' \ ' and Alex Bramley <a.bramley at gmail.com>.' import os from calibre.customize.conversion import InputFormatPlugin from calibre.ptempfile import TemporaryDirectory from calibre.constants import filesystem_encoding from polyglot.builtins import as_bytes class CHMInput(InputFormatPlugin): name = 'CHM Input' author = 'Kovid Goyal and Alex Bramley' description = _('Convert CHM files to OEB') file_types = {'chm'} commit_name = 'chm_input' def _chmtohtml(self, output_dir, chm_path, no_images, log, debug_dump=False): from calibre.ebooks.chm.reader import CHMReader log.debug('Opening CHM file') rdr = CHMReader(chm_path, log, input_encoding=self.opts.input_encoding) log.debug('Extracting CHM to %s' % output_dir) rdr.extract_content(output_dir, debug_dump=debug_dump) self._chm_reader = rdr return rdr.hhc_path def convert(self, stream, options, file_ext, log, accelerators): from calibre.ebooks.chm.metadata import get_metadata_from_reader from calibre.customize.ui import plugin_for_input_format self.opts = options log.debug('Processing CHM...') with TemporaryDirectory('_chm2oeb') as tdir: if not isinstance(tdir, str): tdir = tdir.decode(filesystem_encoding) html_input = plugin_for_input_format('html') for opt in html_input.options: setattr(options, opt.option.name, opt.recommended_value) no_images = False # options.no_images chm_name = stream.name # chm_data = stream.read() # closing stream so CHM can be opened by external library stream.close() log.debug('tdir=%s' % tdir) log.debug('stream.name=%s' % stream.name) debug_dump = False odi = options.debug_pipeline if odi: debug_dump = os.path.join(odi, 'input') mainname = self._chmtohtml(tdir, chm_name, no_images, log, debug_dump=debug_dump) mainpath = os.path.join(tdir, mainname) try: metadata = get_metadata_from_reader(self._chm_reader) except Exception: log.exception('Failed to read metadata, using filename') from calibre.ebooks.metadata.book.base import Metadata metadata = Metadata(os.path.basename(chm_name)) encoding = self._chm_reader.get_encoding() or options.input_encoding or 'cp1252' self._chm_reader.CloseCHM() # print((tdir, mainpath)) # from calibre import ipython # ipython() options.debug_pipeline = None options.input_encoding = 'utf-8' uenc = encoding if os.path.abspath(mainpath) in self._chm_reader.re_encoded_files: uenc = 'utf-8' htmlpath, toc = self._create_html_root(mainpath, log, uenc) oeb = self._create_oebbook_html(htmlpath, tdir, options, log, metadata) options.debug_pipeline = odi if toc.count() > 1: oeb.toc = self.parse_html_toc(oeb.spine[0]) oeb.manifest.remove(oeb.spine[0]) oeb.auto_generated_toc = False return oeb def parse_html_toc(self, item): from calibre.ebooks.oeb.base import TOC, XPath dx = XPath('./h:div') ax = XPath('./h:a[1]') def do_node(parent, div): for child in dx(div): a = ax(child)[0] c = parent.add(a.text, a.attrib['href']) do_node(c, child) toc = TOC() root = XPath('//h:div[1]')(item.data)[0] do_node(toc, root) return toc def _create_oebbook_html(self, htmlpath, basedir, opts, log, mi): # use HTMLInput plugin to generate book from calibre.customize.builtins import HTMLInput opts.breadth_first = True htmlinput = HTMLInput(None) oeb = htmlinput.create_oebbook(htmlpath, basedir, opts, log, mi) return oeb def _create_html_root(self, hhcpath, log, encoding): from lxml import html from polyglot.urllib import unquote as _unquote from calibre.ebooks.oeb.base import urlquote from calibre.ebooks.chardet import xml_to_unicode hhcdata = self._read_file(hhcpath) hhcdata = hhcdata.decode(encoding) hhcdata = xml_to_unicode(hhcdata, verbose=True, strip_encoding_pats=True, resolve_entities=True)[0] hhcroot = html.fromstring(hhcdata) toc = self._process_nodes(hhcroot) # print("=============================") # print("Printing hhcroot") # print(etree.tostring(hhcroot, pretty_print=True)) # print("=============================") log.debug('Found %d section nodes' % toc.count()) htmlpath = os.path.splitext(hhcpath)[0] + ".html" base = os.path.dirname(os.path.abspath(htmlpath)) def unquote(x): if isinstance(x, str): x = x.encode('utf-8') return _unquote(x).decode('utf-8') def unquote_path(x): y = unquote(x) if (not os.path.exists(os.path.join(base, x)) and os.path.exists(os.path.join(base, y))): x = y return x def donode(item, parent, base, subpath): for child in item: title = child.title if not title: continue raw = unquote_path(child.href or '') rsrcname = os.path.basename(raw) rsrcpath = os.path.join(subpath, rsrcname) if (not os.path.exists(os.path.join(base, rsrcpath)) and os.path.exists(os.path.join(base, raw))): rsrcpath = raw if '%' not in rsrcpath: rsrcpath = urlquote(rsrcpath) if not raw: rsrcpath = '' c = DIV(A(title, href=rsrcpath)) donode(child, c, base, subpath) parent.append(c) with open(htmlpath, 'wb') as f: if toc.count() > 1: from lxml.html.builder import HTML, BODY, DIV, A path0 = toc[0].href path0 = unquote_path(path0) subpath = os.path.dirname(path0) base = os.path.dirname(f.name) root = DIV() donode(toc, root, base, subpath) raw = html.tostring(HTML(BODY(root)), encoding='utf-8', pretty_print=True) f.write(raw) else: f.write(as_bytes(hhcdata)) return htmlpath, toc def _read_file(self, name): with lopen(name, 'rb') as f: data = f.read() return data def add_node(self, node, toc, ancestor_map): from calibre.ebooks.chm.reader import match_string if match_string(node.attrib.get('type', ''), 'text/sitemap'): p = node.xpath('ancestor::ul[1]/ancestor::li[1]/object[1]') parent = p[0] if p else None toc = ancestor_map.get(parent, toc) title = href = '' for param in node.xpath('./param'): if match_string(param.attrib['name'], 'name'): title = param.attrib['value'] elif match_string(param.attrib['name'], 'local'): href = param.attrib['value'] child = toc.add(title or _('Unknown'), href) ancestor_map[node] = child def _process_nodes(self, root): from calibre.ebooks.oeb.base import TOC toc = TOC() ancestor_map = {} for node in root.xpath('//object'): self.add_node(node, toc, ancestor_map) return toc