# NOTE(review): the three lines previously here were webshell-dump artifacts
# (a stray '%PDF-' marker and directory-listing text), not part of the module.
# Recorded original path: /usr/lib/calibre/calibre/ebooks/mobi/writer8/main.py
#!/usr/bin/env python3

__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

import copy, logging
from functools import partial
from collections import defaultdict, namedtuple
from io import BytesIO
from struct import pack

import css_parser
from css_parser.css import CSSRule
from lxml import etree

from calibre import isbytestring, force_unicode
from calibre.ebooks.mobi.utils import (create_text_record, to_base,
        is_guide_ref_start)
from calibre.ebooks.compression.palmdoc import compress_doc
from calibre.ebooks.oeb.base import (OEB_DOCS, OEB_STYLES, SVG_MIME,
        XPath, extract, XHTML, urlnormalize)
from calibre.ebooks.oeb.normalize_css import condense_sheet
from calibre.ebooks.oeb.parse_utils import barename
from calibre.ebooks.mobi.writer8.skeleton import Chunker, aid_able_tags, to_href
from calibre.ebooks.mobi.writer8.index import (NCXIndex, SkelIndex,
        ChunkIndex, GuideIndex, NonLinearNCXIndex)
from calibre.ebooks.mobi.writer8.mobi import KF8Book
from calibre.ebooks.mobi.writer8.tbs import apply_trailing_byte_sequences
from calibre.ebooks.mobi.writer8.toc import TOCAdder
from polyglot.builtins import iteritems

XML_DOCS = OEB_DOCS | {SVG_MIME}

# References to record numbers in KF8 are stored as base-32 encoded integers,
# with 4 digits
to_ref = partial(to_base, base=32, min_num_digits=4)


class KF8Writer:
    '''Drive the conversion of an OEB book into the markup, flows, records
    and indices that make up a KF8 (MOBI 8) book.

    The constructor runs the full pipeline; the results are consumed by
    :class:`KF8Book` (see :func:`create_kf8_book`).
    '''

    def __init__(self, oeb, opts, resources):
        self.oeb, self.opts, self.log = oeb, opts, oeb.log
        self.compress = not self.opts.dont_compress
        self.has_tbs = False
        self.log.info('Creating KF8 output')

        # Create an inline ToC if one does not already exist
        self.toc_adder = TOCAdder(oeb, opts)
        self.used_images = set()
        self.resources = resources
        self.flows = [None]  # First flow item is reserved for the text
        self.records = [None]  # Placeholder for zeroth record

        self.log('\tGenerating KF8 markup...')
        self.dup_data()
        self.cleanup_markup()
        self.replace_resource_links()
        self.extract_css_into_flows()
        self.extract_svg_into_flows()
        self.replace_internal_links_with_placeholders()
        self.insert_aid_attributes()
        self.chunk_it_up()
        # Dump the cloned data as it is no longer needed
        del self._data_cache
        self.create_text_records()
        self.log('\tCreating indices...')
        self.create_fdst_records()
        self.create_indices()
        self.create_guide()
        # We do not want to use this ToC for MOBI 6, so remove it
        self.toc_adder.remove_generated_toc()

    def dup_data(self):
        ''' Duplicate data so that any changes we make to markup/CSS only
        affect KF8 output and not MOBI 6 output '''
        self._data_cache = {}
        # Suppress css_parser logging output as it is duplicated anyway earlier
        # in the pipeline
        css_parser.log.setLevel(logging.CRITICAL)
        for item in self.oeb.manifest:
            if item.media_type in XML_DOCS:
                self._data_cache[item.href] = copy.deepcopy(item.data)
            elif item.media_type in OEB_STYLES:
                # I can't figure out how to make an efficient copy of the
                # in-memory CSSStylesheet, as deepcopy doesn't work (raises an
                # exception)
                self._data_cache[item.href] = css_parser.parseString(
                        item.data.cssText, validate=False)

    def data(self, item):
        '''Return the KF8-private copy of *item*'s data, falling back to the
        item's own data if it was never duplicated.'''
        return self._data_cache.get(item.href, item.data)

    def cleanup_markup(self):
        '''Strip markup that is useless in, or conflicts with, KF8 output.'''
        for item in self.oeb.spine:
            root = self.data(item)

            # Remove empty script tags as they are pointless
            for tag in XPath('//h:script')(root):
                if not tag.text and not tag.get('src', False):
                    tag.getparent().remove(tag)

            # Remove [ac]id attributes as they are used by this code for anchor
            # to offset mapping
            for tag in XPath('//*[@aid or @cid]')(root):
                tag.attrib.pop('aid', None), tag.attrib.pop('cid', None)

    def replace_resource_links(self):
        ''' Replace links to resources (raster images/fonts) with pointers to
        the MOBI record containing the resource. The pointers are of the form:
        kindle:embed:XXXX?mime=image/* The ?mime= is apparently optional and
        not used for fonts. '''

        def pointer(item, oref):
            # Map a single href to its kindle:embed pointer; leave it
            # untouched when it does not refer to a known resource record.
            ref = urlnormalize(item.abshref(oref))
            idx = self.resources.item_map.get(ref, None)
            if idx is not None:
                is_image = self.resources.records[idx-1][:4] not in {b'FONT'}
                idx = to_ref(idx)
                if is_image:
                    self.used_images.add(ref)
                    return 'kindle:embed:%s?mime=%s'%(idx,
                            self.resources.mime_map[ref])
                else:
                    return 'kindle:embed:%s'%idx
            return oref

        for item in self.oeb.manifest:

            if item.media_type in XML_DOCS:
                root = self.data(item)
                for tag in XPath('//h:img|//svg:image')(root):
                    for attr, ref in iteritems(tag.attrib):
                        if attr.split('}')[-1].lower() in {'src', 'href'}:
                            tag.attrib[attr] = pointer(item, ref)

                for tag in XPath('//h:style')(root):
                    if tag.text:
                        sheet = css_parser.parseString(tag.text, validate=False)
                        replacer = partial(pointer, item)
                        css_parser.replaceUrls(sheet, replacer,
                                ignoreImportRules=True)
                        repl = sheet.cssText
                        if isbytestring(repl):
                            repl = repl.decode('utf-8')
                        tag.text = '\n'+ repl + '\n'

            elif item.media_type in OEB_STYLES:
                sheet = self.data(item)
                replacer = partial(pointer, item)
                css_parser.replaceUrls(sheet, replacer, ignoreImportRules=True)

    def extract_css_into_flows(self):
        '''Move all CSS (stylesheet files and inline <style> tags) into
        separate KF8 flows, rewriting links/@import rules to kindle:flow
        pointers.'''
        inlines = defaultdict(list)  # Ensure identical <style>s not repeated
        sheets = {}
        passthrough = getattr(self.opts, 'mobi_passthrough', False)

        for item in self.oeb.manifest:
            if item.media_type in OEB_STYLES:
                sheet = self.data(item)
                if not passthrough and not self.opts.expand_css and hasattr(item.data, 'cssText'):
                    condense_sheet(sheet)
                sheets[item.href] = len(self.flows)
                self.flows.append(sheet)

        def fix_import_rules(sheet):
            # Rewrite @import hrefs to kindle:flow pointers. NOTE: reads the
            # enclosing loop variable `item` late-bound, so it resolves hrefs
            # relative to whichever item is current when called.
            changed = False
            for rule in sheet.cssRules.rulesOfType(CSSRule.IMPORT_RULE):
                if rule.href:
                    href = item.abshref(rule.href)
                    idx = sheets.get(href, None)
                    if idx is not None:
                        idx = to_ref(idx)
                        rule.href = 'kindle:flow:%s?mime=text/css'%idx
                        changed = True
            return changed

        for item in self.oeb.spine:
            root = self.data(item)

            for link in XPath('//h:link[@href]')(root):
                href = item.abshref(link.get('href'))
                idx = sheets.get(href, None)
                if idx is not None:
                    idx = to_ref(idx)
                    link.set('href', 'kindle:flow:%s?mime=text/css'%idx)

            for tag in XPath('//h:style')(root):
                p = tag.getparent()
                idx = p.index(tag)
                raw = tag.text
                if not raw or not raw.strip():
                    extract(tag)
                    continue
                sheet = css_parser.parseString(raw, validate=False)
                if fix_import_rules(sheet):
                    raw = force_unicode(sheet.cssText, 'utf-8')

                # Replace the inline <style> with a <link> to the (shared)
                # flow that will be created for this CSS text below
                repl = etree.Element(XHTML('link'), type='text/css',
                        rel='stylesheet')
                repl.tail='\n'
                p.insert(idx, repl)
                extract(tag)
                inlines[raw].append(repl)

        for raw, elems in iteritems(inlines):
            idx = to_ref(len(self.flows))
            self.flows.append(raw)
            for link in elems:
                link.set('href', 'kindle:flow:%s?mime=text/css'%idx)

        for item in self.oeb.manifest:
            if item.media_type in OEB_STYLES:
                sheet = self.data(item)
                if hasattr(sheet, 'cssRules'):
                    fix_import_rules(sheet)

        # Serialize any remaining CSSStyleSheet objects in the flows to text
        for i, sheet in enumerate(tuple(self.flows)):
            if hasattr(sheet, 'cssText'):
                self.flows[i] = force_unicode(sheet.cssText, 'utf-8')

    def extract_svg_into_flows(self):
        '''Move SVG documents (standalone files and inline <svg> elements)
        into separate flows, replacing them with <img> pointers.'''
        images = {}
        for item in self.oeb.manifest:
            if item.media_type == SVG_MIME:
                data = self.data(item)
                images[item.href] = len(self.flows)
                self.flows.append(etree.tostring(data, encoding='UTF-8',
                        with_tail=True, xml_declaration=True))

        for item in self.oeb.spine:
            root = self.data(item)

            for svg in XPath('//svg:svg')(root):
                raw = etree.tostring(svg, encoding='unicode', with_tail=False)
                idx = len(self.flows)
                self.flows.append(raw)
                p = svg.getparent()
                pos = p.index(svg)
                img = etree.Element(XHTML('img'),
                        src="kindle:flow:%s?mime=image/svg+xml"%to_ref(idx))
                p.insert(pos, img)
                extract(svg)

            for img in XPath('//h:img[@src]')(root):
                src = img.get('src')
                abshref = item.abshref(src)
                idx = images.get(abshref, None)
                if idx is not None:
                    img.set('src', 'kindle:flow:%s?mime=image/svg+xml'%
                            to_ref(idx))

    def replace_internal_links_with_placeholders(self):
        '''Replace intra-book <a href> links with kindle:pos placeholders;
        the real positions are filled in later by the Chunker via
        ``self.link_map``.'''
        self.link_map = {}
        count = 0
        hrefs = {item.href for item in self.oeb.spine}
        for item in self.oeb.spine:
            root = self.data(item)

            for a in XPath('//h:a[@href]')(root):
                count += 1
                ref = item.abshref(a.get('href'))
                href, _, frag = ref.partition('#')
                try:
                    href = urlnormalize(href)
                except ValueError:
                    # a non utf-8 quoted url? Since we cannot interpret it,
                    # pass it through.
                    pass
                if href in hrefs:
                    placeholder = 'kindle:pos:fid:0000:off:%s'%to_href(count)
                    self.link_map[placeholder] = (href, frag)
                    a.set('href', placeholder)

    def insert_aid_attributes(self):
        '''Tag elements with aid (or, inside tables, cid) attributes and build
        ``self.id_map`` from (href, id) to the assigned attribute value.'''
        self.id_map = {}
        cid = 0
        for i, item in enumerate(self.oeb.spine):
            root = self.data(item)
            aidbase = i * int(1e6)
            j = 0

            def in_table(elem):
                # True if elem has a <table> anywhere among its ancestors
                p = elem.getparent()
                if p is None:
                    return False
                if barename(p.tag).lower() == 'table':
                    return True
                return in_table(p)
            for tag in root.iterdescendants(etree.Element):
                id_ = tag.attrib.get('id', None)
                if id_ is None and tag.tag == XHTML('a'):
                    # Can happen during tweaking
                    id_ = tag.attrib.get('name', None)
                    if id_ is not None:
                        tag.attrib['id'] = id_
                tagname = barename(tag.tag).lower()
                if id_ is not None or tagname in aid_able_tags:
                    if tagname == 'table' or in_table(tag):
                        # The Kindle renderer barfs on large tables that have
                        # aid on any of their tags. See
                        # https://bugs.launchpad.net/bugs/1489495
                        if id_:
                            cid += 1
                            val = 'c%d' % cid
                            self.id_map[(item.href, id_)] = val
                            tag.set('cid', val)
                    else:
                        aid = to_base(aidbase + j, base=32)
                        tag.set('aid', aid)
                        if tag.tag == XHTML('body'):
                            self.id_map[(item.href, '')] = aid
                        if id_ is not None:
                            self.id_map[(item.href, id_)] = aid
                        j += 1

    def chunk_it_up(self):
        '''Run the Chunker over the spine, resolving link placeholders to
        aids, and adopt its skeleton/chunk tables and serialized text.'''
        placeholder_map = {}
        for placeholder, x in iteritems(self.link_map):
            href, frag = x
            aid = self.id_map.get(x, None)
            if aid is None:
                # Link to a file, not to an anchor: fall back to the body aid
                aid = self.id_map.get((href, ''))
            placeholder_map[placeholder] = aid
        chunker = Chunker(self.oeb, self.data, placeholder_map)

        for x in ('skel_table', 'chunk_table', 'aid_offset_map'):
            setattr(self, x, getattr(chunker, x))

        self.flows[0] = chunker.text

    def create_text_records(self):
        '''Split the concatenated flows into (optionally PalmDoc-compressed)
        text records, each followed by its overlap bytes and overlap length.'''
        self.flows = [x.encode('utf-8') if isinstance(x, str) else x
                for x in self.flows]
        text = b''.join(self.flows)
        self.text_length = len(text)
        text = BytesIO(text)
        nrecords = 0
        records_size = 0
        self.uncompressed_record_lengths = []

        if self.compress:
            self.oeb.logger.info('\tCompressing markup...')

        while text.tell() < self.text_length:
            data, overlap = create_text_record(text)
            self.uncompressed_record_lengths.append(len(data))
            if self.compress:
                data = compress_doc(data)

            data += overlap
            data += pack(b'>B', len(overlap))

            self.records.append(data)
            records_size += len(data)
            nrecords += 1

        self.last_text_record_idx = nrecords
        self.first_non_text_record_idx = nrecords + 1
        # Pad so that the next records starts at a 4 byte boundary
        if records_size % 4 != 0:
            self.records.append(b'\x00'*(records_size % 4))
            self.first_non_text_record_idx += 1

    def create_fdst_records(self):
        '''Build the FDST record that maps each flow to its (start, end) byte
        range within the concatenated text.'''
        FDST = namedtuple('Flow', 'start end')
        entries = []
        self.fdst_table = []
        for i, flow in enumerate(self.flows):
            start = 0 if i == 0 else self.fdst_table[-1].end
            self.fdst_table.append(FDST(start, start + len(flow)))
            entries.extend(self.fdst_table[-1])
        rec = (b'FDST' + pack(b'>LL', 12, len(self.fdst_table)) +
                pack(b'>%dL'%len(entries), *entries))
        self.fdst_records = [rec]
        self.fdst_count = len(self.fdst_table)

    def create_indices(self):
        '''Create the skeleton, chunk and NCX (ToC) index records, and apply
        trailing byte sequences to the text records.'''
        self.skel_records = SkelIndex(self.skel_table)()
        self.chunk_records = ChunkIndex(self.chunk_table)()
        self.ncx_records = []
        toc = self.oeb.toc
        entries = []
        is_periodical = self.opts.mobi_periodical
        if toc.count() < 1:
            self.log.warn('Document has no ToC, MOBI will have no NCX index')
            return

        # Flatten the ToC into a depth first list
        fl = toc.iterdescendants()
        for i, item in enumerate(fl):
            entry = {'id': id(item), 'index': i, 'label': (item.title or
                _('Unknown')), 'children': []}
            entry['depth'] = getattr(item, 'ncx_hlvl', 0)
            p = getattr(item, 'ncx_parent', None)
            if p is not None:
                entry['parent_id'] = p
            for child in item:
                child.ncx_parent = entry['id']
                child.ncx_hlvl = entry['depth'] + 1
                entry['children'].append(id(child))
            if is_periodical:
                if item.author:
                    entry['author'] = item.author
                if item.description:
                    entry['description'] = item.description
            entries.append(entry)
            href = item.href or ''
            href, frag = href.partition('#')[0::2]
            aid = self.id_map.get((href, frag), None)
            if aid is None:
                aid = self.id_map.get((href, ''), None)
            if aid is None:
                pos, fid = 0, 0
                chunk = self.chunk_table[pos]
                offset = chunk.insert_pos + fid
            else:
                pos, fid, offset = self.aid_offset_map[aid]
            entry['pos_fid'] = (pos, fid)
            entry['offset'] = offset

        # The Kindle requires entries to be sorted by (depth, playorder)
        # However, I cannot figure out how to deal with non linear ToCs, i.e.
        # ToCs whose nth entry at depth d has an offset after its n+k entry at
        # the same depth, so we sort on (depth, offset) instead. This re-orders
        # the ToC to be linear. A non-linear ToC causes section to section
        # jumping to not work. kindlegen somehow handles non-linear tocs, but I
        # cannot figure out how.
        original = sorted(entries,
                key=lambda entry: (entry['depth'], entry['index']))
        linearized = sorted(entries,
                key=lambda entry: (entry['depth'], entry['offset']))
        is_non_linear = original != linearized
        entries = linearized
        is_non_linear = False  # False as we are using the linearized entries

        if is_non_linear:
            for entry in entries:
                entry['kind'] = 'chapter'

        for i, entry in enumerate(entries):
            entry['index'] = i
        id_to_index = {entry['id']: entry['index'] for entry in entries}

        # Write the hierarchical information
        for entry in entries:
            children = entry.pop('children')
            if children:
                entry['first_child'] = id_to_index[children[0]]
                entry['last_child'] = id_to_index[children[-1]]
            if 'parent_id' in entry:
                entry['parent'] = id_to_index[entry.pop('parent_id')]

        # Write the lengths
        def get_next_start(entry):
            # The offset of the next entry at the same or a shallower depth,
            # or the end of the text if there is none
            enders = [e['offset'] for e in entries if e['depth'] <=
                    entry['depth'] and e['offset'] > entry['offset']]
            if enders:
                return min(enders)
            return len(self.flows[0])
        for entry in entries:
            entry['length'] = get_next_start(entry) - entry['offset']

        self.has_tbs = apply_trailing_byte_sequences(entries, self.records,
                self.uncompressed_record_lengths)
        idx_type = NonLinearNCXIndex if is_non_linear else NCXIndex
        self.ncx_records = idx_type(entries)()

    def create_guide(self):
        '''Build the guide index records from the OEB guide, recording the
        text-start offset if a start reference is present.'''
        self.start_offset = None
        self.guide_table = []
        self.guide_records = []
        GuideRef = namedtuple('GuideRef', 'title type pos_fid')
        for ref in self.oeb.guide.values():
            href, frag = ref.href.partition('#')[0::2]
            aid = self.id_map.get((href, frag), None)
            if aid is None:
                aid = self.id_map.get((href, ''))
            if aid is None:
                continue
            pos, fid, offset = self.aid_offset_map[aid]
            if is_guide_ref_start(ref):
                self.start_offset = offset
            self.guide_table.append(GuideRef(ref.title or
                _('Unknown'), ref.type, (pos, fid)))

        if self.guide_table:
            self.guide_table.sort(key=lambda x: x.type)  # Needed by the Kindle
            self.guide_records = GuideIndex(self.guide_table)()


def create_kf8_book(oeb, opts, resources, for_joint=False):
    '''Run the full KF8 conversion pipeline over *oeb* and return the
    resulting :class:`KF8Book`.'''
    writer = KF8Writer(oeb, opts, resources)
    return KF8Book(writer, for_joint=for_joint)