# Extracted from: /usr/lib/calibre/calibre/ebooks/mobi/writer8/main.py
#!/usr/bin/env python3
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import copy, logging
from functools import partial
from collections import defaultdict, namedtuple
from io import BytesIO
from struct import pack
import css_parser
from css_parser.css import CSSRule
from lxml import etree
from calibre import isbytestring, force_unicode
from calibre.ebooks.mobi.utils import (create_text_record, to_base,
is_guide_ref_start)
from calibre.ebooks.compression.palmdoc import compress_doc
from calibre.ebooks.oeb.base import (OEB_DOCS, OEB_STYLES, SVG_MIME, XPath,
extract, XHTML, urlnormalize)
from calibre.ebooks.oeb.normalize_css import condense_sheet
from calibre.ebooks.oeb.parse_utils import barename
from calibre.ebooks.mobi.writer8.skeleton import Chunker, aid_able_tags, to_href
from calibre.ebooks.mobi.writer8.index import (NCXIndex, SkelIndex,
ChunkIndex, GuideIndex, NonLinearNCXIndex)
from calibre.ebooks.mobi.writer8.mobi import KF8Book
from calibre.ebooks.mobi.writer8.tbs import apply_trailing_byte_sequences
from calibre.ebooks.mobi.writer8.toc import TOCAdder
from polyglot.builtins import iteritems
# Media types whose manifest data gets deep-copied by KF8Writer.dup_data()
XML_DOCS = OEB_DOCS | {SVG_MIME}
# References to record numbers in KF8 are stored as base-32 encoded integers,
# with 4 digits
to_ref = partial(to_base, base=32, min_num_digits=4)
class KF8Writer:
    def __init__(self, oeb, opts, resources):
        '''Run the full KF8 conversion pipeline over *oeb*.

        :param oeb: The OEB book being converted
        :param opts: Conversion options (reads dont_compress, expand_css,
            mobi_periodical, mobi_passthrough)
        :param resources: Pre-generated image/font resource records, with an
            item_map from normalized hrefs to record indices
        '''
        self.oeb, self.opts, self.log = oeb, opts, oeb.log
        self.compress = not self.opts.dont_compress
        self.has_tbs = False  # set for real by create_indices()
        self.log.info('Creating KF8 output')
        # Create an inline ToC if one does not already exist
        self.toc_adder = TOCAdder(oeb, opts)
        self.used_images = set()
        self.resources = resources
        self.flows = [None] # First flow item is reserved for the text
        self.records = [None] # Placeholder for zeroth record
        self.log('\tGenerating KF8 markup...')
        # All markup/CSS transforms below operate on private copies made by
        # dup_data(), so the shared OEB book stays usable for MOBI 6 output
        self.dup_data()
        self.cleanup_markup()
        self.replace_resource_links()
        self.extract_css_into_flows()
        self.extract_svg_into_flows()
        self.replace_internal_links_with_placeholders()
        self.insert_aid_attributes()
        self.chunk_it_up()
        # Dump the cloned data as it is no longer needed
        del self._data_cache
        self.create_text_records()
        self.log('\tCreating indices...')
        self.create_fdst_records()
        self.create_indices()
        self.create_guide()
        # We do not want to use this ToC for MOBI 6, so remove it
        self.toc_adder.remove_generated_toc()
def dup_data(self):
''' Duplicate data so that any changes we make to markup/CSS only
affect KF8 output and not MOBI 6 output '''
self._data_cache = {}
# Suppress css_parser logging output as it is duplicated anyway earlier
# in the pipeline
css_parser.log.setLevel(logging.CRITICAL)
for item in self.oeb.manifest:
if item.media_type in XML_DOCS:
self._data_cache[item.href] = copy.deepcopy(item.data)
elif item.media_type in OEB_STYLES:
# I can't figure out how to make an efficient copy of the
# in-memory CSSStylesheet, as deepcopy doesn't work (raises an
# exception)
self._data_cache[item.href] = css_parser.parseString(
item.data.cssText, validate=False)
def data(self, item):
return self._data_cache.get(item.href, item.data)
def cleanup_markup(self):
for item in self.oeb.spine:
root = self.data(item)
# Remove empty script tags as they are pointless
for tag in XPath('//h:script')(root):
if not tag.text and not tag.get('src', False):
tag.getparent().remove(tag)
# Remove [ac]id attributes as they are used by this code for anchor
# to offset mapping
for tag in XPath('//*[@aid or @cid]')(root):
tag.attrib.pop('aid', None), tag.attrib.pop('cid', None)
    def replace_resource_links(self):
        ''' Replace links to resources (raster images/fonts) with pointers to
        the MOBI record containing the resource. The pointers are of the form:
        kindle:embed:XXXX?mime=image/* The ?mime= is apparently optional and
        not used for fonts. '''
        def pointer(item, oref):
            # Map a reference relative to item to a kindle:embed URL;
            # returns oref unchanged when it is not a known resource
            ref = urlnormalize(item.abshref(oref))
            idx = self.resources.item_map.get(ref, None)
            if idx is not None:
                # Resource record indices are 1-based; font records start
                # with the b'FONT' magic, everything else is an image
                is_image = self.resources.records[idx-1][:4] not in {b'FONT'}
                idx = to_ref(idx)
                if is_image:
                    self.used_images.add(ref)
                    return 'kindle:embed:%s?mime=%s'%(idx,
                            self.resources.mime_map[ref])
                else:
                    return 'kindle:embed:%s'%idx
            return oref
        for item in self.oeb.manifest:
            if item.media_type in XML_DOCS:
                root = self.data(item)
                # src/href attributes on <img> and SVG <image> tags
                # (attribute match is namespace-agnostic)
                for tag in XPath('//h:img|//svg:image')(root):
                    for attr, ref in iteritems(tag.attrib):
                        if attr.split('}')[-1].lower() in {'src', 'href'}:
                            tag.attrib[attr] = pointer(item, ref)
                # url(...) references inside inline <style> blocks
                for tag in XPath('//h:style')(root):
                    if tag.text:
                        sheet = css_parser.parseString(tag.text, validate=False)
                        replacer = partial(pointer, item)
                        css_parser.replaceUrls(sheet, replacer,
                                ignoreImportRules=True)
                        repl = sheet.cssText
                        if isbytestring(repl):
                            repl = repl.decode('utf-8')
                        tag.text = '\n'+ repl + '\n'
            elif item.media_type in OEB_STYLES:
                # url(...) references in standalone stylesheets; the copy is
                # mutated in place
                sheet = self.data(item)
                replacer = partial(pointer, item)
                css_parser.replaceUrls(sheet, replacer, ignoreImportRules=True)
    def extract_css_into_flows(self):
        '''Move all CSS (standalone stylesheets and inline <style> blocks)
        into separate flows, rewriting <link> hrefs and @import rules to
        kindle:flow: URLs. Identical inline <style> blocks are de-duplicated
        into a single flow.'''
        inlines = defaultdict(list) # Ensure identical <style>s not repeated
        sheets = {}
        passthrough = getattr(self.opts, 'mobi_passthrough', False)
        for item in self.oeb.manifest:
            if item.media_type in OEB_STYLES:
                sheet = self.data(item)
                if not passthrough and not self.opts.expand_css and hasattr(item.data, 'cssText'):
                    condense_sheet(sheet)
                sheets[item.href] = len(self.flows)
                self.flows.append(sheet)
        def fix_import_rules(sheet):
            # NOTE: `item` is resolved by late binding from whichever
            # enclosing loop is active when this is called, so @import hrefs
            # are resolved relative to the document/stylesheet being
            # processed at that moment
            changed = False
            for rule in sheet.cssRules.rulesOfType(CSSRule.IMPORT_RULE):
                if rule.href:
                    href = item.abshref(rule.href)
                    idx = sheets.get(href, None)
                    if idx is not None:
                        idx = to_ref(idx)
                        rule.href = 'kindle:flow:%s?mime=text/css'%idx
                        changed = True
            return changed
        for item in self.oeb.spine:
            root = self.data(item)
            # Point <link> tags at the extracted stylesheet flows
            for link in XPath('//h:link[@href]')(root):
                href = item.abshref(link.get('href'))
                idx = sheets.get(href, None)
                if idx is not None:
                    idx = to_ref(idx)
                    link.set('href', 'kindle:flow:%s?mime=text/css'%idx)
            # Replace each non-empty <style> block with a placeholder <link>
            # whose href is filled in after de-duplication below
            for tag in XPath('//h:style')(root):
                p = tag.getparent()
                idx = p.index(tag)
                raw = tag.text
                if not raw or not raw.strip():
                    extract(tag)
                    continue
                sheet = css_parser.parseString(raw, validate=False)
                if fix_import_rules(sheet):
                    raw = force_unicode(sheet.cssText, 'utf-8')
                repl = etree.Element(XHTML('link'), type='text/css',
                        rel='stylesheet')
                repl.tail='\n'
                p.insert(idx, repl)
                extract(tag)
                inlines[raw].append(repl)
        # One flow per distinct inline style; point all of its placeholder
        # <link>s at that flow
        for raw, elems in iteritems(inlines):
            idx = to_ref(len(self.flows))
            self.flows.append(raw)
            for link in elems:
                link.set('href', 'kindle:flow:%s?mime=text/css'%idx)
        for item in self.oeb.manifest:
            if item.media_type in OEB_STYLES:
                sheet = self.data(item)
                if hasattr(sheet, 'cssRules'):
                    fix_import_rules(sheet)
        # Serialize any stylesheet objects still sitting in flows to text
        for i, sheet in enumerate(tuple(self.flows)):
            if hasattr(sheet, 'cssText'):
                self.flows[i] = force_unicode(sheet.cssText, 'utf-8')
def extract_svg_into_flows(self):
images = {}
for item in self.oeb.manifest:
if item.media_type == SVG_MIME:
data = self.data(item)
images[item.href] = len(self.flows)
self.flows.append(etree.tostring(data, encoding='UTF-8',
with_tail=True, xml_declaration=True))
for item in self.oeb.spine:
root = self.data(item)
for svg in XPath('//svg:svg')(root):
raw = etree.tostring(svg, encoding='unicode', with_tail=False)
idx = len(self.flows)
self.flows.append(raw)
p = svg.getparent()
pos = p.index(svg)
img = etree.Element(XHTML('img'),
src="kindle:flow:%s?mime=image/svg+xml"%to_ref(idx))
p.insert(pos, img)
extract(svg)
for img in XPath('//h:img[@src]')(root):
src = img.get('src')
abshref = item.abshref(src)
idx = images.get(abshref, None)
if idx is not None:
img.set('src', 'kindle:flow:%s?mime=image/svg+xml'%
to_ref(idx))
def replace_internal_links_with_placeholders(self):
self.link_map = {}
count = 0
hrefs = {item.href for item in self.oeb.spine}
for item in self.oeb.spine:
root = self.data(item)
for a in XPath('//h:a[@href]')(root):
count += 1
ref = item.abshref(a.get('href'))
href, _, frag = ref.partition('#')
try:
href = urlnormalize(href)
except ValueError:
# a non utf-8 quoted url? Since we cannot interpret it, pass it through.
pass
if href in hrefs:
placeholder = 'kindle:pos:fid:0000:off:%s'%to_href(count)
self.link_map[placeholder] = (href, frag)
a.set('href', placeholder)
    def insert_aid_attributes(self):
        '''Assign a unique aid attribute (or cid, inside tables) to every
        addressable tag, and build self.id_map from (spine href, anchor id)
        to that value for later link resolution. The empty anchor '' maps to
        the aid of the file's <body>.'''
        self.id_map = {}
        cid = 0
        for i, item in enumerate(self.oeb.spine):
            root = self.data(item)
            # Per-file base keeps aids unique across spine items
            aidbase = i * int(1e6)
            j = 0
            def in_table(elem):
                # True when elem has a <table> ancestor
                p = elem.getparent()
                if p is None:
                    return False
                if barename(p.tag).lower() == 'table':
                    return True
                return in_table(p)
            for tag in root.iterdescendants(etree.Element):
                id_ = tag.attrib.get('id', None)
                if id_ is None and tag.tag == XHTML('a'):
                    # Can happen during tweaking
                    id_ = tag.attrib.get('name', None)
                    if id_ is not None:
                        tag.attrib['id'] = id_
                tagname = barename(tag.tag).lower()
                if id_ is not None or tagname in aid_able_tags:
                    if tagname == 'table' or in_table(tag):
                        # The Kindle renderer barfs on large tables that have
                        # aid on any of their tags. See
                        # https://bugs.launchpad.net/bugs/1489495
                        if id_:
                            cid += 1
                            val = 'c%d' % cid
                            self.id_map[(item.href, id_)] = val
                            tag.set('cid', val)
                    else:
                        aid = to_base(aidbase + j, base=32)
                        tag.set('aid', aid)
                        if tag.tag == XHTML('body'):
                            self.id_map[(item.href, '')] = aid
                        if id_ is not None:
                            self.id_map[(item.href, id_)] = aid
                        j += 1
def chunk_it_up(self):
placeholder_map = {}
for placeholder, x in iteritems(self.link_map):
href, frag = x
aid = self.id_map.get(x, None)
if aid is None:
aid = self.id_map.get((href, ''))
placeholder_map[placeholder] = aid
chunker = Chunker(self.oeb, self.data, placeholder_map)
for x in ('skel_table', 'chunk_table', 'aid_offset_map'):
setattr(self, x, getattr(chunker, x))
self.flows[0] = chunker.text
def create_text_records(self):
self.flows = [x.encode('utf-8') if isinstance(x, str) else x for x
in self.flows]
text = b''.join(self.flows)
self.text_length = len(text)
text = BytesIO(text)
nrecords = 0
records_size = 0
self.uncompressed_record_lengths = []
if self.compress:
self.oeb.logger.info('\tCompressing markup...')
while text.tell() < self.text_length:
data, overlap = create_text_record(text)
self.uncompressed_record_lengths.append(len(data))
if self.compress:
data = compress_doc(data)
data += overlap
data += pack(b'>B', len(overlap))
self.records.append(data)
records_size += len(data)
nrecords += 1
self.last_text_record_idx = nrecords
self.first_non_text_record_idx = nrecords + 1
# Pad so that the next records starts at a 4 byte boundary
if records_size % 4 != 0:
self.records.append(b'\x00'*(records_size % 4))
self.first_non_text_record_idx += 1
def create_fdst_records(self):
FDST = namedtuple('Flow', 'start end')
entries = []
self.fdst_table = []
for i, flow in enumerate(self.flows):
start = 0 if i == 0 else self.fdst_table[-1].end
self.fdst_table.append(FDST(start, start + len(flow)))
entries.extend(self.fdst_table[-1])
rec = (b'FDST' + pack(b'>LL', 12, len(self.fdst_table)) +
pack(b'>%dL'%len(entries), *entries))
self.fdst_records = [rec]
self.fdst_count = len(self.fdst_table)
    def create_indices(self):
        '''Build the skeleton, chunk and NCX (ToC) index records. The ToC is
        flattened depth-first, resolved to (pos, fid) locations via id_map /
        aid_offset_map, linearized by sorting on (depth, offset), and given
        parent/child links and byte lengths before being handed to the index
        builder. Also applies trailing byte sequences to the text records.'''
        self.skel_records = SkelIndex(self.skel_table)()
        self.chunk_records = ChunkIndex(self.chunk_table)()
        self.ncx_records = []
        toc = self.oeb.toc
        entries = []
        is_periodical = self.opts.mobi_periodical
        if toc.count() < 1:
            self.log.warn('Document has no ToC, MOBI will have no NCX index')
            return
        # Flatten the ToC into a depth first list
        fl = toc.iterdescendants()
        for i, item in enumerate(fl):
            entry = {'id': id(item), 'index': i, 'label':(item.title or
                _('Unknown')), 'children':[]}
            entry['depth'] = getattr(item, 'ncx_hlvl', 0)
            p = getattr(item, 'ncx_parent', None)
            if p is not None:
                entry['parent_id'] = p
            # Tag children with depth/parent now; they are visited later in
            # the same depth-first iteration
            for child in item:
                child.ncx_parent = entry['id']
                child.ncx_hlvl = entry['depth'] + 1
                entry['children'].append(id(child))
            if is_periodical:
                if item.author:
                    entry['author'] = item.author
                if item.description:
                    entry['description'] = item.description
            entries.append(entry)
            href = item.href or ''
            href, frag = href.partition('#')[0::2]
            aid = self.id_map.get((href, frag), None)
            if aid is None:
                aid = self.id_map.get((href, ''), None)
            if aid is None:
                # Unresolvable target: point at the very first chunk
                pos, fid = 0, 0
                chunk = self.chunk_table[pos]
                offset = chunk.insert_pos + fid
            else:
                pos, fid, offset = self.aid_offset_map[aid]
            entry['pos_fid'] = (pos, fid)
            entry['offset'] = offset
        # The Kindle requires entries to be sorted by (depth, playorder)
        # However, I cannot figure out how to deal with non linear ToCs, i.e.
        # ToCs whose nth entry at depth d has an offset after its n+k entry at
        # the same depth, so we sort on (depth, offset) instead. This re-orders
        # the ToC to be linear. A non-linear ToC causes section to section
        # jumping to not work. kindlegen somehow handles non-linear tocs, but I
        # cannot figure out how.
        original = sorted(entries,
                key=lambda entry: (entry['depth'], entry['index']))
        linearized = sorted(entries,
                key=lambda entry: (entry['depth'], entry['offset']))
        is_non_linear = original != linearized
        entries = linearized
        # Deliberately discard the non-linear detection above: the entries
        # have been linearized, so the NonLinearNCXIndex path below is dead
        is_non_linear = False # False as we are using the linearized entries
        if is_non_linear:
            for entry in entries:
                entry['kind'] = 'chapter'
        # Re-number playorder to match the linearized order
        for i, entry in enumerate(entries):
            entry['index'] = i
        id_to_index = {entry['id']:entry['index'] for entry in entries}
        # Write the hierarchical information
        for entry in entries:
            children = entry.pop('children')
            if children:
                entry['first_child'] = id_to_index[children[0]]
                entry['last_child'] = id_to_index[children[-1]]
            if 'parent_id' in entry:
                entry['parent'] = id_to_index[entry.pop('parent_id')]
        # Write the lengths
        def get_next_start(entry):
            # An entry ends where the next entry at the same or shallower
            # depth begins, or at the end of the text
            enders = [e['offset'] for e in entries if e['depth'] <=
                    entry['depth'] and e['offset'] > entry['offset']]
            if enders:
                return min(enders)
            return len(self.flows[0])
        for entry in entries:
            entry['length'] = get_next_start(entry) - entry['offset']
        self.has_tbs = apply_trailing_byte_sequences(entries, self.records,
                self.uncompressed_record_lengths)
        idx_type = NonLinearNCXIndex if is_non_linear else NCXIndex
        self.ncx_records = idx_type(entries)()
def create_guide(self):
self.start_offset = None
self.guide_table = []
self.guide_records = []
GuideRef = namedtuple('GuideRef', 'title type pos_fid')
for ref in self.oeb.guide.values():
href, frag = ref.href.partition('#')[0::2]
aid = self.id_map.get((href, frag), None)
if aid is None:
aid = self.id_map.get((href, ''))
if aid is None:
continue
pos, fid, offset = self.aid_offset_map[aid]
if is_guide_ref_start(ref):
self.start_offset = offset
self.guide_table.append(GuideRef(ref.title or
_('Unknown'), ref.type, (pos, fid)))
if self.guide_table:
self.guide_table.sort(key=lambda x:x.type) # Needed by the Kindle
self.guide_records = GuideIndex(self.guide_table)()
def create_kf8_book(oeb, opts, resources, for_joint=False):
    '''Convenience entry point: run the KF8 writer over *oeb* and wrap the
    result in a KF8Book ready for serialization.'''
    kf8 = KF8Writer(oeb, opts, resources)
    return KF8Book(kf8, for_joint=for_joint)