#!/usr/bin/env python3
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import re
from collections import namedtuple
from functools import partial
from xml.sax.saxutils import escape
from lxml import etree
from calibre import my_unichr
from calibre.ebooks.oeb.base import XHTML_NS, extract
from calibre.ebooks.mobi.utils import to_base, PolyglotDict
from polyglot.builtins import iteritems, as_bytes
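# Maximum size, in bytes of serialized markup, of a single chunk of text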
CHUNK_SIZE = 8192
# References in links are stored with 10 digits
to_href = partial(to_base, base=32, min_num_digits=10)
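# For illustration (assuming to_base uses the digit set 0-9A-V):
#   to_href(100) == '0000000034'   since 100 == 3*32 + 4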
# Tags to which kindlegen adds the aid attribute
aid_able_tags = {'a', 'abbr', 'address', 'article', 'aside', 'audio', 'b',
'bdo', 'blockquote', 'body', 'button', 'cite', 'code', 'dd', 'del', 'details',
'dfn', 'div', 'dl', 'dt', 'em', 'fieldset', 'figcaption', 'figure', 'footer',
'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', 'hgroup', 'i', 'ins', 'kbd',
'label', 'legend', 'li', 'map', 'mark', 'meter', 'nav', 'ol', 'output', 'p',
'pre', 'progress', 'q', 'rp', 'rt', 'samp', 'section', 'select', 'small',
'span', 'strong', 'sub', 'summary', 'sup', 'textarea', 'time', 'ul', 'var',
'video'}
_self_closing_pat = re.compile(
br'<(?P<tag>%s)(?=[\s/])(?P<arg>[^>]*)/>'%('|'.join(aid_able_tags|{'script',
'style', 'title', 'head'})).encode('ascii'),
re.IGNORECASE)
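# Expand XML-style self-closed tags into open/close pairs, e.g.
#   close_self_closing_tags(b'<div class="x"/>') == b'<div class="x"></div>'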
def close_self_closing_tags(raw):
return _self_closing_pat.sub(br'<\g<tag>\g<arg>></\g<tag>>', raw)
def path_to_node(node):
ans = []
parent = node.getparent()
while parent is not None:
ans.append(parent.index(node))
node = parent
parent = parent.getparent()
return tuple(reversed(ans))
def node_from_path(root, path):
parent = root
for idx in path:
parent = parent[idx]
return parent
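# Together, these two helpers let a node found in one tree be located in a
# structurally identical copy: node_from_path(clone, path_to_node(elem))
# returns the element of `clone` at the same position `elem` occupies in its
# own tree (a path of (0, 1) means root[0][1]).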
def tostring(raw, **kwargs):
''' lxml *sometimes* represents non-ascii characters as hex entities in
attribute values. I can't figure out exactly what circumstances cause it.
It seems to happen when serializing a part of a larger tree. Since we need
serialization to be the same when serializing full and partial trees, we
manually replace all hex entities with their unicode codepoints. '''
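    # For example, lxml may serialize a non-breaking space in an attribute as
    # &#xA0;; the re.sub below turns it back into the literal character so
    # that full and partial serializations match byte for byte.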
xml_declaration = kwargs.pop('xml_declaration', False)
encoding = kwargs.pop('encoding', 'UTF-8')
kwargs['encoding'] = str
kwargs['xml_declaration'] = False
ans = etree.tostring(raw, **kwargs)
if xml_declaration:
ans = '<?xml version="1.0" encoding="%s"?>\n'%encoding + ans
return re.sub(r'&#x([0-9A-Fa-f]+);', lambda m:my_unichr(int(m.group(1), 16)),
ans).encode(encoding)
class Chunk:
def __init__(self, raw, selector):
self.raw = raw
self.starts_tags = []
self.ends_tags = []
self.insert_pos = None
self.is_first_chunk = False
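        # Rendered as an XPath-like reference into the file, e.g. the pair
        # ('P', '1A') becomes "P-//*[@aid='1A']"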
self.selector = "%s-//*[@aid='%s']"%selector
def __len__(self):
return len(self.raw)
def merge(self, chunk):
self.raw += chunk.raw
self.ends_tags = chunk.ends_tags
def __repr__(self):
return 'Chunk(len=%r insert_pos=%r starts_tags=%r ends_tags=%r)'%(
len(self.raw), self.insert_pos, self.starts_tags, self.ends_tags)
__str__ = __repr__
class Skeleton:
def __init__(self, file_number, item, root, chunks):
self.file_number, self.item = file_number, item
self.chunks = chunks
self.skeleton = self.render(root)
self.body_offset = self.skeleton.find(b'<body')
self.calculate_metrics(root)
self.calculate_insert_positions()
def render(self, root):
raw = tostring(root, xml_declaration=True)
raw = raw.replace(b'<html', ('<html xmlns="%s"'%XHTML_NS).encode('ascii'), 1)
raw = close_self_closing_tags(raw)
return raw
def calculate_metrics(self, root):
Metric = namedtuple('Metric', 'start end')
self.metrics = {}
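        # For each tag with an aid, measure how many bytes of the skeleton
        # its own markup occupies: start is the opening tag plus any text
        # left in the skeleton, end is the closing tag plus tail. For
        # b'<p aid="1"></p>\n' this gives Metric(start=11, end=5).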
for tag in root.xpath('//*[@aid]'):
text = (tag.text or '').encode('utf-8')
raw = close_self_closing_tags(tostring(tag, with_tail=True))
start_length = len(raw.partition(b'>')[0]) + len(text) + 1
end_length = len(raw.rpartition(b'<')[-1]) + 1
self.metrics[tag.get('aid')] = Metric(start_length, end_length)
def calculate_insert_positions(self):
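        # Find the offset in the skeleton at which each chunk's raw bytes
        # must be re-inserted: start at '<body', add the opening markup of
        # every tag that starts with this chunk, then step past the chunk
        # itself and the closing markup of any tags it ends.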
pos = self.body_offset
for chunk in self.chunks:
for tag in chunk.starts_tags:
pos += self.metrics[tag].start
chunk.insert_pos = pos
pos += len(chunk)
for tag in chunk.ends_tags:
pos += self.metrics[tag].end
def rebuild(self):
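        # insert_pos values already account for the lengths of all preceding
        # chunks (see calculate_insert_positions), so inserting in document
        # order reproduces the original serialization.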
ans = self.skeleton
for chunk in self.chunks:
i = chunk.insert_pos
ans = ans[:i] + chunk.raw + ans[i:]
return ans
def __len__(self):
return len(self.skeleton) + sum(len(x.raw) for x in self.chunks)
@property
def raw_text(self):
return b''.join([self.skeleton] + [x.raw for x in self.chunks])
class Chunker:
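    ''' Split each spine item into a skeleton (the markup left over after the
    contents of tags with aid attributes are pulled out) and a series of
    chunks of at most CHUNK_SIZE bytes, and build the SKEL and Chunk tables
    that record how to stitch them back together. '''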
def __init__(self, oeb, data_func, placeholder_map):
self.oeb, self.log = oeb, oeb.log
self.data = data_func
self.placeholder_map = placeholder_map
self.skeletons = []
# Set this to a list to enable dumping of the original and rebuilt
# html files for debugging
orig_dumps = None
for i, item in enumerate(self.oeb.spine):
root = self.remove_namespaces(self.data(item))
for child in root.xpath('//*[@aid]'):
child.set('aid', child.attrib.pop('aid')) # kindlegen always puts the aid last
body = root.xpath('//body')[0]
body.tail = '\n'
if orig_dumps is not None:
orig_dumps.append(tostring(root, xml_declaration=True,
with_tail=True))
orig_dumps[-1] = close_self_closing_tags(
orig_dumps[-1].replace(b'<html',
('<html xmlns="%s"'%XHTML_NS).encode('ascii'), 1))
# First pass: break up document into rendered strings of length no
# more than CHUNK_SIZE
chunks = []
self.step_into_tag(body, chunks)
# Second pass: Merge neighboring small chunks within the same
# skeleton tag so as to have chunks as close to the CHUNK_SIZE as
# possible.
chunks = self.merge_small_chunks(chunks)
# Third pass: Create the skeleton and calculate the insert position
# for all chunks
self.skeletons.append(Skeleton(i, item, root, chunks))
if orig_dumps:
self.dump(orig_dumps)
# Create the SKEL and Chunk tables
self.skel_table = []
self.chunk_table = []
self.create_tables()
# Set internal links
text = b''.join(x.raw_text for x in self.skeletons)
self.text = self.set_internal_links(text, b''.join(x.rebuild() for x in self.skeletons))
def remove_namespaces(self, root):
lang = None
for attr, val in iteritems(root.attrib):
if attr.rpartition('}')[-1] == 'lang':
lang = val
# Remove all namespace information from the tree. This means namespaced
# tags have their namespaces removed and all namespace declarations are
# removed. We have to do this manual cloning of the tree as there is no
# other way to remove namespace declarations in lxml. This is done so
# that serialization creates clean HTML 5 markup with no namespaces. We
# insert the XHTML namespace manually after serialization. The
# preceding layers should have removed svg and any other non html
# namespaced tags.
attrib = {'lang':lang} if lang else {}
if 'class' in root.attrib:
attrib['class'] = root.attrib['class']
if 'style' in root.attrib:
attrib['style'] = root.attrib['style']
nroot = etree.Element('html', attrib=attrib)
nroot.text = root.text
nroot.tail = '\n'
# Remove Comments and ProcessingInstructions as kindlegen seems to
# remove them as well
for tag in root.iterdescendants():
if tag.tag in {etree.Comment, etree.ProcessingInstruction}:
extract(tag)
for tag in root.iterdescendants():
if tag.tag == etree.Entity:
elem = etree.Entity(tag.name)
else:
tn = tag.tag
if tn is not None:
tn = tn.rpartition('}')[-1]
attrib = {k.rpartition('}')[-1]:v for k, v in iteritems(tag.attrib)}
try:
elem = nroot.makeelement(tn, attrib=attrib)
except ValueError:
attrib = {k:v for k, v in iteritems(attrib) if ':' not in k}
elem = nroot.makeelement(tn, attrib=attrib)
elem.text = tag.text
elem.tail = tag.tail
parent = node_from_path(nroot, path_to_node(tag.getparent()))
parent.append(elem)
return nroot
def step_into_tag(self, tag, chunks):
aid = tag.get('aid')
self.chunk_selector = ('P', aid)
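        # Chunks created while we are inside this tag use the parent ('P')
        # form of the selector; when we step back out (the last line of this
        # method) it switches to the sibling ('S') form for content that
        # follows the tag.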
first_chunk_idx = len(chunks)
# First handle any text
if tag.text and tag.text.strip(): # Leave pure whitespace in the skel
chunks.extend(self.chunk_up_text(tag.text))
tag.text = None
# Now loop over children
for child in list(tag):
raw = tostring(child, with_tail=False)
if child.tag == etree.Entity:
                chunks.append(Chunk(raw, self.chunk_selector))  # wrap in a Chunk so later passes can handle it uniformly
if child.tail:
chunks.extend(self.chunk_up_text(child.tail))
continue
raw = close_self_closing_tags(raw)
if len(raw) > CHUNK_SIZE and child.get('aid', None):
self.step_into_tag(child, chunks)
if child.tail and child.tail.strip(): # Leave pure whitespace
chunks.extend(self.chunk_up_text(child.tail))
child.tail = None
else:
if len(raw) > CHUNK_SIZE:
                    self.log.warn('Tag %s has no aid and is too large for a'
                        ' single chunk. Adding it anyway.'%child.tag)
chunks.append(Chunk(raw, self.chunk_selector))
if child.tail:
chunks.extend(self.chunk_up_text(child.tail))
tag.remove(child)
if len(chunks) <= first_chunk_idx and chunks:
raise ValueError('Stepped into a tag that generated no chunks.')
# Mark the first and last chunks of this tag
if chunks:
chunks[first_chunk_idx].starts_tags.append(aid)
chunks[-1].ends_tags.append(aid)
my_chunks = chunks[first_chunk_idx:]
if my_chunks:
my_chunks[0].is_first_chunk = True
self.chunk_selector = ('S', aid)
def chunk_up_text(self, text):
text = escape(text)
text = text.encode('utf-8')
ans = []
def split_multibyte_text(raw):
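            # Split raw at CHUNK_SIZE without cutting a UTF-8 multi-byte
            # sequence in half: decoding with errors ignored and re-encoding
            # drops any trailing partial character from the prefix, leaving
            # it at the front of the remainder.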
if len(raw) <= CHUNK_SIZE:
return raw, b''
l = raw[:CHUNK_SIZE]
l = l.decode('utf-8', 'ignore').encode('utf-8')
return l, raw[len(l):]
start, rest = split_multibyte_text(text)
ans.append(start)
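        # Continuation pieces of an over-long text run get wrapped in a
        # span below; the AmznBigTextBlock class appears to mirror what
        # kindlegen produces for such runs.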
while rest:
start, rest = split_multibyte_text(rest)
ans.append(b'<span class="AmznBigTextBlock">' + start + b'</span>')
return [Chunk(x, self.chunk_selector) for x in ans]
def merge_small_chunks(self, chunks):
ans = chunks[:1]
for chunk in chunks[1:]:
prev = ans[-1]
if (
chunk.starts_tags or # Starts a tag in the skel
len(chunk) + len(prev) > CHUNK_SIZE or # Too large
prev.ends_tags # Prev chunk ended a tag
):
ans.append(chunk)
else:
prev.merge(chunk)
return ans
def create_tables(self):
Skel = namedtuple('Skel',
'file_number name chunk_count start_pos length')
sp = 0
for s in self.skeletons:
s.start_pos = sp
sp += len(s)
self.skel_table = [Skel(s.file_number, 'SKEL%010d'%s.file_number,
len(s.chunks), s.start_pos, len(s.skeleton)) for s in self.skeletons]
Chunk = namedtuple('Chunk',
'insert_pos selector file_number sequence_number start_pos length')
num = 0
for skel in self.skeletons:
cp = 0
for chunk in skel.chunks:
self.chunk_table.append(
Chunk(chunk.insert_pos + skel.start_pos, chunk.selector,
skel.file_number, num, cp, len(chunk.raw)))
cp += len(chunk.raw)
num += 1
def set_internal_links(self, text, rebuilt_text):
''' Update the internal link placeholders to point to the correct
location, based on the chunk table.'''
# A kindle:pos:fid:off link contains two base 32 numbers of the form
# XXXX:YYYYYYYYYY
# The first number is an index into the chunk table and the second is
# an offset from the start of the chunk to the start of the tag pointed
# to by the link.
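        # e.g. kindle:pos:fid:0004:off:0000000034 points 100 bytes (base-32
        # '34', assuming the 0-9A-V digit set) past the start of chunk 4.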
aid_map = PolyglotDict() # Map of aid to (fid, offset_from_start_of_chunk, offset_from_start_of_text)
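        # Find the byte offset of every tag carrying an aid (or cid)
        # attribute in the rebuilt text and map it to the chunk that
        # contains that offset.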
for match in re.finditer(br'<[^>]+? [ac]id=[\'"]([cA-Z0-9]+)[\'"]', rebuilt_text):
offset = match.start()
pos_fid = None
for chunk in self.chunk_table:
if chunk.insert_pos <= offset < chunk.insert_pos + chunk.length:
pos_fid = (chunk.sequence_number, offset-chunk.insert_pos,
offset)
break
if chunk.insert_pos > offset:
# This aid is in the skeleton, not in a chunk, so we use
# the chunk immediately after
pos_fid = (chunk.sequence_number, 0, offset)
break
if chunk is self.chunk_table[-1]:
# This can happen for aids very close to the end of the
# text (https://bugs.launchpad.net/bugs/1011330)
pos_fid = (chunk.sequence_number, offset-chunk.insert_pos,
offset)
if pos_fid is None:
raise ValueError('Could not find chunk for aid: %r'%
match.group(1))
aid_map[match.group(1)] = pos_fid
self.aid_offset_map = aid_map
def to_placeholder(aid):
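            # e.g. an aid mapped to (3, 10, ...) yields b'0003:off:000000000A'
            # (again assuming the 0-9A-V digit set)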
pos, fid, _ = aid_map[aid]
pos, fid = to_base(pos, min_num_digits=4), to_href(fid)
return ':off:'.join((pos, fid)).encode('utf-8')
placeholder_map = {as_bytes(k):to_placeholder(v) for k, v in
iteritems(self.placeholder_map)}
# Now update the links
def sub(match):
raw = match.group()
pl = match.group(1)
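            # The trailing '0000:off:XXXXXXXXXX' of the match is exactly 19
            # bytes (4 + len(':off:') + 10), so strip it and append the real
            # XXXX:off:YYYYYYYYYY value computed above.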
try:
return raw[:-19] + placeholder_map[pl]
except KeyError:
pass
return raw
return re.sub(br'<[^>]+(kindle:pos:fid:0000:off:[0-9A-Za-z]{10})', sub,
text)
def dump(self, orig_dumps):
import tempfile, shutil, os
tdir = os.path.join(tempfile.gettempdir(), 'skeleton')
self.log('Skeletons dumped to:', tdir)
if os.path.exists(tdir):
shutil.rmtree(tdir)
orig = os.path.join(tdir, 'orig')
rebuilt = os.path.join(tdir, 'rebuilt')
chunks = os.path.join(tdir, 'chunks')
for x in (orig, rebuilt, chunks):
os.makedirs(x)
error = False
for i, skeleton in enumerate(self.skeletons):
for j, chunk in enumerate(skeleton.chunks):
with open(os.path.join(chunks, 'file-%d-chunk-%d.html'%(i, j)),
'wb') as f:
f.write(chunk.raw)
oraw, rraw = orig_dumps[i], skeleton.rebuild()
with open(os.path.join(orig, '%04d.html'%i), 'wb') as f:
f.write(oraw)
with open(os.path.join(rebuilt, '%04d.html'%i), 'wb') as f:
f.write(rraw)
if oraw != rraw:
error = True
if error:
raise ValueError('The before and after HTML differs. Run a diff '
'tool on the orig and rebuilt directories')
else:
self.log('Skeleton HTML before and after is identical.')