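# NOTE(review): the next three lines appear to be residue from a web file
# viewer rather than part of the module; kept here as comments.
#   %PDF- %PDF-
#   | Direktori : /lib/calibre/calibre/ebooks/mobi/ |
#   | Current File : //lib/calibre/calibre/ebooks/mobi/mobiml.py |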
'''
Transform XHTML/OPS-ish content into Mobipocket HTML 3.2.
'''
__license__ = 'GPL v3'
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
import copy
import numbers
import re
from contextlib import suppress
from lxml import etree
from calibre.ebooks.mobi.utils import convert_color_for_font_tag
from calibre.ebooks.oeb.base import (
XHTML, XHTML_NS, barename, namespace, urlnormalize
)
from calibre.ebooks.oeb.stylizer import Stylizer
from calibre.ebooks.oeb.transforms.flatcss import KeyMapper
from calibre.utils.imghdr import identify
from polyglot.builtins import string_or_bytes
# Namespace URI used for Mobipocket-specific elements such as <mbp:pagebreak>.
MBP_NS = 'http://mobipocket.com/ns/mbp'


def MBP(name):
    '''Return *name* qualified with the Mobipocket namespace as ``{uri}name``.'''
    return '{{{}}}{}'.format(MBP_NS, name)
# Namespace map passed to etree.Element() for every generated <html> root:
# XHTML is the default (prefix-less) namespace, Mobipocket tags use "mbp".
MOBI_NSMAP = {
    None: XHTML_NS,
    'mbp': MBP_NS,
}
# Tags considered inline when deciding whether whitespace-only text in front
# of an element's first child may be dropped.
INLINE_TAGS = {
    'span', 'a', 'code', 'u', 's', 'big', 'strike', 'tt', 'font', 'q', 'i',
    'b', 'em', 'strong', 'sup', 'sub',
}
HEADER_TAGS = {'h1', 'h2', 'h3', 'h4', 'h5', 'h6'}
# GR: Added 'caption' to both sets
# Tags that are reproduced as real, nestable elements in the output.
NESTABLE_TAGS = {'ol', 'ul', 'li', 'table', 'tr', 'td', 'th', 'caption'}
TABLE_TAGS = {'table', 'tr', 'td', 'th', 'caption'}
# Tags that may be emitted without opening a new paragraph when they carry
# no text.
SPECIAL_TAGS = {'hr', 'br'}
# Tags whose element itself (not just its text) is copied to the output.
CONTENT_TAGS = {'img', 'hr', 'br'}
# Tags that are never wrapped in <sup>/<sub> for a vertical-align style.
NOT_VTAGS = (
    HEADER_TAGS | NESTABLE_TAGS | TABLE_TAGS | SPECIAL_TAGS | CONTENT_TAGS
)
# Tags that get an id anchor inserted before them instead of appended inside.
LEAF_TAGS = {
    'base', 'basefont', 'frame', 'link', 'meta', 'area', 'br', 'col', 'hr',
    'img', 'input', 'param',
}
# page-break-before / page-break-after values that force a page break.
PAGE_BREAKS = {'always', 'left', 'right'}
# Runs of ASCII whitespace; used to collapse them to a single space.
COLLAPSE = re.compile(r'[ \t\r\n\v]+')
def asfloat(value):
    '''Return *value* as a float, or 0.0 when it is not a number.

    Anything that is not a ``numbers.Number`` instance (keywords such as
    'auto', None, ...) yields 0.0.
    '''
    return float(value) if isinstance(value, numbers.Number) else 0.0
def convert_margin(style, which):
    '''Return the numeric value of the margin property *which* from *style*.

    The computed value ``style[which]`` is used (0.0 when it is not a
    number). If the raw declaration is a percentage string, the value is
    additionally capped at the result of converting that percentage against
    a base of 600, because percentage values come out too large when the
    user uses a non kindle output profile like the tablet profile.
    '''
    margin = asfloat(style[which])
    declared = style._get(which)
    if not (isinstance(declared, str) and '%' in declared):
        return margin
    try:
        margin = min(style._unit_convert(declared, base=600), margin)
    except TypeError:
        # The conversion result could not be compared; keep the computed value
        pass
    return margin
def isspace(text):
    '''Return True if *text* is empty or consists only of collapsible whitespace.

    A non-breaking space (U+00A0) counts as content, even though
    ``str.isspace`` treats it as whitespace.
    '''
    if not text:
        return True
    return '\xa0' not in text and text.isspace()
class BlockState:
    '''Mutable block-level state for the output tree being built.'''

    def __init__(self, body):
        # Output <body> element that receives the generated markup
        self.body = body
        # Stack of currently open nestable output elements (lists, tables...)
        self.nested = []
        # Current paragraph-level element, innermost element text is being
        # appended to, and the <a> element of the link currently open
        self.para = self.inline = self.anchor = None
        # Vertical padding / margin accumulated but not yet emitted
        self.vpadding = self.vmargin = 0.0
        # True when a page break must be emitted before the next paragraph
        self.pbreak = False
        # FormatState the current inline wrappers were created for
        self.istate = None
        # True once any content has been emitted
        self.content = False
class FormatState:
    '''Inline formatting state inherited down the element tree.

    Two states compare equal when they would produce the same inline
    wrapper tags; layout-only attributes (``left``, ``indent``, ``halign``,
    ``ids``, ``attrib``, ``rendered``, ``list_num``) are not compared.
    '''

    # Attributes that take part in equality, in comparison order
    _COMPARED = (
        'fsize', 'italic', 'bold', 'href', 'preserve', 'pre_wrap', 'family',
        'bgcolor', 'fgcolor', 'strikethrough', 'underline',
    )

    def __init__(self):
        # True once the element owning this state has been emitted
        self.rendered = False
        # Accumulated left margin + padding
        self.left = 0.
        # Horizontal alignment ('auto' means no align attribute is written)
        self.halign = 'auto'
        # Text indent of the enclosing block
        self.indent = 0.
        # Mobipocket font size; 3 is the size that needs no <font> tag
        self.fsize = 3
        # ids/names waiting to be emitted as anchors
        self.ids = set()
        self.italic = False
        self.bold = False
        self.strikethrough = False
        self.underline = False
        # white-space: pre / pre-wrap
        self.preserve = False
        self.pre_wrap = False
        self.family = 'serif'
        self.bgcolor = 'transparent'
        self.fgcolor = 'black'
        # Target of the enclosing link, if any
        self.href = None
        # Counter used to number <li> children
        self.list_num = 0
        # Attributes to copy onto the emitted element
        self.attrib = {}

    def __eq__(self, other):
        '''Compare the formatting-relevant attributes of two states.

        Returns NotImplemented for objects that are not FormatState
        instances, so ``state == None`` evaluates to False instead of
        raising AttributeError. ``!=`` is derived from this method by
        Python.
        '''
        if not isinstance(other, FormatState):
            return NotImplemented
        return all(
            getattr(self, name) == getattr(other, name)
            for name in self._COMPARED
        )

    # Instances are mutable and compare by value, so they stay unhashable
    # (this is also what defining __eq__ alone implies).
    __hash__ = None
class MobiMLizer:
    '''Convert the XHTML spine documents of an OEB book to Mobipocket markup.

    The instance is callable: calling it with a book and a conversion
    context replaces ``item.data`` of every spine item with a newly built
    ``<html><body>`` tree.
    '''

    def __init__(self, ignore_tables=False):
        '''
        :param ignore_tables: when true, table tags are emitted as
            ``span`` (for ``td``) or ``div`` (everything else) instead of
            table markup.
        '''
        self.ignore_tables = ignore_tables

    def __call__(self, oeb, context):
        '''Convert all spine items of *oeb* in place.

        *context* is kept as ``self.opts`` and its ``dest`` attribute is
        used as the output profile.
        '''
        oeb.logger.info('Converting XHTML to Mobipocket markup...')
        self.oeb = oeb
        self.log = self.oeb.logger
        self.opts = context
        self.profile = profile = context.dest
        # Inverse of profile.fnums, so that mobimlize_font() can look up the
        # key of profile.fnums for a mapped font size
        self.fnums = fnums = {v: k for k, v in profile.fnums.items()}
        self.fmap = KeyMapper(profile.fbase, profile.fbase, fnums.keys())
        self.mobimlize_spine()

    def mobimlize_spine(self):
        'Iterate over the spine and convert it to MOBIML'
        for item in self.oeb.spine:
            stylizer = Stylizer(item.data, item.href, self.oeb, self.opts, self.profile)
            body = item.data.find(XHTML('body'))
            nroot = etree.Element(XHTML('html'), nsmap=MOBI_NSMAP)
            nbody = etree.SubElement(nroot, XHTML('body'))
            self.current_spine_item = item
            self.mobimlize_elem(body, stylizer, BlockState(nbody),
                                [FormatState()])
            item.data = nroot

    def mobimlize_font(self, ptsize):
        '''Return the ``self.fnums`` entry for *ptsize* after mapping it
        through ``self.fmap``.'''
        return self.fnums[self.fmap[ptsize]]

    def mobimlize_measure(self, ptsize):
        '''Format *ptsize* as a length string.

        Strings are returned unchanged. Numbers that round to less than the
        profile's ``fbase`` are given in whole points (``"5pt"``); anything
        else is given in whole multiples of ``fbase`` (``"2em"``).
        '''
        if isinstance(ptsize, string_or_bytes):
            return ptsize
        embase = self.profile.fbase
        if round(ptsize) < embase:
            return "%dpt" % int(round(ptsize))
        return "%dem" % int(round(ptsize / embase))

    def preize_text(self, text, pre_wrap=False):
        '''Split whitespace-preserved *text* into strings and ``<br>`` elements.

        With *pre_wrap*, every run of n >= 2 spaces becomes n-1 non-breaking
        spaces followed by one space; otherwise every space becomes a
        non-breaking space. Line endings are normalised to ``\\n`` and each
        of them becomes a ``<br>`` element in the returned list; empty lines
        after the first contribute only their ``<br>``.
        '''
        text = str(text)
        if pre_wrap:
            # Replace n consecutive spaces with n-1 NBSP + space
            text = re.sub(r' {2,}', lambda m:('\xa0'*(len(m.group())-1) + ' '), text)
        else:
            text = text.replace(' ', '\xa0')

        text = text.replace('\r\n', '\n')
        text = text.replace('\r', '\n')
        lines = text.split('\n')
        result = lines[:1]
        for line in lines[1:]:
            result.append(etree.Element(XHTML('br')))
            if line:
                result.append(line)
        return result

    def mobimlize_content(self, tag, text, bstate, istates):
        'Convert text content'
        if text or tag != 'br':
            bstate.content = True
        istate = istates[-1]
        para = bstate.para
        if tag in SPECIAL_TAGS and not text:
            # A bare <hr>/<br> goes into the current paragraph, or straight
            # into the body when no paragraph is open
            para = para if para is not None else bstate.body
        elif para is None or tag in ('td', 'th'):
            # Open a new paragraph-level element
            body = bstate.body
            if bstate.pbreak:
                etree.SubElement(body, MBP('pagebreak'))
                bstate.pbreak = False
            bstate.istate = None
            bstate.anchor = None
            parent = bstate.nested[-1] if bstate.nested else bstate.body
            indent = istate.indent
            left = istate.left
            if isinstance(indent, string_or_bytes):
                indent = 0
            if indent < 0 and abs(indent) < left:
                # A hanging indent smaller than the left margin is folded
                # into the margin
                left += indent
                indent = 0
            elif indent != 0 and abs(indent) < self.profile.fbase:
                # Indents smaller than fbase are enlarged to fbase, keeping
                # their sign
                indent = (indent / abs(indent)) * self.profile.fbase
            if tag in NESTABLE_TAGS and not istate.rendered:
                para = wrapper = etree.SubElement(
                    parent, XHTML(tag), attrib=istate.attrib)
                bstate.nested.append(para)
                if tag == 'li' and len(istates) > 1:
                    # The item counter lives in the state of the enclosing
                    # list element
                    istates[-2].list_num += 1
                    para.attrib['value'] = str(istates[-2].list_num)
            elif tag in NESTABLE_TAGS and istate.rendered:
                para = wrapper = bstate.nested[-1]
            elif not self.opts.mobi_ignore_margins and left > 0 and indent >= 0:
                # Left margins are expressed as nested <blockquote> elements
                ems = self.profile.mobi_ems_per_blockquote
                para = wrapper = etree.SubElement(parent, XHTML('blockquote'))
                emleft = int(round(left / self.profile.fbase)) - ems
                emleft = min((emleft, 10))
                while emleft > ems / 2:
                    para = etree.SubElement(para, XHTML('blockquote'))
                    emleft -= ems
            else:
                para = wrapper = etree.SubElement(parent, XHTML('p'))
            bstate.inline = bstate.para = para
            vspace = bstate.vpadding + bstate.vmargin
            bstate.vpadding = bstate.vmargin = 0
            if tag not in TABLE_TAGS:
                if tag in ('ul', 'ol') and vspace > 0:
                    wrapper.addprevious(etree.Element(XHTML('div'),
                        height=self.mobimlize_measure(vspace)))
                else:
                    wrapper.attrib['height'] = self.mobimlize_measure(vspace)
                para.attrib['width'] = self.mobimlize_measure(indent)
            elif tag == 'table' and vspace > 0:
                # Vertical space before a table becomes a run of <br>
                # elements, one per fbase of space
                vspace = int(round(vspace / self.profile.fbase))
                while vspace > 0:
                    wrapper.addprevious(etree.Element(XHTML('br')))
                    vspace -= 1
            if istate.halign != 'auto' and isinstance(istate.halign, (bytes, str)):
                if isinstance(istate.halign, bytes):
                    istate.halign = istate.halign.decode('utf-8')
                para.attrib['align'] = istate.halign
        istate.rendered = True
        pstate = bstate.istate
        if tag in CONTENT_TAGS:
            bstate.inline = para
            pstate = bstate.istate = None
            try:
                etree.SubElement(para, XHTML(tag), attrib=istate.attrib)
            except Exception:
                # Report what could not be created, then propagate
                print('Invalid subelement:', para, tag, istate.attrib)
                raise
        elif tag in TABLE_TAGS:
            para.attrib['valign'] = 'top'
        if istate.ids:
            for id_ in istate.ids:
                anchor = etree.Element(XHTML('a'), attrib={'id': id_})
                if tag == 'li':
                    try:
                        last = bstate.body[-1][-1]
                    except IndexError:
                        # Nothing to attach the anchors to
                        break
                    last.insert(0, anchor)
                    anchor.tail = last.text
                    last.text = None
                else:
                    last = bstate.body[-1]
                    # We use append instead of addprevious so that inline
                    # anchors in large blocks point to the correct place. See
                    # https://bugs.launchpad.net/calibre/+bug/899831
                    # This could potentially break if inserting an anchor at
                    # this point in the markup is illegal, but I cannot think
                    # of such a case offhand.
                    if barename(last.tag) in LEAF_TAGS:
                        last.addprevious(anchor)
                    else:
                        last.append(anchor)

            istate.ids.clear()
        if not text:
            return
        if not pstate or istate != pstate:
            # Formatting changed since the last text run: build a new chain
            # of inline wrapper elements inside the paragraph
            inline = para
            fsize = istate.fsize
            href = istate.href
            if not href:
                bstate.anchor = None
            elif pstate and pstate.href == href:
                # Still inside the same link, keep using its <a> element
                inline = bstate.anchor
            else:
                inline = etree.SubElement(inline, XHTML('a'), href=href)
                bstate.anchor = inline

            if fsize != 3:
                inline = etree.SubElement(inline, XHTML('font'),
                                          size=str(fsize))
            if istate.family == 'monospace':
                inline = etree.SubElement(inline, XHTML('tt'))
            if istate.italic:
                inline = etree.SubElement(inline, XHTML('i'))
            if istate.bold:
                inline = etree.SubElement(inline, XHTML('b'))
            if istate.bgcolor is not None and istate.bgcolor != 'transparent':
                inline = etree.SubElement(inline, XHTML('span'),
                        bgcolor=convert_color_for_font_tag(istate.bgcolor))
            if istate.fgcolor != 'black':
                inline = etree.SubElement(inline, XHTML('font'),
                        color=convert_color_for_font_tag(istate.fgcolor))
            if istate.strikethrough:
                inline = etree.SubElement(inline, XHTML('s'))
            if istate.underline:
                inline = etree.SubElement(inline, XHTML('u'))
            bstate.inline = inline
        bstate.istate = istate
        inline = bstate.inline
        content = self.preize_text(text, pre_wrap=istate.pre_wrap) if istate.preserve or istate.pre_wrap else [text]
        for item in content:
            if isinstance(item, string_or_bytes):
                # Text goes after the last child if there is one, otherwise
                # it becomes (part of) the element's own text
                if len(inline) == 0:
                    inline.text = (inline.text or '') + item
                else:
                    last = inline[-1]
                    last.tail = (last.tail or '') + item
            else:
                inline.append(item)

    def mobimlize_elem(self, elem, stylizer, bstate, istates,
            ignore_valign=False):
        '''Convert *elem* and its subtree, appending the result to *bstate*.

        A copy of ``istates[-1]`` is pushed onto *istates* for the duration
        of the call, updated from the element's computed style, and popped
        again before returning. Elements that are not in the XHTML
        namespace are skipped, as are hidden elements without an id.

        :param ignore_valign: set on the recursive call used to render an
            element with a non-baseline vertical-align, so that the
            ``<sup>``/``<sub>`` wrapping is not applied a second time.
        '''
        if not isinstance(elem.tag, string_or_bytes) \
           or namespace(elem.tag) != XHTML_NS:
            return
        style = stylizer.style(elem)
        # <mbp:frame-set/> does not exist lalalala
        if ((style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') or style['visibility'] == 'hidden') and
                elem.get('data-calibre-jacket-searchable-tags', None) != '1'):
            id_ = elem.get('id', None)
            if id_:
                # Keep anchors so people can use display:none
                # to generate hidden TOCs
                tail = elem.tail
                elem.clear()
                elem.text = None
                elem.set('id', id_)
                elem.tail = tail
                elem.tag = XHTML('a')
            else:
                return
        tag = barename(elem.tag)
        # Shallow copy: the ids set and the attrib dict are shared with the
        # parent state
        istate = copy.copy(istates[-1])
        istate.rendered = False
        istate.list_num = 0
        if tag == 'ol' and 'start' in elem.attrib:
            try:
                istate.list_num = int(elem.attrib['start'])-1
            except ValueError:
                # Non-numeric start attribute: number from 1
                pass
        istates.append(istate)
        left = 0
        display = style['display']
        if display == 'table-cell':
            display = 'inline'
        elif display.startswith('table'):
            display = 'block'
        isblock = (not display.startswith('inline') and style['display'] !=
                'none')
        isblock = isblock and style['float'] == 'none'
        isblock = isblock and tag != 'br'
        if isblock:
            bstate.para = None
            istate.halign = style['text-align']
            rawti = style._get('text-indent')
            try:
                istate.indent = style['text-indent']
            except Exception:
                istate.indent = 0
            if hasattr(rawti, 'strip') and '%' in rawti:
                # We have a percentage text indent, these can come out looking
                # too large if the user chooses a wide output profile like
                # tablet
                istate.indent = min(style._unit_convert(rawti, base=500), istate.indent)
            if style['margin-left'] == 'auto' \
               and style['margin-right'] == 'auto':
                istate.halign = 'center'
            margin = convert_margin(style, 'margin-left')
            padding = asfloat(style['padding-left'])
            if tag != 'body':
                left = margin + padding
            istate.left += left
            vmargin = convert_margin(style, 'margin-top')
            bstate.vmargin = max((bstate.vmargin, vmargin))
            vpadding = asfloat(style['padding-top'])
            if vpadding > 0:
                # Padding stops margins from collapsing: the pending margin
                # is converted into padding
                bstate.vpadding += bstate.vmargin
                bstate.vmargin = 0
                bstate.vpadding += vpadding
        elif not istate.href:
            # Horizontal margin/padding of non-block elements outside links
            # is approximated with non-breaking spaces
            margin = convert_margin(style, 'margin-left')
            padding = asfloat(style['padding-left'])
            lspace = margin + padding
            if lspace > 0:
                spaces = int(round((lspace * 3) / style['font-size']))
                elem.text = ('\xa0' * spaces) + (elem.text or '')
            margin = convert_margin(style, 'margin-right')
            padding = asfloat(style['padding-right'])
            rspace = margin + padding
            if rspace > 0:
                spaces = int(round((rspace * 3) / style['font-size']))
                if len(elem) == 0:
                    elem.text = (elem.text or '') + ('\xa0' * spaces)
                else:
                    last = elem[-1]
                    last.text = (last.text or '') + ('\xa0' * spaces)
        if bstate.content and style['page-break-before'] in PAGE_BREAKS:
            bstate.pbreak = True
        istate.fsize = self.mobimlize_font(style['font-size'])
        istate.italic = style['font-style'] == 'italic'
        weight = style['font-weight']
        istate.bold = weight in ('bold', 'bolder') or asfloat(weight) > 400
        istate.preserve = style['white-space'] == 'pre'
        istate.pre_wrap = style['white-space'] == 'pre-wrap'
        istate.bgcolor = style['background-color']
        istate.fgcolor = style['color']
        istate.strikethrough = style.effective_text_decoration == 'line-through'
        istate.underline = style.effective_text_decoration == 'underline'
        ff = style['font-family'].lower() if hasattr(style['font-family'], 'lower') else ''
        if 'monospace' in ff or 'courier' in ff or ff.endswith(' mono'):
            istate.family = 'monospace'
        elif ('sans-serif' in ff or 'sansserif' in ff or 'verdana' in ff or
              'arial' in ff or 'helvetica' in ff):
            istate.family = 'sans-serif'
        else:
            istate.family = 'serif'
        if 'id' in elem.attrib:
            istate.ids.add(elem.attrib['id'])
        if 'name' in elem.attrib:
            istate.ids.add(elem.attrib['name'])
        if tag == 'a' and 'href' in elem.attrib:
            istate.href = elem.attrib['href']
        istate.attrib.clear()
        if tag == 'img' and 'src' in elem.attrib:
            istate.attrib['src'] = elem.attrib['src']
            istate.attrib['align'] = 'baseline'
            cssdict = style.cssdict()
            valign = cssdict.get('vertical-align', None)
            if valign in ('top', 'bottom', 'middle'):
                istate.attrib['align'] = valign
            for prop in ('width', 'height'):
                if cssdict[prop] != 'auto':
                    value = style[prop]
                    if value == getattr(self.profile, prop):
                        result = '100%'
                    else:
                        # Amazon's renderer does not support
                        # img sizes in units other than px
                        # See #7520 for test case
                        try:
                            pixs = int(round(float(value) /
                                (72/self.profile.dpi)))
                        except (TypeError, ValueError, ArithmeticError):
                            # Not convertible to pixels: leave the
                            # attribute unset
                            continue
                        result = str(pixs)
                    istate.attrib[prop] = result
            if 'width' not in istate.attrib or 'height' not in istate.attrib:
                # Fill in the missing dimension(s) from the image data
                href = self.current_spine_item.abshref(elem.attrib['src'])
                try:
                    item = self.oeb.manifest.hrefs[urlnormalize(href)]
                except Exception:
                    self.oeb.logger.warn('Failed to find image:',
                            href)
                else:
                    try:
                        width, height = identify(item.data)[1:]
                    except Exception:
                        self.oeb.logger.warn('Invalid image:', href)
                    else:
                        if 'width' not in istate.attrib and 'height' not in \
                                    istate.attrib:
                            istate.attrib['width'] = str(width)
                            istate.attrib['height'] = str(height)
                        else:
                            # Only one dimension was specified: derive the
                            # other one from the image's aspect ratio
                            ar = width / height
                            if 'width' not in istate.attrib:
                                try:
                                    width = int(istate.attrib['height'])*ar
                                except ValueError:
                                    # e.g. '100%': use the image's own width
                                    pass
                                istate.attrib['width'] = str(int(width))
                            else:
                                try:
                                    height = int(istate.attrib['width'])/ar
                                except (ValueError, ZeroDivisionError):
                                    # e.g. '100%': use the image's own height
                                    pass
                                istate.attrib['height'] = str(int(height))
                    item.unload_data_from_memory()
        elif tag == 'hr' and asfloat(style['width']) > 0 and style._get('width') not in {'100%', 'auto'}:
            raww = style._get('width')
            if hasattr(raww, 'strip') and '%' in raww:
                istate.attrib['width'] = raww
            else:
                # Express the width as a percentage of the profile width
                prop = style['width'] / self.profile.width
                istate.attrib['width'] = "%d%%" % int(round(prop * 100))
        elif display == 'table':
            tag = 'table'
        elif display == 'table-row':
            tag = 'tr'
        elif display == 'table-cell':
            tag = 'td'
        if tag in TABLE_TAGS and self.ignore_tables:
            tag = 'span' if tag == 'td' else 'div'

        if tag in ('table', 'td', 'tr'):
            col = style.backgroundColor
            if col:
                elem.set('bgcolor', col)
            css = style.cssdict()
            if 'border' in css or 'border-width' in css:
                elem.set('border', '1')
        if tag in TABLE_TAGS:
            for attr in ('rowspan', 'colspan', 'width', 'border', 'scope',
                    'bgcolor'):
                if attr in elem.attrib:
                    istate.attrib[attr] = elem.attrib[attr]
        if tag == 'q':
            # Surround the quotation with typographic quote marks
            t = elem.text
            if not t:
                t = ''
            elem.text = '\u201c' + t
            t = elem.tail
            if not t:
                t = ''
            elem.tail = '\u201d' + t
        text = None
        if elem.text:
            if istate.preserve or istate.pre_wrap:
                text = elem.text
            elif (len(elem) > 0 and isspace(elem.text) and hasattr(elem[0].tag, 'rpartition') and
                  elem[0].tag.rpartition('}')[-1] not in INLINE_TAGS):
                # Whitespace in front of a non-inline first child is dropped
                text = None
            else:
                text = COLLAPSE.sub(' ', elem.text)
        valign = style['vertical-align']
        not_baseline = valign in ('super', 'sub', 'text-top',
                'text-bottom', 'top', 'bottom') or (
                isinstance(valign, numbers.Number) and abs(valign) != 0)
        issup = valign in ('super', 'text-top', 'top') or (
            isinstance(valign, numbers.Number) and valign > 0)
        vtag = 'sup' if issup else 'sub'
        if not_baseline and not ignore_valign and tag not in NOT_VTAGS and not isblock:
            # Render the element into a scratch tree, then move the result
            # into a <sup>/<sub><small> wrapper in the real tree
            nroot = etree.Element(XHTML('html'), nsmap=MOBI_NSMAP)
            vbstate = BlockState(etree.SubElement(nroot, XHTML('body')))
            vbstate.para = etree.SubElement(vbstate.body, XHTML('p'))
            self.mobimlize_elem(elem, stylizer, vbstate, istates,
                    ignore_valign=True)
            if len(istates) > 0:
                istates.pop()
            if len(istates) == 0:
                istates.append(FormatState())
            at_start = bstate.para is None
            if at_start:
                self.mobimlize_content('span', '', bstate, istates)
            parent = bstate.para if bstate.inline is None else bstate.inline
            if parent is not None:
                vtag = etree.SubElement(parent, XHTML(vtag))
                vtag = etree.SubElement(vtag, XHTML('small'))
                # Add anchors
                for child in vbstate.body:
                    if child is not vbstate.para:
                        vtag.append(child)
                    else:
                        break
                if vbstate.para is not None:
                    if vbstate.para.text:
                        vtag.text = vbstate.para.text
                    for child in vbstate.para:
                        vtag.append(child)
                return

        # Margins are never ignored inside a blockquote; the option is
        # switched off for the subtree and restored afterwards, even if the
        # conversion of the subtree raises.
        is_blockquote = tag == 'blockquote'
        if is_blockquote:
            old_mim = self.opts.mobi_ignore_margins
            self.opts.mobi_ignore_margins = False
        try:
            if (text or tag in CONTENT_TAGS or tag in NESTABLE_TAGS or (
                # We have an id but no text and no children, the id should still
                # be added.
                istate.ids and tag in ('a', 'span', 'i', 'b', 'u') and
                len(elem)==0)):
                if tag == 'li' and len(istates) > 1 and 'value' in elem.attrib:
                    try:
                        value = int(elem.attrib['value'])
                        istates[-2].list_num = value - 1
                    except ValueError:
                        # Non-numeric value attribute: keep the running count
                        pass
                self.mobimlize_content(tag, text, bstate, istates)
            for child in elem:
                self.mobimlize_elem(child, stylizer, bstate, istates)
                tail = None
                if child.tail:
                    if istate.preserve or istate.pre_wrap:
                        tail = child.tail
                    elif bstate.para is None and isspace(child.tail):
                        tail = None
                    else:
                        tail = COLLAPSE.sub(' ', child.tail)
                if tail:
                    self.mobimlize_content(tag, tail, bstate, istates)
        finally:
            if is_blockquote:
                self.opts.mobi_ignore_margins = old_mim

        if bstate.content and style['page-break-after'] in PAGE_BREAKS:
            bstate.pbreak = True
        if isblock:
            para = bstate.para
            if para is not None and para.text == '\xa0' and len(para) < 1:
                # The block holds nothing but a single non-breaking space
                if style.height > 2:
                    para.getparent().replace(para, etree.Element(XHTML('br')))
                else:
                    # This is too small to be rendered effectively, drop it
                    para.getparent().remove(para)
            bstate.para = None
            bstate.istate = None
            vmargin = convert_margin(style, 'margin-bottom')
            bstate.vmargin = max((bstate.vmargin, vmargin))
            vpadding = asfloat(style['padding-bottom'])
            if vpadding > 0:
                bstate.vpadding += bstate.vmargin
                bstate.vmargin = 0
                bstate.vpadding += vpadding
        if bstate.nested and bstate.nested[-1].tag == elem.tag:
            bstate.nested.pop()
        istates.pop()