%PDF- %PDF-
| Direktori : /lib/calibre/calibre/ebooks/oeb/ |
| Current File : //lib/calibre/calibre/ebooks/oeb/base.py |
'''
Basic support for manipulating OEB 1.x/2.0 content and metadata.
'''
__license__ = 'GPL v3'
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
__docformat__ = 'restructuredtext en'
import os, re, logging, sys, numbers
from collections import defaultdict
from itertools import count
from operator import attrgetter
from lxml import etree, html
from calibre import force_unicode
from calibre.constants import filesystem_encoding, __version__
from calibre.translations.dynamic import translate
from calibre.utils.xml_parse import safe_xml_fromstring
from calibre.ebooks.chardet import xml_to_unicode
from calibre.ebooks.conversion.preprocess import CSSPreProcessor
from calibre import (isbytestring, as_unicode, get_types_map)
from calibre.ebooks.oeb.parse_utils import barename, XHTML_NS, namespace, XHTML, parse_html, NotHTML
from calibre.utils.cleantext import clean_xml_chars
from calibre.utils.short_uuid import uuid4
from polyglot.builtins import iteritems, string_or_bytes, itervalues, codepoint_to_chr
from polyglot.urllib import unquote as urlunquote, urldefrag, urljoin, urlparse, urlunparse
from calibre.utils.icu import numeric_sort_key
# Namespace URIs used by the helpers and classes in this module.
XML_NS = 'http://www.w3.org/XML/1998/namespace'
OEB_DOC_NS = 'http://openebook.org/namespaces/oeb-document/1.0/'
OPF1_NS = 'http://openebook.org/namespaces/oeb-package/1.0/'
OPF2_NS = 'http://www.idpf.org/2007/opf'
# Both OPF package namespaces, for membership tests.
OPF_NSES = {OPF1_NS, OPF2_NS}
DC09_NS = 'http://purl.org/metadata/dublin_core'
DC10_NS = 'http://purl.org/dc/elements/1.0/'
DC11_NS = 'http://purl.org/dc/elements/1.1/'
# Every Dublin Core namespace variant listed above, for membership tests.
DC_NSES = {DC09_NS, DC10_NS, DC11_NS}
XSI_NS = 'http://www.w3.org/2001/XMLSchema-instance'
DCTERMS_NS = 'http://purl.org/dc/terms/'
NCX_NS = 'http://www.daisy.org/z3986/2005/ncx/'
SVG_NS = 'http://www.w3.org/2000/svg'
XLINK_NS = 'http://www.w3.org/1999/xlink'
CALIBRE_NS = 'http://calibre.kovidgoyal.net/2009/metadata'
RE_NS = 'http://exslt.org/regular-expressions'
MBP_NS = 'http://www.mobipocket.com'
EPUB_NS = 'http://www.idpf.org/2007/ops'
MATHML_NS = 'http://www.w3.org/1998/Math/MathML'
# Prefix -> namespace URI map passed as ``namespaces=`` to XPath evaluation.
XPNSMAP = {
'h': XHTML_NS, 'o1': OPF1_NS, 'o2': OPF2_NS, 'd09': DC09_NS,
'd10': DC10_NS, 'd11': DC11_NS, 'xsi': XSI_NS, 'dt': DCTERMS_NS,
'ncx': NCX_NS, 'svg': SVG_NS, 'xl': XLINK_NS, 're': RE_NS,
'mathml': MATHML_NS, 'mbp': MBP_NS, 'calibre': CALIBRE_NS,
'epub':EPUB_NS
}
# Prefix -> namespace URI maps for OPF 1.x and OPF 2.0 metadata elements.
OPF1_NSMAP = {'dc': DC11_NS, 'oebpackage': OPF1_NS}
OPF2_NSMAP = {'opf': OPF2_NS, 'dc': DC11_NS, 'dcterms': DCTERMS_NS,
'xsi': XSI_NS, 'calibre': CALIBRE_NS}
def XML(name):
    """Return *name* qualified with the XML namespace as ``{uri}name``."""
    return '{%s}%s' % (XML_NS, name)
def OPF(name):
    """Return *name* qualified with the OPF 2.0 namespace as ``{uri}name``."""
    return '{%s}%s' % (OPF2_NS, name)
def DC(name):
    """Return *name* qualified with the Dublin Core 1.1 namespace."""
    return '{%s}%s' % (DC11_NS, name)
def XSI(name):
    """Return *name* qualified with the XML Schema instance namespace."""
    return '{%s}%s' % (XSI_NS, name)
def DCTERMS(name):
    """Return *name* qualified with the DC terms namespace."""
    return '{%s}%s' % (DCTERMS_NS, name)
def NCX(name):
    """Return *name* qualified with the NCX namespace as ``{uri}name``."""
    return '{%s}%s' % (NCX_NS, name)
def SVG(name):
    """Return *name* qualified with the SVG namespace as ``{uri}name``."""
    return '{%s}%s' % (SVG_NS, name)
def XLINK(name):
    """Return *name* qualified with the XLink namespace as ``{uri}name``."""
    return '{%s}%s' % (XLINK_NS, name)
def CALIBRE(name):
    """Return *name* qualified with the calibre metadata namespace."""
    return '{%s}%s' % (CALIBRE_NS, name)
# Matches a CSS ``url(...)`` token (case-insensitive, optional quote on
# either side); group 1 is the link text.
_css_url_re = re.compile(r'url\s*\([\'"]{0,1}(.*?)[\'"]{0,1}\)', re.I)
# Matches ``@import "..."``; only the double-quoted form is recognised.
_css_import_re = re.compile(r'@import "(.*?)"')
# Matches one run of non-space characters; iterlinks() uses it to split the
# ``archive`` attribute of <object> elements.
_archive_re = re.compile(r'[^ ]+')
# Tags that should not be self closed in epub output
self_closing_bad_tags = {'a', 'abbr', 'address', 'article', 'aside', 'audio', 'b',
'bdo', 'blockquote', 'body', 'button', 'cite', 'code', 'dd', 'del', 'details',
'dfn', 'div', 'dl', 'dt', 'em', 'fieldset', 'figcaption', 'figure', 'footer',
'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', 'hgroup', 'i', 'iframe', 'ins', 'kbd',
'label', 'legend', 'li', 'map', 'mark', 'meter', 'nav', 'ol', 'output', 'p',
'pre', 'progress', 'q', 'rp', 'rt', 'samp', 'section', 'select', 'small',
'span', 'strong', 'sub', 'summary', 'sup', 'textarea', 'time', 'ul', 'var',
'video', 'title', 'script', 'style'}
def css_text(x):
    """Return ``x.cssText`` as text.

    Bytes values are decoded as UTF-8, with undecodable bytes replaced.
    """
    text = x.cssText
    return text.decode('utf-8', 'replace') if isinstance(text, bytes) else text
def as_string_type(pat, for_unicode):
    """Coerce *pat* to ``str`` when *for_unicode* is true, else to ``bytes``.

    Conversion uses UTF-8; a value that already has the requested type (or is
    neither ``str`` nor ``bytes``) is returned unchanged.
    """
    if for_unicode and isinstance(pat, bytes):
        return pat.decode('utf-8')
    if not for_unicode and isinstance(pat, str):
        return pat.encode('utf-8')
    return pat
def self_closing_pat(for_unicode):
    """Return the compiled regex matching self-closed ``self_closing_bad_tags``.

    One pattern is built for text and one for bytes; each is cached as an
    attribute on this function after first use.
    """
    cache_name = 'unicode_ans' if for_unicode else 'bytes_ans'
    cached = getattr(self_closing_pat, cache_name, None)
    if cached is not None:
        return cached
    alternatives = '|'.join(self_closing_bad_tags)
    source = r'<(?P<tag>%s)(?=[\s/])(?P<arg>[^>]*)/>' % alternatives
    compiled = re.compile(as_string_type(source, for_unicode),
                          flags=re.IGNORECASE)
    setattr(self_closing_pat, cache_name, compiled)
    return compiled
def close_self_closing_tags(raw):
    """Expand ``<tag .../>`` into ``<tag ...></tag>`` for the listed tags.

    Works on both ``str`` and ``bytes`` input and returns the same type.
    """
    is_text = isinstance(raw, str)
    replacement = as_string_type(r'<\g<tag>\g<arg>></\g<tag>>', is_text)
    return self_closing_pat(is_text).sub(replacement, raw)
def uuid_id():
    """Return a fresh identifier: the letter ``u`` followed by ``uuid4()``."""
    return ''.join(('u', uuid4()))
def itercsslinks(raw):
    """Yield ``(link, offset)`` for every link found in the CSS text *raw*.

    All ``url(...)`` matches are produced first, followed by all
    ``@import "..."`` matches; ``offset`` is where the link starts in *raw*.
    """
    for pattern in (_css_url_re, _css_import_re):
        for found in pattern.finditer(raw):
            yield found.group(1), found.start(1)
# Attribute names whose values iterlinks() reports as links: lxml's HTML link
# attributes plus the qualified xlink ``href`` and the ``poster`` attribute.
_link_attrs = set(html.defs.link_attrs) | {XLINK('href'), 'poster'}
def _iter_object_links(el, attribs):
    """Yield the link tuples of a single ``<object>`` element."""
    codebase = None
    # <object> tags have attributes that are relative to codebase
    if 'codebase' in attribs:
        codebase = el.get('codebase')
        yield (el, 'codebase', codebase, 0)

    def resolve(value):
        return value if codebase is None else urljoin(codebase, value)

    for name in ('classid', 'data'):
        if name in attribs:
            yield (el, name, resolve(el.get(name)), 0)
    if 'archive' in attribs:
        for found in _archive_re.finditer(el.get('archive')):
            yield (el, 'archive', resolve(found.group(0)), found.start())


def iterlinks(root, find_links_in_css=True):
    '''
    Iterate over all links in a OEB Document.

    Yields ``(element, attribute, link, pos)`` tuples. ``attribute`` is
    ``None`` when the link was found in the text of a ``<style>`` element, and
    ``pos`` is the offset of the link inside the attribute value or text.

    :param root: A valid lxml.etree element.
    :param find_links_in_css: When false, ``<style>`` text and ``style``
        attributes are not searched.
    '''
    assert etree.iselement(root)
    for el in root.iter('*'):
        try:
            tag = barename(el.tag).lower()
        except Exception:
            continue
        attribs = el.attrib
        if tag == 'object':
            yield from _iter_object_links(el, attribs)
        else:
            for attr in attribs:
                if attr in _link_attrs:
                    yield (el, attr, attribs[attr], 0)
        if find_links_in_css:
            if tag == 'style' and el.text:
                for pattern in (_css_url_re, _css_import_re):
                    for found in pattern.finditer(el.text):
                        yield (el, None, found.group(1), found.start(1))
            if 'style' in attribs:
                for found in _css_url_re.finditer(attribs['style']):
                    yield (el, 'style', found.group(1), found.start(1))
def make_links_absolute(root, base_url):
    '''
    Rewrite every link in the document as an absolute URL by joining it
    onto ``base_url`` (the full URL where the document came from).
    '''
    rewrite_links(root, lambda href: urljoin(base_url, href))
def resolve_base_href(root):
    '''
    Apply and remove the document's ``<base href>`` element(s).

    Every ``base`` element with an ``href`` (with or without the XHTML
    namespace) is dropped from the tree. If the last one found has a non-empty
    ``href``, all links in the document are made absolute against it.
    '''
    base_href = None
    basetags = root.xpath('//base[@href]|//h:base[@href]',
                          namespaces=XPNSMAP)
    for b in basetags:
        base_href = b.get('href')
        b.drop_tree()
    if not base_href:
        return
    # make_links_absolute() accepts only (root, base_url); the keyword that
    # used to be passed here made this call raise TypeError.
    make_links_absolute(root, base_href)
def rewrite_links(root, link_repl_func, resolve_base_href=False):
    '''
    Rewrite all the links in the document. For each link
    ``link_repl_func(link)`` will be called, and the return value
    will replace the old link.

    Note that links may not be absolute (unless you first called
    ``make_links_absolute()``), and may be internal (e.g.,
    ``'#anchor'``). They can also be values like
    ``'mailto:email'`` or ``'javascript:expr'``.

    If the ``link_repl_func`` returns None, the attribute or
    tag text will be removed completely.

    :param root: The lxml element whose subtree is rewritten in place.
    :param link_repl_func: Callable taking the stripped link text.
    :param resolve_base_href: When true, ``<base href>`` elements are applied
        and removed before the links are rewritten.
    '''
    from css_parser import replaceUrls, log, CSSParser
    log.setLevel(logging.WARN)
    log.raiseExceptions = False

    if resolve_base_href:
        # The parameter shadows the module-level function of the same name, so
        # the function has to be fetched from the module namespace; calling the
        # parameter itself would raise "'bool' object is not callable".
        globals()['resolve_base_href'](root)

    # Pass 1: links held in attributes. CSS is handled in pass 2 below.
    for el, attrib, link, pos in iterlinks(root, find_links_in_css=False):
        new_link = link_repl_func(link.strip())
        if new_link == link:
            continue
        if new_link is None:
            # Remove the attribute or element content
            if attrib is None:
                el.text = ''
            else:
                del el.attrib[attrib]
            continue
        if attrib is None:
            new = el.text[:pos] + new_link + el.text[pos+len(link):]
            el.text = new
        else:
            cur = el.attrib[attrib]
            if not pos and len(cur) == len(link):
                # Most common case
                el.attrib[attrib] = new_link
            else:
                new = cur[:pos] + new_link + cur[pos+len(link):]
                el.attrib[attrib] = new

    # Pass 2: links inside <style> elements and style="" attributes, rewritten
    # by parsing the CSS and letting css_parser replace the URLs.
    parser = CSSParser(raiseExceptions=False, log=_css_logger,
                       fetcher=lambda x: (None, ''))
    for el in root.iter(etree.Element):
        try:
            tag = el.tag
        except UnicodeDecodeError:
            continue

        if tag == XHTML('style') and el.text and (
                _css_url_re.search(el.text) is not None
                or '@import' in el.text):
            stylesheet = parser.parseString(el.text, validate=False)
            replaceUrls(stylesheet, link_repl_func)
            repl = css_text(stylesheet)
            el.text = '\n' + clean_xml_chars(repl) + '\n'

        text = el.get('style')
        if text and _css_url_re.search(text) is not None:
            try:
                stext = parser.parseStyle(text, validate=False)
            except Exception:
                # Parsing errors are raised by css_parser
                continue
            replaceUrls(stext, link_repl_func)
            # An attribute value must stay on a single line.
            repl = css_text(stext).replace('\n', ' ').replace('\r', ' ')
            el.set('style', repl)
# MIME types looked up by file extension in the map from get_types_map().
types_map = get_types_map()
EPUB_MIME = types_map['.epub']
XHTML_MIME = types_map['.xhtml']
CSS_MIME = types_map['.css']
NCX_MIME = types_map['.ncx']
OPF_MIME = types_map['.opf']
PAGE_MAP_MIME = 'application/oebps-page-map+xml'
OEB_DOC_MIME = 'text/x-oeb1-document'
OEB_CSS_MIME = 'text/x-oeb1-css'
OPENTYPE_MIME = types_map['.otf']
GIF_MIME = types_map['.gif']
JPEG_MIME = types_map['.jpeg']
PNG_MIME = types_map['.png']
SVG_MIME = types_map['.svg']
WEBP_MIME = types_map['.webp']
BINARY_MIME = 'application/octet-stream'
# CSS ``@namespace`` rule declaring the XHTML namespace.
XHTML_CSS_NAMESPACE = '@namespace "%s";\n' % XHTML_NS
# Media types that Manifest.Item.data parses as CSS.
OEB_STYLES = {CSS_MIME, OEB_CSS_MIME, 'text/x-oeb-css', 'xhtml/css'}
# Media types that Manifest.Item.data parses as (X)HTML.
OEB_DOCS = {XHTML_MIME, 'text/html', OEB_DOC_MIME,
'text/x-oeb-document'}
# NOTE(review): OEB_RASTER_IMAGES contains WEBP_MIME but OEB_IMAGES does not;
# confirm that this difference is intended.
OEB_RASTER_IMAGES = {GIF_MIME, JPEG_MIME, PNG_MIME, WEBP_MIME}
OEB_IMAGES = {GIF_MIME, JPEG_MIME, PNG_MIME, SVG_MIME}
MS_COVER_TYPE = 'other.ms-coverimage-standard'
# Matches ``&name;`` entity references; group 1 is the entity name.
# NOTE(review): inside the second character class ``.-_`` is parsed as a range
# from '.' to '_', not as three literal characters -- confirm this is intended.
ENTITY_RE = re.compile(r'&([a-zA-Z_:][a-zA-Z0-9.-_:]+);')
# One or more consecutive whitespace characters.
COLLAPSE_RE = re.compile(r'[ \t\r\n\v]+')
# A whole ``{namespace}local`` qualified name.
QNAME_RE = re.compile(r'^[{][^{}]+[}][^{}]+$')
# The start of a ``prefix:local`` name.
PREFIXNAME_RE = re.compile(r'^[^:]+[:][^:]+')
# A leading ``<?xml ... ?>`` declaration, optionally preceded by whitespace.
XMLDECL_RE = re.compile(r'^\s*<[?]xml.*?[?]>')
# A CSS ``url(...)`` token; the named group ``url`` is the link and ``q`` the
# optional quote character, which must match on both sides.
CSSURL_RE = re.compile(r'''url[(](?P<q>["']?)(?P<url>[^)]+)(?P=q)[)]''')
def element(parent, *args, **kwargs):
    """Create an lxml element, attached to *parent* unless it is ``None``.

    The remaining arguments are forwarded to ``etree.SubElement`` (with a
    parent) or ``etree.Element`` (without one).
    """
    if parent is None:
        return etree.Element(*args, **kwargs)
    return etree.SubElement(parent, *args, **kwargs)
def prefixname(name, nsrmap):
    """Convert a ``{namespace}local`` name to ``prefix:local`` form.

    *nsrmap* maps namespace URIs to prefixes. *name* is returned unchanged
    when it is not a qualified name or its namespace is not in *nsrmap*; a
    falsy prefix yields the bare local name.
    """
    if isqname(name):
        ns = namespace(name)
        if ns in nsrmap:
            prefix = nsrmap[ns]
            if not prefix:
                return barename(name)
            return ':'.join((prefix, barename(name)))
    return name
def isprefixname(name):
    """Tell whether *name* starts like ``prefix:local``.

    A falsy *name* is returned as is; otherwise the result is a bool.
    """
    if not name:
        return name
    return PREFIXNAME_RE.match(name) is not None
def qname(name, nsmap):
    """Convert a ``prefix:local`` name to ``{namespace}local`` form.

    *nsmap* maps prefixes to namespace URIs. *name* is returned unchanged when
    it is not in prefixed form or its prefix is not in *nsmap*.
    """
    if isprefixname(name):
        prefix, local = name.split(':', 1)
        if prefix in nsmap:
            return '{%s}%s' % (nsmap[prefix], local)
    return name
def isqname(name):
    """Tell whether *name* has the form ``{namespace}local``.

    A falsy *name* is returned as is; otherwise the result is a bool.
    """
    if not name:
        return name
    return QNAME_RE.match(name) is not None
def XPath(expr):
    """Compile *expr* into an ``etree.XPath`` bound to the XPNSMAP prefixes."""
    compiled = etree.XPath(expr, namespaces=XPNSMAP)
    return compiled
def xpath(elem, expr):
    """Evaluate *expr* against *elem* using the XPNSMAP prefixes."""
    result = elem.xpath(expr, namespaces=XPNSMAP)
    return result
def xml2str(root, pretty_print=False, strip_comments=False, with_tail=True):
    """Serialize *root* to UTF-8 bytes with an XML declaration.

    With *strip_comments* every ``<!-- ... -->`` span is removed from the
    output. Otherwise ``--`` inside comment text is rewritten to ``__`` in the
    tree itself before serializing.
    """
    options = dict(encoding='utf-8', xml_declaration=True,
                   pretty_print=pretty_print, with_tail=with_tail)
    if strip_comments:
        raw = etree.tostring(root, **options)
        return re.compile(br'<!--.*?-->', re.DOTALL).sub(b'', raw)
    # -- in comments trips up adobe digital editions
    for comment in root.iterdescendants(etree.Comment):
        body = comment.text
        if body and '--' in body:
            comment.text = body.replace('--', '__')
    return etree.tostring(root, **options)
def xml2text(elem, pretty_print=False, method='text'):
    """Serialize *elem* to a ``str`` using *method*, leaving out its tail."""
    return etree.tostring(
        elem,
        method=method,
        encoding='unicode',
        with_tail=False,
        pretty_print=pretty_print,
    )
def escape_cdata(root):
    """Wrap XHTML ``style``/``script`` text containing ``<``, ``>`` or ``&``
    in a CDATA section.

    Any ``]]>`` in the text is backslash-escaped first.
    """
    needs_cdata = re.compile(r'[<>&]').search
    tags = ('{%s}style' % XHTML_NS, '{%s}script' % XHTML_NS)
    for node in root.iterdescendants(*tags):
        if node.text and needs_cdata(node.text) is not None:
            node.text = etree.CDATA(node.text.replace(']]>', r'\]\]\>'))
def serialize(data, media_type, pretty_print=False):
    """Convert parsed item *data* to bytes.

    - lxml elements are serialized with :func:`xml2str`; for OEB document
      media types CDATA is escaped first and self-closed tags are expanded.
    - ``str`` is encoded as UTF-8.
    - objects with a ``cssText`` attribute yield that text (UTF-8 encoded if
      needed) followed by a newline.
    - anything else goes through ``bytes()``.
    """
    if isinstance(data, etree._Element):
        if media_type not in OEB_DOCS:
            return xml2str(data, pretty_print=pretty_print)
        escape_cdata(data)
        # Convert self closing div|span|a|video|audio|iframe|etc tags
        # to normally closed ones, as they are interpreted
        # incorrectly by some browser based renderers
        return close_self_closing_tags(xml2str(data, pretty_print=pretty_print))
    if isinstance(data, str):
        return data.encode('utf-8')
    if not hasattr(data, 'cssText'):
        return bytes(data)
    css = data.cssText
    if isinstance(css, str):
        css = css.encode('utf-8')
    return css + b'\n'
# Every ASCII code point (0-127) as a one-character str.
ASCII_CHARS = frozenset(codepoint_to_chr(x) for x in range(128))
# The same characters, each as a one-byte bytes object.
UNIBYTE_CHARS = frozenset(x.encode('ascii') for x in ASCII_CHARS)
# Characters that urlquote() leaves unescaped.
USAFE = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
'abcdefghijklmnopqrstuvwxyz'
'0123456789' '_.-/~')
URL_SAFE = frozenset(USAFE)
URL_SAFE_BYTES = frozenset(USAFE.encode('ascii'))
# Index 0 holds the unsafe str characters, index 1 the unsafe single bytes;
# urlquote() picks the entry with int(isinstance(href, bytes)).
URL_UNSAFE = [ASCII_CHARS - URL_SAFE, UNIBYTE_CHARS - URL_SAFE_BYTES]
del USAFE
def urlquote(href):
    """ Quote URL-unsafe characters, allowing IRI-safe characters.
    That is, this function returns valid IRIs not valid URIs. In particular,
    IRIs can contain non-ascii characters.

    Accepts ``str`` or ``bytes`` and returns the same type. Each unsafe ASCII
    character is replaced by ``%xx`` (lowercase hex); everything else,
    including non-ASCII characters and bytes, is passed through unchanged.
    """
    isbytes = isinstance(href, bytes)
    unsafe = URL_UNSAFE[int(isbytes)]
    if isbytes:
        esc, joiner = b'%%%02x', b''
        # Iterating bytes yields ints, which never match the one-byte members
        # of ``unsafe`` and cannot be joined; slice out length-1 bytes instead.
        chars = (href[i:i + 1] for i in range(len(href)))
    else:
        esc, joiner = '%%%02x', ''
        chars = href
    return joiner.join(
        esc % ord(char) if char in unsafe else char for char in chars)
def urlnormalize(href):
    """Return *href* in normalized form: backslashes become ``/`` and exactly
    the URL-unsafe characters are percent-quoted.

    URLs with no scheme or a ``file`` scheme are reduced to their path and
    fragment. Raises ``ValueError`` when *href* cannot be parsed.
    """
    try:
        parsed = urlparse(href)
    except ValueError as e:
        raise ValueError(f'Failed to parse the URL: {href!r} with underlying error: {as_unicode(e)}')
    scheme = parsed.scheme
    if not scheme or scheme == 'file':
        path, frag = urldefrag(href)
        parsed = ('', '', path, '', '', frag)
    # Unquote before quoting so already-escaped input is not escaped twice.
    normalized = (
        urlquote(urlunquote(component.replace('\\', '/')))
        for component in parsed
    )
    return urlunparse(normalized)
def extract(elem):
    """
    Detach *elem* (with its children and text) from its parent.

    The element's tail text is kept: it is appended to the tail of the
    previous sibling or, when there is none, to the parent's text. An element
    without a parent is left untouched.
    """
    parent = elem.getparent()
    if parent is None:
        return
    tail = elem.tail
    if tail:
        sibling = elem.getprevious()
        if sibling is None:
            parent.text = (parent.text or '') + tail
        else:
            sibling.tail = (sibling.tail or '') + tail
    parent.remove(elem)
class DummyHandler(logging.Handler):
    """Logging handler that forwards formatted records to ``self.log``.

    Records are dropped while ``log`` is ``None``. Otherwise records at ERROR
    level or above go to ``log.error`` and all others to ``log.warn``.
    """

    def __init__(self):
        logging.Handler.__init__(self, logging.WARNING)
        self.setFormatter(logging.Formatter('%(message)s'))
        self.log = None

    def emit(self, record):
        if self.log is None:
            return
        msg = self.format(record)
        if record.levelno >= logging.ERROR:
            sink = self.log.error
        else:
            sink = self.log.warn
        sink(msg)
# Logger passed to css_parser's CSSParser (``log=_css_logger``) in this
# module. Its only handler is a DummyHandler, which drops records until the
# handler's ``log`` attribute is set.
_css_logger = logging.getLogger('calibre.css')
_css_logger.setLevel(logging.WARNING)
_css_log_handler = DummyHandler()
_css_logger.addHandler(_css_log_handler)
class OEBError(Exception):
    """Generic error raised while processing OEB content."""
class NullContainer:
    """An empty container.

    For use with book formats which do not support container-like access.
    Reading or writing always raises :class:`OEBError`; the container reports
    no existing paths and an empty name list.
    """

    def __init__(self, log):
        self.log = log

    def read(self, path):
        raise OEBError('Attempt to read from NullContainer')

    def write(self, path, data=None):
        # ``data`` mirrors DirContainer.write(path, data), so callers passing
        # data get the documented OEBError rather than a TypeError.
        raise OEBError('Attempt to write to NullContainer')

    def exists(self, path):
        return False

    def namelist(self):
        return []
class DirContainer:
    """Filesystem directory container.

    *path* is either a directory or the path of an ``.opf`` file. In the
    latter case the file's directory becomes the container root. Paths given
    to the methods are URL-unquoted and joined onto the root.
    """

    def __init__(self, path, log, ignore_opf=False):
        self.log = log
        if isbytestring(path):
            path = path.decode(filesystem_encoding)
        self.opfname = None
        ext = os.path.splitext(path)[1].lower()
        if ext == '.opf':
            self.opfname = os.path.basename(path)
            self.rootdir = os.path.dirname(path)
            return
        self.rootdir = path
        if not ignore_opf:
            # Use the first .opf file found below the root, if any.
            for name in self.namelist():
                if os.path.splitext(name)[1].lower() == '.opf':
                    self.opfname = name
                    return

    def _unquote(self, path):
        # unquote must run on a bytestring and will return a bytestring
        # If it runs on a unicode object, it returns a double encoded unicode
        # string: unquote(u'%C3%A4') != unquote(b'%C3%A4').decode('utf-8')
        # and the latter is correct
        if isinstance(path, str):
            path = path.encode('utf-8')
        return urlunquote(path).decode('utf-8')

    def read(self, path):
        """Return the bytes of *path*; ``None`` means the OPF file."""
        if path is None:
            path = self.opfname
        path = os.path.join(self.rootdir, self._unquote(path))
        with lopen(path, 'rb') as f:
            return f.read()

    def write(self, path, data):
        """Write *data* to *path*, creating missing parent directories."""
        path = os.path.join(self.rootdir, self._unquote(path))
        parent = os.path.dirname(path)
        # ``parent`` is '' when the target sits directly in a root of '';
        # os.makedirs('') would raise, and there is nothing to create anyway.
        if parent and not os.path.isdir(parent):
            os.makedirs(parent)
        with lopen(path, 'wb') as f:
            return f.write(data)

    def exists(self, path):
        """Tell whether *path* names an existing regular file."""
        if not path:
            return False
        try:
            path = os.path.join(self.rootdir, self._unquote(path))
        except ValueError:  # Happens if path contains quoted special chars
            return False
        try:
            return os.path.isfile(path)
        except UnicodeEncodeError:
            # On linux, if LANG is unset, the os.stat call tries to encode the
            # unicode path using ASCII
            # To replicate try:
            # LANG=en_US.ASCII python -c "import os; os.stat(u'Espa\xf1a')"
            return os.path.isfile(path.encode(filesystem_encoding))

    def namelist(self):
        """Return the ``/``-separated path of every file below the root.

        Each path is ``os.path.join(dirpath, filename)`` as produced by
        ``os.walk(self.rootdir)``. Bytes names that decode with neither the
        filesystem encoding nor UTF-8 are skipped.
        """
        names = []
        for root, dirs, files in os.walk(self.rootdir):
            for fname in files:
                fname = os.path.join(root, fname)
                if isinstance(fname, bytes):
                    try:
                        fname = fname.decode(filesystem_encoding)
                    except Exception:
                        try:
                            fname = fname.decode('utf-8')
                        except Exception:
                            continue
                names.append(fname.replace('\\', '/'))
        return names
class Metadata:
    """A collection of OEB data model metadata.

    Provides access to the list of items associated with a particular metadata
    term via the term's local name using either Python container or attribute
    syntax. Return an empty list for any terms with no currently associated
    metadata items.
    """

    DC_TERMS = {'contributor', 'coverage', 'creator', 'date',
                'description', 'format', 'identifier', 'language',
                'publisher', 'relation', 'rights', 'source',
                'subject', 'title', 'type'}
    CALIBRE_TERMS = {'series', 'series_index', 'rating', 'timestamp',
                     'publication_type', 'title_sort'}
    # Short attribute name -> attribute name as stored on an Item.
    OPF_ATTRS = {'role': OPF('role'), 'file-as': OPF('file-as'),
                 'scheme': OPF('scheme'), 'event': OPF('event'),
                 'type': XSI('type'), 'lang': XML('lang'), 'id': 'id'}
    OPF1_NSMAP = {'dc': DC11_NS, 'oebpackage': OPF1_NS}
    OPF2_NSMAP = {'opf': OPF2_NS, 'dc': DC11_NS, 'dcterms': DCTERMS_NS,
                  'xsi': XSI_NS, 'calibre': CALIBRE_NS}

    class Item:
        """An item of OEB data model metadata.

        The metadata term or name may be accessed via the :attr:`term` or
        :attr:`name` attributes. The metadata value or content may be accessed
        via the :attr:`value` or :attr:`content` attributes, or via Unicode or
        string representations of the object.

        OEB data model metadata attributes may be accessed either via their
        fully-qualified names using the Python container access syntax, or via
        their local names using Python attribute syntax. Only attributes
        allowed by the OPF 2.0 specification are supported.
        """

        class Attribute:
            """Smart accessor for allowed OEB metadata item attributes."""

            def __init__(self, attr, allowed=None):
                # ``attr`` is either the attribute name or a callable mapping
                # a term to the attribute name; normalise to the callable form.
                if not callable(attr):
                    attr_, attr = attr, lambda term: attr_
                self.attr = attr
                self.allowed = allowed

            def term_attr(self, obj):
                """Return the attribute name to use for *obj*.

                Raises ``AttributeError`` when the attribute is not allowed
                for the item's term.
                """
                term = obj.term
                if namespace(term) != DC11_NS:
                    # Everything that is not Dublin Core is an opf:meta.
                    term = OPF('meta')
                allowed = self.allowed
                if allowed is not None and term not in allowed:
                    raise AttributeError(
                        'attribute {!r} not valid for metadata term {!r}'.format(
                            self.attr(term), barename(obj.term)))
                return self.attr(term)

            def __get__(self, obj, cls):
                if obj is None:
                    return None
                return obj.attrib.get(self.term_attr(obj), '')

            def __set__(self, obj, value):
                obj.attrib[self.term_attr(obj)] = value

        def __init__(self, term, value, attrib={}, nsmap={}, **kwargs):
            # Copies are taken, so the shared default dicts are never mutated.
            self.attrib = attrib = dict(attrib)
            self.nsmap = nsmap = dict(nsmap)
            attrib.update(kwargs)
            if namespace(term) == OPF2_NS:
                term = barename(term)
            ns = namespace(term)
            local = barename(term).lower()
            if local in Metadata.DC_TERMS and (not ns or ns in DC_NSES):
                # Anything looking like Dublin Core is coerced
                term = DC(local)
            elif local in Metadata.CALIBRE_TERMS and ns in (CALIBRE_NS, ''):
                # Ditto for Calibre-specific metadata
                term = CALIBRE(local)
            self.term = term
            self.value = value
            # Iterate over a snapshot because keys are renamed inside the loop.
            for attr, attr_value in tuple(iteritems(attrib)):
                if isprefixname(attr_value):
                    attrib[attr] = qname(attr_value, nsmap)
                nsattr = Metadata.OPF_ATTRS.get(attr, attr)
                if nsattr == OPF('scheme') and namespace(term) != DC11_NS:
                    # The opf:meta element takes @scheme, not @opf:scheme
                    nsattr = 'scheme'
                if attr != nsattr:
                    attrib[nsattr] = attrib.pop(attr)

        @property
        def name(self):
            return self.term

        @property
        def content(self):
            return self.value

        @content.setter
        def content(self, value):
            self.value = value

        scheme = Attribute(lambda term: 'scheme' if
                           term == OPF('meta') else OPF('scheme'),
                           [DC('identifier'), OPF('meta')])
        file_as = Attribute(OPF('file-as'), [DC('creator'), DC('contributor'),
                                             DC('title')])
        role = Attribute(OPF('role'), [DC('creator'), DC('contributor')])
        event = Attribute(OPF('event'), [DC('date')])
        id = Attribute('id')
        type = Attribute(XSI('type'), [DC('date'), DC('format'),
                                       DC('type')])
        lang = Attribute(XML('lang'), [DC('contributor'), DC('coverage'),
                                       DC('creator'), DC('publisher'),
                                       DC('relation'), DC('rights'),
                                       DC('source'), DC('subject'),
                                       OPF('meta')])

        def __getitem__(self, key):
            return self.attrib[key]

        def __setitem__(self, key, value):
            self.attrib[key] = value

        def __contains__(self, key):
            return key in self.attrib

        def get(self, key, default=None):
            return self.attrib.get(key, default)

        def __repr__(self):
            return 'Item(term=%r, value=%r, attrib=%r)' \
                % (barename(self.term), self.value, self.attrib)

        def __str__(self):
            return as_unicode(self.value)

        def to_opf1(self, dcmeta=None, xmeta=None, nsrmap={}):
            """Create the OPF 1.x element for this item.

            Dublin Core items become a child of *dcmeta*; all others become a
            ``meta`` child of *xmeta*. Returns the new element.
            """
            attrib = {}
            for key, value in self.attrib.items():
                if namespace(key) == OPF2_NS:
                    key = barename(key)
                attrib[key] = prefixname(value, nsrmap)
            if namespace(self.term) == DC11_NS:
                name = DC(icu_title(barename(self.term)))
                elem = element(dcmeta, name, attrib=attrib)
                elem.text = self.value
            else:
                elem = element(xmeta, 'meta', attrib=attrib)
                elem.attrib['name'] = prefixname(self.term, nsrmap)
                elem.attrib['content'] = prefixname(self.value, nsrmap)
            return elem

        def to_opf2(self, parent=None, nsrmap={}):
            """Create the OPF 2.0 element for this item under *parent*.

            Dublin Core items keep their own tag; all others become an
            ``opf:meta`` element. Returns the new element.
            """
            attrib = {}
            for key, value in self.attrib.items():
                attrib[key] = prefixname(value, nsrmap)
            if namespace(self.term) == DC11_NS:
                elem = element(parent, self.term, attrib=attrib)
                try:
                    elem.text = self.value
                except Exception:
                    # Fall back to the repr when the value is rejected as
                    # element text.
                    elem.text = repr(self.value)
            else:
                elem = element(parent, OPF('meta'), attrib=attrib)
                elem.attrib['name'] = prefixname(self.term, nsrmap)
                elem.attrib['content'] = prefixname(self.value, nsrmap)
            return elem

    def __init__(self, oeb):
        self.oeb = oeb
        # Local term name -> list of Item objects.
        self.items = defaultdict(list)
        self.primary_writing_mode = None

    def add(self, term, value, attrib={}, nsmap={}, **kwargs):
        """Add a new metadata item."""
        item = self.Item(term, value, attrib, nsmap, **kwargs)
        items = self.items[barename(item.term)]
        items.append(item)
        return item

    def iterkeys(self):
        yield from self.items
    __iter__ = iterkeys

    def clear(self, key):
        """Remove every item stored under *key* (the list object is kept)."""
        del self.items[key][:]

    def filter(self, key, predicate):
        """Remove the items under *key* for which *predicate* is true."""
        l = self.items[key]
        for x in list(l):
            if predicate(x):
                l.remove(x)

    def __getitem__(self, key):
        return self.items[key]

    def __contains__(self, key):
        return key in self.items

    def __getattr__(self, term):
        # Only reached for names not found by normal attribute lookup; looking
        # a term up creates an empty list for it because ``items`` is a
        # defaultdict.
        return self.items[term]

    @property
    def _nsmap(self):
        nsmap = {}
        for term in self.items:
            for item in self.items[term]:
                nsmap.update(item.nsmap)
        return nsmap

    @property
    def _opf1_nsmap(self):
        # Build a filtered copy: deleting from the dict while iterating over
        # it raises RuntimeError.
        return {
            key: value for key, value in self._nsmap.items()
            if value not in OPF_NSES and value not in DC_NSES
        }

    @property
    def _opf2_nsmap(self):
        nsmap = self._nsmap
        nsmap.update(OPF2_NSMAP)
        return nsmap

    def to_opf1(self, parent=None):
        """Build the OPF 1.x ``metadata`` element for all items."""
        nsmap = self._opf1_nsmap
        nsrmap = {value: key for key, value in iteritems(nsmap)}
        elem = element(parent, 'metadata', nsmap=nsmap)
        dcmeta = element(elem, 'dc-metadata', nsmap=OPF1_NSMAP)
        xmeta = element(elem, 'x-metadata')
        for term in self.items:
            for item in self.items[term]:
                item.to_opf1(dcmeta, xmeta, nsrmap=nsrmap)
        if 'ms-chaptertour' not in self.items:
            chaptertour = self.Item('ms-chaptertour', 'chaptertour')
            chaptertour.to_opf1(dcmeta, xmeta, nsrmap=nsrmap)
        return elem

    def to_opf2(self, parent=None):
        """Build the OPF 2.0 ``metadata`` element for all items."""
        nsmap = self._opf2_nsmap
        nsrmap = {value: key for key, value in iteritems(nsmap)}
        elem = element(parent, OPF('metadata'), nsmap=nsmap)
        for term in self.items:
            for item in self.items[term]:
                item.to_opf2(elem, nsrmap=nsrmap)
        if self.primary_writing_mode:
            elem.append(elem.makeelement(OPF('meta'), attrib={'name':'primary-writing-mode', 'content':self.primary_writing_mode}))
        return elem
class Manifest:
"""Collection of files composing an OEB data model book.
Provides access to the content of the files composing the book and
attributes associated with those files, including their internal paths,
unique identifiers, and MIME types.
Itself acts as a :class:`set` of manifest items, and provides the following
instance data member for dictionary-like access:
:attr:`ids`: A dictionary in which the keys are the unique identifiers of
the manifest items and the values are the items themselves.
:attr:`hrefs`: A dictionary in which the keys are the internal paths of the
manifest items and the values are the items themselves.
"""
class Item:
    """A single content file of an OEB data model book.

    Data members describing the file:

    :attr:`id`: Unique identifier.
    :attr:`href`: Book-internal path.
    :attr:`media_type`: MIME type of the file content.
    :attr:`fallback`: Unique id of any fallback manifest item associated
        with this manifest item.
    :attr:`spine_position`: Display/reading order index for book textual
        content. `None` for manifest items which are not part of the
        book's textual content.
    :attr:`linear`: `True` for textual content items which are part of the
        primary linear reading order and `False` for textual content items
        which are not (such as footnotes). Meaningless for items which
        have a :attr:`spine_position` of `None`.
    """

    def __init__(self, oeb, id, href, media_type,
                 fallback=None, loader=str, data=None):
        if href:
            href = str(href)
        normalized = urlnormalize(href)
        self.oeb = oeb
        self.id = id
        self.href = normalized
        self.path = normalized
        self.media_type = media_type
        self.fallback = fallback
        self.override_css_fetch = None
        self.resolve_css_imports = True
        self.spine_position = None
        self.linear = True
        # With neither a loader nor data, read lazily from the container.
        if loader is None and data is None:
            loader = oeb.container.read
        self._loader = loader
        self._data = data

    def __repr__(self):
        return 'Item(id=%r, href=%r, media_type=%r)' \
            % (self.id, self.href, self.media_type)

    # Parsing {{{
    def _parse_xml(self, data):
        """Parse *data* as XML; empty data yields ``None``."""
        if not data:
            return
        text = xml_to_unicode(data, strip_encoding_pats=True,
                              assume_utf8=True, resolve_entities=True)[0]
        return safe_xml_fromstring(text)

    def _parse_xhtml(self, data):
        """Parse *data* as HTML, falling back to XML on ``NotHTML``."""
        fname = urlunquote(self.href)
        self.oeb.log.debug('Parsing', fname, '...')
        self.oeb.html_preprocessor.current_href = self.href
        try:
            return parse_html(data, log=self.oeb.log,
                              decoder=self.oeb.decode,
                              preprocessor=self.oeb.html_preprocessor,
                              filename=fname, non_html_file_tags={'ncx'})
        except NotHTML:
            return self._parse_xml(data)

    def _parse_txt(self, data):
        """Parse plain text: as HTML if it contains ``<html>``, otherwise
        via markdown conversion."""
        marker = b'<html>' if isinstance(data, bytes) else '<html>'
        if marker in data:
            return self._parse_xhtml(data)
        self.oeb.log.debug('Converting', self.href, '...')
        from calibre.ebooks.txt.processor import convert_markdown
        titles = self.oeb.metadata.title
        title = str(titles[0]) if titles else _('Unknown')
        return self._parse_xhtml(convert_markdown(data, title=title))

    def _parse_css(self, data):
        """Parse *data* into a css_parser stylesheet with @page rules
        removed; imports are resolved unless disabled on this item."""
        from css_parser import CSSParser, log, resolveImports
        from css_parser.css import CSSRule
        log.setLevel(logging.WARN)
        log.raiseExceptions = False
        self.oeb.log.debug('Parsing', self.href, '...')
        text = self.oeb.css_preprocessor(self.oeb.decode(data),
                                         add_namespace=False)
        fetcher = self.override_css_fetch or self._fetch_css
        parser = CSSParser(loglevel=logging.WARNING, fetcher=fetcher,
                           log=_css_logger)
        sheet = parser.parseString(text, href=self.href, validate=False)
        if self.resolve_css_imports:
            sheet = resolveImports(sheet)
        for page_rule in tuple(sheet.cssRules.rulesOfType(CSSRule.PAGE_RULE)):
            sheet.cssRules.remove(page_rule)
        return sheet

    def _fetch_css(self, path):
        """css_parser fetcher: return ``(encoding, text)`` for an imported
        stylesheet, or ``(None, None)`` when it cannot be served."""
        known = self.oeb.manifest.hrefs
        if path not in known:
            self.oeb.logger.warn('CSS import of missing file %r' % path)
            return (None, None)
        target = known[path]
        if target.media_type not in OEB_STYLES:
            self.oeb.logger.warn('CSS import of non-CSS file %r' % path)
            return (None, None)
        css = target.data.cssText
        if isinstance(css, str):
            return (None, css)
        return ('utf-8', css)
    # }}}

    @property
    def data(self):
        """MIME type sensitive access to this manifest entry's content.

        - XHTML, HTML, and variant content is parsed as necessary to
          convert and return as an lxml.etree element in the XHTML
          namespace.
        - XML content is parsed and returned as an lxml.etree element.
        - CSS and CSS-variant content is parsed and returned as a css_parser
          CSS DOM stylesheet.
        - All other content is returned as a :class:`str` or :class:`bytes`
          object with no special parsing.
        """
        payload = self._data
        if payload is None:
            if self._loader is None:
                return None
            payload = self._loader(
                getattr(self, 'html_input_href', self.href))
        try:
            mime = self.media_type.lower()
        except Exception:
            mime = 'application/octet-stream'
        # Anything that is not str/bytes has already been parsed.
        if isinstance(payload, string_or_bytes):
            if mime in OEB_DOCS:
                payload = self._parse_xhtml(payload)
            elif mime[-4:] in ('+xml', '/xml'):
                payload = self._parse_xml(payload)
            elif mime in OEB_STYLES:
                payload = self._parse_css(payload)
            elif mime == 'text/plain':
                self.oeb.log.warn('%s contains data in TXT format'%self.href,
                                  'converting to HTML')
                payload = self._parse_txt(payload)
                self.media_type = XHTML_MIME
        self._data = payload
        return payload

    @data.setter
    def data(self, value):
        self._data = value

    @data.deleter
    def data(self):
        self._data = None

    def reparse_css(self):
        """Re-parse this item's own text representation as CSS."""
        self._data = self._parse_css(str(self))

    def unload_data_from_memory(self, memory=None):
        """Drop in-memory ``bytes`` data, to be re-read from disk on demand.

        Without *memory* the bytes are spooled to a temporary file that the
        new loader deletes after reading it. With *memory* the new loader
        reads that file path and the bytes are not written anywhere. Items
        whose data is not ``bytes`` are left alone.
        """
        if not isinstance(self._data, bytes):
            return
        if memory is None:
            from calibre.ptempfile import PersistentTemporaryFile
            spool = PersistentTemporaryFile(suffix='_oeb_base_mem_unloader.img')
            with spool:
                spool.write(self._data)
            self.oeb._temp_files.append(spool.name)

            def loader(*args):
                with open(spool.name, 'rb') as f:
                    ans = f.read()
                os.remove(spool.name)
                return ans
            self._loader = loader
        else:
            def loader2(*args):
                with open(memory, 'rb') as f:
                    return f.read()
            self._loader = loader2
        self._data = None

    @property
    def unicode_representation(self):
        content = self.data
        if isinstance(content, etree._Element):
            return xml2text(content, pretty_print=self.oeb.pretty_print)
        if isinstance(content, str):
            return content
        if hasattr(content, 'cssText'):
            return css_text(content)
        return str(content)

    @property
    def bytes_representation(self):
        return serialize(self.data, self.media_type, pretty_print=self.oeb.pretty_print)

    def __str__(self):
        return self.unicode_representation

    # Items compare and hash by identity.
    def __eq__(self, other):
        return self is other

    def __ne__(self, other):
        return self is not other

    def __hash__(self):
        return id(self)

    @property
    def sort_key(self):
        href = self.href
        if isinstance(href, bytes):
            href = force_unicode(href)
        position = self.spine_position
        if not isinstance(position, numbers.Number):
            # Items outside the spine sort after everything in it.
            position = sys.maxsize
        media = (self.media_type or '').lower()
        return position, media, numeric_sort_key(href), self.id

    def relhref(self, href):
        """Convert the URL provided in :param:`href` from a book-absolute
        reference to a reference relative to this manifest item.
        """
        return rel_href(self.href, href)

    def abshref(self, href):
        """Convert the URL provided in :param:`href` from a reference
        relative to this manifest item to a book-absolute reference.

        URLs that cannot be parsed, or that carry a scheme other than
        ``file``, are returned unchanged.
        """
        try:
            parsed = urlparse(href)
        except ValueError:
            return href
        if parsed.scheme and parsed.scheme != 'file':
            return href
        # Drop the scheme before working with the path.
        pieces = list(parsed)
        pieces[0] = ''
        href = urlunparse(pieces)
        path, frag = urldefrag(href)
        if not path:
            # A bare fragment (or nothing at all) refers to this item.
            return '#'.join((self.href, frag)) if frag else self.href
        if '/' not in self.href:
            return href
        joined = os.path.join(os.path.dirname(self.href), href)
        return os.path.normpath(joined).replace('\\', '/')
def __init__(self, oeb):
self.oeb = oeb
self.items = set()
self.ids = {}
self.hrefs = {}
def add(self, id, href, media_type, fallback=None, loader=None, data=None):
    """Create a manifest item, register it and return it.

    :param:`id`, :param:`href` and :param:`media_type` are all required.
    A :param:`fallback` item-id is required for any item whose MIME type
    is not one of the OPS core media types. The content may be supplied
    directly through :param:`data`, lazily through a :param:`loader`
    function, or set later via the item's :attr:`data` attribute.
    """
    new_item = self.Item(
        self.oeb, id, href, media_type, fallback, loader, data)
    self.items.add(new_item)
    # Index by the attributes of the constructed item, not the raw arguments
    self.ids[new_item.id] = new_item
    self.hrefs[new_item.href] = new_item
    return new_item
def remove(self, item):
    """Removes :param:`item` from the manifest.

    :param:`item` may be an item object or the id of a registered item.
    The item is dropped from the id index, the href index (if present
    there), the item set and, when it is part of the spine, the spine.
    """
    target = self.ids.get(item, item)
    self.ids.pop(target.id)
    self.hrefs.pop(target.href, None)
    self.items.remove(target)
    spine = self.oeb.spine
    if target in spine:
        spine.remove(target)
def remove_duplicate_item(self, item):
    """Drop :param:`item` (an item object or a registered id) from the id
    index and the item set only; the href index and the spine are left
    untouched.
    """
    target = self.ids.get(item, item)
    self.ids.pop(target.id)
    self.items.remove(target)
def generate(self, id=None, href=None):
    """Generate a new unique identifier and/or internal path for use in
    creating a new manifest item, using the provided :param:`id` and/or
    :param:`href` as bases.

    Uniqueness is obtained by appending 1, 2, 3, ... to the base id, or
    by inserting the counter before the extension of the (normalized)
    href. Hrefs are compared case-insensitively against the manifest.

    Returns a two-tuple of the new id and path. If either :param:`id` or
    :param:`href` is `None` then the corresponding item in the returned
    tuple is also `None`.
    """
    if id is not None:
        base = id
        index = 1
        while id in self.ids:
            id = base + str(index)
            index += 1
    if href is not None:
        href = urlnormalize(href)
        base, ext = os.path.splitext(href)
        index = 1
        lhrefs = {x.lower() for x in self.hrefs}
        while href.lower() in lhrefs:
            href = base + str(index) + ext
            index += 1
        # Only stringify a real path: str(None) would yield the bogus
        # path 'None' instead of the documented None
        href = str(href)
    return id, href
def __iter__(self):
yield from self.items
def __len__(self):
return len(self.items)
def values(self):
    """Return the manifest items as a new list."""
    snapshot = [*self.items]
    return snapshot
def __contains__(self, item):
return item in self.items
def to_opf1(self, parent=None):
    """Build and return the OPF 1.x ``manifest`` element under
    :param:`parent`, with one ``item`` child per manifest item.

    Media types found in ``OEB_DOCS`` / ``OEB_STYLES`` are written as
    ``OEB_DOC_MIME`` / ``OEB_CSS_MIME`` respectively.
    """
    manifest_elem = element(parent, 'manifest')
    for entry in self.items:
        mime = entry.media_type
        if mime in OEB_DOCS:
            mime = OEB_DOC_MIME
        elif mime in OEB_STYLES:
            mime = OEB_CSS_MIME
        attrs = {
            'id': entry.id,
            'href': urlunquote(entry.href),
            'media-type': mime,
        }
        if entry.fallback:
            attrs['fallback'] = entry.fallback
        element(manifest_elem, 'item', attrib=attrs)
    return manifest_elem
def to_opf2(self, parent=None):
    """Build and return the OPF 2.0 ``manifest`` element under
    :param:`parent`, with one ``item`` child per manifest item, emitted
    in ``sort_key`` order.

    Media types found in ``OEB_DOCS`` / ``OEB_STYLES`` are written as
    ``XHTML_MIME`` / ``CSS_MIME`` respectively.
    """
    manifest_elem = element(parent, OPF('manifest'))
    ordered = sorted(self.items, key=lambda entry: entry.sort_key)
    for entry in ordered:
        mime = entry.media_type
        if mime in OEB_DOCS:
            mime = XHTML_MIME
        elif mime in OEB_STYLES:
            mime = CSS_MIME
        attrs = {
            'id': entry.id,
            'href': urlunquote(entry.href),
            'media-type': mime,
        }
        if entry.fallback:
            attrs['fallback'] = entry.fallback
        element(manifest_elem, OPF('item'), attrib=attrs)
    return manifest_elem
@property
def main_stylesheet(self):
    """The stylesheet item explicitly assigned through the setter or,
    when none was assigned, the first item encountered whose lower-cased
    media type is in ``OEB_STYLES``. ``None`` if there is no such item.
    """
    chosen = getattr(self, '_main_stylesheet', None)
    if chosen is not None:
        return chosen
    for candidate in self:
        if candidate.media_type.lower() in OEB_STYLES:
            return candidate
    return None

@main_stylesheet.setter
def main_stylesheet(self, item):
    """Record :param:`item` as the main stylesheet."""
    self._main_stylesheet = item
class Spine:
    """Collection of manifest items composing an OEB data model book's main
    textual content.

    The spine manages which manifest items compose the book's main textual
    content and the sequence in which they appear. Provides Python container
    access as a list-like object. Every item in the spine carries its index
    in :attr:`spine_position` and its linearity in :attr:`linear`.
    """

    def __init__(self, oeb):
        self.oeb = oeb
        self.items = []
        self.page_progression_direction = None

    def _linear(self, linear):
        """Normalise a ``linear`` value.

        ``None``, ``'yes'`` and ``'true'`` become ``True``; ``'no'`` and
        ``'false'`` become ``False`` (case-insensitively, str or bytes).
        Any other value is returned as is (lower-cased if it was text).
        """
        if isinstance(linear, bytes):
            # bytes never compare equal to the str literals below
            linear = linear.decode('utf-8', 'replace')
        if isinstance(linear, str):
            linear = linear.lower()
        if linear is None or linear in ('yes', 'true'):
            linear = True
        elif linear in ('no', 'false'):
            linear = False
        return linear

    def add(self, item, linear=None):
        """Append :param:`item` to the end of the `Spine`."""
        item.linear = self._linear(linear)
        item.spine_position = len(self.items)
        self.items.append(item)
        return item

    def insert(self, index, item, linear):
        """Insert :param:`item` at position :param:`index` in the `Spine`.

        :param:`index` follows ``list.insert`` semantics, so negative and
        past-the-end values are accepted; the positions of all affected
        items are refreshed afterwards.
        """
        item.linear = self._linear(linear)
        self.items.insert(index, item)
        # list.insert clamps out-of-range indices, so the raw index is only
        # a trustworthy starting point when it is a valid position
        start = index if 0 <= index < len(self.items) else 0
        for i in range(start, len(self.items)):
            self.items[i].spine_position = i
        return item

    def remove(self, item):
        """Remove :param:`item` from the `Spine`."""
        index = item.spine_position
        self.items.pop(index)
        # Everything after the removed slot moves up by one
        for i in range(index, len(self.items)):
            self.items[i].spine_position = i
        item.spine_position = None

    def index(self, item):
        """Return the position of :param:`item`, or -1 if it is absent."""
        for i, x in enumerate(self):
            if item == x:
                return i
        return -1

    def __iter__(self):
        yield from self.items

    def __getitem__(self, index):
        return self.items[index]

    def __len__(self):
        return len(self.items)

    def __contains__(self, item):
        return (item in self.items)

    def to_opf1(self, parent=None):
        """Build the OPF 1.x ``spine`` element; only items whose
        ``linear`` value is truthy are written."""
        elem = element(parent, 'spine')
        for item in self.items:
            if item.linear:
                element(elem, 'itemref', attrib={'idref': item.id})
        return elem

    def to_opf2(self, parent=None):
        """Build the OPF 2.0 ``spine`` element; items whose ``linear``
        value is falsy are marked ``linear="no"``."""
        elem = element(parent, OPF('spine'))
        for item in self.items:
            attrib = {'idref': item.id}
            if not item.linear:
                attrib['linear'] = 'no'
            element(elem, OPF('itemref'), attrib=attrib)
        return elem
class Guide:
    """Collection of references to standard, frequently occurring sections
    within an OEB data model book.

    Offers dictionary-like access: keys are the reference type identifiers
    passed to :meth:`add`, values are `Reference` objects.
    """

    class Reference:
        """Reference to a standard book section.

        Instance data members:

        :attr:`type`: Reference type identifier, as chosen from the list
            allowed in the OPF 2.0 specification.
        :attr:`title`: Human-readable section title.
        :attr:`href`: Book-internal URL of the referenced section. May
            include a fragment identifier.
        """
        _TYPES_TITLES = [
            ('cover', __('Cover')),
            ('title-page', __('Title page')),
            ('toc', __('Table of Contents')),
            ('index', __('Index')),
            ('glossary', __('Glossary')),
            ('acknowledgements', __('Acknowledgements')),
            ('bibliography', __('Bibliography')),
            ('colophon', __('Colophon')),
            ('copyright-page', __('Copyright')),
            ('dedication', __('Dedication')),
            ('epigraph', __('Epigraph')),
            ('foreword', __('Foreword')),
            ('loi', __('List of illustrations')),
            ('lot', __('List of tables')),
            ('notes', __('Notes')),
            ('preface', __('Preface')),
            ('text', __('Main text')),
        ]
        TITLES = dict(_TYPES_TITLES)
        TYPES = frozenset(TITLES)
        # Position of each known type in the table above
        ORDER = {entry[0]: pos for pos, entry in enumerate(_TYPES_TITLES)}

        def __init__(self, oeb, type, title, href):
            self.oeb = oeb
            lowered = type.lower()
            if lowered in self.TYPES:
                # Known types are stored in lower case
                type = lowered
            elif type not in self.TYPES and not type.startswith('other.'):
                # Unknown types are namespaced under 'other.'
                type = 'other.' + type
            if not title and type in self.TITLES:
                title = oeb.translate(self.TITLES[type])
            self.type = type
            self.title = title
            self.href = urlnormalize(href)

        def __repr__(self):
            fields = (self.type, self.title, self.href)
            return 'Reference(type=%r, title=%r, href=%r)' % fields

        @property
        def item(self):
            """The manifest item associated with this reference."""
            path, _fragment = urldefrag(self.href)
            return self.oeb.manifest.hrefs.get(path, None)

    def __init__(self, oeb):
        self.oeb = oeb
        self.refs = {}

    def add(self, type, title, href):
        """Add a new reference to the `Guide`.

        The reference is stored under :param:`type` exactly as given.
        """
        if href:
            href = str(href)
        reference = self.Reference(self.oeb, type, title, href)
        self.refs[type] = reference
        return reference

    def remove(self, type):
        """Remove and return the reference stored under :param:`type`
        (``None`` if there is none)."""
        return self.refs.pop(type, None)

    def remove_by_href(self, href):
        """Remove every reference whose href equals :param:`href`."""
        doomed = [key for key, ref in self.refs.items() if ref.href == href]
        for key in doomed:
            self.remove(key)

    def iterkeys(self):
        for key in self.refs:
            yield key

    __iter__ = iterkeys

    def values(self):
        """References sorted by the position of their type in
        ``Reference.ORDER``; unknown types sort last."""
        def rank(ref):
            return ref.ORDER.get(ref.type, 10000)
        return sorted(self.refs.values(), key=rank)

    def items(self):
        for pair in self.refs.items():
            yield pair

    def __getitem__(self, key):
        return self.refs[key]

    def get(self, key):
        return self.refs.get(key)

    def __delitem__(self, key):
        del self.refs[key]

    def __contains__(self, key):
        return key in self.refs

    def __len__(self):
        return len(self.refs)

    def to_opf1(self, parent=None):
        """Build and return the OPF 1.x ``guide`` element."""
        guide_elem = element(parent, 'guide')
        for ref in self.refs.values():
            attrs = {'type': ref.type, 'href': urlunquote(ref.href)}
            if ref.title:
                attrs['title'] = ref.title
            element(guide_elem, 'reference', attrib=attrs)
        return guide_elem

    def to_opf2(self, parent=None):
        """Build and return the OPF 2.0 ``guide`` element, or return
        ``None`` without creating anything when the guide is empty."""
        if len(self) == 0:
            return
        guide_elem = element(parent, OPF('guide'))
        for ref in self.refs.values():
            attrs = {'type': ref.type, 'href': urlunquote(ref.href)}
            if ref.title:
                attrs['title'] = ref.title
            element(guide_elem, OPF('reference'), attrib=attrs)
        return guide_elem
class TOC:
    """Represents a hierarchical table of contents or navigation tree for
    accessing arbitrary semantic sections within an OEB data model book.

    Each instance is a node of the navigation tree and offers list-like
    access to its sub-nodes. Node attributes:

    :attr:`title`: The title of this navigation node.
    :attr:`href`: Book-internal URL referenced by this node.
    :attr:`klass`: Optional semantic class referenced by this node.
    :attr:`id`: Optional unique identifier for this node.
    :attr:`author`: Optional author attribution for periodicals <mbp:>
    :attr:`description`: Optional description attribute for periodicals <mbp:>
    :attr:`toc_thumbnail`: Optional toc thumbnail image
    """

    def __init__(self, title=None, href=None, klass=None, id=None,
                 play_order=None, author=None, description=None,
                 toc_thumbnail=None):
        self.title = title
        self.href = urlnormalize(href) if href else href
        self.klass = klass
        self.id = id
        self.nodes = []
        # next_play_order() reads self.play_order, so seed it first
        self.play_order = 0
        self.play_order = self.next_play_order() if play_order is None else play_order
        self.author = author
        self.description = description
        self.toc_thumbnail = toc_thumbnail

    def add(self, title, href, klass=None, id=None, play_order=0, author=None, description=None, toc_thumbnail=None):
        """Create and return a new sub-node of this node."""
        child = TOC(title, href, klass, id, play_order, author, description, toc_thumbnail)
        self.nodes.append(child)
        return child

    def remove(self, node):
        """Detach :param:`node` (matched by identity) from wherever it sits
        below this node. Returns True if it was found, False otherwise."""
        for position, child in enumerate(self.nodes):
            if child is node:
                del self.nodes[position]
                return True
            if child.remove(node):
                return True
        return False

    def iter(self):
        """Iterate over this node and all descendants in depth-first order."""
        yield self
        for child in self.nodes:
            for descendant in child.iter():
                yield descendant

    def count(self):
        """Number of descendants of this node (the node itself excluded)."""
        return sum(1 for _ in self.iter()) - 1

    def next_play_order(self):
        """One more than the largest play order in this subtree."""
        highest = max((x.play_order for x in self.iter()), default=0)
        return highest + 1

    def has_href(self, href):
        """Whether this node or any descendant has exactly this href."""
        return any(x.href == href for x in self.iter())

    def has_text(self, text):
        """Whether this node or any descendant has a title equal to
        :param:`text`, ignoring case."""
        return any(
            x.title and x.title.lower() == text.lower() for x in self.iter())

    def iterdescendants(self, breadth_first=False):
        """Iterate over all descendant nodes, depth-first by default.

        With :param:`breadth_first` the immediate children are yielded
        first, followed by the descendants of each child in turn.
        """
        if not breadth_first:
            for child in self.nodes:
                yield from child.iter()
            return
        for child in self.nodes:
            yield child
        for child in self.nodes:
            yield from child.iterdescendants(breadth_first=True)

    def __iter__(self):
        """Iterate over all immediate child nodes."""
        for child in self.nodes:
            yield child

    def __getitem__(self, index):
        return self.nodes[index]

    def autolayer(self):
        """Make sequences of children pointing to the same content file into
        children of the first node referencing that file.
        """
        prev = None
        # Walk a snapshot, since children are moved while looping
        for node in tuple(self.nodes):
            if prev and urldefrag(prev.href)[0] == urldefrag(node.href)[0]:
                self.nodes.remove(node)
                prev.nodes.append(node)
            else:
                prev = node

    def depth(self):
        """The maximum depth of the navigation tree rooted at this node."""
        deepest_child = max((node.depth() for node in self.nodes), default=0)
        return deepest_child + 1

    def get_lines(self, lvl=0):
        """Return one tab-indented text line per node of this subtree."""
        lines = [('\t'*lvl) + 'TOC: %s --> %s'%(self.title, self.href)]
        for child in self:
            lines += child.get_lines(lvl+1)
        return lines

    def __str__(self):
        return '\n'.join(self.get_lines())

    def to_opf1(self, tour):
        """Append a flat ``site`` element to :param:`tour` for every
        descendant node and return :param:`tour`."""
        for node in self.nodes:
            site_attrs = {'title': node.title, 'href': urlunquote(node.href)}
            element(tour, 'site', attrib=site_attrs)
            node.to_opf1(tour)
        return tour

    def to_ncx(self, parent=None):
        """Serialise the sub-nodes as nested NCX ``navPoint`` elements under
        :param:`parent` (a new ``navMap`` when omitted) and return it."""
        if parent is None:
            parent = etree.Element(NCX('navMap'))
        for node in self.nodes:
            point_id = node.id or uuid_id()
            order = node.play_order
            if order == 0:
                order = 1
            attrs = {'id': point_id, 'playOrder': str(order)}
            if node.klass:
                attrs['class'] = node.klass
            point = element(parent, NCX('navPoint'), attrib=attrs)
            label = etree.SubElement(point, NCX('navLabel'))
            title = node.title
            if title:
                # Collapse every run of whitespace into a single space
                title = re.sub(r'\s+', ' ', title)
            element(label, NCX('text')).text = title
            # Do not unescape this URL as ADE requires it to be escaped to
            # handle semi colons and other special characters in the file names
            element(point, NCX('content'), src=node.href)
            node.to_ncx(point)
        return parent

    def rationalize_play_orders(self):
        '''
        Ensure that all nodes with the same play_order have the same href and
        with different play_orders have different hrefs.
        '''
        def earlier_with_play_order(n):
            # First node before n (in iteration order) sharing its play order
            for x in self.iter():
                if x is n:
                    return
                if x.play_order == n.play_order:
                    return x

        def earlier_with_href(n):
            # First node before n (in iteration order) sharing its href
            for x in self.iter():
                if x is n:
                    return
                if x.href == n.href:
                    return x

        for x in self.iter():
            clash = earlier_with_play_order(x)
            if clash is not None:
                if x.href != clash.href:
                    x.play_order = getattr(earlier_with_href(x), 'play_order',
                            self.next_play_order())
            twin = earlier_with_href(x)
            if twin is not None:
                x.play_order = twin.play_order
class PageList:
    """Collection of named "pages" mapped to positions within an OEB data
    model book's textual content.

    Provides list-like access to the pages.
    """

    class Page:
        """Represents a mapping between a page name and a position within
        the book content.

        Instance data attributes:

        :attr:`name`: The name of this page. Generally a number.
        :attr:`href`: Book-internal URL at which point this page begins.
        :attr:`type`: Must be one of 'front' (for prefatory pages, as commonly
            labeled in print with small-case Roman numerals), 'normal' (for
            standard pages, as commonly labeled in print with Arabic numerals),
            or 'special' (for other pages, as commonly not labeled in any
            fashion in print, such as the cover and title pages).
        :attr:`klass`: Optional semantic class of this page.
        :attr:`id`: Optional unique identifier for this page.
        """
        TYPES = {'front', 'normal', 'special'}

        def __init__(self, name, href, type='normal', klass=None, id=None):
            self.name = str(name)
            self.href = urlnormalize(href)
            # Unrecognised page types fall back to 'normal'
            if type not in self.TYPES:
                type = 'normal'
            self.type = type
            self.id = id
            self.klass = klass

    def __init__(self):
        self.pages = []

    def add(self, name, href, type='normal', klass=None, id=None):
        """Create a new page and add it to the `PageList`."""
        new_page = self.Page(name, href, type, klass, id)
        self.pages.append(new_page)
        return new_page

    def __len__(self):
        return len(self.pages)

    def __iter__(self):
        for page in self.pages:
            yield page

    def __getitem__(self, index):
        return self.pages[index]

    def pop(self, index=-1):
        """Remove and return the page at :param:`index` (the last one by
        default)."""
        return self.pages.pop(index)

    def remove(self, page):
        """Remove :param:`page` from the list."""
        return self.pages.remove(page)

    def to_ncx(self, parent=None):
        """Build and return the NCX ``pageList`` element under
        :param:`parent`; pages of each type are numbered independently,
        starting at 1."""
        plist = element(parent, NCX('pageList'), id=uuid_id())
        counters = {kind: count(1) for kind in ('front', 'normal', 'special')}
        for page in self.pages:
            target_id = page.id or uuid_id()
            kind = page.type
            attrs = {
                'id': target_id,
                'value': str(next(counters[kind])),
                'type': kind,
                'playOrder': '0',
            }
            if page.klass:
                attrs['class'] = page.klass
            ptarget = element(plist, NCX('pageTarget'), attrib=attrs)
            label = element(ptarget, NCX('navLabel'))
            element(label, NCX('text')).text = page.name
            element(ptarget, NCX('content'), src=page.href)
        return plist

    def to_page_map(self):
        """Build and return a ``page-map`` element with one ``page`` child
        (name and href) per page."""
        pmap = etree.Element(OPF('page-map'), nsmap={None: OPF2_NS})
        for page in self.pages:
            element(pmap, OPF('page'), name=page.name, href=page.href)
        return pmap
class OEBBook:
    """Representation of a book in the IDPF OEB data model."""

    COVER_SVG_XP = XPath('h:body//svg:svg[position() = 1]')
    COVER_OBJECT_XP = XPath('h:body//h:object[@data][position() = 1]')

    def __init__(self, logger,
                 html_preprocessor,
                 css_preprocessor=CSSPreProcessor(),
                 encoding='utf-8', pretty_print=False,
                 input_encoding='utf-8'):
        """Create empty book.  Arguments:

        :param:`encoding`: Default encoding for textual content read
            from an external container.
        :param:`pretty_print`: Whether or not the canonical string form
            of XML markup is pretty-printed.
        :param html_preprocessor: A callable that takes a unicode object
            and returns a unicode object. Will be called on all html files
            before they are parsed.
        :param css_preprocessor: A callable that takes a unicode object
            and returns a unicode object. Will be called on all CSS files
            before they are parsed.
        :param:`logger`: A Log object to use for logging all messages
            related to the processing of this book. It is accessible
            via the instance data members :attr:`logger,log`.

        It provides the following public instance data members for
        accessing various parts of the OEB data model:

        :attr:`metadata`: Metadata such as title, author name(s), etc.
        :attr:`manifest`: Manifest of all files included in the book,
            including MIME types and fallback information.
        :attr:`spine`: In-order list of manifest items which compose
            the textual content of the book.
        :attr:`guide`: Collection of references to standard positions
            within the text, such as the cover, preface, etc.
        :attr:`toc`: Hierarchical table of contents.
        :attr:`pages`: List of "pages," such as indexed to a print edition of
            the same text.
        """
        _css_log_handler.log = logger
        self.encoding = encoding
        self.input_encoding = input_encoding
        self.html_preprocessor = html_preprocessor
        self.css_preprocessor = css_preprocessor
        self.pretty_print = pretty_print
        self.logger = self.log = logger
        self.version = '2.0'
        self.container = NullContainer(self.log)
        self.metadata = Metadata(self)
        self.uid = None
        self.manifest = Manifest(self)
        self.spine = Spine(self)
        self.guide = Guide(self)
        self.toc = TOC()
        self.pages = PageList()
        self.auto_generated_toc = True
        self._temp_files = []

    def clean_temp_files(self):
        """Delete every registered temporary file, ignoring failures."""
        for path in self._temp_files:
            try:
                os.remove(path)
            except Exception:
                # Best-effort cleanup: the file may already be gone. Unlike a
                # bare ``except`` this lets KeyboardInterrupt/SystemExit through.
                pass

    @classmethod
    def generate(cls, opts):
        """Generate an OEBBook instance from command-line options."""
        encoding = opts.encoding
        pretty_print = opts.pretty_print
        # NOTE(review): __init__ requires ``logger`` and ``html_preprocessor``,
        # neither of which is supplied here, so this call raises TypeError as
        # written. Left unchanged because the intended values are not known.
        return cls(encoding=encoding, pretty_print=pretty_print)

    def translate(self, text):
        """Translate :param:`text` into the book's primary language."""
        lang = str(self.metadata.language[0])
        lang = lang.split('-', 1)[0].lower()
        return translate(lang, text)

    def decode(self, data):
        """Automatically decode :param:`data` into a `unicode` object.

        Strings are returned with line endings normalised to ``\\n``.
        For bytes, a leading UTF-32, UTF-16 or UTF-8 byte order mark is
        stripped and its encoding tried first; then ``input_encoding``
        (with undecodable bytes replaced), then UTF-8, and finally
        detection via ``xml_to_unicode``.
        """
        def fix_data(d):
            return d.replace('\r\n', '\n').replace('\r', '\n')

        if isinstance(data, str):
            return fix_data(data)
        # Order matters: the UTF-32-LE mark begins with the UTF-16-LE mark,
        # so the four-byte marks have to be tested first.
        known_boms = (
            (b'\0\0\xfe\xff', 'utf-32-be'),
            (b'\xff\xfe\0\0', 'utf-32-le'),
            (b'\xff\xfe', 'utf-16-le'),
            (b'\xfe\xff', 'utf-16-be'),
            (b'\xef\xbb\xbf', 'utf-8'),
        )
        bom_enc = None
        for bom, enc in known_boms:
            if data.startswith(bom):
                bom_enc = enc
                data = data[len(bom):]
                break
        if bom_enc is not None:
            try:
                return fix_data(data.decode(bom_enc))
            except UnicodeDecodeError:
                pass
        if self.input_encoding:
            try:
                return fix_data(data.decode(self.input_encoding, 'replace'))
            except UnicodeDecodeError:
                pass
        try:
            return fix_data(data.decode('utf-8'))
        except UnicodeDecodeError:
            pass
        data, _ = xml_to_unicode(data)
        return fix_data(data)

    def to_opf1(self):
        """Produce OPF 1.2 representing the book's metadata and structure.

        Returns a dictionary in which the keys are MIME types and the values
        are tuples of (default) filenames and lxml.etree element structures.
        """
        package = etree.Element('package',
            attrib={'unique-identifier': self.uid.id})
        self.metadata.to_opf1(package)
        self.manifest.to_opf1(package)
        self.spine.to_opf1(package)
        tours = element(package, 'tours')
        tour = element(tours, 'tour',
            attrib={'id': 'chaptertour', 'title': 'Chapter Tour'})
        self.toc.to_opf1(tour)
        self.guide.to_opf1(package)
        return {OPF_MIME: ('content.opf', package)}

    def _update_playorder(self, ncx):
        """Rewrite the ``playOrder`` attributes of :param:`ncx` so that they
        follow the order in which the referenced targets occur in the spine.

        Entries whose target is not found in the spine get their index in
        the NCX document order instead.
        """
        hrefs = set(map(urlnormalize, xpath(ncx, '//ncx:content/@src')))
        playorder = {}
        counter = 1
        selector = XPath('h:body//*[@id or @name]')
        for item in self.spine:
            base = item.href
            if base in hrefs:
                playorder[base] = counter
                counter += 1
            for elem in selector(item.data):
                added = False
                for attr in ('id', 'name'):
                    anchor = elem.get(attr)
                    if not anchor:
                        continue
                    href = '#'.join([base, anchor])
                    if href in hrefs:
                        playorder[href] = counter
                        added = True
                # An element referenced via both id and name counts once
                if added:
                    counter += 1
        selector = XPath('ncx:content/@src')
        for i, elem in enumerate(xpath(ncx, '//*[@playOrder and ./ncx:content[@src]]')):
            href = urlnormalize(selector(elem)[0])
            order = playorder.get(href, i)
            elem.attrib['playOrder'] = str(order)
        return

    def _to_ncx(self):
        """Build and return the NCX document (head metadata, title,
        navigation map and, if there are pages, the page list)."""
        lang = str(self.metadata.language[0])
        lang = lang.replace('_', '-')
        ncx = etree.Element(NCX('ncx'),
            attrib={'version': '2005-1', XML('lang'): lang},
            nsmap={None: NCX_NS})
        head = etree.SubElement(ncx, NCX('head'))
        etree.SubElement(head, NCX('meta'),
            name='dtb:uid', content=str(self.uid))
        etree.SubElement(head, NCX('meta'),
            name='dtb:depth', content=str(self.toc.depth()))
        generator = ''.join(['calibre (', __version__, ')'])
        etree.SubElement(head, NCX('meta'),
            name='dtb:generator', content=generator)
        etree.SubElement(head, NCX('meta'),
            name='dtb:totalPageCount', content=str(len(self.pages)))
        maxpnum = etree.SubElement(head, NCX('meta'),
            name='dtb:maxPageNumber', content='0')
        title = etree.SubElement(ncx, NCX('docTitle'))
        text = etree.SubElement(title, NCX('text'))
        text.text = str(self.metadata.title[0])
        navmap = etree.SubElement(ncx, NCX('navMap'))
        self.toc.to_ncx(navmap)
        if len(self.pages) > 0:
            plist = self.pages.to_ncx(ncx)
            value = max(int(x) for x in xpath(plist, '//@value'))
            maxpnum.attrib['content'] = str(value)
        self._update_playorder(ncx)
        return ncx

    def to_opf2(self, page_map=False):
        """Produce OPF 2.0 representing the book's metadata and structure.

        Returns a dictionary in which the keys are MIME types and the values
        are tuples of (default) filenames and lxml.etree element structures.
        An NCX entry is always included; a page-map entry is added when
        :param:`page_map` is true and the book has pages.
        """
        results = {}
        package = etree.Element(OPF('package'),
            attrib={'version': '2.0', 'unique-identifier': self.uid.id},
            nsmap={None: OPF2_NS})
        self.metadata.to_opf2(package)
        manifest = self.manifest.to_opf2(package)
        spine = self.spine.to_opf2(package)
        self.guide.to_opf2(package)
        results[OPF_MIME] = ('content.opf', package)
        item_id, href = self.manifest.generate('ncx', 'toc.ncx')
        etree.SubElement(manifest, OPF('item'), id=item_id, href=href,
                         attrib={'media-type': NCX_MIME})
        spine.attrib['toc'] = item_id
        results[NCX_MIME] = (href, self._to_ncx())
        if page_map and len(self.pages) > 0:
            item_id, href = self.manifest.generate('page-map', 'page-map.xml')
            etree.SubElement(manifest, OPF('item'), id=item_id, href=href,
                             attrib={'media-type': PAGE_MAP_MIME})
            spine.attrib['page-map'] = item_id
            results[PAGE_MAP_MIME] = (href, self.pages.to_page_map())
        if self.spine.page_progression_direction in {'ltr', 'rtl'}:
            spine.attrib['page-progression-direction'] = self.spine.page_progression_direction
        return results
def rel_href(base_href, href):
    """Convert the URL provided in :param:`href` to a URL relative to the URL
    in :param:`base_href`.

    :param:`href` is returned unchanged when it carries a URL scheme or
    when :param:`base_href` has no directory part.
    """
    if urlparse(href).scheme or '/' not in base_href:
        return href
    folder = os.path.dirname(os.path.normpath(base_href)).replace(os.sep, '/')
    base = [part for part in folder.split('/') if part and part != '.']
    # Cancel every '..' against the directory in front of it; a leading
    # '..' has nothing to cancel and ends the clean-up
    while '..' in base:
        idx = base.index('..')
        if idx == 0:
            break
        del base[idx - 1:idx + 1]
    if not base:
        return href
    target, frag = urldefrag(href)
    target = target.split('/')
    # Length of the leading run of path components shared by both paths
    limit = min(len(base), len(target))
    shared = 0
    while shared < limit and base[shared] == target[shared]:
        shared += 1
    pieces = ['..'] * (len(base) - shared) + target[shared:]
    relhref = '/'.join(pieces)
    return '#'.join((relhref, frag)) if frag else relhref