# %PDF- %PDF-
# | Direktori : /proc/self/root/usr/lib/calibre/calibre/ebooks/htmlz/ |
# | Current File : //proc/self/root/usr/lib/calibre/calibre/ebooks/htmlz/oeb2html.py |
__license__ = 'GPL 3'
__copyright__ = '2011, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
'''
Transform OEB content into a single (more or less) HTML file.
'''
import os
import re
from functools import partial
from lxml import html
from calibre import prepare_string_for_xml
from calibre.ebooks.oeb.base import (
XHTML, XHTML_NS, SVG_NS, barename, namespace, OEB_IMAGES, XLINK, rewrite_links, urlnormalize)
from calibre.ebooks.oeb.stylizer import Stylizer
from calibre.utils.logging import default_log
from polyglot.builtins import string_or_bytes, as_unicode
from polyglot.urllib import urldefrag
# Tag names that the dump_text implementations in this file write in the
# self-closing form (' />') and for which they never emit a closing tag.
SELF_CLOSING_TAGS = {'area', 'base', 'basefont', 'br', 'hr', 'input', 'img', 'link', 'meta'}
class OEB2HTML:
    '''
    Base class. All subclasses should implement dump_text to actually transform
    content. Also, callers should use oeb2html to get the transformed html.
    links and images can be retrieved after calling oeb2html to get the mapping
    of OEB links and images to the new names used in the html returned by oeb2html.
    Images will always be referenced as if they are in an images folder.

    Use get_css to get the CSS classes for the OEB document as a string.
    '''

    # Characters that prepare_string_for_html writes as named HTML entities so
    # that they stay visible and unambiguous in the generated markup.
    _HTML_ENTITIES = str.maketrans({
        '\u00ad': '&shy;',
        '\u2014': '&mdash;',
        '\u2013': '&ndash;',
        '\u00a0': '&nbsp;',
    })

    def __init__(self, log=None):
        '''
        @log: The logger progress messages are sent to. default_log is used
        when it is None.
        '''
        self.log = default_log if log is None else log
        # Maps 'href' or 'href#id' to the '#calibre_link-N' anchor replacing it.
        self.links = {}
        # Maps the href of an image in the manifest to its generated file name.
        self.images = {}

    def oeb2html(self, oeb_book, opts):
        '''
        Convert oeb_book into a single HTML string and return it.

        @oeb_book: The book to convert. Its metadata, manifest and spine are read.
        @opts: Conversion options. Stored on the instance and handed to Stylizer.

        The links and images mappings are reset and refilled on every call.
        '''
        self.log.info('Converting OEB book to HTML...')
        self.opts = opts
        try:
            self.book_title = str(oeb_book.metadata.title[0])
        except Exception:
            # Any failure to read a title falls back to a placeholder.
            # NOTE(review): _ is not defined in this file; presumably the
            # translation function installed as a builtin -- confirm.
            self.book_title = _('Unknown')
        self.links = {}
        self.images = {}
        self.base_hrefs = [item.href for item in oeb_book.spine]
        self.map_resources(oeb_book)

        return self.mlize_spine(oeb_book)

    def mlize_spine(self, oeb_book):
        '''
        Dump every spine item with dump_text and wrap the result in a minimal
        html/head/body skeleton carrying the book title.
        '''
        output = [
            '<html><head><meta http-equiv="Content-Type" content="text/html;charset=utf-8" /><title>%s</title></head><body>' % (
                prepare_string_for_xml(self.book_title))
        ]
        for item in oeb_book.spine:
            self.log.debug('Converting %s to HTML...' % item.href)
            # ids and links are rewritten in place before the tree is dumped.
            self.rewrite_ids(item.data, item)
            rewrite_links(item.data, partial(self.rewrite_link, page=item))
            stylizer = Stylizer(item.data, item.href, oeb_book, self.opts)
            output += self.dump_text(item.data.find(XHTML('body')), stylizer, item)
            output.append('\n\n')
        output.append('</body></html>')
        return ''.join(output)

    def dump_text(self, elem, stylizer, page):
        '''
        Turn elem into a list of HTML fragments. Must be provided by subclasses.
        '''
        raise NotImplementedError

    def get_link_id(self, href, id=''):
        '''
        Return the '#calibre_link-N' anchor for href (plus '#id' when id is
        given), allocating the next free number the first time a target is seen.
        '''
        if id:
            href += f'#{id}'
        if href not in self.links:
            self.links[href] = '#calibre_link-%s' % len(self.links)
        return self.links[href]

    def map_resources(self, oeb_book):
        '''
        Fill images with a generated name for every image in the manifest and
        links with an anchor for every spine item and for every link found in
        a spine item that points into the spine.
        '''
        # The set of link carrying attributes is the same for every item.
        link_attrs = set(html.defs.link_attrs)
        link_attrs.add(XLINK('href'))

        for item in oeb_book.manifest:
            if item.media_type in OEB_IMAGES and item.href not in self.images:
                # Running number plus the original extension, zero padded to
                # a total width of ten characters.
                ext = os.path.splitext(item.href)[1]
                self.images[item.href] = f'{len(self.images)}{ext}'.zfill(10)

            if item not in oeb_book.spine:
                continue

            self.get_link_id(item.href)
            root = item.data.find(XHTML('body'))
            for el in root.iter():
                try:
                    # Nodes whose tag is not a string are not elements.
                    if not isinstance(el.tag, string_or_bytes):
                        continue
                except Exception:
                    # Best effort: a node whose tag cannot be read is skipped.
                    continue
                attribs = el.attrib
                for attr in attribs:
                    if attr not in link_attrs:
                        continue
                    target, fragment = urldefrag(item.abshref(attribs[attr]))
                    if target in self.base_hrefs:
                        self.get_link_id(target, fragment)

    def rewrite_link(self, url, page=None):
        '''
        Map url, taken relative to page, to its new image path or link anchor.
        url is returned unchanged when page is missing or url is not known.
        '''
        if not page:
            return url
        abs_url = page.abshref(urlnormalize(url))
        if abs_url in self.images:
            return 'images/%s' % self.images[abs_url]
        if abs_url in self.links:
            return self.links[abs_url]
        return url

    def rewrite_ids(self, root, page):
        '''
        Replace the id of every element below root with the anchor name
        assigned to it. The body element receives the anchor name of page.
        '''
        body_tag = XHTML('body')
        for el in root.iter():
            try:
                tag = el.tag
            except UnicodeDecodeError:
                continue
            if tag == body_tag:
                # [1:] drops the leading '#' of the anchor.
                el.attrib['id'] = self.get_link_id(page.href)[1:]
                continue
            if 'id' in el.attrib:
                el.attrib['id'] = self.get_link_id(page.href, el.attrib['id'])[1:]

    def get_css(self, oeb_book):
        '''
        Return the text of all text/css items in the manifest, each followed
        by a blank line.
        '''
        return ''.join(
            as_unicode(item.data.cssText) + '\n\n'
            for item in oeb_book.manifest
            if item.media_type == 'text/css'
        )

    def prepare_string_for_html(self, raw):
        '''
        Escape raw for use as HTML text and write soft hyphens, em and en
        dashes and no-break spaces as named entities.
        '''
        # The entities are inserted after escaping so that their ampersands
        # are not escaped again.
        return prepare_string_for_xml(raw).translate(self._HTML_ENTITIES)
class OEB2HTMLNoCSSizer(OEB2HTML):
    '''
    This will remap a small number of CSS styles to equivalent HTML tags.
    '''

    def dump_text(self, elem, stylizer, page):
        '''
        Return elem, its children and its tail text as a list of HTML fragments.

        @elem: The element in the etree that we are working on.
        @stylizer: The style information attached to the element.
        @page: The spine item being dumped. Only handed on to recursive calls.
        '''
        # We can only process tags. If there isn't a tag return any text.
        if not isinstance(elem.tag, string_or_bytes) \
                or namespace(elem.tag) not in (XHTML_NS, SVG_NS):
            p = elem.getparent()
            if p is not None and isinstance(p.tag, string_or_bytes) and namespace(p.tag) in (XHTML_NS, SVG_NS) \
                    and elem.tail:
                # The tail is ordinary text of the parent and has to be
                # escaped like any other text that is written out.
                return [self.prepare_string_for_html(elem.tail)]
            return ['']

        style = stylizer.style(elem)
        tag = barename(elem.tag)
        attribs = elem.attrib

        # The body becomes a div so that several files can share one body.
        if tag == 'body':
            tag = 'div'

        # Ignore anything that is set to not be displayed. The text following
        # the element belongs to its parent, so it is still written.
        if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
                or style['visibility'] == 'hidden':
            if elem.tail:
                return [self.prepare_string_for_html(elem.tail)]
            return ['']

        # Remove attributes we won't want.
        for unwanted in ('class', 'style'):
            if unwanted in attribs:
                del attribs[unwanted]

        # Turn the rest of the attributes into a string we can write with the tag.
        attr_parts = []
        for key, value in attribs.items():
            # Drop a '{namespace}' prefix from the attribute name.
            name = key.split('}')[-1]
            attr_parts.append(f' {name}="{prepare_string_for_xml(value, attribute=True)}"')
        at = ''.join(attr_parts)

        # Write the tag.
        text = [f'<{tag}{at}', ' />' if tag in SELF_CLOSING_TAGS else '>']
        # Every tag opened for this element, in opening order.
        tags = [tag]

        # Turn styles into tags.
        wrappers = []
        if style['font-weight'] in ('bold', 'bolder'):
            wrappers.append('b')
        if style['font-style'] == 'italic':
            wrappers.append('i')
        decoration = style['text-decoration']
        if decoration == 'underline':
            wrappers.append('u')
        if decoration == 'line-through':
            wrappers.append('s')
        for wrapper in wrappers:
            text.append(f'<{wrapper}>')
            tags.append(wrapper)

        # Process tags that contain text.
        if elem.text:
            text.append(self.prepare_string_for_html(elem.text))

        # Recurse down into tags within the tag we are in.
        for item in elem:
            text += self.dump_text(item, stylizer, page)

        # Close all open tags, innermost first.
        for t in reversed(tags):
            if t not in SELF_CLOSING_TAGS:
                text.append('</%s>' % t)

        # Add the text that is outside of the tag.
        if elem.tail:
            text.append(self.prepare_string_for_html(elem.tail))

        return text
class OEB2HTMLInlineCSSizer(OEB2HTML):
    '''
    Turns external CSS classes into inline style attributes.
    '''

    def dump_text(self, elem, stylizer, page):
        '''
        Return elem, its children and its tail text as a list of HTML fragments.

        @elem: The element in the etree that we are working on.
        @stylizer: The style information attached to the element.
        @page: The spine item being dumped. Only handed on to recursive calls.
        '''
        # We can only process tags. If there isn't a tag return any text.
        if not isinstance(elem.tag, string_or_bytes) \
                or namespace(elem.tag) not in (XHTML_NS, SVG_NS):
            p = elem.getparent()
            if p is not None and isinstance(p.tag, string_or_bytes) and namespace(p.tag) in (XHTML_NS, SVG_NS) \
                    and elem.tail:
                # The tail is ordinary text of the parent and has to be
                # escaped like any other text that is written out.
                return [self.prepare_string_for_html(elem.tail)]
            return ['']

        tag = barename(elem.tag)
        attribs = elem.attrib
        # The string form of the style is what ends up in the style attribute.
        style_a = '%s' % stylizer.style(elem)

        if tag == 'body':
            # Change the body to a div so we can merge multiple files.
            tag = 'div'
            # Add page-break-before: always because renderers typically treat a new file (we're merging files)
            # as a page break and remove all other page break types that might be set.
            style_a = 'page-break-before: always; %s' % re.sub('page-break-[^:]+:[^;]+;?', '', style_a)
        # Remove unnecessary spaces.
        style_a = re.sub(r'\s{2,}', ' ', style_a).strip()

        # Remove attributes we won't want.
        for unwanted in ('class', 'style'):
            if unwanted in attribs:
                del attribs[unwanted]

        # Turn the rest of the attributes into a string we can write with the tag.
        attr_parts = []
        for key, value in attribs.items():
            # Drop a '{namespace}' prefix from the attribute name.
            name = key.split('}')[-1]
            attr_parts.append(f' {name}="{prepare_string_for_xml(value, attribute=True)}"')
        at = ''.join(attr_parts)

        # Turn style into strings for putting in the tag. Double quotes would
        # end the attribute value early, so they become single quotes.
        style_t = ''
        if style_a:
            style_t = ' style="%s"' % style_a.replace('"', "'")

        # Write the tag.
        text = [f'<{tag}{at}{style_t}', ' />' if tag in SELF_CLOSING_TAGS else '>']

        # Process tags that contain text.
        if elem.text:
            text.append(self.prepare_string_for_html(elem.text))

        # Recurse down into tags within the tag we are in.
        for item in elem:
            text += self.dump_text(item, stylizer, page)

        # Close the tag.
        if tag not in SELF_CLOSING_TAGS:
            text.append('</%s>' % tag)

        # Add the text that is outside of the tag.
        if elem.tail:
            text.append(self.prepare_string_for_html(elem.tail))

        return text
class OEB2HTMLClassCSSizer(OEB2HTML):
    '''
    Use CSS classes. css_style option can specify whether to use
    inline classes (style tag in the head) or reference an external
    CSS file called style.css.
    '''

    def mlize_spine(self, oeb_book):
        '''
        Dump every spine item with dump_text and wrap the result in an html
        skeleton whose head carries the CSS and the book title.

        The CSS is a link to style.css when opts.htmlz_class_style is
        'external', otherwise a style tag holding the result of get_css.
        '''
        body = []
        for item in oeb_book.spine:
            self.log.debug('Converting %s to HTML...' % item.href)
            # ids and links are rewritten in place before the tree is dumped.
            self.rewrite_ids(item.data, item)
            rewrite_links(item.data, partial(self.rewrite_link, page=item))
            stylizer = Stylizer(item.data, item.href, oeb_book, self.opts)
            body += self.dump_text(item.data.find(XHTML('body')), stylizer, item)
            body.append('\n\n')

        if self.opts.htmlz_class_style == 'external':
            css = '<link href="style.css" rel="stylesheet" type="text/css" />'
        else:
            css = '<style type="text/css">' + self.get_css(oeb_book) + '</style>'
        title = '<title>%s</title>' % prepare_string_for_xml(self.book_title)

        output = ['<html><head><meta http-equiv="Content-Type" content="text/html;charset=utf-8" />',
                  css, title, '</head><body>']
        output += body
        output.append('</body></html>')
        return ''.join(output)

    def dump_text(self, elem, stylizer, page):
        '''
        Return elem, its children and its tail text as a list of HTML fragments.
        class attributes are kept, style attributes are removed.

        @elem: The element in the etree that we are working on.
        @stylizer: The style information attached to the element. It is not
        consulted here, only handed on to recursive calls.
        @page: The spine item being dumped. Only handed on to recursive calls.
        '''
        # We can only process tags. If there isn't a tag return any text.
        if not isinstance(elem.tag, string_or_bytes) \
                or namespace(elem.tag) not in (XHTML_NS, SVG_NS):
            p = elem.getparent()
            if p is not None and isinstance(p.tag, string_or_bytes) and namespace(p.tag) in (XHTML_NS, SVG_NS) \
                    and elem.tail:
                # The tail is ordinary text of the parent and has to be
                # escaped like any other text that is written out.
                return [self.prepare_string_for_html(elem.tail)]
            return ['']

        tag = barename(elem.tag)
        attribs = elem.attrib

        # The body becomes a div so that several files can share one body.
        if tag == 'body':
            tag = 'div'

        # Remove attributes we won't want.
        if 'style' in attribs:
            del attribs['style']

        # Turn the rest of the attributes into a string we can write with the tag.
        attr_parts = []
        for key, value in attribs.items():
            # Drop a '{namespace}' prefix from the attribute name.
            name = key.split('}')[-1]
            attr_parts.append(f' {name}="{prepare_string_for_xml(value, attribute=True)}"')
        at = ''.join(attr_parts)

        # Write the tag.
        text = [f'<{tag}{at}', ' />' if tag in SELF_CLOSING_TAGS else '>']

        # Process tags that contain text.
        if elem.text:
            text.append(self.prepare_string_for_html(elem.text))

        # Recurse down into tags within the tag we are in.
        for item in elem:
            text += self.dump_text(item, stylizer, page)

        # Close the tag.
        if tag not in SELF_CLOSING_TAGS:
            text.append('</%s>' % tag)

        # Add the text that is outside of the tag.
        if elem.tail:
            text.append(self.prepare_string_for_html(elem.tail))

        return text
def oeb2html_no_css(oeb_book, log, opts):
    '''
    Convert oeb_book with OEB2HTMLNoCSSizer.

    Returns a tuple of the generated HTML and the image mapping collected by
    the converter during the conversion.
    '''
    converter = OEB2HTMLNoCSSizer(log)
    markup = converter.oeb2html(oeb_book, opts)
    # images is only complete once oeb2html has run.
    return markup, converter.images
def oeb2html_inline_css(oeb_book, log, opts):
    '''
    Convert oeb_book with OEB2HTMLInlineCSSizer.

    Returns a tuple of the generated HTML and the image mapping collected by
    the converter during the conversion.
    '''
    converter = OEB2HTMLInlineCSSizer(log)
    markup = converter.oeb2html(oeb_book, opts)
    # images is only complete once oeb2html has run.
    return markup, converter.images
def oeb2html_class_css(oeb_book, log, opts):
    '''
    Convert oeb_book with OEB2HTMLClassCSSizer.

    Returns a tuple of the generated HTML and the image mapping collected by
    the converter during the conversion.

    opts is modified: class_style is set to 'inline', and htmlz_class_style
    is set to 'inline' when opts does not define it yet.
    '''
    izer = OEB2HTMLClassCSSizer(log)
    # Still written so that anything relying on this attribute keeps working.
    setattr(opts, 'class_style', 'inline')
    # The converter reads htmlz_class_style, not class_style. Without a
    # default the conversion fails for options that lack the attribute. A
    # value chosen by the caller is left alone.
    if not hasattr(opts, 'htmlz_class_style'):
        setattr(opts, 'htmlz_class_style', 'inline')
    markup = izer.oeb2html(oeb_book, opts)
    # images is only complete once oeb2html has run.
    return (markup, izer.images)