%PDF- %PDF-
| Direktori : /lib/calibre/calibre/ebooks/pdf/ |
| Current File : //lib/calibre/calibre/ebooks/pdf/html_writer.py |
#!/usr/bin/env python3
# License: GPL v3 Copyright: 2019, Kovid Goyal <kovid at kovidgoyal.net>
# Imports {{{
import copy
import json
import os
import signal
import sys
from collections import namedtuple
from html5_parser import parse
from io import BytesIO
from itertools import count, repeat
from qt.core import (
QApplication, QMarginsF, QObject, QPageLayout, Qt, QTimer, QUrl, pyqtSignal, sip
)
from qt.webengine import (
QWebEnginePage, QWebEngineProfile, QWebEngineUrlRequestInterceptor
)
from calibre import detect_ncpus, human_readable, prepare_string_for_xml
from calibre.constants import __version__, iswindows, ismacos
from calibre.ebooks.metadata.xmp import metadata_to_xmp_packet
from calibre.ebooks.oeb.base import XHTML, XPath
from calibre.ebooks.oeb.polish.container import Container as ContainerBase
from calibre.ebooks.oeb.polish.toc import get_toc
from calibre.ebooks.pdf.image_writer import (
Image, PDFMetadata, draw_image_page, get_page_layout
)
from calibre.ebooks.pdf.render.serialize import PDFStream
from calibre.gui2 import setup_unix_signals
from calibre.gui2.webengine import secure_webengine
from calibre.srv.render_book import check_for_maths
from calibre.utils.fonts.sfnt.container import Sfnt, UnsupportedFont
from calibre.utils.fonts.sfnt.errors import NoGlyphs
from calibre.utils.fonts.sfnt.merge import merge_truetype_fonts_for_pdf
from calibre.utils.fonts.sfnt.subset import pdf_subset
from calibre.utils.logging import default_log
from calibre.utils.monotonic import monotonic
from calibre.utils.podofo import (
dedup_type3_fonts, get_podofo, remove_unused_fonts, set_metadata_implementation
)
from calibre.utils.short_uuid import uuid4
from polyglot.builtins import iteritems
from polyglot.urllib import urlparse
# Exit codes handed to QApplication.exit() by RenderManager: OK when all
# render jobs have finished, KILL_SIGNAL when a signal was received.
OK, KILL_SIGNAL = range(0, 2)
# Interval of the Renderer load watchdog timer; the timer is restarted on
# every load-progress notification.
HANG_TIME = 60 # seconds
# }}}
# Utils {{{
def data_as_pdf_doc(data):
    """Load *data* into a freshly created podofo ``PDFDoc`` and return it."""
    doc = get_podofo().PDFDoc()
    doc.load(data)
    return doc
def preprint_js():
    """Return the text of ``pdf-preprint.js``, cached on the function object.

    On first use the resource is read, decoded as UTF-8 and its first
    ``HYPHEN_CHAR`` placeholder is replaced by ``true`` on macOS and
    ``false`` elsewhere.
    """
    cached = getattr(preprint_js, 'ans', None)
    if cached is not None:
        return cached
    source = P('pdf-preprint.js', data=True).decode('utf-8')
    hyphen_flag = 'true' if ismacos else 'false'
    cached = source.replace('HYPHEN_CHAR', hyphen_flag, 1)
    preprint_js.ans = cached
    return cached
def last_tag(root):
    """Return the last element child of *root*.

    Raises IndexError when *root* has no element children.
    """
    children = list(root.iterchildren('*'))
    return children[-1]
def create_skeleton(container):
    """Add an emptied copy of the last spine file to *container*.

    The parsed tree of the last spine item is deep-copied, the text, tail
    and children of its last top-level element are removed, and the result
    is stored under a new name derived from the spine item's name.

    Returns the name of the newly added file.
    """
    last_entry = tuple(container.spine_names)[-1]
    spine_name = last_entry[0]
    skeleton = copy.deepcopy(container.parsed(spine_name))
    body = last_tag(skeleton)
    body.text = None
    body.tail = None
    del body[:]
    new_name = container.add_file(spine_name, b'', modify_name_if_needed=True)
    container.replace(new_name, skeleton)
    return new_name
def local_name(x):
    """Return *x* lower-cased, with everything up to its first ``}`` removed."""
    head, brace, tail = x.partition('}')
    name = tail if brace else head
    return name.lower()
def fix_fullscreen_images(container):
    """Switch full-page SVG image wrappers from 100% to viewport units.

    A spine file qualifies when its body has exactly one element child
    (ignoring script/style) that is a div or svg, and the body's descendants
    (again ignoring script/style) are exactly ``svg > image`` optionally
    wrapped in a single ``div``, with the svg sized 100% x 100%. Such an svg
    gets ``width="100vw"`` and ``height="100vh"`` and the file is marked
    dirty.
    """
    ignored = ('script', 'style')

    def is_svg_fs_markup(names, svg):
        if svg is None or len(names) not in (2, 3):
            return False
        if names[-1] != 'image' or names[-2] != 'svg':
            return False
        if len(names) == 3 and names[0] != 'div':
            return False
        return svg.get('width') == '100%' and svg.get('height') == '100%'

    for file_name, is_linear in container.spine_names:
        root = container.parsed(file_name)
        top_level = tuple(root.iterchildren('*'))
        if not top_level:
            continue
        body = top_level[-1]

        # Collect at most two significant child tags; two means "not a
        # single wrapper", so there is no need to look further.
        child_tags = []
        for child in body.iterchildren('*'):
            tag = local_name(child.tag)
            if tag in ignored:
                continue
            child_tags.append(tag)
            if len(child_tags) > 1:
                break
        if len(child_tags) != 1 or child_tags[0] not in ('div', 'svg'):
            continue

        names = []
        svg = None
        for elem in body.iterdescendants('*'):
            name = local_name(elem.tag)
            if name in ignored:
                continue
            names.append(name)
            if name == 'svg':
                svg = elem

        if is_svg_fs_markup(names, svg):
            svg.set('width', '100vw')
            svg.set('height', '100vh')
            container.dirty(file_name)
# }}}
# Renderer {{{
class Container(ContainerBase):
    """Container rooted at *root_dir*, or at the OPF file's directory."""

    tweak_mode = True
    is_dir = True

    def __init__(self, opf_path, log, root_dir=None):
        if not root_dir:
            root_dir = os.path.dirname(opf_path)
        ContainerBase.__init__(self, root_dir, opf_path, log)
class Renderer(QWebEnginePage):
    """Web engine page that loads one local HTML file and prints it to PDF.

    ``work_done`` is emitted with ``(renderer, result)`` where *result* is
    the PDF data as ``bytes`` on success or an error message string when the
    load failed.
    """

    work_done = pyqtSignal(object, object)

    def __init__(self, opts, parent, log):
        """Configure fonts from *opts* and wire up the load signals.

        *parent* must expose a ``profile`` attribute; it is used as the web
        engine profile for this page.
        """
        QWebEnginePage.__init__(self, parent.profile, parent)
        secure_webengine(self)
        self.working = False
        self.log = log
        self.load_complete = False
        self.settle_time = 0  # seconds, see load_finished()
        self.wait_for_title = None
        s = self.settings()
        s.setAttribute(s.JavascriptEnabled, True)
        s.setFontSize(s.DefaultFontSize, int(opts.pdf_default_font_size))
        s.setFontSize(s.DefaultFixedFontSize, int(opts.pdf_mono_font_size))
        s.setFontSize(s.MinimumLogicalFontSize, 8)
        s.setFontSize(s.MinimumFontSize, 8)
        # Unknown values of pdf_standard_font fall back to the serif family
        std = {
            'serif': opts.pdf_serif_family,
            'sans' : opts.pdf_sans_family,
            'mono' : opts.pdf_mono_family
        }.get(opts.pdf_standard_font, opts.pdf_serif_family)
        if std:
            s.setFontFamily(s.StandardFont, std)
        if opts.pdf_serif_family:
            s.setFontFamily(s.SerifFont, opts.pdf_serif_family)
        if opts.pdf_sans_family:
            s.setFontFamily(s.SansSerifFont, opts.pdf_sans_family)
        if opts.pdf_mono_family:
            s.setFontFamily(s.FixedFont, opts.pdf_mono_family)

        self.titleChanged.connect(self.title_changed)
        self.loadStarted.connect(self.load_started)
        self.loadProgress.connect(self.load_progress)
        self.loadFinished.connect(self.load_finished)
        # Single-shot watchdog, restarted on every progress notification
        self.load_hang_check_timer = t = QTimer(self)
        self.load_started_at = 0
        t.setTimerType(Qt.TimerType.VeryCoarseTimer)
        t.setInterval(HANG_TIME * 1000)
        t.setSingleShot(True)
        t.timeout.connect(self.on_load_hang)

    def load_started(self):
        """Record the start time and arm the hang watchdog."""
        self.load_started_at = monotonic()
        self.load_complete = False
        self.load_hang_check_timer.start()

    def load_progress(self, amt):
        """Restart the hang watchdog whenever the load makes progress."""
        self.load_hang_check_timer.start()

    def on_load_hang(self):
        """Treat a load that timed out as a failed load."""
        self.log(self.log_prefix, f'Loading not complete after {int(monotonic() - self.load_started_at)} seconds, aborting.')
        self.load_finished(False)

    def title_changed(self, title):
        """Start printing once the awaited title appears after load completion."""
        if self.wait_for_title and title == self.wait_for_title and self.load_complete:
            # settle_time is in seconds while QTimer.singleShot() takes
            # milliseconds; convert exactly as load_finished() does.
            QTimer.singleShot(int(1000 * self.settle_time), self.print_to_pdf)

    @property
    def log_prefix(self):
        """Base name of the currently loaded local file followed by a colon."""
        return os.path.basename(self.url().toLocalFile()) + ':'

    def load_finished(self, ok):
        """Handle the end of a page load.

        On failure emit ``work_done`` with an error message. On success
        either wait for the title to change to ``wait_for_title`` or schedule
        printing after ``settle_time`` seconds.
        """
        self.load_complete = True
        self.load_hang_check_timer.stop()
        if not ok:
            self.working = False
            self.work_done.emit(self, f'Load of {self.url().toString()} failed')
            return
        if self.wait_for_title and self.title() != self.wait_for_title:
            self.log(self.log_prefix, 'Load finished, waiting for title to change to:', self.wait_for_title)
            return
        QTimer.singleShot(int(1000 * self.settle_time), self.print_to_pdf)

    def javaScriptConsoleMessage(self, level, message, linenum, source_id):
        """Forward JavaScript console output to the log (best effort)."""
        try:
            self.log(f'{source_id}:{linenum}:{message}')
        except Exception:
            pass

    def print_to_pdf(self):
        """Run the pre-print script, then start printing."""
        self.runJavaScript(preprint_js(), self.start_print)

    def start_print(self, *a):
        """Print the page to PDF using the layout given to convert_html_file()."""
        self.printToPdf(self.printing_done, self.page_layout)

    def printing_done(self, pdf_data):
        """Emit ``work_done`` with the PDF bytes unless this page was deleted."""
        self.working = False
        if not sip.isdeleted(self):
            self.work_done.emit(self, bytes(pdf_data))

    def convert_html_file(self, path, page_layout, settle_time=0, wait_for_title=None):
        """Begin converting the local HTML file at *path*.

        :param page_layout: layout later passed to ``printToPdf``
        :param settle_time: delay in seconds between load completion and printing
        :param wait_for_title: if set, printing waits until the page title equals it
        """
        self.working = True
        self.load_complete = False
        self.wait_for_title = wait_for_title
        self.settle_time = settle_time
        self.page_layout = page_layout
        self.setUrl(QUrl.fromLocalFile(path))
class RequestInterceptor(QWebEngineUrlRequestInterceptor):
    """Block every request that is not a GET/HEAD of an allowed local file.

    The ``log``, ``container_root`` and ``resources_root`` attributes are not
    set here; RenderManager assigns them after construction, with both roots
    already passed through ``os.path.normcase(os.path.abspath(...))``.
    """

    @staticmethod
    def _is_inside(path, root):
        """Return True if *path* is *root* itself or lies underneath it.

        The comparison is done on a directory boundary so that a sibling
        such as ``/tmp/book-evil`` is not accepted for root ``/tmp/book``.
        """
        root = root.rstrip(os.sep)
        return path == root or path.startswith(root + os.sep)

    def interceptRequest(self, request_info):
        """Block the request unless it is a GET/HEAD for a file under an allowed root."""
        method = bytes(request_info.requestMethod())
        if method not in (b'GET', b'HEAD'):
            self.log.warn(f'Blocking URL request with method: {method}')
            request_info.block(True)
            return
        qurl = request_info.requestUrl()
        if qurl.scheme() != 'file':
            self.log.warn(f'Blocking URL request with scheme: {qurl.scheme()}')
            request_info.block(True)
            return
        path = qurl.toLocalFile()
        path = os.path.normcase(os.path.abspath(path))
        allowed_roots = (self.container_root, self.resources_root)
        if not any(self._is_inside(path, root) for root in allowed_roots):
            self.log.warn(f'Blocking request with path: {path}')
            request_info.block(True)
            return
class RenderManager(QObject):
    """Own the web engine profile and a pool of Renderer workers.

    Jobs are ``(html_file, page_layout, result_key)`` tuples; results are
    collected in a dict keyed by ``result_key``.
    """

    def __init__(self, opts, log, container_root):
        QObject.__init__(self)
        self.interceptor = RequestInterceptor(self)
        self.has_maths = {}
        self.log = log
        self.interceptor.log = log
        self.interceptor.container_root = os.path.normcase(os.path.abspath(container_root))
        resources_dir = os.path.dirname(mathjax_dir())
        self.interceptor.resources_root = os.path.normcase(os.path.abspath(resources_dir))

        profile = QWebEngineProfile(QApplication.instance())
        profile.setHttpUserAgent('calibre-pdf-output ' + __version__)
        profile.settings().setDefaultTextEncoding('utf-8')
        # Fall back to the differently named setter when the first is missing
        try:
            profile.setUrlRequestInterceptor(self.interceptor)
        except AttributeError:
            profile.setRequestInterceptor(self.interceptor)
        self.profile = profile

        self.opts = opts
        self.workers = []
        self.max_workers = detect_ncpus()
        self.original_signal_handlers = {} if iswindows else setup_unix_signals(self)

    def create_worker(self):
        """Create a Renderer, connect its completion signal and pool it."""
        renderer = Renderer(self.opts, self, self.log)
        renderer.work_done.connect(self.work_done)
        self.workers.append(renderer)

    def signal_received(self, read_fd):
        """Drain *read_fd* and quit the event loop with KILL_SIGNAL."""
        try:
            os.read(read_fd, 1024)
        except OSError:
            return
        QApplication.instance().exit(KILL_SIGNAL)

    def block_signal_handlers(self):
        """Install do-nothing handlers for every saved signal."""
        def ignore(signum, frame):
            return None

        for sig in self.original_signal_handlers:
            signal.signal(sig, ignore)

    def restore_signal_handlers(self):
        """Re-install the handlers saved in ``original_signal_handlers``."""
        for sig in self.original_signal_handlers:
            signal.signal(sig, self.original_signal_handlers[sig])

    def run_loop(self):
        """Run the Qt event loop with do-nothing signal handlers installed."""
        self.block_signal_handlers()
        try:
            return QApplication.exec()
        finally:
            self.restore_signal_handlers()

    def convert_html_files(self, jobs, settle_time=0, wait_for_title=None, has_maths=None):
        """Render all *jobs* and return the ``{result_key: result}`` dict.

        Raises SystemExit if the event loop ends with anything other than OK.
        """
        self.has_maths = has_maths or {}
        wanted = min(len(jobs), self.max_workers)
        while len(self.workers) < wanted:
            self.create_worker()
        self.pending = list(jobs)
        self.results = {}
        self.settle_time = settle_time
        self.wait_for_title = wait_for_title
        QTimer.singleShot(0, self.assign_work)
        exit_code = self.run_loop()
        self.has_maths = {}
        if exit_code == KILL_SIGNAL:
            raise SystemExit('Kill signal received')
        if exit_code != OK:
            raise SystemExit('Unknown error occurred')
        return self.results

    def evaljs(self, js):
        """Run *js* in the first worker and return its result synchronously."""
        if not self.workers:
            self.create_worker()
        worker = self.workers[0]
        self.evaljs_result = None
        worker.runJavaScript(js, self.evaljs_callback)
        QApplication.exec()
        return self.evaljs_result

    def evaljs_callback(self, result):
        """Store the script result and leave the nested event loop."""
        self.evaljs_result = result
        QApplication.instance().exit(0)

    def assign_work(self):
        """Hand pending jobs to idle workers until either runs out."""
        idle = [w for w in self.workers if not w.working]
        while idle and self.pending:
            html_file, page_layout, result_key = self.pending.pop()
            worker = idle.pop()
            worker.result_key = result_key
            title = self.wait_for_title
            delay = self.settle_time
            if self.has_maths.get(result_key):
                # Files with maths wait for this title and get twice the settle time
                title = 'mathjax-load-complete'
                delay *= 2
            worker.convert_html_file(html_file, page_layout, settle_time=delay, wait_for_title=title)

    def work_done(self, worker, result):
        """Record *result*; assign more work or quit once everything is idle."""
        self.results[worker.result_key] = result
        if self.pending:
            self.assign_work()
            return
        if any(w.working for w in self.workers):
            return
        QApplication.instance().exit(OK)
def resolve_margins(margins, page_layout):
    """Return a ``Margins`` tuple, filling unset sides from *page_layout*.

    A side counts as unset when *margins* has no such attribute or the
    attribute is None; its value is then read from
    ``page_layout.marginsPoints()``.
    """
    layout_margins = page_layout.marginsPoints()

    def side(which):
        explicit = getattr(margins, which, None)
        if explicit is not None:
            return explicit
        return getattr(layout_margins, which)()

    return Margins(side('left'), side('top'), side('right'), side('bottom'))
def job_for_name(container, name, margins, page_layout):
    """Build the ``(path, page_layout, name)`` render job for *name*.

    When *margins* is truthy, a copy of *page_layout* with units set to
    points and the resolved margins applied is used instead of the original.
    """
    path = container.name_to_abspath(name)
    if not margins:
        return path, page_layout, name
    layout = QPageLayout(page_layout)
    layout.setUnits(QPageLayout.Unit.Point)
    layout.setMargins(QMarginsF(*resolve_margins(margins, layout)))
    return path, layout, name
# }}}
# Metadata {{{
def update_metadata(pdf_doc, pdf_metadata):
    """Write title, authors, producer, tags and an XMP packet into *pdf_doc*.

    Does nothing when ``pdf_metadata.mi`` is falsy.
    """
    if not pdf_metadata.mi:
        return
    packet = metadata_to_xmp_packet(pdf_metadata.mi)
    set_metadata_implementation(
        pdf_doc,
        pdf_metadata.title,
        pdf_metadata.mi.authors,
        pdf_metadata.mi.book_producer,
        pdf_metadata.mi.tags,
        packet,
    )
def add_cover(pdf_doc, cover_data, page_layout, opts):
    """Render *cover_data* onto a single page and insert it into *pdf_doc*.

    The page is sized to the full page rectangle of *page_layout* and filled
    with white before the image is drawn.
    """
    stream = BytesIO()
    size = page_layout.fullRectPoints().size()
    cover = Image(cover_data)
    dimensions = (size.width(), size.height())
    writer = PDFStream(stream, dimensions, compress=True)
    writer.apply_fill(color=(1, 1, 1))
    draw_image_page(writer, cover, preserve_aspect_ratio=opts.preserve_cover_aspect_ratio)
    writer.end()
    pdf_doc.insert_existing_page(data_as_pdf_doc(stream.getvalue()))
# }}}
# Margin groups {{{
# Page margins, one value per side
Margins = namedtuple('Margins', ('left', 'top', 'right', 'bottom'))
# A spine file name paired with its margins
MarginFile = namedtuple('MarginFile', ('name', 'margins'))


def dict_to_margins(val, d=None):
    """Build a ``Margins`` from mapping *val*, using *d* for missing sides."""
    return Margins(*(val.get(side, d) for side in Margins._fields))
def create_margin_files(container):
    """Yield a ``MarginFile`` for every spine item of *container*.

    Margins come from the JSON stored in the root element's
    ``data-calibre-pdf-output-page-margins`` attribute; when the attribute
    is absent or empty its falsy value is passed through unchanged.
    """
    attr = 'data-calibre-pdf-output-page-margins'
    for name, is_linear in container.spine_names:
        raw = container.parsed(name).get(attr)
        margins = dict_to_margins(json.loads(raw)) if raw else raw
        yield MarginFile(name, margins)
# }}}
# Link handling {{{
def add_anchors_markup(root, uuid, anchors):
    """Append a page-breaking div of links to the last top-level element of *root*.

    The div has id *uuid* and contains one ``<a href="#...">`` per entry in
    *anchors*, followed by one linking to *uuid* itself.
    """
    body = last_tag(root)
    div = body.makeelement(
        XHTML('div'), id=uuid,
        style='display:block !important; page-break-before: always !important; break-before: always !important; white-space: pre-wrap !important'
    )
    div.text = '\n\n'
    body.append(div)
    counter = count()

    def append_link(target):
        index = next(counter)
        link = div.makeelement(
            XHTML('a'), href='#' + target,
            style='min-width: 10px !important; min-height: 10px !important;'
            ' border: solid 1px rgba(0, 0, 0, 0) !important; text-decoration: none !important'
        )
        link.text = ' '
        # prevent too many anchors on a line as it causes chromium to
        # rescale the viewport
        link.tail = '\n' if index % 8 == 0 else ' '
        div.append(link)

    for anchor in anchors:
        append_link(anchor)
    append_link(uuid)
def add_all_links(container, margin_files):
    """Append the anchor-links div to every file in *margin_files*.

    Each file receives links to all ``id`` values found in it and is marked
    dirty. Returns the freshly generated uuid used as the id of those divs.
    """
    uuid = uuid4()
    ids_by_name = {
        name: frozenset(container.parsed(name).xpath('//*/@id'))
        for name, is_linear in container.spine_names
    }
    for margin_file in margin_files:
        name = margin_file.name
        add_anchors_markup(container.parsed(name), uuid, ids_by_name.get(name, set()))
        container.dirty(name)
    return uuid
def make_anchors_unique(container, log):
    """Rename every ``id`` in the spine to a book-wide unique value and
    rewrite links to use placeholder URLs.

    Every element with an ``id`` in a spine file gets a new id of the form
    ``a<N>``; the last top-level element of each spine file gets one if it
    has none. Links are then rewritten through ``container.replace_links``:

    * links to a renamed anchor become ``https://calibre-pdf-anchor.a#<new id>``
    * links to a spine file without a fragment, or with a fragment that is
      not found, become ``https://calibre-pdf-anchor.n#<file name>``
    * all other links are returned unchanged (minus a trailing ``#``)

    :return: dict mapping each spine file name to the id of its last
        top-level element
    """
    mapping = {}  # (file name, old id) -> new id
    counter = 0
    base = None  # name of the file whose links are currently being rewritten
    spine_names = set()

    def replacer(url):
        # replacer.file_type is read here but never assigned in this
        # function; presumably container.replace_links() sets it -- confirm.
        if replacer.file_type not in ('text', 'ncx'):
            return url
        if not url:
            return url
        if '#' not in url:
            url += '#'
        if url.startswith('#'):
            frag = url[1:]
            name = base
        else:
            href, frag = url.partition('#')[::2]
            name = container.href_to_name(href, base)
        if not name:
            return url.rstrip('#')
        if not frag and name in spine_names:
            replacer.replaced = True
            return 'https://calibre-pdf-anchor.n#' + name
        new_frag = mapping.get((name, frag))
        if new_frag is None:
            if name in spine_names:
                log.warn(f'Link anchor: {name}#{frag} not found, linking to top of file instead')
                replacer.replaced = True
                return 'https://calibre-pdf-anchor.n#' + name
            return url.rstrip('#')
        replacer.replaced = True
        return 'https://calibre-pdf-anchor.a#' + new_frag

    name_anchor_map = {}
    for spine_name, is_linear in container.spine_names:
        spine_names.add(spine_name)
        root = container.parsed(spine_name)
        for elem in root.xpath('//*[@id]'):
            counter += 1
            key = spine_name, elem.get('id')
            # Only the first element carrying a given id is renamed
            if key not in mapping:
                new_id = mapping[key] = f'a{counter}'
                elem.set('id', new_id)
        body = last_tag(root)
        if not body.get('id'):
            counter += 1
            body.set('id', f'a{counter}')
        name_anchor_map[spine_name] = body.get('id')

    for name in container.mime_map:
        base = name
        replacer.replaced = False
        container.replace_links(name, replacer)
    return name_anchor_map
class AnchorLocation:
    """Position of an anchor: page number, left/top offsets and zoom."""

    __slots__ = ('pagenum', 'left', 'top', 'zoom')

    def __init__(self, pagenum=1, left=0, top=0, zoom=0):
        self.pagenum = pagenum
        self.left = left
        self.top = top
        self.zoom = zoom

    def __repr__(self):
        return (
            f'AnchorLocation(pagenum={self.pagenum}, left={self.left}, '
            f'top={self.top}, zoom={self.zoom})'
        )

    @property
    def as_tuple(self):
        """The location as a ``(pagenum, left, top, zoom)`` tuple."""
        return (self.pagenum, self.left, self.top, self.zoom)
def get_anchor_locations(name, pdf_doc, first_page_num, toc_uuid, log):
    """Extract anchor locations from *pdf_doc* and strip its links page(s).

    The anchor named *toc_uuid* marks where the appended links div starts;
    when it lies beyond page 1, that page and everything after it is deleted
    from *pdf_doc*. Page numbers of the remaining anchors are shifted so
    that page 1 of this document becomes *first_page_num*.

    :return: dict mapping anchor name to ``AnchorLocation``
    """
    anchors = pdf_doc.extract_anchors()
    try:
        toc_pagenum = anchors.pop(toc_uuid)[0]
    except KeyError:
        toc_pagenum = None
    if toc_pagenum is None:
        log.warn(f'Failed to find ToC anchor in {name}')
        toc_pagenum = 0
    if toc_pagenum > 1:
        trailing = pdf_doc.page_count() - toc_pagenum + 1
        pdf_doc.delete_pages(toc_pagenum, trailing)

    offset = first_page_num - 1
    locations = {}
    for anchor, loc in iteritems(anchors):
        page, *rest = loc
        locations[anchor] = AnchorLocation(page + offset, *rest)
    return locations
def fix_links(pdf_doc, anchor_locations, name_anchor_map, mark_links, log):
    """Resolve the placeholder anchor URLs in *pdf_doc* to page locations.

    ``https://calibre-pdf-anchor.a#<id>`` is looked up directly in
    *anchor_locations*; ``https://calibre-pdf-anchor.n#<name>`` is first
    mapped through *name_anchor_map*. The callback handed to
    ``pdf_doc.alter_links`` returns the location tuple, or None for other
    URLs and for anchors that cannot be found (the latter with a warning).
    """
    anchor_hosts = ('calibre-pdf-anchor.a', 'calibre-pdf-anchor.n')

    def replace_link(url):
        parts = urlparse(url)
        if parts.scheme != 'https' or parts.netloc not in anchor_hosts:
            return
        fragment = parts.fragment
        if parts.netloc == 'calibre-pdf-anchor.a':
            key = fragment
        else:
            key = name_anchor_map.get(fragment)
        loc = anchor_locations.get(key)
        if loc is None:
            log.warn(f'Anchor location for link to {fragment} not found')
            return None
        return loc.as_tuple

    pdf_doc.alter_links(replace_link, mark_links)
# }}}
# Outline creation {{{
class PDFOutlineRoot:
    """Stand-in parent for top-level outline items of a PDF document.

    The first ``create()`` call creates the document outline; later calls
    create each new item from the previously created one, passing False for
    the as-child flag.
    """

    def __init__(self, pdf_doc):
        self.pdf_doc = pdf_doc
        self.root_item = None

    def create(self, title, pagenum, as_child, left, top, zoom):
        """Create and return the next top-level item (*as_child* is ignored)."""
        previous = self.root_item
        if previous is None:
            item = self.pdf_doc.create_outline(title, pagenum, left, top, zoom)
        else:
            item = previous.create(title, pagenum, False, left, top, zoom)
        self.root_item = item
        return item
def annotate_toc(toc, anchor_locations, name_anchor_map, log):
    """Set ``pdf_loc`` on every descendant of *toc*.

    A ``frag`` containing a dot is first mapped through *name_anchor_map*;
    otherwise it is used directly as the key into *anchor_locations*. Any
    failure is logged and the entry falls back to ``AnchorLocation(1, 0, 0, 0)``.
    """
    for node in toc.iterdescendants():
        frag = node.frag
        try:
            key = name_anchor_map[frag] if '.' in frag else frag
            location = anchor_locations[key]
        except Exception:
            log.warn(f'Could not find anchor location for ToC entry: {node.title} with href: {frag}')
            location = AnchorLocation(1, 0, 0, 0)
        node.pdf_loc = location
def add_toc(pdf_parent, toc_parent, log, pdf_doc):
    """Recursively create outline items under *pdf_parent* for *toc_parent*.

    If creating an item raises ValueError, a node located beyond page 1 is
    retried on the last page of *pdf_doc*; any other node is skipped together
    with its children. Both cases are logged.
    """
    for node in toc_parent:
        title = node.title
        loc = node.pdf_loc
        try:
            outline_item = pdf_parent.create(title, loc.pagenum, True, loc.left, loc.top, loc.zoom)
        except ValueError:
            if loc.pagenum <= 1:
                log.warn(f'Ignoring TOC node: {title} at page: {loc.pagenum}')
                continue
            log.warn(f'TOC node: {title} at page: {loc.pagenum} is beyond end of file, moving it to last page')
            outline_item = pdf_parent.create(title, pdf_doc.page_count(), True, loc.left, loc.top, loc.zoom)
        if len(node):
            add_toc(outline_item, node, log, pdf_doc)
def get_page_number_display_map(render_manager, opts, num_pages, log):
    """Return a dict mapping page numbers to the numbers to display.

    The map covers pages ``1 .. 2 * num_pages``. Without
    ``opts.pdf_page_number_map`` it is the identity map. Otherwise the
    option is evaluated as a JavaScript expression of ``n`` for every page
    via ``render_manager.evaljs``. If the result cannot be parsed as a JSON
    object, or contains anything not convertible to ``int``, a warning is
    logged and the identity map is returned.
    """
    num_pages *= 2
    default_map = {n: n for n in range(1, num_pages + 1)}
    if not opts.pdf_page_number_map:
        return default_map

    # The user expression is embedded as a JSON string literal and run
    # through JavaScript eval() inside the page.
    js = '''
function map_num(n) { return eval(MAP_EXPRESSION); }
var ans = {};
for (var i=1; i <= NUM_PAGES; i++) ans[i] = map_num(i);
JSON.stringify(ans);
'''.replace('MAP_EXPRESSION', json.dumps(opts.pdf_page_number_map), 1).replace(
        'NUM_PAGES', str(num_pages), 1)
    result = render_manager.evaljs(js)
    try:
        result = json.loads(result)
        if not isinstance(result, dict):
            raise ValueError('Not a dict')
        # Convert inside the try block so that a non-integer value (for
        # example a JSON null) falls back to the default map instead of
        # aborting the conversion.
        mapped = {int(k): int(v) for k, v in result.items()}
    except Exception:
        log.warn(f'Could not do page number mapping, got unexpected result: {repr(result)}')
        return default_map
    return mapped
def add_pagenum_toc(root, toc, opts, page_number_display_map):
    """Fill *root* with a Table of Contents that shows page numbers.

    A ``<style>`` element, an ``<h2>`` heading and a two-column table (title,
    page number) are appended to the last top-level element of *root*, which
    also gets the class ``calibre-pdf-toc``. Page numbers come from each
    node's ``pdf_loc.pagenum`` mapped through *page_number_display_map*.
    """
    body = last_tag(root)
    # Pairs of (level, indent in em) for levels 1-6, consumed by the CSS
    # template below
    indents = []
    for i in range(1, 7):
        indents.extend((i, 1.4*i))
    # NOTE(review): '%.1g' keeps a single significant digit, so an indent of
    # 1.4 is emitted as "1em" -- confirm whether '%.1f' was intended.
    css = '''
.calibre-pdf-toc table { width: 100%% }
.calibre-pdf-toc table tr td:last-of-type { text-align: right }
.calibre-pdf-toc .level-0 {
font-size: larger;
}
.calibre-pdf-toc .level-%d td:first-of-type { padding-left: %.1gem }
.calibre-pdf-toc .level-%d td:first-of-type { padding-left: %.1gem }
.calibre-pdf-toc .level-%d td:first-of-type { padding-left: %.1gem }
.calibre-pdf-toc .level-%d td:first-of-type { padding-left: %.1gem }
.calibre-pdf-toc .level-%d td:first-of-type { padding-left: %.1gem }
.calibre-pdf-toc .level-%d td:first-of-type { padding-left: %.1gem }
''' % tuple(indents) + (opts.extra_css or '')
    style = body.makeelement(XHTML('style'), type='text/css')
    style.text = css
    body.append(style)
    body.set('class', 'calibre-pdf-toc')

    def E(tag, cls=None, text=None, tail=None, parent=None, **attrs):
        # Create an XHTML element, optionally set its class, and append it
        # to *parent* when one is given
        ans = body.makeelement(XHTML(tag), **attrs)
        ans.text, ans.tail = text, tail
        if cls is not None:
            ans.set('class', cls)
        if parent is not None:
            parent.append(ans)
        return ans

    E('h2', text=(opts.toc_title or _('Table of Contents')), parent=body)
    table = E('table', parent=body)
    for level, node in toc.iterdescendants(level=0):
        tr = E('tr', cls='level-%d' % level, parent=table)
        E('td', text=node.title or _('Unknown'), parent=tr)
        num = node.pdf_loc.pagenum
        # Pages without an entry in the display map keep their own number
        num = page_number_display_map.get(num, num)
        E('td', text=f'{num}', parent=tr)
# }}}
# Fonts {{{
def all_glyph_ids_in_w_arrays(arrays, as_set=False):
    """Collect every glyph id covered by the given width arrays.

    Each array is read as a sequence of either ``start [w1 w2 ...]``
    (covering ``len(list)`` ids from ``start``) or ``first last width``
    (covering ``first..last`` inclusive).

    Returns a set when *as_set* is true, otherwise a sorted list.
    """
    glyph_ids = set()
    for widths in arrays:
        pos, limit = 0, len(widths)
        while pos + 1 < limit:
            first, second = widths[pos], widths[pos + 1]
            if isinstance(second, list):
                glyph_ids.update(range(first, first + len(second)))
                pos += 2
            else:
                glyph_ids.update(range(first, second + 1))
                pos += 3
    if as_set:
        return glyph_ids
    return sorted(glyph_ids)
def fonts_are_identical(fonts):
    """Return True if all *fonts* agree on ToUnicode, Data, W and W2."""
    missing = object()
    for key in ('ToUnicode', 'Data', 'W', 'W2'):
        values = (font[key] for font in fonts)
        previous = next(values, missing)
        for value in values:
            if previous != value:
                return False
            previous = value
    return True
def merge_font_files(fonts, log):
    """Merge the sfnt data of the non-Type0 entries in *fonts*.

    *fonts* is sorted in place, largest ``Data`` first. The merged font is
    subset to the glyph ids found in all ``W`` and ``W2`` arrays.

    :return: ``(font_data, references)`` where *references* are the
        ``Reference`` values of the merged (non-Type0) fonts
    """
    # As of Qt 5.15.1 Chromium has switched to harfbuzz and dropped sfntly. It
    # now produces font descriptors whose W arrays don't match the glyph width
    # information from the hhea table, in contravention of the PDF spec. So
    # we can no longer merge font descriptors, all we can do is merge the
    # actual sfnt data streams into a single stream and subset it to contain
    # only the glyphs from all W arrays.
    # choose the largest font as the base font

    def data_size(font):
        return len(font['Data'] or b'')

    fonts.sort(key=data_size, reverse=True)
    descendant_fonts = [font for font in fonts if font['Subtype'] != 'Type0']
    total_size = sum(len(font['Data']) for font in descendant_fonts)
    sfnts = tuple(font['sfnt'] for font in descendant_fonts)
    merged_sfnt = merge_truetype_fonts_for_pdf(sfnts, log)

    w_arrays = tuple(font['W'] for font in descendant_fonts if font['W'])
    glyph_ids = all_glyph_ids_in_w_arrays(w_arrays, as_set=True)
    h_arrays = tuple(font['W2'] for font in descendant_fonts if font['W2'])
    glyph_ids |= all_glyph_ids_in_w_arrays(h_arrays, as_set=True)

    try:
        pdf_subset(merged_sfnt, glyph_ids)
    except NoGlyphs:
        log.warn(f'Subsetting of {fonts[0]["BaseFont"]} failed with no glyphs found, ignoring')
    font_data = merged_sfnt()[0]
    log(f'Merged {len(fonts)} instances of {fonts[0]["BaseFont"]} reducing size from {human_readable(total_size)} to {human_readable(len(font_data))}')
    references = tuple(font['Reference'] for font in descendant_fonts)
    return font_data, references
def merge_fonts(pdf_doc, log):
    """Merge the fonts in *pdf_doc* that share a ``BaseFont`` name.

    A group is merged only if it contains at least one Type0 font, every
    Type0 font has an ``Identity-`` encoding, and every other font has data
    that parses as an sfnt containing a ``glyf`` table.
    """

    def mergeable(fonts):
        has_type0 = False
        for font in fonts:
            if font['Subtype'] == 'Type0':
                has_type0 = True
                encoding = font['Encoding']
                if not encoding or not encoding.startswith('Identity-'):
                    return False
                continue
            if not font['Data']:
                return False
            try:
                sfnt = Sfnt(font['Data'])
            except UnsupportedFont:
                return False
            # Keep the parsed font around for merge_font_files()
            font['sfnt'] = sfnt
            if b'glyf' not in sfnt:
                return False
        return has_type0

    all_fonts = pdf_doc.list_fonts(True)
    base_font_map = {}
    for font in all_fonts:
        base_font_map.setdefault(font['BaseFont'], []).append(font)

    for name, fonts in iteritems(base_font_map):
        if not mergeable(fonts):
            continue
        font_data, references = merge_font_files(fonts, log)
        pdf_doc.merge_fonts(font_data, references)
def test_merge_fonts():
    """Merge the fonts of the PDF named by the last command line argument.

    The result is saved next to the input as ``<name>-merged.pdf``.
    """
    path = sys.argv[-1]
    pdf_doc = get_podofo().PDFDoc()
    pdf_doc.open(path)
    from calibre.utils.logging import default_log
    merge_fonts(pdf_doc, default_log)
    stem = path.rpartition('.')[0]
    out = stem + '-merged.pdf'
    pdf_doc.save(out)
    print('Merged PDF written to', out)
# }}}
# Header/footer {{{
# Footer used when page numbers are requested but no footer template is set
PAGE_NUMBER_TEMPLATE = '<footer><div style="margin: auto">_PAGENUM_</div></footer>'


def add_header_footer(manager, opts, pdf_doc, container, page_number_display_map, page_layout, page_margins_map, pdf_metadata, report_progress, toc=None):
    """Render headers/footers for every page and impose them onto *pdf_doc*.

    One HTML page per page of *pdf_doc* is generated from the header and
    footer templates in *opts*, rendered to PDF through *manager* with zero
    page margins, appended to *pdf_doc* and imposed onto the content pages.
    Returns early, doing nothing, when there is neither a header nor a
    footer template.

    :param page_margins_map: sequence of margins indexed by ``page_num - 1``
    :param toc: optional table of contents whose nodes carry ``pdf_loc``;
        used to fill in the section placeholders
    :raises SystemExit: if rendering does not produce ``bytes``
    :raises ValueError: if the rendered page count differs from *pdf_doc*'s
    """
    header_template, footer_template = opts.pdf_header_template, opts.pdf_footer_template
    if not footer_template and opts.pdf_page_numbers:
        footer_template = PAGE_NUMBER_TEMPLATE
    if not header_template and not footer_template:
        return
    report_progress(0.8, _('Adding headers and footers'))
    name = create_skeleton(container)
    root = container.parsed(name)
    reset_css = 'margin: 0; padding: 0; border-width: 0; background-color: unset;'
    root.set('style', reset_css)
    body = last_tag(root)
    body.attrib.pop('id', None)
    body.set('style', reset_css)
    # Zero page margins: the per-page container divs apply the margins instead
    job = job_for_name(container, name, Margins(0, 0, 0, 0), page_layout)

    def m(tag_name, text=None, style=None, **attrs):
        # Create an XHTML element; *style* is a dict serialized to inline CSS
        ans = root.makeelement(XHTML(tag_name), **attrs)
        if text is not None:
            ans.text = text
        if style is not None:
            style = '; '.join(f'{k}: {v}' for k, v in iteritems(style))
            ans.set('style', style)
        return ans

    # Vertical placement inside the full-height flex column: footer only ->
    # bottom, header only -> top, both -> pushed to opposite ends
    justify = 'flex-end'
    if header_template:
        justify = 'space-between' if footer_template else 'flex-start'

    def create_toc_stack(iterator):
        # (level, pagenum, title) for every node that has a positive page number
        ans = []
        for level, child in iterator:
            pdf_loc = getattr(child, 'pdf_loc', None)
            if pdf_loc is not None and pdf_loc.pagenum > 0:
                ans.append((level, pdf_loc.pagenum, child.title))
        return ans

    def stack_to_map(stack):
        # Build a list with one section title per page of pdf_doc. Entries
        # are consumed in order; a title stays current until a later page
        # has its own entry. Among entries on the same page, one replaces
        # the current title only if its level is greater.
        ans = []
        stack_pos = 0
        current, page_for_current, level_for_current = '', -1, -1
        stack_len = len(stack)
        for page in range(1, pdf_doc.page_count() + 1):
            while stack_pos < stack_len:
                level, pagenum, title = stack[stack_pos]
                if pagenum != page:
                    break
                if pagenum != page_for_current or level > level_for_current:
                    page_for_current = pagenum
                    level_for_current = level
                    current = title
                stack_pos += 1
            ans.append(current)
        return ans

    def page_counts_map(iterator):
        # Returns (totals, section_nums), each with one entry per page:
        # the number of pages in the page's section and the 1-based
        # position of the page within that section.
        pagenums = []
        for level, child in iterator:
            pdf_loc = getattr(child, 'pdf_loc', None)
            if pdf_loc is not None and pdf_loc.pagenum > 0:
                pagenums.append(pdf_loc.pagenum)
        # (start page, length) per section; the last one runs to the end
        stack = []
        for i, pagenum in enumerate(pagenums):
            next_page_num = pagenums[i + 1] if i + 1 < len(pagenums) else (pdf_doc.page_count() + 1)
            stack.append((pagenum, next_page_num - pagenum))
        totals = []
        section_nums = []
        stack_len = len(stack)
        stack_pos = 0
        current, page_for_current, counter = 0, -1, 0
        for page in range(1, pdf_doc.page_count() + 1):
            while stack_pos < stack_len:
                pagenum, pages = stack[stack_pos]
                if pagenum != page:
                    break
                if pagenum != page_for_current:
                    current = pages
                    page_for_current = pagenum
                    counter = 0
                stack_pos += 1
            counter += 1
            totals.append(current)
            section_nums.append(counter)
        return totals, section_nums

    # Note the naming: toplevel_pagenum_map receives the section totals and
    # toplevel_pages_map the position within the section.
    if toc is None:
        page_toc_map = stack_to_map(())
        toplevel_toc_map = stack_to_map(())
        toplevel_pagenum_map, toplevel_pages_map = page_counts_map(())
    else:
        page_toc_map = stack_to_map(create_toc_stack(toc.iterdescendants(level=0)))

        def tc():
            # Top-level ToC entries only, all reported at level 0
            for x in toc:
                yield 0, x

        toplevel_toc_map = stack_to_map(create_toc_stack(tc()))
        toplevel_pagenum_map, toplevel_pages_map = page_counts_map(tc())

    def create_container(page_num, margins):
        # One full-viewport-height div per page, followed by a page break
        style = {
            'page-break-inside': 'avoid',
            'page-break-after': 'always',
            'display': 'flex',
            'flex-direction': 'column',
            'height': '100vh',
            'justify-content': justify,
            'margin-left': f'{margins.left}pt',
            'margin-right': f'{margins.right}pt',
            'margin-top': '0',
            'margin-bottom': '0',
            'padding': '0',
            'border-width': '0',
            'overflow': 'hidden',
            'background-color': 'unset',
        }
        ans = m('div', style=style, id=f'p{page_num}')
        return ans

    def format_template(template, page_num, height):
        # Substitute the placeholders, parse the template and return its
        # first element, *height* points tall. The longer _TOP_LEVEL_SECTION_*
        # placeholders are replaced before the shorter ones they contain.
        template = template.replace('_TOP_LEVEL_SECTION_PAGES_', str(toplevel_pagenum_map[page_num - 1]))
        template = template.replace('_TOP_LEVEL_SECTION_PAGENUM_', str(toplevel_pages_map[page_num - 1]))
        template = template.replace('_TOTAL_PAGES_', str(pages_in_doc))
        template = template.replace('_PAGENUM_', str(page_number_display_map[page_num]))
        template = template.replace('_TITLE_', prepare_string_for_xml(pdf_metadata.title, True))
        template = template.replace('_AUTHOR_', prepare_string_for_xml(pdf_metadata.author, True))
        template = template.replace('_TOP_LEVEL_SECTION_', prepare_string_for_xml(toplevel_toc_map[page_num - 1]))
        template = template.replace('_SECTION_', prepare_string_for_xml(page_toc_map[page_num - 1]))
        troot = parse(template, namespace_elements=True)
        ans = last_tag(troot)[0]
        style = ans.get('style') or ''
        # The template's own style is appended after the defaults
        style = (
            'margin: 0; padding: 0; height: {height}pt; border-width: 0;'
            'display: flex; align-items: center; overflow: hidden; background-color: unset;').format(height=height) + style
        ans.set('style', style)
        for child in ans.xpath('descendant-or-self::*[@class]'):
            cls = frozenset(child.get('class').split())
            # On odd pages hide elements marked even-page, and vice versa
            q = 'even-page' if page_num % 2 else 'odd-page'
            if q in cls or q.replace('-', '_') in cls:
                style = child.get('style') or ''
                child.set('style', style + '; display: none')
        return ans

    pages_in_doc = pdf_doc.page_count()

    for page_num in range(1, pages_in_doc + 1):
        margins = page_margins_map[page_num - 1]
        div = create_container(page_num, margins)
        body.append(div)
        if header_template:
            div.append(format_template(header_template, page_num, margins.top))
        if footer_template:
            div.append(format_template(footer_template, page_num, margins.bottom))

    container.commit()
    # print(open(job[0]).read())
    results = manager.convert_html_files([job], settle_time=1)
    data = results[name]
    if not isinstance(data, bytes):
        # A non-bytes result is the error message from the failed render
        raise SystemExit(data)
    # open('/t/impose.pdf', 'wb').write(data)
    doc = data_as_pdf_doc(data)
    first_page_num = pdf_doc.page_count()
    num_pages = doc.page_count()
    if first_page_num != num_pages:
        raise ValueError('The number of header/footers pages ({}) != number of document pages ({})'.format(
            num_pages, first_page_num))
    pdf_doc.append(doc)
    pdf_doc.impose(1, first_page_num + 1, num_pages)
    report_progress(0.9, _('Headers and footers added'))
# }}}
# Maths {{{
def mathjax_dir():
    """Return the path of the ``mathjax`` resource, ignoring user overrides."""
    location = P('mathjax', allow_user_override=False)
    return location
def path_to_url(path):
    """Return the ``file:`` URL string for the local file *path*."""
    url = QUrl.fromLocalFile(path)
    return url.toString()
def add_maths_script(container):
    """Append the MathJax loader script to every spine file that has maths.

    :return: dict mapping each spine file name to the result of
        ``check_for_maths`` for that file
    """
    has_maths = {}
    for name, is_linear in container.spine_names:
        root = container.parsed(name)
        found = check_for_maths(root)
        has_maths[name] = found
        if found:
            loader_path = P('pdf-mathjax-loader.js', allow_user_override=False)
            script = root.makeelement(
                XHTML('script'), type="text/javascript", src=path_to_url(loader_path))
            script.set('async', 'async')
            script.set('data-mathjax-path', path_to_url(mathjax_dir()))
            last_tag(root).append(script)
    return has_maths
# }}}
def fix_markup(container):
    """Turn every ``<canvas>`` element in the spine files into a ``<div>``."""
    find_canvases = XPath('//h:canvas')
    for file_name, is_linear in container.spine_names:
        canvases = find_canvases(container.parsed(file_name))
        for canvas in canvases:
            # Canvas causes rendering issues, see https://bugs.launchpad.net/bugs/1859040
            # for an example.
            canvas.tag = XHTML('div')
def convert(opf_path, opts, metadata=None, output_path=None, log=default_log, cover_data=None, report_progress=lambda x, y: None):
    """Convert the book described by *opf_path* into a PDF.

    The spine files are pre-processed (canvas replacement, optional
    hyphenation, MathJax loader, full-screen images, unique anchors, link
    pages), rendered to PDF one by one and joined. Links, outline, optional
    printed ToC, headers/footers, font merging, odd/even offsets, cover and
    metadata are then applied.

    :param metadata: metadata object; when None no metadata is written
    :param output_path: file to write the PDF to; when None the PDF data is
        returned instead
    :param cover_data: image data for an optional cover page
    :param report_progress: callable taking ``(fraction, message)``
    :return: the PDF data if *output_path* is None, otherwise None
    :raises SystemExit: if rendering any file fails
    """
    container = Container(opf_path, log)
    fix_markup(container)
    report_progress(0.05, _('Parsed all content for markup transformation'))
    if opts.pdf_hyphenate:
        from calibre.ebooks.oeb.polish.hyphenation import add_soft_hyphens
        add_soft_hyphens(container)
    has_maths = add_maths_script(container)
    fix_fullscreen_images(container)

    name_anchor_map = make_anchors_unique(container, log)
    margin_files = tuple(create_margin_files(container))
    toc = get_toc(container, verify_destinations=False)
    has_toc = toc and len(toc)
    links_page_uuid = add_all_links(container, margin_files)
    container.commit()
    report_progress(0.1, _('Completed markup transformation'))

    manager = RenderManager(opts, log, container.root)
    page_layout = get_page_layout(opts)
    pdf_doc = None
    anchor_locations = {}
    jobs = []
    for margin_file in margin_files:
        jobs.append(job_for_name(container, margin_file.name, margin_file.margins, page_layout))
    results = manager.convert_html_files(jobs, settle_time=1, has_maths=has_maths)
    num_pages = 0
    page_margins_map = []  # one Margins entry per page of pdf_doc
    for margin_file in margin_files:
        name = margin_file.name
        data = results[name]
        if not isinstance(data, bytes):
            # A non-bytes result is the error message from the failed render
            raise SystemExit(data)
        doc = data_as_pdf_doc(data)
        anchor_locations.update(get_anchor_locations(name, doc, num_pages + 1, links_page_uuid, log))
        doc_pages = doc.page_count()
        page_margins_map.extend(repeat(resolve_margins(margin_file.margins, page_layout), doc_pages))
        num_pages += doc_pages

        if pdf_doc is None:
            pdf_doc = doc
        else:
            pdf_doc.append(doc)

    page_number_display_map = get_page_number_display_map(manager, opts, num_pages, log)

    if has_toc:
        annotate_toc(toc, anchor_locations, name_anchor_map, log)
        if opts.pdf_add_toc:
            tocname = create_skeleton(container)
            root = container.parsed(tocname)
            add_pagenum_toc(root, toc, opts, page_number_display_map)
            container.commit()
            jobs = [job_for_name(container, tocname, None, page_layout)]
            results = manager.convert_html_files(jobs, settle_time=1)
            toc_data = results[tocname]
            # Same failure check as for the content and header/footer
            # renders: report the error message instead of trying to load it
            if not isinstance(toc_data, bytes):
                raise SystemExit(toc_data)
            tocdoc = data_as_pdf_doc(toc_data)
            page_margins_map.extend(repeat(resolve_margins(None, page_layout), tocdoc.page_count()))
            pdf_doc.append(tocdoc)

    report_progress(0.7, _('Rendered all HTML as PDF'))

    fix_links(pdf_doc, anchor_locations, name_anchor_map, opts.pdf_mark_links, log)
    if has_toc:
        add_toc(PDFOutlineRoot(pdf_doc), toc, log, pdf_doc)
    report_progress(0.75, _('Added links to PDF content'))

    pdf_metadata = PDFMetadata(metadata)
    add_header_footer(
        manager, opts, pdf_doc, container,
        page_number_display_map, page_layout, page_margins_map,
        pdf_metadata, report_progress, toc if has_toc else None)

    num_removed = remove_unused_fonts(pdf_doc)
    if num_removed:
        log('Removed', num_removed, 'unused fonts')

    merge_fonts(pdf_doc, log)
    num_removed = dedup_type3_fonts(pdf_doc)
    if num_removed:
        log('Removed', num_removed, 'duplicated Type3 glyphs')

    num_removed = pdf_doc.dedup_images()
    if num_removed:
        log('Removed', num_removed, 'duplicate images')

    if opts.pdf_odd_even_offset:
        # NOTE(review): the loop starts at index 1, so index 0 is never
        # offset -- confirm the page indexing used by get_page_box().
        for i in range(1, pdf_doc.page_count()):
            margins = page_margins_map[i]
            mult = -1 if i % 2 else 1
            val = opts.pdf_odd_even_offset
            # Only shift when the offset is smaller than both side margins
            if abs(val) < min(margins.left, margins.right):
                box = list(pdf_doc.get_page_box("CropBox", i))
                box[0] += val * mult
                pdf_doc.set_page_box("CropBox", i, *box)

    if cover_data:
        add_cover(pdf_doc, cover_data, page_layout, opts)

    if metadata is not None:
        update_metadata(pdf_doc, pdf_metadata)
    report_progress(1, _('Updated metadata in PDF'))

    if opts.uncompressed_pdf:
        pdf_doc.uncompress()

    pdf_data = pdf_doc.write()
    if output_path is None:
        return pdf_data
    with open(output_path, 'wb') as f:
        f.write(pdf_data)