# %PDF- %PDF-
# Direktori : /usr/lib/calibre/calibre/ebooks/pdf/ |
# Current File : //usr/lib/calibre/calibre/ebooks/pdf/html_writer.py |
#!/usr/bin/env python3
# License: GPL v3 Copyright: 2019, Kovid Goyal <kovid at kovidgoyal.net>

# Imports {{{
import copy
import json
import os
import signal
import sys
from collections import namedtuple
from html5_parser import parse
from io import BytesIO
from itertools import count, repeat
from qt.core import (
    QApplication, QMarginsF, QObject, QPageLayout, Qt, QTimer, QUrl, pyqtSignal, sip
)
from qt.webengine import (
    QWebEnginePage, QWebEngineProfile, QWebEngineUrlRequestInterceptor
)

from calibre import detect_ncpus, human_readable, prepare_string_for_xml
from calibre.constants import __version__, iswindows, ismacos
from calibre.ebooks.metadata.xmp import metadata_to_xmp_packet
from calibre.ebooks.oeb.base import XHTML, XPath
from calibre.ebooks.oeb.polish.container import Container as ContainerBase
from calibre.ebooks.oeb.polish.toc import get_toc
from calibre.ebooks.pdf.image_writer import (
    Image, PDFMetadata, draw_image_page, get_page_layout
)
from calibre.ebooks.pdf.render.serialize import PDFStream
from calibre.gui2 import setup_unix_signals
from calibre.gui2.webengine import secure_webengine
from calibre.srv.render_book import check_for_maths
from calibre.utils.fonts.sfnt.container import Sfnt, UnsupportedFont
from calibre.utils.fonts.sfnt.errors import NoGlyphs
from calibre.utils.fonts.sfnt.merge import merge_truetype_fonts_for_pdf
from calibre.utils.fonts.sfnt.subset import pdf_subset
from calibre.utils.logging import default_log
from calibre.utils.monotonic import monotonic
from calibre.utils.podofo import (
    dedup_type3_fonts, get_podofo, remove_unused_fonts, set_metadata_implementation
)
from calibre.utils.short_uuid import uuid4
from polyglot.builtins import iteritems
from polyglot.urllib import urlparse

# Exit codes passed to QApplication.exit() by the render loop
OK, KILL_SIGNAL = range(0, 2)
HANG_TIME = 60  # seconds
# NOTE(review): P() and _() are used below but never imported in this file;
# presumably they are injected as builtins by calibre's startup -- confirm.
# }}}


# Utils {{{
def data_as_pdf_doc(data):
    '''Load raw PDF bytes into a new podofo PDFDoc and return it.'''
    podofo = get_podofo()
    ans = podofo.PDFDoc()
    ans.load(data)
    return ans


def preprint_js():
    '''Return the pre-print JavaScript source, caching it on the function object.'''
    ans = getattr(preprint_js, 'ans', None)
    if ans is None:
        ans = preprint_js.ans = P('pdf-preprint.js', data=True).decode('utf-8').replace(
            'HYPHEN_CHAR', 'true' if ismacos else 'false', 1)
    return ans


def last_tag(root):
    '''Return the last element child of root (the <body> for an XHTML root).'''
    return tuple(root.iterchildren('*'))[-1]


def create_skeleton(container):
    '''Add a new file to the container that is a copy of the last spine item
    with an emptied last child element, and return the new file's name.'''
    spine_name = tuple(container.spine_names)[-1][0]
    root = container.parsed(spine_name)
    root = copy.deepcopy(root)
    body = last_tag(root)
    body.text = body.tail = None
    del body[:]
    name = container.add_file(spine_name, b'', modify_name_if_needed=True)
    container.replace(name, root)
    return name


def local_name(x):
    '''Return the lower-cased tag name with any `{namespace}` prefix removed.'''
    return x.split('}', 1)[-1].lower()


def fix_fullscreen_images(container):
    '''Rewrite 100% x 100% SVG image wrappers to use viewport units.

    Only spine files whose body has a single (non script/style) child that is
    a div or svg, and whose descendants are exactly [div,] svg, image, are
    changed: the svg gets width=100vw and height=100vh.
    '''

    def is_svg_fs_markup(names, svg):
        if svg is not None:
            if len(names) == 2 or len(names) == 3:
                if names[-1] == 'image' and names[-2] == 'svg':
                    if len(names) == 2 or names[0] == 'div':
                        if svg.get('width') == '100%' and svg.get('height') == '100%':
                            return True
        return False

    for file_name, is_linear in container.spine_names:
        root = container.parsed(file_name)
        root_kids = tuple(root.iterchildren('*'))
        if not root_kids:
            continue
        body = root_kids[-1]
        child_tags = []
        for child in body.iterchildren('*'):
            tag = local_name(child.tag)
            if tag in ('script', 'style'):
                continue
            child_tags.append(tag)
            if len(child_tags) > 1:
                # More than one real child: cannot be a full screen image page
                break
        if len(child_tags) == 1 and child_tags[0] in ('div', 'svg'):
            names = []
            svg = None
            for elem in body.iterdescendants('*'):
                name = local_name(elem.tag)
                if name != 'style' and name != 'script':
                    names.append(name)
                    if name == 'svg':
                        svg = elem
            if is_svg_fs_markup(names, svg):
                svg.set('width', '100vw')
                svg.set('height', '100vh')
                container.dirty(file_name)
# }}}


# Renderer {{{
class Container(ContainerBase):
    '''Book container rooted at the directory of the OPF file (or root_dir).'''

    tweak_mode = True
    is_dir = True

    def __init__(self, opf_path, log, root_dir=None):
        ContainerBase.__init__(self, root_dir or os.path.dirname(opf_path), opf_path, log)


class Renderer(QWebEnginePage):
    '''A web page that loads one local HTML file and prints it to PDF.

    Emits work_done(self, result) where result is the PDF data as bytes on
    success or an error message string when the load fails.
    '''

    work_done = pyqtSignal(object, object)

    def __init__(self, opts, parent, log):
        QWebEnginePage.__init__(self, parent.profile, parent)
        secure_webengine(self)
        self.working = False
        self.log = log
        self.load_complete = False
        self.settle_time = 0  # seconds, see load_finished()
        self.wait_for_title = None
        s = self.settings()
        s.setAttribute(s.JavascriptEnabled, True)
        s.setFontSize(s.DefaultFontSize, int(opts.pdf_default_font_size))
        s.setFontSize(s.DefaultFixedFontSize, int(opts.pdf_mono_font_size))
        s.setFontSize(s.MinimumLogicalFontSize, 8)
        s.setFontSize(s.MinimumFontSize, 8)
        std = {
            'serif': opts.pdf_serif_family,
            'sans' : opts.pdf_sans_family,
            'mono' : opts.pdf_mono_family
        }.get(opts.pdf_standard_font, opts.pdf_serif_family)
        if std:
            s.setFontFamily(s.StandardFont, std)
        if opts.pdf_serif_family:
            s.setFontFamily(s.SerifFont, opts.pdf_serif_family)
        if opts.pdf_sans_family:
            s.setFontFamily(s.SansSerifFont, opts.pdf_sans_family)
        if opts.pdf_mono_family:
            s.setFontFamily(s.FixedFont, opts.pdf_mono_family)

        self.titleChanged.connect(self.title_changed)
        self.loadStarted.connect(self.load_started)
        self.loadProgress.connect(self.load_progress)
        self.loadFinished.connect(self.load_finished)
        # Single shot watchdog, restarted on every progress notification
        self.load_hang_check_timer = t = QTimer(self)
        self.load_started_at = 0
        t.setTimerType(Qt.TimerType.VeryCoarseTimer)
        t.setInterval(HANG_TIME * 1000)
        t.setSingleShot(True)
        t.timeout.connect(self.on_load_hang)

    def load_started(self):
        self.load_started_at = monotonic()
        self.load_complete = False
        self.load_hang_check_timer.start()

    def load_progress(self, amt):
        # Any progress restarts the hang watchdog
        self.load_hang_check_timer.start()

    def on_load_hang(self):
        self.log(self.log_prefix, f'Loading not complete after {int(monotonic() - self.load_started_at)} seconds, aborting.')
        self.load_finished(False)

    def title_changed(self, title):
        '''Start printing once the awaited title appears after load completion.'''
        if self.wait_for_title and title == self.wait_for_title and self.load_complete:
            # settle_time is in seconds while QTimer.singleShot() takes
            # milliseconds; convert exactly as load_finished() does.
            QTimer.singleShot(int(1000 * self.settle_time), self.print_to_pdf)

    @property
    def log_prefix(self):
        return os.path.basename(self.url().toLocalFile()) + ':'

    def load_finished(self, ok):
        '''Handle end of load: report failure, wait for title, or schedule printing.'''
        self.load_complete = True
        self.load_hang_check_timer.stop()
        if not ok:
            self.working = False
            self.work_done.emit(self, f'Load of {self.url().toString()} failed')
            return
        if self.wait_for_title and self.title() != self.wait_for_title:
            self.log(self.log_prefix, 'Load finished, waiting for title to change to:', self.wait_for_title)
            return
        QTimer.singleShot(int(1000 * self.settle_time), self.print_to_pdf)

    def javaScriptConsoleMessage(self, level, message, linenum, source_id):
        try:
            self.log(f'{source_id}:{linenum}:{message}')
        except Exception:
            # Deliberately best-effort: a logging failure must not abort rendering
            pass

    def print_to_pdf(self):
        self.runJavaScript(preprint_js(), self.start_print)

    def start_print(self, *a):
        self.printToPdf(self.printing_done, self.page_layout)

    def printing_done(self, pdf_data):
        self.working = False
        if not sip.isdeleted(self):
            self.work_done.emit(self, bytes(pdf_data))

    def convert_html_file(self, path, page_layout, settle_time=0, wait_for_title=None):
        '''Begin loading the local file at path; the result arrives via work_done.

        settle_time is the delay in seconds between load completion and
        printing. If wait_for_title is set, printing is deferred until the
        page title equals it.
        '''
        self.working = True
        self.load_complete = False
        self.wait_for_title = wait_for_title
        self.settle_time = settle_time
        self.page_layout = page_layout
        self.setUrl(QUrl.fromLocalFile(path))


def _path_is_inside(path, root):
    '''Return True if path is root itself or lies below it.

    Both arguments are expected to already be normalized absolute paths. The
    comparison is made on a path separator boundary so that a sibling such as
    `<root>-other/x` is not mistaken for a child of `<root>`.
    '''
    if path == root:
        return True
    prefix = root if root.endswith(os.sep) else root + os.sep
    return path.startswith(prefix)


class RequestInterceptor(QWebEngineUrlRequestInterceptor):
    '''Block every request that is not a GET/HEAD of a file inside the book
    container or the resources directory.

    The attributes log, container_root and resources_root are assigned by
    RenderManager after construction.
    '''

    def interceptRequest(self, request_info):
        method = bytes(request_info.requestMethod())
        if method not in (b'GET', b'HEAD'):
            self.log.warn(f'Blocking URL request with method: {method}')
            request_info.block(True)
            return
        qurl = request_info.requestUrl()
        if qurl.scheme() != 'file':
            self.log.warn(f'Blocking URL request with scheme: {qurl.scheme()}')
            request_info.block(True)
            return
        path = qurl.toLocalFile()
        path = os.path.normcase(os.path.abspath(path))
        if not _path_is_inside(path, self.container_root) and not _path_is_inside(path, self.resources_root):
            self.log.warn(f'Blocking request with path: {path}')
            request_info.block(True)
            return


class RenderManager(QObject):
    '''Owns the web engine profile and a pool of Renderer workers, and runs
    the Qt event loop until all queued HTML files have been rendered.'''

    def __init__(self, opts, log, container_root):
        QObject.__init__(self)
        self.interceptor = RequestInterceptor(self)
        self.has_maths = {}
        self.interceptor.log = self.log = log
        self.interceptor.container_root = os.path.normcase(os.path.abspath(container_root))
        self.interceptor.resources_root = os.path.normcase(os.path.abspath(os.path.dirname(mathjax_dir())))
        ans = QWebEngineProfile(QApplication.instance())
        ua = 'calibre-pdf-output ' + __version__
        ans.setHttpUserAgent(ua)
        s = ans.settings()
        s.setDefaultTextEncoding('utf-8')
        try:
            ans.setUrlRequestInterceptor(self.interceptor)
        except AttributeError:
            # Fall back to the other setter name when the first is unavailable
            ans.setRequestInterceptor(self.interceptor)
        self.profile = ans
        self.opts = opts
        self.workers = []
        self.max_workers = detect_ncpus()
        if iswindows:
            self.original_signal_handlers = {}
        else:
            self.original_signal_handlers = setup_unix_signals(self)

    def create_worker(self):
        worker = Renderer(self.opts, self, self.log)
        worker.work_done.connect(self.work_done)
        self.workers.append(worker)

    def signal_received(self, read_fd):
        '''Drain the signal pipe and ask the event loop to exit with KILL_SIGNAL.'''
        try:
            os.read(read_fd, 1024)
        except OSError:
            return
        QApplication.instance().exit(KILL_SIGNAL)

    def block_signal_handlers(self):
        for sig in self.original_signal_handlers:
            signal.signal(sig, lambda x, y: None)

    def restore_signal_handlers(self):
        for sig, handler in self.original_signal_handlers.items():
            signal.signal(sig, handler)

    def run_loop(self):
        '''Run the Qt event loop with no-op signal handlers installed and
        return its exit code; the saved handlers are always restored.'''
        self.block_signal_handlers()
        try:
            return QApplication.exec()
        finally:
            self.restore_signal_handlers()

    def convert_html_files(self, jobs, settle_time=0, wait_for_title=None, has_maths=None):
        '''Render all jobs and return a dict mapping result key to result.

        Each job is an (html_file, page_layout, result_key) tuple. A result is
        PDF bytes on success or an error string on load failure. Raises
        SystemExit if the event loop exits with anything other than OK.
        '''
        self.has_maths = has_maths or {}
        while len(self.workers) < min(len(jobs), self.max_workers):
            self.create_worker()
        self.pending = list(jobs)
        self.results = {}
        self.settle_time = settle_time
        self.wait_for_title = wait_for_title
        QTimer.singleShot(0, self.assign_work)
        ret = self.run_loop()
        self.has_maths = {}
        if ret == KILL_SIGNAL:
            raise SystemExit('Kill signal received')
        if ret != OK:
            raise SystemExit('Unknown error occurred')
        return self.results

    def evaljs(self, js):
        '''Evaluate js in the first worker, blocking in the event loop until
        the result callback fires, and return the result.'''
        if not self.workers:
            self.create_worker()
        w = self.workers[0]
        self.evaljs_result = None
        w.runJavaScript(js, self.evaljs_callback)
        QApplication.exec()
        return self.evaljs_result

    def evaljs_callback(self, result):
        self.evaljs_result = result
        QApplication.instance().exit(0)

    def assign_work(self):
        '''Hand pending jobs to idle workers.'''
        free_workers = [w for w in self.workers if not w.working]
        while free_workers and self.pending:
            html_file, page_layout, result_key = self.pending.pop()
            w = free_workers.pop()
            w.result_key = result_key
            wait_for_title = self.wait_for_title
            settle_time = self.settle_time
            if self.has_maths.get(result_key):
                # Files with maths wait for the title set on MathJax
                # completion and get twice the settle time
                wait_for_title = 'mathjax-load-complete'
                settle_time *= 2
            w.convert_html_file(html_file, page_layout, settle_time=settle_time, wait_for_title=wait_for_title)

    def work_done(self, worker, result):
        '''Record a worker's result; exit the loop once nothing is left to do.'''
        self.results[worker.result_key] = result
        if self.pending:
            self.assign_work()
        else:
            for w in self.workers:
                if w.working:
                    return
            QApplication.instance().exit(OK)


def resolve_margins(margins, page_layout):
    '''Return a Margins tuple in which every side missing (None) from margins
    is taken from the page layout's margins in points. margins may be None.'''
    old_margins = page_layout.marginsPoints()

    def m(which):
        ans = getattr(margins, which, None)
        if ans is None:
            ans = getattr(old_margins, which)()
        return ans
    return Margins(*map(m, 'left top right bottom'.split()))


def job_for_name(container, name, margins, page_layout):
    '''Build the (path, page_layout, result_key) job tuple for a container
    file, using a copy of page_layout with overridden margins if margins is
    given.'''
    index_file = container.name_to_abspath(name)
    if margins:
        page_layout = QPageLayout(page_layout)
        page_layout.setUnits(QPageLayout.Unit.Point)
        new_margins = QMarginsF(*resolve_margins(margins, page_layout))
        page_layout.setMargins(new_margins)
    return index_file, page_layout, name
# }}}


# Metadata {{{
def update_metadata(pdf_doc, pdf_metadata):
    '''Write title, authors, producer, tags and an XMP packet into pdf_doc,
    if pdf_metadata carries a metadata object.'''
    if pdf_metadata.mi:
        xmp_packet = metadata_to_xmp_packet(pdf_metadata.mi)
        set_metadata_implementation(
            pdf_doc, pdf_metadata.title, pdf_metadata.mi.authors,
            pdf_metadata.mi.book_producer, pdf_metadata.mi.tags, xmp_packet)


def add_cover(pdf_doc, cover_data, page_layout, opts):
    '''Render the cover image onto a white page of the layout's full size and
    insert that page into pdf_doc.'''
    buf = BytesIO()
    page_size = page_layout.fullRectPoints().size()
    img = Image(cover_data)
    writer = PDFStream(buf, (page_size.width(), page_size.height()), compress=True)
    writer.apply_fill(color=(1, 1, 1))
    draw_image_page(writer, img, preserve_aspect_ratio=opts.preserve_cover_aspect_ratio)
    writer.end()
    cover_pdf_doc = data_as_pdf_doc(buf.getvalue())
    pdf_doc.insert_existing_page(cover_pdf_doc)
# }}}


# Margin groups {{{
Margins = namedtuple('Margins', 'left top right bottom')
MarginFile = namedtuple('MarginFile', 'name margins')


def dict_to_margins(val, d=None):
    '''Convert a mapping with optional left/top/right/bottom keys to Margins,
    using d for missing sides.'''
    return Margins(val.get('left', d), val.get('top', d), val.get('right', d), val.get('bottom', d))


def create_margin_files(container):
    '''Yield a MarginFile for every spine item. margins is parsed from the
    JSON in the root's data-calibre-pdf-output-page-margins attribute, or is
    left as the falsy attribute value when that is absent/empty.'''
    for name, is_linear in container.spine_names:
        root = container.parsed(name)
        margins = root.get('data-calibre-pdf-output-page-margins')
        if margins:
            margins = dict_to_margins(json.loads(margins))
        yield MarginFile(name, margins)
# }}}


# Link handling {{{
def add_anchors_markup(root, uuid, anchors):
    '''Append to the body a div (id=uuid, forced onto a new page) containing
    one link per anchor plus a final link to the div itself.'''
    body = last_tag(root)
    div = body.makeelement(
        XHTML('div'), id=uuid,
        style='display:block !important; page-break-before: always !important; break-before: always !important; white-space: pre-wrap !important'
    )
    div.text = '\n\n'
    body.append(div)
    c = count()

    def a(anchor):
        num = next(c)
        a = div.makeelement(
            XHTML('a'), href='#' + anchor,
            style='min-width: 10px !important; min-height: 10px !important;'
            ' border: solid 1px rgba(0, 0, 0, 0) !important; text-decoration: none !important'
        )
        a.text = a.tail = ' '
        if num % 8 == 0:
            # prevent too many anchors on a line as it causes chromium to
            # rescale the viewport
            a.tail = '\n'
        div.append(a)

    for anchor in anchors:
        a(anchor)
    a(uuid)


def add_all_links(container, margin_files):
    '''Add the anchors page markup to every margin file and return the uuid
    used as the id of the added div.'''
    uuid = uuid4()
    name_anchor_map = {}
    for name, is_linear in container.spine_names:
        root = container.parsed(name)
        name_anchor_map[name] = frozenset(root.xpath('//*/@id'))
    for margin_file in margin_files:
        name = margin_file.name
        anchors = name_anchor_map.get(name, set())
        add_anchors_markup(container.parsed(name), uuid, anchors)
        container.dirty(name)
    return uuid


def make_anchors_unique(container, log):
    '''Rename every id in the spine to a book-wide unique `a<N>` value and
    rewrite internal links to placeholder https://calibre-pdf-anchor.* URLs.

    Returns a dict mapping each spine file name to the id of its body element.
    '''
    mapping = {}
    count = 0
    base = None
    spine_names = set()

    def replacer(url):
        # NOTE(review): replacer.file_type is not set in this file; presumably
        # container.replace_links() assigns it before calling -- confirm.
        if replacer.file_type not in ('text', 'ncx'):
            return url
        if not url:
            return url
        if '#' not in url:
            url += '#'
        if url.startswith('#'):
            frag = url[1:]
            name = base
        else:
            href, frag = url.partition('#')[::2]
            name = container.href_to_name(href, base)
        if not name:
            return url.rstrip('#')
        if not frag and name in spine_names:
            replacer.replaced = True
            return 'https://calibre-pdf-anchor.n#' + name
        key = name, frag
        new_frag = mapping.get(key)
        if new_frag is None:
            if name in spine_names:
                log.warn(f'Link anchor: {name}#{frag} not found, linking to top of file instead')
                replacer.replaced = True
                return 'https://calibre-pdf-anchor.n#' + name
            return url.rstrip('#')
        replacer.replaced = True
        return 'https://calibre-pdf-anchor.a#' + new_frag

    name_anchor_map = {}
    for spine_name, is_linear in container.spine_names:
        spine_names.add(spine_name)
        root = container.parsed(spine_name)
        for elem in root.xpath('//*[@id]'):
            count += 1
            key = spine_name, elem.get('id')
            if key not in mapping:
                new_id = mapping[key] = f'a{count}'
                elem.set('id', new_id)
        body = last_tag(root)
        if not body.get('id'):
            count += 1
            body.set('id', f'a{count}')
        name_anchor_map[spine_name] = body.get('id')

    for name in container.mime_map:
        base = name  # read by replacer() through the closure
        replacer.replaced = False
        container.replace_links(name, replacer)
    return name_anchor_map


class AnchorLocation:
    '''Position of an anchor in the PDF: page number, left, top and zoom.'''

    __slots__ = ('pagenum', 'left', 'top', 'zoom')

    def __init__(self, pagenum=1, left=0, top=0, zoom=0):
        self.pagenum, self.left, self.top, self.zoom = pagenum, left, top, zoom

    def __repr__(self):
        return 'AnchorLocation(pagenum={}, left={}, top={}, zoom={})'.format(*self.as_tuple)

    @property
    def as_tuple(self):
        return self.pagenum, self.left, self.top, self.zoom


def get_anchor_locations(name, pdf_doc, first_page_num, toc_uuid, log):
    '''Extract anchor locations from a rendered file and strip the anchors page.

    Pages from the one holding the toc_uuid anchor to the end of pdf_doc are
    deleted. Returned page numbers are offset so that the first page of
    pdf_doc is first_page_num.
    '''
    ans = {}
    anchors = pdf_doc.extract_anchors()
    try:
        toc_pagenum = anchors.pop(toc_uuid)[0]
    except KeyError:
        toc_pagenum = None
    if toc_pagenum is None:
        log.warn(f'Failed to find ToC anchor in {name}')
        toc_pagenum = 0
    if toc_pagenum > 1:
        pdf_doc.delete_pages(toc_pagenum, pdf_doc.page_count() - toc_pagenum + 1)
    for anchor, loc in iteritems(anchors):
        loc = list(loc)
        loc[0] += first_page_num - 1
        ans[anchor] = AnchorLocation(*loc)
    return ans


def fix_links(pdf_doc, anchor_locations, name_anchor_map, mark_links, log):
    '''Replace the placeholder anchor URLs in pdf_doc with real locations.'''

    def replace_link(url):
        purl = urlparse(url)
        if purl.scheme != 'https' or purl.netloc not in ('calibre-pdf-anchor.a', 'calibre-pdf-anchor.n'):
            return
        loc = None
        if purl.netloc == 'calibre-pdf-anchor.a':
            # Fragment is a unique anchor id
            loc = anchor_locations.get(purl.fragment)
            if loc is None:
                log.warn(f'Anchor location for link to {purl.fragment} not found')
        else:
            # Fragment is a file name, link to the top of that file
            loc = anchor_locations.get(name_anchor_map.get(purl.fragment))
            if loc is None:
                log.warn(f'Anchor location for link to {purl.fragment} not found')
        return None if loc is None else loc.as_tuple

    pdf_doc.alter_links(replace_link, mark_links)
# }}}


# Outline creation {{{
class PDFOutlineRoot:
    '''Adapter giving the PDF outline root the same create() signature as the
    outline items it returns.'''

    def __init__(self, pdf_doc):
        self.pdf_doc = pdf_doc
        self.root_item = None

    def create(self, title, pagenum, as_child, left, top, zoom):
        '''Create a top-level outline item; as_child is ignored here. The
        first item creates the outline, later ones are created from the
        previously created item with as_child=False.'''
        if self.root_item is None:
            self.root_item = self.pdf_doc.create_outline(title, pagenum, left, top, zoom)
        else:
            self.root_item = self.root_item.create(title, pagenum, False, left, top, zoom)
        return self.root_item


def annotate_toc(toc, anchor_locations, name_anchor_map, log):
    '''Set pdf_loc on every ToC node, falling back to the top of page 1 when
    the node's location cannot be found.'''
    for child in toc.iterdescendants():
        frag = child.frag
        try:
            if '.' in frag:
                # A fragment containing a dot is treated as a file name
                loc = anchor_locations[name_anchor_map[frag]]
            else:
                loc = anchor_locations[frag]
        except Exception:
            log.warn(f'Could not find anchor location for ToC entry: {child.title} with href: {frag}')
            loc = AnchorLocation(1, 0, 0, 0)
        child.pdf_loc = loc


def add_toc(pdf_parent, toc_parent, log, pdf_doc):
    '''Recursively mirror the ToC tree under toc_parent into the PDF outline.'''
    for child in toc_parent:
        title, loc = child.title, child.pdf_loc
        try:
            pdf_child = pdf_parent.create(title, loc.pagenum, True, loc.left, loc.top, loc.zoom)
        except ValueError:
            if loc.pagenum > 1:
                log.warn(f'TOC node: {title} at page: {loc.pagenum} is beyond end of file, moving it to last page')
                pdf_child = pdf_parent.create(title, pdf_doc.page_count(), True, loc.left, loc.top, loc.zoom)
            else:
                log.warn(f'Ignoring TOC node: {title} at page: {loc.pagenum}')
                continue
        if len(child):
            add_toc(pdf_child, child, log, pdf_doc)


def get_page_number_display_map(render_manager, opts, num_pages, log):
    '''Return a dict mapping physical page number to displayed page number.

    The map covers 1..2*num_pages. It is the identity unless
    opts.pdf_page_number_map is set, in which case that expression is
    evaluated in JavaScript for every page; on any failure the identity map
    is returned and a warning logged.
    '''
    num_pages *= 2
    default_map = {n:n for n in range(1, num_pages + 1)}
    if opts.pdf_page_number_map:
        # NOTE(review): the user supplied expression is passed to eval() inside
        # the rendering web page, not to Python's eval.
        js = '''
        function map_num(n) { return eval(MAP_EXPRESSION); }
        var ans = {};
        for (var i=1; i <= NUM_PAGES; i++) ans[i] = map_num(i);
        JSON.stringify(ans);
        '''.replace('MAP_EXPRESSION', json.dumps(opts.pdf_page_number_map), 1).replace(
                'NUM_PAGES', str(num_pages), 1)
        result = render_manager.evaljs(js)
        try:
            result = json.loads(result)
            if not isinstance(result, dict):
                raise ValueError('Not a dict')
        except Exception:
            log.warn(f'Could not do page number mapping, got unexpected result: {repr(result)}')
        else:
            default_map = {int(k): int(v) for k, v in iteritems(result)}
    return default_map


def add_pagenum_toc(root, toc, opts, page_number_display_map):
    '''Fill the body of root with a styled table listing every ToC entry and
    its displayed page number.'''
    body = last_tag(root)
    indents = []
    for i in range(1, 7):
        indents.extend((i, 1.4*i))

    css = '''
.calibre-pdf-toc table { width: 100%% }

.calibre-pdf-toc table tr td:last-of-type { text-align: right }

.calibre-pdf-toc .level-0 {
    font-size: larger;
}

.calibre-pdf-toc .level-%d td:first-of-type { padding-left: %.1gem }
.calibre-pdf-toc .level-%d td:first-of-type { padding-left: %.1gem }
.calibre-pdf-toc .level-%d td:first-of-type { padding-left: %.1gem }
.calibre-pdf-toc .level-%d td:first-of-type { padding-left: %.1gem }
.calibre-pdf-toc .level-%d td:first-of-type { padding-left: %.1gem }
.calibre-pdf-toc .level-%d td:first-of-type { padding-left: %.1gem }
    ''' % tuple(indents) + (opts.extra_css or '')
    style = body.makeelement(XHTML('style'), type='text/css')
    style.text = css
    body.append(style)
    body.set('class', 'calibre-pdf-toc')

    def E(tag, cls=None, text=None, tail=None, parent=None, **attrs):
        ans = body.makeelement(XHTML(tag), **attrs)
        ans.text, ans.tail = text, tail
        if cls is not None:
            ans.set('class', cls)
        if parent is not None:
            parent.append(ans)
        return ans

    E('h2', text=(opts.toc_title or _('Table of Contents')), parent=body)
    table = E('table', parent=body)
    for level, node in toc.iterdescendants(level=0):
        tr = E('tr', cls='level-%d' % level, parent=table)
        E('td', text=node.title or _('Unknown'), parent=tr)
        num = node.pdf_loc.pagenum
        num = page_number_display_map.get(num, num)
        E('td', text=f'{num}', parent=tr)
# }}}


# Fonts {{{
def all_glyph_ids_in_w_arrays(arrays, as_set=False):
    '''Collect all glyph ids referenced by the given W arrays.

    Each array is read as a sequence of either `first [w1 w2 ...]` or
    `first last w` entries. Returns a set if as_set is True, otherwise a
    sorted list.
    '''
    ans = set()
    for w in arrays:
        i = 0
        while i + 1 < len(w):
            elem = w[i]
            next_elem = w[i+1]
            if isinstance(next_elem, list):
                ans |= set(range(elem, elem + len(next_elem)))
                i += 2
            else:
                ans |= set(range(elem, next_elem + 1))
                i += 3
    return ans if as_set else sorted(ans)


def fonts_are_identical(fonts):
    '''Return True if all fonts share the same ToUnicode, Data, W and W2 values.'''
    sentinel = object()
    for key in ('ToUnicode', 'Data', 'W', 'W2'):
        prev_val = sentinel
        for f in fonts:
            val = f[key]
            if prev_val is not sentinel and prev_val != val:
                return False
            prev_val = val
    return True


def merge_font_files(fonts, log):
    '''Merge the sfnt data of all non-Type0 fonts into one subsetted stream.

    Sorts fonts in place (largest data first). Returns (font_data,
    references) where references are the 'Reference' values of the merged
    fonts.
    '''
    # As of Qt 5.15.1 Chromium has switched to harfbuzz and dropped sfntly. It
    # now produces font descriptors whose W arrays dont match the glyph width
    # information from the hhea table, in contravention of the PDF spec. So
    # we can no longer merge font descriptors, all we can do is merge the
    # actual sfnt data streams into a single stream and subset it to contain
    # only the glyphs from all W arrays.
    # choose the largest font as the base font
    fonts.sort(key=lambda f: len(f['Data'] or b''), reverse=True)
    descendant_fonts = [f for f in fonts if f['Subtype'] != 'Type0']
    total_size = sum(len(f['Data']) for f in descendant_fonts)
    merged_sfnt = merge_truetype_fonts_for_pdf(tuple(f['sfnt'] for f in descendant_fonts), log)
    w_arrays = tuple(filter(None, (f['W'] for f in descendant_fonts)))
    glyph_ids = all_glyph_ids_in_w_arrays(w_arrays, as_set=True)
    h_arrays = tuple(filter(None, (f['W2'] for f in descendant_fonts)))
    glyph_ids |= all_glyph_ids_in_w_arrays(h_arrays, as_set=True)
    try:
        pdf_subset(merged_sfnt, glyph_ids)
    except NoGlyphs:
        log.warn(f'Subsetting of {fonts[0]["BaseFont"]} failed with no glyphs found, ignoring')
    font_data = merged_sfnt()[0]
    log(f'Merged {len(fonts)} instances of {fonts[0]["BaseFont"]} reducing size from {human_readable(total_size)} to {human_readable(len(font_data))}')
    return font_data, tuple(f['Reference'] for f in descendant_fonts)


def merge_fonts(pdf_doc, log):
    '''Merge the font files of fonts in pdf_doc that share a BaseFont, for
    every group that passes the mergeable() check.'''
    all_fonts = pdf_doc.list_fonts(True)
    base_font_map = {}

    def mergeable(fonts):
        # Requires at least one Type0 font, Identity-* encodings on all Type0
        # fonts, and parseable glyf based sfnt data on all other fonts. As a
        # side effect the parsed Sfnt is stored in font['sfnt'].
        has_type0 = False
        for font in fonts:
            if font['Subtype'] == 'Type0':
                has_type0 = True
                if not font['Encoding'] or not font['Encoding'].startswith('Identity-'):
                    return False
            else:
                if not font['Data']:
                    return False
                try:
                    sfnt = Sfnt(font['Data'])
                except UnsupportedFont:
                    return False
                font['sfnt'] = sfnt
                if b'glyf' not in sfnt:
                    return False
        return has_type0

    for f in all_fonts:
        base_font_map.setdefault(f['BaseFont'], []).append(f)

    for name, fonts in iteritems(base_font_map):
        if mergeable(fonts):
            font_data, references = merge_font_files(fonts, log)
            pdf_doc.merge_fonts(font_data, references)


def test_merge_fonts():
    '''Manual test: merge fonts in the PDF named by the last command line
    argument and save the result next to it as `<name>-merged.pdf`.'''
    path = sys.argv[-1]
    podofo = get_podofo()
    pdf_doc = podofo.PDFDoc()
    pdf_doc.open(path)
    merge_fonts(pdf_doc, default_log)
    out = path.rpartition('.')[0] + '-merged.pdf'
    pdf_doc.save(out)
    print('Merged PDF written to', out)
# }}}


# Header/footer {{{
PAGE_NUMBER_TEMPLATE = '<footer><div style="margin: auto">_PAGENUM_</div></footer>'


def add_header_footer(manager, opts, pdf_doc, container, page_number_display_map, page_layout, page_margins_map, pdf_metadata, report_progress, toc=None):
    '''Render headers/footers for every page of pdf_doc and impose them on it.

    A skeleton HTML file with one full-page div per document page is built
    from the header and footer templates, rendered to PDF with zero margins,
    appended to pdf_doc and imposed onto the existing pages. Does nothing if
    no header or footer is configured.

    Raises SystemExit if rendering fails and ValueError if the rendered page
    count differs from the document page count.
    '''
    header_template, footer_template = opts.pdf_header_template, opts.pdf_footer_template
    if not footer_template and opts.pdf_page_numbers:
        footer_template = PAGE_NUMBER_TEMPLATE
    if not header_template and not footer_template:
        return
    report_progress(0.8, _('Adding headers and footers'))
    name = create_skeleton(container)
    root = container.parsed(name)
    reset_css = 'margin: 0; padding: 0; border-width: 0; background-color: unset;'
    root.set('style', reset_css)
    body = last_tag(root)
    body.attrib.pop('id', None)
    body.set('style', reset_css)
    job = job_for_name(container, name, Margins(0, 0, 0, 0), page_layout)

    def m(tag_name, text=None, style=None, **attrs):
        ans = root.makeelement(XHTML(tag_name), **attrs)
        if text is not None:
            ans.text = text
        if style is not None:
            style = '; '.join(f'{k}: {v}' for k, v in iteritems(style))
            ans.set('style', style)
        return ans

    justify = 'flex-end'
    if header_template:
        justify = 'space-between' if footer_template else 'flex-start'

    def create_toc_stack(iterator):
        # List of (level, pagenum, title) for nodes with a valid location
        ans = []
        for level, child in iterator:
            pdf_loc = getattr(child, 'pdf_loc', None)
            if pdf_loc is not None and pdf_loc.pagenum > 0:
                ans.append((level, pdf_loc.pagenum, child.title))
        return ans

    def stack_to_map(stack):
        # One title per document page: the section title in effect on that page
        ans = []
        stack_pos = 0
        current, page_for_current, level_for_current = '', -1, -1
        stack_len = len(stack)
        for page in range(1, pdf_doc.page_count() + 1):
            while stack_pos < stack_len:
                level, pagenum, title = stack[stack_pos]
                if pagenum != page:
                    break
                if pagenum != page_for_current or level > level_for_current:
                    page_for_current = pagenum
                    level_for_current = level
                    current = title
                stack_pos += 1
            ans.append(current)
        return ans

    def page_counts_map(iterator):
        # Returns two per-page lists: the number of pages in the section the
        # page belongs to, and the page's 1-based position in that section
        pagenums = []
        for level, child in iterator:
            pdf_loc = getattr(child, 'pdf_loc', None)
            if pdf_loc is not None and pdf_loc.pagenum > 0:
                pagenums.append(pdf_loc.pagenum)
        stack = []
        for i, pagenum in enumerate(pagenums):
            next_page_num = pagenums[i + 1] if i + 1 < len(pagenums) else (pdf_doc.page_count() + 1)
            stack.append((pagenum, next_page_num - pagenum))
        totals = []
        section_nums = []
        stack_len = len(stack)
        stack_pos = 0
        current, page_for_current, counter = 0, -1, 0
        for page in range(1, pdf_doc.page_count() + 1):
            while stack_pos < stack_len:
                pagenum, pages = stack[stack_pos]
                if pagenum != page:
                    break
                if pagenum != page_for_current:
                    current = pages
                    page_for_current = pagenum
                    counter = 0
                stack_pos += 1
            counter += 1
            totals.append(current)
            section_nums.append(counter)
        return totals, section_nums

    # Note the naming: toplevel_pagenum_map holds the per-section page totals
    # and toplevel_pages_map the page's position within its section.
    if toc is None:
        page_toc_map = stack_to_map(())
        toplevel_toc_map = stack_to_map(())
        toplevel_pagenum_map, toplevel_pages_map = page_counts_map(())
    else:
        page_toc_map = stack_to_map(create_toc_stack(toc.iterdescendants(level=0)))

        def tc():
            for x in toc:
                yield 0, x

        toplevel_toc_map = stack_to_map(create_toc_stack(tc()))
        toplevel_pagenum_map, toplevel_pages_map = page_counts_map(tc())

    def create_container(page_num, margins):
        style = {
            'page-break-inside': 'avoid',
            'page-break-after': 'always',
            'display': 'flex',
            'flex-direction': 'column',
            'height': '100vh',
            'justify-content': justify,
            'margin-left': f'{margins.left}pt',
            'margin-right': f'{margins.right}pt',
            'margin-top': '0',
            'margin-bottom': '0',
            'padding': '0',
            'border-width': '0',
            'overflow': 'hidden',
            'background-color': 'unset',
        }
        ans = m('div', style=style, id=f'p{page_num}')
        return ans

    def format_template(template, page_num, height):
        # Longer placeholders are replaced before the shorter ones they contain
        template = template.replace('_TOP_LEVEL_SECTION_PAGES_', str(toplevel_pagenum_map[page_num - 1]))
        template = template.replace('_TOP_LEVEL_SECTION_PAGENUM_', str(toplevel_pages_map[page_num - 1]))
        template = template.replace('_TOTAL_PAGES_', str(pages_in_doc))
        # Fall back to the physical page number for unmapped pages, as
        # add_pagenum_toc() does
        template = template.replace('_PAGENUM_', str(page_number_display_map.get(page_num, page_num)))
        template = template.replace('_TITLE_', prepare_string_for_xml(pdf_metadata.title, True))
        template = template.replace('_AUTHOR_', prepare_string_for_xml(pdf_metadata.author, True))
        template = template.replace('_TOP_LEVEL_SECTION_', prepare_string_for_xml(toplevel_toc_map[page_num - 1]))
        template = template.replace('_SECTION_', prepare_string_for_xml(page_toc_map[page_num - 1]))
        troot = parse(template, namespace_elements=True)
        ans = last_tag(troot)[0]
        style = ans.get('style') or ''
        style = (
            'margin: 0; padding: 0; height: {height}pt; border-width: 0;'
            'display: flex; align-items: center; overflow: hidden; background-color: unset;').format(height=height) + style
        ans.set('style', style)
        for child in ans.xpath('descendant-or-self::*[@class]'):
            cls = frozenset(child.get('class').split())
            # Hide elements marked for the other page parity
            q = 'even-page' if page_num % 2 else 'odd-page'
            if q in cls or q.replace('-', '_') in cls:
                style = child.get('style') or ''
                child.set('style', style + '; display: none')
        return ans

    pages_in_doc = pdf_doc.page_count()

    for page_num in range(1, pages_in_doc + 1):
        margins = page_margins_map[page_num - 1]
        div = create_container(page_num, margins)
        body.append(div)
        if header_template:
            div.append(format_template(header_template, page_num, margins.top))
        if footer_template:
            div.append(format_template(footer_template, page_num, margins.bottom))

    container.commit()
    # print(open(job[0]).read())
    results = manager.convert_html_files([job], settle_time=1)
    data = results[name]
    if not isinstance(data, bytes):
        raise SystemExit(data)
    # open('/t/impose.pdf', 'wb').write(data)
    doc = data_as_pdf_doc(data)
    first_page_num = pdf_doc.page_count()
    num_pages = doc.page_count()
    if first_page_num != num_pages:
        raise ValueError('The number of header/footers pages ({}) != number of document pages ({})'.format(
            num_pages, first_page_num))
    pdf_doc.append(doc)
    pdf_doc.impose(1, first_page_num + 1, num_pages)
    report_progress(0.9, _('Headers and footers added'))
# }}}


# Maths {{{
def mathjax_dir():
    return P('mathjax', allow_user_override=False)


def path_to_url(path):
    return QUrl.fromLocalFile(path).toString()


def add_maths_script(container):
    '''Append the MathJax loader script to every spine file that contains
    maths. Returns a dict mapping spine name to the check_for_maths() result.'''
    has_maths = {}
    for name, is_linear in container.spine_names:
        root = container.parsed(name)
        has_maths[name] = hm = check_for_maths(root)
        if not hm:
            continue
        script = root.makeelement(XHTML('script'), type="text/javascript", src=path_to_url(
            P('pdf-mathjax-loader.js', allow_user_override=False)))
        script.set('async', 'async')
        script.set('data-mathjax-path', path_to_url(mathjax_dir()))
        last_tag(root).append(script)
    return has_maths
# }}}


def fix_markup(container):
    '''Turn every <canvas> element in the spine into a <div>.'''
    xp = XPath('//h:canvas')
    for file_name, is_linear in container.spine_names:
        root = container.parsed(file_name)
        for canvas in xp(root):
            # Canvas causes rendering issues, see https://bugs.launchpad.net/bugs/1859040
            # for an example.
            canvas.tag = XHTML('div')


def convert(opf_path, opts, metadata=None, output_path=None, log=default_log, cover_data=None, report_progress=lambda x, y: None):
    '''Convert the book described by the OPF file at opf_path to PDF.

    Returns the PDF data as bytes when output_path is None, otherwise writes
    the data to output_path and returns None. Raises SystemExit if rendering
    of any HTML file fails.
    '''
    container = Container(opf_path, log)
    fix_markup(container)
    report_progress(0.05, _('Parsed all content for markup transformation'))
    if opts.pdf_hyphenate:
        from calibre.ebooks.oeb.polish.hyphenation import add_soft_hyphens
        add_soft_hyphens(container)
    has_maths = add_maths_script(container)
    fix_fullscreen_images(container)

    name_anchor_map = make_anchors_unique(container, log)
    margin_files = tuple(create_margin_files(container))
    toc = get_toc(container, verify_destinations=False)
    has_toc = toc and len(toc)
    links_page_uuid = add_all_links(container, margin_files)
    container.commit()
    report_progress(0.1, _('Completed markup transformation'))

    manager = RenderManager(opts, log, container.root)
    page_layout = get_page_layout(opts)
    pdf_doc = None
    anchor_locations = {}
    jobs = []
    for margin_file in margin_files:
        jobs.append(job_for_name(container, margin_file.name, margin_file.margins, page_layout))
    results = manager.convert_html_files(jobs, settle_time=1, has_maths=has_maths)
    num_pages = 0
    page_margins_map = []
    for margin_file in margin_files:
        name = margin_file.name
        data = results[name]
        if not isinstance(data, bytes):
            # A non-bytes result is an error message from the renderer
            raise SystemExit(data)
        doc = data_as_pdf_doc(data)
        anchor_locations.update(get_anchor_locations(name, doc, num_pages + 1, links_page_uuid, log))
        doc_pages = doc.page_count()
        page_margins_map.extend(repeat(resolve_margins(margin_file.margins, page_layout), doc_pages))
        num_pages += doc_pages

        if pdf_doc is None:
            pdf_doc = doc
        else:
            pdf_doc.append(doc)

    page_number_display_map = get_page_number_display_map(manager, opts, num_pages, log)

    if has_toc:
        annotate_toc(toc, anchor_locations, name_anchor_map, log)
        if opts.pdf_add_toc:
            tocname = create_skeleton(container)
            root = container.parsed(tocname)
            add_pagenum_toc(root, toc, opts, page_number_display_map)
            container.commit()
            jobs = [job_for_name(container, tocname, None, page_layout)]
            results = manager.convert_html_files(jobs, settle_time=1)
            tocdata = results[tocname]
            if not isinstance(tocdata, bytes):
                # Same failure handling as for the content files above
                raise SystemExit(tocdata)
            tocdoc = data_as_pdf_doc(tocdata)
            page_margins_map.extend(repeat(resolve_margins(None, page_layout), tocdoc.page_count()))
            pdf_doc.append(tocdoc)

    report_progress(0.7, _('Rendered all HTML as PDF'))

    fix_links(pdf_doc, anchor_locations, name_anchor_map, opts.pdf_mark_links, log)
    if toc and len(toc):
        add_toc(PDFOutlineRoot(pdf_doc), toc, log, pdf_doc)
    report_progress(0.75, _('Added links to PDF content'))

    pdf_metadata = PDFMetadata(metadata)
    add_header_footer(
        manager, opts, pdf_doc, container,
        page_number_display_map, page_layout, page_margins_map,
        pdf_metadata, report_progress, toc if has_toc else None)

    num_removed = remove_unused_fonts(pdf_doc)
    if num_removed:
        log('Removed', num_removed, 'unused fonts')

    merge_fonts(pdf_doc, log)
    num_removed = dedup_type3_fonts(pdf_doc)
    if num_removed:
        log('Removed', num_removed, 'duplicated Type3 glyphs')

    num_removed = pdf_doc.dedup_images()
    if num_removed:
        log('Removed', num_removed, 'duplicate images')

    if opts.pdf_odd_even_offset:
        # NOTE(review): the loop starts at 1 and uses i both as the page
        # argument and as the page_margins_map index -- confirm the page
        # numbering expected by get_page_box()/set_page_box().
        for i in range(1, pdf_doc.page_count()):
            margins = page_margins_map[i]
            mult = -1 if i % 2 else 1
            val = opts.pdf_odd_even_offset
            if abs(val) < min(margins.left, margins.right):
                box = list(pdf_doc.get_page_box("CropBox", i))
                box[0] += val * mult
                pdf_doc.set_page_box("CropBox", i, *box)

    if cover_data:
        add_cover(pdf_doc, cover_data, page_layout, opts)

    if metadata is not None:
        update_metadata(pdf_doc, pdf_metadata)
    report_progress(1, _('Updated metadata in PDF'))

    if opts.uncompressed_pdf:
        pdf_doc.uncompress()

    pdf_data = pdf_doc.write()
    if output_path is None:
        return pdf_data
    with open(output_path, 'wb') as f:
        f.write(pdf_data)