%PDF- %PDF-
Direktori : /lib/calibre/calibre/ebooks/oeb/transforms/ |
Current File : //lib/calibre/calibre/ebooks/oeb/transforms/structure.py |
#!/usr/bin/env python3


__license__   = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

import re
import uuid

from lxml import etree
from collections import OrderedDict, Counter

from calibre.ebooks.oeb.base import XPNSMAP, TOC, XHTML, xml2text, barename
from calibre.ebooks import ConversionError
from polyglot.builtins import itervalues
from polyglot.urllib import urlparse


def XPath(x):
    """Compile the XPath expression ``x`` with the ``XPNSMAP`` namespaces.

    Raises ``ConversionError`` (chained to the underlying lxml error) when
    lxml reports a syntax error in the expression.
    """
    try:
        return etree.XPath(x, namespaces=XPNSMAP)
    except etree.XPathSyntaxError as err:
        raise ConversionError(
            'The syntax of the XPath expression %s is invalid.' % repr(x)) from err


def isspace(x):
    """Return True if ``x`` is empty/None or consists only of whitespace.

    No-break spaces (U+00A0) count as whitespace. The test is written with
    ``strip()`` rather than ``str.isspace()`` because removing the no-break
    spaces first can leave an empty string, and ``''.isspace()`` is False,
    which would wrongly report a string of only no-break spaces as content.
    """
    return not x or not x.replace('\xa0', '').strip()


def at_start(elem):
    """Return True if there is no content before ``elem`` in its body."""
    body = XPath('ancestor-or-self::h:body')(elem)
    if not body:
        return True
    body = body[0]
    ancestors = frozenset(XPath('ancestor::*')(elem))
    for x in body.iter():
        if x is elem:
            return True
        # Images count as content even though they carry no text. The
        # hasattr() check skips nodes whose tag is not a string.
        if hasattr(getattr(x, 'tag', None), 'rpartition') and x.tag.rpartition('}')[-1] in {'img', 'svg'}:
            return False
        # An ancestor's tail comes after elem, so only its text matters; for
        # any other node both its text and its tail precede elem.
        if isspace(getattr(x, 'text', None)) and (x in ancestors or isspace(getattr(x, 'tail', None))):
            continue
        return False
    return False


class DetectStructure:
    """Detect chapters in a book and build or clean up its table of contents."""

    def __call__(self, oeb, opts):
        """Run structure detection on ``oeb`` as configured by ``opts``.

        Detects chapters, optionally regenerates the TOC, filters TOC
        entries, inserts page breaks, names untitled TOC entries and sets
        the start-reading guide entry.
        """
        self.log = oeb.log
        self.oeb = oeb
        self.opts = opts
        self.log('Detecting structure...')

        self.detect_chapters()
        if self.oeb.auto_generated_toc or opts.use_auto_toc:
            orig_toc = self.oeb.toc
            self.oeb.toc = TOC()
            self.create_level_based_toc()
            if self.oeb.toc.count() < 1:
                if not opts.no_chapters_in_toc and self.detected_chapters:
                    self.create_toc_from_chapters()
                if self.oeb.toc.count() < opts.toc_threshold:
                    self.create_toc_from_links()
            if self.oeb.toc.count() < 2 and orig_toc.count() > 2:
                # The generated TOC is worse than the one we started with
                self.oeb.toc = orig_toc
            else:
                self.oeb.auto_generated_toc = True
                self.log('Auto generated TOC with %d entries.' %
                         self.oeb.toc.count())

        if opts.toc_filter is not None:
            regexp = re.compile(opts.toc_filter)
            # Iterate over a copy as nodes are removed during the loop
            for node in list(self.oeb.toc.iter()):
                if not node.title or regexp.search(node.title) is not None:
                    self.log('Filtering', node.title or 'empty node',
                             'from TOC')
                    self.oeb.toc.remove(node)

        if opts.page_breaks_before is not None:
            pb_xpath = XPath(opts.page_breaks_before)
            for item in oeb.spine:
                for elem in pb_xpath(item.data):
                    try:
                        prev = next(elem.itersiblings(tag=etree.Element,
                                                      preceding=True))
                        if (barename(elem.tag) in {'h1', 'h2'} and
                                barename(prev.tag) in {'h1', 'h2'} and
                                (not prev.tail or not prev.tail.split())):
                            # We have two adjacent headings, do not put a page
                            # break on the second one
                            continue
                    except StopIteration:
                        # No preceding sibling element
                        pass

                    style = elem.get('style', '')
                    if style:
                        style += '; '
                    elem.set('style', style+'page-break-before:always')

        for node in self.oeb.toc.iter():
            if not node.title or not node.title.strip():
                # NOTE(review): `_` is not imported in this file; presumably
                # the translation function installed as a builtin - confirm.
                node.title = _('Unnamed')

        if self.opts.start_reading_at:
            self.detect_start_reading()

    def detect_start_reading(self):
        """Point the 'text' guide entry at the first match of start_reading_at.

        Logs a warning and changes nothing if the expression is invalid or
        matches nothing in the spine.
        """
        expr = self.opts.start_reading_at
        try:
            expr = XPath(expr)
        except Exception:
            # expr is still the original string here, as the assignment failed
            self.log.warn(
                'Invalid start reading at XPath expression, ignoring: %s'%expr)
            return
        for item in self.oeb.spine:
            if not hasattr(item.data, 'xpath'):
                continue
            matches = expr(item.data)
            if matches:
                elem = matches[0]
                eid = elem.get('id', None)
                if not eid:
                    eid = 'start_reading_at_' + uuid.uuid4().hex
                    elem.set('id', eid)
                if 'text' in self.oeb.guide:
                    self.oeb.guide.remove('text')
                self.oeb.guide.add('text', 'Start', item.href+'#'+eid)
                self.log('Setting start reading at position to %s in %s'%(
                    self.opts.start_reading_at, item.href))
                return
        self.log.warn("Failed to find start reading at position: %s"%
                      self.opts.start_reading_at)

    def get_toc_parts_for_xpath(self, expr):
        """Split ``expr`` into ``(element_xpath, title_attribute)``.

        ``title_attribute`` is None unless ``expr`` ends in ``/@name``.
        """
        # if an attribute is selected by the xpath expr then truncate it
        # from the path and instead return it as where to find the title text
        title_attribute_regex = re.compile(r'/@([-\w]+)$')
        match = title_attribute_regex.search(expr)
        if match is not None:
            return expr[0:match.start()], match.group(1)
        return expr, None

    def _find_matches(self, expr, doc, warning):
        """Return the matches of XPath ``expr`` in ``doc``.

        On any failure, log ``warning % expr`` and return an empty list.
        """
        try:
            ans = XPath(expr)(doc)
            # Results without a length (e.g. numbers, booleans) cannot be
            # used as a match list, so treat them as invalid as well
            len(ans)
        except Exception:
            self.log.warn(warning % expr)
            return []
        return ans

    def detect_chapters(self):
        """Find chapters with the ``chapter`` XPath option and mark them.

        Fills ``self.detected_chapters`` with ``(spine item, element)`` pairs
        and sets ``self.chapter_title_attribute``. Depending on the
        ``chapter_mark`` option, a rule and/or page break element is inserted
        before each detected chapter.
        """
        self.detected_chapters = []
        self.chapter_title_attribute = None

        if not self.opts.chapter:
            return

        chapter_path, title_attribute = self.get_toc_parts_for_xpath(self.opts.chapter)
        self.chapter_title_attribute = title_attribute
        for item in self.oeb.spine:
            for x in self._find_matches(
                    chapter_path, item.data,
                    'Invalid chapter expression, ignoring: %s'):
                self.detected_chapters.append((item, x))

        chapter_mark = self.opts.chapter_mark
        page_break_before = 'display: block; page-break-before: always'
        page_break_after = 'display: block; page-break-after: always'
        c = Counter()  # chapters seen so far in each spine item
        for item, elem in self.detected_chapters:
            c[item] += 1
            text = re.sub(r'\s+', ' ', xml2text(elem).strip())
            self.log('\tDetected chapter:', text[:50])
            if chapter_mark == 'none':
                continue
            if chapter_mark == 'rule':
                mark = elem.makeelement(XHTML('hr'))
            elif chapter_mark == 'pagebreak':
                if c[item] < 3 and at_start(elem):
                    # For the first two elements in this item, check if they
                    # are at the start of the file, in which case inserting a
                    # page break is unnecessary and can lead to extra blank
                    # pages in the PDF Output plugin. We need to use two as
                    # feedbooks epubs match both a heading tag and its
                    # containing div with the default chapter expression.
                    continue
                mark = elem.makeelement(XHTML('div'), style=page_break_after)
            else:  # chapter_mark == 'both':
                mark = elem.makeelement(XHTML('hr'), style=page_break_before)
            try:
                elem.addprevious(mark)
            except TypeError:
                self.log.exception('Failed to mark chapter')

    def create_level_based_toc(self):
        """Build the TOC from the level XPath options, if level1_toc is set."""
        if self.opts.level1_toc is not None:
            self.add_leveled_toc_items()

    def create_toc_from_chapters(self):
        """Add one TOC entry for every detected chapter."""
        counter = self.oeb.toc.next_play_order()
        for item, elem in self.detected_chapters:
            text, href = self.elem_to_link(item, elem, self.chapter_title_attribute, counter)
            self.oeb.toc.add(text, href, play_order=counter)
            counter += 1

    def create_toc_from_links(self):
        """Add TOC entries for the local links found in the spine documents.

        Stops once ``max_toc_links`` entries have been added, when that
        option is greater than zero.
        """
        num = 0
        link_xpath = XPath('//h:a[@href]')
        for item in self.oeb.spine:
            for a in link_xpath(item.data):
                href = a.get('href')
                try:
                    purl = urlparse(href)
                except ValueError:
                    self.log.warning('Ignoring malformed URL:', href)
                    continue
                # Only links without a scheme, or file: links, are considered
                if not purl[0] or purl[0] == 'file':
                    href, frag = purl.path, purl.fragment
                    href = item.abshref(href)
                    if frag:
                        href = '#'.join((href, frag))
                    if not self.oeb.toc.has_href(href):
                        text = xml2text(a)
                        text = text[:100].strip()
                        if (not self.opts.duplicate_links_in_toc and
                                self.oeb.toc.has_text(text)):
                            continue
                        try:
                            self.oeb.toc.add(text, href,
                                play_order=self.oeb.toc.next_play_order())
                            num += 1
                        except ValueError:
                            self.oeb.log.exception('Failed to process link: %r' % href)
                            continue  # Most likely an incorrectly URL encoded link
                        if self.opts.max_toc_links > 0 and \
                                num >= self.opts.max_toc_links:
                            self.log('Maximum TOC links reached, stopping.')
                            return

    def elem_to_link(self, item, elem, title_attribute, counter):
        """Return ``(title, href)`` for a TOC entry pointing at ``elem``.

        The title comes from the first non-empty of: the ``title_attribute``
        attribute (when given), the element text, the ``title`` attribute and
        the ``alt`` attribute. Whitespace is collapsed and the title is
        limited to 1000 characters. ``elem`` gets an ``id`` derived from
        ``counter`` if it has none, or an empty one.
        """
        text = ''
        if title_attribute is not None:
            text = elem.get(title_attribute, '')
        if not text:
            text = xml2text(elem).strip()
        if not text:
            text = elem.get('title', '')
        if not text:
            text = elem.get('alt', '')
        text = re.sub(r'\s+', ' ', text.strip())
        text = text[:1000].strip()
        # An empty id attribute would yield an href with an empty fragment,
        # so treat it like a missing one, as detect_start_reading() does
        elem_id = elem.get('id') or 'calibre_toc_%d' % counter
        elem.set('id', elem_id)
        href = '#'.join((item.href, elem_id))
        return text, href

    def add_leveled_toc_items(self):
        """Build an up to three level TOC from the level1/2/3_toc options.

        Level 2 entries are attached to the closest preceding level 1 entry
        in document order, falling back to the last level 1 entry of the
        earlier documents. Level 3 entries attach to level 2 entries in the
        same way. Elements for which no parent can be found are skipped.
        """
        # Map matched elements to their TOC nodes, in order of insertion
        added = OrderedDict()
        added2 = OrderedDict()
        counter = 1
        warning = 'Invalid ToC expression, ignoring: %s'

        for document in self.oeb.spine:
            # Last nodes created while processing the earlier documents
            previous_level1 = list(itervalues(added))[-1] if added else None
            previous_level2 = list(itervalues(added2))[-1] if added2 else None

            level1_toc, level1_title = self.get_toc_parts_for_xpath(self.opts.level1_toc)
            for elem in self._find_matches(level1_toc, document.data, warning):
                text, _href = self.elem_to_link(document, elem, level1_title, counter)
                counter += 1
                if text:
                    node = self.oeb.toc.add(text, _href,
                            play_order=self.oeb.toc.next_play_order())
                    added[elem] = node

            if self.opts.level2_toc is not None and added:
                level2_toc, level2_title = self.get_toc_parts_for_xpath(self.opts.level2_toc)
                for elem in self._find_matches(level2_toc, document.data, warning):
                    level1 = None
                    # Walk the document in order, remembering the most recent
                    # level 1 entry seen before reaching elem
                    for item in document.data.iterdescendants():
                        if item in added:
                            level1 = added[item]
                        elif item == elem:
                            if level1 is None:
                                if previous_level1 is None:
                                    break
                                level1 = previous_level1
                            text, _href = self.elem_to_link(document, elem, level2_title, counter)
                            counter += 1
                            if text:
                                added2[elem] = level1.add(text, _href,
                                    play_order=self.oeb.toc.next_play_order())
                            break

                if self.opts.level3_toc is not None and added2:
                    level3_toc, level3_title = self.get_toc_parts_for_xpath(self.opts.level3_toc)
                    for elem in self._find_matches(level3_toc, document.data, warning):
                        level2 = None
                        for item in document.data.iterdescendants():
                            if item in added2:
                                level2 = added2[item]
                            elif item == elem:
                                if level2 is None:
                                    if previous_level2 is None:
                                        break
                                    level2 = previous_level2
                                text, _href = self.elem_to_link(
                                    document, elem, level3_title, counter)
                                counter += 1
                                if text:
                                    level2.add(text, _href,
                                        play_order=self.oeb.toc.next_play_order())
                                break