%PDF- %PDF-
Direktori : /lib/calibre/calibre/ebooks/oeb/transforms/ |
Current File : //lib/calibre/calibre/ebooks/oeb/transforms/split.py |
__license__ = 'GPL v3' __copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net' __docformat__ = 'restructuredtext en' ''' Splitting of the XHTML flows. Splitting can happen on page boundaries or can be forced at "likely" locations to conform to size limitations. This transform assumes a prior call to the flatcss transform. ''' import os, functools, collections, re, copy from collections import OrderedDict from lxml.etree import XPath as _XPath from lxml import etree from calibre import as_unicode, force_unicode from calibre.ebooks.epub import rules from calibre.ebooks.oeb.base import (OEB_STYLES, XPNSMAP as NAMESPACES, urldefrag, rewrite_links, XHTML, urlnormalize) from calibre.ebooks.oeb.polish.split import do_split from polyglot.builtins import iteritems from polyglot.urllib import unquote from css_selectors import Select, SelectorError XPath = functools.partial(_XPath, namespaces=NAMESPACES) SPLIT_POINT_ATTR = 'csp' def tostring(root): return etree.tostring(root, encoding='utf-8') class SplitError(ValueError): def __init__(self, path, root): size = len(tostring(root))/1024. ValueError.__init__(self, _('Could not find reasonable point at which to split: ' '%(path)s Sub-tree size: %(size)d KB')%dict( path=path, size=size)) class Split: def __init__(self, split_on_page_breaks=True, page_breaks_xpath=None, max_flow_size=0, remove_css_pagebreaks=True): self.split_on_page_breaks = split_on_page_breaks self.page_breaks_xpath = page_breaks_xpath self.max_flow_size = max_flow_size self.page_break_selectors = None self.remove_css_pagebreaks = remove_css_pagebreaks if self.page_breaks_xpath is not None: self.page_break_selectors = [(XPath(self.page_breaks_xpath), False)] def __call__(self, oeb, opts): self.oeb = oeb self.log = oeb.log self.log('Splitting markup on page breaks and flow limits, if any...') self.opts = opts self.map = {} for item in list(self.oeb.manifest.items): if item.spine_position is not None and etree.iselement(item.data): self.split_item(item) self.fix_links() def split_item(self, item): page_breaks, page_break_ids = [], [] if self.split_on_page_breaks: page_breaks, page_break_ids = self.find_page_breaks(item) splitter = FlowSplitter(item, page_breaks, page_break_ids, self.max_flow_size, self.oeb, self.opts) if splitter.was_split: am = splitter.anchor_map self.map[item.href] = collections.defaultdict( am.default_factory, am) def find_page_breaks(self, item): if self.page_break_selectors is None: self.page_break_selectors = set() stylesheets = [x.data for x in self.oeb.manifest if x.media_type in OEB_STYLES] for rule in rules(stylesheets): before = force_unicode(getattr(rule.style.getPropertyCSSValue( 'page-break-before'), 'cssText', '').strip().lower()) after = force_unicode(getattr(rule.style.getPropertyCSSValue( 'page-break-after'), 'cssText', '').strip().lower()) try: if before and before not in {'avoid', 'auto', 'inherit'}: self.page_break_selectors.add((rule.selectorText, True)) if self.remove_css_pagebreaks: rule.style.removeProperty('page-break-before') except: pass try: if after and after not in {'avoid', 'auto', 'inherit'}: self.page_break_selectors.add((rule.selectorText, False)) if self.remove_css_pagebreaks: rule.style.removeProperty('page-break-after') except: pass page_breaks = set() select = Select(item.data) if not self.page_break_selectors: return [], [] body = item.data.xpath('//h:body', namespaces=NAMESPACES) if not body: return [], [] descendants = frozenset(body[0].iterdescendants('*')) for selector, before in self.page_break_selectors: try: for elem in select(selector): if elem in descendants and elem.tag.rpartition('}')[2].lower() not in {'html', 'body', 'head', 'style', 'script', 'meta', 'link'}: elem.set('pb_before', '1' if before else '0') page_breaks.add(elem) except SelectorError as err: self.log.warn(f'Ignoring page breaks specified with invalid CSS selector: {selector!r} ({as_unicode(err)})') for i, elem in enumerate(item.data.iter('*')): try: elem.set('pb_order', str(i)) except TypeError: # Can't set attributes on comment nodes etc. continue page_breaks = list(page_breaks) page_breaks.sort(key=lambda x:int(x.get('pb_order'))) page_break_ids, page_breaks_ = [], [] for i, x in enumerate(page_breaks): x.set('id', x.get('id', 'calibre_pb_%d'%i)) id = x.get('id') try: xp = XPath('//*[@id="%s"]'%id) except: try: xp = XPath("//*[@id='%s']"%id) except: # The id has both a quote and an apostrophe or some other # Just replace it since I doubt its going to work anywhere else # either id = 'calibre_pb_%d'%i x.set('id', id) xp = XPath('//*[@id=%r]'%id) page_breaks_.append((xp, x.get('pb_before', '0') == '1')) page_break_ids.append(id) for elem in item.data.iter(etree.Element): elem.attrib.pop('pb_order', False) elem.attrib.pop('pb_before', False) return page_breaks_, page_break_ids def fix_links(self): ''' Fix references to the split files in other content files. ''' for item in self.oeb.manifest: if etree.iselement(item.data): self.current_item = item rewrite_links(item.data, self.rewrite_links) def rewrite_links(self, url): href, frag = urldefrag(url) try: href = self.current_item.abshref(href) except ValueError: # Unparsable URL return url try: href = urlnormalize(href) except ValueError: # href has non utf-8 quoting return url if href in self.map: anchor_map = self.map[href] nhref = anchor_map[frag if frag else None] nhref = self.current_item.relhref(nhref) if frag: nhref = '#'.join((unquote(nhref), frag)) return nhref return url class FlowSplitter: 'The actual splitting logic' def __init__(self, item, page_breaks, page_break_ids, max_flow_size, oeb, opts): self.item = item self.oeb = oeb self.opts = opts self.log = oeb.log self.page_breaks = page_breaks self.page_break_ids = page_break_ids self.max_flow_size = max_flow_size self.base = item.href self.csp_counter = 0 base, ext = os.path.splitext(self.base) self.base = base.replace('%', '%%')+'_split_%.3d'+ext self.trees = [self.item.data.getroottree()] self.splitting_on_page_breaks = True if self.page_breaks: self.split_on_page_breaks(self.trees[0]) self.splitting_on_page_breaks = False if self.max_flow_size > 0: lt_found = False self.log('\tLooking for large trees in %s...'%item.href) trees = list(self.trees) self.tree_map = {} for i, tree in enumerate(trees): size = len(tostring(tree.getroot())) if size > self.max_flow_size: self.log('\tFound large tree #%d'%i) lt_found = True self.split_trees = [] self.split_to_size(tree) self.tree_map[tree] = self.split_trees if not lt_found: self.log('\tNo large trees found') self.trees = [] for x in trees: self.trees.extend(self.tree_map.get(x, [x])) self.was_split = len(self.trees) > 1 if self.was_split: self.log('\tSplit into %d parts'%len(self.trees)) self.commit() def split_on_page_breaks(self, orig_tree): ordered_ids = OrderedDict() all_page_break_ids = frozenset(self.page_break_ids) for elem_id in orig_tree.xpath('//*/@id'): if elem_id in all_page_break_ids: ordered_ids[elem_id] = self.page_breaks[ self.page_break_ids.index(elem_id)] self.trees = [orig_tree] while ordered_ids: pb_id, (pattern, before) = next(iteritems(ordered_ids)) del ordered_ids[pb_id] for i in range(len(self.trees)-1, -1, -1): tree = self.trees[i] elem = pattern(tree) if elem: self.log.debug('\t\tSplitting on page-break at id=%s'% elem[0].get('id')) before_tree, after_tree = self.do_split(tree, elem[0], before) self.trees[i:i+1] = [before_tree, after_tree] break trees, ids = [], set() for tree in self.trees: root = tree.getroot() if self.is_page_empty(root): discarded_ids = root.xpath('//*[@id]') for x in discarded_ids: x = x.get('id') if not x.startswith('calibre_'): ids.add(x) else: if ids: body = self.get_body(root) if body is not None: existing_ids = frozenset(body.xpath('//*/@id')) for x in ids - existing_ids: body.insert(0, body.makeelement(XHTML('div'), id=x, style='height:0pt')) ids = set() trees.append(tree) self.trees = trees def get_body(self, root): body = root.xpath('//h:body', namespaces=NAMESPACES) if not body: return None return body[0] def do_split(self, tree, split_point, before): ''' Split ``tree`` into a *before* and *after* tree at ``split_point``. :param before: If True tree is split before split_point, otherwise after split_point :return: before_tree, after_tree ''' return do_split(split_point, self.log, before=before) def is_page_empty(self, root): body = self.get_body(root) if body is None: return False txt = re.sub(r'\s+|\xa0', '', etree.tostring(body, method='text', encoding='unicode')) if len(txt): return False for img in root.xpath('//h:img', namespaces=NAMESPACES): if img.get('style', '') != 'display:none': return False if root.xpath('//*[local-name() = "svg"]'): return False return True def split_text(self, text, root, size): self.log.debug('\t\t\tSplitting text of length: %d'%len(text)) rest = text.replace('\r', '') parts = re.split('\n\n', rest) self.log.debug('\t\t\t\tFound %d parts'%len(parts)) if max(map(len, parts)) > size: raise SplitError('Cannot split as file contains a <pre> tag ' 'with a very large paragraph', root) ans = [] buf = '' for part in parts: if len(buf) + len(part) < size: buf += '\n\n'+part else: ans.append(buf) buf = part return ans def split_to_size(self, tree): self.log.debug('\t\tSplitting...') root = tree.getroot() # Split large <pre> tags if they contain only text for pre in XPath('//h:pre')(root): if len(tuple(pre.iterchildren(etree.Element))) > 0: continue if pre.text and len(pre.text) > self.max_flow_size*0.5: self.log.debug('\t\tSplitting large <pre> tag') frags = self.split_text(pre.text, root, int(0.2*self.max_flow_size)) new_pres = [] for frag in frags: pre2 = copy.copy(pre) pre2.text = frag pre2.tail = '' new_pres.append(pre2) new_pres[-1].tail = pre.tail p = pre.getparent() i = p.index(pre) p[i:i+1] = new_pres split_point, before = self.find_split_point(root) if split_point is None: raise SplitError(self.item.href, root) self.log.debug('\t\t\tSplit point:', split_point.tag, tree.getpath(split_point)) trees = self.do_split(tree, split_point, before) sizes = [len(tostring(t.getroot())) for t in trees] if min(sizes) < 5*1024: self.log.debug('\t\t\tSplit tree too small') self.split_to_size(tree) return for t, size in zip(trees, sizes): r = t.getroot() if self.is_page_empty(r): continue elif size <= self.max_flow_size: self.split_trees.append(t) self.log.debug( '\t\t\tCommitted sub-tree #%d (%d KB)'%( len(self.split_trees), size/1024.)) else: self.log.debug( '\t\t\tSplit tree still too large: %d KB' % (size/1024.)) self.split_to_size(t) def find_split_point(self, root): ''' Find the tag at which to split the tree rooted at `root`. Search order is: * Heading tags * <div> tags * <pre> tags * <hr> tags * <p> tags * <br> tags * <li> tags We try to split in the "middle" of the file (as defined by tag counts. ''' def pick_elem(elems): if elems: elems = [i for i in elems if i.get(SPLIT_POINT_ATTR, '0') != '1'] if elems: i = int(len(elems)//2) elems[i].set(SPLIT_POINT_ATTR, '1') return elems[i] for path in ( '//*[re:match(name(), "h[1-6]", "i")]', '/h:html/h:body/h:div', '//h:pre', '//h:hr', '//h:p', '//h:div', '//h:br', '//h:li', ): elems = root.xpath(path, namespaces=NAMESPACES) elem = pick_elem(elems) if elem is not None: try: XPath(elem.getroottree().getpath(elem)) except: continue return elem, True return None, True def commit(self): ''' Commit all changes caused by the split. Calculates an *anchor_map* for all anchors in the original tree. Internal links are re-directed. The original file is deleted and the split files are saved. ''' if not self.was_split: return self.anchor_map = collections.defaultdict(lambda :self.base%0) self.files = [] for i, tree in enumerate(self.trees): root = tree.getroot() self.files.append(self.base%i) for elem in root.xpath('//*[@id or @name]'): for anchor in elem.get('id', ''), elem.get('name', ''): if anchor != '' and anchor not in self.anchor_map: self.anchor_map[anchor] = self.files[-1] for elem in root.xpath('//*[@%s]'%SPLIT_POINT_ATTR): elem.attrib.pop(SPLIT_POINT_ATTR, '0') spine_pos = self.item.spine_position for current, tree in zip(*map(reversed, (self.files, self.trees))): for a in tree.getroot().xpath('//h:a[@href]', namespaces=NAMESPACES): href = a.get('href').strip() if href.startswith('#'): anchor = href[1:] file = self.anchor_map[anchor] file = self.item.relhref(file) if file != current: a.set('href', file+href) new_id = self.oeb.manifest.generate(id=self.item.id)[0] new_item = self.oeb.manifest.add(new_id, current, self.item.media_type, data=tree.getroot()) self.oeb.spine.insert(spine_pos, new_item, self.item.linear) if self.oeb.guide: for ref in self.oeb.guide.values(): href, frag = urldefrag(ref.href) if href == self.item.href: nhref = self.anchor_map[frag if frag else None] if frag: nhref = '#'.join((nhref, frag)) ref.href = nhref def fix_toc_entry(toc): if toc.href: href, frag = urldefrag(toc.href) if href == self.item.href: nhref = self.anchor_map[frag if frag else None] if frag: nhref = '#'.join((nhref, frag)) toc.href = nhref for x in toc: fix_toc_entry(x) if self.oeb.toc: fix_toc_entry(self.oeb.toc) if self.oeb.pages: for page in self.oeb.pages: href, frag = urldefrag(page.href) if href == self.item.href: nhref = self.anchor_map[frag if frag else None] if frag: nhref = '#'.join((nhref, frag)) page.href = nhref self.oeb.manifest.remove(self.item)