%PDF- %PDF-
Direktori : /lib/calibre/calibre/ebooks/ |
Current File : //lib/calibre/calibre/ebooks/html_transform_rules.py |
#!/usr/bin/env python3 # License: GPLv3 Copyright: 2016, Kovid Goyal <kovid at kovidgoyal.net> import re from functools import partial from html5_parser import parse from lxml import etree from calibre.ebooks.oeb.parse_utils import XHTML from calibre.ebooks.oeb.base import OEB_DOCS, XPath from calibre.ebooks.metadata.tag_mapper import uniq from css_selectors.select import Select, get_parsed_selector def non_empty_validator(label, val): if not val: return _('{} must not be empty').format(label) def always_valid(*a): pass class Action: def __init__(self, name, short_text, long_text, placeholder='', validator=None): self.name = name self.short_text = short_text self.long_text = long_text self.placeholder = placeholder if validator is None and placeholder: validator = partial(non_empty_validator, self.placeholder) self.validator = validator or always_valid ACTION_MAP = {a.name: a for a in ( Action('rename', _('Change tag name'), _('Rename tag to the specified name'), _('New tag name')), Action('remove', _('Remove tag and children'), _('Remove the tag and all its contents')), Action('unwrap', _('Remove tag only'), _('Remove the tag but keep its contents')), Action('add_classes', _('Add classes'), _('Add the specified classes, e.g.:') + ' bold green', _('Space separated class names')), Action('remove_classes', _('Remove classes'), _('Remove the specified classes, e.g.:') + ' bold green', _('Space separated class names')), Action('remove_attrs', _('Remove attributes'), _( 'Remove the specified attributes from the tag. Multiple attribute names should be separated by spaces.' ' The special value * removes all attributes.'), _('Space separated attribute names')), Action('add_attrs', _('Add attributes'), _('Add the specified attributes, e.g.:') + ' class="red" name="test"', _('Space separated attribute names')), Action('empty', _('Empty the tag'), _('Remove all contents from the tag')), Action('wrap', _('Wrap the tag'), _( 'Wrap the tag in the specified tag, e.g.: {0} will wrap the tag in a DIV tag with class {1}').format( '<div class="box">', 'box'), _('An HTML opening tag')), Action('insert', _('Insert HTML at start'), _( 'The specified HTML snippet is inserted after the opening tag. Note that only valid HTML snippets can be used without unclosed tags'), _('HTML snippet')), Action('insert_end', _('Insert HTML at end'), _( 'The specified HTML snippet is inserted before the closing tag. Note that only valid HTML snippets can be used without unclosed tags'), _('HTML snippet')), Action('prepend', _('Insert HTML before tag'), _( 'The specified HTML snippet is inserted before the opening tag. Note that only valid HTML snippets can be used without unclosed tags'), _('HTML snippet')), Action('append', _('Insert HTML after tag'), _( 'The specified HTML snippet is inserted after the closing tag. Note that only valid HTML snippets can be used without unclosed tags'), _('HTML snippet')), )} def validate_action(action): if set(action) != {'type', 'data'}: return _('Action must have both:') + ' type and data' a = ACTION_MAP[action['type']] return a.validator(action['data']) def validate_css_selector(val): try: get_parsed_selector(val) except Exception: return _('{} is not a valid CSS selector').format(val) def validate_xpath_selector(val): try: XPath(val) except Exception: return _('{} is not a valid XPath selector').format(val) class Match: def __init__(self, name, text, placeholder='', validator=None): self.name = name self.text = text self.placeholder = placeholder if validator is None and placeholder: validator = partial(non_empty_validator, self.placeholder) self.validator = validator or always_valid MATCH_TYPE_MAP = {m.name: m for m in ( Match('is', _('is'), _('Tag name')), Match('has_class', _('has class'), _('Class name')), Match('not_has_class', _('does not have class'), _('Class name')), Match('css', _('matches CSS selector'), _('CSS selector'), validate_css_selector), Match('xpath', _('matches XPath selector'), _('XPath selector'), validate_xpath_selector), Match('*', _('is any tag')), Match('contains_text', _('contains text'), _('Text')), )} allowed_keys = frozenset('match_type query actions'.split()) def validate_rule(rule): keys = frozenset(rule) extra = keys - allowed_keys if extra: return _('Unknown keys'), _( 'The rule has unknown keys: %s') % ', '.join(extra) missing = allowed_keys - keys if missing: return _('Missing keys'), _( 'The rule has missing keys: %s') % ', '.join(missing) mt = rule['match_type'] if mt not in MATCH_TYPE_MAP: return _('Unknown match type'), _( 'The match type %s is not known') % mt if mt != '*' and not rule['query']: _('Query required'), _( 'You must specify a value for the tag to match') m = MATCH_TYPE_MAP[rule['match_type']] err = m.validator(rule.get('query') or '') if err: return _('Invalid {}').format(m.placeholder), err if not rule['actions']: return _('No actions'), _('The rule has no actions') for action in rule['actions']: err = validate_action(action) if err: return _('Invalid action'), err return None, None def rename_tag(new_name, tag): if new_name != tag.tag: tag.tag = new_name return True return False def qualify_tag_name(name): return XHTML(name) def remove_tag(tag): p = tag.getparent() idx = p.index(tag) sibling = p[idx-1] if idx else None p.remove(tag) if tag.tail: if sibling is None: p.text = (p.text or '') + tag.tail else: sibling.tail = (sibling.tail or '') + tag.tail return True def unwrap_tag(tag): p = tag.getparent() idx = p.index(tag) sibling = p[idx-1] if idx else None if tag.text: if sibling is None: p.text = (p.text or '') + tag.text else: sibling.tail = (sibling.tail or '') + tag.text for i, child in enumerate(reversed(tag)): p.insert(idx, child) if i == 0: sibling = child p.remove(tag) if tag.tail: if sibling is None: p.text = (p.text or '') + tag.tail else: sibling.tail = (sibling.tail or '') + tag.tail return True def add_classes(classes, tag): orig_cls = tag.get('class', '') orig = list(filter(None, str.split(orig_cls))) new_cls = ' '.join(uniq(orig + classes)) if new_cls != orig_cls: tag.set('class', new_cls) return True return False def remove_classes(classes, tag): orig_cls = tag.get('class', '') orig = list(filter(None, str.split(orig_cls))) for x in classes: while True: try: orig.remove(x) except ValueError: break new_cls = ' '.join(orig) if new_cls != orig_cls: tag.set('class', new_cls) return True return False def remove_attrs(attrs, tag): changed = False if not tag.attrib: return False for a in attrs: if a == '*': changed = True tag.attrib.clear() else: if tag.attrib.pop(a, None) is not None: changed = True return changed def parse_attrs(text): div = parse(f'<div {text} ></div>', fragment_context='div')[0] return div.items() def add_attrs(attrib, tag): orig = tag.items() for k, v in attrib: tag.set(k, v) return orig != tag.items() def empty(tag): changed = len(tag) > 0 or bool(tag.text) tag.text = None del tag[:] return changed def parse_start_tag(text): tag = parse(text, namespace_elements=True, fragment_context='div')[0] return {'tag': tag.tag, 'attrib': tag.items()} def wrap(data, tag): elem = tag.makeelement(data['tag']) for k, v in data['attrib']: elem.set(k, v) elem.tail = tag.tail tag.tail = None p = tag.getparent() idx = p.index(tag) p.insert(idx, elem) elem.append(tag) return True def parse_html_snippet(text): return parse(f'<div>{text}</div>', namespace_elements=True, fragment_context='div')[0] def clone(src_element, target_tree): if src_element.tag is etree.Comment: ans = etree.Comment('') else: ans = target_tree.makeelement(src_element.tag) for k, v in src_element.items(): ans.set(k, v) ans.extend(src_element) ans.text = src_element.text ans.tail = src_element.tail return ans def insert_snippet(container, before_children, tag): if before_children: orig_text = tag.text tag.text = container.text if len(container): for i, child in enumerate(reversed(container)): c = clone(child, tag) tag.insert(0, c) if i == 0 and orig_text: c.tail = (c.tail or '') + orig_text else: tag.text = (tag.text or '') + orig_text else: if container.text: if len(tag) > 0: tag[-1].tail = (tag[-1].tail or '') + container.text else: tag.text = (tag.text or '') + container.text for child in container: c = clone(child, tag) tag.append(c) return True def append_snippet(container, before_tag, tag): p = tag.getparent() idx = p.index(tag) if not before_tag: idx += 1 if container.text: if idx: c = p[idx-1] c.tail = (c.tail or '') + container.text else: p.text = (p.text or '') + container.text for child in reversed(container): c = clone(child, tag) p.insert(idx, c) return True action_map = { 'rename': lambda data: partial(rename_tag, qualify_tag_name(data)), 'remove': lambda data: remove_tag, 'unwrap': lambda data: unwrap_tag, 'empty': lambda data: empty, 'add_classes': lambda data: partial(add_classes, str.split(data)), 'remove_classes': lambda data: partial(remove_classes, str.split(data)), 'remove_attrs': lambda data: partial(remove_attrs, str.split(data)), 'add_attrs': lambda data: partial(add_attrs, parse_attrs(data)), 'wrap': lambda data: partial(wrap, parse_start_tag(data)), 'insert': lambda data: partial(insert_snippet, parse_html_snippet(data), True), 'insert_end': lambda data: partial(insert_snippet, parse_html_snippet(data), False), 'prepend': lambda data: partial(append_snippet, parse_html_snippet(data), True), 'append': lambda data: partial(append_snippet, parse_html_snippet(data), False), } def create_action(serialized_action): return action_map[serialized_action['type']](serialized_action.get('data', '')) def text_as_xpath_literal(text): if '"' in text: if "'" not in text: return f"'{text}'" parts = [] for x in re.split(r'(")', text): if not x: continue if x == '"': x = "'\"'" else: x = f'"{x}"' parts.append(x) return f'concat({",".join(parts)})' return f'"{text}"' class Rule: def __init__(self, serialized_rule): self.sel_type = 'xpath' mt = serialized_rule['match_type'] q = serialized_rule['query'] if mt == 'xpath': self.xpath_selector = XPath(q) self.selector = self.xpath elif mt in ('is', 'css'): self.css_selector = q self.selector = self.css elif mt == '*': self.xpath_selector = XPath('//*') self.selector = self.xpath elif mt == 'has_class': self.css_selector = '.' + q self.selector = self.css elif mt == 'not_has_class': self.css_selector = f":not(.{q})" self.selector = self.css elif mt == 'contains_text': self.xpath_selector = XPath(f'//*[contains(text(), {text_as_xpath_literal(q)})]') self.selector = self.xpath else: raise KeyError(f'Unknown match_type: {mt}') self.actions = tuple(map(create_action, serialized_rule['actions'])) def xpath(self, root): return self.xpath_selector(root) def css(self, root): return tuple(Select(root)(self.css_selector)) def __call__(self, root): changed = False for tag in self.selector(root): for action in self.actions: if action(tag): changed = True return changed def transform_doc(root, rules): changed = False for rule in rules: if rule(root): changed = True return changed def transform_html(html, serialized_rules): root = parse(html, namespace_elements=True) rules = tuple(Rule(r) for r in serialized_rules) changed = transform_doc(root, rules) return changed, etree.tostring(root, encoding='unicode') def transform_container(container, serialized_rules, names=()): if not names: types = OEB_DOCS names = [] for name, mt in container.mime_map.items(): if mt in types: names.append(name) doc_changed = False rules = tuple(Rule(r) for r in serialized_rules) for name in names: mt = container.mime_map.get(name) if mt in OEB_DOCS: root = container.parsed(name) if transform_doc(root, rules): container.dirty(name) doc_changed = True return doc_changed def transform_conversion_book(oeb, opts, serialized_rules): rules = tuple(Rule(r) for r in serialized_rules) for item in oeb.spine: root = item.data if not hasattr(root, 'xpath'): continue transform_doc(root, rules) def rule_to_text(rule): text = _('If the tag {match_type} {query}').format( match_type=MATCH_TYPE_MAP[rule['match_type']].text, query=rule.get('query') or '') for action in rule['actions']: text += '\n' text += _('{action_type} {action_data}').format( action_type=ACTION_MAP[action['type']].short_text, action_data=action.get('data') or '') return text def export_rules(serialized_rules): lines = [] for rule in serialized_rules: lines.extend('# ' + l for l in rule_to_text(rule).splitlines()) lines.extend('{}: {}'.format(k, v.replace('\n', ' ')) for k, v in rule.items() if k in allowed_keys and k != 'actions') for action in rule.get('actions', ()): lines.append(f"action: {action['type']}: {action.get('data', '')}") lines.append('') return '\n'.join(lines).encode('utf-8') def import_rules(raw_data): current_rule = {} def sanitize(r): return {k:(r.get(k) or '') for k in allowed_keys} for line in raw_data.decode('utf-8').splitlines(): if not line.strip(): if current_rule: yield sanitize(current_rule) current_rule = {} continue if line.lstrip().startswith('#'): continue parts = line.split(':', 1) if len(parts) == 2: k, v = parts[0].lower().strip(), parts[1].strip() if k == 'action': actions = current_rule.setdefault('actions', []) parts = v.split(':', 1) actions.append({'type': parts[0].strip(), 'data': parts[1].strip() if len(parts) == 2 else ''}) elif k in allowed_keys: current_rule[k] = v if current_rule: yield sanitize(current_rule) def test(return_tests=False): # {{{ import unittest class TestTransforms(unittest.TestCase): longMessage = True maxDiff = None ae = unittest.TestCase.assertEqual def test_matching(self): root = parse(namespace_elements=True, html=''' <html id='root'> <head id='head'></head> <body id='body'> <p class="one red" id='p1'>simple <p class="two green" id='p2'>a'b"c ''') all_ids = root.xpath('//*/@id') def q(mt, query=''): r = Rule({'match_type': mt, 'query': query, 'actions':[]}) ans = [] for tag in r.selector(root): ans.append(tag.get('id')) return ans def t(mt, query='', expected=[]): self.ae(expected, q(mt, query)) t('*', expected=all_ids) t('is', 'body', ['body']) t('is', 'p', ['p1', 'p2']) t('has_class', 'one', ['p1']) ei = list(all_ids) ei.remove('p1') t('not_has_class', 'one', ei) t('css', '#body > p.red', ['p1']) t('xpath', '//h:body', ['body']) t('contains_text', 'imple', ['p1']) t('contains_text', 'a\'b"c', ['p2']) def test_validate_rule(self): def av(match_type='*', query='', atype='remove', adata=''): rule = {'match_type': match_type, 'query': query, 'actions': [{'type': atype, 'data': adata}]} self.ae(validate_rule(rule), (None, None)) def ai(match_type='*', query='', atype='remove', adata=''): rule = {'match_type': match_type, 'query': query, 'actions': [{'type': atype, 'data': adata}]} self.assertNotEqual(validate_rule(rule), (None, None)) av() av('css', 'p') ai('css', 'p..c') av('xpath', '//h:p') ai('xpath', '//h:p[') ai(atype='wrap') def test_export_import(self): rule = {'match_type': 'is', 'query': 'p', 'actions': [{'type': 'rename', 'data': 'div'}, {'type': 'remove', 'data': ''}]} self.ae(rule, next(iter(import_rules(export_rules([rule]))))) rule = {'match_type': '*', 'actions': [{'type': 'remove'}]} erule = {'match_type': '*', 'query': '', 'actions': [{'type': 'remove', 'data': ''}]} self.ae(erule, next(iter(import_rules(export_rules([rule]))))) def test_html_transform_actions(self): parse('a', fragment_context='div') def r(html='<p>hello'): return parse(namespace_elements=True, html=html)[1] def tostring(x, with_tail=True): return etree.tostring(x, encoding='unicode', with_tail=with_tail) def ax(x, expected): v = tostring(x) self.ae(expected, v.replace(' xmlns="http://www.w3.org/1999/xhtml"', '')) def t(name, data=''): return action_map[name](data) p = r()[0] self.assertFalse(t('rename', 'p')(p)) self.assertTrue(t('rename', 'div')(p)) self.ae(p.tag, XHTML('div')) div = r('<div><div><span>remove</span></div>keep</div>')[0] self.assertTrue(t('remove')(div[0])) ax(div, '<div>keep</div>') div = r('<div><div></div><div><span>remove</span></div>keep</div>')[0] self.assertTrue(t('remove')(div[1])) ax(div, '<div><div/>keep</div>') div = r('<div><div>text<span>unwrap</span></div>tail</div>')[0] self.assertTrue(t('unwrap')(div[0])) ax(div, '<div>text<span>unwrap</span>tail</div>') div = r('<div><div></div><div>text<span>unwrap</span></div>tail</div>')[0] self.assertTrue(t('unwrap')(div[1])) ax(div, '<div><div/>text<span>unwrap</span>tail</div>') p = r()[0] self.assertTrue(t('add_classes', 'a b')(p)) self.ae(p.get('class'), 'a b') p = r('<p class="c a d">')[0] self.assertTrue(t('add_classes', 'a b')(p)) self.ae(p.get('class'), 'c a d b') p = r('<p class="c a d">')[0] self.assertFalse(t('add_classes', 'a')(p)) self.ae(p.get('class'), 'c a d') p = r()[0] self.assertFalse(t('remove_classes', 'a b')(p)) self.ae(p.get('class'), None) p = r('<p class="c a a d">')[0] self.assertTrue(t('remove_classes', 'a')(p)) self.ae(p.get('class'), 'c d') p = r()[0] self.assertFalse(t('remove_attrs', 'a b')(p)) self.assertFalse(p.attrib) p = r('<p class="c" x="y" id="p">')[0] self.assertTrue(t('remove_attrs', 'class id')(p)) self.ae(list(p.attrib), ['x']) p = r('<p class="c" x="y" id="p">')[0] self.assertTrue(t('remove_attrs', '*')(p)) self.ae(list(p.attrib), []) p = r()[0] self.assertTrue(t('add_attrs', "class='c' data-m=n")(p)) self.ae(p.items(), [('class', 'c'), ('data-m', 'n')]) p = r('<p a=1>')[0] self.assertTrue(t('add_attrs', "a=2")(p)) self.ae(p.items(), [('a', '2')]) p = r('<p>t<span>s')[0] self.assertTrue(t('empty')(p)) ax(p, '<p/>') p = r('<p>t<span>s</p>tail')[0] self.assertTrue(t('wrap', '<div a=b c=d>')(p)) ax(p.getparent(), '<div a="b" c="d"><p>t<span>s</span></p></div>tail') p = r('<p>hello<span>s')[0] self.assertTrue(t('insert', 'text<div a=b c=d><!-- comm -->tail')(p)) ax(p, '<p>text<div a="b" c="d"><!-- comm -->tail</div>hello<span>s</span></p>') p = r('<p>hello<span>s')[0] self.assertTrue(t('insert', 'text')(p)) ax(p, '<p>texthello<span>s</span></p>') p = r('<p>hello<span>s')[0] self.assertTrue(t('insert_end', 'text<div><!-- comm -->tail')(p)) ax(p, '<p>hello<span>s</span>text<div><!-- comm -->tail</div></p>') p = r('<p>hello<span>s</span>tail')[0] self.assertTrue(t('insert_end', 'text')(p)) ax(p, '<p>hello<span>s</span>tailtext</p>') p = r('<p>hello')[0] self.assertTrue(t('prepend', 'text<div>x</div>tail')(p)) ax(p.getparent(), '<body>text<div>x</div>tail<p>hello</p></body>') p = r('<p>hello')[0] self.assertTrue(t('append', 'text')(p)) ax(p.getparent(), '<body><p>hello</p>text</body>') tests = unittest.defaultTestLoader.loadTestsFromTestCase(TestTransforms) if return_tests: return tests unittest.TextTestRunner(verbosity=4).run(tests) if __name__ == '__main__': test() # }}}