#!/usr/bin/env python3
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
from functools import partial
from lxml import etree
from html5lib.constants import cdataElements, rcdataElements
from calibre.ebooks.oeb.polish.tests.base import BaseTest
from calibre.ebooks.oeb.polish.parsing import parse_html5 as parse
from calibre.ebooks.oeb.base import XPath, XHTML_NS, SVG_NS, XLINK_NS
from calibre.ebooks.oeb.parse_utils import html5_parse
from polyglot.builtins import iteritems
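

# Each helper below takes a TestCase instance and a parse function, so the
# same checks can be run against both HTML5 parsers (see basic_checks and the
# ParsingTests class further down).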
def nonvoid_cdata_elements(test, parse_function):
    ''' If self closed version of non-void cdata elements like <title/> are
    present, the HTML5 parsing algorithm treats all following data as CDATA '''
    markup = '''
    <html> <head><{0}/></head> <body id="test"> </html>
    '''
    for tag in cdataElements | rcdataElements:
        for x in (tag, tag.upper(), '\n' + tag, tag + ' id="xxx" '):
            root = parse_function(markup.format(x))
            test.assertEqual(
                len(XPath('//h:body[@id="test"]')(root)), 1,
                'Incorrect parsing for <%s/>, parsed markup:\n' % x + etree.tostring(root, encoding='unicode'))
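

# namespaces() covers namespace handling: the default XHTML namespace, prefixed
# SVG/XLink markup, namespaces created for bare <svg>/<image> tags, multiple
# <html> tags, arbitrary author namespaces and conversion of xml:lang to lang.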
def namespaces(test, parse_function):
    ae = test.assertEqual

    def match_and_prefix(root, xpath, prefix, err=''):
        matches = XPath(xpath)(root)
        ae(len(matches), 1, err)
        ae(matches[0].prefix, prefix, err)

    markup = f''' <html xmlns="{XHTML_NS}"><head><body id="test"></html> '''
    root = parse_function(markup)
    ae(
        len(XPath('//h:body[@id="test"]')(root)), 1,
        'Incorrect parsing, parsed markup:\n' + etree.tostring(root, encoding='unicode'))
    match_and_prefix(root, '//h:body[@id="test"]', None)

    markup = '''
    <html xmlns="{xhtml}"><head><body id="test">
    <svg:svg xmlns:svg="{svg}"><svg:image xmlns:xlink="{xlink}" xlink:href="xxx"/></svg:svg>
    '''.format(xhtml=XHTML_NS, svg=SVG_NS, xlink=XLINK_NS)
    root = parse_function(markup)
    err = 'Incorrect parsing, parsed markup:\n' + etree.tostring(root, encoding='unicode')
    match_and_prefix(root, '//h:body[@id="test"]', None, err)
    match_and_prefix(root, '//svg:svg', 'svg', err)
    match_and_prefix(root, '//svg:image[@xl:href]', 'svg', err)

    markup = '''
    <html xmlns="{xhtml}"><head><body id="test">
    <svg xmlns="{svg}" xmlns:xlink="{xlink}" ><image xlink:href="xxx"/></svg>
    '''.format(xhtml=XHTML_NS, svg=SVG_NS, xlink=XLINK_NS)
    root = parse_function(markup)
    err = 'Incorrect parsing, parsed markup:\n' + etree.tostring(root, encoding='unicode')
    match_and_prefix(root, '//h:body[@id="test"]', None, err)
    match_and_prefix(root, '//svg:svg', None, err)
    match_and_prefix(root, '//svg:image[@xl:href]', None, err)

    markup = '<html><body><svg><image xlink:href="xxx"></svg>'
    root = parse_function(markup)
    err = 'Namespaces not created, parsed markup:\n' + etree.tostring(root, encoding='unicode')
    match_and_prefix(root, '//svg:svg', None, err)
    match_and_prefix(root, '//svg:image[@xl:href]', None, err)
    if parse_function is parse:
        image = XPath('//svg:image')(root)[0]
        ae(image.nsmap, {'xlink':XLINK_NS, None:SVG_NS})

    root = parse_function('<html id="a"><p><html xmlns:x="y" lang="en"><p>')
    err = 'Multiple HTML tags not handled, parsed markup:\n' + etree.tostring(root, encoding='unicode')
    match_and_prefix(root, '//h:html', None, err)
    match_and_prefix(root, '//h:html[@lang]', None, err)
    match_and_prefix(root, '//h:html[@id]', None, err)

    # if parse_function is not html5_parse:
    #     markup = '<html:html xmlns:html="{html}" id="a"><html:body><html:p></html:p></html:body></html>'.format(html=XHTML_NS)
    #     root = parse_function(markup)
    #     err = 'HTML namespace prefixed, parsed markup:\n' + etree.tostring(root, encoding='unicode')
    #     match_and_prefix(root, '//h:html', None, err)

    markup = '<html><body><ns1:tag1 xmlns:ns1="NS"><ns2:tag2 xmlns:ns2="NS" ns1:id="test"/><ns1:tag3 xmlns:ns1="NS2" ns1:id="test"/></ns1:tag1>'
    root = parse_function(markup)
    err = 'Arbitrary namespaces not preserved, parsed markup:\n' + etree.tostring(root, encoding='unicode')

    def xpath(expr):
        return etree.XPath(expr, namespaces={'ns1':'NS', 'ns2':'NS2'})(root)
    ae(len(xpath('//ns1:tag1')), 1, err)
    ae(len(xpath('//ns1:tag2')), 1, err)
    ae(len(xpath('//ns2:tag3')), 1, err)
    ae(len(xpath('//ns1:tag2[@ns1:id="test"]')), 1, err)
    ae(len(xpath('//ns2:tag3[@ns2:id="test"]')), 1, err)
    # for tag in root.iter():
    #     if 'NS' in tag.tag:
    #         ae('ns1', tag.prefix)

    markup = '<html xml:lang="en"><body><p lang="de"><p xml:lang="es"><p lang="en" xml:lang="de">'
    root = parse_function(markup)
    err = 'xml:lang not converted to lang, parsed markup:\n' + etree.tostring(root, encoding='unicode')
    ae(len(root.xpath('//*[@lang="en"]')), 2, err)
    ae(len(root.xpath('//*[@lang="de"]')), 1, err)
    ae(len(root.xpath('//*[@lang="es"]')), 1, err)
    # ae(len(XPath('//*[@xml:lang]')(root)), 0, err)


def space_characters(test, parse_function):
    markup = '<html><p>\u000cX</p>'
    root = parse_function(markup)
    err = 'form feed character not converted, parsed markup:\n' + etree.tostring(root, encoding='unicode')
    test.assertNotIn('\u000c', root.xpath('//*[local-name()="p"]')[0].text, err)
    markup = '<html><p>a\u000b\u000c</p>'
    root = parse_function(markup)  # Should strip non XML safe control code \u000b
    test.assertNotIn('\u000b', root.xpath('//*[local-name()="p"]')[0].text, err)
    test.assertNotIn('\u000c', root.xpath('//*[local-name()="p"]')[0].text, err)


def case_insensitive_element_names(test, parse_function):
    markup = '<HTML><P> </p>'
    root = parse_function(markup)
    err = 'case sensitive parsing, parsed markup:\n' + etree.tostring(root, encoding='unicode')
    test.assertEqual(len(XPath('//h:p')(root)), 1, err)


def entities(test, parse_function):
    # Entity references must be resolved to their Unicode characters
    # (here a non-breaking space and an apostrophe)
    markup = '<html><p>&nbsp;&#39;</p>'
    root = parse_function(markup)
    err = 'Entities not handled, parsed markup:\n' + etree.tostring(root, encoding='unicode')
    test.assertEqual('\xa0\'', root.xpath('//*[local-name()="p"]')[0].text, err)


def multiple_html_and_body(test, parse_function):
    markup = '<html id="1"><body id="2"><p><html lang="en"><body lang="de"></p>'
    root = parse_function(markup)
    err = 'multiple html and body not handled, parsed markup:\n' + etree.tostring(root, encoding='unicode')
    test.assertEqual(len(XPath('//h:html')(root)), 1, err)
    test.assertEqual(len(XPath('//h:body')(root)), 1, err)
    test.assertEqual(len(XPath('//h:html[@id and @lang]')(root)), 1, err)
    test.assertEqual(len(XPath('//h:body[@id and @lang]')(root)), 1, err)


def attribute_replacement(test, parse_function):
    markup = '<html><body><svg viewbox="0"></svg><svg xmlns="%s" viewbox="1">' % SVG_NS
    root = parse_function(markup)
    err = 'SVG attributes not normalized, parsed markup:\n' + etree.tostring(root, encoding='unicode')
    test.assertEqual(len(XPath('//svg:svg[@viewBox]')(root)), 2, err)


def comments(test, parse_function):
    markup = '<html><!-- -- ---><body/></html>'
    root = parse_function(markup)
    test.assertEqual(len(XPath('//h:body')(root)), 1, 'Failed to parse with comment containing dashes')
    test.assertEqual(len(tuple(root.iterdescendants(etree.Comment))), 1)
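

# Checks shared by both parser test methods below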
basic_checks = (nonvoid_cdata_elements, namespaces, space_characters,
                case_insensitive_element_names, entities, comments,
                multiple_html_and_body, attribute_replacement)


class ParsingTests(BaseTest):

    def test_lxml_tostring(self):
        ' Test for bug in some versions of lxml that causes incorrect serialization of sub-trees'
        from html5_parser import parse
        root = parse('<p>a<p>b<p>c')
        p = root.xpath('//p')[0]
        self.assertEqual(etree.tostring(p, encoding=str), '<p>a</p>')

    def test_conversion_parser(self):
        ' Test parsing with the HTML5 parser used for conversion '
        for test in basic_checks:
            test(self, html5_parse)

    def test_polish_parser(self):
        ' Test parsing with the HTML5 parser used for polishing '
        for test in basic_checks:
            test(self, parse)

        root = parse('<html><p><svg><image /><b></svg> \n<b>xxx', discard_namespaces=True)
        self.assertTrue(root.xpath('//b'), 'Namespaces not discarded')
        self.assertFalse(root.xpath('//svg/b'), 'The <b> was not moved out of <svg>')

        for ds in (False, True):
            src = '\n<html>\n<p>\n<svg><image />\n<b></svg> '
            root = parse(src, discard_namespaces=ds)
            for tag, lnum in iteritems({'html':2, 'head':3, 'body':3, 'p':3, 'svg':4, 'image':4, 'b':5}):
                elem = root.xpath('//*[local-name()="%s"]' % tag)[0]
                self.assertEqual(lnum, elem.sourceline, f'Line number incorrect for {tag}, source: {src}:')

        for ds in (False, True):
            src = '\n<html>\n<p b=1 a=2 c=3 d=4 e=5 f=6 g=7 h=8><svg b=1 a=2 c=3 d=4 e=5 f=6 g=7 h=8>\n'
            root = parse(src, discard_namespaces=ds)
            for tag in ('p', 'svg'):
                for i, (k, v) in enumerate(root.xpath('//*[local-name()="%s"]' % tag)[0].items()):
                    self.assertEqual(i+1, int(v))

        root = parse('<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en-US" xmlns:xml="http://www.w3.org/XML/1998/namespace"><body/></html>')
        self.assertNotIn('xmlnsU0003Axml', root.attrib, 'xml namespace declaration not removed')
        root = parse('<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en-US" xmlns:extra="extra"><body/></html>')
        self.assertIn('extra', root.nsmap, 'Extra namespace declaration on <html> tag not preserved')
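

# Manual benchmark helper, not collected by the unittest runner: parses the
# file given as the last command line argument ten times with each parser and
# prints the average wall clock time.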
def timing():
    import sys

    from calibre.ebooks.chardet import xml_to_unicode
    from calibre.utils.monotonic import monotonic
    from html5lib import parse as vanilla

    filename = sys.argv[-1]
    with lopen(filename, 'rb') as f:
        raw = f.read()
    raw = xml_to_unicode(raw)[0]

    for name, f in (('calibre', partial(parse, line_numbers=False)), ('html5lib', vanilla), ('calibre-old', html5_parse)):
        timings = []
        for i in range(10):
            st = monotonic()
            f(raw)
            timings.append(monotonic() - st)
        avg = sum(timings)/len(timings)
        print(f'Average time for {name}: {avg:.2g}')