%PDF- %PDF-
| Direktori : /proc/thread-self/root/usr/lib/calibre/calibre/ebooks/oeb/polish/check/ |
| Current File : //proc/thread-self/root/usr/lib/calibre/calibre/ebooks/oeb/polish/check/parsing.py |
#!/usr/bin/env python3
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
import re
from lxml.etree import XMLSyntaxError
from calibre import force_unicode, human_readable, prepare_string_for_xml
from calibre.ebooks.chardet import replace_encoding_declarations, find_declared_encoding
from calibre.utils.xml_parse import safe_xml_fromstring
from calibre.ebooks.html_entities import html5_entities
from calibre.ebooks.oeb.polish.pretty import pretty_script_or_style as fix_style_tag
from calibre.ebooks.oeb.polish.utils import PositionFinder, guess_type
from calibre.ebooks.oeb.polish.check.base import BaseError, WARN, ERROR, INFO
from calibre.ebooks.oeb.base import OEB_DOCS, XHTML_NS, urlquote, URL_SAFE, XHTML
from polyglot.builtins import iteritems, error_message
# Every name found in html5_entities (presumably a mapping of entity name ->
# replacement text, since it is indexed by name further down in this module).
HTML_ENTITTIES = frozenset(html5_entities)
# Entity names that are treated specially and never replaced or reported.
XML_ENTITIES = {'lt', 'gt', 'amp', 'apos', 'quot'}
# Everything accepted as a named entity in HTML documents.
ALL_ENTITIES = HTML_ENTITTIES | XML_ENTITIES
# Matches "&name;" for every HTML entity name that is not in XML_ENTITIES;
# group 1 is the bare name. Names are sorted so the pattern is deterministic.
replace_pat = re.compile('&(%s);' % '|'.join(re.escape(x) for x in sorted(HTML_ENTITTIES - XML_ENTITIES)))
# Matches a "tag mismatch" parser message that mentions two line numbers;
# group 1 captures the first one.
mismatch_pat = re.compile(r'tag mismatch:.+?line (\d+).+?line \d+')
class EmptyFile(BaseError):
    """Error reported for a file without any content.

    Applying the fix removes the file from the container.
    """

    HELP = _('This file is empty, it contains nothing, you should probably remove it.')
    INDIVIDUAL_FIX = _('Remove this file')

    def __init__(self, name):
        message = _('The file %s is empty') % name
        BaseError.__init__(self, message, name)

    def __call__(self, container):
        # The only sensible repair for an empty file is to drop it.
        container.remove_item(self.name)
        return True
class DecodeError(BaseError):
    """Error reported when a file's bytes could not be decoded as text."""

    # Flags this error as one produced by the parsing stage.
    is_parsing_error = True

    HELP = _('A decoding errors means that the contents of the file could not'
             ' be interpreted as text. This usually happens if the file has'
             ' an incorrect character encoding declaration or if the file is actually'
             ' a binary file, like an image or font that is mislabelled with'
             ' an incorrect media type in the OPF.')

    def __init__(self, name):
        message = _('Parsing of %s failed, could not decode') % name
        BaseError.__init__(self, message, name)
class XMLParseError(BaseError):
    """Error describing a syntax failure reported by the XML parser.

    When the parser message is a "tag mismatch" that names a second line,
    the error records both locations.
    """

    is_parsing_error = True

    HELP = _('A parsing error in an XML file means that the XML syntax in the file is incorrect.'
             ' Such a file will most probably not open in an e-book reader. These errors can '
             ' usually be fixed automatically, however, automatic fixing can sometimes '
             ' "do the wrong thing".')

    def __init__(self, msg, *args, **kwargs):
        # A missing message is treated as an empty one.
        text = msg if msg else ''
        BaseError.__init__(self, 'Parsing failed: ' + text, *args, **kwargs)
        mismatch = mismatch_pat.search(text)
        if mismatch is None:
            return
        # The message mentions another line: expose it alongside the
        # location this error was created with.
        other_line = int(mismatch.group(1))
        self.has_multiple_locations = True
        self.all_locations = [
            (self.name, other_line, None),
            (self.name, self.line, self.col),
        ]
class HTMLParseError(XMLParseError):
    """Parse error for HTML documents; differs from the parent only in its help text."""

    HELP = _(
        'A parsing error in an HTML file means that the HTML syntax is incorrect.'
        ' Most readers will automatically ignore such errors, but they may result in '
        ' incorrect display of content. These errors can usually be fixed automatically,'
        ' however, automatic fixing can sometimes "do the wrong thing".'
    )
class PrivateEntities(XMLParseError):
    """Parse error raised for files that declare their own entities in the DOCTYPE."""

    HELP = _(
        'This HTML file uses private entities.'
        ' These are not supported. You can try running "Fix HTML" from the Tools menu,'
        ' which will try to automatically resolve the private entities.'
    )
class NamedEntities(BaseError):
    """Warning that a file uses named (non-XML) entities.

    The fix is book-wide: every XML/HTML file in the container has its
    named entities replaced by the characters they stand for.
    """

    level = WARN
    INDIVIDUAL_FIX = _('Replace all named entities with their character equivalents in this book')
    HELP = _('Named entities are often only incompletely supported by various book reading software.'
             ' Therefore, it is best to not use them, replacing them with the actual characters they'
             ' represent. This can be done automatically.')

    def __init__(self, name):
        BaseError.__init__(self, _('Named entities present'), name)

    def __call__(self, container):
        # Imported here rather than at module level, as in the original code.
        from calibre.ebooks.oeb.polish.check.main import XML_TYPES

        def to_character(match):
            # group(1) is the entity name without the "&" and ";".
            return html5_entities[match.group(1)]

        check_types = XML_TYPES | OEB_DOCS
        changed = False
        for name, mt in iteritems(container.mime_map):
            if mt not in check_types:
                continue
            raw = container.raw_data(name)
            nraw = replace_pat.sub(to_character, raw)
            if nraw == raw:
                # Nothing to replace in this file.
                continue
            changed = True
            with container.open(name, 'wb') as f:
                f.write(nraw.encode('utf-8'))
        return changed
def make_filename_safe(name):
    """Return *name* with every '/'-separated component made URL safe.

    Each component is passed through ascii_filename() and any character that
    is still not in URL_SAFE is replaced by an underscore.
    """
    from calibre.utils.filenames import ascii_filename

    safe_parts = []
    for part in name.split('/'):
        ascii_part = ascii_filename(part)
        safe_parts.append(''.join(ch if ch in URL_SAFE else '_' for ch in ascii_part))
    return '/'.join(safe_parts)
class EscapedName(BaseError):
    """Warning for a file name that changes when URL-quoted.

    The fix renames the file to a sanitised name (see make_filename_safe),
    adding a numeric suffix if that name is already taken in the container.
    """

    level = WARN

    def __init__(self, name):
        BaseError.__init__(self, _('Filename contains unsafe characters'), name)
        qname = urlquote(name)
        # Proposed replacement name; may still be adjusted in __call__ to
        # avoid clashing with an existing file.
        self.sname = make_filename_safe(name)
        self.HELP = _(
            'The filename {0} contains unsafe characters, that must be escaped, like'
            ' this {1}. This can cause problems with some e-book readers. To be'
            ' absolutely safe, use only the English alphabet [a-z], the numbers [0-9],'
            ' underscores and hyphens in your file names. While many other characters'
            ' are allowed, they may cause problems with some software.').format(name, qname)
        self.INDIVIDUAL_FIX = _(
            'Rename the file {0} to {1}').format(name, self.sname)

    def __call__(self, container):
        """Rename the file to self.sname, de-duplicating the name first.

        Returns True once the rename has been requested.
        """
        import posixpath
        from calibre.ebooks.oeb.polish.replace import rename_files
        all_names = set(container.name_path_map)
        # Split the extension off the *last path component* only. The previous
        # rpartition('.') on the whole name mangled extension-less names
        # ('file' -> '_1.file') and could move a file into a different
        # directory when only a parent directory contained a dot
        # ('dir.x/file' -> 'dir_1.x/file').
        base, ext = posixpath.splitext(self.sname)
        c = 0
        while self.sname in all_names:
            c += 1
            # ext already carries its leading dot (or is empty).
            self.sname = '%s_%d%s' % (base, c, ext)
        rename_files(container, {self.name: self.sname})
        return True
class TooLarge(BaseError):
    """Informational notice for HTML files bigger than MAX_SIZE."""

    level = INFO
    # Size threshold, compared against len(raw) by check_html_size().
    MAX_SIZE = 260 * 1024
    HELP = _(
        'This HTML file is larger than %s. Too large HTML files can cause performance problems'
        ' on some e-book readers. Consider splitting this file into smaller sections.'
    ) % human_readable(MAX_SIZE)

    def __init__(self, name):
        message = _('File too large')
        BaseError.__init__(self, message, name)
class BadEntity(BaseError):
    """Error for an entity reference that is not recognised, with its position."""

    HELP = _(
        'This is an invalid (unrecognized) entity. Replace it with whatever'
        ' text it is supposed to have represented.'
    )

    def __init__(self, ent, name, lnum, col):
        message = _('Invalid entity: %s') % ent
        BaseError.__init__(self, message, name, lnum, col)
class BadNamespace(BaseError):
    """Error for an HTML document whose root element is not in the XHTML namespace."""

    INDIVIDUAL_FIX = _(
        'Run fix HTML on this file, which will automatically insert the correct namespace')

    def __init__(self, name, namespace):
        BaseError.__init__(self, _('Invalid or missing namespace'), name)
        template = _(
            'This file has {0}. Its namespace must be {1}. Set the namespace by defining the xmlns'
            ' attribute on the <html> element, like this <html xmlns="{1}">')
        # Describe what was actually found on the root element.
        if namespace:
            found = _('incorrect namespace %s') % namespace
        else:
            found = _('no namespace')
        # The text contains literal markup, so it is escaped before display.
        self.HELP = prepare_string_for_xml(template.format(found, XHTML_NS))

    def __call__(self, container):
        # Parse the file through the container and mark it dirty so that it
        # gets written out again.
        container.parsed(self.name)
        container.dirty(self.name)
        return True
class NonUTF8(BaseError):
    """Warning for a file whose declared encoding is not UTF-8.

    The fix rewrites the encoding declarations and saves the file as UTF-8.
    """

    level = WARN
    INDIVIDUAL_FIX = _("Change this file's encoding to UTF-8")

    def __init__(self, name, enc):
        BaseError.__init__(self, _('Non UTF-8 encoding declaration'), name)
        self.HELP = _('This file has its encoding declared as %s. Some'
                      ' reader software cannot handle non-UTF8 encoded files.'
                      ' You should change the encoding to UTF-8.') % enc

    def __call__(self, container):
        """Rewrite the file with UTF-8 encoding declarations.

        Returns True if the file was rewritten, False otherwise.
        """
        raw = container.raw_data(self.name)
        # Only text data can have its declarations replaced.
        if isinstance(raw, str):
            raw, changed = replace_encoding_declarations(raw)
            if changed:
                # Use a context manager so the handle is closed (and flushed)
                # deterministically instead of being left to the garbage
                # collector.
                with container.open(self.name, 'wb') as f:
                    f.write(raw.encode('utf-8'))
                return True
        # Explicit falsy result (previously an implicit None).
        return False
class EntitityProcessor:
    """Substitution callback for entity_pat that classifies entity references.

    Used with a bytes regex: XML entities and well-formed numeric references
    are returned unchanged, everything else is replaced by spaces of the same
    length so that offsets into the data stay valid. Offsets of recognised
    named entities and (offset, text) pairs of bad entities are collected on
    the instance.
    """

    def __init__(self, mt):
        # HTML documents may use every known entity, other types only the XML ones.
        self.entities = ALL_ENTITIES if mt in OEB_DOCS else XML_ENTITIES
        self.ok_named_entities = []
        self.bad_entities = []

    def __call__(self, m):
        whole = m.group()
        val = m.group(1).decode('ascii')
        blank = b' ' * len(whole)

        if val in XML_ENTITIES:
            # Leave XML entities alone
            return whole

        if val.startswith('#'):
            number = val[1:]
            if number.startswith('x'):
                literal, radix = number[1:], 16
            else:
                literal, radix = number, 10
            try:
                int(literal, radix)
            except ValueError:
                # Invalid numerical entity
                self.bad_entities.append((m.start(), whole))
                return blank
            return whole

        if val in self.entities:
            # Known named entity, report it
            self.ok_named_entities.append(m.start())
        else:
            self.bad_entities.append((m.start(), whole))
        return blank
def check_html_size(name, mt, raw):
    """Return a one-element list with a TooLarge notice if *raw* exceeds the size limit.

    An empty list is returned otherwise. *mt* is accepted but not used.
    """
    return [TooLarge(name)] if len(raw) > TooLarge.MAX_SIZE else []
# Bytes pattern for an entity reference: "&", an optional "#", one to eight
# ASCII letters/digits, then ";". Group 1 is the text between "&" and ";".
entity_pat = re.compile(br'&(#{0,1}[a-zA-Z0-9]{1,8});')
def check_encoding_declarations(name, container):
    """Return [NonUTF8] if the file declares an encoding other than UTF-8, else []."""
    declared = find_declared_encoding(container.raw_data(name))
    # No declaration at all, or a UTF-8 one (any letter case), is fine.
    if declared is None or declared.lower() == 'utf-8':
        return []
    return [NonUTF8(name, declared)]
def check_for_private_entities(name, raw):
    """Return True if *raw* (bytes) has a DOCTYPE containing an <!ENTITY declaration.

    Returns None when no such declaration is found. *name* is not used.
    """
    doctype_with_entity = br'<!DOCTYPE\s+.+?<!ENTITY\s+.+?]>'
    found = re.search(doctype_with_entity, raw, flags=re.DOTALL)
    return True if found is not None else None
def check_xml_parsing(name, mt, raw):
    """Check that *raw* parses as XML and return a list of error objects.

    :param name: name of the file being checked, stored on every error
    :param mt: media type of the file; selects the entity set and error class
    :param raw: the file contents as bytes
    :return: list of errors; parse failures end the check early and are
        appended after any entity errors already collected
    """
    if not raw:
        return [EmptyFile(name)]
    if check_for_private_entities(name, raw):
        return [PrivateEntities(_('Private entities found'), name)]
    # Normalise line endings before any offsets are computed.
    raw = raw.replace(b'\r\n', b'\n').replace(b'\r', b'\n')
    # Get rid of entities as named entities trip up the XML parser
    eproc = EntitityProcessor(mt)
    eraw = entity_pat.sub(eproc, raw)
    errcls = HTMLParseError if mt in OEB_DOCS else XMLParseError
    errors = []
    if eproc.ok_named_entities:
        errors.append(NamedEntities(name))
    if eproc.bad_entities:
        # Replaced entities keep their length, so offsets found in eraw are
        # valid positions in raw.
        position = PositionFinder(raw)
        for offset, ent in eproc.bad_entities:
            lnum, col = position(offset)
            errors.append(BadEntity(ent, name, lnum, col))

    try:
        root = safe_xml_fromstring(eraw, recover=False)
    except UnicodeDecodeError:
        return errors + [DecodeError(name)]
    except XMLSyntaxError as err:
        try:
            line, col = err.position
        except Exception:
            # Best effort only: report the error without a location. This was
            # a bare ``except:``, which would also have swallowed
            # KeyboardInterrupt and SystemExit.
            line = col = None
        return errors + [errcls(error_message(err), name, line, col)]
    except Exception as err:
        return errors + [errcls(error_message(err), name)]

    if mt in OEB_DOCS:
        # Namespace bound to the root element's own prefix (None if unbound).
        namespace = root.nsmap.get(root.prefix, None)
        if namespace != XHTML_NS:
            errors.append(BadNamespace(name, namespace))

    return errors
class CSSError(BaseError):
    """A problem reported by the CSS parser, as either a warning or an error.

    Only non-warning instances offer an automatic fix, which re-serialises
    the stylesheet (or the style tags and style attributes of an HTML file).
    """

    is_parsing_error = True

    def __init__(self, level, msg, name, line, col):
        self.level = level
        BaseError.__init__(self, 'CSS: ' + msg, name, line, col)
        if level != WARN:
            self.HELP = _('Some reader programs are very'
                          ' finicky about CSS stylesheets and will ignore the whole'
                          ' sheet if there is an error. These errors can often'
                          ' be fixed automatically, however, automatic fixing will'
                          ' typically remove unrecognized items, instead of correcting them.')
            self.INDIVIDUAL_FIX = _('Try to fix parsing errors in this stylesheet automatically')
        else:
            self.HELP = _('This CSS construct is not recognized. That means that it'
                          ' most likely will not work on reader devices. Consider'
                          ' replacing it with something else.')

    def __call__(self, container):
        root = container.parsed(self.name)
        container.dirty(self.name)
        if container.mime_map[self.name] not in OEB_DOCS:
            # Not an HTML document: parsing it and marking it dirty is all
            # that is done here.
            return True

        # HTML document: fix embedded <style> elements first ...
        for style in root.xpath('//*[local-name()="style"]'):
            is_css = style.get('type', 'text/css') == 'text/css'
            if is_css and style.text and style.text.strip():
                fix_style_tag(container, style)
        # ... then every non-empty inline style attribute.
        for elem in root.xpath('//*[@style]'):
            raw = elem.get('style')
            if not raw:
                continue
            declaration = container.parse_css(raw, is_declaration=True)
            css_text = force_unicode(declaration.cssText, 'utf-8')
            # Attribute values are kept on a single line.
            elem.set('style', css_text.replace('\n', ' '))
        return True
# Patterns that extract two integers from a CSS parser message: one matches
# the form "[N:M" and the other "N, M)". ErrorHandler below reads them as
# (line, column).
pos_pats = (re.compile(r'\[(\d+):(\d+)'), re.compile(r'(\d+), (\d+)\)'))
class DuplicateId(BaseError):
    """Error for an id value that occurs on more than one element of a file.

    The fix keeps the id on the first element in document order and removes
    it from all the others.
    """

    has_multiple_locations = True

    INDIVIDUAL_FIX = _(
        'Remove the duplicate ids from all but the first element')

    def __init__(self, name, eid, locs):
        BaseError.__init__(self, _('Duplicate id: %s') % eid, name)
        self.HELP = _(
            'The id {0} is present on more than one element in {1}. This is'
            ' not allowed. Remove the id from all but one of the elements').format(eid, name)
        # One location per line number on which the id was seen.
        self.all_locations = [(name, lnum, None) for lnum in sorted(locs)]
        self.duplicate_id = eid

    def __call__(self, container):
        root = container.parsed(self.name)
        first_seen = False
        for elem in root.xpath('//*[@id]'):
            if elem.get('id') != self.duplicate_id:
                continue
            if first_seen:
                elem.attrib.pop('id')
            else:
                # The first carrier of the id keeps it.
                first_seen = True
        container.dirty(self.name)
        return True
class InvalidId(BaseError):
    """Warning for an id value that does not match the allowed id syntax.

    The fix assigns a freshly generated id and updates references to the old one.
    """

    level = WARN

    INDIVIDUAL_FIX = _(
        'Replace this id with a randomly generated valid id')

    def __init__(self, name, line, eid):
        BaseError.__init__(self, _('Invalid id: %s') % eid, name, line)
        self.HELP = _(
            'The id {0} is not a valid id. IDs must start with a letter ([A-Za-z]) and may be'
            ' followed by any number of letters, digits ([0-9]), hyphens ("-"), underscores ("_")'
            ', colons (":"), and periods ("."). This is to ensure maximum compatibility'
            ' with a wide range of devices.').format(eid)
        self.invalid_id = eid

    def __call__(self, container):
        from calibre.ebooks.oeb.base import uuid_id
        from calibre.ebooks.oeb.polish.replace import replace_ids

        newid = uuid_id()
        root = container.parsed(self.name)
        matching = [e for e in root.xpath('//*[@id]') if e.get('id') == self.invalid_id]
        for elem in matching:
            elem.set('id', newid)
            container.dirty(self.name)
        changed = bool(matching)
        if changed:
            # Ask replace_ids to apply the old-id -> new-id mapping for this file.
            replace_ids(container, {self.name: {self.invalid_id: newid}})
        return changed
class BareTextInBody(BaseError):
    """Error for text placed directly inside <body> rather than in a child element.

    The fix wraps each run of bare text in a new <p> element.
    """

    INDIVIDUAL_FIX = _('Wrap the bare text in a p tag')
    HELP = _('You cannot have bare text inside the body tag. The text must be placed inside some other tag, such as p or div')
    has_multiple_locations = True

    def __init__(self, name, lines):
        BaseError.__init__(self, _('Bare text in body tag'), name)
        self.all_locations = [(name, l, None) for l in sorted(lines)]

    def __call__(self, container):
        root = container.parsed(self.name)
        for body in root.xpath('//*[local-name() = "body"]'):
            # Snapshot the element children before any <p> is inserted.
            children = tuple(body.iterchildren('*'))

            leading = body.text
            if leading and leading.strip():
                # Text before the first child element.
                para = body.makeelement(XHTML('p'))
                para.text = leading.strip()
                body.text = '\n '
                tail = '\n'
                if children:
                    tail += ' '
                para.tail = tail
                body.insert(0, para)

            for child in children:
                trailing = child.tail
                if not (trailing and trailing.strip()):
                    continue
                # Text following a child element moves into a <p> placed
                # right after that child.
                para = body.makeelement(XHTML('p'))
                para.text = trailing.strip()
                child.tail = '\n '
                tail = '\n'
                if child is not children[-1]:
                    tail += ' '
                para.tail = tail
                body.insert(body.index(child) + 1, para)
        container.dirty(self.name)
        return True
class ErrorHandler:
    """Logger stand-in that turns css_parser messages into CSSError objects.

    Messages passed to error()/warn() that carry a line and column are
    collected in self.errors; all other logger methods are no-ops.
    """

    def __init__(self, name):
        # may be disabled during setting of known valid items
        self.name = name
        self.errors = []

    def __noop(self, *args, **kwargs):
        pass

    # Logger methods that are accepted and ignored.
    info = debug = setLevel = getEffectiveLevel = addHandler = removeHandler = __noop

    def __handle(self, level, *args):
        msg = ' '.join(str(arg) for arg in args)
        line = col = None
        # Every pattern is tried; if several match, the last one wins.
        for pat in pos_pats:
            m = pat.search(msg)
            if m is None:
                continue
            line, col = int(m.group(1)), int(m.group(2))
        if not msg or line is None:
            # Ignore error messages with no line numbers as these are usually
            # summary messages for an underlying error with a line number
            return
        if 'panose-1' in msg and 'unknown property name' in msg.lower():
            return  # panose-1 is allowed in CSS 2.1 and is generated by calibre
        self.errors.append(CSSError(level, msg, self.name, line, col))

    def error(self, *args):
        self.__handle(ERROR, *args)

    def warn(self, *args):
        self.__handle(WARN, *args)

    warning = warn
def check_filenames(container):
    """Return an EscapedName warning for every renameable file whose name changes when URL-quoted."""
    # Files the container says must keep their names are never reported.
    candidates = set(container.name_path_map) - container.names_that_must_not_be_changed
    return [EscapedName(name) for name in candidates if urlquote(name) != name]
# An acceptable id: an ASCII letter followed by any number of ASCII letters,
# digits, underscores, colons, periods or hyphens.
valid_id = re.compile(r'^[a-zA-Z][a-zA-Z0-9_:.-]*$')
def check_ids(container):
    """Report invalid and duplicated id attributes in HTML, OPF and NCX files.

    Returns a list with one InvalidId per offending element and one
    DuplicateId per repeated id value in each file.
    """
    errors = []
    wanted_types = set(OEB_DOCS) | {guess_type('a.opf'), guess_type('a.ncx')}
    for name, mt in iteritems(container.mime_map):
        if mt not in wanted_types:
            continue
        root = container.parsed(name)
        first_line = {}  # id -> source line of its first occurrence
        dups = {}        # id -> source lines of all occurrences, once repeated
        for elem in root.xpath('//*[@id]'):
            eid = elem.get('id')
            if eid not in first_line:
                first_line[eid] = elem.sourceline
            else:
                if eid not in dups:
                    # Seed the list with the line of the first occurrence.
                    dups[eid] = [first_line[eid]]
                dups[eid].append(elem.sourceline)
            if eid and valid_id.match(eid) is None:
                errors.append(InvalidId(name, elem.sourceline, eid))
        for eid, locs in iteritems(dups):
            errors.append(DuplicateId(name, eid, locs))
    return errors
def check_markup(container):
    """Return one BareTextInBody error for each HTML file with text directly inside <body>."""
    errors = []
    for name, mt in iteritems(container.mime_map):
        if mt not in OEB_DOCS:
            continue
        root = container.parsed(name)
        lines = []
        for body in root.xpath('//*[local-name()="body"]'):
            # Text before the first child element.
            if body.text and body.text.strip():
                lines.append(body.sourceline)
            # Text following any child element.
            lines.extend(
                child.sourceline
                for child in body.iterchildren('*')
                if child.tail and child.tail.strip()
            )
        if lines:
            errors.append(BareTextInBody(name, lines))
    return errors