#!/usr/bin/env python3
__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

import re, codecs, sys

_encoding_pats = (
    # XML declaration
    r'<\?[^<>]+encoding\s*=\s*[\'"](.*?)[\'"][^<>]*>',
    # HTML 5 charset
    r'''<meta\s+charset=['"]([-_a-z0-9]+)['"][^<>]*>(?:\s*</meta>){0,1}''',
    # HTML 4 Pragma directive
    r'''<meta\s+?[^<>]*?content\s*=\s*['"][^'"]*?charset=([-_a-z0-9]+)[^'"]*?['"][^<>]*>(?:\s*</meta>){0,1}''',
)
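# Illustrative examples of declarations matched by the patterns above; the
# single capturing group in each pattern is the declared encoding name:
#
#   <?xml version="1.0" encoding="utf-8"?>                              -> utf-8
#   <meta charset="iso-8859-1">                                         -> iso-8859-1
#   <meta http-equiv="Content-Type" content="text/html; charset=gbk">   -> gbk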
def compile_pats(binary):
    for raw in _encoding_pats:
        if binary:
            raw = raw.encode('ascii')
        yield re.compile(raw, flags=re.IGNORECASE)


class LazyEncodingPats:

    def __call__(self, binary=False):
        attr = 'binary_pats' if binary else 'unicode_pats'
        pats = getattr(self, attr, None)
        if pats is None:
            pats = tuple(compile_pats(binary))
            setattr(self, attr, pats)
        yield from pats


lazy_encoding_pats = LazyEncodingPats()
ENTITY_PATTERN = re.compile(r'&(\S+?);')
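# Usage sketch (illustrative): the patterns are compiled lazily, once for str
# input and once for bytes input, and cached on the singleton above:
#
#   for pat in lazy_encoding_pats(binary=True):      # re.Pattern over bytes
#       if pat.search(b'<meta charset="utf-8">'):
#           ...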
def strip_encoding_declarations(raw, limit=50*1024, preserve_newlines=False):
    # Only the first `limit` bytes/characters are scanned for declarations.
    prefix = raw[:limit]
    suffix = raw[limit:]
    is_binary = isinstance(raw, bytes)
    if preserve_newlines:
        # Replace each declaration with as many newlines as it contained, so
        # that line numbering in the document is not disturbed.
        if is_binary:
            sub = lambda m: b'\n' * m.group().count(b'\n')
        else:
            sub = lambda m: '\n' * m.group().count('\n')
    else:
        sub = b'' if is_binary else ''
    for pat in lazy_encoding_pats(is_binary):
        prefix = pat.sub(sub, prefix)
    raw = prefix + suffix
    return raw
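# Usage sketch (illustrative): the input type (bytes or str) is preserved:
#
#   strip_encoding_declarations(b'<?xml version="1.0" encoding="iso-8859-1"?>\n<p>hi</p>')
#   # -> b'\n<p>hi</p>'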
def replace_encoding_declarations(raw, enc='utf-8', limit=50*1024):
    prefix = raw[:limit]
    suffix = raw[limit:]
    changed = [False]
    is_binary = isinstance(raw, bytes)
    # Make the replacement encoding name the same type (bytes/str) as the input.
    if is_binary:
        if not isinstance(enc, bytes):
            enc = enc.encode('ascii')
    else:
        if isinstance(enc, bytes):
            enc = enc.decode('ascii')

    def sub(m):
        ans = m.group()
        if m.group(1).lower() != enc.lower():
            changed[0] = True
            # Splice the new encoding name over group 1; note that `end` is
            # negative, i.e. an offset from the end of the full match.
            start, end = m.start(1) - m.start(0), m.end(1) - m.end(0)
            ans = ans[:start] + enc + ans[end:]
        return ans

    for pat in lazy_encoding_pats(is_binary):
        prefix = pat.sub(sub, prefix)
    raw = prefix + suffix
    return raw, changed[0]
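# Usage sketch (illustrative): the declaration is rewritten in place and the
# second element of the returned tuple reports whether anything changed:
#
#   replace_encoding_declarations('<meta charset="iso-8859-1"><p>x</p>')
#   # -> ('<meta charset="utf-8"><p>x</p>', True)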
def find_declared_encoding(raw, limit=50*1024):
    prefix = raw[:limit]
    is_binary = isinstance(raw, bytes)
    for pat in lazy_encoding_pats(is_binary):
        m = pat.search(prefix)
        if m is not None:
            ans = m.group(1)
            if is_binary:
                ans = ans.decode('ascii', 'replace')
            return ans
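# Usage sketch (illustrative): returns the declared name, or None when no
# declaration is present in the first `limit` bytes/characters:
#
#   find_declared_encoding(b'<?xml version="1.0" encoding="utf-16"?>')
#   # -> 'utf-16'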
def substitute_entites(raw):
    from calibre import xml_entity_to_unicode
    return ENTITY_PATTERN.sub(xml_entity_to_unicode, raw)


_CHARSET_ALIASES = {'macintosh': 'mac-roman', 'x-sjis': 'shift-jis'}
def detect(bytestring):
    # Prefer the faster cchardet if it is available, otherwise fall back to
    # the pure-python chardet package.
    try:
        from cchardet import detect as implementation
    except ImportError:
        from chardet import detect as implementation
    ans = implementation(bytestring)
    # Normalise the result: lower-case encoding name, never None, and a
    # numeric confidence.
    enc = ans.get('encoding')
    if enc:
        ans['encoding'] = enc.lower()
    elif enc is None:
        ans['encoding'] = ''
    if ans.get('confidence') is None:
        ans['confidence'] = 0
    return ans
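# Usage sketch (illustrative); the exact values depend on which detector
# library is installed:
#
#   detect('Pride and Prejudice'.encode('utf-8'))
#   # -> a chardet-style dict, e.g. {'encoding': 'ascii', 'confidence': 1.0, ...}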
def force_encoding(raw, verbose, assume_utf8=False):
    from calibre.constants import preferred_encoding
    try:
        chardet = detect(raw[:1024*50])
    except Exception:
        chardet = {'encoding': preferred_encoding, 'confidence': 0}
    encoding = chardet['encoding']
    if chardet['confidence'] < 1:
        if verbose:
            print(f'WARNING: Encoding detection confidence for {chardet["encoding"]} is {chardet["confidence"]}', file=sys.stderr)
        if assume_utf8:
            encoding = 'utf-8'
    if not encoding:
        encoding = preferred_encoding
    encoding = encoding.lower()
    encoding = _CHARSET_ALIASES.get(encoding, encoding)
    if encoding == 'ascii':
        # A pure-ASCII sample may still belong to a UTF-8 document; since
        # UTF-8 is a superset of ASCII, decoding as UTF-8 is always safe.
        encoding = 'utf-8'
    return encoding
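# Usage sketch (illustrative): with assume_utf8=True, any detection that is
# less than fully confident falls back to UTF-8 instead of the platform's
# preferred_encoding:
#
#   enc = force_encoding(b'\xe4\xbd\xa0\xe5\xa5\xbd', verbose=False, assume_utf8=True)
#   text = b'\xe4\xbd\xa0\xe5\xa5\xbd'.decode(enc, 'replace')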
def detect_xml_encoding(raw, verbose=False, assume_utf8=False):
    if not raw or isinstance(raw, str):
        return raw, None
    # Check for a byte-order mark first.
    for x in ('utf8', 'utf-16-le', 'utf-16-be'):
        bom = getattr(codecs, 'BOM_'+x.upper().replace('-16', '16').replace('-', '_'))
        if raw.startswith(bom):
            return raw[len(bom):], x
    # Otherwise look for an explicit encoding declaration in the document.
    encoding = None
    for pat in lazy_encoding_pats(True):
        match = pat.search(raw)
        if match:
            encoding = match.group(1)
            encoding = encoding.decode('ascii', 'replace')
            break
    # Fall back to statistical detection.
    if encoding is None:
        encoding = force_encoding(raw, verbose, assume_utf8=assume_utf8)
    if encoding.lower().strip() == 'macintosh':
        encoding = 'mac-roman'
    if encoding.lower().replace('_', '-').strip() in (
            'gb2312', 'chinese', 'csiso58gb231280', 'euc-cn', 'euccn',
            'eucgb2312-cn', 'gb2312-1980', 'gb2312-80', 'iso-ir-58'):
        # Microsoft Word exports to HTML with encoding incorrectly set to
        # gb2312 instead of gbk. gbk is a superset of gb2312, anyway.
        encoding = 'gbk'
    try:
        codecs.lookup(encoding)
    except LookupError:
        encoding = 'utf-8'
    return raw, encoding
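# Usage sketch (illustrative): a byte-order mark wins over any in-document
# declaration and is stripped from the returned bytes:
#
#   raw = codecs.BOM_UTF16_LE + '<p>hi</p>'.encode('utf-16-le')
#   detect_xml_encoding(raw)
#   # -> (the document bytes without the BOM, 'utf-16-le')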
def xml_to_unicode(raw, verbose=False, strip_encoding_pats=False,
        resolve_entities=False, assume_utf8=False):
    '''
    Force conversion of a byte string to unicode. Looks for an XML/HTML
    encoding declaration first; if none is found, uses the chardet library and
    prints a warning if the detection confidence is < 100%.
    @return: (unicode, encoding used)
    '''
    if not raw:
        return '', None
    raw, encoding = detect_xml_encoding(raw, verbose=verbose,
            assume_utf8=assume_utf8)
    if not isinstance(raw, str):
        raw = raw.decode(encoding, 'replace')
    if strip_encoding_pats:
        raw = strip_encoding_declarations(raw)
    if resolve_entities:
        raw = substitute_entites(raw)
    return raw, encoding
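if __name__ == '__main__':
    # Illustrative usage sketch: the declared encoding wins over statistical
    # detection, and the declaration can optionally be stripped.
    sample = b'<?xml version="1.0" encoding="iso-8859-1"?><p>caf\xe9</p>'
    text, used = xml_to_unicode(sample, strip_encoding_pats=True)
    assert used == 'iso-8859-1' and text == '<p>caf\xe9</p>'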