# Directory: /lib/calibre/calibre/ebooks/lit/
# Current file: /lib/calibre/calibre/ebooks/lit/reader.py
'''
Support for reading LIT files.
'''
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net> ' \
'and Marshall T. Vandegrift <llasram@gmail.com>'
import io, struct, os, functools, re
from lxml import etree
from calibre.ebooks.lit import LitError
from calibre.ebooks.lit.maps import OPF_MAP, HTML_MAP
import calibre.ebooks.lit.mssha1 as mssha1
from calibre.ebooks.oeb.base import urlnormalize, xpath
from calibre.ebooks.oeb.reader import OEBReader
from calibre.ebooks import DRMError
from polyglot.builtins import codepoint_to_chr, string_or_bytes, itervalues
from polyglot.urllib import unquote as urlunquote, urldefrag
from calibre_extensions import lzx, msdes
# Only the reader class is exported from this module.
__all__ = ["LitReader"]

# Bare XML declaration (not referenced elsewhere in the visible part of this
# file).
XML_DECL = """<?xml version="1.0" encoding="UTF-8" ?>
"""
# Prefix that LitContainer.read() puts in front of the decoded OPF metadata.
OPF_DECL = """<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE package
PUBLIC "+//ISBN 0-9673008-1-9//DTD OEB 1.0.1 Package//EN"
"http://openebook.org/dtds/oeb-1.0.1/oebpkg101.dtd">
"""
# Prefix that LitContainer.read() puts in front of decoded content documents.
HTML_DECL = """<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE html PUBLIC
"+//ISBN 0-9673008-1-9//DTD OEB 1.0.1 Document//EN"
"http://openebook.org/dtds/oeb-1.0.1/oebdoc101.dtd">
"""
# Transform GUIDs, in the textual form produced by msguid(), that
# LitFile.get_section_uncached() knows how to undo (DES decryption and LZX
# decompression respectively).
DESENCRYPT_GUID = "{67F6E4A2-60BF-11D3-8540-00C04F58C3CF}"
LZXCOMPRESS_GUID = "{0A9007C6-4076-11D3-8789-0000F8105754}"
# Byte offsets into a section's ControlData, used by LitFile.decompress():
# the b"LZXC" tag and the 32-bit value the LZX window size is derived from.
CONTROL_TAG = 4
CONTROL_WINDOW_SIZE = 12
# Byte offsets into the LZX reset table, used by LitFile.decompress().
# RESET_NENTRIES is not referenced in the visible part of this file.
RESET_NENTRIES = 4
RESET_HDRLEN = 12
RESET_UCLENGTH = 16
RESET_INTERVAL = 32
# Bits of the flags character that precedes each tag in LIT binary markup
# (tested in UnBinary.binary_to_text_inner()).  FLAG_BLOCK and FLAG_HEAD are
# not referenced in the visible part of this file.
FLAG_OPENING = (1 << 0)
FLAG_CLOSING = (1 << 1)
FLAG_BLOCK = (1 << 2)
FLAG_HEAD = (1 << 3)
FLAG_ATOM = (1 << 4)
def u32(bytes):
    """Decode the first four bytes of ``bytes`` as an unsigned little-endian
    32-bit integer.

    Raises ``struct.error`` when fewer than four bytes are available.
    """
    (value,) = struct.unpack('<L', bytes[:4])
    return value
def u16(bytes):
    """Decode the first two bytes of ``bytes`` as an unsigned little-endian
    16-bit integer.

    Raises ``struct.error`` when fewer than two bytes are available.
    """
    (value,) = struct.unpack('<H', bytes[:2])
    return value
def int32(bytes):
    """Decode the first four bytes of ``bytes`` as a signed little-endian
    32-bit integer.

    Raises ``struct.error`` when fewer than four bytes are available.
    """
    (value,) = struct.unpack('<l', bytes[:4])
    return value
def encint(byts, remaining):
    """Read one variable-length integer from the front of ``byts``.

    Each byte contributes its low seven bits, most significant group first;
    a byte with the high bit clear ends the number.  At most ``remaining``
    bytes are consumed.

    Returns a ``(value, rest_of_byts, remaining_after)`` tuple.
    """
    value = 0
    consumed = 0
    octets = bytearray(byts)
    while remaining > 0:
        octet = octets[consumed]
        consumed += 1
        remaining -= 1
        value = (value << 7) | (octet & 0x7f)
        if not octet & 0x80:
            # High bit clear: this was the final byte of the number.
            break
    return value, byts[consumed:], remaining
def msguid(bytes):
    """Format the first 16 bytes of ``bytes`` as a braced, upper-case GUID
    string.

    The first three fields are read little-endian (32, 16 and 16 bits); the
    remaining eight bytes are emitted in the order they appear.
    """
    template = "{%08lX-%04X-%04X-%02X%02X-%02X%02X%02X%02X%02X%02X}"
    fields = struct.unpack("<LHHBBBBBBBB", bytes[:16])
    return template % fields
def read_utf8_char(bytes, pos):
    """Decode one UTF-8 style character from ``bytes`` starting at ``pos``.

    Returns ``(character, next_pos)``.  The sequence length is taken from
    the number of leading one bits in the first byte.

    Raises ``LitError`` for a stray continuation byte, a lead byte with no
    payload bits, a sequence running past the end of ``bytes``, or a
    malformed continuation byte.
    """
    lead = ord(bytes[pos:pos+1])
    if not lead & 0x80:
        # Plain single-byte character.
        return codepoint_to_chr(lead), pos + 1

    # Count the leading one bits; ``mask`` ends on the first zero bit.
    mask = 0x80
    width = 0
    while lead & mask:
        mask >>= 1
        width += 1
    if mask <= 1 or mask == 0x40:
        # 0x40: a lone continuation byte; <= 1: no room for payload bits.
        raise LitError('Invalid UTF8 character: %s' % repr(bytes[pos]))
    if pos + width > len(bytes):
        raise LitError('Invalid UTF8 character: %s' % repr(bytes[pos]))

    code = lead & (mask - 1)
    for step in range(1, width):
        cont = ord(bytes[pos+step:pos+step+1])
        if (cont & 0xC0) != 0x80:
            raise LitError(
                'Invalid UTF8 character: %s' % repr(bytes[pos:pos+step]))
        code = (code << 6) | (cont & 0x3F)
    return codepoint_to_chr(code), pos + width
def consume_sized_utf8_string(bytes, zpad=False):
    """Read a length-prefixed string from the front of ``bytes``.

    The first character (decoded with ``read_utf8_char``) gives the number
    of characters that follow.  When ``zpad`` is true, a single NUL byte
    directly after the string is skipped as well.

    Returns ``(text, rest_of_bytes)``.
    """
    length_char, cursor = read_utf8_char(bytes, 0)
    chars = []
    for _ in range(ord(length_char)):
        char, cursor = read_utf8_char(bytes, cursor)
        chars.append(char)
    if zpad and bytes[cursor:cursor+1] == b'\0':
        cursor += 1
    text = ''.join(chars)
    return text, bytes[cursor:]
def encode(string):
    """Return ``str(string)`` as ASCII bytes, writing every non-ASCII
    character as a numeric XML character reference."""
    text = str(string)
    return text.encode('ascii', 'xmlcharrefreplace')
class UnBinary:
    """Translate LIT binary markup into XML/HTML text.

    The constructor decodes its input immediately.  The result is available
    as bytes through ``binary_representation`` and as text through
    ``unicode_representation`` (or ``str()``).
    """

    # An '&' that does not begin a numeric or named entity reference.  The
    # hyphen sits last in the name class so it is a literal hyphen rather
    # than forming an accidental '.'-to-'_' range.
    AMPERSAND_RE = re.compile(
        br'&(?!(?:#[0-9]+|#x[0-9a-fA-F]+|[a-zA-Z_:][a-zA-Z0-9._:-]+);)')
    # binary_to_text_inner() doubles every literal '<' and '>' found in text
    # so that escape_reserved() can tell them apart from generated markup.
    OPEN_ANGLE_RE = re.compile(br'<<(?![!]--)')
    CLOSE_ANGLE_RE = re.compile(br'(?<!--)>>(?=>>|[^>])')
    DOUBLE_ANGLE_RE = re.compile(br'([<>])\1')
    EMPTY_ATOMS = ({},{})

    def __init__(self, bin, path, manifest={}, map=HTML_MAP, atoms=EMPTY_ATOMS):
        """Decode ``bin`` and keep the escaped result in ``self.raw``.

        :param bin: bytes of LIT binary markup.
        :param path: path of the document being decoded; only its directory
            is used, to make ``href``/``src`` targets relative.
        :param manifest: mapping of internal id to an object with a ``path``
            attribute (only read, never modified).
        :param map: ``(tag_map, attr_map, tag_to_attr_map)`` triple.
        :param atoms: ``(tag_atoms, attr_atoms)`` pair of per-document names.
        :raises LitError: if the binary markup is malformed.
        """
        self.manifest = manifest
        self.tag_map, self.attr_map, self.tag_to_attr_map = map
        self.is_html = map is HTML_MAP
        self.tag_atoms, self.attr_atoms = atoms
        self.dir = os.path.dirname(path)
        buf = io.BytesIO()
        self.binary_to_text(bin, buf)
        self.raw = buf.getvalue().lstrip()
        self.escape_reserved()
        self._tree = None

    def escape_reserved(self):
        """Escape reserved XML characters that came from document text.

        Bare ampersands become ``&amp;`` and the doubled angle brackets
        written for literal text become ``&lt;`` / ``&gt;``.  Doubled
        brackets that the first two patterns deliberately skip (those next
        to ``!--`` / ``--``) are then collapsed back to a single bracket.
        """
        raw = self.raw
        raw = self.AMPERSAND_RE.sub(br'&amp;', raw)
        raw = self.OPEN_ANGLE_RE.sub(br'&lt;', raw)
        raw = self.CLOSE_ANGLE_RE.sub(br'&gt;', raw)
        raw = self.DOUBLE_ANGLE_RE.sub(br'\1', raw)
        self.raw = raw

    def item_path(self, internal_id):
        """Return the path of manifest item ``internal_id`` relative to the
        directory of the document being decoded.

        Ids missing from the manifest are returned unchanged.
        """
        try:
            target = self.manifest[internal_id].path
        except KeyError:
            return internal_id
        if not self.dir:
            return target
        target = target.split('/')
        base = self.dir.split('/')
        # Find the first path component where the two paths diverge.
        for index in range(min(len(base), len(target))):
            if base[index] != target[index]:
                break
        else:
            # Every compared component matched; step past the last one.
            index += 1
        relpath = (['..'] * (len(base) - index)) + target[index:]
        return '/'.join(relpath)

    @property
    def binary_representation(self):
        """The decoded markup as bytes."""
        return self.raw

    @property
    def unicode_representation(self):
        """The decoded markup as text (``self.raw`` decoded as UTF-8)."""
        return self.raw.decode('utf-8')

    def __unicode__(self):
        return self.unicode_representation

    def __str__(self):
        return self.unicode_representation

    def binary_to_text(self, bin, buf):
        """Decode all of ``bin`` into ``buf``.

        Element nesting is handled with an explicit stack of saved parser
        frames instead of recursion; each call to binary_to_text_inner()
        pops one frame and runs until it finishes or descends into a child
        element.
        """
        stack = [(0, None, None, 0, 0, False, False, 'text', 0)]
        self.cpos = 0
        while stack:
            self.binary_to_text_inner(bin, buf, stack)
        del self.cpos

    def binary_to_text_inner(self, bin, buf, stack):
        """Run the decoder state machine for the frame on top of ``stack``.

        A frame is the tuple ``(depth, tag_name, current_map, dynamic_tag,
        errors, in_censorship, is_goingdown, state, flags)``.  Input is read
        from ``bin`` at ``self.cpos`` and output is written to ``buf``.

        :raises LitError: on malformed input (unknown attribute, bad length,
            unbalanced closing tag, missing atom).
        """
        (depth, tag_name, current_map, dynamic_tag, errors,
            in_censorship, is_goingdown, state, flags) = stack.pop()

        if state == 'close tag':
            # Resuming a parent frame after its children finished.
            if not tag_name:
                raise LitError('Tag ends before it begins.')
            buf.write(encode(''.join(('</', tag_name, '>'))))
            dynamic_tag = 0
            tag_name = None
            state = 'text'

        while self.cpos < len(bin):
            c, self.cpos = read_utf8_char(bin, self.cpos)
            oc = ord(c)

            if state == 'text':
                if oc == 0:
                    state = 'get flags'
                    continue
                elif c == '\v':
                    c = '\n'
                elif c == '>':
                    # Doubled so escape_reserved() can recognise literal text.
                    c = '>>'
                elif c == '<':
                    c = '<<'
                buf.write(encode(c))

            elif state == 'get flags':
                if oc == 0:
                    state = 'text'
                    continue
                flags = oc
                state = 'get tag'

            elif state == 'get tag':
                state = 'text' if oc == 0 else 'get attr'
                if flags & FLAG_OPENING:
                    tag = oc
                    buf.write(b'<')
                    if not (flags & FLAG_CLOSING):
                        is_goingdown = True
                    if tag == 0x8000:
                        # The tag name is spelled out inline.
                        state = 'get custom length'
                        continue
                    if flags & FLAG_ATOM:
                        if not self.tag_atoms or tag not in self.tag_atoms:
                            raise LitError(
                                "atom tag %d not in atom tag list" % tag)
                        tag_name = self.tag_atoms[tag]
                        current_map = self.attr_atoms
                    elif tag < len(self.tag_map):
                        tag_name = self.tag_map[tag]
                        current_map = self.tag_to_attr_map[tag]
                    else:
                        dynamic_tag += 1
                        errors += 1
                        tag_name = '?'+codepoint_to_chr(tag)+'?'
                        # NOTE(review): tag >= len(self.tag_map) here, so this
                        # lookup presumably fails if tag_to_attr_map is
                        # indexed like tag_map -- confirm.
                        current_map = self.tag_to_attr_map[tag]
                        print('WARNING: tag %s unknown' % codepoint_to_chr(tag))
                    buf.write(encode(tag_name))
                elif flags & FLAG_CLOSING:
                    if depth == 0:
                        raise LitError('Extra closing tag %s at %d'%(tag_name,
                            self.cpos))
                    # Return to the parent frame, which emits the end tag.
                    break

            elif state == 'get attr':
                in_censorship = False
                if oc == 0:
                    # End of the attribute list.
                    state = 'text'
                    if not is_goingdown:
                        tag_name = None
                        dynamic_tag = 0
                        buf.write(b' />')
                    else:
                        buf.write(b'>')
                        # Save this element to be closed later, then start a
                        # fresh frame for its children.
                        frame = (depth, tag_name, current_map,
                            dynamic_tag, errors, in_censorship, False,
                            'close tag', flags)
                        stack.append(frame)
                        frame = (depth+1, None, None, 0, 0,
                                False, False, 'text', 0)
                        stack.append(frame)
                        break
                else:
                    if oc == 0x8000:
                        # The attribute name is spelled out inline.
                        state = 'get attr length'
                        continue
                    attr = None
                    if current_map and oc in current_map and current_map[oc]:
                        attr = current_map[oc]
                    elif oc in self.attr_map:
                        attr = self.attr_map[oc]
                    if not attr or not isinstance(attr, string_or_bytes):
                        raise LitError(
                            'Unknown attribute %d in tag %s' % (oc, tag_name))
                    if attr.startswith('%'):
                        # Names starting with '%' are consumed but not output.
                        in_censorship = True
                        state = 'get value length'
                        continue
                    buf.write(b' ' + encode(attr) + b'=')
                    if attr in ['href', 'src']:
                        state = 'get href length'
                    else:
                        state = 'get value length'

            elif state == 'get value length':
                if not in_censorship:
                    buf.write(b'"')
                count = oc - 1
                if count == 0:
                    # Empty value.
                    if not in_censorship:
                        buf.write(b'"')
                    in_censorship = False
                    state = 'get attr'
                    continue
                state = 'get value'
                if oc == 0xffff:
                    # The next character carries a number, not text.
                    continue
                if count < 0 or count > (len(bin) - self.cpos):
                    raise LitError('Invalid character count %d' % count)

            elif state == 'get value':
                if count == 0xfffe:
                    # Numeric value announced by a 0xffff length.
                    if not in_censorship:
                        buf.write(encode('%s"' % (oc - 1)))
                    in_censorship = False
                    state = 'get attr'
                elif count > 0:
                    if not in_censorship:
                        # The value is written inside double quotes, so both
                        # characters must be escaped to keep the XML valid.
                        if c == '"':
                            c = '&quot;'
                        elif c == '<':
                            c = '&lt;'
                        if isinstance(c, str):
                            c = c.encode('ascii', 'xmlcharrefreplace')
                        buf.write(c)
                    count -= 1
                if count == 0:
                    if not in_censorship:
                        buf.write(b'"')
                    in_censorship = False
                    state = 'get attr'

            elif state == 'get custom length':
                count = oc - 1
                if count <= 0 or count > len(bin)-self.cpos:
                    raise LitError('Invalid character count %d' % count)
                dynamic_tag += 1
                state = 'get custom'
                tag_name = ''

            elif state == 'get custom':
                tag_name += c
                count -= 1
                if count == 0:
                    buf.write(encode(tag_name))
                    state = 'get attr'

            elif state == 'get attr length':
                count = oc - 1
                if count <= 0 or count > (len(bin) - self.cpos):
                    raise LitError('Invalid character count %d' % count)
                buf.write(b' ')
                state = 'get custom attr'

            elif state == 'get custom attr':
                buf.write(encode(c))
                count -= 1
                if count == 0:
                    buf.write(b'=')
                    state = 'get value length'

            elif state == 'get href length':
                count = oc - 1
                if count <= 0 or count > (len(bin) - self.cpos):
                    raise LitError('Invalid character count %d' % count)
                href = ''
                state = 'get href'

            elif state == 'get href':
                href += c
                count -= 1
                if count == 0:
                    # The first character of the stored href is dropped; the
                    # rest is an internal id with an optional #fragment.
                    doc, frag = urldefrag(href[1:])
                    path = self.item_path(doc)
                    if frag:
                        path = '#'.join((path, frag))
                    path = urlnormalize(path)
                    buf.write(encode('"%s"' % path))
                    state = 'get attr'
class DirectoryEntry:
    """One entry of the LIT internal directory: a named byte range
    (``offset``, ``size``) inside section number ``section``."""

    def __init__(self, name, section, offset, size):
        self.name, self.section = name, section
        self.offset, self.size = offset, size

    def __repr__(self):
        fields = (repr(self.name), self.section, self.offset, self.size)
        template = "DirectoryEntry(name=%s, section=%d, offset=%d, size=%d)"
        return template % fields

    def __str__(self):
        return repr(self)
class ManifestItem:
    """One file listed in the LIT manifest.

    ``original`` is the path as stored in the file and ``path`` is the
    cleaned-up form of it.  ``internal`` is the id used to locate the item's
    data inside the LIT file.  Items compare equal to any object with the
    same ``internal`` attribute, and to the bare internal id itself.
    """

    def __init__(self, original, internal, mime_type, offset, root, state):
        self.original = original
        self.internal = internal
        # Lower-case the MIME type when it is string-like; keep e.g. None.
        self.mime_type = mime_type.lower() if hasattr(mime_type, 'lower') else mime_type
        self.offset = offset
        self.root = root
        self.state = state
        # Some LIT files have Windows-style paths
        path = original.replace('\\', '/')
        if path[1:3] == ':/':
            # Drop the drive letter and colon, keeping the leading slash.
            path = path[2:]
        # Some paths in Fictionwise "multiformat" LIT files contain '..' (!?)
        path = os.path.normpath(path).replace('\\', '/')
        while path.startswith('../'):
            path = path[3:]
        self.path = path

    def __eq__(self, other):
        if hasattr(other, 'internal'):
            return self.internal == other.internal
        return self.internal == other

    def __hash__(self):
        # Defining __eq__ alone makes a class unhashable on Python 3.  Hash
        # on the same key that __eq__ compares so equal objects hash alike.
        return hash(self.internal)

    def __repr__(self):
        return "ManifestItem(internal=%r, path=%r, mime_type=%r, " \
            "offset=%d, root=%r, state=%r)" \
            % (self.internal, self.path, self.mime_type, self.offset,
               self.root, self.state)
def preserve(function):
    """Decorator for methods of objects that have a ``stream`` attribute.

    The stream position is recorded before the wrapped method runs and is
    restored afterwards, whether the method returns or raises.
    """
    def restoring(self, *args, **kwargs):
        saved_position = self.stream.tell()
        try:
            result = function(self, *args, **kwargs)
        finally:
            self.stream.seek(saved_position)
        return result
    return functools.update_wrapper(restoring, function)
class LitFile:
    """Parser for the LIT container format.

    Construction reads the headers, the internal directory, the section
    name list, the manifest and the DRM information.  File contents are then
    available by internal name through ``get_file()``.
    """

    # Size in bytes of one entry of the header piece table.
    PIECE_SIZE = 16

    def __init__(self, filename_or_stream, log):
        """Open and index a LIT file.

        :param filename_or_stream: a path, or an object with a ``read``
            method that is used as the (seekable) input stream.
        :param log: object whose ``warn`` method receives warnings.
        :raises LitError: if the file is not a LIT file this code can parse.
        :raises DRMError: if the book uses DRM that cannot be removed.
        """
        self._warn = log.warn
        opened_here = not hasattr(filename_or_stream, 'read')
        if opened_here:
            self.stream = open(filename_or_stream, 'rb')
        else:
            self.stream = filename_or_stream
        try:
            try:
                self.opf_path = os.path.splitext(
                    os.path.basename(self.stream.name))[0] + '.opf'
            except AttributeError:
                # Streams without a name (e.g. in-memory buffers).
                self.opf_path = 'content.opf'
            if self.magic != b'ITOLITLS':
                raise LitError('Not a valid LIT file')
            if self.version != 1:
                raise LitError('Unknown LIT version %d' % (self.version,))
            self.read_secondary_header()
            self.read_header_pieces()
            self.read_section_names()
            self.read_manifest()
            self.read_drm()
        except Exception:
            # Do not leak a file handle that only this object knows about.
            if opened_here:
                self.stream.close()
            raise

    def warn(self, msg):
        """Forward ``msg`` to the log's ``warn`` method."""
        self._warn(msg)

    @property
    @preserve
    def magic(self):
        """The first eight bytes of the file."""
        self.stream.seek(0)
        return self.stream.read(8)

    @property
    def version(self):
        """Unsigned 32-bit format version stored at offset 8.

        Unlike the other header accessors this one is not wrapped in
        ``preserve``, so reading it leaves the stream at offset 12.
        """
        self.stream.seek(8)
        return u32(self.stream.read(4))

    @property
    @preserve
    def hdr_len(self):
        """Signed 32-bit primary header length stored at offset 12."""
        self.stream.seek(12)
        return int32(self.stream.read(4))

    @property
    @preserve
    def num_pieces(self):
        """Signed 32-bit number of header pieces stored at offset 16."""
        self.stream.seek(16)
        return int32(self.stream.read(4))

    @property
    @preserve
    def sec_hdr_len(self):
        """Signed 32-bit secondary header length stored at offset 20."""
        self.stream.seek(20)
        return int32(self.stream.read(4))

    @property
    @preserve
    def guid(self):
        """The 16 raw bytes stored at offset 24."""
        self.stream.seek(24)
        return self.stream.read(16)

    @property
    @preserve
    def header(self):
        """Primary header, piece table and secondary header as one bytes
        object, read from the start of the file."""
        size = self.hdr_len \
            + (self.num_pieces * self.PIECE_SIZE) \
            + self.sec_hdr_len
        self.stream.seek(0)
        return self.stream.read(size)

    @preserve
    def __len__(self):
        """Total size of the underlying stream in bytes."""
        self.stream.seek(0, 2)
        return self.stream.tell()

    @preserve
    def read_raw(self, offset, size):
        """Read ``size`` bytes at absolute stream position ``offset``."""
        self.stream.seek(offset)
        return self.stream.read(size)

    def read_content(self, offset, size):
        """Read ``size`` bytes at ``offset`` relative to the content area."""
        return self.read_raw(self.content_offset + offset, size)

    def read_secondary_header(self):
        """Parse the CAOL and ITSF blocks of the secondary header.

        Sets ``creator_id``, ``entry_chunklen``, ``count_chunklen``,
        ``entry_unknown`` and ``count_unknown`` from the CAOL block, and
        ``content_offset``, ``timestamp`` and ``language_id`` from the ITSF
        block.

        :raises LitError: on an unknown block type or version, a 64-bit
            content offset, or when no content offset was found.
        """
        offset = self.hdr_len + (self.num_pieces * self.PIECE_SIZE)
        byts = self.read_raw(offset, self.sec_hdr_len)
        offset = int32(byts[4:])
        while offset < len(byts):
            blocktype = byts[offset:offset+4]
            blockver = u32(byts[offset+4:])
            if blocktype == b'CAOL':
                if blockver != 2:
                    raise LitError(
                        'Unknown CAOL block format %d' % blockver)
                self.creator_id = u32(byts[offset+12:])
                self.entry_chunklen = u32(byts[offset+20:])
                self.count_chunklen = u32(byts[offset+24:])
                self.entry_unknown = u32(byts[offset+28:])
                self.count_unknown = u32(byts[offset+32:])
                offset += 48
            elif blocktype == b'ITSF':
                if blockver != 4:
                    raise LitError(
                        'Unknown ITSF block format %d' % blockver)
                if u32(byts[offset+4+16:]):
                    raise LitError('This file has a 64bit content offset')
                self.content_offset = u32(byts[offset+16:])
                self.timestamp = u32(byts[offset+24:])
                self.language_id = u32(byts[offset+28:])
                offset += 48
            else:
                # The size of an unknown block cannot be determined, so it
                # cannot be skipped.  Without this branch ``offset`` never
                # advances and the loop spins forever on a malformed file.
                raise LitError(
                    'Unknown secondary header block type %r' % blocktype)
        if not hasattr(self, 'content_offset'):
            raise LitError('Could not figure out the content offset')

    def read_header_pieces(self):
        """Walk the header piece table.

        Piece 1 is validated against the secondary header and parsed as the
        directory, piece 2 is only validated, and pieces 3 and 4 are stored
        raw as ``piece3_guid`` / ``piece4_guid``.

        :raises LitError: on 64-bit piece values or a mismatch with the
            secondary header.
        """
        src = self.header[self.hdr_len:]
        for i in range(self.num_pieces):
            piece = src[i * self.PIECE_SIZE:(i + 1) * self.PIECE_SIZE]
            # The upper 32 bits of both the offset and the size must be zero.
            if u32(piece[4:]) != 0 or u32(piece[12:]) != 0:
                raise LitError('Piece %s has 64bit value' % repr(piece))
            offset, size = u32(piece), int32(piece[8:])
            piece = self.read_raw(offset, size)
            if i == 0:
                continue  # Dont need this piece
            elif i == 1:
                if u32(piece[8:]) != self.entry_chunklen or \
                        u32(piece[12:]) != self.entry_unknown:
                    raise LitError('Secondary header does not match piece')
                self.read_directory(piece)
            elif i == 2:
                if u32(piece[8:]) != self.count_chunklen or \
                        u32(piece[12:]) != self.count_unknown:
                    raise LitError('Secondary header does not match piece')
                continue  # No data needed from this piece
            elif i == 3:
                self.piece3_guid = piece
            elif i == 4:
                self.piece4_guid = piece

    def read_directory(self, piece):
        """Parse the IFCM directory ``piece`` into ``self.entries``, a dict
        mapping entry name to ``DirectoryEntry``.

        :raises LitError: if the piece is not a well-formed IFCM directory.
        """
        if not piece.startswith(b'IFCM'):
            raise LitError('Header piece #1 is not main directory.')
        chunk_size, num_chunks = int32(piece[8:12]), int32(piece[24:28])
        if (32 + (num_chunks * chunk_size)) != len(piece):
            raise LitError('IFCM header has incorrect length')
        self.entries = {}
        for i in range(num_chunks):
            offset = 32 + (i * chunk_size)
            chunk = piece[offset:offset + chunk_size]
            tag, chunk = chunk[:4], chunk[4:]
            if tag != b'AOLL':
                # Only AOLL chunks are read for entries.
                continue
            remaining, chunk = int32(chunk[:4]), chunk[4:]
            if remaining >= chunk_size:
                raise LitError('AOLL remaining count is negative')
            # Convert the stored value into the number of entry bytes
            # available for parsing.
            remaining = chunk_size - (remaining + 48)
            num_entries = u16(chunk[-2:])
            if num_entries == 0:
                # Hopefully will work even without a correct entries count
                num_entries = (2 ** 16) - 1
            chunk = chunk[40:]
            for _ in range(num_entries):
                if remaining <= 0:
                    break
                namelen, chunk, remaining = encint(chunk, remaining)
                if namelen != (namelen & 0x7fffffff):
                    raise LitError('Directory entry had 64bit name length.')
                # At least one byte each is still needed for the three
                # integers that follow the name.
                if namelen > remaining - 3:
                    raise LitError('Read past end of directory chunk')
                try:
                    name = chunk[:namelen].decode('utf-8')
                    chunk = chunk[namelen:]
                    remaining -= namelen
                except UnicodeDecodeError:
                    break
                section, chunk, remaining = encint(chunk, remaining)
                offset, chunk, remaining = encint(chunk, remaining)
                size, chunk, remaining = encint(chunk, remaining)
                entry = DirectoryEntry(name, section, offset, size)
                self.entries[name] = entry

    def read_section_names(self):
        """Read '::DataSpace/NameList' into ``self.section_names`` and
        create the matching (initially empty) ``self.section_data`` cache.

        :raises LitError: if the name list is missing or truncated.
        """
        if '::DataSpace/NameList' not in self.entries:
            raise LitError('Lit file does not have a valid NameList')
        raw = self.get_file('::DataSpace/NameList')
        if len(raw) < 4:
            raise LitError('Invalid Namelist section')
        pos = 4
        num_sections = u16(raw[2:pos])
        self.section_names = [""] * num_sections
        self.section_data = [None] * num_sections
        for section in range(num_sections):
            size = u16(raw[pos:pos+2])
            pos += 2
            # ``size`` counts UTF-16 code units; one more unit follows the
            # name (stripped below as a trailing NUL).
            size = size*2 + 2
            if pos + size > len(raw):
                raise LitError('Invalid Namelist section')
            self.section_names[section] = \
                raw[pos:pos+size].decode('utf-16-le').rstrip('\0')
            pos += size

    def read_manifest(self):
        """Parse '/manifest'.

        Fills ``self.manifest`` (internal id -> ``ManifestItem``) and
        ``self.paths`` (cleaned path -> ``ManifestItem``).  ``self.paths``
        also maps the OPF path to ``None``.

        :raises LitError: if the manifest is missing or truncated.
        """
        if '/manifest' not in self.entries:
            raise LitError('Lit file does not have a valid manifest')
        raw = self.get_file('/manifest')
        self.manifest = {}
        self.paths = {self.opf_path: None}
        while raw:
            slen, raw = ord(raw[0:1]), raw[1:]
            if slen == 0:
                break
            root, raw = raw[:slen].decode('utf8'), raw[slen:]
            if not raw:
                raise LitError('Truncated manifest')
            # Each root holds four groups of files, always in this order.
            for state in ['spine', 'not spine', 'css', 'images']:
                num_files, raw = int32(raw), raw[4:]
                if num_files == 0:
                    continue
                for _ in range(num_files):
                    if len(raw) < 5:
                        raise LitError('Truncated manifest')
                    offset, raw = u32(raw), raw[4:]
                    internal, raw = consume_sized_utf8_string(raw)
                    original, raw = consume_sized_utf8_string(raw)
                    # The path should be stored unquoted, but not always
                    original = urlunquote(original)
                    # Is this last one UTF-8 or ASCIIZ?
                    mime_type, raw = consume_sized_utf8_string(raw, zpad=True)
                    self.manifest[internal] = ManifestItem(
                        original, internal, mime_type, offset, root, state)
        mlist = list(itervalues(self.manifest))
        # Remove any common path elements
        if len(mlist) > 1:
            shared = mlist[0].path
            for item in mlist[1:]:
                path = item.path
                while shared and not path.startswith(shared):
                    try:
                        # Shorten the shared prefix by one path component.
                        shared = shared[:shared.rindex("/", 0, -2) + 1]
                    except ValueError:
                        shared = None
                if not shared:
                    break
            if shared:
                slen = len(shared)
                for item in mlist:
                    item.path = item.path[slen:]
        # Fix any straggling absolute paths
        for item in mlist:
            # startswith() rather than [0]: stripping the shared prefix can
            # leave an empty path, which must not raise IndexError.
            if item.path.startswith('/'):
                item.path = os.path.basename(item.path)
            self.paths[item.path] = item

    def read_drm(self):
        """Determine ``self.drmlevel`` and, where possible, the book key.

        Level 0 means no DRM entries were found.  For levels 1 and 3 the
        sealed key is decrypted into ``self.bookkey``.

        :raises LitError: if the book key cannot be decrypted.
        :raises DRMError: for level 5 (a licence entry is present).
        """
        self.drmlevel = 0
        if '/DRMStorage/Licenses/EUL' in self.entries:
            self.drmlevel = 5
        elif '/DRMStorage/DRMBookplate' in self.entries:
            self.drmlevel = 3
        elif '/DRMStorage/DRMSealed' in self.entries:
            self.drmlevel = 1
        else:
            return
        if self.drmlevel < 5:
            msdes.deskey(self.calculate_deskey(), msdes.DE1)
            bookkey = msdes.des(self.get_file('/DRMStorage/DRMSealed'))
            if bookkey[0:1] != b'\0':
                raise LitError('Unable to decrypt title key!')
            self.bookkey = bookkey[1:9]
        else:
            raise DRMError("Cannot access DRM-protected book")

    def calculate_deskey(self):
        """Derive the 8-byte DES key used to unseal the book key.

        The key is the hash digest of the DRM-related files (each padded
        with NULs to a multiple of 64 bytes, the first also prefixed with
        two NULs), XOR-folded into eight bytes.
        """
        hashfiles = ['/meta', '/DRMStorage/DRMSource']
        if self.drmlevel == 3:
            hashfiles.append('/DRMStorage/DRMBookplate')
        prepad = 2
        hasher = mssha1.new()
        for name in hashfiles:
            data = self.get_file(name)
            if prepad > 0:
                # Only the first file gets the leading padding.
                data = (b"\000" * prepad) + data
                prepad = 0
            postpad = 64 - (len(data) % 64)
            if postpad < 64:
                data = data + (b"\000" * postpad)
            hasher.update(data)
        digest = hasher.digest()
        if not isinstance(digest, bytes):
            digest = digest.encode('ascii')
        digest = bytearray(digest)
        key = bytearray(8)
        for i, d in enumerate(digest):
            key[i % 8] ^= d
        return bytes(key)

    def get_file(self, name):
        """Return the bytes of the directory entry called ``name``.

        Section 0 is read straight from the content area; other sections
        are fetched (and cached) through ``get_section``.
        Raises ``KeyError`` if there is no such entry.
        """
        entry = self.entries[name]
        if entry.section == 0:
            return self.read_content(entry.offset, entry.size)
        section = self.get_section(entry.section)
        return section[entry.offset:entry.offset+entry.size]

    def get_section(self, section):
        """Return the decoded data of section number ``section``, decoding
        and caching it on first use."""
        data = self.section_data[section]
        if not data:
            data = self.get_section_uncached(section)
            self.section_data[section] = data
        return data

    def get_section_uncached(self, section):
        """Decode section number ``section`` by undoing each transform
        listed in its 'Transform/List' (16 bytes per GUID).

        :raises LitError: on short ControlData or an unknown transform.
        """
        name = self.section_names[section]
        path = '::DataSpace/Storage/' + name
        transform = self.get_file(path + '/Transform/List')
        content = self.get_file(path + '/Content')
        control = self.get_file(path + '/ControlData')
        while len(transform) >= 16:
            # Size in bytes of this transform's record in ControlData.
            csize = (int32(control) + 1) * 4
            if csize > len(control) or csize <= 0:
                raise LitError("ControlData is too short")
            guid = msguid(transform)
            if guid == DESENCRYPT_GUID:
                content = self.decrypt(content)
                control = control[csize:]
            elif guid == LZXCOMPRESS_GUID:
                reset_table = self.get_file(
                    '/'.join(('::DataSpace/Storage', name, 'Transform',
                              LZXCOMPRESS_GUID, 'InstanceData/ResetTable')))
                content = self.decompress(content, control, reset_table)
                control = control[csize:]
            else:
                raise LitError("Unrecognized transform: %s." % repr(guid))
            transform = transform[16:]
        return content

    def decrypt(self, content):
        """DES-decrypt ``content`` with ``self.bookkey``.

        Input that is not a multiple of eight bytes is NUL-padded (with a
        warning) before decryption.
        """
        length = len(content)
        extra = length & 0x7
        if extra > 0:
            self.warn("content length not a multiple of block size")
            content += b"\0" * (8 - extra)
        msdes.deskey(self.bookkey, msdes.DE1)
        return msdes.des(content)

    def decompress(self, content, control, reset_table):
        """LZX-decompress ``content``.

        ``control`` supplies the window size and ``reset_table`` the
        uncompressed length and the compressed offsets at which the decoder
        is reset.  Chunks that fail to decompress are skipped with a
        warning.

        :raises LitError: on malformed control data or reset table, or when
            the section could not be decompressed completely.
        """
        if len(control) < 32 or control[CONTROL_TAG:CONTROL_TAG+4] != b"LZXC":
            raise LitError("Invalid ControlData tag value")
        if len(reset_table) < (RESET_INTERVAL + 8):
            raise LitError("Reset table is too short")
        if u32(reset_table[RESET_UCLENGTH + 4:]) != 0:
            raise LitError("Reset table has 64bit value for UCLENGTH")
        result = []
        # The window size exponent is 14 plus the bit length of the stored
        # value.
        window_size = 14
        u = u32(control[CONTROL_WINDOW_SIZE:])
        while u > 0:
            u >>= 1
            window_size += 1
        if window_size < 15 or window_size > 21:
            raise LitError("Invalid window in ControlData")
        lzx.init(window_size)
        ofs_entry = int32(reset_table[RESET_HDRLEN:]) + 8
        uclength = int32(reset_table[RESET_UCLENGTH:])
        accum = int32(reset_table[RESET_INTERVAL:])
        bytes_remaining = uclength
        window_bytes = (1 << window_size)
        base = 0
        while ofs_entry < len(reset_table):
            # Only table entries that land on a window boundary start a new
            # chunk; ``accum`` tracks progress towards the next boundary.
            if accum >= window_bytes:
                accum = 0
                size = int32(reset_table[ofs_entry:])
                u = int32(reset_table[ofs_entry + 4:])
                if u != 0:
                    raise LitError("Reset table entry greater than 32 bits")
                if size >= len(content):
                    self._warn("LZX reset table entry out of bounds")
                if bytes_remaining >= window_bytes:
                    lzx.reset()
                    try:
                        result.append(
                            lzx.decompress(content[base:size], window_bytes))
                    except lzx.LZXError:
                        self.warn("LZX decompression error; skipping chunk")
                    bytes_remaining -= window_bytes
                    base = size
            accum += int32(reset_table[RESET_INTERVAL:])
            ofs_entry += 8
        # Final, partial window.
        if bytes_remaining < window_bytes and bytes_remaining > 0:
            lzx.reset()
            try:
                result.append(lzx.decompress(content[base:], bytes_remaining))
            except lzx.LZXError:
                self.warn("LZX decompression error; skipping chunk")
            bytes_remaining = 0
        if bytes_remaining > 0:
            raise LitError("Failed to completely decompress section")
        return b''.join(result)

    def get_atoms(self, entry):
        """Read the custom tag and attribute names of manifest item
        ``entry``.

        Returns ``(tags, attrs)``, two dicts keyed by 1-based index.  Both
        are empty when the item has no 'atom' file.  A damaged table yields
        the names read so far and a warning.
        """
        name = '/'.join(('/data', entry.internal, 'atom'))
        if name not in self.entries:
            return ({}, {})
        data = self.get_file(name)
        nentries, data = u32(data), data[4:]
        tags = {}
        # Tag names: one length byte followed by the name.
        # NOTE(review): names are stored as bytes, while UnBinary joins tag
        # names with str -- confirm whether they need decoding.
        for i in range(1, nentries + 1):
            if len(data) <= 1:
                break
            size, data = ord(data[0:1]), data[1:]
            if size == 0 or len(data) < size:
                break
            tags[i], data = data[:size], data[size:]
        if len(tags) != nentries:
            self._warn("damaged or invalid atoms tag table")
        if len(data) < 4:
            return (tags, {})
        attrs = {}
        nentries, data = u32(data), data[4:]
        # Attribute names: a 32-bit length followed by the name.
        for i in range(1, nentries + 1):
            if len(data) <= 4:
                break
            size, data = u32(data), data[4:]
            if size == 0 or len(data) < size:
                break
            attrs[i], data = data[:size], data[size:]
        if len(attrs) != nentries:
            self._warn("damaged or invalid atoms attributes table")
        return (tags, attrs)
class LitContainer:
    """Simple Container-interface, read-only accessor for LIT files."""

    def __init__(self, filename_or_stream, log):
        self.log = log
        self._litfile = LitFile(filename_or_stream, log)

    def namelist(self):
        """Return the paths of everything this container can serve."""
        paths = self._litfile.paths
        return paths.keys()

    def exists(self, name):
        """Tell whether ``name`` (URL-unquoted first) is a known path."""
        unquoted = urlunquote(name)
        return unquoted in self._litfile.paths

    def read(self, name):
        """Return the content stored under ``name``.

        An empty ``name``, or the OPF path, yields the generated OPF text.
        Items whose state contains 'spine' (which includes 'not spine') are
        decoded from binary markup to text.  Everything else is returned as
        the raw bytes stored in the file.
        """
        litfile = self._litfile
        entry = None
        if name:
            entry = litfile.paths[urlunquote(name)]
        if entry is None:
            return OPF_DECL + self._read_meta()
        if 'spine' not in entry.state:
            internal = '/'.join(('/data', entry.internal))
            return litfile.get_file(internal)

        internal = '/'.join(('/data', entry.internal, 'content'))
        raw = litfile.get_file(internal)
        manifest = litfile.manifest
        atoms = litfile.get_atoms(entry)
        unbin = UnBinary(raw, name, manifest, HTML_MAP, atoms)
        content = HTML_DECL + unbin.unicode_representation
        # Drop the listed st1:* tags and turn <form> into <div>.
        tags = ('personname', 'place', 'city', 'country-region')
        pat = r'(?i)</{0,1}st1:(%s)>'%('|'.join(tags))
        content = re.sub(pat, '', content)
        content = re.sub(r'<(/{0,1})form>', r'<\1div>', content)
        return content

    def _read_meta(self):
        """Decode '/meta' into OPF text, retrying once with a patched
        buffer when decoding fails and the data contains b'PENGUIN group'."""
        path = 'content.opf'
        litfile = self._litfile
        raw = litfile.get_file('/meta')
        try:
            unbin = UnBinary(raw, path, litfile.manifest, OPF_MAP)
        except LitError:
            if b'PENGUIN group' not in raw:
                raise
            print("WARNING: attempting PENGUIN malformed OPF fix")
            raw = raw.replace(
                b'PENGUIN group', b'\x00\x01\x18\x00PENGUIN group', 1)
            unbin = UnBinary(raw, path, litfile.manifest, OPF_MAP)
        return unbin.unicode_representation

    def get_metadata(self):
        """Return the decoded OPF metadata text."""
        return self._read_meta()
class LitReader(OEBReader):
    """OEB reader that takes its input from a LIT file."""

    Container = LitContainer
    DEFAULT_PROFILE = 'MSReader'

    def _spine_from_opf(self, opf):
        """Re-type spine items declared as 'application/xml' whose parsed
        data has an ``/html`` root to XHTML, then defer to the base class."""
        manifest = self.oeb.manifest
        for itemref in xpath(opf, '/o2:package/o2:spine/o2:itemref'):
            idref = itemref.get('idref')
            if idref in manifest.ids:
                item = manifest.ids[idref]
                # The checks run in this order and stop at the first
                # failure, so item.data is only touched for XML items.
                if item.media_type.lower() != 'application/xml':
                    continue
                if not hasattr(item.data, 'xpath'):
                    continue
                if not item.data.xpath('/html'):
                    continue
                item.media_type = 'application/xhtml+xml'
                item.data = item._parse_xhtml(etree.tostring(item.data))
        super()._spine_from_opf(opf)