#!/usr/bin/env python3
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
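# Dump the contents of the KF8 (MOBI 8) portion of a MOBI/AZW3 file for
# debugging: text records, resources, index records, reconstructed part
# files and the per-record trailing byte sequences (TBS).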
import sys, os, struct, textwrap
from calibre import CurrentDir
from calibre.ebooks.mobi.debug.containers import ContainerHeader
from calibre.ebooks.mobi.debug.headers import TextRecord
from calibre.ebooks.mobi.debug.index import (SKELIndex, SECTIndex, NCXIndex,
GuideIndex)
from calibre.ebooks.mobi.utils import read_font_record, decode_tbs, RECORD_SIZE
from calibre.ebooks.mobi.debug import format_bytes
from calibre.ebooks.mobi.reader.headers import NULL_INDEX
from calibre.utils.imghdr import what
from polyglot.builtins import iteritems, itervalues, print_to_binary_file
class FDST:
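    # Flow division table: pairs of (start, end) byte offsets that divide
    # the raw KF8 text into flows. Flow 0 is the main markup; later flows
    # hold inline resources such as stylesheets and SVG.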
def __init__(self, raw):
if raw[:4] != b'FDST':
raise ValueError('KF8 does not have a valid FDST record')
self.sec_off, self.num_sections = struct.unpack_from(b'>LL', raw, 4)
if self.sec_off != 12:
raise ValueError('FDST record has unknown extra fields')
secf = b'>%dL' % (self.num_sections*2)
secs = struct.unpack_from(secf, raw, self.sec_off)
rest = raw[self.sec_off+struct.calcsize(secf):]
if rest:
raise ValueError('FDST record has trailing data: '
'%s'%format_bytes(rest))
self.sections = tuple(zip(secs[::2], secs[1::2]))
def __str__(self):
ans = ['FDST record']
        a = lambda k, v: ans.append('%s: %s'%(k, v))
a('Offset to sections', self.sec_off)
a('Number of section records', self.num_sections)
ans.append('**** %d Sections ****'% len(self.sections))
for sec in self.sections:
ans.append('Start: %20d End: %d'%sec)
return '\n'.join(ans)
class File:
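    # A single part (HTML file) reconstructed from a skeleton and the
    # chunks that the chunk index says belong inside it.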
def __init__(self, skel, skeleton, text, first_aid, sections):
self.name = 'part%04d'%skel.file_number
self.skeleton, self.text, self.first_aid = skeleton, text, first_aid
self.sections = sections
def dump(self, ddir):
with open(os.path.join(ddir, self.name + '.html'), 'wb') as f:
f.write(self.text)
base = os.path.join(ddir, self.name + '-parts')
os.mkdir(base)
with CurrentDir(base):
with open('skeleton.html', 'wb') as f:
f.write(self.skeleton)
for i, text in enumerate(self.sections):
with open('sect-%04d.html'%i, 'wb') as f:
f.write(text)
class MOBIFile:
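    # Wraps the low level reader, parsing out the KF8 pieces: text records,
    # resources, the FDST record, the various indices, the reconstructed
    # part files and the TBS indexing data.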
def __init__(self, mf):
self.mf = mf
h, h8 = mf.mobi_header, mf.mobi8_header
first_text_record = 1
offset = 0
self.resource_ranges = [(h8.first_resource_record, h8.last_resource_record, h8.first_image_index)]
if mf.kf8_type == 'joint':
offset = h.exth.kf8_header_index
self.resource_ranges.insert(0, (h.first_resource_record, h.last_resource_record, h.first_image_index))
self.text_records = [TextRecord(i, r, h8.extra_data_flags,
mf.decompress8) for i, r in
enumerate(mf.records[first_text_record+offset:
first_text_record+offset+h8.number_of_text_records])]
self.raw_text = b''.join(r.raw for r in self.text_records)
self.header = self.mf.mobi8_header
self.extract_resources(mf.records)
self.read_fdst()
self.read_indices()
self.build_files()
self.read_tbs()
def print_header(self, f=sys.stdout):
p = print_to_binary_file(f)
p(str(self.mf.palmdb))
p()
p('Record headers:')
for i, r in enumerate(self.mf.records):
p('%6d. %s'%(i, r.header))
p()
p(str(self.mf.mobi8_header))
def read_fdst(self):
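        # An fdst_idx of NULL_INDEX means there is no FDST record, i.e. the
        # text is treated as a single flow.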
self.fdst = None
if self.header.fdst_idx != NULL_INDEX:
idx = self.header.fdst_idx
self.fdst = FDST(self.mf.records[idx].raw)
if self.fdst.num_sections != self.header.fdst_count:
raise ValueError('KF8 Header contains invalid FDST count')
def read_indices(self):
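        # The four KF8 index records: skeleton positions (SKEL), chunk
        # positions and insertion points (SECT/chunk), the NCX table of
        # contents and the guide/landmarks index.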
self.skel_index = SKELIndex(self.header.skel_idx, self.mf.records,
self.header.encoding)
self.sect_index = SECTIndex(self.header.sect_idx, self.mf.records,
self.header.encoding)
self.ncx_index = NCXIndex(self.header.primary_index_record,
self.mf.records, self.header.encoding)
self.guide_index = GuideIndex(self.header.oth_idx, self.mf.records,
self.header.encoding)
def build_files(self):
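        # Reassemble each part file: the skeleton record holds the static
        # markup, and every chunk belonging to it is spliced back in at its
        # insert_pos (stored relative to the start of the whole text, so it
        # is rebased against the skeleton's start_position below).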
text = self.raw_text
self.files = []
for skel in self.skel_index.records:
sects = [x for x in self.sect_index.records if x.file_number == skel.file_number]
skeleton = text[skel.start_position:skel.start_position+skel.length]
ftext = skeleton
first_aid = sects[0].toc_text
sections = []
for sect in sects:
start_pos = skel.start_position + skel.length + sect.start_pos
sect_text = text[start_pos:start_pos+sect.length]
insert_pos = sect.insert_pos - skel.start_position
ftext = ftext[:insert_pos] + sect_text + ftext[insert_pos:]
sections.append(sect_text)
self.files.append(File(skel, skeleton, ftext, first_aid, sections))
def dump_flows(self, ddir):
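        # Write each flow delimited by the FDST record (or the entire text
        # if there is no FDST record) to its own file.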
boundaries = [(0, len(self.raw_text))]
if self.fdst is not None:
boundaries = self.fdst.sections
for i, x in enumerate(boundaries):
start, end = x
raw = self.raw_text[start:end]
with open(os.path.join(ddir, 'flow%04d.txt'%i), 'wb') as f:
f.write(raw)
def extract_resources(self, records):
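        # Classify every resource record by its four byte signature: FONT
        # records are de-obfuscated with read_font_record(), images are
        # sniffed with imghdr's what(), CONT/CRES records hold HD
        # (container) resources, and everything else is dumped as binary.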
self.resource_map = []
self.containers = []
known_types = {b'FLIS', b'FCIS', b'SRCS',
b'\xe9\x8e\r\n', b'RESC', b'BOUN', b'FDST', b'DATP',
b'AUDI', b'VIDE', b'CRES', b'CONT', b'CMET', b'PAGE'}
container = None
for i, rec in enumerate(records):
for (l, r, offset) in self.resource_ranges:
if l <= i <= r:
resource_index = i + 1
if offset is not None and resource_index >= offset:
resource_index -= offset
break
else:
continue
sig = rec.raw[:4]
payload = rec.raw
ext = 'dat'
prefix = 'binary'
suffix = ''
if sig in {b'HUFF', b'CDIC', b'INDX'}:
continue
# TODO: Ignore CNCX records as well
if sig == b'FONT':
font = read_font_record(rec.raw)
if font['err']:
raise ValueError('Failed to read font record: %s Headers: %s'%(
font['err'], font['headers']))
payload = (font['font_data'] if font['font_data'] else
font['raw_data'])
prefix, ext = 'fonts', font['ext']
elif sig == b'CONT':
if payload == b'CONTBOUNDARY':
self.containers.append(container)
container = None
continue
container = ContainerHeader(payload)
elif sig == b'CRES':
container.resources.append(payload)
if container.is_image_container:
payload = payload[12:]
q = what(None, payload)
if q:
prefix, ext = 'hd-images', q
resource_index = len(container.resources)
elif sig == b'\xa0\xa0\xa0\xa0' and len(payload) == 4:
if container is None:
print('Found an end of container record with no container, ignoring')
else:
container.resources.append(None)
continue
elif sig not in known_types:
if container is not None and len(container.resources) == container.num_of_resource_records:
container.add_hrefs(payload)
continue
q = what(None, rec.raw)
if q:
prefix, ext = 'images', q
if prefix == 'binary':
if sig == b'\xe9\x8e\r\n':
suffix = '-EOF'
elif sig in known_types:
suffix = '-' + sig.decode('ascii')
self.resource_map.append(('%s/%06d%s.%s'%(prefix, resource_index, suffix, ext),
payload))
def read_tbs(self):
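        # Decode the TBS (trailing byte sequences) appended to each text
        # record, which tell the renderer which index entries are active in
        # that record, then re-encode them from the parsed NCX and warn if
        # the two disagree.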
from calibre.ebooks.mobi.writer8.tbs import (Entry, DOC,
collect_indexing_data, encode_strands_as_sequences,
sequences_to_bytes, calculate_all_tbs, NegativeStrandIndex)
entry_map = []
for index in self.ncx_index:
vals = list(index)[:-1] + [None, None, None, None]
entry_map.append(Entry(*(vals[:12])))
indexing_data = collect_indexing_data(entry_map, list(map(len,
self.text_records)))
        self.indexing_data = [DOC + '\n' + textwrap.dedent('''\
            Index Entry lines are of the form:
            depth:index_number [action] parent (index_num-parent) Geometry
            Where Geometry is the start and end of the index entry w.r.t.
            the start of the text record.
            ''')]
tbs_type = 8
try:
calculate_all_tbs(indexing_data)
except NegativeStrandIndex:
calculate_all_tbs(indexing_data, tbs_type=5)
tbs_type = 5
for i, strands in enumerate(indexing_data):
rec = self.text_records[i]
tbs_bytes = rec.trailing_data.get('indexing', b'')
desc = ['Record #%d'%i]
for s, strand in enumerate(strands):
desc.append('Strand %d'%s)
for entries in itervalues(strand):
for e in entries:
desc.append(
' %s%d [%-9s] parent: %s (%d) Geometry: (%d, %d)'%(
e.depth * (' ') + '- ', e.index, e.action, e.parent,
e.index-(e.parent or 0), e.start-i*RECORD_SIZE,
e.start+e.length-i*RECORD_SIZE))
desc.append('TBS Bytes: ' + format_bytes(tbs_bytes))
flag_sz = 3
sequences = []
otbs = tbs_bytes
while tbs_bytes:
try:
val, extra, consumed = decode_tbs(tbs_bytes, flag_size=flag_sz)
                except Exception:  # no more decodable TBS bytes
break
flag_sz = 4
tbs_bytes = tbs_bytes[consumed:]
extra = {bin(k):v for k, v in iteritems(extra)}
sequences.append((val, extra))
for j, seq in enumerate(sequences):
desc.append('Sequence #%d: %r %r'%(j, seq[0], seq[1]))
if tbs_bytes:
desc.append('Remaining bytes: %s'%format_bytes(tbs_bytes))
calculated_sequences = encode_strands_as_sequences(strands,
tbs_type=tbs_type)
try:
calculated_bytes = sequences_to_bytes(calculated_sequences)
            except Exception:
calculated_bytes = b'failed to calculate tbs bytes'
if calculated_bytes != otbs:
print('WARNING: TBS mismatch for record %d'%i)
desc.append('WARNING: TBS mismatch!')
desc.append('Calculated sequences: %r'%calculated_sequences)
desc.append('')
self.indexing_data.append('\n'.join(desc))
def inspect_mobi(mobi_file, ddir):
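    # Dump every parsed piece of the KF8 file into ddir. Typically reached
    # via calibre's debug tooling (e.g. calibre-debug --inspect-mobi).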
f = MOBIFile(mobi_file)
with open(os.path.join(ddir, 'header.txt'), 'wb') as out:
f.print_header(f=out)
alltext = os.path.join(ddir, 'raw_text.html')
with open(alltext, 'wb') as of:
of.write(f.raw_text)
for x in ('text_records', 'images', 'fonts', 'binary', 'files', 'flows', 'hd-images',):
os.mkdir(os.path.join(ddir, x))
for rec in f.text_records:
rec.dump(os.path.join(ddir, 'text_records'))
for href, payload in f.resource_map:
with open(os.path.join(ddir, href), 'wb') as fo:
fo.write(payload)
for i, container in enumerate(f.containers):
with open(os.path.join(ddir, 'container%d.txt' % (i + 1)), 'wb') as cf:
cf.write(str(container).encode('utf-8'))
if f.fdst:
with open(os.path.join(ddir, 'fdst.record'), 'wb') as fo:
fo.write(str(f.fdst).encode('utf-8'))
with open(os.path.join(ddir, 'skel.record'), 'wb') as fo:
fo.write(str(f.skel_index).encode('utf-8'))
with open(os.path.join(ddir, 'chunks.record'), 'wb') as fo:
fo.write(str(f.sect_index).encode('utf-8'))
with open(os.path.join(ddir, 'ncx.record'), 'wb') as fo:
fo.write(str(f.ncx_index).encode('utf-8'))
with open(os.path.join(ddir, 'guide.record'), 'wb') as fo:
fo.write(str(f.guide_index).encode('utf-8'))
with open(os.path.join(ddir, 'tbs.txt'), 'wb') as fo:
fo.write(('\n'.join(f.indexing_data)).encode('utf-8'))
for part in f.files:
part.dump(os.path.join(ddir, 'files'))
f.dump_flows(os.path.join(ddir, 'flows'))