#!/usr/bin/env python3

__license__   = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

import sys, os, struct, textwrap

from calibre import CurrentDir
from calibre.ebooks.mobi.debug.containers import ContainerHeader
from calibre.ebooks.mobi.debug.headers import TextRecord
from calibre.ebooks.mobi.debug.index import (SKELIndex, SECTIndex, NCXIndex,
        GuideIndex)
from calibre.ebooks.mobi.utils import read_font_record, decode_tbs, RECORD_SIZE
from calibre.ebooks.mobi.debug import format_bytes
from calibre.ebooks.mobi.reader.headers import NULL_INDEX
from calibre.utils.imghdr import what
from polyglot.builtins import iteritems, itervalues, print_to_binary_file


class FDST:

    def __init__(self, raw):
        if raw[:4] != b'FDST':
            raise ValueError('KF8 does not have a valid FDST record')
        self.sec_off, self.num_sections = struct.unpack_from(b'>LL', raw, 4)
        if self.sec_off != 12:
            raise ValueError('FDST record has unknown extra fields')
        secf = b'>%dL' % (self.num_sections*2)
        secs = struct.unpack_from(secf, raw, self.sec_off)
        rest = raw[self.sec_off+struct.calcsize(secf):]
        if rest:
            raise ValueError('FDST record has trailing data: '
                    '%s'%format_bytes(rest))
        self.sections = tuple(zip(secs[::2], secs[1::2]))

    def __str__(self):
        ans = ['FDST record']
        a = lambda k, v: ans.append('%s: %s'%(k, v))
        a('Offset to sections', self.sec_off)
        a('Number of section records', self.num_sections)
        ans.append('**** %d Sections ****'% len(self.sections))
        for sec in self.sections:
            ans.append('Start: %20d End: %d'%sec)
        return '\n'.join(ans)


class File:

    def __init__(self, skel, skeleton, text, first_aid, sections):
        self.name = 'part%04d'%skel.file_number
        self.skeleton, self.text, self.first_aid = skeleton, text, first_aid
        self.sections = sections

    def dump(self, ddir):
        with open(os.path.join(ddir, self.name + '.html'), 'wb') as f:
            f.write(self.text)
        base = os.path.join(ddir, self.name + '-parts')
        os.mkdir(base)
        with CurrentDir(base):
            with open('skeleton.html', 'wb') as f:
                f.write(self.skeleton)
            for i, text in enumerate(self.sections):
                with open('sect-%04d.html'%i, 'wb') as f:
                    f.write(text)
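
# A minimal, self-contained sketch of the record layout FDST expects:
# the 4-byte b'FDST' magic, the offset to the section table (always 12),
# the section count, then (start, end) pairs of 32-bit big-endian offsets.
# The two flow boundaries below are invented, not data from any real book.
def _example_fdst():
    raw = (b'FDST' + struct.pack(b'>LL', 12, 2) +
           struct.pack(b'>4L', 0, 4096, 4096, 8192))
    fdst = FDST(raw)
    assert fdst.sections == ((0, 4096), (4096, 8192))
    print(fdst)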

class MOBIFile:

    def __init__(self, mf):
        self.mf = mf
        h, h8 = mf.mobi_header, mf.mobi8_header
        first_text_record = 1
        offset = 0
        self.resource_ranges = [(h8.first_resource_record,
            h8.last_resource_record, h8.first_image_index)]
        if mf.kf8_type == 'joint':
            offset = h.exth.kf8_header_index
            self.resource_ranges.insert(0, (h.first_resource_record,
                h.last_resource_record, h.first_image_index))

        self.text_records = [TextRecord(i, r, h8.extra_data_flags,
            mf.decompress8) for i, r in
            enumerate(mf.records[first_text_record+offset:
                first_text_record+offset+h8.number_of_text_records])]
        self.raw_text = b''.join(r.raw for r in self.text_records)
        self.header = self.mf.mobi8_header
        self.extract_resources(mf.records)
        self.read_fdst()
        self.read_indices()
        self.build_files()
        self.read_tbs()

    def print_header(self, f=sys.stdout):
        p = print_to_binary_file(f)
        p(str(self.mf.palmdb))
        p()
        p('Record headers:')
        for i, r in enumerate(self.mf.records):
            p('%6d. %s'%(i, r.header))

        p()
        p(str(self.mf.mobi8_header))

    def read_fdst(self):
        self.fdst = None
        if self.header.fdst_idx != NULL_INDEX:
            idx = self.header.fdst_idx
            self.fdst = FDST(self.mf.records[idx].raw)
            if self.fdst.num_sections != self.header.fdst_count:
                raise ValueError('KF8 Header contains invalid FDST count')

    def read_indices(self):
        self.skel_index = SKELIndex(self.header.skel_idx, self.mf.records,
                self.header.encoding)
        self.sect_index = SECTIndex(self.header.sect_idx, self.mf.records,
                self.header.encoding)
        self.ncx_index = NCXIndex(self.header.primary_index_record,
                self.mf.records, self.header.encoding)
        self.guide_index = GuideIndex(self.header.oth_idx, self.mf.records,
                self.header.encoding)

    def build_files(self):
        text = self.raw_text
        self.files = []
        for skel in self.skel_index.records:
            sects = [x for x in self.sect_index.records
                    if x.file_number == skel.file_number]
            skeleton = text[skel.start_position:skel.start_position+skel.length]
            ftext = skeleton
            first_aid = sects[0].toc_text
            sections = []

            for sect in sects:
                start_pos = skel.start_position + skel.length + sect.start_pos
                sect_text = text[start_pos:start_pos+sect.length]
                insert_pos = sect.insert_pos - skel.start_position
                ftext = ftext[:insert_pos] + sect_text + ftext[insert_pos:]
                sections.append(sect_text)

            self.files.append(File(skel, skeleton, ftext, first_aid,
                sections))

    def dump_flows(self, ddir):
        boundaries = [(0, len(self.raw_text))]
        if self.fdst is not None:
            boundaries = self.fdst.sections
        for i, x in enumerate(boundaries):
            start, end = x
            raw = self.raw_text[start:end]
            with open(os.path.join(ddir, 'flow%04d.txt'%i), 'wb') as f:
                f.write(raw)

    def extract_resources(self, records):
        self.resource_map = []
        self.containers = []
        known_types = {b'FLIS', b'FCIS', b'SRCS', b'\xe9\x8e\r\n',
                b'RESC', b'BOUN', b'FDST', b'DATP', b'AUDI', b'VIDE',
                b'CRES', b'CONT', b'CMET', b'PAGE'}
        container = None

        for i, rec in enumerate(records):
            for (l, r, offset) in self.resource_ranges:
                if l <= i <= r:
                    resource_index = i + 1
                    if offset is not None and resource_index >= offset:
                        resource_index -= offset
                    break
            else:
                continue
            sig = rec.raw[:4]
            payload = rec.raw
            ext = 'dat'
            prefix = 'binary'
            suffix = ''
            if sig in {b'HUFF', b'CDIC', b'INDX'}:
                continue  # TODO: Ignore CNCX records as well
            if sig == b'FONT':
                font = read_font_record(rec.raw)
                if font['err']:
                    raise ValueError('Failed to read font record: %s Headers: %s'%(
                        font['err'], font['headers']))
                payload = (font['font_data'] if font['font_data'] else
                        font['raw_data'])
                prefix, ext = 'fonts', font['ext']
            elif sig == b'CONT':
                if payload == b'CONTBOUNDARY':
                    self.containers.append(container)
                    container = None
                    continue
                container = ContainerHeader(payload)
            elif sig == b'CRES':
                container.resources.append(payload)
                if container.is_image_container:
                    payload = payload[12:]
                    q = what(None, payload)
                    if q:
                        prefix, ext = 'hd-images', q
                        resource_index = len(container.resources)
            elif sig == b'\xa0\xa0\xa0\xa0' and len(payload) == 4:
                if container is None:
                    print('Found an end of container record with no container, ignoring')
                else:
                    container.resources.append(None)
                continue
            elif sig not in known_types:
                if (container is not None and len(container.resources) ==
                        container.num_of_resource_records):
                    container.add_hrefs(payload)
                    continue
                q = what(None, rec.raw)
                if q:
                    prefix, ext = 'images', q

            if prefix == 'binary':
                if sig == b'\xe9\x8e\r\n':
                    suffix = '-EOF'
                elif sig in known_types:
                    suffix = '-' + sig.decode('ascii')

            self.resource_map.append(('%s/%06d%s.%s'%(prefix, resource_index,
                suffix, ext), payload))

    def read_tbs(self):
        from calibre.ebooks.mobi.writer8.tbs import (Entry, DOC,
                collect_indexing_data, encode_strands_as_sequences,
                sequences_to_bytes, calculate_all_tbs, NegativeStrandIndex)
        entry_map = []
        for index in self.ncx_index:
            vals = list(index)[:-1] + [None, None, None, None]
            entry_map.append(Entry(*(vals[:12])))

        indexing_data = collect_indexing_data(entry_map,
                list(map(len, self.text_records)))
        self.indexing_data = [DOC + '\n' + textwrap.dedent('''\
        Index Entry lines are of the form:
        depth:index_number [action] parent (index_num-parent) Geometry

        Where Geometry is the start and end of the index entry w.r.t the
        start of the text record.

        ''')]

        tbs_type = 8
        try:
            calculate_all_tbs(indexing_data)
        except NegativeStrandIndex:
            calculate_all_tbs(indexing_data, tbs_type=5)
            tbs_type = 5

        for i, strands in enumerate(indexing_data):
            rec = self.text_records[i]
            tbs_bytes = rec.trailing_data.get('indexing', b'')
            desc = ['Record #%d'%i]
            for s, strand in enumerate(strands):
                desc.append('Strand %d'%s)
                for entries in itervalues(strand):
                    for e in entries:
                        desc.append(
                            ' %s%d [%-9s] parent: %s (%d) Geometry: (%d, %d)'%(
                                e.depth * (' ') + '- ', e.index, e.action,
                                e.parent, e.index-(e.parent or 0),
                                e.start-i*RECORD_SIZE,
                                e.start+e.length-i*RECORD_SIZE))
            desc.append('TBS Bytes: ' + format_bytes(tbs_bytes))
            flag_sz = 3
            sequences = []
            otbs = tbs_bytes
            while tbs_bytes:
                try:
                    val, extra, consumed = decode_tbs(tbs_bytes,
                            flag_size=flag_sz)
                except:
                    break
                flag_sz = 4
                tbs_bytes = tbs_bytes[consumed:]
                extra = {bin(k):v for k, v in iteritems(extra)}
                sequences.append((val, extra))
            for j, seq in enumerate(sequences):
                desc.append('Sequence #%d: %r %r'%(j, seq[0], seq[1]))
            if tbs_bytes:
                desc.append('Remaining bytes: %s'%format_bytes(tbs_bytes))
            calculated_sequences = encode_strands_as_sequences(strands,
                    tbs_type=tbs_type)
            try:
                calculated_bytes = sequences_to_bytes(calculated_sequences)
            except:
                calculated_bytes = b'failed to calculate tbs bytes'
            if calculated_bytes != otbs:
                print('WARNING: TBS mismatch for record %d'%i)
                desc.append('WARNING: TBS mismatch!')
                desc.append('Calculated sequences: %r'%calculated_sequences)
            desc.append('')
            self.indexing_data.append('\n'.join(desc))
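
# A minimal sketch of driving MOBIFile by hand. It assumes the reader
# wrapper of the same name in calibre.ebooks.mobi.debug.headers, which is
# what the debug entry point normally constructs and passes in here.
def _example_mobifile(path):
    from calibre.ebooks.mobi.debug.headers import MOBIFile as HeaderFile
    with open(path, 'rb') as stream:
        mf = HeaderFile(stream)  # parses the PalmDB and MOBI/KF8 headers
    kf8 = MOBIFile(mf)  # only meaningful when mf.kf8_type is not None
    kf8.print_header()
    return kf8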

def inspect_mobi(mobi_file, ddir):
    f = MOBIFile(mobi_file)
    with open(os.path.join(ddir, 'header.txt'), 'wb') as out:
        f.print_header(f=out)

    alltext = os.path.join(ddir, 'raw_text.html')
    with open(alltext, 'wb') as of:
        of.write(f.raw_text)

    for x in ('text_records', 'images', 'fonts', 'binary', 'files', 'flows',
            'hd-images',):
        os.mkdir(os.path.join(ddir, x))

    for rec in f.text_records:
        rec.dump(os.path.join(ddir, 'text_records'))

    for href, payload in f.resource_map:
        with open(os.path.join(ddir, href), 'wb') as fo:
            fo.write(payload)

    for i, container in enumerate(f.containers):
        with open(os.path.join(ddir, 'container%d.txt' % (i + 1)), 'wb') as cf:
            cf.write(str(container).encode('utf-8'))

    if f.fdst:
        with open(os.path.join(ddir, 'fdst.record'), 'wb') as fo:
            fo.write(str(f.fdst).encode('utf-8'))
    with open(os.path.join(ddir, 'skel.record'), 'wb') as fo:
        fo.write(str(f.skel_index).encode('utf-8'))
    with open(os.path.join(ddir, 'chunks.record'), 'wb') as fo:
        fo.write(str(f.sect_index).encode('utf-8'))
    with open(os.path.join(ddir, 'ncx.record'), 'wb') as fo:
        fo.write(str(f.ncx_index).encode('utf-8'))
    with open(os.path.join(ddir, 'guide.record'), 'wb') as fo:
        fo.write(str(f.guide_index).encode('utf-8'))
    with open(os.path.join(ddir, 'tbs.txt'), 'wb') as fo:
        fo.write(('\n'.join(f.indexing_data)).encode('utf-8'))
    for part in f.files:
        part.dump(os.path.join(ddir, 'files'))
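    # Flow 0 is the book's main markup; any further flows are the pieces
    # KF8 stores out-of-line (inline CSS, SVG), split at the FDST section
    # boundaries dumped below.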
    f.dump_flows(os.path.join(ddir, 'flows'))
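
# This module is normally reached from the command line, e.g.:
#   calibre-debug --inspect-mobi book.azw3
# which, for KF8 books, ends up in inspect_mobi() above.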