#!/usr/bin/env python3

__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

from collections import namedtuple
from struct import pack
from io import BytesIO

from calibre.ebooks.mobi.utils import CNCX, encint, align_block
from calibre.ebooks.mobi.writer8.header import Header

TagMeta_ = namedtuple('TagMeta',
        'name number values_per_entry bitmask end_flag')
TagMeta = lambda x:TagMeta_(*x)
EndTagTable = TagMeta(('eof', 0, 0, 0, 1))

# map of mask to number of shifts needed, works with 1 bit and two-bit wide masks
# could also be extended to 4 bit wide ones as well
mask_to_bit_shifts = {1:0, 2:1, 3:0, 4:2, 8:3, 12:2, 16:4, 32:5, 48:4, 64:6,
        128:7, 192:6}


class IndexHeader(Header):  # {{{

    HEADER_NAME = b'INDX'
    ALIGN_BLOCK = True
    HEADER_LENGTH = 192

    DEFINITION = '''
    # 4 - 8: Header Length
    header_length = {header_length}

    # 8 - 16: Unknown
    unknown1 = zeroes(8)

    # 16 - 20: Index type: 0 - normal 2 - inflection
    type = 2

    # 20 - 24: IDXT offset (filled in later)
    idxt_offset

    # 24 - 28: Number of index records
    num_of_records = DYN

    # 28 - 32: Index encoding (65001 = utf-8)
    encoding = 65001

    # 32 - 36: Unknown
    unknown2 = NULL

    # 36 - 40: Number of Index entries
    num_of_entries = DYN

    # 40 - 44: ORDT offset
    ordt_offset

    # 44 - 48: LIGT offset
    ligt_offset

    # 48 - 52: Number of ORDT/LIGT? entries
    num_of_ordt_entries

    # 52 - 56: Number of CNCX records
    num_of_cncx = DYN

    # 56 - 180: Unknown
    unknown3 = zeroes(124)

    # 180 - 184: TAGX offset
    tagx_offset = {header_length}

    # 184 - 192: Unknown
    unknown4 = zeroes(8)

    # TAGX
    tagx = DYN

    # Geometry of index records
    geometry = DYN

    # IDXT
    idxt = DYN
    '''.format(header_length=HEADER_LENGTH)

    POSITIONS = {'idxt_offset':'idxt'}
# }}}


class Index:  # {{{

    control_byte_count = 1
    cncx = CNCX()
    tag_types = (EndTagTable,)

    HEADER_LENGTH = IndexHeader.HEADER_LENGTH

    @classmethod
    def generate_tagx(cls):
        header = b'TAGX'
        byts = bytearray()
        for tag_meta in cls.tag_types:
            byts.extend(tag_meta[1:])
        # table length, control byte count
        header += pack(b'>II', 12+len(byts), cls.control_byte_count)
        return header + bytes(byts)

    @classmethod
    def calculate_control_bytes_for_each_entry(cls, entries):
        control_bytes = []
        for lead_text, tags in entries:
            cbs = []
            ans = 0
            for (name, number, vpe, mask, endi) in cls.tag_types:
                if endi == 1:
                    cbs.append(ans)
                    ans = 0
                    continue
                try:
                    nvals = len(tags.get(name, ()))
                except TypeError:
                    nvals = 1
                nentries = nvals // vpe
                shifts = mask_to_bit_shifts[mask]
                ans |= mask & (nentries << shifts)
            if len(cbs) != cls.control_byte_count:
                raise ValueError(f'The entry {[lead_text, tags]!r} is invalid')
            control_bytes.append(cbs)
        return control_bytes
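
    # A worked example of the scheme above, with illustrative values (using the
    # ChunkIndex tag table defined further down): an entry carrying one value
    # each for cncx_offset (mask 1), file_number (mask 2) and sequence_number
    # (mask 4), plus a (start_pos, length) pair for geometry (mask 8, two
    # values per entry), produces
    #     1&(1<<0) | 2&(1<<1) | 4&(1<<2) | 8&(1<<3) == 0b00001111
    # i.e. a single control byte of 0x0f.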

    def __call__(self):
        self.control_bytes = self.calculate_control_bytes_for_each_entry(
                self.entries)

        index_blocks, idxt_blocks, record_counts, last_indices = [BytesIO()], [BytesIO()], [0], [b'']
        buf = BytesIO()

        # kindlegen uses 1048 (there has to be some margin because of block
        # alignment)
        RECORD_LIMIT = 0x10000 - self.HEADER_LENGTH - 1048

        for i, (index_num, tags) in enumerate(self.entries):
            control_bytes = self.control_bytes[i]
            buf.seek(0), buf.truncate(0)
            index_num = (index_num.encode('utf-8') if isinstance(index_num, str)
                    else index_num)
            raw = bytearray(index_num)
            raw.insert(0, len(index_num))
            buf.write(bytes(raw))
            buf.write(bytes(bytearray(control_bytes)))
            for tag in self.tag_types:
                values = tags.get(tag.name, None)
                if values is None:
                    continue
                try:
                    len(values)
                except TypeError:
                    values = [values]
                if values:
                    for val in values:
                        try:
                            buf.write(encint(val))
                        except ValueError:
                            raise ValueError('Invalid values for %r: %r' % (
                                tag, values))
            raw = buf.getvalue()
            offset = index_blocks[-1].tell()
            idxt_pos = idxt_blocks[-1].tell()
            if offset + idxt_pos + len(raw) + 2 > RECORD_LIMIT:
                index_blocks.append(BytesIO())
                idxt_blocks.append(BytesIO())
                record_counts.append(0)
                offset = idxt_pos = 0
                last_indices.append(b'')
            record_counts[-1] += 1
            idxt_blocks[-1].write(pack(b'>H', self.HEADER_LENGTH+offset))
            index_blocks[-1].write(raw)
            last_indices[-1] = index_num

        index_records = []
        for index_block, idxt_block, record_count in zip(index_blocks,
                idxt_blocks, record_counts):
            index_block = align_block(index_block.getvalue())
            idxt_block = align_block(b'IDXT' + idxt_block.getvalue())

            # Create header for this index record
            header = b'INDX'
            buf.seek(0), buf.truncate(0)
            buf.write(pack(b'>I', self.HEADER_LENGTH))
            buf.write(b'\0'*4)  # Unknown
            buf.write(pack(b'>I', 1))  # Header type (0 for Index header record and 1 for Index records)
            buf.write(b'\0'*4)  # Unknown
            # IDXT block offset
            buf.write(pack(b'>I', self.HEADER_LENGTH + len(index_block)))
            # Number of index entries in this record
            buf.write(pack(b'>I', record_count))
            buf.write(b'\xff'*8)  # Unknown
            buf.write(b'\0'*156)  # Unknown
            header += buf.getvalue()

            index_records.append(header + index_block + idxt_block)
            if len(index_records[-1]) > 0x10000:
                raise ValueError(
                    'Failed to rollover index blocks for very large index.')

        # Create the Index Header record
        tagx = self.generate_tagx()

        # Geometry of the index records is written as index entries pointed to
        # by the IDXT records
        buf.seek(0), buf.truncate()
        idxt = [b'IDXT']
        pos = IndexHeader.HEADER_LENGTH + len(tagx)
        for last_idx, num in zip(last_indices, record_counts):
            start = buf.tell()
            idxt.append(pack(b'>H', pos))
            buf.write(bytes(bytearray([len(last_idx)])) + last_idx)
            buf.write(pack(b'>H', num))
            pos += buf.tell() - start

        header = {
                'num_of_entries': sum(r for r in record_counts),
                'num_of_records': len(index_records),
                'num_of_cncx': len(self.cncx),
                'tagx':align_block(tagx),
                'geometry':align_block(buf.getvalue()),
                'idxt':align_block(b''.join(idxt)),
        }
        header = IndexHeader()(**header)
        self.records = [header] + index_records
        self.records.extend(self.cncx.records)
        return self.records
# }}}


class SkelIndex(Index):

    tag_types = tuple(map(TagMeta, (
        ('chunk_count', 1, 1, 3, 0),
        ('geometry',    6, 2, 12, 0),
        EndTagTable
    )))

    def __init__(self, skel_table):
        self.entries = [
                (s.name, {
                    # Don't ask me why these entries have to be repeated twice
                    'chunk_count':(s.chunk_count, s.chunk_count),
                    'geometry':(s.start_pos, s.length, s.start_pos, s.length),
                }) for s in skel_table
        ]


class ChunkIndex(Index):

    tag_types = tuple(map(TagMeta, (
        ('cncx_offset',     2, 1, 1, 0),
        ('file_number',     3, 1, 2, 0),
        ('sequence_number', 4, 1, 4, 0),
        ('geometry',        6, 2, 8, 0),
        EndTagTable
    )))

    def __init__(self, chunk_table):
        self.cncx = CNCX(c.selector for c in chunk_table)

        self.entries = [
                ('%010d'%c.insert_pos, {
                    'cncx_offset':self.cncx[c.selector],
                    'file_number':c.file_number,
                    'sequence_number':c.sequence_number,
                    'geometry':(c.start_pos, c.length),
                }) for c in chunk_table
        ]


class GuideIndex(Index):

    tag_types = tuple(map(TagMeta, (
        ('title',   1, 1, 1, 0),
        ('pos_fid', 6, 2, 2, 0),
        EndTagTable
    )))

    def __init__(self, guide_table):
        self.cncx = CNCX(c.title for c in guide_table)

        self.entries = [
                (r.type, {
                    'title':self.cncx[r.title],
                    'pos_fid':r.pos_fid,
                }) for r in guide_table
        ]
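
# A minimal usage sketch, with hypothetical data (this is not how calibre
# builds the real skeleton/chunk tables); any objects exposing the attributes
# read in the constructors above will do:
#
#   from collections import namedtuple
#   Chunk = namedtuple('Chunk',
#       'selector insert_pos file_number sequence_number start_pos length')
#   chunks = [
#       Chunk('chunk-0', 0, 0, 0, 0, 4096),
#       Chunk('chunk-1', 4096, 0, 1, 4096, 2048),
#   ]
#   records = ChunkIndex(chunks)()  # INDX header record, index record(s),
#                                   # followed by the CNCX records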


class NCXIndex(Index):
    '''
    The commented out parts have been seen in NCX indexes from MOBI 6
    periodicals. Since we have no MOBI 8 periodicals to reverse engineer, leave
    it for now.
    '''
    # control_byte_count = 2
    tag_types = tuple(map(TagMeta, (
        ('offset', 1, 1, 1, 0),
        ('length', 2, 1, 2, 0),
        ('label', 3, 1, 4, 0),
        ('depth', 4, 1, 8, 0),
        ('parent', 21, 1, 16, 0),
        ('first_child', 22, 1, 32, 0),
        ('last_child', 23, 1, 64, 0),
        ('pos_fid', 6, 2, 128, 0),
        EndTagTable,
        # ('image', 69, 1, 1, 0),
        # ('description', 70, 1, 2, 0),
        # ('author', 71, 1, 4, 0),
        # ('caption', 72, 1, 8, 0),
        # ('attribution', 73, 1, 16, 0),
        # EndTagTable
    )))

    def __init__(self, toc_table):
        strings = []
        for entry in toc_table:
            strings.append(entry['label'])
            aut = entry.get('author', None)
            if aut:
                strings.append(aut)
            desc = entry.get('description', None)
            if desc:
                strings.append(desc)
            kind = entry.get('kind', None)
            if kind:
                strings.append(kind)
        self.cncx = CNCX(strings)

        try:
            largest = max(x['index'] for x in toc_table)
        except ValueError:
            largest = 0
        fmt = '%0{}X'.format(max(2, len('%X' % largest)))

        def to_entry(x):
            ans = {}
            for f in ('offset', 'length', 'depth', 'pos_fid', 'parent',
                    'first_child', 'last_child'):
                if f in x:
                    ans[f] = x[f]
            for f in ('label', 'description', 'author', 'kind'):
                if f in x:
                    ans[f] = self.cncx[x[f]]
            return (fmt % x['index'], ans)

        self.entries = list(map(to_entry, toc_table))


class NonLinearNCXIndex(NCXIndex):
    control_byte_count = 2
    tag_types = tuple(map(TagMeta, (
        ('offset', 1, 1, 1, 0),
        ('length', 2, 1, 2, 0),
        ('label', 3, 1, 4, 0),
        ('depth', 4, 1, 8, 0),
        ('kind', 5, 1, 16, 0),
        ('parent', 21, 1, 32, 0),
        ('first_child', 22, 1, 64, 0),
        ('last_child', 23, 1, 128, 0),
        EndTagTable,
        ('pos_fid', 6, 2, 1, 0),
        EndTagTable
    )))


if __name__ == '__main__':
    # Generate a document with a large number of index entries using both
    # calibre and kindlegen and compare the output
    import os, subprocess
    os.chdir('/t')
    paras = ['<p>%d</p>' % i for i in range(4000)]
    raw = '<html><body>' + '\n\n'.join(paras) + '</body></html>'
    src = 'index.html'
    with open(src, 'wb') as f:
        f.write(raw.encode('utf-8'))

    subprocess.check_call(['ebook-convert', src, '.epub', '--level1-toc',
        '//h:p', '--no-default-epub-cover', '--flow-size', '1000000'])
    subprocess.check_call(['ebook-convert', src, '.azw3', '--level1-toc',
        '//h:p', '--no-inline-toc', '--extract-to=x'])
    # kindlegen's exit code is not 0 as we don't have a cover
    subprocess.call(['kindlegen', 'index.epub'])

    subprocess.check_call(['calibre-debug', 'index.mobi'])

    from calibre.gui2.tweak_book.diff.main import main
    main(['cdiff', 'decompiled_index/mobi8/ncx.record', 'x/ncx.record'])
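
# For reference, a minimal sketch of the toc_table entries consumed by NCXIndex
# above (field names as read in NCXIndex.__init__; the values are made up):
#
#   toc_table = [
#       {'index': 0, 'label': 'Chapter One', 'depth': 0, 'offset': 0,
#        'length': 1000, 'pos_fid': (0, 0)},
#       {'index': 1, 'label': 'Section 1.1', 'depth': 1, 'parent': 0,
#        'offset': 1000, 'length': 500, 'pos_fid': (0, 16)},
#   ]
#   records = NCXIndex(toc_table)()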