#!/usr/bin/env python3

__license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

import io
import numbers
from collections import OrderedDict, defaultdict
from struct import pack

from calibre.ebooks.mobi.utils import (encint, encode_number_as_hex,
        encode_tbs, align_block, RECORD_SIZE, CNCX as CNCX_)
from polyglot.builtins import iteritems, itervalues


class CNCX(CNCX_):  # {{{

    def __init__(self, toc, is_periodical):
        strings = []
        for item in toc.iterdescendants(breadth_first=True):
            strings.append(item.title)
            if is_periodical:
                strings.append(item.klass)
                if item.author:
                    strings.append(item.author)
                if item.description:
                    strings.append(item.description)
        CNCX_.__init__(self, strings)
# }}}


class TAGX:  # {{{

    BITMASKS = {11:0b1}
    BITMASKS.update({x:(1 << i) for i, x in enumerate([1, 2, 3, 4, 5, 21, 22, 23])})
    BITMASKS.update({x:(1 << i) for i, x in enumerate([69, 70, 71, 72, 73])})

    NUM_VALUES = defaultdict(lambda: 1)
    NUM_VALUES[11] = 3
    NUM_VALUES[0] = 0

    def __init__(self):
        self.byts = bytearray()

    def add_tag(self, tag):
        buf = self.byts
        buf.append(tag)
        buf.append(self.NUM_VALUES[tag])
        # bitmask
        buf.append(self.BITMASKS[tag] if tag else 0)
        # eof
        buf.append(0 if tag else 1)

    def header(self, control_byte_count):
        header = b'TAGX'
        # table length, control byte count
        header += pack(b'>II', 12 + len(self.byts), control_byte_count)
        return header

    @property
    def periodical(self):
        '''
        TAGX block for the primary index header of a periodical
        '''
        for i in (1, 2, 3, 4, 5, 21, 22, 23, 0, 69, 70, 71, 72, 73, 0):
            self.add_tag(i)
        return self.header(2) + bytes(self.byts)

    @property
    def secondary(self):
        '''
        TAGX block for the secondary index header of a periodical
        '''
        for i in (11, 0):
            self.add_tag(i)
        return self.header(1) + bytes(self.byts)

    @property
    def flat_book(self):
        '''
        TAGX block for the primary index header of a flat book
        '''
        for i in (1, 2, 3, 4, 0):
            self.add_tag(i)
        return self.header(1) + bytes(self.byts)

# }}}

# Index Entries {{{


class IndexEntry:

    TAG_VALUES = {
            'offset': 1,
            'size': 2,
            'label_offset': 3,
            'depth': 4,
            'class_offset': 5,
            'secondary': 11,
            'parent_index': 21,
            'first_child_index': 22,
            'last_child_index': 23,
            'image_index': 69,
            'desc_offset': 70,
            'author_offset': 71,
    }
    RTAG_MAP = {v:k for k, v in iteritems(TAG_VALUES)}  # noqa

    def __init__(self, offset, label_offset):
        self.offset, self.label_offset = offset, label_offset
        self.depth, self.class_offset = 0, None
        self.control_byte_count = 1

        self.length = 0
        self.index = 0

        self.parent_index = None
        self.first_child_index = None
        self.last_child_index = None

        self.image_index = None
        self.author_offset = None
        self.desc_offset = None

    def __repr__(self):
        return ('IndexEntry(offset=%r, depth=%r, length=%r, index=%r,'
                ' parent_index=%r)')%(self.offset, self.depth, self.length,
                        self.index, self.parent_index)

    @property
    def size(self):
        return self.length

    @size.setter
    def size(self, val):
        self.length = val

    @property
    def next_offset(self):
        return self.offset + self.length

    @property
    def tag_nums(self):
        yield from range(1, 5)
        for attr in ('class_offset', 'parent_index', 'first_child_index',
                'last_child_index'):
            if getattr(self, attr) is not None:
                yield self.TAG_VALUES[attr]

    @property
    def entry_type(self):
        ans = 0
        for tag in self.tag_nums:
            ans |= TAGX.BITMASKS[tag]
        return ans

    def attr_for_tag(self, tag):
        return self.RTAG_MAP[tag]

    @property
    def bytestring(self):
        buf = io.BytesIO()
        if isinstance(self.index, numbers.Integral):
            buf.write(encode_number_as_hex(self.index))
        else:
            raw = bytearray(self.index.encode('ascii'))
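            # A non-integer index (used by secondary index entries) is
            # encoded as a single length byte followed by its ASCII bytes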
            raw.insert(0, len(raw))
            buf.write(bytes(raw))
        et = self.entry_type
        buf.write(bytes(bytearray([et])))

        if self.control_byte_count == 2:
            flags = 0
            for attr in ('image_index', 'desc_offset', 'author_offset'):
                val = getattr(self, attr)
                if val is not None:
                    tag = self.TAG_VALUES[attr]
                    bm = TAGX.BITMASKS[tag]
                    flags |= bm
            buf.write(bytes(bytearray([flags])))

        for tag in self.tag_nums:
            attr = self.attr_for_tag(tag)
            val = getattr(self, attr)
            if isinstance(val, numbers.Integral):
                val = [val]
            for x in val:
                buf.write(encint(x))

        if self.control_byte_count == 2:
            for attr in ('image_index', 'desc_offset', 'author_offset'):
                val = getattr(self, attr)
                if val is not None:
                    buf.write(encint(val))

        ans = buf.getvalue()
        return ans


class PeriodicalIndexEntry(IndexEntry):

    def __init__(self, offset, label_offset, class_offset, depth):
        IndexEntry.__init__(self, offset, label_offset)
        self.depth = depth
        self.class_offset = class_offset
        self.control_byte_count = 2


class SecondaryIndexEntry(IndexEntry):

    INDEX_MAP = {'author':73, 'caption':72, 'credit':71, 'description':70,
            'mastheadImage':69}

    def __init__(self, index):
        IndexEntry.__init__(self, 0, 0)
        self.index = index

        tag = self.INDEX_MAP[index]

        # The values for this index entry
        # I don't know what the 5 means, it is not the number of entries
        self.secondary = [5 if tag == min(
            itervalues(self.INDEX_MAP)) else 0, 0, tag]

    @property
    def tag_nums(self):
        yield 11

    @property
    def entry_type(self):
        return 1

    @classmethod
    def entries(cls):
        rmap = {v:k for k, v in iteritems(cls.INDEX_MAP)}
        for tag in sorted(rmap, reverse=True):
            yield cls(rmap[tag])

# }}}


class TBS:  # {{{

    '''
    Take the list of index nodes starting/ending on a record and calculate
    the trailing byte sequence for the record.
    '''

    def __init__(self, data, is_periodical, first=False, section_map={},
            after_first=False):
        self.section_map = section_map

        if is_periodical:
            # The starting bytes.
            # The value is zero which I think indicates the periodical
            # index entry. The values for the various flags seem to be
            # unused. If the 0b100 is present, it means that the record
            # deals with section 1 (or is the final record with section
            # transitions).
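            # The type_xyz names mirror the three flag bits passed to
            # encode_tbs() below, e.g. type_110 sets the 0b100 and 0b010
            # flags.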
            self.type_010 = encode_tbs(0, {0b010: 0}, flag_size=3)
            self.type_011 = encode_tbs(0, {0b010: 0, 0b001: 0},
                    flag_size=3)
            self.type_110 = encode_tbs(0, {0b100: 2, 0b010: 0},
                    flag_size=3)
            self.type_111 = encode_tbs(0, {0b100: 2, 0b010: 0, 0b001: 0},
                    flag_size=3)

            if not data:
                byts = b''
                if after_first:
                    # This can happen if a record contains only text between
                    # the periodical start and the first section
                    byts = self.type_011
                self.bytestring = byts
            else:
                depth_map = defaultdict(list)
                for x in ('starts', 'ends', 'completes'):
                    for idx in data[x]:
                        depth_map[idx.depth].append(idx)
                for l in itervalues(depth_map):
                    l.sort(key=lambda x: x.offset)
                self.periodical_tbs(data, first, depth_map)
        else:
            if not data:
                self.bytestring = b''
            else:
                self.book_tbs(data, first)

    def periodical_tbs(self, data, first, depth_map):
        buf = io.BytesIO()

        has_section_start = (depth_map[1] and
                set(depth_map[1]).intersection(set(data['starts'])))
        spanner = data['spans']
        parent_section_index = -1

        if depth_map[0]:
            # We have a terminal record

            # Find the first non periodical node
            first_node = None
            for nodes in (depth_map[1], depth_map[2]):
                for node in nodes:
                    if (first_node is None or (node.offset, node.depth) <
                            (first_node.offset, first_node.depth)):
                        first_node = node

            typ = (self.type_110 if has_section_start else self.type_010)

            # parent_section_index is needed for the last record
            if first_node is not None and first_node.depth > 0:
                parent_section_index = (first_node.index if
                        first_node.depth == 1 else first_node.parent_index)
            else:
                parent_section_index = max(iter(self.section_map))
        else:
            # Non terminal record
            if spanner is not None:
                # record is spanned by a single article
                parent_section_index = spanner.parent_index
                typ = (self.type_110 if parent_section_index == 1 else
                        self.type_010)
            elif not depth_map[1]:
                # has only article nodes, i.e. spanned by a section
                parent_section_index = depth_map[2][0].parent_index
                typ = (self.type_111 if parent_section_index == 1 else
                        self.type_010)
            else:
                # has section transitions
                if depth_map[2]:
                    parent_section_index = depth_map[2][0].parent_index
                else:
                    parent_section_index = depth_map[1][0].index
                typ = self.type_011

        buf.write(typ)

        if typ not in (self.type_110, self.type_111) and parent_section_index > 0:
            extra = {}
            # Write starting section information
            if spanner is None:
                num_articles = len([a for a in depth_map[1]
                        if a.parent_index == parent_section_index])
                if not depth_map[1]:
                    extra = {0b0001: 0}
                if num_articles > 1:
                    extra = {0b0100: num_articles}
            buf.write(encode_tbs(parent_section_index, extra))

        if spanner is None:
            articles = depth_map[2]
            sections = {self.section_map[a.parent_index] for a in articles}
            sections = sorted(sections, key=lambda x: x.offset)
            section_map = {s: [a for a in articles if a.parent_index == s.index]
                    for s in sections}
            for i, section in enumerate(sections):
                # All the articles in this record that belong to this section
                articles = section_map[section]
                first_article = articles[0]
                last_article = articles[-1]
                num = len(articles)
                last_article_ends = (last_article in data['ends'] or
                        last_article in data['completes'])

                try:
                    next_sec = sections[i+1]
                except IndexError:
                    next_sec = None

                extra = {}
                if num > 1:
                    extra[0b0100] = num
                if False and i == 0 and next_sec is not None:
                    # Write offset to next section from start of record
                    # I can't figure out exactly when Kindlegen decides to
                    # write this so I have disabled it for now.
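                    # (The 'if False' in the condition above keeps this
                    # branch permanently disabled.)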
                    extra[0b0001] = next_sec.offset - data['offset']
                buf.write(encode_tbs(first_article.index-section.index, extra))

                if next_sec is not None:
                    buf.write(encode_tbs(last_article.index-next_sec.index,
                        {0b1000: 0}))
                # If a section TOC starts and extends into the next record add
                # a trailing vwi. We detect this by TBS type==3, processing the
                # last section present in the record, and the last article in
                # that section either ends or completes and doesn't finish
                # on the last byte of the record.
                elif (typ == self.type_011 and last_article_ends and
                        ((last_article.offset+last_article.size) % RECORD_SIZE > 0)):
                    buf.write(encode_tbs(last_article.index-section.index-1,
                        {0b1000: 0}))
        else:
            buf.write(encode_tbs(spanner.index - parent_section_index,
                {0b0001: 0}))

        self.bytestring = buf.getvalue()

    def book_tbs(self, data, first):
        spanner = data['spans']
        if spanner is not None:
            self.bytestring = encode_tbs(spanner.index,
                    {0b010: 0, 0b001: 0}, flag_size=3)
        else:
            starts, completes, ends = (data['starts'], data['completes'],
                    data['ends'])
            if (not completes and (
                    (len(starts) == 1 and not ends) or
                    (len(ends) == 1 and not starts))):
                node = starts[0] if starts else ends[0]
                self.bytestring = encode_tbs(node.index, {0b010: 0},
                        flag_size=3)
            else:
                nodes = []
                for x in (starts, completes, ends):
                    nodes.extend(x)
                nodes.sort(key=lambda x: x.index)
                self.bytestring = encode_tbs(nodes[0].index,
                        {0b010: 0, 0b100: len(nodes)}, flag_size=3)
# }}}


class Indexer:  # {{{

    def __init__(self, serializer, number_of_text_records,
            size_of_last_text_record, masthead_offset, is_periodical,
            opts, oeb):
        self.serializer = serializer
        self.number_of_text_records = number_of_text_records
        self.text_size = (RECORD_SIZE * (self.number_of_text_records-1) +
                size_of_last_text_record)
        self.masthead_offset = masthead_offset
        self.secondary_record_offset = None

        self.oeb = oeb
        self.log = oeb.log
        self.opts = opts

        self.is_periodical = is_periodical
        if self.is_periodical and self.masthead_offset is None:
            raise ValueError('Periodicals must have a masthead')
        self.log('Generating MOBI index for a %s'%('periodical' if
            self.is_periodical else 'book'))
        self.is_flat_periodical = False
        if self.is_periodical:
            periodical_node = next(iter(oeb.toc))
            sections = tuple(periodical_node)
            self.is_flat_periodical = len(sections) == 1

        self.records = []

        if self.is_periodical:
            # Ensure all articles have an author and description before
            # creating the CNCX
            for node in oeb.toc.iterdescendants():
                if node.klass == 'article':
                    aut, desc = node.author, node.description
                    if not aut:
                        aut = _('Unknown')
                    if not desc:
                        desc = _('No details available')
                    node.author, node.description = aut, desc

        self.cncx = CNCX(oeb.toc, self.is_periodical)

        if self.is_periodical:
            self.indices = self.create_periodical_index()
        else:
            self.indices = self.create_book_index()

        if not self.indices:
            raise ValueError('No valid entries in TOC, cannot generate index')

        self.records.append(self.create_index_record())
        self.records.insert(0, self.create_header())
        self.records.extend(self.cncx.records)

        if is_periodical:
            self.secondary_record_offset = len(self.records)
            self.records.append(self.create_header(secondary=True))
            self.records.append(self.create_index_record(secondary=True))

        self.calculate_trailing_byte_sequences()

    def create_index_record(self, secondary=False):  # {{{
        header_length = 192
        buf = io.BytesIO()
        indices = list(SecondaryIndexEntry.entries()) if secondary else self.indices

        # Write index entries
        offsets = []
        for i in indices:
            offsets.append(buf.tell())
            buf.write(i.bytestring)
        index_block = align_block(buf.getvalue())
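
        # An index record is laid out as: the 192 byte INDX header, then
        # the index entries, then an IDXT block that lists the offset of
        # each entry from the start of the record.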

        # Write offsets to index entries as an IDXT block
        idxt_block = b'IDXT'
        buf.seek(0)
        buf.truncate(0)
        for offset in offsets:
            buf.write(pack(b'>H', header_length+offset))
        idxt_block = align_block(idxt_block + buf.getvalue())
        body = index_block + idxt_block

        header = b'INDX'
        buf.seek(0)
        buf.truncate(0)
        buf.write(pack(b'>I', header_length))
        buf.write(b'\0'*4)  # Unknown
        buf.write(pack(b'>I', 1))  # Header type? Or index record number?
        buf.write(b'\0'*4)  # Unknown

        # IDXT block offset
        buf.write(pack(b'>I', header_length + len(index_block)))

        # Number of index entries
        buf.write(pack(b'>I', len(offsets)))

        # Unknown
        buf.write(b'\xff'*8)

        # Unknown
        buf.write(b'\0'*156)

        header += buf.getvalue()

        ans = header + body
        if len(ans) > 0x10000:
            raise ValueError('Too many entries (%d) in the TOC'%len(offsets))
        return ans
    # }}}

    def create_header(self, secondary=False):  # {{{
        buf = io.BytesIO()
        if secondary:
            tagx_block = TAGX().secondary
        else:
            tagx_block = (TAGX().periodical if self.is_periodical else
                    TAGX().flat_book)
        header_length = 192

        # Ident 0 - 4
        buf.write(b'INDX')

        # Header length 4 - 8
        buf.write(pack(b'>I', header_length))

        # Unknown 8 - 16
        buf.write(b'\0'*8)

        # Index type: 0 - normal, 2 - inflection 16 - 20
        buf.write(pack(b'>I', 2))

        # IDXT offset 20 - 24
        buf.write(pack(b'>I', 0))  # Filled in later

        # Number of index records 24 - 28
        buf.write(pack(b'>I', 1 if secondary else len(self.records)))

        # Index Encoding 28 - 32
        buf.write(pack(b'>I', 65001))  # utf-8

        # Unknown 32 - 36
        buf.write(b'\xff'*4)

        # Number of index entries 36 - 40
        indices = list(SecondaryIndexEntry.entries()) if secondary else self.indices
        buf.write(pack(b'>I', len(indices)))

        # ORDT offset 40 - 44
        buf.write(pack(b'>I', 0))

        # LIGT offset 44 - 48
        buf.write(pack(b'>I', 0))

        # Number of LIGT entries 48 - 52
        buf.write(pack(b'>I', 0))

        # Number of CNCX records 52 - 56
        buf.write(pack(b'>I', 0 if secondary else len(self.cncx.records)))

        # Unknown 56 - 180
        buf.write(b'\0'*124)

        # TAGX offset 180 - 184
        buf.write(pack(b'>I', header_length))

        # Unknown 184 - 192
        buf.write(b'\0'*8)

        # TAGX block
        buf.write(tagx_block)

        num = len(indices)

        # The index of the last entry in the NCX
        idx = indices[-1].index
        if isinstance(idx, numbers.Integral):
            idx = encode_number_as_hex(idx)
        else:
            idx = idx.encode('ascii')
            idx = bytes(bytearray([len(idx)])) + idx
        buf.write(idx)

        # The number of entries in the NCX
        buf.write(pack(b'>H', num))

        # Padding
        pad = (4 - (buf.tell()%4))%4
        if pad:
            buf.write(b'\0'*pad)

        idxt_offset = buf.tell()

        buf.write(b'IDXT')
        buf.write(pack(b'>H', header_length + len(tagx_block)))
        buf.write(b'\0')
        buf.seek(20)
        buf.write(pack(b'>I', idxt_offset))

        return align_block(buf.getvalue())
    # }}}

    def create_book_index(self):  # {{{
        indices = []
        seen = set()
        id_offsets = self.serializer.id_offsets

        # Flatten the toc so that chapter to chapter jumps work with all
        # sub chapter levels as well
        for node in self.oeb.toc.iterdescendants():
            try:
                offset = id_offsets[node.href]
                label = self.cncx[node.title]
            except Exception:
                self.log.warn('TOC item %s [%s] not found in document'%(
                    node.title, node.href))
                continue

            if offset in seen:
                continue
            seen.add(offset)

            indices.append(IndexEntry(offset, label))

        indices.sort(key=lambda x: x.offset)

        # Set lengths
        for i, index in enumerate(indices):
            try:
                next_offset = indices[i+1].offset
            except IndexError:
                next_offset = self.serializer.body_end_offset
            index.length = next_offset - index.offset

        # Remove empty indices
        indices = [x for x in indices if x.length > 0]

        # Reset lengths in case any were removed
        for i, index in enumerate(indices):
            try:
                next_offset = indices[i+1].offset
            except IndexError:
                next_offset = self.serializer.body_end_offset
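            # Entries are contiguous: each entry extends up to the start
            # of the one that follows it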
            index.length = next_offset - index.offset

        # Set index values
        for index, x in enumerate(indices):
            x.index = index

        return indices
    # }}}

    def create_periodical_index(self):  # {{{
        periodical_node = next(iter(self.oeb.toc))
        periodical_node_offset = self.serializer.body_start_offset
        periodical_node_size = (self.serializer.body_end_offset -
                periodical_node_offset)

        normalized_sections = []

        id_offsets = self.serializer.id_offsets

        periodical = PeriodicalIndexEntry(periodical_node_offset,
                self.cncx[periodical_node.title],
                self.cncx[periodical_node.klass], 0)
        periodical.length = periodical_node_size
        periodical.first_child_index = 1
        periodical.image_index = self.masthead_offset

        seen_sec_offsets = set()
        seen_art_offsets = set()

        for sec in periodical_node:
            normalized_articles = []
            try:
                offset = id_offsets[sec.href]
                label = self.cncx[sec.title]
                klass = self.cncx[sec.klass]
            except Exception:
                continue
            if offset in seen_sec_offsets:
                continue
            seen_sec_offsets.add(offset)

            section = PeriodicalIndexEntry(offset, label, klass, 1)
            section.parent_index = 0

            for art in sec:
                try:
                    offset = id_offsets[art.href]
                    label = self.cncx[art.title]
                    klass = self.cncx[art.klass]
                except Exception:
                    continue
                if offset in seen_art_offsets:
                    continue
                seen_art_offsets.add(offset)

                article = PeriodicalIndexEntry(offset, label, klass, 2)
                normalized_articles.append(article)
                article.author_offset = self.cncx[art.author]
                article.desc_offset = self.cncx[art.description]
                if getattr(art, 'toc_thumbnail', None) is not None:
                    try:
                        ii = self.serializer.images[art.toc_thumbnail] - 1
                        if ii > -1:
                            article.image_index = ii
                    except KeyError:
                        pass  # Image not found in serializer

            if normalized_articles:
                normalized_articles.sort(key=lambda x: x.offset)
                normalized_sections.append((section, normalized_articles))

        normalized_sections.sort(key=lambda x: x[0].offset)

        # Set lengths
        for s, x in enumerate(normalized_sections):
            sec, normalized_articles = x
            try:
                sec.length = normalized_sections[s+1][0].offset - sec.offset
            except IndexError:
                sec.length = self.serializer.body_end_offset - sec.offset
            for i, art in enumerate(normalized_articles):
                try:
                    art.length = normalized_articles[i+1].offset - art.offset
                except IndexError:
                    art.length = sec.offset + sec.length - art.offset

        # Filter out empty sections and articles
        for i, x in list(enumerate(normalized_sections)):
            sec, normalized_articles = x
            normalized_articles = list(filter(lambda x: x.length > 0,
                normalized_articles))
            normalized_sections[i] = (sec, normalized_articles)

        normalized_sections = list(filter(lambda x: x[0].length > 0 and x[1],
            normalized_sections))

        # Set indices
        i = 0
        for sec, articles in normalized_sections:
            i += 1
            sec.index = i
            sec.parent_index = 0

        for sec, articles in normalized_sections:
            for art in articles:
                i += 1
                art.index = i
                art.parent_index = sec.index

        for sec, normalized_articles in normalized_sections:
            sec.first_child_index = normalized_articles[0].index
            sec.last_child_index = normalized_articles[-1].index

        # Set lengths again to close up any gaps left by filtering
        for s, x in enumerate(normalized_sections):
            sec, articles = x
            try:
                next_offset = normalized_sections[s+1][0].offset
            except IndexError:
                next_offset = self.serializer.body_end_offset
            sec.length = next_offset - sec.offset

            for a, art in enumerate(articles):
                try:
                    next_offset = articles[a+1].offset
                except IndexError:
                    next_offset = sec.next_offset
                art.length = next_offset - art.offset

        # Sanity check
        for s, x in enumerate(normalized_sections):
            sec, articles = x
            try:
                next_sec = normalized_sections[s+1][0]
            except IndexError:
                if (sec.length == 0 or
                        sec.next_offset != self.serializer.body_end_offset):
                    raise ValueError('Invalid section layout')
            else:
                if next_sec.offset != sec.next_offset or sec.length == 0:
                    raise ValueError('Invalid section layout')
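
            # Apply the same contiguity check to the articles inside each
            # section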
            for a, art in enumerate(articles):
                try:
                    next_art = articles[a+1]
                except IndexError:
                    if (art.length == 0 or
                            art.next_offset != sec.next_offset):
                        raise ValueError('Invalid article layout')
                else:
                    if art.length == 0 or art.next_offset != next_art.offset:
                        raise ValueError('Invalid article layout')

        # Flatten
        indices = [periodical]
        for sec, articles in normalized_sections:
            indices.append(sec)
            periodical.last_child_index = sec.index
        for sec, articles in normalized_sections:
            for a in articles:
                indices.append(a)

        return indices
    # }}}

    # TBS {{{
    def calculate_trailing_byte_sequences(self):
        self.tbs_map = {}
        found_node = False
        sections = [i for i in self.indices if i.depth == 1]
        section_map = OrderedDict((i.index, i) for i in
                sorted(sections, key=lambda x: x.offset))

        deepest = max(i.depth for i in self.indices)

        for i in range(self.number_of_text_records):
            offset = i * RECORD_SIZE
            next_offset = offset + RECORD_SIZE
            data = {'ends':[], 'completes':[], 'starts':[],
                    'spans':None, 'offset':offset, 'record_number':i+1}

            for index in self.indices:
                if index.offset >= next_offset:
                    # Node starts after current record
                    if index.depth == deepest:
                        break
                    else:
                        continue
                if index.next_offset <= offset:
                    # Node ends before current record
                    continue
                if index.offset >= offset:
                    # Node starts in current record
                    if index.next_offset <= next_offset:
                        # Node ends in current record
                        data['completes'].append(index)
                    else:
                        data['starts'].append(index)
                else:
                    # Node starts before current record
                    if index.next_offset <= next_offset:
                        # Node ends in current record
                        data['ends'].append(index)
                    elif index.depth == deepest:
                        data['spans'] = index

            if (data['ends'] or data['completes'] or data['starts'] or
                    data['spans'] is not None):
                self.tbs_map[i+1] = TBS(data, self.is_periodical,
                        first=not found_node, section_map=section_map)
                found_node = True
            else:
                self.tbs_map[i+1] = TBS({}, self.is_periodical, first=False,
                        after_first=found_node, section_map=section_map)

    def get_trailing_byte_sequence(self, num):
        return self.tbs_map[num].bytestring
    # }}}

# }}}