#!/usr/bin/env python3
__license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import numbers
from struct import pack
import io
from collections import OrderedDict, defaultdict
from calibre.ebooks.mobi.utils import (encint, encode_number_as_hex,
encode_tbs, align_block, RECORD_SIZE, CNCX as CNCX_)
from polyglot.builtins import iteritems, itervalues
class CNCX(CNCX_): # {{{
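    # Compiled NCX: the string table referenced by index entries. Strings
    # are collected breadth-first from the TOC: every node's title and, for
    # periodicals, its klass, author and description. The base class in
    # calibre.ebooks.mobi.utils maps each string to its offset in the CNCX
    # records (used below as label/class/author/desc offsets).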
def __init__(self, toc, is_periodical):
strings = []
for item in toc.iterdescendants(breadth_first=True):
strings.append(item.title)
if is_periodical:
strings.append(item.klass)
if item.author:
strings.append(item.author)
if item.description:
strings.append(item.description)
CNCX_.__init__(self, strings)
# }}}
class TAGX: # {{{
BITMASKS = {11:0b1}
BITMASKS.update({x:(1 << i) for i, x in enumerate([1, 2, 3, 4, 5, 21, 22, 23])})
BITMASKS.update({x:(1 << i) for i, x in enumerate([69, 70, 71, 72, 73])})
NUM_VALUES = defaultdict(lambda :1)
NUM_VALUES[11] = 3
NUM_VALUES[0] = 0
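    # For illustration, the resulting bitmasks are:
    #   first control byte:  1->0b1, 2->0b10, 3->0b100, 4->0b1000, 5->0b10000,
    #                        21->0b100000, 22->0b1000000, 23->0b10000000
    #   second control byte: 69->0b1, 70->0b10, 71->0b100, 72->0b1000, 73->0b10000
    # Tags 69-73 reuse the low bits because their flags go into a second
    # control byte (see IndexEntry.bytestring, which writes a second flags
    # byte when control_byte_count == 2). Tag 11 (secondary index) also maps
    # to 0b1 and takes three values.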
def __init__(self):
self.byts = bytearray()
def add_tag(self, tag):
buf = self.byts
buf.append(tag)
buf.append(self.NUM_VALUES[tag])
# bitmask
buf.append(self.BITMASKS[tag] if tag else 0)
# eof
buf.append(0 if tag else 1)
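    # Worked example (from the code above): add_tag(11) appends the four
    # bytes (11, 3, 0b1, 0), i.e. tag, number of values, bitmask and
    # end-of-tags flag, while the terminator add_tag(0) appends (0, 0, 0, 1).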
def header(self, control_byte_count):
header = b'TAGX'
# table length, control byte count
header += pack(b'>II', 12+len(self.byts), control_byte_count)
return header
@property
def periodical(self):
'''
TAGX block for the Primary index header of a periodical
'''
        for i in (1, 2, 3, 4, 5, 21, 22, 23, 0, 69, 70, 71, 72, 73, 0):
self.add_tag(i)
return self.header(2) + bytes(self.byts)
@property
def secondary(self):
'''
TAGX block for the secondary index header of a periodical
'''
for i in (11, 0):
self.add_tag(i)
return self.header(1) + bytes(self.byts)
@property
def flat_book(self):
'''
TAGX block for the primary index header of a flat book
'''
for i in (1, 2, 3, 4, 0):
self.add_tag(i)
return self.header(1) + bytes(self.byts)
# }}}
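# For illustration: TAGX().flat_book serializes five 4-byte tag entries for
# the tags (1, 2, 3, 4, 0), i.e. 20 bytes, preceded by the 12-byte header
# b'TAGX' + pack(b'>II', 32, 1), giving a 32-byte block with one control byte.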
# Index Entries {{{
class IndexEntry:
TAG_VALUES = {
'offset': 1,
'size': 2,
'label_offset': 3,
'depth': 4,
'class_offset': 5,
'secondary': 11,
'parent_index': 21,
'first_child_index': 22,
'last_child_index': 23,
'image_index': 69,
'desc_offset': 70,
'author_offset': 71,
}
RTAG_MAP = {v:k for k, v in iteritems(TAG_VALUES)} # noqa
def __init__(self, offset, label_offset):
self.offset, self.label_offset = offset, label_offset
self.depth, self.class_offset = 0, None
self.control_byte_count = 1
self.length = 0
self.index = 0
self.parent_index = None
self.first_child_index = None
self.last_child_index = None
self.image_index = None
self.author_offset = None
self.desc_offset = None
def __repr__(self):
return ('IndexEntry(offset=%r, depth=%r, length=%r, index=%r,'
' parent_index=%r)')%(self.offset, self.depth, self.length,
self.index, self.parent_index)
@property
def size(self):
return self.length
@size.setter
def size(self, val):
self.length = val
@property
def next_offset(self):
return self.offset + self.length
@property
def tag_nums(self):
yield from range(1, 5)
for attr in ('class_offset', 'parent_index', 'first_child_index',
'last_child_index'):
if getattr(self, attr) is not None:
yield self.TAG_VALUES[attr]
@property
def entry_type(self):
ans = 0
for tag in self.tag_nums:
ans |= TAGX.BITMASKS[tag]
return ans
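    # For a flat book entry, tag_nums always yields tags 1-4, so the minimal
    # entry_type is 0b1111; the optional tags (class_offset, parent and
    # first/last child indices) OR in their TAGX bitmasks on top of that.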
def attr_for_tag(self, tag):
return self.RTAG_MAP[tag]
@property
def bytestring(self):
buf = io.BytesIO()
if isinstance(self.index, numbers.Integral):
buf.write(encode_number_as_hex(self.index))
else:
raw = bytearray(self.index.encode('ascii'))
raw.insert(0, len(raw))
buf.write(bytes(raw))
et = self.entry_type
buf.write(bytes(bytearray([et])))
if self.control_byte_count == 2:
flags = 0
for attr in ('image_index', 'desc_offset', 'author_offset'):
val = getattr(self, attr)
if val is not None:
tag = self.TAG_VALUES[attr]
bm = TAGX.BITMASKS[tag]
flags |= bm
buf.write(bytes(bytearray([flags])))
for tag in self.tag_nums:
attr = self.attr_for_tag(tag)
val = getattr(self, attr)
if isinstance(val, numbers.Integral):
val = [val]
for x in val:
buf.write(encint(x))
if self.control_byte_count == 2:
for attr in ('image_index', 'desc_offset', 'author_offset'):
val = getattr(self, attr)
if val is not None:
buf.write(encint(val))
ans = buf.getvalue()
return ans
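    # The serialized entry is thus: the entry name (the index number as a
    # length-prefixed hex string via encode_number_as_hex, or a raw
    # length-prefixed label for string indices), the entry_type control byte,
    # an optional second flags byte for tags 69-73, and then the tag values
    # encoded as variable-width integers (encint) in ascending tag order.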
class PeriodicalIndexEntry(IndexEntry):
def __init__(self, offset, label_offset, class_offset, depth):
IndexEntry.__init__(self, offset, label_offset)
self.depth = depth
self.class_offset = class_offset
self.control_byte_count = 2
class SecondaryIndexEntry(IndexEntry):
INDEX_MAP = {'author':73, 'caption':72, 'credit':71, 'description':70,
'mastheadImage':69}
def __init__(self, index):
IndexEntry.__init__(self, 0, 0)
self.index = index
tag = self.INDEX_MAP[index]
# The values for this index entry
        # I don't know what the 5 means; it is not the number of entries
self.secondary = [5 if tag == min(
itervalues(self.INDEX_MAP)) else 0, 0, tag]
@property
def tag_nums(self):
yield 11
@property
def entry_type(self):
return 1
@classmethod
def entries(cls):
rmap = {v:k for k,v in iteritems(cls.INDEX_MAP)}
for tag in sorted(rmap, reverse=True):
yield cls(rmap[tag])
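    # entries() yields the five secondary entries in descending tag order:
    # author (73), caption (72), credit (71), description (70) and
    # mastheadImage (69); only the last carries the mysterious 5 in its
    # values, since 69 == min(INDEX_MAP.values()).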
# }}}
class TBS: # {{{
'''
Take the list of index nodes starting/ending on a record and calculate the
trailing byte sequence for the record.
'''
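    # The data dict is built by Indexer.calculate_trailing_byte_sequences
    # and looks like:
    #   {'starts': [...], 'ends': [...], 'completes': [...],
    #    'spans': node_or_None, 'offset': record_start, 'record_number': n}
    # where each list holds index nodes classified against the record window.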
def __init__(self, data, is_periodical, first=False, section_map={},
after_first=False):
self.section_map = section_map
if is_periodical:
# The starting bytes.
# The value is zero which I think indicates the periodical
# index entry. The values for the various flags seem to be
# unused. If the 0b100 is present, it means that the record
# deals with section 1 (or is the final record with section
# transitions).
self.type_010 = encode_tbs(0, {0b010: 0}, flag_size=3)
self.type_011 = encode_tbs(0, {0b010: 0, 0b001: 0},
flag_size=3)
self.type_110 = encode_tbs(0, {0b100: 2, 0b010: 0},
flag_size=3)
self.type_111 = encode_tbs(0, {0b100: 2, 0b010: 0, 0b001:
0}, flag_size=3)
if not data:
byts = b''
if after_first:
# This can happen if a record contains only text between
# the periodical start and the first section
byts = self.type_011
self.bytestring = byts
else:
depth_map = defaultdict(list)
for x in ('starts', 'ends', 'completes'):
for idx in data[x]:
depth_map[idx.depth].append(idx)
for l in itervalues(depth_map):
l.sort(key=lambda x:x.offset)
self.periodical_tbs(data, first, depth_map)
else:
if not data:
self.bytestring = b''
else:
self.book_tbs(data, first)
def periodical_tbs(self, data, first, depth_map):
buf = io.BytesIO()
has_section_start = (depth_map[1] and
set(depth_map[1]).intersection(set(data['starts'])))
spanner = data['spans']
parent_section_index = -1
if depth_map[0]:
# We have a terminal record
            # Find the first non-periodical node
first_node = None
for nodes in (depth_map[1], depth_map[2]):
for node in nodes:
if (first_node is None or (node.offset, node.depth) <
(first_node.offset, first_node.depth)):
first_node = node
typ = (self.type_110 if has_section_start else self.type_010)
# parent_section_index is needed for the last record
if first_node is not None and first_node.depth > 0:
parent_section_index = (first_node.index if first_node.depth == 1 else first_node.parent_index)
else:
parent_section_index = max(iter(self.section_map))
else:
# Non terminal record
if spanner is not None:
# record is spanned by a single article
parent_section_index = spanner.parent_index
typ = (self.type_110 if parent_section_index == 1 else
self.type_010)
elif not depth_map[1]:
# has only article nodes, i.e. spanned by a section
parent_section_index = depth_map[2][0].parent_index
typ = (self.type_111 if parent_section_index == 1 else
self.type_010)
else:
# has section transitions
if depth_map[2]:
parent_section_index = depth_map[2][0].parent_index
else:
parent_section_index = depth_map[1][0].index
typ = self.type_011
buf.write(typ)
if typ not in (self.type_110, self.type_111) and parent_section_index > 0:
extra = {}
# Write starting section information
if spanner is None:
num_articles = len([a for a in depth_map[1] if a.parent_index == parent_section_index])
if not depth_map[1]:
extra = {0b0001: 0}
if num_articles > 1:
extra = {0b0100: num_articles}
buf.write(encode_tbs(parent_section_index, extra))
if spanner is None:
articles = depth_map[2]
sections = {self.section_map[a.parent_index] for a in
articles}
sections = sorted(sections, key=lambda x:x.offset)
section_map = {s:[a for a in articles if a.parent_index ==
s.index] for s in sections}
for i, section in enumerate(sections):
                # All the articles in this record that belong to this section
articles = section_map[section]
first_article = articles[0]
last_article = articles[-1]
num = len(articles)
last_article_ends = (last_article in data['ends'] or
last_article in data['completes'])
try:
next_sec = sections[i+1]
                except IndexError:
next_sec = None
extra = {}
if num > 1:
extra[0b0100] = num
if False and i == 0 and next_sec is not None:
# Write offset to next section from start of record
# I can't figure out exactly when Kindlegen decides to
# write this so I have disabled it for now.
extra[0b0001] = next_sec.offset - data['offset']
buf.write(encode_tbs(first_article.index-section.index, extra))
if next_sec is not None:
buf.write(encode_tbs(last_article.index-next_sec.index,
{0b1000: 0}))
                # If a section TOC starts and extends into the next record,
                # add a trailing vwi. We detect this when the TBS type is
                # 0b011, we are processing the last section present in the
                # record, and the last article in that section ends or
                # completes but does not finish on the last byte of the
                # record.
elif (typ == self.type_011 and last_article_ends and
((last_article.offset+last_article.size) % RECORD_SIZE > 0)
):
buf.write(encode_tbs(last_article.index-section.index-1,
{0b1000: 0}))
else:
buf.write(encode_tbs(spanner.index - parent_section_index,
{0b0001: 0}))
self.bytestring = buf.getvalue()
def book_tbs(self, data, first):
spanner = data['spans']
if spanner is not None:
self.bytestring = encode_tbs(spanner.index, {0b010: 0, 0b001: 0},
flag_size=3)
else:
starts, completes, ends = (data['starts'], data['completes'],
data['ends'])
if (not completes and (
(len(starts) == 1 and not ends) or (len(ends) == 1 and not
starts))):
node = starts[0] if starts else ends[0]
self.bytestring = encode_tbs(node.index, {0b010: 0}, flag_size=3)
else:
nodes = []
for x in (starts, completes, ends):
nodes.extend(x)
nodes.sort(key=lambda x:x.index)
self.bytestring = encode_tbs(nodes[0].index, {0b010:0,
0b100: len(nodes)}, flag_size=3)
# }}}
class Indexer: # {{{
def __init__(self, serializer, number_of_text_records,
size_of_last_text_record, masthead_offset, is_periodical,
opts, oeb):
self.serializer = serializer
self.number_of_text_records = number_of_text_records
self.text_size = (RECORD_SIZE * (self.number_of_text_records-1) +
size_of_last_text_record)
self.masthead_offset = masthead_offset
self.secondary_record_offset = None
self.oeb = oeb
self.log = oeb.log
self.opts = opts
self.is_periodical = is_periodical
if self.is_periodical and self.masthead_offset is None:
raise ValueError('Periodicals must have a masthead')
self.log('Generating MOBI index for a %s'%('periodical' if
self.is_periodical else 'book'))
self.is_flat_periodical = False
if self.is_periodical:
periodical_node = next(iter(oeb.toc))
sections = tuple(periodical_node)
self.is_flat_periodical = len(sections) == 1
self.records = []
if self.is_periodical:
# Ensure all articles have an author and description before
# creating the CNCX
for node in oeb.toc.iterdescendants():
if node.klass == 'article':
aut, desc = node.author, node.description
if not aut:
aut = _('Unknown')
if not desc:
desc = _('No details available')
node.author, node.description = aut, desc
self.cncx = CNCX(oeb.toc, self.is_periodical)
if self.is_periodical:
self.indices = self.create_periodical_index()
else:
self.indices = self.create_book_index()
if not self.indices:
raise ValueError('No valid entries in TOC, cannot generate index')
self.records.append(self.create_index_record())
self.records.insert(0, self.create_header())
self.records.extend(self.cncx.records)
if is_periodical:
self.secondary_record_offset = len(self.records)
self.records.append(self.create_header(secondary=True))
self.records.append(self.create_index_record(secondary=True))
self.calculate_trailing_byte_sequences()
def create_index_record(self, secondary=False): # {{{
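        # Layout: 192-byte INDX header, the index entry bytestrings, then an
        # IDXT block listing the offset of each entry from the start of the
        # record.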
header_length = 192
buf = io.BytesIO()
indices = list(SecondaryIndexEntry.entries()) if secondary else self.indices
# Write index entries
offsets = []
for i in indices:
offsets.append(buf.tell())
buf.write(i.bytestring)
index_block = align_block(buf.getvalue())
# Write offsets to index entries as an IDXT block
idxt_block = b'IDXT'
buf.seek(0), buf.truncate(0)
for offset in offsets:
buf.write(pack(b'>H', header_length+offset))
idxt_block = align_block(idxt_block + buf.getvalue())
body = index_block + idxt_block
header = b'INDX'
buf.seek(0), buf.truncate(0)
buf.write(pack(b'>I', header_length))
buf.write(b'\0'*4) # Unknown
buf.write(pack(b'>I', 1)) # Header type? Or index record number?
buf.write(b'\0'*4) # Unknown
# IDXT block offset
buf.write(pack(b'>I', header_length + len(index_block)))
# Number of index entries
buf.write(pack(b'>I', len(offsets)))
# Unknown
buf.write(b'\xff'*8)
# Unknown
buf.write(b'\0'*156)
header += buf.getvalue()
ans = header + body
if len(ans) > 0x10000:
raise ValueError('Too many entries (%d) in the TOC'%len(offsets))
return ans
# }}}
def create_header(self, secondary=False): # {{{
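        # Layout: 192-byte INDX header, TAGX block, the last entry's
        # length-prefixed name followed by the entry count, padding to a
        # 4-byte boundary, then a small IDXT trailer whose offset is patched
        # back into bytes 20-24 of the header below.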
buf = io.BytesIO()
if secondary:
tagx_block = TAGX().secondary
else:
tagx_block = (TAGX().periodical if self.is_periodical else
TAGX().flat_book)
header_length = 192
# Ident 0 - 4
buf.write(b'INDX')
# Header length 4 - 8
buf.write(pack(b'>I', header_length))
# Unknown 8-16
buf.write(b'\0'*8)
        # Index type (0 - normal, 2 - inflection) 16 - 20
buf.write(pack(b'>I', 2))
# IDXT offset 20-24
buf.write(pack(b'>I', 0)) # Filled in later
# Number of index records 24-28
buf.write(pack(b'>I', 1 if secondary else len(self.records)))
# Index Encoding 28-32
buf.write(pack(b'>I', 65001)) # utf-8
# Unknown 32-36
buf.write(b'\xff'*4)
# Number of index entries 36-40
indices = list(SecondaryIndexEntry.entries()) if secondary else self.indices
buf.write(pack(b'>I', len(indices)))
# ORDT offset 40-44
buf.write(pack(b'>I', 0))
# LIGT offset 44-48
buf.write(pack(b'>I', 0))
# Number of LIGT entries 48-52
buf.write(pack(b'>I', 0))
# Number of CNCX records 52-56
buf.write(pack(b'>I', 0 if secondary else len(self.cncx.records)))
# Unknown 56-180
buf.write(b'\0'*124)
# TAGX offset 180-184
buf.write(pack(b'>I', header_length))
# Unknown 184-192
buf.write(b'\0'*8)
# TAGX block
buf.write(tagx_block)
num = len(indices)
# The index of the last entry in the NCX
idx = indices[-1].index
if isinstance(idx, numbers.Integral):
idx = encode_number_as_hex(idx)
else:
idx = idx.encode('ascii')
idx = (bytes(bytearray([len(idx)]))) + idx
buf.write(idx)
# The number of entries in the NCX
buf.write(pack(b'>H', num))
# Padding
pad = (4 - (buf.tell()%4))%4
if pad:
buf.write(b'\0'*pad)
idxt_offset = buf.tell()
buf.write(b'IDXT')
buf.write(pack(b'>H', header_length + len(tagx_block)))
buf.write(b'\0')
buf.seek(20)
buf.write(pack(b'>I', idxt_offset))
return align_block(buf.getvalue())
# }}}
def create_book_index(self): # {{{
indices = []
seen = set()
id_offsets = self.serializer.id_offsets
        # Flatten the TOC so that chapter-to-chapter jumps work across all
        # sub-chapter levels as well
for node in self.oeb.toc.iterdescendants():
try:
offset = id_offsets[node.href]
label = self.cncx[node.title]
            except KeyError:
self.log.warn('TOC item %s [%s] not found in document'%(
node.title, node.href))
continue
if offset in seen:
continue
seen.add(offset)
indices.append(IndexEntry(offset, label))
indices.sort(key=lambda x:x.offset)
# Set lengths
for i, index in enumerate(indices):
try:
next_offset = indices[i+1].offset
            except IndexError:
next_offset = self.serializer.body_end_offset
index.length = next_offset - index.offset
# Remove empty indices
indices = [x for x in indices if x.length > 0]
# Reset lengths in case any were removed
for i, index in enumerate(indices):
try:
next_offset = indices[i+1].offset
            except IndexError:
next_offset = self.serializer.body_end_offset
index.length = next_offset - index.offset
# Set index values
for index, x in enumerate(indices):
x.index = index
return indices
# }}}
def create_periodical_index(self): # {{{
periodical_node = next(iter(self.oeb.toc))
periodical_node_offset = self.serializer.body_start_offset
periodical_node_size = (self.serializer.body_end_offset -
periodical_node_offset)
normalized_sections = []
id_offsets = self.serializer.id_offsets
periodical = PeriodicalIndexEntry(periodical_node_offset,
self.cncx[periodical_node.title],
self.cncx[periodical_node.klass], 0)
periodical.length = periodical_node_size
periodical.first_child_index = 1
periodical.image_index = self.masthead_offset
seen_sec_offsets = set()
seen_art_offsets = set()
for sec in periodical_node:
normalized_articles = []
try:
offset = id_offsets[sec.href]
label = self.cncx[sec.title]
klass = self.cncx[sec.klass]
            except KeyError:
continue
if offset in seen_sec_offsets:
continue
seen_sec_offsets.add(offset)
section = PeriodicalIndexEntry(offset, label, klass, 1)
section.parent_index = 0
for art in sec:
try:
offset = id_offsets[art.href]
label = self.cncx[art.title]
klass = self.cncx[art.klass]
                except KeyError:
continue
if offset in seen_art_offsets:
continue
seen_art_offsets.add(offset)
article = PeriodicalIndexEntry(offset, label, klass, 2)
normalized_articles.append(article)
article.author_offset = self.cncx[art.author]
article.desc_offset = self.cncx[art.description]
if getattr(art, 'toc_thumbnail', None) is not None:
try:
ii = self.serializer.images[art.toc_thumbnail] - 1
if ii > -1:
article.image_index = ii
except KeyError:
pass # Image not found in serializer
if normalized_articles:
normalized_articles.sort(key=lambda x:x.offset)
normalized_sections.append((section, normalized_articles))
normalized_sections.sort(key=lambda x:x[0].offset)
# Set lengths
for s, x in enumerate(normalized_sections):
sec, normalized_articles = x
try:
sec.length = normalized_sections[s+1][0].offset - sec.offset
            except IndexError:
sec.length = self.serializer.body_end_offset - sec.offset
for i, art in enumerate(normalized_articles):
try:
art.length = normalized_articles[i+1].offset - art.offset
                except IndexError:
art.length = sec.offset + sec.length - art.offset
# Filter
for i, x in list(enumerate(normalized_sections)):
sec, normalized_articles = x
normalized_articles = list(filter(lambda x: x.length > 0,
normalized_articles))
normalized_sections[i] = (sec, normalized_articles)
normalized_sections = list(filter(lambda x: x[0].length > 0 and x[1],
normalized_sections))
# Set indices
i = 0
for sec, articles in normalized_sections:
i += 1
sec.index = i
sec.parent_index = 0
for sec, articles in normalized_sections:
for art in articles:
i += 1
art.index = i
art.parent_index = sec.index
for sec, normalized_articles in normalized_sections:
sec.first_child_index = normalized_articles[0].index
sec.last_child_index = normalized_articles[-1].index
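        # For illustration: with two sections of two articles each, the
        # numbering is periodical=0, sections=1 and 2, articles=3,4 (under
        # section 1) and 5,6 (under section 2); all sections are numbered
        # before any article.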
# Set lengths again to close up any gaps left by filtering
for s, x in enumerate(normalized_sections):
sec, articles = x
try:
next_offset = normalized_sections[s+1][0].offset
            except IndexError:
next_offset = self.serializer.body_end_offset
sec.length = next_offset - sec.offset
for a, art in enumerate(articles):
try:
next_offset = articles[a+1].offset
                except IndexError:
next_offset = sec.next_offset
art.length = next_offset - art.offset
# Sanity check
for s, x in enumerate(normalized_sections):
sec, articles = x
try:
next_sec = normalized_sections[s+1][0]
            except IndexError:
if (sec.length == 0 or sec.next_offset !=
self.serializer.body_end_offset):
raise ValueError('Invalid section layout')
else:
if next_sec.offset != sec.next_offset or sec.length == 0:
raise ValueError('Invalid section layout')
for a, art in enumerate(articles):
try:
next_art = articles[a+1]
                except IndexError:
if (art.length == 0 or art.next_offset !=
sec.next_offset):
raise ValueError('Invalid article layout')
else:
if art.length == 0 or art.next_offset != next_art.offset:
raise ValueError('Invalid article layout')
# Flatten
indices = [periodical]
for sec, articles in normalized_sections:
indices.append(sec)
periodical.last_child_index = sec.index
for sec, articles in normalized_sections:
for a in articles:
indices.append(a)
return indices
# }}}
# TBS {{{
def calculate_trailing_byte_sequences(self):
self.tbs_map = {}
found_node = False
sections = [i for i in self.indices if i.depth == 1]
section_map = OrderedDict((i.index, i) for i in
sorted(sections, key=lambda x:x.offset))
deepest = max(i.depth for i in self.indices)
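        # Classify every index node against the record window
        # [offset, offset + RECORD_SIZE): 'completes' (starts and ends
        # inside), 'starts' (starts inside, ends later), 'ends' (started
        # earlier, ends inside), or 'spans' (a deepest-level node covering
        # the entire record).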
for i in range(self.number_of_text_records):
offset = i * RECORD_SIZE
next_offset = offset + RECORD_SIZE
data = {'ends':[], 'completes':[], 'starts':[],
'spans':None, 'offset':offset, 'record_number':i+1}
for index in self.indices:
if index.offset >= next_offset:
# Node starts after current record
if index.depth == deepest:
break
else:
continue
if index.next_offset <= offset:
# Node ends before current record
continue
if index.offset >= offset:
# Node starts in current record
if index.next_offset <= next_offset:
# Node ends in current record
data['completes'].append(index)
else:
data['starts'].append(index)
else:
                    # Node starts before the current record
if index.next_offset <= next_offset:
# Node ends in current record
data['ends'].append(index)
elif index.depth == deepest:
data['spans'] = index
if (data['ends'] or data['completes'] or data['starts'] or
data['spans'] is not None):
self.tbs_map[i+1] = TBS(data, self.is_periodical, first=not
found_node, section_map=section_map)
found_node = True
else:
self.tbs_map[i+1] = TBS({}, self.is_periodical, first=False,
after_first=found_node, section_map=section_map)
def get_trailing_byte_sequence(self, num):
return self.tbs_map[num].bytestring
# }}}
# }}}