#!/usr/bin/env python3
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
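# Debug helpers (calibre.ebooks.mobi.debug.headers) that parse and
# pretty-print the PalmDB, MOBI and EXTH headers of a MOBI file, along with
# its individual records.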
import struct, datetime, os, numbers, binascii
from calibre.utils.date import utc_tz
from calibre.ebooks.mobi.reader.headers import NULL_INDEX
from calibre.ebooks.mobi.langcodes import main_language, sub_language
from calibre.ebooks.mobi.debug import format_bytes
from calibre.ebooks.mobi.utils import get_trailing_data
from polyglot.builtins import iteritems
# PalmDB {{{
class PalmDOCAttributes:
class Attr:
def __init__(self, name, field, val):
self.name = name
self.val = val & field
def __str__(self):
return '%s: %s'%(self.name, bool(self.val))
__unicode__ = __str__
def __init__(self, raw):
self.val = struct.unpack(b'<H', raw)[0]
self.attributes = []
        # Palm OS database header attribute bits
        for name, field in [('Read Only', 0x02), ('Dirty AppInfoArea', 0x04),
                ('Backup this database', 0x08),
                ('Okay to install newer over existing copy, if present on PalmPilot', 0x10),
                ('Force the PalmPilot to reset after this database is installed', 0x20),
                ('Don\'t allow copy of file to be beamed to other Pilot',
                    0x40)]:
self.attributes.append(PalmDOCAttributes.Attr(name, field,
self.val))
def __str__(self):
attrs = '\n\t'.join([str(x) for x in self.attributes])
return 'PalmDOC Attributes: %s\n\t%s'%(bin(self.val), attrs)
__unicode__ = __str__
class PalmDB:
def __init__(self, raw):
self.raw = raw
if self.raw.startswith(b'TPZ'):
raise ValueError('This is a Topaz file')
self.name = self.raw[:32].replace(b'\x00', b'')
self.attributes = PalmDOCAttributes(self.raw[32:34])
self.version = struct.unpack(b'>H', self.raw[34:36])[0]
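        # PalmDB timestamps are stored as seconds since the Palm epoch,
        # midnight on 1 January 1904 UTC.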
palm_epoch = datetime.datetime(1904, 1, 1, tzinfo=utc_tz)
self.creation_date_raw = struct.unpack(b'>I', self.raw[36:40])[0]
self.creation_date = (palm_epoch +
datetime.timedelta(seconds=self.creation_date_raw))
self.modification_date_raw = struct.unpack(b'>I', self.raw[40:44])[0]
self.modification_date = (palm_epoch +
datetime.timedelta(seconds=self.modification_date_raw))
self.last_backup_date_raw = struct.unpack(b'>I', self.raw[44:48])[0]
self.last_backup_date = (palm_epoch +
datetime.timedelta(seconds=self.last_backup_date_raw))
self.modification_number = struct.unpack(b'>I', self.raw[48:52])[0]
self.app_info_id = self.raw[52:56]
self.sort_info_id = self.raw[56:60]
self.type = self.raw[60:64]
self.creator = self.raw[64:68]
self.ident = self.type + self.creator
if self.ident not in (b'BOOKMOBI', b'TEXTREAD'):
raise ValueError('Unknown book ident: %r'%self.ident)
self.last_record_uid, = struct.unpack(b'>I', self.raw[68:72])
self.next_rec_list_id = self.raw[72:76]
self.number_of_records, = struct.unpack(b'>H', self.raw[76:78])
def __str__(self):
ans = ['*'*20 + ' PalmDB Header '+ '*'*20]
ans.append('Name: %r'%self.name)
ans.append(str(self.attributes))
ans.append('Version: %s'%self.version)
ans.append('Creation date: %s (%s)'%(self.creation_date.isoformat(),
self.creation_date_raw))
ans.append('Modification date: %s (%s)'%(self.modification_date.isoformat(),
self.modification_date_raw))
ans.append('Backup date: %s (%s)'%(self.last_backup_date.isoformat(),
self.last_backup_date_raw))
ans.append('Modification number: %s'%self.modification_number)
ans.append('App Info ID: %r'%self.app_info_id)
ans.append('Sort Info ID: %r'%self.sort_info_id)
ans.append('Type: %r'%self.type)
ans.append('Creator: %r'%self.creator)
ans.append('Last record UID +1: %r'%self.last_record_uid)
ans.append('Next record list id: %r'%self.next_rec_list_id)
ans.append('Number of records: %s'%self.number_of_records)
return '\n'.join(ans)
__unicode__ = __str__
# }}}
class Record: # {{{
def __init__(self, raw, header):
self.offset, self.flags, self.uid = header
self.raw = raw
@property
def header(self):
return 'Offset: %d Flags: %d UID: %d First 4 bytes: %r Size: %d'%(self.offset, self.flags,
self.uid, self.raw[:4], len(self.raw))
# }}}
# EXTH {{{
class EXTHRecord:
def __init__(self, type_, data, length):
self.type = type_
self.data = data
self.length = length
self.name = {
1 : 'Drm Server Id',
2 : 'Drm Commerce Id',
3 : 'Drm Ebookbase Book Id',
100 : 'Creator',
101 : 'Publisher',
102 : 'Imprint',
103 : 'Description',
104 : 'ISBN',
105 : 'Subject',
106 : 'Published',
107 : 'Review',
108 : 'Contributor',
109 : 'Rights',
110 : 'SubjectCode',
111 : 'Type',
112 : 'Source',
113 : 'ASIN',
114 : 'versionNumber',
115 : 'sample',
116 : 'StartOffset',
117 : 'Adult',
118 : 'Price',
119 : 'Currency',
121 : 'KF8_Boundary_Section',
122 : 'fixed-layout',
123 : 'book-type',
124 : 'orientation-lock',
125 : 'KF8_Count_of_Resources_Fonts_Images',
126 : 'original-resolution',
127 : 'zero-gutter',
128 : 'zero-margin',
129 : 'KF8_Masthead/Cover_Image',
131 : 'KF8_Unidentified_Count',
132 : 'RegionMagnification',
200 : 'DictShortName',
201 : 'CoverOffset',
202 : 'ThumbOffset',
203 : 'Fake Cover',
204 : 'Creator Software',
205 : 'Creator Major Version', # '>I'
206 : 'Creator Minor Version', # '>I'
207 : 'Creator Build Number', # '>I'
208 : 'Watermark',
209 : 'Tamper Proof Keys [hex]',
300 : 'Font Signature [hex]',
301 : 'Clipping Limit [3xx]', # percentage '>B'
401 : 'Clipping Limit', # percentage '>B'
402 : 'Publisher Limit',
404 : 'Text to Speech Disabled', # '>B' 1 - TTS disabled 0 - TTS enabled
501 : 'CDE Type', # 4 chars (PDOC, EBOK, MAGZ, ...)
502 : 'last_update_time',
503 : 'Updated Title',
504 : 'ASIN [5xx]',
508 : 'Unknown Title Furigana?',
517 : 'Unknown Creator Furigana?',
522 : 'Unknown Publisher Furigana?',
524 : 'Language',
525 : 'primary-writing-mode',
527 : 'page-progression-direction',
528 : 'Override Kindle fonts',
534 : 'Input Source Type',
535 : 'Kindlegen Build-Rev Number',
536 : 'Container Info', # CONT_Header is 0, Ends with CONTAINER_BOUNDARY (or Asset_Type?)
538 : 'Container Resolution',
539 : 'Container Mimetype',
543 : 'Container id', # FONT_CONTAINER, BW_CONTAINER, HD_CONTAINER
}.get(self.type, repr(self.type))
        if (self.name in {'sample', 'StartOffset', 'CoverOffset', 'ThumbOffset', 'Fake Cover',
                'Creator Software', 'Creator Major Version', 'Creator Minor Version',
                'Creator Build Number', 'Clipping Limit [3xx]', 'Clipping Limit',
                'Publisher Limit', 'Text to Speech Disabled'} or
                self.type in {121, 125, 131}):
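            # The stored EXTH record length includes the 8-byte type/length
            # header, so a length of 9 leaves one data byte, 10 leaves two and
            # 12 leaves four.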
if self.length == 9:
self.data, = struct.unpack(b'>B', self.data)
elif self.length == 10:
self.data, = struct.unpack(b'>H', self.data)
else:
self.data, = struct.unpack(b'>L', self.data)
elif self.type in {209, 300}:
self.data = binascii.hexlify(self.data)
def __str__(self):
return '%s (%d): %r'%(self.name, self.type, self.data)
class EXTHHeader:
def __init__(self, raw):
self.raw = raw
if not self.raw.startswith(b'EXTH'):
raise ValueError('EXTH header does not start with EXTH')
self.length, = struct.unpack(b'>L', self.raw[4:8])
self.count, = struct.unpack(b'>L', self.raw[8:12])
pos = 12
self.records = []
for i in range(self.count):
pos = self.read_record(pos)
self.records.sort(key=lambda x:x.type)
self.rmap = {x.type:x for x in self.records}
def __getitem__(self, type_):
return self.rmap.__getitem__(type_).data
def get(self, type_, default=None):
ans = self.rmap.get(type_, default)
return getattr(ans, 'data', default)
def read_record(self, pos):
type_, length = struct.unpack(b'>LL', self.raw[pos:pos+8])
data = self.raw[(pos+8):(pos+length)]
self.records.append(EXTHRecord(type_, data, length))
return pos + length
@property
def kf8_header_index(self):
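        # EXTH record 121 (KF8_Boundary_Section) holds the index of the KF8
        # MOBI header record in a joint MOBI 6/KF8 file; NULL_INDEX means the
        # file has no KF8 part.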
ans = self.get(121, None)
if ans == NULL_INDEX:
ans = None
return ans
def __str__(self):
ans = ['*'*20 + ' EXTH Header '+ '*'*20]
ans.append('EXTH header length: %d'%self.length)
ans.append('Number of EXTH records: %d'%self.count)
ans.append('EXTH records...')
for r in self.records:
ans.append(str(r))
return '\n'.join(ans)
__unicode__ = __str__
# }}}
class MOBIHeader: # {{{
def __init__(self, record0, offset):
self.raw = record0.raw
self.header_offset = offset
self.compression_raw = self.raw[:2]
self.compression = {1: 'No compression', 2: 'PalmDoc compression',
17480: 'HUFF/CDIC compression'}.get(struct.unpack(b'>H',
self.compression_raw)[0],
repr(self.compression_raw))
self.unused = self.raw[2:4]
self.text_length, = struct.unpack(b'>I', self.raw[4:8])
self.number_of_text_records, self.text_record_size = \
struct.unpack(b'>HH', self.raw[8:12])
self.encryption_type_raw, = struct.unpack(b'>H', self.raw[12:14])
self.encryption_type = {
0: 'No encryption',
1: 'Old mobipocket encryption',
2: 'Mobipocket encryption'
}.get(self.encryption_type_raw, repr(self.encryption_type_raw))
self.unknown = self.raw[14:16]
self.identifier = self.raw[16:20]
if self.identifier != b'MOBI':
raise ValueError('Identifier %r unknown'%self.identifier)
self.length, = struct.unpack(b'>I', self.raw[20:24])
self.type_raw, = struct.unpack(b'>I', self.raw[24:28])
self.type = {
2 : 'Mobipocket book',
3 : 'PalmDOC book',
4 : 'Audio',
257 : 'News',
258 : 'News Feed',
259 : 'News magazine',
513 : 'PICS',
514 : 'Word',
515 : 'XLS',
516 : 'PPT',
517 : 'TEXT',
518 : 'HTML',
}.get(self.type_raw, repr(self.type_raw))
self.encoding_raw, = struct.unpack(b'>I', self.raw[28:32])
self.encoding = {
1252 : 'cp1252',
65001: 'utf-8',
}.get(self.encoding_raw, repr(self.encoding_raw))
self.uid = self.raw[32:36]
self.file_version, = struct.unpack(b'>I', self.raw[36:40])
self.meta_orth_indx, self.meta_infl_indx = struct.unpack(
b'>II', self.raw[40:48])
self.secondary_index_record, = struct.unpack(b'>I', self.raw[48:52])
self.reserved = self.raw[52:80]
self.first_non_book_record, = struct.unpack(b'>I', self.raw[80:84])
self.fullname_offset, = struct.unpack(b'>I', self.raw[84:88])
self.fullname_length, = struct.unpack(b'>I', self.raw[88:92])
self.locale_raw, = struct.unpack(b'>I', self.raw[92:96])
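        # The locale field packs the main language id into the low byte and
        # the sub-language (region) id into bits 10-17.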
langcode = self.locale_raw
langid = langcode & 0xFF
sublangid = (langcode >> 10) & 0xFF
self.language = main_language.get(langid, 'ENGLISH')
self.sublanguage = sub_language.get(sublangid, 'NEUTRAL')
self.input_language = self.raw[96:100]
        self.output_language = self.raw[100:104]
self.min_version, = struct.unpack(b'>I', self.raw[104:108])
self.first_image_index, = struct.unpack(b'>I', self.raw[108:112])
self.huffman_record_offset, = struct.unpack(b'>I', self.raw[112:116])
self.huffman_record_count, = struct.unpack(b'>I', self.raw[116:120])
self.datp_record_offset, = struct.unpack(b'>I', self.raw[120:124])
self.datp_record_count, = struct.unpack(b'>I', self.raw[124:128])
self.exth_flags, = struct.unpack(b'>I', self.raw[128:132])
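        # Bit 0x40 of the EXTH flags signals that an EXTH block follows this
        # header.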
self.has_exth = bool(self.exth_flags & 0x40)
self.has_drm_data = self.length >= 174 and len(self.raw) >= 184
if self.has_drm_data:
self.unknown3 = self.raw[132:168]
self.drm_offset, self.drm_count, self.drm_size, self.drm_flags = \
struct.unpack(b'>4I', self.raw[168:184])
self.has_extra_data_flags = self.length >= 232 and len(self.raw) >= 232+16
self.has_fcis_flis = False
self.has_multibytes = self.has_indexing_bytes = self.has_uncrossable_breaks = False
self.extra_data_flags = 0
if self.has_extra_data_flags:
self.unknown4 = self.raw[184:192]
if self.file_version < 8:
self.first_text_record, self.last_text_record = \
struct.unpack_from(b'>HH', self.raw, 192)
                self.fdst_count, = struct.unpack_from(b'>L', self.raw, 196)
else:
self.fdst_idx, self.fdst_count = struct.unpack_from(b'>LL',
self.raw, 192)
if self.fdst_count <= 1:
self.fdst_idx = NULL_INDEX
(self.fcis_number, self.fcis_count, self.flis_number,
self.flis_count) = struct.unpack(b'>IIII',
self.raw[200:216])
self.unknown6 = self.raw[216:224]
self.srcs_record_index = struct.unpack(b'>I',
self.raw[224:228])[0]
self.num_srcs_records = struct.unpack(b'>I',
self.raw[228:232])[0]
self.unknown7 = self.raw[232:240]
self.extra_data_flags = struct.unpack(b'>I',
self.raw[240:244])[0]
self.has_multibytes = bool(self.extra_data_flags & 0b1)
self.has_indexing_bytes = bool(self.extra_data_flags & 0b10)
self.has_uncrossable_breaks = bool(self.extra_data_flags & 0b100)
self.primary_index_record, = struct.unpack(b'>I',
self.raw[244:248])
if self.length >= 248:
(self.sect_idx, self.skel_idx, self.datp_idx, self.oth_idx
) = struct.unpack_from(b'>4L', self.raw, 248)
self.unknown9 = self.raw[264:self.length+16]
if self.meta_orth_indx not in {NULL_INDEX, self.sect_idx}:
raise ValueError('KF8 header has different Meta orth and '
'section indices')
# The following are all relative to the position of the header record
# make them absolute for ease of debugging
self.relative_records = {'sect_idx', 'skel_idx', 'datp_idx', 'oth_idx',
'meta_orth_indx', 'huffman_record_offset',
'first_non_book_record', 'datp_record_offset', 'fcis_number',
'flis_number', 'primary_index_record', 'fdst_idx',
'first_image_index'}
for x in self.relative_records:
if hasattr(self, x) and getattr(self, x) != NULL_INDEX:
setattr(self, x, self.header_offset+getattr(self, x))
# Try to find the first non-text record
self.first_resource_record = offset + 1 + self.number_of_text_records # Default to first record after all text records
pointer = min(getattr(self, 'first_non_book_record', NULL_INDEX), getattr(self, 'first_image_index', NULL_INDEX))
if pointer != NULL_INDEX:
self.first_resource_record = max(pointer, self.first_resource_record)
self.last_resource_record = NULL_INDEX
if self.has_exth:
self.exth_offset = 16 + self.length
self.exth = EXTHHeader(self.raw[self.exth_offset:])
self.end_of_exth = self.exth_offset + self.exth.length
self.bytes_after_exth = self.raw[self.end_of_exth:self.fullname_offset]
if self.exth.kf8_header_index is not None and offset == 0:
# MOBI 6 header in a joint file, adjust self.last_resource_record
self.last_resource_record = self.exth.kf8_header_index - 2
def __str__(self):
ans = ['*'*20 + ' MOBI %d Header '%self.file_version+ '*'*20]
a = ans.append
def i(d, x):
x = 'NULL' if x == NULL_INDEX else x
a('%s: %s'%(d, x))
def r(d, attr):
x = getattr(self, attr)
if attr in self.relative_records and x != NULL_INDEX:
a('%s: Absolute: %d Relative: %d'%(d, x, x-self.header_offset))
else:
i(d, x)
a('Compression: %s'%self.compression)
a('Unused: %r'%self.unused)
a('Text length: %d'%self.text_length)
a('Number of text records: %d'%self.number_of_text_records)
a('Text record size: %d'%self.text_record_size)
a('Encryption: %s'%self.encryption_type)
a('Unknown: %r'%self.unknown)
a('Identifier: %r'%self.identifier)
a('Header length: %d'% self.length)
a('Type: %s'%self.type)
a('Encoding: %s'%self.encoding)
a('UID: %r'%self.uid)
a('File version: %d'%self.file_version)
r('Meta Orth Index', 'meta_orth_indx')
r('Meta Infl Index', 'meta_infl_indx')
r('Secondary index record', 'secondary_index_record')
a('Reserved: %r'%self.reserved)
r('First non-book record', 'first_non_book_record')
a('Full name offset: %d'%self.fullname_offset)
a('Full name length: %d bytes'%self.fullname_length)
a('Langcode: %r'%self.locale_raw)
a('Language: %s'%self.language)
a('Sub language: %s'%self.sublanguage)
a('Input language: %r'%self.input_language)
        a('Output language: %r'%self.output_language)
a('Min version: %d'%self.min_version)
r('First Image index', 'first_image_index')
r('Huffman record offset', 'huffman_record_offset')
a('Huffman record count: %d'%self.huffman_record_count)
r('Huffman table offset', 'datp_record_offset')
a('Huffman table length: %r'%self.datp_record_count)
a('EXTH flags: %s (%s)'%(bin(self.exth_flags)[2:], self.has_exth))
if self.has_drm_data:
a('Unknown3: %r'%self.unknown3)
r('DRM Offset', 'drm_offset')
a('DRM Count: %s'%self.drm_count)
a('DRM Size: %s'%self.drm_size)
a('DRM Flags: %r'%self.drm_flags)
if self.has_extra_data_flags:
a('Unknown4: %r'%self.unknown4)
if hasattr(self, 'first_text_record'):
a('First content record: %d'%self.first_text_record)
a('Last content record: %d'%self.last_text_record)
else:
r('FDST Index', 'fdst_idx')
a('FDST Count: %d'% self.fdst_count)
r('FCIS number', 'fcis_number')
a('FCIS count: %d'% self.fcis_count)
r('FLIS number', 'flis_number')
a('FLIS count: %d'% self.flis_count)
a('Unknown6: %r'% self.unknown6)
r('SRCS record index', 'srcs_record_index')
a('Number of SRCS records?: %d'%self.num_srcs_records)
a('Unknown7: %r'%self.unknown7)
a(('Extra data flags: %s (has multibyte: %s) '
'(has indexing: %s) (has uncrossable breaks: %s)')%(
bin(self.extra_data_flags), self.has_multibytes,
self.has_indexing_bytes, self.has_uncrossable_breaks))
r('NCX index', 'primary_index_record')
if self.length >= 248:
r('Sections Index', 'sect_idx')
r('SKEL Index', 'skel_idx')
r('DATP Index', 'datp_idx')
r('Other Index', 'oth_idx')
if self.unknown9:
a('Unknown9: %r'%self.unknown9)
ans = '\n'.join(ans)
if self.has_exth:
ans += '\n\n' + str(self.exth)
ans += '\n\nBytes after EXTH (%d bytes): %s'%(
len(self.bytes_after_exth),
format_bytes(self.bytes_after_exth))
ans += '\nNumber of bytes after full name: %d' % (len(self.raw) - (self.fullname_offset +
self.fullname_length))
ans += '\nRecord 0 length: %d'%len(self.raw)
return ans
# }}}
class MOBIFile:
def __init__(self, stream):
self.raw = stream.read()
self.palmdb = PalmDB(self.raw[:78])
self.record_headers = []
self.records = []
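        # Each PalmDB record-info entry is 8 bytes: a 4-byte big-endian offset
        # into the file, one attributes/flags byte and a 3-byte unique id.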
for i in range(self.palmdb.number_of_records):
pos = 78 + i * 8
offset, a1, a2, a3, a4 = struct.unpack(b'>LBBBB', self.raw[pos:pos+8])
flags, val = a1, a2 << 16 | a3 << 8 | a4
self.record_headers.append((offset, flags, val))
def section(section_number):
if section_number == self.palmdb.number_of_records - 1:
end_off = len(self.raw)
else:
end_off = self.record_headers[section_number + 1][0]
off = self.record_headers[section_number][0]
return self.raw[off:end_off]
for i in range(self.palmdb.number_of_records):
self.records.append(Record(section(i), self.record_headers[i]))
self.mobi_header = MOBIHeader(self.records[0], 0)
self.huffman_record_nums = []
self.kf8_type = None
mh = mh8 = self.mobi_header
if mh.file_version >= 8:
self.kf8_type = 'standalone'
elif mh.has_exth and mh.exth.kf8_header_index is not None:
kf8i = mh.exth.kf8_header_index
try:
rec = self.records[kf8i-1]
except IndexError:
pass
else:
if rec.raw == b'BOUNDARY':
self.kf8_type = 'joint'
mh8 = MOBIHeader(self.records[kf8i], kf8i)
self.mobi8_header = mh8
if 'huff' in self.mobi_header.compression.lower():
from calibre.ebooks.mobi.huffcdic import HuffReader
def huffit(off, cnt):
huffman_record_nums = list(range(off, off+cnt))
huffrecs = [self.records[r].raw for r in huffman_record_nums]
huffs = HuffReader(huffrecs)
return huffman_record_nums, huffs.unpack
if self.kf8_type == 'joint':
recs6, d6 = huffit(mh.huffman_record_offset,
mh.huffman_record_count)
recs8, d8 = huffit(mh8.huffman_record_offset,
mh8.huffman_record_count)
self.huffman_record_nums = recs6 + recs8
else:
self.huffman_record_nums, d6 = huffit(mh.huffman_record_offset,
mh.huffman_record_count)
d8 = d6
elif 'palmdoc' in self.mobi_header.compression.lower():
from calibre.ebooks.compression.palmdoc import decompress_doc
d8 = d6 = decompress_doc
else:
d8 = d6 = lambda x: x
self.decompress6, self.decompress8 = d6, d8
class TextRecord: # {{{
def __init__(self, idx, record, extra_data_flags, decompress):
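        # Split off any trailing data entries (selected by extra_data_flags)
        # from the end of the record before decompressing the text proper.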
self.trailing_data, self.raw = get_trailing_data(record.raw, extra_data_flags)
raw_trailing_bytes = record.raw[len(self.raw):]
self.raw = decompress(self.raw)
if 0 in self.trailing_data:
self.trailing_data['multibyte_overlap'] = self.trailing_data.pop(0)
if 1 in self.trailing_data:
self.trailing_data['indexing'] = self.trailing_data.pop(1)
if 2 in self.trailing_data:
self.trailing_data['uncrossable_breaks'] = self.trailing_data.pop(2)
self.trailing_data['raw_bytes'] = raw_trailing_bytes
for typ, val in iteritems(self.trailing_data):
if isinstance(typ, numbers.Integral):
print('Record %d has unknown trailing data of type: %d : %r'%
(idx, typ, val))
self.idx = idx
def dump(self, folder):
name = '%06d'%self.idx
with open(os.path.join(folder, name+'.txt'), 'wb') as f:
f.write(self.raw)
with open(os.path.join(folder, name+'.trailing_data'), 'wb') as f:
for k, v in iteritems(self.trailing_data):
raw = '%s : %r\n\n'%(k, v)
f.write(raw.encode('utf-8'))
def __len__(self):
return len(self.raw)
# }}}
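

# A minimal usage sketch (not part of the original module): open a book, parse
# its headers with MOBIFile and pretty-print them. The file path is
# hypothetical; calibre's own debug tooling in calibre.ebooks.mobi.debug
# normally drives these classes.
if __name__ == '__main__':
    with open('/tmp/book.mobi', 'rb') as stream:  # hypothetical input file
        mf = MOBIFile(stream)
    print(mf.palmdb)
    print(mf.mobi_header)
    print('Number of records: %d' % len(mf.records))
    # Dump the first text record, assuming record 1 is a text record (true for
    # typical MOBI files, where text records immediately follow record 0).
    tr = TextRecord(1, mf.records[1], mf.mobi_header.extra_data_flags, mf.decompress6)
    print('First text record is %d bytes uncompressed' % len(tr))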