%PDF- %PDF-
Direktori : /usr/lib/calibre/calibre/ebooks/metadata/ |
Current File : //usr/lib/calibre/calibre/ebooks/metadata/kfx.py |
#!/usr/bin/env python3
# License: GPLv3 Copyright: 2015, Kovid Goyal <kovid at kovidgoyal.net>, John Howell <jhowell@acm.org>'

# Based on work of John Howell reversing the KFX format
# https://www.mobileread.com/forums/showpost.php?p=3176029&postcount=89

import re
import struct
import sys
from collections import defaultdict

from calibre.ebooks.metadata.book.base import Metadata
from calibre.ebooks.mobi.utils import decint
from calibre.utils.cleantext import clean_xml_chars
from calibre.utils.config_base import tweaks
from calibre.utils.date import parse_only_date
from calibre.utils.localization import canonicalize_lang
from calibre.utils.imghdr import identify
from polyglot.binary import as_base64_bytes, from_base64_bytes


class InvalidKFX(ValueError):
    ''' Raised when a block does not start with the expected magic number '''


# magic numbers for data structures
CONTAINER_MAGIC = b'CONT'
ENTITY_MAGIC = b'ENTY'
ION_MAGIC = b'\xe0\x01\x00\xea'

# ION data types (comment shows equivalent python data type produced)
DT_BOOLEAN = 1          # True/False
DT_INTEGER = 2          # int
# str (using non-unicode to distinguish symbols from strings)
DT_PROPERTY = 7
DT_STRING = 8           # unicode
DT_STRUCT = 11          # tuple
DT_LIST = 12            # list
DT_OBJECT = 13          # dict of property/value pairs
DT_TYPED_DATA = 14      # type, name, value

# property names (non-unicode strings to distinguish them from ION strings in this program)
# These are place holders. The correct property names are unknown.
PROP_METADATA = b'P258'
PROP_METADATA2 = b'P490'
PROP_METADATA3 = b'P491'
PROP_METADATA_KEY = b'P492'
PROP_METADATA_VALUE = b'P307'
PROP_IMAGE = b'P417'

# Maps placeholder property names to the metadata field names used below
METADATA_PROPERTIES = {
    b'P10' : "languages",
    b'P153': "title",
    b'P154': "description",
    b'P222': "author",
    b'P232': "publisher",
}

COVER_KEY = "cover_image_base64"


def hexs(string, sep=' '):
    '''
    Return the bytes (or the code points of a str) of ``string`` formatted as
    two-digit lowercase hex values joined by ``sep``.
    '''
    if isinstance(string, bytes):
        string = bytearray(string)
    else:
        string = map(ord, string)
    return sep.join('%02x' % b for b in string)


class PackedData:

    '''
    Simplify unpacking of packed binary data structures

    Wraps a buffer together with a read offset that the unpack/extract
    helpers move forward.
    '''

    def __init__(self, data):
        self.buffer = data
        self.offset = 0

    def unpack_one(self, fmt, advance=True):
        ''' Unpack a single value described by the struct format ``fmt`` '''
        return self.unpack_multi(fmt, advance)[0]

    def unpack_multi(self, fmt, advance=True):
        '''
        Unpack the struct format ``fmt`` at the current offset and return the
        resulting tuple. The offset only moves when ``advance`` is true.
        '''
        fmt = fmt.encode('ascii')
        result = struct.unpack_from(fmt, self.buffer, self.offset)
        if advance:
            self.advance(struct.calcsize(fmt))
        return result

    def extract(self, size):
        '''
        Return the next ``size`` bytes and move the offset past them. The
        result is shorter than ``size`` if the buffer runs out (slicing does
        not raise), but the offset still moves by ``size``.
        '''
        data = self.buffer[self.offset:self.offset + size]
        self.advance(size)
        return data

    def advance(self, size):
        ''' Move the read offset forward by ``size`` bytes '''
        self.offset += size

    def remaining(self):
        ''' Number of bytes between the current offset and the end of the buffer '''
        return len(self.buffer) - self.offset


class PackedBlock(PackedData):

    '''
    Common header structure of container and entity blocks

    The header is read as: 4 byte magic, little-endian 16 bit version,
    little-endian 32 bit header length.

    Raises InvalidKFX when the magic does not equal the expected ``magic``.
    '''

    def __init__(self, data, magic):
        PackedData.__init__(self, data)

        self.magic = self.unpack_one('4s')
        if self.magic != magic:
            raise InvalidKFX('%s magic number is incorrect (%s)' % (magic, hexs(self.magic)))

        self.version = self.unpack_one('<H')
        self.header_len = self.unpack_one('<L')


class Container(PackedBlock):

    ''' Container file containing data entities '''

    def __init__(self, data):
        self.data = data
        PackedBlock.__init__(self, data, CONTAINER_MAGIC)

        # Unknown data
        self.advance(8)

        # Entity table: fixed size records are read until the ION magic is
        # seen at the current offset. Entity offsets are relative to the end
        # of the container header.
        self.entities = []
        while self.unpack_one('4s', advance=False) != ION_MAGIC:
            entity_id, entity_type, entity_offset, entity_len = self.unpack_multi('<LLQQ')
            entity_start = self.header_len + entity_offset
            self.entities.append(
                Entity(self.data[entity_start:entity_start + entity_len], entity_type, entity_id))

    def decode(self):
        ''' Return the list of decoded (type, id, value) tuples, one per entity '''
        return [entity.decode() for entity in self.entities]


class Entity(PackedBlock):

    ''' Data entity inside a container '''

    def __init__(self, data, entity_type, entity_id):
        PackedBlock.__init__(self, data, ENTITY_MAGIC)
        self.entity_type = entity_type
        self.entity_id = entity_id
        # payload is whatever follows the entity header
        self.entity_data = data[self.header_len:]

    def decode(self):
        '''
        Return ``(type_name, id_name, value)``. Payloads starting with the ION
        magic are decoded as packed ION; anything else is returned base64
        encoded.
        '''
        if PackedData(self.entity_data).unpack_one('4s') == ION_MAGIC:
            entity_value = PackedIon(self.entity_data).decode()
        else:
            entity_value = as_base64_bytes(self.entity_data)

        return (property_name(self.entity_type), property_name(self.entity_id), entity_value)


class PackedIon(PackedData):

    ''' Packed structured binary data format used by KFX '''

    def __init__(self, data):
        PackedData.__init__(self, data)

    def decode(self):
        ''' Check the leading ION magic and decode the value that follows it '''
        if self.unpack_one('4s') != ION_MAGIC:
            raise Exception('ION marker missing at start of data')

        return self.unpack_typed_value()

    def unpack_typed_value(self):
        '''
        Decode one value at the current offset.

        The first byte holds the data type in its high nibble and the length
        in its low nibble; a length nibble of 14 means the real length follows
        as a variable length number. Unknown types are skipped and yield None.
        '''
        cmd = self.unpack_one('B')

        data_type = cmd >> 4
        data_len = cmd & 0x0f
        if data_len == 14:
            data_len = self.unpack_number()

        # print('cmd=%02x, len=%s: %s' % (cmd, data_len, hexs(self.buffer[self.offset:][:data_len])))

        if data_type == DT_BOOLEAN:
            return data_len != 0  # length is actually value

        if data_type == DT_INTEGER:
            return self.unpack_unsigned_int(data_len)

        if data_type == DT_PROPERTY:
            return property_name(self.unpack_unsigned_int(data_len))

        if data_type == DT_STRING:
            return self.extract(data_len).decode('utf8')

        if data_type == DT_STRUCT or data_type == DT_LIST:
            ion = PackedIon(self.extract(data_len))
            result = []

            while ion.remaining():
                result.append(ion.unpack_typed_value())

            if data_type == DT_STRUCT:
                result = tuple(result)

            return result

        if data_type == DT_OBJECT:
            ion = PackedIon(self.extract(data_len))
            result = {}

            while (ion.remaining()):
                symbol = property_name(ion.unpack_number())
                result[symbol] = ion.unpack_typed_value()

            return result

        if data_type == DT_TYPED_DATA:
            ion = PackedIon(self.extract(data_len))
            # two leading numbers are read and discarded; only the value is kept
            ion.unpack_number()
            ion.unpack_number()
            return ion.unpack_typed_value()

        # ignore unknown types
        self.advance(data_len)
        return None

    def unpack_number(self):
        ''' Read one variable length number and move past the bytes it used '''
        # variable length numbers, MSB first, 7 bits per byte, last byte is
        # flagged by MSB set
        raw = self.buffer[self.offset:self.offset+10]
        number, consumed = decint(raw)
        self.advance(consumed)
        return number

    def unpack_unsigned_int(self, length):
        '''
        Read ``length`` bytes as an unsigned big-endian (MSB first) integer.

        Works for any length, including values wider than 8 bytes. Raises
        struct.error when fewer than ``length`` bytes are left in the buffer.
        '''
        raw = self.extract(length)
        if len(raw) != length:
            raise struct.error(
                'unsigned int requires %d bytes, only %d available' % (length, len(raw)))
        return int.from_bytes(raw, 'big')


def property_name(property_number):
    ''' Return the placeholder name ``b"P<number>"`` for a property number '''
    # This should be changed to translate property numbers to the proper
    # strings using a symbol table
    return b"P%d" % property_number


def extract_metadata(container_data):
    '''
    Collect book metadata from decoded container entities.

    ``container_data`` is an iterable of ``(entity_type, entity_id, value)``
    tuples. Returns a defaultdict(list) of metadata values; the entry under
    COVER_KEY is the raw value of the first image entity, not a list.
    '''
    metadata = defaultdict(list)

    # locate book metadata within the container data structures
    metadata_entity = {}
    for entity_type, entity_id, entity_value in container_data:
        if entity_type == PROP_METADATA:
            metadata_entity = entity_value
        elif entity_type == PROP_METADATA2:
            if entity_value is not None:
                for value1 in entity_value[PROP_METADATA3]:
                    for meta in value1[PROP_METADATA]:
                        metadata[meta[PROP_METADATA_KEY]].append(meta[PROP_METADATA_VALUE])
        elif entity_type == PROP_IMAGE and COVER_KEY not in metadata:
            # assume first image is the cover
            metadata[COVER_KEY] = entity_value

    # values from the PROP_METADATA entity only fill fields not already set
    for key, value in metadata_entity.items():
        if key in METADATA_PROPERTIES and METADATA_PROPERTIES[key] not in metadata:
            metadata[METADATA_PROPERTIES[key]].append(value)

    return metadata


def dump_metadata(m):
    ''' Debugging aid: pretty print the metadata, showing the cover only as a bool '''
    d = dict(m)
    d[COVER_KEY] = bool(d.get(COVER_KEY))
    from pprint import pprint
    pprint(d)


def read_book_key_kfx(stream, read_cover=True):
    '''
    Read a KFX container from ``stream`` and return the tuple
    ``(content_id or ASIN, cde_content_type)``, using '' for missing values.
    ``read_cover`` is accepted but not used.
    '''
    c = Container(stream.read())
    m = extract_metadata(c.decode())

    def val(x):
        return m[x][0] if x in m else ''

    return (val('content_id') or val('ASIN')), val('cde_content_type')


def read_metadata_kfx(stream, read_cover=True):
    ' Read the metadata.kfx file that is found in the sdr book folder for KFX files '
    c = Container(stream.read())
    m = extract_metadata(c.decode())
    # dump_metadata(m)

    def has(x):
        return m[x] and m[x][0]

    def get(x, single=True):
        ans = m[x]
        if single:
            ans = clean_xml_chars(ans[0]) if ans else ''
        else:
            ans = [clean_xml_chars(y) for y in ans]
        return ans

    title = get('title') or _('Unknown')
    authors = get('author', False) or [_('Unknown')]
    auth_pat = re.compile(r'([^,]+?)\s*,\s+([^,]+)$')

    def fix_author(x):
        # turn "Last, First" into "First Last" unless the tweak says to copy
        if tweaks['author_sort_copy_method'] != 'copy':
            m = auth_pat.match(x.strip())
            if m is not None:
                return m.group(2) + ' ' + m.group(1)
        return x

    unique_authors = []  # remove duplicates while retaining order
    for f in [fix_author(x) for x in authors]:
        if f not in unique_authors:
            unique_authors.append(f)

    mi = Metadata(title, unique_authors)

    if has('author'):
        mi.author_sort = get('author')
    if has('ASIN'):
        mi.set_identifier('mobi-asin', get('ASIN'))
    elif has('content_id'):
        mi.set_identifier('mobi-asin', get('content_id'))
    if has('languages'):
        langs = list(filter(None, (canonicalize_lang(x) for x in get('languages', False))))
        if langs:
            mi.languages = langs
    if has('issue_date'):
        try:
            mi.pubdate = parse_only_date(get('issue_date'))
        except Exception:
            # best effort: an unparseable date just leaves pubdate unset
            pass
    if has('publisher') and get('publisher') != 'Unknown':
        mi.publisher = get('publisher')
    if read_cover and m[COVER_KEY]:
        try:
            data = from_base64_bytes(m[COVER_KEY])
            fmt, w, h = identify(data)
        except Exception:
            w, h, fmt = 0, 0, None
        if fmt and w > -1 and h > -1:
            mi.cover_data = (fmt, data)

    return mi


if __name__ == '__main__':
    from calibre import prints
    with open(sys.argv[-1], 'rb') as f:
        mi = read_metadata_kfx(f)
        prints(str(mi))