#!/usr/bin/env python3
# License: GPLv3 Copyright: 2009, Kovid Goyal <kovid at kovidgoyal.net>

'''
Retrieve and modify in-place Mobipocket book metadata.
'''

import io
import numbers
import os
from struct import pack, unpack

from calibre.ebooks import normalize
from calibre.ebooks.mobi import MAX_THUMB_DIMEN, MobiError
from calibre.ebooks.mobi.langcodes import iana2mobi
from calibre.ebooks.mobi.utils import rescale_image
from calibre.utils.date import now as nowf
from calibre.utils.imghdr import what
from calibre.utils.localization import canonicalize_lang, lang_as_iso639_1
from polyglot.builtins import codepoint_to_chr


def is_image(ss):
    if ss is None:
        return False
    return what(None, ss[:200]) is not None


class StreamSlicer:

    def __init__(self, stream, start=0, stop=None):
        self._stream = stream
        self.start = start
        if stop is None:
            stream.seek(0, 2)
            stop = stream.tell()
        self.stop = stop
        self._len = stop - start

    def __len__(self):
        return self._len

    def __getitem__(self, key):
        stream = self._stream
        base = self.start
        if isinstance(key, numbers.Integral):
            stream.seek(base + key)
            return stream.read(1)
        if isinstance(key, slice):
            start, stop, stride = key.indices(self._len)
            if stride < 0:
                start, stop = stop, start
            size = stop - start
            if size <= 0:
                return b""
            stream.seek(base + start)
            data = stream.read(size)
            if stride != 1:
                data = data[::stride]
            return data
        raise TypeError("stream indices must be integers")

    def __setitem__(self, key, value):
        stream = self._stream
        base = self.start
        if isinstance(key, numbers.Integral):
            if len(value) != 1:
                raise ValueError("key and value lengths must match")
            stream.seek(base + key)
            return stream.write(value)
        if isinstance(key, slice):
            start, stop, stride = key.indices(self._len)
            if stride < 0:
                start, stop = stop, start
            size = stop - start
            if stride != 1:
                value = value[::stride]
            if len(value) != size:
                raise ValueError("key and value lengths must match")
            stream.seek(base + start)
            return stream.write(value)
        raise TypeError("stream indices must be integers")

    def update(self, data_blocks):
        # Rewrite the stream
        stream = self._stream
        base = self.start
        stream.seek(base)
        stream.truncate()
        for block in data_blocks:
            stream.write(block)

    def truncate(self, value):
        self._stream.truncate(value)
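

# A minimal usage sketch of StreamSlicer (illustrative only; the file path is
# a hypothetical example, and the snippet is not executed on import):
#
#     with open('book.mobi', 'r+b') as f:
#         ident = StreamSlicer(f, 60, 68)   # bytes 60-68 of the PDB header
#         print(ident[:])                   # b'BOOKMOBI' for a MOBI file
#         ident[0:4] = b'BOOK'              # slice writes go through to the stream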


class MetadataUpdater:

    DRM_KEY_SIZE = 48

    def __init__(self, stream):
        self.stream = stream
        data = self.data = StreamSlicer(stream)
        self.type = data[60:68]

        if self.type != b"BOOKMOBI":
            return

        self.nrecs, = unpack('>H', data[76:78])
        record0 = self.record0 = self.record(0)
        mobi_header_length, = unpack('>I', record0[0x14:0x18])
        if not mobi_header_length:
            raise MobiError("Non-standard file format. Try 'Convert E-Books' with MOBI as Input and Output formats.")

        self.encryption_type, = unpack('>H', record0[12:14])
        codepage, = unpack('>I', record0[28:32])
        self.codec = 'utf-8' if codepage == 65001 else 'cp1252'
        image_base, = unpack('>I', record0[108:112])
        flags, = self.flags, = unpack('>I', record0[128:132])
        have_exth = self.have_exth = (flags & 0x40) != 0
        self.cover_record = self.thumbnail_record = None
        self.timestamp = None
        self.pdbrecords = self.get_pdbrecords()

        self.drm_block = None
        if self.encryption_type != 0:
            if self.have_exth:
                self.drm_block = self.fetchDRMdata()
            else:
                raise MobiError('Unable to set metadata on DRM file without EXTH header')

        self.original_exth_records = {}
        if not have_exth:
            self.create_exth()
            self.have_exth = True
        # Fetch timestamp, cover_record, thumbnail_record
        self.fetchEXTHFields()

    def fetchDRMdata(self):
        ''' Fetch the DRM keys '''
        drm_offset = int(unpack('>I', self.record0[0xa8:0xac])[0])
        self.drm_key_count = int(unpack('>I', self.record0[0xac:0xb0])[0])
        drm_keys = b''
        for x in range(self.drm_key_count):
            base_addr = drm_offset + (x * self.DRM_KEY_SIZE)
            drm_keys += self.record0[base_addr:base_addr + self.DRM_KEY_SIZE]
        return drm_keys

    def fetchEXTHFields(self):
        stream = self.stream
        record0 = self.record0
        # 20:24 = mobiHeaderLength, 16 = PDBHeader size
        exth_off = unpack('>I', record0[20:24])[0] + 16 + record0.start
        image_base, = unpack('>I', record0[108:112])

        # Fetch EXTH block
        exth = self.exth = StreamSlicer(stream, exth_off, record0.stop)
        nitems, = unpack('>I', exth[8:12])
        pos = 12
        # Store any EXTH fields not specifiable in GUI
        for i in range(nitems):
            id, size = unpack('>II', exth[pos:pos + 8])
            content = exth[pos + 8: pos + size]
            pos += size
            self.original_exth_records[id] = content

            if id == 106:
                self.timestamp = content
            elif id == 201:
                rindex, = self.cover_rindex, = unpack('>I', content)
                if rindex != 0xffffffff:
                    self.cover_record = self.record(rindex + image_base)
            elif id == 202:
                rindex, = self.thumbnail_rindex, = unpack('>I', content)
                if rindex > 0 and rindex != 0xffffffff:
                    self.thumbnail_record = self.record(rindex + image_base)
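
    # For reference, the EXTH block parsed above is laid out as:
    #     bytes 0-3    b'EXTH' identifier
    #     bytes 4-7    total block length (big-endian uint32)
    #     bytes 8-11   record count (big-endian uint32)
    #     then, per record: uint32 id, uint32 size (size includes the 8-byte
    #     record header), followed by (size - 8) bytes of payload.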

    def patch(self, off, new_record0):
        # Save the current size of each record
        record_sizes = [len(new_record0)]
        for i in range(1, self.nrecs - 1):
            record_sizes.append(self.pdbrecords[i + 1][0] - self.pdbrecords[i][0])
        # And the last one
        record_sizes.append(self.data.stop - self.pdbrecords[self.nrecs - 1][0])

        # pdbrecord[0] is the offset of record0. It will not change.
        # record1 offset will be offset of record0 + len(new_record0)
        updated_pdbrecords = [self.pdbrecords[0][0]]
        record0_offset = self.pdbrecords[0][0]
        updated_offset = record0_offset + len(new_record0)

        for i in range(1, self.nrecs - 1):
            updated_pdbrecords.append(updated_offset)
            updated_offset += record_sizes[i]
        # Update the last pdbrecord
        updated_pdbrecords.append(updated_offset)

        # Read in current records 1 to last
        data_blocks = [new_record0]
        for i in range(1, self.nrecs):
            data_blocks.append(self.data[self.pdbrecords[i][0]:self.pdbrecords[i][0] + record_sizes[i]])

        # Rewrite the stream
        self.record0.update(data_blocks)

        # Rewrite the pdbrecords
        self.update_pdbrecords(updated_pdbrecords)

        # Truncate if necessary
        if (updated_pdbrecords[-1] + record_sizes[-1]) < self.data.stop:
            self.data.truncate(updated_pdbrecords[-1] + record_sizes[-1])
        else:
            self.data.stop = updated_pdbrecords[-1] + record_sizes[-1]

    def patchSection(self, section, new):
        off = self.pdbrecords[section][0]
        self.patch(off, new)

    def create_exth(self, new_title=None, exth=None):
        # Add an EXTH block to record 0, rewrite the stream
        if isinstance(new_title, str):
            new_title = new_title.encode(self.codec, 'replace')

        # Fetch the existing title
        title_offset, = unpack('>L', self.record0[0x54:0x58])
        title_length, = unpack('>L', self.record0[0x58:0x5c])
        title_in_file, = unpack('%ds' % (title_length), self.record0[title_offset:title_offset + title_length])

        # Adjust length to accommodate PrimaryINDX if necessary
        mobi_header_length, = unpack('>L', self.record0[0x14:0x18])
        if mobi_header_length == 0xe4:
            # Patch mobi_header_length to 0xE8
            self.record0[0x17] = b"\xe8"
            self.record0[0xf4:0xf8] = pack('>L', 0xFFFFFFFF)
            mobi_header_length = 0xe8

        # Set EXTH flag (0x40)
        self.record0[0x80:0x84] = pack('>L', self.flags | 0x40)

        if not exth:
            # Construct an empty EXTH block
            pad = b'\0' * 4
            exth = [b'EXTH', pack('>II', 12, 0), pad]
            exth = b''.join(exth)

        # Update drm_offset(0xa8), title_offset(0x54)
        if self.encryption_type != 0:
            self.record0[0xa8:0xac] = pack('>L', 0x10 + mobi_header_length + len(exth))
            self.record0[0xb0:0xb4] = pack('>L', len(self.drm_block))
            self.record0[0x54:0x58] = pack('>L', 0x10 + mobi_header_length + len(exth) + len(self.drm_block))
        else:
            self.record0[0x54:0x58] = pack('>L', 0x10 + mobi_header_length + len(exth))
        if new_title:
            self.record0[0x58:0x5c] = pack('>L', len(new_title))

        # Create an updated Record0
        new_record0 = io.BytesIO()
        new_record0.write(self.record0[:0x10 + mobi_header_length])
        new_record0.write(exth)
        if self.encryption_type != 0:
            new_record0.write(self.drm_block)
        new_record0.write(new_title if new_title else title_in_file)

        # Pad to a 4-byte boundary
        trail = len(new_record0.getvalue()) % 4
        pad = b'\0' * (4 - trail)  # Always pad w/ at least 1 byte
        new_record0.write(pad)
        new_record0.write(b'\0' * (1024 * 8))

        # Rebuild the stream, update the pdbrecords pointers
        self.patchSection(0, new_record0.getvalue())

        # Update record0
        self.record0 = self.record(0)
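
    # For reference, the record-0 offsets touched above (all big-endian):
    #     0x14  MOBI header length      0x54  title offset
    #     0x58  title length            0x80  flags (0x40 = has EXTH)
    #     0xa8  DRM offset              0xac  DRM key count
    #     0xb0  DRM size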

    def hexdump(self, src, length=16):
        # Diagnostic: print a hex + printable-ASCII dump of a bytes string.
        # Iterating bytes yields ints, so format the ints directly.
        N = 0
        result = ''
        while src:
            s, src = src[:length], src[length:]
            hexa = ' '.join('%02X' % b for b in s)
            text = ''.join(codepoint_to_chr(b) if 0x20 <= b < 0x7f else '.' for b in s)
            result += "%04X %-*s %s\n" % (N, length * 3, hexa, text)
            N += length
        print(result)

    def get_pdbrecords(self):
        pdbrecords = []
        for i in range(self.nrecs):
            offset, a1, a2, a3, a4 = unpack('>LBBBB', self.data[78 + i * 8:78 + i * 8 + 8])
            flags, val = a1, a2 << 16 | a3 << 8 | a4
            pdbrecords.append([offset, flags, val])
        return pdbrecords
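
    # Each 8-byte entry in the PDB record list (which starts at byte 78 of
    # the file) holds a uint32 record offset, a 1-byte flags field and a
    # 3-byte unique id, hence the '>LBBBB' unpack above.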

    def update_pdbrecords(self, updated_pdbrecords):
        for (i, pdbrecord) in enumerate(updated_pdbrecords):
            self.data[78 + i * 8:78 + i * 8 + 4] = pack('>L', pdbrecord)
        # Refresh local copy
        self.pdbrecords = self.get_pdbrecords()

    def dump_pdbrecords(self):
        # Diagnostic
        print("MetadataUpdater.dump_pdbrecords()")
        print("%10s %10s %10s" % ("offset", "flags", "val"))
        for i in range(len(self.pdbrecords)):
            pdbrecord = self.pdbrecords[i]
            print(f"{pdbrecord[0]:10X} {pdbrecord[1]:10X} {pdbrecord[2]:10X}")

    def record(self, n):
        if n >= self.nrecs:
            raise ValueError('non-existent record %r' % n)
        offoff = 78 + (8 * n)
        start, = unpack('>I', self.data[offoff + 0:offoff + 4])
        stop = None
        if n < (self.nrecs - 1):
            stop, = unpack('>I', self.data[offoff + 8:offoff + 12])
        return StreamSlicer(self.stream, start, stop)

    def update(self, mi, asin=None):
        mi.title = normalize(mi.title)

        def update_exth_record(rec):
            recs.append(rec)
            if rec[0] in self.original_exth_records:
                self.original_exth_records.pop(rec[0])

        if self.type != b"BOOKMOBI":
            raise MobiError("Setting metadata only supported for MOBI files of type 'BOOK'.\n"
                            "\tThis is a %r file of type %r" % (self.type[0:4], self.type[4:8]))

        recs = []
        added_501 = False
        try:
            from calibre.ebooks.conversion.config import load_defaults
            prefs = load_defaults('mobi_output')
            pas = prefs.get('prefer_author_sort', False)
            kindle_pdoc = prefs.get('personal_doc', None)
            share_not_sync = prefs.get('share_not_sync', False)
        except Exception:
            pas = False
            kindle_pdoc = None
            share_not_sync = False
        if mi.author_sort and pas:
            # We want an EXTH field per author...
            authors = mi.author_sort.split(' & ')
            for author in authors:
                update_exth_record((100, normalize(author).encode(self.codec, 'replace')))
        elif mi.authors:
            authors = mi.authors
            for author in authors:
                update_exth_record((100, normalize(author).encode(self.codec, 'replace')))
        if mi.publisher:
            update_exth_record((101, normalize(mi.publisher).encode(self.codec, 'replace')))
        if mi.comments:
            # Strip user annotations
            a_offset = mi.comments.find('<div class="user_annotations">')
            ad_offset = mi.comments.find('<hr class="annotations_divider" />')
            if a_offset >= 0:
                mi.comments = mi.comments[:a_offset]
            if ad_offset >= 0:
                mi.comments = mi.comments[:ad_offset]
            update_exth_record((103, normalize(mi.comments).encode(self.codec, 'replace')))
        if mi.isbn:
            update_exth_record((104, mi.isbn.encode(self.codec, 'replace')))
        if mi.tags:
            # FIXME: Keep a single subject per EXTH field?
            subjects = '; '.join(mi.tags)
            update_exth_record((105, normalize(subjects).encode(self.codec, 'replace')))

            if kindle_pdoc and kindle_pdoc in mi.tags:
                added_501 = True
                update_exth_record((501, b'PDOC'))

        if mi.pubdate:
            update_exth_record((106, str(mi.pubdate).encode(self.codec, 'replace')))
        elif mi.timestamp:
            update_exth_record((106, str(mi.timestamp).encode(self.codec, 'replace')))
        elif self.timestamp:
            update_exth_record((106, self.timestamp))
        else:
            update_exth_record((106, nowf().isoformat().encode(self.codec, 'replace')))
        if self.cover_record is not None:
            update_exth_record((201, pack('>I', self.cover_rindex)))
            update_exth_record((203, pack('>I', 0)))
        if self.thumbnail_record is not None:
            update_exth_record((202, pack('>I', self.thumbnail_rindex)))

        # Add a 113 record if not present to allow Amazon syncing
        if (113 not in self.original_exth_records and
                self.original_exth_records.get(501, None) == b'EBOK' and
                not added_501 and not share_not_sync):
            from uuid import uuid4
            update_exth_record((113, str(uuid4()).encode(self.codec)))
        if asin is not None:
            update_exth_record((113, asin.encode(self.codec)))
            update_exth_record((501, b'EBOK'))
            update_exth_record((504, asin.encode(self.codec)))

        # Add a 112 record with actual UUID
        if getattr(mi, 'uuid', None):
            update_exth_record((112, ("calibre:%s" % mi.uuid).encode(self.codec, 'replace')))
        if 503 in self.original_exth_records:
            update_exth_record((503, mi.title.encode(self.codec, 'replace')))
        # Update book producer
        if getattr(mi, 'book_producer', False):
            update_exth_record((108, mi.book_producer.encode(self.codec, 'replace')))

        # Set langcode in EXTH header
        if not mi.is_null('language'):
            lang = canonicalize_lang(mi.language)
            lang = lang_as_iso639_1(lang) or lang
            if lang:
                update_exth_record((524, lang.encode(self.codec, 'replace')))

        # Include remaining original EXTH fields
        for id in sorted(self.original_exth_records):
            recs.append((id, self.original_exth_records[id]))
        recs = sorted(recs, key=lambda x: x[0])

        exth = io.BytesIO()
        for code, data in recs:
            exth.write(pack('>II', code, len(data) + 8))
            exth.write(data)
        exth = exth.getvalue()
        trail = len(exth) % 4
        pad = b'\0' * (4 - trail)  # Always pad w/ at least 1 byte
        exth = [b'EXTH', pack('>II', len(exth) + 12, len(recs)), exth, pad]
        exth = b''.join(exth)

        if getattr(self, 'exth', None) is None:
            raise MobiError('No existing EXTH record. Cannot update metadata.')

        if not mi.is_null('language'):
            self.record0[92:96] = iana2mobi(mi.language)
        self.create_exth(exth=exth, new_title=mi.title)

        # Fetch updated timestamp, cover_record, thumbnail_record
        self.fetchEXTHFields()

        if mi.cover_data[1] or mi.cover:
            try:
                data = mi.cover_data[1]
                if not data:
                    with open(mi.cover, 'rb') as f:
                        data = f.read()
            except Exception:
                pass
            else:
                if is_image(self.cover_record):
                    size = len(self.cover_record)
                    cover = rescale_image(data, size)
                    if len(cover) <= size:
                        cover += b'\0' * (size - len(cover))
                        self.cover_record[:] = cover
                if is_image(self.thumbnail_record):
                    size = len(self.thumbnail_record)
                    thumbnail = rescale_image(data, size, dimen=MAX_THUMB_DIMEN)
                    if len(thumbnail) <= size:
                        thumbnail += b'\0' * (size - len(thumbnail))
                        self.thumbnail_record[:] = thumbnail


def set_metadata(stream, mi):
    mu = MetadataUpdater(stream)
    mu.update(mi)
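

# A minimal sketch of updating a MOBI file in place (illustrative only; the
# path and metadata values are hypothetical examples):
#
#     from calibre.ebooks.metadata import MetaInformation
#     with open('book.mobi', 'r+b') as f:
#         set_metadata(f, MetaInformation('New Title', ['Some Author']))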


def get_metadata(stream):
    from calibre import CurrentDir
    from calibre.ebooks.metadata import MetaInformation
    from calibre.ebooks.mobi.reader.headers import MetadataHeader
    from calibre.ebooks.mobi.reader.mobi6 import MobiReader
    from calibre.ptempfile import TemporaryDirectory
    from calibre.utils.img import save_cover_data_to

    stream.seek(0)
    try:
        raw = stream.read(3)
    except Exception:
        raw = b''
    stream.seek(0)
    if raw == b'TPZ':
        from calibre.ebooks.metadata.topaz import get_metadata
        return get_metadata(stream)

    from calibre.utils.logging import Log
    log = Log()
    try:
        mi = MetaInformation(os.path.basename(stream.name), [_('Unknown')])
    except Exception:
        mi = MetaInformation(_('Unknown'), [_('Unknown')])
    mh = MetadataHeader(stream, log)
    if mh.title and mh.title != _('Unknown'):
        mi.title = mh.title

    if mh.exth is not None:
        if mh.exth.mi is not None:
            mi = mh.exth.mi
    else:
        size = 1024**3
        if hasattr(stream, 'seek') and hasattr(stream, 'tell'):
            pos = stream.tell()
            stream.seek(0, 2)
            size = stream.tell()
            stream.seek(pos)
        if size < 4 * 1024 * 1024:
            with TemporaryDirectory('_mobi_meta_reader') as tdir:
                with CurrentDir(tdir):
                    mr = MobiReader(stream, log)
                    parse_cache = {}
                    mr.extract_content(tdir, parse_cache)
                    if mr.embedded_mi is not None:
                        mi = mr.embedded_mi
    if hasattr(mh.exth, 'cover_offset'):
        cover_index = mh.first_image_index + mh.exth.cover_offset
        data = mh.section_data(int(cover_index))
    else:
        try:
            data = mh.section_data(mh.first_image_index)
        except Exception:
            data = b''
    if data and what(None, data) in {'jpg', 'jpeg', 'gif', 'png', 'bmp', 'webp'}:
        try:
            mi.cover_data = ('jpg', save_cover_data_to(data))
        except Exception:
            log.exception('Failed to read MOBI cover')
    return mi
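

if __name__ == '__main__':
    # A minimal round-trip sketch (illustrative only; assumes a calibre
    # environment where the `_` translation builtin is installed): read the
    # metadata of a MOBI file named on the command line and print it.
    import sys
    with open(sys.argv[1], 'r+b') as stream:
        mi = get_metadata(stream)
        print(mi.title, mi.authors)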