__license__ = 'GPL 3'
__copyright__ = '2010, Greg Riker <griker@hotmail.com>'
__docformat__ = 'restructuredtext en'
''' Read/write metadata from Amazon's Topaz format '''
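# A minimal usage sketch (the file name below is hypothetical; Topaz books
# typically carry a .azw1 extension):
#
#     with open('book.azw1', 'r+b') as f:
#         mi = get_metadata(f)    # -> MetaInformation(title, authors)
#         mi.title = 'New Title'
#         f.seek(0)
#         set_metadata(f, mi)     # rewrites the metadata record in place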
import io, sys, numbers
from struct import pack
from calibre.ebooks.metadata import MetaInformation
from calibre import force_unicode
from polyglot.builtins import codepoint_to_chr, int_to_byte
def is_dkey(x):
q = b'dkey' if isinstance(x, bytes) else 'dkey'
return x == q
class StringIO(io.StringIO):
def write(self, x):
if isinstance(x, bytes):
x = x.decode('iso-8859-1')
return io.StringIO.write(self, x)
class StreamSlicer:
def __init__(self, stream, start=0, stop=None):
self._stream = stream
self.start = start
if stop is None:
stream.seek(0, 2)
stop = stream.tell()
self.stop = stop
self._len = stop - start
def __len__(self):
return self._len
def __getitem__(self, key):
stream = self._stream
base = self.start
if isinstance(key, numbers.Integral):
stream.seek(base + key)
return stream.read(1)
if isinstance(key, slice):
start, stop, stride = key.indices(self._len)
if stride < 0:
start, stop = stop, start
size = stop - start
if size <= 0:
return b""
stream.seek(base + start)
data = stream.read(size)
if stride != 1:
data = data[::stride]
return data
raise TypeError("stream indices must be integers")
def __setitem__(self, key, value):
stream = self._stream
base = self.start
if isinstance(key, numbers.Integral):
if len(value) != 1:
raise ValueError("key and value lengths must match")
stream.seek(base + key)
return stream.write(value)
if isinstance(key, slice):
start, stop, stride = key.indices(self._len)
if stride < 0:
start, stop = stop, start
size = stop - start
if stride != 1:
value = value[::stride]
if len(value) != size:
raise ValueError("key and value lengths must match")
stream.seek(base + start)
return stream.write(value)
raise TypeError("stream indices must be integers")
def update(self, data_blocks):
# Rewrite the stream
stream = self._stream
base = self.start
stream.seek(base)
self._stream.truncate(base)
for block in data_blocks:
stream.write(block)
def truncate(self, value):
self._stream.truncate(value)
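# StreamSlicer usage, as a sketch: slice offsets are relative to `start`, so a
# slicer built with start=0 addresses absolute file offsets.
#
#     buf = io.BytesIO(b'TPZ0abcdef')
#     s = StreamSlicer(buf, start=4)
#     s[0:3]             # b'abc' -- three bytes at absolute offset 4
#     s[0:3] = b'xyz'    # writes them back at the same position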
class MetadataUpdater:
def __init__(self, stream):
self.stream = stream
self.data = StreamSlicer(stream)
sig = self.data[:4]
if not sig.startswith(b'TPZ'):
raise ValueError("'%s': Not a Topaz file" % getattr(stream, 'name', 'Unnamed stream'))
offset = 4
self.header_records, consumed = self.decode_vwi(self.data[offset:offset+4])
offset += consumed
self.topaz_headers, self.th_seq = self.get_headers(offset)
# First integrity test - metadata header
        if b'metadata' not in self.topaz_headers:
raise ValueError("'%s': Invalid Topaz format - no metadata record" % getattr(stream, 'name', 'Unnamed stream'))
# Second integrity test - metadata body
        md_offset = self.topaz_headers[b'metadata']['blocks'][0]['offset']
md_offset += self.base
if self.data[md_offset+1:md_offset+9] != b'metadata':
raise ValueError("'%s': Damaged metadata record" % getattr(stream, 'name', 'Unnamed stream'))
def book_length(self):
''' convenience method for retrieving book length '''
self.get_original_metadata()
        if b'bookLength' in self.metadata:
            return int(self.metadata[b'bookLength'])
return 0
def decode_vwi(self, byts):
pos, val = 0, 0
done = False
byts = bytearray(byts)
while pos < len(byts) and not done:
b = byts[pos]
pos += 1
if (b & 0x80) == 0:
done = True
b &= 0x7F
val <<= 7
val |= b
if done:
break
return val, pos
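    # decode_vwi() reads a big-endian variable-width integer: each byte holds
    # 7 payload bits, and a set high bit (0x80) means another byte follows.
    # Worked example: b'\x81\x00' -> (1 << 7) | 0 == 128, consuming 2 bytes,
    # while b'\x7f' -> (127, 1).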
def dump_headers(self):
''' Diagnostic '''
print("\ndump_headers():")
for tag in self.topaz_headers:
print("%s: " % (tag))
num_recs = len(self.topaz_headers[tag]['blocks'])
print(" num_recs: %d" % num_recs)
if num_recs:
print(" starting offset: 0x%x" % self.topaz_headers[tag]['blocks'][0]['offset'])
    def dump_hex(self, src, length=16):
        ''' Diagnostic: hex/ASCII dump of src (bytes) '''
        src = bytearray(src)
        N = 0
        result = ''
        while src:
            s, src = src[:length], src[length:]
            hexa = ' '.join('%02X' % b for b in s)
            text = ''.join(codepoint_to_chr(b) if 32 <= b < 127 else '.' for b in s)
            result += '%04X   %-*s   %s\n' % (N, length * 3, hexa, text)
            N += length
        print(result)
def dump_metadata(self):
''' Diagnostic '''
for tag in self.metadata:
print(f'{tag}: {repr(self.metadata[tag])}')
def encode_vwi(self,value):
ans = []
multi_byte = (value > 0x7f)
while value:
b = value & 0x7f
value >>= 7
if value == 0:
if multi_byte:
ans.append(b|0x80)
if ans[-1] == 0xFF:
ans.append(0x80)
if len(ans) == 4:
return pack('>BBBB',ans[3],ans[2],ans[1],ans[0]).decode('iso-8859-1')
elif len(ans) == 3:
return pack('>BBB',ans[2],ans[1],ans[0]).decode('iso-8859-1')
elif len(ans) == 2:
return pack('>BB',ans[1],ans[0]).decode('iso-8859-1')
else:
return pack('>B', b).decode('iso-8859-1')
else:
if len(ans):
ans.append(b|0x80)
else:
ans.append(b)
# If value == 0, return 0
return pack('>B', 0x0).decode('iso-8859-1')
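    # encode_vwi() is the inverse of decode_vwi(), returning a latin-1 str:
    # encode_vwi(128) == '\x81\x00' and encode_vwi(0) == '\x00', so
    # decode_vwi(self.encode_vwi(n).encode('iso-8859-1')) round-trips to n.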
def generate_dkey(self):
for x in self.topaz_headers:
if is_dkey(self.topaz_headers[x]['tag']):
if self.topaz_headers[x]['blocks']:
offset = self.base + self.topaz_headers[x]['blocks'][0]['offset']
len_uncomp = self.topaz_headers[x]['blocks'][0]['len_uncomp']
                    break
                else:
                    return None
        else:
            # No dkey record in the file
            return None
        dkey = self.topaz_headers[x]
dks = StringIO()
dks.write(self.encode_vwi(len(dkey['tag'])))
offset += 1
dks.write(dkey['tag'])
offset += len('dkey')
dks.write('\0')
offset += 1
dks.write(self.data[offset:offset + len_uncomp].decode('iso-8859-1'))
return dks.getvalue().encode('iso-8859-1')
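    # Note: the dkey record appears to carry the book's key material; this
    # method re-serializes it (VWI tag length, tag, NUL byte, then the raw
    # payload) for callers that need the record as a single bytes value.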
def get_headers(self, offset):
        # Build a dict of topaz_header records and a list preserving their order
topaz_headers = {}
th_seq = []
for x in range(self.header_records):
offset += 1
taglen, consumed = self.decode_vwi(self.data[offset:offset+4])
offset += consumed
tag = self.data[offset:offset+taglen]
offset += taglen
num_vals, consumed = self.decode_vwi(self.data[offset:offset+4])
offset += consumed
blocks = {}
for val in range(num_vals):
hdr_offset, consumed = self.decode_vwi(self.data[offset:offset+4])
offset += consumed
len_uncomp, consumed = self.decode_vwi(self.data[offset:offset+4])
offset += consumed
len_comp, consumed = self.decode_vwi(self.data[offset:offset+4])
offset += consumed
blocks[val] = dict(offset=hdr_offset,len_uncomp=len_uncomp,len_comp=len_comp)
            topaz_headers[tag] = dict(tag=tag, blocks=blocks)  # generate_dkey() reads 'tag'
th_seq.append(tag)
self.eoth = self.data[offset]
offset += 1
self.base = offset
return topaz_headers, th_seq
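    # The parsed structure looks like this (offsets and lengths illustrative):
    #
    #     topaz_headers[b'metadata'] == {'tag': b'metadata', 'blocks':
    #         {0: {'offset': 1024, 'len_uncomp': 96, 'len_comp': 96}}}
    #
    # with th_seq preserving the on-disk order of the header records.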
def generate_metadata_stream(self):
ms = StringIO()
ms.write(self.encode_vwi(len(self.md_header['tag'])).encode('iso-8859-1'))
ms.write(self.md_header['tag'])
ms.write(int_to_byte(self.md_header['flags']))
ms.write(int_to_byte(len(self.metadata)))
# Add the metadata fields.
# for tag in self.metadata:
for tag in self.md_seq:
ms.write(self.encode_vwi(len(tag)).encode('iso-8859-1'))
ms.write(tag)
ms.write(self.encode_vwi(len(self.metadata[tag])).encode('iso-8859-1'))
ms.write(self.metadata[tag])
        return ms.getvalue().encode('iso-8859-1')
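    # On disk the metadata record is laid out as: a VWI tag length, the tag
    # bytes, one flags byte, one record-count byte, then for each record a
    # VWI-prefixed tag followed by a VWI-prefixed value -- the same layout
    # that get_original_metadata() parses.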
def get_metadata(self):
''' Return MetaInformation with title, author'''
self.get_original_metadata()
        title = force_unicode(self.metadata[b'Title'], 'utf-8')
        authors = force_unicode(self.metadata[b'Authors'], 'utf-8').split(';')
return MetaInformation(title, authors)
def get_original_metadata(self):
        offset = self.base + self.topaz_headers[b'metadata']['blocks'][0]['offset']
self.md_header = {}
taglen, consumed = self.decode_vwi(self.data[offset:offset+4])
offset += consumed
self.md_header['tag'] = self.data[offset:offset+taglen]
offset += taglen
self.md_header['flags'] = ord(self.data[offset:offset+1])
offset += 1
self.md_header['num_recs'] = ord(self.data[offset:offset+1])
offset += 1
# print "self.md_header: %s" % self.md_header
self.metadata = {}
self.md_seq = []
for x in range(self.md_header['num_recs']):
taglen, consumed = self.decode_vwi(self.data[offset:offset+4])
offset += consumed
tag = self.data[offset:offset+taglen]
offset += taglen
md_len, consumed = self.decode_vwi(self.data[offset:offset+4])
offset += consumed
metadata = self.data[offset:offset + md_len]
offset += md_len
self.metadata[tag] = metadata
self.md_seq.append(tag)
def regenerate_headers(self, updated_md_len):
        original_md_len = self.topaz_headers[b'metadata']['blocks'][0]['len_uncomp']
        original_md_offset = self.topaz_headers[b'metadata']['blocks'][0]['offset']
delta = updated_md_len - original_md_len
# Copy the first 5 bytes of the file: sig + num_recs
        ths = StringIO()  # bytes-tolerant subclass; io.StringIO would reject self.data[:5]
ths.write(self.data[:5])
# Rewrite the offsets for hdr_offsets > metadata offset
for tag in self.th_seq:
ths.write('c')
ths.write(self.encode_vwi(len(tag)))
ths.write(tag)
if self.topaz_headers[tag]['blocks']:
ths.write(self.encode_vwi(len(self.topaz_headers[tag]['blocks'])))
for block in self.topaz_headers[tag]['blocks']:
b = self.topaz_headers[tag]['blocks'][block]
if b['offset'] <= original_md_offset:
ths.write(self.encode_vwi(b['offset']))
else:
ths.write(self.encode_vwi(b['offset'] + delta))
                    if tag == b'metadata':
ths.write(self.encode_vwi(updated_md_len))
else:
ths.write(self.encode_vwi(b['len_uncomp']))
ths.write(self.encode_vwi(b['len_comp']))
else:
ths.write(self.encode_vwi(0))
self.original_md_start = original_md_offset + self.base
self.original_md_len = original_md_len
return ths.getvalue().encode('iso-8859-1')
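    # Because the metadata block changes size by `delta`, every header offset
    # that points past the original metadata block must be shifted; rebuilding
    # the whole header table is simpler than patching it in place.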
def update(self,mi):
# Collect the original metadata
self.get_original_metadata()
try:
from calibre.ebooks.conversion.config import load_defaults
prefs = load_defaults('mobi_output')
pas = prefs.get('prefer_author_sort', False)
        except Exception:
pas = False
if mi.author_sort and pas:
authors = mi.author_sort
            self.metadata[b'Authors'] = authors.encode('utf-8')
elif mi.authors:
authors = '; '.join(mi.authors)
            self.metadata[b'Authors'] = authors.encode('utf-8')
        self.metadata[b'Title'] = mi.title.encode('utf-8')
updated_metadata = self.generate_metadata_stream()
# Skip tag_len, tag, extra
prefix = len('metadata') + 2
um_buf_len = len(updated_metadata) - prefix
head = self.regenerate_headers(um_buf_len)
# Chunk1: self.base -> original metadata start
# Chunk2: original metadata end -> eof
chunk1 = self.data[self.base:self.original_md_start]
chunk2 = self.data[prefix + self.original_md_start + self.original_md_len:]
self.stream.seek(0)
self.stream.truncate(0)
# Write the revised stream
self.stream.write(head)
        self.stream.write(b'd')
self.stream.write(chunk1)
self.stream.write(updated_metadata)
self.stream.write(chunk2)
def get_metadata(stream):
mu = MetadataUpdater(stream)
return mu.get_metadata()
def set_metadata(stream, mi):
mu = MetadataUpdater(stream)
mu.update(mi)
return
if __name__ == '__main__':
if False:
# Test get_metadata()
print(get_metadata(open(sys.argv[1], 'rb')))
else:
# Test set_metadata()
stream = io.BytesIO()
with open(sys.argv[1], 'rb') as data:
stream.write(data.read())
mi = MetaInformation(title="Updated Title", authors=['Author, Random'])
set_metadata(stream, mi)
# Write the result
tokens = sys.argv[1].rpartition('.')
with open(tokens[0]+'-updated' + '.' + tokens[2],'wb') as updated_data:
updated_data.write(stream.getvalue())