%PDF- %PDF-
| Direktori : /proc/thread-self/root/usr/lib/calibre/calibre/ebooks/mobi/ |
| Current File : //proc/thread-self/root/usr/lib/calibre/calibre/ebooks/mobi/utils.py |
#!/usr/bin/env python3
__license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import struct, string, zlib, os
from collections import OrderedDict
from io import BytesIO
from calibre.utils.img import save_cover_data_to, scale_image, image_to_data, image_from_data, resize_image, png_data_to_gif_data
from calibre.utils.imghdr import what
from calibre.ebooks import normalize
from polyglot.builtins import as_bytes
from tinycss.color3 import parse_color_string
# Default upper bound, in bytes, on the size of the image data returned by
# rescale_image() (it is the default value of its maxsizeb parameter)
IMAGE_MAX_SIZE = 10 * 1024 * 1024
RECORD_SIZE = 0x1000 # 4096 (Text record size (uncompressed))
class PolyglotDict(dict):
    '''
    A dict that accepts str keys interchangeably with their UTF-8 encoded
    bytes form. For item assignment, item lookup and membership tests a str
    key is encoded to UTF-8 bytes before it reaches the underlying dict; keys
    of any other type are passed through unchanged.
    '''

    def __setitem__(self, key, val):
        # str keys are always stored in their encoded form
        stored = key.encode('utf-8') if isinstance(key, str) else key
        dict.__setitem__(self, stored, val)

    def __getitem__(self, key):
        wanted = key.encode('utf-8') if isinstance(key, str) else key
        return dict.__getitem__(self, wanted)

    def __contains__(self, key):
        wanted = key.encode('utf-8') if isinstance(key, str) else key
        return dict.__contains__(self, wanted)
def decode_string(raw, codec='utf-8', ordt_map=None):
    '''
    Decode a length prefixed string from the start of raw.

    The first byte of raw is the number of payload bytes that follow it.

    :param raw: Raw binary data as a bytestring
    :param codec: Codec used to decode the payload when no ordt_map is given
    :param ordt_map: Optional lookup, indexed by byte value, giving the
        character for each payload byte. When truthy it is used instead of
        codec.
    :return: The decoded string and the number of bytes consumed, which is
        always the declared length plus one, even if raw is shorter than that.
    '''
    length = struct.unpack(b'>B', raw[0:1])[0]
    consumed = length + 1
    payload = raw[1:consumed]
    if ordt_map:
        chars = [ordt_map[x] for x in bytearray(payload)]
        return ''.join(chars), consumed
    return payload.decode(codec), consumed
def decode_hex_number(raw, codec='utf-8'):
    '''
    Read a variable length number stored as hexadecimal text.

    The first byte of raw gives the number of bytes that follow, and those
    bytes are the hexadecimal digits of the number (see decode_string()).

    :param raw: Raw binary data as a bytestring
    :param codec: Codec used to decode the digits
    :return: The number and the number of bytes from raw that the number
        occupies.
    '''
    text, consumed = decode_string(raw, codec=codec)
    number = int(text, 16)
    return number, consumed
def encode_string(raw):
    '''
    Return raw, converted with as_bytes(), prefixed by a single byte holding
    its length in bytes. Raises ValueError if the length does not fit in one
    byte.
    '''
    body = bytearray(as_bytes(raw))
    prefix = bytearray((len(body),))
    return bytes(prefix + body)
def encode_number_as_hex(num):
    '''
    Encode num as a length prefixed hexadecimal number and return the
    resulting bytestring.

    The digits are upper case ASCII, padded on the left with a zero when
    needed so that there is an even number of them. They are preceded by one
    byte giving the number of digit bytes (see encode_string()).
    '''
    hex_digits = hex(num)[2:].upper().encode('ascii')
    if len(hex_digits) % 2:
        # Odd number of digits: pad to a whole number of byte pairs
        hex_digits = b'0' + hex_digits
    return encode_string(hex_digits)
def encint(value, forward=True):
    '''
    Encode the non-negative integer ``value`` as a variable width integer
    (vwi) and return the corresponding bytestring.

    The value is split into groups of 7 bits which are written big-endian,
    one group per byte, in the low 7 bits of each byte. The high bit (0x80)
    is set on exactly one byte and marks where the number stops:

      * forward=True: the high bit is set on the last byte, so the number
        can be read from the start of a buffer
      * forward=False: the high bit is set on the first byte, so the number
        can be read backwards from the end of a buffer

    For example, the number 0x11111 = 0b10001000100010001 is encoded with
    forward=True as:

        0x04 0x22 0x91 = 0b100 0b100010 0b10010001

    and with forward=False as:

        0x84 0x22 0x11 = 0b10000100 0b100010 0b10001

    decint() is the inverse of this function.

    :raises ValueError: if value is negative
    '''
    if value < 0:
        raise ValueError('Cannot encode negative numbers as vwi')
    # Collect the 7 bit groups, least significant group first
    groups = bytearray([value & 0x7F])
    value >>= 7
    while value:
        groups.append(value & 0x7F)
        value >>= 7
    # groups is still least significant first here, so index 0 becomes the
    # last byte of the output and index -1 becomes the first
    marker = 0 if forward else -1
    groups[marker] |= 0x80
    return bytes(reversed(groups))
def decint(raw, forward=True):
    '''
    Read a variable width integer from the bytestring or bytearray raw.

    If forward is True bytes are read from the start of raw, otherwise from
    the end of raw. Reading stops after the first byte that has its high bit
    (0x80) set, or when raw is exhausted.

    This function is the inverse of encint(), see its docs for more details.

    :return: The integer and the number of bytes read
    '''
    stream = bytearray(raw)
    ordered = iter(stream) if forward else reversed(stream)
    groups = []
    for octet in ordered:
        groups.append(octet & 0x7F)
        if octet & 0x80:
            # The marked byte terminates the number
            break
    if not forward:
        # Bytes were collected back to front, restore big-endian order
        groups.reverse()
    result = 0
    for group in groups:
        result = (result << 7) | group
    return result, len(groups)
def test_decint(num):
    '''
    Check that num survives a round trip through encint() and decint() in
    both directions, and that decint() reports having read exactly the
    encoded bytes. Raises ValueError on the first mismatch.
    '''
    for forward in (True, False):
        encoded = encint(num, forward=forward)
        expected = (num, len(encoded))
        decoded = decint(encoded, forward=forward)
        if decoded != expected:
            raise ValueError('Failed for num %d, forward=%r: %r != %r' % (
                num, forward, expected, decoded))
def rescale_image(data, maxsizeb=IMAGE_MAX_SIZE, dimen=None):
    '''
    Re-encode the image in data, trying to keep the result no larger than
    maxsizeb bytes.

    If dimen is None the image is passed through save_cover_data_to().
    Otherwise a thumbnail is generated with scale_image(): dimen is either a
    single number used for both width and height, or a (width, height) pair.

    If the result is larger than maxsizeb it is first re-encoded at
    progressively lower compression quality and then, if that is not enough,
    progressively shrunk. The final data is returned even if it is still
    larger than maxsizeb.

    Returns the image as a bytestring
    '''
    if dimen is None:
        # NOTE(review): per the original author's note, save_cover_data_to()
        # replaces transparent pixels with white and converts to JPEG
        data = save_cover_data_to(data)
    else:
        if hasattr(dimen, '__len__'):
            width, height = dimen
        else:
            width = height = dimen
        data = scale_image(data, width=width, height=height, compression_quality=90)[-1]
    if len(data) <= maxsizeb:
        return data

    # First attempt: re-encode the same source at decreasing quality
    source = data
    quality = 90
    while len(data) > maxsizeb and quality >= 5:
        data = image_to_data(image_from_data(source), compression_quality=quality)
        quality -= 5
    if len(data) <= maxsizeb:
        return data

    # Second attempt: shrink the image. Each pass resizes the output of the
    # previous pass and uses the quality value left over from the loop above.
    scale = 0.9
    while len(data) > maxsizeb and scale >= 0.05:
        img = image_from_data(data)
        w, h = img.width(), img.height()
        img = resize_image(img, int(scale*w), int(scale*h))
        data = image_to_data(img, compression_quality=quality)
        scale -= 0.05
    return data
def get_trailing_data(record, extra_data_flags):
    '''
    Split the trailing data entries off the end of a text record.

    :param record: The text record as a bytestring
    :param extra_data_flags: The extra data flags from the MOBI header. Every
        set bit above bit 0 announces one trailing entry whose size is stored
        as a backward encoded vwi at the end of the record. Bit 0 announces
        trailing multibyte character bytes, whose size is stored in the
        lowest two bits of the last byte.
    :return: Trailing data, record - trailing data. The trailing data is an
        ordered dictionary mapping bit number to data as bytestring; entries
        that consist of nothing but their size are not included.
    '''
    trailing = OrderedDict()
    remaining = extra_data_flags >> 1
    bit = 0
    while remaining:
        bit += 1
        if remaining & 0b1:
            # size counts the whole entry, including the width bytes that
            # hold the size itself
            size, width = decint(record, forward=False)
            if size > width:
                trailing[bit] = record[-size:-width]
            record = record[:-size]
        remaining >>= 1
    # Read multibyte chars if any
    if extra_data_flags & 0b1:
        # Only the first two bits are used for the size since there can
        # never be more than 3 trailing multibyte chars
        size = (ord(record[-1:]) & 0b11) + 1
        if size > 1:
            trailing[0] = record[-size:-1]
        record = record[:-size]
    return trailing, record
def encode_trailing_data(raw):
    '''
    Given some data in the bytestring raw, return a bytestring of the form

        <data><size>

    where data is raw and size is a backward encoded vwi whose value is the
    length of the entire returned bytestring, size bytes included.

    This is the encoding used for trailing data entries at the end of text
    records. See get_trailing_data() for details.
    '''
    # The size field counts itself, so grow the assumed width until the
    # encoded size really is that many bytes long
    width = 1
    suffix = encint(len(raw) + width, forward=False)
    while len(suffix) != width:
        width += 1
        suffix = encint(len(raw) + width, forward=False)
    return raw + suffix
def encode_fvwi(val, flags, flag_size=4):
    '''
    Encode the value val together with the lowest flag_size bits of flags as
    a single forward encoded vwi (an "fvwi"): val occupies the high bits and
    the flags the low flag_size bits. This encoding is used in the trailing
    byte sequences for indexing. Returns the encoded bytestring.
    '''
    flag_mask = (1 << flag_size) - 1
    combined = (val << flag_size) | (flags & flag_mask)
    return encint(combined)
def decode_fvwi(byts, flag_size=4):
    '''
    Decode an fvwi from the start of byts (the inverse of encode_fvwi()).
    Returns number, flags, consumed where flags are the lowest flag_size bits
    of the decoded vwi, number is the rest of it and consumed is the number
    of bytes read.
    '''
    combined, consumed = decint(bytes(byts))
    flag_mask = (1 << flag_size) - 1
    return combined >> flag_size, combined & flag_mask, consumed
def decode_tbs(byts, flag_size=4):
    '''
    Trailing byte sequences for indexing consist of a series of fvwi numbers.
    This function reads one fvwi number and its flags from the start of byts
    and then uses the flags to read the extra values that belong to it:

      * 0b1000: no extra bytes, recorded as True (only when flag_size > 3)
      * 0b0010: a forward encoded vwi
      * 0b0100: a single byte
      * 0b0001: a forward encoded vwi

    The extra values are read in the order listed above.

    Returns the fvwi number, a dictionary mapping flag bits to the associated
    data and the total number of bytes consumed.
    '''
    data = bytes(byts)
    val, flags, pos = decode_fvwi(data, flag_size=flag_size)
    extra = {}
    if flags & 0b1000 and flag_size > 3:
        extra[0b1000] = True
    if flags & 0b0010:
        extra[0b0010], used = decint(data[pos:])
        pos += used
    if flags & 0b0100:
        extra[0b0100] = ord(data[pos:pos+1])
        pos += 1
    if flags & 0b0001:
        extra[0b0001], used = decint(data[pos:])
        pos += used
    return val, extra, pos
def encode_tbs(val, extra, flag_size=4):
    '''
    Encode the number val and the extra data in the extra dict as an fvwi
    followed by the extra values. The keys of extra are the flag bits, which
    are ORed together to form the flags of the fvwi. See decode_tbs() for the
    meaning of the individual flags.
    '''
    flags = 0
    for bit in extra:
        flags |= bit
    chunks = [encode_fvwi(val, flags, flag_size=flag_size)]
    # The extra values are written in the order decode_tbs() reads them
    if 0b0010 in extra:
        chunks.append(encint(extra[0b0010]))
    if 0b0100 in extra:
        chunks.append(bytes(bytearray((extra[0b0100],))))
    if 0b0001 in extra:
        chunks.append(encint(extra[0b0001]))
    return b''.join(chunks)
def utf8_text(text):
    '''
    Convert a possibly null string to utf-8 bytes.

    text is stripped of surrounding whitespace, decoded as utf-8 (with
    replacement of undecodable bytes) if it is not already a str, passed
    through normalize() and encoded as utf-8. If text is empty, None or only
    whitespace, the translated word 'Unknown' is used instead.
    '''
    stripped = text.strip() if text else None
    if not stripped:
        return _('Unknown').encode('utf-8')
    if not isinstance(stripped, str):
        stripped = stripped.decode('utf-8', 'replace')
    return normalize(stripped).encode('utf-8')
def align_block(raw, multiple=4, pad=b'\0'):
    '''
    Return raw with enough copies of pad appended to make its length a
    multiple of ``multiple``. raw is returned unchanged if its length already
    is one.
    '''
    # Number of bytes missing to reach the next multiple (0 if aligned)
    shortfall = -len(raw) % multiple
    if shortfall:
        raw = raw + pad * shortfall
    return raw
def detect_periodical(toc, log=None):
    '''
    Detect if the TOC object toc contains a periodical that conforms to the
    structure required by kindlegen to generate a periodical.

    The first entry of toc must have class "periodical", every descendant
    must have the class required for its depth() (1: article, 2: section,
    3: periodical) and no descendant may have a depth greater than 3. The
    reason for a rejection of a descendant is sent to log.debug() if log is
    not None.
    '''
    if toc.count() < 1 or toc[0].klass != 'periodical':
        return False

    # depth -> (required class, message logged when the class differs)
    rules = {
        1: ('article',
            'Not a periodical: Deepest node does not have class="article"'),
        2: ('section',
            'Not a periodical: Second deepest node does not have class="section"'),
        3: ('periodical',
            'Not a periodical: Third deepest node does not have class="periodical"'),
    }

    def refuse(message):
        if log is not None:
            log.debug(message)
        return False

    for node in toc.iterdescendants():
        depth = node.depth()
        if depth > 3:
            return refuse('Not a periodical: Has nodes of depth > 3')
        rule = rules.get(depth)
        if rule is not None and node.klass != rule[0]:
            return refuse(rule[1])
    return True
def count_set_bits(num):
    '''
    Return the number of 1 bits in the binary representation of the absolute
    value of num.
    '''
    remaining = -num if num < 0 else num
    total = 0
    while remaining > 0:
        # Clearing the lowest set bit once per pass counts the set bits
        remaining &= remaining - 1
        total += 1
    return total
def to_base(num, base=32, min_num_digits=None):
    '''
    Return the integer num written in the given base, using the digits 0-9
    followed by the upper case letters A-Z.

    If min_num_digits is not None the digits are padded on the left with
    zeros up to that many digits. Negative numbers get a leading minus sign,
    which is not counted as a digit.
    '''
    alphabet = string.digits + string.ascii_uppercase
    if num == 0:
        return '0' if min_num_digits is None else '0' * min_num_digits
    negative = num < 0
    magnitude = -num if negative else num
    found = []
    while magnitude:
        magnitude, remainder = divmod(magnitude, base)
        found.append(alphabet[remainder])
    # Digits were produced least significant first
    body = ''.join(reversed(found))
    if min_num_digits is not None:
        body = body.rjust(min_num_digits, '0')
    return ('-' + body) if negative else body
def mobify_image(data):
    '''
    Return data converted to GIF if it is detected to be a PNG image,
    otherwise return it unchanged. According to the original author this is
    needed because the Kindle cannot display some PNG images.
    '''
    if what(None, data) == 'png':
        return png_data_to_gif_data(data)
    return data
# Font records {{{
def read_font_record(data, extent=1040):
    '''
    Return the font encoded in the MOBI FONT record represented by data.
    The return value is a dict with fields raw_data, font_data, err, ext,
    headers and encrypted. Malformed records are reported via err, this
    function does not raise for them.

    :param data: The raw FONT record as a bytestring
    :param extent: The number of obfuscated bytes. So far I have only
        encountered files with 1040 obfuscated bytes. If you encounter an
        obfuscated record for which this function fails, try different extent
        values (easily automated). None means all of the font data is
        obfuscated.

    raw_data is the raw data in the font record
    font_data is the decoded font_data or None if an error occurred
    err is not None if some error occurred
    ext is the font type (ttf for TrueType, otf for OpenType, dat for unknown
    and failed if an error occurred)
    headers is a dict of the decoded header fields from the font record or
    None if decoding the header failed
    encrypted is True if the record is flagged as XOR obfuscated
    '''
    # Format:
    # bytes 0 - 3: 'FONT'
    # bytes 4 - 7: Uncompressed size
    # bytes 8 - 11: flags
    #   bit 1 - zlib compression
    #   bit 2 - XOR obfuscated
    # bytes 12 - 15: offset to start of compressed data
    # bytes 16 - 19: length of XOR string
    # bytes 20 - 23: offset to start of XOR data
    # The zlib compressed data begins with 2 bytes of header and
    # has 4 bytes of checksum at the end
    ans = {'raw_data':data, 'font_data':None, 'err':None, 'ext':'failed',
            'headers':None, 'encrypted':False}

    try:
        usize, flags, dstart, xor_len, xor_start = struct.unpack_from(
                b'>LLLLL', data, 4)
    except (struct.error, TypeError):
        # Record too short (or not binary data at all)
        ans['err'] = 'Failed to read font record header fields'
        return ans
    font_data = data[dstart:]
    ans['headers'] = {'usize':usize, 'flags':bin(flags), 'xor_len':xor_len,
            'xor_start':xor_start, 'dstart':dstart}

    if flags & 0b10:
        # De-obfuscate the data
        ans['encrypted'] = True
        key = bytearray(data[xor_start:xor_start+xor_len])
        buf = bytearray(font_data)
        extent = len(buf) if extent is None else min(extent, len(buf))
        if extent > 0 and (not key or len(key) != xor_len):
            # A missing or truncated key used to surface as a
            # ZeroDivisionError/IndexError from the loop below
            ans['err'] = 'Invalid XOR key in obfuscated font record'
            return ans
        for n in range(extent):
            buf[n] ^= key[n%xor_len]  # XOR of buf and key
        font_data = bytes(buf)

    if flags & 0b1:
        # ZLIB compressed data
        try:
            font_data = zlib.decompress(font_data)
        except Exception as e:
            ans['err'] = 'Failed to zlib decompress font data (%s)'%e
            return ans

        if len(font_data) != usize:
            ans['err'] = 'Uncompressed font size mismatch'
            return ans

    ans['font_data'] = font_data
    sig = font_data[:4]
    ans['ext'] = ('ttf' if sig in {b'\0\1\0\0', b'true', b'ttcf'}
                    else 'otf' if sig == b'OTTO' else 'dat')

    return ans
def write_font_record(data, obfuscate=True, compress=True):
    '''
    Write the ttf/otf font represented by data into a font record and return
    the record as a bytestring. See read_font_record() for details on the
    format of the record.

    :param obfuscate: If True, and the (possibly compressed) data is at least
        1040 bytes long, its first 1040 bytes are XORed with a random 20 byte
        key that is stored in the record.
    :param compress: If True the data is zlib compressed
    '''
    key_len, obfuscated_extent = 20, 1040
    header_format = b'>5L'

    usize = len(data)
    flags = 0
    if compress:
        data = zlib.compress(data, 9)
        flags |= 0b1

    xor_key = b''
    if obfuscate and len(data) >= obfuscated_extent:
        xor_key = os.urandom(key_len)
        flags |= 0b10
        scrambled = bytearray(data)
        for i in range(obfuscated_extent):
            scrambled[i] ^= xor_key[i % key_len]
        data = bytes(scrambled)

    # The key follows the 'FONT' signature and the five header fields, the
    # font data follows the key
    key_start = 4 + struct.calcsize(header_format)
    data_start = key_start + len(xor_key)
    fields = struct.pack(header_format, usize, flags, data_start,
            len(xor_key), key_start)
    return b'FONT' + fields + xor_key + data
# }}}
def create_text_record(text):
    '''
    Return a Palmdoc record of at most RECORD_SIZE bytes read from the
    current position of the file object text. In case the record ends in the
    middle of a multibyte character return the overlap as well.

    Returns data, overlap: where both are byte strings. overlap is the
    extra bytes needed to complete the truncated multibyte character.

    On return text is positioned at the start of the next record, that is
    directly after data (the overlap bytes are not skipped).
    '''
    start = text.tell()
    text.seek(0, 2)
    # next_start is the position of the next record
    next_start = min((start + RECORD_SIZE, text.tell()))

    # Grow a window backwards from the end of this record until it contains
    # at least one decodable utf-8 character
    tail = b''
    while not tail.decode('utf-8', 'ignore'):
        width = len(tail) + 1
        text.seek(next_start - width)
        tail = text.read(width)

    # tail now has one valid utf-8 char and possibly some bytes that belong
    # to a truncated char

    # Number of bytes from the next record needed to complete the last
    # character in this record
    borrowed = 0
    try:
        tail.decode('utf-8', 'strict')
    except UnicodeDecodeError:
        # There are some truncated bytes in tail, extend the window forwards
        # one byte at a time until it decodes cleanly
        base = len(tail)
        complete = False
        while not complete:
            text.seek(next_start - base)
            tail = text.read(len(tail) + 1)
            try:
                tail.decode('utf-8')
            except UnicodeDecodeError:
                continue
            complete = True
        borrowed = len(tail) - base

    text.seek(start)
    data = text.read(RECORD_SIZE)
    overlap = text.read(borrowed)
    text.seek(next_start)
    return data, overlap
class CNCX:  # {{{

    '''
    Create the CNCX records. These are records containing all the strings
    from an index. Each string is stored in the form: <vwi string size><utf-8
    encoded string>, where the string is first truncated to MAX_STRING_LENGTH
    characters.

    After construction, records is the list of records (each padded with
    align_block()) and strings maps every string to its offset. The offset is
    the index of the record that holds the string multiplied by 0x10000, plus
    the position of the string inside that record.
    '''

    MAX_STRING_LENGTH = 500

    def __init__(self, strings=()):
        self.strings = OrderedDict((s, 0) for s in strings)
        self.records = []
        # kindlegen appears to use 1024, PDB limit is 0x10000
        record_limit = 0x10000 - 1024
        pending = bytearray()  # contents of the record being filled
        offset = 0
        for key in self.strings:
            utf8 = utf8_text(key[:self.MAX_STRING_LENGTH])
            entry = encint(len(utf8)) + utf8
            if len(pending) + len(entry) > record_limit:
                # The entry does not fit, close the current record and start
                # a new one
                self.records.append(align_block(bytes(pending)))
                del pending[:]
                offset = len(self.records) * 0x10000
            pending += entry
            self.strings[key] = offset
            offset += len(entry)

        leftover = bytes(pending)
        if leftover:
            self.records.append(align_block(leftover))

    def __getitem__(self, string):
        return self.strings[string]

    def __bool__(self):
        return bool(self.records)
    __nonzero__ = __bool__

    def __len__(self):
        return len(self.records)

# }}}
def is_guide_ref_start(ref):
    '''
    Return a true value if the guide reference ref denotes the start of the
    book: either its title is 'start' or its type is one of 'start',
    'other.start' or 'text' (both compared case insensitively). If the title
    does not match and ref.type is empty, ref.type itself is returned.
    '''
    if ref.title.lower() == 'start':
        return True
    kind = ref.type
    return kind and kind.lower() in {'start', 'other.start', 'text'}
def convert_color_for_font_tag(val):
    '''
    Convert the CSS color val into a #rrggbb string.

    val is parsed with parse_color_string(). If it cannot be parsed, or is
    currentColor, str(val) is returned unchanged. Otherwise the first three
    channels of the parsed color are clamped to the range 0 to 1 and written
    as two lower case hexadecimal digits each; a fourth (alpha) channel, if
    any, is ignored.
    '''
    rgba = parse_color_string(str(val or ''))
    if rgba is None or rgba == 'currentColor':
        return str(val)
    # Channels can lie outside 0..1. The previous min(x, max(0, x), 1) let
    # negative channels through, producing output such as '#-7f0000'.
    channels = (min(1, max(0, channel)) for channel in rgba[:3])
    return '#' + ''.join('%02x' % int(channel * 255) for channel in channels)