%PDF- %PDF-
| Direktori : /lib/calibre/calibre/ebooks/compression/ |
| Current File : //lib/calibre/calibre/ebooks/compression/tcr.py |
__license__ = 'GPL 3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import re
from polyglot.builtins import int_to_byte
class TCRCompressor:
    '''
    TCR compression takes the form header+code_dict+coded_text.
    The header is always "!!8-Bit!!". The code dict is a list of 256 strings.
    Each entry takes the form of a 1 byte length followed by the string itself.
    Each position in the list corresponds to a code found in the file. The
    coded text is a string of byte values; for instance the character Q
    represents the value 81, which corresponds to the string in the code list
    at position 81.

    Invariant kept by every method: concatenating ``self.codes[c]`` for each
    byte ``c`` of ``self.coded_txt`` reproduces the original input exactly.
    '''

    # The length of a dictionary entry is stored in a single byte, so no code
    # may ever expand to more than this many bytes.
    MAX_CODE_LEN = 255

    def _reset(self):
        '''
        Return the compressor to its initial, empty state.
        '''
        # Indexes in the codes list that are empty and can hold new codes.
        self.unused_codes = set()
        self.coded_txt = b''
        # The index of the list is the code that represents the bytes stored
        # at that location in the list.
        self.codes = []

    def _combine_codes(self):
        '''
        Combine two codes that always appear as a pair into a single code.
        The intent is to create more unused codes.

        A code A is merged with B only when every occurrence of A in the coded
        text is immediately followed by B. That means A is not the final byte,
        A differs from B, and the merged string still fits in a dictionary
        entry.
        '''
        txt = self.coded_txt
        if len(txt) < 2:
            return

        # Every distinct adjacent pair (overlapping ones included), grouped by
        # its first code.
        followers = {}
        for first, second in set(zip(txt, txt[1:])):
            followers.setdefault(first, set()).add(second)

        # A code that ends the text has one occurrence with no follower, so
        # folding a follower into it would invent data on decompression.
        last = txt[-1]

        # Merging (A, B) only deletes the B bytes that follow an A. The
        # followers of any other code are therefore untouched, and the sets
        # computed above stay valid for the rest of this pass.
        for first in sorted(followers):
            nexts = followers[first]
            if first == last or len(nexts) != 1:
                continue
            second = next(iter(nexts))
            if second == first:
                continue
            merged = self.codes[first] + self.codes[second]
            if len(merged) > self.MAX_CODE_LEN:
                continue
            shrunk = self.coded_txt.replace(bytes((first, second)), bytes((first,)))
            if len(shrunk) == len(self.coded_txt):
                # An earlier merge in this pass already removed every
                # occurrence of this code; leave its dictionary entry alone.
                continue
            self.coded_txt = shrunk
            self.codes[first] = merged

    def _free_unused_codes(self):
        '''
        Look for codes that do not appear in the coded text and add them to
        the set of free codes.
        '''
        self.unused_codes.update(set(range(256)).difference(self.coded_txt))

    def _new_codes(self):
        '''
        Create new codes from codes that occur in pairs often.

        Returns a list of two byte strings ordered from least to most
        occurring. Pairs whose combined expansion would not fit in a
        dictionary entry are left out.
        '''
        txt = self.coded_txt
        counted = []
        for pair in set(re.findall(b'(?ms)..', txt)):
            if len(self.codes[pair[0]]) + len(self.codes[pair[1]]) > self.MAX_CODE_LEN:
                continue
            count = txt.count(pair)
            # Less than 3 occurrences will not produce any size reduction.
            if count > 2:
                counted.append((count, pair))
        # Sorting on (count, pair) keeps the order reproducible between runs.
        counted.sort()
        return [pair for count, pair in counted]

    def compress(self, txt):
        '''
        Compress the bytes ``txt`` and return the complete TCR file contents
        (header, code dictionary and coded text) as bytes.
        '''
        self._reset()
        txt = bytes(txt)

        # Generate the initial codes: one per distinct byte value of the input.
        symbols = sorted(set(txt))
        self.codes = [bytes((symbol,)) for symbol in symbols]

        # Replace every input byte with its code in a single pass.
        table = bytearray(256)
        for code, symbol in enumerate(symbols):
            table[symbol] = code
        self.coded_txt = txt.translate(bytes(table))

        # Zero the unused codes and record which are unused.
        for i in range(len(self.codes), 256):
            self.codes.append(b'')
            self.unused_codes.add(i)

        self._combine_codes()
        # Combining may have emptied some codes; make them available at once.
        self._free_unused_codes()
        possible_codes = self._new_codes()

        # Each outer round replaces at least one pair that occurs three or
        # more times, so the coded text strictly shrinks and the loop ends.
        while possible_codes and self.unused_codes:
            while possible_codes and self.unused_codes:
                unused_code = self.unused_codes.pop()
                # The last possible code is the most often occurring one.
                pair = possible_codes.pop()
                self.codes[unused_code] = self.codes[pair[0]] + self.codes[pair[1]]
                self.coded_txt = self.coded_txt.replace(pair, bytes((unused_code,)))
            self._combine_codes()
            self._free_unused_codes()
            possible_codes = self._new_codes()

        self._free_unused_codes()

        # Generate the code dictionary.
        code_dict = []
        for i in range(256):
            if i in self.unused_codes:
                code_dict.append(b'\0')
            else:
                code_dict.append(bytes((len(self.codes[i]),)) + self.codes[i])

        # Join the identifier with the dictionary and coded text.
        return b'!!8-Bit!!' + b''.join(code_dict) + self.coded_txt
def decompress(stream):
    '''
    Decompress a TCR file and return its contents as bytes.

    ``stream`` is a seekable binary file-like object positioned anywhere; it
    is rewound before reading.

    Raises ValueError if the header is not "!!8-Bit!!" or if the code
    dictionary ends before all 256 entries have been read.
    '''
    # In-memory streams such as BytesIO have no name; use a placeholder so
    # that reporting an error cannot itself fail.
    source = getattr(stream, 'name', '<stream>')

    stream.seek(0)
    if stream.read(9) != b'!!8-Bit!!':
        raise ValueError('File %s contains an invalid TCR header.' % source)

    # Codes that the file contents are broken down into. Each entry is stored
    # as a 1 byte length followed by that many bytes.
    entries = []
    for _ in range(256):
        size = stream.read(1)
        if not size:
            raise ValueError('File %s contains a truncated TCR code dictionary.' % source)
        entry = stream.read(ord(size))
        if len(entry) != ord(size):
            raise ValueError('File %s contains a truncated TCR code dictionary.' % source)
        entries.append(entry)

    # Map the values in the file to locations in the string list. The rest of
    # the stream is read in one call instead of one byte at a time.
    return b''.join(entries[code] for code in bytearray(stream.read()))
def compress(txt):
    '''
    Compress the bytes ``txt`` into TCR format using a fresh TCRCompressor
    and return the resulting bytes.
    '''
    return TCRCompressor().compress(txt)