#########################################################################
# #
# #
# copyright 2002 Paul Henry Tremblay #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
# General Public License for more details. #
# #
# #
#########################################################################
import os, re
from calibre.ebooks.rtf2xml import copy
from calibre.utils.mreplace import MReplace
from calibre.ptempfile import better_mktemp
from polyglot.builtins import codepoint_to_chr
from . import open_for_read, open_for_write
class Tokenize:
"""Tokenize RTF into one line per field. Each line will contain information useful for the rest of the script"""
def __init__(self,
in_file,
bug_handler,
copy=None,
run_level=1,
# out_file = None,
):
self.__file = in_file
self.__bug_handler = bug_handler
self.__copy = copy
self.__write_to = better_mktemp()
# self.__write_to = out_file
self.__compile_expressions()
# variables
self.__uc_char = 0
self.__uc_bin = False
self.__uc_value = [1]
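        # State for RTF unicode handling: __uc_value is a stack holding the
        # current \ucN skip count for each brace group; __uc_char counts the
        # fallback characters still to be skipped after a \uN escape; and
        # __uc_bin flags that the payload of a \bin control inside that skip
        # range must be dropped as well.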
def __reini_utf8_counters(self):
self.__uc_char = 0
self.__uc_bin = False
def __remove_uc_chars(self, startchar, token):
for i in range(startchar, len(token)):
if self.__uc_char:
self.__uc_char -= 1
else:
return token[i:]
        # the token contained only chars to skip
return ''
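        # For example, with __uc_char == 2, __remove_uc_chars(0, 'abc')
        # consumes 'a' and 'b' as skipped fallback characters and returns 'c'.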
def __unicode_process(self, token):
# change scope in
if token == r'\{':
self.__uc_value.append(self.__uc_value[-1])
# basic error handling
self.__reini_utf8_counters()
return token
# change scope out
elif token == r'\}':
self.__uc_value.pop()
self.__reini_utf8_counters()
return token
# add a uc control
elif token[:3] == '\\uc':
self.__uc_value[-1] = int(token[3:])
self.__reini_utf8_counters()
return token
        # bin data to skip
elif self.__uc_bin:
self.__uc_bin = False
return ''
# uc char to remove
elif self.__uc_char:
# handle \bin tag in case of uc char to skip
            if token[:4] == '\\bin':
                self.__uc_char -= 1
                self.__uc_bin = True
                return ''
            elif token[:1] == "\\":
                self.__uc_char -= 1
return ''
else:
return self.__remove_uc_chars(0, token)
# go for real \u token
match_obj = self.__utf_exp.match(token)
if match_obj is not None:
self.__reini_utf8_counters()
# get value and handle negative case
uni_char = int(match_obj.group(1))
uni_len = len(match_obj.group(0))
if uni_char < 0:
uni_char += 65536
uni_char = codepoint_to_chr(uni_char).encode('ascii', 'xmlcharrefreplace').decode('ascii')
self.__uc_char = self.__uc_value[-1]
            # there is only a unicode char
if len(token)<= uni_len:
return uni_char
            # a unicode char and something else
            # must be after as it is split on \
# necessary? maybe for \bin?
elif not self.__uc_char:
return uni_char + token[uni_len:]
# if not uc0 and chars
else:
return uni_char + self.__remove_uc_chars(uni_len, token)
# default
return token
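        # For example, with the default \uc1 in force, the control-word token
        # '\u21487 ' becomes the XML character reference '&#21487;' and
        # __uc_char is set to 1, so the single fallback character in the
        # following text token is dropped; a negative value such as
        # '\u-3913 ' is mapped to 65536 - 3913 = 61623 first.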
    def __sub_reg_split(self, input_file):
input_file = self.__replace_spchar.mreplace(input_file)
# this is for older RTF
input_file = self.__par_exp.sub(r'\n\\par \n', input_file)
input_file = self.__cwdigit_exp.sub(r"\g<1>\n\g<2>", input_file)
input_file = self.__cs_ast.sub(r"\g<1>", input_file)
input_file = self.__ms_hex_exp.sub(r"\\mshex0\g<1> ", input_file)
input_file = self.__utf_ud.sub(r"\\{\\uc0 \g<1>\\}", input_file)
# remove \n in bin data
input_file = self.__bin_exp.sub(lambda x:
x.group().replace('\n', '') + '\n', input_file)
# split
tokens = re.split(self.__splitexp, input_file)
# remove empty tokens and \n
return list(filter(lambda x: len(x) > 0 and x != '\n', tokens))
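        # For example, '{\rtf1\ansi Hello}' comes back as the token list
        # ['\\{', '\\rtf1', '\\ansi ', 'Hello', '\\}'] (control words keep
        # their delimiting space, literal braces become escaped tokens).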
def __compile_expressions(self):
        SIMPLE_RPL = {
            "\\\\": "\\backslash ",
            "\\~": "\\~ ",
            "\\;": "\\; ",
            "&": "&amp;",
            "<": "&lt;",
            ">": "&gt;",
"\\_": "\\_ ",
"\\:": "\\: ",
"\\-": "\\- ",
# turn into a generic token to eliminate special
# cases and make processing easier
"\\{": "\\ob ",
# turn into a generic token to eliminate special
# cases and make processing easier
"\\}": "\\cb ",
# put a backslash in front of to eliminate special cases and
# make processing easier
"{": "\\{",
# put a backslash in front of to eliminate special cases and
# make processing easier
"}": "\\}",
}
self.__replace_spchar = MReplace(SIMPLE_RPL)
# add ;? in case of char following \u
self.__ms_hex_exp = re.compile(r"\\\'([0-9a-fA-F]{2})")
self.__utf_exp = re.compile(r"\\u(-?\d{3,6}) ?")
self.__bin_exp = re.compile(r"(?:\\bin(-?\d{0,10})[\n ]+)[01\n]+")
# manage upr/ud situations
self.__utf_ud = re.compile(r"\\{[\n ]?\\upr[\n ]?(?:\\{.*?\\})[\n ]?" +
r"\\{[\n ]?\\*[\n ]?\\ud[\n ]?(\\{.*?\\})[\n ]?\\}[\n ]?\\}")
# add \n in split for whole file reading
        # why keep backslash whereas \ is replaced before?
# remove \n from endline char
self.__splitexp = re.compile(r"(\\[{}]|\n|\\[^\s\\{}&]+(?:[ \t\r\f\v])?)")
# this is for old RTF
self.__par_exp = re.compile(r'(\\\n+|\\ )')
        # handle an improper \cs char-style preceded by \* but without an opening {
        self.__cs_ast = re.compile(r'\\\*([\n ]*\\cs\d+[\n \\]+)')
        # handle a control word with a digit argument and no space delimiter
        self.__cwdigit_exp = re.compile(r"(\\[a-zA-Z]+[\-0-9]+)([^0-9 \\]+)")
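        # Sample matches, for reference: __ms_hex_exp matches "\'e9" (rewritten
        # to '\mshex0e9 ' in __sub_reg_split), __utf_exp matches '\u21487 ' or
        # '\u-3913', and __cwdigit_exp splits a control word's numeric argument
        # from text glued to it (e.g. '\fs24Hello' -> '\fs24' + newline + 'Hello').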
    def tokenize(self):
        """Main method for handling the other methods. Reads the file,
        uses __sub_reg_split to make basic substitutions and split into tokens,
        and processes the tokens itself"""
# read
with open_for_read(self.__file) as read_obj:
input_file = read_obj.read()
# process simple replacements and split giving us a correct list
# remove '' and \n in the process
tokens = self.__sub_reg_split(input_file)
# correct unicode
tokens = map(self.__unicode_process, tokens)
# remove empty items created by removing \uc
tokens = list(filter(lambda x: len(x) > 0, tokens))
# write
with open_for_write(self.__write_to) as write_obj:
write_obj.write('\n'.join(tokens))
# Move and copy
copy_obj = copy.Copy(bug_handler=self.__bug_handler)
if self.__copy:
copy_obj.copy_file(self.__write_to, "tokenize.data")
copy_obj.rename(self.__write_to, self.__file)
os.remove(self.__write_to)
# self.__special_tokens = [ '_', '~', "'", '{', '}' ]
# import sys
# def main(args=sys.argv):
# if len(args) < 2:
# print 'No file'
# return
# file = 'data_tokens.txt'
# if len(args) == 3:
# file = args[2]
# to = Tokenize(args[1], Exception, out_file = file)
# to.tokenize()
# if __name__ == '__main__':
# sys.exit(main())
# calibre-debug -e src/calibre/ebooks/rtf2xml/tokenize.py
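# A minimal usage sketch (hypothetical path; tokenize() rewrites the input
# file in place, so point it at a scratch copy):
#
#     from calibre.ebooks.rtf2xml.tokenize import Tokenize
#     tok = Tokenize('/tmp/scratch.rtf', bug_handler=Exception)
#     tok.tokenize()   # '/tmp/scratch.rtf' now holds one token per line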