%PDF- %PDF-
| Direktori : /lib/calibre/calibre/ebooks/rtf2xml/ |
| Current File : //lib/calibre/calibre/ebooks/rtf2xml/hex_2_utf8.py |
#########################################################################
# #
# #
# copyright 2002 Paul Henry Tremblay #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
# General Public License for more details. #
# #
# #
#########################################################################
import sys, os, io
from calibre.ebooks.rtf2xml import get_char_map, copy
from calibre.ebooks.rtf2xml.char_set import char_set
from calibre.ptempfile import better_mktemp
from . import open_for_read, open_for_write
class Hex2Utf8:
"""
Convert Microsoft hexadecimal numbers to utf-8
"""
def __init__(self,
in_file,
area_to_convert,
char_file,
default_char_map,
bug_handler,
invalid_rtf_handler,
copy=None,
temp_dir=None,
symbol=None,
wingdings=None,
caps=None,
convert_caps=None,
dingbats=None,
run_level=1,
):
"""
Required:
'file'
'area_to_convert'--the area of file to convert
'char_file'--the file containing the character mappings
'default_char_map'--name of default character map
Optional:
'copy'-- whether to make a copy of result for debugging
'temp_dir' --where to output temporary results (default is
directory from which the script is run.)
'symbol'--whether to load the symbol character map
'winddings'--whether to load the wingdings character map
'caps'--whether to load the caps character map
'convert_to_caps'--wether to convert caps to utf-8
Returns:
nothing
"""
self.__file = in_file
self.__copy = copy
if area_to_convert not in ('preamble', 'body'):
msg = (
'Developer error! Wrong flag.\n'
'in module "hex_2_utf8.py\n'
'"area_to_convert" must be "body" or "preamble"\n'
)
raise self.__bug_handler(msg)
self.__char_file = char_file
self.__area_to_convert = area_to_convert
self.__default_char_map = default_char_map
self.__symbol = symbol
self.__wingdings = wingdings
self.__dingbats = dingbats
self.__caps = caps
self.__convert_caps = 0
self.__convert_symbol = 0
self.__convert_wingdings = 0
self.__convert_zapf = 0
self.__run_level = run_level
self.__write_to = better_mktemp()
self.__bug_handler = bug_handler
self.__invalid_rtf_handler = invalid_rtf_handler
def update_values(self,
file,
area_to_convert,
char_file,
convert_caps,
convert_symbol,
convert_wingdings,
convert_zapf,
copy=None,
temp_dir=None,
symbol=None,
wingdings=None,
caps=None,
dingbats=None,
):
"""
Required:
'file'
'area_to_convert'--the area of file to convert
'char_file'--the file containing the character mappings
Optional:
'copy'-- whether to make a copy of result for debugging
'temp_dir' --where to output temporary results (default is
directory from which the script is run.)
'symbol'--whether to load the symbol character map
'winddings'--whether to load the wingdings character map
'caps'--whether to load the caps character map
'convert_to_caps'--wether to convert caps to utf-8
Returns:
nothing
"""
self.__file=file
self.__copy = copy
if area_to_convert not in ('preamble', 'body'):
msg = (
'in module "hex_2_utf8.py\n'
'"area_to_convert" must be "body" or "preamble"\n'
)
raise self.__bug_handler(msg)
self.__area_to_convert = area_to_convert
self.__symbol = symbol
self.__wingdings = wingdings
self.__dingbats = dingbats
self.__caps = caps
self.__convert_caps = convert_caps
self.__convert_symbol = convert_symbol
self.__convert_wingdings = convert_wingdings
self.__convert_zapf = convert_zapf
# new!
# no longer try to convert these
# self.__convert_symbol = 0
# self.__convert_wingdings = 0
# self.__convert_zapf = 0
def __initiate_values(self):
"""
Required:
Nothing
Set values, including those for the dictionaries.
The file that contains the maps is broken down into many different
sets. For example, for the Symbol font, there is the standard part for
hexadecimal numbers, and the part for Microsoft characters. Read
each part in, and then combine them.
"""
# the default encoding system, the lower map for characters 0 through
# 128, and the encoding system for Microsoft characters.
# New on 2004-05-8: the self.__char_map is not in directory with other
# modules
self.__char_file = io.StringIO(char_set)
char_map_obj = get_char_map.GetCharMap(
char_file=self.__char_file,
bug_handler=self.__bug_handler,
)
up_128_dict = char_map_obj.get_char_map(map=self.__default_char_map)
bt_128_dict = char_map_obj.get_char_map(map='bottom_128')
ms_standard_dict = char_map_obj.get_char_map(map='ms_standard')
self.__def_dict = {}
self.__def_dict.update(up_128_dict)
self.__def_dict.update(bt_128_dict)
self.__def_dict.update(ms_standard_dict)
self.__current_dict = self.__def_dict
self.__current_dict_name = 'default'
self.__in_caps = 0
self.__special_fonts_found = 0
if self.__symbol:
symbol_base_dict = char_map_obj.get_char_map(map='SYMBOL')
ms_symbol_dict = char_map_obj.get_char_map(map='ms_symbol')
self.__symbol_dict = {}
self.__symbol_dict.update(symbol_base_dict)
self.__symbol_dict.update(ms_symbol_dict)
if self.__wingdings:
wingdings_base_dict = char_map_obj.get_char_map(map='wingdings')
ms_wingdings_dict = char_map_obj.get_char_map(map='ms_wingdings')
self.__wingdings_dict = {}
self.__wingdings_dict.update(wingdings_base_dict)
self.__wingdings_dict.update(ms_wingdings_dict)
if self.__dingbats:
dingbats_base_dict = char_map_obj.get_char_map(map='dingbats')
ms_dingbats_dict = char_map_obj.get_char_map(map='ms_dingbats')
self.__dingbats_dict = {}
self.__dingbats_dict.update(dingbats_base_dict)
self.__dingbats_dict.update(ms_dingbats_dict)
# load dictionary for caps, and make a string for the replacement
self.__caps_uni_dict = char_map_obj.get_char_map(map='caps_uni')
# # print self.__caps_uni_dict
# don't think I'll need this
# keys = self.__caps_uni_dict.keys()
# self.__caps_uni_replace = '|'.join(keys)
self.__preamble_state_dict = {
'preamble' : self.__preamble_func,
'body' : self.__body_func,
'mi<mk<body-open_' : self.__found_body_func,
'tx<hx<__________' : self.__hex_text_func,
}
self.__body_state_dict = {
'preamble' : self.__preamble_for_body_func,
'body' : self.__body_for_body_func,
}
self.__in_body_dict = {
'mi<mk<body-open_' : self.__found_body_func,
'tx<ut<__________' : self.__utf_to_caps_func,
'tx<hx<__________' : self.__hex_text_func,
'tx<mc<__________' : self.__hex_text_func,
'tx<nu<__________' : self.__text_func,
'mi<mk<font______' : self.__start_font_func,
'mi<mk<caps______' : self.__start_caps_func,
'mi<mk<font-end__' : self.__end_font_func,
'mi<mk<caps-end__' : self.__end_caps_func,
}
self.__caps_list = ['false']
self.__font_list = ['not-defined']
def __hex_text_func(self, line):
"""
Required:
'line' -- the line
Logic:
get the hex_num and look it up in the default dictionary. If the
token is in the dictionary, then check if the value starts with a
"&". If it does, then tag the result as utf text. Otherwise, tag it
as normal text.
If the hex_num is not in the dictionary, then a mistake has been
made.
"""
hex_num = line[17:-1]
converted = self.__current_dict.get(hex_num)
if converted is not None:
# tag as utf-8
if converted[0:1] == "&":
font = self.__current_dict_name
if self.__convert_caps\
and self.__caps_list[-1] == 'true'\
and font not in ('Symbol', 'Wingdings', 'Zapf Dingbats'):
converted = self.__utf_token_to_caps_func(converted)
self.__write_obj.write(
'tx<ut<__________<%s\n' % converted
)
# tag as normal text
else:
font = self.__current_dict_name
if self.__convert_caps\
and self.__caps_list[-1] == 'true'\
and font not in ('Symbol', 'Wingdings', 'Zapf Dingbats'):
converted = converted.upper()
self.__write_obj.write(
'tx<nu<__________<%s\n' % converted
)
# error
else:
token = hex_num.replace("'", '')
the_num = 0
if token:
the_num = int(token, 16)
if the_num > 10:
self.__write_obj.write('mi<tg<empty-att_<udef_symbol<num>%s<description>not-in-table\n' %
hex_num)
if self.__run_level > 4:
# msg = 'no dictionary entry for %s\n'
# msg += 'the hexadecimal num is "%s"\n' % (hex_num)
# msg += 'dictionary is %s\n' % self.__current_dict_name
msg = 'Character "&#x%s;" does not appear to be valid (or is a control character)\n' % token
raise self.__bug_handler(msg)
def __found_body_func(self, line):
self.__state = 'body'
self.__write_obj.write(line)
def __body_func(self, line):
"""
When parsing preamble
"""
self.__write_obj.write(line)
def __preamble_func(self, line):
action = self.__preamble_state_dict.get(self.__token_info)
if action is not None:
action(line)
else:
self.__write_obj.write(line)
def __convert_preamble(self):
self.__state = 'preamble'
with open_for_write(self.__write_to) as self.__write_obj:
with open_for_read(self.__file) as read_obj:
for line in read_obj:
self.__token_info = line[:16]
action = self.__preamble_state_dict.get(self.__state)
if action is None:
sys.stderr.write('error no state found in hex_2_utf8',
self.__state
)
action(line)
copy_obj = copy.Copy(bug_handler=self.__bug_handler)
if self.__copy:
copy_obj.copy_file(self.__write_to, "preamble_utf_convert.data")
copy_obj.rename(self.__write_to, self.__file)
os.remove(self.__write_to)
def __preamble_for_body_func(self, line):
"""
Required:
line -- line to parse
Returns:
nothing
Logic:
Used when parsing the body.
"""
if self.__token_info == 'mi<mk<body-open_':
self.__found_body_func(line)
self.__write_obj.write(line)
def __body_for_body_func(self, line):
"""
Required:
line -- line to parse
Returns:
nothing
Logic:
Used when parsing the body.
"""
action = self.__in_body_dict.get(self.__token_info)
if action is not None:
action(line)
else:
self.__write_obj.write(line)
def __start_font_func(self, line):
"""
Required:
line -- line to parse
Returns:
nothing
Logic:
add font face to font_list
"""
face = line[17:-1]
self.__font_list.append(face)
if face == 'Symbol' and self.__convert_symbol:
self.__current_dict_name = 'Symbol'
self.__current_dict = self.__symbol_dict
elif face == 'Wingdings' and self.__convert_wingdings:
self.__current_dict_name = 'Wingdings'
self.__current_dict = self.__wingdings_dict
elif face == 'Zapf Dingbats' and self.__convert_zapf:
self.__current_dict_name = 'Zapf Dingbats'
self.__current_dict = self.__dingbats_dict
else:
self.__current_dict_name = 'default'
self.__current_dict = self.__def_dict
def __end_font_func(self, line):
"""
Required:
line -- line to parse
Returns:
nothing
Logic:
pop font_list
"""
if len(self.__font_list) > 1:
self.__font_list.pop()
else:
sys.stderr.write('module is hex_2_utf8\n')
sys.stderr.write('method is end_font_func\n')
sys.stderr.write('self.__font_list should be greater than one?\n')
face = self.__font_list[-1]
if face == 'Symbol' and self.__convert_symbol:
self.__current_dict_name = 'Symbol'
self.__current_dict = self.__symbol_dict
elif face == 'Wingdings' and self.__convert_wingdings:
self.__current_dict_name = 'Wingdings'
self.__current_dict = self.__wingdings_dict
elif face == 'Zapf Dingbats' and self.__convert_zapf:
self.__current_dict_name = 'Zapf Dingbats'
self.__current_dict = self.__dingbats_dict
else:
self.__current_dict_name = 'default'
self.__current_dict = self.__def_dict
def __start_special_font_func_old(self, line):
"""
Required:
line -- line
Returns;
nothing
Logic:
change the dictionary to use in conversion
"""
# for error checking
if self.__token_info == 'mi<mk<font-symbo':
self.__current_dict.append(self.__symbol_dict)
self.__special_fonts_found += 1
self.__current_dict_name = 'Symbol'
elif self.__token_info == 'mi<mk<font-wingd':
self.__special_fonts_found += 1
self.__current_dict.append(self.__wingdings_dict)
self.__current_dict_name = 'Wingdings'
elif self.__token_info == 'mi<mk<font-dingb':
self.__current_dict.append(self.__dingbats_dict)
self.__special_fonts_found += 1
self.__current_dict_name = 'Zapf Dingbats'
def __end_special_font_func(self, line):
"""
Required:
line --line to parse
Returns:
nothing
Logic:
pop the last dictionary, which should be a special font
"""
if len(self.__current_dict) < 2:
sys.stderr.write('module is hex_2_utf 8\n')
sys.stderr.write('method is __end_special_font_func\n')
sys.stderr.write('less than two dictionaries --can\'t pop\n')
self.__special_fonts_found -= 1
else:
self.__current_dict.pop()
self.__special_fonts_found -= 1
self.__dict_name = 'default'
def __start_caps_func_old(self, line):
"""
Required:
line -- line to parse
Returns:
nothing
Logic:
A marker that marks the start of caps has been found. Set
self.__in_caps to 1
"""
self.__in_caps = 1
def __start_caps_func(self, line):
"""
Required:
line -- line to parse
Returns:
nothing
Logic:
A marker that marks the start of caps has been found. Set
self.__in_caps to 1
"""
self.__in_caps = 1
value = line[17:-1]
self.__caps_list.append(value)
def __end_caps_func(self, line):
"""
Required:
line -- line to parse
Returns:
nothing
Logic:
A marker that marks the end of caps has been found.
set self.__in_caps to 0
"""
if len(self.__caps_list) > 1:
self.__caps_list.pop()
else:
sys.stderr.write('Module is hex_2_utf8\n'
'method is __end_caps_func\n'
'caps list should be more than one?\n') # self.__in_caps not set
def __text_func(self, line):
"""
Required:
line -- line to parse
Returns:
nothing
Logic:
if in caps, convert. Otherwise, print out.
"""
text = line[17:-1]
# print line
if self.__current_dict_name in ('Symbol', 'Wingdings', 'Zapf Dingbats'):
the_string = ''
for letter in text:
hex_num = hex(ord(letter))
hex_num = str(hex_num)
hex_num = hex_num.upper()
hex_num = hex_num[2:]
hex_num = '\'%s' % hex_num
converted = self.__current_dict.get(hex_num)
if converted is None:
sys.stderr.write('module is hex_2_ut8\nmethod is __text_func\n')
sys.stderr.write('no hex value for "%s"\n' % hex_num)
else:
the_string += converted
self.__write_obj.write('tx<nu<__________<%s\n' % the_string)
# print the_string
else:
if self.__caps_list[-1] == 'true' \
and self.__convert_caps\
and self.__current_dict_name not in ('Symbol', 'Wingdings', 'Zapf Dingbats'):
text = text.upper()
self.__write_obj.write('tx<nu<__________<%s\n' % text)
def __utf_to_caps_func(self, line):
"""
Required:
line -- line to parse
returns
nothing
Logic
Get the text, and use another method to convert
"""
utf_text = line[17:-1]
if self.__caps_list[-1] == 'true' and self.__convert_caps:
# utf_text = utf_text.upper()
utf_text = self.__utf_token_to_caps_func(utf_text)
self.__write_obj.write('tx<ut<__________<%s\n' % utf_text)
def __utf_token_to_caps_func(self, char_entity):
"""
Required:
utf_text -- such as &xxx;
Returns:
token converted to the capital equivalent
Logic:
RTF often stores text in the improper values. For example, a
capital umlaut o (?), is stores as ?. This function swaps the
case by looking up the value in a dictionary.
"""
hex_num = char_entity[3:]
length = len(hex_num)
if length == 3:
hex_num = '00%s' % hex_num
elif length == 4:
hex_num = '0%s' % hex_num
new_char_entity = '&#x%s' % hex_num
converted = self.__caps_uni_dict.get(new_char_entity)
if not converted:
# bullets and other entities don't have capital equivalents
return char_entity
else:
return converted
def __convert_body(self):
self.__state = 'body'
with open_for_read(self.__file) as read_obj:
with open_for_write(self.__write_to) as self.__write_obj:
for line in read_obj:
self.__token_info = line[:16]
action = self.__body_state_dict.get(self.__state)
if action is None:
sys.stderr.write('error no state found in hex_2_utf8',
self.__state
)
action(line)
copy_obj = copy.Copy(bug_handler=self.__bug_handler)
if self.__copy:
copy_obj.copy_file(self.__write_to, "body_utf_convert.data")
copy_obj.rename(self.__write_to, self.__file)
os.remove(self.__write_to)
def convert_hex_2_utf8(self):
self.__initiate_values()
if self.__area_to_convert == 'preamble':
self.__convert_preamble()
else:
self.__convert_body()
"""
how to swap case for non-capitals
my_string.swapcase()
An example of how to use a hash for the caps function
(but I shouldn't need this, since utf text is separate
from regular text?)
sub_dict = {
"а" : "some other value"
}
def my_sub_func(matchobj):
info = matchobj.group(0)
value = sub_dict.get(info)
return value
return "f"
line = "а more text"
reg_exp = re.compile(r'(?P<name>а|б)')
line2 = re.sub(reg_exp, my_sub_func, line)
print line2
"""