%PDF- %PDF-
| Direktori : /lib/calibre/calibre/ebooks/rtf2xml/ |
| Current File : //lib/calibre/calibre/ebooks/rtf2xml/default_encoding.py |
#########################################################################
# #
# copyright 2002 Paul Henry Tremblay #
# #
#########################################################################
'''
Code pages according to the RTF 1.9.1 specification:
437 United States IBM
708 Arabic (ASMO 708)
709 Arabic (ASMO 449+, BCON V4)
710 Arabic (transparent Arabic)
711 Arabic (Nafitha Enhanced)
720 Arabic (transparent ASMO)
819 Windows 3.1 (United States and Western Europe)
850 IBM multilingual
852 Eastern European
860 Portuguese
862 Hebrew
863 French Canadian
864 Arabic
865 Norwegian
866 Soviet Union
874 Thai
932 Japanese
936 Simplified Chinese
949 Korean
950 Traditional Chinese
1250 Eastern European
1251 Cyrillic
1252 Western European
1253 Greek
1254 Turkish
1255 Hebrew
1256 Arabic
1257 Baltic
1258 Vietnamese
1361 Johab
10000 MAC Roman
10001 MAC Japan
10004 MAC Arabic
10005 MAC Hebrew
10006 MAC Greek
10007 MAC Cyrillic
10029 MAC Latin2
10081 MAC Turkish
57002 Devanagari
57003 Bengali
57004 Tamil
57005 Telugu
57006 Assamese
57007 Oriya
57008 Kannada
57009 Malayalam
57010 Gujarati
57011 Punjabi
'''
import re
from . import open_for_read
class DefaultEncoding:
    """
    Find the default encoding for the doc.

    The header of the input file is scanned once, lazily, on the first call
    to find_default_encoding(), get_codepage() or get_platform().  Three
    values are collected:

    * the platform: 'Windows' (the default), 'Macintosh', 'IBMPC' or 'OS/2';
    * the code page number, as a string such as '1252';
    * the default font number ('not-defined' unless a default-font token is
      seen; it is only looked for in tokenized mode).

    Precedence for the code page: an explicit non-zero code page declared in
    the file wins; otherwise a non-Windows platform selects its own code page
    (Macintosh 10000, IBMPC 437, OS/2 850); otherwise the value derived from
    the ``default_encoding`` constructor argument is kept.
    """

    # Note: not all of these encodings are really supported by rtf2xml
    # See http://msdn.microsoft.com/en-us/library/windows/desktop/dd317756%28v=vs.85%29.aspx
    # and src\calibre\gui2\widgets.py for the input list in calibre
    ENCODINGS = {
        # Special cases
        'cp1252': '1252',
        'utf-8': '1252',
        'ascii': '1252',
        # Normal cases
        'big5': '950',
        'cp1250': '1250',
        'cp1251': '1251',
        'cp1253': '1253',
        'cp1254': '1254',
        'cp1255': '1255',
        'cp1256': '1256',
        'shift_jis': '932',
        'gb2312': '936',
        # Not in RTF 1.9.1 codepage specification
        'hz': '52936',
        'iso8859_5': '28595',
        'iso2022_jp': '50222',
        'iso2022_kr': '50225',
        'euc_jp': '51932',
        'euc_kr': '51949',
        'gb18030': '54936',
    }

    # Tokenized-mode tokens (first 16 characters of a line) that declare the
    # platform.
    _TOKEN_PLATFORMS = {
        'cw<ri<macintosh_': 'Macintosh',
        'cw<ri<pc________': 'IBMPC',
        'cw<ri<pca_______': 'OS/2',
    }

    # Raw-mode character-set keywords and the platform each one declares.
    _RAW_PLATFORMS = {
        'ansi': 'Windows',
        'mac': 'Macintosh',
        'pc': 'IBMPC',
        'pca': 'OS/2',
    }

    # Code page implied by a platform when the file does not name one.
    # 'Windows' is deliberately absent: it keeps the constructor default.
    _PLATFORM_CODE_PAGES = {
        'Macintosh': '10000',
        'IBMPC': '437',
        'OS/2': '850',
    }

    # Raw-mode patterns, compiled once instead of on every scan.
    _RAW_CHARSET_RE = re.compile(r'\\(mac|pc|ansi|pca)[\\ \{\}\t\n]+')
    _RAW_CODE_PAGE_RE = re.compile(r'\\ansicpg(\d+)[\\ \{\}\t\n]+')

    def __init__(self, in_file, bug_handler, default_encoding, run_level=1, check_raw=False):
        """
        in_file          -- path handed to open_for_read() when scanning
        bug_handler      -- stored, but not used by this class
        default_encoding -- key into ENCODINGS; unknown keys fall back to '1252'
        run_level        -- accepted for interface compatibility; not used here
        check_raw        -- True to scan raw RTF text, False to scan the
                            tokenized intermediate lines
        """
        self.__file = in_file
        self.__bug_handler = bug_handler
        self.__platform = 'Windows'
        self.__default_num = 'not-defined'
        self.__code_page = self.ENCODINGS.get(default_encoding, '1252')
        self.__datafetched = False
        self.__fetchraw = check_raw

    def find_default_encoding(self):
        """Return (platform, 'ansicpg' + code page, default font number)."""
        self._ensure_fetched()
        return self.__platform, 'ansicpg' + self.__code_page, self.__default_num

    def get_codepage(self):
        """Return the code page number as a string, without any prefix."""
        self._ensure_fetched()
        return self.__code_page

    def get_platform(self):
        """Return the platform name found in the file ('Windows' by default)."""
        self._ensure_fetched()
        return self.__platform

    def _ensure_fetched(self):
        """Scan the input file the first time any result is requested."""
        if not self.__datafetched:
            self._encoding()
            self.__datafetched = True

    @staticmethod
    def _is_code_page(text):
        """Return True if text is a positive integer; a value of 0 means 'not set'."""
        try:
            return int(text) > 0
        except ValueError:
            # A malformed value is ignored rather than aborting the scan.
            return False

    @staticmethod
    def _token_value(line):
        """Return the value part of a tokenized line (everything from column 20)."""
        # rstrip instead of [:-1] so that a final line lacking a newline does
        # not lose its last character.
        return line[20:].rstrip('\n')

    def _encoding(self):
        """Scan the file and update platform, code page and default font."""
        with open_for_read(self.__file) as read_obj:
            if self.__fetchraw:
                explicit_code_page = self._scan_raw(read_obj)
            else:
                explicit_code_page = self._scan_tokens(read_obj)
        if not explicit_code_page:
            # No usable code page in the file: let a non-Windows platform
            # choose one, otherwise keep the constructor default.
            fallback = self._PLATFORM_CODE_PAGES.get(self.__platform)
            if fallback is not None:
                self.__code_page = fallback

    def _scan_tokens(self, lines):
        """
        Scan tokenized lines up to the end-of-header marker.

        Returns True only if a valid explicit code page was found.
        """
        explicit_code_page = False
        for line in lines:
            token = line[:16]
            if token == 'mi<mk<rtfhed-end':
                break
            platform = self._TOKEN_PLATFORMS.get(token)
            if platform is not None:
                self.__platform = platform
            elif token == 'cw<ri<ansi-codpg':
                value = self._token_value(line)
                if self._is_code_page(value):
                    self.__code_page = value
                    # Record the code page itself as found; the default-font
                    # token says nothing about the encoding.
                    explicit_code_page = True
            elif token == 'cw<ri<deflt-font':
                # e.g. cw<ri<deflt-font<nu<0
                self.__default_num = self._token_value(line)
        return explicit_code_page

    def _scan_raw(self, lines):
        """
        Scan raw RTF text for the character-set keyword and \\ansicpgN.

        Scanning stops at the first \\ansicpg occurrence.  Returns True only
        if that occurrence carries a valid (non-zero) code page.
        """
        charset = None
        for line in lines:
            if charset is None:
                found = self._RAW_CHARSET_RE.search(line)
                if found:
                    # Keep the first declaration and mirror it into the
                    # platform so the platform fallback can actually apply.
                    charset = found.group(1)
                    self.__platform = self._RAW_PLATFORMS[charset]
            found = self._RAW_CODE_PAGE_RE.search(line)
            if found:
                code_page = found.group(1)
                if self._is_code_page(code_page):
                    self.__code_page = code_page
                    return True
                return False
        return False
if __name__ == '__main__':
    # Command-line helper: print the code page detected by scanning the raw
    # text of the given file.
    #   argv[1] -- path of the file to scan
    #   argv[2] -- fallback encoding name (looked up in DefaultEncoding.ENCODINGS)
    import sys
    rtf_path, fallback_encoding = sys.argv[1], sys.argv[2]
    detector = DefaultEncoding(rtf_path, Exception, fallback_encoding, check_raw=True)
    print(detector.get_codepage())