%PDF- %PDF-
Mini Shell

Mini Shell

Direktori : /lib/calibre/calibre/ebooks/rtf2xml/
Upload File :
Create Path :
Current File : //lib/calibre/calibre/ebooks/rtf2xml/default_encoding.py

#########################################################################
#                                                                       #
#   copyright 2002 Paul Henry Tremblay                                  #
#                                                                       #
#########################################################################

'''
Code pages defined by the RTF 1.9.1 specification:
    437	United States IBM
    708	Arabic (ASMO 708)
    709	Arabic (ASMO 449+, BCON V4)
    710	Arabic (transparent Arabic)
    711	Arabic (Nafitha Enhanced)
    720	Arabic (transparent ASMO)
    819	Windows 3.1 (United States and Western Europe)
    850	IBM multilingual
    852	Eastern European
    860	Portuguese
    862	Hebrew
    863	French Canadian
    864	Arabic
    865	Norwegian
    866	Soviet Union
    874	Thai
    932	Japanese
    936	Simplified Chinese
    949	Korean
    950	Traditional Chinese
    1250	Eastern European
    1251	Cyrillic
    1252	Western European
    1253	Greek
    1254	Turkish
    1255	Hebrew
    1256	Arabic
    1257	Baltic
    1258	Vietnamese
    1361	Johab
    10000	MAC Roman
    10001	MAC Japan
    10004	MAC Arabic
    10005	MAC Hebrew
    10006	MAC Greek
    10007	MAC Cyrillic
    10029	MAC Latin2
    10081	MAC Turkish
    57002	Devanagari
    57003	Bengali
    57004	Tamil
    57005	Telugu
    57006	Assamese
    57007	Oriya
    57008	Kannada
    57009	Malayalam
    57010	Gujarati
    57011	Punjabi
'''
import re
from . import open_for_read


class DefaultEncoding:
    """
    Find the default encoding for the doc

    The platform, the ANSI code page and the default font number are read
    from the RTF header the first time one of the public getters is called;
    later calls reuse the stored values.
    """

    # Note: not all of these encodings are really supported by rtf2xml
    # See http://msdn.microsoft.com/en-us/library/windows/desktop/dd317756%28v=vs.85%29.aspx
    # and src\calibre\gui2\widgets.py for the input list in calibre
    ENCODINGS = {
                # Special cases
                'cp1252':'1252',
                'utf-8':'1252',
                'ascii':'1252',
                # Normal cases
                'big5':'950',
                'cp1250':'1250',
                'cp1251':'1251',
                'cp1253':'1253',
                'cp1254':'1254',
                'cp1255':'1255',
                'cp1256':'1256',
                'shift_jis':'932',
                'gb2312':'936',
                # Not in RTF 1.9.1 codepage specification
                'hz':'52936',
                'iso8859_5':'28595',
                'iso2022_jp':'50222',
                'iso2022_kr':'50225',
                'euc_jp':'51932',
                'euc_kr':'51949',
                'gb18030':'54936',
                }

    # First 16 characters of a tokenised header line -> platform name.
    _PLATFORM_TOKENS = {
        'cw<ri<macintosh_': 'Macintosh',
        'cw<ri<pc________': 'IBMPC',
        'cw<ri<pca_______': 'OS/2',
    }
    # Raw RTF character-set keyword -> platform name.
    _RAW_PLATFORMS = {
        'mac': 'Macintosh',
        'pc': 'IBMPC',
        'pca': 'OS/2',
        'ansi': 'Windows',
    }
    # Code page assumed for a platform when the header declares none.
    # 'Windows' is deliberately absent: it keeps the caller's default.
    _FALLBACK_CODEPAGES = {
        'Macintosh': '10000',
        'IBMPC': '437',
        'OS/2': '850',
    }
    # Compiled once for the class instead of on every scan.
    _RAW_PLATFORM_RE = re.compile(r'\\(mac|pc|ansi|pca)[\\ \{\}\t\n]+')
    _RAW_CODEPAGE_RE = re.compile(r'\\ansicpg(\d+)[\\ \{\}\t\n]+')

    def __init__(self, in_file, bug_handler, default_encoding, run_level=1, check_raw=False):
        """
        in_file -- path of the file to inspect (passed to open_for_read)
        bug_handler -- stored, but not used by the methods of this class
        default_encoding -- key of ENCODINGS giving the code page to assume
            when the file declares none; unknown keys mean '1252'
        run_level -- accepted for interface compatibility, not used here
        check_raw -- if true scan raw RTF text, otherwise scan the
            tokenised form ('cw<ri<...' lines)
        """
        self.__file = in_file
        self.__bug_handler = bug_handler
        self.__platform = 'Windows'
        self.__default_num = 'not-defined'
        self.__code_page = self.ENCODINGS.get(default_encoding, '1252')
        self.__datafetched = False
        self.__fetchraw = check_raw

    def find_default_encoding(self):
        """
        Return the tuple (platform, 'ansicpg' + code page, default font number).
        """
        self.__fetch_once()
        # Built on every call: the original only bound this name on the
        # first call and raised UnboundLocalError afterwards.
        code_page = 'ansicpg' + self.__code_page
        return self.__platform, code_page, self.__default_num

    def get_codepage(self):
        """Return the code page number as a string, e.g. '1252'."""
        self.__fetch_once()
        return self.__code_page

    def get_platform(self):
        """Return 'Windows', 'Macintosh', 'IBMPC' or 'OS/2'."""
        self.__fetch_once()
        return self.__platform

    def __fetch_once(self):
        # Read the file the first time any getter is used.
        if not self.__datafetched:
            self._encoding()
            self.__datafetched = True

    def _encoding(self):
        """Open the input file and update the stored platform/code page."""
        with open_for_read(self.__file) as read_obj:
            self._detect(read_obj)

    def _detect(self, lines):
        """
        Scan an iterable of text lines and update the stored values.

        When no non-zero code page is declared, a non-Windows platform
        selects its conventional code page; for Windows the default chosen
        in __init__ is kept.
        """
        scan = self._scan_raw if self.__fetchraw else self._scan_tokenized
        if not scan(lines):
            fallback = self._FALLBACK_CODEPAGES.get(self.__platform)
            if fallback is not None:
                self.__code_page = fallback

    def _scan_tokenized(self, lines):
        """
        Read tokenised header lines up to the 'mi<mk<rtfhed-end' marker.

        Returns True if a non-zero code page was declared.
        """
        found = False
        for line in lines:
            token = line[:16]
            if token == 'mi<mk<rtfhed-end':
                break
            if token in self._PLATFORM_TOKENS:
                self.__platform = self._PLATFORM_TOKENS[token]
            elif token == 'cw<ri<ansi-codpg':
                # e.g. cw<ri<ansi-codpg<nu<1252 -- value starts at column 20
                value = line[20:-1]
                if int(value):
                    self.__code_page = value
                    # Only a declared code page may suppress the platform
                    # fallback (the flag used to be set by deflt-font).
                    found = True
            elif token == 'cw<ri<deflt-font':
                # e.g. cw<ri<deflt-font<nu<0
                self.__default_num = line[20:-1]
        return found

    def _scan_raw(self, lines):
        r"""
        Look for the character-set keyword and \ansicpgN in raw RTF text.

        Scanning stops at the first \ansicpg.  Returns True if it carried a
        non-zero code page.
        """
        found = False
        platform_seen = False
        for line in lines:
            if not platform_seen:
                # Without \ansicpg the loop runs over the whole file, so
                # only the first keyword is taken as the declaration.
                platform_hit = self._RAW_PLATFORM_RE.search(line)
                if platform_hit:
                    self.__platform = self._RAW_PLATFORMS[platform_hit.group(1)]
                    platform_seen = True
            codepage_hit = self._RAW_CODEPAGE_RE.search(line)
            if codepage_hit:
                value = codepage_hit.group(1)
                # \ansicpg0 carries no information; the test used to be
                # inverted and discarded every real code page.
                if int(value):
                    self.__code_page = value
                    found = True
                break
        return found


if __name__ == '__main__':
    # Manual check: <script> RTF_FILE DEFAULT_ENCODING
    # Prints the code page detected by scanning the raw RTF text.
    import sys
    rtf_path = sys.argv[1]
    fallback_encoding = sys.argv[2]
    detector = DefaultEncoding(
        rtf_path,
        Exception,
        fallback_encoding,
        check_raw=True,
    )
    print(detector.get_codepage())

Zerion Mini Shell 1.0