# %PDF- %PDF-   (stray residue from the file viewer this listing was copied from; not part of the module)
# Directory    : /lib/calibre/calibre/ebooks/rtf2xml/
# Current file : //lib/calibre/calibre/ebooks/rtf2xml/process_tokens.py
#########################################################################
# #
# #
# copyright 2002 Paul Henry Tremblay #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
# General Public License for more details. #
# #
# #
#########################################################################
import os, re
from calibre.ebooks.rtf2xml import copy, check_brackets
from calibre.ptempfile import better_mktemp
from . import open_for_read, open_for_write
class ProcessTokens:
    """
    Process each token on a line and add information that will be useful for
    later processing. Information will be put on one line, delimited by "<"
    for main fields, and ">" for sub fields.

    The input file holds one token per line.  Control words (lines starting
    with a backslash) are looked up in ``self.dict_token`` and rewritten by
    the handler stored there; every other line is written out as text.
    """

    # Single-character control symbols that are looked up in ``dict_token``
    # as-is instead of being split into a name and a numeric argument.
    _SPECIAL_TOKENS = frozenset(('*', ':', '}', '{', '~', '_', '-', ';'))

    def __init__(self,
            in_file,
            exception_handler,
            bug_handler,
            copy=None,
            run_level=1,
            ):
        """
        Store the configuration and build the lookup tables.

        in_file -- path of the tokenized file; it is overwritten with the
            processed output by process_tokens().
        exception_handler -- exception class raised for invalid RTF input.
        bug_handler -- exception class raised for internal inconsistencies
            (only when run_level > 3, except in bool_st_func and
            convert_to_hex, which always raise it).
        copy -- when true, process_tokens() also saves a debugging copy of
            the output as "processed_tokens.data".
        run_level -- strictness level; values above 3 turn recoverable
            problems into bug_handler exceptions.
        """
        self.__file = in_file
        self.__exception_handler = exception_handler
        self.__bug_handler = bug_handler
        self.__copy = copy
        self.__run_level = run_level
        self.__write_to = better_mktemp()
        self.__bracket_count = 0
        self.initiate_token_dict()
        # self.initiate_token_actions()
        self.compile_expressions()

    def compile_expressions(self):
        """Compile the regular expressions used while processing tokens."""
        # control word name (letters) followed by its argument (the rest)
        self.__num_exp = re.compile(r"([a-zA-Z]+)(.*)")
        # an entity such as "&#x2019;" embedded in a run of text
        self.__utf_exp = re.compile(r'(&.*?;)')

    def initiate_token_dict(self):
        """
        Build the lookup tables and reset the return code.

        ``self.dict_token`` maps a control word (without the leading
        backslash) to a tuple ``(category, output name, handler)``.  The
        handler is called as ``handler(category, output_name, argument)`` and
        returns the line(s) to write.
        """
        self.__return_code = 0
        self.dict_token = {
            # unicode
            'mshex': ('nu', '__________', self.__ms_hex_func),
            # brackets
            '{': ('nu', '{', self.ob_func),
            '}': ('nu', '}', self.cb_func),
            # microsoft characters
            'ldblquote': ('mc', 'ldblquote', self.ms_sub_func),
            'rdblquote': ('mc', 'rdblquote', self.ms_sub_func),
            'rquote': ('mc', 'rquote', self.ms_sub_func),
            'lquote': ('mc', 'lquote', self.ms_sub_func),
            'emdash': ('mc', 'emdash', self.ms_sub_func),
            'endash': ('mc', 'endash', self.ms_sub_func),
            'bullet': ('mc', 'bullet', self.ms_sub_func),
            '~': ('mc', '~', self.ms_sub_func),
            'tab': ('mc', 'tab', self.ms_sub_func),
            '_': ('mc', '_', self.ms_sub_func),
            ';': ('mc', ';', self.ms_sub_func),
            # this must be wrong
            '-': ('mc', '-', self.ms_sub_func),
            'line': ('mi', 'hardline-break', self.direct_conv_func),  # calibre
            # misc => ml
            '*': ('ml', 'asterisk__', self.default_func),
            ':': ('ml', 'colon_____', self.default_func),
            # text
            'backslash': ('nu', '\\', self.text_func),
            'ob': ('nu', '{', self.text_func),
            'cb': ('nu', '}', self.text_func),
            # paragraph formatting => pf
            'page': ('pf', 'page-break', self.default_func),
            'par': ('pf', 'par-end___', self.default_func),
            'pard': ('pf', 'par-def___', self.default_func),
            'keepn': ('pf', 'keep-w-nex', self.bool_st_func),
            'widctlpar': ('pf', 'widow-cntl', self.bool_st_func),
            'adjustright': ('pf', 'adjust-rgt', self.bool_st_func),
            'lang': ('pf', 'language__', self.__language_func),
            'ri': ('pf', 'right-inde', self.divide_by_20),
            'fi': ('pf', 'fir-ln-ind', self.divide_by_20),
            'li': ('pf', 'left-inden', self.divide_by_20),
            'sb': ('pf', 'space-befo', self.divide_by_20),
            'sa': ('pf', 'space-afte', self.divide_by_20),
            'sl': ('pf', 'line-space', self.divide_by_20),
            'deftab': ('pf', 'default-ta', self.divide_by_20),
            'ql': ('pf', 'align_____<left', self.two_part_func),
            'qc': ('pf', 'align_____<cent', self.two_part_func),
            'qj': ('pf', 'align_____<just', self.two_part_func),
            'qr': ('pf', 'align_____<right', self.two_part_func),
            'nowidctlpar': ('pf', 'widow-cntr<false', self.two_part_func),
            'tx': ('pf', 'tab-stop__', self.divide_by_20),
            'tb': ('pf', 'tab-bar-st', self.divide_by_20),
            'tqr': ('pf', 'tab-right_', self.default_func),
            'tqdec': ('pf', 'tab-dec___', self.default_func),
            'tqc': ('pf', 'tab-center', self.default_func),
            'tlul': ('pf', 'leader-und', self.default_func),
            'tlhyph': ('pf', 'leader-hyp', self.default_func),
            'tldot': ('pf', 'leader-dot', self.default_func),
            # stylesheet = > ss
            'stylesheet': ('ss', 'style-shet', self.default_func),
            'sbasedon': ('ss', 'based-on__', self.default_func),
            'snext': ('ss', 'next-style', self.default_func),
            'cs': ('ss', 'char-style', self.default_func),
            's': ('ss', 'para-style', self.default_func),
            # graphics => gr
            'pict': ('gr', 'picture___', self.default_func),
            'objclass': ('gr', 'obj-class_', self.default_func),
            'macpict': ('gr', 'mac-pic___', self.default_func),
            # section => sc
            'sect': ('sc', 'section___', self.default_func),
            'sectd': ('sc', 'sect-defin', self.default_func),
            'endhere': ('sc', 'sect-note_', self.default_func),
            # list=> ls
            'pntext': ('ls', 'list-text_', self.default_func),
            # this line must be wrong because it duplicates an earlier one
            'listtext': ('ls', 'list-text_', self.default_func),
            'pn': ('ls', 'list______', self.default_func),
            'pnseclvl': ('ls', 'list-level', self.default_func),
            'pncard': ('ls', 'list-cardi', self.bool_st_func),
            'pndec': ('ls', 'list-decim', self.bool_st_func),
            'pnucltr': ('ls', 'list-up-al', self.bool_st_func),
            'pnucrm': ('ls', 'list-up-ro', self.bool_st_func),
            'pnord': ('ls', 'list-ord__', self.bool_st_func),
            'pnordt': ('ls', 'list-ordte', self.bool_st_func),
            'pnlvlblt': ('ls', 'list-bulli', self.bool_st_func),
            'pnlvlbody': ('ls', 'list-simpi', self.bool_st_func),
            'pnlvlcont': ('ls', 'list-conti', self.bool_st_func),
            'pnhang': ('ls', 'list-hang_', self.bool_st_func),
            'pntxtb': ('ls', 'list-tebef', self.bool_st_func),
            'ilvl': ('ls', 'list-level', self.default_func),
            'ls': ('ls', 'list-id___', self.default_func),
            'pnstart': ('ls', 'list-start', self.default_func),
            'itap': ('ls', 'nest-level', self.default_func),
            'leveltext': ('ls', 'level-text', self.default_func),
            'levelnumbers': ('ls', 'level-numb', self.default_func),
            'list': ('ls', 'list-in-tb', self.default_func),
            'listlevel': ('ls', 'list-tb-le', self.default_func),
            'listname': ('ls', 'list-name_', self.default_func),
            'listtemplateid': ('ls', 'ls-tem-id_', self.default_func),
            'leveltemplateid': ('ls', 'lv-tem-id_', self.default_func),
            'listhybrid': ('ls', 'list-hybri', self.default_func),
            'levelstartat': ('ls', 'level-star', self.default_func),
            'levelspace': ('ls', 'level-spac', self.divide_by_20),
            'levelindent': ('ls', 'level-inde', self.default_func),
            'levelnfc': ('ls', 'level-type', self.__list_type_func),
            'levelnfcn': ('ls', 'level-type', self.__list_type_func),
            'listid': ('ls', 'lis-tbl-id', self.default_func),
            'listoverride': ('ls', 'lis-overid', self.default_func),
            # duplicate
            'pnlvl': ('ls', 'list-level', self.default_func),
            # root info => ri
            'rtf': ('ri', 'rtf_______', self.default_func),
            'deff': ('ri', 'deflt-font', self.default_func),
            'mac': ('ri', 'macintosh_', self.default_func),
            'pc': ('ri', 'pc________', self.default_func),
            'pca': ('ri', 'pca_______', self.default_func),
            'ansi': ('ri', 'ansi______', self.default_func),
            'ansicpg': ('ri', 'ansi-codpg', self.default_func),
            # notes => nt
            'footnote': ('nt', 'footnote__', self.default_func),
            'ftnalt': ('nt', 'type______<endnote', self.two_part_func),
            # anchor => an
            'tc': ('an', 'toc_______', self.default_func),
            'bkmkstt': ('an', 'book-mk-st', self.default_func),
            'bkmkstart': ('an', 'book-mk-st', self.default_func),
            'bkmkend': ('an', 'book-mk-en', self.default_func),
            'xe': ('an', 'index-mark', self.default_func),
            'rxe': ('an', 'place_____', self.default_func),
            # index => in
            'bxe': ('in', 'index-bold', self.default_func),
            'ixe': ('in', 'index-ital', self.default_func),
            'txe': ('in', 'index-see_', self.default_func),
            # table of contents => tc
            'tcl': ('tc', 'toc-level_', self.default_func),
            'tcn': ('tc', 'toc-sup-nu', self.default_func),
            # field => fd
            'field': ('fd', 'field_____', self.default_func),
            'fldinst': ('fd', 'field-inst', self.default_func),
            'fldrslt': ('fd', 'field-rslt', self.default_func),
            'datafield': ('fd', 'datafield_', self.default_func),
            # info-tables => it
            'fonttbl': ('it', 'font-table', self.default_func),
            'colortbl': ('it', 'colr-table', self.default_func),
            'listoverridetable': ('it', 'lovr-table', self.default_func),
            'listtable': ('it', 'listtable_', self.default_func),
            'revtbl': ('it', 'revi-table', self.default_func),
            # character info => ci
            'b': ('ci', 'bold______', self.bool_st_func),
            'blue': ('ci', 'blue______', self.color_func),
            'caps': ('ci', 'caps______', self.bool_st_func),
            'cf': ('ci', 'font-color', self.colorz_func),
            'chftn': ('ci', 'footnot-mk', self.bool_st_func),
            'dn': ('ci', 'font-down_', self.divide_by_2),
            'embo': ('ci', 'emboss____', self.bool_st_func),
            'f': ('ci', 'font-style', self.default_func),
            'fs': ('ci', 'font-size_', self.divide_by_2),
            'green': ('ci', 'green_____', self.color_func),
            'i': ('ci', 'italics___', self.bool_st_func),
            'impr': ('ci', 'engrave___', self.bool_st_func),
            'outl': ('ci', 'outline___', self.bool_st_func),
            'plain': ('ci', 'plain_____', self.bool_st_func),
            'red': ('ci', 'red_______', self.color_func),
            'scaps': ('ci', 'small-caps', self.bool_st_func),
            'shad': ('ci', 'shadow____', self.bool_st_func),
            'strike': ('ci', 'strike-thr', self.bool_st_func),
            'striked': ('ci', 'dbl-strike', self.bool_st_func),
            'sub': ('ci', 'subscript_', self.bool_st_func),
            'super': ('ci', 'superscrip', self.bool_st_func),
            'nosupersub': ('ci', 'no-su-supe', self.__no_sup_sub_func),
            'up': ('ci', 'font-up___', self.divide_by_2),
            'v': ('ci', 'hidden____', self.default_func),
            # underline
            # can't see why it isn't a char info: 'ul'=>'ci'
            'ul': ('ci', 'underlined<continous', self.two_part_func),
            'uld': ('ci', 'underlined<dotted', self.two_part_func),
            'uldash': ('ci', 'underlined<dash', self.two_part_func),
            'uldashd': ('ci', 'underlined<dash-dot', self.two_part_func),
            'uldashdd': ('ci', 'underlined<dash-dot-dot', self.two_part_func),
            'uldb': ('ci', 'underlined<double', self.two_part_func),
            'ulhwave': ('ci', 'underlined<heavy-wave', self.two_part_func),
            'ulldash': ('ci', 'underlined<long-dash', self.two_part_func),
            'ulth': ('ci', 'underlined<thich', self.two_part_func),
            'ulthd': ('ci', 'underlined<thick-dotted', self.two_part_func),
            'ulthdash': ('ci', 'underlined<thick-dash', self.two_part_func),
            'ulthdashd': ('ci', 'underlined<thick-dash-dot', self.two_part_func),
            'ulthdashdd': ('ci', 'underlined<thick-dash-dot-dot', self.two_part_func),
            'ulthldash': ('ci', 'underlined<thick-long-dash', self.two_part_func),
            'ululdbwave': ('ci', 'underlined<double-wave', self.two_part_func),
            'ulw': ('ci', 'underlined<word', self.two_part_func),
            'ulwave': ('ci', 'underlined<wave', self.two_part_func),
            'ulnone': ('ci', 'underlined<false', self.two_part_func),
            # table => tb
            'trowd': ('tb', 'row-def___', self.default_func),
            'cell': ('tb', 'cell______', self.default_func),
            'row': ('tb', 'row_______', self.default_func),
            'intbl': ('tb', 'in-table__', self.default_func),
            'cols': ('tb', 'columns___', self.default_func),
            'trleft': ('tb', 'row-pos-le', self.divide_by_20),
            'cellx': ('tb', 'cell-posit', self.divide_by_20),
            'trhdr': ('tb', 'row-header', self.default_func),
            # preamble => pr
            # document information => di
            # TODO integrate \userprops
            'info': ('di', 'doc-info__', self.default_func),
            'title': ('di', 'title_____', self.default_func),
            'author': ('di', 'author____', self.default_func),
            'operator': ('di', 'operator__', self.default_func),
            'manager': ('di', 'manager___', self.default_func),
            'company': ('di', 'company___', self.default_func),
            'keywords': ('di', 'keywords__', self.default_func),
            'category': ('di', 'category__', self.default_func),
            'doccomm': ('di', 'doc-notes_', self.default_func),
            'comment': ('di', 'doc-notes_', self.default_func),
            'subject': ('di', 'subject___', self.default_func),
            'creatim': ('di', 'create-tim', self.default_func),
            'yr': ('di', 'year______', self.default_func),
            'mo': ('di', 'month_____', self.default_func),
            'dy': ('di', 'day_______', self.default_func),
            'min': ('di', 'minute____', self.default_func),
            'sec': ('di', 'second____', self.default_func),
            'revtim': ('di', 'revis-time', self.default_func),
            'edmins': ('di', 'edit-time_', self.default_func),
            'printim': ('di', 'print-time', self.default_func),
            'buptim': ('di', 'backuptime', self.default_func),
            'nofwords': ('di', 'num-of-wor', self.default_func),
            'nofchars': ('di', 'num-of-chr', self.default_func),
            'nofcharsws': ('di', 'numofchrws', self.default_func),
            'nofpages': ('di', 'num-of-pag', self.default_func),
            'version': ('di', 'version___', self.default_func),
            'vern': ('di', 'intern-ver', self.default_func),
            'hlinkbase': ('di', 'linkbase__', self.default_func),
            'id': ('di', 'internalID', self.default_func),
            # headers and footers => hf
            'headerf': ('hf', 'head-first', self.default_func),
            'headerl': ('hf', 'head-left_', self.default_func),
            'headerr': ('hf', 'head-right', self.default_func),
            'footerf': ('hf', 'foot-first', self.default_func),
            'footerl': ('hf', 'foot-left_', self.default_func),
            'footerr': ('hf', 'foot-right', self.default_func),
            'header': ('hf', 'header____', self.default_func),
            'footer': ('hf', 'footer____', self.default_func),
            # page => pa
            'margl': ('pa', 'margin-lef', self.divide_by_20),
            'margr': ('pa', 'margin-rig', self.divide_by_20),
            'margb': ('pa', 'margin-bot', self.divide_by_20),
            'margt': ('pa', 'margin-top', self.divide_by_20),
            'gutter': ('pa', 'gutter____', self.divide_by_20),
            'paperw': ('pa', 'paper-widt', self.divide_by_20),
            'paperh': ('pa', 'paper-hght', self.divide_by_20),
            # annotation => an
            'annotation': ('an', 'annotation', self.default_func),
            # border => bd
            'trbrdrh': ('bd', 'bor-t-r-hi', self.default_func),
            'trbrdrv': ('bd', 'bor-t-r-vi', self.default_func),
            'trbrdrt': ('bd', 'bor-t-r-to', self.default_func),
            'trbrdrl': ('bd', 'bor-t-r-le', self.default_func),
            'trbrdrb': ('bd', 'bor-t-r-bo', self.default_func),
            'trbrdrr': ('bd', 'bor-t-r-ri', self.default_func),
            'clbrdrb': ('bd', 'bor-cel-bo', self.default_func),
            'clbrdrt': ('bd', 'bor-cel-to', self.default_func),
            'clbrdrl': ('bd', 'bor-cel-le', self.default_func),
            'clbrdrr': ('bd', 'bor-cel-ri', self.default_func),
            'brdrb': ('bd', 'bor-par-bo', self.default_func),
            'brdrt': ('bd', 'bor-par-to', self.default_func),
            'brdrl': ('bd', 'bor-par-le', self.default_func),
            'brdrr': ('bd', 'bor-par-ri', self.default_func),
            'box': ('bd', 'bor-par-bx', self.default_func),
            'chbrdr': ('bd', 'bor-par-bo', self.default_func),
            'brdrbtw': ('bd', 'bor-for-ev', self.default_func),
            'brdrbar': ('bd', 'bor-outsid', self.default_func),
            'brdrnone': ('bd', 'bor-none__<false', self.two_part_func),
            # border type => bt
            'brdrs': ('bt', 'bdr-single', self.default_func),
            'brdrth': ('bt', 'bdr-doubtb', self.default_func),
            'brdrsh': ('bt', 'bdr-shadow', self.default_func),
            'brdrdb': ('bt', 'bdr-double', self.default_func),
            'brdrdot': ('bt', 'bdr-dotted', self.default_func),
            'brdrdash': ('bt', 'bdr-dashed', self.default_func),
            'brdrhair': ('bt', 'bdr-hair__', self.default_func),
            'brdrinset': ('bt', 'bdr-inset_', self.default_func),
            'brdrdashsm': ('bt', 'bdr-das-sm', self.default_func),
            'brdrdashd': ('bt', 'bdr-dot-sm', self.default_func),
            'brdrdashdd': ('bt', 'bdr-dot-do', self.default_func),
            'brdroutset': ('bt', 'bdr-outset', self.default_func),
            'brdrtriple': ('bt', 'bdr-trippl', self.default_func),
            'brdrtnthsg': ('bt', 'bdr-thsm__', self.default_func),
            'brdrthtnsg': ('bt', 'bdr-htsm__', self.default_func),
            'brdrtnthtnsg': ('bt', 'bdr-hthsm_', self.default_func),
            'brdrtnthmg': ('bt', 'bdr-thm___', self.default_func),
            'brdrthtnmg': ('bt', 'bdr-htm___', self.default_func),
            'brdrtnthtnmg': ('bt', 'bdr-hthm__', self.default_func),
            'brdrtnthlg': ('bt', 'bdr-thl___', self.default_func),
            'brdrtnthtnlg': ('bt', 'bdr-hthl__', self.default_func),
            'brdrwavy': ('bt', 'bdr-wavy__', self.default_func),
            'brdrwavydb': ('bt', 'bdr-d-wav_', self.default_func),
            'brdrdashdotstr': ('bt', 'bdr-strip_', self.default_func),
            'brdremboss': ('bt', 'bdr-embos_', self.default_func),
            'brdrengrave': ('bt', 'bdr-engra_', self.default_func),
            'brdrframe': ('bt', 'bdr-frame_', self.default_func),
            'brdrw': ('bt', 'bdr-li-wid', self.divide_by_20),
            'brsp': ('bt', 'bdr-sp-wid', self.divide_by_20),
            'brdrcf': ('bt', 'bdr-color_', self.default_func),
            # comments
            # 'comment' : ('cm', 'comment___', self.default_func),
        }
        # Argument of \levelnfc / \levelnfcn => description of the list
        # numbering style.  The keys 12, 13, 20 and 21 used to be written as
        # 1246, 1346, 2046 and 2146, which made those styles unreachable.
        self.__number_type_dict = {
            0: 'Arabic',
            1: 'uppercase Roman numeral',
            2: 'lowercase Roman numeral',
            3: 'uppercase letter',
            4: 'lowercase letter',
            5: 'ordinal number',
            6: 'cardianl text number',
            7: 'ordinal text number',
            10: 'Kanji numbering without the digit character',
            11: 'Kanji numbering with the digit character',
            12: 'phonetic Katakana characters in aiueo order',
            13: 'phonetic katakana characters in iroha order',
            14: 'double byte character',
            15: 'single byte character',
            16: 'Kanji numbering 3',
            17: 'Kanji numbering 4',
            18: 'Circle numbering',
            19: 'double-byte Arabic numbering',
            20: 'phonetic double-byte Katakana characters',
            21: 'phonetic double-byte katakana characters',
            22: 'Arabic with leading zero',
            23: 'bullet',
            24: 'Korean numbering 2',
            25: 'Korean numbering 1',
            26: 'Chinese numbering 1',
            27: 'Chinese numbering 2',
            28: 'Chinese numbering 3',
            29: 'Chinese numbering 4',
            30: 'Chinese Zodiac numbering 1',
            31: 'Chinese Zodiac numbering 2',
            32: 'Chinese Zodiac numbering 3',
            33: 'Taiwanese double-byte numbering 1',
            34: 'Taiwanese double-byte numbering 2',
            35: 'Taiwanese double-byte numbering 3',
            36: 'Taiwanese double-byte numbering 4',
            37: 'Chinese double-byte numbering 1',
            38: 'Chinese double-byte numbering 2',
            39: 'Chinese double-byte numbering 3',
            40: 'Chinese double-byte numbering 4',
            41: 'Korean double-byte numbering 1',
            42: 'Korean double-byte numbering 2',
            43: 'Korean double-byte numbering 3',
            44: 'Korean double-byte numbering 4',
            45: 'Hebrew non-standard decimal',
            46: 'Arabic Alif Ba Tah',
            47: 'Hebrew Biblical standard',
            48: 'Arabic Abjad style',
            255: 'No number',
        }
        # Argument of \lang => language name.
        self.__language_dict = {
            1078: 'Afrikaans',
            1052: 'Albanian',
            1025: 'Arabic',
            5121: 'Arabic Algeria',
            15361: 'Arabic Bahrain',
            3073: 'Arabic Egypt',
            1: 'Arabic General',
            2049: 'Arabic Iraq',
            11265: 'Arabic Jordan',
            13313: 'Arabic Kuwait',
            12289: 'Arabic Lebanon',
            4097: 'Arabic Libya',
            6145: 'Arabic Morocco',
            8193: 'Arabic Oman',
            16385: 'Arabic Qatar',
            10241: 'Arabic Syria',
            7169: 'Arabic Tunisia',
            14337: 'Arabic U.A.E.',
            9217: 'Arabic Yemen',
            1067: 'Armenian',
            1101: 'Assamese',
            2092: 'Azeri Cyrillic',
            1068: 'Azeri Latin',
            1069: 'Basque',
            1093: 'Bengali',
            4122: 'Bosnia Herzegovina',
            1026: 'Bulgarian',
            1109: 'Burmese',
            1059: 'Byelorussian',
            1027: 'Catalan',
            2052: 'Chinese China',
            4: 'Chinese General',
            3076: 'Chinese Hong Kong',
            4100: 'Chinese Singapore',
            1028: 'Chinese Taiwan',
            1050: 'Croatian',
            1029: 'Czech',
            1030: 'Danish',
            2067: 'Dutch Belgium',
            1043: 'Dutch Standard',
            3081: 'English Australia',
            10249: 'English Belize',
            2057: 'English British',
            4105: 'English Canada',
            9225: 'English Caribbean',
            9: 'English General',
            6153: 'English Ireland',
            8201: 'English Jamaica',
            5129: 'English New Zealand',
            13321: 'English Philippines',
            7177: 'English South Africa',
            11273: 'English Trinidad',
            1033: 'English United States',
            1061: 'Estonian',
            1080: 'Faerose',
            1065: 'Farsi',
            1035: 'Finnish',
            1036: 'French',
            2060: 'French Belgium',
            11276: 'French Cameroon',
            3084: 'French Canada',
            12300: 'French Cote d\'Ivoire',
            5132: 'French Luxembourg',
            13324: 'French Mali',
            6156: 'French Monaco',
            8204: 'French Reunion',
            10252: 'French Senegal',
            4108: 'French Swiss',
            7180: 'French West Indies',
            9228: 'French Democratic Republic of the Congo',
            1122: 'Frisian',
            1084: 'Gaelic',
            2108: 'Gaelic Ireland',
            1110: 'Galician',
            1079: 'Georgian',
            1031: 'German',
            3079: 'German Austrian',
            5127: 'German Liechtenstein',
            4103: 'German Luxembourg',
            2055: 'German Switzerland',
            1032: 'Greek',
            1095: 'Gujarati',
            1037: 'Hebrew',
            1081: 'Hindi',
            1038: 'Hungarian',
            1039: 'Icelandic',
            1057: 'Indonesian',
            1040: 'Italian',
            2064: 'Italian Switzerland',
            1041: 'Japanese',
            1099: 'Kannada',
            1120: 'Kashmiri',
            2144: 'Kashmiri India',
            1087: 'Kazakh',
            1107: 'Khmer',
            1088: 'Kirghiz',
            1111: 'Konkani',
            1042: 'Korean',
            2066: 'Korean Johab',
            1108: 'Lao',
            1062: 'Latvian',
            1063: 'Lithuanian',
            2087: 'Lithuanian Classic',
            1086: 'Malay',
            2110: 'Malay Brunei Darussalam',
            1100: 'Malayalam',
            1082: 'Maltese',
            1112: 'Manipuri',
            1102: 'Marathi',
            1104: 'Mongolian',
            1121: 'Nepali',
            2145: 'Nepali India',
            1044: 'Norwegian Bokmal',
            2068: 'Norwegian Nynorsk',
            1096: 'Oriya',
            1045: 'Polish',
            1046: 'Portuguese (Brazil)',
            2070: 'Portuguese (Portugal)',
            1094: 'Punjabi',
            1047: 'Rhaeto-Romanic',
            1048: 'Romanian',
            2072: 'Romanian Moldova',
            1049: 'Russian',
            2073: 'Russian Moldova',
            1083: 'Sami Lappish',
            1103: 'Sanskrit',
            3098: 'Serbian Cyrillic',
            2074: 'Serbian Latin',
            1113: 'Sindhi',
            1051: 'Slovak',
            1060: 'Slovenian',
            1070: 'Sorbian',
            11274: 'Spanish Argentina',
            16394: 'Spanish Bolivia',
            13322: 'Spanish Chile',
            9226: 'Spanish Colombia',
            5130: 'Spanish Costa Rica',
            7178: 'Spanish Dominican Republic',
            12298: 'Spanish Ecuador',
            17418: 'Spanish El Salvador',
            4106: 'Spanish Guatemala',
            18442: 'Spanish Honduras',
            2058: 'Spanish Mexico',
            3082: 'Spanish Modern',
            19466: 'Spanish Nicaragua',
            6154: 'Spanish Panama',
            15370: 'Spanish Paraguay',
            10250: 'Spanish Peru',
            20490: 'Spanish Puerto Rico',
            1034: 'Spanish Traditional',
            14346: 'Spanish Uruguay',
            8202: 'Spanish Venezuela',
            1072: 'Sutu',
            1089: 'Swahili',
            1053: 'Swedish',
            2077: 'Swedish Finland',
            1064: 'Tajik',
            1097: 'Tamil',
            1092: 'Tatar',
            1098: 'Telugu',
            1054: 'Thai',
            1105: 'Tibetan',
            1073: 'Tsonga',
            1074: 'Tswana',
            1055: 'Turkish',
            1090: 'Turkmen',
            1058: 'Ukranian',
            1056: 'Urdu',
            2080: 'Urdu India',
            2115: 'Uzbek Cyrillic',
            1091: 'Uzbek Latin',
            1075: 'Venda',
            1066: 'Vietnamese',
            1106: 'Welsh',
            1076: 'Xhosa',
            1085: 'Yiddish',
            1077: 'Zulu',
            1024: 'Unkown',
            255: 'Unkown',
        }
        # Disabled entries, kept for reference (they were previously parked
        # in a bare string literal at the end of this method):
        #
        # unknown
        # These must get passed on because they occurred after \*
        # 'do' : ('un', 'unknown___', self.default_func),
        # 'company' : ('un', 'company___', self.default_func),
        # 'shpinst' : ('un', 'unknown___', self.default_func),
        # 'panose' : ('un', 'unknown___', self.default_func),
        # 'falt' : ('un', 'unknown___', self.default_func),
        # 'listoverridetable' : ('un', 'unknown___', self.default_func),
        # 'category' : ('un', 'unknown___', self.default_func),
        # 'template' : ('un', 'unknown___', self.default_func),
        # 'ud' : ('un', 'unknown___', self.default_func),
        # 'formfield' : ('un', 'unknown___', self.default_func),
        # 'ts' : ('un', 'unknown___', self.default_func),
        # 'rsidtbl' : ('un', 'unknown___', self.default_func),
        # 'generator' : ('un', 'unknown___', self.default_func),
        # 'ftnsep' : ('un', 'unknown___', self.default_func),
        # 'aftnsep' : ('un', 'unknown___', self.default_func),
        # 'aftnsepc' : ('un', 'unknown___', self.default_func),
        # 'aftncn' : ('un', 'unknown___', self.default_func),
        # 'objclass' : ('un', 'unknown___', self.default_func),
        # 'objdata' : ('un', 'unknown___', self.default_func),
        # 'picprop' : ('un', 'unknown___', self.default_func),
        # 'blipuid' : ('un', 'unknown___', self.default_func),

    def __ms_hex_func(self, pre, token, num):
        """Return the text line for a hex-escaped character (``mshex``)."""
        num = num[1:]  # chop off leading 0, which I added
        num = num.upper()  # the mappings store hex in caps
        return 'tx<hx<__________<\'%s\n' % num  # add an ' for the mappings

    def ms_sub_func(self, pre, token, num):
        """Return the text line for a Microsoft special character."""
        return 'tx<mc<__________<%s\n' % token

    def direct_conv_func(self, pre, token, num):
        """Return the line for a control word converted to an empty tag."""
        return 'mi<tg<empty_____<%s\n' % token

    def default_func(self, pre, token, num):
        """Return a control-word line; a missing argument becomes 'true'."""
        if num is None:
            num = 'true'
        return f'cw<{pre}<{token}<nu<{num}\n'

    def colorz_func(self, pre, token, num):
        """Return a control-word line; a missing argument becomes '0'."""
        if num is None:
            num = '0'
        return f'cw<{pre}<{token}<nu<{num}\n'

    def __list_type_func(self, pre, token, num):
        """
        Translate the numeric list type into its description.

        A missing, non-numeric or unknown argument yields 'Arabic'; with
        run_level > 3 the latter two raise the bug handler instead.
        """
        list_type = 'Arabic'
        if num is not None:
            try:
                num = int(num)
            except ValueError:
                if self.__run_level > 3:
                    msg = 'Number "%s" cannot be converted to integer\n' % num
                    raise self.__bug_handler(msg)
            list_type = self.__number_type_dict.get(num)
            if list_type is None:
                if self.__run_level > 3:
                    msg = 'No type for "%s" in self.__number_type_dict\n' % num
                    raise self.__bug_handler(msg)
                list_type = 'Arabic'
        return f'cw<{pre}<{token}<nu<{list_type}\n'

    def __language_func(self, pre, token, num):
        """
        Translate the numeric language id into a language name.

        A missing, non-numeric or unknown id yields "not defined"; with
        run_level > 3 it raises the bug handler instead.
        """
        lang_name = None
        # num is None for a bare \lang and may be the integer 0 coming from
        # split_let_num(); only a string can contain the digits we need.
        match_obj = re.search('[0-9]+', num) if isinstance(num, str) else None
        if match_obj is not None:
            lang_name = self.__language_dict.get(int(match_obj.group()))
        if not lang_name:
            lang_name = "not defined"
            if self.__run_level > 3:
                msg = 'No entry for number "%s"' % num
                raise self.__bug_handler(msg)
        return f'cw<{pre}<{token}<nu<{lang_name}\n'

    def two_part_func(self, pre, token, num):
        """
        Handle entries whose output name is stored as "name<value".

        The argument found in the RTF is ignored; the value comes from the
        dictionary entry itself.
        """
        token, _, num = token.partition("<")
        return f'cw<{pre}<{token}<nu<{num}\n'
        # return 'cw<nu<nu<nu<%s>num<%s\n' % (token, num)

    def divide_by_2(self, pre, token, num):
        """Return a control-word line with the argument divided by 2."""
        num = self.divide_num(num, 2)
        return f'cw<{pre}<{token}<nu<{num}\n'
        # return 'cw<nu<nu<nu<%s>%s<%s\n' % (token, num, token)

    def divide_by_20(self, pre, token, num):
        """Return a control-word line with the argument divided by 20."""
        num = self.divide_num(num, 20)
        return f'cw<{pre}<{token}<nu<{num}\n'
        # return 'cw<nu<nu<nu<%s>%s<%s\n' % (token, num, token)

    def text_func(self, pre, token, num=None):
        """Return a plain text line containing the output name."""
        return 'tx<nu<__________<%s\n' % token

    def ob_func(self, pre, token, num=None):
        """Return an open-bracket line numbered with the new nesting depth."""
        self.__bracket_count += 1
        return 'ob<nu<open-brack<%04d\n' % self.__bracket_count

    def cb_func(self, pre, token, num=None):
        """Return a close-bracket line numbered with the depth being closed."""
        line = 'cb<nu<clos-brack<%04d\n' % self.__bracket_count
        self.__bracket_count -= 1
        return line

    def color_func(self, pre, token, num):
        """
        Return a colour component line with the value as two hex digits.

        A trailing ";" on the argument marks the third field as 'en'
        instead of 'nu'.  A missing or empty argument is treated as 0.
        """
        third_field = 'nu'
        num = '0' if num is None else str(num)
        if num.endswith(';'):
            num = num[:-1]
            third_field = 'en'
        # pad to at least two hex digits; never lengthen a longer value
        num = '%02X' % (int(num) if num else 0)
        return f'cw<{pre}<{token}<{third_field}<{num}\n'
        # return 'cw<cl<%s<nu<nu<%s>%s<%s\n' % (third_field, token, num, token)

    def bool_st_func(self, pre, token, num):
        """
        Return a true/false control-word line.

        No argument, an empty argument or '1' mean true; '0' means false.
        Anything else raises the bug handler.
        """
        if num is None or num == '' or num == '1':
            return f'cw<{pre}<{token}<nu<true\n'
            # return 'cw<nu<nu<nu<%s>true<%s\n' % (token, token)
        elif num == '0':
            return f'cw<{pre}<{token}<nu<false\n'
            # return 'cw<nu<nu<nu<%s>false<%s\n' % (token, token)
        else:
            msg = f"boolean should have some value module process tokens\ntoken is {token}\n'{num}'\n"
            raise self.__bug_handler(msg)

    def __no_sup_sub_func(self, pre, token, num):
        """Return two lines that switch off both subscript and superscript."""
        the_string = 'cw<ci<subscript_<nu<false\n'
        the_string += 'cw<ci<superscrip<nu<false\n'
        return the_string

    def divide_num(self, numerator, denominator):
        """
        Divide the number found in *numerator* by *denominator*.

        Returns the quotient as a string with two decimals.  When
        *numerator* holds no usable number, the return code is raised to 5
        and 0 is returned (or, with run_level > 3, the bug handler is
        raised).
        """
        try:
            # calibre why ignore negative number? Wrong in case of \fi
            numerator = float(re.search(r'[0-9.\-]+', numerator).group())
        except (TypeError, AttributeError, ValueError):
            # TypeError: no argument at all; AttributeError: no digits in it;
            # ValueError: matched text such as "-" or "1.2.3" is not a number
            if self.__run_level > 3:
                msg = ('No number to process?\nthis indicates that the token \\(\\li\\) '
                       'should have a number and does not\nnumerator is '
                       '"%s"\ndenominator is "%s"\n') % (numerator, denominator)
                raise self.__bug_handler(msg)
            if 5 > self.__return_code:
                self.__return_code = 5
            return 0
        return '%0.2f' % round(numerator/denominator, 2)

    def split_let_num(self, token):
        """
        Split a control word into its name and its argument.

        Returns ``(name, argument)``; the argument is the integer 0 when the
        token has no argument or contains no letters at all (with
        run_level > 3 those cases raise the bug handler instead).
        """
        match_obj = self.__num_exp.search(token)
        if match_obj is None:
            if self.__run_level > 3:
                msg = "token is '%s' \n" % token
                raise self.__bug_handler(msg)
            return token, 0
        first, second = match_obj.group(1), match_obj.group(2)
        if not second:
            if self.__run_level > 3:
                msg = "token is '%s' \n" % token
                raise self.__bug_handler(msg)
            return first, 0
        return first, second

    def convert_to_hex(self, number):
        """
        Convert a decimal number (string or int) to uppercase hexadecimal.

        Raises the bug handler when *number* cannot be read as an integer.
        """
        try:
            return "%X" % int(number)
        except (TypeError, ValueError):
            msg = 'Number "%s" cannot be converted to hexadecimal\n' % (number,)
            raise self.__bug_handler(msg)

    def process_cw(self, token):
        """Change the value of the control word by determining what dictionary
        it belongs to.

        Returns the line(s) produced by the handler, or None when the control
        word is not in ``self.dict_token``.
        """
        # if token != "{" or token != "}":
        token = token[1:]  # strip off leading \
        token = token.replace(" ", "")
        # if not token: return
        num = None
        if not token.isalpha() and token not in self._SPECIAL_TOKENS:
            token, num = self.split_let_num(token)
        pre, token, action = self.dict_token.get(token, (None, None, None))
        if action:
            return action(pre, token, num)
        return None

    def __check_brackets(self, in_file):
        """Return 1 when the brackets in *in_file* do not match, else 0."""
        self.__check_brack_obj = check_brackets.CheckBrackets(file=in_file)
        good_br = self.__check_brack_obj.check_brackets()[0]
        if not good_br:
            return 1
        return 0

    def process_tokens(self):
        """
        Main method for handling other methods.

        Reads the token file line by line, writes the processed lines to a
        temporary file and then replaces the input file with it.  Returns the
        return code (0, or 5 when a numeric argument was missing).  Raises the
        exception handler when the input is not valid RTF.
        """
        line_count = 0
        with open_for_read(self.__file) as read_obj:
            with open_for_write(self.__write_to) as write_obj:
                for line in read_obj:
                    token = line.replace("\n", "")
                    line_count += 1
                    if line_count == 1 and token != '\\{':
                        msg = '\nInvalid RTF: document doesn\'t start with {\n'
                        raise self.__exception_handler(msg)
                    elif line_count == 2 and token[0:4] != '\\rtf':
                        msg = '\nInvalid RTF: document doesn\'t start with \\rtf \n'
                        raise self.__exception_handler(msg)

                    if '\\ ' in token:
                        msg = '\nInvalid RTF: token "\\ " not valid.\nError at line %d'\
                            % line_count
                        raise self.__exception_handler(msg)
                    elif token[:1] == "\\":
                        line = self.process_cw(token)
                        if line is not None:
                            write_obj.write(line)
                    else:
                        # plain text: entities (&...;) get their own lines
                        fields = re.split(self.__utf_exp, token)
                        for field in fields:
                            if not field:
                                continue
                            if field[0:1] == '&':
                                write_obj.write('tx<ut<__________<%s\n' % field)
                            else:
                                write_obj.write('tx<nu<__________<%s\n' % field)

        if not line_count:
            msg = '\nInvalid RTF: file appears to be empty.\n'
            raise self.__exception_handler(msg)

        copy_obj = copy.Copy(bug_handler=self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "processed_tokens.data")
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)

        bad_brackets = self.__check_brackets(self.__file)
        if bad_brackets:
            msg = '\nInvalid RTF: document does not have matching brackets.\n'
            raise self.__exception_handler(msg)
        else:
            return self.__return_code