%PDF- %PDF-
Direktori : /lib/calibre/calibre/ebooks/rtf2xml/ |
Current File : //lib/calibre/calibre/ebooks/rtf2xml/field_strings.py |
#########################################################################
#                                                                       #
#                                                                       #
#   copyright 2002 Paul Henry Tremblay                                  #
#                                                                       #
#   This program is distributed in the hope that it will be useful,     #
#   but WITHOUT ANY WARRANTY; without even the implied warranty of      #
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU   #
#   General Public License for more details.                            #
#                                                                       #
#                                                                       #
#########################################################################
import re
import sys


class FieldStrings:
    """
    Parse a field instruction string.

    Every parsing method returns a list of three values. The last item is
    a string made of the field's readable name followed by zero or more
    '<key>value' pairs; the first two items are None except where a
    method documents otherwise.
    """

    def __init__(self, bug_handler, run_level=1):
        """
        Requires:
            bug_handler -- callable used to build the exception raised for
                unknown fields when run_level is greater than 3.
            run_level -- strictness level (default 1).
        Returns:
            nothing
        """
        self.__run_level = run_level
        self.__bug_handler = bug_handler
        self.__initiate_values()

    def __initiate_values(self):
        """
        Requires:
            nothing.
        Returns:
            nothing.
        Logic:
            Initiate values for the rest of the class.
            self.__field_instruction_dict maps a field name to a tuple of
            (method that parses the field, readable name of the field).
            The remaining attributes are lookup tables and the compiled
            regular expressions used to find switches.
        """
        self.__field_instruction_dict = {
            # number type (arabic, etc.) and number format (\# " ")
            'EDITTIME'      : (self.__num_type_and_format_func, 'editing-time'),
            'NUMCHARS'      : (self.__num_type_and_format_func, 'number-of-characters-in-doc'),
            'NUMPAGES'      : (self.__num_type_and_format_func, 'number-of-pages-in-doc'),
            'NUMWORDS'      : (self.__num_type_and_format_func, 'number-of-words-in-doc'),
            'REVNUM'        : (self.__num_type_and_format_func, 'revision-number'),
            'SECTIONPAGES'  : (self.__num_type_and_format_func, 'num-of-pages-in-section'),
            'SECTION'       : (self.__num_type_and_format_func, 'insert-section-number'),
            'QUOTE'         : (self.__num_type_and_format_func, 'quote'),
            # number formatting (\# "")
            'PAGE'          : (self.__default_inst_func, 'insert-page-number'),
            'page'          : (self.__default_inst_func, 'insert-page-number'),
            # date format (\@ "")
            'CREATEDATE'    : (self.__date_func, 'insert-date'),
            'PRINTDATE'     : (self.__date_func, 'insert-date'),
            # PRINTDATE?
            'SAVEDATE'      : (self.__date_func, 'last-saved'),
            'TIME'          : (self.__date_func, 'insert-time'),
            # numbers?
            # these fields take four switches
            'AUTHOR'        : (self.__simple_info_func, 'user-name'),
            'COMMENTS'      : (self.__simple_info_func, 'comments'),
            'FILENAME'      : (self.__simple_info_func, 'file-name'),
            'filename'      : (self.__simple_info_func, 'file-name'),
            'KEYWORDS'      : (self.__simple_info_func, 'keywords'),
            'LASTSAVEDBY'   : (self.__simple_info_func, 'last-saved-by'),
            'SUBJECT'       : (self.__simple_info_func, 'subject'),
            'TEMPLATE'      : (self.__simple_info_func, 'based-on-template'),
            'TITLE'         : (self.__simple_info_func, 'document-title'),
            'USERADDRESS'   : (self.__simple_info_func, 'user-address'),
            'USERINITIALS'  : (self.__simple_info_func, 'user-initials'),
            'USERNAME'      : (self.__simple_info_func, 'user-name'),
            'EQ'            : (self.__equation_func, 'equation'),
            'HYPERLINK'     : (self.__hyperlink_func, 'hyperlink'),
            'INCLUDEPICTURE': (self.__include_pict_func, 'include-picture'),
            'INCLUDETEXT'   : (self.__include_text_func, 'include-text-from-file'),
            'INDEX'         : (self.__index_func, 'index'),
            'NOTEREF'       : (self.__note_ref_func, 'reference-to-note'),
            'PAGEREF'       : (self.__page_ref_func, 'reference-to-page'),
            'REF'           : (self.__ref_func, 'reference'),
            'ref'           : (self.__ref_func, 'reference'),
            'SEQ'           : (self.__sequence_func, 'numbering-sequence'),
            'SYMBOL'        : (self.__symbol_func, 'symbol'),
            'TA'            : (self.__ta_func, 'anchor-for-table-of-authorities'),
            'TOA'           : (self.__toc_table_func, 'table-of-authorities'),
            'TOC'           : (self.__toc_table_func, 'table-of-contents'),
            # no switches
            'AUTONUMOUT'    : (self.__no_switch_func, 'auto-num-out?'),
            'COMPARE'       : (self.__no_switch_func, 'compare'),
            'DOCVARIABLE'   : (self.__no_switch_func, 'document-variable'),
            'GOTOBUTTON'    : (self.__no_switch_func, 'go-button'),
            'NEXT'          : (self.__no_switch_func, 'next'),
            'NEXTIF'        : (self.__no_switch_func, 'next-if'),
            'SKIPIF'        : (self.__no_switch_func, 'skip-if'),
            'IF'            : (self.__no_switch_func, 'if'),
            'MERGEFIELD'    : (self.__no_switch_func, 'merge-field'),
            'MERGEREC'      : (self.__no_switch_func, 'merge-record'),
            'MERGESEQ'      : (self.__no_switch_func, 'merge-sequence'),
            'PLACEHOLDER'   : (self.__no_switch_func, 'place-holder'),
            'PRIVATE'       : (self.__no_switch_func, 'private'),
            'RD'            : (self.__no_switch_func, 'referenced-document'),
            'SET'           : (self.__no_switch_func, 'set'),
            # default instructions (haven't written a method for them)
            'ADVANCE'       : (self.__default_inst_func, 'advance'),
            'ASK'           : (self.__default_inst_func, 'prompt-user'),
            'AUTONUMLGL'    : (self.__default_inst_func, 'automatic-number'),
            'AUTONUM'       : (self.__default_inst_func, 'automatic-number'),
            'AUTOTEXTLIST'  : (self.__default_inst_func, 'auto-list-text'),
            'AUTOTEXT'      : (self.__default_inst_func, 'auto-text'),
            'BARCODE'       : (self.__default_inst_func, 'barcode'),
            'CONTACT'       : (self.__default_inst_func, 'contact'),
            'DATABASE'      : (self.__default_inst_func, 'database'),
            'DATE'          : (self.__default_inst_func, 'date'),
            'date'          : (self.__default_inst_func, 'date'),
            'DOCPROPERTY'   : (self.__default_inst_func, 'document-property'),
            'FILESIZE'      : (self.__default_inst_func, 'file-size'),
            'FILLIN'        : (self.__default_inst_func, 'fill-in'),
            'INFO'          : (self.__default_inst_func, 'document-info'),
            'LINK'          : (self.__default_inst_func, 'link'),
            'PA'            : (self.__default_inst_func, 'page'),
            'PRINT'         : (self.__default_inst_func, 'print'),
            'STYLEREF'      : (self.__default_inst_func, 'style-reference'),
            'USERPROPERTY'  : (self.__default_inst_func, 'user-property'),
            'FORMCHECKBOX'  : (self.__default_inst_func, 'form-checkbox'),
            'FORMTEXT'      : (self.__default_inst_func, 'form-text'),
            # buttons
            'MACROBUTTON'   : (self.__default_inst_func, 'macro-button'),
        }
        self.__number_dict = {
            'Arabic'        : 'arabic',
            'alphabetic'    : 'alphabetic',
            'ALPHABETIC'    : 'capital-alphabetic',
            'roman'         : 'roman',
            'ROMAN'         : 'capital-roman',
            'Ordinal'       : 'ordinal',
            'CardText'      : 'cardinal-text',
            'OrdText'       : 'ordinal-text',
            'Hex'           : 'hexidecimal',
            'DollarText'    : 'dollar-text',
            'Upper'         : 'upper-case',
            'Lower'         : 'lower-case',
            'FirstCap'      : 'first-cap',
            'Caps'          : 'caps',
        }
        self.__text_format_dict = {
            'Upper'         : 'upper',
            'Lower'         : 'lower',
            'FirstCap'      : 'first-cap',
            'Caps'          : 'caps',
        }
        self.__symbol_num_exp = re.compile(r'SYMBOL (.*?) ')
        self.__symbol_font_exp = re.compile(r'\\f "(.*?)"')
        self.__symbol_size_exp = re.compile(r'\\s (\d+)')
        # self.__toc_figure_exp = re.compile(r'\\c "Figure"')
        # \\@ "dddd, MMMM d, yyyy"
        self.__date_exp = re.compile(r'\\@\s{1,}"(.*?)"')
        # \\# "0.00" -- numeric picture; only the quoted form is recognised
        self.__num_format_exp = re.compile(r'\\#\s{1,}"(.*?)"')
        self.__num_type_exp = re.compile(
            r'\\\*\s{1,}(Arabic|alphabetic|ALPHABETIC|roman|ROMAN|Ordinal|CardText|OrdText|Hex|DollarText|Upper|Lower|FirstCap|Caps)')
        self.__format_text_exp = re.compile(r'\\\*\s{1,}(Upper|Lower|FirstCap|Caps)')
        self.__merge_format_exp = re.compile(r'\\\*\s{1,}MERGEFORMAT')
        self.__ta_short_field_exp = re.compile(r'\\s\s{1,}"(.*?)"')
        self.__ta_long_field_exp = re.compile(r'\\l\s{1,}"(.*?)"')
        self.__ta_category_exp = re.compile(r'\\c\s{1,}(\d+)')
        # indices
        self.__index_insert_blank_line_exp = re.compile(r'\\h\s{1,}""')
        # NOTE(review): the empty group below can only match \h "", which
        # the blank-line expression already catches, so <insert-letter> is
        # never produced with a real letter. Looks like "(.*?)" was meant;
        # left unchanged pending confirmation.
        self.__index_insert_letter_exp = re.compile(r'\\h\s{1,}"()"')
        self.__index_columns_exp = re.compile(r'\\c\s{1,}"(.*?)"')
        self.__bookmark_exp = re.compile(r'\\b\s{1,}(.*?)\s')
        self.__d_separator = re.compile(r'\\d\s{1,}(.*?)\s')
        self.__e_separator = re.compile(r'\\e\s{1,}(.*?)\s')
        self.__l_separator = re.compile(r'\\l\s{1,}(.*?)\s')
        self.__p_separator = re.compile(r'\\p\s{1,}(.*?)\s')
        self.__index_sequence = re.compile(r'\\s\s{1,}(.*?)\s')
        self.__index_entry_typ_exp = re.compile(r'\\f\s{1,}"(.*?)"')
        self.__quote_exp = re.compile(r'"(.*?)"')
        self.__filter_switch = re.compile(r'\\c\s{1,}(.*?)\s')
        # \l switch of HYPERLINK; the value may or may not be quoted.
        # Compiled once here rather than on every hyperlink call.
        self.__link_switch = re.compile(r'\\l\s{1,}"{0,1}(.*?)"{0,1}\s')

    def process_string(self, my_string, type):
        """
        Requires:
            my_string -- the string to parse.
            type -- the type of string (not used by this method).
        Returns:
            A list of three values; the last is the string for a field
            instruction attribute.
        Logic:
            This handles all "large" fields, which means everything except
            toc entries, index entries, and bookmarks.
            Keep only the lines starting with 'tx', drop the first 17
            characters of each, and join what is left. The first
            whitespace-separated word is the field's type. Look the type
            up in the field instruction dictionary and hand the string to
            the matching method.
            If the type is unknown (or there is no text at all), write a
            message to stderr, raise through the bug handler when the run
            level is greater than 3, and otherwise return the fall-back
            value.
        """
        changed_string = ''.join(
            line[17:] for line in my_string.split('\n') if line[0:2] == 'tx'
        )
        fields = changed_string.split()
        # An instruction with no text has no field name; treat it like an
        # unknown field instead of failing with an IndexError.
        field_name = fields[0] if fields else ''
        action, name = self.__field_instruction_dict.get(field_name, (None, None))
        if action is None:
            msg = f'no key for "{field_name}" "{changed_string}"\n'
            sys.stderr.write(msg)
            if self.__run_level > 3:
                raise self.__bug_handler(msg)
            return self.__fall_back_func(field_name, changed_string)
        if re.search(self.__merge_format_exp, changed_string):
            name += '<update>dynamic'
        else:
            name += '<update>static'
        return action(field_name, name, changed_string)

    def __flag_tags(self, line, switches):
        """
        Requires:
            line -- the string to be parsed
            switches -- sequence of (switch text, tag name) pairs
        Returns:
            '<tag>true' for every switch text found in line, concatenated
            in the order the pairs are given.
        """
        return ''.join(
            '<%s>true' % tag for switch, tag in switches if switch in line
        )

    def __bookmark_tags(self, line):
        """
        Requires:
            line -- the string to be parsed
        Returns:
            A tuple of (line with the MERGEFORMAT switch removed, string of
            '<bookmark>word' for every word after the field name that does
            not start with a backslash).
        """
        line = re.sub(self.__merge_format_exp, '', line)
        words = line.split()[1:]  # get rid of field name
        tags = ''.join(
            '<bookmark>%s' % word for word in words if not word.startswith('\\')
        )
        return line, tags

    def __text_format_tag(self, line, method_name):
        """
        Requires:
            line -- the string to be parsed
            method_name -- name of the calling method, for the diagnostic
        Returns:
            '<format>value' when line has a \\* Upper|Lower|FirstCap|Caps
            switch, an empty string otherwise.
        """
        match_group = re.search(self.__format_text_exp, line)
        if not match_group:
            return ''
        found = match_group.group(1)
        changed_name = self.__text_format_dict.get(found)
        if changed_name:
            return '<format>%s' % changed_name
        sys.stderr.write('module is fields_string\n')
        sys.stderr.write('method is %s\n' % method_name)
        sys.stderr.write('no dictionary entry for %s\n' % found)
        return ''

    def __default_inst_func(self, field_name, name, line):
        """
        Requires:
            field_name -- the first word in the string
            name -- the changed name according to the dictionary
            line -- the string to be parsed
        Returns:
            The name of the field.
        Logic:
            I only need the changed name for the field.
        """
        return [None, None, name]

    def __fall_back_func(self, field_name, line):
        """
        Requires:
            field_name -- the first word in the string
            line -- the string to be parsed (not used)
        Returns:
            The name of the field.
        Logic:
            Used for fields not found in dict
        """
        return [None, None, field_name + '<update>none']

    def __equation_func(self, field_name, name, line):
        """
        Required:
            field_name -- the first word in the string
            name -- the changed name according to the dictionary
            line -- the string to be parsed
        Returns:
            The name of the field
        """
        return [None, None, name]

    def __no_switch_func(self, field_name, name, line):
        """
        Required:
            field_name -- the first word in the string
            name -- the changed name according to the dictionary
            line -- the string to be parsed
        Returns:
            The name of the field
        """
        return [None, None, name]

    def __num_type_and_format_func(self, field_name, name, line):
        """
        Required:
            field_name -- the first word in the string
            name -- the changed name according to the dictionary
            line -- the string to be parsed
        Returns:
            list of None, None, and part of a tag
        Logic:
            parse num_format
            parse num_type
            add the quoted argument for QUOTE fields
        """
        the_string = name
        num_format = self.__parse_num_format(line)
        if num_format:
            the_string += '<number-format>%s' % num_format
        num_type = self.__parse_num_type(line)
        if num_type:
            the_string += '<number-type>%s' % num_type
        # Only QUOTE takes a (mandatory?) argument
        if field_name == 'QUOTE':
            match_group = re.search(r'QUOTE\s{1,}"(.*?)"', line)
            if match_group:
                the_string += '<argument>%s' % match_group.group(1)
        return [None, None, the_string]

    def __num_format_func(self, field_name, name, line):
        """
        Required:
            field_name -- the first word in the string
            name -- the changed name according to the dictionary
            line -- the string to be parsed
        Returns:
            list of None, None, and part of a tag
        """
        the_string = name
        num_format = self.__parse_num_format(line)
        if num_format:
            the_string += '<number-format>%s' % num_format
        return [None, None, the_string]

    def __parse_num_format(self, the_string):
        """
        Required:
            the_string -- the string to parse
        Returns:
            the quoted picture of a \\# switch if the_string has one
            None, otherwise
        """
        match_group = re.search(self.__num_format_exp, the_string)
        if match_group:
            return match_group.group(1)
        return None

    def __parse_num_type(self, the_string):
        """
        Required:
            the_string -- the string to parse
        Returns:
            a string if the_string contains number type information
            None, otherwise
        Logic:
            the_string might look like:
            USERNAME \\* Arabic \\* MERGEFORMAT
            Get the \\* Arabic part. Use a dictionary to convert the
            "Arabic" to a more-readable word for the value of the key
            "number-type".
            (<field number-type = "Arabic">
        """
        match_group = re.search(self.__num_type_exp, the_string)
        if not match_group:
            return None
        found = match_group.group(1)
        changed_name = self.__number_dict.get(found)
        if changed_name:
            return changed_name
        sys.stderr.write('module is fields_string\n')
        sys.stderr.write('method is __parse_num_type\n')
        sys.stderr.write('no dictionary entry for %s\n' % found)
        return None

    def __date_func(self, field_name, name, line):
        """
        Required:
            field_name -- the first word in the string
            name -- the changed name according to the dictionary
            line -- the string to be parsed
        Returns:
            list of None, None, and part of a tag
        Logic:
            Add the quoted picture of a \\@ switch as the date format.
        """
        the_string = name
        match_group = re.search(self.__date_exp, line)
        if match_group:
            the_string += '<date-format>%s' % match_group.group(1)
        return [None, None, the_string]

    def __simple_info_func(self, field_name, name, line):
        """
        Required:
            field_name -- the first word in the string
            name -- the changed name according to the dictionary
            line -- the string to be parsed
        Returns:
            The name of the field
        Logic:
            These fields can only have the following switches:
                1. Upper
                2. Lower
                3. FirstCap
                4. Caps
        """
        the_string = name + self.__text_format_tag(line, '__simple_info_func')
        return [None, None, the_string]

    def __hyperlink_func(self, field_name, name, line):
        """
        Required:
            field_name -- the first word in the string
            name -- the changed name according to the dictionary
            line -- the string to be parsed
        Returns:
            The name of the field
        Logic:
            The \\l switch gives the link; the first quoted string left
            once that switch is removed is the argument. The \\m, \\n and
            \\h switches become true/false style flags.
        """
        the_string = name
        match_group = re.search(self.__link_switch, line)
        if match_group:
            link = match_group.group(1).replace('"', '&quot;')
            the_string += '<link>%s' % link
        # \l "txt" "link"
        # want "link" so must get rid of \l "txt"
        line = re.sub(self.__link_switch, '', line)
        match_group = re.search(self.__quote_exp, line)
        if match_group:
            the_string += '<argument>%s' % match_group.group(1)
        the_string += self.__flag_tags(line, (
            ('\\m', 'html2-image-map'),
            ('\\n', 'new-window'),
            ('\\h', 'no-history'),
        ))
        return [None, None, the_string]

    def __include_text_func(self, field_name, name, line):
        """
        Required:
            field_name -- the first word in the string
            name -- the changed name according to the dictionary
            line -- the string to be parsed
        Returns:
            The name of the field
        Logic:
            Add the text format, the \\c filter, the quoted argument left
            once the filter is removed, and the \\! flag.
        """
        the_string = name + self.__text_format_tag(line, '__include_text_func')
        match_group = re.search(self.__filter_switch, line)
        if match_group:
            the_string += '<filter>%s' % match_group.group(1)
        # \c "txt" "file name"
        # want "file name" so must get rid of \c "txt"
        line = re.sub(self.__filter_switch, '', line)
        match_group = re.search(self.__quote_exp, line)
        if match_group:
            arg = match_group.group(1).replace('"', '&quot;')
            the_string += '<argument>%s' % arg
        else:
            sys.stderr.write('Module is field_strings\n')
            sys.stderr.write('method is include_text_func\n')
            sys.stderr.write('no argument for include text\n')
        the_string += self.__flag_tags(line, (('\\!', 'no-field-update'),))
        return [None, None, the_string]

    def __include_pict_func(self, field_name, name, line):
        """
        Required:
            field_name -- the first word in the string
            name -- the changed name according to the dictionary
            line -- the string to be parsed
        Returns:
            The name of the field
        Logic:
            Add the \\c filter, the quoted argument left once the filter
            is removed, and the \\d flag.
        """
        the_string = name
        match_group = re.search(self.__filter_switch, line)
        if match_group:
            arg = match_group.group(1).replace('"', '&quot;')
            the_string += '<filter>%s' % arg
        # \c "txt" "file name"
        # want "file name" so must get rid of \c "txt"
        line = re.sub(self.__filter_switch, '', line)
        match_group = re.search(self.__quote_exp, line)
        if match_group:
            the_string += '<argument>%s' % match_group.group(1)
        else:
            sys.stderr.write('Module is field_strings\n')
            sys.stderr.write('method is include_pict_func\n')
            sys.stderr.write('no argument for include pict\n')
        the_string += self.__flag_tags(line, (('\\d', 'external'),))
        return [None, None, the_string]

    def __ref_func(self, field_name, name, line):
        """
        Requires:
            field_name -- the first word in the string
            name -- the changed name according to the dictionary
            line -- the string to be parsed
        Returns:
            The name of the field.
        Logic:
            A reference field looks like this:
                REF _Ref440880424 \\h
            Every word after the field name that is not a switch is added
            as a bookmark, which is used as an anchor in the resulting XML
            file. The remaining switches become flags.
        """
        the_string = name + self.__text_format_tag(line, '__ref_func')
        line, bookmarks = self.__bookmark_tags(line)
        the_string += bookmarks
        the_string += self.__flag_tags(line, (
            ('\\f', 'include-note-number'),
            ('\\h', 'hyperlink'),
            ('\\n', 'insert-number'),
            ('\\r', 'insert-number-relative'),
            ('\\p', 'paragraph-relative-position'),
            ('\\t', 'suppress-non-delimeter'),
            ('\\w', 'insert-number-full'),
        ))
        return [None, None, the_string]

    def __toc_table_func(self, field_name, name, line):
        """
        Requires:
            field_name -- the name of the first word in the string
            name -- the changed name, according to the dictionary.
            line -- the string to be parsed.
        Returns:
            A list whose first item is the unchanged name and whose last
            item is the string for a TOC table field.
        Logic:
            If the string contains Figure, it is a table of figures.
            Otherwise, it is a plain old table of contents.
        """
        the_string = name
        if '\\c "Figure"' in line:
            the_string = the_string.replace('table-of-contents', 'table-of-figures')
        # don't really need the first value in this list, I don't believe
        return [name, None, the_string]

    def __sequence_func(self, field_name, name, line):
        """
        Requires:
            field_name -- the name of the first word in the string.
            name -- the changed name according to the dictionary.
            line -- the string to parse.
        Returns:
            A string with a value for the label attribute
        Logic:
            The type of sequence--whether figure, graph, my-name, or
            whatever--is represented by the second word in the string.
            Extract and return.
                SEQ Figure \\* ARABIC
            When there is no second word, no label is added.
        """
        fields = line.split()
        if len(fields) < 2:
            return [None, None, name]
        return [None, None, f'{name}<label>{fields[1]}']

    def __ta_func(self, field_name, name, line):
        """
        Requires:
            field_name -- the name of the first word in the string.
            name -- the changed name according to the dictionary.
            line -- the string to parse.
        Returns:
            A string with values for the short field, long field and
            category, plus bold and italics flags.
        """
        the_string = name
        for expression, tag in (
            (self.__ta_short_field_exp, 'short-field'),
            (self.__ta_long_field_exp, 'long-field'),
            (self.__ta_category_exp, 'category'),
        ):
            match_group = re.search(expression, line)
            if match_group:
                the_string += '<%s>%s' % (tag, match_group.group(1))
        the_string += self.__flag_tags(line, (
            ('\\b', 'bold'),
            ('\\i', 'italics'),
        ))
        return [None, None, the_string]

    def __index_func(self, field_name, name, line):
        """
        Requires:
            field_name -- the name of the first word in the string.
            name -- the changed name according to the dictionary.
            line -- the string to parse.
        Returns:
            A string with one '<key>value' pair for each index switch
            found.
        Logic:
            Check the \\h switch first, then each switch that carries a
            value, then the switches that are simple flags. Double quotes
            in separators and in the sequence name are written as &quot;.
        """
        the_string = name
        if re.search(self.__index_insert_blank_line_exp, line):
            the_string += '<insert-blank-line>true'
        else:
            match_group = re.search(self.__index_insert_letter_exp, line)
            if match_group:
                the_string += '<insert-letter>%s' % match_group.group(1)
        # (expression, tag, whether double quotes are escaped)
        valued_switches = (
            (self.__index_columns_exp, 'number-of-columns', False),
            (self.__bookmark_exp, 'use-bookmark', False),
            (self.__d_separator, 'sequence-separator', True),
            (self.__e_separator, 'page-separator', True),
            # The sequence name itself is escaped here; the value of an
            # earlier separator switch must not be touched.
            (self.__index_sequence, 'use-sequence', True),
            (self.__index_entry_typ_exp, 'entry-type', False),
            (self.__p_separator, 'limit-to-letters', False),
            (self.__l_separator, 'multi-page-separator', True),
        )
        for expression, tag, escape_quotes in valued_switches:
            match_group = re.search(expression, line)
            if not match_group:
                continue
            value = match_group.group(1)
            if escape_quotes:
                value = value.replace('"', '&quot;')
            the_string += '<%s>%s' % (tag, value)
        the_string += self.__flag_tags(line, (
            ('\\a', 'accented'),
            ('\\r', 'sub-entry-on-same-line'),
            ('\\t', 'enable-yomi-text'),
        ))
        return [None, None, the_string]

    def __page_ref_func(self, field_name, name, line):
        """
        Requires:
            field_name -- first name in the string.
            name -- the changed name according to the dictionary.
            line -- the string to parse.
        Returns:
            A string with the number format, number type, bookmarks and
            the hyperlink and relative-position flags.
        """
        the_string = name
        num_format = self.__parse_num_format(line)
        if num_format:
            the_string += '<number-format>%s' % num_format
        num_type = self.__parse_num_type(line)
        if num_type:
            the_string += '<number-type>%s' % num_type
        line, bookmarks = self.__bookmark_tags(line)
        the_string += bookmarks
        the_string += self.__flag_tags(line, (
            ('\\h', 'hyperlink'),
            ('\\p', 'paragraph-relative-position'),
        ))
        return [None, None, the_string]

    def __note_ref_func(self, field_name, name, line):
        """
        Requires:
            field_name -- first name in the string.
            name -- the changed name according to the dictionary.
            line -- the string to parse.
        Returns:
            A string with the bookmarks and the hyperlink,
            relative-position and note-number flags.
        """
        line, bookmarks = self.__bookmark_tags(line)
        the_string = name + bookmarks
        the_string += self.__flag_tags(line, (
            ('\\h', 'hyperlink'),
            ('\\p', 'paragraph-relative-position'),
            ('\\f', 'include-note-number'),
        ))
        return [None, None, the_string]

    def __symbol_func(self, field_name, name, line):
        """
        Requires:
            field_name -- first name in the string.
            name -- the changed name according to the dictionary.
            line -- the string to parse.
        Returns:
            A list whose first item is 'Symbol' and whose last item is a
            string containing font style, font size, and a hexadecimal
            value.
        Logic:
            The SYMBOL field is one of Microsoft's many quirky ways of
            entering text. The string that is passed to this method looks
            like this:
                SYMBOL 97 \\f "Symbol" \\s 12
            The first word merely tells us that we have encountered a
            SYMBOL field.
            The next value is the Microsoft decimal value. Change this to
            hexadecimal.
            The pattern '\\f "some font' tells us the font.
            The pattern '\\s some size' tells us the font size.
            Extract all of this information. Store this information in a
            string, and make this string the last item in a list. The
            first item in the list is the simple word 'Symbol', which
            tells me that I don't really have field, but UTF-8 data.
        """
        num = ''
        changed_line = ''
        search_obj = re.search(self.__symbol_num_exp, line)
        if search_obj:
            num = '%X' % int(search_obj.group(1))
        search_obj = re.search(self.__symbol_font_exp, line)
        if search_obj:
            changed_line += 'cw<ci<font-style<nu<%s\n' % search_obj.group(1)
        search_obj = re.search(self.__symbol_size_exp, line)
        if search_obj:
            font_size = '%.2f' % int(search_obj.group(1))
            changed_line += 'cw<ci<font-size_<nu<%s\n' % font_size
        changed_line += 'tx<hx<__________<\'%s\n' % num
        return ['Symbol', None, changed_line]