%PDF- %PDF-
Direktori : /lib/calibre/calibre/ebooks/rtf2xml/ |
Current File : //lib/calibre/calibre/ebooks/rtf2xml/table.py |
#########################################################################
#                                                                       #
#                                                                       #
#   copyright 2002 Paul Henry Tremblay                                  #
#                                                                       #
#   This program is distributed in the hope that it will be useful,     #
#   but WITHOUT ANY WARRANTY; without even the implied warranty of      #
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU   #
#   General Public License for more details.                            #
#                                                                       #
#                                                                       #
#########################################################################
import os
import sys
from collections import Counter

from calibre.ebooks.rtf2xml import copy, border_parse
from calibre.ptempfile import better_mktemp

from . import open_for_read, open_for_write

"""
States.
1. 'not_in_table'
    1. 'cw<tb<row-def___' start a row definition
    2. 'mi<mk<in-table__' start table
2. 'in_table'
    1. 'mi<mk<pard-start', start of a row, cell
    2. 'mi<mk<not-in-tbl', end the table.
    3. 'cw<tb<row-def___' start a row definition
3. in_row_definition
    1. 'mi<mk<not-in-tbl' : end the row definition. If in table, end the table.
    2. 'mi<mk<pard-start' : end the row definition
        if already in the table, start a row and cell.
    3. 'cw<tb<row_______' : end the row definition, end the row
    4. 'cw...' use another method to handle the control word
        control word might be added to dictionary.
    5. 'mi<mk<in-table__' If already in table, do nothing. Otherwise
        start the table.
4. 'in_row'
    1. 'mi<mk<pard-start', start cell
    2. 'mi<mk<not-in-tbl' end table,
    3. 'cw<tb<row_______' close row,
5. 'in_cell'
    1. 'mi<mk<not-in-tbl', end table
    2. 'cw<tb<cell______', end cell
"""

# Tokens that force an open table to be closed, whatever the current state.
_TABLE_END_TOKENS = frozenset((
    'mi<mk<not-in-tbl',
    'mi<mk<sect-start',
    'mi<mk<sect-close',
    'mi<mk<body-close',
))


class Table:
    """
    Make tables.

    Logic:
        Read one line at a time. The default state (self.__state) is
        'not_in_table'. Look for either a 'cw<tb<in-table__', or a row
        definition. Each state has its own handler method; the handler
        writes table/row/cell markers around the lines it copies to the
        output file.
    """

    def __init__(self,
            in_file,
            bug_handler,
            copy=None,
            run_level=1,):
        """
        Required:
            'in_file' -- file to parse
            'bug_handler' -- passed on to copy.Copy in make_table
        Optional:
            'copy' -- whether to make a copy of result for debugging
            'run_level' -- stored, not otherwise used by this class
        Returns:
            nothing
        """
        self.__file = in_file
        self.__bug_handler = bug_handler
        # NOTE: the parameter shadows the imported ``copy`` module only
        # inside this method; make_table still sees the module.
        self.__copy = copy
        self.__run_level = run_level
        self.__write_to = better_mktemp()

    def __initiate_values(self):
        """
        Initiate all values: the state -> handler table, the token
        dispatch dictionaries, the state stack and all counters.
        """
        self.__state_dict = {
            'in_table': self.__in_table_func,
            'in_row_def': self.__in_row_def_func,
            'not_in_table': self.__not_in_table_func,
            'in_cell': self.__in_cell_func,
            'in_row': self.__in_row_func,
        }
        self.__not_in_table_dict = {
            'cw<tb<row-def___': self.__found_row_def_func,
            'cw<tb<in-table__': self.__start_table_func,
            'mi<mk<in-table__': self.__start_table_func,
        }
        # can't use this dictionary. When in row_definition, many tokens
        # require multiple definitions
        self.__in_row_definition_dict = {
            'mi<mk<not-in-tbl': self.__end_row_table_func,
            'mi<mk<pard-start': self.__end_row_def_func,
        }
        self.__in_row_dict = {
            'mi<mk<not-in-tbl': self.__close_table,
            'mi<mk<pard-start': self.__start_cell_func,
            'cw<tb<row_______': self.__end_row_func,
            'cw<tb<cell______': self.__empty_cell,
        }
        # set the default state
        self.__state = ['not_in_table']
        # set empty data for all tables
        self.__table_data = []
        # just in case there is no table data
        self.__row_dict = {}
        self.__cell_list = []
        self.__cell_widths = []
        # Counters are (re)set whenever a table or row definition starts,
        # but a row can be closed before either has been seen; give them
        # defined values so that path does not raise AttributeError.
        self.__last_cell_position = 0
        self.__rows_in_table = 0
        self.__cells_in_table = 0
        self.__cells_in_row = 0
        self.__max_number_cells_in_row = 0
        self.__list_of_cells_in_row = []

    def __write_att_tag(self, tag_start, att_dict):
        """
        Requires:
            tag_start -- the tag text up to (not including) the attributes
            att_dict -- attributes to append as '<key>value' pairs
        Returns:
            nothing
        Logic:
            Write the tag, every attribute, and a closing newline.
        """
        atts = ''.join(f'<{key}>{value}' for key, value in att_dict.items())
        self.__write_obj.write(f'{tag_start}{atts}\n')

    def __in_table_func(self, line):
        """
        Requires:
            line -- line to parse
        Returns:
            nothing
        Logic:
            Look for the end of the table. If found, close out the table.
            Look for 'mi<mk<pard-start', which marks the beginning of a row.
            Start a row and start a cell. A cell token with no preceding
            paragraph starts a row holding an empty cell. The line itself
            is always written out afterwards.
        """
        token = self.__token_info
        if token in _TABLE_END_TOKENS:
            self.__close_table(line)
        elif token == 'mi<mk<pard-start':
            self.__start_row_func(line)
            self.__start_cell_func(line)
        elif token == 'cw<tb<row-def___':
            self.__found_row_def_func(line)
        elif token == 'cw<tb<cell______':
            self.__start_row_func(line)
            self.__empty_cell(line)
        self.__write_obj.write(line)

    def __not_in_table_func(self, line):
        """
        Requires:
            line -- the line of text read in from document
        Returns:
            nothing
        Logic:
            The state is not in a table, so look for the tokens that
            mark the start of a table: 'cw<tb<row-def', 'cw<tb<in-table__'
            or 'mi<mk<in-table__'. If one of these is found, use another
            method to start a table and change states. The line is written
            out in every case.
        """
        action = self.__not_in_table_dict.get(self.__token_info)
        if action:
            action(line)
        self.__write_obj.write(line)

    def __close_table(self, line):
        """
        Requires:
            line -- line to parse (unused; kept for a uniform signature)
        Returns:
            nothing
        Logic:
            Write the end marker for the table.
            Set the state to ['not_in_table'].
            Record the statistics gathered for the table that just ended
            in the last entry of self.__table_data.
        """
        self.__write_obj.write('mi<mk<table-end_\n')
        self.__state = ['not_in_table']
        table_info = self.__table_data[-1]
        table_info['number-of-columns'] = self.__max_number_cells_in_row
        table_info['number-of-rows'] = self.__rows_in_table
        # Despite the key names, both "averages" are modes.
        table_info['average-cells-per-row'] = self.__mode(
            self.__list_of_cells_in_row)
        table_info['average-cell-width'] = self.__mode(self.__cell_widths)

    def __found_row_def_func(self, line):
        """
        Requires:
            line -- not needed except for consistency with other methods.
        Returns:
            nothing
        Logic:
            A row definition has been found. Push the 'in_row_def' state
            and reset the containers that collect the row's data, for use
            later in writing attributes for the table.
        """
        self.__state.append('in_row_def')
        self.__last_cell_position = 0
        self.__row_dict = {}
        # One empty dictionary is kept at the end of the list to receive
        # the attributes of the cell currently being defined.
        self.__cell_list = [{}]
        self.__cell_widths = []

    def __start_table_func(self, line):
        """
        Requires:
            line -- line to parse (unused)
        Returns:
            nothing
        Logic:
            Reset the per-table counters, add an empty dictionary for
            this table to self.__table_data, write out the table marker
            and add 'in_table' to the state list.
        """
        self.__rows_in_table = 0
        self.__cells_in_table = 0
        self.__cells_in_row = 0
        self.__max_number_cells_in_row = 0
        self.__table_data.append({})
        self.__list_of_cells_in_row = []
        self.__write_obj.write('mi<mk<tabl-start\n')
        self.__state.append('in_table')

    def __end_row_table_func(self, line):
        """
        Requires:
            line -- just for consistency
        Returns:
            nothing
        Logic:
            Close the table. Only referenced from the (unused)
            self.__in_row_definition_dict.
        """
        # The bound method already supplies ``self``; pass only the line.
        self.__close_table(line)

    def __end_row_def_func(self, line):
        """
        Requires:
            line -- just for consistency
        Returns:
            nothing
        Logic:
            Change the state.
            Get rid of the last {} in the cell list.
            Figure out the number of cells based on
            self.__row_dict['widths'] ('122, 122').
        """
        if self.__state and self.__state[-1] == 'in_row_def':
            self.__state.pop()
        # added [{]] at the *end* of each /cell. Get rid of extra one
        self.__cell_list.pop()
        widths = self.__row_dict.get('widths')
        if widths:
            self.__row_dict['number-of-cells'] = len(widths.split(','))

    def __in_row_def_func(self, line):
        """
        Requires:
            line -- line to parse
        Returns:
            nothing
        Logic:
            In the text that defines a row. If a control word is found,
            handle the control word with another method.
            Check for states that will end this state.
            While in the row definition, certain tokens can end a row or
            end a table. If a paragraph definition (pard-start) is found,
            and you are already in a table, start a row and a cell.
            The line is written out in every case.
        """
        token = self.__token_info
        if token == 'cw<tb<row_______':
            # write tags
            self.__end_row_func(line)
            # change the state
            self.__end_row_def_func(line)
        elif line[0:2] == 'cw':
            self.__handle_row_token(line)
        elif token == 'mi<mk<not-in-tbl' and 'in_table' in self.__state:
            self.__end_row_def_func(line)
            self.__close_table(line)
        elif token == 'mi<mk<pard-start':
            self.__end_row_def_func(line)
            # if already in the table, start a row, then cell.
            # (The state list itself cannot be compared with an int.)
            if self.__state and self.__state[-1] == 'in_table':
                self.__start_row_func(line)
                self.__start_cell_func(line)
        elif token == 'mi<mk<in-table__':
            self.__end_row_def_func(line)
            # if not in table, start a new table
            if self.__state and self.__state[-1] != 'in_table':
                self.__start_table_func(line)
        self.__write_obj.write(line)

    def __handle_row_token(self, line):
        """
        Requires:
            line -- line to parse
        Returns:
            nothing
        Logic:
            The tokens in the row definition contain the following
            information:
                1. row borders.
                2. cell borders for all cells in the row.
                3. cell positions for all cells in the row.
            Put all information about row borders into the row dictionary.
            Put all information about cell borders into the dictionary
            in the last item in the cell list.
            ([{border:something, width:something},
            {border:something, width:something}])
            cw<bd<bor-t-r-to<nu<bdr-hair__|bdr-li-wid:0.50
        """
        if line[3:5] == 'bd':
            the_dict = border_parse.BorderParse().parse_border(line)
            # border-cell-top-hairline
            # A single 'border-cell...' key sends every key of this token
            # to the current cell rather than to the row.
            in_cell = any(key[0:11] == 'border-cell' for key in the_dict)
            target = self.__cell_list[-1] if in_cell else self.__row_dict
            for key, value in the_dict.items():
                target[key] = value
        # cw<tb<cell-posit<nu<216.00
        elif self.__token_info == 'cw<tb<cell-posit':
            self.__found_cell_position(line)
        # cw<tb<row-pos-le<nu<-5.40
        elif self.__token_info == 'cw<tb<row-pos-le':
            self.__row_dict['left-row-position'] = line[20:-1]
        elif self.__token_info == 'cw<tb<row-header':
            self.__row_dict['header'] = 'true'

    def __start_cell_func(self, line):
        """
        Required:
            line -- the line of text (unused)
        Returns:
            nothing
        Logic:
            Append 'in_cell' for states.
            If self.__cell_list contains dictionaries, take the first
            dictionary, write its key => value pairs as attributes of the
            cell tag and remove it from the list.
            Otherwise, print out a plain cell tag.
        """
        self.__state.append('in_cell')
        if self.__cell_list:
            # Cell definitions are consumed in document order.
            self.__write_att_tag('mi<tg<open-att__<cell',
                self.__cell_list.pop(0))
        else:
            self.__write_obj.write('mi<tg<open______<cell\n')
        self.__cells_in_table += 1
        self.__cells_in_row += 1

    def __start_row_func(self, line):
        """
        Required:
            line -- the line of text (unused)
        Returns:
            nothing
        Logic:
            Append 'in_row' for states.
            Write the row tag with key => value attributes taken from
            self.__row_dict, then reset the cell counter for the row.
        """
        self.__state.append('in_row')
        self.__write_att_tag('mi<tg<open-att__<row', self.__row_dict)
        self.__cells_in_row = 0
        self.__rows_in_table += 1

    def __found_cell_position(self, line):
        """
        Requires:
            line -- current line
        Returns:
            nothing
        Logic:
            Calculate the cell width.
            If the cell is the first cell, take the left row position
            into account. (This value is often negative.)
            Next, set the new last_cell_position to the current cell
            position, and record the width in the row dictionary, the
            current cell dictionary and the list of cell widths.
        """
        # cw<tb<cell-posit<nu<216.00
        new_cell_position = round(float(line[20:-1]), 2)
        left_position = 0
        if self.__last_cell_position == 0:
            left_position = float(
                self.__row_dict.get('left-row-position', 0))
        width = '%.2f' % (
            new_cell_position - self.__last_cell_position - left_position)
        self.__last_cell_position = new_cell_position
        if self.__row_dict.get('widths'):
            self.__row_dict['widths'] += ', %s' % width
        else:
            self.__row_dict['widths'] = width
        self.__cell_list[-1]['width'] = width
        # Open a fresh dictionary for the next cell of the definition.
        self.__cell_list.append({})
        self.__cell_widths.append(width)

    def __in_cell_func(self, line):
        """
        Required:
            line -- line to parse
        Returns:
            nothing
        Logic:
            In the middle of a cell.
            Look for the close of the table. If found, close the cell,
            the row and the table, then write the line.
            Look for the close of the cell. If found, use the close cell
            function to close out the cell (the token line itself is not
            written).
            Otherwise, print out the line.
        """
        # cw<tb<cell______<nu<true
        # mi<mk<sect-start
        token = self.__token_info
        if token in _TABLE_END_TOKENS:
            self.__end_cell_func(line)
            self.__end_row_func(line)
            self.__close_table(line)
            self.__write_obj.write(line)
        elif token == 'cw<tb<cell______':
            self.__end_cell_func(line)
        else:
            self.__write_obj.write(line)

    def __end_cell_func(self, line):
        """
        Requires:
            line -- unused
        Returns:
            nothing
        Logic:
            End the cell. Pop 'in_cell' from self.__state if it is on top.
            Print out the closing marks.
        """
        if len(self.__state) > 1 and self.__state[-1] == 'in_cell':
            self.__state.pop()
        self.__write_obj.write(
            'mi<mk<close_cell\n'
            'mi<tg<close_____<cell\n'
            'mi<mk<closecell_\n'
        )

    def __in_row_func(self, line):
        """
        Requires:
            line -- line to parse
        Returns:
            nothing
        Logic:
            In a row but not in a cell. An end-of-table token closes the
            row and the table. Any other token found in
            self.__in_row_dict is passed to its handler. The line is
            written out in every case.
        """
        token = self.__token_info
        if token in _TABLE_END_TOKENS:
            self.__end_row_func(line)
            self.__close_table(line)
        else:
            action = self.__in_row_dict.get(token)
            if action:
                action(line)
        self.__write_obj.write(line)

    def __end_row_func(self, line):
        """
        Requires:
            line -- unused
        Returns:
            nothing
        Logic:
            If a row is open, pop its state and write the closing tag.
            Otherwise write an empty row tag and count it as a row.
            Update the per-table cell statistics with the number of cells
            seen in this row.
        """
        if len(self.__state) > 1 and self.__state[-1] == 'in_row':
            self.__state.pop()
            self.__write_obj.write('mi<tg<close_____<row\n')
        else:
            self.__write_obj.write('mi<tg<empty_____<row\n')
            self.__rows_in_table += 1
        self.__max_number_cells_in_row = max(
            self.__max_number_cells_in_row, self.__cells_in_row)
        self.__list_of_cells_in_row.append(self.__cells_in_row)

    def __empty_cell(self, line):
        """
        Required:
            line -- line of text (unused)
        Returns:
            nothing
        Logic:
            Write an empty tag with attributes if there are attributes.
            Otherwise, write an empty tag with cell as element.
        """
        if self.__cell_list:
            # Unlike __start_cell_func, the last definition is used and
            # it is not removed from the list.
            self.__write_att_tag('mi<tg<empty-att_<cell',
                self.__cell_list[-1])
        else:
            self.__write_obj.write('mi<tg<empty_____<cell\n')
        self.__cells_in_table += 1
        self.__cells_in_row += 1

    def __mode(self, the_list):
        """
        Required:
            the_list -- a list of hashable items
        Returns:
            The item that occurs the most, or 'not-defined' for an empty
            list. When several items tie, the one seen first wins.
        Logic:
            Count each item once; the greatest count is the mode.
        """
        if not the_list:
            return 'not-defined'
        # most_common keeps first-seen order among equal counts.
        return Counter(the_list).most_common(1)[0][0]

    def make_table(self):
        """
        Requires:
            nothing
        Returns:
            A list with one dictionary of values per table found.
        Logic:
            Read one line in at a time. Determine what action to
            take based on the state. The result is written to a temporary
            file which then replaces the input file.
        """
        self.__initiate_values()
        read_obj = open_for_read(self.__file)
        try:
            self.__write_obj = open_for_write(self.__write_to)
            try:
                while True:
                    line = read_obj.readline()
                    if not line:
                        break
                    self.__token_info = line[:16]
                    action = self.__state_dict.get(self.__state[-1])
                    if action is None:
                        sys.stderr.write(
                            'No matching state in module table.py\n')
                        sys.stderr.write(self.__state[-1] + '\n')
                        # No handler to call; copy the line through so
                        # the output does not lose content.
                        self.__write_obj.write(line)
                        continue
                    action(line)
            finally:
                self.__write_obj.close()
        finally:
            read_obj.close()
        copy_obj = copy.Copy(bug_handler=self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "table.data")
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)
        return self.__table_data