# Source path: lib/calibre/calibre/ebooks/rtf2xml/make_lists.py
#########################################################################
# #
# #
# copyright 2002 Paul Henry Tremblay #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
# General Public License for more details. #
# #
# #
#########################################################################
import sys, os, re
from calibre.ebooks.rtf2xml import copy
from calibre.ptempfile import better_mktemp
from . import open_for_read, open_for_write
class MakeLists:
    """
    Form lists.

    Use RTF's own formatting to determine if a paragraph definition is part of
    a list.
    Use indents to determine items and how lists are nested.

    The class is a small state machine driven by make_lists(). The states are:
        'default'    -- not inside any list
        'in_pard'    -- inside a list, in the middle of a paragraph definition
        'after_pard' -- inside a list, after a paragraph definition; lines are
                        buffered until the next paragraph definition decides
                        what happens to the open item and list.
    """

    def __init__(self,
            in_file,
            bug_handler,
            headings_to_sections,
            list_of_lists,
            copy=None,
            run_level=1,
            no_headings_as_list=1,
            write_list_info=0,
            ):
        """
        Required:
            'in_file' -- path of the tokenized file; make_lists() replaces
                its content with the processed result
            'bug_handler' -- passed on to the copy helper used by make_lists()
            'headings_to_sections' -- if true, paragraphs styled as headings
                are never treated as list items
            'list_of_lists' -- the list table used to look up the list type;
                may be empty (older RTF)
        Optional:
            'copy' -- whether to make a copy of result for debugging
            'run_level' -- values above 0 enable warnings on stderr
            'no_headings_as_list' -- if true, paragraphs styled as headings
                are never treated as list items
            'write_list_info' -- if true, write the attributes found in the
                list table onto each list start tag
        Returns:
            nothing
        """
        self.__file = in_file
        self.__bug_handler = bug_handler
        self.__run_level = run_level
        self.__no_headings_as_list = no_headings_as_list
        self.__headings_to_sections = headings_to_sections
        self.__copy = copy
        self.__write_to = better_mktemp()
        self.__list_of_lists = list_of_lists
        self.__write_list_info = write_list_info

    def __initiate_values(self):
        """
        Required:
            Nothing
        Return:
            Nothing
        Logic:
            Reset the state machine.
            The self.__end_list is a list of tokens that will force a list to
            end. Likewise, the self.__end_lines is a list of lines that forces
            a list to end.
        """
        self.__state = 'default'
        self.__left_indent = 0
        self.__list_type = 'not-defined'
        self.__pard_def = ''
        # one dictionary ({'left-indent': ..., 'id': ...}) per open list,
        # outermost list first
        self.__all_lists = []
        self.__level = 0
        # lines buffered while in the 'after_pard' state
        self.__list_chunk = ''
        self.__state_dict = {
            'default': self.__default_func,
            'in_pard': self.__in_pard_func,
            'after_pard': self.__after_pard_func,
        }
        self.__headings = [
            'heading 1', 'heading 2', 'heading 3', 'heading 4',
            'heading 5', 'heading 6', 'heading 7', 'heading 8',
            'heading 9',
        ]
        self.__allow_levels = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
        self.__style_name = ''
        self.__end_list = [
            'mi<mk<body-close',
            'mi<mk<par-in-fld',
            'cw<tb<cell______',
            'cw<tb<row-def___',
            'cw<tb<row_______',
            'mi<mk<sect-close',
            'mi<mk<sect-start',
            'mi<mk<header-beg',
            'mi<mk<header-end',
            'mi<mk<head___clo',
            'mi<mk<fldbk-end_',
            'mi<mk<close_cell',
            'mi<mk<footnt-ope',
            'mi<mk<foot___clo',
            'mi<mk<tabl-start',
            # 'mi<mk<sec-fd-beg',
        ]
        self.__end_lines = [
            'mi<tg<close_____<cell\n',
        ]
        self.__id_regex = re.compile(r'\<list-id\>(\d+)')
        self.__lv_regex = re.compile(r'\<list-level\>(\d+)')

    def __is_pard_def(self, line):
        """
        Required:
            line -- the line of current text.
        Return:
            True if the line opens a paragraph definition.
        """
        return (
            self.__token_info == 'mi<tg<open-att__'
            and line[17:37] == 'paragraph-definition'
        )

    def __flush_chunk(self):
        """
        Write the buffered lines to the output and empty the buffer.
        """
        self.__write_obj.write(self.__list_chunk)
        self.__list_chunk = ''

    def __end_all_lists(self, line):
        """
        Required:
            line -- the line that forces the lists to end.
        Return:
            Nothing
        Logic:
            Close every open list, write the buffered lines followed by the
            current line, and go back to the default state.
        """
        # an indent smaller than any recorded indent makes __close_lists
        # close every level
        self.__left_indent = -1000
        self.__close_lists()
        self.__flush_chunk()
        self.__state = 'default'
        self.__write_obj.write(line)

    def __in_pard_func(self, line):
        """
        Required:
            line -- the line of current text.
        Return:
            Nothing
        Logic:
            You are in a list, but in the middle of a paragraph definition.
            Don't do anything until you find the end of the paragraph
            definition.
        """
        if self.__token_info == 'mi<mk<pard-end__':
            self.__state = 'after_pard'
        self.__write_obj.write(line)

    def __after_pard_func(self, line):
        """
        Required:
            line -- the line of current text.
        Return:
            Nothing
        Logic:
            You are in a list, but after a paragraph definition. You have to
            determine if the next paragraph definition ends a list, continues
            the old one, or starts a new one.
            Look for a paragraph definition. If one is found, determine if
            the paragraph definition contains a list-id. If it does, use the
            method self.__list_after_par_def_func to determine the action.
            If the paragraph definition is a heading, end all lists.
            If the paragraph definition does not contain a list-id, use the
            method self.__close_lists to close out items and lists for a
            paragraph that is not indented.
            If a bigger block is found (such as a section or a cell), end all
            lists.
            If no special line is found, add the line to a buffer.
        """
        if self.__is_pard_def(line):
            is_heading = self.__is_a_heading()
            id_match = self.__id_regex.search(line)
            if id_match and not is_heading:  # found list-id
                level_match = self.__lv_regex.search(line)
                if level_match:
                    self.__level = level_match.group(1)
                self.__list_after_par_def_func(line, id_match.group(1))
                self.__write_obj.write(line)
                self.__state = 'in_pard'
            elif is_heading:
                self.__end_all_lists(line)
            else:  # normal paragraph with no list id
                self.__close_lists()
                self.__flush_chunk()
                self.__write_obj.write(line)
                # lists with a smaller indent than this paragraph stay open
                self.__state = 'in_pard' if self.__all_lists else 'default'
        elif self.__token_info in self.__end_list:  # section to end lists
            self.__end_all_lists(line)
        else:
            self.__list_chunk += line

    def __list_after_par_def_func(self, line, list_id):
        """
        Required:
            line -- the line of current text.
            list_id -- the id of the current list
        Return:
            Nothing
        Logic:
            You have found the end of a paragraph definition, and have found
            another paragraph definition with a list id.
            If the list-id is different from the last paragraph definition,
            close out the lists with another method, write the string in the
            buffer and start a new list.
            If the list id is the same as the last one, check the indent on
            the current paragraph definition. If it is greater than the
            previous one, do not end the current list or item. Start a new
            list. Otherwise end the current item and start a new one.
        """
        if not self.__all_lists:
            # defensive: no list is open, so simply start one
            self.__flush_chunk()
            self.__write_start_list(list_id)
            return
        innermost = self.__all_lists[-1]
        if list_id != innermost['id']:
            self.__close_lists()
            self.__flush_chunk()
            self.__write_start_list(list_id)
        elif self.__left_indent > innermost['left-indent']:
            self.__flush_chunk()
            self.__write_start_list(list_id)
        else:
            self.__write_end_item()
            self.__flush_chunk()
            self.__write_start_item()

    def __close_lists(self):
        """
        Required:
            Nothing
        Return:
            Nothing
        Logic:
            Walk the open lists from the innermost outwards. If the current
            indent is less than or equal to the indent of a list, close that
            level. Keep track of how many levels you close, and drop that many
            levels from the end of self.__all_lists.
        """
        current_indent = self.__left_indent
        num_levels_closed = 0
        for open_list in reversed(self.__all_lists):
            if current_indent <= open_list.get('left-indent'):
                self.__write_end_item()
                self.__write_end_list()
                num_levels_closed += 1
        if num_levels_closed:
            del self.__all_lists[len(self.__all_lists) - num_levels_closed:]

    def __write_end_list(self):
        """
        Required:
            Nothing
        Return:
            Nothing
        Logic:
            Write the end of a list.
        """
        self.__write_obj.write('mi<tg<close_____<list\n')
        self.__write_obj.write('mi<mk<list_close\n')

    def __get_level_dict(self, curlist):
        """
        Required:
            curlist -- one entry of the list table: the list dictionary
                followed by one single-item list per level.
        Return:
            The dictionary describing the current level (clamped to the last
            level defined), or None if the entry defines no levels at all.
        """
        level = int(self.__level) + 1  # index 0 holds the list dictionary
        if level >= len(curlist):
            level = len(curlist) - 1
        if level < 1:
            return None
        return curlist[level][0]

    def __write_start_list(self, list_id):
        """
        Required:
            list_id -- the id of the current list.
        Return:
            Nothing
        Logic:
            Write the start of a list and add the id and left-indent to the
            self.__all_lists list.
            Write cues of when a list starts for later processing.
            In order to determine the type of list, you have to iterate
            through the self.__list_of_lists. This list looks like:
                [[{list-id: [1, 2]}, [{}], [{}]], [{list-id: [3, 4]}, [{}]]]
            I need to get the inside lists of the main lists. Then I need to
            get the first item of what I just got. This is a dictionary. Get
            the list-id. This is a list. Check to see if the current id is in
            this list. If so, then get the list-type from the dictionary of
            the current level.
            If the table is missing, has no matching id, or has no level
            information, use the list type found in the paragraph itself.
        """
        self.__all_lists.append({
            'left-indent': self.__left_indent,
            'id': list_id,
        })
        write = self.__write_obj.write
        write('mi<mk<list_start\n')
        # bogus levels are sometimes written for empty paragraphs
        if str(self.__level) in self.__allow_levels:
            lev_num = self.__level
        else:
            lev_num = '0'
        write('mi<tg<open-att__<list<list-id>%s<level>%s' % (list_id, lev_num))

        list_dict = {}
        level_dict = None
        list_type = self.__list_type
        if self.__list_of_lists:  # older RTF won't generate a list_of_lists
            index_of_list = self.__get_index_of_list(list_id)
            if index_of_list is not None:  # found a matching id
                curlist = self.__list_of_lists[index_of_list]
                list_dict = curlist[0]
                level_dict = self.__get_level_dict(curlist)
        if level_dict is not None:
            if level_dict.get('numbering-type') == 'bullet':
                list_type = 'unordered'
            else:
                list_type = 'ordered'
        write('<list-type>%s' % (list_type))

        # if you want to dump all the info to the list, rather than
        # keeping it in the table above, change self.__write_list_info
        # to true.
        if self.__write_list_info and list_dict:
            for the_key, the_value in list_dict.items():
                if the_key != 'list-id':
                    write(f'<{the_key}>{the_value}')
            if level_dict is not None:
                for the_key, the_value in level_dict.items():
                    write(f'<{the_key}>{the_value}')
        write('\n')
        write('mi<mk<liststart_\n')
        self.__write_start_item()

    def __get_index_of_list(self, list_id):
        """
        Requires:
            list_id -- id of current paragraph-definition
        Returns:
            an index of where the id occurs in list_of_lists, the
            list passed to this module, or None if there is no match.
        Logic:
            Iterate through the big list, the one passed to this module, and
            get the first item of each entry, the dictionary.
            Once you find a dictionary whose list-id contains the id, return
            its index.
            If no match is found, print out an error message (when the run
            level is above 0).
        """
        # some RTF use 0 indexed list. Don't know what to do?
        if list_id == '0':
            return None
        for the_index, entry in enumerate(self.__list_of_lists):
            # a dictionary without a list-id simply does not match
            if list_id in (entry[0].get('list-id') or ()):
                return the_index
        if self.__run_level > 0:
            sys.stderr.write('Module is make_lists.py\n'
                    'Method is __get_index_of_list\n'
                    'The main list does not appear to have a matching id for %s \n'
                    % (list_id)
                    )
        return None

    def __write_start_item(self):
        """
        Write the start of a list item.
        """
        self.__write_obj.write('mi<mk<item_start\n')
        self.__write_obj.write('mi<tg<open______<item\n')
        self.__write_obj.write('mi<mk<itemstart_\n')

    def __write_end_item(self):
        """
        Write the end of a list item.
        """
        self.__write_obj.write('mi<tg<item_end__\n')
        self.__write_obj.write('mi<tg<close_____<item\n')
        self.__write_obj.write('mi<tg<item__end_\n')

    def __default_func(self, line):
        """
        Required:
            self, line
        Returns:
            Nothing
        Logic
            Look for the start of a paragraph definition. If one is found, and
            it is not a heading, check if it contains a list-id. If it does,
            start a list. Change the state to in_pard.
            The line itself is always written.
        """
        if self.__is_pard_def(line) and not self.__is_a_heading():
            id_match = self.__id_regex.search(line)
            if id_match:
                self.__state = 'in_pard'
                level_match = self.__lv_regex.search(line)
                if level_match:
                    self.__level = level_match.group(1)
                self.__write_start_list(id_match.group(1))
        self.__write_obj.write(line)

    def __is_a_heading(self):
        """
        Return True if the current style is a heading that must not be
        treated as a list item, False otherwise.
        """
        return bool(
            self.__style_name in self.__headings
            and (self.__headings_to_sections or self.__no_headings_as_list)
        )

    def __get_indent(self, line):
        """
        Remember the left indent if the line carries one.
        """
        if self.__token_info == 'mi<mk<left_inden':
            self.__left_indent = float(line[17:-1])

    def __get_list_type(self, line):
        """
        Remember the list type if the line carries one ('item' is stored as
        'unordered').
        """
        if self.__token_info == 'mi<mk<list-type_':  # <ordered
            self.__list_type = line[17:-1]
            if self.__list_type == 'item':
                self.__list_type = 'unordered'

    def __get_style_name(self, line):
        """
        Remember the style name if the line carries one.
        """
        if self.__token_info == 'mi<mk<style-name':
            self.__style_name = line[17:-1]

    def __finish_pending(self):
        """
        Required:
            Nothing
        Return:
            Nothing
        Logic:
            Called once the input is exhausted. If the input ended while lines
            were still being buffered after a paragraph definition, close the
            open lists and write the buffer, so that no input is lost and the
            list tags stay balanced.
        """
        if self.__state == 'after_pard':
            self.__left_indent = -1000
            self.__close_lists()
            self.__flush_chunk()
            self.__state = 'default'

    def make_lists(self):
        """
        Required:
            nothing
        Returns:
            nothing; the original file will be changed
        Logic:
            Read the file line by line. For each line, record the token, the
            indent, the list type and the style name, then hand the line to
            the method for the current state.
            When done, optionally keep a debugging copy, replace the original
            file with the result and remove the temporary file.
        """
        self.__initiate_values()
        read_obj = open_for_read(self.__file)
        try:
            self.__write_obj = open_for_write(self.__write_to)
            try:
                while True:
                    line = read_obj.readline()
                    if not line:
                        break
                    self.__token_info = line[:16]
                    self.__get_indent(line)
                    self.__get_list_type(line)
                    self.__get_style_name(line)
                    self.__state_dict[self.__state](line)
                self.__finish_pending()
            finally:
                # close the handles even if a state handler raises
                self.__write_obj.close()
        finally:
            read_obj.close()
        copy_obj = copy.Copy(bug_handler=self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "make_lists.data")
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)