%PDF- %PDF-
| Direktori : /lib/calibre/calibre/ebooks/rtf2xml/ |
| Current File : //lib/calibre/calibre/ebooks/rtf2xml/fields_small.py |
#########################################################################
# #
# #
# copyright 2002 Paul Henry Tremblay #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
# General Public License for more details. #
# #
# #
#########################################################################
import sys, os, re
from calibre.ebooks.rtf2xml import field_strings, copy
from calibre.ptempfile import better_mktemp
from . import open_for_read, open_for_write
class FieldsSmall:
"""
=================
Purpose
=================
Write tags for bookmarks, index and toc entry fields in a tokenized file.
This module does not handle toc or index tables. (This module won't be any
use to you unless you use it as part of the other modules.)
-----------
Method
-----------
Look for the beginning of a bookmark, index, or toc entry. When such a token
is found, store the opening bracket count in a variable. Collect all the text
until the closing bracket entry is found. Send the string to the module
field_strings to process it. Write the processed string to the output
file.
"""
def __init__(self,
in_file,
bug_handler,
copy=None,
run_level=1,
):
"""
Required:
'file'--file to parse
Optional:
'copy'-- whether to make a copy of result for debugging
'temp_dir' --where to output temporary results (default is
directory from which the script is run.)
Returns:
nothing
"""
self.__file = in_file
self.__bug_handler = bug_handler
self.__copy = copy
self.__write_to = better_mktemp()
self.__run_level = run_level
def __initiate_values(self):
"""
Initiate all values.
"""
self.__string_obj = field_strings.FieldStrings(bug_handler=self.__bug_handler)
self.__state = 'before_body'
self.__text_string = ''
self.__marker = 'mi<mk<inline-fld\n'
self.__state_dict = {
'before_body' : self.__before_body_func,
'body' : self.__body_func,
'bookmark' : self.__bookmark_func,
'toc_index' : self.__toc_index_func,
}
self.__body_dict = {
'cw<an<book-mk-st' : (self.__found_bookmark_func, 'start'),
'cw<an<book-mk-en' : (self.__found_bookmark_func, 'end'),
'cw<an<toc_______' : (self.__found_toc_index_func, 'toc'),
'cw<an<index-mark' : (self.__found_toc_index_func, 'index'),
}
ob = 'ob<nu<open-brack.....'
cb = 'cb<nu<clos-brack'
bk_st = 'cw<an<book-mk-st<nu<true'
tx = 'tx<nu<__________<(.*?)'
reg_st = ob + bk_st + tx + cb
self.__book_start = re.compile(r'%s' % reg_st)
def __before_body_func(self, line):
"""
Requires:
line --the line to parse
Returns:
nothing
Logic:
Look for the beginning of the body. When found, change the state
to body. Always print out the line.
"""
if self.__token_info == 'mi<mk<body-open_':
self.__state = 'body'
self.__write_obj.write(line)
def __body_func(self, line):
"""
Requires:
line --the line to parse
Returns:
nothing
Logic:
This function handles all the lines in the body of the documents.
Look for a bookmark, index or toc entry and take the appropriate action.
"""
action, tag = \
self.__body_dict.get(self.__token_info, (None, None))
if action:
action(line, tag)
else:
self.__write_obj.write(line)
def __found_bookmark_func(self, line, tag):
"""
Requires:
line --the line to parse
Returns:
nothing
Logic:
This function is called when a bookmark is found. The opening
bracket count is stored int eh beginning bracket count. The state
is changed to 'bookmark.'
"""
self.__beg_bracket_count = self.__ob_count
self.__cb_count = 0
self.__state = 'bookmark'
self.__type_of_bookmark = tag
def __bookmark_func(self, line):
"""
Requires:
line --the line to parse
Returns:
nothing
Logic:
This function handles all lines within a bookmark. It adds each
line to a string until the end of the bookmark is found. It
processes the string with the fields_string module, and
prints out the result.
"""
if self.__beg_bracket_count == self.__cb_count:
self.__state = 'body'
type = 'bookmark-%s' % self.__type_of_bookmark
# change here
"""
my_string = self.__string_obj.process_string(
self.__text_string, type)
"""
my_string = self.__parse_bookmark_func(
self.__text_string, type)
self.__write_obj.write(self.__marker)
self.__write_obj.write(my_string)
self.__text_string = ''
self.__write_obj.write(line)
elif line[0:2] == 'tx':
self.__text_string += line[17:-1]
def __parse_index_func(self, my_string):
"""
Requires:
my_string --string to parse
type --type of string
Returns:
A string for a toc instruction field.
Logic:
This method is meant for *both* index and toc entries.
I want to eliminate paragraph endings, and I want to divide the
entry into a main entry and (if it exists) a sub entry.
Split the string by newlines. Read on token at a time. If the
token is a special colon, end the main entry element and start the
sub entry element.
If the token is a pargrah ending, ignore it, since I don't won't
paragraphs within toc or index entries.
"""
my_string, see_string = self.__index_see_func(my_string)
my_string, bookmark_string = self.__index_bookmark_func(my_string)
italics, bold = self.__index__format_func(my_string)
found_sub = 0
my_changed_string = 'mi<tg<empty-att_<field<type>index-entry'
my_changed_string += '<update>static'
if see_string:
my_changed_string += '<additional-text>%s' % see_string
if bookmark_string:
my_changed_string += '<bookmark>%s' % bookmark_string
if italics:
my_changed_string += '<italics>true'
if bold:
my_changed_string += '<bold>true'
main_entry = ''
sub_entry = ''
lines = my_string.split('\n')
for line in lines:
token_info = line[:16]
if token_info == 'cw<ml<colon_____':
found_sub = 1
elif token_info[0:2] == 'tx':
if found_sub:
sub_entry += line[17:]
else:
main_entry += line[17:]
my_changed_string += '<main-entry>%s' % main_entry
if found_sub:
my_changed_string += '<sub-entry>%s' % sub_entry
my_changed_string += '\n'
return my_changed_string
def __index_see_func(self, my_string):
in_see = 0
bracket_count = 0
see_string = ''
changed_string = ''
lines = my_string.split('\n')
end_bracket_count = sys.maxsize
for line in lines:
token_info = line[:16]
if token_info == 'ob<nu<open-brack':
bracket_count += 1
if token_info == 'cb<nu<clos-brack':
bracket_count -= 1
if in_see:
if bracket_count == end_bracket_count and token_info == 'cb<nu<clos-brack':
in_see = 0
else:
if token_info == 'tx<nu<__________':
see_string += line[17:]
else:
if token_info == 'cw<in<index-see_':
end_bracket_count = bracket_count - 1
in_see = 1
changed_string += '%s\n' % line
return changed_string, see_string
def __index_bookmark_func(self, my_string):
"""
Requires:
my_string -- string in all the index
Returns:
bookmark_string -- the text string of the book mark
index_string -- string minus the bookmark_string
"""
# cw<an<place_____<nu<true
in_bookmark = 0
bracket_count = 0
bookmark_string = ''
index_string = ''
lines = my_string.split('\n')
end_bracket_count = sys.maxsize
for line in lines:
token_info = line[:16]
if token_info == 'ob<nu<open-brack':
bracket_count += 1
if token_info == 'cb<nu<clos-brack':
bracket_count -= 1
if in_bookmark:
if bracket_count == end_bracket_count and token_info == 'cb<nu<clos-brack':
in_bookmark = 0
index_string += '%s\n' % line
else:
if token_info == 'tx<nu<__________':
bookmark_string += line[17:]
else:
index_string += '%s\n' % line
else:
if token_info == 'cw<an<place_____':
end_bracket_count = bracket_count - 1
in_bookmark = 1
index_string += '%s\n' % line
return index_string, bookmark_string
def __index__format_func(self, my_string):
italics = 0
bold =0
lines = my_string.split('\n')
for line in lines:
token_info = line[:16]
if token_info == 'cw<in<index-bold':
bold = 1
if token_info == 'cw<in<index-ital':
italics = 1
return italics, bold
def __parse_toc_func(self, my_string):
"""
Requires:
my_string -- all the string in the toc
Returns:
modidified string
Logic:
"""
toc_level = 0
toc_suppress = 0
my_string, book_start_string, book_end_string =\
self.__parse_bookmark_for_toc(my_string)
main_entry = ''
my_changed_string = 'mi<tg<empty-att_<field<type>toc-entry'
my_changed_string += '<update>static'
if book_start_string:
my_changed_string += '<bookmark-start>%s' % book_start_string
if book_end_string:
my_changed_string += '<bookmark-end>%s' % book_end_string
lines = my_string.split('\n')
for line in lines:
token_info = line[:16]
if token_info[0:2] == 'tx':
main_entry += line[17:]
if token_info == 'cw<tc<toc-level_':
toc_level = line[20:]
if token_info == 'cw<tc<toc-sup-nu':
toc_suppress = 1
if toc_level:
my_changed_string += '<toc-level>%s' % toc_level
if toc_suppress:
my_changed_string += '<toc-suppress-number>true'
my_changed_string += '<main-entry>%s' % main_entry
my_changed_string += '\n'
return my_changed_string
def __parse_bookmark_for_toc(self, my_string):
"""
Requires:
the_string --string of toc, with new lines
Returns:
the_string -- string minus bookmarks
bookmark_string -- bookmarks
Logic:
"""
in_bookmark = 0
bracket_count = 0
book_start_string = ''
book_end_string = ''
book_type = 0
toc_string = ''
lines = my_string.split('\n')
end_bracket_count = sys.maxsize
for line in lines:
token_info = line[:16]
if token_info == 'ob<nu<open-brack':
bracket_count += 1
if token_info == 'cb<nu<clos-brack':
bracket_count -= 1
if in_bookmark:
if bracket_count == end_bracket_count and token_info == 'cb<nu<clos-brack':
in_bookmark = 0
toc_string += '%s\n' % line
else:
if token_info == 'tx<nu<__________':
if book_type == 'start':
book_start_string += line[17:]
elif book_type == 'end':
book_end_string += line[17:]
else:
toc_string += '%s\n' % line
else:
if token_info == 'cw<an<book-mk-st' or token_info =='cw<an<book-mk-en':
if token_info == 'cw<an<book-mk-st':
book_type = 'start'
if token_info == 'cw<an<book-mk-en':
book_type = 'end'
end_bracket_count = bracket_count - 1
in_bookmark = 1
toc_string += '%s\n' % line
return toc_string, book_start_string, book_end_string
def __parse_bookmark_func(self, my_string, type):
"""
Requires:
my_string --string to parse
type --type of string
Returns:
A string formatted for a field instruction.
Logic:
The type is the name (either bookmark-end or bookmark-start). The
id is the complete text string.
"""
my_changed_string = ('mi<tg<empty-att_<field<type>%s'
'<number>%s<update>none\n' % (type, my_string))
return my_changed_string
def __found_toc_index_func(self, line, tag):
"""
Requires:
line --the line to parse
Returns:
nothing
Logic:
This function is called when a toc or index entry is found. The opening
bracket count is stored in the beginning bracket count. The state
is changed to 'toc_index.'
"""
self.__beg_bracket_count = self.__ob_count
self.__cb_count = 0
self.__state = 'toc_index'
self.__tag = tag
def __toc_index_func(self, line):
"""
Requires:
line --the line to parse
Returns:
nothing
Logic:
This function handles all lines within a toc or index entry. It
adds each line to a string until the end of the entry is found. It
processes the string with the fields_string module, and
prints out the result.
"""
if self.__beg_bracket_count == self.__cb_count:
self.__state = 'body'
type = self.__tag
if type == 'index':
my_string = self.__parse_index_func(
self.__text_string)
elif type == 'toc':
my_string = self.__parse_toc_func(
self.__text_string)
self.__write_obj.write(self.__marker)
self.__write_obj.write(my_string)
self.__text_string = ''
self.__write_obj.write(line)
else:
self.__text_string += line
def fix_fields(self):
"""
Requires:
nothing
Returns:
nothing (changes the original file)
Logic:
Read one line in at a time. Determine what action to take based on
the state. If the state is before the body, look for the
beginning of the body.
The other two states are toc_index (for toc and index entries) and
bookmark.
"""
self.__initiate_values()
with open_for_read(self.__file) as read_obj:
with open_for_write(self.__write_to) as self.__write_obj:
for line in read_obj:
self.__token_info = line[:16]
if self.__token_info == 'ob<nu<open-brack':
self.__ob_count = line[-5:-1]
if self.__token_info == 'cb<nu<clos-brack':
self.__cb_count = line[-5:-1]
action = self.__state_dict.get(self.__state)
if action is None:
sys.stderr.write('No matching state in module fields_small.py\n')
sys.stderr.write(self.__state + '\n')
action(line)
copy_obj = copy.Copy(bug_handler=self.__bug_handler)
if self.__copy:
copy_obj.copy_file(self.__write_to, "fields_small.data")
copy_obj.rename(self.__write_to, self.__file)
os.remove(self.__write_to)