%PDF- %PDF-
| Direktori : /lib/calibre/calibre/ebooks/rtf2xml/ |
| Current File : //lib/calibre/calibre/ebooks/rtf2xml/fields_large.py |
#########################################################################
# #
# #
# copyright 2002 Paul Henry Tremblay #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
# General Public License for more details. #
# #
# #
#########################################################################
import sys, os
from calibre.ebooks.rtf2xml import field_strings, copy
from calibre.ptempfile import better_mktemp
from . import open_for_read, open_for_write
class FieldsLarge:
    r"""
    =========================
    Logic
    =========================
    Make tags for fields.
    -Fields reflect text that Microsoft Word automatically generates.
    -Each field contains (or should contain) an inner group called field instructions.
    -Fields can be nested.
    --------------
    Logic
    --------------
    1. As soon as a field is found, make a new text string by appending an empty
    text string to the field list. Collect all the lines in this string until the
    field instructions are found.
    2. Collect all the tokens and text in the field instructions. When the end of
    the field instructions is found, process the string of text with the
    field_strings module. Append the processed string to the field instructions
    list.
    3. Continue collecting tokens. Check for paragraphs or sections. If either is found, add to the paragraph or section list.
    4. Continue collecting tokens and text until either the beginning of a new field is found, or the end of this field is found.
    5. If a new field is found, repeat steps 1-3.
    6. If the end of the field is found, process the last text string of the field list.
    7. If the field list is empty (after removing the last text string), there are
    no more fields. Print out the final string. If the list contains other strings,
    add the processed string to the last string in the field list.
    ============================
    Examples
    ============================
    This line of RTF:
    {\field{\*\fldinst { CREATEDATE \\* MERGEFORMAT }}{\fldrslt {
    \lang1024 1/11/03 10:34 PM}}}
    Becomes:
    <field type = "insert-time">
    10:34 PM
    </field>
    The simple field in the above example contains no paragraph or sections breaks.
    This line of RTF:
    {{\field{\*\fldinst SYMBOL 97 \\f "Symbol" \\s 12}{\fldrslt\f3\fs24}}}
    Becomes:
    <para><inline font-size="18"><inline font-style="Symbol">Χ</inline></inline></para>
    The RTF in the example above should be represented as UTF-8 rather than a field.
    This RTF:
    {\field\fldedit{\*\fldinst { TOC \\o "1-3" }}{\fldrslt {\lang1024
    Heading one\tab }{\field{\*\fldinst {\lang1024 PAGEREF _Toc440880424
    \\h }{\lang1024 {\*\datafield
    {\lang1024 1}}}{\lang1024 \par }\pard\plain
    \s18\li240\widctlpar\tqr\tldot\tx8630\aspalpha\aspnum\faauto\adjustright\rin0\lin240\itap0
    \f4\lang1033\cgrid {\lang1024 Heading 2\tab }{\field{\*\fldinst
    {\lang1024 PAGEREF _Toc440880425 \\h }{\lang1024 {\*\datafield
    {\lang1024 1}}}{\lang1024 \par }\pard\plain
    \widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0
    \f4\lang1033\cgrid }}\pard\plain
    \widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0
    \f4\lang1033\cgrid {\fs28 \\u214\'85 \par }{\fs36 {\field{\*\fldinst
    SYMBOL 67 \\f "Symbol" \\s 18}{\fldrslt\f3\fs36}}}
    Becomes:
    <field-block type="table-of-contents">
    <paragraph-definition language="1033" nest-level="0"
    font-style="Times" name="toc 1" adjust-right="true"
    widow-control="true">
    <para><inline language="1024">Heading one	</inline><field
    type="reference-to-page" ref="_Toc440880424"><inline
    language="1024">1</inline></field></para>
    </paragraph-definition>
    <paragraph-definition language="1033" nest-level="0" left-indent="12"
    font-style="Times" name="toc 2" adjust-right="true"
    widow-control="true">
    <para><inline language="1024">Heading 2	</inline><field
    type="reference-to-page" ref="_Toc440880425"><inline
    language="1024">1</inline></field></para>
    </paragraph-definition>
    </field-block>
    """

    def __init__(self,
            in_file,
            bug_handler,
            copy=None,
            run_level=1,
            ):
        """
        Required:
            'in_file'--path of the file to parse (it is replaced by the
            processed result when fix_fields() finishes)
            'bug_handler'--handed on to the helper objects this class
            creates (field_strings.FieldStrings and copy.Copy)
        Optional:
            'copy'--whether to make a copy of result for debugging
            'run_level'--handed on to field_strings.FieldStrings
        Returns:
            nothing
        """
        self.__file = in_file
        self.__bug_handler = bug_handler
        self.__copy = copy
        self.__run_level = run_level
        # temporary file that receives the rewritten token stream
        self.__write_to = better_mktemp()

    def __initiate_values(self):
        """
        Initiate all values: the parser state, the dispatch tables and the
        per-field stacks.
        """
        self.__text_string = ''
        self.__marker = 'mi<mk<inline-fld\n'
        self.__state = 'before_body'
        self.__string_obj = field_strings.FieldStrings(
            run_level=self.__run_level,
            bug_handler=self.__bug_handler,
        )
        # state name -> method that handles a line while in that state
        self.__state_dict = {
            'before_body': self.__before_body_func,
            'in_body': self.__in_body_func,
            'field': self.__in_field_func,
            'field_instruction': self.__field_instruction_func,
        }
        # tokens that trigger an action while in the body
        self.__in_body_dict = {
            'cw<fd<field_____': self.__found_field_func,
        }
        # tokens that trigger an action while inside a field
        self.__field_dict = {
            'cw<fd<field-inst': self.__found_field_instruction_func,
            'cw<fd<field_____': self.__found_field_func,
            'cw<pf<par-end___': self.__par_in_field_func,
            'cw<sc<section___': self.__sec_in_field_func,
        }
        self.__field_count = []  # keep track of the brackets
        self.__field_instruction = []  # field instruction strings
        self.__symbol = 0  # whether or not the field is really UTF-8
        # (these fields cannot be nested.)
        self.__field_instruction_string = ''  # string that collects field instruction
        self.__par_in_field = []  # paragraphs in field?
        self.__sec_in_field = []  # sections in field?
        self.__field_string = []  # list of field strings

    def __before_body_func(self, line):
        """
        Required:
            line --line to parse
        Returns:
            nothing (may change the state; writes the line)
        Logic:
            Check for the beginning of the body. If found, change the state.
            Always write out the line.
        """
        if self.__token_info == 'mi<mk<body-open_':
            self.__state = 'in_body'
        self.__write_obj.write(line)

    def __in_body_func(self, line):
        """
        Required:
            line --line to parse
        Returns:
            nothing. (Writes a line to the output file, or performs other actions.)
        Logic:
            Check for the beginning of a field. Always output the line.
        """
        action = self.__in_body_dict.get(self.__token_info)
        if action:
            action(line)
        self.__write_obj.write(line)

    def __found_field_func(self, line):
        """
        Requires:
            line --line to parse
        Returns:
            nothing
        Logic:
            Set the values for parsing the field. Four lists have to have
            items appended to them: the field strings, the bracket counts,
            and the section and paragraph flags.
        """
        self.__state = 'field'
        # reset so a stale closing-bracket count cannot end the new field
        self.__cb_count = 0
        self.__field_string.append('')
        # the field ends when a closing bracket carries this same count
        self.__field_count.append(self.__ob_count)
        self.__sec_in_field.append(0)
        self.__par_in_field.append(0)

    def __in_field_func(self, line):
        """
        Requires:
            line --line to parse
        Returns:
            nothing.
        Logic:
            Check for the end of the field; a paragraph break; a section break;
            the beginning of another field; or the beginning of the field
            instruction. Any other line is collected in the current field
            string.
        """
        if self.__cb_count == self.__field_count[-1]:
            self.__field_string[-1] += line
            self.__end_field_func()
            return
        action = self.__field_dict.get(self.__token_info)
        if action:
            action(line)
        else:
            self.__field_string[-1] += line

    def __par_in_field_func(self, line):
        """
        Requires:
            line --line to parse
        Returns:
            nothing
        Logic:
            Add the line to the current field string and set the last item
            in the paragraph in field list to true.
        """
        self.__field_string[-1] += line
        self.__par_in_field[-1] = 1

    def __sec_in_field_func(self, line):
        """
        Requires:
            line --line to parse
        Returns:
            nothing
        Logic:
            Add the line to the current field string and set the last item
            in the section in field list to true.
        """
        self.__field_string[-1] += line
        self.__sec_in_field[-1] = 1

    def __found_field_instruction_func(self, line):
        """
        Requires:
            line -- line to parse
        Returns:
            nothing
        Logic:
            Change the state to field instruction. Set the open bracket count
            of the beginning of this field instruction so you know when it
            ends. Set the closed bracket count to 0 so you don't prematurely
            exit this state.
        """
        self.__state = 'field_instruction'
        self.__field_instruction_count = self.__ob_count
        self.__cb_count = 0

    def __field_instruction_func(self, line):
        """
        Requires:
            line --line to parse
        Returns:
            nothing
        Logic:
            Collect all the lines until the end of the field instruction is
            reached. Process these lines with the field_strings module.
            Check if the field instruction is 'Symbol' (really UTF-8).
        """
        if self.__cb_count != self.__field_instruction_count:
            self.__field_instruction_string += line
            return
        # The closing bracket should be written, since the opening bracket
        # was written
        self.__field_string[-1] += line
        my_list = self.__string_obj.process_string(
            self.__field_instruction_string, 'field_instruction')
        self.__field_instruction.append(my_list[2])
        if my_list[0] == 'Symbol':
            self.__symbol = 1
        self.__state = 'field'
        self.__field_instruction_string = ''

    def __end_field_func(self):
        """
        Requires:
            nothing
        Returns:
            Nothing
        Logic:
            Pop the last values in the instructions list, the fields list, the
            paragraph list, and the section list.
            If the field is a symbol, do not write the tags <field></field>,
            since this field is really just UTF-8.
            If the field contains paragraph or section breaks, it is a
            field-block rather than just a field.
            Write the paragraph or section markers for later parsing of the
            file.
            If the field list contains more strings, add the latest
            (processed) string to the last string in the list. Otherwise,
            write the string to the output file.
        """
        last_bracket = self.__field_count.pop()
        instruction = self.__field_instruction.pop()
        inner_field_string = self.__field_string.pop()
        sec_in_field = self.__sec_in_field.pop()
        par_in_field = self.__par_in_field.pop()
        if self.__symbol:
            # the collected field text is discarded; only the processed
            # instruction and a closing bracket are kept
            inner_field_string = '%scb<nu<clos-brack<%s\n' % (
                instruction, last_bracket)
        elif sec_in_field or par_in_field:
            inner_field_string = (
                'mi<mk<fldbkstart\n'
                'mi<tg<open-att__<field-block<type>%s\n%s'
                'mi<mk<fldbk-end_\n'
                'mi<tg<close_____<field-block\n'
                'mi<mk<fld-bk-end\n'
            ) % (instruction, inner_field_string)
        else:
            # write a marker to show an inline field for later parsing
            inner_field_string = (
                '%s'
                'mi<tg<open-att__<field<type>%s\n%s'
                'mi<tg<close_____<field\n'
            ) % (self.__marker, instruction, inner_field_string)
        if sec_in_field:
            inner_field_string = (
                'mi<mk<sec-fd-beg\n' + inner_field_string + 'mi<mk<sec-fd-end\n')
        if par_in_field:
            inner_field_string = 'mi<mk<par-in-fld\n' + inner_field_string
        if not self.__field_string:
            # outermost field is finished: emit it and go back to the body
            self.__write_field_string(inner_field_string)
        else:
            # nested field: it becomes part of the enclosing field's text
            self.__field_string[-1] += inner_field_string
        self.__symbol = 0

    def __write_field_string(self, the_string):
        """
        Requires:
            the_string --the finished string for an outermost field
        Returns:
            nothing
        Logic:
            Return to the body state and write the string to the output file.
        """
        self.__state = 'in_body'
        self.__write_obj.write(the_string)

    def __process_line(self, line):
        """
        Requires:
            line --one line (token) of the input file
        Returns:
            nothing
        Logic:
            Record the token type and the latest open/closed bracket counts,
            then hand the line to the method for the current state.
            If the state has no handler, report it on stderr and write the
            line through unchanged, rather than failing with a TypeError by
            calling None.
        """
        self.__token_info = line[:16]
        if self.__token_info == 'ob<nu<open-brack':
            self.__ob_count = line[-5:-1]
        if self.__token_info == 'cb<nu<clos-brack':
            self.__cb_count = line[-5:-1]
        action = self.__state_dict.get(self.__state)
        if action is None:
            sys.stderr.write('no matching state in module fields_large.py\n')
            sys.stderr.write(self.__state + '\n')
            self.__write_obj.write(line)
            return
        action(line)

    def fix_fields(self):
        """
        Requires:
            nothing
        Returns:
            nothing (changes the original file)
        Logic:
            Read one line in at a time. Determine what action to take based on
            the state. If the state is before the body, look for the
            beginning of the body.
            If the state is body, send the line to the body method.
            When the whole file has been read, optionally keep a debugging
            copy of the result, replace the original file with the result
            and remove the temporary file.
        """
        self.__initiate_values()
        read_obj = open_for_read(self.__file)
        try:
            self.__write_obj = open_for_write(self.__write_to)
            try:
                while True:
                    line = read_obj.readline()
                    if not line:
                        # end of input; do not dispatch the empty string
                        break
                    self.__process_line(line)
            finally:
                # close both files even if a handler raised
                self.__write_obj.close()
        finally:
            read_obj.close()
        copy_obj = copy.Copy(bug_handler=self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "fields_large.data")
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)