%PDF- %PDF-
| Direktori : /proc/self/root/usr/lib/calibre/calibre/ebooks/pml/ |
| Current File : //proc/self/root/usr/lib/calibre/calibre/ebooks/pml/pmlconverter.py |
'''
Convert pml markup to and from html
'''
__license__ = 'GPL v3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import os
import re
import io
from copy import deepcopy
from calibre import my_unichr, prepare_string_for_xml
from calibre.ebooks.metadata.toc import TOC
class PML_HTMLizer:
    '''
    Stateful converter from PML markup to HTML.

    ``parse_pml`` walks the markup line by line and character by character.
    Each PML code toggles an entry in ``self.state``; every output line is
    wrapped in ``<p>...</p>`` and the tags of all still-active states are
    closed at the end of the line and re-opened at the start of the next one.
    Chapter codes encountered while parsing are collected in ``self.toc`` and
    can be turned into a TOC tree with ``get_toc``.
    '''

    # Every state tracked in ``self.state`` as ``[is_open, value]``.
    STATES = [
        'i', 'u', 'd', 'b', 'sp', 'sb',
        'h1', 'h1c', 'h2', 'h3', 'h4', 'h5', 'h6',
        'a', 'ra', 'c', 'r', 's', 'l', 'k', 'FN', 'SB',
    ]

    # States whose opening tag takes one ="value" argument.
    STATES_VALUE_REQ = ['a', 'FN', 'SB']

    # States whose opening tag uses the ="value" argument twice.
    STATES_VALUE_REQ_2 = ['ra']

    # States whose closing tag also needs the stored value.
    STATES_CLOSE_VALUE_REQ = ['FN', 'SB']

    # state -> (opening tag, closing tag). '%s' is filled with the state's
    # value as described by the three lists above.
    STATES_TAGS = {
        'h1': ('<h1 style="page-break-before: always;">', '</h1>'),
        'h1c': ('<h1>', '</h1>'),
        'h2': ('<h2>', '</h2>'),
        'h3': ('<h3>', '</h3>'),
        'h4': ('<h4>', '</h4>'),
        'h5': ('<h5>', '</h5>'),
        'h6': ('<h6>', '</h6>'),
        'sp': ('<sup>', '</sup>'),
        'sb': ('<sub>', '</sub>'),
        'a': ('<a href="#%s">', '</a>'),
        'ra': ('<span id="r%s"></span><a href="#%s">', '</a>'),
        'c': ('<div style="text-align: center; margin: auto;">', '</div>'),
        'r': ('<div style="text-align: right;">', '</div>'),
        't': ('<div style="margin-left: 5%;">', '</div>'),
        'T': ('<div style="text-indent: %s;">', '</div>'),
        'i': ('<span style="font-style: italic;">', '</span>'),
        'u': ('<span style="text-decoration: underline;">', '</span>'),
        'd': ('<span style="text-decoration: line-through;">', '</span>'),
        'b': ('<span style="font-weight: bold;">', '</span>'),
        'l': ('<span style="font-size: 150%;">', '</span>'),
        'k': ('<span style="font-size: 75%; font-variant: small-caps;">', '</span>'),
        'FN': ('<br /><br style="page-break-after: always;" /><div id="fn-%s"><p>', '</p><small><a href="#rfn-%s">return</a></small></div>'),
        'SB': ('<br /><br style="page-break-after: always;" /><div id="sb-%s"><p>', '</p><small><a href="#rsb-%s">return</a></small></div>'),
    }

    # PML code (the text after the backslash) -> state it toggles.
    CODE_STATES = {
        'q': 'a',
        'x': 'h1',
        'X0': 'h2',
        'X1': 'h3',
        'X2': 'h4',
        'X3': 'h5',
        'X4': 'h6',
        'Sp': 'sp',
        'Sb': 'sb',
        'c': 'c',
        'r': 'r',
        'i': 'i',
        'I': 'i',
        'u': 'u',
        'o': 'd',
        'b': 'b',
        'B': 'b',
        'l': 'l',
        'k': 'k',
        'Fn': 'ra',
        'Sd': 'ra',
        'FN': 'FN',
        'SB': 'SB',
    }

    # States that produce links; a leading '#' is stripped from their value.
    LINK_STATES = ['a', 'ra']

    # States handled by process_code_block.
    BLOCK_STATES = ['a', 'ra', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'sb', 'sp']

    # States rendered as <div>; handled by process_code_div.
    DIV_STATES = ['c', 'r', 'FN', 'SB']

    # States rendered as <span>; handled by process_code_span.
    SPAN_STATES = ['l', 'k', 'i', 'u', 'd', 'b']

    # States that are re-opened as a different state on the following line.
    NEW_LINE_EXCHANGE_STATES = {
        'h1': 'h1c',
    }

    def __init__(self):
        self.state = {}
        # toc consists of a tuple
        # (level, (href, id, text))
        self.toc = []
        self.file_name = ''

    # -- tag helpers ---------------------------------------------------------

    def _open_tag(self, state, value):
        '''Return the opening tag of ``state`` with ``value`` filled in if the tag takes one.'''
        tag = self.STATES_TAGS[state][0]
        if state in self.STATES_VALUE_REQ:
            return tag % value
        if state in self.STATES_VALUE_REQ_2:
            return tag % (value, value)
        return tag

    def _close_tag(self, state):
        '''Return the closing tag of ``state``, filling in its stored value if required.'''
        tag = self.STATES_TAGS[state][1]
        if state in self.STATES_CLOSE_VALUE_REQ:
            return tag % self.state[state][1]
        return tag

    def _open_code(self, code, stream):
        '''Open ``code``: read its ="value" from ``stream`` when it needs one and remember it.'''
        if code in self.STATES_VALUE_REQ or code in self.STATES_VALUE_REQ_2:
            val = self.code_value(stream)
            self.state[code][1] = val
            return self._open_tag(code, val)
        return self.STATES_TAGS[code][0]

    def _active(self, states):
        '''Return the members of ``states`` that are currently open, in order.'''
        return [s for s in states if self.state[s][0]]

    # -- pre/post processing -------------------------------------------------

    def prepare_pml(self, pml):
        '''
        Normalise raw PML text before it is parsed.

        Chapter codes get their stripped text attached as a ="value",
        comments and superfluous spaces are removed, <footnote>/<sidebar>
        elements become FN/SB codes, character codes are resolved and the
        result is escaped for XML. Returns the prepared string.
        '''
        # Give Chapters the form \\*='text'text\\*. This is used for generating
        # the TOC later.
        pml = re.sub(r'(?msu)(?P<c>\\x)(?P<text>.*?)(?P=c)', lambda match: '%s="%s"%s%s' %
                     (match.group('c'), self.strip_pml(match.group('text')), match.group('text'), match.group('c')), pml)
        pml = re.sub(r'(?msu)(?P<c>\\X[0-4])(?P<text>.*?)(?P=c)', lambda match: '%s="%s"%s%s' %
                     (match.group('c'), self.strip_pml(match.group('text')), match.group('text'), match.group('c')), pml)

        # Remove comments
        pml = re.sub(r'(?mus)\\v(?P<text>.*?)\\v', '', pml)

        # Remove extra white spaces.
        pml = re.sub(r'(?mus)[ ]{2,}', ' ', pml)
        pml = re.sub(r'(?mus)^[ ]*(?=.)', '', pml)
        pml = re.sub(r'(?mus)(?<=.)[ ]*$', '', pml)
        pml = re.sub(r'(?mus)^[ ]*$', '', pml)

        # Footnotes and Sidebars. Elements without text are dropped.
        pml = re.sub(r'(?mus)<footnote\s+id="(?P<target>.+?)">\s*(?P<text>.*?)\s*</footnote>', lambda match: '\\FN="%s"%s\\FN' %
                     (match.group('target'), match.group('text')) if match.group('text') else '', pml)
        pml = re.sub(r'(?mus)<sidebar\s+id="(?P<target>.+?)">\s*(?P<text>.*?)\s*</sidebar>', lambda match: '\\SB="%s"%s\\SB' %
                     (match.group('target'), match.group('text')) if match.group('text') else '', pml)

        # Escape every & so that an entity written literally in the text
        # (for example the five characters "&amp;") is shown as typed
        # instead of being interpreted as an entity later on.
        pml = pml.replace('&', '&amp;')

        # Replace \\a and \\U with either the unicode character or the entity.
        pml = re.sub(r'\\a(?P<num>\d{3})', lambda match: '&#%s;' % match.group('num'), pml)
        pml = re.sub(r'\\U(?P<num>[0-9a-f]{4})', lambda match: '%s' % my_unichr(int(match.group('num'), 16)), pml)

        pml = prepare_string_for_xml(pml)

        return pml

    def strip_pml(self, pml):
        '''
        Return ``pml`` with all PML codes removed, line breaks turned into
        spaces and surrounding whitespace stripped.
        '''
        # Codes carrying a ="value". The match is non-greedy so that text
        # between two quoted values on the same line is kept.
        pml = re.sub(r'\\C\d=".*?"', '', pml)
        pml = re.sub(r'\\Fn=".*?"', '', pml)
        pml = re.sub(r'\\Sd=".*?"', '', pml)
        pml = re.sub(r'\\.=".*?"', '', pml)
        pml = re.sub(r'\\X\d', '', pml)
        pml = re.sub(r'\\S[pbd]', '', pml)
        pml = re.sub(r'\\Fn', '', pml)
        pml = re.sub(r'\\a\d\d\d', '', pml)
        # Same hexadecimal form that prepare_pml accepts for \U.
        pml = re.sub(r'\\U[0-9a-f]{4}', '', pml)
        pml = re.sub(r'\\.', '', pml)
        pml = pml.replace('\r\n', ' ')
        pml = pml.replace('\n', ' ')
        pml = pml.replace('\r', ' ')
        pml = pml.strip()

        return pml

    def cleanup_html(self, html):
        '''
        Remove empty tag pairs until nothing changes any more, then strip
        the leading whitespace of every line. Returns the cleaned HTML.
        '''
        while True:
            cleaned = self.cleanup_html_remove_redundant(html)
            if cleaned == html:
                break
            html = cleaned
        return re.sub(r'(?imu)^\s*', '', html)

    def cleanup_html_remove_redundant(self, html):
        '''Run one pass that removes tag pairs from STATES_TAGS with nothing but whitespace inside.'''
        for key, (open_tag, close_tag) in self.STATES_TAGS.items():
            if key in self.STATES_VALUE_REQ:
                # The value placeholder may match any value.
                html = re.sub(r'(?u){}\s*{}'.format(open_tag % '.*?', close_tag), '', html)
            else:
                html = re.sub(fr'(?u){open_tag}\s*{close_tag}', '', html)
        html = re.sub(r'(?imu)<p>\s*</p>', '', html)
        return html

    # -- line handling -------------------------------------------------------

    def start_line(self):
        '''
        Return ``<p>`` followed by the opening tags of every active state,
        ordered as: other states, then divs, then spans.

        States listed in NEW_LINE_EXCHANGE_STATES are opened as their
        replacement state. ``self.state`` itself is not modified.
        '''
        # A shallow copy is enough: entries are only replaced, never mutated.
        state = dict(self.state)
        # Iterate over a snapshot because the loop assigns into ``state``.
        for key, val in list(state.items()):
            if key in self.NEW_LINE_EXCHANGE_STATES and val[0]:
                state[self.NEW_LINE_EXCHANGE_STATES[key]] = val
                state[key] = [False, '']

        div = []
        span = []
        other = []
        for key, val in state.items():
            if not val[0]:
                continue
            if key in self.DIV_STATES:
                div.append((key, val[1]))
            elif key in self.SPAN_STATES:
                span.append((key, val[1]))
            else:
                other.append((key, val[1]))

        start = ''.join(self._open_tag(key, val) for key, val in other + div + span)
        return '<p>%s' % start

    def end_line(self):
        '''
        Return the closing tags of every active state followed by ``</p>``,
        ordered as: spans, then divs, then other states.
        '''
        div = []
        span = []
        other = []
        for key, val in self.state.items():
            if not val[0]:
                continue
            if key in self.DIV_STATES:
                div.append(key)
            elif key in self.SPAN_STATES:
                span.append(key)
            else:
                other.append(key)

        end = ''.join(self._close_tag(key) for key in span + div + other)
        return '%s</p>' % end

    # -- code handling -------------------------------------------------------

    def process_code(self, code, stream, pre=''):
        '''
        Toggle the state belonging to the PML ``code`` and return the HTML
        this produces. Unknown codes yield an empty string.

        ``stream`` is positioned directly after the code so a ="value" can
        be read from it; ``pre`` is a prefix for link targets.
        '''
        code = self.CODE_STATES.get(code, None)
        if not code:
            return ''

        if code in self.DIV_STATES:
            text = self.process_code_div(code, stream)
        elif code in self.SPAN_STATES:
            text = self.process_code_span(code, stream)
        elif code in self.BLOCK_STATES:
            text = self.process_code_block(code, stream, pre)
        else:
            text = self.process_code_simple(code, stream)

        # The handlers above only emit markup; the toggle happens here.
        self.state[code][0] = not self.state[code][0]

        return text

    def process_code_simple(self, code, stream):
        '''Return the closing tag if ``code`` is open, otherwise open it.'''
        if self.state[code][0]:
            return self._close_tag(code)
        return self._open_code(code, stream)

    def process_code_div(self, code, stream):
        '''
        Return the markup for toggling a div state.

        Spans may not straddle a div boundary, so active spans (and, when
        closing, active divs) are closed first and re-opened afterwards.
        '''
        if self.state[code][0]:
            # Close everything, then re-open all but the div being closed.
            parts = [self._close_tag(c) for c in self._active(self.SPAN_STATES + self.DIV_STATES)]
            parts += [self._open_tag(c, self.state[c][1])
                      for c in self._active(self.DIV_STATES + self.SPAN_STATES) if c != code]
        else:
            open_spans = self._active(self.SPAN_STATES)
            parts = [self._close_tag(c) for c in open_spans]
            parts.append(self._open_code(code, stream))
            parts += [self._open_tag(c, self.state[c][1]) for c in open_spans]
        return ''.join(parts)

    def process_code_span(self, code, stream):
        '''
        Return the markup for toggling a span state.

        When closing, every active span is closed and all of them except
        ``code`` are re-opened so the tags stay properly nested.
        '''
        if not self.state[code][0]:
            return self._open_code(code, stream)

        open_spans = self._active(self.SPAN_STATES)
        parts = [self._close_tag(c) for c in open_spans]
        # Re-open each span with its own tag and value.
        parts += [self._open_tag(c, self.state[c][1]) for c in open_spans if c != code]
        return ''.join(parts)

    def process_code_block(self, code, stream, pre=''):
        '''
        Return the markup for toggling a block state (links, headings,
        super/subscript). Active spans are closed before and re-opened after.

        For link states a leading ``#`` is stripped from the value, and a
        non-empty ``pre`` is prepended to the value as ``pre-value``.
        '''
        open_spans = self._active(self.SPAN_STATES)
        parts = [self._close_tag(c) for c in open_spans]

        if self.state[code][0]:
            parts.append(self._close_tag(code))
        elif code in self.STATES_VALUE_REQ or code in self.STATES_VALUE_REQ_2:
            val = self.code_value(stream)
            if code in self.LINK_STATES:
                val = val.lstrip('#')
            if pre:
                val = f'{pre}-{val}'
            self.state[code][1] = val
            parts.append(self._open_tag(code, val))
        else:
            parts.append(self.STATES_TAGS[code][0])

        # Re-open each span with its own tag and value.
        parts += [self._open_tag(c, self.state[c][1]) for c in open_spans]
        return ''.join(parts)

    def code_value(self, stream):
        '''
        Read a ="value" argument from ``stream`` and return the value with
        surrounding whitespace stripped.

        Spaces are tolerated before and after the ``=``. If the sequence is
        malformed or incomplete the stream is rewound to where it started
        and an empty string is returned, so no input is lost.
        '''
        value = ''
        # state 0 is before =
        # state 1 is before the first "
        # state 2 is before the second "
        # state 3 is after the second "
        state = 0
        loc = stream.tell()

        c = stream.read(1)
        while c != '':
            if state == 0:
                if c == '=':
                    state = 1
                elif c != ' ':
                    # Anything other than a space or = right after the code
                    # means the markup is invalid; stop looking.
                    break
            elif state == 1:
                if c == '"':
                    state = 2
                elif c != ' ':
                    # " should follow =, only blank space is allowed between.
                    break
            elif state == 2:
                if c == '"':
                    state = 3
                    break
                value += c
            c = stream.read(1)

        if state != 3:
            # Unable to complete the sequence to retrieve the value. Reset
            # the stream to the location it started.
            stream.seek(loc)
            value = ''

        return value.strip()

    # -- main entry points ---------------------------------------------------

    def parse_pml(self, pml, file_name=''):
        '''
        Convert the PML string ``pml`` to HTML and return it.

        Resets ``self.state`` and ``self.toc``; TOC entries found while
        parsing reference the base name of ``file_name``.
        '''
        pml = self.prepare_pml(pml)
        output = []

        self.state = {}
        self.toc = []
        self.file_name = file_name

        # t: Are we in an open \t tag set?
        # T: Are we in an open \T?
        # st: Did the \t start the line?
        # sT: Did the \T start the line?
        # et: Did the \t end the line?
        indent_state = {'t': False, 'T': False, 'st': False, 'sT': False, 'et': False}
        basic_indent = False
        adv_indent_val = ''
        # Keep track of the number of empty lines
        # between paragraphs. When we reach a set number
        # we assume it's a soft scene break.
        empty_count = 0

        for s in self.STATES:
            self.state[s] = [False, '']

        for line in pml.splitlines():
            parsed = []
            empty = True
            basic_indent = indent_state['t']
            indent_state['T'] = False
            # Determine if the \t starts the line or if we are
            # in an open \t block.
            if line.lstrip().startswith('\\t') or basic_indent:
                basic_indent = True
                indent_state['st'] = True
            else:
                indent_state['st'] = False
            # Determine if the \T starts the line.
            indent_state['sT'] = line.lstrip().startswith('\\T')
            # Determine if the \t ends the line.
            indent_state['et'] = line.rstrip().endswith('\\t')

            if isinstance(line, bytes):
                line = line.decode('utf-8')

            line = io.StringIO(line)
            parsed.append(self.start_line())

            c = line.read(1)
            while c != '':
                text = ''

                if c == '\\':
                    c = line.read(1)

                    if c in 'qcriIuobBlk':
                        text = self.process_code(c, line)
                    elif c in 'FS':
                        two_char = f'{c}{line.read(1)}'
                        if two_char == 'Fn':
                            text = self.process_code('Fn', line, 'fn')
                        elif two_char == 'FN':
                            text = self.process_code('FN', line)
                        elif two_char == 'SB':
                            text = self.process_code('SB', line)
                        elif two_char == 'Sd':
                            text = self.process_code('Sd', line, 'sb')
                        elif two_char in ('Sp', 'Sb'):
                            # Superscript / subscript are defined in
                            # CODE_STATES and must be dispatched as well.
                            text = self.process_code(two_char, line)
                    elif c in 'xXC':
                        empty = False
                        # The PML was modified earlier so x and X put the text
                        # inside of ="" so we don't have do special processing
                        # for C.
                        t = ''
                        level = 0
                        if c in 'XC':
                            level = line.read(1)
                        toc_id = 'pml_toc-%s' % len(self.toc)
                        value = self.code_value(line)
                        if c == 'x':
                            t = self.process_code(c, line)
                        elif c == 'X':
                            t = self.process_code(f'{c}{level}', line)
                        if not value:
                            text = t
                        else:
                            self.toc.append((level, (os.path.basename(self.file_name), toc_id, value)))
                            text = f'{t}<span id="{toc_id}"></span>'
                    elif c == 'm':
                        empty = False
                        src = self.code_value(line)
                        text = '<img src="images/%s" />' % src
                    elif c == 'Q':
                        empty = False
                        anchor = self.code_value(line)
                        text = '<span id="%s"></span>' % anchor
                    elif c == 'p':
                        empty = False
                        text = '<br /><br style="page-break-after: always;" />'
                    elif c == 'n':
                        pass
                    elif c == 'w':
                        empty = False
                        text = '<hr style="width: %s" />' % self.code_value(line)
                    elif c == 't':
                        indent_state['t'] = not indent_state['t']
                    elif c == 'T':
                        # Ensure we only store the value on the first T set for the line.
                        if not indent_state['T']:
                            adv_indent_val = self.code_value(line)
                        else:
                            # We detected a T previously on this line.
                            # Don't replace the first detected value.
                            self.code_value(line)
                        indent_state['T'] = True
                    elif c == '-':
                        empty = False
                        # Soft hyphen, written as an escape so the invisible
                        # character cannot get lost in the source.
                        text = '\u00ad'
                    elif c == '\\':
                        empty = False
                        text = '\\'
                else:
                    if c != ' ':
                        empty = False
                    text = c
                parsed.append(text)
                c = line.read(1)

            if empty:
                empty_count += 1
                if empty_count == 2:
                    # Must be the entity: a paragraph holding only
                    # whitespace (including U+00A0) is removed again by
                    # cleanup_html_remove_redundant.
                    output.append('<p>&nbsp;</p>')
            else:
                empty_count = 0

            text = self.end_line()
            parsed.append(text)

            # Basic indent will be set if the \t starts the line or
            # if we are in a continuing \t block.
            if basic_indent:
                # if the \t started the line and either it ended the line or the \t
                # block is still open use a left margin.
                if indent_state['st'] and (indent_state['et'] or indent_state['t']):
                    parsed.insert(0, self.STATES_TAGS['t'][0])
                    parsed.append(self.STATES_TAGS['t'][1])
                # Use a text indent instead of a margin.
                # This handles cases such as:
                # \tO\tne upon a time...
                else:
                    parsed.insert(0, self.STATES_TAGS['T'][0] % '5%')
                    parsed.append(self.STATES_TAGS['T'][1])
            # \t will override \T's on the line.
            # We only handle \T's that started the line.
            elif indent_state['T'] and indent_state['sT']:
                parsed.insert(0, self.STATES_TAGS['T'][0] % adv_indent_val)
                parsed.append(self.STATES_TAGS['T'][1])
                indent_state['T'] = False
                adv_indent_val = ''

            output.append(''.join(parsed))
            line.close()

        output = self.cleanup_html('\n'.join(output))

        return output

    def get_toc(self):
        '''
        Toc can have up to 5 levels, 0 - 4 inclusive.

        This function will add items to their appropriate
        depth in the TOC tree. If the specified depth is
        invalid (item would not have a valid parent) add
        it to the next valid level above the specified
        level.
        '''
        # Base toc object all items will be added to.
        n_toc = TOC()
        # latest[n] is the node new items of depth n + 1 are attached to.
        latest = [None, None, None, None]
        known_depths = {'0': 0, '1': 1, '2': 2, '3': 3}

        for level, (href, anchor, text) in self.toc:
            # parse_pml stores the level as a digit string for \X and \C but
            # as the int 0 for \x; str() makes both compare alike. Anything
            # that is not 0-3 is counted as level 4.
            depth = known_depths.get(str(level), 4)

            if depth == 0:
                parent = n_toc
            else:
                if latest[depth - 1] is None:
                    # No valid parent: fall back to the nearest existing
                    # ancestor (or the root) and keep using it.
                    latest[depth - 1] = next(
                        (node for node in reversed(latest[:depth - 1]) if node is not None),
                        n_toc)
                parent = latest[depth - 1]

            item = parent.add_item(href, anchor, text)

            if depth < 4:
                latest[depth] = item
                # A new node invalidates the deeper ones tracked so far.
                for deeper in range(depth + 1, 4):
                    latest[deeper] = None

        return n_toc
def pml_to_html(pml):
    '''Convert the PML string ``pml`` to HTML using a fresh PML_HTMLizer.'''
    return PML_HTMLizer().parse_pml(pml)
def footnote_sidebar_to_html(pre_id, id, pml):
    '''
    Render a footnote or sidebar as a page-broken HTML <div>.

    ``pre_id`` is the id prefix, ``id`` the target id (surrounding \\x01
    characters are removed) and ``pml`` the PML body. When the id is blank
    the div gets no id and no "return" link.
    '''
    id = id.strip('\x01')
    if not id.strip():
        return '<br /><br style="page-break-after: always;" /><div>%s</div>' % pml_to_html(pml)
    return '<br /><br style="page-break-after: always;" /><div id="{}-{}">{}<small><a href="#r{}-{}">return</a></small></div>'.format(
        pre_id, id, pml_to_html(pml), pre_id, id)
def footnote_to_html(id, pml):
    '''Render the footnote ``id`` with PML body ``pml`` as HTML (id prefix "fn").'''
    return footnote_sidebar_to_html(pre_id='fn', id=id, pml=pml)
def sidebar_to_html(id, pml):
    '''Render the sidebar ``id`` with PML body ``pml`` as HTML (id prefix "sb").'''
    return footnote_sidebar_to_html(pre_id='sb', id=id, pml=pml)