%PDF- %PDF-
| Direktori : /proc/thread-self/root/usr/lib/calibre/calibre/ebooks/snb/ |
| Current File : //proc/thread-self/root/usr/lib/calibre/calibre/ebooks/snb/snbml.py |
__license__ = 'GPL 3'
__copyright__ = '2010, Li Fanxi <lifanxi@freemindworld.com>'
__docformat__ = 'restructuredtext en'
'''
Transform OEB content into SNB format
'''
import os
import re
from lxml import etree
from polyglot.builtins import string_or_bytes
def ProcessFileName(fileName):
    """Return a flat, lower-case file name derived from *fileName*.

    Path separators ("/" and ``os.sep``) and the "#" that introduces an
    HTML fragment are each replaced by "_", the result is lower-cased, and
    any of the extensions .jpeg/.jpg/.gif/.svg/.png is rewritten to .jpg.
    """
    flattened = fileName
    # Flatten the path and neutralise the bookmark marker of HTML hrefs.
    for marker in ("/", os.sep, "#"):
        flattened = flattened.replace(marker, "_")
    flattened = flattened.lower()

    # Every image is referred to by a .jpg name.
    stem, suffix = os.path.splitext(flattened)
    if suffix in ('.jpeg', '.jpg', '.gif', '.svg', '.png'):
        return stem + '.jpg'
    return flattened
# Tags whose content is emitted as a paragraph of its own (see dump_text).
BLOCK_TAGS = ['div', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'tr']

# CSS 'display' values that are treated the same way as BLOCK_TAGS.
BLOCK_STYLES = ['block']

# Tags that are preceded by a single space instead of a paragraph break.
SPACE_TAGS = ['td']

# In-band markers written into the intermediate text by dump_text() and
# interpreted line by line in mlize(): image reference, bookmark (sub item)
# switch and pre-formatted line, respectively.
CALIBRE_SNB_IMG_TAG = "<$$calibre_snb_temp_img$$>"
CALIBRE_SNB_BM_TAG = "<$$calibre_snb_bm_tag$$>"
CALIBRE_SNB_PRE_TAG = "<$$calibre_snb_pre_tag$$>"
class SNBMLizer:
    """Convert one OEB XHTML item into SNBC element trees.

    The XHTML body is first flattened to plain text by dump_text(), with
    in-band marker strings (CALIBRE_SNB_*_TAG) for images, bookmarks and
    pre-formatted lines.  cleanup_text() normalises that text and mlize()
    turns it, line by line, into ``<text>``/``<img>`` children of one
    ``<snbc>`` tree per sub item.
    """

    # Id of the bookmark (sub item) most recently emitted by dump_text().
    # The empty string is the sub item mlize() starts writing into.
    curSubItem = ""

    def __init__(self, log):
        """Remember *log*, the logger used for info/debug messages."""
        self.log = log

    def extract_content(self, oeb_book, item, subitems, opts):
        """Convert *item* and return the trees built by mlize().

        :param oeb_book: book object handed on to the Stylizer.
        :param item: item to convert; its ``data`` and ``href`` are used.
        :param subitems: iterable of ``(id, title)`` pairs, one per tree.
        :param opts: conversion options object.
        :return: dict mapping each sub item id to its ``<snbc>`` element.
        """
        self.log.info('Converting XHTML to SNBC...')
        self.oeb_book = oeb_book
        self.opts = opts
        self.item = item
        self.subitems = subitems
        # mlize() always starts in the '' sub item, so the bookmark tracker
        # must start there too, even when this instance is being reused.
        self.curSubItem = ""
        return self.mlize()

    def merge_content(self, old_tree, oeb_book, item, subitems, opts):
        """Convert *item* and append the result to the body of *old_tree*.

        The children of every newly built ``<body>`` are moved into the
        first ``<body>`` found in *old_tree*.  Nothing is merged when
        *old_tree* has no ``<body>``.
        """
        newTrees = self.extract_content(oeb_book, item, subitems, opts)
        body = old_tree.find(".//body")
        if body is None:
            return
        for new_tree in newTrees.values():
            # append() re-parents the node, so walk a snapshot of the
            # children rather than the live element being emptied.
            for entity in list(new_tree.find(".//body")):
                body.append(entity)

    def mlize(self):
        """Build the SNBC trees for the item set by extract_content().

        :return: dict mapping each sub item id to an ``<snbc>`` element
            holding a ``<head>`` (title, optional hidetitle) and a
            ``<body>`` filled with ``<text>`` and ``<img>`` elements.
        :raises KeyError: if no sub item with the empty id ``''`` exists,
            because output always starts in that sub item.
        """
        from calibre.ebooks.oeb.base import XHTML
        from calibre.ebooks.oeb.stylizer import Stylizer
        from calibre.utils.xml_parse import safe_xml_fromstring

        stylizer = Stylizer(self.item.data, self.item.href, self.oeb_book, self.opts, self.opts.output_profile)
        content = etree.tostring(self.item.data.find(XHTML('body')), encoding='unicode')

        # One empty <snbc> skeleton per sub item.
        trees = {}
        for subitem, subtitle in self.subitems:
            snbcTree = etree.Element("snbc")
            snbcHead = etree.SubElement(snbcTree, "head")
            etree.SubElement(snbcHead, "title").text = subtitle
            if self.opts and self.opts.snb_hide_chapter_name:
                etree.SubElement(snbcHead, "hidetitle").text = "true"
            etree.SubElement(snbcTree, "body")
            trees[subitem] = snbcTree

        # The text starts with a bookmark marker for the '' sub item.
        output = ['', '{}{}\n\n'.format(CALIBRE_SNB_BM_TAG, "")]
        output += self.dump_text(self.subitems, safe_xml_fromstring(content), stylizer)[0]
        output = self.cleanup_text(''.join(output))

        # Image names are prefixed with the flattened directory of the item.
        img_prefix = ProcessFileName(os.path.dirname(self.item.href))

        subitem = ''
        bodyTree = trees[subitem].find(".//body")
        for line in output.splitlines():
            pos = line.find(CALIBRE_SNB_PRE_TAG)
            if pos != -1:
                # Pre-formatted line: keep everything after the marker
                # verbatim, including leading whitespace.
                etree.SubElement(bodyTree, "text").text = \
                    etree.CDATA(line[pos + len(CALIBRE_SNB_PRE_TAG):])
                continue

            line = line.strip(' \t\n\r\u3000')
            if not line:
                continue

            if line.startswith(CALIBRE_SNB_IMG_TAG):
                img_name = line[len(CALIBRE_SNB_IMG_TAG):]
                if img_prefix != '':
                    img_name = img_prefix + '_' + img_name
                etree.SubElement(bodyTree, "img").text = img_name
            elif line.startswith(CALIBRE_SNB_BM_TAG):
                # Bookmark marker: following lines go to that sub item.
                subitem = line[len(CALIBRE_SNB_BM_TAG):]
                bodyTree = trees[subitem].find(".//body")
            else:
                if self.opts and not self.opts.snb_dont_indent_first_line:
                    prefix = '\u3000\u3000'
                else:
                    prefix = ''
                etree.SubElement(bodyTree, "text").text = \
                    etree.CDATA(prefix + line)
                if self.opts and self.opts.snb_insert_empty_line:
                    etree.SubElement(bodyTree, "text").text = \
                        etree.CDATA('')

        return trees

    def remove_newlines(self, text):
        """Return *text* with every CRLF, LF and CR replaced by a space."""
        self.log.debug('\tRemove newlines for processing...')
        text = text.replace('\r\n', ' ')
        text = text.replace('\n', ' ')
        text = text.replace('\r', ' ')
        return text

    def cleanup_text(self, text):
        """Normalise the flattened text produced by dump_text().

        Replaces a few special characters, collapses runs of tabs,
        vertical tabs and form feeds, joins single line breaks, trims
        excessive blank lines and per-line spaces and, when
        ``opts.snb_max_line_length`` is set, wraps long lines at spaces.

        :param text: the text to clean.
        :return: the cleaned text.
        """
        self.log.debug('\tClean up text...')
        # Replace bad characters.
        # NOTE(review): '\xc2' is also the letter "A with circumflex";
        # stripping it looks like a leftover of byte-string handling.
        text = text.replace('\xc2', '')
        text = text.replace('\xa0', ' ')
        text = text.replace('\xa9', '(C)')

        # Replace runs of tabs, vertical tabs and form feeds with a single
        # space.  These are regular expressions, so re.sub() is required;
        # str.replace() would look for a literal "+" after the character.
        for pattern in ('\t+', '\v+', '\f+'):
            text = re.sub(pattern, ' ', text)

        # Single line paragraph.
        text = re.sub('(?<=.)%s(?=.)' % os.linesep, ' ', text)

        # Remove excessive newlines.
        text = re.sub('\n[ ]+\n', '\n\n', text)
        if self.opts.remove_paragraph_spacing:
            text = re.sub('\n{2,}', '\n', text)
            text = re.sub('(?imu)^(?=.)', '\t', text)
        else:
            text = re.sub('\n{3,}', '\n\n', text)

        # Replace spaces at the beginning and end of lines
        text = re.sub('(?imu)^[ ]+', '', text)
        text = re.sub('(?imu)[ ]+$', '', text)

        if self.opts.snb_max_line_length:
            # Never wrap narrower than 25 characters.
            max_length = max(self.opts.snb_max_line_length, 25)
            short_lines = []
            for line in text.splitlines():
                while len(line) > max_length:
                    # Prefer the last space before the limit, otherwise
                    # the first space after it.
                    space = line.rfind(' ', 0, max_length)
                    if space == -1:
                        space = line.find(' ', max_length)
                    if space == -1:
                        # No space at all: the line cannot be broken.
                        short_lines.append(line)
                        line = ''
                    else:
                        short_lines.append(line[:space])
                        line = line[space + 1:]
                # Add the remainder that is no longer than max_length.
                short_lines.append(line)
            text = '\n'.join(short_lines)

        return text

    def dump_text(self, subitems, elem, stylizer, end='', pre=False, li=''):
        """Recursively flatten *elem* into a list of text pieces.

        :param subitems: iterable of ``(id, title)`` pairs; an element
            whose ``id`` equals one of the ids starts a new bookmark.
        :param elem: the element to dump.
        :param stylizer: object whose ``style(elem)`` gives the computed
            'display' and 'visibility' values.
        :param end: the last characters emitted before this element, used
            to avoid doubling paragraph breaks and spaces.
        :param pre: True when inside a ``<pre>`` element.
        :param li: list-item prefix to put before the next piece of text.
        :return: ``(pieces, li)`` where *pieces* is a list of strings to
            be joined by the caller.
        """
        from calibre.ebooks.oeb.base import XHTML_NS, barename, namespace

        if not isinstance(elem.tag, string_or_bytes) \
                or namespace(elem.tag) != XHTML_NS:
            # Not an XHTML element: only its tail text can matter, and
            # only when the parent is an XHTML element.
            parent = elem.getparent()
            parent_is_xhtml = (
                parent is not None
                and isinstance(parent.tag, string_or_bytes)
                and namespace(parent.tag) == XHTML_NS
            )
            if parent_is_xhtml and elem.tail:
                return [elem.tail], li
            return [], li

        text = ['']
        style = stylizer.style(elem)

        elem_id = elem.attrib.get('id')
        if elem_id is not None and any(elem_id == href for href, _title in subitems):
            if self.curSubItem is not None and self.curSubItem != elem_id:
                self.curSubItem = elem_id
                text.append(f'\n\n{CALIBRE_SNB_BM_TAG}{self.curSubItem}\n\n')

        if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
                or style['visibility'] == 'hidden':
            # Hidden element: drop its content but keep the tail text.
            # NOTE(review): a bookmark marker appended just above is
            # dropped here as well.
            tail = getattr(elem, 'tail', None)
            return ([tail] if tail else []), li

        tag = barename(elem.tag)
        elem_text = getattr(elem, 'text', None)
        in_block = False

        # Are we in a paragraph block?
        if tag in BLOCK_TAGS or style['display'] in BLOCK_STYLES:
            in_block = True
            if not end.endswith('\n\n') and elem_text:
                text.append('\n\n')

        if tag in SPACE_TAGS:
            # Separate cells by one space unless one was just emitted.
            if not end.endswith(' ') and elem_text:
                text.append(' ')

        if tag == 'img':
            text.append('\n\n{}{}\n\n'.format(CALIBRE_SNB_IMG_TAG, ProcessFileName(elem.attrib['src'])))

        if tag == 'br':
            text.append('\n\n')

        if tag == 'li':
            li = '- '

        pre = (tag == 'pre' or pre)
        # Every line after the first of a pre-formatted piece starts a new
        # paragraph and carries the PRE marker.
        pre_joiner = '\n\n%s' % CALIBRE_SNB_PRE_TAG

        # Process tags that contain text.
        if elem_text:
            if pre:
                text.append(pre_joiner.join((li + elem_text).splitlines()))
            else:
                text.append(li + elem_text)
            li = ''

        for item in elem:
            en = ''
            if len(text) >= 2:
                en = text[-1][-2:]
            # Only the pieces are used; the child's li value is ignored.
            text += self.dump_text(subitems, item, stylizer, en, pre, li)[0]

        if in_block:
            text.append('\n\n')

        tail = getattr(elem, 'tail', None)
        if tail:
            if pre:
                text.append(pre_joiner.join(tail.splitlines()))
            else:
                text.append(li + tail)
            li = ''

        return text, li