%PDF- %PDF-
Direktori : /lib/calibre/calibre/ebooks/rtf/ |
Current File : //lib/calibre/calibre/ebooks/rtf/rtfml.py |
__license__ = 'GPL 3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'

'''
Transform OEB content into RTF markup
'''

import os
import re
import io
from binascii import hexlify

from lxml import etree

from calibre.ebooks.metadata import authors_to_string
from calibre.utils.img import save_cover_data_to
from calibre.utils.imghdr import identify
from polyglot.builtins import string_or_bytes

# XHTML tag name -> RTF control word(s) opened for the duration of the element.
TAGS = {
    'b': '\\b',
    'del': '\\deleted',
    'h1': '\\s1 \\afs32',
    'h2': '\\s2 \\afs28',
    'h3': '\\s3 \\afs28',
    'h4': '\\s4 \\afs23',
    'h5': '\\s5 \\afs23',
    'h6': '\\s6 \\afs21',
    'i': '\\i',
    'li': '\t',
    'p': '\t',
    'sub': '\\sub',
    'sup': '\\super',
    'u': '\\ul',
}

# XHTML tag name -> RTF emitted once for tags that have no content.
SINGLE_TAGS = {
    'br': '\n{\\line }\n',
}

# (CSS property, {CSS value: RTF control word}) pairs checked for every
# element, in this order.
STYLES = [
    ('font-weight', {'bold': '\\b', 'bolder': '\\b'}),
    ('font-style', {'italic': '\\i'}),
    ('text-align', {'center': '\\qc', 'left': '\\ql', 'right': '\\qr'}),
    ('text-decoration', {'line-through': '\\strike', 'underline': '\\ul'}),
]

# Tags that always start a paragraph block.
BLOCK_TAGS = [
    'div',
    'p',
    'h1',
    'h2',
    'h3',
    'h4',
    'h5',
    'h6',
    'li',
]

# CSS ``display`` values that start a paragraph block.
BLOCK_STYLES = [
    'block'
]

'''
TODO:
    * Tables
    * Fonts
'''

# Characters with structural meaning in RTF, mapped to their hex escapes.
_RTF_SPECIAL_CHARS = {
    '\\': r"\'5c",
    '{': r"\'7b",
    '}': r"\'7d",
}


def txt2rtf(text):
    '''
    Escape ``text`` so it can be embedded in RTF markup.

    Backslashes and braces become hex escapes, U+00A0 becomes ``\\~``, other
    characters up to code point 127 are copied through and everything above
    that is written as ``\\uN?``.

    :param text: The text to escape.
    :return: The escaped string. A value that is not a ``str`` is returned
             unchanged.
    '''
    # Check the type before touching the value: str.replace()/iteration on
    # bytes would otherwise fail before this guard is ever reached.
    if not isinstance(text, str):
        return text

    buf = io.StringIO()
    # Every character is escaped exactly once in a single pass. Escaping the
    # braces first and the backslash afterwards would escape the backslash
    # that the brace escapes themselves introduce.
    for x in text:
        escaped = _RTF_SPECIAL_CHARS.get(x)
        if escaped is not None:
            buf.write(escaped)
            continue
        val = ord(x)
        if val == 160:
            buf.write(r'\~')
        elif val <= 127:
            buf.write(x)
        else:
            # python2 and ur'\u' does not work
            # NOTE(review): code points above 32767 are written as plain
            # positive decimals -- confirm the target readers accept that.
            buf.write(f'\\u{val:d}?')
    return buf.getvalue()


class RTFMLizer:
    '''
    Convert the content of an OEB book into one string of RTF markup.
    '''

    def __init__(self, log):
        '''
        :param log: Logger used for info, debug and exception messages.
        '''
        self.log = log

    def extract_content(self, oeb_book, opts):
        '''
        Store the book and options on the instance and convert the book.

        :param oeb_book: The OEB book to convert.
        :param opts: Conversion options; ``opts.output_profile`` is passed to
                     the stylizer.
        :return: The RTF markup as a string.
        '''
        self.log.info('Converting XHTML to RTF markup...')
        self.oeb_book = oeb_book
        self.opts = opts
        return self.mlize_spine()

    def mlize_spine(self):
        '''
        Build the complete RTF document.

        The output is the header, the title page (only when the guide lists
        one that is not part of the spine), every spine item followed by a
        page break, and the footer. Image placeholders are then replaced and
        the text is cleaned up.

        :return: The RTF markup as a string.
        '''
        from calibre.ebooks.oeb.base import XHTML
        from calibre.ebooks.oeb.stylizer import Stylizer
        from calibre.utils.xml_parse import safe_xml_fromstring

        parts = [self.header()]

        if 'titlepage' in self.oeb_book.guide:
            href = self.oeb_book.guide['titlepage'].href
            item = self.oeb_book.manifest.hrefs[href]
            # A title page that is in the spine is handled by the loop below.
            if item.spine_position is None:
                stylizer = Stylizer(item.data, item.href, self.oeb_book,
                        self.opts, self.opts.output_profile)
                self.currently_dumping_item = item
                parts.append(self.dump_text(item.data.find(XHTML('body')), stylizer))
                parts.append(r'{\page }')

        for item in self.oeb_book.spine:
            self.log.debug('Converting %s to RTF markup...' % item.href)
            # Removing comments is needed as comments with -- inside them can
            # cause fromstring() to fail
            content = re.sub('<!--.*?-->', '', etree.tostring(item.data, encoding='unicode'), flags=re.DOTALL)
            content = self.remove_newlines(content)
            content = self.remove_tabs(content)
            content = safe_xml_fromstring(content)
            stylizer = Stylizer(content, item.href, self.oeb_book, self.opts, self.opts.output_profile)
            # dump_text() resolves image hrefs relative to this item.
            self.currently_dumping_item = item
            parts.append(self.dump_text(content.find(XHTML('body')), stylizer))
            parts.append(r'{\page }')

        parts.append(self.footer())
        output = ''.join(parts)
        output = self.insert_images(output)
        output = self.clean_text(output)

        return output

    def remove_newlines(self, text):
        '''
        Replace every ``\\r\\n``, ``\\n`` and ``\\r`` in ``text`` with a space.
        '''
        self.log.debug('\tRemove newlines for processing...')
        # The two-character sequence goes first so it yields a single space.
        text = text.replace('\r\n', ' ')
        text = text.replace('\n', ' ')
        text = text.replace('\r', ' ')
        return text

    def remove_tabs(self, text):
        '''
        Replace every tab character in ``text`` with a space.
        '''
        self.log.debug('Replace tabs with space for processing...')
        text = text.replace('\t', ' ')
        return text

    def header(self):
        '''
        Return the RTF prologue: the document info group followed by the
        font table and the stylesheet.

        The title is the first title entry of the book metadata and the
        author is the joined list of creators.
        '''
        # Metadata is free text: escape it so that braces, backslashes and
        # non-ASCII characters cannot break the \info group.
        title = txt2rtf(self.oeb_book.metadata.title[0].value)
        authors = txt2rtf(authors_to_string([x.value for x in self.oeb_book.metadata.creator]))
        header = '{{\\rtf1{{\\info{{\\title {}}}{{\\author {}}}}}\\ansi\\ansicpg1252\\deff0\\deflang1033\n'.format(
            title, authors)
        return header + (
            '{\\fonttbl{\\f0\\froman\\fprq2\\fcharset128 Times New Roman;}{\\f1\\froman\\fprq2\\fcharset128 Times New Roman;}{\\f2\\fswiss\\fprq2\\fcharset128 Arial;}{\\f3\\fnil\\fprq2\\fcharset128 Arial;}{\\f4\\fnil\\fprq2\\fcharset128 MS Mincho;}{\\f5\\fnil\\fprq2\\fcharset128 Tahoma;}{\\f6\\fnil\\fprq0\\fcharset128 Tahoma;}}\n'  # noqa
            '{\\stylesheet{\\ql \\li0\\ri0\\nowidctlpar\\wrapdefault\\faauto\\rin0\\lin0\\itap0 \\rtlch\\fcs1 \\af25\\afs24\\alang1033 \\ltrch\\fcs0 \\fs24\\lang1033\\langfe255\\cgrid\\langnp1033\\langfenp255 \\snext0 Normal;}\n'  # noqa
            '{\\s1\\ql \\li0\\ri0\\sb240\\sa120\\keepn\\nowidctlpar\\wrapdefault\\faauto\\outlinelevel0\\rin0\\lin0\\itap0 \\rtlch\\fcs1 \\ab\\af0\\afs32\\alang1033 \\ltrch\\fcs0 \\b\\fs32\\lang1033\\langfe255\\loch\\f1\\hich\\af1\\dbch\\af26\\cgrid\\langnp1033\\langfenp255 \\sbasedon15 \\snext16 \\slink21 heading 1;}\n'  # noqa
            '{\\s2\\ql \\li0\\ri0\\sb240\\sa120\\keepn\\nowidctlpar\\wrapdefault\\faauto\\outlinelevel1\\rin0\\lin0\\itap0 \\rtlch\\fcs1 \\ab\\ai\\af0\\afs28\\alang1033 \\ltrch\\fcs0 \\b\\i\\fs28\\lang1033\\langfe255\\loch\\f1\\hich\\af1\\dbch\\af26\\cgrid\\langnp1033\\langfenp255 \\sbasedon15 \\snext16 \\slink22 heading 2;}\n'  # noqa
            '{\\s3\\ql \\li0\\ri0\\sb240\\sa120\\keepn\\nowidctlpar\\wrapdefault\\faauto\\outlinelevel2\\rin0\\lin0\\itap0 \\rtlch\\fcs1 \\ab\\af0\\afs28\\alang1033 \\ltrch\\fcs0 \\b\\fs28\\lang1033\\langfe255\\loch\\f1\\hich\\af1\\dbch\\af26\\cgrid\\langnp1033\\langfenp255 \\sbasedon15 \\snext16 \\slink23 heading 3;}\n'  # noqa
            '{\\s4\\ql \\li0\\ri0\\sb240\\sa120\\keepn\\nowidctlpar\\wrapdefault\\faauto\\outlinelevel3\\rin0\\lin0\\itap0 \\rtlch\\fcs1 \\ab\\ai\\af0\\afs23\\alang1033 \\ltrch\\fcs0\\b\\i\\fs23\\lang1033\\langfe255\\loch\\f1\\hich\\af1\\dbch\\af26\\cgrid\\langnp1033\\langfenp255 \\sbasedon15 \\snext16 \\slink24 heading 4;}\n'  # noqa
            '{\\s5\\ql \\li0\\ri0\\sb240\\sa120\\keepn\\nowidctlpar\\wrapdefault\\faauto\\outlinelevel4\\rin0\\lin0\\itap0 \\rtlch\\fcs1 \\ab\\af0\\afs23\\alang1033 \\ltrch\\fcs0 \\b\\fs23\\lang1033\\langfe255\\loch\\f1\\hich\\af1\\dbch\\af26\\cgrid\\langnp1033\\langfenp255 \\sbasedon15 \\snext16 \\slink25 heading 5;}\n'  # noqa
            '{\\s6\\ql \\li0\\ri0\\sb240\\sa120\\keepn\\nowidctlpar\\wrapdefault\\faauto\\outlinelevel5\\rin0\\lin0\\itap0 \\rtlch\\fcs1 \\ab\\af0\\afs21\\alang1033 \\ltrch\\fcs0 \\b\\fs21\\lang1033\\langfe255\\loch\\f1\\hich\\af1\\dbch\\af26\\cgrid\\langnp1033\\langfenp255 \\sbasedon15 \\snext16 \\slink26 heading 6;}}\n'  # noqa
        )

    def footer(self):
        '''
        Return the text that closes the group opened by :meth:`header`.
        '''
        return ' }'

    def insert_images(self, text):
        '''
        Replace the image placeholders written by :meth:`dump_text` with
        hex-encoded picture groups.

        Every raster image in the manifest is processed. An image that cannot
        be encoded is logged and its placeholder is replaced by blank lines.

        :param text: RTF markup containing ``SPECIAL_IMAGE-<href>-REPLACE_ME``
                     placeholders.
        :return: The markup with the placeholders replaced.
        '''
        from calibre.ebooks.oeb.base import OEB_RASTER_IMAGES

        for item in self.oeb_book.manifest:
            if item.media_type in OEB_RASTER_IMAGES:
                src = item.href
                try:
                    data, width, height = self.image_to_hexstring(item.data)
                except Exception:
                    # Best effort: one bad image must not abort the conversion.
                    self.log.exception('Image %s is corrupted, ignoring'%item.href)
                    repl = '\n\n'
                else:
                    repl = '\n\n{\\*\\shppict{\\pict\\jpegblip\\picw%i\\pich%i \n%s\n}}\n\n' % (width, height, data)
                text = text.replace('SPECIAL_IMAGE-%s-REPLACE_ME' % src, repl)
        return text

    def image_to_hexstring(self, data):
        '''
        Hex-encode image data for embedding in RTF.

        :param data: The raw image data.
        :return: A ``(hex_string, width, height)`` tuple. ``hex_string`` holds
                 the hex digits split into lines of 128 characters.
        '''
        # Images must be hex-encoded in 128 character lines
        # NOTE(review): presumably this re-encodes the image as JPEG, since
        # insert_images() labels the result \jpegblip -- confirm.
        data = save_cover_data_to(data)
        width, height = identify(data)[1:]

        # 64 bytes of input give 128 hex digits; the memoryview avoids
        # copying each slice.
        v = memoryview(data)
        lines = [hexlify(v[i:i+64]) for i in range(0, len(data), 64)]
        hex_string = b'\n'.join(lines).decode('ascii')
        return hex_string, width, height

    def clean_text(self, text):
        '''
        Collapse redundant whitespace and line breaks in the generated markup.

        :param text: The RTF markup to tidy.
        :return: The tidied markup.
        '''
        # Remove excessive newlines
        # The separator is grouped and escaped so that the {3,} quantifier
        # applies to the whole separator even when it is two characters long.
        text = re.sub('(?:%s){3,}' % re.escape(os.linesep), f'{os.linesep}{os.linesep}', text)

        # Remove excessive spaces
        text = re.sub('[ ]{2,}', ' ', text)
        text = re.sub('\t{2,}', '\t', text)
        text = re.sub('\t ', '\t', text)

        # Remove excessive line breaks
        text = re.sub(r'(\{\\line \}\s*){3,}', r'{\\line }{\\line }', text)

        # Remove non-breaking spaces
        text = text.replace('\xa0', ' ')
        text = text.replace('\n\r', '\n')

        return text

    def dump_text(self, elem, stylizer, tag_stack=None):
        '''
        Convert ``elem`` and everything below it to RTF markup.

        :param elem: The element to convert.
        :param stylizer: Stylizer used to look up the computed style of each
                         element.
        :param tag_stack: The RTF groups currently open, shared with the
                          recursive calls. Callers normally omit it.
        :return: The RTF markup for the element, its children and its tail.
        '''
        from calibre.ebooks.oeb.base import (XHTML_NS, namespace, barename,
                urlnormalize)

        # A fresh list per top-level call: a shared default list would keep
        # stale entries if an earlier call had been interrupted by an
        # exception.
        if tag_stack is None:
            tag_stack = []

        # Elements that are not XHTML are skipped; only the text that
        # follows them inside an XHTML parent is kept.
        if not isinstance(elem.tag, string_or_bytes) \
           or namespace(elem.tag) != XHTML_NS:
            p = elem.getparent()
            if p is not None and isinstance(p.tag, string_or_bytes) and namespace(p.tag) == XHTML_NS \
                    and elem.tail:
                # The tail is document text, so it is escaped like any other.
                return txt2rtf(elem.tail)
            return ''

        text = ''
        style = stylizer.style(elem)

        # Hidden elements produce no output of their own, but the text that
        # follows them still belongs to the document.
        if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
           or style['visibility'] == 'hidden':
            if hasattr(elem, 'tail') and elem.tail:
                return txt2rtf(elem.tail)
            return ''

        tag = barename(elem.tag)
        # Number of entries this element pushes on tag_stack; the same number
        # is popped again once its children have been processed.
        tag_count = 0

        # Are we in a paragraph block?
        if tag in BLOCK_TAGS or style['display'] in BLOCK_STYLES:
            if 'block' not in tag_stack:
                tag_count += 1
                tag_stack.append('block')

        # Process tags that need special processing and that do not have inner
        # text. Usually these require an argument
        if tag == 'img':
            src = elem.get('src')
            if src:
                src = urlnormalize(self.currently_dumping_item.abshref(src))
                block_start = ''
                block_end = ''
                # An image outside of any block gets a paragraph of its own.
                if 'block' not in tag_stack:
                    block_start = r'{\par\pard\hyphpar '
                    block_end = '}'
                # Placeholder that insert_images() replaces with the picture.
                text += f'{block_start} SPECIAL_IMAGE-{src}-REPLACE_ME {block_end}'

        single_tag = SINGLE_TAGS.get(tag, None)
        if single_tag:
            text += single_tag

        rtf_tag = TAGS.get(tag, None)
        # A control word that is already active is not opened a second time.
        if rtf_tag and rtf_tag not in tag_stack:
            tag_count += 1
            text += '{%s\n' % rtf_tag
            tag_stack.append(rtf_tag)

        # Processes style information
        for s in STYLES:
            style_tag = s[1].get(style[s[0]], None)
            if style_tag and style_tag not in tag_stack:
                tag_count += 1
                text += '{%s\n' % style_tag
                tag_stack.append(style_tag)

        # Process tags that contain text.
        if hasattr(elem, 'text') and elem.text:
            text += txt2rtf(elem.text)

        for item in elem:
            text += self.dump_text(item, stylizer, tag_stack)

        # Close everything this element opened. The 'block' marker is only
        # bookkeeping and has no group to close.
        for _ in range(tag_count):
            end_tag = tag_stack.pop()
            if end_tag != 'block':
                if tag in BLOCK_TAGS:
                    text += r'\par\pard\plain\hyphpar}'
                else:
                    text += '}'

        if hasattr(elem, 'tail') and elem.tail:
            if 'block' in tag_stack:
                text += '%s' % txt2rtf(elem.tail)
            else:
                # Text outside of any block becomes a paragraph of its own.
                text += r'{\par\pard\hyphpar %s}' % txt2rtf(elem.tail)

        return text