%PDF- %PDF-
| Direktori : /usr/lib/calibre/calibre/ebooks/txt/ |
| Current File : //usr/lib/calibre/calibre/ebooks/txt/markdownml.py |
__license__ = 'GPL 3'
__copyright__ = '''2011, John Schember <john@nachtimwald.com>
2011, Leigh Parry <leighparry@blueyonder.co.uk>'''
__docformat__ = 'restructuredtext en'
'''
Transform OEB content into Textile formatted plain text
'''
import re
from functools import partial
from calibre.ebooks.htmlz.oeb2html import OEB2HTML
from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace, rewrite_links
from calibre.ebooks.oeb.stylizer import Stylizer
from polyglot.builtins import string_or_bytes
class MarkdownMLizer(OEB2HTML):
def extract_content(self, oeb_book, opts):
self.log.info('Converting XHTML to Markdown formatted TXT...')
self.opts = opts
self.in_code = False
self.in_pre = False
self.list = []
self.blockquotes = 0
self.remove_space_after_newline = False
self.base_hrefs = [item.href for item in oeb_book.spine]
self.map_resources(oeb_book)
self.style_bold = False
self.style_italic = False
txt = self.mlize_spine(oeb_book)
# Do some tidying up
txt = self.tidy_up(txt)
return txt
def mlize_spine(self, oeb_book):
output = ['']
for item in oeb_book.spine:
self.log.debug('Converting %s to Markdown formatted TXT...' % item.href)
self.rewrite_ids(item.data, item)
rewrite_links(item.data, partial(self.rewrite_link, page=item))
stylizer = Stylizer(item.data, item.href, oeb_book, self.opts, self.opts.output_profile)
output += self.dump_text(item.data.find(XHTML('body')), stylizer)
output.append('\n\n')
return ''.join(output)
def tidy_up(self, text):
# Remove blank space form beginning of paragraph.
text = re.sub('(?msu)^[ ]{1,3}', '', text)
# pre has 4 spaces. We trimmed 3 so anything with a space left is a pre.
text = re.sub('(?msu)^[ ]', ' ', text)
# Remove tabs that aren't at the beginning of a line
new_text = []
for l in text.splitlines():
start = re.match('\t+', l)
if start:
start = start.group()
else:
start = ''
l = re.sub('\t', '', l)
new_text.append(start + l)
text = '\n'.join(new_text)
# Remove spaces from blank lines.
text = re.sub('(?msu)^[ ]+$', '', text)
# Reduce blank lines
text = re.sub('(?msu)\n{7,}', '\n' * 6, text)
# Remove blank lines at beginning and end of document.
text = re.sub(r'^\s*', '', text)
text = re.sub(r'\s*$', '\n\n', text)
return text
def remove_newlines(self, text):
text = text.replace('\r\n', ' ')
text = text.replace('\n', ' ')
text = text.replace('\r', ' ')
# Condense redundant spaces created by replacing newlines with spaces.
text = re.sub(r'[ ]{2,}', ' ', text)
text = re.sub(r'\t+', '', text)
if self.remove_space_after_newline == True: # noqa
text = re.sub(r'^ +', '', text)
self.remove_space_after_newline = False
return text
def prepare_string_for_markdown(self, txt):
txt = re.sub(r'([\\`*_{}\[\]()#+!])', r'\\\1', txt)
return txt
def prepare_string_for_pre(self, txt):
new_text = []
for l in txt.splitlines():
new_text.append(' ' + l)
return '\n'.join(new_text)
def dump_text(self, elem, stylizer):
'''
@elem: The element in the etree that we are working on.
@stylizer: The style information attached to the element.
'''
# We can only processes tags. If there isn't a tag return any text.
if not isinstance(elem.tag, string_or_bytes) \
or namespace(elem.tag) != XHTML_NS:
p = elem.getparent()
if p is not None and isinstance(p.tag, string_or_bytes) and namespace(p.tag) == XHTML_NS \
and elem.tail:
return [elem.tail]
return ['']
# Setup our variables.
text = []
style = stylizer.style(elem)
tags = []
tag = barename(elem.tag)
attribs = elem.attrib
# Ignore anything that is set to not be displayed.
if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
or style['visibility'] == 'hidden':
if hasattr(elem, 'tail') and elem.tail:
return [elem.tail]
return ['']
# Soft scene breaks.
if 'margin-top' in style.cssdict() and style['margin-top'] != 'auto':
ems = int(round(float(style.marginTop) / style.fontSize) - 1)
if ems >= 1:
text.append('\n\n' * ems)
bq = '> ' * self.blockquotes
# Block level elements
if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div'):
h_tag = ''
if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
h_tag = '#' * int(tag[1]) + ' '
text.append('\n' + bq + h_tag)
tags.append('\n')
self.remove_space_after_newline = True
if style['font-style'] == 'italic' or tag in ('i', 'em'):
if tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'cite'):
if self.style_italic == False: # noqa
text.append('*')
tags.append('*')
self.style_italic = True
if style['font-weight'] in ('bold', 'bolder') or tag in ('b', 'strong'):
if tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'th'):
if self.style_bold == False: # noqa
text.append('**')
tags.append('**')
self.style_bold = True
if tag == 'br':
text.append(' \n')
self.remove_space_after_newline = True
if tag == 'blockquote':
self.blockquotes += 1
tags.append('>')
text.append('> ' * self.blockquotes)
elif tag == 'code':
if not self.in_pre and not self.in_code:
text.append('`')
tags.append('`')
self.in_code = True
elif tag == 'pre':
if not self.in_pre:
text.append('\n')
tags.append('pre')
self.in_pre = True
elif tag == 'hr':
text.append('\n* * *')
tags.append('\n')
elif tag == 'a':
# Only write links with absolute (external) urls.
if self.opts.keep_links and 'href' in attribs and '://' in attribs['href']:
title = ''
if 'title' in attribs:
title = ' "' + attribs['title'] + '"'
remove_space = self.remove_space_after_newline
title = self.remove_newlines(title)
self.remove_space_after_newline = remove_space
text.append('[')
tags.append('](' + attribs['href'] + title + ')')
elif tag == 'img':
if self.opts.keep_image_references:
txt = '!'
if 'alt' in attribs:
remove_space = self.remove_space_after_newline
txt += '[' + self.remove_newlines(attribs['alt']) + ']'
self.remove_space_after_newline = remove_space
txt += '(' + attribs['src'] + ')'
text.append(txt)
elif tag in ('ol', 'ul'):
tags.append(tag)
# Add the list to our lists of lists so we can track
# nested lists.
self.list.append({'name': tag, 'num': 0})
elif tag == 'li':
# Get the last list from our list of lists
if self.list:
li = self.list[-1]
else:
li = {'name': 'ul', 'num': 0}
# Add a new line to start the item
text.append('\n')
# Add indent if we have nested lists.
list_count = len(self.list)
# We only care about indenting nested lists.
if (list_count - 1) > 0:
text.append('\t' * (list_count - 1))
# Add blockquote if we have a blockquote in a list item.
text.append(bq)
# Write the proper sign for ordered and unorded lists.
if li['name'] == 'ul':
text.append('+ ')
elif li['name'] == 'ol':
li['num'] += 1
text.append(str(li['num']) + '. ')
# Process tags that contain text.
if hasattr(elem, 'text') and elem.text:
txt = elem.text
if self.in_pre:
txt = self.prepare_string_for_pre(txt)
elif self.in_code:
txt = self.remove_newlines(txt)
else:
txt = self.prepare_string_for_markdown(self.remove_newlines(txt))
text.append(txt)
# Recurse down into tags within the tag we are in.
for item in elem:
text += self.dump_text(item, stylizer)
# Close all open tags.
tags.reverse()
for t in tags:
if t in ('pre', 'ul', 'ol', '>'):
if t == 'pre':
self.in_pre = False
text.append('\n')
elif t == '>':
self.blockquotes -= 1
elif t in ('ul', 'ol'):
if self.list:
self.list.pop()
text.append('\n')
else:
if t == '**':
self.style_bold = False
elif t == '*':
self.style_italic = False
elif t == '`':
self.in_code = False
text.append('%s' % t)
# Soft scene breaks.
if 'margin-bottom' in style.cssdict() and style['margin-bottom'] != 'auto':
ems = int(round((float(style.marginBottom) / style.fontSize) - 1))
if ems >= 1:
text.append('\n\n' * ems)
# Add the text that is outside of the tag.
if hasattr(elem, 'tail') and elem.tail:
tail = elem.tail
if self.in_pre:
tail = self.prepare_string_for_pre(tail)
elif self.in_code:
tail = self.remove_newlines(tail)
else:
tail = self.prepare_string_for_markdown(self.remove_newlines(tail))
text.append(tail)
return text