%PDF- %PDF-
Direktori : /usr/lib/calibre/calibre/ebooks/txt/ |
Current File : //usr/lib/calibre/calibre/ebooks/txt/txtml.py |
__license__ = 'GPL 3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'

'''
Transform OEB content into plain text
'''

import re

from lxml import etree

from polyglot.builtins import string_or_bytes

# Tags that start a new paragraph-like block in the text output.
BLOCK_TAGS = [
    'div',
    'p',
    'h1',
    'h2',
    'h3',
    'h4',
    'h5',
    'h6',
    'li',
    'tr',
]

# CSS ``display`` values that are treated like a block tag.
BLOCK_STYLES = [
    'block',
]

# Tags that are always treated as headings.
HEADING_TAGS = [
    'h1',
    'h2',
    'h3',
    'h4',
    'h5',
    'h6',
]

# Tags that contribute a single separating space.
SPACE_TAGS = [
    'td',
    'br',
]


class TXTMLizer:
    '''
    Convert the spine of an OEB book into one plain text string.

    Call extract_content(); the remaining methods are the steps it uses.
    '''

    def __init__(self, log):
        '''
        @log: Logger used for the info/debug progress messages.
        '''
        self.log = log

    def extract_content(self, oeb_book, opts):
        '''
        Convert oeb_book to text and return the resulting string.

        @oeb_book: The book to convert; its ``toc`` and ``spine`` are read.
        @opts: Conversion options. The attributes read in this class are
               inline_toc, remove_paragraph_spacing, max_line_length,
               force_max_line_length and output_profile.
        '''
        self.log.info('Converting XHTML to TXT...')
        self.oeb_book = oeb_book
        self.opts = opts
        self.toc_titles = []
        self.toc_ids = []
        self.last_was_heading = False

        self.create_flat_toc(self.oeb_book.toc)

        return self.mlize_spine()

    def mlize_spine(self):
        '''
        Dump every spine item to text, join the pieces and clean them up.

        Returns the final text for the whole book.
        '''
        from calibre.ebooks.oeb.base import XHTML
        from calibre.ebooks.oeb.stylizer import Stylizer
        from calibre.utils.xml_parse import safe_xml_fromstring
        output = ['']
        output.append(self.get_toc())
        for item in self.oeb_book.spine:
            self.log.debug('Converting %s to TXT...' % item.href)
            # The tree is serialized and parsed again below; '--' inside a
            # comment is rewritten first (presumably because it is not
            # allowed inside an XML comment and would break the re-parse).
            for x in item.data.iterdescendants(etree.Comment):
                if x.text and '--' in x.text:
                    x.text = x.text.replace('--', '__')
            content = etree.tostring(item.data, encoding='unicode')
            content = self.remove_newlines(content)
            content = safe_xml_fromstring(content)
            stylizer = Stylizer(content, item.href, self.oeb_book, self.opts, self.opts.output_profile)
            output += self.dump_text(content.find(XHTML('body')), stylizer, item)
            # Separate spine items with six newlines.
            output.append('\n\n\n\n\n\n')
        output = ''.join(output)
        # Strip trailing whitespace from every line.
        output = '\n'.join(line.rstrip() for line in output.splitlines())
        output = self.cleanup_text(output)

        return output

    def remove_newlines(self, text):
        '''
        Return text with every line break replaced by a single space.
        '''
        self.log.debug('\tRemove newlines for processing...')
        text = text.replace('\r\n', ' ')
        text = text.replace('\n', ' ')
        text = text.replace('\r', ' ')
        # Condense redundant spaces created by replacing newlines with spaces.
        text = re.sub(r'[ ]{2,}', ' ', text)

        return text

    def get_toc(self):
        '''
        Return the inline table of contents as text.

        The result is an empty string unless opts.inline_toc is set.
        '''
        toc = ['']
        if getattr(self.opts, 'inline_toc', None):
            self.log.debug('Generating table of contents...')
            # NOTE(review): ``_`` is not defined in this file; presumably the
            # translation function installed as a builtin by the application.
            toc.append('%s\n\n' % _('Table of Contents:'))
            for item in self.toc_titles:
                toc.append('* %s\n\n' % item)
        return ''.join(toc)

    def create_flat_toc(self, nodes):
        '''
        Turns a hierarchical list of TOC href's into a flat list.

        Titles are appended to self.toc_titles and hrefs to self.toc_ids,
        parents before their children.
        '''
        for item in nodes:
            self.toc_titles.append(item.title)
            self.toc_ids.append(item.href)
            self.create_flat_toc(item.nodes)

    def cleanup_text(self, text):
        '''
        Normalize whitespace in text and optionally wrap long lines.

        @text: The raw text produced by dump_text for the whole book.

        Returns the cleaned text.
        '''
        self.log.debug('\tClean up text...')
        # Replace bad characters.
        text = text.replace('\xa0', ' ')

        # Replace runs of vertical tabs and form feeds with a single space.
        # Tabs are deliberately left alone: dump_text emits a leading tab as
        # the paragraph indent when remove_paragraph_spacing is enabled and
        # the regular expressions below key off it. The previous literal
        # str.replace('\t+', ' ') calls never collapsed runs and instead
        # removed a real '+' character that followed one of these characters.
        text = re.sub('[\v\f]+', ' ', text)

        # Single line paragraph.
        text = re.sub('(?<=.)\n(?=.)', ' ', text)

        # Remove multiple spaces.
        text = re.sub('[ ]{2,}', ' ', text)

        # Remove excessive newlines.
        text = re.sub('\n[ ]+\n', '\n\n', text)
        if self.opts.remove_paragraph_spacing:
            text = re.sub('\n{2,}', '\n', text)
            # Lines without a tab (i.e. not indented paragraphs) get a blank
            # line after them ...
            text = re.sub(r'(?msu)^(?P<t>[^\t\n]+?)$', lambda mo: '%s\n\n' % mo.group('t'), text)
            # ... and six newlines before them when they follow other text.
            text = re.sub(r'(?msu)(?P<b>[^\n])\n+(?P<t>[^\t\n]+?)(?=\n)', lambda mo: '{}\n\n\n\n\n\n{}'.format(mo.group('b'), mo.group('t')), text)
        else:
            text = re.sub('\n{7,}', '\n\n\n\n\n\n', text)

        # Replace spaces at the beginning and end of lines
        # We don't replace tabs because those are only added
        # when remove paragraph spacing is enabled.
        text = re.sub('(?imu)^[ ]+', '', text)
        text = re.sub('(?imu)[ ]+$', '', text)

        # Remove empty space and newlines at the beginning of the document.
        text = re.sub(r'(?u)^[ \n]+', '', text)

        if self.opts.max_line_length:
            max_length = int(self.opts.max_line_length)
            if max_length < 25 and not self.opts.force_max_line_length:
                max_length = 25
            # With a forced length of zero or less the loop below would never
            # shorten the line and so never terminate.
            max_length = max(max_length, 1)
            short_lines = []
            lines = text.splitlines()
            for line in lines:
                while len(line) > max_length:
                    space = line.rfind(' ', 0, max_length)
                    if space != -1:
                        # Space was found.
                        short_lines.append(line[:space])
                        line = line[space + 1:]
                    else:
                        # Space was not found.
                        if self.opts.force_max_line_length:
                            # Force breaking at max_length.
                            short_lines.append(line[:max_length])
                            line = line[max_length:]
                        else:
                            # Look for the first space after max_length.
                            space = line.find(' ', max_length, len(line))
                            if space != -1:
                                # Space was found.
                                short_lines.append(line[:space])
                                line = line[space + 1:]
                            else:
                                # No space was found cannot break line.
                                short_lines.append(line)
                                line = ''
                # Add the text that was less than max_length to the list
                short_lines.append(line)
            text = '\n'.join(short_lines)

        return text

    def dump_text(self, elem, stylizer, page):
        '''
        Recursively convert elem and its children to a list of text pieces.

        @elem: The element in the etree that we are working on.
        @stylizer: The style information attached to the element.
        @page: OEB page used to determine absolute urls.

        Returns a list of strings; the caller joins them.
        '''
        from calibre.ebooks.oeb.base import XHTML_NS, barename, namespace

        # Non-XHTML nodes (including comments, whose tag is not a string)
        # contribute only their tail text, and only inside an XHTML parent.
        if not isinstance(elem.tag, string_or_bytes) \
           or namespace(elem.tag) != XHTML_NS:
            p = elem.getparent()
            if p is not None and isinstance(p.tag, string_or_bytes) and namespace(p.tag) == XHTML_NS \
                    and elem.tail:
                return [elem.tail]
            return ['']

        text = ['']
        style = stylizer.style(elem)

        # Hidden elements are skipped, but their tail text is kept.
        if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
           or style['visibility'] == 'hidden':
            if hasattr(elem, 'tail') and elem.tail:
                return [elem.tail]
            return ['']

        tag = barename(elem.tag)
        tag_id = elem.attrib.get('id', None)
        in_block = False
        in_heading = False

        # Are we in a heading?
        # This can either be a heading tag or a TOC item.
        if tag in HEADING_TAGS or f'{page.href}#{tag_id}' in self.toc_ids:
            in_heading = True
            if not self.last_was_heading:
                text.append('\n\n\n\n\n\n')

        # Are we in a paragraph block?
        if tag in BLOCK_TAGS or style['display'] in BLOCK_STYLES:
            if self.opts.remove_paragraph_spacing and not in_heading:
                # The tab marks an indented paragraph for cleanup_text.
                text.append('\t')
            in_block = True

        if tag in SPACE_TAGS:
            text.append(' ')

        # Hard scene breaks.
        if tag == 'hr':
            text.append('\n\n* * *\n\n')
        # Soft scene breaks.
        try:
            ems = int(round((float(style.marginTop) / style.fontSize) - 1))
            if ems >= 1:
                text.append('\n' * ems)
        except Exception:
            # Best effort: a margin that cannot be computed just means no
            # extra spacing. Unlike a bare except this does not swallow
            # KeyboardInterrupt or SystemExit.
            pass

        # Process tags that contain text.
        if hasattr(elem, 'text') and elem.text:
            text.append(elem.text)

        # Recurse down into tags within the tag we are in.
        for item in elem:
            text += self.dump_text(item, stylizer, page)

        if in_block:
            text.append('\n\n')

        # Remember whether this element was a heading so that consecutive
        # headings are not separated by the six leading newlines.
        if in_heading:
            text.append('\n')
            self.last_was_heading = True
        else:
            self.last_was_heading = False

        if hasattr(elem, 'tail') and elem.tail:
            text.append(elem.tail)

        return text