%PDF- %PDF-
Direktori : /lib/calibre/calibre/library/ |
Current File : //lib/calibre/calibre/library/comments.py |
#!/usr/bin/env python3
# License: GPLv3 Copyright: 2010, Kovid Goyal <kovid at kovidgoyal.net>

import re

from calibre import prepare_string_for_xml
from calibre.constants import preferred_encoding
from calibre.ebooks.BeautifulSoup import (
    BeautifulSoup, CData, Comment, Declaration, NavigableString,
    ProcessingInstruction
)
from calibre.utils.html2text import html2text

# Hackish - ignoring sentences ending or beginning in numbers to avoid
# confusion with decimal points.
lost_cr_pat = re.compile('([a-z])([\\.\\?!])([A-Z])')
lost_cr_exception_pat = re.compile(r'(Ph\.D)|(D\.Phil)|((Dr|Mr|Mrs|Ms)\.[A-Z])')
sanitize_pat = re.compile(r'<script|<table|<tr|<td|<th|<style|<iframe', re.IGNORECASE)


def comments_to_html(comments):
    r'''
    Convert free-form comment text into a normalized block of HTML paragraphs.

    The input is handled by the first rule that applies:

    * Empty/false input returns ``<p></p>``.
    * ``bytes`` input is decoded with the preferred encoding (errors replaced)
      and then processed as text.
    * Text whose first non-whitespace character is ``<`` is assumed to be HTML
      already and is returned untouched.
    * Text with no ``<`` at all is XML-escaped; blank-line separated chunks
      become ``<p class="description">`` paragraphs joined by ``\n`` and the
      remaining single newlines become ``<br />``.
    * Text matching ``sanitize_pat`` (script, table, style, iframe, ... tags)
      is passed through ``sanitize_comments_html()``; if that raises, the
      traceback is printed and ``<p></p>`` is returned.
    * Anything else (plain text with some inline markup) is repaired and
      re-flowed: a sentence end glued to the next sentence
      (``...paragraph.Somehow...``) is split into two paragraphs, ``\n\n``
      starts a new paragraph, a single ``\n`` becomes ``<br />`` and ``--``
      becomes an em dash. Top-level text and inline tags are wrapped in
      ``<p class="description">``; other tags are kept as top-level siblings.

    :param comments: ``str`` or ``bytes`` comment text, may be empty or None
    :return: HTML markup as ``str``
    '''
    if not comments:
        return '<p></p>'
    if not isinstance(comments, str):
        comments = comments.decode(preferred_encoding, 'replace')

    if comments.lstrip().startswith('<'):
        # Comment is already HTML do not mess with it
        return comments

    if '<' not in comments:
        comments = prepare_string_for_xml(comments)
        parts = ['<p class="description">%s</p>' % x.replace('\n', '<br />')
                 for x in comments.split('\n\n')]
        return '\n'.join(parts)

    if sanitize_pat.search(comments) is not None:
        try:
            return sanitize_comments_html(comments)
        except Exception:
            # Best effort: report the failure and fall back to an empty
            # paragraph, but let KeyboardInterrupt/SystemExit propagate.
            import traceback
            traceback.print_exc()
            return '<p></p>'

    # Shield abbreviations such as "Dr.X" or "Ph.D" from the lost-CR repair
    # below by temporarily putting a \r after each of their dots.
    comments = lost_cr_exception_pat.sub(
        lambda m: m.group().replace('.', '.\r'), comments)
    # Explode lost CRs to \n\n. Matches cannot overlap (each one starts with a
    # lowercase letter and ends with an uppercase one), so a single
    # substitution pass handles every occurrence.
    comments = lost_cr_pat.sub(r'\1\2\n\n\3', comments)
    # Drop the shielding markers (and any other carriage returns)
    comments = comments.replace('\r', '')

    # Convert \n\n to <p>s
    comments = comments.replace('\n\n', '<p>')
    # Convert solo returns to <br />
    comments = comments.replace('\n', '<br />')
    # Convert two hyphens to emdash
    comments = comments.replace('--', '—')

    soup = BeautifulSoup('<div>' + comments + '</div>').find('div')
    result = BeautifulSoup('<div>')
    container = result.find('div')

    inline_tags = ('br', 'b', 'i', 'em', 'strong', 'span', 'font', 'a', 'hr')
    current_p = None  # paragraph currently collecting inline content, if any

    # Iterate over a copy: moving a token into the result tree removes it
    # from soup.contents.
    for token in list(soup.contents):
        # Checked before the NavigableString test so that these node types
        # are dropped rather than treated as text.
        if isinstance(token, (CData, Comment, Declaration, ProcessingInstruction)):
            continue
        if isinstance(token, NavigableString) or token.name in inline_tags:
            if current_p is None:
                current_p = result.new_tag('p')
            current_p.append(token)
        else:
            # A non-inline tag closes the open paragraph and is emitted as a
            # sibling of the paragraphs.
            if current_p is not None:
                container.append(current_p)
                current_p = None
            container.append(token)

    if current_p is not None:
        container.append(current_p)

    for p in container.findAll('p'):
        p['class'] = 'description'

    return container.decode_contents()


def markdown(val):
    '''
    Convert the Markdown text ``val`` to HTML.

    The converter is created on first use and cached as an attribute of this
    function, so the calibre markdown module is imported only when needed.
    '''
    try:
        md = markdown.Markdown
    except AttributeError:
        from calibre.ebooks.markdown import Markdown
        md = markdown.Markdown = Markdown()
    return md.convert(val)


def merge_comments(one, two):
    '''
    Return the HTML forms of ``one`` and ``two`` (see comments_to_html())
    joined by a blank line.
    '''
    return comments_to_html(one) + '\n\n' + comments_to_html(two)


def sanitize_comments_html(html):
    '''
    Rebuild ``html`` by converting it to text with html2text() and rendering
    that text back to HTML with a fresh Markdown converter.
    '''
    from calibre.ebooks.markdown import Markdown
    text = html2text(html, single_line_break=False)
    md = Markdown()
    html = md.convert(text)
    return html


def find_tests():
    '''Return a unittest suite exercising comments_to_html().'''
    import unittest

    class Test(unittest.TestCase):

        def test_comments_to_html(self):
            for pat, val in [
                    # bytes input without markup: decoded, split on blank lines
                    (b'lineone\n\nlinetwo',
                        '<p class="description">lineone</p>\n<p class="description">linetwo</p>'),

                    # a bare ampersand is escaped when the tree is serialized
                    ('a <b>b&c</b>\nf',
                        '<p class="description">a <b>b&amp;c</b><br/>f</p>'),

                    # the processing instruction is dropped; the spaces on
                    # both sides of it are kept, hence the double space
                    ('a <?xml asd> b\n\ncd',
                        '<p class="description">a  b</p><p class="description">cd</p>'),
            ]:
                cval = comments_to_html(pat)
                self.assertEqual(cval, val)

    return unittest.defaultTestLoader.loadTestsFromTestCase(Test)