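# NOTE: the next two lines are residue from the web file-viewer this file was scraped from (it also emitted a spoofed "%PDF- %PDF-" header); they are not part of the module.
# Directory: /proc/thread-self/root/usr/lib/calibre/calibre/gui2/tweak_book/editor/syntax/
# Current file: //proc/thread-self/root/usr/lib/calibre/calibre/gui2/tweak_book/editor/syntax/html.py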
#!/usr/bin/env python3


__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'

import re
from collections import namedtuple
from functools import partial

from qt.core import QFont, QTextBlockUserData, QTextCharFormat

from calibre.ebooks.oeb.polish.spell import html_spell_tags, patterns, xml_spell_tags
from calibre.gui2.tweak_book import dictionaries, tprefs, verify_link
from calibre.gui2.tweak_book.editor import (
    CLASS_ATTRIBUTE_PROPERTY, LINK_PROPERTY, SPELL_LOCALE_PROPERTY, SPELL_PROPERTY,
    TAG_NAME_PROPERTY, store_locale, syntax_text_char_format
)
from calibre.gui2.tweak_book.editor.syntax.base import SyntaxHighlighter, run_loop
from calibre.gui2.tweak_book.editor.syntax.css import (
    CSSState, CSSUserData, create_formats as create_css_formats,
    state_map as css_state_map
)
from calibre.spell.break_iterator import split_into_words_and_positions
from calibre.spell.dictionary import parse_lang_code
from calibre_extensions import html_syntax_highlighter as _speedup
from polyglot.builtins import iteritems

# Tags whose content is not parsed as markup: after such a tag is opened the
# parser switches to the CDATA state (or the CSS state for <style>).
cdata_tags = frozenset(['title', 'textarea', 'style', 'script', 'xmp', 'iframe', 'noembed', 'noframes', 'noscript'])
normal_pat = re.compile(r'[^<>&]+')
entity_pat = re.compile(r'&#{0,1}[a-zA-Z0-9]{1,8};')
tag_name_pat = re.compile(r'/{0,1}[a-zA-Z0-9:-]+')
space_chars = ' \t\r\n\u000c'
attribute_name_pat = re.compile(r'''[^%s"'/><=]+''' % space_chars)
self_closing_pat = re.compile(r'/\s*>')
unquoted_val_pat = re.compile(r'''[^%s'"=<>`]+''' % space_chars)
# One case-insensitive "</name" pattern per cdata tag, used to find where the
# raw content of that tag ends.
cdata_close_pats = {x:re.compile(r'</%s' % x, flags=re.I) for x in cdata_tags}
nbsp_pat = re.compile('[\xa0\u2000-\u200A\u202F\u205F\u3000\u2011-\u2015\uFE58\uFE63\uFF0D]+')  # special spaces and hyphens

# Parser states. Each one is a key into state_map (defined below).
NORMAL = 0
IN_OPENING_TAG = 1
IN_CLOSING_TAG = 2
IN_COMMENT = 3
IN_PI = 4
IN_DOCTYPE = 5
ATTRIBUTE_NAME = 6
ATTRIBUTE_VALUE = 7
SQ_VAL = 8
DQ_VAL = 9
CDATA = 10
CSS = 11

# Records stored on the per-block user data by the state handlers below.
TagStart = namedtuple('TagStart', 'offset prefix name closing is_start')
TagEnd = namedtuple('TagEnd', 'offset self_closing is_start')
NonTagBoundary = namedtuple('NonTagBoundary', 'offset is_start type')
Attr = namedtuple('Attr', 'offset type data')

# Attributes whose quoted values are treated as links by quoted_val()
LINK_ATTRS = frozenset(('href', 'src', 'poster', 'xlink:href'))

do_spell_check = False


def refresh_spell_check_status():
    ' Recompute the module level do_spell_check flag from the current preferences '
    global do_spell_check
    do_spell_check = tprefs['inline_spell_check'] and hasattr(dictionaries, 'active_user_dictionaries')


Tag = _speedup.Tag
bold_tags, italic_tags = _speedup.bold_tags, _speedup.italic_tags
State = _speedup.State


def spell_property(sfmt, locale):
    ' Return a copy of the format sfmt with SPELL_LOCALE_PROPERTY set to locale '
    s = QTextCharFormat(sfmt)
    s.setProperty(SPELL_LOCALE_PROPERTY, locale)
    return s


def sanitizing_recognizer():
    '''
    Return a function recognized(word, locale=None) that removes whatever
    sanitize_invisible_pat matches from the word, strips it, and then passes
    it to dictionaries.recognized().
    '''
    sanitize = patterns().sanitize_invisible_pat.sub
    r = dictionaries.recognized

    def recognized(word, locale=None):
        word = sanitize('', word).strip()
        return r(word, locale)

    return recognized


# Hand the Python level helpers to the extension module. spell_property is
# deleted from this namespace afterwards; the extension holds the reference
# passed to init().
_speedup.init(spell_property, sanitizing_recognizer(), split_into_words_and_positions)
del spell_property
check_spelling = _speedup.check_spelling


def finish_opening_tag(state, cdata_tags):
    '''
    Called when the > ending an opening tag is seen. Pushes the tag being
    defined (if any) onto state.tags, updates the bold/italic/language state
    and switches to the CSS or CDATA state if the tag name is in cdata_tags.
    '''
    state.parse = NORMAL
    if state.tag_being_defined is None:
        return
    t, state.tag_being_defined = state.tag_being_defined, None
    state.tags.append(t)
    state.is_bold = state.is_bold or t.bold
    state.is_italic = state.is_italic or t.italic
    state.current_lang = t.lang or state.current_lang
    if t.name in cdata_tags:
        state.parse = CSS if t.name == 'style' else CDATA
        state.sub_parser_state = None


def close_tag(state, name):
    '''
    Pop tags from state.tags up to and including the innermost open tag called
    name, then recompute the bold, italic and language state from the tags
    that remain open. Does nothing if no open tag has that name.
    '''
    removed_tags = []
    for tag in reversed(state.tags):
        removed_tags.append(tag)
        if tag.name == name:
            break
    else:
        return  # No matching open tag found, ignore the closing tag
    # Remove all tags up to the matching open tag
    state.tags = state.tags[:-len(removed_tags)]
    state.sub_parser_state = None
    # Check if we should still be bold or italic
    if state.is_bold:
        state.is_bold = False
        for tag in reversed(state.tags):
            if tag.bold:
                state.is_bold = True
                break
    if state.is_italic:
        state.is_italic = False
        for tag in reversed(state.tags):
            if tag.italic:
                state.is_italic = True
                break
    # Set the current language to the first lang attribute in a still open tag
    state.current_lang = None
    for tag in reversed(state.tags):
        if tag.lang is not None:
            state.current_lang = tag.lang
            break


class HTMLUserData(QTextBlockUserData):

    ' Per text block data: tag/attribute/non-tag records plus the parser state '

    def __init__(self):
        QTextBlockUserData.__init__(self)
        self.tags = []  # TagStart/TagEnd records, see add_tag_data()
        self.attributes = []  # Attr records, see add_attr_data()
        self.non_tag_structures = []  # NonTagBoundary records
        self.state = State()
        self.css_user_data = None  # created on demand by css()
        self.doc_name = None

    def clear(self, state=None, doc_name=None):
        ' Drop all recorded structures and install state (or a fresh State()) '
        self.tags, self.attributes, self.non_tag_structures = [], [], []
        self.state = State() if state is None else state
        self.doc_name = doc_name

    @classmethod
    def tag_ok_for_spell(cls, name):
        ' True unless name is in html_spell_tags '
        return name not in html_spell_tags


class XMLUserData(HTMLUserData):

    @classmethod
    def tag_ok_for_spell(cls, name):
        ' True only if name is in xml_spell_tags '
        return name in xml_spell_tags


def add_tag_data(user_data, tag):
    ' Record a TagStart or TagEnd on user_data '
    user_data.tags.append(tag)


# Unique sentinels used as the type/data fields of Attr records
ATTR_NAME, ATTR_VALUE, ATTR_START, ATTR_END = object(), object(), object(), object()


def add_attr_data(user_data, data_type, data, offset):
    ' Record an Attr(offset, data_type, data) on user_data '
    user_data.attributes.append(Attr(offset, data_type, data))


# State handlers. Every handler below is called with the text of a block and
# an index i into it, and returns a list of (length, format) pairs for the
# text starting at i. A handler may change state.parse to switch states.

def css(state, text, i, formats, user_data):
    ' Inside a <style> tag '
    pat = cdata_close_pats['style']
    m = pat.search(text, i)
    if m is None:
        css_text = text[i:]
    else:
        css_text = text[i:m.start()]
    ans = []
    css_user_data = user_data.css_user_data = user_data.css_user_data or CSSUserData()
    state.sub_parser_state = css_user_data.state = state.sub_parser_state or CSSState()
    # Delegate the stylesheet text to the CSS highlighter's state machine
    for j, num, fmt in run_loop(css_user_data, css_state_map, formats['css_sub_formats'], css_text):
        ans.append((num, fmt))
    if m is not None:
        # Found </style: the 2 is for the "</", the rest is the tag name
        state.sub_parser_state = None
        state.parse = IN_CLOSING_TAG
        add_tag_data(user_data, TagStart(m.start(), '', 'style', True, True))
        ans.extend([(2, formats['end_tag']), (len(m.group()) - 2, formats['tag_name'])])
    return ans


def cdata(state, text, i, formats, user_data):
    'CDATA inside tags like <title> or <style>'
    name = state.tags[-1].name
    pat = cdata_close_pats[name]
    m = pat.search(text, i)
    fmt = formats['title' if name == 'title' else 'special']
    if m is None:
        # No closing tag in this block, everything up to the end is CDATA
        return [(len(text) - i, fmt)]
    state.parse = IN_CLOSING_TAG
    num = m.start() - i
    add_tag_data(user_data, TagStart(m.start(), '', name, True, True))
    return [(num, fmt), (2, formats['end_tag']), (len(m.group()) - 2, formats['tag_name'])]


def process_text(state, text, nbsp_format, spell_format, user_data):
    '''
    Format a run of plain text: apply bold/italic according to state, give
    the characters matched by nbsp_pat the nbsp_format and, if spell checking
    is enabled and allowed for the innermost open tag, pass the remaining
    pieces through check_spelling().
    '''
    ans = []
    fmt = None
    if state.is_bold or state.is_italic:
        fmt = syntax_text_char_format()
        if state.is_bold:
            fmt.setFontWeight(QFont.Weight.Bold)
        if state.is_italic:
            fmt.setFontItalic(True)
    last = 0
    for m in nbsp_pat.finditer(text):
        ans.extend([(m.start() - last, fmt), (m.end() - m.start(), nbsp_format)])
        last = m.end()
    if not ans:
        ans = [(len(text), fmt)]
    elif last < len(text):
        ans.append((len(text) - last, fmt))

    if do_spell_check and state.tags and user_data.tag_ok_for_spell(state.tags[-1].name):
        split_ans = []
        locale = state.current_lang or dictionaries.default_locale
        sfmt = QTextCharFormat(spell_format)
        if fmt is not None:
            sfmt.merge(fmt)

        tpos = 0
        for tlen, fmt in ans:
            if fmt is nbsp_format:
                split_ans.append((tlen, fmt))
            else:
                split_ans.extend(check_spelling(text[tpos:tpos+tlen], tlen, fmt, locale, sfmt, store_locale.enabled))

            tpos += tlen
        ans = split_ans

    return ans


def normal(state, text, i, formats, user_data):
    ' The normal state in between tags '
    ch = text[i]
    if ch == '<':
        if text[i:i+4] == '<!--':
            state.parse, fmt = IN_COMMENT, formats['comment']
            user_data.non_tag_structures.append(NonTagBoundary(i, True, IN_COMMENT))
            return [(4, fmt)]

        if text[i:i+2] == '<?':
            state.parse, fmt = IN_PI, formats['preproc']
            user_data.non_tag_structures.append(NonTagBoundary(i, True, IN_PI))
            return [(2, fmt)]

        if text[i:i+2] == '<!' and text[i+2:].lstrip().lower().startswith('doctype'):
            state.parse, fmt = IN_DOCTYPE, formats['preproc']
            user_data.non_tag_structures.append(NonTagBoundary(i, True, IN_DOCTYPE))
            return [(2, fmt)]

        m = tag_name_pat.match(text, i + 1)
        if m is None:
            # A < that does not start a tag
            return [(1, formats['<'])]

        tname = m.group()
        closing = tname.startswith('/')
        if closing:
            tname = tname[1:]
        if ':' in tname:
            prefix, name = tname.split(':', 1)
        else:
            prefix, name = '', tname
        if prefix and not name:
            # The + 1 accounts for the leading <
            return [(len(m.group()) + 1, formats['only-prefix'])]
        ans = [(2 if closing else 1, formats['end_tag' if closing else 'tag'])]
        if prefix:
            ans.append((len(prefix)+1, formats['nsprefix']))
        ans.append((len(name), formats['tag_name']))
        state.parse = IN_CLOSING_TAG if closing else IN_OPENING_TAG
        add_tag_data(user_data, TagStart(i, prefix, name, closing, True))
        if closing:
            close_tag(state, name)
        else:
            state.tag_being_defined = Tag(name)
        return ans

    if ch == '&':
        m = entity_pat.match(text, i)
        if m is None:
            return [(1, formats['&'])]
        return [(len(m.group()), formats['entity'])]

    if ch == '>':
        return [(1, formats['>'])]

    # ch is not one of < > & so normal_pat matches starting at i
    t = normal_pat.search(text, i).group()
    return process_text(state, t, formats['nbsp'], formats['spell'], user_data)


def opening_tag(cdata_tags, state, text, i, formats, user_data):
    'An opening tag, like <a>'
    ch = text[i]
    if ch in space_chars:
        return [(1, None)]
    if ch == '/':
        m = self_closing_pat.match(text, i)
        if m is None:
            return [(1, formats['/'])]
        state.parse = NORMAL
        length = len(m.group())
        # The recorded offset is that of the closing >
        add_tag_data(user_data, TagEnd(i + length - 1, True, False))
        return [(length, formats['tag'])]
    if ch == '>':
        finish_opening_tag(state, cdata_tags)
        add_tag_data(user_data, TagEnd(i, False, False))
        return [(1, formats['tag'])]
    m = attribute_name_pat.match(text, i)
    if m is None:
        return [(1, formats['?'])]
    state.parse = ATTRIBUTE_NAME
    attrname = state.attribute_name = m.group()
    add_attr_data(user_data, ATTR_NAME, attrname, m.start())
    # [0::2] picks the parts before and after the first colon
    prefix, name = attrname.partition(':')[0::2]
    if not prefix and not name:
        return [(len(attrname), formats['?'])]
    if prefix and name:
        return [(len(prefix) + 1, formats['nsprefix']), (len(name), formats['attr'])]
    return [(len(prefix), formats['attr'])]


def attribute_name(state, text, i, formats, user_data):
    ' After attribute name '
    ch = text[i]
    if ch in space_chars:
        return [(1, None)]
    if ch == '=':
        state.parse = ATTRIBUTE_VALUE
        return [(1, formats['attr'])]
    # Standalone attribute with no value
    state.parse = IN_OPENING_TAG
    state.attribute_name = None
    return [(0, None)]


def attribute_value(state, text, i, formats, user_data):
    ' After attribute = '
    ch = text[i]
    if ch in space_chars:
        return [(1, None)]
    if ch in {'"', "'"}:
        state.parse = SQ_VAL if ch == "'" else DQ_VAL
        return [(1, formats['string'])]
    # Unquoted value (or no value at all)
    state.parse = IN_OPENING_TAG
    state.attribute_name = None
    m = unquoted_val_pat.match(text, i)
    if m is None:
        return [(1, formats['no-attr-value'])]
    return [(len(m.group()), formats['string'])]


def quoted_val(state, text, i, formats, user_data):
    ' A quoted attribute value '
    # Parse states are plain integers, so compare by value, not identity
    quote = '"' if state.parse == DQ_VAL else "'"
    add_attr_data(user_data, ATTR_VALUE, ATTR_START, i)
    pos = text.find(quote, i)
    if pos == -1:
        # The closing quote is not in this block
        num = len(text) - i
        is_link = is_class = False
    else:
        num = pos - i + 1
        state.parse = IN_OPENING_TAG
        if state.tag_being_defined is not None and state.attribute_name in ('lang', 'xml:lang'):
            try:
                state.tag_being_defined.lang = parse_lang_code(text[i:pos])
            except ValueError:
                pass  # Invalid language code, leave the tag's lang unchanged
        add_attr_data(user_data, ATTR_VALUE, ATTR_END, i + num)
        is_link = state.attribute_name in LINK_ATTRS
        is_class = not is_link and state.attribute_name == 'class'

    # In the link and class cases the final (1, string) is the closing quote
    if is_link:
        if verify_link(text[i:i+num - 1], user_data.doc_name) is False:
            return [(num - 1, formats['bad_link']), (1, formats['string'])]
        return [(num - 1, formats['link']), (1, formats['string'])]
    elif is_class:
        return [(num - 1, formats['class_attr']), (1, formats['string'])]
    return [(num, formats['string'])]


def closing_tag(state, text, i, formats, user_data):
    ' A closing tag like </a> '
    ch = text[i]
    if ch in space_chars:
        return [(1, None)]
    pos = text.find('>', i)
    if pos == -1:
        return [(len(text) - i, formats['bad-closing'])]
    state.parse = NORMAL
    num = pos - i + 1
    ans = [(1, formats['end_tag'])]
    if num > 1:
        # Anything between the tag name and the > is an error
        ans.insert(0, (num - 1, formats['bad-closing']))
    add_tag_data(user_data, TagEnd(pos, False, False))
    return ans


def in_comment(state, text, i, formats, user_data):
    ' Comment, processing instruction or doctype '
    end = {IN_COMMENT:'-->', IN_PI:'?>'}.get(state.parse, '>')
    pos = text.find(end, i)
    # Parse states are plain integers, so compare by value, not identity
    fmt = formats['comment' if state.parse == IN_COMMENT else 'preproc']
    if pos == -1:
        num = len(text) - i
    else:
        user_data.non_tag_structures.append(NonTagBoundary(pos, False, state.parse))
        num = pos - i + len(end)
        state.parse = NORMAL
    return [(num, fmt)]


# Maps each parser state to the handler used while in that state
state_map = {
    NORMAL:normal,
    IN_OPENING_TAG: partial(opening_tag, cdata_tags),
    IN_CLOSING_TAG: closing_tag,
    ATTRIBUTE_NAME: attribute_name,
    ATTRIBUTE_VALUE: attribute_value,
    CDATA: cdata,
    CSS: css,
}

for x in (IN_COMMENT, IN_PI, IN_DOCTYPE):
    state_map[x] = in_comment

for x in (SQ_VAL, DQ_VAL):
    state_map[x] = quoted_val

# Same as state_map except that no tag is treated as a cdata tag
xml_state_map = state_map.copy()
xml_state_map[IN_OPENING_TAG] = partial(opening_tag, set())


def create_formats(highlighter, add_css=True):
    '''
    Build the dictionary of named formats used by the state handlers from
    highlighter.theme. If add_css is True the formats for the CSS highlighter
    are included under the key css_sub_formats.
    '''
    t = highlighter.theme
    formats = {
        'tag': t['Function'],
        'end_tag': t['Function'],
        'attr': t['Type'],
        'entity': t['Special'],
        'error': t['Error'],
        'comment': t['Comment'],
        'special': t['Special'],
        'string': t['String'],
        'nsprefix': t['Constant'],
        'preproc': t['PreProc'],
        'nbsp': t['SpecialCharacter'],
        'spell': t['SpellError'],
    }
    # Error formats: the error style plus a tooltip describing the problem
    for name, msg in iteritems({
            '<': _('An unescaped < is not allowed. Replace it with &lt;'),
            '&': _('An unescaped ampersand is not allowed. Replace it with &amp;'),
            '>': _('An unescaped > is not allowed. Replace it with &gt;'),
            '/': _('/ not allowed except at the end of the tag'),
            '?': _('Unknown character'),
            'bad-closing': _('A closing tag must contain only the tag name and nothing else'),
            'no-attr-value': _('Expecting an attribute value'),
            'only-prefix': _('A tag name cannot end with a colon'),
    }):
        f = formats[name] = syntax_text_char_format(formats['error'])
        f.setToolTip(msg)
    f = formats['title'] = syntax_text_char_format()
    f.setFontWeight(QFont.Weight.Bold)
    if add_css:
        formats['css_sub_formats'] = create_css_formats(highlighter)
    formats['spell'].setProperty(SPELL_PROPERTY, True)
    formats['class_attr'] = syntax_text_char_format(t['Special'])
    formats['class_attr'].setProperty(CLASS_ATTRIBUTE_PROPERTY, True)
    formats['class_attr'].setToolTip(_('Hold down the Ctrl key and click to open the first matching CSS style rule'))
    formats['link'] = syntax_text_char_format(t['Link'])
    formats['link'].setProperty(LINK_PROPERTY, True)
    formats['link'].setToolTip(_('Hold down the Ctrl key and click to open this link'))
    formats['bad_link'] = syntax_text_char_format(t['BadLink'])
    formats['bad_link'].setProperty(LINK_PROPERTY, True)
    formats['bad_link'].setToolTip(_('This link points to a file that is not present in the book'))
    formats['tag_name'] = f = syntax_text_char_format(t['Statement'])
    f.setProperty(TAG_NAME_PROPERTY, True)
    return formats


class Highlighter(SyntaxHighlighter):

    ' Syntax highlighter for HTML '

    state_map = state_map
    create_formats_func = create_formats
    spell_attributes = ('alt', 'title')
    user_data_factory = HTMLUserData

    def tag_ok_for_spell(self, name):
        return HTMLUserData.tag_ok_for_spell(name)


class XMLHighlighter(Highlighter):

    ' Syntax highlighter for XML: no cdata tags and no embedded CSS formats '

    state_map = xml_state_map
    spell_attributes = ('opf:file-as',)
    user_data_factory = XMLUserData

    def create_formats_func(self):
        return create_formats(self, add_css=False)

    def tag_ok_for_spell(self, name):
        return XMLUserData.tag_ok_for_spell(name)


def profile():
    '''
    Profile highlighting of the file named by sys.argv[-2], writing the
    cProfile statistics to the file named by sys.argv[-1].
    '''
    import sys
    from qt.core import QTextDocument
    from calibre.gui2 import Application
    from calibre.gui2.tweak_book import set_book_locale
    from calibre.gui2.tweak_book.editor.themes import get_theme
    app = Application([])
    set_book_locale('en')
    with open(sys.argv[-2], 'rb') as f:
        raw = f.read().decode('utf-8')
    doc = QTextDocument()
    doc.setPlainText(raw)
    h = Highlighter()
    theme = get_theme(tprefs['editor_theme'])
    h.apply_theme(theme)
    h.set_document(doc)
    h.join()
    import cProfile
    print('Running profile on', sys.argv[-2])
    h.rehighlight()
    cProfile.runctx('h.join()', {}, {'h':h}, sys.argv[-1])
    print('Stats saved to:', sys.argv[-1])
    del h
    del doc
    del app


if __name__ == '__main__':
    from calibre.gui2.tweak_book.editor.widget import launch_editor
    # Sample document exercising the highlighter, including deliberately
    # invalid markup
    launch_editor('''\
<!DOCTYPE html>
<html xml:lang="en" lang="en">
<!--
-->
    <head>
        <meta charset="utf-8" />
        <title>A title with a tag <span> in it, the tag is treated as normal text</title>
        <style type="text/css">
            body {
                  color: green;
                  font-size: 12pt;
            }
        </style>
        <style type="text/css">p.small { font-size: x-small; color:gray }</style>
    </head id="invalid attribute on closing tag">
    <body lang="en_IN"><p:
        <!-- The start of the actual body text -->
        <h1 lang="en_US">A heading that should appear in bold, with an <i>italic</i> word</h1>
        <p>Some text with inline formatting, that is syntax highlighted. A <b>bold</b> word, and an <em>italic</em> word. \
<i>Some italic text with a <b>bold-italic</b> word in </i>the middle.</p>
        <!-- Let's see what exotic constructs like namespace prefixes and empty attributes look like -->
        <svg:svg xmlns:svg="http://whatever" />
        <input disabled><input disabled /><span attr=<></span>
        <!-- Non-breaking spaces are rendered differently from normal spaces, so that they stand out -->
        <p>Some\xa0words\xa0separated\xa0by\xa0non\u2011breaking\xa0spaces and non\u2011breaking hyphens.</p>
        <p>Some non-BMP unicode text:\U0001f431\U0001f431\U0001f431</p>
    </body>
</html>
''', path_is_raw=True)