# %PDF- %PDF-
# Directory: /lib/calibre/calibre/ebooks/docx/
# Current file: //lib/calibre/calibre/ebooks/docx/to_html.py
#!/usr/bin/env python3
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
import sys, os, re, math, errno, uuid, numbers
from collections import OrderedDict, defaultdict
from lxml import html
from lxml.html.builder import (
HTML, HEAD, TITLE, BODY, LINK, META, P, SPAN, BR, DIV, A, DT, DL, DD, H1)
from calibre import guess_type
from calibre.ebooks.docx.container import DOCX, fromstring
from calibre.ebooks.docx.names import XML, generate_anchor
from calibre.ebooks.docx.styles import Styles, inherit, PageProperties
from calibre.ebooks.docx.numbering import Numbering
from calibre.ebooks.docx.fonts import Fonts, is_symbol_font, map_symbol_text
from calibre.ebooks.docx.images import Images
from calibre.ebooks.docx.tables import Tables
from calibre.ebooks.docx.footnotes import Footnotes
from calibre.ebooks.docx.cleanup import cleanup_markup
from calibre.ebooks.docx.theme import Theme
from calibre.ebooks.docx.toc import create_toc
from calibre.ebooks.docx.fields import Fields
from calibre.ebooks.docx.settings import Settings
from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.utils.localization import canonicalize_lang, lang_as_iso639_1
from polyglot.builtins import iteritems, itervalues
# Non-breaking space (U+00A0); used below as filler text for tab spans,
# empty paragraphs and trailing <br> elements.
NBSP = '\xa0'
class Text:
    """Buffer text fragments and flush them onto the correct element slot.

    Text is collected in ``buf`` and, when a new child element is added,
    written to ``attr`` (``'text'`` or ``'tail'``) of the current element.
    After that the new element becomes current and further text goes into
    its ``tail``.  Iterating over the instance yields every element that has
    been current, in the order they were added.
    """

    def __init__(self, elem, attr, buf):
        self.elem = elem
        self.attr = attr
        self.buf = buf
        self.elems = [elem]

    def add_elem(self, elem):
        """Flush buffered text to the current element and switch to *elem*."""
        self.elems.append(elem)
        pending = ''.join(self.buf)
        setattr(self.elem, self.attr, pending)
        # Text that follows a child element lives in that child's tail.
        self.elem = elem
        self.attr = 'tail'
        self.buf = []

    def __iter__(self):
        return iter(self.elems)
def html_lang(docx_lang):
    """Map a DOCX language value to an ISO 639-1 code for the HTML output.

    Returns ``None`` when the language cannot be canonicalized, when it
    canonicalizes to ``'und'``, or when no ISO 639-1 form is returned for it.
    """
    canonical = canonicalize_lang(docx_lang)
    if not canonical or canonical == 'und':
        return None
    short_code = lang_as_iso639_1(canonical)
    return short_code if short_code else None
class Convert:
def __init__(self, path_or_stream, dest_dir=None, log=None, detect_cover=True, notes_text=None, notes_nopb=False, nosupsub=False):
    """Open the DOCX container and build an empty HTML skeleton.

    :param path_or_stream: passed straight to ``DOCX``.
    :param dest_dir: output directory; defaults to the current directory.
    :param log: passed to ``DOCX``; the container's own ``log`` is then used.
    :param detect_cover: forwarded to ``cleanup_markup`` during conversion.
    :param notes_text: heading used for the notes section; defaults to the
        translated string ``'Notes'``.
    :param notes_nopb: forwarded to ``Styles.generate_css``.
    :param nosupsub: forwarded to ``Styles.generate_css``.
    """
    self.docx = DOCX(path_or_stream, log=log)
    self.namespace = self.docx.namespace
    # Patterns used by convert_run() to decide whether a text node needs
    # white-space:pre-wrap.
    self.ms_pat = re.compile(r'\s{2,}')
    self.ws_pat = re.compile(r'[\n\r\t]')
    self.log = self.docx.log
    self.detect_cover = detect_cover
    self.notes_text = notes_text or _('Notes')
    self.notes_nopb = notes_nopb
    self.nosupsub = nosupsub
    self.dest_dir = dest_dir or os.getcwd()
    self.mi = self.docx.metadata
    self.body = BODY()

    ns = self.namespace
    self.theme = Theme(ns)
    self.settings = Settings(ns)
    self.tables = Tables(ns)
    self.fields = Fields(ns)
    self.styles = Styles(ns, self.tables)
    self.images = Images(ns, self.log)
    # Maps every generated HTML element to the Word element it came from.
    self.object_map = OrderedDict()

    head = HEAD(
        META(charset='utf-8'),
        TITLE(self.mi.title or _('Unknown')),
        LINK(rel='stylesheet', type='text/css', href='docx.css'),
    )
    self.html = HTML(head, self.body)

    # Whitespace-only text/tail values so the serialized skeleton is
    # indented.
    self.html.text = '\n\t'
    head.text = '\n\t\t'
    head.tail = '\n'
    for node in head:
        node.tail = '\n\t\t'
    head[-1].tail = '\n\t'
    self.body.text = self.body.tail = '\n'

    language = html_lang(self.mi.language)
    if language:
        self.html.set('lang', language)
    self.doc_lang = language or None
# Run the complete DOCX -> HTML conversion and return whatever self.write()
# returns.
#
# NOTE(review): leading whitespace has been stripped from this copy of the
# file, so statement nesting is not visible here; the comments below describe
# each step without relying on indentation.
def __call__(self):
doc = self.docx.document
relationships_by_id, relationships_by_type = self.docx.document_relationships
# Pre-process the document tree and load the supporting parts before any
# markup is generated.
self.resolve_alternate_content(doc)
self.fields(doc, self.log)
self.read_styles(relationships_by_type)
self.images(relationships_by_id)
# Per-conversion bookkeeping that convert_p(), add_frame(), resolve_links(),
# apply_frames() and mark_block_runs() read and update.
self.layers = OrderedDict()
self.framed = [[]]
self.frame_map = {}
self.framed_map = {}
self.anchor_map = {}
self.link_map = defaultdict(list)
self.link_source_map = {}
self.toc_anchor = None
self.block_runs = []
paras = []
self.log.debug('Converting Word markup to HTML')
self.read_page_properties(doc)
self.current_rels = relationships_by_id
# Walk the page map built by read_page_properties(); elements whose tag ends
# in '}p' are converted with convert_p() and appended to the body.
for wp, page_properties in iteritems(self.page_map):
self.current_page = page_properties
if wp.tag.endswith('}p'):
p = self.convert_p(wp)
self.body.append(p)
paras.append(wp)
self.read_block_anchors(doc)
self.styles.apply_contextual_spacing(paras)
self.mark_block_runs(paras)
# Apply page breaks at the start of every section, except the first
# section (since that will be the start of the file)
self.styles.apply_section_page_breaks(self.section_starts[1:])
notes_header = None
# Remember the document-level image relationship map; it is swapped for each
# note's own relationships below and restored afterwards.
orig_rid_map = self.images.rid_map
if self.footnotes.has_notes:
self.body.append(H1(self.notes_text))
notes_header = self.body[-1]
notes_header.set('class', 'notes-header')
# Each note becomes a <dl class="footnote">: the <dt> holds a back-link to
# '#back_<anchor>', the <dd> receives the converted note paragraphs.
for anchor, text, note in self.footnotes:
dl = DL(id=anchor)
dl.set('class', 'footnote')
self.body.append(dl)
dl.append(DT('[', A('←' + text, href='#back_%s' % anchor, title=text)))
dl[-1][0].tail = ']'
dl.append(DD())
paras = []
self.images.rid_map = self.current_rels = note.rels[0]
for wp in note:
if wp.tag.endswith('}tbl'):
self.tables.register(wp, self.styles)
self.page_map[wp] = self.current_page
else:
p = self.convert_p(wp)
dl[-1].append(p)
paras.append(wp)
self.styles.apply_contextual_spacing(paras)
self.mark_block_runs(paras)
for p, wp in iteritems(self.object_map):
if len(p) > 0 and not p.text and len(p[0]) > 0 and not p[0].text and p[0][0].get('class', None) == 'tab':
# Paragraph uses tabs for indentation, convert to text-indent
parent = p[0]
tabs = []
# Collect the leading run of 'tab' children, stopping at the first child
# that is not a tab or at a tab that has tail text.
for child in parent:
if child.get('class', None) == 'tab':
tabs.append(child)
if child.tail:
break
else:
break
indent = len(tabs) * self.settings.default_tab_stop
style = self.styles.resolve(wp)
# Only rewrite the indent when it is inherited or already a 'pt' string; an
# existing 'pt' value is added to the tab-derived indent.
if style.text_indent is inherit or (hasattr(style.text_indent, 'endswith') and style.text_indent.endswith('pt')):
if style.text_indent is not inherit:
indent = float(style.text_indent[:-2]) + indent
style.text_indent = '%.3gpt' % indent
parent.text = tabs[-1].tail or ''
for i in tabs:
parent.remove(i)
self.images.rid_map = orig_rid_map
self.resolve_links()
self.styles.cascade(self.layers)
self.tables.apply_markup(self.object_map, self.page_map)
# Gather (html element, num_id, level) for every source element carrying a
# 'calibre_num_id' attribute of the form '<level>:<num_id>'; a level that is
# not an integer falls back to 0.
numbered = []
for html_obj, obj in iteritems(self.object_map):
raw = obj.get('calibre_num_id', None)
if raw is not None:
lvl, num_id = raw.partition(':')[0::2]
try:
lvl = int(lvl)
except (TypeError, ValueError):
lvl = 0
numbered.append((html_obj, num_id, lvl))
self.numbering.apply_markup(numbered, self.body, self.styles, self.object_map, self.images)
self.apply_frames()
# Whitespace-only text/tail values between the body's children.
if len(self.body) > 0:
self.body.text = '\n\t'
for child in self.body:
child.tail = '\n\t'
self.body[-1].tail = '\n'
self.log.debug('Converting styles to CSS')
self.styles.generate_classes()
# Set the class attribute on every element whose resolved style has CSS and
# a class name.
for html_obj, obj in iteritems(self.object_map):
style = self.styles.resolve(obj)
if style is not None:
css = style.css
if css:
cls = self.styles.class_name(css)
if cls:
html_obj.set('class', cls)
for html_obj, css in iteritems(self.framed_map):
cls = self.styles.class_name(css)
if cls:
html_obj.set('class', cls)
# Give the notes header the tag (and, if it has one other than
# 'notes-header', the class) of an h1/h2/h3 child of the body.
# NOTE(review): presumably only the first heading found is used (note the
# break) - confirm against the indented original.
if notes_header is not None:
for h in self.namespace.children(self.body, 'h1', 'h2', 'h3'):
notes_header.tag = h.tag
cls = h.get('class', None)
if cls and cls != 'notes-header':
notes_header.set('class', '%s notes-header' % cls)
break
self.fields.polish_markup(self.object_map)
self.log.debug('Cleaning up redundant markup generated by Word')
self.cover_image = cleanup_markup(self.log, self.html, self.styles, self.dest_dir, self.detect_cover, self.namespace.XPath)
return self.write(doc)
def read_page_properties(self, doc):
    """Associate every paragraph and table with its section's page properties.

    Fills ``self.page_map`` (element -> ``PageProperties``, in document
    order) and ``self.section_starts`` (first element of each section).
    Tables are also registered with ``self.tables`` as they are found.
    """
    self.page_map = OrderedDict()
    self.section_starts = []
    pending = []

    for node in self.namespace.descendants(doc, 'w:p', 'w:tbl'):
        if node.tag.endswith('}tbl'):
            self.tables.register(node, self.styles)
            pending.append(node)
            continue
        sect = tuple(self.namespace.descendants(node, 'w:sectPr'))
        if not sect:
            pending.append(node)
            continue
        # A paragraph containing w:sectPr closes the section made up of it
        # and everything collected since the previous section end.
        props = PageProperties(self.namespace, sect)
        members = pending + [node]
        for member in members:
            self.page_map[member] = props
        self.section_starts.append(members[0])
        pending = []

    if not pending:
        return

    # Whatever is left over is covered by the w:sectPr directly under w:body.
    self.section_starts.append(pending[0])
    body_sect = self.namespace.XPath('./w:body/w:sectPr')(doc)
    props = PageProperties(self.namespace, body_sect)
    for member in pending:
        self.page_map[member] = props
def resolve_alternate_content(self, doc):
    """Drop ``mc:Choice`` children wherever an ``mc:Fallback`` is available.

    For proprietary extensions in Word documents the fallback, spec
    compliant form is used. ``mc:AlternateContent`` elements without any
    fallback are left untouched.
    See https://wiki.openoffice.org/wiki/OOXML/Markup_Compatibility_and_Extensibility
    """
    ns = self.namespace
    for alternate in ns.descendants(doc, 'mc:AlternateContent'):
        proprietary = ns.XPath('./mc:Choice')(alternate)
        compliant = ns.XPath('./mc:Fallback')(alternate)
        if not compliant:
            continue
        for node in proprietary:
            alternate.remove(node)
def read_styles(self, relationships_by_type):
    """Load settings, notes, fonts, theme, styles and numbering parts.

    Creates ``self.numbering``, ``self.footnotes`` and ``self.fonts`` and
    initializes ``self.settings``, ``self.theme`` and ``self.styles`` from
    the corresponding parts of the container. A part that cannot be read
    is logged as a warning and skipped; styles are always initialized,
    with ``None`` when no styles part could be loaded.

    :param relationships_by_type: mapping of relationship type to part
        name for the main document.
    """

    def get_name(rtype, defname):
        # Resolve the part name for a relationship type. When the document
        # declares no such relationship, fall back to the conventional
        # file name next to the main document, if that file exists.
        name = relationships_by_type.get(rtype, None)
        if name is None:
            cname = self.docx.document_name.split('/')
            cname[-1] = defname
            candidate = '/'.join(cname)
            if self.docx.exists(candidate):
                # The original assigned ``name = name`` here, which left
                # the name as None and made this fallback a no-op.
                name = candidate
        if name and name.startswith('word/word') and not self.docx.exists(name):
            # Drop the first path component of a name that starts with
            # 'word/word' but does not exist in the container.
            name = name.partition('/')[2]
        return name

    names = self.namespace.names
    nname = get_name(names['NUMBERING'], 'numbering.xml')
    sname = get_name(names['STYLES'], 'styles.xml')
    sename = get_name(names['SETTINGS'], 'settings.xml')
    fname = get_name(names['FONTS'], 'fontTable.xml')
    tname = get_name(names['THEMES'], 'theme1.xml')
    foname = get_name(names['FOOTNOTES'], 'footnotes.xml')
    enname = get_name(names['ENDNOTES'], 'endnotes.xml')

    numbering = self.numbering = Numbering(self.namespace)
    footnotes = self.footnotes = Footnotes(self.namespace)
    fonts = self.fonts = Fonts(self.namespace)

    foraw = enraw = None
    forel, enrel = ({}, {}), ({}, {})

    if sename is not None:
        try:
            seraw = self.docx.read(sename)
        except KeyError:
            self.log.warn('Settings %s do not exist' % sename)
        except OSError as e:
            if e.errno != errno.ENOENT:
                raise
            self.log.warn('Settings %s file missing' % sename)
        else:
            self.settings(fromstring(seraw))

    if foname is not None:
        try:
            foraw = self.docx.read(foname)
        except KeyError:
            self.log.warn('Footnotes %s do not exist' % foname)
        else:
            forel = self.docx.get_relationships(foname)

    if enname is not None:
        try:
            enraw = self.docx.read(enname)
        except KeyError:
            self.log.warn('Endnotes %s do not exist' % enname)
        else:
            enrel = self.docx.get_relationships(enname)

    footnotes(fromstring(foraw) if foraw else None, forel, fromstring(enraw) if enraw else None, enrel)

    if fname is not None:
        embed_relationships = self.docx.get_relationships(fname)[0]
        try:
            raw = self.docx.read(fname)
        except KeyError:
            self.log.warn('Fonts table %s does not exist' % fname)
        else:
            fonts(fromstring(raw), embed_relationships, self.docx, self.dest_dir)

    if tname is not None:
        try:
            raw = self.docx.read(tname)
        except KeyError:
            # The original reported this as a missing *styles* part and
            # printed ``sname`` instead of the theme name.
            self.log.warn('Theme %s does not exist' % tname)
        else:
            self.theme(fromstring(raw))

    styles_loaded = False
    if sname is not None:
        try:
            raw = self.docx.read(sname)
        except KeyError:
            self.log.warn('Styles %s do not exist' % sname)
        else:
            self.styles(fromstring(raw), fonts, self.theme)
            styles_loaded = True
    if not styles_loaded:
        self.styles(None, fonts, self.theme)

    if nname is not None:
        try:
            raw = self.docx.read(nname)
        except KeyError:
            self.log.warn('Numbering styles %s do not exist' % nname)
        else:
            numbering(fromstring(raw), self.styles, self.docx.get_relationships(nname)[0])

    self.styles.resolve_numbering(numbering)
def write(self, doc):
    """Write index.html, docx.css, metadata.opf and toc.ncx to dest_dir.

    The table of contents is created before the HTML is serialized. An
    empty ``toc.ncx`` is removed again after rendering.

    :return: path of the written ``metadata.opf``.
    """

    def in_dest(name):
        return os.path.join(self.dest_dir, name)

    toc = create_toc(doc, self.body, self.resolved_link_map, self.styles, self.object_map, self.log, self.namespace)
    markup = html.tostring(self.html, encoding='utf-8', doctype='<!DOCTYPE html>')
    with lopen(in_dest('index.html'), 'wb') as f:
        f.write(markup)

    stylesheet = self.styles.generate_css(self.dest_dir, self.docx, self.notes_nopb, self.nosupsub)
    if stylesheet:
        with lopen(in_dest('docx.css'), 'wb') as f:
            f.write(stylesheet.encode('utf-8'))

    opf = OPFCreator(self.dest_dir, self.mi)
    opf.toc = toc
    opf.create_manifest_from_files_in([self.dest_dir])
    for entry in opf.manifest:
        if entry.media_type != 'text/html':
            continue
        # Replace the HTML media type with whatever is guessed for .xhtml.
        entry.media_type = guess_type('a.xhtml')[0]
    opf.create_spine(['index.html'])
    if self.cover_image is not None:
        opf.guide.set_cover(self.cover_image)

    def process_guide(E, guide):
        # Add a guide reference to the table of contents anchor, if any.
        if self.toc_anchor is not None:
            guide.append(E.reference(
                href='index.html#' + self.toc_anchor, title=_('Table of Contents'), type='toc'))

    opf_path = in_dest('metadata.opf')
    toc_file = in_dest('toc.ncx')
    with lopen(opf_path, 'wb') as of, open(toc_file, 'wb') as ncx:
        opf.render(of, ncx, 'toc.ncx', process_guide=process_guide)
    if os.path.getsize(toc_file) == 0:
        os.remove(toc_file)
    return in_dest('metadata.opf')
def read_block_anchors(self, doc):
    """Attach body-level bookmarks to the paragraph that follows them.

    Bookmarks that are direct children of ``w:body`` are collected until a
    converted paragraph (one present in ``self.object_map``) is reached.
    That paragraph gets an ``id`` if it has none, and every collected
    bookmark name is mapped to it in ``self.anchor_map``.

    Pending names are kept in document order so the generated id is always
    derived from the first bookmark. The original kept them in a ``set``
    and used ``next(iter(...))``, so the chosen name depended on string
    hash ordering and could differ between runs.
    """
    doc_anchors = frozenset(self.namespace.XPath('./w:body/w:bookmarkStart[@w:name]')(doc))
    if not doc_anchors:
        return

    pending = []
    rmap = {v: k for k, v in iteritems(self.object_map)}
    for node in self.namespace.descendants(doc, 'w:p', 'w:bookmarkStart[@w:name]'):
        if node.tag.endswith('}p'):
            if pending and node in rmap:
                para = rmap[node]
                if 'id' not in para.attrib:
                    para.set('id', generate_anchor(pending[0], frozenset(itervalues(self.anchor_map))))
                para_id = para.get('id')
                for name in pending:
                    self.anchor_map[name] = para_id
                pending = []
        elif node in doc_anchors:
            name = self.namespace.get(node, 'w:name')
            if name and name not in pending:
                pending.append(name)
# Convert one w:p element into an HTML element (a P, retagged h1-h6 for
# 'heading N' styles) and return it. Along the way this records the element
# in self.object_map, self.layers and self.frame_map, collects hyperlink runs
# in self.link_map and bookmark ids in self.anchor_map.
#
# NOTE(review): leading whitespace has been stripped from this copy of the
# file, so statement nesting is not visible here; the comments below describe
# each step without relying on indentation.
def convert_p(self, p):
dest = P()
self.object_map[dest] = p
style = self.styles.resolve_paragraph(p)
self.layers[p] = []
self.frame_map[p] = style.frame
self.add_frame(dest, style.frame)
# current_anchor: generated id of a bookmark not yet attached to an element.
# current_hyperlink: the most recently seen w:hyperlink element, if any.
current_anchor = None
current_hyperlink = None
hl_xpath = self.namespace.XPath('ancestor::w:hyperlink[1]')
def p_parent(x):
# Ensure that nested <w:p> tags are handled. These can occur if a
# textbox is present inside a paragraph.
while True:
x = x.getparent()
try:
if x.tag.endswith('}p'):
return x
except AttributeError:
break
for x in self.namespace.descendants(p, 'w:r', 'w:bookmarkStart', 'w:hyperlink', 'w:instrText'):
# Skip descendants whose nearest enclosing paragraph is not this one.
if p_parent(x) is not p:
continue
if x.tag.endswith('}r'):
span = self.convert_run(x)
# A pending bookmark id goes on the paragraph itself while it has no
# children yet, otherwise on this run's span.
if current_anchor is not None:
(dest if len(dest) == 0 else span).set('id', current_anchor)
current_anchor = None
if current_hyperlink is not None:
try:
hl = hl_xpath(x)[0]
self.link_map[hl].append(span)
self.link_source_map[hl] = self.current_rels
x.set('is-link', '1')
except IndexError:
# The run has no w:hyperlink ancestor.
current_hyperlink = None
dest.append(span)
self.layers[p].append(x)
elif x.tag.endswith('}bookmarkStart'):
anchor = self.namespace.get(x, 'w:name')
if anchor and anchor not in self.anchor_map and anchor != '_GoBack':
# _GoBack is a special bookmark inserted by Word 2010 for
# the return to previous edit feature, we ignore it
old_anchor = current_anchor
self.anchor_map[anchor] = current_anchor = generate_anchor(anchor, frozenset(itervalues(self.anchor_map)))
if old_anchor is not None:
# The previous anchor was not applied to any element
for a, t in tuple(self.anchor_map.items()):
if t == old_anchor:
self.anchor_map[a] = current_anchor
elif x.tag.endswith('}hyperlink'):
current_hyperlink = x
elif x.tag.endswith('}instrText') and x.text and x.text.strip().startswith('TOC '):
# A 'TOC ' field instruction: register a generated 'toc' anchor under a
# random uuid key and remember it in self.toc_anchor.
old_anchor = current_anchor
anchor = str(uuid.uuid4())
self.anchor_map[anchor] = current_anchor = generate_anchor('toc', frozenset(itervalues(self.anchor_map)))
self.toc_anchor = current_anchor
if old_anchor is not None:
# The previous anchor was not applied to any element
for a, t in tuple(iteritems(self.anchor_map)):
if t == old_anchor:
self.anchor_map[a] = current_anchor
# Handle a bookmark id that is still pending once the descendants have been
# processed.
if current_anchor is not None:
if dest.get('id'):
# this bookmark was at the end of the paragraph
if len(dest):
if dest[-1].get('id'):
self.anchor_map[current_anchor] = dest[-1].get('id')
else:
dest[-1].set('id', current_anchor)
else:
self.anchor_map[current_anchor] = dest.get('id')
else:
# This paragraph had no <w:r> descendants
dest.set('id', current_anchor)
current_anchor = None
# Styles named 'heading N' (case-insensitive) become hN, with N clamped to
# the range 1..6.
m = re.match(r'heading\s+(\d+)$', style.style_name or '', re.IGNORECASE)
if m is not None:
n = min(6, max(1, int(m.group(1))))
dest.tag = 'h%d' % n
dest.set('data-heading-level', str(n))
if style.bidi is True:
dest.set('dir', 'rtl')
# Group spans whose run styles report the same border (same_border()).
# NOTE: `style` is rebound to run styles from here on.
border_runs = []
common_borders = []
for span in dest:
run = self.object_map[span]
style = self.styles.resolve_run(run)
if not border_runs or border_runs[-1][1].same_border(style):
border_runs.append((span, style))
elif border_runs:
if len(border_runs) > 1:
common_borders.append(border_runs)
border_runs = []
# For each collected group, gather the border CSS into `bs`, clear it on the
# run styles, and, when `bs` is non-empty, wrap the spans in one SPAN with a
# registered 'text_border' class.
for border_run in common_borders:
spans = []
bs = {}
for span, style in border_run:
style.get_border_css(bs)
style.clear_border_css()
spans.append(span)
if bs:
cls = self.styles.register(bs, 'text_border')
wrapper = self.wrap_elems(spans, SPAN())
wrapper.set('class', cls)
if not dest.text and len(dest) == 0 and not style.has_visible_border():
# Empty paragraph add a non-breaking space so that it is rendered
# by WebKit
dest.text = NBSP
# If the last element in a block is a <br> the <br> is not rendered in
# HTML, unless it is followed by a trailing space. Word, on the other
# hand inserts a blank line for trailing <br>s.
if len(dest) > 0 and not dest[-1].tail:
if dest[-1].tag == 'br':
dest[-1].tail = NBSP
elif len(dest[-1]) > 0 and dest[-1][-1].tag == 'br' and not dest[-1][-1].tail:
dest[-1][-1].tail = NBSP
return dest
def wrap_elems(self, elems, wrapper):
    """Move *elems* into *wrapper*, placing it where the first one was.

    The wrapper takes over the tail of the last element, whose own tail
    is cleared.

    :param elems: non-empty sequence of elements to wrap.
    :param wrapper: the element that will contain them.
    :return: *wrapper*.
    """
    first, last = elems[0], elems[-1]
    parent = first.getparent()
    parent.insert(parent.index(first), wrapper)
    wrapper.tail, last.tail = last.tail, None
    for node in elems:
        try:
            parent.remove(node)
        except ValueError:
            # Probably a hyperlink that spans multiple paragraphs.
            # Theoretically this should be broken up into multiple
            # hyperlinks; instead the element is detached from whatever
            # parent it actually has.
            node.getparent().remove(node)
        wrapper.append(node)
    return wrapper
def resolve_links(self):
    """Turn collected hyperlink runs, hyperlink fields and linked images into <a>.

    Three sources are processed in order:

    1. ``w:hyperlink`` elements collected in ``self.link_map``; the
       resulting element is also stored in ``self.resolved_link_map``;
    2. hyperlink fields from ``self.fields.hyperlink_fields``;
    3. image links from ``self.images.links``.

    Targets that cannot be resolved are logged and left without an href.
    """
    ns = self.namespace
    self.resolved_link_map = {}

    for hyperlink, spans in iteritems(self.link_map):
        rels = self.link_source_map[hyperlink]
        link = spans[0]
        if len(spans) > 1:
            link = self.wrap_elems(spans, SPAN())
        link.tag = 'a'
        self.resolved_link_map[hyperlink] = link

        target = ns.get(hyperlink, 'w:tgtFrame')
        if target:
            link.set('target', target)
        tooltip = ns.get(hyperlink, 'w:tooltip')
        if tooltip:
            link.set('title', tooltip)

        rid = ns.get(hyperlink, 'r:id')
        if rid and rid in rels:
            link.set('href', rels[rid])
            continue
        anchor = ns.get(hyperlink, 'w:anchor')
        if anchor and anchor in self.anchor_map:
            link.set('href', '#' + self.anchor_map[anchor])
            continue
        self.log.warn('Hyperlink with unknown target (rid=%s, anchor=%s), ignoring' %
                      (rid, anchor))
        # hrefs that point nowhere give epubcheck a hernia. The element
        # should be styled explicitly by Word anyway.
        # span.set('href', '#')

    rmap = {v: k for k, v in iteritems(self.object_map)}
    for hyperlink, runs in self.fields.hyperlink_fields:
        spans = [rmap[r] for r in runs if r in rmap]
        if not spans:
            continue
        link = spans[0]
        if len(spans) > 1:
            link = self.wrap_elems(spans, SPAN())
        link.tag = 'a'

        target = hyperlink.get('target', None)
        if target:
            link.set('target', target)
        tooltip = hyperlink.get('title', None)
        if tooltip:
            link.set('title', tooltip)

        url = hyperlink.get('url', None)
        if url is not None:
            # A url that matches a known bookmark is an internal link.
            if url in self.anchor_map:
                link.set('href', '#' + self.anchor_map[url])
            else:
                link.set('href', url)
            continue
        anchor = hyperlink.get('anchor', None)
        if anchor in self.anchor_map:
            link.set('href', '#' + self.anchor_map[anchor])
            continue
        self.log.warn('Hyperlink field with unknown anchor: %s' % anchor)

    for img, link, relationships_by_id in self.images.links:
        parent = img.getparent()
        # The position must be read before A(img) re-parents the image.
        idx = parent.index(img)
        a = A(img)
        a.tail, img.tail = img.tail, None
        parent.insert(idx, a)

        target = link.get('target', None)
        if target:
            a.set('target', target)
        tooltip = link.get('title', None)
        if tooltip:
            a.set('title', tooltip)

        rid = link['id']
        if rid not in relationships_by_id:
            continue
        dest = relationships_by_id[rid]
        if not dest.startswith('#'):
            a.set('href', dest)
        elif dest[1:] in self.anchor_map:
            a.set('href', '#' + self.anchor_map[dest[1:]])
def convert_run(self, run):
    """Convert one ``w:r`` element into a ``<span>`` and return it.

    Text, line/page breaks, drawings/pictures, note references, tabs and
    special hyphens among the run's children are translated into text
    and child elements of the span. The span is recorded in
    ``self.object_map``. Attributes derived from the resolved run style
    (``data-docx-vert``, ``lang``, ``dir``) are set, and text in a symbol
    font is remapped.

    Fix over the original: the ``clear`` attribute of ``w:br`` is now read
    through the namespace helper (``w:clear``), the same way ``w:type`` is
    read; the original looked up the un-namespaced attribute only. The
    un-namespaced form is still accepted as a fallback.
    """
    ans = SPAN()
    self.object_map[ans] = run
    text = Text(ans, 'text', [])
    is_tag = self.namespace.is_tag

    def push(elem):
        # Flush pending text, make *elem* current and attach it to the span.
        text.add_elem(elem)
        ans.append(elem)
        return elem

    for child in run:
        if is_tag(child, 'w:t'):
            if not child.text:
                continue
            ctext = child.text
            if child.get(XML('space'), None) != 'preserve':
                # Remove leading and trailing whitespace. Word ignores
                # leading and trailing whitespace without preserve
                ctext = ctext.strip(' \n\r\t')
            # Only use a <span> with white-space:pre-wrap if this element
            # actually needs it, i.e. if it has more than one
            # consecutive space or it has newlines or tabs.
            needs_pre_wrap = (
                self.ms_pat.search(ctext) is not None or
                self.ws_pat.search(ctext) is not None)
            if needs_pre_wrap:
                push(SPAN(ctext, style="white-space:pre-wrap"))
            else:
                text.buf.append(ctext)
        elif is_tag(child, 'w:cr'):
            push(BR())
        elif is_tag(child, 'w:br'):
            typ = self.namespace.get(child, 'w:type')
            if typ in {'column', 'page'}:
                br = BR(style='page-break-after:always')
            else:
                clear = self.namespace.get(child, 'w:clear') or child.get('clear', None)
                if clear in {'all', 'left', 'right'}:
                    br = BR(style='clear:%s' % ('both' if clear == 'all' else clear))
                else:
                    br = BR()
            push(br)
        elif is_tag(child, 'w:drawing') or is_tag(child, 'w:pict'):
            for img in self.images.to_html(child, self.current_page, self.docx, self.dest_dir):
                push(img)
        elif is_tag(child, 'w:footnoteReference') or is_tag(child, 'w:endnoteReference'):
            anchor, name = self.footnotes.get_ref(child)
            if anchor and name:
                ref = A(name, id='back_%s' % anchor, href='#' + anchor, title=name)
                ref.set('class', 'noteref')
                push(ref)
        elif is_tag(child, 'w:tab'):
            # A tab becomes a span of non-breaking spaces whose count is
            # derived from the default tab stop.
            spaces = int(math.ceil((self.settings.default_tab_stop / 36) * 6))
            push(SPAN(NBSP * spaces)).set('class', 'tab')
        elif is_tag(child, 'w:noBreakHyphen'):
            text.buf.append('\u2011')
        elif is_tag(child, 'w:softHyphen'):
            text.buf.append('\u00ad')

    if text.buf:
        # Flush whatever text is still buffered after the last child.
        setattr(text.elem, text.attr, ''.join(text.buf))

    style = self.styles.resolve_run(run)
    if style.vert_align in {'superscript', 'subscript'}:
        if ans.text or len(ans):
            ans.set('data-docx-vert', 'sup' if style.vert_align == 'superscript' else 'sub')
    if style.lang is not inherit:
        lang = html_lang(style.lang)
        if lang is not None and lang != self.doc_lang:
            ans.set('lang', lang)
    if style.rtl is True:
        ans.set('dir', 'rtl')
    if is_symbol_font(style.font_family):
        for elem in text:
            if elem.text:
                elem.text = map_symbol_text(elem.text, style.font_family)
            if elem.tail:
                elem.tail = map_symbol_text(elem.tail, style.font_family)
        style.font_family = 'sans-serif'
    return ans
def add_frame(self, html_obj, style):
    """Record *html_obj* in the current run of framed paragraphs.

    ``self.framed`` is a list of runs; each run is a list of
    ``(html_obj, frame_style)`` pairs that ``apply_frames()`` later wraps
    in a single element styled from the run's first entry.

    * A paragraph without a frame (``style is inherit``) ends the current
      run, if that run is non-empty.
    * A paragraph whose frame equals the previous one joins the current
      run.
    * A paragraph whose frame differs from the previous one starts a new
      run. The original appended it to the current run in this case too
      (all of its branches were identical), so differently framed
      neighbours were merged under the first one's style.
    """
    last_run = self.framed[-1]
    if style is inherit:
        if last_run:
            self.framed.append([])
        return

    entry = (html_obj, style)
    if last_run and not (last_run[-1][1] == style):
        self.framed.append([entry])
    else:
        last_run.append(entry)
def apply_frames(self):
    """Wrap framed paragraph runs and bordered block runs in <div> elements.

    Each non-empty run in ``self.framed`` is wrapped in a DIV whose CSS
    comes from the first entry's frame style. Each entry of
    ``self.block_runs`` is then wrapped in a DIV carrying the run's border
    style. Every wrapper is recorded in ``self.framed_map`` and its CSS
    registered under the ``'frame'`` prefix.
    """
    for framed_run in filter(None, self.framed):
        frame_style = framed_run[0][1]
        paras = tuple(entry[0] for entry in framed_run)
        first = paras[0]
        parent = first.getparent()
        # The position must be read before DIV(*paras) re-parents them.
        idx = parent.index(first)
        frame = DIV(*paras)
        parent.insert(idx, frame)
        css = frame_style.css(self.page_map[self.object_map[first]])
        self.framed_map[frame] = css
        self.styles.register(css, 'frame')

    if not self.block_runs:
        return

    rmap = {v: k for k, v in iteritems(self.object_map)}
    for border_style, blocks in self.block_runs:
        paras = tuple(rmap[p] for p in blocks)
        has_li = any(p.tag == 'li' for p in paras)
        parent = paras[0].getparent()
        if parent.tag in ('ul', 'ol'):
            # The run lives inside a list: wrap the whole list instead.
            wrapped = (parent,)
            parent = parent.getparent()
        else:
            if has_li:
                def top_level_tag(x):
                    # Climb until the element directly below `parent` (or
                    # the topmost ancestor) is reached.
                    while True:
                        q = x.getparent()
                        if q is parent or q is None:
                            break
                        x = q
                    return x
                paras = tuple(map(top_level_tag, paras))
            wrapped = paras
        idx = parent.index(wrapped[0])
        frame = DIV(*wrapped)
        parent.insert(idx, frame)
        css = border_style.css
        self.framed_map[frame] = css
        self.styles.register(css, 'frame')
# Find runs of consecutive paragraphs that share a frame (self.frame_map) and
# have identical borders, and hand every run longer than one paragraph to
# process_run(), which may append a (border_style, run) entry to
# self.block_runs.
#
# NOTE(review): leading whitespace has been stripped from this copy of the
# file, so statement nesting is not visible here; the comments below describe
# each step without relying on indentation.
def mark_block_runs(self, paras):
def process_run(run):
# Largest numeric left/right margins seen in the run.
max_left = max_right = 0
# Taken from the first paragraph's style only (set while still None).
has_visible_border = None
for p in run:
style = self.styles.resolve_paragraph(p)
if has_visible_border is None:
has_visible_border = style.has_visible_border()
if isinstance(style.margin_left, numbers.Number):
max_left = max(style.margin_left, max_left)
if isinstance(style.margin_right, numbers.Number):
max_right = max(style.margin_right, max_right)
if has_visible_border:
style.margin_left = style.margin_right = inherit
# Paragraphs other than the first lose their top padding; the first one
# supplies the cloned border style (and, with a visible border, its top
# margin).
if p is not run[0]:
style.padding_top = 0
else:
border_style = style.clone_border_styles()
if has_visible_border:
border_style.margin_top, style.margin_top = style.margin_top, inherit
# Paragraphs other than the last lose their bottom padding; with a visible
# border the last one hands its bottom margin to the border style.
if p is not run[-1]:
style.padding_bottom = 0
else:
if has_visible_border:
border_style.margin_bottom, style.margin_bottom = style.margin_bottom, inherit
style.clear_borders()
if p is not run[-1]:
style.apply_between_border()
if has_visible_border:
border_style.margin_left, border_style.margin_right = max_left,max_right
self.block_runs.append((border_style, run))
run = []
for p in paras:
# Extend the current run when the frame matches the previous paragraph's
# and the borders are identical.
if run and self.frame_map.get(p) == self.frame_map.get(run[-1]):
style = self.styles.resolve_paragraph(p)
last_style = self.styles.resolve_paragraph(run[-1])
if style.has_identical_borders(last_style):
run.append(p)
continue
if len(run) > 1:
process_run(run)
run = [p]
if len(run) > 1:
process_run(run)
if __name__ == '__main__':
    # Ad-hoc driver: convert the file named by the last command line
    # argument into a freshly created ./docx_input directory, with debug
    # logging enabled.
    import shutil
    from calibre.utils.logging import default_log

    default_log.filter_level = default_log.DEBUG
    output_dir = os.path.join(os.getcwd(), 'docx_input')
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
    os.mkdir(output_dir)
    converter = Convert(sys.argv[-1], dest_dir=output_dir, log=default_log)
    converter()