# NOTE: listing header captured together with the file; not part of the module.
# Directory    : /proc/thread-self/root/usr/lib/calibre/calibre/ebooks/odt/
# Current file : //proc/thread-self/root/usr/lib/calibre/calibre/ebooks/odt/input.py
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en'
'''
Convert an ODT file into an Open Ebook
'''
import os, logging
from lxml import etree
from css_parser import CSSParser
from css_parser.css import CSSRule
from odf.odf2xhtml import ODF2XHTML
from odf.opendocument import load as odLoad
from odf.draw import Frame as odFrame, Image as odImage
from odf.namespaces import TEXTNS as odTEXTNS
from calibre import CurrentDir, walk
from calibre.ebooks.oeb.base import _css_logger
from calibre.utils.xml_parse import safe_xml_fromstring
from polyglot.builtins import string_or_bytes, as_bytes
class Extract(ODF2XHTML):
    '''
    Convert an ODT document into an unpacked Open Ebook directory.

    Calling an instance writes ``index.xhtml``, the pictures stored in the
    archive and ``metadata.opf`` into the output directory (plus
    ``odfpy.css`` when the markup clean-up succeeds) and returns the path
    of the generated OPF file.
    '''

    def extract_pictures(self, zf, log=None):
        '''
        Write every ``Pictures*`` member of the archive into the current
        working directory.

        :param zf: Open zip archive exposing ``namelist()`` and ``read()``.
        :param log: Optional logger; when given, skipped members are
            reported through ``log.warn``.
        '''
        base = os.path.abspath(os.getcwd())
        os.makedirs('Pictures', exist_ok=True)
        for name in zf.namelist():
            if not name.startswith('Pictures') or name in {'Pictures', 'Pictures/'}:
                continue
            if name.endswith('/'):
                # Directory entry: there is no payload to write, needed
                # folders are created on demand below.
                continue
            dest = os.path.abspath(os.path.join(base, name))
            # Member names come from the (untrusted) document. Refuse
            # anything that resolves outside the output directory, such as
            # "Pictures/../../somewhere".
            if os.path.commonpath((base, dest)) != base:
                if log is not None:
                    log.warn('Skipping picture with unsafe path in ODT archive: %s' % name)
                continue
            os.makedirs(os.path.dirname(dest), exist_ok=True)
            data = zf.read(name)
            with open(dest, 'wb') as f:
                f.write(data)

    def apply_list_starts(self, root, log):
        '''
        Set the ``start`` attribute on ``<ol>`` elements whose class has an
        entry in ``self.list_starts``.

        ``self.list_starts`` is looked up with ``'.' + class name`` keys;
        it is not defined in this class (presumably filled in by the base
        class while walking the document -- confirm against ODF2XHTML).
        '''
        if not self.list_starts:
            return
        known = frozenset(self.list_starts)
        for ol in root.xpath('//*[local-name() = "ol" and @class]'):
            selectors = {'.' + name for name in ol.get('class', '').split()}
            matched = selectors & known
            if matched:
                ol.set('start', self.list_starts[next(iter(matched))])

    def fix_markup(self, html, log):
        '''
        Parse ``html``, run the CSS and markup clean-up passes over the
        tree and return the result serialized as UTF-8 bytes with an XML
        declaration.
        '''
        root = safe_xml_fromstring(html)
        # Order matters: the CSS is simplified first, then moved to
        # odfpy.css (which also populates self.css), and only then can the
        # markup pass look up rules by class.
        self.filter_css(root, log)
        self.extract_css(root, log)
        self.epubify_markup(root, log)
        self.apply_list_starts(root, log)
        return etree.tostring(root, encoding='utf-8', xml_declaration=True)

    def extract_css(self, root, log):
        '''
        Move all inline ``text/css`` ``<style>`` elements into the external
        stylesheet ``odfpy.css``.

        The style elements are removed from the tree, a ``<link>`` to
        ``odfpy.css`` is appended to ``<head>`` (if there is one), the
        combined CSS is parsed into ``self.css`` and written to disk in the
        current working directory.
        '''
        chunks = []
        for s in root.xpath('//*[local-name() = "style" and @type="text/css"]'):
            # An empty <style/> has text None, which str.join() rejects.
            chunks.append(s.text or '')
            s.getparent().remove(s)

        head = root.xpath('//*[local-name() = "head"]')
        if head:
            head = head[0]
            ns = head.nsmap.get(None, '')
            if ns:
                ns = '{%s}' % ns
            etree.SubElement(head, ns + 'link', {
                'type': 'text/css', 'rel': 'stylesheet', 'href': 'odfpy.css'})

        css = '\n\n'.join(chunks)
        parser = CSSParser(loglevel=logging.WARNING, log=_css_logger)
        self.css = parser.parseString(css, validate=False)
        with open('odfpy.css', 'wb') as f:
            f.write(css.encode('utf-8'))

    def get_css_for_class(self, cls):
        '''
        Return the first style rule in ``self.css`` that has a selector
        equal to ``.<cls>``, or ``None`` if ``cls`` is empty or no rule
        matches.
        '''
        if not cls:
            return None
        wanted = '.' + cls
        for rule in self.css.cssRules.rulesOfType(CSSRule.STYLE_RULE):
            for sel in rule.selectorList:
                if sel.selectorText == wanted:
                    return rule
        return None

    def _css_rules_for(self, class_attr):
        ''' Return the style rules found for the classes named in ``class_attr``. '''
        found = (self.get_css_for_class(name) for name in class_attr.split())
        return [rule for rule in found if rule]

    def epubify_markup(self, root, log):
        '''
        Adjust the generated XHTML in place so that it validates and
        renders acceptably in EPUB readers.

        Requires ``self.css`` to be set (see :meth:`extract_css`).
        '''
        from calibre.ebooks.oeb.base import XPath, XHTML
        # Fix empty title tags
        for t in XPath('//h:title')(root):
            if not t.text:
                t.text = ' '

        # <p><div> constructs are rejected by epubcheck, so turn the
        # enclosing <p> into a <div>
        for div in XPath('//h:p/h:div')(root):
            div.getparent().tag = XHTML('div')

        # Remove the position:relative as it causes problems with some epub
        # renderers. Remove display: block on an image inside a div as it is
        # redundant and prevents text-align:center from working in ADE
        # Also ensure that the img is contained in its containing div
        for img in XPath('//h:div/h:img[@style]')(root):
            div = img.getparent()
            if len(div) != 1:
                continue
            style = div.attrib.get('style', '')
            if style and not style.endswith(';'):
                style = style + ';'
            # Ensures position of containing div is static
            div.attrib['style'] = style + 'position:static'
            # Ensure that the img is always contained in its frame
            img.attrib['style'] = 'max-width: 100%; max-height: 100%'

        # Handle anchored images. The default markup + CSS produced by
        # odf2xhtml works with WebKit but not with ADE. So we convert the
        # common cases of left/right/center aligned block images to work on
        # both webkit and ADE. We detect the case of setting the side margins
        # to auto and map it to an appropriate text-align directive, which
        # works in both WebKit and ADE.
        # https://bugs.launchpad.net/bugs/1063207
        # https://bugs.launchpad.net/calibre/+bug/859343
        for img in XPath('descendant::h:div/h:div/h:img')(root):
            div2 = img.getparent()
            div1 = div2.getparent()
            if (len(div1), len(div2)) != (1, 1):
                continue

            has_align = any(
                r.style.getProperty('text-align') is not None
                for r in self._css_rules_for(div1.get('class', '')))

            if not has_align:
                ml = mr = None
                # Collect the margin values over all rules first; the last
                # rule that sets a margin wins.
                for r in self._css_rules_for(div2.get('class', '')):
                    ml = r.style.getPropertyCSSValue('margin-left') or ml
                    mr = r.style.getPropertyCSSValue('margin-right') or mr
                ml = getattr(ml, 'value', None)
                mr = getattr(mr, 'value', None)

                aval = None
                if ml == 'auto' and mr == 'auto':
                    aval = 'center'
                elif ml == 'auto':
                    aval = 'right'
                elif mr == 'auto':
                    aval = 'left'

                if aval is not None:
                    style = div1.attrib.get('style', '').strip()
                    if style and not style.endswith(';'):
                        style = style + ';'
                    div1.attrib['style'] = style + 'text-align:%s' % aval
                    has_align = True

            if has_align:
                # This is needed for ADE, without it the text-align has no
                # effect. The inner div may carry no style attribute at all.
                div2.attrib['style'] = 'display:inline;' + div2.attrib.get('style', '')

    def filter_css(self, root, log):
        '''
        Simplify the first ``text/css`` ``<style>`` element via
        :meth:`do_filter_css` and add the replacement class names to every
        element that used one of the merged classes.
        '''
        styles = root.xpath('//*[local-name() = "style" and @type="text/css"]')
        if not styles:
            return
        style = styles[0]
        css = style.text
        if not css:
            return
        css, sel_map = self.do_filter_css(css)
        if not isinstance(css, str):
            css = css.decode('utf-8', 'ignore')
        style.text = css
        for elem in root.xpath('//*[@class]'):
            orig = elem.get('class')
            extra = []
            for cls in orig.split():
                extra.extend(sel_map.get(cls, []))
            if extra:
                elem.set('class', orig + ' ' + ' '.join(extra))

    def do_filter_css(self, css):
        '''
        Collapse rules whose selector list consists of several class
        selectors into a single generated class.

        :param css: Stylesheet text.
        :return: ``(css_text, sel_map)`` where ``css_text`` is the
            serialized sheet and ``sel_map`` maps each original class name
            to the list of generated ``c_odtN`` class names replacing it.
        '''
        from css_parser import parseString
        from css_parser.css import CSSRule
        sheet = parseString(css, validate=False)
        sel_map = {}
        count = 0
        # Materialize the rule list first, as rules are modified in the loop
        for rule in list(sheet.cssRules.rulesOfType(CSSRule.STYLE_RULE)):
            selectors = rule.selectorList
            # Only rules made up purely of class selectors qualify
            only_classes = all(x.selectorText.startswith('.') for x in selectors)
            if len(selectors) > 1 and only_classes:
                # Replace all the class selectors with a single class selector
                # This will be added to the class attribute of all elements
                # that have one of these selectors.
                replace_name = 'c_odt%d' % count
                count += 1
                for sel in selectors:
                    sel_map.setdefault(sel.selectorText[1:], []).append(replace_name)
                rule.selectorText = '.' + replace_name
        return sheet.cssText, sel_map

    def search_page_img(self, mi, log):
        '''
        Warn (once) if the document contains a frame whose
        ``text:anchor-type`` is ``page``.
        '''
        for frm in self.document.topnode.getElementsByType(odFrame):
            try:
                anchored_to_page = frm.getAttrNS(odTEXTNS, 'anchor-type') == 'page'
            except ValueError:
                # Reading the attribute failed for this frame; look at the
                # next one.
                continue
            if anchored_to_page:
                log.warn('Document has Pictures anchored to Page, will all end up before first page!')
                break

    def filter_cover(self, mi, log):
        '''
        Remove the paragraph holding the cover image from the document
        tree, so the cover does not show up again in the text.

        The paragraph is removed only when the frame named
        ``mi.odf_cover_frame`` contains exactly the image ``mi.cover`` and
        nothing else, and is itself the only child of a ``text:p``.
        '''
        if not (mi.cover and mi.odf_cover_frame):
            return
        for frm in self.document.topnode.getElementsByType(odFrame):
            # search the right frame
            if frm.getAttribute('name') != mi.odf_cover_frame:
                continue
            img = frm.getElementsByType(odImage)
            # only one draw:image allowed in the draw:frame
            if len(img) != 1 or img[0].getAttribute('href') != mi.cover:
                continue
            # This is the right frame with the right image; from here on
            # the search is over whatever the outcome.
            if len(frm.childNodes) != 1:
                return
            # check if the parent paragraph has more children
            para = frm.parentNode
            if para.tagName != 'text:p' or len(para.childNodes) != 1:
                return
            # now it should be safe to remove the text:p
            para.parentNode.removeChild(para)
            log("Removed cover image paragraph from document...")
            return

    def filter_load(self, odffile, mi, log):
        '''
        This is an adaption from ODF2XHTML. It adds a step between
        load and parse of the document where the Element tree can be
        modified.

        :param odffile: A path, a file-like object with ``read`` or an
            already loaded document.
        '''
        # first load the odf structure
        self.lines = []
        self._wfunc = self._wlines
        if isinstance(odffile, string_or_bytes) or hasattr(odffile, 'read'):
            self.document = odLoad(odffile)
        else:
            self.document = odffile
        # filter stuff
        self.search_page_img(mi, log)
        try:
            self.filter_cover(mi, log)
        except Exception:
            # Removing the cover is best effort only, but say so instead of
            # hiding the failure completely.
            log.exception('Failed to remove the cover image from the document, ignoring')
        # parse the modified tree and generate xhtml
        self._walknode(self.document.topnode)

    def __call__(self, stream, odir, log):
        '''
        Convert the ODT file in ``stream`` into the directory ``odir``.

        :param stream: Seekable binary stream containing the ODT file.
        :param odir: Output directory, created if missing.
        :param log: Logger, callable and providing ``warn``/``exception``.
        :return: Absolute path of the generated ``metadata.opf``.
        '''
        from xml.sax.saxutils import escape
        from calibre.utils.zipfile import ZipFile
        from calibre.ebooks.metadata.odt import get_metadata
        from calibre.ebooks.metadata.opf2 import OPFCreator

        os.makedirs(odir, exist_ok=True)
        with CurrentDir(odir):
            log('Extracting ODT file...')
            stream.seek(0)
            mi = get_metadata(stream, 'odt')
            if not mi.title:
                mi.title = _('Unknown')
            if not mi.authors:
                mi.authors = [_('Unknown')]
            self.filter_load(stream, mi, log)
            html = self.xhtml()
            # A blanket img specification like this causes problems
            # with EPUB output as the containing element often has
            # an absolute height and width set that is larger than
            # the available screen real estate
            html = html.replace('img { width: 100%; height: 100%; }', '')
            # odf2xhtml creates empty title tag. The title is plain text,
            # so it has to be escaped to keep the markup well formed.
            html = html.replace(
                '<title></title>', '<title>%s</title>' % (escape(mi.title),))
            try:
                html = self.fix_markup(html, log)
            except Exception:
                log.exception('Failed to filter CSS, conversion may be slow')
            with open('index.xhtml', 'wb') as f:
                f.write(as_bytes(html))
            zf = ZipFile(stream, 'r')
            try:
                self.extract_pictures(zf, log)
            finally:
                zf.close()
            opf = OPFCreator(os.path.abspath(os.getcwd()), mi)
            opf.create_manifest(
                [(os.path.abspath(f2), None) for f2 in walk(os.getcwd())])
            opf.create_spine([os.path.abspath('index.xhtml')])
            with open('metadata.opf', 'wb') as f:
                opf.render(f)
            return os.path.abspath('metadata.opf')