%PDF- %PDF-
| Direktori : /usr/lib/calibre/calibre/ebooks/conversion/plugins/ |
| Current File : //usr/lib/calibre/calibre/ebooks/conversion/plugins/pdf_input.py |
__license__ = 'GPL 3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import os
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
from polyglot.builtins import as_bytes
class PDFInput(InputFormatPlugin):
name = 'PDF Input'
author = 'Kovid Goyal and John Schember'
description = _('Convert PDF files to HTML')
file_types = {'pdf'}
commit_name = 'pdf_input'
options = {
OptionRecommendation(name='no_images', recommended_value=False,
help=_('Do not extract images from the document')),
OptionRecommendation(name='unwrap_factor', recommended_value=0.45,
help=_('Scale used to determine the length at which a line should '
'be unwrapped. Valid values are a decimal between 0 and 1. The '
'default is 0.45, just below the median line length.')),
OptionRecommendation(name='new_pdf_engine', recommended_value=False,
help=_('Use the new PDF conversion engine. Currently not operational.'))
}
def convert_new(self, stream, accelerators):
from calibre.ebooks.pdf.pdftohtml import pdftohtml
from calibre.utils.cleantext import clean_ascii_chars
from calibre.ebooks.pdf.reflow import PDFDocument
pdftohtml(os.getcwd(), stream.name, self.opts.no_images, as_xml=True)
with lopen('index.xml', 'rb') as f:
xml = clean_ascii_chars(f.read())
PDFDocument(xml, self.opts, self.log)
return os.path.join(os.getcwd(), 'metadata.opf')
def convert(self, stream, options, file_ext, log,
accelerators):
from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.ebooks.pdf.pdftohtml import pdftohtml
log.debug('Converting file to html...')
# The main html file will be named index.html
self.opts, self.log = options, log
if options.new_pdf_engine:
return self.convert_new(stream, accelerators)
pdftohtml(os.getcwd(), stream.name, options.no_images)
from calibre.ebooks.metadata.meta import get_metadata
log.debug('Retrieving document metadata...')
mi = get_metadata(stream, 'pdf')
opf = OPFCreator(os.getcwd(), mi)
manifest = [('index.html', None)]
images = os.listdir(os.getcwd())
images.remove('index.html')
for i in images:
manifest.append((i, None))
log.debug('Generating manifest...')
opf.create_manifest(manifest)
opf.create_spine(['index.html'])
log.debug('Rendering manifest...')
with lopen('metadata.opf', 'wb') as opffile:
opf.render(opffile)
if os.path.exists('toc.ncx'):
ncxid = opf.manifest.id_for_path('toc.ncx')
if ncxid:
with lopen('metadata.opf', 'r+b') as f:
raw = f.read().replace(b'<spine', b'<spine toc="%s"' % as_bytes(ncxid))
f.seek(0)
f.write(raw)
return os.path.join(os.getcwd(), 'metadata.opf')