%PDF- %PDF-
| Direktori : /lib/calibre/calibre/ebooks/chm/ |
| Current File : //lib/calibre/calibre/ebooks/chm/reader.py |
''' CHM File decoding support '''
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>,' \
' and Alex Bramley <a.bramley at gmail.com>.'
import codecs
import os
import re
from calibre import guess_type as guess_mimetype
from calibre.constants import filesystem_encoding, iswindows
from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString
from calibre.ebooks.chardet import xml_to_unicode
from calibre.ebooks.metadata.toc import TOC
from chm.chm import CHMFile, chmlib
from polyglot.builtins import as_unicode
def match_string(s1, s2_already_lowered):
    """Case-insensitively compare *s1* against an already lower-cased string.

    Only *s1* is lower-cased here; *s2_already_lowered* is compared as given.
    Returns False when either argument is None.
    """
    if s1 is None or s2_already_lowered is None:
        return False
    return bool(s1.lower() == s2_already_lowered)
def check_all_prev_empty(tag):
    """Return True if no plain text node at or before *tag* has visible text.

    Starting at *tag*, follows ``previousSibling`` links until None is
    reached. Only nodes whose class is exactly ``NavigableString`` are
    inspected (via ``check_empty``); every other node is skipped over.

    Written as a loop rather than recursively so that a long run of
    siblings cannot exhaust the interpreter's recursion limit.

    :param tag: the node to start from, or None.
    :return: False as soon as a non-blank ``NavigableString`` is found,
        True otherwise (including when *tag* is None).
    """
    while tag is not None:
        if tag.__class__ == NavigableString and not check_empty(tag):
            return False
        tag = tag.previousSibling
    return True
def check_empty(s, rex=re.compile(r'\S')):
    """Return True when *rex* finds nothing in *s*.

    With the default pattern this means *s* contains no non-whitespace
    character. The pattern is compiled once, when the function is defined.
    """
    first_hit = rex.search(s)
    return first_hit is None
class CHMError(Exception):
    """Raised when a CHM file, or an object inside it, cannot be read."""
class CHMReader(CHMFile):
    """A ``CHMFile`` that can extract its contents to disk and tidy the HTML.

    On construction the CHM is loaded, candidate text encodings are
    collected, and ``hhc_path`` (the expected location of the ``.hhc``
    table-of-contents file) is derived from the ``topics`` or ``home``
    entry. ``ExtractFiles`` then writes every contained file under an
    output directory and rewrites the HTML files in place.
    """

    def __init__(self, input, log, input_encoding=None):
        """Load the CHM file at *input*.

        :param input: path of the CHM file. A ``str`` path is encoded with
            ``mbcs`` on Windows and the filesystem encoding elsewhere before
            being handed to ``LoadCHM``.
        :param log: logger; ``exception`` and ``warn`` are called on it.
        :param input_encoding: if set, HTML files are decoded with this
            encoding in ``_reformat`` before any further processing.
        :raises CHMError: if ``LoadCHM`` reports failure.
        """
        CHMFile.__init__(self)
        if isinstance(input, str):
            enc = 'mbcs' if iswindows else filesystem_encoding
            try:
                input = input.encode(enc)
            except UnicodeEncodeError:
                # The path cannot be represented in ``enc``; load the CHM
                # from a temporary copy whose name is used instead.
                from calibre.ptempfile import PersistentTemporaryFile
                with PersistentTemporaryFile(suffix='.chm') as t:
                    with open(input, 'rb') as src:
                        t.write(src.read())
                input = t.name
        if not self.LoadCHM(input):
            raise CHMError("Unable to open CHM file '%s'"%(input,))
        self.log = log
        self.input_encoding = input_encoding
        self._sourcechm = input
        self._contents = None
        self._playorder = 0
        self._metadata = False
        self._extracted = False
        # absolute paths of the HTML files that _reformat() rewrote
        self.re_encoded_files = set()
        self.get_encodings()
        if self.home:
            self.home = self.decode_hhp_filename(self.home)
        if self.topics:
            self.topics = self.decode_hhp_filename(self.topics)

        # location of '.hhc' file, which is the CHM TOC.
        # decode_hhp_filename() may return None, so fall back to '' rather
        # than crash; ExtractFiles() copes with the resulting '.hhc'.
        base = self.topics or self.home or ''
        self.root = os.path.splitext(base.lstrip('/'))[0]
        self.hhc_path = self.root + ".hhc"

    def decode_hhp_filename(self, path):
        """Decode a bytes file name so that it resolves inside the CHM.

        A ``str`` is returned unchanged. Otherwise each candidate encoding
        is tried in turn and the first decoding that ``ResolveObject`` can
        find in the archive is returned.

        :return: the decoded name, or None if no candidate resolves.
        """
        if isinstance(path, str):
            return path
        candidates = (self.encoding_from_system_file, self.encoding_from_lcid,
                      'cp1252', 'cp1251', 'latin1', 'utf-8')
        for enc in candidates:
            if not enc:
                continue
            try:
                q = path.decode(enc)
            except UnicodeDecodeError:
                continue
            res, ui = self.ResolveObject(q)
            if res == chmlib.CHM_RESOLVE_SUCCESS:
                return q
        return None

    @staticmethod
    def _usable_codec_name(name):
        """Return *name* as ``str`` if Python has a codec for it, else None."""
        if not name:
            return None
        try:
            if isinstance(name, bytes):
                name = name.decode('ascii')
            codecs.lookup(name)
        except Exception:
            # deliberate best effort: an unusable name just means "unknown"
            return None
        return name

    def get_encodings(self):
        """Record the encodings reported by ``GetEncoding`` and ``GetLCID``.

        Sets ``encoding_from_system_file`` and ``encoding_from_lcid``; each
        is None unless the reported name is one ``codecs.lookup`` accepts.
        """
        self.encoding_from_system_file = self.encoding_from_lcid = None
        self.encoding_from_system_file = self._usable_codec_name(self.GetEncoding())
        lcid = self.GetLCID()
        if lcid is not None:
            self.encoding_from_lcid = self._usable_codec_name(lcid[0])

    def get_encoding(self):
        """Return the preferred encoding, falling back to ``'cp1252'``."""
        return self.encoding_from_system_file or self.encoding_from_lcid or 'cp1252'

    def _parse_toc(self, ul, basedir=None):
        """Build a ``TOC`` tree from a parsed ``<ul>`` element.

        :param ul: element whose direct ``<li>`` children are the entries.
        :param basedir: ``base_path`` for the created ``TOC`` objects;
            defaults to the current working directory at call time.
        :return: the ``TOC`` for this level, with nested lists attached as
            children.
        """
        if basedir is None:
            basedir = os.getcwd()
        toc = TOC(play_order=self._playorder, base_path=basedir, text='')
        self._playorder += 1
        for li in ul('li', recursive=False):
            href = li.object('param', {'name': 'Local'})[0]['value']
            if '#' in href:
                # split once only: the fragment may itself contain '#'
                href, frag = href.split('#', 1)
            else:
                frag = None
            name = self._deentity(li.object('param', {'name': 'Name'})[0]['value'])
            toc.add_item(href, frag, name, play_order=self._playorder)
            self._playorder += 1
            if li.ul:
                child = self._parse_toc(li.ul, basedir)
                child.parent = toc
                toc.append(child)
        return toc

    def ResolveObject(self, path):
        """Resolve *path* inside the CHM, encoding ``str`` paths as UTF-8."""
        # filenames are utf-8 encoded in the chm index as far as I can
        # determine, see https://tika.apache.org/1.11/api/org/apache/tika/parser/chm/accessor/ChmPmgiHeader.html
        if not isinstance(path, bytes):
            path = path.encode('utf-8')
        return CHMFile.ResolveObject(self, path)

    def GetFile(self, path):
        """Return the raw data stored at *path* inside the CHM.

        :raises CHMError: if the path cannot be resolved or the object is
            zero bytes long.
        """
        # have to have abs paths for ResolveObject, but Contents() deliberately
        # makes them relative. So we don't have to worry, re-add the leading /.
        # note this path refers to the internal CHM structure
        if not path.startswith('/'):
            path = '/' + path
        res, ui = self.ResolveObject(path)
        if res != chmlib.CHM_RESOLVE_SUCCESS:
            raise CHMError(f"Unable to locate {path!r} within CHM file {self.filename!r}")
        size, data = self.RetrieveObject(ui)
        if size == 0:
            raise CHMError(f"{path!r} is zero bytes in length!")
        return data

    def get_home(self):
        """Return the raw data of the CHM's home page."""
        return self.GetFile(self.home)

    @staticmethod
    def _is_within(root, candidate):
        """Return True if *candidate* resolves to a path strictly inside *root*."""
        prefix = os.path.join(os.path.abspath(root), '')
        return os.path.abspath(candidate).startswith(prefix)

    def _locate_hhc(self, output_dir):
        """Point ``hhc_path`` at a file that exists in *output_dir*.

        Tried in order: the current value, a case-insensitive match, the
        first top-level file with an HTML extension, a well-known index or
        contents page anywhere below *output_dir* (only while ``hhc_path``
        is still the bare ``'.hhc'``), and finally the first top-level file.
        """
        files = [y for y in os.listdir(output_dir) if
                os.path.isfile(os.path.join(output_dir, y))]
        if self.hhc_path not in files:
            wanted = self.hhc_path.lower()
            for f in files:
                if f.lower() == wanted:
                    self.hhc_path = f
                    break
        if self.hhc_path not in files and files:
            for f in files:
                # look at the text after the LAST dot so that names such as
                # 'a.b.html' are recognised as HTML too
                if '.' in f and f.rpartition('.')[-1].lower() in {
                        'html', 'htm', 'xhtm', 'xhtml'}:
                    self.hhc_path = f
                    break
        if self.hhc_path == '.hhc' and self.hhc_path not in files:
            from calibre import walk
            for x in walk(output_dir):
                if os.path.basename(x).lower() in ('index.htm', 'index.html',
                        'contents.htm', 'contents.html'):
                    self.hhc_path = os.path.relpath(x, output_dir)
                    break
        if self.hhc_path not in files and files:
            self.hhc_path = files[0]

    def ExtractFiles(self, output_dir=None, debug_dump=False):
        """Write every file in the CHM under *output_dir* and clean the HTML.

        Files that cannot be read from the CHM are logged and skipped, as
        are entries whose path would fall outside *output_dir*. Files whose
        guessed MIME type contains ``html`` are afterwards rewritten in
        place via ``_reformat``. Finally ``hhc_path`` is adjusted to name a
        file that exists.

        :param output_dir: destination directory; defaults to the current
            working directory at call time.
        :param debug_dump: if truthy, a directory into which the raw
            extracted tree is copied (as ``debug_dump``) before clean-up.
        """
        if output_dir is None:
            output_dir = os.getcwd()
        html_files = set()
        for path in self.Contents():
            lpath = os.path.join(output_dir, path)
            # fix file names with ";<junk>" at the end, see _reformat().
            # Only the archive-relative part is trimmed, so a ';' in
            # output_dir itself is left alone.
            dest = os.path.join(output_dir, path.split(';')[0]) if ';' in path else lpath
            # The archive is untrusted input: refuse entries that use '..'
            # or absolute paths to escape the output directory.
            if not (self._is_within(output_dir, lpath) and self._is_within(output_dir, dest)):
                self.log.warn('%r would be extracted outside the output directory, skipping'%path)
                continue
            self._ensure_dir(lpath)
            try:
                data = self.GetFile(path)
            except Exception:
                self.log.exception('Failed to extract %s from CHM, ignoring'%path)
                continue
            try:
                with open(dest, 'wb') as f:
                    f.write(data)
            except Exception:
                if iswindows and len(dest) > 250:
                    self.log.warn('%r filename too long, skipping'%path)
                    continue
                raise
            try:
                mime = guess_mimetype(path)[0]
            except Exception:
                mime = None
            if mime and 'html' in mime:
                html_files.add(dest)

        if debug_dump:
            import shutil
            shutil.copytree(output_dir, os.path.join(debug_dump, 'debug_dump'))
        for lpath in html_files:
            with open(lpath, 'r+b') as f:
                data = f.read()
                data = self._reformat(data, lpath)
                if isinstance(data, str):
                    data = data.encode('utf-8')
                f.seek(0)
                f.truncate()
                f.write(data)
        self._extracted = True
        self._locate_hhc(output_dir)

    @staticmethod
    def _strip_nav_table(table):
        """Remove *table* if its first image's alt text marks it as a nav bar."""
        try:
            alt = table.img['alt'].lower()
            if any(word in alt for word in ('prev', 'next', 'team')):
                table.extract()
        except Exception:
            # no image, or no alt attribute: not a nav bar, leave it alone
            pass

    def _reformat(self, data, htmlpath):
        """Clean up one extracted HTML file.

        Removes scripts, navigation tables at the very top and bottom, a
        leading ``<br>``, repairs some image references, and unwraps a
        body that consists of a single one-cell table.

        :param data: the file's content as read from disk.
        :param htmlpath: path of the file on disk; used to check image
            references and recorded in ``re_encoded_files`` on success.
        :return: the cleaned markup as ``str``, or *data* (decoded with
            ``input_encoding`` when that is set) if the HTML could not be
            parsed or serialised.
        """
        if self.input_encoding:
            data = data.decode(self.input_encoding)
        try:
            data = xml_to_unicode(data, strip_encoding_pats=True)[0]
            soup = BeautifulSoup(data)
        except ValueError:
            # hit some strange encoding problems...
            self.log.exception("Unable to parse html for cleaning, leaving it")
            return data
        # nuke javascript...
        for script in soup('script'):
            script.extract()
        # See if everything is inside a <head> tag
        # https://bugs.launchpad.net/bugs/1273512
        body = soup.find('body')
        if body is not None and body.parent.name == 'head':
            html = soup.find('html')
            if html is not None:
                html.insert(len(html), body)
        # remove forward and back nav bars from the top/bottom of each page
        # because they disrupt the flow of the text and waste space.
        # only remove the tables, if they have an image with an alt attribute
        # containing prev, next or team
        t = soup('table')
        if t:
            first, last = t[0], t[-1]
            if (first.previousSibling is None or first.previousSibling.previousSibling is None):
                self._strip_nav_table(first)
            if (last.nextSibling is None or last.nextSibling.nextSibling is None):
                self._strip_nav_table(last)
        # for some very odd reason each page's content appears to be in a table
        # too. and this table has sub-tables for random asides... grr.

        # remove br at top of page if present after nav bars removed
        br = soup('br')
        if br:
            if check_all_prev_empty(br[0].previousSibling):
                br[0].extract()

        # some images seem to be broken in some chm's :/
        base = os.path.dirname(htmlpath)
        for img in soup('img', src=True):
            src = img['src']
            ipath = os.path.join(base, *src.split('/'))
            if os.path.exists(ipath):
                continue
            # drop a trailing ";<junk>" from the reference
            src = src.split(';')[0]
            if not src:
                continue
            ipath = os.path.join(base, *src.split('/'))
            if not os.path.exists(ipath):
                # still missing: strip leading parent-directory steps
                while src.startswith('../'):
                    src = src[3:]
            img['src'] = src

        try:
            # if there is only a single table with a single element
            # in the body, replace it by the contents of this single element
            tables = soup.body.findAll('table', recursive=False)
            if tables and len(tables) == 1:
                trs = tables[0].findAll('tr', recursive=False)
                if trs and len(trs) == 1:
                    tds = trs[0].findAll('td', recursive=False)
                    if tds and len(tds) == 1:
                        tdContents = tds[0].contents
                        tableIdx = soup.body.contents.index(tables[0])
                        tables[0].extract()
                        while tdContents:
                            soup.body.insert(tableIdx, tdContents.pop())
        except Exception:
            # best effort only: leave the document as it is
            pass
        # do not prettify, it would reformat the <pre> tags!
        try:
            ans = soup.decode_contents()
            self.re_encoded_files.add(os.path.abspath(htmlpath))
            return ans
        except RuntimeError:
            return data

    def Contents(self):
        """Return the relative paths of all files in the CHM.

        The list is built on first use and cached in ``_contents``.
        """
        if self._contents is not None:
            return self._contents
        paths = []

        def get_paths(chm, ui, ctx):
            # these are supposed to be UTF-8 in CHM as best as I can determine
            # see https://tika.apache.org/1.11/api/org/apache/tika/parser/chm/accessor/ChmPmgiHeader.html
            path = as_unicode(ui.path, 'utf-8')
            # skip directories (and empty names)
            # note this path refers to the internal CHM structure
            if path and not path.endswith('/'):
                # and make paths relative
                paths.append(path.lstrip('/'))
        chmlib.chm_enumerate(self.file, chmlib.CHM_ENUMERATE_NORMAL, get_paths, None)
        self._contents = paths
        return self._contents

    def _ensure_dir(self, path):
        """Create the directory that will contain *path*, if it is missing."""
        parent = os.path.dirname(path)
        if parent:
            # exist_ok avoids failing when the directory appears between
            # a check and the creation
            os.makedirs(parent, exist_ok=True)

    def extract_content(self, output_dir=None, debug_dump=False):
        """Extract the CHM into *output_dir*; see ``ExtractFiles``."""
        self.ExtractFiles(output_dir=output_dir, debug_dump=debug_dump)