%PDF- %PDF-
Direktori : /lib/calibre/calibre/ebooks/html/ |
Current File : //lib/calibre/calibre/ebooks/html/input.py |
#!/usr/bin/env python3 __license__ = 'GPL v3' __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>' __docformat__ = 'restructuredtext en' ''' Input plugin for HTML or OPF ebooks. ''' import os, re, sys, errno as gerrno from calibre.ebooks.oeb.base import urlunquote from calibre.ebooks.chardet import detect_xml_encoding from calibre.constants import iswindows from calibre import unicode_path, as_unicode, replace_entities from polyglot.urllib import urlparse, urlunparse class Link: ''' Represents a link in a HTML file. ''' @classmethod def url_to_local_path(cls, url, base): path = url.path isabs = False if iswindows and path.startswith('/'): path = path[1:] isabs = True path = urlunparse(('', '', path, url.params, url.query, '')) path = urlunquote(path) if isabs or os.path.isabs(path): return path return os.path.abspath(os.path.join(base, path)) def __init__(self, url, base): ''' :param url: The url this link points to. Must be an unquoted unicode string. :param base: The base folder that relative URLs are with respect to. Must be a unicode string. ''' assert isinstance(url, str) and isinstance(base, str) self.url = url self.parsed_url = urlparse(self.url) self.is_local = self.parsed_url.scheme in ('', 'file') self.is_internal = self.is_local and not bool(self.parsed_url.path) self.path = None self.fragment = urlunquote(self.parsed_url.fragment) if self.is_local and not self.is_internal: self.path = self.url_to_local_path(self.parsed_url, base) def __hash__(self): if self.path is None: return hash(self.url) return hash(self.path) def __eq__(self, other): return self.path == getattr(other, 'path', other) def __str__(self): return 'Link: %s --> %s'%(self.url, self.path) class IgnoreFile(Exception): def __init__(self, msg, errno): Exception.__init__(self, msg) self.doesnt_exist = errno == gerrno.ENOENT self.errno = errno class HTMLFile: ''' Contains basic information about an HTML file. This includes a list of links to other files as well as the encoding of each file. Also tries to detect if the file is not a HTML file in which case :member:`is_binary` is set to True. The encoding of the file is available as :member:`encoding`. ''' HTML_PAT = re.compile(r'<\s*html', re.IGNORECASE) HTML_PAT_BIN = re.compile(br'<\s*html', re.IGNORECASE) TITLE_PAT = re.compile('<title>([^<>]+)</title>', re.IGNORECASE) LINK_PAT = re.compile( r'<\s*a\s+.*?href\s*=\s*(?:(?:"(?P<url1>[^"]+)")|(?:\'(?P<url2>[^\']+)\')|(?P<url3>[^\s>]+))', re.DOTALL|re.IGNORECASE) def __init__(self, path_to_html_file, level, encoding, verbose, referrer=None): ''' :param level: The level of this file. Should be 0 for the root file. :param encoding: Use `encoding` to decode HTML. :param referrer: The :class:`HTMLFile` that first refers to this file. ''' self.path = unicode_path(path_to_html_file, abs=True) self.title = os.path.splitext(os.path.basename(self.path))[0] self.base = os.path.dirname(self.path) self.level = level self.referrer = referrer self.links = [] try: with open(self.path, 'rb') as f: src = header = f.read(4096) encoding = detect_xml_encoding(src)[1] if encoding: try: header = header.decode(encoding, errors='replace') except ValueError: pass self.is_binary = False if level > 0: pat = self.HTML_PAT_BIN if isinstance(header, bytes) else self.HTML_PAT self.is_binary = not bool(pat.search(header)) if not self.is_binary: src += f.read() except OSError as err: msg = 'Could not read from file: %s with error: %s'%(self.path, as_unicode(err)) if level == 0: raise OSError(msg) raise IgnoreFile(msg, err.errno) if not src: if level == 0: raise ValueError('The file %s is empty'%self.path) self.is_binary = True if not self.is_binary: if not encoding: encoding = detect_xml_encoding(src[:4096], verbose=verbose)[1] self.encoding = encoding else: self.encoding = encoding src = src.decode(encoding, 'replace') match = self.TITLE_PAT.search(src) self.title = match.group(1) if match is not None else self.title self.find_links(src) def __eq__(self, other): return self.path == getattr(other, 'path', other) def __hash__(self): return hash(self.path) def __str__(self): return 'HTMLFile:%d:%s:%r'%(self.level, 'b' if self.is_binary else 'a', self.path) def __repr__(self): return str(self) def find_links(self, src): for match in self.LINK_PAT.finditer(src): url = None for i in ('url1', 'url2', 'url3'): url = match.group(i) if url: break url = replace_entities(url) try: link = self.resolve(url) except ValueError: # Unparsable URL, ignore continue if link not in self.links: self.links.append(link) def resolve(self, url): return Link(url, self.base) def depth_first(root, flat, visited=None): yield root if visited is None: visited = set() visited.add(root) for link in root.links: if link.path is not None and link not in visited: try: index = flat.index(link) except ValueError: # Can happen if max_levels is used continue hf = flat[index] if hf not in visited: yield hf visited.add(hf) for hf in depth_first(hf, flat, visited): if hf not in visited: yield hf visited.add(hf) def traverse(path_to_html_file, max_levels=sys.maxsize, verbose=0, encoding=None): ''' Recursively traverse all links in the HTML file. :param max_levels: Maximum levels of recursion. Must be non-negative. 0 implies that no links in the root HTML file are followed. :param encoding: Specify character encoding of HTML files. If `None` it is auto-detected. :return: A pair of lists (breadth_first, depth_first). Each list contains :class:`HTMLFile` objects. ''' assert max_levels >= 0 level = 0 flat = [HTMLFile(path_to_html_file, level, encoding, verbose)] next_level = list(flat) while level < max_levels and len(next_level) > 0: level += 1 nl = [] for hf in next_level: rejects = [] for link in hf.links: if link.path is None or link.path in flat: continue try: nf = HTMLFile(link.path, level, encoding, verbose, referrer=hf) if nf.is_binary: raise IgnoreFile('%s is a binary file'%nf.path, -1) nl.append(nf) flat.append(nf) except IgnoreFile as err: rejects.append(link) if not err.doesnt_exist or verbose > 1: print(repr(err)) for link in rejects: hf.links.remove(link) next_level = list(nl) orec = sys.getrecursionlimit() sys.setrecursionlimit(500000) try: return flat, list(depth_first(flat[0], flat)) finally: sys.setrecursionlimit(orec) def get_filelist(htmlfile, dir, opts, log): ''' Build list of files referenced by html file or try to detect and use an OPF file instead. ''' log.info('Building file list...') filelist = traverse(htmlfile, max_levels=int(opts.max_levels), verbose=opts.verbose, encoding=opts.input_encoding)[0 if opts.breadth_first else 1] if opts.verbose: log.debug('\tFound files...') for f in filelist: log.debug('\t\t', f) return filelist