%PDF- %PDF-
Direktori : /usr/lib/calibre/calibre/ebooks/metadata/sources/ |
Current File : //usr/lib/calibre/calibre/ebooks/metadata/sources/google_images.py |
#!/usr/bin/env python3 # vim:fileencoding=UTF-8 from __future__ import absolute_import, division, print_function, unicode_literals __license__ = 'GPL v3' __copyright__ = '2013, Kovid Goyal <kovid@kovidgoyal.net>' __docformat__ = 'restructuredtext en' from collections import OrderedDict from calibre import random_user_agent from calibre.ebooks.metadata.sources.base import Source, Option def parse_html(raw): try: from html5_parser import parse except ImportError: # Old versions of calibre import html5lib return html5lib.parse(raw, treebuilder='lxml', namespaceHTMLElements=False) else: return parse(raw) def imgurl_from_id(raw, tbnid): from json import JSONDecoder q = '"{}",['.format(tbnid) start_pos = raw.index(q) if start_pos < 100: return jd = JSONDecoder() data = jd.raw_decode('[' + raw[start_pos:])[0] # from pprint import pprint # pprint(data) url_num = 0 for x in data: if isinstance(x, list) and len(x) == 3: q = x[0] if hasattr(q, 'lower') and q.lower().startswith('http'): url_num += 1 if url_num > 1: return q class GoogleImages(Source): name = 'Google Images' version = (1, 0, 2) minimum_calibre_version = (2, 80, 0) description = _('Downloads covers from a Google Image search. Useful to find larger/alternate covers.') capabilities = frozenset(['cover']) can_get_multiple_covers = True supports_gzip_transfer_encoding = True options = (Option('max_covers', 'number', 5, _('Maximum number of covers to get'), _('The maximum number of covers to process from the Google search result')), Option('size', 'choices', 'svga', _('Cover size'), _('Search for covers larger than the specified size'), choices=OrderedDict(( ('any', _('Any size'),), ('l', _('Large'),), ('qsvga', _('Larger than %s')%'400x300',), ('vga', _('Larger than %s')%'640x480',), ('svga', _('Larger than %s')%'600x800',), ('xga', _('Larger than %s')%'1024x768',), ('2mp', _('Larger than %s')%'2 MP',), ('4mp', _('Larger than %s')%'4 MP',), ))), ) def download_cover(self, log, result_queue, abort, title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False): if not title: return timeout = max(60, timeout) # Needs at least a minute title = ' '.join(self.get_title_tokens(title)) author = ' '.join(self.get_author_tokens(authors)) urls = self.get_image_urls(title, author, log, abort, timeout) self.download_multiple_covers(title, authors, urls, get_best_cover, timeout, result_queue, abort, log) @property def user_agent(self): return random_user_agent(allow_ie=False) def get_image_urls(self, title, author, log, abort, timeout): from calibre.utils.cleantext import clean_ascii_chars try: from urllib.parse import urlencode except ImportError: from urllib import urlencode from collections import OrderedDict ans = OrderedDict() br = self.browser q = urlencode({'as_q': ('%s %s'%(title, author)).encode('utf-8')}) if isinstance(q, bytes): q = q.decode('utf-8') sz = self.prefs['size'] if sz == 'any': sz = '' elif sz == 'l': sz = 'isz:l,' else: sz = 'isz:lt,islt:%s,' % sz # See https://www.google.com/advanced_image_search to understand this # URL scheme url = 'https://www.google.com/search?as_st=y&tbm=isch&{}&as_epq=&as_oq=&as_eq=&cr=&as_sitesearch=&safe=images&tbs={}iar:t,ift:jpg'.format(q, sz) log('Search URL: ' + url) raw = clean_ascii_chars(br.open(url).read().decode('utf-8')) root = parse_html(raw) results = root.xpath('//div/@data-tbnid') # could also use data-id # from calibre.utils.ipython import ipython # ipython({'root': root, 'raw': raw, 'url': url, 'results': results}) for tbnid in results: try: imgurl = imgurl_from_id(raw, tbnid) except Exception: continue if imgurl: ans[imgurl] = True return list(ans) def test(): try: from queue import Queue except ImportError: from Queue import Queue from threading import Event from calibre.utils.logging import default_log p = GoogleImages(None) p.log = default_log rq = Queue() p.download_cover(default_log, rq, Event(), title='The Heroes', authors=('Joe Abercrombie',)) print('Downloaded', rq.qsize(), 'covers') if __name__ == '__main__': test()