#!/usr/bin/env python3
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
# License: GPLv3 Copyright: 2011, Kovid Goyal <kovid at kovidgoyal.net>

from __future__ import absolute_import, division, print_function, unicode_literals

import hashlib
import re
import time

try:
    from queue import Empty, Queue
except ImportError:
    from Queue import Empty, Queue

from calibre import as_unicode
from calibre.ebooks.chardet import xml_to_unicode
from calibre.ebooks.metadata import check_isbn
from calibre.ebooks.metadata.book.base import Metadata
from calibre.ebooks.metadata.sources.base import Source
from calibre.utils.cleantext import clean_ascii_chars
from calibre.utils.localization import canonicalize_lang

NAMESPACES = {
    'openSearch': 'http://a9.com/-/spec/opensearchrss/1.0/',
    'atom': 'http://www.w3.org/2005/Atom',
    'dc': 'http://purl.org/dc/terms',
    'gd': 'http://schemas.google.com/g/2005'
}


def get_details(browser, url, timeout):  # {{{
    try:
        raw = browser.open_novisit(url, timeout=timeout).read()
    except Exception as e:
        gc = getattr(e, 'getcode', lambda: -1)
        if gc() != 403:
            raise
        # Google is throttling us, wait a little
        time.sleep(2)
        raw = browser.open_novisit(url, timeout=timeout).read()
    return raw
# }}}


xpath_cache = {}


def XPath(x):
    ans = xpath_cache.get(x)
    if ans is None:
        from lxml import etree
        ans = xpath_cache[x] = etree.XPath(x, namespaces=NAMESPACES)
    return ans


def cleanup_title(title):
    # Strip a sub-title (everything after the first ':') or a trailing
    # parenthesised qualifier, for use in fallback queries
    if ':' in title:
        return title.partition(':')[0]
    return re.sub(r'(.+?) \(.+\)', r'\1', title)


def to_metadata(browser, log, entry_, timeout):  # {{{
    from lxml import etree

    # total_results = XPath('//openSearch:totalResults')
    # start_index = XPath('//openSearch:startIndex')
    # items_per_page = XPath('//openSearch:itemsPerPage')
    entry = XPath('//atom:entry')
    entry_id = XPath('descendant::atom:id')
    url = XPath('descendant::atom:link[@rel="self"]/@href')
    creator = XPath('descendant::dc:creator')
    identifier = XPath('descendant::dc:identifier')
    title = XPath('descendant::dc:title')
    date = XPath('descendant::dc:date')
    publisher = XPath('descendant::dc:publisher')
    subject = XPath('descendant::dc:subject')
    description = XPath('descendant::dc:description')
    language = XPath('descendant::dc:language')

    # print(etree.tostring(entry_, pretty_print=True))

    def get_text(extra, x):
        try:
            ans = x(extra)
            if ans:
                ans = ans[0].text
                if ans and ans.strip():
                    return ans.strip()
        except:
            log.exception('Programming error:')
        return None

    id_url = entry_id(entry_)[0].text
    google_id = id_url.split('/')[-1]
    details_url = url(entry_)[0]
    title_ = ': '.join([x.text for x in title(entry_)]).strip()
    authors = [x.text.strip() for x in creator(entry_) if x.text]
    if not authors:
        authors = [_('Unknown')]
    if not id_url or not title_:
        # Silently discard this entry
        return None

    mi = Metadata(title_, authors)
    mi.identifiers = {'google': google_id}
    try:
        raw = get_details(browser, details_url, timeout)
        feed = etree.fromstring(
            xml_to_unicode(clean_ascii_chars(raw), strip_encoding_pats=True)[0],
            parser=etree.XMLParser(recover=True, no_network=True, resolve_entities=False)
        )
        extra = entry(feed)[0]
    except:
        log.exception('Failed to get additional details for', mi.title)
        return mi

    mi.comments = get_text(extra, description)
    lang = canonicalize_lang(get_text(extra, language))
    if lang:
        mi.language = lang
    mi.publisher = get_text(extra, publisher)

    # ISBN
    isbns = []
    for x in identifier(extra):
        t = type('')(x.text).strip()
        if t[:5].upper() in ('ISBN:', 'LCCN:', 'OCLC:'):
            if t[:5].upper() == 'ISBN:':
                t = check_isbn(t[5:])
                if t:
                    isbns.append(t)
    if isbns:
        mi.isbn = sorted(isbns, key=len)[-1]
    mi.all_isbns = isbns

    # Tags
    try:
        btags = [x.text for x in subject(extra) if x.text]
        tags = []
        for t in btags:
            atags = [y.strip() for y in t.split('/')]
            for tag in atags:
                if tag not in tags:
                    tags.append(tag)
    except:
        log.exception('Failed to parse tags:')
        tags = []
    if tags:
        mi.tags = [x.replace(',', ';') for x in tags]

    # pubdate
    pubdate = get_text(extra, date)
    if pubdate:
        from calibre.utils.date import parse_date, utcnow
        try:
            default = utcnow().replace(day=15)
            mi.pubdate = parse_date(pubdate, assume_utc=True, default=default)
        except:
            log.error('Failed to parse pubdate %r' % pubdate)

    # Cover
    mi.has_google_cover = None
    for x in extra.xpath(
        '//*[@href and @rel="http://schemas.google.com/books/2008/thumbnail"]'
    ):
        mi.has_google_cover = x.get('href')
        break

    return mi
# }}}


class GoogleBooks(Source):

    name = 'Google'
    version = (1, 0, 3)
    minimum_calibre_version = (2, 80, 0)
    description = _('Downloads metadata and covers from Google Books')

    capabilities = frozenset({'identify'})
    touched_fields = frozenset({
        'title', 'authors', 'tags', 'pubdate', 'comments', 'publisher',
        'identifier:isbn', 'identifier:google', 'languages'
    })
    supports_gzip_transfer_encoding = True
    cached_cover_url_is_reliable = False

    GOOGLE_COVER = 'https://books.google.com/books?id=%s&printsec=frontcover&img=1'

    # MD5 sums of the placeholder "no cover available" images Google serves
    DUMMY_IMAGE_MD5 = frozenset(
        ('0de4383ebad0adad5eeb8975cd796657', 'a64fa89d7ebc97075c1d363fc5fea71f')
    )

    def get_book_url(self, identifiers):  # {{{
        goog = identifiers.get('google', None)
        if goog is not None:
            return ('google', goog, 'https://books.google.com/books?id=%s' % goog)
    # }}}

    def create_query(self, title=None, authors=None, identifiers={}, capitalize_isbn=False):  # {{{
        try:
            from urllib.parse import urlencode
        except ImportError:
            from urllib import urlencode
        BASE_URL = 'https://books.google.com/books/feeds/volumes?'
        isbn = check_isbn(identifiers.get('isbn', None))
        q = ''
        if isbn is not None:
            q += ('ISBN:' if capitalize_isbn else 'isbn:') + isbn
        elif title or authors:

            def build_term(prefix, parts):
                return ' '.join('in' + prefix + ':' + x for x in parts)

            title_tokens = list(self.get_title_tokens(title))
            if title_tokens:
                q += build_term('title', title_tokens)
            author_tokens = list(self.get_author_tokens(authors, only_first_author=True))
            if author_tokens:
                q += ('+' if q else '') + build_term('author', author_tokens)
        if not q:
            return None
        if not isinstance(q, bytes):
            q = q.encode('utf-8')
        return BASE_URL + urlencode({
            'q': q,
            'max-results': 20,
            'start-index': 1,
            'min-viewability': 'none',
        })
    # }}}

    def download_cover(  # {{{
        self, log, result_queue, abort, title=None, authors=None,
        identifiers={}, timeout=30, get_best_cover=False
    ):
        cached_url = self.get_cached_cover_url(identifiers)
        if cached_url is None:
            log.info('No cached cover found, running identify')
            rq = Queue()
            self.identify(
                log, rq, abort, title=title, authors=authors,
                identifiers=identifiers
            )
            if abort.is_set():
                return
            results = []
            while True:
                try:
                    results.append(rq.get_nowait())
                except Empty:
                    break
            results.sort(
                key=self.identify_results_keygen(
                    title=title, authors=authors, identifiers=identifiers
                )
            )
            for mi in results:
                cached_url = self.get_cached_cover_url(mi.identifiers)
                if cached_url is not None:
                    break
        if cached_url is None:
            log.info('No cover found')
            return

        br = self.browser
        for candidate in (0, 1):
            if abort.is_set():
                return
            url = cached_url + '&zoom={}'.format(candidate)
            log('Downloading cover from:', cached_url)
            try:
                cdata = br.open_novisit(url, timeout=timeout).read()
                if cdata:
                    if hashlib.md5(cdata).hexdigest() in self.DUMMY_IMAGE_MD5:
                        log.warning('Google returned a dummy image, ignoring')
                    else:
                        result_queue.put((self, cdata))
                        break
            except Exception:
                log.exception('Failed to download cover from:', cached_url)
    # }}}

    def get_cached_cover_url(self, identifiers):  # {{{
        url = None
        goog = identifiers.get('google', None)
        if goog is None:
            isbn = identifiers.get('isbn', None)
            if isbn is not None:
                goog = self.cached_isbn_to_identifier(isbn)
        if goog is not None:
            url = self.cached_identifier_to_cover_url(goog)
        return url
    # }}}

    def get_all_details(  # {{{
        self, br, log, entries, abort, result_queue, timeout
    ):
        from lxml import etree
        for relevance, i in enumerate(entries):
            try:
                ans = to_metadata(br, log, i, timeout)
                if isinstance(ans, Metadata):
                    ans.source_relevance = relevance
                    goog = ans.identifiers['google']
                    for isbn in getattr(ans, 'all_isbns', []):
                        self.cache_isbn_to_identifier(isbn, goog)
                    if getattr(ans, 'has_google_cover', False):
                        self.cache_identifier_to_cover_url(
                            goog, self.GOOGLE_COVER % goog
                        )
                    self.clean_downloaded_metadata(ans)
                    result_queue.put(ans)
            except:
                log.exception(
                    'Failed to get metadata for identify entry:',
                    etree.tostring(i)
                )
            if abort.is_set():
                break
    # }}}

    def identify(  # {{{
        self, log, result_queue, abort, title=None, authors=None,
        identifiers={}, timeout=30
    ):
        from lxml import etree
        entry = XPath('//atom:entry')

        query = self.create_query(
            title=title, authors=authors, identifiers=identifiers
        )
        if not query:
            log.error('Insufficient metadata to construct query')
            return
        # Google sometimes returns results only when the ISBN: prefix is
        # capitalized, so keep a second form of the query in reserve
        alternate_query = self.create_query(
            title=title, authors=authors, identifiers=identifiers,
            capitalize_isbn=True
        )
        br = self.browser

        def make_query(query):
            log('Making query:', query)
            try:
                raw = br.open_novisit(query, timeout=timeout).read()
            except Exception as e:
                log.exception('Failed to make identify query: %r' % query)
                return False, as_unicode(e)
            try:
                feed = etree.fromstring(
                    xml_to_unicode(clean_ascii_chars(raw), strip_encoding_pats=True)[0],
                    parser=etree.XMLParser(recover=True, no_network=True, resolve_entities=False)
                )
                return True, entry(feed)
            except Exception as e:
                log.exception('Failed to parse identify results')
                return False, as_unicode(e)

        ok, entries = make_query(query)
        if not ok:
            return entries
        if not entries and alternate_query != query and not abort.is_set():
            log('No results found, retrying with capitalized ISBN')
            ok, entries = make_query(alternate_query)
            if not ok:
                return entries
        if not entries and title and not abort.is_set():
            if identifiers:
                log('No results found, retrying without identifiers')
                return self.identify(
                    log, result_queue, abort, title=title, authors=authors,
                    timeout=timeout
                )
            ntitle = cleanup_title(title)
            if ntitle and ntitle != title:
                log('No results found, retrying without sub-title')
                return self.identify(
                    log, result_queue, abort, title=ntitle, authors=authors,
                    timeout=timeout
                )

        # There is no point running these queries in threads as google
        # throttles requests returning 403 Forbidden errors
        self.get_all_details(br, log, entries, abort, result_queue, timeout)
    # }}}


if __name__ == '__main__':  # tests {{{
    # To run these tests use:
    # calibre-debug src/calibre/ebooks/metadata/sources/google.py
    from calibre.ebooks.metadata.sources.test import (
        test_identify_plugin, title_test, authors_test
    )
    tests = [
        ({
            'identifiers': {
                'isbn': '978-0-7869-5437-7'  # needs capitalized ISBN to find results
            },
            'title': 'Dragons of Autumn Twilight',
            'authors': ['Margaret Weis', 'Tracy Hickman']
        }, [
            title_test('Dragons of Autumn Twilight', exact=True),
            authors_test(['Margaret Weis', 'Tracy Hickman'])
        ]),
        ({
            'identifiers': {'isbn': '0743273567'},
            'title': 'Great Gatsby',
            'authors': ['Fitzgerald']
        }, [
            title_test('The great gatsby', exact=True),
            authors_test(['F. Scott Fitzgerald'])
        ]),
        ({
            'title': 'Flatland',
            'authors': ['Abbott']
        }, [title_test('Flatland', exact=False)]),
        ({
            'title': 'The Blood Red Indian Summer: A Berger and Mitry Mystery',
            'authors': ['David Handler'],
        }, [
            title_test('The Blood Red Indian Summer: A Berger and Mitry Mystery')
        ])
    ]
    test_identify_plugin(GoogleBooks.name, tests[:])
# }}}