#!/usr/bin/env python3
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai

__license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

import re
import unicodedata
import time
from datetime import datetime
from io import StringIO
from operator import attrgetter
from threading import Thread

from calibre.customize.ui import all_metadata_plugins, metadata_plugins
from calibre.ebooks.metadata import authors_to_sort_string, check_issn
from calibre.ebooks.metadata.book.base import Metadata
from calibre.ebooks.metadata.sources.base import create_log
from calibre.ebooks.metadata.sources.prefs import msprefs
from calibre.ebooks.metadata.xisbn import xisbn
from calibre.utils.date import UNDEFINED_DATE, as_utc, utc_tz
from calibre.utils.formatter import EvalFormatter
from calibre.utils.html2text import html2text
from calibre.utils.icu import lower, primary_sort_key
from polyglot.builtins import iteritems, itervalues, as_unicode
from polyglot.queue import Empty, Queue
from polyglot.urllib import quote, urlparse

# Download worker {{{


class Worker(Thread):

    def __init__(self, plugin, kwargs, abort):
        Thread.__init__(self)
        self.daemon = True
        self.plugin, self.kwargs, self.rq = plugin, kwargs, Queue()
        self.abort = abort
        self.buf = StringIO()
        self.log = create_log(self.buf)

    def run(self):
        start = time.time()
        try:
            self.plugin.identify(self.log, self.rq, self.abort, **self.kwargs)
        except:
            self.log.exception('Plugin', self.plugin.name, 'failed')
        self.plugin.dl_time_spent = time.time() - start

    @property
    def name(self):
        return self.plugin.name


def is_worker_alive(workers):
    for w in workers:
        if w.is_alive():
            return True
    return False

# }}}

# Merge results from different sources {{{


class xISBN(Thread):

    def __init__(self, isbn):
        Thread.__init__(self)
        self.isbn = isbn
        self.isbns = frozenset()
        self.min_year = None
        self.daemon = True
        self.exception = self.tb = None

    def run(self):
        try:
            self.isbns, self.min_year = xisbn.get_isbn_pool(self.isbn)
        except Exception as e:
            import traceback
            self.exception = e
            self.tb = traceback.format_exc()


class ISBNMerge:

    def __init__(self, log):
        self.pools = {}
        self.isbnless_results = []
        self.results = []
        self.log = log
        # The xISBN service has been de-commissioned
        # https://www.oclc.org/developer/news/2018/xid-decommission.en.html
        self.use_xisbn = False

    def isbn_in_pool(self, isbn):
        if isbn:
            for isbns, pool in iteritems(self.pools):
                if isbn in isbns:
                    return pool
        return None

    def pool_has_result_from_same_source(self, pool, result):
        results = pool[1]
        for r in results:
            if r.identify_plugin is result.identify_plugin:
                return True
        return False

    def add_result(self, result):
        isbn = result.isbn
        if isbn:
            pool = self.isbn_in_pool(isbn)
            if pool is None:
                isbns = min_year = None
                if self.use_xisbn:
                    xw = xISBN(isbn)
                    xw.start()
                    xw.join(10)
                    if xw.is_alive():
                        self.log.error('Query to xISBN timed out')
                        self.use_xisbn = False
                    else:
                        if xw.exception:
                            self.log.error('Query to xISBN failed:')
                            self.log.debug(xw.tb)
                        else:
                            isbns, min_year = xw.isbns, xw.min_year
                if not msprefs['find_first_edition_date']:
                    min_year = None
                if not isbns:
                    isbns = frozenset([isbn])
                if isbns in self.pools:
                    # xISBN had a brain fart
                    pool = self.pools[isbns]
                else:
                    self.pools[isbns] = pool = (min_year, [])

            if not self.pool_has_result_from_same_source(pool, result):
                pool[1].append(result)
        else:
            self.isbnless_results.append(result)

    def finalize(self):
        has_isbn_result = False
        for results in itervalues(self.pools):
            if results:
                has_isbn_result = True
                break
        isbn_sources = frozenset()
        if has_isbn_result:
            isbn_sources = self.merge_isbn_results()
        # Now handle results that have no ISBNs
        results = sorted(self.isbnless_results,
                key=attrgetter('relevance_in_source'))
        # Only use results that are from sources that have not also returned a
        # result with an ISBN
        results = [r for r in results if r.identify_plugin not in isbn_sources or
                not r.identify_plugin.prefer_results_with_isbn]
        if results:
            # Pick only the most relevant result from each source
            seen = set()
            for result in results:
                if msprefs['keep_dups'] or result.identify_plugin not in seen:
                    seen.add(result.identify_plugin)
                    self.results.append(result)
                    result.average_source_relevance = \
                        result.relevance_in_source

        self.merge_metadata_results()

        return self.results

    def merge_metadata_results(self, merge_on_identifiers=False):
        '''
        Merge results with identical title and authors or an identical
        identifier
        '''
        # First title/author
        groups = {}
        for result in self.results:
            title = lower(result.title if result.title else '')
            key = (title, tuple(lower(x) for x in result.authors))
            if key not in groups:
                groups[key] = []
            groups[key].append(result)

        if len(groups) != len(self.results):
            self.results = []
            for rgroup in itervalues(groups):
                rel = [r.average_source_relevance for r in rgroup]
                if len(rgroup) > 1:
                    result = self.merge(rgroup, None, do_asr=False)
                    result.average_source_relevance = sum(rel)/len(rel)
                else:
                    result = rgroup[0]
                self.results.append(result)

        if merge_on_identifiers:
            # Now identifiers
            groups, empty = {}, []
            for result in self.results:
                key = set()
                for typ, val in iteritems(result.identifiers):
                    if typ and val:
                        key.add((typ, val))
                if key:
                    key = frozenset(key)
                    match = None
                    for candidate in list(groups):
                        if candidate.intersection(key):
                            # We have at least one identifier in common
                            match = candidate.union(key)
                            results = groups.pop(candidate)
                            results.append(result)
                            groups[match] = results
                            break
                    if match is None:
                        groups[key] = [result]
                else:
                    empty.append(result)

            if len(groups) != len(self.results):
                self.results = []
                for rgroup in itervalues(groups):
                    rel = [r.average_source_relevance for r in rgroup]
                    if len(rgroup) > 1:
                        result = self.merge(rgroup, None, do_asr=False)
                        result.average_source_relevance = sum(rel)/len(rel)
                    elif rgroup:
                        result = rgroup[0]
                    self.results.append(result)

            if empty:
                self.results.extend(empty)

        self.results.sort(key=attrgetter('average_source_relevance'))

    def merge_isbn_results(self):
        self.results = []
        sources = set()
        for min_year, results in itervalues(self.pools):
            if results:
                for r in results:
                    sources.add(r.identify_plugin)
                self.results.append(self.merge(results, min_year))

        self.results.sort(key=attrgetter('average_source_relevance'))
        return sources

    def length_merge(self, attr, results, null_value=None, shortest=True):
        values = [getattr(x, attr) for x in results if not x.is_null(attr)]
        values = [x for x in values if len(x) > 0]
        if not values:
            return null_value
        values.sort(key=len, reverse=not shortest)
        return values[0]

    def random_merge(self, attr, results, null_value=None):
        values = [getattr(x, attr) for x in results if not x.is_null(attr)]
        return values[0] if values else null_value

    def merge(self, results, min_year, do_asr=True):
        ans = Metadata(_('Unknown'))

        # We assume the shortest title has the least cruft in it
        ans.title = self.length_merge('title', results, null_value=ans.title)

        # No harm in having extra authors, maybe something useful like an
        # editor or translator
        ans.authors = self.length_merge('authors', results,
                null_value=ans.authors, shortest=False)

        # We assume the shortest publisher has the least cruft in it
        ans.publisher = self.length_merge('publisher', results,
                null_value=ans.publisher)

        # We assume the smallest set of tags has the least cruft in it
        ans.tags = self.length_merge('tags', results,
                null_value=ans.tags, shortest=msprefs['fewer_tags'])

        # We assume the longest series has the most info in it
        ans.series = self.length_merge('series', results,
                null_value=ans.series, shortest=False)
        for r in results:
            if r.series and r.series == ans.series:
                ans.series_index = r.series_index
                break

        # Average the rating over all sources
        ratings = []
        for r in results:
            rating = r.rating
            if rating and rating > 0 and rating <= 5:
                ratings.append(rating)
        if ratings:
            ans.rating = int(round(sum(ratings)/len(ratings)))

        # Smallest language is likely to be valid
        ans.language = self.length_merge('language', results,
                null_value=ans.language)

        # Choose longest comments
        ans.comments = self.length_merge('comments', results,
                null_value=ans.comments, shortest=False)

        # Published date
        if min_year:
            for r in results:
                year = getattr(r.pubdate, 'year', None)
                if year == min_year:
                    ans.pubdate = r.pubdate
                    break
            if getattr(ans.pubdate, 'year', None) == min_year:
                min_date = datetime(min_year, ans.pubdate.month,
                        ans.pubdate.day, tzinfo=utc_tz)
            else:
                min_date = datetime(min_year, 1, 2, tzinfo=utc_tz)
            ans.pubdate = min_date
        else:
            min_date = datetime(3001, 1, 1, tzinfo=utc_tz)
            for r in results:
                if r.pubdate is not None:
                    candidate = as_utc(r.pubdate)
                    if candidate < min_date:
                        min_date = candidate
            if min_date.year < 3000:
                ans.pubdate = min_date

        # Identifiers
        for r in results:
            ans.identifiers.update(r.identifiers)

        # Cover URL
        ans.has_cached_cover_url = bool([r for r in results if
            getattr(r, 'has_cached_cover_url', False)])

        # Merge any other fields with no special handling (random merge)
        touched_fields = set()
        for r in results:
            if hasattr(r, 'identify_plugin'):
                touched_fields |= r.identify_plugin.touched_fields
        for f in touched_fields:
            if f.startswith('identifier:') or not ans.is_null(f):
                continue
            setattr(ans, f, self.random_merge(f, results,
                null_value=getattr(ans, f)))

        if do_asr:
            avg = [x.relevance_in_source for x in results]
            avg = sum(avg)/len(avg)
            ans.average_source_relevance = avg

        return ans


def merge_identify_results(result_map, log):
    isbn_merge = ISBNMerge(log)
    for plugin, results in iteritems(result_map):
        for result in results:
            isbn_merge.add_result(result)

    return isbn_merge.finalize()
# }}}


def identify(log, abort,  # {{{
        title=None, authors=None, identifiers={}, timeout=30,
        allowed_plugins=None):
    if title == _('Unknown'):
        title = None
    if authors == [_('Unknown')]:
        authors = None
    start_time = time.time()

    plugins = [p for p in metadata_plugins(['identify'])
            if p.is_configured() and (allowed_plugins is None or p.name in allowed_plugins)]

    kwargs = {
        'title': title,
        'authors': authors,
        'identifiers': identifiers,
        'timeout': timeout,
    }

    log('Running identify query with parameters:')
    log(kwargs)
    log('Using plugins:', ', '.join(['%s %s' % (p.name, p.version) for p in plugins]))
    log('The log from individual plugins is below')

    workers = [Worker(p, kwargs, abort) for p in plugins]
    for w in workers:
        w.start()

    first_result_at = None
    results = {}
    for p in plugins:
        results[p] = []
    logs = {w.plugin: w.buf for w in workers}

    def get_results():
        found = False
        for w in workers:
            try:
                result = w.rq.get_nowait()
            except Empty:
                pass
            else:
                results[w.plugin].append(result)
                found = True
        return found

    wait_time = msprefs['wait_after_first_identify_result']
    while True:
        time.sleep(0.2)

        if get_results() and first_result_at is None:
            first_result_at = time.time()

        if not is_worker_alive(workers):
            break
        if (first_result_at is not None and
                time.time() - first_result_at > wait_time):
            log.warn('Not waiting any longer for more results. Still running'
                    ' sources:')
            for worker in workers:
                if worker.is_alive():
                    log.debug('\t' + worker.name)
            abort.set()
            break

    while not abort.is_set() and get_results():
        pass

    sort_kwargs = dict(kwargs)
    for k in list(sort_kwargs):
        if k not in ('title', 'authors', 'identifiers'):
            sort_kwargs.pop(k)

    longest, lp = -1, ''
    for plugin, presults in iteritems(results):
        presults.sort(key=plugin.identify_results_keygen(**sort_kwargs))

        # Throw away lower priority results from the same source that have
        # exactly the same title and authors as a higher priority result
        filter_results = set()
        filtered_results = []
        for r in presults:
            key = (r.title, tuple(r.authors))
            if key not in filter_results:
                filtered_results.append(r)
                filter_results.add(key)
        results[plugin] = presults = filtered_results

        plog = logs[plugin].getvalue().strip()
        log('\n'+'*'*30, plugin.name, '%s' % (plugin.version,), '*'*30)
        log('Found %d results'%len(presults))
        time_spent = getattr(plugin, 'dl_time_spent', None)
        if time_spent is None:
            log('Downloading was aborted')
            longest, lp = -1, plugin.name
        else:
            log('Downloading from', plugin.name, 'took', time_spent)
            if time_spent > longest:
                longest, lp = time_spent, plugin.name
        for r in presults:
            log('\n\n---')
            try:
                log(str(r))
            except TypeError:
                log(repr(r))
        if plog:
            log(plog)
        log('\n'+'*'*80)

        dummy = Metadata(_('Unknown'))
        for i, result in enumerate(presults):
            for f in plugin.prefs['ignore_fields']:
                if ':' not in f:
                    setattr(result, f, getattr(dummy, f))
                if f == 'series':
                    result.series_index = dummy.series_index
            result.relevance_in_source = i
            result.has_cached_cover_url = (
                    plugin.cached_cover_url_is_reliable and
                    plugin.get_cached_cover_url(result.identifiers) is not None)
            result.identify_plugin = plugin
            if msprefs['txt_comments']:
                if plugin.has_html_comments and result.comments:
                    result.comments = html2text(result.comments)

    log('The identify phase took %.2f seconds'%(time.time() - start_time))
    log('The longest time (%f) was taken by:'%longest, lp)
    log('Merging results from different sources')
    start_time = time.time()
    results = merge_identify_results(results, log)

    log('We have %d merged results, merging took: %.2f seconds' %
            (len(results), time.time() - start_time))
    tm_rules = msprefs['tag_map_rules']
    if tm_rules:
        from calibre.ebooks.metadata.tag_mapper import map_tags
    am_rules = msprefs['author_map_rules']
    if am_rules:
        from calibre.ebooks.metadata.author_mapper import compile_rules, map_authors
        am_rules = compile_rules(am_rules)

    # normalize unicode strings
    n = lambda x: unicodedata.normalize('NFC', as_unicode(x or '', errors='replace'))
    for r in results:
        if r.tags:
            r.tags = list(map(n, r.tags))
        if r.authors:
            r.authors = list(map(n, r.authors))
        if r.author_sort:
            r.author_sort = n(r.author_sort)
        if r.title:
            r.title = n(r.title)
        if r.publisher:
            r.publisher = n(r.publisher)
        if r.comments:
            r.comments = n(r.comments)

    max_tags = msprefs['max_tags']
    for r in results:
        if tm_rules:
            r.tags = map_tags(r.tags, tm_rules)
        r.tags = r.tags[:max_tags]
        if getattr(r.pubdate, 'year', 2000) <= UNDEFINED_DATE.year:
            r.pubdate = None

    if msprefs['swap_author_names']:
        for r in results:
            def swap_to_ln_fn(a):
                if ',' in a:
                    return a
                parts = a.split(None)
                if len(parts) <= 1:
                    return a
                surname = parts[-1]
                return '%s, %s' % (surname, ' '.join(parts[:-1]))

            r.authors = [swap_to_ln_fn(a) for a in r.authors]

    if am_rules:
        for r in results:
            new_authors = map_authors(r.authors, am_rules)
            if new_authors != r.authors:
                r.authors = new_authors
                r.author_sort = authors_to_sort_string(r.authors)

    return results
# }}}


def urls_from_identifiers(identifiers, sort_results=False):  # {{{
    identifiers = {k.lower():v for k, v in iteritems(identifiers)}
    ans = []
    keys_left = set(identifiers)

    def add(name, k, val, url):
        ans.append((name, k, val, url))
        keys_left.discard(k)

    rules = msprefs['id_link_rules']
    if rules:
        formatter = EvalFormatter()
        for k, val in iteritems(identifiers):
            val = val.replace('|', ',')
            vals = {
                'id':str(quote(val if isinstance(val, bytes) else val.encode('utf-8'))),
                'id_unquoted': str(val),
            }
            items = rules.get(k) or ()
            for name, template in items:
                try:
                    url = formatter.safe_format(template, vals, '', vals)
                except Exception:
                    import traceback
                    traceback.format_exc()
                    continue
                add(name, k, val, url)
    for plugin in all_metadata_plugins():
        try:
            for id_type, id_val, url in plugin.get_book_urls(identifiers):
                add(plugin.get_book_url_name(id_type, id_val, url), id_type, id_val, url)
        except Exception:
            pass

    isbn = identifiers.get('isbn', None)
    if isbn:
        add(isbn, 'isbn', isbn,
            'https://www.worldcat.org/isbn/'+isbn)
    doi = identifiers.get('doi', None)
    if doi:
        add('DOI', 'doi', doi,
            'https://dx.doi.org/'+doi)
    arxiv = identifiers.get('arxiv', None)
    if arxiv:
        add('arXiv', 'arxiv', arxiv,
            'https://arxiv.org/abs/'+arxiv)
    oclc = identifiers.get('oclc', None)
    if oclc:
        add('OCLC', 'oclc', oclc,
            'https://www.worldcat.org/oclc/'+oclc)
    issn = check_issn(identifiers.get('issn', None))
    if issn:
        add(issn, 'issn', issn,
            'https://www.worldcat.org/issn/'+issn)
    q = {'http', 'https', 'file'}
    for k, url in iteritems(identifiers):
        if url and re.match(r'ur[il]\d*$', k) is not None:
            url = url[:8].replace('|', ':') + url[8:].replace('|', ',')
            if url.partition(':')[0].lower() in q:
                parts = urlparse(url)
                name = parts.netloc or parts.path
                add(name, k, url, url)
    for k in tuple(keys_left):
        val = identifiers.get(k)
        if val:
            url = val[:8].replace('|', ':') + val[8:].replace('|', ',')
            if url.partition(':')[0].lower() in q:
                parts = urlparse(url)
                name = parts.netloc or parts.path
                add(name, k, url, url)
    if sort_results:
        def url_key(x):
            return primary_sort_key(str(x[0]))
        ans = sorted(ans, key=url_key)
    return ans
# }}}


if __name__ == '__main__':  # tests {{{
    # To run these tests use:
    # calibre-debug -e src/calibre/ebooks/metadata/sources/identify.py
    from calibre.ebooks.metadata.sources.test import (
        authors_test, test_identify, title_test
    )
    tests = [
        (
            {'title':'Magykal Papers',
                'authors':['Sage']},
            [title_test('Septimus Heap: The Magykal Papers', exact=True)],
        ),

        (  # An e-book ISBN not on Amazon, one of the authors is
           # unknown to Amazon
            {'identifiers':{'isbn': '9780307459671'},
                'title':'Invisible Gorilla', 'authors':['Christopher Chabris']},
            [title_test('The Invisible Gorilla: And Other Ways Our Intuitions Deceive Us', exact=True)]
        ),

        (  # Test absence of identifiers
            {'title':'Learning Python',
                'authors':['Lutz']},
            [title_test('Learning Python', exact=True),
                authors_test(['Mark J. Lutz', 'David Ascher'])]
        ),

        (  # Sophisticated comment formatting
            {'identifiers':{'isbn': '9781416580829'}},
            [title_test('Angels & Demons', exact=True),
                authors_test(['Dan Brown'])]
        ),

        (  # A newer book
            {'identifiers':{'isbn': '9780316044981'}},
            [title_test('The Heroes', exact=True),
                authors_test(['Joe Abercrombie'])]
        ),
    ]
    # test_identify(tests[1:2])
    test_identify(tests)
# }}}
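

# A minimal usage sketch (assumed, not part of the upstream test harness):
# identify() takes a log object like the one create_log() builds around the
# StringIO buffer in Worker above, plus an abort flag with the
# threading.Event interface. Run inside a calibre environment, e.g. with
# calibre-debug, something like the following would query all configured
# metadata sources and print the merged results:
#
#     from threading import Event
#     buf = StringIO()
#     results = identify(create_log(buf), Event(),
#                        title='Learning Python', authors=['Lutz'])
#     for mi in results:
#         print(mi.title, mi.authors, urls_from_identifiers(mi.identifiers))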