#!/usr/bin/env python3
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
__license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import re
import unicodedata
import time
from datetime import datetime
from io import StringIO
from operator import attrgetter
from threading import Thread
from calibre.customize.ui import all_metadata_plugins, metadata_plugins
from calibre.ebooks.metadata import authors_to_sort_string, check_issn
from calibre.ebooks.metadata.book.base import Metadata
from calibre.ebooks.metadata.sources.base import create_log
from calibre.ebooks.metadata.sources.prefs import msprefs
from calibre.ebooks.metadata.xisbn import xisbn
from calibre.utils.date import UNDEFINED_DATE, as_utc, utc_tz
from calibre.utils.formatter import EvalFormatter
from calibre.utils.html2text import html2text
from calibre.utils.icu import lower, primary_sort_key
from polyglot.builtins import iteritems, itervalues, as_unicode
from polyglot.queue import Empty, Queue
from polyglot.urllib import quote, urlparse
# Download worker {{{
class Worker(Thread):
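    '''
    Run a single identify plugin in its own daemon thread, queuing the
    results it produces and capturing its log output in an in-memory buffer.
    '''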
def __init__(self, plugin, kwargs, abort):
Thread.__init__(self)
self.daemon = True
self.plugin, self.kwargs, self.rq = plugin, kwargs, Queue()
self.abort = abort
self.buf = StringIO()
self.log = create_log(self.buf)
def run(self):
start = time.time()
try:
self.plugin.identify(self.log, self.rq, self.abort, **self.kwargs)
        except Exception:
self.log.exception('Plugin', self.plugin.name, 'failed')
self.plugin.dl_time_spent = time.time() - start
@property
def name(self):
return self.plugin.name
def is_worker_alive(workers):
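    '''Return True if any of the download worker threads is still running.'''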
for w in workers:
if w.is_alive():
return True
return False
# }}}
# Merge results from different sources {{{
class xISBN(Thread):
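    '''
    Look up the pool of ISBNs related to a given ISBN via the OCLC xISBN
    service, in a separate thread so that a slow query can be abandoned.
    Kept for reference; the service itself has been decommissioned.
    '''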
def __init__(self, isbn):
Thread.__init__(self)
self.isbn = isbn
self.isbns = frozenset()
self.min_year = None
self.daemon = True
self.exception = self.tb = None
def run(self):
try:
self.isbns, self.min_year = xisbn.get_isbn_pool(self.isbn)
except Exception as e:
import traceback
self.exception = e
            self.tb = traceback.format_exc()
class ISBNMerge:
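    '''
    Accumulate results from all identify plugins, group them into pools of
    results that share an ISBN, and merge each pool (plus the ISBN-less
    results) into a single Metadata object per book.
    '''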
def __init__(self, log):
self.pools = {}
self.isbnless_results = []
self.results = []
self.log = log
        # The xISBN service has been decommissioned
# https://www.oclc.org/developer/news/2018/xid-decommission.en.html
self.use_xisbn = False
def isbn_in_pool(self, isbn):
if isbn:
for isbns, pool in iteritems(self.pools):
if isbn in isbns:
return pool
return None
def pool_has_result_from_same_source(self, pool, result):
results = pool[1]
for r in results:
if r.identify_plugin is result.identify_plugin:
return True
return False
def add_result(self, result):
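        '''
        File result into the pool matching its ISBN, creating a new pool
        (optionally seeded with related ISBNs from xISBN) when necessary.
        Results without an ISBN are kept in a separate list.
        '''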
isbn = result.isbn
if isbn:
pool = self.isbn_in_pool(isbn)
if pool is None:
isbns = min_year = None
if self.use_xisbn:
xw = xISBN(isbn)
xw.start()
xw.join(10)
if xw.is_alive():
self.log.error('Query to xISBN timed out')
self.use_xisbn = False
else:
if xw.exception:
self.log.error('Query to xISBN failed:')
self.log.debug(xw.tb)
else:
isbns, min_year = xw.isbns, xw.min_year
if not msprefs['find_first_edition_date']:
min_year = None
if not isbns:
isbns = frozenset([isbn])
if isbns in self.pools:
# xISBN had a brain fart
pool = self.pools[isbns]
else:
self.pools[isbns] = pool = (min_year, [])
if not self.pool_has_result_from_same_source(pool, result):
pool[1].append(result)
else:
self.isbnless_results.append(result)
def finalize(self):
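        '''
        Build the final result list: merge each ISBN pool, then add the most
        relevant ISBN-less results that are not superseded by an ISBN result
        from the same source, and finally merge results that share a
        title/author pair. Returns the merged results.
        '''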
has_isbn_result = False
for results in itervalues(self.pools):
if results:
has_isbn_result = True
break
isbn_sources = frozenset()
if has_isbn_result:
isbn_sources = self.merge_isbn_results()
# Now handle results that have no ISBNs
results = sorted(self.isbnless_results,
key=attrgetter('relevance_in_source'))
# Only use results that are from sources that have not also returned a
# result with an ISBN
results = [r for r in results if r.identify_plugin not in isbn_sources or not r.identify_plugin.prefer_results_with_isbn]
if results:
# Pick only the most relevant result from each source
seen = set()
for result in results:
if msprefs['keep_dups'] or result.identify_plugin not in seen:
seen.add(result.identify_plugin)
self.results.append(result)
result.average_source_relevance = \
result.relevance_in_source
self.merge_metadata_results()
return self.results
def merge_metadata_results(self, merge_on_identifiers=False):
'''
Merge results with identical title and authors or an identical
identifier
'''
# First title/author
groups = {}
for result in self.results:
title = lower(result.title if result.title else '')
key = (title, tuple(lower(x) for x in result.authors))
if key not in groups:
groups[key] = []
groups[key].append(result)
if len(groups) != len(self.results):
self.results = []
for rgroup in itervalues(groups):
rel = [r.average_source_relevance for r in rgroup]
if len(rgroup) > 1:
result = self.merge(rgroup, None, do_asr=False)
result.average_source_relevance = sum(rel)/len(rel)
else:
result = rgroup[0]
self.results.append(result)
if merge_on_identifiers:
# Now identifiers
groups, empty = {}, []
for result in self.results:
key = set()
for typ, val in iteritems(result.identifiers):
if typ and val:
key.add((typ, val))
if key:
key = frozenset(key)
match = None
for candidate in list(groups):
if candidate.intersection(key):
# We have at least one identifier in common
match = candidate.union(key)
results = groups.pop(candidate)
results.append(result)
groups[match] = results
break
if match is None:
groups[key] = [result]
else:
empty.append(result)
if len(groups) != len(self.results):
self.results = []
for rgroup in itervalues(groups):
rel = [r.average_source_relevance for r in rgroup]
if len(rgroup) > 1:
result = self.merge(rgroup, None, do_asr=False)
result.average_source_relevance = sum(rel)/len(rel)
elif rgroup:
result = rgroup[0]
self.results.append(result)
if empty:
self.results.extend(empty)
self.results.sort(key=attrgetter('average_source_relevance'))
def merge_isbn_results(self):
self.results = []
sources = set()
for min_year, results in itervalues(self.pools):
if results:
for r in results:
sources.add(r.identify_plugin)
self.results.append(self.merge(results, min_year))
self.results.sort(key=attrgetter('average_source_relevance'))
return sources
def length_merge(self, attr, results, null_value=None, shortest=True):
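        '''
        Merge attr across results by length: return the shortest non-empty
        value, or the longest when shortest=False, falling back to
        null_value if no result has the attribute set.
        '''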
values = [getattr(x, attr) for x in results if not x.is_null(attr)]
values = [x for x in values if len(x) > 0]
if not values:
return null_value
values.sort(key=len, reverse=not shortest)
return values[0]
def random_merge(self, attr, results, null_value=None):
values = [getattr(x, attr) for x in results if not x.is_null(attr)]
return values[0] if values else null_value
def merge(self, results, min_year, do_asr=True):
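        '''
        Combine a pool of results describing the same book into one Metadata
        object, using the per-field heuristics documented in the comments
        below, and optionally computing the average source relevance.
        '''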
ans = Metadata(_('Unknown'))
# We assume the shortest title has the least cruft in it
ans.title = self.length_merge('title', results, null_value=ans.title)
# No harm in having extra authors, maybe something useful like an
# editor or translator
ans.authors = self.length_merge('authors', results,
null_value=ans.authors, shortest=False)
# We assume the shortest publisher has the least cruft in it
ans.publisher = self.length_merge('publisher', results,
null_value=ans.publisher)
# We assume the smallest set of tags has the least cruft in it
ans.tags = self.length_merge('tags', results,
null_value=ans.tags, shortest=msprefs['fewer_tags'])
# We assume the longest series has the most info in it
ans.series = self.length_merge('series', results,
null_value=ans.series, shortest=False)
for r in results:
if r.series and r.series == ans.series:
ans.series_index = r.series_index
break
# Average the rating over all sources
ratings = []
for r in results:
rating = r.rating
if rating and rating > 0 and rating <= 5:
ratings.append(rating)
if ratings:
ans.rating = int(round(sum(ratings)/len(ratings)))
# Smallest language is likely to be valid
ans.language = self.length_merge('language', results,
null_value=ans.language)
# Choose longest comments
ans.comments = self.length_merge('comments', results,
null_value=ans.comments, shortest=False)
# Published date
if min_year:
for r in results:
year = getattr(r.pubdate, 'year', None)
if year == min_year:
ans.pubdate = r.pubdate
break
if getattr(ans.pubdate, 'year', None) == min_year:
min_date = datetime(min_year, ans.pubdate.month, ans.pubdate.day,
tzinfo=utc_tz)
else:
min_date = datetime(min_year, 1, 2, tzinfo=utc_tz)
ans.pubdate = min_date
else:
min_date = datetime(3001, 1, 1, tzinfo=utc_tz)
for r in results:
if r.pubdate is not None:
candidate = as_utc(r.pubdate)
if candidate < min_date:
min_date = candidate
if min_date.year < 3000:
ans.pubdate = min_date
# Identifiers
for r in results:
ans.identifiers.update(r.identifiers)
# Cover URL
ans.has_cached_cover_url = bool([r for r in results if
getattr(r, 'has_cached_cover_url', False)])
# Merge any other fields with no special handling (random merge)
touched_fields = set()
for r in results:
if hasattr(r, 'identify_plugin'):
touched_fields |= r.identify_plugin.touched_fields
for f in touched_fields:
if f.startswith('identifier:') or not ans.is_null(f):
continue
setattr(ans, f, self.random_merge(f, results,
null_value=getattr(ans, f)))
if do_asr:
avg = [x.relevance_in_source for x in results]
avg = sum(avg)/len(avg)
ans.average_source_relevance = avg
return ans
def merge_identify_results(result_map, log):
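    '''
    Flatten the per-plugin lists of results in result_map into a single,
    relevance-sorted list of merged Metadata objects, combining results that
    share an ISBN or an identical title and author set.
    '''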
isbn_merge = ISBNMerge(log)
for plugin, results in iteritems(result_map):
for result in results:
isbn_merge.add_result(result)
return isbn_merge.finalize()
# }}}
def identify(log, abort, # {{{
title=None, authors=None, identifiers={}, timeout=30, allowed_plugins=None):
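    '''
    Query all configured identify plugins in parallel and return a merged,
    relevance-sorted list of Metadata objects.

    A minimal usage sketch (illustrative only; it assumes a working calibre
    environment, e.g. under calibre-debug, and that create_log() with no
    argument logs to stdout)::

        from threading import Event
        from calibre.ebooks.metadata.sources.base import create_log

        results = identify(create_log(), Event(),
                           title='Learning Python', authors=['Mark Lutz'])
        for mi in results:
            print(mi.title, mi.authors)
    '''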
if title == _('Unknown'):
title = None
if authors == [_('Unknown')]:
authors = None
start_time = time.time()
plugins = [p for p in metadata_plugins(['identify'])
if p.is_configured() and (allowed_plugins is None or p.name in allowed_plugins)]
kwargs = {
'title': title,
'authors': authors,
'identifiers': identifiers,
'timeout': timeout,
}
log('Running identify query with parameters:')
log(kwargs)
log('Using plugins:', ', '.join(['%s %s' % (p.name, p.version) for p in plugins]))
log('The log from individual plugins is below')
workers = [Worker(p, kwargs, abort) for p in plugins]
for w in workers:
w.start()
first_result_at = None
results = {}
for p in plugins:
results[p] = []
logs = {w.plugin: w.buf for w in workers}
def get_results():
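        '''
        Drain any results the workers have queued so far; return True if at
        least one new result was collected.
        '''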
found = False
for w in workers:
try:
result = w.rq.get_nowait()
except Empty:
pass
else:
results[w.plugin].append(result)
found = True
return found
wait_time = msprefs['wait_after_first_identify_result']
while True:
time.sleep(0.2)
if get_results() and first_result_at is None:
first_result_at = time.time()
if not is_worker_alive(workers):
break
if (first_result_at is not None and time.time() - first_result_at > wait_time):
log.warn('Not waiting any longer for more results. Still running'
' sources:')
for worker in workers:
if worker.is_alive():
log.debug('\t' + worker.name)
abort.set()
break
while not abort.is_set() and get_results():
pass
sort_kwargs = dict(kwargs)
for k in list(sort_kwargs):
if k not in ('title', 'authors', 'identifiers'):
sort_kwargs.pop(k)
longest, lp = -1, ''
for plugin, presults in iteritems(results):
presults.sort(key=plugin.identify_results_keygen(**sort_kwargs))
# Throw away lower priority results from the same source that have exactly the same
# title and authors as a higher priority result
filter_results = set()
filtered_results = []
for r in presults:
key = (r.title, tuple(r.authors))
if key not in filter_results:
filtered_results.append(r)
filter_results.add(key)
results[plugin] = presults = filtered_results
plog = logs[plugin].getvalue().strip()
log('\n'+'*'*30, plugin.name, '%s' % (plugin.version,), '*'*30)
log('Found %d results'%len(presults))
time_spent = getattr(plugin, 'dl_time_spent', None)
if time_spent is None:
log('Downloading was aborted')
longest, lp = -1, plugin.name
else:
log('Downloading from', plugin.name, 'took', time_spent)
if time_spent > longest:
longest, lp = time_spent, plugin.name
for r in presults:
log('\n\n---')
try:
log(str(r))
except TypeError:
log(repr(r))
if plog:
log(plog)
log('\n'+'*'*80)
dummy = Metadata(_('Unknown'))
for i, result in enumerate(presults):
for f in plugin.prefs['ignore_fields']:
if ':' not in f:
setattr(result, f, getattr(dummy, f))
if f == 'series':
result.series_index = dummy.series_index
result.relevance_in_source = i
result.has_cached_cover_url = (
plugin.cached_cover_url_is_reliable and plugin.get_cached_cover_url(result.identifiers) is not None)
result.identify_plugin = plugin
if msprefs['txt_comments']:
if plugin.has_html_comments and result.comments:
result.comments = html2text(result.comments)
log('The identify phase took %.2f seconds'%(time.time() - start_time))
log('The longest time (%f) was taken by:'%longest, lp)
log('Merging results from different sources')
start_time = time.time()
results = merge_identify_results(results, log)
log('We have %d merged results, merging took: %.2f seconds' %
(len(results), time.time() - start_time))
tm_rules = msprefs['tag_map_rules']
if tm_rules:
from calibre.ebooks.metadata.tag_mapper import map_tags
am_rules = msprefs['author_map_rules']
if am_rules:
from calibre.ebooks.metadata.author_mapper import compile_rules, map_authors
am_rules = compile_rules(am_rules)
# normalize unicode strings
n = lambda x: unicodedata.normalize('NFC', as_unicode(x or '', errors='replace'))
for r in results:
if r.tags:
r.tags = list(map(n, r.tags))
if r.authors:
r.authors = list(map(n, r.authors))
if r.author_sort:
r.author_sort = n(r.author_sort)
if r.title:
r.title = n(r.title)
if r.publisher:
r.publisher = n(r.publisher)
if r.comments:
r.comments = n(r.comments)
max_tags = msprefs['max_tags']
for r in results:
if tm_rules:
r.tags = map_tags(r.tags, tm_rules)
r.tags = r.tags[:max_tags]
if getattr(r.pubdate, 'year', 2000) <= UNDEFINED_DATE.year:
r.pubdate = None
if msprefs['swap_author_names']:
for r in results:
def swap_to_ln_fn(a):
if ',' in a:
return a
parts = a.split(None)
if len(parts) <= 1:
return a
surname = parts[-1]
return '%s, %s' % (surname, ' '.join(parts[:-1]))
r.authors = [swap_to_ln_fn(a) for a in r.authors]
if am_rules:
for r in results:
new_authors = map_authors(r.authors, am_rules)
if new_authors != r.authors:
r.authors = new_authors
r.author_sort = authors_to_sort_string(r.authors)
return results
# }}}
def urls_from_identifiers(identifiers, sort_results=False): # {{{
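    '''
    Turn a mapping of identifier type to value, for example
    {'isbn': '9780316044981'}, into a list of (name, id_type, id_value, url)
    tuples that can be rendered as links.
    '''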
identifiers = {k.lower():v for k, v in iteritems(identifiers)}
ans = []
keys_left = set(identifiers)
def add(name, k, val, url):
ans.append((name, k, val, url))
keys_left.discard(k)
rules = msprefs['id_link_rules']
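    # id_link_rules maps an identifier type to a list of (name, template)
    # pairs; templates are formatted with {id} (URL-quoted) and
    # {id_unquoted}. Illustrative shape only:
    #   {'isbn': [('WorldCat', 'https://www.worldcat.org/isbn/{id}')]}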
if rules:
formatter = EvalFormatter()
for k, val in iteritems(identifiers):
val = val.replace('|', ',')
vals = {
'id':str(quote(val if isinstance(val, bytes) else val.encode('utf-8'))),
'id_unquoted': str(val),
}
items = rules.get(k) or ()
for name, template in items:
try:
url = formatter.safe_format(template, vals, '', vals)
except Exception:
import traceback
                    traceback.print_exc()
continue
add(name, k, val, url)
for plugin in all_metadata_plugins():
try:
for id_type, id_val, url in plugin.get_book_urls(identifiers):
add(plugin.get_book_url_name(id_type, id_val, url), id_type, id_val, url)
except Exception:
pass
isbn = identifiers.get('isbn', None)
if isbn:
add(isbn, 'isbn', isbn,
'https://www.worldcat.org/isbn/'+isbn)
doi = identifiers.get('doi', None)
if doi:
add('DOI', 'doi', doi,
'https://dx.doi.org/'+doi)
arxiv = identifiers.get('arxiv', None)
if arxiv:
add('arXiv', 'arxiv', arxiv,
'https://arxiv.org/abs/'+arxiv)
oclc = identifiers.get('oclc', None)
if oclc:
add('OCLC', 'oclc', oclc,
'https://www.worldcat.org/oclc/'+oclc)
issn = check_issn(identifiers.get('issn', None))
if issn:
add(issn, 'issn', issn,
'https://www.worldcat.org/issn/'+issn)
q = {'http', 'https', 'file'}
for k, url in iteritems(identifiers):
if url and re.match(r'ur[il]\d*$', k) is not None:
url = url[:8].replace('|', ':') + url[8:].replace('|', ',')
if url.partition(':')[0].lower() in q:
parts = urlparse(url)
name = parts.netloc or parts.path
add(name, k, url, url)
for k in tuple(keys_left):
val = identifiers.get(k)
if val:
url = val[:8].replace('|', ':') + val[8:].replace('|', ',')
if url.partition(':')[0].lower() in q:
parts = urlparse(url)
name = parts.netloc or parts.path
add(name, k, url, url)
if sort_results:
def url_key(x):
return primary_sort_key(str(x[0]))
ans = sorted(ans, key=url_key)
return ans
# }}}
if __name__ == '__main__': # tests {{{
    # To run these tests, use: calibre-debug -e
# src/calibre/ebooks/metadata/sources/identify.py
from calibre.ebooks.metadata.sources.test import (
authors_test, test_identify, title_test
)
tests = [
(
{'title':'Magykal Papers',
'authors':['Sage']},
[title_test('Septimus Heap: The Magykal Papers', exact=True)],
),
( # An e-book ISBN not on Amazon, one of the authors is unknown to Amazon
{'identifiers':{'isbn': '9780307459671'},
'title':'Invisible Gorilla', 'authors':['Christopher Chabris']},
[title_test('The Invisible Gorilla: And Other Ways Our Intuitions Deceive Us', exact=True)]
),
( # Test absence of identifiers
{'title':'Learning Python',
'authors':['Lutz']},
[title_test('Learning Python',
exact=True), authors_test(['Mark J. Lutz', 'David Ascher'])
]
),
( # Sophisticated comment formatting
{'identifiers':{'isbn': '9781416580829'}},
[title_test('Angels & Demons',
exact=True), authors_test(['Dan Brown'])]
),
( # A newer book
{'identifiers':{'isbn': '9780316044981'}},
[title_test('The Heroes', exact=True),
authors_test(['Joe Abercrombie'])]
),
]
# test_identify(tests[1:2])
test_identify(tests)
# }}}