#!/usr/bin/env python3
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
# License: GPLv3 Copyright: 2011, Kovid Goyal <kovid at kovidgoyal.net>

from __future__ import absolute_import, division, print_function, unicode_literals

import re
import socket
import time
from functools import partial

try:
    from queue import Empty, Queue
except ImportError:
    from Queue import Empty, Queue

from threading import Thread

try:
    from urllib.parse import urlparse
except ImportError:
    from urlparse import urlparse

from calibre import as_unicode, browser, random_user_agent, xml_replace_entities
from calibre.ebooks.metadata import check_isbn
from calibre.ebooks.metadata.book.base import Metadata
from calibre.ebooks.metadata.sources.base import Option, Source, fixauthors, fixcase
from calibre.utils.localization import canonicalize_lang
from calibre.utils.random_ua import accept_header_for_ua
from calibre.ebooks.oeb.base import urlquote


def iri_quote_plus(url):
    ans = urlquote(url)
    if isinstance(ans, bytes):
        ans = ans.decode('utf-8')
    return ans.replace('%20', '+')


def user_agent_is_ok(ua):
    return 'Mobile/' not in ua and 'Mobile ' not in ua


class CaptchaError(Exception):
    pass


class SearchFailed(ValueError):
    pass


def parse_html(raw):
    try:
        from html5_parser import parse
    except ImportError:
        # Old versions of calibre
        import html5lib
        return html5lib.parse(raw, treebuilder='lxml', namespaceHTMLElements=False)
    else:
        return parse(raw)


def parse_details_page(url, log, timeout, browser, domain):
    from calibre.utils.cleantext import clean_ascii_chars
    from calibre.ebooks.chardet import xml_to_unicode
    from lxml.html import tostring
    log('Getting details from:', url)
    try:
        raw = browser.open_novisit(url, timeout=timeout).read().strip()
    except Exception as e:
        if callable(getattr(e, 'getcode', None)) and \
                e.getcode() == 404:
            log.error('URL malformed: %r' % url)
            return
        attr = getattr(e, 'args', [None])
        attr = attr if attr else [None]
        if isinstance(attr[0], socket.timeout):
            msg = 'Details page timed out. Try again later.'
            log.error(msg)
        else:
            msg = 'Failed to make details query: %r' % url
            log.exception(msg)
        return

    oraw = raw
    if 'amazon.com.br' in url:
        # amazon.com.br serves utf-8 but has an incorrect latin1 <meta> tag
        raw = raw.decode('utf-8')
    raw = xml_to_unicode(raw, strip_encoding_pats=True,
                         resolve_entities=True)[0]
    if '<title>404 - ' in raw:
        raise ValueError('URL malformed: %r' % url)
    if '>Could not find the requested document in the cache.<' in raw:
        raise ValueError('No cached entry for %s found' % url)

    try:
        root = parse_html(clean_ascii_chars(raw))
    except Exception:
        msg = 'Failed to parse amazon details page: %r' % url
        log.exception(msg)
        return
    if domain == 'jp':
        for a in root.xpath('//a[@href]'):
            if 'black-curtain-redirect.html' in a.get('href'):
                url = a.get('href')
                if url:
                    if url.startswith('/'):
                        url = 'https://amazon.co.jp' + a.get('href')
                    log('Black curtain redirect found, following')
                    return parse_details_page(url, log, timeout, browser, domain)

    errmsg = root.xpath('//*[@id="errorMessage"]')
    if errmsg:
        msg = 'Failed to parse amazon details page: %r' % url
        msg += tostring(errmsg, method='text', encoding='unicode').strip()
        log.error(msg)
        return

    from css_selectors import Select
    selector = Select(root)
    return oraw, root, selector


def parse_asin(root, log, url):
    try:
        link = root.xpath('//link[@rel="canonical" and @href]')
        for l in link:
            return l.get('href').rpartition('/')[-1]
    except Exception:
        log.exception('Error parsing ASIN for url: %r' % url)


class Worker(Thread):  # Get details {{{

    '''
    Get book details from Amazon's book page in a separate thread.
    '''

    def __init__(self, url, result_queue, browser, log, relevance, domain,
                 plugin, timeout=20, testing=False, preparsed_root=None,
                 cover_url_processor=None, filter_result=None):
        Thread.__init__(self)
        self.cover_url_processor = cover_url_processor
        self.preparsed_root = preparsed_root
        self.daemon = True
        self.testing = testing
        self.url, self.result_queue = url, result_queue
        self.log, self.timeout = log, timeout
        self.filter_result = filter_result or (lambda x, log: True)
        self.relevance, self.plugin = relevance, plugin
        self.browser = browser
        self.cover_url = self.amazon_id = self.isbn = None
        self.domain = domain
        from lxml.html import tostring
        self.tostring = tostring

        months = {  # {{{
            'de': {
                1: ['jän', 'januar'],
                2: ['februar'],
                3: ['märz'],
                5: ['mai'],
                6: ['juni'],
                7: ['juli'],
                10: ['okt', 'oktober'],
                12: ['dez', 'dezember']
            },
            'it': {
                1: ['gennaio', 'enn'],
                2: ['febbraio', 'febbr'],
                3: ['marzo'],
                4: ['aprile'],
                5: ['maggio', 'magg'],
                6: ['giugno'],
                7: ['luglio'],
                8: ['agosto', 'ag'],
                9: ['settembre', 'sett'],
                10: ['ottobre', 'ott'],
                11: ['novembre'],
                12: ['dicembre', 'dic'],
            },
            'fr': {
                1: ['janv'],
                2: ['févr'],
                3: ['mars'],
                4: ['avril'],
                5: ['mai'],
                6: ['juin'],
                7: ['juil'],
                8: ['août'],
                9: ['sept'],
                12: ['déc'],
            },
            'br': {
                1: ['janeiro'],
                2: ['fevereiro'],
                3: ['março'],
                4: ['abril'],
                5: ['maio'],
                6: ['junho'],
                7: ['julho'],
                8: ['agosto'],
                9: ['setembro'],
                10: ['outubro'],
                11: ['novembro'],
                12: ['dezembro'],
            },
            'es': {
                1: ['enero'],
                2: ['febrero'],
                3: ['marzo'],
                4: ['abril'],
                5: ['mayo'],
                6: ['junio'],
                7: ['julio'],
                8: ['agosto'],
                9: ['septiembre', 'setiembre'],
                10: ['octubre'],
                11: ['noviembre'],
                12: ['diciembre'],
            },
            'se': {
                1: ['januari'],
                2: ['februari'],
                3: ['mars'],
                4: ['april'],
                5: ['maj'],
                6: ['juni'],
                7: ['juli'],
                8: ['augusti'],
                9: ['september'],
                10: ['oktober'],
                11: ['november'],
                12: ['december'],
            },
            'jp': {
                1: ['1月'],
                2: ['2月'],
                3: ['3月'],
                4: ['4月'],
                5: ['5月'],
                6: ['6月'],
                7: ['7月'],
                8: ['8月'],
                9: ['9月'],
                10: ['10月'],
                11: ['11月'],
                12: ['12月'],
            },
            'nl': {
                1: ['januari'],
                2: ['februari'],
                3: ['maart'],
                5: ['mei'],
                6: ['juni'],
                7: ['juli'],
                8: ['augustus'],
                10: ['oktober'],
            }
        }  # }}}

        self.english_months = [None, 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                               'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
        self.months = months.get(self.domain, {})

        self.pd_xpath = '''
            //h2[text()="Product Details" or \
                text()="Produktinformation" or \
                text()="Dettagli prodotto" or \
                text()="Product details" or \
                text()="Détails sur le produit" or \
                text()="Detalles del producto" or \
                text()="Detalhes do produto" or \
                text()="Productgegevens" or \
                text()="基本信息" or \
                starts-with(text(), "登録情報")]/../div[@class="content"]
            '''
        # Editor: is for Spanish
        self.publisher_xpath = '''
            descendant::*[starts-with(text(), "Publisher:") or \
                starts-with(text(), "Verlag:") or \
                starts-with(text(), "Editore:") or \
                starts-with(text(), "Editeur") or \
                starts-with(text(), "Editor:") or \
                starts-with(text(), "Editora:") or \
                starts-with(text(), "Uitgever:") or \
                starts-with(text(), "Utgivare:") or \
                starts-with(text(), "出版社:")]
            '''
        self.pubdate_xpath = '''
            descendant::*[starts-with(text(), "Publication Date:") or \
                starts-with(text(), "Audible.com Release Date:")]
            '''
        self.publisher_names = {'Publisher', 'Uitgever', 'Verlag', 'Utgivare',
                                'Herausgeber', 'Editore', 'Editeur', 'Editor',
                                'Editora', '出版社'}

        self.language_xpath = '''
            descendant::*[
                starts-with(text(), "Language:") \
                or text() = "Language" \
                or text() = "Sprache:" \
                or text() = "Lingua:" \
                or text() = "Idioma:" \
                or starts-with(text(), "Langue") \
                or starts-with(text(), "言語") \
                or starts-with(text(), "Språk") \
                or starts-with(text(), "语种")
                ]
            '''
        self.language_names = {'Language', 'Sprache', 'Språk', 'Lingua',
                               'Idioma', 'Langue', '言語', 'Taal', '语种'}

        self.tags_xpath = '''
            descendant::h2[
                text() = "Look for Similar Items by Category" or
                text() = "Ähnliche Artikel finden" or
                text() = "Buscar productos similares por categoría" or
                text() = "Ricerca articoli simili per categoria" or
                text() = "Rechercher des articles similaires par rubrique" or
                text() = "Procure por items similares por categoria" or
                text() = "関連商品を探す"
            ]/../descendant::ul/li
            '''

        self.ratings_pat = re.compile(
            r'([0-9.,]+) ?(out of|von|van|su|étoiles sur|つ星のうち|de un máximo de|de|av) '
            r'([\d\.]+)( (stars|Sternen|stelle|estrellas|estrelas|sterren|stjärnor)){0,1}'
        )
        self.ratings_pat_cn = re.compile('([0-9.]+) 颗星,最多 5 颗星')
        self.ratings_pat_jp = re.compile(r'\d+つ星のうち([\d\.]+)')

        lm = {
            'eng': ('English', 'Englisch', 'Engels', 'Engelska'),
            'fra': ('French', 'Français'),
            'ita': ('Italian', 'Italiano'),
            'deu': ('German', 'Deutsch'),
            'spa': ('Spanish', 'Espa\xf1ol', 'Espaniol'),
            'jpn': ('Japanese', '日本語'),
            'por': ('Portuguese', 'Português'),
            'nld': ('Dutch', 'Nederlands',),
            'chs': ('Chinese', '中文', '简体中文'),
            'swe': ('Swedish', 'Svenska'),
        }
        self.lang_map = {}
        for code, names in lm.items():
            for name in names:
                self.lang_map[name] = code

        self.series_pat = re.compile(
            r'''
            \|\s*              # Prefix
            (Series)\s*:\s*    # Series declaration
            (?P<series>.+?)\s+ # The series name
            \((Book)\s*        # Book declaration
            (?P<index>[0-9.]+) # Series index
            \s*\)
            ''', re.X)

    def delocalize_datestr(self, raw):
        if self.domain == 'cn':
            return raw.replace('年', '-').replace('月', '-').replace('日', '')
        if not self.months:
            return raw
        ans = raw.lower()
        for i, vals in self.months.items():
            for x in vals:
                ans = ans.replace(x, self.english_months[i])
        ans = ans.replace(' de ', ' ')
        return ans

    def run(self):
        try:
            self.get_details()
        except:
            self.log.exception('get_details failed for url: %r' % self.url)

    def get_details(self):
        if self.preparsed_root is None:
            raw, root, selector = parse_details_page(
                self.url, self.log, self.timeout, self.browser, self.domain)
        else:
            raw, root, selector = self.preparsed_root
        from css_selectors import Select
        self.selector = Select(root)
        self.parse_details(raw, root)

    def parse_details(self, raw, root):
        asin = parse_asin(root, self.log, self.url)
        if not asin and root.xpath('//form[@action="/errors/validateCaptcha"]'):
            raise CaptchaError(
                'Amazon returned a CAPTCHA page, probably because you downloaded too many books. Wait for some time and try again.')
        if self.testing:
            import tempfile
            import uuid
            with tempfile.NamedTemporaryFile(prefix=(asin or type('')(uuid.uuid4())) + '_',
                                             suffix='.html', delete=False) as f:
                f.write(raw)
            print('Downloaded html for', asin, 'saved in', f.name)

        try:
            title = self.parse_title(root)
        except:
            self.log.exception('Error parsing title for url: %r' % self.url)
            title = None

        try:
            authors = self.parse_authors(root)
        except:
            self.log.exception('Error parsing authors for url: %r' % self.url)
            authors = []

        if not title or not authors or not asin:
            self.log.error(
                'Could not find title/authors/asin for %r' % self.url)
            self.log.error('ASIN: %r Title: %r Authors: %r' % (asin, title,
                                                               authors))
            return

        mi = Metadata(title, authors)
        idtype = 'amazon' if self.domain == 'com' else 'amazon_' + self.domain
        mi.set_identifier(idtype, asin)
        self.amazon_id = asin

        try:
            mi.rating = self.parse_rating(root)
        except:
            self.log.exception('Error parsing ratings for url: %r' % self.url)

        try:
            mi.comments = self.parse_comments(root, raw)
        except:
            self.log.exception('Error parsing comments for url: %r' % self.url)

        try:
            series, series_index = self.parse_series(root)
            if series:
                mi.series, mi.series_index = series, series_index
            elif self.testing:
                mi.series, mi.series_index = 'Dummy series for testing', 1
        except:
            self.log.exception('Error parsing series for url: %r' % self.url)

        try:
            mi.tags = self.parse_tags(root)
        except:
            self.log.exception('Error parsing tags for url: %r' % self.url)

        try:
            self.cover_url = self.parse_cover(root, raw)
        except:
            self.log.exception('Error parsing cover for url: %r' % self.url)
        if self.cover_url_processor is not None and self.cover_url and self.cover_url.startswith('/'):
            self.cover_url = self.cover_url_processor(self.cover_url)
        mi.has_cover = bool(self.cover_url)

        detail_bullets = root.xpath('//*[@data-feature-name="detailBullets"]')
        non_hero = tuple(self.selector(
            'div#bookDetails_container_div div#nonHeroSection')) or tuple(self.selector(
                '#productDetails_techSpec_sections'))
        if detail_bullets:
            self.parse_detail_bullets(root, mi, detail_bullets[0])
        elif non_hero:
            try:
                self.parse_new_details(root, mi, non_hero[0])
            except:
                self.log.exception(
                    'Failed to parse new-style book details section')
        else:
            pd = root.xpath(self.pd_xpath)
            if pd:
                pd = pd[0]

                try:
                    isbn = self.parse_isbn(pd)
                    if isbn:
                        self.isbn = mi.isbn = isbn
                except:
                    self.log.exception(
                        'Error parsing ISBN for url: %r' % self.url)

                try:
                    mi.publisher = self.parse_publisher(pd)
                except:
                    self.log.exception(
                        'Error parsing publisher for url: %r' % self.url)

                try:
                    mi.pubdate = self.parse_pubdate(pd)
                except:
                    self.log.exception(
                        'Error parsing publish date for url: %r' % self.url)

                try:
                    lang = self.parse_language(pd)
                    if lang:
                        mi.language = lang
                except:
                    self.log.exception(
                        'Error parsing language for url: %r' % self.url)
            else:
                self.log.warning(
                    'Failed to find product description for url: %r' % self.url)

        mi.source_relevance = self.relevance

        if self.amazon_id:
            if self.isbn:
                self.plugin.cache_isbn_to_identifier(self.isbn, self.amazon_id)
            if self.cover_url:
                self.plugin.cache_identifier_to_cover_url(self.amazon_id,
                                                          self.cover_url)

        self.plugin.clean_downloaded_metadata(mi)

        if self.filter_result(mi, self.log):
            self.result_queue.put(mi)

    def totext(self, elem, only_printable=False):
        res = self.tostring(elem, encoding='unicode', method='text')
        if only_printable:
            filtered_characters = list(s for s in res if s.isprintable())
            res = ''.join(filtered_characters).strip()
        return res

    def parse_title(self, root):

        def sanitize_title(title):
            ans = re.sub(r'[(\[].*[)\]]', '', title).strip()
            if not ans:
                ans = title.rpartition('[')[0].strip()
            return ans

        h1 = root.xpath('//h1[@id="title"]')
        if h1:
            h1 = h1[0]
            for child in h1.xpath('./*[contains(@class, "a-color-secondary")]'):
                h1.remove(child)
            return sanitize_title(self.totext(h1))
        tdiv = root.xpath('//h1[contains(@class, "parseasinTitle")]')
        if not tdiv:
            span = root.xpath('//*[@id="ebooksTitle"]')
            if span:
                return sanitize_title(self.totext(span[0]))
            raise ValueError('No title block found')
        tdiv = tdiv[0]
        actual_title = tdiv.xpath('descendant::*[@id="btAsinTitle"]')
        if actual_title:
            title = self.tostring(actual_title[0], encoding='unicode',
                                  method='text').strip()
        else:
            title = self.tostring(tdiv, encoding='unicode',
                                  method='text').strip()
        return sanitize_title(title)

    def parse_authors(self, root):
        for sel in (
                '#byline .author .contributorNameID',
                '#byline .author a.a-link-normal',
                '#bylineInfo .author .contributorNameID',
                '#bylineInfo .author a.a-link-normal',
                '#bylineInfo #bylineContributor',
        ):
            matches = tuple(self.selector(sel))
            if matches:
                authors = [self.totext(x) for x in matches]
                return [a for a in authors if a]

        x = '//h1[contains(@class, "parseasinTitle")]/following-sibling::span/*[(name()="a" and @href) or (name()="span" and @class="contributorNameTrigger")]'
        aname = root.xpath(x)
        if not aname:
            aname = root.xpath('''
            //h1[contains(@class, "parseasinTitle")]/following-sibling::*[(name()="a" and @href) or (name()="span" and @class="contributorNameTrigger")]
            ''')
        for x in aname:
            x.tail = ''
        authors = [self.tostring(x, encoding='unicode', method='text').strip()
                   for x in aname]
        authors = [a for a in authors if a]
        return authors
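
    # Illustrative note (not part of the scraper logic): the locale-aware
    # regexes compiled in __init__ normalize rating strings to a 0-5 float.
    # For example, with a hypothetical German rating string:
    #
    #   m = self.ratings_pat.match('4,5 von 5 Sternen')
    #   float(m.group(1).replace(',', '.')) / float(m.group(3)) * 5  # -> 4.5
    #
    # parse_rating() below applies exactly this arithmetic to the strings it
    # finds in the ratings markup.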
    def parse_rating(self, root):
        for x in root.xpath('//div[@id="cpsims-feature" or @id="purchase-sims-feature" or @id="rhf"]'):
            # Remove the similar books section as it can cause spurious
            # ratings matches
            x.getparent().remove(x)

        rating_paths = (
            '//div[@data-feature-name="averageCustomerReviews" or @id="averageCustomerReviews"]',
            '//div[@class="jumpBar"]/descendant::span[contains(@class,"asinReviewsSummary")]',
            '//div[@class="buying"]/descendant::span[contains(@class,"asinReviewsSummary")]',
            '//span[@class="crAvgStars"]/descendant::span[contains(@class,"asinReviewsSummary")]'
        )
        ratings = None
        for p in rating_paths:
            ratings = root.xpath(p)
            if ratings:
                break

        def parse_ratings_text(text):
            try:
                m = self.ratings_pat.match(text)
                return float(m.group(1).replace(',', '.')) / float(m.group(3)) * 5
            except Exception:
                pass

        if ratings:
            ratings = ratings[0]
            for elem in ratings.xpath('descendant::*[@title]'):
                t = elem.get('title').strip()
                if self.domain == 'cn':
                    m = self.ratings_pat_cn.match(t)
                    if m is not None:
                        return float(m.group(1))
                elif self.domain == 'jp':
                    m = self.ratings_pat_jp.match(t)
                    if m is not None:
                        return float(m.group(1))
                else:
                    ans = parse_ratings_text(t)
                    if ans is not None:
                        return ans
            for elem in ratings.xpath('descendant::span[@class="a-icon-alt"]'):
                t = self.tostring(
                    elem, encoding='unicode', method='text', with_tail=False).strip()
                ans = parse_ratings_text(t)
                if ans is not None:
                    return ans

    def _render_comments(self, desc):
        from calibre.library.comments import sanitize_comments_html
        for c in desc.xpath('descendant::noscript'):
            c.getparent().remove(c)
        for c in desc.xpath('descendant::*[@class="seeAll" or'
                            ' @class="emptyClear" or @id="collapsePS" or'
                            ' @id="expandPS"]'):
            c.getparent().remove(c)

        for b in desc.xpath('descendant::b[@style]'):
            # Bing highlights search results
            s = b.get('style', '')
            if 'color' in s:
                b.tag = 'span'
                del b.attrib['style']

        for a in desc.xpath('descendant::a[@href]'):
            del a.attrib['href']
            a.tag = 'span'
        desc = self.tostring(desc, method='html', encoding='unicode').strip()
        desc = xml_replace_entities(desc, 'utf-8')

        # Encoding bug in Amazon data U+fffd (replacement char)
        # in some examples it is present in place of '
        desc = desc.replace('\ufffd', "'")
        # remove all attributes from tags
        desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc)
        # Collapse whitespace
        # desc = re.sub('\n+', '\n', desc)
        # desc = re.sub(' +', ' ', desc)
        # Remove the notice about text referring to out of print editions
        desc = re.sub(r'(?s)<em>--This text ref.*?</em>', '', desc)
        # Remove comments
        desc = re.sub(r'(?s)<!--.*?-->', '', desc)
        return sanitize_comments_html(desc)

    def parse_comments(self, root, raw):
        try:
            from urllib.parse import unquote
        except ImportError:
            from urllib import unquote
        ans = ''
        ns = tuple(self.selector('#bookDescription_feature_div noscript'))
        if ns:
            ns = ns[0]
            if len(ns) == 0 and ns.text:
                import html5lib
                # html5lib parsed noscript as CDATA
                ns = html5lib.parseFragment(
                    '<div>%s</div>' % (ns.text), treebuilder='lxml',
                    namespaceHTMLElements=False)[0]
            else:
                ns.tag = 'div'
            ans = self._render_comments(ns)
        else:
            desc = root.xpath('//div[@id="ps-content"]/div[@class="content"]')
            if desc:
                ans = self._render_comments(desc[0])
            else:
                ns = tuple(self.selector('#bookDescription_feature_div .a-expander-content'))
                if ns:
                    ans = self._render_comments(ns[0])

        desc = root.xpath(
            '//div[@id="productDescription"]/*[@class="content"]')
        if desc:
            ans += self._render_comments(desc[0])
        else:
            # Idiot chickens from amazon strike again. This data is now stored
            # in a JS variable inside a script tag URL encoded.
            m = re.search(br'var\s+iframeContent\s*=\s*"([^"]+)"', raw)
            if m is not None:
                try:
                    text = unquote(m.group(1)).decode('utf-8')
                    nr = parse_html(text)
                    desc = nr.xpath(
                        '//div[@id="productDescription"]/*[@class="content"]')
                    if desc:
                        ans += self._render_comments(desc[0])
                except Exception as e:
                    self.log.warn(
                        'Parsing of obfuscated product description failed with error: %s' % as_unicode(e))
            else:
                desc = root.xpath('//div[@id="productDescription_fullView"]')
                if desc:
                    ans += self._render_comments(desc[0])

        return ans

    def parse_series(self, root):
        ans = (None, None)

        # This is found on the paperback/hardback pages for books on amazon.com
        series = root.xpath('//div[@data-feature-name="seriesTitle"]')
        if series:
            series = series[0]
            spans = series.xpath('./span')
            if spans:
                raw = self.tostring(
                    spans[0], encoding='unicode', method='text', with_tail=False).strip()
                m = re.search(r'\s+([0-9.]+)$', raw.strip())
                if m is not None:
                    series_index = float(m.group(1))
                    s = series.xpath('./a[@id="series-page-link"]')
                    if s:
                        series = self.tostring(
                            s[0], encoding='unicode', method='text', with_tail=False).strip()
                        if series:
                            ans = (series, series_index)
        else:
            series = root.xpath('//div[@id="seriesBulletWidget_feature_div"]')
            if series:
                a = series[0].xpath('descendant::a')
                if a:
                    raw = self.tostring(a[0], encoding='unicode', method='text', with_tail=False)
                    m = re.search(r'(?:Book|Libro|Buch)\s+(?P<index>[0-9.]+)\s+(?:of|de|von)\s+([0-9.]+)\s*:\s*(?P<series>.+)', raw.strip())
                    if m is not None:
                        ans = (m.group('series').strip(), float(m.group('index')))

        # This is found on Kindle edition pages on amazon.com
        if ans == (None, None):
            for span in root.xpath('//div[@id="aboutEbooksSection"]//li/span'):
                text = (span.text or '').strip()
                m = re.match(r'Book\s+([0-9.]+)', text)
                if m is not None:
                    series_index = float(m.group(1))
                    a = span.xpath('./a[@href]')
                    if a:
                        series = self.tostring(
                            a[0], encoding='unicode', method='text', with_tail=False).strip()
                        if series:
                            ans = (series, series_index)

        # This is found on newer Kindle edition pages on amazon.com
        if ans == (None, None):
            for b in root.xpath('//div[@id="reviewFeatureGroup"]/span/b'):
                text = (b.text or '').strip()
                m = re.match(r'Book\s+([0-9.]+)', text)
                if m is not None:
                    series_index = float(m.group(1))
                    a = b.getparent().xpath('./a[@href]')
                    if a:
                        series = self.tostring(
                            a[0], encoding='unicode', method='text', with_tail=False).partition('(')[0].strip()
                        if series:
                            ans = series, series_index

        if ans == (None, None):
            desc = root.xpath('//div[@id="ps-content"]/div[@class="buying"]')
            if desc:
                raw = self.tostring(desc[0], method='text', encoding='unicode')
                raw = re.sub(r'\s+', ' ', raw)
                match = self.series_pat.search(raw)
                if match is not None:
                    s, i = match.group('series'), float(match.group('index'))
                    if s:
                        ans = (s, i)
        if ans[0]:
            ans = (re.sub(r'\s+Series$', '', ans[0]).strip(), ans[1])
            ans = (re.sub(r'\(.+?\s+Series\)$', '', ans[0]).strip(), ans[1])
        return ans

    def parse_tags(self, root):
        ans = []
        exclude_tokens = {'kindle', 'a-z'}
        exclude = {'special features', 'by authors', 'authors & illustrators',
                   'books', 'new; used & rental textbooks'}
        seen = set()
        for li in root.xpath(self.tags_xpath):
            for i, a in enumerate(li.iterdescendants('a')):
                if i > 0:
                    # we ignore the first category since it is almost always
                    # too broad
                    raw = (a.text or '').strip().replace(',', ';')
                    lraw = icu_lower(raw)
                    tokens = frozenset(lraw.split())
                    if raw and lraw not in exclude and not tokens.intersection(exclude_tokens) and lraw not in seen:
                        ans.append(raw)
                        seen.add(lraw)
        return ans
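
    # Sketch of the widest-image heuristic used by parse_cover() below:
    # Amazon pages often carry a JSON map of image URL -> [width, height] in
    # the data-a-dynamic-image attribute, and the widest entry is taken as
    # the cover. The data here is hypothetical, for illustration only:
    #
    #   idata = {'https://m.media-amazon.com/I/x._SX300_.jpg': [300, 450],
    #            'https://m.media-amazon.com/I/x._SX450_.jpg': [450, 675]}
    #   max(idata, key=lambda u: idata[u][0])  # -> the ..._SX450_.jpg URL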
    def parse_cover(self, root, raw=b""):
        # Look for the image URL in javascript, using the first image in the
        # image gallery as the cover
        import json
        imgpat = re.compile(r"""'imageGalleryData'\s*:\s*(\[\s*{.+])""")
        for script in root.xpath('//script'):
            m = imgpat.search(script.text or '')
            if m is not None:
                try:
                    return json.loads(m.group(1))[0]['mainUrl']
                except Exception:
                    continue

        def clean_img_src(src):
            parts = src.split('/')
            if len(parts) > 3:
                bn = parts[-1]
                sparts = bn.split('_')
                if len(sparts) > 2:
                    bn = re.sub(r'\.\.jpg$', '.jpg', (sparts[0] + sparts[-1]))
                    return ('/'.join(parts[:-1])) + '/' + bn

        imgpat2 = re.compile(r'var imageSrc = "([^"]+)"')
        for script in root.xpath('//script'):
            m = imgpat2.search(script.text or '')
            if m is not None:
                src = m.group(1)
                url = clean_img_src(src)
                if url:
                    return url

        imgs = root.xpath(
            '//img[(@id="prodImage" or @id="original-main-image" or @id="main-image" or @id="main-image-nonjs") and @src]')
        if not imgs:
            imgs = (
                root.xpath('//div[@class="main-image-inner-wrapper"]/img[@src]') or
                root.xpath('//div[@id="main-image-container" or @id="ebooks-main-image-container"]//img[@src]') or
                root.xpath(
                    '//div[@id="mainImageContainer"]//img[@data-a-dynamic-image]')
            )
            for img in imgs:
                try:
                    idata = json.loads(img.get('data-a-dynamic-image'))
                except Exception:
                    imgs = ()
                else:
                    mwidth = 0
                    try:
                        url = None
                        for iurl, (width, height) in idata.items():
                            if width > mwidth:
                                mwidth = width
                                url = iurl
                        return url
                    except Exception:
                        pass

        for img in imgs:
            src = img.get('src')
            if 'data:' in src:
                continue
            if 'loading-' in src:
                js_img = re.search(br'"largeImage":"(https?://[^"]+)",', raw)
                if js_img:
                    src = js_img.group(1).decode('utf-8')
            if ('/no-image-avail' not in src and 'loading-' not in src and
                    '/no-img-sm' not in src):
                self.log('Found image: %s' % src)
                url = clean_img_src(src)
                if url:
                    return url

    def parse_detail_bullets(self, root, mi, container):
        ul = next(self.selector('.detail-bullet-list', root=container))
        for span in self.selector('.a-list-item', root=ul):
            cells = span.xpath('./span')
            if len(cells) >= 2:
                self.parse_detail_cells(mi, cells[0], cells[1])

    def parse_new_details(self, root, mi, non_hero):
        table = non_hero.xpath('descendant::table')[0]
        for tr in table.xpath('descendant::tr'):
            cells = tr.xpath('descendant::*[local-name()="td" or local-name()="th"]')
            if len(cells) == 2:
                self.parse_detail_cells(mi, cells[0], cells[1])

    def parse_detail_cells(self, mi, c1, c2):
        name = self.totext(c1, only_printable=True).strip().strip(':').strip()
        val = self.totext(c2).strip()
        if not val:
            return
        if name in self.language_names:
            ans = self.lang_map.get(val, None)
            if not ans:
                ans = canonicalize_lang(val)
            if ans:
                mi.language = ans
        elif name in self.publisher_names:
            pub = val.partition(';')[0].partition('(')[0].strip()
            if pub:
                mi.publisher = pub
            date = val.rpartition('(')[-1].replace(')', '').strip()
            try:
                from calibre.utils.date import parse_only_date
                date = self.delocalize_datestr(date)
                mi.pubdate = parse_only_date(date, assume_utc=True)
            except:
                self.log.exception('Failed to parse pubdate: %s' % val)
        elif name in {'ISBN', 'ISBN-10', 'ISBN-13'}:
            ans = check_isbn(val)
            if ans:
                self.isbn = mi.isbn = ans
        elif name in {'Publication date'}:
            from calibre.utils.date import parse_only_date
            date = self.delocalize_datestr(val)
            mi.pubdate = parse_only_date(date, assume_utc=True)

    def parse_isbn(self, pd):
        items = pd.xpath(
            'descendant::*[starts-with(text(), "ISBN")]')
        if not items:
            items = pd.xpath(
                'descendant::b[contains(text(), "ISBN:")]')
        for x in reversed(items):
            if x.tail:
                ans = check_isbn(x.tail.strip())
                if ans:
                    return ans
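
    # The detail-block parsers below all rely on lxml's .tail attribute: in
    # markup like <li><b>Publisher:</b> Foo (Jan 1, 2011)</li> the label is
    # the element's text and the value is its tail. A minimal standalone
    # sketch (hypothetical markup, not taken from a live page):
    #
    #   from lxml.html import fragment_fromstring
    #   li = fragment_fromstring('<li><b>Publisher:</b> Foo (Jan 1, 2011)</li>')
    #   li[0].tail.strip()  # -> 'Foo (Jan 1, 2011)'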
    def parse_publisher(self, pd):
        for x in reversed(pd.xpath(self.publisher_xpath)):
            if x.tail:
                ans = x.tail.partition(';')[0]
                return ans.partition('(')[0].strip()

    def parse_pubdate(self, pd):
        from calibre.utils.date import parse_only_date
        for x in reversed(pd.xpath(self.pubdate_xpath)):
            if x.tail:
                date = x.tail.strip()
                date = self.delocalize_datestr(date)
                try:
                    return parse_only_date(date, assume_utc=True)
                except Exception:
                    pass
        for x in reversed(pd.xpath(self.publisher_xpath)):
            if x.tail:
                ans = x.tail
                date = ans.rpartition('(')[-1].replace(')', '').strip()
                date = self.delocalize_datestr(date)
                try:
                    return parse_only_date(date, assume_utc=True)
                except Exception:
                    pass

    def parse_language(self, pd):
        for x in reversed(pd.xpath(self.language_xpath)):
            if x.tail:
                raw = x.tail.strip().partition(',')[0].strip()
                ans = self.lang_map.get(raw, None)
                if ans:
                    return ans
                ans = canonicalize_lang(ans)
                if ans:
                    return ans
    # }}}


class Amazon(Source):

    name = 'Amazon.com'
    version = (1, 2, 23)
    minimum_calibre_version = (2, 82, 0)
    description = _('Downloads metadata and covers from Amazon')

    capabilities = frozenset(('identify', 'cover'))
    touched_fields = frozenset(('title', 'authors', 'identifier:amazon',
                                'rating', 'comments', 'publisher', 'pubdate',
                                'languages', 'series', 'tags'))
    has_html_comments = True
    supports_gzip_transfer_encoding = True
    prefer_results_with_isbn = False

    AMAZON_DOMAINS = {
        'com': _('US'),
        'fr': _('France'),
        'de': _('Germany'),
        'uk': _('UK'),
        'au': _('Australia'),
        'it': _('Italy'),
        'jp': _('Japan'),
        'es': _('Spain'),
        'br': _('Brazil'),
        'nl': _('Netherlands'),
        'cn': _('China'),
        'ca': _('Canada'),
        'se': _('Sweden'),
    }

    SERVERS = {
        'auto': _('Choose server automatically'),
        'amazon': _('Amazon servers'),
        'bing': _('Bing search cache'),
        'google': _('Google search cache'),
        'wayback': _('Wayback machine cache (slow)'),
    }

    options = (
        Option('domain', 'choices', 'com', _('Amazon country website to use:'),
               _('Metadata from Amazon will be fetched using this '
                 'country\'s Amazon website.'), choices=AMAZON_DOMAINS),
        Option('server', 'choices', 'auto', _('Server to get data from:'),
               _('Amazon has started blocking attempts to download'
                 ' metadata from its servers. To get around this problem,'
                 ' calibre can fetch the Amazon data from many different'
                 ' places where it is cached. Choose the source you prefer.'),
               choices=SERVERS),
        Option('use_mobi_asin', 'bool', False, _('Use the MOBI-ASIN for metadata search'),
               _('Enable this option to search for metadata with an'
                 ' ASIN identifier from the MOBI file at the current country website,'
                 ' unless any other amazon id is available. Note that if the'
                 ' MOBI file came from a different Amazon country store, you could get'
                 ' incorrect results.')),
    )
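
    # The values configured above are read back through self.prefs:
    # self.prefs['domain'] is a key of AMAZON_DOMAINS ('com' by default) and
    # self.prefs['server'] a key of SERVERS; see the domain and server
    # properties below, which fall back to the defaults for invalid values.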
    def __init__(self, *args, **kwargs):
        Source.__init__(self, *args, **kwargs)
        self.set_amazon_id_touched_fields()

    def test_fields(self, mi):
        '''
        Return the first field from self.touched_fields that is null on the
        mi object
        '''
        for key in self.touched_fields:
            if key.startswith('identifier:'):
                key = key.partition(':')[-1]
                if key == 'amazon':
                    if self.domain != 'com':
                        key += '_' + self.domain
                if not mi.has_identifier(key):
                    return 'identifier: ' + key
            elif mi.is_null(key):
                return key

    @property
    def browser(self):
        br = self._browser
        if br is None:
            ua = 'Mobile '
            while not user_agent_is_ok(ua):
                ua = random_user_agent(allow_ie=False)
            # ua = 'Mozilla/5.0 (Linux; Android 8.0.0; VTR-L29; rv:63.0) Gecko/20100101 Firefox/63.0'
            self._browser = br = browser(user_agent=ua)
            br.set_handle_gzip(True)
            if self.use_search_engine:
                br.addheaders += [
                    ('Accept', accept_header_for_ua(ua)),
                    ('Upgrade-insecure-requests', '1'),
                ]
            else:
                br.addheaders += [
                    ('Accept', accept_header_for_ua(ua)),
                    ('Upgrade-insecure-requests', '1'),
                    ('Referer', self.referrer_for_domain()),
                ]
        return br

    def save_settings(self, *args, **kwargs):
        Source.save_settings(self, *args, **kwargs)
        self.set_amazon_id_touched_fields()

    def set_amazon_id_touched_fields(self):
        ident_name = "identifier:amazon"
        if self.domain != 'com':
            ident_name += '_' + self.domain
        tf = [x for x in self.touched_fields if not
              x.startswith('identifier:amazon')] + [ident_name]
        self.touched_fields = frozenset(tf)

    def get_domain_and_asin(self, identifiers, extra_domains=()):
        identifiers = {k.lower(): v for k, v in identifiers.items()}
        for key, val in identifiers.items():
            if key in ('amazon', 'asin'):
                return 'com', val
            if key.startswith('amazon_'):
                domain = key.partition('_')[-1]
                if domain and (domain in self.AMAZON_DOMAINS or domain in extra_domains):
                    return domain, val
        if self.prefs['use_mobi_asin']:
            val = identifiers.get('mobi-asin')
            if val is not None:
                return self.domain, val
        return None, None

    def referrer_for_domain(self, domain=None):
        domain = domain or self.domain
        return {
            'uk': 'https://www.amazon.co.uk/',
            'au': 'https://www.amazon.com.au/',
            'br': 'https://www.amazon.com.br/',
            'jp': 'https://www.amazon.co.jp/',
        }.get(domain, 'https://www.amazon.%s/' % domain)

    def _get_book_url(self, identifiers):  # {{{
        domain, asin = self.get_domain_and_asin(
            identifiers, extra_domains=('in', 'au', 'ca'))
        if domain and asin:
            url = None
            r = self.referrer_for_domain(domain)
            if r is not None:
                url = r + 'dp/' + asin
            if url:
                idtype = 'amazon' if domain == 'com' else 'amazon_' + domain
                return domain, idtype, asin, url

    def get_book_url(self, identifiers):
        ans = self._get_book_url(identifiers)
        if ans is not None:
            return ans[1:]

    def get_book_url_name(self, idtype, idval, url):
        if idtype == 'amazon':
            return self.name
        return 'A' + idtype.replace('_', '.')[1:]
    # }}}

    @property
    def domain(self):
        x = getattr(self, 'testing_domain', None)
        if x is not None:
            return x
        domain = self.prefs['domain']
        if domain not in self.AMAZON_DOMAINS:
            domain = 'com'
        return domain

    @property
    def server(self):
        x = getattr(self, 'testing_server', None)
        if x is not None:
            return x
        server = self.prefs['server']
        if server not in self.SERVERS:
            server = 'auto'
        return server

    @property
    def use_search_engine(self):
        return self.server != 'amazon'
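
    # clean_downloaded_metadata() below strips Amazon's series cruft from
    # titles when the book is (probably) English. A hypothetical example of
    # the first regex at work:
    #
    #   title:  'Three Parts Dead (Craft Sequence Book 1)'
    #   re.search(r'\S+\s+(\(.+?\s+Book\s+\d+\))$', title).group(1)
    #       -> '(Craft Sequence Book 1)', which is then removed from the title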
    def clean_downloaded_metadata(self, mi):
        docase = (
            mi.language == 'eng' or
            (mi.is_null('language') and self.domain in {'com', 'uk', 'au'})
        )
        if mi.title and docase:
            # Remove series information from title
            m = re.search(r'\S+\s+(\(.+?\s+Book\s+\d+\))$', mi.title)
            if m is not None:
                mi.title = mi.title.replace(m.group(1), '').strip()
            mi.title = fixcase(mi.title)
        mi.authors = fixauthors(mi.authors)
        if mi.tags and docase:
            mi.tags = list(map(fixcase, mi.tags))
        mi.isbn = check_isbn(mi.isbn)
        if mi.series and docase:
            mi.series = fixcase(mi.series)
        if mi.title and mi.series:
            for pat in (r':\s*Book\s+\d+\s+of\s+%s$', r'\(%s\)$', r':\s*%s\s+Book\s+\d+$'):
                pat = pat % re.escape(mi.series)
                q = re.sub(pat, '', mi.title, flags=re.I).strip()
                if q and q != mi.title:
                    mi.title = q
                    break

    def get_website_domain(self, domain):
        return {'uk': 'co.uk', 'jp': 'co.jp', 'br': 'com.br', 'au': 'com.au'}.get(domain, domain)

    def create_query(self, log, title=None, authors=None, identifiers={},  # {{{
                     domain=None, for_amazon=True):
        try:
            from urllib.parse import urlencode, unquote_plus
        except ImportError:
            from urllib import urlencode, unquote_plus
        if domain is None:
            domain = self.domain

        idomain, asin = self.get_domain_and_asin(identifiers)
        if idomain is not None:
            domain = idomain

        # See the amazon detailed search page to get all options
        terms = []
        q = {'search-alias': 'aps',
             'unfiltered': '1',
             }

        if domain == 'com':
            q['sort'] = 'relevanceexprank'
        else:
            q['sort'] = 'relevancerank'

        isbn = check_isbn(identifiers.get('isbn', None))

        if asin is not None:
            q['field-keywords'] = asin
            terms.append(asin)
        elif isbn is not None:
            q['field-isbn'] = isbn
            if len(isbn) == 13:
                terms.extend('({} OR {}-{})'.format(isbn, isbn[:3], isbn[3:]).split())
            else:
                terms.append(isbn)
        else:
            # Only return book results
            q['search-alias'] = {'br': 'digital-text',
                                 'nl': 'aps'}.get(domain, 'stripbooks')
            if title:
                title_tokens = list(self.get_title_tokens(title))
                if title_tokens:
                    q['field-title'] = ' '.join(title_tokens)
                    terms.extend(title_tokens)
            if authors:
                author_tokens = list(self.get_author_tokens(authors,
                                                            only_first_author=True))
                if author_tokens:
                    q['field-author'] = ' '.join(author_tokens)
                    terms.extend(author_tokens)

        if not ('field-keywords' in q or 'field-isbn' in q or
                ('field-title' in q)):
            # Insufficient metadata to make an identify query
            return None, None

        if not for_amazon:
            return terms, domain

        if domain == 'nl':
            q['__mk_nl_NL'] = 'ÅMÅŽÕÑ'
            if 'field-keywords' not in q:
                q['field-keywords'] = ''
            for f in 'field-isbn field-title field-author'.split():
                q['field-keywords'] += ' ' + q.pop(f, '')
            q['field-keywords'] = q['field-keywords'].strip()

        encoded_q = dict([(x.encode('utf-8', 'ignore'), y.encode(
            'utf-8', 'ignore')) for x, y in q.items()])
        url_query = urlencode(encoded_q)
        # amazon's servers want IRIs with unicode characters not percent escaped
        parts = []
        for x in url_query.split(b'&' if isinstance(url_query, bytes) else '&'):
            k, v = x.split(b'=' if isinstance(x, bytes) else '=', 1)
            parts.append('{}={}'.format(iri_quote_plus(unquote_plus(k)),
                                        iri_quote_plus(unquote_plus(v))))
        url_query = '&'.join(parts)
        url = 'https://www.amazon.%s/s/?' % self.get_website_domain(
            domain) + url_query
        return url, domain
    # }}}
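
    # A sketch of what create_query() returns for a plain title/author lookup
    # on the 'com' domain (the exact token set depends on get_title_tokens()
    # and get_author_tokens(); this URL is illustrative, not a captured
    # request):
    #
    #   https://www.amazon.com/s/?search-alias=stripbooks&unfiltered=1&
    #       sort=relevanceexprank&field-title=dune&field-author=herbert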
    def get_cached_cover_url(self, identifiers):  # {{{
        url = None
        domain, asin = self.get_domain_and_asin(identifiers)
        if asin is None:
            isbn = identifiers.get('isbn', None)
            if isbn is not None:
                asin = self.cached_isbn_to_identifier(isbn)
        if asin is not None:
            url = self.cached_identifier_to_cover_url(asin)
        return url
    # }}}

    def parse_results_page(self, root, domain):  # {{{
        from lxml.html import tostring
        matches = []

        def title_ok(title):
            title = title.lower()
            bad = ['bulk pack', '[audiobook]', '[audio cd]',
                   '(a book companion)', '( slipcase with door )', ': free sampler']
            if self.domain == 'com':
                bad.extend(['(%s edition)' % x for x in ('spanish', 'german')])
            for x in bad:
                if x in title:
                    return False
            if title and title[0] in '[{' and re.search(r'\(\s*author\s*\)', title) is not None:
                # Bad entries in the catalog
                return False
            return True

        for query in (
                '//div[contains(@class, "s-result-list")]//h2/a[@href]',
                '//div[contains(@class, "s-result-list")]//div[@data-index]//h5//a[@href]',
                r'//li[starts-with(@id, "result_")]//a[@href and contains(@class, "s-access-detail-page")]',
        ):
            result_links = root.xpath(query)
            if result_links:
                break
        for a in result_links:
            title = tostring(a, method='text', encoding='unicode')
            if title_ok(title):
                url = a.get('href')
                if url.startswith('/'):
                    url = 'https://www.amazon.%s%s' % (
                        self.get_website_domain(domain), url)
                matches.append(url)

        if not matches:
            # Previous generation of results page markup
            for div in root.xpath(r'//div[starts-with(@id, "result_")]'):
                links = div.xpath(r'descendant::a[@class="title" and @href]')
                if not links:
                    # New amazon markup
                    links = div.xpath('descendant::h3/a[@href]')
                for a in links:
                    title = tostring(a, method='text', encoding='unicode')
                    if title_ok(title):
                        url = a.get('href')
                        if url.startswith('/'):
                            url = 'https://www.amazon.%s%s' % (
                                self.get_website_domain(domain), url)
                        matches.append(url)
                        break

        if not matches:
            # This can happen for some user agents that Amazon thinks are
            # mobile/less capable
            for td in root.xpath(
                    r'//div[@id="Results"]/descendant::td[starts-with(@id, "search:Td:")]'):
                for a in td.xpath(r'descendant::td[@class="dataColumn"]/descendant::a[@href]/span[@class="srTitle"]/..'):
                    title = tostring(a, method='text', encoding='unicode')
                    if title_ok(title):
                        url = a.get('href')
                        if url.startswith('/'):
                            url = 'https://www.amazon.%s%s' % (
                                self.get_website_domain(domain), url)
                        matches.append(url)
                        break

        if not matches and root.xpath('//form[@action="/errors/validateCaptcha"]'):
            raise CaptchaError('Amazon returned a CAPTCHA page. Recently Amazon has begun using statistical'
                               ' profiling to block access to its website. As such this metadata plugin is'
                               ' unlikely to ever work reliably.')

        # Keep only the top 3 matches as the matches are sorted by relevance by
        # Amazon so lower matches are not likely to be very relevant
        return matches[:3]
    # }}}

    def search_amazon(self, br, testing, log, abort, title, authors, identifiers, timeout):  # {{{
        from calibre.utils.cleantext import clean_ascii_chars
        from calibre.ebooks.chardet import xml_to_unicode
        matches = []
        query, domain = self.create_query(log, title=title, authors=authors,
                                          identifiers=identifiers)
        if query is None:
            log.error('Insufficient metadata to construct query')
            raise SearchFailed()
        try:
            raw = br.open_novisit(query, timeout=timeout).read().strip()
        except Exception as e:
            if callable(getattr(e, 'getcode', None)) and \
                    e.getcode() == 404:
                log.error('Query malformed: %r' % query)
                raise SearchFailed()
            attr = getattr(e, 'args', [None])
            attr = attr if attr else [None]
            if isinstance(attr[0], socket.timeout):
                msg = _('Amazon timed out. Try again later.')
                log.error(msg)
            else:
                msg = 'Failed to make identify query: %r' % query
                log.exception(msg)
            raise SearchFailed()

        raw = clean_ascii_chars(xml_to_unicode(raw,
                                               strip_encoding_pats=True,
                                               resolve_entities=True)[0])

        if testing:
            import tempfile
            with tempfile.NamedTemporaryFile(prefix='amazon_results_',
                                             suffix='.html', delete=False) as f:
                f.write(raw.encode('utf-8'))
            print('Downloaded html for results page saved in', f.name)

        matches = []
        found = '<title>404 - ' not in raw

        if found:
            try:
                root = parse_html(raw)
            except Exception:
                msg = 'Failed to parse amazon page for query: %r' % query
                log.exception(msg)
                raise SearchFailed()
            matches = self.parse_results_page(root, domain)

        return matches, query, domain, None
    # }}}

    def search_search_engine(self, br, testing, log, abort, title, authors, identifiers, timeout, override_server=None):  # {{{
        from calibre.ebooks.metadata.sources.update import search_engines_module
        terms, domain = self.create_query(log, title=title, authors=authors,
                                          identifiers=identifiers, for_amazon=False)
        site = self.referrer_for_domain(
            domain)[len('https://'):].partition('/')[0]
        matches = []
        se = search_engines_module()
        server = override_server or self.server
        if server in ('bing',):
            urlproc, sfunc = se.bing_url_processor, se.bing_search
        elif server in ('auto', 'google'):
            urlproc, sfunc = se.google_url_processor, se.google_search
        elif server == 'wayback':
            urlproc, sfunc = se.wayback_url_processor, se.ddg_search
        results, qurl = sfunc(terms, site, log=log, br=br, timeout=timeout)
        br.set_current_header('Referer', qurl)
        for result in results:
            if abort.is_set():
                return matches, terms, domain, None

            purl = urlparse(result.url)
            if '/dp/' in purl.path and site in purl.netloc:
                url = result.cached_url
                if url is None:
                    url = se.wayback_machine_cached_url(
                        result.url, br, timeout=timeout)
                if url is None:
                    log('Failed to find cached page for:', result.url)
                    continue
                if url not in matches:
                    matches.append(url)
                if len(matches) >= 3:
                    break
            else:
                log('Skipping non-book result:', result)
        if not matches:
            log('No search engine results for terms:', ' '.join(terms))
            if urlproc is se.google_url_processor:
                # Google does not cache adult titles
                log('Trying the bing search engine instead')
                return self.search_search_engine(br, testing, log, abort, title, authors, identifiers, timeout, 'bing')
        return matches, terms, domain, urlproc
    # }}}
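
    # identify() below ties the pieces together: a direct details-page fetch
    # when an ASIN identifier is available and the Amazon server is in use,
    # otherwise search_amazon() or search_search_engine() depending on the
    # configured server, then one Worker per result URL (at most three),
    # started one second apart to avoid hammering the server.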
    def identify(self, log, result_queue, abort, title=None, authors=None,  # {{{
                 identifiers={}, timeout=60):
        '''
        Note this method will retry without identifiers automatically if no
        match is found with identifiers.
        '''
        testing = getattr(self, 'running_a_test', False)

        udata = self._get_book_url(identifiers)
        br = self.browser
        log('User-agent:', br.current_user_agent())
        log('Server:', self.server)
        if testing:
            print('User-agent:', br.current_user_agent())
        if udata is not None and not self.use_search_engine:
            # Try to directly get details page instead of running a search
            # Cannot use search engine as the directly constructed URL is
            # usually redirected to a full URL by amazon, and is therefore
            # not cached
            domain, idtype, asin, durl = udata
            if durl is not None:
                preparsed_root = parse_details_page(
                    durl, log, timeout, br, domain)
                if preparsed_root is not None:
                    qasin = parse_asin(preparsed_root[1], log, durl)
                    if qasin == asin:
                        w = Worker(durl, result_queue, br, log, 0, domain,
                                   self, testing=testing,
                                   preparsed_root=preparsed_root, timeout=timeout)
                        try:
                            w.get_details()
                            return
                        except Exception:
                            log.exception(
                                'get_details failed for url: %r' % durl)
        func = self.search_search_engine if self.use_search_engine else self.search_amazon
        try:
            matches, query, domain, cover_url_processor = func(
                br, testing, log, abort, title, authors, identifiers, timeout)
        except SearchFailed:
            return

        if abort.is_set():
            return

        if not matches:
            if identifiers and title and authors:
                log('No matches found with identifiers, retrying using only'
                    ' title and authors. Query: %r' % query)
                time.sleep(1)
                return self.identify(log, result_queue, abort, title=title,
                                     authors=authors, timeout=timeout)
            log.error('No matches found with query: %r' % query)
            return

        workers = [Worker(
            url, result_queue, br, log, i, domain, self, testing=testing,
            timeout=timeout, cover_url_processor=cover_url_processor,
            filter_result=partial(self.filter_result, title, authors, identifiers))
            for i, url in enumerate(matches)]

        for w in workers:
            # Don't send all requests at the same time
            time.sleep(1)
            w.start()
            if abort.is_set():
                return

        while not abort.is_set():
            a_worker_is_alive = False
            for w in workers:
                w.join(0.2)
                if abort.is_set():
                    break
                if w.is_alive():
                    a_worker_is_alive = True
            if not a_worker_is_alive:
                break

        return None
    # }}}

    def filter_result(self, title, authors, identifiers, mi, log):  # {{{
        if not self.use_search_engine:
            return True
        if title is not None:
            def tokenize_title(x):
                return icu_lower(x).replace("'", '').replace('"', '').rstrip(':')

            tokens = {tokenize_title(x) for x in title.split() if len(x) > 3}
            if tokens:
                result_tokens = {tokenize_title(x) for x in mi.title.split()}
                if not tokens.intersection(result_tokens):
                    log('Ignoring result:', mi.title, 'as its title does not match')
                    return False
        if authors:
            author_tokens = set()
            for author in authors:
                author_tokens |= {icu_lower(x) for x in author.split() if len(x) > 2}
            result_tokens = set()
            for author in mi.authors:
                result_tokens |= {icu_lower(x) for x in author.split() if len(x) > 2}
            if author_tokens and not author_tokens.intersection(result_tokens):
                log('Ignoring result:', mi.title, 'by', ' & '.join(mi.authors),
                    'as its author does not match')
                return False
        return True
    # }}}
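
    # How filter_result() matches, with hypothetical values: for the query
    # title 'The Blood of Olympus', tokenize_title() keeps words longer than
    # three characters, giving {'blood', 'olympus'}; a search-engine hit
    # whose title shares no token with that set is discarded, and author
    # tokens are checked the same way (words longer than two characters).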
    def download_cover(self, log, result_queue, abort,  # {{{
                       title=None, authors=None, identifiers={}, timeout=60,
                       get_best_cover=False):
        cached_url = self.get_cached_cover_url(identifiers)
        if cached_url is None:
            log.info('No cached cover found, running identify')
            rq = Queue()
            self.identify(log, rq, abort, title=title, authors=authors,
                          identifiers=identifiers)
            if abort.is_set():
                return
            results = []
            while True:
                try:
                    results.append(rq.get_nowait())
                except Empty:
                    break
            results.sort(key=self.identify_results_keygen(
                title=title, authors=authors, identifiers=identifiers))
            for mi in results:
                cached_url = self.get_cached_cover_url(mi.identifiers)
                if cached_url is not None:
                    break
        if cached_url is None:
            log.info('No cover found')
            return

        if abort.is_set():
            return
        log('Downloading cover from:', cached_url)
        br = self.browser
        if self.use_search_engine:
            br = br.clone_browser()
            br.set_current_header('Referer', self.referrer_for_domain(self.domain))
        try:
            time.sleep(1)
            cdata = br.open_novisit(
                cached_url, timeout=timeout).read()
            result_queue.put((self, cdata))
        except:
            log.exception('Failed to download cover from:', cached_url)
    # }}}


def manual_tests(domain, **kw):  # {{{
    # To run these tests, use:
    # calibre-debug -c "from calibre.ebooks.metadata.sources.amazon import *; manual_tests('com')"
    from calibre.ebooks.metadata.sources.test import (test_identify_plugin,
                                                      isbn_test, title_test,
                                                      authors_test,
                                                      comments_test, series_test)
    all_tests = {}
    all_tests['com'] = [  # {{{
        (   # Paperback with series
            {'identifiers': {'amazon': '1423146786'}},
            [title_test('The Heroes of Olympus, Book Five The Blood of Olympus',
                        exact=True),
             series_test('The Heroes of Olympus', 5)]
        ),

        (   # Kindle edition with series
            {'identifiers': {'amazon': 'B0085UEQDO'}},
            [title_test('Three Parts Dead', exact=True),
             series_test('Craft Sequence', 1)]
        ),

        (   # + in title and uses id="main-image" for cover
            {'identifiers': {'amazon': '1933988770'}},
            [title_test(
                'C++ Concurrency in Action: Practical Multithreading', exact=True)]
        ),

        (   # Different comments markup, using Book Description section
            {'identifiers': {'amazon': '0982514506'}},
            [title_test(
                "Griffin's Destiny: Book Three: The Griffin's Daughter Trilogy",
                exact=True),
             comments_test('Jelena'), comments_test('Ashinji'), ]
        ),

        (   # # in title
            {'title': 'Expert C# 2008 Business Objects',
             'authors': ['Lhotka']},
            [title_test('Expert C#'),
             authors_test(['Rockford Lhotka'])]
        ),

        (   # No specific problems
            {'identifiers': {'isbn': '0743273567'}},
            [title_test('the great gatsby: the only authorized edition',
                        exact=True),
             authors_test(['Francis Scott Fitzgerald'])]
        ),
    ]  # }}}

    all_tests['de'] = [  # {{{
        (   # series
            {'identifiers': {'isbn': '3499275120'}},
            [title_test('Vespasian: Das Schwert des Tribuns: Historischer Roman',
                        exact=False),
             authors_test(['Robert Fabbri']),
             series_test('Die Vespasian-Reihe', 1)]
        ),

        (   # umlaut in title/authors
            {'title': 'Flüsternde Wälder',
             'authors': ['Nicola Förg']},
            [title_test('Flüsternde Wälder'),
             authors_test(['Nicola Förg'], subset=True)]
        ),

        (
            {'identifiers': {'isbn': '9783453314979'}},
            [title_test('Die letzten Wächter: Roman', exact=False),
             authors_test(['Sergej Lukianenko'])]
        ),

        (
            {'identifiers': {'isbn': '3548283519'}},
            [title_test('Wer Wind Sät: Der Fünfte Fall Für Bodenstein Und Kirchhoff',
                        exact=False),
             authors_test(['Nele Neuhaus'])]
        ),
    ]  # }}}

    all_tests['it'] = [  # {{{
        (
            {'identifiers': {'isbn': '8838922195'}},
            [title_test('La briscola in cinque', exact=True),
             authors_test(['Marco Malvaldi'])]
        ),
    ]  # }}}

    all_tests['fr'] = [  # {{{
        (
            {'identifiers': {'amazon_fr': 'B07L7ST4RS'}},
            [title_test('Le secret de Lola', exact=True),
             authors_test(['Amélie BRIZIO'])]
        ),
        (
            {'identifiers': {'isbn': '2221116798'}},
            [title_test('L\'étrange voyage de Monsieur Daldry', exact=True),
             authors_test(['Marc Levy'])]
        ),
    ]  # }}}

    all_tests['es'] = [  # {{{
        (
            {'identifiers': {'isbn': '8483460831'}},
            [title_test('Tiempos Interesantes', exact=False),
             authors_test(['Terry Pratchett'])]
        ),
    ]  # }}}

    all_tests['se'] = [  # {{{
        (
            {'identifiers': {'isbn': '9780552140287'}},
            [title_test('Men At Arms: A Discworld Novel: 14',
                        exact=False),
             authors_test(['Terry Pratchett'])]
        ),
    ]  # }}}

    all_tests['jp'] = [  # {{{
        (   # Adult filtering test
            {'identifiers': {'isbn': '4799500066'}},
            [title_test('Bitch Trap'), ]
        ),

        (   # isbn -> title, authors
            {'identifiers': {'isbn': '9784101302720'}},
            [title_test('精霊の守り人', exact=True),
             authors_test(['上橋 菜穂子'])]
        ),

        (   # title, authors -> isbn (will use Shift_JIS encoding in query.)
            {'title': '考えない練習',
             'authors': ['小池 龍之介']},
            [isbn_test('9784093881067'), ]
        ),
    ]  # }}}

    all_tests['br'] = [  # {{{
        (
            {'title': 'Guerra dos Tronos'},
            [title_test('A Guerra dos Tronos - As Crônicas de Gelo e Fogo',
                        exact=True),
             authors_test(['George R. R. Martin'])]
        ),
    ]  # }}}

    all_tests['nl'] = [  # {{{
        (
            {'title': 'Freakonomics'},
            [title_test('Freakonomics', exact=True),
             authors_test(['Steven Levitt & Stephen Dubner & R. Kuitenbrouwer & O. Brenninkmeijer & A. van Den Berg'])]
        ),
    ]  # }}}

    all_tests['cn'] = [  # {{{
        (
            {'identifiers': {'isbn': '9787115369512'}},
            [title_test('若为自由故 自由软件之父理查德斯托曼传', exact=True),
             authors_test(['[美]sam Williams', '邓楠,李凡希'])]
        ),
        (
            {'title': '爱上Raspberry Pi'},
            [title_test('爱上Raspberry Pi', exact=True),
             authors_test(['Matt Richardson', 'Shawn Wallace', '李凡希'])]
        ),
    ]  # }}}

    all_tests['ca'] = [  # {{{
        (   # Paperback with series
            {'identifiers': {'isbn': '9781623808747'}},
            [title_test('Parting Shot', exact=True),
             authors_test(['Mary Calmes'])]
        ),
        (   # # in title
            {'title': 'Expert C# 2008 Business Objects',
             'authors': ['Lhotka']},
            [title_test('Expert C# 2008 Business Objects'),
             authors_test(['Rockford Lhotka'])]
        ),
        (   # noscript description
            {'identifiers': {'amazon_ca': '162380874X'}},
            [title_test('Parting Shot', exact=True),
             authors_test(['Mary Calmes'])]
        ),
    ]  # }}}

    def do_test(domain, start=0, stop=None, server='auto'):
        tests = all_tests[domain]
        if stop is None:
            stop = len(tests)
        tests = tests[start:stop]
        test_identify_plugin(Amazon.name, tests, modify_plugin=lambda p: (
            setattr(p, 'testing_domain', domain),
            setattr(p, 'touched_fields', p.touched_fields - {'tags'}),
            setattr(p, 'testing_server', server),
        ))

    do_test(domain, **kw)
# }}}
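
# Usage note: manual_tests() forwards its keyword arguments to do_test(), so
# a slice of one domain's tests can be run against a specific server. For
# example, from a calibre development environment (illustrative invocation):
#
#   calibre-debug -c "from calibre.ebooks.metadata.sources.amazon import *; \
#       manual_tests('de', start=1, stop=2, server='bing')"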