#!/usr/bin/env python3
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
from __future__ import absolute_import, division, print_function, unicode_literals

__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

import time, re
from threading import Thread
try:
    from queue import Empty, Queue
except ImportError:
    from Queue import Empty, Queue

from calibre import as_unicode, random_user_agent
from calibre.ebooks.metadata import check_isbn
from calibre.ebooks.metadata.sources.base import Source


def clean_html(raw):
    from calibre.ebooks.chardet import xml_to_unicode
    from calibre.utils.cleantext import clean_ascii_chars
    return clean_ascii_chars(xml_to_unicode(raw, strip_encoding_pats=True,
                                resolve_entities=True, assume_utf8=True)[0])


def parse_html(raw):
    raw = clean_html(raw)
    from html5_parser import parse
    return parse(raw)


def astext(node):
    from lxml import etree
    return etree.tostring(node, method='text', encoding='unicode',
                          with_tail=False).strip()


class Worker(Thread):  # {{{

    def __init__(self, basic_data, relevance, result_queue, br, timeout, log, plugin):
        Thread.__init__(self)
        self.daemon = True
        self.basic_data = basic_data
        self.br, self.log, self.timeout = br, log, timeout
        self.result_queue, self.plugin, self.sku = result_queue, plugin, self.basic_data['sku']
        self.relevance = relevance

    def run(self):
        url = ('https://www.edelweiss.plus/GetTreelineControl.aspx?controlName=/uc/product/two_Enhanced.ascx&'
               'sku={0}&idPrefix=content_1_{0}&mode=0'.format(self.sku))
        try:
            raw = self.br.open_novisit(url, timeout=self.timeout).read()
        except:
            self.log.exception('Failed to load comments page: %r'%url)
            return

        try:
            mi = self.parse(raw)
            mi.source_relevance = self.relevance
            self.plugin.clean_downloaded_metadata(mi)
            self.result_queue.put(mi)
        except:
            self.log.exception('Failed to parse details for sku: %s'%self.sku)

    def parse(self, raw):
        from calibre.ebooks.metadata.book.base import Metadata
        from calibre.utils.date import UNDEFINED_DATE
        root = parse_html(raw)
        mi = Metadata(self.basic_data['title'], self.basic_data['authors'])

        # Identifiers
        if self.basic_data['isbns']:
            mi.isbn = self.basic_data['isbns'][0]
        mi.set_identifier('edelweiss', self.sku)

        # Tags
        if self.basic_data['tags']:
            mi.tags = self.basic_data['tags']
            mi.tags = [t[1:].strip() if t.startswith('&') else t for t in mi.tags]

        # Publisher
        mi.publisher = self.basic_data['publisher']

        # Pubdate (ignore calibre's placeholder for an undefined date)
        if self.basic_data['pubdate'] and self.basic_data['pubdate'].year != UNDEFINED_DATE.year:
            mi.pubdate = self.basic_data['pubdate']

        # Rating
        if self.basic_data['rating']:
            mi.rating = self.basic_data['rating']

        # Comments
        comments = ''
        for cid in ('summary', 'contributorbio', 'quotes_reviews'):
            cid = 'desc_{}{}-content'.format(cid, self.sku)
            div = root.xpath('//*[@id="{}"]'.format(cid))
            if div:
                comments += self.render_comments(div[0])
        if comments:
            mi.comments = comments

        mi.has_cover = self.plugin.cached_identifier_to_cover_url(self.sku) is not None
        return mi

    def render_comments(self, desc):
        from lxml import etree
        from calibre.library.comments import sanitize_comments_html
        for c in desc.xpath('descendant::noscript'):
            c.getparent().remove(c)
        # Neutralize links: drop the href and turn anchors into plain spans
        for a in desc.xpath('descendant::a[@href]'):
            del a.attrib['href']
            a.tag = 'span'
        desc = etree.tostring(desc, method='html', encoding='unicode').strip()

        # remove all attributes from tags
        desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc)
        # Collapse whitespace
        # desc = re.sub('\n+', '\n', desc)
        # desc = re.sub(' +', ' ', desc)
        # Remove comments
        desc = re.sub(r'(?s)<!--.*?-->', '', desc)
        return sanitize_comments_html(desc)
# }}}
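
# A minimal standalone sketch of the regex cleanup that Worker.render_comments
# applies after serializing the description HTML. The sample string is
# illustrative, not real Edelweiss markup:
#
#   import re
#   raw = '<p class="x"><a href="/y">link</a><!-- tracking --></p>'
#   raw = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', raw)  # drop tag attributes
#   raw = re.sub(r'(?s)<!--.*?-->', '', raw)                # drop HTML comments
#   # raw is now '<p><a>link</a></p>'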


def get_basic_data(browser, log, *skus):
    from calibre.utils.date import parse_only_date
    from mechanize import Request
    zeroes = ','.join('0' for sku in skus)
    data = {
        'skus': ','.join(skus),
        'drc': zeroes,
        'startPosition': '0',
        'sequence': '1',
        'selected': zeroes,
        'itemID': '0',
        'orderID': '0',
        'mailingID': '',
        'tContentWidth': '926',
        'originalOrder': ','.join(type('')(i) for i in range(len(skus))),
        'selectedOrderID': '0',
        'selectedSortColumn': '0',
        'listType': '1',
        'resultType': '32',
        'blockView': '1',
    }
    items_data_url = 'https://www.edelweiss.plus/GetTreelineControl.aspx?controlName=/uc/listviews/ListView_Title_Multi.ascx'
    req = Request(items_data_url, data)
    response = browser.open_novisit(req)
    raw = response.read()
    root = parse_html(raw)
    for item in root.xpath('//div[@data-priority]'):
        row = item.getparent().getparent()
        sku = item.get('id').split('-')[-1]
        isbns = [x.strip() for x in row.xpath(
            'descendant::*[contains(@class, "pev_sku")]/text()')[0].split(',')
            if check_isbn(x.strip())]
        isbns.sort(key=len, reverse=True)
        try:
            tags = [x.strip() for x in astext(row.xpath(
                'descendant::*[contains(@class, "pev_categories")]')[0]).split('/')]
        except IndexError:
            tags = []
        rating = 0
        for bar in row.xpath('descendant::*[contains(@class, "bgdColorCommunity")]/@style'):
            m = re.search(r'width: (\d+)px;.*max-width: (\d+)px', bar)
            if m is not None:
                rating = float(m.group(1)) / float(m.group(2))
                break
        try:
            pubdate = parse_only_date(astext(row.xpath(
                'descendant::*[contains(@class, "pev_shipDate")]')[0]
                ).split(':')[-1].split(u'\xa0')[-1].strip(), assume_utc=True)
        except Exception:
            log.exception('Error parsing published date')
            pubdate = None
        authors = []
        for x in [x.strip() for x in row.xpath(
                'descendant::*[contains(@class, "pev_contributor")]/@title')]:
            authors.extend(a.strip() for a in x.split(','))
        entry = {
            'sku': sku,
            'cover': row.xpath('descendant::img/@src')[0].split('?')[0],
            'publisher': astext(row.xpath('descendant::*[contains(@class, "headerPublisher")]')[0]),
            'title': astext(row.xpath('descendant::*[@id="title_{}"]'.format(sku))[0]),
            'authors': authors,
            'isbns': isbns,
            'tags': tags,
            'pubdate': pubdate,
            'format': ' '.join(row.xpath('descendant::*[contains(@class, "pev_format")]/text()')).strip(),
            'rating': rating,
        }
        if entry['cover'].startswith('/'):
            entry['cover'] = None
        yield entry
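
# A quick sketch of the rating extraction in get_basic_data above: the filled
# portion of the community-rating bar is encoded as CSS pixel widths, and the
# rating is recovered as the ratio of the two. The style string below is a
# made-up example of the markup this expects:
#
#   m = re.search(r'width: (\d+)px;.*max-width: (\d+)px',
#                 'width: 40px; max-width: 50px')
#   rating = float(m.group(1)) / float(m.group(2))  # -> 0.8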


class Edelweiss(Source):

    name = 'Edelweiss'
    version = (2, 0, 1)
    minimum_calibre_version = (3, 6, 0)
    description = _('Downloads metadata and covers from Edelweiss - A catalog updated by book publishers')

    capabilities = frozenset(['identify', 'cover'])
    touched_fields = frozenset([
        'title', 'authors', 'tags', 'pubdate', 'comments', 'publisher',
        'identifier:isbn', 'identifier:edelweiss', 'rating'])
    supports_gzip_transfer_encoding = True
    has_html_comments = True

    @property
    def user_agent(self):
        # Pass in an index to random_user_agent() to test with a particular
        # user agent
        return random_user_agent(allow_ie=False)

    def _get_book_url(self, sku):
        if sku:
            return 'https://www.edelweiss.plus/#sku={}&page=1'.format(sku)

    def get_book_url(self, identifiers):  # {{{
        sku = identifiers.get('edelweiss', None)
        if sku:
            return 'edelweiss', sku, self._get_book_url(sku)
    # }}}

    def get_cached_cover_url(self, identifiers):  # {{{
        sku = identifiers.get('edelweiss', None)
        if not sku:
            isbn = identifiers.get('isbn', None)
            if isbn is not None:
                sku = self.cached_isbn_to_identifier(isbn)
        return self.cached_identifier_to_cover_url(sku)
    # }}}

    def create_query(self, log, title=None, authors=None, identifiers={}):
        try:
            from urllib.parse import urlencode
        except ImportError:
            from urllib import urlencode
        import time
        BASE_URL = ('https://www.edelweiss.plus/GetTreelineControl.aspx?'
            'controlName=/uc/listviews/controls/ListView_data.ascx&itemID=0&resultType=32&dashboardType=8&itemType=1&dataType=products&keywordSearch&')
        keywords = []
        isbn = check_isbn(identifiers.get('isbn', None))
        if isbn is not None:
            keywords.append(isbn)
        elif title:
            title_tokens = list(self.get_title_tokens(title))
            if title_tokens:
                keywords.extend(title_tokens)
            author_tokens = self.get_author_tokens(authors, only_first_author=True)
            if author_tokens:
                keywords.extend(author_tokens)
        if not keywords:
            return None
        params = {
            'q': (' '.join(keywords)).encode('utf-8'),
            '_': type('')(int(time.time()))
        }
        return BASE_URL+urlencode(params)
    # }}}

    def identify(self, log, result_queue, abort, title=None, authors=None,  # {{{
                 identifiers={}, timeout=30):
        import json
        br = self.browser
        br.addheaders = [
            ('Referer', 'https://www.edelweiss.plus/'),
            ('X-Requested-With', 'XMLHttpRequest'),
            ('Cache-Control', 'no-cache'),
            ('Pragma', 'no-cache'),
        ]
        if 'edelweiss' in identifiers:
            items = [identifiers['edelweiss']]
        else:
            log.error('Currently Edelweiss returns random books for search queries')
            return
            # Keyword search is disabled by the early return above; the query
            # path below is unreachable and kept only for reference.
            query = self.create_query(log, title=title, authors=authors,
                                      identifiers=identifiers)
            if not query:
                log.error('Insufficient metadata to construct query')
                return
            log('Using query URL:', query)
            try:
                raw = br.open(query, timeout=timeout).read().decode('utf-8')
            except Exception as e:
                log.exception('Failed to make identify query: %r'%query)
                return as_unicode(e)
            items = re.search(r'window[.]items\s*=\s*(.+?);', raw)
            if items is None:
                log.error('Failed to get list of matching items')
                log.debug('Response text:')
                log.debug(raw)
                return
            items = json.loads(items.group(1))
            if (not items and identifiers and title and authors and
                    not abort.is_set()):
                # Retry as a plain title/author search, without identifiers
                return self.identify(log, result_queue, abort, title=title,
                                     authors=authors, timeout=timeout)

        if not items:
            return

        workers = []
        items = items[:5]
        for i, item in enumerate(get_basic_data(self.browser, log, *items)):
            sku = item['sku']
            for isbn in item['isbns']:
                self.cache_isbn_to_identifier(isbn, sku)
            if item['cover']:
                self.cache_identifier_to_cover_url(sku, item['cover'])
            fmt = item['format'].lower()
            if 'audio' in fmt or 'mp3' in fmt:
                continue  # Audio-book, ignore
            workers.append(Worker(item, i, result_queue, br.clone_browser(), timeout, log, self))

        if not workers:
            return

        for w in workers:
            w.start()
            # Don't send all requests at the same time
            time.sleep(0.1)

        # Poll the workers with short join() timeouts instead of a blocking
        # join, so an abort event can interrupt the wait promptly
        while not abort.is_set():
            a_worker_is_alive = False
            for w in workers:
                w.join(0.2)
                if abort.is_set():
                    break
                if w.is_alive():
                    a_worker_is_alive = True
            if not a_worker_is_alive:
                break
    # }}}

    def download_cover(self, log, result_queue, abort,  # {{{
            title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False):
        cached_url = self.get_cached_cover_url(identifiers)
        if cached_url is None:
            log.info('No cached cover found, running identify')
            rq = Queue()
            self.identify(log, rq, abort, title=title, authors=authors,
                          identifiers=identifiers)
            if abort.is_set():
                return
            results = []
            while True:
                try:
                    results.append(rq.get_nowait())
                except Empty:
                    break
            results.sort(key=self.identify_results_keygen(
                title=title, authors=authors, identifiers=identifiers))
            for mi in results:
                cached_url = self.get_cached_cover_url(mi.identifiers)
                if cached_url is not None:
                    break
        if cached_url is None:
            log.info('No cover found')
            return

        if abort.is_set():
            return
        br = self.browser
        log('Downloading cover from:', cached_url)
        try:
            cdata = br.open_novisit(cached_url, timeout=timeout).read()
            result_queue.put((self, cdata))
        except:
            log.exception('Failed to download cover from:', cached_url)
    # }}}


if __name__ == '__main__':
    from calibre.ebooks.metadata.sources.test import (
        test_identify_plugin, title_test, authors_test, comments_test, pubdate_test)
    tests = [
        (  # A title and author search
            {'title': 'The Husband\'s Secret', 'authors': ['Liane Moriarty']},
            [title_test('The Husband\'s Secret', exact=True),
             authors_test(['Liane Moriarty'])]
        ),

        (  # An isbn present in edelweiss
            {'identifiers': {'isbn': '9780312621360'}, },
            [title_test('Flame: A Sky Chasers Novel', exact=True),
             authors_test(['Amy Kathleen Ryan'])]
        ),

        # Multiple authors and two part title and no general description
        ({'identifiers': {'edelweiss': '0321180607'}},
            [title_test(
                "XQuery From the Experts: A Guide to the W3C XML Query Language",
                exact=True),
             authors_test([
                 'Howard Katz', 'Don Chamberlin', 'Denise Draper', 'Mary Fernandez',
                 'Michael Kay', 'Jonathan Robie', 'Michael Rys', 'Jerome Simeon',
                 'Jim Tivy', 'Philip Wadler']),
             pubdate_test(2003, 8, 22),
             comments_test('Jérôme Siméon'),
             lambda mi: bool(mi.comments and 'No title summary' not in mi.comments)
             ]),
    ]
    start, stop = 0, len(tests)

    tests = tests[start:stop]
    test_identify_plugin(Edelweiss.name, tests)
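
# The tests above need the calibre environment and network access; the usual
# way to run a metadata source plugin file standalone is calibre's debug
# runner (the file path here is illustrative):
#
#   calibre-debug -e edelweiss.py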