#!/usr/bin/env python3
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''
Contains the logic for parsing feeds.
'''
import time, traceback, copy, re
from calibre.utils.logging import default_log
from calibre import entity_to_unicode, strftime, force_unicode
from calibre.utils.date import dt_factory, utcnow, local_tz
from calibre.utils.cleantext import clean_ascii_chars, clean_xml_chars
from polyglot.builtins import string_or_bytes


class Article:

    def __init__(self, id, title, url, author, summary, published, content):
        from lxml import html
        self.downloaded = False
        self.id = id
        if not title or not isinstance(title, string_or_bytes):
            title = _('Unknown')
        title = force_unicode(title, 'utf-8')
        self._title = clean_xml_chars(title).strip()
        try:
            self._title = re.sub(r'&(\S+?);',
                    entity_to_unicode, self._title)
        except Exception:
            pass
        self._title = clean_ascii_chars(self._title)
        self.url = url
        self.author = author
        self.toc_thumbnail = None
        self.internal_toc_entries = ()
        if author and not isinstance(author, str):
            author = author.decode('utf-8', 'replace')
        if summary and not isinstance(summary, str):
            summary = summary.decode('utf-8', 'replace')
        summary = clean_xml_chars(summary) if summary else summary
        self.summary = summary
        if summary and '<' in summary:
            # Strip markup from the summary to get a plain text version
            try:
                s = html.fragment_fromstring(summary, create_parent=True)
                summary = html.tostring(s, method='text', encoding='unicode')
            except Exception:
                print('Failed to process article summary, deleting:')
                print(summary.encode('utf-8'))
                traceback.print_exc()
                summary = ''
        self.text_summary = clean_ascii_chars(summary)
        self.author = author
        self.content = content
        self.date = published
        self.utctime = dt_factory(self.date, assume_utc=True, as_utc=True)
        self.localtime = self.utctime.astimezone(local_tz)
        self._formatted_date = None

    @property
    def formatted_date(self):
        if self._formatted_date is None:
            self._formatted_date = strftime(" [%a, %d %b %H:%M]",
                    t=self.localtime.timetuple())
        return self._formatted_date

    @formatted_date.setter
    def formatted_date(self, val):
        if isinstance(val, str):
            self._formatted_date = val

    @property
    def title(self):
        t = self._title
        if not isinstance(t, str) and hasattr(t, 'decode'):
            t = t.decode('utf-8', 'replace')
        return t

    @title.setter
    def title(self, val):
        self._title = clean_ascii_chars(val)

    def __repr__(self):
        return ('''\
Title       : %s
URL         : %s
Author      : %s
Summary     : %s
Date        : %s
TOC thumb   : %s
Has content : %s
'''%(self.title, self.url, self.author, (self.summary or '')[:20]+'...',
        self.localtime.strftime('%a, %d %b, %Y %H:%M'), self.toc_thumbnail,
        bool(self.content)))

    def __str__(self):
        return repr(self)

    def is_same_as(self, other_article):
        # if self.title != getattr(other_article, 'title', False):
        #     return False
        if self.url:
            return self.url == getattr(other_article, 'url', False)
        return self.content == getattr(other_article, 'content', False)
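

# Illustrative sketch only, not part of the original module: Article identity
# is URL-based. is_same_as() compares URLs when one is set and falls back to
# comparing content, so two articles sharing a link count as duplicates even
# when their titles differ. All names and data below are hypothetical.
def _example_article_identity():
    published = time.gmtime()
    a = Article('id-1', 'First title', 'http://example.com/story',
            'Alice', 'A short summary', published, 'Body text')
    b = Article('id-2', 'Second title', 'http://example.com/story',
            'Bob', 'Another summary', published, 'Other body')
    return a.is_same_as(b)  # True, because the URLs match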


class Feed:

    def __init__(self, get_article_url=lambda item: item.get('link', None),
            log=default_log):
        '''
        Parse a feed into articles.
        '''
        self.logger = log
        self.get_article_url = get_article_url

    def populate_from_feed(self, feed, title=None, oldest_article=7,
                           max_articles_per_feed=100):
        entries = feed.entries
        feed = feed.feed
        self.title = feed.get('title', _('Unknown section')) if not title else title
        self.description = feed.get('description', '')
        image = feed.get('image', {})
        self.image_url = image.get('href', None)
        self.image_width = image.get('width', 88)
        self.image_height = image.get('height', 31)
        self.image_alt = image.get('title', '')

        self.articles = []
        self.id_counter = 0
        self.added_articles = []
        self.oldest_article = oldest_article

        for item in entries:
            if len(self.articles) >= max_articles_per_feed:
                break
            self.parse_article(item)

    def populate_from_preparsed_feed(self, title, articles, oldest_article=7,
                                     max_articles_per_feed=100):
        self.title = str(title if title else _('Unknown feed'))
        self.description = ''
        self.image_url = None
        self.articles = []
        self.added_articles = []
        self.oldest_article = oldest_article
        self.id_counter = 0

        for item in articles:
            if len(self.articles) >= max_articles_per_feed:
                break
            self.id_counter += 1
            id = item.get('id', None)
            if not id:
                id = 'internal id#%s'%self.id_counter
            if id in self.added_articles:
                return
            self.added_articles.append(id)
            published = time.gmtime(item.get('timestamp', time.time()))
            title = item.get('title', _('Untitled article'))
            link = item.get('url', None)
            description = item.get('description', '')
            content = item.get('content', '')
            author = item.get('author', '')
            article = Article(id, title, link, author, description,
                    published, content)
            delta = utcnow() - article.utctime
            if delta.days*24*3600 + delta.seconds <= 24*3600*self.oldest_article:
                self.articles.append(article)
            else:
                t = strftime('%a, %d %b, %Y %H:%M', article.localtime.timetuple())
                self.logger.debug('Skipping article %s (%s) from feed %s as it is too old.'%
                        (title, t, self.title))
            d = item.get('date', '')
            article.formatted_date = d

    def parse_article(self, item):
        self.id_counter += 1
        id = item.get('id', None)
        if not id:
            id = 'internal id#%s'%self.id_counter
        if id in self.added_articles:
            return
        # Prefer the date fields that feedparser has already parsed into
        # time.struct_time objects
        published = None
        for date_field in ('date_parsed', 'published_parsed',
                'updated_parsed'):
            published = item.get(date_field, None)
            if published is not None:
                break
        if not published:
            # Fall back to parsing the raw date strings ourselves
            from dateutil.parser import parse
            for date_field in ('date', 'published', 'updated'):
                try:
                    published = parse(item[date_field]).timetuple()
                except Exception:
                    continue
                break
        if not published:
            published = time.gmtime()
        self.added_articles.append(id)

        title = item.get('title', _('Untitled article'))
        if title.startswith('<'):
            title = re.sub(r'<.+?>', '', title)
        try:
            link = self.get_article_url(item)
        except Exception:
            self.logger.warning('Failed to get link for %s'%title)
            self.logger.debug(traceback.format_exc())
            link = None

        description = item.get('summary', None)
        author = item.get('author', None)
        content = [i.value for i in item.get('content', []) if i.value]
        content = [i if isinstance(i, str) else i.decode('utf-8', 'replace')
                for i in content]
        content = '\n'.join(content)
        if not content.strip():
            content = None
        if not link and not content:
            return

        article = Article(id, title, link, author, description, published, content)
        delta = utcnow() - article.utctime
        if delta.days*24*3600 + delta.seconds <= 24*3600*self.oldest_article:
            self.articles.append(article)
        else:
            try:
                self.logger.debug('Skipping article %s (%s) from feed %s as it is too old.'%
                        (title, article.localtime.strftime('%a, %d %b, %Y %H:%M'), self.title))
            except UnicodeDecodeError:
                if not isinstance(title, str):
                    title = title.decode('utf-8', 'replace')
                self.logger.debug('Skipping article %s as it is too old'%title)

    def reverse(self):
        self.articles.reverse()

    def __iter__(self):
        return iter(self.articles)

    def __len__(self):
        return len(self.articles)

    def __repr__(self):
        res = [('%20s\n'%'').replace(' ', '_')+repr(art) for art in self]
        return '\n'+'\n'.join(res)+'\n'

    def __str__(self):
        return repr(self)

    def has_embedded_content(self):
        length = 0
        for a in self:
            if a.content or a.summary:
                length += max(len(a.content if a.content else ''),
                        len(a.summary if a.summary else ''))
        return length > 2000 * len(self)

    def has_article(self, article):
        for a in self:
            if a.is_same_as(article):
                return True
        return False

    def find(self, article):
        for i, a in enumerate(self):
            if a.is_same_as(article):
                return i
        return -1

    def remove(self, article):
        i = self.find(article)  # find() returns -1 when the article is absent
        if i > -1:
            self.articles[i:i+1] = []

    def remove_article(self, article):
        try:
            self.articles.remove(article)
        except ValueError:
            pass
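

# Illustrative sketch only, not part of the original module: driving Feed
# directly with a pre-parsed article list, using the dict keys that
# populate_from_preparsed_feed() reads (title, url, description, content,
# date, author, timestamp). The sample data is hypothetical; timestamp is
# omitted so the article defaults to "now" and passes the oldest_article cut.
def _example_feed_usage():
    articles = [
        {'title': 'Hello world', 'url': 'http://example.com/hello',
         'description': 'A sample article', 'content': '<p>Body</p>'},
    ]
    feed = Feed()
    feed.populate_from_preparsed_feed('Sample section', articles)
    # Iteration and len() are delegated to feed.articles
    return [a.title for a in feed]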


class FeedCollection(list):

    def __init__(self, feeds):
        list.__init__(self, [f for f in feeds if len(f.articles) > 0])
        found_articles = set()
        duplicates = set()

        def in_set(s, a):
            for x in s:
                if a.is_same_as(x):
                    return x
            return None

        print('#feeds', len(self))
        print(list(map(len, self)))
        for f in self:
            dups = []
            for a in f:
                first = in_set(found_articles, a)
                if first is not None:
                    dups.append(a)
                    duplicates.add((first, f))
                else:
                    found_articles.add(a)
            for x in dups:
                f.articles.remove(x)

        self.duplicates = duplicates
        print(len(duplicates))
        print(list(map(len, self)))
        # raise

    def find_article(self, article):
        for j, f in enumerate(self):
            for i, a in enumerate(f):
                if a is article:
                    return (j, i)

    def restore_duplicates(self):
        temp = []
        for article, feed in self.duplicates:
            art = copy.deepcopy(article)
            j, i = self.find_article(article)
            art.url = '../feed_%d/article_%d/index.html'%(j, i)
            temp.append((feed, art))
        for feed, art in temp:
            feed.articles.append(art)


def feed_from_xml(raw_xml, title=None, oldest_article=7,
                  max_articles_per_feed=100,
                  get_article_url=lambda item: item.get('link', None),
                  log=default_log):
    from calibre.web.feeds.feedparser import parse
    # Close unclosed escaped entities: they trip up feedparser, and HBR, for
    # one, generates them
    raw_xml = re.sub(br'(&#\d+)([^0-9;])', br'\1;\2', raw_xml)
    feed = parse(raw_xml)
    pfeed = Feed(get_article_url=get_article_url, log=log)
    pfeed.populate_from_feed(feed, title=title,
                             oldest_article=oldest_article,
                             max_articles_per_feed=max_articles_per_feed)
    return pfeed
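

# Illustrative sketch only, not part of the original module: a minimal RSS
# byte string run through feed_from_xml(). Note the entity fix above rewrites
# a bare '&#8220' (no trailing semicolon, followed by a non-digit) to
# '&#8220;' before feedparser sees it. The feed below is hypothetical.
def _example_feed_from_xml():
    raw = (b'<?xml version="1.0"?><rss version="2.0"><channel>'
           b'<title>Example</title>'
           b'<item><title>First post</title>'
           b'<link>http://example.com/1</link></item>'
           b'</channel></rss>')
    feed = feed_from_xml(raw, title='Example')
    return [(a.title, a.url) for a in feed]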


def feeds_from_index(index, oldest_article=7, max_articles_per_feed=100,
                     log=default_log):
    '''
    @param index: A parsed index as returned by L{BasicNewsRecipe.parse_index}.
    @return: A list of L{Feed} objects.
    @rtype: list
    '''
    feeds = []
    for title, articles in index:
        pfeed = Feed(log=log)
        pfeed.populate_from_preparsed_feed(title, articles,
                oldest_article=oldest_article,
                max_articles_per_feed=max_articles_per_feed)
        feeds.append(pfeed)
    return feeds
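

# Illustrative sketch only, not part of the original module: building Feed
# objects from a pre-parsed index of the shape BasicNewsRecipe.parse_index()
# returns, i.e. a list of (section_title, article_dicts) tuples. The index
# below is hypothetical sample data.
def _example_feeds_from_index():
    index = [
        ('News', [{'title': 'Item one', 'url': 'http://example.com/1',
                   'description': 'First sample'}]),
        ('Sport', [{'title': 'Item two', 'url': 'http://example.com/2',
                    'description': 'Second sample'}]),
    ]
    return feeds_from_index(index)  # one Feed per (title, articles) tuple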