%PDF- %PDF-
| Direktori : /proc/self/root/usr/lib/calibre/calibre/ebooks/metadata/ |
| Current File : //proc/self/root/usr/lib/calibre/calibre/ebooks/metadata/__init__.py |
#!/usr/bin/env python3
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en'
"""
Provides abstraction for metadata reading.writing from a variety of ebook formats.
"""
import os, sys, re
from contextlib import suppress
from calibre import relpath, guess_type, prints, force_unicode
from calibre.utils.config_base import tweaks
from polyglot.builtins import codepoint_to_chr, iteritems, as_unicode
from polyglot.urllib import quote, unquote, urlparse
try:
_author_pat = re.compile(tweaks['authors_split_regex'])
except Exception:
prints('Author split regexp:', tweaks['authors_split_regex'],
'is invalid, using default')
_author_pat = re.compile(r'(?i),?\s+(and|with)\s+')
def string_to_authors(raw):
if not raw:
return []
raw = raw.replace('&&', '\uffff')
raw = _author_pat.sub('&', raw)
authors = [a.strip().replace('\uffff', '&') for a in raw.split('&')]
return [a for a in authors if a]
def authors_to_string(authors):
if authors is not None:
return ' & '.join([a.replace('&', '&&') for a in authors if a])
else:
return ''
def remove_bracketed_text(src, brackets=None):
if brackets is None:
brackets = {'(': ')', '[': ']', '{': '}'}
from collections import Counter
counts = Counter()
total = 0
buf = []
src = force_unicode(src)
rmap = {v: k for k, v in iteritems(brackets)}
for char in src:
if char in brackets:
counts[char] += 1
total += 1
elif char in rmap:
idx = rmap[char]
if counts[idx] > 0:
counts[idx] -= 1
total -= 1
elif total < 1:
buf.append(char)
return ''.join(buf)
def author_to_author_sort(
author,
method=None,
copywords=None,
use_surname_prefixes=None,
surname_prefixes=None,
name_prefixes=None,
name_suffixes=None
):
if not author:
return ''
if method is None:
method = tweaks['author_sort_copy_method']
if method == 'copy':
return author
sauthor = remove_bracketed_text(author).strip()
if method == 'comma' and ',' in sauthor:
return author
tokens = sauthor.split()
if len(tokens) < 2:
return author
ltoks = frozenset(x.lower() for x in tokens)
copy_words = frozenset(x.lower() for x in (tweaks['author_name_copywords'] if copywords is None else copywords))
if ltoks.intersection(copy_words):
return author
author_use_surname_prefixes = tweaks['author_use_surname_prefixes'] if use_surname_prefixes is None else use_surname_prefixes
if author_use_surname_prefixes:
author_surname_prefixes = frozenset(x.lower() for x in (tweaks['author_surname_prefixes'] if surname_prefixes is None else surname_prefixes))
if len(tokens) == 2 and tokens[0].lower() in author_surname_prefixes:
return author
prefixes = {force_unicode(y).lower() for y in (tweaks['author_name_prefixes'] if name_prefixes is None else name_prefixes)}
prefixes |= {y+'.' for y in prefixes}
for first in range(len(tokens)):
if tokens[first].lower() not in prefixes:
break
else:
return author
suffixes = {force_unicode(y).lower() for y in (tweaks['author_name_suffixes'] if name_suffixes is None else name_suffixes)}
suffixes |= {y+'.' for y in suffixes}
for last in range(len(tokens) - 1, first - 1, -1):
if tokens[last].lower() not in suffixes:
break
else:
return author
suffix = ' '.join(tokens[last + 1:])
if author_use_surname_prefixes:
if last > first and tokens[last - 1].lower() in author_surname_prefixes:
tokens[last - 1] += ' ' + tokens[last]
last -= 1
atokens = tokens[last:last + 1] + tokens[first:last]
num_toks = len(atokens)
if suffix:
atokens.append(suffix)
if method != 'nocomma' and num_toks > 1:
atokens[0] += ','
return ' '.join(atokens)
def authors_to_sort_string(authors):
return ' & '.join(map(author_to_author_sort, authors))
_title_pats = {}
def get_title_sort_pat(lang=None):
ans = _title_pats.get(lang, None)
if ans is not None:
return ans
q = lang
from calibre.utils.localization import canonicalize_lang, get_lang
if lang is None:
q = tweaks['default_language_for_title_sort']
if q is None:
q = get_lang()
q = canonicalize_lang(q) if q else q
data = tweaks['per_language_title_sort_articles']
try:
ans = data.get(q, None)
except AttributeError:
ans = None # invalid tweak value
try:
ans = frozenset(ans) if ans is not None else frozenset(data['eng'])
except Exception:
ans = frozenset((r'A\s+', r'The\s+', r'An\s+'))
if ans:
ans = '|'.join(ans)
ans = '^(%s)'%ans
try:
ans = re.compile(ans, re.IGNORECASE)
except:
ans = re.compile(r'^(A|The|An)\s+', re.IGNORECASE)
else:
ans = re.compile('^$') # matches only the empty string
_title_pats[lang] = ans
return ans
_ignore_starts = '\'"'+''.join(codepoint_to_chr(x) for x in
list(range(0x2018, 0x201e))+[0x2032, 0x2033])
def title_sort(title, order=None, lang=None):
if order is None:
order = tweaks['title_series_sorting']
title = title.strip()
if order == 'strictly_alphabetic':
return title
if title and title[0] in _ignore_starts:
title = title[1:]
match = get_title_sort_pat(lang).search(title)
if match:
try:
prep = match.group(1)
except IndexError:
pass
else:
if prep:
title = title[len(prep):] + ', ' + prep
if title[0] in _ignore_starts:
title = title[1:]
return title.strip()
coding = list(zip(
[1000,900,500,400,100,90,50,40,10,9,5,4,1],
["M","CM","D","CD","C","XC","L","XL","X","IX","V","IV","I"]
))
def roman(num):
if num <= 0 or num >= 4000 or int(num) != num:
return str(num)
result = []
for d, r in coding:
while num >= d:
result.append(r)
num -= d
return ''.join(result)
def fmt_sidx(i, fmt='%.2f', use_roman=False):
if i is None or i == '':
i = 1
try:
i = float(i)
except Exception:
return str(i)
if int(i) == float(i):
return roman(int(i)) if use_roman else '%d'%int(i)
return fmt%i
class Resource:
'''
Represents a resource (usually a file on the filesystem or a URL pointing
to the web. Such resources are commonly referred to in OPF files.
They have the interface:
:member:`path`
:member:`mime_type`
:method:`href`
'''
def __init__(self, href_or_path, basedir=os.getcwd(), is_path=True):
self._href = None
self._basedir = basedir
self.path = None
self.fragment = ''
try:
self.mime_type = guess_type(href_or_path)[0]
except:
self.mime_type = None
if self.mime_type is None:
self.mime_type = 'application/octet-stream'
if is_path:
path = href_or_path
if not os.path.isabs(path):
path = os.path.abspath(os.path.join(basedir, path))
if isinstance(path, bytes):
path = path.decode(sys.getfilesystemencoding())
self.path = path
else:
url = urlparse(href_or_path)
if url[0] not in ('', 'file'):
self._href = href_or_path
else:
pc = url[2]
if isinstance(pc, str):
pc = pc.encode('utf-8')
pc = unquote(pc).decode('utf-8')
self.path = os.path.abspath(os.path.join(basedir, pc.replace('/', os.sep)))
self.fragment = unquote(url[-1])
def href(self, basedir=None):
'''
Return a URL pointing to this resource. If it is a file on the filesystem
the URL is relative to `basedir`.
`basedir`: If None, the basedir of this resource is used (see :method:`set_basedir`).
If this resource has no basedir, then the current working directory is used as the basedir.
'''
if basedir is None:
if self._basedir:
basedir = self._basedir
else:
basedir = os.getcwd()
if self.path is None:
return self._href
f = self.fragment.encode('utf-8') if isinstance(self.fragment, str) else self.fragment
frag = '#'+as_unicode(quote(f)) if self.fragment else ''
if self.path == basedir:
return ''+frag
try:
rpath = relpath(self.path, basedir)
except OSError: # On windows path and basedir could be on different drives
rpath = self.path
if isinstance(rpath, str):
rpath = rpath.encode('utf-8')
return as_unicode(quote(rpath.replace(os.sep, '/')))+frag
def set_basedir(self, path):
self._basedir = path
def basedir(self):
return self._basedir
def __repr__(self):
return 'Resource(%s, %s)'%(repr(self.path), repr(self.href()))
class ResourceCollection:
def __init__(self):
self._resources = []
def __iter__(self):
yield from self._resources
def __len__(self):
return len(self._resources)
def __getitem__(self, index):
return self._resources[index]
def __bool__(self):
return len(self._resources) > 0
def __str__(self):
resources = map(repr, self)
return '[%s]'%', '.join(resources)
def __repr__(self):
return str(self)
def append(self, resource):
if not isinstance(resource, Resource):
raise ValueError('Can only append objects of type Resource')
self._resources.append(resource)
def remove(self, resource):
self._resources.remove(resource)
def replace(self, start, end, items):
'Same as list[start:end] = items'
self._resources[start:end] = items
@staticmethod
def from_directory_contents(top, topdown=True):
collection = ResourceCollection()
for spec in os.walk(top, topdown=topdown):
path = os.path.abspath(os.path.join(spec[0], spec[1]))
res = Resource.from_path(path)
res.set_basedir(top)
collection.append(res)
return collection
def set_basedir(self, path):
for res in self:
res.set_basedir(path)
def MetaInformation(title, authors=(_('Unknown'),)):
''' Convenient encapsulation of book metadata, needed for compatibility
@param title: title or ``_('Unknown')`` or a MetaInformation object
@param authors: List of strings or []
'''
from calibre.ebooks.metadata.book.base import Metadata
mi = None
if hasattr(title, 'title') and hasattr(title, 'authors'):
mi = title
title = mi.title
authors = mi.authors
return Metadata(title, authors, other=mi)
def check_digit_for_isbn10(isbn):
check = sum((i+1)*int(isbn[i]) for i in range(9)) % 11
return 'X' if check == 10 else str(check)
def check_digit_for_isbn13(isbn):
check = 10 - sum((1 if i%2 ==0 else 3)*int(isbn[i]) for i in range(12)) % 10
if check == 10:
check = 0
return str(check)
def check_isbn10(isbn):
with suppress(Exception):
return check_digit_for_isbn10(isbn) == isbn[9]
return False
def check_isbn13(isbn):
with suppress(Exception):
return check_digit_for_isbn13(isbn) == isbn[12]
return False
def check_isbn(isbn, simple_sanitize=False):
if not isbn:
return None
if simple_sanitize:
isbn = isbn.upper().replace('-', '').strip().replace(' ', '')
else:
isbn = re.sub(r'[^0-9X]', '', isbn.upper())
il = len(isbn)
if il not in (10, 13):
return None
all_same = re.match(r'(\d)\1{9,12}$', isbn)
if all_same is not None:
return None
if il == 10:
return isbn if check_isbn10(isbn) else None
if il == 13:
return isbn if check_isbn13(isbn) else None
return None
def normalize_isbn(isbn):
if not isbn:
return isbn
ans = check_isbn(isbn)
if ans is None:
return isbn
if len(ans) == 10:
ans = '978' + ans[:9]
ans += check_digit_for_isbn13(ans)
return ans
def check_issn(issn):
if not issn:
return None
issn = re.sub(r'[^0-9X]', '', issn.upper())
try:
digits = tuple(map(int, issn[:7]))
products = [(8 - i) * d for i, d in enumerate(digits)]
check = 11 - sum(products) % 11
if (check == 10 and issn[7] == 'X') or check == int(issn[7]):
return issn
except Exception:
pass
return None
def format_isbn(isbn):
cisbn = check_isbn(isbn)
if not cisbn:
return isbn
i = cisbn
if len(i) == 10:
return '-'.join((i[:2], i[2:6], i[6:9], i[9]))
return '-'.join((i[:3], i[3:5], i[5:9], i[9:12], i[12]))
def check_doi(doi):
'Check if something that looks like a DOI is present anywhere in the string'
if not doi:
return None
doi_check = re.search(r'10\.\d{4}/\S+', doi)
if doi_check is not None:
return doi_check.group()
return None
def rating_to_stars(value, allow_half_stars=False, star='★', half='⯨'):
r = max(0, min(int(value or 0), 10))
ans = star * (r // 2)
if allow_half_stars and r % 2:
ans += half
return ans