%PDF- %PDF-
| Direktori : /usr/lib/calibre/calibre/ebooks/oeb/polish/ |
| Current File : //usr/lib/calibre/calibre/ebooks/oeb/polish/utils.py |
#!/usr/bin/env python3
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
import re, os
from bisect import bisect
from calibre import guess_type as _guess_type, replace_entities
def guess_type(x):
return _guess_type(x)[0] or 'application/octet-stream'
def setup_css_parser_serialization(tab_width=2):
import css_parser
prefs = css_parser.ser.prefs
prefs.indent = tab_width * ' '
prefs.indentClosingBrace = False
prefs.omitLastSemicolon = False
def actual_case_for_name(container, name):
from calibre.utils.filenames import samefile
if not container.exists(name):
raise ValueError('Cannot get actual case for %s as it does not exist' % name)
parts = name.split('/')
base = ''
ans = []
for i, x in enumerate(parts):
base = '/'.join(ans + [x])
path = container.name_to_abspath(base)
pdir = os.path.dirname(path)
candidates = {os.path.join(pdir, q) for q in os.listdir(pdir)}
if x in candidates:
correctx = x
else:
for q in candidates:
if samefile(q, path):
correctx = os.path.basename(q)
break
else:
raise RuntimeError('Something bad happened')
ans.append(correctx)
return '/'.join(ans)
def corrected_case_for_name(container, name):
parts = name.split('/')
ans = []
base = ''
for i, x in enumerate(parts):
base = '/'.join(ans + [x])
if container.exists(base):
correctx = x
else:
try:
candidates = {q for q in os.listdir(os.path.dirname(container.name_to_abspath(base)))}
except OSError:
return None # one of the non-terminal components of name is a file instead of a directory
for q in candidates:
if q.lower() == x.lower():
correctx = q
break
else:
return None
ans.append(correctx)
return '/'.join(ans)
class PositionFinder:
def __init__(self, raw):
pat = br'\n' if isinstance(raw, bytes) else r'\n'
self.new_lines = tuple(m.start() + 1 for m in re.finditer(pat, raw))
def __call__(self, pos):
lnum = bisect(self.new_lines, pos)
try:
offset = abs(pos - self.new_lines[lnum - 1])
except IndexError:
offset = pos
return (lnum + 1, offset)
class CommentFinder:
def __init__(self, raw, pat=r'(?s)/\*.*?\*/'):
self.starts, self.ends = [], []
for m in re.finditer(pat, raw):
start, end = m.span()
self.starts.append(start), self.ends.append(end)
def __call__(self, offset):
if not self.starts:
return False
q = bisect(self.starts, offset) - 1
return q >= 0 and self.starts[q] <= offset <= self.ends[q]
def link_stylesheets(container, names, sheets, remove=False, mtype='text/css'):
from calibre.ebooks.oeb.base import XPath, XHTML
changed_names = set()
snames = set(sheets)
lp = XPath('//h:link[@href]')
hp = XPath('//h:head')
for name in names:
root = container.parsed(name)
if remove:
for link in lp(root):
if (link.get('type', mtype) or mtype) == mtype:
container.remove_from_xml(link)
changed_names.add(name)
container.dirty(name)
existing = {container.href_to_name(l.get('href'), name) for l in lp(root) if (l.get('type', mtype) or mtype) == mtype}
extra = snames - existing
if extra:
changed_names.add(name)
try:
parent = hp(root)[0]
except (TypeError, IndexError):
parent = root.makeelement(XHTML('head'))
container.insert_into_xml(root, parent, index=0)
for sheet in sheets:
if sheet in extra:
container.insert_into_xml(
parent, parent.makeelement(XHTML('link'), rel='stylesheet', type=mtype,
href=container.name_to_href(sheet, name)))
container.dirty(name)
return changed_names
def lead_text(top_elem, num_words=10):
''' Return the leading text contained in top_elem (including descendants)
up to a maximum of num_words words. More efficient than using
etree.tostring(method='text') as it does not have to serialize the entire
sub-tree rooted at top_elem.'''
pat = re.compile(r'\s+', flags=re.UNICODE)
words = []
def get_text(x, attr='text'):
ans = getattr(x, attr)
if ans:
words.extend(filter(None, pat.split(ans)))
stack = [(top_elem, 'text')]
while stack and len(words) < num_words:
elem, attr = stack.pop()
get_text(elem, attr)
if attr == 'text':
if elem is not top_elem:
stack.append((elem, 'tail'))
stack.extend(reversed(list((c, 'text') for c in elem.iterchildren('*'))))
return ' '.join(words[:num_words])
def parse_css(data, fname='<string>', is_declaration=False, decode=None, log_level=None, css_preprocessor=None):
if log_level is None:
import logging
log_level = logging.WARNING
from css_parser import CSSParser, log
from calibre.ebooks.oeb.base import _css_logger
log.setLevel(log_level)
log.raiseExceptions = False
data = data or ''
if isinstance(data, bytes):
data = data.decode('utf-8') if decode is None else decode(data)
if css_preprocessor is not None:
data = css_preprocessor(data)
parser = CSSParser(loglevel=log_level,
# We dont care about @import rules
fetcher=lambda x: (None, None), log=_css_logger)
if is_declaration:
data = parser.parseStyle(data, validate=False)
else:
data = parser.parseString(data, href=fname, validate=False)
return data
def handle_entities(text, func):
return func(replace_entities(text))
def apply_func_to_match_groups(match, func=icu_upper, handle_entities=handle_entities):
'''Apply the specified function to individual groups in the match object (the result of re.search() or
the whole match if no groups were defined. Returns the replaced string.'''
found_groups = False
i = 0
parts, pos = [], match.start()
f = lambda text:handle_entities(text, func)
while True:
i += 1
try:
start, end = match.span(i)
except IndexError:
break
found_groups = True
if start > -1:
parts.append(match.string[pos:start])
parts.append(f(match.string[start:end]))
pos = end
if not found_groups:
return f(match.group())
parts.append(match.string[pos:match.end()])
return ''.join(parts)
def apply_func_to_html_text(match, func=icu_upper, handle_entities=handle_entities):
''' Apply the specified function only to text between HTML tag definitions. '''
f = lambda text:handle_entities(text, func)
parts = re.split(r'(<[^>]+>)', match.group())
parts = (x if x.startswith('<') else f(x) for x in parts)
return ''.join(parts)
def extract(elem):
''' Remove an element from the tree, keeping elem.tail '''
p = elem.getparent()
if p is not None:
idx = p.index(elem)
p.remove(elem)
if elem.tail:
if idx > 0:
p[idx-1].tail = (p[idx-1].tail or '') + elem.tail
else:
p.text = (p.text or '') + elem.tail