%PDF- %PDF-
Direktori : /usr/lib/calibre/calibre/ebooks/oeb/polish/ |
Current File : //usr/lib/calibre/calibre/ebooks/oeb/polish/stats.py |
#!/usr/bin/env python3


__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

import sys
from functools import partial

from lxml.etree import tostring
import regex

from calibre.ebooks.oeb.base import XHTML, css_text
from calibre.ebooks.oeb.polish.cascade import iterrules, resolve_styles, iterdeclaration
from calibre.utils.icu import ord_string, safe_chr
from polyglot.builtins import iteritems, itervalues
from tinycss.fonts3 import parse_font_family

# NOTE(review): icu_lower() and icu_upper() are used throughout this module
# without being imported here; presumably they are made available as builtins
# elsewhere in the application -- confirm before running this file standalone.


def normalize_font_properties(font):
    '''
    Normalize the ``font-weight``, ``font-style`` and ``font-stretch`` entries
    of the dict ``font`` in place and return the same dict.

    * ``font-weight`` becomes one of the strings ``'100'`` .. ``'900'``
      (``'normal'`` -> ``'400'``, ``'bold'`` -> ``'700'``, anything missing or
      unrecognized -> ``'400'``).
    * ``font-style`` and ``font-stretch`` fall back to ``'normal'`` when
      missing or unrecognized.
    '''
    w = font.get('font-weight', None)
    # A numeric 0 is falsy but is still a value: let it go through str() and
    # be rejected by the whitelist below rather than treated as "missing".
    if not w and w != 0:
        w = 'normal'
    w = str(w)
    w = {'normal':'400', 'bold':'700'}.get(w, w)
    if w not in {'100', '200', '300', '400', '500', '600', '700', '800', '900'}:
        w = '400'
    font['font-weight'] = w

    val = font.get('font-style', None)
    if val not in {'normal', 'italic', 'oblique'}:
        val = 'normal'
    font['font-style'] = val

    val = font.get('font-stretch', None)
    if val not in {'normal', 'ultra-condensed', 'extra-condensed', 'condensed',
                   'semi-condensed', 'semi-expanded', 'expanded',
                   'extra-expanded', 'ultra-expanded'}:
        val = 'normal'
    font['font-stretch'] = val
    return font


# Maps each font-stretch keyword to its position on the narrow -> wide scale
# ('normal' is 4).
widths = {x:i for i, x in enumerate(('ultra-condensed',
        'extra-condensed', 'condensed', 'semi-condensed', 'normal',
        'semi-expanded', 'expanded', 'extra-expanded', 'ultra-expanded'
        ))}


def get_matching_rules(rules, font):
    '''
    Return the entries of ``rules`` that best match ``font``.

    :param rules: A list of @font-face rule dicts. Each one is read for the
        keys ``'font-family'`` (a set of lower-cased family names),
        ``'width'``, ``'weight'`` and optionally ``'font-style'``, i.e. the
        shape produced by :func:`prepare_font_rule`.
    :param font: A dict with ``'font-family'`` (iterable of family names) and
        optionally ``'font-stretch'``, ``'font-style'`` and ``'font-weight'``.
    :return: A list of matching rules, empty when nothing matches.

    Candidates are narrowed successively by family, stretch, style and weight.
    '''
    # Filter on family. The wanted family set does not depend on the rule, so
    # build it once. Rules are scanned in reverse order, as in the original.
    wanted_families = frozenset(icu_lower(x) for x in font.get('font-family', []))
    matches = [rule for rule in reversed(rules) if wanted_families.intersection(rule['font-family'])]
    if not matches:
        return []

    # Filter on font stretch: keep the rules nearest to the wanted width,
    # preferring narrower ones for normal/condensed requests and wider ones
    # for expanded requests.
    width = widths[font.get('font-stretch', 'normal')]
    min_dist = min(abs(width-y['width']) for y in matches)
    nearest = [x for x in matches if abs(width-x['width']) == min_dist]
    if width <= 4:
        lmatches = [f for f in nearest if f['width'] <= width]
    else:
        lmatches = [f for f in nearest if f['width'] >= width]
    matches = (lmatches or nearest)

    # Filter on font-style: the first style in the preference order that has
    # any candidate wins.
    fs = font.get('font-style', 'normal')
    order = {
            'oblique':['oblique', 'italic', 'normal'],
            'normal':['normal', 'oblique', 'italic']
        }.get(fs, ['italic', 'oblique', 'normal'])
    for q in order:
        m = [f for f in matches if f.get('font-style', 'normal') == q]
        if m:
            matches = m
            break

    # Filter on font weight. Below 400, lighter weights are tried first and
    # then heavier ones; above 500 it is the other way around.
    # The ascending range used to be written range(fw+100, 100, 1000), which
    # is always empty, so the "heavier" candidates were silently never tried.
    fw = int(font.get('font-weight', '400'))
    if fw == 400:
        q = [400, 500, 300, 200, 100, 600, 700, 800, 900]
    elif fw == 500:
        q = [500, 400, 300, 200, 100, 600, 700, 800, 900]
    else:
        lighter = list(range(fw - 100, 0, -100))
        heavier = list(range(fw + 100, 1000, 100))
        if fw < 400:
            q = [fw] + lighter + heavier
        else:
            q = [fw] + heavier + lighter
    for wt in q:
        m = [f for f in matches if f['weight'] == wt]
        if m:
            return m
    return []


def get_css_text(elem, resolve_pseudo_property, which='before'):
    '''
    Return the text generated by the ``content`` property of the pseudo
    element ``which`` of ``elem``.

    Only a value wrapped in double quotes with at least one character between
    them is used (quotes stripped); everything else yields ``''``.
    '''
    text = resolve_pseudo_property(elem, which, 'content')[0].value
    if text and len(text) > 2 and text[0] == '"' and text[-1] == '"':
        return text[1:-1]
    return ''


# font-variant values for which the upper-cased form of the text is also
# counted (see get_element_text).
caps_variants = {'smallcaps', 'small-caps', 'all-small-caps', 'petite-caps', 'all-petite-caps', 'unicase'}


def get_element_text(elem, resolve_property, resolve_pseudo_property, capitalize_pat, for_pseudo=None):
    '''
    Return the text that is rendered with the font of ``elem`` (or of its
    pseudo element ``for_pseudo``), including ::before/::after content.

    :param capitalize_pat: Compiled pattern whose first match is upper-cased
        and appended when ``text-transform`` is ``capitalize``.
    :param for_pseudo: When not None, the name of a pseudo element; the full
        text content of ``elem`` (descendants included) is used and
        ``text-transform``/``font-variant`` are resolved for that pseudo
        element. When None, only the direct text of ``elem`` and the tails of
        its children are used.
    '''
    ans = []
    before = get_css_text(elem, resolve_pseudo_property)
    if before:
        ans.append(before)
    if for_pseudo is not None:
        ans.append(tostring(elem, method='text', encoding='unicode', with_tail=False))
    else:
        if elem.text:
            ans.append(elem.text)
        for child in elem.iterchildren():
            t = getattr(child, 'tail', '')
            if t:
                ans.append(t)
    after = get_css_text(elem, resolve_pseudo_property, 'after')
    if after:
        ans.append(after)
    ans = ''.join(ans)

    if for_pseudo is not None:
        tt = resolve_pseudo_property(elem, for_pseudo, 'text-transform')[0].value
        fv = resolve_pseudo_property(elem, for_pseudo, 'font-variant')[0].value
    else:
        tt = resolve_property(elem, 'text-transform')[0].value
        fv = resolve_property(elem, 'font-variant')[0].value

    # For caps variants the upper-cased characters are added to, not
    # substituted for, the original ones.
    if fv in caps_variants:
        ans += icu_upper(ans)
    if tt != 'none':
        if tt == 'uppercase':
            ans = icu_upper(ans)
        elif tt == 'lowercase':
            ans = icu_lower(ans)
        elif tt == 'capitalize':
            m = capitalize_pat.search(ans)
            if m is not None:
                ans += icu_upper(m.group())
    return ans


def get_font_dict(elem, resolve_property, pseudo=None):
    '''
    Build a normalized font description dict for ``elem``.

    :param resolve_property: Called as ``resolve_property(elem, name)`` when
        ``pseudo`` is None, otherwise as
        ``resolve_property(elem, pseudo, name)``.
    :return: A dict with ``'font-family'`` (tuple of family names) and
        normalized ``'font-weight'``, ``'font-style'``, ``'font-stretch'``.
    '''
    ans = {}
    if pseudo is None:
        ff = resolve_property(elem, 'font-family')
    else:
        ff = resolve_property(elem, pseudo, 'font-family')
    ans['font-family'] = tuple(x.value for x in ff)
    for p in 'weight', 'style', 'stretch':
        p = 'font-' + p
        rp = resolve_property(elem, p) if pseudo is None else resolve_property(elem, pseudo, p)
        ans[p] = str(rp[0].value)
    normalize_font_properties(ans)
    return ans


# Family names that are never recorded as a font usage nor accepted as the
# family of an @font-face rule.
bad_fonts = {'serif', 'sans-serif', 'monospace', 'cursive', 'fantasy', 'sansserif', 'inherit'}
# Code points that are never counted as used characters.
exclude_chars = frozenset(ord_string('\n\r\t'))
# Elements whose text is not examined by get_font_usage.
skip_tags = {XHTML(x) for x in 'script style title meta link'.split()}
font_keys = {'font-weight', 'font-style', 'font-stretch', 'font-family'}


def prepare_font_rule(cssdict):
    '''
    Add the lookup keys used by :func:`get_matching_rules` to an @font-face
    rule dict, in place.

    ``'font-family'`` is reduced to a frozenset holding only its first entry,
    ``'width'`` is set from ``'font-stretch'`` and ``'weight'`` from
    ``'font-weight'``. The dict must already be normalized (see
    :func:`normalize_font_properties`).
    '''
    cssdict['font-family'] = frozenset(cssdict['font-family'][:1])
    cssdict['width'] = widths[cssdict['font-stretch']]
    cssdict['weight'] = int(cssdict['font-weight'])


class StatsCollector:
    '''
    Collect statistics about font usage in a book container.

    After construction the following attributes are available:

    * ``font_stats``: maps the name of every font file referenced by a usable
      @font-face rule to the set of characters rendered with it.
    * ``font_usage_map``: per spine item, maps a font description key to a
      dict of the font properties plus ``'text'`` (set of characters). Only
      filled when ``do_embed`` is True.
    * ``font_spec_map``: per spine item, the set of family names requested.
      Only filled when ``do_embed`` is True.
    * ``font_rule_map``: per spine item, the list of @font-face rule dicts.
    * ``all_font_rules``: maps font file name to its @font-face rule dict.
    '''

    # Compiled lazily on first instantiation and shared via the class.
    first_letter_pat = capitalize_pat = None

    def __init__(self, container, do_embed=False):
        if self.first_letter_pat is None:
            StatsCollector.first_letter_pat = self.first_letter_pat = regex.compile(
                r'^[\p{Ps}\p{Ps}\p{Pe}\p{Pi}\p{Pf}\p{Po}]+', regex.VERSION1 | regex.UNICODE)
            StatsCollector.capitalize_pat = self.capitalize_pat = regex.compile(
                r'[\p{L}\p{N}]', regex.VERSION1 | regex.UNICODE)

        self.collect_font_stats(container, do_embed)

    def collect_font_face_rules(self, container, processed, spine_name, sheet, sheet_name):
        '''
        Extract the usable @font-face rules of ``sheet`` and append them to
        ``self.font_rule_map[spine_name]``.

        Passed (partially applied) as the ``sheet_callback`` of
        ``resolve_styles``. ``processed`` caches the rules found per sheet
        name so that a sheet is parsed only once; a sheet whose name equals
        ``spine_name`` is not cached (presumably because it is a style sheet
        embedded in the spine item itself -- confirm).

        A rule is skipped when it has no ``src`` pointing at a file present in
        the container, or when its first family name is missing or one of
        ``bad_fonts``.
        '''
        if sheet_name in processed:
            sheet_rules = processed[sheet_name]
        else:
            sheet_rules = []
            if sheet_name != spine_name:
                processed[sheet_name] = sheet_rules
            for rule, base_name, rule_index in iterrules(container, sheet_name, rules=sheet, rule_type='FONT_FACE_RULE'):
                cssdict = {}
                for prop in iterdeclaration(rule.style):
                    if prop.name == 'font-family':
                        cssdict['font-family'] = [icu_lower(x) for x in parse_font_family(css_text(prop.propertyValue))]
                    elif prop.name.startswith('font-'):
                        cssdict[prop.name] = prop.propertyValue[0].value
                    elif prop.name == 'src':
                        # Use the first source that actually exists in the
                        # book; warn when none of them does.
                        for val in prop.propertyValue:
                            x = val.value
                            fname = container.href_to_name(x, sheet_name)
                            if container.has_name(fname):
                                cssdict['src'] = fname
                                break
                        else:
                            container.log.warn('The @font-face rule refers to a font file that does not exist in the book: %s' % css_text(prop.propertyValue))
                if 'src' not in cssdict:
                    continue
                ff = cssdict.get('font-family')
                if not ff or ff[0] in bad_fonts:
                    continue
                normalize_font_properties(cssdict)
                prepare_font_rule(cssdict)
                sheet_rules.append(cssdict)
        self.font_rule_map[spine_name].extend(sheet_rules)

    def get_element_font_usage(self, elem, resolve_property, resolve_pseudo_property, font_face_rules, do_embed, font_usage_map, font_spec):
        '''
        Record the characters of ``elem`` against the fonts used to render
        them, for the element itself and for its ``::first-letter`` and
        ``::first-line`` pseudo elements when those have a font-family.

        Updates ``self.font_stats`` and, when ``do_embed`` is True, also
        ``font_usage_map`` and ``font_spec``. Does nothing when the element
        has no text.
        '''
        text = get_element_text(elem, resolve_property, resolve_pseudo_property, self.capitalize_pat)
        if not text:
            return

        def update_usage_for_embed(font, chars):
            if not do_embed:
                return
            ff = [icu_lower(x) for x in font.get('font-family', ())]
            if ff and ff[0] not in bad_fonts:
                # The key uses the lower-cased first family so that usages
                # differing only in case are merged.
                key = frozenset(((k, ff[0] if k == 'font-family' else v) for k, v in iteritems(font) if k in font_keys))
                val = font_usage_map.get(key)
                if val is None:
                    val = font_usage_map[key] = {'text': set()}
                    for k in font_keys:
                        val[k] = font[k][0] if k == 'font-family' else font[k]
                val['text'] |= chars
            for family in font.get('font-family', ()):
                if family and icu_lower(family) not in bad_fonts:
                    font_spec.add(family)

        def record_usage(font, chars):
            # Credit chars to the embed maps and to every matching font file.
            update_usage_for_embed(font, chars)
            for rule in get_matching_rules(font_face_rules, font):
                self.font_stats[rule['src']] |= chars

        font = get_font_dict(elem, resolve_property)
        record_usage(font, frozenset(ord_string(text)) - exclude_chars)

        q = resolve_pseudo_property(elem, 'first-letter', 'font-family', abort_on_missing=True)
        if q is not None:
            font = get_font_dict(elem, resolve_pseudo_property, pseudo='first-letter')
            text = get_element_text(elem, resolve_property, resolve_pseudo_property, self.capitalize_pat, for_pseudo='first-letter')
            m = self.first_letter_pat.search(text.lstrip())
            if m is not None:
                record_usage(font, frozenset(ord_string(m.group())) - exclude_chars)

        q = resolve_pseudo_property(elem, 'first-line', 'font-family', abort_on_missing=True)
        if q is not None:
            # The font must be resolved for ::first-line itself; this used to
            # pass pseudo='first-letter' and so credited the wrong font.
            font = get_font_dict(elem, resolve_pseudo_property, pseudo='first-line')
            text = get_element_text(elem, resolve_property, resolve_pseudo_property, self.capitalize_pat, for_pseudo='first-line')
            record_usage(font, frozenset(ord_string(text)) - exclude_chars)

    def get_font_usage(self, container, spine_name, resolve_property, resolve_pseudo_property, font_face_rules, do_embed):
        '''
        Run :meth:`get_element_font_usage` on every element inside the
        ``<body>`` of the spine item ``spine_name``, except those in
        ``skip_tags``.
        '''
        root = container.parsed(spine_name)
        for body in root.iterchildren(XHTML('body')):
            for elem in body.iter('*'):
                if elem.tag not in skip_tags:
                    self.get_element_font_usage(
                        elem, resolve_property, resolve_pseudo_property, font_face_rules, do_embed,
                        self.font_usage_map[spine_name], self.font_spec_map[spine_name])

    def collect_font_stats(self, container, do_embed=False):
        '''
        (Re)build all statistics attributes by walking every spine item of
        ``container``.

        While collecting, characters are stored as code points; at the end
        they are converted to strings with ``safe_chr``.
        '''
        self.font_stats = {}
        self.font_usage_map = {}
        self.font_spec_map = {}
        self.font_rule_map = {}
        self.all_font_rules = {}

        processed_sheets = {}
        for name, is_linear in container.spine_names:
            # collect_font_face_rules appends to this same list object via
            # self.font_rule_map[name] while resolve_styles runs.
            self.font_rule_map[name] = font_face_rules = []
            resolve_property, resolve_pseudo_property, select = resolve_styles(container, name, sheet_callback=partial(
                self.collect_font_face_rules, container, processed_sheets, name))

            for rule in font_face_rules:
                self.all_font_rules[rule['src']] = rule
                if rule['src'] not in self.font_stats:
                    self.font_stats[rule['src']] = set()

            self.font_usage_map[name] = {}
            self.font_spec_map[name] = set()
            self.get_font_usage(container, name, resolve_property, resolve_pseudo_property, font_face_rules, do_embed)

        self.font_stats = {k:{safe_chr(x) for x in v} for k, v in iteritems(self.font_stats)}
        for fum in itervalues(self.font_usage_map):
            for v in itervalues(fum):
                v['text'] = {safe_chr(x) for x in v['text']}


if __name__ == '__main__':
    from calibre.ebooks.oeb.polish.container import get_container
    from calibre.utils.logging import default_log
    default_log.filter_level = default_log.DEBUG
    ebook = get_container(sys.argv[-1], default_log)
    from pprint import pprint
    pprint(StatsCollector(ebook, do_embed=True).font_stats)