%PDF- %PDF-
Direktori : /usr/lib/calibre/calibre/ebooks/pdf/ |
Current File : //usr/lib/calibre/calibre/ebooks/pdf/reflow.py |
#!/usr/bin/env python3

__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

import sys
import os
import numbers
from itertools import count

from lxml import etree

from calibre.utils.xml_parse import safe_xml_fromstring


class Font:
    """Font specification read from a ``<font>`` element of the input XML."""

    def __init__(self, spec):
        self.id = spec.get('id')
        self.size = float(spec.get('size'))
        self.color = spec.get('color')
        self.family = spec.get('family')


class Element:
    """Base class for page elements.

    Subclasses must set ``self.id``; equality and hashing are based on it.
    """

    def __init__(self):
        self.starts_block = None
        self.block_style = None

    def __eq__(self, other):
        return self.id == other.id

    def __hash__(self):
        return hash(self.id)


class Image(Element):
    """An ``<img>`` element with its position and rendered/intrinsic sizes."""

    def __init__(self, img, opts, log, idc):
        """
        :param img: XML element providing ``top``, ``left``, ``rwidth``,
                    ``rheight``, ``iwidth``, ``iheight`` and ``src``.
        :param opts: options object (only stored here).
        :param log: logger (only stored here).
        :param idc: iterator yielding unique element ids.
        """
        Element.__init__(self)
        self.opts, self.log = opts, log
        self.id = next(idc)
        self.top, self.left, self.width, self.height, self.iwidth, self.iheight = \
            map(float, map(img.get, ('top', 'left', 'rwidth', 'rheight', 'iwidth', 'iheight')))
        self.src = img.get('src')
        self.bottom = self.top + self.height
        self.right = self.left + self.width

    def to_html(self):
        """Return an ``<img>`` tag sized to the rendered width/height."""
        return '<img src="%s" width="%dpx" height="%dpx"/>' % \
            (self.src, int(self.width), int(self.height))

    def dump(self, f):
        """Write the HTML form of this image to the text stream ``f``."""
        f.write(self.to_html())
        f.write('\n')


class Text(Element):
    """A ``<text>`` fragment with its geometry, font and markup."""

    def __init__(self, text, font_map, opts, log, idc):
        """
        :param text: XML element providing ``top``, ``left``, ``width``,
                     ``height`` and ``font`` attributes.
        :param font_map: mapping of font id to :class:`Font`.
        :param opts: options object; ``opts.verbose`` is read by
                     :meth:`coalesce`.
        :param log: logger used for debug output.
        :param idc: iterator yielding unique element ids.
        """
        Element.__init__(self)
        self.id = next(idc)
        self.opts, self.log = opts, log
        self.font_map = font_map
        self.top, self.left, self.width, self.height = map(float, map(text.get,
            ('top', 'left', 'width', 'height')))
        self.bottom = self.top + self.height
        self.right = self.left + self.width
        self.font = self.font_map[text.get('font')]
        self.font_size = self.font.size
        self.color = self.font.color
        self.font_family = self.font.family

        # Drop the tail so that serialising the element below does not
        # include text that follows it in the document.
        text.tail = ''
        self.text_as_string = etree.tostring(text, method='text', encoding='unicode')
        self.raw = text.text if text.text else ''
        for x in text.iterchildren():
            self.raw += etree.tostring(x, method='xml', encoding='unicode')
        # An empty fragment has no characters to average over; use 0.0
        # instead of raising ZeroDivisionError.
        if self.text_as_string:
            self.average_character_width = self.width/len(self.text_as_string)
        else:
            self.average_character_width = 0.0

    def coalesce(self, other, page_number):
        """Merge ``other``, which lies to the right on the same line, into self.

        :param other: the :class:`Text` fragment to absorb.
        :param page_number: page number, used only for the debug message.
        """
        if self.opts.verbose > 2:
            self.log.debug('Coalescing %r with %r on page %d'%(self.text_as_string,
                other.text_as_string, page_number))
        self.top = min(self.top, other.top)
        self.right = other.right
        self.width = self.right - self.left
        self.bottom = max(self.bottom, other.bottom)
        self.height = self.bottom - self.top
        self.font_size = max(self.font_size, other.font_size)
        # Keep the font that goes with the (larger) font size retained above.
        # The original expression had identical branches and always took
        # other.font.
        self.font = other.font if self.font_size == other.font_size else self.font
        self.text_as_string += other.text_as_string
        self.raw += other.raw
        self.average_character_width = (self.average_character_width +
                other.average_character_width)/2.0

    def to_html(self):
        """Return the stored markup of this fragment."""
        return self.raw

    def dump(self, f):
        """Write the markup of this fragment to the text stream ``f``."""
        # All dump() methods write str; writing encoded bytes here mixed
        # bytes and str on the same stream.
        f.write(self.to_html())
        f.write('\n')


class FontSizeStats(dict):
    """Maps each font size to its fraction of the total in ``stats``.

    Also records ``most_common_size`` and ``chars_at_most_common_size``.
    """

    def __init__(self, stats):
        """:param stats: mapping of font size to a (character) count."""
        total = float(sum(stats.values()))
        self.most_common_size, self.chars_at_most_common_size = -1, 0
        for sz, chars in stats.items():
            if chars >= self.chars_at_most_common_size:
                self.most_common_size, self.chars_at_most_common_size = sz, chars
            # Guard against a zero total (e.g. only empty fragments).
            self[sz] = chars/total if total else 0.0


class Interval:
    """A one dimensional interval ``[left, right]``.

    ``width`` may be zero or negative, e.g. for the intersection of
    disjoint intervals.
    """

    def __init__(self, left, right):
        self.left, self.right = left, right
        self.width = right - left

    def intersection(self, other):
        """Return the overlap with ``other`` (width <= 0 if there is none)."""
        left = max(self.left, other.left)
        right = min(self.right, other.right)
        return Interval(left, right)

    def centered_in(self, parent):
        """True if the left and right offsets from ``parent`` differ by < 3."""
        left = abs(self.left - parent.left)
        right = abs(self.right - parent.right)
        return abs(left-right) < 3

    def __bool__(self):
        return self.width > 0

    # Python 2 spelling, kept as an alias for backward compatibility.
    __nonzero__ = __bool__

    def __eq__(self, other):
        return self.left == other.left and self.right == other.right

    def __hash__(self):
        # The previous '%'-format expression raised TypeError on every call.
        return hash((self.left, self.right))


class Column:
    """A vertical run of elements, kept sorted by their bottom edge."""

    # A column contains an element if the element bulges out to
    # the left or the right by at most HFUZZ*col width.
    HFUZZ = 0.2

    def __init__(self):
        self.left = self.right = self.top = self.bottom = 0
        self.width = self.height = 0
        self.elements = []
        self.average_line_separation = 0

    def add(self, elem):
        """Add ``elem`` unless it is already present, then update bounds."""
        if elem in self.elements:
            return
        self.elements.append(elem)
        self._post_add()

    def prepend(self, elem):
        """Insert ``elem`` at the front unless present, then update bounds."""
        if elem in self.elements:
            return
        self.elements.insert(0, elem)
        self._post_add()

    def _post_add(self):
        """Re-sort the elements by bottom edge and recompute the bounding box."""
        self.elements.sort(key=lambda x: x.bottom)
        self.top = self.elements[0].top
        self.bottom = self.elements[-1].bottom
        self.left, self.right = sys.maxsize, 0
        for x in self:
            self.left = min(self.left, x.left)
            self.right = max(self.right, x.right)
        self.width, self.height = self.right-self.left, self.bottom-self.top

    def __iter__(self):
        yield from self.elements

    def __len__(self):
        return len(self.elements)

    def contains(self, elem):
        """True if ``elem`` lies within the column, allowing HFUZZ slack."""
        return elem.left > self.left - self.HFUZZ*self.width and \
                elem.right < self.right + self.HFUZZ*self.width

    def collect_stats(self):
        """Compute the average line separation and per-element statistics.

        Sets ``indent_fraction``, ``width_fraction`` and ``top_gap_ratio``
        on every element. ``top_gap_ratio`` is ``None`` for the first
        element. Ratios whose denominator is zero are set to 0.0 instead of
        raising ZeroDivisionError.
        """
        if len(self.elements) > 1:
            gaps = [self.elements[i+1].top - self.elements[i].bottom for i in
                    range(0, len(self.elements)-1)]
            self.average_line_separation = sum(gaps)/len(gaps)
        for i, elem in enumerate(self.elements):
            left_margin = elem.left - self.left
            if self.width:
                elem.indent_fraction = left_margin/self.width
                elem.width_fraction = elem.width/self.width
            else:
                elem.indent_fraction = elem.width_fraction = 0.0
            if i == 0:
                elem.top_gap_ratio = None
            elif self.average_line_separation:
                # NOTE(review): the gaps above are computed as
                # next.top - prev.bottom, while this numerator is
                # prev.bottom - elem.top (opposite sign). Left unchanged;
                # confirm the intended sign.
                elem.top_gap_ratio = (self.elements[i-1].bottom -
                        elem.top)/self.average_line_separation
            else:
                elem.top_gap_ratio = 0.0

    def previous_element(self, idx):
        """Return the element before index ``idx`` or None for the first."""
        if idx == 0:
            return None
        return self.elements[idx-1]

    def dump(self, f, num):
        """Write a header for column ``num`` and every element to ``f``."""
        f.write('******** Column %d\n\n'%num)
        for elem in self.elements:
            elem.dump(f)


class Box(list):
    """A list of elements and page numbers rendered inside one HTML tag."""

    def __init__(self, type='p'):
        self.tag = type

    def to_html(self):
        """Return a list of HTML strings; integers become page anchors."""
        ans = ['<%s>'%self.tag]
        for elem in self:
            if isinstance(elem, numbers.Integral):
                ans.append('<a name="page_%d"/>'%elem)
            else:
                ans.append(elem.to_html()+' ')
        ans.append('</%s>'%self.tag)
        return ans


class ImageBox(Box):
    """A centered image, optionally followed by caption elements."""

    def __init__(self, img):
        Box.__init__(self)
        self.img = img

    def to_html(self):
        """Return a list of HTML strings for the image and its contents."""
        ans = ['<div style="text-align:center">']
        ans.append(self.img.to_html())
        if len(self) > 0:
            ans.append('<br/>')
            for elem in self:
                if isinstance(elem, numbers.Integral):
                    ans.append('<a name="page_%d"/>'%elem)
                else:
                    ans.append(elem.to_html()+' ')
        ans.append('</div>')
        return ans


class Region:
    """A group of side-by-side columns on a page."""

    def __init__(self, opts, log):
        self.opts, self.log = opts, log
        self.columns = []
        self.top = self.bottom = self.left = self.right = self.width = self.height = 0

    def add(self, columns):
        """Adopt ``columns`` if empty, else merge them index by index."""
        if not self.columns:
            for x in sorted(columns, key=lambda x: x.left):
                self.columns.append(x)
        else:
            for i in range(len(columns)):
                for elem in columns[i]:
                    self.columns[i].add(elem)

    def contains(self, columns):
        """True if ``columns`` match this region's columns.

        An empty region matches anything. Otherwise the column counts must
        be equal and each pair must overlap horizontally by at least 60% of
        the narrower column.
        """
        # TODO: handle unbalanced columns
        if not self.columns:
            return True
        if len(columns) != len(self.columns):
            return False
        for i in range(len(columns)):
            c1, c2 = self.columns[i], columns[i]
            x1 = Interval(c1.left, c1.right)
            x2 = Interval(c2.left, c2.right)
            intersection = x1.intersection(x2)
            base = min(x1.width, x2.width)
            # A zero-width column cannot give a meaningful overlap ratio;
            # treat it as a mismatch instead of dividing by zero.
            if base <= 0 or intersection.width/base < 0.6:
                return False
        return True

    @property
    def is_empty(self):
        return len(self.columns) == 0

    @property
    def line_count(self):
        """Number of elements in the longest column."""
        max_lines = 0
        for c in self.columns:
            max_lines = max(max_lines, len(c))
        return max_lines

    @property
    def is_small(self):
        return self.line_count < 3

    def absorb(self, singleton):
        """Add every element of ``singleton`` to its best-overlapping column."""

        def most_suitable_column(elem):
            mc, mw = None, 0
            for c in self.columns:
                i = Interval(c.left, c.right)
                e = Interval(elem.left, elem.right)
                w = i.intersection(e).width
                if w > mw:
                    mc, mw = c, w
            if mc is None:
                self.log.warn('No suitable column for singleton',
                        elem.to_html())
                mc = self.columns[0]
            return mc

        for c in singleton.columns:
            for elem in c:
                col = most_suitable_column(elem)
                if self.opts.verbose > 3:
                    idx = self.columns.index(col)
                    self.log.debug('Absorbing singleton %s into column'%elem.to_html(),
                            idx)
                col.add(elem)

    def collect_stats(self):
        """Collect per-column stats and average the line separation."""
        for column in self.columns:
            column.collect_stats()
        self.average_line_separation = sum(x.average_line_separation for x in
                self.columns)/float(len(self.columns))

    def __iter__(self):
        yield from self.columns

    def absorb_regions(self, regions, at):
        """Absorb each region of ``regions``; see :meth:`absorb_region`."""
        for region in regions:
            self.absorb_region(region, at)

    def absorb_region(self, region, at):
        """Merge the elements of ``region`` into this region's columns.

        :param region: the :class:`Region` to absorb.
        :param at: ``'bottom'`` adds elements with :meth:`Column.add`, any
                   other value prepends them with :meth:`Column.prepend`.
        """
        if len(region.columns) <= len(self.columns):
            # Same or fewer columns: merge column by column, by index.
            for i in range(len(region.columns)):
                src, dest = region.columns[i], self.columns[i]
                if at != 'bottom':
                    src = reversed(list(iter(src)))
                for elem in src:
                    func = dest.add if at == 'bottom' else dest.prepend
                    func(elem)

        else:
            # More source columns than destinations: map each source column
            # to the destination column it overlaps the most.
            col_map = {}
            for i, col in enumerate(region.columns):
                max_overlap, max_overlap_index = 0, 0
                for j, dcol in enumerate(self.columns):
                    sint = Interval(col.left, col.right)
                    dint = Interval(dcol.left, dcol.right)
                    width = sint.intersection(dint).width
                    if width > max_overlap:
                        max_overlap = width
                        max_overlap_index = j
                col_map[i] = max_overlap_index
            lines = max(map(len, region.columns))
            if at == 'bottom':
                lines = range(lines)
            else:
                lines = range(lines-1, -1, -1)
            for i in lines:
                for j, src in enumerate(region.columns):
                    dest = self.columns[col_map[j]]
                    if i < len(src):
                        func = dest.add if at == 'bottom' else dest.prepend
                        func(src.elements[i])

    def dump(self, f):
        """Write a header and every column to the text stream ``f``."""
        f.write('############################################################\n')
        f.write('########## Region (%d columns) ###############\n'%len(self.columns))
        f.write('############################################################\n\n')
        for i, col in enumerate(self.columns):
            col.dump(f, i)

    def linearize(self):
        """Flatten the columns into ``self.elements`` and build ``self.boxes``.

        An image starts an :class:`ImageBox` that also collects the text
        elements directly following it that are centered on the image.
        Other elements start a new :class:`Box` when their top gap ratio
        exceeds 1.2 or they are indented relative to the next element.
        Requires :meth:`collect_stats` to have been called.
        """
        self.elements = []
        for x in self.columns:
            self.elements.extend(x)
        self.boxes = [Box()]
        # Indices already consumed as image captions; without this they
        # would be emitted a second time as ordinary text.
        captions = set()
        for i, elem in enumerate(self.elements):
            if i in captions:
                continue
            if isinstance(elem, Image):
                self.boxes.append(ImageBox(elem))
                img = Interval(elem.left, elem.right)
                for j in range(i+1, len(self.elements)):
                    t = self.elements[j]
                    if not isinstance(t, Text):
                        break
                    ti = Interval(t.left, t.right)
                    if not ti.centered_in(img):
                        break
                    self.boxes[-1].append(t)
                    captions.add(j)
                self.boxes.append(Box())
            else:
                is_indented = False
                if i+1 < len(self.elements):
                    indent_diff = elem.indent_fraction - \
                        self.elements[i+1].indent_fraction
                    if indent_diff > 0.05:
                        is_indented = True
                # top_gap_ratio is None for the first element of a column;
                # comparing None with a float raises TypeError on Python 3.
                gap = elem.top_gap_ratio
                if (gap is not None and gap > 1.2) or is_indented:
                    self.boxes.append(Box())
                self.boxes[-1].append(elem)


class Page:
    """A single page: its text fragments, images and detected regions."""

    # Fraction of a character width that two strings have to be apart,
    # for them to be considered part of the same text fragment
    COALESCE_FACTOR = 0.5

    # Fraction of text height that two strings' bottoms can differ by
    # for them to be considered to be part of the same text fragment
    LINE_FACTOR = 0.4

    # Multiplies the average line height when determining row height
    # of a particular element to detect columns.
    YFUZZ = 1.5

    def __init__(self, page, font_map, opts, log, idc):
        """
        :param page: XML element with ``number``, ``width`` and ``height``
                     attributes and ``text``/``img`` descendants.
        :param font_map: mapping of font id to :class:`Font`.
        :param opts: options object; ``opts.verbose`` controls debug output.
        :param log: logger.
        :param idc: iterator yielding unique element ids.
        """
        self.opts, self.log = opts, log
        self.font_map = font_map
        self.number = int(page.get('number'))
        self.width, self.height = map(float, map(page.get,
            ('width', 'height')))
        self.id = 'page%d'%self.number

        self.texts = []
        self.left_margin, self.right_margin = self.width, 0

        for text in page.xpath('descendant::text'):
            self.texts.append(Text(text, self.font_map, self.opts, self.log, idc))
            text = self.texts[-1]
            self.left_margin = min(text.left, self.left_margin)
            self.right_margin = max(text.right, self.right_margin)

        self.textwidth = self.right_margin - self.left_margin

        self.font_size_stats = {}
        self.average_text_height = 0
        for t in self.texts:
            if t.font_size not in self.font_size_stats:
                self.font_size_stats[t.font_size] = 0
            self.font_size_stats[t.font_size] += len(t.text_as_string)
            self.average_text_height += t.height
        if len(self.texts):
            self.average_text_height /= len(self.texts)

        self.font_size_stats = FontSizeStats(self.font_size_stats)

        self.coalesce_fragments()

        self.elements = list(self.texts)
        for img in page.xpath('descendant::img'):
            self.elements.append(Image(img, self.opts, self.log, idc))
        self.elements.sort(key=lambda x: x.top)

    def coalesce_fragments(self):
        """Repeatedly merge horizontally adjacent fragments on the same line."""

        def find_match(frag):
            hoverlap = self.COALESCE_FACTOR * frag.average_character_width
            for t in self.texts:
                hdelta = t.left - frag.right
                if t is not frag and hdelta > -hoverlap and \
                    hdelta < hoverlap and \
                    abs(t.bottom - frag.bottom) < self.LINE_FACTOR*frag.height:
                    return t

        match_found = True
        while match_found:
            match_found, match = False, None
            for frag in self.texts:
                match = find_match(frag)
                if match is not None:
                    match_found = True
                    frag.coalesce(match, self.number)
                    break
            # Remove outside the for loop so the list is not modified while
            # it is being iterated.
            if match is not None:
                self.texts.remove(match)

    def first_pass(self):
        'Sort page into regions and columns'
        self.regions = []
        if not self.elements:
            return
        for i, x in enumerate(self.elements):
            x.idx = i
        current_region = Region(self.opts, self.log)
        processed = set()
        for x in self.elements:
            if x in processed:
                continue
            elems = set(self.find_elements_in_row_of(x))
            # sort_into_columns() adds x itself to elems, so x is marked
            # as processed below as well.
            columns = self.sort_into_columns(x, elems)
            processed.update(elems)
            if not current_region.contains(columns):
                self.regions.append(current_region)
                current_region = Region(self.opts, self.log)
            current_region.add(columns)
        if not current_region.is_empty:
            self.regions.append(current_region)

        debug = self.opts.verbose > 2
        if debug:
            self.debug_dir = 'page-%d'%self.number
            # Do not fail when the directory is left over from an earlier run.
            os.makedirs(self.debug_dir, exist_ok=True)
            self.dump_regions('pre-coalesce')

        self.coalesce_regions()
        # debug_dir only exists in debug mode, so only dump then.
        if debug:
            self.dump_regions('post-coalesce')

    def dump_regions(self, fname):
        """Write all regions to ``regions-<fname>.txt`` in ``self.debug_dir``."""
        fname = 'regions-'+fname+'.txt'
        # Text mode: every dump() method writes str, not bytes.
        with open(os.path.join(self.debug_dir, fname), 'w', encoding='utf-8') as f:
            f.write('Page #%d\n\n'%self.number)
            for region in self.regions:
                region.dump(f)

    def coalesce_regions(self):
        """Merge runs of small regions into a neighbouring region."""
        # find contiguous sets of small regions
        # absorb into a neighboring region (prefer the one with number of cols
        # closer to the avg number of cols in the set, if equal use larger
        # region)
        found = True
        absorbed = set()
        processed = set()
        while found:
            found = False
            for i, region in enumerate(self.regions):
                if region in absorbed:
                    continue
                if region.is_small and region not in processed:
                    found = True
                    processed.add(region)
                    regions = [region]
                    end = i+1
                    for j in range(i+1, len(self.regions)):
                        end = j
                        if self.regions[j].is_small:
                            regions.append(self.regions[j])
                        else:
                            break
                    prev_region = None if i == 0 else i-1
                    next_region = end if end < len(self.regions) and self.regions[end] not in regions else None
                    absorb_at = 'bottom'
                    if prev_region is None and next_region is not None:
                        absorb_into = next_region
                        absorb_at = 'top'
                    elif next_region is None and prev_region is not None:
                        absorb_into = prev_region
                    elif prev_region is None and next_region is None:
                        if len(regions) > 1:
                            absorb_into = i
                            regions = regions[1:]
                        else:
                            absorb_into = None
                    else:
                        absorb_into = prev_region
                        if self.regions[next_region].line_count >= \
                                self.regions[prev_region].line_count:
                            avg_column_count = sum(len(r.columns) for r in
                                regions)/float(len(regions))
                            if self.regions[next_region].line_count > \
                                    self.regions[prev_region].line_count \
                               or abs(avg_column_count -
                                       len(self.regions[prev_region].columns)) \
                               > abs(avg_column_count -
                                       len(self.regions[next_region].columns)):
                                absorb_into = next_region
                                absorb_at = 'top'
                    if absorb_into is not None:
                        self.regions[absorb_into].absorb_regions(regions, absorb_at)
                        absorbed.update(regions)
        for region in absorbed:
            self.regions.remove(region)

    def sort_into_columns(self, elem, neighbors):
        """Group ``elem`` and its row ``neighbors`` into columns.

        Note that ``elem`` is added to the ``neighbors`` set passed in.

        :return: list of :class:`Column` objects.
        """
        neighbors.add(elem)
        neighbors = sorted(neighbors, key=lambda x: x.left)
        if self.opts.verbose > 3:
            self.log.debug('Neighbors:', [x.to_html() for x in neighbors])
        columns = [Column()]
        columns[0].add(elem)
        for x in neighbors:
            added = False
            for c in columns:
                if c.contains(x):
                    c.add(x)
                    added = True
                    break
            if not added:
                columns.append(Column())
                columns[-1].add(x)
                columns.sort(key=lambda x: x.left)
        return columns

    def find_elements_in_row_of(self, x):
        """Yield elements that share a row with ``x`` without overlapping it.

        Only ``x`` and the 14 elements after it in ``self.elements`` (which
        is sorted by top) are examined.
        """
        interval = Interval(x.top,
                x.top + self.YFUZZ*(self.average_text_height))
        h_interval = Interval(x.left, x.right)
        for y in self.elements[x.idx:x.idx+15]:
            if y is not x:
                y_interval = Interval(y.top, y.bottom)
                x_interval = Interval(y.left, y.right)
                if interval.intersection(y_interval).width > \
                    0.5*self.average_text_height and \
                    x_interval.intersection(h_interval).width <= 0:
                    yield y

    def second_pass(self):
        'Locate paragraph boundaries in each column'
        for region in self.regions:
            region.collect_stats()
            region.linearize()


class PDFDocument:
    """Parse the XML description of a document and write ``index.html``."""

    def __init__(self, xml, opts, log):
        """Parse ``xml``, analyse every page and render the result.

        :param xml: XML data containing ``font`` and ``page`` elements.
        :param opts: options object; ``opts.verbose`` controls debug output.
        :param log: logger.
        """
        self.opts, self.log = opts, log
        self.root = safe_xml_fromstring(xml)
        idc = count()

        self.fonts = []
        self.font_map = {}

        for spec in self.root.xpath('//font'):
            self.fonts.append(Font(spec))
            self.font_map[self.fonts[-1].id] = self.fonts[-1]

        self.pages = []
        self.page_map = {}

        for page in self.root.xpath('//page'):
            page = Page(page, self.font_map, opts, log, idc)
            self.page_map[page.id] = page
            self.pages.append(page)

        self.collect_font_statistics()

        for page in self.pages:
            page.document_font_stats = self.font_size_stats
            page.first_pass()
            page.second_pass()

        self.linearize()
        self.render()

    def collect_font_statistics(self):
        """Aggregate the per-page font size statistics for the document."""
        self.font_size_stats = {}
        for p in self.pages:
            for sz in p.font_size_stats:
                # NOTE(review): page stats are FontSizeStats objects, whose
                # values are per-page fractions rather than character
                # counts, so pages are weighted equally here. Confirm this
                # is intended.
                chars = p.font_size_stats[sz]
                if sz not in self.font_size_stats:
                    self.font_size_stats[sz] = 0
                self.font_size_stats[sz] += chars

        self.font_size_stats = FontSizeStats(self.font_size_stats)

    def linearize(self):
        """Collect the boxes of all pages, in order, into ``self.elements``.

        The first box of a region is merged into the previous box when the
        previous region has the same number of columns and the previous box
        is not an image box. The page number is inserted once per page.
        """
        self.elements = []
        last_region = last_block = None
        for page in self.pages:
            page_number_inserted = False
            for region in page.regions:
                merge_first_block = last_region is not None and \
                    len(last_region.columns) == len(region.columns) and \
                    not hasattr(last_block, 'img')
                for i, block in enumerate(region.boxes):
                    if merge_first_block:
                        merge_first_block = False
                        if not page_number_inserted:
                            last_block.append(page.number)
                            page_number_inserted = True
                        for elem in block:
                            last_block.append(elem)
                    else:
                        if not page_number_inserted:
                            block.insert(0, page.number)
                            page_number_inserted = True
                        self.elements.append(block)
                    last_block = block
                last_region = region

    def render(self):
        """Write the linearized document to ``index.html`` as UTF-8 XHTML."""
        html = ['<?xml version="1.0" encoding="UTF-8"?>',
                '<html xmlns="http://www.w3.org/1999/xhtml">', '<head>',
                '<title>PDF Reflow conversion</title>', '</head>', '<body>',
                '<div>']
        for elem in self.elements:
            html.extend(elem.to_html())
        # Close the <div> opened above so the output is well formed.
        html += ['</div>', '</body>', '</html>']
        raw = ('\n'.join(html)).replace('</strong><strong>', '')
        with open('index.html', 'wb') as f:
            f.write(raw.encode('utf-8'))