%PDF- %PDF-
| Direktori : /proc/self/root/usr/lib/calibre/calibre/ebooks/docx/ |
| Current File : //proc/self/root/usr/lib/calibre/calibre/ebooks/docx/index.py |
#!/usr/bin/env python3
__license__ = 'GPL v3'
__copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'
from operator import itemgetter
from lxml import etree
from calibre.utils.icu import partition_by_first_letter, sort_key
from polyglot.builtins import iteritems
def get_applicable_xe_fields(index, xe_fields, XPath, expand):
iet = index.get('entry-type', None)
xe_fields = [xe for xe in xe_fields if xe.get('entry-type', None) == iet]
lr = index.get('letter-range', None)
if lr is not None:
sl, el = lr.partition('-')[0::2]
sl, el = sl.strip(), el.strip()
if sl and el:
def inrange(text):
return sl <= text[0] <= el
xe_fields = [xe for xe in xe_fields if inrange(xe.get('text', ''))]
bmark = index.get('bookmark', None)
if bmark is None:
return xe_fields
attr = expand('w:name')
bookmarks = {b for b in XPath('//w:bookmarkStart')(xe_fields[0]['start_elem']) if b.get(attr, None) == bmark}
ancestors = XPath('ancestor::w:bookmarkStart')
def contained(xe):
# Check if the xe field is contained inside a bookmark with the
# specified name
return bool(set(ancestors(xe['start_elem'])) & bookmarks)
return [xe for xe in xe_fields if contained(xe)]
def make_block(expand, style, parent, pos):
p = parent.makeelement(expand('w:p'))
parent.insert(pos, p)
if style is not None:
ppr = p.makeelement(expand('w:pPr'))
p.append(ppr)
ps = ppr.makeelement(expand('w:pStyle'))
ppr.append(ps)
ps.set(expand('w:val'), style)
r = p.makeelement(expand('w:r'))
p.append(r)
t = r.makeelement(expand('w:t'))
t.set(expand('xml:space'), 'preserve')
r.append(t)
return p, t
def add_xe(xe, t, expand):
run = t.getparent()
idx = run.index(t)
t.text = xe.get('text') or ' '
pt = xe.get('page-number-text', None)
if pt:
p = t.getparent().getparent()
r = p.makeelement(expand('w:r'))
p.append(r)
t2 = r.makeelement(expand('w:t'))
t2.set(expand('xml:space'), 'preserve')
t2.text = ' [%s]' % pt
r.append(t2)
# put separate entries on separate lines
run.insert(idx + 1, run.makeelement(expand('w:br')))
return xe['anchor'], run
def process_index(field, index, xe_fields, log, XPath, expand):
'''
We remove all the word generated index markup and replace it with our own
that is more suitable for an ebook.
'''
styles = []
heading_text = index.get('heading', None)
heading_style = 'IndexHeading'
start_pos = None
for elem in field.contents:
if elem.tag.endswith('}p'):
s = XPath('descendant::pStyle/@w:val')(elem)
if s:
styles.append(s[0])
p = elem.getparent()
if start_pos is None:
start_pos = (p, p.index(elem))
p.remove(elem)
xe_fields = get_applicable_xe_fields(index, xe_fields, XPath, expand)
if not xe_fields:
return [], []
if heading_text is not None:
groups = partition_by_first_letter(xe_fields, key=itemgetter('text'))
items = []
for key, fields in iteritems(groups):
items.append(key), items.extend(fields)
if styles:
heading_style = styles[0]
else:
items = sorted(xe_fields, key=lambda x:sort_key(x['text']))
hyperlinks = []
blocks = []
for item in reversed(items):
is_heading = not isinstance(item, dict)
style = heading_style if is_heading else None
p, t = make_block(expand, style, *start_pos)
if is_heading:
text = heading_text
if text.lower().startswith('a'):
text = item + text[1:]
t.text = text
else:
hyperlinks.append(add_xe(item, t, expand))
blocks.append(p)
return hyperlinks, blocks
def split_up_block(block, a, text, parts, ldict):
prefix = parts[:-1]
a.text = parts[-1]
parent = a.getparent()
style = 'display:block; margin-left: %.3gem'
for i, prefix in enumerate(prefix):
m = 1.5 * i
span = parent.makeelement('span', style=style % m)
ldict[span] = i
parent.append(span)
span.text = prefix
span = parent.makeelement('span', style=style % ((i + 1) * 1.5))
parent.append(span)
span.append(a)
ldict[span] = len(prefix)
"""
The merge algorithm is a little tricky.
We start with a list of elementary blocks. Each is an HtmlElement, a p node
with a list of child nodes. The last child may be a link, and the earlier ones are
just text.
The list is in reverse order from what we want in the index.
There is a dictionary ldict which records the level of each child node.
Now we want to do a reduce-like operation, combining all blocks with the same
top level index entry into a single block representing the structure of all
references, subentries, etc. under that top entry.
Here's the algorithm.
Given a block p and the next block n, and the top level entries p1 and n1 in each
block, which we assume have the same text:
Start with (p, p1) and (n, n1).
Given (p, p1, ..., pk) and (n, n1, ..., nk) which we want to merge:
If there are no more levels in n, and we have a link in nk,
then add the link from nk to the links for pk.
This might be the first link for pk, or we might get a list of references.
Otherwise nk+1 is the next level in n. Look for a matching entry in p. It must have
the same text, it must follow pk, it must come before we find any other p entries at
the same level as pk, and it must have the same level as nk+1.
If we find such a matching entry, go back to the start with (p ... pk+1) and (n ... nk+1).
If there is no matching entry, then because of the original reversed order we want
to insert nk+1 and all following entries from n into p immediately following pk.
"""
def find_match(prev_block, pind, nextent, ldict):
curlevel = ldict.get(prev_block[pind], -1)
if curlevel < 0:
return -1
for p in range(pind+1, len(prev_block)):
trylev = ldict.get(prev_block[p], -1)
if trylev <= curlevel:
return -1
if trylev > (curlevel+1):
continue
if prev_block[p].text_content() == nextent.text_content():
return p
return -1
def add_link(pent, nent, ldict):
na = nent.xpath('descendant::a[1]')
# If there is no link, leave it as text
if not na or len(na) == 0:
return
na = na[0]
pa = pent.xpath('descendant::a')
if pa and len(pa) > 0:
# Put on same line with a comma
pa = pa[-1]
pa.tail = ', '
p = pa.getparent()
p.insert(p.index(pa) + 1, na)
else:
# substitute link na for plain text in pent
pent.text = ""
pent.append(na)
def merge_blocks(prev_block, next_block, pind, nind, next_path, ldict):
# First elements match. Any more in next?
if len(next_path) == (nind + 1):
nextent = next_block[nind]
add_link(prev_block[pind], nextent, ldict)
return
nind = nind + 1
nextent = next_block[nind]
prevent = find_match(prev_block, pind, nextent, ldict)
if prevent > 0:
merge_blocks(prev_block, next_block, prevent, nind, next_path, ldict)
return
# Want to insert elements into previous block
while nind < len(next_block):
# insert takes it out of old
pind = pind + 1
prev_block.insert(pind, next_block[nind])
next_block.getparent().remove(next_block)
def polish_index_markup(index, blocks):
# Blocks are in reverse order at this point
path_map = {}
ldict = {}
for block in blocks:
cls = block.get('class', '') or ''
block.set('class', (cls + ' index-entry').lstrip())
a = block.xpath('descendant::a[1]')
text = ''
if a:
text = etree.tostring(a[0], method='text', with_tail=False, encoding='unicode').strip()
if ':' in text:
path_map[block] = parts = list(filter(None, (x.strip() for x in text.split(':'))))
if len(parts) > 1:
split_up_block(block, a[0], text, parts, ldict)
else:
# try using a span all the time
path_map[block] = [text]
parent = a[0].getparent()
span = parent.makeelement('span', style='display:block; margin-left: 0em')
parent.append(span)
span.append(a[0])
ldict[span] = 0
for br in block.xpath('descendant::br'):
br.tail = None
# We want a single block for each main entry
prev_block = blocks[0]
for block in blocks[1:]:
pp, pn = path_map[prev_block], path_map[block]
if pp[0] == pn[0]:
merge_blocks(prev_block, block, 0, 0, pn, ldict)
else:
prev_block = block