#!/usr/bin/env python3
#
# Copyright (C) 2006 Søren Roug, European Environment Agency
#
# This is free software. You may redistribute it under the terms
# of the Apache license and the GNU General Public License Version
# 2 or at your option any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public
# License along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#
# Contributor(s):
#

import io
import json
import os
import re

from lxml.etree import fromstring, tostring

from calibre.ebooks.metadata import (
    MetaInformation, authors_to_string, check_isbn, string_to_authors
)
from calibre.utils.date import isoformat, parse_date
from calibre.utils.imghdr import identify
from calibre.utils.localization import canonicalize_lang, lang_as_iso639_1
from calibre.utils.zipfile import ZipFile, safe_replace
from odf.draw import Frame as odFrame, Image as odImage
from odf.namespaces import DCNS, METANS, OFFICENS
from odf.opendocument import load as odLoad
from polyglot.builtins import as_unicode

# Map of short field names to (XML namespace, local tag name) pairs as they
# appear in an ODF document's meta.xml.
fields = {
    'title':            (DCNS, 'title'),
    'description':      (DCNS, 'description'),
    'subject':          (DCNS, 'subject'),
    'creator':          (DCNS, 'creator'),
    'date':             (DCNS, 'date'),
    'language':         (DCNS, 'language'),
    'generator':        (METANS, 'generator'),
    'initial-creator':  (METANS, 'initial-creator'),
    'keyword':          (METANS, 'keyword'),
    'keywords':         (METANS, 'keywords'),
    'editing-duration': (METANS, 'editing-duration'),
    'editing-cycles':   (METANS, 'editing-cycles'),
    'printed-by':       (METANS, 'printed-by'),
    'print-date':       (METANS, 'print-date'),
    'creation-date':    (METANS, 'creation-date'),
    'user-defined':     (METANS, 'user-defined'),
    # 'template':         (METANS, 'template'),
}


def uniq(vals):
    '''
    Remove all duplicates from vals, while preserving order.
    '''
    vals = vals or ()
    seen = set()
    seen_add = seen.add
    # seen_add() returns None (falsy), so the condition keeps only the first
    # occurrence of each value while recording it in `seen`.
    return list(x for x in vals if x not in seen and not seen_add(x))


def get_metadata(stream, extract_cover=True):
    '''
    Read metadata from an ODF (e.g. ODT) file.

    :param stream: a seekable file-like object containing the ODF zip archive
    :param extract_cover: if True, also load the cover image data into
                          ``mi.cover_data``
    :return: a MetaInformation object populated from meta.xml (and, when
             present, from calibre's ``opf.*`` user-defined fields)
    '''
    whitespace = re.compile(r'\s+')

    def normalize(s):
        return whitespace.sub(' ', s).strip()

    with ZipFile(stream) as zf:
        meta = zf.read('meta.xml')
        root = fromstring(meta)

        def find(field):
            # Return the normalized text of the first matching element, or None.
            ns, tag = fields[field]
            ans = root.xpath(f'//ns0:{tag}', namespaces={'ns0': ns})
            if ans:
                return normalize(tostring(ans[0], method='text', encoding='unicode', with_tail=False)).strip()

        def find_all(field):
            # Yield the normalized text of every matching element.
            ns, tag = fields[field]
            for x in root.xpath(f'//ns0:{tag}', namespaces={'ns0': ns}):
                yield normalize(tostring(x, method='text', encoding='unicode', with_tail=False)).strip()

        mi = MetaInformation(None, [])
        title = find('title')
        if title:
            mi.title = title
        creator = find('initial-creator') or find('creator')
        if creator:
            mi.authors = string_to_authors(creator)
        desc = find('description')
        if desc:
            mi.comments = desc
        lang = find('language')
        if lang and canonicalize_lang(lang):
            mi.languages = [canonicalize_lang(lang)]
        # Keywords may appear under either tag name, and each element may
        # itself contain a comma separated list.
        keywords = []
        for q in ('keyword', 'keywords'):
            for kw in find_all(q):
                keywords += [x.strip() for x in kw.split(',') if x.strip()]
        mi.tags = uniq(keywords)

        # Collect <meta:user-defined> name/value pairs; booleans are decoded,
        # everything else stays a string.
        data = {}
        for tag in root.xpath('//ns0:user-defined', namespaces={'ns0': fields['user-defined'][0]}):
            name = (tag.get('{%s}name' % METANS) or '').lower()
            vtype = tag.get('{%s}value-type' % METANS) or 'string'
            val = tag.text
            if name and val:
                if vtype == 'boolean':
                    val = val == 'true'
                data[name] = val

        opfmeta = False  # we need this later for the cover
        opfnocover = False
        if data.get('opf.metadata'):
            # custom metadata contains OPF information
            opfmeta = True
            if data.get('opf.titlesort', ''):
                mi.title_sort = data['opf.titlesort']
            if data.get('opf.authors', ''):
                mi.authors = string_to_authors(data['opf.authors'])
            if data.get('opf.authorsort', ''):
                mi.author_sort = data['opf.authorsort']
            if data.get('opf.isbn', ''):
                isbn = check_isbn(data['opf.isbn'])
                if isbn is not None:
                    mi.isbn = isbn
            if data.get('opf.publisher', ''):
                mi.publisher = data['opf.publisher']
            if data.get('opf.pubdate', ''):
                mi.pubdate = parse_date(data['opf.pubdate'], assume_utc=True)
            if data.get('opf.identifiers'):
                try:
                    mi.identifiers = json.loads(data['opf.identifiers'])
                except Exception:
                    pass
            if data.get('opf.rating'):
                try:
                    # Clamp to calibre's 0-10 (half-star) rating scale.
                    mi.rating = max(0, min(float(data['opf.rating']), 10))
                except Exception:
                    pass
            if data.get('opf.series', ''):
                mi.series = data['opf.series']
                if data.get('opf.seriesindex', ''):
                    try:
                        mi.series_index = float(data['opf.seriesindex'])
                    except Exception:
                        mi.series_index = 1.0
            if data.get('opf.language', ''):
                cl = canonicalize_lang(data['opf.language'])
                if cl:
                    mi.languages = [cl]
            opfnocover = data.get('opf.nocover', False)
        if not opfnocover:
            try:
                read_cover(stream, zf, mi, opfmeta, extract_cover)
            except Exception:
                pass  # Do not let an error reading the cover prevent reading other data

    return mi


def set_metadata(stream, mi):
    '''
    Write the metadata in ``mi`` into the ODF file in ``stream``, replacing
    its meta.xml in place (via safe_replace on the zip archive).
    '''
    with ZipFile(stream) as zf:
        raw = _set_metadata(zf.open('meta.xml').read(), mi)
    # print(raw.decode('utf-8'))
    # Rewind before safe_replace rewrites the archive. The original code
    # called stream.seek(os.SEEK_SET), which only works because
    # os.SEEK_SET == 0; make the intent explicit.
    stream.seek(0, os.SEEK_SET)
    safe_replace(stream, "meta.xml", io.BytesIO(raw))


def _set_metadata(raw, mi):
    '''
    Return the bytes of a new meta.xml with the non-null fields of ``mi``
    applied. Standard ODF fields (title, creator, ...) are set directly;
    calibre-specific fields are stored as ``opf.*`` user-defined entries.
    '''
    root = fromstring(raw)
    namespaces = {'office': OFFICENS, 'meta': METANS, 'dc': DCNS}
    nsrmap = {v: k for k, v in namespaces.items()}

    def xpath(expr, parent=root):
        return parent.xpath(expr, namespaces=namespaces)

    def remove(*tag_names):
        # Remove all occurrences of the given standard fields from <office:meta>.
        for tag_name in tag_names:
            ns = fields[tag_name][0]
            tag_name = f'{nsrmap[ns]}:{tag_name}'
            for x in xpath('descendant::' + tag_name, meta):
                x.getparent().remove(x)

    def add(tag, val=None):
        # Append a new element for the given field and return it.
        ans = meta.makeelement('{%s}%s' % fields[tag])
        ans.text = val
        meta.append(ans)
        return ans

    def remove_user_metadata(*names):
        # Remove <meta:user-defined> entries whose (lowercased) name matches.
        for x in xpath('//meta:user-defined'):
            q = (x.get('{%s}name' % METANS) or '').lower()
            if q in names:
                x.getparent().remove(x)

    def add_um(name, val, vtype='string'):
        ans = add('user-defined', val)
        ans.set('{%s}value-type' % METANS, vtype)
        ans.set('{%s}name' % METANS, name)

    def add_user_metadata(name, val):
        # On first use, write the opf.metadata sentinel so get_metadata()
        # knows the opf.* fields are authoritative.
        if not hasattr(add_user_metadata, 'sentinel_added'):
            add_user_metadata.sentinel_added = True
            remove_user_metadata('opf.metadata')
            add_um('opf.metadata', 'true', 'boolean')
        val_type = 'string'
        if hasattr(val, 'strftime'):
            # Dates are serialized as an ISO date (no time component).
            val = isoformat(val, as_utc=True).split('T')[0]
            val_type = 'date'
        add_um(name, val, val_type)

    meta = xpath('//office:meta')[0]

    if not mi.is_null('title'):
        remove('title')
        add('title', mi.title)
    if not mi.is_null('title_sort'):
        remove_user_metadata('opf.titlesort')
        add_user_metadata('opf.titlesort', mi.title_sort)
    if not mi.is_null('authors'):
        remove('initial-creator', 'creator')
        val = authors_to_string(mi.authors)
        add('initial-creator', val)
        add('creator', val)
        remove_user_metadata('opf.authors')
        add_user_metadata('opf.authors', val)
    if not mi.is_null('author_sort'):
        remove_user_metadata('opf.authorsort')
        add_user_metadata('opf.authorsort', mi.author_sort)
    if not mi.is_null('comments'):
        remove('description')
        add('description', mi.comments)
    if not mi.is_null('tags'):
        remove('keyword')
        add('keyword', ', '.join(mi.tags))
    if not mi.is_null('languages'):
        lang = lang_as_iso639_1(mi.languages[0])
        if lang:
            remove('language')
            add('language', lang)
    if not mi.is_null('pubdate'):
        remove_user_metadata('opf.pubdate')
        add_user_metadata('opf.pubdate', mi.pubdate)
    if not mi.is_null('publisher'):
        remove_user_metadata('opf.publisher')
        add_user_metadata('opf.publisher', mi.publisher)
    if not mi.is_null('series'):
        remove_user_metadata('opf.series', 'opf.seriesindex')
        add_user_metadata('opf.series', mi.series)
        add_user_metadata('opf.seriesindex', f'{mi.series_index}')
    if not mi.is_null('identifiers'):
        remove_user_metadata('opf.identifiers')
        add_user_metadata('opf.identifiers', as_unicode(json.dumps(mi.identifiers)))
    if not mi.is_null('rating'):
        remove_user_metadata('opf.rating')
        add_user_metadata('opf.rating', '%.2g' % mi.rating)

    return tostring(root, encoding='utf-8', pretty_print=True)


def read_cover(stream, zin, mi, opfmeta, extract_cover):
    # search for an draw:image in a draw:frame with the name 'opf.cover'
    # if opf.metadata prop is false, just use the first image that
    # has a proper size (borrowed from docx)
    otext = odLoad(stream)
    cover_href = None
    cover_data = None
    cover_frame = None
    imgnum = 0
    for frm in otext.topnode.getElementsByType(odFrame):
        img = frm.getElementsByType(odImage)
        if len(img) == 0:
            continue
        i_href = img[0].getAttribute('href')
        try:
            raw = zin.read(i_href)
        except KeyError:
            # Image referenced but not present in the archive; skip it.
            continue
        try:
            fmt, width, height = identify(raw)
        except Exception:
            continue
        imgnum += 1
        if opfmeta and frm.getAttribute('name').lower() == 'opf.cover':
            cover_href = i_href
            cover_data = (fmt, raw)
            cover_frame = frm.getAttribute('name')  # could have upper case
            break
        if cover_href is None and imgnum == 1 and 0.8 <= height/width <= 1.8 and height*width >= 12000:
            # Pick the first image as the cover if it is of a suitable size
            cover_href = i_href
            cover_data = (fmt, raw)
            if not opfmeta:
                break
    if cover_href is not None:
        mi.cover = cover_href
        mi.odf_cover_frame = cover_frame
        if extract_cover:
            if not cover_data:
                raw = zin.read(cover_href)
                try:
                    fmt = identify(raw)[0]
                except Exception:
                    pass
                else:
                    cover_data = (fmt, raw)
            mi.cover_data = cover_data