%PDF- %PDF-
| Direktori : /usr/lib/calibre/calibre/ebooks/pdb/haodoo/ |
| Current File : //usr/lib/calibre/calibre/ebooks/pdb/haodoo/reader.py |
'''
Read content from Haodoo.net pdb file.
'''
__license__ = 'GPL v3'
__copyright__ = '2012, Kan-Ru Chen <kanru@kanru.info>'
__docformat__ = 'restructuredtext en'
import struct
import os
from calibre import prepare_string_for_xml
from calibre.ebooks.pdb.formatreader import FormatReader
from calibre.ebooks.metadata import MetaInformation
from calibre.ebooks.txt.processor import opf_writer, HTML_TEMPLATE
# Database identifiers compared against ``header.ident`` by Reader:
# BPDB_IDENT selects the cp950 (legacy) header parser; any other identifier
# is handled with the UTF-16-LE header parser.
BPDB_IDENT = 'BOOKMTIT'
UPDB_IDENT = 'BOOKMTIU'
# Character substitutions applied by fix_punct(): each key is replaced by its
# value. Most keys are vertical-layout presentation forms of brackets and
# quotation marks; the values are their horizontal counterparts.
punct_table = {
"︵": "(",
"︶": ")",
"︷": "{",
"︸": "}",
"︹": "〔",
"︺": "〕",
"︻": "【",
"︼": "】",
"︗": "〖",
"︘": "〗",
"﹇": "[]",
"﹈": "[]",
"︽": "《",
"︾": "》",
"︿": "〈",
"﹀": "〉",
"﹁": "「",
"﹂": "」",
"﹃": "『",
"﹄": "』",
"|": "—",
"︙": "…",
"ⸯ": "~",
"│": "…",
"¦": "…",
# NOTE(review): key and value look identical here, which would make this entry
# a no-op -- possibly a full-width space that was lost in transcoding; confirm
# against the upstream file.
" ": " ",
}
def fix_punct(line):
    '''
    Return ``line`` with every key of ``punct_table`` replaced by the
    corresponding value.

    Replacements are applied one table entry at a time, in the table's
    iteration order.
    '''
    fixed = line
    for original in punct_table:
        fixed = fixed.replace(original, punct_table[original])
    return fixed
class LegacyHeaderRecord:
    '''
    Parsed header record of a cp950 encoded book.

    ``raw`` is the header record as bytes. After leading whitespace bytes are
    stripped, the record is treated as ESC (0x1b) separated fields, with a
    run of three ESC bytes counting as a single separator:

    * field 0 -> ``title`` (decoded as cp950, punctuation fixed)
    * field 1 -> ``num_records`` (parsed with ``int``)
    * remaining fields -> ``chapter_titles`` (one title per field, trailing
      NUL characters removed)
    '''

    def __init__(self, raw):
        def as_text(chunk):
            return chunk.decode('cp950', 'replace')

        # Collapse the triple-ESC separator first so that a single split on
        # ESC yields all the fields.
        normalized = raw.lstrip().replace(b'\x1b\x1b\x1b', b'\x1b')
        parts = normalized.split(b'\x1b')

        self.title = fix_punct(as_text(parts[0]))
        self.num_records = int(parts[1])
        self.chapter_titles = [
            fix_punct(as_text(chunk).rstrip('\x00'))
            for chunk in parts[2:]
        ]
class UnicodeHeaderRecord:
    '''
    Parsed header record of a UTF-16-LE encoded book.

    ``raw`` is the header record as bytes. After leading whitespace bytes are
    stripped, the record is treated as fields separated by a UTF-16-LE ESC
    (``1b 00``), with three consecutive ESC code units counting as a single
    separator:

    * field 0 -> ``title`` (decoded as UTF-16-LE, undecodable data ignored,
      punctuation fixed)
    * field 1 -> ``num_records`` (the raw bytes are passed to ``int``)
    * field 2 -> ``chapter_titles``: split on a UTF-16-LE CR LF pair, each
      piece decoded, stripped of trailing NUL characters and punctuation
      fixed

    NOTE(review): ``lstrip()`` works on single bytes, so it would also remove
    a first byte such as 0x0a or 0x20 that is really the low byte of a
    UTF-16 character -- confirm the record always starts with padding.
    '''

    def __init__(self, raw):
        esc = b'\x1b\x00'
        crlf = b'\r\x00\n\x00'

        # Collapse the triple-ESC separator first so that a single split on
        # ESC yields all the fields.
        normalized = raw.lstrip().replace(b'\x1b\x00\x1b\x00\x1b\x00', esc)
        parts = normalized.split(esc)

        self.title = fix_punct(parts[0].decode('utf_16_le', 'ignore'))
        self.num_records = int(parts[1])
        self.chapter_titles = [
            fix_punct(chunk.decode('utf_16_le', 'replace').rstrip('\x00'))
            for chunk in parts[2].split(crlf)
        ]
class Reader(FormatReader):
    '''
    Convert a Haodoo.net PDB book into a single HTML file plus an OPF.

    All sections are loaded from ``header`` when the reader is created.
    Section 0 is parsed as the header record; sections
    ``1..num_records`` hold the chapter text.
    '''

    def __init__(self, header, stream, log, options):
        '''
        :param header: PDB header object; ``num_sections``, ``section_data(i)``
            and ``ident`` are read from it.
        :param stream: seekable binary stream of the PDB file (used by
            :meth:`author`).
        :param log: logger providing ``info`` and ``debug``.
        :param options: accepted for interface compatibility; not used here.
        '''
        self.stream = stream
        self.log = log
        self.sections = [header.section_data(i)
                         for i in range(header.num_sections)]

        # The identifier decides both the header layout and the text
        # encoding used for every other section.
        if header.ident == BPDB_IDENT:
            self.header_record = LegacyHeaderRecord(self.section_data(0))
            self.encoding = 'cp950'
        else:
            self.header_record = UnicodeHeaderRecord(self.section_data(0))
            self.encoding = 'utf_16_le'

    def author(self):
        '''
        Return the author stored in the first 35 bytes of the stream, or
        ``'Unknown'`` when the signed byte at offset 35 is not 2.
        '''
        self.stream.seek(35)
        version = struct.unpack('>b', self.stream.read(1))[0]
        if version != 2:
            return 'Unknown'

        self.stream.seek(0)
        raw = self.stream.read(35)
        if self.encoding == 'utf_16_le':
            # The field length is odd; keep only whole 2-byte code units so
            # the dangling byte does not decode to U+FFFD.
            raw = raw[:len(raw) - len(raw) % 2]
        # Decode first, then strip the NUL padding as characters. Stripping
        # NUL *bytes* before decoding would eat the zero high byte of a
        # trailing UTF-16 code unit below U+0100 and corrupt the last
        # character.
        return raw.decode(self.encoding, 'replace').rstrip('\x00')

    def get_metadata(self):
        '''
        Build the book metadata: header title, a single author from
        :meth:`author`, and the language fixed to ``zh-tw``.
        '''
        mi = MetaInformation(self.header_record.title,
                [self.author()])
        mi.language = 'zh-tw'
        return mi

    def section_data(self, number):
        '''Return the raw bytes of section ``number``.'''
        return self.sections[number]

    def decompress_text(self, number):
        '''
        Return section ``number`` as text.

        Despite the name, nothing is decompressed: the section bytes are
        decoded with the book encoding (undecodable data replaced) and
        trailing NUL characters are removed.
        '''
        return self.section_data(number).decode(self.encoding,
                'replace').rstrip('\x00')

    @staticmethod
    def _chapter_html(title, text):
        '''
        Render one chapter's ``text`` as HTML lines joined by newlines.

        The first line containing ``title`` becomes the chapter heading; if
        no line contains it, a heading built from ``title`` is put first.
        All book text, headings included, is XML-escaped.
        '''
        lines = []
        title_added = False
        for line in text.splitlines():
            line = fix_punct(line).strip()
            if not title_added and title in line:
                # The title is matched against the unescaped line; escaping
                # happens only when the markup is built. The heading is
                # still wrapped in <p>...</p> below, like every other line,
                # to keep the output structure stable.
                line = ('<h1 class="chapter">' +
                        prepare_string_for_xml(line) + '</h1>\n')
                title_added = True
            else:
                line = prepare_string_for_xml(line)
            lines.append('<p>%s</p>' % line)
        if not title_added:
            lines.insert(0, '<h1 class="chapter">' +
                         prepare_string_for_xml(title) + '</h1>\n')
        return '\n'.join(lines)

    def extract_content(self, output_dir):
        '''
        Write ``index.html`` (UTF-8) and ``metadata.opf`` into
        ``output_dir`` and return the path of the OPF file.

        Raises ``IndexError`` if the header declares more records than there
        are chapter titles or sections.
        '''
        self.log.info('Decompressing text...')
        chapters = []
        for i in range(1, self.header_record.num_records + 1):
            self.log.debug('\tDecompressing text section %i' % i)
            title = self.header_record.chapter_titles[i-1]
            chapters.append(self._chapter_html(title, self.decompress_text(i)))
        # Chapters are concatenated without a separator; joining once avoids
        # repeatedly copying the growing document.
        txt = ''.join(chapters)

        self.log.info('Converting text to OEB...')
        # The title comes straight from the book file, so escape it before
        # placing it in the markup.
        html = HTML_TEMPLATE % (
            prepare_string_for_xml(self.header_record.title), txt)
        with open(os.path.join(output_dir, 'index.html'), 'wb') as index:
            index.write(html.encode('utf-8'))

        mi = self.get_metadata()
        manifest = [('index.html', None)]
        spine = ['index.html']
        opf_writer(output_dir, 'metadata.opf', manifest, spine, mi)
        return os.path.join(output_dir, 'metadata.opf')