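# NOTE: residue from a web file-manager listing ("%PDF-" marker and
# directory/file banners), kept as comments; it is not part of the module.
# Directory: /lib/calibre/calibre/utils/
# Current file: /lib/calibre/calibre/utils/icu_test.py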
#!/usr/bin/env python3
__license__ = 'GPL v3'
__copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'
import unittest, sys
from contextlib import contextmanager
import calibre.utils.icu as icu
from polyglot.builtins import iteritems, cmp
@contextmanager
def make_collation_func(name, locale, numeric=True, maker=icu.make_sort_key_func, func='strcmp'):
    """Context manager yielding a collation helper bound to *locale*.

    Parameters:
        name: Label supplied by callers; it is not used by this function.
        locale: Locale identifier handed to ``icu._icu.Collator``.
        numeric: Value assigned to the new collator's ``numeric`` attribute.
        maker: Callable invoked as ``maker(collator_factory, func)``; its
            return value is what the ``with`` statement receives.
        func: Second argument forwarded to *maker* (callers in this file
            pass ``'sort_key'`` or ``'strcmp'``).

    Nothing is released or restored when the ``with`` block exits.
    """
    def build_collator():
        # Zero-argument factory handed to *maker*; each call constructs a
        # fresh Collator for the requested locale.
        collator = icu._icu.Collator(locale)
        collator.numeric = numeric
        return collator

    collation_func = maker(build_collator, func)
    yield collation_func
class TestICU(unittest.TestCase):
    """Tests for the ``calibre.utils.icu`` wrappers.

    Covers sorting/collation, case changing, substring search, collation
    order, string round-tripping, character names, contractions and the
    word break iterator.
    """

    # Short aliases for the two assertions used most often below.
    ae = unittest.TestCase.assertEqual
    ane = unittest.TestCase.assertNotEqual

    def setUp(self):
        """Switch the ICU layer to the 'en' locale before each test."""
        icu.change_locale('en')

    def _check_collation_groups(self, collator, groups):
        """Compare the collation order of consecutive strings in each group.

        *groups* is an iterable of ``(assert_func, strings)`` pairs. For every
        string the first element returned by ``collator.collation_order`` is
        taken, and ``assert_func(previous, current, msg)`` is called for each
        adjacent pair within the group.
        """
        for assert_func, strings in groups:
            last = None
            for text in strings:
                order, _ = collator.collation_order(text)
                if last is not None:
                    assert_func(last, order, f'Order for {text} not correct: {last} != {order}')
                last = order

    def test_sorting(self):
        """Test the various sorting APIs."""
        german = '''Sonntag Montag Dienstag Januar Februar März Fuße Fluße Flusse flusse fluße flüße flüsse'''.split()
        german_good = '''Dienstag Februar flusse Flusse fluße Fluße flüsse flüße Fuße Januar März Montag Sonntag'''.split()
        french = '''dimanche lundi mardi janvier février mars déjà Meme deja même dejà bpef bœg Boef Mémé bœf boef bnef pêche pèché pêché pêche pêché'''.split()
        french_good = '''bnef boef Boef bœf bœg bpef deja dejà déjà dimanche février janvier lundi mardi mars Meme Mémé même pèché pêche pêche pêché pêché'''.split()  # noqa

        # Test corner cases
        sort_key = icu.sort_key
        s = '\U0001f431'
        default_encoded = s.encode(sys.getdefaultencoding())
        self.ae(sort_key(s), sort_key(default_encoded), 'UTF-8 encoded object not correctly decoded to generate sort key')
        # The 'utf-16' codec prepends a byte order mark, which is not valid
        # UTF-8. Per the assertion message, sort_key() is expected to hand
        # such undecodable bytes back unchanged. (The previous version
        # compared the encoded bytes with themselves and could never fail.)
        undecodable = s.encode('utf-16')
        self.ae(undecodable, sort_key(undecodable), 'Undecodable bytestring not returned as itself')
        self.ae(b'', sort_key(None))
        self.ae(0, icu.strcmp(None, b''))
        self.ae(0, icu.strcmp(s, default_encoded))

        # Test locales
        with make_collation_func('dsk', 'de', maker=icu.make_sort_key_func, func='sort_key') as dsk:
            self.ae(german_good, sorted(german, key=dsk))
            with make_collation_func('dcmp', 'de', maker=icu.make_two_arg_func, func='strcmp') as dcmp:
                # Comparing sort keys must agree with the two-argument compare.
                for x in german:
                    for y in german:
                        self.ae(cmp(dsk(x), dsk(y)), dcmp(x, y))

        with make_collation_func('fsk', 'fr', maker=icu.make_sort_key_func, func='sort_key') as fsk:
            self.ae(french_good, sorted(french, key=fsk))
            with make_collation_func('fcmp', 'fr', maker=icu.make_two_arg_func) as fcmp:
                for x in french:
                    for y in french:
                        self.ae(cmp(fsk(x), fsk(y)), fcmp(x, y))

        with make_collation_func('ssk', 'es', maker=icu.make_sort_key_func, func='sort_key') as ssk:
            self.assertNotEqual(ssk('peña'), ssk('pena'))
            with make_collation_func('scmp', 'es', maker=icu.make_two_arg_func) as scmp:
                self.assertNotEqual(0, scmp('pena', 'peña'))

        for k, v in {'pèché': 'peche', 'flüße': 'Flusse', 'Štepánek': 'ŠtepaneK'}.items():
            self.ae(0, icu.primary_strcmp(k, v))

        # Test different types of collation
        self.ae(icu.primary_sort_key('Aä'), icu.primary_sort_key('aa'))
        self.assertLess(icu.numeric_sort_key('something 2'), icu.numeric_sort_key('something 11'))
        self.assertLess(icu.case_sensitive_sort_key('A'), icu.case_sensitive_sort_key('a'))
        self.ae(0, icu.strcmp('a', 'A'))
        self.ae(cmp('a', 'A'), icu.case_sensitive_strcmp('a', 'A'))
        self.ae(0, icu.primary_strcmp('ä', 'A'))

    def test_change_case(self):
        """Test the various ways of changing the case."""
        from calibre.utils.titlecase import titlecase

        # Test corner cases
        self.ae('A', icu.upper(b'a'))
        # Empty and non-string inputs are expected to come back unchanged.
        for x in ('', None, False, 1):
            self.ae(x, icu.capitalize(x))

        for x in ('a', 'Alice\'s code', 'macdonald\'s machIne', '02 the wars'):
            self.ae(icu.upper(x), x.upper())
            self.ae(icu.lower(x), x.lower())
            # ICU's title case algorithm is different from ours, when there are
            # capitals inside words
            self.ae(icu.title_case(x), titlecase(x).replace('machIne', 'Machine'))
            self.ae(icu.capitalize(x), x[0].upper() + x[1:].lower())
            self.ae(icu.swapcase(x), x.swapcase())

    def test_find(self):
        """Test searching for substrings."""
        self.ae((1, 1), icu.find(b'a', b'1ab'))
        self.ae((1, 1), icu.find('\U0001f431', 'x\U0001f431x'))
        self.ae((1, 1), icu.find('y', '\U0001f431y'))
        self.ae((0, 4), icu.primary_find('pena', 'peña'))
        for k, v in {'pèché': 'peche', 'flüße': 'Flusse', 'Štepánek': 'ŠtepaneK'}.items():
            self.ae((1, len(k)), icu.primary_find(v, ' ' + k), f'Failed to find {v} in {k}')

        self.assertTrue(icu.startswith(b'abc', b'ab'))
        self.assertTrue(icu.startswith('abc', 'abc'))
        self.assertFalse(icu.startswith('xyz', 'a'))
        self.assertTrue(icu.startswith('xxx', ''))
        self.assertTrue(icu.primary_startswith('pena', 'peña'))

        self.assertTrue(icu.contains('\U0001f431', '\U0001f431'))
        self.assertTrue(icu.contains('something', 'some other something else'))
        self.assertTrue(icu.contains('', 'a'))
        self.assertTrue(icu.contains('', ''))
        self.assertFalse(icu.contains('xxx', 'xx'))
        self.assertTrue(icu.primary_contains('pena', 'peña'))

    def test_collation_order(self):
        """Test collation ordering with the numeric collator."""
        self._check_collation_groups(icu.numeric_collator(), [
            (self.ae, ('Šaa', 'Smith', 'Solženicyn', 'Štepánek')),
            (self.ae, ('11', '011')),
            (self.ane, ('2', '1')),
            (self.ae, ('100 Smith', '0100 Smith')),
        ])
        self.ae(dict(icu.partition_by_first_letter(['A1', '', 'a1', '\U0001f431', '\U0001f431x'])),
                {' ': [''], 'A': ['A1', 'a1'], '\U0001f431': ['\U0001f431', '\U0001f431x']})

    def test_collation_order_for_partitioning(self):
        """Test collation ordering with the non-numeric sort collator."""
        self._check_collation_groups(icu.non_numeric_sort_collator(), [
            (self.ae, ('Smith', 'Šaa', 'Solženicyn', 'Štepánek')),
            (self.ane, ('11', '011')),
            (self.ae, ('102 Smith', '100 Smith')),
            (self.ane, ('100 Smith', '0100 Smith')),
        ])
        self.ae(dict(icu.partition_by_first_letter(['A1', '', 'a1', '\U0001f431', '\U0001f431x'])),
                {' ': [''], 'A': ['A1', 'a1'], '\U0001f431': ['\U0001f431', '\U0001f431x']})

    def test_roundtrip(self):
        """Test round-tripping strings through the ICU extension."""
        for r in ('xxx\0\u2219\U0001f431xxx', '\0', '', 'simple'):
            self.ae(r, icu._icu.roundtrip(r))
        # Lone surrogates are expected to come back as U+FFFD.
        self.ae(icu._icu.roundtrip('\ud8e81'), '\ufffd1')
        self.ae(icu._icu.roundtrip('\udc01\ud8e8'), '\ufffd\ufffd')
        for x, length in [('', 0), ('a', 1), ('\U0001f431', 1)]:
            self.ae(icu._icu.string_length(x), length)
        # A non-BMP character is expected to count as two UTF-16 units.
        for x, length in [('', 0), ('a', 1), ('\U0001f431', 2)]:
            self.ae(icu._icu.utf16_length(x), length)
        self.ae(icu._icu.chr(0x1f431), '\U0001f431')
        self.ae(icu._icu.ord_string('abc'*100), tuple(map(ord, 'abc'*100)))
        self.ae(icu._icu.ord_string('\U0001f431'), (0x1f431,))

    def test_character_name(self):
        """Test character naming."""
        from calibre.utils.unicode_names import character_name_from_code
        for q, e in {
            '\U0001f431': 'CAT FACE'
        }.items():
            self.ae(icu.character_name(q), e)
            self.ae(character_name_from_code(icu.ord_string(q)[0]), e)

    def test_contractions(self):
        """Test contractions (currently always skipped)."""
        # skipTest() raises unittest.SkipTest, so the statements after it do
        # not execute; they are retained to show what the check looks like.
        self.skipTest('Skipping as this depends too much on ICU version')
        c = icu._icu.Collator('cs')
        self.ae(icu.contractions(c), frozenset({'Z\u030c', 'z\u030c', 'Ch',
            'C\u030c', 'ch', 'cH', 'c\u030c', 's\u030c', 'r\u030c', 'CH',
            'S\u030c', 'R\u030c'}))

    def test_break_iterator(self):
        """Test the break iterator."""
        from calibre.spell.break_iterator import split_into_words as split, index_of, split_into_words_and_positions, count_words
        for q in ('one two three', ' one two three', 'one\ntwo  three ', ):
            self.ae(split(q), ['one', 'two', 'three'], f'Failed to split: {q!r}')
        self.ae(split('I I\'m'), ['I', "I'm"])
        self.ae(split('out-of-the-box'), ['out-of-the-box'])
        self.ae(split('-one two-'), ['-one', 'two-'])
        self.ae(split('-one a-b-c-d e'), ['-one', 'a-b-c-d', 'e'])
        self.ae(split('-one -a-b-c-d- e'), ['-one', '-a-b-c-d-', 'e'])
        self.ae(split_into_words_and_positions('one \U0001f431 three'), [(0, 3), (6, 5)])
        self.ae(count_words('a b c d e f'), 6)
        # Each row is (needle, haystack, expected index); -1 means not found.
        for needle, haystack, pos in (
            ('word', 'a word b', 2),
            ('word', 'a word', 2),
            ('one-two', 'a one-two punch', 2),
            ('one-two', 'one-two punch', 0),
            ('one-two', 'one-two', 0),
            ('one', 'one-two one', 8),
            ('one-two', 'one-two-three one-two', 14),
            ('one', 'onet one', 5),
            ('two', 'one-two two', 8),
            ('two', 'two-one two', 8),
            ('-two', 'one-two -two', 8),
            ('-two', 'two', -1),
            ('i', 'i', 0),
            ('i', 'six i', 4),
            ('i', '', -1), ('', '', -1), ('', 'i', -1),
            ('i', 'six clicks', -1),
            ('i', '\U0001f431 i', 2),
            ('-a', 'b -a', 2),
            ('a-', 'a-b a- d', 4),
            ('-a-', 'b -a -a-', 5),
            ('-a-', '-a-', 0),
            ('-a-', 'a-', -1),
            ('-a-', '-a', -1),
            ('-a-', 'a', -1),
            ('a-', 'a-', 0),
            ('-a', '-a', 0),
            ('a-b-c-', 'a-b-c-d', -1),
            ('a-b-c-', 'a-b-c-.', 0),
            ('a-b-c-', 'a-b-c-d a-b-c- d', 8),
        ):
            fpos = index_of(needle, haystack)
            self.ae(pos, fpos, f'Failed to find index of {needle!r} in {haystack!r} ({pos} != {fpos})')
def find_tests():
    """Return a test suite built from every test method of ``TestICU``."""
    loader = unittest.defaultTestLoader
    return loader.loadTestsFromTestCase(TestICU)
class TestRunner(unittest.main):
    """``unittest.main`` variant that always runs the suite from ``find_tests()``."""

    def createTests(self, from_discovery=False, Loader=None):
        """Install the suite returned by ``find_tests()`` as ``self.test``.

        The base class's discovery path calls this hook with the
        ``from_discovery`` and ``Loader`` keyword arguments. They are
        accepted, with the base-class defaults, so that call cannot raise
        ``TypeError``, and then ignored because the suite is fixed.
        """
        self.test = find_tests()
def run(verbosity=4):
    """Run the ICU tests via ``TestRunner`` without exiting the interpreter.

    *verbosity* is forwarded to ``TestRunner``; ``exit=False`` is always
    passed. Nothing is returned.
    """
    runner_options = {'verbosity': verbosity, 'exit': False}
    TestRunner(**runner_options)
def test_build():
    """Run the ICU tests quietly and raise ``SystemExit(1)`` if they fail.

    Only the first element of ``sys.argv`` is handed to the runner, so any
    further command-line arguments of the calling process are not passed on.
    """
    program = TestRunner(
        verbosity=0,
        buffer=True,
        catchbreak=True,
        failfast=True,
        argv=sys.argv[:1],
        exit=False,
    )
    if program.result.wasSuccessful():
        return
    raise SystemExit(1)
if __name__ == '__main__':
    # Executed directly as a script: run the suite at verbosity 4.
    run(verbosity=4)