%PDF- %PDF-
| Direktori : /lib/calibre/calibre/utils/ |
| Current File : //lib/calibre/calibre/utils/icu.py |
#!/usr/bin/env python3
__license__ = 'GPL v3'
__copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
# Setup code {{{
import codecs
import sys
from calibre.utils.config_base import tweaks, prefs
from calibre_extensions import icu as _icu
from polyglot.builtins import cmp
# Lazily-initialised module state: the locale used for collation and the
# cached collator variants derived from it. Everything starts unset and is
# filled in on first use by the accessor functions defined in this module.
_locale = _collator = _primary_collator = _sort_collator = _non_numeric_sort_collator = _numeric_collator = _case_sensitive_collator = None
# Bare name reference with no effect beyond the lookup; presumably kept so the
# import is not reported as unused -- TODO confirm.
cmp
# Empty str / bytes constants; not referenced in the visible part of this file.
_none = ''
_none2 = b''
# Cache written and read by contractions().
_cmap = {}
# Value of the extension's ``unicode_version`` attribute, or None when the
# extension does not provide one.
icu_unicode_version = getattr(_icu, 'unicode_version', None)
# Normalization form name -> extension constant; looked up by normalize().
_nmodes = {m:getattr(_icu, m) for m in ('NFC', 'NFD', 'NFKC', 'NFKD')}
# Ensure that the python internal filesystem and default encodings are not ASCII
def is_ascii(name):
    """Return True if the codec called *name* is ASCII.

    Unknown codec names and non-string arguments are treated as ASCII, so
    callers err on the side of replacing the encoding.
    """
    try:
        # CodecInfo.name is a str (the canonical codec name), so it must be
        # compared against a str; comparing with b'ascii' is always False.
        return codecs.lookup(name).name == 'ascii'
    except (TypeError, LookupError):
        return True
# Best-effort switch of the interpreter's default and filesystem encodings to
# UTF-8 when they are ASCII. Failures are reported but never abort the import.
# Only Exception is caught so that KeyboardInterrupt / SystemExit still
# propagate.
try:
    if is_ascii(sys.getdefaultencoding()):
        _icu.set_default_encoding(b'utf-8')
except Exception:
    import traceback
    traceback.print_exc()

try:
    if is_ascii(sys.getfilesystemencoding()):
        _icu.set_filesystem_encoding(b'utf-8')
except Exception:
    import traceback
    traceback.print_exc()

# The helper is only needed during module setup.
del is_ascii
def collator():
    """Return the shared base collator, building it on the first call.

    When no locale has been chosen yet, it is taken from the
    ``locale_for_sorting`` tweak if that is set, otherwise from
    ``get_lang()``. If a collator cannot be created for that locale, a
    message is printed and an English collator is used instead.
    """
    global _collator, _locale
    if _collator is not None:
        return _collator

    if _locale is None:
        from calibre.utils.localization import get_lang
        override = tweaks['locale_for_sorting']
        _locale = override if override else get_lang()

    try:
        _collator = _icu.Collator(_locale)
    except Exception as e:
        print(f'Failed to load collator for locale: {_locale!r} with error {e!r}, using English')
        _collator = _icu.Collator('en')
    return _collator
def change_locale(locale=None):
    """Set the collation locale and drop every cached collator.

    The collators are rebuilt on demand by their accessor functions, using
    the new locale. Passing None clears the stored locale.
    """
    global _locale, _collator, _primary_collator, _sort_collator, _numeric_collator, _case_sensitive_collator, _non_numeric_sort_collator
    _locale = locale
    # Invalidate all cached collators.
    _collator = None
    _primary_collator = None
    _sort_collator = None
    _non_numeric_sort_collator = None
    _numeric_collator = None
    _case_sensitive_collator = None
def primary_collator():
    """Return the cached collator that ignores case differences and accented characters.

    It is a clone of the base collator with its strength set to
    ``UCOL_PRIMARY``, created on the first call.
    """
    global _primary_collator
    if _primary_collator is not None:
        return _primary_collator
    _primary_collator = fresh = collator().clone()
    fresh.strength = _icu.UCOL_PRIMARY
    return fresh
def sort_collator():
    """Return the cached collator used for general sorting.

    Ignores case differences and recognizes numbers in strings when the
    ``numeric_collation`` preference is set. It is a clone of the base
    collator with ``UCOL_SECONDARY`` strength, created on the first call.
    """
    global _sort_collator
    if _sort_collator is not None:
        return _sort_collator
    _sort_collator = fresh = collator().clone()
    fresh.strength = _icu.UCOL_SECONDARY
    fresh.numeric = prefs['numeric_collation']
    return fresh
def non_numeric_sort_collator():
    """Return the cached collator that ignores case differences only.

    It is a clone of the base collator with ``UCOL_SECONDARY`` strength and
    numeric collation switched off, created on the first call.
    """
    global _non_numeric_sort_collator
    if _non_numeric_sort_collator is not None:
        return _non_numeric_sort_collator
    _non_numeric_sort_collator = fresh = collator().clone()
    fresh.strength = _icu.UCOL_SECONDARY
    fresh.numeric = False
    return fresh
def numeric_collator():
    """Return the cached collator that sorts embedded numbers naturally.

    With it, ``something2`` sorts before ``something10``. It is a clone of
    the base collator with ``UCOL_SECONDARY`` strength and numeric collation
    switched on, created on the first call.
    """
    global _numeric_collator
    if _numeric_collator is not None:
        return _numeric_collator
    _numeric_collator = fresh = collator().clone()
    fresh.strength = _icu.UCOL_SECONDARY
    fresh.numeric = True
    return fresh
def case_sensitive_collator():
    """Return the cached collator that always sorts upper case before lower case.

    It is a clone of the base collator, created on the first call. Its
    numeric setting is copied from sort_collator() and ``upper_first`` is
    enabled.
    """
    global _case_sensitive_collator
    if _case_sensitive_collator is not None:
        return _case_sensitive_collator
    _case_sensitive_collator = fresh = collator().clone()
    fresh.numeric = sort_collator().numeric
    fresh.upper_first = True
    return fresh
def make_sort_key_func(collator_function, func_name='sort_key'):
    """Build a one-argument wrapper around a collator method.

    ``collator_function`` is called lazily, on the wrapper's first use, and
    the method named ``func_name`` of its result is kept for later calls.

    The wrapper returns whatever that method returns. If the method raises
    TypeError, a bytes argument is decoded with the default encoding and
    retried (undecodable bytes are returned unchanged); any other argument
    yields ``b''``.
    """
    bound = None

    def sort_key(a):
        nonlocal bound
        if bound is None:
            bound = getattr(collator_function(), func_name)
        try:
            return bound(a)
        except TypeError:
            if not isinstance(a, bytes):
                return b''
            try:
                text = a.decode(sys.getdefaultencoding())
            except ValueError:
                return a
            return bound(text)

    return sort_key
def make_two_arg_func(collator_function, func_name='strcmp'):
    """Build a two-argument wrapper around a collator method.

    ``collator_function`` is called lazily, on the wrapper's first use, and
    the method named ``func_name`` of its result is kept for later calls.

    If the method raises TypeError, each argument is coerced in turn (bytes
    are decoded with the default encoding, None becomes ``''``) and the call
    is retried. If decoding fails, the result of ``cmp`` on the arguments as
    they stand at that point is returned instead.
    """
    bound = None

    def _coerce(value):
        # Turn None / bytes into text; leave everything else untouched.
        if value is None:
            return ''
        if isinstance(value, bytes):
            return value.decode(sys.getdefaultencoding())
        return value

    def two_args(a, b):
        nonlocal bound
        if bound is None:
            bound = getattr(collator_function(), func_name)
        try:
            return bound(a, b)
        except TypeError:
            try:
                # Sequential on purpose: if b fails to decode, a has already
                # been replaced by its coerced form for the cmp() fallback.
                a = _coerce(a)
                b = _coerce(b)
            except Exception:
                return cmp(a, b)
            return bound(a, b)

    return two_args
def make_change_case_func(which):
    """Build a function that changes the case of its argument via the ICU extension.

    ``which`` is the case-mode constant passed through to
    ``_icu.change_case``.

    The returned function first tries the conversion with the current
    module locale. If that raises NotImplementedError, collator() is called
    (which sets the locale) and the conversion is retried. On TypeError, a
    bytes argument is decoded with the default encoding and converted;
    undecodable bytes are returned unchanged and any other argument
    re-raises the TypeError.
    """
    def convert(text):
        # _locale is a module global, so its current value is read on every call.
        return _icu.change_case(text, which, _locale)

    def change_case(x):
        try:
            try:
                return convert(x)
            except NotImplementedError:
                pass
            collator()  # sets _locale
            return convert(x)
        except TypeError:
            if not isinstance(x, bytes):
                raise
            try:
                decoded = x.decode(sys.getdefaultencoding())
            except ValueError:
                return x
            return convert(decoded)

    return change_case
# }}}
# ################ The string functions ########################################
# Sort-key functions. Each is a lazy wrapper: the underlying collator is only
# created the first time the function is called.
sort_key = make_sort_key_func(sort_collator)
numeric_sort_key = make_sort_key_func(numeric_collator)
primary_sort_key = make_sort_key_func(primary_collator)
case_sensitive_sort_key = make_sort_key_func(case_sensitive_collator)
# Wrappers around the collators' ``collation_order`` method; the partitioning
# variant uses the collator with numeric collation switched off.
collation_order = make_sort_key_func(sort_collator, 'collation_order')
collation_order_for_partitioning = make_sort_key_func(non_numeric_sort_collator, 'collation_order')
# Two-argument comparison functions (wrapping the collators' ``strcmp``).
strcmp = make_two_arg_func(sort_collator)
case_sensitive_strcmp = make_two_arg_func(case_sensitive_collator)
primary_strcmp = make_two_arg_func(primary_collator)
# Case conversion functions, one per ICU case mode.
upper = make_change_case_func(_icu.UPPER_CASE)
lower = make_change_case_func(_icu.LOWER_CASE)
title_case = make_change_case_func(_icu.TITLE_CASE)
def capitalize(x):
    """Return *x* with its first character upper-cased and the rest lower-cased.

    If *x* cannot be indexed or converted (IndexError, TypeError or
    AttributeError, e.g. for an empty string or None), it is returned
    unchanged.
    """
    try:
        head, tail = x[0], x[1:]
        return upper(head) + lower(tail)
    except (IndexError, TypeError, AttributeError):
        return x
# Use the extension's swap_case when it exists; otherwise (for people running
# from source) fall back to the argument's own swapcase() method.
swapcase = getattr(_icu, 'swap_case', lambda x: x.swapcase())
# Two-argument wrappers around the collators' ``find``, ``contains`` and
# ``startswith`` methods. The plain variants use the base collator, the
# ``primary_`` variants use the collator that ignores case and accents.
find = make_two_arg_func(collator, 'find')
primary_find = make_two_arg_func(primary_collator, 'find')
contains = make_two_arg_func(collator, 'contains')
primary_contains = make_two_arg_func(primary_collator, 'contains')
startswith = make_two_arg_func(collator, 'startswith')
primary_startswith = make_two_arg_func(primary_collator, 'startswith')
# Direct aliases of functions implemented in the ICU extension.
safe_chr = _icu.chr
ord_string = _icu.ord_string
def character_name(string):
    """Return the ICU character name for ``str(string)``.

    Returns None when the extension yields an empty result or raises
    TypeError, ValueError or KeyError.
    """
    try:
        name = _icu.character_name(str(string))
    except (TypeError, ValueError, KeyError):
        return None
    return name or None
def character_name_from_code(code):
    """Return the ICU character name for the code *code*.

    Returns ``''`` when the extension yields an empty result or raises
    TypeError, ValueError or KeyError.
    """
    try:
        name = _icu.character_name_from_code(code)
    except (TypeError, ValueError, KeyError):
        return ''
    return name or ''
def normalize(text, mode='NFC'):
    """Normalize ``str(text)`` with ICU using the form named by *mode*.

    *mode* must be one of the keys of ``_nmodes`` ('NFC', 'NFD', 'NFKC',
    'NFKD'); any other value raises KeyError.
    """
    # This is very slightly slower than using unicodedata.normalize, so stick
    # with that unless you have very good reasons not to. Also, its speed
    # decreases on wide python builds, where conversion to/from ICU's string
    # representation is slower.
    form = _nmodes[mode]
    return _icu.normalize(form, str(text))
def contractions(col=None):
    """Return the non-empty contractions of a collator as a frozenset.

    *col* is the collator to query. When it is omitted (or falsy), the
    already-created base collator is used, and collator() is called to
    create one if none exists yet.

    Results are cached per collator in ``_cmap``, so ``col.contractions()``
    runs only once for each collator object.
    """
    col = col or _collator
    if col is None:
        col = collator()
    # Look the cache up under the same key it is stored under (the collator
    # object), otherwise the cache can never hit.
    ans = _cmap.get(col)
    if ans is None:
        ans = frozenset(filter(None, col.contractions()))
        _cmap[col] = ans
    return ans
def partition_by_first_letter(items, reverse=False, key=lambda x:x):
    """Group *items* by first letter, in collation order.

    The items are sorted with sort_key() applied to ``key(item)`` (reversed
    when *reverse* is true). A new group starts whenever the ordinal that
    collation_order() reports for the upper-cased key differs from the
    previous item's, so first letters that collate as equal share a group.
    An empty or falsy key is treated as a single space.

    Returns an OrderedDict mapping the group's first character to the list
    of items in that group, in sorted order.
    """
    from collections import OrderedDict
    ordered = sorted(items, key=lambda x: sort_key(key(x)), reverse=reverse)
    ans = OrderedDict()
    last_c, last_ordnum = ' ', 0
    for item in ordered:
        # Use this module's own upper() instead of a name that is not
        # defined or imported in this file.
        c = upper(key(item) or ' ')
        ordnum, _ = collation_order(c)
        if last_ordnum != ordnum:
            last_c = c[0:1]
            last_ordnum = ordnum
        ans.setdefault(last_c, []).append(item)
    return ans
# Return the number of unicode codepoints in a string (the builtin len)
string_length = len
# Return the length of a string measured in UTF-16 code units, as computed by
# the ICU extension (NOTE(review): the original comment said "UTF-16
# codepoints"; the exact semantics live in the extension -- confirm).
utf16_length = _icu.utf16_length
################################################################################
if __name__ == '__main__':
    # Running this module directly executes its test-suite verbosely.
    from calibre.utils import icu_test
    icu_test.run(verbosity=4)