Compare commits

...

3 Commits

Author SHA1 Message Date
dirkf
087ddc2371
[compat] Add test for compat_casefold() 2022-11-01 22:47:02 +00:00
dirkf
65ccb0dd4e
[compat] Add test for compat_casefold() 2022-11-01 21:33:39 +00:00
dirkf
a874871801
[compat] Reformat casefold.py for easier updating 2022-11-01 19:25:59 +00:00
2 changed files with 34 additions and 5 deletions

View File

@ -11,6 +11,7 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from youtube_dl.compat import (
compat_casefold,
compat_getenv,
compat_setenv,
compat_etree_Element,
@ -118,9 +119,21 @@ class TestCompat(unittest.TestCase):
<smil xmlns="http://www.w3.org/2001/SMIL20/Language"></smil>'''
compat_etree_fromstring(xml)
def test_struct_unpack(self):
def test_compat_struct_unpack(self):
    # A single zero byte read as a network-order unsigned char gives (0,)
    unpacked = compat_struct_unpack('!B', b'\x00')
    self.assertEqual(unpacked, (0,))
def test_compat_casefold(self):
    # Only exercised when the text type has no native casefold() method
    if hasattr(compat_str, 'casefold'):
        # don't bother to test str.casefold() (again)
        return
    # thanks https://bugs.python.org/file24232/casefolding.patch
    self.assertEqual(compat_casefold('hello'), 'hello')
    self.assertEqual(compat_casefold('hELlo'), 'hello')
    self.assertEqual(compat_casefold('ß'), 'ss')
    # U+FB01 LATIN SMALL LIGATURE FI folds to the two letters 'fi';
    # written as an escape so the character cannot be lost in transit
    self.assertEqual(compat_casefold('\ufb01'), 'fi')
    self.assertEqual(compat_casefold('\u03a3'), '\u03c3')
    self.assertEqual(compat_casefold('A\u0345\u03a3'), 'a\u03b9\u03c3')
if __name__ == '__main__':
unittest.main()

View File

@ -1,8 +1,20 @@
# coding: utf-8
from __future__ import unicode_literals
from .compat import compat_str
from .compat import (
compat_str,
compat_chr,
)
# Below is included the text of icu/CaseFolding.txt retrieved from
# https://github.com/unicode-org/icu/blob/main/icu4c/source/data/unidata/CaseFolding.txt
# In case newly foldable Unicode characters are defined, paste the new version
# of the text inside the ''' marks.
# The text is expected to have only blank lines and lines with 1st character #,
# all ignored, and fold definitions like this:
# `from_hex_code; space_separated_to_hex_code_list; comment`
_map_str = '''
# CaseFolding-15.0.0.txt
# Date: 2022-02-02, 23:35:35 GMT
# © 2022 Unicode®, Inc.
@ -65,7 +77,6 @@ from .compat import compat_str
# have the value C for the status field, and the code point itself for the mapping field.
# =================================================================
_map_str = '''
0041; C; 0061; # LATIN CAPITAL LETTER A
0042; C; 0062; # LATIN CAPITAL LETTER B
0043; C; 0063; # LATIN CAPITAL LETTER C
@ -1627,17 +1638,22 @@ FF3A; C; FF5A; # FULLWIDTH LATIN CAPITAL LETTER Z
1E920; C; 1E942; # ADLAM CAPITAL LETTER KPO
1E921; C; 1E943; # ADLAM CAPITAL LETTER SHA
'''
# Convert a hexadecimal code point string (e.g. '0041') to its character.
_parse_unichr = lambda s: compat_chr(int(s, 16))
# Build the fold table from the definition lines of _map_str: each line is
# split on '; ' into (from, status, to, comment); only entries with status
# 'C' or 'F' are kept, mapping the source character to the concatenation of
# its target characters.
# NOTE(review): the two tuple lines and the two `l.split` lines below look
# like the old and new versions of the same line as shown in a diff view -
# confirm which of each pair applies before using this code.
_map = dict(
(unichr(int(from_, 16)), ''.join((unichr(int(v, 16)) for v in to_.split(' '))))
(_parse_unichr(from_), ''.join(map(_parse_unichr, to_.split(' '))))
for from_, type_, to_, _ in (
l.split('; ', 3) for l in _map_str.splitlines() if l)
l.split('; ', 3) for l in _map_str.splitlines() if l and not l[0] == '#')
if type_ in ('C', 'F'))
# The raw definition text is not needed once the table has been built.
del _map_str
def casefold(s):
    """Return a case-folded copy of the text string *s*.

    Every character with an entry in the module's fold table is replaced by
    its mapping (which may consist of several characters); all other
    characters are kept unchanged.

    *s* must be an instance of ``compat_str``.
    """
    assert isinstance(s, compat_str)
    return ''.join(_map.get(c, c) for c in s)


# __all__ must contain the exported names as strings, not the objects
# themselves, otherwise `from ... import *` raises TypeError
__all__ = [
    'casefold',
]