python (3.12.0)
1 # unicode.py
2
3 import sys
4 from itertools import filterfalse
5 from typing import List, Tuple, Union
6
7
class _lazyclassproperty:
    """
    Descriptor that computes a class-level value once and caches it per class.

    The wrapped function is called with the owning class as its only
    argument the first time the attribute is read on that class. The result
    is stored in the class's ``_intern`` dict under the function's name and
    returned directly on every later access.
    """

    def __init__(self, fn):
        self.fn = fn
        # mirror the wrapped function's metadata so the descriptor documents itself
        self.__doc__ = fn.__doc__
        self.__name__ = fn.__name__

    def __get__(self, obj, cls=None):
        """
        Return the cached value for ``cls``, computing it on first access.

        ``cls`` is optional, as in the descriptor protocol; when it is omitted
        (or None) the class is taken from ``type(obj)``.
        """
        if cls is None:
            cls = type(obj)
        # Every class needs its own cache: if `_intern` is missing, or is the
        # very same dict object a superclass exposes (i.e. it was only
        # inherited), start a fresh dict so values computed for a parent are
        # never served for a subclass.
        if not hasattr(cls, "_intern") or any(
            cls._intern is getattr(superclass, "_intern", [])
            for superclass in cls.__mro__[1:]
        ):
            cls._intern = {}
        attrname = self.fn.__name__
        if attrname not in cls._intern:
            cls._intern[attrname] = self.fn(cls)
        return cls._intern[attrname]
26
27
# A list of code-point ranges: each entry is either a 2-tuple (first, last)
# or a 1-tuple (single,) of integer code points.
UnicodeRangeList = List[Union[Tuple[int, int], Tuple[int]]]
29
30
class unicode_set:
    """
    A set of Unicode characters, for language-specific strings for
    ``alphas``, ``nums``, ``alphanums``, and ``printables``.
    A unicode_set is defined by a list of ranges in the Unicode character
    set, in a class attribute ``_ranges``. Ranges can be specified using
    2-tuples or a 1-tuple, such as::

        _ranges = [
            (0x0020, 0x007e),
            (0x00a0, 0x00ff),
            (0x0100,),
        ]

    Ranges are left- and right-inclusive. A 1-tuple of (x,) is treated as (x, x).

    A unicode set can also be defined using multiple inheritance of other unicode sets::

        class CJK(Chinese, Japanese, Korean):
            pass
    """

    # code-point ranges contributed by this class; subclasses override it
    _ranges: UnicodeRangeList = []

    @_lazyclassproperty
    def _chars_for_ranges(cls) -> List[str]:
        """
        sorted list of the distinct characters covered by the ``_ranges`` of
        this class and of every class ahead of ``unicode_set`` in its MRO
        """
        code_points = set()
        for cc in cls.__mro__:
            # unicode_set itself (and anything after it in the MRO) adds nothing
            if cc is unicode_set:
                break
            for rr in getattr(cc, "_ranges", ()):
                # rr[-1] is the upper bound of a 2-tuple and the single
                # value of a 1-tuple, so both shapes are inclusive ranges
                code_points.update(range(rr[0], rr[-1] + 1))
        return [chr(c) for c in sorted(code_points)]

    @_lazyclassproperty
    def printables(cls) -> str:
        """all non-whitespace characters in this range"""
        return "".join(filterfalse(str.isspace, cls._chars_for_ranges))

    @_lazyclassproperty
    def alphas(cls) -> str:
        """all alphabetic characters in this range"""
        return "".join(filter(str.isalpha, cls._chars_for_ranges))

    @_lazyclassproperty
    def nums(cls) -> str:
        """all numeric digit characters in this range"""
        return "".join(filter(str.isdigit, cls._chars_for_ranges))

    @_lazyclassproperty
    def alphanums(cls) -> str:
        """all alphanumeric characters in this range (``alphas`` followed by ``nums``)"""
        return cls.alphas + cls.nums

    @_lazyclassproperty
    def identchars(cls) -> str:
        """
        all characters in this range that are valid identifier characters, plus
        underscore '_'; the ASCII and Latin-1 letters listed below are always
        included, whatever the range
        """
        always_included = set(
            "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzªµº"
            "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ"
            "_"
        )
        in_range = set(filter(str.isidentifier, cls._chars_for_ranges))
        return "".join(sorted(in_range | always_included))

    @_lazyclassproperty
    def identbodychars(cls) -> str:
        """
        all characters in this range that are valid identifier body characters,
        plus the digits 0-9, and · (Unicode MIDDLE DOT)
        """
        # a character is a valid body character if it forms an identifier
        # when placed after a leading underscore
        body_in_range = {
            c for c in cls._chars_for_ranges if ("_" + c).isidentifier()
        }
        return "".join(
            sorted(set(cls.identchars) | set("0123456789·") | body_in_range)
        )

    @_lazyclassproperty
    def identifier(cls):
        """
        a pyparsing Word expression for an identifier using this range's definitions for
        identchars and identbodychars
        """
        # imported here rather than at module level, as in the original
        from pip._vendor.pyparsing import Word

        return Word(cls.identchars, cls.identbodychars)
126
127
class pyparsing_unicode(unicode_set):
    """
    A namespace class for defining common language unicode_sets.

    Each nested class is a ``unicode_set`` whose ``_ranges`` lists the
    inclusive code-point ranges for that script; this class itself spans
    0x0020 through ``sys.maxunicode``.
    """

    # fmt: off

    # define ranges in language character sets
    _ranges: UnicodeRangeList = [
        (0x0020, sys.maxunicode),
    ]

    class BasicMultilingualPlane(unicode_set):
        """Unicode set for the Basic Multilingual Plane"""
        _ranges: UnicodeRangeList = [
            (0x0020, 0xFFFF),
        ]

    class Latin1(unicode_set):
        """Unicode set for Latin-1 Unicode Character Range"""
        _ranges: UnicodeRangeList = [
            (0x0020, 0x007E),
            (0x00A0, 0x00FF),
        ]

    class LatinA(unicode_set):
        """Unicode set for Latin-A Unicode Character Range"""
        _ranges: UnicodeRangeList = [
            (0x0100, 0x017F),
        ]

    class LatinB(unicode_set):
        """Unicode set for Latin-B Unicode Character Range"""
        _ranges: UnicodeRangeList = [
            (0x0180, 0x024F),
        ]

    class Greek(unicode_set):
        """Unicode set for Greek Unicode Character Ranges"""
        _ranges: UnicodeRangeList = [
            (0x0342, 0x0345),
            (0x0370, 0x0377),
            (0x037A, 0x037F),
            (0x0384, 0x038A),
            (0x038C,),
            (0x038E, 0x03A1),
            (0x03A3, 0x03E1),
            (0x03F0, 0x03FF),
            (0x1D26, 0x1D2A),
            (0x1D5E,),
            (0x1D60,),
            (0x1D66, 0x1D6A),
            (0x1F00, 0x1F15),
            (0x1F18, 0x1F1D),
            (0x1F20, 0x1F45),
            (0x1F48, 0x1F4D),
            (0x1F50, 0x1F57),
            (0x1F59,),
            (0x1F5B,),
            (0x1F5D,),
            (0x1F5F, 0x1F7D),
            (0x1F80, 0x1FB4),
            (0x1FB6, 0x1FC4),
            (0x1FC6, 0x1FD3),
            (0x1FD6, 0x1FDB),
            (0x1FDD, 0x1FEF),
            (0x1FF2, 0x1FF4),
            (0x1FF6, 0x1FFE),
            (0x2129,),
            (0x2719, 0x271A),
            (0xAB65,),
            (0x10140, 0x1018D),
            (0x101A0,),
            (0x1D200, 0x1D245),
            (0x1F7A1, 0x1F7A7),
        ]

    class Cyrillic(unicode_set):
        """Unicode set for Cyrillic Unicode Character Range"""
        _ranges: UnicodeRangeList = [
            (0x0400, 0x052F),
            (0x1C80, 0x1C88),
            (0x1D2B,),
            (0x1D78,),
            (0x2DE0, 0x2DFF),
            (0xA640, 0xA672),
            (0xA674, 0xA69F),
            (0xFE2E, 0xFE2F),
        ]

    class Chinese(unicode_set):
        """Unicode set for Chinese Unicode Character Range"""
        _ranges: UnicodeRangeList = [
            (0x2E80, 0x2E99),
            (0x2E9B, 0x2EF3),
            (0x31C0, 0x31E3),
            (0x3400, 0x4DB5),
            (0x4E00, 0x9FEF),
            (0xA700, 0xA707),
            (0xF900, 0xFA6D),
            (0xFA70, 0xFAD9),
            (0x16FE2, 0x16FE3),
            (0x1F210, 0x1F212),
            (0x1F214, 0x1F23B),
            (0x1F240, 0x1F248),
            (0x20000, 0x2A6D6),
            (0x2A700, 0x2B734),
            (0x2B740, 0x2B81D),
            (0x2B820, 0x2CEA1),
            (0x2CEB0, 0x2EBE0),
            (0x2F800, 0x2FA1D),
        ]

    class Japanese(unicode_set):
        """Unicode set for Japanese Unicode Character Range, combining Kanji, Hiragana, and Katakana ranges"""

        class Kanji(unicode_set):
            """Unicode set for Kanji Unicode Character Range"""
            _ranges: UnicodeRangeList = [
                (0x4E00, 0x9FBF),
                (0x3000, 0x303F),
            ]

        class Hiragana(unicode_set):
            """Unicode set for Hiragana Unicode Character Range"""
            _ranges: UnicodeRangeList = [
                (0x3041, 0x3096),
                (0x3099, 0x30A0),
                (0x30FC,),
                (0xFF70,),
                (0x1B001,),
                (0x1B150, 0x1B152),
                (0x1F200,),
            ]

        class Katakana(unicode_set):
            """Unicode set for Katakana Unicode Character Range"""
            _ranges: UnicodeRangeList = [
                (0x3099, 0x309C),
                (0x30A0, 0x30FF),
                (0x31F0, 0x31FF),
                (0x32D0, 0x32FE),
                (0xFF65, 0xFF9F),
                (0x1B000,),
                (0x1B164, 0x1B167),
                (0x1F201, 0x1F202),
                (0x1F213,),
            ]

        # native-script aliases for the nested sets
        漢字 = Kanji
        カタカナ = Katakana
        ひらがな = Hiragana

        # the Japanese set is the concatenation of its three nested sets' ranges
        _ranges: UnicodeRangeList = (
            Kanji._ranges
            + Hiragana._ranges
            + Katakana._ranges
        )

    class Hangul(unicode_set):
        """Unicode set for Hangul (Korean) Unicode Character Range"""
        _ranges: UnicodeRangeList = [
            (0x1100, 0x11FF),
            (0x302E, 0x302F),
            (0x3131, 0x318E),
            (0x3200, 0x321C),
            (0x3260, 0x327B),
            (0x327E,),
            (0xA960, 0xA97C),
            (0xAC00, 0xD7A3),
            (0xD7B0, 0xD7C6),
            (0xD7CB, 0xD7FB),
            (0xFFA0, 0xFFBE),
            (0xFFC2, 0xFFC7),
            (0xFFCA, 0xFFCF),
            (0xFFD2, 0xFFD7),
            (0xFFDA, 0xFFDC),
        ]

    Korean = Hangul

    class CJK(Chinese, Japanese, Hangul):
        """Unicode set for combined Chinese, Japanese, and Korean (CJK) Unicode Character Range"""

    class Thai(unicode_set):
        """Unicode set for Thai Unicode Character Range"""
        _ranges: UnicodeRangeList = [
            (0x0E01, 0x0E3A),
            (0x0E3F, 0x0E5B),
        ]

    class Arabic(unicode_set):
        """Unicode set for Arabic Unicode Character Range"""
        _ranges: UnicodeRangeList = [
            (0x0600, 0x061B),
            (0x061E, 0x06FF),
            (0x0700, 0x077F),
        ]

    class Hebrew(unicode_set):
        """Unicode set for Hebrew Unicode Character Range"""
        _ranges: UnicodeRangeList = [
            (0x0591, 0x05C7),
            (0x05D0, 0x05EA),
            (0x05EF, 0x05F4),
            (0xFB1D, 0xFB36),
            (0xFB38, 0xFB3C),
            (0xFB3E,),
            (0xFB40, 0xFB41),
            (0xFB43, 0xFB44),
            (0xFB46, 0xFB4F),
        ]

    class Devanagari(unicode_set):
        """Unicode set for Devanagari Unicode Character Range"""
        _ranges: UnicodeRangeList = [
            (0x0900, 0x097F),
            (0xA8E0, 0xA8FF),
        ]

    BMP = BasicMultilingualPlane

    # add language identifiers using language Unicode
    العربية = Arabic
    中文 = Chinese
    кириллица = Cyrillic
    Ελληνικά = Greek
    עִברִית = Hebrew
    日本語 = Japanese
    한국어 = Korean
    ไทย = Thai
    देवनागरी = Devanagari

    # fmt: on