1 /* ------------------------------------------------------------------------
2
3 unicodedata -- Provides access to the Unicode database.
4
5 The current version number is reported in the unidata_version constant.
6
7 Written by Marc-Andre Lemburg (mal@lemburg.com).
8 Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
9 Modified by Martin v. Löwis (martin@v.loewis.de)
10
11 Copyright (c) Corporation for National Research Initiatives.
12
13 ------------------------------------------------------------------------ */
14
15 #ifndef Py_BUILD_CORE_BUILTIN
16 # define Py_BUILD_CORE_MODULE 1
17 #endif
18
19 #define PY_SSIZE_T_CLEAN
20
21 #include "Python.h"
22 #include "pycore_ucnhash.h" // _PyUnicode_Name_CAPI
23 #include "structmember.h" // PyMemberDef
24
25 #include <stdbool.h>
26
27 /*[clinic input]
28 module unicodedata
29 class unicodedata.UCD 'PreviousDBVersion *' '<not used>'
30 [clinic start generated code]*/
31 /*[clinic end generated code: output=da39a3ee5e6b4b0d input=e47113e05924be43]*/
32
33 /* character properties */
34
35 typedef struct {
36 const unsigned char category; /* index into
37 _PyUnicode_CategoryNames */
38 const unsigned char combining; /* combining class value 0 - 255 */
39 const unsigned char bidirectional; /* index into
40 _PyUnicode_BidirectionalNames */
41 const unsigned char mirrored; /* true if mirrored in bidir mode */
42 const unsigned char east_asian_width; /* index into
43 _PyUnicode_EastAsianWidth */
44 const unsigned char normalization_quick_check; /* see is_normalized() */
45 } _PyUnicode_DatabaseRecord;
46
47 typedef struct change_record {
48 /* sequence of fields should be the same as in merge_old_version */
49 const unsigned char bidir_changed;
50 const unsigned char category_changed;
51 const unsigned char decimal_changed;
52 const unsigned char mirrored_changed;
53 const unsigned char east_asian_width_changed;
54 const double numeric_changed;
55 } change_record;
56
57 /* data file generated by Tools/unicode/makeunicodedata.py */
58 #include "unicodedata_db.h"
59
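/* Character properties are looked up through a two-level table ("trie")
   generated by makeunicodedata.py: index1[] maps the high bits of a code
   point to a block, index2[] maps that block plus the low SHIFT bits to a
   record index, and record 0 is the default record used for unassigned
   code points and for values >= 0x110000. */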
60 static const _PyUnicode_DatabaseRecord*
61 _getrecord_ex(Py_UCS4 code)
62 {
63 int index;
64 if (code >= 0x110000)
65 index = 0;
66 else {
67 index = index1[(code>>SHIFT)];
68 index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
69 }
70
71 return &_PyUnicode_Database_Records[index];
72 }
73
74 /* ------------- Previous-version API ------------------------------------- */
75 typedef struct previous_version {
76 PyObject_HEAD
77 const char *name;
78 const change_record* (*getrecord)(Py_UCS4);
79 Py_UCS4 (*normalization)(Py_UCS4);
80 } PreviousDBVersion;
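/* A PreviousDBVersion instance (such as unicodedata.ucd_3_2_0, created in
   unicodedata_exec() below) exposes the same methods as the module itself,
   but overlays per-character change records via getrecord() and changed
   normalizations via normalization(), so properties are reported as they
   were in that older Unicode version. */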
81
82 #include "clinic/unicodedata.c.h"
83
#define get_old_record(self, v) ((((PreviousDBVersion*)(self))->getrecord)(v))
85
86 static PyMemberDef DB_members[] = {
87 {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
88 {NULL}
89 };
90
// Check if self is a unicodedata.UCD instance.
// If self is NULL (when the PyCapsule C API is used), return 0.
// PyModule_Check() is used to avoid having to retrieve the ucd_type.
// See the unicodedata_functions comment for the rationale of this macro.
#define UCD_Check(self) ((self) != NULL && !PyModule_Check(self))
96
97 static PyObject*
98 new_previous_version(PyTypeObject *ucd_type,
const char *name, const change_record* (*getrecord)(Py_UCS4),
100 Py_UCS4 (*normalization)(Py_UCS4))
101 {
102 PreviousDBVersion *self;
103 self = PyObject_GC_New(PreviousDBVersion, ucd_type);
104 if (self == NULL)
105 return NULL;
106 self->name = name;
107 self->getrecord = getrecord;
108 self->normalization = normalization;
109 PyObject_GC_Track(self);
110 return (PyObject*)self;
111 }
112
113
114 /* --- Module API --------------------------------------------------------- */
115
116 /*[clinic input]
117 unicodedata.UCD.decimal
118
119 self: self
120 chr: int(accept={str})
121 default: object=NULL
122 /
123
124 Converts a Unicode character into its equivalent decimal value.
125
126 Returns the decimal value assigned to the character chr as integer.
127 If no such value is defined, default is returned, or, if not given,
128 ValueError is raised.
129 [clinic start generated code]*/
130
131 static PyObject *
132 unicodedata_UCD_decimal_impl(PyObject *self, int chr,
133 PyObject *default_value)
134 /*[clinic end generated code: output=be23376e1a185231 input=933f8107993f23d0]*/
135 {
136 int have_old = 0;
137 long rc;
138 Py_UCS4 c = (Py_UCS4)chr;
139
140 if (UCD_Check(self)) {
141 const change_record *old = get_old_record(self, c);
142 if (old->category_changed == 0) {
143 /* unassigned */
144 have_old = 1;
145 rc = -1;
146 }
147 else if (old->decimal_changed != 0xFF) {
148 have_old = 1;
149 rc = old->decimal_changed;
150 }
151 }
152
153 if (!have_old)
154 rc = Py_UNICODE_TODECIMAL(c);
155 if (rc < 0) {
156 if (default_value == NULL) {
157 PyErr_SetString(PyExc_ValueError,
158 "not a decimal");
159 return NULL;
160 }
161 else {
162 return Py_NewRef(default_value);
163 }
164 }
165 return PyLong_FromLong(rc);
166 }
167
168 /*[clinic input]
169 unicodedata.UCD.digit
170
171 self: self
172 chr: int(accept={str})
173 default: object=NULL
174 /
175
176 Converts a Unicode character into its equivalent digit value.
177
178 Returns the digit value assigned to the character chr as integer.
179 If no such value is defined, default is returned, or, if not given,
180 ValueError is raised.
181 [clinic start generated code]*/
182
183 static PyObject *
184 unicodedata_UCD_digit_impl(PyObject *self, int chr, PyObject *default_value)
185 /*[clinic end generated code: output=96e18c950171fd2f input=e27d6e4565cd29f2]*/
186 {
187 long rc;
188 Py_UCS4 c = (Py_UCS4)chr;
189 rc = Py_UNICODE_TODIGIT(c);
190 if (rc < 0) {
191 if (default_value == NULL) {
192 PyErr_SetString(PyExc_ValueError, "not a digit");
193 return NULL;
194 }
195 else {
196 return Py_NewRef(default_value);
197 }
198 }
199 return PyLong_FromLong(rc);
200 }
201
202 /*[clinic input]
203 unicodedata.UCD.numeric
204
205 self: self
206 chr: int(accept={str})
207 default: object=NULL
208 /
209
210 Converts a Unicode character into its equivalent numeric value.
211
212 Returns the numeric value assigned to the character chr as float.
213 If no such value is defined, default is returned, or, if not given,
214 ValueError is raised.
215 [clinic start generated code]*/
216
217 static PyObject *
218 unicodedata_UCD_numeric_impl(PyObject *self, int chr,
219 PyObject *default_value)
220 /*[clinic end generated code: output=53ce281fe85b10c4 input=fdf5871a5542893c]*/
221 {
222 int have_old = 0;
223 double rc;
224 Py_UCS4 c = (Py_UCS4)chr;
225
226 if (UCD_Check(self)) {
227 const change_record *old = get_old_record(self, c);
228 if (old->category_changed == 0) {
229 /* unassigned */
230 have_old = 1;
231 rc = -1.0;
232 }
233 else if (old->decimal_changed != 0xFF) {
234 have_old = 1;
235 rc = old->decimal_changed;
236 }
237 }
238
239 if (!have_old)
240 rc = Py_UNICODE_TONUMERIC(c);
241 if (rc == -1.0) {
242 if (default_value == NULL) {
243 PyErr_SetString(PyExc_ValueError, "not a numeric character");
244 return NULL;
245 }
246 else {
247 return Py_NewRef(default_value);
248 }
249 }
250 return PyFloat_FromDouble(rc);
251 }
252
253 /*[clinic input]
254 unicodedata.UCD.category
255
256 self: self
257 chr: int(accept={str})
258 /
259
260 Returns the general category assigned to the character chr as string.
261 [clinic start generated code]*/
262
263 static PyObject *
264 unicodedata_UCD_category_impl(PyObject *self, int chr)
265 /*[clinic end generated code: output=8571539ee2e6783a input=27d6f3d85050bc06]*/
266 {
267 int index;
268 Py_UCS4 c = (Py_UCS4)chr;
269 index = (int) _getrecord_ex(c)->category;
270 if (UCD_Check(self)) {
271 const change_record *old = get_old_record(self, c);
272 if (old->category_changed != 0xFF)
273 index = old->category_changed;
274 }
275 return PyUnicode_FromString(_PyUnicode_CategoryNames[index]);
276 }
277
278 /*[clinic input]
279 unicodedata.UCD.bidirectional
280
281 self: self
282 chr: int(accept={str})
283 /
284
285 Returns the bidirectional class assigned to the character chr as string.
286
287 If no such value is defined, an empty string is returned.
288 [clinic start generated code]*/
289
290 static PyObject *
291 unicodedata_UCD_bidirectional_impl(PyObject *self, int chr)
292 /*[clinic end generated code: output=d36310ce2039bb92 input=b3d8f42cebfcf475]*/
293 {
294 int index;
295 Py_UCS4 c = (Py_UCS4)chr;
296 index = (int) _getrecord_ex(c)->bidirectional;
297 if (UCD_Check(self)) {
298 const change_record *old = get_old_record(self, c);
299 if (old->category_changed == 0)
300 index = 0; /* unassigned */
301 else if (old->bidir_changed != 0xFF)
302 index = old->bidir_changed;
303 }
304 return PyUnicode_FromString(_PyUnicode_BidirectionalNames[index]);
305 }
306
307 /*[clinic input]
308 unicodedata.UCD.combining -> int
309
310 self: self
311 chr: int(accept={str})
312 /
313
314 Returns the canonical combining class assigned to the character chr as integer.
315
316 Returns 0 if no combining class is defined.
317 [clinic start generated code]*/
318
319 static int
320 unicodedata_UCD_combining_impl(PyObject *self, int chr)
321 /*[clinic end generated code: output=cad056d0cb6a5920 input=9f2d6b2a95d0a22a]*/
322 {
323 int index;
324 Py_UCS4 c = (Py_UCS4)chr;
325 index = (int) _getrecord_ex(c)->combining;
326 if (UCD_Check(self)) {
327 const change_record *old = get_old_record(self, c);
328 if (old->category_changed == 0)
329 index = 0; /* unassigned */
330 }
331 return index;
332 }
333
334 /*[clinic input]
335 unicodedata.UCD.mirrored -> int
336
337 self: self
338 chr: int(accept={str})
339 /
340
341 Returns the mirrored property assigned to the character chr as integer.
342
343 Returns 1 if the character has been identified as a "mirrored"
344 character in bidirectional text, 0 otherwise.
345 [clinic start generated code]*/
346
347 static int
348 unicodedata_UCD_mirrored_impl(PyObject *self, int chr)
349 /*[clinic end generated code: output=2532dbf8121b50e6 input=5dd400d351ae6f3b]*/
350 {
351 int index;
352 Py_UCS4 c = (Py_UCS4)chr;
353 index = (int) _getrecord_ex(c)->mirrored;
354 if (UCD_Check(self)) {
355 const change_record *old = get_old_record(self, c);
356 if (old->category_changed == 0)
357 index = 0; /* unassigned */
358 else if (old->mirrored_changed != 0xFF)
359 index = old->mirrored_changed;
360 }
361 return index;
362 }
363
364 /*[clinic input]
365 unicodedata.UCD.east_asian_width
366
367 self: self
368 chr: int(accept={str})
369 /
370
371 Returns the east asian width assigned to the character chr as string.
372 [clinic start generated code]*/
373
374 static PyObject *
375 unicodedata_UCD_east_asian_width_impl(PyObject *self, int chr)
376 /*[clinic end generated code: output=484e8537d9ee8197 input=c4854798aab026e0]*/
377 {
378 int index;
379 Py_UCS4 c = (Py_UCS4)chr;
380 index = (int) _getrecord_ex(c)->east_asian_width;
381 if (UCD_Check(self)) {
382 const change_record *old = get_old_record(self, c);
383 if (old->category_changed == 0)
384 index = 0; /* unassigned */
385 else if (old->east_asian_width_changed != 0xFF)
386 index = old->east_asian_width_changed;
387 }
388 return PyUnicode_FromString(_PyUnicode_EastAsianWidthNames[index]);
389 }
390
391 /*[clinic input]
392 unicodedata.UCD.decomposition
393
394 self: self
395 chr: int(accept={str})
396 /
397
398 Returns the character decomposition mapping assigned to the character chr as string.
399
400 An empty string is returned in case no such mapping is defined.
401 [clinic start generated code]*/
402
403 static PyObject *
404 unicodedata_UCD_decomposition_impl(PyObject *self, int chr)
405 /*[clinic end generated code: output=7d699f3ec7565d27 input=e4c12459ad68507b]*/
406 {
407 char decomp[256];
408 int code, index, count;
409 size_t i;
410 unsigned int prefix_index;
411 Py_UCS4 c = (Py_UCS4)chr;
412
413 code = (int)c;
414
415 if (UCD_Check(self)) {
416 const change_record *old = get_old_record(self, c);
417 if (old->category_changed == 0)
418 return PyUnicode_FromString(""); /* unassigned */
419 }
420
421 if (code < 0 || code >= 0x110000)
422 index = 0;
423 else {
424 index = decomp_index1[(code>>DECOMP_SHIFT)];
425 index = decomp_index2[(index<<DECOMP_SHIFT)+
426 (code&((1<<DECOMP_SHIFT)-1))];
427 }
428
/* High byte is the number of code points in the decomposition (usually
   one or two); low byte is an index into decomp_prefix, the decomposition
   type tag (empty for canonical decompositions). */
431 count = decomp_data[index] >> 8;
432
/* XXX: could allocate the result string up front instead
   (strlen(prefix) + 5 * count + 1 bytes) */
435
436 /* Based on how index is calculated above and decomp_data is generated
437 from Tools/unicode/makeunicodedata.py, it should not be possible
438 to overflow decomp_prefix. */
439 prefix_index = decomp_data[index] & 255;
440 assert(prefix_index < Py_ARRAY_LENGTH(decomp_prefix));
441
442 /* copy prefix */
443 i = strlen(decomp_prefix[prefix_index]);
444 memcpy(decomp, decomp_prefix[prefix_index], i);
445
446 while (count-- > 0) {
447 if (i)
448 decomp[i++] = ' ';
449 assert(i < sizeof(decomp));
450 PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
451 decomp_data[++index]);
452 i += strlen(decomp + i);
453 }
454 return PyUnicode_FromStringAndSize(decomp, i);
455 }
456
457 static void
458 get_decomp_record(PyObject *self, Py_UCS4 code,
459 int *index, int *prefix, int *count)
460 {
461 if (code >= 0x110000) {
462 *index = 0;
463 }
464 else if (UCD_Check(self)
465 && get_old_record(self, code)->category_changed==0) {
466 /* unassigned in old version */
467 *index = 0;
468 }
469 else {
470 *index = decomp_index1[(code>>DECOMP_SHIFT)];
471 *index = decomp_index2[(*index<<DECOMP_SHIFT)+
472 (code&((1<<DECOMP_SHIFT)-1))];
473 }
474
/* High byte is the number of code points in the decomposition (usually
   one or two); low byte is an index into decomp_prefix, the decomposition
   type tag (empty for canonical decompositions). */
477 *count = decomp_data[*index] >> 8;
478 *prefix = decomp_data[*index] & 255;
479
480 (*index)++;
481 }
482
483 #define SBase 0xAC00
484 #define LBase 0x1100
485 #define VBase 0x1161
486 #define TBase 0x11A7
487 #define LCount 19
488 #define VCount 21
489 #define TCount 28
490 #define NCount (VCount*TCount)
491 #define SCount (LCount*NCount)
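/* Hangul syllables (U+AC00..U+D7A3) are not stored in the decomposition
   tables; they decompose and compose arithmetically, following the
   conjoining jamo algorithm of the Unicode Standard:

       S = SBase + (LIndex * VCount + VIndex) * TCount + TIndex

   For example U+AC01 (HANGUL SYLLABLE GAG) has SIndex = 1, which yields
   LIndex = 0 (U+1100), VIndex = 0 (U+1161) and TIndex = 1 (U+11A8). */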
492
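/* Compute NFD (k == 0) or NFKD (k != 0) of 'input'.  Code points are
   pushed on a small stack and replaced by their decompositions until only
   undecomposable characters remain; compatibility decompositions (those
   with a prefix tag) are applied only when k is true.  The buffer is then
   put into canonical combining-class order. */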
493 static PyObject*
494 nfd_nfkd(PyObject *self, PyObject *input, int k)
495 {
496 PyObject *result;
497 Py_UCS4 *output;
498 Py_ssize_t i, o, osize;
499 int kind;
500 const void *data;
501 /* Longest decomposition in Unicode 3.2: U+FDFA */
502 Py_UCS4 stack[20];
503 Py_ssize_t space, isize;
504 int index, prefix, count, stackptr;
505 unsigned char prev, cur;
506
507 stackptr = 0;
508 isize = PyUnicode_GET_LENGTH(input);
509 space = isize;
510 /* Overallocate at most 10 characters. */
511 if (space > 10) {
512 if (space <= PY_SSIZE_T_MAX - 10)
513 space += 10;
514 }
515 else {
516 space *= 2;
517 }
518 osize = space;
519 output = PyMem_NEW(Py_UCS4, space);
520 if (!output) {
521 PyErr_NoMemory();
522 return NULL;
523 }
524 i = o = 0;
525 kind = PyUnicode_KIND(input);
526 data = PyUnicode_DATA(input);
527
528 while (i < isize) {
529 stack[stackptr++] = PyUnicode_READ(kind, data, i++);
530 while(stackptr) {
531 Py_UCS4 code = stack[--stackptr];
532 /* Hangul Decomposition adds three characters in
533 a single step, so we need at least that much room. */
534 if (space < 3) {
535 Py_UCS4 *new_output;
536 osize += 10;
537 space += 10;
538 new_output = PyMem_Realloc(output, osize*sizeof(Py_UCS4));
539 if (new_output == NULL) {
540 PyMem_Free(output);
541 PyErr_NoMemory();
542 return NULL;
543 }
544 output = new_output;
545 }
546 /* Hangul Decomposition. */
547 if (SBase <= code && code < (SBase+SCount)) {
548 int SIndex = code - SBase;
549 int L = LBase + SIndex / NCount;
550 int V = VBase + (SIndex % NCount) / TCount;
551 int T = TBase + SIndex % TCount;
552 output[o++] = L;
553 output[o++] = V;
554 space -= 2;
555 if (T != TBase) {
556 output[o++] = T;
557 space --;
558 }
559 continue;
560 }
561 /* normalization changes */
562 if (UCD_Check(self)) {
563 Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
564 if (value != 0) {
565 stack[stackptr++] = value;
566 continue;
567 }
568 }
569
570 /* Other decompositions. */
571 get_decomp_record(self, code, &index, &prefix, &count);
572
573 /* Copy character if it is not decomposable, or has a
574 compatibility decomposition, but we do NFD. */
575 if (!count || (prefix && !k)) {
576 output[o++] = code;
577 space--;
578 continue;
579 }
580 /* Copy decomposition onto the stack, in reverse
581 order. */
582 while(count) {
583 code = decomp_data[index + (--count)];
584 stack[stackptr++] = code;
585 }
586 }
587 }
588
589 result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
590 output, o);
591 PyMem_Free(output);
592 if (!result)
593 return NULL;
594 /* result is guaranteed to be ready, as it is compact. */
595 kind = PyUnicode_KIND(result);
596 data = PyUnicode_DATA(result);
597
598 /* Sort canonically. */
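/* Canonical ordering: within every run of characters whose combining
   class is non-zero, bubble characters backwards until the classes are in
   non-decreasing order.  Starters (class 0) are never moved and act as
   barriers. */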
599 i = 0;
600 prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
601 for (i++; i < PyUnicode_GET_LENGTH(result); i++) {
602 cur = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
603 if (prev == 0 || cur == 0 || prev <= cur) {
604 prev = cur;
605 continue;
606 }
/* Non-canonical order.  Need to swap the character at index i with the
   previous one. */
608 o = i - 1;
609 while (1) {
610 Py_UCS4 tmp = PyUnicode_READ(kind, data, o+1);
611 PyUnicode_WRITE(kind, data, o+1,
612 PyUnicode_READ(kind, data, o));
613 PyUnicode_WRITE(kind, data, o, tmp);
614 o--;
615 if (o < 0)
616 break;
617 prev = _getrecord_ex(PyUnicode_READ(kind, data, o))->combining;
618 if (prev == 0 || prev <= cur)
619 break;
620 }
621 prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
622 }
623 return result;
624 }
625
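/* nfc_first/nfc_last (from unicodedata_db.h) are sorted arrays of
   {start, count, index} ranges mapping the code points that can occur as
   the first resp. last character of a canonical composition onto small
   dense indices.  Returns -1 if 'code' can never take part in a
   composition in that position. */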
626 static int
627 find_nfc_index(const struct reindex* nfc, Py_UCS4 code)
628 {
629 unsigned int index;
630 for (index = 0; nfc[index].start; index++) {
631 unsigned int start = nfc[index].start;
632 if (code < start)
633 return -1;
634 if (code <= start + nfc[index].count) {
635 unsigned int delta = code - start;
636 return nfc[index].index + delta;
637 }
638 }
639 return -1;
640 }
641
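/* Compute NFC (k == 0) or NFKC (k != 0) of 'input': first decompose with
   nfd_nfkd(), then recombine each starter with following characters using
   the canonical composition table, treating Hangul syllables
   arithmetically.  Characters consumed by a composition are remembered in
   'skipped' and dropped when the main loop reaches them. */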
642 static PyObject*
643 nfc_nfkc(PyObject *self, PyObject *input, int k)
644 {
645 PyObject *result;
646 int kind;
647 const void *data;
648 Py_UCS4 *output;
649 Py_ssize_t i, i1, o, len;
650 int f,l,index,index1,comb;
651 Py_UCS4 code;
652 Py_ssize_t skipped[20];
653 int cskipped = 0;
654
655 result = nfd_nfkd(self, input, k);
656 if (!result)
657 return NULL;
658 /* result will be "ready". */
659 kind = PyUnicode_KIND(result);
660 data = PyUnicode_DATA(result);
661 len = PyUnicode_GET_LENGTH(result);
662
663 /* We allocate a buffer for the output.
664 If we find that we made no changes, we still return
665 the NFD result. */
666 output = PyMem_NEW(Py_UCS4, len);
667 if (!output) {
668 PyErr_NoMemory();
669 Py_DECREF(result);
670 return 0;
671 }
672 i = o = 0;
673
674 again:
675 while (i < len) {
676 for (index = 0; index < cskipped; index++) {
677 if (skipped[index] == i) {
/* The character at index i was already consumed by an earlier
   composition; drop it from the skip list and move on. */
680 skipped[index] = skipped[cskipped-1];
681 cskipped--;
682 i++;
683 goto again; /* continue while */
684 }
685 }
686 /* Hangul Composition. We don't need to check for <LV,T>
687 pairs, since we always have decomposed data. */
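/* For example, the decomposed sequence <U+1100, U+1161, U+11A8>
   recombines arithmetically into U+AC01 (HANGUL SYLLABLE GAG). */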
688 code = PyUnicode_READ(kind, data, i);
689 if (LBase <= code && code < (LBase+LCount) &&
690 i + 1 < len &&
691 VBase <= PyUnicode_READ(kind, data, i+1) &&
692 PyUnicode_READ(kind, data, i+1) < (VBase+VCount)) {
693 /* check L character is a modern leading consonant (0x1100 ~ 0x1112)
694 and V character is a modern vowel (0x1161 ~ 0x1175). */
695 int LIndex, VIndex;
696 LIndex = code - LBase;
697 VIndex = PyUnicode_READ(kind, data, i+1) - VBase;
698 code = SBase + (LIndex*VCount+VIndex)*TCount;
699 i+=2;
700 if (i < len &&
701 TBase < PyUnicode_READ(kind, data, i) &&
702 PyUnicode_READ(kind, data, i) < (TBase+TCount)) {
703 /* check T character is a modern trailing consonant
704 (0x11A8 ~ 0x11C2). */
705 code += PyUnicode_READ(kind, data, i)-TBase;
706 i++;
707 }
708 output[o++] = code;
709 continue;
710 }
711
712 /* code is still input[i] here */
713 f = find_nfc_index(nfc_first, code);
714 if (f == -1) {
715 output[o++] = code;
716 i++;
717 continue;
718 }
719 /* Find next unblocked character. */
720 i1 = i+1;
721 comb = 0;
722 /* output base character for now; might be updated later. */
723 output[o] = PyUnicode_READ(kind, data, i);
724 while (i1 < len) {
725 Py_UCS4 code1 = PyUnicode_READ(kind, data, i1);
726 int comb1 = _getrecord_ex(code1)->combining;
727 if (comb) {
728 if (comb1 == 0)
729 break;
730 if (comb >= comb1) {
731 /* Character is blocked. */
732 i1++;
733 continue;
734 }
735 }
736 l = find_nfc_index(nfc_last, code1);
737 /* i1 cannot be combined with i. If i1
738 is a starter, we don't need to look further.
739 Otherwise, record the combining class. */
740 if (l == -1) {
741 not_combinable:
742 if (comb1 == 0)
743 break;
744 comb = comb1;
745 i1++;
746 continue;
747 }
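/* (first, last) indexes a conceptual composition matrix with row length
   TOTAL_LAST, stored compressed as the two-level comp_index/comp_data
   tables; an entry of 0 means the pair does not compose. */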
748 index = f*TOTAL_LAST + l;
749 index1 = comp_index[index >> COMP_SHIFT];
750 code = comp_data[(index1<<COMP_SHIFT)+
751 (index&((1<<COMP_SHIFT)-1))];
752 if (code == 0)
753 goto not_combinable;
754
755 /* Replace the original character. */
756 output[o] = code;
757 /* Mark the second character unused. */
758 assert(cskipped < 20);
759 skipped[cskipped++] = i1;
760 i1++;
761 f = find_nfc_index(nfc_first, output[o]);
762 if (f == -1)
763 break;
764 }
765 /* Output character was already written.
766 Just advance the indices. */
767 o++; i++;
768 }
769 if (o == len) {
770 /* No changes. Return original string. */
771 PyMem_Free(output);
772 return result;
773 }
774 Py_DECREF(result);
775 result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
776 output, o);
777 PyMem_Free(output);
778 return result;
779 }
780
781 // This needs to match the logic in makeunicodedata.py
782 // which constructs the quickcheck data.
783 typedef enum {YES = 0, MAYBE = 1, NO = 2} QuickcheckResult;
784
785 /* Run the Unicode normalization "quickcheck" algorithm.
786 *
787 * Return YES or NO if quickcheck determines the input is certainly
788 * normalized or certainly not, and MAYBE if quickcheck is unable to
789 * tell.
790 *
791 * If `yes_only` is true, then return MAYBE as soon as we determine
792 * the answer is not YES.
793 *
794 * For background and details on the algorithm, see UAX #15:
795 * https://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms
796 */
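/* Each database record packs the quickcheck results for all four forms
   into one byte, two bits per form: bits 0-1 NFD, 2-3 NFKD, 4-5 NFC,
   6-7 NFKC (hence quickcheck_shift below).  Each 2-bit field holds a
   QuickcheckResult value. */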
797 static QuickcheckResult
798 is_normalized_quickcheck(PyObject *self, PyObject *input, bool nfc, bool k,
799 bool yes_only)
800 {
/* When UCD 3.2.0 is requested, the quickcheck data (generated for the
   current Unicode version) cannot be used, so always answer MAYBE. */
802 if (UCD_Check(self)) {
803 return MAYBE;
804 }
805
806 if (PyUnicode_IS_ASCII(input)) {
807 return YES;
808 }
809
810 Py_ssize_t i, len;
811 int kind;
812 const void *data;
813 unsigned char prev_combining = 0;
814
815 /* The two quickcheck bits at this shift have type QuickcheckResult. */
816 int quickcheck_shift = (nfc ? 4 : 0) + (k ? 2 : 0);
817
818 QuickcheckResult result = YES; /* certainly normalized, unless we find something */
819
820 i = 0;
821 kind = PyUnicode_KIND(input);
822 data = PyUnicode_DATA(input);
823 len = PyUnicode_GET_LENGTH(input);
824 while (i < len) {
825 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
826 const _PyUnicode_DatabaseRecord *record = _getrecord_ex(ch);
827
828 unsigned char combining = record->combining;
829 if (combining && prev_combining > combining)
830 return NO; /* non-canonical sort order, not normalized */
831 prev_combining = combining;
832
833 unsigned char quickcheck_whole = record->normalization_quick_check;
834 if (yes_only) {
835 if (quickcheck_whole & (3 << quickcheck_shift))
836 return MAYBE;
837 } else {
838 switch ((quickcheck_whole >> quickcheck_shift) & 3) {
839 case NO:
840 return NO;
841 case MAYBE:
842 result = MAYBE; /* this string might need normalization */
843 }
844 }
845 }
846 return result;
847 }
848
849 /*[clinic input]
850 unicodedata.UCD.is_normalized
851
852 self: self
853 form: unicode
854 unistr as input: unicode
855 /
856
857 Return whether the Unicode string unistr is in the normal form 'form'.
858
859 Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.
860 [clinic start generated code]*/
861
862 static PyObject *
863 unicodedata_UCD_is_normalized_impl(PyObject *self, PyObject *form,
864 PyObject *input)
865 /*[clinic end generated code: output=11e5a3694e723ca5 input=a544f14cea79e508]*/
866 {
867 if (PyUnicode_READY(input) == -1) {
868 return NULL;
869 }
870
871 if (PyUnicode_GET_LENGTH(input) == 0) {
872 /* special case empty input strings. */
873 Py_RETURN_TRUE;
874 }
875
876 PyObject *result;
877 bool nfc = false;
878 bool k = false;
879 QuickcheckResult m;
880
881 PyObject *cmp;
882 int match = 0;
883
884 if (PyUnicode_CompareWithASCIIString(form, "NFC") == 0) {
885 nfc = true;
886 }
887 else if (PyUnicode_CompareWithASCIIString(form, "NFKC") == 0) {
888 nfc = true;
889 k = true;
890 }
891 else if (PyUnicode_CompareWithASCIIString(form, "NFD") == 0) {
892 /* matches default values for `nfc` and `k` */
893 }
894 else if (PyUnicode_CompareWithASCIIString(form, "NFKD") == 0) {
895 k = true;
896 }
897 else {
898 PyErr_SetString(PyExc_ValueError, "invalid normalization form");
899 return NULL;
900 }
901
902 m = is_normalized_quickcheck(self, input, nfc, k, false);
903
904 if (m == MAYBE) {
905 cmp = (nfc ? nfc_nfkc : nfd_nfkd)(self, input, k);
906 if (cmp == NULL) {
907 return NULL;
908 }
909 match = PyUnicode_Compare(input, cmp);
910 Py_DECREF(cmp);
911 result = (match == 0) ? Py_True : Py_False;
912 }
913 else {
914 result = (m == YES) ? Py_True : Py_False;
915 }
916
917 return Py_NewRef(result);
918 }
919
920
921 /*[clinic input]
922 unicodedata.UCD.normalize
923
924 self: self
925 form: unicode
926 unistr as input: unicode
927 /
928
929 Return the normal form 'form' for the Unicode string unistr.
930
931 Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.
932 [clinic start generated code]*/
933
934 static PyObject *
935 unicodedata_UCD_normalize_impl(PyObject *self, PyObject *form,
936 PyObject *input)
937 /*[clinic end generated code: output=05ca4385a2ad6983 input=3a5206c0ad2833fb]*/
938 {
939 if (PyUnicode_GET_LENGTH(input) == 0) {
940 /* Special case empty input strings, since resizing
941 them later would cause internal errors. */
942 return Py_NewRef(input);
943 }
944
945 if (PyUnicode_CompareWithASCIIString(form, "NFC") == 0) {
946 if (is_normalized_quickcheck(self, input,
947 true, false, true) == YES) {
948 return Py_NewRef(input);
949 }
950 return nfc_nfkc(self, input, 0);
951 }
952 if (PyUnicode_CompareWithASCIIString(form, "NFKC") == 0) {
953 if (is_normalized_quickcheck(self, input,
954 true, true, true) == YES) {
955 return Py_NewRef(input);
956 }
957 return nfc_nfkc(self, input, 1);
958 }
959 if (PyUnicode_CompareWithASCIIString(form, "NFD") == 0) {
960 if (is_normalized_quickcheck(self, input,
961 false, false, true) == YES) {
962 return Py_NewRef(input);
963 }
964 return nfd_nfkd(self, input, 0);
965 }
966 if (PyUnicode_CompareWithASCIIString(form, "NFKD") == 0) {
967 if (is_normalized_quickcheck(self, input,
968 false, true, true) == YES) {
969 return Py_NewRef(input);
970 }
971 return nfd_nfkd(self, input, 1);
972 }
973 PyErr_SetString(PyExc_ValueError, "invalid normalization form");
974 return NULL;
975 }
976
977 /* -------------------------------------------------------------------- */
978 /* unicode character name tables */
979
980 /* data file generated by Tools/unicode/makeunicodedata.py */
981 #include "unicodename_db.h"
982
983 /* -------------------------------------------------------------------- */
984 /* database code (cut and pasted from the unidb package) */
985
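/* Case-insensitive multiplicative string hash used by the name lookup:
   characters are upper-cased before hashing, and whenever the value grows
   beyond 24 bits the top byte is folded back in with XOR so the hash
   stays within 24 bits. */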
986 static unsigned long
987 _gethash(const char *s, int len, int scale)
988 {
989 int i;
990 unsigned long h = 0;
991 unsigned long ix;
992 for (i = 0; i < len; i++) {
993 h = (h * scale) + (unsigned char) Py_TOUPPER(s[i]);
994 ix = h & 0xff000000;
995 if (ix)
996 h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff;
997 }
998 return h;
999 }
1000
1001 static const char * const hangul_syllables[][3] = {
1002 { "G", "A", "" },
1003 { "GG", "AE", "G" },
1004 { "N", "YA", "GG" },
1005 { "D", "YAE", "GS" },
{ "DD", "EO", "N" },
1007 { "R", "E", "NJ" },
1008 { "M", "YEO", "NH" },
1009 { "B", "YE", "D" },
1010 { "BB", "O", "L" },
1011 { "S", "WA", "LG" },
1012 { "SS", "WAE", "LM" },
1013 { "", "OE", "LB" },
1014 { "J", "YO", "LS" },
1015 { "JJ", "U", "LT" },
1016 { "C", "WEO", "LP" },
1017 { "K", "WE", "LH" },
1018 { "T", "WI", "M" },
1019 { "P", "YU", "B" },
1020 { "H", "EU", "BS" },
1021 { 0, "YI", "S" },
1022 { 0, "I", "SS" },
1023 { 0, 0, "NG" },
1024 { 0, 0, "J" },
1025 { 0, 0, "C" },
1026 { 0, 0, "K" },
1027 { 0, 0, "T" },
1028 { 0, 0, "P" },
1029 { 0, 0, "H" }
1030 };
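/* The columns above hold the romanized jamo spellings used to build and
   parse "HANGUL SYLLABLE ..." names: column 0 the 19 leading consonants,
   column 1 the 21 vowels, column 2 the 28 trailing consonants (first
   entry empty: no trailing consonant). */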
1031
1032 /* These ranges need to match makeunicodedata.py:cjk_ranges. */
1033 static int
1034 is_unified_ideograph(Py_UCS4 code)
1035 {
1036 return
1037 (0x3400 <= code && code <= 0x4DBF) || /* CJK Ideograph Extension A */
1038 (0x4E00 <= code && code <= 0x9FFF) || /* CJK Ideograph */
1039 (0x20000 <= code && code <= 0x2A6DF) || /* CJK Ideograph Extension B */
1040 (0x2A700 <= code && code <= 0x2B739) || /* CJK Ideograph Extension C */
1041 (0x2B740 <= code && code <= 0x2B81D) || /* CJK Ideograph Extension D */
1042 (0x2B820 <= code && code <= 0x2CEA1) || /* CJK Ideograph Extension E */
1043 (0x2CEB0 <= code && code <= 0x2EBE0) || /* CJK Ideograph Extension F */
1044 (0x30000 <= code && code <= 0x3134A) || /* CJK Ideograph Extension G */
1045 (0x31350 <= code && code <= 0x323AF); /* CJK Ideograph Extension H */
1046 }
1047
1048 /* macros used to determine if the given code point is in the PUA range that
1049 * we are using to store aliases and named sequences */
#define IS_ALIAS(cp) (((cp) >= aliases_start) && ((cp) < aliases_end))
#define IS_NAMED_SEQ(cp) (((cp) >= named_sequences_start) && \
                          ((cp) < named_sequences_end))
1053
1054 static int
1055 _getucname(PyObject *self,
1056 Py_UCS4 code, char* buffer, int buflen, int with_alias_and_seq)
1057 {
1058 /* Find the name associated with the given code point.
1059 * If with_alias_and_seq is 1, check for names in the Private Use Area 15
1060 * that we are using for aliases and named sequences. */
1061 int offset;
1062 int i;
1063 int word;
1064 const unsigned char* w;
1065
1066 if (code >= 0x110000)
1067 return 0;
1068
1069 /* XXX should we just skip all the code points in the PUAs here? */
1070 if (!with_alias_and_seq && (IS_ALIAS(code) || IS_NAMED_SEQ(code)))
1071 return 0;
1072
1073 if (UCD_Check(self)) {
1074 /* in 3.2.0 there are no aliases and named sequences */
1075 const change_record *old;
1076 if (IS_ALIAS(code) || IS_NAMED_SEQ(code))
1077 return 0;
1078 old = get_old_record(self, code);
1079 if (old->category_changed == 0) {
1080 /* unassigned */
1081 return 0;
1082 }
1083 }
1084
1085 if (SBase <= code && code < SBase+SCount) {
1086 /* Hangul syllable. */
1087 int SIndex = code - SBase;
1088 int L = SIndex / NCount;
1089 int V = (SIndex % NCount) / TCount;
1090 int T = SIndex % TCount;
1091
1092 if (buflen < 27)
1093 /* Worst case: HANGUL SYLLABLE <10chars>. */
1094 return 0;
1095 strcpy(buffer, "HANGUL SYLLABLE ");
1096 buffer += 16;
1097 strcpy(buffer, hangul_syllables[L][0]);
1098 buffer += strlen(hangul_syllables[L][0]);
1099 strcpy(buffer, hangul_syllables[V][1]);
1100 buffer += strlen(hangul_syllables[V][1]);
1101 strcpy(buffer, hangul_syllables[T][2]);
1102 buffer += strlen(hangul_syllables[T][2]);
1103 *buffer = '\0';
1104 return 1;
1105 }
1106
1107 if (is_unified_ideograph(code)) {
1108 if (buflen < 28)
1109 /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
1110 return 0;
1111 sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
1112 return 1;
1113 }
1114
1115 /* get offset into phrasebook */
1116 offset = phrasebook_offset1[(code>>phrasebook_shift)];
1117 offset = phrasebook_offset2[(offset<<phrasebook_shift) +
1118 (code&((1<<phrasebook_shift)-1))];
1119 if (!offset)
1120 return 0;
1121
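/* Decode the name from the phrasebook: it is a sequence of word indices
   into the shared lexicon.  A byte below phrasebook_short is a one-byte
   index; otherwise the byte and the following byte form a two-byte index.
   Each lexicon word is terminated by a byte with bit 7 set. */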
1122 i = 0;
1123
1124 for (;;) {
1125 /* get word index */
1126 word = phrasebook[offset] - phrasebook_short;
1127 if (word >= 0) {
1128 word = (word << 8) + phrasebook[offset+1];
1129 offset += 2;
1130 } else
1131 word = phrasebook[offset++];
1132 if (i) {
1133 if (i > buflen)
1134 return 0; /* buffer overflow */
1135 buffer[i++] = ' ';
1136 }
1137 /* copy word string from lexicon. the last character in the
1138 word has bit 7 set. the last word in a string ends with
1139 0x80 */
1140 w = lexicon + lexicon_offset[word];
1141 while (*w < 128) {
1142 if (i >= buflen)
1143 return 0; /* buffer overflow */
1144 buffer[i++] = *w++;
1145 }
1146 if (i >= buflen)
1147 return 0; /* buffer overflow */
1148 buffer[i++] = *w & 127;
1149 if (*w == 128)
1150 break; /* end of word */
1151 }
1152
1153 return 1;
1154 }
1155
1156 static int
1157 capi_getucname(Py_UCS4 code,
1158 char* buffer, int buflen,
1159 int with_alias_and_seq)
1160 {
1161 return _getucname(NULL, code, buffer, buflen, with_alias_and_seq);
1162
1163 }
1164
1165 static int
1166 _cmpname(PyObject *self, int code, const char* name, int namelen)
1167 {
1168 /* check if code corresponds to the given name */
1169 int i;
1170 char buffer[NAME_MAXLEN+1];
1171 if (!_getucname(self, code, buffer, NAME_MAXLEN, 1))
1172 return 0;
1173 for (i = 0; i < namelen; i++) {
1174 if (Py_TOUPPER(name[i]) != buffer[i])
1175 return 0;
1176 }
1177 return buffer[namelen] == '\0';
1178 }
1179
1180 static void
1181 find_syllable(const char *str, int *len, int *pos, int count, int column)
1182 {
1183 int i, len1;
1184 *len = -1;
1185 for (i = 0; i < count; i++) {
1186 const char *s = hangul_syllables[i][column];
1187 len1 = Py_SAFE_DOWNCAST(strlen(s), size_t, int);
1188 if (len1 <= *len)
1189 continue;
1190 if (strncmp(str, s, len1) == 0) {
1191 *len = len1;
1192 *pos = i;
1193 }
1194 }
1195 if (*len == -1) {
1196 *len = 0;
1197 }
1198 }
1199
1200 static int
1201 _check_alias_and_seq(unsigned int cp, Py_UCS4* code, int with_named_seq)
1202 {
1203 /* check if named sequences are allowed */
1204 if (!with_named_seq && IS_NAMED_SEQ(cp))
1205 return 0;
1206 /* if the code point is in the PUA range that we use for aliases,
1207 * convert it to obtain the right code point */
1208 if (IS_ALIAS(cp))
1209 *code = name_aliases[cp-aliases_start];
1210 else
1211 *code = cp;
1212 return 1;
1213 }
1214
1215 static int
1216 _getcode(PyObject* self,
1217 const char* name, int namelen, Py_UCS4* code, int with_named_seq)
1218 {
1219 /* Return the code point associated with the given name.
1220 * Named aliases are resolved too (unless self != NULL (i.e. we are using
1221 * 3.2.0)). If with_named_seq is 1, returns the PUA code point that we are
1222 * using for the named sequence, and the caller must then convert it. */
1223 unsigned int h, v;
1224 unsigned int mask = code_size-1;
1225 unsigned int i, incr;
1226
1227 /* Check for hangul syllables. */
1228 if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
1229 int len, L = -1, V = -1, T = -1;
1230 const char *pos = name + 16;
1231 find_syllable(pos, &len, &L, LCount, 0);
1232 pos += len;
1233 find_syllable(pos, &len, &V, VCount, 1);
1234 pos += len;
1235 find_syllable(pos, &len, &T, TCount, 2);
1236 pos += len;
1237 if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
1238 *code = SBase + (L*VCount+V)*TCount + T;
1239 return 1;
1240 }
1241 /* Otherwise, it's an illegal syllable name. */
1242 return 0;
1243 }
1244
1245 /* Check for unified ideographs. */
1246 if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
1247 /* Four or five hexdigits must follow. */
1248 v = 0;
1249 name += 22;
1250 namelen -= 22;
1251 if (namelen != 4 && namelen != 5)
1252 return 0;
1253 while (namelen--) {
1254 v *= 16;
1255 if (*name >= '0' && *name <= '9')
1256 v += *name - '0';
1257 else if (*name >= 'A' && *name <= 'F')
1258 v += *name - 'A' + 10;
1259 else
1260 return 0;
1261 name++;
1262 }
1263 if (!is_unified_ideograph(v))
1264 return 0;
1265 *code = v;
1266 return 1;
1267 }
1268
/* The following is the same as Python's dictionary lookup, with only
   minor changes.  See the makeunicodedata script for more details. */
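/* Probing: the initial slot is (~h) & mask; on a collision the probe
   advances by an increment derived from the hash, which is doubled after
   every step and folded back with the generator polynomial code_poly once
   it outgrows the table (a Galois LFSR-style step). */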
1272
1273 h = (unsigned int) _gethash(name, namelen, code_magic);
1274 i = (~h) & mask;
1275 v = code_hash[i];
1276 if (!v)
1277 return 0;
1278 if (_cmpname(self, v, name, namelen)) {
1279 return _check_alias_and_seq(v, code, with_named_seq);
1280 }
1281 incr = (h ^ (h >> 3)) & mask;
1282 if (!incr)
1283 incr = mask;
1284 for (;;) {
1285 i = (i + incr) & mask;
1286 v = code_hash[i];
1287 if (!v)
1288 return 0;
1289 if (_cmpname(self, v, name, namelen)) {
1290 return _check_alias_and_seq(v, code, with_named_seq);
1291 }
1292 incr = incr << 1;
1293 if (incr > mask)
1294 incr = incr ^ code_poly;
1295 }
1296 }
1297
1298 static int
1299 capi_getcode(const char* name, int namelen, Py_UCS4* code,
1300 int with_named_seq)
1301 {
1302 return _getcode(NULL, name, namelen, code, with_named_seq);
1303
1304 }
1305
1306 static void
1307 unicodedata_destroy_capi(PyObject *capsule)
1308 {
1309 void *capi = PyCapsule_GetPointer(capsule, PyUnicodeData_CAPSULE_NAME);
1310 PyMem_Free(capi);
1311 }
1312
1313 static PyObject *
1314 unicodedata_create_capi(void)
1315 {
1316 _PyUnicode_Name_CAPI *capi = PyMem_Malloc(sizeof(_PyUnicode_Name_CAPI));
1317 if (capi == NULL) {
1318 PyErr_NoMemory();
1319 return NULL;
1320 }
1321 capi->getname = capi_getucname;
1322 capi->getcode = capi_getcode;
1323
1324 PyObject *capsule = PyCapsule_New(capi,
1325 PyUnicodeData_CAPSULE_NAME,
1326 unicodedata_destroy_capi);
1327 if (capsule == NULL) {
1328 PyMem_Free(capi);
1329 }
1330 return capsule;
}
1332
1333
1334 /* -------------------------------------------------------------------- */
1335 /* Python bindings */
1336
1337 /*[clinic input]
1338 unicodedata.UCD.name
1339
1340 self: self
1341 chr: int(accept={str})
1342 default: object=NULL
1343 /
1344
1345 Returns the name assigned to the character chr as a string.
1346
1347 If no name is defined, default is returned, or, if not given,
1348 ValueError is raised.
1349 [clinic start generated code]*/
1350
1351 static PyObject *
1352 unicodedata_UCD_name_impl(PyObject *self, int chr, PyObject *default_value)
1353 /*[clinic end generated code: output=6bbb37a326407707 input=3e0367f534de56d9]*/
1354 {
1355 char name[NAME_MAXLEN+1];
1356 Py_UCS4 c = (Py_UCS4)chr;
1357
1358 if (!_getucname(self, c, name, NAME_MAXLEN, 0)) {
1359 if (default_value == NULL) {
1360 PyErr_SetString(PyExc_ValueError, "no such name");
1361 return NULL;
1362 }
1363 else {
1364 return Py_NewRef(default_value);
1365 }
1366 }
1367
1368 return PyUnicode_FromString(name);
1369 }
1370
1371 /*[clinic input]
1372 unicodedata.UCD.lookup
1373
1374 self: self
1375 name: str(accept={str, robuffer}, zeroes=True)
1376 /
1377
1378 Look up character by name.
1379
1380 If a character with the given name is found, return the
1381 corresponding character. If not found, KeyError is raised.
1382 [clinic start generated code]*/
1383
1384 static PyObject *
1385 unicodedata_UCD_lookup_impl(PyObject *self, const char *name,
1386 Py_ssize_t name_length)
1387 /*[clinic end generated code: output=7f03fc4959b242f6 input=a557be0f8607a0d6]*/
1388 {
1389 Py_UCS4 code;
1390 unsigned int index;
1391 if (name_length > NAME_MAXLEN) {
1392 PyErr_SetString(PyExc_KeyError, "name too long");
1393 return NULL;
1394 }
1395
1396 if (!_getcode(self, name, (int)name_length, &code, 1)) {
1397 PyErr_Format(PyExc_KeyError, "undefined character name '%s'", name);
1398 return NULL;
1399 }
1400 /* check if code is in the PUA range that we use for named sequences
1401 and convert it */
1402 if (IS_NAMED_SEQ(code)) {
1403 index = code-named_sequences_start;
1404 return PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND,
1405 named_sequences[index].seq,
1406 named_sequences[index].seqlen);
1407 }
1408 return PyUnicode_FromOrdinal(code);
1409 }
1410
1411 // List of functions used to define module functions *AND* unicodedata.UCD
1412 // methods. For module functions, self is the module. For UCD methods, self
1413 // is an UCD instance. The UCD_Check() macro is used to check if self is
1414 // an UCD instance.
1415 static PyMethodDef unicodedata_functions[] = {
1416 UNICODEDATA_UCD_DECIMAL_METHODDEF
1417 UNICODEDATA_UCD_DIGIT_METHODDEF
1418 UNICODEDATA_UCD_NUMERIC_METHODDEF
1419 UNICODEDATA_UCD_CATEGORY_METHODDEF
1420 UNICODEDATA_UCD_BIDIRECTIONAL_METHODDEF
1421 UNICODEDATA_UCD_COMBINING_METHODDEF
1422 UNICODEDATA_UCD_MIRRORED_METHODDEF
1423 UNICODEDATA_UCD_EAST_ASIAN_WIDTH_METHODDEF
1424 UNICODEDATA_UCD_DECOMPOSITION_METHODDEF
1425 UNICODEDATA_UCD_NAME_METHODDEF
1426 UNICODEDATA_UCD_LOOKUP_METHODDEF
1427 UNICODEDATA_UCD_IS_NORMALIZED_METHODDEF
1428 UNICODEDATA_UCD_NORMALIZE_METHODDEF
1429 {NULL, NULL} /* sentinel */
1430 };
1431
1432 static int
1433 ucd_traverse(PreviousDBVersion *self, visitproc visit, void *arg)
1434 {
1435 Py_VISIT(Py_TYPE(self));
1436 return 0;
1437 }
1438
1439 static void
1440 ucd_dealloc(PreviousDBVersion *self)
1441 {
1442 PyTypeObject *tp = Py_TYPE(self);
1443 PyObject_GC_UnTrack(self);
1444 PyObject_GC_Del(self);
1445 Py_DECREF(tp);
1446 }
1447
1448 static PyType_Slot ucd_type_slots[] = {
1449 {Py_tp_dealloc, ucd_dealloc},
1450 {Py_tp_traverse, ucd_traverse},
1451 {Py_tp_getattro, PyObject_GenericGetAttr},
1452 {Py_tp_methods, unicodedata_functions},
1453 {Py_tp_members, DB_members},
1454 {0, 0}
1455 };
1456
1457 static PyType_Spec ucd_type_spec = {
1458 .name = "unicodedata.UCD",
1459 .basicsize = sizeof(PreviousDBVersion),
1460 .flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_DISALLOW_INSTANTIATION |
1461 Py_TPFLAGS_HAVE_GC | Py_TPFLAGS_IMMUTABLETYPE),
1462 .slots = ucd_type_slots
1463 };
1464
1465 PyDoc_STRVAR(unicodedata_docstring,
1466 "This module provides access to the Unicode Character Database which\n\
1467 defines character properties for all Unicode characters. The data in\n\
1468 this database is based on the UnicodeData.txt file version\n\
1469 " UNIDATA_VERSION " which is publicly available from ftp://ftp.unicode.org/.\n\
1470 \n\
1471 The module uses the same names and symbols as defined by the\n\
1472 UnicodeData File Format " UNIDATA_VERSION ".");
1473
1474 static int
1475 unicodedata_exec(PyObject *module)
1476 {
1477 if (PyModule_AddStringConstant(module, "unidata_version", UNIDATA_VERSION) < 0) {
1478 return -1;
1479 }
1480
1481 PyTypeObject *ucd_type = (PyTypeObject *)PyType_FromSpec(&ucd_type_spec);
1482 if (ucd_type == NULL) {
1483 return -1;
1484 }
1485
1486 if (PyModule_AddType(module, ucd_type) < 0) {
1487 Py_DECREF(ucd_type);
1488 return -1;
1489 }
1490
1491 // Unicode database version 3.2.0 used by the IDNA encoding
1492 PyObject *v;
1493 v = new_previous_version(ucd_type, "3.2.0",
1494 get_change_3_2_0, normalization_3_2_0);
1495 Py_DECREF(ucd_type);
1496 if (v == NULL) {
1497 return -1;
1498 }
1499 if (PyModule_AddObject(module, "ucd_3_2_0", v) < 0) {
1500 Py_DECREF(v);
1501 return -1;
1502 }
1503
1504 /* Export C API */
1505 PyObject *capsule = unicodedata_create_capi();
1506 if (capsule == NULL) {
1507 return -1;
1508 }
1509 int rc = PyModule_AddObjectRef(module, "_ucnhash_CAPI", capsule);
1510 Py_DECREF(capsule);
1511 if (rc < 0) {
1512 return -1;
1513 }
1514 return 0;
1515 }
1516
1517 static PyModuleDef_Slot unicodedata_slots[] = {
1518 {Py_mod_exec, unicodedata_exec},
1519 {Py_mod_multiple_interpreters, Py_MOD_PER_INTERPRETER_GIL_SUPPORTED},
1520 {0, NULL}
1521 };
1522
1523 static struct PyModuleDef unicodedata_module = {
1524 PyModuleDef_HEAD_INIT,
1525 .m_name = "unicodedata",
1526 .m_doc = unicodedata_docstring,
1527 .m_size = 0,
1528 .m_methods = unicodedata_functions,
1529 .m_slots = unicodedata_slots,
1530 };
1531
1532 PyMODINIT_FUNC
1533 PyInit_unicodedata(void)
1534 {
1535 return PyModuleDef_Init(&unicodedata_module);
1536 }
1537
1538
1539 /*
1540 Local variables:
1541 c-basic-offset: 4
1542 indent-tabs-mode: nil
1543 End:
1544 */