1 /* ------------------------------------------------------------------------
2
3 unicodedata -- Provides access to the Unicode database.
4
5 The current version number is reported in the unidata_version constant.
6
7 Written by Marc-Andre Lemburg (mal@lemburg.com).
8 Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
9 Modified by Martin v. Löwis (martin@v.loewis.de)
10
11 Copyright (c) Corporation for National Research Initiatives.
12
13 ------------------------------------------------------------------------ */
14
15 #ifndef Py_BUILD_CORE_BUILTIN
16 # define Py_BUILD_CORE_MODULE 1
17 #endif
18
19 #define PY_SSIZE_T_CLEAN
20
21 #include "Python.h"
22 #include "pycore_ucnhash.h" // _PyUnicode_Name_CAPI
23 #include "structmember.h" // PyMemberDef
24
25 #include <stdbool.h>
26
27 /*[clinic input]
28 module unicodedata
29 class unicodedata.UCD 'PreviousDBVersion *' '<not used>'
30 [clinic start generated code]*/
31 /*[clinic end generated code: output=da39a3ee5e6b4b0d input=e47113e05924be43]*/
32
33 /* character properties */
34
35 typedef struct {
36 const unsigned char category; /* index into
37 _PyUnicode_CategoryNames */
38 const unsigned char combining; /* combining class value 0 - 255 */
39 const unsigned char bidirectional; /* index into
40 _PyUnicode_BidirectionalNames */
41 const unsigned char mirrored; /* true if mirrored in bidir mode */
42 const unsigned char east_asian_width; /* index into
43 _PyUnicode_EastAsianWidth */
44 const unsigned char normalization_quick_check; /* see is_normalized() */
45 } _PyUnicode_DatabaseRecord;
46
47 typedef struct change_record {
48 /* sequence of fields should be the same as in merge_old_version */
49 const unsigned char bidir_changed;
50 const unsigned char category_changed;
51 const unsigned char decimal_changed;
52 const unsigned char mirrored_changed;
53 const unsigned char east_asian_width_changed;
54 const double numeric_changed;
55 } change_record;
56
57 /* data file generated by Tools/unicode/makeunicodedata.py */
58 #include "unicodedata_db.h"
59
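/* Character properties are looked up through a two-level table ("trie")
   generated by makeunicodedata.py: index1[] maps the high bits of a code
   point to a block, index2[] maps that block plus the low SHIFT bits to a
   record index, and record 0 is the default record used for unassigned
   code points and for values >= 0x110000. */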
60 static const _PyUnicode_DatabaseRecord*
61 _getrecord_ex(Py_UCS4 code)
62 {
63 int index;
64 if (code >= 0x110000)
65 index = 0;
66 else {
67 index = index1[(code>>SHIFT)];
68 index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
69 }
70
71 return &_PyUnicode_Database_Records[index];
72 }
73
74 /* ------------- Previous-version API ------------------------------------- */
75 typedef struct previous_version {
76 PyObject_HEAD
77 const char *name;
78 const change_record* (*getrecord)(Py_UCS4);
79 Py_UCS4 (*normalization)(Py_UCS4);
80 } PreviousDBVersion;
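/* A PreviousDBVersion instance (such as unicodedata.ucd_3_2_0, created in
   unicodedata_exec() below) exposes the same methods as the module itself,
   but overlays per-character change records via getrecord() and changed
   normalizations via normalization(), so properties are reported as they
   were in that older Unicode version. */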
81
82 #include "clinic/unicodedata.c.h"
83
#define get_old_record(self, v) ((((PreviousDBVersion*)(self))->getrecord)(v))
85
86 static PyMemberDef DB_members[] = {
87 {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
88 {NULL}
89 };
90
// Check if self is a unicodedata.UCD instance.
// If self is NULL (when the PyCapsule C API is used), return 0.
// PyModule_Check() is used to avoid having to retrieve the ucd_type.
// See the unicodedata_functions comment for the rationale of this macro.
#define UCD_Check(self) ((self) != NULL && !PyModule_Check(self))
96
97 static PyObject*
98 new_previous_version(PyTypeObject *ucd_type,
const char *name, const change_record* (*getrecord)(Py_UCS4),
100 Py_UCS4 (*normalization)(Py_UCS4))
101 {
102 PreviousDBVersion *self;
103 self = PyObject_GC_New(PreviousDBVersion, ucd_type);
104 if (self == NULL)
105 return NULL;
106 self->name = name;
107 self->getrecord = getrecord;
108 self->normalization = normalization;
109 PyObject_GC_Track(self);
110 return (PyObject*)self;
111 }
112
113
114 /* --- Module API --------------------------------------------------------- */
115
116 /*[clinic input]
117 unicodedata.UCD.decimal
118
119 self: self
120 chr: int(accept={str})
121 default: object=NULL
122 /
123
124 Converts a Unicode character into its equivalent decimal value.
125
126 Returns the decimal value assigned to the character chr as integer.
127 If no such value is defined, default is returned, or, if not given,
128 ValueError is raised.
129 [clinic start generated code]*/
130
131 static PyObject *
132 unicodedata_UCD_decimal_impl(PyObject *self, int chr,
133 PyObject *default_value)
134 /*[clinic end generated code: output=be23376e1a185231 input=933f8107993f23d0]*/
135 {
136 int have_old = 0;
137 long rc;
138 Py_UCS4 c = (Py_UCS4)chr;
139
140 if (UCD_Check(self)) {
141 const change_record *old = get_old_record(self, c);
142 if (old->category_changed == 0) {
143 /* unassigned */
144 have_old = 1;
145 rc = -1;
146 }
147 else if (old->decimal_changed != 0xFF) {
148 have_old = 1;
149 rc = old->decimal_changed;
150 }
151 }
152
153 if (!have_old)
154 rc = Py_UNICODE_TODECIMAL(c);
155 if (rc < 0) {
156 if (default_value == NULL) {
157 PyErr_SetString(PyExc_ValueError,
158 "not a decimal");
159 return NULL;
160 }
161 else {
162 return Py_NewRef(default_value);
163 }
164 }
165 return PyLong_FromLong(rc);
166 }
167
168 /*[clinic input]
169 unicodedata.UCD.digit
170
171 self: self
172 chr: int(accept={str})
173 default: object=NULL
174 /
175
176 Converts a Unicode character into its equivalent digit value.
177
178 Returns the digit value assigned to the character chr as integer.
179 If no such value is defined, default is returned, or, if not given,
180 ValueError is raised.
181 [clinic start generated code]*/
182
183 static PyObject *
184 unicodedata_UCD_digit_impl(PyObject *self, int chr, PyObject *default_value)
185 /*[clinic end generated code: output=96e18c950171fd2f input=e27d6e4565cd29f2]*/
186 {
187 long rc;
188 Py_UCS4 c = (Py_UCS4)chr;
189 rc = Py_UNICODE_TODIGIT(c);
190 if (rc < 0) {
191 if (default_value == NULL) {
192 PyErr_SetString(PyExc_ValueError, "not a digit");
193 return NULL;
194 }
195 else {
196 return Py_NewRef(default_value);
197 }
198 }
199 return PyLong_FromLong(rc);
200 }
201
202 /*[clinic input]
203 unicodedata.UCD.numeric
204
205 self: self
206 chr: int(accept={str})
207 default: object=NULL
208 /
209
210 Converts a Unicode character into its equivalent numeric value.
211
212 Returns the numeric value assigned to the character chr as float.
213 If no such value is defined, default is returned, or, if not given,
214 ValueError is raised.
215 [clinic start generated code]*/
216
217 static PyObject *
218 unicodedata_UCD_numeric_impl(PyObject *self, int chr,
219 PyObject *default_value)
220 /*[clinic end generated code: output=53ce281fe85b10c4 input=fdf5871a5542893c]*/
221 {
222 int have_old = 0;
223 double rc;
224 Py_UCS4 c = (Py_UCS4)chr;
225
226 if (UCD_Check(self)) {
227 const change_record *old = get_old_record(self, c);
228 if (old->category_changed == 0) {
229 /* unassigned */
230 have_old = 1;
231 rc = -1.0;
232 }
233 else if (old->decimal_changed != 0xFF) {
234 have_old = 1;
235 rc = old->decimal_changed;
236 }
237 }
238
239 if (!have_old)
240 rc = Py_UNICODE_TONUMERIC(c);
241 if (rc == -1.0) {
242 if (default_value == NULL) {
243 PyErr_SetString(PyExc_ValueError, "not a numeric character");
244 return NULL;
245 }
246 else {
247 return Py_NewRef(default_value);
248 }
249 }
250 return PyFloat_FromDouble(rc);
251 }
252
253 /*[clinic input]
254 unicodedata.UCD.category
255
256 self: self
257 chr: int(accept={str})
258 /
259
260 Returns the general category assigned to the character chr as string.
261 [clinic start generated code]*/
262
263 static PyObject *
264 unicodedata_UCD_category_impl(PyObject *self, int chr)
265 /*[clinic end generated code: output=8571539ee2e6783a input=27d6f3d85050bc06]*/
266 {
267 int index;
268 Py_UCS4 c = (Py_UCS4)chr;
269 index = (int) _getrecord_ex(c)->category;
270 if (UCD_Check(self)) {
271 const change_record *old = get_old_record(self, c);
272 if (old->category_changed != 0xFF)
273 index = old->category_changed;
274 }
275 return PyUnicode_FromString(_PyUnicode_CategoryNames[index]);
276 }
277
278 /*[clinic input]
279 unicodedata.UCD.bidirectional
280
281 self: self
282 chr: int(accept={str})
283 /
284
285 Returns the bidirectional class assigned to the character chr as string.
286
287 If no such value is defined, an empty string is returned.
288 [clinic start generated code]*/
289
290 static PyObject *
291 unicodedata_UCD_bidirectional_impl(PyObject *self, int chr)
292 /*[clinic end generated code: output=d36310ce2039bb92 input=b3d8f42cebfcf475]*/
293 {
294 int index;
295 Py_UCS4 c = (Py_UCS4)chr;
296 index = (int) _getrecord_ex(c)->bidirectional;
297 if (UCD_Check(self)) {
298 const change_record *old = get_old_record(self, c);
299 if (old->category_changed == 0)
300 index = 0; /* unassigned */
301 else if (old->bidir_changed != 0xFF)
302 index = old->bidir_changed;
303 }
304 return PyUnicode_FromString(_PyUnicode_BidirectionalNames[index]);
305 }
306
307 /*[clinic input]
308 unicodedata.UCD.combining -> int
309
310 self: self
311 chr: int(accept={str})
312 /
313
314 Returns the canonical combining class assigned to the character chr as integer.
315
316 Returns 0 if no combining class is defined.
317 [clinic start generated code]*/
318
319 static int
320 unicodedata_UCD_combining_impl(PyObject *self, int chr)
321 /*[clinic end generated code: output=cad056d0cb6a5920 input=9f2d6b2a95d0a22a]*/
322 {
323 int index;
324 Py_UCS4 c = (Py_UCS4)chr;
325 index = (int) _getrecord_ex(c)->combining;
326 if (UCD_Check(self)) {
327 const change_record *old = get_old_record(self, c);
328 if (old->category_changed == 0)
329 index = 0; /* unassigned */
330 }
331 return index;
332 }
333
334 /*[clinic input]
335 unicodedata.UCD.mirrored -> int
336
337 self: self
338 chr: int(accept={str})
339 /
340
341 Returns the mirrored property assigned to the character chr as integer.
342
343 Returns 1 if the character has been identified as a "mirrored"
344 character in bidirectional text, 0 otherwise.
345 [clinic start generated code]*/
346
347 static int
348 unicodedata_UCD_mirrored_impl(PyObject *self, int chr)
349 /*[clinic end generated code: output=2532dbf8121b50e6 input=5dd400d351ae6f3b]*/
350 {
351 int index;
352 Py_UCS4 c = (Py_UCS4)chr;
353 index = (int) _getrecord_ex(c)->mirrored;
354 if (UCD_Check(self)) {
355 const change_record *old = get_old_record(self, c);
356 if (old->category_changed == 0)
357 index = 0; /* unassigned */
358 else if (old->mirrored_changed != 0xFF)
359 index = old->mirrored_changed;
360 }
361 return index;
362 }
363
364 /*[clinic input]
365 unicodedata.UCD.east_asian_width
366
367 self: self
368 chr: int(accept={str})
369 /
370
371 Returns the east asian width assigned to the character chr as string.
372 [clinic start generated code]*/
373
374 static PyObject *
375 unicodedata_UCD_east_asian_width_impl(PyObject *self, int chr)
376 /*[clinic end generated code: output=484e8537d9ee8197 input=c4854798aab026e0]*/
377 {
378 int index;
379 Py_UCS4 c = (Py_UCS4)chr;
380 index = (int) _getrecord_ex(c)->east_asian_width;
381 if (UCD_Check(self)) {
382 const change_record *old = get_old_record(self, c);
383 if (old->category_changed == 0)
384 index = 0; /* unassigned */
385 else if (old->east_asian_width_changed != 0xFF)
386 index = old->east_asian_width_changed;
387 }
388 return PyUnicode_FromString(_PyUnicode_EastAsianWidthNames[index]);
389 }
390
391 /*[clinic input]
392 unicodedata.UCD.decomposition
393
394 self: self
395 chr: int(accept={str})
396 /
397
398 Returns the character decomposition mapping assigned to the character chr as string.
399
400 An empty string is returned in case no such mapping is defined.
401 [clinic start generated code]*/
402
403 static PyObject *
404 unicodedata_UCD_decomposition_impl(PyObject *self, int chr)
405 /*[clinic end generated code: output=7d699f3ec7565d27 input=e4c12459ad68507b]*/
406 {
407 char decomp[256];
408 int code, index, count;
409 size_t i;
410 unsigned int prefix_index;
411 Py_UCS4 c = (Py_UCS4)chr;
412
413 code = (int)c;
414
415 if (UCD_Check(self)) {
416 const change_record *old = get_old_record(self, c);
417 if (old->category_changed == 0)
418 return PyUnicode_FromString(""); /* unassigned */
419 }
420
421 if (code < 0 || code >= 0x110000)
422 index = 0;
423 else {
424 index = decomp_index1[(code>>DECOMP_SHIFT)];
425 index = decomp_index2[(index<<DECOMP_SHIFT)+
426 (code&((1<<DECOMP_SHIFT)-1))];
427 }
428
/* High byte is the number of code points in the decomposition (usually
   one or two); low byte is an index into decomp_prefix, the decomposition
   type tag (empty for canonical decompositions). */
431 count = decomp_data[index] >> 8;
432
/* XXX: could allocate the result string up front instead
   (strlen(prefix) + 5 * count + 1 bytes) */
435
436 /* Based on how index is calculated above and decomp_data is generated
437 from Tools/unicode/makeunicodedata.py, it should not be possible
438 to overflow decomp_prefix. */
439 prefix_index = decomp_data[index] & 255;
440 assert(prefix_index < Py_ARRAY_LENGTH(decomp_prefix));
441
442 /* copy prefix */
443 i = strlen(decomp_prefix[prefix_index]);
444 memcpy(decomp, decomp_prefix[prefix_index], i);
445
446 while (count-- > 0) {
447 if (i)
448 decomp[i++] = ' ';
449 assert(i < sizeof(decomp));
450 PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
451 decomp_data[++index]);
452 i += strlen(decomp + i);
453 }
454 return PyUnicode_FromStringAndSize(decomp, i);
455 }
456
457 static void
458 get_decomp_record(PyObject *self, Py_UCS4 code,
459 int *index, int *prefix, int *count)
460 {
461 if (code >= 0x110000) {
462 *index = 0;
463 }
464 else if (UCD_Check(self)
465 && get_old_record(self, code)->category_changed==0) {
466 /* unassigned in old version */
467 *index = 0;
468 }
469 else {
470 *index = decomp_index1[(code>>DECOMP_SHIFT)];
471 *index = decomp_index2[(*index<<DECOMP_SHIFT)+
472 (code&((1<<DECOMP_SHIFT)-1))];
473 }
474
/* High byte is the number of code points in the decomposition (usually
   one or two); low byte is an index into decomp_prefix, the decomposition
   type tag (empty for canonical decompositions). */
477 *count = decomp_data[*index] >> 8;
478 *prefix = decomp_data[*index] & 255;
479
480 (*index)++;
481 }
482
483 #define SBase 0xAC00
484 #define LBase 0x1100
485 #define VBase 0x1161
486 #define TBase 0x11A7
487 #define LCount 19
488 #define VCount 21
489 #define TCount 28
490 #define NCount (VCount*TCount)
491 #define SCount (LCount*NCount)
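/* Hangul syllables (U+AC00..U+D7A3) are not stored in the decomposition
   tables; they decompose and compose arithmetically, following the
   conjoining jamo algorithm of the Unicode Standard:

       S = SBase + (LIndex * VCount + VIndex) * TCount + TIndex

   For example U+AC01 (HANGUL SYLLABLE GAG) has SIndex = 1, which yields
   LIndex = 0 (U+1100), VIndex = 0 (U+1161) and TIndex = 1 (U+11A8). */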
492
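/* Compute NFD (k == 0) or NFKD (k != 0) of 'input'.  Code points are
   pushed on a small stack and replaced by their decompositions until only
   undecomposable characters remain; compatibility decompositions (those
   with a prefix tag) are applied only when k is true.  The buffer is then
   put into canonical combining-class order. */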
493 static PyObject*
494 nfd_nfkd(PyObject *self, PyObject *input, int k)
495 {
496 PyObject *result;
497 Py_UCS4 *output;
498 Py_ssize_t i, o, osize;
499 int kind;
500 const void *data;
501 /* Longest decomposition in Unicode 3.2: U+FDFA */
502 Py_UCS4 stack[20];
503 Py_ssize_t space, isize;
504 int index, prefix, count, stackptr;
505 unsigned char prev, cur;
506
507 stackptr = 0;
508 isize = PyUnicode_GET_LENGTH(input);
509 space = isize;
510 /* Overallocate at most 10 characters. */
511 if (space > 10) {
512 if (space <= PY_SSIZE_T_MAX - 10)
513 space += 10;
514 }
515 else {
516 space *= 2;
517 }
518 osize = space;
519 output = PyMem_NEW(Py_UCS4, space);
520 if (!output) {
521 PyErr_NoMemory();
522 return NULL;
523 }
524 i = o = 0;
525 kind = PyUnicode_KIND(input);
526 data = PyUnicode_DATA(input);
527
528 while (i < isize) {
529 stack[stackptr++] = PyUnicode_READ(kind, data, i++);
530 while(stackptr) {
531 Py_UCS4 code = stack[--stackptr];
532 /* Hangul Decomposition adds three characters in
533 a single step, so we need at least that much room. */
534 if (space < 3) {
535 Py_UCS4 *new_output;
536 osize += 10;
537 space += 10;
538 new_output = PyMem_Realloc(output, osize*sizeof(Py_UCS4));
539 if (new_output == NULL) {
540 PyMem_Free(output);
541 PyErr_NoMemory();
542 return NULL;
543 }
544 output = new_output;
545 }
546 /* Hangul Decomposition. */
547 if (SBase <= code && code < (SBase+SCount)) {
548 int SIndex = code - SBase;
549 int L = LBase + SIndex / NCount;
550 int V = VBase + (SIndex % NCount) / TCount;
551 int T = TBase + SIndex % TCount;
552 output[o++] = L;
553 output[o++] = V;
554 space -= 2;
555 if (T != TBase) {
556 output[o++] = T;
557 space --;
558 }
559 continue;
560 }
561 /* normalization changes */
562 if (UCD_Check(self)) {
563 Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
564 if (value != 0) {
565 stack[stackptr++] = value;
566 continue;
567 }
568 }
569
570 /* Other decompositions. */
571 get_decomp_record(self, code, &index, &prefix, &count);
572
573 /* Copy character if it is not decomposable, or has a
574 compatibility decomposition, but we do NFD. */
575 if (!count || (prefix && !k)) {
576 output[o++] = code;
577 space--;
578 continue;
579 }
580 /* Copy decomposition onto the stack, in reverse
581 order. */
582 while(count) {
583 code = decomp_data[index + (--count)];
584 stack[stackptr++] = code;
585 }
586 }
587 }
588
589 result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
590 output, o);
591 PyMem_Free(output);
592 if (!result)
593 return NULL;
594 /* result is guaranteed to be ready, as it is compact. */
595 kind = PyUnicode_KIND(result);
596 data = PyUnicode_DATA(result);
597
598 /* Sort canonically. */
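/* Canonical ordering: within every run of characters whose combining
   class is non-zero, bubble characters backwards until the classes are in
   non-decreasing order.  Starters (class 0) are never moved and act as
   barriers. */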
599 i = 0;
600 prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
601 for (i++; i < PyUnicode_GET_LENGTH(result); i++) {
602 cur = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
603 if (prev == 0 || cur == 0 || prev <= cur) {
604 prev = cur;
605 continue;
606 }
/* Non-canonical order.  Need to swap the character at index i with the
   previous one. */
608 o = i - 1;
609 while (1) {
610 Py_UCS4 tmp = PyUnicode_READ(kind, data, o+1);
611 PyUnicode_WRITE(kind, data, o+1,
612 PyUnicode_READ(kind, data, o));
613 PyUnicode_WRITE(kind, data, o, tmp);
614 o--;
615 if (o < 0)
616 break;
617 prev = _getrecord_ex(PyUnicode_READ(kind, data, o))->combining;
618 if (prev == 0 || prev <= cur)
619 break;
620 }
621 prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
622 }
623 return result;
624 }
625
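/* nfc_first/nfc_last (from unicodedata_db.h) are sorted arrays of
   {start, count, index} ranges mapping the code points that can occur as
   the first resp. last character of a canonical composition onto small
   dense indices.  Returns -1 if 'code' can never take part in a
   composition in that position. */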
626 static int
627 find_nfc_index(const struct reindex* nfc, Py_UCS4 code)
628 {
629 unsigned int index;
630 for (index = 0; nfc[index].start; index++) {
631 unsigned int start = nfc[index].start;
632 if (code < start)
633 return -1;
634 if (code <= start + nfc[index].count) {
635 unsigned int delta = code - start;
636 return nfc[index].index + delta;
637 }
638 }
639 return -1;
640 }
641
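/* Compute NFC (k == 0) or NFKC (k != 0) of 'input': first decompose with
   nfd_nfkd(), then recombine each starter with following characters using
   the canonical composition table, treating Hangul syllables
   arithmetically.  Characters consumed by a composition are remembered in
   'skipped' and dropped when the main loop reaches them. */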
642 static PyObject*
643 nfc_nfkc(PyObject *self, PyObject *input, int k)
644 {
645 PyObject *result;
646 int kind;
647 const void *data;
648 Py_UCS4 *output;
649 Py_ssize_t i, i1, o, len;
650 int f,l,index,index1,comb;
651 Py_UCS4 code;
652 Py_ssize_t skipped[20];
653 int cskipped = 0;
654
655 result = nfd_nfkd(self, input, k);
656 if (!result)
657 return NULL;
658 /* result will be "ready". */
659 kind = PyUnicode_KIND(result);
660 data = PyUnicode_DATA(result);
661 len = PyUnicode_GET_LENGTH(result);
662
663 /* We allocate a buffer for the output.
664 If we find that we made no changes, we still return
665 the NFD result. */
666 output = PyMem_NEW(Py_UCS4, len);
667 if (!output) {
668 PyErr_NoMemory();
669 Py_DECREF(result);
670 return 0;
671 }
672 i = o = 0;
673
674 again:
675 while (i < len) {
676 for (index = 0; index < cskipped; index++) {
677 if (skipped[index] == i) {
/* The character at index i was already consumed by an earlier
   composition; drop it from the skip list and move on. */
680 skipped[index] = skipped[cskipped-1];
681 cskipped--;
682 i++;
683 goto again; /* continue while */
684 }
685 }
686 /* Hangul Composition. We don't need to check for <LV,T>
687 pairs, since we always have decomposed data. */
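/* For example, the decomposed sequence <U+1100, U+1161, U+11A8>
   recombines arithmetically into U+AC01 (HANGUL SYLLABLE GAG). */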
688 code = PyUnicode_READ(kind, data, i);
689 if (LBase <= code && code < (LBase+LCount) &&
690 i + 1 < len &&
691 VBase <= PyUnicode_READ(kind, data, i+1) &&
692 PyUnicode_READ(kind, data, i+1) < (VBase+VCount)) {
693 /* check L character is a modern leading consonant (0x1100 ~ 0x1112)
694 and V character is a modern vowel (0x1161 ~ 0x1175). */
695 int LIndex, VIndex;
696 LIndex = code - LBase;
697 VIndex = PyUnicode_READ(kind, data, i+1) - VBase;
698 code = SBase + (LIndex*VCount+VIndex)*TCount;
699 i+=2;
700 if (i < len &&
701 TBase < PyUnicode_READ(kind, data, i) &&
702 PyUnicode_READ(kind, data, i) < (TBase+TCount)) {
703 /* check T character is a modern trailing consonant
704 (0x11A8 ~ 0x11C2). */
705 code += PyUnicode_READ(kind, data, i)-TBase;
706 i++;
707 }
708 output[o++] = code;
709 continue;
710 }
711
712 /* code is still input[i] here */
713 f = find_nfc_index(nfc_first, code);
714 if (f == -1) {
715 output[o++] = code;
716 i++;
717 continue;
718 }
719 /* Find next unblocked character. */
720 i1 = i+1;
721 comb = 0;
722 /* output base character for now; might be updated later. */
723 output[o] = PyUnicode_READ(kind, data, i);
724 while (i1 < len) {
725 Py_UCS4 code1 = PyUnicode_READ(kind, data, i1);
726 int comb1 = _getrecord_ex(code1)->combining;
727 if (comb) {
728 if (comb1 == 0)
729 break;
730 if (comb >= comb1) {
731 /* Character is blocked. */
732 i1++;
733 continue;
734 }
735 }
736 l = find_nfc_index(nfc_last, code1);
737 /* i1 cannot be combined with i. If i1
738 is a starter, we don't need to look further.
739 Otherwise, record the combining class. */
740 if (l == -1) {
741 not_combinable:
742 if (comb1 == 0)
743 break;
744 comb = comb1;
745 i1++;
746 continue;
747 }
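/* (first, last) indexes a conceptual composition matrix with row length
   TOTAL_LAST, stored compressed as the two-level comp_index/comp_data
   tables; an entry of 0 means the pair does not compose. */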
748 index = f*TOTAL_LAST + l;
749 index1 = comp_index[index >> COMP_SHIFT];
750 code = comp_data[(index1<<COMP_SHIFT)+
751 (index&((1<<COMP_SHIFT)-1))];
752 if (code == 0)
753 goto not_combinable;
754
755 /* Replace the original character. */
756 output[o] = code;
757 /* Mark the second character unused. */
758 assert(cskipped < 20);
759 skipped[cskipped++] = i1;
760 i1++;
761 f = find_nfc_index(nfc_first, output[o]);
762 if (f == -1)
763 break;
764 }
765 /* Output character was already written.
766 Just advance the indices. */
767 o++; i++;
768 }
769 if (o == len) {
770 /* No changes. Return original string. */
771 PyMem_Free(output);
772 return result;
773 }
774 Py_DECREF(result);
775 result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
776 output, o);
777 PyMem_Free(output);
778 return result;
779 }
780
781 // This needs to match the logic in makeunicodedata.py
782 // which constructs the quickcheck data.
783 typedef enum {YES = 0, MAYBE = 1, NO = 2} QuickcheckResult;
784
785 /* Run the Unicode normalization "quickcheck" algorithm.
786 *
787 * Return YES or NO if quickcheck determines the input is certainly
788 * normalized or certainly not, and MAYBE if quickcheck is unable to
789 * tell.
790 *
791 * If `yes_only` is true, then return MAYBE as soon as we determine
792 * the answer is not YES.
793 *
794 * For background and details on the algorithm, see UAX #15:
795 * https://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms
796 */
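/* Each database record packs the quickcheck results for all four forms
   into one byte, two bits per form: bits 0-1 NFD, 2-3 NFKD, 4-5 NFC,
   6-7 NFKC (hence quickcheck_shift below).  Each 2-bit field holds a
   QuickcheckResult value. */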
797 static QuickcheckResult
798 is_normalized_quickcheck(PyObject *self, PyObject *input, bool nfc, bool k,
799 bool yes_only)
800 {
/* When UCD 3.2.0 is requested, the quickcheck data (generated for the
   current Unicode version) cannot be used, so always answer MAYBE. */
802 if (UCD_Check(self)) {
803 return MAYBE;
804 }
805
806 if (PyUnicode_IS_ASCII(input)) {
807 return YES;
808 }
809
810 Py_ssize_t i, len;
811 int kind;
812 const void *data;
813 unsigned char prev_combining = 0;
814
815 /* The two quickcheck bits at this shift have type QuickcheckResult. */
816 int quickcheck_shift = (nfc ? 4 : 0) + (k ? 2 : 0);
817
818 QuickcheckResult result = YES; /* certainly normalized, unless we find something */
819
820 i = 0;
821 kind = PyUnicode_KIND(input);
822 data = PyUnicode_DATA(input);
823 len = PyUnicode_GET_LENGTH(input);
824 while (i < len) {
825 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
826 const _PyUnicode_DatabaseRecord *record = _getrecord_ex(ch);
827
828 unsigned char combining = record->combining;
829 if (combining && prev_combining > combining)
830 return NO; /* non-canonical sort order, not normalized */
831 prev_combining = combining;
832
833 unsigned char quickcheck_whole = record->normalization_quick_check;
834 if (yes_only) {
835 if (quickcheck_whole & (3 << quickcheck_shift))
836 return MAYBE;
837 } else {
838 switch ((quickcheck_whole >> quickcheck_shift) & 3) {
839 case NO:
840 return NO;
841 case MAYBE:
842 result = MAYBE; /* this string might need normalization */
843 }
844 }
845 }
846 return result;
847 }
848
849 /*[clinic input]
850 unicodedata.UCD.is_normalized
851
852 self: self
853 form: unicode
854 unistr as input: unicode
855 /
856
857 Return whether the Unicode string unistr is in the normal form 'form'.
858
859 Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.
860 [clinic start generated code]*/
861
862 static PyObject *
863 unicodedata_UCD_is_normalized_impl(PyObject *self, PyObject *form,
864 PyObject *input)
865 /*[clinic end generated code: output=11e5a3694e723ca5 input=a544f14cea79e508]*/
866 {
867 if (PyUnicode_READY(input) == -1) {
868 return NULL;
869 }
870
871 if (PyUnicode_GET_LENGTH(input) == 0) {
872 /* special case empty input strings. */
873 Py_RETURN_TRUE;
874 }
875
876 PyObject *result;
877 bool nfc = false;
878 bool k = false;
879 QuickcheckResult m;
880
881 PyObject *cmp;
882 int match = 0;
883
884 if (PyUnicode_CompareWithASCIIString(form, "NFC") == 0) {
885 nfc = true;
886 }
887 else if (PyUnicode_CompareWithASCIIString(form, "NFKC") == 0) {
888 nfc = true;
889 k = true;
890 }
891 else if (PyUnicode_CompareWithASCIIString(form, "NFD") == 0) {
892 /* matches default values for `nfc` and `k` */
893 }
894 else if (PyUnicode_CompareWithASCIIString(form, "NFKD") == 0) {
895 k = true;
896 }
897 else {
898 PyErr_SetString(PyExc_ValueError, "invalid normalization form");
899 return NULL;
900 }
901
902 m = is_normalized_quickcheck(self, input, nfc, k, false);
903
904 if (m == MAYBE) {
905 cmp = (nfc ? nfc_nfkc : nfd_nfkd)(self, input, k);
906 if (cmp == NULL) {
907 return NULL;
908 }
909 match = PyUnicode_Compare(input, cmp);
910 Py_DECREF(cmp);
911 result = (match == 0) ? Py_True : Py_False;
912 }
913 else {
914 result = (m == YES) ? Py_True : Py_False;
915 }
916
917 return Py_NewRef(result);
918 }
919
920
921 /*[clinic input]
922 unicodedata.UCD.normalize
923
924 self: self
925 form: unicode
926 unistr as input: unicode
927 /
928
929 Return the normal form 'form' for the Unicode string unistr.
930
931 Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.
932 [clinic start generated code]*/
933
934 static PyObject *
935 unicodedata_UCD_normalize_impl(PyObject *self, PyObject *form,
936 PyObject *input)
937 /*[clinic end generated code: output=05ca4385a2ad6983 input=3a5206c0ad2833fb]*/
938 {
939 if (PyUnicode_GET_LENGTH(input) == 0) {
940 /* Special case empty input strings, since resizing
941 them later would cause internal errors. */
942 return Py_NewRef(input);
943 }
944
945 if (PyUnicode_CompareWithASCIIString(form, "NFC") == 0) {
946 if (is_normalized_quickcheck(self, input,
947 true, false, true) == YES) {
948 return Py_NewRef(input);
949 }
950 return nfc_nfkc(self, input, 0);
951 }
952 if (PyUnicode_CompareWithASCIIString(form, "NFKC") == 0) {
953 if (is_normalized_quickcheck(self, input,
954 true, true, true) == YES) {
955 return Py_NewRef(input);
956 }
957 return nfc_nfkc(self, input, 1);
958 }
959 if (PyUnicode_CompareWithASCIIString(form, "NFD") == 0) {
960 if (is_normalized_quickcheck(self, input,
961 false, false, true) == YES) {
962 return Py_NewRef(input);
963 }
964 return nfd_nfkd(self, input, 0);
965 }
966 if (PyUnicode_CompareWithASCIIString(form, "NFKD") == 0) {
967 if (is_normalized_quickcheck(self, input,
968 false, true, true) == YES) {
969 return Py_NewRef(input);
970 }
971 return nfd_nfkd(self, input, 1);
972 }
973 PyErr_SetString(PyExc_ValueError, "invalid normalization form");
974 return NULL;
975 }
976
977 /* -------------------------------------------------------------------- */
978 /* unicode character name tables */
979
980 /* data file generated by Tools/unicode/makeunicodedata.py */
981 #include "unicodename_db.h"
982
983 /* -------------------------------------------------------------------- */
984 /* database code (cut and pasted from the unidb package) */
985
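/* Case-insensitive multiplicative string hash used by the name lookup:
   characters are upper-cased before hashing, and whenever the value grows
   beyond 24 bits the top byte is folded back in with XOR so the hash
   stays within 24 bits. */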
986 static unsigned long
987 _gethash(const char *s, int len, int scale)
988 {
989 int i;
990 unsigned long h = 0;
991 unsigned long ix;
992 for (i = 0; i < len; i++) {
993 h = (h * scale) + (unsigned char) Py_TOUPPER(s[i]);
994 ix = h & 0xff000000;
995 if (ix)
996 h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff;
997 }
998 return h;
999 }
1000
1001 static const char * const hangul_syllables[][3] = {
1002 { "G", "A", "" },
1003 { "GG", "AE", "G" },
1004 { "N", "YA", "GG" },
1005 { "D", "YAE", "GS" },
{ "DD", "EO", "N" },
1007 { "R", "E", "NJ" },
1008 { "M", "YEO", "NH" },
1009 { "B", "YE", "D" },
1010 { "BB", "O", "L" },
1011 { "S", "WA", "LG" },
1012 { "SS", "WAE", "LM" },
1013 { "", "OE", "LB" },
1014 { "J", "YO", "LS" },
1015 { "JJ", "U", "LT" },
1016 { "C", "WEO", "LP" },
1017 { "K", "WE", "LH" },
1018 { "T", "WI", "M" },
1019 { "P", "YU", "B" },
1020 { "H", "EU", "BS" },
1021 { 0, "YI", "S" },
1022 { 0, "I", "SS" },
1023 { 0, 0, "NG" },
1024 { 0, 0, "J" },
1025 { 0, 0, "C" },
1026 { 0, 0, "K" },
1027 { 0, 0, "T" },
1028 { 0, 0, "P" },
1029 { 0, 0, "H" }
1030 };
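/* The columns above hold the romanized jamo spellings used to build and
   parse "HANGUL SYLLABLE ..." names: column 0 the 19 leading consonants,
   column 1 the 21 vowels, column 2 the 28 trailing consonants (first
   entry empty: no trailing consonant). */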
1031
1032 /* These ranges need to match makeunicodedata.py:cjk_ranges. */
1033 static int
1034 is_unified_ideograph(Py_UCS4 code)
1035 {
1036 return
1037 (0x3400 <= code && code <= 0x4DBF) || /* CJK Ideograph Extension A */
1038 (0x4E00 <= code && code <= 0x9FFF) || /* CJK Ideograph */
1039 (0x20000 <= code && code <= 0x2A6DF) || /* CJK Ideograph Extension B */
1040 (0x2A700 <= code && code <= 0x2B739) || /* CJK Ideograph Extension C */
1041 (0x2B740 <= code && code <= 0x2B81D) || /* CJK Ideograph Extension D */
1042 (0x2B820 <= code && code <= 0x2CEA1) || /* CJK Ideograph Extension E */
1043 (0x2CEB0 <= code && code <= 0x2EBE0) || /* CJK Ideograph Extension F */
1044 (0x30000 <= code && code <= 0x3134A) || /* CJK Ideograph Extension G */
1045 (0x31350 <= code && code <= 0x323AF); /* CJK Ideograph Extension H */
1046 }
1047
1048 /* macros used to determine if the given code point is in the PUA range that
1049 * we are using to store aliases and named sequences */
#define IS_ALIAS(cp) (((cp) >= aliases_start) && ((cp) < aliases_end))
#define IS_NAMED_SEQ(cp) (((cp) >= named_sequences_start) && \
                          ((cp) < named_sequences_end))
1053
1054 static int
1055 _getucname(PyObject *self,
1056 Py_UCS4 code, char* buffer, int buflen, int with_alias_and_seq)
1057 {
1058 /* Find the name associated with the given code point.
1059 * If with_alias_and_seq is 1, check for names in the Private Use Area 15
1060 * that we are using for aliases and named sequences. */
1061 int offset;
1062 int i;
1063 int word;
1064 const unsigned char* w;
1065
1066 if (code >= 0x110000)
1067 return 0;
1068
1069 /* XXX should we just skip all the code points in the PUAs here? */
1070 if (!with_alias_and_seq && (IS_ALIAS(code) || IS_NAMED_SEQ(code)))
1071 return 0;
1072
1073 if (UCD_Check(self)) {
1074 /* in 3.2.0 there are no aliases and named sequences */
1075 const change_record *old;
1076 if (IS_ALIAS(code) || IS_NAMED_SEQ(code))
1077 return 0;
1078 old = get_old_record(self, code);
1079 if (old->category_changed == 0) {
1080 /* unassigned */
1081 return 0;
1082 }
1083 }
1084
1085 if (SBase <= code && code < SBase+SCount) {
1086 /* Hangul syllable. */
1087 int SIndex = code - SBase;
1088 int L = SIndex / NCount;
1089 int V = (SIndex % NCount) / TCount;
1090 int T = SIndex % TCount;
1091
1092 if (buflen < 27)
1093 /* Worst case: HANGUL SYLLABLE <10chars>. */
1094 return 0;
1095 strcpy(buffer, "HANGUL SYLLABLE ");
1096 buffer += 16;
1097 strcpy(buffer, hangul_syllables[L][0]);
1098 buffer += strlen(hangul_syllables[L][0]);
1099 strcpy(buffer, hangul_syllables[V][1]);
1100 buffer += strlen(hangul_syllables[V][1]);
1101 strcpy(buffer, hangul_syllables[T][2]);
1102 buffer += strlen(hangul_syllables[T][2]);
1103 *buffer = '\0';
1104 return 1;
1105 }
1106
1107 if (is_unified_ideograph(code)) {
1108 if (buflen < 28)
1109 /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
1110 return 0;
1111 sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
1112 return 1;
1113 }
1114
1115 /* get offset into phrasebook */
1116 offset = phrasebook_offset1[(code>>phrasebook_shift)];
1117 offset = phrasebook_offset2[(offset<<phrasebook_shift) +
1118 (code&((1<<phrasebook_shift)-1))];
1119 if (!offset)
1120 return 0;
1121
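/* Decode the name from the phrasebook: it is a sequence of word indices
   into the shared lexicon.  A byte below phrasebook_short is a one-byte
   index; otherwise the byte and the following byte form a two-byte index.
   Each lexicon word is terminated by a byte with bit 7 set. */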
1122 i = 0;
1123
1124 for (;;) {
1125 /* get word index */
1126 word = phrasebook[offset] - phrasebook_short;
1127 if (word >= 0) {
1128 word = (word << 8) + phrasebook[offset+1];
1129 offset += 2;
1130 } else
1131 word = phrasebook[offset++];
1132 if (i) {
1133 if (i > buflen)
1134 return 0; /* buffer overflow */
1135 buffer[i++] = ' ';
1136 }
1137 /* copy word string from lexicon. the last character in the
1138 word has bit 7 set. the last word in a string ends with
1139 0x80 */
1140 w = lexicon + lexicon_offset[word];
1141 while (*w < 128) {
1142 if (i >= buflen)
1143 return 0; /* buffer overflow */
1144 buffer[i++] = *w++;
1145 }
1146 if (i >= buflen)
1147 return 0; /* buffer overflow */
1148 buffer[i++] = *w & 127;
1149 if (*w == 128)
1150 break; /* end of word */
1151 }
1152
1153 return 1;
1154 }
1155
1156 static int
1157 capi_getucname(Py_UCS4 code,
1158 char* buffer, int buflen,
1159 int with_alias_and_seq)
1160 {
1161 return _getucname(NULL, code, buffer, buflen, with_alias_and_seq);
1162
1163 }
1164
1165 static int
1166 _cmpname(PyObject *self, int code, const char* name, int namelen)
1167 {
1168 /* check if code corresponds to the given name */
1169 int i;
1170 char buffer[NAME_MAXLEN+1];
1171 if (!_getucname(self, code, buffer, NAME_MAXLEN, 1))
1172 return 0;
1173 for (i = 0; i < namelen; i++) {
1174 if (Py_TOUPPER(name[i]) != buffer[i])
1175 return 0;
1176 }
1177 return buffer[namelen] == '\0';
1178 }
1179
1180 static void
1181 find_syllable(const char *str, int *len, int *pos, int count, int column)
1182 {
1183 int i, len1;
1184 *len = -1;
1185 for (i = 0; i < count; i++) {
1186 const char *s = hangul_syllables[i][column];
1187 len1 = Py_SAFE_DOWNCAST(strlen(s), size_t, int);
1188 if (len1 <= *len)
1189 continue;
1190 if (strncmp(str, s, len1) == 0) {
1191 *len = len1;
1192 *pos = i;
1193 }
1194 }
1195 if (*len == -1) {
1196 *len = 0;
1197 }
1198 }
1199
1200 static int
1201 _check_alias_and_seq(unsigned int cp, Py_UCS4* code, int with_named_seq)
1202 {
1203 /* check if named sequences are allowed */
1204 if (!with_named_seq && IS_NAMED_SEQ(cp))
1205 return 0;
1206 /* if the code point is in the PUA range that we use for aliases,
1207 * convert it to obtain the right code point */
1208 if (IS_ALIAS(cp))
1209 *code = name_aliases[cp-aliases_start];
1210 else
1211 *code = cp;
1212 return 1;
1213 }
1214
1215 static int
1216 _getcode(PyObject* self,
1217 const char* name, int namelen, Py_UCS4* code, int with_named_seq)
1218 {
1219 /* Return the code point associated with the given name.
1220 * Named aliases are resolved too (unless self != NULL (i.e. we are using
1221 * 3.2.0)). If with_named_seq is 1, returns the PUA code point that we are
1222 * using for the named sequence, and the caller must then convert it. */
1223 unsigned int h, v;
1224 unsigned int mask = code_size-1;
1225 unsigned int i, incr;
1226
1227 /* Check for hangul syllables. */
1228 if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
1229 int len, L = -1, V = -1, T = -1;
1230 const char *pos = name + 16;
1231 find_syllable(pos, &len, &L, LCount, 0);
1232 pos += len;
1233 find_syllable(pos, &len, &V, VCount, 1);
1234 pos += len;
1235 find_syllable(pos, &len, &T, TCount, 2);
1236 pos += len;
1237 if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
1238 *code = SBase + (L*VCount+V)*TCount + T;
1239 return 1;
1240 }
1241 /* Otherwise, it's an illegal syllable name. */
1242 return 0;
1243 }
1244
1245 /* Check for unified ideographs. */
1246 if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
1247 /* Four or five hexdigits must follow. */
1248 v = 0;
1249 name += 22;
1250 namelen -= 22;
1251 if (namelen != 4 && namelen != 5)
1252 return 0;
1253 while (namelen--) {
1254 v *= 16;
1255 if (*name >= '0' && *name <= '9')
1256 v += *name - '0';
1257 else if (*name >= 'A' && *name <= 'F')
1258 v += *name - 'A' + 10;
1259 else
1260 return 0;
1261 name++;
1262 }
1263 if (!is_unified_ideograph(v))
1264 return 0;
1265 *code = v;
1266 return 1;
1267 }
1268
/* The following is the same as Python's dictionary lookup, with only
   minor changes.  See the makeunicodedata script for more details. */
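/* Probing: the initial slot is (~h) & mask; on a collision the probe
   advances by an increment derived from the hash, which is doubled after
   every step and folded back with the generator polynomial code_poly once
   it outgrows the table (a Galois LFSR-style step). */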
1272
1273 h = (unsigned int) _gethash(name, namelen, code_magic);
1274 i = (~h) & mask;
1275 v = code_hash[i];
1276 if (!v)
1277 return 0;
1278 if (_cmpname(self, v, name, namelen)) {
1279 return _check_alias_and_seq(v, code, with_named_seq);
1280 }
1281 incr = (h ^ (h >> 3)) & mask;
1282 if (!incr)
1283 incr = mask;
1284 for (;;) {
1285 i = (i + incr) & mask;
1286 v = code_hash[i];
1287 if (!v)
1288 return 0;
1289 if (_cmpname(self, v, name, namelen)) {
1290 return _check_alias_and_seq(v, code, with_named_seq);
1291 }
1292 incr = incr << 1;
1293 if (incr > mask)
1294 incr = incr ^ code_poly;
1295 }
1296 }
1297
1298 static int
1299 capi_getcode(const char* name, int namelen, Py_UCS4* code,
1300 int with_named_seq)
1301 {
1302 return _getcode(NULL, name, namelen, code, with_named_seq);
1303
1304 }
1305
1306 static void
1307 unicodedata_destroy_capi(PyObject *capsule)
1308 {
1309 void *capi = PyCapsule_GetPointer(capsule, PyUnicodeData_CAPSULE_NAME);
1310 PyMem_Free(capi);
1311 }
1312
1313 static PyObject *
1314 unicodedata_create_capi(void)
1315 {
1316 _PyUnicode_Name_CAPI *capi = PyMem_Malloc(sizeof(_PyUnicode_Name_CAPI));
1317 if (capi == NULL) {
1318 PyErr_NoMemory();
1319 return NULL;
1320 }
1321 capi->getname = capi_getucname;
1322 capi->getcode = capi_getcode;
1323
1324 PyObject *capsule = PyCapsule_New(capi,
1325 PyUnicodeData_CAPSULE_NAME,
1326 unicodedata_destroy_capi);
1327 if (capsule == NULL) {
1328 PyMem_Free(capi);
1329 }
1330 return capsule;
}
1332
1333
1334 /* -------------------------------------------------------------------- */
1335 /* Python bindings */
1336
1337 /*[clinic input]
1338 unicodedata.UCD.name
1339
1340 self: self
1341 chr: int(accept={str})
1342 default: object=NULL
1343 /
1344
1345 Returns the name assigned to the character chr as a string.
1346
1347 If no name is defined, default is returned, or, if not given,
1348 ValueError is raised.
1349 [clinic start generated code]*/
1350
1351 static PyObject *
1352 unicodedata_UCD_name_impl(PyObject *self, int chr, PyObject *default_value)
1353 /*[clinic end generated code: output=6bbb37a326407707 input=3e0367f534de56d9]*/
1354 {
1355 char name[NAME_MAXLEN+1];
1356 Py_UCS4 c = (Py_UCS4)chr;
1357
1358 if (!_getucname(self, c, name, NAME_MAXLEN, 0)) {
1359 if (default_value == NULL) {
1360 PyErr_SetString(PyExc_ValueError, "no such name");
1361 return NULL;
1362 }
1363 else {
1364 return Py_NewRef(default_value);
1365 }
1366 }
1367
1368 return PyUnicode_FromString(name);
1369 }
1370
1371 /*[clinic input]
1372 unicodedata.UCD.lookup
1373
1374 self: self
1375 name: str(accept={str, robuffer}, zeroes=True)
1376 /
1377
1378 Look up character by name.
1379
1380 If a character with the given name is found, return the
1381 corresponding character. If not found, KeyError is raised.
1382 [clinic start generated code]*/
1383
1384 static PyObject *
1385 unicodedata_UCD_lookup_impl(PyObject *self, const char *name,
1386 Py_ssize_t name_length)
1387 /*[clinic end generated code: output=7f03fc4959b242f6 input=a557be0f8607a0d6]*/
1388 {
1389 Py_UCS4 code;
1390 unsigned int index;
1391 if (name_length > NAME_MAXLEN) {
1392 PyErr_SetString(PyExc_KeyError, "name too long");
1393 return NULL;
1394 }
1395
1396 if (!_getcode(self, name, (int)name_length, &code, 1)) {
1397 PyErr_Format(PyExc_KeyError, "undefined character name '%s'", name);
1398 return NULL;
1399 }
1400 /* check if code is in the PUA range that we use for named sequences
1401 and convert it */
1402 if (IS_NAMED_SEQ(code)) {
1403 index = code-named_sequences_start;
1404 return PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND,
1405 named_sequences[index].seq,
1406 named_sequences[index].seqlen);
1407 }
1408 return PyUnicode_FromOrdinal(code);
1409 }
1410
1411 // List of functions used to define module functions *AND* unicodedata.UCD
1412 // methods. For module functions, self is the module. For UCD methods, self
1413 // is an UCD instance. The UCD_Check() macro is used to check if self is
1414 // an UCD instance.
1415 static PyMethodDef unicodedata_functions[] = {
1416 UNICODEDATA_UCD_DECIMAL_METHODDEF
1417 UNICODEDATA_UCD_DIGIT_METHODDEF
1418 UNICODEDATA_UCD_NUMERIC_METHODDEF
1419 UNICODEDATA_UCD_CATEGORY_METHODDEF
1420 UNICODEDATA_UCD_BIDIRECTIONAL_METHODDEF
1421 UNICODEDATA_UCD_COMBINING_METHODDEF
1422 UNICODEDATA_UCD_MIRRORED_METHODDEF
1423 UNICODEDATA_UCD_EAST_ASIAN_WIDTH_METHODDEF
1424 UNICODEDATA_UCD_DECOMPOSITION_METHODDEF
1425 UNICODEDATA_UCD_NAME_METHODDEF
1426 UNICODEDATA_UCD_LOOKUP_METHODDEF
1427 UNICODEDATA_UCD_IS_NORMALIZED_METHODDEF
1428 UNICODEDATA_UCD_NORMALIZE_METHODDEF
1429 {NULL, NULL} /* sentinel */
1430 };
1431
1432 static int
1433 ucd_traverse(PreviousDBVersion *self, visitproc visit, void *arg)
1434 {
1435 Py_VISIT(Py_TYPE(self));
1436 return 0;
1437 }
1438
1439 static void
1440 ucd_dealloc(PreviousDBVersion *self)
1441 {
1442 PyTypeObject *tp = Py_TYPE(self);
1443 PyObject_GC_UnTrack(self);
1444 PyObject_GC_Del(self);
1445 Py_DECREF(tp);
1446 }
1447
1448 static PyType_Slot ucd_type_slots[] = {
1449 {Py_tp_dealloc, ucd_dealloc},
1450 {Py_tp_traverse, ucd_traverse},
1451 {Py_tp_getattro, PyObject_GenericGetAttr},
1452 {Py_tp_methods, unicodedata_functions},
1453 {Py_tp_members, DB_members},
1454 {0, 0}
1455 };
1456
1457 static PyType_Spec ucd_type_spec = {
1458 .name = "unicodedata.UCD",
1459 .basicsize = sizeof(PreviousDBVersion),
1460 .flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_DISALLOW_INSTANTIATION |
1461 Py_TPFLAGS_HAVE_GC | Py_TPFLAGS_IMMUTABLETYPE),
1462 .slots = ucd_type_slots
1463 };
1464
1465 PyDoc_STRVAR(unicodedata_docstring,
1466 "This module provides access to the Unicode Character Database which\n\
1467 defines character properties for all Unicode characters. The data in\n\
1468 this database is based on the UnicodeData.txt file version\n\
1469 " UNIDATA_VERSION " which is publicly available from ftp://ftp.unicode.org/.\n\
1470 \n\
1471 The module uses the same names and symbols as defined by the\n\
1472 UnicodeData File Format " UNIDATA_VERSION ".");
1473
1474 static int
1475 unicodedata_exec(PyObject *module)
1476 {
1477 if (PyModule_AddStringConstant(module, "unidata_version", UNIDATA_VERSION) < 0) {
1478 return -1;
1479 }
1480
1481 PyTypeObject *ucd_type = (PyTypeObject *)PyType_FromSpec(&ucd_type_spec);
1482 if (ucd_type == NULL) {
1483 return -1;
1484 }
1485
1486 if (PyModule_AddType(module, ucd_type) < 0) {
1487 Py_DECREF(ucd_type);
1488 return -1;
1489 }
1490
1491 // Unicode database version 3.2.0 used by the IDNA encoding
1492 PyObject *v;
1493 v = new_previous_version(ucd_type, "3.2.0",
1494 get_change_3_2_0, normalization_3_2_0);
1495 Py_DECREF(ucd_type);
1496 if (v == NULL) {
1497 return -1;
1498 }
1499 if (PyModule_AddObject(module, "ucd_3_2_0", v) < 0) {
1500 Py_DECREF(v);
1501 return -1;
1502 }
1503
1504 /* Export C API */
1505 PyObject *capsule = unicodedata_create_capi();
1506 if (capsule == NULL) {
1507 return -1;
1508 }
1509 int rc = PyModule_AddObjectRef(module, "_ucnhash_CAPI", capsule);
1510 Py_DECREF(capsule);
1511 if (rc < 0) {
1512 return -1;
1513 }
1514 return 0;
1515 }
1516
1517 static PyModuleDef_Slot unicodedata_slots[] = {
1518 {Py_mod_exec, unicodedata_exec},
1519 {Py_mod_multiple_interpreters, Py_MOD_PER_INTERPRETER_GIL_SUPPORTED},
1520 {0, NULL}
1521 };
1522
1523 static struct PyModuleDef unicodedata_module = {
1524 PyModuleDef_HEAD_INIT,
1525 .m_name = "unicodedata",
1526 .m_doc = unicodedata_docstring,
1527 .m_size = 0,
1528 .m_methods = unicodedata_functions,
1529 .m_slots = unicodedata_slots,
1530 };
1531
1532 PyMODINIT_FUNC
1533 PyInit_unicodedata(void)
1534 {
1535 return PyModuleDef_Init(&unicodedata_module);
1536 }
1537
1538
1539 /*
1540 Local variables:
1541 c-basic-offset: 4
1542 indent-tabs-mode: nil
1543 End:
1544 */