1  #ifndef Py_CPYTHON_UNICODEOBJECT_H
       2  #  error "this header file must not be included directly"
       3  #endif
       4  
/* Py_UNICODE was the native Unicode storage format (code unit) used by
   Python and represents a single Unicode element in the Unicode type.
   With PEP 393, Py_UNICODE is deprecated and replaced with a
   typedef to wchar_t.

   PY_UNICODE_TYPE expands to the same underlying C type (wchar_t). */
#define PY_UNICODE_TYPE wchar_t
/* Py_DEPRECATED(3.3) */ typedef wchar_t Py_UNICODE;
      11  
      12  /* --- Internal Unicode Operations ---------------------------------------- */
      13  
      14  // Static inline functions to work with surrogates
      15  static inline int Py_UNICODE_IS_SURROGATE(Py_UCS4 ch) {
      16      return (0xD800 <= ch && ch <= 0xDFFF);
      17  }
      18  static inline int Py_UNICODE_IS_HIGH_SURROGATE(Py_UCS4 ch) {
      19      return (0xD800 <= ch && ch <= 0xDBFF);
      20  }
      21  static inline int Py_UNICODE_IS_LOW_SURROGATE(Py_UCS4 ch) {
      22      return (0xDC00 <= ch && ch <= 0xDFFF);
      23  }
      24  
      25  // Join two surrogate characters and return a single Py_UCS4 value.
      26  static inline Py_UCS4 Py_UNICODE_JOIN_SURROGATES(Py_UCS4 high, Py_UCS4 low)  {
      27      assert(Py_UNICODE_IS_HIGH_SURROGATE(high));
      28      assert(Py_UNICODE_IS_LOW_SURROGATE(low));
      29      return 0x10000 + (((high & 0x03FF) << 10) | (low & 0x03FF));
      30  }
      31  
      32  // High surrogate = top 10 bits added to 0xD800.
      33  // The character must be in the range [U+10000; U+10ffff].
      34  static inline Py_UCS4 Py_UNICODE_HIGH_SURROGATE(Py_UCS4 ch) {
      35      assert(0x10000 <= ch && ch <= 0x10ffff);
      36      return (0xD800 - (0x10000 >> 10) + (ch >> 10));
      37  }
      38  
      39  // Low surrogate = bottom 10 bits added to 0xDC00.
      40  // The character must be in the range [U+10000; U+10ffff].
      41  static inline Py_UCS4 Py_UNICODE_LOW_SURROGATE(Py_UCS4 ch) {
      42      assert(0x10000 <= ch && ch <= 0x10ffff);
      43      return (0xDC00 + (ch & 0x3FF));
      44  }
      45  
      46  /* --- Unicode Type ------------------------------------------------------- */
      47  
/* ASCII-only strings created through PyUnicode_New use the PyASCIIObject
   structure. state.ascii and state.compact are set, and the data
   immediately follows the structure. utf8_length can be found
   in the length field; the utf8 pointer is equal to the data pointer. */
typedef struct {
    /* There are 3 forms of Unicode strings:

       - compact ascii:

         * structure = PyASCIIObject
         * test: PyUnicode_IS_COMPACT_ASCII(op)
         * kind = PyUnicode_1BYTE_KIND
         * compact = 1
         * ascii = 1
         * (length is the length of the utf8)
         * (data starts just after the structure)
         * (since ASCII is decoded from UTF-8, the utf8 string is the data)

       - compact:

         * structure = PyCompactUnicodeObject
         * test: PyUnicode_IS_COMPACT(op) && !PyUnicode_IS_ASCII(op)
         * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
           PyUnicode_4BYTE_KIND
         * compact = 1
         * ascii = 0
         * utf8 is not shared with data
         * utf8_length = 0 if utf8 is NULL
         * (data starts just after the structure)

       - legacy string:

         * structure = PyUnicodeObject structure
         * test: !PyUnicode_IS_COMPACT(op)
         * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
           PyUnicode_4BYTE_KIND
         * compact = 0
         * data.any is not NULL
         * utf8 is shared and utf8_length = length with data.any if ascii = 1
         * utf8_length = 0 if utf8 is NULL

       Compact strings use only one memory block (structure + characters),
       whereas legacy strings use one block for the structure and one block
       for characters.

       Legacy strings are created by subclasses of Unicode.

       See also _PyUnicode_CheckConsistency().
    */
    PyObject_HEAD
    Py_ssize_t length;          /* Number of code points in the string */
    Py_hash_t hash;             /* Hash value; -1 if not set */
    /* Bit-field flags describing the string; the named fields plus the
       unnamed padding below add up to 32 bits. */
    struct {
        /* If interned is non-zero, the two references from the
           dictionary to this object are *not* counted in ob_refcnt.
           The possible values here are:
               0: Not Interned
               1: Interned
               2: Interned and Immortal
               3: Interned, Immortal, and Static
           This categorization allows the runtime to determine the right
           cleanup mechanism at runtime shutdown. */
        unsigned int interned:2;
        /* Character size:

           - PyUnicode_1BYTE_KIND (1):

             * character type = Py_UCS1 (8 bits, unsigned)
             * all characters are in the range U+0000-U+00FF (latin1)
             * if ascii is set, all characters are in the range U+0000-U+007F
               (ASCII), otherwise at least one character is in the range
               U+0080-U+00FF

           - PyUnicode_2BYTE_KIND (2):

             * character type = Py_UCS2 (16 bits, unsigned)
             * all characters are in the range U+0000-U+FFFF (BMP)
             * at least one character is in the range U+0100-U+FFFF

           - PyUnicode_4BYTE_KIND (4):

             * character type = Py_UCS4 (32 bits, unsigned)
             * all characters are in the range U+0000-U+10FFFF
             * at least one character is in the range U+10000-U+10FFFF
         */
        unsigned int kind:3;
        /* Compact is with respect to the allocation scheme. Compact unicode
           objects only require one memory block while non-compact objects use
           one block for the PyUnicodeObject struct and another for its data
           buffer. */
        unsigned int compact:1;
        /* The string only contains characters in the range U+0000-U+007F (ASCII)
           and the kind is PyUnicode_1BYTE_KIND. If ascii is set and compact is
           set, use the PyASCIIObject structure. */
        unsigned int ascii:1;
        /* Padding to ensure that PyUnicode_DATA() is always aligned to
           4 bytes (see issue #19537 on m68k). */
        unsigned int :25;
    } state;
} PyASCIIObject;
     148  
/* Non-ASCII strings allocated through PyUnicode_New use the
   PyCompactUnicodeObject structure. state.compact is set, and the data
   immediately follows the structure.

   The structure extends PyASCIIObject (embedded as its first member) with
   a separately stored UTF-8 representation. */
typedef struct {
    PyASCIIObject _base;
    Py_ssize_t utf8_length;     /* Number of bytes in utf8, excluding the
                                 * terminating \0. */
    char *utf8;                 /* UTF-8 representation (null-terminated) */
} PyCompactUnicodeObject;
     158  
/* Object format for Unicode subclasses.

   Extends PyCompactUnicodeObject (embedded as its first member) with a
   pointer to the character buffer. The union members are views of the
   same pointer, typed for each character width. */
typedef struct {
    PyCompactUnicodeObject _base;
    union {
        void *any;              /* Untyped view of the buffer */
        Py_UCS1 *latin1;        /* Buffer viewed as Py_UCS1 characters */
        Py_UCS2 *ucs2;          /* Buffer viewed as Py_UCS2 characters */
        Py_UCS4 *ucs4;          /* Buffer viewed as Py_UCS4 characters */
    } data;                     /* Canonical, smallest-form Unicode buffer */
} PyUnicodeObject;
     169  
     170  PyAPI_FUNC(int) _PyUnicode_CheckConsistency(
     171      PyObject *op,
     172      int check_content);
     173  
     174  
     175  #define _PyASCIIObject_CAST(op) \
     176      (assert(PyUnicode_Check(op)), \
     177       _Py_CAST(PyASCIIObject*, (op)))
     178  #define _PyCompactUnicodeObject_CAST(op) \
     179      (assert(PyUnicode_Check(op)), \
     180       _Py_CAST(PyCompactUnicodeObject*, (op)))
     181  #define _PyUnicodeObject_CAST(op) \
     182      (assert(PyUnicode_Check(op)), \
     183       _Py_CAST(PyUnicodeObject*, (op)))
     184  
     185  
     186  /* --- Flexible String Representation Helper Macros (PEP 393) -------------- */
     187  
     188  /* Values for PyASCIIObject.state: */
     189  
     190  /* Interning state. */
     191  #define SSTATE_NOT_INTERNED 0
     192  #define SSTATE_INTERNED_MORTAL 1
     193  #define SSTATE_INTERNED_IMMORTAL 2
     194  #define SSTATE_INTERNED_IMMORTAL_STATIC 3
     195  
     196  /* Use only if you know it's a string */
     197  static inline unsigned int PyUnicode_CHECK_INTERNED(PyObject *op) {
     198      return _PyASCIIObject_CAST(op)->state.interned;
     199  }
     200  #define PyUnicode_CHECK_INTERNED(op) PyUnicode_CHECK_INTERNED(_PyObject_CAST(op))
     201  
     202  /* For backward compatibility */
     203  static inline unsigned int PyUnicode_IS_READY(PyObject* Py_UNUSED(op)) {
     204      return 1;
     205  }
     206  #define PyUnicode_IS_READY(op) PyUnicode_IS_READY(_PyObject_CAST(op))
     207  
     208  /* Return true if the string contains only ASCII characters, or 0 if not. The
     209     string may be compact (PyUnicode_IS_COMPACT_ASCII) or not, but must be
     210     ready. */
     211  static inline unsigned int PyUnicode_IS_ASCII(PyObject *op) {
     212      return _PyASCIIObject_CAST(op)->state.ascii;
     213  }
     214  #define PyUnicode_IS_ASCII(op) PyUnicode_IS_ASCII(_PyObject_CAST(op))
     215  
     216  /* Return true if the string is compact or 0 if not.
     217     No type checks or Ready calls are performed. */
     218  static inline unsigned int PyUnicode_IS_COMPACT(PyObject *op) {
     219      return _PyASCIIObject_CAST(op)->state.compact;
     220  }
     221  #define PyUnicode_IS_COMPACT(op) PyUnicode_IS_COMPACT(_PyObject_CAST(op))
     222  
     223  /* Return true if the string is a compact ASCII string (use PyASCIIObject
     224     structure), or 0 if not.  No type checks or Ready calls are performed. */
     225  static inline int PyUnicode_IS_COMPACT_ASCII(PyObject *op) {
     226      return (_PyASCIIObject_CAST(op)->state.ascii && PyUnicode_IS_COMPACT(op));
     227  }
     228  #define PyUnicode_IS_COMPACT_ASCII(op) PyUnicode_IS_COMPACT_ASCII(_PyObject_CAST(op))
     229  
     230  enum PyUnicode_Kind {
     231  /* Return values of the PyUnicode_KIND() function: */
     232      PyUnicode_1BYTE_KIND = 1,
     233      PyUnicode_2BYTE_KIND = 2,
     234      PyUnicode_4BYTE_KIND = 4
     235  };
     236  
     237  // PyUnicode_KIND(): Return one of the PyUnicode_*_KIND values defined above.
     238  //
     239  // gh-89653: Converting this macro to a static inline function would introduce
     240  // new compiler warnings on "kind < PyUnicode_KIND(str)" (compare signed and
     241  // unsigned numbers) where kind type is an int or on
     242  // "unsigned int kind = PyUnicode_KIND(str)" (cast signed to unsigned).
     243  #define PyUnicode_KIND(op) _Py_RVALUE(_PyASCIIObject_CAST(op)->state.kind)
     244  
     245  /* Return a void pointer to the raw unicode buffer. */
     246  static inline void* _PyUnicode_COMPACT_DATA(PyObject *op) {
     247      if (PyUnicode_IS_ASCII(op)) {
     248          return _Py_STATIC_CAST(void*, (_PyASCIIObject_CAST(op) + 1));
     249      }
     250      return _Py_STATIC_CAST(void*, (_PyCompactUnicodeObject_CAST(op) + 1));
     251  }
     252  
     253  static inline void* _PyUnicode_NONCOMPACT_DATA(PyObject *op) {
     254      void *data;
     255      assert(!PyUnicode_IS_COMPACT(op));
     256      data = _PyUnicodeObject_CAST(op)->data.any;
     257      assert(data != NULL);
     258      return data;
     259  }
     260  
     261  static inline void* PyUnicode_DATA(PyObject *op) {
     262      if (PyUnicode_IS_COMPACT(op)) {
     263          return _PyUnicode_COMPACT_DATA(op);
     264      }
     265      return _PyUnicode_NONCOMPACT_DATA(op);
     266  }
     267  #define PyUnicode_DATA(op) PyUnicode_DATA(_PyObject_CAST(op))
     268  
     269  /* Return pointers to the canonical representation cast to unsigned char,
     270     Py_UCS2, or Py_UCS4 for direct character access.
     271     No checks are performed, use PyUnicode_KIND() before to ensure
     272     these will work correctly. */
     273  
     274  #define PyUnicode_1BYTE_DATA(op) _Py_STATIC_CAST(Py_UCS1*, PyUnicode_DATA(op))
     275  #define PyUnicode_2BYTE_DATA(op) _Py_STATIC_CAST(Py_UCS2*, PyUnicode_DATA(op))
     276  #define PyUnicode_4BYTE_DATA(op) _Py_STATIC_CAST(Py_UCS4*, PyUnicode_DATA(op))
     277  
     278  /* Returns the length of the unicode string. */
     279  static inline Py_ssize_t PyUnicode_GET_LENGTH(PyObject *op) {
     280      return _PyASCIIObject_CAST(op)->length;
     281  }
     282  #define PyUnicode_GET_LENGTH(op) PyUnicode_GET_LENGTH(_PyObject_CAST(op))
     283  
     284  /* Write into the canonical representation, this function does not do any sanity
     285     checks and is intended for usage in loops.  The caller should cache the
     286     kind and data pointers obtained from other function calls.
     287     index is the index in the string (starts at 0) and value is the new
     288     code point value which should be written to that location. */
     289  static inline void PyUnicode_WRITE(int kind, void *data,
     290                                     Py_ssize_t index, Py_UCS4 value)
     291  {
     292      assert(index >= 0);
     293      if (kind == PyUnicode_1BYTE_KIND) {
     294          assert(value <= 0xffU);
     295          _Py_STATIC_CAST(Py_UCS1*, data)[index] = _Py_STATIC_CAST(Py_UCS1, value);
     296      }
     297      else if (kind == PyUnicode_2BYTE_KIND) {
     298          assert(value <= 0xffffU);
     299          _Py_STATIC_CAST(Py_UCS2*, data)[index] = _Py_STATIC_CAST(Py_UCS2, value);
     300      }
     301      else {
     302          assert(kind == PyUnicode_4BYTE_KIND);
     303          assert(value <= 0x10ffffU);
     304          _Py_STATIC_CAST(Py_UCS4*, data)[index] = value;
     305      }
     306  }
     307  #define PyUnicode_WRITE(kind, data, index, value) \
     308      PyUnicode_WRITE(_Py_STATIC_CAST(int, kind), _Py_CAST(void*, data), \
     309                      (index), _Py_STATIC_CAST(Py_UCS4, value))
     310  
     311  /* Read a code point from the string's canonical representation.  No checks
     312     or ready calls are performed. */
     313  static inline Py_UCS4 PyUnicode_READ(int kind,
     314                                       const void *data, Py_ssize_t index)
     315  {
     316      assert(index >= 0);
     317      if (kind == PyUnicode_1BYTE_KIND) {
     318          return _Py_STATIC_CAST(const Py_UCS1*, data)[index];
     319      }
     320      if (kind == PyUnicode_2BYTE_KIND) {
     321          return _Py_STATIC_CAST(const Py_UCS2*, data)[index];
     322      }
     323      assert(kind == PyUnicode_4BYTE_KIND);
     324      return _Py_STATIC_CAST(const Py_UCS4*, data)[index];
     325  }
     326  #define PyUnicode_READ(kind, data, index) \
     327      PyUnicode_READ(_Py_STATIC_CAST(int, kind), \
     328                     _Py_STATIC_CAST(const void*, data), \
     329                     (index))
     330  
     331  /* PyUnicode_READ_CHAR() is less efficient than PyUnicode_READ() because it
     332     calls PyUnicode_KIND() and might call it twice.  For single reads, use
     333     PyUnicode_READ_CHAR, for multiple consecutive reads callers should
     334     cache kind and use PyUnicode_READ instead. */
     335  static inline Py_UCS4 PyUnicode_READ_CHAR(PyObject *unicode, Py_ssize_t index)
     336  {
     337      int kind;
     338  
     339      assert(index >= 0);
     340      // Tolerate reading the NUL character at str[len(str)]
     341      assert(index <= PyUnicode_GET_LENGTH(unicode));
     342  
     343      kind = PyUnicode_KIND(unicode);
     344      if (kind == PyUnicode_1BYTE_KIND) {
     345          return PyUnicode_1BYTE_DATA(unicode)[index];
     346      }
     347      if (kind == PyUnicode_2BYTE_KIND) {
     348          return PyUnicode_2BYTE_DATA(unicode)[index];
     349      }
     350      assert(kind == PyUnicode_4BYTE_KIND);
     351      return PyUnicode_4BYTE_DATA(unicode)[index];
     352  }
     353  #define PyUnicode_READ_CHAR(unicode, index) \
     354      PyUnicode_READ_CHAR(_PyObject_CAST(unicode), (index))
     355  
     356  /* Return a maximum character value which is suitable for creating another
     357     string based on op.  This is always an approximation but more efficient
     358     than iterating over the string. */
     359  static inline Py_UCS4 PyUnicode_MAX_CHAR_VALUE(PyObject *op)
     360  {
     361      int kind;
     362  
     363      if (PyUnicode_IS_ASCII(op)) {
     364          return 0x7fU;
     365      }
     366  
     367      kind = PyUnicode_KIND(op);
     368      if (kind == PyUnicode_1BYTE_KIND) {
     369         return 0xffU;
     370      }
     371      if (kind == PyUnicode_2BYTE_KIND) {
     372          return 0xffffU;
     373      }
     374      assert(kind == PyUnicode_4BYTE_KIND);
     375      return 0x10ffffU;
     376  }
     377  #define PyUnicode_MAX_CHAR_VALUE(op) \
     378      PyUnicode_MAX_CHAR_VALUE(_PyObject_CAST(op))
     379  
     380  /* === Public API ========================================================= */
     381  
     382  /* --- Plain Py_UNICODE --------------------------------------------------- */
     383  
     384  /* With PEP 393, this is the recommended way to allocate a new unicode object.
     385     This function will allocate the object and its buffer in a single memory
     386     block.  Objects created using this function are not resizable. */
     387  PyAPI_FUNC(PyObject*) PyUnicode_New(
     388      Py_ssize_t size,            /* Number of code points in the new string */
     389      Py_UCS4 maxchar             /* maximum code point value in the string */
     390      );
     391  
     392  /* For backward compatibility */
     393  static inline int PyUnicode_READY(PyObject* Py_UNUSED(op))
     394  {
     395      return 0;
     396  }
     397  #define PyUnicode_READY(op) PyUnicode_READY(_PyObject_CAST(op))
     398  
     399  /* Get a copy of a Unicode string. */
     400  PyAPI_FUNC(PyObject*) _PyUnicode_Copy(
     401      PyObject *unicode
     402      );
     403  
     404  /* Copy character from one unicode object into another, this function performs
     405     character conversion when necessary and falls back to memcpy() if possible.
     406  
     407     Fail if to is too small (smaller than *how_many* or smaller than
     408     len(from)-from_start), or if kind(from[from_start:from_start+how_many]) >
     409     kind(to), or if *to* has more than 1 reference.
     410  
     411     Return the number of written character, or return -1 and raise an exception
     412     on error.
     413  
     414     Pseudo-code:
     415  
     416         how_many = min(how_many, len(from) - from_start)
     417         to[to_start:to_start+how_many] = from[from_start:from_start+how_many]
     418         return how_many
     419  
     420     Note: The function doesn't write a terminating null character.
     421     */
     422  PyAPI_FUNC(Py_ssize_t) PyUnicode_CopyCharacters(
     423      PyObject *to,
     424      Py_ssize_t to_start,
     425      PyObject *from,
     426      Py_ssize_t from_start,
     427      Py_ssize_t how_many
     428      );
     429  
     430  /* Unsafe version of PyUnicode_CopyCharacters(): don't check arguments and so
     431     may crash if parameters are invalid (e.g. if the output string
     432     is too short). */
     433  PyAPI_FUNC(void) _PyUnicode_FastCopyCharacters(
     434      PyObject *to,
     435      Py_ssize_t to_start,
     436      PyObject *from,
     437      Py_ssize_t from_start,
     438      Py_ssize_t how_many
     439      );
     440  
     441  /* Fill a string with a character: write fill_char into
     442     unicode[start:start+length].
     443  
     444     Fail if fill_char is bigger than the string maximum character, or if the
     445     string has more than 1 reference.
     446  
     447     Return the number of written character, or return -1 and raise an exception
     448     on error. */
     449  PyAPI_FUNC(Py_ssize_t) PyUnicode_Fill(
     450      PyObject *unicode,
     451      Py_ssize_t start,
     452      Py_ssize_t length,
     453      Py_UCS4 fill_char
     454      );
     455  
     456  /* Unsafe version of PyUnicode_Fill(): don't check arguments and so may crash
     457     if parameters are invalid (e.g. if length is longer than the string). */
     458  PyAPI_FUNC(void) _PyUnicode_FastFill(
     459      PyObject *unicode,
     460      Py_ssize_t start,
     461      Py_ssize_t length,
     462      Py_UCS4 fill_char
     463      );
     464  
     465  /* Create a new string from a buffer of Py_UCS1, Py_UCS2 or Py_UCS4 characters.
     466     Scan the string to find the maximum character. */
     467  PyAPI_FUNC(PyObject*) PyUnicode_FromKindAndData(
     468      int kind,
     469      const void *buffer,
     470      Py_ssize_t size);
     471  
     472  /* Create a new string from a buffer of ASCII characters.
     473     WARNING: Don't check if the string contains any non-ASCII character. */
     474  PyAPI_FUNC(PyObject*) _PyUnicode_FromASCII(
     475      const char *buffer,
     476      Py_ssize_t size);
     477  
     478  /* Compute the maximum character of the substring unicode[start:end].
     479     Return 127 for an empty string. */
     480  PyAPI_FUNC(Py_UCS4) _PyUnicode_FindMaxChar (
     481      PyObject *unicode,
     482      Py_ssize_t start,
     483      Py_ssize_t end);
     484  
     485  /* --- _PyUnicodeWriter API ----------------------------------------------- */
     486  
/* State of a Unicode writer. Initialize it with _PyUnicodeWriter_Init()
   before use; min_length, min_char and overallocate may then be set to
   control how the buffer is allocated. */
typedef struct {
    PyObject *buffer;
    void *data;
    int kind;
    Py_UCS4 maxchar;
    /* size and pos: _PyUnicodeWriter_Prepare() treats (size - pos) as the
       number of characters that can still be written without preparing
       the buffer again. */
    Py_ssize_t size;
    Py_ssize_t pos;

    /* minimum number of allocated characters (default: 0) */
    Py_ssize_t min_length;

    /* minimum character (default: 127, ASCII) */
    Py_UCS4 min_char;

    /* If non-zero, overallocate the buffer (default: 0). */
    unsigned char overallocate;

    /* If readonly is 1, buffer is a shared string (cannot be modified)
       and size is set to 0. */
    unsigned char readonly;
} _PyUnicodeWriter ;
     508  
     509  /* Initialize a Unicode writer.
     510   *
     511   * By default, the minimum buffer size is 0 character and overallocation is
     512   * disabled. Set min_length, min_char and overallocate attributes to control
     513   * the allocation of the buffer. */
     514  PyAPI_FUNC(void)
     515  _PyUnicodeWriter_Init(_PyUnicodeWriter *writer);
     516  
     517  /* Prepare the buffer to write 'length' characters
     518     with the specified maximum character.
     519  
     520     Return 0 on success, raise an exception and return -1 on error. */
     521  #define _PyUnicodeWriter_Prepare(WRITER, LENGTH, MAXCHAR)             \
     522      (((MAXCHAR) <= (WRITER)->maxchar                                  \
     523        && (LENGTH) <= (WRITER)->size - (WRITER)->pos)                  \
     524       ? 0                                                              \
     525       : (((LENGTH) == 0)                                               \
     526          ? 0                                                           \
     527          : _PyUnicodeWriter_PrepareInternal((WRITER), (LENGTH), (MAXCHAR))))
     528  
     529  /* Don't call this function directly, use the _PyUnicodeWriter_Prepare() macro
     530     instead. */
     531  PyAPI_FUNC(int)
     532  _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
     533                                   Py_ssize_t length, Py_UCS4 maxchar);
     534  
     535  /* Prepare the buffer to have at least the kind KIND.
     536     For example, kind=PyUnicode_2BYTE_KIND ensures that the writer will
     537     support characters in range U+000-U+FFFF.
     538  
     539     Return 0 on success, raise an exception and return -1 on error. */
     540  #define _PyUnicodeWriter_PrepareKind(WRITER, KIND)                    \
     541      ((KIND) <= (WRITER)->kind                                         \
     542       ? 0                                                              \
     543       : _PyUnicodeWriter_PrepareKindInternal((WRITER), (KIND)))
     544  
     545  /* Don't call this function directly, use the _PyUnicodeWriter_PrepareKind()
     546     macro instead. */
     547  PyAPI_FUNC(int)
     548  _PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
     549                                       int kind);
     550  
     551  /* Append a Unicode character.
     552     Return 0 on success, raise an exception and return -1 on error. */
     553  PyAPI_FUNC(int)
     554  _PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer,
     555      Py_UCS4 ch
     556      );
     557  
     558  /* Append a Unicode string.
     559     Return 0 on success, raise an exception and return -1 on error. */
     560  PyAPI_FUNC(int)
     561  _PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer,
     562      PyObject *str               /* Unicode string */
     563      );
     564  
     565  /* Append a substring of a Unicode string.
     566     Return 0 on success, raise an exception and return -1 on error. */
     567  PyAPI_FUNC(int)
     568  _PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer,
     569      PyObject *str,              /* Unicode string */
     570      Py_ssize_t start,
     571      Py_ssize_t end
     572      );
     573  
     574  /* Append an ASCII-encoded byte string.
     575     Return 0 on success, raise an exception and return -1 on error. */
     576  PyAPI_FUNC(int)
     577  _PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
     578      const char *str,           /* ASCII-encoded byte string */
     579      Py_ssize_t len             /* number of bytes, or -1 if unknown */
     580      );
     581  
     582  /* Append a latin1-encoded byte string.
     583     Return 0 on success, raise an exception and return -1 on error. */
     584  PyAPI_FUNC(int)
     585  _PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
     586      const char *str,           /* latin1-encoded byte string */
     587      Py_ssize_t len             /* length in bytes */
     588      );
     589  
     590  /* Get the value of the writer as a Unicode string. Clear the
     591     buffer of the writer. Raise an exception and return NULL
     592     on error. */
     593  PyAPI_FUNC(PyObject *)
     594  _PyUnicodeWriter_Finish(_PyUnicodeWriter *writer);
     595  
     596  /* Deallocate memory of a writer (clear its internal buffer). */
     597  PyAPI_FUNC(void)
     598  _PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer);
     599  
     600  
     601  /* Format the object based on the format_spec, as defined in PEP 3101
     602     (Advanced String Formatting). */
     603  PyAPI_FUNC(int) _PyUnicode_FormatAdvancedWriter(
     604      _PyUnicodeWriter *writer,
     605      PyObject *obj,
     606      PyObject *format_spec,
     607      Py_ssize_t start,
     608      Py_ssize_t end);
     609  
     610  /* --- Manage the default encoding ---------------------------------------- */
     611  
     612  /* Returns a pointer to the default encoding (UTF-8) of the
     613     Unicode object unicode.
     614  
     615     Like PyUnicode_AsUTF8AndSize(), this also caches the UTF-8 representation
     616     in the unicodeobject.
     617  
     618     _PyUnicode_AsString is a #define for PyUnicode_AsUTF8 to
     619     support the previous internal function with the same behaviour.
     620  
     621     Use of this API is DEPRECATED since no size information can be
     622     extracted from the returned data.
     623  */
     624  
     625  PyAPI_FUNC(const char *) PyUnicode_AsUTF8(PyObject *unicode);
     626  
     627  #define _PyUnicode_AsString PyUnicode_AsUTF8
     628  
     629  /* --- UTF-7 Codecs ------------------------------------------------------- */
     630  
     631  PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF7(
     632      PyObject *unicode,          /* Unicode object */
     633      int base64SetO,             /* Encode RFC2152 Set O characters in base64 */
     634      int base64WhiteSpace,       /* Encode whitespace (sp, ht, nl, cr) in base64 */
     635      const char *errors          /* error handling */
     636      );
     637  
     638  /* --- UTF-8 Codecs ------------------------------------------------------- */
     639  
     640  PyAPI_FUNC(PyObject*) _PyUnicode_AsUTF8String(
     641      PyObject *unicode,
     642      const char *errors);
     643  
     644  /* --- UTF-32 Codecs ------------------------------------------------------ */
     645  
     646  PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF32(
     647      PyObject *object,           /* Unicode object */
     648      const char *errors,         /* error handling */
     649      int byteorder               /* byteorder to use 0=BOM+native;-1=LE,1=BE */
     650      );
     651  
     652  /* --- UTF-16 Codecs ------------------------------------------------------ */
     653  
     654  /* Returns a Python string object holding the UTF-16 encoded value of
     655     the Unicode data.
     656  
     657     If byteorder is not 0, output is written according to the following
     658     byte order:
     659  
     660     byteorder == -1: little endian
     661     byteorder == 0:  native byte order (writes a BOM mark)
     662     byteorder == 1:  big endian
     663  
     664     If byteorder is 0, the output string will always start with the
     665     Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
     666     prepended.
     667  */
     668  PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF16(
     669      PyObject* unicode,          /* Unicode object */
     670      const char *errors,         /* error handling */
     671      int byteorder               /* byteorder to use 0=BOM+native;-1=LE,1=BE */
     672      );
     673  
     674  /* --- Unicode-Escape Codecs ---------------------------------------------- */
     675  
     676  /* Variant of PyUnicode_DecodeUnicodeEscape that supports partial decoding. */
     677  PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscapeStateful(
     678          const char *string,     /* Unicode-Escape encoded string */
     679          Py_ssize_t length,      /* size of string */
     680          const char *errors,     /* error handling */
     681          Py_ssize_t *consumed    /* bytes consumed */
     682  );
/* Helper for PyUnicode_DecodeUnicodeEscape that detects invalid escape
   chars.  It takes the same arguments as the stateful variant plus the
   `first_invalid_escape` output pointer.
   NOTE(review): what `*first_invalid_escape` holds when the input contains
   no invalid escape (presumably NULL) is not visible here -- confirm
   against the implementation. */
PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscapeInternal(
        const char *string,     /* Unicode-Escape encoded string */
        Py_ssize_t length,      /* size of string */
        const char *errors,     /* error handling */
        Py_ssize_t *consumed,   /* bytes consumed */
        const char **first_invalid_escape  /* on return, points to first
                                              invalid escaped char in
                                              string. */
);
     694  
     695  /* --- Raw-Unicode-Escape Codecs ---------------------------------------------- */
     696  
/* Variant of PyUnicode_DecodeRawUnicodeEscape that supports partial decoding.
   NOTE(review): `consumed` presumably receives the number of input bytes
   that were actually decoded -- confirm against the implementation. */
PyAPI_FUNC(PyObject*) _PyUnicode_DecodeRawUnicodeEscapeStateful(
        const char *string,     /* Raw-Unicode-Escape encoded string */
        Py_ssize_t length,      /* size of string */
        const char *errors,     /* error handling */
        Py_ssize_t *consumed    /* bytes consumed */
);
     704  
     705  /* --- Latin-1 Codecs ----------------------------------------------------- */
     706  
/* Latin-1 encoder taking the Unicode object to encode and the name of the
   error handling scheme.
   NOTE(review): presumably returns a new bytes object, or NULL with an
   exception set on failure -- confirm against the implementation. */
PyAPI_FUNC(PyObject*) _PyUnicode_AsLatin1String(
    PyObject* unicode,
    const char* errors);
     710  
     711  /* --- ASCII Codecs ------------------------------------------------------- */
     712  
/* ASCII encoder taking the Unicode object to encode and the name of the
   error handling scheme.
   NOTE(review): presumably returns a new bytes object, or NULL with an
   exception set on failure -- confirm against the implementation. */
PyAPI_FUNC(PyObject*) _PyUnicode_AsASCIIString(
    PyObject* unicode,
    const char* errors);
     716  
     717  /* --- Character Map Codecs ----------------------------------------------- */
     718  
/* Translate a Unicode object by applying a character mapping table to
   it and return the resulting Unicode object.

   The mapping table must map Unicode ordinal integers to Unicode strings,
   Unicode ordinal integers or None (causing deletion of the character).

   Mapping tables may be dictionaries or sequences. Unmapped character
   ordinals (ones which cause a LookupError) are left untouched and
   are copied as-is.

   NOTE(review): the text above describes a translation that yields a
   Unicode object, whereas the function declared below is named as an
   encoder and takes an "encoding mapping".  The comment may belong to a
   different (translate) API -- confirm and reword or move it.
*/
PyAPI_FUNC(PyObject*) _PyUnicode_EncodeCharmap(
    PyObject *unicode,          /* Unicode object */
    PyObject *mapping,          /* encoding mapping */
    const char *errors          /* error handling */
    );
     734  
     735  /* --- Decimal Encoder ---------------------------------------------------- */
     736  
/* Converts a Unicode object holding a decimal value to an ASCII string
   for use in the int, float and complex parsers.
   Transforms code points that have the decimal digit property to the
   corresponding ASCII digit code points.  Transforms spaces to ASCII.
   Transforms every code point from the first non-ASCII code point that
   is neither a decimal digit nor a space through to the end into '?'. */

PyAPI_FUNC(PyObject*) _PyUnicode_TransformDecimalAndSpaceToASCII(
    PyObject *unicode           /* Unicode object */
    );
     747  
     748  /* --- Methods & Slots ---------------------------------------------------- */
     749  
/* Join operation over a C array of objects.
   NOTE(review): presumably concatenates the `seqlen` objects found in
   `items`, inserting `separator` between them, and returns the resulting
   string (NULL on error) -- confirm against the implementation. */
PyAPI_FUNC(PyObject *) _PyUnicode_JoinArray(
    PyObject *separator,
    PyObject *const *items,
    Py_ssize_t seqlen
    );
     755  
/* Test whether a unicode object is equal to an ASCII identifier.  Return 1
   if true, 0 otherwise.  The right argument must be an ASCII identifier.
   Any error that occurs inside is cleared before returning. */
PyAPI_FUNC(int) _PyUnicode_EqualToASCIIId(
    PyObject *left,             /* Left string */
    _Py_Identifier *right       /* Right identifier */
    );
     763  
/* Test whether a unicode object is equal to an ASCII string.  Return 1 if
   true, 0 otherwise.  The right argument must be an ASCII-encoded string.
   Any error that occurs inside is cleared before returning. */
PyAPI_FUNC(int) _PyUnicode_EqualToASCIIString(
    PyObject *left,
    const char *right           /* ASCII-encoded string */
    );
     771  
/* Externally visible for str.strip(unicode).
   NOTE(review): `striptype` presumably selects which end(s) of `self` are
   stripped and `sepobj` supplies the characters to remove -- the accepted
   values are not visible in this header; confirm against the
   implementation. */
PyAPI_FUNC(PyObject *) _PyUnicode_XStrip(
    PyObject *self,
    int striptype,
    PyObject *sepobj
    );
     778  
/* Using explicit passed-in values, insert the thousands grouping
   into the string pointed to by buffer.  For the argument descriptions,
   see Objects/stringlib/localeutil.h

   NOTE(review): no parameter below is actually named "buffer"; the
   declaration takes a `writer` and an `n_buffer` instead.  The sentence
   above presumably predates the current signature -- confirm and reword. */
PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGrouping(
    _PyUnicodeWriter *writer,
    Py_ssize_t n_buffer,
    PyObject *digits,
    Py_ssize_t d_pos,
    Py_ssize_t n_digits,
    Py_ssize_t min_width,
    const char *grouping,
    PyObject *thousands_sep,
    Py_UCS4 *maxchar);
     792  
     793  /* === Characters Type APIs =============================================== */
     794  
     795  /* These should not be used directly. Use the Py_UNICODE_IS* and
     796     Py_UNICODE_TO* macros instead.
     797  
     798     These APIs are implemented in Objects/unicodectype.c.
     799  
     800  */
     801  
/* Character classification predicates, each taking a single code point.
   NOTE(review): each presumably returns nonzero when `ch` has the property
   named by the function and 0 otherwise -- confirm in
   Objects/unicodectype.c. */

PyAPI_FUNC(int) _PyUnicode_IsLowercase(
    Py_UCS4 ch       /* Unicode character */
    );

PyAPI_FUNC(int) _PyUnicode_IsUppercase(
    Py_UCS4 ch       /* Unicode character */
    );

PyAPI_FUNC(int) _PyUnicode_IsTitlecase(
    Py_UCS4 ch       /* Unicode character */
    );

PyAPI_FUNC(int) _PyUnicode_IsXidStart(
    Py_UCS4 ch       /* Unicode character */
    );

PyAPI_FUNC(int) _PyUnicode_IsXidContinue(
    Py_UCS4 ch       /* Unicode character */
    );

/* Called by Py_UNICODE_ISSPACE() for code points outside the ASCII range. */
PyAPI_FUNC(int) _PyUnicode_IsWhitespace(
    const Py_UCS4 ch         /* Unicode character */
    );

PyAPI_FUNC(int) _PyUnicode_IsLinebreak(
    const Py_UCS4 ch         /* Unicode character */
    );
     829  
/* Single code point case conversions: one code point in, one code point
   out.  Note that the Py_DEPRECATED(3.3) marker is commented out on the
   first two declarations, so only _PyUnicode_ToTitlecase carries an active
   deprecation marker; the Py_UNICODE_TOTITLE() macro expands to that
   function. */

/* Py_DEPRECATED(3.3) */ PyAPI_FUNC(Py_UCS4) _PyUnicode_ToLowercase(
    Py_UCS4 ch       /* Unicode character */
    );

/* Py_DEPRECATED(3.3) */ PyAPI_FUNC(Py_UCS4) _PyUnicode_ToUppercase(
    Py_UCS4 ch       /* Unicode character */
    );

Py_DEPRECATED(3.3) PyAPI_FUNC(Py_UCS4) _PyUnicode_ToTitlecase(
    Py_UCS4 ch       /* Unicode character */
    );
     841  
/* Case mappings that report their result through the `res` output pointer.
   NOTE(review): presumably `res` receives the mapped code point(s) and the
   int return value is the number of code points written; the required
   capacity of `res` is not visible in this header -- confirm in
   Objects/unicodectype.c before calling. */

PyAPI_FUNC(int) _PyUnicode_ToLowerFull(
    Py_UCS4 ch,       /* Unicode character */
    Py_UCS4 *res
    );

PyAPI_FUNC(int) _PyUnicode_ToTitleFull(
    Py_UCS4 ch,       /* Unicode character */
    Py_UCS4 *res
    );

PyAPI_FUNC(int) _PyUnicode_ToUpperFull(
    Py_UCS4 ch,       /* Unicode character */
    Py_UCS4 *res
    );

PyAPI_FUNC(int) _PyUnicode_ToFoldedFull(
    Py_UCS4 ch,       /* Unicode character */
    Py_UCS4 *res
    );
     861  
/* Predicates on the casing properties of a single code point.
   NOTE(review): presumably return nonzero when `ch` has the named property
   and 0 otherwise -- confirm in Objects/unicodectype.c. */

PyAPI_FUNC(int) _PyUnicode_IsCaseIgnorable(
    Py_UCS4 ch         /* Unicode character */
    );

PyAPI_FUNC(int) _PyUnicode_IsCased(
    Py_UCS4 ch         /* Unicode character */
    );
     869  
/* Numeric value look-ups for a single code point; exposed through the
   Py_UNICODE_TODECIMAL(), Py_UNICODE_TODIGIT() and Py_UNICODE_TONUMERIC()
   macros.
   NOTE(review): presumably a negative result signals that `ch` has no such
   value -- confirm in Objects/unicodectype.c. */

PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit(
    Py_UCS4 ch       /* Unicode character */
    );

PyAPI_FUNC(int) _PyUnicode_ToDigit(
    Py_UCS4 ch       /* Unicode character */
    );

PyAPI_FUNC(double) _PyUnicode_ToNumeric(
    Py_UCS4 ch       /* Unicode character */
    );
     881  
/* Further classification predicates; exposed through the
   Py_UNICODE_ISDECIMAL(), Py_UNICODE_ISDIGIT(), Py_UNICODE_ISNUMERIC(),
   Py_UNICODE_ISPRINTABLE() and Py_UNICODE_ISALPHA() macros.
   NOTE(review): each presumably returns nonzero when `ch` has the named
   property and 0 otherwise -- confirm in Objects/unicodectype.c. */

PyAPI_FUNC(int) _PyUnicode_IsDecimalDigit(
    Py_UCS4 ch       /* Unicode character */
    );

PyAPI_FUNC(int) _PyUnicode_IsDigit(
    Py_UCS4 ch       /* Unicode character */
    );

PyAPI_FUNC(int) _PyUnicode_IsNumeric(
    Py_UCS4 ch       /* Unicode character */
    );

PyAPI_FUNC(int) _PyUnicode_IsPrintable(
    Py_UCS4 ch       /* Unicode character */
    );

PyAPI_FUNC(int) _PyUnicode_IsAlpha(
    Py_UCS4 ch       /* Unicode character */
    );
     901  
// Helper array used by Py_UNICODE_ISSPACE(): it is indexed by code point
// for values below 128 and the entry is returned as the result.
// NOTE(review): the array size is not declared here; it presumably has at
// least 128 entries -- confirm at the definition.
PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[];
     904  
     905  // Since splitting on whitespace is an important use case, and
     906  // whitespace in most situations is solely ASCII whitespace, we
     907  // optimize for the common case by using a quick look-up table
     908  // _Py_ascii_whitespace (see below) with an inlined check.
     909  static inline int Py_UNICODE_ISSPACE(Py_UCS4 ch) {
     910      if (ch < 128) {
     911          return _Py_ascii_whitespace[ch];
     912      }
     913      return _PyUnicode_IsWhitespace(ch);
     914  }
     915  
/* Public-facing aliases: each macro below expands directly to a call of
   the corresponding _PyUnicode_* function with the same argument. */

/* Case and line-break classification. */
#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch)
#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch)
#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)

/* Single code point case conversion. */
#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch)
#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch)
#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)

/* Numeric and printable classification. */
#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
#define Py_UNICODE_ISPRINTABLE(ch) _PyUnicode_IsPrintable(ch)

/* Numeric value look-up. */
#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)

/* Alphabetic classification. */
#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch)
     935  
     936  static inline int Py_UNICODE_ISALNUM(Py_UCS4 ch) {
     937     return (Py_UNICODE_ISALPHA(ch)
     938             || Py_UNICODE_ISDECIMAL(ch)
     939             || Py_UNICODE_ISDIGIT(ch)
     940             || Py_UNICODE_ISNUMERIC(ch));
     941  }
     942  
     943  
     944  /* === Misc functions ===================================================== */
     945  
/* NOTE(review): the object and the three int parameters are unnamed in
   this declaration, so their meaning cannot be read off the header; see
   the function definition before calling. */
PyAPI_FUNC(PyObject*) _PyUnicode_FormatLong(PyObject *, int, int, int);
     947  
/* Return an interned Unicode object for an Identifier; may fail if there is
   no memory.
   NOTE(review): failure is presumably reported by returning NULL -- confirm
   against the implementation. */
PyAPI_FUNC(PyObject*) _PyUnicode_FromId(_Py_Identifier*);
     950  
/* Fast equality check for when the inputs are known to be exact unicode
   types and their hash values are equal (i.e. a very probable match).
   NOTE(review): presumably returns nonzero when the two strings are equal
   and does not itself verify the preconditions above -- confirm against the
   implementation. */
PyAPI_FUNC(int) _PyUnicode_EQ(PyObject *, PyObject *);
     954  
/* Equality check.
   NOTE(review): the requirements on the two arguments and the exact return
   convention (presumably nonzero when equal) are not visible in this
   header -- confirm against the implementation. */
PyAPI_FUNC(int) _PyUnicode_Equal(PyObject *, PyObject *);
     957  
/* NOTE(review): the names and the (PyObject *, void *) -> int shape suggest
   these are converter callbacks for the "O&" argument-parsing format that
   produce a wide-character string, with the _Opt_ variant presumably also
   accepting None -- confirm against the implementation. */
PyAPI_FUNC(int) _PyUnicode_WideCharString_Converter(PyObject *, void *);
PyAPI_FUNC(int) _PyUnicode_WideCharString_Opt_Converter(PyObject *, void *);
     960  
/* NOTE(review): presumably scans the given string as an identifier and
   returns the length of the valid identifier prefix -- the exact return
   convention, including how errors are reported, is not visible in this
   header; confirm against the implementation. */
PyAPI_FUNC(Py_ssize_t) _PyUnicode_ScanIdentifier(PyObject *);