1  #ifndef Py_UNICODEOBJECT_H
       2  #define Py_UNICODEOBJECT_H
       3  
       4  #include <stdarg.h>               // va_list
       5  
       6  /*
       7  
       8  Unicode implementation based on original code by Fredrik Lundh,
       9  modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
      10  Unicode Integration Proposal. (See
      11  http://www.egenix.com/files/python/unicode-proposal.txt).
      12  
      13  Copyright (c) Corporation for National Research Initiatives.
      14  
      15  
      16   Original header:
      17   --------------------------------------------------------------------
      18  
      19   * Yet another Unicode string type for Python.  This type supports the
      20   * 16-bit Basic Multilingual Plane (BMP) only.
      21   *
      22   * Written by Fredrik Lundh, January 1999.
      23   *
      24   * Copyright (c) 1999 by Secret Labs AB.
      25   * Copyright (c) 1999 by Fredrik Lundh.
      26   *
      27   * fredrik@pythonware.com
      28   * http://www.pythonware.com
      29   *
      30   * --------------------------------------------------------------------
      31   * This Unicode String Type is
      32   *
      33   * Copyright (c) 1999 by Secret Labs AB
      34   * Copyright (c) 1999 by Fredrik Lundh
      35   *
      36   * By obtaining, using, and/or copying this software and/or its
      37   * associated documentation, you agree that you have read, understood,
      38   * and will comply with the following terms and conditions:
      39   *
      40   * Permission to use, copy, modify, and distribute this software and its
      41   * associated documentation for any purpose and without fee is hereby
      42   * granted, provided that the above copyright notice appears in all
      43   * copies, and that both that copyright notice and this permission notice
      44   * appear in supporting documentation, and that the name of Secret Labs
      45   * AB or the author not be used in advertising or publicity pertaining to
      46   * distribution of the software without specific, written prior
      47   * permission.
      48   *
      49   * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
      50   * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
      51   * FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
      52   * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
      53   * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
      54   * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
      55   * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
      56   * -------------------------------------------------------------------- */
      57  
      58  #include <ctype.h>
      59  
      60  /* === Internal API ======================================================= */
      61  
      62  /* --- Internal Unicode Format -------------------------------------------- */
      63  
      64  /* Python 3.x requires unicode */
      65  #define Py_USING_UNICODE
      66  
      67  #ifndef SIZEOF_WCHAR_T
      68  #error Must define SIZEOF_WCHAR_T
      69  #endif
      70  
      71  #define Py_UNICODE_SIZE SIZEOF_WCHAR_T
      72  
      73  /* If wchar_t can be used for UCS-4 storage, set Py_UNICODE_WIDE.
      74     Otherwise, Unicode strings are stored as UCS-2 (with limited support
      75     for UTF-16) */
      76  
      77  #if Py_UNICODE_SIZE >= 4
      78  #define Py_UNICODE_WIDE
      79  #endif
      80  
      81  /* Set these flags if the platform has "wchar.h" and the
      82     wchar_t type is a 16-bit unsigned type */
      83  /* #define HAVE_WCHAR_H */
      84  /* #define HAVE_USABLE_WCHAR_T */
      85  
      86  /* If the compiler provides a wchar_t type we try to support it
      87     through the interface functions PyUnicode_FromWideChar(),
      88     PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(). */
      89  
      90  #ifdef HAVE_USABLE_WCHAR_T
      91  # ifndef HAVE_WCHAR_H
      92  #  define HAVE_WCHAR_H
      93  # endif
      94  #endif
      95  
      96  #ifdef HAVE_WCHAR_H
      97  #  include <wchar.h>
      98  #endif
      99  
     100  /* Py_UCS4, Py_UCS2 and Py_UCS1 are typedefs for the respective
     101     unicode representations. */
     102  typedef uint32_t Py_UCS4;
     103  typedef uint16_t Py_UCS2;
     104  typedef uint8_t Py_UCS1;
     105  
     106  #ifdef __cplusplus
     107  extern "C" {
     108  #endif
     109  
     110  
     111  PyAPI_DATA(PyTypeObject) PyUnicode_Type;
     112  PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type;
     113  
     114  #define PyUnicode_Check(op) \
     115      PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_UNICODE_SUBCLASS)
     116  #define PyUnicode_CheckExact(op) Py_IS_TYPE(op, &PyUnicode_Type)
     117  
     118  /* --- Constants ---------------------------------------------------------- */
     119  
     120  /* This Unicode character will be used as replacement character during
     121     decoding if the errors argument is set to "replace". Note: the
     122     Unicode character U+FFFD is the official REPLACEMENT CHARACTER in
     123     Unicode 3.0. */
     124  
     125  #define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UCS4) 0xFFFD)
     126  
     127  /* === Public API ========================================================= */
     128  
     129  /* Similar to PyUnicode_FromUnicode(), but u points to UTF-8 encoded bytes */
     130  PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize(
     131      const char *u,             /* UTF-8 encoded string */
     132      Py_ssize_t size            /* size of buffer */
     133      );
     134  
     135  /* Similar to PyUnicode_FromUnicode(), but u points to null-terminated
     136     UTF-8 encoded bytes.  The size is determined with strlen(). */
     137  PyAPI_FUNC(PyObject*) PyUnicode_FromString(
     138      const char *u              /* UTF-8 encoded string */
     139      );
     140  
     141  #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
     142  PyAPI_FUNC(PyObject*) PyUnicode_Substring(
     143      PyObject *str,
     144      Py_ssize_t start,
     145      Py_ssize_t end);
     146  #endif
     147  
     148  #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
     149  /* Copy the string into a UCS4 buffer including the null character if copy_null
     150     is set. Return NULL and raise an exception on error. Raise a SystemError if
     151     the buffer is smaller than the string. Return buffer on success.
     152  
     153     buflen is the length of the buffer in (Py_UCS4) characters. */
     154  PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4(
     155      PyObject *unicode,
     156      Py_UCS4* buffer,
     157      Py_ssize_t buflen,
     158      int copy_null);
     159  
     160  /* Copy the string into a UCS4 buffer. A new buffer is allocated using
     161     PyMem_Malloc; if this fails, NULL is returned with a memory error
     162     exception set. */
     163  PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4Copy(PyObject *unicode);
     164  #endif
     165  
     166  #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
     167  /* Get the length of the Unicode object. */
     168  
     169  PyAPI_FUNC(Py_ssize_t) PyUnicode_GetLength(
     170      PyObject *unicode
     171  );
     172  #endif
     173  
     174  /* Get the number of Py_UNICODE units in the
     175     string representation. */
     176  
     177  Py_DEPRECATED(3.3) PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize(
     178      PyObject *unicode           /* Unicode object */
     179      );
     180  
     181  #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
     182  /* Read a character from the string. */
     183  
     184  PyAPI_FUNC(Py_UCS4) PyUnicode_ReadChar(
     185      PyObject *unicode,
     186      Py_ssize_t index
     187      );
     188  
     189  /* Write a character to the string. The string must have been created through
     190     PyUnicode_New, must not be shared, and must not have been hashed yet.
     191  
     192     Return 0 on success, -1 on error. */
     193  
     194  PyAPI_FUNC(int) PyUnicode_WriteChar(
     195      PyObject *unicode,
     196      Py_ssize_t index,
     197      Py_UCS4 character
     198      );
     199  #endif
     200  
     201  /* Resize a Unicode object. The length is the number of characters, except
     202     if the kind of the string is PyUnicode_WCHAR_KIND: in this case, the length
     203     is the number of Py_UNICODE characters.
     204  
     205     *unicode is modified to point to the new (resized) object and 0 is
     206     returned on success.
     207  
     208     Try to resize the string in place (which is usually faster than allocating
     209     a new string and copying characters), or create a new string.
     210  
     211     Error handling is implemented as follows: an exception is set, -1
     212     is returned and *unicode is left untouched.
     213  
     214     WARNING: The function doesn't check string content, the result may not be a
     215              string in canonical representation. */
     216  
     217  PyAPI_FUNC(int) PyUnicode_Resize(
     218      PyObject **unicode,         /* Pointer to the Unicode object */
     219      Py_ssize_t length           /* New length */
     220      );
     221  
     222  /* Decode obj to a Unicode object.
     223  
     224     bytes, bytearray and other bytes-like objects are decoded according to the
     225     given encoding and error handler. The encoding and error handler can be
     226     NULL to have the interface use UTF-8 and "strict".
     227  
     228     All other objects (including Unicode objects) raise an exception.
     229  
     230     The API returns NULL in case of an error. The caller is responsible
     231     for decref'ing the returned objects.
     232  
     233  */
     234  
     235  PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject(
     236      PyObject *obj,              /* Object */
     237      const char *encoding,       /* encoding */
     238      const char *errors          /* error handling */
     239      );
     240  
     241  /* Copy an instance of a Unicode subtype to a new true Unicode object if
     242     necessary. If obj is already a true Unicode object (not a subtype), return
     243     the reference with *incremented* refcount.
     244  
     245     The API returns NULL in case of an error. The caller is responsible
     246     for decref'ing the returned objects.
     247  
     248  */
     249  
     250  PyAPI_FUNC(PyObject*) PyUnicode_FromObject(
     251      PyObject *obj      /* Object */
     252      );
     253  
     254  PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV(
     255      const char *format,   /* ASCII-encoded string  */
     256      va_list vargs
     257      );
     258  PyAPI_FUNC(PyObject *) PyUnicode_FromFormat(
     259      const char *format,   /* ASCII-encoded string  */
     260      ...
     261      );
     262  
     263  PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **);
     264  PyAPI_FUNC(PyObject *) PyUnicode_InternFromString(
     265      const char *u              /* UTF-8 encoded string */
     266      );
     267  
     268  // PyUnicode_InternImmortal() is deprecated since Python 3.10
     269  // and will be removed in Python 3.12. Use PyUnicode_InternInPlace() instead.
     270  Py_DEPRECATED(3.10) PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **);
     271  
     272  /* --- wchar_t support for platforms which support it --------------------- */
     273  
     274  #ifdef HAVE_WCHAR_H
     275  
     276  /* Create a Unicode Object from the wchar_t buffer w of the given
     277     size.
     278  
     279     The buffer is copied into the new object. */
     280  
     281  PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar(
     282      const wchar_t *w,           /* wchar_t buffer */
     283      Py_ssize_t size             /* size of buffer */
     284      );
     285  
     286  /* Copies the Unicode Object contents into the wchar_t buffer w.  At
     287     most size wchar_t characters are copied.
     288  
     289     Note that the resulting wchar_t string may or may not be
     290     0-terminated.  It is the responsibility of the caller to make sure
     291     that the wchar_t string is 0-terminated in case this is required by
     292     the application.
     293  
     294     Returns the number of wchar_t characters copied (excluding a
     295     possibly trailing 0-termination character) or -1 in case of an
     296     error. */
     297  
     298  PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar(
     299      PyObject *unicode,          /* Unicode object */
     300      wchar_t *w,                 /* wchar_t buffer */
     301      Py_ssize_t size             /* size of buffer */
     302      );
     303  
     304  /* Convert the Unicode object to a wide character string. The output string
     305     always ends with a null character. If size is not NULL, write the number of
     306     wide characters (excluding the null character) into *size.
     307  
     308     Returns a buffer allocated by PyMem_Malloc() (use PyMem_Free() to free it)
     309     on success. On error, returns NULL and raises a MemoryError; *size is
     310     undefined. */
     311  
     312  PyAPI_FUNC(wchar_t*) PyUnicode_AsWideCharString(
     313      PyObject *unicode,          /* Unicode object */
     314      Py_ssize_t *size            /* number of characters of the result */
     315      );
     316  
     317  #endif
     318  
     319  /* --- Unicode ordinals --------------------------------------------------- */
     320  
     321  /* Create a Unicode Object from the given Unicode code point ordinal.
     322  
     323     The ordinal must be in range(0x110000). A ValueError is
     324     raised in case it is not.
     325  
     326  */
     327  
     328  PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal);
     329  
     330  /* === Builtin Codecs =====================================================
     331  
     332     Many of these APIs take two arguments encoding and errors. These
     333     parameters encoding and errors have the same semantics as the ones
     334     of the builtin str() API.
     335  
     336     Setting encoding to NULL causes the default encoding (UTF-8) to be used.
     337  
     338     Error handling is set by errors which may also be set to NULL
     339     meaning to use the default handling defined for the codec. Default
     340     error handling for all builtin codecs is "strict" (ValueErrors are
     341     raised).
     342  
     343     The codecs all use a similar interface. Only deviation from the
     344     generic ones are documented.
     345  
     346  */
     347  
     348  /* --- Manage the default encoding ---------------------------------------- */
     349  
     350  /* Returns "utf-8".  */
     351  PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void);
     352  
     353  /* --- Generic Codecs ----------------------------------------------------- */
     354  
     355  /* Create a Unicode object by decoding the encoded string s of the
     356     given size. */
     357  
     358  PyAPI_FUNC(PyObject*) PyUnicode_Decode(
     359      const char *s,              /* encoded string */
     360      Py_ssize_t size,            /* size of buffer */
     361      const char *encoding,       /* encoding */
     362      const char *errors          /* error handling */
     363      );
     364  
     365  /* Decode a Unicode object unicode and return the result as Python
     366     object.
     367  
     368     This API is DEPRECATED. The only supported standard encoding is rot13.
     369     Use PyCodec_Decode() to decode with rot13 and non-standard codecs
     370     that decode from str. */
     371  
     372  Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedObject(
     373      PyObject *unicode,          /* Unicode object */
     374      const char *encoding,       /* encoding */
     375      const char *errors          /* error handling */
     376      );
     377  
     378  /* Decode a Unicode object unicode and return the result as Unicode
     379     object.
     380  
     381     This API is DEPRECATED. The only supported standard encoding is rot13.
     382     Use PyCodec_Decode() to decode with rot13 and non-standard codecs
     383     that decode from str to str. */
     384  
     385  Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedUnicode(
     386      PyObject *unicode,          /* Unicode object */
     387      const char *encoding,       /* encoding */
     388      const char *errors          /* error handling */
     389      );
     390  
     391  /* Encodes a Unicode object and returns the result as Python
     392     object.
     393  
     394     This API is DEPRECATED.  It is superseded by PyUnicode_AsEncodedString()
     395     since all standard encodings (except rot13) encode str to bytes.
     396     Use PyCodec_Encode() for encoding with rot13 and non-standard codecs
     397     that encode from str to non-bytes. */
     398  
     399  Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject(
     400      PyObject *unicode,          /* Unicode object */
     401      const char *encoding,       /* encoding */
     402      const char *errors          /* error handling */
     403      );
     404  
     405  /* Encodes a Unicode object and returns the result as Python string
     406     object. */
     407  
     408  PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString(
     409      PyObject *unicode,          /* Unicode object */
     410      const char *encoding,       /* encoding */
     411      const char *errors          /* error handling */
     412      );
     413  
     414  /* Encodes a Unicode object and returns the result as Unicode
     415     object.
     416  
     417     This API is DEPRECATED.  The only supported standard encoding is rot13.
     418     Use PyCodec_Encode() to encode with rot13 and non-standard codecs
     419     that encode from str to str. */
     420  
     421  Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedUnicode(
     422      PyObject *unicode,          /* Unicode object */
     423      const char *encoding,       /* encoding */
     424      const char *errors          /* error handling */
     425      );
     426  
     427  /* Build an encoding map. */
     428  
     429  PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap(
     430      PyObject* string            /* 256 character map */
     431     );
     432  
     433  /* --- UTF-7 Codecs ------------------------------------------------------- */
     434  
     435  PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7(
     436      const char *string,         /* UTF-7 encoded string */
     437      Py_ssize_t length,          /* size of string */
     438      const char *errors          /* error handling */
     439      );
     440  
     441  PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7Stateful(
     442      const char *string,         /* UTF-7 encoded string */
     443      Py_ssize_t length,          /* size of string */
     444      const char *errors,         /* error handling */
     445      Py_ssize_t *consumed        /* bytes consumed */
     446      );
     447  
     448  /* --- UTF-8 Codecs ------------------------------------------------------- */
     449  
     450  PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8(
     451      const char *string,         /* UTF-8 encoded string */
     452      Py_ssize_t length,          /* size of string */
     453      const char *errors          /* error handling */
     454      );
     455  
     456  PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful(
     457      const char *string,         /* UTF-8 encoded string */
     458      Py_ssize_t length,          /* size of string */
     459      const char *errors,         /* error handling */
     460      Py_ssize_t *consumed        /* bytes consumed */
     461      );
     462  
     463  PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String(
     464      PyObject *unicode           /* Unicode object */
     465      );
     466  
     467  /* Returns a pointer to the default encoding (UTF-8) of the
     468     Unicode object unicode and the size of the encoded representation
     469     in bytes stored in *size.
     470  
     471     In case of an error, no *size is set.
     472  
     473     This function caches the UTF-8 encoded string in the unicodeobject
     474     and subsequent calls will return the same string.  The memory is released
     475     when the unicodeobject is deallocated.
     476  */
     477  
     478  #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030A0000
     479  PyAPI_FUNC(const char *) PyUnicode_AsUTF8AndSize(
     480      PyObject *unicode,
     481      Py_ssize_t *size);
     482  #endif
     483  
     484  /* --- UTF-32 Codecs ------------------------------------------------------ */
     485  
     486  /* Decodes length bytes from a UTF-32 encoded buffer string and returns
     487     the corresponding Unicode object.
     488  
     489     errors (if non-NULL) defines the error handling. It defaults
     490     to "strict".
     491  
     492     If byteorder is non-NULL, the decoder starts decoding using the
     493     given byte order:
     494  
     495      *byteorder == -1: little endian
     496      *byteorder == 0:  native order
     497      *byteorder == 1:  big endian
     498  
     499     In native mode, the first four bytes of the stream are checked for a
     500     BOM mark. If found, the BOM mark is analysed, the byte order
     501     adjusted and the BOM skipped.  In the other modes, no BOM mark
     502     interpretation is done. After completion, *byteorder is set to the
     503     current byte order at the end of input data.
     504  
     505     If byteorder is NULL, the codec starts in native order mode.
     506  
     507  */
     508  
     509  PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32(
     510      const char *string,         /* UTF-32 encoded string */
     511      Py_ssize_t length,          /* size of string */
     512      const char *errors,         /* error handling */
     513      int *byteorder              /* pointer to byteorder to use
     514                                     0=native;-1=LE,1=BE; updated on
     515                                     exit */
     516      );
     517  
     518  PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful(
     519      const char *string,         /* UTF-32 encoded string */
     520      Py_ssize_t length,          /* size of string */
     521      const char *errors,         /* error handling */
     522      int *byteorder,             /* pointer to byteorder to use
     523                                     0=native;-1=LE,1=BE; updated on
     524                                     exit */
     525      Py_ssize_t *consumed        /* bytes consumed */
     526      );
     527  
     528  /* Returns a Python string using the UTF-32 encoding in native byte
     529     order. The string always starts with a BOM mark.  */
     530  
     531  PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String(
     532      PyObject *unicode           /* Unicode object */
     533      );
     534  
     535  /* Returns a Python string object holding the UTF-32 encoded value of
     536     the Unicode data.
     537  
     538     If byteorder is not 0, output is written according to the following
     539     byte order:
     540  
     541     byteorder == -1: little endian
     542     byteorder == 0:  native byte order (writes a BOM mark)
     543     byteorder == 1:  big endian
     544  
     545     If byteorder is 0, the output string will always start with the
     546     Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
     547     prepended.
     548  
     549  */
     550  
     551  /* --- UTF-16 Codecs ------------------------------------------------------ */
     552  
     553  /* Decodes length bytes from a UTF-16 encoded buffer string and returns
     554     the corresponding Unicode object.
     555  
     556     errors (if non-NULL) defines the error handling. It defaults
     557     to "strict".
     558  
     559     If byteorder is non-NULL, the decoder starts decoding using the
     560     given byte order:
     561  
     562      *byteorder == -1: little endian
     563      *byteorder == 0:  native order
     564      *byteorder == 1:  big endian
     565  
     566     In native mode, the first two bytes of the stream are checked for a
     567     BOM mark. If found, the BOM mark is analysed, the byte order
     568     adjusted and the BOM skipped.  In the other modes, no BOM mark
     569     interpretation is done. After completion, *byteorder is set to the
     570     current byte order at the end of input data.
     571  
     572     If byteorder is NULL, the codec starts in native order mode.
     573  
     574  */
     575  
     576  PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16(
     577      const char *string,         /* UTF-16 encoded string */
     578      Py_ssize_t length,          /* size of string */
     579      const char *errors,         /* error handling */
     580      int *byteorder              /* pointer to byteorder to use
     581                                     0=native;-1=LE,1=BE; updated on
     582                                     exit */
     583      );
     584  
     585  PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful(
     586      const char *string,         /* UTF-16 encoded string */
     587      Py_ssize_t length,          /* size of string */
     588      const char *errors,         /* error handling */
     589      int *byteorder,             /* pointer to byteorder to use
     590                                     0=native;-1=LE,1=BE; updated on
     591                                     exit */
     592      Py_ssize_t *consumed        /* bytes consumed */
     593      );
     594  
     595  /* Returns a Python string using the UTF-16 encoding in native byte
     596     order. The string always starts with a BOM mark.  */
     597  
     598  PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String(
     599      PyObject *unicode           /* Unicode object */
     600      );
     601  
     602  /* --- Unicode-Escape Codecs ---------------------------------------------- */
     603  
     604  PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape(
     605      const char *string,         /* Unicode-Escape encoded string */
     606      Py_ssize_t length,          /* size of string */
     607      const char *errors          /* error handling */
     608      );
     609  
     610  PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString(
     611      PyObject *unicode           /* Unicode object */
     612      );
     613  
     614  /* --- Raw-Unicode-Escape Codecs ------------------------------------------ */
     615  
     616  PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape(
     617      const char *string,         /* Raw-Unicode-Escape encoded string */
     618      Py_ssize_t length,          /* size of string */
     619      const char *errors          /* error handling */
     620      );
     621  
     622  PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString(
     623      PyObject *unicode           /* Unicode object */
     624      );
     625  
     626  /* --- Latin-1 Codecs -----------------------------------------------------
     627  
     628     Note: Latin-1 corresponds to the first 256 Unicode ordinals. */
     629  
     630  PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1(
     631      const char *string,         /* Latin-1 encoded string */
     632      Py_ssize_t length,          /* size of string */
     633      const char *errors          /* error handling */
     634      );
     635  
     636  PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String(
     637      PyObject *unicode           /* Unicode object */
     638      );
     639  
     640  /* --- ASCII Codecs -------------------------------------------------------
     641  
     642     Only 7-bit ASCII data is accepted. All other codes generate errors.
     643  
     644  */
     645  
     646  PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII(
     647      const char *string,         /* ASCII encoded string */
     648      Py_ssize_t length,          /* size of string */
     649      const char *errors          /* error handling */
     650      );
     651  
     652  PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString(
     653      PyObject *unicode           /* Unicode object */
     654      );
     655  
     656  /* --- Character Map Codecs -----------------------------------------------
     657  
     658     This codec uses mappings to encode and decode characters.
     659  
     660     Decoding mappings must map byte ordinals (integers in the range from 0 to
     661     255) to Unicode strings, integers (which are then interpreted as Unicode
     662     ordinals) or None.  Unmapped data bytes (ones which cause a LookupError)
     663     as well as mapped to None, 0xFFFE or '\ufffe' are treated as "undefined
     664     mapping" and cause an error.
     665  
     666     Encoding mappings must map Unicode ordinal integers to bytes objects,
     667     integers in the range from 0 to 255 or None.  Unmapped character
     668     ordinals (ones which cause a LookupError) as well as mapped to
     669     None are treated as "undefined mapping" and cause an error.
     670  
     671  */
     672  
     673  PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap(
     674      const char *string,         /* Encoded string */
     675      Py_ssize_t length,          /* size of string */
     676      PyObject *mapping,          /* decoding mapping */
     677      const char *errors          /* error handling */
     678      );
     679  
     680  PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString(
     681      PyObject *unicode,          /* Unicode object */
     682      PyObject *mapping           /* encoding mapping */
     683      );
     684  
     685  /* --- MBCS codecs for Windows -------------------------------------------- */
     686  
     687  #ifdef MS_WINDOWS
          /* Everything down to the matching #endif is declared only when
             MS_WINDOWS is defined. */
          /* Decode the MBCS encoded buffer `string` of the given `length`. */
     688  PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS(
     689      const char *string,         /* MBCS encoded string */
     690      Py_ssize_t length,          /* size of string */
     691      const char *errors          /* error handling */
     692      );
     693  
          /* Same inputs as PyUnicode_DecodeMBCS(), plus a `consumed` pointer
             through which the number of bytes consumed is reported.
             NOTE(review): presumably passing NULL for `consumed` gives the
             non-stateful behavior -- confirm. */
     694  PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful(
     695      const char *string,         /* MBCS encoded string */
     696      Py_ssize_t length,          /* size of string */
     697      const char *errors,         /* error handling */
     698      Py_ssize_t *consumed        /* bytes consumed */
     699      );
     700  
          /* The code page variant below is declared only when Py_LIMITED_API is
             undefined or is at least 0x03030000. It takes the same arguments as
             PyUnicode_DecodeMBCSStateful() preceded by an explicit code page
             number. */
     701  #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
     702  PyAPI_FUNC(PyObject*) PyUnicode_DecodeCodePageStateful(
     703      int code_page,              /* code page number */
     704      const char *string,         /* encoded string */
     705      Py_ssize_t length,          /* size of string */
     706      const char *errors,         /* error handling */
     707      Py_ssize_t *consumed        /* bytes consumed */
     708      );
     709  #endif
     710  
          /* Encode a Unicode object to MBCS. No `errors` argument is taken.
             NOTE(review): the PyObject* result is presumably a bytes object --
             confirm. */
     711  PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString(
     712      PyObject *unicode           /* Unicode object */
     713      );
     714  
          /* Encode a Unicode object using the given code page number and error
             handling. Declared only when Py_LIMITED_API is undefined or is at
             least 0x03030000. */
     715  #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
     716  PyAPI_FUNC(PyObject*) PyUnicode_EncodeCodePage(
     717      int code_page,              /* code page number */
     718      PyObject *unicode,          /* Unicode object */
     719      const char *errors          /* error handling */
     720      );
     721  #endif
     722  
     723  #endif /* MS_WINDOWS */
     724  
     725  /* --- Locale encoding --------------------------------------------------- */
     726  
     727  #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
     728  /* Decode a string from the current locale encoding. *errors* names the error
     729     handler: per the C API documentation the supported handlers are "strict"
     730     and "surrogateescape" (PEP 383), the latter escaping undecodable bytes. If
     731     a byte sequence can be decoded as a surrogate character and the
     732     "surrogateescape" handler is in use, the byte sequence is escaped instead
     733     of being decoded. *str* must end with a null character but cannot contain
     734     embedded null characters. *len* is the length of *str*. */
     735  
     736  PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocaleAndSize(
     737      const char *str,            /* locale-encoded, null-terminated string */
     738      Py_ssize_t len,             /* length of str */
     739      const char *errors);        /* error handler name */
     740  
     741  /* Similar to PyUnicode_DecodeLocaleAndSize(), but compute the string
     742     length using strlen(); no explicit length is passed, so *str* has to be
             null-terminated. */
     743  
     744  PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocale(
     745      const char *str,            /* locale-encoded, null-terminated string */
     746      const char *errors);        /* error handler name */
     747  
     748  /* Encode a Unicode object to the current locale encoding. *errors* names the
     749     error handler: per the C API documentation the supported handlers are
     750     "strict" and "surrogateescape" (PEP 383). Return a bytes object. The string
     751     cannot contain embedded null characters. */
     752  
     753  PyAPI_FUNC(PyObject*) PyUnicode_EncodeLocale(
     754      PyObject *unicode,          /* Unicode object to encode */
     755      const char *errors          /* error handler name */
     756      );
     757  #endif
     758  
     759  /* --- File system encoding ---------------------------------------------- */
     760  
     761  /* ParseTuple converter: encode str objects to bytes using
     762     PyUnicode_EncodeFSDefault(); bytes objects are output as-is.
             The two parameters are unnamed in this prototype.
             NOTE(review): presumably the PyObject* is the object to convert and
             the void* is the address that receives the result, as for "O&"
             converters of the PyArg_ParseTuple() family -- confirm. */
     763  
     764  PyAPI_FUNC(int) PyUnicode_FSConverter(PyObject*, void*);
     765  
     766  /* ParseTuple converter: decode bytes objects to unicode using
     767     PyUnicode_DecodeFSDefaultAndSize(); str objects are output as-is.
             The two parameters are unnamed in this prototype.
             NOTE(review): presumably the PyObject* is the object to convert and
             the void* is the address that receives the result, as for "O&"
             converters of the PyArg_ParseTuple() family -- confirm. */
     768  
     769  PyAPI_FUNC(int) PyUnicode_FSDecoder(PyObject*, void*);
     770  
     771  /* Decode a null-terminated string using Py_FileSystemDefaultEncoding
     772     and the "surrogateescape" error handler.
     773  
     774     If Py_FileSystemDefaultEncoding is not set, fall back to the locale
     775     encoding.
     776  
     777     Use PyUnicode_DecodeFSDefaultAndSize() if the string length is known.
     778  */
     779  
     780  PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault(
     781      const char *s               /* null-terminated encoded string */
     782      );
     783  
     784  /* Decode a string of explicitly given size using
             Py_FileSystemDefaultEncoding
     785     and the "surrogateescape" error handler.
     786  
     787     If Py_FileSystemDefaultEncoding is not set, fall back to the locale
     788     encoding.
     789  */
     790  
     791  PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize(
     792      const char *s,               /* encoded string */
     793      Py_ssize_t size              /* size of s */
     794      );
     795  
     796  /* Encode a Unicode object to Py_FileSystemDefaultEncoding with the
     797     "surrogateescape" error handler, and return bytes.
     798  
     799     If Py_FileSystemDefaultEncoding is not set, fall back to the locale
     800     encoding.
     801  */
     802  
     803  PyAPI_FUNC(PyObject*) PyUnicode_EncodeFSDefault(
     804      PyObject *unicode           /* Unicode object to encode */
     805      );
     806  
     807  /* --- Methods & Slots ----------------------------------------------------
     808  
     809     These are capable of handling Unicode objects and strings on input
     810     (we refer to them as strings in the descriptions) and return
     811     Unicode objects or integers as appropriate. */
     812  
     813  /* Concatenate two strings, giving a new Unicode string (left followed by
             right). */
     814  
     815  PyAPI_FUNC(PyObject*) PyUnicode_Concat(
     816      PyObject *left,             /* Left string */
     817      PyObject *right             /* Right string */
     818      );
     819  
     820  /* Concatenate two strings and put the result in *pleft
     821     (sets *pleft to NULL on error).
             The function returns void, so the only failure indication is *pleft
             being NULL afterwards. */
     822  
     823  PyAPI_FUNC(void) PyUnicode_Append(
     824      PyObject **pleft,           /* Pointer to left string; receives the result */
     825      PyObject *right             /* Right string */
     826      );
     827  
     828  /* Concatenate two strings, put the result in *pleft and drop the right
             object
     829     (sets *pleft to NULL on error).
             The function returns void, so the only failure indication is *pleft
             being NULL afterwards.
             NOTE(review): "drop" presumably means that the caller's reference to
             right is released -- confirm. */
     830  
     831  PyAPI_FUNC(void) PyUnicode_AppendAndDel(
     832      PyObject **pleft,           /* Pointer to left string; receives the result */
     833      PyObject *right             /* Right string */
     834      );
     835  
     836  /* Split a string giving a list of Unicode strings.
     837  
     838     If sep is NULL, splitting will be done at all whitespace
     839     substrings. Otherwise, splits occur at the given separator.
     840  
     841     At most maxsplit splits will be done. If maxsplit is negative, no limit
             is set.
     842  
     843     Separators are not included in the resulting list.
     844  
     845  */
     846  
     847  PyAPI_FUNC(PyObject*) PyUnicode_Split(
     848      PyObject *s,                /* String to split */
     849      PyObject *sep,              /* String separator, or NULL for whitespace */
     850      Py_ssize_t maxsplit         /* Maxsplit count; negative = no limit */
     851      );
     852  
     853  /* Split a string at line breaks, giving a list of Unicode strings.
     854  
     855     CRLF is considered to be one line break. Line breaks are not
     856     included in the resulting list unless keepends is true. */
     857  
     858  PyAPI_FUNC(PyObject*) PyUnicode_Splitlines(
     859      PyObject *s,                /* String to split */
     860      int keepends                /* If true, line end markers are included */
     861      );
     862  
     863  /* Partition a string using a given separator.
             NOTE(review): the shape of the returned object is not spelled out here;
             per the C API documentation it is a 3-tuple (head, separator, tail),
             mirroring str.partition() -- confirm. */
     864  
     865  PyAPI_FUNC(PyObject*) PyUnicode_Partition(
     866      PyObject *s,                /* String to partition */
     867      PyObject *sep               /* String separator */
     868      );
     869  
     870  /* Partition a string using a given separator, searching from the end of the
     871     string.
             NOTE(review): the shape of the returned object is not spelled out here;
             per the C API documentation it is a 3-tuple (head, separator, tail),
             mirroring str.rpartition() -- confirm. */
     872  
     873  PyAPI_FUNC(PyObject*) PyUnicode_RPartition(
     874      PyObject *s,                /* String to partition */
     875      PyObject *sep               /* String separator */
     876      );
     877  
     878  /* Split a string giving a list of Unicode strings.
     879  
     880     If sep is NULL, splitting will be done at all whitespace
     881     substrings. Otherwise, splits occur at the given separator.
     882  
     883     At most maxsplit splits will be done; if maxsplit is negative, no
     884     limit is set. Unlike PyUnicode_Split, PyUnicode_RSplit splits from
     885     the end of the string.
     886  
     887     Separators are not included in the resulting list.
     888  
     889  */
     890  
     891  PyAPI_FUNC(PyObject*) PyUnicode_RSplit(
     892      PyObject *s,                /* String to split */
     893      PyObject *sep,              /* String separator, or NULL for whitespace */
     894      Py_ssize_t maxsplit         /* Maxsplit count; negative = no limit */
     895      );
     896  
     897  /* Translate a string by applying a character mapping table to it and
     898     return the resulting Unicode object.
     899  
     900     The mapping table must map Unicode ordinal integers to Unicode strings,
     901     Unicode ordinal integers or None (causing deletion of the character).
     902  
     903     Mapping tables may be dictionaries or sequences. Unmapped character
     904     ordinals (ones which cause a LookupError) are left untouched and
     905     are copied as-is.
     906  
             errors selects the error handling; the accepted values are not
             spelled out in this header.
     907  */
     908  
     909  PyAPI_FUNC(PyObject *) PyUnicode_Translate(
     910      PyObject *str,              /* String */
     911      PyObject *table,            /* Translate table */
     912      const char *errors          /* error handling */
     913      );
     914  
     915  /* Join a sequence of strings using the given separator and return
     916     the resulting Unicode string. The separator is the first argument, the
             sequence the second. */
     917  
     918  PyAPI_FUNC(PyObject*) PyUnicode_Join(
     919      PyObject *separator,        /* Separator string */
     920      PyObject *seq               /* Sequence object */
     921      );
     922  
     923  /* Return 1 if substr matches str[start:end] at the given tail end
     924     (direction -1: match as prefix, +1: match as suffix), 0 otherwise.
             NOTE(review): the result type is Py_ssize_t rather than int; per the
             C API documentation -1 is returned if an error occurred -- confirm. */
     925  
     926  PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch(
     927      PyObject *str,              /* String */
     928      PyObject *substr,           /* Prefix or Suffix string */
     929      Py_ssize_t start,           /* Start index */
     930      Py_ssize_t end,             /* Stop index */
     931      int direction               /* Tail end: -1 prefix, +1 suffix */
     932      );
     933  
     934  /* Return the first position of substr in str[start:end] using the
     935     given search direction or -1 if not found. -2 is returned in case
     936     an error occurred and an exception is set.
             Note the two distinct negative results: -1 means "not found" and is
             not an error, -2 signals the error. */
     937  
     938  PyAPI_FUNC(Py_ssize_t) PyUnicode_Find(
     939      PyObject *str,              /* String */
     940      PyObject *substr,           /* Substring to find */
     941      Py_ssize_t start,           /* Start index */
     942      Py_ssize_t end,             /* Stop index */
     943      int direction               /* Find direction: +1 forward, -1 backward */
     944      );
     945  
     946  #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
     947  /* Like PyUnicode_Find, but search for a single character (ch) only.
             Declared only when Py_LIMITED_API is undefined or is at least
             0x03030000. */
     948  PyAPI_FUNC(Py_ssize_t) PyUnicode_FindChar(
     949      PyObject *str,              /* String */
     950      Py_UCS4 ch,                 /* Character to find */
     951      Py_ssize_t start,           /* Start index */
     952      Py_ssize_t end,             /* Stop index */
     953      int direction               /* Find direction, as for PyUnicode_Find */
     954      );
     955  #endif
     956  
     957  /* Count the number of occurrences of substr in str[start:end].
             NOTE(review): the error return is not stated here; per the C API
             documentation -1 is returned if an error occurred -- confirm. */
     958  
     959  PyAPI_FUNC(Py_ssize_t) PyUnicode_Count(
     960      PyObject *str,              /* String */
     961      PyObject *substr,           /* Substring to count */
     962      Py_ssize_t start,           /* Start index */
     963      Py_ssize_t end              /* Stop index */
     964      );
     965  
     966  /* Replace at most maxcount occurrences of substr in str with replstr
     967     and return the resulting Unicode object. A maxcount of -1 replaces all
             occurrences. */
     968  
     969  PyAPI_FUNC(PyObject *) PyUnicode_Replace(
     970      PyObject *str,              /* String */
     971      PyObject *substr,           /* Substring to find */
     972      PyObject *replstr,          /* Substring to replace */
     973      Py_ssize_t maxcount         /* Max. number of replacements to apply;
     974                                     -1 = all */
     975      );
     976  
     977  /* Compare two strings and return -1, 0, 1 for less than, equal,
     978     greater than, respectively.
     979     Raise an exception and return -1 on error.
             Because -1 is also the "less than" result, the return value alone
             cannot distinguish an error from a successful comparison; callers
             have to check for a pending exception (e.g. with PyErr_Occurred()). */
     980  
     981  PyAPI_FUNC(int) PyUnicode_Compare(
     982      PyObject *left,             /* Left string */
     983      PyObject *right             /* Right string */
     984      );
     985  
     986  /* Compare a Unicode object with a C string and return -1, 0, 1 for less
             than,
     987     equal, and greater than, respectively.  It is best to pass only
     988     ASCII-encoded strings, but the function interprets the input string as
     989     ISO-8859-1 if it contains non-ASCII characters.
     990     This function does not raise exceptions. */
     991  
     992  PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString(
     993      PyObject *left,             /* Unicode object */
     994      const char *right           /* ASCII-encoded string */
     995      );
     996  
     997  /* Rich compare two strings and return one of the following:
     998  
     999     - NULL in case an exception was raised
    1000     - Py_True or Py_False for successful comparisons
    1001     - Py_NotImplemented in case the type combination is unknown
    1002  
    1003     Possible values for op:
    1004  
    1005       Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE
    1006  
             A Py_NotImplemented result is neither an error (NULL) nor a
             comparison outcome, so callers need to handle all three cases.
    1007  */
    1008  
    1009  PyAPI_FUNC(PyObject *) PyUnicode_RichCompare(
    1010      PyObject *left,             /* Left string */
    1011      PyObject *right,            /* Right string */
    1012      int op                      /* Operation: Py_EQ, Py_NE, Py_GT, etc. */
    1013      );
    1014  
    1015  /* Apply an argument tuple or dictionary to a format string and return
    1016     the resulting Unicode string.
             NOTE(review): presumably the C-level counterpart of the Python
             expression `format % args` -- confirm against the C API
             documentation. */
    1017  
    1018  PyAPI_FUNC(PyObject *) PyUnicode_Format(
    1019      PyObject *format,           /* Format string */
    1020      PyObject *args              /* Argument tuple or dictionary */
    1021      );
    1022  
    1023  /* Check whether element is contained in container and return 1 or 0
    1024     accordingly.
    1025  
    1026     element has to coerce to a one element Unicode string. -1 is
    1027     returned in case of an error.
             NOTE(review): the parameter comments below only call element an
             "Element string"; confirm whether the one-element restriction
             still holds. */
    1028  
    1029  PyAPI_FUNC(int) PyUnicode_Contains(
    1030      PyObject *container,        /* Container string */
    1031      PyObject *element           /* Element string */
    1032      );
    1033  
    1034  /* Check whether the argument s is a valid identifier.
             NOTE(review): the int result is presumably 1 for a valid identifier
             and 0 otherwise; the exact values and any error return are not stated
             here -- confirm against the C API documentation. */
    1035  
    1036  PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s);
    1037  
    1038  /* === Characters Type APIs =============================================== */
    1039  
    1040  #ifndef Py_LIMITED_API
    1041  #  define Py_CPYTHON_UNICODEOBJECT_H
    1042  #  include "cpython/unicodeobject.h"
    1043  #  undef Py_CPYTHON_UNICODEOBJECT_H
    1044  #endif
    1045  
    1046  #ifdef __cplusplus
    1047  }
    1048  #endif
    1049  #endif /* !Py_UNICODEOBJECT_H */