1 #ifndef Py_CPYTHON_UNICODEOBJECT_H
2 # error "this header file must not be included directly"
3 #endif
4
/* Py_UNICODE was the native Unicode storage format (code unit) used by
   Python and represents a single Unicode element in the Unicode type.
   With PEP 393, Py_UNICODE is deprecated and replaced with a
   typedef to wchar_t. */
/* PY_UNICODE_TYPE names the C type that Py_UNICODE aliases (wchar_t). */
#define PY_UNICODE_TYPE wchar_t
/* Py_DEPRECATED(3.3) */ typedef wchar_t Py_UNICODE;
11
12 /* --- Internal Unicode Operations ---------------------------------------- */
13
14 // Static inline functions to work with surrogates
15 static inline int Py_UNICODE_IS_SURROGATE(Py_UCS4 ch) {
16 return (0xD800 <= ch && ch <= 0xDFFF);
17 }
18 static inline int Py_UNICODE_IS_HIGH_SURROGATE(Py_UCS4 ch) {
19 return (0xD800 <= ch && ch <= 0xDBFF);
20 }
21 static inline int Py_UNICODE_IS_LOW_SURROGATE(Py_UCS4 ch) {
22 return (0xDC00 <= ch && ch <= 0xDFFF);
23 }
24
25 // Join two surrogate characters and return a single Py_UCS4 value.
26 static inline Py_UCS4 Py_UNICODE_JOIN_SURROGATES(Py_UCS4 high, Py_UCS4 low) {
27 assert(Py_UNICODE_IS_HIGH_SURROGATE(high));
28 assert(Py_UNICODE_IS_LOW_SURROGATE(low));
29 return 0x10000 + (((high & 0x03FF) << 10) | (low & 0x03FF));
30 }
31
32 // High surrogate = top 10 bits added to 0xD800.
33 // The character must be in the range [U+10000; U+10ffff].
34 static inline Py_UCS4 Py_UNICODE_HIGH_SURROGATE(Py_UCS4 ch) {
35 assert(0x10000 <= ch && ch <= 0x10ffff);
36 return (0xD800 - (0x10000 >> 10) + (ch >> 10));
37 }
38
39 // Low surrogate = bottom 10 bits added to 0xDC00.
40 // The character must be in the range [U+10000; U+10ffff].
41 static inline Py_UCS4 Py_UNICODE_LOW_SURROGATE(Py_UCS4 ch) {
42 assert(0x10000 <= ch && ch <= 0x10ffff);
43 return (0xDC00 + (ch & 0x3FF));
44 }
45
46 /* --- Unicode Type ------------------------------------------------------- */
47
/* ASCII-only strings created through PyUnicode_New use the PyASCIIObject
   structure. state.ascii and state.compact are set, and the data
   immediately follow the structure. utf8_length can be found
   in the length field; the utf8 pointer is equal to the data pointer. */
typedef struct {
    /* There are 3 forms of Unicode strings:

       - compact ascii:

         * structure = PyASCIIObject
         * test: PyUnicode_IS_COMPACT_ASCII(op)
         * kind = PyUnicode_1BYTE_KIND
         * compact = 1
         * ascii = 1
         * (length is the length of the utf8)
         * (data starts just after the structure)
         * (since ASCII is decoded from UTF-8, the utf8 string is the data)

       - compact:

         * structure = PyCompactUnicodeObject
         * test: PyUnicode_IS_COMPACT(op) && !PyUnicode_IS_ASCII(op)
         * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
           PyUnicode_4BYTE_KIND
         * compact = 1
         * ascii = 0
         * utf8 is not shared with data
         * utf8_length = 0 if utf8 is NULL
         * (data starts just after the structure)

       - legacy string:

         * structure = PyUnicodeObject structure
         * test: !PyUnicode_IS_COMPACT(op)
         * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
           PyUnicode_4BYTE_KIND
         * compact = 0
         * data.any is not NULL
         * utf8 is shared and utf8_length = length with data.any if ascii = 1
         * utf8_length = 0 if utf8 is NULL

       Compact strings use only one memory block (structure + characters),
       whereas legacy strings use one block for the structure and one block
       for characters.

       Legacy strings are created by subclasses of Unicode.

       See also _PyUnicode_CheckConsistency().
    */
    PyObject_HEAD
    Py_ssize_t length;          /* Number of code points in the string */
    Py_hash_t hash;             /* Hash value; -1 if not set */
    /* Bit-field flags; the declared widths below (2 + 3 + 1 + 1 + 25)
       add up to 32 bits. */
    struct {
        /* If interned is non-zero, the two references from the
           dictionary to this object are *not* counted in ob_refcnt.
           The possible values here are:
               0: Not Interned
               1: Interned
               2: Interned and Immortal
               3: Interned, Immortal, and Static
           This categorization allows the runtime to determine the right
           cleanup mechanism at runtime shutdown. */
        unsigned int interned:2;
        /* Character size:

           - PyUnicode_1BYTE_KIND (1):

             * character type = Py_UCS1 (8 bits, unsigned)
             * all characters are in the range U+0000-U+00FF (latin1)
             * if ascii is set, all characters are in the range U+0000-U+007F
               (ASCII), otherwise at least one character is in the range
               U+0080-U+00FF

           - PyUnicode_2BYTE_KIND (2):

             * character type = Py_UCS2 (16 bits, unsigned)
             * all characters are in the range U+0000-U+FFFF (BMP)
             * at least one character is in the range U+0100-U+FFFF

           - PyUnicode_4BYTE_KIND (4):

             * character type = Py_UCS4 (32 bits, unsigned)
             * all characters are in the range U+0000-U+10FFFF
             * at least one character is in the range U+10000-U+10FFFF
         */
        unsigned int kind:3;
        /* Compact is with respect to the allocation scheme. Compact unicode
           objects only require one memory block while non-compact objects use
           one block for the PyUnicodeObject struct and another for its data
           buffer. */
        unsigned int compact:1;
        /* The string only contains characters in the range U+0000-U+007F (ASCII)
           and the kind is PyUnicode_1BYTE_KIND. If ascii is set and compact is
           set, use the PyASCIIObject structure. */
        unsigned int ascii:1;
        /* Padding to ensure that PyUnicode_DATA() is always aligned to
           4 bytes (see issue #19537 on m68k). */
        unsigned int :25;
    } state;
} PyASCIIObject;
148
/* Non-ASCII strings allocated through PyUnicode_New use the
   PyCompactUnicodeObject structure. state.compact is set, and the data
   immediately follow the structure.

   The structure extends PyASCIIObject (embedded as the first member) with
   the UTF-8 representation of the string. */
typedef struct {
    PyASCIIObject _base;
    Py_ssize_t utf8_length;     /* Number of bytes in utf8, excluding the
                                 * terminating \0. */
    char *utf8;                 /* UTF-8 representation (null-terminated) */
} PyCompactUnicodeObject;
158
/* Object format for Unicode subclasses.

   Extends PyCompactUnicodeObject (embedded as the first member) with a
   pointer to the character buffer. The union members are differently typed
   views of that same pointer. */
typedef struct {
    PyCompactUnicodeObject _base;
    union {
        void *any;
        Py_UCS1 *latin1;
        Py_UCS2 *ucs2;
        Py_UCS4 *ucs4;
    } data;                     /* Canonical, smallest-form Unicode buffer */
} PyUnicodeObject;
169
/* NOTE(review): this header gives no description. Judging by the name, it
   validates the internal invariants of the string `op`, and `check_content`
   presumably enables an additional scan of the character data -- confirm
   against the implementation before relying on this. */
PyAPI_FUNC(int) _PyUnicode_CheckConsistency(
    PyObject *op,
    int check_content);
173
174
/* Cast `op` to a pointer to one of the three string structures.
   Each macro first asserts PyUnicode_Check(op) and then applies _Py_CAST.
   `op` appears twice in the expansion, so it is evaluated twice whenever
   the assertion is compiled in: do not pass an argument with side effects. */
#define _PyASCIIObject_CAST(op) \
    (assert(PyUnicode_Check(op)), \
     _Py_CAST(PyASCIIObject*, (op)))
#define _PyCompactUnicodeObject_CAST(op) \
    (assert(PyUnicode_Check(op)), \
     _Py_CAST(PyCompactUnicodeObject*, (op)))
#define _PyUnicodeObject_CAST(op) \
    (assert(PyUnicode_Check(op)), \
     _Py_CAST(PyUnicodeObject*, (op)))
184
185
186 /* --- Flexible String Representation Helper Macros (PEP 393) -------------- */
187
/* Values for PyASCIIObject.state: */

/* Interning state: the possible values of the state.interned field. */
#define SSTATE_NOT_INTERNED 0
#define SSTATE_INTERNED_MORTAL 1
#define SSTATE_INTERNED_IMMORTAL 2
#define SSTATE_INTERNED_IMMORTAL_STATIC 3
195
196 /* Use only if you know it's a string */
197 static inline unsigned int PyUnicode_CHECK_INTERNED(PyObject *op) {
198 return _PyASCIIObject_CAST(op)->state.interned;
199 }
200 #define PyUnicode_CHECK_INTERNED(op) PyUnicode_CHECK_INTERNED(_PyObject_CAST(op))
201
/* For backward compatibility: the argument is ignored and the result is
   always 1. */
static inline unsigned int PyUnicode_IS_READY(PyObject* Py_UNUSED(op)) {
    return 1;
}
#define PyUnicode_IS_READY(op) PyUnicode_IS_READY(_PyObject_CAST(op))
207
208 /* Return true if the string contains only ASCII characters, or 0 if not. The
209 string may be compact (PyUnicode_IS_COMPACT_ASCII) or not, but must be
210 ready. */
211 static inline unsigned int PyUnicode_IS_ASCII(PyObject *op) {
212 return _PyASCIIObject_CAST(op)->state.ascii;
213 }
214 #define PyUnicode_IS_ASCII(op) PyUnicode_IS_ASCII(_PyObject_CAST(op))
215
216 /* Return true if the string is compact or 0 if not.
217 No type checks or Ready calls are performed. */
218 static inline unsigned int PyUnicode_IS_COMPACT(PyObject *op) {
219 return _PyASCIIObject_CAST(op)->state.compact;
220 }
221 #define PyUnicode_IS_COMPACT(op) PyUnicode_IS_COMPACT(_PyObject_CAST(op))
222
223 /* Return true if the string is a compact ASCII string (use PyASCIIObject
224 structure), or 0 if not. No type checks or Ready calls are performed. */
225 static inline int PyUnicode_IS_COMPACT_ASCII(PyObject *op) {
226 return (_PyASCIIObject_CAST(op)->state.ascii && PyUnicode_IS_COMPACT(op));
227 }
228 #define PyUnicode_IS_COMPACT_ASCII(op) PyUnicode_IS_COMPACT_ASCII(_PyObject_CAST(op))
229
/* Character storage kinds. The numeric values (1, 2, 4) are fixed explicitly
   by the enumerators below. */
enum PyUnicode_Kind {
/* Return values of the PyUnicode_KIND() function: */
    PyUnicode_1BYTE_KIND = 1,
    PyUnicode_2BYTE_KIND = 2,
    PyUnicode_4BYTE_KIND = 4
};
236
// PyUnicode_KIND(): Return one of the PyUnicode_*_KIND values defined above.
// Expands to a read of the state.kind field of the string, wrapped in
// _Py_RVALUE().
//
// gh-89653: Converting this macro to a static inline function would introduce
// new compiler warnings on "kind < PyUnicode_KIND(str)" (compare signed and
// unsigned numbers) where kind type is an int or on
// "unsigned int kind = PyUnicode_KIND(str)" (cast signed to unsigned).
#define PyUnicode_KIND(op) _Py_RVALUE(_PyASCIIObject_CAST(op)->state.kind)
244
245 /* Return a void pointer to the raw unicode buffer. */
246 static inline void* _PyUnicode_COMPACT_DATA(PyObject *op) {
247 if (PyUnicode_IS_ASCII(op)) {
248 return _Py_STATIC_CAST(void*, (_PyASCIIObject_CAST(op) + 1));
249 }
250 return _Py_STATIC_CAST(void*, (_PyCompactUnicodeObject_CAST(op) + 1));
251 }
252
253 static inline void* _PyUnicode_NONCOMPACT_DATA(PyObject *op) {
254 void *data;
255 assert(!PyUnicode_IS_COMPACT(op));
256 data = _PyUnicodeObject_CAST(op)->data.any;
257 assert(data != NULL);
258 return data;
259 }
260
261 static inline void* PyUnicode_DATA(PyObject *op) {
262 if (PyUnicode_IS_COMPACT(op)) {
263 return _PyUnicode_COMPACT_DATA(op);
264 }
265 return _PyUnicode_NONCOMPACT_DATA(op);
266 }
267 #define PyUnicode_DATA(op) PyUnicode_DATA(_PyObject_CAST(op))
268
/* Return pointers to the canonical representation cast to Py_UCS1,
   Py_UCS2, or Py_UCS4 for direct character access.
   No checks are performed: call PyUnicode_KIND() first to make sure the
   chosen macro matches the string's kind. */

#define PyUnicode_1BYTE_DATA(op) _Py_STATIC_CAST(Py_UCS1*, PyUnicode_DATA(op))
#define PyUnicode_2BYTE_DATA(op) _Py_STATIC_CAST(Py_UCS2*, PyUnicode_DATA(op))
#define PyUnicode_4BYTE_DATA(op) _Py_STATIC_CAST(Py_UCS4*, PyUnicode_DATA(op))
277
278 /* Returns the length of the unicode string. */
279 static inline Py_ssize_t PyUnicode_GET_LENGTH(PyObject *op) {
280 return _PyASCIIObject_CAST(op)->length;
281 }
282 #define PyUnicode_GET_LENGTH(op) PyUnicode_GET_LENGTH(_PyObject_CAST(op))
283
284 /* Write into the canonical representation, this function does not do any sanity
285 checks and is intended for usage in loops. The caller should cache the
286 kind and data pointers obtained from other function calls.
287 index is the index in the string (starts at 0) and value is the new
288 code point value which should be written to that location. */
289 static inline void PyUnicode_WRITE(int kind, void *data,
290 Py_ssize_t index, Py_UCS4 value)
291 {
292 assert(index >= 0);
293 if (kind == PyUnicode_1BYTE_KIND) {
294 assert(value <= 0xffU);
295 _Py_STATIC_CAST(Py_UCS1*, data)[index] = _Py_STATIC_CAST(Py_UCS1, value);
296 }
297 else if (kind == PyUnicode_2BYTE_KIND) {
298 assert(value <= 0xffffU);
299 _Py_STATIC_CAST(Py_UCS2*, data)[index] = _Py_STATIC_CAST(Py_UCS2, value);
300 }
301 else {
302 assert(kind == PyUnicode_4BYTE_KIND);
303 assert(value <= 0x10ffffU);
304 _Py_STATIC_CAST(Py_UCS4*, data)[index] = value;
305 }
306 }
307 #define PyUnicode_WRITE(kind, data, index, value) \
308 PyUnicode_WRITE(_Py_STATIC_CAST(int, kind), _Py_CAST(void*, data), \
309 (index), _Py_STATIC_CAST(Py_UCS4, value))
310
311 /* Read a code point from the string's canonical representation. No checks
312 or ready calls are performed. */
313 static inline Py_UCS4 PyUnicode_READ(int kind,
314 const void *data, Py_ssize_t index)
315 {
316 assert(index >= 0);
317 if (kind == PyUnicode_1BYTE_KIND) {
318 return _Py_STATIC_CAST(const Py_UCS1*, data)[index];
319 }
320 if (kind == PyUnicode_2BYTE_KIND) {
321 return _Py_STATIC_CAST(const Py_UCS2*, data)[index];
322 }
323 assert(kind == PyUnicode_4BYTE_KIND);
324 return _Py_STATIC_CAST(const Py_UCS4*, data)[index];
325 }
326 #define PyUnicode_READ(kind, data, index) \
327 PyUnicode_READ(_Py_STATIC_CAST(int, kind), \
328 _Py_STATIC_CAST(const void*, data), \
329 (index))
330
331 /* PyUnicode_READ_CHAR() is less efficient than PyUnicode_READ() because it
332 calls PyUnicode_KIND() and might call it twice. For single reads, use
333 PyUnicode_READ_CHAR, for multiple consecutive reads callers should
334 cache kind and use PyUnicode_READ instead. */
335 static inline Py_UCS4 PyUnicode_READ_CHAR(PyObject *unicode, Py_ssize_t index)
336 {
337 int kind;
338
339 assert(index >= 0);
340 // Tolerate reading the NUL character at str[len(str)]
341 assert(index <= PyUnicode_GET_LENGTH(unicode));
342
343 kind = PyUnicode_KIND(unicode);
344 if (kind == PyUnicode_1BYTE_KIND) {
345 return PyUnicode_1BYTE_DATA(unicode)[index];
346 }
347 if (kind == PyUnicode_2BYTE_KIND) {
348 return PyUnicode_2BYTE_DATA(unicode)[index];
349 }
350 assert(kind == PyUnicode_4BYTE_KIND);
351 return PyUnicode_4BYTE_DATA(unicode)[index];
352 }
353 #define PyUnicode_READ_CHAR(unicode, index) \
354 PyUnicode_READ_CHAR(_PyObject_CAST(unicode), (index))
355
356 /* Return a maximum character value which is suitable for creating another
357 string based on op. This is always an approximation but more efficient
358 than iterating over the string. */
359 static inline Py_UCS4 PyUnicode_MAX_CHAR_VALUE(PyObject *op)
360 {
361 int kind;
362
363 if (PyUnicode_IS_ASCII(op)) {
364 return 0x7fU;
365 }
366
367 kind = PyUnicode_KIND(op);
368 if (kind == PyUnicode_1BYTE_KIND) {
369 return 0xffU;
370 }
371 if (kind == PyUnicode_2BYTE_KIND) {
372 return 0xffffU;
373 }
374 assert(kind == PyUnicode_4BYTE_KIND);
375 return 0x10ffffU;
376 }
377 #define PyUnicode_MAX_CHAR_VALUE(op) \
378 PyUnicode_MAX_CHAR_VALUE(_PyObject_CAST(op))
379
380 /* === Public API ========================================================= */
381
382 /* --- Plain Py_UNICODE --------------------------------------------------- */
383
/* With PEP 393, this is the recommended way to allocate a new unicode object.
   This function will allocate the object and its buffer in a single memory
   block. Objects created using this function are not resizable.

   `size` is the number of code points and `maxchar` the maximum code point
   value the new string has to hold. */
PyAPI_FUNC(PyObject*) PyUnicode_New(
    Py_ssize_t size,            /* Number of code points in the new string */
    Py_UCS4 maxchar             /* maximum code point value in the string */
    );
391
/* For backward compatibility: the argument is ignored and the result is
   always 0. */
static inline int PyUnicode_READY(PyObject* Py_UNUSED(op))
{
    return 0;
}
#define PyUnicode_READY(op) PyUnicode_READY(_PyObject_CAST(op))
398
/* Get a copy of a Unicode string.
   NOTE(review): presumably returns a new reference, or NULL with an
   exception set on failure -- confirm against the implementation. */
PyAPI_FUNC(PyObject*) _PyUnicode_Copy(
    PyObject *unicode
    );
403
/* Copy characters from one unicode object into another. This function
   performs character conversion when necessary and falls back to memcpy()
   if possible.

   Fail if to is too small (smaller than *how_many* or smaller than
   len(from)-from_start), or if kind(from[from_start:from_start+how_many]) >
   kind(to), or if *to* has more than 1 reference.

   Return the number of characters written, or return -1 and raise an
   exception on error.

   Pseudo-code:

       how_many = min(how_many, len(from) - from_start)
       to[to_start:to_start+how_many] = from[from_start:from_start+how_many]
       return how_many

   Note: The function doesn't write a terminating null character.
   */
PyAPI_FUNC(Py_ssize_t) PyUnicode_CopyCharacters(
    PyObject *to,
    Py_ssize_t to_start,
    PyObject *from,
    Py_ssize_t from_start,
    Py_ssize_t how_many
    );
429
/* Unsafe version of PyUnicode_CopyCharacters(): the arguments are not
   checked, so the call may crash if they are invalid (e.g. if the output
   string is too short). It returns nothing. */
PyAPI_FUNC(void) _PyUnicode_FastCopyCharacters(
    PyObject *to,
    Py_ssize_t to_start,
    PyObject *from,
    Py_ssize_t from_start,
    Py_ssize_t how_many
    );
440
/* Fill a string with a character: write fill_char into
   unicode[start:start+length].

   Fail if fill_char is bigger than the string maximum character, or if the
   string has more than 1 reference.

   Return the number of characters written, or return -1 and raise an
   exception on error. */
PyAPI_FUNC(Py_ssize_t) PyUnicode_Fill(
    PyObject *unicode,
    Py_ssize_t start,
    Py_ssize_t length,
    Py_UCS4 fill_char
    );
455
/* Unsafe version of PyUnicode_Fill(): the arguments are not checked, so the
   call may crash if they are invalid (e.g. if length is longer than the
   string). It returns nothing. */
PyAPI_FUNC(void) _PyUnicode_FastFill(
    PyObject *unicode,
    Py_ssize_t start,
    Py_ssize_t length,
    Py_UCS4 fill_char
    );
464
/* Create a new string from a buffer of Py_UCS1, Py_UCS2 or Py_UCS4 characters.
   Scan the string to find the maximum character.
   NOTE(review): `kind` presumably is one of the PyUnicode_*_KIND values
   describing the element type of `buffer`, and `size` the number of
   characters in it -- confirm against the implementation. */
PyAPI_FUNC(PyObject*) PyUnicode_FromKindAndData(
    int kind,
    const void *buffer,
    Py_ssize_t size);
471
/* Create a new string from a buffer of ASCII characters.
   WARNING: The buffer is not checked for non-ASCII characters; making sure
   it contains none is the caller's responsibility. */
PyAPI_FUNC(PyObject*) _PyUnicode_FromASCII(
    const char *buffer,
    Py_ssize_t size);
477
/* Compute the maximum character of the substring unicode[start:end].
   Return 127 for an empty string. */
PyAPI_FUNC(Py_UCS4) _PyUnicode_FindMaxChar (
    PyObject *unicode,
    Py_ssize_t start,
    Py_ssize_t end);
484
485 /* --- _PyUnicodeWriter API ----------------------------------------------- */
486
/* State of a Unicode writer. Initialize it with _PyUnicodeWriter_Init() and
   operate on it through the other _PyUnicodeWriter_*() functions. */
typedef struct {
    /* NOTE(review): the six fields below carry no description in this header.
       Judging by their names and by the _PyUnicodeWriter_Prepare() macro,
       `buffer` is the string being built, `data`, `kind` and `maxchar`
       describe its character storage, `size` is the allocated length and
       `pos` the current write position -- confirm against the
       implementation. */
    PyObject *buffer;
    void *data;
    int kind;
    Py_UCS4 maxchar;
    Py_ssize_t size;
    Py_ssize_t pos;

    /* minimum number of allocated characters (default: 0) */
    Py_ssize_t min_length;

    /* minimum character (default: 127, ASCII) */
    Py_UCS4 min_char;

    /* If non-zero, overallocate the buffer (default: 0). */
    unsigned char overallocate;

    /* If readonly is 1, buffer is a shared string (cannot be modified)
       and size is set to 0. */
    unsigned char readonly;
} _PyUnicodeWriter ;
508
/* Initialize a Unicode writer.
 *
 * By default, the minimum buffer size is 0 characters and overallocation is
 * disabled. Set the min_length, min_char and overallocate attributes to
 * control the allocation of the buffer. */
PyAPI_FUNC(void)
_PyUnicodeWriter_Init(_PyUnicodeWriter *writer);
516
/* Prepare the buffer to write 'length' characters
   with the specified maximum character.

   Return 0 on success, raise an exception and return -1 on error.

   The macro evaluates to 0 without calling anything when MAXCHAR does not
   exceed the writer's maxchar and LENGTH fits in the remaining space
   (size - pos), or when LENGTH is 0. Otherwise it calls
   _PyUnicodeWriter_PrepareInternal().

   WRITER, LENGTH and MAXCHAR each appear several times in the expansion, so
   they may be evaluated more than once: do not pass expressions with side
   effects. */
#define _PyUnicodeWriter_Prepare(WRITER, LENGTH, MAXCHAR)             \
    (((MAXCHAR) <= (WRITER)->maxchar                                  \
      && (LENGTH) <= (WRITER)->size - (WRITER)->pos)                  \
     ? 0                                                              \
     : (((LENGTH) == 0)                                               \
        ? 0                                                           \
        : _PyUnicodeWriter_PrepareInternal((WRITER), (LENGTH), (MAXCHAR))))

/* Don't call this function directly, use the _PyUnicodeWriter_Prepare() macro
   instead. */
PyAPI_FUNC(int)
_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
                                 Py_ssize_t length, Py_UCS4 maxchar);
534
/* Prepare the buffer to have at least the kind KIND.
   For example, kind=PyUnicode_2BYTE_KIND ensures that the writer will
   support characters in range U+0000-U+FFFF.

   Return 0 on success, raise an exception and return -1 on error.

   The macro evaluates to 0 without calling anything when KIND does not
   exceed the writer's current kind. WRITER and KIND each appear twice in the
   expansion, so they may be evaluated more than once. */
#define _PyUnicodeWriter_PrepareKind(WRITER, KIND)                    \
    ((KIND) <= (WRITER)->kind                                         \
     ? 0                                                              \
     : _PyUnicodeWriter_PrepareKindInternal((WRITER), (KIND)))

/* Don't call this function directly, use the _PyUnicodeWriter_PrepareKind()
   macro instead. */
PyAPI_FUNC(int)
_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
                                     int kind);
550
/* Append a Unicode character.
   Return 0 on success, raise an exception and return -1 on error. */
PyAPI_FUNC(int)
_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer,
    Py_UCS4 ch
    );

/* Append a Unicode string.
   Return 0 on success, raise an exception and return -1 on error. */
PyAPI_FUNC(int)
_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer,
    PyObject *str               /* Unicode string */
    );

/* Append a substring of a Unicode string.
   NOTE(review): `start` and `end` presumably delimit the slice
   str[start:end] -- confirm against the implementation.
   Return 0 on success, raise an exception and return -1 on error. */
PyAPI_FUNC(int)
_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer,
    PyObject *str,              /* Unicode string */
    Py_ssize_t start,
    Py_ssize_t end
    );

/* Append an ASCII-encoded byte string.
   `len` may be -1 when the caller does not know the length.
   Return 0 on success, raise an exception and return -1 on error. */
PyAPI_FUNC(int)
_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
    const char *str,           /* ASCII-encoded byte string */
    Py_ssize_t len             /* number of bytes, or -1 if unknown */
    );

/* Append a latin1-encoded byte string.
   Return 0 on success, raise an exception and return -1 on error. */
PyAPI_FUNC(int)
_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
    const char *str,           /* latin1-encoded byte string */
    Py_ssize_t len             /* length in bytes */
    );
589
/* Get the value of the writer as a Unicode string and clear the buffer of
   the writer. Raise an exception and return NULL on error. */
PyAPI_FUNC(PyObject *)
_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer);

/* Deallocate memory of a writer (clear its internal buffer). */
PyAPI_FUNC(void)
_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer);
599
600
/* Format the object based on the format_spec, as defined in PEP 3101
   (Advanced String Formatting).
   NOTE(review): the output presumably is appended to `writer`, and
   `start`/`end` presumably select the part of `format_spec` to use --
   confirm against the implementation. */
PyAPI_FUNC(int) _PyUnicode_FormatAdvancedWriter(
    _PyUnicodeWriter *writer,
    PyObject *obj,
    PyObject *format_spec,
    Py_ssize_t start,
    Py_ssize_t end);
609
610 /* --- Manage the default encoding ---------------------------------------- */
611
/* Returns a pointer to the default encoding (UTF-8) of the
   Unicode object unicode.

   Like PyUnicode_AsUTF8AndSize(), this also caches the UTF-8 representation
   in the unicodeobject.

   _PyUnicode_AsString is a #define for PyUnicode_AsUTF8 to
   support the previous internal function with the same behaviour.

   Use of this API is DEPRECATED since no size information can be
   extracted from the returned data.
*/

PyAPI_FUNC(const char *) PyUnicode_AsUTF8(PyObject *unicode);

/* Alias kept for code that still uses the old internal name. */
#define _PyUnicode_AsString PyUnicode_AsUTF8
628
629 /* --- UTF-7 Codecs ------------------------------------------------------- */
630
/* NOTE(review): no description is given here; judging by the name and the
   parameter comments this encodes `unicode` as UTF-7 -- confirm the result
   type and error behaviour against the implementation. */
PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF7(
    PyObject *unicode,          /* Unicode object */
    int base64SetO,             /* Encode RFC2152 Set O characters in base64 */
    int base64WhiteSpace,       /* Encode whitespace (sp, ht, nl, cr) in base64 */
    const char *errors          /* error handling */
    );
637
638 /* --- UTF-8 Codecs ------------------------------------------------------- */
639
/* NOTE(review): no description is given here; judging by the name this
   encodes `unicode` as UTF-8 using the `errors` error handler -- confirm
   against the implementation. */
PyAPI_FUNC(PyObject*) _PyUnicode_AsUTF8String(
    PyObject *unicode,
    const char *errors);
643
644 /* --- UTF-32 Codecs ------------------------------------------------------ */
645
/* NOTE(review): no description is given here; judging by the name and the
   parameter comments this encodes `object` as UTF-32 in the requested byte
   order -- confirm against the implementation. */
PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF32(
    PyObject *object,           /* Unicode object */
    const char *errors,         /* error handling */
    int byteorder               /* byteorder to use 0=BOM+native;-1=LE,1=BE */
    );
651
652 /* --- UTF-16 Codecs ------------------------------------------------------ */
653
/* Returns a Python string object holding the UTF-16 encoded value of
   the Unicode data.

   The byte order of the output is selected by byteorder:

   byteorder == -1: little endian
   byteorder == 0:  native byte order (writes a BOM mark)
   byteorder == 1:  big endian

   If byteorder is 0, the output string will always start with the
   Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
   prepended.
*/
PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF16(
    PyObject* unicode,          /* Unicode object */
    const char *errors,         /* error handling */
    int byteorder               /* byteorder to use 0=BOM+native;-1=LE,1=BE */
    );
673
674 /* --- Unicode-Escape Codecs ---------------------------------------------- */
675
/* Variant of PyUnicode_DecodeUnicodeEscape that supports partial decoding.
   The number of bytes consumed is reported through `consumed`. */
PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscapeStateful(
        const char *string,     /* Unicode-Escape encoded string */
        Py_ssize_t length,      /* size of string */
        const char *errors,     /* error handling */
        Py_ssize_t *consumed    /* bytes consumed */
);
/* Helper for PyUnicode_DecodeUnicodeEscape that detects invalid escape
   chars. On return, *first_invalid_escape points to the first invalid
   escaped char in `string`. */
PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscapeInternal(
        const char *string,     /* Unicode-Escape encoded string */
        Py_ssize_t length,      /* size of string */
        const char *errors,     /* error handling */
        Py_ssize_t *consumed,   /* bytes consumed */
        const char **first_invalid_escape  /* on return, points to first
                                              invalid escaped char in
                                              string. */
);
694
695 /* --- Raw-Unicode-Escape Codecs ---------------------------------------------- */
696
/* Variant of PyUnicode_DecodeRawUnicodeEscape that supports partial decoding.
   The number of bytes consumed is reported through `consumed`. */
PyAPI_FUNC(PyObject*) _PyUnicode_DecodeRawUnicodeEscapeStateful(
        const char *string,     /* Unicode-Escape encoded string */
        Py_ssize_t length,      /* size of string */
        const char *errors,     /* error handling */
        Py_ssize_t *consumed    /* bytes consumed */
);
704
705 /* --- Latin-1 Codecs ----------------------------------------------------- */
706
/* NOTE(review): no description is given here; judging by the name this
   encodes `unicode` as Latin-1 using the `errors` error handler -- confirm
   against the implementation. */
PyAPI_FUNC(PyObject*) _PyUnicode_AsLatin1String(
    PyObject* unicode,
    const char* errors);

/* --- ASCII Codecs ------------------------------------------------------- */

/* NOTE(review): no description is given here; judging by the name this
   encodes `unicode` as ASCII using the `errors` error handler -- confirm
   against the implementation. */
PyAPI_FUNC(PyObject*) _PyUnicode_AsASCIIString(
    PyObject* unicode,
    const char* errors);
716
717 /* --- Character Map Codecs ----------------------------------------------- */
718
/* Translate a Unicode object by applying a character mapping table to
   it and return the resulting Unicode object.

   The mapping table must map Unicode ordinal integers to Unicode strings,
   Unicode ordinal integers or None (causing deletion of the character).

   Mapping tables may be dictionaries or sequences. Unmapped character
   ordinals (ones which cause a LookupError) are left untouched and
   are copied as-is.

   NOTE(review): the text above describes a translate operation, whereas the
   function declared below is named _PyUnicode_EncodeCharmap and its
   parameter is commented as an "encoding mapping" -- confirm which contract
   actually applies.
*/
PyAPI_FUNC(PyObject*) _PyUnicode_EncodeCharmap(
    PyObject *unicode,          /* Unicode object */
    PyObject *mapping,          /* encoding mapping */
    const char *errors          /* error handling */
    );
734
735 /* --- Decimal Encoder ---------------------------------------------------- */
736
/* Converts a Unicode object holding a decimal value to an ASCII string
   for use in the int, float and complex parsers.
   Transforms code points that have the decimal digit property to the
   corresponding ASCII digit code points. Transforms spaces to ASCII.
   Transforms code points starting from the first non-ASCII code point that
   is neither a decimal digit nor a space to the end into '?'. */

PyAPI_FUNC(PyObject*) _PyUnicode_TransformDecimalAndSpaceToASCII(
    PyObject *unicode           /* Unicode object */
    );
747
748 /* --- Methods & Slots ---------------------------------------------------- */
749
/* NOTE(review): no description is given here; judging by the name and
   parameters this joins the `seqlen` objects of the `items` array with
   `separator` between them -- confirm against the implementation. */
PyAPI_FUNC(PyObject *) _PyUnicode_JoinArray(
    PyObject *separator,
    PyObject *const *items,
    Py_ssize_t seqlen
    );

/* Test whether a unicode is equal to an ASCII identifier. Return 1 if true,
   0 otherwise. The right argument must be an ASCII identifier.
   Any error that occurs inside is cleared before returning. */
PyAPI_FUNC(int) _PyUnicode_EqualToASCIIId(
    PyObject *left,             /* Left string */
    _Py_Identifier *right       /* Right identifier */
    );

/* Test whether a unicode is equal to an ASCII string. Return 1 if true,
   0 otherwise. The right argument must be an ASCII-encoded string.
   Any error that occurs inside is cleared before returning. */
PyAPI_FUNC(int) _PyUnicode_EqualToASCIIString(
    PyObject *left,
    const char *right           /* ASCII-encoded string */
    );

/* Externally visible for str.strip(unicode) */
PyAPI_FUNC(PyObject *) _PyUnicode_XStrip(
    PyObject *self,
    int striptype,
    PyObject *sepobj
    );

/* Using explicit passed-in values, insert the thousands grouping
   into the string pointed to by buffer. For the argument descriptions,
   see Objects/stringlib/localeutil.h */
PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGrouping(
    _PyUnicodeWriter *writer,
    Py_ssize_t n_buffer,
    PyObject *digits,
    Py_ssize_t d_pos,
    Py_ssize_t n_digits,
    Py_ssize_t min_width,
    const char *grouping,
    PyObject *thousands_sep,
    Py_UCS4 *maxchar);
792
793 /* === Characters Type APIs =============================================== */
794
/* These should not be used directly. Use the Py_UNICODE_IS* and
   Py_UNICODE_TO* macros instead.

   These APIs are implemented in Objects/unicodectype.c.

   Each predicate below takes a single code point and returns an int.
   NOTE(review): presumably non-zero means the character has the property
   and 0 that it does not -- confirm against the implementation.

*/

PyAPI_FUNC(int) _PyUnicode_IsLowercase(
    Py_UCS4 ch       /* Unicode character */
    );

PyAPI_FUNC(int) _PyUnicode_IsUppercase(
    Py_UCS4 ch       /* Unicode character */
    );

PyAPI_FUNC(int) _PyUnicode_IsTitlecase(
    Py_UCS4 ch       /* Unicode character */
    );

PyAPI_FUNC(int) _PyUnicode_IsXidStart(
    Py_UCS4 ch       /* Unicode character */
    );

PyAPI_FUNC(int) _PyUnicode_IsXidContinue(
    Py_UCS4 ch       /* Unicode character */
    );

PyAPI_FUNC(int) _PyUnicode_IsWhitespace(
    const Py_UCS4 ch         /* Unicode character */
    );

PyAPI_FUNC(int) _PyUnicode_IsLinebreak(
    const Py_UCS4 ch         /* Unicode character */
    );
829
/* Single-code-point case conversions: each takes one Py_UCS4 and returns one
   Py_UCS4. The _PyUnicode_To*Full() variants write their result into a
   Py_UCS4 buffer instead.

   Note that the Py_DEPRECATED(3.3) marker is commented out on the first two
   declarations but is live code on _PyUnicode_ToTitlecase. */
/* Py_DEPRECATED(3.3) */ PyAPI_FUNC(Py_UCS4) _PyUnicode_ToLowercase(
    Py_UCS4 ch       /* Unicode character */
    );

/* Py_DEPRECATED(3.3) */ PyAPI_FUNC(Py_UCS4) _PyUnicode_ToUppercase(
    Py_UCS4 ch       /* Unicode character */
    );

Py_DEPRECATED(3.3) PyAPI_FUNC(Py_UCS4) _PyUnicode_ToTitlecase(
    Py_UCS4 ch       /* Unicode character */
    );
841
/* Case conversions that write their result through `res`.
   NOTE(review): presumably `res` receives the converted code points and the
   return value is how many were written; the required capacity of `res` is
   not stated in this header -- confirm against Objects/unicodectype.c. */
PyAPI_FUNC(int) _PyUnicode_ToLowerFull(
    Py_UCS4 ch,       /* Unicode character */
    Py_UCS4 *res
    );

PyAPI_FUNC(int) _PyUnicode_ToTitleFull(
    Py_UCS4 ch,       /* Unicode character */
    Py_UCS4 *res
    );

PyAPI_FUNC(int) _PyUnicode_ToUpperFull(
    Py_UCS4 ch,       /* Unicode character */
    Py_UCS4 *res
    );

PyAPI_FUNC(int) _PyUnicode_ToFoldedFull(
    Py_UCS4 ch,       /* Unicode character */
    Py_UCS4 *res
    );

/* Case-related predicates on a single code point. */
PyAPI_FUNC(int) _PyUnicode_IsCaseIgnorable(
    Py_UCS4 ch       /* Unicode character */
    );

PyAPI_FUNC(int) _PyUnicode_IsCased(
    Py_UCS4 ch       /* Unicode character */
    );
869
/* Numeric value look-ups for a single code point.
   NOTE(review): the value returned for a character that has no such numeric
   value is not stated in this header -- confirm against
   Objects/unicodectype.c. */
PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit(
    Py_UCS4 ch       /* Unicode character */
    );

PyAPI_FUNC(int) _PyUnicode_ToDigit(
    Py_UCS4 ch       /* Unicode character */
    );

PyAPI_FUNC(double) _PyUnicode_ToNumeric(
    Py_UCS4 ch       /* Unicode character */
    );

/* Further predicates on a single code point. */
PyAPI_FUNC(int) _PyUnicode_IsDecimalDigit(
    Py_UCS4 ch       /* Unicode character */
    );

PyAPI_FUNC(int) _PyUnicode_IsDigit(
    Py_UCS4 ch       /* Unicode character */
    );

PyAPI_FUNC(int) _PyUnicode_IsNumeric(
    Py_UCS4 ch       /* Unicode character */
    );

PyAPI_FUNC(int) _PyUnicode_IsPrintable(
    Py_UCS4 ch       /* Unicode character */
    );

PyAPI_FUNC(int) _PyUnicode_IsAlpha(
    Py_UCS4 ch       /* Unicode character */
    );
901
// Helper array used by Py_UNICODE_ISSPACE(), which indexes it with code
// points below 128.
PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[];
904
905 // Since splitting on whitespace is an important use case, and
906 // whitespace in most situations is solely ASCII whitespace, we
907 // optimize for the common case by using a quick look-up table
908 // _Py_ascii_whitespace (see below) with an inlined check.
909 static inline int Py_UNICODE_ISSPACE(Py_UCS4 ch) {
910 if (ch < 128) {
911 return _Py_ascii_whitespace[ch];
912 }
913 return _PyUnicode_IsWhitespace(ch);
914 }
915
/* Character classification and conversion macros. Each one expands to a
   single call of the _PyUnicode_* function with the matching name. */

/* Case and line-break predicates. */
#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch)
#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch)
#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)

/* Single-code-point case conversions. */
#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch)
#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch)
#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)

/* Numeric and printable predicates. */
#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
#define Py_UNICODE_ISPRINTABLE(ch) _PyUnicode_IsPrintable(ch)

/* Numeric value look-ups. */
#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)

/* Alphabetic predicate. */
#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch)
935
936 static inline int Py_UNICODE_ISALNUM(Py_UCS4 ch) {
937 return (Py_UNICODE_ISALPHA(ch)
938 || Py_UNICODE_ISDECIMAL(ch)
939 || Py_UNICODE_ISDIGIT(ch)
940 || Py_UNICODE_ISNUMERIC(ch));
941 }
942
943
944 /* === Misc functions ===================================================== */
945
/* NOTE(review): no description and no parameter names are given here; the
   name suggests it formats an integer object as a string -- confirm the
   meaning of the three int arguments against the implementation. */
PyAPI_FUNC(PyObject*) _PyUnicode_FormatLong(PyObject *, int, int, int);

/* Return an interned Unicode object for an Identifier; may fail if there is
   no memory. */
PyAPI_FUNC(PyObject*) _PyUnicode_FromId(_Py_Identifier*);

/* Fast equality check when the inputs are known to be exact unicode types
   and where the hash values are equal (i.e. a very probable match) */
PyAPI_FUNC(int) _PyUnicode_EQ(PyObject *, PyObject *);

/* Equality check. */
PyAPI_FUNC(int) _PyUnicode_Equal(PyObject *, PyObject *);

/* NOTE(review): the two converters below are undocumented here; the
   (PyObject *, void *) -> int shape suggests they are argument-parsing
   converter callbacks producing wide-character strings, the _Opt_ variant
   presumably also accepting None -- confirm against the implementation. */
PyAPI_FUNC(int) _PyUnicode_WideCharString_Converter(PyObject *, void *);
PyAPI_FUNC(int) _PyUnicode_WideCharString_Opt_Converter(PyObject *, void *);

/* NOTE(review): undocumented here; the name suggests it scans the string for
   the extent of a valid identifier and returns a length or position --
   confirm against the implementation. */
PyAPI_FUNC(Py_ssize_t) _PyUnicode_ScanIdentifier(PyObject *);