1 /*
2 * Secret Labs' Regular Expression Engine
3 *
4 * regular expression matching engine
5 *
6 * partial history:
7 * 1999-10-24 fl created (based on existing template matcher code)
8 * 2000-03-06 fl first alpha, sort of
9 * 2000-08-01 fl fixes for 1.6b1
10 * 2000-08-07 fl use PyOS_CheckStack() if available
11 * 2000-09-20 fl added expand method
12 * 2001-03-20 fl lots of fixes for 2.1b2
13 * 2001-04-15 fl export copyright as Python attribute, not global
14 * 2001-04-28 fl added __copy__ methods (work in progress)
15 * 2001-05-14 fl fixes for 1.5.2 compatibility
16 * 2001-07-01 fl added BIGCHARSET support (from Martin von Loewis)
17 * 2001-10-18 fl fixed group reset issue (from Matthew Mueller)
18 * 2001-10-20 fl added split primitive; re-enable unicode for 1.6/2.0/2.1
19 * 2001-10-21 fl added sub/subn primitive
20 * 2001-10-24 fl added finditer primitive (for 2.2 only)
21 * 2001-12-07 fl fixed memory leak in sub/subn (Guido van Rossum)
22 * 2002-11-09 fl fixed empty sub/subn return type
23 * 2003-04-18 mvl fully support 4-byte codes
24 * 2003-10-17 gn implemented non recursive scheme
25 * 2013-02-04 mrab added fullmatch primitive
26 *
27 * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
28 *
29 * This version of the SRE library can be redistributed under CNRI's
30 * Python 1.6 license. For any other use, please contact Secret Labs
31 * AB (info@pythonware.com).
32 *
33 * Portions of this engine have been developed in cooperation with
34 * CNRI. Hewlett-Packard provided funding for 1.6 integration and
35 * other compatibility work.
36 */
37
/* Version/copyright banner of the SRE engine.  It is not referenced in this
   part of the file; presumably it is exported as a module attribute (see the
   2001-04-15 history entry above) -- confirm in the module init code. */
static const char copyright[] =
    " SRE 2.2.2 Copyright (c) 1997-2002 by Secret Labs AB ";
40
41 #define PY_SSIZE_T_CLEAN
42
43 #include "Python.h"
44 #include "pycore_long.h" // _PyLong_GetZero()
45 #include "pycore_moduleobject.h" // _PyModule_GetState()
46 #include "structmember.h" // PyMemberDef
47
48 #include "sre.h"
49
/* number of bits in one SRE_CODE unit (assumes 8-bit bytes) */
#define SRE_CODE_BITS (8 * sizeof(SRE_CODE))
51
52 #include <ctype.h>
53
/* define VERBOSE (instead of undefining it below) to make TRACE() print
   engine traces */
55 #undef VERBOSE
56
57 /* -------------------------------------------------------------------- */
58
/* LOCAL(type) declares a file-local inline helper returning `type`:
   `static __inline ... __fastcall` under MSVC, plain `static inline`
   everywhere else. */
#if defined(_MSC_VER)
#pragma optimize("agtw", on) /* doesn't seem to make much difference... */
#pragma warning(disable: 4710) /* who cares if functions are not inlined ;-) */
/* fastest possible local call under MSVC */
#define LOCAL(type) static __inline type __fastcall
#else
#define LOCAL(type) static inline type
#endif
67
/* error codes: negative status values reported by the matching engine.
   pattern_error() turns them into Python exceptions. */
#define SRE_ERROR_ILLEGAL -1 /* illegal opcode */
#define SRE_ERROR_STATE -2 /* illegal state */
#define SRE_ERROR_RECURSION_LIMIT -3 /* runaway recursion */
#define SRE_ERROR_MEMORY -9 /* out of memory */
#define SRE_ERROR_INTERRUPTED -10 /* signal handler raised exception */
74
/* TRACE takes one parenthesised printf() argument list, e.g.
   TRACE(("x=%d\n", x)).  It expands to nothing unless VERBOSE is defined. */
#if defined(VERBOSE)
#define TRACE(v) printf v
#else
#define TRACE(v)
#endif
80
81 /* -------------------------------------------------------------------- */
82 /* search engine state */
83
/* ASCII-only character predicates.  Each one first rejects code points above
   the highest ASCII character of the class and only then applies the Py_IS*
   test (presumably because the Py_IS* macros only look at the low byte of
   their argument -- confirm in pyctype.h).  `ch` is evaluated more than
   once. */
#define SRE_IS_DIGIT(ch)\
    ((ch) <= '9' && Py_ISDIGIT(ch))
#define SRE_IS_SPACE(ch)\
    ((ch) <= ' ' && Py_ISSPACE(ch))
#define SRE_IS_LINEBREAK(ch)\
    ((ch) == '\n')
#define SRE_IS_WORD(ch)\
    ((ch) <= 'z' && (Py_ISALNUM(ch) || (ch) == '_'))
92
/* Lowercase `ch` using ASCII rules only; code points of 128 and above are
   returned unchanged. */
static unsigned int sre_lower_ascii(unsigned int ch)
{
    if (ch >= 128) {
        return ch;
    }
    return Py_TOLOWER(ch);
}
97
/* locale-specific character predicates */
/* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
 * warnings when c's type supports only numbers < N+1 */
/* Only code points 0..255 are handed to isalnum(); anything larger is
   reported as non-alphanumeric. */
#define SRE_LOC_IS_ALNUM(ch) (!((ch) & ~255) ? isalnum((ch)) : 0)
/* a "word" character is an alphanumeric one or an underscore */
#define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')
103
/* Lowercase `ch` with the C library's tolower() (current locale).  Only
   values below 256 are converted; larger code points pass through
   unchanged. */
static unsigned int sre_lower_locale(unsigned int ch)
{
    if (ch < 256) {
        return (unsigned int)tolower((int)ch);
    }
    return ch;
}
108
/* Uppercase `ch` with the C library's toupper() (current locale).  Only
   values below 256 are converted; larger code points pass through
   unchanged. */
static unsigned int sre_upper_locale(unsigned int ch)
{
    if (ch < 256) {
        return (unsigned int)toupper((int)ch);
    }
    return ch;
}
113
/* unicode-specific character predicates */

/* Thin aliases for the Py_UNICODE_* classification macros.  Note that the
   "digit" class maps to Py_UNICODE_ISDECIMAL, and that a "word" character is
   an alphanumeric one or an underscore. */
#define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDECIMAL(ch)
#define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE(ch)
#define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK(ch)
#define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM(ch)
#define SRE_UNI_IS_WORD(ch) (SRE_UNI_IS_ALNUM(ch) || (ch) == '_')
121
/* Lowercase `ch` via Py_UNICODE_TOLOWER. */
static unsigned int sre_lower_unicode(unsigned int ch)
{
    unsigned int lowered = (unsigned int) Py_UNICODE_TOLOWER(ch);

    return lowered;
}
126
/* Uppercase `ch` via Py_UNICODE_TOUPPER. */
static unsigned int sre_upper_unicode(unsigned int ch)
{
    unsigned int raised = (unsigned int) Py_UNICODE_TOUPPER(ch);

    return raised;
}
131
132 LOCAL(int)
133 sre_category(SRE_CODE category, unsigned int ch)
134 {
135 switch (category) {
136
137 case SRE_CATEGORY_DIGIT:
138 return SRE_IS_DIGIT(ch);
139 case SRE_CATEGORY_NOT_DIGIT:
140 return !SRE_IS_DIGIT(ch);
141 case SRE_CATEGORY_SPACE:
142 return SRE_IS_SPACE(ch);
143 case SRE_CATEGORY_NOT_SPACE:
144 return !SRE_IS_SPACE(ch);
145 case SRE_CATEGORY_WORD:
146 return SRE_IS_WORD(ch);
147 case SRE_CATEGORY_NOT_WORD:
148 return !SRE_IS_WORD(ch);
149 case SRE_CATEGORY_LINEBREAK:
150 return SRE_IS_LINEBREAK(ch);
151 case SRE_CATEGORY_NOT_LINEBREAK:
152 return !SRE_IS_LINEBREAK(ch);
153
154 case SRE_CATEGORY_LOC_WORD:
155 return SRE_LOC_IS_WORD(ch);
156 case SRE_CATEGORY_LOC_NOT_WORD:
157 return !SRE_LOC_IS_WORD(ch);
158
159 case SRE_CATEGORY_UNI_DIGIT:
160 return SRE_UNI_IS_DIGIT(ch);
161 case SRE_CATEGORY_UNI_NOT_DIGIT:
162 return !SRE_UNI_IS_DIGIT(ch);
163 case SRE_CATEGORY_UNI_SPACE:
164 return SRE_UNI_IS_SPACE(ch);
165 case SRE_CATEGORY_UNI_NOT_SPACE:
166 return !SRE_UNI_IS_SPACE(ch);
167 case SRE_CATEGORY_UNI_WORD:
168 return SRE_UNI_IS_WORD(ch);
169 case SRE_CATEGORY_UNI_NOT_WORD:
170 return !SRE_UNI_IS_WORD(ch);
171 case SRE_CATEGORY_UNI_LINEBREAK:
172 return SRE_UNI_IS_LINEBREAK(ch);
173 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
174 return !SRE_UNI_IS_LINEBREAK(ch);
175 }
176 return 0;
177 }
178
179 LOCAL(int)
180 char_loc_ignore(SRE_CODE pattern, SRE_CODE ch)
181 {
182 return ch == pattern
183 || (SRE_CODE) sre_lower_locale(ch) == pattern
184 || (SRE_CODE) sre_upper_locale(ch) == pattern;
185 }
186
187
188 /* helpers */
189
190 static void
191 data_stack_dealloc(SRE_STATE* state)
192 {
193 if (state->data_stack) {
194 PyMem_Free(state->data_stack);
195 state->data_stack = NULL;
196 }
197 state->data_stack_size = state->data_stack_base = 0;
198 }
199
200 static int
201 data_stack_grow(SRE_STATE* state, Py_ssize_t size)
202 {
203 Py_ssize_t minsize, cursize;
204 minsize = state->data_stack_base+size;
205 cursize = state->data_stack_size;
206 if (cursize < minsize) {
207 void* stack;
208 cursize = minsize+minsize/4+1024;
209 TRACE(("allocate/grow stack %zd\n", cursize));
210 stack = PyMem_Realloc(state->data_stack, cursize);
211 if (!stack) {
212 data_stack_dealloc(state);
213 return SRE_ERROR_MEMORY;
214 }
215 state->data_stack = (char *)stack;
216 state->data_stack_size = cursize;
217 }
218 return 0;
219 }
220
/* sre_lib.h is included three times, once per character width.  Every
   inclusion is parameterised by SRE_CHAR (the element type), SIZEOF_SRE_CHAR
   (its size in bytes) and SRE(F) (the prefix pasted onto each generated
   name), which yields the sre_ucs1_*, sre_ucs2_* and sre_ucs4_* functions
   used by sre_match() and sre_search() below.
   NOTE(review): the three macros are redefined here without an #undef, so
   sre_lib.h presumably undefines them at its end -- confirm. */

/* generate 8-bit version */

#define SRE_CHAR Py_UCS1
#define SIZEOF_SRE_CHAR 1
#define SRE(F) sre_ucs1_##F
#include "sre_lib.h"

/* generate 16-bit unicode version */

#define SRE_CHAR Py_UCS2
#define SIZEOF_SRE_CHAR 2
#define SRE(F) sre_ucs2_##F
#include "sre_lib.h"

/* generate 32-bit unicode version */

#define SRE_CHAR Py_UCS4
#define SIZEOF_SRE_CHAR 4
#define SRE(F) sre_ucs4_##F
#include "sre_lib.h"
241
242 /* -------------------------------------------------------------------- */
243 /* factories and destructors */
244
/* module state */
typedef struct {
    PyTypeObject *Pattern_Type;   /* type object of _sre.SRE_Pattern */
    PyTypeObject *Match_Type;     /* type object of _sre.SRE_Match */
    PyTypeObject *Scanner_Type;   /* type object of _sre.SRE_Scanner */
    PyTypeObject *Template_Type;  /* exact type compile_template() requires
                                     of a compiled replacement template */
    /* fetched lazily by compile_template() while still NULL */
    PyObject *compile_template; // reference to re._compile_template
} _sremodulestate;
253
254 static _sremodulestate *
255 get_sre_module_state(PyObject *m)
256 {
257 _sremodulestate *state = (_sremodulestate *)_PyModule_GetState(m);
258 assert(state);
259 return state;
260 }
261
/* forward declaration of the module definition */
static struct PyModuleDef sremodule;
/* look up the module state through the module returned by
   PyType_GetModule(cls) */
#define get_sre_module_state_by_class(cls) \
    (get_sre_module_state(PyType_GetModule(cls)))

/* see sre.h for object declarations */
/* forward declarations of helpers that are used before their definitions */
static PyObject*pattern_new_match(_sremodulestate *, PatternObject*, SRE_STATE*, Py_ssize_t);
static PyObject *pattern_scanner(_sremodulestate *, PatternObject *, PyObject *, Py_ssize_t, Py_ssize_t);
269
270 /*[clinic input]
271 module _sre
272 class _sre.SRE_Pattern "PatternObject *" "get_sre_module_state_by_class(tp)->Pattern_Type"
273 class _sre.SRE_Match "MatchObject *" "get_sre_module_state_by_class(tp)->Match_Type"
274 class _sre.SRE_Scanner "ScannerObject *" "get_sre_module_state_by_class(tp)->Scanner_Type"
275 [clinic start generated code]*/
276 /*[clinic end generated code: output=da39a3ee5e6b4b0d input=fe2966e32b66a231]*/
277
278 /*[clinic input]
279 _sre.getcodesize -> int
280 [clinic start generated code]*/
281
282 static int
283 _sre_getcodesize_impl(PyObject *module)
284 /*[clinic end generated code: output=e0db7ce34a6dd7b1 input=bd6f6ecf4916bb2b]*/
285 {
286 return sizeof(SRE_CODE);
287 }
288
289 /*[clinic input]
290 _sre.ascii_iscased -> bool
291
292 character: int
293 /
294
295 [clinic start generated code]*/
296
297 static int
298 _sre_ascii_iscased_impl(PyObject *module, int character)
299 /*[clinic end generated code: output=4f454b630fbd19a2 input=9f0bd952812c7ed3]*/
300 {
301 unsigned int ch = (unsigned int)character;
302 return ch < 128 && Py_ISALPHA(ch);
303 }
304
305 /*[clinic input]
306 _sre.unicode_iscased -> bool
307
308 character: int
309 /
310
311 [clinic start generated code]*/
312
313 static int
314 _sre_unicode_iscased_impl(PyObject *module, int character)
315 /*[clinic end generated code: output=9c5ddee0dc2bc258 input=51e42c3b8dddb78e]*/
316 {
317 unsigned int ch = (unsigned int)character;
318 return ch != sre_lower_unicode(ch) || ch != sre_upper_unicode(ch);
319 }
320
321 /*[clinic input]
322 _sre.ascii_tolower -> int
323
324 character: int
325 /
326
327 [clinic start generated code]*/
328
329 static int
330 _sre_ascii_tolower_impl(PyObject *module, int character)
331 /*[clinic end generated code: output=228294ed6ff2a612 input=272c609b5b61f136]*/
332 {
333 return sre_lower_ascii(character);
334 }
335
336 /*[clinic input]
337 _sre.unicode_tolower -> int
338
339 character: int
340 /
341
342 [clinic start generated code]*/
343
344 static int
345 _sre_unicode_tolower_impl(PyObject *module, int character)
346 /*[clinic end generated code: output=6422272d7d7fee65 input=91d708c5f3c2045a]*/
347 {
348 return sre_lower_unicode(character);
349 }
350
351 LOCAL(void)
352 state_reset(SRE_STATE* state)
353 {
354 /* state->mark will be set to 0 in SRE_OP_MARK dynamically. */
355 /*memset(state->mark, 0, sizeof(*state->mark) * SRE_MARK_SIZE);*/
356
357 state->lastmark = -1;
358 state->lastindex = -1;
359
360 state->repeat = NULL;
361
362 data_stack_dealloc(state);
363 }
364
365 static const void*
366 getstring(PyObject* string, Py_ssize_t* p_length,
367 int* p_isbytes, int* p_charsize,
368 Py_buffer *view)
369 {
370 /* given a python object, return a data pointer, a length (in
371 characters), and a character size. return NULL if the object
372 is not a string (or not compatible) */
373
374 /* Unicode objects do not support the buffer API. So, get the data
375 directly instead. */
376 if (PyUnicode_Check(string)) {
377 if (PyUnicode_READY(string) == -1)
378 return NULL;
379 *p_length = PyUnicode_GET_LENGTH(string);
380 *p_charsize = PyUnicode_KIND(string);
381 *p_isbytes = 0;
382 return PyUnicode_DATA(string);
383 }
384
385 /* get pointer to byte string buffer */
386 if (PyObject_GetBuffer(string, view, PyBUF_SIMPLE) != 0) {
387 PyErr_Format(PyExc_TypeError, "expected string or bytes-like "
388 "object, got '%.200s'", Py_TYPE(string)->tp_name);
389 return NULL;
390 }
391
392 *p_length = view->len;
393 *p_charsize = 1;
394 *p_isbytes = 1;
395
396 if (view->buf == NULL) {
397 PyErr_SetString(PyExc_ValueError, "Buffer is NULL");
398 PyBuffer_Release(view);
399 view->buf = NULL;
400 return NULL;
401 }
402 return view->buf;
403 }
404
405 LOCAL(PyObject*)
406 state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
407 Py_ssize_t start, Py_ssize_t end)
408 {
409 /* prepare state object */
410
411 Py_ssize_t length;
412 int isbytes, charsize;
413 const void* ptr;
414
415 memset(state, 0, sizeof(SRE_STATE));
416
417 state->mark = PyMem_New(const void *, pattern->groups * 2);
418 if (!state->mark) {
419 PyErr_NoMemory();
420 goto err;
421 }
422 state->lastmark = -1;
423 state->lastindex = -1;
424
425 state->buffer.buf = NULL;
426 ptr = getstring(string, &length, &isbytes, &charsize, &state->buffer);
427 if (!ptr)
428 goto err;
429
430 if (isbytes && pattern->isbytes == 0) {
431 PyErr_SetString(PyExc_TypeError,
432 "cannot use a string pattern on a bytes-like object");
433 goto err;
434 }
435 if (!isbytes && pattern->isbytes > 0) {
436 PyErr_SetString(PyExc_TypeError,
437 "cannot use a bytes pattern on a string-like object");
438 goto err;
439 }
440
441 /* adjust boundaries */
442 if (start < 0)
443 start = 0;
444 else if (start > length)
445 start = length;
446
447 if (end < 0)
448 end = 0;
449 else if (end > length)
450 end = length;
451
452 state->isbytes = isbytes;
453 state->charsize = charsize;
454 state->match_all = 0;
455 state->must_advance = 0;
456
457 state->beginning = ptr;
458
459 state->start = (void*) ((char*) ptr + start * state->charsize);
460 state->end = (void*) ((char*) ptr + end * state->charsize);
461
462 state->string = Py_NewRef(string);
463 state->pos = start;
464 state->endpos = end;
465
466 return string;
467 err:
468 /* We add an explicit cast here because MSVC has a bug when
469 compiling C code where it believes that `const void**` cannot be
470 safely casted to `void*`, see bpo-39943 for details. */
471 PyMem_Free((void*) state->mark);
472 state->mark = NULL;
473 if (state->buffer.buf)
474 PyBuffer_Release(&state->buffer);
475 return NULL;
476 }
477
478 LOCAL(void)
479 state_fini(SRE_STATE* state)
480 {
481 if (state->buffer.buf)
482 PyBuffer_Release(&state->buffer);
483 Py_XDECREF(state->string);
484 data_stack_dealloc(state);
485 /* See above PyMem_Del for why we explicitly cast here. */
486 PyMem_Free((void*) state->mark);
487 state->mark = NULL;
488 }
489
/* calculate offset from start of string */
/* i.e. turn a pointer into the subject data into a character index: the
   byte distance from (state)->beginning divided by (state)->charsize */
#define STATE_OFFSET(state, member)\
    (((char*)(member) - (char*)(state)->beginning) / (state)->charsize)
493
494 LOCAL(PyObject*)
495 getslice(int isbytes, const void *ptr,
496 PyObject* string, Py_ssize_t start, Py_ssize_t end)
497 {
498 if (isbytes) {
499 if (PyBytes_CheckExact(string) &&
500 start == 0 && end == PyBytes_GET_SIZE(string)) {
501 return Py_NewRef(string);
502 }
503 return PyBytes_FromStringAndSize(
504 (const char *)ptr + start, end - start);
505 }
506 else {
507 return PyUnicode_Substring(string, start, end);
508 }
509 }
510
511 LOCAL(PyObject*)
512 state_getslice(SRE_STATE* state, Py_ssize_t index, PyObject* string, int empty)
513 {
514 Py_ssize_t i, j;
515
516 index = (index - 1) * 2;
517
518 if (string == Py_None || index >= state->lastmark || !state->mark[index] || !state->mark[index+1]) {
519 if (empty)
520 /* want empty string */
521 i = j = 0;
522 else {
523 Py_RETURN_NONE;
524 }
525 } else {
526 i = STATE_OFFSET(state, state->mark[index]);
527 j = STATE_OFFSET(state, state->mark[index+1]);
528
529 /* check wrong span */
530 if (i > j) {
531 PyErr_SetString(PyExc_SystemError,
532 "The span of capturing group is wrong,"
533 " please report a bug for the re module.");
534 return NULL;
535 }
536 }
537
538 return getslice(state->isbytes, state->beginning, string, i, j);
539 }
540
541 static void
542 pattern_error(Py_ssize_t status)
543 {
544 switch (status) {
545 case SRE_ERROR_RECURSION_LIMIT:
546 /* This error code seems to be unused. */
547 PyErr_SetString(
548 PyExc_RecursionError,
549 "maximum recursion limit exceeded"
550 );
551 break;
552 case SRE_ERROR_MEMORY:
553 PyErr_NoMemory();
554 break;
555 case SRE_ERROR_INTERRUPTED:
556 /* An exception has already been raised, so let it fly */
557 break;
558 default:
559 /* other error codes indicate compiler/engine bugs */
560 PyErr_SetString(
561 PyExc_RuntimeError,
562 "internal error in regular expression engine"
563 );
564 }
565 }
566
/* GC traversal for pattern objects: reports the object's type and the
   groupindex, indexgroup and pattern members to `visit`.  Py_VISIT returns
   early with the callback's result if that result is non-zero. */
static int
pattern_traverse(PatternObject *self, visitproc visit, void *arg)
{
    Py_VISIT(Py_TYPE(self));
    Py_VISIT(self->groupindex);
    Py_VISIT(self->indexgroup);
    Py_VISIT(self->pattern);
    return 0;
}
576
/* Drop the references held in groupindex, indexgroup and pattern, leaving
   the members NULL.  Always returns 0.  Also called from
   pattern_dealloc(). */
static int
pattern_clear(PatternObject *self)
{
    Py_CLEAR(self->groupindex);
    Py_CLEAR(self->indexgroup);
    Py_CLEAR(self->pattern);
    return 0;
}
585
/* Deallocator for pattern objects. */
static void
pattern_dealloc(PatternObject* self)
{
    /* remember the type: it is still needed after `self` has been freed */
    PyTypeObject *tp = Py_TYPE(self);

    /* stop GC tracking before the object is torn down */
    PyObject_GC_UnTrack(self);
    if (self->weakreflist != NULL) {
        PyObject_ClearWeakRefs((PyObject *) self);
    }
    (void)pattern_clear(self);
    tp->tp_free(self);
    /* release the instance's reference to its type (presumably a heap
       type -- confirm where the type is created) */
    Py_DECREF(tp);
}
599
600 LOCAL(Py_ssize_t)
601 sre_match(SRE_STATE* state, SRE_CODE* pattern)
602 {
603 if (state->charsize == 1)
604 return sre_ucs1_match(state, pattern, 1);
605 if (state->charsize == 2)
606 return sre_ucs2_match(state, pattern, 1);
607 assert(state->charsize == 4);
608 return sre_ucs4_match(state, pattern, 1);
609 }
610
611 LOCAL(Py_ssize_t)
612 sre_search(SRE_STATE* state, SRE_CODE* pattern)
613 {
614 if (state->charsize == 1)
615 return sre_ucs1_search(state, pattern);
616 if (state->charsize == 2)
617 return sre_ucs2_search(state, pattern);
618 assert(state->charsize == 4);
619 return sre_ucs4_search(state, pattern);
620 }
621
622 /*[clinic input]
623 _sre.SRE_Pattern.match
624
625 cls: defining_class
626 /
627 string: object
628 pos: Py_ssize_t = 0
629 endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize
630
631 Matches zero or more characters at the beginning of the string.
632 [clinic start generated code]*/
633
634 static PyObject *
635 _sre_SRE_Pattern_match_impl(PatternObject *self, PyTypeObject *cls,
636 PyObject *string, Py_ssize_t pos,
637 Py_ssize_t endpos)
638 /*[clinic end generated code: output=ec6208ea58a0cca0 input=4bdb9c3e564d13ac]*/
639 {
640 _sremodulestate *module_state = get_sre_module_state_by_class(cls);
641 SRE_STATE state;
642 Py_ssize_t status;
643 PyObject *match;
644
645 if (!state_init(&state, (PatternObject *)self, string, pos, endpos))
646 return NULL;
647
648 state.ptr = state.start;
649
650 TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self), state.ptr));
651
652 status = sre_match(&state, PatternObject_GetCode(self));
653
654 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
655 if (PyErr_Occurred()) {
656 state_fini(&state);
657 return NULL;
658 }
659
660 match = pattern_new_match(module_state, self, &state, status);
661 state_fini(&state);
662 return match;
663 }
664
665 /*[clinic input]
666 _sre.SRE_Pattern.fullmatch
667
668 cls: defining_class
669 /
670 string: object
671 pos: Py_ssize_t = 0
672 endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize
673
674 Matches against all of the string.
675 [clinic start generated code]*/
676
677 static PyObject *
678 _sre_SRE_Pattern_fullmatch_impl(PatternObject *self, PyTypeObject *cls,
679 PyObject *string, Py_ssize_t pos,
680 Py_ssize_t endpos)
681 /*[clinic end generated code: output=625b75b027ef94da input=50981172ab0fcfdd]*/
682 {
683 _sremodulestate *module_state = get_sre_module_state_by_class(cls);
684 SRE_STATE state;
685 Py_ssize_t status;
686 PyObject *match;
687
688 if (!state_init(&state, self, string, pos, endpos))
689 return NULL;
690
691 state.ptr = state.start;
692
693 TRACE(("|%p|%p|FULLMATCH\n", PatternObject_GetCode(self), state.ptr));
694
695 state.match_all = 1;
696 status = sre_match(&state, PatternObject_GetCode(self));
697
698 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
699 if (PyErr_Occurred()) {
700 state_fini(&state);
701 return NULL;
702 }
703
704 match = pattern_new_match(module_state, self, &state, status);
705 state_fini(&state);
706 return match;
707 }
708
709 /*[clinic input]
710 _sre.SRE_Pattern.search
711
712 cls: defining_class
713 /
714 string: object
715 pos: Py_ssize_t = 0
716 endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize
717
718 Scan through string looking for a match, and return a corresponding match object instance.
719
720 Return None if no position in the string matches.
721 [clinic start generated code]*/
722
723 static PyObject *
724 _sre_SRE_Pattern_search_impl(PatternObject *self, PyTypeObject *cls,
725 PyObject *string, Py_ssize_t pos,
726 Py_ssize_t endpos)
727 /*[clinic end generated code: output=bd7f2d9d583e1463 input=afa9afb66a74a4b3]*/
728 {
729 _sremodulestate *module_state = get_sre_module_state_by_class(cls);
730 SRE_STATE state;
731 Py_ssize_t status;
732 PyObject *match;
733
734 if (!state_init(&state, self, string, pos, endpos))
735 return NULL;
736
737 TRACE(("|%p|%p|SEARCH\n", PatternObject_GetCode(self), state.ptr));
738
739 status = sre_search(&state, PatternObject_GetCode(self));
740
741 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
742
743 if (PyErr_Occurred()) {
744 state_fini(&state);
745 return NULL;
746 }
747
748 match = pattern_new_match(module_state, self, &state, status);
749 state_fini(&state);
750 return match;
751 }
752
753 /*[clinic input]
754 _sre.SRE_Pattern.findall
755
756 string: object
757 pos: Py_ssize_t = 0
758 endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize
759
760 Return a list of all non-overlapping matches of pattern in string.
761 [clinic start generated code]*/
762
763 static PyObject *
764 _sre_SRE_Pattern_findall_impl(PatternObject *self, PyObject *string,
765 Py_ssize_t pos, Py_ssize_t endpos)
766 /*[clinic end generated code: output=f4966baceea60aca input=5b6a4ee799741563]*/
767 {
768 SRE_STATE state;
769 PyObject* list;
770 Py_ssize_t status;
771 Py_ssize_t i, b, e;
772
773 if (!state_init(&state, self, string, pos, endpos))
774 return NULL;
775
776 list = PyList_New(0);
777 if (!list) {
778 state_fini(&state);
779 return NULL;
780 }
781
782 while (state.start <= state.end) {
783
784 PyObject* item;
785
786 state_reset(&state);
787
788 state.ptr = state.start;
789
790 status = sre_search(&state, PatternObject_GetCode(self));
791 if (PyErr_Occurred())
792 goto error;
793
794 if (status <= 0) {
795 if (status == 0)
796 break;
797 pattern_error(status);
798 goto error;
799 }
800
801 /* don't bother to build a match object */
802 switch (self->groups) {
803 case 0:
804 b = STATE_OFFSET(&state, state.start);
805 e = STATE_OFFSET(&state, state.ptr);
806 item = getslice(state.isbytes, state.beginning,
807 string, b, e);
808 if (!item)
809 goto error;
810 break;
811 case 1:
812 item = state_getslice(&state, 1, string, 1);
813 if (!item)
814 goto error;
815 break;
816 default:
817 item = PyTuple_New(self->groups);
818 if (!item)
819 goto error;
820 for (i = 0; i < self->groups; i++) {
821 PyObject* o = state_getslice(&state, i+1, string, 1);
822 if (!o) {
823 Py_DECREF(item);
824 goto error;
825 }
826 PyTuple_SET_ITEM(item, i, o);
827 }
828 break;
829 }
830
831 status = PyList_Append(list, item);
832 Py_DECREF(item);
833 if (status < 0)
834 goto error;
835
836 state.must_advance = (state.ptr == state.start);
837 state.start = state.ptr;
838 }
839
840 state_fini(&state);
841 return list;
842
843 error:
844 Py_DECREF(list);
845 state_fini(&state);
846 return NULL;
847
848 }
849
850 /*[clinic input]
851 _sre.SRE_Pattern.finditer
852
853 cls: defining_class
854 /
855 string: object
856 pos: Py_ssize_t = 0
857 endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize
858
859 Return an iterator over all non-overlapping matches for the RE pattern in string.
860
861 For each match, the iterator returns a match object.
862 [clinic start generated code]*/
863
864 static PyObject *
865 _sre_SRE_Pattern_finditer_impl(PatternObject *self, PyTypeObject *cls,
866 PyObject *string, Py_ssize_t pos,
867 Py_ssize_t endpos)
868 /*[clinic end generated code: output=1791dbf3618ade56 input=812e332a4848cbaf]*/
869 {
870 _sremodulestate *module_state = get_sre_module_state_by_class(cls);
871 PyObject* scanner;
872 PyObject* search;
873 PyObject* iterator;
874
875 scanner = pattern_scanner(module_state, self, string, pos, endpos);
876 if (!scanner)
877 return NULL;
878
879 search = PyObject_GetAttrString(scanner, "search");
880 Py_DECREF(scanner);
881 if (!search)
882 return NULL;
883
884 iterator = PyCallIter_New(search, Py_None);
885 Py_DECREF(search);
886
887 return iterator;
888 }
889
890 /*[clinic input]
891 _sre.SRE_Pattern.scanner
892
893 cls: defining_class
894 /
895 string: object
896 pos: Py_ssize_t = 0
897 endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize
898
899 [clinic start generated code]*/
900
901 static PyObject *
902 _sre_SRE_Pattern_scanner_impl(PatternObject *self, PyTypeObject *cls,
903 PyObject *string, Py_ssize_t pos,
904 Py_ssize_t endpos)
905 /*[clinic end generated code: output=f70cd506112f1bd9 input=2e487e5151bcee4c]*/
906 {
907 _sremodulestate *module_state = get_sre_module_state_by_class(cls);
908
909 return pattern_scanner(module_state, self, string, pos, endpos);
910 }
911
912 /*[clinic input]
913 _sre.SRE_Pattern.split
914
915 string: object
916 maxsplit: Py_ssize_t = 0
917
918 Split string by the occurrences of pattern.
919 [clinic start generated code]*/
920
921 static PyObject *
922 _sre_SRE_Pattern_split_impl(PatternObject *self, PyObject *string,
923 Py_ssize_t maxsplit)
924 /*[clinic end generated code: output=7ac66f381c45e0be input=1eeeb10dafc9947a]*/
925 {
926 SRE_STATE state;
927 PyObject* list;
928 PyObject* item;
929 Py_ssize_t status;
930 Py_ssize_t n;
931 Py_ssize_t i;
932 const void* last;
933
934 assert(self->codesize != 0);
935
936 if (!state_init(&state, self, string, 0, PY_SSIZE_T_MAX))
937 return NULL;
938
939 list = PyList_New(0);
940 if (!list) {
941 state_fini(&state);
942 return NULL;
943 }
944
945 n = 0;
946 last = state.start;
947
948 while (!maxsplit || n < maxsplit) {
949
950 state_reset(&state);
951
952 state.ptr = state.start;
953
954 status = sre_search(&state, PatternObject_GetCode(self));
955 if (PyErr_Occurred())
956 goto error;
957
958 if (status <= 0) {
959 if (status == 0)
960 break;
961 pattern_error(status);
962 goto error;
963 }
964
965 /* get segment before this match */
966 item = getslice(state.isbytes, state.beginning,
967 string, STATE_OFFSET(&state, last),
968 STATE_OFFSET(&state, state.start)
969 );
970 if (!item)
971 goto error;
972 status = PyList_Append(list, item);
973 Py_DECREF(item);
974 if (status < 0)
975 goto error;
976
977 /* add groups (if any) */
978 for (i = 0; i < self->groups; i++) {
979 item = state_getslice(&state, i+1, string, 0);
980 if (!item)
981 goto error;
982 status = PyList_Append(list, item);
983 Py_DECREF(item);
984 if (status < 0)
985 goto error;
986 }
987
988 n = n + 1;
989 state.must_advance = (state.ptr == state.start);
990 last = state.start = state.ptr;
991
992 }
993
994 /* get segment following last match (even if empty) */
995 item = getslice(state.isbytes, state.beginning,
996 string, STATE_OFFSET(&state, last), state.endpos
997 );
998 if (!item)
999 goto error;
1000 status = PyList_Append(list, item);
1001 Py_DECREF(item);
1002 if (status < 0)
1003 goto error;
1004
1005 state_fini(&state);
1006 return list;
1007
1008 error:
1009 Py_DECREF(list);
1010 state_fini(&state);
1011 return NULL;
1012
1013 }
1014
1015 static PyObject *
1016 compile_template(_sremodulestate *module_state,
1017 PatternObject *pattern, PyObject *template)
1018 {
1019 /* delegate to Python code */
1020 PyObject *func = module_state->compile_template;
1021 if (func == NULL) {
1022 func = _PyImport_GetModuleAttrString("re", "_compile_template");
1023 if (func == NULL) {
1024 return NULL;
1025 }
1026 Py_XSETREF(module_state->compile_template, func);
1027 }
1028
1029 PyObject *args[] = {(PyObject *)pattern, template};
1030 PyObject *result = PyObject_Vectorcall(func, args, 2, NULL);
1031
1032 if (result == NULL && PyErr_ExceptionMatches(PyExc_TypeError)) {
1033 /* If the replacement string is unhashable (e.g. bytearray),
1034 * convert it to the basic type (str or bytes) and repeat. */
1035 if (PyUnicode_Check(template) && !PyUnicode_CheckExact(template)) {
1036 PyErr_Clear();
1037 template = _PyUnicode_Copy(template);
1038 }
1039 else if (PyObject_CheckBuffer(template) && !PyBytes_CheckExact(template)) {
1040 PyErr_Clear();
1041 template = PyBytes_FromObject(template);
1042 }
1043 else {
1044 return NULL;
1045 }
1046 if (template == NULL) {
1047 return NULL;
1048 }
1049 args[1] = template;
1050 result = PyObject_Vectorcall(func, args, 2, NULL);
1051 Py_DECREF(template);
1052 }
1053
1054 if (result != NULL && Py_TYPE(result) != module_state->Template_Type) {
1055 PyErr_Format(PyExc_RuntimeError,
1056 "the result of compiling a replacement string is %.200s",
1057 Py_TYPE(result)->tp_name);
1058 Py_DECREF(result);
1059 return NULL;
1060 }
1061 return result;
1062 }
1063
1064 static PyObject *expand_template(TemplateObject *, MatchObject *); /* Forward */
1065
1066 static PyObject*
1067 pattern_subx(_sremodulestate* module_state,
1068 PatternObject* self,
1069 PyObject* ptemplate,
1070 PyObject* string,
1071 Py_ssize_t count,
1072 Py_ssize_t subn)
1073 {
1074 SRE_STATE state;
1075 PyObject* list;
1076 PyObject* joiner;
1077 PyObject* item;
1078 PyObject* filter;
1079 PyObject* match;
1080 const void* ptr;
1081 Py_ssize_t status;
1082 Py_ssize_t n;
1083 Py_ssize_t i, b, e;
1084 int isbytes, charsize;
1085 enum {LITERAL, TEMPLATE, CALLABLE} filter_type;
1086 Py_buffer view;
1087
1088 if (PyCallable_Check(ptemplate)) {
1089 /* sub/subn takes either a function or a template */
1090 filter = Py_NewRef(ptemplate);
1091 filter_type = CALLABLE;
1092 } else {
1093 /* if not callable, check if it's a literal string */
1094 int literal;
1095 view.buf = NULL;
1096 ptr = getstring(ptemplate, &n, &isbytes, &charsize, &view);
1097 if (ptr) {
1098 if (charsize == 1)
1099 literal = memchr(ptr, '\\', n) == NULL;
1100 else
1101 literal = PyUnicode_FindChar(ptemplate, '\\', 0, n, 1) == -1;
1102 } else {
1103 PyErr_Clear();
1104 literal = 0;
1105 }
1106 if (view.buf)
1107 PyBuffer_Release(&view);
1108 if (literal) {
1109 filter = Py_NewRef(ptemplate);
1110 filter_type = LITERAL;
1111 } else {
1112 /* not a literal; hand it over to the template compiler */
1113 filter = compile_template(module_state, self, ptemplate);
1114 if (!filter)
1115 return NULL;
1116
1117 assert(Py_TYPE(filter) == module_state->Template_Type);
1118 if (Py_SIZE(filter) == 0) {
1119 Py_SETREF(filter,
1120 Py_NewRef(((TemplateObject *)filter)->literal));
1121 filter_type = LITERAL;
1122 }
1123 else {
1124 filter_type = TEMPLATE;
1125 }
1126 }
1127 }
1128
1129 if (!state_init(&state, self, string, 0, PY_SSIZE_T_MAX)) {
1130 Py_DECREF(filter);
1131 return NULL;
1132 }
1133
1134 list = PyList_New(0);
1135 if (!list) {
1136 Py_DECREF(filter);
1137 state_fini(&state);
1138 return NULL;
1139 }
1140
1141 n = i = 0;
1142
1143 while (!count || n < count) {
1144
1145 state_reset(&state);
1146
1147 state.ptr = state.start;
1148
1149 status = sre_search(&state, PatternObject_GetCode(self));
1150 if (PyErr_Occurred())
1151 goto error;
1152
1153 if (status <= 0) {
1154 if (status == 0)
1155 break;
1156 pattern_error(status);
1157 goto error;
1158 }
1159
1160 b = STATE_OFFSET(&state, state.start);
1161 e = STATE_OFFSET(&state, state.ptr);
1162
1163 if (i < b) {
1164 /* get segment before this match */
1165 item = getslice(state.isbytes, state.beginning,
1166 string, i, b);
1167 if (!item)
1168 goto error;
1169 status = PyList_Append(list, item);
1170 Py_DECREF(item);
1171 if (status < 0)
1172 goto error;
1173
1174 }
1175
1176 if (filter_type != LITERAL) {
1177 /* pass match object through filter */
1178 match = pattern_new_match(module_state, self, &state, 1);
1179 if (!match)
1180 goto error;
1181 if (filter_type == TEMPLATE) {
1182 item = expand_template((TemplateObject *)filter,
1183 (MatchObject *)match);
1184 }
1185 else {
1186 assert(filter_type == CALLABLE);
1187 item = PyObject_CallOneArg(filter, match);
1188 }
1189 Py_DECREF(match);
1190 if (!item)
1191 goto error;
1192 } else {
1193 /* filter is literal string */
1194 item = Py_NewRef(filter);
1195 }
1196
1197 /* add to list */
1198 if (item != Py_None) {
1199 status = PyList_Append(list, item);
1200 Py_DECREF(item);
1201 if (status < 0)
1202 goto error;
1203 }
1204
1205 i = e;
1206 n = n + 1;
1207 state.must_advance = (state.ptr == state.start);
1208 state.start = state.ptr;
1209 }
1210
1211 /* get segment following last match */
1212 if (i < state.endpos) {
1213 item = getslice(state.isbytes, state.beginning,
1214 string, i, state.endpos);
1215 if (!item)
1216 goto error;
1217 status = PyList_Append(list, item);
1218 Py_DECREF(item);
1219 if (status < 0)
1220 goto error;
1221 }
1222
1223 state_fini(&state);
1224
1225 Py_DECREF(filter);
1226
1227 /* convert list to single string (also removes list) */
1228 joiner = getslice(state.isbytes, state.beginning, string, 0, 0);
1229 if (!joiner) {
1230 Py_DECREF(list);
1231 return NULL;
1232 }
1233 if (PyList_GET_SIZE(list) == 0) {
1234 Py_DECREF(list);
1235 item = joiner;
1236 }
1237 else {
1238 if (state.isbytes)
1239 item = _PyBytes_Join(joiner, list);
1240 else
1241 item = PyUnicode_Join(joiner, list);
1242 Py_DECREF(joiner);
1243 Py_DECREF(list);
1244 if (!item)
1245 return NULL;
1246 }
1247
1248 if (subn)
1249 return Py_BuildValue("Nn", item, n);
1250
1251 return item;
1252
1253 error:
1254 Py_DECREF(list);
1255 state_fini(&state);
1256 Py_DECREF(filter);
1257 return NULL;
1258
1259 }
1260
1261 /*[clinic input]
1262 _sre.SRE_Pattern.sub
1263
1264 cls: defining_class
1265 /
1266 repl: object
1267 string: object
1268 count: Py_ssize_t = 0
1269
1270 Return the string obtained by replacing the leftmost non-overlapping occurrences of pattern in string by the replacement repl.
1271 [clinic start generated code]*/
1272
1273 static PyObject *
1274 _sre_SRE_Pattern_sub_impl(PatternObject *self, PyTypeObject *cls,
1275 PyObject *repl, PyObject *string, Py_ssize_t count)
1276 /*[clinic end generated code: output=4be141ab04bca60d input=d8d1d4ac2311a07c]*/
1277 {
1278 _sremodulestate *module_state = get_sre_module_state_by_class(cls);
1279
1280 return pattern_subx(module_state, self, repl, string, count, 0);
1281 }
1282
1283 /*[clinic input]
1284 _sre.SRE_Pattern.subn
1285
1286 cls: defining_class
1287 /
1288 repl: object
1289 string: object
1290 count: Py_ssize_t = 0
1291
1292 Return the tuple (new_string, number_of_subs_made) found by replacing the leftmost non-overlapping occurrences of pattern with the replacement repl.
1293 [clinic start generated code]*/
1294
1295 static PyObject *
1296 _sre_SRE_Pattern_subn_impl(PatternObject *self, PyTypeObject *cls,
1297 PyObject *repl, PyObject *string,
1298 Py_ssize_t count)
1299 /*[clinic end generated code: output=da02fd85258b1e1f input=8b78a65b8302e58d]*/
1300 {
1301 _sremodulestate *module_state = get_sre_module_state_by_class(cls);
1302
1303 return pattern_subx(module_state, self, repl, string, count, 1);
1304 }
1305
1306 /*[clinic input]
1307 _sre.SRE_Pattern.__copy__
1308
1309 [clinic start generated code]*/
1310
1311 static PyObject *
1312 _sre_SRE_Pattern___copy___impl(PatternObject *self)
1313 /*[clinic end generated code: output=85dedc2db1bd8694 input=a730a59d863bc9f5]*/
1314 {
1315 return Py_NewRef(self);
1316 }
1317
1318 /*[clinic input]
1319 _sre.SRE_Pattern.__deepcopy__
1320
1321 memo: object
1322 /
1323
1324 [clinic start generated code]*/
1325
1326 static PyObject *
1327 _sre_SRE_Pattern___deepcopy__(PatternObject *self, PyObject *memo)
1328 /*[clinic end generated code: output=2ad25679c1f1204a input=a465b1602f997bed]*/
1329 {
1330 return Py_NewRef(self);
1331 }
1332
1333 static PyObject *
1334 pattern_repr(PatternObject *obj)
1335 {
1336 static const struct {
1337 const char *name;
1338 int value;
1339 } flag_names[] = {
1340 {"re.TEMPLATE", SRE_FLAG_TEMPLATE},
1341 {"re.IGNORECASE", SRE_FLAG_IGNORECASE},
1342 {"re.LOCALE", SRE_FLAG_LOCALE},
1343 {"re.MULTILINE", SRE_FLAG_MULTILINE},
1344 {"re.DOTALL", SRE_FLAG_DOTALL},
1345 {"re.UNICODE", SRE_FLAG_UNICODE},
1346 {"re.VERBOSE", SRE_FLAG_VERBOSE},
1347 {"re.DEBUG", SRE_FLAG_DEBUG},
1348 {"re.ASCII", SRE_FLAG_ASCII},
1349 };
1350 PyObject *result = NULL;
1351 PyObject *flag_items;
1352 size_t i;
1353 int flags = obj->flags;
1354
1355 /* Omit re.UNICODE for valid string patterns. */
1356 if (obj->isbytes == 0 &&
1357 (flags & (SRE_FLAG_LOCALE|SRE_FLAG_UNICODE|SRE_FLAG_ASCII)) ==
1358 SRE_FLAG_UNICODE)
1359 flags &= ~SRE_FLAG_UNICODE;
1360
1361 flag_items = PyList_New(0);
1362 if (!flag_items)
1363 return NULL;
1364
1365 for (i = 0; i < Py_ARRAY_LENGTH(flag_names); i++) {
1366 if (flags & flag_names[i].value) {
1367 PyObject *item = PyUnicode_FromString(flag_names[i].name);
1368 if (!item)
1369 goto done;
1370
1371 if (PyList_Append(flag_items, item) < 0) {
1372 Py_DECREF(item);
1373 goto done;
1374 }
1375 Py_DECREF(item);
1376 flags &= ~flag_names[i].value;
1377 }
1378 }
1379 if (flags) {
1380 PyObject *item = PyUnicode_FromFormat("0x%x", flags);
1381 if (!item)
1382 goto done;
1383
1384 if (PyList_Append(flag_items, item) < 0) {
1385 Py_DECREF(item);
1386 goto done;
1387 }
1388 Py_DECREF(item);
1389 }
1390
1391 if (PyList_Size(flag_items) > 0) {
1392 PyObject *flags_result;
1393 PyObject *sep = PyUnicode_FromString("|");
1394 if (!sep)
1395 goto done;
1396 flags_result = PyUnicode_Join(sep, flag_items);
1397 Py_DECREF(sep);
1398 if (!flags_result)
1399 goto done;
1400 result = PyUnicode_FromFormat("re.compile(%.200R, %S)",
1401 obj->pattern, flags_result);
1402 Py_DECREF(flags_result);
1403 }
1404 else {
1405 result = PyUnicode_FromFormat("re.compile(%.200R)", obj->pattern);
1406 }
1407
1408 done:
1409 Py_DECREF(flag_items);
1410 return result;
1411 }
1412
1413 PyDoc_STRVAR(pattern_doc, "Compiled regular expression object.");
1414
1415 /* PatternObject's 'groupindex' method. */
1416 static PyObject *
1417 pattern_groupindex(PatternObject *self, void *Py_UNUSED(ignored))
1418 {
1419 if (self->groupindex == NULL)
1420 return PyDict_New();
1421 return PyDictProxy_New(self->groupindex);
1422 }
1423
1424 static int _validate(PatternObject *self); /* Forward */
1425
1426 /*[clinic input]
1427 _sre.compile
1428
1429 pattern: object
1430 flags: int
1431 code: object(subclass_of='&PyList_Type')
1432 groups: Py_ssize_t
1433 groupindex: object(subclass_of='&PyDict_Type')
1434 indexgroup: object(subclass_of='&PyTuple_Type')
1435
1436 [clinic start generated code]*/
1437
1438 static PyObject *
1439 _sre_compile_impl(PyObject *module, PyObject *pattern, int flags,
1440 PyObject *code, Py_ssize_t groups, PyObject *groupindex,
1441 PyObject *indexgroup)
1442 /*[clinic end generated code: output=ef9c2b3693776404 input=0a68476dbbe5db30]*/
1443 {
1444 /* "compile" pattern descriptor to pattern object */
1445
1446 _sremodulestate *module_state = get_sre_module_state(module);
1447 PatternObject* self;
1448 Py_ssize_t i, n;
1449
1450 n = PyList_GET_SIZE(code);
1451 /* coverity[ampersand_in_size] */
1452 self = PyObject_GC_NewVar(PatternObject, module_state->Pattern_Type, n);
1453 if (!self)
1454 return NULL;
1455 self->weakreflist = NULL;
1456 self->pattern = NULL;
1457 self->groupindex = NULL;
1458 self->indexgroup = NULL;
1459
1460 self->codesize = n;
1461
1462 for (i = 0; i < n; i++) {
1463 PyObject *o = PyList_GET_ITEM(code, i);
1464 unsigned long value = PyLong_AsUnsignedLong(o);
1465 self->code[i] = (SRE_CODE) value;
1466 if ((unsigned long) self->code[i] != value) {
1467 PyErr_SetString(PyExc_OverflowError,
1468 "regular expression code size limit exceeded");
1469 break;
1470 }
1471 }
1472 PyObject_GC_Track(self);
1473
1474 if (PyErr_Occurred()) {
1475 Py_DECREF(self);
1476 return NULL;
1477 }
1478
1479 if (pattern == Py_None) {
1480 self->isbytes = -1;
1481 }
1482 else {
1483 Py_ssize_t p_length;
1484 int charsize;
1485 Py_buffer view;
1486 view.buf = NULL;
1487 if (!getstring(pattern, &p_length, &self->isbytes,
1488 &charsize, &view)) {
1489 Py_DECREF(self);
1490 return NULL;
1491 }
1492 if (view.buf)
1493 PyBuffer_Release(&view);
1494 }
1495
1496 self->pattern = Py_NewRef(pattern);
1497
1498 self->flags = flags;
1499
1500 self->groups = groups;
1501
1502 if (PyDict_GET_SIZE(groupindex) > 0) {
1503 self->groupindex = Py_NewRef(groupindex);
1504 if (PyTuple_GET_SIZE(indexgroup) > 0) {
1505 self->indexgroup = Py_NewRef(indexgroup);
1506 }
1507 }
1508
1509 if (!_validate(self)) {
1510 Py_DECREF(self);
1511 return NULL;
1512 }
1513
1514 return (PyObject*) self;
1515 }
1516
1517 /*[clinic input]
1518 _sre.template
1519
1520 pattern: object
1521 template: object(subclass_of="&PyList_Type")
1522 A list containing interleaved literal strings (str or bytes) and group
1523 indices (int), as returned by re._parser.parse_template():
1524 [literal1, group1, ..., literalN, groupN]
1525 /
1526
1527 [clinic start generated code]*/
1528
1529 static PyObject *
1530 _sre_template_impl(PyObject *module, PyObject *pattern, PyObject *template)
1531 /*[clinic end generated code: output=d51290e596ebca86 input=af55380b27f02942]*/
1532 {
1533 /* template is a list containing interleaved literal strings (str or bytes)
1534 * and group indices (int), as returned by _parser.parse_template:
1535 * [literal1, group1, literal2, ..., literalN].
1536 */
1537 _sremodulestate *module_state = get_sre_module_state(module);
1538 TemplateObject *self = NULL;
1539 Py_ssize_t n = PyList_GET_SIZE(template);
1540 if ((n & 1) == 0 || n < 1) {
1541 goto bad_template;
1542 }
1543 n /= 2;
1544 self = PyObject_GC_NewVar(TemplateObject, module_state->Template_Type, n);
1545 if (!self)
1546 return NULL;
1547 self->chunks = 1 + 2*n;
1548 self->literal = Py_NewRef(PyList_GET_ITEM(template, 0));
1549 for (Py_ssize_t i = 0; i < n; i++) {
1550 Py_ssize_t index = PyLong_AsSsize_t(PyList_GET_ITEM(template, 2*i+1));
1551 if (index == -1 && PyErr_Occurred()) {
1552 Py_SET_SIZE(self, i);
1553 Py_DECREF(self);
1554 return NULL;
1555 }
1556 if (index < 0) {
1557 Py_SET_SIZE(self, i);
1558 goto bad_template;
1559 }
1560 self->items[i].index = index;
1561
1562 PyObject *literal = PyList_GET_ITEM(template, 2*i+2);
1563 // Skip empty literals.
1564 if ((PyUnicode_Check(literal) && !PyUnicode_GET_LENGTH(literal)) ||
1565 (PyBytes_Check(literal) && !PyBytes_GET_SIZE(literal)))
1566 {
1567 literal = NULL;
1568 self->chunks--;
1569 }
1570 self->items[i].literal = Py_XNewRef(literal);
1571 }
1572 return (PyObject*) self;
1573
1574 bad_template:
1575 PyErr_SetString(PyExc_TypeError, "invalid template");
1576 Py_XDECREF(self);
1577 return NULL;
1578 }
1579
1580 /* -------------------------------------------------------------------- */
1581 /* Code validation */
1582
/* To learn more about this code, have a look at the _compile() function in
   Lib/sre_compile.py.  The validation functions below check the code array
   for conformance with the code patterns generated there.
1586
1587 The nice thing about the generated code is that it is position-independent:
1588 all jumps are relative jumps forward. Also, jumps don't cross each other:
1589 the target of a later jump is always earlier than the target of an earlier
1590 jump. IOW, this is okay:
1591
1592 J---------J-------T--------T
1593 \ \_____/ /
1594 \______________________/
1595
1596 but this is not:
1597
1598 J---------J-------T--------T
1599 \_________\_____/ /
1600 \____________/
1601
1602 It also helps that SRE_CODE is always an unsigned type.
1603 */
1604
/* Defining this one enables tracing of the validator */
#undef VVERBOSE

/* Trace macro for the validator.  The argument is a fully parenthesized
   printf argument list, e.g. VTRACE(("x=%d\n", x)). */
#if defined(VVERBOSE)
#define VTRACE(v) printf v
#else
#define VTRACE(v) do {} while(0)  /* do nothing */
#endif

/* Report failure: returns -1 from the enclosing function */
#define FAIL do { VTRACE(("FAIL: %d\n", __LINE__)); return -1; } while (0)

/* Extract opcode, argument, or skip count from code array.

   These macros operate on locals of the enclosing function: they read and
   advance `code`, compare it against `end`, and store into `op`, `arg` or
   `skip` respectively.  Each one FAILs when no word is left to read. */
#define GET_OP                                          \
    do {                                                \
        VTRACE(("%p: ", code));                         \
        if (code >= end) FAIL;                          \
        op = *code++;                                   \
        VTRACE(("%lu (op)\n", (unsigned long)op));      \
    } while (0)
#define GET_ARG                                         \
    do {                                                \
        VTRACE(("%p= ", code));                         \
        if (code >= end) FAIL;                          \
        arg = *code++;                                  \
        VTRACE(("%lu (arg)\n", (unsigned long)arg));    \
    } while (0)
/* Read a skip count and FAIL unless (skip - adj) stays within the words
   remaining up to `end`, measured from the skip word itself.  `adj` is
   parenthesized so that an expression argument keeps its own precedence. */
#define GET_SKIP_ADJ(adj)                               \
    do {                                                \
        VTRACE(("%p= ", code));                         \
        if (code >= end) FAIL;                          \
        skip = *code;                                   \
        VTRACE(("%lu (skip to %p)\n",                   \
               (unsigned long)skip, code+skip));        \
        if ((skip)-(adj) > (uintptr_t)(end - code))     \
            FAIL;                                       \
        code++;                                         \
    } while (0)
#define GET_SKIP GET_SKIP_ADJ(0)
1645
1646 static int
1647 _validate_charset(SRE_CODE *code, SRE_CODE *end)
1648 {
1649 /* Some variables are manipulated by the macros above */
1650 SRE_CODE op;
1651 SRE_CODE arg;
1652 SRE_CODE offset;
1653 int i;
1654
1655 while (code < end) {
1656 GET_OP;
1657 switch (op) {
1658
1659 case SRE_OP_NEGATE:
1660 break;
1661
1662 case SRE_OP_LITERAL:
1663 GET_ARG;
1664 break;
1665
1666 case SRE_OP_RANGE:
1667 case SRE_OP_RANGE_UNI_IGNORE:
1668 GET_ARG;
1669 GET_ARG;
1670 break;
1671
1672 case SRE_OP_CHARSET:
1673 offset = 256/SRE_CODE_BITS; /* 256-bit bitmap */
1674 if (offset > (uintptr_t)(end - code))
1675 FAIL;
1676 code += offset;
1677 break;
1678
1679 case SRE_OP_BIGCHARSET:
1680 GET_ARG; /* Number of blocks */
1681 offset = 256/sizeof(SRE_CODE); /* 256-byte table */
1682 if (offset > (uintptr_t)(end - code))
1683 FAIL;
1684 /* Make sure that each byte points to a valid block */
1685 for (i = 0; i < 256; i++) {
1686 if (((unsigned char *)code)[i] >= arg)
1687 FAIL;
1688 }
1689 code += offset;
1690 offset = arg * (256/SRE_CODE_BITS); /* 256-bit bitmap times arg */
1691 if (offset > (uintptr_t)(end - code))
1692 FAIL;
1693 code += offset;
1694 break;
1695
1696 case SRE_OP_CATEGORY:
1697 GET_ARG;
1698 switch (arg) {
1699 case SRE_CATEGORY_DIGIT:
1700 case SRE_CATEGORY_NOT_DIGIT:
1701 case SRE_CATEGORY_SPACE:
1702 case SRE_CATEGORY_NOT_SPACE:
1703 case SRE_CATEGORY_WORD:
1704 case SRE_CATEGORY_NOT_WORD:
1705 case SRE_CATEGORY_LINEBREAK:
1706 case SRE_CATEGORY_NOT_LINEBREAK:
1707 case SRE_CATEGORY_LOC_WORD:
1708 case SRE_CATEGORY_LOC_NOT_WORD:
1709 case SRE_CATEGORY_UNI_DIGIT:
1710 case SRE_CATEGORY_UNI_NOT_DIGIT:
1711 case SRE_CATEGORY_UNI_SPACE:
1712 case SRE_CATEGORY_UNI_NOT_SPACE:
1713 case SRE_CATEGORY_UNI_WORD:
1714 case SRE_CATEGORY_UNI_NOT_WORD:
1715 case SRE_CATEGORY_UNI_LINEBREAK:
1716 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
1717 break;
1718 default:
1719 FAIL;
1720 }
1721 break;
1722
1723 default:
1724 FAIL;
1725
1726 }
1727 }
1728
1729 return 0;
1730 }
1731
/* Validate the opcode sequence stored in [code, end).

   `groups` is the group count of the pattern; it bounds the arguments of
   MARK, GROUPREF* and GROUPREF_EXISTS.  Nested constructs (BRANCH, the
   REPEAT family, ATOMIC_GROUP, GROUPREF_EXISTS, ASSERT*) are checked by
   recursing on the sub-range selected by their skip count.

   Returns 0 on success, -1 on failure, and 1 if the last op is JUMP. */
static int
_validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
{
    /* Some variables are manipulated by the macros above */
    SRE_CODE op;
    SRE_CODE arg;
    SRE_CODE skip;

    VTRACE(("code=%p, end=%p\n", code, end));

    /* An inverted range means a caller derived `end` from a bogus skip */
    if (code > end)
        FAIL;

    while (code < end) {
        GET_OP;
        switch (op) {

        case SRE_OP_MARK:
            /* We don't check whether marks are properly nested; the
               sre_match() code is robust even if they don't, and the worst
               you can get is nonsensical match results. */
            GET_ARG;
            if (arg > 2 * (size_t)groups + 1) {
                VTRACE(("arg=%d, groups=%d\n", (int)arg, (int)groups));
                FAIL;
            }
            break;

        case SRE_OP_LITERAL:
        case SRE_OP_NOT_LITERAL:
        case SRE_OP_LITERAL_IGNORE:
        case SRE_OP_NOT_LITERAL_IGNORE:
        case SRE_OP_LITERAL_UNI_IGNORE:
        case SRE_OP_NOT_LITERAL_UNI_IGNORE:
        case SRE_OP_LITERAL_LOC_IGNORE:
        case SRE_OP_NOT_LITERAL_LOC_IGNORE:
            GET_ARG;
            /* The arg is just a character, nothing to check */
            break;

        case SRE_OP_SUCCESS:
        case SRE_OP_FAILURE:
            /* Nothing to check; these normally end the matching process */
            break;

        case SRE_OP_AT:
            GET_ARG;
            /* Only the known anchor codes are accepted */
            switch (arg) {
            case SRE_AT_BEGINNING:
            case SRE_AT_BEGINNING_STRING:
            case SRE_AT_BEGINNING_LINE:
            case SRE_AT_END:
            case SRE_AT_END_LINE:
            case SRE_AT_END_STRING:
            case SRE_AT_BOUNDARY:
            case SRE_AT_NON_BOUNDARY:
            case SRE_AT_LOC_BOUNDARY:
            case SRE_AT_LOC_NON_BOUNDARY:
            case SRE_AT_UNI_BOUNDARY:
            case SRE_AT_UNI_NON_BOUNDARY:
                break;
            default:
                FAIL;
            }
            break;

        case SRE_OP_ANY:
        case SRE_OP_ANY_ALL:
            /* These have no operands */
            break;

        case SRE_OP_IN:
        case SRE_OP_IN_IGNORE:
        case SRE_OP_IN_UNI_IGNORE:
        case SRE_OP_IN_LOC_IGNORE:
            /* GET_SKIP leaves `code` one word past the skip word, so the
               skip target is code+skip-1 */
            GET_SKIP;
            /* Stop 1 before the end; we check the FAILURE below */
            if (_validate_charset(code, code+skip-2))
                FAIL;
            if (code[skip-2] != SRE_OP_FAILURE)
                FAIL;
            code += skip-1;
            break;

        case SRE_OP_INFO:
            {
                /* A minimal info field is
                   <INFO> <1=skip> <2=flags> <3=min> <4=max>;
                   If SRE_INFO_PREFIX or SRE_INFO_CHARSET is in the flags,
                   more follows. */
                SRE_CODE flags, i;
                SRE_CODE *newcode;
                GET_SKIP;
                /* newcode is the skip target: where the info block ends */
                newcode = code+skip-1;
                GET_ARG; flags = arg;
                /* min and max are read but not checked */
                GET_ARG;
                GET_ARG;
                /* Check that only valid flags are present */
                if ((flags & ~(SRE_INFO_PREFIX |
                               SRE_INFO_LITERAL |
                               SRE_INFO_CHARSET)) != 0)
                    FAIL;
                /* PREFIX and CHARSET are mutually exclusive */
                if ((flags & SRE_INFO_PREFIX) &&
                    (flags & SRE_INFO_CHARSET))
                    FAIL;
                /* LITERAL implies PREFIX */
                if ((flags & SRE_INFO_LITERAL) &&
                    !(flags & SRE_INFO_PREFIX))
                    FAIL;
                /* Validate the prefix */
                if (flags & SRE_INFO_PREFIX) {
                    SRE_CODE prefix_len;
                    GET_ARG; prefix_len = arg;
                    /* One more word follows the length; it is not checked */
                    GET_ARG;
                    /* Here comes the prefix string */
                    if (prefix_len > (uintptr_t)(newcode - code))
                        FAIL;
                    code += prefix_len;
                    /* And here comes the overlap table */
                    if (prefix_len > (uintptr_t)(newcode - code))
                        FAIL;
                    /* Each overlap value should be < prefix_len */
                    for (i = 0; i < prefix_len; i++) {
                        if (code[i] >= prefix_len)
                            FAIL;
                    }
                    code += prefix_len;
                }
                /* Validate the charset */
                if (flags & SRE_INFO_CHARSET) {
                    if (_validate_charset(code, newcode-1))
                        FAIL;
                    if (newcode[-1] != SRE_OP_FAILURE)
                        FAIL;
                    code = newcode;
                }
                /* Without a charset the block must end exactly here */
                else if (code != newcode) {
                  VTRACE(("code=%p, newcode=%p\n", code, newcode));
                    FAIL;
                }
            }
            break;

        case SRE_OP_BRANCH:
            {
                /* Common target of the JUMPs that end the alternatives */
                SRE_CODE *target = NULL;
                for (;;) {
                    GET_SKIP;
                    /* A zero skip terminates the list of alternatives */
                    if (skip == 0)
                        break;
                    /* Stop 2 before the end; we check the JUMP below */
                    if (_validate_inner(code, code+skip-3, groups))
                        FAIL;
                    code += skip-3;
                    /* Check that it ends with a JUMP, and that each JUMP
                       has the same target */
                    GET_OP;
                    if (op != SRE_OP_JUMP)
                        FAIL;
                    GET_SKIP;
                    if (target == NULL)
                        target = code+skip-1;
                    else if (code+skip-1 != target)
                        FAIL;
                }
                /* The JUMP target must be the word right after the branch
                   (this also rejects a BRANCH with no alternatives) */
                if (code != target)
                    FAIL;
            }
            break;

        case SRE_OP_REPEAT_ONE:
        case SRE_OP_MIN_REPEAT_ONE:
        case SRE_OP_POSSESSIVE_REPEAT_ONE:
            {
                SRE_CODE min, max;
                GET_SKIP;
                GET_ARG; min = arg;
                GET_ARG; max = arg;
                if (min > max)
                    FAIL;
                if (max > SRE_MAXREPEAT)
                    FAIL;
                /* The body stops one word before the skip target; that
                   last word must be SUCCESS (checked below) */
                if (_validate_inner(code, code+skip-4, groups))
                    FAIL;
                code += skip-4;
                GET_OP;
                if (op != SRE_OP_SUCCESS)
                    FAIL;
            }
            break;

        case SRE_OP_REPEAT:
        case SRE_OP_POSSESSIVE_REPEAT:
            {
                /* Remember which repeat this is: GET_OP below reuses `op` */
                SRE_CODE op1 = op, min, max;
                GET_SKIP;
                GET_ARG; min = arg;
                GET_ARG; max = arg;
                if (min > max)
                    FAIL;
                if (max > SRE_MAXREPEAT)
                    FAIL;
                if (_validate_inner(code, code+skip-3, groups))
                    FAIL;
                code += skip-3;
                GET_OP;
                /* POSSESSIVE_REPEAT ends in SUCCESS, REPEAT in an UNTIL op */
                if (op1 == SRE_OP_POSSESSIVE_REPEAT) {
                    if (op != SRE_OP_SUCCESS)
                        FAIL;
                }
                else {
                    if (op != SRE_OP_MAX_UNTIL && op != SRE_OP_MIN_UNTIL)
                        FAIL;
                }
            }
            break;

        case SRE_OP_ATOMIC_GROUP:
            {
                GET_SKIP;
                /* The group body must be followed by SUCCESS */
                if (_validate_inner(code, code+skip-2, groups))
                    FAIL;
                code += skip-2;
                GET_OP;
                if (op != SRE_OP_SUCCESS)
                    FAIL;
            }
            break;

        case SRE_OP_GROUPREF:
        case SRE_OP_GROUPREF_IGNORE:
        case SRE_OP_GROUPREF_UNI_IGNORE:
        case SRE_OP_GROUPREF_LOC_IGNORE:
            GET_ARG;
            /* The referenced group number must be below `groups` */
            if (arg >= (size_t)groups)
                FAIL;
            break;

        case SRE_OP_GROUPREF_EXISTS:
            /* The regex syntax for this is: '(?(group)then|else)', where
               'group' is either an integer group number or a group name,
               'then' and 'else' are sub-regexes, and 'else' is optional. */
            GET_ARG;
            if (arg >= (size_t)groups)
                FAIL;
            GET_SKIP_ADJ(1);
            code--; /* The skip is relative to the first arg! */
            /* There are two possibilities here: if there is both a 'then'
               part and an 'else' part, the generated code looks like:

               GROUPREF_EXISTS
               <group>
               <skipyes>
               ...then part...
               JUMP
               <skipno>
               (<skipyes> jumps here)
               ...else part...
               (<skipno> jumps here)

               If there is only a 'then' part, it looks like:

               GROUPREF_EXISTS
               <group>
               <skip>
               ...then part...
               (<skip> jumps here)

               There is no direct way to decide which it is, and we don't want
               to allow arbitrary jumps anywhere in the code; so we just look
               for a JUMP opcode preceding our skip target.
            */
            VTRACE(("then part:\n"));
            /* rc == 1 means the 'then' part ended in a JUMP, i.e. an
               'else' part follows */
            int rc = _validate_inner(code+1, code+skip-1, groups);
            if (rc == 1) {
                VTRACE(("else part:\n"));
                code += skip-2; /* Position after JUMP, at <skipno> */
                GET_SKIP;
                rc = _validate_inner(code, code+skip-1, groups);
            }
            /* Any non-zero result fails, including an 'else' part that
               itself ends in a JUMP */
            if (rc)
                FAIL;
            code += skip-1;
            break;

        case SRE_OP_ASSERT:
        case SRE_OP_ASSERT_NOT:
            GET_SKIP;
            GET_ARG; /* 0 for lookahead, width for lookbehind */
            code--; /* Back up over arg to simplify math below */
            if (arg & 0x80000000)
                FAIL; /* Width too large */
            /* Stop 1 before the end; we check the SUCCESS below */
            if (_validate_inner(code+1, code+skip-2, groups))
                FAIL;
            code += skip-2;
            GET_OP;
            if (op != SRE_OP_SUCCESS)
                FAIL;
            break;

        case SRE_OP_JUMP:
            /* JUMP is only accepted when exactly one word (its skip) is
               left in the range; that skip is not examined here */
            if (code + 1 != end)
                FAIL;
            VTRACE(("JUMP: %d\n", __LINE__));
            return 1;

        default:
            FAIL;

        }
    }

    VTRACE(("okay\n"));
    return 0;
}
2050
2051 static int
2052 _validate_outer(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
2053 {
2054 if (groups < 0 || (size_t)groups > SRE_MAXGROUPS ||
2055 code >= end || end[-1] != SRE_OP_SUCCESS)
2056 FAIL;
2057 return _validate_inner(code, end-1, groups);
2058 }
2059
2060 static int
2061 _validate(PatternObject *self)
2062 {
2063 if (_validate_outer(self->code, self->code+self->codesize, self->groups))
2064 {
2065 PyErr_SetString(PyExc_RuntimeError, "invalid SRE code");
2066 return 0;
2067 }
2068 else
2069 VTRACE(("Success!\n"));
2070 return 1;
2071 }
2072
2073 /* -------------------------------------------------------------------- */
2074 /* match methods */
2075
/* GC traversal function for match objects: passes the object's type and
   its string, regs and pattern references to the `visit` callback. */
static int
match_traverse(MatchObject *self, visitproc visit, void *arg)
{
    Py_VISIT(Py_TYPE(self));
    Py_VISIT(self->string);
    Py_VISIT(self->regs);
    Py_VISIT(self->pattern);
    return 0;
}
2085
/* Drop the string, regs and pattern references held by a match object.
   Also called from match_dealloc().  Always returns 0. */
static int
match_clear(MatchObject *self)
{
    Py_CLEAR(self->string);
    Py_CLEAR(self->regs);
    Py_CLEAR(self->pattern);
    return 0;
}
2094
2095 static void
2096 match_dealloc(MatchObject* self)
2097 {
2098 PyTypeObject *tp = Py_TYPE(self);
2099
2100 PyObject_GC_UnTrack(self);
2101 (void)match_clear(self);
2102 tp->tp_free(self);
2103 Py_DECREF(tp);
2104 }
2105
2106 static PyObject*
2107 match_getslice_by_index(MatchObject* self, Py_ssize_t index, PyObject* def)
2108 {
2109 Py_ssize_t length;
2110 int isbytes, charsize;
2111 Py_buffer view;
2112 PyObject *result;
2113 const void* ptr;
2114 Py_ssize_t i, j;
2115
2116 assert(0 <= index && index < self->groups);
2117 index *= 2;
2118
2119 if (self->string == Py_None || self->mark[index] < 0) {
2120 /* return default value if the string or group is undefined */
2121 return Py_NewRef(def);
2122 }
2123
2124 ptr = getstring(self->string, &length, &isbytes, &charsize, &view);
2125 if (ptr == NULL)
2126 return NULL;
2127
2128 i = self->mark[index];
2129 j = self->mark[index+1];
2130 i = Py_MIN(i, length);
2131 j = Py_MIN(j, length);
2132 result = getslice(isbytes, ptr, self->string, i, j);
2133 if (isbytes && view.buf != NULL)
2134 PyBuffer_Release(&view);
2135 return result;
2136 }
2137
2138 static Py_ssize_t
2139 match_getindex(MatchObject* self, PyObject* index)
2140 {
2141 Py_ssize_t i;
2142
2143 if (index == NULL)
2144 /* Default value */
2145 return 0;
2146
2147 if (PyIndex_Check(index)) {
2148 i = PyNumber_AsSsize_t(index, NULL);
2149 }
2150 else {
2151 i = -1;
2152
2153 if (self->pattern->groupindex) {
2154 index = PyDict_GetItemWithError(self->pattern->groupindex, index);
2155 if (index && PyLong_Check(index)) {
2156 i = PyLong_AsSsize_t(index);
2157 }
2158 }
2159 }
2160 if (i < 0 || i >= self->groups) {
2161 /* raise IndexError if we were given a bad group number */
2162 if (!PyErr_Occurred()) {
2163 PyErr_SetString(PyExc_IndexError, "no such group");
2164 }
2165 return -1;
2166 }
2167
2168 return i;
2169 }
2170
2171 static PyObject*
2172 match_getslice(MatchObject* self, PyObject* index, PyObject* def)
2173 {
2174 Py_ssize_t i = match_getindex(self, index);
2175
2176 if (i < 0) {
2177 return NULL;
2178 }
2179
2180 return match_getslice_by_index(self, i, def);
2181 }
2182
2183 /*[clinic input]
2184 _sre.SRE_Match.expand
2185
2186 template: object
2187
2188 Return the string obtained by doing backslash substitution on the string template, as done by the sub() method.
2189 [clinic start generated code]*/
2190
2191 static PyObject *
2192 _sre_SRE_Match_expand_impl(MatchObject *self, PyObject *template)
2193 /*[clinic end generated code: output=931b58ccc323c3a1 input=4bfdb22c2f8b146a]*/
2194 {
2195 _sremodulestate *module_state = get_sre_module_state_by_class(Py_TYPE(self));
2196 PyObject *filter = compile_template(module_state, self->pattern, template);
2197 if (filter == NULL) {
2198 return NULL;
2199 }
2200 PyObject *result = expand_template((TemplateObject *)filter, self);
2201 Py_DECREF(filter);
2202 return result;
2203 }
2204
2205 static PyObject*
2206 match_group(MatchObject* self, PyObject* args)
2207 {
2208 PyObject* result;
2209 Py_ssize_t i, size;
2210
2211 size = PyTuple_GET_SIZE(args);
2212
2213 switch (size) {
2214 case 0:
2215 result = match_getslice(self, _PyLong_GetZero(), Py_None);
2216 break;
2217 case 1:
2218 result = match_getslice(self, PyTuple_GET_ITEM(args, 0), Py_None);
2219 break;
2220 default:
2221 /* fetch multiple items */
2222 result = PyTuple_New(size);
2223 if (!result)
2224 return NULL;
2225 for (i = 0; i < size; i++) {
2226 PyObject* item = match_getslice(
2227 self, PyTuple_GET_ITEM(args, i), Py_None
2228 );
2229 if (!item) {
2230 Py_DECREF(result);
2231 return NULL;
2232 }
2233 PyTuple_SET_ITEM(result, i, item);
2234 }
2235 break;
2236 }
2237 return result;
2238 }
2239
2240 static PyObject*
2241 match_getitem(MatchObject* self, PyObject* name)
2242 {
2243 return match_getslice(self, name, Py_None);
2244 }
2245
2246 /*[clinic input]
2247 _sre.SRE_Match.groups
2248
2249 default: object = None
2250 Is used for groups that did not participate in the match.
2251
2252 Return a tuple containing all the subgroups of the match, from 1.
2253 [clinic start generated code]*/
2254
2255 static PyObject *
2256 _sre_SRE_Match_groups_impl(MatchObject *self, PyObject *default_value)
2257 /*[clinic end generated code: output=daf8e2641537238a input=bb069ef55dabca91]*/
2258 {
2259 PyObject* result;
2260 Py_ssize_t index;
2261
2262 result = PyTuple_New(self->groups-1);
2263 if (!result)
2264 return NULL;
2265
2266 for (index = 1; index < self->groups; index++) {
2267 PyObject* item;
2268 item = match_getslice_by_index(self, index, default_value);
2269 if (!item) {
2270 Py_DECREF(result);
2271 return NULL;
2272 }
2273 PyTuple_SET_ITEM(result, index-1, item);
2274 }
2275
2276 return result;
2277 }
2278
2279 /*[clinic input]
2280 _sre.SRE_Match.groupdict
2281
2282 default: object = None
2283 Is used for groups that did not participate in the match.
2284
2285 Return a dictionary containing all the named subgroups of the match, keyed by the subgroup name.
2286 [clinic start generated code]*/
2287
2288 static PyObject *
2289 _sre_SRE_Match_groupdict_impl(MatchObject *self, PyObject *default_value)
2290 /*[clinic end generated code: output=29917c9073e41757 input=0ded7960b23780aa]*/
2291 {
2292 PyObject *result;
2293 PyObject *key;
2294 PyObject *value;
2295 Py_ssize_t pos = 0;
2296 Py_hash_t hash;
2297
2298 result = PyDict_New();
2299 if (!result || !self->pattern->groupindex)
2300 return result;
2301
2302 while (_PyDict_Next(self->pattern->groupindex, &pos, &key, &value, &hash)) {
2303 int status;
2304 Py_INCREF(key);
2305 value = match_getslice(self, key, default_value);
2306 if (!value) {
2307 Py_DECREF(key);
2308 goto failed;
2309 }
2310 status = _PyDict_SetItem_KnownHash(result, key, value, hash);
2311 Py_DECREF(value);
2312 Py_DECREF(key);
2313 if (status < 0)
2314 goto failed;
2315 }
2316
2317 return result;
2318
2319 failed:
2320 Py_DECREF(result);
2321 return NULL;
2322 }
2323
2324 /*[clinic input]
2325 _sre.SRE_Match.start -> Py_ssize_t
2326
2327 group: object(c_default="NULL") = 0
2328 /
2329
2330 Return index of the start of the substring matched by group.
2331 [clinic start generated code]*/
2332
2333 static Py_ssize_t
2334 _sre_SRE_Match_start_impl(MatchObject *self, PyObject *group)
2335 /*[clinic end generated code: output=3f6e7f9df2fb5201 input=ced8e4ed4b33ee6c]*/
2336 {
2337 Py_ssize_t index = match_getindex(self, group);
2338
2339 if (index < 0) {
2340 return -1;
2341 }
2342
2343 /* mark is -1 if group is undefined */
2344 return self->mark[index*2];
2345 }
2346
2347 /*[clinic input]
2348 _sre.SRE_Match.end -> Py_ssize_t
2349
2350 group: object(c_default="NULL") = 0
2351 /
2352
2353 Return index of the end of the substring matched by group.
2354 [clinic start generated code]*/
2355
2356 static Py_ssize_t
2357 _sre_SRE_Match_end_impl(MatchObject *self, PyObject *group)
2358 /*[clinic end generated code: output=f4240b09911f7692 input=1b799560c7f3d7e6]*/
2359 {
2360 Py_ssize_t index = match_getindex(self, group);
2361
2362 if (index < 0) {
2363 return -1;
2364 }
2365
2366 /* mark is -1 if group is undefined */
2367 return self->mark[index*2+1];
2368 }
2369
2370 LOCAL(PyObject*)
2371 _pair(Py_ssize_t i1, Py_ssize_t i2)
2372 {
2373 PyObject* pair;
2374 PyObject* item;
2375
2376 pair = PyTuple_New(2);
2377 if (!pair)
2378 return NULL;
2379
2380 item = PyLong_FromSsize_t(i1);
2381 if (!item)
2382 goto error;
2383 PyTuple_SET_ITEM(pair, 0, item);
2384
2385 item = PyLong_FromSsize_t(i2);
2386 if (!item)
2387 goto error;
2388 PyTuple_SET_ITEM(pair, 1, item);
2389
2390 return pair;
2391
2392 error:
2393 Py_DECREF(pair);
2394 return NULL;
2395 }
2396
2397 /*[clinic input]
2398 _sre.SRE_Match.span
2399
2400 group: object(c_default="NULL") = 0
2401 /
2402
2403 For match object m, return the 2-tuple (m.start(group), m.end(group)).
2404 [clinic start generated code]*/
2405
2406 static PyObject *
2407 _sre_SRE_Match_span_impl(MatchObject *self, PyObject *group)
2408 /*[clinic end generated code: output=f02ae40594d14fe6 input=8fa6014e982d71d4]*/
2409 {
2410 Py_ssize_t index = match_getindex(self, group);
2411
2412 if (index < 0) {
2413 return NULL;
2414 }
2415
2416 /* marks are -1 if group is undefined */
2417 return _pair(self->mark[index*2], self->mark[index*2+1]);
2418 }
2419
2420 static PyObject*
2421 match_regs(MatchObject* self)
2422 {
2423 PyObject* regs;
2424 PyObject* item;
2425 Py_ssize_t index;
2426
2427 regs = PyTuple_New(self->groups);
2428 if (!regs)
2429 return NULL;
2430
2431 for (index = 0; index < self->groups; index++) {
2432 item = _pair(self->mark[index*2], self->mark[index*2+1]);
2433 if (!item) {
2434 Py_DECREF(regs);
2435 return NULL;
2436 }
2437 PyTuple_SET_ITEM(regs, index, item);
2438 }
2439
2440 self->regs = Py_NewRef(regs);
2441
2442 return regs;
2443 }
2444
2445 /*[clinic input]
2446 _sre.SRE_Match.__copy__
2447
2448 [clinic start generated code]*/
2449
2450 static PyObject *
2451 _sre_SRE_Match___copy___impl(MatchObject *self)
2452 /*[clinic end generated code: output=a779c5fc8b5b4eb4 input=3bb4d30b6baddb5b]*/
2453 {
2454 return Py_NewRef(self);
2455 }
2456
2457 /*[clinic input]
2458 _sre.SRE_Match.__deepcopy__
2459
2460 memo: object
2461 /
2462
2463 [clinic start generated code]*/
2464
2465 static PyObject *
2466 _sre_SRE_Match___deepcopy__(MatchObject *self, PyObject *memo)
2467 /*[clinic end generated code: output=ba7cb46d655e4ee2 input=779d12a31c2c325e]*/
2468 {
2469 return Py_NewRef(self);
2470 }
2471
2472 PyDoc_STRVAR(match_doc,
2473 "The result of re.match() and re.search().\n\
2474 Match objects always have a boolean value of True.");
2475
2476 PyDoc_STRVAR(match_group_doc,
2477 "group([group1, ...]) -> str or tuple.\n\
2478 Return subgroup(s) of the match by indices or names.\n\
2479 For 0 returns the entire match.");
2480
2481 static PyObject *
2482 match_lastindex_get(MatchObject *self, void *Py_UNUSED(ignored))
2483 {
2484 if (self->lastindex >= 0)
2485 return PyLong_FromSsize_t(self->lastindex);
2486 Py_RETURN_NONE;
2487 }
2488
2489 static PyObject *
2490 match_lastgroup_get(MatchObject *self, void *Py_UNUSED(ignored))
2491 {
2492 if (self->pattern->indexgroup &&
2493 self->lastindex >= 0 &&
2494 self->lastindex < PyTuple_GET_SIZE(self->pattern->indexgroup))
2495 {
2496 PyObject *result = PyTuple_GET_ITEM(self->pattern->indexgroup,
2497 self->lastindex);
2498 return Py_NewRef(result);
2499 }
2500 Py_RETURN_NONE;
2501 }
2502
2503 static PyObject *
2504 match_regs_get(MatchObject *self, void *Py_UNUSED(ignored))
2505 {
2506 if (self->regs) {
2507 return Py_NewRef(self->regs);
2508 } else
2509 return match_regs(self);
2510 }
2511
2512 static PyObject *
2513 match_repr(MatchObject *self)
2514 {
2515 PyObject *result;
2516 PyObject *group0 = match_getslice_by_index(self, 0, Py_None);
2517 if (group0 == NULL)
2518 return NULL;
2519 result = PyUnicode_FromFormat(
2520 "<%s object; span=(%zd, %zd), match=%.50R>",
2521 Py_TYPE(self)->tp_name,
2522 self->mark[0], self->mark[1], group0);
2523 Py_DECREF(group0);
2524 return result;
2525 }
2526
2527
2528 static PyObject*
2529 pattern_new_match(_sremodulestate* module_state,
2530 PatternObject* pattern,
2531 SRE_STATE* state,
2532 Py_ssize_t status)
2533 {
2534 /* create match object (from state object) */
2535
2536 MatchObject* match;
2537 Py_ssize_t i, j;
2538 char* base;
2539 int n;
2540
2541 if (status > 0) {
2542
2543 /* create match object (with room for extra group marks) */
2544 /* coverity[ampersand_in_size] */
2545 match = PyObject_GC_NewVar(MatchObject,
2546 module_state->Match_Type,
2547 2*(pattern->groups+1));
2548 if (!match)
2549 return NULL;
2550
2551 match->pattern = (PatternObject*)Py_NewRef(pattern);
2552
2553 match->string = Py_NewRef(state->string);
2554
2555 match->regs = NULL;
2556 match->groups = pattern->groups+1;
2557
2558 /* fill in group slices */
2559
2560 base = (char*) state->beginning;
2561 n = state->charsize;
2562
2563 match->mark[0] = ((char*) state->start - base) / n;
2564 match->mark[1] = ((char*) state->ptr - base) / n;
2565
2566 for (i = j = 0; i < pattern->groups; i++, j+=2)
2567 if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) {
2568 match->mark[j+2] = ((char*) state->mark[j] - base) / n;
2569 match->mark[j+3] = ((char*) state->mark[j+1] - base) / n;
2570
2571 /* check wrong span */
2572 if (match->mark[j+2] > match->mark[j+3]) {
2573 PyErr_SetString(PyExc_SystemError,
2574 "The span of capturing group is wrong,"
2575 " please report a bug for the re module.");
2576 Py_DECREF(match);
2577 return NULL;
2578 }
2579 } else
2580 match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
2581
2582 match->pos = state->pos;
2583 match->endpos = state->endpos;
2584
2585 match->lastindex = state->lastindex;
2586
2587 PyObject_GC_Track(match);
2588 return (PyObject*) match;
2589
2590 } else if (status == 0) {
2591
2592 /* no match */
2593 Py_RETURN_NONE;
2594
2595 }
2596
2597 /* internal error */
2598 pattern_error(status);
2599 return NULL;
2600 }
2601
2602
2603 /* -------------------------------------------------------------------- */
2604 /* scanner methods (experimental) */
2605
2606 static int
2607 scanner_traverse(ScannerObject *self, visitproc visit, void *arg)
2608 {
2609 Py_VISIT(Py_TYPE(self));
2610 Py_VISIT(self->pattern);
2611 return 0;
2612 }
2613
2614 static int
2615 scanner_clear(ScannerObject *self)
2616 {
2617 Py_CLEAR(self->pattern);
2618 return 0;
2619 }
2620
2621 static void
2622 scanner_dealloc(ScannerObject* self)
2623 {
2624 PyTypeObject *tp = Py_TYPE(self);
2625
2626 PyObject_GC_UnTrack(self);
2627 state_fini(&self->state);
2628 (void)scanner_clear(self);
2629 tp->tp_free(self);
2630 Py_DECREF(tp);
2631 }
2632
2633 static int
2634 scanner_begin(ScannerObject* self)
2635 {
2636 if (self->executing) {
2637 PyErr_SetString(PyExc_ValueError,
2638 "regular expression scanner already executing");
2639 return 0;
2640 }
2641 self->executing = 1;
2642 return 1;
2643 }
2644
2645 static void
2646 scanner_end(ScannerObject* self)
2647 {
2648 assert(self->executing);
2649 self->executing = 0;
2650 }
2651
2652 /*[clinic input]
2653 _sre.SRE_Scanner.match
2654
2655 cls: defining_class
2656 /
2657
2658 [clinic start generated code]*/
2659
2660 static PyObject *
2661 _sre_SRE_Scanner_match_impl(ScannerObject *self, PyTypeObject *cls)
2662 /*[clinic end generated code: output=6e22c149dc0f0325 input=b5146e1f30278cb7]*/
2663 {
2664 _sremodulestate *module_state = get_sre_module_state_by_class(cls);
2665 SRE_STATE* state = &self->state;
2666 PyObject* match;
2667 Py_ssize_t status;
2668
2669 if (!scanner_begin(self)) {
2670 return NULL;
2671 }
2672 if (state->start == NULL) {
2673 scanner_end(self);
2674 Py_RETURN_NONE;
2675 }
2676
2677 state_reset(state);
2678
2679 state->ptr = state->start;
2680
2681 status = sre_match(state, PatternObject_GetCode(self->pattern));
2682 if (PyErr_Occurred()) {
2683 scanner_end(self);
2684 return NULL;
2685 }
2686
2687 match = pattern_new_match(module_state, (PatternObject*) self->pattern,
2688 state, status);
2689
2690 if (status == 0)
2691 state->start = NULL;
2692 else {
2693 state->must_advance = (state->ptr == state->start);
2694 state->start = state->ptr;
2695 }
2696
2697 scanner_end(self);
2698 return match;
2699 }
2700
2701
2702 /*[clinic input]
2703 _sre.SRE_Scanner.search
2704
2705 cls: defining_class
2706 /
2707
2708 [clinic start generated code]*/
2709
2710 static PyObject *
2711 _sre_SRE_Scanner_search_impl(ScannerObject *self, PyTypeObject *cls)
2712 /*[clinic end generated code: output=23e8fc78013f9161 input=056c2d37171d0bf2]*/
2713 {
2714 _sremodulestate *module_state = get_sre_module_state_by_class(cls);
2715 SRE_STATE* state = &self->state;
2716 PyObject* match;
2717 Py_ssize_t status;
2718
2719 if (!scanner_begin(self)) {
2720 return NULL;
2721 }
2722 if (state->start == NULL) {
2723 scanner_end(self);
2724 Py_RETURN_NONE;
2725 }
2726
2727 state_reset(state);
2728
2729 state->ptr = state->start;
2730
2731 status = sre_search(state, PatternObject_GetCode(self->pattern));
2732 if (PyErr_Occurred()) {
2733 scanner_end(self);
2734 return NULL;
2735 }
2736
2737 match = pattern_new_match(module_state, (PatternObject*) self->pattern,
2738 state, status);
2739
2740 if (status == 0)
2741 state->start = NULL;
2742 else {
2743 state->must_advance = (state->ptr == state->start);
2744 state->start = state->ptr;
2745 }
2746
2747 scanner_end(self);
2748 return match;
2749 }
2750
2751 static PyObject *
2752 pattern_scanner(_sremodulestate *module_state,
2753 PatternObject *self,
2754 PyObject *string,
2755 Py_ssize_t pos,
2756 Py_ssize_t endpos)
2757 {
2758 ScannerObject* scanner;
2759
2760 /* create scanner object */
2761 scanner = PyObject_GC_New(ScannerObject, module_state->Scanner_Type);
2762 if (!scanner)
2763 return NULL;
2764 scanner->pattern = NULL;
2765 scanner->executing = 0;
2766
2767 /* create search state object */
2768 if (!state_init(&scanner->state, self, string, pos, endpos)) {
2769 Py_DECREF(scanner);
2770 return NULL;
2771 }
2772
2773 scanner->pattern = Py_NewRef(self);
2774
2775 PyObject_GC_Track(scanner);
2776 return (PyObject*) scanner;
2777 }
2778
2779 /* -------------------------------------------------------------------- */
2780 /* template methods */
2781
2782 static int
2783 template_traverse(TemplateObject *self, visitproc visit, void *arg)
2784 {
2785 Py_VISIT(Py_TYPE(self));
2786 Py_VISIT(self->literal);
2787 for (Py_ssize_t i = 0, n = Py_SIZE(self); i < n; i++) {
2788 Py_VISIT(self->items[i].literal);
2789 }
2790 return 0;
2791 }
2792
2793 static int
2794 template_clear(TemplateObject *self)
2795 {
2796 Py_CLEAR(self->literal);
2797 for (Py_ssize_t i = 0, n = Py_SIZE(self); i < n; i++) {
2798 Py_CLEAR(self->items[i].literal);
2799 }
2800 return 0;
2801 }
2802
2803 static void
2804 template_dealloc(TemplateObject *self)
2805 {
2806 PyTypeObject *tp = Py_TYPE(self);
2807
2808 PyObject_GC_UnTrack(self);
2809 (void)template_clear(self);
2810 tp->tp_free(self);
2811 Py_DECREF(tp);
2812 }
2813
2814 static PyObject *
2815 expand_template(TemplateObject *self, MatchObject *match)
2816 {
2817 if (Py_SIZE(self) == 0) {
2818 return Py_NewRef(self->literal);
2819 }
2820
2821 PyObject *result = NULL;
2822 Py_ssize_t count = 0; // the number of non-empty chunks
2823 /* For small number of strings use a buffer allocated on the stack,
2824 * otherwise use a list object. */
2825 PyObject *buffer[10];
2826 PyObject **out = buffer;
2827 PyObject *list = NULL;
2828 if (self->chunks > (int)Py_ARRAY_LENGTH(buffer) ||
2829 !PyUnicode_Check(self->literal))
2830 {
2831 list = PyList_New(self->chunks);
2832 if (!list) {
2833 return NULL;
2834 }
2835 out = &PyList_GET_ITEM(list, 0);
2836 }
2837
2838 out[count++] = Py_NewRef(self->literal);
2839 for (Py_ssize_t i = 0; i < Py_SIZE(self); i++) {
2840 Py_ssize_t index = self->items[i].index;
2841 if (index >= match->groups) {
2842 PyErr_SetString(PyExc_IndexError, "no such group");
2843 goto cleanup;
2844 }
2845 PyObject *item = match_getslice_by_index(match, index, Py_None);
2846 if (item == NULL) {
2847 goto cleanup;
2848 }
2849 if (item != Py_None) {
2850 out[count++] = Py_NewRef(item);
2851 }
2852 Py_DECREF(item);
2853
2854 PyObject *literal = self->items[i].literal;
2855 if (literal != NULL) {
2856 out[count++] = Py_NewRef(literal);
2857 }
2858 }
2859
2860 if (PyUnicode_Check(self->literal)) {
2861 result = _PyUnicode_JoinArray(&_Py_STR(empty), out, count);
2862 }
2863 else {
2864 Py_SET_SIZE(list, count);
2865 result = _PyBytes_Join((PyObject *)&_Py_SINGLETON(bytes_empty), list);
2866 }
2867
2868 cleanup:
2869 if (list) {
2870 Py_DECREF(list);
2871 }
2872 else {
2873 for (Py_ssize_t i = 0; i < count; i++) {
2874 Py_DECREF(out[i]);
2875 }
2876 }
2877 return result;
2878 }
2879
2880
2881 static Py_hash_t
2882 pattern_hash(PatternObject *self)
2883 {
2884 Py_hash_t hash, hash2;
2885
2886 hash = PyObject_Hash(self->pattern);
2887 if (hash == -1) {
2888 return -1;
2889 }
2890
2891 hash2 = _Py_HashBytes(self->code, sizeof(self->code[0]) * self->codesize);
2892 hash ^= hash2;
2893
2894 hash ^= self->flags;
2895 hash ^= self->isbytes;
2896 hash ^= self->codesize;
2897
2898 if (hash == -1) {
2899 hash = -2;
2900 }
2901 return hash;
2902 }
2903
2904 static PyObject*
2905 pattern_richcompare(PyObject *lefto, PyObject *righto, int op)
2906 {
2907 PyTypeObject *tp = Py_TYPE(lefto);
2908 _sremodulestate *module_state = get_sre_module_state_by_class(tp);
2909 PatternObject *left, *right;
2910 int cmp;
2911
2912 if (op != Py_EQ && op != Py_NE) {
2913 Py_RETURN_NOTIMPLEMENTED;
2914 }
2915
2916 if (!Py_IS_TYPE(righto, module_state->Pattern_Type))
2917 {
2918 Py_RETURN_NOTIMPLEMENTED;
2919 }
2920
2921 if (lefto == righto) {
2922 /* a pattern is equal to itself */
2923 return PyBool_FromLong(op == Py_EQ);
2924 }
2925
2926 left = (PatternObject *)lefto;
2927 right = (PatternObject *)righto;
2928
2929 cmp = (left->flags == right->flags
2930 && left->isbytes == right->isbytes
2931 && left->codesize == right->codesize);
2932 if (cmp) {
2933 /* Compare the code and the pattern because the same pattern can
2934 produce different codes depending on the locale used to compile the
2935 pattern when the re.LOCALE flag is used. Don't compare groups,
2936 indexgroup nor groupindex: they are derivated from the pattern. */
2937 cmp = (memcmp(left->code, right->code,
2938 sizeof(left->code[0]) * left->codesize) == 0);
2939 }
2940 if (cmp) {
2941 cmp = PyObject_RichCompareBool(left->pattern, right->pattern,
2942 Py_EQ);
2943 if (cmp < 0) {
2944 return NULL;
2945 }
2946 }
2947 if (op == Py_NE) {
2948 cmp = !cmp;
2949 }
2950 return PyBool_FromLong(cmp);
2951 }
2952
2953 #include "clinic/sre.c.h"
2954
2955 static PyMethodDef pattern_methods[] = {
2956 _SRE_SRE_PATTERN_MATCH_METHODDEF
2957 _SRE_SRE_PATTERN_FULLMATCH_METHODDEF
2958 _SRE_SRE_PATTERN_SEARCH_METHODDEF
2959 _SRE_SRE_PATTERN_SUB_METHODDEF
2960 _SRE_SRE_PATTERN_SUBN_METHODDEF
2961 _SRE_SRE_PATTERN_FINDALL_METHODDEF
2962 _SRE_SRE_PATTERN_SPLIT_METHODDEF
2963 _SRE_SRE_PATTERN_FINDITER_METHODDEF
2964 _SRE_SRE_PATTERN_SCANNER_METHODDEF
2965 _SRE_SRE_PATTERN___COPY___METHODDEF
2966 _SRE_SRE_PATTERN___DEEPCOPY___METHODDEF
2967 {"__class_getitem__", Py_GenericAlias, METH_O|METH_CLASS,
2968 PyDoc_STR("See PEP 585")},
2969 {NULL, NULL}
2970 };
2971
2972 static PyGetSetDef pattern_getset[] = {
2973 {"groupindex", (getter)pattern_groupindex, (setter)NULL,
2974 "A dictionary mapping group names to group numbers."},
2975 {NULL} /* Sentinel */
2976 };
2977
2978 #define PAT_OFF(x) offsetof(PatternObject, x)
2979 static PyMemberDef pattern_members[] = {
2980 {"pattern", T_OBJECT, PAT_OFF(pattern), READONLY,
2981 "The pattern string from which the RE object was compiled."},
2982 {"flags", T_INT, PAT_OFF(flags), READONLY,
2983 "The regex matching flags."},
2984 {"groups", T_PYSSIZET, PAT_OFF(groups), READONLY,
2985 "The number of capturing groups in the pattern."},
2986 {"__weaklistoffset__", T_PYSSIZET, offsetof(PatternObject, weakreflist), READONLY},
2987 {NULL} /* Sentinel */
2988 };
2989
2990 static PyType_Slot pattern_slots[] = {
2991 {Py_tp_dealloc, (destructor)pattern_dealloc},
2992 {Py_tp_repr, (reprfunc)pattern_repr},
2993 {Py_tp_hash, (hashfunc)pattern_hash},
2994 {Py_tp_doc, (void *)pattern_doc},
2995 {Py_tp_richcompare, pattern_richcompare},
2996 {Py_tp_methods, pattern_methods},
2997 {Py_tp_members, pattern_members},
2998 {Py_tp_getset, pattern_getset},
2999 {Py_tp_traverse, pattern_traverse},
3000 {Py_tp_clear, pattern_clear},
3001 {0, NULL},
3002 };
3003
3004 static PyType_Spec pattern_spec = {
3005 .name = "re.Pattern",
3006 .basicsize = sizeof(PatternObject),
3007 .itemsize = sizeof(SRE_CODE),
3008 .flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_IMMUTABLETYPE |
3009 Py_TPFLAGS_DISALLOW_INSTANTIATION | Py_TPFLAGS_HAVE_GC),
3010 .slots = pattern_slots,
3011 };
3012
3013 static PyMethodDef match_methods[] = {
3014 {"group", (PyCFunction) match_group, METH_VARARGS, match_group_doc},
3015 _SRE_SRE_MATCH_START_METHODDEF
3016 _SRE_SRE_MATCH_END_METHODDEF
3017 _SRE_SRE_MATCH_SPAN_METHODDEF
3018 _SRE_SRE_MATCH_GROUPS_METHODDEF
3019 _SRE_SRE_MATCH_GROUPDICT_METHODDEF
3020 _SRE_SRE_MATCH_EXPAND_METHODDEF
3021 _SRE_SRE_MATCH___COPY___METHODDEF
3022 _SRE_SRE_MATCH___DEEPCOPY___METHODDEF
3023 {"__class_getitem__", Py_GenericAlias, METH_O|METH_CLASS,
3024 PyDoc_STR("See PEP 585")},
3025 {NULL, NULL}
3026 };
3027
3028 static PyGetSetDef match_getset[] = {
3029 {"lastindex", (getter)match_lastindex_get, (setter)NULL,
3030 "The integer index of the last matched capturing group."},
3031 {"lastgroup", (getter)match_lastgroup_get, (setter)NULL,
3032 "The name of the last matched capturing group."},
3033 {"regs", (getter)match_regs_get, (setter)NULL},
3034 {NULL}
3035 };
3036
3037 #define MATCH_OFF(x) offsetof(MatchObject, x)
3038 static PyMemberDef match_members[] = {
3039 {"string", T_OBJECT, MATCH_OFF(string), READONLY,
3040 "The string passed to match() or search()."},
3041 {"re", T_OBJECT, MATCH_OFF(pattern), READONLY,
3042 "The regular expression object."},
3043 {"pos", T_PYSSIZET, MATCH_OFF(pos), READONLY,
3044 "The index into the string at which the RE engine started looking for a match."},
3045 {"endpos", T_PYSSIZET, MATCH_OFF(endpos), READONLY,
3046 "The index into the string beyond which the RE engine will not go."},
3047 {NULL}
3048 };
3049
/* FIXME: implement setattr("string", None) as a special case (to
   detach the associated string, if any) */
3052 static PyType_Slot match_slots[] = {
3053 {Py_tp_dealloc, match_dealloc},
3054 {Py_tp_repr, match_repr},
3055 {Py_tp_doc, (void *)match_doc},
3056 {Py_tp_methods, match_methods},
3057 {Py_tp_members, match_members},
3058 {Py_tp_getset, match_getset},
3059 {Py_tp_traverse, match_traverse},
3060 {Py_tp_clear, match_clear},
3061
3062 /* As mapping.
3063 *
3064 * Match objects do not support length or assignment, but do support
3065 * __getitem__.
3066 */
3067 {Py_mp_subscript, match_getitem},
3068
3069 {0, NULL},
3070 };
3071
3072 static PyType_Spec match_spec = {
3073 .name = "re.Match",
3074 .basicsize = sizeof(MatchObject),
3075 .itemsize = sizeof(Py_ssize_t),
3076 .flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_IMMUTABLETYPE |
3077 Py_TPFLAGS_DISALLOW_INSTANTIATION | Py_TPFLAGS_HAVE_GC),
3078 .slots = match_slots,
3079 };
3080
3081 static PyMethodDef scanner_methods[] = {
3082 _SRE_SRE_SCANNER_MATCH_METHODDEF
3083 _SRE_SRE_SCANNER_SEARCH_METHODDEF
3084 {NULL, NULL}
3085 };
3086
3087 #define SCAN_OFF(x) offsetof(ScannerObject, x)
3088 static PyMemberDef scanner_members[] = {
3089 {"pattern", T_OBJECT, SCAN_OFF(pattern), READONLY},
3090 {NULL} /* Sentinel */
3091 };
3092
3093 static PyType_Slot scanner_slots[] = {
3094 {Py_tp_dealloc, scanner_dealloc},
3095 {Py_tp_methods, scanner_methods},
3096 {Py_tp_members, scanner_members},
3097 {Py_tp_traverse, scanner_traverse},
3098 {Py_tp_clear, scanner_clear},
3099 {0, NULL},
3100 };
3101
3102 static PyType_Spec scanner_spec = {
3103 .name = "_sre.SRE_Scanner",
3104 .basicsize = sizeof(ScannerObject),
3105 .flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_IMMUTABLETYPE |
3106 Py_TPFLAGS_DISALLOW_INSTANTIATION | Py_TPFLAGS_HAVE_GC),
3107 .slots = scanner_slots,
3108 };
3109
3110 static PyType_Slot template_slots[] = {
3111 {Py_tp_dealloc, template_dealloc},
3112 {Py_tp_traverse, template_traverse},
3113 {Py_tp_clear, template_clear},
3114 {0, NULL},
3115 };
3116
3117 static PyType_Spec template_spec = {
3118 .name = "_sre.SRE_Template",
3119 .basicsize = sizeof(TemplateObject),
3120 .itemsize = sizeof(((TemplateObject *)0)->items[0]),
3121 .flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_IMMUTABLETYPE |
3122 Py_TPFLAGS_DISALLOW_INSTANTIATION | Py_TPFLAGS_HAVE_GC),
3123 .slots = template_slots,
3124 };
3125
3126 static PyMethodDef _functions[] = {
3127 _SRE_COMPILE_METHODDEF
3128 _SRE_TEMPLATE_METHODDEF
3129 _SRE_GETCODESIZE_METHODDEF
3130 _SRE_ASCII_ISCASED_METHODDEF
3131 _SRE_UNICODE_ISCASED_METHODDEF
3132 _SRE_ASCII_TOLOWER_METHODDEF
3133 _SRE_UNICODE_TOLOWER_METHODDEF
3134 {NULL, NULL}
3135 };
3136
3137 static int
3138 sre_traverse(PyObject *module, visitproc visit, void *arg)
3139 {
3140 _sremodulestate *state = get_sre_module_state(module);
3141
3142 Py_VISIT(state->Pattern_Type);
3143 Py_VISIT(state->Match_Type);
3144 Py_VISIT(state->Scanner_Type);
3145 Py_VISIT(state->Template_Type);
3146 Py_VISIT(state->compile_template);
3147
3148 return 0;
3149 }
3150
3151 static int
3152 sre_clear(PyObject *module)
3153 {
3154 _sremodulestate *state = get_sre_module_state(module);
3155
3156 Py_CLEAR(state->Pattern_Type);
3157 Py_CLEAR(state->Match_Type);
3158 Py_CLEAR(state->Scanner_Type);
3159 Py_CLEAR(state->Template_Type);
3160 Py_CLEAR(state->compile_template);
3161
3162 return 0;
3163 }
3164
3165 static void
3166 sre_free(void *module)
3167 {
3168 sre_clear((PyObject *)module);
3169 }
3170
/* Create a heap type from `spec`, bound to module `m`, and store it in
   `type`.  On failure, jumps to the `error` label that the enclosing
   function must provide. */
#define CREATE_TYPE(m, type, spec)                                       \
    do {                                                                 \
        (type) = (PyTypeObject *)PyType_FromModuleAndSpec((m), (spec),   \
                                                          NULL);         \
        if ((type) == NULL) {                                            \
            goto error;                                                  \
        }                                                                \
    } while (0)
3178
/* Add `value`, converted with PyLong_FromUnsignedLong(), to `module`
   under `name`.  On failure, jumps to the `error` label that the
   enclosing function must provide. */
#define ADD_ULONG_CONSTANT(module, name, value)                          \
    do {                                                                 \
        PyObject *ulong_obj = PyLong_FromUnsignedLong(value);            \
        if (ulong_obj == NULL) {                                         \
            goto error;                                                  \
        }                                                                \
        int add_status = PyModule_AddObjectRef((module), (name),         \
                                               ulong_obj);               \
        /* AddObjectRef does not steal our reference: drop it. */        \
        Py_DECREF(ulong_obj);                                            \
        if (add_status < 0) {                                            \
            goto error;                                                  \
        }                                                                \
    } while (0)
3190
3191 static int
3192 sre_exec(PyObject *m)
3193 {
3194 _sremodulestate *state;
3195
3196 /* Create heap types */
3197 state = get_sre_module_state(m);
3198 CREATE_TYPE(m, state->Pattern_Type, &pattern_spec);
3199 CREATE_TYPE(m, state->Match_Type, &match_spec);
3200 CREATE_TYPE(m, state->Scanner_Type, &scanner_spec);
3201 CREATE_TYPE(m, state->Template_Type, &template_spec);
3202
3203 if (PyModule_AddIntConstant(m, "MAGIC", SRE_MAGIC) < 0) {
3204 goto error;
3205 }
3206
3207 if (PyModule_AddIntConstant(m, "CODESIZE", sizeof(SRE_CODE)) < 0) {
3208 goto error;
3209 }
3210
3211 ADD_ULONG_CONSTANT(m, "MAXREPEAT", SRE_MAXREPEAT);
3212 ADD_ULONG_CONSTANT(m, "MAXGROUPS", SRE_MAXGROUPS);
3213
3214 if (PyModule_AddStringConstant(m, "copyright", copyright) < 0) {
3215 goto error;
3216 }
3217
3218 return 0;
3219
3220 error:
3221 return -1;
3222 }
3223
3224 static PyModuleDef_Slot sre_slots[] = {
3225 {Py_mod_exec, sre_exec},
3226 {Py_mod_multiple_interpreters, Py_MOD_PER_INTERPRETER_GIL_SUPPORTED},
3227 {0, NULL},
3228 };
3229
3230 static struct PyModuleDef sremodule = {
3231 .m_base = PyModuleDef_HEAD_INIT,
3232 .m_name = "_sre",
3233 .m_size = sizeof(_sremodulestate),
3234 .m_methods = _functions,
3235 .m_slots = sre_slots,
3236 .m_traverse = sre_traverse,
3237 .m_free = sre_free,
3238 .m_clear = sre_clear,
3239 };
3240
3241 PyMODINIT_FUNC
3242 PyInit__sre(void)
3243 {
3244 return PyModuleDef_Init(&sremodule);
3245 }
3246
3247 /* vim:ts=4:sw=4:et
3248 */