1 /*
2 * Secret Labs' Regular Expression Engine
3 *
4 * regular expression matching engine
5 *
6 * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
7 *
8 * See the sre.c file for information on usage and redistribution.
9 */
10
11 /* String matching engine */
12
13 /* This file is included three times, with different character settings */
14
15 LOCAL(int)
16 SRE(at)(SRE_STATE* state, const SRE_CHAR* ptr, SRE_CODE at)
17 {
18 /* check if pointer is at given position */
19
20 Py_ssize_t thisp, thatp;
21
22 switch (at) {
23
24 case SRE_AT_BEGINNING:
25 case SRE_AT_BEGINNING_STRING:
26 return ((void*) ptr == state->beginning);
27
28 case SRE_AT_BEGINNING_LINE:
29 return ((void*) ptr == state->beginning ||
30 SRE_IS_LINEBREAK((int) ptr[-1]));
31
32 case SRE_AT_END:
33 return (((SRE_CHAR *)state->end - ptr == 1 &&
34 SRE_IS_LINEBREAK((int) ptr[0])) ||
35 ((void*) ptr == state->end));
36
37 case SRE_AT_END_LINE:
38 return ((void*) ptr == state->end ||
39 SRE_IS_LINEBREAK((int) ptr[0]));
40
41 case SRE_AT_END_STRING:
42 return ((void*) ptr == state->end);
43
44 case SRE_AT_BOUNDARY:
45 if (state->beginning == state->end)
46 return 0;
47 thatp = ((void*) ptr > state->beginning) ?
48 SRE_IS_WORD((int) ptr[-1]) : 0;
49 thisp = ((void*) ptr < state->end) ?
50 SRE_IS_WORD((int) ptr[0]) : 0;
51 return thisp != thatp;
52
53 case SRE_AT_NON_BOUNDARY:
54 if (state->beginning == state->end)
55 return 0;
56 thatp = ((void*) ptr > state->beginning) ?
57 SRE_IS_WORD((int) ptr[-1]) : 0;
58 thisp = ((void*) ptr < state->end) ?
59 SRE_IS_WORD((int) ptr[0]) : 0;
60 return thisp == thatp;
61
62 case SRE_AT_LOC_BOUNDARY:
63 if (state->beginning == state->end)
64 return 0;
65 thatp = ((void*) ptr > state->beginning) ?
66 SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
67 thisp = ((void*) ptr < state->end) ?
68 SRE_LOC_IS_WORD((int) ptr[0]) : 0;
69 return thisp != thatp;
70
71 case SRE_AT_LOC_NON_BOUNDARY:
72 if (state->beginning == state->end)
73 return 0;
74 thatp = ((void*) ptr > state->beginning) ?
75 SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
76 thisp = ((void*) ptr < state->end) ?
77 SRE_LOC_IS_WORD((int) ptr[0]) : 0;
78 return thisp == thatp;
79
80 case SRE_AT_UNI_BOUNDARY:
81 if (state->beginning == state->end)
82 return 0;
83 thatp = ((void*) ptr > state->beginning) ?
84 SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
85 thisp = ((void*) ptr < state->end) ?
86 SRE_UNI_IS_WORD((int) ptr[0]) : 0;
87 return thisp != thatp;
88
89 case SRE_AT_UNI_NON_BOUNDARY:
90 if (state->beginning == state->end)
91 return 0;
92 thatp = ((void*) ptr > state->beginning) ?
93 SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
94 thisp = ((void*) ptr < state->end) ?
95 SRE_UNI_IS_WORD((int) ptr[0]) : 0;
96 return thisp == thatp;
97
98 }
99
100 return 0;
101 }
102
103 LOCAL(int)
104 SRE(charset)(SRE_STATE* state, const SRE_CODE* set, SRE_CODE ch)
105 {
106 /* check if character is a member of the given set */
107
108 int ok = 1;
109
110 for (;;) {
111 switch (*set++) {
112
113 case SRE_OP_FAILURE:
114 return !ok;
115
116 case SRE_OP_LITERAL:
117 /* <LITERAL> <code> */
118 if (ch == set[0])
119 return ok;
120 set++;
121 break;
122
123 case SRE_OP_CATEGORY:
124 /* <CATEGORY> <code> */
125 if (sre_category(set[0], (int) ch))
126 return ok;
127 set++;
128 break;
129
130 case SRE_OP_CHARSET:
131 /* <CHARSET> <bitmap> */
132 if (ch < 256 &&
133 (set[ch/SRE_CODE_BITS] & (1u << (ch & (SRE_CODE_BITS-1)))))
134 return ok;
135 set += 256/SRE_CODE_BITS;
136 break;
137
138 case SRE_OP_RANGE:
139 /* <RANGE> <lower> <upper> */
140 if (set[0] <= ch && ch <= set[1])
141 return ok;
142 set += 2;
143 break;
144
145 case SRE_OP_RANGE_UNI_IGNORE:
146 /* <RANGE_UNI_IGNORE> <lower> <upper> */
147 {
148 SRE_CODE uch;
149 /* ch is already lower cased */
150 if (set[0] <= ch && ch <= set[1])
151 return ok;
152 uch = sre_upper_unicode(ch);
153 if (set[0] <= uch && uch <= set[1])
154 return ok;
155 set += 2;
156 break;
157 }
158
159 case SRE_OP_NEGATE:
160 ok = !ok;
161 break;
162
163 case SRE_OP_BIGCHARSET:
164 /* <BIGCHARSET> <blockcount> <256 blockindices> <blocks> */
165 {
166 Py_ssize_t count, block;
167 count = *(set++);
168
169 if (ch < 0x10000u)
170 block = ((unsigned char*)set)[ch >> 8];
171 else
172 block = -1;
173 set += 256/sizeof(SRE_CODE);
174 if (block >=0 &&
175 (set[(block * 256 + (ch & 255))/SRE_CODE_BITS] &
176 (1u << (ch & (SRE_CODE_BITS-1)))))
177 return ok;
178 set += count * (256/SRE_CODE_BITS);
179 break;
180 }
181
182 default:
183 /* internal error -- there's not much we can do about it
184 here, so let's just pretend it didn't match... */
185 return 0;
186 }
187 }
188 }
189
190 LOCAL(int)
191 SRE(charset_loc_ignore)(SRE_STATE* state, const SRE_CODE* set, SRE_CODE ch)
192 {
193 SRE_CODE lo, up;
194 lo = sre_lower_locale(ch);
195 if (SRE(charset)(state, set, lo))
196 return 1;
197
198 up = sre_upper_locale(ch);
199 return up != lo && SRE(charset)(state, set, up);
200 }
201
202 LOCAL(Py_ssize_t) SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel);
203
204 LOCAL(Py_ssize_t)
205 SRE(count)(SRE_STATE* state, const SRE_CODE* pattern, Py_ssize_t maxcount)
206 {
207 SRE_CODE chr;
208 SRE_CHAR c;
209 const SRE_CHAR* ptr = (const SRE_CHAR *)state->ptr;
210 const SRE_CHAR* end = (const SRE_CHAR *)state->end;
211 Py_ssize_t i;
212
213 /* adjust end */
214 if (maxcount < end - ptr && maxcount != SRE_MAXREPEAT)
215 end = ptr + maxcount;
216
217 switch (pattern[0]) {
218
219 case SRE_OP_IN:
220 /* repeated set */
221 TRACE(("|%p|%p|COUNT IN\n", pattern, ptr));
222 while (ptr < end && SRE(charset)(state, pattern + 2, *ptr))
223 ptr++;
224 break;
225
226 case SRE_OP_ANY:
227 /* repeated dot wildcard. */
228 TRACE(("|%p|%p|COUNT ANY\n", pattern, ptr));
229 while (ptr < end && !SRE_IS_LINEBREAK(*ptr))
230 ptr++;
231 break;
232
233 case SRE_OP_ANY_ALL:
234 /* repeated dot wildcard. skip to the end of the target
235 string, and backtrack from there */
236 TRACE(("|%p|%p|COUNT ANY_ALL\n", pattern, ptr));
237 ptr = end;
238 break;
239
240 case SRE_OP_LITERAL:
241 /* repeated literal */
242 chr = pattern[1];
243 TRACE(("|%p|%p|COUNT LITERAL %d\n", pattern, ptr, chr));
244 c = (SRE_CHAR) chr;
245 #if SIZEOF_SRE_CHAR < 4
246 if ((SRE_CODE) c != chr)
247 ; /* literal can't match: doesn't fit in char width */
248 else
249 #endif
250 while (ptr < end && *ptr == c)
251 ptr++;
252 break;
253
254 case SRE_OP_LITERAL_IGNORE:
255 /* repeated literal */
256 chr = pattern[1];
257 TRACE(("|%p|%p|COUNT LITERAL_IGNORE %d\n", pattern, ptr, chr));
258 while (ptr < end && (SRE_CODE) sre_lower_ascii(*ptr) == chr)
259 ptr++;
260 break;
261
262 case SRE_OP_LITERAL_UNI_IGNORE:
263 /* repeated literal */
264 chr = pattern[1];
265 TRACE(("|%p|%p|COUNT LITERAL_UNI_IGNORE %d\n", pattern, ptr, chr));
266 while (ptr < end && (SRE_CODE) sre_lower_unicode(*ptr) == chr)
267 ptr++;
268 break;
269
270 case SRE_OP_LITERAL_LOC_IGNORE:
271 /* repeated literal */
272 chr = pattern[1];
273 TRACE(("|%p|%p|COUNT LITERAL_LOC_IGNORE %d\n", pattern, ptr, chr));
274 while (ptr < end && char_loc_ignore(chr, *ptr))
275 ptr++;
276 break;
277
278 case SRE_OP_NOT_LITERAL:
279 /* repeated non-literal */
280 chr = pattern[1];
281 TRACE(("|%p|%p|COUNT NOT_LITERAL %d\n", pattern, ptr, chr));
282 c = (SRE_CHAR) chr;
283 #if SIZEOF_SRE_CHAR < 4
284 if ((SRE_CODE) c != chr)
285 ptr = end; /* literal can't match: doesn't fit in char width */
286 else
287 #endif
288 while (ptr < end && *ptr != c)
289 ptr++;
290 break;
291
292 case SRE_OP_NOT_LITERAL_IGNORE:
293 /* repeated non-literal */
294 chr = pattern[1];
295 TRACE(("|%p|%p|COUNT NOT_LITERAL_IGNORE %d\n", pattern, ptr, chr));
296 while (ptr < end && (SRE_CODE) sre_lower_ascii(*ptr) != chr)
297 ptr++;
298 break;
299
300 case SRE_OP_NOT_LITERAL_UNI_IGNORE:
301 /* repeated non-literal */
302 chr = pattern[1];
303 TRACE(("|%p|%p|COUNT NOT_LITERAL_UNI_IGNORE %d\n", pattern, ptr, chr));
304 while (ptr < end && (SRE_CODE) sre_lower_unicode(*ptr) != chr)
305 ptr++;
306 break;
307
308 case SRE_OP_NOT_LITERAL_LOC_IGNORE:
309 /* repeated non-literal */
310 chr = pattern[1];
311 TRACE(("|%p|%p|COUNT NOT_LITERAL_LOC_IGNORE %d\n", pattern, ptr, chr));
312 while (ptr < end && !char_loc_ignore(chr, *ptr))
313 ptr++;
314 break;
315
316 default:
317 /* repeated single character pattern */
318 TRACE(("|%p|%p|COUNT SUBPATTERN\n", pattern, ptr));
319 while ((SRE_CHAR*) state->ptr < end) {
320 i = SRE(match)(state, pattern, 0);
321 if (i < 0)
322 return i;
323 if (!i)
324 break;
325 }
326 TRACE(("|%p|%p|COUNT %zd\n", pattern, ptr,
327 (SRE_CHAR*) state->ptr - ptr));
328 return (SRE_CHAR*) state->ptr - ptr;
329 }
330
331 TRACE(("|%p|%p|COUNT %zd\n", pattern, ptr,
332 ptr - (SRE_CHAR*) state->ptr));
333 return ptr - (SRE_CHAR*) state->ptr;
334 }
335
336 /* The macros below should be used to protect recursive SRE(match)()
337 * calls that *failed* and do *not* return immediately (IOW, those
338 * that will backtrack). Explaining:
339 *
340 * - Recursive SRE(match)() returned true: that's usually a success
341 * (besides atypical cases like ASSERT_NOT), therefore there's no
342 * reason to restore lastmark;
343 *
344 * - Recursive SRE(match)() returned false but the current SRE(match)()
345 * is returning to the caller: If the current SRE(match)() is the
346 * top function of the recursion, returning false will be a matching
347 * failure, and it doesn't matter where lastmark is pointing to.
348 * If it's *not* the top function, it will be a recursive SRE(match)()
349 * failure by itself, and the calling SRE(match)() will have to deal
350 * with the failure by the same rules explained here (it will restore
351 * lastmark by itself if necessary);
352 *
353 * - Recursive SRE(match)() returned false, and will continue the
354 * outside 'for' loop: must be protected when breaking, since the next
355 * OP could potentially depend on lastmark;
356 *
357 * - Recursive SRE(match)() returned false, and will be called again
358 * inside a local for/while loop: must be protected between each
359 * loop iteration, since the recursive SRE(match)() could do anything,
360 * and could potentially depend on lastmark.
361 *
362 * For more information, check the discussion at SF patch #712900.
363 */
364 #define LASTMARK_SAVE() \
365 do { \
366 ctx->lastmark = state->lastmark; \
367 ctx->lastindex = state->lastindex; \
368 } while (0)
369 #define LASTMARK_RESTORE() \
370 do { \
371 state->lastmark = ctx->lastmark; \
372 state->lastindex = ctx->lastindex; \
373 } while (0)
374
375 #define RETURN_ERROR(i) do { return i; } while(0)
376 #define RETURN_FAILURE do { ret = 0; goto exit; } while(0)
377 #define RETURN_SUCCESS do { ret = 1; goto exit; } while(0)
378
379 #define RETURN_ON_ERROR(i) \
380 do { if (i < 0) RETURN_ERROR(i); } while (0)
381 #define RETURN_ON_SUCCESS(i) \
382 do { RETURN_ON_ERROR(i); if (i > 0) RETURN_SUCCESS; } while (0)
383 #define RETURN_ON_FAILURE(i) \
384 do { RETURN_ON_ERROR(i); if (i == 0) RETURN_FAILURE; } while (0)
385
386 #define DATA_STACK_ALLOC(state, type, ptr) \
387 do { \
388 alloc_pos = state->data_stack_base; \
389 TRACE(("allocating %s in %zd (%zd)\n", \
390 Py_STRINGIFY(type), alloc_pos, sizeof(type))); \
391 if (sizeof(type) > state->data_stack_size - alloc_pos) { \
392 int j = data_stack_grow(state, sizeof(type)); \
393 if (j < 0) return j; \
394 if (ctx_pos != -1) \
395 DATA_STACK_LOOKUP_AT(state, SRE(match_context), ctx, ctx_pos); \
396 } \
397 ptr = (type*)(state->data_stack+alloc_pos); \
398 state->data_stack_base += sizeof(type); \
399 } while (0)
400
401 #define DATA_STACK_LOOKUP_AT(state, type, ptr, pos) \
402 do { \
403 TRACE(("looking up %s at %zd\n", Py_STRINGIFY(type), pos)); \
404 ptr = (type*)(state->data_stack+pos); \
405 } while (0)
406
407 #define DATA_STACK_PUSH(state, data, size) \
408 do { \
409 TRACE(("copy data in %p to %zd (%zd)\n", \
410 data, state->data_stack_base, size)); \
411 if (size > state->data_stack_size - state->data_stack_base) { \
412 int j = data_stack_grow(state, size); \
413 if (j < 0) return j; \
414 if (ctx_pos != -1) \
415 DATA_STACK_LOOKUP_AT(state, SRE(match_context), ctx, ctx_pos); \
416 } \
417 memcpy(state->data_stack+state->data_stack_base, data, size); \
418 state->data_stack_base += size; \
419 } while (0)
420
421 /* We add an explicit cast to memcpy here because MSVC has a bug when
422 compiling C code where it believes that `const void**` cannot be
423 safely casted to `void*`, see bpo-39943 for details. */
424 #define DATA_STACK_POP(state, data, size, discard) \
425 do { \
426 TRACE(("copy data to %p from %zd (%zd)\n", \
427 data, state->data_stack_base-size, size)); \
428 memcpy((void*) data, state->data_stack+state->data_stack_base-size, size); \
429 if (discard) \
430 state->data_stack_base -= size; \
431 } while (0)
432
433 #define DATA_STACK_POP_DISCARD(state, size) \
434 do { \
435 TRACE(("discard data from %zd (%zd)\n", \
436 state->data_stack_base-size, size)); \
437 state->data_stack_base -= size; \
438 } while(0)
439
440 #define DATA_PUSH(x) \
441 DATA_STACK_PUSH(state, (x), sizeof(*(x)))
442 #define DATA_POP(x) \
443 DATA_STACK_POP(state, (x), sizeof(*(x)), 1)
444 #define DATA_POP_DISCARD(x) \
445 DATA_STACK_POP_DISCARD(state, sizeof(*(x)))
446 #define DATA_ALLOC(t,p) \
447 DATA_STACK_ALLOC(state, t, p)
448 #define DATA_LOOKUP_AT(t,p,pos) \
449 DATA_STACK_LOOKUP_AT(state,t,p,pos)
450
451 #define MARK_PUSH(lastmark) \
452 do if (lastmark >= 0) { \
453 size_t _marks_size = (lastmark+1) * sizeof(void*); \
454 DATA_STACK_PUSH(state, state->mark, _marks_size); \
455 } while (0)
456 #define MARK_POP(lastmark) \
457 do if (lastmark >= 0) { \
458 size_t _marks_size = (lastmark+1) * sizeof(void*); \
459 DATA_STACK_POP(state, state->mark, _marks_size, 1); \
460 } while (0)
461 #define MARK_POP_KEEP(lastmark) \
462 do if (lastmark >= 0) { \
463 size_t _marks_size = (lastmark+1) * sizeof(void*); \
464 DATA_STACK_POP(state, state->mark, _marks_size, 0); \
465 } while (0)
466 #define MARK_POP_DISCARD(lastmark) \
467 do if (lastmark >= 0) { \
468 size_t _marks_size = (lastmark+1) * sizeof(void*); \
469 DATA_STACK_POP_DISCARD(state, _marks_size); \
470 } while (0)
471
472 #define JUMP_NONE 0
473 #define JUMP_MAX_UNTIL_1 1
474 #define JUMP_MAX_UNTIL_2 2
475 #define JUMP_MAX_UNTIL_3 3
476 #define JUMP_MIN_UNTIL_1 4
477 #define JUMP_MIN_UNTIL_2 5
478 #define JUMP_MIN_UNTIL_3 6
479 #define JUMP_REPEAT 7
480 #define JUMP_REPEAT_ONE_1 8
481 #define JUMP_REPEAT_ONE_2 9
482 #define JUMP_MIN_REPEAT_ONE 10
483 #define JUMP_BRANCH 11
484 #define JUMP_ASSERT 12
485 #define JUMP_ASSERT_NOT 13
486 #define JUMP_POSS_REPEAT_1 14
487 #define JUMP_POSS_REPEAT_2 15
488 #define JUMP_ATOMIC_GROUP 16
489
490 #define DO_JUMPX(jumpvalue, jumplabel, nextpattern, toplevel_) \
491 ctx->pattern = pattern; \
492 ctx->ptr = ptr; \
493 DATA_ALLOC(SRE(match_context), nextctx); \
494 nextctx->pattern = nextpattern; \
495 nextctx->toplevel = toplevel_; \
496 nextctx->jump = jumpvalue; \
497 nextctx->last_ctx_pos = ctx_pos; \
498 pattern = nextpattern; \
499 ctx_pos = alloc_pos; \
500 ctx = nextctx; \
501 goto entrance; \
502 jumplabel: \
503 pattern = ctx->pattern; \
504 ptr = ctx->ptr;
505
506 #define DO_JUMP(jumpvalue, jumplabel, nextpattern) \
507 DO_JUMPX(jumpvalue, jumplabel, nextpattern, ctx->toplevel)
508
509 #define DO_JUMP0(jumpvalue, jumplabel, nextpattern) \
510 DO_JUMPX(jumpvalue, jumplabel, nextpattern, 0)
511
512 typedef struct {
513 Py_ssize_t count;
514 union {
515 SRE_CODE chr;
516 SRE_REPEAT* rep;
517 } u;
518 int lastmark;
519 int lastindex;
520 const SRE_CODE* pattern;
521 const SRE_CHAR* ptr;
522 int toplevel;
523 int jump;
524 Py_ssize_t last_ctx_pos;
525 } SRE(match_context);
526
527 #define MAYBE_CHECK_SIGNALS \
528 do { \
529 if ((0 == (++sigcount & 0xfff)) && PyErr_CheckSignals()) { \
530 RETURN_ERROR(SRE_ERROR_INTERRUPTED); \
531 } \
532 } while (0)
533
534 #ifdef HAVE_COMPUTED_GOTOS
535 #ifndef USE_COMPUTED_GOTOS
536 #define USE_COMPUTED_GOTOS 1
537 #endif
538 #elif defined(USE_COMPUTED_GOTOS) && USE_COMPUTED_GOTOS
539 #error "Computed gotos are not supported on this compiler."
540 #else
541 #undef USE_COMPUTED_GOTOS
542 #define USE_COMPUTED_GOTOS 0
543 #endif
544
545 #if USE_COMPUTED_GOTOS
546 #define TARGET(OP) TARGET_ ## OP
547 #define DISPATCH \
548 do { \
549 MAYBE_CHECK_SIGNALS; \
550 goto *sre_targets[*pattern++]; \
551 } while (0)
552 #else
553 #define TARGET(OP) case OP
554 #define DISPATCH goto dispatch
555 #endif
556
557 /* check if string matches the given pattern. returns <0 for
558 error, 0 for failure, and 1 for success */
559 LOCAL(Py_ssize_t)
560 SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel)
561 {
562 const SRE_CHAR* end = (const SRE_CHAR *)state->end;
563 Py_ssize_t alloc_pos, ctx_pos = -1;
564 Py_ssize_t ret = 0;
565 int jump;
566 unsigned int sigcount=0;
567
568 SRE(match_context)* ctx;
569 SRE(match_context)* nextctx;
570
571 TRACE(("|%p|%p|ENTER\n", pattern, state->ptr));
572
573 DATA_ALLOC(SRE(match_context), ctx);
574 ctx->last_ctx_pos = -1;
575 ctx->jump = JUMP_NONE;
576 ctx->toplevel = toplevel;
577 ctx_pos = alloc_pos;
578
579 #if USE_COMPUTED_GOTOS
580 #include "sre_targets.h"
581 #endif
582
583 entrance:
584
585 ; // Fashion statement.
586 const SRE_CHAR *ptr = (SRE_CHAR *)state->ptr;
587
588 if (pattern[0] == SRE_OP_INFO) {
589 /* optimization info block */
590 /* <INFO> <1=skip> <2=flags> <3=min> ... */
591 if (pattern[3] && (uintptr_t)(end - ptr) < pattern[3]) {
592 TRACE(("reject (got %zd chars, need %zd)\n",
593 end - ptr, (Py_ssize_t) pattern[3]));
594 RETURN_FAILURE;
595 }
596 pattern += pattern[1] + 1;
597 }
598
599 #if USE_COMPUTED_GOTOS
600 DISPATCH;
601 #else
602 dispatch:
603 MAYBE_CHECK_SIGNALS;
604 switch (*pattern++)
605 #endif
606 {
607
608 TARGET(SRE_OP_MARK):
609 /* set mark */
610 /* <MARK> <gid> */
611 TRACE(("|%p|%p|MARK %d\n", pattern,
612 ptr, pattern[0]));
613 {
614 int i = pattern[0];
615 if (i & 1)
616 state->lastindex = i/2 + 1;
617 if (i > state->lastmark) {
618 /* state->lastmark is the highest valid index in the
619 state->mark array. If it is increased by more than 1,
620 the intervening marks must be set to NULL to signal
621 that these marks have not been encountered. */
622 int j = state->lastmark + 1;
623 while (j < i)
624 state->mark[j++] = NULL;
625 state->lastmark = i;
626 }
627 state->mark[i] = ptr;
628 }
629 pattern++;
630 DISPATCH;
631
632 TARGET(SRE_OP_LITERAL):
633 /* match literal string */
634 /* <LITERAL> <code> */
635 TRACE(("|%p|%p|LITERAL %d\n", pattern,
636 ptr, *pattern));
637 if (ptr >= end || (SRE_CODE) ptr[0] != pattern[0])
638 RETURN_FAILURE;
639 pattern++;
640 ptr++;
641 DISPATCH;
642
643 TARGET(SRE_OP_NOT_LITERAL):
644 /* match anything that is not literal character */
645 /* <NOT_LITERAL> <code> */
646 TRACE(("|%p|%p|NOT_LITERAL %d\n", pattern,
647 ptr, *pattern));
648 if (ptr >= end || (SRE_CODE) ptr[0] == pattern[0])
649 RETURN_FAILURE;
650 pattern++;
651 ptr++;
652 DISPATCH;
653
654 TARGET(SRE_OP_SUCCESS):
655 /* end of pattern */
656 TRACE(("|%p|%p|SUCCESS\n", pattern, ptr));
657 if (ctx->toplevel &&
658 ((state->match_all && ptr != state->end) ||
659 (state->must_advance && ptr == state->start)))
660 {
661 RETURN_FAILURE;
662 }
663 state->ptr = ptr;
664 RETURN_SUCCESS;
665
666 TARGET(SRE_OP_AT):
667 /* match at given position */
668 /* <AT> <code> */
669 TRACE(("|%p|%p|AT %d\n", pattern, ptr, *pattern));
670 if (!SRE(at)(state, ptr, *pattern))
671 RETURN_FAILURE;
672 pattern++;
673 DISPATCH;
674
675 TARGET(SRE_OP_CATEGORY):
676 /* match at given category */
677 /* <CATEGORY> <code> */
678 TRACE(("|%p|%p|CATEGORY %d\n", pattern,
679 ptr, *pattern));
680 if (ptr >= end || !sre_category(pattern[0], ptr[0]))
681 RETURN_FAILURE;
682 pattern++;
683 ptr++;
684 DISPATCH;
685
686 TARGET(SRE_OP_ANY):
687 /* match anything (except a newline) */
688 /* <ANY> */
689 TRACE(("|%p|%p|ANY\n", pattern, ptr));
690 if (ptr >= end || SRE_IS_LINEBREAK(ptr[0]))
691 RETURN_FAILURE;
692 ptr++;
693 DISPATCH;
694
695 TARGET(SRE_OP_ANY_ALL):
696 /* match anything */
697 /* <ANY_ALL> */
698 TRACE(("|%p|%p|ANY_ALL\n", pattern, ptr));
699 if (ptr >= end)
700 RETURN_FAILURE;
701 ptr++;
702 DISPATCH;
703
704 TARGET(SRE_OP_IN):
705 /* match set member (or non_member) */
706 /* <IN> <skip> <set> */
707 TRACE(("|%p|%p|IN\n", pattern, ptr));
708 if (ptr >= end ||
709 !SRE(charset)(state, pattern + 1, *ptr))
710 RETURN_FAILURE;
711 pattern += pattern[0];
712 ptr++;
713 DISPATCH;
714
715 TARGET(SRE_OP_LITERAL_IGNORE):
716 TRACE(("|%p|%p|LITERAL_IGNORE %d\n",
717 pattern, ptr, pattern[0]));
718 if (ptr >= end ||
719 sre_lower_ascii(*ptr) != *pattern)
720 RETURN_FAILURE;
721 pattern++;
722 ptr++;
723 DISPATCH;
724
725 TARGET(SRE_OP_LITERAL_UNI_IGNORE):
726 TRACE(("|%p|%p|LITERAL_UNI_IGNORE %d\n",
727 pattern, ptr, pattern[0]));
728 if (ptr >= end ||
729 sre_lower_unicode(*ptr) != *pattern)
730 RETURN_FAILURE;
731 pattern++;
732 ptr++;
733 DISPATCH;
734
735 TARGET(SRE_OP_LITERAL_LOC_IGNORE):
736 TRACE(("|%p|%p|LITERAL_LOC_IGNORE %d\n",
737 pattern, ptr, pattern[0]));
738 if (ptr >= end
739 || !char_loc_ignore(*pattern, *ptr))
740 RETURN_FAILURE;
741 pattern++;
742 ptr++;
743 DISPATCH;
744
745 TARGET(SRE_OP_NOT_LITERAL_IGNORE):
746 TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n",
747 pattern, ptr, *pattern));
748 if (ptr >= end ||
749 sre_lower_ascii(*ptr) == *pattern)
750 RETURN_FAILURE;
751 pattern++;
752 ptr++;
753 DISPATCH;
754
755 TARGET(SRE_OP_NOT_LITERAL_UNI_IGNORE):
756 TRACE(("|%p|%p|NOT_LITERAL_UNI_IGNORE %d\n",
757 pattern, ptr, *pattern));
758 if (ptr >= end ||
759 sre_lower_unicode(*ptr) == *pattern)
760 RETURN_FAILURE;
761 pattern++;
762 ptr++;
763 DISPATCH;
764
765 TARGET(SRE_OP_NOT_LITERAL_LOC_IGNORE):
766 TRACE(("|%p|%p|NOT_LITERAL_LOC_IGNORE %d\n",
767 pattern, ptr, *pattern));
768 if (ptr >= end
769 || char_loc_ignore(*pattern, *ptr))
770 RETURN_FAILURE;
771 pattern++;
772 ptr++;
773 DISPATCH;
774
775 TARGET(SRE_OP_IN_IGNORE):
776 TRACE(("|%p|%p|IN_IGNORE\n", pattern, ptr));
777 if (ptr >= end
778 || !SRE(charset)(state, pattern+1,
779 (SRE_CODE)sre_lower_ascii(*ptr)))
780 RETURN_FAILURE;
781 pattern += pattern[0];
782 ptr++;
783 DISPATCH;
784
785 TARGET(SRE_OP_IN_UNI_IGNORE):
786 TRACE(("|%p|%p|IN_UNI_IGNORE\n", pattern, ptr));
787 if (ptr >= end
788 || !SRE(charset)(state, pattern+1,
789 (SRE_CODE)sre_lower_unicode(*ptr)))
790 RETURN_FAILURE;
791 pattern += pattern[0];
792 ptr++;
793 DISPATCH;
794
795 TARGET(SRE_OP_IN_LOC_IGNORE):
796 TRACE(("|%p|%p|IN_LOC_IGNORE\n", pattern, ptr));
797 if (ptr >= end
798 || !SRE(charset_loc_ignore)(state, pattern+1, *ptr))
799 RETURN_FAILURE;
800 pattern += pattern[0];
801 ptr++;
802 DISPATCH;
803
804 TARGET(SRE_OP_JUMP):
805 TARGET(SRE_OP_INFO):
806 /* jump forward */
807 /* <JUMP> <offset> */
808 TRACE(("|%p|%p|JUMP %d\n", pattern,
809 ptr, pattern[0]));
810 pattern += pattern[0];
811 DISPATCH;
812
813 TARGET(SRE_OP_BRANCH):
814 /* alternation */
815 /* <BRANCH> <0=skip> code <JUMP> ... <NULL> */
816 TRACE(("|%p|%p|BRANCH\n", pattern, ptr));
817 LASTMARK_SAVE();
818 if (state->repeat)
819 MARK_PUSH(ctx->lastmark);
820 for (; pattern[0]; pattern += pattern[0]) {
821 if (pattern[1] == SRE_OP_LITERAL &&
822 (ptr >= end ||
823 (SRE_CODE) *ptr != pattern[2]))
824 continue;
825 if (pattern[1] == SRE_OP_IN &&
826 (ptr >= end ||
827 !SRE(charset)(state, pattern + 3,
828 (SRE_CODE) *ptr)))
829 continue;
830 state->ptr = ptr;
831 DO_JUMP(JUMP_BRANCH, jump_branch, pattern+1);
832 if (ret) {
833 if (state->repeat)
834 MARK_POP_DISCARD(ctx->lastmark);
835 RETURN_ON_ERROR(ret);
836 RETURN_SUCCESS;
837 }
838 if (state->repeat)
839 MARK_POP_KEEP(ctx->lastmark);
840 LASTMARK_RESTORE();
841 }
842 if (state->repeat)
843 MARK_POP_DISCARD(ctx->lastmark);
844 RETURN_FAILURE;
845
846 TARGET(SRE_OP_REPEAT_ONE):
847 /* match repeated sequence (maximizing regexp) */
848
849 /* this operator only works if the repeated item is
850 exactly one character wide, and we're not already
851 collecting backtracking points. for other cases,
852 use the MAX_REPEAT operator */
853
854 /* <REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
855
856 TRACE(("|%p|%p|REPEAT_ONE %d %d\n", pattern, ptr,
857 pattern[1], pattern[2]));
858
859 if ((Py_ssize_t) pattern[1] > end - ptr)
860 RETURN_FAILURE; /* cannot match */
861
862 state->ptr = ptr;
863
864 ret = SRE(count)(state, pattern+3, pattern[2]);
865 RETURN_ON_ERROR(ret);
866 DATA_LOOKUP_AT(SRE(match_context), ctx, ctx_pos);
867 ctx->count = ret;
868 ptr += ctx->count;
869
870 /* when we arrive here, count contains the number of
871 matches, and ptr points to the tail of the target
872 string. check if the rest of the pattern matches,
873 and backtrack if not. */
874
875 if (ctx->count < (Py_ssize_t) pattern[1])
876 RETURN_FAILURE;
877
878 if (pattern[pattern[0]] == SRE_OP_SUCCESS &&
879 ptr == state->end &&
880 !(ctx->toplevel && state->must_advance && ptr == state->start))
881 {
882 /* tail is empty. we're finished */
883 state->ptr = ptr;
884 RETURN_SUCCESS;
885 }
886
887 LASTMARK_SAVE();
888 if (state->repeat)
889 MARK_PUSH(ctx->lastmark);
890
891 if (pattern[pattern[0]] == SRE_OP_LITERAL) {
892 /* tail starts with a literal. skip positions where
893 the rest of the pattern cannot possibly match */
894 ctx->u.chr = pattern[pattern[0]+1];
895 for (;;) {
896 while (ctx->count >= (Py_ssize_t) pattern[1] &&
897 (ptr >= end || *ptr != ctx->u.chr)) {
898 ptr--;
899 ctx->count--;
900 }
901 if (ctx->count < (Py_ssize_t) pattern[1])
902 break;
903 state->ptr = ptr;
904 DO_JUMP(JUMP_REPEAT_ONE_1, jump_repeat_one_1,
905 pattern+pattern[0]);
906 if (ret) {
907 if (state->repeat)
908 MARK_POP_DISCARD(ctx->lastmark);
909 RETURN_ON_ERROR(ret);
910 RETURN_SUCCESS;
911 }
912 if (state->repeat)
913 MARK_POP_KEEP(ctx->lastmark);
914 LASTMARK_RESTORE();
915
916 ptr--;
917 ctx->count--;
918 }
919 if (state->repeat)
920 MARK_POP_DISCARD(ctx->lastmark);
921 } else {
922 /* general case */
923 while (ctx->count >= (Py_ssize_t) pattern[1]) {
924 state->ptr = ptr;
925 DO_JUMP(JUMP_REPEAT_ONE_2, jump_repeat_one_2,
926 pattern+pattern[0]);
927 if (ret) {
928 if (state->repeat)
929 MARK_POP_DISCARD(ctx->lastmark);
930 RETURN_ON_ERROR(ret);
931 RETURN_SUCCESS;
932 }
933 if (state->repeat)
934 MARK_POP_KEEP(ctx->lastmark);
935 LASTMARK_RESTORE();
936
937 ptr--;
938 ctx->count--;
939 }
940 if (state->repeat)
941 MARK_POP_DISCARD(ctx->lastmark);
942 }
943 RETURN_FAILURE;
944
945 TARGET(SRE_OP_MIN_REPEAT_ONE):
946 /* match repeated sequence (minimizing regexp) */
947
948 /* this operator only works if the repeated item is
949 exactly one character wide, and we're not already
950 collecting backtracking points. for other cases,
951 use the MIN_REPEAT operator */
952
953 /* <MIN_REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
954
955 TRACE(("|%p|%p|MIN_REPEAT_ONE %d %d\n", pattern, ptr,
956 pattern[1], pattern[2]));
957
958 if ((Py_ssize_t) pattern[1] > end - ptr)
959 RETURN_FAILURE; /* cannot match */
960
961 state->ptr = ptr;
962
963 if (pattern[1] == 0)
964 ctx->count = 0;
965 else {
966 /* count using pattern min as the maximum */
967 ret = SRE(count)(state, pattern+3, pattern[1]);
968 RETURN_ON_ERROR(ret);
969 DATA_LOOKUP_AT(SRE(match_context), ctx, ctx_pos);
970 if (ret < (Py_ssize_t) pattern[1])
971 /* didn't match minimum number of times */
972 RETURN_FAILURE;
973 /* advance past minimum matches of repeat */
974 ctx->count = ret;
975 ptr += ctx->count;
976 }
977
978 if (pattern[pattern[0]] == SRE_OP_SUCCESS &&
979 !(ctx->toplevel &&
980 ((state->match_all && ptr != state->end) ||
981 (state->must_advance && ptr == state->start))))
982 {
983 /* tail is empty. we're finished */
984 state->ptr = ptr;
985 RETURN_SUCCESS;
986
987 } else {
988 /* general case */
989 LASTMARK_SAVE();
990 if (state->repeat)
991 MARK_PUSH(ctx->lastmark);
992
993 while ((Py_ssize_t)pattern[2] == SRE_MAXREPEAT
994 || ctx->count <= (Py_ssize_t)pattern[2]) {
995 state->ptr = ptr;
996 DO_JUMP(JUMP_MIN_REPEAT_ONE,jump_min_repeat_one,
997 pattern+pattern[0]);
998 if (ret) {
999 if (state->repeat)
1000 MARK_POP_DISCARD(ctx->lastmark);
1001 RETURN_ON_ERROR(ret);
1002 RETURN_SUCCESS;
1003 }
1004 if (state->repeat)
1005 MARK_POP_KEEP(ctx->lastmark);
1006 LASTMARK_RESTORE();
1007
1008 state->ptr = ptr;
1009 ret = SRE(count)(state, pattern+3, 1);
1010 RETURN_ON_ERROR(ret);
1011 DATA_LOOKUP_AT(SRE(match_context), ctx, ctx_pos);
1012 if (ret == 0)
1013 break;
1014 assert(ret == 1);
1015 ptr++;
1016 ctx->count++;
1017 }
1018 if (state->repeat)
1019 MARK_POP_DISCARD(ctx->lastmark);
1020 }
1021 RETURN_FAILURE;
1022
1023 TARGET(SRE_OP_POSSESSIVE_REPEAT_ONE):
1024 /* match repeated sequence (maximizing regexp) without
1025 backtracking */
1026
1027 /* this operator only works if the repeated item is
1028 exactly one character wide, and we're not already
1029 collecting backtracking points. for other cases,
1030 use the MAX_REPEAT operator */
1031
1032 /* <POSSESSIVE_REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS>
1033 tail */
1034
1035 TRACE(("|%p|%p|POSSESSIVE_REPEAT_ONE %d %d\n", pattern,
1036 ptr, pattern[1], pattern[2]));
1037
1038 if (ptr + pattern[1] > end) {
1039 RETURN_FAILURE; /* cannot match */
1040 }
1041
1042 state->ptr = ptr;
1043
1044 ret = SRE(count)(state, pattern + 3, pattern[2]);
1045 RETURN_ON_ERROR(ret);
1046 DATA_LOOKUP_AT(SRE(match_context), ctx, ctx_pos);
1047 ctx->count = ret;
1048 ptr += ctx->count;
1049
1050 /* when we arrive here, count contains the number of
1051 matches, and ptr points to the tail of the target
1052 string. check if the rest of the pattern matches,
1053 and fail if not. */
1054
1055 /* Test for not enough repetitions in match */
1056 if (ctx->count < (Py_ssize_t) pattern[1]) {
1057 RETURN_FAILURE;
1058 }
1059
1060 /* Update the pattern to point to the next op code */
1061 pattern += pattern[0];
1062
1063 /* Let the tail be evaluated separately and consider this
1064 match successful. */
1065 if (*pattern == SRE_OP_SUCCESS &&
1066 ptr == state->end &&
1067 !(ctx->toplevel && state->must_advance && ptr == state->start))
1068 {
1069 /* tail is empty. we're finished */
1070 state->ptr = ptr;
1071 RETURN_SUCCESS;
1072 }
1073
1074 /* Attempt to match the rest of the string */
1075 DISPATCH;
1076
1077 TARGET(SRE_OP_REPEAT):
1078 /* create repeat context. all the hard work is done
1079 by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */
1080 /* <REPEAT> <skip> <1=min> <2=max>
1081 <3=repeat_index> item <UNTIL> tail */
1082 TRACE(("|%p|%p|REPEAT %d %d\n", pattern, ptr,
1083 pattern[1], pattern[2]));
1084
1085 /* install new repeat context */
1086 /* TODO(https://github.com/python/cpython/issues/67877): Fix this
1087 * potential memory leak. */
1088 ctx->u.rep = (SRE_REPEAT*) PyObject_Malloc(sizeof(*ctx->u.rep));
1089 if (!ctx->u.rep) {
1090 PyErr_NoMemory();
1091 RETURN_FAILURE;
1092 }
1093 ctx->u.rep->count = -1;
1094 ctx->u.rep->pattern = pattern;
1095 ctx->u.rep->prev = state->repeat;
1096 ctx->u.rep->last_ptr = NULL;
1097 state->repeat = ctx->u.rep;
1098
1099 state->ptr = ptr;
1100 DO_JUMP(JUMP_REPEAT, jump_repeat, pattern+pattern[0]);
1101 state->repeat = ctx->u.rep->prev;
1102 PyObject_Free(ctx->u.rep);
1103
1104 if (ret) {
1105 RETURN_ON_ERROR(ret);
1106 RETURN_SUCCESS;
1107 }
1108 RETURN_FAILURE;
1109
1110 TARGET(SRE_OP_MAX_UNTIL):
1111 /* maximizing repeat */
1112 /* <REPEAT> <skip> <1=min> <2=max> item <MAX_UNTIL> tail */
1113
1114 /* FIXME: we probably need to deal with zero-width
1115 matches in here... */
1116
1117 ctx->u.rep = state->repeat;
1118 if (!ctx->u.rep)
1119 RETURN_ERROR(SRE_ERROR_STATE);
1120
1121 state->ptr = ptr;
1122
1123 ctx->count = ctx->u.rep->count+1;
1124
1125 TRACE(("|%p|%p|MAX_UNTIL %zd\n", pattern,
1126 ptr, ctx->count));
1127
1128 if (ctx->count < (Py_ssize_t) ctx->u.rep->pattern[1]) {
1129 /* not enough matches */
1130 ctx->u.rep->count = ctx->count;
1131 DO_JUMP(JUMP_MAX_UNTIL_1, jump_max_until_1,
1132 ctx->u.rep->pattern+3);
1133 if (ret) {
1134 RETURN_ON_ERROR(ret);
1135 RETURN_SUCCESS;
1136 }
1137 ctx->u.rep->count = ctx->count-1;
1138 state->ptr = ptr;
1139 RETURN_FAILURE;
1140 }
1141
1142 if ((ctx->count < (Py_ssize_t) ctx->u.rep->pattern[2] ||
1143 ctx->u.rep->pattern[2] == SRE_MAXREPEAT) &&
1144 state->ptr != ctx->u.rep->last_ptr) {
1145 /* we may have enough matches, but if we can
1146 match another item, do so */
1147 ctx->u.rep->count = ctx->count;
1148 LASTMARK_SAVE();
1149 MARK_PUSH(ctx->lastmark);
1150 /* zero-width match protection */
1151 DATA_PUSH(&ctx->u.rep->last_ptr);
1152 ctx->u.rep->last_ptr = state->ptr;
1153 DO_JUMP(JUMP_MAX_UNTIL_2, jump_max_until_2,
1154 ctx->u.rep->pattern+3);
1155 DATA_POP(&ctx->u.rep->last_ptr);
1156 if (ret) {
1157 MARK_POP_DISCARD(ctx->lastmark);
1158 RETURN_ON_ERROR(ret);
1159 RETURN_SUCCESS;
1160 }
1161 MARK_POP(ctx->lastmark);
1162 LASTMARK_RESTORE();
1163 ctx->u.rep->count = ctx->count-1;
1164 state->ptr = ptr;
1165 }
1166
1167 /* cannot match more repeated items here. make sure the
1168 tail matches */
1169 state->repeat = ctx->u.rep->prev;
1170 DO_JUMP(JUMP_MAX_UNTIL_3, jump_max_until_3, pattern);
1171 state->repeat = ctx->u.rep; // restore repeat before return
1172
1173 RETURN_ON_SUCCESS(ret);
1174 state->ptr = ptr;
1175 RETURN_FAILURE;
1176
1177 TARGET(SRE_OP_MIN_UNTIL):
1178 /* minimizing repeat */
1179 /* <REPEAT> <skip> <1=min> <2=max> item <MIN_UNTIL> tail */
1180
1181 ctx->u.rep = state->repeat;
1182 if (!ctx->u.rep)
1183 RETURN_ERROR(SRE_ERROR_STATE);
1184
1185 state->ptr = ptr;
1186
1187 ctx->count = ctx->u.rep->count+1;
1188
1189 TRACE(("|%p|%p|MIN_UNTIL %zd %p\n", pattern,
1190 ptr, ctx->count, ctx->u.rep->pattern));
1191
1192 if (ctx->count < (Py_ssize_t) ctx->u.rep->pattern[1]) {
1193 /* not enough matches */
1194 ctx->u.rep->count = ctx->count;
1195 DO_JUMP(JUMP_MIN_UNTIL_1, jump_min_until_1,
1196 ctx->u.rep->pattern+3);
1197 if (ret) {
1198 RETURN_ON_ERROR(ret);
1199 RETURN_SUCCESS;
1200 }
1201 ctx->u.rep->count = ctx->count-1;
1202 state->ptr = ptr;
1203 RETURN_FAILURE;
1204 }
1205
1206 /* see if the tail matches */
1207 state->repeat = ctx->u.rep->prev;
1208
1209 LASTMARK_SAVE();
1210 if (state->repeat)
1211 MARK_PUSH(ctx->lastmark);
1212
1213 DO_JUMP(JUMP_MIN_UNTIL_2, jump_min_until_2, pattern);
1214 SRE_REPEAT *repeat_of_tail = state->repeat;
1215 state->repeat = ctx->u.rep; // restore repeat before return
1216
1217 if (ret) {
1218 if (repeat_of_tail)
1219 MARK_POP_DISCARD(ctx->lastmark);
1220 RETURN_ON_ERROR(ret);
1221 RETURN_SUCCESS;
1222 }
1223 if (repeat_of_tail)
1224 MARK_POP(ctx->lastmark);
1225 LASTMARK_RESTORE();
1226
1227 state->ptr = ptr;
1228
1229 if ((ctx->count >= (Py_ssize_t) ctx->u.rep->pattern[2]
1230 && ctx->u.rep->pattern[2] != SRE_MAXREPEAT) ||
1231 state->ptr == ctx->u.rep->last_ptr)
1232 RETURN_FAILURE;
1233
1234 ctx->u.rep->count = ctx->count;
1235 /* zero-width match protection */
1236 DATA_PUSH(&ctx->u.rep->last_ptr);
1237 ctx->u.rep->last_ptr = state->ptr;
1238 DO_JUMP(JUMP_MIN_UNTIL_3,jump_min_until_3,
1239 ctx->u.rep->pattern+3);
1240 DATA_POP(&ctx->u.rep->last_ptr);
1241 if (ret) {
1242 RETURN_ON_ERROR(ret);
1243 RETURN_SUCCESS;
1244 }
1245 ctx->u.rep->count = ctx->count-1;
1246 state->ptr = ptr;
1247 RETURN_FAILURE;
1248
1249 TARGET(SRE_OP_POSSESSIVE_REPEAT):
1250 /* create possessive repeat contexts. */
1251 /* <POSSESSIVE_REPEAT> <skip> <1=min> <2=max> pattern
1252 <SUCCESS> tail */
1253 TRACE(("|%p|%p|POSSESSIVE_REPEAT %d %d\n", pattern,
1254 ptr, pattern[1], pattern[2]));
1255
1256 /* Set the global Input pointer to this context's Input
1257 pointer */
1258 state->ptr = ptr;
1259
1260 /* Initialize Count to 0 */
1261 ctx->count = 0;
1262
1263 /* Check for minimum required matches. */
1264 while (ctx->count < (Py_ssize_t)pattern[1]) {
1265 /* not enough matches */
1266 DO_JUMP0(JUMP_POSS_REPEAT_1, jump_poss_repeat_1,
1267 &pattern[3]);
1268 if (ret) {
1269 RETURN_ON_ERROR(ret);
1270 ctx->count++;
1271 }
1272 else {
1273 state->ptr = ptr;
1274 RETURN_FAILURE;
1275 }
1276 }
1277
1278 /* Clear the context's Input stream pointer so that it
1279 doesn't match the global state so that the while loop can
1280 be entered. */
1281 ptr = NULL;
1282
1283 /* Keep trying to parse the <pattern> sub-pattern until the
1284 end is reached, creating a new context each time. */
1285 while ((ctx->count < (Py_ssize_t)pattern[2] ||
1286 (Py_ssize_t)pattern[2] == SRE_MAXREPEAT) &&
1287 state->ptr != ptr) {
1288 /* Save the Capture Group Marker state into the current
1289 Context and back up the current highest number
1290 Capture Group marker. */
1291 LASTMARK_SAVE();
1292 MARK_PUSH(ctx->lastmark);
1293
1294 /* zero-width match protection */
1295 /* Set the context's Input Stream pointer to be the
1296 current Input Stream pointer from the global
1297 state. When the loop reaches the next iteration,
1298 the context will then store the last known good
1299 position with the global state holding the Input
1300 Input Stream position that has been updated with
1301 the most recent match. Thus, if state's Input
1302 stream remains the same as the one stored in the
1303 current Context, we know we have successfully
1304 matched an empty string and that all subsequent
1305 matches will also be the empty string until the
1306 maximum number of matches are counted, and because
1307 of this, we could immediately stop at that point and
1308 consider this match successful. */
1309 ptr = state->ptr;
1310
1311 /* We have not reached the maximin matches, so try to
1312 match once more. */
1313 DO_JUMP0(JUMP_POSS_REPEAT_2, jump_poss_repeat_2,
1314 &pattern[3]);
1315
1316 /* Check to see if the last attempted match
1317 succeeded. */
1318 if (ret) {
1319 /* Drop the saved highest number Capture Group
1320 marker saved above and use the newly updated
1321 value. */
1322 MARK_POP_DISCARD(ctx->lastmark);
1323 RETURN_ON_ERROR(ret);
1324
1325 /* Success, increment the count. */
1326 ctx->count++;
1327 }
1328 /* Last attempted match failed. */
1329 else {
1330 /* Restore the previously saved highest number
1331 Capture Group marker since the last iteration
1332 did not match, then restore that to the global
1333 state. */
1334 MARK_POP(ctx->lastmark);
1335 LASTMARK_RESTORE();
1336
1337 /* Restore the global Input Stream pointer
1338 since it can change after jumps. */
1339 state->ptr = ptr;
1340
1341 /* We have sufficient matches, so exit loop. */
1342 break;
1343 }
1344 }
1345
1346 /* Evaluate Tail */
1347 /* Jump to end of pattern indicated by skip, and then skip
1348 the SUCCESS op code that follows it. */
1349 pattern += pattern[0] + 1;
1350 ptr = state->ptr;
1351 DISPATCH;
1352
1353 TARGET(SRE_OP_ATOMIC_GROUP):
1354 /* Atomic Group Sub Pattern */
1355 /* <ATOMIC_GROUP> <skip> pattern <SUCCESS> tail */
1356 TRACE(("|%p|%p|ATOMIC_GROUP\n", pattern, ptr));
1357
1358 /* Set the global Input pointer to this context's Input
1359 pointer */
1360 state->ptr = ptr;
1361
1362 /* Evaluate the Atomic Group in a new context, terminating
1363 when the end of the group, represented by a SUCCESS op
1364 code, is reached. */
1365 /* Group Pattern begins at an offset of 1 code. */
1366 DO_JUMP0(JUMP_ATOMIC_GROUP, jump_atomic_group,
1367 &pattern[1]);
1368
1369 /* Test Exit Condition */
1370 RETURN_ON_ERROR(ret);
1371
1372 if (ret == 0) {
1373 /* Atomic Group failed to Match. */
1374 state->ptr = ptr;
1375 RETURN_FAILURE;
1376 }
1377
1378 /* Evaluate Tail */
1379 /* Jump to end of pattern indicated by skip, and then skip
1380 the SUCCESS op code that follows it. */
1381 pattern += pattern[0];
1382 ptr = state->ptr;
1383 DISPATCH;
1384
1385 TARGET(SRE_OP_GROUPREF):
1386 /* match backreference */
1387 TRACE(("|%p|%p|GROUPREF %d\n", pattern,
1388 ptr, pattern[0]));
1389 {
1390 int groupref = pattern[0] * 2;
1391 if (groupref >= state->lastmark) {
1392 RETURN_FAILURE;
1393 } else {
1394 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1395 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1396 if (!p || !e || e < p)
1397 RETURN_FAILURE;
1398 while (p < e) {
1399 if (ptr >= end || *ptr != *p)
1400 RETURN_FAILURE;
1401 p++;
1402 ptr++;
1403 }
1404 }
1405 }
1406 pattern++;
1407 DISPATCH;
1408
1409 TARGET(SRE_OP_GROUPREF_IGNORE):
1410 /* match backreference */
1411 TRACE(("|%p|%p|GROUPREF_IGNORE %d\n", pattern,
1412 ptr, pattern[0]));
1413 {
1414 int groupref = pattern[0] * 2;
1415 if (groupref >= state->lastmark) {
1416 RETURN_FAILURE;
1417 } else {
1418 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1419 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1420 if (!p || !e || e < p)
1421 RETURN_FAILURE;
1422 while (p < e) {
1423 if (ptr >= end ||
1424 sre_lower_ascii(*ptr) != sre_lower_ascii(*p))
1425 RETURN_FAILURE;
1426 p++;
1427 ptr++;
1428 }
1429 }
1430 }
1431 pattern++;
1432 DISPATCH;
1433
1434 TARGET(SRE_OP_GROUPREF_UNI_IGNORE):
1435 /* match backreference */
1436 TRACE(("|%p|%p|GROUPREF_UNI_IGNORE %d\n", pattern,
1437 ptr, pattern[0]));
1438 {
1439 int groupref = pattern[0] * 2;
1440 if (groupref >= state->lastmark) {
1441 RETURN_FAILURE;
1442 } else {
1443 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1444 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1445 if (!p || !e || e < p)
1446 RETURN_FAILURE;
1447 while (p < e) {
1448 if (ptr >= end ||
1449 sre_lower_unicode(*ptr) != sre_lower_unicode(*p))
1450 RETURN_FAILURE;
1451 p++;
1452 ptr++;
1453 }
1454 }
1455 }
1456 pattern++;
1457 DISPATCH;
1458
1459 TARGET(SRE_OP_GROUPREF_LOC_IGNORE):
1460 /* match backreference */
1461 TRACE(("|%p|%p|GROUPREF_LOC_IGNORE %d\n", pattern,
1462 ptr, pattern[0]));
1463 {
1464 int groupref = pattern[0] * 2;
1465 if (groupref >= state->lastmark) {
1466 RETURN_FAILURE;
1467 } else {
1468 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1469 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1470 if (!p || !e || e < p)
1471 RETURN_FAILURE;
1472 while (p < e) {
1473 if (ptr >= end ||
1474 sre_lower_locale(*ptr) != sre_lower_locale(*p))
1475 RETURN_FAILURE;
1476 p++;
1477 ptr++;
1478 }
1479 }
1480 }
1481 pattern++;
1482 DISPATCH;
1483
1484 TARGET(SRE_OP_GROUPREF_EXISTS):
1485 TRACE(("|%p|%p|GROUPREF_EXISTS %d\n", pattern,
1486 ptr, pattern[0]));
1487 /* <GROUPREF_EXISTS> <group> <skip> codeyes <JUMP> codeno ... */
1488 {
1489 int groupref = pattern[0] * 2;
1490 if (groupref >= state->lastmark) {
1491 pattern += pattern[1];
1492 DISPATCH;
1493 } else {
1494 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1495 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1496 if (!p || !e || e < p) {
1497 pattern += pattern[1];
1498 DISPATCH;
1499 }
1500 }
1501 }
1502 pattern += 2;
1503 DISPATCH;
1504
1505 TARGET(SRE_OP_ASSERT):
1506 /* assert subpattern */
1507 /* <ASSERT> <skip> <back> <pattern> */
1508 TRACE(("|%p|%p|ASSERT %d\n", pattern,
1509 ptr, pattern[1]));
1510 if (ptr - (SRE_CHAR *)state->beginning < (Py_ssize_t)pattern[1])
1511 RETURN_FAILURE;
1512 state->ptr = ptr - pattern[1];
1513 DO_JUMP0(JUMP_ASSERT, jump_assert, pattern+2);
1514 RETURN_ON_FAILURE(ret);
1515 pattern += pattern[0];
1516 DISPATCH;
1517
1518 TARGET(SRE_OP_ASSERT_NOT):
1519 /* assert not subpattern */
1520 /* <ASSERT_NOT> <skip> <back> <pattern> */
1521 TRACE(("|%p|%p|ASSERT_NOT %d\n", pattern,
1522 ptr, pattern[1]));
1523 if (ptr - (SRE_CHAR *)state->beginning >= (Py_ssize_t)pattern[1]) {
1524 state->ptr = ptr - pattern[1];
1525 LASTMARK_SAVE();
1526 if (state->repeat)
1527 MARK_PUSH(ctx->lastmark);
1528
1529 DO_JUMP0(JUMP_ASSERT_NOT, jump_assert_not, pattern+2);
1530 if (ret) {
1531 if (state->repeat)
1532 MARK_POP_DISCARD(ctx->lastmark);
1533 RETURN_ON_ERROR(ret);
1534 RETURN_FAILURE;
1535 }
1536 if (state->repeat)
1537 MARK_POP(ctx->lastmark);
1538 LASTMARK_RESTORE();
1539 }
1540 pattern += pattern[0];
1541 DISPATCH;
1542
1543 TARGET(SRE_OP_FAILURE):
1544 /* immediate failure */
1545 TRACE(("|%p|%p|FAILURE\n", pattern, ptr));
1546 RETURN_FAILURE;
1547
1548 #if !USE_COMPUTED_GOTOS
1549 default:
1550 #endif
1551 // Also any unused opcodes:
1552 TARGET(SRE_OP_RANGE_UNI_IGNORE):
1553 TARGET(SRE_OP_SUBPATTERN):
1554 TARGET(SRE_OP_RANGE):
1555 TARGET(SRE_OP_NEGATE):
1556 TARGET(SRE_OP_BIGCHARSET):
1557 TARGET(SRE_OP_CHARSET):
1558 TRACE(("|%p|%p|UNKNOWN %d\n", pattern, ptr,
1559 pattern[-1]));
1560 RETURN_ERROR(SRE_ERROR_ILLEGAL);
1561
1562 }
1563
1564 exit:
1565 ctx_pos = ctx->last_ctx_pos;
1566 jump = ctx->jump;
1567 DATA_POP_DISCARD(ctx);
1568 if (ctx_pos == -1)
1569 return ret;
1570 DATA_LOOKUP_AT(SRE(match_context), ctx, ctx_pos);
1571
1572 switch (jump) {
1573 case JUMP_MAX_UNTIL_2:
1574 TRACE(("|%p|%p|JUMP_MAX_UNTIL_2\n", pattern, ptr));
1575 goto jump_max_until_2;
1576 case JUMP_MAX_UNTIL_3:
1577 TRACE(("|%p|%p|JUMP_MAX_UNTIL_3\n", pattern, ptr));
1578 goto jump_max_until_3;
1579 case JUMP_MIN_UNTIL_2:
1580 TRACE(("|%p|%p|JUMP_MIN_UNTIL_2\n", pattern, ptr));
1581 goto jump_min_until_2;
1582 case JUMP_MIN_UNTIL_3:
1583 TRACE(("|%p|%p|JUMP_MIN_UNTIL_3\n", pattern, ptr));
1584 goto jump_min_until_3;
1585 case JUMP_BRANCH:
1586 TRACE(("|%p|%p|JUMP_BRANCH\n", pattern, ptr));
1587 goto jump_branch;
1588 case JUMP_MAX_UNTIL_1:
1589 TRACE(("|%p|%p|JUMP_MAX_UNTIL_1\n", pattern, ptr));
1590 goto jump_max_until_1;
1591 case JUMP_MIN_UNTIL_1:
1592 TRACE(("|%p|%p|JUMP_MIN_UNTIL_1\n", pattern, ptr));
1593 goto jump_min_until_1;
1594 case JUMP_POSS_REPEAT_1:
1595 TRACE(("|%p|%p|JUMP_POSS_REPEAT_1\n", pattern, ptr));
1596 goto jump_poss_repeat_1;
1597 case JUMP_POSS_REPEAT_2:
1598 TRACE(("|%p|%p|JUMP_POSS_REPEAT_2\n", pattern, ptr));
1599 goto jump_poss_repeat_2;
1600 case JUMP_REPEAT:
1601 TRACE(("|%p|%p|JUMP_REPEAT\n", pattern, ptr));
1602 goto jump_repeat;
1603 case JUMP_REPEAT_ONE_1:
1604 TRACE(("|%p|%p|JUMP_REPEAT_ONE_1\n", pattern, ptr));
1605 goto jump_repeat_one_1;
1606 case JUMP_REPEAT_ONE_2:
1607 TRACE(("|%p|%p|JUMP_REPEAT_ONE_2\n", pattern, ptr));
1608 goto jump_repeat_one_2;
1609 case JUMP_MIN_REPEAT_ONE:
1610 TRACE(("|%p|%p|JUMP_MIN_REPEAT_ONE\n", pattern, ptr));
1611 goto jump_min_repeat_one;
1612 case JUMP_ATOMIC_GROUP:
1613 TRACE(("|%p|%p|JUMP_ATOMIC_GROUP\n", pattern, ptr));
1614 goto jump_atomic_group;
1615 case JUMP_ASSERT:
1616 TRACE(("|%p|%p|JUMP_ASSERT\n", pattern, ptr));
1617 goto jump_assert;
1618 case JUMP_ASSERT_NOT:
1619 TRACE(("|%p|%p|JUMP_ASSERT_NOT\n", pattern, ptr));
1620 goto jump_assert_not;
1621 case JUMP_NONE:
1622 TRACE(("|%p|%p|RETURN %zd\n", pattern,
1623 ptr, ret));
1624 break;
1625 }
1626
1627 return ret; /* should never get here */
1628 }
1629
1630 /* need to reset capturing groups between two SRE(match) callings in loops */
1631 #define RESET_CAPTURE_GROUP() \
1632 do { state->lastmark = state->lastindex = -1; } while (0)
1633
1634 LOCAL(Py_ssize_t)
1635 SRE(search)(SRE_STATE* state, SRE_CODE* pattern)
1636 {
1637 SRE_CHAR* ptr = (SRE_CHAR *)state->start;
1638 SRE_CHAR* end = (SRE_CHAR *)state->end;
1639 Py_ssize_t status = 0;
1640 Py_ssize_t prefix_len = 0;
1641 Py_ssize_t prefix_skip = 0;
1642 SRE_CODE* prefix = NULL;
1643 SRE_CODE* charset = NULL;
1644 SRE_CODE* overlap = NULL;
1645 int flags = 0;
1646
1647 if (ptr > end)
1648 return 0;
1649
1650 if (pattern[0] == SRE_OP_INFO) {
1651 /* optimization info block */
1652 /* <INFO> <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */
1653
1654 flags = pattern[2];
1655
1656 if (pattern[3] && end - ptr < (Py_ssize_t)pattern[3]) {
1657 TRACE(("reject (got %u chars, need %u)\n",
1658 (unsigned int)(end - ptr), pattern[3]));
1659 return 0;
1660 }
1661 if (pattern[3] > 1) {
1662 /* adjust end point (but make sure we leave at least one
1663 character in there, so literal search will work) */
1664 end -= pattern[3] - 1;
1665 if (end <= ptr)
1666 end = ptr;
1667 }
1668
1669 if (flags & SRE_INFO_PREFIX) {
1670 /* pattern starts with a known prefix */
1671 /* <length> <skip> <prefix data> <overlap data> */
1672 prefix_len = pattern[5];
1673 prefix_skip = pattern[6];
1674 prefix = pattern + 7;
1675 overlap = prefix + prefix_len - 1;
1676 } else if (flags & SRE_INFO_CHARSET)
1677 /* pattern starts with a character from a known set */
1678 /* <charset> */
1679 charset = pattern + 5;
1680
1681 pattern += 1 + pattern[1];
1682 }
1683
1684 TRACE(("prefix = %p %zd %zd\n",
1685 prefix, prefix_len, prefix_skip));
1686 TRACE(("charset = %p\n", charset));
1687
1688 if (prefix_len == 1) {
1689 /* pattern starts with a literal character */
1690 SRE_CHAR c = (SRE_CHAR) prefix[0];
1691 #if SIZEOF_SRE_CHAR < 4
1692 if ((SRE_CODE) c != prefix[0])
1693 return 0; /* literal can't match: doesn't fit in char width */
1694 #endif
1695 end = (SRE_CHAR *)state->end;
1696 state->must_advance = 0;
1697 while (ptr < end) {
1698 while (*ptr != c) {
1699 if (++ptr >= end)
1700 return 0;
1701 }
1702 TRACE(("|%p|%p|SEARCH LITERAL\n", pattern, ptr));
1703 state->start = ptr;
1704 state->ptr = ptr + prefix_skip;
1705 if (flags & SRE_INFO_LITERAL)
1706 return 1; /* we got all of it */
1707 status = SRE(match)(state, pattern + 2*prefix_skip, 0);
1708 if (status != 0)
1709 return status;
1710 ++ptr;
1711 RESET_CAPTURE_GROUP();
1712 }
1713 return 0;
1714 }
1715
1716 if (prefix_len > 1) {
1717 /* pattern starts with a known prefix. use the overlap
1718 table to skip forward as fast as we possibly can */
1719 Py_ssize_t i = 0;
1720
1721 end = (SRE_CHAR *)state->end;
1722 if (prefix_len > end - ptr)
1723 return 0;
1724 #if SIZEOF_SRE_CHAR < 4
1725 for (i = 0; i < prefix_len; i++)
1726 if ((SRE_CODE)(SRE_CHAR) prefix[i] != prefix[i])
1727 return 0; /* literal can't match: doesn't fit in char width */
1728 #endif
1729 while (ptr < end) {
1730 SRE_CHAR c = (SRE_CHAR) prefix[0];
1731 while (*ptr++ != c) {
1732 if (ptr >= end)
1733 return 0;
1734 }
1735 if (ptr >= end)
1736 return 0;
1737
1738 i = 1;
1739 state->must_advance = 0;
1740 do {
1741 if (*ptr == (SRE_CHAR) prefix[i]) {
1742 if (++i != prefix_len) {
1743 if (++ptr >= end)
1744 return 0;
1745 continue;
1746 }
1747 /* found a potential match */
1748 TRACE(("|%p|%p|SEARCH SCAN\n", pattern, ptr));
1749 state->start = ptr - (prefix_len - 1);
1750 state->ptr = ptr - (prefix_len - prefix_skip - 1);
1751 if (flags & SRE_INFO_LITERAL)
1752 return 1; /* we got all of it */
1753 status = SRE(match)(state, pattern + 2*prefix_skip, 0);
1754 if (status != 0)
1755 return status;
1756 /* close but no cigar -- try again */
1757 if (++ptr >= end)
1758 return 0;
1759 RESET_CAPTURE_GROUP();
1760 }
1761 i = overlap[i];
1762 } while (i != 0);
1763 }
1764 return 0;
1765 }
1766
1767 if (charset) {
1768 /* pattern starts with a character from a known set */
1769 end = (SRE_CHAR *)state->end;
1770 state->must_advance = 0;
1771 for (;;) {
1772 while (ptr < end && !SRE(charset)(state, charset, *ptr))
1773 ptr++;
1774 if (ptr >= end)
1775 return 0;
1776 TRACE(("|%p|%p|SEARCH CHARSET\n", pattern, ptr));
1777 state->start = ptr;
1778 state->ptr = ptr;
1779 status = SRE(match)(state, pattern, 0);
1780 if (status != 0)
1781 break;
1782 ptr++;
1783 RESET_CAPTURE_GROUP();
1784 }
1785 } else {
1786 /* general case */
1787 assert(ptr <= end);
1788 TRACE(("|%p|%p|SEARCH\n", pattern, ptr));
1789 state->start = state->ptr = ptr;
1790 status = SRE(match)(state, pattern, 1);
1791 state->must_advance = 0;
1792 if (status == 0 && pattern[0] == SRE_OP_AT &&
1793 (pattern[1] == SRE_AT_BEGINNING ||
1794 pattern[1] == SRE_AT_BEGINNING_STRING))
1795 {
1796 state->start = state->ptr = ptr = end;
1797 return 0;
1798 }
1799 while (status == 0 && ptr < end) {
1800 ptr++;
1801 RESET_CAPTURE_GROUP();
1802 TRACE(("|%p|%p|SEARCH\n", pattern, ptr));
1803 state->start = state->ptr = ptr;
1804 status = SRE(match)(state, pattern, 0);
1805 }
1806 }
1807
1808 return status;
1809 }
1810
1811 #undef SRE_CHAR
1812 #undef SIZEOF_SRE_CHAR
1813 #undef SRE
1814
1815 /* vim:ts=4:sw=4:et
1816 */