# python (3.12.0)
#
# Secret Labs' Regular Expression Engine
#
# convert re-style regular expression to sre pattern
#
# Copyright (c) 1998-2001 by Secret Labs AB.  All rights reserved.
#
# See the __init__.py file for information on usage and redistribution.
#

"""Internal support module for sre"""

# XXX: show string offset and offending character for all errors
14
from ._constants import *
16
17 SPECIAL_CHARS = ".\\[{()*+?^$|"
18 REPEAT_CHARS = "*+?{"
19
20 DIGITS = frozenset("0123456789")
21
22 OCTDIGITS = frozenset("01234567")
23 HEXDIGITS = frozenset("0123456789abcdefABCDEF")
24 ASCIILETTERS = frozenset("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")
25
26 WHITESPACE = frozenset(" \t\n\r\v\f")
27
28 _REPEATCODES = frozenset({MIN_REPEAT, MAX_REPEAT, POSSESSIVE_REPEAT})
29 _UNITCODES = frozenset({ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY})
30
31 ESCAPES = {
32 r"\a": (LITERAL, ord("\a")),
33 r"\b": (LITERAL, ord("\b")),
34 r"\f": (LITERAL, ord("\f")),
35 r"\n": (LITERAL, ord("\n")),
36 r"\r": (LITERAL, ord("\r")),
37 r"\t": (LITERAL, ord("\t")),
38 r"\v": (LITERAL, ord("\v")),
39 r"\\": (LITERAL, ord("\\"))
40 }
41
42 CATEGORIES = {
43 r"\A": (AT, AT_BEGINNING_STRING), # start of string
44 r"\b": (AT, AT_BOUNDARY),
45 r"\B": (AT, AT_NON_BOUNDARY),
46 r"\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]),
47 r"\D": (IN, [(CATEGORY, CATEGORY_NOT_DIGIT)]),
48 r"\s": (IN, [(CATEGORY, CATEGORY_SPACE)]),
49 r"\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]),
50 r"\w": (IN, [(CATEGORY, CATEGORY_WORD)]),
51 r"\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]),
52 r"\Z": (AT, AT_END_STRING), # end of string
53 }
54
55 FLAGS = {
56 # standard flags
57 "i": SRE_FLAG_IGNORECASE,
58 "L": SRE_FLAG_LOCALE,
59 "m": SRE_FLAG_MULTILINE,
60 "s": SRE_FLAG_DOTALL,
61 "x": SRE_FLAG_VERBOSE,
62 # extensions
63 "a": SRE_FLAG_ASCII,
64 "t": SRE_FLAG_TEMPLATE,
65 "u": SRE_FLAG_UNICODE,
66 }
67
68 TYPE_FLAGS = SRE_FLAG_ASCII | SRE_FLAG_LOCALE | SRE_FLAG_UNICODE
69 GLOBAL_FLAGS = SRE_FLAG_DEBUG | SRE_FLAG_TEMPLATE
70
71 class ESC[4;38;5;81mState:
72 # keeps track of state for parsing
73 def __init__(self):
74 self.flags = 0
75 self.groupdict = {}
76 self.groupwidths = [None] # group 0
77 self.lookbehindgroups = None
78 self.grouprefpos = {}
79 @property
80 def groups(self):
81 return len(self.groupwidths)
82 def opengroup(self, name=None):
83 gid = self.groups
84 self.groupwidths.append(None)
85 if self.groups > MAXGROUPS:
86 raise error("too many groups")
87 if name is not None:
88 ogid = self.groupdict.get(name, None)
89 if ogid is not None:
90 raise error("redefinition of group name %r as group %d; "
91 "was group %d" % (name, gid, ogid))
92 self.groupdict[name] = gid
93 return gid
94 def closegroup(self, gid, p):
95 self.groupwidths[gid] = p.getwidth()
96 def checkgroup(self, gid):
97 return gid < self.groups and self.groupwidths[gid] is not None
98
99 def checklookbehindgroup(self, gid, source):
100 if self.lookbehindgroups is not None:
101 if not self.checkgroup(gid):
102 raise source.error('cannot refer to an open group')
103 if gid >= self.lookbehindgroups:
104 raise source.error('cannot refer to group defined in the same '
105 'lookbehind subpattern')
106
107 class ESC[4;38;5;81mSubPattern:
108 # a subpattern, in intermediate form
109 def __init__(self, state, data=None):
110 self.state = state
111 if data is None:
112 data = []
113 self.data = data
114 self.width = None
115
116 def dump(self, level=0):
117 seqtypes = (tuple, list)
118 for op, av in self.data:
119 print(level*" " + str(op), end='')
120 if op is IN:
121 # member sublanguage
122 print()
123 for op, a in av:
124 print((level+1)*" " + str(op), a)
125 elif op is BRANCH:
126 print()
127 for i, a in enumerate(av[1]):
128 if i:
129 print(level*" " + "OR")
130 a.dump(level+1)
131 elif op is GROUPREF_EXISTS:
132 condgroup, item_yes, item_no = av
133 print('', condgroup)
134 item_yes.dump(level+1)
135 if item_no:
136 print(level*" " + "ELSE")
137 item_no.dump(level+1)
138 elif isinstance(av, SubPattern):
139 print()
140 av.dump(level+1)
141 elif isinstance(av, seqtypes):
142 nl = False
143 for a in av:
144 if isinstance(a, SubPattern):
145 if not nl:
146 print()
147 a.dump(level+1)
148 nl = True
149 else:
150 if not nl:
151 print(' ', end='')
152 print(a, end='')
153 nl = False
154 if not nl:
155 print()
156 else:
157 print('', av)
158 def __repr__(self):
159 return repr(self.data)
160 def __len__(self):
161 return len(self.data)
162 def __delitem__(self, index):
163 del self.data[index]
164 def __getitem__(self, index):
165 if isinstance(index, slice):
166 return SubPattern(self.state, self.data[index])
167 return self.data[index]
168 def __setitem__(self, index, code):
169 self.data[index] = code
170 def insert(self, index, code):
171 self.data.insert(index, code)
172 def append(self, code):
173 self.data.append(code)
174 def getwidth(self):
175 # determine the width (min, max) for this subpattern
176 if self.width is not None:
177 return self.width
178 lo = hi = 0
179 for op, av in self.data:
180 if op is BRANCH:
181 i = MAXREPEAT - 1
182 j = 0
183 for av in av[1]:
184 l, h = av.getwidth()
185 i = min(i, l)
186 j = max(j, h)
187 lo = lo + i
188 hi = hi + j
189 elif op is ATOMIC_GROUP:
190 i, j = av.getwidth()
191 lo = lo + i
192 hi = hi + j
193 elif op is SUBPATTERN:
194 i, j = av[-1].getwidth()
195 lo = lo + i
196 hi = hi + j
197 elif op in _REPEATCODES:
198 i, j = av[2].getwidth()
199 lo = lo + i * av[0]
200 hi = hi + j * av[1]
201 elif op in _UNITCODES:
202 lo = lo + 1
203 hi = hi + 1
204 elif op is GROUPREF:
205 i, j = self.state.groupwidths[av]
206 lo = lo + i
207 hi = hi + j
208 elif op is GROUPREF_EXISTS:
209 i, j = av[1].getwidth()
210 if av[2] is not None:
211 l, h = av[2].getwidth()
212 i = min(i, l)
213 j = max(j, h)
214 else:
215 i = 0
216 lo = lo + i
217 hi = hi + j
218 elif op is SUCCESS:
219 break
220 self.width = min(lo, MAXREPEAT - 1), min(hi, MAXREPEAT)
221 return self.width
222
223 class ESC[4;38;5;81mTokenizer:
224 def __init__(self, string):
225 self.istext = isinstance(string, str)
226 self.string = string
227 if not self.istext:
228 string = str(string, 'latin1')
229 self.decoded_string = string
230 self.index = 0
231 self.next = None
232 self.__next()
233 def __next(self):
234 index = self.index
235 try:
236 char = self.decoded_string[index]
237 except IndexError:
238 self.next = None
239 return
240 if char == "\\":
241 index += 1
242 try:
243 char += self.decoded_string[index]
244 except IndexError:
245 raise error("bad escape (end of pattern)",
246 self.string, len(self.string) - 1) from None
247 self.index = index + 1
248 self.next = char
249 def match(self, char):
250 if char == self.next:
251 self.__next()
252 return True
253 return False
254 def get(self):
255 this = self.next
256 self.__next()
257 return this
258 def getwhile(self, n, charset):
259 result = ''
260 for _ in range(n):
261 c = self.next
262 if c not in charset:
263 break
264 result += c
265 self.__next()
266 return result
267 def getuntil(self, terminator, name):
268 result = ''
269 while True:
270 c = self.next
271 self.__next()
272 if c is None:
273 if not result:
274 raise self.error("missing " + name)
275 raise self.error("missing %s, unterminated name" % terminator,
276 len(result))
277 if c == terminator:
278 if not result:
279 raise self.error("missing " + name, 1)
280 break
281 result += c
282 return result
283 @property
284 def pos(self):
285 return self.index - len(self.next or '')
286 def tell(self):
287 return self.index - len(self.next or '')
288 def seek(self, index):
289 self.index = index
290 self.__next()
291
292 def error(self, msg, offset=0):
293 if not self.istext:
294 msg = msg.encode('ascii', 'backslashreplace').decode('ascii')
295 return error(msg, self.string, self.tell() - offset)
296
297 def checkgroupname(self, name, offset):
298 if not (self.istext or name.isascii()):
299 msg = "bad character in group name %a" % name
300 raise self.error(msg, len(name) + offset)
301 if not name.isidentifier():
302 msg = "bad character in group name %r" % name
303 raise self.error(msg, len(name) + offset)
304
def _class_escape(source, escape):
    """Translate an escape token found inside a character class.

    *escape* is the two-character token just read from *source*; any
    further digits or the ``{name}`` part are consumed from *source*.
    Returns an ``(opcode, argument)`` pair and raises ``source.error``
    for malformed or unknown escapes.
    """
    # Table lookups first: simple control-character escapes, then the
    # category escapes that expand to a set (position assertions such as
    # \b are not accepted from the CATEGORIES table here).
    simple = ESCAPES.get(escape)
    if simple:
        return simple
    category = CATEGORIES.get(escape)
    if category and category[0] is IN:
        return category
    try:
        kind = escape[1:2]
        if kind == "x":
            # hexadecimal escape (exactly two digits)
            escape += source.getwhile(2, HEXDIGITS)
            if len(escape) != 4:
                raise source.error("incomplete escape %s" % escape, len(escape))
            return LITERAL, int(escape[2:], 16)
        elif kind == "u" and source.istext:
            # unicode escape (exactly four digits)
            escape += source.getwhile(4, HEXDIGITS)
            if len(escape) != 6:
                raise source.error("incomplete escape %s" % escape, len(escape))
            return LITERAL, int(escape[2:], 16)
        elif kind == "U" and source.istext:
            # unicode escape (exactly eight digits)
            escape += source.getwhile(8, HEXDIGITS)
            if len(escape) != 10:
                raise source.error("incomplete escape %s" % escape, len(escape))
            codepoint = int(escape[2:], 16)
            chr(codepoint)  # raise ValueError for invalid code
            return LITERAL, codepoint
        elif kind == "N" and source.istext:
            import unicodedata
            # named unicode escape e.g. \N{EM DASH}
            if not source.match('{'):
                raise source.error("missing {")
            charname = source.getuntil('}', 'character name')
            try:
                codepoint = ord(unicodedata.lookup(charname))
            except (KeyError, TypeError):
                raise source.error("undefined character name %r" % charname,
                                   len(charname) + len(r'\N{}')) from None
            return LITERAL, codepoint
        elif kind in OCTDIGITS:
            # octal escape (up to three digits)
            escape += source.getwhile(2, OCTDIGITS)
            codepoint = int(escape[1:], 8)
            if codepoint > 0o377:
                raise source.error('octal escape value %s outside of '
                                   'range 0-0o377' % escape, len(escape))
            return LITERAL, codepoint
        elif kind in DIGITS:
            # 8 or 9: neither octal nor a group reference inside a class;
            # handled by the generic "bad escape" below.
            raise ValueError
        if len(escape) == 2:
            # Unknown ASCII-letter escapes are errors; any other escaped
            # character stands for itself.
            if kind in ASCIILETTERS:
                raise source.error('bad escape %s' % escape, len(escape))
            return LITERAL, ord(escape[1])
    except ValueError:
        pass
    raise source.error("bad escape %s" % escape, len(escape))
364
def _escape(source, escape, state):
    """Translate an escape token found outside a character class.

    *escape* is the two-character token just read from *source*; any
    further digits or the ``{name}`` part are consumed from *source*.
    *state* is consulted to validate numeric group references.  Returns
    an ``(opcode, argument)`` pair and raises ``source.error`` for
    malformed or unknown escapes.
    """
    # Table lookups first; here categories take precedence, so \b is a
    # word-boundary assertion rather than a backspace.
    category = CATEGORIES.get(escape)
    if category:
        return category
    simple = ESCAPES.get(escape)
    if simple:
        return simple
    try:
        kind = escape[1:2]
        if kind == "x":
            # hexadecimal escape
            escape += source.getwhile(2, HEXDIGITS)
            if len(escape) != 4:
                raise source.error("incomplete escape %s" % escape, len(escape))
            return LITERAL, int(escape[2:], 16)
        elif kind == "u" and source.istext:
            # unicode escape (exactly four digits)
            escape += source.getwhile(4, HEXDIGITS)
            if len(escape) != 6:
                raise source.error("incomplete escape %s" % escape, len(escape))
            return LITERAL, int(escape[2:], 16)
        elif kind == "U" and source.istext:
            # unicode escape (exactly eight digits)
            escape += source.getwhile(8, HEXDIGITS)
            if len(escape) != 10:
                raise source.error("incomplete escape %s" % escape, len(escape))
            codepoint = int(escape[2:], 16)
            chr(codepoint)  # raise ValueError for invalid code
            return LITERAL, codepoint
        elif kind == "N" and source.istext:
            import unicodedata
            # named unicode escape e.g. \N{EM DASH}
            if not source.match('{'):
                raise source.error("missing {")
            charname = source.getuntil('}', 'character name')
            try:
                codepoint = ord(unicodedata.lookup(charname))
            except (KeyError, TypeError):
                raise source.error("undefined character name %r" % charname,
                                   len(charname) + len(r'\N{}')) from None
            return LITERAL, codepoint
        elif kind == "0":
            # octal escape
            escape += source.getwhile(2, OCTDIGITS)
            return LITERAL, int(escape[1:], 8)
        elif kind in DIGITS:
            # octal escape *or* decimal group reference (sigh)
            if source.next in DIGITS:
                escape += source.get()
                all_octal = (escape[1] in OCTDIGITS and
                             escape[2] in OCTDIGITS and
                             source.next in OCTDIGITS)
                if all_octal:
                    # got three octal digits; this is an octal escape
                    escape += source.get()
                    codepoint = int(escape[1:], 8)
                    if codepoint > 0o377:
                        raise source.error('octal escape value %s outside of '
                                           'range 0-0o377' % escape,
                                           len(escape))
                    return LITERAL, codepoint
            # not an octal escape, so this is a group reference
            group = int(escape[1:])
            if group >= state.groups:
                raise source.error("invalid group reference %d" % group,
                                   len(escape) - 1)
            if not state.checkgroup(group):
                raise source.error("cannot refer to an open group",
                                   len(escape))
            state.checklookbehindgroup(group, source)
            return GROUPREF, group
        if len(escape) == 2:
            # Unknown ASCII-letter escapes are errors; any other escaped
            # character stands for itself.
            if kind in ASCIILETTERS:
                raise source.error("bad escape %s" % escape, len(escape))
            return LITERAL, ord(escape[1])
    except ValueError:
        pass
    raise source.error("bad escape %s" % escape, len(escape))
441
442 def _uniq(items):
443 return list(dict.fromkeys(items))
444
def _parse_sub(source, state, verbose, nested):
    """Parse an alternation ``a|b|c`` and return it as a SubPattern.

    Alternatives are read with _parse() until the upcoming token is no
    longer ``|``.  A lone alternative is returned unchanged.  Otherwise
    items that every alternative starts with are hoisted in front of the
    branch, and a branch made only of single literals or non-negated
    sets is collapsed into one IN set.
    """
    items = []
    while True:
        # Only the very first item of the top-level pattern is parsed
        # with first=True.
        items.append(_parse(source, state, verbose, nested + 1,
                            not nested and not items))
        if not source.match("|"):
            break
        if not nested:
            # Re-read the verbose bit from the state after each top-level
            # alternative, since parsing may have updated state.flags.
            verbose = state.flags & SRE_FLAG_VERBOSE

    if len(items) == 1:
        return items[0]

    subpattern = SubPattern(state)

    # check if all items share a common prefix
    while True:
        prefix = None
        for item in items:
            if not item:
                break
            if prefix is None:
                prefix = item[0]
            elif item[0] != prefix:
                break
        else:
            # all subitems start with a common "prefix".
            # move it out of the branch
            for item in items:
                del item[0]
            subpattern.append(prefix)
            continue  # check next one
        break

    # check if the branch can be replaced by a character set
    members = []
    for item in items:
        if len(item) != 1:
            break
        op, av = item[0]
        if op is LITERAL:
            members.append((op, av))
        elif op is IN and av[0][0] is not NEGATE:
            members.extend(av)
        else:
            break
    else:
        # we can store this as a character set instead of a
        # branch (the compiler may optimize this even more)
        subpattern.append((IN, _uniq(members)))
        return subpattern

    subpattern.append((BRANCH, (None, items)))
    return subpattern
504
def _parse(source, state, verbose, nested, first=False):
    """Parse one alternative (a run of items up to ``|``, ``)`` or the end).

    source  -- Tokenizer positioned at the start of the alternative.
    state   -- State shared across the whole pattern.
    verbose -- truthy when whitespace and ``#`` comments are skipped.
    nested  -- nesting depth; also used to compute warning stacklevels.
    first   -- True only for the first item of the top-level pattern,
               the one place where global inline flags are accepted.

    Returns a SubPattern.  Raises ``source.error`` on syntax errors and
    OverflowError for repeat counts of MAXREPEAT or more.
    """
    # NOTE: this function is intentionally not split into helpers: the
    # warnings below use stacklevel=nested + 6, which depends on the
    # number of frames between here and the caller.
    subpattern = SubPattern(state)

    # precompute constants into local variables
    subpatternappend = subpattern.append
    sourceget = source.get
    sourcematch = source.match
    _len = len
    _ord = ord

    while True:

        this = source.next
        if this is None:
            break  # end of pattern
        if this in "|)":
            break  # end of subpattern
        sourceget()

        if verbose:
            # skip whitespace and comments
            if this in WHITESPACE:
                continue
            if this == "#":
                while True:
                    this = sourceget()
                    if this is None or this == "\n":
                        break
                continue

        if this[0] == "\\":
            code = _escape(source, this, state)
            subpatternappend(code)

        elif this not in SPECIAL_CHARS:
            subpatternappend((LITERAL, _ord(this)))

        elif this == "[":
            here = source.tell() - 1
            # character set
            members = []
            membersappend = members.append
##          if sourcematch(":"):
##              pass # handle character classes
            if source.next == '[':
                import warnings
                warnings.warn(
                    'Possible nested set at position %d' % source.tell(),
                    FutureWarning, stacklevel=nested + 6
                )
            negate = sourcematch("^")
            # check remaining characters
            while True:
                this = sourceget()
                if this is None:
                    raise source.error("unterminated character set",
                                       source.tell() - here)
                if this == "]" and members:
                    # A "]" in first position is a literal, not the end.
                    break
                elif this[0] == "\\":
                    code1 = _class_escape(source, this)
                else:
                    if members and this in '-&~|' and source.next == this:
                        import warnings
                        warnings.warn(
                            'Possible set %s at position %d' % (
                                'difference' if this == '-' else
                                'intersection' if this == '&' else
                                'symmetric difference' if this == '~' else
                                'union',
                                source.tell() - 1),
                            FutureWarning, stacklevel=nested + 6
                        )
                    code1 = LITERAL, _ord(this)
                if sourcematch("-"):
                    # potential range
                    that = sourceget()
                    if that is None:
                        raise source.error("unterminated character set",
                                           source.tell() - here)
                    if that == "]":
                        # Trailing "-" before "]" is a literal hyphen.
                        if code1[0] is IN:
                            code1 = code1[1][0]
                        membersappend(code1)
                        membersappend((LITERAL, _ord("-")))
                        break
                    if that[0] == "\\":
                        code2 = _class_escape(source, that)
                    else:
                        if that == '-':
                            import warnings
                            warnings.warn(
                                'Possible set difference at position %d' % (
                                    source.tell() - 2),
                                FutureWarning, stacklevel=nested + 6
                            )
                        code2 = LITERAL, _ord(that)
                    if code1[0] != LITERAL or code2[0] != LITERAL:
                        msg = "bad character range %s-%s" % (this, that)
                        raise source.error(msg, len(this) + 1 + len(that))
                    lo = code1[1]
                    hi = code2[1]
                    if hi < lo:
                        msg = "bad character range %s-%s" % (this, that)
                        raise source.error(msg, len(this) + 1 + len(that))
                    membersappend((RANGE, (lo, hi)))
                else:
                    # Category escapes arrive wrapped in an IN set; unwrap
                    # them so the member list stays flat.
                    if code1[0] is IN:
                        code1 = code1[1][0]
                    membersappend(code1)

            members = _uniq(members)
            # XXX: <fl> should move set optimization to compiler!
            if _len(members) == 1 and members[0][0] is LITERAL:
                # optimization
                if negate:
                    subpatternappend((NOT_LITERAL, members[0][1]))
                else:
                    subpatternappend(members[0])
            else:
                if negate:
                    members.insert(0, (NEGATE, None))
                # charmap optimization can't be added here because
                # global flags still are not known
                subpatternappend((IN, members))

        elif this in REPEAT_CHARS:
            # repeat previous item
            here = source.tell()
            if this == "?":
                min_count, max_count = 0, 1
            elif this == "*":
                min_count, max_count = 0, MAXREPEAT
            elif this == "+":
                min_count, max_count = 1, MAXREPEAT
            elif this == "{":
                if source.next == "}":
                    subpatternappend((LITERAL, _ord(this)))
                    continue

                min_count, max_count = 0, MAXREPEAT
                lo = hi = ""
                while source.next in DIGITS:
                    lo += sourceget()
                if sourcematch(","):
                    while source.next in DIGITS:
                        hi += sourceget()
                else:
                    hi = lo
                if not sourcematch("}"):
                    # Not a well-formed {m,n}: treat "{" as a literal and
                    # rewind to just after it.
                    subpatternappend((LITERAL, _ord(this)))
                    source.seek(here)
                    continue

                if lo:
                    min_count = int(lo)
                    if min_count >= MAXREPEAT:
                        raise OverflowError("the repetition number is too large")
                if hi:
                    max_count = int(hi)
                    if max_count >= MAXREPEAT:
                        raise OverflowError("the repetition number is too large")
                    if max_count < min_count:
                        raise source.error("min repeat greater than max repeat",
                                           source.tell() - here)
            else:
                # Unreachable while REPEAT_CHARS is "*+?{"; report the
                # offending token itself.
                raise AssertionError("unsupported quantifier %r" % (this,))
            # figure out which item to repeat
            if subpattern:
                item = subpattern[-1:]
            else:
                item = None
            if not item or item[0][0] is AT:
                raise source.error("nothing to repeat",
                                   source.tell() - here + len(this))
            if item[0][0] in _REPEATCODES:
                raise source.error("multiple repeat",
                                   source.tell() - here + len(this))
            if item[0][0] is SUBPATTERN:
                group, add_flags, del_flags, p = item[0][1]
                if group is None and not add_flags and not del_flags:
                    # Plain non-capturing group: repeat its body directly.
                    item = p
            if sourcematch("?"):
                # Non-Greedy Match
                subpattern[-1] = (MIN_REPEAT, (min_count, max_count, item))
            elif sourcematch("+"):
                # Possessive Match (Always Greedy)
                subpattern[-1] = (POSSESSIVE_REPEAT, (min_count, max_count, item))
            else:
                # Greedy Match
                subpattern[-1] = (MAX_REPEAT, (min_count, max_count, item))

        elif this == ".":
            subpatternappend((ANY, None))

        elif this == "(":
            start = source.tell() - 1
            capture = True
            atomic = False
            name = None
            add_flags = 0
            del_flags = 0
            if sourcematch("?"):
                # options
                char = sourceget()
                if char is None:
                    raise source.error("unexpected end of pattern")
                if char == "P":
                    # python extensions
                    if sourcematch("<"):
                        # named group: skip forward to end of name
                        name = source.getuntil(">", "group name")
                        source.checkgroupname(name, 1)
                    elif sourcematch("="):
                        # named backreference
                        name = source.getuntil(")", "group name")
                        source.checkgroupname(name, 1)
                        gid = state.groupdict.get(name)
                        if gid is None:
                            msg = "unknown group name %r" % name
                            raise source.error(msg, len(name) + 1)
                        if not state.checkgroup(gid):
                            raise source.error("cannot refer to an open group",
                                               len(name) + 1)
                        state.checklookbehindgroup(gid, source)
                        subpatternappend((GROUPREF, gid))
                        continue

                    else:
                        char = sourceget()
                        if char is None:
                            raise source.error("unexpected end of pattern")
                        raise source.error("unknown extension ?P" + char,
                                           len(char) + 2)
                elif char == ":":
                    # non-capturing group
                    capture = False
                elif char == "#":
                    # comment
                    while True:
                        if source.next is None:
                            raise source.error("missing ), unterminated comment",
                                               source.tell() - start)
                        if sourceget() == ")":
                            break
                    continue

                elif char in "=!<":
                    # lookahead assertions
                    direction = 1
                    if char == "<":
                        char = sourceget()
                        if char is None:
                            raise source.error("unexpected end of pattern")
                        if char not in "=!":
                            raise source.error("unknown extension ?<" + char,
                                               len(char) + 2)
                        direction = -1  # lookbehind
                        # Only the outermost lookbehind records (and later
                        # clears) the group count at entry.
                        lookbehindgroups = state.lookbehindgroups
                        if lookbehindgroups is None:
                            state.lookbehindgroups = state.groups
                    p = _parse_sub(source, state, verbose, nested + 1)
                    if direction < 0:
                        if lookbehindgroups is None:
                            state.lookbehindgroups = None
                    if not sourcematch(")"):
                        raise source.error("missing ), unterminated subpattern",
                                           source.tell() - start)
                    if char == "=":
                        subpatternappend((ASSERT, (direction, p)))
                    else:
                        subpatternappend((ASSERT_NOT, (direction, p)))
                    continue

                elif char == "(":
                    # conditional backreference group
                    condname = source.getuntil(")", "group name")
                    if not (condname.isdecimal() and condname.isascii()):
                        source.checkgroupname(condname, 1)
                        condgroup = state.groupdict.get(condname)
                        if condgroup is None:
                            msg = "unknown group name %r" % condname
                            raise source.error(msg, len(condname) + 1)
                    else:
                        condgroup = int(condname)
                        if not condgroup:
                            raise source.error("bad group number",
                                               len(condname) + 1)
                        if condgroup >= MAXGROUPS:
                            msg = "invalid group reference %d" % condgroup
                            raise source.error(msg, len(condname) + 1)
                        # Remember where the group was first referenced by
                        # number; parse() validates it once all groups
                        # are known.
                        if condgroup not in state.grouprefpos:
                            state.grouprefpos[condgroup] = (
                                source.tell() - len(condname) - 1
                            )
                    state.checklookbehindgroup(condgroup, source)
                    item_yes = _parse(source, state, verbose, nested + 1)
                    if source.match("|"):
                        item_no = _parse(source, state, verbose, nested + 1)
                        if source.next == "|":
                            raise source.error("conditional backref with more than two branches")
                    else:
                        item_no = None
                    if not source.match(")"):
                        raise source.error("missing ), unterminated subpattern",
                                           source.tell() - start)
                    subpatternappend((GROUPREF_EXISTS, (condgroup, item_yes, item_no)))
                    continue

                elif char == ">":
                    # non-capturing, atomic group
                    capture = False
                    atomic = True
                elif char in FLAGS or char == "-":
                    # flags
                    flags = _parse_flags(source, state, char)
                    if flags is None:  # global flags
                        if not first or subpattern:
                            raise source.error('global flags not at the start '
                                               'of the expression',
                                               source.tell() - start)
                        verbose = state.flags & SRE_FLAG_VERBOSE
                        continue

                    add_flags, del_flags = flags
                    capture = False
                else:
                    raise source.error("unknown extension ?" + char,
                                       len(char) + 1)

            # parse group contents
            if capture:
                try:
                    group = state.opengroup(name)
                except error as err:
                    # Named group: point at the start of the name.  Unnamed
                    # group (name is None): point at the opening "(".
                    if name is not None:
                        offset = len(name) + 1
                    else:
                        offset = source.tell() - start
                    raise source.error(err.msg, offset) from None
            else:
                group = None
            sub_verbose = ((verbose or (add_flags & SRE_FLAG_VERBOSE)) and
                           not (del_flags & SRE_FLAG_VERBOSE))
            p = _parse_sub(source, state, sub_verbose, nested + 1)
            if not source.match(")"):
                raise source.error("missing ), unterminated subpattern",
                                   source.tell() - start)
            if group is not None:
                state.closegroup(group, p)
            if atomic:
                assert group is None
                subpatternappend((ATOMIC_GROUP, p))
            else:
                subpatternappend((SUBPATTERN, (group, add_flags, del_flags, p)))

        elif this == "^":
            subpatternappend((AT, AT_BEGINNING))

        elif this == "$":
            subpatternappend((AT, AT_END))

        else:
            # Unreachable while every SPECIAL_CHARS member is handled
            # above; report the offending token itself.
            raise AssertionError("unsupported special character %r" % (this,))

    # unpack non-capturing groups
    # (walk backwards so splicing does not shift indexes still to visit)
    for i in range(len(subpattern))[::-1]:
        op, av = subpattern[i]
        if op is SUBPATTERN:
            group, add_flags, del_flags, p = av
            if group is None and not add_flags and not del_flags:
                subpattern[i: i+1] = p

    return subpattern
885
def _parse_flags(source, state, char):
    """Parse the inline-flag part of ``(?flags)`` or ``(?flags-flags:``.

    *char* is the first token after ``(?``: a flag letter or ``-``.  For
    the global form, ended by ``)``, the flags are OR-ed into
    ``state.flags`` and None is returned.  For the scoped form, ended by
    ``:``, a pair ``(flags to add, flags to remove)`` is returned.
    Raises ``source.error`` for malformed or inconsistent flags.
    """
    turn_on = 0
    turn_off = 0

    if char != "-":
        # Flags to switch on, up to ")", "-" or ":".
        while True:
            flag = FLAGS[char]
            if char == 'L' and source.istext:
                msg = "bad inline flags: cannot use 'L' flag with a str pattern"
                raise source.error(msg)
            elif char == 'u' and not source.istext:
                msg = "bad inline flags: cannot use 'u' flag with a bytes pattern"
                raise source.error(msg)
            turn_on |= flag
            if (flag & TYPE_FLAGS) and (turn_on & TYPE_FLAGS) != flag:
                msg = "bad inline flags: flags 'a', 'u' and 'L' are incompatible"
                raise source.error(msg)
            char = source.get()
            if char is None:
                raise source.error("missing -, : or )")
            if char in ")-:":
                break
            if char not in FLAGS:
                msg = "unknown flag" if char.isalpha() else "missing -, : or )"
                raise source.error(msg, len(char))

    if char == ")":
        # Global form: applies to the whole pattern.
        state.flags |= turn_on
        return None

    if turn_on & GLOBAL_FLAGS:
        raise source.error("bad inline flags: cannot turn on global flag", 1)

    if char == "-":
        # Flags to switch off, up to ":".
        char = source.get()
        if char is None:
            raise source.error("missing flag")
        if char not in FLAGS:
            msg = "unknown flag" if char.isalpha() else "missing flag"
            raise source.error(msg, len(char))
        while True:
            flag = FLAGS[char]
            if flag & TYPE_FLAGS:
                msg = "bad inline flags: cannot turn off flags 'a', 'u' and 'L'"
                raise source.error(msg)
            turn_off |= flag
            char = source.get()
            if char is None:
                raise source.error("missing :")
            if char == ":":
                break
            if char not in FLAGS:
                msg = "unknown flag" if char.isalpha() else "missing :"
                raise source.error(msg, len(char))

    assert char == ":"
    if turn_off & GLOBAL_FLAGS:
        raise source.error("bad inline flags: cannot turn off global flag", 1)
    if turn_on & turn_off:
        raise source.error("bad inline flags: flag turned on and off", 1)
    return turn_on, turn_off
945
946 def fix_flags(src, flags):
947 # Check and fix flags according to the type of pattern (str or bytes)
948 if isinstance(src, str):
949 if flags & SRE_FLAG_LOCALE:
950 raise ValueError("cannot use LOCALE flag with a str pattern")
951 if not flags & SRE_FLAG_ASCII:
952 flags |= SRE_FLAG_UNICODE
953 elif flags & SRE_FLAG_UNICODE:
954 raise ValueError("ASCII and UNICODE flags are incompatible")
955 else:
956 if flags & SRE_FLAG_UNICODE:
957 raise ValueError("cannot use UNICODE flag with a bytes pattern")
958 if flags & SRE_FLAG_LOCALE and flags & SRE_FLAG_ASCII:
959 raise ValueError("ASCII and LOCALE flags are incompatible")
960 return flags
961
def parse(str, flags=0, state=None):
    """Parse the 're' pattern *str* into a SubPattern of (opcode, argument) pairs.

    A fresh State is created unless *state* is supplied.  The state's
    ``flags`` and ``str`` attributes are set from the arguments, and
    after parsing the flags are normalised with fix_flags().  Raises
    ``error`` for an unbalanced ``)`` or for a conditional that refers
    to a group number the pattern does not define.  With SRE_FLAG_DEBUG
    set in *flags*, the parsed pattern is dumped before returning.
    """
    tokens = Tokenizer(str)

    if state is None:
        state = State()
    state.flags = flags
    state.str = str

    parsed = _parse_sub(tokens, state, flags & SRE_FLAG_VERBOSE, 0)
    final_state = parsed.state
    final_state.flags = fix_flags(str, final_state.flags)

    # _parse_sub stops at end of input or at a ")" nobody opened.
    if tokens.next is not None:
        assert tokens.next == ")"
        raise tokens.error("unbalanced parenthesis")

    # Numeric conditional references are only checkable now that the
    # total group count is known.
    for gid, position in final_state.grouprefpos.items():
        if gid >= final_state.groups:
            raise error("invalid group reference %d" % gid, str, position)

    if flags & SRE_FLAG_DEBUG:
        parsed.dump()

    return parsed
988
def parse_template(source, pattern):
    """Parse the 're' replacement string *source* into literals and group references.

    *pattern* supplies ``groups`` (the highest valid group number) and
    ``groupindex`` (name -> number).  Returns a list in which integers
    are group references and every other element is literal text: str
    for a str template, bytes otherwise.  A literal element, possibly
    empty, is emitted before each group reference and once more at the
    end.

    Raises the tokenizer's ``error`` for malformed escapes or invalid
    group numbers, and IndexError for an unknown group name.
    """
    s = Tokenizer(source)
    sget = s.get
    result = []
    literal = []
    lappend = literal.append

    def addliteral():
        # Flush the pending literal pieces into the result as one element.
        if s.istext:
            result.append(''.join(literal))
        else:
            # The tokenizer implicitly decodes bytes objects as latin-1, we must
            # therefore re-encode the final representation.
            result.append(''.join(literal).encode('latin-1'))
        literal.clear()

    def addgroup(index, pos):
        # Emit a reference to group *index*; *pos* locates it for errors.
        if index > pattern.groups:
            raise s.error("invalid group reference %d" % index, pos)
        addliteral()
        result.append(index)

    groupindex = pattern.groupindex
    while True:
        this = sget()
        if this is None:
            break  # end of replacement string
        if this[0] == "\\":
            # group
            c = this[1]
            if c == "g":
                if not s.match("<"):
                    raise s.error("missing <")
                name = s.getuntil(">", "group name")
                if not (name.isdecimal() and name.isascii()):
                    s.checkgroupname(name, 1)
                    try:
                        index = groupindex[name]
                    except KeyError:
                        raise IndexError("unknown group name %r" % name) from None
                else:
                    index = int(name)
                    if index >= MAXGROUPS:
                        raise s.error("invalid group reference %d" % index,
                                      len(name) + 1)
                addgroup(index, len(name) + 1)
            elif c == "0":
                # \0 followed by up to two more octal digits
                if s.next in OCTDIGITS:
                    this += sget()
                    if s.next in OCTDIGITS:
                        this += sget()
                lappend(chr(int(this[1:], 8) & 0xff))
            elif c in DIGITS:
                # Three octal digits form an octal escape; anything else
                # is a one- or two-digit group reference.
                isoctal = False
                if s.next in DIGITS:
                    this += sget()
                    if (c in OCTDIGITS and this[2] in OCTDIGITS and
                        s.next in OCTDIGITS):
                        this += sget()
                        isoctal = True
                        c = int(this[1:], 8)
                        if c > 0o377:
                            raise s.error('octal escape value %s outside of '
                                          'range 0-0o377' % this, len(this))
                        lappend(chr(c))
                if not isoctal:
                    addgroup(int(this[1:]), len(this) - 1)
            else:
                try:
                    this = chr(ESCAPES[this][1])
                except KeyError:
                    # Unknown ASCII-letter escapes are errors; any other
                    # escape is kept verbatim, backslash included.
                    if c in ASCIILETTERS:
                        raise s.error('bad escape %s' % this, len(this)) from None
                lappend(this)
        else:
            lappend(this)
    addliteral()
    return result
1073 return result