#
# Secret Labs' Regular Expression Engine
#
# various symbols used by the regular expression engine.
# run this script to update the _sre include files!
#
# Copyright (c) 1998-2001 by Secret Labs AB.  All rights reserved.
#
# See the __init__.py file for information on usage and redistribution.
#
11
"""Internal support module for sre"""

# update when constants are added or removed

MAGIC = 20220615
17
from _sre import MAXREPEAT, MAXGROUPS
19
# SRE standard exception (access as sre.error)
# should this really be here?

class error(Exception):
    """Exception raised for invalid regular expressions.

    Attributes:

        msg: The unformatted error message
        pattern: The regular expression pattern
        pos: The index in the pattern where compilation failed (may be None)
        lineno: The line corresponding to pos (may be None)
        colno: The column corresponding to pos (may be None)
    """

    # Make the class report 're' as its module rather than this file.
    __module__ = 're'

    def __init__(self, msg, pattern=None, pos=None):
        """Store the error details and build the formatted message.

        msg is kept unformatted on the instance; the text passed to
        Exception gains " at position N" when both pattern and pos are
        given, and " (line L, column C)" when the pattern also contains
        a newline.  pattern may be str or bytes.
        """
        self.msg = msg
        self.pattern = pattern
        self.pos = pos
        if pattern is not None and pos is not None:
            msg = '%s at position %d' % (msg, pos)
            # Use a newline of the same type as the pattern (str or bytes).
            if isinstance(pattern, str):
                newline = '\n'
            else:
                newline = b'\n'
            # 1-based line number: newlines before pos, plus one.
            self.lineno = pattern.count(newline, 0, pos) + 1
            # 1-based column: rfind() gives -1 when no newline precedes
            # pos, which makes the column pos + 1 on the first line.
            self.colno = pos - pattern.rfind(newline, 0, pos)
            # Line/column is only appended for multi-line patterns.
            if newline in pattern:
                msg = '%s (line %d, column %d)' % (msg, self.lineno, self.colno)
        else:
            self.lineno = self.colno = None
        super().__init__(msg)
54
55
56 class ESC[4;38;5;81m_NamedIntConstant(ESC[4;38;5;149mint):
57 def __new__(cls, value, name):
58 self = super(_NamedIntConstant, cls).__new__(cls, value)
59 self.name = name
60 return self
61
62 def __repr__(self):
63 return self.name
64
65 __reduce__ = None
66
# Re-wrap the value imported from _sre so that its repr is 'MAXREPEAT'.
MAXREPEAT = _NamedIntConstant(MAXREPEAT, 'MAXREPEAT')
68
def _makecodes(*names):
    """Create consecutive named constants and publish them as module globals.

    Each name is paired with its position in the argument list (starting
    at 0) and wrapped in a _NamedIntConstant.  Every constant is also
    bound in this module's globals under its own name.

    Returns the list of constants in argument order.
    """
    items = [_NamedIntConstant(i, name) for i, name in enumerate(names)]
    globals().update({item.name: item for item in items})
    return items
73
# operators
# NOTE: each opcode's value is its position in this list, so the order
# of the names is significant.
OPCODES = _makecodes(
    # failure=0 success=1 (just because it looks better that way :-)
    'FAILURE', 'SUCCESS',

    'ANY', 'ANY_ALL',
    'ASSERT', 'ASSERT_NOT',
    'AT',
    'BRANCH',
    'CATEGORY',
    'CHARSET', 'BIGCHARSET',
    'GROUPREF', 'GROUPREF_EXISTS',
    'IN',
    'INFO',
    'JUMP',
    'LITERAL',
    'MARK',
    'MAX_UNTIL',
    'MIN_UNTIL',
    'NOT_LITERAL',
    'NEGATE',
    'RANGE',
    'REPEAT',
    'REPEAT_ONE',
    'SUBPATTERN',
    'MIN_REPEAT_ONE',
    'ATOMIC_GROUP',
    'POSSESSIVE_REPEAT',
    'POSSESSIVE_REPEAT_ONE',

    'GROUPREF_IGNORE',
    'IN_IGNORE',
    'LITERAL_IGNORE',
    'NOT_LITERAL_IGNORE',

    'GROUPREF_LOC_IGNORE',
    'IN_LOC_IGNORE',
    'LITERAL_LOC_IGNORE',
    'NOT_LITERAL_LOC_IGNORE',

    'GROUPREF_UNI_IGNORE',
    'IN_UNI_IGNORE',
    'LITERAL_UNI_IGNORE',
    'NOT_LITERAL_UNI_IGNORE',
    'RANGE_UNI_IGNORE',

    # The following opcodes occur only in the parser output,
    # not in the compiled code.
    'MIN_REPEAT', 'MAX_REPEAT',
)
# The two names stay defined as module globals; they are only dropped
# from the OPCODES list itself.
del OPCODES[-2:] # remove MIN_REPEAT and MAX_REPEAT
125
# positions
# NOTE: each code's value is its position in this list.
ATCODES = _makecodes(
    'AT_BEGINNING', 'AT_BEGINNING_LINE', 'AT_BEGINNING_STRING',
    'AT_BOUNDARY', 'AT_NON_BOUNDARY',
    'AT_END', 'AT_END_LINE', 'AT_END_STRING',

    'AT_LOC_BOUNDARY', 'AT_LOC_NON_BOUNDARY',

    'AT_UNI_BOUNDARY', 'AT_UNI_NON_BOUNDARY',
)
136
# categories
# NOTE: each code's value is its position in this list.
CHCODES = _makecodes(
    'CATEGORY_DIGIT', 'CATEGORY_NOT_DIGIT',
    'CATEGORY_SPACE', 'CATEGORY_NOT_SPACE',
    'CATEGORY_WORD', 'CATEGORY_NOT_WORD',
    'CATEGORY_LINEBREAK', 'CATEGORY_NOT_LINEBREAK',

    'CATEGORY_LOC_WORD', 'CATEGORY_LOC_NOT_WORD',

    'CATEGORY_UNI_DIGIT', 'CATEGORY_UNI_NOT_DIGIT',
    'CATEGORY_UNI_SPACE', 'CATEGORY_UNI_NOT_SPACE',
    'CATEGORY_UNI_WORD', 'CATEGORY_UNI_NOT_WORD',
    'CATEGORY_UNI_LINEBREAK', 'CATEGORY_UNI_NOT_LINEBREAK',
)
151
152
# replacement operations for "ignore case" mode
# Each table maps a plain opcode to its case-insensitive counterpart.
OP_IGNORE = {
    LITERAL: LITERAL_IGNORE,
    NOT_LITERAL: NOT_LITERAL_IGNORE,
}

OP_LOCALE_IGNORE = {
    LITERAL: LITERAL_LOC_IGNORE,
    NOT_LITERAL: NOT_LITERAL_LOC_IGNORE,
}

OP_UNICODE_IGNORE = {
    LITERAL: LITERAL_UNI_IGNORE,
    NOT_LITERAL: NOT_LITERAL_UNI_IGNORE,
}
168
# Position-code replacement tables: each maps a generic AT_* code to a
# more specific variant (line-based, locale or unicode).
AT_MULTILINE = {
    AT_BEGINNING: AT_BEGINNING_LINE,
    AT_END: AT_END_LINE
}

AT_LOCALE = {
    AT_BOUNDARY: AT_LOC_BOUNDARY,
    AT_NON_BOUNDARY: AT_LOC_NON_BOUNDARY
}

AT_UNICODE = {
    AT_BOUNDARY: AT_UNI_BOUNDARY,
    AT_NON_BOUNDARY: AT_UNI_NON_BOUNDARY
}
183
# Category replacement tables.  In CH_LOCALE only the WORD categories
# change (to their LOC variants); every other category maps to itself.
CH_LOCALE = {
    CATEGORY_DIGIT: CATEGORY_DIGIT,
    CATEGORY_NOT_DIGIT: CATEGORY_NOT_DIGIT,
    CATEGORY_SPACE: CATEGORY_SPACE,
    CATEGORY_NOT_SPACE: CATEGORY_NOT_SPACE,
    CATEGORY_WORD: CATEGORY_LOC_WORD,
    CATEGORY_NOT_WORD: CATEGORY_LOC_NOT_WORD,
    CATEGORY_LINEBREAK: CATEGORY_LINEBREAK,
    CATEGORY_NOT_LINEBREAK: CATEGORY_NOT_LINEBREAK
}

# In CH_UNICODE every category maps to its UNI variant.
CH_UNICODE = {
    CATEGORY_DIGIT: CATEGORY_UNI_DIGIT,
    CATEGORY_NOT_DIGIT: CATEGORY_UNI_NOT_DIGIT,
    CATEGORY_SPACE: CATEGORY_UNI_SPACE,
    CATEGORY_NOT_SPACE: CATEGORY_UNI_NOT_SPACE,
    CATEGORY_WORD: CATEGORY_UNI_WORD,
    CATEGORY_NOT_WORD: CATEGORY_UNI_NOT_WORD,
    CATEGORY_LINEBREAK: CATEGORY_UNI_LINEBREAK,
    CATEGORY_NOT_LINEBREAK: CATEGORY_UNI_NOT_LINEBREAK
}
205
# flags
# Each flag is a distinct power of two, so flags can be combined with |.
SRE_FLAG_TEMPLATE = 1 # template mode (unknown purpose, deprecated)
SRE_FLAG_IGNORECASE = 2 # case insensitive
SRE_FLAG_LOCALE = 4 # honour system locale
SRE_FLAG_MULTILINE = 8 # treat target as multiline string
SRE_FLAG_DOTALL = 16 # treat target as a single string
SRE_FLAG_UNICODE = 32 # use unicode "locale"
SRE_FLAG_VERBOSE = 64 # ignore whitespace and comments
SRE_FLAG_DEBUG = 128 # debugging
SRE_FLAG_ASCII = 256 # use ascii "locale"
216
# flags for INFO primitive
SRE_INFO_PREFIX = 1 # has prefix
SRE_INFO_LITERAL = 2 # entire pattern is literal (given by prefix)
SRE_INFO_CHARSET = 4 # pattern starts with character from given set