1 #
2 # Secret Labs' Regular Expression Engine
3 #
4 # convert template to internal format
5 #
6 # Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
7 #
8 # See the __init__.py file for information on usage and redistribution.
9 #
10
11 """Internal support module for sre"""
12
13 import _sre
14 from . import _parser
15 from ._constants import *
16 from ._casefix import _EXTRA_CASES
17
# Fail at import time if the MAGIC value exposed by the _sre extension does
# not equal the MAGIC name in scope here (presumably star-imported from
# ._constants -- confirm).
assert _sre.MAGIC == MAGIC, "SRE module mismatch"

# Opcode groups used for membership tests: the first three by _compile(),
# _UNIT_CODES by _simple().
_LITERAL_CODES = {LITERAL, NOT_LITERAL}
_SUCCESS_CODES = {SUCCESS, FAILURE}
_ASSERT_CODES = {ASSERT, ASSERT_NOT}
_UNIT_CODES = _LITERAL_CODES | {ANY, IN}

# Maps each repeat operator to a 3-tuple of opcodes.  _compile() emits
# entry [2] when the repeated body satisfies _simple(); otherwise it emits
# entry [0] before the body and entry [1] after it.
_REPEATING_CODES = {
    MIN_REPEAT: (REPEAT, MIN_UNTIL, MIN_REPEAT_ONE),
    MAX_REPEAT: (REPEAT, MAX_UNTIL, REPEAT_ONE),
    POSSESSIVE_REPEAT: (POSSESSIVE_REPEAT, SUCCESS, POSSESSIVE_REPEAT_ONE),
}
30
def _combine_flags(flags, add_flags, del_flags,
                   TYPE_FLAGS=_parser.TYPE_FLAGS):
    """Return *flags* with a group's flag additions and removals applied.

    When *add_flags* sets any bit contained in TYPE_FLAGS, every TYPE_FLAGS
    bit is cleared from *flags* first.  Afterwards *add_flags* is OR-ed in
    and *del_flags* is masked out.
    """
    base = flags & ~TYPE_FLAGS if add_flags & TYPE_FLAGS else flags
    return (base | add_flags) & ~del_flags
36
def _compile(code, pattern, flags):
    """Append the compiled form of a (sub)pattern to *code*.

    *code* is a list of code words that is extended in place; nothing is
    returned.  *pattern* is an iterable of ``(op, av)`` pairs and *flags*
    is the SRE_FLAG_* bit mask in effect for this (sub)pattern.  Nested
    constructs are compiled by recursive calls.

    Variable-length constructs first emit a placeholder word (``emit(0)``)
    and patch it once the end of the construct is known; the patched value
    is the distance from the placeholder to the current end of *code*.

    Raises ``error`` for a repeat operator when SRE_FLAG_TEMPLATE is set,
    for a look-behind whose minimum and maximum widths differ or whose
    width exceeds MAXCODE, and for an unknown operator.
    """
    emit = code.append
    _len = len
    LITERAL_CODES = _LITERAL_CODES
    REPEATING_CODES = _REPEATING_CODES
    SUCCESS_CODES = _SUCCESS_CODES
    ASSERT_CODES = _ASSERT_CODES
    # Case-handling helpers; they stay None unless IGNORECASE is set
    # without LOCALE.
    iscased = None
    tolower = None
    fixes = None
    if flags & SRE_FLAG_IGNORECASE and not flags & SRE_FLAG_LOCALE:
        if flags & SRE_FLAG_UNICODE:
            iscased = _sre.unicode_iscased
            tolower = _sre.unicode_tolower
            fixes = _EXTRA_CASES
        else:
            iscased = _sre.ascii_iscased
            tolower = _sre.ascii_tolower
    for op, av in pattern:
        if op in LITERAL_CODES:
            if not flags & SRE_FLAG_IGNORECASE:
                emit(op)
                emit(av)
            elif flags & SRE_FLAG_LOCALE:
                emit(OP_LOCALE_IGNORE[op])
                emit(av)
            elif not iscased(av):
                # an uncased character is emitted as a plain literal
                emit(op)
                emit(av)
            else:
                lo = tolower(av)
                if not fixes:  # ascii
                    emit(OP_IGNORE[op])
                    emit(lo)
                elif lo not in fixes:
                    emit(OP_UNICODE_IGNORE[op])
                    emit(lo)
                else:
                    # the character has extra case equivalents: emit a
                    # small set holding lo and every entry of fixes[lo]
                    emit(IN_UNI_IGNORE)
                    skip = _len(code); emit(0)
                    if op is NOT_LITERAL:
                        emit(NEGATE)
                    for k in (lo,) + fixes[lo]:
                        emit(LITERAL)
                        emit(k)
                    emit(FAILURE)
                    code[skip] = _len(code) - skip
        elif op is IN:
            charset, hascased = _optimize_charset(av, iscased, tolower, fixes)
            if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE:
                emit(IN_LOC_IGNORE)
            elif not hascased:
                emit(IN)
            elif not fixes:  # ascii
                emit(IN_IGNORE)
            else:
                emit(IN_UNI_IGNORE)
            skip = _len(code); emit(0)
            _compile_charset(charset, flags, code)
            code[skip] = _len(code) - skip
        elif op is ANY:
            if flags & SRE_FLAG_DOTALL:
                emit(ANY_ALL)
            else:
                emit(ANY)
        elif op in REPEATING_CODES:
            if flags & SRE_FLAG_TEMPLATE:
                raise error("internal: unsupported template operator %r" % (op,))
            if _simple(av[2]):
                # single-unit body: one opcode wraps min, max and the body
                emit(REPEATING_CODES[op][2])
                skip = _len(code); emit(0)
                emit(av[0])
                emit(av[1])
                _compile(code, av[2], flags)
                emit(SUCCESS)
                code[skip] = _len(code) - skip
            else:
                # general body: opening opcode, body, then closing opcode
                emit(REPEATING_CODES[op][0])
                skip = _len(code); emit(0)
                emit(av[0])
                emit(av[1])
                _compile(code, av[2], flags)
                code[skip] = _len(code) - skip
                emit(REPEATING_CODES[op][1])
        elif op is SUBPATTERN:
            group, add_flags, del_flags, p = av
            if group:
                emit(MARK)
                emit((group-1)*2)
            _compile(code, p, _combine_flags(flags, add_flags, del_flags))
            if group:
                emit(MARK)
                emit((group-1)*2+1)
        elif op is ATOMIC_GROUP:
            # Atomic Groups are handled by starting with an Atomic
            # Group op code, then putting in the atomic group pattern
            # and finally a success op code to tell any repeat
            # operations within the Atomic Group to stop eating and
            # pop their stack if they reach it
            emit(ATOMIC_GROUP)
            skip = _len(code); emit(0)
            _compile(code, av, flags)
            emit(SUCCESS)
            code[skip] = _len(code) - skip
        elif op in SUCCESS_CODES:
            emit(op)
        elif op in ASSERT_CODES:
            emit(op)
            skip = _len(code); emit(0)
            if av[0] >= 0:
                emit(0)  # look ahead
            else:
                lo, hi = av[1].getwidth()
                # the width is stored in a single code word, so reject
                # values that cannot be represented before emitting them
                if lo > MAXCODE:
                    raise error("looks too much behind")
                if lo != hi:
                    raise error("look-behind requires fixed-width pattern")
                emit(lo)  # look behind
            _compile(code, av[1], flags)
            emit(SUCCESS)
            code[skip] = _len(code) - skip
        elif op is AT:
            emit(op)
            if flags & SRE_FLAG_MULTILINE:
                av = AT_MULTILINE.get(av, av)
            if flags & SRE_FLAG_LOCALE:
                av = AT_LOCALE.get(av, av)
            elif flags & SRE_FLAG_UNICODE:
                av = AT_UNICODE.get(av, av)
            emit(av)
        elif op is BRANCH:
            emit(op)
            tail = []
            tailappend = tail.append
            for branch in av[1]:
                skip = _len(code); emit(0)
                _compile(code, branch, flags)
                emit(JUMP)
                # remember the JUMP operand; it is patched after the
                # last alternative has been emitted
                tailappend(_len(code)); emit(0)
                code[skip] = _len(code) - skip
            emit(FAILURE)  # end of branch
            for jump in tail:
                code[jump] = _len(code) - jump
        elif op is CATEGORY:
            emit(op)
            if flags & SRE_FLAG_LOCALE:
                av = CH_LOCALE[av]
            elif flags & SRE_FLAG_UNICODE:
                av = CH_UNICODE[av]
            emit(av)
        elif op is GROUPREF:
            if not flags & SRE_FLAG_IGNORECASE:
                emit(op)
            elif flags & SRE_FLAG_LOCALE:
                emit(GROUPREF_LOC_IGNORE)
            elif not fixes:  # ascii
                emit(GROUPREF_IGNORE)
            else:
                emit(GROUPREF_UNI_IGNORE)
            emit(av-1)
        elif op is GROUPREF_EXISTS:
            emit(op)
            emit(av[0]-1)
            skipyes = _len(code); emit(0)
            _compile(code, av[1], flags)
            if av[2]:
                emit(JUMP)
                skipno = _len(code); emit(0)
                code[skipyes] = _len(code) - skipyes + 1
                _compile(code, av[2], flags)
                code[skipno] = _len(code) - skipno
            else:
                code[skipyes] = _len(code) - skipyes + 1
        else:
            raise error("internal: unsupported operand type %r" % (op,))
213
def _compile_charset(charset, flags, code):
    """Append the set subprogram for *charset* to *code*.

    *charset* is an iterable of ``(op, av)`` items.  Each item contributes
    its opcode followed by its operand words; a FAILURE word terminates
    the subprogram.  Raises ``error`` for an unknown set operator.
    """
    append = code.append
    for op, av in charset:
        append(op)
        if op is NEGATE:
            # NEGATE takes no operand
            continue
        if op is LITERAL:
            append(av)
        elif op is RANGE or op is RANGE_UNI_IGNORE:
            append(av[0])
            append(av[1])
        elif op is CHARSET or op is BIGCHARSET:
            # both bitmap forms are already sequences of code words
            code.extend(av)
        elif op is CATEGORY:
            if flags & SRE_FLAG_LOCALE:
                av = CH_LOCALE[av]
            elif flags & SRE_FLAG_UNICODE:
                av = CH_UNICODE[av]
            append(av)
        else:
            raise error("internal: unsupported set operator %r" % (op,))
    append(FAILURE)
240
def _optimize_charset(charset, iscased=None, fixup=None, fixes=None):
    """Return ``(charset, hascased)`` with *charset* in a compact form.

    *charset* is a sequence of ``(op, av)`` set items.  When *fixup* is
    given it is applied to every literal and range member before the
    member is recorded, and *iscased* is consulted to compute *hascased*.
    *fixes*, when given, maps a fixed-up character to further characters
    that are recorded alongside it.

    The result is either the original *charset* (when the literal/range
    form is no shorter and no cased character was seen) or a new list using
    LITERAL/RANGE items, one CHARSET bitmap, or one BIGCHARSET bitmap,
    followed by the items that could not be folded into the character map.
    """
    # internal: optimize character set
    out = []        # NEGATE items, then the optimized representation
    tail = []       # items that are not folded into the character map
    charmap = bytearray(256)    # charmap[c] == 1 when c is in the set
    hascased = False
    for op, av in charset:
        # The loop body runs once, or twice when the map has to be
        # enlarged from 256 to 65536 entries (see the IndexError handler).
        while True:
            try:
                if op is LITERAL:
                    if fixup:
                        lo = fixup(av)
                        charmap[lo] = 1
                        if fixes and lo in fixes:
                            for k in fixes[lo]:
                                charmap[k] = 1
                        if not hascased and iscased(av):
                            hascased = True
                    else:
                        charmap[av] = 1
                elif op is RANGE:
                    r = range(av[0], av[1]+1)
                    if fixup:
                        if fixes:
                            for i in map(fixup, r):
                                charmap[i] = 1
                                if i in fixes:
                                    for k in fixes[i]:
                                        charmap[k] = 1
                        else:
                            for i in map(fixup, r):
                                charmap[i] = 1
                        if not hascased:
                            hascased = any(map(iscased, r))
                    else:
                        for i in r:
                            charmap[i] = 1
                elif op is NEGATE:
                    out.append((op, av))
                else:
                    tail.append((op, av))
            except IndexError:
                if len(charmap) == 256:
                    # character set contains non-UCS1 character codes
                    charmap += b'\0' * 0xff00
                    # retry the same item against the enlarged map
                    continue
                # Character set contains non-BMP character codes.
                # For range, all BMP characters in the range are already
                # processed.
                if fixup:
                    hascased = True
                    # For now, IN_UNI_IGNORE+LITERAL and
                    # IN_UNI_IGNORE+RANGE_UNI_IGNORE work for all non-BMP
                    # characters, because two characters (at least one of
                    # which is not in the BMP) match case-insensitively
                    # if and only if:
                    # 1) c1.lower() == c2.lower()
                    # 2) c1.lower() == c2 or c1.lower().upper() == c2
                    # Also, both c.lower() and c.lower().upper() are single
                    # characters for every non-BMP character.
                    if op is RANGE:
                        op = RANGE_UNI_IGNORE
                tail.append((op, av))
            break

    # compress character map
    # Collect up to two runs of consecutive set entries as (start, stop)
    # pairs with stop exclusive; a third run sets runs to None.
    runs = []
    q = 0
    while True:
        p = charmap.find(1, q)
        if p < 0:
            break
        if len(runs) >= 2:
            runs = None
            break
        q = charmap.find(0, p)
        if q < 0:
            # the run extends to the end of the map
            runs.append((p, len(charmap)))
            break
        runs.append((p, q))
    if runs is not None:
        # use literal/range
        for p, q in runs:
            if q - p == 1:
                out.append((LITERAL, p))
            else:
                out.append((RANGE, (p, q - 1)))
        out += tail
        # if the case was changed or new representation is more compact
        if hascased or len(out) < len(charset):
            return out, hascased
        # else original character set is good enough
        return charset, hascased

    # use bitmap
    if len(charmap) == 256:
        data = _mk_bitmap(charmap)
        out.append((CHARSET, data))
        out += tail
        return out, hascased

    # To represent a big charset, first a bitmap of all characters in the
    # set is constructed. Then, this bitmap is sliced into chunks of 256
    # characters, duplicate chunks are eliminated, and each chunk is
    # given a number. In the compiled expression, the charset is
    # represented by a 32-bit word sequence, consisting of one word for
    # the number of different chunks, a sequence of 256 bytes (64 words)
    # of chunk numbers indexed by their original chunk position, and a
    # sequence of 256-bit chunks (8 words each).

    # Compression is normally good: in a typical charset, large ranges of
    # Unicode will be either completely excluded (e.g. if only cyrillic
    # letters are to be matched), or completely included (e.g. if large
    # subranges of Kanji match). These ranges will be represented by
    # chunks of all one-bits or all zero-bits.

    # Matching can be also done efficiently: the more significant byte of
    # the Unicode character is an index into the chunk number, and the
    # less significant byte is a bit index in the chunk (just like the
    # CHARSET matching).

    charmap = bytes(charmap) # should be hashable
    comps = {}                  # chunk contents -> assigned block number
    mapping = bytearray(256)    # chunk position -> block number
    block = 0                   # number of distinct chunks seen so far
    data = bytearray()          # concatenation of the distinct chunks
    for i in range(0, 65536, 256):
        chunk = charmap[i: i + 256]
        if chunk in comps:
            mapping[i // 256] = comps[chunk]
        else:
            mapping[i // 256] = comps[chunk] = block
            block += 1
            data += chunk
    data = _mk_bitmap(data)
    # prepend the chunk count and the chunk-number table
    data[0:0] = [block] + _bytes_to_codes(mapping)
    out.append((BIGCHARSET, data))
    out += tail
    return out, hascased
380
# Width of one code word in bits (_sre.CODESIZE is a byte count).
_CODEBITS = _sre.CODESIZE * 8
# Largest value that fits in _CODEBITS bits.
MAXCODE = (1 << _CODEBITS) - 1
# Translation table for bytes.translate(): byte 0 becomes b'0' and every
# other byte value becomes b'1'.
_BITS_TRANS = b'0' + b'1' * 255
def _mk_bitmap(bits, _CODEBITS=_CODEBITS, _int=int):
    """Pack a byte-per-flag map into a list of integer code words.

    *bits* is a bytes-like object supporting ``translate``; each zero byte
    is a cleared bit and any other byte a set bit.  Flags are grouped
    _CODEBITS at a time, and within a word the earliest flag is the least
    significant bit.
    """
    # Turn the map into '0'/'1' digits and reverse it so that slices taken
    # from the end read most-significant-digit first.
    digits = bits.translate(_BITS_TRANS)[::-1]
    words = []
    for stop in range(len(digits), 0, -_CODEBITS):
        words.append(_int(digits[stop - _CODEBITS: stop], 2))
    return words
388
389 def _bytes_to_codes(b):
390 # Convert block indices to word array
391 a = memoryview(b).cast('I')
392 assert a.itemsize == _sre.CODESIZE
393 assert len(a) * a.itemsize == len(b)
394 return a.tolist()
395
396 def _simple(p):
397 # check if this subpattern is a "simple" operator
398 if len(p) != 1:
399 return False
400 op, av = p[0]
401 if op is SUBPATTERN:
402 return av[0] is None and _simple(av[-1])
403 return op in _UNIT_CODES
404
405 def _generate_overlap_table(prefix):
406 """
407 Generate an overlap table for the following prefix.
408 An overlap table is a table of the same size as the prefix which
409 informs about the potential self-overlap for each index in the prefix:
410 - if overlap[i] == 0, prefix[i:] can't overlap prefix[0:...]
411 - if overlap[i] == k with 0 < k <= i, prefix[i-k+1:i+1] overlaps with
412 prefix[0:k]
413 """
414 table = [0] * len(prefix)
415 for i in range(1, len(prefix)):
416 idx = table[i - 1]
417 while prefix[i] != prefix[idx]:
418 if idx == 0:
419 table[i] = 0
420 break
421 idx = table[idx - 1]
422 else:
423 table[i] = idx + 1
424 return table
425
def _get_iscased(flags):
    """Return the "is cased" predicate selected by *flags*, or None.

    None is returned when SRE_FLAG_IGNORECASE is not set; otherwise the
    Unicode predicate is chosen when SRE_FLAG_UNICODE is set and the ASCII
    one when it is not.
    """
    if flags & SRE_FLAG_IGNORECASE:
        if flags & SRE_FLAG_UNICODE:
            return _sre.unicode_iscased
        return _sre.ascii_iscased
    return None
433
def _get_literal_prefix(pattern, flags):
    """Collect the literal characters that start *pattern*.

    Returns ``(prefix, prefix_skip, got_all)``: *prefix* is the list of
    leading literal character codes, *prefix_skip* is None or the length
    the prefix had when the first capturing group was reached, and
    *got_all* is True only if every item of the pattern was consumed.
    """
    prefix = []
    prefix_skip = None
    iscased = _get_iscased(flags)
    for op, av in pattern.data:
        if op is LITERAL:
            if iscased and iscased(av):
                # a cased literal cannot be part of an exact prefix
                return prefix, prefix_skip, False
            prefix.append(av)
            continue
        if op is not SUBPATTERN:
            return prefix, prefix_skip, False

        group, add_flags, del_flags, p = av
        sub_flags = _combine_flags(flags, add_flags, del_flags)
        if sub_flags & SRE_FLAG_IGNORECASE and sub_flags & SRE_FLAG_LOCALE:
            return prefix, prefix_skip, False
        sub_prefix, sub_skip, sub_complete = _get_literal_prefix(p, sub_flags)
        if prefix_skip is None:
            if group is not None:
                prefix_skip = len(prefix)
            elif sub_skip is not None:
                prefix_skip = len(prefix) + sub_skip
        prefix.extend(sub_prefix)
        if not sub_complete:
            return prefix, prefix_skip, False
    return prefix, prefix_skip, True
464
465 def _get_charset_prefix(pattern, flags):
466 while True:
467 if not pattern.data:
468 return None
469 op, av = pattern.data[0]
470 if op is not SUBPATTERN:
471 break
472 group, add_flags, del_flags, pattern = av
473 flags = _combine_flags(flags, add_flags, del_flags)
474 if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE:
475 return None
476
477 iscased = _get_iscased(flags)
478 if op is LITERAL:
479 if iscased and iscased(av):
480 return None
481 return [(op, av)]
482 elif op is BRANCH:
483 charset = []
484 charsetappend = charset.append
485 for p in av[1]:
486 if not p:
487 return None
488 op, av = p[0]
489 if op is LITERAL and not (iscased and iscased(av)):
490 charsetappend((op, av))
491 else:
492 return None
493 return charset
494 elif op is IN:
495 charset = av
496 if iscased:
497 for op, av in charset:
498 if op is LITERAL:
499 if iscased(av):
500 return None
501 elif op is RANGE:
502 if av[1] > 0xffff:
503 return None
504 if any(map(iscased, range(av[0], av[1]+1))):
505 return None
506 return charset
507 return None
508
def _compile_info(code, pattern, flags):
    """Append an INFO block for *pattern* to *code*.

    The block holds a flag mask, the minimum and maximum pattern width
    (each limited to MAXCODE) and, when one can be derived, either a
    literal prefix followed by its overlap table or a charset for the
    first character.  No prefix or charset is looked for when the minimum
    width is 0 or when both SRE_FLAG_IGNORECASE and SRE_FLAG_LOCALE are
    set.  *code* is extended in place; nothing is returned.
    """
    lo, hi = pattern.getwidth()
    if hi > MAXCODE:
        hi = MAXCODE
    if lo == 0:
        # fixed-size block without prefix or charset
        code.extend([INFO, 4, 0, lo, hi])
        return
    # Defaults that apply when the prefix search below is skipped.
    # got_all is bound here so that the later read can never hit an
    # unbound local.
    prefix = []
    prefix_skip = 0
    got_all = False
    charset = []
    if not (flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE):
        # look for literal prefix
        prefix, prefix_skip, got_all = _get_literal_prefix(pattern, flags)
        # if no prefix, look for charset prefix
        if not prefix:
            charset = _get_charset_prefix(pattern, flags)
    # add an info block
    emit = code.append
    emit(INFO)
    skip = len(code); emit(0)   # placeholder, patched on the last line
    # literal flag
    mask = 0
    if prefix:
        mask = SRE_INFO_PREFIX
        if prefix_skip is None and got_all:
            mask = mask | SRE_INFO_LITERAL
    elif charset:
        mask = mask | SRE_INFO_CHARSET
    emit(mask)
    # pattern length
    if lo < MAXCODE:
        emit(lo)
    else:
        emit(MAXCODE)
        prefix = prefix[:MAXCODE]
    emit(hi)    # already limited to MAXCODE above
    # add literal prefix
    if prefix:
        emit(len(prefix))  # length
        if prefix_skip is None:
            prefix_skip = len(prefix)
        emit(prefix_skip)  # skip
        code.extend(prefix)
        # generate overlap table
        code.extend(_generate_overlap_table(prefix))
    elif charset:
        charset, hascased = _optimize_charset(charset)
        assert not hascased
        _compile_charset(charset, flags, code)
    code[skip] = len(code) - skip
567
def isstring(obj):
    """Return True if *obj* is a ``str`` or ``bytes`` instance."""
    return isinstance(obj, str) or isinstance(obj, bytes)
570
def _code(p, flags):
    """Return the complete code list for the parsed pattern *p*.

    The flags recorded on ``p.state`` are OR-ed with *flags*; the result
    is the INFO block, the compiled pattern body and a final SUCCESS word.
    """
    combined = p.state.flags | flags
    program = []
    # info block first, then the pattern itself
    _compile_info(program, p, combined)
    _compile(program, p.data, combined)
    program.append(SUCCESS)
    return program
585
586 def _hex_code(code):
587 return '[%s]' % ', '.join('%#0*x' % (_sre.CODESIZE*2+2, x) for x in code)
588
def dis(code):
    """Print a disassembly of the compiled pattern *code* to stdout.

    *code* is a sequence of integer code words.  One line is printed per
    instruction, prefixed with its offset; nested subprograms are printed
    by recursive calls with a larger indentation level.  Raises ValueError
    for an opcode that is not handled below.
    """
    import sys

    labels = set()      # offsets that were printed as a '(to N)' target
    level = 0           # current nesting depth, drives the indentation
    offset_width = len(str(len(code) - 1))

    def dis_(start, end):
        # Disassemble code[start:end].  'start' is rebound to the offset
        # of the instruction being printed, and print_ reads it.
        def print_(*args, to=None):
            # Print one instruction line; 'to' is an optional jump target.
            if to is not None:
                labels.add(to)
                args += ('(to %d)' % (to,),)
            # ':' marks an offset already recorded as a target, '.' any other
            print('%*d%s ' % (offset_width, start, ':' if start in labels else '.'),
                  end=' '*(level-1))
            print(*args)

        def print_2(*args):
            # Print a continuation line that carries no offset column.
            print(end=' '*(offset_width + 2*level))
            print(*args)

        nonlocal level
        level += 1
        i = start
        while i < end:
            start = i
            op = code[i]
            i += 1
            op = OPCODES[op]
            if op in (SUCCESS, FAILURE, ANY, ANY_ALL,
                      MAX_UNTIL, MIN_UNTIL, NEGATE):
                # no operands
                print_(op)
            elif op in (LITERAL, NOT_LITERAL,
                        LITERAL_IGNORE, NOT_LITERAL_IGNORE,
                        LITERAL_UNI_IGNORE, NOT_LITERAL_UNI_IGNORE,
                        LITERAL_LOC_IGNORE, NOT_LITERAL_LOC_IGNORE):
                # one operand: a character code
                arg = code[i]
                i += 1
                print_(op, '%#02x (%r)' % (arg, chr(arg)))
            elif op is AT:
                arg = code[i]
                i += 1
                arg = str(ATCODES[arg])
                assert arg[:3] == 'AT_'
                print_(op, arg[3:])
            elif op is CATEGORY:
                arg = code[i]
                i += 1
                arg = str(CHCODES[arg])
                assert arg[:9] == 'CATEGORY_'
                print_(op, arg[9:])
            elif op in (IN, IN_IGNORE, IN_UNI_IGNORE, IN_LOC_IGNORE):
                # skip word followed by a set subprogram
                skip = code[i]
                print_(op, skip, to=i+skip)
                dis_(i+1, i+skip)
                i += skip
            elif op in (RANGE, RANGE_UNI_IGNORE):
                lo, hi = code[i: i+2]
                i += 2
                print_(op, '%#02x %#02x (%r-%r)' % (lo, hi, chr(lo), chr(hi)))
            elif op is CHARSET:
                # 256-bit bitmap stored as code words
                print_(op, _hex_code(code[i: i + 256//_CODEBITS]))
                i += 256//_CODEBITS
            elif op is BIGCHARSET:
                # chunk count, 256-byte chunk-number table, then the chunks
                arg = code[i]
                i += 1
                mapping = list(b''.join(x.to_bytes(_sre.CODESIZE, sys.byteorder)
                                        for x in code[i: i + 256//_sre.CODESIZE]))
                print_(op, arg, mapping)
                i += 256//_sre.CODESIZE
                level += 1
                for j in range(arg):
                    print_2(_hex_code(code[i: i + 256//_CODEBITS]))
                    i += 256//_CODEBITS
                level -= 1
            elif op in (MARK, GROUPREF, GROUPREF_IGNORE, GROUPREF_UNI_IGNORE,
                        GROUPREF_LOC_IGNORE):
                arg = code[i]
                i += 1
                print_(op, arg)
            elif op is JUMP:
                skip = code[i]
                print_(op, skip, to=i+skip)
                i += 1
            elif op is BRANCH:
                # a chain of alternatives, each preceded by its skip word;
                # a zero skip word ends the chain
                skip = code[i]
                print_(op, skip, to=i+skip)
                while skip:
                    dis_(i+1, i+skip)
                    i += skip
                    start = i
                    skip = code[i]
                    if skip:
                        print_('branch', skip, to=i+skip)
                    else:
                        print_(FAILURE)
                i += 1
            elif op in (REPEAT, REPEAT_ONE, MIN_REPEAT_ONE,
                        POSSESSIVE_REPEAT, POSSESSIVE_REPEAT_ONE):
                # NOTE: min and max shadow the builtins inside dis_
                skip, min, max = code[i: i+3]
                if max == MAXREPEAT:
                    max = 'MAXREPEAT'
                print_(op, skip, min, max, to=i+skip)
                dis_(i+3, i+skip)
                i += skip
            elif op is GROUPREF_EXISTS:
                arg, skip = code[i: i+2]
                print_(op, arg, skip, to=i+skip)
                i += 2
            elif op in (ASSERT, ASSERT_NOT):
                skip, arg = code[i: i+2]
                print_(op, skip, arg, to=i+skip)
                dis_(i+2, i+skip)
                i += skip
            elif op is ATOMIC_GROUP:
                skip = code[i]
                print_(op, skip, to=i+skip)
                dis_(i+1, i+skip)
                i += skip
            elif op is INFO:
                skip, flags, min, max = code[i: i+4]
                if max == MAXREPEAT:
                    max = 'MAXREPEAT'
                print_(op, skip, bin(flags), min, max, to=i+skip)
                start = i+4
                if flags & SRE_INFO_PREFIX:
                    # prefix length, prefix skip, prefix, overlap table
                    prefix_len, prefix_skip = code[i+4: i+6]
                    print_2(' prefix_skip', prefix_skip)
                    start = i + 6
                    prefix = code[start: start+prefix_len]
                    print_2(' prefix',
                            '[%s]' % ', '.join('%#02x' % x for x in prefix),
                            '(%r)' % ''.join(map(chr, prefix)))
                    start += prefix_len
                    print_2(' overlap', code[start: start+prefix_len])
                    start += prefix_len
                if flags & SRE_INFO_CHARSET:
                    # the rest of the block is a set subprogram
                    level += 1
                    print_2('in')
                    dis_(start, i+skip)
                    level -= 1
                i += skip
            else:
                raise ValueError(op)

        level -= 1

    dis_(0, len(code))
736
737
def compile(p, flags=0):
    """Convert a pattern to the internal format.

    *p* is either a pattern string (``str`` or ``bytes``), which is parsed
    first, or an already parsed pattern object.  Returns the result of
    ``_sre.compile``.  When SRE_FLAG_DEBUG is set in *flags*, a blank line
    and the disassembled code are printed.
    """
    if isstring(p):
        pattern = p
        p = _parser.parse(pattern, flags)
    else:
        # an already parsed pattern carries no source string
        pattern = None

    code = _code(p, flags)

    if flags & SRE_FLAG_DEBUG:
        print()
        dis(code)

    # map in either direction
    state = p.state
    groupindex = state.groupdict
    indexgroup = [None] * state.groups
    for name, index in groupindex.items():
        indexgroup[index] = name

    return _sre.compile(
        pattern, flags | state.flags, code,
        state.groups-1,
        groupindex, tuple(indexgroup)
        )