1  #
       2  # Secret Labs' Regular Expression Engine
       3  #
       4  # convert template to internal format
       5  #
       6  # Copyright (c) 1997-2001 by Secret Labs AB.  All rights reserved.
       7  #
       8  # See the __init__.py file for information on usage and redistribution.
       9  #
      10  
      11  """Internal support module for sre"""
      12  
      13  import _sre
      14  from . import _parser
      15  from ._constants import *
      16  from ._casefix import _EXTRA_CASES
      17  
      18  assert _sre.MAGIC == MAGIC, "SRE module mismatch"
      19  
# Opcode groups that _compile() and _simple() test membership against when
# dispatching on a parsed (op, av) item.
# LITERAL / NOT_LITERAL: handled by the literal branch of _compile().
_LITERAL_CODES = {LITERAL, NOT_LITERAL}
# Emitted by _compile() as a bare opcode with no argument.
_SUCCESS_CODES = {SUCCESS, FAILURE}
# Assertions; _compile() emits them with a skip word and a width word.
_ASSERT_CODES = {ASSERT, ASSERT_NOT}
# Operators that _simple() accepts as the sole item of a repeat body.
_UNIT_CODES = _LITERAL_CODES | {ANY, IN}

# For every repeat operator, the opcodes _compile() emits for it:
#   [0] opcode opening the general form,
#   [1] opcode closing the general form,
#   [2] opcode used instead when the repeated body passes _simple().
_REPEATING_CODES = {
    MIN_REPEAT: (REPEAT, MIN_UNTIL, MIN_REPEAT_ONE),
    MAX_REPEAT: (REPEAT, MAX_UNTIL, REPEAT_ONE),
    POSSESSIVE_REPEAT: (POSSESSIVE_REPEAT, SUCCESS, POSSESSIVE_REPEAT_ONE),
}
      30  
      31  def _combine_flags(flags, add_flags, del_flags,
      32                     TYPE_FLAGS=_parser.TYPE_FLAGS):
      33      if add_flags & TYPE_FLAGS:
      34          flags &= ~TYPE_FLAGS
      35      return (flags | add_flags) & ~del_flags
      36  
      37  def _compile(code, pattern, flags):
      38      # internal: compile a (sub)pattern
      39      emit = code.append
      40      _len = len
      41      LITERAL_CODES = _LITERAL_CODES
      42      REPEATING_CODES = _REPEATING_CODES
      43      SUCCESS_CODES = _SUCCESS_CODES
      44      ASSERT_CODES = _ASSERT_CODES
      45      iscased = None
      46      tolower = None
      47      fixes = None
      48      if flags & SRE_FLAG_IGNORECASE and not flags & SRE_FLAG_LOCALE:
      49          if flags & SRE_FLAG_UNICODE:
      50              iscased = _sre.unicode_iscased
      51              tolower = _sre.unicode_tolower
      52              fixes = _EXTRA_CASES
      53          else:
      54              iscased = _sre.ascii_iscased
      55              tolower = _sre.ascii_tolower
      56      for op, av in pattern:
      57          if op in LITERAL_CODES:
      58              if not flags & SRE_FLAG_IGNORECASE:
      59                  emit(op)
      60                  emit(av)
      61              elif flags & SRE_FLAG_LOCALE:
      62                  emit(OP_LOCALE_IGNORE[op])
      63                  emit(av)
      64              elif not iscased(av):
      65                  emit(op)
      66                  emit(av)
      67              else:
      68                  lo = tolower(av)
      69                  if not fixes:  # ascii
      70                      emit(OP_IGNORE[op])
      71                      emit(lo)
      72                  elif lo not in fixes:
      73                      emit(OP_UNICODE_IGNORE[op])
      74                      emit(lo)
      75                  else:
      76                      emit(IN_UNI_IGNORE)
      77                      skip = _len(code); emit(0)
      78                      if op is NOT_LITERAL:
      79                          emit(NEGATE)
      80                      for k in (lo,) + fixes[lo]:
      81                          emit(LITERAL)
      82                          emit(k)
      83                      emit(FAILURE)
      84                      code[skip] = _len(code) - skip
      85          elif op is IN:
      86              charset, hascased = _optimize_charset(av, iscased, tolower, fixes)
      87              if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE:
      88                  emit(IN_LOC_IGNORE)
      89              elif not hascased:
      90                  emit(IN)
      91              elif not fixes:  # ascii
      92                  emit(IN_IGNORE)
      93              else:
      94                  emit(IN_UNI_IGNORE)
      95              skip = _len(code); emit(0)
      96              _compile_charset(charset, flags, code)
      97              code[skip] = _len(code) - skip
      98          elif op is ANY:
      99              if flags & SRE_FLAG_DOTALL:
     100                  emit(ANY_ALL)
     101              else:
     102                  emit(ANY)
     103          elif op in REPEATING_CODES:
     104              if flags & SRE_FLAG_TEMPLATE:
     105                  raise error("internal: unsupported template operator %r" % (op,))
     106              if _simple(av[2]):
     107                  emit(REPEATING_CODES[op][2])
     108                  skip = _len(code); emit(0)
     109                  emit(av[0])
     110                  emit(av[1])
     111                  _compile(code, av[2], flags)
     112                  emit(SUCCESS)
     113                  code[skip] = _len(code) - skip
     114              else:
     115                  emit(REPEATING_CODES[op][0])
     116                  skip = _len(code); emit(0)
     117                  emit(av[0])
     118                  emit(av[1])
     119                  _compile(code, av[2], flags)
     120                  code[skip] = _len(code) - skip
     121                  emit(REPEATING_CODES[op][1])
     122          elif op is SUBPATTERN:
     123              group, add_flags, del_flags, p = av
     124              if group:
     125                  emit(MARK)
     126                  emit((group-1)*2)
     127              # _compile_info(code, p, _combine_flags(flags, add_flags, del_flags))
     128              _compile(code, p, _combine_flags(flags, add_flags, del_flags))
     129              if group:
     130                  emit(MARK)
     131                  emit((group-1)*2+1)
     132          elif op is ATOMIC_GROUP:
     133              # Atomic Groups are handled by starting with an Atomic
     134              # Group op code, then putting in the atomic group pattern
     135              # and finally a success op code to tell any repeat
     136              # operations within the Atomic Group to stop eating and
     137              # pop their stack if they reach it
     138              emit(ATOMIC_GROUP)
     139              skip = _len(code); emit(0)
     140              _compile(code, av, flags)
     141              emit(SUCCESS)
     142              code[skip] = _len(code) - skip
     143          elif op in SUCCESS_CODES:
     144              emit(op)
     145          elif op in ASSERT_CODES:
     146              emit(op)
     147              skip = _len(code); emit(0)
     148              if av[0] >= 0:
     149                  emit(0) # look ahead
     150              else:
     151                  lo, hi = av[1].getwidth()
     152                  if lo != hi:
     153                      raise error("look-behind requires fixed-width pattern")
     154                  emit(lo) # look behind
     155              _compile(code, av[1], flags)
     156              emit(SUCCESS)
     157              code[skip] = _len(code) - skip
     158          elif op is AT:
     159              emit(op)
     160              if flags & SRE_FLAG_MULTILINE:
     161                  av = AT_MULTILINE.get(av, av)
     162              if flags & SRE_FLAG_LOCALE:
     163                  av = AT_LOCALE.get(av, av)
     164              elif flags & SRE_FLAG_UNICODE:
     165                  av = AT_UNICODE.get(av, av)
     166              emit(av)
     167          elif op is BRANCH:
     168              emit(op)
     169              tail = []
     170              tailappend = tail.append
     171              for av in av[1]:
     172                  skip = _len(code); emit(0)
     173                  # _compile_info(code, av, flags)
     174                  _compile(code, av, flags)
     175                  emit(JUMP)
     176                  tailappend(_len(code)); emit(0)
     177                  code[skip] = _len(code) - skip
     178              emit(FAILURE) # end of branch
     179              for tail in tail:
     180                  code[tail] = _len(code) - tail
     181          elif op is CATEGORY:
     182              emit(op)
     183              if flags & SRE_FLAG_LOCALE:
     184                  av = CH_LOCALE[av]
     185              elif flags & SRE_FLAG_UNICODE:
     186                  av = CH_UNICODE[av]
     187              emit(av)
     188          elif op is GROUPREF:
     189              if not flags & SRE_FLAG_IGNORECASE:
     190                  emit(op)
     191              elif flags & SRE_FLAG_LOCALE:
     192                  emit(GROUPREF_LOC_IGNORE)
     193              elif not fixes:  # ascii
     194                  emit(GROUPREF_IGNORE)
     195              else:
     196                  emit(GROUPREF_UNI_IGNORE)
     197              emit(av-1)
     198          elif op is GROUPREF_EXISTS:
     199              emit(op)
     200              emit(av[0]-1)
     201              skipyes = _len(code); emit(0)
     202              _compile(code, av[1], flags)
     203              if av[2]:
     204                  emit(JUMP)
     205                  skipno = _len(code); emit(0)
     206                  code[skipyes] = _len(code) - skipyes + 1
     207                  _compile(code, av[2], flags)
     208                  code[skipno] = _len(code) - skipno
     209              else:
     210                  code[skipyes] = _len(code) - skipyes + 1
     211          else:
     212              raise error("internal: unsupported operand type %r" % (op,))
     213  
     214  def _compile_charset(charset, flags, code):
     215      # compile charset subprogram
     216      emit = code.append
     217      for op, av in charset:
     218          emit(op)
     219          if op is NEGATE:
     220              pass
     221          elif op is LITERAL:
     222              emit(av)
     223          elif op is RANGE or op is RANGE_UNI_IGNORE:
     224              emit(av[0])
     225              emit(av[1])
     226          elif op is CHARSET:
     227              code.extend(av)
     228          elif op is BIGCHARSET:
     229              code.extend(av)
     230          elif op is CATEGORY:
     231              if flags & SRE_FLAG_LOCALE:
     232                  emit(CH_LOCALE[av])
     233              elif flags & SRE_FLAG_UNICODE:
     234                  emit(CH_UNICODE[av])
     235              else:
     236                  emit(av)
     237          else:
     238              raise error("internal: unsupported set operator %r" % (op,))
     239      emit(FAILURE)
     240  
def _optimize_charset(charset, iscased=None, fixup=None, fixes=None):
    """Return ``(charset, hascased)`` with *charset* in a compact form.

    *charset* is a list of ``(op, av)`` set members.  LITERAL and RANGE
    members are collected in a character map, which is then written back
    as at most two LITERAL/RANGE items, as one CHARSET bitmap (map of 256
    entries) or as one BIGCHARSET (map of 65536 entries).  NEGATE members
    are kept in front of that and every other member is appended after it.

    *fixup*, when given, is applied to every literal and range member
    before it is marked; *fixes* maps a fixed-up code to further codes that
    are marked with it; *iscased* is called on the original codes (only
    when *fixup* is given) to compute *hascased*.  The original *charset*
    object is returned unchanged when *hascased* is false and the rewritten
    list is not shorter.
    """
    # internal: optimize character set
    out = []
    tail = []
    charmap = bytearray(256)
    hascased = False
    for op, av in charset:
        # The loop body is retried once (via "continue") after the map has
        # been enlarged; otherwise it runs a single time per member.
        while True:
            try:
                if op is LITERAL:
                    if fixup:
                        lo = fixup(av)
                        charmap[lo] = 1
                        if fixes and lo in fixes:
                            for k in fixes[lo]:
                                charmap[k] = 1
                        if not hascased and iscased(av):
                            hascased = True
                    else:
                        charmap[av] = 1
                elif op is RANGE:
                    r = range(av[0], av[1]+1)
                    if fixup:
                        if fixes:
                            for i in map(fixup, r):
                                charmap[i] = 1
                                if i in fixes:
                                    for k in fixes[i]:
                                        charmap[k] = 1
                        else:
                            for i in map(fixup, r):
                                charmap[i] = 1
                        if not hascased:
                            hascased = any(map(iscased, r))
                    else:
                        for i in r:
                            charmap[i] = 1
                elif op is NEGATE:
                    out.append((op, av))
                else:
                    tail.append((op, av))
            except IndexError:
                # A code did not fit into the current character map.
                if len(charmap) == 256:
                    # character set contains non-UCS1 character codes
                    # (grow the map to 65536 entries and retry this member)
                    charmap += b'\0' * 0xff00
                    continue
                # Character set contains non-BMP character codes.
                # For range, all BMP characters in the range are already
                # processed.
                if fixup:
                    hascased = True
                    # For now, IN_UNI_IGNORE+LITERAL and
                    # IN_UNI_IGNORE+RANGE_UNI_IGNORE work for all non-BMP
                    # characters, because two characters (at least one of
                    # which is not in the BMP) match case-insensitively
                    # if and only if:
                    # 1) c1.lower() == c2.lower()
                    # 2) c1.lower() == c2 or c1.lower().upper() == c2
                    # Also, both c.lower() and c.lower().upper() are single
                    # characters for every non-BMP character.
                    if op is RANGE:
                        op = RANGE_UNI_IGNORE
                tail.append((op, av))
            break

    # compress character map
    # Collect the runs of consecutive marked entries as half-open (p, q)
    # intervals; give up (runs = None) when a third run is found.
    runs = []
    q = 0
    while True:
        p = charmap.find(1, q)
        if p < 0:
            break
        if len(runs) >= 2:
            runs = None
            break
        q = charmap.find(0, p)
        if q < 0:
            # The run extends to the end of the map.
            runs.append((p, len(charmap)))
            break
        runs.append((p, q))
    if runs is not None:
        # use literal/range
        for p, q in runs:
            if q - p == 1:
                out.append((LITERAL, p))
            else:
                out.append((RANGE, (p, q - 1)))
        out += tail
        # if the case was changed or new representation is more compact
        if hascased or len(out) < len(charset):
            return out, hascased
        # else original character set is good enough
        return charset, hascased

    # use bitmap
    if len(charmap) == 256:
        data = _mk_bitmap(charmap)
        out.append((CHARSET, data))
        out += tail
        return out, hascased

    # To represent a big charset, first a bitmap of all characters in the
    # set is constructed. Then, this bitmap is sliced into chunks of 256
    # characters, duplicate chunks are eliminated, and each chunk is
    # given a number. In the compiled expression, the charset is
    # represented by a 32-bit word sequence, consisting of one word for
    # the number of different chunks, a sequence of 256 bytes (64 words)
    # of chunk numbers indexed by their original chunk position, and a
    # sequence of 256-bit chunks (8 words each).

    # Compression is normally good: in a typical charset, large ranges of
    # Unicode will be either completely excluded (e.g. if only cyrillic
    # letters are to be matched), or completely included (e.g. if large
    # subranges of Kanji match). These ranges will be represented by
    # chunks of all one-bits or all zero-bits.

    # Matching can be also done efficiently: the more significant byte of
    # the Unicode character is an index into the chunk number, and the
    # less significant byte is a bit index in the chunk (just like the
    # CHARSET matching).

    charmap = bytes(charmap) # should be hashable
    comps = {}                 # chunk contents -> chunk number
    mapping = bytearray(256)   # chunk position -> chunk number
    block = 0                  # number of distinct chunks seen so far
    data = bytearray()         # distinct chunks, in order of first use
    for i in range(0, 65536, 256):
        chunk = charmap[i: i + 256]
        if chunk in comps:
            mapping[i // 256] = comps[chunk]
        else:
            mapping[i // 256] = comps[chunk] = block
            block += 1
            data += chunk
    data = _mk_bitmap(data)
    # Prepend the chunk count and the chunk-number table to the bitmaps.
    data[0:0] = [block] + _bytes_to_codes(mapping)
    out.append((BIGCHARSET, data))
    out += tail
    return out, hascased
     380  
     381  _CODEBITS = _sre.CODESIZE * 8
     382  MAXCODE = (1 << _CODEBITS) - 1
     383  _BITS_TRANS = b'0' + b'1' * 255
     384  def _mk_bitmap(bits, _CODEBITS=_CODEBITS, _int=int):
     385      s = bits.translate(_BITS_TRANS)[::-1]
     386      return [_int(s[i - _CODEBITS: i], 2)
     387              for i in range(len(s), 0, -_CODEBITS)]
     388  
     389  def _bytes_to_codes(b):
     390      # Convert block indices to word array
     391      a = memoryview(b).cast('I')
     392      assert a.itemsize == _sre.CODESIZE
     393      assert len(a) * a.itemsize == len(b)
     394      return a.tolist()
     395  
     396  def _simple(p):
     397      # check if this subpattern is a "simple" operator
     398      if len(p) != 1:
     399          return False
     400      op, av = p[0]
     401      if op is SUBPATTERN:
     402          return av[0] is None and _simple(av[-1])
     403      return op in _UNIT_CODES
     404  
     405  def _generate_overlap_table(prefix):
     406      """
     407      Generate an overlap table for the following prefix.
     408      An overlap table is a table of the same size as the prefix which
     409      informs about the potential self-overlap for each index in the prefix:
     410      - if overlap[i] == 0, prefix[i:] can't overlap prefix[0:...]
     411      - if overlap[i] == k with 0 < k <= i, prefix[i-k+1:i+1] overlaps with
     412        prefix[0:k]
     413      """
     414      table = [0] * len(prefix)
     415      for i in range(1, len(prefix)):
     416          idx = table[i - 1]
     417          while prefix[i] != prefix[idx]:
     418              if idx == 0:
     419                  table[i] = 0
     420                  break
     421              idx = table[idx - 1]
     422          else:
     423              table[i] = idx + 1
     424      return table
     425  
     426  def _get_iscased(flags):
     427      if not flags & SRE_FLAG_IGNORECASE:
     428          return None
     429      elif flags & SRE_FLAG_UNICODE:
     430          return _sre.unicode_iscased
     431      else:
     432          return _sre.ascii_iscased
     433  
     434  def _get_literal_prefix(pattern, flags):
     435      # look for literal prefix
     436      prefix = []
     437      prefixappend = prefix.append
     438      prefix_skip = None
     439      iscased = _get_iscased(flags)
     440      for op, av in pattern.data:
     441          if op is LITERAL:
     442              if iscased and iscased(av):
     443                  break
     444              prefixappend(av)
     445          elif op is SUBPATTERN:
     446              group, add_flags, del_flags, p = av
     447              flags1 = _combine_flags(flags, add_flags, del_flags)
     448              if flags1 & SRE_FLAG_IGNORECASE and flags1 & SRE_FLAG_LOCALE:
     449                  break
     450              prefix1, prefix_skip1, got_all = _get_literal_prefix(p, flags1)
     451              if prefix_skip is None:
     452                  if group is not None:
     453                      prefix_skip = len(prefix)
     454                  elif prefix_skip1 is not None:
     455                      prefix_skip = len(prefix) + prefix_skip1
     456              prefix.extend(prefix1)
     457              if not got_all:
     458                  break
     459          else:
     460              break
     461      else:
     462          return prefix, prefix_skip, True
     463      return prefix, prefix_skip, False
     464  
     465  def _get_charset_prefix(pattern, flags):
     466      while True:
     467          if not pattern.data:
     468              return None
     469          op, av = pattern.data[0]
     470          if op is not SUBPATTERN:
     471              break
     472          group, add_flags, del_flags, pattern = av
     473          flags = _combine_flags(flags, add_flags, del_flags)
     474          if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE:
     475              return None
     476  
     477      iscased = _get_iscased(flags)
     478      if op is LITERAL:
     479          if iscased and iscased(av):
     480              return None
     481          return [(op, av)]
     482      elif op is BRANCH:
     483          charset = []
     484          charsetappend = charset.append
     485          for p in av[1]:
     486              if not p:
     487                  return None
     488              op, av = p[0]
     489              if op is LITERAL and not (iscased and iscased(av)):
     490                  charsetappend((op, av))
     491              else:
     492                  return None
     493          return charset
     494      elif op is IN:
     495          charset = av
     496          if iscased:
     497              for op, av in charset:
     498                  if op is LITERAL:
     499                      if iscased(av):
     500                          return None
     501                  elif op is RANGE:
     502                      if av[1] > 0xffff:
     503                          return None
     504                      if any(map(iscased, range(av[0], av[1]+1))):
     505                          return None
     506          return charset
     507      return None
     508  
     509  def _compile_info(code, pattern, flags):
     510      # internal: compile an info block.  in the current version,
     511      # this contains min/max pattern width, and an optional literal
     512      # prefix or a character map
     513      lo, hi = pattern.getwidth()
     514      if hi > MAXCODE:
     515          hi = MAXCODE
     516      if lo == 0:
     517          code.extend([INFO, 4, 0, lo, hi])
     518          return
     519      # look for a literal prefix
     520      prefix = []
     521      prefix_skip = 0
     522      charset = [] # not used
     523      if not (flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE):
     524          # look for literal prefix
     525          prefix, prefix_skip, got_all = _get_literal_prefix(pattern, flags)
     526          # if no prefix, look for charset prefix
     527          if not prefix:
     528              charset = _get_charset_prefix(pattern, flags)
     529  ##     if prefix:
     530  ##         print("*** PREFIX", prefix, prefix_skip)
     531  ##     if charset:
     532  ##         print("*** CHARSET", charset)
     533      # add an info block
     534      emit = code.append
     535      emit(INFO)
     536      skip = len(code); emit(0)
     537      # literal flag
     538      mask = 0
     539      if prefix:
     540          mask = SRE_INFO_PREFIX
     541          if prefix_skip is None and got_all:
     542              mask = mask | SRE_INFO_LITERAL
     543      elif charset:
     544          mask = mask | SRE_INFO_CHARSET
     545      emit(mask)
     546      # pattern length
     547      if lo < MAXCODE:
     548          emit(lo)
     549      else:
     550          emit(MAXCODE)
     551          prefix = prefix[:MAXCODE]
     552      emit(min(hi, MAXCODE))
     553      # add literal prefix
     554      if prefix:
     555          emit(len(prefix)) # length
     556          if prefix_skip is None:
     557              prefix_skip =  len(prefix)
     558          emit(prefix_skip) # skip
     559          code.extend(prefix)
     560          # generate overlap table
     561          code.extend(_generate_overlap_table(prefix))
     562      elif charset:
     563          charset, hascased = _optimize_charset(charset)
     564          assert not hascased
     565          _compile_charset(charset, flags, code)
     566      code[skip] = len(code) - skip
     567  
     568  def isstring(obj):
     569      return isinstance(obj, (str, bytes))
     570  
     571  def _code(p, flags):
     572  
     573      flags = p.state.flags | flags
     574      code = []
     575  
     576      # compile info block
     577      _compile_info(code, p, flags)
     578  
     579      # compile the pattern
     580      _compile(code, p.data, flags)
     581  
     582      code.append(SUCCESS)
     583  
     584      return code
     585  
     586  def _hex_code(code):
     587      return '[%s]' % ', '.join('%#0*x' % (_sre.CODESIZE*2+2, x) for x in code)
     588  
def dis(code):
    """Print a human-readable listing of the compiled code list *code*.

    One line is printed per instruction: its offset, a marker (':' if the
    offset has already been recorded as a jump target, '.' otherwise) and
    the decoded opcode with its arguments.  Nested sub-programs are
    indented.  Raises ValueError for an opcode that is not handled here.
    """
    import sys

    labels = set()   # offsets recorded as jump targets so far
    level = 0        # current nesting depth, drives the indentation
    offset_width = len(str(len(code) - 1))   # digits of the largest offset

    def dis_(start, end):
        # Disassemble code[start:end]; calls itself for nested parts.
        def print_(*args, to=None):
            # Print one instruction line for the offset held in `start`;
            # `to`, when given, is recorded as a label and shown.
            if to is not None:
                labels.add(to)
                args += ('(to %d)' % (to,),)
            print('%*d%s ' % (offset_width, start, ':' if start in labels else '.'),
                  end='  '*(level-1))
            print(*args)

        def print_2(*args):
            # Print a continuation line without an offset column.
            print(end=' '*(offset_width + 2*level))
            print(*args)

        nonlocal level
        level += 1
        i = start
        while i < end:
            # `start` is rebound to the offset of the current instruction
            # so that print_ reports it.
            start = i
            op = code[i]
            i += 1
            op = OPCODES[op]
            if op in (SUCCESS, FAILURE, ANY, ANY_ALL,
                      MAX_UNTIL, MIN_UNTIL, NEGATE):
                # Opcodes without arguments.
                print_(op)
            elif op in (LITERAL, NOT_LITERAL,
                        LITERAL_IGNORE, NOT_LITERAL_IGNORE,
                        LITERAL_UNI_IGNORE, NOT_LITERAL_UNI_IGNORE,
                        LITERAL_LOC_IGNORE, NOT_LITERAL_LOC_IGNORE):
                arg = code[i]
                i += 1
                print_(op, '%#02x (%r)' % (arg, chr(arg)))
            elif op is AT:
                arg = code[i]
                i += 1
                arg = str(ATCODES[arg])
                assert arg[:3] == 'AT_'
                print_(op, arg[3:])
            elif op is CATEGORY:
                arg = code[i]
                i += 1
                arg = str(CHCODES[arg])
                assert arg[:9] == 'CATEGORY_'
                print_(op, arg[9:])
            elif op in (IN, IN_IGNORE, IN_UNI_IGNORE, IN_LOC_IGNORE):
                # The skip word is relative to its own position.
                skip = code[i]
                print_(op, skip, to=i+skip)
                dis_(i+1, i+skip)
                i += skip
            elif op in (RANGE, RANGE_UNI_IGNORE):
                lo, hi = code[i: i+2]
                i += 2
                print_(op, '%#02x %#02x (%r-%r)' % (lo, hi, chr(lo), chr(hi)))
            elif op is CHARSET:
                # A 256-bit bitmap stored in 256//_CODEBITS code words.
                print_(op, _hex_code(code[i: i + 256//_CODEBITS]))
                i += 256//_CODEBITS
            elif op is BIGCHARSET:
                # Chunk count, then a 256-byte table of chunk numbers,
                # then one 256-bit bitmap per chunk.
                arg = code[i]
                i += 1
                mapping = list(b''.join(x.to_bytes(_sre.CODESIZE, sys.byteorder)
                                        for x in code[i: i + 256//_sre.CODESIZE]))
                print_(op, arg, mapping)
                i += 256//_sre.CODESIZE
                level += 1
                for j in range(arg):
                    print_2(_hex_code(code[i: i + 256//_CODEBITS]))
                    i += 256//_CODEBITS
                level -= 1
            elif op in (MARK, GROUPREF, GROUPREF_IGNORE, GROUPREF_UNI_IGNORE,
                        GROUPREF_LOC_IGNORE):
                arg = code[i]
                i += 1
                print_(op, arg)
            elif op is JUMP:
                skip = code[i]
                print_(op, skip, to=i+skip)
                i += 1
            elif op is BRANCH:
                # A chain of alternatives, each preceded by its skip
                # word; a zero skip word ends the chain.
                skip = code[i]
                print_(op, skip, to=i+skip)
                while skip:
                    dis_(i+1, i+skip)
                    i += skip
                    start = i
                    skip = code[i]
                    if skip:
                        print_('branch', skip, to=i+skip)
                    else:
                        print_(FAILURE)
                i += 1
            elif op in (REPEAT, REPEAT_ONE, MIN_REPEAT_ONE,
                        POSSESSIVE_REPEAT, POSSESSIVE_REPEAT_ONE):
                skip, min, max = code[i: i+3]
                if max == MAXREPEAT:
                    max = 'MAXREPEAT'
                print_(op, skip, min, max, to=i+skip)
                dis_(i+3, i+skip)
                i += skip
            elif op is GROUPREF_EXISTS:
                arg, skip = code[i: i+2]
                print_(op, arg, skip, to=i+skip)
                i += 2
            elif op in (ASSERT, ASSERT_NOT):
                skip, arg = code[i: i+2]
                print_(op, skip, arg, to=i+skip)
                dis_(i+2, i+skip)
                i += skip
            elif op is ATOMIC_GROUP:
                skip = code[i]
                print_(op, skip, to=i+skip)
                dis_(i+1, i+skip)
                i += skip
            elif op is INFO:
                skip, flags, min, max = code[i: i+4]
                if max == MAXREPEAT:
                    max = 'MAXREPEAT'
                print_(op, skip, bin(flags), min, max, to=i+skip)
                start = i+4
                if flags & SRE_INFO_PREFIX:
                    # Prefix length and skip, the prefix itself, then an
                    # overlap table of the same length.
                    prefix_len, prefix_skip = code[i+4: i+6]
                    print_2('  prefix_skip', prefix_skip)
                    start = i + 6
                    prefix = code[start: start+prefix_len]
                    print_2('  prefix',
                            '[%s]' % ', '.join('%#02x' % x for x in prefix),
                            '(%r)' % ''.join(map(chr, prefix)))
                    start += prefix_len
                    print_2('  overlap', code[start: start+prefix_len])
                    start += prefix_len
                if flags & SRE_INFO_CHARSET:
                    # The rest of the block is a charset subprogram.
                    level += 1
                    print_2('in')
                    dis_(start, i+skip)
                    level -= 1
                i += skip
            else:
                raise ValueError(op)

        level -= 1

    dis_(0, len(code))
     736  
     737  
     738  def compile(p, flags=0):
     739      # internal: convert pattern list to internal format
     740  
     741      if isstring(p):
     742          pattern = p
     743          p = _parser.parse(p, flags)
     744      else:
     745          pattern = None
     746  
     747      code = _code(p, flags)
     748  
     749      if flags & SRE_FLAG_DEBUG:
     750          print()
     751          dis(code)
     752  
     753      # map in either direction
     754      groupindex = p.state.groupdict
     755      indexgroup = [None] * p.state.groups
     756      for k, i in groupindex.items():
     757          indexgroup[i] = k
     758  
     759      return _sre.compile(
     760          pattern, flags | p.state.flags, code,
     761          p.state.groups-1,
     762          groupindex, tuple(indexgroup)
     763          )