(root)/
Python-3.11.7/
Lib/
re/
__init__.py
       1  #
       2  # Secret Labs' Regular Expression Engine
       3  #
       4  # re-compatible interface for the sre matching engine
       5  #
       6  # Copyright (c) 1998-2001 by Secret Labs AB.  All rights reserved.
       7  #
       8  # This version of the SRE library can be redistributed under CNRI's
       9  # Python 1.6 license.  For any other use, please contact Secret Labs
      10  # AB (info@pythonware.com).
      11  #
      12  # Portions of this engine have been developed in cooperation with
      13  # CNRI.  Hewlett-Packard provided funding for 1.6 integration and
      14  # other compatibility work.
      15  #
      16  
      17  r"""Support for regular expressions (RE).
      18  
      19  This module provides regular expression matching operations similar to
      20  those found in Perl.  It supports both 8-bit and Unicode strings; both
      21  the pattern and the strings being processed can contain null bytes and
      22  characters outside the US ASCII range.
      23  
      24  Regular expressions can contain both special and ordinary characters.
      25  Most ordinary characters, like "A", "a", or "0", are the simplest
      26  regular expressions; they simply match themselves.  You can
      27  concatenate ordinary characters, so last matches the string 'last'.
      28  
      29  The special characters are:
      30      "."      Matches any character except a newline.
      31      "^"      Matches the start of the string.
      32      "$"      Matches the end of the string or just before the newline at
      33               the end of the string.
      34      "*"      Matches 0 or more (greedy) repetitions of the preceding RE.
      35               Greedy means that it will match as many repetitions as possible.
      36      "+"      Matches 1 or more (greedy) repetitions of the preceding RE.
      37      "?"      Matches 0 or 1 (greedy) of the preceding RE.
      38      *?,+?,?? Non-greedy versions of the previous three special characters.
      39      {m,n}    Matches from m to n repetitions of the preceding RE.
      40      {m,n}?   Non-greedy version of the above.
      41      "\\"     Either escapes special characters or signals a special sequence.
      42      []       Indicates a set of characters.
      43               A "^" as the first character indicates a complementing set.
      44      "|"      A|B, creates an RE that will match either A or B.
      45      (...)    Matches the RE inside the parentheses.
      46               The contents can be retrieved or matched later in the string.
      47      (?aiLmsux) The letters set the corresponding flags defined below.
      48      (?:...)  Non-grouping version of regular parentheses.
      49      (?P<name>...) The substring matched by the group is accessible by name.
      50      (?P=name)     Matches the text matched earlier by the group named name.
      51      (?#...)  A comment; ignored.
      52      (?=...)  Matches if ... matches next, but doesn't consume the string.
      53      (?!...)  Matches if ... doesn't match next.
      54      (?<=...) Matches if preceded by ... (must be fixed length).
      55      (?<!...) Matches if not preceded by ... (must be fixed length).
      56      (?(id/name)yes|no) Matches yes pattern if the group with id/name matched,
      57                         the (optional) no pattern otherwise.
      58  
      59  The special sequences consist of "\\" and a character from the list
      60  below.  If the ordinary character is not on the list, then the
      61  resulting RE will match the second character.
      62      \number  Matches the contents of the group of the same number.
      63      \A       Matches only at the start of the string.
      64      \Z       Matches only at the end of the string.
      65      \b       Matches the empty string, but only at the start or end of a word.
      66      \B       Matches the empty string, but not at the start or end of a word.
      67      \d       Matches any decimal digit; equivalent to the set [0-9] in
      68               bytes patterns or string patterns with the ASCII flag.
      69               In string patterns without the ASCII flag, it will match the whole
      70               range of Unicode digits.
      71      \D       Matches any non-digit character; equivalent to [^\d].
      72      \s       Matches any whitespace character; equivalent to [ \t\n\r\f\v] in
      73               bytes patterns or string patterns with the ASCII flag.
      74               In string patterns without the ASCII flag, it will match the whole
      75               range of Unicode whitespace characters.
      76      \S       Matches any non-whitespace character; equivalent to [^\s].
      77      \w       Matches any alphanumeric character; equivalent to [a-zA-Z0-9_]
      78               in bytes patterns or string patterns with the ASCII flag.
      79               In string patterns without the ASCII flag, it will match the
      80               range of Unicode alphanumeric characters (letters plus digits
      81               plus underscore).
      82               With LOCALE, it will match the set [0-9_] plus characters defined
      83               as letters for the current locale.
      84      \W       Matches the complement of \w.
      85      \\       Matches a literal backslash.
      86  
      87  This module exports the following functions:
      88      match     Match a regular expression pattern to the beginning of a string.
      89      fullmatch Match a regular expression pattern to all of a string.
      90      search    Search a string for the presence of a pattern.
      91      sub       Substitute occurrences of a pattern found in a string.
      92      subn      Same as sub, but also return the number of substitutions made.
      93      split     Split a string by the occurrences of a pattern.
      94      findall   Find all occurrences of a pattern in a string.
      95      finditer  Return an iterator yielding a Match object for each match.
      96      compile   Compile a pattern into a Pattern object.
      97      purge     Clear the regular expression cache.
      98      escape    Backslash-escape the special characters in a string.
      99  
     100  Each function other than purge and escape can take an optional 'flags' argument
     101  consisting of one or more of the following module constants, joined by "|".
     102  A, L, and U are mutually exclusive.
     103      A  ASCII       For string patterns, make \w, \W, \b, \B, \d, \D
     104                     match the corresponding ASCII character categories
     105                     (rather than the whole Unicode categories, which is the
     106                     default).
     107                     For bytes patterns, this flag is the only available
     108                     behaviour and needn't be specified.
     109      I  IGNORECASE  Perform case-insensitive matching.
     110      L  LOCALE      Make \w, \W, \b, \B dependent on the current locale.
     111      M  MULTILINE   "^" matches the beginning of lines (after a newline)
     112                     as well as the string.
     113                     "$" matches the end of lines (before a newline) as well
     114                     as the end of the string.
     115      S  DOTALL      "." matches any character at all, including the newline.
     116      X  VERBOSE     Ignore whitespace and comments for nicer looking RE's.
     117      U  UNICODE     For compatibility only. Ignored for string patterns (it
     118                     is the default), and forbidden for bytes patterns.
     119  
     120  This module also defines an exception 'error'.
     121  
     122  """
     123  
     124  import enum
     125  from . import _compiler, _parser
     126  import functools
     127  
     128  
     129  # public symbols
     130  __all__ = [
     131      "match", "fullmatch", "search", "sub", "subn", "split",
     132      "findall", "finditer", "compile", "purge", "template", "escape",
     133      "error", "Pattern", "Match", "A", "I", "L", "M", "S", "X", "U",
     134      "ASCII", "IGNORECASE", "LOCALE", "MULTILINE", "DOTALL", "VERBOSE",
     135      "UNICODE", "NOFLAG", "RegexFlag",
     136  ]
     137  
     138  __version__ = "2.2.1"
     139  
# Flag enumeration for the ``flags`` argument of this module's functions.
# The numeric values come from the compiler module's SRE_FLAG_* constants.
#
# ``enum.global_enum`` exports every member and alias into the module
# namespace; that is where the bare names used elsewhere in this file
# (``T``, ``DEBUG``, ...) come from.  ``boundary=enum.KEEP`` keeps
# out-of-range bit values instead of rejecting them.
@enum.global_enum
@enum._simple_enum(enum.IntFlag, boundary=enum.KEEP)
class RegexFlag:
    NOFLAG = 0
    # In each chained assignment the long name is bound first and the short
    # name second, so the short spelling is the alias.
    ASCII = A = _compiler.SRE_FLAG_ASCII # assume ascii "locale"
    IGNORECASE = I = _compiler.SRE_FLAG_IGNORECASE # ignore case
    LOCALE = L = _compiler.SRE_FLAG_LOCALE # assume current 8-bit locale
    UNICODE = U = _compiler.SRE_FLAG_UNICODE # assume unicode "locale"
    MULTILINE = M = _compiler.SRE_FLAG_MULTILINE # make anchors look for newline
    DOTALL = S = _compiler.SRE_FLAG_DOTALL # make dot match newline
    VERBOSE = X = _compiler.SRE_FLAG_VERBOSE # ignore whitespace and comments
    # sre extensions (experimental, don't rely on these)
    TEMPLATE = T = _compiler.SRE_FLAG_TEMPLATE # unknown purpose, deprecated
    DEBUG = _compiler.SRE_FLAG_DEBUG # dump pattern after compilation
    # Use the plain object __str__ (which falls back to repr()) rather than
    # the one the enum machinery would provide.
    __str__ = object.__str__
    # Format any leftover, unnamed numeric bits in hexadecimal.
    _numeric_repr_ = hex
     156  
# sre exception: re-export the compiler module's exception class under the
# public name ``error`` (listed in ``__all__``).
error = _compiler.error
     159  
     160  # --------------------------------------------------------------------
     161  # public interface
     162  
     163  def match(pattern, string, flags=0):
     164      """Try to apply the pattern at the start of the string, returning
     165      a Match object, or None if no match was found."""
     166      return _compile(pattern, flags).match(string)
     167  
     168  def fullmatch(pattern, string, flags=0):
     169      """Try to apply the pattern to all of the string, returning
     170      a Match object, or None if no match was found."""
     171      return _compile(pattern, flags).fullmatch(string)
     172  
     173  def search(pattern, string, flags=0):
     174      """Scan through string looking for a match to the pattern, returning
     175      a Match object, or None if no match was found."""
     176      return _compile(pattern, flags).search(string)
     177  
     178  def sub(pattern, repl, string, count=0, flags=0):
     179      """Return the string obtained by replacing the leftmost
     180      non-overlapping occurrences of the pattern in string by the
     181      replacement repl.  repl can be either a string or a callable;
     182      if a string, backslash escapes in it are processed.  If it is
     183      a callable, it's passed the Match object and must return
     184      a replacement string to be used."""
     185      return _compile(pattern, flags).sub(repl, string, count)
     186  
     187  def subn(pattern, repl, string, count=0, flags=0):
     188      """Return a 2-tuple containing (new_string, number).
     189      new_string is the string obtained by replacing the leftmost
     190      non-overlapping occurrences of the pattern in the source
     191      string by the replacement repl.  number is the number of
     192      substitutions that were made. repl can be either a string or a
     193      callable; if a string, backslash escapes in it are processed.
     194      If it is a callable, it's passed the Match object and must
     195      return a replacement string to be used."""
     196      return _compile(pattern, flags).subn(repl, string, count)
     197  
     198  def split(pattern, string, maxsplit=0, flags=0):
     199      """Split the source string by the occurrences of the pattern,
     200      returning a list containing the resulting substrings.  If
     201      capturing parentheses are used in pattern, then the text of all
     202      groups in the pattern are also returned as part of the resulting
     203      list.  If maxsplit is nonzero, at most maxsplit splits occur,
     204      and the remainder of the string is returned as the final element
     205      of the list."""
     206      return _compile(pattern, flags).split(string, maxsplit)
     207  
     208  def findall(pattern, string, flags=0):
     209      """Return a list of all non-overlapping matches in the string.
     210  
     211      If one or more capturing groups are present in the pattern, return
     212      a list of groups; this will be a list of tuples if the pattern
     213      has more than one group.
     214  
     215      Empty matches are included in the result."""
     216      return _compile(pattern, flags).findall(string)
     217  
     218  def finditer(pattern, string, flags=0):
     219      """Return an iterator over all non-overlapping matches in the
     220      string.  For each match, the iterator returns a Match object.
     221  
     222      Empty matches are included in the result."""
     223      return _compile(pattern, flags).finditer(string)
     224  
     225  def compile(pattern, flags=0):
     226      "Compile a regular expression pattern, returning a Pattern object."
     227      return _compile(pattern, flags)
     228  
     229  def purge():
     230      "Clear the regular expression caches"
     231      _cache.clear()
     232      _compile_repl.cache_clear()
     233  
     234  def template(pattern, flags=0):
     235      "Compile a template pattern, returning a Pattern object, deprecated"
     236      import warnings
     237      warnings.warn("The re.template() function is deprecated "
     238                    "as it is an undocumented function "
     239                    "without an obvious purpose. "
     240                    "Use re.compile() instead.",
     241                    DeprecationWarning)
     242      with warnings.catch_warnings():
     243          warnings.simplefilter("ignore", DeprecationWarning)  # warn just once
     244          return _compile(pattern, flags|T)
     245  
     246  # SPECIAL_CHARS
     247  # closing ')', '}' and ']'
     248  # '-' (a range in character set)
     249  # '&', '~', (extended character set operations)
     250  # '#' (comment) and WHITESPACE (ignored) in verbose mode
     251  _special_chars_map = {i: '\\' + chr(i) for i in b'()[]{}?*+-|^$\\.&~# \t\n\r\v\f'}
     252  
     253  def escape(pattern):
     254      """
     255      Escape special characters in a string.
     256      """
     257      if isinstance(pattern, str):
     258          return pattern.translate(_special_chars_map)
     259      else:
     260          pattern = str(pattern, 'latin1')
     261          return pattern.translate(_special_chars_map).encode('latin1')
     262  
# Derive the public Pattern and Match types from live objects: compile an
# empty pattern for Pattern, and match it against an empty string for Match.
Pattern = type(_compiler.compile('', 0))
Match = type(_compiler.compile('', 0).match(''))
     265  
     266  # --------------------------------------------------------------------
     267  # internals
     268  
     269  _cache = {}  # ordered!
     270  
     271  _MAXCACHE = 512
     272  def _compile(pattern, flags):
     273      # internal: compile pattern
     274      if isinstance(flags, RegexFlag):
     275          flags = flags.value
     276      try:
     277          return _cache[type(pattern), pattern, flags]
     278      except KeyError:
     279          pass
     280      if isinstance(pattern, Pattern):
     281          if flags:
     282              raise ValueError(
     283                  "cannot process flags argument with a compiled pattern")
     284          return pattern
     285      if not _compiler.isstring(pattern):
     286          raise TypeError("first argument must be string or compiled pattern")
     287      if flags & T:
     288          import warnings
     289          warnings.warn("The re.TEMPLATE/re.T flag is deprecated "
     290                    "as it is an undocumented flag "
     291                    "without an obvious purpose. "
     292                    "Don't use it.",
     293                    DeprecationWarning)
     294      p = _compiler.compile(pattern, flags)
     295      if not (flags & DEBUG):
     296          if len(_cache) >= _MAXCACHE:
     297              # Drop the oldest item
     298              try:
     299                  del _cache[next(iter(_cache))]
     300              except (StopIteration, RuntimeError, KeyError):
     301                  pass
     302          _cache[type(pattern), pattern, flags] = p
     303      return p
     304  
     305  @functools.lru_cache(_MAXCACHE)
     306  def _compile_repl(repl, pattern):
     307      # internal: compile replacement pattern
     308      return _parser.parse_template(repl, pattern)
     309  
     310  def _expand(pattern, match, template):
     311      # internal: Match.expand implementation hook
     312      template = _parser.parse_template(template, pattern)
     313      return _parser.expand_template(template, match)
     314  
     315  def _subx(pattern, template):
     316      # internal: Pattern.sub/subn implementation helper
     317      template = _compile_repl(template, pattern)
     318      if not template[0] and len(template[1]) == 1:
     319          # literal replacement
     320          return template[1][0]
     321      def filter(match, template=template):
     322          return _parser.expand_template(template, match)
     323      return filter
     324  
     325  # register myself for pickling
     326  
     327  import copyreg
     328  
     329  def _pickle(p):
     330      return _compile, (p.pattern, p.flags)
     331  
     332  copyreg.pickle(Pattern, _pickle, _compile)
     333  
     334  # --------------------------------------------------------------------
     335  # experimental stuff (see python-dev discussions for details)
     336  
     337  class ESC[4;38;5;81mScanner:
     338      def __init__(self, lexicon, flags=0):
     339          from ._constants import BRANCH, SUBPATTERN
     340          if isinstance(flags, RegexFlag):
     341              flags = flags.value
     342          self.lexicon = lexicon
     343          # combine phrases into a compound pattern
     344          p = []
     345          s = _parser.State()
     346          s.flags = flags
     347          for phrase, action in lexicon:
     348              gid = s.opengroup()
     349              p.append(_parser.SubPattern(s, [
     350                  (SUBPATTERN, (gid, 0, 0, _parser.parse(phrase, flags))),
     351                  ]))
     352              s.closegroup(gid, p[-1])
     353          p = _parser.SubPattern(s, [(BRANCH, (None, p))])
     354          self.scanner = _compiler.compile(p)
     355      def scan(self, string):
     356          result = []
     357          append = result.append
     358          match = self.scanner.scanner(string).match
     359          i = 0
     360          while True:
     361              m = match()
     362              if not m:
     363                  break
     364              j = m.end()
     365              if i == j:
     366                  break
     367              action = self.lexicon[m.lastindex-1][1]
     368              if callable(action):
     369                  self.match = m
     370                  action = action(self, m.group())
     371              if action is not None:
     372                  append(action)
     373              i = j
     374          return result, string[i:]