python (3.11.7)
       1  # helpers.py
       2  import html.entities
       3  import re
       4  import typing
       5  
       6  from . import __diag__
       7  from .core import *
       8  from .util import _bslash, _flatten, _escape_regex_range_chars
       9  
      10  
      11  #
      12  # global helpers
      13  #
      14  def delimited_list(
      15      expr: Union[str, ParserElement],
      16      delim: Union[str, ParserElement] = ",",
      17      combine: bool = False,
      18      min: typing.Optional[int] = None,
      19      max: typing.Optional[int] = None,
      20      *,
      21      allow_trailing_delim: bool = False,
      22  ) -> ParserElement:
      23      """Helper to define a delimited list of expressions - the delimiter
      24      defaults to ','. By default, the list elements and delimiters can
      25      have intervening whitespace, and comments, but this can be
      26      overridden by passing ``combine=True`` in the constructor. If
      27      ``combine`` is set to ``True``, the matching tokens are
      28      returned as a single token string, with the delimiters included;
      29      otherwise, the matching tokens are returned as a list of tokens,
      30      with the delimiters suppressed.
      31  
      32      If ``allow_trailing_delim`` is set to True, then the list may end with
      33      a delimiter.
      34  
      35      Example::
      36  
      37          delimited_list(Word(alphas)).parse_string("aa,bb,cc") # -> ['aa', 'bb', 'cc']
      38          delimited_list(Word(hexnums), delim=':', combine=True).parse_string("AA:BB:CC:DD:EE") # -> ['AA:BB:CC:DD:EE']
      39      """
      40      if isinstance(expr, str_type):
      41          expr = ParserElement._literalStringClass(expr)
      42  
      43      dlName = "{expr} [{delim} {expr}]...{end}".format(
      44          expr=str(expr.copy().streamline()),
      45          delim=str(delim),
      46          end=" [{}]".format(str(delim)) if allow_trailing_delim else "",
      47      )
      48  
      49      if not combine:
      50          delim = Suppress(delim)
      51  
      52      if min is not None:
      53          if min < 1:
      54              raise ValueError("min must be greater than 0")
      55          min -= 1
      56      if max is not None:
      57          if min is not None and max <= min:
      58              raise ValueError("max must be greater than, or equal to min")
      59          max -= 1
      60      delimited_list_expr = expr + (delim + expr)[min, max]
      61  
      62      if allow_trailing_delim:
      63          delimited_list_expr += Opt(delim)
      64  
      65      if combine:
      66          return Combine(delimited_list_expr).set_name(dlName)
      67      else:
      68          return delimited_list_expr.set_name(dlName)
      69  
      70  
      71  def counted_array(
      72      expr: ParserElement,
      73      int_expr: typing.Optional[ParserElement] = None,
      74      *,
      75      intExpr: typing.Optional[ParserElement] = None,
      76  ) -> ParserElement:
      77      """Helper to define a counted list of expressions.
      78  
      79      This helper defines a pattern of the form::
      80  
      81          integer expr expr expr...
      82  
      83      where the leading integer tells how many expr expressions follow.
      84      The matched tokens returns the array of expr tokens as a list - the
      85      leading count token is suppressed.
      86  
      87      If ``int_expr`` is specified, it should be a pyparsing expression
      88      that produces an integer value.
      89  
      90      Example::
      91  
      92          counted_array(Word(alphas)).parse_string('2 ab cd ef')  # -> ['ab', 'cd']
      93  
      94          # in this parser, the leading integer value is given in binary,
      95          # '10' indicating that 2 values are in the array
      96          binary_constant = Word('01').set_parse_action(lambda t: int(t[0], 2))
      97          counted_array(Word(alphas), int_expr=binary_constant).parse_string('10 ab cd ef')  # -> ['ab', 'cd']
      98  
      99          # if other fields must be parsed after the count but before the
     100          # list items, give the fields results names and they will
     101          # be preserved in the returned ParseResults:
     102          count_with_metadata = integer + Word(alphas)("type")
     103          typed_array = counted_array(Word(alphanums), int_expr=count_with_metadata)("items")
     104          result = typed_array.parse_string("3 bool True True False")
     105          print(result.dump())
     106  
     107          # prints
     108          # ['True', 'True', 'False']
     109          # - items: ['True', 'True', 'False']
     110          # - type: 'bool'
     111      """
     112      intExpr = intExpr or int_expr
     113      array_expr = Forward()
     114  
     115      def count_field_parse_action(s, l, t):
     116          nonlocal array_expr
     117          n = t[0]
     118          array_expr <<= (expr * n) if n else Empty()
     119          # clear list contents, but keep any named results
     120          del t[:]
     121  
     122      if intExpr is None:
     123          intExpr = Word(nums).set_parse_action(lambda t: int(t[0]))
     124      else:
     125          intExpr = intExpr.copy()
     126      intExpr.set_name("arrayLen")
     127      intExpr.add_parse_action(count_field_parse_action, call_during_try=True)
     128      return (intExpr + array_expr).set_name("(len) " + str(expr) + "...")
     129  
     130  
     131  def match_previous_literal(expr: ParserElement) -> ParserElement:
     132      """Helper to define an expression that is indirectly defined from
     133      the tokens matched in a previous expression, that is, it looks for
     134      a 'repeat' of a previous expression.  For example::
     135  
     136          first = Word(nums)
     137          second = match_previous_literal(first)
     138          match_expr = first + ":" + second
     139  
     140      will match ``"1:1"``, but not ``"1:2"``.  Because this
     141      matches a previous literal, will also match the leading
     142      ``"1:1"`` in ``"1:10"``. If this is not desired, use
     143      :class:`match_previous_expr`. Do *not* use with packrat parsing
     144      enabled.
     145      """
     146      rep = Forward()
     147  
     148      def copy_token_to_repeater(s, l, t):
     149          if t:
     150              if len(t) == 1:
     151                  rep << t[0]
     152              else:
     153                  # flatten t tokens
     154                  tflat = _flatten(t.as_list())
     155                  rep << And(Literal(tt) for tt in tflat)
     156          else:
     157              rep << Empty()
     158  
     159      expr.add_parse_action(copy_token_to_repeater, callDuringTry=True)
     160      rep.set_name("(prev) " + str(expr))
     161      return rep
     162  
     163  
     164  def match_previous_expr(expr: ParserElement) -> ParserElement:
     165      """Helper to define an expression that is indirectly defined from
     166      the tokens matched in a previous expression, that is, it looks for
     167      a 'repeat' of a previous expression.  For example::
     168  
     169          first = Word(nums)
     170          second = match_previous_expr(first)
     171          match_expr = first + ":" + second
     172  
     173      will match ``"1:1"``, but not ``"1:2"``.  Because this
     174      matches by expressions, will *not* match the leading ``"1:1"``
     175      in ``"1:10"``; the expressions are evaluated first, and then
     176      compared, so ``"1"`` is compared with ``"10"``. Do *not* use
     177      with packrat parsing enabled.
     178      """
     179      rep = Forward()
     180      e2 = expr.copy()
     181      rep <<= e2
     182  
     183      def copy_token_to_repeater(s, l, t):
     184          matchTokens = _flatten(t.as_list())
     185  
     186          def must_match_these_tokens(s, l, t):
     187              theseTokens = _flatten(t.as_list())
     188              if theseTokens != matchTokens:
     189                  raise ParseException(
     190                      s, l, "Expected {}, found{}".format(matchTokens, theseTokens)
     191                  )
     192  
     193          rep.set_parse_action(must_match_these_tokens, callDuringTry=True)
     194  
     195      expr.add_parse_action(copy_token_to_repeater, callDuringTry=True)
     196      rep.set_name("(prev) " + str(expr))
     197      return rep
     198  
     199  
def one_of(
    strs: Union[typing.Iterable[str], str],
    caseless: bool = False,
    use_regex: bool = True,
    as_keyword: bool = False,
    *,
    useRegex: bool = True,
    asKeyword: bool = False,
) -> ParserElement:
    """Helper to quickly define a set of alternative :class:`Literal` s,
    and makes sure to do longest-first testing when there is a conflict,
    regardless of the input order, but returns
    a :class:`MatchFirst` for best performance.

    Parameters:

    - ``strs`` - a string of space-delimited literals, or a collection of
      string literals
    - ``caseless`` - treat all literals as caseless - (default= ``False``)
    - ``use_regex`` - as an optimization, will
      generate a :class:`Regex` object; otherwise, will generate
      a :class:`MatchFirst` object (if ``caseless=True`` or ``asKeyword=True``, or if
      creating a :class:`Regex` raises an exception) - (default= ``True``)
    - ``as_keyword`` - enforce :class:`Keyword`-style matching on the
      generated expressions - (default= ``False``)
    - ``asKeyword`` and ``useRegex`` are retained for pre-PEP8 compatibility,
      but will be removed in a future release

    Example::

        comp_oper = one_of("< = > <= >= !=")
        var = Word(alphas)
        number = Word(nums)
        term = var | number
        comparison_expr = term + comp_oper + term
        print(comparison_expr.search_string("B = 12  AA=23 B<=AA AA>12"))

    prints::

        [['B', '=', '12'], ['AA', '=', '23'], ['B', '<=', 'AA'], ['AA', '>', '12']]
    """
    # merge PEP8 and pre-PEP8 spellings: keyword matching is enabled by either
    # flag; regex generation is used only if neither spelling disabled it
    asKeyword = asKeyword or as_keyword
    useRegex = useRegex and use_regex

    if (
        isinstance(caseless, str_type)
        and __diag__.warn_on_multiple_string_args_to_oneof
    ):
        # a string in the 'caseless' slot usually means the caller wrote
        # one_of("a", "b", ...) instead of one_of("a b ...")
        warnings.warn(
            "More than one string argument passed to one_of, pass"
            " choices as a list or space-delimited string",
            stacklevel=2,
        )

    # pick comparison/masking predicates and the element class to use for the
    # non-regex fallback, according to the caseless/keyword flags
    if caseless:
        isequal = lambda a, b: a.upper() == b.upper()
        masks = lambda a, b: b.upper().startswith(a.upper())
        parseElementClass = CaselessKeyword if asKeyword else CaselessLiteral
    else:
        isequal = lambda a, b: a == b
        masks = lambda a, b: b.startswith(a)
        parseElementClass = Keyword if asKeyword else Literal

    symbols: List[str] = []
    if isinstance(strs, str_type):
        symbols = strs.split()
    elif isinstance(strs, Iterable):
        symbols = list(strs)
    else:
        raise TypeError("Invalid argument to one_of, expected string or iterable")
    if not symbols:
        return NoMatch()

    # reorder given symbols to take care to avoid masking longer choices with shorter ones
    # (but only if the given symbols are not just single characters)
    if any(len(sym) > 1 for sym in symbols):
        i = 0
        while i < len(symbols) - 1:
            cur = symbols[i]
            for j, other in enumerate(symbols[i + 1 :]):
                if isequal(other, cur):
                    # drop an exact duplicate of the current symbol
                    del symbols[i + j + 1]
                    break
                elif masks(cur, other):
                    # 'other' starts with 'cur' and would never be reached;
                    # move it ahead of 'cur' so longest-first testing holds
                    del symbols[i + j + 1]
                    symbols.insert(i, other)
                    break
            else:
                i += 1

    if useRegex:
        re_flags: int = re.IGNORECASE if caseless else 0

        try:
            if all(len(sym) == 1 for sym in symbols):
                # symbols are just single characters, create range regex pattern
                patt = "[{}]".format(
                    "".join(_escape_regex_range_chars(sym) for sym in symbols)
                )
            else:
                patt = "|".join(re.escape(sym) for sym in symbols)

            # wrap with \b word break markers if defining as keywords
            if asKeyword:
                patt = r"\b(?:{})\b".format(patt)

            ret = Regex(patt, flags=re_flags).set_name(" | ".join(symbols))

            if caseless:
                # add parse action to return symbols as specified, not in random
                # casing as found in input string
                symbol_map = {sym.lower(): sym for sym in symbols}
                ret.add_parse_action(lambda s, l, t: symbol_map[t[0].lower()])

            return ret

        except re.error:
            warnings.warn(
                "Exception creating Regex for one_of, building MatchFirst", stacklevel=2
            )

    # last resort, just use MatchFirst
    return MatchFirst(parseElementClass(sym) for sym in symbols).set_name(
        " | ".join(symbols)
    )
     325  
     326  
     327  def dict_of(key: ParserElement, value: ParserElement) -> ParserElement:
     328      """Helper to easily and clearly define a dictionary by specifying
     329      the respective patterns for the key and value.  Takes care of
     330      defining the :class:`Dict`, :class:`ZeroOrMore`, and
     331      :class:`Group` tokens in the proper order.  The key pattern
     332      can include delimiting markers or punctuation, as long as they are
     333      suppressed, thereby leaving the significant key text.  The value
     334      pattern can include named results, so that the :class:`Dict` results
     335      can include named token fields.
     336  
     337      Example::
     338  
     339          text = "shape: SQUARE posn: upper left color: light blue texture: burlap"
     340          attr_expr = (label + Suppress(':') + OneOrMore(data_word, stop_on=label).set_parse_action(' '.join))
     341          print(attr_expr[1, ...].parse_string(text).dump())
     342  
     343          attr_label = label
     344          attr_value = Suppress(':') + OneOrMore(data_word, stop_on=label).set_parse_action(' '.join)
     345  
     346          # similar to Dict, but simpler call format
     347          result = dict_of(attr_label, attr_value).parse_string(text)
     348          print(result.dump())
     349          print(result['shape'])
     350          print(result.shape)  # object attribute access works too
     351          print(result.as_dict())
     352  
     353      prints::
     354  
     355          [['shape', 'SQUARE'], ['posn', 'upper left'], ['color', 'light blue'], ['texture', 'burlap']]
     356          - color: 'light blue'
     357          - posn: 'upper left'
     358          - shape: 'SQUARE'
     359          - texture: 'burlap'
     360          SQUARE
     361          SQUARE
     362          {'color': 'light blue', 'shape': 'SQUARE', 'posn': 'upper left', 'texture': 'burlap'}
     363      """
     364      return Dict(OneOrMore(Group(key + value)))
     365  
     366  
     367  def original_text_for(
     368      expr: ParserElement, as_string: bool = True, *, asString: bool = True
     369  ) -> ParserElement:
     370      """Helper to return the original, untokenized text for a given
     371      expression.  Useful to restore the parsed fields of an HTML start
     372      tag into the raw tag text itself, or to revert separate tokens with
     373      intervening whitespace back to the original matching input text. By
     374      default, returns astring containing the original parsed text.
     375  
     376      If the optional ``as_string`` argument is passed as
     377      ``False``, then the return value is
     378      a :class:`ParseResults` containing any results names that
     379      were originally matched, and a single token containing the original
     380      matched text from the input string.  So if the expression passed to
     381      :class:`original_text_for` contains expressions with defined
     382      results names, you must set ``as_string`` to ``False`` if you
     383      want to preserve those results name values.
     384  
     385      The ``asString`` pre-PEP8 argument is retained for compatibility,
     386      but will be removed in a future release.
     387  
     388      Example::
     389  
     390          src = "this is test <b> bold <i>text</i> </b> normal text "
     391          for tag in ("b", "i"):
     392              opener, closer = make_html_tags(tag)
     393              patt = original_text_for(opener + SkipTo(closer) + closer)
     394              print(patt.search_string(src)[0])
     395  
     396      prints::
     397  
     398          ['<b> bold <i>text</i> </b>']
     399          ['<i>text</i>']
     400      """
     401      asString = asString and as_string
     402  
     403      locMarker = Empty().set_parse_action(lambda s, loc, t: loc)
     404      endlocMarker = locMarker.copy()
     405      endlocMarker.callPreparse = False
     406      matchExpr = locMarker("_original_start") + expr + endlocMarker("_original_end")
     407      if asString:
     408          extractText = lambda s, l, t: s[t._original_start : t._original_end]
     409      else:
     410  
     411          def extractText(s, l, t):
     412              t[:] = [s[t.pop("_original_start") : t.pop("_original_end")]]
     413  
     414      matchExpr.set_parse_action(extractText)
     415      matchExpr.ignoreExprs = expr.ignoreExprs
     416      matchExpr.suppress_warning(Diagnostics.warn_ungrouped_named_tokens_in_collection)
     417      return matchExpr
     418  
     419  
     420  def ungroup(expr: ParserElement) -> ParserElement:
     421      """Helper to undo pyparsing's default grouping of And expressions,
     422      even if all but one are non-empty.
     423      """
     424      return TokenConverter(expr).add_parse_action(lambda t: t[0])
     425  
     426  
     427  def locatedExpr(expr: ParserElement) -> ParserElement:
     428      """
     429      (DEPRECATED - future code should use the Located class)
     430      Helper to decorate a returned token with its starting and ending
     431      locations in the input string.
     432  
     433      This helper adds the following results names:
     434  
     435      - ``locn_start`` - location where matched expression begins
     436      - ``locn_end`` - location where matched expression ends
     437      - ``value`` - the actual parsed results
     438  
     439      Be careful if the input text contains ``<TAB>`` characters, you
     440      may want to call :class:`ParserElement.parseWithTabs`
     441  
     442      Example::
     443  
     444          wd = Word(alphas)
     445          for match in locatedExpr(wd).searchString("ljsdf123lksdjjf123lkkjj1222"):
     446              print(match)
     447  
     448      prints::
     449  
     450          [[0, 'ljsdf', 5]]
     451          [[8, 'lksdjjf', 15]]
     452          [[18, 'lkkjj', 23]]
     453      """
     454      locator = Empty().set_parse_action(lambda ss, ll, tt: ll)
     455      return Group(
     456          locator("locn_start")
     457          + expr("value")
     458          + locator.copy().leaveWhitespace()("locn_end")
     459      )
     460  
     461  
def nested_expr(
    opener: Union[str, ParserElement] = "(",
    closer: Union[str, ParserElement] = ")",
    content: typing.Optional[ParserElement] = None,
    ignore_expr: ParserElement = quoted_string(),
    *,
    ignoreExpr: ParserElement = quoted_string(),
) -> ParserElement:
    """Helper method for defining nested lists enclosed in opening and
    closing delimiters (``"("`` and ``")"`` are the default).

    Parameters:
    - ``opener`` - opening character for a nested list
      (default= ``"("``); can also be a pyparsing expression
    - ``closer`` - closing character for a nested list
      (default= ``")"``); can also be a pyparsing expression
    - ``content`` - expression for items within the nested lists
      (default= ``None``)
    - ``ignore_expr`` - expression for ignoring opening and closing delimiters
      (default= :class:`quoted_string`)
    - ``ignoreExpr`` - this pre-PEP8 argument is retained for compatibility
      but will be removed in a future release

    If an expression is not provided for the content argument, the
    nested expression will capture all whitespace-delimited content
    between delimiters as a list of separate values.

    Use the ``ignore_expr`` argument to define expressions that may
    contain opening or closing characters that should not be treated as
    opening or closing characters for nesting, such as quoted_string or
    a comment expression.  Specify multiple expressions using an
    :class:`Or` or :class:`MatchFirst`. The default is
    :class:`quoted_string`, but if no expressions are to be ignored, then
    pass ``None`` for this argument.

    Example::

        data_type = one_of("void int short long char float double")
        decl_data_type = Combine(data_type + Opt(Word('*')))
        ident = Word(alphas+'_', alphanums+'_')
        number = pyparsing_common.number
        arg = Group(decl_data_type + ident)
        LPAR, RPAR = map(Suppress, "()")

        code_body = nested_expr('{', '}', ignore_expr=(quoted_string | c_style_comment))

        c_function = (decl_data_type("type")
                      + ident("name")
                      + LPAR + Opt(delimited_list(arg), [])("args") + RPAR
                      + code_body("body"))
        c_function.ignore(c_style_comment)

        source_code = '''
            int is_odd(int x) {
                return (x%2);
            }

            int dec_to_hex(char hchar) {
                if (hchar >= '0' && hchar <= '9') {
                    return (ord(hchar)-ord('0'));
                } else {
                    return (10+ord(hchar)-ord('A'));
                }
            }
        '''
        for func in c_function.search_string(source_code):
            print("%(name)s (%(type)s) args: %(args)s" % func)


    prints::

        is_odd (int) args: [['int', 'x']]
        dec_to_hex (int) args: [['char', 'hchar']]
    """
    # reconcile the PEP8 and pre-PEP8 ignore arguments: if ignoreExpr was left
    # at its default (compares equal to a fresh quoted_string), prefer
    # ignore_expr; NOTE(review): this relies on ParserElement equality
    # semantics for the default comparison — confirm before changing
    if ignoreExpr != ignore_expr:
        ignoreExpr = ignore_expr if ignoreExpr == quoted_string() else ignoreExpr
    if opener == closer:
        raise ValueError("opening and closing strings cannot be the same")
    if content is None:
        # no content expression given - synthesize one from the delimiters
        if isinstance(opener, str_type) and isinstance(closer, str_type):
            if len(opener) == 1 and len(closer) == 1:
                if ignoreExpr is not None:
                    # single-char delimiters: content is a run of characters
                    # that are not delimiters or whitespace and do not start
                    # a match of ignoreExpr
                    content = Combine(
                        OneOrMore(
                            ~ignoreExpr
                            + CharsNotIn(
                                opener + closer + ParserElement.DEFAULT_WHITE_CHARS,
                                exact=1,
                            )
                        )
                    ).set_parse_action(lambda t: t[0].strip())
                else:
                    content = empty.copy() + CharsNotIn(
                        opener + closer + ParserElement.DEFAULT_WHITE_CHARS
                    ).set_parse_action(lambda t: t[0].strip())
            else:
                # multi-char delimiters: must reject the delimiter strings
                # explicitly, consuming one non-whitespace character at a time
                if ignoreExpr is not None:
                    content = Combine(
                        OneOrMore(
                            ~ignoreExpr
                            + ~Literal(opener)
                            + ~Literal(closer)
                            + CharsNotIn(ParserElement.DEFAULT_WHITE_CHARS, exact=1)
                        )
                    ).set_parse_action(lambda t: t[0].strip())
                else:
                    content = Combine(
                        OneOrMore(
                            ~Literal(opener)
                            + ~Literal(closer)
                            + CharsNotIn(ParserElement.DEFAULT_WHITE_CHARS, exact=1)
                        )
                    ).set_parse_action(lambda t: t[0].strip())
        else:
            raise ValueError(
                "opening and closing arguments must be strings if no content expression is given"
            )
    # recursive definition: a nested group is opener + any mix of ignored
    # text, nested groups, and content + closer
    ret = Forward()
    if ignoreExpr is not None:
        ret <<= Group(
            Suppress(opener) + ZeroOrMore(ignoreExpr | ret | content) + Suppress(closer)
        )
    else:
        ret <<= Group(Suppress(opener) + ZeroOrMore(ret | content) + Suppress(closer))
    ret.set_name("nested %s%s expression" % (opener, closer))
    return ret
     588  
     589  
def _makeTags(tagStr, xml, suppress_LT=Suppress("<"), suppress_GT=Suppress(">")):
    """Internal helper to construct opening and closing tag expressions, given a tag name.

    Returns a (openTag, closeTag) pair. ``xml=True`` matches the tag
    case-sensitively and requires double-quoted attribute values;
    ``xml=False`` (HTML mode) matches caselessly, lowercases attribute
    names, and allows unquoted or missing attribute values.
    """
    if isinstance(tagStr, str_type):
        resname = tagStr
        tagStr = Keyword(tagStr, caseless=not xml)
    else:
        # already a ParserElement; use its display name for results names
        resname = tagStr.name

    tagAttrName = Word(alphas, alphanums + "_-:")
    if xml:
        # XML mode: attribute values must be double-quoted
        tagAttrValue = dbl_quoted_string.copy().set_parse_action(remove_quotes)
        openTag = (
            suppress_LT
            + tagStr("tag")
            + Dict(ZeroOrMore(Group(tagAttrName + Suppress("=") + tagAttrValue)))
            # "empty" is True for self-closing tags like <br/>
            + Opt("/", default=[False])("empty").set_parse_action(
                lambda s, l, t: t[0] == "/"
            )
            + suppress_GT
        )
    else:
        # HTML mode: quoted or unquoted attribute values; attributes may also
        # appear with no value at all
        tagAttrValue = quoted_string.copy().set_parse_action(remove_quotes) | Word(
            printables, exclude_chars=">"
        )
        openTag = (
            suppress_LT
            + tagStr("tag")
            + Dict(
                ZeroOrMore(
                    Group(
                        # attribute names are normalized to lower case
                        tagAttrName.set_parse_action(lambda t: t[0].lower())
                        + Opt(Suppress("=") + tagAttrValue)
                    )
                )
            )
            # "empty" is True for self-closing tags like <br/>
            + Opt("/", default=[False])("empty").set_parse_action(
                lambda s, l, t: t[0] == "/"
            )
            + suppress_GT
        )
    closeTag = Combine(Literal("</") + tagStr + ">", adjacent=False)

    openTag.set_name("<%s>" % resname)
    # add start<tagname> results name in parse action now that ungrouped names are not reported at two levels
    openTag.add_parse_action(
        lambda t: t.__setitem__(
            "start" + "".join(resname.replace(":", " ").title().split()), t.copy()
        )
    )
    closeTag = closeTag(
        "end" + "".join(resname.replace(":", " ").title().split())
    ).set_name("</%s>" % resname)
    # expose the tag name and a ready-made body-skipping expression on both tags
    openTag.tag = resname
    closeTag.tag = resname
    openTag.tag_body = SkipTo(closeTag())
    return openTag, closeTag
     646  
     647  
     648  def make_html_tags(
     649      tag_str: Union[str, ParserElement]
     650  ) -> Tuple[ParserElement, ParserElement]:
     651      """Helper to construct opening and closing tag expressions for HTML,
     652      given a tag name. Matches tags in either upper or lower case,
     653      attributes with namespaces and with quoted or unquoted values.
     654  
     655      Example::
     656  
     657          text = '<td>More info at the <a href="https://github.com/pyparsing/pyparsing/wiki">pyparsing</a> wiki page</td>'
     658          # make_html_tags returns pyparsing expressions for the opening and
     659          # closing tags as a 2-tuple
     660          a, a_end = make_html_tags("A")
     661          link_expr = a + SkipTo(a_end)("link_text") + a_end
     662  
     663          for link in link_expr.search_string(text):
     664              # attributes in the <A> tag (like "href" shown here) are
     665              # also accessible as named results
     666              print(link.link_text, '->', link.href)
     667  
     668      prints::
     669  
     670          pyparsing -> https://github.com/pyparsing/pyparsing/wiki
     671      """
     672      return _makeTags(tag_str, False)
     673  
     674  
     675  def make_xml_tags(
     676      tag_str: Union[str, ParserElement]
     677  ) -> Tuple[ParserElement, ParserElement]:
     678      """Helper to construct opening and closing tag expressions for XML,
     679      given a tag name. Matches tags only in the given upper/lower case.
     680  
     681      Example: similar to :class:`make_html_tags`
     682      """
     683      return _makeTags(tag_str, True)
     684  
     685  
# ready-made expressions matching any opening or closing HTML tag
any_open_tag: ParserElement
any_close_tag: ParserElement
any_open_tag, any_close_tag = make_html_tags(
    Word(alphas, alphanums + "_:").set_name("any tag")
)

# map of HTML5 named entities (without the trailing ';') to replacement text
_htmlEntityMap = {k.rstrip(";"): v for k, v in html.entities.html5.items()}
# matches any known named entity, e.g. "&nbsp;"; the name is captured in the
# "entity" results name for use by replace_html_entity
common_html_entity = Regex("&(?P<entity>" + "|".join(_htmlEntityMap) + ");").set_name(
    "common HTML entity"
)
     696  
     697  
     698  def replace_html_entity(t):
     699      """Helper parser action to replace common HTML entities with their special characters"""
     700      return _htmlEntityMap.get(t.entity)
     701  
     702  
     703  class ESC[4;38;5;81mOpAssoc(ESC[4;38;5;149mEnum):
     704      LEFT = 1
     705      RIGHT = 2
     706  
     707  
# The "operator expression" element of an infix_notation operator spec:
# a single expression or string, or - for ternary operators - a 2-tuple of
# expressions/strings for the two separating operators.
InfixNotationOperatorArgType = Union[
    ParserElement, str, Tuple[Union[ParserElement, str], Union[ParserElement, str]]
]
# One precedence level passed to infix_notation:
# (op_expr, num_operands, associativity[, parse_action]) - the trailing
# parse-action element is optional.
InfixNotationOperatorSpec = Union[
    Tuple[
        InfixNotationOperatorArgType,
        int,
        OpAssoc,
        typing.Optional[ParseAction],
    ],
    Tuple[
        InfixNotationOperatorArgType,
        int,
        OpAssoc,
    ],
]
     724  
     725  
     726  def infix_notation(
     727      base_expr: ParserElement,
     728      op_list: List[InfixNotationOperatorSpec],
     729      lpar: Union[str, ParserElement] = Suppress("("),
     730      rpar: Union[str, ParserElement] = Suppress(")"),
     731  ) -> ParserElement:
     732      """Helper method for constructing grammars of expressions made up of
     733      operators working in a precedence hierarchy.  Operators may be unary
     734      or binary, left- or right-associative.  Parse actions can also be
     735      attached to operator expressions. The generated parser will also
     736      recognize the use of parentheses to override operator precedences
     737      (see example below).
     738  
     739      Note: if you define a deep operator list, you may see performance
     740      issues when using infix_notation. See
     741      :class:`ParserElement.enable_packrat` for a mechanism to potentially
     742      improve your parser performance.
     743  
     744      Parameters:
     745      - ``base_expr`` - expression representing the most basic operand to
     746        be used in the expression
     747      - ``op_list`` - list of tuples, one for each operator precedence level
     748        in the expression grammar; each tuple is of the form ``(op_expr,
     749        num_operands, right_left_assoc, (optional)parse_action)``, where:
     750  
     751        - ``op_expr`` is the pyparsing expression for the operator; may also
     752          be a string, which will be converted to a Literal; if ``num_operands``
     753          is 3, ``op_expr`` is a tuple of two expressions, for the two
     754          operators separating the 3 terms
     755        - ``num_operands`` is the number of terms for this operator (must be 1,
     756          2, or 3)
     757        - ``right_left_assoc`` is the indicator whether the operator is right
     758          or left associative, using the pyparsing-defined constants
     759          ``OpAssoc.RIGHT`` and ``OpAssoc.LEFT``.
     760        - ``parse_action`` is the parse action to be associated with
     761          expressions matching this operator expression (the parse action
     762          tuple member may be omitted); if the parse action is passed
     763          a tuple or list of functions, this is equivalent to calling
     764          ``set_parse_action(*fn)``
     765          (:class:`ParserElement.set_parse_action`)
     766      - ``lpar`` - expression for matching left-parentheses; if passed as a
     767        str, then will be parsed as Suppress(lpar). If lpar is passed as
     768        an expression (such as ``Literal('(')``), then it will be kept in
     769        the parsed results, and grouped with them. (default= ``Suppress('(')``)
     770      - ``rpar`` - expression for matching right-parentheses; if passed as a
     771        str, then will be parsed as Suppress(rpar). If rpar is passed as
     772        an expression (such as ``Literal(')')``), then it will be kept in
     773        the parsed results, and grouped with them. (default= ``Suppress(')')``)
     774  
     775      Example::
     776  
     777          # simple example of four-function arithmetic with ints and
     778          # variable names
     779          integer = pyparsing_common.signed_integer
     780          varname = pyparsing_common.identifier
     781  
     782          arith_expr = infix_notation(integer | varname,
     783              [
     784              ('-', 1, OpAssoc.RIGHT),
     785              (one_of('* /'), 2, OpAssoc.LEFT),
     786              (one_of('+ -'), 2, OpAssoc.LEFT),
     787              ])
     788  
     789          arith_expr.run_tests('''
     790              5+3*6
     791              (5+3)*6
     792              -2--11
     793              ''', full_dump=False)
     794  
     795      prints::
     796  
     797          5+3*6
     798          [[5, '+', [3, '*', 6]]]
     799  
     800          (5+3)*6
     801          [[[5, '+', 3], '*', 6]]
     802  
     803          -2--11
     804          [[['-', 2], '-', ['-', 11]]]
     805      """
     806      # captive version of FollowedBy that does not do parse actions or capture results names
     807      class ESC[4;38;5;81m_FB(ESC[4;38;5;149mFollowedBy):
     808          def parseImpl(self, instring, loc, doActions=True):
     809              self.expr.try_parse(instring, loc)
     810              return loc, []
     811  
     812      _FB.__name__ = "FollowedBy>"
     813  
     814      ret = Forward()
     815      if isinstance(lpar, str):
     816          lpar = Suppress(lpar)
     817      if isinstance(rpar, str):
     818          rpar = Suppress(rpar)
     819  
     820      # if lpar and rpar are not suppressed, wrap in group
     821      if not (isinstance(rpar, Suppress) and isinstance(rpar, Suppress)):
     822          lastExpr = base_expr | Group(lpar + ret + rpar)
     823      else:
     824          lastExpr = base_expr | (lpar + ret + rpar)
     825  
     826      for i, operDef in enumerate(op_list):
     827          opExpr, arity, rightLeftAssoc, pa = (operDef + (None,))[:4]
     828          if isinstance(opExpr, str_type):
     829              opExpr = ParserElement._literalStringClass(opExpr)
     830          if arity == 3:
     831              if not isinstance(opExpr, (tuple, list)) or len(opExpr) != 2:
     832                  raise ValueError(
     833                      "if numterms=3, opExpr must be a tuple or list of two expressions"
     834                  )
     835              opExpr1, opExpr2 = opExpr
     836              term_name = "{}{} term".format(opExpr1, opExpr2)
     837          else:
     838              term_name = "{} term".format(opExpr)
     839  
     840          if not 1 <= arity <= 3:
     841              raise ValueError("operator must be unary (1), binary (2), or ternary (3)")
     842  
     843          if rightLeftAssoc not in (OpAssoc.LEFT, OpAssoc.RIGHT):
     844              raise ValueError("operator must indicate right or left associativity")
     845  
     846          thisExpr: Forward = Forward().set_name(term_name)
     847          if rightLeftAssoc is OpAssoc.LEFT:
     848              if arity == 1:
     849                  matchExpr = _FB(lastExpr + opExpr) + Group(lastExpr + opExpr[1, ...])
     850              elif arity == 2:
     851                  if opExpr is not None:
     852                      matchExpr = _FB(lastExpr + opExpr + lastExpr) + Group(
     853                          lastExpr + (opExpr + lastExpr)[1, ...]
     854                      )
     855                  else:
     856                      matchExpr = _FB(lastExpr + lastExpr) + Group(lastExpr[2, ...])
     857              elif arity == 3:
     858                  matchExpr = _FB(
     859                      lastExpr + opExpr1 + lastExpr + opExpr2 + lastExpr
     860                  ) + Group(lastExpr + OneOrMore(opExpr1 + lastExpr + opExpr2 + lastExpr))
     861          elif rightLeftAssoc is OpAssoc.RIGHT:
     862              if arity == 1:
     863                  # try to avoid LR with this extra test
     864                  if not isinstance(opExpr, Opt):
     865                      opExpr = Opt(opExpr)
     866                  matchExpr = _FB(opExpr.expr + thisExpr) + Group(opExpr + thisExpr)
     867              elif arity == 2:
     868                  if opExpr is not None:
     869                      matchExpr = _FB(lastExpr + opExpr + thisExpr) + Group(
     870                          lastExpr + (opExpr + thisExpr)[1, ...]
     871                      )
     872                  else:
     873                      matchExpr = _FB(lastExpr + thisExpr) + Group(
     874                          lastExpr + thisExpr[1, ...]
     875                      )
     876              elif arity == 3:
     877                  matchExpr = _FB(
     878                      lastExpr + opExpr1 + thisExpr + opExpr2 + thisExpr
     879                  ) + Group(lastExpr + opExpr1 + thisExpr + opExpr2 + thisExpr)
     880          if pa:
     881              if isinstance(pa, (tuple, list)):
     882                  matchExpr.set_parse_action(*pa)
     883              else:
     884                  matchExpr.set_parse_action(pa)
     885          thisExpr <<= (matchExpr | lastExpr).setName(term_name)
     886          lastExpr = thisExpr
     887      ret <<= lastExpr
     888      return ret
     889  
     890  
def indentedBlock(blockStatementExpr, indentStack, indent=True, backup_stacks=[]):
    """
    (DEPRECATED - use IndentedBlock class instead)
    Helper method for defining space-delimited indentation blocks,
    such as those used to define block statements in Python source code.

    Parameters:

    - ``blockStatementExpr`` - expression defining syntax of statement that
      is repeated within the indented block
    - ``indentStack`` - list created by caller to manage indentation stack
      (multiple ``statementWithIndentedBlock`` expressions within a single
      grammar should share a common ``indentStack``)
    - ``indent`` - boolean indicating whether block must be indented beyond
      the current level; set to ``False`` for block of left-most statements
      (default= ``True``)

    A valid block must contain at least one ``blockStatement``.

    (Note that indentedBlock uses internal parse actions which make it
    incompatible with packrat parsing.)

    Example::

        data = '''
        def A(z):
          A1
          B = 100
          G = A2
          A2
          A3
        B
        def BB(a,b,c):
          BB1
          def BBA():
            bba1
            bba2
            bba3
        C
        D
        def spam(x,y):
             def eggs(z):
                 pass
        '''


        indentStack = [1]
        stmt = Forward()

        identifier = Word(alphas, alphanums)
        funcDecl = ("def" + identifier + Group("(" + Opt(delimitedList(identifier)) + ")") + ":")
        func_body = indentedBlock(stmt, indentStack)
        funcDef = Group(funcDecl + func_body)

        rvalue = Forward()
        funcCall = Group(identifier + "(" + Opt(delimitedList(rvalue)) + ")")
        rvalue << (funcCall | identifier | Word(nums))
        assignment = Group(identifier + "=" + rvalue)
        stmt << (funcDef | assignment | identifier)

        module_body = stmt[1, ...]

        parseTree = module_body.parseString(data)
        parseTree.pprint()

    prints::

        [['def',
          'A',
          ['(', 'z', ')'],
          ':',
          [['A1'], [['B', '=', '100']], [['G', '=', 'A2']], ['A2'], ['A3']]],
         'B',
         ['def',
          'BB',
          ['(', 'a', 'b', 'c', ')'],
          ':',
          [['BB1'], [['def', 'BBA', ['(', ')'], ':', [['bba1'], ['bba2'], ['bba3']]]]]],
         'C',
         'D',
         ['def',
          'spam',
          ['(', 'x', 'y', ')'],
          ':',
          [[['def', 'eggs', ['(', 'z', ')'], ':', [['pass']]]]]]]
    """
    # NOTE(review): the mutable default for ``backup_stacks`` is shared across
    # calls - it looks like deliberate persistent storage for stack snapshots
    # (a snapshot is pushed here and popped by the parse action below);
    # confirm before "fixing" the mutable-default idiom.
    backup_stacks.append(indentStack[:])

    # restore the caller's indentStack from the most recent snapshot;
    # installed as the fail action on smExpr below
    def reset_stack():
        indentStack[:] = backup_stacks[-1]

    # parse action: current line must start at exactly the current
    # indentation level (top of indentStack)
    def checkPeerIndent(s, l, t):
        if l >= len(s):
            return
        curCol = col(l, s)
        if curCol != indentStack[-1]:
            if curCol > indentStack[-1]:
                raise ParseException(s, l, "illegal nesting")
            raise ParseException(s, l, "not a peer entry")

    # parse action: current line must be indented deeper than the current
    # level; the new level is pushed onto indentStack
    def checkSubIndent(s, l, t):
        curCol = col(l, s)
        if curCol > indentStack[-1]:
            indentStack.append(curCol)
        else:
            raise ParseException(s, l, "not a subentry")

    # parse action: current column must match some enclosing indentation
    # level; pops one level when the column is shallower than the top
    def checkUnindent(s, l, t):
        if l >= len(s):
            return
        curCol = col(l, s)
        if not (indentStack and curCol in indentStack):
            raise ParseException(s, l, "not an unindent")
        if curCol < indentStack[-1]:
            indentStack.pop()

    # newline handling: tabs/spaces only, so leading indentation survives
    # for the col() checks above
    NL = OneOrMore(LineEnd().set_whitespace_chars("\t ").suppress())
    INDENT = (Empty() + Empty().set_parse_action(checkSubIndent)).set_name("INDENT")
    PEER = Empty().set_parse_action(checkPeerIndent).set_name("")
    UNDENT = Empty().set_parse_action(checkUnindent).set_name("UNINDENT")
    if indent:
        smExpr = Group(
            Opt(NL)
            + INDENT
            + OneOrMore(PEER + Group(blockStatementExpr) + Opt(NL))
            + UNDENT
        )
    else:
        smExpr = Group(
            Opt(NL)
            + OneOrMore(PEER + Group(blockStatementExpr) + Opt(NL))
            + Opt(UNDENT)
        )

    # add a parse action to remove backup_stack from list of backups
    smExpr.add_parse_action(
        lambda: backup_stacks.pop(-1) and None if backup_stacks else None
    )
    smExpr.set_fail_action(lambda a, b, c, d: reset_stack())
    # treat backslash-newline as an ignorable line continuation within
    # the block statement
    blockStatementExpr.ignore(_bslash + LineEnd())
    return smExpr.set_name("indented block")
    1032  
    1033  
# it's easy to get these comment structures wrong - they're very common, so may as well make them available
c_style_comment = Combine(Regex(r"/\*(?:[^*]|\*(?!/))*") + "*/").set_name(
    "C style comment"
)
"Comment of the form ``/* ... */``"

html_comment = Regex(r"<!--[\s\S]*?-->").set_name("HTML comment")
"Comment of the form ``<!-- ... -->``"

# rest_of_line matches to (but not including) the next newline;
# leave_whitespace keeps leading spaces in the match
rest_of_line = Regex(r".*").leave_whitespace().set_name("rest of line")
# the regex allows backslash-newline continuations inside a // comment
dbl_slash_comment = Regex(r"//(?:\\\n|[^\n])*").set_name("// comment")
"Comment of the form ``// ... (to end of line)``"

# NOTE(review): the /* ... */ regex here duplicates the one used in
# c_style_comment above - keep the two in sync if either changes
cpp_style_comment = Combine(
    Regex(r"/\*(?:[^*]|\*(?!/))*") + "*/" | dbl_slash_comment
).set_name("C++ style comment")
"Comment of either form :class:`c_style_comment` or :class:`dbl_slash_comment`"

java_style_comment = cpp_style_comment
"Same as :class:`cpp_style_comment`"

python_style_comment = Regex(r"#.*").set_name("Python style comment")
"Comment of the form ``# ... (to end of line)``"
    1057  
    1058  
    1059  # build list of built-in expressions, for future reference if a global default value
    1060  # gets updated
    1061  _builtin_exprs: List[ParserElement] = [
    1062      v for v in vars().values() if isinstance(v, ParserElement)
    1063  ]
    1064  
    1065  
# pre-PEP8 compatible names
# (camelCase aliases retained for backward compatibility; each simply
# rebinds the snake_case name defined above)
delimitedList = delimited_list
countedArray = counted_array
matchPreviousLiteral = match_previous_literal
matchPreviousExpr = match_previous_expr
oneOf = one_of
dictOf = dict_of
originalTextFor = original_text_for
nestedExpr = nested_expr
makeHTMLTags = make_html_tags
makeXMLTags = make_xml_tags
anyOpenTag, anyCloseTag = any_open_tag, any_close_tag
commonHTMLEntity = common_html_entity
replaceHTMLEntity = replace_html_entity
opAssoc = OpAssoc
infixNotation = infix_notation
cStyleComment = c_style_comment
htmlComment = html_comment
restOfLine = rest_of_line
dblSlashComment = dbl_slash_comment
cppStyleComment = cpp_style_comment
javaStyleComment = java_style_comment
pythonStyleComment = python_style_comment