python (3.11.7)

(root)/
lib/
python3.11/
html/
__init__.py
       1  """
       2  General functions for HTML manipulation.
       3  """
       4  
       5  import re as _re
       6  from html.entities import html5 as _html5
       7  
       8  
# Public names of this module: the only ones bound by "from html import *".
__all__ = ['escape', 'unescape']
      10  
      11  
      12  def escape(s, quote=True):
      13      """
      14      Replace special characters "&", "<" and ">" to HTML-safe sequences.
      15      If the optional flag quote is true (the default), the quotation mark
      16      characters, both double quote (") and single quote (') characters are also
      17      translated.
      18      """
      19      s = s.replace("&", "&amp;") # Must be done first!
      20      s = s.replace("<", "&lt;")
      21      s = s.replace(">", "&gt;")
      22      if quote:
      23          s = s.replace('"', "&quot;")
      24          s = s.replace('\'', "&#x27;")
      25      return s
      26  
      27  
# see https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-end-state

# Numeric character references that are replaced by a fixed character rather
# than by the code point they spell out; _replace_charref() returns the
# mapped value for these numbers.  0x00 becomes U+FFFD and 0x0D stays a
# carriage return.  The entries for 0x80-0x9F follow the Windows-1252 reading
# of those values; the ones marked <control> (0x81, 0x8D, 0x8F, 0x90, 0x9D)
# map to the control character with the same code point.
_invalid_charrefs = {
    0x00: '\ufffd',  # REPLACEMENT CHARACTER
    0x0d: '\r',      # CARRIAGE RETURN
    0x80: '\u20ac',  # EURO SIGN
    0x81: '\x81',    # <control>
    0x82: '\u201a',  # SINGLE LOW-9 QUOTATION MARK
    0x83: '\u0192',  # LATIN SMALL LETTER F WITH HOOK
    0x84: '\u201e',  # DOUBLE LOW-9 QUOTATION MARK
    0x85: '\u2026',  # HORIZONTAL ELLIPSIS
    0x86: '\u2020',  # DAGGER
    0x87: '\u2021',  # DOUBLE DAGGER
    0x88: '\u02c6',  # MODIFIER LETTER CIRCUMFLEX ACCENT
    0x89: '\u2030',  # PER MILLE SIGN
    0x8a: '\u0160',  # LATIN CAPITAL LETTER S WITH CARON
    0x8b: '\u2039',  # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
    0x8c: '\u0152',  # LATIN CAPITAL LIGATURE OE
    0x8d: '\x8d',    # <control>
    0x8e: '\u017d',  # LATIN CAPITAL LETTER Z WITH CARON
    0x8f: '\x8f',    # <control>
    0x90: '\x90',    # <control>
    0x91: '\u2018',  # LEFT SINGLE QUOTATION MARK
    0x92: '\u2019',  # RIGHT SINGLE QUOTATION MARK
    0x93: '\u201c',  # LEFT DOUBLE QUOTATION MARK
    0x94: '\u201d',  # RIGHT DOUBLE QUOTATION MARK
    0x95: '\u2022',  # BULLET
    0x96: '\u2013',  # EN DASH
    0x97: '\u2014',  # EM DASH
    0x98: '\u02dc',  # SMALL TILDE
    0x99: '\u2122',  # TRADE MARK SIGN
    0x9a: '\u0161',  # LATIN SMALL LETTER S WITH CARON
    0x9b: '\u203a',  # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
    0x9c: '\u0153',  # LATIN SMALL LIGATURE OE
    0x9d: '\x9d',    # <control>
    0x9e: '\u017e',  # LATIN SMALL LETTER Z WITH CARON
    0x9f: '\u0178',  # LATIN CAPITAL LETTER Y WITH DIAERESIS
}
      66  
      67  _invalid_codepoints = {
      68      # 0x0001 to 0x0008
      69      0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8,
      70      # 0x000E to 0x001F
      71      0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,
      72      0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
      73      # 0x007F to 0x009F
      74      0x7f, 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a,
      75      0x8b, 0x8c, 0x8d, 0x8e, 0x8f, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96,
      76      0x97, 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
      77      # 0xFDD0 to 0xFDEF
      78      0xfdd0, 0xfdd1, 0xfdd2, 0xfdd3, 0xfdd4, 0xfdd5, 0xfdd6, 0xfdd7, 0xfdd8,
      79      0xfdd9, 0xfdda, 0xfddb, 0xfddc, 0xfddd, 0xfdde, 0xfddf, 0xfde0, 0xfde1,
      80      0xfde2, 0xfde3, 0xfde4, 0xfde5, 0xfde6, 0xfde7, 0xfde8, 0xfde9, 0xfdea,
      81      0xfdeb, 0xfdec, 0xfded, 0xfdee, 0xfdef,
      82      # others
      83      0xb, 0xfffe, 0xffff, 0x1fffe, 0x1ffff, 0x2fffe, 0x2ffff, 0x3fffe, 0x3ffff,
      84      0x4fffe, 0x4ffff, 0x5fffe, 0x5ffff, 0x6fffe, 0x6ffff, 0x7fffe, 0x7ffff,
      85      0x8fffe, 0x8ffff, 0x9fffe, 0x9ffff, 0xafffe, 0xaffff, 0xbfffe, 0xbffff,
      86      0xcfffe, 0xcffff, 0xdfffe, 0xdffff, 0xefffe, 0xeffff, 0xffffe, 0xfffff,
      87      0x10fffe, 0x10ffff
      88  }
      89  
      90  
      91  def _replace_charref(s):
      92      s = s.group(1)
      93      if s[0] == '#':
      94          # numeric charref
      95          if s[1] in 'xX':
      96              num = int(s[2:].rstrip(';'), 16)
      97          else:
      98              num = int(s[1:].rstrip(';'))
      99          if num in _invalid_charrefs:
     100              return _invalid_charrefs[num]
     101          if 0xD800 <= num <= 0xDFFF or num > 0x10FFFF:
     102              return '\uFFFD'
     103          if num in _invalid_codepoints:
     104              return ''
     105          return chr(num)
     106      else:
     107          # named charref
     108          if s in _html5:
     109              return _html5[s]
     110          # find the longest matching name (as defined by the standard)
     111          for x in range(len(s)-1, 1, -1):
     112              if s[:x] in _html5:
     113                  return _html5[s[:x]] + s[x:]
     114          else:
     115              return '&' + s
     116  
     117  
# Matches one candidate character reference.  Group 1 captures the text that
# follows the "&", in one of three forms (the closing ";" is optional in all
# of them):
#   - "#" and decimal digits, e.g. &#62;
#   - "#x" or "#X" and hexadecimal digits, e.g. &#x3e;
#   - 1 to 32 characters other than tab, newline, form feed, space, "<",
#     "&", "#" and ";", i.e. a possible named reference such as &gt;
_charref = _re.compile(r'&(#[0-9]+;?'
                       r'|#[xX][0-9a-fA-F]+;?'
                       r'|[^\t\n\f <&#;]{1,32};?)')
     121  
     122  def unescape(s):
     123      """
     124      Convert all named and numeric character references (e.g. &gt;, &#62;,
     125      &x3e;) in the string s to the corresponding unicode characters.
     126      This function uses the rules defined by the HTML 5 standard
     127      for both valid and invalid character references, and the list of
     128      HTML 5 named character references defined in html.entities.html5.
     129      """
     130      if '&' not in s:
     131          return s
     132      return _charref.sub(_replace_charref, s)