1  # This module implements the RFCs 3490 (IDNA) and 3491 (Nameprep)
       2  
       3  import stringprep, re, codecs
       4  from unicodedata import ucd_3_2_0 as unicodedata
       5  
       6  # IDNA section 3.1
       7  dots = re.compile("[\u002E\u3002\uFF0E\uFF61]")
       8  
       9  # IDNA section 5
      10  ace_prefix = b"xn--"
      11  sace_prefix = "xn--"
      12  
      13  # This assumes query strings, so AllowUnassigned is true
      14  def nameprep(label):
      15      # Map
      16      newlabel = []
      17      for c in label:
      18          if stringprep.in_table_b1(c):
      19              # Map to nothing
      20              continue
      21          newlabel.append(stringprep.map_table_b2(c))
      22      label = "".join(newlabel)
      23  
      24      # Normalize
      25      label = unicodedata.normalize("NFKC", label)
      26  
      27      # Prohibit
      28      for c in label:
      29          if stringprep.in_table_c12(c) or \
      30             stringprep.in_table_c22(c) or \
      31             stringprep.in_table_c3(c) or \
      32             stringprep.in_table_c4(c) or \
      33             stringprep.in_table_c5(c) or \
      34             stringprep.in_table_c6(c) or \
      35             stringprep.in_table_c7(c) or \
      36             stringprep.in_table_c8(c) or \
      37             stringprep.in_table_c9(c):
      38              raise UnicodeError("Invalid character %r" % c)
      39  
      40      # Check bidi
      41      RandAL = [stringprep.in_table_d1(x) for x in label]
      42      if any(RandAL):
      43          # There is a RandAL char in the string. Must perform further
      44          # tests:
      45          # 1) The characters in section 5.8 MUST be prohibited.
      46          # This is table C.8, which was already checked
      47          # 2) If a string contains any RandALCat character, the string
      48          # MUST NOT contain any LCat character.
      49          if any(stringprep.in_table_d2(x) for x in label):
      50              raise UnicodeError("Violation of BIDI requirement 2")
      51          # 3) If a string contains any RandALCat character, a
      52          # RandALCat character MUST be the first character of the
      53          # string, and a RandALCat character MUST be the last
      54          # character of the string.
      55          if not RandAL[0] or not RandAL[-1]:
      56              raise UnicodeError("Violation of BIDI requirement 3")
      57  
      58      return label
      59  
      60  def ToASCII(label):
      61      try:
      62          # Step 1: try ASCII
      63          label = label.encode("ascii")
      64      except UnicodeError:
      65          pass
      66      else:
      67          # Skip to step 3: UseSTD3ASCIIRules is false, so
      68          # Skip to step 8.
      69          if 0 < len(label) < 64:
      70              return label
      71          raise UnicodeError("label empty or too long")
      72  
      73      # Step 2: nameprep
      74      label = nameprep(label)
      75  
      76      # Step 3: UseSTD3ASCIIRules is false
      77      # Step 4: try ASCII
      78      try:
      79          label = label.encode("ascii")
      80      except UnicodeError:
      81          pass
      82      else:
      83          # Skip to step 8.
      84          if 0 < len(label) < 64:
      85              return label
      86          raise UnicodeError("label empty or too long")
      87  
      88      # Step 5: Check ACE prefix
      89      if label.startswith(sace_prefix):
      90          raise UnicodeError("Label starts with ACE prefix")
      91  
      92      # Step 6: Encode with PUNYCODE
      93      label = label.encode("punycode")
      94  
      95      # Step 7: Prepend ACE prefix
      96      label = ace_prefix + label
      97  
      98      # Step 8: Check size
      99      if 0 < len(label) < 64:
     100          return label
     101      raise UnicodeError("label empty or too long")
     102  
     103  def ToUnicode(label):
     104      if len(label) > 1024:
     105          # Protection from https://github.com/python/cpython/issues/98433.
     106          # https://datatracker.ietf.org/doc/html/rfc5894#section-6
     107          # doesn't specify a label size limit prior to NAMEPREP. But having
     108          # one makes practical sense.
     109          # This leaves ample room for nameprep() to remove Nothing characters
     110          # per https://www.rfc-editor.org/rfc/rfc3454#section-3.1 while still
     111          # preventing us from wasting time decoding a big thing that'll just
     112          # hit the actual <= 63 length limit in Step 6.
     113          raise UnicodeError("label way too long")
     114      # Step 1: Check for ASCII
     115      if isinstance(label, bytes):
     116          pure_ascii = True
     117      else:
     118          try:
     119              label = label.encode("ascii")
     120              pure_ascii = True
     121          except UnicodeError:
     122              pure_ascii = False
     123      if not pure_ascii:
     124          # Step 2: Perform nameprep
     125          label = nameprep(label)
     126          # It doesn't say this, but apparently, it should be ASCII now
     127          try:
     128              label = label.encode("ascii")
     129          except UnicodeError:
     130              raise UnicodeError("Invalid character in IDN label")
     131      # Step 3: Check for ACE prefix
     132      if not label.startswith(ace_prefix):
     133          return str(label, "ascii")
     134  
     135      # Step 4: Remove ACE prefix
     136      label1 = label[len(ace_prefix):]
     137  
     138      # Step 5: Decode using PUNYCODE
     139      result = label1.decode("punycode")
     140  
     141      # Step 6: Apply ToASCII
     142      label2 = ToASCII(result)
     143  
     144      # Step 7: Compare the result of step 6 with the one of step 3
     145      # label2 will already be in lower case.
     146      if str(label, "ascii").lower() != str(label2, "ascii"):
     147          raise UnicodeError("IDNA does not round-trip", label, label2)
     148  
     149      # Step 8: return the result of step 5
     150      return result
     151  
     152  ### Codec APIs
     153  
     154  class ESC[4;38;5;81mCodec(ESC[4;38;5;149mcodecsESC[4;38;5;149m.ESC[4;38;5;149mCodec):
     155      def encode(self, input, errors='strict'):
     156  
     157          if errors != 'strict':
     158              # IDNA is quite clear that implementations must be strict
     159              raise UnicodeError("unsupported error handling "+errors)
     160  
     161          if not input:
     162              return b'', 0
     163  
     164          try:
     165              result = input.encode('ascii')
     166          except UnicodeEncodeError:
     167              pass
     168          else:
     169              # ASCII name: fast path
     170              labels = result.split(b'.')
     171              for label in labels[:-1]:
     172                  if not (0 < len(label) < 64):
     173                      raise UnicodeError("label empty or too long")
     174              if len(labels[-1]) >= 64:
     175                  raise UnicodeError("label too long")
     176              return result, len(input)
     177  
     178          result = bytearray()
     179          labels = dots.split(input)
     180          if labels and not labels[-1]:
     181              trailing_dot = b'.'
     182              del labels[-1]
     183          else:
     184              trailing_dot = b''
     185          for label in labels:
     186              if result:
     187                  # Join with U+002E
     188                  result.extend(b'.')
     189              result.extend(ToASCII(label))
     190          return bytes(result+trailing_dot), len(input)
     191  
     192      def decode(self, input, errors='strict'):
     193  
     194          if errors != 'strict':
     195              raise UnicodeError("Unsupported error handling "+errors)
     196  
     197          if not input:
     198              return "", 0
     199  
     200          # IDNA allows decoding to operate on Unicode strings, too.
     201          if not isinstance(input, bytes):
     202              # XXX obviously wrong, see #3232
     203              input = bytes(input)
     204  
     205          if ace_prefix not in input:
     206              # Fast path
     207              try:
     208                  return input.decode('ascii'), len(input)
     209              except UnicodeDecodeError:
     210                  pass
     211  
     212          labels = input.split(b".")
     213  
     214          if labels and len(labels[-1]) == 0:
     215              trailing_dot = '.'
     216              del labels[-1]
     217          else:
     218              trailing_dot = ''
     219  
     220          result = []
     221          for label in labels:
     222              result.append(ToUnicode(label))
     223  
     224          return ".".join(result)+trailing_dot, len(input)
     225  
     226  class ESC[4;38;5;81mIncrementalEncoder(ESC[4;38;5;149mcodecsESC[4;38;5;149m.ESC[4;38;5;149mBufferedIncrementalEncoder):
     227      def _buffer_encode(self, input, errors, final):
     228          if errors != 'strict':
     229              # IDNA is quite clear that implementations must be strict
     230              raise UnicodeError("unsupported error handling "+errors)
     231  
     232          if not input:
     233              return (b'', 0)
     234  
     235          labels = dots.split(input)
     236          trailing_dot = b''
     237          if labels:
     238              if not labels[-1]:
     239                  trailing_dot = b'.'
     240                  del labels[-1]
     241              elif not final:
     242                  # Keep potentially unfinished label until the next call
     243                  del labels[-1]
     244                  if labels:
     245                      trailing_dot = b'.'
     246  
     247          result = bytearray()
     248          size = 0
     249          for label in labels:
     250              if size:
     251                  # Join with U+002E
     252                  result.extend(b'.')
     253                  size += 1
     254              result.extend(ToASCII(label))
     255              size += len(label)
     256  
     257          result += trailing_dot
     258          size += len(trailing_dot)
     259          return (bytes(result), size)
     260  
     261  class ESC[4;38;5;81mIncrementalDecoder(ESC[4;38;5;149mcodecsESC[4;38;5;149m.ESC[4;38;5;149mBufferedIncrementalDecoder):
     262      def _buffer_decode(self, input, errors, final):
     263          if errors != 'strict':
     264              raise UnicodeError("Unsupported error handling "+errors)
     265  
     266          if not input:
     267              return ("", 0)
     268  
     269          # IDNA allows decoding to operate on Unicode strings, too.
     270          if isinstance(input, str):
     271              labels = dots.split(input)
     272          else:
     273              # Must be ASCII string
     274              input = str(input, "ascii")
     275              labels = input.split(".")
     276  
     277          trailing_dot = ''
     278          if labels:
     279              if not labels[-1]:
     280                  trailing_dot = '.'
     281                  del labels[-1]
     282              elif not final:
     283                  # Keep potentially unfinished label until the next call
     284                  del labels[-1]
     285                  if labels:
     286                      trailing_dot = '.'
     287  
     288          result = []
     289          size = 0
     290          for label in labels:
     291              result.append(ToUnicode(label))
     292              if size:
     293                  size += 1
     294              size += len(label)
     295  
     296          result = ".".join(result) + trailing_dot
     297          size += len(trailing_dot)
     298          return (result, size)
     299  
     300  class ESC[4;38;5;81mStreamWriter(ESC[4;38;5;149mCodec,ESC[4;38;5;149mcodecsESC[4;38;5;149m.ESC[4;38;5;149mStreamWriter):
     301      pass
     302  
     303  class ESC[4;38;5;81mStreamReader(ESC[4;38;5;149mCodec,ESC[4;38;5;149mcodecsESC[4;38;5;149m.ESC[4;38;5;149mStreamReader):
     304      pass
     305  
     306  ### encodings module API
     307  
     308  def getregentry():
     309      return codecs.CodecInfo(
     310          name='idna',
     311          encode=Codec().encode,
     312          decode=Codec().decode,
     313          incrementalencoder=IncrementalEncoder,
     314          incrementaldecoder=IncrementalDecoder,
     315          streamwriter=StreamWriter,
     316          streamreader=StreamReader,
     317      )