1 # This module implements the RFCs 3490 (IDNA) and 3491 (Nameprep)
2
3 import stringprep, re, codecs
4 from unicodedata import ucd_3_2_0 as unicodedata
5
# IDNA section 3.1: the characters that are recognized as label
# separators -- U+002E (full stop), U+3002 (ideographic full stop),
# U+FF0E (fullwidth full stop) and U+FF61 (halfwidth ideographic
# full stop).
dots = re.compile("[\u002E\u3002\uFF0E\uFF61]")

# IDNA section 5: the ACE prefix, kept once as bytes (for labels that
# are already encoded) and once as str (for labels that are still text).
ace_prefix, sace_prefix = b"xn--", "xn--"
12
13 # This assumes query strings, so AllowUnassigned is true
def nameprep(label):
    """Apply the Nameprep profile (RFC 3491) of stringprep to *label*.

    The label is mapped (table B.1 characters are dropped, the rest goes
    through the table B.2 mapping), NFKC-normalized, checked against the
    prohibited-character tables and finally checked against the bidi
    requirements.

    Returns the prepared string.  Raises UnicodeError if the prepared
    label contains a prohibited character or violates one of the bidi
    requirements.
    """
    # Map: characters in table B.1 are mapped to nothing, everything
    # else is replaced by its table B.2 mapping.
    mapped = "".join(
        stringprep.map_table_b2(ch)
        for ch in label
        if not stringprep.in_table_b1(ch)
    )

    # Normalize
    prepared = unicodedata.normalize("NFKC", mapped)

    # Prohibit
    prohibited_tables = (
        stringprep.in_table_c12,
        stringprep.in_table_c22,
        stringprep.in_table_c3,
        stringprep.in_table_c4,
        stringprep.in_table_c5,
        stringprep.in_table_c6,
        stringprep.in_table_c7,
        stringprep.in_table_c8,
        stringprep.in_table_c9,
    )
    for ch in prepared:
        if any(in_table(ch) for in_table in prohibited_tables):
            raise UnicodeError("Invalid character %r" % ch)

    # Check bidi
    is_randal = [stringprep.in_table_d1(ch) for ch in prepared]
    if not any(is_randal):
        # No RandALCat character at all: nothing more to verify.
        return prepared

    # There is a RandAL char in the string. Must perform further tests:
    # 1) The characters in section 5.8 MUST be prohibited.
    #    This is table C.8, which was already checked.
    # 2) If a string contains any RandALCat character, the string
    #    MUST NOT contain any LCat character.
    if any(stringprep.in_table_d2(ch) for ch in prepared):
        raise UnicodeError("Violation of BIDI requirement 2")
    # 3) If a string contains any RandALCat character, a RandALCat
    #    character MUST be the first character of the string, and a
    #    RandALCat character MUST be the last character of the string.
    if not (is_randal[0] and is_randal[-1]):
        raise UnicodeError("Violation of BIDI requirement 3")

    return prepared
59
def ToASCII(label):
    """Convert one label to its ASCII-compatible form (RFC 3490 ToASCII).

    *label* is a str holding a single label.  The result is a bytes
    object of 1 to 63 bytes: the label itself when it is ASCII (either
    as given or after nameprep), otherwise the ACE prefix followed by
    the punycode encoding of the nameprepped label.

    Raises UnicodeError if the result would be empty or longer than 63
    bytes, if nameprep rejects the label, or if a non-ASCII label
    already starts with the ACE prefix.
    """
    def _sized(encoded):
        # Step 8: Check size
        if 0 < len(encoded) < 64:
            return encoded
        raise UnicodeError("label empty or too long")

    # Step 1: try ASCII
    try:
        encoded = label.encode("ascii")
    except UnicodeError:
        encoded = None
    if encoded is not None:
        # Skip to step 3: UseSTD3ASCIIRules is false, so
        # Skip to step 8.
        return _sized(encoded)

    # Step 2: nameprep
    prepared = nameprep(label)

    # Step 3: UseSTD3ASCIIRules is false
    # Step 4: try ASCII
    try:
        encoded = prepared.encode("ascii")
    except UnicodeError:
        encoded = None
    if encoded is not None:
        # Skip to step 8.
        return _sized(encoded)

    # Step 5: Check ACE prefix
    if prepared.startswith(sace_prefix):
        raise UnicodeError("Label starts with ACE prefix")

    # Step 6: Encode with PUNYCODE
    # Step 7: Prepend ACE prefix
    # Step 8: Check size
    return _sized(ace_prefix + prepared.encode("punycode"))
102
def ToUnicode(label):
    """Convert one label to its Unicode form (RFC 3490 ToUnicode).

    *label* is a single label given as str or bytes.  A label that does
    not start with the ACE prefix is returned as an ASCII-decoded str;
    a label that does is punycode-decoded and the decoded text is
    returned, provided it converts back to the same ACE label.

    The ACE prefix is matched case-insensitively: RFC 3490 section 5
    defines it as "xn--" or any capitalization thereof.

    Raises UnicodeError if the label is longer than 1024 characters, is
    not ASCII after nameprep, cannot be punycode-decoded, or does not
    round-trip through ToASCII.
    """
    if len(label) > 1024:
        # Protection from https://github.com/python/cpython/issues/98433.
        # https://datatracker.ietf.org/doc/html/rfc5894#section-6
        # doesn't specify a label size limit prior to NAMEPREP. But having
        # one makes practical sense.
        # This leaves ample room for nameprep() to remove Nothing characters
        # per https://www.rfc-editor.org/rfc/rfc3454#section-3.1 while still
        # preventing us from wasting time decoding a big thing that'll just
        # hit the actual <= 63 length limit in Step 6.
        raise UnicodeError("label way too long")
    # Step 1: Check for ASCII
    if isinstance(label, bytes):
        pure_ascii = True
    else:
        try:
            label = label.encode("ascii")
            pure_ascii = True
        except UnicodeError:
            pure_ascii = False
    if not pure_ascii:
        # Step 2: Perform nameprep
        label = nameprep(label)
        # It doesn't say this, but apparently, it should be ASCII now
        try:
            label = label.encode("ascii")
        except UnicodeError:
            raise UnicodeError("Invalid character in IDN label")
    # Step 3: Check for ACE prefix.  label is bytes from here on; the
    # comparison is case-insensitive so that "XN--..." is decoded too.
    if label[:len(ace_prefix)].lower() != ace_prefix:
        return str(label, "ascii")

    # Step 4: Remove ACE prefix
    label1 = label[len(ace_prefix):]

    # Step 5: Decode using PUNYCODE
    result = label1.decode("punycode")

    # Step 6: Apply ToASCII
    label2 = ToASCII(result)

    # Step 7: Compare the result of step 6 with the one of step 3
    # label2 will already be in lower case.
    if str(label, "ascii").lower() != str(label2, "ascii"):
        raise UnicodeError("IDNA does not round-trip", label, label2)

    # Step 8: return the result of step 5
    return result
151
152 ### Codec APIs
153
class Codec(codecs.Codec):
    """Stateless encoder/decoder between domain names and their IDNA form.

    Only the 'strict' error handler is supported by both methods.
    """

    def encode(self, input, errors='strict'):
        """Encode the domain name *input* (str) to its ASCII form.

        Returns a ``(bytes, consumed)`` tuple where *consumed* is
        ``len(input)``.  Labels are separated by any of the IDNA dot
        characters and joined with b'.' in the output; one empty label
        at the very end (a trailing dot) is allowed.

        Raises UnicodeError for an unsupported error handler or when a
        label is empty or too long.
        """
        if errors != 'strict':
            # IDNA is quite clear that implementations must be strict
            raise UnicodeError("unsupported error handling "+errors)

        if not input:
            return b'', 0

        try:
            result = input.encode('ascii')
        except UnicodeEncodeError:
            pass
        else:
            # ASCII name: fast path.  Only the label lengths need
            # checking; the last label may be empty (trailing dot).
            labels = result.split(b'.')
            for label in labels[:-1]:
                if not (0 < len(label) < 64):
                    raise UnicodeError("label empty or too long")
            if len(labels[-1]) >= 64:
                raise UnicodeError("label too long")
            return result, len(input)

        result = bytearray()
        labels = dots.split(input)
        if labels and not labels[-1]:
            trailing_dot = b'.'
            del labels[-1]
        else:
            trailing_dot = b''
        for label in labels:
            if result:
                # Join with U+002E
                result.extend(b'.')
            result.extend(ToASCII(label))
        return bytes(result+trailing_dot), len(input)

    def decode(self, input, errors='strict'):
        """Decode the ASCII domain name *input* to str.

        Returns a ``(str, consumed)`` tuple where *consumed* is the
        length of the input in bytes.  Labels are split on b'.' and
        each one is converted with ToUnicode; a trailing dot is kept.

        Raises UnicodeError for an unsupported error handler or when a
        label cannot be decoded.
        """
        if errors != 'strict':
            raise UnicodeError("Unsupported error handling "+errors)

        if not input:
            return "", 0

        # IDNA allows decoding to operate on Unicode strings, too.
        if not isinstance(input, bytes):
            # XXX obviously wrong, see #3232
            # (bytes() converts other bytes-like objects, but raises
            # TypeError when handed a str without an encoding.)
            input = bytes(input)

        # The ACE prefix is "xn--" or any capitalization thereof
        # (RFC 3490 section 5), so look for it case-insensitively;
        # otherwise "XN--..." names would skip punycode decoding.
        if ace_prefix not in input.lower():
            # Fast path
            try:
                return input.decode('ascii'), len(input)
            except UnicodeDecodeError:
                pass

        labels = input.split(b".")

        if labels and len(labels[-1]) == 0:
            trailing_dot = '.'
            del labels[-1]
        else:
            trailing_dot = ''

        result = []
        for label in labels:
            result.append(ToUnicode(label))

        return ".".join(result)+trailing_dot, len(input)
225
class IncrementalEncoder(codecs.BufferedIncrementalEncoder):
    """Incremental IDNA encoder that only emits complete labels."""

    def _buffer_encode(self, input, errors, final):
        """Encode as many complete labels of *input* as possible.

        Returns ``(bytes, consumed)`` where *consumed* is the number of
        characters of *input* that were encoded.  Unless *final* is
        true, a last label that is not yet terminated by a dot is left
        unconsumed.  Raises UnicodeError for a non-'strict' error
        handler or when ToASCII rejects a label.
        """
        if errors != 'strict':
            # IDNA is quite clear that implementations must be strict
            raise UnicodeError("unsupported error handling "+errors)

        if not input:
            return (b'', 0)

        pieces = dots.split(input)
        suffix = b''
        if pieces:
            tail = pieces.pop()
            if not tail:
                # The input ended with a dot.
                suffix = b'.'
            elif final:
                # The last label is complete: encode it as well.
                pieces.append(tail)
            elif pieces:
                # Keep potentially unfinished label until the next call,
                # but emit the dot that closes the labels before it.
                suffix = b'.'

        # Join with U+002E.  ToASCII rejects empty labels, so every
        # encoded label consumed at least one input character.
        encoded = b'.'.join(ToASCII(piece) for piece in pieces) + suffix
        consumed = sum(len(piece) for piece in pieces)
        consumed += max(len(pieces) - 1, 0) + len(suffix)
        return (bytes(encoded), consumed)
260
class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
    """Incremental IDNA decoder that only emits complete labels."""

    def _buffer_decode(self, input, errors, final):
        """Decode as many complete labels of *input* as possible.

        *input* is either a str (split on any IDNA dot character) or an
        ASCII bytes-like object (split on '.').  Returns
        ``(str, consumed)`` where *consumed* is the number of input
        characters that were decoded.  Unless *final* is true, a last
        label that is not yet terminated by a dot is left unconsumed.

        Raises UnicodeError for a non-'strict' error handler, for
        non-ASCII bytes input, or when ToUnicode rejects a label.
        """
        if errors != 'strict':
            raise UnicodeError("Unsupported error handling "+errors)

        if not input:
            return ("", 0)

        # IDNA allows decoding to operate on Unicode strings, too.
        if isinstance(input, str):
            labels = dots.split(input)
        else:
            # Must be ASCII string
            input = str(input, "ascii")
            labels = input.split(".")

        trailing_dot = ''
        if labels:
            if not labels[-1]:
                trailing_dot = '.'
                del labels[-1]
            elif not final:
                # Keep potentially unfinished label until the next call
                del labels[-1]
                if labels:
                    trailing_dot = '.'

        result = ".".join([ToUnicode(label) for label in labels])
        result += trailing_dot

        # Consumed input: every label, one separator between each pair
        # of labels, plus the trailing dot.  The separators are counted
        # from the number of labels rather than from a running total,
        # because ToUnicode accepts empty labels and a leading empty
        # label would otherwise leave the total at zero and drop a dot.
        size = sum(len(label) for label in labels)
        if labels:
            size += len(labels) - 1
        size += len(trailing_dot)
        return (result, size)
299
class StreamWriter(Codec, codecs.StreamWriter):
    """Stream writer for the idna codec.

    Defines nothing of its own: it only combines Codec with
    codecs.StreamWriter.
    """
302
class StreamReader(Codec, codecs.StreamReader):
    """Stream reader for the idna codec.

    Defines nothing of its own: it only combines Codec with
    codecs.StreamReader.
    """
305
306 ### encodings module API
307
def getregentry():
    """Return the codecs.CodecInfo record describing the idna codec.

    The stateless encode and decode callables are bound methods of two
    separate Codec instances; the incremental and stream variants are
    passed as classes.
    """
    stateless_encode = Codec().encode
    stateless_decode = Codec().decode
    return codecs.CodecInfo(
        name='idna',
        encode=stateless_encode,
        decode=stateless_decode,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamwriter=StreamWriter,
        streamreader=StreamReader,
    )