1 # Copyright (C) 2002-2007 Python Software Foundation
2 # Contact: email-sig@python.org
3
4 """Email address parsing code.
5
6 Lifted directly from rfc822.py. This should eventually be rewritten.
7 """
8
# Names exported by a star-import of this module.  Only the date helpers and
# quote() are listed here; the address-parsing classes defined further down
# in this file are not included in __all__.
__all__ = [
    'mktime_tz',
    'parsedate',
    'parsedate_tz',
    'quote',
    ]
15
16 import time, calendar
17
# Separator / joiner strings shared by the functions and classes below.
COMMASPACE = ', '
EMPTYSTRING = ''
SPACE = ' '

# Month names recognised in a date field.  The twelve three-letter
# abbreviations come first, followed by the twelve full names, so a name's
# position in the list (folded back into 1..12) gives its month number.
_monthnames = [
    'jan', 'feb', 'mar', 'apr', 'may', 'jun',
    'jul', 'aug', 'sep', 'oct', 'nov', 'dec',
    'january', 'february', 'march', 'april', 'may', 'june',
    'july', 'august', 'september', 'october', 'november', 'december',
]

# Day-of-week abbreviations; used to recognise a leading day name.
_daynames = ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun']

# The timezone table does not include the military time zones defined
# in RFC822, other than Z.  According to RFC1123, the description in
# RFC822 gets the signs wrong, so we can't rely on any such time
# zones.  RFC1123 recommends that numeric timezone indicators be used
# instead of timezone names.
#
# Values are written in +/-HHMM form (e.g. -500 means five hours behind UTC).
_timezones = {
    # Universal time
    'UT': 0,
    'UTC': 0,
    'GMT': 0,
    'Z': 0,
    # Atlantic (used in Canada)
    'AST': -400,
    'ADT': -300,
    # Eastern
    'EST': -500,
    'EDT': -400,
    # Central
    'CST': -600,
    'CDT': -500,
    # Mountain
    'MST': -700,
    'MDT': -600,
    # Pacific
    'PST': -800,
    'PDT': -700,
}
43
44
def parsedate_tz(data):
    """Convert a date string to a 10-element time tuple.

    The first nine elements can be passed to time.mktime(); the tenth is
    the timezone offset from UTC in seconds.  When the underlying parser
    reports no offset (None), the offset is reported as 0 here.

    Returns None if the string cannot be parsed.
    """
    parsed = _parsedate_tz(data)
    if parsed:
        # The low-level parser uses None for "offset unknown"; callers of
        # this function always get a number in the last slot.
        if parsed[9] is None:
            parsed[9] = 0
        return tuple(parsed)
    return None
56
def _parsedate_tz(data):
    """Convert date to extended time tuple.

    Returns a 10-element list
    [year, month, day, hour, minute, second, 0, 1, -1, tzoffset],
    or None if `data` is empty or cannot be parsed.

    The last (additional) element is the time zone offset in seconds, except if
    the timezone was specified as -0000.  In that case the last element is
    None.  This indicates a UTC timestamp that explicitly declaims knowledge of
    the source timezone, as opposed to a +0000 timestamp that indicates the
    source timezone really was UTC.  The last element is also None when the
    zone field is absent or is neither a known zone name nor an integer.

    """
    if not data:
        return None
    data = data.split()
    if not data:  # This happens for whitespace-only input.
        return None
    # The FWS after the comma after the day-of-week is optional, so search and
    # adjust for this.
    if data[0].endswith(',') or data[0].lower() in _daynames:
        # There's a dayname here. Skip it
        del data[0]
    else:
        # Day name glued to the day number ("Mon,01"): keep only what follows
        # the last comma.
        i = data[0].rfind(',')
        if i >= 0:
            data[0] = data[0][i+1:]
    if len(data) == 3: # RFC 850 date, deprecated
        # First token looks like "dd-mon-yy"; explode it into three fields.
        stuff = data[0].split('-')
        if len(stuff) == 3:
            data = stuff + data[1:]
    if len(data) == 4:
        # Four fields: the zone may be glued onto the time ("12:00:00+0100").
        # Split at the sign if there is one, otherwise add an empty zone.
        s = data[3]
        i = s.find('+')
        if i == -1:
            i = s.find('-')
        if i > 0:
            data[3:] = [s[:i], s[i:]]
        else:
            data.append('') # Dummy tz
    if len(data) < 5:
        return None
    # Anything after the fifth field is ignored.
    data = data[:5]
    [dd, mm, yy, tm, tz] = data
    if not (dd and mm and yy):
        return None
    mm = mm.lower()
    if mm not in _monthnames:
        # Perhaps the order is "Mon dd" rather than "dd Mon"; try swapping.
        dd, mm = mm, dd.lower()
        if mm not in _monthnames:
            return None
    # _monthnames holds abbreviations then full names, so indexes 13..24
    # fold back onto 1..12.
    mm = _monthnames.index(mm) + 1
    if mm > 12:
        mm -= 12
    if dd[-1] == ',':
        dd = dd[:-1]
    # A colon in the "year" field means year and time are transposed.
    i = yy.find(':')
    if i > 0:
        yy, tm = tm, yy
    if yy[-1] == ',':
        yy = yy[:-1]
        if not yy:
            return None
    # A non-numeric "year" means year and zone are transposed.
    if not yy[0].isdigit():
        yy, tz = tz, yy
    if tm[-1] == ',':
        tm = tm[:-1]
    tm = tm.split(':')
    if len(tm) == 2:
        # hh:mm with no seconds
        [thh, tmm] = tm
        tss = '0'
    elif len(tm) == 3:
        [thh, tmm, tss] = tm
    elif len(tm) == 1 and '.' in tm[0]:
        # Some non-compliant MUAs use '.' to separate time elements.
        tm = tm[0].split('.')
        if len(tm) == 2:
            [thh, tmm] = tm
            tss = 0
        elif len(tm) == 3:
            [thh, tmm, tss] = tm
        else:
            return None
    else:
        return None
    # Any non-numeric component makes the whole date unparseable.
    try:
        yy = int(yy)
        dd = int(dd)
        thh = int(thh)
        tmm = int(tmm)
        tss = int(tss)
    except ValueError:
        return None
    # Check for a yy specified in two-digit format, then convert it to the
    # appropriate four-digit format, according to the POSIX standard. RFC 822
    # calls for a two-digit yy, but RFC 2822 (which obsoletes RFC 822)
    # mandates a 4-digit yy. For more information, see the documentation for
    # the time module.
    if yy < 100:
        # The year is between 1969 and 1999 (inclusive).
        if yy > 68:
            yy += 1900
        # The year is between 2000 and 2068 (inclusive).
        else:
            yy += 2000
    tzoffset = None
    tz = tz.upper()
    if tz in _timezones:
        tzoffset = _timezones[tz]
    else:
        # Not a known name: try a numeric +/-HHMM zone.  If that fails too,
        # tzoffset stays None.
        try:
            tzoffset = int(tz)
        except ValueError:
            pass
        # "-0000" (any zero written with a leading minus) means "zone
        # unknown", which is reported as None rather than 0.
        if tzoffset==0 and tz.startswith('-'):
            tzoffset = None
    # Convert a timezone offset into seconds ; -0500 -> -18000
    if tzoffset:
        # Work on the magnitude so that // and % split the HHMM digits the
        # same way for negative offsets as for positive ones.
        if tzoffset < 0:
            tzsign = -1
            tzoffset = -tzoffset
        else:
            tzsign = 1
        tzoffset = tzsign * ( (tzoffset//100)*3600 + (tzoffset % 100)*60)
    # Daylight Saving Time flag is set to -1, since DST is unknown.
    # The weekday and year-day slots are not computed; they are filled with
    # the fixed placeholders 0 and 1.
    return [yy, mm, dd, thh, tmm, tss, 0, 1, -1, tzoffset]
180
181
def parsedate(data):
    """Convert a time string to a 9-element time tuple.

    This is parsedate_tz() with the trailing timezone offset dropped.  Any
    non-tuple result from parsedate_tz() (i.e. a failed parse) is passed
    through unchanged.
    """
    parsed = parsedate_tz(data)
    return parsed[:9] if isinstance(parsed, tuple) else parsed
189
190
def mktime_tz(data):
    """Turn a 10-tuple as returned by parsedate_tz() into a POSIX timestamp.

    `data` is a 10-element sequence (tuple or list) whose first nine items
    form a time tuple and whose last item is the offset from UTC in seconds,
    or None if the zone is unknown.

    With a known offset the first six fields are interpreted as a wall-clock
    time in that zone and converted to UTC.  With an offset of None the
    fields are interpreted as local time via time.mktime(), letting the
    platform decide whether DST applies.
    """
    if data[9] is None:
        # No zone info, so localtime is better assumption than GMT.
        # Coerce the slice to a tuple so that list input works here just as
        # it does in the branch below; -1 asks mktime() to work out DST.
        return time.mktime(tuple(data[:8]) + (-1,))
    else:
        t = calendar.timegm(data)
        return t - data[9]
199
200
def quote(str):
    """Escape a string for use inside an RFC 2822 quoted-string.

    Every backslash and every double quote is prefixed with a backslash,
    turning it into a quoted-pair; no other characters need escaping.  The
    enclosing double quotes are not added -- that is left to the caller.
    """
    escaped = str
    # Backslashes must be handled first, otherwise the backslashes inserted
    # in front of double quotes would themselves get doubled.
    for special in ('\\', '"'):
        escaped = escaped.replace(special, '\\' + special)
    return escaped
209
210
class AddrlistClass:
    """Address parser class by Ben Escoto.

    To understand what this class does, it helps to have a copy of RFC 2822 in
    front of you.

    The parser is a hand-written scanner over `self.field`.  `self.pos` is
    the index of the next unread character and the get*() methods advance
    it as they consume input.  Comments met while scanning are collected in
    `self.commentlist`.

    Note: this class interface is deprecated and may be removed in the future.
    Use email.utils.AddressList instead.
    """

    def __init__(self, field):
        """Initialize a new instance.

        `field' is an unparsed address header field, containing
        one or more addresses.
        """
        self.specials = '()<>@,:;.\"[]'
        self.pos = 0
        self.LWS = ' \t'
        self.CR = '\r\n'
        self.FWS = self.LWS + self.CR
        self.atomends = self.specials + self.LWS + self.CR
        # Note that RFC 2822 now specifies `.' as obs-phrase, meaning that it
        # is obsolete syntax.  RFC 2822 requires that we recognize obsolete
        # syntax, so allow dots in phrases.
        self.phraseends = self.atomends.replace('.', '')
        self.field = field
        self.commentlist = []

    def gotonext(self):
        """Skip white space and extract comments.

        Comments are appended to self.commentlist.  Returns the skipped
        spaces and tabs as a string; CR and LF are skipped but not included.
        """
        wslist = []
        while self.pos < len(self.field):
            if self.field[self.pos] in self.LWS + '\n\r':
                if self.field[self.pos] not in '\n\r':
                    wslist.append(self.field[self.pos])
                self.pos += 1
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            else:
                break
        return EMPTYSTRING.join(wslist)

    def getaddrlist(self):
        """Parse all addresses.

        Returns a list containing all of the addresses, each as a
        (realname, addrspec) tuple.  A position where nothing could be
        parsed contributes a ('', '') entry.
        """
        result = []
        while self.pos < len(self.field):
            ad = self.getaddress()
            if ad:
                result += ad
            else:
                result.append(('', ''))
        return result

    def getaddress(self):
        """Parse the next address.

        Returns a list of (realname, addrspec) tuples: one entry for a
        single mailbox, one entry per member for a group, or an empty list
        if nothing could be parsed.  A trailing comma, if present, is
        consumed.
        """
        self.commentlist = []
        self.gotonext()

        oldpos = self.pos
        # Take a snapshot of the comments seen so far.  getphraselist()
        # appends to self.commentlist in place, so keeping a reference to
        # the same list would make the restore below a no-op and comments
        # inside the local part would be collected twice when the addr-spec
        # is re-parsed.
        oldcl = self.commentlist[:]
        plist = self.getphraselist()

        self.gotonext()
        returnlist = []

        if self.pos >= len(self.field):
            # Bad email address technically, no domain.
            if plist:
                returnlist = [(SPACE.join(self.commentlist), plist[0])]

        elif self.field[self.pos] in '.@':
            # email address is just an addrspec
            # this isn't very efficient since we start over
            self.pos = oldpos
            self.commentlist = oldcl
            addrspec = self.getaddrspec()
            returnlist = [(SPACE.join(self.commentlist), addrspec)]

        elif self.field[self.pos] == ':':
            # address is a group
            returnlist = []

            fieldlen = len(self.field)
            self.pos += 1
            while self.pos < len(self.field):
                self.gotonext()
                if self.pos < fieldlen and self.field[self.pos] == ';':
                    self.pos += 1
                    break
                returnlist = returnlist + self.getaddress()

        elif self.field[self.pos] == '<':
            # Address is a phrase then a route addr
            routeaddr = self.getrouteaddr()

            if self.commentlist:
                returnlist = [(SPACE.join(plist) + ' (' +
                               ' '.join(self.commentlist) + ')', routeaddr)]
            else:
                returnlist = [(SPACE.join(plist), routeaddr)]

        else:
            if plist:
                returnlist = [(SPACE.join(self.commentlist), plist[0])]
            elif self.field[self.pos] in self.specials:
                # Step over a stray special so the caller's loop makes
                # progress.
                self.pos += 1

        self.gotonext()
        if self.pos < len(self.field) and self.field[self.pos] == ',':
            self.pos += 1
        return returnlist

    def getrouteaddr(self):
        """Parse a route address (Return-path value).

        This method just skips all the route stuff and returns the addrspec.
        Returns '' if no addr-spec is found, and None if the parser is not
        positioned on a '<'.
        """
        if self.field[self.pos] != '<':
            return

        expectroute = False
        self.pos += 1
        self.gotonext()
        adlist = ''
        while self.pos < len(self.field):
            if expectroute:
                # Domain of an "@domain" route element; its value is unused.
                self.getdomain()
                expectroute = False
            elif self.field[self.pos] == '>':
                self.pos += 1
                break
            elif self.field[self.pos] == '@':
                self.pos += 1
                expectroute = True
            elif self.field[self.pos] == ':':
                self.pos += 1
            else:
                adlist = self.getaddrspec()
                # Step over the character following the addr-spec
                # (normally the closing '>').
                self.pos += 1
                break
            self.gotonext()

        return adlist

    def getaddrspec(self):
        """Parse an RFC 2822 addr-spec.

        Returns the addr-spec as a string.  If no '@' follows the local
        part, only the local part is returned.  If an '@' is present but
        the domain is empty or invalid, the empty string is returned.
        """
        aslist = []

        self.gotonext()
        while self.pos < len(self.field):
            preserve_ws = True
            if self.field[self.pos] == '.':
                # Drop whitespace that preceded the dot.
                if aslist and not aslist[-1].strip():
                    aslist.pop()
                aslist.append('.')
                self.pos += 1
                preserve_ws = False
            elif self.field[self.pos] == '"':
                aslist.append('"%s"' % quote(self.getquote()))
            elif self.field[self.pos] in self.atomends:
                # End of the local part; drop trailing whitespace.
                if aslist and not aslist[-1].strip():
                    aslist.pop()
                break
            else:
                aslist.append(self.getatom())
            ws = self.gotonext()
            if preserve_ws and ws:
                aslist.append(ws)

        if self.pos >= len(self.field) or self.field[self.pos] != '@':
            return EMPTYSTRING.join(aslist)

        aslist.append('@')
        self.pos += 1
        self.gotonext()
        domain = self.getdomain()
        if not domain:
            # Invalid domain, return an empty address instead of returning a
            # local part to denote failed parsing.
            return EMPTYSTRING
        return EMPTYSTRING.join(aslist) + domain

    def getdomain(self):
        """Get the complete domain name from an address.

        Returns the domain as a string, or the empty string if a second '@'
        is encountered.
        """
        sdlist = []
        while self.pos < len(self.field):
            if self.field[self.pos] in self.LWS:
                self.pos += 1
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            elif self.field[self.pos] == '[':
                sdlist.append(self.getdomainliteral())
            elif self.field[self.pos] == '.':
                self.pos += 1
                sdlist.append('.')
            elif self.field[self.pos] == '@':
                # bpo-34155: Don't parse domains with two `@` like
                # `a@malicious.org@important.com`.
                return EMPTYSTRING
            elif self.field[self.pos] in self.atomends:
                break
            else:
                sdlist.append(self.getatom())
        return EMPTYSTRING.join(sdlist)

    def getdelimited(self, beginchar, endchars, allowcomments=True):
        """Parse a header fragment delimited by special characters.

        `beginchar' is the start character for the fragment.
        If self is not looking at an instance of `beginchar' then
        getdelimited returns the empty string.

        `endchars' is a sequence of allowable end-delimiting characters.
        Parsing stops when one of these is encountered.

        If `allowcomments' is non-zero, embedded RFC 2822 comments are allowed
        within the parsed fragment.

        Returns the fragment's content without the delimiters and with the
        backslash of each quoted-pair removed.
        """
        if self.field[self.pos] != beginchar:
            return ''

        slist = ['']
        # True when the previous character was a backslash, i.e. the current
        # character is the second half of a quoted-pair and is taken
        # literally.
        escaped = False
        self.pos += 1
        while self.pos < len(self.field):
            if escaped:
                slist.append(self.field[self.pos])
                escaped = False
            elif self.field[self.pos] in endchars:
                self.pos += 1
                break
            elif allowcomments and self.field[self.pos] == '(':
                slist.append(self.getcomment())
                continue        # have already advanced pos from getcomment
            elif self.field[self.pos] == '\\':
                escaped = True
            else:
                slist.append(self.field[self.pos])
            self.pos += 1

        return EMPTYSTRING.join(slist)

    def getquote(self):
        """Get a quote-delimited fragment from self's field."""
        return self.getdelimited('"', '"\r', False)

    def getcomment(self):
        """Get a parenthesis-delimited fragment from self's field."""
        return self.getdelimited('(', ')\r', True)

    def getdomainliteral(self):
        """Parse an RFC 2822 domain-literal."""
        return '[%s]' % self.getdelimited('[', ']\r', False)

    def getatom(self, atomends=None):
        """Parse an RFC 2822 atom.

        Optional atomends specifies a different set of end token delimiters
        (the default is to use self.atomends).  This is used e.g. in
        getphraselist() since phrase endings must not include the `.' (which
        is legal in phrases)."""
        atomlist = ['']
        if atomends is None:
            atomends = self.atomends

        while self.pos < len(self.field):
            if self.field[self.pos] in atomends:
                break
            else:
                atomlist.append(self.field[self.pos])
            self.pos += 1

        return EMPTYSTRING.join(atomlist)

    def getphraselist(self):
        """Parse a sequence of RFC 2822 phrases.

        A phrase is a sequence of words, which are in turn either RFC 2822
        atoms or quoted-strings.  Phrases are canonicalized by squeezing all
        runs of continuous whitespace into one space.

        Returns the list of words; comments found between words are added
        to self.commentlist.
        """
        plist = []

        while self.pos < len(self.field):
            if self.field[self.pos] in self.FWS:
                self.pos += 1
            elif self.field[self.pos] == '"':
                plist.append(self.getquote())
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            elif self.field[self.pos] in self.phraseends:
                break
            else:
                plist.append(self.getatom(self.phraseends))

        return plist
511
class AddressList(AddrlistClass):
    """An AddressList encapsulates a list of parsed RFC 2822 addresses.

    The parsed (realname, addrspec) pairs are held in `self.addresslist`.
    The +, +=, - and -= operators combine two address lists in a set-like
    way, and len(), indexing and slicing operate on the parsed entries.
    """

    def __init__(self, field):
        AddrlistClass.__init__(self, field)
        # An empty or None field is not parsed at all.
        self.addresslist = self.getaddrlist() if field else []

    def __len__(self):
        return len(self.addresslist)

    def __add__(self, other):
        # Set union: everything in self, then the entries of other that
        # self does not already contain.
        union = AddressList(None)
        union.addresslist = self.addresslist + [
            addr for addr in other.addresslist
            if addr not in self.addresslist
        ]
        return union

    def __iadd__(self, other):
        # Set union, in-place
        for addr in other.addresslist:
            if addr not in self.addresslist:
                self.addresslist.append(addr)
        return self

    def __sub__(self, other):
        # Set difference: the entries of self that other does not contain.
        difference = AddressList(None)
        difference.addresslist = [
            addr for addr in self.addresslist
            if addr not in other.addresslist
        ]
        return difference

    def __isub__(self, other):
        # Set difference, in-place
        for addr in other.addresslist:
            if addr in self.addresslist:
                self.addresslist.remove(addr)
        return self

    def __getitem__(self, index):
        # Make indexing, slices, and 'in' work
        return self.addresslist[index]