1  # Copyright (C) 2002-2007 Python Software Foundation
       2  # Contact: email-sig@python.org
       3  
       4  """Email address parsing code.
       5  
       6  Lifted directly from rfc822.py.  This should eventually be rewritten.
       7  """
       8  
# Names exported by `from <this module> import *'.  The address-parsing
# classes defined further down are deliberately not listed here.
__all__ = [
    'mktime_tz',
    'parsedate',
    'parsedate_tz',
    'quote',
    ]
      15  
      16  import time, calendar
      17  
# Small string constants.  SPACE and EMPTYSTRING are used as join separators
# by the address parser below.
SPACE = ' '
EMPTYSTRING = ''
COMMASPACE = ', '

# Parse a date field
# Lower-case month names.  Indexes 0-11 are the three-letter abbreviations,
# indexes 12-23 the full names; _parsedate_tz() folds an index from the second
# group back onto the 1-12 range.  'may' appears in both groups, so a lookup
# with list.index() always finds the first one.
_monthnames = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul',
               'aug', 'sep', 'oct', 'nov', 'dec',
               'january', 'february', 'march', 'april', 'may', 'june', 'july',
               'august', 'september', 'october', 'november', 'december']

# Lower-case day-of-week abbreviations; only used to recognise (and skip) a
# leading day name.
_daynames = ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun']

# The timezone table does not include the military time zones defined
# in RFC822, other than Z.  According to RFC1123, the description in
# RFC822 gets the signs wrong, so we can't rely on any such time
# zones.  RFC1123 recommends that numeric timezone indicators be used
# instead of timezone names.

# Values are signed integers in HHMM form (e.g. -500 means five hours behind
# UTC), the same form as a numeric "-0500" zone; _parsedate_tz() converts
# them to seconds.
_timezones = {'UT':0, 'UTC':0, 'GMT':0, 'Z':0,
              'AST': -400, 'ADT': -300,  # Atlantic (used in Canada)
              'EST': -500, 'EDT': -400,  # Eastern
              'CST': -600, 'CDT': -500,  # Central
              'MST': -700, 'MDT': -600,  # Mountain
              'PST': -800, 'PDT': -700   # Pacific
              }
      43  
      44  
      45  def parsedate_tz(data):
      46      """Convert a date string to a time tuple.
      47  
      48      Accounts for military timezones.
      49      """
      50      res = _parsedate_tz(data)
      51      if not res:
      52          return
      53      if res[9] is None:
      54          res[9] = 0
      55      return tuple(res)
      56  
def _parsedate_tz(data):
    """Convert date to extended time tuple.

    The last (additional) element is the time zone offset in seconds, except if
    the timezone was specified as -0000.  In that case the last element is
    None.  This indicates a UTC timestamp that explicitly declaims knowledge of
    the source timezone, as opposed to a +0000 timestamp that indicates the
    source timezone really was UTC.

    Returns a 10-element list
    [year, month, day, hour, minute, second, 0, 1, -1, tzoffset], or None when
    `data' is empty or cannot be parsed.  The offset is also None when the
    zone token is neither a key of _timezones nor an integer.
    """
    if not data:
        return None
    # Work on whitespace-separated tokens from here on.
    data = data.split()
    if not data:  # This happens for whitespace-only input.
        return None
    # The FWS after the comma after the day-of-week is optional, so search and
    # adjust for this.
    if data[0].endswith(',') or data[0].lower() in _daynames:
        # There's a dayname here. Skip it
        del data[0]
    else:
        # Day name glued to the next token ("Mon,12"): keep only what follows
        # the last comma.
        i = data[0].rfind(',')
        if i >= 0:
            data[0] = data[0][i+1:]
    if len(data) == 3: # RFC 850 date, deprecated
        # The first token holds day, month and year joined by '-'.
        stuff = data[0].split('-')
        if len(stuff) == 3:
            data = stuff + data[1:]
    if len(data) == 4:
        # Only four tokens: the zone may be glued to the time
        # ("12:00:00+0100").  Split at the first '+', or failing that the
        # first '-', provided it is not the very first character.
        s = data[3]
        i = s.find('+')
        if i == -1:
            i = s.find('-')
        if i > 0:
            data[3:] = [s[:i], s[i:]]
        else:
            data.append('') # Dummy tz
    if len(data) < 5:
        return None
    # Anything after the fifth token is ignored.
    data = data[:5]
    [dd, mm, yy, tm, tz] = data
    if not (dd and mm and yy):
        return None
    mm = mm.lower()
    if mm not in _monthnames:
        # Perhaps the month comes before the day ("Jan 12"); try the tokens
        # the other way round.
        dd, mm = mm, dd.lower()
        if mm not in _monthnames:
            return None
    # Turn the name into a month number; full names sit at indexes 12-23.
    mm = _monthnames.index(mm) + 1
    if mm > 12:
        mm -= 12
    if dd[-1] == ',':
        dd = dd[:-1]
    # A colon in the "year" token means it is really the time: swap them.
    i = yy.find(':')
    if i > 0:
        yy, tm = tm, yy
    if yy[-1] == ',':
        yy = yy[:-1]
        if not yy:
            return None
    # A "year" that does not start with a digit is taken to be the zone, and
    # the zone token is tried as the year instead.
    if not yy[0].isdigit():
        yy, tz = tz, yy
    if tm[-1] == ',':
        tm = tm[:-1]
    # Split the time into hours, minutes and (optional) seconds.
    tm = tm.split(':')
    if len(tm) == 2:
        [thh, tmm] = tm
        tss = '0'
    elif len(tm) == 3:
        [thh, tmm, tss] = tm
    elif len(tm) == 1 and '.' in tm[0]:
        # Some non-compliant MUAs use '.' to separate time elements.
        tm = tm[0].split('.')
        if len(tm) == 2:
            [thh, tmm] = tm
            tss = 0
        elif len(tm) == 3:
            [thh, tmm, tss] = tm
        else:
            return None
    else:
        return None
    # Every numeric field must convert cleanly; no range checking is done.
    try:
        yy = int(yy)
        dd = int(dd)
        thh = int(thh)
        tmm = int(tmm)
        tss = int(tss)
    except ValueError:
        return None
    # Check for a yy specified in two-digit format, then convert it to the
    # appropriate four-digit format, according to the POSIX standard. RFC 822
    # calls for a two-digit yy, but RFC 2822 (which obsoletes RFC 822)
    # mandates a 4-digit yy. For more information, see the documentation for
    # the time module.
    if yy < 100:
        # The year is between 1969 and 1999 (inclusive).
        if yy > 68:
            yy += 1900
        # The year is between 2000 and 2068 (inclusive).
        else:
            yy += 2000
    # Resolve the zone: a known name first, otherwise a signed HHMM integer.
    # tzoffset stays None when neither applies.
    tzoffset = None
    tz = tz.upper()
    if tz in _timezones:
        tzoffset = _timezones[tz]
    else:
        try:
            tzoffset = int(tz)
        except ValueError:
            pass
        # "-0000" (any zero value written with a leading minus) means the
        # source zone is unknown; report that as None rather than 0.
        if tzoffset==0 and tz.startswith('-'):
            tzoffset = None
    # Convert a timezone offset into seconds ; -0500 -> -18000
    if tzoffset:
        if tzoffset < 0:
            tzsign = -1
            tzoffset = -tzoffset
        else:
            tzsign = 1
        tzoffset = tzsign * ( (tzoffset//100)*3600 + (tzoffset % 100)*60)
    # Daylight Saving Time flag is set to -1, since DST is unknown.
    return [yy, mm, dd, thh, tmm, tss, 0, 1, -1, tzoffset]
     180  
     181  
     182  def parsedate(data):
     183      """Convert a time string to a time tuple."""
     184      t = parsedate_tz(data)
     185      if isinstance(t, tuple):
     186          return t[:9]
     187      else:
     188          return t
     189  
     190  
     191  def mktime_tz(data):
     192      """Turn a 10-tuple as returned by parsedate_tz() into a POSIX timestamp."""
     193      if data[9] is None:
     194          # No zone info, so localtime is better assumption than GMT
     195          return time.mktime(data[:8] + (-1,))
     196      else:
     197          t = calendar.timegm(data)
     198          return t - data[9]
     199  
     200  
     201  def quote(str):
     202      """Prepare string to be used in a quoted string.
     203  
     204      Turns backslash and double quote characters into quoted pairs.  These
     205      are the only characters that need to be quoted inside a quoted string.
     206      Does not add the surrounding double quotes.
     207      """
     208      return str.replace('\\', '\\\\').replace('"', '\\"')
     209  
     210  
     211  class ESC[4;38;5;81mAddrlistClass:
     212      """Address parser class by Ben Escoto.
     213  
     214      To understand what this class does, it helps to have a copy of RFC 2822 in
     215      front of you.
     216  
     217      Note: this class interface is deprecated and may be removed in the future.
     218      Use email.utils.AddressList instead.
     219      """
     220  
     221      def __init__(self, field):
     222          """Initialize a new instance.
     223  
     224          `field' is an unparsed address header field, containing
     225          one or more addresses.
     226          """
     227          self.specials = '()<>@,:;.\"[]'
     228          self.pos = 0
     229          self.LWS = ' \t'
     230          self.CR = '\r\n'
     231          self.FWS = self.LWS + self.CR
     232          self.atomends = self.specials + self.LWS + self.CR
     233          # Note that RFC 2822 now specifies `.' as obs-phrase, meaning that it
     234          # is obsolete syntax.  RFC 2822 requires that we recognize obsolete
     235          # syntax, so allow dots in phrases.
     236          self.phraseends = self.atomends.replace('.', '')
     237          self.field = field
     238          self.commentlist = []
     239  
     240      def gotonext(self):
     241          """Skip white space and extract comments."""
     242          wslist = []
     243          while self.pos < len(self.field):
     244              if self.field[self.pos] in self.LWS + '\n\r':
     245                  if self.field[self.pos] not in '\n\r':
     246                      wslist.append(self.field[self.pos])
     247                  self.pos += 1
     248              elif self.field[self.pos] == '(':
     249                  self.commentlist.append(self.getcomment())
     250              else:
     251                  break
     252          return EMPTYSTRING.join(wslist)
     253  
     254      def getaddrlist(self):
     255          """Parse all addresses.
     256  
     257          Returns a list containing all of the addresses.
     258          """
     259          result = []
     260          while self.pos < len(self.field):
     261              ad = self.getaddress()
     262              if ad:
     263                  result += ad
     264              else:
     265                  result.append(('', ''))
     266          return result
     267  
     268      def getaddress(self):
     269          """Parse the next address."""
     270          self.commentlist = []
     271          self.gotonext()
     272  
     273          oldpos = self.pos
     274          oldcl = self.commentlist
     275          plist = self.getphraselist()
     276  
     277          self.gotonext()
     278          returnlist = []
     279  
     280          if self.pos >= len(self.field):
     281              # Bad email address technically, no domain.
     282              if plist:
     283                  returnlist = [(SPACE.join(self.commentlist), plist[0])]
     284  
     285          elif self.field[self.pos] in '.@':
     286              # email address is just an addrspec
     287              # this isn't very efficient since we start over
     288              self.pos = oldpos
     289              self.commentlist = oldcl
     290              addrspec = self.getaddrspec()
     291              returnlist = [(SPACE.join(self.commentlist), addrspec)]
     292  
     293          elif self.field[self.pos] == ':':
     294              # address is a group
     295              returnlist = []
     296  
     297              fieldlen = len(self.field)
     298              self.pos += 1
     299              while self.pos < len(self.field):
     300                  self.gotonext()
     301                  if self.pos < fieldlen and self.field[self.pos] == ';':
     302                      self.pos += 1
     303                      break
     304                  returnlist = returnlist + self.getaddress()
     305  
     306          elif self.field[self.pos] == '<':
     307              # Address is a phrase then a route addr
     308              routeaddr = self.getrouteaddr()
     309  
     310              if self.commentlist:
     311                  returnlist = [(SPACE.join(plist) + ' (' +
     312                                 ' '.join(self.commentlist) + ')', routeaddr)]
     313              else:
     314                  returnlist = [(SPACE.join(plist), routeaddr)]
     315  
     316          else:
     317              if plist:
     318                  returnlist = [(SPACE.join(self.commentlist), plist[0])]
     319              elif self.field[self.pos] in self.specials:
     320                  self.pos += 1
     321  
     322          self.gotonext()
     323          if self.pos < len(self.field) and self.field[self.pos] == ',':
     324              self.pos += 1
     325          return returnlist
     326  
     327      def getrouteaddr(self):
     328          """Parse a route address (Return-path value).
     329  
     330          This method just skips all the route stuff and returns the addrspec.
     331          """
     332          if self.field[self.pos] != '<':
     333              return
     334  
     335          expectroute = False
     336          self.pos += 1
     337          self.gotonext()
     338          adlist = ''
     339          while self.pos < len(self.field):
     340              if expectroute:
     341                  self.getdomain()
     342                  expectroute = False
     343              elif self.field[self.pos] == '>':
     344                  self.pos += 1
     345                  break
     346              elif self.field[self.pos] == '@':
     347                  self.pos += 1
     348                  expectroute = True
     349              elif self.field[self.pos] == ':':
     350                  self.pos += 1
     351              else:
     352                  adlist = self.getaddrspec()
     353                  self.pos += 1
     354                  break
     355              self.gotonext()
     356  
     357          return adlist
     358  
     359      def getaddrspec(self):
     360          """Parse an RFC 2822 addr-spec."""
     361          aslist = []
     362  
     363          self.gotonext()
     364          while self.pos < len(self.field):
     365              preserve_ws = True
     366              if self.field[self.pos] == '.':
     367                  if aslist and not aslist[-1].strip():
     368                      aslist.pop()
     369                  aslist.append('.')
     370                  self.pos += 1
     371                  preserve_ws = False
     372              elif self.field[self.pos] == '"':
     373                  aslist.append('"%s"' % quote(self.getquote()))
     374              elif self.field[self.pos] in self.atomends:
     375                  if aslist and not aslist[-1].strip():
     376                      aslist.pop()
     377                  break
     378              else:
     379                  aslist.append(self.getatom())
     380              ws = self.gotonext()
     381              if preserve_ws and ws:
     382                  aslist.append(ws)
     383  
     384          if self.pos >= len(self.field) or self.field[self.pos] != '@':
     385              return EMPTYSTRING.join(aslist)
     386  
     387          aslist.append('@')
     388          self.pos += 1
     389          self.gotonext()
     390          domain = self.getdomain()
     391          if not domain:
     392              # Invalid domain, return an empty address instead of returning a
     393              # local part to denote failed parsing.
     394              return EMPTYSTRING
     395          return EMPTYSTRING.join(aslist) + domain
     396  
     397      def getdomain(self):
     398          """Get the complete domain name from an address."""
     399          sdlist = []
     400          while self.pos < len(self.field):
     401              if self.field[self.pos] in self.LWS:
     402                  self.pos += 1
     403              elif self.field[self.pos] == '(':
     404                  self.commentlist.append(self.getcomment())
     405              elif self.field[self.pos] == '[':
     406                  sdlist.append(self.getdomainliteral())
     407              elif self.field[self.pos] == '.':
     408                  self.pos += 1
     409                  sdlist.append('.')
     410              elif self.field[self.pos] == '@':
     411                  # bpo-34155: Don't parse domains with two `@` like
     412                  # `a@malicious.org@important.com`.
     413                  return EMPTYSTRING
     414              elif self.field[self.pos] in self.atomends:
     415                  break
     416              else:
     417                  sdlist.append(self.getatom())
     418          return EMPTYSTRING.join(sdlist)
     419  
     420      def getdelimited(self, beginchar, endchars, allowcomments=True):
     421          """Parse a header fragment delimited by special characters.
     422  
     423          `beginchar' is the start character for the fragment.
     424          If self is not looking at an instance of `beginchar' then
     425          getdelimited returns the empty string.
     426  
     427          `endchars' is a sequence of allowable end-delimiting characters.
     428          Parsing stops when one of these is encountered.
     429  
     430          If `allowcomments' is non-zero, embedded RFC 2822 comments are allowed
     431          within the parsed fragment.
     432          """
     433          if self.field[self.pos] != beginchar:
     434              return ''
     435  
     436          slist = ['']
     437          quote = False
     438          self.pos += 1
     439          while self.pos < len(self.field):
     440              if quote:
     441                  slist.append(self.field[self.pos])
     442                  quote = False
     443              elif self.field[self.pos] in endchars:
     444                  self.pos += 1
     445                  break
     446              elif allowcomments and self.field[self.pos] == '(':
     447                  slist.append(self.getcomment())
     448                  continue        # have already advanced pos from getcomment
     449              elif self.field[self.pos] == '\\':
     450                  quote = True
     451              else:
     452                  slist.append(self.field[self.pos])
     453              self.pos += 1
     454  
     455          return EMPTYSTRING.join(slist)
     456  
     457      def getquote(self):
     458          """Get a quote-delimited fragment from self's field."""
     459          return self.getdelimited('"', '"\r', False)
     460  
     461      def getcomment(self):
     462          """Get a parenthesis-delimited fragment from self's field."""
     463          return self.getdelimited('(', ')\r', True)
     464  
     465      def getdomainliteral(self):
     466          """Parse an RFC 2822 domain-literal."""
     467          return '[%s]' % self.getdelimited('[', ']\r', False)
     468  
     469      def getatom(self, atomends=None):
     470          """Parse an RFC 2822 atom.
     471  
     472          Optional atomends specifies a different set of end token delimiters
     473          (the default is to use self.atomends).  This is used e.g. in
     474          getphraselist() since phrase endings must not include the `.' (which
     475          is legal in phrases)."""
     476          atomlist = ['']
     477          if atomends is None:
     478              atomends = self.atomends
     479  
     480          while self.pos < len(self.field):
     481              if self.field[self.pos] in atomends:
     482                  break
     483              else:
     484                  atomlist.append(self.field[self.pos])
     485              self.pos += 1
     486  
     487          return EMPTYSTRING.join(atomlist)
     488  
     489      def getphraselist(self):
     490          """Parse a sequence of RFC 2822 phrases.
     491  
     492          A phrase is a sequence of words, which are in turn either RFC 2822
     493          atoms or quoted-strings.  Phrases are canonicalized by squeezing all
     494          runs of continuous whitespace into one space.
     495          """
     496          plist = []
     497  
     498          while self.pos < len(self.field):
     499              if self.field[self.pos] in self.FWS:
     500                  self.pos += 1
     501              elif self.field[self.pos] == '"':
     502                  plist.append(self.getquote())
     503              elif self.field[self.pos] == '(':
     504                  self.commentlist.append(self.getcomment())
     505              elif self.field[self.pos] in self.phraseends:
     506                  break
     507              else:
     508                  plist.append(self.getatom(self.phraseends))
     509  
     510          return plist
     511  
     512  class ESC[4;38;5;81mAddressList(ESC[4;38;5;149mAddrlistClass):
     513      """An AddressList encapsulates a list of parsed RFC 2822 addresses."""
     514      def __init__(self, field):
     515          AddrlistClass.__init__(self, field)
     516          if field:
     517              self.addresslist = self.getaddrlist()
     518          else:
     519              self.addresslist = []
     520  
     521      def __len__(self):
     522          return len(self.addresslist)
     523  
     524      def __add__(self, other):
     525          # Set union
     526          newaddr = AddressList(None)
     527          newaddr.addresslist = self.addresslist[:]
     528          for x in other.addresslist:
     529              if not x in self.addresslist:
     530                  newaddr.addresslist.append(x)
     531          return newaddr
     532  
     533      def __iadd__(self, other):
     534          # Set union, in-place
     535          for x in other.addresslist:
     536              if not x in self.addresslist:
     537                  self.addresslist.append(x)
     538          return self
     539  
     540      def __sub__(self, other):
     541          # Set difference
     542          newaddr = AddressList(None)
     543          for x in self.addresslist:
     544              if not x in other.addresslist:
     545                  newaddr.addresslist.append(x)
     546          return newaddr
     547  
     548      def __isub__(self, other):
     549          # Set difference, in-place
     550          for x in other.addresslist:
     551              if x in self.addresslist:
     552                  self.addresslist.remove(x)
     553          return self
     554  
     555      def __getitem__(self, index):
     556          # Make indexing, slices, and 'in' work
     557          return self.addresslist[index]