1  # Copyright (C) 2004-2006 Python Software Foundation
       2  # Authors: Baxter, Wouters and Warsaw
       3  # Contact: email-sig@python.org
       4  
       5  """FeedParser - An email feed parser.
       6  
       7  The feed parser implements an interface for incrementally parsing an email
       8  message, line by line.  This has advantages for certain applications, such as
       9  those reading email messages off a socket.
      10  
      11  FeedParser.feed() is the primary interface for pushing new data into the
      12  parser.  It returns when there's nothing more it can do with the available
      13  data.  When you have no more data to push into the parser, call .close().
      14  This completes the parsing and returns the root message object.
      15  
      16  The other advantage of this parser is that it will never raise a parsing
      17  exception.  Instead, when it finds something unexpected, it adds a 'defect' to
      18  the current message.  Defects are just instances that live on the message
      19  object's .defects attribute.
      20  """
      21  
      22  __all__ = ['FeedParser', 'BytesFeedParser']
      23  
      24  import re
      25  
      26  from email import errors
      27  from email._policybase import compat32
      28  from collections import deque
      29  from io import StringIO
      30  
# Line-separator patterns.  All of them recognise CRLF, bare CR and bare LF.
#
# NLCRE is used with .match() to test whether a line *starts* with a line
# separator, i.e. whether it is the blank header/body separator line.
NLCRE = re.compile(r'\r\n|\r|\n')
# NLCRE_bol is used with .match() to strip one leading line separator from
# the first line of a multipart epilogue.
NLCRE_bol = re.compile(r'(\r\n|\r|\n)')
# NLCRE_eol is anchored at the very end of the string (\Z) and is used with
# .search() to strip one trailing line separator from preambles, payloads,
# epilogues and the unix-from line.
NLCRE_eol = re.compile(r'(\r\n|\r|\n)\Z')
# NLCRE_crack is not referenced in the code shown here.
# NOTE(review): presumably kept for external importers -- confirm before
# removing.
NLCRE_crack = re.compile(r'(\r\n|\r|\n)')
# RFC 2822 $3.6.8 Optional fields.  ftext is %d33-57 / %d59-126, Any character
# except controls, SP, and ":".
# A line is treated as part of the header block when it is a unix-from line
# ("From "), a "name:" field (the name may be empty, which is reported as a
# defect later), or a continuation line starting with a tab or a space.
headerRE = re.compile(r'^(From |[\041-\071\073-\176]*:|[\t ])')
EMPTYSTRING = ''
# NL is not referenced in the code shown here.
NL = '\n'

# Unique sentinel returned by BufferedSubFile.readline() (and yielded by the
# parser generator) when no complete line is buffered and the input has not
# been closed yet.  It is always compared by identity.
NeedMoreData = object()
      42  
      43  
      44  class ESC[4;38;5;81mBufferedSubFile(ESC[4;38;5;149mobject):
      45      """A file-ish object that can have new data loaded into it.
      46  
      47      You can also push and pop line-matching predicates onto a stack.  When the
      48      current predicate matches the current line, a false EOF response
      49      (i.e. empty string) is returned instead.  This lets the parser adhere to a
      50      simple abstraction -- it parses until EOF closes the current message.
      51      """
      52      def __init__(self):
      53          # Text stream of the last partial line pushed into this object.
      54          # See issue 22233 for why this is a text stream and not a list.
      55          self._partial = StringIO(newline='')
      56          # A deque of full, pushed lines
      57          self._lines = deque()
      58          # The stack of false-EOF checking predicates.
      59          self._eofstack = []
      60          # A flag indicating whether the file has been closed or not.
      61          self._closed = False
      62  
      63      def push_eof_matcher(self, pred):
      64          self._eofstack.append(pred)
      65  
      66      def pop_eof_matcher(self):
      67          return self._eofstack.pop()
      68  
      69      def close(self):
      70          # Don't forget any trailing partial line.
      71          self._partial.seek(0)
      72          self.pushlines(self._partial.readlines())
      73          self._partial.seek(0)
      74          self._partial.truncate()
      75          self._closed = True
      76  
      77      def readline(self):
      78          if not self._lines:
      79              if self._closed:
      80                  return ''
      81              return NeedMoreData
      82          # Pop the line off the stack and see if it matches the current
      83          # false-EOF predicate.
      84          line = self._lines.popleft()
      85          # RFC 2046, section 5.1.2 requires us to recognize outer level
      86          # boundaries at any level of inner nesting.  Do this, but be sure it's
      87          # in the order of most to least nested.
      88          for ateof in reversed(self._eofstack):
      89              if ateof(line):
      90                  # We're at the false EOF.  But push the last line back first.
      91                  self._lines.appendleft(line)
      92                  return ''
      93          return line
      94  
      95      def unreadline(self, line):
      96          # Let the consumer push a line back into the buffer.
      97          assert line is not NeedMoreData
      98          self._lines.appendleft(line)
      99  
     100      def push(self, data):
     101          """Push some new data into this object."""
     102          self._partial.write(data)
     103          if '\n' not in data and '\r' not in data:
     104              # No new complete lines, wait for more.
     105              return
     106  
     107          # Crack into lines, preserving the linesep characters.
     108          self._partial.seek(0)
     109          parts = self._partial.readlines()
     110          self._partial.seek(0)
     111          self._partial.truncate()
     112  
     113          # If the last element of the list does not end in a newline, then treat
     114          # it as a partial line.  We only check for '\n' here because a line
     115          # ending with '\r' might be a line that was split in the middle of a
     116          # '\r\n' sequence (see bugs 1555570 and 1721862).
     117          if not parts[-1].endswith('\n'):
     118              self._partial.write(parts.pop())
     119          self.pushlines(parts)
     120  
     121      def pushlines(self, lines):
     122          self._lines.extend(lines)
     123  
     124      def __iter__(self):
     125          return self
     126  
     127      def __next__(self):
     128          line = self.readline()
     129          if line == '':
     130              raise StopIteration
     131          return line
     132  
     133  
     134  class ESC[4;38;5;81mFeedParser:
     135      """A feed-style parser of email."""
     136  
     137      def __init__(self, _factory=None, *, policy=compat32):
     138          """_factory is called with no arguments to create a new message obj
     139  
     140          The policy keyword specifies a policy object that controls a number of
     141          aspects of the parser's operation.  The default policy maintains
     142          backward compatibility.
     143  
     144          """
     145          self.policy = policy
     146          self._old_style_factory = False
     147          if _factory is None:
     148              if policy.message_factory is None:
     149                  from email.message import Message
     150                  self._factory = Message
     151              else:
     152                  self._factory = policy.message_factory
     153          else:
     154              self._factory = _factory
     155              try:
     156                  _factory(policy=self.policy)
     157              except TypeError:
     158                  # Assume this is an old-style factory
     159                  self._old_style_factory = True
     160          self._input = BufferedSubFile()
     161          self._msgstack = []
     162          self._parse = self._parsegen().__next__
     163          self._cur = None
     164          self._last = None
     165          self._headersonly = False
     166  
     167      # Non-public interface for supporting Parser's headersonly flag
     168      def _set_headersonly(self):
     169          self._headersonly = True
     170  
     171      def feed(self, data):
     172          """Push more data into the parser."""
     173          self._input.push(data)
     174          self._call_parse()
     175  
     176      def _call_parse(self):
     177          try:
     178              self._parse()
     179          except StopIteration:
     180              pass
     181  
     182      def close(self):
     183          """Parse all remaining data and return the root message object."""
     184          self._input.close()
     185          self._call_parse()
     186          root = self._pop_message()
     187          assert not self._msgstack
     188          # Look for final set of defects
     189          if root.get_content_maintype() == 'multipart' \
     190                 and not root.is_multipart() and not self._headersonly:
     191              defect = errors.MultipartInvariantViolationDefect()
     192              self.policy.handle_defect(root, defect)
     193          return root
     194  
     195      def _new_message(self):
     196          if self._old_style_factory:
     197              msg = self._factory()
     198          else:
     199              msg = self._factory(policy=self.policy)
     200          if self._cur and self._cur.get_content_type() == 'multipart/digest':
     201              msg.set_default_type('message/rfc822')
     202          if self._msgstack:
     203              self._msgstack[-1].attach(msg)
     204          self._msgstack.append(msg)
     205          self._cur = msg
     206          self._last = msg
     207  
     208      def _pop_message(self):
     209          retval = self._msgstack.pop()
     210          if self._msgstack:
     211              self._cur = self._msgstack[-1]
     212          else:
     213              self._cur = None
     214          return retval
     215  
     216      def _parsegen(self):
     217          # Create a new message and start by parsing headers.
     218          self._new_message()
     219          headers = []
     220          # Collect the headers, searching for a line that doesn't match the RFC
     221          # 2822 header or continuation pattern (including an empty line).
     222          for line in self._input:
     223              if line is NeedMoreData:
     224                  yield NeedMoreData
     225                  continue
     226              if not headerRE.match(line):
     227                  # If we saw the RFC defined header/body separator
     228                  # (i.e. newline), just throw it away. Otherwise the line is
     229                  # part of the body so push it back.
     230                  if not NLCRE.match(line):
     231                      defect = errors.MissingHeaderBodySeparatorDefect()
     232                      self.policy.handle_defect(self._cur, defect)
     233                      self._input.unreadline(line)
     234                  break
     235              headers.append(line)
     236          # Done with the headers, so parse them and figure out what we're
     237          # supposed to see in the body of the message.
     238          self._parse_headers(headers)
     239          # Headers-only parsing is a backwards compatibility hack, which was
     240          # necessary in the older parser, which could raise errors.  All
     241          # remaining lines in the input are thrown into the message body.
     242          if self._headersonly:
     243              lines = []
     244              while True:
     245                  line = self._input.readline()
     246                  if line is NeedMoreData:
     247                      yield NeedMoreData
     248                      continue
     249                  if line == '':
     250                      break
     251                  lines.append(line)
     252              self._cur.set_payload(EMPTYSTRING.join(lines))
     253              return
     254          if self._cur.get_content_type() == 'message/delivery-status':
     255              # message/delivery-status contains blocks of headers separated by
     256              # a blank line.  We'll represent each header block as a separate
     257              # nested message object, but the processing is a bit different
     258              # than standard message/* types because there is no body for the
     259              # nested messages.  A blank line separates the subparts.
     260              while True:
     261                  self._input.push_eof_matcher(NLCRE.match)
     262                  for retval in self._parsegen():
     263                      if retval is NeedMoreData:
     264                          yield NeedMoreData
     265                          continue
     266                      break
     267                  self._pop_message()
     268                  # We need to pop the EOF matcher in order to tell if we're at
     269                  # the end of the current file, not the end of the last block
     270                  # of message headers.
     271                  self._input.pop_eof_matcher()
     272                  # The input stream must be sitting at the newline or at the
     273                  # EOF.  We want to see if we're at the end of this subpart, so
     274                  # first consume the blank line, then test the next line to see
     275                  # if we're at this subpart's EOF.
     276                  while True:
     277                      line = self._input.readline()
     278                      if line is NeedMoreData:
     279                          yield NeedMoreData
     280                          continue
     281                      break
     282                  while True:
     283                      line = self._input.readline()
     284                      if line is NeedMoreData:
     285                          yield NeedMoreData
     286                          continue
     287                      break
     288                  if line == '':
     289                      break
     290                  # Not at EOF so this is a line we're going to need.
     291                  self._input.unreadline(line)
     292              return
     293          if self._cur.get_content_maintype() == 'message':
     294              # The message claims to be a message/* type, then what follows is
     295              # another RFC 2822 message.
     296              for retval in self._parsegen():
     297                  if retval is NeedMoreData:
     298                      yield NeedMoreData
     299                      continue
     300                  break
     301              self._pop_message()
     302              return
     303          if self._cur.get_content_maintype() == 'multipart':
     304              boundary = self._cur.get_boundary()
     305              if boundary is None:
     306                  # The message /claims/ to be a multipart but it has not
     307                  # defined a boundary.  That's a problem which we'll handle by
     308                  # reading everything until the EOF and marking the message as
     309                  # defective.
     310                  defect = errors.NoBoundaryInMultipartDefect()
     311                  self.policy.handle_defect(self._cur, defect)
     312                  lines = []
     313                  for line in self._input:
     314                      if line is NeedMoreData:
     315                          yield NeedMoreData
     316                          continue
     317                      lines.append(line)
     318                  self._cur.set_payload(EMPTYSTRING.join(lines))
     319                  return
     320              # Make sure a valid content type was specified per RFC 2045:6.4.
     321              if (str(self._cur.get('content-transfer-encoding', '8bit')).lower()
     322                      not in ('7bit', '8bit', 'binary')):
     323                  defect = errors.InvalidMultipartContentTransferEncodingDefect()
     324                  self.policy.handle_defect(self._cur, defect)
     325              # Create a line match predicate which matches the inter-part
     326              # boundary as well as the end-of-multipart boundary.  Don't push
     327              # this onto the input stream until we've scanned past the
     328              # preamble.
     329              separator = '--' + boundary
     330              boundaryre = re.compile(
     331                  '(?P<sep>' + re.escape(separator) +
     332                  r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)?$')
     333              capturing_preamble = True
     334              preamble = []
     335              linesep = False
     336              close_boundary_seen = False
     337              while True:
     338                  line = self._input.readline()
     339                  if line is NeedMoreData:
     340                      yield NeedMoreData
     341                      continue
     342                  if line == '':
     343                      break
     344                  mo = boundaryre.match(line)
     345                  if mo:
     346                      # If we're looking at the end boundary, we're done with
     347                      # this multipart.  If there was a newline at the end of
     348                      # the closing boundary, then we need to initialize the
     349                      # epilogue with the empty string (see below).
     350                      if mo.group('end'):
     351                          close_boundary_seen = True
     352                          linesep = mo.group('linesep')
     353                          break
     354                      # We saw an inter-part boundary.  Were we in the preamble?
     355                      if capturing_preamble:
     356                          if preamble:
     357                              # According to RFC 2046, the last newline belongs
     358                              # to the boundary.
     359                              lastline = preamble[-1]
     360                              eolmo = NLCRE_eol.search(lastline)
     361                              if eolmo:
     362                                  preamble[-1] = lastline[:-len(eolmo.group(0))]
     363                              self._cur.preamble = EMPTYSTRING.join(preamble)
     364                          capturing_preamble = False
     365                          self._input.unreadline(line)
     366                          continue
     367                      # We saw a boundary separating two parts.  Consume any
     368                      # multiple boundary lines that may be following.  Our
     369                      # interpretation of RFC 2046 BNF grammar does not produce
     370                      # body parts within such double boundaries.
     371                      while True:
     372                          line = self._input.readline()
     373                          if line is NeedMoreData:
     374                              yield NeedMoreData
     375                              continue
     376                          mo = boundaryre.match(line)
     377                          if not mo:
     378                              self._input.unreadline(line)
     379                              break
     380                      # Recurse to parse this subpart; the input stream points
     381                      # at the subpart's first line.
     382                      self._input.push_eof_matcher(boundaryre.match)
     383                      for retval in self._parsegen():
     384                          if retval is NeedMoreData:
     385                              yield NeedMoreData
     386                              continue
     387                          break
     388                      # Because of RFC 2046, the newline preceding the boundary
     389                      # separator actually belongs to the boundary, not the
     390                      # previous subpart's payload (or epilogue if the previous
     391                      # part is a multipart).
     392                      if self._last.get_content_maintype() == 'multipart':
     393                          epilogue = self._last.epilogue
     394                          if epilogue == '':
     395                              self._last.epilogue = None
     396                          elif epilogue is not None:
     397                              mo = NLCRE_eol.search(epilogue)
     398                              if mo:
     399                                  end = len(mo.group(0))
     400                                  self._last.epilogue = epilogue[:-end]
     401                      else:
     402                          payload = self._last._payload
     403                          if isinstance(payload, str):
     404                              mo = NLCRE_eol.search(payload)
     405                              if mo:
     406                                  payload = payload[:-len(mo.group(0))]
     407                                  self._last._payload = payload
     408                      self._input.pop_eof_matcher()
     409                      self._pop_message()
     410                      # Set the multipart up for newline cleansing, which will
     411                      # happen if we're in a nested multipart.
     412                      self._last = self._cur
     413                  else:
     414                      # I think we must be in the preamble
     415                      assert capturing_preamble
     416                      preamble.append(line)
     417              # We've seen either the EOF or the end boundary.  If we're still
     418              # capturing the preamble, we never saw the start boundary.  Note
     419              # that as a defect and store the captured text as the payload.
     420              if capturing_preamble:
     421                  defect = errors.StartBoundaryNotFoundDefect()
     422                  self.policy.handle_defect(self._cur, defect)
     423                  self._cur.set_payload(EMPTYSTRING.join(preamble))
     424                  epilogue = []
     425                  for line in self._input:
     426                      if line is NeedMoreData:
     427                          yield NeedMoreData
     428                          continue
     429                  self._cur.epilogue = EMPTYSTRING.join(epilogue)
     430                  return
     431              # If we're not processing the preamble, then we might have seen
     432              # EOF without seeing that end boundary...that is also a defect.
     433              if not close_boundary_seen:
     434                  defect = errors.CloseBoundaryNotFoundDefect()
     435                  self.policy.handle_defect(self._cur, defect)
     436                  return
     437              # Everything from here to the EOF is epilogue.  If the end boundary
     438              # ended in a newline, we'll need to make sure the epilogue isn't
     439              # None
     440              if linesep:
     441                  epilogue = ['']
     442              else:
     443                  epilogue = []
     444              for line in self._input:
     445                  if line is NeedMoreData:
     446                      yield NeedMoreData
     447                      continue
     448                  epilogue.append(line)
     449              # Any CRLF at the front of the epilogue is not technically part of
     450              # the epilogue.  Also, watch out for an empty string epilogue,
     451              # which means a single newline.
     452              if epilogue:
     453                  firstline = epilogue[0]
     454                  bolmo = NLCRE_bol.match(firstline)
     455                  if bolmo:
     456                      epilogue[0] = firstline[len(bolmo.group(0)):]
     457              self._cur.epilogue = EMPTYSTRING.join(epilogue)
     458              return
     459          # Otherwise, it's some non-multipart type, so the entire rest of the
     460          # file contents becomes the payload.
     461          lines = []
     462          for line in self._input:
     463              if line is NeedMoreData:
     464                  yield NeedMoreData
     465                  continue
     466              lines.append(line)
     467          self._cur.set_payload(EMPTYSTRING.join(lines))
     468  
     469      def _parse_headers(self, lines):
     470          # Passed a list of lines that make up the headers for the current msg
     471          lastheader = ''
     472          lastvalue = []
     473          for lineno, line in enumerate(lines):
     474              # Check for continuation
     475              if line[0] in ' \t':
     476                  if not lastheader:
     477                      # The first line of the headers was a continuation.  This
     478                      # is illegal, so let's note the defect, store the illegal
     479                      # line, and ignore it for purposes of headers.
     480                      defect = errors.FirstHeaderLineIsContinuationDefect(line)
     481                      self.policy.handle_defect(self._cur, defect)
     482                      continue
     483                  lastvalue.append(line)
     484                  continue
     485              if lastheader:
     486                  self._cur.set_raw(*self.policy.header_source_parse(lastvalue))
     487                  lastheader, lastvalue = '', []
     488              # Check for envelope header, i.e. unix-from
     489              if line.startswith('From '):
     490                  if lineno == 0:
     491                      # Strip off the trailing newline
     492                      mo = NLCRE_eol.search(line)
     493                      if mo:
     494                          line = line[:-len(mo.group(0))]
     495                      self._cur.set_unixfrom(line)
     496                      continue
     497                  elif lineno == len(lines) - 1:
     498                      # Something looking like a unix-from at the end - it's
     499                      # probably the first line of the body, so push back the
     500                      # line and stop.
     501                      self._input.unreadline(line)
     502                      return
     503                  else:
     504                      # Weirdly placed unix-from line.  Note this as a defect
     505                      # and ignore it.
     506                      defect = errors.MisplacedEnvelopeHeaderDefect(line)
     507                      self._cur.defects.append(defect)
     508                      continue
     509              # Split the line on the colon separating field name from value.
     510              # There will always be a colon, because if there wasn't the part of
     511              # the parser that calls us would have started parsing the body.
     512              i = line.find(':')
     513  
     514              # If the colon is on the start of the line the header is clearly
     515              # malformed, but we might be able to salvage the rest of the
     516              # message. Track the error but keep going.
     517              if i == 0:
     518                  defect = errors.InvalidHeaderDefect("Missing header name.")
     519                  self._cur.defects.append(defect)
     520                  continue
     521  
     522              assert i>0, "_parse_headers fed line with no : and no leading WS"
     523              lastheader = line[:i]
     524              lastvalue = [line]
     525          # Done with all the lines, so handle the last header.
     526          if lastheader:
     527              self._cur.set_raw(*self.policy.header_source_parse(lastvalue))
     528  
     529  
     530  class ESC[4;38;5;81mBytesFeedParser(ESC[4;38;5;149mFeedParser):
     531      """Like FeedParser, but feed accepts bytes."""
     532  
     533      def feed(self, data):
     534          super().feed(data.decode('ascii', 'surrogateescape'))