1 # Copyright (C) 2004-2006 Python Software Foundation
2 # Authors: Baxter, Wouters and Warsaw
3 # Contact: email-sig@python.org
4
5 """FeedParser - An email feed parser.
6
7 The feed parser implements an interface for incrementally parsing an email
8 message, line by line. This has advantages for certain applications, such as
9 those reading email messages off a socket.
10
11 FeedParser.feed() is the primary interface for pushing new data into the
12 parser. It returns when there's nothing more it can do with the available
13 data. When you have no more data to push into the parser, call .close().
14 This completes the parsing and returns the root message object.
15
16 The other advantage of this parser is that it will never raise a parsing
17 exception. Instead, when it finds something unexpected, it adds a 'defect' to
18 the current message. Defects are just instances that live on the message
19 object's .defects attribute.
20 """
21
# Names exported by ``from <this module> import *``.
__all__ = ['FeedParser', 'BytesFeedParser']
23
24 import re
25
26 from email import errors
27 from email._policybase import compat32
28 from collections import deque
29 from io import StringIO
30
# Line-ending patterns.  Each one accepts '\r\n', '\r' or '\n'.
# NLCRE: the bare alternation, no capture group.
NLCRE = re.compile(r'\r\n|\r|\n')
# NLCRE_bol: the same alternation captured as group 1.
NLCRE_bol = re.compile(r'(\r\n|\r|\n)')
# NLCRE_eol: a line ending anchored at the very end of the string (\Z).
NLCRE_eol = re.compile(r'(\r\n|\r|\n)\Z')
# NLCRE_crack: same pattern text as NLCRE_bol.
NLCRE_crack = re.compile(r'(\r\n|\r|\n)')
# RFC 2822 section 3.6.8, Optional fields: ftext is %d33-57 / %d59-126, i.e.
# any character except controls, SP, and ":".
# headerRE matches, at the start of a line, one of: the literal 'From '
# prefix, a (possibly empty) run of ftext characters followed by ':', or a
# leading tab/space.
headerRE = re.compile(r'^(From |[\041-\071\073-\176]*:|[\t ])')
# Join/separator string constants.
EMPTYSTRING = ''
NL = '\n'

# Unique sentinel object; code in this file tests for it with ``is``.
NeedMoreData = object()
42
43
class BufferedSubFile:
    """Line buffer that is filled incrementally and read like a file.

    Text is handed in through push(); complete lines are queued and handed
    back out by readline() or by iterating over the object.  A stack of
    line predicates may be installed: when any predicate on the stack
    accepts the next line, readline() reports a false EOF (the empty
    string) and leaves that line queued.  This lets a parser treat "end of
    the current sub-message" exactly like "end of input".
    """

    def __init__(self):
        # Holds the trailing, not-yet-terminated line.  A text stream is
        # used rather than a list; see issue 22233.
        self._partial = StringIO(newline='')
        # Complete lines waiting to be read, oldest first.
        self._lines = deque()
        # Stack of false-EOF predicates, innermost last.
        self._eofstack = []
        # Becomes True once close() has been called.
        self._closed = False

    def push_eof_matcher(self, pred):
        """Install *pred* as the innermost false-EOF predicate."""
        self._eofstack.append(pred)

    def pop_eof_matcher(self):
        """Remove and return the innermost false-EOF predicate."""
        return self._eofstack.pop()

    def close(self):
        """Flush any unterminated trailing text and mark the buffer closed."""
        pending = self._partial
        pending.seek(0)
        self.pushlines(pending.readlines())
        pending.seek(0)
        pending.truncate()
        self._closed = True

    def readline(self):
        """Return the next line, '' at (false) EOF, or NeedMoreData.

        NeedMoreData is returned when no line is queued and the buffer has
        not been closed; '' is returned when no line is queued and it has.
        """
        if not self._lines:
            return '' if self._closed else NeedMoreData
        candidate = self._lines.popleft()
        # RFC 2046, section 5.1.2 requires outer-level boundaries to be
        # recognised at any nesting depth, so consult every predicate,
        # most deeply nested first.
        if any(is_eof(candidate) for is_eof in reversed(self._eofstack)):
            # False EOF: keep the matching line available for later.
            self._lines.appendleft(candidate)
            return ''
        return candidate

    def unreadline(self, line):
        """Put *line* back at the front of the queue."""
        assert line is not NeedMoreData
        self._lines.appendleft(line)

    def push(self, data):
        """Push some new data into this object."""
        stream = self._partial
        stream.write(data)
        if not ('\n' in data or '\r' in data):
            # Nothing in this chunk can have completed a line.
            return

        # Split everything buffered so far into lines, keeping line endings.
        stream.seek(0)
        pieces = stream.readlines()
        stream.seek(0)
        stream.truncate()

        # A final piece that does not end in '\n' is kept back as partial.
        # Only '\n' is tested because a piece ending in '\r' may be the
        # first half of a '\r\n' split across pushes (bugs 1555570 and
        # 1721862).
        if not pieces[-1].endswith('\n'):
            stream.write(pieces.pop())
        self.pushlines(pieces)

    def pushlines(self, lines):
        """Append already-split *lines* to the queue."""
        self._lines.extend(lines)

    def __iter__(self):
        return self

    def __next__(self):
        item = self.readline()
        if item != '':
            return item
        raise StopIteration
132
133
class FeedParser:
    """A feed-style parser of email.

    Text is supplied in arbitrary chunks through feed(); close() finishes
    parsing and returns the root message object.  Anything unexpected in the
    input is reported as a defect through ``policy.handle_defect``.
    """

    def __init__(self, _factory=None, *, policy=compat32):
        """Create a parser.

        _factory, if given, is the callable used to create each new message
        object.  It is first probed with a ``policy=`` keyword argument; if
        that raises TypeError it is treated as an old-style factory and is
        afterwards called with no arguments.  When _factory is None,
        ``policy.message_factory`` is used, falling back to
        ``email.message.Message`` when that is None as well.

        The policy keyword specifies a policy object that controls a number
        of aspects of the parser's operation.  The default policy maintains
        backward compatibility.
        """
        self.policy = policy
        self._old_style_factory = False
        if _factory is None:
            if policy.message_factory is None:
                from email.message import Message
                self._factory = Message
            else:
                self._factory = policy.message_factory
        else:
            self._factory = _factory
            try:
                _factory(policy=self.policy)
            except TypeError:
                # Assume this is an old-style factory
                self._old_style_factory = True
        self._input = BufferedSubFile()
        # Stack of messages under construction; the root is at index 0.
        self._msgstack = []
        # Each call advances the parsing generator to its next yield.
        self._parse = self._parsegen().__next__
        # Message currently being filled in (top of the stack) and the most
        # recently created/finished message, used for newline clean-up.
        self._cur = None
        self._last = None
        self._headersonly = False

    # Non-public interface for supporting Parser's headersonly flag
    def _set_headersonly(self):
        """Make the parser store everything after the headers as payload."""
        self._headersonly = True

    def feed(self, data):
        """Push more data into the parser."""
        self._input.push(data)
        self._call_parse()

    def _call_parse(self):
        """Advance the parsing generator; a finished generator is ignored."""
        try:
            self._parse()
        except StopIteration:
            pass

    def close(self):
        """Parse all remaining data and return the root message object."""
        self._input.close()
        self._call_parse()
        root = self._pop_message()
        assert not self._msgstack
        # Look for final set of defects
        if (root.get_content_maintype() == 'multipart'
                and not root.is_multipart() and not self._headersonly):
            defect = errors.MultipartInvariantViolationDefect()
            self.policy.handle_defect(root, defect)
        return root

    def _new_message(self):
        """Create a message, attach it to its parent and make it current."""
        if self._old_style_factory:
            msg = self._factory()
        else:
            msg = self._factory(policy=self.policy)
        # Parts of a multipart/digest default to message/rfc822.
        if self._cur and self._cur.get_content_type() == 'multipart/digest':
            msg.set_default_type('message/rfc822')
        if self._msgstack:
            self._msgstack[-1].attach(msg)
        self._msgstack.append(msg)
        self._cur = msg
        self._last = msg

    def _pop_message(self):
        """Pop and return the current message; its parent becomes current."""
        retval = self._msgstack.pop()
        if self._msgstack:
            self._cur = self._msgstack[-1]
        else:
            self._cur = None
        return retval

    def _parsegen(self):
        """Generator that parses one message (recursing for sub-messages).

        It yields NeedMoreData whenever the input buffer has no complete
        line available and the buffer has not been closed; it is resumed by
        the next feed() or close().
        """
        # Create a new message and start by parsing headers.
        self._new_message()
        headers = []
        # Collect the headers, searching for a line that doesn't match the RFC
        # 2822 header or continuation pattern (including an empty line).
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            if not headerRE.match(line):
                # If we saw the RFC defined header/body separator
                # (i.e. newline), just throw it away. Otherwise the line is
                # part of the body so push it back.
                if not NLCRE.match(line):
                    defect = errors.MissingHeaderBodySeparatorDefect()
                    self.policy.handle_defect(self._cur, defect)
                    self._input.unreadline(line)
                break
            headers.append(line)
        # Done with the headers, so parse them and figure out what we're
        # supposed to see in the body of the message.
        self._parse_headers(headers)
        # Headers-only parsing is a backwards compatibility hack, which was
        # necessary in the older parser, which could raise errors.  All
        # remaining lines in the input are thrown into the message body.
        if self._headersonly:
            lines = []
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                lines.append(line)
            self._cur.set_payload(EMPTYSTRING.join(lines))
            return
        if self._cur.get_content_type() == 'message/delivery-status':
            # message/delivery-status contains blocks of headers separated by
            # a blank line.  We'll represent each header block as a separate
            # nested message object, but the processing is a bit different
            # than standard message/* types because there is no body for the
            # nested messages.  A blank line separates the subparts.
            while True:
                self._input.push_eof_matcher(NLCRE.match)
                for retval in self._parsegen():
                    if retval is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                self._pop_message()
                # We need to pop the EOF matcher in order to tell if we're at
                # the end of the current file, not the end of the last block
                # of message headers.
                self._input.pop_eof_matcher()
                # The input stream must be sitting at the newline or at the
                # EOF.  We want to see if we're at the end of this subpart, so
                # first consume the blank line, then test the next line to see
                # if we're at this subpart's EOF.
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                if line == '':
                    break
                # Not at EOF so this is a line we're going to need.
                self._input.unreadline(line)
            return
        if self._cur.get_content_maintype() == 'message':
            # The message claims to be a message/* type, then what follows is
            # another RFC 2822 message.
            for retval in self._parsegen():
                if retval is NeedMoreData:
                    yield NeedMoreData
                    continue
                break
            self._pop_message()
            return
        if self._cur.get_content_maintype() == 'multipart':
            boundary = self._cur.get_boundary()
            if boundary is None:
                # The message /claims/ to be a multipart but it has not
                # defined a boundary.  That's a problem which we'll handle by
                # reading everything until the EOF and marking the message as
                # defective.
                defect = errors.NoBoundaryInMultipartDefect()
                self.policy.handle_defect(self._cur, defect)
                lines = []
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    lines.append(line)
                self._cur.set_payload(EMPTYSTRING.join(lines))
                return
            # Make sure a valid content type was specified per RFC 2045:6.4.
            if (str(self._cur.get('content-transfer-encoding', '8bit')).lower()
                    not in ('7bit', '8bit', 'binary')):
                defect = errors.InvalidMultipartContentTransferEncodingDefect()
                self.policy.handle_defect(self._cur, defect)
            # Create a line match predicate which matches the inter-part
            # boundary as well as the end-of-multipart boundary.  Don't push
            # this onto the input stream until we've scanned past the
            # preamble.
            separator = '--' + boundary
            boundaryre = re.compile(
                '(?P<sep>' + re.escape(separator) +
                r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)?$')
            capturing_preamble = True
            preamble = []
            linesep = False
            close_boundary_seen = False
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                mo = boundaryre.match(line)
                if mo:
                    # If we're looking at the end boundary, we're done with
                    # this multipart.  If there was a newline at the end of
                    # the closing boundary, then we need to initialize the
                    # epilogue with the empty string (see below).
                    if mo.group('end'):
                        close_boundary_seen = True
                        linesep = mo.group('linesep')
                        break
                    # We saw an inter-part boundary.  Were we in the preamble?
                    if capturing_preamble:
                        if preamble:
                            # According to RFC 2046, the last newline belongs
                            # to the boundary.
                            lastline = preamble[-1]
                            eolmo = NLCRE_eol.search(lastline)
                            if eolmo:
                                preamble[-1] = lastline[:-len(eolmo.group(0))]
                            self._cur.preamble = EMPTYSTRING.join(preamble)
                        capturing_preamble = False
                        self._input.unreadline(line)
                        continue
                    # We saw a boundary separating two parts.  Consume any
                    # multiple boundary lines that may be following.  Our
                    # interpretation of RFC 2046 BNF grammar does not produce
                    # body parts within such double boundaries.
                    while True:
                        line = self._input.readline()
                        if line is NeedMoreData:
                            yield NeedMoreData
                            continue
                        mo = boundaryre.match(line)
                        if not mo:
                            self._input.unreadline(line)
                            break
                    # Recurse to parse this subpart; the input stream points
                    # at the subpart's first line.
                    self._input.push_eof_matcher(boundaryre.match)
                    for retval in self._parsegen():
                        if retval is NeedMoreData:
                            yield NeedMoreData
                            continue
                        break
                    # Because of RFC 2046, the newline preceding the boundary
                    # separator actually belongs to the boundary, not the
                    # previous subpart's payload (or epilogue if the previous
                    # part is a multipart).
                    if self._last.get_content_maintype() == 'multipart':
                        epilogue = self._last.epilogue
                        if epilogue == '':
                            self._last.epilogue = None
                        elif epilogue is not None:
                            mo = NLCRE_eol.search(epilogue)
                            if mo:
                                end = len(mo.group(0))
                                self._last.epilogue = epilogue[:-end]
                    else:
                        payload = self._last._payload
                        if isinstance(payload, str):
                            mo = NLCRE_eol.search(payload)
                            if mo:
                                payload = payload[:-len(mo.group(0))]
                                self._last._payload = payload
                    self._input.pop_eof_matcher()
                    self._pop_message()
                    # Set the multipart up for newline cleansing, which will
                    # happen if we're in a nested multipart.
                    self._last = self._cur
                else:
                    # I think we must be in the preamble
                    assert capturing_preamble
                    preamble.append(line)
            # We've seen either the EOF or the end boundary.  If we're still
            # capturing the preamble, we never saw the start boundary.  Note
            # that as a defect and store the captured text as the payload.
            if capturing_preamble:
                defect = errors.StartBoundaryNotFoundDefect()
                self.policy.handle_defect(self._cur, defect)
                self._cur.set_payload(EMPTYSTRING.join(preamble))
                # Consume whatever is left for this part.  Those lines are
                # not stored; the epilogue is set to the empty string.
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                self._cur.epilogue = EMPTYSTRING
                return
            # If we're not processing the preamble, then we might have seen
            # EOF without seeing that end boundary...that is also a defect.
            if not close_boundary_seen:
                defect = errors.CloseBoundaryNotFoundDefect()
                self.policy.handle_defect(self._cur, defect)
                return
            # Everything from here to the EOF is epilogue.  If the end boundary
            # ended in a newline, we'll need to make sure the epilogue isn't
            # None
            if linesep:
                epilogue = ['']
            else:
                epilogue = []
            for line in self._input:
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                epilogue.append(line)
            # Any CRLF at the front of the epilogue is not technically part of
            # the epilogue.  Also, watch out for an empty string epilogue,
            # which means a single newline.
            if epilogue:
                firstline = epilogue[0]
                bolmo = NLCRE_bol.match(firstline)
                if bolmo:
                    epilogue[0] = firstline[len(bolmo.group(0)):]
            self._cur.epilogue = EMPTYSTRING.join(epilogue)
            return
        # Otherwise, it's some non-multipart type, so the entire rest of the
        # file contents becomes the payload.
        lines = []
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            lines.append(line)
        self._cur.set_payload(EMPTYSTRING.join(lines))

    def _parse_headers(self, lines):
        """Store the header block *lines* on the current message.

        *lines* is the list of raw header lines collected by _parsegen();
        each one starts with 'From ', a field name followed by ':', or
        whitespace.  Continuation lines are folded into the preceding
        header.  Every problem found is reported through
        ``self.policy.handle_defect`` so that the policy decides whether it
        is recorded or raised.
        """
        lastheader = ''
        lastvalue = []
        for lineno, line in enumerate(lines):
            # Check for continuation
            if line[0] in ' \t':
                if not lastheader:
                    # The first line of the headers was a continuation.  This
                    # is illegal, so let's note the defect, store the illegal
                    # line, and ignore it for purposes of headers.
                    defect = errors.FirstHeaderLineIsContinuationDefect(line)
                    self.policy.handle_defect(self._cur, defect)
                    continue
                lastvalue.append(line)
                continue
            if lastheader:
                self._cur.set_raw(*self.policy.header_source_parse(lastvalue))
                lastheader, lastvalue = '', []
            # Check for envelope header, i.e. unix-from
            if line.startswith('From '):
                if lineno == 0:
                    # Strip off the trailing newline
                    mo = NLCRE_eol.search(line)
                    if mo:
                        line = line[:-len(mo.group(0))]
                    self._cur.set_unixfrom(line)
                    continue
                elif lineno == len(lines) - 1:
                    # Something looking like a unix-from at the end - it's
                    # probably the first line of the body, so push back the
                    # line and stop.
                    self._input.unreadline(line)
                    return
                else:
                    # Weirdly placed unix-from line.  Note this as a defect
                    # and ignore it.  Routed through the policy like every
                    # other defect so raise_on_defect is honoured.
                    defect = errors.MisplacedEnvelopeHeaderDefect(line)
                    self.policy.handle_defect(self._cur, defect)
                    continue
            # Split the line on the colon separating field name from value.
            # There will always be a colon, because if there wasn't the part of
            # the parser that calls us would have started parsing the body.
            i = line.find(':')

            # If the colon is on the start of the line the header is clearly
            # malformed, but we might be able to salvage the rest of the
            # message.  Track the error (via the policy) but keep going.
            if i == 0:
                defect = errors.InvalidHeaderDefect("Missing header name.")
                self.policy.handle_defect(self._cur, defect)
                continue

            assert i > 0, "_parse_headers fed line with no : and no leading WS"
            lastheader = line[:i]
            lastvalue = [line]
        # Done with all the lines, so handle the last header.
        if lastheader:
            self._cur.set_raw(*self.policy.header_source_parse(lastvalue))
528
529
class BytesFeedParser(FeedParser):
    """Like FeedParser, but feed accepts bytes."""

    def feed(self, data):
        """Decode *data* and push the resulting text into the parser.

        The bytes are decoded as ASCII with the 'surrogateescape' error
        handler before being handed to FeedParser.feed().
        """
        text = data.decode('ascii', 'surrogateescape')
        super().feed(text)