1 """Header value parser implementing various email-related RFC parsing rules.
2
3 The parsing methods defined in this module implement various email-related
4 parsing rules. Principal among them is RFC 5322, which is the follow-on
5 to RFC 2822 and primarily a clarification of the latter. It also implements
6 RFC 2047 encoded word decoding.
7
8 RFC 5322 goes to considerable trouble to maintain backward compatibility with
9 RFC 822 in the parse phase, while cleaning up the structure in the generation
10 phase. This parser supports correct RFC 5322 generation by tagging white space
11 as folding white space only when folding is allowed in the non-obsolete rule
12 sets. Actually, the parser is even more generous when accepting input than RFC
13 5322 mandates, following the spirit of Postel's Law, which RFC 5322 encourages.
14 Where possible, deviations from the standard are annotated on the 'defects'
15 attribute of tokens that deviate.
16
17 The general structure of the parser follows RFC 5322, and uses its terminology
18 where there is a direct correspondence. Where the implementation requires a
19 somewhat different structure than that used by the formal grammar, new terms
20 that mimic the closest existing terms are used. Thus, it really helps to have
21 a copy of RFC 5322 handy when studying this code.
22
23 Input to the parser is a string that has already been unfolded according to
24 RFC 5322 rules. According to the RFC this unfolding is the very first step, and
25 this parser leaves the unfolding step to a higher level message parser, which
26 will have already detected the line breaks that need unfolding while
27 determining the beginning and end of each header.
28
29 The output of the parser is a TokenList object, which is a list subclass. A
30 TokenList is a recursive data structure. The terminal nodes of the structure
31 are Terminal objects, which are subclasses of str. These do not correspond
32 directly to terminal objects in the formal grammar, but are instead more
33 practical higher level combinations of true terminals.
34
35 All TokenList and Terminal objects have a 'value' attribute, which produces the
36 semantically meaningful value of that part of the parse subtree. The value of
37 all whitespace tokens (no matter how many sub-tokens they may contain) is a
38 single space, as per the RFC rules. This includes 'CFWS', which is herein
39 included in the general class of whitespace tokens. There is one exception to
40 the rule that whitespace tokens are collapsed into single spaces in values: in
41 the value of a 'bare-quoted-string' (a quoted-string with no leading or
42 trailing whitespace), any whitespace that appeared between the quotation marks
43 is preserved in the returned value. Note that in all Terminal strings quoted
44 pairs are turned into their unquoted values.
45
46 All TokenList and Terminal objects also have a string value, which attempts to
47 be a "canonical" representation of the RFC-compliant form of the substring that
48 produced the parsed subtree, including minimal use of quoted pair quoting.
49 Whitespace runs are not collapsed.
50
51 Comment tokens also have a 'content' attribute providing the string found
52 between the parens (including any nested comments) with whitespace preserved.
53
54 All TokenList and Terminal objects have a 'defects' attribute which is a
55 possibly empty list of all the defects found while creating the token. Defects
56 may appear on any token in the tree, and a composite list of all defects in the
57 subtree is available through the 'all_defects' attribute of any node. (For
58 Terminal nodes, x.defects == x.all_defects.)
59
60 Each object in a parse tree is called a 'token', and each has a 'token_type'
61 attribute that gives the name from the RFC 5322 grammar that it represents.
62 Not all RFC 5322 nodes are produced, and there is one non-RFC 5322 node that
63 may be produced: 'ptext'. A 'ptext' is a string of printable ASCII characters.
64 It is returned in place of lists of (ctext/quoted-pair) and
65 (qtext/quoted-pair).
66
67 XXX: provide complete list of token types.
68 """
69
70 import re
71 import sys
72 import urllib  # For urllib.parse.unquote and urllib.parse.unquote_to_bytes
73 from string import hexdigits
74 from operator import itemgetter
75 from email import _encoded_words as _ew
76 from email import errors
77 from email import utils
78
79 #
80 # Useful constants and functions
81 #
82
83 WSP = set(' \t')
84 CFWS_LEADER = WSP | set('(')
85 SPECIALS = set(r'()<>@,:;.\"[]')
86 ATOM_ENDS = SPECIALS | WSP
87 DOT_ATOM_ENDS = ATOM_ENDS - set('.')
88 # '.', '"', and '(' do not end phrases in order to support obs-phrase
89 PHRASE_ENDS = SPECIALS - set('."(')
90 TSPECIALS = (SPECIALS | set('/?=')) - set('.')
91 TOKEN_ENDS = TSPECIALS | WSP
92 ASPECIALS = TSPECIALS | set("*'%")
93 ATTRIBUTE_ENDS = ASPECIALS | WSP
94 EXTENDED_ATTRIBUTE_ENDS = ATTRIBUTE_ENDS - set('%')
95
96 def quote_string(value):
97 return '"'+str(value).replace('\\', '\\\\').replace('"', r'\"')+'"'
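
# For example (written as Python literals), quote_string('say "hi"') returns
# '"say \\"hi\\""': the value is wrapped in DQUOTEs and any '"' or '\' inside
# it is escaped as a quoted-pair.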
98
99 # Match an RFC 2047 encoded word; it looks like =?utf-8?q?someword?=
100 rfc2047_matcher = re.compile(r'''
101 =\? # literal =?
102 [^?]* # charset
103 \? # literal ?
104 [qQbB] # literal 'q' or 'b', case insensitive
105 \? # literal ?
106 .*? # encoded word
107 \?= # literal ?=
108 ''', re.VERBOSE | re.MULTILINE)
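
# For example, rfc2047_matcher.search('foo=?utf-8?q?bar?=baz') matches the
# embedded '=?utf-8?q?bar?=' even though it is not delimited by whitespace;
# get_unstructured uses this to split such runs so the encoded word can be
# parsed and the missing-whitespace defect noted.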
109
110
111 #
112 # TokenList and its subclasses
113 #
114
115 class TokenList(list):
116
117 token_type = None
118 syntactic_break = True
119 ew_combine_allowed = True
120
121 def __init__(self, *args, **kw):
122 super().__init__(*args, **kw)
123 self.defects = []
124
125 def __str__(self):
126 return ''.join(str(x) for x in self)
127
128 def __repr__(self):
129 return '{}({})'.format(self.__class__.__name__,
130 super().__repr__())
131
132 @property
133 def value(self):
134 return ''.join(x.value for x in self if x.value)
135
136 @property
137 def all_defects(self):
138 return sum((x.all_defects for x in self), self.defects)
139
140 def startswith_fws(self):
141 return self[0].startswith_fws()
142
143 @property
144 def as_ew_allowed(self):
145 """True if all top level tokens of this part may be RFC2047 encoded."""
146 return all(part.as_ew_allowed for part in self)
147
148 @property
149 def comments(self):
150 comments = []
151 for token in self:
152 comments.extend(token.comments)
153 return comments
154
155 def fold(self, *, policy):
156 return _refold_parse_tree(self, policy=policy)
157
158 def pprint(self, indent=''):
159 print(self.ppstr(indent=indent))
160
161 def ppstr(self, indent=''):
162 return '\n'.join(self._pp(indent=indent))
163
164 def _pp(self, indent=''):
165 yield '{}{}/{}('.format(
166 indent,
167 self.__class__.__name__,
168 self.token_type)
169 for token in self:
170 if not hasattr(token, '_pp'):
171 yield (indent + ' !! invalid element in token '
172 'list: {!r}'.format(token))
173 else:
174 yield from token._pp(indent+' ')
175 if self.defects:
176 extra = ' Defects: {}'.format(self.defects)
177 else:
178 extra = ''
179 yield '{}){}'.format(indent, extra)
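
# An illustrative sketch of TokenList behaviour, using the Terminal
# subclasses defined further down (not executed here):
#
#     tl = TokenList([ValueTerminal('Hi', 'vtext'),
#                     WhiteSpaceTerminal(' \t ', 'fws'),
#                     ValueTerminal('there', 'vtext')])
#     str(tl)     # 'Hi \t there' -- concatenation of the parts
#     tl.value    # 'Hi there'    -- whitespace collapses to a single space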
180
181
182 class WhiteSpaceTokenList(TokenList):
183
184 @property
185 def value(self):
186 return ' '
187
188 @property
189 def comments(self):
190 return [x.content for x in self if x.token_type=='comment']
191
192
193 class UnstructuredTokenList(TokenList):
194 token_type = 'unstructured'
195
196
197 class Phrase(TokenList):
198 token_type = 'phrase'
199
200 class Word(TokenList):
201 token_type = 'word'
202
203
204 class CFWSList(WhiteSpaceTokenList):
205 token_type = 'cfws'
206
207
208 class Atom(TokenList):
209 token_type = 'atom'
210
211
212 class Token(TokenList):
213 token_type = 'token'
214 encode_as_ew = False
215
216
217 class EncodedWord(TokenList):
218 token_type = 'encoded-word'
219 cte = None
220 charset = None
221 lang = None
222
223
224 class QuotedString(TokenList):
225
226 token_type = 'quoted-string'
227
228 @property
229 def content(self):
230 for x in self:
231 if x.token_type == 'bare-quoted-string':
232 return x.value
233
234 @property
235 def quoted_value(self):
236 res = []
237 for x in self:
238 if x.token_type == 'bare-quoted-string':
239 res.append(str(x))
240 else:
241 res.append(x.value)
242 return ''.join(res)
243
244 @property
245 def stripped_value(self):
246 for token in self:
247 if token.token_type == 'bare-quoted-string':
248 return token.value
249
250
251 class BareQuotedString(QuotedString):
252
253 token_type = 'bare-quoted-string'
254
255 def __str__(self):
256 return quote_string(''.join(str(x) for x in self))
257
258 @property
259 def value(self):
260 return ''.join(str(x) for x in self)
261
262
263 class Comment(WhiteSpaceTokenList):
264
265 token_type = 'comment'
266
267 def __str__(self):
268 return ''.join(sum([
269 ["("],
270 [self.quote(x) for x in self],
271 [")"],
272 ], []))
273
274 def quote(self, value):
275 if value.token_type == 'comment':
276 return str(value)
277 return str(value).replace('\\', '\\\\').replace(
278 '(', r'\(').replace(
279 ')', r'\)')
280
281 @property
282 def content(self):
283 return ''.join(str(x) for x in self)
284
285 @property
286 def comments(self):
287 return [self.content]
288
289 class AddressList(TokenList):
290
291 token_type = 'address-list'
292
293 @property
294 def addresses(self):
295 return [x for x in self if x.token_type=='address']
296
297 @property
298 def mailboxes(self):
299 return sum((x.mailboxes
300 for x in self if x.token_type=='address'), [])
301
302 @property
303 def all_mailboxes(self):
304 return sum((x.all_mailboxes
305 for x in self if x.token_type=='address'), [])
306
307
308 class Address(TokenList):
309
310 token_type = 'address'
311
312 @property
313 def display_name(self):
314 if self[0].token_type == 'group':
315 return self[0].display_name
316
317 @property
318 def mailboxes(self):
319 if self[0].token_type == 'mailbox':
320 return [self[0]]
321 elif self[0].token_type == 'invalid-mailbox':
322 return []
323 return self[0].mailboxes
324
325 @property
326 def all_mailboxes(self):
327 if self[0].token_type == 'mailbox':
328 return [self[0]]
329 elif self[0].token_type == 'invalid-mailbox':
330 return [self[0]]
331 return self[0].all_mailboxes
332
333 class MailboxList(TokenList):
334
335 token_type = 'mailbox-list'
336
337 @property
338 def mailboxes(self):
339 return [x for x in self if x.token_type=='mailbox']
340
341 @property
342 def all_mailboxes(self):
343 return [x for x in self
344 if x.token_type in ('mailbox', 'invalid-mailbox')]
345
346
347 class GroupList(TokenList):
348
349 token_type = 'group-list'
350
351 @property
352 def mailboxes(self):
353 if not self or self[0].token_type != 'mailbox-list':
354 return []
355 return self[0].mailboxes
356
357 @property
358 def all_mailboxes(self):
359 if not self or self[0].token_type != 'mailbox-list':
360 return []
361 return self[0].all_mailboxes
362
363
364 class Group(TokenList):
365
366 token_type = "group"
367
368 @property
369 def mailboxes(self):
370 if self[2].token_type != 'group-list':
371 return []
372 return self[2].mailboxes
373
374 @property
375 def all_mailboxes(self):
376 if self[2].token_type != 'group-list':
377 return []
378 return self[2].all_mailboxes
379
380 @property
381 def display_name(self):
382 return self[0].display_name
383
384
385 class NameAddr(TokenList):
386
387 token_type = 'name-addr'
388
389 @property
390 def display_name(self):
391 if len(self) == 1:
392 return None
393 return self[0].display_name
394
395 @property
396 def local_part(self):
397 return self[-1].local_part
398
399 @property
400 def domain(self):
401 return self[-1].domain
402
403 @property
404 def route(self):
405 return self[-1].route
406
407 @property
408 def addr_spec(self):
409 return self[-1].addr_spec
410
411
412 class AngleAddr(TokenList):
413
414 token_type = 'angle-addr'
415
416 @property
417 def local_part(self):
418 for x in self:
419 if x.token_type == 'addr-spec':
420 return x.local_part
421
422 @property
423 def domain(self):
424 for x in self:
425 if x.token_type == 'addr-spec':
426 return x.domain
427
428 @property
429 def route(self):
430 for x in self:
431 if x.token_type == 'obs-route':
432 return x.domains
433
434 @property
435 def addr_spec(self):
436 for x in self:
437 if x.token_type == 'addr-spec':
438 if x.local_part:
439 return x.addr_spec
440 else:
441 return quote_string(x.local_part) + x.addr_spec
442 else:
443 return '<>'
444
445
446 class ObsRoute(TokenList):
447
448 token_type = 'obs-route'
449
450 @property
451 def domains(self):
452 return [x.domain for x in self if x.token_type == 'domain']
453
454
455 class Mailbox(TokenList):
456
457 token_type = 'mailbox'
458
459 @property
460 def display_name(self):
461 if self[0].token_type == 'name-addr':
462 return self[0].display_name
463
464 @property
465 def local_part(self):
466 return self[0].local_part
467
468 @property
469 def domain(self):
470 return self[0].domain
471
472 @property
473 def route(self):
474 if self[0].token_type == 'name-addr':
475 return self[0].route
476
477 @property
478 def addr_spec(self):
479 return self[0].addr_spec
480
481
482 class InvalidMailbox(TokenList):
483
484 token_type = 'invalid-mailbox'
485
486 @property
487 def display_name(self):
488 return None
489
490 local_part = domain = route = addr_spec = display_name
491
492
493 class Domain(TokenList):
494
495 token_type = 'domain'
496 as_ew_allowed = False
497
498 @property
499 def domain(self):
500 return ''.join(super().value.split())
501
502
503 class DotAtom(TokenList):
504 token_type = 'dot-atom'
505
506
507 class DotAtomText(TokenList):
508 token_type = 'dot-atom-text'
509 as_ew_allowed = True
510
511
512 class NoFoldLiteral(TokenList):
513 token_type = 'no-fold-literal'
514 as_ew_allowed = False
515
516
517 class AddrSpec(TokenList):
518
519 token_type = 'addr-spec'
520 as_ew_allowed = False
521
522 @property
523 def local_part(self):
524 return self[0].local_part
525
526 @property
527 def domain(self):
528 if len(self) < 3:
529 return None
530 return self[-1].domain
531
532 @property
533 def value(self):
534 if len(self) < 3:
535 return self[0].value
536 return self[0].value.rstrip()+self[1].value+self[2].value.lstrip()
537
538 @property
539 def addr_spec(self):
540 nameset = set(self.local_part)
541 if len(nameset) > len(nameset-DOT_ATOM_ENDS):
542 lp = quote_string(self.local_part)
543 else:
544 lp = self.local_part
545 if self.domain is not None:
546 return lp + '@' + self.domain
547 return lp
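
# Illustrative note: addr_spec above re-quotes the local part only when it
# contains characters from DOT_ATOM_ENDS, so a local part of 'john doe'
# (which contains a space) comes back as '"john doe"@example.com', while a
# plain 'john.doe' is passed through unquoted.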
548
549
550 class ObsLocalPart(TokenList):
551
552 token_type = 'obs-local-part'
553 as_ew_allowed = False
554
555
556 class DisplayName(Phrase):
557
558 token_type = 'display-name'
559 ew_combine_allowed = False
560
561 @property
562 def display_name(self):
563 res = TokenList(self)
564 if len(res) == 0:
565 return res.value
566 if res[0].token_type == 'cfws':
567 res.pop(0)
568 else:
569 if res[0][0].token_type == 'cfws':
570 res[0] = TokenList(res[0][1:])
571 if res[-1].token_type == 'cfws':
572 res.pop()
573 else:
574 if res[-1][-1].token_type == 'cfws':
575 res[-1] = TokenList(res[-1][:-1])
576 return res.value
577
578 @property
579 def value(self):
580 quote = False
581 if self.defects:
582 quote = True
583 else:
584 for x in self:
585 if x.token_type == 'quoted-string':
586 quote = True
587 if len(self) != 0 and quote:
588 pre = post = ''
589 if self[0].token_type=='cfws' or self[0][0].token_type=='cfws':
590 pre = ' '
591 if self[-1].token_type=='cfws' or self[-1][-1].token_type=='cfws':
592 post = ' '
593 return pre+quote_string(self.display_name)+post
594 else:
595 return super().value
596
597
598 class LocalPart(TokenList):
599
600 token_type = 'local-part'
601 as_ew_allowed = False
602
603 @property
604 def value(self):
605 if self[0].token_type == "quoted-string":
606 return self[0].quoted_value
607 else:
608 return self[0].value
609
610 @property
611 def local_part(self):
612 # Strip whitespace from front, back, and around dots.
613 res = [DOT]
614 last = DOT
615 last_is_tl = False
616 for tok in self[0] + [DOT]:
617 if tok.token_type == 'cfws':
618 continue
619 if (last_is_tl and tok.token_type == 'dot' and
620 last[-1].token_type == 'cfws'):
621 res[-1] = TokenList(last[:-1])
622 is_tl = isinstance(tok, TokenList)
623 if (is_tl and last.token_type == 'dot' and
624 tok[0].token_type == 'cfws'):
625 res.append(TokenList(tok[1:]))
626 else:
627 res.append(tok)
628 last = res[-1]
629 last_is_tl = is_tl
630 res = TokenList(res[1:-1])
631 return res.value
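
# Illustrative note: the stripping above means an obsolete local part written
# as 'foo . bar' yields a local_part value of 'foo.bar' -- the CFWS around
# the dots (and at either end) is dropped rather than collapsed to a space.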
632
633
634 class DomainLiteral(TokenList):
635
636 token_type = 'domain-literal'
637 as_ew_allowed = False
638
639 @property
640 def domain(self):
641 return ''.join(super().value.split())
642
643 @property
644 def ip(self):
645 for x in self:
646 if x.token_type == 'ptext':
647 return x.value
648
649
650 class MIMEVersion(TokenList):
651
652 token_type = 'mime-version'
653 major = None
654 minor = None
655
656
657 class Parameter(TokenList):
658
659 token_type = 'parameter'
660 sectioned = False
661 extended = False
662 charset = 'us-ascii'
663
664 @property
665 def section_number(self):
666         # Because the first token, the attribute (name), eats CFWS, the second
667 # token is always the section if there is one.
668 return self[1].number if self.sectioned else 0
669
670 @property
671 def param_value(self):
672 # This is part of the "handle quoted extended parameters" hack.
673 for token in self:
674 if token.token_type == 'value':
675 return token.stripped_value
676 if token.token_type == 'quoted-string':
677 for token in token:
678 if token.token_type == 'bare-quoted-string':
679 for token in token:
680 if token.token_type == 'value':
681 return token.stripped_value
682 return ''
683
684
685 class InvalidParameter(Parameter):
686
687 token_type = 'invalid-parameter'
688
689
690 class Attribute(TokenList):
691
692 token_type = 'attribute'
693
694 @property
695 def stripped_value(self):
696 for token in self:
697 if token.token_type.endswith('attrtext'):
698 return token.value
699
700 class Section(TokenList):
701
702 token_type = 'section'
703 number = None
704
705
706 class Value(TokenList):
707
708 token_type = 'value'
709
710 @property
711 def stripped_value(self):
712 token = self[0]
713 if token.token_type == 'cfws':
714 token = self[1]
715 if token.token_type.endswith(
716 ('quoted-string', 'attribute', 'extended-attribute')):
717 return token.stripped_value
718 return self.value
719
720
721 class MimeParameters(TokenList):
722
723 token_type = 'mime-parameters'
724 syntactic_break = False
725
726 @property
727 def params(self):
728 # The RFC specifically states that the ordering of parameters is not
729 # guaranteed and may be reordered by the transport layer. So we have
730 # to assume the RFC 2231 pieces can come in any order. However, we
731 # output them in the order that we first see a given name, which gives
732 # us a stable __str__.
733 params = {} # Using order preserving dict from Python 3.7+
734 for token in self:
735 if not token.token_type.endswith('parameter'):
736 continue
737 if token[0].token_type != 'attribute':
738 continue
739 name = token[0].value.strip()
740 if name not in params:
741 params[name] = []
742 params[name].append((token.section_number, token))
743 for name, parts in params.items():
744 parts = sorted(parts, key=itemgetter(0))
745 first_param = parts[0][1]
746 charset = first_param.charset
747 # Our arbitrary error recovery is to ignore duplicate parameters,
748 # to use appearance order if there are duplicate rfc 2231 parts,
749 # and to ignore gaps. This mimics the error recovery of get_param.
750 if not first_param.extended and len(parts) > 1:
751 if parts[1][0] == 0:
752 parts[1][1].defects.append(errors.InvalidHeaderDefect(
753 'duplicate parameter name; duplicate(s) ignored'))
754 parts = parts[:1]
755 # Else assume the *0* was missing...note that this is different
756 # from get_param, but we registered a defect for this earlier.
757 value_parts = []
758 i = 0
759 for section_number, param in parts:
760 if section_number != i:
761 # We could get fancier here and look for a complete
762 # duplicate extended parameter and ignore the second one
763 # seen. But we're not doing that. The old code didn't.
764 if not param.extended:
765 param.defects.append(errors.InvalidHeaderDefect(
766 'duplicate parameter name; duplicate ignored'))
767 continue
768 else:
769 param.defects.append(errors.InvalidHeaderDefect(
770 "inconsistent RFC2231 parameter numbering"))
771 i += 1
772 value = param.param_value
773 if param.extended:
774 try:
775 value = urllib.parse.unquote_to_bytes(value)
776 except UnicodeEncodeError:
777 # source had surrogate escaped bytes. What we do now
778 # is a bit of an open question. I'm not sure this is
779 # the best choice, but it is what the old algorithm did
780 value = urllib.parse.unquote(value, encoding='latin-1')
781 else:
782 try:
783 value = value.decode(charset, 'surrogateescape')
784 except (LookupError, UnicodeEncodeError):
785 # XXX: there should really be a custom defect for
786 # unknown character set to make it easy to find,
787 # because otherwise unknown charset is a silent
788 # failure.
789 value = value.decode('us-ascii', 'surrogateescape')
790 if utils._has_surrogates(value):
791 param.defects.append(errors.UndecodableBytesDefect())
792 value_parts.append(value)
793 value = ''.join(value_parts)
794 yield name, value
795
796 def __str__(self):
797 params = []
798 for name, value in self.params:
799 if value:
800 params.append('{}={}'.format(name, quote_string(value)))
801 else:
802 params.append(name)
803 params = '; '.join(params)
804 return ' ' + params if params else ''
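
# Illustrative sketch: given parameters parsed from something like
#
#     title*0*=us-ascii'en'This%20is; title*1*=%20a%20test
#
# the params property above reassembles the RFC 2231 sections in numeric
# order and percent-decodes them, yielding the single pair
# ('title', 'This is a test').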
805
806
807 class ParameterizedHeaderValue(TokenList):
808
809 # Set this false so that the value doesn't wind up on a new line even
810 # if it and the parameters would fit there but not on the first line.
811 syntactic_break = False
812
813 @property
814 def params(self):
815 for token in reversed(self):
816 if token.token_type == 'mime-parameters':
817 return token.params
818 return {}
819
820
821 class ContentType(ParameterizedHeaderValue):
822 token_type = 'content-type'
823 as_ew_allowed = False
824 maintype = 'text'
825 subtype = 'plain'
826
827
828 class ContentDisposition(ParameterizedHeaderValue):
829 token_type = 'content-disposition'
830 as_ew_allowed = False
831 content_disposition = None
832
833
834 class ContentTransferEncoding(TokenList):
835 token_type = 'content-transfer-encoding'
836 as_ew_allowed = False
837 cte = '7bit'
838
839
840 class HeaderLabel(TokenList):
841 token_type = 'header-label'
842 as_ew_allowed = False
843
844
845 class MsgID(TokenList):
846 token_type = 'msg-id'
847 as_ew_allowed = False
848
849 def fold(self, policy):
850 # message-id tokens may not be folded.
851 return str(self) + policy.linesep
852
853
854 class MessageID(MsgID):
855 token_type = 'message-id'
856
857
858 class InvalidMessageID(MessageID):
859 token_type = 'invalid-message-id'
860
861
862 class Header(TokenList):
863 token_type = 'header'
864
865
866 #
867 # Terminal classes and instances
868 #
869
870 class Terminal(str):
871
872 as_ew_allowed = True
873 ew_combine_allowed = True
874 syntactic_break = True
875
876 def __new__(cls, value, token_type):
877 self = super().__new__(cls, value)
878 self.token_type = token_type
879 self.defects = []
880 return self
881
882 def __repr__(self):
883 return "{}({})".format(self.__class__.__name__, super().__repr__())
884
885 def pprint(self):
886 print(self.__class__.__name__ + '/' + self.token_type)
887
888 @property
889 def all_defects(self):
890 return list(self.defects)
891
892 def _pp(self, indent=''):
893 return ["{}{}/{}({}){}".format(
894 indent,
895 self.__class__.__name__,
896 self.token_type,
897 super().__repr__(),
898 '' if not self.defects else ' {}'.format(self.defects),
899 )]
900
901 def pop_trailing_ws(self):
902 # This terminates the recursion.
903 return None
904
905 @property
906 def comments(self):
907 return []
908
909 def __getnewargs__(self):
910 return(str(self), self.token_type)
911
912
913 class WhiteSpaceTerminal(Terminal):
914
915 @property
916 def value(self):
917 return ' '
918
919 def startswith_fws(self):
920 return True
921
922
923 class ValueTerminal(Terminal):
924
925 @property
926 def value(self):
927 return self
928
929 def startswith_fws(self):
930 return False
931
932
933 class EWWhiteSpaceTerminal(WhiteSpaceTerminal):
934
935 @property
936 def value(self):
937 return ''
938
939 def __str__(self):
940 return ''
941
942
943 class _InvalidEwError(errors.HeaderParseError):
944 """Invalid encoded word found while parsing headers."""
945
946
947 # XXX these need to become classes and used as instances so
948 # that a program can't change them in a parse tree and screw
949 # up other parse trees. Maybe should have tests for that, too.
950 DOT = ValueTerminal('.', 'dot')
951 ListSeparator = ValueTerminal(',', 'list-separator')
952 RouteComponentMarker = ValueTerminal('@', 'route-component-marker')
953
954 #
955 # Parser
956 #
957
958 # Parse strings according to RFC822/2047/2822/5322 rules.
959 #
960 # This is a stateless parser. Each get_XXX function accepts a string and
961 # returns either a Terminal or a TokenList representing the RFC object named
962 # by the method and a string containing the remaining unparsed characters
963 # from the input. Thus a parser method consumes the next syntactic construct
964 # of a given type and returns a token representing the construct plus the
965 # unparsed remainder of the input string.
966 #
967 # For example, if the first element of a structured header is a 'phrase',
968 # then:
969 #
970 # phrase, value = get_phrase(value)
971 #
972 # returns the complete phrase from the start of the string value, plus any
973 # characters left in the string after the phrase is removed.
974
975 _wsp_splitter = re.compile(r'([{}]+)'.format(''.join(WSP))).split
976 _non_atom_end_matcher = re.compile(r"[^{}]+".format(
977 re.escape(''.join(ATOM_ENDS)))).match
978 _non_printable_finder = re.compile(r"[\x00-\x20\x7F]").findall
979 _non_token_end_matcher = re.compile(r"[^{}]+".format(
980 re.escape(''.join(TOKEN_ENDS)))).match
981 _non_attribute_end_matcher = re.compile(r"[^{}]+".format(
982 re.escape(''.join(ATTRIBUTE_ENDS)))).match
983 _non_extended_attribute_end_matcher = re.compile(r"[^{}]+".format(
984 re.escape(''.join(EXTENDED_ATTRIBUTE_ENDS)))).match
985
986 def _validate_xtext(xtext):
987 """If input token contains ASCII non-printables, register a defect."""
988
989 non_printables = _non_printable_finder(xtext)
990 if non_printables:
991 xtext.defects.append(errors.NonPrintableDefect(non_printables))
992 if utils._has_surrogates(xtext):
993 xtext.defects.append(errors.UndecodableBytesDefect(
994 "Non-ASCII characters found in header token"))
995
996 def _get_ptext_to_endchars(value, endchars):
997 """Scan printables/quoted-pairs until endchars and return unquoted ptext.
998
999 This function turns a run of qcontent, ccontent-without-comments, or
1000     dtext-with-quoted-pairs into a single string by unquoting any
1001     quoted pairs. It returns the string, the remaining value, and
1002     a flag that is True iff there were any quoted pairs decoded.
1003
1004 """
1005 fragment, *remainder = _wsp_splitter(value, 1)
1006 vchars = []
1007 escape = False
1008 had_qp = False
1009 for pos in range(len(fragment)):
1010 if fragment[pos] == '\\':
1011 if escape:
1012 escape = False
1013 had_qp = True
1014 else:
1015 escape = True
1016 continue
1017 if escape:
1018 escape = False
1019 elif fragment[pos] in endchars:
1020 break
1021 vchars.append(fragment[pos])
1022 else:
1023 pos = pos + 1
1024 return ''.join(vchars), ''.join([fragment[pos:]] + remainder), had_qp
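
# Illustrative example: _get_ptext_to_endchars('abc) def', ')') returns
# ('abc', ') def', False) -- scanning stops at the first unquoted endchar,
# which is left at the front of the remaining value.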
1025
1026 def get_fws(value):
1027 """FWS = 1*WSP
1028
1029 This isn't the RFC definition. We're using fws to represent tokens where
1030 folding can be done, but when we are parsing the *un*folding has already
1031 been done so we don't need to watch out for CRLF.
1032
1033 """
1034 newvalue = value.lstrip()
1035 fws = WhiteSpaceTerminal(value[:len(value)-len(newvalue)], 'fws')
1036 return fws, newvalue
1037
1038 def get_encoded_word(value):
1039 """ encoded-word = "=?" charset "?" encoding "?" encoded-text "?="
1040
1041 """
1042 ew = EncodedWord()
1043 if not value.startswith('=?'):
1044 raise errors.HeaderParseError(
1045 "expected encoded word but found {}".format(value))
1046 tok, *remainder = value[2:].split('?=', 1)
1047 if tok == value[2:]:
1048 raise errors.HeaderParseError(
1049 "expected encoded word but found {}".format(value))
1050 remstr = ''.join(remainder)
1051 if (len(remstr) > 1 and
1052 remstr[0] in hexdigits and
1053 remstr[1] in hexdigits and
1054 tok.count('?') < 2):
1055 # The ? after the CTE was followed by an encoded word escape (=XX).
1056 rest, *remainder = remstr.split('?=', 1)
1057 tok = tok + '?=' + rest
1058 if len(tok.split()) > 1:
1059 ew.defects.append(errors.InvalidHeaderDefect(
1060 "whitespace inside encoded word"))
1061 ew.cte = value
1062 value = ''.join(remainder)
1063 try:
1064 text, charset, lang, defects = _ew.decode('=?' + tok + '?=')
1065 except (ValueError, KeyError):
1066 raise _InvalidEwError(
1067 "encoded word format invalid: '{}'".format(ew.cte))
1068 ew.charset = charset
1069 ew.lang = lang
1070 ew.defects.extend(defects)
1071 while text:
1072 if text[0] in WSP:
1073 token, text = get_fws(text)
1074 ew.append(token)
1075 continue
1076 chars, *remainder = _wsp_splitter(text, 1)
1077 vtext = ValueTerminal(chars, 'vtext')
1078 _validate_xtext(vtext)
1079 ew.append(vtext)
1080 text = ''.join(remainder)
1081 # Encoded words should be followed by a WS
1082 if value and value[0] not in WSP:
1083 ew.defects.append(errors.InvalidHeaderDefect(
1084 "missing trailing whitespace after encoded-word"))
1085 return ew, value
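
# Illustrative example: get_encoded_word('=?utf-8?q?caf=C3=A9?= tail') returns
# an EncodedWord whose value is 'café' and whose charset is 'utf-8', together
# with the unparsed remainder ' tail'.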
1086
1087 def get_unstructured(value):
1088 """unstructured = (*([FWS] vchar) *WSP) / obs-unstruct
1089 obs-unstruct = *((*LF *CR *(obs-utext) *LF *CR)) / FWS)
1090 obs-utext = %d0 / obs-NO-WS-CTL / LF / CR
1091
1092 obs-NO-WS-CTL is control characters except WSP/CR/LF.
1093
1094 So, basically, we have printable runs, plus control characters or nulls in
1095 the obsolete syntax, separated by whitespace. Since RFC 2047 uses the
1096 obsolete syntax in its specification, but requires whitespace on either
1097 side of the encoded words, I can see no reason to need to separate the
1098 non-printable-non-whitespace from the printable runs if they occur, so we
1099 parse this into xtext tokens separated by WSP tokens.
1100
1101 Because an 'unstructured' value must by definition constitute the entire
1102 value, this 'get' routine does not return a remaining value, only the
1103 parsed TokenList.
1104
1105 """
1106 # XXX: but what about bare CR and LF? They might signal the start or
1107 # end of an encoded word. YAGNI for now, since our current parsers
1108 # will never send us strings with bare CR or LF.
1109
1110 unstructured = UnstructuredTokenList()
1111 while value:
1112 if value[0] in WSP:
1113 token, value = get_fws(value)
1114 unstructured.append(token)
1115 continue
1116 valid_ew = True
1117 if value.startswith('=?'):
1118 try:
1119 token, value = get_encoded_word(value)
1120 except _InvalidEwError:
1121 valid_ew = False
1122 except errors.HeaderParseError:
1123 # XXX: Need to figure out how to register defects when
1124 # appropriate here.
1125 pass
1126 else:
1127 have_ws = True
1128 if len(unstructured) > 0:
1129 if unstructured[-1].token_type != 'fws':
1130 unstructured.defects.append(errors.InvalidHeaderDefect(
1131 "missing whitespace before encoded word"))
1132 have_ws = False
1133 if have_ws and len(unstructured) > 1:
1134 if unstructured[-2].token_type == 'encoded-word':
1135 unstructured[-1] = EWWhiteSpaceTerminal(
1136 unstructured[-1], 'fws')
1137 unstructured.append(token)
1138 continue
1139 tok, *remainder = _wsp_splitter(value, 1)
1140 # Split in the middle of an atom if there is a rfc2047 encoded word
1141 # which does not have WSP on both sides. The defect will be registered
1142 # the next time through the loop.
1143 # This needs to only be performed when the encoded word is valid;
1144 # otherwise, performing it on an invalid encoded word can cause
1145 # the parser to go in an infinite loop.
1146 if valid_ew and rfc2047_matcher.search(tok):
1147 tok, *remainder = value.partition('=?')
1148 vtext = ValueTerminal(tok, 'vtext')
1149 _validate_xtext(vtext)
1150 unstructured.append(vtext)
1151 value = ''.join(remainder)
1152 return unstructured
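
# Illustrative example: get_unstructured('Hello =?utf-8?q?world?=!') decodes
# the encoded word even though it is not followed by whitespace, and the
# encoded-word token carries an InvalidHeaderDefect noting the missing
# trailing whitespace.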
1153
1154 def get_qp_ctext(value):
1155 r"""ctext = <printable ascii except \ ( )>
1156
1157 This is not the RFC ctext, since we are handling nested comments in comment
1158 and unquoting quoted-pairs here. We allow anything except the '()'
1159 characters, but if we find any ASCII other than the RFC defined printable
1160 ASCII, a NonPrintableDefect is added to the token's defects list. Since
1161 quoted pairs are converted to their unquoted values, what is returned is
1162     a 'ptext' token. In this case it is a WhiteSpaceTerminal, so its value
1163 is ' '.
1164
1165 """
1166 ptext, value, _ = _get_ptext_to_endchars(value, '()')
1167 ptext = WhiteSpaceTerminal(ptext, 'ptext')
1168 _validate_xtext(ptext)
1169 return ptext, value
1170
1171 def get_qcontent(value):
1172 """qcontent = qtext / quoted-pair
1173
1174 We allow anything except the DQUOTE character, but if we find any ASCII
1175 other than the RFC defined printable ASCII, a NonPrintableDefect is
1176 added to the token's defects list. Any quoted pairs are converted to their
1177 unquoted values, so what is returned is a 'ptext' token. In this case it
1178 is a ValueTerminal.
1179
1180 """
1181 ptext, value, _ = _get_ptext_to_endchars(value, '"')
1182 ptext = ValueTerminal(ptext, 'ptext')
1183 _validate_xtext(ptext)
1184 return ptext, value
1185
1186 def get_atext(value):
1187 """atext = <matches _atext_matcher>
1188
1189 We allow any non-ATOM_ENDS in atext, but add an InvalidATextDefect to
1190 the token's defects list if we find non-atext characters.
1191 """
1192 m = _non_atom_end_matcher(value)
1193 if not m:
1194 raise errors.HeaderParseError(
1195 "expected atext but found '{}'".format(value))
1196 atext = m.group()
1197 value = value[len(atext):]
1198 atext = ValueTerminal(atext, 'atext')
1199 _validate_xtext(atext)
1200 return atext, value
1201
1202 def get_bare_quoted_string(value):
1203 """bare-quoted-string = DQUOTE *([FWS] qcontent) [FWS] DQUOTE
1204
1205 A quoted-string without the leading or trailing white space. Its
1206 value is the text between the quote marks, with whitespace
1207 preserved and quoted pairs decoded.
1208 """
1209 if value[0] != '"':
1210 raise errors.HeaderParseError(
1211 "expected '\"' but found '{}'".format(value))
1212 bare_quoted_string = BareQuotedString()
1213 value = value[1:]
1214 if value and value[0] == '"':
1215 token, value = get_qcontent(value)
1216 bare_quoted_string.append(token)
1217 while value and value[0] != '"':
1218 if value[0] in WSP:
1219 token, value = get_fws(value)
1220 elif value[:2] == '=?':
1221 valid_ew = False
1222 try:
1223 token, value = get_encoded_word(value)
1224 bare_quoted_string.defects.append(errors.InvalidHeaderDefect(
1225 "encoded word inside quoted string"))
1226 valid_ew = True
1227 except errors.HeaderParseError:
1228 token, value = get_qcontent(value)
1229 # Collapse the whitespace between two encoded words that occur in a
1230 # bare-quoted-string.
1231 if valid_ew and len(bare_quoted_string) > 1:
1232 if (bare_quoted_string[-1].token_type == 'fws' and
1233 bare_quoted_string[-2].token_type == 'encoded-word'):
1234 bare_quoted_string[-1] = EWWhiteSpaceTerminal(
1235 bare_quoted_string[-1], 'fws')
1236 else:
1237 token, value = get_qcontent(value)
1238 bare_quoted_string.append(token)
1239 if not value:
1240 bare_quoted_string.defects.append(errors.InvalidHeaderDefect(
1241 "end of header inside quoted string"))
1242 return bare_quoted_string, value
1243 return bare_quoted_string, value[1:]
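
# Illustrative example: for the header text  "a \" b" tail  (a quoted string
# containing a quoted-pair), get_bare_quoted_string returns a BareQuotedString
# whose value is  a " b  -- the quoted-pair is decoded and the interior
# whitespace preserved -- along with the remaining text ' tail'.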
1244
1245 def get_comment(value):
1246 """comment = "(" *([FWS] ccontent) [FWS] ")"
1247 ccontent = ctext / quoted-pair / comment
1248
1249 We handle nested comments here, and quoted-pair in our qp-ctext routine.
1250 """
1251 if value and value[0] != '(':
1252 raise errors.HeaderParseError(
1253 "expected '(' but found '{}'".format(value))
1254 comment = Comment()
1255 value = value[1:]
1256 while value and value[0] != ")":
1257 if value[0] in WSP:
1258 token, value = get_fws(value)
1259 elif value[0] == '(':
1260 token, value = get_comment(value)
1261 else:
1262 token, value = get_qp_ctext(value)
1263 comment.append(token)
1264 if not value:
1265 comment.defects.append(errors.InvalidHeaderDefect(
1266 "end of header inside comment"))
1267 return comment, value
1268 return comment, value[1:]
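
# Illustrative example: get_comment('(a (nested) comment) x') returns a
# Comment token whose content attribute is 'a (nested) comment', whose value
# (like all whitespace tokens) is ' ', and the remainder ' x'.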
1269
1270 def get_cfws(value):
1271 """CFWS = (1*([FWS] comment) [FWS]) / FWS
1272
1273 """
1274 cfws = CFWSList()
1275 while value and value[0] in CFWS_LEADER:
1276 if value[0] in WSP:
1277 token, value = get_fws(value)
1278 else:
1279 token, value = get_comment(value)
1280 cfws.append(token)
1281 return cfws, value
1282
1283 def get_quoted_string(value):
1284 """quoted-string = [CFWS] <bare-quoted-string> [CFWS]
1285
1286 'bare-quoted-string' is an intermediate class defined by this
1287 parser and not by the RFC grammar. It is the quoted string
1288 without any attached CFWS.
1289 """
1290 quoted_string = QuotedString()
1291 if value and value[0] in CFWS_LEADER:
1292 token, value = get_cfws(value)
1293 quoted_string.append(token)
1294 token, value = get_bare_quoted_string(value)
1295 quoted_string.append(token)
1296 if value and value[0] in CFWS_LEADER:
1297 token, value = get_cfws(value)
1298 quoted_string.append(token)
1299 return quoted_string, value
1300
1301 def get_atom(value):
1302 """atom = [CFWS] 1*atext [CFWS]
1303
1304 An atom could be an rfc2047 encoded word.
1305 """
1306 atom = Atom()
1307 if value and value[0] in CFWS_LEADER:
1308 token, value = get_cfws(value)
1309 atom.append(token)
1310 if value and value[0] in ATOM_ENDS:
1311 raise errors.HeaderParseError(
1312 "expected atom but found '{}'".format(value))
1313 if value.startswith('=?'):
1314 try:
1315 token, value = get_encoded_word(value)
1316 except errors.HeaderParseError:
1317 # XXX: need to figure out how to register defects when
1318 # appropriate here.
1319 token, value = get_atext(value)
1320 else:
1321 token, value = get_atext(value)
1322 atom.append(token)
1323 if value and value[0] in CFWS_LEADER:
1324 token, value = get_cfws(value)
1325 atom.append(token)
1326 return atom, value
1327
1328 def get_dot_atom_text(value):
1329     """ dot-atom-text = 1*atext *("." 1*atext)
1330
1331 """
1332 dot_atom_text = DotAtomText()
1333 if not value or value[0] in ATOM_ENDS:
1334         raise errors.HeaderParseError("expected atom at the start of "
1335 "dot-atom-text but found '{}'".format(value))
1336 while value and value[0] not in ATOM_ENDS:
1337 token, value = get_atext(value)
1338 dot_atom_text.append(token)
1339 if value and value[0] == '.':
1340 dot_atom_text.append(DOT)
1341 value = value[1:]
1342 if dot_atom_text[-1] is DOT:
1343 raise errors.HeaderParseError("expected atom at end of dot-atom-text "
1344 "but found '{}'".format('.'+value))
1345 return dot_atom_text, value
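
# Illustrative example: get_dot_atom_text('foo.bar.baz rest') returns a
# DotAtomText that stringifies to 'foo.bar.baz' plus the remainder ' rest';
# a leading or trailing dot ('.foo' or 'foo.') raises HeaderParseError.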
1346
1347 def get_dot_atom(value):
1348 """ dot-atom = [CFWS] dot-atom-text [CFWS]
1349
1350 Any place we can have a dot atom, we could instead have an rfc2047 encoded
1351 word.
1352 """
1353 dot_atom = DotAtom()
1354 if value[0] in CFWS_LEADER:
1355 token, value = get_cfws(value)
1356 dot_atom.append(token)
1357 if value.startswith('=?'):
1358 try:
1359 token, value = get_encoded_word(value)
1360 except errors.HeaderParseError:
1361 # XXX: need to figure out how to register defects when
1362 # appropriate here.
1363 token, value = get_dot_atom_text(value)
1364 else:
1365 token, value = get_dot_atom_text(value)
1366 dot_atom.append(token)
1367 if value and value[0] in CFWS_LEADER:
1368 token, value = get_cfws(value)
1369 dot_atom.append(token)
1370 return dot_atom, value
1371
1372 def get_word(value):
1373 """word = atom / quoted-string
1374
1375 Either atom or quoted-string may start with CFWS. We have to peel off this
1376 CFWS first to determine which type of word to parse. Afterward we splice
1377 the leading CFWS, if any, into the parsed sub-token.
1378
1379 If neither an atom or a quoted-string is found before the next special, a
1380 HeaderParseError is raised.
1381
1382 The token returned is either an Atom or a QuotedString, as appropriate.
1383 This means the 'word' level of the formal grammar is not represented in the
1384 parse tree; this is because having that extra layer when manipulating the
1385 parse tree is more confusing than it is helpful.
1386
1387 """
1388 if value[0] in CFWS_LEADER:
1389 leader, value = get_cfws(value)
1390 else:
1391 leader = None
1392 if not value:
1393 raise errors.HeaderParseError(
1394 "Expected 'atom' or 'quoted-string' but found nothing.")
1395 if value[0]=='"':
1396 token, value = get_quoted_string(value)
1397 elif value[0] in SPECIALS:
1398 raise errors.HeaderParseError("Expected 'atom' or 'quoted-string' "
1399 "but found '{}'".format(value))
1400 else:
1401 token, value = get_atom(value)
1402 if leader is not None:
1403 token[:0] = [leader]
1404 return token, value
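
# Illustrative example: get_word(' (note) "Fred Smith" rest') returns a
# QuotedString with the leading CFWS spliced in; its content is 'Fred Smith'
# and the remainder is 'rest' (the space after the closing DQUOTE is consumed
# as the quoted-string's trailing CFWS).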
1405
1406 def get_phrase(value):
1407 """ phrase = 1*word / obs-phrase
1408 obs-phrase = word *(word / "." / CFWS)
1409
1410 This means a phrase can be a sequence of words, periods, and CFWS in any
1411 order as long as it starts with at least one word. If anything other than
1412 words is detected, an ObsoleteHeaderDefect is added to the token's defect
1413 list. We also accept a phrase that starts with CFWS followed by a dot;
1414 this is registered as an InvalidHeaderDefect, since it is not supported by
1415 even the obsolete grammar.
1416
1417 """
1418 phrase = Phrase()
1419 try:
1420 token, value = get_word(value)
1421 phrase.append(token)
1422 except errors.HeaderParseError:
1423 phrase.defects.append(errors.InvalidHeaderDefect(
1424 "phrase does not start with word"))
1425 while value and value[0] not in PHRASE_ENDS:
1426 if value[0]=='.':
1427 phrase.append(DOT)
1428 phrase.defects.append(errors.ObsoleteHeaderDefect(
1429 "period in 'phrase'"))
1430 value = value[1:]
1431 else:
1432 try:
1433 token, value = get_word(value)
1434 except errors.HeaderParseError:
1435 if value[0] in CFWS_LEADER:
1436 token, value = get_cfws(value)
1437 phrase.defects.append(errors.ObsoleteHeaderDefect(
1438 "comment found without atom"))
1439 else:
1440 raise
1441 phrase.append(token)
1442 return phrase, value
1443
1444 def get_local_part(value):
1445 """ local-part = dot-atom / quoted-string / obs-local-part
1446
1447 """
1448 local_part = LocalPart()
1449 leader = None
1450 if value[0] in CFWS_LEADER:
1451 leader, value = get_cfws(value)
1452 if not value:
1453 raise errors.HeaderParseError(
1454 "expected local-part but found '{}'".format(value))
1455 try:
1456 token, value = get_dot_atom(value)
1457 except errors.HeaderParseError:
1458 try:
1459 token, value = get_word(value)
1460 except errors.HeaderParseError:
1461 if value[0] != '\\' and value[0] in PHRASE_ENDS:
1462 raise
1463 token = TokenList()
1464 if leader is not None:
1465 token[:0] = [leader]
1466 local_part.append(token)
1467 if value and (value[0]=='\\' or value[0] not in PHRASE_ENDS):
1468 obs_local_part, value = get_obs_local_part(str(local_part) + value)
1469 if obs_local_part.token_type == 'invalid-obs-local-part':
1470 local_part.defects.append(errors.InvalidHeaderDefect(
1471 "local-part is not dot-atom, quoted-string, or obs-local-part"))
1472 else:
1473 local_part.defects.append(errors.ObsoleteHeaderDefect(
1474 "local-part is not a dot-atom (contains CFWS)"))
1475 local_part[0] = obs_local_part
1476 try:
1477 local_part.value.encode('ascii')
1478 except UnicodeEncodeError:
1479 local_part.defects.append(errors.NonASCIILocalPartDefect(
1480             "local-part contains non-ASCII characters"))
1481 return local_part, value
1482
1483 def get_obs_local_part(value):
1484 """ obs-local-part = word *("." word)
1485 """
1486 obs_local_part = ObsLocalPart()
1487 last_non_ws_was_dot = False
1488 while value and (value[0]=='\\' or value[0] not in PHRASE_ENDS):
1489 if value[0] == '.':
1490 if last_non_ws_was_dot:
1491 obs_local_part.defects.append(errors.InvalidHeaderDefect(
1492 "invalid repeated '.'"))
1493 obs_local_part.append(DOT)
1494 last_non_ws_was_dot = True
1495 value = value[1:]
1496 continue
1497 elif value[0]=='\\':
1498 obs_local_part.append(ValueTerminal(value[0],
1499 'misplaced-special'))
1500 value = value[1:]
1501 obs_local_part.defects.append(errors.InvalidHeaderDefect(
1502 "'\\' character outside of quoted-string/ccontent"))
1503 last_non_ws_was_dot = False
1504 continue
1505 if obs_local_part and obs_local_part[-1].token_type != 'dot':
1506 obs_local_part.defects.append(errors.InvalidHeaderDefect(
1507 "missing '.' between words"))
1508 try:
1509 token, value = get_word(value)
1510 last_non_ws_was_dot = False
1511 except errors.HeaderParseError:
1512 if value[0] not in CFWS_LEADER:
1513 raise
1514 token, value = get_cfws(value)
1515 obs_local_part.append(token)
1516 if (obs_local_part[0].token_type == 'dot' or
1517 obs_local_part[0].token_type=='cfws' and
1518 obs_local_part[1].token_type=='dot'):
1519 obs_local_part.defects.append(errors.InvalidHeaderDefect(
1520 "Invalid leading '.' in local part"))
1521 if (obs_local_part[-1].token_type == 'dot' or
1522 obs_local_part[-1].token_type=='cfws' and
1523 obs_local_part[-2].token_type=='dot'):
1524 obs_local_part.defects.append(errors.InvalidHeaderDefect(
1525 "Invalid trailing '.' in local part"))
1526 if obs_local_part.defects:
1527 obs_local_part.token_type = 'invalid-obs-local-part'
1528 return obs_local_part, value
1529
1530 def get_dtext(value):
1531 r""" dtext = <printable ascii except \ [ ]> / obs-dtext
1532 obs-dtext = obs-NO-WS-CTL / quoted-pair
1533
1534 We allow anything except the excluded characters, but if we find any
1535 ASCII other than the RFC defined printable ASCII, a NonPrintableDefect is
1536 added to the token's defects list. Quoted pairs are converted to their
1537 unquoted values, so what is returned is a ptext token, in this case a
1538     ValueTerminal. If there were quoted pairs, an ObsoleteHeaderDefect is
1539 added to the returned token's defect list.
1540
1541 """
1542 ptext, value, had_qp = _get_ptext_to_endchars(value, '[]')
1543 ptext = ValueTerminal(ptext, 'ptext')
1544 if had_qp:
1545 ptext.defects.append(errors.ObsoleteHeaderDefect(
1546 "quoted printable found in domain-literal"))
1547 _validate_xtext(ptext)
1548 return ptext, value
1549
1550 def _check_for_early_dl_end(value, domain_literal):
1551 if value:
1552 return False
1553     domain_literal.defects.append(errors.InvalidHeaderDefect(
1554 "end of input inside domain-literal"))
1555 domain_literal.append(ValueTerminal(']', 'domain-literal-end'))
1556 return True
1557
1558 def get_domain_literal(value):
1559 """ domain-literal = [CFWS] "[" *([FWS] dtext) [FWS] "]" [CFWS]
1560
1561 """
1562 domain_literal = DomainLiteral()
1563 if value[0] in CFWS_LEADER:
1564 token, value = get_cfws(value)
1565 domain_literal.append(token)
1566 if not value:
1567 raise errors.HeaderParseError("expected domain-literal")
1568 if value[0] != '[':
1569 raise errors.HeaderParseError("expected '[' at start of domain-literal "
1570 "but found '{}'".format(value))
1571 value = value[1:]
1572 if _check_for_early_dl_end(value, domain_literal):
1573 return domain_literal, value
1574 domain_literal.append(ValueTerminal('[', 'domain-literal-start'))
1575 if value[0] in WSP:
1576 token, value = get_fws(value)
1577 domain_literal.append(token)
1578 token, value = get_dtext(value)
1579 domain_literal.append(token)
1580 if _check_for_early_dl_end(value, domain_literal):
1581 return domain_literal, value
1582 if value[0] in WSP:
1583 token, value = get_fws(value)
1584 domain_literal.append(token)
1585 if _check_for_early_dl_end(value, domain_literal):
1586 return domain_literal, value
1587 if value[0] != ']':
1588 raise errors.HeaderParseError("expected ']' at end of domain-literal "
1589 "but found '{}'".format(value))
1590 domain_literal.append(ValueTerminal(']', 'domain-literal-end'))
1591 value = value[1:]
1592 if value and value[0] in CFWS_LEADER:
1593 token, value = get_cfws(value)
1594 domain_literal.append(token)
1595 return domain_literal, value
1596
1597 def get_domain(value):
1598 """ domain = dot-atom / domain-literal / obs-domain
1599         obs-domain = atom *("." atom)
1600
1601 """
1602 domain = Domain()
1603 leader = None
1604 if value[0] in CFWS_LEADER:
1605 leader, value = get_cfws(value)
1606 if not value:
1607 raise errors.HeaderParseError(
1608 "expected domain but found '{}'".format(value))
1609 if value[0] == '[':
1610 token, value = get_domain_literal(value)
1611 if leader is not None:
1612 token[:0] = [leader]
1613 domain.append(token)
1614 return domain, value
1615 try:
1616 token, value = get_dot_atom(value)
1617 except errors.HeaderParseError:
1618 token, value = get_atom(value)
1619 if value and value[0] == '@':
1620 raise errors.HeaderParseError('Invalid Domain')
1621 if leader is not None:
1622 token[:0] = [leader]
1623 domain.append(token)
1624 if value and value[0] == '.':
1625 domain.defects.append(errors.ObsoleteHeaderDefect(
1626 "domain is not a dot-atom (contains CFWS)"))
1627 if domain[0].token_type == 'dot-atom':
1628 domain[:] = domain[0]
1629 while value and value[0] == '.':
1630 domain.append(DOT)
1631 token, value = get_atom(value[1:])
1632 domain.append(token)
1633 return domain, value
1634
1635 def get_addr_spec(value):
1636 """ addr-spec = local-part "@" domain
1637
1638 """
1639 addr_spec = AddrSpec()
1640 token, value = get_local_part(value)
1641 addr_spec.append(token)
1642 if not value or value[0] != '@':
1643 addr_spec.defects.append(errors.InvalidHeaderDefect(
1644 "addr-spec local part with no domain"))
1645 return addr_spec, value
1646 addr_spec.append(ValueTerminal('@', 'address-at-symbol'))
1647 token, value = get_domain(value[1:])
1648 addr_spec.append(token)
1649 return addr_spec, value
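
# Illustrative example: get_addr_spec('fred@example.com, next') returns an
# AddrSpec whose local_part is 'fred', whose domain is 'example.com', and
# whose addr_spec is 'fred@example.com', leaving ', next' unparsed.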
1650
1651 def get_obs_route(value):
1652 """ obs-route = obs-domain-list ":"
1653 obs-domain-list = *(CFWS / ",") "@" domain *("," [CFWS] ["@" domain])
1654
1655 Returns an obs-route token with the appropriate sub-tokens (that is,
1656 there is no obs-domain-list in the parse tree).
1657 """
1658 obs_route = ObsRoute()
1659 while value and (value[0]==',' or value[0] in CFWS_LEADER):
1660 if value[0] in CFWS_LEADER:
1661 token, value = get_cfws(value)
1662 obs_route.append(token)
1663 elif value[0] == ',':
1664 obs_route.append(ListSeparator)
1665 value = value[1:]
1666 if not value or value[0] != '@':
1667 raise errors.HeaderParseError(
1668 "expected obs-route domain but found '{}'".format(value))
1669 obs_route.append(RouteComponentMarker)
1670 token, value = get_domain(value[1:])
1671 obs_route.append(token)
1672 while value and value[0]==',':
1673 obs_route.append(ListSeparator)
1674 value = value[1:]
1675 if not value:
1676 break
1677 if value[0] in CFWS_LEADER:
1678 token, value = get_cfws(value)
1679 obs_route.append(token)
1680 if value[0] == '@':
1681 obs_route.append(RouteComponentMarker)
1682 token, value = get_domain(value[1:])
1683 obs_route.append(token)
1684 if not value:
1685 raise errors.HeaderParseError("end of header while parsing obs-route")
1686 if value[0] != ':':
1687 raise errors.HeaderParseError( "expected ':' marking end of "
1688 "obs-route but found '{}'".format(value))
1689 obs_route.append(ValueTerminal(':', 'end-of-obs-route-marker'))
1690 return obs_route, value[1:]
1691
1692 def get_angle_addr(value):
1693 """ angle-addr = [CFWS] "<" addr-spec ">" [CFWS] / obs-angle-addr
1694 obs-angle-addr = [CFWS] "<" obs-route addr-spec ">" [CFWS]
1695
1696 """
1697 angle_addr = AngleAddr()
1698 if value[0] in CFWS_LEADER:
1699 token, value = get_cfws(value)
1700 angle_addr.append(token)
1701 if not value or value[0] != '<':
1702 raise errors.HeaderParseError(
1703 "expected angle-addr but found '{}'".format(value))
1704 angle_addr.append(ValueTerminal('<', 'angle-addr-start'))
1705 value = value[1:]
1706 # Although it is not legal per RFC5322, SMTP uses '<>' in certain
1707 # circumstances.
1708 if value[0] == '>':
1709 angle_addr.append(ValueTerminal('>', 'angle-addr-end'))
1710 angle_addr.defects.append(errors.InvalidHeaderDefect(
1711 "null addr-spec in angle-addr"))
1712 value = value[1:]
1713 return angle_addr, value
1714 try:
1715 token, value = get_addr_spec(value)
1716 except errors.HeaderParseError:
1717 try:
1718 token, value = get_obs_route(value)
1719 angle_addr.defects.append(errors.ObsoleteHeaderDefect(
1720 "obsolete route specification in angle-addr"))
1721 except errors.HeaderParseError:
1722 raise errors.HeaderParseError(
1723 "expected addr-spec or obs-route but found '{}'".format(value))
1724 angle_addr.append(token)
1725 token, value = get_addr_spec(value)
1726 angle_addr.append(token)
1727 if value and value[0] == '>':
1728 value = value[1:]
1729 else:
1730 angle_addr.defects.append(errors.InvalidHeaderDefect(
1731 "missing trailing '>' on angle-addr"))
1732 angle_addr.append(ValueTerminal('>', 'angle-addr-end'))
1733 if value and value[0] in CFWS_LEADER:
1734 token, value = get_cfws(value)
1735 angle_addr.append(token)
1736 return angle_addr, value
1737
1738 def get_display_name(value):
1739 """ display-name = phrase
1740
1741 Because this is simply a name-rule, we don't return a display-name
1742 token containing a phrase, but rather a display-name token with
1743 the content of the phrase.
1744
1745 """
1746 display_name = DisplayName()
1747 token, value = get_phrase(value)
1748 display_name.extend(token[:])
1749 display_name.defects = token.defects[:]
1750 return display_name, value
1751
1752
1753 def get_name_addr(value):
1754 """ name-addr = [display-name] angle-addr
1755
1756 """
1757 name_addr = NameAddr()
1758 # Both the optional display name and the angle-addr can start with cfws.
1759 leader = None
1760 if value[0] in CFWS_LEADER:
1761 leader, value = get_cfws(value)
1762 if not value:
1763 raise errors.HeaderParseError(
1764 "expected name-addr but found '{}'".format(leader))
1765 if value[0] != '<':
1766 if value[0] in PHRASE_ENDS:
1767 raise errors.HeaderParseError(
1768 "expected name-addr but found '{}'".format(value))
1769 token, value = get_display_name(value)
1770 if not value:
1771 raise errors.HeaderParseError(
1772 "expected name-addr but found '{}'".format(token))
1773 if leader is not None:
1774 token[0][:0] = [leader]
1775 leader = None
1776 name_addr.append(token)
1777 token, value = get_angle_addr(value)
1778 if leader is not None:
1779 token[:0] = [leader]
1780 name_addr.append(token)
1781 return name_addr, value
1782
1783 def get_mailbox(value):
1784 """ mailbox = name-addr / addr-spec
1785
1786 """
1787 # The only way to figure out if we are dealing with a name-addr or an
1788 # addr-spec is to try parsing each one.
1789 mailbox = Mailbox()
1790 try:
1791 token, value = get_name_addr(value)
1792 except errors.HeaderParseError:
1793 try:
1794 token, value = get_addr_spec(value)
1795 except errors.HeaderParseError:
1796 raise errors.HeaderParseError(
1797 "expected mailbox but found '{}'".format(value))
1798 if any(isinstance(x, errors.InvalidHeaderDefect)
1799 for x in token.all_defects):
1800 mailbox.token_type = 'invalid-mailbox'
1801 mailbox.append(token)
1802 return mailbox, value
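
# Illustrative sketch (editor's note; the addresses are hypothetical inputs):
# both the name-addr and the addr-spec forms come back as a Mailbox token.
#
#     token, rest = get_mailbox('Fred Bloggs <fred@example.com>, next')
#     # token.token_type == 'mailbox', rest == ', next',
#     # token.display_name is roughly 'Fred Bloggs',
#     # token.addr_spec is roughly 'fred@example.com'.
#
#     token, rest = get_mailbox('fred@example.com, next')
#     # addr-spec form: same token type, no display name.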
1803
1804 def get_invalid_mailbox(value, endchars):
1805 """ Read everything up to one of the chars in endchars.
1806
1807 This is outside the formal grammar. The InvalidMailbox TokenList that is
1808 returned acts like a Mailbox, but the data attributes are None.
1809
1810 """
1811 invalid_mailbox = InvalidMailbox()
1812 while value and value[0] not in endchars:
1813 if value[0] in PHRASE_ENDS:
1814 invalid_mailbox.append(ValueTerminal(value[0],
1815 'misplaced-special'))
1816 value = value[1:]
1817 else:
1818 token, value = get_phrase(value)
1819 invalid_mailbox.append(token)
1820 return invalid_mailbox, value
1821
1822 def get_mailbox_list(value):
1823 """ mailbox-list = (mailbox *("," mailbox)) / obs-mbox-list
1824 obs-mbox-list = *([CFWS] ",") mailbox *("," [mailbox / CFWS])
1825
1826 For this routine we go outside the formal grammar in order to improve error
1827 handling. We recognize the end of the mailbox list only at the end of the
1828 value or at a ';' (the group terminator). This is so that we can turn
1829 invalid mailboxes into InvalidMailbox tokens and continue parsing any
1830 remaining valid mailboxes. We also allow all mailbox entries to be null,
1831 and this condition is handled appropriately at a higher level.
1832
1833 """
1834 mailbox_list = MailboxList()
1835 while value and value[0] != ';':
1836 try:
1837 token, value = get_mailbox(value)
1838 mailbox_list.append(token)
1839 except errors.HeaderParseError:
1840 leader = None
1841 if value[0] in CFWS_LEADER:
1842 leader, value = get_cfws(value)
1843 if not value or value[0] in ',;':
1844 mailbox_list.append(leader)
1845 mailbox_list.defects.append(errors.ObsoleteHeaderDefect(
1846 "empty element in mailbox-list"))
1847 else:
1848 token, value = get_invalid_mailbox(value, ',;')
1849 if leader is not None:
1850 token[:0] = [leader]
1851 mailbox_list.append(token)
1852 mailbox_list.defects.append(errors.InvalidHeaderDefect(
1853 "invalid mailbox in mailbox-list"))
1854 elif value[0] == ',':
1855 mailbox_list.defects.append(errors.ObsoleteHeaderDefect(
1856 "empty element in mailbox-list"))
1857 else:
1858 token, value = get_invalid_mailbox(value, ',;')
1859 if leader is not None:
1860 token[:0] = [leader]
1861 mailbox_list.append(token)
1862 mailbox_list.defects.append(errors.InvalidHeaderDefect(
1863 "invalid mailbox in mailbox-list"))
1864 if value and value[0] not in ',;':
1865 # Crap after mailbox; treat it as an invalid mailbox.
1866 # The mailbox info will still be available.
1867 mailbox = mailbox_list[-1]
1868 mailbox.token_type = 'invalid-mailbox'
1869 token, value = get_invalid_mailbox(value, ',;')
1870 mailbox.extend(token)
1871 mailbox_list.defects.append(errors.InvalidHeaderDefect(
1872 "invalid mailbox in mailbox-list"))
1873 if value and value[0] == ',':
1874 mailbox_list.append(ListSeparator)
1875 value = value[1:]
1876 return mailbox_list, value
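
# Illustrative sketch (editor's note; hypothetical input): an empty element is
# tolerated with an ObsoleteHeaderDefect, and parsing continues, roughly:
#
#     token, rest = get_mailbox_list('fred@example.com, , wilma@example.com')
#     # token.mailboxes holds the two valid mailboxes; token.all_defects
#     # includes an ObsoleteHeaderDefect ("empty element in mailbox-list").
#     # Entries that cannot be parsed at all become InvalidMailbox tokens
#     # instead of aborting the whole list.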
1877
1878
1879 def get_group_list(value):
1880 """ group-list = mailbox-list / CFWS / obs-group-list
1881 obs-group-list = 1*([CFWS] ",") [CFWS]
1882
1883 """
1884 group_list = GroupList()
1885 if not value:
1886 group_list.defects.append(errors.InvalidHeaderDefect(
1887 "end of header before group-list"))
1888 return group_list, value
1889 leader = None
1890 if value and value[0] in CFWS_LEADER:
1891 leader, value = get_cfws(value)
1892 if not value:
1893 # This should never happen in email parsing, since CFWS-only is a
1894 # legal alternative to group-list in a group, which is the only
1895 # place group-list appears.
1896 group_list.defects.append(errors.InvalidHeaderDefect(
1897 "end of header in group-list"))
1898 group_list.append(leader)
1899 return group_list, value
1900 if value[0] == ';':
1901 group_list.append(leader)
1902 return group_list, value
1903 token, value = get_mailbox_list(value)
1904 if not token.all_mailboxes:
1905 if leader is not None:
1906 group_list.append(leader)
1907 group_list.extend(token)
1908 group_list.defects.append(errors.ObsoleteHeaderDefect(
1909 "group-list with empty entries"))
1910 return group_list, value
1911 if leader is not None:
1912 token[:0] = [leader]
1913 group_list.append(token)
1914 return group_list, value
1915
1916 def get_group(value):
1917 """ group = display-name ":" [group-list] ";" [CFWS]
1918
1919 """
1920 group = Group()
1921 token, value = get_display_name(value)
1922 if not value or value[0] != ':':
1923 raise errors.HeaderParseError("expected ':' at end of group "
1924 "display name but found '{}'".format(value))
1925 group.append(token)
1926 group.append(ValueTerminal(':', 'group-display-name-terminator'))
1927 value = value[1:]
1928 if value and value[0] == ';':
1929 group.append(ValueTerminal(';', 'group-terminator'))
1930 return group, value[1:]
1931 token, value = get_group_list(value)
1932 group.append(token)
1933 if not value:
1934 group.defects.append(errors.InvalidHeaderDefect(
1935 "end of header in group"))
1936 elif value[0] != ';':
1937 raise errors.HeaderParseError(
1938 "expected ';' at end of group but found {}".format(value))
1939 group.append(ValueTerminal(';', 'group-terminator'))
1940 value = value[1:]
1941 if value and value[0] in CFWS_LEADER:
1942 token, value = get_cfws(value)
1943 group.append(token)
1944 return group, value
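
# Illustrative sketch (editor's note; hypothetical inputs):
#
#     token, rest = get_group('undisclosed-recipients:;')
#     # token.token_type == 'group'; an empty group-list is legal, so no
#     # defect is recorded and token.mailboxes is (roughly) empty.
#
#     token, rest = get_group('friends: anne@example.com, bob@example.com;')
#     # token.mailboxes holds the two member mailboxes (roughly).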
1945
1946 def get_address(value):
1947 """ address = mailbox / group
1948
1949 Note that counter-intuitively, an address can be either a single address or
1950 a list of addresses (a group). This is why the returned Address object has
1951 a 'mailboxes' attribute which treats a single address as a list of length
1952 one. When you need to differentiate between the two cases, extract the single
1953 element, which is either a mailbox or a group token.
1954
1955 """
1956 # The formal grammar isn't very helpful when parsing an address. mailbox
1957 # and group, especially when allowing for obsolete forms, start off very
1958 # similarly. It is only when you reach one of @, <, or : that you know
1959 # what you've got. So, we try each one in turn, starting with the more
1960 # likely of the two. We could perhaps make this more efficient by looking
1961 # for a phrase and then branching based on the next character, but that
1962 # would be a premature optimization.
1963 address = Address()
1964 try:
1965 token, value = get_group(value)
1966 except errors.HeaderParseError:
1967 try:
1968 token, value = get_mailbox(value)
1969 except errors.HeaderParseError:
1970 raise errors.HeaderParseError(
1971 "expected address but found '{}'".format(value))
1972 address.append(token)
1973 return address, value
1974
1975 def get_address_list(value):
1976 """ address_list = (address *("," address)) / obs-addr-list
1977 obs-addr-list = *([CFWS] ",") address *("," [address / CFWS])
1978
1979 We depart from the formal grammar here by continuing to parse until the end
1980 of the input, assuming the input to be entirely composed of an
1981 address-list. This is always true in email parsing, and allows us
1982 to skip invalid addresses to parse additional valid ones.
1983
1984 """
1985 address_list = AddressList()
1986 while value:
1987 try:
1988 token, value = get_address(value)
1989 address_list.append(token)
1990 except errors.HeaderParseError:
1991 leader = None
1992 if value[0] in CFWS_LEADER:
1993 leader, value = get_cfws(value)
1994 if not value or value[0] == ',':
1995 address_list.append(leader)
1996 address_list.defects.append(errors.ObsoleteHeaderDefect(
1997 "address-list entry with no content"))
1998 else:
1999 token, value = get_invalid_mailbox(value, ',')
2000 if leader is not None:
2001 token[:0] = [leader]
2002 address_list.append(Address([token]))
2003 address_list.defects.append(errors.InvalidHeaderDefect(
2004 "invalid address in address-list"))
2005 elif value[0] == ',':
2006 address_list.defects.append(errors.ObsoleteHeaderDefect(
2007 "empty element in address-list"))
2008 else:
2009 token, value = get_invalid_mailbox(value, ',')
2010 if leader is not None:
2011 token[:0] = [leader]
2012 address_list.append(Address([token]))
2013 address_list.defects.append(errors.InvalidHeaderDefect(
2014 "invalid address in address-list"))
2015 if value and value[0] != ',':
2016 # Crap after address; treat it as an invalid mailbox.
2017 # The mailbox info will still be available.
2018 mailbox = address_list[-1][0]
2019 mailbox.token_type = 'invalid-mailbox'
2020 token, value = get_invalid_mailbox(value, ',')
2021 mailbox.extend(token)
2022 address_list.defects.append(errors.InvalidHeaderDefect(
2023 "invalid address in address-list"))
2024 if value: # Must be a , at this point.
2025 address_list.append(ValueTerminal(',', 'list-separator'))
2026 value = value[1:]
2027 return address_list, value
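
# Illustrative sketch (editor's note; hypothetical input): a To:-style value
# mixing a bare mailbox and a group.
#
#     token, rest = get_address_list('fred@example.com, friends: anne@example.com;')
#     # token.addresses holds two Address tokens (one wrapping a mailbox, one
#     # wrapping a group); token.mailboxes flattens them into the individual
#     # mailboxes.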
2028
2029
2030 def get_no_fold_literal(value):
2031 """ no-fold-literal = "[" *dtext "]"
2032 """
2033 no_fold_literal = NoFoldLiteral()
2034 if not value:
2035 raise errors.HeaderParseError(
2036 "expected no-fold-literal but found '{}'".format(value))
2037 if value[0] != '[':
2038 raise errors.HeaderParseError(
2039 "expected '[' at the start of no-fold-literal "
2040 "but found '{}'".format(value))
2041 no_fold_literal.append(ValueTerminal('[', 'no-fold-literal-start'))
2042 value = value[1:]
2043 token, value = get_dtext(value)
2044 no_fold_literal.append(token)
2045 if not value or value[0] != ']':
2046 raise errors.HeaderParseError(
2047 "expected ']' at the end of no-fold-literal "
2048 "but found '{}'".format(value))
2049 no_fold_literal.append(ValueTerminal(']', 'no-fold-literal-end'))
2050 return no_fold_literal, value[1:]
2051
2052 def get_msg_id(value):
2053 """msg-id = [CFWS] "<" id-left '@' id-right ">" [CFWS]
2054 id-left = dot-atom-text / obs-id-left
2055 id-right = dot-atom-text / no-fold-literal / obs-id-right
2056 no-fold-literal = "[" *dtext "]"
2057 """
2058 msg_id = MsgID()
2059 if value and value[0] in CFWS_LEADER:
2060 token, value = get_cfws(value)
2061 msg_id.append(token)
2062 if not value or value[0] != '<':
2063 raise errors.HeaderParseError(
2064 "expected msg-id but found '{}'".format(value))
2065 msg_id.append(ValueTerminal('<', 'msg-id-start'))
2066 value = value[1:]
2067 # Parse id-left.
2068 try:
2069 token, value = get_dot_atom_text(value)
2070 except errors.HeaderParseError:
2071 try:
2072 # obs-id-left is the same as the local-part of an addr-spec.
2073 token, value = get_obs_local_part(value)
2074 msg_id.defects.append(errors.ObsoleteHeaderDefect(
2075 "obsolete id-left in msg-id"))
2076 except errors.HeaderParseError:
2077 raise errors.HeaderParseError(
2078 "expected dot-atom-text or obs-id-left"
2079 " but found '{}'".format(value))
2080 msg_id.append(token)
2081 if not value or value[0] != '@':
2082 msg_id.defects.append(errors.InvalidHeaderDefect(
2083 "msg-id with no id-right"))
2084 # Even though there is no id-right, if the local part
2085 # ends with `>`, let's just parse it too and return it
2086 # along with the defect.
2087 if value and value[0] == '>':
2088 msg_id.append(ValueTerminal('>', 'msg-id-end'))
2089 value = value[1:]
2090 return msg_id, value
2091 msg_id.append(ValueTerminal('@', 'address-at-symbol'))
2092 value = value[1:]
2093 # Parse id-right.
2094 try:
2095 token, value = get_dot_atom_text(value)
2096 except errors.HeaderParseError:
2097 try:
2098 token, value = get_no_fold_literal(value)
2099 except errors.HeaderParseError:
2100 try:
2101 token, value = get_domain(value)
2102 msg_id.defects.append(errors.ObsoleteHeaderDefect(
2103 "obsolete id-right in msg-id"))
2104 except errors.HeaderParseError:
2105 raise errors.HeaderParseError(
2106 "expected dot-atom-text, no-fold-literal or obs-id-right"
2107 " but found '{}'".format(value))
2108 msg_id.append(token)
2109 if value and value[0] == '>':
2110 value = value[1:]
2111 else:
2112 msg_id.defects.append(errors.InvalidHeaderDefect(
2113 "missing trailing '>' on msg-id"))
2114 msg_id.append(ValueTerminal('>', 'msg-id-end'))
2115 if value and value[0] in CFWS_LEADER:
2116 token, value = get_cfws(value)
2117 msg_id.append(token)
2118 return msg_id, value
2119
2120
2121 def parse_message_id(value):
2122 """message-id = "Message-ID:" msg-id CRLF
2123 """
2124 message_id = MessageID()
2125 try:
2126 token, value = get_msg_id(value)
2127 message_id.append(token)
2128 except errors.HeaderParseError as ex:
2129 token = get_unstructured(value)
2130 message_id = InvalidMessageID(token)
2131 message_id.defects.append(
2132 errors.InvalidHeaderDefect("Invalid msg-id: {!r}".format(ex)))
2133 else:
2134 # Value after parsing a valid msg_id should be empty (fully consumed).
2135 if value:
2136 message_id.defects.append(errors.InvalidHeaderDefect(
2137 "Unexpected {!r}".format(value)))
2138
2139 return message_id
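
# Illustrative sketch (editor's note; hypothetical inputs):
#
#     token = parse_message_id('<20240101.12345@mail.example.com>')
#     # token.token_type == 'message-id' with no defects.
#
#     token = parse_message_id('missing-angle-brackets')
#     # falls back to get_unstructured and returns an InvalidMessageID
#     # carrying an InvalidHeaderDefect.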
2140
2141 #
2142 # XXX: As I begin to add additional header parsers, I'm realizing we probably
2143 # have two levels of parser routines: the get_XXX methods that get a token in
2144 # the grammar, and parse_XXX methods that parse an entire field value. So
2145 # get_address_list above should really be a parse_ method, as probably should
2146 # be get_unstructured.
2147 #
2148
2149 def parse_mime_version(value):
2150 """ mime-version = [CFWS] 1*digit [CFWS] "." [CFWS] 1*digit [CFWS]
2151
2152 """
2153 # The [CFWS] is implicit in the RFC 2045 BNF.
2154 # XXX: This routine is a bit verbose, should factor out a get_int method.
2155 mime_version = MIMEVersion()
2156 if not value:
2157 mime_version.defects.append(errors.HeaderMissingRequiredValue(
2158 "Missing MIME version number (eg: 1.0)"))
2159 return mime_version
2160 if value[0] in CFWS_LEADER:
2161 token, value = get_cfws(value)
2162 mime_version.append(token)
2163 if not value:
2164 mime_version.defects.append(errors.HeaderMissingRequiredValue(
2165 "Expected MIME version number but found only CFWS"))
2166 digits = ''
2167 while value and value[0] != '.' and value[0] not in CFWS_LEADER:
2168 digits += value[0]
2169 value = value[1:]
2170 if not digits.isdigit():
2171 mime_version.defects.append(errors.InvalidHeaderDefect(
2172 "Expected MIME major version number but found {!r}".format(digits)))
2173 mime_version.append(ValueTerminal(digits, 'xtext'))
2174 else:
2175 mime_version.major = int(digits)
2176 mime_version.append(ValueTerminal(digits, 'digits'))
2177 if value and value[0] in CFWS_LEADER:
2178 token, value = get_cfws(value)
2179 mime_version.append(token)
2180 if not value or value[0] != '.':
2181 if mime_version.major is not None:
2182 mime_version.defects.append(errors.InvalidHeaderDefect(
2183 "Incomplete MIME version; found only major number"))
2184 if value:
2185 mime_version.append(ValueTerminal(value, 'xtext'))
2186 return mime_version
2187 mime_version.append(ValueTerminal('.', 'version-separator'))
2188 value = value[1:]
2189 if value and value[0] in CFWS_LEADER:
2190 token, value = get_cfws(value)
2191 mime_version.append(token)
2192 if not value:
2193 if mime_version.major is not None:
2194 mime_version.defects.append(errors.InvalidHeaderDefect(
2195 "Incomplete MIME version; found only major number"))
2196 return mime_version
2197 digits = ''
2198 while value and value[0] not in CFWS_LEADER:
2199 digits += value[0]
2200 value = value[1:]
2201 if not digits.isdigit():
2202 mime_version.defects.append(errors.InvalidHeaderDefect(
2203 "Expected MIME minor version number but found {!r}".format(digits)))
2204 mime_version.append(ValueTerminal(digits, 'xtext'))
2205 else:
2206 mime_version.minor = int(digits)
2207 mime_version.append(ValueTerminal(digits, 'digits'))
2208 if value and value[0] in CFWS_LEADER:
2209 token, value = get_cfws(value)
2210 mime_version.append(token)
2211 if value:
2212 mime_version.defects.append(errors.InvalidHeaderDefect(
2213 "Excess non-CFWS text after MIME version"))
2214 mime_version.append(ValueTerminal(value, 'xtext'))
2215 return mime_version
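
# Illustrative sketch (editor's note; hypothetical inputs):
#
#     token = parse_mime_version('1.0 (produced by MegaMailer)')
#     # token.major == 1, token.minor == 0; the trailing comment is CFWS.
#
#     token = parse_mime_version('1.')
#     # token.major == 1, token.minor stays None, and an InvalidHeaderDefect
#     # ("Incomplete MIME version...") is recorded.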
2216
2217 def get_invalid_parameter(value):
2218 """ Read everything up to the next ';'.
2219
2220 This is outside the formal grammar. The InvalidParameter TokenList that is
2221 returned acts like a Parameter, but the data attributes are None.
2222
2223 """
2224 invalid_parameter = InvalidParameter()
2225 while value and value[0] != ';':
2226 if value[0] in PHRASE_ENDS:
2227 invalid_parameter.append(ValueTerminal(value[0],
2228 'misplaced-special'))
2229 value = value[1:]
2230 else:
2231 token, value = get_phrase(value)
2232 invalid_parameter.append(token)
2233 return invalid_parameter, value
2234
2235 def get_ttext(value):
2236 """ttext = <matches _ttext_matcher>
2237
2238 We allow any non-TOKEN_ENDS in ttext, but add defects to the token's
2239 defects list if we find non-ttext characters. We also register defects for
2240 *any* non-printables even though the RFC doesn't exclude all of them,
2241 because we follow the spirit of RFC 5322.
2242
2243 """
2244 m = _non_token_end_matcher(value)
2245 if not m:
2246 raise errors.HeaderParseError(
2247 "expected ttext but found '{}'".format(value))
2248 ttext = m.group()
2249 value = value[len(ttext):]
2250 ttext = ValueTerminal(ttext, 'ttext')
2251 _validate_xtext(ttext)
2252 return ttext, value
2253
2254 def get_token(value):
2255 """token = [CFWS] 1*ttext [CFWS]
2256
2257 The RFC equivalent of ttext is any US-ASCII chars except space, ctls, or
2258 tspecials. We also exclude tabs even though the RFC doesn't.
2259
2260 The RFC implies the CFWS but is not explicit about it in the BNF.
2261
2262 """
2263 mtoken = Token()
2264 if value and value[0] in CFWS_LEADER:
2265 token, value = get_cfws(value)
2266 mtoken.append(token)
2267 if value and value[0] in TOKEN_ENDS:
2268 raise errors.HeaderParseError(
2269 "expected token but found '{}'".format(value))
2270 token, value = get_ttext(value)
2271 mtoken.append(token)
2272 if value and value[0] in CFWS_LEADER:
2273 token, value = get_cfws(value)
2274 mtoken.append(token)
2275 return mtoken, value
2276
2277 def get_attrtext(value):
2278 """attrtext = 1*(any non-ATTRIBUTE_ENDS character)
2279
2280 We allow any non-ATTRIBUTE_ENDS in attrtext, but add defects to the
2281 token's defects list if we find non-attrtext characters. We also register
2282 defects for *any* non-printables even though the RFC doesn't exclude all of
2283 them, because we follow the spirit of RFC 5322.
2284
2285 """
2286 m = _non_attribute_end_matcher(value)
2287 if not m:
2288 raise errors.HeaderParseError(
2289 "expected attrtext but found {!r}".format(value))
2290 attrtext = m.group()
2291 value = value[len(attrtext):]
2292 attrtext = ValueTerminal(attrtext, 'attrtext')
2293 _validate_xtext(attrtext)
2294 return attrtext, value
2295
2296 def get_attribute(value):
2297 """ [CFWS] 1*attrtext [CFWS]
2298
2299 This version of the BNF makes the CFWS explicit, and as usual we use a
2300 value terminal for the actual run of characters. The RFC equivalent of
2301 attrtext is the token characters, with the subtraction of '*', "'", and '%'.
2302 We include tab in the excluded set just as we do for token.
2303
2304 """
2305 attribute = Attribute()
2306 if value and value[0] in CFWS_LEADER:
2307 token, value = get_cfws(value)
2308 attribute.append(token)
2309 if value and value[0] in ATTRIBUTE_ENDS:
2310 raise errors.HeaderParseError(
2311 "expected token but found '{}'".format(value))
2312 token, value = get_attrtext(value)
2313 attribute.append(token)
2314 if value and value[0] in CFWS_LEADER:
2315 token, value = get_cfws(value)
2316 attribute.append(token)
2317 return attribute, value
2318
2319 def get_extended_attrtext(value):
2320 """attrtext = 1*(any non-ATTRIBUTE_ENDS character plus '%')
2321
2322 This is a special parsing routine so that we get a value that
2323 includes % escapes as a single string (which we decode as a single
2324 string later).
2325
2326 """
2327 m = _non_extended_attribute_end_matcher(value)
2328 if not m:
2329 raise errors.HeaderParseError(
2330 "expected extended attrtext but found {!r}".format(value))
2331 attrtext = m.group()
2332 value = value[len(attrtext):]
2333 attrtext = ValueTerminal(attrtext, 'extended-attrtext')
2334 _validate_xtext(attrtext)
2335 return attrtext, value
2336
2337 def get_extended_attribute(value):
2338 """ [CFWS] 1*extended_attrtext [CFWS]
2339
2340 This is like the non-extended version except we allow % characters, so that
2341 we can pick up an encoded value as a single string.
2342
2343 """
2344 # XXX: should we have an ExtendedAttribute TokenList?
2345 attribute = Attribute()
2346 if value and value[0] in CFWS_LEADER:
2347 token, value = get_cfws(value)
2348 attribute.append(token)
2349 if value and value[0] in EXTENDED_ATTRIBUTE_ENDS:
2350 raise errors.HeaderParseError(
2351 "expected token but found '{}'".format(value))
2352 token, value = get_extended_attrtext(value)
2353 attribute.append(token)
2354 if value and value[0] in CFWS_LEADER:
2355 token, value = get_cfws(value)
2356 attribute.append(token)
2357 return attribute, value
2358
2359 def get_section(value):
2360 """ '*' digits
2361
2362 The formal BNF is more complicated because leading 0s are not allowed. We
2363 check for that and add a defect. We also assume no CFWS is allowed between
2364 the '*' and the digits, though the RFC is not crystal clear on that.
2365 The caller should already have dealt with leading CFWS.
2366
2367 """
2368 section = Section()
2369 if not value or value[0] != '*':
2370 raise errors.HeaderParseError("Expected section but found {}".format(
2371 value))
2372 section.append(ValueTerminal('*', 'section-marker'))
2373 value = value[1:]
2374 if not value or not value[0].isdigit():
2375 raise errors.HeaderParseError("Expected section number but "
2376 "found {}".format(value))
2377 digits = ''
2378 while value and value[0].isdigit():
2379 digits += value[0]
2380 value = value[1:]
2381 if digits[0] == '0' and digits != '0':
2382 section.defects.append(errors.InvalidHeaderDefect(
2383 "section number has an invalid leading 0"))
2384 section.number = int(digits)
2385 section.append(ValueTerminal(digits, 'digits'))
2386 return section, value
2387
2388
2389 def get_value(value):
2390 """ quoted-string / attribute
2391
2392 """
2393 v = Value()
2394 if not value:
2395 raise errors.HeaderParseError("Expected value but found end of string")
2396 leader = None
2397 if value[0] in CFWS_LEADER:
2398 leader, value = get_cfws(value)
2399 if not value:
2400 raise errors.HeaderParseError("Expected value but found "
2401 "only {}".format(leader))
2402 if value[0] == '"':
2403 token, value = get_quoted_string(value)
2404 else:
2405 token, value = get_extended_attribute(value)
2406 if leader is not None:
2407 token[:0] = [leader]
2408 v.append(token)
2409 return v, value
2410
2411 def get_parameter(value):
2412 """ attribute [section] ["*"] [CFWS] "=" value
2413
2414 The CFWS is implied by the RFC but not made explicit in the BNF. This
2415 simplified form of the BNF from the RFC is made to conform with the RFC BNF
2416 through some extra checks. We do it this way because it makes both error
2417 recovery and working with the resulting parse tree easier.
2418 """
2419 # It is possible CFWS would also be implicitly allowed between the section
2420 # and the 'extended-attribute' marker (the '*'), but we've never seen that
2421 # in the wild and we will therefore ignore the possibility.
2422 param = Parameter()
2423 token, value = get_attribute(value)
2424 param.append(token)
2425 if not value or value[0] == ';':
2426 param.defects.append(errors.InvalidHeaderDefect("Parameter contains "
2427 "name ({}) but no value".format(token)))
2428 return param, value
2429 if value[0] == '*':
2430 try:
2431 token, value = get_section(value)
2432 param.sectioned = True
2433 param.append(token)
2434 except errors.HeaderParseError:
2435 pass
2436 if not value:
2437 raise errors.HeaderParseError("Incomplete parameter")
2438 if value[0] == '*':
2439 param.append(ValueTerminal('*', 'extended-parameter-marker'))
2440 value = value[1:]
2441 param.extended = True
2442 if value[0] != '=':
2443 raise errors.HeaderParseError("Parameter not followed by '='")
2444 param.append(ValueTerminal('=', 'parameter-separator'))
2445 value = value[1:]
2446 if value and value[0] in CFWS_LEADER:
2447 token, value = get_cfws(value)
2448 param.append(token)
2449 remainder = None
2450 appendto = param
2451 if param.extended and value and value[0] == '"':
2452 # Now for some serious hackery to handle the common invalid case of
2453 # double quotes around an extended value. We also accept (with defect)
2454 # a value marked as encoded that isn't really.
2455 qstring, remainder = get_quoted_string(value)
2456 inner_value = qstring.stripped_value
2457 semi_valid = False
2458 if param.section_number == 0:
2459 if inner_value and inner_value[0] == "'":
2460 semi_valid = True
2461 else:
2462 token, rest = get_attrtext(inner_value)
2463 if rest and rest[0] == "'":
2464 semi_valid = True
2465 else:
2466 try:
2467 token, rest = get_extended_attrtext(inner_value)
2468 except errors.HeaderParseError:
2469 pass
2470 else:
2471 if not rest:
2472 semi_valid = True
2473 if semi_valid:
2474 param.defects.append(errors.InvalidHeaderDefect(
2475 "Quoted string value for extended parameter is invalid"))
2476 param.append(qstring)
2477 for t in qstring:
2478 if t.token_type == 'bare-quoted-string':
2479 t[:] = []
2480 appendto = t
2481 break
2482 value = inner_value
2483 else:
2484 remainder = None
2485 param.defects.append(errors.InvalidHeaderDefect(
2486 "Parameter marked as extended but appears to have a "
2487 "quoted string value that is non-encoded"))
2488 if value and value[0] == "'":
2489 token = None
2490 else:
2491 token, value = get_value(value)
2492 if not param.extended or param.section_number > 0:
2493 if not value or value[0] != "'":
2494 appendto.append(token)
2495 if remainder is not None:
2496 assert not value, value
2497 value = remainder
2498 return param, value
2499 param.defects.append(errors.InvalidHeaderDefect(
2500 "Apparent initial-extended-value but attribute "
2501 "was not marked as extended or was not initial section"))
2502 if not value:
2503 # Assume the charset/lang is missing and the token is the value.
2504 param.defects.append(errors.InvalidHeaderDefect(
2505 "Missing required charset/lang delimiters"))
2506 appendto.append(token)
2507 if remainder is None:
2508 return param, value
2509 else:
2510 if token is not None:
2511 for t in token:
2512 if t.token_type == 'extended-attrtext':
2513 break
2514 t.token_type = 'attrtext'
2515 appendto.append(t)
2516 param.charset = t.value
2517 if value[0] != "'":
2518 raise errors.HeaderParseError("Expected RFC2231 char/lang encoding "
2519 "delimiter, but found {!r}".format(value))
2520 appendto.append(ValueTerminal("'", 'RFC2231-delimiter'))
2521 value = value[1:]
2522 if value and value[0] != "'":
2523 token, value = get_attrtext(value)
2524 appendto.append(token)
2525 param.lang = token.value
2526 if not value or value[0] != "'":
2527 raise errors.HeaderParseError("Expected RFC2231 char/lang encoding "
2528 "delimiter, but found {}".format(value))
2529 appendto.append(ValueTerminal("'", 'RFC2231-delimiter'))
2530 value = value[1:]
2531 if remainder is not None:
2532 # Treat the rest of value as bare quoted string content.
2533 v = Value()
2534 while value:
2535 if value[0] in WSP:
2536 token, value = get_fws(value)
2537 elif value[0] == '"':
2538 token = ValueTerminal('"', 'DQUOTE')
2539 value = value[1:]
2540 else:
2541 token, value = get_qcontent(value)
2542 v.append(token)
2543 token = v
2544 else:
2545 token, value = get_value(value)
2546 appendto.append(token)
2547 if remainder is not None:
2548 assert not value, value
2549 value = remainder
2550 return param, value
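
# Illustrative sketch (editor's note; hypothetical input): an RFC 2231
# extended (charset/language tagged, percent-encoded) parameter.
#
#     token, rest = get_parameter("title*=us-ascii'en'This%20is%20a%20title")
#     # token.sectioned is False, token.extended is True,
#     # token.charset == 'us-ascii', token.lang == 'en'; the percent-encoded
#     # value is kept as a single extended-attrtext run and is decoded later
#     # (see MimeParameters.params).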
2551
2552 def parse_mime_parameters(value):
2553 """ parameter *( ";" parameter )
2554
2555 That BNF is meant to indicate this routine should only be called after
2556 finding and handling the leading ';'. There is no corresponding rule in
2557 the formal RFC grammar, but it is more convenient for us for the set of
2558 parameters to be treated as its own TokenList.
2559
2560 This is a 'parse' routine because it consumes the remaining value, but it
2561 would never be called to parse a full header. Instead it is called to
2562 parse everything after the non-parameter value of a specific MIME header.
2563
2564 """
2565 mime_parameters = MimeParameters()
2566 while value:
2567 try:
2568 token, value = get_parameter(value)
2569 mime_parameters.append(token)
2570 except errors.HeaderParseError:
2571 leader = None
2572 if value[0] in CFWS_LEADER:
2573 leader, value = get_cfws(value)
2574 if not value:
2575 mime_parameters.append(leader)
2576 return mime_parameters
2577 if value[0] == ';':
2578 if leader is not None:
2579 mime_parameters.append(leader)
2580 mime_parameters.defects.append(errors.InvalidHeaderDefect(
2581 "parameter entry with no content"))
2582 else:
2583 token, value = get_invalid_parameter(value)
2584 if leader:
2585 token[:0] = [leader]
2586 mime_parameters.append(token)
2587 mime_parameters.defects.append(errors.InvalidHeaderDefect(
2588 "invalid parameter {!r}".format(token)))
2589 if value and value[0] != ';':
2590 # Junk after the otherwise valid parameter. Mark it as
2591 # invalid, but it will have a value.
2592 param = mime_parameters[-1]
2593 param.token_type = 'invalid-parameter'
2594 token, value = get_invalid_parameter(value)
2595 param.extend(token)
2596 mime_parameters.defects.append(errors.InvalidHeaderDefect(
2597 "parameter with invalid trailing text {!r}".format(token)))
2598 if value:
2599 # Must be a ';' at this point.
2600 mime_parameters.append(ValueTerminal(';', 'parameter-separator'))
2601 value = value[1:]
2602 return mime_parameters
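
# Illustrative sketch (editor's note; hypothetical input): the value handed to
# this routine is everything after the first ';' of a MIME header.
#
#     token = parse_mime_parameters(' charset="utf-8"; format=flowed')
#     # token.token_type == 'mime-parameters'; token.params yields roughly
#     # [('charset', 'utf-8'), ('format', 'flowed')], with RFC 2231 sections
#     # re-joined and percent escapes decoded when present.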
2603
2604 def _find_mime_parameters(tokenlist, value):
2605 """Do our best to find the parameters in an invalid MIME header
2606
2607 """
2608 while value and value[0] != ';':
2609 if value[0] in PHRASE_ENDS:
2610 tokenlist.append(ValueTerminal(value[0], 'misplaced-special'))
2611 value = value[1:]
2612 else:
2613 token, value = get_phrase(value)
2614 tokenlist.append(token)
2615 if not value:
2616 return
2617 tokenlist.append(ValueTerminal(';', 'parameter-separator'))
2618 tokenlist.append(parse_mime_parameters(value[1:]))
2619
2620 def parse_content_type_header(value):
2621 """ maintype "/" subtype *( ";" parameter )
2622
2623 The maintype and subtype are tokens. Theoretically they could
2624 be checked against the official IANA list + x-token, but we
2625 don't do that.
2626 """
2627 ctype = ContentType()
2628 if not value:
2629 ctype.defects.append(errors.HeaderMissingRequiredValue(
2630 "Missing content type specification"))
2631 return ctype
2632 try:
2633 token, value = get_token(value)
2634 except errors.HeaderParseError:
2635 ctype.defects.append(errors.InvalidHeaderDefect(
2636 "Expected content maintype but found {!r}".format(value)))
2637 _find_mime_parameters(ctype, value)
2638 return ctype
2639 ctype.append(token)
2640 # XXX: If we really want to follow the formal grammar we should make
2641 # maintype and subtype specialized TokenLists here. Probably not worth it.
2642 if not value or value[0] != '/':
2643 ctype.defects.append(errors.InvalidHeaderDefect(
2644 "Invalid content type"))
2645 if value:
2646 _find_mime_parameters(ctype, value)
2647 return ctype
2648 ctype.maintype = token.value.strip().lower()
2649 ctype.append(ValueTerminal('/', 'content-type-separator'))
2650 value = value[1:]
2651 try:
2652 token, value = get_token(value)
2653 except errors.HeaderParseError:
2654 ctype.defects.append(errors.InvalidHeaderDefect(
2655 "Expected content subtype but found {!r}".format(value)))
2656 _find_mime_parameters(ctype, value)
2657 return ctype
2658 ctype.append(token)
2659 ctype.subtype = token.value.strip().lower()
2660 if not value:
2661 return ctype
2662 if value[0] != ';':
2663 ctype.defects.append(errors.InvalidHeaderDefect(
2664 "Only parameters are valid after content type, but "
2665 "found {!r}".format(value)))
2666 # The RFC requires that a syntactically invalid content-type be treated
2667 # as text/plain. Perhaps we should postel this, but we should probably
2668 # only do that if we were checking the subtype value against IANA.
2669 del ctype.maintype, ctype.subtype
2670 _find_mime_parameters(ctype, value)
2671 return ctype
2672 ctype.append(ValueTerminal(';', 'parameter-separator'))
2673 ctype.append(parse_mime_parameters(value[1:]))
2674 return ctype
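
# Illustrative sketch (editor's note; hypothetical inputs):
#
#     token = parse_content_type_header('Text/HTML; charset="utf-8"')
#     # token.maintype == 'text', token.subtype == 'html' (both lowercased);
#     # everything after the ';' goes through parse_mime_parameters.
#
#     token = parse_content_type_header('gibberish')
#     # no '/', so an InvalidHeaderDefect is recorded and maintype/subtype
#     # remain unset.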
2675
2676 def parse_content_disposition_header(value):
2677 """ disposition-type *( ";" parameter )
2678
2679 """
2680 disp_header = ContentDisposition()
2681 if not value:
2682 disp_header.defects.append(errors.HeaderMissingRequiredValue(
2683 "Missing content disposition"))
2684 return disp_header
2685 try:
2686 token, value = get_token(value)
2687 except errors.HeaderParseError:
2688 disp_header.defects.append(errors.InvalidHeaderDefect(
2689 "Expected content disposition but found {!r}".format(value)))
2690 _find_mime_parameters(disp_header, value)
2691 return disp_header
2692 disp_header.append(token)
2693 disp_header.content_disposition = token.value.strip().lower()
2694 if not value:
2695 return disp_header
2696 if value[0] != ';':
2697 disp_header.defects.append(errors.InvalidHeaderDefect(
2698 "Only parameters are valid after content disposition, but "
2699 "found {!r}".format(value)))
2700 _find_mime_parameters(disp_header, value)
2701 return disp_header
2702 disp_header.append(ValueTerminal(';', 'parameter-separator'))
2703 disp_header.append(parse_mime_parameters(value[1:]))
2704 return disp_header
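
# Illustrative sketch (editor's note; hypothetical input):
#
#     token = parse_content_disposition_header('attachment; filename="spam.png"')
#     # token.content_disposition == 'attachment'; the filename parameter is
#     # handled by parse_mime_parameters.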
2705
2706 def parse_content_transfer_encoding_header(value):
2707 """ mechanism
2708
2709 """
2710 # We should probably validate the values, since the list is fixed.
2711 cte_header = ContentTransferEncoding()
2712 if not value:
2713 cte_header.defects.append(errors.HeaderMissingRequiredValue(
2714 "Missing content transfer encoding"))
2715 return cte_header
2716 try:
2717 token, value = get_token(value)
2718 except errors.HeaderParseError:
2719 cte_header.defects.append(errors.InvalidHeaderDefect(
2720 "Expected content transfer encoding but found {!r}".format(value)))
2721 else:
2722 cte_header.append(token)
2723 cte_header.cte = token.value.strip().lower()
2724 if not value:
2725 return cte_header
2726 while value:
2727 cte_header.defects.append(errors.InvalidHeaderDefect(
2728 "Extra text after content transfer encoding"))
2729 if value[0] in PHRASE_ENDS:
2730 cte_header.append(ValueTerminal(value[0], 'misplaced-special'))
2731 value = value[1:]
2732 else:
2733 token, value = get_phrase(value)
2734 cte_header.append(token)
2735 return cte_header
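
# Illustrative sketch (editor's note; hypothetical inputs):
#
#     token = parse_content_transfer_encoding_header('Base64')
#     # token.cte == 'base64'
#
#     token = parse_content_transfer_encoding_header('base64 extra junk')
#     # token.cte == 'base64', plus InvalidHeaderDefect(s) for the extra text.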
2736
2737
2738 #
2739 # Header folding
2740 #
2741 # Header folding is complex, with lots of rules and corner cases. The
2742 # following code does its best to obey the rules and handle the corner
2743 # cases, but you can be sure there are a few bugs :)
2744 #
2745 # This folder generally canonicalizes as it goes, preferring the stringified
2746 # version of each token. The tokens contain information that supports the
2747 # folder, including which tokens can be encoded in which ways.
2748 #
2749 # Folded text is accumulated in a simple list of strings ('lines'), each
2750 # one of which should be less than policy.max_line_length ('maxlen').
2751 #
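# Editor's note (illustrative, not part of the original module): this folder is
# normally reached through the header objects in email.headerregistry, whose
# fold() methods build a parse tree and call _refold_parse_tree with the active
# policy. With the default max_line_length of 78, an over-long value is split
# at folding whitespace where the tokens allow it, and runs needing encoding
# are emitted as RFC 2047 encoded words (see _fold_as_ew below) or, for MIME
# parameters, as RFC 2231 sections (see _fold_mime_parameters).
#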
2752
2753 def _steal_trailing_WSP_if_exists(lines):
2754 wsp = ''
2755 if lines and lines[-1] and lines[-1][-1] in WSP:
2756 wsp = lines[-1][-1]
2757 lines[-1] = lines[-1][:-1]
2758 return wsp
2759
2760 def _refold_parse_tree(parse_tree, *, policy):
2761 """Return string of contents of parse_tree folded according to RFC rules.
2762
2763 """
2764 # max_line_length 0/None means no limit, i.e. infinitely long.
2765 maxlen = policy.max_line_length or sys.maxsize
2766 encoding = 'utf-8' if policy.utf8 else 'us-ascii'
2767 lines = ['']
2768 last_ew = None
2769 wrap_as_ew_blocked = 0
2770 want_encoding = False
2771 end_ew_not_allowed = Terminal('', 'wrap_as_ew_blocked')
2772 parts = list(parse_tree)
2773 while parts:
2774 part = parts.pop(0)
2775 if part is end_ew_not_allowed:
2776 wrap_as_ew_blocked -= 1
2777 continue
2778 tstr = str(part)
2779 if part.token_type == 'ptext' and set(tstr) & SPECIALS:
2780 # Encode if tstr contains special characters.
2781 want_encoding = True
2782 try:
2783 tstr.encode(encoding)
2784 charset = encoding
2785 except UnicodeEncodeError:
2786 if any(isinstance(x, errors.UndecodableBytesDefect)
2787 for x in part.all_defects):
2788 charset = 'unknown-8bit'
2789 else:
2790 # If policy.utf8 is false this should really be taken from a
2791 # 'charset' property on the policy.
2792 charset = 'utf-8'
2793 want_encoding = True
2794 if part.token_type == 'mime-parameters':
2795 # Mime parameter folding (using RFC2231) is extra special.
2796 _fold_mime_parameters(part, lines, maxlen, encoding)
2797 continue
2798 if want_encoding and not wrap_as_ew_blocked:
2799 if not part.as_ew_allowed:
2800 want_encoding = False
2801 last_ew = None
2802 if part.syntactic_break:
2803 encoded_part = part.fold(policy=policy)[:-len(policy.linesep)]
2804 if policy.linesep not in encoded_part:
2805 # It fits on a single line
2806 if len(encoded_part) > maxlen - len(lines[-1]):
2807 # But not on this one, so start a new one.
2808 newline = _steal_trailing_WSP_if_exists(lines)
2809 # XXX what if encoded_part has no leading FWS?
2810 lines.append(newline)
2811 lines[-1] += encoded_part
2812 continue
2813 # Either this is not a major syntactic break, so we don't
2814 # want it on a line by itself even if it fits, or it
2815 # doesn't fit on a line by itself. Either way, fall through
2816 # to unpacking the subparts and wrapping them.
2817 if not hasattr(part, 'encode'):
2818 # It's not a Terminal, do each piece individually.
2819 parts = list(part) + parts
2820 else:
2821 # It's a terminal, wrap it as an encoded word, possibly
2822 # combining it with previously encoded words if allowed.
2823 last_ew = _fold_as_ew(tstr, lines, maxlen, last_ew,
2824 part.ew_combine_allowed, charset)
2825 want_encoding = False
2826 continue
2827 if len(tstr) <= maxlen - len(lines[-1]):
2828 lines[-1] += tstr
2829 continue
2830 # This part is too long to fit. The RFC wants us to break at
2831 # "major syntactic breaks", so if this is one, check whether it
2832 # will fit on the next line by itself.
2833 if (part.syntactic_break and
2834 len(tstr) + 1 <= maxlen):
2835 newline = _steal_trailing_WSP_if_exists(lines)
2836 if newline or part.startswith_fws():
2837 lines.append(newline + tstr)
2838 last_ew = None
2839 continue
2840 if not hasattr(part, 'encode'):
2841 # It's not a terminal, try folding the subparts.
2842 newparts = list(part)
2843 if not part.as_ew_allowed:
2844 wrap_as_ew_blocked += 1
2845 newparts.append(end_ew_not_allowed)
2846 parts = newparts + parts
2847 continue
2848 if part.as_ew_allowed and not wrap_as_ew_blocked:
2849 # It doesn't need CTE encoding, but encode it anyway so we can
2850 # wrap it.
2851 parts.insert(0, part)
2852 want_encoding = True
2853 continue
2854 # We can't figure out how to wrap it, so give up.
2855 newline = _steal_trailing_WSP_if_exists(lines)
2856 if newline or part.startswith_fws():
2857 lines.append(newline + tstr)
2858 else:
2859 # We can't fold it onto the next line either...
2860 lines[-1] += tstr
2861 return policy.linesep.join(lines) + policy.linesep
2862
2863 def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset):
2864 """Fold string to_encode into lines as encoded word, combining if allowed.
2865 Return the new value for last_ew, or None if ew_combine_allowed is False.
2866
2867 If there is already an encoded word in the last line of lines (indicated by
2868 a non-None value for last_ew) and ew_combine_allowed is true, decode the
2869 existing ew, combine it with to_encode, and re-encode. Otherwise, encode
2870 to_encode. In either case, split to_encode as necessary so that the
2871 encoded segments fit within maxlen.
2872
2873 """
2874 if last_ew is not None and ew_combine_allowed:
2875 to_encode = str(
2876 get_unstructured(lines[-1][last_ew:] + to_encode))
2877 lines[-1] = lines[-1][:last_ew]
2878 if to_encode[0] in WSP:
2879 # We're joining this to non-encoded text, so don't encode
2880 # the leading blank.
2881 leading_wsp = to_encode[0]
2882 to_encode = to_encode[1:]
2883 if (len(lines[-1]) == maxlen):
2884 lines.append(_steal_trailing_WSP_if_exists(lines))
2885 lines[-1] += leading_wsp
2886 trailing_wsp = ''
2887 if to_encode[-1] in WSP:
2888 # Likewise for the trailing space.
2889 trailing_wsp = to_encode[-1]
2890 to_encode = to_encode[:-1]
2891 new_last_ew = len(lines[-1]) if last_ew is None else last_ew
2892
2893 encode_as = 'utf-8' if charset == 'us-ascii' else charset
2894
2895 # The RFC2047 chrome takes up 7 characters plus the length
2896 # of the charset name.
2897 chrome_len = len(encode_as) + 7
2898
2899 if (chrome_len + 1) >= maxlen:
2900 raise errors.HeaderParseError(
2901 "max_line_length is too small to fit an encoded word")
2902
2903 while to_encode:
2904 remaining_space = maxlen - len(lines[-1])
2905 text_space = remaining_space - chrome_len
2906 if text_space <= 0:
2907 lines.append(' ')
2908 continue
2909
2910 to_encode_word = to_encode[:text_space]
2911 encoded_word = _ew.encode(to_encode_word, charset=encode_as)
2912 excess = len(encoded_word) - remaining_space
2913 while excess > 0:
2914 # Since the chunk to encode is guaranteed to fit into less than 100 characters,
2915 # shrinking it by one at a time shouldn't take long.
2916 to_encode_word = to_encode_word[:-1]
2917 encoded_word = _ew.encode(to_encode_word, charset=encode_as)
2918 excess = len(encoded_word) - remaining_space
2919 lines[-1] += encoded_word
2920 to_encode = to_encode[len(to_encode_word):]
2921
2922 if to_encode:
2923 lines.append(' ')
2924 new_last_ew = len(lines[-1])
2925 lines[-1] += trailing_wsp
2926 return new_last_ew if ew_combine_allowed else None
2927
2928 def _fold_mime_parameters(part, lines, maxlen, encoding):
2929 """Fold TokenList 'part' into the 'lines' list as mime parameters.
2930
2931 Using the decoded list of parameters and values, format them according to
2932 the RFC rules, including using RFC2231 encoding if the value cannot be
2933 expressed in 'encoding' and/or the parameter+value is too long to fit
2934 within 'maxlen'.
2935
2936 """
2937 # Special case for RFC2231 encoding: start from decoded values and use
2938 # RFC2231 encoding iff needed.
2939 #
2940 # Note that the 1 and 2s being added to the length calculations are
2941 # accounting for the possibly-needed spaces and semicolons we'll be adding.
2942 #
2943 for name, value in part.params:
2944 # XXX What if this ';' puts us over maxlen the first time through the
2945 # loop? We should split the header value onto a newline in that case,
2946 # but to do that we need to recognize the need earlier or reparse the
2947 # header, so I'm going to ignore that bug for now. It'll only put us
2948 # one character over.
2949 if not lines[-1].rstrip().endswith(';'):
2950 lines[-1] += ';'
2951 charset = encoding
2952 error_handler = 'strict'
2953 try:
2954 value.encode(encoding)
2955 encoding_required = False
2956 except UnicodeEncodeError:
2957 encoding_required = True
2958 if utils._has_surrogates(value):
2959 charset = 'unknown-8bit'
2960 error_handler = 'surrogateescape'
2961 else:
2962 charset = 'utf-8'
2963 if encoding_required:
2964 encoded_value = urllib.parse.quote(
2965 value, safe='', errors=error_handler)
2966 tstr = "{}*={}''{}".format(name, charset, encoded_value)
2967 else:
2968 tstr = '{}={}'.format(name, quote_string(value))
2969 if len(lines[-1]) + len(tstr) + 1 < maxlen:
2970 lines[-1] = lines[-1] + ' ' + tstr
2971 continue
2972 elif len(tstr) + 2 <= maxlen:
2973 lines.append(' ' + tstr)
2974 continue
2975 # We need multiple sections. We are allowed to mix encoded and
2976 # non-encoded sections, but we aren't going to. We'll encode them all.
2977 section = 0
2978 extra_chrome = charset + "''"
2979 while value:
2980 chrome_len = len(name) + len(str(section)) + 3 + len(extra_chrome)
2981 if maxlen <= chrome_len + 3:
2982 # We need room for the leading blank, the trailing semicolon,
2983 # and at least one character of the value. If we don't
2984 # have that, we'd be stuck, so in that case fall back to
2985 # the RFC standard width.
2986 maxlen = 78
2987 splitpoint = maxchars = maxlen - chrome_len - 2
2988 while True:
2989 partial = value[:splitpoint]
2990 encoded_value = urllib.parse.quote(
2991 partial, safe='', errors=error_handler)
2992 if len(encoded_value) <= maxchars:
2993 break
2994 splitpoint -= 1
2995 lines.append(" {}*{}*={}{}".format(
2996 name, section, extra_chrome, encoded_value))
2997 extra_chrome = ''
2998 section += 1
2999 value = value[splitpoint:]
3000 if value:
3001 lines[-1] += ';'