python (3.12.0)

(root)/
lib/
python3.12/
email/
_header_value_parser.py
       1  """Header value parser implementing various email-related RFC parsing rules.
       2  
       3  The parsing methods defined in this module implement various email related
       4  parsing rules.  Principal among them is RFC 5322, which is the follow-on
       5  to RFC 2822 and primarily a clarification of the former.  It also implements
       6  RFC 2047 encoded word decoding.
       7  
       8  RFC 5322 goes to considerable trouble to maintain backward compatibility with
       9  RFC 822 in the parse phase, while cleaning up the structure on the generation
      10  phase.  This parser supports correct RFC 5322 generation by tagging white space
      11  as folding white space only when folding is allowed in the non-obsolete rule
      12  sets.  Actually, the parser is even more generous when accepting input than RFC
      13  5322 mandates, following the spirit of Postel's Law, which RFC 5322 encourages.
      14  Where possible deviations from the standard are annotated on the 'defects'
      15  attribute of tokens that deviate.
      16  
      17  The general structure of the parser follows RFC 5322, and uses its terminology
      18  where there is a direct correspondence.  Where the implementation requires a
      19  somewhat different structure than that used by the formal grammar, new terms
      20  that mimic the closest existing terms are used.  Thus, it really helps to have
      21  a copy of RFC 5322 handy when studying this code.
      22  
      23  Input to the parser is a string that has already been unfolded according to
      24  RFC 5322 rules.  According to the RFC this unfolding is the very first step, and
      25  this parser leaves the unfolding step to a higher level message parser, which
      26  will have already detected the line breaks that need unfolding while
      27  determining the beginning and end of each header.
      28  
      29  The output of the parser is a TokenList object, which is a list subclass.  A
      30  TokenList is a recursive data structure.  The terminal nodes of the structure
      31  are Terminal objects, which are subclasses of str.  These do not correspond
      32  directly to terminal objects in the formal grammar, but are instead more
      33  practical higher level combinations of true terminals.
      34  
      35  All TokenList and Terminal objects have a 'value' attribute, which produces the
      36  semantically meaningful value of that part of the parse subtree.  The value of
      37  all whitespace tokens (no matter how many sub-tokens they may contain) is a
      38  single space, as per the RFC rules.  This includes 'CFWS', which is herein
      39  included in the general class of whitespace tokens.  There is one exception to
      40  the rule that whitespace tokens are collapsed into single spaces in values: in
      41  the value of a 'bare-quoted-string' (a quoted-string with no leading or
      42  trailing whitespace), any whitespace that appeared between the quotation marks
      43  is preserved in the returned value.  Note that in all Terminal strings quoted
      44  pairs are turned into their unquoted values.
      45  
      46  All TokenList and Terminal objects also have a string value, which attempts to
      47  be a "canonical" representation of the RFC-compliant form of the substring that
      48  produced the parsed subtree, including minimal use of quoted pair quoting.
      49  Whitespace runs are not collapsed.
      50  
      51  Comment tokens also have a 'content' attribute providing the string found
      52  between the parens (including any nested comments) with whitespace preserved.
      53  
      54  All TokenList and Terminal objects have a 'defects' attribute which is a
      55  possibly empty list of all the defects found while creating the token.  Defects
      56  may appear on any token in the tree, and a composite list of all defects in the
      57  subtree is available through the 'all_defects' attribute of any node.  (For
      58  Terminal nodes x.defects == x.all_defects.)
      59  
      60  Each object in a parse tree is called a 'token', and each has a 'token_type'
      61  attribute that gives the name from the RFC 5322 grammar that it represents.
      62  Not all RFC 5322 nodes are produced, and there is one non-RFC 5322 node that
      63  may be produced: 'ptext'.  A 'ptext' is a string of printable ascii characters.
      64  It is returned in place of lists of (ctext/quoted-pair) and
      65  (qtext/quoted-pair).
      66  
      67  XXX: provide complete list of token types.
      68  """
      69  
      70  import re
      71  import sys
      72  import urllib   # For urllib.parse.unquote
      73  from string import hexdigits
      74  from operator import itemgetter
      75  from email import _encoded_words as _ew
      76  from email import errors
      77  from email import utils
      78  
      79  #
      80  # Useful constants and functions
      81  #
      82  
      83  WSP = set(' \t')
      84  CFWS_LEADER = WSP | set('(')
      85  SPECIALS = set(r'()<>@,:;.\"[]')
      86  ATOM_ENDS = SPECIALS | WSP
      87  DOT_ATOM_ENDS = ATOM_ENDS - set('.')
      88  # '.', '"', and '(' do not end phrases in order to support obs-phrase
      89  PHRASE_ENDS = SPECIALS - set('."(')
      90  TSPECIALS = (SPECIALS | set('/?=')) - set('.')
      91  TOKEN_ENDS = TSPECIALS | WSP
      92  ASPECIALS = TSPECIALS | set("*'%")
      93  ATTRIBUTE_ENDS = ASPECIALS | WSP
      94  EXTENDED_ATTRIBUTE_ENDS = ATTRIBUTE_ENDS - set('%')
      95  
      96  def quote_string(value):
      97      return '"'+str(value).replace('\\', '\\\\').replace('"', r'\"')+'"'
      98  
# Match an RFC 2047 encoded word, which looks like =?utf-8?q?someword?=
# The charset part may be empty ([^?]*), the encoding letter is matched in
# either case through the explicit [qQbB] class, and the payload is matched
# non-greedily so the match stops at the first "?=".  The pattern contains no
# ^ or $ anchors, so re.MULTILINE does not change what it matches.
rfc2047_matcher = re.compile(r'''
   =\?            # literal =?
   [^?]*          # charset
   \?             # literal ?
   [qQbB]         # literal 'q' or 'b', case insensitive
   \?             # literal ?
  .*?             # encoded word
  \?=             # literal ?=
''', re.VERBOSE | re.MULTILINE)
     109  
     110  
     111  #
     112  # TokenList and its subclasses
     113  #
     114  
     115  class ESC[4;38;5;81mTokenList(ESC[4;38;5;149mlist):
     116  
     117      token_type = None
     118      syntactic_break = True
     119      ew_combine_allowed = True
     120  
     121      def __init__(self, *args, **kw):
     122          super().__init__(*args, **kw)
     123          self.defects = []
     124  
     125      def __str__(self):
     126          return ''.join(str(x) for x in self)
     127  
     128      def __repr__(self):
     129          return '{}({})'.format(self.__class__.__name__,
     130                               super().__repr__())
     131  
     132      @property
     133      def value(self):
     134          return ''.join(x.value for x in self if x.value)
     135  
     136      @property
     137      def all_defects(self):
     138          return sum((x.all_defects for x in self), self.defects)
     139  
     140      def startswith_fws(self):
     141          return self[0].startswith_fws()
     142  
     143      @property
     144      def as_ew_allowed(self):
     145          """True if all top level tokens of this part may be RFC2047 encoded."""
     146          return all(part.as_ew_allowed for part in self)
     147  
     148      @property
     149      def comments(self):
     150          comments = []
     151          for token in self:
     152              comments.extend(token.comments)
     153          return comments
     154  
     155      def fold(self, *, policy):
     156          return _refold_parse_tree(self, policy=policy)
     157  
     158      def pprint(self, indent=''):
     159          print(self.ppstr(indent=indent))
     160  
     161      def ppstr(self, indent=''):
     162          return '\n'.join(self._pp(indent=indent))
     163  
     164      def _pp(self, indent=''):
     165          yield '{}{}/{}('.format(
     166              indent,
     167              self.__class__.__name__,
     168              self.token_type)
     169          for token in self:
     170              if not hasattr(token, '_pp'):
     171                  yield (indent + '    !! invalid element in token '
     172                                          'list: {!r}'.format(token))
     173              else:
     174                  yield from token._pp(indent+'    ')
     175          if self.defects:
     176              extra = ' Defects: {}'.format(self.defects)
     177          else:
     178              extra = ''
     179          yield '{}){}'.format(indent, extra)
     180  
     181  
     182  class ESC[4;38;5;81mWhiteSpaceTokenList(ESC[4;38;5;149mTokenList):
     183  
     184      @property
     185      def value(self):
     186          return ' '
     187  
     188      @property
     189      def comments(self):
     190          return [x.content for x in self if x.token_type=='comment']
     191  
     192  
class UnstructuredTokenList(TokenList):
    """Parse-tree node whose token_type is 'unstructured'."""
    token_type = 'unstructured'
     195  
     196  
class Phrase(TokenList):
    """Parse-tree node whose token_type is 'phrase'."""
    token_type = 'phrase'
     199  
class Word(TokenList):
    """Parse-tree node whose token_type is 'word'."""
    token_type = 'word'
     202  
     203  
class CFWSList(WhiteSpaceTokenList):
    """Whitespace node whose token_type is 'cfws'."""
    token_type = 'cfws'
     206  
     207  
class Atom(TokenList):
    """Parse-tree node whose token_type is 'atom'."""
    token_type = 'atom'
     210  
     211  
class Token(TokenList):
    """Parse-tree node whose token_type is 'token'."""
    token_type = 'token'
    # Not read inside this class; presumably consulted by the folding code --
    # TODO confirm against the users of this attribute.
    encode_as_ew = False
     215  
     216  
class EncodedWord(TokenList):
    """Parse-tree node whose token_type is 'encoded-word'."""
    token_type = 'encoded-word'
    # Class-level defaults; presumably overwritten per instance by the code
    # that decodes the encoded word -- TODO confirm.
    cte = None
    charset = None
    lang = None
     222  
     223  
     224  class ESC[4;38;5;81mQuotedString(ESC[4;38;5;149mTokenList):
     225  
     226      token_type = 'quoted-string'
     227  
     228      @property
     229      def content(self):
     230          for x in self:
     231              if x.token_type == 'bare-quoted-string':
     232                  return x.value
     233  
     234      @property
     235      def quoted_value(self):
     236          res = []
     237          for x in self:
     238              if x.token_type == 'bare-quoted-string':
     239                  res.append(str(x))
     240              else:
     241                  res.append(x.value)
     242          return ''.join(res)
     243  
     244      @property
     245      def stripped_value(self):
     246          for token in self:
     247              if token.token_type == 'bare-quoted-string':
     248                  return token.value
     249  
     250  
     251  class ESC[4;38;5;81mBareQuotedString(ESC[4;38;5;149mQuotedString):
     252  
     253      token_type = 'bare-quoted-string'
     254  
     255      def __str__(self):
     256          return quote_string(''.join(str(x) for x in self))
     257  
     258      @property
     259      def value(self):
     260          return ''.join(str(x) for x in self)
     261  
     262  
     263  class ESC[4;38;5;81mComment(ESC[4;38;5;149mWhiteSpaceTokenList):
     264  
     265      token_type = 'comment'
     266  
     267      def __str__(self):
     268          return ''.join(sum([
     269                              ["("],
     270                              [self.quote(x) for x in self],
     271                              [")"],
     272                              ], []))
     273  
     274      def quote(self, value):
     275          if value.token_type == 'comment':
     276              return str(value)
     277          return str(value).replace('\\', '\\\\').replace(
     278                                    '(', r'\(').replace(
     279                                    ')', r'\)')
     280  
     281      @property
     282      def content(self):
     283          return ''.join(str(x) for x in self)
     284  
     285      @property
     286      def comments(self):
     287          return [self.content]
     288  
     289  class ESC[4;38;5;81mAddressList(ESC[4;38;5;149mTokenList):
     290  
     291      token_type = 'address-list'
     292  
     293      @property
     294      def addresses(self):
     295          return [x for x in self if x.token_type=='address']
     296  
     297      @property
     298      def mailboxes(self):
     299          return sum((x.mailboxes
     300                      for x in self if x.token_type=='address'), [])
     301  
     302      @property
     303      def all_mailboxes(self):
     304          return sum((x.all_mailboxes
     305                      for x in self if x.token_type=='address'), [])
     306  
     307  
     308  class ESC[4;38;5;81mAddress(ESC[4;38;5;149mTokenList):
     309  
     310      token_type = 'address'
     311  
     312      @property
     313      def display_name(self):
     314          if self[0].token_type == 'group':
     315              return self[0].display_name
     316  
     317      @property
     318      def mailboxes(self):
     319          if self[0].token_type == 'mailbox':
     320              return [self[0]]
     321          elif self[0].token_type == 'invalid-mailbox':
     322              return []
     323          return self[0].mailboxes
     324  
     325      @property
     326      def all_mailboxes(self):
     327          if self[0].token_type == 'mailbox':
     328              return [self[0]]
     329          elif self[0].token_type == 'invalid-mailbox':
     330              return [self[0]]
     331          return self[0].all_mailboxes
     332  
     333  class ESC[4;38;5;81mMailboxList(ESC[4;38;5;149mTokenList):
     334  
     335      token_type = 'mailbox-list'
     336  
     337      @property
     338      def mailboxes(self):
     339          return [x for x in self if x.token_type=='mailbox']
     340  
     341      @property
     342      def all_mailboxes(self):
     343          return [x for x in self
     344              if x.token_type in ('mailbox', 'invalid-mailbox')]
     345  
     346  
     347  class ESC[4;38;5;81mGroupList(ESC[4;38;5;149mTokenList):
     348  
     349      token_type = 'group-list'
     350  
     351      @property
     352      def mailboxes(self):
     353          if not self or self[0].token_type != 'mailbox-list':
     354              return []
     355          return self[0].mailboxes
     356  
     357      @property
     358      def all_mailboxes(self):
     359          if not self or self[0].token_type != 'mailbox-list':
     360              return []
     361          return self[0].all_mailboxes
     362  
     363  
     364  class ESC[4;38;5;81mGroup(ESC[4;38;5;149mTokenList):
     365  
     366      token_type = "group"
     367  
     368      @property
     369      def mailboxes(self):
     370          if self[2].token_type != 'group-list':
     371              return []
     372          return self[2].mailboxes
     373  
     374      @property
     375      def all_mailboxes(self):
     376          if self[2].token_type != 'group-list':
     377              return []
     378          return self[2].all_mailboxes
     379  
     380      @property
     381      def display_name(self):
     382          return self[0].display_name
     383  
     384  
     385  class ESC[4;38;5;81mNameAddr(ESC[4;38;5;149mTokenList):
     386  
     387      token_type = 'name-addr'
     388  
     389      @property
     390      def display_name(self):
     391          if len(self) == 1:
     392              return None
     393          return self[0].display_name
     394  
     395      @property
     396      def local_part(self):
     397          return self[-1].local_part
     398  
     399      @property
     400      def domain(self):
     401          return self[-1].domain
     402  
     403      @property
     404      def route(self):
     405          return self[-1].route
     406  
     407      @property
     408      def addr_spec(self):
     409          return self[-1].addr_spec
     410  
     411  
     412  class ESC[4;38;5;81mAngleAddr(ESC[4;38;5;149mTokenList):
     413  
     414      token_type = 'angle-addr'
     415  
     416      @property
     417      def local_part(self):
     418          for x in self:
     419              if x.token_type == 'addr-spec':
     420                  return x.local_part
     421  
     422      @property
     423      def domain(self):
     424          for x in self:
     425              if x.token_type == 'addr-spec':
     426                  return x.domain
     427  
     428      @property
     429      def route(self):
     430          for x in self:
     431              if x.token_type == 'obs-route':
     432                  return x.domains
     433  
     434      @property
     435      def addr_spec(self):
     436          for x in self:
     437              if x.token_type == 'addr-spec':
     438                  if x.local_part:
     439                      return x.addr_spec
     440                  else:
     441                      return quote_string(x.local_part) + x.addr_spec
     442          else:
     443              return '<>'
     444  
     445  
     446  class ESC[4;38;5;81mObsRoute(ESC[4;38;5;149mTokenList):
     447  
     448      token_type = 'obs-route'
     449  
     450      @property
     451      def domains(self):
     452          return [x.domain for x in self if x.token_type == 'domain']
     453  
     454  
     455  class ESC[4;38;5;81mMailbox(ESC[4;38;5;149mTokenList):
     456  
     457      token_type = 'mailbox'
     458  
     459      @property
     460      def display_name(self):
     461          if self[0].token_type == 'name-addr':
     462              return self[0].display_name
     463  
     464      @property
     465      def local_part(self):
     466          return self[0].local_part
     467  
     468      @property
     469      def domain(self):
     470          return self[0].domain
     471  
     472      @property
     473      def route(self):
     474          if self[0].token_type == 'name-addr':
     475              return self[0].route
     476  
     477      @property
     478      def addr_spec(self):
     479          return self[0].addr_spec
     480  
     481  
class InvalidMailbox(TokenList):
    """Parse-tree node whose token_type is 'invalid-mailbox'.

    It exposes the same attribute names as Mailbox, all of which are None.
    """

    token_type = 'invalid-mailbox'

    @property
    def display_name(self):
        return None

    # Reuse the same always-None property object for the remaining
    # mailbox attributes.
    local_part = domain = route = addr_spec = display_name
     491  
     492  
     493  class ESC[4;38;5;81mDomain(ESC[4;38;5;149mTokenList):
     494  
     495      token_type = 'domain'
     496      as_ew_allowed = False
     497  
     498      @property
     499      def domain(self):
     500          return ''.join(super().value.split())
     501  
     502  
class DotAtom(TokenList):
    """Parse-tree node whose token_type is 'dot-atom'."""
    token_type = 'dot-atom'
     505  
     506  
class DotAtomText(TokenList):
    """Parse-tree node whose token_type is 'dot-atom-text'."""
    token_type = 'dot-atom-text'
    # Plain attribute overriding the TokenList.as_ew_allowed property, so the
    # answer is True without consulting the children.
    as_ew_allowed = True
     510  
     511  
class NoFoldLiteral(TokenList):
    """Parse-tree node whose token_type is 'no-fold-literal'."""
    token_type = 'no-fold-literal'
    # Overrides the TokenList.as_ew_allowed property with a constant False.
    as_ew_allowed = False
     515  
     516  
     517  class ESC[4;38;5;81mAddrSpec(ESC[4;38;5;149mTokenList):
     518  
     519      token_type = 'addr-spec'
     520      as_ew_allowed = False
     521  
     522      @property
     523      def local_part(self):
     524          return self[0].local_part
     525  
     526      @property
     527      def domain(self):
     528          if len(self) < 3:
     529              return None
     530          return self[-1].domain
     531  
     532      @property
     533      def value(self):
     534          if len(self) < 3:
     535              return self[0].value
     536          return self[0].value.rstrip()+self[1].value+self[2].value.lstrip()
     537  
     538      @property
     539      def addr_spec(self):
     540          nameset = set(self.local_part)
     541          if len(nameset) > len(nameset-DOT_ATOM_ENDS):
     542              lp = quote_string(self.local_part)
     543          else:
     544              lp = self.local_part
     545          if self.domain is not None:
     546              return lp + '@' + self.domain
     547          return lp
     548  
     549  
class ObsLocalPart(TokenList):
    """Parse-tree node whose token_type is 'obs-local-part'."""

    token_type = 'obs-local-part'
    # Overrides the TokenList.as_ew_allowed property with a constant False.
    as_ew_allowed = False
     554  
     555  
     556  class ESC[4;38;5;81mDisplayName(ESC[4;38;5;149mPhrase):
     557  
     558      token_type = 'display-name'
     559      ew_combine_allowed = False
     560  
     561      @property
     562      def display_name(self):
     563          res = TokenList(self)
     564          if len(res) == 0:
     565              return res.value
     566          if res[0].token_type == 'cfws':
     567              res.pop(0)
     568          else:
     569              if res[0][0].token_type == 'cfws':
     570                  res[0] = TokenList(res[0][1:])
     571          if res[-1].token_type == 'cfws':
     572              res.pop()
     573          else:
     574              if res[-1][-1].token_type == 'cfws':
     575                  res[-1] = TokenList(res[-1][:-1])
     576          return res.value
     577  
     578      @property
     579      def value(self):
     580          quote = False
     581          if self.defects:
     582              quote = True
     583          else:
     584              for x in self:
     585                  if x.token_type == 'quoted-string':
     586                      quote = True
     587          if len(self) != 0 and quote:
     588              pre = post = ''
     589              if self[0].token_type=='cfws' or self[0][0].token_type=='cfws':
     590                  pre = ' '
     591              if self[-1].token_type=='cfws' or self[-1][-1].token_type=='cfws':
     592                  post = ' '
     593              return pre+quote_string(self.display_name)+post
     594          else:
     595              return super().value
     596  
     597  
class LocalPart(TokenList):
    """Parse-tree node whose token_type is 'local-part'.

    Both properties look only at the first child.
    """

    token_type = 'local-part'
    # Overrides the TokenList.as_ew_allowed property with a constant False.
    as_ew_allowed = False

    @property
    def value(self):
        """Value of the first child, kept in quoted form for a quoted-string."""
        if self[0].token_type == "quoted-string":
            return self[0].quoted_value
        else:
            return self[0].value

    @property
    def local_part(self):
        """The local part with CFWS removed at the ends and around dots."""
        # Strip whitespace from front, back, and around dots.
        # A DOT sentinel is placed at both ends so that the first and last
        # real tokens are handled like tokens adjacent to a dot; both
        # sentinels are sliced off again before the value is computed.
        res = [DOT]
        last = DOT
        last_is_tl = False
        for tok in self[0] + [DOT]:
            # CFWS that is a direct child is dropped outright.
            if tok.token_type == 'cfws':
                continue
            # A dot follows a token list that ends in CFWS: replace the
            # stored copy of that list with one lacking the trailing CFWS.
            if (last_is_tl and tok.token_type == 'dot' and
                    last[-1].token_type == 'cfws'):
                res[-1] = TokenList(last[:-1])
            is_tl = isinstance(tok, TokenList)
            # A token list that follows a dot and starts with CFWS is stored
            # without its leading CFWS.
            if (is_tl and last.token_type == 'dot' and
                    tok[0].token_type == 'cfws'):
                res.append(TokenList(tok[1:]))
            else:
                res.append(tok)
            last = res[-1]
            last_is_tl = is_tl
        res = TokenList(res[1:-1])
        return res.value
     632  
     633  
     634  class ESC[4;38;5;81mDomainLiteral(ESC[4;38;5;149mTokenList):
     635  
     636      token_type = 'domain-literal'
     637      as_ew_allowed = False
     638  
     639      @property
     640      def domain(self):
     641          return ''.join(super().value.split())
     642  
     643      @property
     644      def ip(self):
     645          for x in self:
     646              if x.token_type == 'ptext':
     647                  return x.value
     648  
     649  
class MIMEVersion(TokenList):
    """Parse-tree node whose token_type is 'mime-version'."""

    token_type = 'mime-version'
    # Class-level defaults; presumably replaced per instance by the parser
    # once a version number has been read -- TODO confirm.
    major = None
    minor = None
     655  
     656  
     657  class ESC[4;38;5;81mParameter(ESC[4;38;5;149mTokenList):
     658  
     659      token_type = 'parameter'
     660      sectioned = False
     661      extended = False
     662      charset = 'us-ascii'
     663  
     664      @property
     665      def section_number(self):
     666          # Because the first token, the attribute (name) eats CFWS, the second
     667          # token is always the section if there is one.
     668          return self[1].number if self.sectioned else 0
     669  
     670      @property
     671      def param_value(self):
     672          # This is part of the "handle quoted extended parameters" hack.
     673          for token in self:
     674              if token.token_type == 'value':
     675                  return token.stripped_value
     676              if token.token_type == 'quoted-string':
     677                  for token in token:
     678                      if token.token_type == 'bare-quoted-string':
     679                          for token in token:
     680                              if token.token_type == 'value':
     681                                  return token.stripped_value
     682          return ''
     683  
     684  
class InvalidParameter(Parameter):
    """A Parameter whose token_type is 'invalid-parameter'."""

    token_type = 'invalid-parameter'
     688  
     689  
     690  class ESC[4;38;5;81mAttribute(ESC[4;38;5;149mTokenList):
     691  
     692      token_type = 'attribute'
     693  
     694      @property
     695      def stripped_value(self):
     696          for token in self:
     697              if token.token_type.endswith('attrtext'):
     698                  return token.value
     699  
class Section(TokenList):
    """Parse-tree node whose token_type is 'section'."""

    token_type = 'section'
    # Read by Parameter.section_number; None until assigned on an instance.
    number = None
     704  
     705  
     706  class ESC[4;38;5;81mValue(ESC[4;38;5;149mTokenList):
     707  
     708      token_type = 'value'
     709  
     710      @property
     711      def stripped_value(self):
     712          token = self[0]
     713          if token.token_type == 'cfws':
     714              token = self[1]
     715          if token.token_type.endswith(
     716                  ('quoted-string', 'attribute', 'extended-attribute')):
     717              return token.stripped_value
     718          return self.value
     719  
     720  
class MimeParameters(TokenList):
    """Parse-tree node whose token_type is 'mime-parameters'.

    Iterating 'params' yields (name, value) pairs with multi-section
    parameters joined into one value; str() renders those pairs as a
    '; '-separated list in which every non-empty value is quoted.
    """

    token_type = 'mime-parameters'
    syntactic_break = False

    @property
    def params(self):
        """Generate (name, value) pairs, one per distinct parameter name.

        Side effect: defects for duplicate or inconsistently numbered
        parameters and for undecodable bytes are appended to the affected
        parameter tokens every time the generator is consumed.
        """
        # The RFC specifically states that the ordering of parameters is not
        # guaranteed and may be reordered by the transport layer.  So we have
        # to assume the RFC 2231 pieces can come in any order.  However, we
        # output them in the order that we first see a given name, which gives
        # us a stable __str__.
        params = {}  # Using order preserving dict from Python 3.7+
        # Pass 1: group the parameter tokens by name, remembering each
        # token's section number.
        for token in self:
            if not token.token_type.endswith('parameter'):
                continue
            if token[0].token_type != 'attribute':
                continue
            name = token[0].value.strip()
            if name not in params:
                params[name] = []
            params[name].append((token.section_number, token))
        # Pass 2: for each name, order the sections and join their values.
        for name, parts in params.items():
            parts = sorted(parts, key=itemgetter(0))
            first_param = parts[0][1]
            # The charset of the lowest-numbered section is used for all
            # sections of this parameter.
            charset = first_param.charset
            # Our arbitrary error recovery is to ignore duplicate parameters,
            # to use appearance order if there are duplicate rfc 2231 parts,
            # and to ignore gaps.  This mimics the error recovery of get_param.
            if not first_param.extended and len(parts) > 1:
                if parts[1][0] == 0:
                    parts[1][1].defects.append(errors.InvalidHeaderDefect(
                        'duplicate parameter name; duplicate(s) ignored'))
                    parts = parts[:1]
                # Else assume the *0* was missing...note that this is different
                # from get_param, but we registered a defect for this earlier.
            value_parts = []
            # i is the section number expected next.
            i = 0
            for section_number, param in parts:
                if section_number != i:
                    # We could get fancier here and look for a complete
                    # duplicate extended parameter and ignore the second one
                    # seen.  But we're not doing that.  The old code didn't.
                    if not param.extended:
                        param.defects.append(errors.InvalidHeaderDefect(
                            'duplicate parameter name; duplicate ignored'))
                        continue
                    else:
                        param.defects.append(errors.InvalidHeaderDefect(
                            "inconsistent RFC2231 parameter numbering"))
                i += 1
                value = param.param_value
                if param.extended:
                    try:
                        value = urllib.parse.unquote_to_bytes(value)
                    except UnicodeEncodeError:
                        # source had surrogate escaped bytes.  What we do now
                        # is a bit of an open question.  I'm not sure this is
                        # the best choice, but it is what the old algorithm did
                        value = urllib.parse.unquote(value, encoding='latin-1')
                    else:
                        # Only reached when unquote_to_bytes succeeded, so
                        # value is bytes here and still has to be decoded.
                        try:
                            value = value.decode(charset, 'surrogateescape')
                        except (LookupError, UnicodeEncodeError):
                            # XXX: there should really be a custom defect for
                            # unknown character set to make it easy to find,
                            # because otherwise unknown charset is a silent
                            # failure.
                            value = value.decode('us-ascii', 'surrogateescape')
                        if utils._has_surrogates(value):
                            param.defects.append(errors.UndecodableBytesDefect())
                value_parts.append(value)
            value = ''.join(value_parts)
            yield name, value

    def __str__(self):
        # Parameters with an empty value are rendered as the bare name.  A
        # non-empty result is prefixed with a single space.
        params = []
        for name, value in self.params:
            if value:
                params.append('{}={}'.format(name, quote_string(value)))
            else:
                params.append(name)
        params = '; '.join(params)
        return ' ' + params if params else ''
     805  
     806  
     807  class ESC[4;38;5;81mParameterizedHeaderValue(ESC[4;38;5;149mTokenList):
     808  
     809      # Set this false so that the value doesn't wind up on a new line even
     810      # if it and the parameters would fit there but not on the first line.
     811      syntactic_break = False
     812  
     813      @property
     814      def params(self):
     815          for token in reversed(self):
     816              if token.token_type == 'mime-parameters':
     817                  return token.params
     818          return {}
     819  
     820  
class ContentType(ParameterizedHeaderValue):
    """Token list whose token type is 'content-type'."""
    token_type = 'content-type'
    as_ew_allowed = False
    # Class-level defaults for the two halves of the MIME type.
    # NOTE(review): presumably replaced on the instance by the parser when
    # the header supplies a type -- confirm against parse_content_type_header.
    maintype = 'text'
    subtype = 'plain'
     826  
     827  
class ContentDisposition(ParameterizedHeaderValue):
    """Token list whose token type is 'content-disposition'."""
    token_type = 'content-disposition'
    as_ew_allowed = False
    # No disposition is known at class level.
    # NOTE(review): presumably set on the instance by the parser -- confirm.
    content_disposition = None
     832  
     833  
class ContentTransferEncoding(TokenList):
    """Token list whose token type is 'content-transfer-encoding'."""
    token_type = 'content-transfer-encoding'
    as_ew_allowed = False
    # Class-level default encoding name.
    # NOTE(review): presumably replaced on the instance by the parser when
    # the header names an encoding -- confirm.
    cte = '7bit'
     838  
     839  
class HeaderLabel(TokenList):
    """Token list whose token type is 'header-label'."""
    token_type = 'header-label'
    as_ew_allowed = False
     843  
     844  
class MsgID(TokenList):
    """Token list whose token type is 'msg-id'; it is emitted unfolded."""
    token_type = 'msg-id'
    as_ew_allowed = False

    def fold(self, policy):
        """Return ``str(self)`` followed by ``policy.linesep``.

        No line breaks are inserted, whatever the length of the text.
        """
        # message-id tokens may not be folded.
        return str(self) + policy.linesep
     852  
     853  
class MessageID(MsgID):
    """MsgID variant whose token type is 'message-id'."""
    token_type = 'message-id'
     856  
     857  
class InvalidMessageID(MessageID):
    """MessageID variant whose token type is 'invalid-message-id'."""
    token_type = 'invalid-message-id'
     860  
     861  
class Header(TokenList):
    """Token list whose token type is 'header'."""
    token_type = 'header'
     864  
     865  
     866  #
     867  # Terminal classes and instances
     868  #
     869  
     870  class ESC[4;38;5;81mTerminal(ESC[4;38;5;149mstr):
     871  
     872      as_ew_allowed = True
     873      ew_combine_allowed = True
     874      syntactic_break = True
     875  
     876      def __new__(cls, value, token_type):
     877          self = super().__new__(cls, value)
     878          self.token_type = token_type
     879          self.defects = []
     880          return self
     881  
     882      def __repr__(self):
     883          return "{}({})".format(self.__class__.__name__, super().__repr__())
     884  
     885      def pprint(self):
     886          print(self.__class__.__name__ + '/' + self.token_type)
     887  
     888      @property
     889      def all_defects(self):
     890          return list(self.defects)
     891  
     892      def _pp(self, indent=''):
     893          return ["{}{}/{}({}){}".format(
     894              indent,
     895              self.__class__.__name__,
     896              self.token_type,
     897              super().__repr__(),
     898              '' if not self.defects else ' {}'.format(self.defects),
     899              )]
     900  
     901      def pop_trailing_ws(self):
     902          # This terminates the recursion.
     903          return None
     904  
     905      @property
     906      def comments(self):
     907          return []
     908  
     909      def __getnewargs__(self):
     910          return(str(self), self.token_type)
     911  
     912  
class WhiteSpaceTerminal(Terminal):
    """Terminal for a run of white space; its value is a single space."""

    @property
    def value(self):
        """Always ``' '``, whatever white space text the terminal holds."""
        return ' '

    def startswith_fws(self):
        """Return True: this terminal is itself white space."""
        return True
     921  
     922  
class ValueTerminal(Terminal):
    """Terminal whose value is its own text."""

    @property
    def value(self):
        """The terminal itself (a ``str`` subclass instance)."""
        return self

    def startswith_fws(self):
        """Return False unconditionally."""
        return False
     931  
     932  
class EWWhiteSpaceTerminal(WhiteSpaceTerminal):
    """White space that renders as nothing.

    get_unstructured and get_bare_quoted_string substitute this for the
    'fws' token that sits between two encoded words, so that white space
    disappears from both ``value`` and ``str()``.
    """

    @property
    def value(self):
        """Always the empty string."""
        return ''

    def __str__(self):
        """Always the empty string, regardless of the underlying text."""
        return ''
     941  
     942  
class _InvalidEwError(errors.HeaderParseError):
    """Invalid encoded word found while parsing headers.

    Raised by get_encoded_word when the '=?...?=' text has been located but
    cannot be decoded.  get_unstructured catches it separately from other
    HeaderParseErrors so that it can avoid re-splitting on the same '=?'.
    """
     945  
     946  
# Shared ValueTerminal instances for single-character punctuation tokens.
# XXX: these should become classes that are instantiated on each use, so
# that a program mutating one inside a parse tree can't corrupt other parse
# trees that hold the very same object.  There should probably be tests for
# that, too.
DOT = ValueTerminal('.', 'dot')
ListSeparator = ValueTerminal(',', 'list-separator')
RouteComponentMarker = ValueTerminal('@', 'route-component-marker')
     953  
     954  #
     955  # Parser
     956  #
     957  
     958  # Parse strings according to RFC822/2047/2822/5322 rules.
     959  #
     960  # This is a stateless parser.  Each get_XXX function accepts a string and
     961  # returns either a Terminal or a TokenList representing the RFC object named
     962  # by the method and a string containing the remaining unparsed characters
     963  # from the input.  Thus a parser method consumes the next syntactic construct
     964  # of a given type and returns a token representing the construct plus the
     965  # unparsed remainder of the input string.
     966  #
     967  # For example, if the first element of a structured header is a 'phrase',
     968  # then:
     969  #
     970  #     phrase, value = get_phrase(value)
     971  #
     972  # returns the complete phrase from the start of the string value, plus any
     973  # characters left in the string after the phrase is removed.
     974  
# Bound methods of precompiled patterns, used by the get_XXX functions.

# Split on runs of WSP characters.  The pattern is a capturing group, so the
# white space runs themselves are kept in the resulting list.
_wsp_splitter = re.compile(r'([{}]+)'.format(''.join(WSP))).split
# Match a non-empty leading run of characters that are not in ATOM_ENDS.
_non_atom_end_matcher = re.compile(r"[^{}]+".format(
    re.escape(''.join(ATOM_ENDS)))).match
# Find every character in the ranges \x00-\x20 and \x7F.
_non_printable_finder = re.compile(r"[\x00-\x20\x7F]").findall
# Match a non-empty leading run of characters that are not in TOKEN_ENDS.
_non_token_end_matcher = re.compile(r"[^{}]+".format(
    re.escape(''.join(TOKEN_ENDS)))).match
# Match a non-empty leading run of characters not in ATTRIBUTE_ENDS.
_non_attribute_end_matcher = re.compile(r"[^{}]+".format(
    re.escape(''.join(ATTRIBUTE_ENDS)))).match
# Match a non-empty leading run of characters not in EXTENDED_ATTRIBUTE_ENDS.
_non_extended_attribute_end_matcher = re.compile(r"[^{}]+".format(
    re.escape(''.join(EXTENDED_ATTRIBUTE_ENDS)))).match
     985  
     986  def _validate_xtext(xtext):
     987      """If input token contains ASCII non-printables, register a defect."""
     988  
     989      non_printables = _non_printable_finder(xtext)
     990      if non_printables:
     991          xtext.defects.append(errors.NonPrintableDefect(non_printables))
     992      if utils._has_surrogates(xtext):
     993          xtext.defects.append(errors.UndecodableBytesDefect(
     994              "Non-ASCII characters found in header token"))
     995  
     996  def _get_ptext_to_endchars(value, endchars):
     997      """Scan printables/quoted-pairs until endchars and return unquoted ptext.
     998  
     999      This function turns a run of qcontent, ccontent-without-comments, or
    1000      dtext-with-quoted-printables into a single string by unquoting any
    1001      quoted printables.  It returns the string, the remaining value, and
    1002      a flag that is True iff there were any quoted printables decoded.
    1003  
    1004      """
    1005      fragment, *remainder = _wsp_splitter(value, 1)
    1006      vchars = []
    1007      escape = False
    1008      had_qp = False
    1009      for pos in range(len(fragment)):
    1010          if fragment[pos] == '\\':
    1011              if escape:
    1012                  escape = False
    1013                  had_qp = True
    1014              else:
    1015                  escape = True
    1016                  continue
    1017          if escape:
    1018              escape = False
    1019          elif fragment[pos] in endchars:
    1020              break
    1021          vchars.append(fragment[pos])
    1022      else:
    1023          pos = pos + 1
    1024      return ''.join(vchars), ''.join([fragment[pos:]] + remainder), had_qp
    1025  
    1026  def get_fws(value):
    1027      """FWS = 1*WSP
    1028  
    1029      This isn't the RFC definition.  We're using fws to represent tokens where
    1030      folding can be done, but when we are parsing the *un*folding has already
    1031      been done so we don't need to watch out for CRLF.
    1032  
    1033      """
    1034      newvalue = value.lstrip()
    1035      fws = WhiteSpaceTerminal(value[:len(value)-len(newvalue)], 'fws')
    1036      return fws, newvalue
    1037  
def get_encoded_word(value):
    """ encoded-word = "=?" charset "?" encoding "?" encoded-text "?="

    Parse an RFC 2047 encoded word from the start of *value* and return
    ``(ew, rest)``: an EncodedWord token list holding the decoded text as
    'vtext' and 'fws' terminals, and the text following the closing "?=".

    The token's ``charset`` and ``lang`` attributes are taken from the
    decoder, and its defects include those reported by the decoder.  Extra
    defects are registered for white space inside the encoded word and for
    a missing white space character after it.

    Raises HeaderParseError when *value* does not start with "=?" or has no
    closing "?=", and _InvalidEwError when the located word cannot be
    decoded.
    """
    ew = EncodedWord()
    if not value.startswith('=?'):
        raise errors.HeaderParseError(
            "expected encoded word but found {}".format(value))
    tok, *remainder = value[2:].split('?=', 1)
    # split() returned the text unchanged: there is no closing '?='.
    if tok == value[2:]:
        raise errors.HeaderParseError(
            "expected encoded word but found {}".format(value))
    remstr = ''.join(remainder)
    if (len(remstr) > 1 and
        remstr[0] in hexdigits and
        remstr[1] in hexdigits and
        tok.count('?') < 2):
        # The ? after the CTE was followed by an encoded word escape (=XX).
        # The '?=' found above was therefore not the terminator; extend tok
        # up to the next '?=' instead.
        rest, *remainder = remstr.split('?=', 1)
        tok = tok + '?=' + rest
    if len(tok.split()) > 1:
        ew.defects.append(errors.InvalidHeaderDefect(
            "whitespace inside encoded word"))
    # NOTE(review): this stores the whole unparsed input (it is what the
    # error message below reports), not only the transfer encoding.
    ew.cte = value
    value = ''.join(remainder)
    try:
        text, charset, lang, defects = _ew.decode('=?' + tok + '?=')
    except (ValueError, KeyError):
        raise _InvalidEwError(
            "encoded word format invalid: '{}'".format(ew.cte))
    ew.charset = charset
    ew.lang = lang
    ew.defects.extend(defects)
    # Break the decoded text into alternating 'fws' and 'vtext' terminals.
    while text:
        if text[0] in WSP:
            token, text = get_fws(text)
            ew.append(token)
            continue
        chars, *remainder = _wsp_splitter(text, 1)
        vtext = ValueTerminal(chars, 'vtext')
        _validate_xtext(vtext)
        ew.append(vtext)
        text = ''.join(remainder)
    # Encoded words should be followed by a WS
    if value and value[0] not in WSP:
        ew.defects.append(errors.InvalidHeaderDefect(
            "missing trailing whitespace after encoded-word"))
    return ew, value
    1086  
def get_unstructured(value):
    """unstructured = (*([FWS] vchar) *WSP) / obs-unstruct
       obs-unstruct = *((*LF *CR *(obs-utext) *LF *CR)) / FWS)
       obs-utext = %d0 / obs-NO-WS-CTL / LF / CR

       obs-NO-WS-CTL is control characters except WSP/CR/LF.

    So, basically, we have printable runs, plus control characters or nulls in
    the obsolete syntax, separated by whitespace.  Since RFC 2047 uses the
    obsolete syntax in its specification, but requires whitespace on either
    side of the encoded words, I can see no reason to need to separate the
    non-printable-non-whitespace from the printable runs if they occur, so we
    parse this into xtext tokens separated by WSP tokens.

    Because an 'unstructured' value must by definition constitute the entire
    value, this 'get' routine does not return a remaining value, only the
    parsed TokenList.

    The result is an UnstructuredTokenList of 'fws', 'encoded-word' and
    'vtext' tokens.  A defect is registered on the list when an encoded word
    is not preceded by white space.

    """
    # XXX: but what about bare CR and LF?  They might signal the start or
    # end of an encoded word.  YAGNI for now, since our current parsers
    # will never send us strings with bare CR or LF.

    unstructured = UnstructuredTokenList()
    while value:
        if value[0] in WSP:
            token, value = get_fws(value)
            unstructured.append(token)
            continue
        valid_ew = True
        if value.startswith('=?'):
            try:
                token, value = get_encoded_word(value)
            except _InvalidEwError:
                # An encoded word was delimited but could not be decoded;
                # remember that so the split below is skipped.
                valid_ew = False
            except errors.HeaderParseError:
                # XXX: Need to figure out how to register defects when
                # appropriate here.
                # Not an encoded word after all: fall through and treat the
                # text as ordinary vtext.
                pass
            else:
                have_ws = True
                if len(unstructured) > 0:
                    if unstructured[-1].token_type != 'fws':
                        unstructured.defects.append(errors.InvalidHeaderDefect(
                            "missing whitespace before encoded word"))
                        have_ws = False
                if have_ws and len(unstructured) > 1:
                    # White space that separates two encoded words is
                    # replaced by a terminal that renders as ''.
                    if unstructured[-2].token_type == 'encoded-word':
                        unstructured[-1] = EWWhiteSpaceTerminal(
                            unstructured[-1], 'fws')
                unstructured.append(token)
                continue
        tok, *remainder = _wsp_splitter(value, 1)
        # Split in the middle of an atom if there is a rfc2047 encoded word
        # which does not have WSP on both sides. The defect will be registered
        # the next time through the loop.
        # This needs to only be performed when the encoded word is valid;
        # otherwise, performing it on an invalid encoded word can cause
        # the parser to go in an infinite loop.
        if valid_ew and rfc2047_matcher.search(tok):
            tok, *remainder = value.partition('=?')
        vtext = ValueTerminal(tok, 'vtext')
        _validate_xtext(vtext)
        unstructured.append(vtext)
        value = ''.join(remainder)
    return unstructured
    1153  
    1154  def get_qp_ctext(value):
    1155      r"""ctext = <printable ascii except \ ( )>
    1156  
    1157      This is not the RFC ctext, since we are handling nested comments in comment
    1158      and unquoting quoted-pairs here.  We allow anything except the '()'
    1159      characters, but if we find any ASCII other than the RFC defined printable
    1160      ASCII, a NonPrintableDefect is added to the token's defects list.  Since
    1161      quoted pairs are converted to their unquoted values, what is returned is
    1162      a 'ptext' token.  In this case it is a WhiteSpaceTerminal, so it's value
    1163      is ' '.
    1164  
    1165      """
    1166      ptext, value, _ = _get_ptext_to_endchars(value, '()')
    1167      ptext = WhiteSpaceTerminal(ptext, 'ptext')
    1168      _validate_xtext(ptext)
    1169      return ptext, value
    1170  
    1171  def get_qcontent(value):
    1172      """qcontent = qtext / quoted-pair
    1173  
    1174      We allow anything except the DQUOTE character, but if we find any ASCII
    1175      other than the RFC defined printable ASCII, a NonPrintableDefect is
    1176      added to the token's defects list.  Any quoted pairs are converted to their
    1177      unquoted values, so what is returned is a 'ptext' token.  In this case it
    1178      is a ValueTerminal.
    1179  
    1180      """
    1181      ptext, value, _ = _get_ptext_to_endchars(value, '"')
    1182      ptext = ValueTerminal(ptext, 'ptext')
    1183      _validate_xtext(ptext)
    1184      return ptext, value
    1185  
    1186  def get_atext(value):
    1187      """atext = <matches _atext_matcher>
    1188  
    1189      We allow any non-ATOM_ENDS in atext, but add an InvalidATextDefect to
    1190      the token's defects list if we find non-atext characters.
    1191      """
    1192      m = _non_atom_end_matcher(value)
    1193      if not m:
    1194          raise errors.HeaderParseError(
    1195              "expected atext but found '{}'".format(value))
    1196      atext = m.group()
    1197      value = value[len(atext):]
    1198      atext = ValueTerminal(atext, 'atext')
    1199      _validate_xtext(atext)
    1200      return atext, value
    1201  
    1202  def get_bare_quoted_string(value):
    1203      """bare-quoted-string = DQUOTE *([FWS] qcontent) [FWS] DQUOTE
    1204  
    1205      A quoted-string without the leading or trailing white space.  Its
    1206      value is the text between the quote marks, with whitespace
    1207      preserved and quoted pairs decoded.
    1208      """
    1209      if value[0] != '"':
    1210          raise errors.HeaderParseError(
    1211              "expected '\"' but found '{}'".format(value))
    1212      bare_quoted_string = BareQuotedString()
    1213      value = value[1:]
    1214      if value and value[0] == '"':
    1215          token, value = get_qcontent(value)
    1216          bare_quoted_string.append(token)
    1217      while value and value[0] != '"':
    1218          if value[0] in WSP:
    1219              token, value = get_fws(value)
    1220          elif value[:2] == '=?':
    1221              valid_ew = False
    1222              try:
    1223                  token, value = get_encoded_word(value)
    1224                  bare_quoted_string.defects.append(errors.InvalidHeaderDefect(
    1225                      "encoded word inside quoted string"))
    1226                  valid_ew = True
    1227              except errors.HeaderParseError:
    1228                  token, value = get_qcontent(value)
    1229              # Collapse the whitespace between two encoded words that occur in a
    1230              # bare-quoted-string.
    1231              if valid_ew and len(bare_quoted_string) > 1:
    1232                  if (bare_quoted_string[-1].token_type == 'fws' and
    1233                          bare_quoted_string[-2].token_type == 'encoded-word'):
    1234                      bare_quoted_string[-1] = EWWhiteSpaceTerminal(
    1235                          bare_quoted_string[-1], 'fws')
    1236          else:
    1237              token, value = get_qcontent(value)
    1238          bare_quoted_string.append(token)
    1239      if not value:
    1240          bare_quoted_string.defects.append(errors.InvalidHeaderDefect(
    1241              "end of header inside quoted string"))
    1242          return bare_quoted_string, value
    1243      return bare_quoted_string, value[1:]
    1244  
    1245  def get_comment(value):
    1246      """comment = "(" *([FWS] ccontent) [FWS] ")"
    1247         ccontent = ctext / quoted-pair / comment
    1248  
    1249      We handle nested comments here, and quoted-pair in our qp-ctext routine.
    1250      """
    1251      if value and value[0] != '(':
    1252          raise errors.HeaderParseError(
    1253              "expected '(' but found '{}'".format(value))
    1254      comment = Comment()
    1255      value = value[1:]
    1256      while value and value[0] != ")":
    1257          if value[0] in WSP:
    1258              token, value = get_fws(value)
    1259          elif value[0] == '(':
    1260              token, value = get_comment(value)
    1261          else:
    1262              token, value = get_qp_ctext(value)
    1263          comment.append(token)
    1264      if not value:
    1265          comment.defects.append(errors.InvalidHeaderDefect(
    1266              "end of header inside comment"))
    1267          return comment, value
    1268      return comment, value[1:]
    1269  
    1270  def get_cfws(value):
    1271      """CFWS = (1*([FWS] comment) [FWS]) / FWS
    1272  
    1273      """
    1274      cfws = CFWSList()
    1275      while value and value[0] in CFWS_LEADER:
    1276          if value[0] in WSP:
    1277              token, value = get_fws(value)
    1278          else:
    1279              token, value = get_comment(value)
    1280          cfws.append(token)
    1281      return cfws, value
    1282  
    1283  def get_quoted_string(value):
    1284      """quoted-string = [CFWS] <bare-quoted-string> [CFWS]
    1285  
    1286      'bare-quoted-string' is an intermediate class defined by this
    1287      parser and not by the RFC grammar.  It is the quoted string
    1288      without any attached CFWS.
    1289      """
    1290      quoted_string = QuotedString()
    1291      if value and value[0] in CFWS_LEADER:
    1292          token, value = get_cfws(value)
    1293          quoted_string.append(token)
    1294      token, value = get_bare_quoted_string(value)
    1295      quoted_string.append(token)
    1296      if value and value[0] in CFWS_LEADER:
    1297          token, value = get_cfws(value)
    1298          quoted_string.append(token)
    1299      return quoted_string, value
    1300  
    1301  def get_atom(value):
    1302      """atom = [CFWS] 1*atext [CFWS]
    1303  
    1304      An atom could be an rfc2047 encoded word.
    1305      """
    1306      atom = Atom()
    1307      if value and value[0] in CFWS_LEADER:
    1308          token, value = get_cfws(value)
    1309          atom.append(token)
    1310      if value and value[0] in ATOM_ENDS:
    1311          raise errors.HeaderParseError(
    1312              "expected atom but found '{}'".format(value))
    1313      if value.startswith('=?'):
    1314          try:
    1315              token, value = get_encoded_word(value)
    1316          except errors.HeaderParseError:
    1317              # XXX: need to figure out how to register defects when
    1318              # appropriate here.
    1319              token, value = get_atext(value)
    1320      else:
    1321          token, value = get_atext(value)
    1322      atom.append(token)
    1323      if value and value[0] in CFWS_LEADER:
    1324          token, value = get_cfws(value)
    1325          atom.append(token)
    1326      return atom, value
    1327  
    1328  def get_dot_atom_text(value):
    1329      """ dot-text = 1*atext *("." 1*atext)
    1330  
    1331      """
    1332      dot_atom_text = DotAtomText()
    1333      if not value or value[0] in ATOM_ENDS:
    1334          raise errors.HeaderParseError("expected atom at a start of "
    1335              "dot-atom-text but found '{}'".format(value))
    1336      while value and value[0] not in ATOM_ENDS:
    1337          token, value = get_atext(value)
    1338          dot_atom_text.append(token)
    1339          if value and value[0] == '.':
    1340              dot_atom_text.append(DOT)
    1341              value = value[1:]
    1342      if dot_atom_text[-1] is DOT:
    1343          raise errors.HeaderParseError("expected atom at end of dot-atom-text "
    1344              "but found '{}'".format('.'+value))
    1345      return dot_atom_text, value
    1346  
    1347  def get_dot_atom(value):
    1348      """ dot-atom = [CFWS] dot-atom-text [CFWS]
    1349  
    1350      Any place we can have a dot atom, we could instead have an rfc2047 encoded
    1351      word.
    1352      """
    1353      dot_atom = DotAtom()
    1354      if value[0] in CFWS_LEADER:
    1355          token, value = get_cfws(value)
    1356          dot_atom.append(token)
    1357      if value.startswith('=?'):
    1358          try:
    1359              token, value = get_encoded_word(value)
    1360          except errors.HeaderParseError:
    1361              # XXX: need to figure out how to register defects when
    1362              # appropriate here.
    1363              token, value = get_dot_atom_text(value)
    1364      else:
    1365          token, value = get_dot_atom_text(value)
    1366      dot_atom.append(token)
    1367      if value and value[0] in CFWS_LEADER:
    1368          token, value = get_cfws(value)
    1369          dot_atom.append(token)
    1370      return dot_atom, value
    1371  
    1372  def get_word(value):
    1373      """word = atom / quoted-string
    1374  
    1375      Either atom or quoted-string may start with CFWS.  We have to peel off this
    1376      CFWS first to determine which type of word to parse.  Afterward we splice
    1377      the leading CFWS, if any, into the parsed sub-token.
    1378  
    1379      If neither an atom or a quoted-string is found before the next special, a
    1380      HeaderParseError is raised.
    1381  
    1382      The token returned is either an Atom or a QuotedString, as appropriate.
    1383      This means the 'word' level of the formal grammar is not represented in the
    1384      parse tree; this is because having that extra layer when manipulating the
    1385      parse tree is more confusing than it is helpful.
    1386  
    1387      """
    1388      if value[0] in CFWS_LEADER:
    1389          leader, value = get_cfws(value)
    1390      else:
    1391          leader = None
    1392      if not value:
    1393          raise errors.HeaderParseError(
    1394              "Expected 'atom' or 'quoted-string' but found nothing.")
    1395      if value[0]=='"':
    1396          token, value = get_quoted_string(value)
    1397      elif value[0] in SPECIALS:
    1398          raise errors.HeaderParseError("Expected 'atom' or 'quoted-string' "
    1399                                        "but found '{}'".format(value))
    1400      else:
    1401          token, value = get_atom(value)
    1402      if leader is not None:
    1403          token[:0] = [leader]
    1404      return token, value
    1405  
    1406  def get_phrase(value):
    1407      """ phrase = 1*word / obs-phrase
    1408          obs-phrase = word *(word / "." / CFWS)
    1409  
    1410      This means a phrase can be a sequence of words, periods, and CFWS in any
    1411      order as long as it starts with at least one word.  If anything other than
    1412      words is detected, an ObsoleteHeaderDefect is added to the token's defect
    1413      list.  We also accept a phrase that starts with CFWS followed by a dot;
    1414      this is registered as an InvalidHeaderDefect, since it is not supported by
    1415      even the obsolete grammar.
    1416  
    1417      """
    1418      phrase = Phrase()
    1419      try:
    1420          token, value = get_word(value)
    1421          phrase.append(token)
    1422      except errors.HeaderParseError:
    1423          phrase.defects.append(errors.InvalidHeaderDefect(
    1424              "phrase does not start with word"))
    1425      while value and value[0] not in PHRASE_ENDS:
    1426          if value[0]=='.':
    1427              phrase.append(DOT)
    1428              phrase.defects.append(errors.ObsoleteHeaderDefect(
    1429                  "period in 'phrase'"))
    1430              value = value[1:]
    1431          else:
    1432              try:
    1433                  token, value = get_word(value)
    1434              except errors.HeaderParseError:
    1435                  if value[0] in CFWS_LEADER:
    1436                      token, value = get_cfws(value)
    1437                      phrase.defects.append(errors.ObsoleteHeaderDefect(
    1438                          "comment found without atom"))
    1439                  else:
    1440                      raise
    1441              phrase.append(token)
    1442      return phrase, value
    1443  
    1444  def get_local_part(value):
    1445      """ local-part = dot-atom / quoted-string / obs-local-part
    1446  
    1447      """
    1448      local_part = LocalPart()
    1449      leader = None
    1450      if value[0] in CFWS_LEADER:
    1451          leader, value = get_cfws(value)
    1452      if not value:
    1453          raise errors.HeaderParseError(
    1454              "expected local-part but found '{}'".format(value))
    1455      try:
    1456          token, value = get_dot_atom(value)
    1457      except errors.HeaderParseError:
    1458          try:
    1459              token, value = get_word(value)
    1460          except errors.HeaderParseError:
    1461              if value[0] != '\\' and value[0] in PHRASE_ENDS:
    1462                  raise
    1463              token = TokenList()
    1464      if leader is not None:
    1465          token[:0] = [leader]
    1466      local_part.append(token)
    1467      if value and (value[0]=='\\' or value[0] not in PHRASE_ENDS):
    1468          obs_local_part, value = get_obs_local_part(str(local_part) + value)
    1469          if obs_local_part.token_type == 'invalid-obs-local-part':
    1470              local_part.defects.append(errors.InvalidHeaderDefect(
    1471                  "local-part is not dot-atom, quoted-string, or obs-local-part"))
    1472          else:
    1473              local_part.defects.append(errors.ObsoleteHeaderDefect(
    1474                  "local-part is not a dot-atom (contains CFWS)"))
    1475          local_part[0] = obs_local_part
    1476      try:
    1477          local_part.value.encode('ascii')
    1478      except UnicodeEncodeError:
    1479          local_part.defects.append(errors.NonASCIILocalPartDefect(
    1480                  "local-part contains non-ASCII characters)"))
    1481      return local_part, value
    1482  
    1483  def get_obs_local_part(value):
    1484      """ obs-local-part = word *("." word)
    1485      """
    1486      obs_local_part = ObsLocalPart()
    1487      last_non_ws_was_dot = False
    1488      while value and (value[0]=='\\' or value[0] not in PHRASE_ENDS):
    1489          if value[0] == '.':
    1490              if last_non_ws_was_dot:
    1491                  obs_local_part.defects.append(errors.InvalidHeaderDefect(
    1492                      "invalid repeated '.'"))
    1493              obs_local_part.append(DOT)
    1494              last_non_ws_was_dot = True
    1495              value = value[1:]
    1496              continue
    1497          elif value[0]=='\\':
    1498              obs_local_part.append(ValueTerminal(value[0],
    1499                                                  'misplaced-special'))
    1500              value = value[1:]
    1501              obs_local_part.defects.append(errors.InvalidHeaderDefect(
    1502                  "'\\' character outside of quoted-string/ccontent"))
    1503              last_non_ws_was_dot = False
    1504              continue
    1505          if obs_local_part and obs_local_part[-1].token_type != 'dot':
    1506              obs_local_part.defects.append(errors.InvalidHeaderDefect(
    1507                  "missing '.' between words"))
    1508          try:
    1509              token, value = get_word(value)
    1510              last_non_ws_was_dot = False
    1511          except errors.HeaderParseError:
    1512              if value[0] not in CFWS_LEADER:
    1513                  raise
    1514              token, value = get_cfws(value)
    1515          obs_local_part.append(token)
    1516      if (obs_local_part[0].token_type == 'dot' or
    1517              obs_local_part[0].token_type=='cfws' and
    1518              obs_local_part[1].token_type=='dot'):
    1519          obs_local_part.defects.append(errors.InvalidHeaderDefect(
    1520              "Invalid leading '.' in local part"))
    1521      if (obs_local_part[-1].token_type == 'dot' or
    1522              obs_local_part[-1].token_type=='cfws' and
    1523              obs_local_part[-2].token_type=='dot'):
    1524          obs_local_part.defects.append(errors.InvalidHeaderDefect(
    1525              "Invalid trailing '.' in local part"))
    1526      if obs_local_part.defects:
    1527          obs_local_part.token_type = 'invalid-obs-local-part'
    1528      return obs_local_part, value
    1529  
    1530  def get_dtext(value):
    1531      r""" dtext = <printable ascii except \ [ ]> / obs-dtext
    1532          obs-dtext = obs-NO-WS-CTL / quoted-pair
    1533  
    1534      We allow anything except the excluded characters, but if we find any
    1535      ASCII other than the RFC defined printable ASCII, a NonPrintableDefect is
    1536      added to the token's defects list.  Quoted pairs are converted to their
    1537      unquoted values, so what is returned is a ptext token, in this case a
    1538      ValueTerminal.  If there were quoted-printables, an ObsoleteHeaderDefect is
    1539      added to the returned token's defect list.
    1540  
    1541      """
    1542      ptext, value, had_qp = _get_ptext_to_endchars(value, '[]')
    1543      ptext = ValueTerminal(ptext, 'ptext')
    1544      if had_qp:
    1545          ptext.defects.append(errors.ObsoleteHeaderDefect(
    1546              "quoted printable found in domain-literal"))
    1547      _validate_xtext(ptext)
    1548      return ptext, value
    1549  
    1550  def _check_for_early_dl_end(value, domain_literal):
    1551      if value:
    1552          return False
    1553      domain_literal.append(errors.InvalidHeaderDefect(
    1554          "end of input inside domain-literal"))
    1555      domain_literal.append(ValueTerminal(']', 'domain-literal-end'))
    1556      return True
    1557  
    1558  def get_domain_literal(value):
    1559      """ domain-literal = [CFWS] "[" *([FWS] dtext) [FWS] "]" [CFWS]
    1560  
    1561      """
    1562      domain_literal = DomainLiteral()
    1563      if value[0] in CFWS_LEADER:
    1564          token, value = get_cfws(value)
    1565          domain_literal.append(token)
    1566      if not value:
    1567          raise errors.HeaderParseError("expected domain-literal")
    1568      if value[0] != '[':
    1569          raise errors.HeaderParseError("expected '[' at start of domain-literal "
    1570                  "but found '{}'".format(value))
    1571      value = value[1:]
    1572      if _check_for_early_dl_end(value, domain_literal):
    1573          return domain_literal, value
    1574      domain_literal.append(ValueTerminal('[', 'domain-literal-start'))
    1575      if value[0] in WSP:
    1576          token, value = get_fws(value)
    1577          domain_literal.append(token)
    1578      token, value = get_dtext(value)
    1579      domain_literal.append(token)
    1580      if _check_for_early_dl_end(value, domain_literal):
    1581          return domain_literal, value
    1582      if value[0] in WSP:
    1583          token, value = get_fws(value)
    1584          domain_literal.append(token)
    1585      if _check_for_early_dl_end(value, domain_literal):
    1586          return domain_literal, value
    1587      if value[0] != ']':
    1588          raise errors.HeaderParseError("expected ']' at end of domain-literal "
    1589                  "but found '{}'".format(value))
    1590      domain_literal.append(ValueTerminal(']', 'domain-literal-end'))
    1591      value = value[1:]
    1592      if value and value[0] in CFWS_LEADER:
    1593          token, value = get_cfws(value)
    1594          domain_literal.append(token)
    1595      return domain_literal, value
    1596  
    1597  def get_domain(value):
    1598      """ domain = dot-atom / domain-literal / obs-domain
    1599          obs-domain = atom *("." atom))
    1600  
    1601      """
    1602      domain = Domain()
    1603      leader = None
    1604      if value[0] in CFWS_LEADER:
    1605          leader, value = get_cfws(value)
    1606      if not value:
    1607          raise errors.HeaderParseError(
    1608              "expected domain but found '{}'".format(value))
    1609      if value[0] == '[':
    1610          token, value = get_domain_literal(value)
    1611          if leader is not None:
    1612              token[:0] = [leader]
    1613          domain.append(token)
    1614          return domain, value
    1615      try:
    1616          token, value = get_dot_atom(value)
    1617      except errors.HeaderParseError:
    1618          token, value = get_atom(value)
    1619      if value and value[0] == '@':
    1620          raise errors.HeaderParseError('Invalid Domain')
    1621      if leader is not None:
    1622          token[:0] = [leader]
    1623      domain.append(token)
    1624      if value and value[0] == '.':
    1625          domain.defects.append(errors.ObsoleteHeaderDefect(
    1626              "domain is not a dot-atom (contains CFWS)"))
    1627          if domain[0].token_type == 'dot-atom':
    1628              domain[:] = domain[0]
    1629          while value and value[0] == '.':
    1630              domain.append(DOT)
    1631              token, value = get_atom(value[1:])
    1632              domain.append(token)
    1633      return domain, value
    1634  
    1635  def get_addr_spec(value):
    1636      """ addr-spec = local-part "@" domain
    1637  
    1638      """
    1639      addr_spec = AddrSpec()
    1640      token, value = get_local_part(value)
    1641      addr_spec.append(token)
    1642      if not value or value[0] != '@':
    1643          addr_spec.defects.append(errors.InvalidHeaderDefect(
    1644              "addr-spec local part with no domain"))
    1645          return addr_spec, value
    1646      addr_spec.append(ValueTerminal('@', 'address-at-symbol'))
    1647      token, value = get_domain(value[1:])
    1648      addr_spec.append(token)
    1649      return addr_spec, value
    1650  
    1651  def get_obs_route(value):
    1652      """ obs-route = obs-domain-list ":"
    1653          obs-domain-list = *(CFWS / ",") "@" domain *("," [CFWS] ["@" domain])
    1654  
    1655          Returns an obs-route token with the appropriate sub-tokens (that is,
    1656          there is no obs-domain-list in the parse tree).
    1657      """
    1658      obs_route = ObsRoute()
    1659      while value and (value[0]==',' or value[0] in CFWS_LEADER):
    1660          if value[0] in CFWS_LEADER:
    1661              token, value = get_cfws(value)
    1662              obs_route.append(token)
    1663          elif value[0] == ',':
    1664              obs_route.append(ListSeparator)
    1665              value = value[1:]
    1666      if not value or value[0] != '@':
    1667          raise errors.HeaderParseError(
    1668              "expected obs-route domain but found '{}'".format(value))
    1669      obs_route.append(RouteComponentMarker)
    1670      token, value = get_domain(value[1:])
    1671      obs_route.append(token)
    1672      while value and value[0]==',':
    1673          obs_route.append(ListSeparator)
    1674          value = value[1:]
    1675          if not value:
    1676              break
    1677          if value[0] in CFWS_LEADER:
    1678              token, value = get_cfws(value)
    1679              obs_route.append(token)
    1680          if value[0] == '@':
    1681              obs_route.append(RouteComponentMarker)
    1682              token, value = get_domain(value[1:])
    1683              obs_route.append(token)
    1684      if not value:
    1685          raise errors.HeaderParseError("end of header while parsing obs-route")
    1686      if value[0] != ':':
    1687          raise errors.HeaderParseError( "expected ':' marking end of "
    1688              "obs-route but found '{}'".format(value))
    1689      obs_route.append(ValueTerminal(':', 'end-of-obs-route-marker'))
    1690      return obs_route, value[1:]
    1691  
    1692  def get_angle_addr(value):
    1693      """ angle-addr = [CFWS] "<" addr-spec ">" [CFWS] / obs-angle-addr
    1694          obs-angle-addr = [CFWS] "<" obs-route addr-spec ">" [CFWS]
    1695  
    1696      """
    1697      angle_addr = AngleAddr()
    1698      if value[0] in CFWS_LEADER:
    1699          token, value = get_cfws(value)
    1700          angle_addr.append(token)
    1701      if not value or value[0] != '<':
    1702          raise errors.HeaderParseError(
    1703              "expected angle-addr but found '{}'".format(value))
    1704      angle_addr.append(ValueTerminal('<', 'angle-addr-start'))
    1705      value = value[1:]
    1706      # Although it is not legal per RFC5322, SMTP uses '<>' in certain
    1707      # circumstances.
    1708      if value[0] == '>':
    1709          angle_addr.append(ValueTerminal('>', 'angle-addr-end'))
    1710          angle_addr.defects.append(errors.InvalidHeaderDefect(
    1711              "null addr-spec in angle-addr"))
    1712          value = value[1:]
    1713          return angle_addr, value
    1714      try:
    1715          token, value = get_addr_spec(value)
    1716      except errors.HeaderParseError:
    1717          try:
    1718              token, value = get_obs_route(value)
    1719              angle_addr.defects.append(errors.ObsoleteHeaderDefect(
    1720                  "obsolete route specification in angle-addr"))
    1721          except errors.HeaderParseError:
    1722              raise errors.HeaderParseError(
    1723                  "expected addr-spec or obs-route but found '{}'".format(value))
    1724          angle_addr.append(token)
    1725          token, value = get_addr_spec(value)
    1726      angle_addr.append(token)
    1727      if value and value[0] == '>':
    1728          value = value[1:]
    1729      else:
    1730          angle_addr.defects.append(errors.InvalidHeaderDefect(
    1731              "missing trailing '>' on angle-addr"))
    1732      angle_addr.append(ValueTerminal('>', 'angle-addr-end'))
    1733      if value and value[0] in CFWS_LEADER:
    1734          token, value = get_cfws(value)
    1735          angle_addr.append(token)
    1736      return angle_addr, value
    1737  
    1738  def get_display_name(value):
    1739      """ display-name = phrase
    1740  
    1741      Because this is simply a name-rule, we don't return a display-name
    1742      token containing a phrase, but rather a display-name token with
    1743      the content of the phrase.
    1744  
    1745      """
    1746      display_name = DisplayName()
    1747      token, value = get_phrase(value)
    1748      display_name.extend(token[:])
    1749      display_name.defects = token.defects[:]
    1750      return display_name, value
    1751  
    1752  
    1753  def get_name_addr(value):
    1754      """ name-addr = [display-name] angle-addr
    1755  
    1756      """
    1757      name_addr = NameAddr()
    1758      # Both the optional display name and the angle-addr can start with cfws.
    1759      leader = None
    1760      if value[0] in CFWS_LEADER:
    1761          leader, value = get_cfws(value)
    1762          if not value:
    1763              raise errors.HeaderParseError(
    1764                  "expected name-addr but found '{}'".format(leader))
    1765      if value[0] != '<':
    1766          if value[0] in PHRASE_ENDS:
    1767              raise errors.HeaderParseError(
    1768                  "expected name-addr but found '{}'".format(value))
    1769          token, value = get_display_name(value)
    1770          if not value:
    1771              raise errors.HeaderParseError(
    1772                  "expected name-addr but found '{}'".format(token))
    1773          if leader is not None:
    1774              token[0][:0] = [leader]
    1775              leader = None
    1776          name_addr.append(token)
    1777      token, value = get_angle_addr(value)
    1778      if leader is not None:
    1779          token[:0] = [leader]
    1780      name_addr.append(token)
    1781      return name_addr, value
    1782  
    1783  def get_mailbox(value):
    1784      """ mailbox = name-addr / addr-spec
    1785  
    1786      """
    1787      # The only way to figure out if we are dealing with a name-addr or an
    1788      # addr-spec is to try parsing each one.
    1789      mailbox = Mailbox()
    1790      try:
    1791          token, value = get_name_addr(value)
    1792      except errors.HeaderParseError:
    1793          try:
    1794              token, value = get_addr_spec(value)
    1795          except errors.HeaderParseError:
    1796              raise errors.HeaderParseError(
    1797                  "expected mailbox but found '{}'".format(value))
    1798      if any(isinstance(x, errors.InvalidHeaderDefect)
    1799                         for x in token.all_defects):
    1800          mailbox.token_type = 'invalid-mailbox'
    1801      mailbox.append(token)
    1802      return mailbox, value
    1803  
    1804  def get_invalid_mailbox(value, endchars):
    1805      """ Read everything up to one of the chars in endchars.
    1806  
    1807      This is outside the formal grammar.  The InvalidMailbox TokenList that is
    1808      returned acts like a Mailbox, but the data attributes are None.
    1809  
    1810      """
    1811      invalid_mailbox = InvalidMailbox()
    1812      while value and value[0] not in endchars:
    1813          if value[0] in PHRASE_ENDS:
    1814              invalid_mailbox.append(ValueTerminal(value[0],
    1815                                                   'misplaced-special'))
    1816              value = value[1:]
    1817          else:
    1818              token, value = get_phrase(value)
    1819              invalid_mailbox.append(token)
    1820      return invalid_mailbox, value
    1821  
    1822  def get_mailbox_list(value):
    1823      """ mailbox-list = (mailbox *("," mailbox)) / obs-mbox-list
    1824          obs-mbox-list = *([CFWS] ",") mailbox *("," [mailbox / CFWS])
    1825  
    1826      For this routine we go outside the formal grammar in order to improve error
    1827      handling.  We recognize the end of the mailbox list only at the end of the
    1828      value or at a ';' (the group terminator).  This is so that we can turn
    1829      invalid mailboxes into InvalidMailbox tokens and continue parsing any
    1830      remaining valid mailboxes.  We also allow all mailbox entries to be null,
    1831      and this condition is handled appropriately at a higher level.
    1832  
    1833      """
    1834      mailbox_list = MailboxList()
    1835      while value and value[0] != ';':
    1836          try:
    1837              token, value = get_mailbox(value)
    1838              mailbox_list.append(token)
    1839          except errors.HeaderParseError:
    1840              leader = None
    1841              if value[0] in CFWS_LEADER:
    1842                  leader, value = get_cfws(value)
    1843                  if not value or value[0] in ',;':
    1844                      mailbox_list.append(leader)
    1845                      mailbox_list.defects.append(errors.ObsoleteHeaderDefect(
    1846                          "empty element in mailbox-list"))
    1847                  else:
    1848                      token, value = get_invalid_mailbox(value, ',;')
    1849                      if leader is not None:
    1850                          token[:0] = [leader]
    1851                      mailbox_list.append(token)
    1852                      mailbox_list.defects.append(errors.InvalidHeaderDefect(
    1853                          "invalid mailbox in mailbox-list"))
    1854              elif value[0] == ',':
    1855                  mailbox_list.defects.append(errors.ObsoleteHeaderDefect(
    1856                      "empty element in mailbox-list"))
    1857              else:
    1858                  token, value = get_invalid_mailbox(value, ',;')
    1859                  if leader is not None:
    1860                      token[:0] = [leader]
    1861                  mailbox_list.append(token)
    1862                  mailbox_list.defects.append(errors.InvalidHeaderDefect(
    1863                      "invalid mailbox in mailbox-list"))
    1864          if value and value[0] not in ',;':
    1865              # Crap after mailbox; treat it as an invalid mailbox.
    1866              # The mailbox info will still be available.
    1867              mailbox = mailbox_list[-1]
    1868              mailbox.token_type = 'invalid-mailbox'
    1869              token, value = get_invalid_mailbox(value, ',;')
    1870              mailbox.extend(token)
    1871              mailbox_list.defects.append(errors.InvalidHeaderDefect(
    1872                  "invalid mailbox in mailbox-list"))
    1873          if value and value[0] == ',':
    1874              mailbox_list.append(ListSeparator)
    1875              value = value[1:]
    1876      return mailbox_list, value
    1877  
    1878  
    1879  def get_group_list(value):
    1880      """ group-list = mailbox-list / CFWS / obs-group-list
    1881          obs-group-list = 1*([CFWS] ",") [CFWS]
    1882  
    1883      """
    1884      group_list = GroupList()
    1885      if not value:
    1886          group_list.defects.append(errors.InvalidHeaderDefect(
    1887              "end of header before group-list"))
    1888          return group_list, value
    1889      leader = None
    1890      if value and value[0] in CFWS_LEADER:
    1891          leader, value = get_cfws(value)
    1892          if not value:
    1893              # This should never happen in email parsing, since CFWS-only is a
    1894              # legal alternative to group-list in a group, which is the only
    1895              # place group-list appears.
    1896              group_list.defects.append(errors.InvalidHeaderDefect(
    1897                  "end of header in group-list"))
    1898              group_list.append(leader)
    1899              return group_list, value
    1900          if value[0] == ';':
    1901              group_list.append(leader)
    1902              return group_list, value
    1903      token, value = get_mailbox_list(value)
    1904      if len(token.all_mailboxes)==0:
    1905          if leader is not None:
    1906              group_list.append(leader)
    1907          group_list.extend(token)
    1908          group_list.defects.append(errors.ObsoleteHeaderDefect(
    1909              "group-list with empty entries"))
    1910          return group_list, value
    1911      if leader is not None:
    1912          token[:0] = [leader]
    1913      group_list.append(token)
    1914      return group_list, value
    1915  
    1916  def get_group(value):
    1917      """ group = display-name ":" [group-list] ";" [CFWS]
    1918  
    1919      """
    1920      group = Group()
    1921      token, value = get_display_name(value)
    1922      if not value or value[0] != ':':
    1923          raise errors.HeaderParseError("expected ':' at end of group "
    1924              "display name but found '{}'".format(value))
    1925      group.append(token)
    1926      group.append(ValueTerminal(':', 'group-display-name-terminator'))
    1927      value = value[1:]
    1928      if value and value[0] == ';':
    1929          group.append(ValueTerminal(';', 'group-terminator'))
    1930          return group, value[1:]
    1931      token, value = get_group_list(value)
    1932      group.append(token)
    1933      if not value:
    1934          group.defects.append(errors.InvalidHeaderDefect(
    1935              "end of header in group"))
    1936      elif value[0] != ';':
    1937          raise errors.HeaderParseError(
    1938              "expected ';' at end of group but found {}".format(value))
    1939      group.append(ValueTerminal(';', 'group-terminator'))
    1940      value = value[1:]
    1941      if value and value[0] in CFWS_LEADER:
    1942          token, value = get_cfws(value)
    1943          group.append(token)
    1944      return group, value
    1945  
    1946  def get_address(value):
    1947      """ address = mailbox / group
    1948  
    1949      Note that counter-intuitively, an address can be either a single address or
    1950      a list of addresses (a group).  This is why the returned Address object has
    1951      a 'mailboxes' attribute which treats a single address as a list of length
    1952      one.  When you need to differentiate between to two cases, extract the single
    1953      element, which is either a mailbox or a group token.
    1954  
    1955      """
    1956      # The formal grammar isn't very helpful when parsing an address.  mailbox
    1957      # and group, especially when allowing for obsolete forms, start off very
    1958      # similarly.  It is only when you reach one of @, <, or : that you know
    1959      # what you've got.  So, we try each one in turn, starting with the more
    1960      # likely of the two.  We could perhaps make this more efficient by looking
    1961      # for a phrase and then branching based on the next character, but that
    1962      # would be a premature optimization.
    1963      address = Address()
    1964      try:
    1965          token, value = get_group(value)
    1966      except errors.HeaderParseError:
    1967          try:
    1968              token, value = get_mailbox(value)
    1969          except errors.HeaderParseError:
    1970              raise errors.HeaderParseError(
    1971                  "expected address but found '{}'".format(value))
    1972      address.append(token)
    1973      return address, value
    1974  
    1975  def get_address_list(value):
    1976      """ address_list = (address *("," address)) / obs-addr-list
    1977          obs-addr-list = *([CFWS] ",") address *("," [address / CFWS])
    1978  
    1979      We depart from the formal grammar here by continuing to parse until the end
    1980      of the input, assuming the input to be entirely composed of an
    1981      address-list.  This is always true in email parsing, and allows us
    1982      to skip invalid addresses to parse additional valid ones.
    1983  
    1984      """
    1985      address_list = AddressList()
    1986      while value:
    1987          try:
    1988              token, value = get_address(value)
    1989              address_list.append(token)
    1990          except errors.HeaderParseError:
    1991              leader = None
    1992              if value[0] in CFWS_LEADER:
    1993                  leader, value = get_cfws(value)
    1994                  if not value or value[0] == ',':
    1995                      address_list.append(leader)
    1996                      address_list.defects.append(errors.ObsoleteHeaderDefect(
    1997                          "address-list entry with no content"))
    1998                  else:
    1999                      token, value = get_invalid_mailbox(value, ',')
    2000                      if leader is not None:
    2001                          token[:0] = [leader]
    2002                      address_list.append(Address([token]))
    2003                      address_list.defects.append(errors.InvalidHeaderDefect(
    2004                          "invalid address in address-list"))
    2005              elif value[0] == ',':
    2006                  address_list.defects.append(errors.ObsoleteHeaderDefect(
    2007                      "empty element in address-list"))
    2008              else:
    2009                  token, value = get_invalid_mailbox(value, ',')
    2010                  if leader is not None:
    2011                      token[:0] = [leader]
    2012                  address_list.append(Address([token]))
    2013                  address_list.defects.append(errors.InvalidHeaderDefect(
    2014                      "invalid address in address-list"))
    2015          if value and value[0] != ',':
    2016              # Crap after address; treat it as an invalid mailbox.
    2017              # The mailbox info will still be available.
    2018              mailbox = address_list[-1][0]
    2019              mailbox.token_type = 'invalid-mailbox'
    2020              token, value = get_invalid_mailbox(value, ',')
    2021              mailbox.extend(token)
    2022              address_list.defects.append(errors.InvalidHeaderDefect(
    2023                  "invalid address in address-list"))
    2024          if value:  # Must be a , at this point.
    2025              address_list.append(ValueTerminal(',', 'list-separator'))
    2026              value = value[1:]
    2027      return address_list, value
    2028  
    2029  
    2030  def get_no_fold_literal(value):
    2031      """ no-fold-literal = "[" *dtext "]"
    2032      """
    2033      no_fold_literal = NoFoldLiteral()
    2034      if not value:
    2035          raise errors.HeaderParseError(
    2036              "expected no-fold-literal but found '{}'".format(value))
    2037      if value[0] != '[':
    2038          raise errors.HeaderParseError(
    2039              "expected '[' at the start of no-fold-literal "
    2040              "but found '{}'".format(value))
    2041      no_fold_literal.append(ValueTerminal('[', 'no-fold-literal-start'))
    2042      value = value[1:]
    2043      token, value = get_dtext(value)
    2044      no_fold_literal.append(token)
    2045      if not value or value[0] != ']':
    2046          raise errors.HeaderParseError(
    2047              "expected ']' at the end of no-fold-literal "
    2048              "but found '{}'".format(value))
    2049      no_fold_literal.append(ValueTerminal(']', 'no-fold-literal-end'))
    2050      return no_fold_literal, value[1:]
    2051  
    2052  def get_msg_id(value):
    2053      """msg-id = [CFWS] "<" id-left '@' id-right  ">" [CFWS]
    2054         id-left = dot-atom-text / obs-id-left
    2055         id-right = dot-atom-text / no-fold-literal / obs-id-right
    2056         no-fold-literal = "[" *dtext "]"
    2057      """
    2058      msg_id = MsgID()
    2059      if value and value[0] in CFWS_LEADER:
    2060          token, value = get_cfws(value)
    2061          msg_id.append(token)
    2062      if not value or value[0] != '<':
    2063          raise errors.HeaderParseError(
    2064              "expected msg-id but found '{}'".format(value))
    2065      msg_id.append(ValueTerminal('<', 'msg-id-start'))
    2066      value = value[1:]
    2067      # Parse id-left.
    2068      try:
    2069          token, value = get_dot_atom_text(value)
    2070      except errors.HeaderParseError:
    2071          try:
    2072              # obs-id-left is same as local-part of add-spec.
    2073              token, value = get_obs_local_part(value)
    2074              msg_id.defects.append(errors.ObsoleteHeaderDefect(
    2075                  "obsolete id-left in msg-id"))
    2076          except errors.HeaderParseError:
    2077              raise errors.HeaderParseError(
    2078                  "expected dot-atom-text or obs-id-left"
    2079                  " but found '{}'".format(value))
    2080      msg_id.append(token)
    2081      if not value or value[0] != '@':
    2082          msg_id.defects.append(errors.InvalidHeaderDefect(
    2083              "msg-id with no id-right"))
    2084          # Even though there is no id-right, if the local part
    2085          # ends with `>` let's just parse it too and return
    2086          # along with the defect.
    2087          if value and value[0] == '>':
    2088              msg_id.append(ValueTerminal('>', 'msg-id-end'))
    2089              value = value[1:]
    2090          return msg_id, value
    2091      msg_id.append(ValueTerminal('@', 'address-at-symbol'))
    2092      value = value[1:]
    2093      # Parse id-right.
    2094      try:
    2095          token, value = get_dot_atom_text(value)
    2096      except errors.HeaderParseError:
    2097          try:
    2098              token, value = get_no_fold_literal(value)
    2099          except errors.HeaderParseError:
    2100              try:
    2101                  token, value = get_domain(value)
    2102                  msg_id.defects.append(errors.ObsoleteHeaderDefect(
    2103                      "obsolete id-right in msg-id"))
    2104              except errors.HeaderParseError:
    2105                  raise errors.HeaderParseError(
    2106                      "expected dot-atom-text, no-fold-literal or obs-id-right"
    2107                      " but found '{}'".format(value))
    2108      msg_id.append(token)
    2109      if value and value[0] == '>':
    2110          value = value[1:]
    2111      else:
    2112          msg_id.defects.append(errors.InvalidHeaderDefect(
    2113              "missing trailing '>' on msg-id"))
    2114      msg_id.append(ValueTerminal('>', 'msg-id-end'))
    2115      if value and value[0] in CFWS_LEADER:
    2116          token, value = get_cfws(value)
    2117          msg_id.append(token)
    2118      return msg_id, value
    2119  
    2120  
    2121  def parse_message_id(value):
    2122      """message-id      =   "Message-ID:" msg-id CRLF
    2123      """
    2124      message_id = MessageID()
    2125      try:
    2126          token, value = get_msg_id(value)
    2127          message_id.append(token)
    2128      except errors.HeaderParseError as ex:
    2129          token = get_unstructured(value)
    2130          message_id = InvalidMessageID(token)
    2131          message_id.defects.append(
    2132              errors.InvalidHeaderDefect("Invalid msg-id: {!r}".format(ex)))
    2133      else:
    2134          # Value after parsing a valid msg_id should be None.
    2135          if value:
    2136              message_id.defects.append(errors.InvalidHeaderDefect(
    2137                  "Unexpected {!r}".format(value)))
    2138  
    2139      return message_id
    2140  
#
# XXX: As I begin to add additional header parsers, I'm realizing we probably
# have two levels of parser routines: the get_XXX methods that get a token in
# the grammar, and parse_XXX methods that parse an entire field value.  So
# get_address_list above should really be a parse_ method, as probably should
# get_unstructured.
#
    2148  
    2149  def parse_mime_version(value):
    2150      """ mime-version = [CFWS] 1*digit [CFWS] "." [CFWS] 1*digit [CFWS]
    2151  
    2152      """
    2153      # The [CFWS] is implicit in the RFC 2045 BNF.
    2154      # XXX: This routine is a bit verbose, should factor out a get_int method.
    2155      mime_version = MIMEVersion()
    2156      if not value:
    2157          mime_version.defects.append(errors.HeaderMissingRequiredValue(
    2158              "Missing MIME version number (eg: 1.0)"))
    2159          return mime_version
    2160      if value[0] in CFWS_LEADER:
    2161          token, value = get_cfws(value)
    2162          mime_version.append(token)
    2163          if not value:
    2164              mime_version.defects.append(errors.HeaderMissingRequiredValue(
    2165                  "Expected MIME version number but found only CFWS"))
    2166      digits = ''
    2167      while value and value[0] != '.' and value[0] not in CFWS_LEADER:
    2168          digits += value[0]
    2169          value = value[1:]
    2170      if not digits.isdigit():
    2171          mime_version.defects.append(errors.InvalidHeaderDefect(
    2172              "Expected MIME major version number but found {!r}".format(digits)))
    2173          mime_version.append(ValueTerminal(digits, 'xtext'))
    2174      else:
    2175          mime_version.major = int(digits)
    2176          mime_version.append(ValueTerminal(digits, 'digits'))
    2177      if value and value[0] in CFWS_LEADER:
    2178          token, value = get_cfws(value)
    2179          mime_version.append(token)
    2180      if not value or value[0] != '.':
    2181          if mime_version.major is not None:
    2182              mime_version.defects.append(errors.InvalidHeaderDefect(
    2183                  "Incomplete MIME version; found only major number"))
    2184          if value:
    2185              mime_version.append(ValueTerminal(value, 'xtext'))
    2186          return mime_version
    2187      mime_version.append(ValueTerminal('.', 'version-separator'))
    2188      value = value[1:]
    2189      if value and value[0] in CFWS_LEADER:
    2190          token, value = get_cfws(value)
    2191          mime_version.append(token)
    2192      if not value:
    2193          if mime_version.major is not None:
    2194              mime_version.defects.append(errors.InvalidHeaderDefect(
    2195                  "Incomplete MIME version; found only major number"))
    2196          return mime_version
    2197      digits = ''
    2198      while value and value[0] not in CFWS_LEADER:
    2199          digits += value[0]
    2200          value = value[1:]
    2201      if not digits.isdigit():
    2202          mime_version.defects.append(errors.InvalidHeaderDefect(
    2203              "Expected MIME minor version number but found {!r}".format(digits)))
    2204          mime_version.append(ValueTerminal(digits, 'xtext'))
    2205      else:
    2206          mime_version.minor = int(digits)
    2207          mime_version.append(ValueTerminal(digits, 'digits'))
    2208      if value and value[0] in CFWS_LEADER:
    2209          token, value = get_cfws(value)
    2210          mime_version.append(token)
    2211      if value:
    2212          mime_version.defects.append(errors.InvalidHeaderDefect(
    2213              "Excess non-CFWS text after MIME version"))
    2214          mime_version.append(ValueTerminal(value, 'xtext'))
    2215      return mime_version
    2216  
    2217  def get_invalid_parameter(value):
    2218      """ Read everything up to the next ';'.
    2219  
    2220      This is outside the formal grammar.  The InvalidParameter TokenList that is
    2221      returned acts like a Parameter, but the data attributes are None.
    2222  
    2223      """
    2224      invalid_parameter = InvalidParameter()
    2225      while value and value[0] != ';':
    2226          if value[0] in PHRASE_ENDS:
    2227              invalid_parameter.append(ValueTerminal(value[0],
    2228                                                     'misplaced-special'))
    2229              value = value[1:]
    2230          else:
    2231              token, value = get_phrase(value)
    2232              invalid_parameter.append(token)
    2233      return invalid_parameter, value
    2234  
    2235  def get_ttext(value):
    2236      """ttext = <matches _ttext_matcher>
    2237  
    2238      We allow any non-TOKEN_ENDS in ttext, but add defects to the token's
    2239      defects list if we find non-ttext characters.  We also register defects for
    2240      *any* non-printables even though the RFC doesn't exclude all of them,
    2241      because we follow the spirit of RFC 5322.
    2242  
    2243      """
    2244      m = _non_token_end_matcher(value)
    2245      if not m:
    2246          raise errors.HeaderParseError(
    2247              "expected ttext but found '{}'".format(value))
    2248      ttext = m.group()
    2249      value = value[len(ttext):]
    2250      ttext = ValueTerminal(ttext, 'ttext')
    2251      _validate_xtext(ttext)
    2252      return ttext, value
    2253  
    2254  def get_token(value):
    2255      """token = [CFWS] 1*ttext [CFWS]
    2256  
    2257      The RFC equivalent of ttext is any US-ASCII chars except space, ctls, or
    2258      tspecials.  We also exclude tabs even though the RFC doesn't.
    2259  
    2260      The RFC implies the CFWS but is not explicit about it in the BNF.
    2261  
    2262      """
    2263      mtoken = Token()
    2264      if value and value[0] in CFWS_LEADER:
    2265          token, value = get_cfws(value)
    2266          mtoken.append(token)
    2267      if value and value[0] in TOKEN_ENDS:
    2268          raise errors.HeaderParseError(
    2269              "expected token but found '{}'".format(value))
    2270      token, value = get_ttext(value)
    2271      mtoken.append(token)
    2272      if value and value[0] in CFWS_LEADER:
    2273          token, value = get_cfws(value)
    2274          mtoken.append(token)
    2275      return mtoken, value
    2276  
    2277  def get_attrtext(value):
    2278      """attrtext = 1*(any non-ATTRIBUTE_ENDS character)
    2279  
    2280      We allow any non-ATTRIBUTE_ENDS in attrtext, but add defects to the
    2281      token's defects list if we find non-attrtext characters.  We also register
    2282      defects for *any* non-printables even though the RFC doesn't exclude all of
    2283      them, because we follow the spirit of RFC 5322.
    2284  
    2285      """
    2286      m = _non_attribute_end_matcher(value)
    2287      if not m:
    2288          raise errors.HeaderParseError(
    2289              "expected attrtext but found {!r}".format(value))
    2290      attrtext = m.group()
    2291      value = value[len(attrtext):]
    2292      attrtext = ValueTerminal(attrtext, 'attrtext')
    2293      _validate_xtext(attrtext)
    2294      return attrtext, value
    2295  
    2296  def get_attribute(value):
    2297      """ [CFWS] 1*attrtext [CFWS]
    2298  
    2299      This version of the BNF makes the CFWS explicit, and as usual we use a
    2300      value terminal for the actual run of characters.  The RFC equivalent of
    2301      attrtext is the token characters, with the subtraction of '*', "'", and '%'.
    2302      We include tab in the excluded set just as we do for token.
    2303  
    2304      """
    2305      attribute = Attribute()
    2306      if value and value[0] in CFWS_LEADER:
    2307          token, value = get_cfws(value)
    2308          attribute.append(token)
    2309      if value and value[0] in ATTRIBUTE_ENDS:
    2310          raise errors.HeaderParseError(
    2311              "expected token but found '{}'".format(value))
    2312      token, value = get_attrtext(value)
    2313      attribute.append(token)
    2314      if value and value[0] in CFWS_LEADER:
    2315          token, value = get_cfws(value)
    2316          attribute.append(token)
    2317      return attribute, value
    2318  
    2319  def get_extended_attrtext(value):
    2320      """attrtext = 1*(any non-ATTRIBUTE_ENDS character plus '%')
    2321  
    2322      This is a special parsing routine so that we get a value that
    2323      includes % escapes as a single string (which we decode as a single
    2324      string later).
    2325  
    2326      """
    2327      m = _non_extended_attribute_end_matcher(value)
    2328      if not m:
    2329          raise errors.HeaderParseError(
    2330              "expected extended attrtext but found {!r}".format(value))
    2331      attrtext = m.group()
    2332      value = value[len(attrtext):]
    2333      attrtext = ValueTerminal(attrtext, 'extended-attrtext')
    2334      _validate_xtext(attrtext)
    2335      return attrtext, value
    2336  
    2337  def get_extended_attribute(value):
    2338      """ [CFWS] 1*extended_attrtext [CFWS]
    2339  
    2340      This is like the non-extended version except we allow % characters, so that
    2341      we can pick up an encoded value as a single string.
    2342  
    2343      """
    2344      # XXX: should we have an ExtendedAttribute TokenList?
    2345      attribute = Attribute()
    2346      if value and value[0] in CFWS_LEADER:
    2347          token, value = get_cfws(value)
    2348          attribute.append(token)
    2349      if value and value[0] in EXTENDED_ATTRIBUTE_ENDS:
    2350          raise errors.HeaderParseError(
    2351              "expected token but found '{}'".format(value))
    2352      token, value = get_extended_attrtext(value)
    2353      attribute.append(token)
    2354      if value and value[0] in CFWS_LEADER:
    2355          token, value = get_cfws(value)
    2356          attribute.append(token)
    2357      return attribute, value
    2358  
    2359  def get_section(value):
    2360      """ '*' digits
    2361  
    2362      The formal BNF is more complicated because leading 0s are not allowed.  We
    2363      check for that and add a defect.  We also assume no CFWS is allowed between
    2364      the '*' and the digits, though the RFC is not crystal clear on that.
    2365      The caller should already have dealt with leading CFWS.
    2366  
    2367      """
    2368      section = Section()
    2369      if not value or value[0] != '*':
    2370          raise errors.HeaderParseError("Expected section but found {}".format(
    2371                                          value))
    2372      section.append(ValueTerminal('*', 'section-marker'))
    2373      value = value[1:]
    2374      if not value or not value[0].isdigit():
    2375          raise errors.HeaderParseError("Expected section number but "
    2376                                        "found {}".format(value))
    2377      digits = ''
    2378      while value and value[0].isdigit():
    2379          digits += value[0]
    2380          value = value[1:]
    2381      if digits[0] == '0' and digits != '0':
    2382          section.defects.append(errors.InvalidHeaderDefect(
    2383                  "section number has an invalid leading 0"))
    2384      section.number = int(digits)
    2385      section.append(ValueTerminal(digits, 'digits'))
    2386      return section, value
    2387  
    2388  
    2389  def get_value(value):
    2390      """ quoted-string / attribute
    2391  
    2392      """
    2393      v = Value()
    2394      if not value:
    2395          raise errors.HeaderParseError("Expected value but found end of string")
    2396      leader = None
    2397      if value[0] in CFWS_LEADER:
    2398          leader, value = get_cfws(value)
    2399      if not value:
    2400          raise errors.HeaderParseError("Expected value but found "
    2401                                        "only {}".format(leader))
    2402      if value[0] == '"':
    2403          token, value = get_quoted_string(value)
    2404      else:
    2405          token, value = get_extended_attribute(value)
    2406      if leader is not None:
    2407          token[:0] = [leader]
    2408      v.append(token)
    2409      return v, value
    2410  
    2411  def get_parameter(value):
    2412      """ attribute [section] ["*"] [CFWS] "=" value
    2413  
    2414      The CFWS is implied by the RFC but not made explicit in the BNF.  This
    2415      simplified form of the BNF from the RFC is made to conform with the RFC BNF
    2416      through some extra checks.  We do it this way because it makes both error
    2417      recovery and working with the resulting parse tree easier.
    2418      """
    2419      # It is possible CFWS would also be implicitly allowed between the section
    2420      # and the 'extended-attribute' marker (the '*') , but we've never seen that
    2421      # in the wild and we will therefore ignore the possibility.
    2422      param = Parameter()
    2423      token, value = get_attribute(value)
    2424      param.append(token)
    2425      if not value or value[0] == ';':
    2426          param.defects.append(errors.InvalidHeaderDefect("Parameter contains "
    2427              "name ({}) but no value".format(token)))
    2428          return param, value
    2429      if value[0] == '*':
    2430          try:
    2431              token, value = get_section(value)
    2432              param.sectioned = True
    2433              param.append(token)
    2434          except errors.HeaderParseError:
    2435              pass
    2436          if not value:
    2437              raise errors.HeaderParseError("Incomplete parameter")
    2438          if value[0] == '*':
    2439              param.append(ValueTerminal('*', 'extended-parameter-marker'))
    2440              value = value[1:]
    2441              param.extended = True
    2442      if value[0] != '=':
    2443          raise errors.HeaderParseError("Parameter not followed by '='")
    2444      param.append(ValueTerminal('=', 'parameter-separator'))
    2445      value = value[1:]
    2446      if value and value[0] in CFWS_LEADER:
    2447          token, value = get_cfws(value)
    2448          param.append(token)
    2449      remainder = None
    2450      appendto = param
    2451      if param.extended and value and value[0] == '"':
    2452          # Now for some serious hackery to handle the common invalid case of
    2453          # double quotes around an extended value.  We also accept (with defect)
    2454          # a value marked as encoded that isn't really.
    2455          qstring, remainder = get_quoted_string(value)
    2456          inner_value = qstring.stripped_value
    2457          semi_valid = False
    2458          if param.section_number == 0:
    2459              if inner_value and inner_value[0] == "'":
    2460                  semi_valid = True
    2461              else:
    2462                  token, rest = get_attrtext(inner_value)
    2463                  if rest and rest[0] == "'":
    2464                      semi_valid = True
    2465          else:
    2466              try:
    2467                  token, rest = get_extended_attrtext(inner_value)
    2468              except:
    2469                  pass
    2470              else:
    2471                  if not rest:
    2472                      semi_valid = True
    2473          if semi_valid:
    2474              param.defects.append(errors.InvalidHeaderDefect(
    2475                  "Quoted string value for extended parameter is invalid"))
    2476              param.append(qstring)
    2477              for t in qstring:
    2478                  if t.token_type == 'bare-quoted-string':
    2479                      t[:] = []
    2480                      appendto = t
    2481                      break
    2482              value = inner_value
    2483          else:
    2484              remainder = None
    2485              param.defects.append(errors.InvalidHeaderDefect(
    2486                  "Parameter marked as extended but appears to have a "
    2487                  "quoted string value that is non-encoded"))
    2488      if value and value[0] == "'":
    2489          token = None
    2490      else:
    2491          token, value = get_value(value)
    2492      if not param.extended or param.section_number > 0:
    2493          if not value or value[0] != "'":
    2494              appendto.append(token)
    2495              if remainder is not None:
    2496                  assert not value, value
    2497                  value = remainder
    2498              return param, value
    2499          param.defects.append(errors.InvalidHeaderDefect(
    2500              "Apparent initial-extended-value but attribute "
    2501              "was not marked as extended or was not initial section"))
    2502      if not value:
    2503          # Assume the charset/lang is missing and the token is the value.
    2504          param.defects.append(errors.InvalidHeaderDefect(
    2505              "Missing required charset/lang delimiters"))
    2506          appendto.append(token)
    2507          if remainder is None:
    2508              return param, value
    2509      else:
    2510          if token is not None:
    2511              for t in token:
    2512                  if t.token_type == 'extended-attrtext':
    2513                      break
    2514              t.token_type == 'attrtext'
    2515              appendto.append(t)
    2516              param.charset = t.value
    2517          if value[0] != "'":
    2518              raise errors.HeaderParseError("Expected RFC2231 char/lang encoding "
    2519                                            "delimiter, but found {!r}".format(value))
    2520          appendto.append(ValueTerminal("'", 'RFC2231-delimiter'))
    2521          value = value[1:]
    2522          if value and value[0] != "'":
    2523              token, value = get_attrtext(value)
    2524              appendto.append(token)
    2525              param.lang = token.value
    2526              if not value or value[0] != "'":
    2527                  raise errors.HeaderParseError("Expected RFC2231 char/lang encoding "
    2528                                    "delimiter, but found {}".format(value))
    2529          appendto.append(ValueTerminal("'", 'RFC2231-delimiter'))
    2530          value = value[1:]
    2531      if remainder is not None:
    2532          # Treat the rest of value as bare quoted string content.
    2533          v = Value()
    2534          while value:
    2535              if value[0] in WSP:
    2536                  token, value = get_fws(value)
    2537              elif value[0] == '"':
    2538                  token = ValueTerminal('"', 'DQUOTE')
    2539                  value = value[1:]
    2540              else:
    2541                  token, value = get_qcontent(value)
    2542              v.append(token)
    2543          token = v
    2544      else:
    2545          token, value = get_value(value)
    2546      appendto.append(token)
    2547      if remainder is not None:
    2548          assert not value, value
    2549          value = remainder
    2550      return param, value
    2551  
    2552  def parse_mime_parameters(value):
    2553      """ parameter *( ";" parameter )
    2554  
    2555      That BNF is meant to indicate this routine should only be called after
    2556      finding and handling the leading ';'.  There is no corresponding rule in
    2557      the formal RFC grammar, but it is more convenient for us for the set of
    2558      parameters to be treated as its own TokenList.
    2559  
    2560      This is 'parse' routine because it consumes the remaining value, but it
    2561      would never be called to parse a full header.  Instead it is called to
    2562      parse everything after the non-parameter value of a specific MIME header.
    2563  
    2564      """
    2565      mime_parameters = MimeParameters()
    2566      while value:
    2567          try:
    2568              token, value = get_parameter(value)
    2569              mime_parameters.append(token)
    2570          except errors.HeaderParseError:
    2571              leader = None
    2572              if value[0] in CFWS_LEADER:
    2573                  leader, value = get_cfws(value)
    2574              if not value:
    2575                  mime_parameters.append(leader)
    2576                  return mime_parameters
    2577              if value[0] == ';':
    2578                  if leader is not None:
    2579                      mime_parameters.append(leader)
    2580                  mime_parameters.defects.append(errors.InvalidHeaderDefect(
    2581                      "parameter entry with no content"))
    2582              else:
    2583                  token, value = get_invalid_parameter(value)
    2584                  if leader:
    2585                      token[:0] = [leader]
    2586                  mime_parameters.append(token)
    2587                  mime_parameters.defects.append(errors.InvalidHeaderDefect(
    2588                      "invalid parameter {!r}".format(token)))
    2589          if value and value[0] != ';':
    2590              # Junk after the otherwise valid parameter.  Mark it as
    2591              # invalid, but it will have a value.
    2592              param = mime_parameters[-1]
    2593              param.token_type = 'invalid-parameter'
    2594              token, value = get_invalid_parameter(value)
    2595              param.extend(token)
    2596              mime_parameters.defects.append(errors.InvalidHeaderDefect(
    2597                  "parameter with invalid trailing text {!r}".format(token)))
    2598          if value:
    2599              # Must be a ';' at this point.
    2600              mime_parameters.append(ValueTerminal(';', 'parameter-separator'))
    2601              value = value[1:]
    2602      return mime_parameters
    2603  
    2604  def _find_mime_parameters(tokenlist, value):
    2605      """Do our best to find the parameters in an invalid MIME header
    2606  
    2607      """
    2608      while value and value[0] != ';':
    2609          if value[0] in PHRASE_ENDS:
    2610              tokenlist.append(ValueTerminal(value[0], 'misplaced-special'))
    2611              value = value[1:]
    2612          else:
    2613              token, value = get_phrase(value)
    2614              tokenlist.append(token)
    2615      if not value:
    2616          return
    2617      tokenlist.append(ValueTerminal(';', 'parameter-separator'))
    2618      tokenlist.append(parse_mime_parameters(value[1:]))
    2619  
    2620  def parse_content_type_header(value):
    2621      """ maintype "/" subtype *( ";" parameter )
    2622  
    2623      The maintype and substype are tokens.  Theoretically they could
    2624      be checked against the official IANA list + x-token, but we
    2625      don't do that.
    2626      """
    2627      ctype = ContentType()
    2628      if not value:
    2629          ctype.defects.append(errors.HeaderMissingRequiredValue(
    2630              "Missing content type specification"))
    2631          return ctype
    2632      try:
    2633          token, value = get_token(value)
    2634      except errors.HeaderParseError:
    2635          ctype.defects.append(errors.InvalidHeaderDefect(
    2636              "Expected content maintype but found {!r}".format(value)))
    2637          _find_mime_parameters(ctype, value)
    2638          return ctype
    2639      ctype.append(token)
    2640      # XXX: If we really want to follow the formal grammar we should make
    2641      # mantype and subtype specialized TokenLists here.  Probably not worth it.
    2642      if not value or value[0] != '/':
    2643          ctype.defects.append(errors.InvalidHeaderDefect(
    2644              "Invalid content type"))
    2645          if value:
    2646              _find_mime_parameters(ctype, value)
    2647          return ctype
    2648      ctype.maintype = token.value.strip().lower()
    2649      ctype.append(ValueTerminal('/', 'content-type-separator'))
    2650      value = value[1:]
    2651      try:
    2652          token, value = get_token(value)
    2653      except errors.HeaderParseError:
    2654          ctype.defects.append(errors.InvalidHeaderDefect(
    2655              "Expected content subtype but found {!r}".format(value)))
    2656          _find_mime_parameters(ctype, value)
    2657          return ctype
    2658      ctype.append(token)
    2659      ctype.subtype = token.value.strip().lower()
    2660      if not value:
    2661          return ctype
    2662      if value[0] != ';':
    2663          ctype.defects.append(errors.InvalidHeaderDefect(
    2664              "Only parameters are valid after content type, but "
    2665              "found {!r}".format(value)))
    2666          # The RFC requires that a syntactically invalid content-type be treated
    2667          # as text/plain.  Perhaps we should postel this, but we should probably
    2668          # only do that if we were checking the subtype value against IANA.
    2669          del ctype.maintype, ctype.subtype
    2670          _find_mime_parameters(ctype, value)
    2671          return ctype
    2672      ctype.append(ValueTerminal(';', 'parameter-separator'))
    2673      ctype.append(parse_mime_parameters(value[1:]))
    2674      return ctype
    2675  
    2676  def parse_content_disposition_header(value):
    2677      """ disposition-type *( ";" parameter )
    2678  
    2679      """
    2680      disp_header = ContentDisposition()
    2681      if not value:
    2682          disp_header.defects.append(errors.HeaderMissingRequiredValue(
    2683              "Missing content disposition"))
    2684          return disp_header
    2685      try:
    2686          token, value = get_token(value)
    2687      except errors.HeaderParseError:
    2688          disp_header.defects.append(errors.InvalidHeaderDefect(
    2689              "Expected content disposition but found {!r}".format(value)))
    2690          _find_mime_parameters(disp_header, value)
    2691          return disp_header
    2692      disp_header.append(token)
    2693      disp_header.content_disposition = token.value.strip().lower()
    2694      if not value:
    2695          return disp_header
    2696      if value[0] != ';':
    2697          disp_header.defects.append(errors.InvalidHeaderDefect(
    2698              "Only parameters are valid after content disposition, but "
    2699              "found {!r}".format(value)))
    2700          _find_mime_parameters(disp_header, value)
    2701          return disp_header
    2702      disp_header.append(ValueTerminal(';', 'parameter-separator'))
    2703      disp_header.append(parse_mime_parameters(value[1:]))
    2704      return disp_header
    2705  
    2706  def parse_content_transfer_encoding_header(value):
    2707      """ mechanism
    2708  
    2709      """
    2710      # We should probably validate the values, since the list is fixed.
    2711      cte_header = ContentTransferEncoding()
    2712      if not value:
    2713          cte_header.defects.append(errors.HeaderMissingRequiredValue(
    2714              "Missing content transfer encoding"))
    2715          return cte_header
    2716      try:
    2717          token, value = get_token(value)
    2718      except errors.HeaderParseError:
    2719          cte_header.defects.append(errors.InvalidHeaderDefect(
    2720              "Expected content transfer encoding but found {!r}".format(value)))
    2721      else:
    2722          cte_header.append(token)
    2723          cte_header.cte = token.value.strip().lower()
    2724      if not value:
    2725          return cte_header
    2726      while value:
    2727          cte_header.defects.append(errors.InvalidHeaderDefect(
    2728              "Extra text after content transfer encoding"))
    2729          if value[0] in PHRASE_ENDS:
    2730              cte_header.append(ValueTerminal(value[0], 'misplaced-special'))
    2731              value = value[1:]
    2732          else:
    2733              token, value = get_phrase(value)
    2734              cte_header.append(token)
    2735      return cte_header
    2736  
    2737  
    2738  #
    2739  # Header folding
    2740  #
    2741  # Header folding is complex, with lots of rules and corner cases.  The
    2742  # following code does its best to obey the rules and handle the corner
    2743  # cases, but you can be sure there are few bugs:)
    2744  #
    2745  # This folder generally canonicalizes as it goes, preferring the stringified
    2746  # version of each token.  The tokens contain information that supports the
    2747  # folder, including which tokens can be encoded in which ways.
    2748  #
    2749  # Folded text is accumulated in a simple list of strings ('lines'), each
    2750  # one of which should be less than policy.max_line_length ('maxlen').
    2751  #
    2752  
    2753  def _steal_trailing_WSP_if_exists(lines):
    2754      wsp = ''
    2755      if lines and lines[-1] and lines[-1][-1] in WSP:
    2756          wsp = lines[-1][-1]
    2757          lines[-1] = lines[-1][:-1]
    2758      return wsp
    2759  
    2760  def _refold_parse_tree(parse_tree, *, policy):
    2761      """Return string of contents of parse_tree folded according to RFC rules.
    2762  
    2763      """
    2764      # max_line_length 0/None means no limit, ie: infinitely long.
    2765      maxlen = policy.max_line_length or sys.maxsize
    2766      encoding = 'utf-8' if policy.utf8 else 'us-ascii'
    2767      lines = ['']
    2768      last_ew = None
    2769      wrap_as_ew_blocked = 0
    2770      want_encoding = False
    2771      end_ew_not_allowed = Terminal('', 'wrap_as_ew_blocked')
    2772      parts = list(parse_tree)
    2773      while parts:
    2774          part = parts.pop(0)
    2775          if part is end_ew_not_allowed:
    2776              wrap_as_ew_blocked -= 1
    2777              continue
    2778          tstr = str(part)
    2779          if part.token_type == 'ptext' and set(tstr) & SPECIALS:
    2780              # Encode if tstr contains special characters.
    2781              want_encoding = True
    2782          try:
    2783              tstr.encode(encoding)
    2784              charset = encoding
    2785          except UnicodeEncodeError:
    2786              if any(isinstance(x, errors.UndecodableBytesDefect)
    2787                     for x in part.all_defects):
    2788                  charset = 'unknown-8bit'
    2789              else:
    2790                  # If policy.utf8 is false this should really be taken from a
    2791                  # 'charset' property on the policy.
    2792                  charset = 'utf-8'
    2793              want_encoding = True
    2794          if part.token_type == 'mime-parameters':
    2795              # Mime parameter folding (using RFC2231) is extra special.
    2796              _fold_mime_parameters(part, lines, maxlen, encoding)
    2797              continue
    2798          if want_encoding and not wrap_as_ew_blocked:
    2799              if not part.as_ew_allowed:
    2800                  want_encoding = False
    2801                  last_ew = None
    2802                  if part.syntactic_break:
    2803                      encoded_part = part.fold(policy=policy)[:-len(policy.linesep)]
    2804                      if policy.linesep not in encoded_part:
    2805                          # It fits on a single line
    2806                          if len(encoded_part) > maxlen - len(lines[-1]):
    2807                              # But not on this one, so start a new one.
    2808                              newline = _steal_trailing_WSP_if_exists(lines)
    2809                              # XXX what if encoded_part has no leading FWS?
    2810                              lines.append(newline)
    2811                          lines[-1] += encoded_part
    2812                          continue
    2813                  # Either this is not a major syntactic break, so we don't
    2814                  # want it on a line by itself even if it fits, or it
    2815                  # doesn't fit on a line by itself.  Either way, fall through
    2816                  # to unpacking the subparts and wrapping them.
    2817              if not hasattr(part, 'encode'):
    2818                  # It's not a Terminal, do each piece individually.
    2819                  parts = list(part) + parts
    2820              else:
    2821                  # It's a terminal, wrap it as an encoded word, possibly
    2822                  # combining it with previously encoded words if allowed.
    2823                  last_ew = _fold_as_ew(tstr, lines, maxlen, last_ew,
    2824                                        part.ew_combine_allowed, charset)
    2825              want_encoding = False
    2826              continue
    2827          if len(tstr) <= maxlen - len(lines[-1]):
    2828              lines[-1] += tstr
    2829              continue
    2830          # This part is too long to fit.  The RFC wants us to break at
    2831          # "major syntactic breaks", so unless we don't consider this
    2832          # to be one, check if it will fit on the next line by itself.
    2833          if (part.syntactic_break and
    2834                  len(tstr) + 1 <= maxlen):
    2835              newline = _steal_trailing_WSP_if_exists(lines)
    2836              if newline or part.startswith_fws():
    2837                  lines.append(newline + tstr)
    2838                  last_ew = None
    2839                  continue
    2840          if not hasattr(part, 'encode'):
    2841              # It's not a terminal, try folding the subparts.
    2842              newparts = list(part)
    2843              if not part.as_ew_allowed:
    2844                  wrap_as_ew_blocked += 1
    2845                  newparts.append(end_ew_not_allowed)
    2846              parts = newparts + parts
    2847              continue
    2848          if part.as_ew_allowed and not wrap_as_ew_blocked:
    2849              # It doesn't need CTE encoding, but encode it anyway so we can
    2850              # wrap it.
    2851              parts.insert(0, part)
    2852              want_encoding = True
    2853              continue
    2854          # We can't figure out how to wrap, it, so give up.
    2855          newline = _steal_trailing_WSP_if_exists(lines)
    2856          if newline or part.startswith_fws():
    2857              lines.append(newline + tstr)
    2858          else:
    2859              # We can't fold it onto the next line either...
    2860              lines[-1] += tstr
    2861      return policy.linesep.join(lines) + policy.linesep
    2862  
    2863  def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset):
    2864      """Fold string to_encode into lines as encoded word, combining if allowed.
    2865      Return the new value for last_ew, or None if ew_combine_allowed is False.
    2866  
    2867      If there is already an encoded word in the last line of lines (indicated by
    2868      a non-None value for last_ew) and ew_combine_allowed is true, decode the
    2869      existing ew, combine it with to_encode, and re-encode.  Otherwise, encode
    2870      to_encode.  In either case, split to_encode as necessary so that the
    2871      encoded segments fit within maxlen.
    2872  
    2873      """
    2874      if last_ew is not None and ew_combine_allowed:
    2875          to_encode = str(
    2876              get_unstructured(lines[-1][last_ew:] + to_encode))
    2877          lines[-1] = lines[-1][:last_ew]
    2878      if to_encode[0] in WSP:
    2879          # We're joining this to non-encoded text, so don't encode
    2880          # the leading blank.
    2881          leading_wsp = to_encode[0]
    2882          to_encode = to_encode[1:]
    2883          if (len(lines[-1]) == maxlen):
    2884              lines.append(_steal_trailing_WSP_if_exists(lines))
    2885          lines[-1] += leading_wsp
    2886      trailing_wsp = ''
    2887      if to_encode[-1] in WSP:
    2888          # Likewise for the trailing space.
    2889          trailing_wsp = to_encode[-1]
    2890          to_encode = to_encode[:-1]
    2891      new_last_ew = len(lines[-1]) if last_ew is None else last_ew
    2892  
    2893      encode_as = 'utf-8' if charset == 'us-ascii' else charset
    2894  
    2895      # The RFC2047 chrome takes up 7 characters plus the length
    2896      # of the charset name.
    2897      chrome_len = len(encode_as) + 7
    2898  
    2899      if (chrome_len + 1) >= maxlen:
    2900          raise errors.HeaderParseError(
    2901              "max_line_length is too small to fit an encoded word")
    2902  
    2903      while to_encode:
    2904          remaining_space = maxlen - len(lines[-1])
    2905          text_space = remaining_space - chrome_len
    2906          if text_space <= 0:
    2907              lines.append(' ')
    2908              continue
    2909  
    2910          to_encode_word = to_encode[:text_space]
    2911          encoded_word = _ew.encode(to_encode_word, charset=encode_as)
    2912          excess = len(encoded_word) - remaining_space
    2913          while excess > 0:
    2914              # Since the chunk to encode is guaranteed to fit into less than 100 characters,
    2915              # shrinking it by one at a time shouldn't take long.
    2916              to_encode_word = to_encode_word[:-1]
    2917              encoded_word = _ew.encode(to_encode_word, charset=encode_as)
    2918              excess = len(encoded_word) - remaining_space
    2919          lines[-1] += encoded_word
    2920          to_encode = to_encode[len(to_encode_word):]
    2921  
    2922          if to_encode:
    2923              lines.append(' ')
    2924              new_last_ew = len(lines[-1])
    2925      lines[-1] += trailing_wsp
    2926      return new_last_ew if ew_combine_allowed else None
    2927  
    2928  def _fold_mime_parameters(part, lines, maxlen, encoding):
    2929      """Fold TokenList 'part' into the 'lines' list as mime parameters.
    2930  
    2931      Using the decoded list of parameters and values, format them according to
    2932      the RFC rules, including using RFC2231 encoding if the value cannot be
    2933      expressed in 'encoding' and/or the parameter+value is too long to fit
    2934      within 'maxlen'.
    2935  
    2936      """
    2937      # Special case for RFC2231 encoding: start from decoded values and use
    2938      # RFC2231 encoding iff needed.
    2939      #
    2940      # Note that the 1 and 2s being added to the length calculations are
    2941      # accounting for the possibly-needed spaces and semicolons we'll be adding.
    2942      #
    2943      for name, value in part.params:
    2944          # XXX What if this ';' puts us over maxlen the first time through the
    2945          # loop?  We should split the header value onto a newline in that case,
    2946          # but to do that we need to recognize the need earlier or reparse the
    2947          # header, so I'm going to ignore that bug for now.  It'll only put us
    2948          # one character over.
    2949          if not lines[-1].rstrip().endswith(';'):
    2950              lines[-1] += ';'
    2951          charset = encoding
    2952          error_handler = 'strict'
    2953          try:
    2954              value.encode(encoding)
    2955              encoding_required = False
    2956          except UnicodeEncodeError:
    2957              encoding_required = True
    2958              if utils._has_surrogates(value):
    2959                  charset = 'unknown-8bit'
    2960                  error_handler = 'surrogateescape'
    2961              else:
    2962                  charset = 'utf-8'
    2963          if encoding_required:
    2964              encoded_value = urllib.parse.quote(
    2965                  value, safe='', errors=error_handler)
    2966              tstr = "{}*={}''{}".format(name, charset, encoded_value)
    2967          else:
    2968              tstr = '{}={}'.format(name, quote_string(value))
    2969          if len(lines[-1]) + len(tstr) + 1 < maxlen:
    2970              lines[-1] = lines[-1] + ' ' + tstr
    2971              continue
    2972          elif len(tstr) + 2 <= maxlen:
    2973              lines.append(' ' + tstr)
    2974              continue
    2975          # We need multiple sections.  We are allowed to mix encoded and
    2976          # non-encoded sections, but we aren't going to.  We'll encode them all.
    2977          section = 0
    2978          extra_chrome = charset + "''"
    2979          while value:
    2980              chrome_len = len(name) + len(str(section)) + 3 + len(extra_chrome)
    2981              if maxlen <= chrome_len + 3:
    2982                  # We need room for the leading blank, the trailing semicolon,
    2983                  # and at least one character of the value.  If we don't
    2984                  # have that, we'd be stuck, so in that case fall back to
    2985                  # the RFC standard width.
    2986                  maxlen = 78
    2987              splitpoint = maxchars = maxlen - chrome_len - 2
    2988              while True:
    2989                  partial = value[:splitpoint]
    2990                  encoded_value = urllib.parse.quote(
    2991                      partial, safe='', errors=error_handler)
    2992                  if len(encoded_value) <= maxchars:
    2993                      break
    2994                  splitpoint -= 1
    2995              lines.append(" {}*{}*={}{}".format(
    2996                  name, section, extra_chrome, encoded_value))
    2997              extra_chrome = ''
    2998              section += 1
    2999              value = value[splitpoint:]
    3000              if value:
    3001                  lines[-1] += ';'