1  # Copyright (C) 2001-2006 Python Software Foundation
       2  # Author: Ben Gertzfield
       3  # Contact: email-sig@python.org
       4  
       5  """Quoted-printable content transfer encoding per RFCs 2045-2047.
       6  
       7  This module handles the content transfer encoding method defined in RFC 2045
       8  to encode US ASCII-like 8-bit data called `quoted-printable'.  It is used to
       9  safely encode text that is in a character set similar to the 7-bit US ASCII
      10  character set, but that includes some 8-bit characters that are normally not
      11  allowed in email bodies or headers.
      12  
      13  Quoted-printable is very space-inefficient for encoding binary files; use the
      14  email.base64mime module for that instead.
      15  
      16  This module provides an interface to encode and decode both headers and bodies
      17  with quoted-printable encoding.
      18  
RFC 2047 defines a method for including character set information in an
`encoded-word' in a header.  This method is commonly used for 8-bit real names
in To:/From:/Cc: etc. fields, as well as Subject: lines.
      22  
      23  This module does not do the line wrapping or end-of-line character
      24  conversion necessary for proper internationalized headers; it only
      25  does dumb encoding and decoding.  To deal with the various line
      26  wrapping issues, use the email.header module.
      27  """
      28  
# Names exported by a star-import of this module.  The helper predicates
# header_check() and body_check() defined further down are not listed here.
__all__ = [
    'body_decode',
    'body_encode',
    'body_length',
    'decode',
    'decodestring',
    'header_decode',
    'header_encode',
    'header_length',
    'quote',
    'unquote',
    ]
      41  
      42  import re
      43  
      44  from string import ascii_letters, digits, hexdigits
      45  
      46  CRLF = '\r\n'
      47  NL = '\n'
      48  EMPTYSTRING = ''
      49  
      50  # Build a mapping of octets to the expansion of that octet.  Since we're only
      51  # going to have 256 of these things, this isn't terribly inefficient
      52  # space-wise.  Remember that headers and bodies have different sets of safe
      53  # characters.  Initialize both maps with the full expansion, and then override
      54  # the safe bytes with the more compact form.
      55  _QUOPRI_MAP = ['=%02X' % c for c in range(256)]
      56  _QUOPRI_HEADER_MAP = _QUOPRI_MAP[:]
      57  _QUOPRI_BODY_MAP = _QUOPRI_MAP[:]
      58  
      59  # Safe header bytes which need no encoding.
      60  for c in b'-!*+/' + ascii_letters.encode('ascii') + digits.encode('ascii'):
      61      _QUOPRI_HEADER_MAP[c] = chr(c)
      62  # Headers have one other special encoding; spaces become underscores.
      63  _QUOPRI_HEADER_MAP[ord(' ')] = '_'
      64  
      65  # Safe body bytes which need no encoding.
      66  for c in (b' !"#$%&\'()*+,-./0123456789:;<>'
      67            b'?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`'
      68            b'abcdefghijklmnopqrstuvwxyz{|}~\t'):
      69      _QUOPRI_BODY_MAP[c] = chr(c)
      70  
      71  
      72  
      73  # Helpers
      74  def header_check(octet):
      75      """Return True if the octet should be escaped with header quopri."""
      76      return chr(octet) != _QUOPRI_HEADER_MAP[octet]
      77  
      78  
      79  def body_check(octet):
      80      """Return True if the octet should be escaped with body quopri."""
      81      return chr(octet) != _QUOPRI_BODY_MAP[octet]
      82  
      83  
      84  def header_length(bytearray):
      85      """Return a header quoted-printable encoding length.
      86  
      87      Note that this does not include any RFC 2047 chrome added by
      88      `header_encode()`.
      89  
      90      :param bytearray: An array of bytes (a.k.a. octets).
      91      :return: The length in bytes of the byte array when it is encoded with
      92          quoted-printable for headers.
      93      """
      94      return sum(len(_QUOPRI_HEADER_MAP[octet]) for octet in bytearray)
      95  
      96  
      97  def body_length(bytearray):
      98      """Return a body quoted-printable encoding length.
      99  
     100      :param bytearray: An array of bytes (a.k.a. octets).
     101      :return: The length in bytes of the byte array when it is encoded with
     102          quoted-printable for bodies.
     103      """
     104      return sum(len(_QUOPRI_BODY_MAP[octet]) for octet in bytearray)
     105  
     106  
     107  def _max_append(L, s, maxlen, extra=''):
     108      if not isinstance(s, str):
     109          s = chr(s)
     110      if not L:
     111          L.append(s.lstrip())
     112      elif len(L[-1]) + len(s) <= maxlen:
     113          L[-1] += extra + s
     114      else:
     115          L.append(s.lstrip())
     116  
     117  
     118  def unquote(s):
     119      """Turn a string in the form =AB to the ASCII character with value 0xab"""
     120      return chr(int(s[1:3], 16))
     121  
     122  
     123  def quote(c):
     124      return _QUOPRI_MAP[ord(c)]
     125  
     126  
     127  def header_encode(header_bytes, charset='iso-8859-1'):
     128      """Encode a single header line with quoted-printable (like) encoding.
     129  
     130      Defined in RFC 2045, this `Q' encoding is similar to quoted-printable, but
     131      used specifically for email header fields to allow charsets with mostly 7
     132      bit characters (and some 8 bit) to remain more or less readable in non-RFC
     133      2045 aware mail clients.
     134  
     135      charset names the character set to use in the RFC 2046 header.  It
     136      defaults to iso-8859-1.
     137      """
     138      # Return empty headers as an empty string.
     139      if not header_bytes:
     140          return ''
     141      # Iterate over every byte, encoding if necessary.
     142      encoded = header_bytes.decode('latin1').translate(_QUOPRI_HEADER_MAP)
     143      # Now add the RFC chrome to each encoded chunk and glue the chunks
     144      # together.
     145      return '=?%s?q?%s?=' % (charset, encoded)
     146  
     147  
     148  _QUOPRI_BODY_ENCODE_MAP = _QUOPRI_BODY_MAP[:]
     149  for c in b'\r\n':
     150      _QUOPRI_BODY_ENCODE_MAP[c] = chr(c)
     151  del c
     152  
     153  def body_encode(body, maxlinelen=76, eol=NL):
     154      """Encode with quoted-printable, wrapping at maxlinelen characters.
     155  
     156      Each line of encoded text will end with eol, which defaults to "\\n".  Set
     157      this to "\\r\\n" if you will be using the result of this function directly
     158      in an email.
     159  
     160      Each line will be wrapped at, at most, maxlinelen characters before the
     161      eol string (maxlinelen defaults to 76 characters, the maximum value
     162      permitted by RFC 2045).  Long lines will have the 'soft line break'
     163      quoted-printable character "=" appended to them, so the decoded text will
     164      be identical to the original text.
     165  
     166      The minimum maxlinelen is 4 to have room for a quoted character ("=XX")
     167      followed by a soft line break.  Smaller values will generate a
     168      ValueError.
     169  
     170      """
     171  
     172      if maxlinelen < 4:
     173          raise ValueError("maxlinelen must be at least 4")
     174      if not body:
     175          return body
     176  
     177      # quote special characters
     178      body = body.translate(_QUOPRI_BODY_ENCODE_MAP)
     179  
     180      soft_break = '=' + eol
     181      # leave space for the '=' at the end of a line
     182      maxlinelen1 = maxlinelen - 1
     183  
     184      encoded_body = []
     185      append = encoded_body.append
     186  
     187      for line in body.splitlines():
     188          # break up the line into pieces no longer than maxlinelen - 1
     189          start = 0
     190          laststart = len(line) - 1 - maxlinelen
     191          while start <= laststart:
     192              stop = start + maxlinelen1
     193              # make sure we don't break up an escape sequence
     194              if line[stop - 2] == '=':
     195                  append(line[start:stop - 1])
     196                  start = stop - 2
     197              elif line[stop - 1] == '=':
     198                  append(line[start:stop])
     199                  start = stop - 1
     200              else:
     201                  append(line[start:stop] + '=')
     202                  start = stop
     203  
     204          # handle rest of line, special case if line ends in whitespace
     205          if line and line[-1] in ' \t':
     206              room = start - laststart
     207              if room >= 3:
     208                  # It's a whitespace character at end-of-line, and we have room
     209                  # for the three-character quoted encoding.
     210                  q = quote(line[-1])
     211              elif room == 2:
     212                  # There's room for the whitespace character and a soft break.
     213                  q = line[-1] + soft_break
     214              else:
     215                  # There's room only for a soft break.  The quoted whitespace
     216                  # will be the only content on the subsequent line.
     217                  q = soft_break + quote(line[-1])
     218              append(line[start:-1] + q)
     219          else:
     220              append(line[start:])
     221  
     222      # add back final newline if present
     223      if body[-1] in CRLF:
     224          append('')
     225  
     226      return eol.join(encoded_body)
     227  
     228  
     229  
     230  # BAW: I'm not sure if the intent was for the signature of this function to be
     231  # the same as base64MIME.decode() or not...
     232  def decode(encoded, eol=NL):
     233      """Decode a quoted-printable string.
     234  
     235      Lines are separated with eol, which defaults to \\n.
     236      """
     237      if not encoded:
     238          return encoded
     239      # BAW: see comment in encode() above.  Again, we're building up the
     240      # decoded string with string concatenation, which could be done much more
     241      # efficiently.
     242      decoded = ''
     243  
     244      for line in encoded.splitlines():
     245          line = line.rstrip()
     246          if not line:
     247              decoded += eol
     248              continue
     249  
     250          i = 0
     251          n = len(line)
     252          while i < n:
     253              c = line[i]
     254              if c != '=':
     255                  decoded += c
     256                  i += 1
     257              # Otherwise, c == "=".  Are we at the end of the line?  If so, add
     258              # a soft line break.
     259              elif i+1 == n:
     260                  i += 1
     261                  continue
     262              # Decode if in form =AB
     263              elif i+2 < n and line[i+1] in hexdigits and line[i+2] in hexdigits:
     264                  decoded += unquote(line[i:i+3])
     265                  i += 3
     266              # Otherwise, not in form =AB, pass literally
     267              else:
     268                  decoded += c
     269                  i += 1
     270  
     271              if i == n:
     272                  decoded += eol
     273      # Special case if original string did not end with eol
     274      if encoded[-1] not in '\r\n' and decoded.endswith(eol):
     275          decoded = decoded[:-1]
     276      return decoded
     277  
     278  
     279  # For convenience and backwards compatibility w/ standard base64 module
     280  body_decode = decode
     281  decodestring = decode
     282  
     283  
     284  
     285  def _unquote_match(match):
     286      """Turn a match in the form =AB to the ASCII character with value 0xab"""
     287      s = match.group(0)
     288      return unquote(s)
     289  
     290  
     291  # Header decoding is done a bit differently
     292  def header_decode(s):
     293      """Decode a string encoded with RFC 2045 MIME header `Q' encoding.
     294  
     295      This function does not parse a full MIME header value encoded with
     296      quoted-printable (like =?iso-8859-1?q?Hello_World?=) -- please use
     297      the high level email.header class for that functionality.
     298      """
     299      s = s.replace('_', ' ')
     300      return re.sub(r'=[a-fA-F0-9]{2}', _unquote_match, s, flags=re.ASCII)