1  # Copyright (C) 2001-2007 Python Software Foundation
       2  # Author: Ben Gertzfield, Barry Warsaw
       3  # Contact: email-sig@python.org
       4  
       5  __all__ = [
       6      'Charset',
       7      'add_alias',
       8      'add_charset',
       9      'add_codec',
      10      ]
      11  
      12  from functools import partial
      13  
      14  import email.base64mime
      15  import email.quoprimime
      16  
      17  from email import errors
      18  from email.encoders import encode_7or8bit
      19  
      20  
      21  # Flags for types of header encodings
      22  QP          = 1 # Quoted-Printable
      23  BASE64      = 2 # Base64
      24  SHORTEST    = 3 # the shorter of QP and base64, but only for headers
      25  
      26  # In "=?charset?q?hello_world?=", the =?, ?q?, and ?= add up to 7
      27  RFC2047_CHROME_LEN = 7
      28  
      29  DEFAULT_CHARSET = 'us-ascii'
      30  UNKNOWN8BIT = 'unknown-8bit'
      31  EMPTYSTRING = ''
      32  
      33  
      34  # Defaults
      35  CHARSETS = {
      36      # input        header enc  body enc output conv
      37      'iso-8859-1':  (QP,        QP,      None),
      38      'iso-8859-2':  (QP,        QP,      None),
      39      'iso-8859-3':  (QP,        QP,      None),
      40      'iso-8859-4':  (QP,        QP,      None),
      41      # iso-8859-5 is Cyrillic, and not especially used
      42      # iso-8859-6 is Arabic, also not particularly used
      43      # iso-8859-7 is Greek, QP will not make it readable
      44      # iso-8859-8 is Hebrew, QP will not make it readable
      45      'iso-8859-9':  (QP,        QP,      None),
      46      'iso-8859-10': (QP,        QP,      None),
      47      # iso-8859-11 is Thai, QP will not make it readable
      48      'iso-8859-13': (QP,        QP,      None),
      49      'iso-8859-14': (QP,        QP,      None),
      50      'iso-8859-15': (QP,        QP,      None),
      51      'iso-8859-16': (QP,        QP,      None),
      52      'windows-1252':(QP,        QP,      None),
      53      'viscii':      (QP,        QP,      None),
      54      'us-ascii':    (None,      None,    None),
      55      'big5':        (BASE64,    BASE64,  None),
      56      'gb2312':      (BASE64,    BASE64,  None),
      57      'euc-jp':      (BASE64,    None,    'iso-2022-jp'),
      58      'shift_jis':   (BASE64,    None,    'iso-2022-jp'),
      59      'iso-2022-jp': (BASE64,    None,    None),
      60      'koi8-r':      (BASE64,    BASE64,  None),
      61      'utf-8':       (SHORTEST,  BASE64, 'utf-8'),
      62      }
      63  
      64  # Aliases for other commonly-used names for character sets.  Map
      65  # them to the real ones used in email.
      66  ALIASES = {
      67      'latin_1': 'iso-8859-1',
      68      'latin-1': 'iso-8859-1',
      69      'latin_2': 'iso-8859-2',
      70      'latin-2': 'iso-8859-2',
      71      'latin_3': 'iso-8859-3',
      72      'latin-3': 'iso-8859-3',
      73      'latin_4': 'iso-8859-4',
      74      'latin-4': 'iso-8859-4',
      75      'latin_5': 'iso-8859-9',
      76      'latin-5': 'iso-8859-9',
      77      'latin_6': 'iso-8859-10',
      78      'latin-6': 'iso-8859-10',
      79      'latin_7': 'iso-8859-13',
      80      'latin-7': 'iso-8859-13',
      81      'latin_8': 'iso-8859-14',
      82      'latin-8': 'iso-8859-14',
      83      'latin_9': 'iso-8859-15',
      84      'latin-9': 'iso-8859-15',
      85      'latin_10':'iso-8859-16',
      86      'latin-10':'iso-8859-16',
      87      'cp949':   'ks_c_5601-1987',
      88      'euc_jp':  'euc-jp',
      89      'euc_kr':  'euc-kr',
      90      'ascii':   'us-ascii',
      91      }
      92  
      93  
# Map canonical charset names to the Python codec name used to convert text
# in that charset to and from Unicode.  Charsets with no entry here fall back
# to a codec with the same name as the charset (see Charset.__init__).
CODEC_MAP = {
    'gb2312':      'eucgb2312_cn',
    'big5':        'big5_tw',
    # Hack: We don't want *any* conversion for stuff marked us-ascii, as all
    # sorts of garbage might be sent to us in the guise of 7-bit us-ascii.
    # Let that stuff pass through without conversion to/from Unicode.
    'us-ascii':    None,
    }
     103  
     104  
     105  # Convenience functions for extending the above mappings
     106  def add_charset(charset, header_enc=None, body_enc=None, output_charset=None):
     107      """Add character set properties to the global registry.
     108  
     109      charset is the input character set, and must be the canonical name of a
     110      character set.
     111  
     112      Optional header_enc and body_enc is either charset.QP for
     113      quoted-printable, charset.BASE64 for base64 encoding, charset.SHORTEST for
     114      the shortest of qp or base64 encoding, or None for no encoding.  SHORTEST
     115      is only valid for header_enc.  It describes how message headers and
     116      message bodies in the input charset are to be encoded.  Default is no
     117      encoding.
     118  
     119      Optional output_charset is the character set that the output should be
     120      in.  Conversions will proceed from input charset, to Unicode, to the
     121      output charset when the method Charset.convert() is called.  The default
     122      is to output in the same character set as the input.
     123  
     124      Both input_charset and output_charset must have Unicode codec entries in
     125      the module's charset-to-codec mapping; use add_codec(charset, codecname)
     126      to add codecs the module does not know about.  See the codecs module's
     127      documentation for more information.
     128      """
     129      if body_enc == SHORTEST:
     130          raise ValueError('SHORTEST not allowed for body_enc')
     131      CHARSETS[charset] = (header_enc, body_enc, output_charset)
     132  
     133  
     134  def add_alias(alias, canonical):
     135      """Add a character set alias.
     136  
     137      alias is the alias name, e.g. latin-1
     138      canonical is the character set's canonical name, e.g. iso-8859-1
     139      """
     140      ALIASES[alias] = canonical
     141  
     142  
     143  def add_codec(charset, codecname):
     144      """Add a codec that map characters in the given charset to/from Unicode.
     145  
     146      charset is the canonical name of a character set.  codecname is the name
     147      of a Python codec, as appropriate for the second argument to the unicode()
     148      built-in, or to the encode() method of a Unicode string.
     149      """
     150      CODEC_MAP[charset] = codecname
     151  
     152  
     153  # Convenience function for encoding strings, taking into account
     154  # that they might be unknown-8bit (ie: have surrogate-escaped bytes)
     155  def _encode(string, codec):
     156      if codec == UNKNOWN8BIT:
     157          return string.encode('ascii', 'surrogateescape')
     158      else:
     159          return string.encode(codec)
     160  
     161  
     162  class ESC[4;38;5;81mCharset:
     163      """Map character sets to their email properties.
     164  
     165      This class provides information about the requirements imposed on email
     166      for a specific character set.  It also provides convenience routines for
     167      converting between character sets, given the availability of the
     168      applicable codecs.  Given a character set, it will do its best to provide
     169      information on how to use that character set in an email in an
     170      RFC-compliant way.
     171  
     172      Certain character sets must be encoded with quoted-printable or base64
     173      when used in email headers or bodies.  Certain character sets must be
     174      converted outright, and are not allowed in email.  Instances of this
     175      module expose the following information about a character set:
     176  
     177      input_charset: The initial character set specified.  Common aliases
     178                     are converted to their `official' email names (e.g. latin_1
     179                     is converted to iso-8859-1).  Defaults to 7-bit us-ascii.
     180  
     181      header_encoding: If the character set must be encoded before it can be
     182                       used in an email header, this attribute will be set to
     183                       charset.QP (for quoted-printable), charset.BASE64 (for
     184                       base64 encoding), or charset.SHORTEST for the shortest of
     185                       QP or BASE64 encoding.  Otherwise, it will be None.
     186  
     187      body_encoding: Same as header_encoding, but describes the encoding for the
     188                     mail message's body, which indeed may be different than the
     189                     header encoding.  charset.SHORTEST is not allowed for
     190                     body_encoding.
     191  
     192      output_charset: Some character sets must be converted before they can be
     193                      used in email headers or bodies.  If the input_charset is
     194                      one of them, this attribute will contain the name of the
     195                      charset output will be converted to.  Otherwise, it will
     196                      be None.
     197  
     198      input_codec: The name of the Python codec used to convert the
     199                   input_charset to Unicode.  If no conversion codec is
     200                   necessary, this attribute will be None.
     201  
     202      output_codec: The name of the Python codec used to convert Unicode
     203                    to the output_charset.  If no conversion codec is necessary,
     204                    this attribute will have the same value as the input_codec.
     205      """
     206      def __init__(self, input_charset=DEFAULT_CHARSET):
     207          # RFC 2046, $4.1.2 says charsets are not case sensitive.  We coerce to
     208          # unicode because its .lower() is locale insensitive.  If the argument
     209          # is already a unicode, we leave it at that, but ensure that the
     210          # charset is ASCII, as the standard (RFC XXX) requires.
     211          try:
     212              if isinstance(input_charset, str):
     213                  input_charset.encode('ascii')
     214              else:
     215                  input_charset = str(input_charset, 'ascii')
     216          except UnicodeError:
     217              raise errors.CharsetError(input_charset)
     218          input_charset = input_charset.lower()
     219          # Set the input charset after filtering through the aliases
     220          self.input_charset = ALIASES.get(input_charset, input_charset)
     221          # We can try to guess which encoding and conversion to use by the
     222          # charset_map dictionary.  Try that first, but let the user override
     223          # it.
     224          henc, benc, conv = CHARSETS.get(self.input_charset,
     225                                          (SHORTEST, BASE64, None))
     226          if not conv:
     227              conv = self.input_charset
     228          # Set the attributes, allowing the arguments to override the default.
     229          self.header_encoding = henc
     230          self.body_encoding = benc
     231          self.output_charset = ALIASES.get(conv, conv)
     232          # Now set the codecs.  If one isn't defined for input_charset,
     233          # guess and try a Unicode codec with the same name as input_codec.
     234          self.input_codec = CODEC_MAP.get(self.input_charset,
     235                                           self.input_charset)
     236          self.output_codec = CODEC_MAP.get(self.output_charset,
     237                                            self.output_charset)
     238  
     239      def __repr__(self):
     240          return self.input_charset.lower()
     241  
     242      def __eq__(self, other):
     243          return str(self) == str(other).lower()
     244  
     245      def get_body_encoding(self):
     246          """Return the content-transfer-encoding used for body encoding.
     247  
     248          This is either the string `quoted-printable' or `base64' depending on
     249          the encoding used, or it is a function in which case you should call
     250          the function with a single argument, the Message object being
     251          encoded.  The function should then set the Content-Transfer-Encoding
     252          header itself to whatever is appropriate.
     253  
     254          Returns "quoted-printable" if self.body_encoding is QP.
     255          Returns "base64" if self.body_encoding is BASE64.
     256          Returns conversion function otherwise.
     257          """
     258          assert self.body_encoding != SHORTEST
     259          if self.body_encoding == QP:
     260              return 'quoted-printable'
     261          elif self.body_encoding == BASE64:
     262              return 'base64'
     263          else:
     264              return encode_7or8bit
     265  
     266      def get_output_charset(self):
     267          """Return the output character set.
     268  
     269          This is self.output_charset if that is not None, otherwise it is
     270          self.input_charset.
     271          """
     272          return self.output_charset or self.input_charset
     273  
     274      def header_encode(self, string):
     275          """Header-encode a string by converting it first to bytes.
     276  
     277          The type of encoding (base64 or quoted-printable) will be based on
     278          this charset's `header_encoding`.
     279  
     280          :param string: A unicode string for the header.  It must be possible
     281              to encode this string to bytes using the character set's
     282              output codec.
     283          :return: The encoded string, with RFC 2047 chrome.
     284          """
     285          codec = self.output_codec or 'us-ascii'
     286          header_bytes = _encode(string, codec)
     287          # 7bit/8bit encodings return the string unchanged (modulo conversions)
     288          encoder_module = self._get_encoder(header_bytes)
     289          if encoder_module is None:
     290              return string
     291          return encoder_module.header_encode(header_bytes, codec)
     292  
     293      def header_encode_lines(self, string, maxlengths):
     294          """Header-encode a string by converting it first to bytes.
     295  
     296          This is similar to `header_encode()` except that the string is fit
     297          into maximum line lengths as given by the argument.
     298  
     299          :param string: A unicode string for the header.  It must be possible
     300              to encode this string to bytes using the character set's
     301              output codec.
     302          :param maxlengths: Maximum line length iterator.  Each element
     303              returned from this iterator will provide the next maximum line
     304              length.  This parameter is used as an argument to built-in next()
     305              and should never be exhausted.  The maximum line lengths should
     306              not count the RFC 2047 chrome.  These line lengths are only a
     307              hint; the splitter does the best it can.
     308          :return: Lines of encoded strings, each with RFC 2047 chrome.
     309          """
     310          # See which encoding we should use.
     311          codec = self.output_codec or 'us-ascii'
     312          header_bytes = _encode(string, codec)
     313          encoder_module = self._get_encoder(header_bytes)
     314          encoder = partial(encoder_module.header_encode, charset=codec)
     315          # Calculate the number of characters that the RFC 2047 chrome will
     316          # contribute to each line.
     317          charset = self.get_output_charset()
     318          extra = len(charset) + RFC2047_CHROME_LEN
     319          # Now comes the hard part.  We must encode bytes but we can't split on
     320          # bytes because some character sets are variable length and each
     321          # encoded word must stand on its own.  So the problem is you have to
     322          # encode to bytes to figure out this word's length, but you must split
     323          # on characters.  This causes two problems: first, we don't know how
     324          # many octets a specific substring of unicode characters will get
     325          # encoded to, and second, we don't know how many ASCII characters
     326          # those octets will get encoded to.  Unless we try it.  Which seems
     327          # inefficient.  In the interest of being correct rather than fast (and
     328          # in the hope that there will be few encoded headers in any such
     329          # message), brute force it. :(
     330          lines = []
     331          current_line = []
     332          maxlen = next(maxlengths) - extra
     333          for character in string:
     334              current_line.append(character)
     335              this_line = EMPTYSTRING.join(current_line)
     336              length = encoder_module.header_length(_encode(this_line, charset))
     337              if length > maxlen:
     338                  # This last character doesn't fit so pop it off.
     339                  current_line.pop()
     340                  # Does nothing fit on the first line?
     341                  if not lines and not current_line:
     342                      lines.append(None)
     343                  else:
     344                      joined_line = EMPTYSTRING.join(current_line)
     345                      header_bytes = _encode(joined_line, codec)
     346                      lines.append(encoder(header_bytes))
     347                  current_line = [character]
     348                  maxlen = next(maxlengths) - extra
     349          joined_line = EMPTYSTRING.join(current_line)
     350          header_bytes = _encode(joined_line, codec)
     351          lines.append(encoder(header_bytes))
     352          return lines
     353  
     354      def _get_encoder(self, header_bytes):
     355          if self.header_encoding == BASE64:
     356              return email.base64mime
     357          elif self.header_encoding == QP:
     358              return email.quoprimime
     359          elif self.header_encoding == SHORTEST:
     360              len64 = email.base64mime.header_length(header_bytes)
     361              lenqp = email.quoprimime.header_length(header_bytes)
     362              if len64 < lenqp:
     363                  return email.base64mime
     364              else:
     365                  return email.quoprimime
     366          else:
     367              return None
     368  
     369      def body_encode(self, string):
     370          """Body-encode a string by converting it first to bytes.
     371  
     372          The type of encoding (base64 or quoted-printable) will be based on
     373          self.body_encoding.  If body_encoding is None, we assume the
     374          output charset is a 7bit encoding, so re-encoding the decoded
     375          string using the ascii codec produces the correct string version
     376          of the content.
     377          """
     378          if not string:
     379              return string
     380          if self.body_encoding is BASE64:
     381              if isinstance(string, str):
     382                  string = string.encode(self.output_charset)
     383              return email.base64mime.body_encode(string)
     384          elif self.body_encoding is QP:
     385              # quopromime.body_encode takes a string, but operates on it as if
     386              # it were a list of byte codes.  For a (minimal) history on why
     387              # this is so, see changeset 0cf700464177.  To correctly encode a
     388              # character set, then, we must turn it into pseudo bytes via the
     389              # latin1 charset, which will encode any byte as a single code point
     390              # between 0 and 255, which is what body_encode is expecting.
     391              if isinstance(string, str):
     392                  string = string.encode(self.output_charset)
     393              string = string.decode('latin1')
     394              return email.quoprimime.body_encode(string)
     395          else:
     396              if isinstance(string, str):
     397                  string = string.encode(self.output_charset).decode('ascii')
     398              return string