1  # Copyright (C) 2001-2010 Python Software Foundation
       2  # Author: Barry Warsaw
       3  # Contact: email-sig@python.org
       4  
       5  """Miscellaneous utilities."""
       6  
       7  __all__ = [
       8      'collapse_rfc2231_value',
       9      'decode_params',
      10      'decode_rfc2231',
      11      'encode_rfc2231',
      12      'formataddr',
      13      'formatdate',
      14      'format_datetime',
      15      'getaddresses',
      16      'make_msgid',
      17      'mktime_tz',
      18      'parseaddr',
      19      'parsedate',
      20      'parsedate_tz',
      21      'parsedate_to_datetime',
      22      'unquote',
      23      ]
      24  
      25  import os
      26  import re
      27  import time
      28  import random
      29  import socket
      30  import datetime
      31  import urllib.parse
      32  
      33  from email._parseaddr import quote
      34  from email._parseaddr import AddressList as _AddressList
      35  from email._parseaddr import mktime_tz
      36  
      37  from email._parseaddr import parsedate, parsedate_tz, _parsedate_tz
      38  
      39  # Intrapackage imports
      40  from email.charset import Charset
      41  
# String constants shared by the helpers in this module (for example,
# COMMASPACE joins field values in getaddresses() and TICK is the RFC 2231
# field separator used by decode_rfc2231()).
COMMASPACE = ', '
EMPTYSTRING = ''
UEMPTYSTRING = ''
CRLF = '\r\n'
TICK = "'"

# Matches any single character whose presence makes formataddr() wrap the
# display name in double quotes.
specialsre = re.compile(r'[][\\()<>@,:;".]')
# Matches the backslash and double-quote characters that formataddr()
# prefixes with a backslash.
escapesre = re.compile(r'[\\"]')
      50  
      51  def _has_surrogates(s):
      52      """Return True if s contains surrogate-escaped binary data."""
      53      # This check is based on the fact that unless there are surrogates, utf8
      54      # (Python's default encoding) can encode any string.  This is the fastest
      55      # way to check for surrogates, see issue 11454 for timings.
      56      try:
      57          s.encode()
      58          return False
      59      except UnicodeEncodeError:
      60          return True
      61  
      62  # How to deal with a string containing bytes before handing it to the
      63  # application through the 'normal' interface.
      64  def _sanitize(string):
      65      # Turn any escaped bytes into unicode 'unknown' char.  If the escaped
      66      # bytes happen to be utf-8 they will instead get decoded, even if they
      67      # were invalid in the charset the source was supposed to be in.  This
      68      # seems like it is not a bad thing; a defect was still registered.
      69      original_bytes = string.encode('utf-8', 'surrogateescape')
      70      return original_bytes.decode('utf-8', 'replace')
      71  
      72  
      73  
      74  # Helpers
      75  
      76  def formataddr(pair, charset='utf-8'):
      77      """The inverse of parseaddr(), this takes a 2-tuple of the form
      78      (realname, email_address) and returns the string value suitable
      79      for an RFC 2822 From, To or Cc header.
      80  
      81      If the first element of pair is false, then the second element is
      82      returned unmodified.
      83  
      84      The optional charset is the character set that is used to encode
      85      realname in case realname is not ASCII safe.  Can be an instance of str or
      86      a Charset-like object which has a header_encode method.  Default is
      87      'utf-8'.
      88      """
      89      name, address = pair
      90      # The address MUST (per RFC) be ascii, so raise a UnicodeError if it isn't.
      91      address.encode('ascii')
      92      if name:
      93          try:
      94              name.encode('ascii')
      95          except UnicodeEncodeError:
      96              if isinstance(charset, str):
      97                  charset = Charset(charset)
      98              encoded_name = charset.header_encode(name)
      99              return "%s <%s>" % (encoded_name, address)
     100          else:
     101              quotes = ''
     102              if specialsre.search(name):
     103                  quotes = '"'
     104              name = escapesre.sub(r'\\\g<0>', name)
     105              return '%s%s%s <%s>' % (quotes, name, quotes, address)
     106      return address
     107  
     108  
     109  
     110  def getaddresses(fieldvalues):
     111      """Return a list of (REALNAME, EMAIL) for each fieldvalue."""
     112      all = COMMASPACE.join(str(v) for v in fieldvalues)
     113      a = _AddressList(all)
     114      return a.addresslist
     115  
     116  
     117  def _format_timetuple_and_zone(timetuple, zone):
     118      return '%s, %02d %s %04d %02d:%02d:%02d %s' % (
     119          ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'][timetuple[6]],
     120          timetuple[2],
     121          ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
     122           'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'][timetuple[1] - 1],
     123          timetuple[0], timetuple[3], timetuple[4], timetuple[5],
     124          zone)
     125  
     126  def formatdate(timeval=None, localtime=False, usegmt=False):
     127      """Returns a date string as specified by RFC 2822, e.g.:
     128  
     129      Fri, 09 Nov 2001 01:08:47 -0000
     130  
     131      Optional timeval if given is a floating point time value as accepted by
     132      gmtime() and localtime(), otherwise the current time is used.
     133  
     134      Optional localtime is a flag that when True, interprets timeval, and
     135      returns a date relative to the local timezone instead of UTC, properly
     136      taking daylight savings time into account.
     137  
     138      Optional argument usegmt means that the timezone is written out as
     139      an ascii string, not numeric one (so "GMT" instead of "+0000"). This
     140      is needed for HTTP, and is only used when localtime==False.
     141      """
     142      # Note: we cannot use strftime() because that honors the locale and RFC
     143      # 2822 requires that day and month names be the English abbreviations.
     144      if timeval is None:
     145          timeval = time.time()
     146      dt = datetime.datetime.fromtimestamp(timeval, datetime.timezone.utc)
     147  
     148      if localtime:
     149          dt = dt.astimezone()
     150          usegmt = False
     151      elif not usegmt:
     152          dt = dt.replace(tzinfo=None)
     153      return format_datetime(dt, usegmt)
     154  
     155  def format_datetime(dt, usegmt=False):
     156      """Turn a datetime into a date string as specified in RFC 2822.
     157  
     158      If usegmt is True, dt must be an aware datetime with an offset of zero.  In
     159      this case 'GMT' will be rendered instead of the normal +0000 required by
     160      RFC2822.  This is to support HTTP headers involving date stamps.
     161      """
     162      now = dt.timetuple()
     163      if usegmt:
     164          if dt.tzinfo is None or dt.tzinfo != datetime.timezone.utc:
     165              raise ValueError("usegmt option requires a UTC datetime")
     166          zone = 'GMT'
     167      elif dt.tzinfo is None:
     168          zone = '-0000'
     169      else:
     170          zone = dt.strftime("%z")
     171      return _format_timetuple_and_zone(now, zone)
     172  
     173  
     174  def make_msgid(idstring=None, domain=None):
     175      """Returns a string suitable for RFC 2822 compliant Message-ID, e.g:
     176  
     177      <142480216486.20800.16526388040877946887@nightshade.la.mastaler.com>
     178  
     179      Optional idstring if given is a string used to strengthen the
     180      uniqueness of the message id.  Optional domain if given provides the
     181      portion of the message id after the '@'.  It defaults to the locally
     182      defined hostname.
     183      """
     184      timeval = int(time.time()*100)
     185      pid = os.getpid()
     186      randint = random.getrandbits(64)
     187      if idstring is None:
     188          idstring = ''
     189      else:
     190          idstring = '.' + idstring
     191      if domain is None:
     192          domain = socket.getfqdn()
     193      msgid = '<%d.%d.%d%s@%s>' % (timeval, pid, randint, idstring, domain)
     194      return msgid
     195  
     196  
     197  def parsedate_to_datetime(data):
     198      parsed_date_tz = _parsedate_tz(data)
     199      if parsed_date_tz is None:
     200          raise ValueError('Invalid date value or format "%s"' % str(data))
     201      *dtuple, tz = parsed_date_tz
     202      if tz is None:
     203          return datetime.datetime(*dtuple[:6])
     204      return datetime.datetime(*dtuple[:6],
     205              tzinfo=datetime.timezone(datetime.timedelta(seconds=tz)))
     206  
     207  
     208  def parseaddr(addr):
     209      """
     210      Parse addr into its constituent realname and email address parts.
     211  
     212      Return a tuple of realname and email address, unless the parse fails, in
     213      which case return a 2-tuple of ('', '').
     214      """
     215      addrs = _AddressList(addr).addresslist
     216      if not addrs:
     217          return '', ''
     218      return addrs[0]
     219  
     220  
     221  # rfc822.unquote() doesn't properly de-backslash-ify in Python pre-2.3.
     222  def unquote(str):
     223      """Remove quotes from a string."""
     224      if len(str) > 1:
     225          if str.startswith('"') and str.endswith('"'):
     226              return str[1:-1].replace('\\\\', '\\').replace('\\"', '"')
     227          if str.startswith('<') and str.endswith('>'):
     228              return str[1:-1]
     229      return str
     230  
     231  
     232  
     233  # RFC2231-related functions - parameter encoding and decoding
     234  def decode_rfc2231(s):
     235      """Decode string according to RFC 2231"""
     236      parts = s.split(TICK, 2)
     237      if len(parts) <= 2:
     238          return None, None, s
     239      return parts
     240  
     241  
     242  def encode_rfc2231(s, charset=None, language=None):
     243      """Encode string according to RFC 2231.
     244  
     245      If neither charset nor language is given, then s is returned as-is.  If
     246      charset is given but not language, the string is encoded using the empty
     247      string for language.
     248      """
     249      s = urllib.parse.quote(s, safe='', encoding=charset or 'ascii')
     250      if charset is None and language is None:
     251          return s
     252      if language is None:
     253          language = ''
     254      return "%s'%s'%s" % (charset, language, s)
     255  
     256  
# Matches RFC 2231 style parameter names: a word, a '*', then an optional
# decimal section number that may itself be followed by another '*'.  The
# 'name' group captures the word and the 'num' group captures the digits
# (None when no number is present).  re.ASCII restricts \w to ASCII.
rfc2231_continuation = re.compile(r'^(?P<name>\w+)\*((?P<num>[0-9]+)\*?)?$',
    re.ASCII)
     259  
     260  def decode_params(params):
     261      """Decode parameters list according to RFC 2231.
     262  
     263      params is a sequence of 2-tuples containing (param name, string value).
     264      """
     265      new_params = [params[0]]
     266      # Map parameter's name to a list of continuations.  The values are a
     267      # 3-tuple of the continuation number, the string value, and a flag
     268      # specifying whether a particular segment is %-encoded.
     269      rfc2231_params = {}
     270      for name, value in params[1:]:
     271          encoded = name.endswith('*')
     272          value = unquote(value)
     273          mo = rfc2231_continuation.match(name)
     274          if mo:
     275              name, num = mo.group('name', 'num')
     276              if num is not None:
     277                  num = int(num)
     278              rfc2231_params.setdefault(name, []).append((num, value, encoded))
     279          else:
     280              new_params.append((name, '"%s"' % quote(value)))
     281      if rfc2231_params:
     282          for name, continuations in rfc2231_params.items():
     283              value = []
     284              extended = False
     285              # Sort by number
     286              continuations.sort()
     287              # And now append all values in numerical order, converting
     288              # %-encodings for the encoded segments.  If any of the
     289              # continuation names ends in a *, then the entire string, after
     290              # decoding segments and concatenating, must have the charset and
     291              # language specifiers at the beginning of the string.
     292              for num, s, encoded in continuations:
     293                  if encoded:
     294                      # Decode as "latin-1", so the characters in s directly
     295                      # represent the percent-encoded octet values.
     296                      # collapse_rfc2231_value treats this as an octet sequence.
     297                      s = urllib.parse.unquote(s, encoding="latin-1")
     298                      extended = True
     299                  value.append(s)
     300              value = quote(EMPTYSTRING.join(value))
     301              if extended:
     302                  charset, language, value = decode_rfc2231(value)
     303                  new_params.append((name, (charset, language, '"%s"' % value)))
     304              else:
     305                  new_params.append((name, '"%s"' % value))
     306      return new_params
     307  
     308  def collapse_rfc2231_value(value, errors='replace',
     309                             fallback_charset='us-ascii'):
     310      if not isinstance(value, tuple) or len(value) != 3:
     311          return unquote(value)
     312      # While value comes to us as a unicode string, we need it to be a bytes
     313      # object.  We do not want bytes() normal utf-8 decoder, we want a straight
     314      # interpretation of the string as character bytes.
     315      charset, language, text = value
     316      if charset is None:
     317          # Issue 17369: if charset/lang is None, decode_rfc2231 couldn't parse
     318          # the value, so use the fallback_charset.
     319          charset = fallback_charset
     320      rawbytes = bytes(text, 'raw-unicode-escape')
     321      try:
     322          return str(rawbytes, charset, errors)
     323      except LookupError:
     324          # charset is not a known codec.
     325          return unquote(text)
     326  
     327  
     328  #
     329  # datetime doesn't provide a localtime function yet, so provide one.  Code
     330  # adapted from the patch in issue 9527.  This may not be perfect, but it is
     331  # better than not having it.
     332  #
     333  
     334  def localtime(dt=None, isdst=None):
     335      """Return local time as an aware datetime object.
     336  
     337      If called without arguments, return current time.  Otherwise *dt*
     338      argument should be a datetime instance, and it is converted to the
     339      local time zone according to the system time zone database.  If *dt* is
     340      naive (that is, dt.tzinfo is None), it is assumed to be in local time.
     341      The isdst parameter is ignored.
     342  
     343      """
     344      if isdst is not None:
     345          import warnings
     346          warnings._deprecated(
     347              "The 'isdst' parameter to 'localtime'",
     348              message='{name} is deprecated and slated for removal in Python {remove}',
     349              remove=(3, 14),
     350              )
     351      if dt is None:
     352          dt = datetime.datetime.now()
     353      return dt.astimezone()