       1  # SPDX-FileCopyrightText: 2015 Eric Larson
       2  #
       3  # SPDX-License-Identifier: Apache-2.0
       4  
       5  """
       6  The httplib2 algorithms ported for use with requests.
       7  """
       8  import logging
       9  import re
      10  import calendar
      11  import time
      12  from email.utils import parsedate_tz
      13  
      14  from pip._vendor.requests.structures import CaseInsensitiveDict
      15  
      16  from .cache import DictCache, SeparateBodyBaseCache
      17  from .serialize import Serializer
      18  
      19  
      20  logger = logging.getLogger(__name__)
      21  
      22  URI = re.compile(r"^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?")
      23  
      24  PERMANENT_REDIRECT_STATUSES = (301, 308)
      25  
      26  
      27  def parse_uri(uri):
      28      """Parses a URI using the regex given in Appendix B of RFC 3986.
      29  
      30      (scheme, authority, path, query, fragment) = parse_uri(uri)
      31      """
      32      groups = URI.match(uri).groups()
      33      return (groups[1], groups[3], groups[4], groups[6], groups[8])
      34  
      35  
      36  class ESC[4;38;5;81mCacheController(ESC[4;38;5;149mobject):
      37      """An interface to see if request should cached or not."""
      38  
      39      def __init__(
      40          self, cache=None, cache_etags=True, serializer=None, status_codes=None
      41      ):
      42          self.cache = DictCache() if cache is None else cache
      43          self.cache_etags = cache_etags
      44          self.serializer = serializer or Serializer()
      45          self.cacheable_status_codes = status_codes or (200, 203, 300, 301, 308)
      46  
      47      @classmethod
      48      def _urlnorm(cls, uri):
      49          """Normalize the URL to create a safe key for the cache"""
      50          (scheme, authority, path, query, fragment) = parse_uri(uri)
      51          if not scheme or not authority:
      52              raise Exception("Only absolute URIs are allowed. uri = %s" % uri)
      53  
      54          scheme = scheme.lower()
      55          authority = authority.lower()
      56  
      57          if not path:
      58              path = "/"
      59  
      60          # Could do syntax based normalization of the URI before
      61          # computing the digest. See Section 6.2.2 of Std 66.
      62          request_uri = query and "?".join([path, query]) or path
      63          defrag_uri = scheme + "://" + authority + request_uri
      64  
      65          return defrag_uri
      66  
      67      @classmethod
      68      def cache_url(cls, uri):
      69          return cls._urlnorm(uri)
      70  
      71      def parse_cache_control(self, headers):
      72          known_directives = {
      73              # https://tools.ietf.org/html/rfc7234#section-5.2
      74              "max-age": (int, True),
      75              "max-stale": (int, False),
      76              "min-fresh": (int, True),
      77              "no-cache": (None, False),
      78              "no-store": (None, False),
      79              "no-transform": (None, False),
      80              "only-if-cached": (None, False),
      81              "must-revalidate": (None, False),
      82              "public": (None, False),
      83              "private": (None, False),
      84              "proxy-revalidate": (None, False),
      85              "s-maxage": (int, True),
      86          }
      87  
      88          cc_headers = headers.get("cache-control", headers.get("Cache-Control", ""))
      89  
      90          retval = {}
      91  
      92          for cc_directive in cc_headers.split(","):
      93              if not cc_directive.strip():
      94                  continue
      95  
      96              parts = cc_directive.split("=", 1)
      97              directive = parts[0].strip()
      98  
      99              try:
     100                  typ, required = known_directives[directive]
     101              except KeyError:
     102                  logger.debug("Ignoring unknown cache-control directive: %s", directive)
     103                  continue
     104  
     105              if not typ or not required:
     106                  retval[directive] = None
     107              if typ:
     108                  try:
     109                      retval[directive] = typ(parts[1].strip())
     110                  except IndexError:
     111                      if required:
     112                          logger.debug(
     113                              "Missing value for cache-control " "directive: %s",
     114                              directive,
     115                          )
     116                  except ValueError:
     117                      logger.debug(
     118                          "Invalid value for cache-control directive " "%s, must be %s",
     119                          directive,
     120                          typ.__name__,
     121                      )
     122  
     123          return retval
     124  
     125      def cached_request(self, request):
     126          """
     127          Return a cached response if it exists in the cache, otherwise
     128          return False.
     129          """
     130          cache_url = self.cache_url(request.url)
     131          logger.debug('Looking up "%s" in the cache', cache_url)
     132          cc = self.parse_cache_control(request.headers)
     133  
     134          # Bail out if the request insists on fresh data
     135          if "no-cache" in cc:
     136              logger.debug('Request header has "no-cache", cache bypassed')
     137              return False
     138  
     139          if "max-age" in cc and cc["max-age"] == 0:
     140              logger.debug('Request header has "max_age" as 0, cache bypassed')
     141              return False
     142  
     143          # Request allows serving from the cache, let's see if we find something
     144          cache_data = self.cache.get(cache_url)
     145          if cache_data is None:
     146              logger.debug("No cache entry available")
     147              return False
     148  
     149          if isinstance(self.cache, SeparateBodyBaseCache):
     150              body_file = self.cache.get_body(cache_url)
     151          else:
     152              body_file = None
     153  
     154          # Check whether it can be deserialized
     155          resp = self.serializer.loads(request, cache_data, body_file)
     156          if not resp:
     157              logger.warning("Cache entry deserialization failed, entry ignored")
     158              return False
     159  
     160          # If we have a cached permanent redirect, return it immediately. We
     161          # don't need to test our response for other headers b/c it is
     162          # intrinsically "cacheable" as it is Permanent.
     163          #
     164          # See:
     165          #   https://tools.ietf.org/html/rfc7231#section-6.4.2
     166          #
     167          # Client can try to refresh the value by repeating the request
     168          # with cache busting headers as usual (ie no-cache).
     169          if int(resp.status) in PERMANENT_REDIRECT_STATUSES:
     170              msg = (
     171                  "Returning cached permanent redirect response "
     172                  "(ignoring date and etag information)"
     173              )
     174              logger.debug(msg)
     175              return resp
     176  
     177          headers = CaseInsensitiveDict(resp.headers)
     178          if not headers or "date" not in headers:
     179              if "etag" not in headers:
     180                  # Without date or etag, the cached response can never be used
     181                  # and should be deleted.
     182                  logger.debug("Purging cached response: no date or etag")
     183                  self.cache.delete(cache_url)
     184              logger.debug("Ignoring cached response: no date")
     185              return False
     186  
     187          now = time.time()
     188          date = calendar.timegm(parsedate_tz(headers["date"]))
     189          current_age = max(0, now - date)
     190          logger.debug("Current age based on date: %i", current_age)
     191  
     192          # TODO: There is an assumption that the result will be a
     193          #       urllib3 response object. This may not be best since we
     194          #       could probably avoid instantiating or constructing the
     195          #       response until we know we need it.
     196          resp_cc = self.parse_cache_control(headers)
     197  
     198          # determine freshness
     199          freshness_lifetime = 0
     200  
     201          # Check the max-age pragma in the cache control header
     202          if "max-age" in resp_cc:
     203              freshness_lifetime = resp_cc["max-age"]
     204              logger.debug("Freshness lifetime from max-age: %i", freshness_lifetime)
     205  
     206          # If there isn't a max-age, check for an expires header
     207          elif "expires" in headers:
     208              expires = parsedate_tz(headers["expires"])
     209              if expires is not None:
     210                  expire_time = calendar.timegm(expires) - date
     211                  freshness_lifetime = max(0, expire_time)
     212                  logger.debug("Freshness lifetime from expires: %i", freshness_lifetime)
     213  
     214          # Determine if we are setting freshness limit in the
     215          # request. Note, this overrides what was in the response.
     216          if "max-age" in cc:
     217              freshness_lifetime = cc["max-age"]
     218              logger.debug(
     219                  "Freshness lifetime from request max-age: %i", freshness_lifetime
     220              )
     221  
     222          if "min-fresh" in cc:
     223              min_fresh = cc["min-fresh"]
     224              # adjust our current age by our min fresh
     225              current_age += min_fresh
     226              logger.debug("Adjusted current age from min-fresh: %i", current_age)
     227  
     228          # Return entry if it is fresh enough
     229          if freshness_lifetime > current_age:
     230              logger.debug('The response is "fresh", returning cached response')
     231              logger.debug("%i > %i", freshness_lifetime, current_age)
     232              return resp
     233  
     234          # we're not fresh. If we don't have an Etag, clear it out
     235          if "etag" not in headers:
     236              logger.debug('The cached response is "stale" with no etag, purging')
     237              self.cache.delete(cache_url)
     238  
     239          # return the original handler
     240          return False
     241  
     242      def conditional_headers(self, request):
     243          cache_url = self.cache_url(request.url)
     244          resp = self.serializer.loads(request, self.cache.get(cache_url))
     245          new_headers = {}
     246  
     247          if resp:
     248              headers = CaseInsensitiveDict(resp.headers)
     249  
     250              if "etag" in headers:
     251                  new_headers["If-None-Match"] = headers["ETag"]
     252  
     253              if "last-modified" in headers:
     254                  new_headers["If-Modified-Since"] = headers["Last-Modified"]
     255  
     256          return new_headers
     257  
     258      def _cache_set(self, cache_url, request, response, body=None, expires_time=None):
     259          """
     260          Store the data in the cache.
     261          """
     262          if isinstance(self.cache, SeparateBodyBaseCache):
     263              # We pass in the body separately; just put a placeholder empty
     264              # string in the metadata.
     265              self.cache.set(
     266                  cache_url,
     267                  self.serializer.dumps(request, response, b""),
     268                  expires=expires_time,
     269              )
     270              self.cache.set_body(cache_url, body)
     271          else:
     272              self.cache.set(
     273                  cache_url,
     274                  self.serializer.dumps(request, response, body),
     275                  expires=expires_time,
     276              )
     277  
     278      def cache_response(self, request, response, body=None, status_codes=None):
     279          """
     280          Algorithm for caching requests.
     281  
     282          This assumes a requests Response object.
     283          """
     284          # From httplib2: Don't cache 206's since we aren't going to
     285          #                handle byte range requests
     286          cacheable_status_codes = status_codes or self.cacheable_status_codes
     287          if response.status not in cacheable_status_codes:
     288              logger.debug(
     289                  "Status code %s not in %s", response.status, cacheable_status_codes
     290              )
     291              return
     292  
     293          response_headers = CaseInsensitiveDict(response.headers)
     294  
     295          if "date" in response_headers:
     296              date = calendar.timegm(parsedate_tz(response_headers["date"]))
     297          else:
     298              date = 0
     299  
     300          # If we've been given a body, our response has a Content-Length, that
     301          # Content-Length is valid then we can check to see if the body we've
     302          # been given matches the expected size, and if it doesn't we'll just
     303          # skip trying to cache it.
     304          if (
     305              body is not None
     306              and "content-length" in response_headers
     307              and response_headers["content-length"].isdigit()
     308              and int(response_headers["content-length"]) != len(body)
     309          ):
     310              return
     311  
     312          cc_req = self.parse_cache_control(request.headers)
     313          cc = self.parse_cache_control(response_headers)
     314  
     315          cache_url = self.cache_url(request.url)
     316          logger.debug('Updating cache with response from "%s"', cache_url)
     317  
     318          # Delete it from the cache if we happen to have it stored there
     319          no_store = False
     320          if "no-store" in cc:
     321              no_store = True
     322              logger.debug('Response header has "no-store"')
     323          if "no-store" in cc_req:
     324              no_store = True
     325              logger.debug('Request header has "no-store"')
     326          if no_store and self.cache.get(cache_url):
     327              logger.debug('Purging existing cache entry to honor "no-store"')
     328              self.cache.delete(cache_url)
     329          if no_store:
     330              return
     331  
     332          # https://tools.ietf.org/html/rfc7234#section-4.1:
     333          # A Vary header field-value of "*" always fails to match.
     334          # Storing such a response leads to a deserialization warning
     335          # during cache lookup and is not allowed to ever be served,
     336          # so storing it can be avoided.
     337          if "*" in response_headers.get("vary", ""):
     338              logger.debug('Response header has "Vary: *"')
     339              return
     340  
     341          # If we've been given an etag, then keep the response
     342          if self.cache_etags and "etag" in response_headers:
     343              expires_time = 0
     344              if response_headers.get("expires"):
     345                  expires = parsedate_tz(response_headers["expires"])
     346                  if expires is not None:
     347                      expires_time = calendar.timegm(expires) - date
     348  
     349              expires_time = max(expires_time, 14 * 86400)
     350  
     351              logger.debug("etag object cached for {0} seconds".format(expires_time))
     352              logger.debug("Caching due to etag")
     353              self._cache_set(cache_url, request, response, body, expires_time)
     354  
     355          # Add to the cache any permanent redirects. We do this before looking
     356          # that the Date headers.
     357          elif int(response.status) in PERMANENT_REDIRECT_STATUSES:
     358              logger.debug("Caching permanent redirect")
     359              self._cache_set(cache_url, request, response, b"")
     360  
     361          # Add to the cache if the response headers demand it. If there
     362          # is no date header then we can't do anything about expiring
     363          # the cache.
     364          elif "date" in response_headers:
     365              date = calendar.timegm(parsedate_tz(response_headers["date"]))
     366              # cache when there is a max-age > 0
     367              if "max-age" in cc and cc["max-age"] > 0:
     368                  logger.debug("Caching b/c date exists and max-age > 0")
     369                  expires_time = cc["max-age"]
     370                  self._cache_set(
     371                      cache_url,
     372                      request,
     373                      response,
     374                      body,
     375                      expires_time,
     376                  )
     377  
     378              # If the request can expire, it means we should cache it
     379              # in the meantime.
     380              elif "expires" in response_headers:
     381                  if response_headers["expires"]:
     382                      expires = parsedate_tz(response_headers["expires"])
     383                      if expires is not None:
     384                          expires_time = calendar.timegm(expires) - date
     385                      else:
     386                          expires_time = None
     387  
     388                      logger.debug(
     389                          "Caching b/c of expires header. expires in {0} seconds".format(
     390                              expires_time
     391                          )
     392                      )
     393                      self._cache_set(
     394                          cache_url,
     395                          request,
     396                          response,
     397                          body,
     398                          expires_time,
     399                      )
     400  
     401      def update_cached_response(self, request, response):
     402          """On a 304 we will get a new set of headers that we want to
     403          update our cached value with, assuming we have one.
     404  
     405          This should only ever be called when we've sent an ETag and
     406          gotten a 304 as the response.
     407          """
     408          cache_url = self.cache_url(request.url)
     409  
     410          cached_response = self.serializer.loads(request, self.cache.get(cache_url))
     411  
     412          if not cached_response:
     413              # we didn't have a cached response
     414              return response
     415  
     416          # Lets update our headers with the headers from the new request:
     417          # http://tools.ietf.org/html/draft-ietf-httpbis-p4-conditional-26#section-4.1
     418          #
     419          # The server isn't supposed to send headers that would make
     420          # the cached body invalid. But... just in case, we'll be sure
     421          # to strip out ones we know that might be problmatic due to
     422          # typical assumptions.
     423          excluded_headers = ["content-length"]
     424  
     425          cached_response.headers.update(
     426              dict(
     427                  (k, v)
     428                  for k, v in response.headers.items()
     429                  if k.lower() not in excluded_headers
     430              )
     431          )
     432  
     433          # we want a 200 b/c we have content via the cache
     434          cached_response.status = 200
     435  
     436          # update our cache
     437          self._cache_set(cache_url, request, cached_response)
     438  
     439          return cached_response