1 r"""HTTP cookie handling for web clients.
2
3 This module has (now fairly distant) origins in Gisle Aas' Perl module
4 HTTP::Cookies, from the libwww-perl library.
5
6 Docstrings, comments and debug strings in this code refer to the
7 attributes of the HTTP cookie system as cookie-attributes, to distinguish
8 them clearly from Python attributes.
9
10 Class diagram (note that BSDDBCookieJar and the MSIE* classes are not
11 distributed with the Python standard library, but are available from
12 http://wwwsearch.sf.net/):
13
                        CookieJar____
                        /     \      \
            FileCookieJar      \      \
             /    |   \         \      \
 MozillaCookieJar | LWPCookieJar \      \
                  |               |      \
                  |   ---MSIEBase |       \
                  |  /      |     |        \
                  | /   MSIEDBCookieJar BSDDBCookieJar
                  |/
               MSIECookieJar
25
26 """
27
28 __all__ = ['Cookie', 'CookieJar', 'CookiePolicy', 'DefaultCookiePolicy',
29 'FileCookieJar', 'LWPCookieJar', 'LoadError', 'MozillaCookieJar']
30
31 import os
32 import copy
33 import datetime
34 import re
35 import time
36 import urllib.parse, urllib.request
37 import threading as _threading
38 import http.client # only for the default HTTP port
39 from calendar import timegm
40
debug = False   # set to True to enable debugging via the logging module
logger = None   # created lazily by _debug() the first time it is needed

def _debug(*args):
    """Forward *args* to the module logger's ``debug`` method.

    Nothing happens (and None is returned) while the module-level ``debug``
    flag is false.  The "http.cookiejar" logger is looked up on the first
    call made with debugging enabled and cached in the global ``logger``.
    """
    global logger
    if debug:
        if not logger:
            import logging
            logger = logging.getLogger("http.cookiejar")
        return logger.debug(*args)
    return None
52
# Cookie-attribute name for the HttpOnly flag.
# NOTE(review): not referenced in the visible part of this file -- presumably
# used by the cookie-file readers/writers further down; confirm.
HTTPONLY_ATTR = "HTTPOnly"
# NOTE(review): looks like the line prefix that marks HttpOnly entries in a
# Netscape-format cookie file -- confirm against the file loader.
HTTPONLY_PREFIX = "#HttpOnly_"
# Default HTTP port, as a string; request_port() returns it when the request
# host carries no explicit port.
DEFAULT_HTTP_PORT = str(http.client.HTTP_PORT)
# Pattern for the "magic" header line of a cookie file; the word "Netscape"
# is optional.
NETSCAPE_MAGIC_RGX = re.compile("#( Netscape)? HTTP Cookie File")
# Message text for the case where no filename is available.
MISSING_FILENAME_TEXT = ("a filename was not supplied (nor was the CookieJar "
                         "instance initialised with one)")
# Cookie-file header text; its first line is what NETSCAPE_MAGIC_RGX matches.
NETSCAPE_HEADER_TEXT = """\
# Netscape HTTP Cookie File
# http://curl.haxx.se/rfc/cookie_spec.html
# This is a generated file! Do not edit.

"""
65
def _warn_unhandled_exception():
    """Emit a warning carrying the traceback of the current exception.

    Meant to be called from the module's catch-all ``except`` clauses, which
    swallow input that is bad in unexpected ways; the warning makes such
    swallowed exceptions visible.
    """
    import io, warnings, traceback
    buffer = io.StringIO()
    traceback.print_exc(limit=None, file=buffer)
    # stacklevel=2 attributes the warning to the caller of this helper.
    warnings.warn("http.cookiejar bug!\n%s" % buffer.getvalue(), stacklevel=2)
75
76
77 # Date/time conversion
78 # -----------------------------------------------------------------------------
79
EPOCH_YEAR = 1970
def _timegm(tt):
    """Range-checked wrapper around calendar.timegm().

    *tt* is a time tuple whose first six items are year, month, day of
    month, hour, minute and second.  Returns the value of timegm(tt), or
    None if any of those fields is out of range (years before EPOCH_YEAR
    are rejected; hour 24 and seconds up to 61 are tolerated).
    """
    year, month, mday, hour, minute, sec = tt[:6]
    fields_ok = (
        year >= EPOCH_YEAR
        and 1 <= month <= 12
        and 1 <= mday <= 31
        and 0 <= hour <= 24
        and 0 <= minute <= 59
        and 0 <= sec <= 61
    )
    if not fields_ok:
        return None
    return timegm(tt)
88
# English day and month abbreviations used when formatting and parsing dates.
DAYS = "Mon Tue Wed Thu Fri Sat Sun".split()
MONTHS = "Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec".split()
# Lower-cased month names, for case-insensitive lookup of a month's index.
MONTHS_LOWER = [name.lower() for name in MONTHS]
93
94 def time2isoz(t=None):
95 """Return a string representing time in seconds since epoch, t.
96
97 If the function is called without an argument, it will use the current
98 time.
99
100 The format of the returned string is like "YYYY-MM-DD hh:mm:ssZ",
101 representing Universal Time (UTC, aka GMT). An example of this format is:
102
103 1994-11-24 08:49:37Z
104
105 """
106 if t is None:
107 dt = datetime.datetime.now(tz=datetime.UTC)
108 else:
109 dt = datetime.datetime.fromtimestamp(t, tz=datetime.UTC)
110 return "%04d-%02d-%02d %02d:%02d:%02dZ" % (
111 dt.year, dt.month, dt.day, dt.hour, dt.minute, dt.second)
112
def time2netscape(t=None):
    """Format *t* (seconds since the epoch) in the Netscape cookie style.

    The returned string looks like this:

    Wed, DD-Mon-YYYY HH:MM:SS GMT

    When *t* is omitted or None the current time is used.  Day and month
    names are taken from the module-level DAYS and MONTHS lists.
    """
    utc = datetime.UTC
    if t is not None:
        moment = datetime.datetime.fromtimestamp(t, tz=utc)
    else:
        moment = datetime.datetime.now(tz=utc)
    day_name = DAYS[moment.weekday()]
    month_name = MONTHS[moment.month - 1]
    return "%s, %02d-%s-%04d %02d:%02d:%02d GMT" % (
        day_name, moment.day, month_name,
        moment.year, moment.hour, moment.minute, moment.second)
131
132
# Zone names treated as zero offset from UTC.
UTC_ZONES = dict.fromkeys(("GMT", "UTC", "UT", "Z"))

TIMEZONE_RE = re.compile(r"^([-+])?(\d\d?):?(\d\d)?$", re.ASCII)
def offset_from_tz_string(tz):
    """Return the UTC offset, in seconds, described by the zone string *tz*.

    Names listed in UTC_ZONES give 0.  Numeric zones such as "+0100",
    "-08:30" or "5" are read as hours with optional minutes and an optional
    sign.  None is returned for anything else.
    """
    if tz in UTC_ZONES:
        return 0
    parsed = TIMEZONE_RE.search(tz)
    if parsed is None:
        return None
    sign, hours, minutes = parsed.groups()
    seconds = 3600 * int(hours)
    if minutes:
        seconds += 60 * int(minutes)
    return -seconds if sign == '-' else seconds
149
def _str2time(day, mon, yr, hr, min, sec, tz):
    """Convert already-split date fields to seconds since the epoch.

    The fields are the strings captured by the date regexps (*hr*, *min*,
    *sec* and *tz* may be None).  *mon* may be an English month abbreviation
    or a number.  Returns None when the year exceeds datetime.MAXYEAR, the
    month is not recognised, the fields fail _timegm()'s range checks, or
    the timezone string is unknown to offset_from_tz_string().
    """
    yr = int(yr)
    if yr > datetime.MAXYEAR:
        return None

    # Month: try the abbreviated name first, then a plain number in 1..12.
    month_key = mon.lower()
    if month_key in MONTHS_LOWER:
        mon = MONTHS_LOWER.index(month_key) + 1
    else:
        try:
            mon = int(mon)
        except ValueError:
            return None
        if not 1 <= mon <= 12:
            return None

    # Missing clock elements default to zero.
    day = int(day)
    hr = 0 if hr is None else int(hr)
    min = 0 if min is None else int(min)
    sec = 0 if sec is None else int(sec)

    if yr < 1000:
        # Short year: place it in the current century, then shift by one
        # century if that lands more than 50 years from the current year.
        current_year = time.localtime(time.time())[0]
        years_into_century = current_year % 100
        gap = years_into_century - yr
        yr += current_year - years_into_century
        if gap > 50:
            yr += 100
        elif gap < -50:
            yr -= 100

    # Seconds since the epoch as if the fields were UTC (not yet adjusted
    # for the timezone).
    t = _timegm((yr, mon, day, hr, min, sec, tz))
    if t is None:
        return None

    # Apply the timezone offset to obtain absolute time; no zone means UTC.
    zone = "UTC" if tz is None else tz
    offset = offset_from_tz_string(zone.upper())
    if offset is None:
        return None
    return t - offset
205
STRICT_DATE_RE = re.compile(
    r"^[SMTWF][a-z][a-z], (\d\d) ([JFMASOND][a-z][a-z]) "
    r"(\d\d\d\d) (\d\d):(\d\d):(\d\d) GMT$", re.ASCII)
WEEKDAY_RE = re.compile(
    r"^(?:Sun|Mon|Tue|Wed|Thu|Fri|Sat)[a-z]*,?\s*", re.I | re.ASCII)
LOOSE_HTTP_DATE_RE = re.compile(
    r"""^
    (\d\d?)            # day
       (?:\s+|[-\/])
    (\w+)              # month
        (?:\s+|[-\/])
    (\d+)              # year
    (?:
          (?:\s+|:)    # separator before clock
       (\d\d?):(\d\d)  # hour:min
       (?::(\d\d))?    # optional seconds
    )?                 # optional clock
       \s*
    (?:
       ([-+]?\d{2,4}|(?![APap][Mm]\b)[A-Za-z]+) # timezone
       \s*
    )?
    (?:
       \(\w+\)         # ASCII representation of timezone in parens.
       \s*
    )?$""", re.X | re.ASCII)
def http2time(text):
    """Returns time in seconds since epoch of time represented by a string.

    Return value is an integer.

    None is returned if the format of str is unrecognized, the time is outside
    the representable range, or the timezone string is not recognized.  If the
    string contains no timezone, UTC is assumed.

    The timezone in the string may be numerical (like "-0800" or "+0100") or a
    string timezone (like "UTC", "GMT", "BST" or "EST").  Currently, only the
    timezone strings equivalent to UTC (zero offset) are known to the function.

    The function loosely parses the following formats:

    Wed, 09 Feb 1994 22:23:32 GMT       -- HTTP format
    Tuesday, 08-Feb-94 14:15:29 GMT     -- old rfc850 HTTP format
    Tuesday, 08-Feb-1994 14:15:29 GMT   -- broken rfc850 HTTP format
    09 Feb 1994 22:23:32 GMT            -- HTTP format (no weekday)
    08-Feb-94 14:15:29 GMT              -- rfc850 format (no weekday)
    08-Feb-1994 14:15:29 GMT            -- broken rfc850 format (no weekday)

    The parser ignores leading and trailing whitespace.  The time may be
    absent.

    If the year is given with only 2 digits, the function will select the
    century that makes the year closest to the current date.

    """
    # Fast exit for a strictly conforming string.
    strict = STRICT_DATE_RE.search(text)
    if strict:
        day, month_name, year, hour, minute, second = strict.groups()
        try:
            month = MONTHS_LOWER.index(month_name.lower()) + 1
        except ValueError:
            # The strict pattern only checks the *shape* of the month
            # ("Foo" matches); an unknown name is an unrecognized date, not
            # an error to propagate to the caller.
            return None
        # All fields are plain digit runs, so convert with int() -- this
        # keeps the promised integer return value on the fast path too.
        return _timegm((int(year), month, int(day),
                        int(hour), int(minute), int(second)))

    # No, we need some messy parsing...

    # Clean up: drop leading whitespace and a leading (useless) weekday.
    text = WEEKDAY_RE.sub("", text.lstrip(), count=1)

    # Loose regexp parse; tz is the time zone specifier string.
    loose = LOOSE_HTTP_DATE_RE.search(text)
    if loose is None:
        return None  # bad format
    day, mon, yr, hr, minute, sec, tz = loose.groups()

    return _str2time(day, mon, yr, hr, minute, sec, tz)
287
ISO_DATE_RE = re.compile(
    r"""^
    (\d{4})              # year
       [-\/]?
    (\d\d?)              # numerical month
       [-\/]?
    (\d\d?)              # day
   (?:
         (?:\s+|[-:Tt])  # separator before clock
      (\d\d?):?(\d\d)    # hour:min
      (?::?(\d\d(?:\.\d*)?))?  # optional seconds (and fractional)
   )?                    # optional clock
      \s*
   (?:
      ([-+]?\d\d?:?(:?\d\d)?
       |Z|z)             # timezone  (Z is "zero meridian", i.e. GMT)
      \s*
   )?$""", re.X | re.ASCII)
def iso2time(text):
    """
    As for http2time, but parses the ISO 8601 formats:

    1994-02-03 14:15:29 -0100    -- ISO 8601 format
    1994-02-03 14:15:29          -- zone is optional
    1994-02-03                   -- only date
    1994-02-03T14:15:29          -- Use T as separator
    19940203T141529Z             -- ISO 8601 compact format
    19940203                     -- only date

    Fractional seconds (e.g. "14:15:29.5") are accepted and truncated to
    whole seconds.

    """
    # Clean up.
    text = text.lstrip()

    # Loose regexp parse.
    m = ISO_DATE_RE.search(text)
    if m is None:
        return None  # bad format

    # XXX there's an extra bit of the timezone I'm ignoring here: is
    #   this the right thing to do?
    # tz is the time zone specifier string.
    yr, mon, day, hr, minute, sec, tz, _ = m.groups()

    if sec is not None:
        # The pattern admits a fractional part, but _str2time() converts
        # seconds with int(); drop the fraction so such input is parsed
        # instead of raising ValueError.
        sec = sec.partition(".")[0]

    return _str2time(day, mon, yr, hr, minute, sec, tz)
334
335
336 # Header parsing
337 # -----------------------------------------------------------------------------
338
def unmatched(match):
    """Return the searched string with the matched span cut out."""
    begin, finish = match.span(0)
    subject = match.string
    return subject[:begin] + subject[finish:]

HEADER_TOKEN_RE =        re.compile(r"^\s*([^=\s;,]+)")
HEADER_QUOTED_VALUE_RE = re.compile(r"^\s*=\s*\"([^\"\\]*(?:\\.[^\"\\]*)*)\"")
HEADER_VALUE_RE =        re.compile(r"^\s*=\s*([^\s;,]*)")
HEADER_ESCAPE_RE = re.compile(r"\\(.)")
def split_header_words(header_values):
    r"""Split header values into lists of (key, value) pairs.

    *header_values* is an iterable of header strings (passing a single str
    is rejected by an assertion).  Every string may itself hold several
    comma-separated headers; each of those becomes one list of pairs in the
    result.  Within a header, elements are separated by ";" (or just
    whitespace) and are either a lone token, giving (token, None), or
    ``attribute=value`` where the value is a token or a quoted string.
    Backslash escapes inside quoted strings are removed.

    The accepted syntax is roughly this (BNF as in the HTTP/1.1
    specification, with the requirement for tokens relaxed):

      headers           = #header
      header            = (token | parameter) *( [";"] (token | parameter))

      token             = 1*<any CHAR except CTLs or separators>
      separators        = "(" | ")" | "<" | ">" | "@"
                        | "," | ";" | ":" | "\" | <">
                        | "/" | "[" | "]" | "?" | "="
                        | "{" | "}" | SP | HT

      quoted-string     = ( <"> *(qdtext | quoted-pair ) <"> )
      qdtext            = <any TEXT except <">>
      quoted-pair       = "\" CHAR

      parameter         = attribute "=" value
      attribute         = token
      value             = token | quoted-string

    Syntactically incorrect headers will not necessarily be parsed as you
    would want.

    Examples:

    >>> split_header_words(['foo="bar"; port="80,81"; discard, bar=baz'])
    [[('foo', 'bar'), ('port', '80,81'), ('discard', None)], [('bar', 'baz')]]
    >>> split_header_words(['text/html; charset="iso-8859-1"'])
    [[('text/html', None), ('charset', 'iso-8859-1')]]
    >>> split_header_words([r'Basic realm="\"foo\bar\""'])
    [[('Basic', None), ('realm', '"foobar"')]]

    """
    assert not isinstance(header_values, str)
    parsed = []
    for header in header_values:
        remaining = header
        current = []
        while remaining:
            token = HEADER_TOKEN_RE.search(remaining)
            if token:
                # A name, optionally followed by a quoted or bare value.
                remaining = unmatched(token)
                name = token.group(1)
                quoted = HEADER_QUOTED_VALUE_RE.search(remaining)
                if quoted:
                    remaining = unmatched(quoted)
                    value = HEADER_ESCAPE_RE.sub(r"\1", quoted.group(1))
                else:
                    bare = HEADER_VALUE_RE.search(remaining)
                    if bare:
                        remaining = unmatched(bare)
                        value = bare.group(1).rstrip()
                    else:
                        # no value, a lone token
                        value = None
                current.append((name, value))
                continue

            left_trimmed = remaining.lstrip()
            if left_trimmed.startswith(","):
                # concatenated headers, as per RFC 2616 section 4.2
                remaining = left_trimmed[1:]
                if current:
                    parsed.append(current)
                current = []
                continue

            # Neither a token nor a comma: skip junk.
            cleaned, nr_junk_chars = re.subn(r"^[=\s;]*", "", remaining)
            assert nr_junk_chars > 0, (
                "split_header_words bug: '%s', '%s', %s" %
                (header, remaining, current))
            remaining = cleaned
        if current:
            parsed.append(current)
    return parsed
432
HEADER_JOIN_ESCAPE_RE = re.compile(r"([\"\\])")
def join_header_words(lists):
    """Build one header value from lists of (key, value) pairs.

    This is (almost) the inverse of split_header_words.  Pairs within a
    list are joined with "; " and the lists with ", ".  A value of None
    yields the bare key; any other value that is not a plain run of word
    characters is double-quoted, with embedded quotes and backslashes
    escaped.

    >>> join_header_words([[("text/plain", None), ("charset", "iso-8859-1")]])
    'text/plain; charset="iso-8859-1"'
    >>> join_header_words([[("text/plain", None)], [("charset", "iso-8859-1")]])
    'text/plain, charset="iso-8859-1"'

    """
    rendered_headers = []
    for pairs in lists:
        elements = []
        for key, value in pairs:
            if value is None:
                elements.append(key)
                continue
            if not re.search(r"^\w+$", value):
                # escape " and \ before wrapping in quotes
                value = '"%s"' % HEADER_JOIN_ESCAPE_RE.sub(r"\\\1", value)
            elements.append("%s=%s" % (key, value))
        if elements:
            rendered_headers.append("; ".join(elements))
    return ", ".join(rendered_headers)
458
def strip_quotes(text):
    """Return *text* without one leading and one trailing double quote.

    Each end is handled independently, so an unbalanced quote is removed
    as well.
    """
    return text.removeprefix('"').removesuffix('"')
465
def parse_ns_headers(ns_headers):
    """Ad-hoc parser for Netscape protocol cookie-attributes.

    Netscape-style Set-Cookie values may contain an unquoted "," in the
    expires field, so split_header_words cannot be used; this parser simply
    splits each header on ";".  It is also used for RFC 2109 cookies.

    Returns one list of (key, value) pairs per header that yielded any
    pairs.  The first pair is the cookie's name and value, kept verbatim.
    For the remaining pairs, recognised attribute names are lower-cased,
    "version" values have surrounding quotes stripped, and "expires" values
    are converted with http2time (None if the date is invalid).  A value is
    None when no "=" was present and "" when it was present but empty.  A
    ("version", "0") pair is appended when no version attribute was seen.

    XXX This may not make the best possible effort to parse all the crap
    that Netscape Cookie headers contain.  Ronald Tschalar's HTTPClient
    parser is probably better, so could do worse than following that if
    this ever gives any trouble.

    """
    known_attrs = ("expires", "domain", "path", "secure",
                   # RFC 2109 attrs (may turn up in Netscape cookies, too)
                   "version", "port", "max-age")

    result = []
    for header in ns_headers:
        attrs = []
        saw_version = False

        # XXX: The following does not strictly adhere to RFCs in that empty
        # names and values are legal (the former will only appear once and will
        # be overwritten if multiple occurrences are present). This is
        # mostly to deal with backwards compatibility.
        for index, chunk in enumerate(header.split(';')):
            name, equals, raw_value = chunk.strip().partition('=')
            name = name.strip()

            if not name:
                if index == 0:
                    # No cookie name at all: give up on this header.
                    break
                continue

            # Distinguish "present but empty" ("") from "missing" (None).
            value = raw_value.strip() if equals else None

            if index == 0:
                # The cookie's own name/value pair is taken as-is.
                attrs.append((name, value))
                continue

            lowered = name.lower()
            if lowered in known_attrs:
                name = lowered

            if name == "version":
                # This is an RFC 2109 cookie.
                saw_version = True
                if value is not None:
                    value = strip_quotes(value)
            elif name == "expires" and value is not None:
                # convert expires date to seconds since epoch
                value = http2time(strip_quotes(value))  # None if invalid
            attrs.append((name, value))

        if not attrs:
            continue
        if not saw_version:
            attrs.append(("version", "0"))
        result.append(attrs)

    return result
532
533
IPV4_RE = re.compile(r"\.\d+$", re.ASCII)
def is_HDN(text):
    """Return True if text is a host domain name."""
    # XXX
    # This may well be wrong.  Which RFC is HDN defined in, if any (for
    #  the purposes of RFC 2965)?
    # For the current implementation, what about IPv6?  Remember to look
    #  at other uses of IPV4_RE also, if change this.
    if IPV4_RE.search(text):
        # Ends in ".<digits>": treated as an IP address, not a name.
        return False
    if text == "":
        return False
    if text[0] == "." or text[-1] == ".":
        return False
    return True

def domain_match(A, B):
    """Return True if domain A domain-matches domain B, according to RFC 2965.

    A and B may be host domain names or IP addresses.

    RFC 2965, section 1:

    Host names can be specified either as an IP address or a HDN string.
    Sometimes we compare one host name with another.  (Such comparisons SHALL
    be case-insensitive.)  Host A's name domain-matches host B's if

         *  their host name strings string-compare equal; or

         * A is a HDN string and has the form NB, where N is a non-empty
            name string, B has the form .B', and B' is a HDN string.  (So,
            x.y.com domain-matches .Y.com but not Y.com.)

    Note that domain-match is not a commutative operation: a.b.c.com
    domain-matches .c.com, but not the reverse.

    """
    # Note that, if A or B are IP addresses, the only relevant part of the
    # definition of the domain-match algorithm is the direct string-compare.
    A = A.lower()
    B = B.lower()
    if A == B:
        return True
    if not is_HDN(A):
        return False
    # "A has the form NB" means A must *end* with B.  Searching for B
    # anywhere inside A would wrongly let www.acme.com.evil.org match
    # .acme.com.  Because A != B here, a suffix match also guarantees that
    # N is non-empty.
    if not A.endswith(B):
        return False
    if not B.startswith("."):
        return False
    return is_HDN(B[1:])
588
def liberal_is_HDN(text):
    """Return True if text is a sort-of-like a host domain name.

    For accepting/blocking domains.  The only thing rejected is text that
    IPV4_RE recognises, i.e. text ending in a dot followed by digits.

    """
    return IPV4_RE.search(text) is None
598
def user_domain_match(A, B):
    """For blocking/accepting domains.

    A and B may be host domain names or IP addresses; the comparison is
    case-insensitive.  If both look like domain names (liberal_is_HDN) and
    B starts with a dot, A matches when it ends with B.  In every other
    case A and B must be equal.

    """
    A = A.lower()
    B = B.lower()
    both_names = liberal_is_HDN(A) and liberal_is_HDN(B)
    if both_names and B.startswith("."):
        # ".example.com" covers any host whose name ends with it
        return A.endswith(B)
    # plain names, or at least one IP address: exact match only
    return A == B
618
cut_port_re = re.compile(r":\d+$", re.ASCII)
def request_host(request):
    """Return request-host, as defined by RFC 2965.

    The host is the network-location part of the request's full URL; when
    that is empty the request's "Host" header is used instead.  A trailing
    ":port" is removed.

    Variation from RFC: returned value is lowercased, for convenient
    comparison.

    """
    netloc = urllib.parse.urlparse(request.get_full_url())[1]
    host = netloc if netloc != "" else request.get_header("Host", "")
    # remove port, if present
    return cut_port_re.sub("", host, 1).lower()
635
def eff_request_host(request):
    """Return a tuple (request-host, effective request-host name).

    As defined by RFC 2965, except both are lowercased.  The effective name
    is the request-host with ".local" appended when the request-host
    contains no dot; otherwise the two are identical.

    """
    req_host = request_host(request)
    erhn = req_host if "." in req_host else req_host + ".local"
    return req_host, erhn
646
def request_path(request):
    """Path component of request-URI, as defined by RFC 2965.

    The path of the request's full URL is passed through escape_path(); a
    leading "/" is added if it is missing.
    """
    split_url = urllib.parse.urlsplit(request.get_full_url())
    path = escape_path(split_url.path)
    if path.startswith("/"):
        return path
    # fix bad RFC 2396 absoluteURI
    return "/" + path
656
def request_port(request):
    """Return the port of ``request.host`` as a string.

    Everything after the first ":" in the host is taken as the port.
    DEFAULT_HTTP_PORT is returned when there is no ":"; None is returned
    (after a debug message) when the port text is not an integer.
    """
    _, colon, port = request.host.partition(':')
    if not colon:
        return DEFAULT_HTTP_PORT
    try:
        int(port)
    except ValueError:
        _debug("nonnumeric port: '%s'", port)
        return None
    return port
670
# Characters in addition to A-Z, a-z, 0-9, '_', '.', and '-' that don't
# need to be escaped to form a valid HTTP URL (RFCs 2396 and 1738).
HTTP_PATH_SAFE = "%/;:@&=+$,!~*'()"
ESCAPED_CHAR_RE = re.compile(r"%([0-9a-fA-F][0-9a-fA-F])")
def uppercase_escaped_char(match):
    """Return the matched %xx escape with its hex digits upper-cased."""
    return "%" + match.group(1).upper()
def escape_path(path):
    """Escape any invalid characters in HTTP URL, and uppercase all escapes."""
    # The character encoding behind existing %-escapes in a URL is unknown,
    # but one has to be picked to escape invalid path characters.  UTF-8 is
    # used (urllib.parse.quote's default), as recommended in the HTML 4.0
    # specification:
    # http://www.w3.org/TR/REC-html40/appendix/notes.html#h-B.2.1
    # And here, kind of: draft-fielding-uri-rfc2396bis-03
    # (And in draft IRI specification: draft-duerst-iri-05)
    # (And here, for new URI schemes: RFC 2718)
    quoted = urllib.parse.quote(path, HTTP_PATH_SAFE)
    return ESCAPED_CHAR_RE.sub(uppercase_escaped_char, quoted)
690
def reach(h):
    """Return reach of host h, as defined by RFC 2965, section 1.

    The reach R of a host name H is defined as follows:

       *  If

          -  H is the host domain name of a host; and,

          -  H has the form A.B; and

          -  A has no embedded (that is, interior) dots; and

          -  B has at least one embedded dot, or B is the string "local".
             then the reach of H is .B.

       *  Otherwise, the reach of H is H.

    >>> reach("www.acme.com")
    '.acme.com'
    >>> reach("acme.com")
    'acme.com'
    >>> reach("acme.local")
    '.local'

    """
    # Split at the first dot: "a" is the leading label, "b" is the rest.
    a, dot, b = h.partition(".")
    if dot and is_HDN(h) and ("." in b or b == "local"):
        return "." + b
    return h
725
def is_third_party(request):
    """Return True if the request is to a third-party host.

    RFC 2965, section 3.3.6:

        An unverifiable transaction is to a third-party host if its request-
        host U does not domain-match the reach R of the request-host O in the
        origin transaction.

    The origin request-host is read from ``request.origin_req_host``.

    """
    return not domain_match(request_host(request),
                            reach(request.origin_req_host))
741
742
class Cookie:
    """HTTP Cookie.

    A single class covers both Netscape and RFC 2965 cookies.

    The class is deliberately simple: it only stores attributes, and
    nothing stops you from building instances that violate the cookie
    standards.  Cookie objects are normally created by
    CookieJar.make_cookies, which takes care of parsing, supplying defaults
    and normalising to the representation used here; CookiePolicy decides
    whether a cookie is accepted from, and returned to, the server.

    The port may be present in the headers but unspecified ("Port" rather
    than "Port=80", for example); in that case port is None.

    """

    def __init__(self, version, name, value,
                 port, port_specified,
                 domain, domain_specified, domain_initial_dot,
                 path, path_specified,
                 secure,
                 expires,
                 discard,
                 comment,
                 comment_url,
                 rest,
                 rfc2109=False,
                 ):
        """Store the cookie-attributes on the instance.

        *version* is converted with int() and *expires* with
        int(float(...)) unless None; *domain* is lower-cased; *rest* (the
        mapping of non-standard attributes) is shallow-copied.

        Raises ValueError if *port* is None while *port_specified* is True.
        """
        if version is not None:
            version = int(version)
        if expires is not None:
            expires = int(float(expires))
        if port is None and port_specified is True:
            raise ValueError("if port is None, port_specified must be false")

        self.version = version
        self.name = name
        self.value = value
        self.port = port
        self.port_specified = port_specified
        # normalise case, as per RFC 2965 section 3.3.3
        self.domain = domain.lower()
        self.domain_specified = domain_specified
        # Whether the domain cookie-attribute had an initial dot has to be
        # remembered in order to follow RFC 2965 (as clarified in draft
        # errata); it is needed for the returned $Domain value.
        self.domain_initial_dot = domain_initial_dot
        self.path = path
        self.path_specified = path_specified
        self.secure = secure
        self.expires = expires
        self.discard = discard
        self.comment = comment
        self.comment_url = comment_url
        self.rfc2109 = rfc2109

        self._rest = copy.copy(rest)

    def has_nonstandard_attr(self, name):
        """Return True if the non-standard attribute *name* is set."""
        return name in self._rest

    def get_nonstandard_attr(self, name, default=None):
        """Return the non-standard attribute *name*, or *default*."""
        return self._rest.get(name, default)

    def set_nonstandard_attr(self, name, value):
        """Set the non-standard attribute *name* to *value*."""
        self._rest[name] = value

    def is_expired(self, now=None):
        """Return True if the cookie has an expiry time that is <= *now*.

        *now* defaults to the current time; a cookie whose expires is None
        never counts as expired.
        """
        if now is None:
            now = time.time()
        return (self.expires is not None) and (self.expires <= now)

    def __str__(self):
        port_part = "" if self.port is None else ":" + self.port
        limit = self.domain + port_part + self.path
        if self.value is None:
            namevalue = self.name
        else:
            namevalue = "%s=%s" % (self.name, self.value)
        return "<Cookie %s for %s>" % (namevalue, limit)

    def __repr__(self):
        shown = ("version", "name", "value",
                 "port", "port_specified",
                 "domain", "domain_specified", "domain_initial_dot",
                 "path", "path_specified",
                 "secure", "expires", "discard", "comment", "comment_url",
                 )
        parts = ["%s=%r" % (field, getattr(self, field)) for field in shown]
        parts.append("rest=%r" % (self._rest,))
        parts.append("rfc2109=%r" % (self.rfc2109,))
        return "%s(%s)" % (self.__class__.__name__, ", ".join(parts))
839
840
class CookiePolicy:
    """Interface for deciding which cookies are accepted and returned.

    A policy may also modify cookies, though this is probably a bad idea.

    set_ok() and return_ok() must be provided by subclasses.  The subclass
    DefaultCookiePolicy implements the standard rules for Netscape and
    RFC 2965 cookies -- override that if you want a customized policy.

    """
    def set_ok(self, cookie, request):
        """Return true if (and only if) cookie should be accepted from server.

        Currently, pre-expired cookies never get this far -- the CookieJar
        class deletes such cookies itself.

        Not implemented here: raises NotImplementedError.

        """
        raise NotImplementedError()

    def return_ok(self, cookie, request):
        """Return true if (and only if) cookie should be returned to server.

        Not implemented here: raises NotImplementedError.
        """
        raise NotImplementedError()

    def domain_return_ok(self, domain, request):
        """Return false if cookies should not be returned, given cookie domain.

        This base implementation always returns True.
        """
        return True

    def path_return_ok(self, path, request):
        """Return false if cookies should not be returned, given cookie path.

        This base implementation always returns True.
        """
        return True
872
873
874 class ESC[4;38;5;81mDefaultCookiePolicy(ESC[4;38;5;149mCookiePolicy):
875 """Implements the standard rules for accepting and returning cookies."""
876
877 DomainStrictNoDots = 1
878 DomainStrictNonDomain = 2
879 DomainRFC2965Match = 4
880
881 DomainLiberal = 0
882 DomainStrict = DomainStrictNoDots|DomainStrictNonDomain
883
def __init__(self,
             blocked_domains=None, allowed_domains=None,
             netscape=True, rfc2965=False,
             rfc2109_as_netscape=None,
             hide_cookie2=False,
             strict_domain=False,
             strict_rfc2965_unverifiable=True,
             strict_ns_unverifiable=False,
             strict_ns_domain=DomainLiberal,
             strict_ns_set_initial_dollar=False,
             strict_ns_set_path=False,
             secure_protocols=("https", "wss")
             ):
    """Constructor arguments should be passed as keyword arguments only.

    Each switch is stored unchanged on an attribute of the same name.
    *blocked_domains* is kept as a tuple (empty when None is given);
    *allowed_domains* is kept as a tuple, or as None when None is given.
    """
    # Protocol switches.
    self.netscape = netscape
    self.rfc2965 = rfc2965
    self.rfc2109_as_netscape = rfc2109_as_netscape
    self.hide_cookie2 = hide_cookie2

    # Strictness switches.
    self.strict_domain = strict_domain
    self.strict_rfc2965_unverifiable = strict_rfc2965_unverifiable
    self.strict_ns_unverifiable = strict_ns_unverifiable
    self.strict_ns_domain = strict_ns_domain
    self.strict_ns_set_initial_dollar = strict_ns_set_initial_dollar
    self.strict_ns_set_path = strict_ns_set_path
    self.secure_protocols = secure_protocols

    # Block list and allow list.
    self._blocked_domains = (
        () if blocked_domains is None else tuple(blocked_domains))
    self._allowed_domains = (
        None if allowed_domains is None else tuple(allowed_domains))
918
def blocked_domains(self):
    """Return the blocked domains, as the tuple stored on the policy."""
    blocked = self._blocked_domains
    return blocked
def set_blocked_domains(self, blocked_domains):
    """Replace the blocked domains with a tuple built from *blocked_domains*."""
    replacement = tuple(blocked_domains)
    self._blocked_domains = replacement
925
def is_blocked(self, domain):
    """Return True if *domain* matches any entry of the block list.

    Entries are compared with user_domain_match().
    """
    return any(user_domain_match(domain, blocked)
               for blocked in self._blocked_domains)
931
def allowed_domains(self):
    """Return the allowed domains: a tuple, or None if no allow list is set."""
    allowed = self._allowed_domains
    return allowed
def set_allowed_domains(self, allowed_domains):
    """Set the allowed domains.

    None is stored as None; any other value is stored as a tuple.
    """
    self._allowed_domains = (
        None if allowed_domains is None else tuple(allowed_domains))
940
def is_not_allowed(self, domain):
    """Return True if an allow list is set and *domain* matches none of it.

    With no allow list (None) every domain is allowed, so the result is
    False.  Entries are compared with user_domain_match().
    """
    allowed = self._allowed_domains
    if allowed is None:
        return False
    return not any(user_domain_match(domain, entry) for entry in allowed)
948
def set_ok(self, cookie, request):
    """Return True if every set_ok_* check accepts the cookie.

    The checks run in a fixed order (version, verifiability, name, path,
    domain, port) and stop at the first one that rejects the cookie.

    If you override .set_ok(), be sure to call this method.  If it returns
    false, so should your subclass (assuming your subclass wants to be more
    strict about which cookies to accept).

    """
    _debug(" - checking cookie %s=%s", cookie.name, cookie.value)

    assert cookie.name is not None

    aspects = ("version", "verifiability", "name", "path", "domain", "port")
    # all() short-circuits, so later checks are skipped after a rejection.
    return all(getattr(self, "set_ok_" + aspect)(cookie, request)
               for aspect in aspects)
967
def set_ok_version(self, cookie, request):
    """Return True if the cookie's version is enabled by this policy.

    A cookie with no version is rejected; a version above 0 requires
    ``self.rfc2965`` and version 0 requires ``self.netscape``.
    """
    version = cookie.version
    if version is None:
        # Version is always set to 0 by parse_ns_headers if it's a Netscape
        # cookie, so this must be an invalid RFC 2965 cookie.
        _debug(" Set-Cookie2 without version attribute (%s=%s)",
               cookie.name, cookie.value)
        return False
    if version > 0:
        if not self.rfc2965:
            _debug(" RFC 2965 cookies are switched off")
            return False
    elif version == 0 and not self.netscape:
        _debug(" Netscape cookies are switched off")
        return False
    return True
982
def set_ok_verifiability(self, cookie, request):
    """Return False for cookies from unverifiable third-party requests.

    Only applies when ``request.unverifiable`` is true and the request is
    third-party.  Cookies with a version above 0 are then rejected if
    ``strict_rfc2965_unverifiable`` is set, and version 0 cookies if
    ``strict_ns_unverifiable`` is set.
    """
    if not (request.unverifiable and is_third_party(request)):
        return True
    if cookie.version > 0 and self.strict_rfc2965_unverifiable:
        _debug(" third-party RFC 2965 cookie during "
               "unverifiable transaction")
        return False
    if cookie.version == 0 and self.strict_ns_unverifiable:
        _debug(" third-party Netscape cookie during "
               "unverifiable transaction")
        return False
    return True
994
def set_ok_name(self, cookie, request):
    """Return False for a version 0 cookie whose name starts with "$".

    The check is only active when ``strict_ns_set_initial_dollar`` is set.
    """
    # Try and stop servers setting V0 cookies designed to hack other
    # servers that know both V0 and V1 protocols.
    dollar_name = (cookie.version == 0 and
                   self.strict_ns_set_initial_dollar and
                   cookie.name.startswith("$"))
    if not dollar_name:
        return True
    _debug(" illegal name (starts with '$'): '%s'", cookie.name)
    return False
1003
def set_ok_path(self, cookie, request):
    """Return False if an explicitly specified cookie path is unacceptable.

    Cookies without a specified path always pass.  Otherwise the path is
    checked with self.path_return_ok() for cookies with a version above 0,
    and for version 0 cookies when ``strict_ns_set_path`` is set.
    """
    if not cookie.path_specified:
        return True
    req_path = request_path(request)
    enforce = (cookie.version > 0 or
               (cookie.version == 0 and self.strict_ns_set_path))
    if enforce and not self.path_return_ok(cookie.path, request):
        _debug(" path attribute %s is not a prefix of request "
               "path %s", cookie.path, req_path)
        return False
    return True
1014
def set_ok_domain(self, cookie, request):
    """Return True if the cookie's domain is acceptable for this request.

    The cookie is rejected when its domain is on the block list or missing
    from a configured allow list.  For cookies with an explicitly specified
    domain, further checks compare the domain with the request-host and the
    effective request-host name; which checks run depends on the cookie
    version and on the strict_domain / strict_ns_domain settings.
    """
    # User-configured block list and allow list apply to every cookie.
    if self.is_blocked(cookie.domain):
        _debug(" domain %s is in user block-list", cookie.domain)
        return False
    if self.is_not_allowed(cookie.domain):
        _debug(" domain %s is not in user allow-list", cookie.domain)
        return False
    if cookie.domain_specified:
        req_host, erhn = eff_request_host(request)
        domain = cookie.domain
        if self.strict_domain and (domain.count(".") >= 2):
            # XXX This should probably be compared with the Konqueror
            # (kcookiejar.cpp) and Mozilla implementations, but it's a
            # losing battle.
            i = domain.rfind(".")
            j = domain.rfind(".", 0, i)
            if j == 0:  # domain like .foo.bar
                tld = domain[i+1:]
                sld = domain[j+1:i]
                # A well-known second-level label followed by a two-letter
                # last label is refused.
                if sld.lower() in ("co", "ac", "com", "edu", "org", "net",
                   "gov", "mil", "int", "aero", "biz", "cat", "coop",
                   "info", "jobs", "mobi", "museum", "name", "pro",
                   "travel", "eu") and len(tld) == 2:
                    # domain like .co.uk
                    _debug(" country-code second level domain %s", domain)
                    return False
        if domain.startswith("."):
            undotted_domain = domain[1:]
        else:
            undotted_domain = domain
        # A domain with no embedded dot is only accepted when the effective
        # request-host name ends with ".local".
        embedded_dots = (undotted_domain.find(".") >= 0)
        if not embedded_dots and not erhn.endswith(".local"):
            _debug(" non-local domain %s contains no embedded dot",
                   domain)
            return False
        if cookie.version == 0:
            # Version 0 cookies: the effective request-host name (with or
            # without an added initial dot) must end with the domain.
            if (not (erhn.endswith(domain) or
                     erhn.endswith(f"{undotted_domain}.local")) and
                (not erhn.startswith(".") and
                 not ("."+erhn).endswith(domain))):
                _debug(" effective request-host %s (even with added "
                       "initial dot) does not end with %s",
                       erhn, domain)
                return False
        # RFC 2965 domain-match: always for version > 0, and for version 0
        # only when the DomainRFC2965Match bit of strict_ns_domain is set.
        if (cookie.version > 0 or
            (self.strict_ns_domain & self.DomainRFC2965Match)):
            if not domain_match(erhn, domain):
                _debug(" effective request-host %s does not domain-match "
                       "%s", erhn, domain)
                return False
        # The part of the request-host in front of the domain must not
        # contain a dot; skipped when the request-host matches IPV4_RE.
        if (cookie.version > 0 or
            (self.strict_ns_domain & self.DomainStrictNoDots)):
            host_prefix = req_host[:-len(domain)]
            if (host_prefix.find(".") >= 0 and
                not IPV4_RE.search(req_host)):
                _debug(" host prefix %s for domain %s contains a dot",
                       host_prefix, domain)
                return False
    return True
1074
def set_ok_port(self, cookie, request):
    """Return False unless the request port is in the cookie's port list.

    Cookies without an explicit port list are always accepted.  A
    non-numeric entry met before a match causes rejection.
    """
    if not cookie.port_specified:
        return True
    req_port = request_port(request)
    req_port = "80" if req_port is None else str(req_port)
    for candidate in cookie.port.split(","):
        try:
            int(candidate)
        except ValueError:
            _debug(" bad port %s (not numeric)", candidate)
            return False
        if candidate == req_port:
            return True
    _debug(" request port (%s) not found in %s",
           req_port, cookie.port)
    return False
1095
def return_ok(self, cookie, request):
    """
    If you override .return_ok(), be sure to call this method. If it
    returns false, so should your subclass (assuming your subclass wants to
    be more strict about which cookies to return).

    """
    # Path has already been checked by .path_return_ok(), and domain
    # blocking done by .domain_return_ok().
    _debug(" - checking cookie %s=%s", cookie.name, cookie.value)

    # Each return_ok_<aspect> method is looked up and run in this order;
    # evaluation stops at the first one that rejects the cookie.
    aspects = ("version", "verifiability", "secure", "expires", "port",
               "domain")
    return all(getattr(self, "return_ok_"+aspect)(cookie, request)
               for aspect in aspects)
1113
def return_ok_version(self, cookie, request):
    """Return False if the cookie's protocol version is switched off."""
    version = cookie.version
    if version > 0 and not self.rfc2965:
        _debug(" RFC 2965 cookies are switched off")
        return False
    if version == 0 and not self.netscape:
        _debug(" Netscape cookies are switched off")
        return False
    return True
1122
def return_ok_verifiability(self, cookie, request):
    """Return False if policy forbids returning cookie in this transaction.

    Only requests that are both unverifiable and reported as third-party
    by is_third_party() can be refused; the strictness switch consulted
    depends on the cookie version.
    """
    if not (request.unverifiable and is_third_party(request)):
        return True
    if cookie.version > 0 and self.strict_rfc2965_unverifiable:
        _debug(" third-party RFC 2965 cookie during unverifiable "
               "transaction")
        return False
    if cookie.version == 0 and self.strict_ns_unverifiable:
        _debug(" third-party Netscape cookie during unverifiable "
               "transaction")
        return False
    return True
1134
def return_ok_secure(self, cookie, request):
    """Return False for a secure cookie on a request type not in secure_protocols."""
    if not cookie.secure:
        return True
    if request.type in self.secure_protocols:
        return True
    _debug(" secure cookie with non-secure request")
    return False
1140
def return_ok_expires(self, cookie, request):
    """Return False if cookie.is_expired() reports expiry at self._now."""
    expired = cookie.is_expired(self._now)
    if expired:
        _debug(" cookie expired")
        return False
    return True
1146
def return_ok_port(self, cookie, request):
    """Return False if the cookie lists ports and the request port is not one of them."""
    if not cookie.port:
        return True
    req_port = request_port(request)
    if req_port is None:
        req_port = "80"
    if req_port in cookie.port.split(","):
        return True
    _debug(" request port %s does not match cookie port %s",
           req_port, cookie.port)
    return False
1160
def return_ok_domain(self, cookie, request):
    """Return True if the cookie's domain permits returning it for request.

    Version 0 cookies are matched by suffix against a dotted form of
    the domain; later versions must satisfy domain_match().
    """
    req_host, erhn = eff_request_host(request)
    domain = cookie.domain

    # Dotted form of the cookie domain, used for the suffix comparison.
    dotdomain = domain
    if domain and not domain.startswith("."):
        dotdomain = "." + domain

    # strict check of non-domain cookies: Mozilla does this, MSIE5 doesn't
    if cookie.version == 0:
        strict_non_domain = (self.strict_ns_domain &
                             self.DomainStrictNonDomain)
        if (strict_non_domain and not cookie.domain_specified
                and domain != erhn):
            _debug(" cookie with unspecified domain does not string-compare "
                   "equal to request domain")
            return False

    if cookie.version > 0 and not domain_match(erhn, domain):
        _debug(" effective request-host name %s does not domain-match "
               "RFC 2965 cookie domain %s", erhn, domain)
        return False
    if cookie.version == 0 and not ("."+erhn).endswith(dotdomain):
        _debug(" request-host %s does not match Netscape cookie domain "
               "%s", req_host, domain)
        return False
    return True
1187
def domain_return_ok(self, domain, request):
    """Return False if no cookie stored under domain should be returned.

    The request host and effective request-host are suffix-matched
    against the dotted domain, then the block/allow lists are consulted.
    """
    # Liberal check of domain. This is here as an optimization to avoid
    # having to load lots of MSIE cookie files unless necessary.
    req_host, erhn = eff_request_host(request)
    if not req_host.startswith("."):
        req_host = "."+req_host
    if not erhn.startswith("."):
        erhn = "."+erhn

    dotdomain = domain
    if domain and not domain.startswith("."):
        dotdomain = "." + domain

    if not req_host.endswith(dotdomain) and not erhn.endswith(dotdomain):
        return False

    if self.is_blocked(domain):
        _debug(" domain %s is in user block-list", domain)
        return False
    if self.is_not_allowed(domain):
        _debug(" domain %s is not in user allow-list", domain)
        return False

    return True
1213
def path_return_ok(self, path, request):
    """Return True if path equals the request path or is a prefix of it.

    A prefix only counts when it ends with "/" or is immediately
    followed by "/" in the request path.
    """
    _debug("- checking cookie path=%s", path)
    req_path = request_path(request)
    prefix_len = len(path)
    if req_path == path:
        return True
    if req_path.startswith(path):
        next_char = req_path[prefix_len:prefix_len+1]
        if path.endswith("/") or next_char == "/":
            return True

    _debug(" %s does not path-match %s", req_path, path)
    return False
1226
def deepvalues(mapping):
    """Iterates over nested mapping, depth-first"""
    # Snapshot the values so the mapping can be modified while iterating.
    for value in list(mapping.values()):
        if hasattr(value, "items"):
            # Anything exposing .items is treated as a nested mapping.
            yield from deepvalues(value)
        else:
            yield value
1240
1241
1242 # Used as second parameter to dict.get() method, to distinguish absent
1243 # dict key from one with a None value.
class Absent:
    """Sentinel default for dict.get(): tells a missing key from a None value."""
1245
class CookieJar:
    """Collection of HTTP cookies.

    You may not need to know about this class: try
    urllib.request.build_opener(HTTPCookieProcessor).open(url).
    """

    non_word_re = re.compile(r"\W")
    quote_re = re.compile(r"([\"\\])")
    strict_domain_re = re.compile(r"\.?[^.]*")
    domain_re = re.compile(r"[^.]*")
    dots_re = re.compile(r"^\.+")

    magic_re = re.compile(r"^\#LWP-Cookies-(\d+\.\d+)", re.ASCII)

    def __init__(self, policy=None):
        """Create an empty jar, using DefaultCookiePolicy() if policy is None."""
        if policy is None:
            policy = DefaultCookiePolicy()
        self._policy = policy

        # Re-entrant: methods that hold the lock call other locking methods
        # (e.g. set_cookie_if_ok -> set_cookie).
        self._cookies_lock = _threading.RLock()
        # Nested mapping: domain -> path -> name -> cookie.
        self._cookies = {}

    def set_policy(self, policy):
        """Replace the policy object consulted by this jar."""
        self._policy = policy

    def _cookies_for_domain(self, domain, request):
        """Return the cookies stored under domain that the policy allows for request."""
        cookies = []
        if not self._policy.domain_return_ok(domain, request):
            return []
        _debug("Checking %s for cookies to return", domain)
        cookies_by_path = self._cookies[domain]
        for path in cookies_by_path.keys():
            if not self._policy.path_return_ok(path, request):
                continue
            cookies_by_name = cookies_by_path[path]
            for cookie in cookies_by_name.values():
                if not self._policy.return_ok(cookie, request):
                    _debug(" not returning cookie")
                    continue
                _debug(" it's a match")
                cookies.append(cookie)
        return cookies

    def _cookies_for_request(self, request):
        """Return a list of cookies to be returned to server."""
        cookies = []
        for domain in self._cookies.keys():
            cookies.extend(self._cookies_for_domain(domain, request))
        return cookies

    def _cookie_attrs(self, cookies):
        """Return a list of cookie-attributes to be returned to server.

        like ['foo="bar"; $Path="/"', ...]

        The $Version attribute is also added when appropriate (currently only
        once per request).

        Note that the cookies list is sorted in place.

        """
        # add cookies in order of most specific (ie. longest) path first
        cookies.sort(key=lambda a: len(a.path), reverse=True)

        version_set = False

        attrs = []
        for cookie in cookies:
            # set version of Cookie header
            # XXX
            # What should it be if multiple matching Set-Cookie headers have
            # different versions themselves?
            # Answer: there is no answer; was supposed to be settled by
            # RFC 2965 errata, but that may never appear...
            version = cookie.version
            if not version_set:
                version_set = True
                if version > 0:
                    attrs.append("$Version=%s" % version)

            # quote cookie value if necessary
            # (not for Netscape protocol, which already has any quotes
            # intact, due to the poorly-specified Netscape Cookie: syntax)
            if ((cookie.value is not None) and
                    self.non_word_re.search(cookie.value) and version > 0):
                value = self.quote_re.sub(r"\\\1", cookie.value)
            else:
                value = cookie.value

            # add cookie-attributes to be returned in Cookie header
            if cookie.value is None:
                attrs.append(cookie.name)
            else:
                attrs.append("%s=%s" % (cookie.name, value))
            if version > 0:
                if cookie.path_specified:
                    attrs.append('$Path="%s"' % cookie.path)
                if cookie.domain.startswith("."):
                    domain = cookie.domain
                    if (not cookie.domain_initial_dot and
                            domain.startswith(".")):
                        domain = domain[1:]
                    attrs.append('$Domain="%s"' % domain)
                if cookie.port is not None:
                    p = "$Port"
                    if cookie.port_specified:
                        p = p + ('="%s"' % cookie.port)
                    attrs.append(p)

        return attrs

    def add_cookie_header(self, request):
        """Add correct Cookie: header to request (urllib.request.Request object).

        The Cookie2 header is also added unless policy.hide_cookie2 is true.

        """
        _debug("add_cookie_header")
        with self._cookies_lock:

            self._policy._now = self._now = int(time.time())

            cookies = self._cookies_for_request(request)

            attrs = self._cookie_attrs(cookies)
            if attrs:
                # Never overwrite a Cookie header that is already present.
                if not request.has_header("Cookie"):
                    request.add_unredirected_header(
                        "Cookie", "; ".join(attrs))

            # if necessary, advertise that we know RFC 2965
            if (self._policy.rfc2965 and not self._policy.hide_cookie2 and
                    not request.has_header("Cookie2")):
                for cookie in cookies:
                    if cookie.version != 1:
                        request.add_unredirected_header("Cookie2", '$Version="1"')
                        break

        self.clear_expired_cookies()

    def _normalized_cookie_tuples(self, attrs_set):
        """Return list of tuples containing normalised cookie information.

        attrs_set is the list of lists of key,value pairs extracted from
        the Set-Cookie or Set-Cookie2 headers.

        Tuples are name, value, standard, rest, where name and value are the
        cookie name and value, standard is a dictionary containing the standard
        cookie-attributes (discard, secure, version, expires or max-age,
        domain, path and port) and rest is a dictionary containing the rest of
        the cookie-attributes.

        Cookies with a missing or malformed required value are dropped.

        """
        cookie_tuples = []

        boolean_attrs = "discard", "secure"
        value_attrs = ("version",
                       "expires", "max-age",
                       "domain", "path", "port",
                       "comment", "commenturl")

        for cookie_attrs in attrs_set:
            name, value = cookie_attrs[0]

            # Build dictionary of standard cookie-attributes (standard) and
            # dictionary of other cookie-attributes (rest).

            # Note: expiry time is normalised to seconds since epoch. V0
            # cookies should have the Expires cookie-attribute, and V1 cookies
            # should have Max-Age, but since V1 includes RFC 2109 cookies (and
            # since V0 cookies may be a mish-mash of Netscape and RFC 2109), we
            # accept either (but prefer Max-Age).
            max_age_set = False

            bad_cookie = False

            standard = {}
            rest = {}
            for k, v in cookie_attrs[1:]:
                lc = k.lower()
                # don't lose case distinction for unknown fields
                if lc in value_attrs or lc in boolean_attrs:
                    k = lc
                if k in boolean_attrs and v is None:
                    # boolean cookie-attribute is present, but has no value
                    # (like "discard", rather than "port=80")
                    v = True
                if k in standard:
                    # only first value is significant
                    continue
                if k == "domain":
                    if v is None:
                        _debug(" missing value for domain attribute")
                        bad_cookie = True
                        break
                    # RFC 2965 section 3.3.3
                    v = v.lower()
                if k == "expires":
                    if max_age_set:
                        # Prefer max-age to expires (like Mozilla)
                        continue
                    if v is None:
                        _debug(" missing or invalid value for expires "
                               "attribute: treating as session cookie")
                        continue
                if k == "max-age":
                    max_age_set = True
                    try:
                        v = int(v)
                    except (TypeError, ValueError):
                        # TypeError covers a Max-Age with no value at all
                        # (v is None); without it the error would escape
                        # and take every cookie in the header set with it.
                        _debug(" missing or invalid (non-numeric) value for "
                               "max-age attribute")
                        bad_cookie = True
                        break
                    # convert RFC 2965 Max-Age to seconds since epoch
                    # XXX Strictly you're supposed to follow RFC 2616
                    # age-calculation rules. Remember that zero Max-Age
                    # is a request to discard (old and new) cookie, though.
                    k = "expires"
                    v = self._now + v
                if (k in value_attrs) or (k in boolean_attrs):
                    if (v is None and
                            k not in ("port", "comment", "commenturl")):
                        _debug(" missing value for %s attribute" % k)
                        bad_cookie = True
                        break
                    standard[k] = v
                else:
                    rest[k] = v

            if bad_cookie:
                continue

            cookie_tuples.append((name, value, standard, rest))

        return cookie_tuples

    def _cookie_from_cookie_tuple(self, tup, request):
        """Build a Cookie from one normalised tuple, filling in defaults.

        Returns None if the version is not an integer, or if the expiry
        time is not in the future (in which case any stored cookie with
        the same domain, path and name is removed).
        """
        # standard is dict of standard cookie-attributes, rest is dict of the
        # rest of them
        name, value, standard, rest = tup

        domain = standard.get("domain", Absent)
        path = standard.get("path", Absent)
        port = standard.get("port", Absent)
        expires = standard.get("expires", Absent)

        # set the easy defaults
        version = standard.get("version", None)
        if version is not None:
            try:
                version = int(version)
            except ValueError:
                return None  # invalid version, ignore cookie
        secure = standard.get("secure", False)
        # (discard is also set if expires is Absent)
        discard = standard.get("discard", False)
        comment = standard.get("comment", None)
        comment_url = standard.get("commenturl", None)

        # set default path
        if path is not Absent and path != "":
            path_specified = True
            path = escape_path(path)
        else:
            path_specified = False
            path = request_path(request)
            i = path.rfind("/")
            if i != -1:
                if version == 0:
                    # Netscape spec parts company from reality here
                    path = path[:i]
                else:
                    path = path[:i+1]
            if len(path) == 0:
                path = "/"

        # set default domain
        domain_specified = domain is not Absent
        # but first we have to remember whether it starts with a dot
        domain_initial_dot = False
        if domain_specified:
            domain_initial_dot = bool(domain.startswith("."))
        if domain is Absent:
            req_host, erhn = eff_request_host(request)
            domain = erhn
        elif not domain.startswith("."):
            domain = "."+domain

        # set default port
        port_specified = False
        if port is not Absent:
            if port is None:
                # Port attr present, but has no value: default to request port.
                # Cookie should then only be sent back on that port.
                port = request_port(request)
            else:
                port_specified = True
                port = re.sub(r"\s+", "", port)
        else:
            # No port attr present. Cookie can be sent back on any port.
            port = None

        # set default expires and discard
        if expires is Absent:
            expires = None
            discard = True
        elif expires <= self._now:
            # Expiry date in past is request to delete cookie. This can't be
            # in DefaultCookiePolicy, because can't delete cookies there.
            try:
                self.clear(domain, path, name)
            except KeyError:
                pass
            _debug("Expiring cookie, domain='%s', path='%s', name='%s'",
                   domain, path, name)
            return None

        return Cookie(version,
                      name, value,
                      port, port_specified,
                      domain, domain_specified, domain_initial_dot,
                      path, path_specified,
                      secure,
                      expires,
                      discard,
                      comment,
                      comment_url,
                      rest)

    def _cookies_from_attrs_set(self, attrs_set, request):
        """Return the Cookie objects that can be built from attrs_set."""
        cookie_tuples = self._normalized_cookie_tuples(attrs_set)

        cookies = []
        for tup in cookie_tuples:
            cookie = self._cookie_from_cookie_tuple(tup, request)
            if cookie:
                cookies.append(cookie)
        return cookies

    def _process_rfc2109_cookies(self, cookies):
        """Flag version 1 cookies as RFC 2109, downgrading them to version 0 if configured.

        If the policy has no rfc2109_as_netscape setting (or it is None),
        downgrading happens exactly when RFC 2965 handling is off.
        """
        rfc2109_as_ns = getattr(self._policy, 'rfc2109_as_netscape', None)
        if rfc2109_as_ns is None:
            rfc2109_as_ns = not self._policy.rfc2965
        for cookie in cookies:
            if cookie.version == 1:
                cookie.rfc2109 = True
                if rfc2109_as_ns:
                    # treat 2109 cookies as Netscape cookies rather than
                    # as RFC2965 cookies
                    cookie.version = 0

    def make_cookies(self, response, request):
        """Return sequence of Cookie objects extracted from response object."""
        # get cookie-attributes for RFC 2965 and Netscape protocols
        headers = response.info()
        rfc2965_hdrs = headers.get_all("Set-Cookie2", [])
        ns_hdrs = headers.get_all("Set-Cookie", [])
        self._policy._now = self._now = int(time.time())

        rfc2965 = self._policy.rfc2965
        netscape = self._policy.netscape

        if ((not rfc2965_hdrs and not ns_hdrs) or
                (not ns_hdrs and not rfc2965) or
                (not rfc2965_hdrs and not netscape) or
                (not netscape and not rfc2965)):
            return []  # no relevant cookie headers: quick exit

        try:
            cookies = self._cookies_from_attrs_set(
                split_header_words(rfc2965_hdrs), request)
        except Exception:
            _warn_unhandled_exception()
            cookies = []

        if ns_hdrs and netscape:
            try:
                # RFC 2109 and Netscape cookies
                ns_cookies = self._cookies_from_attrs_set(
                    parse_ns_headers(ns_hdrs), request)
            except Exception:
                _warn_unhandled_exception()
                ns_cookies = []
            self._process_rfc2109_cookies(ns_cookies)

            # Look for Netscape cookies (from Set-Cookie headers) that match
            # corresponding RFC 2965 cookies (from Set-Cookie2 headers).
            # For each match, keep the RFC 2965 cookie and ignore the Netscape
            # cookie (RFC 2965 section 9.1). Actually, RFC 2109 cookies are
            # bundled in with the Netscape cookies for this purpose, which is
            # reasonable behaviour.
            if rfc2965:
                lookup = {}
                for cookie in cookies:
                    lookup[(cookie.domain, cookie.path, cookie.name)] = None

                def no_matching_rfc2965(ns_cookie, lookup=lookup):
                    key = ns_cookie.domain, ns_cookie.path, ns_cookie.name
                    return key not in lookup
                ns_cookies = filter(no_matching_rfc2965, ns_cookies)

            if ns_cookies:
                cookies.extend(ns_cookies)

        return cookies

    def set_cookie_if_ok(self, cookie, request):
        """Set a cookie if policy says it's OK to do so."""
        with self._cookies_lock:
            self._policy._now = self._now = int(time.time())

            if self._policy.set_ok(cookie, request):
                self.set_cookie(cookie)

    def set_cookie(self, cookie):
        """Set a cookie, without checking whether or not it should be set."""
        c = self._cookies
        with self._cookies_lock:
            if cookie.domain not in c:
                c[cookie.domain] = {}
            c2 = c[cookie.domain]
            if cookie.path not in c2:
                c2[cookie.path] = {}
            c3 = c2[cookie.path]
            c3[cookie.name] = cookie

    def extract_cookies(self, response, request):
        """Extract cookies from response, where allowable given the request."""
        _debug("extract_cookies: %s", response.info())
        with self._cookies_lock:
            for cookie in self.make_cookies(response, request):
                if self._policy.set_ok(cookie, request):
                    _debug(" setting cookie: %s", cookie)
                    self.set_cookie(cookie)

    def clear(self, domain=None, path=None, name=None):
        """Clear some cookies.

        Invoking this method without arguments will clear all cookies. If
        given a single argument, only cookies belonging to that domain will be
        removed. If given two arguments, cookies belonging to the specified
        path within that domain are removed. If given three arguments, then
        the cookie with the specified name, path and domain is removed.

        Raises KeyError if no matching cookie exists.
        Raises ValueError if name is given without domain and path, or
        path is given without domain.

        """
        if name is not None:
            if (domain is None) or (path is None):
                raise ValueError(
                    "domain and path must be given to remove a cookie by name")
            del self._cookies[domain][path][name]
        elif path is not None:
            if domain is None:
                raise ValueError(
                    "domain must be given to remove cookies by path")
            del self._cookies[domain][path]
        elif domain is not None:
            del self._cookies[domain]
        else:
            self._cookies = {}

    def clear_session_cookies(self):
        """Discard all session cookies.

        Note that the .save() method won't save session cookies anyway, unless
        you ask otherwise by passing a true ignore_discard argument.

        """
        with self._cookies_lock:
            for cookie in self:
                if cookie.discard:
                    self.clear(cookie.domain, cookie.path, cookie.name)

    def clear_expired_cookies(self):
        """Discard all expired cookies.

        You probably don't need to call this method: expired cookies are never
        sent back to the server (provided you're using DefaultCookiePolicy),
        this method is called by CookieJar itself every so often, and the
        .save() method won't save expired cookies anyway (unless you ask
        otherwise by passing a true ignore_expires argument).

        """
        with self._cookies_lock:
            now = time.time()
            for cookie in self:
                if cookie.is_expired(now):
                    self.clear(cookie.domain, cookie.path, cookie.name)

    def __iter__(self):
        """Iterate over every stored cookie."""
        return deepvalues(self._cookies)

    def __len__(self):
        """Return number of contained cookies."""
        return sum(1 for _ in self)

    def __repr__(self):
        """Return the class name followed by the repr() of each cookie."""
        return "<%s[%s]>" % (self.__class__.__name__,
                             ", ".join([repr(cookie) for cookie in self]))

    def __str__(self):
        """Return the class name followed by the str() of each cookie."""
        return "<%s[%s]>" % (self.__class__.__name__,
                             ", ".join([str(cookie) for cookie in self]))
1771
1772
1773 # derives from OSError for backwards-compatibility with Python 2.4.0
class LoadError(OSError):
    """Raised when a cookies file is not in the expected format."""
1775
class FileCookieJar(CookieJar):
    """CookieJar that can be loaded from and saved to a file."""

    def __init__(self, filename=None, delayload=False, policy=None):
        """
        Cookies are NOT loaded from the named file until either the .load() or
        .revert() method is called.

        filename may be any path-like object; it is stored as returned by
        os.fspath().

        """
        CookieJar.__init__(self, policy)
        if filename is not None:
            filename = os.fspath(filename)
        self.filename = filename
        self.delayload = bool(delayload)

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Save cookies to a file."""
        # Not implemented here; LWPCookieJar, for one, overrides it.
        raise NotImplementedError()

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file.

        Falls back to self.filename when filename is None; raises
        ValueError if neither is available.  Parsing is delegated to
        self._really_load().
        """
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(MISSING_FILENAME_TEXT)

        with open(filename) as f:
            self._really_load(f, filename, ignore_discard, ignore_expires)

    def revert(self, filename=None,
               ignore_discard=False, ignore_expires=False):
        """Clear all cookies and reload cookies from a saved file.

        Raises LoadError (or OSError) if reversion is not successful; the
        object's state will not be altered if this happens.

        """
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(MISSING_FILENAME_TEXT)

        with self._cookies_lock:
            # Keep a copy so the jar can be put back if loading fails.
            old_state = copy.deepcopy(self._cookies)
            self._cookies = {}
            try:
                self.load(filename, ignore_discard, ignore_expires)
            except OSError:
                self._cookies = old_state
                raise
1829
1830
def lwp_cookie_str(cookie):
    """Return string representation of Cookie in the LWP cookie file format.

    Actually, the format is extended a bit -- see module docstring.

    """
    fields = [(cookie.name, cookie.value),
              ("path", cookie.path),
              ("domain", cookie.domain)]
    if cookie.port is not None:
        fields.append(("port", cookie.port))
    # Flag attributes are written without a value.
    if cookie.path_specified:
        fields.append(("path_spec", None))
    if cookie.port_specified:
        fields.append(("port_spec", None))
    if cookie.domain_initial_dot:
        fields.append(("domain_dot", None))
    if cookie.secure:
        fields.append(("secure", None))
    if cookie.expires:
        expiry_text = time2isoz(float(cookie.expires))
        fields.append(("expires", expiry_text))
    if cookie.discard:
        fields.append(("discard", None))
    if cookie.comment:
        fields.append(("comment", cookie.comment))
    if cookie.comment_url:
        fields.append(("commenturl", cookie.comment_url))

    # Non-standard attributes follow, in sorted key order.
    fields.extend((key, str(cookie._rest[key]))
                  for key in sorted(cookie._rest.keys()))

    fields.append(("version", str(cookie.version)))

    return join_header_words([fields])
1858
class LWPCookieJar(FileCookieJar):
    """
    The LWPCookieJar saves a sequence of "Set-Cookie3" lines.
    "Set-Cookie3" is the format used by the libwww-perl library, not known
    to be compatible with any browser, but which is easy to read and
    doesn't lose information about RFC 2965 cookies.

    Additional methods

    as_lwp_str(ignore_discard=True, ignore_expires=True)

    """

    def as_lwp_str(self, ignore_discard=True, ignore_expires=True):
        """Return cookies as a string of "\\n"-separated "Set-Cookie3" headers.

        ignore_discard and ignore_expires: see docstring for FileCookieJar.save

        """
        now = time.time()
        r = []
        for cookie in self:
            if not ignore_discard and cookie.discard:
                continue
            if not ignore_expires and cookie.is_expired(now):
                continue
            r.append("Set-Cookie3: %s" % lwp_cookie_str(cookie))
        # The trailing empty element makes the result end with a newline.
        return "\n".join(r+[""])

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Write the cookies to filename in Set-Cookie3 format.

        Falls back to self.filename when filename is None; raises
        ValueError if neither is available.  The file is opened with
        mode 0o600 and truncated.
        """
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(MISSING_FILENAME_TEXT)

        with os.fdopen(
            os.open(filename, os.O_CREAT | os.O_WRONLY | os.O_TRUNC, 0o600),
            'w',
        ) as f:
            # There really isn't an LWP Cookies 2.0 format, but this indicates
            # that there is extra information in here (domain_dot and
            # port_spec) while still being compatible with libwww-perl, I hope.
            f.write("#LWP-Cookies-2.0\n")
            f.write(self.as_lwp_str(ignore_discard, ignore_expires))

    def _really_load(self, f, filename, ignore_discard, ignore_expires):
        """Read Set-Cookie3 lines from the open file f into this jar.

        Raises LoadError if the first line is not the LWP magic header or
        if a line cannot be parsed; OSError is passed through unchanged.
        """
        magic = f.readline()
        if not self.magic_re.search(magic):
            msg = ("%r does not look like a Set-Cookie3 (LWP) format "
                   "file" % filename)
            raise LoadError(msg)

        now = time.time()

        header = "Set-Cookie3:"
        boolean_attrs = ("port_spec", "path_spec", "domain_dot",
                         "secure", "discard")
        value_attrs = ("version",
                       "port", "path", "domain",
                       "expires",
                       "comment", "commenturl")

        # Bound up front so the error message below can always be built,
        # even if the very first readline() in the loop fails.
        line = ""
        try:
            while (line := f.readline()) != "":
                if not line.startswith(header):
                    continue
                line = line[len(header):].strip()

                for data in split_header_words([line]):
                    name, value = data[0]
                    standard = {}
                    rest = {}
                    for k in boolean_attrs:
                        standard[k] = False
                    for k, v in data[1:]:
                        if k is not None:
                            lc = k.lower()
                        else:
                            lc = None
                        # don't lose case distinction for unknown fields
                        if (lc in value_attrs) or (lc in boolean_attrs):
                            k = lc
                        if k in boolean_attrs:
                            if v is None:
                                v = True
                            standard[k] = v
                        elif k in value_attrs:
                            standard[k] = v
                        else:
                            rest[k] = v

                    h = standard.get
                    expires = h("expires")
                    discard = h("discard")
                    if expires is not None:
                        expires = iso2time(expires)
                    if expires is None:
                        # No usable expiry time: treat as a session cookie.
                        discard = True
                    domain = h("domain")
                    domain_specified = domain.startswith(".")
                    c = Cookie(h("version"), name, value,
                               h("port"), h("port_spec"),
                               domain, domain_specified, h("domain_dot"),
                               h("path"), h("path_spec"),
                               h("secure"),
                               expires,
                               discard,
                               h("comment"),
                               h("commenturl"),
                               rest)
                    if not ignore_discard and c.discard:
                        continue
                    if not ignore_expires and c.is_expired(now):
                        continue
                    self.set_cookie(c)
        except OSError:
            raise
        except Exception:
            _warn_unhandled_exception()
            raise LoadError("invalid Set-Cookie3 format file %r: %r" %
                            (filename, line))
1978
1979
1980 class ESC[4;38;5;81mMozillaCookieJar(ESC[4;38;5;149mFileCookieJar):
1981 """
1982
1983 WARNING: you may want to backup your browser's cookies file if you use
1984 this class to save cookies. I *think* it works, but there have been
1985 bugs in the past!
1986
1987 This class differs from CookieJar only in the format it uses to save and
1988 load cookies to and from a file. This class uses the Mozilla/Netscape
1989 `cookies.txt' format. curl and lynx use this file format, too.
1990
1991 Don't expect cookies saved while the browser is running to be noticed by
1992 the browser (in fact, Mozilla on unix will overwrite your saved cookies if
1993 you change them on disk while it's running; on Windows, you probably can't
1994 save at all while the browser is running).
1995
1996 Note that the Mozilla/Netscape format will downgrade RFC2965 cookies to
1997 Netscape cookies on saving.
1998
1999 In particular, the cookie version and port number information is lost,
2000 together with information about whether or not Path, Port and Discard were
2001 specified by the Set-Cookie2 (or Set-Cookie) header, and whether or not the
2002 domain as set in the HTTP header started with a dot (yes, I'm aware some
2003 domains in Netscape files start with a dot and some don't -- trust me, you
2004 really don't want to know any more about this).
2005
2006 Note that though Mozilla and Netscape use the same format, they use
2007 slightly different headers. The class saves cookies using the Netscape
2008 header by default (Mozilla can cope with that).
2009
2010 """
2011
2012 def _really_load(self, f, filename, ignore_discard, ignore_expires):
2013 now = time.time()
2014
2015 if not NETSCAPE_MAGIC_RGX.match(f.readline()):
2016 raise LoadError(
2017 "%r does not look like a Netscape format cookies file" %
2018 filename)
2019
2020 try:
2021 while (line := f.readline()) != "":
2022 rest = {}
2023
2024 # httponly is a cookie flag as defined in rfc6265
2025 # when encoded in a netscape cookie file,
2026 # the line is prepended with "#HttpOnly_"
2027 if line.startswith(HTTPONLY_PREFIX):
2028 rest[HTTPONLY_ATTR] = ""
2029 line = line[len(HTTPONLY_PREFIX):]
2030
2031 # last field may be absent, so keep any trailing tab
2032 if line.endswith("\n"): line = line[:-1]
2033
2034 # skip comments and blank lines XXX what is $ for?
2035 if (line.strip().startswith(("#", "$")) or
2036 line.strip() == ""):
2037 continue
2038
2039 domain, domain_specified, path, secure, expires, name, value = \
2040 line.split("\t")
2041 secure = (secure == "TRUE")
2042 domain_specified = (domain_specified == "TRUE")
2043 if name == "":
2044 # cookies.txt regards 'Set-Cookie: foo' as a cookie
2045 # with no name, whereas http.cookiejar regards it as a
2046 # cookie with no value.
2047 name = value
2048 value = None
2049
2050 initial_dot = domain.startswith(".")
2051 assert domain_specified == initial_dot
2052
2053 discard = False
2054 if expires == "":
2055 expires = None
2056 discard = True
2057
2058 # assume path_specified is false
2059 c = Cookie(0, name, value,
2060 None, False,
2061 domain, domain_specified, initial_dot,
2062 path, False,
2063 secure,
2064 expires,
2065 discard,
2066 None,
2067 None,
2068 rest)
2069 if not ignore_discard and c.discard:
2070 continue
2071 if not ignore_expires and c.is_expired(now):
2072 continue
2073 self.set_cookie(c)
2074
2075 except OSError:
2076 raise
2077 except Exception:
2078 _warn_unhandled_exception()
2079 raise LoadError("invalid Netscape format cookies file %r: %r" %
2080 (filename, line))
2081
2082 def save(self, filename=None, ignore_discard=False, ignore_expires=False):
2083 if filename is None:
2084 if self.filename is not None: filename = self.filename
2085 else: raise ValueError(MISSING_FILENAME_TEXT)
2086
2087 with os.fdopen(
2088 os.open(filename, os.O_CREAT | os.O_WRONLY | os.O_TRUNC, 0o600),
2089 'w',
2090 ) as f:
2091 f.write(NETSCAPE_HEADER_TEXT)
2092 now = time.time()
2093 for cookie in self:
2094 domain = cookie.domain
2095 if not ignore_discard and cookie.discard:
2096 continue
2097 if not ignore_expires and cookie.is_expired(now):
2098 continue
2099 if cookie.secure: secure = "TRUE"
2100 else: secure = "FALSE"
2101 if domain.startswith("."): initial_dot = "TRUE"
2102 else: initial_dot = "FALSE"
2103 if cookie.expires is not None:
2104 expires = str(cookie.expires)
2105 else:
2106 expires = ""
2107 if cookie.value is None:
2108 # cookies.txt regards 'Set-Cookie: foo' as a cookie
2109 # with no name, whereas http.cookiejar regards it as a
2110 # cookie with no value.
2111 name = ""
2112 value = cookie.name
2113 else:
2114 name = cookie.name
2115 value = cookie.value
2116 if cookie.has_nonstandard_attr(HTTPONLY_ATTR):
2117 domain = HTTPONLY_PREFIX + domain
2118 f.write(
2119 "\t".join([domain, initial_dot, cookie.path,
2120 secure, expires, name, value])+
2121 "\n")