1  '''"Executable documentation" for the pickle module.
       2  
       3  Extensive comments about the pickle protocols and pickle-machine opcodes
       4  can be found here.  Some functions meant for external use:
       5  
       6  genops(pickle)
       7     Generate all the opcodes in a pickle, as (opcode, arg, position) triples.
       8  
       9  dis(pickle, out=None, memo=None, indentlevel=4)
      10     Print a symbolic disassembly of a pickle.
      11  '''
      12  
      13  import codecs
      14  import io
      15  import pickle
      16  import re
      17  import sys
      18  
      19  __all__ = ['dis', 'genops', 'optimize']
      20  
      21  bytes_types = pickle.bytes_types
      22  
      23  # Other ideas:
      24  #
      25  # - A pickle verifier:  read a pickle and check it exhaustively for
      26  #   well-formedness.  dis() does a lot of this already.
      27  #
      28  # - A protocol identifier:  examine a pickle and return its protocol number
      29  #   (== the highest .proto attr value among all the opcodes in the pickle).
      30  #   dis() already prints this info at the end.
      31  #
      32  # - A pickle optimizer:  for example, tuple-building code is sometimes more
      33  #   elaborate than necessary, catering for the possibility that the tuple
      34  #   is recursive.  Or lots of times a PUT is generated that's never accessed
      35  #   by a later GET.
      36  
      37  
      38  # "A pickle" is a program for a virtual pickle machine (PM, but more accurately
      39  # called an unpickling machine).  It's a sequence of opcodes, interpreted by the
      40  # PM, building an arbitrarily complex Python object.
      41  #
      42  # For the most part, the PM is very simple:  there are no looping, testing, or
      43  # conditional instructions, no arithmetic and no function calls.  Opcodes are
      44  # executed once each, from first to last, until a STOP opcode is reached.
      45  #
      46  # The PM has two data areas, "the stack" and "the memo".
      47  #
      48  # Many opcodes push Python objects onto the stack; e.g., INT pushes a Python
      49  # integer object on the stack, whose value is gotten from a decimal string
      50  # literal immediately following the INT opcode in the pickle bytestream.  Other
      51  # opcodes take Python objects off the stack.  The result of unpickling is
      52  # whatever object is left on the stack when the final STOP opcode is executed.
      53  #
      54  # The memo is simply an array of objects, or it can be implemented as a dict
      55  # mapping little integers to objects.  The memo serves as the PM's "long term
      56  # memory", and the little integers indexing the memo are akin to variable
      57  # names.  Some opcodes pop a stack object into the memo at a given index,
      58  # and others push a memo object at a given index onto the stack again.
      59  #
      60  # At heart, that's all the PM has.  Subtleties arise for these reasons:
      61  #
      62  # + Object identity.  Objects can be arbitrarily complex, and subobjects
      63  #   may be shared (for example, the list [a, a] refers to the same object a
      64  #   twice).  It can be vital that unpickling recreate an isomorphic object
      65  #   graph, faithfully reproducing sharing.
      66  #
      67  # + Recursive objects.  For example, after "L = []; L.append(L)", L is a
      68  #   list, and L[0] is the same list.  This is related to the object identity
      69  #   point, and some sequences of pickle opcodes are subtle in order to
      70  #   get the right result in all cases.
      71  #
      72  # + Things pickle doesn't know everything about.  Examples of things pickle
      73  #   does know everything about are Python's builtin scalar and container
      74  #   types, like ints and tuples.  They generally have opcodes dedicated to
      75  #   them.  For things like module references and instances of user-defined
      76  #   classes, pickle's knowledge is limited.  Historically, many enhancements
      77  #   have been made to the pickle protocol in order to do a better (faster,
      78  #   and/or more compact) job on those.
      79  #
      80  # + Backward compatibility and micro-optimization.  As explained below,
      81  #   pickle opcodes never go away, not even when better ways to do a thing
      82  #   get invented.  The repertoire of the PM just keeps growing over time.
      83  #   For example, protocol 0 had two opcodes for building Python integers (INT
      84  #   and LONG), protocol 1 added three more for more-efficient pickling of short
      85  #   integers, and protocol 2 added two more for more-efficient pickling of
      86  #   long integers (before protocol 2, the only ways to pickle a Python long
      87  #   took time quadratic in the number of digits, for both pickling and
      88  #   unpickling).  "Opcode bloat" isn't so much a subtlety as a source of
      89  #   wearying complication.
      90  #
      91  #
      92  # Pickle protocols:
      93  #
      94  # For compatibility, the meaning of a pickle opcode never changes.  Instead new
      95  # pickle opcodes get added, and each version's unpickler can handle all the
      96  # pickle opcodes in all protocol versions to date.  So old pickles continue to
      97  # be readable forever.  The pickler can generally be told to restrict itself to
      98  # the subset of opcodes available under previous protocol versions too, so that
      99  # users can create pickles under the current version readable by older
     100  # versions.  However, a pickle does not contain its version number embedded
     101  # within it.  If an older unpickler tries to read a pickle using a later
     102  # protocol, the result is most likely an exception due to seeing an unknown (in
     103  # the older unpickler) opcode.
     104  #
     105  # The original pickle used what's now called "protocol 0", and what was called
     106  # "text mode" before Python 2.3.  The entire pickle bytestream is made up of
     107  # printable 7-bit ASCII characters, plus the newline character, in protocol 0.
     108  # That's why it was called text mode.  Protocol 0 is small and elegant, but
     109  # sometimes painfully inefficient.
     110  #
     111  # The second major set of additions is now called "protocol 1", and was called
     112  # "binary mode" before Python 2.3.  This added many opcodes with arguments
     113  # consisting of arbitrary bytes, including NUL bytes and unprintable "high bit"
     114  # bytes.  Binary mode pickles can be substantially smaller than equivalent
     115  # text mode pickles, and sometimes faster too; e.g., BININT represents a 4-byte
     116  # int as 4 bytes following the opcode, which is cheaper to unpickle than the
     117  # (perhaps) 11-character decimal string attached to INT.  Protocol 1 also added
     118  # a number of opcodes that operate on many stack elements at once (like APPENDS
     119  # and SETITEMS), and "shortcut" opcodes (like EMPTY_DICT and EMPTY_TUPLE).
     120  #
     121  # The third major set of additions came in Python 2.3, and is called "protocol
     122  # 2".  This added:
     123  #
     124  # - A better way to pickle instances of new-style classes (NEWOBJ).
     125  #
     126  # - A way for a pickle to identify its protocol (PROTO).
     127  #
     128  # - Time- and space- efficient pickling of long ints (LONG{1,4}).
     129  #
# - Shortcuts for small tuples (TUPLE{1,2,3}).
     131  #
     132  # - Dedicated opcodes for bools (NEWTRUE, NEWFALSE).
     133  #
     134  # - The "extension registry", a vector of popular objects that can be pushed
     135  #   efficiently by index (EXT{1,2,4}).  This is akin to the memo and GET, but
     136  #   the registry contents are predefined (there's nothing akin to the memo's
     137  #   PUT).
     138  #
     139  # Another independent change with Python 2.3 is the abandonment of any
     140  # pretense that it might be safe to load pickles received from untrusted
     141  # parties -- no sufficient security analysis has been done to guarantee
     142  # this and there isn't a use case that warrants the expense of such an
     143  # analysis.
     144  #
     145  # To this end, all tests for __safe_for_unpickling__ or for
     146  # copyreg.safe_constructors are removed from the unpickling code.
     147  # References to these variables in the descriptions below are to be seen
     148  # as describing unpickling in Python 2.2 and before.
     149  
     150  
     151  # Meta-rule:  Descriptions are stored in instances of descriptor objects,
     152  # with plain constructors.  No meta-language is defined from which
     153  # descriptors could be constructed.  If you want, e.g., XML, write a little
     154  # program to generate XML from the objects.
     155  
     156  ##############################################################################
     157  # Some pickle opcodes have an argument, following the opcode in the
     158  # bytestream.  An argument is of a specific type, described by an instance
     159  # of ArgumentDescriptor.  These are not to be confused with arguments taken
     160  # off the stack -- ArgumentDescriptor applies only to arguments embedded in
     161  # the opcode stream, immediately following an opcode.
     162  
     163  # Represents the number of bytes consumed by an argument delimited by the
     164  # next newline character.
     165  UP_TO_NEWLINE = -1
     166  
     167  # Represents the number of bytes consumed by a two-argument opcode where
     168  # the first argument gives the number of bytes in the second argument.
     169  TAKEN_FROM_ARGUMENT1  = -2   # num bytes is 1-byte unsigned int
     170  TAKEN_FROM_ARGUMENT4  = -3   # num bytes is 4-byte signed little-endian int
     171  TAKEN_FROM_ARGUMENT4U = -4   # num bytes is 4-byte unsigned little-endian int
     172  TAKEN_FROM_ARGUMENT8U = -5   # num bytes is 8-byte unsigned little-endian int
     173  
     174  class ESC[4;38;5;81mArgumentDescriptor(ESC[4;38;5;149mobject):
     175      __slots__ = (
     176          # name of descriptor record, also a module global name; a string
     177          'name',
     178  
     179          # length of argument, in bytes; an int; UP_TO_NEWLINE and
     180          # TAKEN_FROM_ARGUMENT{1,4,8} are negative values for variable-length
     181          # cases
     182          'n',
     183  
     184          # a function taking a file-like object, reading this kind of argument
     185          # from the object at the current position, advancing the current
     186          # position by n bytes, and returning the value of the argument
     187          'reader',
     188  
     189          # human-readable docs for this arg descriptor; a string
     190          'doc',
     191      )
     192  
     193      def __init__(self, name, n, reader, doc):
     194          assert isinstance(name, str)
     195          self.name = name
     196  
     197          assert isinstance(n, int) and (n >= 0 or
     198                                         n in (UP_TO_NEWLINE,
     199                                               TAKEN_FROM_ARGUMENT1,
     200                                               TAKEN_FROM_ARGUMENT4,
     201                                               TAKEN_FROM_ARGUMENT4U,
     202                                               TAKEN_FROM_ARGUMENT8U))
     203          self.n = n
     204  
     205          self.reader = reader
     206  
     207          assert isinstance(doc, str)
     208          self.doc = doc
     209  
     210  from struct import unpack as _unpack
     211  
     212  def read_uint1(f):
     213      r"""
     214      >>> import io
     215      >>> read_uint1(io.BytesIO(b'\xff'))
     216      255
     217      """
     218  
     219      data = f.read(1)
     220      if data:
     221          return data[0]
     222      raise ValueError("not enough data in stream to read uint1")
     223  
     224  uint1 = ArgumentDescriptor(
     225              name='uint1',
     226              n=1,
     227              reader=read_uint1,
     228              doc="One-byte unsigned integer.")
     229  
     230  
     231  def read_uint2(f):
     232      r"""
     233      >>> import io
     234      >>> read_uint2(io.BytesIO(b'\xff\x00'))
     235      255
     236      >>> read_uint2(io.BytesIO(b'\xff\xff'))
     237      65535
     238      """
     239  
     240      data = f.read(2)
     241      if len(data) == 2:
     242          return _unpack("<H", data)[0]
     243      raise ValueError("not enough data in stream to read uint2")
     244  
     245  uint2 = ArgumentDescriptor(
     246              name='uint2',
     247              n=2,
     248              reader=read_uint2,
     249              doc="Two-byte unsigned integer, little-endian.")
     250  
     251  
     252  def read_int4(f):
     253      r"""
     254      >>> import io
     255      >>> read_int4(io.BytesIO(b'\xff\x00\x00\x00'))
     256      255
     257      >>> read_int4(io.BytesIO(b'\x00\x00\x00\x80')) == -(2**31)
     258      True
     259      """
     260  
     261      data = f.read(4)
     262      if len(data) == 4:
     263          return _unpack("<i", data)[0]
     264      raise ValueError("not enough data in stream to read int4")
     265  
     266  int4 = ArgumentDescriptor(
     267             name='int4',
     268             n=4,
     269             reader=read_int4,
     270             doc="Four-byte signed integer, little-endian, 2's complement.")
     271  
     272  
     273  def read_uint4(f):
     274      r"""
     275      >>> import io
     276      >>> read_uint4(io.BytesIO(b'\xff\x00\x00\x00'))
     277      255
     278      >>> read_uint4(io.BytesIO(b'\x00\x00\x00\x80')) == 2**31
     279      True
     280      """
     281  
     282      data = f.read(4)
     283      if len(data) == 4:
     284          return _unpack("<I", data)[0]
     285      raise ValueError("not enough data in stream to read uint4")
     286  
     287  uint4 = ArgumentDescriptor(
     288              name='uint4',
     289              n=4,
     290              reader=read_uint4,
     291              doc="Four-byte unsigned integer, little-endian.")
     292  
     293  
     294  def read_uint8(f):
     295      r"""
     296      >>> import io
     297      >>> read_uint8(io.BytesIO(b'\xff\x00\x00\x00\x00\x00\x00\x00'))
     298      255
     299      >>> read_uint8(io.BytesIO(b'\xff' * 8)) == 2**64-1
     300      True
     301      """
     302  
     303      data = f.read(8)
     304      if len(data) == 8:
     305          return _unpack("<Q", data)[0]
     306      raise ValueError("not enough data in stream to read uint8")
     307  
     308  uint8 = ArgumentDescriptor(
     309              name='uint8',
     310              n=8,
     311              reader=read_uint8,
     312              doc="Eight-byte unsigned integer, little-endian.")
     313  
     314  
     315  def read_stringnl(f, decode=True, stripquotes=True):
     316      r"""
     317      >>> import io
     318      >>> read_stringnl(io.BytesIO(b"'abcd'\nefg\n"))
     319      'abcd'
     320  
     321      >>> read_stringnl(io.BytesIO(b"\n"))
     322      Traceback (most recent call last):
     323      ...
     324      ValueError: no string quotes around b''
     325  
     326      >>> read_stringnl(io.BytesIO(b"\n"), stripquotes=False)
     327      ''
     328  
     329      >>> read_stringnl(io.BytesIO(b"''\n"))
     330      ''
     331  
     332      >>> read_stringnl(io.BytesIO(b'"abcd"'))
     333      Traceback (most recent call last):
     334      ...
     335      ValueError: no newline found when trying to read stringnl
     336  
     337      Embedded escapes are undone in the result.
     338      >>> read_stringnl(io.BytesIO(br"'a\n\\b\x00c\td'" + b"\n'e'"))
     339      'a\n\\b\x00c\td'
     340      """
     341  
     342      data = f.readline()
     343      if not data.endswith(b'\n'):
     344          raise ValueError("no newline found when trying to read stringnl")
     345      data = data[:-1]    # lose the newline
     346  
     347      if stripquotes:
     348          for q in (b'"', b"'"):
     349              if data.startswith(q):
     350                  if not data.endswith(q):
     351                      raise ValueError("strinq quote %r not found at both "
     352                                       "ends of %r" % (q, data))
     353                  data = data[1:-1]
     354                  break
     355          else:
     356              raise ValueError("no string quotes around %r" % data)
     357  
     358      if decode:
     359          data = codecs.escape_decode(data)[0].decode("ascii")
     360      return data
     361  
     362  stringnl = ArgumentDescriptor(
     363                 name='stringnl',
     364                 n=UP_TO_NEWLINE,
     365                 reader=read_stringnl,
     366                 doc="""A newline-terminated string.
     367  
     368                     This is a repr-style string, with embedded escapes, and
     369                     bracketing quotes.
     370                     """)
     371  
     372  def read_stringnl_noescape(f):
     373      return read_stringnl(f, stripquotes=False)
     374  
     375  stringnl_noescape = ArgumentDescriptor(
     376                          name='stringnl_noescape',
     377                          n=UP_TO_NEWLINE,
     378                          reader=read_stringnl_noescape,
     379                          doc="""A newline-terminated string.
     380  
     381                          This is a str-style string, without embedded escapes,
     382                          or bracketing quotes.  It should consist solely of
     383                          printable ASCII characters.
     384                          """)
     385  
     386  def read_stringnl_noescape_pair(f):
     387      r"""
     388      >>> import io
     389      >>> read_stringnl_noescape_pair(io.BytesIO(b"Queue\nEmpty\njunk"))
     390      'Queue Empty'
     391      """
     392  
     393      return "%s %s" % (read_stringnl_noescape(f), read_stringnl_noescape(f))
     394  
     395  stringnl_noescape_pair = ArgumentDescriptor(
     396                               name='stringnl_noescape_pair',
     397                               n=UP_TO_NEWLINE,
     398                               reader=read_stringnl_noescape_pair,
     399                               doc="""A pair of newline-terminated strings.
     400  
     401                               These are str-style strings, without embedded
     402                               escapes, or bracketing quotes.  They should
     403                               consist solely of printable ASCII characters.
     404                               The pair is returned as a single string, with
     405                               a single blank separating the two strings.
     406                               """)
     407  
     408  
     409  def read_string1(f):
     410      r"""
     411      >>> import io
     412      >>> read_string1(io.BytesIO(b"\x00"))
     413      ''
     414      >>> read_string1(io.BytesIO(b"\x03abcdef"))
     415      'abc'
     416      """
     417  
     418      n = read_uint1(f)
     419      assert n >= 0
     420      data = f.read(n)
     421      if len(data) == n:
     422          return data.decode("latin-1")
     423      raise ValueError("expected %d bytes in a string1, but only %d remain" %
     424                       (n, len(data)))
     425  
     426  string1 = ArgumentDescriptor(
     427                name="string1",
     428                n=TAKEN_FROM_ARGUMENT1,
     429                reader=read_string1,
     430                doc="""A counted string.
     431  
     432                The first argument is a 1-byte unsigned int giving the number
     433                of bytes in the string, and the second argument is that many
     434                bytes.
     435                """)
     436  
     437  
     438  def read_string4(f):
     439      r"""
     440      >>> import io
     441      >>> read_string4(io.BytesIO(b"\x00\x00\x00\x00abc"))
     442      ''
     443      >>> read_string4(io.BytesIO(b"\x03\x00\x00\x00abcdef"))
     444      'abc'
     445      >>> read_string4(io.BytesIO(b"\x00\x00\x00\x03abcdef"))
     446      Traceback (most recent call last):
     447      ...
     448      ValueError: expected 50331648 bytes in a string4, but only 6 remain
     449      """
     450  
     451      n = read_int4(f)
     452      if n < 0:
     453          raise ValueError("string4 byte count < 0: %d" % n)
     454      data = f.read(n)
     455      if len(data) == n:
     456          return data.decode("latin-1")
     457      raise ValueError("expected %d bytes in a string4, but only %d remain" %
     458                       (n, len(data)))
     459  
     460  string4 = ArgumentDescriptor(
     461                name="string4",
     462                n=TAKEN_FROM_ARGUMENT4,
     463                reader=read_string4,
     464                doc="""A counted string.
     465  
     466                The first argument is a 4-byte little-endian signed int giving
     467                the number of bytes in the string, and the second argument is
     468                that many bytes.
     469                """)
     470  
     471  
     472  def read_bytes1(f):
     473      r"""
     474      >>> import io
     475      >>> read_bytes1(io.BytesIO(b"\x00"))
     476      b''
     477      >>> read_bytes1(io.BytesIO(b"\x03abcdef"))
     478      b'abc'
     479      """
     480  
     481      n = read_uint1(f)
     482      assert n >= 0
     483      data = f.read(n)
     484      if len(data) == n:
     485          return data
     486      raise ValueError("expected %d bytes in a bytes1, but only %d remain" %
     487                       (n, len(data)))
     488  
     489  bytes1 = ArgumentDescriptor(
     490                name="bytes1",
     491                n=TAKEN_FROM_ARGUMENT1,
     492                reader=read_bytes1,
     493                doc="""A counted bytes string.
     494  
     495                The first argument is a 1-byte unsigned int giving the number
     496                of bytes, and the second argument is that many bytes.
     497                """)
     498  
     499  
     500  def read_bytes4(f):
     501      r"""
     502      >>> import io
     503      >>> read_bytes4(io.BytesIO(b"\x00\x00\x00\x00abc"))
     504      b''
     505      >>> read_bytes4(io.BytesIO(b"\x03\x00\x00\x00abcdef"))
     506      b'abc'
     507      >>> read_bytes4(io.BytesIO(b"\x00\x00\x00\x03abcdef"))
     508      Traceback (most recent call last):
     509      ...
     510      ValueError: expected 50331648 bytes in a bytes4, but only 6 remain
     511      """
     512  
     513      n = read_uint4(f)
     514      assert n >= 0
     515      if n > sys.maxsize:
     516          raise ValueError("bytes4 byte count > sys.maxsize: %d" % n)
     517      data = f.read(n)
     518      if len(data) == n:
     519          return data
     520      raise ValueError("expected %d bytes in a bytes4, but only %d remain" %
     521                       (n, len(data)))
     522  
     523  bytes4 = ArgumentDescriptor(
     524                name="bytes4",
     525                n=TAKEN_FROM_ARGUMENT4U,
     526                reader=read_bytes4,
     527                doc="""A counted bytes string.
     528  
     529                The first argument is a 4-byte little-endian unsigned int giving
     530                the number of bytes, and the second argument is that many bytes.
     531                """)
     532  
     533  
     534  def read_bytes8(f):
     535      r"""
     536      >>> import io, struct, sys
     537      >>> read_bytes8(io.BytesIO(b"\x00\x00\x00\x00\x00\x00\x00\x00abc"))
     538      b''
     539      >>> read_bytes8(io.BytesIO(b"\x03\x00\x00\x00\x00\x00\x00\x00abcdef"))
     540      b'abc'
     541      >>> bigsize8 = struct.pack("<Q", sys.maxsize//3)
     542      >>> read_bytes8(io.BytesIO(bigsize8 + b"abcdef"))  #doctest: +ELLIPSIS
     543      Traceback (most recent call last):
     544      ...
     545      ValueError: expected ... bytes in a bytes8, but only 6 remain
     546      """
     547  
     548      n = read_uint8(f)
     549      assert n >= 0
     550      if n > sys.maxsize:
     551          raise ValueError("bytes8 byte count > sys.maxsize: %d" % n)
     552      data = f.read(n)
     553      if len(data) == n:
     554          return data
     555      raise ValueError("expected %d bytes in a bytes8, but only %d remain" %
     556                       (n, len(data)))
     557  
     558  bytes8 = ArgumentDescriptor(
     559                name="bytes8",
     560                n=TAKEN_FROM_ARGUMENT8U,
     561                reader=read_bytes8,
     562                doc="""A counted bytes string.
     563  
     564                The first argument is an 8-byte little-endian unsigned int giving
     565                the number of bytes, and the second argument is that many bytes.
     566                """)
     567  
     568  
     569  def read_bytearray8(f):
     570      r"""
     571      >>> import io, struct, sys
     572      >>> read_bytearray8(io.BytesIO(b"\x00\x00\x00\x00\x00\x00\x00\x00abc"))
     573      bytearray(b'')
     574      >>> read_bytearray8(io.BytesIO(b"\x03\x00\x00\x00\x00\x00\x00\x00abcdef"))
     575      bytearray(b'abc')
     576      >>> bigsize8 = struct.pack("<Q", sys.maxsize//3)
     577      >>> read_bytearray8(io.BytesIO(bigsize8 + b"abcdef"))  #doctest: +ELLIPSIS
     578      Traceback (most recent call last):
     579      ...
     580      ValueError: expected ... bytes in a bytearray8, but only 6 remain
     581      """
     582  
     583      n = read_uint8(f)
     584      assert n >= 0
     585      if n > sys.maxsize:
     586          raise ValueError("bytearray8 byte count > sys.maxsize: %d" % n)
     587      data = f.read(n)
     588      if len(data) == n:
     589          return bytearray(data)
     590      raise ValueError("expected %d bytes in a bytearray8, but only %d remain" %
     591                       (n, len(data)))
     592  
     593  bytearray8 = ArgumentDescriptor(
     594                name="bytearray8",
     595                n=TAKEN_FROM_ARGUMENT8U,
     596                reader=read_bytearray8,
     597                doc="""A counted bytearray.
     598  
     599                The first argument is an 8-byte little-endian unsigned int giving
     600                the number of bytes, and the second argument is that many bytes.
     601                """)
     602  
     603  def read_unicodestringnl(f):
     604      r"""
     605      >>> import io
     606      >>> read_unicodestringnl(io.BytesIO(b"abc\\uabcd\njunk")) == 'abc\uabcd'
     607      True
     608      """
     609  
     610      data = f.readline()
     611      if not data.endswith(b'\n'):
     612          raise ValueError("no newline found when trying to read "
     613                           "unicodestringnl")
     614      data = data[:-1]    # lose the newline
     615      return str(data, 'raw-unicode-escape')
     616  
     617  unicodestringnl = ArgumentDescriptor(
     618                        name='unicodestringnl',
     619                        n=UP_TO_NEWLINE,
     620                        reader=read_unicodestringnl,
     621                        doc="""A newline-terminated Unicode string.
     622  
     623                        This is raw-unicode-escape encoded, so consists of
     624                        printable ASCII characters, and may contain embedded
     625                        escape sequences.
     626                        """)
     627  
     628  
     629  def read_unicodestring1(f):
     630      r"""
     631      >>> import io
     632      >>> s = 'abcd\uabcd'
     633      >>> enc = s.encode('utf-8')
     634      >>> enc
     635      b'abcd\xea\xaf\x8d'
     636      >>> n = bytes([len(enc)])  # little-endian 1-byte length
     637      >>> t = read_unicodestring1(io.BytesIO(n + enc + b'junk'))
     638      >>> s == t
     639      True
     640  
     641      >>> read_unicodestring1(io.BytesIO(n + enc[:-1]))
     642      Traceback (most recent call last):
     643      ...
     644      ValueError: expected 7 bytes in a unicodestring1, but only 6 remain
     645      """
     646  
     647      n = read_uint1(f)
     648      assert n >= 0
     649      data = f.read(n)
     650      if len(data) == n:
     651          return str(data, 'utf-8', 'surrogatepass')
     652      raise ValueError("expected %d bytes in a unicodestring1, but only %d "
     653                       "remain" % (n, len(data)))
     654  
     655  unicodestring1 = ArgumentDescriptor(
     656                      name="unicodestring1",
     657                      n=TAKEN_FROM_ARGUMENT1,
     658                      reader=read_unicodestring1,
     659                      doc="""A counted Unicode string.
     660  
     661                      The first argument is a 1-byte little-endian signed int
     662                      giving the number of bytes in the string, and the second
     663                      argument-- the UTF-8 encoding of the Unicode string --
     664                      contains that many bytes.
     665                      """)
     666  
     667  
     668  def read_unicodestring4(f):
     669      r"""
     670      >>> import io
     671      >>> s = 'abcd\uabcd'
     672      >>> enc = s.encode('utf-8')
     673      >>> enc
     674      b'abcd\xea\xaf\x8d'
     675      >>> n = bytes([len(enc), 0, 0, 0])  # little-endian 4-byte length
     676      >>> t = read_unicodestring4(io.BytesIO(n + enc + b'junk'))
     677      >>> s == t
     678      True
     679  
     680      >>> read_unicodestring4(io.BytesIO(n + enc[:-1]))
     681      Traceback (most recent call last):
     682      ...
     683      ValueError: expected 7 bytes in a unicodestring4, but only 6 remain
     684      """
     685  
     686      n = read_uint4(f)
     687      assert n >= 0
     688      if n > sys.maxsize:
     689          raise ValueError("unicodestring4 byte count > sys.maxsize: %d" % n)
     690      data = f.read(n)
     691      if len(data) == n:
     692          return str(data, 'utf-8', 'surrogatepass')
     693      raise ValueError("expected %d bytes in a unicodestring4, but only %d "
     694                       "remain" % (n, len(data)))
     695  
     696  unicodestring4 = ArgumentDescriptor(
     697                      name="unicodestring4",
     698                      n=TAKEN_FROM_ARGUMENT4U,
     699                      reader=read_unicodestring4,
     700                      doc="""A counted Unicode string.
     701  
     702                      The first argument is a 4-byte little-endian signed int
     703                      giving the number of bytes in the string, and the second
     704                      argument-- the UTF-8 encoding of the Unicode string --
     705                      contains that many bytes.
     706                      """)
     707  
     708  
     709  def read_unicodestring8(f):
     710      r"""
     711      >>> import io
     712      >>> s = 'abcd\uabcd'
     713      >>> enc = s.encode('utf-8')
     714      >>> enc
     715      b'abcd\xea\xaf\x8d'
     716      >>> n = bytes([len(enc)]) + b'\0' * 7  # little-endian 8-byte length
     717      >>> t = read_unicodestring8(io.BytesIO(n + enc + b'junk'))
     718      >>> s == t
     719      True
     720  
     721      >>> read_unicodestring8(io.BytesIO(n + enc[:-1]))
     722      Traceback (most recent call last):
     723      ...
     724      ValueError: expected 7 bytes in a unicodestring8, but only 6 remain
     725      """
     726  
     727      n = read_uint8(f)
     728      assert n >= 0
     729      if n > sys.maxsize:
     730          raise ValueError("unicodestring8 byte count > sys.maxsize: %d" % n)
     731      data = f.read(n)
     732      if len(data) == n:
     733          return str(data, 'utf-8', 'surrogatepass')
     734      raise ValueError("expected %d bytes in a unicodestring8, but only %d "
     735                       "remain" % (n, len(data)))
     736  
     737  unicodestring8 = ArgumentDescriptor(
     738                      name="unicodestring8",
     739                      n=TAKEN_FROM_ARGUMENT8U,
     740                      reader=read_unicodestring8,
     741                      doc="""A counted Unicode string.
     742  
     743                      The first argument is an 8-byte little-endian signed int
     744                      giving the number of bytes in the string, and the second
     745                      argument-- the UTF-8 encoding of the Unicode string --
     746                      contains that many bytes.
     747                      """)
     748  
     749  
     750  def read_decimalnl_short(f):
     751      r"""
     752      >>> import io
     753      >>> read_decimalnl_short(io.BytesIO(b"1234\n56"))
     754      1234
     755  
     756      >>> read_decimalnl_short(io.BytesIO(b"1234L\n56"))
     757      Traceback (most recent call last):
     758      ...
     759      ValueError: invalid literal for int() with base 10: b'1234L'
     760      """
     761  
     762      s = read_stringnl(f, decode=False, stripquotes=False)
     763  
     764      # There's a hack for True and False here.
     765      if s == b"00":
     766          return False
     767      elif s == b"01":
     768          return True
     769  
     770      return int(s)
     771  
     772  def read_decimalnl_long(f):
     773      r"""
     774      >>> import io
     775  
     776      >>> read_decimalnl_long(io.BytesIO(b"1234L\n56"))
     777      1234
     778  
     779      >>> read_decimalnl_long(io.BytesIO(b"123456789012345678901234L\n6"))
     780      123456789012345678901234
     781      """
     782  
     783      s = read_stringnl(f, decode=False, stripquotes=False)
     784      if s[-1:] == b'L':
     785          s = s[:-1]
     786      return int(s)
     787  
     788  
# Descriptor for a newline-terminated decimal argument.  Its reader,
# read_decimalnl_short, additionally maps the literals b"00" and b"01" to
# False and True before falling back to int().
decimalnl_short = ArgumentDescriptor(
                      name='decimalnl_short',
                      n=UP_TO_NEWLINE,
                      reader=read_decimalnl_short,
                      doc="""A newline-terminated decimal integer literal.

                          This never has a trailing 'L', and the integer fit
                          in a short Python int on the box where the pickle
                          was written -- but there's no guarantee it will fit
                          in a short Python int on the box where the pickle
                          is read.
                          """)
     801  
# Descriptor for a newline-terminated decimal argument.  Its reader,
# read_decimalnl_long, drops one trailing b'L' (when present) and then
# converts the remaining text with int().
decimalnl_long = ArgumentDescriptor(
                     name='decimalnl_long',
                     n=UP_TO_NEWLINE,
                     reader=read_decimalnl_long,
                     doc="""A newline-terminated decimal integer literal.

                         This has a trailing 'L', and can represent integers
                         of any size.
                         """)
     811  
     812  
     813  def read_floatnl(f):
     814      r"""
     815      >>> import io
     816      >>> read_floatnl(io.BytesIO(b"-1.25\n6"))
     817      -1.25
     818      """
     819      s = read_stringnl(f, decode=False, stripquotes=False)
     820      return float(s)
     821  
# Descriptor for a newline-terminated float argument; its reader,
# read_floatnl, converts the raw line with float().
floatnl = ArgumentDescriptor(
              name='floatnl',
              n=UP_TO_NEWLINE,
              reader=read_floatnl,
              doc="""A newline-terminated decimal floating literal.

              In general this requires 17 significant digits for roundtrip
              identity, and pickling then unpickling infinities, NaNs, and
              minus zero doesn't work across boxes, or on some boxes even
              on itself (e.g., Windows can't read the strings it produces
              for infinities or NaNs).
              """)
     834  
     835  def read_float8(f):
     836      r"""
     837      >>> import io, struct
     838      >>> raw = struct.pack(">d", -1.25)
     839      >>> raw
     840      b'\xbf\xf4\x00\x00\x00\x00\x00\x00'
     841      >>> read_float8(io.BytesIO(raw + b"\n"))
     842      -1.25
     843      """
     844  
     845      data = f.read(8)
     846      if len(data) == 8:
     847          return _unpack(">d", data)[0]
     848      raise ValueError("not enough data in stream to read float8")
     849  
     850  
# Descriptor for a fixed-size 8-byte argument (n=8); its reader,
# read_float8, decodes the bytes with the big-endian struct format '>d'.
float8 = ArgumentDescriptor(
             name='float8',
             n=8,
             reader=read_float8,
             doc="""An 8-byte binary representation of a float, big-endian.

             The format is unique to Python, and shared with the struct
             module (format string '>d') "in theory" (the struct and pickle
             implementations don't share the code -- they should).  It's
             strongly related to the IEEE-754 double format, and, in normal
             cases, is in fact identical to the big-endian 754 double format.
             On other boxes the dynamic range is limited to that of a 754
             double, and "add a half and chop" rounding is used to reduce
             the precision to 53 bits.  However, even on a 754 box,
             infinities, NaNs, and minus zero may not be handled correctly
             (may not survive roundtrip pickling intact).
             """)
     868  
     869  # Protocol 2 formats
     870  
     871  from pickle import decode_long
     872  
     873  def read_long1(f):
     874      r"""
     875      >>> import io
     876      >>> read_long1(io.BytesIO(b"\x00"))
     877      0
     878      >>> read_long1(io.BytesIO(b"\x02\xff\x00"))
     879      255
     880      >>> read_long1(io.BytesIO(b"\x02\xff\x7f"))
     881      32767
     882      >>> read_long1(io.BytesIO(b"\x02\x00\xff"))
     883      -256
     884      >>> read_long1(io.BytesIO(b"\x02\x00\x80"))
     885      -32768
     886      """
     887  
     888      n = read_uint1(f)
     889      data = f.read(n)
     890      if len(data) != n:
     891          raise ValueError("not enough data in stream to read long1")
     892      return decode_long(data)
     893  
     894  long1 = ArgumentDescriptor(
     895      name="long1",
     896      n=TAKEN_FROM_ARGUMENT1,
     897      reader=read_long1,
     898      doc="""A binary long, little-endian, using 1-byte size.
     899  
     900      This first reads one byte as an unsigned size, then reads that
     901      many bytes and interprets them as a little-endian 2's-complement long.
     902      If the size is 0, that's taken as a shortcut for the long 0L.
     903      """)
     904  
     905  def read_long4(f):
     906      r"""
     907      >>> import io
     908      >>> read_long4(io.BytesIO(b"\x02\x00\x00\x00\xff\x00"))
     909      255
     910      >>> read_long4(io.BytesIO(b"\x02\x00\x00\x00\xff\x7f"))
     911      32767
     912      >>> read_long4(io.BytesIO(b"\x02\x00\x00\x00\x00\xff"))
     913      -256
     914      >>> read_long4(io.BytesIO(b"\x02\x00\x00\x00\x00\x80"))
     915      -32768
     916      >>> read_long1(io.BytesIO(b"\x00\x00\x00\x00"))
     917      0
     918      """
     919  
     920      n = read_int4(f)
     921      if n < 0:
     922          raise ValueError("long4 byte count < 0: %d" % n)
     923      data = f.read(n)
     924      if len(data) != n:
     925          raise ValueError("not enough data in stream to read long4")
     926      return decode_long(data)
     927  
# Descriptor for an integer stored as a 4-byte signed size followed by that
# many payload bytes; read_long4 rejects a negative size with ValueError.
long4 = ArgumentDescriptor(
    name="long4",
    n=TAKEN_FROM_ARGUMENT4,
    reader=read_long4,
    doc="""A binary representation of a long, little-endian.

    This first reads four bytes as a signed size (but requires the
    size to be >= 0), then reads that many bytes and interprets them
    as a little-endian 2's-complement long.  If the size is 0, that's taken
    as a shortcut for the int 0, although LONG1 should really be used
    then instead (and in any case where # of bytes < 256).
    """)
     940  
     941  
     942  ##############################################################################
     943  # Object descriptors.  The stack used by the pickle machine holds objects,
     944  # and in the stack_before and stack_after attributes of OpcodeInfo
     945  # descriptors we need names to describe the various types of objects that can
     946  # appear on the stack.
     947  
     948  class ESC[4;38;5;81mStackObject(ESC[4;38;5;149mobject):
     949      __slots__ = (
     950          # name of descriptor record, for info only
     951          'name',
     952  
     953          # type of object, or tuple of type objects (meaning the object can
     954          # be of any type in the tuple)
     955          'obtype',
     956  
     957          # human-readable docs for this kind of stack object; a string
     958          'doc',
     959      )
     960  
     961      def __init__(self, name, obtype, doc):
     962          assert isinstance(name, str)
     963          self.name = name
     964  
     965          assert isinstance(obtype, type) or isinstance(obtype, tuple)
     966          if isinstance(obtype, tuple):
     967              for contained in obtype:
     968                  assert isinstance(contained, type)
     969          self.obtype = obtype
     970  
     971          assert isinstance(doc, str)
     972          self.doc = doc
     973  
     974      def __repr__(self):
     975          return self.name
     976  
     977  
     978  pyint = pylong = StackObject(
     979      name='int',
     980      obtype=int,
     981      doc="A Python integer object.")
     982  
     983  pyinteger_or_bool = StackObject(
     984      name='int_or_bool',
     985      obtype=(int, bool),
     986      doc="A Python integer or boolean object.")
     987  
     988  pybool = StackObject(
     989      name='bool',
     990      obtype=bool,
     991      doc="A Python boolean object.")
     992  
     993  pyfloat = StackObject(
     994      name='float',
     995      obtype=float,
     996      doc="A Python float object.")
     997  
     998  pybytes_or_str = pystring = StackObject(
     999      name='bytes_or_str',
    1000      obtype=(bytes, str),
    1001      doc="A Python bytes or (Unicode) string object.")
    1002  
    1003  pybytes = StackObject(
    1004      name='bytes',
    1005      obtype=bytes,
    1006      doc="A Python bytes object.")
    1007  
    1008  pybytearray = StackObject(
    1009      name='bytearray',
    1010      obtype=bytearray,
    1011      doc="A Python bytearray object.")
    1012  
    1013  pyunicode = StackObject(
    1014      name='str',
    1015      obtype=str,
    1016      doc="A Python (Unicode) string object.")
    1017  
    1018  pynone = StackObject(
    1019      name="None",
    1020      obtype=type(None),
    1021      doc="The Python None object.")
    1022  
    1023  pytuple = StackObject(
    1024      name="tuple",
    1025      obtype=tuple,
    1026      doc="A Python tuple object.")
    1027  
    1028  pylist = StackObject(
    1029      name="list",
    1030      obtype=list,
    1031      doc="A Python list object.")
    1032  
    1033  pydict = StackObject(
    1034      name="dict",
    1035      obtype=dict,
    1036      doc="A Python dict object.")
    1037  
    1038  pyset = StackObject(
    1039      name="set",
    1040      obtype=set,
    1041      doc="A Python set object.")
    1042  
    1043  pyfrozenset = StackObject(
    1044      name="frozenset",
    1045      obtype=set,
    1046      doc="A Python frozenset object.")
    1047  
    1048  pybuffer = StackObject(
    1049      name='buffer',
    1050      obtype=object,
    1051      doc="A Python buffer-like object.")
    1052  
    1053  anyobject = StackObject(
    1054      name='any',
    1055      obtype=object,
    1056      doc="Any kind of object whatsoever.")
    1057  
    1058  markobject = StackObject(
    1059      name="mark",
    1060      obtype=StackObject,
    1061      doc="""'The mark' is a unique object.
    1062  
    1063  Opcodes that operate on a variable number of objects
    1064  generally don't embed the count of objects in the opcode,
    1065  or pull it off the stack.  Instead the MARK opcode is used
    1066  to push a special marker object on the stack, and then
    1067  some other opcodes grab all the objects from the top of
    1068  the stack down to (but not including) the topmost marker
    1069  object.
    1070  """)
    1071  
    1072  stackslice = StackObject(
    1073      name="stackslice",
    1074      obtype=StackObject,
    1075      doc="""An object representing a contiguous slice of the stack.
    1076  
    1077  This is used in conjunction with markobject, to represent all
    1078  of the stack following the topmost markobject.  For example,
    1079  the POP_MARK opcode changes the stack from
    1080  
    1081      [..., markobject, stackslice]
    1082  to
    1083      [...]
    1084  
    1085  No matter how many object are on the stack after the topmost
    1086  markobject, POP_MARK gets rid of all of them (including the
    1087  topmost markobject too).
    1088  """)
    1089  
    1090  ##############################################################################
    1091  # Descriptors for pickle opcodes.
    1092  
    1093  class ESC[4;38;5;81mOpcodeInfo(ESC[4;38;5;149mobject):
    1094  
    1095      __slots__ = (
    1096          # symbolic name of opcode; a string
    1097          'name',
    1098  
    1099          # the code used in a bytestream to represent the opcode; a
    1100          # one-character string
    1101          'code',
    1102  
    1103          # If the opcode has an argument embedded in the byte string, an
    1104          # instance of ArgumentDescriptor specifying its type.  Note that
    1105          # arg.reader(s) can be used to read and decode the argument from
    1106          # the bytestream s, and arg.doc documents the format of the raw
    1107          # argument bytes.  If the opcode doesn't have an argument embedded
    1108          # in the bytestream, arg should be None.
    1109          'arg',
    1110  
    1111          # what the stack looks like before this opcode runs; a list
    1112          'stack_before',
    1113  
    1114          # what the stack looks like after this opcode runs; a list
    1115          'stack_after',
    1116  
    1117          # the protocol number in which this opcode was introduced; an int
    1118          'proto',
    1119  
    1120          # human-readable docs for this opcode; a string
    1121          'doc',
    1122      )
    1123  
    1124      def __init__(self, name, code, arg,
    1125                   stack_before, stack_after, proto, doc):
    1126          assert isinstance(name, str)
    1127          self.name = name
    1128  
    1129          assert isinstance(code, str)
    1130          assert len(code) == 1
    1131          self.code = code
    1132  
    1133          assert arg is None or isinstance(arg, ArgumentDescriptor)
    1134          self.arg = arg
    1135  
    1136          assert isinstance(stack_before, list)
    1137          for x in stack_before:
    1138              assert isinstance(x, StackObject)
    1139          self.stack_before = stack_before
    1140  
    1141          assert isinstance(stack_after, list)
    1142          for x in stack_after:
    1143              assert isinstance(x, StackObject)
    1144          self.stack_after = stack_after
    1145  
    1146          assert isinstance(proto, int) and 0 <= proto <= pickle.HIGHEST_PROTOCOL
    1147          self.proto = proto
    1148  
    1149          assert isinstance(doc, str)
    1150          self.doc = doc
    1151  
# Short alias used to construct each entry of the opcodes table below.
I = OpcodeInfo
    1153  opcodes = [
    1154  
    1155      # Ways to spell integers.
    1156  
    1157      I(name='INT',
    1158        code='I',
    1159        arg=decimalnl_short,
    1160        stack_before=[],
    1161        stack_after=[pyinteger_or_bool],
    1162        proto=0,
    1163        doc="""Push an integer or bool.
    1164  
    1165        The argument is a newline-terminated decimal literal string.
    1166  
    1167        The intent may have been that this always fit in a short Python int,
    1168        but INT can be generated in pickles written on a 64-bit box that
    1169        require a Python long on a 32-bit box.  The difference between this
    1170        and LONG then is that INT skips a trailing 'L', and produces a short
    1171        int whenever possible.
    1172  
    1173        Another difference is due to that, when bool was introduced as a
    1174        distinct type in 2.3, builtin names True and False were also added to
    1175        2.2.2, mapping to ints 1 and 0.  For compatibility in both directions,
    1176        True gets pickled as INT + "I01\\n", and False as INT + "I00\\n".
    1177        Leading zeroes are never produced for a genuine integer.  The 2.3
    1178        (and later) unpicklers special-case these and return bool instead;
    1179        earlier unpicklers ignore the leading "0" and return the int.
    1180        """),
    1181  
    1182      I(name='BININT',
    1183        code='J',
    1184        arg=int4,
    1185        stack_before=[],
    1186        stack_after=[pyint],
    1187        proto=1,
    1188        doc="""Push a four-byte signed integer.
    1189  
    1190        This handles the full range of Python (short) integers on a 32-bit
    1191        box, directly as binary bytes (1 for the opcode and 4 for the integer).
    1192        If the integer is non-negative and fits in 1 or 2 bytes, pickling via
    1193        BININT1 or BININT2 saves space.
    1194        """),
    1195  
    1196      I(name='BININT1',
    1197        code='K',
    1198        arg=uint1,
    1199        stack_before=[],
    1200        stack_after=[pyint],
    1201        proto=1,
    1202        doc="""Push a one-byte unsigned integer.
    1203  
    1204        This is a space optimization for pickling very small non-negative ints,
    1205        in range(256).
    1206        """),
    1207  
    1208      I(name='BININT2',
    1209        code='M',
    1210        arg=uint2,
    1211        stack_before=[],
    1212        stack_after=[pyint],
    1213        proto=1,
    1214        doc="""Push a two-byte unsigned integer.
    1215  
    1216        This is a space optimization for pickling small positive ints, in
    1217        range(256, 2**16).  Integers in range(256) can also be pickled via
    1218        BININT2, but BININT1 instead saves a byte.
    1219        """),
    1220  
    1221      I(name='LONG',
    1222        code='L',
    1223        arg=decimalnl_long,
    1224        stack_before=[],
    1225        stack_after=[pyint],
    1226        proto=0,
    1227        doc="""Push a long integer.
    1228  
    1229        The same as INT, except that the literal ends with 'L', and always
    1230        unpickles to a Python long.  There doesn't seem a real purpose to the
    1231        trailing 'L'.
    1232  
    1233        Note that LONG takes time quadratic in the number of digits when
    1234        unpickling (this is simply due to the nature of decimal->binary
    1235        conversion).  Proto 2 added linear-time (in C; still quadratic-time
    1236        in Python) LONG1 and LONG4 opcodes.
    1237        """),
    1238  
    1239      I(name="LONG1",
    1240        code='\x8a',
    1241        arg=long1,
    1242        stack_before=[],
    1243        stack_after=[pyint],
    1244        proto=2,
    1245        doc="""Long integer using one-byte length.
    1246  
    1247        A more efficient encoding of a Python long; the long1 encoding
    1248        says it all."""),
    1249  
    1250      I(name="LONG4",
    1251        code='\x8b',
    1252        arg=long4,
    1253        stack_before=[],
    1254        stack_after=[pyint],
    1255        proto=2,
    1256        doc="""Long integer using found-byte length.
    1257  
    1258        A more efficient encoding of a Python long; the long4 encoding
    1259        says it all."""),
    1260  
    1261      # Ways to spell strings (8-bit, not Unicode).
    1262  
    1263      I(name='STRING',
    1264        code='S',
    1265        arg=stringnl,
    1266        stack_before=[],
    1267        stack_after=[pybytes_or_str],
    1268        proto=0,
    1269        doc="""Push a Python string object.
    1270  
    1271        The argument is a repr-style string, with bracketing quote characters,
    1272        and perhaps embedded escapes.  The argument extends until the next
    1273        newline character.  These are usually decoded into a str instance
    1274        using the encoding given to the Unpickler constructor. or the default,
    1275        'ASCII'.  If the encoding given was 'bytes' however, they will be
    1276        decoded as bytes object instead.
    1277        """),
    1278  
    1279      I(name='BINSTRING',
    1280        code='T',
    1281        arg=string4,
    1282        stack_before=[],
    1283        stack_after=[pybytes_or_str],
    1284        proto=1,
    1285        doc="""Push a Python string object.
    1286  
    1287        There are two arguments: the first is a 4-byte little-endian
    1288        signed int giving the number of bytes in the string, and the
    1289        second is that many bytes, which are taken literally as the string
    1290        content.  These are usually decoded into a str instance using the
    1291        encoding given to the Unpickler constructor. or the default,
    1292        'ASCII'.  If the encoding given was 'bytes' however, they will be
    1293        decoded as bytes object instead.
    1294        """),
    1295  
    1296      I(name='SHORT_BINSTRING',
    1297        code='U',
    1298        arg=string1,
    1299        stack_before=[],
    1300        stack_after=[pybytes_or_str],
    1301        proto=1,
    1302        doc="""Push a Python string object.
    1303  
    1304        There are two arguments: the first is a 1-byte unsigned int giving
    1305        the number of bytes in the string, and the second is that many
    1306        bytes, which are taken literally as the string content.  These are
    1307        usually decoded into a str instance using the encoding given to
    1308        the Unpickler constructor. or the default, 'ASCII'.  If the
    1309        encoding given was 'bytes' however, they will be decoded as bytes
    1310        object instead.
    1311        """),
    1312  
    1313      # Bytes (protocol 3 and higher)
    1314  
    1315      I(name='BINBYTES',
    1316        code='B',
    1317        arg=bytes4,
    1318        stack_before=[],
    1319        stack_after=[pybytes],
    1320        proto=3,
    1321        doc="""Push a Python bytes object.
    1322  
    1323        There are two arguments:  the first is a 4-byte little-endian unsigned int
    1324        giving the number of bytes, and the second is that many bytes, which are
    1325        taken literally as the bytes content.
    1326        """),
    1327  
    1328      I(name='SHORT_BINBYTES',
    1329        code='C',
    1330        arg=bytes1,
    1331        stack_before=[],
    1332        stack_after=[pybytes],
    1333        proto=3,
    1334        doc="""Push a Python bytes object.
    1335  
    1336        There are two arguments:  the first is a 1-byte unsigned int giving
    1337        the number of bytes, and the second is that many bytes, which are taken
    1338        literally as the string content.
    1339        """),
    1340  
    1341      I(name='BINBYTES8',
    1342        code='\x8e',
    1343        arg=bytes8,
    1344        stack_before=[],
    1345        stack_after=[pybytes],
    1346        proto=4,
    1347        doc="""Push a Python bytes object.
    1348  
    1349        There are two arguments:  the first is an 8-byte unsigned int giving
    1350        the number of bytes in the string, and the second is that many bytes,
    1351        which are taken literally as the string content.
    1352        """),
    1353  
    1354      # Bytearray (protocol 5 and higher)
    1355  
    1356      I(name='BYTEARRAY8',
    1357        code='\x96',
    1358        arg=bytearray8,
    1359        stack_before=[],
    1360        stack_after=[pybytearray],
    1361        proto=5,
    1362        doc="""Push a Python bytearray object.
    1363  
    1364        There are two arguments:  the first is an 8-byte unsigned int giving
    1365        the number of bytes in the bytearray, and the second is that many bytes,
    1366        which are taken literally as the bytearray content.
    1367        """),
    1368  
    1369      # Out-of-band buffer (protocol 5 and higher)
    1370  
    1371      I(name='NEXT_BUFFER',
    1372        code='\x97',
    1373        arg=None,
    1374        stack_before=[],
    1375        stack_after=[pybuffer],
    1376        proto=5,
    1377        doc="Push an out-of-band buffer object."),
    1378  
    1379      I(name='READONLY_BUFFER',
    1380        code='\x98',
    1381        arg=None,
    1382        stack_before=[pybuffer],
    1383        stack_after=[pybuffer],
    1384        proto=5,
    1385        doc="Make an out-of-band buffer object read-only."),
    1386  
    1387      # Ways to spell None.
    1388  
    1389      I(name='NONE',
    1390        code='N',
    1391        arg=None,
    1392        stack_before=[],
    1393        stack_after=[pynone],
    1394        proto=0,
    1395        doc="Push None on the stack."),
    1396  
    1397      # Ways to spell bools, starting with proto 2.  See INT for how this was
    1398      # done before proto 2.
    1399  
    1400      I(name='NEWTRUE',
    1401        code='\x88',
    1402        arg=None,
    1403        stack_before=[],
    1404        stack_after=[pybool],
    1405        proto=2,
    1406        doc="Push True onto the stack."),
    1407  
    1408      I(name='NEWFALSE',
    1409        code='\x89',
    1410        arg=None,
    1411        stack_before=[],
    1412        stack_after=[pybool],
    1413        proto=2,
    1414        doc="Push False onto the stack."),
    1415  
    1416      # Ways to spell Unicode strings.
    1417  
    1418      I(name='UNICODE',
    1419        code='V',
    1420        arg=unicodestringnl,
    1421        stack_before=[],
    1422        stack_after=[pyunicode],
    1423        proto=0,  # this may be pure-text, but it's a later addition
    1424        doc="""Push a Python Unicode string object.
    1425  
    1426        The argument is a raw-unicode-escape encoding of a Unicode string,
    1427        and so may contain embedded escape sequences.  The argument extends
    1428        until the next newline character.
    1429        """),
    1430  
    1431      I(name='SHORT_BINUNICODE',
    1432        code='\x8c',
    1433        arg=unicodestring1,
    1434        stack_before=[],
    1435        stack_after=[pyunicode],
    1436        proto=4,
    1437        doc="""Push a Python Unicode string object.
    1438  
    1439        There are two arguments:  the first is a 1-byte little-endian signed int
    1440        giving the number of bytes in the string.  The second is that many
    1441        bytes, and is the UTF-8 encoding of the Unicode string.
    1442        """),
    1443  
    1444      I(name='BINUNICODE',
    1445        code='X',
    1446        arg=unicodestring4,
    1447        stack_before=[],
    1448        stack_after=[pyunicode],
    1449        proto=1,
    1450        doc="""Push a Python Unicode string object.
    1451  
    1452        There are two arguments:  the first is a 4-byte little-endian unsigned int
    1453        giving the number of bytes in the string.  The second is that many
    1454        bytes, and is the UTF-8 encoding of the Unicode string.
    1455        """),
    1456  
    1457      I(name='BINUNICODE8',
    1458        code='\x8d',
    1459        arg=unicodestring8,
    1460        stack_before=[],
    1461        stack_after=[pyunicode],
    1462        proto=4,
    1463        doc="""Push a Python Unicode string object.
    1464  
    1465        There are two arguments:  the first is an 8-byte little-endian signed int
    1466        giving the number of bytes in the string.  The second is that many
    1467        bytes, and is the UTF-8 encoding of the Unicode string.
    1468        """),
    1469  
    1470      # Ways to spell floats.
    1471  
    1472      I(name='FLOAT',
    1473        code='F',
    1474        arg=floatnl,
    1475        stack_before=[],
    1476        stack_after=[pyfloat],
    1477        proto=0,
    1478        doc="""Newline-terminated decimal float literal.
    1479  
    1480        The argument is repr(a_float), and in general requires 17 significant
    1481        digits for roundtrip conversion to be an identity (this is so for
    1482        IEEE-754 double precision values, which is what Python float maps to
    1483        on most boxes).
    1484  
    1485        In general, FLOAT cannot be used to transport infinities, NaNs, or
    1486        minus zero across boxes (or even on a single box, if the platform C
    1487        library can't read the strings it produces for such things -- Windows
    1488        is like that), but may do less damage than BINFLOAT on boxes with
    1489        greater precision or dynamic range than IEEE-754 double.
    1490        """),
    1491  
    1492      I(name='BINFLOAT',
    1493        code='G',
    1494        arg=float8,
    1495        stack_before=[],
    1496        stack_after=[pyfloat],
    1497        proto=1,
    1498        doc="""Float stored in binary form, with 8 bytes of data.
    1499  
    1500        This generally requires less than half the space of FLOAT encoding.
    1501        In general, BINFLOAT cannot be used to transport infinities, NaNs, or
    1502        minus zero, raises an exception if the exponent exceeds the range of
    1503        an IEEE-754 double, and retains no more than 53 bits of precision (if
    1504        there are more than that, "add a half and chop" rounding is used to
    1505        cut it back to 53 significant bits).
    1506        """),
    1507  
    1508      # Ways to build lists.
    1509  
    1510      I(name='EMPTY_LIST',
    1511        code=']',
    1512        arg=None,
    1513        stack_before=[],
    1514        stack_after=[pylist],
    1515        proto=1,
    1516        doc="Push an empty list."),
    1517  
    1518      I(name='APPEND',
    1519        code='a',
    1520        arg=None,
    1521        stack_before=[pylist, anyobject],
    1522        stack_after=[pylist],
    1523        proto=0,
    1524        doc="""Append an object to a list.
    1525  
    1526        Stack before:  ... pylist anyobject
    1527        Stack after:   ... pylist+[anyobject]
    1528  
    1529        although pylist is really extended in-place.
    1530        """),
    1531  
    1532      I(name='APPENDS',
    1533        code='e',
    1534        arg=None,
    1535        stack_before=[pylist, markobject, stackslice],
    1536        stack_after=[pylist],
    1537        proto=1,
    1538        doc="""Extend a list by a slice of stack objects.
    1539  
    1540        Stack before:  ... pylist markobject stackslice
    1541        Stack after:   ... pylist+stackslice
    1542  
    1543        although pylist is really extended in-place.
    1544        """),
    1545  
    1546      I(name='LIST',
    1547        code='l',
    1548        arg=None,
    1549        stack_before=[markobject, stackslice],
    1550        stack_after=[pylist],
    1551        proto=0,
    1552        doc="""Build a list out of the topmost stack slice, after markobject.
    1553  
    1554        All the stack entries following the topmost markobject are placed into
    1555        a single Python list, which single list object replaces all of the
    1556        stack from the topmost markobject onward.  For example,
    1557  
    1558        Stack before: ... markobject 1 2 3 'abc'
    1559        Stack after:  ... [1, 2, 3, 'abc']
    1560        """),
    1561  
    1562      # Ways to build tuples.
    1563  
    1564      I(name='EMPTY_TUPLE',
    1565        code=')',
    1566        arg=None,
    1567        stack_before=[],
    1568        stack_after=[pytuple],
    1569        proto=1,
    1570        doc="Push an empty tuple."),
    1571  
    1572      I(name='TUPLE',
    1573        code='t',
    1574        arg=None,
    1575        stack_before=[markobject, stackslice],
    1576        stack_after=[pytuple],
    1577        proto=0,
    1578        doc="""Build a tuple out of the topmost stack slice, after markobject.
    1579  
    1580        All the stack entries following the topmost markobject are placed into
    1581        a single Python tuple, which single tuple object replaces all of the
    1582        stack from the topmost markobject onward.  For example,
    1583  
    1584        Stack before: ... markobject 1 2 3 'abc'
    1585        Stack after:  ... (1, 2, 3, 'abc')
    1586        """),
    1587  
    1588      I(name='TUPLE1',
    1589        code='\x85',
    1590        arg=None,
    1591        stack_before=[anyobject],
    1592        stack_after=[pytuple],
    1593        proto=2,
    1594        doc="""Build a one-tuple out of the topmost item on the stack.
    1595  
    1596        This code pops one value off the stack and pushes a tuple of
    1597        length 1 whose one item is that value back onto it.  In other
    1598        words:
    1599  
    1600            stack[-1] = tuple(stack[-1:])
    1601        """),
    1602  
    1603      I(name='TUPLE2',
    1604        code='\x86',
    1605        arg=None,
    1606        stack_before=[anyobject, anyobject],
    1607        stack_after=[pytuple],
    1608        proto=2,
    1609        doc="""Build a two-tuple out of the top two items on the stack.
    1610  
    1611        This code pops two values off the stack and pushes a tuple of
    1612        length 2 whose items are those values back onto it.  In other
    1613        words:
    1614  
    1615            stack[-2:] = [tuple(stack[-2:])]
    1616        """),
    1617  
    1618      I(name='TUPLE3',
    1619        code='\x87',
    1620        arg=None,
    1621        stack_before=[anyobject, anyobject, anyobject],
    1622        stack_after=[pytuple],
    1623        proto=2,
    1624        doc="""Build a three-tuple out of the top three items on the stack.
    1625  
    1626        This code pops three values off the stack and pushes a tuple of
    1627        length 3 whose items are those values back onto it.  In other
    1628        words:
    1629  
    1630            stack[-3:] = [tuple(stack[-3:])]
    1631        """),
    1632  
    1633      # Ways to build dicts.
    1634  
    1635      I(name='EMPTY_DICT',
    1636        code='}',
    1637        arg=None,
    1638        stack_before=[],
    1639        stack_after=[pydict],
    1640        proto=1,
    1641        doc="Push an empty dict."),
    1642  
    1643      I(name='DICT',
    1644        code='d',
    1645        arg=None,
    1646        stack_before=[markobject, stackslice],
    1647        stack_after=[pydict],
    1648        proto=0,
    1649        doc="""Build a dict out of the topmost stack slice, after markobject.
    1650  
    1651        All the stack entries following the topmost markobject are placed into
    1652        a single Python dict, which single dict object replaces all of the
    1653        stack from the topmost markobject onward.  The stack slice alternates
    1654        key, value, key, value, ....  For example,
    1655  
    1656        Stack before: ... markobject 1 2 3 'abc'
    1657        Stack after:  ... {1: 2, 3: 'abc'}
    1658        """),
    1659  
    1660      I(name='SETITEM',
    1661        code='s',
    1662        arg=None,
    1663        stack_before=[pydict, anyobject, anyobject],
    1664        stack_after=[pydict],
    1665        proto=0,
    1666        doc="""Add a key+value pair to an existing dict.
    1667  
    1668        Stack before:  ... pydict key value
    1669        Stack after:   ... pydict
    1670  
    1671        where pydict has been modified via pydict[key] = value.
    1672        """),
    1673  
    1674      I(name='SETITEMS',
    1675        code='u',
    1676        arg=None,
    1677        stack_before=[pydict, markobject, stackslice],
    1678        stack_after=[pydict],
    1679        proto=1,
    1680        doc="""Add an arbitrary number of key+value pairs to an existing dict.
    1681  
    1682        The slice of the stack following the topmost markobject is taken as
    1683        an alternating sequence of keys and values, added to the dict
    1684        immediately under the topmost markobject.  Everything at and after the
    1685        topmost markobject is popped, leaving the mutated dict at the top
    1686        of the stack.
    1687  
    1688        Stack before:  ... pydict markobject key_1 value_1 ... key_n value_n
    1689        Stack after:   ... pydict
    1690  
    1691        where pydict has been modified via pydict[key_i] = value_i for i in
    1692        1, 2, ..., n, and in that order.
    1693        """),
    1694  
    1695      # Ways to build sets
    1696  
    1697      I(name='EMPTY_SET',
    1698        code='\x8f',
    1699        arg=None,
    1700        stack_before=[],
    1701        stack_after=[pyset],
    1702        proto=4,
    1703        doc="Push an empty set."),
    1704  
    I(name='ADDITEMS',
      code='\x90',
      arg=None,
      stack_before=[pyset, markobject, stackslice],
      stack_after=[pyset],
      proto=4,
      doc="""Add an arbitrary number of items to an existing set.

      The slice of the stack following the topmost markobject is taken as
      a sequence of items, added to the set immediately under the topmost
      markobject.  Everything at and after the topmost markobject is popped,
      leaving the mutated set at the top of the stack.

      Stack before:  ... pyset markobject item_1 ... item_n
      Stack after:   ... pyset

      where pyset has been modified via pyset.add(item_i) for i in
      1, 2, ..., n, and in that order.
      """),
    1724  
    1725      # Way to build frozensets
    1726  
    1727      I(name='FROZENSET',
    1728        code='\x91',
    1729        arg=None,
    1730        stack_before=[markobject, stackslice],
    1731        stack_after=[pyfrozenset],
    1732        proto=4,
    1733        doc="""Build a frozenset out of the topmost slice, after markobject.
    1734  
    1735        All the stack entries following the topmost markobject are placed into
    1736        a single Python frozenset, which single frozenset object replaces all
    1737        of the stack from the topmost markobject onward.  For example,
    1738  
    1739        Stack before: ... markobject 1 2 3
    1740        Stack after:  ... frozenset({1, 2, 3})
    1741        """),
    1742  
    1743      # Stack manipulation.
    1744  
    1745      I(name='POP',
    1746        code='0',
    1747        arg=None,
    1748        stack_before=[anyobject],
    1749        stack_after=[],
    1750        proto=0,
    1751        doc="Discard the top stack item, shrinking the stack by one item."),
    1752  
    1753      I(name='DUP',
    1754        code='2',
    1755        arg=None,
    1756        stack_before=[anyobject],
    1757        stack_after=[anyobject, anyobject],
    1758        proto=0,
    1759        doc="Push the top stack item onto the stack again, duplicating it."),
    1760  
    1761      I(name='MARK',
    1762        code='(',
    1763        arg=None,
    1764        stack_before=[],
    1765        stack_after=[markobject],
    1766        proto=0,
    1767        doc="""Push markobject onto the stack.
    1768  
    1769        markobject is a unique object, used by other opcodes to identify a
    1770        region of the stack containing a variable number of objects for them
    1771        to work on.  See markobject.doc for more detail.
    1772        """),
    1773  
    1774      I(name='POP_MARK',
    1775        code='1',
    1776        arg=None,
    1777        stack_before=[markobject, stackslice],
    1778        stack_after=[],
    1779        proto=1,
    1780        doc="""Pop all the stack objects at and above the topmost markobject.
    1781  
    1782        When an opcode using a variable number of stack objects is done,
    1783        POP_MARK is used to remove those objects, and to remove the markobject
    1784        that delimited their starting position on the stack.
    1785        """),
    1786  
    1787      # Memo manipulation.  There are really only two operations (get and put),
    1788      # each in all-text, "short binary", and "long binary" flavors.
    1789  
    1790      I(name='GET',
    1791        code='g',
    1792        arg=decimalnl_short,
    1793        stack_before=[],
    1794        stack_after=[anyobject],
    1795        proto=0,
    1796        doc="""Read an object from the memo and push it on the stack.
    1797  
    1798        The index of the memo object to push is given by the newline-terminated
    1799        decimal string following.  BINGET and LONG_BINGET are space-optimized
    1800        versions.
    1801        """),
    1802  
    1803      I(name='BINGET',
    1804        code='h',
    1805        arg=uint1,
    1806        stack_before=[],
    1807        stack_after=[anyobject],
    1808        proto=1,
    1809        doc="""Read an object from the memo and push it on the stack.
    1810  
    1811        The index of the memo object to push is given by the 1-byte unsigned
    1812        integer following.
    1813        """),
    1814  
    1815      I(name='LONG_BINGET',
    1816        code='j',
    1817        arg=uint4,
    1818        stack_before=[],
    1819        stack_after=[anyobject],
    1820        proto=1,
    1821        doc="""Read an object from the memo and push it on the stack.
    1822  
    1823        The index of the memo object to push is given by the 4-byte unsigned
    1824        little-endian integer following.
    1825        """),
    1826  
    1827      I(name='PUT',
    1828        code='p',
    1829        arg=decimalnl_short,
    1830        stack_before=[],
    1831        stack_after=[],
    1832        proto=0,
    1833        doc="""Store the stack top into the memo.  The stack is not popped.
    1834  
    1835        The index of the memo location to write into is given by the newline-
    1836        terminated decimal string following.  BINPUT and LONG_BINPUT are
    1837        space-optimized versions.
    1838        """),
    1839  
    1840      I(name='BINPUT',
    1841        code='q',
    1842        arg=uint1,
    1843        stack_before=[],
    1844        stack_after=[],
    1845        proto=1,
    1846        doc="""Store the stack top into the memo.  The stack is not popped.
    1847  
    1848        The index of the memo location to write into is given by the 1-byte
    1849        unsigned integer following.
    1850        """),
    1851  
    1852      I(name='LONG_BINPUT',
    1853        code='r',
    1854        arg=uint4,
    1855        stack_before=[],
    1856        stack_after=[],
    1857        proto=1,
    1858        doc="""Store the stack top into the memo.  The stack is not popped.
    1859  
    1860        The index of the memo location to write into is given by the 4-byte
    1861        unsigned little-endian integer following.
    1862        """),
    1863  
    1864      I(name='MEMOIZE',
    1865        code='\x94',
    1866        arg=None,
    1867        stack_before=[anyobject],
    1868        stack_after=[anyobject],
    1869        proto=4,
    1870        doc="""Store the stack top into the memo.  The stack is not popped.
    1871  
    1872        The index of the memo location to write is the number of
    1873        elements currently present in the memo.
    1874        """),
    1875  
    1876      # Access the extension registry (predefined objects).  Akin to the GET
    1877      # family.
    1878  
    1879      I(name='EXT1',
    1880        code='\x82',
    1881        arg=uint1,
    1882        stack_before=[],
    1883        stack_after=[anyobject],
    1884        proto=2,
    1885        doc="""Extension code.
    1886  
    1887        This code and the similar EXT2 and EXT4 allow using a registry
    1888        of popular objects that are pickled by name, typically classes.
    1889        It is envisioned that through a global negotiation and
    1890        registration process, third parties can set up a mapping between
    1891        ints and object names.
    1892  
    1893        In order to guarantee pickle interchangeability, the extension
    1894        code registry ought to be global, although a range of codes may
    1895        be reserved for private use.
    1896  
    1897        EXT1 has a 1-byte integer argument.  This is used to index into the
    1898        extension registry, and the object at that index is pushed on the stack.
    1899        """),
    1900  
    1901      I(name='EXT2',
    1902        code='\x83',
    1903        arg=uint2,
    1904        stack_before=[],
    1905        stack_after=[anyobject],
    1906        proto=2,
    1907        doc="""Extension code.
    1908  
    1909        See EXT1.  EXT2 has a two-byte integer argument.
    1910        """),
    1911  
    1912      I(name='EXT4',
    1913        code='\x84',
    1914        arg=int4,
    1915        stack_before=[],
    1916        stack_after=[anyobject],
    1917        proto=2,
    1918        doc="""Extension code.
    1919  
    1920        See EXT1.  EXT4 has a four-byte integer argument.
    1921        """),
    1922  
    1923      # Push a class object, or module function, on the stack, via its module
    1924      # and name.
    1925  
    1926      I(name='GLOBAL',
    1927        code='c',
    1928        arg=stringnl_noescape_pair,
    1929        stack_before=[],
    1930        stack_after=[anyobject],
    1931        proto=0,
    1932        doc="""Push a global object (module.attr) on the stack.
    1933  
    1934        Two newline-terminated strings follow the GLOBAL opcode.  The first is
    1935        taken as a module name, and the second as a class name.  The class
    1936        object module.class is pushed on the stack.  More accurately, the
    1937        object returned by self.find_class(module, class) is pushed on the
    1938        stack, so unpickling subclasses can override this form of lookup.
    1939        """),
    1940  
    1941      I(name='STACK_GLOBAL',
    1942        code='\x93',
    1943        arg=None,
    1944        stack_before=[pyunicode, pyunicode],
    1945        stack_after=[anyobject],
    1946        proto=4,
    1947        doc="""Push a global object (module.attr) on the stack.
    1948        """),
    1949  
    1950      # Ways to build objects of classes pickle doesn't know about directly
    1951      # (user-defined classes).  I despair of documenting this accurately
    1952      # and comprehensibly -- you really have to read the pickle code to
    1953      # find all the special cases.
    1954  
    1955      I(name='REDUCE',
    1956        code='R',
    1957        arg=None,
    1958        stack_before=[anyobject, anyobject],
    1959        stack_after=[anyobject],
    1960        proto=0,
    1961        doc="""Push an object built from a callable and an argument tuple.
    1962  
    1963        The opcode is named to remind of the __reduce__() method.
    1964  
    1965        Stack before: ... callable pytuple
    1966        Stack after:  ... callable(*pytuple)
    1967  
    1968        The callable and the argument tuple are the first two items returned
    1969        by a __reduce__ method.  Applying the callable to the argtuple is
    1970        supposed to reproduce the original object, or at least get it started.
    1971        If the __reduce__ method returns a 3-tuple, the last component is an
    1972        argument to be passed to the object's __setstate__, and then the REDUCE
    1973        opcode is followed by code to create setstate's argument, and then a
    1974        BUILD opcode to apply  __setstate__ to that argument.
    1975  
    1976        If not isinstance(callable, type), REDUCE complains unless the
    1977        callable has been registered with the copyreg module's
    1978        safe_constructors dict, or the callable has a magic
    1979        '__safe_for_unpickling__' attribute with a true value.  I'm not sure
    1980        why it does this, but I've sure seen this complaint often enough when
    1981        I didn't want to <wink>.
    1982        """),
    1983  
    1984      I(name='BUILD',
    1985        code='b',
    1986        arg=None,
    1987        stack_before=[anyobject, anyobject],
    1988        stack_after=[anyobject],
    1989        proto=0,
    1990        doc="""Finish building an object, via __setstate__ or dict update.
    1991  
    1992        Stack before: ... anyobject argument
    1993        Stack after:  ... anyobject
    1994  
    1995        where anyobject may have been mutated, as follows:
    1996  
    1997        If the object has a __setstate__ method,
    1998  
    1999            anyobject.__setstate__(argument)
    2000  
    2001        is called.
    2002  
    2003        Else the argument must be a dict, the object must have a __dict__, and
    2004        the object is updated via
    2005  
    2006            anyobject.__dict__.update(argument)
    2007        """),
    2008  
    2009      I(name='INST',
    2010        code='i',
    2011        arg=stringnl_noescape_pair,
    2012        stack_before=[markobject, stackslice],
    2013        stack_after=[anyobject],
    2014        proto=0,
    2015        doc="""Build a class instance.
    2016  
    2017        This is the protocol 0 version of protocol 1's OBJ opcode.
    2018        INST is followed by two newline-terminated strings, giving a
    2019        module and class name, just as for the GLOBAL opcode (and see
    2020        GLOBAL for more details about that).  self.find_class(module, name)
    2021        is used to get a class object.
    2022  
    2023        In addition, all the objects on the stack following the topmost
    2024        markobject are gathered into a tuple and popped (along with the
    2025        topmost markobject), just as for the TUPLE opcode.
    2026  
    2027        Now it gets complicated.  If all of these are true:
    2028  
    2029          + The argtuple is empty (markobject was at the top of the stack
    2030            at the start).
    2031  
    2032          + The class object does not have a __getinitargs__ attribute.
    2033  
    2034        then we want to create an old-style class instance without invoking
    2035        its __init__() method (pickle has waffled on this over the years; not
    2036        calling __init__() is current wisdom).  In this case, an instance of
    2037        an old-style dummy class is created, and then we try to rebind its
    2038        __class__ attribute to the desired class object.  If this succeeds,
    2039        the new instance object is pushed on the stack, and we're done.
    2040  
    2041        Else (the argtuple is not empty, it's not an old-style class object,
    2042        or the class object does have a __getinitargs__ attribute), the code
    2043        first insists that the class object have a __safe_for_unpickling__
    2044        attribute.  Unlike as for the __safe_for_unpickling__ check in REDUCE,
    2045        it doesn't matter whether this attribute has a true or false value, it
    2046        only matters whether it exists (XXX this is a bug).  If
    2047        __safe_for_unpickling__ doesn't exist, UnpicklingError is raised.
    2048  
    2049        Else (the class object does have a __safe_for_unpickling__ attr),
    2050        the class object obtained from INST's arguments is applied to the
    2051        argtuple obtained from the stack, and the resulting instance object
    2052        is pushed on the stack.
    2053  
    2054        NOTE:  checks for __safe_for_unpickling__ went away in Python 2.3.
    2055        NOTE:  the distinction between old-style and new-style classes does
    2056               not make sense in Python 3.
    2057        """),
    2058  
    2059      I(name='OBJ',
    2060        code='o',
    2061        arg=None,
    2062        stack_before=[markobject, anyobject, stackslice],
    2063        stack_after=[anyobject],
    2064        proto=1,
    2065        doc="""Build a class instance.
    2066  
    2067        This is the protocol 1 version of protocol 0's INST opcode, and is
    2068        very much like it.  The major difference is that the class object
    2069        is taken off the stack, allowing it to be retrieved from the memo
    2070        repeatedly if several instances of the same class are created.  This
    2071        can be much more efficient (in both time and space) than repeatedly
    2072        embedding the module and class names in INST opcodes.
    2073  
    2074        Unlike INST, OBJ takes no arguments from the opcode stream.  Instead
    2075        the class object is taken off the stack, immediately above the
    2076        topmost markobject:
    2077  
    2078        Stack before: ... markobject classobject stackslice
    2079        Stack after:  ... new_instance_object
    2080  
    2081        As for INST, the remainder of the stack above the markobject is
    2082        gathered into an argument tuple, and then the logic seems identical,
    2083        except that no __safe_for_unpickling__ check is done (XXX this is
    2084        a bug).  See INST for the gory details.
    2085  
    2086        NOTE:  In Python 2.3, INST and OBJ are identical except for how they
    2087        get the class object.  That was always the intent; the implementations
    2088        had diverged for accidental reasons.
    2089        """),
    2090  
    2091      I(name='NEWOBJ',
    2092        code='\x81',
    2093        arg=None,
    2094        stack_before=[anyobject, anyobject],
    2095        stack_after=[anyobject],
    2096        proto=2,
    2097        doc="""Build an object instance.
    2098  
    2099        The stack before should be thought of as containing a class
    2100        object followed by an argument tuple (the tuple being the stack
    2101        top).  Call these cls and args.  They are popped off the stack,
    2102        and the value returned by cls.__new__(cls, *args) is pushed back
    2103        onto the stack.
    2104        """),
    2105  
    I(name='NEWOBJ_EX',
      code='\x92',
      arg=None,
      stack_before=[anyobject, anyobject, anyobject],
      stack_after=[anyobject],
      proto=4,
      doc="""Build an object instance.

      The stack before should be thought of as containing a class
      object followed by an argument tuple and by a keyword argument dict
      (the dict being the stack top).  Call these cls, args and kwargs.
      They are popped off the stack, and the value returned by
      cls.__new__(cls, *args, **kwargs) is pushed back onto the stack.
      """),
    2120  
    2121      # Machine control.
    2122  
    2123      I(name='PROTO',
    2124        code='\x80',
    2125        arg=uint1,
    2126        stack_before=[],
    2127        stack_after=[],
    2128        proto=2,
    2129        doc="""Protocol version indicator.
    2130  
    2131        For protocol 2 and above, a pickle must start with this opcode.
    2132        The argument is the protocol version, an int in range(2, 256).
    2133        """),
    2134  
    2135      I(name='STOP',
    2136        code='.',
    2137        arg=None,
    2138        stack_before=[anyobject],
    2139        stack_after=[],
    2140        proto=0,
    2141        doc="""Stop the unpickling machine.
    2142  
    2143        Every pickle ends with this opcode.  The object at the top of the stack
    2144        is popped, and that's the result of unpickling.  The stack should be
    2145        empty then.
    2146        """),
    2147  
    2148      # Framing support.
    2149  
    2150      I(name='FRAME',
    2151        code='\x95',
    2152        arg=uint8,
    2153        stack_before=[],
    2154        stack_after=[],
    2155        proto=4,
    2156        doc="""Indicate the beginning of a new frame.
    2157  
    2158        The unpickler may use this opcode to safely prefetch data from its
    2159        underlying stream.
    2160        """),
    2161  
    2162      # Ways to deal with persistent IDs.
    2163  
    2164      I(name='PERSID',
    2165        code='P',
    2166        arg=stringnl_noescape,
    2167        stack_before=[],
    2168        stack_after=[anyobject],
    2169        proto=0,
    2170        doc="""Push an object identified by a persistent ID.
    2171  
    2172        The pickle module doesn't define what a persistent ID means.  PERSID's
    2173        argument is a newline-terminated str-style (no embedded escapes, no
    2174        bracketing quote characters) string, which *is* "the persistent ID".
    2175        The unpickler passes this string to self.persistent_load().  Whatever
    2176        object that returns is pushed on the stack.  There is no implementation
    2177        of persistent_load() in Python's unpickler:  it must be supplied by an
    2178        unpickler subclass.
    2179        """),
    2180  
    2181      I(name='BINPERSID',
    2182        code='Q',
    2183        arg=None,
    2184        stack_before=[anyobject],
    2185        stack_after=[anyobject],
    2186        proto=1,
    2187        doc="""Push an object identified by a persistent ID.
    2188  
    2189        Like PERSID, except the persistent ID is popped off the stack (instead
    2190        of being a string embedded in the opcode bytestream).  The persistent
    2191        ID is passed to self.persistent_load(), and whatever object that
    2192        returns is pushed on the stack.  See PERSID for more detail.
    2193        """),
    2194  ]
    2195  del I
    2196  
    2197  # Verify uniqueness of .name and .code members.
    2198  name2i = {}
    2199  code2i = {}
    2200  
    2201  for i, d in enumerate(opcodes):
    2202      if d.name in name2i:
    2203          raise ValueError("repeated name %r at indices %d and %d" %
    2204                           (d.name, name2i[d.name], i))
    2205      if d.code in code2i:
    2206          raise ValueError("repeated code %r at indices %d and %d" %
    2207                           (d.code, code2i[d.code], i))
    2208  
    2209      name2i[d.name] = i
    2210      code2i[d.code] = i
    2211  
    2212  del name2i, code2i, i, d
    2213  
    2214  ##############################################################################
    2215  # Build a code2op dict, mapping opcode characters to OpcodeInfo records.
    2216  # Also ensure we've got the same stuff as pickle.py, although the
    2217  # introspection here is dicey.
    2218  
    2219  code2op = {}
    2220  for d in opcodes:
    2221      code2op[d.code] = d
    2222  del d
    2223  
    2224  def assure_pickle_consistency(verbose=False):
    2225  
    2226      copy = code2op.copy()
    2227      for name in pickle.__all__:
    2228          if not re.match("[A-Z][A-Z0-9_]+$", name):
    2229              if verbose:
    2230                  print("skipping %r: it doesn't look like an opcode name" % name)
    2231              continue
    2232          picklecode = getattr(pickle, name)
    2233          if not isinstance(picklecode, bytes) or len(picklecode) != 1:
    2234              if verbose:
    2235                  print(("skipping %r: value %r doesn't look like a pickle "
    2236                         "code" % (name, picklecode)))
    2237              continue
    2238          picklecode = picklecode.decode("latin-1")
    2239          if picklecode in copy:
    2240              if verbose:
    2241                  print("checking name %r w/ code %r for consistency" % (
    2242                        name, picklecode))
    2243              d = copy[picklecode]
    2244              if d.name != name:
    2245                  raise ValueError("for pickle code %r, pickle.py uses name %r "
    2246                                   "but we're using name %r" % (picklecode,
    2247                                                                name,
    2248                                                                d.name))
    2249              # Forget this one.  Any left over in copy at the end are a problem
    2250              # of a different kind.
    2251              del copy[picklecode]
    2252          else:
    2253              raise ValueError("pickle.py appears to have a pickle opcode with "
    2254                               "name %r and code %r, but we don't" %
    2255                               (name, picklecode))
    2256      if copy:
    2257          msg = ["we appear to have pickle opcodes that pickle.py doesn't have:"]
    2258          for code, d in copy.items():
    2259              msg.append("    name %r with code %r" % (d.name, code))
    2260          raise ValueError("\n".join(msg))
    2261  
    2262  assure_pickle_consistency()
    2263  del assure_pickle_consistency
    2264  
    2265  ##############################################################################
    2266  # A pickle opcode generator.
    2267  
    2268  def _genops(data, yield_end_pos=False):
    2269      if isinstance(data, bytes_types):
    2270          data = io.BytesIO(data)
    2271  
    2272      if hasattr(data, "tell"):
    2273          getpos = data.tell
    2274      else:
    2275          getpos = lambda: None
    2276  
    2277      while True:
    2278          pos = getpos()
    2279          code = data.read(1)
    2280          opcode = code2op.get(code.decode("latin-1"))
    2281          if opcode is None:
    2282              if code == b"":
    2283                  raise ValueError("pickle exhausted before seeing STOP")
    2284              else:
    2285                  raise ValueError("at position %s, opcode %r unknown" % (
    2286                                   "<unknown>" if pos is None else pos,
    2287                                   code))
    2288          if opcode.arg is None:
    2289              arg = None
    2290          else:
    2291              arg = opcode.arg.reader(data)
    2292          if yield_end_pos:
    2293              yield opcode, arg, pos, getpos()
    2294          else:
    2295              yield opcode, arg, pos
    2296          if code == b'.':
    2297              assert opcode.name == 'STOP'
    2298              break
    2299  
    2300  def genops(pickle):
    2301      """Generate all the opcodes in a pickle.
    2302  
    2303      'pickle' is a file-like object, or string, containing the pickle.
    2304  
    2305      Each opcode in the pickle is generated, from the current pickle position,
    2306      stopping after a STOP opcode is delivered.  A triple is generated for
    2307      each opcode:
    2308  
    2309          opcode, arg, pos
    2310  
    2311      opcode is an OpcodeInfo record, describing the current opcode.
    2312  
    2313      If the opcode has an argument embedded in the pickle, arg is its decoded
    2314      value, as a Python object.  If the opcode doesn't have an argument, arg
    2315      is None.
    2316  
    2317      If the pickle has a tell() method, pos was the value of pickle.tell()
    2318      before reading the current opcode.  If the pickle is a bytes object,
    2319      it's wrapped in a BytesIO object, and the latter's tell() result is
    2320      used.  Else (the pickle doesn't have a tell(), and it's not obvious how
    2321      to query its current position) pos is None.
    2322      """
    2323      return _genops(pickle)
    2324  
    2325  ##############################################################################
    2326  # A pickle optimizer.
    2327  
    2328  def optimize(p):
    2329      'Optimize a pickle string by removing unused PUT opcodes'
    2330      put = 'PUT'
    2331      get = 'GET'
    2332      oldids = set()          # set of all PUT ids
    2333      newids = {}             # set of ids used by a GET opcode
    2334      opcodes = []            # (op, idx) or (pos, end_pos)
    2335      proto = 0
    2336      protoheader = b''
    2337      for opcode, arg, pos, end_pos in _genops(p, yield_end_pos=True):
    2338          if 'PUT' in opcode.name:
    2339              oldids.add(arg)
    2340              opcodes.append((put, arg))
    2341          elif opcode.name == 'MEMOIZE':
    2342              idx = len(oldids)
    2343              oldids.add(idx)
    2344              opcodes.append((put, idx))
    2345          elif 'FRAME' in opcode.name:
    2346              pass
    2347          elif 'GET' in opcode.name:
    2348              if opcode.proto > proto:
    2349                  proto = opcode.proto
    2350              newids[arg] = None
    2351              opcodes.append((get, arg))
    2352          elif opcode.name == 'PROTO':
    2353              if arg > proto:
    2354                  proto = arg
    2355              if pos == 0:
    2356                  protoheader = p[pos:end_pos]
    2357              else:
    2358                  opcodes.append((pos, end_pos))
    2359          else:
    2360              opcodes.append((pos, end_pos))
    2361      del oldids
    2362  
    2363      # Copy the opcodes except for PUTS without a corresponding GET
    2364      out = io.BytesIO()
    2365      # Write the PROTO header before any framing
    2366      out.write(protoheader)
    2367      pickler = pickle._Pickler(out, proto)
    2368      if proto >= 4:
    2369          pickler.framer.start_framing()
    2370      idx = 0
    2371      for op, arg in opcodes:
    2372          frameless = False
    2373          if op is put:
    2374              if arg not in newids:
    2375                  continue
    2376              data = pickler.put(idx)
    2377              newids[arg] = idx
    2378              idx += 1
    2379          elif op is get:
    2380              data = pickler.get(newids[arg])
    2381          else:
    2382              data = p[op:arg]
    2383              frameless = len(data) > pickler.framer._FRAME_SIZE_TARGET
    2384          pickler.framer.commit_frame(force=frameless)
    2385          if frameless:
    2386              pickler.framer.file_write(data)
    2387          else:
    2388              pickler.write(data)
    2389      pickler.framer.end_framing()
    2390      return out.getvalue()
    2391  
    2392  ##############################################################################
    2393  # A symbolic pickle disassembler.
    2394  
    2395  def dis(pickle, out=None, memo=None, indentlevel=4, annotate=0):
    2396      """Produce a symbolic disassembly of a pickle.
    2397  
    2398      'pickle' is a file-like object, or string, containing a (at least one)
    2399      pickle.  The pickle is disassembled from the current position, through
    2400      the first STOP opcode encountered.
    2401  
    2402      Optional arg 'out' is a file-like object to which the disassembly is
    2403      printed.  It defaults to sys.stdout.
    2404  
    2405      Optional arg 'memo' is a Python dict, used as the pickle's memo.  It
    2406      may be mutated by dis(), if the pickle contains PUT or BINPUT opcodes.
    2407      Passing the same memo object to another dis() call then allows disassembly
    2408      to proceed across multiple pickles that were all created by the same
    2409      pickler with the same memo.  Ordinarily you don't need to worry about this.
    2410  
    2411      Optional arg 'indentlevel' is the number of blanks by which to indent
    2412      a new MARK level.  It defaults to 4.
    2413  
    2414      Optional arg 'annotate' if nonzero instructs dis() to add short
    2415      description of the opcode on each line of disassembled output.
    2416      The value given to 'annotate' must be an integer and is used as a
    2417      hint for the column where annotation should start.  The default
    2418      value is 0, meaning no annotations.
    2419  
    2420      In addition to printing the disassembly, some sanity checks are made:
    2421  
    2422      + All embedded opcode arguments "make sense".
    2423  
    2424      + Explicit and implicit pop operations have enough items on the stack.
    2425  
    2426      + When an opcode implicitly refers to a markobject, a markobject is
    2427        actually on the stack.
    2428  
    2429      + A memo entry isn't referenced before it's defined.
    2430  
    2431      + The markobject isn't stored in the memo.
    2432  
    2433      + A memo entry isn't redefined.
    2434      """
    2435  
    2436      # Most of the hair here is for sanity checks, but most of it is needed
    2437      # anyway to detect when a protocol 0 POP takes a MARK off the stack
    2438      # (which in turn is needed to indent MARK blocks correctly).
    2439  
    2440      stack = []          # crude emulation of unpickler stack
    2441      if memo is None:
    2442          memo = {}       # crude emulation of unpickler memo
    2443      maxproto = -1       # max protocol number seen
    2444      markstack = []      # bytecode positions of MARK opcodes
    2445      indentchunk = ' ' * indentlevel
    2446      errormsg = None
    2447      annocol = annotate  # column hint for annotations
    2448      for opcode, arg, pos in genops(pickle):
    2449          if pos is not None:
    2450              print("%5d:" % pos, end=' ', file=out)
    2451  
    2452          line = "%-4s %s%s" % (repr(opcode.code)[1:-1],
    2453                                indentchunk * len(markstack),
    2454                                opcode.name)
    2455  
    2456          maxproto = max(maxproto, opcode.proto)
    2457          before = opcode.stack_before    # don't mutate
    2458          after = opcode.stack_after      # don't mutate
    2459          numtopop = len(before)
    2460  
    2461          # See whether a MARK should be popped.
    2462          markmsg = None
    2463          if markobject in before or (opcode.name == "POP" and
    2464                                      stack and
    2465                                      stack[-1] is markobject):
    2466              assert markobject not in after
    2467              if __debug__:
    2468                  if markobject in before:
    2469                      assert before[-1] is stackslice
    2470              if markstack:
    2471                  markpos = markstack.pop()
    2472                  if markpos is None:
    2473                      markmsg = "(MARK at unknown opcode offset)"
    2474                  else:
    2475                      markmsg = "(MARK at %d)" % markpos
    2476                  # Pop everything at and after the topmost markobject.
    2477                  while stack[-1] is not markobject:
    2478                      stack.pop()
    2479                  stack.pop()
    2480                  # Stop later code from popping too much.
    2481                  try:
    2482                      numtopop = before.index(markobject)
    2483                  except ValueError:
    2484                      assert opcode.name == "POP"
    2485                      numtopop = 0
    2486              else:
    2487                  errormsg = markmsg = "no MARK exists on stack"
    2488  
    2489          # Check for correct memo usage.
    2490          if opcode.name in ("PUT", "BINPUT", "LONG_BINPUT", "MEMOIZE"):
    2491              if opcode.name == "MEMOIZE":
    2492                  memo_idx = len(memo)
    2493                  markmsg = "(as %d)" % memo_idx
    2494              else:
    2495                  assert arg is not None
    2496                  memo_idx = arg
    2497              if memo_idx in memo:
    2498                  errormsg = "memo key %r already defined" % arg
    2499              elif not stack:
    2500                  errormsg = "stack is empty -- can't store into memo"
    2501              elif stack[-1] is markobject:
    2502                  errormsg = "can't store markobject in the memo"
    2503              else:
    2504                  memo[memo_idx] = stack[-1]
    2505          elif opcode.name in ("GET", "BINGET", "LONG_BINGET"):
    2506              if arg in memo:
    2507                  assert len(after) == 1
    2508                  after = [memo[arg]]     # for better stack emulation
    2509              else:
    2510                  errormsg = "memo key %r has never been stored into" % arg
    2511  
    2512          if arg is not None or markmsg:
    2513              # make a mild effort to align arguments
    2514              line += ' ' * (10 - len(opcode.name))
    2515              if arg is not None:
    2516                  line += ' ' + repr(arg)
    2517              if markmsg:
    2518                  line += ' ' + markmsg
    2519          if annotate:
    2520              line += ' ' * (annocol - len(line))
    2521              # make a mild effort to align annotations
    2522              annocol = len(line)
    2523              if annocol > 50:
    2524                  annocol = annotate
    2525              line += ' ' + opcode.doc.split('\n', 1)[0]
    2526          print(line, file=out)
    2527  
    2528          if errormsg:
    2529              # Note that we delayed complaining until the offending opcode
    2530              # was printed.
    2531              raise ValueError(errormsg)
    2532  
    2533          # Emulate the stack effects.
    2534          if len(stack) < numtopop:
    2535              raise ValueError("tries to pop %d items from stack with "
    2536                               "only %d items" % (numtopop, len(stack)))
    2537          if numtopop:
    2538              del stack[-numtopop:]
    2539          if markobject in after:
    2540              assert markobject not in before
    2541              markstack.append(pos)
    2542  
    2543          stack.extend(after)
    2544  
    2545      print("highest protocol among opcodes =", maxproto, file=out)
    2546      if stack:
    2547          raise ValueError("stack not empty after STOP: %r" % stack)
    2548  
    2549  # For use in the doctest, simply as an example of a class to pickle.
    2550  class ESC[4;38;5;81m_Example:
    2551      def __init__(self, value):
    2552          self.value = value
    2553  
    2554  _dis_test = r"""
    2555  >>> import pickle
    2556  >>> x = [1, 2, (3, 4), {b'abc': "def"}]
    2557  >>> pkl0 = pickle.dumps(x, 0)
    2558  >>> dis(pkl0)
    2559      0: (    MARK
    2560      1: l        LIST       (MARK at 0)
    2561      2: p    PUT        0
    2562      5: I    INT        1
    2563      8: a    APPEND
    2564      9: I    INT        2
    2565     12: a    APPEND
    2566     13: (    MARK
    2567     14: I        INT        3
    2568     17: I        INT        4
    2569     20: t        TUPLE      (MARK at 13)
    2570     21: p    PUT        1
    2571     24: a    APPEND
    2572     25: (    MARK
    2573     26: d        DICT       (MARK at 25)
    2574     27: p    PUT        2
    2575     30: c    GLOBAL     '_codecs encode'
    2576     46: p    PUT        3
    2577     49: (    MARK
    2578     50: V        UNICODE    'abc'
    2579     55: p        PUT        4
    2580     58: V        UNICODE    'latin1'
    2581     66: p        PUT        5
    2582     69: t        TUPLE      (MARK at 49)
    2583     70: p    PUT        6
    2584     73: R    REDUCE
    2585     74: p    PUT        7
    2586     77: V    UNICODE    'def'
    2587     82: p    PUT        8
    2588     85: s    SETITEM
    2589     86: a    APPEND
    2590     87: .    STOP
    2591  highest protocol among opcodes = 0
    2592  
    2593  Try again with a "binary" pickle.
    2594  
    2595  >>> pkl1 = pickle.dumps(x, 1)
    2596  >>> dis(pkl1)
    2597      0: ]    EMPTY_LIST
    2598      1: q    BINPUT     0
    2599      3: (    MARK
    2600      4: K        BININT1    1
    2601      6: K        BININT1    2
    2602      8: (        MARK
    2603      9: K            BININT1    3
    2604     11: K            BININT1    4
    2605     13: t            TUPLE      (MARK at 8)
    2606     14: q        BINPUT     1
    2607     16: }        EMPTY_DICT
    2608     17: q        BINPUT     2
    2609     19: c        GLOBAL     '_codecs encode'
    2610     35: q        BINPUT     3
    2611     37: (        MARK
    2612     38: X            BINUNICODE 'abc'
    2613     46: q            BINPUT     4
    2614     48: X            BINUNICODE 'latin1'
    2615     59: q            BINPUT     5
    2616     61: t            TUPLE      (MARK at 37)
    2617     62: q        BINPUT     6
    2618     64: R        REDUCE
    2619     65: q        BINPUT     7
    2620     67: X        BINUNICODE 'def'
    2621     75: q        BINPUT     8
    2622     77: s        SETITEM
    2623     78: e        APPENDS    (MARK at 3)
    2624     79: .    STOP
    2625  highest protocol among opcodes = 1
    2626  
    2627  Exercise the INST/OBJ/BUILD family.
    2628  
    2629  >>> import pickletools
    2630  >>> dis(pickle.dumps(pickletools.dis, 0))
    2631      0: c    GLOBAL     'pickletools dis'
    2632     17: p    PUT        0
    2633     20: .    STOP
    2634  highest protocol among opcodes = 0
    2635  
    2636  >>> from pickletools import _Example
    2637  >>> x = [_Example(42)] * 2
    2638  >>> dis(pickle.dumps(x, 0))
    2639      0: (    MARK
    2640      1: l        LIST       (MARK at 0)
    2641      2: p    PUT        0
    2642      5: c    GLOBAL     'copy_reg _reconstructor'
    2643     30: p    PUT        1
    2644     33: (    MARK
    2645     34: c        GLOBAL     'pickletools _Example'
    2646     56: p        PUT        2
    2647     59: c        GLOBAL     '__builtin__ object'
    2648     79: p        PUT        3
    2649     82: N        NONE
    2650     83: t        TUPLE      (MARK at 33)
    2651     84: p    PUT        4
    2652     87: R    REDUCE
    2653     88: p    PUT        5
    2654     91: (    MARK
    2655     92: d        DICT       (MARK at 91)
    2656     93: p    PUT        6
    2657     96: V    UNICODE    'value'
    2658    103: p    PUT        7
    2659    106: I    INT        42
    2660    110: s    SETITEM
    2661    111: b    BUILD
    2662    112: a    APPEND
    2663    113: g    GET        5
    2664    116: a    APPEND
    2665    117: .    STOP
    2666  highest protocol among opcodes = 0
    2667  
    2668  >>> dis(pickle.dumps(x, 1))
    2669      0: ]    EMPTY_LIST
    2670      1: q    BINPUT     0
    2671      3: (    MARK
    2672      4: c        GLOBAL     'copy_reg _reconstructor'
    2673     29: q        BINPUT     1
    2674     31: (        MARK
    2675     32: c            GLOBAL     'pickletools _Example'
    2676     54: q            BINPUT     2
    2677     56: c            GLOBAL     '__builtin__ object'
    2678     76: q            BINPUT     3
    2679     78: N            NONE
    2680     79: t            TUPLE      (MARK at 31)
    2681     80: q        BINPUT     4
    2682     82: R        REDUCE
    2683     83: q        BINPUT     5
    2684     85: }        EMPTY_DICT
    2685     86: q        BINPUT     6
    2686     88: X        BINUNICODE 'value'
    2687     98: q        BINPUT     7
    2688    100: K        BININT1    42
    2689    102: s        SETITEM
    2690    103: b        BUILD
    2691    104: h        BINGET     5
    2692    106: e        APPENDS    (MARK at 3)
    2693    107: .    STOP
    2694  highest protocol among opcodes = 1
    2695  
    2696  Try "the canonical" recursive-object test.
    2697  
    2698  >>> L = []
    2699  >>> T = L,
    2700  >>> L.append(T)
    2701  >>> L[0] is T
    2702  True
    2703  >>> T[0] is L
    2704  True
    2705  >>> L[0][0] is L
    2706  True
    2707  >>> T[0][0] is T
    2708  True
    2709  >>> dis(pickle.dumps(L, 0))
    2710      0: (    MARK
    2711      1: l        LIST       (MARK at 0)
    2712      2: p    PUT        0
    2713      5: (    MARK
    2714      6: g        GET        0
    2715      9: t        TUPLE      (MARK at 5)
    2716     10: p    PUT        1
    2717     13: a    APPEND
    2718     14: .    STOP
    2719  highest protocol among opcodes = 0
    2720  
    2721  >>> dis(pickle.dumps(L, 1))
    2722      0: ]    EMPTY_LIST
    2723      1: q    BINPUT     0
    2724      3: (    MARK
    2725      4: h        BINGET     0
    2726      6: t        TUPLE      (MARK at 3)
    2727      7: q    BINPUT     1
    2728      9: a    APPEND
    2729     10: .    STOP
    2730  highest protocol among opcodes = 1
    2731  
    2732  Note that, in the protocol 0 pickle of the recursive tuple, the disassembler
    2733  has to emulate the stack in order to realize that the POP opcode at 16 gets
    2734  rid of the MARK at 0.
    2735  
    2736  >>> dis(pickle.dumps(T, 0))
    2737      0: (    MARK
    2738      1: (        MARK
    2739      2: l            LIST       (MARK at 1)
    2740      3: p        PUT        0
    2741      6: (        MARK
    2742      7: g            GET        0
    2743     10: t            TUPLE      (MARK at 6)
    2744     11: p        PUT        1
    2745     14: a        APPEND
    2746     15: 0        POP
    2747     16: 0        POP        (MARK at 0)
    2748     17: g    GET        1
    2749     20: .    STOP
    2750  highest protocol among opcodes = 0
    2751  
    2752  >>> dis(pickle.dumps(T, 1))
    2753      0: (    MARK
    2754      1: ]        EMPTY_LIST
    2755      2: q        BINPUT     0
    2756      4: (        MARK
    2757      5: h            BINGET     0
    2758      7: t            TUPLE      (MARK at 4)
    2759      8: q        BINPUT     1
    2760     10: a        APPEND
    2761     11: 1        POP_MARK   (MARK at 0)
    2762     12: h    BINGET     1
    2763     14: .    STOP
    2764  highest protocol among opcodes = 1
    2765  
    2766  Try protocol 2.
    2767  
    2768  >>> dis(pickle.dumps(L, 2))
    2769      0: \x80 PROTO      2
    2770      2: ]    EMPTY_LIST
    2771      3: q    BINPUT     0
    2772      5: h    BINGET     0
    2773      7: \x85 TUPLE1
    2774      8: q    BINPUT     1
    2775     10: a    APPEND
    2776     11: .    STOP
    2777  highest protocol among opcodes = 2
    2778  
    2779  >>> dis(pickle.dumps(T, 2))
    2780      0: \x80 PROTO      2
    2781      2: ]    EMPTY_LIST
    2782      3: q    BINPUT     0
    2783      5: h    BINGET     0
    2784      7: \x85 TUPLE1
    2785      8: q    BINPUT     1
    2786     10: a    APPEND
    2787     11: 0    POP
    2788     12: h    BINGET     1
    2789     14: .    STOP
    2790  highest protocol among opcodes = 2
    2791  
    2792  Try protocol 3 with annotations:
    2793  
    2794  >>> dis(pickle.dumps(T, 3), annotate=1)
    2795      0: \x80 PROTO      3 Protocol version indicator.
    2796      2: ]    EMPTY_LIST   Push an empty list.
    2797      3: q    BINPUT     0 Store the stack top into the memo.  The stack is not popped.
    2798      5: h    BINGET     0 Read an object from the memo and push it on the stack.
    2799      7: \x85 TUPLE1       Build a one-tuple out of the topmost item on the stack.
    2800      8: q    BINPUT     1 Store the stack top into the memo.  The stack is not popped.
    2801     10: a    APPEND       Append an object to a list.
    2802     11: 0    POP          Discard the top stack item, shrinking the stack by one item.
    2803     12: h    BINGET     1 Read an object from the memo and push it on the stack.
    2804     14: .    STOP         Stop the unpickling machine.
    2805  highest protocol among opcodes = 2
    2806  
    2807  """
    2808  
    2809  _memo_test = r"""
    2810  >>> import pickle
    2811  >>> import io
    2812  >>> f = io.BytesIO()
    2813  >>> p = pickle.Pickler(f, 2)
    2814  >>> x = [1, 2, 3]
    2815  >>> p.dump(x)
    2816  >>> p.dump(x)
    2817  >>> f.seek(0)
    2818  0
    2819  >>> memo = {}
    2820  >>> dis(f, memo=memo)
    2821      0: \x80 PROTO      2
    2822      2: ]    EMPTY_LIST
    2823      3: q    BINPUT     0
    2824      5: (    MARK
    2825      6: K        BININT1    1
    2826      8: K        BININT1    2
    2827     10: K        BININT1    3
    2828     12: e        APPENDS    (MARK at 5)
    2829     13: .    STOP
    2830  highest protocol among opcodes = 2
    2831  >>> dis(f, memo=memo)
    2832     14: \x80 PROTO      2
    2833     16: h    BINGET     0
    2834     18: .    STOP
    2835  highest protocol among opcodes = 2
    2836  """
    2837  
# Extra doctest sources, keyed by test name.  doctest.testmod() looks for a
# module-level __test__ mapping and runs the examples found in its string
# values.
__test__ = {'disassembler_test': _dis_test,
            'disassembler_memo_test': _memo_test,
           }
    2841  
    2842  def _test():
    2843      import doctest
    2844      return doctest.testmod()
    2845  
    2846  if __name__ == "__main__":
    2847      import argparse
    2848      parser = argparse.ArgumentParser(
    2849          description='disassemble one or more pickle files')
    2850      parser.add_argument(
    2851          'pickle_file', type=argparse.FileType('br'),
    2852          nargs='*', help='the pickle file')
    2853      parser.add_argument(
    2854          '-o', '--output', default=sys.stdout, type=argparse.FileType('w'),
    2855          help='the file where the output should be written')
    2856      parser.add_argument(
    2857          '-m', '--memo', action='store_true',
    2858          help='preserve memo between disassemblies')
    2859      parser.add_argument(
    2860          '-l', '--indentlevel', default=4, type=int,
    2861          help='the number of blanks by which to indent a new MARK level')
    2862      parser.add_argument(
    2863          '-a', '--annotate',  action='store_true',
    2864          help='annotate each line with a short opcode description')
    2865      parser.add_argument(
    2866          '-p', '--preamble', default="==> {name} <==",
    2867          help='if more than one pickle file is specified, print this before'
    2868          ' each disassembly')
    2869      parser.add_argument(
    2870          '-t', '--test', action='store_true',
    2871          help='run self-test suite')
    2872      parser.add_argument(
    2873          '-v', action='store_true',
    2874          help='run verbosely; only affects self-test run')
    2875      args = parser.parse_args()
    2876      if args.test:
    2877          _test()
    2878      else:
    2879          annotate = 30 if args.annotate else 0
    2880          if not args.pickle_file:
    2881              parser.print_help()
    2882          elif len(args.pickle_file) == 1:
    2883              dis(args.pickle_file[0], args.output, None,
    2884                  args.indentlevel, annotate)
    2885          else:
    2886              memo = {} if args.memo else None
    2887              for f in args.pickle_file:
    2888                  preamble = args.preamble.format(name=f.name)
    2889                  args.output.write(preamble + '\n')
    2890                  dis(f, args.output, memo, args.indentlevel, annotate)