1  #! /usr/bin/env python3
       2  
       3  """
       4  combinerefs path
       5  
       6  A helper for analyzing PYTHONDUMPREFS output.
       7  
       8  When the PYTHONDUMPREFS envar is set in a debug build, at Python shutdown
       9  time Py_FinalizeEx() prints the list of all live objects twice:  first it
      10  prints the repr() of each object while the interpreter is still fully intact.
      11  After cleaning up everything it can, it prints all remaining live objects
      12  again, but the second time just prints their addresses, refcounts, and type
      13  names (because the interpreter has been torn down, calling repr methods at
      14  this point can get into infinite loops or blow up).
      15  
      16  Save all this output into a file, then run this script passing the path to
      17  that file.  The script finds both output chunks, combines them, then prints
      18  a line of output for each object still alive at the end:
      19  
      20      address refcnt typename repr
      21  
      22  address is the address of the object, in whatever format the platform C
      23  produces for a %p format code.
      24  
      25  refcnt is of the form
      26  
      27      "[" ref "]"
      28  
      29  when the object's refcount is the same in both PYTHONDUMPREFS output blocks,
      30  or
      31  
      32      "[" ref_before "->" ref_after "]"
      33  
      34  if the refcount changed.
      35  
      36  typename is Py_TYPE(object)->tp_name, extracted from the second PYTHONDUMPREFS
      37  output block.
      38  
      39  repr is repr(object), extracted from the first PYTHONDUMPREFS output block.
      40  CAUTION:  If object is a container type, it may not actually contain all the
      41  objects shown in the repr:  the repr was captured from the first output block,
      42  and some of the containees may have been released since then.  For example,
      43  it's common for the line showing the dict of interned strings to display
      44  strings that no longer exist at the end of Py_FinalizeEx; this can be recognized
      45  (albeit painfully) because such containees don't have a line of their own.
      46  
      47  The objects are listed in allocation order, with most-recently allocated
      48  printed first, and the first object allocated printed last.
      49  
      50  
      51  Simple examples:
      52  
      53      00857060 [14] str '__len__'
      54  
      55  The str object '__len__' is alive at shutdown time, and both PYTHONDUMPREFS
      56  output blocks said there were 14 references to it.  This is probably due to
      57  C modules that intern the string "__len__" and keep a reference to it in a
      58  file static.
      59  
      60      00857038 [46->5] tuple ()
      61  
      62  46-5 = 41 references to the empty tuple were removed by the cleanup actions
      63  between the times PYTHONDUMPREFS produced output.
      64  
      65      00858028 [1025->1456] str '<dummy key>'
      66  
      67  The string '<dummy key>', which is used in dictobject.c to overwrite a real
      68  key that gets deleted, grew several hundred references during cleanup.  It
      69  suggests that stuff did get removed from dicts by cleanup, but that the dicts
      70  themselves are staying alive for some reason. """
      71  
      72  import re
      73  import sys
      74  
      75  # Generate lines from fileiter.  If whilematch is true, continue reading
      76  # while the regexp object pat matches line.  If whilematch is false, lines
      77  # are read so long as pat doesn't match them.  In any case, the first line
      78  # that doesn't match pat (when whilematch is true), or that does match pat
      79  # (when whilematch is false), is lost, and fileiter will resume at the line
      80  # following it.
      81  def read(fileiter, pat, whilematch):
      82      for line in fileiter:
      83          if bool(pat.match(line)) == whilematch:
      84              yield line
      85          else:
      86              break
      87  
      88  def combinefile(f):
      89      fi = iter(f)
      90  
      91      for line in read(fi, re.compile(r'^Remaining objects:$'), False):
      92          pass
      93  
      94      crack = re.compile(r'([a-zA-Z\d]+) \[(\d+)\] (.*)')
      95      addr2rc = {}
      96      addr2guts = {}
      97      before = 0
      98      for line in read(fi, re.compile(r'^Remaining object addresses:$'), False):
      99          m = crack.match(line)
     100          if m:
     101              addr, addr2rc[addr], addr2guts[addr] = m.groups()
     102              before += 1
     103          else:
     104              print('??? skipped:', line)
     105  
     106      after = 0
     107      for line in read(fi, crack, True):
     108          after += 1
     109          m = crack.match(line)
     110          assert m
     111          addr, rc, guts = m.groups() # guts is type name here
     112          if addr not in addr2rc:
     113              print('??? new object created while tearing down:', line.rstrip())
     114              continue
     115          print(addr, end=' ')
     116          if rc == addr2rc[addr]:
     117              print('[%s]' % rc, end=' ')
     118          else:
     119              print('[%s->%s]' % (addr2rc[addr], rc), end=' ')
     120          print(guts, addr2guts[addr])
     121  
     122      print("%d objects before, %d after" % (before, after))
     123  
     124  def combine(fname):
     125      with open(fname) as f:
     126          combinefile(f)
     127  
     128  if __name__ == '__main__':
     129      combine(sys.argv[1])