1 /*
2
3 Perf trampoline instrumentation
4 ===============================
5
This file contains instrumentation that allows associating calls to the
CPython eval loop with the names of the Python functions and the filenames
being executed.
9
Many native performance profilers, like the Linux perf tool, are only able
to 'see' the C stack when sampling from the profiled process. This means
that if we have the following Python code:
13
import time

def foo(n):
    # Some CPU intensive code
    for _ in range(n):
        pass

def bar(n):
    foo(n)

def baz(n):
    bar(n)

baz(10000000)
25
26 A performance profiler that is only able to see native frames will
27 produce the following backtrace when sampling from foo():
28
29 _PyEval_EvalFrameDefault -----> Evaluation frame of foo()
30 _PyEval_Vector
31 _PyFunction_Vectorcall
32 PyObject_Vectorcall
33 call_function
34
35 _PyEval_EvalFrameDefault ------> Evaluation frame of bar()
36 _PyEval_EvalFrame
37 _PyEval_Vector
38 _PyFunction_Vectorcall
39 PyObject_Vectorcall
40 call_function
41
42 _PyEval_EvalFrameDefault -------> Evaluation frame of baz()
43 _PyEval_EvalFrame
44 _PyEval_Vector
45 _PyFunction_Vectorcall
46 PyObject_Vectorcall
47 call_function
48
49 ...
50
51 Py_RunMain
52
Because the profiler can only see native frames, and the native function
that runs the evaluation loop is always the same (_PyEval_EvalFrameDefault),
the profiler and any report generated from its output cannot associate those
calls with the names of the Python functions and the filenames they belong
to, rendering the results useless in the Python world.
58
To fix this problem, we introduce the concept of a trampoline frame. A
trampoline frame is a piece of code, unique per Python code object, that is
executed before entering the CPython eval loop. This piece of code just
calls the original Python evaluation function (_PyEval_EvalFrameDefault) and
forwards all the arguments it receives. In this way, when a profiler samples
frames from the previous example it will see:
65
66 _PyEval_EvalFrameDefault -----> Evaluation frame of foo()
67 [Jit compiled code 3]
68 _PyEval_Vector
69 _PyFunction_Vectorcall
70 PyObject_Vectorcall
71 call_function
72
73 _PyEval_EvalFrameDefault ------> Evaluation frame of bar()
74 [Jit compiled code 2]
75 _PyEval_EvalFrame
76 _PyEval_Vector
77 _PyFunction_Vectorcall
78 PyObject_Vectorcall
79 call_function
80
81 _PyEval_EvalFrameDefault -------> Evaluation frame of baz()
82 [Jit compiled code 1]
83 _PyEval_EvalFrame
84 _PyEval_Vector
85 _PyFunction_Vectorcall
86 PyObject_Vectorcall
87 call_function
88
89 ...
90
91 Py_RunMain
92
Every time we generate a unique copy of the trampoline (what we call "[Jit
compiled code N]" here) we record the relationship between the compiled code
and the Python function it is associated with. Every profiler requires this
information in a different format. For example, the Linux "perf" profiler
requires a file at "/tmp/perf-PID.map" (name and location are not
configurable) with the following format:
99
100 <compiled code address> <compiled code size> <name of the compiled code>
101
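As an illustration (the address, size and path below are made up), the entry
written for the trampoline of bar() from the example above could look like:

    7f1234560000 80 py::bar:/home/user/example.py
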
If this file is available when "perf" generates its reports, it will
automatically associate every trampoline with the Python function it belongs
to, allowing it to generate reports that include Python information. These
reports can then also be filtered so that *only* Python information appears.
106
Notice that for this to work, there must be a unique copy of the trampoline
per Python code object, even if the code in every trampoline is the same. To
achieve this we have an assembly template in Objects/asm_trampoline.S that is
compiled into the Python executable/shared library. This template generates a
symbol that marks the start of the assembly code and another that marks the
end of the assembly code for the trampoline. Then, every time we need a
unique trampoline for a Python code object, we copy the assembly code into an
mmap-ed area that has executable permissions and we return the start of that
area as our trampoline function.
116
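Functionally, every copy of the trampoline is equivalent to the following C
function (just a sketch; the real trampoline is the hand-written assembly
template mentioned above, which receives the evaluator as its last argument):

    PyObject *
    trampoline(PyThreadState *ts, _PyInterpreterFrame *frame,
               int throw, py_evaluator evaluator)
    {
        return evaluator(ts, frame, throw);
    }
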
Asking for a separate mmap-ed memory area per trampoline is very wasteful,
so we allocate big arenas of memory in a single mmap call, populate the
entire arena with copies of the trampoline (this allows us to invalidate the
icache for the instructions in the page only once, instead of once per
trampoline) and then return the next available chunk every time someone asks
for a new trampoline. We keep a linked list of arenas in case the current
memory arena is exhausted and another one is needed.
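
Schematically, an arena and its bump allocation look like this (each arena
holds size / code_size copies of the trampoline):

    start_addr                                         start_addr + size
    |- trampoline 0 -|- trampoline 1 -| ...... |- trampoline N-1 -|
                     ^
                     current_addr (next chunk handed out to callers)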
124
For the best results, Python should be compiled with
CFLAGS="-fno-omit-frame-pointer -mno-omit-leaf-frame-pointer", as this allows
profilers to unwind using only the frame pointer instead of relying on DWARF
debug information (note that, as trampolines are dynamically generated, there
won't be any DWARF information available for them).
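
For example, assuming a CPython build that enables this trampoline support, a
profiling session could look roughly like this (my_script.py is just a
placeholder name):

    $ perf record -F 9999 -g -o perf.data python -X perf my_script.py
    $ perf report -g -i perf.data
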
130 */
131
132 #include "Python.h"
133 #include "pycore_ceval.h"
134 #include "pycore_frame.h"
135 #include "pycore_interp.h"
136
137
138 #ifdef PY_HAVE_PERF_TRAMPOLINE
139
140 #include <fcntl.h>
141 #include <stdio.h>
142 #include <stdlib.h>
143 #include <sys/mman.h>
144 #include <sys/types.h>
145 #include <unistd.h>
146
147 #if defined(__arm__) || defined(__arm64__) || defined(__aarch64__)
148 #define PY_HAVE_INVALIDATE_ICACHE
149
150 #if defined(__clang__) || defined(__GNUC__)
151 extern void __clear_cache(void *, void*);
152 #endif
153
static void invalidate_icache(char *begin, char *end) {
155 #if defined(__clang__) || defined(__GNUC__)
156 return __clear_cache(begin, end);
157 #else
158 return;
159 #endif
160 }
161 #endif
162
/* The function pointer is passed as the last argument. The other three
 * arguments are passed in the same order as the evaluator requires. This
 * results in shorter, more efficient assembly code for the trampoline.
 */
167 typedef PyObject *(*py_evaluator)(PyThreadState *, _PyInterpreterFrame *,
168 int throwflag);
169 typedef PyObject *(*py_trampoline)(PyThreadState *, _PyInterpreterFrame *, int,
170 py_evaluator);
171
// Start and end of the assembly trampoline template
extern void *_Py_trampoline_func_start;
extern void *_Py_trampoline_func_end;
176
177 struct code_arena_st {
178 char *start_addr; // Start of the memory arena
179 char *current_addr; // Address of the current trampoline within the arena
180 size_t size; // Size of the memory arena
181 size_t size_left; // Remaining size of the memory arena
182 size_t code_size; // Size of the code of every trampoline in the arena
struct code_arena_st *prev; // Pointer to the previous arena, or NULL if this
                            // is the first arena.
185 };
186
187 typedef struct code_arena_st code_arena_t;
188 typedef struct trampoline_api_st trampoline_api_t;
189
190 #define perf_status _PyRuntime.ceval.perf.status
191 #define extra_code_index _PyRuntime.ceval.perf.extra_code_index
192 #define perf_code_arena _PyRuntime.ceval.perf.code_arena
193 #define trampoline_api _PyRuntime.ceval.perf.trampoline_api
194 #define perf_map_file _PyRuntime.ceval.perf.map_file
195
196
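// Default "write_state" callback: build a "py::<qualname>:<filename>" label
// for the given code object and register it via PyUnstable_WritePerfMapEntry.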
197 static void
198 perf_map_write_entry(void *state, const void *code_addr,
199 unsigned int code_size, PyCodeObject *co)
200 {
201 const char *entry = "";
202 if (co->co_qualname != NULL) {
203 entry = PyUnicode_AsUTF8(co->co_qualname);
204 }
205 const char *filename = "";
206 if (co->co_filename != NULL) {
207 filename = PyUnicode_AsUTF8(co->co_filename);
208 }
209 size_t perf_map_entry_size = snprintf(NULL, 0, "py::%s:%s", entry, filename) + 1;
210 char* perf_map_entry = (char*) PyMem_RawMalloc(perf_map_entry_size);
211 if (perf_map_entry == NULL) {
212 return;
213 }
214 snprintf(perf_map_entry, perf_map_entry_size, "py::%s:%s", entry, filename);
215 PyUnstable_WritePerfMapEntry(code_addr, code_size, perf_map_entry);
216 PyMem_RawFree(perf_map_entry);
217 }
218
219 _PyPerf_Callbacks _Py_perfmap_callbacks = {
220 NULL,
221 &perf_map_write_entry,
222 NULL,
223 };
224
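// Allocate a new executable arena with mmap, fill it with copies of the
// assembly trampoline template and push it onto the arena linked list.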
225 static int
226 new_code_arena(void)
227 {
228 // non-trivial programs typically need 64 to 256 kiB.
229 size_t mem_size = 4096 * 16;
230 assert(mem_size % sysconf(_SC_PAGESIZE) == 0);
231 char *memory =
232 mmap(NULL, // address
233 mem_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS,
234 -1, // fd (not used here)
235 0); // offset (not used here)
if (memory == MAP_FAILED) {
237 PyErr_SetFromErrno(PyExc_OSError);
238 _PyErr_WriteUnraisableMsg(
239 "Failed to create new mmap for perf trampoline", NULL);
240 perf_status = PERF_STATUS_FAILED;
241 return -1;
242 }
243 void *start = &_Py_trampoline_func_start;
244 void *end = &_Py_trampoline_func_end;
245 size_t code_size = end - start;
246 // TODO: Check the effect of alignment of the code chunks. Initial investigation
247 // showed that this has no effect on performance in x86-64 or aarch64 and the current
248 // version has the advantage that the unwinder in GDB can unwind across JIT-ed code.
249 //
250 // We should check the values in the future and see if there is a
251 // measurable performance improvement by rounding trampolines up to 32-bit
252 // or 64-bit alignment.
253
254 size_t n_copies = mem_size / code_size;
255 for (size_t i = 0; i < n_copies; i++) {
256 memcpy(memory + i * code_size, start, code_size * sizeof(char));
257 }
258 // Some systems may prevent us from creating executable code on the fly.
259 int res = mprotect(memory, mem_size, PROT_READ | PROT_EXEC);
260 if (res == -1) {
261 PyErr_SetFromErrno(PyExc_OSError);
262 munmap(memory, mem_size);
263 _PyErr_WriteUnraisableMsg(
264 "Failed to set mmap for perf trampoline to PROT_READ | PROT_EXEC",
265 NULL);
266 return -1;
267 }
268
269 #ifdef PY_HAVE_INVALIDATE_ICACHE
// Before the JIT can run a block of code that has been emitted, it must
// invalidate the instruction cache on some platforms, such as arm and aarch64.
272 invalidate_icache(memory, memory + mem_size);
273 #endif
274
275 code_arena_t *new_arena = PyMem_RawCalloc(1, sizeof(code_arena_t));
276 if (new_arena == NULL) {
277 PyErr_NoMemory();
278 munmap(memory, mem_size);
279 _PyErr_WriteUnraisableMsg("Failed to allocate new code arena struct",
280 NULL);
281 return -1;
282 }
283
284 new_arena->start_addr = memory;
285 new_arena->current_addr = memory;
286 new_arena->size = mem_size;
287 new_arena->size_left = mem_size;
288 new_arena->code_size = code_size;
289 new_arena->prev = perf_code_arena;
290 perf_code_arena = new_arena;
291 return 0;
292 }
293
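// Unmap and free every arena in the linked list (used at finalization and
// after fork in the child).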
294 static void
295 free_code_arenas(void)
296 {
297 code_arena_t *cur = perf_code_arena;
298 code_arena_t *prev;
perf_code_arena = NULL; // invalidate the static pointer
300 while (cur) {
301 munmap(cur->start_addr, cur->size);
302 prev = cur->prev;
303 PyMem_RawFree(cur);
304 cur = prev;
305 }
306 }
307
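// Hand out the next free chunk of the given arena as a trampoline (simple
// bump allocation; the caller guarantees that there is room left).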
308 static inline py_trampoline
309 code_arena_new_code(code_arena_t *code_arena)
310 {
311 py_trampoline trampoline = (py_trampoline)code_arena->current_addr;
312 code_arena->size_left -= code_arena->code_size;
313 code_arena->current_addr += code_arena->code_size;
314 return trampoline;
315 }
316
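// Return a fresh trampoline, allocating a new arena first if there is no
// arena yet or the current one has no room left.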
317 static inline py_trampoline
318 compile_trampoline(void)
319 {
320 if ((perf_code_arena == NULL) ||
321 (perf_code_arena->size_left <= perf_code_arena->code_size)) {
322 if (new_code_arena() < 0) {
323 return NULL;
324 }
325 }
326 assert(perf_code_arena->size_left <= perf_code_arena->size);
327 return code_arena_new_code(perf_code_arena);
328 }
329
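// Frame evaluation hook installed by _PyPerfTrampoline_Init(): look up (or
// lazily compile and register) the trampoline for this code object and enter
// _PyEval_EvalFrameDefault through it so the call shows up in native stacks.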
330 static PyObject *
331 py_trampoline_evaluator(PyThreadState *ts, _PyInterpreterFrame *frame,
332 int throw)
333 {
334 if (perf_status == PERF_STATUS_FAILED ||
335 perf_status == PERF_STATUS_NO_INIT) {
336 goto default_eval;
337 }
338 PyCodeObject *co = frame->f_code;
339 py_trampoline f = NULL;
340 assert(extra_code_index != -1);
341 int ret = _PyCode_GetExtra((PyObject *)co, extra_code_index, (void **)&f);
342 if (ret != 0 || f == NULL) {
343 // This is the first time we see this code object so we need
344 // to compile a trampoline for it.
345 py_trampoline new_trampoline = compile_trampoline();
346 if (new_trampoline == NULL) {
347 goto default_eval;
348 }
349 trampoline_api.write_state(trampoline_api.state, new_trampoline,
350 perf_code_arena->code_size, co);
351 _PyCode_SetExtra((PyObject *)co, extra_code_index,
352 (void *)new_trampoline);
353 f = new_trampoline;
354 }
355 assert(f != NULL);
356 return f(ts, frame, throw, _PyEval_EvalFrameDefault);
357 default_eval:
358 // Something failed, fall back to the default evaluator.
359 return _PyEval_EvalFrameDefault(ts, frame, throw);
360 }
361 #endif // PY_HAVE_PERF_TRAMPOLINE
362
363 int
364 _PyIsPerfTrampolineActive(void)
365 {
366 #ifdef PY_HAVE_PERF_TRAMPOLINE
367 PyThreadState *tstate = _PyThreadState_GET();
368 return tstate->interp->eval_frame == py_trampoline_evaluator;
369 #endif
370 return 0;
371 }
372
373 void
374 _PyPerfTrampoline_GetCallbacks(_PyPerf_Callbacks *callbacks)
375 {
376 if (callbacks == NULL) {
377 return;
378 }
379 #ifdef PY_HAVE_PERF_TRAMPOLINE
380 callbacks->init_state = trampoline_api.init_state;
381 callbacks->write_state = trampoline_api.write_state;
382 callbacks->free_state = trampoline_api.free_state;
383 #endif
384 return;
385 }
386
387 int
388 _PyPerfTrampoline_SetCallbacks(_PyPerf_Callbacks *callbacks)
389 {
390 if (callbacks == NULL) {
391 return -1;
392 }
393 #ifdef PY_HAVE_PERF_TRAMPOLINE
394 if (trampoline_api.state) {
395 _PyPerfTrampoline_Fini();
396 }
397 trampoline_api.init_state = callbacks->init_state;
398 trampoline_api.write_state = callbacks->write_state;
399 trampoline_api.free_state = callbacks->free_state;
400 trampoline_api.state = NULL;
401 perf_status = PERF_STATUS_OK;
402 #endif
403 return 0;
404 }
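
/* A sketch of how an embedder could install its own callbacks instead of the
 * default perf map writer (my_init_state, my_write_entry and my_free_state
 * are hypothetical functions matching the _PyPerf_Callbacks slots):
 *
 *     _PyPerf_Callbacks my_callbacks = {
 *         &my_init_state, &my_write_entry, &my_free_state,
 *     };
 *     _PyPerfTrampoline_SetCallbacks(&my_callbacks);
 *     _PyPerfTrampoline_Init(1);
 */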
405
406 int
407 _PyPerfTrampoline_Init(int activate)
408 {
409 #ifdef PY_HAVE_PERF_TRAMPOLINE
410 PyThreadState *tstate = _PyThreadState_GET();
411 if (tstate->interp->eval_frame &&
412 tstate->interp->eval_frame != py_trampoline_evaluator) {
413 PyErr_SetString(PyExc_RuntimeError,
414 "Trampoline cannot be initialized as a custom eval "
415 "frame is already present");
416 return -1;
417 }
418 if (!activate) {
419 tstate->interp->eval_frame = NULL;
420 }
421 else {
422 tstate->interp->eval_frame = py_trampoline_evaluator;
423 if (new_code_arena() < 0) {
424 return -1;
425 }
426 extra_code_index = _PyEval_RequestCodeExtraIndex(NULL);
427 if (extra_code_index == -1) {
428 return -1;
429 }
430 perf_status = PERF_STATUS_OK;
431 }
432 #endif
433 return 0;
434 }
435
436 int
437 _PyPerfTrampoline_Fini(void)
438 {
439 #ifdef PY_HAVE_PERF_TRAMPOLINE
440 PyThreadState *tstate = _PyThreadState_GET();
441 if (tstate->interp->eval_frame == py_trampoline_evaluator) {
442 tstate->interp->eval_frame = NULL;
443 }
444 free_code_arenas();
445 extra_code_index = -1;
446 #endif
447 return 0;
448 }
449
450 PyStatus
451 _PyPerfTrampoline_AfterFork_Child(void)
452 {
453 #ifdef PY_HAVE_PERF_TRAMPOLINE
// Restart the trampoline and the perf map file in the child process.
455 int was_active = _PyIsPerfTrampolineActive();
456 _PyPerfTrampoline_Fini();
457 PyUnstable_PerfMapState_Fini();
458 if (was_active) {
459 _PyPerfTrampoline_Init(1);
460 }
461 #endif
462 return PyStatus_Ok();
463 }