1 /*
2
3 Perf trampoline instrumentation
4 ===============================
5
This file contains instrumentation that allows associating calls to the
CPython eval loop with the names of the Python functions and the filenames
being executed.
9
Many native performance profilers, like the Linux perf tool, are only able
to 'see' the C stack when sampling from the profiled process. This means
that if we have the following Python code:
13
import time

def foo(n):
    # Some CPU intensive code
    for _ in range(n):
        pass

def bar(n):
    foo(n)

def baz(n):
    bar(n)

baz(10000000)
25
26 A performance profiler that is only able to see native frames will
27 produce the following backtrace when sampling from foo():
28
29 _PyEval_EvalFrameDefault -----> Evaluation frame of foo()
30 _PyEval_Vector
31 _PyFunction_Vectorcall
32 PyObject_Vectorcall
33 call_function
34
35 _PyEval_EvalFrameDefault ------> Evaluation frame of bar()
36 _PyEval_EvalFrame
37 _PyEval_Vector
38 _PyFunction_Vectorcall
39 PyObject_Vectorcall
40 call_function
41
42 _PyEval_EvalFrameDefault -------> Evaluation frame of baz()
43 _PyEval_EvalFrame
44 _PyEval_Vector
45 _PyFunction_Vectorcall
46 PyObject_Vectorcall
47 call_function
48
49 ...
50
51 Py_RunMain
52
Because the profiler can only see native frames, and the native function
that runs the evaluation loop is always the same (_PyEval_EvalFrameDefault),
the profiler and any report generated from its output cannot associate those
calls with the names of the Python functions and the filenames they belong
to, rendering the results useless in the Python world.
58
To fix this problem, we introduce the concept of a trampoline frame. A
trampoline frame is a piece of code, unique per Python code object, that is
executed before entering the CPython eval loop. This piece of code just
calls the original Python evaluation function (_PyEval_EvalFrameDefault) and
forwards all the arguments it receives. In this way, when a profiler samples
frames from the previous example it will see:
65
66 _PyEval_EvalFrameDefault -----> Evaluation frame of foo()
67 [Jit compiled code 3]
68 _PyEval_Vector
69 _PyFunction_Vectorcall
70 PyObject_Vectorcall
71 call_function
72
73 _PyEval_EvalFrameDefault ------> Evaluation frame of bar()
74 [Jit compiled code 2]
75 _PyEval_EvalFrame
76 _PyEval_Vector
77 _PyFunction_Vectorcall
78 PyObject_Vectorcall
79 call_function
80
81 _PyEval_EvalFrameDefault -------> Evaluation frame of baz()
82 [Jit compiled code 1]
83 _PyEval_EvalFrame
84 _PyEval_Vector
85 _PyFunction_Vectorcall
86 PyObject_Vectorcall
87 call_function
88
89 ...
90
91 Py_RunMain
92
Every time we generate a unique copy of the trampoline (what we call "[Jit
compiled code N]" here) we record the relationship between the compiled code
and the Python function it is associated with. Every profiler requires this
information in a different format. For example, the Linux "perf" profiler
requires a file at "/tmp/perf-PID.map" (name and location are not
configurable) with the following format:
99
100 <compiled code address> <compiled code size> <name of the compiled code>
101
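As an illustration (the address, size and path below are made up), the entry
written for the trampoline of bar() from the example above could look like:

    7f1234560000 80 py::bar:/home/user/example.py
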
If this file is available when "perf" generates its reports, it will
automatically associate every trampoline with the Python function it belongs
to, allowing it to generate reports that include Python information. These
reports can then also be filtered so that *only* Python information appears.
106
Notice that for this to work, there must be a unique copy of the trampoline
per Python code object, even if the code in every trampoline is the same. To
achieve this we have an assembly template in Objects/asm_trampoline.S that is
compiled into the Python executable/shared library. This template generates a
symbol that marks the start of the assembly code and another that marks the
end of the assembly code for the trampoline. Then, every time we need a
unique trampoline for a Python code object, we copy the assembly code into an
mmap-ed area that has executable permissions and we return the start of that
area as our trampoline function.
116
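Functionally, every copy of the trampoline is equivalent to the following C
function (just a sketch; the real trampoline is the hand-written assembly
template mentioned above, which receives the evaluator as its last argument):

    PyObject *
    trampoline(PyThreadState *ts, _PyInterpreterFrame *frame,
               int throw, py_evaluator evaluator)
    {
        return evaluator(ts, frame, throw);
    }
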
Asking for a separate mmap-ed memory area per trampoline is very wasteful,
so we allocate big arenas of memory in a single mmap call, populate the
entire arena with copies of the trampoline (this allows us to invalidate the
icache for the instructions in the page only once, instead of once per
trampoline) and then return the next available chunk every time someone asks
for a new trampoline. We keep a linked list of arenas in case the current
memory arena is exhausted and another one is needed.
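
Schematically, an arena and its bump allocation look like this (each arena
holds size / code_size copies of the trampoline):

    start_addr                                         start_addr + size
    |- trampoline 0 -|- trampoline 1 -| ...... |- trampoline N-1 -|
                     ^
                     current_addr (next chunk handed out to callers)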
124
For the best results, Python should be compiled with
CFLAGS="-fno-omit-frame-pointer -mno-omit-leaf-frame-pointer", as this allows
profilers to unwind using only the frame pointer instead of relying on DWARF
debug information (note that, as trampolines are dynamically generated, there
won't be any DWARF information available for them).
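
For example, assuming a CPython build that enables this trampoline support, a
profiling session could look roughly like this (my_script.py is just a
placeholder name):

    $ perf record -F 9999 -g -o perf.data python -X perf my_script.py
    $ perf report -g -i perf.data
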
130 */
131
132 #include "Python.h"
133 #include "pycore_ceval.h"
134 #include "pycore_frame.h"
135 #include "pycore_interp.h"
136
137
138 #ifdef PY_HAVE_PERF_TRAMPOLINE
139
140 #include <fcntl.h>
141 #include <stdio.h>
142 #include <stdlib.h>
143 #include <sys/mman.h>
144 #include <sys/types.h>
145 #include <unistd.h>
146
147 #if defined(__arm__) || defined(__arm64__) || defined(__aarch64__)
148 #define PY_HAVE_INVALIDATE_ICACHE
149
150 #if defined(__clang__) || defined(__GNUC__)
151 extern void __clear_cache(void *, void*);
152 #endif
153
static void invalidate_icache(char *begin, char *end) {
155 #if defined(__clang__) || defined(__GNUC__)
156 return __clear_cache(begin, end);
157 #else
158 return;
159 #endif
160 }
161 #endif
162
/* The function pointer is passed as the last argument. The other three
 * arguments are passed in the same order as the evaluator requires. This
 * results in shorter, more efficient assembly code for the trampoline.
 */
167 typedef PyObject *(*py_evaluator)(PyThreadState *, _PyInterpreterFrame *,
168 int throwflag);
169 typedef PyObject *(*py_trampoline)(PyThreadState *, _PyInterpreterFrame *, int,
170 py_evaluator);
171
// Start and end of the assembly trampoline template
extern void *_Py_trampoline_func_start;
extern void *_Py_trampoline_func_end;
176
177 struct code_arena_st {
178 char *start_addr; // Start of the memory arena
179 char *current_addr; // Address of the current trampoline within the arena
180 size_t size; // Size of the memory arena
181 size_t size_left; // Remaining size of the memory arena
182 size_t code_size; // Size of the code of every trampoline in the arena
struct code_arena_st *prev; // Pointer to the previous arena, or NULL if this
                            // is the first arena.
185 };
186
187 typedef struct code_arena_st code_arena_t;
188 typedef struct trampoline_api_st trampoline_api_t;
189
190 #define perf_status _PyRuntime.ceval.perf.status
191 #define extra_code_index _PyRuntime.ceval.perf.extra_code_index
192 #define perf_code_arena _PyRuntime.ceval.perf.code_arena
193 #define trampoline_api _PyRuntime.ceval.perf.trampoline_api
194 #define perf_map_file _PyRuntime.ceval.perf.map_file
195
196
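// Default "write_state" callback: build a "py::<qualname>:<filename>" label
// for the given code object and register it via PyUnstable_WritePerfMapEntry.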
197 static void
198 perf_map_write_entry(void *state, const void *code_addr,
199 unsigned int code_size, PyCodeObject *co)
200 {
201 const char *entry = "";
202 if (co->co_qualname != NULL) {
203 entry = PyUnicode_AsUTF8(co->co_qualname);
204 }
205 const char *filename = "";
206 if (co->co_filename != NULL) {
207 filename = PyUnicode_AsUTF8(co->co_filename);
208 }
209 size_t perf_map_entry_size = snprintf(NULL, 0, "py::%s:%s", entry, filename) + 1;
210 char* perf_map_entry = (char*) PyMem_RawMalloc(perf_map_entry_size);
211 if (perf_map_entry == NULL) {
212 return;
213 }
214 snprintf(perf_map_entry, perf_map_entry_size, "py::%s:%s", entry, filename);
215 PyUnstable_WritePerfMapEntry(code_addr, code_size, perf_map_entry);
216 PyMem_RawFree(perf_map_entry);
217 }
218
219 _PyPerf_Callbacks _Py_perfmap_callbacks = {
220 NULL,
221 &perf_map_write_entry,
222 NULL,
223 };
224
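// Allocate a new executable arena with mmap, fill it with copies of the
// assembly trampoline template and push it onto the arena linked list.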
225 static int
226 new_code_arena(void)
227 {
228 // non-trivial programs typically need 64 to 256 kiB.
229 size_t mem_size = 4096 * 16;
230 assert(mem_size % sysconf(_SC_PAGESIZE) == 0);
231 char *memory =
232 mmap(NULL, // address
233 mem_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS,
234 -1, // fd (not used here)
235 0); // offset (not used here)
if (memory == MAP_FAILED) {
237 PyErr_SetFromErrno(PyExc_OSError);
238 _PyErr_WriteUnraisableMsg(
239 "Failed to create new mmap for perf trampoline", NULL);
240 perf_status = PERF_STATUS_FAILED;
241 return -1;
242 }
243 void *start = &_Py_trampoline_func_start;
244 void *end = &_Py_trampoline_func_end;
245 size_t code_size = end - start;
246 // TODO: Check the effect of alignment of the code chunks. Initial investigation
247 // showed that this has no effect on performance in x86-64 or aarch64 and the current
248 // version has the advantage that the unwinder in GDB can unwind across JIT-ed code.
249 //
250 // We should check the values in the future and see if there is a
251 // measurable performance improvement by rounding trampolines up to 32-bit
252 // or 64-bit alignment.
253
254 size_t n_copies = mem_size / code_size;
255 for (size_t i = 0; i < n_copies; i++) {
256 memcpy(memory + i * code_size, start, code_size * sizeof(char));
257 }
258 // Some systems may prevent us from creating executable code on the fly.
259 int res = mprotect(memory, mem_size, PROT_READ | PROT_EXEC);
260 if (res == -1) {
261 PyErr_SetFromErrno(PyExc_OSError);
262 munmap(memory, mem_size);
263 _PyErr_WriteUnraisableMsg(
264 "Failed to set mmap for perf trampoline to PROT_READ | PROT_EXEC",
265 NULL);
266 return -1;
267 }
268
269 #ifdef PY_HAVE_INVALIDATE_ICACHE
// Before the JIT can run a block of code that has been emitted, it must
// invalidate the instruction cache on some platforms, such as arm and aarch64.
272 invalidate_icache(memory, memory + mem_size);
273 #endif
274
275 code_arena_t *new_arena = PyMem_RawCalloc(1, sizeof(code_arena_t));
276 if (new_arena == NULL) {
277 PyErr_NoMemory();
278 munmap(memory, mem_size);
279 _PyErr_WriteUnraisableMsg("Failed to allocate new code arena struct",
280 NULL);
281 return -1;
282 }
283
284 new_arena->start_addr = memory;
285 new_arena->current_addr = memory;
286 new_arena->size = mem_size;
287 new_arena->size_left = mem_size;
288 new_arena->code_size = code_size;
289 new_arena->prev = perf_code_arena;
290 perf_code_arena = new_arena;
291 return 0;
292 }
293
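// Unmap and free every arena in the linked list (used at finalization and
// after fork in the child).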
294 static void
295 free_code_arenas(void)
296 {
297 code_arena_t *cur = perf_code_arena;
298 code_arena_t *prev;
perf_code_arena = NULL; // invalidate the static pointer
300 while (cur) {
301 munmap(cur->start_addr, cur->size);
302 prev = cur->prev;
303 PyMem_RawFree(cur);
304 cur = prev;
305 }
306 }
307
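// Hand out the next free chunk of the given arena as a trampoline (simple
// bump allocation; the caller guarantees that there is room left).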
308 static inline py_trampoline
309 code_arena_new_code(code_arena_t *code_arena)
310 {
311 py_trampoline trampoline = (py_trampoline)code_arena->current_addr;
312 code_arena->size_left -= code_arena->code_size;
313 code_arena->current_addr += code_arena->code_size;
314 return trampoline;
315 }
316
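// Return a fresh trampoline, allocating a new arena first if there is no
// arena yet or the current one has no room left.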
317 static inline py_trampoline
318 compile_trampoline(void)
319 {
320 if ((perf_code_arena == NULL) ||
321 (perf_code_arena->size_left <= perf_code_arena->code_size)) {
322 if (new_code_arena() < 0) {
323 return NULL;
324 }
325 }
326 assert(perf_code_arena->size_left <= perf_code_arena->size);
327 return code_arena_new_code(perf_code_arena);
328 }
329
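// Frame evaluation hook installed by _PyPerfTrampoline_Init(): look up (or
// lazily compile and register) the trampoline for this code object and enter
// _PyEval_EvalFrameDefault through it so the call shows up in native stacks.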
330 static PyObject *
331 py_trampoline_evaluator(PyThreadState *ts, _PyInterpreterFrame *frame,
332 int throw)
333 {
334 if (perf_status == PERF_STATUS_FAILED ||
335 perf_status == PERF_STATUS_NO_INIT) {
336 goto default_eval;
337 }
338 PyCodeObject *co = frame->f_code;
339 py_trampoline f = NULL;
340 assert(extra_code_index != -1);
341 int ret = _PyCode_GetExtra((PyObject *)co, extra_code_index, (void **)&f);
342 if (ret != 0 || f == NULL) {
343 // This is the first time we see this code object so we need
344 // to compile a trampoline for it.
345 py_trampoline new_trampoline = compile_trampoline();
346 if (new_trampoline == NULL) {
347 goto default_eval;
348 }
349 trampoline_api.write_state(trampoline_api.state, new_trampoline,
350 perf_code_arena->code_size, co);
351 _PyCode_SetExtra((PyObject *)co, extra_code_index,
352 (void *)new_trampoline);
353 f = new_trampoline;
354 }
355 assert(f != NULL);
356 return f(ts, frame, throw, _PyEval_EvalFrameDefault);
357 default_eval:
358 // Something failed, fall back to the default evaluator.
359 return _PyEval_EvalFrameDefault(ts, frame, throw);
360 }
361 #endif // PY_HAVE_PERF_TRAMPOLINE
362
363 int
364 _PyIsPerfTrampolineActive(void)
365 {
366 #ifdef PY_HAVE_PERF_TRAMPOLINE
367 PyThreadState *tstate = _PyThreadState_GET();
368 return tstate->interp->eval_frame == py_trampoline_evaluator;
369 #endif
370 return 0;
371 }
372
373 void
374 _PyPerfTrampoline_GetCallbacks(_PyPerf_Callbacks *callbacks)
375 {
376 if (callbacks == NULL) {
377 return;
378 }
379 #ifdef PY_HAVE_PERF_TRAMPOLINE
380 callbacks->init_state = trampoline_api.init_state;
381 callbacks->write_state = trampoline_api.write_state;
382 callbacks->free_state = trampoline_api.free_state;
383 #endif
384 return;
385 }
386
387 int
388 _PyPerfTrampoline_SetCallbacks(_PyPerf_Callbacks *callbacks)
389 {
390 if (callbacks == NULL) {
391 return -1;
392 }
393 #ifdef PY_HAVE_PERF_TRAMPOLINE
394 if (trampoline_api.state) {
395 _PyPerfTrampoline_Fini();
396 }
397 trampoline_api.init_state = callbacks->init_state;
398 trampoline_api.write_state = callbacks->write_state;
399 trampoline_api.free_state = callbacks->free_state;
400 trampoline_api.state = NULL;
401 perf_status = PERF_STATUS_OK;
402 #endif
403 return 0;
404 }
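
/* A sketch of how an embedder could install its own callbacks instead of the
 * default perf map writer (my_init_state, my_write_entry and my_free_state
 * are hypothetical functions matching the _PyPerf_Callbacks slots):
 *
 *     _PyPerf_Callbacks my_callbacks = {
 *         &my_init_state, &my_write_entry, &my_free_state,
 *     };
 *     _PyPerfTrampoline_SetCallbacks(&my_callbacks);
 *     _PyPerfTrampoline_Init(1);
 */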
405
406 int
407 _PyPerfTrampoline_Init(int activate)
408 {
409 #ifdef PY_HAVE_PERF_TRAMPOLINE
410 PyThreadState *tstate = _PyThreadState_GET();
411 if (tstate->interp->eval_frame &&
412 tstate->interp->eval_frame != py_trampoline_evaluator) {
413 PyErr_SetString(PyExc_RuntimeError,
414 "Trampoline cannot be initialized as a custom eval "
415 "frame is already present");
416 return -1;
417 }
418 if (!activate) {
419 tstate->interp->eval_frame = NULL;
420 }
421 else {
422 tstate->interp->eval_frame = py_trampoline_evaluator;
423 if (new_code_arena() < 0) {
424 return -1;
425 }
426 extra_code_index = _PyEval_RequestCodeExtraIndex(NULL);
427 if (extra_code_index == -1) {
428 return -1;
429 }
430 perf_status = PERF_STATUS_OK;
431 }
432 #endif
433 return 0;
434 }
435
436 int
437 _PyPerfTrampoline_Fini(void)
438 {
439 #ifdef PY_HAVE_PERF_TRAMPOLINE
440 PyThreadState *tstate = _PyThreadState_GET();
441 if (tstate->interp->eval_frame == py_trampoline_evaluator) {
442 tstate->interp->eval_frame = NULL;
443 }
444 free_code_arenas();
445 extra_code_index = -1;
446 #endif
447 return 0;
448 }
449
450 PyStatus
451 _PyPerfTrampoline_AfterFork_Child(void)
452 {
453 #ifdef PY_HAVE_PERF_TRAMPOLINE
// Restart the trampoline and the perf map file in the child process.
455 int was_active = _PyIsPerfTrampolineActive();
456 _PyPerfTrampoline_Fini();
457 PyUnstable_PerfMapState_Fini();
458 if (was_active) {
459 _PyPerfTrampoline_Init(1);
460 }
461 #endif
462 return PyStatus_Ok();
463 }