1 /* A fuzz test for CPython.
2
3 The only exposed function is LLVMFuzzerTestOneInput, which is called by
4 fuzzers and by the _fuzz module for smoke tests.
5
6 To build exactly one fuzz test, as when running in oss-fuzz etc.,
7 build with -D _Py_FUZZ_ONE and -D _Py_FUZZ_<test_name>. e.g. to build
8 LLVMFuzzerTestOneInput to only run "fuzz_builtin_float", build this file with
9 -D _Py_FUZZ_ONE -D _Py_FUZZ_fuzz_builtin_float.
10
11 See the source code for LLVMFuzzerTestOneInput for details. */
12
#include <Python.h>
#include <inttypes.h>
#include <stdlib.h>
#include <string.h>
16
17 /* Fuzz PyFloat_FromString as a proxy for float(str). */
18 static int fuzz_builtin_float(const char* data, size_t size) {
19 PyObject* s = PyBytes_FromStringAndSize(data, size);
20 if (s == NULL) return 0;
21 PyObject* f = PyFloat_FromString(s);
22 if (PyErr_Occurred() && PyErr_ExceptionMatches(PyExc_ValueError)) {
23 PyErr_Clear();
24 }
25
26 Py_XDECREF(f);
27 Py_DECREF(s);
28 return 0;
29 }
30
31 #define MAX_INT_TEST_SIZE 0x10000
32
33 /* Fuzz PyLong_FromUnicodeObject as a proxy for int(str). */
34 static int fuzz_builtin_int(const char* data, size_t size) {
35 /* Ignore test cases with very long ints to avoid timeouts
36 int("9" * 1000000) is not a very interesting test caase */
37 if (size > MAX_INT_TEST_SIZE) {
38 return 0;
39 }
40 /* Pick a random valid base. (When the fuzzed function takes extra
41 parameters, it's somewhat normal to hash the input to generate those
42 parameters. We want to exercise all code paths, so we do so here.) */
43 int base = _Py_HashBytes(data, size) % 37;
44 if (base == 1) {
45 // 1 is the only number between 0 and 36 that is not a valid base.
46 base = 0;
47 }
48 if (base == -1) {
49 return 0; // An error occurred, bail early.
50 }
51 if (base < 0) {
52 base = -base;
53 }
54
55 PyObject* s = PyUnicode_FromStringAndSize(data, size);
56 if (s == NULL) {
57 if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
58 PyErr_Clear();
59 }
60 return 0;
61 }
62 PyObject* l = PyLong_FromUnicodeObject(s, base);
63 if (l == NULL && PyErr_ExceptionMatches(PyExc_ValueError)) {
64 PyErr_Clear();
65 }
66 PyErr_Clear();
67 Py_XDECREF(l);
68 Py_DECREF(s);
69 return 0;
70 }
71
72 /* Fuzz PyUnicode_FromStringAndSize as a proxy for unicode(str). */
73 static int fuzz_builtin_unicode(const char* data, size_t size) {
74 PyObject* s = PyUnicode_FromStringAndSize(data, size);
75 if (s == NULL && PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
76 PyErr_Clear();
77 }
78 Py_XDECREF(s);
79 return 0;
80 }
81
82
83 PyObject* struct_unpack_method = NULL;
84 PyObject* struct_error = NULL;
85 /* Called by LLVMFuzzerTestOneInput for initialization */
86 static int init_struct_unpack(void) {
87 /* Import struct.unpack */
88 PyObject* struct_module = PyImport_ImportModule("struct");
89 if (struct_module == NULL) {
90 return 0;
91 }
92 struct_error = PyObject_GetAttrString(struct_module, "error");
93 if (struct_error == NULL) {
94 return 0;
95 }
96 struct_unpack_method = PyObject_GetAttrString(struct_module, "unpack");
97 return struct_unpack_method != NULL;
98 }
99 /* Fuzz struct.unpack(x, y) */
100 static int fuzz_struct_unpack(const char* data, size_t size) {
101 /* Everything up to the first null byte is considered the
102 format. Everything after is the buffer */
103 const char* first_null = memchr(data, '\0', size);
104 if (first_null == NULL) {
105 return 0;
106 }
107
108 size_t format_length = first_null - data;
109 size_t buffer_length = size - format_length - 1;
110
111 PyObject* pattern = PyBytes_FromStringAndSize(data, format_length);
112 if (pattern == NULL) {
113 return 0;
114 }
115 PyObject* buffer = PyBytes_FromStringAndSize(first_null + 1, buffer_length);
116 if (buffer == NULL) {
117 Py_DECREF(pattern);
118 return 0;
119 }
120
121 PyObject* unpacked = PyObject_CallFunctionObjArgs(
122 struct_unpack_method, pattern, buffer, NULL);
123 /* Ignore any overflow errors, these are easily triggered accidentally */
124 if (unpacked == NULL && PyErr_ExceptionMatches(PyExc_OverflowError)) {
125 PyErr_Clear();
126 }
127 /* The pascal format string will throw a negative size when passing 0
128 like: struct.unpack('0p', b'') */
129 if (unpacked == NULL && PyErr_ExceptionMatches(PyExc_SystemError)) {
130 PyErr_Clear();
131 }
132 /* Ignore any struct.error exceptions, these can be caused by invalid
133 formats or incomplete buffers both of which are common. */
134 if (unpacked == NULL && PyErr_ExceptionMatches(struct_error)) {
135 PyErr_Clear();
136 }
137
138 Py_XDECREF(unpacked);
139 Py_DECREF(pattern);
140 Py_DECREF(buffer);
141 return 0;
142 }
143
144
145 #define MAX_JSON_TEST_SIZE 0x10000
146
147 PyObject* json_loads_method = NULL;
148 /* Called by LLVMFuzzerTestOneInput for initialization */
149 static int init_json_loads(void) {
150 /* Import json.loads */
151 PyObject* json_module = PyImport_ImportModule("json");
152 if (json_module == NULL) {
153 return 0;
154 }
155 json_loads_method = PyObject_GetAttrString(json_module, "loads");
156 return json_loads_method != NULL;
157 }
158 /* Fuzz json.loads(x) */
159 static int fuzz_json_loads(const char* data, size_t size) {
160 /* Since python supports arbitrarily large ints in JSON,
161 long inputs can lead to timeouts on boring inputs like
162 `json.loads("9" * 100000)` */
163 if (size > MAX_JSON_TEST_SIZE) {
164 return 0;
165 }
166 PyObject* input_bytes = PyBytes_FromStringAndSize(data, size);
167 if (input_bytes == NULL) {
168 return 0;
169 }
170 PyObject* parsed = PyObject_CallOneArg(json_loads_method, input_bytes);
171 if (parsed == NULL) {
172 /* Ignore ValueError as the fuzzer will more than likely
173 generate some invalid json and values */
174 if (PyErr_ExceptionMatches(PyExc_ValueError) ||
175 /* Ignore RecursionError as the fuzzer generates long sequences of
176 arrays such as `[[[...` */
177 PyErr_ExceptionMatches(PyExc_RecursionError) ||
178 /* Ignore unicode errors, invalid byte sequences are common */
179 PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)
180 ) {
181 PyErr_Clear();
182 }
183 }
184 Py_DECREF(input_bytes);
185 Py_XDECREF(parsed);
186 return 0;
187 }
188
189 #define MAX_RE_TEST_SIZE 0x10000
190
191 PyObject* re_compile_method = NULL;
192 PyObject* re_error_exception = NULL;
193 int RE_FLAG_DEBUG = 0;
194 /* Called by LLVMFuzzerTestOneInput for initialization */
195 static int init_sre_compile(void) {
196 /* Import sre_compile.compile and sre.error */
197 PyObject* re_module = PyImport_ImportModule("re");
198 if (re_module == NULL) {
199 return 0;
200 }
201 re_compile_method = PyObject_GetAttrString(re_module, "compile");
202 if (re_compile_method == NULL) {
203 return 0;
204 }
205
206 re_error_exception = PyObject_GetAttrString(re_module, "error");
207 if (re_error_exception == NULL) {
208 return 0;
209 }
210 PyObject* debug_flag = PyObject_GetAttrString(re_module, "DEBUG");
211 if (debug_flag == NULL) {
212 return 0;
213 }
214 RE_FLAG_DEBUG = PyLong_AsLong(debug_flag);
215 return 1;
216 }
217 /* Fuzz re.compile(x) */
218 static int fuzz_sre_compile(const char* data, size_t size) {
219 /* Ignore really long regex patterns that will timeout the fuzzer */
220 if (size > MAX_RE_TEST_SIZE) {
221 return 0;
222 }
223 /* We treat the first 2 bytes of the input as a number for the flags */
224 if (size < 2) {
225 return 0;
226 }
227 uint16_t flags = ((uint16_t*) data)[0];
228 /* We remove the SRE_FLAG_DEBUG if present. This is because it
229 prints to stdout which greatly decreases fuzzing speed */
230 flags &= ~RE_FLAG_DEBUG;
231
232 /* Pull the pattern from the remaining bytes */
233 PyObject* pattern_bytes = PyBytes_FromStringAndSize(data + 2, size - 2);
234 if (pattern_bytes == NULL) {
235 return 0;
236 }
237 PyObject* flags_obj = PyLong_FromUnsignedLong(flags);
238 if (flags_obj == NULL) {
239 Py_DECREF(pattern_bytes);
240 return 0;
241 }
242
243 /* compiled = re.compile(data[2:], data[0:2] */
244 PyObject* compiled = PyObject_CallFunctionObjArgs(
245 re_compile_method, pattern_bytes, flags_obj, NULL);
246 /* Ignore ValueError as the fuzzer will more than likely
247 generate some invalid combination of flags */
248 if (compiled == NULL && PyErr_ExceptionMatches(PyExc_ValueError)) {
249 PyErr_Clear();
250 }
251 /* Ignore some common errors thrown by sre_parse:
252 Overflow, Assertion, Recursion and Index */
253 if (compiled == NULL && (PyErr_ExceptionMatches(PyExc_OverflowError) ||
254 PyErr_ExceptionMatches(PyExc_AssertionError) ||
255 PyErr_ExceptionMatches(PyExc_RecursionError) ||
256 PyErr_ExceptionMatches(PyExc_IndexError))
257 ) {
258 PyErr_Clear();
259 }
260 /* Ignore re.error */
261 if (compiled == NULL && PyErr_ExceptionMatches(re_error_exception)) {
262 PyErr_Clear();
263 }
264
265 Py_DECREF(pattern_bytes);
266 Py_DECREF(flags_obj);
267 Py_XDECREF(compiled);
268 return 0;
269 }
270
271 /* Some random patterns used to test re.match.
272 Be careful not to add catostraphically slow regexes here, we want to
273 exercise the matching code without causing timeouts.*/
274 static const char* regex_patterns[] = {
275 ".", "^", "abc", "abc|def", "^xxx$", "\\b", "()", "[a-zA-Z0-9]",
276 "abc+", "[^A-Z]", "[x]", "(?=)", "a{z}", "a+b", "a*?", "a??", "a+?",
277 "{}", "a{,}", "{", "}", "^\\(*\\d{3}\\)*( |-)*\\d{3}( |-)*\\d{4}$",
278 "(?:a*)*", "a{1,2}?"
279 };
280 const size_t NUM_PATTERNS = sizeof(regex_patterns) / sizeof(regex_patterns[0]);
281 PyObject** compiled_patterns = NULL;
282 /* Called by LLVMFuzzerTestOneInput for initialization */
283 static int init_sre_match(void) {
284 PyObject* re_module = PyImport_ImportModule("re");
285 if (re_module == NULL) {
286 return 0;
287 }
288 compiled_patterns = (PyObject**) PyMem_RawMalloc(
289 sizeof(PyObject*) * NUM_PATTERNS);
290 if (compiled_patterns == NULL) {
291 PyErr_NoMemory();
292 return 0;
293 }
294
295 /* Precompile all the regex patterns on the first run for faster fuzzing */
296 for (size_t i = 0; i < NUM_PATTERNS; i++) {
297 PyObject* compiled = PyObject_CallMethod(
298 re_module, "compile", "y", regex_patterns[i]);
299 /* Bail if any of the patterns fail to compile */
300 if (compiled == NULL) {
301 return 0;
302 }
303 compiled_patterns[i] = compiled;
304 }
305 return 1;
306 }
307 /* Fuzz re.match(x) */
308 static int fuzz_sre_match(const char* data, size_t size) {
309 if (size < 1 || size > MAX_RE_TEST_SIZE) {
310 return 0;
311 }
312 /* Use the first byte as a uint8_t specifying the index of the
313 regex to use */
314 unsigned char idx = (unsigned char) data[0];
315 idx = idx % NUM_PATTERNS;
316
317 /* Pull the string to match from the remaining bytes */
318 PyObject* to_match = PyBytes_FromStringAndSize(data + 1, size - 1);
319 if (to_match == NULL) {
320 return 0;
321 }
322
323 PyObject* pattern = compiled_patterns[idx];
324 PyObject* match_callable = PyObject_GetAttrString(pattern, "match");
325
326 PyObject* matches = PyObject_CallOneArg(match_callable, to_match);
327
328 Py_XDECREF(matches);
329 Py_DECREF(match_callable);
330 Py_DECREF(to_match);
331 return 0;
332 }
333
334 #define MAX_CSV_TEST_SIZE 0x10000
335 PyObject* csv_module = NULL;
336 PyObject* csv_error = NULL;
337 /* Called by LLVMFuzzerTestOneInput for initialization */
338 static int init_csv_reader(void) {
339 /* Import csv and csv.Error */
340 csv_module = PyImport_ImportModule("csv");
341 if (csv_module == NULL) {
342 return 0;
343 }
344 csv_error = PyObject_GetAttrString(csv_module, "Error");
345 return csv_error != NULL;
346 }
347 /* Fuzz csv.reader([x]) */
348 static int fuzz_csv_reader(const char* data, size_t size) {
349 if (size < 1 || size > MAX_CSV_TEST_SIZE) {
350 return 0;
351 }
352 /* Ignore non null-terminated strings since _csv can't handle
353 embedded nulls */
354 if (memchr(data, '\0', size) == NULL) {
355 return 0;
356 }
357
358 PyObject* s = PyUnicode_FromString(data);
359 /* Ignore exceptions until we have a valid string */
360 if (s == NULL) {
361 PyErr_Clear();
362 return 0;
363 }
364
365 /* Split on \n so we can test multiple lines */
366 PyObject* lines = PyObject_CallMethod(s, "split", "s", "\n");
367 if (lines == NULL) {
368 Py_DECREF(s);
369 return 0;
370 }
371
372 PyObject* reader = PyObject_CallMethod(csv_module, "reader", "N", lines);
373 if (reader) {
374 /* Consume all of the reader as an iterator */
375 PyObject* parsed_line;
376 while ((parsed_line = PyIter_Next(reader))) {
377 Py_DECREF(parsed_line);
378 }
379 }
380
381 /* Ignore csv.Error because we're probably going to generate
382 some bad files (embedded new-lines, unterminated quotes etc) */
383 if (PyErr_ExceptionMatches(csv_error)) {
384 PyErr_Clear();
385 }
386
387 Py_XDECREF(reader);
388 Py_DECREF(s);
389 return 0;
390 }
391
392 #define MAX_AST_LITERAL_EVAL_TEST_SIZE 0x10000
393 PyObject* ast_literal_eval_method = NULL;
394 /* Called by LLVMFuzzerTestOneInput for initialization */
395 static int init_ast_literal_eval(void) {
396 PyObject* ast_module = PyImport_ImportModule("ast");
397 if (ast_module == NULL) {
398 return 0;
399 }
400 ast_literal_eval_method = PyObject_GetAttrString(ast_module, "literal_eval");
401 return ast_literal_eval_method != NULL;
402 }
403 /* Fuzz ast.literal_eval(x) */
404 static int fuzz_ast_literal_eval(const char* data, size_t size) {
405 if (size > MAX_AST_LITERAL_EVAL_TEST_SIZE) {
406 return 0;
407 }
408 /* Ignore non null-terminated strings since ast can't handle
409 embedded nulls */
410 if (memchr(data, '\0', size) == NULL) {
411 return 0;
412 }
413
414 PyObject* s = PyUnicode_FromString(data);
415 /* Ignore exceptions until we have a valid string */
416 if (s == NULL) {
417 PyErr_Clear();
418 return 0;
419 }
420
421 PyObject* literal = PyObject_CallOneArg(ast_literal_eval_method, s);
422 /* Ignore some common errors thrown by ast.literal_eval */
423 if (literal == NULL && (PyErr_ExceptionMatches(PyExc_ValueError) ||
424 PyErr_ExceptionMatches(PyExc_TypeError) ||
425 PyErr_ExceptionMatches(PyExc_SyntaxError) ||
426 PyErr_ExceptionMatches(PyExc_MemoryError) ||
427 PyErr_ExceptionMatches(PyExc_RecursionError))
428 ) {
429 PyErr_Clear();
430 }
431
432 Py_XDECREF(literal);
433 Py_DECREF(s);
434 return 0;
435 }
436
/* Run fuzzer and abort on failure.

   Calls `fuzzer` on the input.  If a Python exception is still pending
   afterwards, it is printed and the process aborts; otherwise the
   fuzzer's return value is passed through. */
static int _run_fuzz(const uint8_t *data, size_t size, int(*fuzzer)(const char* , size_t)) {
    int result = fuzzer((const char*) data, size);
    if (PyErr_Occurred() == NULL) {
        /* Someday the return value might mean something, propagate it. */
        return result;
    }
    /* Fuzz tests should handle expected errors for themselves.
       This is last-ditch check in case they didn't. */
    PyErr_Print();
    abort();
}
449
/* CPython generates a lot of leak warnings for whatever reason.
   Unconditionally returns 1. */
int __lsan_is_turned_off(void)
{
    return 1;
}
452
453
454 int LLVMFuzzerInitialize(int *argc, char ***argv) {
455 PyConfig config;
456 PyConfig_InitPythonConfig(&config);
457 config.install_signal_handlers = 0;
458 PyStatus status;
459 status = PyConfig_SetBytesString(&config, &config.program_name, *argv[0]);
460 if (PyStatus_Exception(status)) {
461 goto fail;
462 }
463
464 status = Py_InitializeFromConfig(&config);
465 if (PyStatus_Exception(status)) {
466 goto fail;
467 }
468 PyConfig_Clear(&config);
469
470 return 0;
471
472 fail:
473 PyConfig_Clear(&config);
474 Py_ExitStatusException(status);
475 }
476
477 /* Fuzz test interface.
478 This returns the bitwise or of all fuzz test's return values.
479
480 All fuzz tests must return 0, as all nonzero return codes are reserved for
481 future use -- we propagate the return values for that future case.
482 (And we bitwise or when running multiple tests to verify that normally we
483 only return 0.) */
484 int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
485 assert(Py_IsInitialized());
486
487 int rv = 0;
488
489 #if !defined(_Py_FUZZ_ONE) || defined(_Py_FUZZ_fuzz_builtin_float)
490 rv |= _run_fuzz(data, size, fuzz_builtin_float);
491 #endif
492 #if !defined(_Py_FUZZ_ONE) || defined(_Py_FUZZ_fuzz_builtin_int)
493 rv |= _run_fuzz(data, size, fuzz_builtin_int);
494 #endif
495 #if !defined(_Py_FUZZ_ONE) || defined(_Py_FUZZ_fuzz_builtin_unicode)
496 rv |= _run_fuzz(data, size, fuzz_builtin_unicode);
497 #endif
498 #if !defined(_Py_FUZZ_ONE) || defined(_Py_FUZZ_fuzz_struct_unpack)
499 static int STRUCT_UNPACK_INITIALIZED = 0;
500 if (!STRUCT_UNPACK_INITIALIZED && !init_struct_unpack()) {
501 PyErr_Print();
502 abort();
503 } else {
504 STRUCT_UNPACK_INITIALIZED = 1;
505 }
506 rv |= _run_fuzz(data, size, fuzz_struct_unpack);
507 #endif
508 #if !defined(_Py_FUZZ_ONE) || defined(_Py_FUZZ_fuzz_json_loads)
509 static int JSON_LOADS_INITIALIZED = 0;
510 if (!JSON_LOADS_INITIALIZED && !init_json_loads()) {
511 PyErr_Print();
512 abort();
513 } else {
514 JSON_LOADS_INITIALIZED = 1;
515 }
516
517 rv |= _run_fuzz(data, size, fuzz_json_loads);
518 #endif
519 #if !defined(_Py_FUZZ_ONE) || defined(_Py_FUZZ_fuzz_sre_compile)
520 static int SRE_COMPILE_INITIALIZED = 0;
521 if (!SRE_COMPILE_INITIALIZED && !init_sre_compile()) {
522 PyErr_Print();
523 abort();
524 } else {
525 SRE_COMPILE_INITIALIZED = 1;
526 }
527
528 rv |= _run_fuzz(data, size, fuzz_sre_compile);
529 #endif
530 #if !defined(_Py_FUZZ_ONE) || defined(_Py_FUZZ_fuzz_sre_match)
531 static int SRE_MATCH_INITIALIZED = 0;
532 if (!SRE_MATCH_INITIALIZED && !init_sre_match()) {
533 PyErr_Print();
534 abort();
535 } else {
536 SRE_MATCH_INITIALIZED = 1;
537 }
538
539 rv |= _run_fuzz(data, size, fuzz_sre_match);
540 #endif
541 #if !defined(_Py_FUZZ_ONE) || defined(_Py_FUZZ_fuzz_csv_reader)
542 static int CSV_READER_INITIALIZED = 0;
543 if (!CSV_READER_INITIALIZED && !init_csv_reader()) {
544 PyErr_Print();
545 abort();
546 } else {
547 CSV_READER_INITIALIZED = 1;
548 }
549
550 rv |= _run_fuzz(data, size, fuzz_csv_reader);
551 #endif
552 #if !defined(_Py_FUZZ_ONE) || defined(_Py_FUZZ_fuzz_ast_literal_eval)
553 static int AST_LITERAL_EVAL_INITIALIZED = 0;
554 if (!AST_LITERAL_EVAL_INITIALIZED && !init_ast_literal_eval()) {
555 PyErr_Print();
556 abort();
557 } else {
558 AST_LITERAL_EVAL_INITIALIZED = 1;
559 }
560
561 rv |= _run_fuzz(data, size, fuzz_ast_literal_eval);
562 #endif
563 return rv;
564 }