#include <Python.h>
#include "pycore_ast.h"           // _PyAST_Validate(),
#include "pycore_pystate.h"       // _PyThreadState_GET()
#include <errcode.h>

#include "tokenizer.h"
#include "pegen.h"

// Internal parser functions

asdl_stmt_seq*
_PyPegen_interactive_exit(Parser *p)
{
    if (p->errcode) {
        *(p->errcode) = E_EOF;
    }
    return NULL;
}

Py_ssize_t
_PyPegen_byte_offset_to_character_offset(PyObject *line, Py_ssize_t col_offset)
{
    const char *str = PyUnicode_AsUTF8(line);
    if (!str) {
        return -1;
    }
    Py_ssize_t len = strlen(str);
    if (col_offset > len + 1) {
        col_offset = len + 1;
    }
    assert(col_offset >= 0);
    PyObject *text = PyUnicode_DecodeUTF8(str, col_offset, "replace");
    if (!text) {
        return -1;
    }
    Py_ssize_t size = PyUnicode_GET_LENGTH(text);
    Py_DECREF(text);
    return size;
}
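
/*
 * Worked example (illustrative, not called from here): for the UTF-8
 * line "héllo = 1", where 'é' occupies two bytes (0xC3 0xA9), a byte
 * col_offset of 3 (just past the 'é') maps to a character offset of 2,
 * because the first three bytes decode to only two code points.
 */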

// Calculate the extra amount of horizontal space that the given source
// code segment would take up if it were displayed on a fixed-width
// output device. Supports wide unicode characters and emojis.
Py_ssize_t
_PyPegen_calculate_display_width(PyObject *line, Py_ssize_t character_offset)
{
    PyObject *segment = PyUnicode_Substring(line, 0, character_offset);
    if (!segment) {
        return -1;
    }

    // Fast track for ASCII strings
    if (PyUnicode_IS_ASCII(segment)) {
        Py_DECREF(segment);
        return character_offset;
    }

    PyObject *width_fn = _PyImport_GetModuleAttrString("unicodedata", "east_asian_width");
    if (!width_fn) {
        Py_DECREF(segment);
        return -1;
    }

    Py_ssize_t width = 0;
    Py_ssize_t len = PyUnicode_GET_LENGTH(segment);
    for (Py_ssize_t i = 0; i < len; i++) {
        PyObject *chr = PyUnicode_Substring(segment, i, i + 1);
        if (!chr) {
            Py_DECREF(segment);
            Py_DECREF(width_fn);
            return -1;
        }

        PyObject *width_specifier = PyObject_CallOneArg(width_fn, chr);
        Py_DECREF(chr);
        if (!width_specifier) {
            Py_DECREF(segment);
            Py_DECREF(width_fn);
            return -1;
        }

        if (_PyUnicode_EqualToASCIIString(width_specifier, "W") ||
            _PyUnicode_EqualToASCIIString(width_specifier, "F")) {
            width += 2;
        }
        else {
            width += 1;
        }
        Py_DECREF(width_specifier);
    }

    Py_DECREF(segment);
    Py_DECREF(width_fn);
    return width;
}
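
/*
 * Worked example (illustrative): unicodedata.east_asian_width() reports
 * "W" for U+4F60 ('你'), so for the line "你 = 1" a character_offset of 1
 * yields a display width of 2, whereas the ASCII line "x = 1" takes the
 * fast path above and returns the offset unchanged.
 */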

// Here, mark is the start of the node, while p->mark is the end.
// If node==NULL, they should be the same.
int
_PyPegen_insert_memo(Parser *p, int mark, int type, void *node)
{
    // Insert in front
    Memo *m = _PyArena_Malloc(p->arena, sizeof(Memo));
    if (m == NULL) {
        return -1;
    }
    m->type = type;
    m->node = node;
    m->mark = p->mark;
    m->next = p->tokens[mark]->memo;
    p->tokens[mark]->memo = m;
    return 0;
}

// Like _PyPegen_insert_memo(), but updates an existing node if found.
int
_PyPegen_update_memo(Parser *p, int mark, int type, void *node)
{
    for (Memo *m = p->tokens[mark]->memo; m != NULL; m = m->next) {
        if (m->type == type) {
            // Update existing node.
            m->node = node;
            m->mark = p->mark;
            return 0;
        }
    }
    // Insert new node.
    return _PyPegen_insert_memo(p, mark, type, node);
}
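
/*
 * Sketch of how the generated parser typically drives the memo cache
 * (assumed shape; names such as `rule_type` are placeholders and the
 * generator's exact output may differ):
 *
 *     int _mark = p->mark;
 *     void *_res = NULL;
 *     if (_PyPegen_is_memoized(p, rule_type, &_res)) {
 *         return _res;   // hit: p->mark was advanced to the cached end
 *     }
 *     ... try the rule's alternatives, filling _res ...
 *     _PyPegen_insert_memo(p, _mark, rule_type, _res);  // cache result
 */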

static int
init_normalization(Parser *p)
{
    if (p->normalize) {
        return 1;
    }
    PyObject *m = PyImport_ImportModule("unicodedata");
    if (!m)
    {
        return 0;
    }
    p->normalize = PyObject_GetAttrString(m, "normalize");
    Py_DECREF(m);
    if (!p->normalize)
    {
        return 0;
    }
    return 1;
}

static int
growable_comment_array_init(growable_comment_array *arr, size_t initial_size) {
    assert(initial_size > 0);
    arr->items = PyMem_Malloc(initial_size * sizeof(*arr->items));
    arr->size = initial_size;
    arr->num_items = 0;

    return arr->items != NULL;
}

static int
growable_comment_array_add(growable_comment_array *arr, int lineno, char *comment) {
    if (arr->num_items >= arr->size) {
        size_t new_size = arr->size * 2;
        void *new_items_array = PyMem_Realloc(arr->items, new_size * sizeof(*arr->items));
        if (!new_items_array) {
            return 0;
        }
        arr->items = new_items_array;
        arr->size = new_size;
    }

    arr->items[arr->num_items].lineno = lineno;
    arr->items[arr->num_items].comment = comment;  // Take ownership
    arr->num_items++;
    return 1;
}

static void
growable_comment_array_deallocate(growable_comment_array *arr) {
    for (unsigned i = 0; i < arr->num_items; i++) {
        PyMem_Free(arr->items[i].comment);
    }
    PyMem_Free(arr->items);
}

static int
_get_keyword_or_name_type(Parser *p, const char *name, int name_len)
{
    assert(name_len > 0);
    if (name_len >= p->n_keyword_lists ||
        p->keywords[name_len] == NULL ||
        p->keywords[name_len]->type == -1) {
        return NAME;
    }
    for (KeywordToken *k = p->keywords[name_len]; k != NULL && k->type != -1; k++) {
        if (strncmp(k->str, name, name_len) == 0) {
            return k->type;
        }
    }
    return NAME;
}
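
/*
 * Example (illustrative): hard keywords are bucketed by length, so for
 * the token text "if" only p->keywords[2] is scanned; that bucket pairs
 * each two-character keyword ("if", "in", "is", "or", "as") with its
 * token type. Text whose length has no bucket falls through to NAME.
 */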

static int
initialize_token(Parser *p, Token *token, const char *start, const char *end, int token_type) {
    assert(token != NULL);

    token->type = (token_type == NAME) ? _get_keyword_or_name_type(p, start, (int)(end - start)) : token_type;
    token->bytes = PyBytes_FromStringAndSize(start, end - start);
    if (token->bytes == NULL) {
        return -1;
    }

    if (_PyArena_AddPyObject(p->arena, token->bytes) < 0) {
        Py_DECREF(token->bytes);
        return -1;
    }

    token->level = p->tok->level;

    const char *line_start = token_type == STRING ? p->tok->multi_line_start : p->tok->line_start;
    int lineno = token_type == STRING ? p->tok->first_lineno : p->tok->lineno;
    int end_lineno = p->tok->lineno;

    int col_offset = (start != NULL && start >= line_start) ? (int)(start - line_start) : -1;
    int end_col_offset = (end != NULL && end >= p->tok->line_start) ? (int)(end - p->tok->line_start) : -1;

    token->lineno = lineno;
    token->col_offset = p->tok->lineno == p->starting_lineno ? p->starting_col_offset + col_offset : col_offset;
    token->end_lineno = end_lineno;
    token->end_col_offset = p->tok->lineno == p->starting_lineno ? p->starting_col_offset + end_col_offset : end_col_offset;

    p->fill += 1;

    if (token_type == ERRORTOKEN && p->tok->done == E_DECODE) {
        return _Pypegen_raise_decode_error(p);
    }

    return (token_type == ERRORTOKEN ? _Pypegen_tokenizer_error(p) : 0);
}

static int
_resize_tokens_array(Parser *p) {
    int newsize = p->size * 2;
    Token **new_tokens = PyMem_Realloc(p->tokens, newsize * sizeof(Token *));
    if (new_tokens == NULL) {
        PyErr_NoMemory();
        return -1;
    }
    p->tokens = new_tokens;

    for (int i = p->size; i < newsize; i++) {
        p->tokens[i] = PyMem_Calloc(1, sizeof(Token));
        if (p->tokens[i] == NULL) {
            p->size = i;  // Needed, in order to clean up correctly after the parser fails
            PyErr_NoMemory();
            return -1;
        }
    }
    p->size = newsize;
    return 0;
}

int
_PyPegen_fill_token(Parser *p)
{
    const char *start;
    const char *end;
    int type = _PyTokenizer_Get(p->tok, &start, &end);

    // Record and skip '# type: ignore' comments
    while (type == TYPE_IGNORE) {
        Py_ssize_t len = end - start;
        char *tag = PyMem_Malloc(len + 1);
        if (tag == NULL) {
            PyErr_NoMemory();
            return -1;
        }
        strncpy(tag, start, len);
        tag[len] = '\0';
        // On success, ownership of tag passes to the growable array
        if (!growable_comment_array_add(&p->type_ignore_comments, p->tok->lineno, tag)) {
            PyMem_Free(tag);
            PyErr_NoMemory();
            return -1;
        }
        type = _PyTokenizer_Get(p->tok, &start, &end);
    }

    // If we have reached the end and we are in single input mode, we need to
    // insert a newline and reset the parsing
    if (p->start_rule == Py_single_input && type == ENDMARKER && p->parsing_started) {
        type = NEWLINE; /* Add an extra newline */
        p->parsing_started = 0;

        if (p->tok->indent && !(p->flags & PyPARSE_DONT_IMPLY_DEDENT)) {
            p->tok->pendin = -p->tok->indent;
            p->tok->indent = 0;
        }
    }
    else {
        p->parsing_started = 1;
    }

    // Check if we are at the limit of the token array capacity and resize if needed
    if ((p->fill == p->size) && (_resize_tokens_array(p) != 0)) {
        return -1;
    }

    Token *t = p->tokens[p->fill];
    return initialize_token(p, t, start, end, type);
}
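
/*
 * Example (illustrative): with PyPARSE_TYPE_COMMENTS enabled, a line
 * such as
 *
 *     x = []  # type: ignore[assignment]
 *
 * makes the tokenizer emit a TYPE_IGNORE token; the loop above copies
 * its trailing tag text into p->type_ignore_comments and keeps pulling
 * tokens, so the comment never reaches the grammar rules themselves.
 */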

#if defined(Py_DEBUG)
// Instrumentation to count the effectiveness of memoization.
// The array counts the number of tokens skipped by memoization,
// indexed by type.

#define NSTATISTICS 2000
static long memo_statistics[NSTATISTICS];

void
_PyPegen_clear_memo_statistics(void)
{
    for (int i = 0; i < NSTATISTICS; i++) {
        memo_statistics[i] = 0;
    }
}

PyObject *
_PyPegen_get_memo_statistics(void)
{
    PyObject *ret = PyList_New(NSTATISTICS);
    if (ret == NULL) {
        return NULL;
    }
    for (int i = 0; i < NSTATISTICS; i++) {
        PyObject *value = PyLong_FromLong(memo_statistics[i]);
        if (value == NULL) {
            Py_DECREF(ret);
            return NULL;
        }
        // PyList_SetItem borrows a reference to value.
        if (PyList_SetItem(ret, i, value) < 0) {
            Py_DECREF(ret);
            return NULL;
        }
    }
    return ret;
}
#endif

int // bool
_PyPegen_is_memoized(Parser *p, int type, void *pres)
{
    if (p->mark == p->fill) {
        if (_PyPegen_fill_token(p) < 0) {
            p->error_indicator = 1;
            return -1;
        }
    }

    Token *t = p->tokens[p->mark];

    for (Memo *m = t->memo; m != NULL; m = m->next) {
        if (m->type == type) {
#if defined(Py_DEBUG)
            if (0 <= type && type < NSTATISTICS) {
                long count = m->mark - p->mark;
                // A memoized negative result counts for one.
                if (count <= 0) {
                    count = 1;
                }
                memo_statistics[type] += count;
            }
#endif
            p->mark = m->mark;
            *(void **)(pres) = m->node;
            return 1;
        }
    }
    return 0;
}

int
_PyPegen_lookahead_with_name(int positive, expr_ty (func)(Parser *), Parser *p)
{
    int mark = p->mark;
    void *res = func(p);
    p->mark = mark;
    return (res != NULL) == positive;
}

int
_PyPegen_lookahead_with_string(int positive, expr_ty (func)(Parser *, const char*), Parser *p, const char* arg)
{
    int mark = p->mark;
    void *res = func(p, arg);
    p->mark = mark;
    return (res != NULL) == positive;
}

int
_PyPegen_lookahead_with_int(int positive, Token *(func)(Parser *, int), Parser *p, int arg)
{
    int mark = p->mark;
    void *res = func(p, arg);
    p->mark = mark;
    return (res != NULL) == positive;
}

int
_PyPegen_lookahead(int positive, void *(func)(Parser *), Parser *p)
{
    int mark = p->mark;
    void *res = (void*)func(p);
    p->mark = mark;
    return (res != NULL) == positive;
}
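
/*
 * Sketch of how the generated parser uses these wrappers (assumed
 * shape; the generator's exact output may differ). A positive
 * lookahead `&'('` and a negative lookahead `!'='` become roughly:
 *
 *     if (_PyPegen_lookahead_with_int(1, _PyPegen_expect_token, p, LPAR)
 *         && _PyPegen_lookahead_with_int(0, _PyPegen_expect_token, p, EQUAL))
 *     {
 *         ... attempt this alternative; p->mark is left untouched ...
 *     }
 */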

Token *
_PyPegen_expect_token(Parser *p, int type)
{
    if (p->mark == p->fill) {
        if (_PyPegen_fill_token(p) < 0) {
            p->error_indicator = 1;
            return NULL;
        }
    }
    Token *t = p->tokens[p->mark];
    if (t->type != type) {
        return NULL;
    }
    p->mark += 1;
    return t;
}

void*
_PyPegen_expect_forced_result(Parser *p, void* result, const char* expected) {

    if (p->error_indicator == 1) {
        return NULL;
    }
    if (result == NULL) {
        RAISE_SYNTAX_ERROR("expected (%s)", expected);
        return NULL;
    }
    return result;
}

Token *
_PyPegen_expect_forced_token(Parser *p, int type, const char* expected) {

    if (p->error_indicator == 1) {
        return NULL;
    }

    if (p->mark == p->fill) {
        if (_PyPegen_fill_token(p) < 0) {
            p->error_indicator = 1;
            return NULL;
        }
    }
    Token *t = p->tokens[p->mark];
    if (t->type != type) {
        RAISE_SYNTAX_ERROR_KNOWN_LOCATION(t, "expected '%s'", expected);
        return NULL;
    }
    p->mark += 1;
    return t;
}

expr_ty
_PyPegen_expect_soft_keyword(Parser *p, const char *keyword)
{
    if (p->mark == p->fill) {
        if (_PyPegen_fill_token(p) < 0) {
            p->error_indicator = 1;
            return NULL;
        }
    }
    Token *t = p->tokens[p->mark];
    if (t->type != NAME) {
        return NULL;
    }
    const char *s = PyBytes_AsString(t->bytes);
    if (!s) {
        p->error_indicator = 1;
        return NULL;
    }
    if (strcmp(s, keyword) != 0) {
        return NULL;
    }
    return _PyPegen_name_token(p);
}
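
/*
 * Example (illustrative): "match" is a soft keyword, i.e. an ordinary
 * NAME that only acts as a keyword in context. A grammar action can
 * gate on it with
 *
 *     expr_ty kw = _PyPegen_expect_soft_keyword(p, "match");
 *
 * which returns a Name node when the next token spells "match", and
 * NULL, without consuming any input, otherwise.
 */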

Token *
_PyPegen_get_last_nonnwhitespace_token(Parser *p)
{
    assert(p->mark >= 0);
    Token *token = NULL;
    for (int m = p->mark - 1; m >= 0; m--) {
        token = p->tokens[m];
        if (token->type != ENDMARKER && (token->type < NEWLINE || token->type > DEDENT)) {
            break;
        }
    }
    return token;
}

PyObject *
_PyPegen_new_identifier(Parser *p, const char *n)
{
    PyObject *id = PyUnicode_DecodeUTF8(n, strlen(n), NULL);
    if (!id) {
        goto error;
    }
    /* PyUnicode_DecodeUTF8 should always return a ready string. */
    assert(PyUnicode_IS_READY(id));
    /* Check whether there are non-ASCII characters in the
       identifier; if so, normalize to NFKC. */
    if (!PyUnicode_IS_ASCII(id))
    {
        PyObject *id2;
        if (!init_normalization(p))
        {
            Py_DECREF(id);
            goto error;
        }
        PyObject *form = PyUnicode_InternFromString("NFKC");
        if (form == NULL)
        {
            Py_DECREF(id);
            goto error;
        }
        PyObject *args[2] = {form, id};
        id2 = _PyObject_FastCall(p->normalize, args, 2);
        Py_DECREF(id);
        Py_DECREF(form);
        if (!id2) {
            goto error;
        }
        if (!PyUnicode_Check(id2))
        {
            PyErr_Format(PyExc_TypeError,
                         "unicodedata.normalize() must return a string, not "
                         "%.200s",
                         _PyType_Name(Py_TYPE(id2)));
            Py_DECREF(id2);
            goto error;
        }
        id = id2;
    }
    PyUnicode_InternInPlace(&id);
    if (_PyArena_AddPyObject(p->arena, id) < 0)
    {
        Py_DECREF(id);
        goto error;
    }
    return id;

error:
    p->error_indicator = 1;
    return NULL;
}
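
/*
 * Worked example (illustrative): identifiers are NFKC-normalized, so a
 * source file spelling the name with the ligature U+FB01 ('ﬁ'), as in
 * "ﬁle = 1", binds the same identifier as "file = 1", because
 * unicodedata.normalize("NFKC", "ﬁle") returns "file".
 */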

static expr_ty
_PyPegen_name_from_token(Parser *p, Token* t)
{
    if (t == NULL) {
        return NULL;
    }
    const char *s = PyBytes_AsString(t->bytes);
    if (!s) {
        p->error_indicator = 1;
        return NULL;
    }
    PyObject *id = _PyPegen_new_identifier(p, s);
    if (id == NULL) {
        p->error_indicator = 1;
        return NULL;
    }
    return _PyAST_Name(id, Load, t->lineno, t->col_offset, t->end_lineno,
                       t->end_col_offset, p->arena);
}

expr_ty
_PyPegen_name_token(Parser *p)
{
    Token *t = _PyPegen_expect_token(p, NAME);
    return _PyPegen_name_from_token(p, t);
}

void *
_PyPegen_string_token(Parser *p)
{
    return _PyPegen_expect_token(p, STRING);
}

expr_ty _PyPegen_soft_keyword_token(Parser *p) {
    Token *t = _PyPegen_expect_token(p, NAME);
    if (t == NULL) {
        return NULL;
    }
    char *the_token;
    Py_ssize_t size;
    PyBytes_AsStringAndSize(t->bytes, &the_token, &size);
    for (char **keyword = p->soft_keywords; *keyword != NULL; keyword++) {
        if (strncmp(*keyword, the_token, size) == 0) {
            return _PyPegen_name_from_token(p, t);
        }
    }
    return NULL;
}

static PyObject *
parsenumber_raw(const char *s)
{
    const char *end;
    long x;
    double dx;
    Py_complex compl;
    int imflag;

    assert(s != NULL);
    errno = 0;
    end = s + strlen(s) - 1;
    imflag = *end == 'j' || *end == 'J';
    if (s[0] == '0') {
        x = (long)PyOS_strtoul(s, (char **)&end, 0);
        if (x < 0 && errno == 0) {
            return PyLong_FromString(s, (char **)0, 0);
        }
    }
    else {
        x = PyOS_strtol(s, (char **)&end, 0);
    }
    if (*end == '\0') {
        if (errno != 0) {
            return PyLong_FromString(s, (char **)0, 0);
        }
        return PyLong_FromLong(x);
    }
    /* XXX Huge floats may silently fail */
    if (imflag) {
        compl.real = 0.;
        compl.imag = PyOS_string_to_double(s, (char **)&end, NULL);
        if (compl.imag == -1.0 && PyErr_Occurred()) {
            return NULL;
        }
        return PyComplex_FromCComplex(compl);
    }
    dx = PyOS_string_to_double(s, NULL, NULL);
    if (dx == -1.0 && PyErr_Occurred()) {
        return NULL;
    }
    return PyFloat_FromDouble(dx);
}

static PyObject *
parsenumber(const char *s)
{
    char *dup;
    char *end;
    PyObject *res = NULL;

    assert(s != NULL);

    if (strchr(s, '_') == NULL) {
        return parsenumber_raw(s);
    }
    /* Create a duplicate without underscores. */
    dup = PyMem_Malloc(strlen(s) + 1);
    if (dup == NULL) {
        return PyErr_NoMemory();
    }
    end = dup;
    for (; *s; s++) {
        if (*s != '_') {
            *end++ = *s;
        }
    }
    *end = '\0';
    res = parsenumber_raw(dup);
    PyMem_Free(dup);
    return res;
}
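
/*
 * Example (illustrative): "1_000_000" contains underscores, so
 * parsenumber() builds the duplicate "1000000" before delegating to
 * parsenumber_raw(); "0x_FF" likewise becomes "0xFF". Literals without
 * underscores, such as "42" or "3.14", skip the copy entirely.
 */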

expr_ty
_PyPegen_number_token(Parser *p)
{
    Token *t = _PyPegen_expect_token(p, NUMBER);
    if (t == NULL) {
        return NULL;
    }

    const char *num_raw = PyBytes_AsString(t->bytes);
    if (num_raw == NULL) {
        p->error_indicator = 1;
        return NULL;
    }

    if (p->feature_version < 6 && strchr(num_raw, '_') != NULL) {
        p->error_indicator = 1;
        return RAISE_SYNTAX_ERROR("Underscores in numeric literals are only supported "
                                  "in Python 3.6 and greater");
    }

    PyObject *c = parsenumber(num_raw);

    if (c == NULL) {
        p->error_indicator = 1;
        PyThreadState *tstate = _PyThreadState_GET();
        // The only way a ValueError should happen in _this_ code is via
        // PyLong_FromString hitting a length limit.
        if (tstate->curexc_type == PyExc_ValueError &&
            tstate->curexc_value != NULL) {
            PyObject *type, *value, *tb;
            // This acts as PyErr_Clear() as we're replacing curexc.
            PyErr_Fetch(&type, &value, &tb);
            Py_XDECREF(tb);
            Py_DECREF(type);
            /* Intentionally omitting columns to avoid a wall of 1000s of '^'s
             * on the error message. Nobody is going to overlook their huge
             * numeric literal once given the line. */
            RAISE_ERROR_KNOWN_LOCATION(
                p, PyExc_SyntaxError,
                t->lineno, -1 /* col_offset */,
                t->end_lineno, -1 /* end_col_offset */,
                "%S - Consider hexadecimal for huge integer literals "
                "to avoid decimal conversion limits.",
                value);
            Py_DECREF(value);
        }
        return NULL;
    }

    if (_PyArena_AddPyObject(p->arena, c) < 0) {
        Py_DECREF(c);
        p->error_indicator = 1;
        return NULL;
    }

    return _PyAST_Constant(c, NULL, t->lineno, t->col_offset, t->end_lineno,
                           t->end_col_offset, p->arena);
}

/* Check that the source for a single input statement really is a single
   statement by looking at what is left in the buffer after parsing.
   Trailing whitespace and comments are OK. */
static int // bool
bad_single_statement(Parser *p)
{
    char *cur = p->tok->cur;
    char c = *cur;

    for (;;) {
        while (c == ' ' || c == '\t' || c == '\n' || c == '\014') {
            c = *++cur;
        }

        if (!c) {
            return 0;
        }

        if (c != '#') {
            return 1;
        }

        /* Suck up comment. */
        while (c && c != '\n') {
            c = *++cur;
        }
    }
}
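
/*
 * Example (illustrative): after parsing "x = 1\ny = 2" in single-input
 * mode the buffer still holds "y = 2", so this returns 1 (bad input),
 * whereas leftover blank lines or "# just a comment" are skipped by the
 * loop above and give 0 (OK).
 */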

static int
compute_parser_flags(PyCompilerFlags *flags)
{
    int parser_flags = 0;
    if (!flags) {
        return 0;
    }
    if (flags->cf_flags & PyCF_DONT_IMPLY_DEDENT) {
        parser_flags |= PyPARSE_DONT_IMPLY_DEDENT;
    }
    if (flags->cf_flags & PyCF_IGNORE_COOKIE) {
        parser_flags |= PyPARSE_IGNORE_COOKIE;
    }
    if (flags->cf_flags & CO_FUTURE_BARRY_AS_BDFL) {
        parser_flags |= PyPARSE_BARRY_AS_BDFL;
    }
    if (flags->cf_flags & PyCF_TYPE_COMMENTS) {
        parser_flags |= PyPARSE_TYPE_COMMENTS;
    }
    if ((flags->cf_flags & PyCF_ONLY_AST) && flags->cf_feature_version < 7) {
        parser_flags |= PyPARSE_ASYNC_HACKS;
    }
    if (flags->cf_flags & PyCF_ALLOW_INCOMPLETE_INPUT) {
        parser_flags |= PyPARSE_ALLOW_INCOMPLETE_INPUT;
    }
    return parser_flags;
}
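
/*
 * Example (illustrative): ast.parse(source, type_comments=True) sets
 * PyCF_TYPE_COMMENTS on the compiler flags, which this function maps to
 * PyPARSE_TYPE_COMMENTS so that the tokenizer starts emitting
 * TYPE_COMMENT/TYPE_IGNORE tokens for the parser to record.
 */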

// Parser API

Parser *
_PyPegen_Parser_New(struct tok_state *tok, int start_rule, int flags,
                    int feature_version, int *errcode, PyArena *arena)
{
    Parser *p = PyMem_Malloc(sizeof(Parser));
    if (p == NULL) {
        return (Parser *) PyErr_NoMemory();
    }
    assert(tok != NULL);
    tok->type_comments = (flags & PyPARSE_TYPE_COMMENTS) > 0;
    tok->async_hacks = (flags & PyPARSE_ASYNC_HACKS) > 0;
    p->tok = tok;
    p->keywords = NULL;
    p->n_keyword_lists = -1;
    p->soft_keywords = NULL;
    p->tokens = PyMem_Malloc(sizeof(Token *));
    if (!p->tokens) {
        PyMem_Free(p);
        return (Parser *) PyErr_NoMemory();
    }
    p->tokens[0] = PyMem_Calloc(1, sizeof(Token));
    if (!p->tokens[0]) {
        PyMem_Free(p->tokens);
        PyMem_Free(p);
        return (Parser *) PyErr_NoMemory();
    }
    if (!growable_comment_array_init(&p->type_ignore_comments, 10)) {
        PyMem_Free(p->tokens[0]);
        PyMem_Free(p->tokens);
        PyMem_Free(p);
        return (Parser *) PyErr_NoMemory();
    }

    p->mark = 0;
    p->fill = 0;
    p->size = 1;

    p->errcode = errcode;
    p->arena = arena;
    p->start_rule = start_rule;
    p->parsing_started = 0;
    p->normalize = NULL;
    p->error_indicator = 0;

    p->starting_lineno = 0;
    p->starting_col_offset = 0;
    p->flags = flags;
    p->feature_version = feature_version;
    p->known_err_token = NULL;
    p->level = 0;
    p->call_invalid_rules = 0;
    return p;
}

void
_PyPegen_Parser_Free(Parser *p)
{
    Py_XDECREF(p->normalize);
    for (int i = 0; i < p->size; i++) {
        PyMem_Free(p->tokens[i]);
    }
    PyMem_Free(p->tokens);
    growable_comment_array_deallocate(&p->type_ignore_comments);
    PyMem_Free(p);
}

static void
reset_parser_state_for_error_pass(Parser *p)
{
    for (int i = 0; i < p->fill; i++) {
        p->tokens[i]->memo = NULL;
    }
    p->mark = 0;
    p->call_invalid_rules = 1;
    // Don't try to get extra tokens in interactive mode when trying to
    // raise specialized errors in the second pass.
    p->tok->interactive_underflow = IUNDERFLOW_STOP;
}

static inline int
_is_end_of_source(Parser *p) {
    int err = p->tok->done;
    return err == E_EOF || err == E_EOFS || err == E_EOLS;
}

void *
_PyPegen_run_parser(Parser *p)
{
    void *res = _PyPegen_parse(p);
    assert(p->level == 0);
    if (res == NULL) {
        if ((p->flags & PyPARSE_ALLOW_INCOMPLETE_INPUT) && _is_end_of_source(p)) {
            PyErr_Clear();
            return RAISE_SYNTAX_ERROR("incomplete input");
        }
        if (PyErr_Occurred() && !PyErr_ExceptionMatches(PyExc_SyntaxError)) {
            return NULL;
        }
        // Make a second parser pass. In this pass we activate heavier and slower checks
        // to produce better error messages and more complete diagnostics. Extra "invalid_*"
        // rules will be active during parsing.
        Token *last_token = p->tokens[p->fill - 1];
        reset_parser_state_for_error_pass(p);
        _PyPegen_parse(p);

        // Set SyntaxErrors accordingly depending on the parser/tokenizer status at the failure
        // point.
        _Pypegen_set_syntax_error(p, last_token);
        return NULL;
    }

    if (p->start_rule == Py_single_input && bad_single_statement(p)) {
        p->tok->done = E_BADSINGLE; // This is not necessary for now, but might be in the future
        return RAISE_SYNTAX_ERROR("multiple statements found while compiling a single statement");
    }

    // test_peg_generator defines _Py_TEST_PEGEN to not call PyAST_Validate()
#if defined(Py_DEBUG) && !defined(_Py_TEST_PEGEN)
    if (p->start_rule == Py_single_input ||
        p->start_rule == Py_file_input ||
        p->start_rule == Py_eval_input)
    {
        if (!_PyAST_Validate(res)) {
            return NULL;
        }
    }
#endif
    return res;
}

mod_ty
_PyPegen_run_parser_from_file_pointer(FILE *fp, int start_rule, PyObject *filename_ob,
                                      const char *enc, const char *ps1, const char *ps2,
                                      PyCompilerFlags *flags, int *errcode, PyArena *arena)
{
    struct tok_state *tok = _PyTokenizer_FromFile(fp, enc, ps1, ps2);
    if (tok == NULL) {
        if (PyErr_Occurred()) {
            _PyPegen_raise_tokenizer_init_error(filename_ob);
        }
        return NULL;
    }
    if (!tok->fp || ps1 != NULL || ps2 != NULL ||
        PyUnicode_CompareWithASCIIString(filename_ob, "<stdin>") == 0) {
        tok->fp_interactive = 1;
    }
    // This transfers the ownership to the tokenizer
    tok->filename = filename_ob;
    Py_INCREF(filename_ob);

    // From here on we need to clean up even if there's an error
    mod_ty result = NULL;

    int parser_flags = compute_parser_flags(flags);
    Parser *p = _PyPegen_Parser_New(tok, start_rule, parser_flags, PY_MINOR_VERSION,
                                    errcode, arena);
    if (p == NULL) {
        goto error;
    }

    result = _PyPegen_run_parser(p);
    _PyPegen_Parser_Free(p);

error:
    _PyTokenizer_Free(tok);
    return result;
}

mod_ty
_PyPegen_run_parser_from_string(const char *str, int start_rule, PyObject *filename_ob,
                                PyCompilerFlags *flags, PyArena *arena)
{
    int exec_input = start_rule == Py_file_input;

    struct tok_state *tok;
    if (flags != NULL && flags->cf_flags & PyCF_IGNORE_COOKIE) {
        tok = _PyTokenizer_FromUTF8(str, exec_input);
    } else {
        tok = _PyTokenizer_FromString(str, exec_input);
    }
    if (tok == NULL) {
        if (PyErr_Occurred()) {
            _PyPegen_raise_tokenizer_init_error(filename_ob);
        }
        return NULL;
    }
    // This transfers the ownership to the tokenizer
    tok->filename = filename_ob;
    Py_INCREF(filename_ob);

    // From here on we need to clean up even if there's an error
    mod_ty result = NULL;

    int parser_flags = compute_parser_flags(flags);
    int feature_version = flags && (flags->cf_flags & PyCF_ONLY_AST) ?
        flags->cf_feature_version : PY_MINOR_VERSION;
    Parser *p = _PyPegen_Parser_New(tok, start_rule, parser_flags, feature_version,
                                    NULL, arena);
    if (p == NULL) {
        goto error;
    }

    result = _PyPegen_run_parser(p);
    _PyPegen_Parser_Free(p);

error:
    _PyTokenizer_Free(tok);
    return result;
}
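
/*
 * Minimal embedding sketch (assumed caller code, not part of this
 * file): parse a string into an arena-allocated module AST.
 *
 *     PyArena *arena = _PyArena_New();
 *     PyObject *filename = PyUnicode_FromString("<example>");
 *     mod_ty mod = _PyPegen_run_parser_from_string(
 *         "x = 1\n", Py_file_input, filename, NULL, arena);
 *     // On failure mod is NULL and a SyntaxError is already set.
 *     Py_XDECREF(filename);
 *     _PyArena_Free(arena);   // mod and its nodes die with the arena
 */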