1 #include <Python.h>
2 #include "pycore_ast.h" // _PyAST_Validate(),
3 #include "pycore_pystate.h" // _PyThreadState_GET()
4 #include <errcode.h>
5
6 #include "tokenizer.h"
7 #include "pegen.h"
8
9 // Internal parser functions
10
11 asdl_stmt_seq*
12 _PyPegen_interactive_exit(Parser *p)
13 {
14 if (p->errcode) {
15 *(p->errcode) = E_EOF;
16 }
17 return NULL;
18 }
19
20 Py_ssize_t
21 _PyPegen_byte_offset_to_character_offset(PyObject *line, Py_ssize_t col_offset)
22 {
23 const char *str = PyUnicode_AsUTF8(line);
24 if (!str) {
25 return -1;
26 }
27 Py_ssize_t len = strlen(str);
28 if (col_offset > len + 1) {
29 col_offset = len + 1;
30 }
31 assert(col_offset >= 0);
32 PyObject *text = PyUnicode_DecodeUTF8(str, col_offset, "replace");
33 if (!text) {
34 return -1;
35 }
36 Py_ssize_t size = PyUnicode_GET_LENGTH(text);
37 Py_DECREF(text);
38 return size;
39 }
40
41 // Here, mark is the start of the node, while p->mark is the end.
42 // If node==NULL, they should be the same.
43 int
44 _PyPegen_insert_memo(Parser *p, int mark, int type, void *node)
45 {
46 // Insert in front
47 Memo *m = _PyArena_Malloc(p->arena, sizeof(Memo));
48 if (m == NULL) {
49 return -1;
50 }
51 m->type = type;
52 m->node = node;
53 m->mark = p->mark;
54 m->next = p->tokens[mark]->memo;
55 p->tokens[mark]->memo = m;
56 return 0;
57 }
58
59 // Like _PyPegen_insert_memo(), but updates an existing node if found.
60 int
61 _PyPegen_update_memo(Parser *p, int mark, int type, void *node)
62 {
63 for (Memo *m = p->tokens[mark]->memo; m != NULL; m = m->next) {
64 if (m->type == type) {
65 // Update existing node.
66 m->node = node;
67 m->mark = p->mark;
68 return 0;
69 }
70 }
71 // Insert new node.
72 return _PyPegen_insert_memo(p, mark, type, node);
73 }
74
75 static int
76 init_normalization(Parser *p)
77 {
78 if (p->normalize) {
79 return 1;
80 }
81 p->normalize = _PyImport_GetModuleAttrString("unicodedata", "normalize");
82 if (!p->normalize)
83 {
84 return 0;
85 }
86 return 1;
87 }
88
89 static int
90 growable_comment_array_init(growable_comment_array *arr, size_t initial_size) {
91 assert(initial_size > 0);
92 arr->items = PyMem_Malloc(initial_size * sizeof(*arr->items));
93 arr->size = initial_size;
94 arr->num_items = 0;
95
96 return arr->items != NULL;
97 }
98
99 static int
100 growable_comment_array_add(growable_comment_array *arr, int lineno, char *comment) {
101 if (arr->num_items >= arr->size) {
102 size_t new_size = arr->size * 2;
103 void *new_items_array = PyMem_Realloc(arr->items, new_size * sizeof(*arr->items));
104 if (!new_items_array) {
105 return 0;
106 }
107 arr->items = new_items_array;
108 arr->size = new_size;
109 }
110
111 arr->items[arr->num_items].lineno = lineno;
112 arr->items[arr->num_items].comment = comment; // Take ownership
113 arr->num_items++;
114 return 1;
115 }
116
117 static void
118 growable_comment_array_deallocate(growable_comment_array *arr) {
119 for (unsigned i = 0; i < arr->num_items; i++) {
120 PyMem_Free(arr->items[i].comment);
121 }
122 PyMem_Free(arr->items);
123 }
124
125 static int
126 _get_keyword_or_name_type(Parser *p, struct token *new_token)
127 {
128 int name_len = new_token->end_col_offset - new_token->col_offset;
129 assert(name_len > 0);
130
131 if (name_len >= p->n_keyword_lists ||
132 p->keywords[name_len] == NULL ||
133 p->keywords[name_len]->type == -1) {
134 return NAME;
135 }
136 for (KeywordToken *k = p->keywords[name_len]; k != NULL && k->type != -1; k++) {
137 if (strncmp(k->str, new_token->start, name_len) == 0) {
138 return k->type;
139 }
140 }
141 return NAME;
142 }
143
144 static int
145 initialize_token(Parser *p, Token *parser_token, struct token *new_token, int token_type) {
146 assert(parser_token != NULL);
147
148 parser_token->type = (token_type == NAME) ? _get_keyword_or_name_type(p, new_token) : token_type;
149 parser_token->bytes = PyBytes_FromStringAndSize(new_token->start, new_token->end - new_token->start);
150 if (parser_token->bytes == NULL) {
151 return -1;
152 }
153 if (_PyArena_AddPyObject(p->arena, parser_token->bytes) < 0) {
154 Py_DECREF(parser_token->bytes);
155 return -1;
156 }
157
158 parser_token->metadata = NULL;
159 if (new_token->metadata != NULL) {
160 if (_PyArena_AddPyObject(p->arena, new_token->metadata) < 0) {
161 Py_DECREF(parser_token->metadata);
162 return -1;
163 }
164 parser_token->metadata = new_token->metadata;
165 new_token->metadata = NULL;
166 }
167
168 parser_token->level = new_token->level;
169 parser_token->lineno = new_token->lineno;
170 parser_token->col_offset = p->tok->lineno == p->starting_lineno ? p->starting_col_offset + new_token->col_offset
171 : new_token->col_offset;
172 parser_token->end_lineno = new_token->end_lineno;
173 parser_token->end_col_offset = p->tok->lineno == p->starting_lineno ? p->starting_col_offset + new_token->end_col_offset
174 : new_token->end_col_offset;
175
176 p->fill += 1;
177
178 if (token_type == ERRORTOKEN && p->tok->done == E_DECODE) {
179 return _Pypegen_raise_decode_error(p);
180 }
181
182 return (token_type == ERRORTOKEN ? _Pypegen_tokenizer_error(p) : 0);
183 }
184
185 static int
186 _resize_tokens_array(Parser *p) {
187 int newsize = p->size * 2;
188 Token **new_tokens = PyMem_Realloc(p->tokens, newsize * sizeof(Token *));
189 if (new_tokens == NULL) {
190 PyErr_NoMemory();
191 return -1;
192 }
193 p->tokens = new_tokens;
194
195 for (int i = p->size; i < newsize; i++) {
196 p->tokens[i] = PyMem_Calloc(1, sizeof(Token));
197 if (p->tokens[i] == NULL) {
198 p->size = i; // Needed, in order to cleanup correctly after parser fails
199 PyErr_NoMemory();
200 return -1;
201 }
202 }
203 p->size = newsize;
204 return 0;
205 }
206
207 int
208 _PyPegen_fill_token(Parser *p)
209 {
210 struct token new_token;
211 _PyToken_Init(&new_token);
212 int type = _PyTokenizer_Get(p->tok, &new_token);
213
214 // Record and skip '# type: ignore' comments
215 while (type == TYPE_IGNORE) {
216 Py_ssize_t len = new_token.end_col_offset - new_token.col_offset;
217 char *tag = PyMem_Malloc(len + 1);
218 if (tag == NULL) {
219 PyErr_NoMemory();
220 goto error;
221 }
222 strncpy(tag, new_token.start, len);
223 tag[len] = '\0';
224 // Ownership of tag passes to the growable array
225 if (!growable_comment_array_add(&p->type_ignore_comments, p->tok->lineno, tag)) {
226 PyErr_NoMemory();
227 goto error;
228 }
229 type = _PyTokenizer_Get(p->tok, &new_token);
230 }
231
232 // If we have reached the end and we are in single input mode we need to insert a newline and reset the parsing
233 if (p->start_rule == Py_single_input && type == ENDMARKER && p->parsing_started) {
234 type = NEWLINE; /* Add an extra newline */
235 p->parsing_started = 0;
236
237 if (p->tok->indent && !(p->flags & PyPARSE_DONT_IMPLY_DEDENT)) {
238 p->tok->pendin = -p->tok->indent;
239 p->tok->indent = 0;
240 }
241 }
242 else {
243 p->parsing_started = 1;
244 }
245
246 // Check if we are at the limit of the token array capacity and resize if needed
247 if ((p->fill == p->size) && (_resize_tokens_array(p) != 0)) {
248 goto error;
249 }
250
251 Token *t = p->tokens[p->fill];
252 return initialize_token(p, t, &new_token, type);
253 error:
254 _PyToken_Free(&new_token);
255 return -1;
256 }
257
258 #if defined(Py_DEBUG)
259 // Instrumentation to count the effectiveness of memoization.
260 // The array counts the number of tokens skipped by memoization,
261 // indexed by type.
262
263 #define NSTATISTICS _PYPEGEN_NSTATISTICS
264 #define memo_statistics _PyRuntime.parser.memo_statistics
265
266 void
267 _PyPegen_clear_memo_statistics(void)
268 {
269 for (int i = 0; i < NSTATISTICS; i++) {
270 memo_statistics[i] = 0;
271 }
272 }
273
274 PyObject *
275 _PyPegen_get_memo_statistics(void)
276 {
277 PyObject *ret = PyList_New(NSTATISTICS);
278 if (ret == NULL) {
279 return NULL;
280 }
281 for (int i = 0; i < NSTATISTICS; i++) {
282 PyObject *value = PyLong_FromLong(memo_statistics[i]);
283 if (value == NULL) {
284 Py_DECREF(ret);
285 return NULL;
286 }
287 // PyList_SetItem borrows a reference to value.
288 if (PyList_SetItem(ret, i, value) < 0) {
289 Py_DECREF(ret);
290 return NULL;
291 }
292 }
293 return ret;
294 }
295 #endif
296
297 int // bool
298 _PyPegen_is_memoized(Parser *p, int type, void *pres)
299 {
300 if (p->mark == p->fill) {
301 if (_PyPegen_fill_token(p) < 0) {
302 p->error_indicator = 1;
303 return -1;
304 }
305 }
306
307 Token *t = p->tokens[p->mark];
308
309 for (Memo *m = t->memo; m != NULL; m = m->next) {
310 if (m->type == type) {
311 #if defined(PY_DEBUG)
312 if (0 <= type && type < NSTATISTICS) {
313 long count = m->mark - p->mark;
314 // A memoized negative result counts for one.
315 if (count <= 0) {
316 count = 1;
317 }
318 memo_statistics[type] += count;
319 }
320 #endif
321 p->mark = m->mark;
322 *(void **)(pres) = m->node;
323 return 1;
324 }
325 }
326 return 0;
327 }
328
329 int
330 _PyPegen_lookahead_with_name(int positive, expr_ty (func)(Parser *), Parser *p)
331 {
332 int mark = p->mark;
333 void *res = func(p);
334 p->mark = mark;
335 return (res != NULL) == positive;
336 }
337
338 int
339 _PyPegen_lookahead_with_string(int positive, expr_ty (func)(Parser *, const char*), Parser *p, const char* arg)
340 {
341 int mark = p->mark;
342 void *res = func(p, arg);
343 p->mark = mark;
344 return (res != NULL) == positive;
345 }
346
347 int
348 _PyPegen_lookahead_with_int(int positive, Token *(func)(Parser *, int), Parser *p, int arg)
349 {
350 int mark = p->mark;
351 void *res = func(p, arg);
352 p->mark = mark;
353 return (res != NULL) == positive;
354 }
355
356 int
357 _PyPegen_lookahead(int positive, void *(func)(Parser *), Parser *p)
358 {
359 int mark = p->mark;
360 void *res = (void*)func(p);
361 p->mark = mark;
362 return (res != NULL) == positive;
363 }
364
365 Token *
366 _PyPegen_expect_token(Parser *p, int type)
367 {
368 if (p->mark == p->fill) {
369 if (_PyPegen_fill_token(p) < 0) {
370 p->error_indicator = 1;
371 return NULL;
372 }
373 }
374 Token *t = p->tokens[p->mark];
375 if (t->type != type) {
376 return NULL;
377 }
378 p->mark += 1;
379 return t;
380 }
381
382 void*
383 _PyPegen_expect_forced_result(Parser *p, void* result, const char* expected) {
384
385 if (p->error_indicator == 1) {
386 return NULL;
387 }
388 if (result == NULL) {
389 RAISE_SYNTAX_ERROR("expected (%s)", expected);
390 return NULL;
391 }
392 return result;
393 }
394
395 Token *
396 _PyPegen_expect_forced_token(Parser *p, int type, const char* expected) {
397
398 if (p->error_indicator == 1) {
399 return NULL;
400 }
401
402 if (p->mark == p->fill) {
403 if (_PyPegen_fill_token(p) < 0) {
404 p->error_indicator = 1;
405 return NULL;
406 }
407 }
408 Token *t = p->tokens[p->mark];
409 if (t->type != type) {
410 RAISE_SYNTAX_ERROR_KNOWN_LOCATION(t, "expected '%s'", expected);
411 return NULL;
412 }
413 p->mark += 1;
414 return t;
415 }
416
417 expr_ty
418 _PyPegen_expect_soft_keyword(Parser *p, const char *keyword)
419 {
420 if (p->mark == p->fill) {
421 if (_PyPegen_fill_token(p) < 0) {
422 p->error_indicator = 1;
423 return NULL;
424 }
425 }
426 Token *t = p->tokens[p->mark];
427 if (t->type != NAME) {
428 return NULL;
429 }
430 const char *s = PyBytes_AsString(t->bytes);
431 if (!s) {
432 p->error_indicator = 1;
433 return NULL;
434 }
435 if (strcmp(s, keyword) != 0) {
436 return NULL;
437 }
438 return _PyPegen_name_token(p);
439 }
440
441 Token *
442 _PyPegen_get_last_nonnwhitespace_token(Parser *p)
443 {
444 assert(p->mark >= 0);
445 Token *token = NULL;
446 for (int m = p->mark - 1; m >= 0; m--) {
447 token = p->tokens[m];
448 if (token->type != ENDMARKER && (token->type < NEWLINE || token->type > DEDENT)) {
449 break;
450 }
451 }
452 return token;
453 }
454
PyObject *
_PyPegen_new_identifier(Parser *p, const char *n)
{
    // Turn the NUL-terminated identifier `n` into an interned unicode object
    // owned by the parser's arena. Non-ASCII identifiers are normalized to
    // NFKC first via unicodedata.normalize. Returns NULL and sets
    // p->error_indicator on failure; the returned object must NOT be
    // decref'd by the caller (the arena holds the reference).
    PyObject *id = PyUnicode_DecodeUTF8(n, strlen(n), NULL);
    if (!id) {
        goto error;
    }
    /* PyUnicode_DecodeUTF8 should always return a ready string. */
    assert(PyUnicode_IS_READY(id));
    /* Check whether there are non-ASCII characters in the
       identifier; if so, normalize to NFKC. */
    if (!PyUnicode_IS_ASCII(id))
    {
        PyObject *id2;
        // Make sure p->normalize holds unicodedata.normalize.
        if (!init_normalization(p))
        {
            Py_DECREF(id);
            goto error;
        }
        PyObject *form = PyUnicode_InternFromString("NFKC");
        if (form == NULL)
        {
            Py_DECREF(id);
            goto error;
        }
        // Equivalent to: unicodedata.normalize("NFKC", id)
        PyObject *args[2] = {form, id};
        id2 = _PyObject_FastCall(p->normalize, args, 2);
        Py_DECREF(id);
        Py_DECREF(form);
        if (!id2) {
            goto error;
        }
        // Guard against a broken/monkeypatched normalize returning non-str.
        if (!PyUnicode_Check(id2))
        {
            PyErr_Format(PyExc_TypeError,
                         "unicodedata.normalize() must return a string, not "
                         "%.200s",
                         _PyType_Name(Py_TYPE(id2)));
            Py_DECREF(id2);
            goto error;
        }
        id = id2;
    }
    PyUnicode_InternInPlace(&id);
    // From here on the arena owns the (sole) strong reference to id.
    if (_PyArena_AddPyObject(p->arena, id) < 0)
    {
        Py_DECREF(id);
        goto error;
    }
    return id;

error:
    p->error_indicator = 1;
    return NULL;
}
510
511 static expr_ty
512 _PyPegen_name_from_token(Parser *p, Token* t)
513 {
514 if (t == NULL) {
515 return NULL;
516 }
517 const char *s = PyBytes_AsString(t->bytes);
518 if (!s) {
519 p->error_indicator = 1;
520 return NULL;
521 }
522 PyObject *id = _PyPegen_new_identifier(p, s);
523 if (id == NULL) {
524 p->error_indicator = 1;
525 return NULL;
526 }
527 return _PyAST_Name(id, Load, t->lineno, t->col_offset, t->end_lineno,
528 t->end_col_offset, p->arena);
529 }
530
531 expr_ty
532 _PyPegen_name_token(Parser *p)
533 {
534 Token *t = _PyPegen_expect_token(p, NAME);
535 return _PyPegen_name_from_token(p, t);
536 }
537
538 void *
539 _PyPegen_string_token(Parser *p)
540 {
541 return _PyPegen_expect_token(p, STRING);
542 }
543
544 expr_ty _PyPegen_soft_keyword_token(Parser *p) {
545 Token *t = _PyPegen_expect_token(p, NAME);
546 if (t == NULL) {
547 return NULL;
548 }
549 char *the_token;
550 Py_ssize_t size;
551 PyBytes_AsStringAndSize(t->bytes, &the_token, &size);
552 for (char **keyword = p->soft_keywords; *keyword != NULL; keyword++) {
553 if (strncmp(*keyword, the_token, size) == 0) {
554 return _PyPegen_name_from_token(p, t);
555 }
556 }
557 return NULL;
558 }
559
static PyObject *
parsenumber_raw(const char *s)
{
    // Convert the text of a numeric literal (underscores already removed)
    // into a Python int, float or complex object. Returns a new reference,
    // or NULL with an exception set.
    const char *end;
    long x;
    double dx;
    Py_complex compl;
    int imflag;

    assert(s != NULL);
    errno = 0;
    end = s + strlen(s) - 1;
    // A trailing 'j'/'J' marks an imaginary (complex) literal.
    imflag = *end == 'j' || *end == 'J';
    if (s[0] == '0') {
        // Leading zero: binary/octal/hex prefix (or plain zero) — parse as
        // unsigned so e.g. 0xFFFFFFFF on 32-bit long doesn't reject.
        x = (long)PyOS_strtoul(s, (char **)&end, 0);
        if (x < 0 && errno == 0) {
            // Fit in unsigned long but not in long: fall back to
            // arbitrary-precision int.
            return PyLong_FromString(s, (char **)0, 0);
        }
    }
    else {
        x = PyOS_strtol(s, (char **)&end, 0);
    }
    if (*end == '\0') {
        // The whole string parsed as an integer.
        if (errno != 0) {
            // Overflowed a C long: use arbitrary precision.
            return PyLong_FromString(s, (char **)0, 0);
        }
        return PyLong_FromLong(x);
    }
    // Not a pure integer: parse as float or complex.
    /* XXX Huge floats may silently fail */
    if (imflag) {
        compl.real = 0.;
        compl.imag = PyOS_string_to_double(s, (char **)&end, NULL);
        if (compl.imag == -1.0 && PyErr_Occurred()) {
            return NULL;
        }
        return PyComplex_FromCComplex(compl);
    }
    dx = PyOS_string_to_double(s, NULL, NULL);
    if (dx == -1.0 && PyErr_Occurred()) {
        return NULL;
    }
    return PyFloat_FromDouble(dx);
}
603
604 static PyObject *
605 parsenumber(const char *s)
606 {
607 char *dup;
608 char *end;
609 PyObject *res = NULL;
610
611 assert(s != NULL);
612
613 if (strchr(s, '_') == NULL) {
614 return parsenumber_raw(s);
615 }
616 /* Create a duplicate without underscores. */
617 dup = PyMem_Malloc(strlen(s) + 1);
618 if (dup == NULL) {
619 return PyErr_NoMemory();
620 }
621 end = dup;
622 for (; *s; s++) {
623 if (*s != '_') {
624 *end++ = *s;
625 }
626 }
627 *end = '\0';
628 res = parsenumber_raw(dup);
629 PyMem_Free(dup);
630 return res;
631 }
632
expr_ty
_PyPegen_number_token(Parser *p)
{
    // Consume a NUMBER token and build a Constant AST node holding its
    // parsed int/float/complex value. Returns NULL with p->error_indicator
    // set on failure.
    Token *t = _PyPegen_expect_token(p, NUMBER);
    if (t == NULL) {
        return NULL;
    }

    const char *num_raw = PyBytes_AsString(t->bytes);
    if (num_raw == NULL) {
        p->error_indicator = 1;
        return NULL;
    }

    // Underscore separators require feature_version >= 6 (Python 3.6+).
    if (p->feature_version < 6 && strchr(num_raw, '_') != NULL) {
        p->error_indicator = 1;
        return RAISE_SYNTAX_ERROR("Underscores in numeric literals are only supported "
                                  "in Python 3.6 and greater");
    }

    PyObject *c = parsenumber(num_raw);

    if (c == NULL) {
        p->error_indicator = 1;
        PyThreadState *tstate = _PyThreadState_GET();
        // The only way a ValueError should happen in _this_ code is via
        // PyLong_FromString hitting a length limit.
        if (tstate->current_exception != NULL &&
            Py_TYPE(tstate->current_exception) == (PyTypeObject *)PyExc_ValueError
        ) {
            // Re-raise the int-conversion ValueError as a SyntaxError that
            // points at the offending literal's line.
            PyObject *exc = PyErr_GetRaisedException();
            /* Intentionally omitting columns to avoid a wall of 1000s of '^'s
             * on the error message. Nobody is going to overlook their huge
             * numeric literal once given the line. */
            RAISE_ERROR_KNOWN_LOCATION(
                p, PyExc_SyntaxError,
                t->lineno, -1 /* col_offset */,
                t->end_lineno, -1 /* end_col_offset */,
                "%S - Consider hexadecimal for huge integer literals "
                "to avoid decimal conversion limits.",
                exc);
            Py_DECREF(exc);
        }
        return NULL;
    }

    // The arena owns the constant object from here on.
    if (_PyArena_AddPyObject(p->arena, c) < 0) {
        Py_DECREF(c);
        p->error_indicator = 1;
        return NULL;
    }

    return _PyAST_Constant(c, NULL, t->lineno, t->col_offset, t->end_lineno,
                           t->end_col_offset, p->arena);
}
688
689 /* Check that the source for a single input statement really is a single
690 statement by looking at what is left in the buffer after parsing.
691 Trailing whitespace and comments are OK. */
692 static int // bool
693 bad_single_statement(Parser *p)
694 {
695 char *cur = p->tok->cur;
696 char c = *cur;
697
698 for (;;) {
699 while (c == ' ' || c == '\t' || c == '\n' || c == '\014') {
700 c = *++cur;
701 }
702
703 if (!c) {
704 return 0;
705 }
706
707 if (c != '#') {
708 return 1;
709 }
710
711 /* Suck up comment. */
712 while (c && c != '\n') {
713 c = *++cur;
714 }
715 }
716 }
717
718 static int
719 compute_parser_flags(PyCompilerFlags *flags)
720 {
721 int parser_flags = 0;
722 if (!flags) {
723 return 0;
724 }
725 if (flags->cf_flags & PyCF_DONT_IMPLY_DEDENT) {
726 parser_flags |= PyPARSE_DONT_IMPLY_DEDENT;
727 }
728 if (flags->cf_flags & PyCF_IGNORE_COOKIE) {
729 parser_flags |= PyPARSE_IGNORE_COOKIE;
730 }
731 if (flags->cf_flags & CO_FUTURE_BARRY_AS_BDFL) {
732 parser_flags |= PyPARSE_BARRY_AS_BDFL;
733 }
734 if (flags->cf_flags & PyCF_TYPE_COMMENTS) {
735 parser_flags |= PyPARSE_TYPE_COMMENTS;
736 }
737 if ((flags->cf_flags & PyCF_ONLY_AST) && flags->cf_feature_version < 7) {
738 parser_flags |= PyPARSE_ASYNC_HACKS;
739 }
740 if (flags->cf_flags & PyCF_ALLOW_INCOMPLETE_INPUT) {
741 parser_flags |= PyPARSE_ALLOW_INCOMPLETE_INPUT;
742 }
743 return parser_flags;
744 }
745
746 // Parser API
747
748 Parser *
749 _PyPegen_Parser_New(struct tok_state *tok, int start_rule, int flags,
750 int feature_version, int *errcode, PyArena *arena)
751 {
752 Parser *p = PyMem_Malloc(sizeof(Parser));
753 if (p == NULL) {
754 return (Parser *) PyErr_NoMemory();
755 }
756 assert(tok != NULL);
757 tok->type_comments = (flags & PyPARSE_TYPE_COMMENTS) > 0;
758 tok->async_hacks = (flags & PyPARSE_ASYNC_HACKS) > 0;
759 p->tok = tok;
760 p->keywords = NULL;
761 p->n_keyword_lists = -1;
762 p->soft_keywords = NULL;
763 p->tokens = PyMem_Malloc(sizeof(Token *));
764 if (!p->tokens) {
765 PyMem_Free(p);
766 return (Parser *) PyErr_NoMemory();
767 }
768 p->tokens[0] = PyMem_Calloc(1, sizeof(Token));
769 if (!p->tokens[0]) {
770 PyMem_Free(p->tokens);
771 PyMem_Free(p);
772 return (Parser *) PyErr_NoMemory();
773 }
774 if (!growable_comment_array_init(&p->type_ignore_comments, 10)) {
775 PyMem_Free(p->tokens[0]);
776 PyMem_Free(p->tokens);
777 PyMem_Free(p);
778 return (Parser *) PyErr_NoMemory();
779 }
780
781 p->mark = 0;
782 p->fill = 0;
783 p->size = 1;
784
785 p->errcode = errcode;
786 p->arena = arena;
787 p->start_rule = start_rule;
788 p->parsing_started = 0;
789 p->normalize = NULL;
790 p->error_indicator = 0;
791
792 p->starting_lineno = 0;
793 p->starting_col_offset = 0;
794 p->flags = flags;
795 p->feature_version = feature_version;
796 p->known_err_token = NULL;
797 p->level = 0;
798 p->call_invalid_rules = 0;
799 #ifdef Py_DEBUG
800 p->debug = _Py_GetConfig()->parser_debug;
801 #endif
802 return p;
803 }
804
805 void
806 _PyPegen_Parser_Free(Parser *p)
807 {
808 Py_XDECREF(p->normalize);
809 for (int i = 0; i < p->size; i++) {
810 PyMem_Free(p->tokens[i]);
811 }
812 PyMem_Free(p->tokens);
813 growable_comment_array_deallocate(&p->type_ignore_comments);
814 PyMem_Free(p);
815 }
816
817 static void
818 reset_parser_state_for_error_pass(Parser *p)
819 {
820 for (int i = 0; i < p->fill; i++) {
821 p->tokens[i]->memo = NULL;
822 }
823 p->mark = 0;
824 p->call_invalid_rules = 1;
825 // Don't try to get extra tokens in interactive mode when trying to
826 // raise specialized errors in the second pass.
827 p->tok->interactive_underflow = IUNDERFLOW_STOP;
828 }
829
830 static inline int
831 _is_end_of_source(Parser *p) {
832 int err = p->tok->done;
833 return err == E_EOF || err == E_EOFS || err == E_EOLS;
834 }
835
void *
_PyPegen_run_parser(Parser *p)
{
    // Top-level driver: run the generated parser once; if that fails, run a
    // second, slower pass with the invalid_* rules enabled to produce a
    // precise SyntaxError. Returns the parse result, or NULL with an
    // exception set.
    void *res = _PyPegen_parse(p);
    assert(p->level == 0);
    if (res == NULL) {
        // Incomplete input (e.g. REPL continuation) is reported specially.
        if ((p->flags & PyPARSE_ALLOW_INCOMPLETE_INPUT) && _is_end_of_source(p)) {
            PyErr_Clear();
            return RAISE_SYNTAX_ERROR("incomplete input");
        }
        // Non-syntax errors (e.g. MemoryError) propagate unchanged.
        if (PyErr_Occurred() && !PyErr_ExceptionMatches(PyExc_SyntaxError)) {
            return NULL;
        }
        // Make a second parser pass. In this pass we activate heavier and slower checks
        // to produce better error messages and more complete diagnostics. Extra "invalid_*"
        // rules will be active during parsing.
        Token *last_token = p->tokens[p->fill - 1];
        reset_parser_state_for_error_pass(p);
        _PyPegen_parse(p);

        // Set SyntaxErrors accordingly depending on the parser/tokenizer status at the failure
        // point.
        _Pypegen_set_syntax_error(p, last_token);
        return NULL;
    }

    // "single" input must really be a single statement.
    if (p->start_rule == Py_single_input && bad_single_statement(p)) {
        p->tok->done = E_BADSINGLE; // This is not necessary for now, but might be in the future
        return RAISE_SYNTAX_ERROR("multiple statements found while compiling a single statement");
    }

    // test_peg_generator defines _Py_TEST_PEGEN to not call PyAST_Validate()
#if defined(Py_DEBUG) && !defined(_Py_TEST_PEGEN)
    if (p->start_rule == Py_single_input ||
        p->start_rule == Py_file_input ||
        p->start_rule == Py_eval_input)
    {
        if (!_PyAST_Validate(res)) {
            return NULL;
        }
    }
#endif
    return res;
}
880
mod_ty
_PyPegen_run_parser_from_file_pointer(FILE *fp, int start_rule, PyObject *filename_ob,
                                      const char *enc, const char *ps1, const char *ps2,
                                      PyCompilerFlags *flags, int *errcode, PyArena *arena)
{
    // Parse source read from a FILE*. Returns the module AST (arena-owned)
    // or NULL with an exception set. The tokenizer is always freed before
    // returning; filename_ob's reference is transferred to the tokenizer.
    struct tok_state *tok = _PyTokenizer_FromFile(fp, enc, ps1, ps2);
    if (tok == NULL) {
        if (PyErr_Occurred()) {
            _PyPegen_raise_tokenizer_init_error(filename_ob);
            return NULL;
        }
        return NULL;
    }
    // Treat prompts or stdin as interactive input (affects error reporting
    // and underflow behavior).
    if (!tok->fp || ps1 != NULL || ps2 != NULL ||
        PyUnicode_CompareWithASCIIString(filename_ob, "<stdin>") == 0) {
        tok->fp_interactive = 1;
    }
    // This transfers the ownership to the tokenizer
    tok->filename = Py_NewRef(filename_ob);

    // From here on we need to clean up even if there's an error
    mod_ty result = NULL;

    int parser_flags = compute_parser_flags(flags);
    Parser *p = _PyPegen_Parser_New(tok, start_rule, parser_flags, PY_MINOR_VERSION,
                                    errcode, arena);
    if (p == NULL) {
        goto error;
    }

    result = _PyPegen_run_parser(p);
    _PyPegen_Parser_Free(p);

 error:
    _PyTokenizer_Free(tok);
    return result;
}
918
mod_ty
_PyPegen_run_parser_from_string(const char *str, int start_rule, PyObject *filename_ob,
                                PyCompilerFlags *flags, PyArena *arena)
{
    // Parse source held in a NUL-terminated string. Returns the module AST
    // (arena-owned) or NULL with an exception set. The tokenizer is always
    // freed before returning.
    int exec_input = start_rule == Py_file_input;

    struct tok_state *tok;
    // PyCF_IGNORE_COOKIE: treat the input as UTF-8 and skip coding-cookie
    // detection.
    if (flags != NULL && flags->cf_flags & PyCF_IGNORE_COOKIE) {
        tok = _PyTokenizer_FromUTF8(str, exec_input, 0);
    } else {
        tok = _PyTokenizer_FromString(str, exec_input, 0);
    }
    if (tok == NULL) {
        if (PyErr_Occurred()) {
            _PyPegen_raise_tokenizer_init_error(filename_ob);
        }
        return NULL;
    }
    // This transfers the ownership to the tokenizer
    tok->filename = Py_NewRef(filename_ob);

    // We need to clear up from here on
    mod_ty result = NULL;

    int parser_flags = compute_parser_flags(flags);
    // When compiling to AST only, honor the requested feature version.
    int feature_version = flags && (flags->cf_flags & PyCF_ONLY_AST) ?
        flags->cf_feature_version : PY_MINOR_VERSION;
    Parser *p = _PyPegen_Parser_New(tok, start_rule, parser_flags, feature_version,
                                    NULL, arena);
    if (p == NULL) {
        goto error;
    }

    result = _PyPegen_run_parser(p);
    _PyPegen_Parser_Free(p);

 error:
    _PyTokenizer_Free(tok);
    return result;
}