(root)/
Python-3.11.7/
Parser/
pegen.c
       1  #include <Python.h>
       2  #include "pycore_ast.h"           // _PyAST_Validate(),
       3  #include "pycore_pystate.h"       // _PyThreadState_GET()
       4  #include <errcode.h>
       5  
       6  #include "tokenizer.h"
       7  #include "pegen.h"
       8  
       9  // Internal parser functions
      10  
      11  asdl_stmt_seq*
      12  _PyPegen_interactive_exit(Parser *p)
      13  {
      14      if (p->errcode) {
      15          *(p->errcode) = E_EOF;
      16      }
      17      return NULL;
      18  }
      19  
      20  Py_ssize_t
      21  _PyPegen_byte_offset_to_character_offset(PyObject *line, Py_ssize_t col_offset)
      22  {
      23      const char *str = PyUnicode_AsUTF8(line);
      24      if (!str) {
      25          return -1;
      26      }
      27      Py_ssize_t len = strlen(str);
      28      if (col_offset > len + 1) {
      29          col_offset = len + 1;
      30      }
      31      assert(col_offset >= 0);
      32      PyObject *text = PyUnicode_DecodeUTF8(str, col_offset, "replace");
      33      if (!text) {
      34          return -1;
      35      }
      36      Py_ssize_t size = PyUnicode_GET_LENGTH(text);
      37      Py_DECREF(text);
      38      return size;
      39  }
      40  
      41  // Calculate the extra amount of width space the given source
      42  // code segment might take if it were to be displayed on a fixed
      43  // width output device. Supports wide unicode characters and emojis.
      44  Py_ssize_t
      45  _PyPegen_calculate_display_width(PyObject *line, Py_ssize_t character_offset)
      46  {
      47      PyObject *segment = PyUnicode_Substring(line, 0, character_offset);
      48      if (!segment) {
      49          return -1;
      50      }
      51  
      52      // Fast track for ascii strings
      53      if (PyUnicode_IS_ASCII(segment)) {
      54          Py_DECREF(segment);
      55          return character_offset;
      56      }
      57  
      58      PyObject *width_fn = _PyImport_GetModuleAttrString("unicodedata", "east_asian_width");
      59      if (!width_fn) {
      60          return -1;
      61      }
      62  
      63      Py_ssize_t width = 0;
      64      Py_ssize_t len = PyUnicode_GET_LENGTH(segment);
      65      for (Py_ssize_t i = 0; i < len; i++) {
      66          PyObject *chr = PyUnicode_Substring(segment, i, i + 1);
      67          if (!chr) {
      68              Py_DECREF(segment);
      69              Py_DECREF(width_fn);
      70              return -1;
      71          }
      72  
      73          PyObject *width_specifier = PyObject_CallOneArg(width_fn, chr);
      74          Py_DECREF(chr);
      75          if (!width_specifier) {
      76              Py_DECREF(segment);
      77              Py_DECREF(width_fn);
      78              return -1;
      79          }
      80  
      81          if (_PyUnicode_EqualToASCIIString(width_specifier, "W") ||
      82              _PyUnicode_EqualToASCIIString(width_specifier, "F")) {
      83              width += 2;
      84          }
      85          else {
      86              width += 1;
      87          }
      88          Py_DECREF(width_specifier);
      89      }
      90  
      91      Py_DECREF(segment);
      92      Py_DECREF(width_fn);
      93      return width;
      94  }
      95  
      96  // Here, mark is the start of the node, while p->mark is the end.
      97  // If node==NULL, they should be the same.
      98  int
      99  _PyPegen_insert_memo(Parser *p, int mark, int type, void *node)
     100  {
     101      // Insert in front
     102      Memo *m = _PyArena_Malloc(p->arena, sizeof(Memo));
     103      if (m == NULL) {
     104          return -1;
     105      }
     106      m->type = type;
     107      m->node = node;
     108      m->mark = p->mark;
     109      m->next = p->tokens[mark]->memo;
     110      p->tokens[mark]->memo = m;
     111      return 0;
     112  }
     113  
     114  // Like _PyPegen_insert_memo(), but updates an existing node if found.
     115  int
     116  _PyPegen_update_memo(Parser *p, int mark, int type, void *node)
     117  {
     118      for (Memo *m = p->tokens[mark]->memo; m != NULL; m = m->next) {
     119          if (m->type == type) {
     120              // Update existing node.
     121              m->node = node;
     122              m->mark = p->mark;
     123              return 0;
     124          }
     125      }
     126      // Insert new node.
     127      return _PyPegen_insert_memo(p, mark, type, node);
     128  }
     129  
     130  static int
     131  init_normalization(Parser *p)
     132  {
     133      if (p->normalize) {
     134          return 1;
     135      }
     136      PyObject *m = PyImport_ImportModule("unicodedata");
     137      if (!m)
     138      {
     139          return 0;
     140      }
     141      p->normalize = PyObject_GetAttrString(m, "normalize");
     142      Py_DECREF(m);
     143      if (!p->normalize)
     144      {
     145          return 0;
     146      }
     147      return 1;
     148  }
     149  
     150  static int
     151  growable_comment_array_init(growable_comment_array *arr, size_t initial_size) {
     152      assert(initial_size > 0);
     153      arr->items = PyMem_Malloc(initial_size * sizeof(*arr->items));
     154      arr->size = initial_size;
     155      arr->num_items = 0;
     156  
     157      return arr->items != NULL;
     158  }
     159  
     160  static int
     161  growable_comment_array_add(growable_comment_array *arr, int lineno, char *comment) {
     162      if (arr->num_items >= arr->size) {
     163          size_t new_size = arr->size * 2;
     164          void *new_items_array = PyMem_Realloc(arr->items, new_size * sizeof(*arr->items));
     165          if (!new_items_array) {
     166              return 0;
     167          }
     168          arr->items = new_items_array;
     169          arr->size = new_size;
     170      }
     171  
     172      arr->items[arr->num_items].lineno = lineno;
     173      arr->items[arr->num_items].comment = comment;  // Take ownership
     174      arr->num_items++;
     175      return 1;
     176  }
     177  
     178  static void
     179  growable_comment_array_deallocate(growable_comment_array *arr) {
     180      for (unsigned i = 0; i < arr->num_items; i++) {
     181          PyMem_Free(arr->items[i].comment);
     182      }
     183      PyMem_Free(arr->items);
     184  }
     185  
     186  static int
     187  _get_keyword_or_name_type(Parser *p, const char *name, int name_len)
     188  {
     189      assert(name_len > 0);
     190      if (name_len >= p->n_keyword_lists ||
     191          p->keywords[name_len] == NULL ||
     192          p->keywords[name_len]->type == -1) {
     193          return NAME;
     194      }
     195      for (KeywordToken *k = p->keywords[name_len]; k != NULL && k->type != -1; k++) {
     196          if (strncmp(k->str, name, name_len) == 0) {
     197              return k->type;
     198          }
     199      }
     200      return NAME;
     201  }
     202  
// Fill in `token` (slot p->fill) from the byte range [start, end) just
// produced by the tokenizer, resolving keywords and attaching source
// locations.  Returns 0 on success, -1 on failure with an exception set.
static int
initialize_token(Parser *p, Token *token, const char *start, const char *end, int token_type) {
    assert(token != NULL);

    // A NAME may actually be a reserved keyword; the keyword tables decide.
    token->type = (token_type == NAME) ? _get_keyword_or_name_type(p, start, (int)(end - start)) : token_type;
    token->bytes = PyBytes_FromStringAndSize(start, end - start);
    if (token->bytes == NULL) {
        return -1;
    }

    // The arena keeps the bytes object alive for the parser's lifetime.
    if (_PyArena_AddPyObject(p->arena, token->bytes) < 0) {
        Py_DECREF(token->bytes);
        return -1;
    }

    token->level = p->tok->level;

    // STRING tokens can span multiple lines, so their start location comes
    // from the tokenizer's multi-line bookkeeping, not the current line.
    const char *line_start = token_type == STRING ? p->tok->multi_line_start : p->tok->line_start;
    int lineno = token_type == STRING ? p->tok->first_lineno : p->tok->lineno;
    int end_lineno = p->tok->lineno;

    // -1 is the sentinel for "column unknown" (pointer outside the line).
    int col_offset = (start != NULL && start >= line_start) ? (int)(start - line_start) : -1;
    int end_col_offset = (end != NULL && end >= p->tok->line_start) ? (int)(end - p->tok->line_start) : -1;

    token->lineno = lineno;
    // On the starting line, columns are shifted by the parser's initial
    // offset — presumably nonzero when parsing an embedded sub-source;
    // confirm against the callers that set starting_col_offset.
    token->col_offset = p->tok->lineno == p->starting_lineno ? p->starting_col_offset + col_offset : col_offset;
    token->end_lineno = end_lineno;
    token->end_col_offset = p->tok->lineno == p->starting_lineno ? p->starting_col_offset + end_col_offset : end_col_offset;

    p->fill += 1;

    // Decode failures get a dedicated exception with position info.
    if (token_type == ERRORTOKEN && p->tok->done == E_DECODE) {
        return _Pypegen_raise_decode_error(p);
    }

    // Any other tokenizer error is translated into a Python exception here.
    return (token_type == ERRORTOKEN ? _Pypegen_tokenizer_error(p) : 0);
}
     240  
     241  static int
     242  _resize_tokens_array(Parser *p) {
     243      int newsize = p->size * 2;
     244      Token **new_tokens = PyMem_Realloc(p->tokens, newsize * sizeof(Token *));
     245      if (new_tokens == NULL) {
     246          PyErr_NoMemory();
     247          return -1;
     248      }
     249      p->tokens = new_tokens;
     250  
     251      for (int i = p->size; i < newsize; i++) {
     252          p->tokens[i] = PyMem_Calloc(1, sizeof(Token));
     253          if (p->tokens[i] == NULL) {
     254              p->size = i; // Needed, in order to cleanup correctly after parser fails
     255              PyErr_NoMemory();
     256              return -1;
     257          }
     258      }
     259      p->size = newsize;
     260      return 0;
     261  }
     262  
     263  int
     264  _PyPegen_fill_token(Parser *p)
     265  {
     266      const char *start;
     267      const char *end;
     268      int type = _PyTokenizer_Get(p->tok, &start, &end);
     269  
     270      // Record and skip '# type: ignore' comments
     271      while (type == TYPE_IGNORE) {
     272          Py_ssize_t len = end - start;
     273          char *tag = PyMem_Malloc(len + 1);
     274          if (tag == NULL) {
     275              PyErr_NoMemory();
     276              return -1;
     277          }
     278          strncpy(tag, start, len);
     279          tag[len] = '\0';
     280          // Ownership of tag passes to the growable array
     281          if (!growable_comment_array_add(&p->type_ignore_comments, p->tok->lineno, tag)) {
     282              PyErr_NoMemory();
     283              return -1;
     284          }
     285          type = _PyTokenizer_Get(p->tok, &start, &end);
     286      }
     287  
     288      // If we have reached the end and we are in single input mode we need to insert a newline and reset the parsing
     289      if (p->start_rule == Py_single_input && type == ENDMARKER && p->parsing_started) {
     290          type = NEWLINE; /* Add an extra newline */
     291          p->parsing_started = 0;
     292  
     293          if (p->tok->indent && !(p->flags & PyPARSE_DONT_IMPLY_DEDENT)) {
     294              p->tok->pendin = -p->tok->indent;
     295              p->tok->indent = 0;
     296          }
     297      }
     298      else {
     299          p->parsing_started = 1;
     300      }
     301  
     302      // Check if we are at the limit of the token array capacity and resize if needed
     303      if ((p->fill == p->size) && (_resize_tokens_array(p) != 0)) {
     304          return -1;
     305      }
     306  
     307      Token *t = p->tokens[p->fill];
     308      return initialize_token(p, t, start, end, type);
     309  }
     310  
     311  #if defined(Py_DEBUG)
     312  // Instrumentation to count the effectiveness of memoization.
     313  // The array counts the number of tokens skipped by memoization,
     314  // indexed by type.
     315  
     316  #define NSTATISTICS 2000
     317  static long memo_statistics[NSTATISTICS];
     318  
     319  void
     320  _PyPegen_clear_memo_statistics(void)
     321  {
     322      for (int i = 0; i < NSTATISTICS; i++) {
     323          memo_statistics[i] = 0;
     324      }
     325  }
     326  
     327  PyObject *
     328  _PyPegen_get_memo_statistics(void)
     329  {
     330      PyObject *ret = PyList_New(NSTATISTICS);
     331      if (ret == NULL) {
     332          return NULL;
     333      }
     334      for (int i = 0; i < NSTATISTICS; i++) {
     335          PyObject *value = PyLong_FromLong(memo_statistics[i]);
     336          if (value == NULL) {
     337              Py_DECREF(ret);
     338              return NULL;
     339          }
     340          // PyList_SetItem borrows a reference to value.
     341          if (PyList_SetItem(ret, i, value) < 0) {
     342              Py_DECREF(ret);
     343              return NULL;
     344          }
     345      }
     346      return ret;
     347  }
     348  #endif
     349  
     350  int  // bool
     351  _PyPegen_is_memoized(Parser *p, int type, void *pres)
     352  {
     353      if (p->mark == p->fill) {
     354          if (_PyPegen_fill_token(p) < 0) {
     355              p->error_indicator = 1;
     356              return -1;
     357          }
     358      }
     359  
     360      Token *t = p->tokens[p->mark];
     361  
     362      for (Memo *m = t->memo; m != NULL; m = m->next) {
     363          if (m->type == type) {
     364  #if defined(PY_DEBUG)
     365              if (0 <= type && type < NSTATISTICS) {
     366                  long count = m->mark - p->mark;
     367                  // A memoized negative result counts for one.
     368                  if (count <= 0) {
     369                      count = 1;
     370                  }
     371                  memo_statistics[type] += count;
     372              }
     373  #endif
     374              p->mark = m->mark;
     375              *(void **)(pres) = m->node;
     376              return 1;
     377          }
     378      }
     379      return 0;
     380  }
     381  
     382  int
     383  _PyPegen_lookahead_with_name(int positive, expr_ty (func)(Parser *), Parser *p)
     384  {
     385      int mark = p->mark;
     386      void *res = func(p);
     387      p->mark = mark;
     388      return (res != NULL) == positive;
     389  }
     390  
     391  int
     392  _PyPegen_lookahead_with_string(int positive, expr_ty (func)(Parser *, const char*), Parser *p, const char* arg)
     393  {
     394      int mark = p->mark;
     395      void *res = func(p, arg);
     396      p->mark = mark;
     397      return (res != NULL) == positive;
     398  }
     399  
     400  int
     401  _PyPegen_lookahead_with_int(int positive, Token *(func)(Parser *, int), Parser *p, int arg)
     402  {
     403      int mark = p->mark;
     404      void *res = func(p, arg);
     405      p->mark = mark;
     406      return (res != NULL) == positive;
     407  }
     408  
     409  int
     410  _PyPegen_lookahead(int positive, void *(func)(Parser *), Parser *p)
     411  {
     412      int mark = p->mark;
     413      void *res = (void*)func(p);
     414      p->mark = mark;
     415      return (res != NULL) == positive;
     416  }
     417  
     418  Token *
     419  _PyPegen_expect_token(Parser *p, int type)
     420  {
     421      if (p->mark == p->fill) {
     422          if (_PyPegen_fill_token(p) < 0) {
     423              p->error_indicator = 1;
     424              return NULL;
     425          }
     426      }
     427      Token *t = p->tokens[p->mark];
     428      if (t->type != type) {
     429          return NULL;
     430      }
     431      p->mark += 1;
     432      return t;
     433  }
     434  
     435  void*
     436  _PyPegen_expect_forced_result(Parser *p, void* result, const char* expected) {
     437  
     438      if (p->error_indicator == 1) {
     439          return NULL;
     440      }
     441      if (result == NULL) {
     442          RAISE_SYNTAX_ERROR("expected (%s)", expected);
     443          return NULL;
     444      }
     445      return result;
     446  }
     447  
     448  Token *
     449  _PyPegen_expect_forced_token(Parser *p, int type, const char* expected) {
     450  
     451      if (p->error_indicator == 1) {
     452          return NULL;
     453      }
     454  
     455      if (p->mark == p->fill) {
     456          if (_PyPegen_fill_token(p) < 0) {
     457              p->error_indicator = 1;
     458              return NULL;
     459          }
     460      }
     461      Token *t = p->tokens[p->mark];
     462      if (t->type != type) {
     463          RAISE_SYNTAX_ERROR_KNOWN_LOCATION(t, "expected '%s'", expected);
     464          return NULL;
     465      }
     466      p->mark += 1;
     467      return t;
     468  }
     469  
     470  expr_ty
     471  _PyPegen_expect_soft_keyword(Parser *p, const char *keyword)
     472  {
     473      if (p->mark == p->fill) {
     474          if (_PyPegen_fill_token(p) < 0) {
     475              p->error_indicator = 1;
     476              return NULL;
     477          }
     478      }
     479      Token *t = p->tokens[p->mark];
     480      if (t->type != NAME) {
     481          return NULL;
     482      }
     483      const char *s = PyBytes_AsString(t->bytes);
     484      if (!s) {
     485          p->error_indicator = 1;
     486          return NULL;
     487      }
     488      if (strcmp(s, keyword) != 0) {
     489          return NULL;
     490      }
     491      return _PyPegen_name_token(p);
     492  }
     493  
     494  Token *
     495  _PyPegen_get_last_nonnwhitespace_token(Parser *p)
     496  {
     497      assert(p->mark >= 0);
     498      Token *token = NULL;
     499      for (int m = p->mark - 1; m >= 0; m--) {
     500          token = p->tokens[m];
     501          if (token->type != ENDMARKER && (token->type < NEWLINE || token->type > DEDENT)) {
     502              break;
     503          }
     504      }
     505      return token;
     506  }
     507  
// Convert the UTF-8 identifier `n` into an interned Python str owned by
// the parser's arena.  Non-ASCII identifiers are NFKC-normalized first
// via unicodedata.normalize.  Returns NULL with p->error_indicator set
// on failure; the arena keeps the returned object alive.
PyObject *
_PyPegen_new_identifier(Parser *p, const char *n)
{
    PyObject *id = PyUnicode_DecodeUTF8(n, strlen(n), NULL);
    if (!id) {
        goto error;
    }
    /* PyUnicode_DecodeUTF8 should always return a ready string. */
    assert(PyUnicode_IS_READY(id));
    /* Check whether there are non-ASCII characters in the
       identifier; if so, normalize to NFKC. */
    if (!PyUnicode_IS_ASCII(id))
    {
        PyObject *id2;
        // Lazily import unicodedata.normalize (cached on the parser).
        if (!init_normalization(p))
        {
            Py_DECREF(id);
            goto error;
        }
        PyObject *form = PyUnicode_InternFromString("NFKC");
        if (form == NULL)
        {
            Py_DECREF(id);
            goto error;
        }
        PyObject *args[2] = {form, id};
        id2 = _PyObject_FastCall(p->normalize, args, 2);
        Py_DECREF(id);
        Py_DECREF(form);
        if (!id2) {
            goto error;
        }
        // normalize() is arbitrary Python code; guard against a non-str
        // return value before treating it as an identifier.
        if (!PyUnicode_Check(id2))
        {
            PyErr_Format(PyExc_TypeError,
                         "unicodedata.normalize() must return a string, not "
                         "%.200s",
                         _PyType_Name(Py_TYPE(id2)));
            Py_DECREF(id2);
            goto error;
        }
        id = id2;
    }
    PyUnicode_InternInPlace(&id);
    // Hand the (sole) reference to the arena, which owns it from here on.
    if (_PyArena_AddPyObject(p->arena, id) < 0)
    {
        Py_DECREF(id);
        goto error;
    }
    return id;

error:
    p->error_indicator = 1;
    return NULL;
}
     563  
     564  static expr_ty
     565  _PyPegen_name_from_token(Parser *p, Token* t)
     566  {
     567      if (t == NULL) {
     568          return NULL;
     569      }
     570      const char *s = PyBytes_AsString(t->bytes);
     571      if (!s) {
     572          p->error_indicator = 1;
     573          return NULL;
     574      }
     575      PyObject *id = _PyPegen_new_identifier(p, s);
     576      if (id == NULL) {
     577          p->error_indicator = 1;
     578          return NULL;
     579      }
     580      return _PyAST_Name(id, Load, t->lineno, t->col_offset, t->end_lineno,
     581                         t->end_col_offset, p->arena);
     582  }
     583  
     584  expr_ty
     585  _PyPegen_name_token(Parser *p)
     586  {
     587      Token *t = _PyPegen_expect_token(p, NAME);
     588      return _PyPegen_name_from_token(p, t);
     589  }
     590  
     591  void *
     592  _PyPegen_string_token(Parser *p)
     593  {
     594      return _PyPegen_expect_token(p, STRING);
     595  }
     596  
     597  expr_ty _PyPegen_soft_keyword_token(Parser *p) {
     598      Token *t = _PyPegen_expect_token(p, NAME);
     599      if (t == NULL) {
     600          return NULL;
     601      }
     602      char *the_token;
     603      Py_ssize_t size;
     604      PyBytes_AsStringAndSize(t->bytes, &the_token, &size);
     605      for (char **keyword = p->soft_keywords; *keyword != NULL; keyword++) {
     606          if (strncmp(*keyword, the_token, size) == 0) {
     607              return _PyPegen_name_from_token(p, t);
     608          }
     609      }
     610      return NULL;
     611  }
     612  
// Parse the text of a NUMBER token (already stripped of underscores)
// into an int, float, or complex object.  Returns a new reference, or
// NULL with an exception set.
static PyObject *
parsenumber_raw(const char *s)
{
    const char *end;
    long x;
    double dx;
    Py_complex compl;
    int imflag;

    assert(s != NULL);
    errno = 0;
    end = s + strlen(s) - 1;
    // A trailing 'j'/'J' marks an imaginary literal.
    imflag = *end == 'j' || *end == 'J';
    if (s[0] == '0') {
        // Leading zero: binary/octal/hex (or plain 0); base 0 auto-detects.
        x = (long)PyOS_strtoul(s, (char **)&end, 0);
        if (x < 0 && errno == 0) {
            // Fits in unsigned long but not long: redo with arbitrary
            // precision via PyLong_FromString.
            return PyLong_FromString(s, (char **)0, 0);
        }
    }
    else {
        x = PyOS_strtol(s, (char **)&end, 0);
    }
    // If conversion consumed the whole string, the literal is an integer.
    if (*end == '\0') {
        if (errno != 0) {
            // C long overflow: fall back to arbitrary precision.
            return PyLong_FromString(s, (char **)0, 0);
        }
        return PyLong_FromLong(x);
    }
    /* XXX Huge floats may silently fail */
    if (imflag) {
        compl.real = 0.;
        compl.imag = PyOS_string_to_double(s, (char **)&end, NULL);
        if (compl.imag == -1.0 && PyErr_Occurred()) {
            return NULL;
        }
        return PyComplex_FromCComplex(compl);
    }
    dx = PyOS_string_to_double(s, NULL, NULL);
    if (dx == -1.0 && PyErr_Occurred()) {
        return NULL;
    }
    return PyFloat_FromDouble(dx);
}
     656  
     657  static PyObject *
     658  parsenumber(const char *s)
     659  {
     660      char *dup;
     661      char *end;
     662      PyObject *res = NULL;
     663  
     664      assert(s != NULL);
     665  
     666      if (strchr(s, '_') == NULL) {
     667          return parsenumber_raw(s);
     668      }
     669      /* Create a duplicate without underscores. */
     670      dup = PyMem_Malloc(strlen(s) + 1);
     671      if (dup == NULL) {
     672          return PyErr_NoMemory();
     673      }
     674      end = dup;
     675      for (; *s; s++) {
     676          if (*s != '_') {
     677              *end++ = *s;
     678          }
     679      }
     680      *end = '\0';
     681      res = parsenumber_raw(dup);
     682      PyMem_Free(dup);
     683      return res;
     684  }
     685  
// Parse a NUMBER token into a Constant AST node.  Enforces the
// feature_version gate on underscore separators (3.6+) and rewrites the
// ValueError from the int-conversion digit limit into a SyntaxError at
// the literal's location.
expr_ty
_PyPegen_number_token(Parser *p)
{
    Token *t = _PyPegen_expect_token(p, NUMBER);
    if (t == NULL) {
        return NULL;
    }

    const char *num_raw = PyBytes_AsString(t->bytes);
    if (num_raw == NULL) {
        p->error_indicator = 1;
        return NULL;
    }

    if (p->feature_version < 6 && strchr(num_raw, '_') != NULL) {
        p->error_indicator = 1;
        return RAISE_SYNTAX_ERROR("Underscores in numeric literals are only supported "
                                  "in Python 3.6 and greater");
    }

    PyObject *c = parsenumber(num_raw);

    if (c == NULL) {
        p->error_indicator = 1;
        PyThreadState *tstate = _PyThreadState_GET();
        // The only way a ValueError should happen in _this_ code is via
        // PyLong_FromString hitting a length limit.
        if (tstate->curexc_type == PyExc_ValueError &&
            tstate->curexc_value != NULL) {
            PyObject *type, *value, *tb;
            // This acts as PyErr_Clear() as we're replacing curexc.
            PyErr_Fetch(&type, &value, &tb);
            Py_XDECREF(tb);
            Py_DECREF(type);
            /* Intentionally omitting columns to avoid a wall of 1000s of '^'s
             * on the error message. Nobody is going to overlook their huge
             * numeric literal once given the line. */
            RAISE_ERROR_KNOWN_LOCATION(
                p, PyExc_SyntaxError,
                t->lineno, -1 /* col_offset */,
                t->end_lineno, -1 /* end_col_offset */,
                "%S - Consider hexadecimal for huge integer literals "
                "to avoid decimal conversion limits.",
                value);
            Py_DECREF(value);
        }
        return NULL;
    }

    // The arena keeps the constant alive for the lifetime of the AST.
    if (_PyArena_AddPyObject(p->arena, c) < 0) {
        Py_DECREF(c);
        p->error_indicator = 1;
        return NULL;
    }

    return _PyAST_Constant(c, NULL, t->lineno, t->col_offset, t->end_lineno,
                           t->end_col_offset, p->arena);
}
     744  
     745  /* Check that the source for a single input statement really is a single
     746     statement by looking at what is left in the buffer after parsing.
     747     Trailing whitespace and comments are OK. */
     748  static int // bool
     749  bad_single_statement(Parser *p)
     750  {
     751      char *cur = p->tok->cur;
     752      char c = *cur;
     753  
     754      for (;;) {
     755          while (c == ' ' || c == '\t' || c == '\n' || c == '\014') {
     756              c = *++cur;
     757          }
     758  
     759          if (!c) {
     760              return 0;
     761          }
     762  
     763          if (c != '#') {
     764              return 1;
     765          }
     766  
     767          /* Suck up comment. */
     768          while (c && c != '\n') {
     769              c = *++cur;
     770          }
     771      }
     772  }
     773  
     774  static int
     775  compute_parser_flags(PyCompilerFlags *flags)
     776  {
     777      int parser_flags = 0;
     778      if (!flags) {
     779          return 0;
     780      }
     781      if (flags->cf_flags & PyCF_DONT_IMPLY_DEDENT) {
     782          parser_flags |= PyPARSE_DONT_IMPLY_DEDENT;
     783      }
     784      if (flags->cf_flags & PyCF_IGNORE_COOKIE) {
     785          parser_flags |= PyPARSE_IGNORE_COOKIE;
     786      }
     787      if (flags->cf_flags & CO_FUTURE_BARRY_AS_BDFL) {
     788          parser_flags |= PyPARSE_BARRY_AS_BDFL;
     789      }
     790      if (flags->cf_flags & PyCF_TYPE_COMMENTS) {
     791          parser_flags |= PyPARSE_TYPE_COMMENTS;
     792      }
     793      if ((flags->cf_flags & PyCF_ONLY_AST) && flags->cf_feature_version < 7) {
     794          parser_flags |= PyPARSE_ASYNC_HACKS;
     795      }
     796      if (flags->cf_flags & PyCF_ALLOW_INCOMPLETE_INPUT) {
     797          parser_flags |= PyPARSE_ALLOW_INCOMPLETE_INPUT;
     798      }
     799      return parser_flags;
     800  }
     801  
     802  // Parser API
     803  
     804  Parser *
     805  _PyPegen_Parser_New(struct tok_state *tok, int start_rule, int flags,
     806                      int feature_version, int *errcode, PyArena *arena)
     807  {
     808      Parser *p = PyMem_Malloc(sizeof(Parser));
     809      if (p == NULL) {
     810          return (Parser *) PyErr_NoMemory();
     811      }
     812      assert(tok != NULL);
     813      tok->type_comments = (flags & PyPARSE_TYPE_COMMENTS) > 0;
     814      tok->async_hacks = (flags & PyPARSE_ASYNC_HACKS) > 0;
     815      p->tok = tok;
     816      p->keywords = NULL;
     817      p->n_keyword_lists = -1;
     818      p->soft_keywords = NULL;
     819      p->tokens = PyMem_Malloc(sizeof(Token *));
     820      if (!p->tokens) {
     821          PyMem_Free(p);
     822          return (Parser *) PyErr_NoMemory();
     823      }
     824      p->tokens[0] = PyMem_Calloc(1, sizeof(Token));
     825      if (!p->tokens[0]) {
     826          PyMem_Free(p->tokens);
     827          PyMem_Free(p);
     828          return (Parser *) PyErr_NoMemory();
     829      }
     830      if (!growable_comment_array_init(&p->type_ignore_comments, 10)) {
     831          PyMem_Free(p->tokens[0]);
     832          PyMem_Free(p->tokens);
     833          PyMem_Free(p);
     834          return (Parser *) PyErr_NoMemory();
     835      }
     836  
     837      p->mark = 0;
     838      p->fill = 0;
     839      p->size = 1;
     840  
     841      p->errcode = errcode;
     842      p->arena = arena;
     843      p->start_rule = start_rule;
     844      p->parsing_started = 0;
     845      p->normalize = NULL;
     846      p->error_indicator = 0;
     847  
     848      p->starting_lineno = 0;
     849      p->starting_col_offset = 0;
     850      p->flags = flags;
     851      p->feature_version = feature_version;
     852      p->known_err_token = NULL;
     853      p->level = 0;
     854      p->call_invalid_rules = 0;
     855      return p;
     856  }
     857  
     858  void
     859  _PyPegen_Parser_Free(Parser *p)
     860  {
     861      Py_XDECREF(p->normalize);
     862      for (int i = 0; i < p->size; i++) {
     863          PyMem_Free(p->tokens[i]);
     864      }
     865      PyMem_Free(p->tokens);
     866      growable_comment_array_deallocate(&p->type_ignore_comments);
     867      PyMem_Free(p);
     868  }
     869  
     870  static void
     871  reset_parser_state_for_error_pass(Parser *p)
     872  {
     873      for (int i = 0; i < p->fill; i++) {
     874          p->tokens[i]->memo = NULL;
     875      }
     876      p->mark = 0;
     877      p->call_invalid_rules = 1;
     878      // Don't try to get extra tokens in interactive mode when trying to
     879      // raise specialized errors in the second pass.
     880      p->tok->interactive_underflow = IUNDERFLOW_STOP;
     881  }
     882  
     883  static inline int
     884  _is_end_of_source(Parser *p) {
     885      int err = p->tok->done;
     886      return err == E_EOF || err == E_EOFS || err == E_EOLS;
     887  }
     888  
// Run the grammar over the token stream and return the parse result
// (an AST node returned as void* by the generated entry point), or
// NULL with an exception set on failure.
//
// On failure, the input is parsed a second time with the "invalid_*"
// grammar rules enabled so a precise SyntaxError can be produced.
void *
_PyPegen_run_parser(Parser *p)
{
    void *res = _PyPegen_parse(p);
    assert(p->level == 0);
    if (res == NULL) {
        // Incomplete input (e.g. in codeop/interactive use) gets a generic
        // message so the caller can decide to prompt for more input.
        if ((p->flags & PyPARSE_ALLOW_INCOMPLETE_INPUT) &&  _is_end_of_source(p)) {
            PyErr_Clear();
            return RAISE_SYNTAX_ERROR("incomplete input");
        }
        // A pending non-SyntaxError (e.g. MemoryError, KeyboardInterrupt)
        // must propagate unchanged; only syntax errors get the second pass.
        if (PyErr_Occurred() && !PyErr_ExceptionMatches(PyExc_SyntaxError)) {
            return NULL;
        }
        // Make a second parser pass. In this pass we activate heavier and slower checks
        // to produce better error messages and more complete diagnostics. Extra "invalid_*"
        // rules will be active during parsing.
        Token *last_token = p->tokens[p->fill - 1];
        reset_parser_state_for_error_pass(p);
        _PyPegen_parse(p);

        // Set SyntaxErrors accordingly depending on the parser/tokenizer status at the failure
        // point.
        _Pypegen_set_syntax_error(p, last_token);
       return NULL;
    }

    // "single" mode (the REPL) accepts exactly one statement.
    if (p->start_rule == Py_single_input && bad_single_statement(p)) {
        p->tok->done = E_BADSINGLE; // This is not necessary for now, but might be in the future
        return RAISE_SYNTAX_ERROR("multiple statements found while compiling a single statement");
    }

    // test_peg_generator defines _Py_TEST_PEGEN to not call PyAST_Validate()
#if defined(Py_DEBUG) && !defined(_Py_TEST_PEGEN)
    if (p->start_rule == Py_single_input ||
        p->start_rule == Py_file_input ||
        p->start_rule == Py_eval_input)
    {
        if (!_PyAST_Validate(res)) {
            return NULL;
        }
    }
#endif
    return res;
}
     933  
     934  mod_ty
     935  _PyPegen_run_parser_from_file_pointer(FILE *fp, int start_rule, PyObject *filename_ob,
     936                               const char *enc, const char *ps1, const char *ps2,
     937                               PyCompilerFlags *flags, int *errcode, PyArena *arena)
     938  {
     939      struct tok_state *tok = _PyTokenizer_FromFile(fp, enc, ps1, ps2);
     940      if (tok == NULL) {
     941          if (PyErr_Occurred()) {
     942              _PyPegen_raise_tokenizer_init_error(filename_ob);
     943              return NULL;
     944          }
     945          return NULL;
     946      }
     947      if (!tok->fp || ps1 != NULL || ps2 != NULL ||
     948          PyUnicode_CompareWithASCIIString(filename_ob, "<stdin>") == 0) {
     949          tok->fp_interactive = 1;
     950      }
     951      // This transfers the ownership to the tokenizer
     952      tok->filename = filename_ob;
     953      Py_INCREF(filename_ob);
     954  
     955      // From here on we need to clean up even if there's an error
     956      mod_ty result = NULL;
     957  
     958      int parser_flags = compute_parser_flags(flags);
     959      Parser *p = _PyPegen_Parser_New(tok, start_rule, parser_flags, PY_MINOR_VERSION,
     960                                      errcode, arena);
     961      if (p == NULL) {
     962          goto error;
     963      }
     964  
     965      result = _PyPegen_run_parser(p);
     966      _PyPegen_Parser_Free(p);
     967  
     968  error:
     969      _PyTokenizer_Free(tok);
     970      return result;
     971  }
     972  
     973  mod_ty
     974  _PyPegen_run_parser_from_string(const char *str, int start_rule, PyObject *filename_ob,
     975                         PyCompilerFlags *flags, PyArena *arena)
     976  {
     977      int exec_input = start_rule == Py_file_input;
     978  
     979      struct tok_state *tok;
     980      if (flags != NULL && flags->cf_flags & PyCF_IGNORE_COOKIE) {
     981          tok = _PyTokenizer_FromUTF8(str, exec_input);
     982      } else {
     983          tok = _PyTokenizer_FromString(str, exec_input);
     984      }
     985      if (tok == NULL) {
     986          if (PyErr_Occurred()) {
     987              _PyPegen_raise_tokenizer_init_error(filename_ob);
     988          }
     989          return NULL;
     990      }
     991      // This transfers the ownership to the tokenizer
     992      tok->filename = filename_ob;
     993      Py_INCREF(filename_ob);
     994  
     995      // We need to clear up from here on
     996      mod_ty result = NULL;
     997  
     998      int parser_flags = compute_parser_flags(flags);
     999      int feature_version = flags && (flags->cf_flags & PyCF_ONLY_AST) ?
    1000          flags->cf_feature_version : PY_MINOR_VERSION;
    1001      Parser *p = _PyPegen_Parser_New(tok, start_rule, parser_flags, feature_version,
    1002                                      NULL, arena);
    1003      if (p == NULL) {
    1004          goto error;
    1005      }
    1006  
    1007      result = _PyPegen_run_parser(p);
    1008      _PyPegen_Parser_Free(p);
    1009  
    1010  error:
    1011      _PyTokenizer_Free(tok);
    1012      return result;
    1013  }