1 #include <Python.h>
2 #include <errcode.h>
3
4 #include "tokenizer.h"
5 #include "pegen.h"
6
7 // TOKENIZER ERRORS
8
9 void
10 _PyPegen_raise_tokenizer_init_error(PyObject *filename)
11 {
12 if (!(PyErr_ExceptionMatches(PyExc_LookupError)
13 || PyErr_ExceptionMatches(PyExc_SyntaxError)
14 || PyErr_ExceptionMatches(PyExc_ValueError)
15 || PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))) {
16 return;
17 }
18 PyObject *errstr = NULL;
19 PyObject *tuple = NULL;
20 PyObject *type;
21 PyObject *value;
22 PyObject *tback;
23 PyErr_Fetch(&type, &value, &tback);
24 errstr = PyObject_Str(value);
25 if (!errstr) {
26 goto error;
27 }
28
29 PyObject *tmp = Py_BuildValue("(OiiO)", filename, 0, -1, Py_None);
30 if (!tmp) {
31 goto error;
32 }
33
34 tuple = PyTuple_Pack(2, errstr, tmp);
35 Py_DECREF(tmp);
36 if (!value) {
37 goto error;
38 }
39 PyErr_SetObject(PyExc_SyntaxError, tuple);
40
41 error:
42 Py_XDECREF(type);
43 Py_XDECREF(value);
44 Py_XDECREF(tback);
45 Py_XDECREF(errstr);
46 Py_XDECREF(tuple);
47 }
48
49 static inline void
50 raise_unclosed_parentheses_error(Parser *p) {
51 int error_lineno = p->tok->parenlinenostack[p->tok->level-1];
52 int error_col = p->tok->parencolstack[p->tok->level-1];
53 RAISE_ERROR_KNOWN_LOCATION(p, PyExc_SyntaxError,
54 error_lineno, error_col, error_lineno, -1,
55 "'%c' was never closed",
56 p->tok->parenstack[p->tok->level-1]);
57 }
58
// Translate the tokenizer's `done` status code into a Python exception.
// Always returns -1 so callers can write `return _Pypegen_tokenizer_error(p);`.
int
_Pypegen_tokenizer_error(Parser *p)
{
    // An exception already set by the tokenizer takes precedence.
    if (PyErr_Occurred()) {
        return -1;
    }

    const char *msg = NULL;
    PyObject* errtype = PyExc_SyntaxError;
    Py_ssize_t col_offset = -1;
    switch (p->tok->done) {
        case E_TOKEN:
            msg = "invalid token";
            break;
        case E_EOF:
            // EOF while brackets are still open gets the more specific
            // "'%c' was never closed" error at the opening bracket.
            if (p->tok->level) {
                raise_unclosed_parentheses_error(p);
            } else {
                RAISE_SYNTAX_ERROR("unexpected EOF while parsing");
            }
            return -1;
        case E_DEDENT:
            RAISE_INDENTATION_ERROR("unindent does not match any outer indentation level");
            return -1;
        case E_INTR:
            // NOTE(review): PyErr_Occurred() was already checked above and
            // nothing in between sets an exception, so this guard looks
            // redundant — kept for safety.
            if (!PyErr_Occurred()) {
                PyErr_SetNone(PyExc_KeyboardInterrupt);
            }
            return -1;
        case E_NOMEM:
            PyErr_NoMemory();
            return -1;
        case E_TABSPACE:
            errtype = PyExc_TabError;
            msg = "inconsistent use of tabs and spaces in indentation";
            break;
        case E_TOODEEP:
            errtype = PyExc_IndentationError;
            msg = "too many levels of indentation";
            break;
        case E_LINECONT: {
            // Byte offset of the offending character, measured from the
            // tokenizer buffer start (assumes cur/buf point into the same
            // line buffer — TODO confirm against tokenizer internals).
            col_offset = p->tok->cur - p->tok->buf - 1;
            msg = "unexpected character after line continuation character";
            break;
        }
        default:
            msg = "unknown parsing error";
    }

    // Fall-through cases report a SyntaxError-family error at the current
    // tokenizer line; -1 end column means "to end of line".
    RAISE_ERROR_KNOWN_LOCATION(p, errtype, p->tok->lineno,
                               col_offset >= 0 ? col_offset : 0,
                               p->tok->lineno, -1, msg);
    return -1;
}
113
114 int
115 _Pypegen_raise_decode_error(Parser *p)
116 {
117 assert(PyErr_Occurred());
118 const char *errtype = NULL;
119 if (PyErr_ExceptionMatches(PyExc_UnicodeError)) {
120 errtype = "unicode error";
121 }
122 else if (PyErr_ExceptionMatches(PyExc_ValueError)) {
123 errtype = "value error";
124 }
125 if (errtype) {
126 PyObject *type;
127 PyObject *value;
128 PyObject *tback;
129 PyObject *errstr;
130 PyErr_Fetch(&type, &value, &tback);
131 errstr = PyObject_Str(value);
132 if (errstr) {
133 RAISE_SYNTAX_ERROR("(%s) %U", errtype, errstr);
134 Py_DECREF(errstr);
135 }
136 else {
137 PyErr_Clear();
138 RAISE_SYNTAX_ERROR("(%s) unknown error", errtype);
139 }
140 Py_XDECREF(type);
141 Py_XDECREF(value);
142 Py_XDECREF(tback);
143 }
144
145 return -1;
146 }
147
// Tokenize the rest of the input to check whether a tokenization error
// (such as mismatching parentheses) should replace an already-raised
// generic syntax error. Returns 0 on success/no better error, -1 when an
// error is left set for the caller.
static int
_PyPegen_tokenize_full_source_to_check_for_errors(Parser *p) {
    // Tokenize the whole input to see if there are any tokenization
    // errors such as mismatching parentheses. These will get priority
    // over generic syntax errors only if the line number of the error is
    // before the one that we had for the generic error.

    // We don't want to tokenize to the end for interactive input
    if (p->tok->prompt != NULL) {
        return 0;
    }

    // Stash the current (generic) exception so tokenizing cannot clobber
    // it; it is either restored or discarded on exit.
    PyObject *type, *value, *traceback;
    PyErr_Fetch(&type, &value, &traceback);

    Token *current_token = p->known_err_token != NULL ? p->known_err_token : p->tokens[p->fill - 1];
    Py_ssize_t current_err_line = current_token->lineno;

    int ret = 0;
    struct token new_token;
    _PyToken_Init(&new_token);

    // Loop ends on ERRORTOKEN or ENDMARKER; any other token continues.
    for (;;) {
        switch (_PyTokenizer_Get(p->tok, &new_token)) {
            case ERRORTOKEN:
                if (PyErr_Occurred()) {
                    ret = -1;
                    goto exit;
                }
                // Unclosed bracket: report it only if it starts earlier
                // than the generic error we already have.
                if (p->tok->level != 0) {
                    int error_lineno = p->tok->parenlinenostack[p->tok->level-1];
                    if (current_err_line > error_lineno) {
                        raise_unclosed_parentheses_error(p);
                        ret = -1;
                        goto exit;
                    }
                }
                break;
            case ENDMARKER:
                break;
            default:
                continue;
        }
        break;
    }


exit:
    _PyToken_Free(&new_token);
    // If we're in an f-string, we want the syntax error in the expression part
    // to propagate, so that tokenizer errors (like expecting '}') that happen afterwards
    // do not swallow it.
    if (PyErr_Occurred() && p->tok->tok_mode_stack_index <= 0) {
        // A better error was raised: drop the stashed generic one.
        Py_XDECREF(value);
        Py_XDECREF(type);
        Py_XDECREF(traceback);
    } else {
        // No better error found (or inside an f-string): restore the
        // original exception.
        PyErr_Restore(type, value, traceback);
    }
    return ret;
}
209
210 // PARSER ERRORS
211
// Varargs front-end for _PyPegen_raise_error_known_location: derives the
// error location from the parser's token state, then delegates. Always
// returns NULL so it can be used directly in rule actions.
void *
_PyPegen_raise_error(Parser *p, PyObject *errtype, int use_mark, const char *errmsg, ...)
{
    // No token was ever read: report at the very start of the input.
    if (p->fill == 0) {
        va_list va;
        va_start(va, errmsg);
        _PyPegen_raise_error_known_location(p, errtype, 0, 0, 0, -1, errmsg, va);
        va_end(va);
        return NULL;
    }
    // `use_mark` means "report at the parser's mark"; make sure that token
    // actually exists before indexing it.
    if (use_mark && p->mark == p->fill && _PyPegen_fill_token(p) < 0) {
        p->error_indicator = 1;
        return NULL;
    }
    // Prefer a previously pinned error token over the current position.
    Token *t = p->known_err_token != NULL
                   ? p->known_err_token
                   : p->tokens[use_mark ? p->mark : p->fill - 1];
    Py_ssize_t col_offset;
    Py_ssize_t end_col_offset = -1;
    // A token without a column: fall back to the tokenizer's cursor.
    if (t->col_offset == -1) {
        if (p->tok->cur == p->tok->buf) {
            col_offset = 0;
        } else {
            // NOTE(review): the condition tests `buf` but chooses between
            // `line_start` and `buf` — looks like it was meant to test
            // `line_start`; confirm before changing.
            const char* start = p->tok->buf ? p->tok->line_start : p->tok->buf;
            col_offset = Py_SAFE_DOWNCAST(p->tok->cur - start, intptr_t, int);
        }
    } else {
        // Token columns are 0-based; SyntaxError columns are 1-based.
        col_offset = t->col_offset + 1;
    }

    if (t->end_col_offset != -1) {
        end_col_offset = t->end_col_offset + 1;
    }

    va_list va;
    va_start(va, errmsg);
    _PyPegen_raise_error_known_location(p, errtype, t->lineno, col_offset, t->end_lineno, end_col_offset, errmsg, va);
    va_end(va);

    return NULL;
}
253
// Return source line `lineno` (1-based; made relative to p->starting_lineno
// when that is set) from the tokenizer's in-memory buffers, decoded as
// UTF-8 with "replace". Returns a new reference, or NULL on decode failure.
static PyObject *
get_error_line_from_tokenizer_buffers(Parser *p, Py_ssize_t lineno)
{
    /* If the file descriptor is interactive, the source lines of the current
     * (multi-line) statement are stored in p->tok->interactive_src_start.
     * If not, we're parsing from a string, which means that the whole source
     * is stored in p->tok->str. */
    assert((p->tok->fp == NULL && p->tok->str != NULL) || p->tok->fp != NULL);

    char *cur_line = p->tok->fp_interactive ? p->tok->interactive_src_start : p->tok->str;
    if (cur_line == NULL) {
        assert(p->tok->fp_interactive);
        // We can reach this point if the tokenizer buffers for interactive source have not been
        // initialized because we failed to decode the original source with the given locale.
        return PyUnicode_FromStringAndSize("", 0);
    }

    Py_ssize_t relative_lineno = p->starting_lineno ? lineno - p->starting_lineno + 1 : lineno;
    const char* buf_end = p->tok->fp_interactive ? p->tok->interactive_src_end : p->tok->inp;

    // Advance cur_line past relative_lineno - 1 newlines.
    for (int i = 0; i < relative_lineno - 1; i++) {
        char *new_line = strchr(cur_line, '\n');
        // The assert is here for debug builds but the conditional that
        // follows is there so in release builds we do not crash at the cost
        // to report a potentially wrong line.
        // NOTE(review): the assert requires `new_line + 1 < buf_end` while
        // the runtime guard only rejects `> buf_end` — the `==` case trips
        // the assert in debug builds but proceeds in release builds.
        assert(new_line != NULL && new_line + 1 < buf_end);
        if (new_line == NULL || new_line + 1 > buf_end) {
            break;
        }
        cur_line = new_line + 1;
    }

    char *next_newline;
    if ((next_newline = strchr(cur_line, '\n')) == NULL) { // This is the last line
        next_newline = cur_line + strlen(cur_line);
    }
    return PyUnicode_DecodeUTF8(cur_line, next_newline - cur_line, "replace");
}
292
// Raise `errtype` as a located syntax-style error: formats `errmsg` with
// `va`, finds the offending source line, converts byte columns to character
// columns, and sets the exception via PyErr_SetObject. Sets
// p->error_indicator and always returns NULL.
void *
_PyPegen_raise_error_known_location(Parser *p, PyObject *errtype,
                                    Py_ssize_t lineno, Py_ssize_t col_offset,
                                    Py_ssize_t end_lineno, Py_ssize_t end_col_offset,
                                    const char *errmsg, va_list va)
{
    PyObject *value = NULL;
    PyObject *errstr = NULL;
    PyObject *error_line = NULL;
    PyObject *tmp = NULL;
    p->error_indicator = 1;

    // CURRENT_POS sentinels mean "use the tokenizer's current position".
    if (end_lineno == CURRENT_POS) {
        end_lineno = p->tok->lineno;
    }
    if (end_col_offset == CURRENT_POS) {
        end_col_offset = p->tok->cur - p->tok->line_start;
    }

    // Inside an f-string, prefix the message with "f-string: ". The new
    // buffer is heap-allocated and must be freed on every exit path below.
    if (p->start_rule == Py_fstring_input) {
        const char *fstring_msg = "f-string: ";
        Py_ssize_t len = strlen(fstring_msg) + strlen(errmsg);

        char *new_errmsg = PyMem_Malloc(len + 1); // Lengths of both strings plus NULL character
        if (!new_errmsg) {
            return (void *) PyErr_NoMemory();
        }

        // Copy both strings into new buffer
        memcpy(new_errmsg, fstring_msg, strlen(fstring_msg));
        memcpy(new_errmsg + strlen(fstring_msg), errmsg, strlen(errmsg));
        new_errmsg[len] = 0;
        errmsg = new_errmsg;
    }
    errstr = PyUnicode_FromFormatV(errmsg, va);
    if (!errstr) {
        goto error;
    }

    // Locate the text of the offending line, trying the most specific
    // source first: interactive buffers, then the on-disk file.
    if (p->tok->fp_interactive && p->tok->interactive_src_start != NULL) {
        error_line = get_error_line_from_tokenizer_buffers(p, lineno);
    }
    else if (p->start_rule == Py_file_input) {
        error_line = _PyErr_ProgramDecodedTextObject(p->tok->filename,
                                                     (int) lineno, p->tok->encoding);
    }

    if (!error_line) {
        /* PyErr_ProgramTextObject was not called or returned NULL. If it was not called,
           then we need to find the error line from some other source, because
           p->start_rule != Py_file_input. If it returned NULL, then it either unexpectedly
           failed or we're parsing from a string or the REPL. There's a third edge case where
           we're actually parsing from a file, which has an E_EOF SyntaxError and in that case
           `PyErr_ProgramTextObject` fails because lineno points to last_file_line + 1, which
           does not physically exist */
        assert(p->tok->fp == NULL || p->tok->fp == stdin || p->tok->done == E_EOF);

        if (p->tok->lineno <= lineno && p->tok->inp > p->tok->buf) {
            Py_ssize_t size = p->tok->inp - p->tok->buf;
            error_line = PyUnicode_DecodeUTF8(p->tok->buf, size, "replace");
        }
        else if (p->tok->fp == NULL || p->tok->fp == stdin) {
            error_line = get_error_line_from_tokenizer_buffers(p, lineno);
        }
        else {
            error_line = PyUnicode_FromStringAndSize("", 0);
        }
        if (!error_line) {
            goto error;
        }
    }

    // f-string expressions are parsed with an artificial column shift;
    // undo it so the caret lands on the user's source.
    if (p->start_rule == Py_fstring_input) {
        col_offset -= p->starting_col_offset;
        end_col_offset -= p->starting_col_offset;
    }

    Py_ssize_t col_number = col_offset;
    Py_ssize_t end_col_number = end_col_offset;

    // With a known encoding, translate byte offsets into character offsets
    // so the caret is correct for multi-byte source text.
    if (p->tok->encoding != NULL) {
        col_number = _PyPegen_byte_offset_to_character_offset(error_line, col_offset);
        if (col_number < 0) {
            goto error;
        }
        if (end_col_number > 0) {
            // NOTE(review): this inner `end_col_offset` shadows the
            // parameter of the same name; it is only the conversion result.
            Py_ssize_t end_col_offset = _PyPegen_byte_offset_to_character_offset(error_line, end_col_number);
            if (end_col_offset < 0) {
                goto error;
            } else {
                end_col_number = end_col_offset;
            }
        }
    }
    // "N" transfers ownership of error_line into the tuple.
    tmp = Py_BuildValue("(OnnNnn)", p->tok->filename, lineno, col_number, error_line, end_lineno, end_col_number);
    if (!tmp) {
        goto error;
    }
    value = PyTuple_Pack(2, errstr, tmp);
    Py_DECREF(tmp);
    if (!value) {
        goto error;
    }
    PyErr_SetObject(errtype, value);

    Py_DECREF(errstr);
    Py_DECREF(value);
    if (p->start_rule == Py_fstring_input) {
        PyMem_Free((void *)errmsg);
    }
    return NULL;

error:
    Py_XDECREF(errstr);
    Py_XDECREF(error_line);
    if (p->start_rule == Py_fstring_input) {
        PyMem_Free((void *)errmsg);
    }
    return NULL;
}
413
// Ensure a syntax error is set after a failed parse, choosing the most
// specific error available (existing exception, tokenizer state, the last
// token, or a generic "invalid syntax").
void
_Pypegen_set_syntax_error(Parser* p, Token* last_token) {
    // Existing syntax error
    if (PyErr_Occurred()) {
        // Prioritize tokenizer errors to custom syntax errors raised
        // on the second phase only if the errors come from the parser.
        int is_tok_ok = (p->tok->done == E_DONE || p->tok->done == E_OK);
        if (is_tok_ok && PyErr_ExceptionMatches(PyExc_SyntaxError)) {
            _PyPegen_tokenize_full_source_to_check_for_errors(p);
        }
        // Propagate the existing syntax error.
        return;
    }
    // Initialization error
    if (p->fill == 0) {
        RAISE_SYNTAX_ERROR("error at start before reading any input");
    }
    // Parser encountered EOF (End of File) unexpectedly
    if (last_token->type == ERRORTOKEN && p->tok->done == E_EOF) {
        if (p->tok->level) {
            raise_unclosed_parentheses_error(p);
        } else {
            RAISE_SYNTAX_ERROR("unexpected EOF while parsing");
        }
        return;
    }
    // Indentation error in the tokenizer
    if (last_token->type == INDENT || last_token->type == DEDENT) {
        RAISE_INDENTATION_ERROR(last_token->type == INDENT ? "unexpected indent" : "unexpected unindent");
        return;
    }
    // Unknown error (generic case)

    // Use the last token we found on the first pass to avoid reporting
    // incorrect locations for generic syntax errors just because we reached
    // further away when trying to find specific syntax errors in the second
    // pass.
    RAISE_SYNTAX_ERROR_KNOWN_LOCATION(last_token, "invalid syntax");
    // _PyPegen_tokenize_full_source_to_check_for_errors will override the existing
    // generic SyntaxError we just raised if errors are found.
    _PyPegen_tokenize_full_source_to_check_for_errors(p);
}
456
457 void
458 _Pypegen_stack_overflow(Parser *p)
459 {
460 p->error_indicator = 1;
461 PyErr_SetString(PyExc_MemoryError,
462 "Parser stack overflowed - Python source too complex to parse");
463 }