1 #include <Python.h>
2 #include <errcode.h>
3
4 #include "tokenizer.h"
5 #include "pegen.h"
6
7 // TOKENIZER ERRORS
8
9 void
10 _PyPegen_raise_tokenizer_init_error(PyObject *filename)
11 {
12 if (!(PyErr_ExceptionMatches(PyExc_LookupError)
13 || PyErr_ExceptionMatches(PyExc_SyntaxError)
14 || PyErr_ExceptionMatches(PyExc_ValueError)
15 || PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))) {
16 return;
17 }
18 PyObject *errstr = NULL;
19 PyObject *tuple = NULL;
20 PyObject *type;
21 PyObject *value;
22 PyObject *tback;
23 PyErr_Fetch(&type, &value, &tback);
24 errstr = PyObject_Str(value);
25 if (!errstr) {
26 goto error;
27 }
28
29 PyObject *tmp = Py_BuildValue("(OiiO)", filename, 0, -1, Py_None);
30 if (!tmp) {
31 goto error;
32 }
33
34 tuple = PyTuple_Pack(2, errstr, tmp);
35 Py_DECREF(tmp);
36 if (!value) {
37 goto error;
38 }
39 PyErr_SetObject(PyExc_SyntaxError, tuple);
40
41 error:
42 Py_XDECREF(type);
43 Py_XDECREF(value);
44 Py_XDECREF(tback);
45 Py_XDECREF(errstr);
46 Py_XDECREF(tuple);
47 }
48
49 static inline void
50 raise_unclosed_parentheses_error(Parser *p) {
51 int error_lineno = p->tok->parenlinenostack[p->tok->level-1];
52 int error_col = p->tok->parencolstack[p->tok->level-1];
53 RAISE_ERROR_KNOWN_LOCATION(p, PyExc_SyntaxError,
54 error_lineno, error_col, error_lineno, -1,
55 "'%c' was never closed",
56 p->tok->parenstack[p->tok->level-1]);
57 }
58
int
_Pypegen_tokenizer_error(Parser *p)
{
    // Translate the tokenizer's `done` status code into an appropriate
    // Python exception. Always returns -1 so callers can `return` it
    // directly as an error indicator.
    if (PyErr_Occurred()) {
        // An exception is already set (e.g. by the tokenizer itself);
        // don't overwrite it.
        return -1;
    }

    const char *msg = NULL;
    PyObject* errtype = PyExc_SyntaxError;
    Py_ssize_t col_offset = -1;
    switch (p->tok->done) {
        case E_TOKEN:
            msg = "invalid token";
            break;
        case E_EOF:
            // EOF inside an open bracket gets the "never closed" error;
            // otherwise a generic unexpected-EOF SyntaxError.
            if (p->tok->level) {
                raise_unclosed_parentheses_error(p);
            } else {
                RAISE_SYNTAX_ERROR("unexpected EOF while parsing");
            }
            return -1;
        case E_DEDENT:
            RAISE_INDENTATION_ERROR("unindent does not match any outer indentation level");
            return -1;
        case E_INTR:
            if (!PyErr_Occurred()) {
                PyErr_SetNone(PyExc_KeyboardInterrupt);
            }
            return -1;
        case E_NOMEM:
            PyErr_NoMemory();
            return -1;
        case E_TABSPACE:
            errtype = PyExc_TabError;
            msg = "inconsistent use of tabs and spaces in indentation";
            break;
        case E_TOODEEP:
            errtype = PyExc_IndentationError;
            msg = "too many levels of indentation";
            break;
        case E_LINECONT: {
            // Point at the offending character, just before the cursor.
            col_offset = p->tok->cur - p->tok->buf - 1;
            msg = "unexpected character after line continuation character";
            break;
        }
        case E_COLUMNOVERFLOW:
            PyErr_SetString(PyExc_OverflowError,
                            "Parser column offset overflow - source line is too big");
            return -1;
        default:
            msg = "unknown parsing error";
    }

    // Cases that fall through here raise with a known location; a negative
    // col_offset means "unknown", reported as column 0.
    RAISE_ERROR_KNOWN_LOCATION(p, errtype, p->tok->lineno,
                               col_offset >= 0 ? col_offset : 0,
                               p->tok->lineno, -1, msg);
    return -1;
}
117
118 int
119 _Pypegen_raise_decode_error(Parser *p)
120 {
121 assert(PyErr_Occurred());
122 const char *errtype = NULL;
123 if (PyErr_ExceptionMatches(PyExc_UnicodeError)) {
124 errtype = "unicode error";
125 }
126 else if (PyErr_ExceptionMatches(PyExc_ValueError)) {
127 errtype = "value error";
128 }
129 if (errtype) {
130 PyObject *type;
131 PyObject *value;
132 PyObject *tback;
133 PyObject *errstr;
134 PyErr_Fetch(&type, &value, &tback);
135 errstr = PyObject_Str(value);
136 if (errstr) {
137 RAISE_SYNTAX_ERROR("(%s) %U", errtype, errstr);
138 Py_DECREF(errstr);
139 }
140 else {
141 PyErr_Clear();
142 RAISE_SYNTAX_ERROR("(%s) unknown error", errtype);
143 }
144 Py_XDECREF(type);
145 Py_XDECREF(value);
146 Py_XDECREF(tback);
147 }
148
149 return -1;
150 }
151
static int
_PyPegen_tokenize_full_source_to_check_for_errors(Parser *p) {
    // Tokenize the whole input to see if there are any tokenization
    // errors such as mismatching parentheses. These will get priority
    // over generic syntax errors only if the line number of the error is
    // before the one that we had for the generic error.

    // We don't want to tokenize to the end for interactive input
    if (p->tok->prompt != NULL) {
        return 0;
    }

    // Save any currently-set exception so the extra tokenization pass
    // cannot clobber it unless it finds a better (earlier) error.
    PyObject *type, *value, *traceback;
    PyErr_Fetch(&type, &value, &traceback);

    // Line of the error we already know about; a tokenizer error only
    // wins if it occurred strictly before this line.
    Token *current_token = p->known_err_token != NULL ? p->known_err_token : p->tokens[p->fill - 1];
    Py_ssize_t current_err_line = current_token->lineno;

    int ret = 0;

    for (;;) {
        const char *start;
        const char *end;
        switch (_PyTokenizer_Get(p->tok, &start, &end)) {
            case ERRORTOKEN:
                if (PyErr_Occurred()) {
                    ret = -1;
                    goto exit;
                }
                // An error token with open brackets: report the unclosed
                // bracket if it was opened before the known error line.
                if (p->tok->level != 0) {
                    int error_lineno = p->tok->parenlinenostack[p->tok->level-1];
                    if (current_err_line > error_lineno) {
                        raise_unclosed_parentheses_error(p);
                        ret = -1;
                        goto exit;
                    }
                }
                break;
            case ENDMARKER:
                break;
            default:
                continue;
        }
        break;
    }


exit:
    // If a new exception was raised during this pass, keep it and drop
    // the saved one; otherwise restore the original exception state
    // (PyErr_Restore steals the references).
    if (PyErr_Occurred()) {
        Py_XDECREF(value);
        Py_XDECREF(type);
        Py_XDECREF(traceback);
    } else {
        PyErr_Restore(type, value, traceback);
    }
    return ret;
}
209
210 // PARSER ERRORS
211
void *
_PyPegen_raise_error(Parser *p, PyObject *errtype, const char *errmsg, ...)
{
    // Raise `errtype` with a printf-style message, locating the error at
    // the known error token (or the last token read). Always returns NULL
    // so callers can `return` it directly.
    // Bail out if we already have an error set.
    if (p->error_indicator && PyErr_Occurred()) {
        return NULL;
    }
    if (p->fill == 0) {
        // No tokens were read at all: report position (0, 0).
        va_list va;
        va_start(va, errmsg);
        _PyPegen_raise_error_known_location(p, errtype, 0, 0, 0, -1, errmsg, va);
        va_end(va);
        return NULL;
    }

    Token *t = p->known_err_token != NULL ? p->known_err_token : p->tokens[p->fill - 1];
    Py_ssize_t col_offset;
    Py_ssize_t end_col_offset = -1;
    if (t->col_offset == -1) {
        // Token has no recorded column: derive one from the tokenizer's
        // current position within the line.
        if (p->tok->cur == p->tok->buf) {
            col_offset = 0;
        } else {
            // NOTE(review): condition tests `buf` but the chosen branch uses
            // `line_start` — presumably "if a buffer exists, measure from the
            // start of the current line"; confirm against tokenizer state.
            const char* start = p->tok->buf ? p->tok->line_start : p->tok->buf;
            col_offset = Py_SAFE_DOWNCAST(p->tok->cur - start, intptr_t, int);
        }
    } else {
        // Token columns are 0-based; SyntaxError columns are 1-based.
        col_offset = t->col_offset + 1;
    }

    if (t->end_col_offset != -1) {
        end_col_offset = t->end_col_offset + 1;
    }

    va_list va;
    va_start(va, errmsg);
    _PyPegen_raise_error_known_location(p, errtype, t->lineno, col_offset, t->end_lineno, end_col_offset, errmsg, va);
    va_end(va);

    return NULL;
}
252
static PyObject *
get_error_line_from_tokenizer_buffers(Parser *p, Py_ssize_t lineno)
{
    /* Return the source text of line `lineno` as a new unicode object,
     * decoded as UTF-8 with "replace" error handling.
     *
     * If the file descriptor is interactive, the source lines of the current
     * (multi-line) statement are stored in p->tok->interactive_src_start.
     * If not, we're parsing from a string, which means that the whole source
     * is stored in p->tok->str. */
    assert((p->tok->fp == NULL && p->tok->str != NULL) || p->tok->fp == stdin);

    char *cur_line = p->tok->fp_interactive ? p->tok->interactive_src_start : p->tok->str;
    if (cur_line == NULL) {
        assert(p->tok->fp_interactive);
        // We can reach this point if the tokenizer buffers for interactive source have not been
        // initialized because we failed to decode the original source with the given locale.
        return PyUnicode_FromStringAndSize("", 0);
    }

    // When parsing started at a non-zero line (e.g. inside an f-string or
    // eval with offsets), translate to a line number within the buffer.
    Py_ssize_t relative_lineno = p->starting_lineno ? lineno - p->starting_lineno + 1 : lineno;
    const char* buf_end = p->tok->fp_interactive ? p->tok->interactive_src_end : p->tok->inp;

    if (buf_end < cur_line) {
        // Defensive: buffer end pointer is stale/inconsistent; fall back to
        // treating cur_line as a NUL-terminated string.
        buf_end = cur_line + strlen(cur_line);
    }

    // Advance cur_line to the start of the requested line.
    for (int i = 0; i < relative_lineno - 1; i++) {
        char *new_line = strchr(cur_line, '\n');
        // The assert is here for debug builds but the conditional that
        // follows is there so in release builds we do not crash at the cost
        // to report a potentially wrong line.
        assert(new_line != NULL && new_line + 1 < buf_end);
        if (new_line == NULL || new_line + 1 > buf_end) {
            break;
        }
        cur_line = new_line + 1;
    }

    char *next_newline;
    if ((next_newline = strchr(cur_line, '\n')) == NULL) { // This is the last line
        next_newline = cur_line + strlen(cur_line);
    }
    return PyUnicode_DecodeUTF8(cur_line, next_newline - cur_line, "replace");
}
295
void *
_PyPegen_raise_error_known_location(Parser *p, PyObject *errtype,
                                    Py_ssize_t lineno, Py_ssize_t col_offset,
                                    Py_ssize_t end_lineno, Py_ssize_t end_col_offset,
                                    const char *errmsg, va_list va)
{
    // Raise `errtype` with a formatted message at an explicit source span.
    // Builds the SyntaxError-style argument tuple
    // (msg, (filename, lineno, col, text, end_lineno, end_col)).
    // Always returns NULL and sets p->error_indicator.
    PyObject *value = NULL;
    PyObject *errstr = NULL;
    PyObject *error_line = NULL;
    PyObject *tmp = NULL;
    p->error_indicator = 1;

    // CURRENT_POS sentinels mean "use the tokenizer's current position".
    if (end_lineno == CURRENT_POS) {
        end_lineno = p->tok->lineno;
    }
    if (end_col_offset == CURRENT_POS) {
        end_col_offset = p->tok->cur - p->tok->line_start;
    }

    if (p->start_rule == Py_fstring_input) {
        // Prefix f-string errors; `errmsg` is replaced with a heap copy
        // that must be freed on every exit path below.
        const char *fstring_msg = "f-string: ";
        Py_ssize_t len = strlen(fstring_msg) + strlen(errmsg);

        char *new_errmsg = PyMem_Malloc(len + 1); // Lengths of both strings plus NULL character
        if (!new_errmsg) {
            return (void *) PyErr_NoMemory();
        }

        // Copy both strings into new buffer
        memcpy(new_errmsg, fstring_msg, strlen(fstring_msg));
        memcpy(new_errmsg + strlen(fstring_msg), errmsg, strlen(errmsg));
        new_errmsg[len] = 0;
        errmsg = new_errmsg;
    }
    errstr = PyUnicode_FromFormatV(errmsg, va);
    if (!errstr) {
        goto error;
    }

    // Pick the best source of the offending line text.
    if (p->tok->fp_interactive && p->tok->interactive_src_start != NULL) {
        error_line = get_error_line_from_tokenizer_buffers(p, lineno);
    }
    else if (p->start_rule == Py_file_input) {
        error_line = _PyErr_ProgramDecodedTextObject(p->tok->filename,
                                                     (int) lineno, p->tok->encoding);
    }

    if (!error_line) {
        /* PyErr_ProgramTextObject was not called or returned NULL. If it was not called,
           then we need to find the error line from some other source, because
           p->start_rule != Py_file_input. If it returned NULL, then it either unexpectedly
           failed or we're parsing from a string or the REPL. There's a third edge case where
           we're actually parsing from a file, which has an E_EOF SyntaxError and in that case
           `PyErr_ProgramTextObject` fails because lineno points to last_file_line + 1, which
           does not physically exist */
        assert(p->tok->fp == NULL || p->tok->fp == stdin || p->tok->done == E_EOF);

        if (p->tok->lineno <= lineno && p->tok->inp > p->tok->buf) {
            // The tokenizer buffer still holds the relevant line(s).
            Py_ssize_t size = p->tok->inp - p->tok->buf;
            error_line = PyUnicode_DecodeUTF8(p->tok->buf, size, "replace");
        }
        else if (p->tok->fp == NULL || p->tok->fp == stdin) {
            error_line = get_error_line_from_tokenizer_buffers(p, lineno);
        }
        else {
            error_line = PyUnicode_FromStringAndSize("", 0);
        }
        if (!error_line) {
            goto error;
        }
    }

    if (p->start_rule == Py_fstring_input) {
        // Offsets inside an f-string are relative to the f-string start.
        col_offset -= p->starting_col_offset;
        end_col_offset -= p->starting_col_offset;
    }

    Py_ssize_t col_number = col_offset;
    Py_ssize_t end_col_number = end_col_offset;

    if (p->tok->encoding != NULL) {
        // Convert byte offsets to character offsets for non-UTF-8-trivial
        // encodings so the caret points at the right column.
        col_number = _PyPegen_byte_offset_to_character_offset(error_line, col_offset);
        if (col_number < 0) {
            goto error;
        }
        if (end_col_number > 0) {
            // NOTE(review): this local shadows the `end_col_offset` parameter;
            // intentional here but easy to misread.
            Py_ssize_t end_col_offset = _PyPegen_byte_offset_to_character_offset(error_line, end_col_number);
            if (end_col_offset < 0) {
                goto error;
            } else {
                end_col_number = end_col_offset;
            }
        }
    }
    // "N" steals the reference to error_line on success.
    tmp = Py_BuildValue("(OnnNnn)", p->tok->filename, lineno, col_number, error_line, end_lineno, end_col_number);
    if (!tmp) {
        goto error;
    }
    value = PyTuple_Pack(2, errstr, tmp);
    Py_DECREF(tmp);
    if (!value) {
        goto error;
    }
    PyErr_SetObject(errtype, value);

    Py_DECREF(errstr);
    Py_DECREF(value);
    if (p->start_rule == Py_fstring_input) {
        PyMem_Free((void *)errmsg);
    }
    return NULL;

error:
    Py_XDECREF(errstr);
    Py_XDECREF(error_line);
    if (p->start_rule == Py_fstring_input) {
        PyMem_Free((void *)errmsg);
    }
    return NULL;
}
416
void
_Pypegen_set_syntax_error(Parser* p, Token* last_token) {
    // Set the final syntax error after parsing failed, choosing the most
    // specific diagnostic available. `last_token` is the last token the
    // first parsing pass consumed.
    // Existing syntax error
    if (PyErr_Occurred()) {
        // Prioritize tokenizer errors to custom syntax errors raised
        // on the second phase only if the errors come from the parser.
        int is_tok_ok = (p->tok->done == E_DONE || p->tok->done == E_OK);
        if (is_tok_ok && PyErr_ExceptionMatches(PyExc_SyntaxError)) {
            _PyPegen_tokenize_full_source_to_check_for_errors(p);
        }
        // Propagate the existing syntax error.
        return;
    }
    // Initialization error
    if (p->fill == 0) {
        RAISE_SYNTAX_ERROR("error at start before reading any input");
    }
    // Parser encountered EOF (End of File) unexpectedly
    if (last_token->type == ERRORTOKEN && p->tok->done == E_EOF) {
        if (p->tok->level) {
            raise_unclosed_parentheses_error(p);
        } else {
            RAISE_SYNTAX_ERROR("unexpected EOF while parsing");
        }
        return;
    }
    // Indentation error in the tokenizer
    if (last_token->type == INDENT || last_token->type == DEDENT) {
        RAISE_INDENTATION_ERROR(last_token->type == INDENT ? "unexpected indent" : "unexpected unindent");
        return;
    }
    // Unknown error (generic case)

    // Use the last token we found on the first pass to avoid reporting
    // incorrect locations for generic syntax errors just because we reached
    // further away when trying to find specific syntax errors in the second
    // pass.
    RAISE_SYNTAX_ERROR_KNOWN_LOCATION(last_token, "invalid syntax");
    // _PyPegen_tokenize_full_source_to_check_for_errors will override the existing
    // generic SyntaxError we just raised if errors are found.
    _PyPegen_tokenize_full_source_to_check_for_errors(p);
}