1 #include <Python.h>
2 #include <errcode.h>
3
4 #include "tokenizer.h"
5 #include "pegen.h"
6
7 // TOKENIZER ERRORS
8
9 void
10 _PyPegen_raise_tokenizer_init_error(PyObject *filename)
11 {
12 if (!(PyErr_ExceptionMatches(PyExc_LookupError)
13 || PyErr_ExceptionMatches(PyExc_SyntaxError)
14 || PyErr_ExceptionMatches(PyExc_ValueError)
15 || PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))) {
16 return;
17 }
18 PyObject *errstr = NULL;
19 PyObject *tuple = NULL;
20 PyObject *type;
21 PyObject *value;
22 PyObject *tback;
23 PyErr_Fetch(&type, &value, &tback);
24 errstr = PyObject_Str(value);
25 if (!errstr) {
26 goto error;
27 }
28
29 PyObject *tmp = Py_BuildValue("(OiiO)", filename, 0, -1, Py_None);
30 if (!tmp) {
31 goto error;
32 }
33
34 tuple = PyTuple_Pack(2, errstr, tmp);
35 Py_DECREF(tmp);
36 if (!value) {
37 goto error;
38 }
39 PyErr_SetObject(PyExc_SyntaxError, tuple);
40
41 error:
42 Py_XDECREF(type);
43 Py_XDECREF(value);
44 Py_XDECREF(tback);
45 Py_XDECREF(errstr);
46 Py_XDECREF(tuple);
47 }
48
49 static inline void
50 raise_unclosed_parentheses_error(Parser *p) {
51 int error_lineno = p->tok->parenlinenostack[p->tok->level-1];
52 int error_col = p->tok->parencolstack[p->tok->level-1];
53 RAISE_ERROR_KNOWN_LOCATION(p, PyExc_SyntaxError,
54 error_lineno, error_col, error_lineno, -1,
55 "'%c' was never closed",
56 p->tok->parenstack[p->tok->level-1]);
57 }
58
int
_Pypegen_tokenizer_error(Parser *p)
{
    // Translate the tokenizer's `done` status code into an appropriate
    // Python exception. Always returns -1 so callers can `return` it
    // directly as an error indicator.
    if (PyErr_Occurred()) {
        // An exception is already set (e.g. by the tokenizer itself);
        // don't overwrite it.
        return -1;
    }

    const char *msg = NULL;
    PyObject* errtype = PyExc_SyntaxError;
    Py_ssize_t col_offset = -1;
    switch (p->tok->done) {
        case E_TOKEN:
            msg = "invalid token";
            break;
        case E_EOF:
            // EOF inside an open bracket gets the "never closed" error;
            // otherwise a generic unexpected-EOF SyntaxError.
            if (p->tok->level) {
                raise_unclosed_parentheses_error(p);
            } else {
                RAISE_SYNTAX_ERROR("unexpected EOF while parsing");
            }
            return -1;
        case E_DEDENT:
            RAISE_INDENTATION_ERROR("unindent does not match any outer indentation level");
            return -1;
        case E_INTR:
            if (!PyErr_Occurred()) {
                PyErr_SetNone(PyExc_KeyboardInterrupt);
            }
            return -1;
        case E_NOMEM:
            PyErr_NoMemory();
            return -1;
        case E_TABSPACE:
            errtype = PyExc_TabError;
            msg = "inconsistent use of tabs and spaces in indentation";
            break;
        case E_TOODEEP:
            errtype = PyExc_IndentationError;
            msg = "too many levels of indentation";
            break;
        case E_LINECONT: {
            // Point at the offending character, just before the cursor.
            col_offset = p->tok->cur - p->tok->buf - 1;
            msg = "unexpected character after line continuation character";
            break;
        }
        case E_COLUMNOVERFLOW:
            PyErr_SetString(PyExc_OverflowError,
                            "Parser column offset overflow - source line is too big");
            return -1;
        default:
            msg = "unknown parsing error";
    }

    // Cases that fall through here raise with a known location; a negative
    // col_offset means "unknown", reported as column 0.
    RAISE_ERROR_KNOWN_LOCATION(p, errtype, p->tok->lineno,
                               col_offset >= 0 ? col_offset : 0,
                               p->tok->lineno, -1, msg);
    return -1;
}
117
118 int
119 _Pypegen_raise_decode_error(Parser *p)
120 {
121 assert(PyErr_Occurred());
122 const char *errtype = NULL;
123 if (PyErr_ExceptionMatches(PyExc_UnicodeError)) {
124 errtype = "unicode error";
125 }
126 else if (PyErr_ExceptionMatches(PyExc_ValueError)) {
127 errtype = "value error";
128 }
129 if (errtype) {
130 PyObject *type;
131 PyObject *value;
132 PyObject *tback;
133 PyObject *errstr;
134 PyErr_Fetch(&type, &value, &tback);
135 errstr = PyObject_Str(value);
136 if (errstr) {
137 RAISE_SYNTAX_ERROR("(%s) %U", errtype, errstr);
138 Py_DECREF(errstr);
139 }
140 else {
141 PyErr_Clear();
142 RAISE_SYNTAX_ERROR("(%s) unknown error", errtype);
143 }
144 Py_XDECREF(type);
145 Py_XDECREF(value);
146 Py_XDECREF(tback);
147 }
148
149 return -1;
150 }
151
static int
_PyPegen_tokenize_full_source_to_check_for_errors(Parser *p) {
    // Tokenize the whole input to see if there are any tokenization
    // errors such as mismatching parentheses. These will get priority
    // over generic syntax errors only if the line number of the error is
    // before the one that we had for the generic error.

    // We don't want to tokenize to the end for interactive input
    if (p->tok->prompt != NULL) {
        return 0;
    }

    // Save any currently-set exception so the extra tokenization pass
    // cannot clobber it unless it finds a better (earlier) error.
    PyObject *type, *value, *traceback;
    PyErr_Fetch(&type, &value, &traceback);

    // Line of the error we already know about; a tokenizer error only
    // wins if it occurred strictly before this line.
    Token *current_token = p->known_err_token != NULL ? p->known_err_token : p->tokens[p->fill - 1];
    Py_ssize_t current_err_line = current_token->lineno;

    int ret = 0;

    for (;;) {
        const char *start;
        const char *end;
        switch (_PyTokenizer_Get(p->tok, &start, &end)) {
            case ERRORTOKEN:
                if (PyErr_Occurred()) {
                    ret = -1;
                    goto exit;
                }
                // An error token with open brackets: report the unclosed
                // bracket if it was opened before the known error line.
                if (p->tok->level != 0) {
                    int error_lineno = p->tok->parenlinenostack[p->tok->level-1];
                    if (current_err_line > error_lineno) {
                        raise_unclosed_parentheses_error(p);
                        ret = -1;
                        goto exit;
                    }
                }
                break;
            case ENDMARKER:
                break;
            default:
                continue;
        }
        break;
    }


exit:
    // If a new exception was raised during this pass, keep it and drop
    // the saved one; otherwise restore the original exception state
    // (PyErr_Restore steals the references).
    if (PyErr_Occurred()) {
        Py_XDECREF(value);
        Py_XDECREF(type);
        Py_XDECREF(traceback);
    } else {
        PyErr_Restore(type, value, traceback);
    }
    return ret;
}
209
210 // PARSER ERRORS
211
void *
_PyPegen_raise_error(Parser *p, PyObject *errtype, const char *errmsg, ...)
{
    // Raise `errtype` with a printf-style message, locating the error at
    // the known error token (or the last token read). Always returns NULL
    // so callers can `return` it directly.
    // Bail out if we already have an error set.
    if (p->error_indicator && PyErr_Occurred()) {
        return NULL;
    }
    if (p->fill == 0) {
        // No tokens were read at all: report position (0, 0).
        va_list va;
        va_start(va, errmsg);
        _PyPegen_raise_error_known_location(p, errtype, 0, 0, 0, -1, errmsg, va);
        va_end(va);
        return NULL;
    }

    Token *t = p->known_err_token != NULL ? p->known_err_token : p->tokens[p->fill - 1];
    Py_ssize_t col_offset;
    Py_ssize_t end_col_offset = -1;
    if (t->col_offset == -1) {
        // Token has no recorded column: derive one from the tokenizer's
        // current position within the line.
        if (p->tok->cur == p->tok->buf) {
            col_offset = 0;
        } else {
            // NOTE(review): condition tests `buf` but the chosen branch uses
            // `line_start` — presumably "if a buffer exists, measure from the
            // start of the current line"; confirm against tokenizer state.
            const char* start = p->tok->buf ? p->tok->line_start : p->tok->buf;
            col_offset = Py_SAFE_DOWNCAST(p->tok->cur - start, intptr_t, int);
        }
    } else {
        // Token columns are 0-based; SyntaxError columns are 1-based.
        col_offset = t->col_offset + 1;
    }

    if (t->end_col_offset != -1) {
        end_col_offset = t->end_col_offset + 1;
    }

    va_list va;
    va_start(va, errmsg);
    _PyPegen_raise_error_known_location(p, errtype, t->lineno, col_offset, t->end_lineno, end_col_offset, errmsg, va);
    va_end(va);

    return NULL;
}
252
static PyObject *
get_error_line_from_tokenizer_buffers(Parser *p, Py_ssize_t lineno)
{
    /* Return the source text of line `lineno` as a new unicode object,
     * decoded as UTF-8 with "replace" error handling.
     *
     * If the file descriptor is interactive, the source lines of the current
     * (multi-line) statement are stored in p->tok->interactive_src_start.
     * If not, we're parsing from a string, which means that the whole source
     * is stored in p->tok->str. */
    assert((p->tok->fp == NULL && p->tok->str != NULL) || p->tok->fp == stdin);

    char *cur_line = p->tok->fp_interactive ? p->tok->interactive_src_start : p->tok->str;
    if (cur_line == NULL) {
        assert(p->tok->fp_interactive);
        // We can reach this point if the tokenizer buffers for interactive source have not been
        // initialized because we failed to decode the original source with the given locale.
        return PyUnicode_FromStringAndSize("", 0);
    }

    // When parsing started at a non-zero line (e.g. inside an f-string or
    // eval with offsets), translate to a line number within the buffer.
    Py_ssize_t relative_lineno = p->starting_lineno ? lineno - p->starting_lineno + 1 : lineno;
    const char* buf_end = p->tok->fp_interactive ? p->tok->interactive_src_end : p->tok->inp;

    if (buf_end < cur_line) {
        // Defensive: buffer end pointer is stale/inconsistent; fall back to
        // treating cur_line as a NUL-terminated string.
        buf_end = cur_line + strlen(cur_line);
    }

    // Advance cur_line to the start of the requested line.
    for (int i = 0; i < relative_lineno - 1; i++) {
        char *new_line = strchr(cur_line, '\n');
        // The assert is here for debug builds but the conditional that
        // follows is there so in release builds we do not crash at the cost
        // to report a potentially wrong line.
        assert(new_line != NULL && new_line + 1 < buf_end);
        if (new_line == NULL || new_line + 1 > buf_end) {
            break;
        }
        cur_line = new_line + 1;
    }

    char *next_newline;
    if ((next_newline = strchr(cur_line, '\n')) == NULL) { // This is the last line
        next_newline = cur_line + strlen(cur_line);
    }
    return PyUnicode_DecodeUTF8(cur_line, next_newline - cur_line, "replace");
}
295
void *
_PyPegen_raise_error_known_location(Parser *p, PyObject *errtype,
                                    Py_ssize_t lineno, Py_ssize_t col_offset,
                                    Py_ssize_t end_lineno, Py_ssize_t end_col_offset,
                                    const char *errmsg, va_list va)
{
    // Raise `errtype` with a formatted message at an explicit source span.
    // Builds the SyntaxError-style argument tuple
    // (msg, (filename, lineno, col, text, end_lineno, end_col)).
    // Always returns NULL and sets p->error_indicator.
    PyObject *value = NULL;
    PyObject *errstr = NULL;
    PyObject *error_line = NULL;
    PyObject *tmp = NULL;
    p->error_indicator = 1;

    // CURRENT_POS sentinels mean "use the tokenizer's current position".
    if (end_lineno == CURRENT_POS) {
        end_lineno = p->tok->lineno;
    }
    if (end_col_offset == CURRENT_POS) {
        end_col_offset = p->tok->cur - p->tok->line_start;
    }

    if (p->start_rule == Py_fstring_input) {
        // Prefix f-string errors; `errmsg` is replaced with a heap copy
        // that must be freed on every exit path below.
        const char *fstring_msg = "f-string: ";
        Py_ssize_t len = strlen(fstring_msg) + strlen(errmsg);

        char *new_errmsg = PyMem_Malloc(len + 1); // Lengths of both strings plus NULL character
        if (!new_errmsg) {
            return (void *) PyErr_NoMemory();
        }

        // Copy both strings into new buffer
        memcpy(new_errmsg, fstring_msg, strlen(fstring_msg));
        memcpy(new_errmsg + strlen(fstring_msg), errmsg, strlen(errmsg));
        new_errmsg[len] = 0;
        errmsg = new_errmsg;
    }
    errstr = PyUnicode_FromFormatV(errmsg, va);
    if (!errstr) {
        goto error;
    }

    // Pick the best source of the offending line text.
    if (p->tok->fp_interactive && p->tok->interactive_src_start != NULL) {
        error_line = get_error_line_from_tokenizer_buffers(p, lineno);
    }
    else if (p->start_rule == Py_file_input) {
        error_line = _PyErr_ProgramDecodedTextObject(p->tok->filename,
                                                     (int) lineno, p->tok->encoding);
    }

    if (!error_line) {
        /* PyErr_ProgramTextObject was not called or returned NULL. If it was not called,
           then we need to find the error line from some other source, because
           p->start_rule != Py_file_input. If it returned NULL, then it either unexpectedly
           failed or we're parsing from a string or the REPL. There's a third edge case where
           we're actually parsing from a file, which has an E_EOF SyntaxError and in that case
           `PyErr_ProgramTextObject` fails because lineno points to last_file_line + 1, which
           does not physically exist */
        assert(p->tok->fp == NULL || p->tok->fp == stdin || p->tok->done == E_EOF);

        if (p->tok->lineno <= lineno && p->tok->inp > p->tok->buf) {
            // The tokenizer buffer still holds the relevant line(s).
            Py_ssize_t size = p->tok->inp - p->tok->buf;
            error_line = PyUnicode_DecodeUTF8(p->tok->buf, size, "replace");
        }
        else if (p->tok->fp == NULL || p->tok->fp == stdin) {
            error_line = get_error_line_from_tokenizer_buffers(p, lineno);
        }
        else {
            error_line = PyUnicode_FromStringAndSize("", 0);
        }
        if (!error_line) {
            goto error;
        }
    }

    if (p->start_rule == Py_fstring_input) {
        // Offsets inside an f-string are relative to the f-string start.
        col_offset -= p->starting_col_offset;
        end_col_offset -= p->starting_col_offset;
    }

    Py_ssize_t col_number = col_offset;
    Py_ssize_t end_col_number = end_col_offset;

    if (p->tok->encoding != NULL) {
        // Convert byte offsets to character offsets for non-UTF-8-trivial
        // encodings so the caret points at the right column.
        col_number = _PyPegen_byte_offset_to_character_offset(error_line, col_offset);
        if (col_number < 0) {
            goto error;
        }
        if (end_col_number > 0) {
            // NOTE(review): this local shadows the `end_col_offset` parameter;
            // intentional here but easy to misread.
            Py_ssize_t end_col_offset = _PyPegen_byte_offset_to_character_offset(error_line, end_col_number);
            if (end_col_offset < 0) {
                goto error;
            } else {
                end_col_number = end_col_offset;
            }
        }
    }
    // "N" steals the reference to error_line on success.
    tmp = Py_BuildValue("(OnnNnn)", p->tok->filename, lineno, col_number, error_line, end_lineno, end_col_number);
    if (!tmp) {
        goto error;
    }
    value = PyTuple_Pack(2, errstr, tmp);
    Py_DECREF(tmp);
    if (!value) {
        goto error;
    }
    PyErr_SetObject(errtype, value);

    Py_DECREF(errstr);
    Py_DECREF(value);
    if (p->start_rule == Py_fstring_input) {
        PyMem_Free((void *)errmsg);
    }
    return NULL;

error:
    Py_XDECREF(errstr);
    Py_XDECREF(error_line);
    if (p->start_rule == Py_fstring_input) {
        PyMem_Free((void *)errmsg);
    }
    return NULL;
}
416
void
_Pypegen_set_syntax_error(Parser* p, Token* last_token) {
    // Set the final syntax error after parsing failed, choosing the most
    // specific diagnostic available. `last_token` is the last token the
    // first parsing pass consumed.
    // Existing syntax error
    if (PyErr_Occurred()) {
        // Prioritize tokenizer errors to custom syntax errors raised
        // on the second phase only if the errors come from the parser.
        int is_tok_ok = (p->tok->done == E_DONE || p->tok->done == E_OK);
        if (is_tok_ok && PyErr_ExceptionMatches(PyExc_SyntaxError)) {
            _PyPegen_tokenize_full_source_to_check_for_errors(p);
        }
        // Propagate the existing syntax error.
        return;
    }
    // Initialization error
    if (p->fill == 0) {
        RAISE_SYNTAX_ERROR("error at start before reading any input");
    }
    // Parser encountered EOF (End of File) unexpectedly
    if (last_token->type == ERRORTOKEN && p->tok->done == E_EOF) {
        if (p->tok->level) {
            raise_unclosed_parentheses_error(p);
        } else {
            RAISE_SYNTAX_ERROR("unexpected EOF while parsing");
        }
        return;
    }
    // Indentation error in the tokenizer
    if (last_token->type == INDENT || last_token->type == DEDENT) {
        RAISE_INDENTATION_ERROR(last_token->type == INDENT ? "unexpected indent" : "unexpected unindent");
        return;
    }
    // Unknown error (generic case)

    // Use the last token we found on the first pass to avoid reporting
    // incorrect locations for generic syntax errors just because we reached
    // further away when trying to find specific syntax errors in the second
    // pass.
    RAISE_SYNTAX_ERROR_KNOWN_LOCATION(last_token, "invalid syntax");
    // _PyPegen_tokenize_full_source_to_check_for_errors will override the existing
    // generic SyntaxError we just raised if errors are found.
    _PyPegen_tokenize_full_source_to_check_for_errors(p);
}