1
2 /* Tokenizer implementation */
3
4 #define PY_SSIZE_T_CLEAN
5 #include "Python.h"
6 #include "pycore_call.h" // _PyObject_CallNoArgs()
7
8 #include <ctype.h>
9 #include <assert.h>
10
11 #include "tokenizer.h"
12 #include "errcode.h"
13
14 #include "unicodeobject.h"
15 #include "bytesobject.h"
16 #include "fileobject.h"
17 #include "abstract.h"
18
19 /* Alternate tab spacing */
20 #define ALTTABSIZE 1
21
22 #define is_potential_identifier_start(c) (\
23 (c >= 'a' && c <= 'z')\
24 || (c >= 'A' && c <= 'Z')\
25 || c == '_'\
26 || (c >= 128))
27
28 #define is_potential_identifier_char(c) (\
29 (c >= 'a' && c <= 'z')\
30 || (c >= 'A' && c <= 'Z')\
31 || (c >= '0' && c <= '9')\
32 || c == '_'\
33 || (c >= 128))
34
35
36 /* Don't ever change this -- it would break the portability of Python code */
37 #define TABSIZE 8
38
39 /* Forward */
40 static struct tok_state *tok_new(void);
41 static int tok_nextc(struct tok_state *tok);
42 static void tok_backup(struct tok_state *tok, int c);
43 static int syntaxerror(struct tok_state *tok, const char *format, ...);
44
45 /* Spaces in this constant are treated as "zero or more spaces or tabs" when
46 tokenizing. */
47 static const char* type_comment_prefix = "# type: ";
48
49 /* Create and initialize a new tok_state structure */
50
51 static struct tok_state *
52 tok_new(void)
53 {
54 struct tok_state *tok = (struct tok_state *)PyMem_Malloc(
55 sizeof(struct tok_state));
56 if (tok == NULL)
57 return NULL;
58 tok->buf = tok->cur = tok->inp = NULL;
59 tok->fp_interactive = 0;
60 tok->interactive_src_start = NULL;
61 tok->interactive_src_end = NULL;
62 tok->start = NULL;
63 tok->end = NULL;
64 tok->done = E_OK;
65 tok->fp = NULL;
66 tok->input = NULL;
67 tok->tabsize = TABSIZE;
68 tok->indent = 0;
69 tok->indstack[0] = 0;
70 tok->atbol = 1;
71 tok->pendin = 0;
72 tok->prompt = tok->nextprompt = NULL;
73 tok->lineno = 0;
74 tok->level = 0;
75 tok->altindstack[0] = 0;
76 tok->decoding_state = STATE_INIT;
77 tok->decoding_erred = 0;
78 tok->enc = NULL;
79 tok->encoding = NULL;
80 tok->cont_line = 0;
81 tok->filename = NULL;
82 tok->decoding_readline = NULL;
83 tok->decoding_buffer = NULL;
84 tok->type_comments = 0;
85 tok->async_hacks = 0;
86 tok->async_def = 0;
87 tok->async_def_indent = 0;
88 tok->async_def_nl = 0;
89 tok->interactive_underflow = IUNDERFLOW_NORMAL;
90 tok->str = NULL;
91 tok->report_warnings = 1;
92 return tok;
93 }
94
95 static char *
96 new_string(const char *s, Py_ssize_t len, struct tok_state *tok)
97 {
98 char* result = (char *)PyMem_Malloc(len + 1);
99 if (!result) {
100 tok->done = E_NOMEM;
101 return NULL;
102 }
103 memcpy(result, s, len);
104 result[len] = '\0';
105 return result;
106 }
107
108 static char *
109 error_ret(struct tok_state *tok) /* XXX */
110 {
111 tok->decoding_erred = 1;
112 if (tok->fp != NULL && tok->buf != NULL) /* see _PyTokenizer_Free */
113 PyMem_Free(tok->buf);
114 tok->buf = tok->cur = tok->inp = NULL;
115 tok->start = NULL;
116 tok->end = NULL;
117 tok->done = E_DECODE;
118 return NULL; /* as if it were EOF */
119 }
120
121
/* Normalize an encoding name: return "utf-8" or "iso-8859-1" for the
   common aliases (case-insensitive, '_' treated as '-'), else return S
   itself unchanged.  Only the first 12 characters are examined. */
static const char *
get_normal_name(const char *s)  /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;
    for (i = 0; i < 12; i++) {
        /* Cast through unsigned char: passing a negative plain char to
           tolower() is undefined behavior (CERT STR37-C). */
        unsigned char c = (unsigned char)s[i];
        if (c == '\0') {
            break;
        }
        else if (c == '_') {
            buf[i] = '-';
        }
        else {
            buf[i] = (char)tolower(c);
        }
    }
    buf[i] = '\0';
    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0) {
        return "utf-8";
    }
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0) {
        return "iso-8859-1";
    }
    else {
        return s;
    }
}
150
/* Look for a PEP 263 coding spec ("coding[:=] NAME") in the line S of
   SIZE bytes.  On success return 1 with *spec set to a malloc'ed,
   normalized encoding name (or NULL when no spec is present).
   Return 0 only on memory error (tok->done set by new_string). */

static int
get_coding_spec(const char *s, char **spec, Py_ssize_t size, struct tok_state *tok)
{
    Py_ssize_t i;
    *spec = NULL;
    /* Coding spec must be in a comment, and that comment must be
     * the only statement on the source code line. */
    for (i = 0; i < size - 6; i++) {
        if (s[i] == '#')
            break;
        if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
            return 1;   /* code before any comment: no spec on this line */
    }
    for (; i < size - 6; i++) { /* XXX inefficient search */
        const char* t = s + i;
        if (memcmp(t, "coding", 6) == 0) {
            const char* begin = NULL;
            t += 6;
            if (t[0] != ':' && t[0] != '=')
                continue;
            /* Skip spaces and tabs after the ':' or '='. */
            do {
                t++;
            } while (t[0] == ' ' || t[0] == '\t');

            begin = t;
            /* An encoding name: alphanumerics plus '-', '_' and '.'. */
            while (Py_ISALNUM(t[0]) ||
                   t[0] == '-' || t[0] == '_' || t[0] == '.')
                t++;

            if (begin < t) {
                char* r = new_string(begin, t - begin, tok);
                const char* q;
                if (!r)
                    return 0;
                /* Map aliases like "latin_1" to their canonical name. */
                q = get_normal_name(r);
                if (r != q) {
                    PyMem_Free(r);
                    r = new_string(q, strlen(q), tok);
                    if (!r)
                        return 0;
                }
                *spec = r;
                break;
            }
        }
    }
    return 1;
}
201
/* Check whether the line contains a coding spec. If it does,
   invoke the set_readline function for the new encoding.
   This function receives the tok_state and the new encoding.
   Return 1 on success, 0 on failure. */

static int
check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
                  int set_readline(struct tok_state *, const char *))
{
    char *cs;
    if (tok->cont_line) {
        /* It's a continuation line, so it can't be a coding spec. */
        tok->decoding_state = STATE_NORMAL;
        return 1;
    }
    if (!get_coding_spec(line, &cs, size, tok)) {
        return 0;   /* memory error; tok->done already set */
    }
    if (!cs) {
        /* No spec on this line: decide whether to keep looking. */
        Py_ssize_t i;
        for (i = 0; i < size; i++) {
            if (line[i] == '#' || line[i] == '\n' || line[i] == '\r')
                break;
            if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') {
                /* Stop checking coding spec after a line containing
                 * anything except a comment. */
                tok->decoding_state = STATE_NORMAL;
                break;
            }
        }
        return 1;
    }
    tok->decoding_state = STATE_NORMAL;
    if (tok->encoding == NULL) {
        assert(tok->decoding_readline == NULL);
        /* For a non-UTF-8 encoding, switch the reader to a codec stream. */
        if (strcmp(cs, "utf-8") != 0 && !set_readline(tok, cs)) {
            error_ret(tok);
            PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
            PyMem_Free(cs);
            return 0;
        }
        tok->encoding = cs;     /* ownership of cs transfers to tok */
    } else {                /* then, compare cs with BOM */
        /* A BOM already fixed the encoding; the declared spec must agree. */
        if (strcmp(tok->encoding, cs) != 0) {
            error_ret(tok);
            PyErr_Format(PyExc_SyntaxError,
                         "encoding problem: %s with BOM", cs);
            PyMem_Free(cs);
            return 0;
        }
        PyMem_Free(cs);
    }
    return 1;
}
256
/* See whether the file starts with a UTF-8 BOM (EF BB BF).  If it
   does, consume it and record "utf-8" as the encoding; otherwise push
   the probed bytes back.  Return 1 on success, 0 on memory error. */

static int
check_bom(int get_char(struct tok_state *),
          void unget_char(int, struct tok_state *),
          int set_readline(struct tok_state *, const char *),
          struct tok_state *tok)
{
    int ch1, ch2, ch3;
    ch1 = get_char(tok);
    tok->decoding_state = STATE_SEEK_CODING;
    if (ch1 == EOF) {
        return 1;
    } else if (ch1 == 0xEF) {
        /* Possible BOM: check the remaining two bytes, pushing back
           (in reverse order) everything read if they don't match. */
        ch2 = get_char(tok);
        if (ch2 != 0xBB) {
            unget_char(ch2, tok);
            unget_char(ch1, tok);
            return 1;
        }
        ch3 = get_char(tok);
        if (ch3 != 0xBF) {
            unget_char(ch3, tok);
            unget_char(ch2, tok);
            unget_char(ch1, tok);
            return 1;
        }
    } else {
        unget_char(ch1, tok);
        return 1;
    }
    /* A UTF-8 BOM was consumed: record the encoding explicitly. */
    if (tok->encoding != NULL)
        PyMem_Free(tok->encoding);
    tok->encoding = new_string("utf-8", 5, tok);
    if (!tok->encoding)
        return 0;
    /* No need to set_readline: input is already utf-8 */
    return 1;
}
298
/* Append LINE to the accumulated interactive source buffer
   (tok->interactive_src_start .. interactive_src_end), ensuring the
   stored copy ends with '\n'.  Returns 0 on success, -1 on memory
   error (tok->done = E_NOMEM). */
static int
tok_concatenate_interactive_new_line(struct tok_state *tok, const char *line) {
    assert(tok->fp_interactive);

    if (!line) {
        return 0;
    }

    Py_ssize_t current_size = tok->interactive_src_end - tok->interactive_src_start;
    Py_ssize_t line_size = strlen(line);
    /* For an empty line this reads line[0], i.e. the terminating '\0'. */
    char last_char = line[line_size > 0 ? line_size - 1 : line_size];
    if (last_char != '\n') {
        line_size += 1;     /* reserve room for the '\n' added below */
    }
    char* new_str = tok->interactive_src_start;

    new_str = PyMem_Realloc(new_str, current_size + line_size + 1);
    if (!new_str) {
        /* Realloc failed: the old block is still live, free it here. */
        if (tok->interactive_src_start) {
            PyMem_Free(tok->interactive_src_start);
        }
        tok->interactive_src_start = NULL;
        tok->interactive_src_end = NULL;
        tok->done = E_NOMEM;
        return -1;
    }
    strcpy(new_str + current_size, line);
    if (last_char != '\n') {
        /* Last line does not end in \n, fake one */
        new_str[current_size + line_size - 1] = '\n';
        new_str[current_size + line_size] = '\0';
    }
    tok->interactive_src_start = new_str;
    tok->interactive_src_end = new_str + current_size + line_size;
    return 0;
}
335
336
337 /* Read a line of text from TOK into S, using the stream in TOK.
338 Return NULL on failure, else S.
339
340 On entry, tok->decoding_buffer will be one of:
341 1) NULL: need to call tok->decoding_readline to get a new line
342 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
343 stored the result in tok->decoding_buffer
344 3) PyByteArrayObject *: previous call to tok_readline_recode did not have enough room
345 (in the s buffer) to copy entire contents of the line read
346 by tok->decoding_readline. tok->decoding_buffer has the overflow.
347 In this case, tok_readline_recode is called in a loop (with an expanded buffer)
348 until the buffer ends with a '\n' (or until the end of the file is
349 reached): see tok_nextc and its calls to tok_reserve_buf.
350 */
351
/* Ensure the token buffer has at least SIZE free bytes past tok->inp,
   growing geometrically and re-basing every pointer that points into
   the buffer.  Returns 1 on success, 0 on memory error (E_NOMEM). */
static int
tok_reserve_buf(struct tok_state *tok, Py_ssize_t size)
{
    Py_ssize_t cur = tok->cur - tok->buf;
    Py_ssize_t oldsize = tok->inp - tok->buf;
    /* Grow by at least half the current size to amortize reallocs. */
    Py_ssize_t newsize = oldsize + Py_MAX(size, oldsize >> 1);
    if (newsize > tok->end - tok->buf) {
        char *newbuf = tok->buf;
        /* Save offsets: the pointers must be recomputed after realloc
           may have moved the block (-1 encodes a NULL tok->start). */
        Py_ssize_t start = tok->start == NULL ? -1 : tok->start - tok->buf;
        Py_ssize_t line_start = tok->start == NULL ? -1 : tok->line_start - tok->buf;
        Py_ssize_t multi_line_start = tok->multi_line_start - tok->buf;
        newbuf = (char *)PyMem_Realloc(newbuf, newsize);
        if (newbuf == NULL) {
            tok->done = E_NOMEM;
            return 0;
        }
        tok->buf = newbuf;
        tok->cur = tok->buf + cur;
        tok->inp = tok->buf + oldsize;
        tok->end = tok->buf + newsize;
        tok->start = start < 0 ? NULL : tok->buf + start;
        tok->line_start = line_start < 0 ? NULL : tok->buf + line_start;
        tok->multi_line_start = multi_line_start < 0 ? NULL : tok->buf + multi_line_start;
    }
    return 1;
}
378
/* Report whether the first SIZE bytes at STR contain a NUL byte. */
static inline int
contains_null_bytes(const char* str, size_t size) {
    const void *hit = memchr(str, '\0', size);
    return hit != NULL;
}
383
/* Read one line through tok->decoding_readline (or consume a pending
   tok->decoding_buffer), copy its UTF-8 form into the token buffer,
   and keep the interactive source copy in sync.  Returns 1 on
   success, 0 on error. */
static int
tok_readline_recode(struct tok_state *tok) {
    PyObject *line;
    const char *buf;
    Py_ssize_t buflen;
    line = tok->decoding_buffer;
    if (line == NULL) {
        line = PyObject_CallNoArgs(tok->decoding_readline);
        if (line == NULL) {
            error_ret(tok);
            goto error;
        }
    }
    else {
        /* Consume the line buffered by a previous call. */
        tok->decoding_buffer = NULL;
    }
    buf = PyUnicode_AsUTF8AndSize(line, &buflen);
    if (buf == NULL) {
        error_ret(tok);
        goto error;
    }
    // Make room for the null terminator *and* potentially
    // an extra newline character that we may need to artificially
    // add.
    size_t buffer_size = buflen + 2;
    if (!tok_reserve_buf(tok, buffer_size)) {
        goto error;
    }
    memcpy(tok->inp, buf, buflen);
    tok->inp += buflen;
    *tok->inp = '\0';
    if (tok->fp_interactive &&
        tok_concatenate_interactive_new_line(tok, buf) == -1) {
        goto error;
    }
    Py_DECREF(line);
    return 1;
error:
    Py_XDECREF(line);
    return 0;
}
425
426 /* Set the readline function for TOK to a StreamReader's
427 readline function. The StreamReader is named ENC.
428
429 This function is called from check_bom and check_coding_spec.
430
431 ENC is usually identical to the future value of tok->encoding,
432 except for the (currently unsupported) case of UTF-16.
433
434 Return 1 on success, 0 on failure. */
435
static int
fp_setreadl(struct tok_state *tok, const char* enc)
{
    PyObject *readline, *io, *stream;
    int fd;
    long pos;

    fd = fileno(tok->fp);
    /* Due to buffering the file offset for fd can be different from the file
     * position of tok->fp. If tok->fp was opened in text mode on Windows,
     * its file position counts CRLF as one char and can't be directly mapped
     * to the file offset for fd. Instead we step back one byte and read to
     * the end of line.*/
    pos = ftell(tok->fp);
    if (pos == -1 ||
        lseek(fd, (off_t)(pos > 0 ? pos - 1 : pos), SEEK_SET) == (off_t)-1) {
        PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL);
        return 0;
    }

    io = PyImport_ImportModule("io");
    if (io == NULL) {
        return 0;
    }
    /* io.open(fd, "r", -1, enc, None, None, False): a text stream that
       decodes with ENC and does not close fd when destroyed. */
    stream = _PyObject_CallMethod(io, &_Py_ID(open), "isisOOO",
                                  fd, "r", -1, enc, Py_None, Py_None, Py_False);
    Py_DECREF(io);
    if (stream == NULL) {
        return 0;
    }

    readline = PyObject_GetAttr(stream, &_Py_ID(readline));
    Py_DECREF(stream);
    if (readline == NULL) {
        return 0;
    }
    /* Replace any previously installed readline (drops the old ref). */
    Py_XSETREF(tok->decoding_readline, readline);

    if (pos > 0) {
        /* Discard the partial line produced by stepping back one byte. */
        PyObject *bufobj = _PyObject_CallNoArgs(readline);
        if (bufobj == NULL) {
            return 0;
        }
        Py_DECREF(bufobj);
    }

    return 1;
}
484
/* Fetch the next byte from TOK's stdio stream (EOF on end or error). */

static int fp_getc(struct tok_state *tok) {
    return getc(tok->fp);
}
490
/* Push byte C back onto TOK's stdio stream (one byte of pushback only). */

static void fp_ungetc(int c, struct tok_state *tok) {
    ungetc(c, tok->fp);
}
496
/* Check whether the bytes at S begin a complete, valid UTF-8 sequence.
   Return the sequence length (1-4) if so, 0 if not.  The rejected
   special cases mirror stringlib/codecs.h:utf8_decode: overlong forms,
   UTF-16 surrogates D800-DFFF, and code points above 10FFFF. */
static int
valid_utf8(const unsigned char* s)
{
    unsigned char lead = *s;
    int n_continuation;

    if (lead < 0x80) {
        return 1;                       /* single-byte (ASCII) */
    }
    if (lead < 0xC2) {
        /* 0x80-0xBF: stray continuation byte;
           0xC0-0xC1: overlong encoding of 0000-007F */
        return 0;
    }
    if (lead < 0xE0) {
        n_continuation = 1;             /* 0080-07FF */
    }
    else if (lead < 0xF0) {
        /* 0800-FFFF */
        if (lead == 0xE0 && s[1] < 0xA0) {
            return 0;                   /* overlong: fake 0000-07FF */
        }
        if (lead == 0xED && s[1] >= 0xA0) {
            /* \xED\xA0\x80-\xED\xBF\xBF would decode to surrogates
               D800-DFFF, which are not valid UTF-8.
               See https://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
            return 0;
        }
        n_continuation = 2;
    }
    else if (lead < 0xF5) {
        /* 10000-10FFFF */
        if (s[1] < 0x90 ? lead == 0xF0 : lead == 0xF4) {
            /* \xF0\x80..-\xF0\x8F..: overlong (fake 0000-FFFF);
               \xF4\x90..-:           110000- overflow */
            return 0;
        }
        n_continuation = 3;
    }
    else {
        return 0;                       /* 0xF5-0xFF: invalid lead byte */
    }

    for (int k = n_continuation; k > 0; k--) {
        if (s[k] < 0x80 || s[k] >= 0xC0) {
            return 0;                   /* not a continuation byte */
        }
    }
    return n_continuation + 1;
}
558
/* Verify that LINE contains only valid UTF-8.  On failure, raise a
   SyntaxError naming the first offending byte and return 0. */
static int
ensure_utf8(char *line, struct tok_state *tok)
{
    int badchar = 0;
    unsigned char *c;
    int length;
    /* Walk sequence by sequence; valid_utf8 returns 0 on a bad start. */
    for (c = (unsigned char *)line; *c; c += length) {
        if (!(length = valid_utf8(c))) {
            badchar = *c;
            break;
        }
    }
    if (badchar) {
        PyErr_Format(PyExc_SyntaxError,
                     "Non-UTF-8 code starting with '\\x%.2x' "
                     "in file %U on line %i, "
                     "but no encoding declared; "
                     "see https://peps.python.org/pep-0263/ for details",
                     badchar, tok->filename, tok->lineno);
        return 0;
    }
    return 1;
}
582
/* Fetch a byte from TOK, using the string buffer. */

static int
buf_getc(struct tok_state *tok) {
    /* Py_CHARMASK prevents sign-extension of bytes >= 0x80. */
    return Py_CHARMASK(*tok->str++);
}
589
/* Unfetch a byte from TOK, using the string buffer.  Only valid for
   pushing back the byte that was just read. */

static void
buf_ungetc(int c, struct tok_state *tok) {
    tok->str--;
    assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
}
597
/* Set the readline function for TOK to ENC. For the string-based
   tokenizer, this means to just record the encoding; the actual
   recoding happens later in decode_str. */

static int
buf_setreadl(struct tok_state *tok, const char* enc) {
    tok->enc = enc;
    return 1;
}
606
607 /* Return a UTF-8 encoding Python string object from the
608 C byte string STR, which is encoded with ENC. */
609
610 static PyObject *
611 translate_into_utf8(const char* str, const char* enc) {
612 PyObject *utf8;
613 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
614 if (buf == NULL)
615 return NULL;
616 utf8 = PyUnicode_AsUTF8String(buf);
617 Py_DECREF(buf);
618 return utf8;
619 }
620
621
/* Return a malloc'ed copy of S with "\r\n" and bare "\r" normalized
   to "\n".  If EXEC_INPUT, guarantee the result ends with a newline.
   Returns NULL on memory error (tok->done = E_NOMEM). */
static char *
translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
    int skip_next_lf = 0;
    /* +2: room for a possibly appended '\n' plus the terminator. */
    size_t needed_length = strlen(s) + 2, final_length;
    char *buf, *current;
    char c = '\0';
    buf = PyMem_Malloc(needed_length);
    if (buf == NULL) {
        tok->done = E_NOMEM;
        return NULL;
    }
    for (current = buf; *s; s++, current++) {
        c = *s;
        if (skip_next_lf) {
            /* Previous char was '\r': swallow a directly following '\n'. */
            skip_next_lf = 0;
            if (c == '\n') {
                c = *++s;
                if (!c)
                    break;
            }
        }
        if (c == '\r') {
            skip_next_lf = 1;
            c = '\n';
        }
        *current = c;
    }
    /* If this is exec input, add a newline to the end of the string if
       there isn't one already. */
    if (exec_input && c != '\n') {
        *current = '\n';
        current++;
    }
    *current = '\0';
    final_length = current - buf + 1;
    if (final_length < needed_length && final_length) {
        /* should never fail: shrinking realloc */
        char* result = PyMem_Realloc(buf, final_length);
        if (result == NULL) {
            PyMem_Free(buf);
        }
        buf = result;
    }
    return buf;
}
667
/* Decode a byte string INPUT for use as the buffer of TOK: normalize
   newlines, honor a BOM and any PEP 263 coding spec on the first two
   lines, and transcode to UTF-8 if needed.  The translated original is
   recorded in tok->input; returns the decoded string or NULL on error. */

static char *
decode_str(const char *input, int single, struct tok_state *tok)
{
    PyObject* utf8 = NULL;
    char *str;
    const char *s;
    const char *newl[2] = {NULL, NULL};
    int lineno = 0;
    tok->input = str = translate_newlines(input, single, tok);
    if (str == NULL)
        return NULL;
    tok->enc = NULL;
    tok->str = str;
    if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
        return error_ret(tok);
    str = tok->str; /* string after BOM if any */
    assert(str);
    if (tok->enc != NULL) {
        /* A BOM selected an encoding: recode the whole buffer to UTF-8. */
        utf8 = translate_into_utf8(str, tok->enc);
        if (utf8 == NULL)
            return error_ret(tok);
        str = PyBytes_AsString(utf8);
    }
    /* Locate the ends of the first two lines. */
    for (s = str;; s++) {
        if (*s == '\0') break;
        else if (*s == '\n') {
            assert(lineno < 2);
            newl[lineno] = s;
            lineno++;
            if (lineno == 2) break;
        }
    }
    tok->enc = NULL;
    /* need to check line 1 and 2 separately since check_coding_spec
       assumes a single line as input */
    if (newl[0]) {
        if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl)) {
            return NULL;
        }
        if (tok->enc == NULL && tok->decoding_state != STATE_NORMAL && newl[1]) {
            if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
                                   tok, buf_setreadl))
                return NULL;
        }
    }
    if (tok->enc != NULL) {
        assert(utf8 == NULL);
        utf8 = translate_into_utf8(str, tok->enc);
        if (utf8 == NULL)
            return error_ret(tok);
        str = PyBytes_AS_STRING(utf8);
    }
    assert(tok->decoding_buffer == NULL);
    /* Keep the bytes object alive: str points into its storage. */
    tok->decoding_buffer = utf8; /* CAUTION */
    return str;
}
728
729 /* Set up tokenizer for string */
730
731 struct tok_state *
732 _PyTokenizer_FromString(const char *str, int exec_input)
733 {
734 struct tok_state *tok = tok_new();
735 char *decoded;
736
737 if (tok == NULL)
738 return NULL;
739 decoded = decode_str(str, exec_input, tok);
740 if (decoded == NULL) {
741 _PyTokenizer_Free(tok);
742 return NULL;
743 }
744
745 tok->buf = tok->cur = tok->inp = decoded;
746 tok->end = decoded;
747 return tok;
748 }
749
750 /* Set up tokenizer for UTF-8 string */
751
752 struct tok_state *
753 _PyTokenizer_FromUTF8(const char *str, int exec_input)
754 {
755 struct tok_state *tok = tok_new();
756 char *translated;
757 if (tok == NULL)
758 return NULL;
759 tok->input = translated = translate_newlines(str, exec_input, tok);
760 if (translated == NULL) {
761 _PyTokenizer_Free(tok);
762 return NULL;
763 }
764 tok->decoding_state = STATE_NORMAL;
765 tok->enc = NULL;
766 tok->str = translated;
767 tok->encoding = new_string("utf-8", 5, tok);
768 if (!tok->encoding) {
769 _PyTokenizer_Free(tok);
770 return NULL;
771 }
772
773 tok->buf = tok->cur = tok->inp = translated;
774 tok->end = translated;
775 return tok;
776 }
777
778 /* Set up tokenizer for file */
779
780 struct tok_state *
781 _PyTokenizer_FromFile(FILE *fp, const char* enc,
782 const char *ps1, const char *ps2)
783 {
784 struct tok_state *tok = tok_new();
785 if (tok == NULL)
786 return NULL;
787 if ((tok->buf = (char *)PyMem_Malloc(BUFSIZ)) == NULL) {
788 _PyTokenizer_Free(tok);
789 return NULL;
790 }
791 tok->cur = tok->inp = tok->buf;
792 tok->end = tok->buf + BUFSIZ;
793 tok->fp = fp;
794 tok->prompt = ps1;
795 tok->nextprompt = ps2;
796 if (enc != NULL) {
797 /* Must copy encoding declaration since it
798 gets copied into the parse tree. */
799 tok->encoding = new_string(enc, strlen(enc), tok);
800 if (!tok->encoding) {
801 _PyTokenizer_Free(tok);
802 return NULL;
803 }
804 tok->decoding_state = STATE_NORMAL;
805 }
806 return tok;
807 }
808
809 /* Free a tok_state structure */
810
811 void
812 _PyTokenizer_Free(struct tok_state *tok)
813 {
814 if (tok->encoding != NULL) {
815 PyMem_Free(tok->encoding);
816 }
817 Py_XDECREF(tok->decoding_readline);
818 Py_XDECREF(tok->decoding_buffer);
819 Py_XDECREF(tok->filename);
820 if (tok->fp != NULL && tok->buf != NULL) {
821 PyMem_Free(tok->buf);
822 }
823 if (tok->input) {
824 PyMem_Free(tok->input);
825 }
826 if (tok->interactive_src_start != NULL) {
827 PyMem_Free(tok->interactive_src_start);
828 }
829 PyMem_Free(tok);
830 }
831
/* Read bytes from tok->fp into the token buffer until a complete line
   (ending in '\n') or EOF has been read.  Returns 1 on success or EOF,
   0 on memory error. */
static int
tok_readline_raw(struct tok_state *tok)
{
    do {
        if (!tok_reserve_buf(tok, BUFSIZ)) {
            return 0;
        }
        int n_chars = (int)(tok->end - tok->inp);
        size_t line_size = 0;
        char *line = _Py_UniversalNewlineFgetsWithSize(tok->inp, n_chars, tok->fp, NULL, &line_size);
        if (line == NULL) {
            return 1;   /* EOF: the caller detects the empty read */
        }
        if (tok->fp_interactive &&
            tok_concatenate_interactive_new_line(tok, line) == -1) {
            return 0;
        }
        tok->inp += line_size;
        if (tok->inp == tok->buf) {
            return 0;   /* nothing was read at all */
        }
    } while (tok->inp[-1] != '\n');  /* loop when the line was truncated */
    return 1;
}
856
/* Advance the string-based tokenizer to the next line: move tok->inp
   past the next '\n' (or to the end of input).  Returns 1, or 0 at the
   end of input (tok->done = E_EOF). */
static int
tok_underflow_string(struct tok_state *tok) {
    char *end = strchr(tok->inp, '\n');
    if (end != NULL) {
        end++;  /* include the newline */
    }
    else {
        end = strchr(tok->inp, '\0');
        if (end == tok->inp) {
            tok->done = E_EOF;
            return 0;
        }
    }
    if (tok->start == NULL) {
        /* No token in progress: discard already-consumed text. */
        tok->buf = tok->cur;
    }
    tok->line_start = tok->cur;
    tok->lineno++;
    tok->inp = end;
    return 1;
}
878
/* Fetch the next line in interactive mode via PyOS_Readline, recoding
   to UTF-8 when an explicit encoding is set.  Returns 1 on success,
   0 on error/EOF/interrupt (tok->done distinguishes the cases). */
static int
tok_underflow_interactive(struct tok_state *tok) {
    if (tok->interactive_underflow == IUNDERFLOW_STOP) {
        tok->done = E_INTERACT_STOP;
        return 1;
    }
    char *newtok = PyOS_Readline(tok->fp ? tok->fp : stdin, stdout, tok->prompt);
    if (newtok != NULL) {
        /* Normalize \r\n and bare \r to \n. */
        char *translated = translate_newlines(newtok, 0, tok);
        PyMem_Free(newtok);
        if (translated == NULL) {
            return 0;
        }
        newtok = translated;
    }
    if (tok->encoding && newtok && *newtok) {
        /* Recode to UTF-8 */
        Py_ssize_t buflen;
        const char* buf;
        PyObject *u = translate_into_utf8(newtok, tok->encoding);
        PyMem_Free(newtok);
        if (u == NULL) {
            tok->done = E_DECODE;
            return 0;
        }
        buflen = PyBytes_GET_SIZE(u);
        buf = PyBytes_AS_STRING(u);
        newtok = PyMem_Malloc(buflen+1);
        if (newtok == NULL) {
            Py_DECREF(u);
            tok->done = E_NOMEM;
            return 0;
        }
        strcpy(newtok, buf);
        Py_DECREF(u);
    }
    if (tok->fp_interactive &&
        tok_concatenate_interactive_new_line(tok, newtok) == -1) {
        PyMem_Free(newtok);
        return 0;
    }
    if (tok->nextprompt != NULL) {
        /* After the first line, switch to the continuation prompt. */
        tok->prompt = tok->nextprompt;
    }
    if (newtok == NULL) {
        tok->done = E_INTR;     /* readline was interrupted */
    }
    else if (*newtok == '\0') {
        PyMem_Free(newtok);
        tok->done = E_EOF;
    }
    else if (tok->start != NULL) {
        /* A token is in progress: append the new line to the buffer,
           preserving multi_line_start across a possible realloc. */
        Py_ssize_t cur_multi_line_start = tok->multi_line_start - tok->buf;
        size_t size = strlen(newtok);
        tok->lineno++;
        if (!tok_reserve_buf(tok, size + 1)) {
            PyMem_Free(tok->buf);
            tok->buf = NULL;
            PyMem_Free(newtok);
            return 0;
        }
        memcpy(tok->cur, newtok, size + 1);
        PyMem_Free(newtok);
        tok->inp += size;
        tok->multi_line_start = tok->buf + cur_multi_line_start;
    }
    else {
        /* No token in progress: the new line replaces the buffer. */
        tok->lineno++;
        PyMem_Free(tok->buf);
        tok->buf = newtok;
        tok->cur = tok->buf;
        tok->line_start = tok->buf;
        tok->inp = strchr(tok->buf, '\0');
        tok->end = tok->inp + 1;
    }
    if (tok->done != E_OK) {
        if (tok->prompt != NULL) {
            PySys_WriteStderr("\n");
        }
        return 0;
    }
    return 1;
}
962
/* Refill the buffer from tok->fp: handle BOM/coding-spec detection on
   the first two lines, append a missing final newline, and verify the
   bytes are UTF-8 when no other encoding was declared.  Returns 1 on
   success, 0 on error or EOF. */
static int
tok_underflow_file(struct tok_state *tok) {
    if (tok->start == NULL) {
        /* No token in progress: reuse the buffer from the start. */
        tok->cur = tok->inp = tok->buf;
    }
    if (tok->decoding_state == STATE_INIT) {
        /* We have not yet determined the encoding.
           If an encoding is found, use the file-pointer
           reader functions from now on. */
        if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok)) {
            error_ret(tok);
            return 0;
        }
        assert(tok->decoding_state != STATE_INIT);
    }
    /* Read until '\n' or EOF */
    if (tok->decoding_readline != NULL) {
        /* We already have a codec associated with this input. */
        if (!tok_readline_recode(tok)) {
            return 0;
        }
    }
    else {
        /* We want a 'raw' read. */
        if (!tok_readline_raw(tok)) {
            return 0;
        }
    }
    if (tok->inp == tok->cur) {
        tok->done = E_EOF;
        return 0;
    }
    if (tok->inp[-1] != '\n') {
        assert(tok->inp + 1 < tok->end);
        /* Last line does not end in \n, fake one */
        *tok->inp++ = '\n';
        *tok->inp = '\0';
    }

    tok->lineno++;
    if (tok->decoding_state != STATE_NORMAL) {
        if (tok->lineno > 2) {
            /* PEP 263: a coding spec may only appear on lines 1 or 2. */
            tok->decoding_state = STATE_NORMAL;
        }
        else if (!check_coding_spec(tok->cur, strlen(tok->cur),
                                    tok, fp_setreadl))
        {
            return 0;
        }
    }
    /* The default encoding is UTF-8, so make sure we don't have any
       non-UTF-8 sequences in it. */
    if (!tok->encoding && !ensure_utf8(tok->cur, tok)) {
        error_ret(tok);
        return 0;
    }
    assert(tok->done == E_OK);
    return tok->done == E_OK;
}
1022
1023 #if defined(Py_DEBUG)
1024 static void
1025 print_escape(FILE *f, const char *s, Py_ssize_t size)
1026 {
1027 if (s == NULL) {
1028 fputs("NULL", f);
1029 return;
1030 }
1031 putc('"', f);
1032 while (size-- > 0) {
1033 unsigned char c = *s++;
1034 switch (c) {
1035 case '\n': fputs("\\n", f); break;
1036 case '\r': fputs("\\r", f); break;
1037 case '\t': fputs("\\t", f); break;
1038 case '\f': fputs("\\f", f); break;
1039 case '\'': fputs("\\'", f); break;
1040 case '"': fputs("\\\"", f); break;
1041 default:
1042 if (0x20 <= c && c <= 0x7f)
1043 putc(c, f);
1044 else
1045 fprintf(f, "\\x%02x", c);
1046 }
1047 }
1048 putc('"', f);
1049 }
1050 #endif
1051
/* Get next char, updating state; error code goes into tok->done.
   Returns the next byte (masked to 0..255) or EOF. */

static int
tok_nextc(struct tok_state *tok)
{
    int rc;
    for (;;) {
        if (tok->cur != tok->inp) {
            /* Guard against column offsets overflowing an int. */
            if (tok->cur - tok->buf >= INT_MAX) {
                tok->done = E_COLUMNOVERFLOW;
                return EOF;
            }
            return Py_CHARMASK(*tok->cur++); /* Fast path */
        }
        if (tok->done != E_OK) {
            return EOF;
        }
        /* Buffer exhausted: refill from the appropriate source. */
        if (tok->fp == NULL) {
            rc = tok_underflow_string(tok);
        }
        else if (tok->prompt != NULL) {
            rc = tok_underflow_interactive(tok);
        }
        else {
            rc = tok_underflow_file(tok);
        }
#if defined(Py_DEBUG)
        if (Py_DebugFlag) {
            fprintf(stderr, "line[%d] = ", tok->lineno);
            print_escape(stderr, tok->cur, tok->inp - tok->cur);
            fprintf(stderr, " tok->done = %d\n", tok->done);
        }
#endif
        if (!rc) {
            tok->cur = tok->inp;
            return EOF;
        }
        tok->line_start = tok->cur;

        /* Reject embedded NUL bytes in the freshly read line. */
        if (contains_null_bytes(tok->line_start, tok->inp - tok->line_start)) {
            syntaxerror(tok, "source code cannot contain null bytes");
            tok->cur = tok->inp;
            return EOF;
        }
    }
    Py_UNREACHABLE();
}
1099
/* Back-up one character (no-op for EOF).  The caller must push back
   exactly the character it just read. */

static void
tok_backup(struct tok_state *tok, int c)
{
    if (c != EOF) {
        if (--tok->cur < tok->buf) {
            Py_FatalError("tokenizer beginning of buffer");
        }
        if ((int)(unsigned char)*tok->cur != c) {
            Py_FatalError("tok_backup: wrong character");
        }
    }
}
1114
/* Raise SyntaxError with the message formatted from FORMAT/VARGS,
   located on tok's current line; col_offset/end_col_offset of -1 mean
   "at the current position".  Always sets tok->done = E_ERROR and
   returns ERRORTOKEN. */
static int
_syntaxerror_range(struct tok_state *tok, const char *format,
                   int col_offset, int end_col_offset,
                   va_list vargs)
{
    PyObject *errmsg, *errtext, *args;
    errmsg = PyUnicode_FromFormatV(format, vargs);
    if (!errmsg) {
        goto error;
    }

    errtext = PyUnicode_DecodeUTF8(tok->line_start, tok->cur - tok->line_start,
                                   "replace");
    if (!errtext) {
        goto error;
    }

    if (col_offset == -1) {
        col_offset = (int)PyUnicode_GET_LENGTH(errtext);
    }
    if (end_col_offset == -1) {
        end_col_offset = col_offset;
    }

    /* Re-decode errtext so it covers the full physical line. */
    Py_ssize_t line_len = strcspn(tok->line_start, "\n");
    if (line_len != tok->cur - tok->line_start) {
        Py_DECREF(errtext);
        errtext = PyUnicode_DecodeUTF8(tok->line_start, line_len,
                                       "replace");
    }
    if (!errtext) {
        goto error;
    }

    /* "N" steals the reference to errtext. */
    args = Py_BuildValue("(O(OiiNii))", errmsg, tok->filename, tok->lineno,
                         col_offset, errtext, tok->lineno, end_col_offset);
    if (args) {
        PyErr_SetObject(PyExc_SyntaxError, args);
        Py_DECREF(args);
    }

error:
    /* The success path also falls through here to release errmsg. */
    Py_XDECREF(errmsg);
    tok->done = E_ERROR;
    return ERRORTOKEN;
}
1161
/* Raise a SyntaxError at the current position (no explicit column
   range).  Returns ERRORTOKEN. */
static int
syntaxerror(struct tok_state *tok, const char *format, ...)
{
    va_list vargs;
#ifdef HAVE_STDARG_PROTOTYPES
    va_start(vargs, format);
#else
    va_start(vargs);
#endif
    int ret = _syntaxerror_range(tok, format, -1, -1, vargs);
    va_end(vargs);
    return ret;
}
1175
/* Raise a SyntaxError with an explicit column range on the current
   line.  Returns ERRORTOKEN. */
static int
syntaxerror_known_range(struct tok_state *tok,
                        int col_offset, int end_col_offset,
                        const char *format, ...)
{
    va_list vargs;
#ifdef HAVE_STDARG_PROTOTYPES
    va_start(vargs, format);
#else
    va_start(vargs);
#endif
    int ret = _syntaxerror_range(tok, format, col_offset, end_col_offset, vargs);
    va_end(vargs);
    return ret;
}
1191
1192
1193
/* Report inconsistent use of tabs and spaces (tok->done = E_TABSPACE)
   and skip the rest of the current line.  Returns ERRORTOKEN. */
static int
indenterror(struct tok_state *tok)
{
    tok->done = E_TABSPACE;
    tok->cur = tok->inp;
    return ERRORTOKEN;
}
1201
/* Emit a warning of CATEGORY for the current line.  If the warning has
   been escalated to an error (e.g. -W error), convert it into a
   SyntaxError.  Returns 0 on success, -1 on error (E_ERROR). */
static int
parser_warn(struct tok_state *tok, PyObject *category, const char *format, ...)
{
    if (!tok->report_warnings) {
        return 0;
    }

    PyObject *errmsg;
    va_list vargs;
#ifdef HAVE_STDARG_PROTOTYPES
    va_start(vargs, format);
#else
    va_start(vargs);
#endif
    errmsg = PyUnicode_FromFormatV(format, vargs);
    va_end(vargs);
    if (!errmsg) {
        goto error;
    }

    if (PyErr_WarnExplicitObject(category, errmsg, tok->filename,
                                 tok->lineno, NULL, NULL) < 0) {
        if (PyErr_ExceptionMatches(category)) {
            /* Replace the DeprecationWarning exception with a SyntaxError
               to get a more accurate error report */
            PyErr_Clear();
            syntaxerror(tok, "%U", errmsg);
        }
        goto error;
    }
    Py_DECREF(errmsg);
    return 0;

error:
    Py_XDECREF(errmsg);
    tok->done = E_ERROR;
    return -1;
}
1240
/* Peek forward to see whether the upcoming input spells out `test`
   followed by a non-identifier character (i.e. `test` appears as a
   complete word).  All characters read are pushed back before returning,
   so the tokenizer position is unchanged.  Returns 1 on a whole-word
   match, 0 otherwise. */
static int
lookahead(struct tok_state *tok, const char *test)
{
    const char *s = test;
    int c = tok_nextc(tok);
    /* Consume input while it keeps matching the expected word. */
    while (*s != 0 && c == *s) {
        s++;
        c = tok_nextc(tok);
    }
    /* A match counts only if the whole word was seen and the next
       character cannot continue an identifier. */
    int res = (*s == 0) && !is_potential_identifier_char(c);
    /* Undo everything we read: the terminator first, then the matched
       characters in reverse order. */
    tok_backup(tok, c);
    while (s != test) {
        tok_backup(tok, *--s);
    }
    return res;
}
1263
/* Validate the character `c` that immediately follows a numeric literal
   of the given `kind` ("decimal", "hexadecimal", ...).  Returns 1 when the
   literal ends acceptably; returns 0 after reporting a SyntaxWarning (for
   the keyword cases below) or a SyntaxError (for any other identifier
   character). */
static int
verify_end_of_number(struct tok_state *tok, int c, const char *kind)
{
    /* Emit a deprecation warning only if the numeric literal is immediately
     * followed by one of keywords which can occur after a numeric literal
     * in valid code: "and", "else", "for", "if", "in", "is" and "or".
     * It allows to gradually deprecate existing valid code without adding
     * warning before error in most cases of invalid numeric literal (which
     * would be confusing and break existing tests).
     * Raise a syntax error with slightly better message than plain
     * "invalid syntax" if the numeric literal is immediately followed by
     * other keyword or identifier.
     */
    int r = 0;
    if (c == 'a') {
        r = lookahead(tok, "nd");
    }
    else if (c == 'e') {
        r = lookahead(tok, "lse");
    }
    else if (c == 'f') {
        r = lookahead(tok, "or");
    }
    else if (c == 'i') {
        /* "if", "in" and "is" are all two letters, so one extra char of
           lookahead suffices here. */
        int c2 = tok_nextc(tok);
        if (c2 == 'f' || c2 == 'n' || c2 == 's') {
            r = 1;
        }
        tok_backup(tok, c2);
    }
    else if (c == 'o') {
        r = lookahead(tok, "r");
    }
    else if (c == 'n') {
        r = lookahead(tok, "ot");
    }
    if (r) {
        tok_backup(tok, c);
        if (parser_warn(tok, PyExc_SyntaxWarning,
                        "invalid %s literal", kind))
        {
            return 0;
        }
        /* Re-consume `c` so tokenization continues after the literal. */
        tok_nextc(tok);
    }
    else /* In future releases, only error will remain. */
    if (c < 128 && is_potential_identifier_char(c)) {
        tok_backup(tok, c);
        syntaxerror(tok, "invalid %s literal", kind);
        return 0;
    }
    return 1;
}
1317
/* Verify that the identifier follows PEP 3131.
   All identifier strings are guaranteed to be "ready" unicode objects.

   The token text is taken from tok->start..tok->cur.  Returns 1 when the
   identifier is valid; returns 0 after setting tok->done (E_DECODE for
   UTF-8 decode failures, E_ERROR otherwise) and, for invalid characters,
   raising a SyntaxError that points at the offending character. */
static int
verify_identifier(struct tok_state *tok)
{
    PyObject *s;
    if (tok->decoding_erred)
        return 0;
    s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
    if (s == NULL) {
        if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
            tok->done = E_DECODE;
        }
        else {
            tok->done = E_ERROR;
        }
        return 0;
    }
    /* Index of the first character that is not valid in an identifier,
       or -1 on error. */
    Py_ssize_t invalid = _PyUnicode_ScanIdentifier(s);
    if (invalid < 0) {
        Py_DECREF(s);
        tok->done = E_ERROR;
        return 0;
    }
    assert(PyUnicode_GET_LENGTH(s) > 0);
    if (invalid < PyUnicode_GET_LENGTH(s)) {
        Py_UCS4 ch = PyUnicode_READ_CHAR(s, invalid);
        if (invalid + 1 < PyUnicode_GET_LENGTH(s)) {
            /* Determine the offset in UTF-8 encoded input */
            Py_SETREF(s, PyUnicode_Substring(s, 0, invalid + 1));
            if (s != NULL) {
                Py_SETREF(s, PyUnicode_AsUTF8String(s));
            }
            if (s == NULL) {
                tok->done = E_ERROR;
                return 0;
            }
            /* Point tok->cur just past the bad character so the error
               location is accurate. */
            tok->cur = (char *)tok->start + PyBytes_GET_SIZE(s);
        }
        Py_DECREF(s);
        // PyUnicode_FromFormatV() does not support %X
        char hex[9];
        (void)PyOS_snprintf(hex, sizeof(hex), "%04X", ch);
        if (Py_UNICODE_ISPRINTABLE(ch)) {
            syntaxerror(tok, "invalid character '%c' (U+%s)", ch, hex);
        }
        else {
            syntaxerror(tok, "invalid non-printable character U+%s", hex);
        }
        return 0;
    }
    Py_DECREF(s);
    return 1;
}
1373
/* Consume the remaining digits of a decimal literal, allowing single
   underscores between digit groups (PEP 515).  Returns the first
   character after the literal, or 0 after raising a SyntaxError for a
   trailing/doubled underscore. */
static int
tok_decimal_tail(struct tok_state *tok)
{
    for (;;) {
        int ch;
        /* Swallow a run of digits. */
        do {
            ch = tok_nextc(tok);
        } while (isdigit(ch));
        if (ch != '_') {
            /* Not an underscore: the literal ends here. */
            return ch;
        }
        /* An underscore must be followed by at least one digit. */
        ch = tok_nextc(tok);
        if (!isdigit(ch)) {
            tok_backup(tok, ch);
            syntaxerror(tok, "invalid decimal literal");
            return 0;
        }
    }
}
1395
1396 /* Get next token, after space stripping etc. */
1397
1398 static inline int
1399 tok_continuation_line(struct tok_state *tok) {
1400 int c = tok_nextc(tok);
1401 if (c != '\n') {
1402 tok->done = E_LINECONT;
1403 return -1;
1404 }
1405 c = tok_nextc(tok);
1406 if (c == EOF) {
1407 tok->done = E_EOF;
1408 tok->cur = tok->inp;
1409 return -1;
1410 } else {
1411 tok_backup(tok, c);
1412 }
1413 return c;
1414 }
1415
/* Core tokenizer: scan and classify the next token from `tok`.
   On return, *p_start / *p_end delimit the token's text inside the
   tokenizer's buffer (NULL for tokens with no text, e.g. INDENT/DEDENT).
   Returns the token type, or ERRORTOKEN with tok->done describing the
   error.  Handles indentation tracking, comments/type comments, names
   (including async/await and string prefixes), numbers, strings, line
   continuations, and operators. */
static int
tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
{
    int c;
    int blankline, nonascii;

    *p_start = *p_end = NULL;
  nextline:
    tok->start = NULL;
    blankline = 0;

    /* Get indentation level */
    if (tok->atbol) {
        int col = 0;
        int altcol = 0;
        tok->atbol = 0;
        int cont_line_col = 0;
        for (;;) {
            c = tok_nextc(tok);
            if (c == ' ') {
                col++, altcol++;
            }
            else if (c == '\t') {
                /* `col` advances by the classic tab stops while `altcol`
                   uses ALTTABSIZE; comparing both later detects ambiguous
                   tab/space mixes. */
                col = (col / tok->tabsize + 1) * tok->tabsize;
                altcol = (altcol / ALTTABSIZE + 1) * ALTTABSIZE;
            }
            else if (c == '\014') {/* Control-L (formfeed) */
                col = altcol = 0; /* For Emacs users */
            }
            else if (c == '\\') {
                // Indentation cannot be split over multiple physical lines
                // using backslashes. This means that if we found a backslash
                // preceded by whitespace, **the first one we find** determines
                // the level of indentation of whatever comes next.
                cont_line_col = cont_line_col ? cont_line_col : col;
                if ((c = tok_continuation_line(tok)) == -1) {
                    return ERRORTOKEN;
                }
            }
            else {
                break;
            }
        }
        tok_backup(tok, c);
        if (c == '#' || c == '\n') {
            /* Lines with only whitespace and/or comments
               shouldn't affect the indentation and are
               not passed to the parser as NEWLINE tokens,
               except *totally* empty lines in interactive
               mode, which signal the end of a command group. */
            if (col == 0 && c == '\n' && tok->prompt != NULL) {
                blankline = 0; /* Let it through */
            }
            else if (tok->prompt != NULL && tok->lineno == 1) {
                /* In interactive mode, if the first line contains
                   only spaces and/or a comment, let it through. */
                blankline = 0;
                col = altcol = 0;
            }
            else {
                blankline = 1; /* Ignore completely */
            }
            /* We can't jump back right here since we still
               may need to skip to the end of a comment */
        }
        if (!blankline && tok->level == 0) {
            col = cont_line_col ? cont_line_col : col;
            altcol = cont_line_col ? cont_line_col : altcol;
            if (col == tok->indstack[tok->indent]) {
                /* No change */
                if (altcol != tok->altindstack[tok->indent]) {
                    return indenterror(tok);
                }
            }
            else if (col > tok->indstack[tok->indent]) {
                /* Indent -- always one */
                if (tok->indent+1 >= MAXINDENT) {
                    tok->done = E_TOODEEP;
                    tok->cur = tok->inp;
                    return ERRORTOKEN;
                }
                if (altcol <= tok->altindstack[tok->indent]) {
                    return indenterror(tok);
                }
                tok->pendin++;
                tok->indstack[++tok->indent] = col;
                tok->altindstack[tok->indent] = altcol;
            }
            else /* col < tok->indstack[tok->indent] */ {
                /* Dedent -- any number, must be consistent */
                while (tok->indent > 0 &&
                    col < tok->indstack[tok->indent]) {
                    tok->pendin--;
                    tok->indent--;
                }
                if (col != tok->indstack[tok->indent]) {
                    tok->done = E_DEDENT;
                    tok->cur = tok->inp;
                    return ERRORTOKEN;
                }
                if (altcol != tok->altindstack[tok->indent]) {
                    return indenterror(tok);
                }
            }
        }
    }

    tok->start = tok->cur;

    /* Return pending indents/dedents */
    /* tok->pendin counts INDENT (positive) or DEDENT (negative) tokens
       still owed to the parser; emit one per call. */
    if (tok->pendin != 0) {
        if (tok->pendin < 0) {
            tok->pendin++;
            return DEDENT;
        }
        else {
            tok->pendin--;
            return INDENT;
        }
    }

    /* Peek ahead at the next character */
    c = tok_nextc(tok);
    tok_backup(tok, c);
    /* Check if we are closing an async function */
    if (tok->async_def
        && !blankline
        /* Due to some implementation artifacts of type comments,
         * a TYPE_COMMENT at the start of a function won't set an
         * indentation level and it will produce a NEWLINE after it.
         * To avoid spuriously ending an async function due to this,
         * wait until we have some non-newline char in front of us. */
        && c != '\n'
        && tok->level == 0
        /* There was a NEWLINE after ASYNC DEF,
           so we're past the signature. */
        && tok->async_def_nl
        /* Current indentation level is less than where
           the async function was defined */
        && tok->async_def_indent >= tok->indent)
    {
        tok->async_def = 0;
        tok->async_def_indent = 0;
        tok->async_def_nl = 0;
    }

  again:
    tok->start = NULL;
    /* Skip spaces */
    do {
        c = tok_nextc(tok);
    } while (c == ' ' || c == '\t' || c == '\014');

    /* Set start of current token */
    tok->start = tok->cur == NULL ? NULL : tok->cur - 1;

    /* Skip comment, unless it's a type comment */
    if (c == '#') {
        const char *prefix, *p, *type_start;

        while (c != EOF && c != '\n') {
            c = tok_nextc(tok);
        }

        if (tok->type_comments) {
            p = tok->start;
            prefix = type_comment_prefix;
            /* A ' ' in the prefix matches zero or more spaces/tabs in the
               comment text (see type_comment_prefix above). */
            while (*prefix && p < tok->cur) {
                if (*prefix == ' ') {
                    while (*p == ' ' || *p == '\t') {
                        p++;
                    }
                } else if (*prefix == *p) {
                    p++;
                } else {
                    break;
                }

                prefix++;
            }

            /* This is a type comment if we matched all of type_comment_prefix. */
            if (!*prefix) {
                int is_type_ignore = 1;
                const char *ignore_end = p + 6;
                tok_backup(tok, c); /* don't eat the newline or EOF */

                type_start = p;

                /* A TYPE_IGNORE is "type: ignore" followed by the end of the token
                 * or anything ASCII and non-alphanumeric. */
                is_type_ignore = (
                    tok->cur >= ignore_end && memcmp(p, "ignore", 6) == 0
                    && !(tok->cur > ignore_end
                         && ((unsigned char)ignore_end[0] >= 128 || Py_ISALNUM(ignore_end[0]))));

                if (is_type_ignore) {
                    *p_start = ignore_end;
                    *p_end = tok->cur;

                    /* If this type ignore is the only thing on the line, consume the newline also. */
                    if (blankline) {
                        tok_nextc(tok);
                        tok->atbol = 1;
                    }
                    return TYPE_IGNORE;
                } else {
                    *p_start = type_start; /* after type_comment_prefix */
                    *p_end = tok->cur;
                    return TYPE_COMMENT;
                }
            }
        }
    }

    if (tok->done == E_INTERACT_STOP) {
        return ENDMARKER;
    }

    /* Check for EOF and errors now */
    if (c == EOF) {
        /* EOF inside open brackets is an error, not a clean end. */
        if (tok->level) {
            return ERRORTOKEN;
        }
        return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
    }

    /* Identifier (most frequent token!) */
    nonascii = 0;
    if (is_potential_identifier_start(c)) {
        /* Process the various legal combinations of b"", r"", u"", and f"". */
        int saw_b = 0, saw_r = 0, saw_u = 0, saw_f = 0;
        while (1) {
            if (!(saw_b || saw_u || saw_f) && (c == 'b' || c == 'B'))
                saw_b = 1;
            /* Since this is a backwards compatibility support literal we don't
               want to support it in arbitrary order like byte literals. */
            else if (!(saw_b || saw_u || saw_r || saw_f)
                     && (c == 'u'|| c == 'U')) {
                saw_u = 1;
            }
            /* ur"" and ru"" are not supported */
            else if (!(saw_r || saw_u) && (c == 'r' || c == 'R')) {
                saw_r = 1;
            }
            else if (!(saw_f || saw_b || saw_u) && (c == 'f' || c == 'F')) {
                saw_f = 1;
            }
            else {
                break;
            }
            c = tok_nextc(tok);
            if (c == '"' || c == '\'') {
                goto letter_quote;
            }
        }
        while (is_potential_identifier_char(c)) {
            if (c >= 128) {
                nonascii = 1;
            }
            c = tok_nextc(tok);
        }
        tok_backup(tok, c);
        /* Non-ASCII identifiers get full PEP 3131 validation. */
        if (nonascii && !verify_identifier(tok)) {
            return ERRORTOKEN;
        }

        *p_start = tok->start;
        *p_end = tok->cur;

        /* async/await parsing block. */
        if (tok->cur - tok->start == 5 && tok->start[0] == 'a') {
            /* May be an 'async' or 'await' token. For Python 3.7 or
               later we recognize them unconditionally. For Python
               3.5 or 3.6 we recognize 'async' in front of 'def', and
               either one inside of 'async def'. (Technically we
               shouldn't recognize these at all for 3.4 or earlier,
               but there's no *valid* Python 3.4 code that would be
               rejected, and async functions will be rejected in a
               later phase.) */
            if (!tok->async_hacks || tok->async_def) {
                /* Always recognize the keywords. */
                if (memcmp(tok->start, "async", 5) == 0) {
                    return ASYNC;
                }
                if (memcmp(tok->start, "await", 5) == 0) {
                    return AWAIT;
                }
            }
            else if (memcmp(tok->start, "async", 5) == 0) {
                /* The current token is 'async'.
                   Look ahead one token to see if that is 'def'. */

                struct tok_state ahead_tok;
                const char *ahead_tok_start = NULL;
                const char *ahead_tok_end = NULL;
                int ahead_tok_kind;

                /* Clone the tokenizer state so the lookahead does not
                   disturb the real scan position. */
                memcpy(&ahead_tok, tok, sizeof(ahead_tok));
                ahead_tok_kind = tok_get(&ahead_tok, &ahead_tok_start,
                                         &ahead_tok_end);

                if (ahead_tok_kind == NAME
                    && ahead_tok.cur - ahead_tok.start == 3
                    && memcmp(ahead_tok.start, "def", 3) == 0)
                {
                    /* The next token is going to be 'def', so instead of
                       returning a plain NAME token, return ASYNC. */
                    tok->async_def_indent = tok->indent;
                    tok->async_def = 1;
                    return ASYNC;
                }
            }
        }

        return NAME;
    }

    /* Newline */
    if (c == '\n') {
        tok->atbol = 1;
        if (blankline || tok->level > 0) {
            goto nextline;
        }
        *p_start = tok->start;
        *p_end = tok->cur - 1; /* Leave '\n' out of the string */
        tok->cont_line = 0;
        if (tok->async_def) {
            /* We're somewhere inside an 'async def' function, and
               we've encountered a NEWLINE after its signature. */
            tok->async_def_nl = 1;
        }
        return NEWLINE;
    }

    /* Period or number starting with period? */
    if (c == '.') {
        c = tok_nextc(tok);
        if (isdigit(c)) {
            goto fraction;
        } else if (c == '.') {
            c = tok_nextc(tok);
            if (c == '.') {
                *p_start = tok->start;
                *p_end = tok->cur;
                return ELLIPSIS;
            }
            else {
                tok_backup(tok, c);
            }
            tok_backup(tok, '.');
        }
        else {
            tok_backup(tok, c);
        }
        *p_start = tok->start;
        *p_end = tok->cur;
        return DOT;
    }

    /* Number */
    if (isdigit(c)) {
        if (c == '0') {
            /* Hex, octal or binary -- maybe. */
            c = tok_nextc(tok);
            if (c == 'x' || c == 'X') {
                /* Hex */
                c = tok_nextc(tok);
                do {
                    /* Underscores are allowed between digit groups. */
                    if (c == '_') {
                        c = tok_nextc(tok);
                    }
                    if (!isxdigit(c)) {
                        tok_backup(tok, c);
                        return syntaxerror(tok, "invalid hexadecimal literal");
                    }
                    do {
                        c = tok_nextc(tok);
                    } while (isxdigit(c));
                } while (c == '_');
                if (!verify_end_of_number(tok, c, "hexadecimal")) {
                    return ERRORTOKEN;
                }
            }
            else if (c == 'o' || c == 'O') {
                /* Octal */
                c = tok_nextc(tok);
                do {
                    if (c == '_') {
                        c = tok_nextc(tok);
                    }
                    if (c < '0' || c >= '8') {
                        if (isdigit(c)) {
                            return syntaxerror(tok,
                                    "invalid digit '%c' in octal literal", c);
                        }
                        else {
                            tok_backup(tok, c);
                            return syntaxerror(tok, "invalid octal literal");
                        }
                    }
                    do {
                        c = tok_nextc(tok);
                    } while ('0' <= c && c < '8');
                } while (c == '_');
                if (isdigit(c)) {
                    return syntaxerror(tok,
                            "invalid digit '%c' in octal literal", c);
                }
                if (!verify_end_of_number(tok, c, "octal")) {
                    return ERRORTOKEN;
                }
            }
            else if (c == 'b' || c == 'B') {
                /* Binary */
                c = tok_nextc(tok);
                do {
                    if (c == '_') {
                        c = tok_nextc(tok);
                    }
                    if (c != '0' && c != '1') {
                        if (isdigit(c)) {
                            return syntaxerror(tok,
                                    "invalid digit '%c' in binary literal", c);
                        }
                        else {
                            tok_backup(tok, c);
                            return syntaxerror(tok, "invalid binary literal");
                        }
                    }
                    do {
                        c = tok_nextc(tok);
                    } while (c == '0' || c == '1');
                } while (c == '_');
                if (isdigit(c)) {
                    return syntaxerror(tok,
                            "invalid digit '%c' in binary literal", c);
                }
                if (!verify_end_of_number(tok, c, "binary")) {
                    return ERRORTOKEN;
                }
            }
            else {
                int nonzero = 0;
                /* maybe old-style octal; c is first char of it */
                /* in any case, allow '0' as a literal */
                while (1) {
                    if (c == '_') {
                        c = tok_nextc(tok);
                        if (!isdigit(c)) {
                            tok_backup(tok, c);
                            return syntaxerror(tok, "invalid decimal literal");
                        }
                    }
                    if (c != '0') {
                        break;
                    }
                    c = tok_nextc(tok);
                }
                /* End of the run of leading zeros; used for the error
                   range reported below. */
                char* zeros_end = tok->cur;
                if (isdigit(c)) {
                    nonzero = 1;
                    c = tok_decimal_tail(tok);
                    if (c == 0) {
                        return ERRORTOKEN;
                    }
                }
                if (c == '.') {
                    c = tok_nextc(tok);
                    goto fraction;
                }
                else if (c == 'e' || c == 'E') {
                    goto exponent;
                }
                else if (c == 'j' || c == 'J') {
                    goto imaginary;
                }
                else if (nonzero) {
                    /* Old-style octal: now disallowed. */
                    tok_backup(tok, c);
                    return syntaxerror_known_range(
                        tok, (int)(tok->start + 1 - tok->line_start),
                        (int)(zeros_end - tok->line_start),
                        "leading zeros in decimal integer "
                        "literals are not permitted; "
                        "use an 0o prefix for octal integers");
                }
                if (!verify_end_of_number(tok, c, "decimal")) {
                    return ERRORTOKEN;
                }
            }
        }
        else {
            /* Decimal */
            c = tok_decimal_tail(tok);
            if (c == 0) {
                return ERRORTOKEN;
            }
            {
                /* Accept floating point numbers. */
                if (c == '.') {
                    c = tok_nextc(tok);
        fraction:
                    /* Fraction */
                    if (isdigit(c)) {
                        c = tok_decimal_tail(tok);
                        if (c == 0) {
                            return ERRORTOKEN;
                        }
                    }
                }
                if (c == 'e' || c == 'E') {
                    int e;
                  exponent:
                    e = c;
                    /* Exponent part */
                    c = tok_nextc(tok);
                    if (c == '+' || c == '-') {
                        c = tok_nextc(tok);
                        if (!isdigit(c)) {
                            tok_backup(tok, c);
                            return syntaxerror(tok, "invalid decimal literal");
                        }
                    } else if (!isdigit(c)) {
                        /* A bare 'e'/'E' is not an exponent: push it back
                           and end the number before it. */
                        tok_backup(tok, c);
                        if (!verify_end_of_number(tok, e, "decimal")) {
                            return ERRORTOKEN;
                        }
                        tok_backup(tok, e);
                        *p_start = tok->start;
                        *p_end = tok->cur;
                        return NUMBER;
                    }
                    c = tok_decimal_tail(tok);
                    if (c == 0) {
                        return ERRORTOKEN;
                    }
                }
                if (c == 'j' || c == 'J') {
                    /* Imaginary part */
        imaginary:
                    c = tok_nextc(tok);
                    if (!verify_end_of_number(tok, c, "imaginary")) {
                        return ERRORTOKEN;
                    }
                }
                else if (!verify_end_of_number(tok, c, "decimal")) {
                    return ERRORTOKEN;
                }
            }
        }
        tok_backup(tok, c);
        *p_start = tok->start;
        *p_end = tok->cur;
        return NUMBER;
    }

  letter_quote:
    /* String */
    if (c == '\'' || c == '"') {
        int quote = c;
        int quote_size = 1; /* 1 or 3 */
        int end_quote_size = 0;

        /* Nodes of type STRING, especially multi line strings
           must be handled differently in order to get both
           the starting line number and the column offset right.
           (cf. issue 16806) */
        tok->first_lineno = tok->lineno;
        tok->multi_line_start = tok->line_start;

        /* Find the quote size and start of string */
        c = tok_nextc(tok);
        if (c == quote) {
            c = tok_nextc(tok);
            if (c == quote) {
                quote_size = 3;
            }
            else {
                end_quote_size = 1; /* empty string found */
            }
        }
        if (c != quote) {
            tok_backup(tok, c);
        }

        /* Get rest of string */
        while (end_quote_size != quote_size) {
            c = tok_nextc(tok);
            if (tok->done == E_ERROR) {
                return ERRORTOKEN;
            }
            if (tok->done == E_DECODE) {
                break;
            }
            if (c == EOF || (quote_size == 1 && c == '\n')) {
                assert(tok->multi_line_start != NULL);
                // shift the tok_state's location into
                // the start of string, and report the error
                // from the initial quote character
                tok->cur = (char *)tok->start;
                tok->cur++;
                tok->line_start = tok->multi_line_start;
                int start = tok->lineno;
                tok->lineno = tok->first_lineno;
                if (quote_size == 3) {
                    syntaxerror(tok, "unterminated triple-quoted string literal"
                                     " (detected at line %d)", start);
                    if (c != '\n') {
                        tok->done = E_EOFS;
                    }
                    return ERRORTOKEN;
                }
                else {
                    syntaxerror(tok, "unterminated string literal (detected at"
                                     " line %d)", start);
                    if (c != '\n') {
                        tok->done = E_EOLS;
                    }
                    return ERRORTOKEN;
                }
            }
            if (c == quote) {
                end_quote_size += 1;
            }
            else {
                end_quote_size = 0;
                if (c == '\\') {
                    tok_nextc(tok); /* skip escaped char */
                }
            }
        }

        *p_start = tok->start;
        *p_end = tok->cur;
        return STRING;
    }

    /* Line continuation */
    if (c == '\\') {
        if ((c = tok_continuation_line(tok)) == -1) {
            return ERRORTOKEN;
        }
        tok->cont_line = 1;
        goto again; /* Read next line */
    }

    /* Check for two-character token */
    {
        int c2 = tok_nextc(tok);
        int token = PyToken_TwoChars(c, c2);
        if (token != OP) {
            /* A two-char operator may extend to three chars (e.g. **=). */
            int c3 = tok_nextc(tok);
            int token3 = PyToken_ThreeChars(c, c2, c3);
            if (token3 != OP) {
                token = token3;
            }
            else {
                tok_backup(tok, c3);
            }
            *p_start = tok->start;
            *p_end = tok->cur;
            return token;
        }
        tok_backup(tok, c2);
    }

    /* Keep track of parentheses nesting level */
    switch (c) {
    case '(':
    case '[':
    case '{':
        if (tok->level >= MAXLEVEL) {
            return syntaxerror(tok, "too many nested parentheses");
        }
        /* Remember where the bracket was opened for error reporting. */
        tok->parenstack[tok->level] = c;
        tok->parenlinenostack[tok->level] = tok->lineno;
        tok->parencolstack[tok->level] = (int)(tok->start - tok->line_start);
        tok->level++;
        break;
    case ')':
    case ']':
    case '}':
        if (!tok->level) {
            return syntaxerror(tok, "unmatched '%c'", c);
        }
        tok->level--;
        int opening = tok->parenstack[tok->level];
        if (!((opening == '(' && c == ')') ||
              (opening == '[' && c == ']') ||
              (opening == '{' && c == '}')))
        {
            if (tok->parenlinenostack[tok->level] != tok->lineno) {
                return syntaxerror(tok,
                        "closing parenthesis '%c' does not match "
                        "opening parenthesis '%c' on line %d",
                        c, opening, tok->parenlinenostack[tok->level]);
            }
            else {
                return syntaxerror(tok,
                        "closing parenthesis '%c' does not match "
                        "opening parenthesis '%c'",
                        c, opening);
            }
        }
        break;
    }

    if (!Py_UNICODE_ISPRINTABLE(c)) {
        char hex[9];
        (void)PyOS_snprintf(hex, sizeof(hex), "%04X", c);
        return syntaxerror(tok, "invalid non-printable character U+%s", hex);
    }

    /* Punctuation character */
    *p_start = tok->start;
    *p_end = tok->cur;
    return PyToken_OneChar(c);
}
2135
2136 int
2137 _PyTokenizer_Get(struct tok_state *tok,
2138 const char **p_start, const char **p_end)
2139 {
2140 int result = tok_get(tok, p_start, p_end);
2141 if (tok->decoding_erred) {
2142 result = ERRORTOKEN;
2143 tok->done = E_DECODE;
2144 }
2145 return result;
2146 }
2147
#if defined(__wasi__) || (defined(__EMSCRIPTEN__) && (__EMSCRIPTEN_major__ >= 3))
// fdopen() with borrowed fd. WASI does not provide dup() and Emscripten's
// dup() emulation with open() is slow.
// `borrowed` lets the int fd travel through fopencookie()'s void* cookie
// without a cast that would violate pointer conversion rules.
typedef union {
    void *cookie;
    int fd;
} borrowed;

// Read callback for fopencookie(): forwards to read() on the borrowed fd.
static ssize_t
borrow_read(void *cookie, char *buf, size_t size)
{
    borrowed b = {.cookie = cookie};
    return read(b.fd, (void *)buf, size);
}

// Wrap `fd` in a read-only FILE* without taking ownership of it.
static FILE *
fdopen_borrow(int fd) {
    // supports only reading. seek fails. close and write are no-ops.
    cookie_io_functions_t io_cb = {borrow_read, NULL, NULL, NULL};
    borrowed b = {.fd = fd};
    return fopencookie(b.cookie, "r", io_cb);
}
#else
// Portable variant: duplicate the fd so fclose() on the returned FILE*
// does not close the caller's descriptor.
static FILE *
fdopen_borrow(int fd) {
    fd = _Py_dup(fd);
    if (fd < 0) {
        return NULL;
    }
    return fdopen(fd, "r");
}
#endif
2180
2181 /* Get the encoding of a Python file. Check for the coding cookie and check if
2182 the file starts with a BOM.
2183
2184 _PyTokenizer_FindEncodingFilename() returns NULL when it can't find the
2185 encoding in the first or second line of the file (in which case the encoding
2186 should be assumed to be UTF-8).
2187
2188 The char* returned is malloc'ed via PyMem_Malloc() and thus must be freed
2189 by the caller. */
2190
2191 char *
2192 _PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
2193 {
2194 struct tok_state *tok;
2195 FILE *fp;
2196 const char *p_start = NULL;
2197 const char *p_end = NULL;
2198 char *encoding = NULL;
2199
2200 fp = fdopen_borrow(fd);
2201 if (fp == NULL) {
2202 return NULL;
2203 }
2204 tok = _PyTokenizer_FromFile(fp, NULL, NULL, NULL);
2205 if (tok == NULL) {
2206 fclose(fp);
2207 return NULL;
2208 }
2209 if (filename != NULL) {
2210 Py_INCREF(filename);
2211 tok->filename = filename;
2212 }
2213 else {
2214 tok->filename = PyUnicode_FromString("<string>");
2215 if (tok->filename == NULL) {
2216 fclose(fp);
2217 _PyTokenizer_Free(tok);
2218 return encoding;
2219 }
2220 }
2221 // We don't want to report warnings here because it could cause infinite recursion
2222 // if fetching the encoding shows a warning.
2223 tok->report_warnings = 0;
2224 while (tok->lineno < 2 && tok->done == E_OK) {
2225 _PyTokenizer_Get(tok, &p_start, &p_end);
2226 }
2227 fclose(fp);
2228 if (tok->encoding) {
2229 encoding = (char *)PyMem_Malloc(strlen(tok->encoding) + 1);
2230 if (encoding) {
2231 strcpy(encoding, tok->encoding);
2232 }
2233 }
2234 _PyTokenizer_Free(tok);
2235 return encoding;
2236 }
2237
#ifdef Py_DEBUG
/* Debug helper: print a token's name to stderr and, for tokens that carry
   source text (NAME/NUMBER/STRING/OP), the text itself in parentheses. */
void
tok_dump(int type, char *start, char *end)
{
    fprintf(stderr, "%s", _PyParser_TokenNames[type]);
    switch (type) {
    case NAME:
    case NUMBER:
    case STRING:
    case OP:
        fprintf(stderr, "(%.*s)", (int)(end - start), start);
        break;
    default:
        break;
    }
}
#endif // Py_DEBUG