1
2 /* Tokenizer implementation */
3
4 #define PY_SSIZE_T_CLEAN
5 #include "Python.h"
6 #include "pycore_call.h" // _PyObject_CallNoArgs()
7
8 #include <ctype.h>
9 #include <assert.h>
10
11 #include "tokenizer.h"
12 #include "errcode.h"
13
14 #include "unicodeobject.h"
15 #include "bytesobject.h"
16 #include "fileobject.h"
17 #include "abstract.h"
18
19 /* Alternate tab spacing */
20 #define ALTTABSIZE 1
21
22 #define is_potential_identifier_start(c) (\
23 (c >= 'a' && c <= 'z')\
24 || (c >= 'A' && c <= 'Z')\
25 || c == '_'\
26 || (c >= 128))
27
28 #define is_potential_identifier_char(c) (\
29 (c >= 'a' && c <= 'z')\
30 || (c >= 'A' && c <= 'Z')\
31 || (c >= '0' && c <= '9')\
32 || c == '_'\
33 || (c >= 128))
34
35
36 /* Don't ever change this -- it would break the portability of Python code */
37 #define TABSIZE 8
38
39 /* Forward */
40 static struct tok_state *tok_new(void);
41 static int tok_nextc(struct tok_state *tok);
42 static void tok_backup(struct tok_state *tok, int c);
43 static int syntaxerror(struct tok_state *tok, const char *format, ...);
44
45 /* Spaces in this constant are treated as "zero or more spaces or tabs" when
46 tokenizing. */
47 static const char* type_comment_prefix = "# type: ";
48
49 /* Create and initialize a new tok_state structure */
50
51 static struct tok_state *
52 tok_new(void)
53 {
54 struct tok_state *tok = (struct tok_state *)PyMem_Malloc(
55 sizeof(struct tok_state));
56 if (tok == NULL)
57 return NULL;
58 tok->buf = tok->cur = tok->inp = NULL;
59 tok->fp_interactive = 0;
60 tok->interactive_src_start = NULL;
61 tok->interactive_src_end = NULL;
62 tok->start = NULL;
63 tok->end = NULL;
64 tok->done = E_OK;
65 tok->fp = NULL;
66 tok->input = NULL;
67 tok->tabsize = TABSIZE;
68 tok->indent = 0;
69 tok->indstack[0] = 0;
70 tok->atbol = 1;
71 tok->pendin = 0;
72 tok->prompt = tok->nextprompt = NULL;
73 tok->lineno = 0;
74 tok->level = 0;
75 tok->altindstack[0] = 0;
76 tok->decoding_state = STATE_INIT;
77 tok->decoding_erred = 0;
78 tok->enc = NULL;
79 tok->encoding = NULL;
80 tok->cont_line = 0;
81 tok->filename = NULL;
82 tok->decoding_readline = NULL;
83 tok->decoding_buffer = NULL;
84 tok->type_comments = 0;
85 tok->async_hacks = 0;
86 tok->async_def = 0;
87 tok->async_def_indent = 0;
88 tok->async_def_nl = 0;
89 tok->interactive_underflow = IUNDERFLOW_NORMAL;
90 tok->str = NULL;
91 tok->report_warnings = 1;
92 return tok;
93 }
94
95 static char *
96 new_string(const char *s, Py_ssize_t len, struct tok_state *tok)
97 {
98 char* result = (char *)PyMem_Malloc(len + 1);
99 if (!result) {
100 tok->done = E_NOMEM;
101 return NULL;
102 }
103 memcpy(result, s, len);
104 result[len] = '\0';
105 return result;
106 }
107
108 static char *
109 error_ret(struct tok_state *tok) /* XXX */
110 {
111 tok->decoding_erred = 1;
112 if (tok->fp != NULL && tok->buf != NULL) /* see _PyTokenizer_Free */
113 PyMem_Free(tok->buf);
114 tok->buf = tok->cur = tok->inp = NULL;
115 tok->start = NULL;
116 tok->end = NULL;
117 tok->done = E_DECODE;
118 return NULL; /* as if it were EOF */
119 }
120
121
/* Normalize an encoding name: return "utf-8" or "iso-8859-1" for the
   common aliases (case-insensitive, '_' treated as '-'), else return S
   itself unchanged.  Only the first 12 characters are examined. */
static const char *
get_normal_name(const char *s)  /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;
    for (i = 0; i < 12; i++) {
        /* Cast through unsigned char: passing a negative plain char to
           tolower() is undefined behavior (CERT STR37-C). */
        unsigned char c = (unsigned char)s[i];
        if (c == '\0') {
            break;
        }
        else if (c == '_') {
            buf[i] = '-';
        }
        else {
            buf[i] = (char)tolower(c);
        }
    }
    buf[i] = '\0';
    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0) {
        return "utf-8";
    }
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0) {
        return "iso-8859-1";
    }
    else {
        return s;
    }
}
150
/* Look for a PEP 263 coding spec ("coding[:=] NAME") in the line S of
   SIZE bytes.  On success return 1 with *spec set to a malloc'ed,
   normalized encoding name (or NULL when no spec is present).
   Return 0 only on memory error (tok->done set by new_string). */

static int
get_coding_spec(const char *s, char **spec, Py_ssize_t size, struct tok_state *tok)
{
    Py_ssize_t i;
    *spec = NULL;
    /* Coding spec must be in a comment, and that comment must be
     * the only statement on the source code line. */
    for (i = 0; i < size - 6; i++) {
        if (s[i] == '#')
            break;
        if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
            return 1;   /* code before any comment: no spec on this line */
    }
    for (; i < size - 6; i++) { /* XXX inefficient search */
        const char* t = s + i;
        if (memcmp(t, "coding", 6) == 0) {
            const char* begin = NULL;
            t += 6;
            if (t[0] != ':' && t[0] != '=')
                continue;
            /* Skip spaces and tabs after the ':' or '='. */
            do {
                t++;
            } while (t[0] == ' ' || t[0] == '\t');

            begin = t;
            /* An encoding name: alphanumerics plus '-', '_' and '.'. */
            while (Py_ISALNUM(t[0]) ||
                   t[0] == '-' || t[0] == '_' || t[0] == '.')
                t++;

            if (begin < t) {
                char* r = new_string(begin, t - begin, tok);
                const char* q;
                if (!r)
                    return 0;
                /* Map aliases like "latin_1" to their canonical name. */
                q = get_normal_name(r);
                if (r != q) {
                    PyMem_Free(r);
                    r = new_string(q, strlen(q), tok);
                    if (!r)
                        return 0;
                }
                *spec = r;
                break;
            }
        }
    }
    return 1;
}
201
/* Check whether the line contains a coding spec. If it does,
   invoke the set_readline function for the new encoding.
   This function receives the tok_state and the new encoding.
   Return 1 on success, 0 on failure. */

static int
check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
                  int set_readline(struct tok_state *, const char *))
{
    char *cs;
    if (tok->cont_line) {
        /* It's a continuation line, so it can't be a coding spec. */
        tok->decoding_state = STATE_NORMAL;
        return 1;
    }
    if (!get_coding_spec(line, &cs, size, tok)) {
        return 0;   /* memory error; tok->done already set */
    }
    if (!cs) {
        /* No spec on this line: decide whether to keep looking. */
        Py_ssize_t i;
        for (i = 0; i < size; i++) {
            if (line[i] == '#' || line[i] == '\n' || line[i] == '\r')
                break;
            if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') {
                /* Stop checking coding spec after a line containing
                 * anything except a comment. */
                tok->decoding_state = STATE_NORMAL;
                break;
            }
        }
        return 1;
    }
    tok->decoding_state = STATE_NORMAL;
    if (tok->encoding == NULL) {
        assert(tok->decoding_readline == NULL);
        /* For a non-UTF-8 encoding, switch the reader to a codec stream. */
        if (strcmp(cs, "utf-8") != 0 && !set_readline(tok, cs)) {
            error_ret(tok);
            PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
            PyMem_Free(cs);
            return 0;
        }
        tok->encoding = cs;     /* ownership of cs transfers to tok */
    } else {                /* then, compare cs with BOM */
        /* A BOM already fixed the encoding; the declared spec must agree. */
        if (strcmp(tok->encoding, cs) != 0) {
            error_ret(tok);
            PyErr_Format(PyExc_SyntaxError,
                         "encoding problem: %s with BOM", cs);
            PyMem_Free(cs);
            return 0;
        }
        PyMem_Free(cs);
    }
    return 1;
}
256
/* See whether the file starts with a UTF-8 BOM (EF BB BF).  If it
   does, consume it and record "utf-8" as the encoding; otherwise push
   the probed bytes back.  Return 1 on success, 0 on memory error. */

static int
check_bom(int get_char(struct tok_state *),
          void unget_char(int, struct tok_state *),
          int set_readline(struct tok_state *, const char *),
          struct tok_state *tok)
{
    int ch1, ch2, ch3;
    ch1 = get_char(tok);
    tok->decoding_state = STATE_SEEK_CODING;
    if (ch1 == EOF) {
        return 1;
    } else if (ch1 == 0xEF) {
        /* Possible BOM: check the remaining two bytes, pushing back
           (in reverse order) everything read if they don't match. */
        ch2 = get_char(tok);
        if (ch2 != 0xBB) {
            unget_char(ch2, tok);
            unget_char(ch1, tok);
            return 1;
        }
        ch3 = get_char(tok);
        if (ch3 != 0xBF) {
            unget_char(ch3, tok);
            unget_char(ch2, tok);
            unget_char(ch1, tok);
            return 1;
        }
    } else {
        unget_char(ch1, tok);
        return 1;
    }
    /* A UTF-8 BOM was consumed: record the encoding explicitly. */
    if (tok->encoding != NULL)
        PyMem_Free(tok->encoding);
    tok->encoding = new_string("utf-8", 5, tok);
    if (!tok->encoding)
        return 0;
    /* No need to set_readline: input is already utf-8 */
    return 1;
}
298
/* Append LINE to the accumulated interactive source buffer
   (tok->interactive_src_start .. interactive_src_end), ensuring the
   stored copy ends with '\n'.  Returns 0 on success, -1 on memory
   error (tok->done = E_NOMEM). */
static int
tok_concatenate_interactive_new_line(struct tok_state *tok, const char *line) {
    assert(tok->fp_interactive);

    if (!line) {
        return 0;
    }

    Py_ssize_t current_size = tok->interactive_src_end - tok->interactive_src_start;
    Py_ssize_t line_size = strlen(line);
    /* For an empty line this reads line[0], i.e. the terminating '\0'. */
    char last_char = line[line_size > 0 ? line_size - 1 : line_size];
    if (last_char != '\n') {
        line_size += 1;     /* reserve room for the '\n' added below */
    }
    char* new_str = tok->interactive_src_start;

    new_str = PyMem_Realloc(new_str, current_size + line_size + 1);
    if (!new_str) {
        /* Realloc failed: the old block is still live, free it here. */
        if (tok->interactive_src_start) {
            PyMem_Free(tok->interactive_src_start);
        }
        tok->interactive_src_start = NULL;
        tok->interactive_src_end = NULL;
        tok->done = E_NOMEM;
        return -1;
    }
    strcpy(new_str + current_size, line);
    if (last_char != '\n') {
        /* Last line does not end in \n, fake one */
        new_str[current_size + line_size - 1] = '\n';
        new_str[current_size + line_size] = '\0';
    }
    tok->interactive_src_start = new_str;
    tok->interactive_src_end = new_str + current_size + line_size;
    return 0;
}
335
336
337 /* Read a line of text from TOK into S, using the stream in TOK.
338 Return NULL on failure, else S.
339
340 On entry, tok->decoding_buffer will be one of:
341 1) NULL: need to call tok->decoding_readline to get a new line
342 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
343 stored the result in tok->decoding_buffer
344 3) PyByteArrayObject *: previous call to tok_readline_recode did not have enough room
345 (in the s buffer) to copy entire contents of the line read
346 by tok->decoding_readline. tok->decoding_buffer has the overflow.
347 In this case, tok_readline_recode is called in a loop (with an expanded buffer)
348 until the buffer ends with a '\n' (or until the end of the file is
349 reached): see tok_nextc and its calls to tok_reserve_buf.
350 */
351
/* Ensure the token buffer has at least SIZE free bytes past tok->inp,
   growing geometrically and re-basing every pointer that points into
   the buffer.  Returns 1 on success, 0 on memory error (E_NOMEM). */
static int
tok_reserve_buf(struct tok_state *tok, Py_ssize_t size)
{
    Py_ssize_t cur = tok->cur - tok->buf;
    Py_ssize_t oldsize = tok->inp - tok->buf;
    /* Grow by at least half the current size to amortize reallocs. */
    Py_ssize_t newsize = oldsize + Py_MAX(size, oldsize >> 1);
    if (newsize > tok->end - tok->buf) {
        char *newbuf = tok->buf;
        /* Save offsets: the pointers must be recomputed after realloc
           may have moved the block (-1 encodes a NULL tok->start). */
        Py_ssize_t start = tok->start == NULL ? -1 : tok->start - tok->buf;
        Py_ssize_t line_start = tok->start == NULL ? -1 : tok->line_start - tok->buf;
        Py_ssize_t multi_line_start = tok->multi_line_start - tok->buf;
        newbuf = (char *)PyMem_Realloc(newbuf, newsize);
        if (newbuf == NULL) {
            tok->done = E_NOMEM;
            return 0;
        }
        tok->buf = newbuf;
        tok->cur = tok->buf + cur;
        tok->inp = tok->buf + oldsize;
        tok->end = tok->buf + newsize;
        tok->start = start < 0 ? NULL : tok->buf + start;
        tok->line_start = line_start < 0 ? NULL : tok->buf + line_start;
        tok->multi_line_start = multi_line_start < 0 ? NULL : tok->buf + multi_line_start;
    }
    return 1;
}
378
/* Report whether the first SIZE bytes at STR contain a NUL byte. */
static inline int
contains_null_bytes(const char* str, size_t size) {
    const void *hit = memchr(str, '\0', size);
    return hit != NULL;
}
383
/* Read one line through tok->decoding_readline (or consume a pending
   tok->decoding_buffer), copy its UTF-8 form into the token buffer,
   and keep the interactive source copy in sync.  Returns 1 on
   success, 0 on error. */
static int
tok_readline_recode(struct tok_state *tok) {
    PyObject *line;
    const char *buf;
    Py_ssize_t buflen;
    line = tok->decoding_buffer;
    if (line == NULL) {
        line = PyObject_CallNoArgs(tok->decoding_readline);
        if (line == NULL) {
            error_ret(tok);
            goto error;
        }
    }
    else {
        /* Consume the line buffered by a previous call. */
        tok->decoding_buffer = NULL;
    }
    buf = PyUnicode_AsUTF8AndSize(line, &buflen);
    if (buf == NULL) {
        error_ret(tok);
        goto error;
    }
    // Make room for the null terminator *and* potentially
    // an extra newline character that we may need to artificially
    // add.
    size_t buffer_size = buflen + 2;
    if (!tok_reserve_buf(tok, buffer_size)) {
        goto error;
    }
    memcpy(tok->inp, buf, buflen);
    tok->inp += buflen;
    *tok->inp = '\0';
    if (tok->fp_interactive &&
        tok_concatenate_interactive_new_line(tok, buf) == -1) {
        goto error;
    }
    Py_DECREF(line);
    return 1;
error:
    Py_XDECREF(line);
    return 0;
}
425
426 /* Set the readline function for TOK to a StreamReader's
427 readline function. The StreamReader is named ENC.
428
429 This function is called from check_bom and check_coding_spec.
430
431 ENC is usually identical to the future value of tok->encoding,
432 except for the (currently unsupported) case of UTF-16.
433
434 Return 1 on success, 0 on failure. */
435
static int
fp_setreadl(struct tok_state *tok, const char* enc)
{
    PyObject *readline, *io, *stream;
    int fd;
    long pos;

    fd = fileno(tok->fp);
    /* Due to buffering the file offset for fd can be different from the file
     * position of tok->fp. If tok->fp was opened in text mode on Windows,
     * its file position counts CRLF as one char and can't be directly mapped
     * to the file offset for fd. Instead we step back one byte and read to
     * the end of line.*/
    pos = ftell(tok->fp);
    if (pos == -1 ||
        lseek(fd, (off_t)(pos > 0 ? pos - 1 : pos), SEEK_SET) == (off_t)-1) {
        PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL);
        return 0;
    }

    io = PyImport_ImportModule("io");
    if (io == NULL) {
        return 0;
    }
    /* io.open(fd, "r", -1, enc, None, None, False): a text stream that
       decodes with ENC and does not close fd when destroyed. */
    stream = _PyObject_CallMethod(io, &_Py_ID(open), "isisOOO",
                                  fd, "r", -1, enc, Py_None, Py_None, Py_False);
    Py_DECREF(io);
    if (stream == NULL) {
        return 0;
    }

    readline = PyObject_GetAttr(stream, &_Py_ID(readline));
    Py_DECREF(stream);
    if (readline == NULL) {
        return 0;
    }
    /* Replace any previously installed readline (drops the old ref). */
    Py_XSETREF(tok->decoding_readline, readline);

    if (pos > 0) {
        /* Discard the partial line produced by stepping back one byte. */
        PyObject *bufobj = _PyObject_CallNoArgs(readline);
        if (bufobj == NULL) {
            return 0;
        }
        Py_DECREF(bufobj);
    }

    return 1;
}
484
/* Fetch the next byte from TOK's stdio stream (EOF on end or error). */

static int fp_getc(struct tok_state *tok) {
    return getc(tok->fp);
}
490
/* Push byte C back onto TOK's stdio stream (one byte of pushback only). */

static void fp_ungetc(int c, struct tok_state *tok) {
    ungetc(c, tok->fp);
}
496
/* Check whether the bytes at S begin a complete, valid UTF-8 sequence.
   Return the sequence length (1-4) if so, 0 if not.  The rejected
   special cases mirror stringlib/codecs.h:utf8_decode: overlong forms,
   UTF-16 surrogates D800-DFFF, and code points above 10FFFF. */
static int
valid_utf8(const unsigned char* s)
{
    unsigned char lead = *s;
    int n_continuation;

    if (lead < 0x80) {
        return 1;                       /* single-byte (ASCII) */
    }
    if (lead < 0xC2) {
        /* 0x80-0xBF: stray continuation byte;
           0xC0-0xC1: overlong encoding of 0000-007F */
        return 0;
    }
    if (lead < 0xE0) {
        n_continuation = 1;             /* 0080-07FF */
    }
    else if (lead < 0xF0) {
        /* 0800-FFFF */
        if (lead == 0xE0 && s[1] < 0xA0) {
            return 0;                   /* overlong: fake 0000-07FF */
        }
        if (lead == 0xED && s[1] >= 0xA0) {
            /* \xED\xA0\x80-\xED\xBF\xBF would decode to surrogates
               D800-DFFF, which are not valid UTF-8.
               See https://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
            return 0;
        }
        n_continuation = 2;
    }
    else if (lead < 0xF5) {
        /* 10000-10FFFF */
        if (s[1] < 0x90 ? lead == 0xF0 : lead == 0xF4) {
            /* \xF0\x80..-\xF0\x8F..: overlong (fake 0000-FFFF);
               \xF4\x90..-:           110000- overflow */
            return 0;
        }
        n_continuation = 3;
    }
    else {
        return 0;                       /* 0xF5-0xFF: invalid lead byte */
    }

    for (int k = n_continuation; k > 0; k--) {
        if (s[k] < 0x80 || s[k] >= 0xC0) {
            return 0;                   /* not a continuation byte */
        }
    }
    return n_continuation + 1;
}
558
/* Verify that LINE contains only valid UTF-8.  On failure, raise a
   SyntaxError naming the first offending byte and return 0. */
static int
ensure_utf8(char *line, struct tok_state *tok)
{
    int badchar = 0;
    unsigned char *c;
    int length;
    /* Walk sequence by sequence; valid_utf8 returns 0 on a bad start. */
    for (c = (unsigned char *)line; *c; c += length) {
        if (!(length = valid_utf8(c))) {
            badchar = *c;
            break;
        }
    }
    if (badchar) {
        PyErr_Format(PyExc_SyntaxError,
                     "Non-UTF-8 code starting with '\\x%.2x' "
                     "in file %U on line %i, "
                     "but no encoding declared; "
                     "see https://peps.python.org/pep-0263/ for details",
                     badchar, tok->filename, tok->lineno);
        return 0;
    }
    return 1;
}
582
/* Fetch a byte from TOK, using the string buffer. */

static int
buf_getc(struct tok_state *tok) {
    /* Py_CHARMASK prevents sign-extension of bytes >= 0x80. */
    return Py_CHARMASK(*tok->str++);
}
589
/* Unfetch a byte from TOK, using the string buffer.  Only valid for
   pushing back the byte that was just read. */

static void
buf_ungetc(int c, struct tok_state *tok) {
    tok->str--;
    assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
}
597
/* Set the readline function for TOK to ENC. For the string-based
   tokenizer, this means to just record the encoding; the actual
   recoding happens later in decode_str. */

static int
buf_setreadl(struct tok_state *tok, const char* enc) {
    tok->enc = enc;
    return 1;
}
606
607 /* Return a UTF-8 encoding Python string object from the
608 C byte string STR, which is encoded with ENC. */
609
610 static PyObject *
611 translate_into_utf8(const char* str, const char* enc) {
612 PyObject *utf8;
613 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
614 if (buf == NULL)
615 return NULL;
616 utf8 = PyUnicode_AsUTF8String(buf);
617 Py_DECREF(buf);
618 return utf8;
619 }
620
621
/* Return a malloc'ed copy of S with "\r\n" and bare "\r" normalized
   to "\n".  If EXEC_INPUT, guarantee the result ends with a newline.
   Returns NULL on memory error (tok->done = E_NOMEM). */
static char *
translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
    int skip_next_lf = 0;
    /* +2: room for a possibly appended '\n' plus the terminator. */
    size_t needed_length = strlen(s) + 2, final_length;
    char *buf, *current;
    char c = '\0';
    buf = PyMem_Malloc(needed_length);
    if (buf == NULL) {
        tok->done = E_NOMEM;
        return NULL;
    }
    for (current = buf; *s; s++, current++) {
        c = *s;
        if (skip_next_lf) {
            /* Previous char was '\r': swallow a directly following '\n'. */
            skip_next_lf = 0;
            if (c == '\n') {
                c = *++s;
                if (!c)
                    break;
            }
        }
        if (c == '\r') {
            skip_next_lf = 1;
            c = '\n';
        }
        *current = c;
    }
    /* If this is exec input, add a newline to the end of the string if
       there isn't one already. */
    if (exec_input && c != '\n') {
        *current = '\n';
        current++;
    }
    *current = '\0';
    final_length = current - buf + 1;
    if (final_length < needed_length && final_length) {
        /* should never fail: shrinking realloc */
        char* result = PyMem_Realloc(buf, final_length);
        if (result == NULL) {
            PyMem_Free(buf);
        }
        buf = result;
    }
    return buf;
}
667
/* Decode a byte string INPUT for use as the buffer of TOK: normalize
   newlines, honor a BOM and any PEP 263 coding spec on the first two
   lines, and transcode to UTF-8 if needed.  The translated original is
   recorded in tok->input; returns the decoded string or NULL on error. */

static char *
decode_str(const char *input, int single, struct tok_state *tok)
{
    PyObject* utf8 = NULL;
    char *str;
    const char *s;
    const char *newl[2] = {NULL, NULL};
    int lineno = 0;
    tok->input = str = translate_newlines(input, single, tok);
    if (str == NULL)
        return NULL;
    tok->enc = NULL;
    tok->str = str;
    if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
        return error_ret(tok);
    str = tok->str; /* string after BOM if any */
    assert(str);
    if (tok->enc != NULL) {
        /* A BOM selected an encoding: recode the whole buffer to UTF-8. */
        utf8 = translate_into_utf8(str, tok->enc);
        if (utf8 == NULL)
            return error_ret(tok);
        str = PyBytes_AsString(utf8);
    }
    /* Locate the ends of the first two lines. */
    for (s = str;; s++) {
        if (*s == '\0') break;
        else if (*s == '\n') {
            assert(lineno < 2);
            newl[lineno] = s;
            lineno++;
            if (lineno == 2) break;
        }
    }
    tok->enc = NULL;
    /* need to check line 1 and 2 separately since check_coding_spec
       assumes a single line as input */
    if (newl[0]) {
        if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl)) {
            return NULL;
        }
        if (tok->enc == NULL && tok->decoding_state != STATE_NORMAL && newl[1]) {
            if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
                                   tok, buf_setreadl))
                return NULL;
        }
    }
    if (tok->enc != NULL) {
        assert(utf8 == NULL);
        utf8 = translate_into_utf8(str, tok->enc);
        if (utf8 == NULL)
            return error_ret(tok);
        str = PyBytes_AS_STRING(utf8);
    }
    assert(tok->decoding_buffer == NULL);
    /* Keep the bytes object alive: str points into its storage. */
    tok->decoding_buffer = utf8; /* CAUTION */
    return str;
}
728
729 /* Set up tokenizer for string */
730
731 struct tok_state *
732 _PyTokenizer_FromString(const char *str, int exec_input)
733 {
734 struct tok_state *tok = tok_new();
735 char *decoded;
736
737 if (tok == NULL)
738 return NULL;
739 decoded = decode_str(str, exec_input, tok);
740 if (decoded == NULL) {
741 _PyTokenizer_Free(tok);
742 return NULL;
743 }
744
745 tok->buf = tok->cur = tok->inp = decoded;
746 tok->end = decoded;
747 return tok;
748 }
749
750 /* Set up tokenizer for UTF-8 string */
751
752 struct tok_state *
753 _PyTokenizer_FromUTF8(const char *str, int exec_input)
754 {
755 struct tok_state *tok = tok_new();
756 char *translated;
757 if (tok == NULL)
758 return NULL;
759 tok->input = translated = translate_newlines(str, exec_input, tok);
760 if (translated == NULL) {
761 _PyTokenizer_Free(tok);
762 return NULL;
763 }
764 tok->decoding_state = STATE_NORMAL;
765 tok->enc = NULL;
766 tok->str = translated;
767 tok->encoding = new_string("utf-8", 5, tok);
768 if (!tok->encoding) {
769 _PyTokenizer_Free(tok);
770 return NULL;
771 }
772
773 tok->buf = tok->cur = tok->inp = translated;
774 tok->end = translated;
775 return tok;
776 }
777
778 /* Set up tokenizer for file */
779
780 struct tok_state *
781 _PyTokenizer_FromFile(FILE *fp, const char* enc,
782 const char *ps1, const char *ps2)
783 {
784 struct tok_state *tok = tok_new();
785 if (tok == NULL)
786 return NULL;
787 if ((tok->buf = (char *)PyMem_Malloc(BUFSIZ)) == NULL) {
788 _PyTokenizer_Free(tok);
789 return NULL;
790 }
791 tok->cur = tok->inp = tok->buf;
792 tok->end = tok->buf + BUFSIZ;
793 tok->fp = fp;
794 tok->prompt = ps1;
795 tok->nextprompt = ps2;
796 if (enc != NULL) {
797 /* Must copy encoding declaration since it
798 gets copied into the parse tree. */
799 tok->encoding = new_string(enc, strlen(enc), tok);
800 if (!tok->encoding) {
801 _PyTokenizer_Free(tok);
802 return NULL;
803 }
804 tok->decoding_state = STATE_NORMAL;
805 }
806 return tok;
807 }
808
809 /* Free a tok_state structure */
810
811 void
812 _PyTokenizer_Free(struct tok_state *tok)
813 {
814 if (tok->encoding != NULL) {
815 PyMem_Free(tok->encoding);
816 }
817 Py_XDECREF(tok->decoding_readline);
818 Py_XDECREF(tok->decoding_buffer);
819 Py_XDECREF(tok->filename);
820 if (tok->fp != NULL && tok->buf != NULL) {
821 PyMem_Free(tok->buf);
822 }
823 if (tok->input) {
824 PyMem_Free(tok->input);
825 }
826 if (tok->interactive_src_start != NULL) {
827 PyMem_Free(tok->interactive_src_start);
828 }
829 PyMem_Free(tok);
830 }
831
/* Read bytes from tok->fp into the token buffer until a complete line
   (ending in '\n') or EOF has been read.  Returns 1 on success or EOF,
   0 on memory error. */
static int
tok_readline_raw(struct tok_state *tok)
{
    do {
        if (!tok_reserve_buf(tok, BUFSIZ)) {
            return 0;
        }
        int n_chars = (int)(tok->end - tok->inp);
        size_t line_size = 0;
        char *line = _Py_UniversalNewlineFgetsWithSize(tok->inp, n_chars, tok->fp, NULL, &line_size);
        if (line == NULL) {
            return 1;   /* EOF: the caller detects the empty read */
        }
        if (tok->fp_interactive &&
            tok_concatenate_interactive_new_line(tok, line) == -1) {
            return 0;
        }
        tok->inp += line_size;
        if (tok->inp == tok->buf) {
            return 0;   /* nothing was read at all */
        }
    } while (tok->inp[-1] != '\n');  /* loop when the line was truncated */
    return 1;
}
856
/* Advance the string-based tokenizer to the next line: move tok->inp
   past the next '\n' (or to the end of input).  Returns 1, or 0 at the
   end of input (tok->done = E_EOF). */
static int
tok_underflow_string(struct tok_state *tok) {
    char *end = strchr(tok->inp, '\n');
    if (end != NULL) {
        end++;  /* include the newline */
    }
    else {
        end = strchr(tok->inp, '\0');
        if (end == tok->inp) {
            tok->done = E_EOF;
            return 0;
        }
    }
    if (tok->start == NULL) {
        /* No token in progress: discard already-consumed text. */
        tok->buf = tok->cur;
    }
    tok->line_start = tok->cur;
    tok->lineno++;
    tok->inp = end;
    return 1;
}
878
/* Fetch the next line in interactive mode via PyOS_Readline, recoding
   to UTF-8 when an explicit encoding is set.  Returns 1 on success,
   0 on error/EOF/interrupt (tok->done distinguishes the cases). */
static int
tok_underflow_interactive(struct tok_state *tok) {
    if (tok->interactive_underflow == IUNDERFLOW_STOP) {
        tok->done = E_INTERACT_STOP;
        return 1;
    }
    char *newtok = PyOS_Readline(tok->fp ? tok->fp : stdin, stdout, tok->prompt);
    if (newtok != NULL) {
        /* Normalize \r\n and bare \r to \n. */
        char *translated = translate_newlines(newtok, 0, tok);
        PyMem_Free(newtok);
        if (translated == NULL) {
            return 0;
        }
        newtok = translated;
    }
    if (tok->encoding && newtok && *newtok) {
        /* Recode to UTF-8 */
        Py_ssize_t buflen;
        const char* buf;
        PyObject *u = translate_into_utf8(newtok, tok->encoding);
        PyMem_Free(newtok);
        if (u == NULL) {
            tok->done = E_DECODE;
            return 0;
        }
        buflen = PyBytes_GET_SIZE(u);
        buf = PyBytes_AS_STRING(u);
        newtok = PyMem_Malloc(buflen+1);
        if (newtok == NULL) {
            Py_DECREF(u);
            tok->done = E_NOMEM;
            return 0;
        }
        strcpy(newtok, buf);
        Py_DECREF(u);
    }
    if (tok->fp_interactive &&
        tok_concatenate_interactive_new_line(tok, newtok) == -1) {
        PyMem_Free(newtok);
        return 0;
    }
    if (tok->nextprompt != NULL) {
        /* After the first line, switch to the continuation prompt. */
        tok->prompt = tok->nextprompt;
    }
    if (newtok == NULL) {
        tok->done = E_INTR;     /* readline was interrupted */
    }
    else if (*newtok == '\0') {
        PyMem_Free(newtok);
        tok->done = E_EOF;
    }
    else if (tok->start != NULL) {
        /* A token is in progress: append the new line to the buffer,
           preserving multi_line_start across a possible realloc. */
        Py_ssize_t cur_multi_line_start = tok->multi_line_start - tok->buf;
        size_t size = strlen(newtok);
        tok->lineno++;
        if (!tok_reserve_buf(tok, size + 1)) {
            PyMem_Free(tok->buf);
            tok->buf = NULL;
            PyMem_Free(newtok);
            return 0;
        }
        memcpy(tok->cur, newtok, size + 1);
        PyMem_Free(newtok);
        tok->inp += size;
        tok->multi_line_start = tok->buf + cur_multi_line_start;
    }
    else {
        /* No token in progress: the new line replaces the buffer. */
        tok->lineno++;
        PyMem_Free(tok->buf);
        tok->buf = newtok;
        tok->cur = tok->buf;
        tok->line_start = tok->buf;
        tok->inp = strchr(tok->buf, '\0');
        tok->end = tok->inp + 1;
    }
    if (tok->done != E_OK) {
        if (tok->prompt != NULL) {
            PySys_WriteStderr("\n");
        }
        return 0;
    }
    return 1;
}
962
/* Refill the buffer from tok->fp: handle BOM/coding-spec detection on
   the first two lines, append a missing final newline, and verify the
   bytes are UTF-8 when no other encoding was declared.  Returns 1 on
   success, 0 on error or EOF. */
static int
tok_underflow_file(struct tok_state *tok) {
    if (tok->start == NULL) {
        /* No token in progress: reuse the buffer from the start. */
        tok->cur = tok->inp = tok->buf;
    }
    if (tok->decoding_state == STATE_INIT) {
        /* We have not yet determined the encoding.
           If an encoding is found, use the file-pointer
           reader functions from now on. */
        if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok)) {
            error_ret(tok);
            return 0;
        }
        assert(tok->decoding_state != STATE_INIT);
    }
    /* Read until '\n' or EOF */
    if (tok->decoding_readline != NULL) {
        /* We already have a codec associated with this input. */
        if (!tok_readline_recode(tok)) {
            return 0;
        }
    }
    else {
        /* We want a 'raw' read. */
        if (!tok_readline_raw(tok)) {
            return 0;
        }
    }
    if (tok->inp == tok->cur) {
        tok->done = E_EOF;
        return 0;
    }
    if (tok->inp[-1] != '\n') {
        assert(tok->inp + 1 < tok->end);
        /* Last line does not end in \n, fake one */
        *tok->inp++ = '\n';
        *tok->inp = '\0';
    }

    tok->lineno++;
    if (tok->decoding_state != STATE_NORMAL) {
        if (tok->lineno > 2) {
            /* PEP 263: a coding spec may only appear on lines 1 or 2. */
            tok->decoding_state = STATE_NORMAL;
        }
        else if (!check_coding_spec(tok->cur, strlen(tok->cur),
                                    tok, fp_setreadl))
        {
            return 0;
        }
    }
    /* The default encoding is UTF-8, so make sure we don't have any
       non-UTF-8 sequences in it. */
    if (!tok->encoding && !ensure_utf8(tok->cur, tok)) {
        error_ret(tok);
        return 0;
    }
    assert(tok->done == E_OK);
    return tok->done == E_OK;
}
1022
1023 #if defined(Py_DEBUG)
1024 static void
1025 print_escape(FILE *f, const char *s, Py_ssize_t size)
1026 {
1027 if (s == NULL) {
1028 fputs("NULL", f);
1029 return;
1030 }
1031 putc('"', f);
1032 while (size-- > 0) {
1033 unsigned char c = *s++;
1034 switch (c) {
1035 case '\n': fputs("\\n", f); break;
1036 case '\r': fputs("\\r", f); break;
1037 case '\t': fputs("\\t", f); break;
1038 case '\f': fputs("\\f", f); break;
1039 case '\'': fputs("\\'", f); break;
1040 case '"': fputs("\\\"", f); break;
1041 default:
1042 if (0x20 <= c && c <= 0x7f)
1043 putc(c, f);
1044 else
1045 fprintf(f, "\\x%02x", c);
1046 }
1047 }
1048 putc('"', f);
1049 }
1050 #endif
1051
/* Get next char, updating state; error code goes into tok->done.
   Returns the next byte (masked to 0..255) or EOF. */

static int
tok_nextc(struct tok_state *tok)
{
    int rc;
    for (;;) {
        if (tok->cur != tok->inp) {
            /* Guard against column offsets overflowing an int. */
            if (tok->cur - tok->buf >= INT_MAX) {
                tok->done = E_COLUMNOVERFLOW;
                return EOF;
            }
            return Py_CHARMASK(*tok->cur++); /* Fast path */
        }
        if (tok->done != E_OK) {
            return EOF;
        }
        /* Buffer exhausted: refill from the appropriate source. */
        if (tok->fp == NULL) {
            rc = tok_underflow_string(tok);
        }
        else if (tok->prompt != NULL) {
            rc = tok_underflow_interactive(tok);
        }
        else {
            rc = tok_underflow_file(tok);
        }
#if defined(Py_DEBUG)
        if (Py_DebugFlag) {
            fprintf(stderr, "line[%d] = ", tok->lineno);
            print_escape(stderr, tok->cur, tok->inp - tok->cur);
            fprintf(stderr, " tok->done = %d\n", tok->done);
        }
#endif
        if (!rc) {
            tok->cur = tok->inp;
            return EOF;
        }
        tok->line_start = tok->cur;

        /* Reject embedded NUL bytes in the freshly read line. */
        if (contains_null_bytes(tok->line_start, tok->inp - tok->line_start)) {
            syntaxerror(tok, "source code cannot contain null bytes");
            tok->cur = tok->inp;
            return EOF;
        }
    }
    Py_UNREACHABLE();
}
1099
/* Back-up one character (no-op for EOF).  The caller must push back
   exactly the character it just read. */

static void
tok_backup(struct tok_state *tok, int c)
{
    if (c != EOF) {
        if (--tok->cur < tok->buf) {
            Py_FatalError("tokenizer beginning of buffer");
        }
        if ((int)(unsigned char)*tok->cur != c) {
            Py_FatalError("tok_backup: wrong character");
        }
    }
}
1114
/* Raise SyntaxError with the message formatted from FORMAT/VARGS,
   located on tok's current line; col_offset/end_col_offset of -1 mean
   "at the current position".  Always sets tok->done = E_ERROR and
   returns ERRORTOKEN. */
static int
_syntaxerror_range(struct tok_state *tok, const char *format,
                   int col_offset, int end_col_offset,
                   va_list vargs)
{
    PyObject *errmsg, *errtext, *args;
    errmsg = PyUnicode_FromFormatV(format, vargs);
    if (!errmsg) {
        goto error;
    }

    errtext = PyUnicode_DecodeUTF8(tok->line_start, tok->cur - tok->line_start,
                                   "replace");
    if (!errtext) {
        goto error;
    }

    if (col_offset == -1) {
        col_offset = (int)PyUnicode_GET_LENGTH(errtext);
    }
    if (end_col_offset == -1) {
        end_col_offset = col_offset;
    }

    /* Re-decode errtext so it covers the full physical line. */
    Py_ssize_t line_len = strcspn(tok->line_start, "\n");
    if (line_len != tok->cur - tok->line_start) {
        Py_DECREF(errtext);
        errtext = PyUnicode_DecodeUTF8(tok->line_start, line_len,
                                       "replace");
    }
    if (!errtext) {
        goto error;
    }

    /* "N" steals the reference to errtext. */
    args = Py_BuildValue("(O(OiiNii))", errmsg, tok->filename, tok->lineno,
                         col_offset, errtext, tok->lineno, end_col_offset);
    if (args) {
        PyErr_SetObject(PyExc_SyntaxError, args);
        Py_DECREF(args);
    }

error:
    /* The success path also falls through here to release errmsg. */
    Py_XDECREF(errmsg);
    tok->done = E_ERROR;
    return ERRORTOKEN;
}
1161
/* Raise a SyntaxError at the current position (no explicit column
   range).  Returns ERRORTOKEN. */
static int
syntaxerror(struct tok_state *tok, const char *format, ...)
{
    va_list vargs;
#ifdef HAVE_STDARG_PROTOTYPES
    va_start(vargs, format);
#else
    va_start(vargs);
#endif
    int ret = _syntaxerror_range(tok, format, -1, -1, vargs);
    va_end(vargs);
    return ret;
}
1175
/* Raise a SyntaxError with an explicit column range on the current
   line.  Returns ERRORTOKEN. */
static int
syntaxerror_known_range(struct tok_state *tok,
                        int col_offset, int end_col_offset,
                        const char *format, ...)
{
    va_list vargs;
#ifdef HAVE_STDARG_PROTOTYPES
    va_start(vargs, format);
#else
    va_start(vargs);
#endif
    int ret = _syntaxerror_range(tok, format, col_offset, end_col_offset, vargs);
    va_end(vargs);
    return ret;
}
1191
1192
1193
/* Report inconsistent use of tabs and spaces (tok->done = E_TABSPACE)
   and skip the rest of the current line.  Returns ERRORTOKEN. */
static int
indenterror(struct tok_state *tok)
{
    tok->done = E_TABSPACE;
    tok->cur = tok->inp;
    return ERRORTOKEN;
}
1201
/* Emit a warning of CATEGORY for the current line.  If the warning has
   been escalated to an error (e.g. -W error), convert it into a
   SyntaxError.  Returns 0 on success, -1 on error (E_ERROR). */
static int
parser_warn(struct tok_state *tok, PyObject *category, const char *format, ...)
{
    if (!tok->report_warnings) {
        return 0;
    }

    PyObject *errmsg;
    va_list vargs;
#ifdef HAVE_STDARG_PROTOTYPES
    va_start(vargs, format);
#else
    va_start(vargs);
#endif
    errmsg = PyUnicode_FromFormatV(format, vargs);
    va_end(vargs);
    if (!errmsg) {
        goto error;
    }

    if (PyErr_WarnExplicitObject(category, errmsg, tok->filename,
                                 tok->lineno, NULL, NULL) < 0) {
        if (PyErr_ExceptionMatches(category)) {
            /* Replace the DeprecationWarning exception with a SyntaxError
               to get a more accurate error report */
            PyErr_Clear();
            syntaxerror(tok, "%U", errmsg);
        }
        goto error;
    }
    Py_DECREF(errmsg);
    return 0;

error:
    Py_XDECREF(errmsg);
    tok->done = E_ERROR;
    return -1;
}
1240
/* Peek forward to see whether the upcoming input spells out `test`
   followed by a non-identifier character (i.e. `test` appears as a
   complete word).  All characters read are pushed back before returning,
   so the tokenizer position is unchanged.  Returns 1 on a whole-word
   match, 0 otherwise. */
static int
lookahead(struct tok_state *tok, const char *test)
{
    const char *s = test;
    int c = tok_nextc(tok);
    /* Consume input while it keeps matching the expected word. */
    while (*s != 0 && c == *s) {
        s++;
        c = tok_nextc(tok);
    }
    /* A match counts only if the whole word was seen and the next
       character cannot continue an identifier. */
    int res = (*s == 0) && !is_potential_identifier_char(c);
    /* Undo everything we read: the terminator first, then the matched
       characters in reverse order. */
    tok_backup(tok, c);
    while (s != test) {
        tok_backup(tok, *--s);
    }
    return res;
}
1263
/* Validate the character `c` that immediately follows a numeric literal
   of the given `kind` ("decimal", "hexadecimal", ...).  Returns 1 when the
   literal ends acceptably; returns 0 after reporting a SyntaxWarning (for
   the keyword cases below) or a SyntaxError (for any other identifier
   character). */
static int
verify_end_of_number(struct tok_state *tok, int c, const char *kind)
{
    /* Emit a deprecation warning only if the numeric literal is immediately
     * followed by one of keywords which can occur after a numeric literal
     * in valid code: "and", "else", "for", "if", "in", "is" and "or".
     * It allows to gradually deprecate existing valid code without adding
     * warning before error in most cases of invalid numeric literal (which
     * would be confusing and break existing tests).
     * Raise a syntax error with slightly better message than plain
     * "invalid syntax" if the numeric literal is immediately followed by
     * other keyword or identifier.
     */
    int r = 0;
    if (c == 'a') {
        r = lookahead(tok, "nd");
    }
    else if (c == 'e') {
        r = lookahead(tok, "lse");
    }
    else if (c == 'f') {
        r = lookahead(tok, "or");
    }
    else if (c == 'i') {
        /* "if", "in" and "is" are all two letters, so one extra char of
           lookahead suffices here. */
        int c2 = tok_nextc(tok);
        if (c2 == 'f' || c2 == 'n' || c2 == 's') {
            r = 1;
        }
        tok_backup(tok, c2);
    }
    else if (c == 'o') {
        r = lookahead(tok, "r");
    }
    else if (c == 'n') {
        r = lookahead(tok, "ot");
    }
    if (r) {
        tok_backup(tok, c);
        if (parser_warn(tok, PyExc_SyntaxWarning,
                        "invalid %s literal", kind))
        {
            return 0;
        }
        /* Re-consume `c` so tokenization continues after the literal. */
        tok_nextc(tok);
    }
    else /* In future releases, only error will remain. */
    if (c < 128 && is_potential_identifier_char(c)) {
        tok_backup(tok, c);
        syntaxerror(tok, "invalid %s literal", kind);
        return 0;
    }
    return 1;
}
1317
/* Verify that the identifier follows PEP 3131.
   All identifier strings are guaranteed to be "ready" unicode objects.

   The token text is taken from tok->start..tok->cur.  Returns 1 when the
   identifier is valid; returns 0 after setting tok->done (E_DECODE for
   UTF-8 decode failures, E_ERROR otherwise) and, for invalid characters,
   raising a SyntaxError that points at the offending character. */
static int
verify_identifier(struct tok_state *tok)
{
    PyObject *s;
    if (tok->decoding_erred)
        return 0;
    s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
    if (s == NULL) {
        if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
            tok->done = E_DECODE;
        }
        else {
            tok->done = E_ERROR;
        }
        return 0;
    }
    /* Index of the first character that is not valid in an identifier,
       or -1 on error. */
    Py_ssize_t invalid = _PyUnicode_ScanIdentifier(s);
    if (invalid < 0) {
        Py_DECREF(s);
        tok->done = E_ERROR;
        return 0;
    }
    assert(PyUnicode_GET_LENGTH(s) > 0);
    if (invalid < PyUnicode_GET_LENGTH(s)) {
        Py_UCS4 ch = PyUnicode_READ_CHAR(s, invalid);
        if (invalid + 1 < PyUnicode_GET_LENGTH(s)) {
            /* Determine the offset in UTF-8 encoded input */
            Py_SETREF(s, PyUnicode_Substring(s, 0, invalid + 1));
            if (s != NULL) {
                Py_SETREF(s, PyUnicode_AsUTF8String(s));
            }
            if (s == NULL) {
                tok->done = E_ERROR;
                return 0;
            }
            /* Point tok->cur just past the bad character so the error
               location is accurate. */
            tok->cur = (char *)tok->start + PyBytes_GET_SIZE(s);
        }
        Py_DECREF(s);
        // PyUnicode_FromFormatV() does not support %X
        char hex[9];
        (void)PyOS_snprintf(hex, sizeof(hex), "%04X", ch);
        if (Py_UNICODE_ISPRINTABLE(ch)) {
            syntaxerror(tok, "invalid character '%c' (U+%s)", ch, hex);
        }
        else {
            syntaxerror(tok, "invalid non-printable character U+%s", hex);
        }
        return 0;
    }
    Py_DECREF(s);
    return 1;
}
1373
/* Consume the remaining digits of a decimal literal, allowing single
   underscores between digit groups (PEP 515).  Returns the first
   character after the literal, or 0 after raising a SyntaxError for a
   trailing/doubled underscore. */
static int
tok_decimal_tail(struct tok_state *tok)
{
    for (;;) {
        int ch;
        /* Swallow a run of digits. */
        do {
            ch = tok_nextc(tok);
        } while (isdigit(ch));
        if (ch != '_') {
            /* Not an underscore: the literal ends here. */
            return ch;
        }
        /* An underscore must be followed by at least one digit. */
        ch = tok_nextc(tok);
        if (!isdigit(ch)) {
            tok_backup(tok, ch);
            syntaxerror(tok, "invalid decimal literal");
            return 0;
        }
    }
}
1395
1396 /* Get next token, after space stripping etc. */
1397
1398 static inline int
1399 tok_continuation_line(struct tok_state *tok) {
1400 int c = tok_nextc(tok);
1401 if (c != '\n') {
1402 tok->done = E_LINECONT;
1403 return -1;
1404 }
1405 c = tok_nextc(tok);
1406 if (c == EOF) {
1407 tok->done = E_EOF;
1408 tok->cur = tok->inp;
1409 return -1;
1410 } else {
1411 tok_backup(tok, c);
1412 }
1413 return c;
1414 }
1415
/* Core tokenizer: scan and classify the next token from `tok`.
   On return, *p_start / *p_end delimit the token's text inside the
   tokenizer's buffer (NULL for tokens with no text, e.g. INDENT/DEDENT).
   Returns the token type, or ERRORTOKEN with tok->done describing the
   error.  Handles indentation tracking, comments/type comments, names
   (including async/await and string prefixes), numbers, strings, line
   continuations, and operators. */
static int
tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
{
    int c;
    int blankline, nonascii;

    *p_start = *p_end = NULL;
  nextline:
    tok->start = NULL;
    blankline = 0;

    /* Get indentation level */
    if (tok->atbol) {
        int col = 0;
        int altcol = 0;
        tok->atbol = 0;
        int cont_line_col = 0;
        for (;;) {
            c = tok_nextc(tok);
            if (c == ' ') {
                col++, altcol++;
            }
            else if (c == '\t') {
                /* `col` advances by the classic tab stops while `altcol`
                   uses ALTTABSIZE; comparing both later detects ambiguous
                   tab/space mixes. */
                col = (col / tok->tabsize + 1) * tok->tabsize;
                altcol = (altcol / ALTTABSIZE + 1) * ALTTABSIZE;
            }
            else if (c == '\014') {/* Control-L (formfeed) */
                col = altcol = 0; /* For Emacs users */
            }
            else if (c == '\\') {
                // Indentation cannot be split over multiple physical lines
                // using backslashes. This means that if we found a backslash
                // preceded by whitespace, **the first one we find** determines
                // the level of indentation of whatever comes next.
                cont_line_col = cont_line_col ? cont_line_col : col;
                if ((c = tok_continuation_line(tok)) == -1) {
                    return ERRORTOKEN;
                }
            }
            else {
                break;
            }
        }
        tok_backup(tok, c);
        if (c == '#' || c == '\n') {
            /* Lines with only whitespace and/or comments
               shouldn't affect the indentation and are
               not passed to the parser as NEWLINE tokens,
               except *totally* empty lines in interactive
               mode, which signal the end of a command group. */
            if (col == 0 && c == '\n' && tok->prompt != NULL) {
                blankline = 0; /* Let it through */
            }
            else if (tok->prompt != NULL && tok->lineno == 1) {
                /* In interactive mode, if the first line contains
                   only spaces and/or a comment, let it through. */
                blankline = 0;
                col = altcol = 0;
            }
            else {
                blankline = 1; /* Ignore completely */
            }
            /* We can't jump back right here since we still
               may need to skip to the end of a comment */
        }
        if (!blankline && tok->level == 0) {
            col = cont_line_col ? cont_line_col : col;
            altcol = cont_line_col ? cont_line_col : altcol;
            if (col == tok->indstack[tok->indent]) {
                /* No change */
                if (altcol != tok->altindstack[tok->indent]) {
                    return indenterror(tok);
                }
            }
            else if (col > tok->indstack[tok->indent]) {
                /* Indent -- always one */
                if (tok->indent+1 >= MAXINDENT) {
                    tok->done = E_TOODEEP;
                    tok->cur = tok->inp;
                    return ERRORTOKEN;
                }
                if (altcol <= tok->altindstack[tok->indent]) {
                    return indenterror(tok);
                }
                tok->pendin++;
                tok->indstack[++tok->indent] = col;
                tok->altindstack[tok->indent] = altcol;
            }
            else /* col < tok->indstack[tok->indent] */ {
                /* Dedent -- any number, must be consistent */
                while (tok->indent > 0 &&
                    col < tok->indstack[tok->indent]) {
                    tok->pendin--;
                    tok->indent--;
                }
                if (col != tok->indstack[tok->indent]) {
                    tok->done = E_DEDENT;
                    tok->cur = tok->inp;
                    return ERRORTOKEN;
                }
                if (altcol != tok->altindstack[tok->indent]) {
                    return indenterror(tok);
                }
            }
        }
    }

    tok->start = tok->cur;

    /* Return pending indents/dedents */
    /* tok->pendin counts INDENT (positive) or DEDENT (negative) tokens
       still owed to the parser; emit one per call. */
    if (tok->pendin != 0) {
        if (tok->pendin < 0) {
            tok->pendin++;
            return DEDENT;
        }
        else {
            tok->pendin--;
            return INDENT;
        }
    }

    /* Peek ahead at the next character */
    c = tok_nextc(tok);
    tok_backup(tok, c);
    /* Check if we are closing an async function */
    if (tok->async_def
        && !blankline
        /* Due to some implementation artifacts of type comments,
         * a TYPE_COMMENT at the start of a function won't set an
         * indentation level and it will produce a NEWLINE after it.
         * To avoid spuriously ending an async function due to this,
         * wait until we have some non-newline char in front of us. */
        && c != '\n'
        && tok->level == 0
        /* There was a NEWLINE after ASYNC DEF,
           so we're past the signature. */
        && tok->async_def_nl
        /* Current indentation level is less than where
           the async function was defined */
        && tok->async_def_indent >= tok->indent)
    {
        tok->async_def = 0;
        tok->async_def_indent = 0;
        tok->async_def_nl = 0;
    }

  again:
    tok->start = NULL;
    /* Skip spaces */
    do {
        c = tok_nextc(tok);
    } while (c == ' ' || c == '\t' || c == '\014');

    /* Set start of current token */
    tok->start = tok->cur == NULL ? NULL : tok->cur - 1;

    /* Skip comment, unless it's a type comment */
    if (c == '#') {
        const char *prefix, *p, *type_start;

        while (c != EOF && c != '\n') {
            c = tok_nextc(tok);
        }

        if (tok->type_comments) {
            p = tok->start;
            prefix = type_comment_prefix;
            /* A ' ' in the prefix matches zero or more spaces/tabs in the
               comment text (see type_comment_prefix above). */
            while (*prefix && p < tok->cur) {
                if (*prefix == ' ') {
                    while (*p == ' ' || *p == '\t') {
                        p++;
                    }
                } else if (*prefix == *p) {
                    p++;
                } else {
                    break;
                }

                prefix++;
            }

            /* This is a type comment if we matched all of type_comment_prefix. */
            if (!*prefix) {
                int is_type_ignore = 1;
                const char *ignore_end = p + 6;
                tok_backup(tok, c); /* don't eat the newline or EOF */

                type_start = p;

                /* A TYPE_IGNORE is "type: ignore" followed by the end of the token
                 * or anything ASCII and non-alphanumeric. */
                is_type_ignore = (
                    tok->cur >= ignore_end && memcmp(p, "ignore", 6) == 0
                    && !(tok->cur > ignore_end
                         && ((unsigned char)ignore_end[0] >= 128 || Py_ISALNUM(ignore_end[0]))));

                if (is_type_ignore) {
                    *p_start = ignore_end;
                    *p_end = tok->cur;

                    /* If this type ignore is the only thing on the line, consume the newline also. */
                    if (blankline) {
                        tok_nextc(tok);
                        tok->atbol = 1;
                    }
                    return TYPE_IGNORE;
                } else {
                    *p_start = type_start; /* after type_comment_prefix */
                    *p_end = tok->cur;
                    return TYPE_COMMENT;
                }
            }
        }
    }

    if (tok->done == E_INTERACT_STOP) {
        return ENDMARKER;
    }

    /* Check for EOF and errors now */
    if (c == EOF) {
        /* EOF inside open brackets is an error, not a clean end. */
        if (tok->level) {
            return ERRORTOKEN;
        }
        return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
    }

    /* Identifier (most frequent token!) */
    nonascii = 0;
    if (is_potential_identifier_start(c)) {
        /* Process the various legal combinations of b"", r"", u"", and f"". */
        int saw_b = 0, saw_r = 0, saw_u = 0, saw_f = 0;
        while (1) {
            if (!(saw_b || saw_u || saw_f) && (c == 'b' || c == 'B'))
                saw_b = 1;
            /* Since this is a backwards compatibility support literal we don't
               want to support it in arbitrary order like byte literals. */
            else if (!(saw_b || saw_u || saw_r || saw_f)
                     && (c == 'u'|| c == 'U')) {
                saw_u = 1;
            }
            /* ur"" and ru"" are not supported */
            else if (!(saw_r || saw_u) && (c == 'r' || c == 'R')) {
                saw_r = 1;
            }
            else if (!(saw_f || saw_b || saw_u) && (c == 'f' || c == 'F')) {
                saw_f = 1;
            }
            else {
                break;
            }
            c = tok_nextc(tok);
            if (c == '"' || c == '\'') {
                goto letter_quote;
            }
        }
        while (is_potential_identifier_char(c)) {
            if (c >= 128) {
                nonascii = 1;
            }
            c = tok_nextc(tok);
        }
        tok_backup(tok, c);
        /* Non-ASCII identifiers get full PEP 3131 validation. */
        if (nonascii && !verify_identifier(tok)) {
            return ERRORTOKEN;
        }

        *p_start = tok->start;
        *p_end = tok->cur;

        /* async/await parsing block. */
        if (tok->cur - tok->start == 5 && tok->start[0] == 'a') {
            /* May be an 'async' or 'await' token. For Python 3.7 or
               later we recognize them unconditionally. For Python
               3.5 or 3.6 we recognize 'async' in front of 'def', and
               either one inside of 'async def'. (Technically we
               shouldn't recognize these at all for 3.4 or earlier,
               but there's no *valid* Python 3.4 code that would be
               rejected, and async functions will be rejected in a
               later phase.) */
            if (!tok->async_hacks || tok->async_def) {
                /* Always recognize the keywords. */
                if (memcmp(tok->start, "async", 5) == 0) {
                    return ASYNC;
                }
                if (memcmp(tok->start, "await", 5) == 0) {
                    return AWAIT;
                }
            }
            else if (memcmp(tok->start, "async", 5) == 0) {
                /* The current token is 'async'.
                   Look ahead one token to see if that is 'def'. */

                struct tok_state ahead_tok;
                const char *ahead_tok_start = NULL;
                const char *ahead_tok_end = NULL;
                int ahead_tok_kind;

                /* Clone the tokenizer state so the lookahead does not
                   disturb the real scan position. */
                memcpy(&ahead_tok, tok, sizeof(ahead_tok));
                ahead_tok_kind = tok_get(&ahead_tok, &ahead_tok_start,
                                         &ahead_tok_end);

                if (ahead_tok_kind == NAME
                    && ahead_tok.cur - ahead_tok.start == 3
                    && memcmp(ahead_tok.start, "def", 3) == 0)
                {
                    /* The next token is going to be 'def', so instead of
                       returning a plain NAME token, return ASYNC. */
                    tok->async_def_indent = tok->indent;
                    tok->async_def = 1;
                    return ASYNC;
                }
            }
        }

        return NAME;
    }

    /* Newline */
    if (c == '\n') {
        tok->atbol = 1;
        if (blankline || tok->level > 0) {
            goto nextline;
        }
        *p_start = tok->start;
        *p_end = tok->cur - 1; /* Leave '\n' out of the string */
        tok->cont_line = 0;
        if (tok->async_def) {
            /* We're somewhere inside an 'async def' function, and
               we've encountered a NEWLINE after its signature. */
            tok->async_def_nl = 1;
        }
        return NEWLINE;
    }

    /* Period or number starting with period? */
    if (c == '.') {
        c = tok_nextc(tok);
        if (isdigit(c)) {
            goto fraction;
        } else if (c == '.') {
            c = tok_nextc(tok);
            if (c == '.') {
                *p_start = tok->start;
                *p_end = tok->cur;
                return ELLIPSIS;
            }
            else {
                tok_backup(tok, c);
            }
            tok_backup(tok, '.');
        }
        else {
            tok_backup(tok, c);
        }
        *p_start = tok->start;
        *p_end = tok->cur;
        return DOT;
    }

    /* Number */
    if (isdigit(c)) {
        if (c == '0') {
            /* Hex, octal or binary -- maybe. */
            c = tok_nextc(tok);
            if (c == 'x' || c == 'X') {
                /* Hex */
                c = tok_nextc(tok);
                do {
                    /* Underscores are allowed between digit groups. */
                    if (c == '_') {
                        c = tok_nextc(tok);
                    }
                    if (!isxdigit(c)) {
                        tok_backup(tok, c);
                        return syntaxerror(tok, "invalid hexadecimal literal");
                    }
                    do {
                        c = tok_nextc(tok);
                    } while (isxdigit(c));
                } while (c == '_');
                if (!verify_end_of_number(tok, c, "hexadecimal")) {
                    return ERRORTOKEN;
                }
            }
            else if (c == 'o' || c == 'O') {
                /* Octal */
                c = tok_nextc(tok);
                do {
                    if (c == '_') {
                        c = tok_nextc(tok);
                    }
                    if (c < '0' || c >= '8') {
                        if (isdigit(c)) {
                            return syntaxerror(tok,
                                    "invalid digit '%c' in octal literal", c);
                        }
                        else {
                            tok_backup(tok, c);
                            return syntaxerror(tok, "invalid octal literal");
                        }
                    }
                    do {
                        c = tok_nextc(tok);
                    } while ('0' <= c && c < '8');
                } while (c == '_');
                if (isdigit(c)) {
                    return syntaxerror(tok,
                            "invalid digit '%c' in octal literal", c);
                }
                if (!verify_end_of_number(tok, c, "octal")) {
                    return ERRORTOKEN;
                }
            }
            else if (c == 'b' || c == 'B') {
                /* Binary */
                c = tok_nextc(tok);
                do {
                    if (c == '_') {
                        c = tok_nextc(tok);
                    }
                    if (c != '0' && c != '1') {
                        if (isdigit(c)) {
                            return syntaxerror(tok,
                                    "invalid digit '%c' in binary literal", c);
                        }
                        else {
                            tok_backup(tok, c);
                            return syntaxerror(tok, "invalid binary literal");
                        }
                    }
                    do {
                        c = tok_nextc(tok);
                    } while (c == '0' || c == '1');
                } while (c == '_');
                if (isdigit(c)) {
                    return syntaxerror(tok,
                            "invalid digit '%c' in binary literal", c);
                }
                if (!verify_end_of_number(tok, c, "binary")) {
                    return ERRORTOKEN;
                }
            }
            else {
                int nonzero = 0;
                /* maybe old-style octal; c is first char of it */
                /* in any case, allow '0' as a literal */
                while (1) {
                    if (c == '_') {
                        c = tok_nextc(tok);
                        if (!isdigit(c)) {
                            tok_backup(tok, c);
                            return syntaxerror(tok, "invalid decimal literal");
                        }
                    }
                    if (c != '0') {
                        break;
                    }
                    c = tok_nextc(tok);
                }
                /* End of the run of leading zeros; used for the error
                   range reported below. */
                char* zeros_end = tok->cur;
                if (isdigit(c)) {
                    nonzero = 1;
                    c = tok_decimal_tail(tok);
                    if (c == 0) {
                        return ERRORTOKEN;
                    }
                }
                if (c == '.') {
                    c = tok_nextc(tok);
                    goto fraction;
                }
                else if (c == 'e' || c == 'E') {
                    goto exponent;
                }
                else if (c == 'j' || c == 'J') {
                    goto imaginary;
                }
                else if (nonzero) {
                    /* Old-style octal: now disallowed. */
                    tok_backup(tok, c);
                    return syntaxerror_known_range(
                        tok, (int)(tok->start + 1 - tok->line_start),
                        (int)(zeros_end - tok->line_start),
                        "leading zeros in decimal integer "
                        "literals are not permitted; "
                        "use an 0o prefix for octal integers");
                }
                if (!verify_end_of_number(tok, c, "decimal")) {
                    return ERRORTOKEN;
                }
            }
        }
        else {
            /* Decimal */
            c = tok_decimal_tail(tok);
            if (c == 0) {
                return ERRORTOKEN;
            }
            {
                /* Accept floating point numbers. */
                if (c == '.') {
                    c = tok_nextc(tok);
        fraction:
                    /* Fraction */
                    if (isdigit(c)) {
                        c = tok_decimal_tail(tok);
                        if (c == 0) {
                            return ERRORTOKEN;
                        }
                    }
                }
                if (c == 'e' || c == 'E') {
                    int e;
                  exponent:
                    e = c;
                    /* Exponent part */
                    c = tok_nextc(tok);
                    if (c == '+' || c == '-') {
                        c = tok_nextc(tok);
                        if (!isdigit(c)) {
                            tok_backup(tok, c);
                            return syntaxerror(tok, "invalid decimal literal");
                        }
                    } else if (!isdigit(c)) {
                        /* A bare 'e'/'E' is not an exponent: push it back
                           and end the number before it. */
                        tok_backup(tok, c);
                        if (!verify_end_of_number(tok, e, "decimal")) {
                            return ERRORTOKEN;
                        }
                        tok_backup(tok, e);
                        *p_start = tok->start;
                        *p_end = tok->cur;
                        return NUMBER;
                    }
                    c = tok_decimal_tail(tok);
                    if (c == 0) {
                        return ERRORTOKEN;
                    }
                }
                if (c == 'j' || c == 'J') {
                    /* Imaginary part */
        imaginary:
                    c = tok_nextc(tok);
                    if (!verify_end_of_number(tok, c, "imaginary")) {
                        return ERRORTOKEN;
                    }
                }
                else if (!verify_end_of_number(tok, c, "decimal")) {
                    return ERRORTOKEN;
                }
            }
        }
        tok_backup(tok, c);
        *p_start = tok->start;
        *p_end = tok->cur;
        return NUMBER;
    }

  letter_quote:
    /* String */
    if (c == '\'' || c == '"') {
        int quote = c;
        int quote_size = 1; /* 1 or 3 */
        int end_quote_size = 0;

        /* Nodes of type STRING, especially multi line strings
           must be handled differently in order to get both
           the starting line number and the column offset right.
           (cf. issue 16806) */
        tok->first_lineno = tok->lineno;
        tok->multi_line_start = tok->line_start;

        /* Find the quote size and start of string */
        c = tok_nextc(tok);
        if (c == quote) {
            c = tok_nextc(tok);
            if (c == quote) {
                quote_size = 3;
            }
            else {
                end_quote_size = 1; /* empty string found */
            }
        }
        if (c != quote) {
            tok_backup(tok, c);
        }

        /* Get rest of string */
        while (end_quote_size != quote_size) {
            c = tok_nextc(tok);
            if (tok->done == E_ERROR) {
                return ERRORTOKEN;
            }
            if (tok->done == E_DECODE) {
                break;
            }
            if (c == EOF || (quote_size == 1 && c == '\n')) {
                assert(tok->multi_line_start != NULL);
                // shift the tok_state's location into
                // the start of string, and report the error
                // from the initial quote character
                tok->cur = (char *)tok->start;
                tok->cur++;
                tok->line_start = tok->multi_line_start;
                int start = tok->lineno;
                tok->lineno = tok->first_lineno;
                if (quote_size == 3) {
                    syntaxerror(tok, "unterminated triple-quoted string literal"
                                     " (detected at line %d)", start);
                    if (c != '\n') {
                        tok->done = E_EOFS;
                    }
                    return ERRORTOKEN;
                }
                else {
                    syntaxerror(tok, "unterminated string literal (detected at"
                                     " line %d)", start);
                    if (c != '\n') {
                        tok->done = E_EOLS;
                    }
                    return ERRORTOKEN;
                }
            }
            if (c == quote) {
                end_quote_size += 1;
            }
            else {
                end_quote_size = 0;
                if (c == '\\') {
                    tok_nextc(tok); /* skip escaped char */
                }
            }
        }

        *p_start = tok->start;
        *p_end = tok->cur;
        return STRING;
    }

    /* Line continuation */
    if (c == '\\') {
        if ((c = tok_continuation_line(tok)) == -1) {
            return ERRORTOKEN;
        }
        tok->cont_line = 1;
        goto again; /* Read next line */
    }

    /* Check for two-character token */
    {
        int c2 = tok_nextc(tok);
        int token = PyToken_TwoChars(c, c2);
        if (token != OP) {
            /* A two-char operator may extend to three chars (e.g. **=). */
            int c3 = tok_nextc(tok);
            int token3 = PyToken_ThreeChars(c, c2, c3);
            if (token3 != OP) {
                token = token3;
            }
            else {
                tok_backup(tok, c3);
            }
            *p_start = tok->start;
            *p_end = tok->cur;
            return token;
        }
        tok_backup(tok, c2);
    }

    /* Keep track of parentheses nesting level */
    switch (c) {
    case '(':
    case '[':
    case '{':
        if (tok->level >= MAXLEVEL) {
            return syntaxerror(tok, "too many nested parentheses");
        }
        /* Remember where the bracket was opened for error reporting. */
        tok->parenstack[tok->level] = c;
        tok->parenlinenostack[tok->level] = tok->lineno;
        tok->parencolstack[tok->level] = (int)(tok->start - tok->line_start);
        tok->level++;
        break;
    case ')':
    case ']':
    case '}':
        if (!tok->level) {
            return syntaxerror(tok, "unmatched '%c'", c);
        }
        tok->level--;
        int opening = tok->parenstack[tok->level];
        if (!((opening == '(' && c == ')') ||
              (opening == '[' && c == ']') ||
              (opening == '{' && c == '}')))
        {
            if (tok->parenlinenostack[tok->level] != tok->lineno) {
                return syntaxerror(tok,
                        "closing parenthesis '%c' does not match "
                        "opening parenthesis '%c' on line %d",
                        c, opening, tok->parenlinenostack[tok->level]);
            }
            else {
                return syntaxerror(tok,
                        "closing parenthesis '%c' does not match "
                        "opening parenthesis '%c'",
                        c, opening);
            }
        }
        break;
    }

    if (!Py_UNICODE_ISPRINTABLE(c)) {
        char hex[9];
        (void)PyOS_snprintf(hex, sizeof(hex), "%04X", c);
        return syntaxerror(tok, "invalid non-printable character U+%s", hex);
    }

    /* Punctuation character */
    *p_start = tok->start;
    *p_end = tok->cur;
    return PyToken_OneChar(c);
}
2135
2136 int
2137 _PyTokenizer_Get(struct tok_state *tok,
2138 const char **p_start, const char **p_end)
2139 {
2140 int result = tok_get(tok, p_start, p_end);
2141 if (tok->decoding_erred) {
2142 result = ERRORTOKEN;
2143 tok->done = E_DECODE;
2144 }
2145 return result;
2146 }
2147
#if defined(__wasi__) || (defined(__EMSCRIPTEN__) && (__EMSCRIPTEN_major__ >= 3))
// fdopen() with borrowed fd. WASI does not provide dup() and Emscripten's
// dup() emulation with open() is slow.
// `borrowed` lets the int fd travel through fopencookie()'s void* cookie
// without a cast that would violate pointer conversion rules.
typedef union {
    void *cookie;
    int fd;
} borrowed;

// Read callback for fopencookie(): forwards to read() on the borrowed fd.
static ssize_t
borrow_read(void *cookie, char *buf, size_t size)
{
    borrowed b = {.cookie = cookie};
    return read(b.fd, (void *)buf, size);
}

// Wrap `fd` in a read-only FILE* without taking ownership of it.
static FILE *
fdopen_borrow(int fd) {
    // supports only reading. seek fails. close and write are no-ops.
    cookie_io_functions_t io_cb = {borrow_read, NULL, NULL, NULL};
    borrowed b = {.fd = fd};
    return fopencookie(b.cookie, "r", io_cb);
}
#else
// Portable variant: duplicate the fd so fclose() on the returned FILE*
// does not close the caller's descriptor.
static FILE *
fdopen_borrow(int fd) {
    fd = _Py_dup(fd);
    if (fd < 0) {
        return NULL;
    }
    return fdopen(fd, "r");
}
#endif
2180
2181 /* Get the encoding of a Python file. Check for the coding cookie and check if
2182 the file starts with a BOM.
2183
2184 _PyTokenizer_FindEncodingFilename() returns NULL when it can't find the
2185 encoding in the first or second line of the file (in which case the encoding
2186 should be assumed to be UTF-8).
2187
2188 The char* returned is malloc'ed via PyMem_Malloc() and thus must be freed
2189 by the caller. */
2190
2191 char *
2192 _PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
2193 {
2194 struct tok_state *tok;
2195 FILE *fp;
2196 const char *p_start = NULL;
2197 const char *p_end = NULL;
2198 char *encoding = NULL;
2199
2200 fp = fdopen_borrow(fd);
2201 if (fp == NULL) {
2202 return NULL;
2203 }
2204 tok = _PyTokenizer_FromFile(fp, NULL, NULL, NULL);
2205 if (tok == NULL) {
2206 fclose(fp);
2207 return NULL;
2208 }
2209 if (filename != NULL) {
2210 Py_INCREF(filename);
2211 tok->filename = filename;
2212 }
2213 else {
2214 tok->filename = PyUnicode_FromString("<string>");
2215 if (tok->filename == NULL) {
2216 fclose(fp);
2217 _PyTokenizer_Free(tok);
2218 return encoding;
2219 }
2220 }
2221 // We don't want to report warnings here because it could cause infinite recursion
2222 // if fetching the encoding shows a warning.
2223 tok->report_warnings = 0;
2224 while (tok->lineno < 2 && tok->done == E_OK) {
2225 _PyTokenizer_Get(tok, &p_start, &p_end);
2226 }
2227 fclose(fp);
2228 if (tok->encoding) {
2229 encoding = (char *)PyMem_Malloc(strlen(tok->encoding) + 1);
2230 if (encoding) {
2231 strcpy(encoding, tok->encoding);
2232 }
2233 }
2234 _PyTokenizer_Free(tok);
2235 return encoding;
2236 }
2237
#ifdef Py_DEBUG
/* Debug helper: print a token's name to stderr and, for tokens that carry
   source text (NAME/NUMBER/STRING/OP), the text itself in parentheses. */
void
tok_dump(int type, char *start, char *end)
{
    fprintf(stderr, "%s", _PyParser_TokenNames[type]);
    switch (type) {
    case NAME:
    case NUMBER:
    case STRING:
    case OP:
        fprintf(stderr, "(%.*s)", (int)(end - start), start);
        break;
    default:
        break;
    }
}
#endif // Py_DEBUG