1  #include <stdbool.h>
       2  
       3  #include <Python.h>
       4  
       5  #include "tokenizer.h"
       6  #include "pegen.h"
       7  #include "string_parser.h"
       8  
       9  //// STRING HANDLING FUNCTIONS ////
      10  
      11  static int
      12  warn_invalid_escape_sequence(Parser *p, const char *first_invalid_escape, Token *t)
      13  {
      14      unsigned char c = *first_invalid_escape;
      15      if ((t->type == FSTRING_MIDDLE || t->type == FSTRING_END) && (c == '{' || c == '}')) {  // in this case the tokenizer has already emitted a warning,
      16                                                                                              // see tokenizer.c:warn_invalid_escape_sequence
      17          return 0;
      18      }
      19  
      20      int octal = ('4' <= c && c <= '7');
      21      PyObject *msg =
      22          octal
      23          ? PyUnicode_FromFormat("invalid octal escape sequence '\\%.3s'",
      24                                 first_invalid_escape)
      25          : PyUnicode_FromFormat("invalid escape sequence '\\%c'", c);
      26      if (msg == NULL) {
      27          return -1;
      28      }
      29      PyObject *category;
      30      if (p->feature_version >= 12) {
      31          category = PyExc_SyntaxWarning;
      32      }
      33      else {
      34          category = PyExc_DeprecationWarning;
      35      }
      36      if (PyErr_WarnExplicitObject(category, msg, p->tok->filename,
      37                                   t->lineno, NULL, NULL) < 0) {
      38          if (PyErr_ExceptionMatches(category)) {
      39              /* Replace the Syntax/DeprecationWarning exception with a SyntaxError
      40                 to get a more accurate error report */
      41              PyErr_Clear();
      42  
      43              /* This is needed, in order for the SyntaxError to point to the token t,
      44                 since _PyPegen_raise_error uses p->tokens[p->fill - 1] for the
      45                 error location, if p->known_err_token is not set. */
      46              p->known_err_token = t;
      47              if (octal) {
      48                  RAISE_SYNTAX_ERROR("invalid octal escape sequence '\\%.3s'",
      49                                     first_invalid_escape);
      50              }
      51              else {
      52                  RAISE_SYNTAX_ERROR("invalid escape sequence '\\%c'", c);
      53              }
      54          }
      55          Py_DECREF(msg);
      56          return -1;
      57      }
      58      Py_DECREF(msg);
      59      return 0;
      60  }
      61  
      62  static PyObject *
      63  decode_utf8(const char **sPtr, const char *end)
      64  {
      65      const char *s;
      66      const char *t;
      67      t = s = *sPtr;
      68      while (s < end && (*s & 0x80)) {
      69          s++;
      70      }
      71      *sPtr = s;
      72      return PyUnicode_DecodeUTF8(t, s - t, NULL);
      73  }
      74  
      75  static PyObject *
      76  decode_unicode_with_escapes(Parser *parser, const char *s, size_t len, Token *t)
      77  {
      78      PyObject *v;
      79      PyObject *u;
      80      char *buf;
      81      char *p;
      82      const char *end;
      83  
      84      /* check for integer overflow */
      85      if (len > SIZE_MAX / 6) {
      86          return NULL;
      87      }
      88      /* "ä" (2 bytes) may become "\U000000E4" (10 bytes), or 1:5
      89         "\ä" (3 bytes) may become "\u005c\U000000E4" (16 bytes), or ~1:6 */
      90      u = PyBytes_FromStringAndSize((char *)NULL, len * 6);
      91      if (u == NULL) {
      92          return NULL;
      93      }
      94      p = buf = PyBytes_AsString(u);
      95      if (p == NULL) {
      96          return NULL;
      97      }
      98      end = s + len;
      99      while (s < end) {
     100          if (*s == '\\') {
     101              *p++ = *s++;
     102              if (s >= end || *s & 0x80) {
     103                  strcpy(p, "u005c");
     104                  p += 5;
     105                  if (s >= end) {
     106                      break;
     107                  }
     108              }
     109          }
     110          if (*s & 0x80) {
     111              PyObject *w;
     112              int kind;
     113              const void *data;
     114              Py_ssize_t w_len;
     115              Py_ssize_t i;
     116              w = decode_utf8(&s, end);
     117              if (w == NULL) {
     118                  Py_DECREF(u);
     119                  return NULL;
     120              }
     121              kind = PyUnicode_KIND(w);
     122              data = PyUnicode_DATA(w);
     123              w_len = PyUnicode_GET_LENGTH(w);
     124              for (i = 0; i < w_len; i++) {
     125                  Py_UCS4 chr = PyUnicode_READ(kind, data, i);
     126                  sprintf(p, "\\U%08x", chr);
     127                  p += 10;
     128              }
     129              /* Should be impossible to overflow */
     130              assert(p - buf <= PyBytes_GET_SIZE(u));
     131              Py_DECREF(w);
     132          }
     133          else {
     134              *p++ = *s++;
     135          }
     136      }
     137      len = p - buf;
     138      s = buf;
     139  
     140      const char *first_invalid_escape;
     141      v = _PyUnicode_DecodeUnicodeEscapeInternal(s, len, NULL, NULL, &first_invalid_escape);
     142  
     143      // HACK: later we can simply pass the line no, since we don't preserve the tokens
     144      // when we are decoding the string but we preserve the line numbers.
     145      if (v != NULL && first_invalid_escape != NULL && t != NULL) {
     146          if (warn_invalid_escape_sequence(parser, first_invalid_escape, t) < 0) {
     147              /* We have not decref u before because first_invalid_escape points
     148                 inside u. */
     149              Py_XDECREF(u);
     150              Py_DECREF(v);
     151              return NULL;
     152          }
     153      }
     154      Py_XDECREF(u);
     155      return v;
     156  }
     157  
     158  static PyObject *
     159  decode_bytes_with_escapes(Parser *p, const char *s, Py_ssize_t len, Token *t)
     160  {
     161      const char *first_invalid_escape;
     162      PyObject *result = _PyBytes_DecodeEscape(s, len, NULL, &first_invalid_escape);
     163      if (result == NULL) {
     164          return NULL;
     165      }
     166  
     167      if (first_invalid_escape != NULL) {
     168          if (warn_invalid_escape_sequence(p, first_invalid_escape, t) < 0) {
     169              Py_DECREF(result);
     170              return NULL;
     171          }
     172      }
     173      return result;
     174  }
     175  
     176  PyObject *
     177  _PyPegen_decode_string(Parser *p, int raw, const char *s, size_t len, Token *t)
     178  {
     179      if (raw) {
     180          return PyUnicode_DecodeUTF8Stateful(s, len, NULL, NULL);
     181      }
     182      return decode_unicode_with_escapes(p, s, len, t);
     183  }
     184  
     185  /* s must include the bracketing quote characters, and r, b &/or f prefixes
     186      (if any), and embedded escape sequences (if any). (f-strings are handled by the parser)
     187     _PyPegen_parse_string parses it, and returns the decoded Python string object. */
     188  PyObject *
     189  _PyPegen_parse_string(Parser *p, Token *t)
     190  {
     191      const char *s = PyBytes_AsString(t->bytes);
     192      if (s == NULL) {
     193          return NULL;
     194      }
     195  
     196      size_t len;
     197      int quote = Py_CHARMASK(*s);
     198      int bytesmode = 0;
     199      int rawmode = 0;
     200  
     201      if (Py_ISALPHA(quote)) {
     202          while (!bytesmode || !rawmode) {
     203              if (quote == 'b' || quote == 'B') {
     204                  quote =(unsigned char)*++s;
     205                  bytesmode = 1;
     206              }
     207              else if (quote == 'u' || quote == 'U') {
     208                  quote = (unsigned char)*++s;
     209              }
     210              else if (quote == 'r' || quote == 'R') {
     211                  quote = (unsigned char)*++s;
     212                  rawmode = 1;
     213              }
     214              else {
     215                  break;
     216              }
     217          }
     218      }
     219  
     220      if (quote != '\'' && quote != '\"') {
     221          PyErr_BadInternalCall();
     222          return NULL;
     223      }
     224      /* Skip the leading quote char. */
     225      s++;
     226      len = strlen(s);
     227      if (len > INT_MAX) {
     228          PyErr_SetString(PyExc_OverflowError, "string to parse is too long");
     229          return NULL;
     230      }
     231      if (s[--len] != quote) {
     232          /* Last quote char must match the first. */
     233          PyErr_BadInternalCall();
     234          return NULL;
     235      }
     236      if (len >= 4 && s[0] == quote && s[1] == quote) {
     237          /* A triple quoted string. We've already skipped one quote at
     238             the start and one at the end of the string. Now skip the
     239             two at the start. */
     240          s += 2;
     241          len -= 2;
     242          /* And check that the last two match. */
     243          if (s[--len] != quote || s[--len] != quote) {
     244              PyErr_BadInternalCall();
     245              return NULL;
     246          }
     247      }
     248  
     249      /* Avoid invoking escape decoding routines if possible. */
     250      rawmode = rawmode || strchr(s, '\\') == NULL;
     251      if (bytesmode) {
     252          /* Disallow non-ASCII characters. */
     253          const char *ch;
     254          for (ch = s; *ch; ch++) {
     255              if (Py_CHARMASK(*ch) >= 0x80) {
     256                  RAISE_SYNTAX_ERROR_KNOWN_LOCATION(
     257                                     t,
     258                                     "bytes can only contain ASCII "
     259                                     "literal characters");
     260                  return NULL;
     261              }
     262          }
     263          if (rawmode) {
     264              return PyBytes_FromStringAndSize(s, len);
     265          }
     266          return decode_bytes_with_escapes(p, s, len, t);
     267      }
     268      return _PyPegen_decode_string(p, rawmode, s, len, t);
     269  }