1 #include <stdbool.h>
2
3 #include <Python.h>
4
5 #include "tokenizer.h"
6 #include "pegen.h"
7 #include "string_parser.h"
8
9 //// STRING HANDLING FUNCTIONS ////
10
11 static int
12 warn_invalid_escape_sequence(Parser *p, const char *first_invalid_escape, Token *t)
13 {
14 unsigned char c = *first_invalid_escape;
15 if ((t->type == FSTRING_MIDDLE || t->type == FSTRING_END) && (c == '{' || c == '}')) { // in this case the tokenizer has already emitted a warning,
16 // see tokenizer.c:warn_invalid_escape_sequence
17 return 0;
18 }
19
20 int octal = ('4' <= c && c <= '7');
21 PyObject *msg =
22 octal
23 ? PyUnicode_FromFormat("invalid octal escape sequence '\\%.3s'",
24 first_invalid_escape)
25 : PyUnicode_FromFormat("invalid escape sequence '\\%c'", c);
26 if (msg == NULL) {
27 return -1;
28 }
29 PyObject *category;
30 if (p->feature_version >= 12) {
31 category = PyExc_SyntaxWarning;
32 }
33 else {
34 category = PyExc_DeprecationWarning;
35 }
36 if (PyErr_WarnExplicitObject(category, msg, p->tok->filename,
37 t->lineno, NULL, NULL) < 0) {
38 if (PyErr_ExceptionMatches(category)) {
39 /* Replace the Syntax/DeprecationWarning exception with a SyntaxError
40 to get a more accurate error report */
41 PyErr_Clear();
42
43 /* This is needed, in order for the SyntaxError to point to the token t,
44 since _PyPegen_raise_error uses p->tokens[p->fill - 1] for the
45 error location, if p->known_err_token is not set. */
46 p->known_err_token = t;
47 if (octal) {
48 RAISE_SYNTAX_ERROR("invalid octal escape sequence '\\%.3s'",
49 first_invalid_escape);
50 }
51 else {
52 RAISE_SYNTAX_ERROR("invalid escape sequence '\\%c'", c);
53 }
54 }
55 Py_DECREF(msg);
56 return -1;
57 }
58 Py_DECREF(msg);
59 return 0;
60 }
61
62 static PyObject *
63 decode_utf8(const char **sPtr, const char *end)
64 {
65 const char *s;
66 const char *t;
67 t = s = *sPtr;
68 while (s < end && (*s & 0x80)) {
69 s++;
70 }
71 *sPtr = s;
72 return PyUnicode_DecodeUTF8(t, s - t, NULL);
73 }
74
75 static PyObject *
76 decode_unicode_with_escapes(Parser *parser, const char *s, size_t len, Token *t)
77 {
78 PyObject *v;
79 PyObject *u;
80 char *buf;
81 char *p;
82 const char *end;
83
84 /* check for integer overflow */
85 if (len > SIZE_MAX / 6) {
86 return NULL;
87 }
88 /* "ä" (2 bytes) may become "\U000000E4" (10 bytes), or 1:5
89 "\ä" (3 bytes) may become "\u005c\U000000E4" (16 bytes), or ~1:6 */
90 u = PyBytes_FromStringAndSize((char *)NULL, len * 6);
91 if (u == NULL) {
92 return NULL;
93 }
94 p = buf = PyBytes_AsString(u);
95 if (p == NULL) {
96 return NULL;
97 }
98 end = s + len;
99 while (s < end) {
100 if (*s == '\\') {
101 *p++ = *s++;
102 if (s >= end || *s & 0x80) {
103 strcpy(p, "u005c");
104 p += 5;
105 if (s >= end) {
106 break;
107 }
108 }
109 }
110 if (*s & 0x80) {
111 PyObject *w;
112 int kind;
113 const void *data;
114 Py_ssize_t w_len;
115 Py_ssize_t i;
116 w = decode_utf8(&s, end);
117 if (w == NULL) {
118 Py_DECREF(u);
119 return NULL;
120 }
121 kind = PyUnicode_KIND(w);
122 data = PyUnicode_DATA(w);
123 w_len = PyUnicode_GET_LENGTH(w);
124 for (i = 0; i < w_len; i++) {
125 Py_UCS4 chr = PyUnicode_READ(kind, data, i);
126 sprintf(p, "\\U%08x", chr);
127 p += 10;
128 }
129 /* Should be impossible to overflow */
130 assert(p - buf <= PyBytes_GET_SIZE(u));
131 Py_DECREF(w);
132 }
133 else {
134 *p++ = *s++;
135 }
136 }
137 len = p - buf;
138 s = buf;
139
140 const char *first_invalid_escape;
141 v = _PyUnicode_DecodeUnicodeEscapeInternal(s, len, NULL, NULL, &first_invalid_escape);
142
143 // HACK: later we can simply pass the line no, since we don't preserve the tokens
144 // when we are decoding the string but we preserve the line numbers.
145 if (v != NULL && first_invalid_escape != NULL && t != NULL) {
146 if (warn_invalid_escape_sequence(parser, first_invalid_escape, t) < 0) {
147 /* We have not decref u before because first_invalid_escape points
148 inside u. */
149 Py_XDECREF(u);
150 Py_DECREF(v);
151 return NULL;
152 }
153 }
154 Py_XDECREF(u);
155 return v;
156 }
157
158 static PyObject *
159 decode_bytes_with_escapes(Parser *p, const char *s, Py_ssize_t len, Token *t)
160 {
161 const char *first_invalid_escape;
162 PyObject *result = _PyBytes_DecodeEscape(s, len, NULL, &first_invalid_escape);
163 if (result == NULL) {
164 return NULL;
165 }
166
167 if (first_invalid_escape != NULL) {
168 if (warn_invalid_escape_sequence(p, first_invalid_escape, t) < 0) {
169 Py_DECREF(result);
170 return NULL;
171 }
172 }
173 return result;
174 }
175
176 PyObject *
177 _PyPegen_decode_string(Parser *p, int raw, const char *s, size_t len, Token *t)
178 {
179 if (raw) {
180 return PyUnicode_DecodeUTF8Stateful(s, len, NULL, NULL);
181 }
182 return decode_unicode_with_escapes(p, s, len, t);
183 }
184
185 /* s must include the bracketing quote characters, and r, b &/or f prefixes
186 (if any), and embedded escape sequences (if any). (f-strings are handled by the parser)
187 _PyPegen_parse_string parses it, and returns the decoded Python string object. */
188 PyObject *
189 _PyPegen_parse_string(Parser *p, Token *t)
190 {
191 const char *s = PyBytes_AsString(t->bytes);
192 if (s == NULL) {
193 return NULL;
194 }
195
196 size_t len;
197 int quote = Py_CHARMASK(*s);
198 int bytesmode = 0;
199 int rawmode = 0;
200
201 if (Py_ISALPHA(quote)) {
202 while (!bytesmode || !rawmode) {
203 if (quote == 'b' || quote == 'B') {
204 quote =(unsigned char)*++s;
205 bytesmode = 1;
206 }
207 else if (quote == 'u' || quote == 'U') {
208 quote = (unsigned char)*++s;
209 }
210 else if (quote == 'r' || quote == 'R') {
211 quote = (unsigned char)*++s;
212 rawmode = 1;
213 }
214 else {
215 break;
216 }
217 }
218 }
219
220 if (quote != '\'' && quote != '\"') {
221 PyErr_BadInternalCall();
222 return NULL;
223 }
224 /* Skip the leading quote char. */
225 s++;
226 len = strlen(s);
227 if (len > INT_MAX) {
228 PyErr_SetString(PyExc_OverflowError, "string to parse is too long");
229 return NULL;
230 }
231 if (s[--len] != quote) {
232 /* Last quote char must match the first. */
233 PyErr_BadInternalCall();
234 return NULL;
235 }
236 if (len >= 4 && s[0] == quote && s[1] == quote) {
237 /* A triple quoted string. We've already skipped one quote at
238 the start and one at the end of the string. Now skip the
239 two at the start. */
240 s += 2;
241 len -= 2;
242 /* And check that the last two match. */
243 if (s[--len] != quote || s[--len] != quote) {
244 PyErr_BadInternalCall();
245 return NULL;
246 }
247 }
248
249 /* Avoid invoking escape decoding routines if possible. */
250 rawmode = rawmode || strchr(s, '\\') == NULL;
251 if (bytesmode) {
252 /* Disallow non-ASCII characters. */
253 const char *ch;
254 for (ch = s; *ch; ch++) {
255 if (Py_CHARMASK(*ch) >= 0x80) {
256 RAISE_SYNTAX_ERROR_KNOWN_LOCATION(
257 t,
258 "bytes can only contain ASCII "
259 "literal characters");
260 return NULL;
261 }
262 }
263 if (rawmode) {
264 return PyBytes_FromStringAndSize(s, len);
265 }
266 return decode_bytes_with_escapes(p, s, len, t);
267 }
268 return _PyPegen_decode_string(p, rawmode, s, len, t);
269 }