1 /* Python brace format strings.
2 Copyright (C) 2004, 2006-2007, 2013-2014, 2016, 2019, 2023 Free Software Foundation, Inc.
3 Written by Daiki Ueno <ueno@gnu.org>, 2013.
4
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3 of the License, or
8 (at your option) any later version.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with this program. If not, see <https://www.gnu.org/licenses/>. */
17
18 #ifdef HAVE_CONFIG_H
19 # include <config.h>
20 #endif
21
22 #include <stdbool.h>
23 #include <stdlib.h>
24 #include <string.h>
25
26 #include "format.h"
27 #include "c-ctype.h"
28 #include "xalloc.h"
29 #include "xvasprintf.h"
30 #include "format-invalid.h"
31 #include "gettext.h"
32
/* Shorthand used around every user-visible message in this file: passes STR
   to gettext.  */
#define _(str) gettext (str)
34
35 /* Python brace format strings are defined by PEP3101 together with the
36 'format' method of the string class.
37 Documentation:
38 https://peps.python.org/pep-3101/
39 https://docs.python.org/3/library/string.html#formatstrings
40 A format string directive here consists of
41 - an opening brace '{',
42 - an identifier [_A-Za-z][_0-9A-Za-z]*|[0-9]+,
43 - an optional sequence of
44 - getattr ('.' identifier) or
45 - getitem ('[' identifier ']')
46 operators,
47 - optionally, a ':' and a format specifier, where a format specifier is
48 - either a format directive of the form '{' ... '}' without a format
49 specifier, or
50 - of the form [[fill]align][sign][#][0][minimumwidth][.precision][type]
51 where
52 - the fill character is any character,
53 - the align flag is one of '<', '>', '=', '^',
54 - the sign is one of '+', '-', ' ',
55 - the # flag is '#',
56 - the 0 flag is '0',
57 - minimumwidth is a non-empty sequence of digits,
58 - precision is a non-empty sequence of digits,
59 - type is one of
60 - 'b', 'c', 'd', 'o', 'x', 'X', 'n' for integers,
61 - 'e', 'E', 'f', 'F', 'g', 'G', 'n', '%' for floating-point values,
62 - a closing brace '}'.
63 Brace characters '{' and '}' can be escaped by doubling them: '{{' and '}}'.
64 */
65
/* One argument reference collected from a format string.  */
struct named_arg
{
  /* Heap-allocated, NUL-terminated copy of the directive text, taken by
     parse_directive from just after the opening '{' up to, but not
     including, the closing '}'.  */
  char *name;
};
70
/* Result of parsing one format string.  */
struct spec
{
  /* Number of top-level directives parsed successfully so far.  */
  unsigned int directives;
  /* Number of entries of NAMED that are in use.  */
  unsigned int named_arg_count;
  /* Number of entries NAMED currently has room for.  */
  unsigned int allocated;
  /* Array of collected arguments, or NULL while nothing has been collected.
     format_parse sorts it and removes duplicates before returning.  */
  struct named_arg *named;
};
78
79
/* Forward declaration of local functions.  */
/* Frees every name stored in SPEC->named and the array itself; SPEC is not
   freed.  */
static void free_named_args (struct spec *spec);
82
83
84 /* All the parse_* functions (except parse_upto) follow the same
85 calling convention. FORMATP shall point to the beginning of a token.
86 If parsing succeeds, FORMATP will point to the next character after
87 the token, and true is returned. Otherwise, FORMATP will be
88 unchanged and false is returned. */
89
/* Recognizes an identifier of the form [_A-Za-z][_0-9A-Za-z]* at *FORMATP.
   On a match, moves *FORMATP just past the identifier and returns true;
   otherwise leaves *FORMATP alone and returns false.
   SPEC, TRANSLATED, FDI and INVALID_REASON are not used here.  */
static bool
parse_named_field (struct spec *spec,
                   const char **formatp, bool translated, char *fdi,
                   char **invalid_reason)
{
  const char *cursor = *formatp;

  /* The first character must be a letter or an underscore.  */
  if (!((*cursor >= 'A' && *cursor <= 'Z')
        || (*cursor >= 'a' && *cursor <= 'z')
        || *cursor == '_'))
    return false;

  /* Every following character may additionally be a digit.  */
  for (cursor++;
       (*cursor >= 'A' && *cursor <= 'Z')
       || (*cursor >= 'a' && *cursor <= 'z')
       || (*cursor >= '0' && *cursor <= '9')
       || *cursor == '_';
       cursor++)
    ;

  *formatp = cursor;
  return true;
}
110
/* Recognizes a non-empty run of ASCII digits at *FORMATP.
   On a match, moves *FORMATP just past the digits and returns true;
   otherwise leaves *FORMATP alone and returns false.
   SPEC, TRANSLATED, FDI and INVALID_REASON are not used here.  */
static bool
parse_numeric_field (struct spec *spec,
                     const char **formatp, bool translated, char *fdi,
                     char **invalid_reason)
{
  const char *cursor = *formatp;

  /* At least one digit is required.  */
  if (*cursor < '0' || *cursor > '9')
    return false;

  for (cursor++; *cursor >= '0' && *cursor <= '9'; cursor++)
    ;

  *formatp = cursor;
  return true;
}
130
131 /* Parses a directive.
132 When this function is invoked, *formatp points to the start of the directive,
133 i.e. to the '{' character.
134 When this function returns true, *formatp points to the first character after
135 the directive, i.e. in most cases to the character after the '}' character.
136 */
137 static bool
138 parse_directive (struct spec *spec,
139 const char **formatp, bool is_toplevel,
140 bool translated, char *fdi, char **invalid_reason)
141 {
142 const char *format = *formatp;
143 const char *const format_start = format;
144 const char *name_start;
145 char c;
146
147 c = *++format;
148 if (c == '{')
149 {
150 /* An escaped '{'. */
151 *formatp = ++format;
152 return true;
153 }
154
155 name_start = format;
156 if (!parse_named_field (spec, &format, translated, fdi, invalid_reason)
157 && !parse_numeric_field (spec, &format, translated, fdi, invalid_reason))
158 {
159 *invalid_reason =
160 xasprintf (_("In the directive number %u, '%c' cannot start a field name."),
161 spec->directives, *format);
162 FDI_SET (format, FMTDIR_ERROR);
163 return false;
164 }
165
166 /* Parse '.' (getattr) or '[..]' (getitem) operators followed by a
167 name. If must not recurse, but can be specifed in a chain, such
168 as "foo.bar.baz[0]". */
169 for (;;)
170 {
171 c = *format;
172
173 if (c == '.')
174 {
175 format++;
176 if (!parse_named_field (spec, &format, translated, fdi,
177 invalid_reason))
178 {
179 *invalid_reason =
180 xasprintf (_("In the directive number %u, '%c' cannot start a getattr argument."),
181 spec->directives, *format);
182 FDI_SET (format, FMTDIR_ERROR);
183 return false;
184 }
185 }
186 else if (c == '[')
187 {
188 format++;
189 if (!parse_named_field (spec, &format, translated, fdi,
190 invalid_reason)
191 && !parse_numeric_field (spec, &format, translated, fdi,
192 invalid_reason))
193 {
194 *invalid_reason =
195 xasprintf (_("In the directive number %u, '%c' cannot start a getitem argument."),
196 spec->directives, *format);
197 FDI_SET (format, FMTDIR_ERROR);
198 return false;
199 }
200
201 if (*format != ']')
202 {
203 *invalid_reason =
204 xasprintf (_("In the directive number %u, there is an unterminated getitem argument."),
205 spec->directives);
206 FDI_SET (format, FMTDIR_ERROR);
207 return false;
208 }
209 format++;
210 }
211 else
212 break;
213 }
214
215 /* Here c == *format. */
216 if (c == ':')
217 {
218 if (!is_toplevel)
219 {
220 *invalid_reason =
221 xasprintf (_("In the directive number %u, no more nesting is allowed in a format specifier."),
222 spec->directives);
223 FDI_SET (format, FMTDIR_ERROR);
224 return false;
225 }
226
227 format++;
228
229 /* Format specifiers. Although a format specifier can be any
230 string in theory, we can only recognize two types of format
231 specifiers below, because otherwise we would need to evaluate
232 Python expressions by ourselves:
233
234 - A nested format directive expanding to an argument
235 - The Standard Format Specifiers, as described in PEP3101,
236 not including a nested format directive */
237 if (*format == '{')
238 {
239 /* Nested format directive. */
240 if (!parse_directive (spec, &format, false, translated, fdi,
241 invalid_reason))
242 {
243 /* FDI and INVALID_REASON will be set by a recursive call of
244 parse_directive. */
245 return false;
246 }
247 }
248 else
249 {
250 /* Standard format specifiers is in the form:
251 [[fill]align][sign][#][0][minimumwidth][.precision][type] */
252
253 /* Look ahead two characters to skip [[fill]align]. */
254 int c1, c2;
255
256 c1 = format[0];
257 if (c1 == '\0')
258 {
259 *invalid_reason =
260 xasprintf (_("In the directive number %u, there is an unterminated format directive."),
261 spec->directives);
262 FDI_SET (format, FMTDIR_ERROR);
263 return false;
264 }
265
266 c2 = format[1];
267
268 if (c2 == '<' || c2 == '>' || c2 == '=' || c2 == '^')
269 format += 2;
270 else if (c1 == '<' || c1 == '>' || c1 == '=' || c1 == '^')
271 format++;
272
273 if (*format == '+' || *format == '-' || *format == ' ')
274 format++;
275 if (*format == '#')
276 format++;
277 if (*format == '0')
278 format++;
279
280 /* Parse the optional minimumwidth. */
281 while (c_isdigit (*format))
282 format++;
283
284 /* Parse the optional .precision. */
285 if (*format == '.')
286 {
287 format++;
288 if (c_isdigit (*format))
289 do
290 format++;
291 while (c_isdigit (*format));
292 else
293 format--;
294 }
295
296 switch (*format)
297 {
298 case 'b': case 'c': case 'd': case 'o': case 'x': case 'X':
299 case 'n':
300 case 'e': case 'E': case 'f': case 'F': case 'g': case 'G':
301 case '%':
302 format++;
303 break;
304 default:
305 break;
306 }
307 }
308 }
309
310 if (*format != '}')
311 {
312 *invalid_reason =
313 xasprintf (_("In the directive number %u, there is an unterminated format directive."),
314 spec->directives);
315 FDI_SET (format, FMTDIR_ERROR);
316 return false;
317 }
318
319 if (is_toplevel)
320 {
321 char *name;
322 size_t n = format - name_start;
323
324 FDI_SET (name_start - 1, FMTDIR_START);
325
326 name = XNMALLOC (n + 1, char);
327 memcpy (name, name_start, n);
328 name[n] = '\0';
329
330 spec->directives++;
331
332 if (spec->allocated == spec->named_arg_count)
333 {
334 spec->allocated = 2 * spec->allocated + 1;
335 spec->named = (struct named_arg *) xrealloc (spec->named, spec->allocated * sizeof (struct named_arg));
336 }
337 spec->named[spec->named_arg_count].name = name;
338 spec->named_arg_count++;
339
340 FDI_SET (format, FMTDIR_END);
341 }
342
343 *formatp = ++format;
344 return true;
345 }
346
/* Scans the text at *FORMATP until TERMINATOR or the end of the string,
   handing every '{' to parse_directive (with IS_TOPLEVEL, TRANSLATED, FDI
   and INVALID_REASON passed through) and stepping over all other
   characters.
   Returns true with *FORMATP at the stopping position, or false with
   *FORMATP unchanged as soon as parse_directive fails.  */
static bool
parse_upto (struct spec *spec,
            const char **formatp, bool is_toplevel, char terminator,
            bool translated, char *fdi, char **invalid_reason)
{
  const char *cursor = *formatp;

  while (*cursor != terminator && *cursor != '\0')
    {
      if (*cursor != '{')
        {
          /* Ordinary character.  */
          cursor++;
          continue;
        }

      /* parse_directive advances CURSOR past the directive on success.  */
      if (!parse_directive (spec, &cursor, is_toplevel, translated, fdi,
                            invalid_reason))
        return false;
    }

  *formatp = cursor;
  return true;
}
369
370 static int
371 named_arg_compare (const void *p1, const void *p2)
372 {
373 return strcmp (((const struct named_arg *) p1)->name,
374 ((const struct named_arg *) p2)->name);
375 }
376
377 static void *
378 format_parse (const char *format, bool translated, char *fdi,
379 char **invalid_reason)
380 {
381 struct spec spec;
382 struct spec *result;
383
384 spec.directives = 0;
385 spec.named_arg_count = 0;
386 spec.allocated = 0;
387 spec.named = NULL;
388
389 if (!parse_upto (&spec, &format, true, '\0', translated, fdi, invalid_reason))
390 {
391 free_named_args (&spec);
392 return NULL;
393 }
394
395 /* Sort the named argument array, and eliminate duplicates. */
396 if (spec.named_arg_count > 1)
397 {
398 unsigned int i, j;
399
400 qsort (spec.named, spec.named_arg_count, sizeof (struct named_arg),
401 named_arg_compare);
402
403 /* Remove duplicates: Copy from i to j, keeping 0 <= j <= i. */
404 for (i = j = 0; i < spec.named_arg_count; i++)
405 if (j > 0 && strcmp (spec.named[i].name, spec.named[j-1].name) == 0)
406 free (spec.named[i].name);
407 else
408 {
409 if (j < i)
410 spec.named[j].name = spec.named[i].name;
411 j++;
412 }
413 spec.named_arg_count = j;
414 }
415
416 result = XMALLOC (struct spec);
417 *result = spec;
418 return result;
419 }
420
421 static void
422 free_named_args (struct spec *spec)
423 {
424 if (spec->named != NULL)
425 {
426 unsigned int i;
427 for (i = 0; i < spec->named_arg_count; i++)
428 free (spec->named[i].name);
429 free (spec->named);
430 }
431 }
432
/* Releases a descriptor returned by format_parse: first the argument names
   it owns, then the 'struct spec' itself.  */
static void
format_free (void *descr)
{
  free_named_args ((struct spec *) descr);
  free (descr);
}
441
442 static int
443 format_get_number_of_directives (void *descr)
444 {
445 struct spec *spec = (struct spec *) descr;
446
447 return spec->directives;
448 }
449
450 static bool
451 format_check (void *msgid_descr, void *msgstr_descr, bool equality,
452 formatstring_error_logger_t error_logger,
453 const char *pretty_msgid, const char *pretty_msgstr)
454 {
455 struct spec *spec1 = (struct spec *) msgid_descr;
456 struct spec *spec2 = (struct spec *) msgstr_descr;
457 bool err = false;
458
459 if (spec1->named_arg_count + spec2->named_arg_count > 0)
460 {
461 unsigned int i, j;
462 unsigned int n1 = spec1->named_arg_count;
463 unsigned int n2 = spec2->named_arg_count;
464
465 /* Check the argument names in spec1 are contained in those of spec2.
466 Both arrays are sorted. We search for the differences. */
467 for (i = 0, j = 0; i < n1 || j < n2; )
468 {
469 int cmp = (i >= n1 ? 1 :
470 j >= n2 ? -1 :
471 strcmp (spec1->named[i].name, spec2->named[j].name));
472
473 if (cmp > 0)
474 {
475 if (equality)
476 {
477 if (error_logger)
478 error_logger (_("a format specification for argument '%s' doesn't exist in '%s'"),
479 spec2->named[i].name, pretty_msgid);
480 err = true;
481 break;
482 }
483 else
484 j++;
485 }
486 else if (cmp < 0)
487 {
488 if (equality)
489 {
490 if (error_logger)
491 error_logger (_("a format specification for argument '%s' doesn't exist in '%s'"),
492 spec1->named[i].name, pretty_msgstr);
493 err = true;
494 break;
495 }
496 else
497 i++;
498 }
499 else
500 j++, i++;
501 }
502 }
503
504 return err;
505 }
506
507
/* Table of the entry points of this parser.  It is the only symbol of this
   file with external linkage.  The initializer is positional, so the order
   of the entries must match the member order of 'struct formatstring_parser'.  */
struct formatstring_parser formatstring_python_brace =
{
  format_parse,
  format_free,
  format_get_number_of_directives,
  /* NOTE(review): this slot is left empty; its meaning is given by the
     declaration of struct formatstring_parser (presumably in "format.h")
     -- confirm there.  */
  NULL,
  format_check
};
516
517
518 #ifdef TEST
519
520 /* Test program: Print the argument list specification returned by
521 format_parse for strings read from standard input. */
522
523 #include <stdio.h>
524
525 static void
526 format_print (void *descr)
527 {
528 struct spec *spec = (struct spec *) descr;
529 unsigned int i;
530
531 if (spec == NULL)
532 {
533 printf ("INVALID");
534 return;
535 }
536
537 printf ("{");
538 for (i = 0; i < spec->named_arg_count; i++)
539 {
540 if (i > 0)
541 printf (", ");
542 printf ("'%s'", spec->named[i].name);
543 }
544 printf ("}");
545 }
546
547 int
548 main ()
549 {
550 for (;;)
551 {
552 char *line = NULL;
553 size_t line_size = 0;
554 int line_len;
555 char *invalid_reason;
556 void *descr;
557
558 line_len = getline (&line, &line_size, stdin);
559 if (line_len < 0)
560 break;
561 if (line_len > 0 && line[line_len - 1] == '\n')
562 line[--line_len] = '\0';
563
564 invalid_reason = NULL;
565 descr = format_parse (line, false, NULL, &invalid_reason);
566
567 format_print (descr);
568 printf ("\n");
569 if (descr == NULL)
570 printf ("%s\n", invalid_reason);
571
572 free (invalid_reason);
573 free (line);
574 }
575
576 return 0;
577 }
578
579 /*
580 * For Emacs M-x compile
581 * Local Variables:
582 * compile-command: "/bin/sh ../libtool --tag=CC --mode=link gcc -o a.out -static -O -g -Wall -I.. -I../gnulib-lib -I../../gettext-runtime/intl -DHAVE_CONFIG_H -DTEST format-python-brace.c ../gnulib-lib/libgettextlib.la"
583 * End:
584 */
585
586 #endif /* TEST */