1 /* xgettext Vala backend.
2 Copyright (C) 2013-2014, 2018-2023 Free Software Foundation, Inc.
3
4 This file was written by Daiki Ueno <ueno@gnu.org>, 2013.
5
6 This program is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
10
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with this program. If not, see <https://www.gnu.org/licenses/>. */
18
19 #ifdef HAVE_CONFIG_H
20 # include "config.h"
21 #endif
22
23 /* Specification. */
24 #include "x-vala.h"
25
26 #include <assert.h>
27 #include <errno.h>
28 #include <stdbool.h>
29 #include <stdio.h>
30 #include <stdlib.h>
31 #include <string.h>
32
33 #include "attribute.h"
34 #include "message.h"
35 #include "rc-str-list.h"
36 #include "xgettext.h"
37 #include "xg-pos.h"
38 #include "xg-encoding.h"
39 #include "xg-mixed-string.h"
40 #include "xg-arglist-context.h"
41 #include "xg-arglist-callshape.h"
42 #include "xg-arglist-parser.h"
43 #include "xg-message.h"
44 #include "error.h"
45 #include "error-progname.h"
46 #include "xalloc.h"
47 #include "xvasprintf.h"
48 #include "mem-hash-map.h"
49 #include "po-charset.h"
50 #include "gettext.h"
51
/* Shorthand for looking up the translation of the message S.  */
#define _(s) gettext(s)

/* Number of elements of the array A.  A must be a true array, not a
   pointer.  The argument is parenthesized in the expansion so that any
   array-valued expression can be passed safely.  */
#define SIZEOF(a) (sizeof (a) / sizeof ((a)[0]))
55
56 /* The Vala syntax is defined in the Vala Reference Manual
57 https://www.vala-project.org/doc/vala/.
58 See also vala/valascanner.vala. */
59
60 /* ====================== Keyword set customization. ====================== */
61
/* If true extract all strings.  */
static bool extract_all = false;

/* Table of the keywords whose arguments are extracted.  It is filled
   through x_vala_keyword / add_keyword and consulted by
   extract_balanced for every symbol token.  */
static hash_table keywords;
/* True as long as the built-in keywords still have to be installed by
   init_keywords.  Cleared by add_keyword (NULL, ...) and by
   init_keywords itself once the built-ins have been added.  */
static bool default_keywords = true;
67
68
/* Request that every string literal be extracted, not only those that
   appear as arguments of a keyword (see the extract_all test in
   extract_balanced).  */
void
x_vala_extract_all ()
{
  extract_all = true;
}
74
75
76 static void
77 add_keyword (const char *name, hash_table *keywords)
78 {
79 if (name == NULL)
80 default_keywords = false;
81 else
82 {
83 const char *end;
84 struct callshape shape;
85 const char *colon;
86
87 if (keywords->table == NULL)
88 hash_init (keywords, 100);
89
90 split_keywordspec (name, &end, &shape);
91
92 /* The characters between name and end should form a valid C identifier.
93 A colon means an invalid parse in split_keywordspec(). */
94 colon = strchr (name, ':');
95 if (colon == NULL || colon >= end)
96 insert_keyword_callshape (keywords, name, end - name, &shape);
97 }
98 }
99
/* Add the keyword specification NAME to the file-level keyword table.
   Passing NULL disables the built-in keywords instead (see
   add_keyword).  */
void
x_vala_keyword (const char *name)
{
  add_keyword (name, &keywords);
}
105
106 static void
107 init_keywords ()
108 {
109 if (default_keywords)
110 {
111 /* When adding new keywords here, also update the documentation in
112 xgettext.texi! */
113 x_vala_keyword ("dgettext:2");
114 x_vala_keyword ("dcgettext:2");
115 x_vala_keyword ("ngettext:1,2");
116 x_vala_keyword ("dngettext:2,3");
117 x_vala_keyword ("dpgettext:2g");
118 x_vala_keyword ("dpgettext2:2c,3");
119 x_vala_keyword ("_");
120 x_vala_keyword ("Q_");
121 x_vala_keyword ("N_");
122 x_vala_keyword ("NC_:1c,2");
123
124 default_keywords = false;
125 }
126 }
127
/* Record the format string flags of the functions known for Vala.  */
void
init_flag_table_vala ()
{
  /* Vala leaves string formatting to Glib functions and thus the
     format string is exactly same as C.  See also
     vapi/glib-2.0.vapi.  */
  static const char *const flag_specs[] =
    {
      /* Functions that pass the format property of their argument
         through to their result.  */
      "dgettext:2:pass-c-format!Vala",
      "dcgettext:2:pass-c-format!Vala",
      "ngettext:1:pass-c-format!Vala",
      "ngettext:2:pass-c-format!Vala",
      "dngettext:2:pass-c-format!Vala",
      "dngettext:3:pass-c-format!Vala",
      "dpgettext:2:pass-c-format!Vala",
      "dpgettext2:3:pass-c-format!Vala",
      "_:1:pass-c-format!Vala",
      "Q_:1:pass-c-format!Vala",
      "N_:1:pass-c-format!Vala",
      "NC_:2:pass-c-format!Vala",

      /* Functions whose argument is a format string.  */
      "printf:1:c-format!Vala",
      "vprintf:1:c-format!Vala"
    };
  size_t i;

  for (i = 0; i < sizeof (flag_specs) / sizeof (flag_specs[0]); i++)
    xgettext_record_flag (flag_specs[i]);
}
151
152
153 /* ======================== Reading of characters. ======================== */
154
/* The input file stream.  */
static FILE *fp;


/* 1. line_number handling.  */

/* Capacity of the phase 1 pushback stack.  phase1_ungetc aborts when
   it would be exceeded.  */
#define MAX_PHASE1_PUSHBACK 16
/* Characters pushed back by phase1_ungetc.  The most recently pushed
   character is the first one to be returned by phase1_getc.  */
static unsigned char phase1_pushback[MAX_PHASE1_PUSHBACK];
/* Number of characters currently stored in phase1_pushback.  */
static int phase1_pushback_length;
164
165
166 static int
167 phase1_getc ()
168 {
169 int c;
170
171 if (phase1_pushback_length)
172 c = phase1_pushback[--phase1_pushback_length];
173 else
174 {
175 c = getc (fp);
176 if (c == EOF)
177 {
178 if (ferror (fp))
179 error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
180 real_file_name);
181 return EOF;
182 }
183 }
184
185 if (c == '\n')
186 ++line_number;
187 return c;
188 }
189
190
191 /* Supports 2 characters of pushback. */
192 static void
193 phase1_ungetc (int c)
194 {
195 if (c != EOF)
196 {
197 if (c == '\n')
198 --line_number;
199
200 if (phase1_pushback_length == SIZEOF (phase1_pushback))
201 abort ();
202 phase1_pushback[phase1_pushback_length++] = c;
203 }
204 }
205
206
/* These are for tracking whether comments count as immediately before
   keyword.  */
/* Value of line_number when the most recent comment ended (set by
   phase2_getc).  */
static int last_comment_line;
/* Line on which the most recent non-whitespace token started (set by
   phase3_get).  */
static int last_non_comment_line;

/* Accumulating comments.  */

/* Buffer holding the text of the comment line being read.  */
static char *buffer;
/* Allocated size of buffer, in bytes.  */
static size_t bufmax;
/* Number of bytes of buffer currently in use.  */
static size_t buflen;
217
/* Start accumulating a new comment line: forget the previous contents
   of the comment buffer (the allocation is kept for reuse).  */
static inline void
comment_start ()
{
  buflen = 0;
}
223
224 static inline void
225 comment_add (int c)
226 {
227 if (buflen >= bufmax)
228 {
229 bufmax = 2 * bufmax + 10;
230 buffer = xrealloc (buffer, bufmax);
231 }
232 buffer[buflen++] = c;
233 }
234
235 static inline void
236 comment_line_end (size_t chars_to_remove)
237 {
238 buflen -= chars_to_remove;
239 while (buflen >= 1
240 && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
241 --buflen;
242 if (chars_to_remove == 0 && buflen >= bufmax)
243 {
244 bufmax = 2 * bufmax + 10;
245 buffer = xrealloc (buffer, bufmax);
246 }
247 buffer[buflen] = '\0';
248 savable_comment_add (buffer);
249 }
250
251
252 /* 2. Replace each comment that is not inside a character constant or
253 string literal with a space character. */
254
255 static int
256 phase2_getc ()
257 {
258 int c;
259 bool last_was_star;
260
261 c = phase1_getc ();
262 if (c != '/')
263 return c;
264 c = phase1_getc ();
265 switch (c)
266 {
267 default:
268 phase1_ungetc (c);
269 return '/';
270
271 case '*':
272 /* C comment. */
273 comment_start ();
274 last_was_star = false;
275 for (;;)
276 {
277 c = phase1_getc ();
278 if (c == EOF)
279 break;
280 /* We skip all leading white space, but not EOLs. */
281 if (!(buflen == 0 && (c == ' ' || c == '\t')))
282 comment_add (c);
283 switch (c)
284 {
285 case '\n':
286 comment_line_end (1);
287 comment_start ();
288 last_was_star = false;
289 continue;
290
291 case '*':
292 last_was_star = true;
293 continue;
294
295 case '/':
296 if (last_was_star)
297 {
298 comment_line_end (2);
299 break;
300 }
301 FALLTHROUGH;
302
303 default:
304 last_was_star = false;
305 continue;
306 }
307 break;
308 }
309 last_comment_line = line_number;
310 return ' ';
311
312 case '/':
313 /* C++ or ISO C 99 comment. */
314 comment_start ();
315 for (;;)
316 {
317 c = phase1_getc ();
318 if (c == '\n' || c == EOF)
319 break;
320 /* We skip all leading white space, but not EOLs. */
321 if (!(buflen == 0 && (c == ' ' || c == '\t')))
322 comment_add (c);
323 }
324 comment_line_end (0);
325 last_comment_line = line_number;
326 return '\n';
327 }
328 }
329
330
/* Push back the character C that was obtained from phase2_getc.  This
   uses the phase 1 pushback stack directly.  */
static void
phase2_ungetc (int c)
{
  phase1_ungetc (c);
}
336
337
338 /* ========================== Reading of tokens. ========================== */
339
/* Kinds of token produced by phase3_get.  The comment next to each
   member shows sample spellings.
   NOTE(review): phase3_get does not follow these lists exactly; for
   instance it reports '%' and '^' as token_type_logic_operator and a
   lone '<' or '>' as token_type_equality_test_operator.  */
enum token_type_ty
{
  token_type_character_constant,        /* 'x' */
  token_type_eof,
  token_type_lparen,                    /* ( */
  token_type_rparen,                    /* ) */
  token_type_lbrace,                    /* { */
  token_type_rbrace,                    /* } */
  token_type_assign,                    /* = += -= *= /= %= <<= >>= &= |= ^= */
  token_type_return,                    /* return */
  token_type_plus,                      /* + */
  token_type_arithmetic_operator,       /* - * / % << >> & | ^ */
  token_type_equality_test_operator,    /* == < > >= <= != */
  token_type_logic_operator,            /* ! && || */
  token_type_comma,                     /* , */
  token_type_question,                  /* ? */
  token_type_colon,                     /* : */
  token_type_number,                    /* 2.7 */
  token_type_string_literal,            /* "abc" */
  token_type_string_template,           /* @"abc" */
  token_type_regex_literal,             /* /.../ */
  token_type_symbol,                    /* if else etc. */
  token_type_other
};
typedef enum token_type_ty token_type_ty;
365
/* A token as returned by phase3_get.  Which of the pointer members are
   set depends on TYPE; the others are left untouched by the lexer.  */
typedef struct token_ty token_ty;
struct token_ty
{
  token_type_ty type;
  char *string;                         /* for token_type_symbol */
  mixed_string_ty *mixed_string;        /* for token_type_string_literal */
  refcounted_string_list_ty *comment;   /* for token_type_string_literal */
  int line_number;                      /* line on which the token was read */
};
375
376 /* Free the memory pointed to by a 'struct token_ty'. */
377 static inline void
378 free_token (token_ty *tp)
379 {
380 if (tp->type == token_type_symbol)
381 free (tp->string);
382 if (tp->type == token_type_string_literal)
383 {
384 mixed_string_free (tp->mixed_string);
385 drop_reference (tp->comment);
386 }
387 }
388
389
/* Return value of phase7_getc when EOF is reached.  */
#define P7_EOF (-1)

/* Return values of phase7_getc for an unescaped double quote, an
   unescaped single quote and an unescaped newline, respectively.  They
   are negative so that they cannot be confused with the character that
   results from an escape sequence such as \" or \n.  */
#define P7_QUOTES (-3)
#define P7_QUOTE (-4)
#define P7_NEWLINE (-5)

/* Convert an UTF-16 or UTF-32 code point to a return value that can be
   distinguished from a single-byte return value.  */
#define UNICODE(code) (0x100 + (code))

/* Test a return value of phase7_getc whether it designates an UTF-16 or
   UTF-32 code point.  */
#define IS_UNICODE(p7_result) ((p7_result) >= 0x100)

/* Extract the UTF-16 or UTF-32 code of a return value that satisfies
   IS_UNICODE.  */
#define UNICODE_VALUE(p7_result) ((p7_result) - 0x100)
410
411
/* 7. Replace escape sequences within character strings with their
   single character equivalents.

   Read one character of a string literal or character constant.
   Return value:
     - P7_EOF at the end of the input,
     - P7_NEWLINE, P7_QUOTES or P7_QUOTE for an unescaped newline,
       double quote or single quote,
     - UNICODE (code) for a \uXXXX escape sequence,
     - otherwise a byte value: either the character itself or the value
       denoted by an escape sequence.
   An escape sequence that is not recognized is left in the input; only
   its backslash is returned.  */
static int
phase7_getc ()
{
  int c, j;

  /* Use phase 1, because phase 2 elides comments.  */
  c = phase1_getc ();

  if (c == EOF)
    return P7_EOF;

  /* Return a magic newline indicator, so that we can distinguish
     between the user requesting a newline in the string (e.g. using
     "\n" or "\012") from the user failing to terminate the string or
     character constant.  The ANSI C standard says: 3.1.3.4 Character
     Constants contain "any character except single quote, backslash or
     newline; or an escape sequence" and 3.1.4 String Literals contain
     "any character except double quote, backslash or newline; or an
     escape sequence".

     Most compilers give a fatal error in this case, however gcc is
     stupidly silent, even though this is a very common typo.  OK, so
     "gcc --pedantic" will tell me, but that gripes about too much other
     stuff.  Could I have a "gcc -Wnewline-in-string" option, or
     better yet a "gcc -fno-newline-in-string" option, please?  Gcc is
     also inconsistent between string literals and character constants:
     you may not embed newlines in character constants; try it, you get
     a useful diagnostic.  --PMiller  */
  if (c == '\n')
    return P7_NEWLINE;

  if (c == '"')
    return P7_QUOTES;
  if (c == '\'')
    return P7_QUOTE;
  if (c != '\\')
    return c;
  /* C was a backslash: decode the escape sequence.  */
  c = phase1_getc ();
  switch (c)
    {
    default:
      /* Unknown escape sequences really should be an error, but just
         ignore them, and let the real compiler complain.  */
      phase1_ungetc (c);
      return '\\';

    case '"':
    case '\'':
    case '\\':
    case '$':
      /* The escaped character stands for itself.  */
      return c;

    case 'b':
      return '\b';

    case 'f':
      return '\f';
    case 'n':
      return '\n';
    case 'r':
      return '\r';
    case 't':
      return '\t';
    case 'v':
      return '\v';

    case 'x':
      /* Hexadecimal escape sequence.  It needs at least one hexadecimal
         digit; otherwise everything after the backslash is pushed
         back.  */
      c = phase1_getc ();
      switch (c)
        {
        default:
          phase1_ungetc (c);
          phase1_ungetc ('x');
          return '\\';

        case '0': case '1': case '2': case '3': case '4':
        case '5': case '6': case '7': case '8': case '9':
        case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
        case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
          break;
        }
      {
        int n;
        bool overflow;

        n = 0;
        overflow = false;

        /* Consume all following hexadecimal digits.  Digits that would
           make the value exceed 0xFF are dropped and reported once the
           sequence ends.  */
        for (;;)
          {
            switch (c)
              {
              default:
                /* First character after the escape sequence.  */
                phase1_ungetc (c);
                if (overflow)
                  {
                    error_with_progname = false;
                    error (0, 0, _("%s:%d: warning: hexadecimal escape sequence out of range"),
                           logical_file_name, line_number);
                    error_with_progname = true;
                  }
                return n;

              case '0': case '1': case '2': case '3': case '4':
              case '5': case '6': case '7': case '8': case '9':
                if (n < 0x100 / 16)
                  n = n * 16 + c - '0';
                else
                  overflow = true;
                break;

              case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
                if (n < 0x100 / 16)
                  n = n * 16 + 10 + c - 'A';
                else
                  overflow = true;
                break;

              case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
                if (n < 0x100 / 16)
                  n = n * 16 + 10 + c - 'a';
                else
                  overflow = true;
                break;
              }
            c = phase1_getc ();
          }
      }

    case '0':
      /* Octal escape sequence.  Only a sequence starting with the digit
         0 is recognized; at most three octal digits, including that
         leading 0, are consumed.  */
      {
        int n;

        n = 0;
        for (j = 0; j < 3; ++j)
          {
            n = n * 8 + c - '0';
            c = phase1_getc ();
            switch (c)
              {
              default:
                break;

              case '0': case '1': case '2': case '3':
              case '4': case '5': case '6': case '7':
                continue;
              }
            break;
          }
        /* C is the first character that is not part of the escape
           sequence.  */
        phase1_ungetc (c);
        return n;
      }

    case 'u':
      /* Unicode escape sequence with exactly four hexadecimal
         digits.  */
      {
        /* The digits read so far, kept so that they can be pushed
           back.  */
        unsigned char buf[8];
        int n;

        n = 0;
        for (j = 0; j < 4; j++)
          {
            int c1 = phase1_getc ();

            if (c1 >= '0' && c1 <= '9')
              n = (n << 4) + (c1 - '0');
            else if (c1 >= 'A' && c1 <= 'F')
              n = (n << 4) + (c1 - 'A' + 10);
            else if (c1 >= 'a' && c1 <= 'f')
              n = (n << 4) + (c1 - 'a' + 10);
            else
              {
                /* Too few digits.  Push back everything that follows
                   the backslash, last character read first, so that it
                   is read again in the original order.  */
                phase1_ungetc (c1);
                while (--j >= 0)
                  phase1_ungetc (buf[j]);
                phase1_ungetc (c);
                return '\\';
              }

            buf[j] = c1;
          }

        /* Four hexadecimal digits give a value of at most 0xFFFF, so
           this test always succeeds; the code after it is only a
           safeguard.  */
        if (n < 0x110000)
          return UNICODE (n);

        error_with_progname = false;
        error (0, 0, _("%s:%d: warning: invalid Unicode character"),
               logical_file_name, line_number);
        error_with_progname = true;

        while (--j >= 0)
          phase1_ungetc (buf[j]);
        phase1_ungetc (c);
        return '\\';
      }
    }
}
608
609
/* Push back the character C so that it is read again.  Like
   phase7_getc, this works on the phase 1 pushback stack, therefore C
   must be a real character (or EOF), not one of the P7_* codes.  */
static void
phase7_ungetc (int c)
{
  phase1_ungetc (c);
}
615
616
/* 3. Parse each resulting logical line as preprocessing tokens and
   white space.  Preprocessing tokens and Vala tokens don't always
   match.  */

/* Tokens pushed back by phase3_unget.  The most recently pushed token
   is returned first by phase3_get.  */
static token_ty phase3_pushback[2];
/* Number of tokens currently stored in phase3_pushback.  */
static int phase3_pushback_length;


/* Type of the token most recently returned by phase3_get.  It is used
   to decide whether a '/' starts a regular expression literal.  */
static token_type_ty last_token_type;
626
627 static void
628 phase3_scan_regex ()
629 {
630 int c;
631
632 for (;;)
633 {
634 c = phase1_getc ();
635 if (c == '/')
636 break;
637 if (c == '\\')
638 {
639 c = phase1_getc ();
640 if (c != EOF)
641 continue;
642 }
643 if (c == EOF)
644 {
645 error_with_progname = false;
646 error (0, 0,
647 _("%s:%d: warning: regular expression literal terminated too early"),
648 logical_file_name, line_number);
649 error_with_progname = true;
650 return;
651 }
652 }
653
654 c = phase2_getc ();
655 if (!(c == 'i' || c == 's' || c == 'm' || c == 'x'))
656 phase2_ungetc (c);
657 }
658
659 static void
660 phase3_get (token_ty *tp)
661 {
662 static char *buffer;
663 static int bufmax;
664 int bufpos;
665
666 #undef APPEND
667 #define APPEND(c) \
668 do \
669 { \
670 if (bufpos >= bufmax) \
671 { \
672 bufmax = 2 * bufmax + 10; \
673 buffer = xrealloc (buffer, bufmax); \
674 } \
675 buffer[bufpos++] = c; \
676 } \
677 while (0)
678
679 if (phase3_pushback_length)
680 {
681 *tp = phase3_pushback[--phase3_pushback_length];
682 last_token_type = tp->type;
683 return;
684 }
685
686 for (;;)
687 {
688 bool template;
689 bool verbatim;
690 int c;
691
692 tp->line_number = line_number;
693 c = phase2_getc ();
694
695 switch (c)
696 {
697 case EOF:
698 tp->type = last_token_type = token_type_eof;
699 return;
700
701 case '\n':
702 if (last_non_comment_line > last_comment_line)
703 savable_comment_reset ();
704 FALLTHROUGH;
705 case ' ':
706 case '\f':
707 case '\t':
708 /* Ignore whitespace and comments. */
709 continue;
710 default:
711 break;
712 }
713
714 last_non_comment_line = tp->line_number;
715 template = false;
716 verbatim = false;
717
718 switch (c)
719 {
720 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
721 case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N':
722 case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U':
723 case 'V': case 'W': case 'X': case 'Y': case 'Z':
724 case '_':
725 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
726 case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
727 case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u':
728 case 'v': case 'w': case 'x': case 'y': case 'z':
729 bufpos = 0;
730 for (;;)
731 {
732 APPEND (c);
733 c = phase2_getc ();
734 switch (c)
735 {
736 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
737 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
738 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
739 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
740 case 'Y': case 'Z':
741 case '_':
742 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
743 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
744 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
745 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
746 case 'y': case 'z':
747 case '0': case '1': case '2': case '3': case '4':
748 case '5': case '6': case '7': case '8': case '9':
749 continue;
750
751 default:
752 phase2_ungetc (c);
753 break;
754 }
755 break;
756 }
757 APPEND (0);
758 if (strcmp (buffer, "return") == 0)
759 tp->type = last_token_type = token_type_return;
760 else
761 {
762 tp->string = xstrdup (buffer);
763 tp->type = last_token_type = token_type_symbol;
764 }
765 return;
766
767 case '.':
768 c = phase2_getc ();
769 phase2_ungetc (c);
770 switch (c)
771 {
772 default:
773 tp->string = xstrdup (".");
774 tp->type = last_token_type = token_type_symbol;
775 return;
776
777 case '0': case '1': case '2': case '3': case '4':
778 case '5': case '6': case '7': case '8': case '9':
779 c = '.';
780 break;
781 }
782 FALLTHROUGH;
783
784 case '0': case '1': case '2': case '3': case '4':
785 case '5': case '6': case '7': case '8': case '9':
786 /* The preprocessing number token is more "generous" than the C
787 number tokens. This is mostly due to token pasting (another
788 thing we can ignore here). */
789 bufpos = 0;
790 for (;;)
791 {
792 APPEND (c);
793 c = phase2_getc ();
794 switch (c)
795 {
796 case 'e':
797 case 'E':
798 APPEND (c);
799 c = phase2_getc ();
800 if (c != '+' && c != '-')
801 {
802 phase2_ungetc (c);
803 break;
804 }
805 continue;
806
807 case 'A': case 'B': case 'C': case 'D': case 'F':
808 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
809 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
810 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
811 case 'Y': case 'Z':
812 case 'a': case 'b': case 'c': case 'd': case 'f':
813 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
814 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
815 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
816 case 'y': case 'z':
817 case '0': case '1': case '2': case '3': case '4':
818 case '5': case '6': case '7': case '8': case '9':
819 case '.':
820 continue;
821
822 default:
823 phase2_ungetc (c);
824 break;
825 }
826 break;
827 }
828 APPEND (0);
829 tp->type = last_token_type = token_type_number;
830 return;
831
832 case '\'':
833 for (;;)
834 {
835 c = phase7_getc ();
836 if (c == P7_NEWLINE)
837 {
838 error_with_progname = false;
839 error (0, 0, _("%s:%d: warning: unterminated character constant"),
840 logical_file_name, line_number - 1);
841 error_with_progname = true;
842 phase7_ungetc ('\n');
843 break;
844 }
845 if (c == P7_EOF || c == P7_QUOTE)
846 break;
847 }
848 tp->type = last_token_type = token_type_character_constant;
849 return;
850
851 /* Vala provides strings in three different formats.
852
853 Usual string literals:
854 "..."
855 Verbatim string literals:
856 """...""" (where ... can include newlines and double quotes)
857 String templates.
858 @"...", @"""..."""
859
860 Note that, with the current implementation string
861 templates are not subject to translation, because they are
862 inspected at compile time. For example, the following code
863
864 string bar = "bar";
865 string foo = _(@"foo $bar");
866
867 will be translated into the C code, like:
868
869 _(g_strconcat ("foo ", "bar", NULL)); */
870 case '@':
871 c = phase2_getc ();
872 if (c != '"')
873 {
874 phase2_ungetc (c);
875 tp->type = last_token_type = token_type_other;
876 return;
877 }
878 template = true;
879 FALLTHROUGH;
880 case '"':
881 {
882 struct mixed_string_buffer msb;
883 {
884 int c2 = phase1_getc ();
885
886 if (c2 == '"')
887 {
888 int c3 = phase1_getc ();
889 if (c3 == '"')
890 verbatim = true;
891 else
892 {
893 phase1_ungetc (c3);
894 phase1_ungetc (c2);
895 }
896 }
897 else
898 phase2_ungetc (c2);
899 }
900
901 /* Start accumulating the string. */
902 mixed_string_buffer_init (&msb, lc_string,
903 logical_file_name, line_number);
904 if (verbatim)
905 for (;;)
906 {
907 c = phase1_getc ();
908
909 /* Keep line_number in sync. */
910 msb.line_number = line_number;
911
912 if (c == '"')
913 {
914 int c2 = phase1_getc ();
915 if (c2 == '"')
916 {
917 int c3 = phase1_getc ();
918 if (c3 == '"')
919 break;
920 phase1_ungetc (c3);
921 }
922 phase1_ungetc (c2);
923 }
924 if (c == EOF)
925 break;
926 mixed_string_buffer_append_char (&msb, c);
927 }
928 else
929 for (;;)
930 {
931 c = phase7_getc ();
932
933 /* Keep line_number in sync. */
934 msb.line_number = line_number;
935
936 if (c == P7_NEWLINE)
937 {
938 error_with_progname = false;
939 error (0, 0,
940 _("%s:%d: warning: unterminated string literal"),
941 logical_file_name, line_number - 1);
942 error_with_progname = true;
943 phase7_ungetc ('\n');
944 break;
945 }
946 if (c == P7_QUOTES)
947 break;
948 if (c == P7_EOF)
949 break;
950 if (c == P7_QUOTE)
951 c = '\'';
952 if (IS_UNICODE (c))
953 {
954 assert (UNICODE_VALUE (c) >= 0
955 && UNICODE_VALUE (c) < 0x110000);
956 mixed_string_buffer_append_unicode (&msb,
957 UNICODE_VALUE (c));
958 }
959 else
960 mixed_string_buffer_append_char (&msb, c);
961 }
962 /* Done accumulating the string. */
963 if (template)
964 {
965 tp->type = token_type_string_template;
966 mixed_string_buffer_destroy (&msb);
967 }
968 else
969 {
970 tp->type = token_type_string_literal;
971 tp->mixed_string = mixed_string_buffer_result (&msb);
972 tp->comment = add_reference (savable_comment);
973 }
974 last_token_type = tp->type;
975 return;
976 }
977
978 case '/':
979 switch (last_token_type)
980 {
981 case token_type_lparen:
982 case token_type_lbrace:
983 case token_type_assign:
984 case token_type_return:
985 case token_type_plus:
986 case token_type_arithmetic_operator:
987 case token_type_equality_test_operator:
988 case token_type_logic_operator:
989 case token_type_comma:
990 case token_type_question:
991 case token_type_colon:
992 phase3_scan_regex ();
993 tp->type = last_token_type = token_type_regex_literal;
994 break;
995 default:
996 {
997 int c2 = phase2_getc ();
998 if (c2 == '=')
999 tp->type = last_token_type = token_type_assign;
1000 else
1001 {
1002 phase2_ungetc (c2);
1003 tp->type = last_token_type = token_type_arithmetic_operator;
1004 }
1005 break;
1006 }
1007 }
1008 return;
1009
1010 case '(':
1011 tp->type = last_token_type = token_type_lparen;
1012 return;
1013
1014 case ')':
1015 tp->type = last_token_type = token_type_rparen;
1016 return;
1017
1018 case '{':
1019 tp->type = last_token_type = token_type_lbrace;
1020 return;
1021
1022 case '}':
1023 tp->type = last_token_type = token_type_rbrace;
1024 return;
1025
1026 case '+':
1027 {
1028 int c2 = phase2_getc ();
1029 switch (c2)
1030 {
1031 case '+':
1032 tp->type = last_token_type = token_type_other;
1033 break;
1034 case '=':
1035 tp->type = last_token_type = token_type_assign;
1036 break;
1037 default:
1038 phase2_ungetc (c2);
1039 tp->type = last_token_type = token_type_plus;
1040 break;
1041 }
1042 return;
1043 }
1044
1045 case '-':
1046 {
1047 int c2 = phase2_getc ();
1048 switch (c2)
1049 {
1050 case '-':
1051 tp->type = last_token_type = token_type_other;
1052 break;
1053 case '=':
1054 tp->type = last_token_type = token_type_assign;
1055 break;
1056 default:
1057 phase2_ungetc (c2);
1058 tp->type = last_token_type = token_type_arithmetic_operator;
1059 break;
1060 }
1061 return;
1062 }
1063
1064 case '%':
1065 case '^':
1066 {
1067 int c2 = phase2_getc ();
1068 if (c2 == '=')
1069 tp->type = last_token_type = token_type_assign;
1070 else
1071 {
1072 phase2_ungetc (c2);
1073 tp->type = last_token_type = token_type_logic_operator;
1074 }
1075 return;
1076 }
1077
1078 case '=':
1079 {
1080 int c2 = phase2_getc ();
1081 switch (c2)
1082 {
1083 case '=':
1084 tp->type = last_token_type = token_type_equality_test_operator;
1085 break;
1086 case '>':
1087 tp->type = last_token_type = token_type_other;
1088 break;
1089 default:
1090 phase2_ungetc (c2);
1091 tp->type = last_token_type = token_type_assign;
1092 break;
1093 }
1094 return;
1095 }
1096
1097 case '!':
1098 {
1099 int c2 = phase2_getc ();
1100 if (c2 == '=')
1101 tp->type = last_token_type = token_type_equality_test_operator;
1102 else
1103 {
1104 phase2_ungetc (c2);
1105 tp->type = last_token_type = token_type_logic_operator;
1106 }
1107 return;
1108 }
1109
1110 case '>':
1111 case '<':
1112 {
1113 int c2 = phase2_getc ();
1114 if (c2 == '=')
1115 tp->type = last_token_type = token_type_equality_test_operator;
1116 else if (c2 == c)
1117 {
1118 int c3 = phase2_getc ();
1119 if (c3 == '=')
1120 tp->type = last_token_type = token_type_assign;
1121 else
1122 {
1123 phase2_ungetc (c2);
1124 phase2_ungetc (c3);
1125 tp->type = last_token_type = token_type_other;
1126 }
1127 }
1128 else
1129 {
1130 phase2_ungetc (c2);
1131 tp->type = last_token_type = token_type_equality_test_operator;
1132 }
1133 return;
1134 }
1135
1136 case ',':
1137 tp->type = last_token_type = token_type_comma;
1138 return;
1139
1140 case ':':
1141 tp->type = last_token_type = token_type_colon;
1142 return;
1143
1144 case '&':
1145 case '|':
1146 {
1147 int c2 = phase2_getc ();
1148 if (c2 == c)
1149 tp->type = last_token_type = token_type_logic_operator;
1150 else if (c2 == '=')
1151 tp->type = last_token_type = token_type_assign;
1152 else
1153 {
1154 phase2_ungetc (c2);
1155 tp->type = last_token_type = token_type_arithmetic_operator;
1156 }
1157 return;
1158 }
1159
1160 case '?':
1161 {
1162 int c2 = phase2_getc ();
1163 if (c2 == '?')
1164 tp->type = last_token_type = token_type_logic_operator;
1165 else
1166 {
1167 phase2_ungetc (c2);
1168 tp->type = last_token_type = token_type_question;
1169 }
1170 return;
1171 }
1172
1173 default:
1174 tp->type = last_token_type = token_type_other;
1175 return;
1176 }
1177 }
1178 #undef APPEND
1179 }
1180
1181 static void
1182 phase3_unget (token_ty *tp)
1183 {
1184 if (tp->type != token_type_eof)
1185 {
1186 if (phase3_pushback_length == SIZEOF (phase3_pushback))
1187 abort ();
1188 phase3_pushback[phase3_pushback_length++] = *tp;
1189 }
1190 }
1191
1192
1193 /* String concatenation with '+'. */
1194
1195 static void
1196 x_vala_lex (token_ty *tp)
1197 {
1198 phase3_get (tp);
1199 if (tp->type == token_type_string_literal)
1200 {
1201 mixed_string_ty *sum = tp->mixed_string;
1202
1203 for (;;)
1204 {
1205 token_ty token2;
1206
1207 phase3_get (&token2);
1208 if (token2.type == token_type_plus)
1209 {
1210 token_ty token3;
1211
1212 phase3_get (&token3);
1213 if (token3.type == token_type_string_literal)
1214 {
1215 sum = mixed_string_concat_free1 (sum, token3.mixed_string);
1216
1217 free_token (&token3);
1218 free_token (&token2);
1219 continue;
1220 }
1221 phase3_unget (&token3);
1222 }
1223 phase3_unget (&token2);
1224 break;
1225 }
1226 tp->mixed_string = sum;
1227 }
1228 }
1229
1230
1231 /* ========================= Extracting strings. ========================== */
1232
1233
/* Context lookup table.  Set by extract_vala for the file being
   processed and queried by extract_balanced for each symbol.  */
static flag_context_list_table_ty *flag_context_list_table;


/* Maximum supported nesting depth.  Exceeding it is a fatal error (see
   extract_balanced).  */
#define MAX_NESTING_DEPTH 1000

/* Current nesting depth, i.e. the number of currently open
   parentheses.  */
static int nesting_depth;
1243
1244
/* The file is broken into tokens.  Scan the token stream, looking for
   a keyword, followed by a left paren, followed by a string.  When we
   see this sequence, we have something to remember.  We assume we are
   looking at a valid Vala program, and leave the complaints about the
   grammar to the compiler.

     Normal handling: Look for
       keyword ( ... msgid ... )
       keyword msgid
     Plural handling: Look for
       keyword ( ... msgid ... msgid_plural ... )

   We use recursion because the arguments before msgid or between msgid
   and msgid_plural can contain subexpressions of the same form.  */

/* Extract messages until the next balanced closing parenthesis.
   Extracted messages are added to MLP.
   DELIM is token_type_rparen for a nested call and token_type_eof for
   the top-level call; with either value a closing parenthesis ends the
   scan.
   OUTER_CONTEXT and CONTEXT_ITER describe the context of the enclosing
   argument list.  ARGPARSER collects the arguments of the enclosing
   call; it is passed to arglist_parser_done before returning.
   Return true upon eof, false upon closing parenthesis.  */
static bool
extract_balanced (message_list_ty *mlp, token_type_ty delim,
                  flag_context_ty outer_context,
                  flag_context_list_iterator_ty context_iter,
                  struct arglist_parser *argparser)
{
  /* Current argument number.  */
  int arg = 1;
  /* 0 when no keyword has been seen.  1 right after a keyword is seen.  */
  int state;
  /* Parameters of the keyword just seen.  Defined only in state 1.  */
  const struct callshapes *next_shapes = NULL;
  /* Context iterator that will be used if the next token is a '('.  */
  flag_context_list_iterator_ty next_context_iter =
    passthrough_context_list_iterator;
  /* Current context.  */
  flag_context_ty inner_context =
    inherited_context (outer_context,
                       flag_context_list_iterator_advance (&context_iter));

  /* Start state is 0.  */
  state = 0;

  for (;;)
    {
      token_ty token;

      x_vala_lex (&token);

      switch (token.type)
        {
        case token_type_symbol:
          {
            void *keyword_value;

            /* Remember whether the symbol is one of the keywords.  */
            if (hash_find_entry (&keywords, token.string, strlen (token.string),
                                 &keyword_value)
                == 0)
              {
                next_shapes = (const struct callshapes *) keyword_value;
                state = 1;
              }
            else
              state = 0;
          }
          /* Look up the flags that apply to the arguments of this
             symbol, in case an argument list follows.  */
          next_context_iter =
            flag_context_list_iterator (
              flag_context_list_table_lookup (
                flag_context_list_table,
                token.string, strlen (token.string)));
          free (token.string);
          continue;

        case token_type_lparen:
          if (++nesting_depth > MAX_NESTING_DEPTH)
            {
              error_with_progname = false;
              error (EXIT_FAILURE, 0, _("%s:%d: error: too many open parentheses"),
                     logical_file_name, line_number);
            }
          /* Process the parenthesized list recursively.  It gets a new
             argument parser, which knows the call shapes only if the
             list directly follows a keyword.  */
          if (extract_balanced (mlp, token_type_rparen,
                                inner_context, next_context_iter,
                                arglist_parser_alloc (mlp,
                                                      state ? next_shapes : NULL)))
            {
              /* The end of the input was reached inside the list.  */
              arglist_parser_done (argparser, arg);
              return true;
            }
          nesting_depth--;
          next_context_iter = null_context_list_iterator;
          state = 0;
          break;

        case token_type_rparen:
          if (delim == token_type_rparen || delim == token_type_eof)
            {
              arglist_parser_done (argparser, arg);
              return false;
            }

          next_context_iter = null_context_list_iterator;
          state = 0;
          continue;

        case token_type_comma:
          /* Proceed to the next argument and its context.  */
          arg++;
          inner_context =
            inherited_context (outer_context,
                               flag_context_list_iterator_advance (
                                 &context_iter));
          next_context_iter = passthrough_context_list_iterator;
          state = 0;
          continue;

        case token_type_eof:
          arglist_parser_done (argparser, arg);
          return true;

        case token_type_string_literal:
          {
            lex_pos_ty pos;

            pos.file_name = logical_file_name;
            pos.line_number = token.line_number;

            if (extract_all)
              {
                /* Every string is a message, whatever its position.
                   NOTE(review): remember_a_message presumably takes
                   over the ownership of STRING -- confirm.  */
                char *string = mixed_string_contents (token.mixed_string);
                mixed_string_free (token.mixed_string);
                remember_a_message (mlp, NULL, string, true, false,
                                    inner_context, &pos,
                                    NULL, token.comment, false);
              }
            else
              {
                /* A string immediately after a symbol means a function call.  */
                if (state)
                  {
                    /* Treat the string as the only argument of a call
                       of the keyword, using a temporary argument
                       parser.  */
                    struct arglist_parser *tmp_argparser;
                    tmp_argparser = arglist_parser_alloc (mlp, next_shapes);

                    arglist_parser_remember (tmp_argparser, 1,
                                             token.mixed_string, inner_context,
                                             pos.file_name, pos.line_number,
                                             token.comment, false);
                    arglist_parser_done (tmp_argparser, 1);
                  }
                else
                  /* The string is argument number ARG of the enclosing
                     call.  */
                  arglist_parser_remember (argparser, arg,
                                           token.mixed_string, inner_context,
                                           pos.file_name, pos.line_number,
                                           token.comment, false);
              }
          }
          drop_reference (token.comment);
          next_context_iter = null_context_list_iterator;
          state = 0;
          continue;

        case token_type_character_constant:
        case token_type_lbrace:
        case token_type_rbrace:
        case token_type_assign:
        case token_type_return:
        case token_type_plus:
        case token_type_arithmetic_operator:
        case token_type_equality_test_operator:
        case token_type_logic_operator:
        case token_type_question:
        case token_type_colon:
        case token_type_number:
        case token_type_string_template:
        case token_type_regex_literal:
        case token_type_other:
          /* Any other token ends a possible keyword context.  */
          next_context_iter = null_context_list_iterator;
          state = 0;
          continue;

        default:
          abort ();
        }
    }
}
1428
1429 void
1430 extract_vala (FILE *f,
1431 const char *real_filename, const char *logical_filename,
1432 flag_context_list_table_ty *flag_table,
1433 msgdomain_list_ty *mdlp)
1434 {
1435 message_list_ty *mlp = mdlp->item[0]->messages;
1436
1437 fp = f;
1438 real_file_name = real_filename;
1439 logical_file_name = xstrdup (logical_filename);
1440 line_number = 1;
1441
1442 phase1_pushback_length = 0;
1443
1444 last_comment_line = -1;
1445 last_non_comment_line = -1;
1446
1447 phase3_pushback_length = 0;
1448 last_token_type = token_type_other;
1449
1450 flag_context_list_table = flag_table;
1451 nesting_depth = 0;
1452
1453 init_keywords ();
1454
1455 /* Eat tokens until eof is seen. When extract_parenthesized returns
1456 due to an unbalanced closing parenthesis, just restart it. */
1457 while (!extract_balanced (mlp, token_type_eof,
1458 null_context, null_context_list_iterator,
1459 arglist_parser_alloc (mlp, NULL)))
1460 ;
1461
1462 fp = NULL;
1463 real_file_name = NULL;
1464 logical_file_name = NULL;
1465 line_number = 0;
1466 }