1 /* xgettext Lua backend.
2 Copyright (C) 2012-2013, 2016, 2018-2023 Free Software Foundation, Inc.
3
4 This file was written by Ľubomír Remák <lubomirr@lubomirr.eu>, 2012.
5
6 This program is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
10
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with this program. If not, see <https://www.gnu.org/licenses/>. */
18
19 #ifdef HAVE_CONFIG_H
20 #include "config.h"
21 #endif
22
23 /* Specification. */
24 #include "x-lua.h"
25
26 #include <errno.h>
27 #include <stdbool.h>
28 #include <stdio.h>
29 #include <stdlib.h>
30
31 #include "attribute.h"
32 #include "message.h"
33 #include "rc-str-list.h"
34 #include "xgettext.h"
35 #include "xg-pos.h"
36 #include "xg-mixed-string.h"
37 #include "xg-arglist-context.h"
38 #include "xg-arglist-callshape.h"
39 #include "xg-arglist-parser.h"
40 #include "xg-message.h"
41 #include "error.h"
42 #include "error-progname.h"
43 #include "xalloc.h"
44 #include "gettext.h"
45 #include "po-charset.h"
46
/* Shorthand for looking up the translation of a message of this program.  */
#define _(s) gettext(s)

/* Number of elements of the array A.
   A must be a real array (not a pointer).  The argument is parenthesized
   so that any array-valued expression can be passed.  */
#define SIZEOF(a) (sizeof (a) / sizeof ((a)[0]))
50
51 /* The Lua syntax is defined in the Lua manual sections 3.1 and 9,
52 which can be found at
53 https://www.lua.org/manual/5.2/manual.html#3.1
54 https://www.lua.org/manual/5.2/manual.html#9 */
55
/* If true extract all strings.  Set by x_lua_extract_all ().  */
static bool extract_all = false;

/* A hash table for keywords.  Filled by x_lua_keyword () and consulted by
   extract_balanced () for every symbol token.  */
static hash_table keywords;
/* True as long as the built-in keywords still have to be installed.
   Cleared by x_lua_keyword (NULL) and by init_keywords ().  */
static bool default_keywords = true;
62
/* Set extract_all flag (gettext will extract all strings).
   With the flag set, extract_balanced () passes every string token to
   remember_a_message () instead of going through an argument list
   parser.  */
void
x_lua_extract_all ()
{
  extract_all = true;
}
69
70 /* Adds a keyword. Copied from other lexers. */
71 void
72 x_lua_keyword (const char *name)
73 {
74 if (name == NULL)
75 default_keywords = false;
76 else
77 {
78 const char *end;
79 struct callshape shape;
80 const char *colon;
81
82 if (keywords.table == NULL)
83 hash_init (&keywords, 100);
84
85 split_keywordspec (name, &end, &shape);
86
87 /* The characters between name and end should form a valid C identifier.
88 A colon means an invalid parse in split_keywordspec(). */
89 colon = strchr (name, ':');
90 if (colon == NULL || colon >= end)
91 insert_keyword_callshape (&keywords, name, end - name, &shape);
92 }
93 }
94
95 /* Finish initializing the keywords hash table.
96 Called after argument processing, before each file is processed. */
97 static void
98 init_keywords ()
99 {
100 if (default_keywords)
101 {
102 /* When adding new keywords here, also update the documentation in
103 xgettext.texi! */
104 x_lua_keyword ("_");
105 x_lua_keyword ("gettext.gettext");
106 x_lua_keyword ("gettext.dgettext:2");
107 x_lua_keyword ("gettext.dcgettext:2");
108 x_lua_keyword ("gettext.ngettext:1,2");
109 x_lua_keyword ("gettext.dngettext:2,3");
110 x_lua_keyword ("gettext.dcngettext:2,3");
111 default_keywords = false;
112 }
113 }
114
/* Record the format string flags of the built-in Lua functions by passing
   each specification to xgettext_record_flag ().  */
void
init_flag_table_lua ()
{
  static const char *const flag_specs[] =
    {
      "_:1:pass-lua-format",
      "gettext.gettext:1:pass-lua-format",
      "gettext.dgettext:2:pass-lua-format",
      "gettext.dcgettext:2:pass-lua-format",
      "gettext.ngettext:1:pass-lua-format",
      "gettext.ngettext:2:pass-lua-format",
      "gettext.dngettext:2:pass-lua-format",
      "gettext.dngettext:3:pass-lua-format",
      "gettext.dcngettext:2:pass-lua-format",
      "gettext.dcngettext:3:pass-lua-format",
      "string.format:1:lua-format"
    };
  size_t i;

  for (i = 0; i < sizeof flag_specs / sizeof flag_specs[0]; i++)
    xgettext_record_flag (flag_specs[i]);
}
130
131
132 /* ======================== Reading of characters. ======================== */
133
/* The input file stream.  Set by extract_lua () while a file is being
   processed and reset to NULL afterwards.  */
static FILE *fp;


/* 1. line_number handling.  */

/* Characters pushed back by phase1_ungetc ().  phase1_getc () returns the
   most recently pushed character first.  */
static unsigned char phase1_pushback[2];
static int phase1_pushback_length;

/* True until phase1_getc () has read the first character from fp.  Used
   to skip a first line that starts with '#'.  */
static bool first_character;
144
145 static int
146 phase1_getc ()
147 {
148 int c;
149
150 if (phase1_pushback_length)
151 c = phase1_pushback[--phase1_pushback_length];
152 else
153 {
154 c = getc (fp);
155
156 if (first_character)
157 {
158 first_character = false;
159
160 /* Ignore shebang line. No pushback required in this case. */
161 if (c == '#')
162 {
163 while (c != '\n' && c != EOF)
164 c = getc (fp);
165 if (c == '\n')
166 {
167 line_number++;
168 c = getc (fp);
169 }
170 }
171 }
172
173 if (c == EOF)
174 {
175 if (ferror (fp))
176 error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
177 real_file_name);
178 return EOF;
179 }
180 }
181
182 if (c == '\n')
183 line_number++;
184
185 return c;
186 }
187
188 /* Supports 2 characters of pushback. */
189
190 static void
191 phase1_ungetc (int c)
192 {
193 if (c != EOF)
194 {
195 if (c == '\n')
196 --line_number;
197
198 if (phase1_pushback_length == SIZEOF (phase1_pushback))
199 abort ();
200 phase1_pushback[phase1_pushback_length++] = c;
201 }
202 }
203
204
/* These are for tracking whether comments count as immediately before
   keyword.
   last_comment_line is updated by phase2_getc () for each comment.
   NOTE(review): in the visible source last_non_comment_line is only ever
   assigned -1 (in extract_lua ()), so the test
   'last_non_comment_line > last_comment_line' in phase3_get () cannot
   become true -- confirm whether an assignment is missing.  */
static int last_comment_line;
static int last_non_comment_line;

/* Accumulating comments.  */

/* Buffer holding the current comment line: BUFLEN bytes are in use,
   BUFMAX bytes are allocated.  */
static char *buffer;
static size_t bufmax;
static size_t buflen;
215
/* Start accumulating a new comment line: forget the buffered characters
   but keep the allocated buffer.  */
static inline void
comment_start ()
{
  buflen = 0;
}
221
222 static inline void
223 comment_add (int c)
224 {
225 if (buflen >= bufmax)
226 {
227 bufmax = 2 * bufmax + 10;
228 buffer = xrealloc (buffer, bufmax);
229 }
230 buffer[buflen++] = c;
231 }
232
233 static inline void
234 comment_line_end (size_t chars_to_remove)
235 {
236 buflen -= chars_to_remove;
237 while (buflen >= 1
238 && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
239 --buflen;
240 if (chars_to_remove == 0 && buflen >= bufmax)
241 {
242 bufmax = 2 * bufmax + 10;
243 buffer = xrealloc (buffer, bufmax);
244 }
245 buffer[buflen] = '\0';
246 savable_comment_add (buffer);
247 }
248
249 /* Eats characters until '\n' and adds them to the comment. */
250 static void
251 eat_comment_line ()
252 {
253 for (;;)
254 {
255 int c = phase1_getc ();
256 if (c == '\n' || c == EOF)
257 {
258 comment_line_end (0);
259 break;
260 }
261
262 if (!(buflen == 0 && (c == ' ' || c == '\t')))
263 comment_add (c);
264 }
265 }
266
267 static int
268 phase2_getc ()
269 {
270 int c;
271 int lineno;
272
273 c = phase1_getc ();
274
275 if (c == '-')
276 {
277 c = phase1_getc ();
278
279 if (c == '-')
280 {
281 /* It starts with '--', so it must be either a short or a long
282 comment. */
283 c = phase1_getc ();
284
285 if (c == '[')
286 {
287 c = phase1_getc ();
288
289 int esigns = 0;
290 while (c == '=')
291 {
292 esigns++;
293 c = phase1_getc ();
294 }
295
296 if (c == '[')
297 {
298 /* Long comment. */
299 bool right_bracket = false;
300 bool end = false;
301 int esigns2 = 0;
302
303 lineno = line_number;
304 comment_start ();
305 while (!end)
306 {
307 c = phase1_getc ();
308
309 if (c == EOF)
310 break;
311
312 /* Ignore leading spaces and tabs. */
313 if (!(buflen == 0 && (c == ' ' || c == '\t')))
314 {
315 comment_add (c);
316
317 switch (c)
318 {
319 case ']':
320 if (!right_bracket)
321 {
322 right_bracket = true;
323 esigns2 = 0;
324 }
325 else
326 {
327 if (esigns2 == esigns)
328 {
329 comment_line_end (2 + esigns);
330 end = true;
331 }
332 }
333 break;
334
335 case '=':
336 if (right_bracket)
337 esigns2++;
338 break;
339
340 case '\n':
341 comment_line_end (1);
342 comment_start ();
343 lineno = line_number;
344 FALLTHROUGH;
345 default:
346 right_bracket = false;
347 }
348 }
349 }
350 last_comment_line = lineno;
351 return ' ';
352 }
353 else
354 {
355 /* One line (short) comment, starting with '--[=...='. */
356 lineno = last_comment_line;
357 comment_start ();
358 comment_add ('[');
359 while (esigns--)
360 comment_add ('=');
361 phase1_ungetc (c);
362 eat_comment_line ();
363 last_comment_line = lineno;
364 return '\n';
365 }
366 }
367 else
368 {
369 /* One line (short) comment. */
370 lineno = line_number;
371 comment_start ();
372 phase1_ungetc (c);
373 eat_comment_line ();
374 last_comment_line = lineno;
375 return '\n';
376 }
377 }
378 else
379 {
380 /* Minus sign. */
381 phase1_ungetc (c);
382 return '-';
383 }
384 }
385 else
386 return c;
387 }
388
389
390 /* ========================== Reading of tokens. ========================== */
391
/* The kinds of tokens produced by phase3_get ().  */
enum token_type_ty
{
  token_type_eof,
  token_type_lparen, /* ( */
  token_type_rparen, /* ) */
  token_type_lbracket, /* [ */
  token_type_rbracket, /* ] */
  token_type_comma, /* , */
  token_type_dot, /* . */
  token_type_doubledot, /* .. */
  token_type_operator1, /* + - * / % ^ # not */
  token_type_operator2, /* < > <= >= ~= == = and or */
  token_type_string, /* quoted string or long bracket string */
  token_type_number,
  token_type_symbol, /* identifier; dotted names are joined by phase5_get () */
  token_type_other /* ; and ... */
};

typedef enum token_type_ty token_type_ty;

typedef struct token_ty token_ty;
struct token_ty
{
  token_type_ty type;
  char *string; /* for token_type_string, token_type_symbol; freshly
                   allocated, released by free_token () */
  refcounted_string_list_ty *comment; /* for token_type_string */
  int line_number; /* value of line_number when scanning of the token began */
};
420
421 /* Free the memory pointed to by a 'struct token_ty'. */
422 static inline void
423 free_token (token_ty *tp)
424 {
425 if (tp->type == token_type_string || tp->type == token_type_symbol)
426 free (tp->string);
427 if (tp->type == token_type_string)
428 drop_reference (tp->comment);
429 }
430
/* Our current string.  Shared scratch buffer used by phase3_get () for
   string literals and identifiers: string_buf_length bytes are in use,
   string_buf_alloc bytes are allocated.  */
static int string_buf_length;
static int string_buf_alloc;
static char *string_buf;
435
/* Start a new string in string_buf; the allocation is kept.  */
static void
string_start ()
{
  string_buf_length = 0;
}
441
442 static void
443 string_add (int c)
444 {
445 if (string_buf_length >= string_buf_alloc)
446 {
447 string_buf_alloc = 2 * string_buf_alloc + 10;
448 string_buf = xrealloc (string_buf, string_buf_alloc);
449 }
450
451 string_buf[string_buf_length++] = c;
452 }
453
454 static void
455 string_end ()
456 {
457 if (string_buf_length >= string_buf_alloc)
458 {
459 string_buf_alloc = string_buf_alloc + 1;
460 string_buf = xrealloc (string_buf, string_buf_alloc);
461 }
462
463 string_buf[string_buf_length] = '\0';
464 }
465
466
/* We need 3 pushback tokens for string optimization: phase4_get () may
   read a '..', a string and the token after it before deciding not to
   concatenate, and then pushes all three back.  */
static int phase3_pushback_length;
static token_ty phase3_pushback[3];
470
471
472 static void
473 phase3_unget (token_ty *tp)
474 {
475 if (tp->type != token_type_eof)
476 {
477 if (phase3_pushback_length == SIZEOF (phase3_pushback))
478 abort ();
479 phase3_pushback[phase3_pushback_length++] = *tp;
480 }
481 }
482
483 static void
484 phase3_get (token_ty *tp)
485 {
486 int c;
487 int c2;
488 int c_start;
489
490 if (phase3_pushback_length)
491 {
492 *tp = phase3_pushback[--phase3_pushback_length];
493 return;
494 }
495
496 tp->string = NULL;
497
498 for (;;)
499 {
500 tp->line_number = line_number;
501 c = phase2_getc ();
502
503 switch (c)
504 {
505 case EOF:
506 tp->type = token_type_eof;
507 return;
508
509 case '\n':
510 if (last_non_comment_line > last_comment_line)
511 savable_comment_reset ();
512 FALLTHROUGH;
513 case ' ':
514 case '\t':
515 case '\f':
516 continue;
517
518 case '+':
519 case '-':
520 case '*':
521 case '/':
522 case '^':
523 case '%':
524 case '#':
525 tp->type = token_type_operator1;
526 return;
527 case '<':
528 case '>':
529 case '=':
530 c2 = phase1_getc ();
531 if (c2 != '=')
532 phase1_ungetc (c2);
533 tp->type = token_type_operator2;
534 return;
535 case '~':
536 c2 = phase1_getc ();
537 if (c2 == '=')
538 {
539 tp->type = token_type_operator2;
540 return;
541 }
542 else
543 phase1_ungetc (c2);
544 continue;
545 case '(':
546 tp->type = token_type_lparen;
547 return;
548 case ')':
549 tp->type = token_type_rparen;
550 return;
551 case ',':
552 tp->type = token_type_comma;
553 return;
554
555 case ';':
556 tp->type = token_type_other;
557 return;
558
559 /* There are three operators beginning with a dot. '.',
560 '..' and '...'. The most useful for us is the string
561 concatenation operator ('..'). */
562 case '.':
563 c = phase1_getc ();
564 if (c == '.')
565 {
566 c = phase1_getc ();
567 if (c == '.')
568 {
569 tp->type = token_type_other;
570 return;
571 }
572 else
573 {
574 phase1_ungetc (c);
575 tp->type = token_type_doubledot;
576 return;
577 }
578 }
579 else if (c >= '0' && c <= '9')
580 {
581 /* It's a number. We aren't interested in the actual
582 numeric value, so ignore the dot and let next
583 iteration eat the number. */
584 phase1_ungetc (c);
585 continue;
586 }
587 else
588 {
589 phase1_ungetc (c);
590 tp->type = token_type_dot;
591 return;
592 }
593
594 case '"':
595 case '\'':
596 c_start = c;
597 string_start ();
598
599 for (;;)
600 {
601 /* We need unprocessed characters from phase 1. */
602 c = phase1_getc ();
603
604 if (c == EOF || c == c_start || c == '\n')
605 {
606 /* End of string. */
607 string_end ();
608 tp->string = xstrdup (string_buf);
609 tp->comment = add_reference (savable_comment);
610 tp->type = token_type_string;
611 return;
612 }
613
614 /* We got '\', this is probably an escape sequence. */
615 if (c == '\\')
616 {
617 c = phase1_getc ();
618 switch (c)
619 {
620 case 'a':
621 string_add ('\a');
622 break;
623 case 'b':
624 string_add ('\b');
625 break;
626 case 'f':
627 string_add ('\f');
628 break;
629 case 'n':
630 string_add ('\n');
631 break;
632 case 'r':
633 string_add ('\r');
634 break;
635 case 't':
636 string_add ('\t');
637 break;
638 case 'v':
639 string_add ('\v');
640 break;
641 case 'x':
642 {
643 int num = 0;
644 int i = 0;
645
646 for (i = 0; i < 2; i++)
647 {
648 c = phase1_getc ();
649 if (c >= '0' && c <= '9')
650 num += c - '0';
651 else if (c >= 'a' && c <= 'f')
652 num += c - 'a' + 10;
653 else if (c >= 'A' && c <= 'F')
654 num += c - 'A' + 10;
655 else
656 {
657 phase1_ungetc (c);
658 break;
659 }
660
661 if (i == 0)
662 num *= 16;
663 }
664
665 if (i == 2)
666 string_add (num);
667 }
668
669 break;
670 case 'z':
671 /* Ignore the following whitespace. */
672 do
673 {
674 c = phase1_getc ();
675 }
676 while (c == ' ' || c == '\n' || c == '\t' || c == '\r'
677 || c == '\f' || c == '\v');
678
679 phase1_ungetc (c);
680
681 break;
682 default:
683 /* Check if it's a '\ddd' sequence. */
684 if (c >= '0' && c <= '9')
685 {
686 int num = 0;
687 int i = 0;
688
689 while (c >= '0' && c <= '9' && i < 3)
690 {
691 num *= 10;
692 num += (c - '0');
693 c = phase1_getc ();
694 i++;
695 }
696
697 /* The last read character is either a
698 non-number or another number after our
699 '\ddd' sequence. We need to ungetc it. */
700 phase1_ungetc (c);
701
702 /* The sequence number is too big, this
703 causes a lexical error. Ignore it. */
704 if (num < 256)
705 string_add (num);
706 }
707 else
708 string_add (c);
709 }
710 }
711 else
712 string_add (c);
713 }
714 break;
715
716 case '[':
717 c = phase1_getc ();
718
719 /* Count the number of equal signs. */
720 int esigns = 0;
721 while (c == '=')
722 {
723 esigns++;
724 c = phase1_getc ();
725 }
726
727 if (c != '[')
728 {
729 /* We did not find what we were looking for, ungetc it. */
730 phase1_ungetc (c);
731 if (esigns == 0)
732 {
733 /* Our current character isn't '[' and we got 0 equal
734 signs, so the first '[' must have been a left
735 bracket. */
736 tp->type = token_type_lbracket;
737 return;
738 }
739 else
740 /* Lexical error, ignore it. */
741 continue;
742 }
743
744 /* Found an opening long bracket. */
745 string_start ();
746
747 /* See if it is immediately followed by a newline. */
748 c = phase1_getc ();
749 if (c != '\n')
750 phase1_ungetc (c);
751
752 for (;;)
753 {
754 c = phase1_getc ();
755
756 if (c == EOF)
757 {
758 string_end ();
759 tp->string = xstrdup (string_buf);
760 tp->comment = add_reference (savable_comment);
761 tp->type = token_type_string;
762 return;
763 }
764 if (c == ']')
765 {
766 c = phase1_getc ();
767
768 /* Count the number of equal signs. */
769 int esigns2 = 0;
770 while (c == '=')
771 {
772 esigns2++;
773 c = phase1_getc ();
774 }
775
776 if (c == ']' && esigns == esigns2)
777 {
778 /* We got ']==...==]', where the number of equal
779 signs matches the number of equal signs in
780 the opening bracket. */
781 string_end ();
782 tp->string = xstrdup (string_buf);
783 tp->comment = add_reference (savable_comment);
784 tp->type = token_type_string;
785 return;
786 }
787 else
788 {
789 /* Otherwise we got either ']==' garbage or
790 ']==...==]' with a different number of equal
791 signs.
792
793 Add ']' and equal signs to the string, and
794 ungetc the current character, because the
795 second ']' might be a part of another closing
796 long bracket, e.g. '==]===]'. */
797 phase1_ungetc (c);
798
799 string_add (']');
800 while (esigns2--)
801 string_add ('=');
802 }
803 }
804 else
805 string_add (c);
806 }
807 break;
808
809 case ']':
810 tp->type = token_type_rbracket;
811 return;
812
813 default:
814 if (c >= '0' && c <= '9')
815 {
816 while (c >= '0' && c <= '9')
817 c = phase1_getc ();
818
819 if (c == '.')
820 {
821 c = phase1_getc ();
822 while (c >= '0' && c <= '9')
823 c = phase1_getc ();
824 }
825
826 if (c == 'e' || c == 'E')
827 {
828 if (c == '+' || c == '-')
829 c = phase1_getc ();
830 while (c >= '0' && c <= '9')
831 c = phase1_getc ();
832 }
833
834 phase1_ungetc (c);
835
836 tp->type = token_type_number;
837 return;
838 }
839 else if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
840 || c == '_')
841 {
842 string_start ();
843 while ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
844 || c == '_' || (c >= '0' && c <= '9'))
845 {
846 string_add (c);
847 c = phase1_getc ();
848 }
849 string_end ();
850 phase1_ungetc (c);
851
852 if (strcmp (string_buf, "not") == 0)
853 tp->type = token_type_operator1;
854 else if (strcmp (string_buf, "and") == 0)
855 tp->type = token_type_operator2;
856 else if (strcmp (string_buf, "or") == 0)
857 tp->type = token_type_operator2;
858 else
859 {
860 tp->string = xstrdup (string_buf);
861 tp->type = token_type_symbol;
862 }
863 return;
864 }
865 else
866 tp->type = token_type_other;
867 }
868 }
869 }
870
/* String and symbol concatenation.  */

/* Type of the token most recently returned by phase4_get ().  A string
   that follows certain token types is not merged with following strings,
   see phase4_get ().  */
static token_type_ty phase4_last;

/* We need 3 pushback tokens for string and symbol concatenation.  */
static int phase4_pushback_length;
static token_ty phase4_pushback[3];
878
879 static void
880 phase4_unget (token_ty *tp)
881 {
882 if (tp->type != token_type_eof)
883 {
884 if (phase4_pushback_length == SIZEOF (phase4_pushback))
885 abort ();
886 phase4_pushback[phase4_pushback_length++] = *tp;
887 }
888 }
889
/* 4. String concatenation.
   Store the next token in *TP, merging  "a" .. "b" .. "c"  into a single
   string token.  A pushed-back token is returned as is.
   No merging is attempted when the previous token was an operator1, a
   dot, a symbol, a '..' or a ')'.  A following  .. "string"  is also left
   alone when the token after that string is an operator1.
   phase4_last is updated to the type of the returned token.  */
static void
phase4_get (token_ty *tp)
{
  if (phase4_pushback_length)
    {
      *tp = phase4_pushback[--phase4_pushback_length];
      phase4_last = tp->type;
      return;
    }

  phase3_get (tp);
  if (tp->type == token_type_string
      && !(phase4_last == token_type_operator1
           || phase4_last == token_type_dot
           || phase4_last == token_type_symbol
           || phase4_last == token_type_doubledot
           || phase4_last == token_type_rparen))
    {
      /* SUM is the concatenation built so far; it takes over the storage
         of tp->string.  */
      char *sum = tp->string;
      size_t sum_len = strlen (sum);

      for (;;)
        {
          token_ty token2;

          phase3_get (&token2);
          if (token2.type == token_type_doubledot)
            {
              token_ty token3;

              phase3_get (&token3);
              if (token3.type == token_type_string)
                {
                  token_ty token_after;

                  phase3_get (&token_after);
                  if (token_after.type != token_type_operator1)
                    {
                      char *addend = token3.string;
                      size_t addend_len = strlen (addend);

                      /* Append, including the terminating NUL.  */
                      sum = (char *) xrealloc (sum, sum_len + addend_len + 1);
                      memcpy (sum + sum_len, addend, addend_len + 1);
                      sum_len += addend_len;

                      /* Only the lookahead token goes back; the '..' and
                         the appended string are consumed.  */
                      phase3_unget (&token_after);
                      free_token (&token3);
                      free_token (&token2);
                      continue;
                    }
                  phase3_unget (&token_after);
                }
              phase3_unget (&token3);
            }
          /* Tokens are pushed back in reverse order of reading, so that
             phase3_get () returns them in the original order.  */
          phase3_unget (&token2);
          break;
        }
      tp->string = sum;
    }
  phase4_last = tp->type;
}
951
952 static void
953 phase5_get (token_ty *tp)
954 {
955 phase4_get (tp);
956
957 /* Combine symbol1 . ... . symbolN to a single strings, so that
958 we can recognize function calls like
959 gettext.gettext. The information present for
960 symbolI.....symbolN has precedence over the information for
961 symbolJ.....symbolN with J > I. */
962 if (tp->type == token_type_symbol)
963 {
964 char *sum = tp->string;
965 size_t sum_len = strlen (sum);
966
967 for (;;)
968 {
969 token_ty token2;
970
971 phase4_get (&token2);
972 if (token2.type == token_type_dot)
973 {
974 token_ty token3;
975
976 phase4_get (&token3);
977 if (token3.type == token_type_symbol)
978 {
979 char *addend = token3.string;
980 size_t addend_len = strlen (addend);
981
982 sum = (char *) xrealloc (sum, sum_len + 1 + addend_len + 1);
983 sum[sum_len] = '.';
984 memcpy (sum + sum_len + 1, addend, addend_len + 1);
985 sum_len += 1 + addend_len;
986
987 free_token (&token2);
988 free_token (&token3);
989 continue;
990 }
991 phase4_unget (&token3);
992 }
993 phase4_unget (&token2);
994 break;
995 }
996 tp->string = sum;
997 }
998 }
999
/* Store the next token of the input in *TOK.  This is the entry point of
   the lexer used by extract_balanced (); it returns what the last phase,
   phase5_get (), produces.  */
static void
x_lua_lex (token_ty *tok)
{
  phase5_get (tok);
}
1005
1006
1007 /* ========================= Extracting strings. ========================== */
1008
1009
/* Context lookup table.  Set by extract_lua () from its flag_table
   argument and looked up by extract_balanced () for every symbol.  */
static flag_context_list_table_ty *flag_context_list_table;


/* Maximum supported nesting depth.  extract_balanced () reports an error
   with error (EXIT_FAILURE, ...) when more parentheses or more brackets
   than this are open at the same time.  */
#define MAX_NESTING_DEPTH 1000

/* Current nesting depths.  Reset to 0 by extract_lua ().  */
static int paren_nesting_depth;
static int bracket_nesting_depth;
1020
1021
/* The file is broken into tokens.  Scan the token stream, looking for
   a keyword, followed by a left paren, followed by a string.  When we
   see this sequence, we have something to remember.  We assume we are
   looking at a valid Lua program, and leave the complaints about the
   grammar to the compiler.

     Normal handling: Look for
       keyword ( ... msgid ... )
       keyword msgid
     Plural handling: Look for
       keyword ( ... msgid ... msgid_plural ... )

   We use recursion because the arguments before msgid or between msgid
   and msgid_plural can contain subexpressions of the same form.  */

/* Extract messages until the next balanced closing parenthesis or bracket.
   Extracted messages are added to MLP.
   DELIM can be either token_type_rparen or token_type_rbracket, or
   token_type_eof to accept both.
   OUTER_CONTEXT and CONTEXT_ITER determine the flag context of each
   argument.  ARGPARSER collects the string arguments seen at this nesting
   level; it is passed to arglist_parser_done () on every return path.
   Return true upon eof, false upon closing parenthesis or bracket.  */
static bool
extract_balanced (message_list_ty *mlp, token_type_ty delim,
                  flag_context_ty outer_context,
                  flag_context_list_iterator_ty context_iter,
                  struct arglist_parser *argparser)
{
  /* Current argument number.  */
  int arg = 1;
  /* 0 when no keyword has been seen.  1 right after a keyword is seen.  */
  int state;
  /* Parameters of the keyword just seen.  Defined only in state 1.  */
  const struct callshapes *next_shapes = NULL;
  /* Context iterator that will be used if the next token is a '('.  */
  flag_context_list_iterator_ty next_context_iter =
    passthrough_context_list_iterator;
  /* Current context.  */
  flag_context_ty inner_context =
    inherited_context (outer_context,
                       flag_context_list_iterator_advance (&context_iter));

  /* Start state is 0.  */
  state = 0;

  for (;;)
    {
      token_ty token;

      x_lua_lex (&token);

      switch (token.type)
        {
        case token_type_symbol:
          {
            void *keyword_value;

            /* A symbol that is a registered keyword arms state 1 for the
               '(' or string that may follow.  */
            if (hash_find_entry (&keywords, token.string, strlen (token.string),
                                 &keyword_value)
                == 0)
              {
                next_shapes = (const struct callshapes *) keyword_value;
                state = 1;
              }
            else
              state = 0;
          }
          next_context_iter =
            flag_context_list_iterator (
              flag_context_list_table_lookup (
                flag_context_list_table,
                token.string, strlen (token.string)));
          free (token.string);
          continue;

        case token_type_lparen:
          if (++paren_nesting_depth > MAX_NESTING_DEPTH)
            {
              error_with_progname = false;
              error (EXIT_FAILURE, 0, _("%s:%d: error: too many open parentheses"),
                     logical_file_name, line_number);
            }
          /* Process the parenthesized argument list recursively, with the
             call shapes of the keyword if one was just seen.  */
          if (extract_balanced (mlp, token_type_rparen,
                                inner_context, next_context_iter,
                                arglist_parser_alloc (mlp,
                                                      state ? next_shapes : NULL)))
            {
              arglist_parser_done (argparser, arg);
              return true;
            }
          paren_nesting_depth--;
          next_context_iter = null_context_list_iterator;
          state = 0;
          break;

        case token_type_rparen:
          if (delim == token_type_rparen || delim == token_type_eof)
            {
              arglist_parser_done (argparser, arg);
              return false;
            }

          /* A ')' while a ']' is expected is skipped.  */
          next_context_iter = null_context_list_iterator;
          state = 0;
          continue;

        case token_type_lbracket:
          if (++bracket_nesting_depth > MAX_NESTING_DEPTH)
            {
              error_with_progname = false;
              error (EXIT_FAILURE, 0, _("%s:%d: error: too many open brackets"),
                     logical_file_name, line_number);
            }
          /* The contents of brackets are scanned with a null context and
             without call shapes.  */
          if (extract_balanced (mlp, token_type_rbracket,
                                null_context, null_context_list_iterator,
                                arglist_parser_alloc (mlp, NULL)))
            {
              arglist_parser_done (argparser, arg);
              return true;
            }
          bracket_nesting_depth--;
          next_context_iter = null_context_list_iterator;
          state = 0;
          break;

        case token_type_rbracket:
          if (delim == token_type_rbracket || delim == token_type_eof)
            {
              arglist_parser_done (argparser, arg);
              return false;
            }

          /* A ']' while a ')' is expected is skipped.  */
          next_context_iter = null_context_list_iterator;
          state = 0;
          continue;

        case token_type_comma:
          /* Next argument: advance the argument number and its context.  */
          arg++;
          inner_context =
            inherited_context (outer_context,
                               flag_context_list_iterator_advance (
                                 &context_iter));
          next_context_iter = passthrough_context_list_iterator;
          state = 0;
          continue;

        case token_type_eof:
          arglist_parser_done (argparser, arg);
          return true;

        case token_type_string:
          {
            lex_pos_ty pos;
            pos.file_name = logical_file_name;
            pos.line_number = token.line_number;

            if (extract_all)
              /* NOTE(review): token.string is not freed on this path;
                 presumably remember_a_message () takes ownership --
                 confirm.  */
              remember_a_message (mlp, NULL, token.string, false, false,
                                  inner_context, &pos,
                                  NULL, token.comment, false);
            else
              {
                mixed_string_ty *ms =
                  mixed_string_alloc_simple (token.string, lc_string,
                                             pos.file_name, pos.line_number);
                free (token.string);
                /* A string immediately after a symbol means a function call.  */
                if (state)
                  {
                    /* The string is the only argument of that call, so it
                       gets an argument list parser of its own.  */
                    struct arglist_parser *tmp_argparser;
                    tmp_argparser = arglist_parser_alloc (mlp, next_shapes);

                    arglist_parser_remember (tmp_argparser, 1, ms,
                                             inner_context,
                                             pos.file_name, pos.line_number,
                                             token.comment, false);
                    arglist_parser_done (tmp_argparser, 1);
                  }
                else
                  arglist_parser_remember (argparser, arg, ms,
                                           inner_context,
                                           pos.file_name, pos.line_number,
                                           token.comment, false);
              }
          }
          drop_reference (token.comment);
          next_context_iter = null_context_list_iterator;
          state = 0;
          continue;

        case token_type_dot:
        case token_type_doubledot:
        case token_type_operator1:
        case token_type_operator2:
        case token_type_number:
        case token_type_other:
          /* Anything else ends a pending keyword.  */
          next_context_iter = null_context_list_iterator;
          state = 0;
          continue;

        default:
          abort ();
        }
    }
}
1225
1226 void
1227 extract_lua (FILE *f,
1228 const char *real_filename, const char *logical_filename,
1229 flag_context_list_table_ty *flag_table,
1230 msgdomain_list_ty *mdlp)
1231 {
1232 message_list_ty *mlp = mdlp->item[0]->messages;
1233
1234 fp = f;
1235 real_file_name = real_filename;
1236 logical_file_name = xstrdup (logical_filename);
1237 line_number = 1;
1238
1239 phase1_pushback_length = 0;
1240 first_character = true;
1241
1242 last_comment_line = -1;
1243 last_non_comment_line = -1;
1244
1245 phase3_pushback_length = 0;
1246
1247 phase4_last = token_type_eof;
1248 phase4_pushback_length = 0;
1249
1250 flag_context_list_table = flag_table;
1251 paren_nesting_depth = 0;
1252 bracket_nesting_depth = 0;
1253
1254 init_keywords ();
1255
1256 /* Eat tokens until eof is seen. When extract_parenthesized returns
1257 due to an unbalanced closing parenthesis, just restart it. */
1258 while (!extract_balanced (mlp, token_type_eof,
1259 null_context, null_context_list_iterator,
1260 arglist_parser_alloc (mlp, NULL)))
1261 ;
1262
1263 fp = NULL;
1264 real_file_name = NULL;
1265 logical_file_name = NULL;
1266 line_number = 0;
1267 }