1 /* xgettext sh backend.
2 Copyright (C) 2003, 2005-2009, 2014, 2018-2023 Free Software Foundation, Inc.
3 Written by Bruno Haible <bruno@clisp.org>, 2003.
4
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3 of the License, or
8 (at your option) any later version.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with this program. If not, see <https://www.gnu.org/licenses/>. */
17
18 #ifdef HAVE_CONFIG_H
19 # include "config.h"
20 #endif
21
22 /* Specification. */
23 #include "x-sh.h"
24
25 #include <errno.h>
26 #include <limits.h>
27 #include <stdbool.h>
28 #include <stdio.h>
29 #include <stdlib.h>
30 #include <string.h>
31
32 #include "attribute.h"
33 #include "message.h"
34 #include "xgettext.h"
35 #include "xg-pos.h"
36 #include "xg-mixed-string.h"
37 #include "xg-arglist-context.h"
38 #include "xg-arglist-callshape.h"
39 #include "xg-arglist-parser.h"
40 #include "xg-message.h"
41 #include "error.h"
42 #include "error-progname.h"
43 #include "xalloc.h"
44 #include "mem-hash-map.h"
45 #include "../../gettext-runtime/src/escapes.h"
46 #include "gettext.h"
47
/* Shorthand for looking up the translation of the message S.  */
#define _(s) gettext (s)

/* Number of elements of the array A.  A must be a true array, not a pointer.
   The argument is fully parenthesized, so that any array-valued expression
   can be passed without precedence surprises.  */
#define SIZEOF(a) (sizeof (a) / sizeof ((a)[0]))
51
52
53 /* The sh syntax is defined in POSIX:2001, see
54 http://www.opengroup.org/onlinepubs/007904975/utilities/xcu_chap02.html
55 Summary of sh syntax:
56 - Input is broken into words, which are then subject to
57 - tilde expansion ~...
58 - command substitution `...`
59 - variable substitution $var
60 - arithmetic substitution $((...))
61 - field splitting at whitespace (IFS)
62 - wildcard pattern expansion *?
63 - quote removal
64 - Strings are enclosed in "..."; command substitution, variable
65 substitution and arithmetic substitution are performed here as well.
66 - '...' is a string without substitutions.
67 - The list of resulting words is split into commands by semicolon and
68 newline.
69 - '#' at the beginning of a word introduces a comment until end of line.
70 The parser is implemented in bash-2.05b/parse.y. */
71
72
73 /* ====================== Keyword set customization. ====================== */
74
75 /* If true extract all strings. */
76 static bool extract_all = false;
77
78 static hash_table keywords;
79 static bool default_keywords = true;
80
81
82 void
83 x_sh_extract_all ()
84 {
85 extract_all = true;
86 }
87
88
89 void
90 x_sh_keyword (const char *name)
91 {
92 if (name == NULL)
93 default_keywords = false;
94 else
95 {
96 const char *end;
97 struct callshape shape;
98 const char *colon;
99
100 if (keywords.table == NULL)
101 hash_init (&keywords, 100);
102
103 split_keywordspec (name, &end, &shape);
104
105 /* The characters between name and end should form a valid C identifier.
106 A colon means an invalid parse in split_keywordspec(). */
107 colon = strchr (name, ':');
108 if (colon == NULL || colon >= end)
109 insert_keyword_callshape (&keywords, name, end - name, &shape);
110 }
111 }
112
113 /* Finish initializing the keywords hash table.
114 Called after argument processing, before each file is processed. */
115 static void
116 init_keywords ()
117 {
118 if (default_keywords)
119 {
120 /* When adding new keywords here, also update the documentation in
121 xgettext.texi! */
122 x_sh_keyword ("gettext");
123 x_sh_keyword ("ngettext:1,2");
124 /* Note: There is also special handling for 'gettext' and 'ngettext'
125 in read_command, below. */
126 x_sh_keyword ("eval_gettext");
127 x_sh_keyword ("eval_ngettext:1,2");
128 x_sh_keyword ("eval_pgettext:1c,2");
129 x_sh_keyword ("eval_npgettext:1c,2,3");
130 default_keywords = false;
131 }
132 }
133
/* Record the format string flags of the arguments of the built-in
   keywords, by passing each specification to xgettext_record_flag.  */
void
init_flag_table_sh ()
{
  static const char *const flag_specs[] =
    {
      "gettext:1:pass-sh-format",
      "ngettext:1:pass-sh-format",
      "ngettext:2:pass-sh-format",
      "eval_gettext:1:sh-format",
      "eval_ngettext:1:sh-format",
      "eval_ngettext:2:sh-format",
      "eval_pgettext:2:sh-format",
      "eval_npgettext:2:sh-format",
      "eval_npgettext:3:sh-format"
    };
  size_t i;

  for (i = 0; i < sizeof (flag_specs) / sizeof (flag_specs[0]); i++)
    xgettext_record_flag (flag_specs[i]);
}
147
148
149 /* ======================== Reading of characters. ======================== */
150
151 /* The input file stream. */
152 static FILE *fp;
153
154
155 /* Fetch the next character from the input file. */
156 static int
157 do_getc ()
158 {
159 int c = getc (fp);
160
161 if (c == EOF)
162 {
163 if (ferror (fp))
164 error (EXIT_FAILURE, errno,
165 _("error while reading \"%s\""), real_file_name);
166 }
167 else if (c == '\n')
168 line_number++;
169
170 return c;
171 }
172
173 /* Put back the last fetched character, not EOF. */
174 static void
175 do_ungetc (int c)
176 {
177 if (c == '\n')
178 line_number--;
179 ungetc (c, fp);
180 }
181
182
183 /* Remove backslash followed by newline from the input stream. */
184
185 static int phase1_pushback[2];
186 static int phase1_pushback_length;
187
188 static int
189 phase1_getc ()
190 {
191 int c;
192
193 if (phase1_pushback_length)
194 {
195 c = phase1_pushback[--phase1_pushback_length];
196 if (c == '\n')
197 ++line_number;
198 return c;
199 }
200 for (;;)
201 {
202 c = do_getc ();
203 if (c != '\\')
204 return c;
205 c = do_getc ();
206 if (c != '\n')
207 {
208 if (c != EOF)
209 do_ungetc (c);
210 return '\\';
211 }
212 }
213 }
214
215 /* Supports only one pushback character. */
216 static void
217 phase1_ungetc (int c)
218 {
219 switch (c)
220 {
221 case EOF:
222 break;
223
224 case '\n':
225 --line_number;
226 FALLTHROUGH;
227
228 default:
229 if (phase1_pushback_length == SIZEOF (phase1_pushback))
230 abort ();
231 phase1_pushback[phase1_pushback_length++] = c;
232 break;
233 }
234 }
235
236
237 /* ========================== Reading of tokens. ========================== */
238
239
/* A token is a growable sequence of characters.  */
struct token
{
  int allocated;        /* number of characters CHARS has room for */
  int charcount;        /* number of characters of CHARS in use */
  char *chars;          /* the token's constituents, not NUL terminated */
};
247
248 /* Initialize a 'struct token'. */
249 static inline void
250 init_token (struct token *tp)
251 {
252 tp->allocated = 10;
253 tp->chars = XNMALLOC (tp->allocated, char);
254 tp->charcount = 0;
255 }
256
257 /* Free the memory pointed to by a 'struct token'. */
258 static inline void
259 free_token (struct token *tp)
260 {
261 free (tp->chars);
262 }
263
264 /* Ensure there is enough room in the token for one more character. */
265 static inline void
266 grow_token (struct token *tp)
267 {
268 if (tp->charcount == tp->allocated)
269 {
270 tp->allocated *= 2;
271 tp->chars = (char *) xrealloc (tp->chars, tp->allocated * sizeof (char));
272 }
273 }
274
275 /* Convert a struct token * to a char*. */
276 static char *
277 string_of_token (const struct token *tp)
278 {
279 char *str;
280 int n;
281
282 n = tp->charcount;
283 str = XNMALLOC (n + 1, char);
284 memcpy (str, tp->chars, n);
285 str[n] = '\0';
286 return str;
287 }
288
289
290 /* ========================= Accumulating messages ========================= */
291
292
293 static message_list_ty *mlp;
294
295
296 /* ========================= Accumulating comments ========================= */
297
298
/* Buffer that accumulates the text of the current comment line:
   BUFFER has room for BUFMAX bytes, of which BUFLEN are in use.  */
static char *buffer;
static size_t bufmax;
static size_t buflen;

/* Start accumulating a new comment line.  */
static inline void
comment_start ()
{
  buflen = 0;
}
308
309 static inline void
310 comment_add (int c)
311 {
312 if (buflen >= bufmax)
313 {
314 bufmax = 2 * bufmax + 10;
315 buffer = xrealloc (buffer, bufmax);
316 }
317 buffer[buflen++] = c;
318 }
319
320 static inline void
321 comment_line_end ()
322 {
323 while (buflen >= 1
324 && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
325 --buflen;
326 if (buflen >= bufmax)
327 {
328 bufmax = 2 * bufmax + 10;
329 buffer = xrealloc (buffer, bufmax);
330 }
331 buffer[buflen] = '\0';
332 savable_comment_add (buffer);
333 }
334
335
/* These are for tracking whether comments count as immediately before
   keyword: the line of the most recent comment, and the line of the most
   recent word that is not a comment.  */
static int last_comment_line;
static int last_non_comment_line;
340
341
342 /* ========================= Debackslashification ========================== */
343
344 /* This state tracks the effect of backquotes, double-quotes and single-quotes
345 on the parsing of backslashes. We make a single pass through the input
346 file, keeping the state up to date. This is much faster than accumulating
347 strings and processing them with explicit debackslashification, like the
348 shell does it. */
349
/* The number of nested `...` or "`...`" constructs.
   It is used as a shift count on 'unsigned int' values, therefore it is
   assumed to be smaller than the number of bits of an 'unsigned int'.  */
static unsigned int nested_backquotes;

/* A bit mask indicating which of the currently open `...` or "`...`"
   constructs is with double-quotes: "`...`".
   A bit value of 1 stands for "`...`", a bit value of 0 stands for `...`.
   Bit position 0 designates the outermost backquotes nesting,
   bit position 1 the second-outermost backquotes nesting,
   ...
   bit position (nested_backquotes-1) the innermost backquotes nesting.  */
static unsigned int open_doublequotes_mask;

/* Whether a double-quote is currently open inside the innermost backquotes
   nesting.  */
static bool open_doublequote;

/* Whether a single-quote is currently open inside the innermost backquotes
   nesting.  */
static bool open_singlequote;

/* The expected terminator of the currently open single-quote.
   Usually '\'', but can be '"' for i18n-quotes.  */
static char open_singlequote_terminator;
373
374
375 /* Functions to update the state. */
376
377 static inline void
378 saw_opening_backquote ()
379 {
380 if (open_singlequote)
381 abort ();
382 if (open_doublequote)
383 open_doublequotes_mask |= (unsigned int) 1 << nested_backquotes;
384 nested_backquotes++;
385 open_doublequote = false;
386 }
387
388 static inline void
389 saw_closing_backquote ()
390 {
391 nested_backquotes--;
392 open_doublequote = (open_doublequotes_mask >> nested_backquotes) & 1;
393 open_doublequotes_mask &= ((unsigned int) 1 << nested_backquotes) - 1;
394 open_singlequote = false; /* just for safety */
395 }
396
397 static inline void
398 saw_opening_doublequote ()
399 {
400 if (open_singlequote || open_doublequote)
401 abort ();
402 open_doublequote = true;
403 }
404
405 static inline void
406 saw_closing_doublequote ()
407 {
408 if (open_singlequote || !open_doublequote)
409 abort ();
410 open_doublequote = false;
411 }
412
413 static inline void
414 saw_opening_singlequote ()
415 {
416 if (open_doublequote || open_singlequote)
417 abort ();
418 open_singlequote = true;
419 open_singlequote_terminator = '\'';
420 }
421
422 static inline void
423 saw_closing_singlequote ()
424 {
425 if (open_doublequote || !open_singlequote)
426 abort ();
427 open_singlequote = false;
428 }
429
430
431 /* ========================== Reading of commands ========================== */
432
/* We are only interested in constant strings.  Other words need not to be
   represented precisely.  */
enum word_type
{
  t_string,             /* constant string */
  t_assignment,         /* variable assignment */
  t_other,              /* other string */
  t_separator,          /* command separator: semicolon or newline */
  t_redirect,           /* redirection: one of < > >| << <<- >> <> <& >& */
  t_backquote,          /* closing '`' pseudo word */
  t_paren,              /* closing ')' pseudo word */
  t_eof                 /* EOF marker */
};
446
447 struct word
448 {
449 enum word_type type;
450 struct token *token; /* for t_string */
451 int line_number_at_start; /* for t_string */
452 };
453
454 /* Free the memory pointed to by a 'struct word'. */
455 static inline void
456 free_word (struct word *wp)
457 {
458 if (wp->type == t_string)
459 {
460 free_token (wp->token);
461 free (wp->token);
462 }
463 }
464
465 /* Convert a t_string token to a char*. */
466 static char *
467 string_of_word (const struct word *wp)
468 {
469 char *str;
470 int n;
471
472 if (!(wp->type == t_string))
473 abort ();
474 n = wp->token->charcount;
475 str = XNMALLOC (n + 1, char);
476 memcpy (str, wp->token->chars, n);
477 str[n] = '\0';
478 return str;
479 }
480
481 /* Convert a t_string token to a char*, ignoring the first OFFSET bytes. */
482 static char *
483 substring_of_word (const struct word *wp, size_t offset)
484 {
485 char *str;
486 int n;
487
488 if (!(wp->type == t_string))
489 abort ();
490 n = wp->token->charcount;
491 if (!(offset <= n))
492 abort ();
493 str = XNMALLOC (n - offset + 1, char);
494 memcpy (str, wp->token->chars + offset, n - offset);
495 str[n - offset] = '\0';
496 return str;
497 }
498
499
500 /* Whitespace recognition. */
501
/* Return true if C is an unquoted space, tab, or newline.  */
static inline bool
is_whitespace (int c)
{
  switch (c)
    {
    case ' ':
    case '\t':
    case '\n':
      return true;
    default:
      return false;
    }
}
507
508 /* Operator character recognition. */
509
/* Return true if C is an unquoted character that can start an operator:
   one of | & ; < > ( )  */
static inline bool
is_operator_start (int c)
{
  switch (c)
    {
    case '|':
    case '&':
    case ';':
    case '<':
    case '>':
    case '(':
    case ')':
      return true;
    default:
      return false;
    }
}
516
517
/* Denotation of a quoted character: the 'unsigned char' value, shifted
   above the 'unsigned char' range.
   The distinction between quoted and unquoted character is important only for
   the special, whitespace and operator characters; it is irrelevant for
   alphanumeric characters, '\\' and many others.  */
#define QUOTED(c) (UCHAR_MAX + 1 + (c))
/* Values in the 'unsigned char' range are implicitly unquoted.  Among these,
   the following are important:
     '"'         opening or closing double quote
     '\''        opening or closing single quote
     '$'         the unknown result of a dollar expansion
     '`'         does not occur - replaced with OPENING_BACKQUOTE or
                 CLOSING_BACKQUOTE
 */
/* Denotations of an opening and of a closing backquote.  They lie outside
   both the unquoted and the QUOTED value ranges.  */
#define OPENING_BACKQUOTE (2 * (UCHAR_MAX + 1) + '`')
#define CLOSING_BACKQUOTE (3 * (UCHAR_MAX + 1) + '`')
533
/* Characters pushed back by phase2_ungetc, most recent last.
   2 characters of pushback are supported; phase2_ungetc aborts when more
   would be needed.  */
static int phase2_pushback[2];
static int phase2_pushback_length;
539
540 /* Return the next character, with backslashes removed.
541 The result is QUOTED(c) for some unsigned char c, if the next character
542 is escaped sufficiently often to make it a regular constituent character,
543 or simply an 'unsigned char' if it has its special meaning (of special,
544 whitespace or operator charcter), or OPENING_BACKQUOTE, CLOSING_BACKQUOTE,
545 EOF.
546 It's the caller's responsibility to update the state. */
547 static int
548 phase2_getc ()
549 {
550 int c;
551
552 if (phase2_pushback_length)
553 {
554 c = phase2_pushback[--phase2_pushback_length];
555 if (c == '\n')
556 ++line_number;
557 return c;
558 }
559
560 c = phase1_getc ();
561 if (c == EOF)
562 return c;
563 if (c == '\'')
564 return ((open_doublequote
565 || (open_singlequote && open_singlequote_terminator != c))
566 ? QUOTED (c)
567 : c);
568 if (open_singlequote)
569 {
570 if (c == open_singlequote_terminator)
571 return c;
572 }
573 else
574 {
575 if (c == '"' || c == '$')
576 return c;
577 if (c == '`')
578 return (nested_backquotes > 0 ? CLOSING_BACKQUOTE : OPENING_BACKQUOTE);
579 }
580 if (c == '\\')
581 {
582 /* Number of debackslashification passes that are active at the
583 current point. */
584 unsigned int debackslashify =
585 nested_backquotes + (open_singlequote ? 0 : 1);
586 /* Normal number of backslashes that yield a single backslash in the
587 final output. */
588 unsigned int expected_count =
589 (unsigned int) 1 << debackslashify;
590 /* Number of backslashes found. */
591 unsigned int count;
592
593 for (count = 1; count < expected_count; count++)
594 {
595 c = phase1_getc ();
596 if (c != '\\')
597 break;
598 }
599 if (count == expected_count)
600 return '\\';
601
602 /* The count of backslashes is > 0 and < expected_count, therefore the
603 result depends on c, the first character after the backslashes.
604 Note: The formulas below don't necessarily have a logic; they were
605 empirically determined such that 1. the xgettext-sh-1 test succeeds,
606 2. the behaviour for count == 0 would correspond to the one without
607 any baskslash. */
608 if (c == '\'')
609 {
610 if (!open_singlequote && count > (expected_count >> 1))
611 {
612 phase1_ungetc (c);
613 return '\\';
614 }
615 else
616 return ((open_doublequote
617 || (open_singlequote
618 ? open_singlequote_terminator != c
619 : count == (expected_count >> 1)))
620 ? QUOTED (c)
621 : c);
622 }
623 else if (c == '"')
624 {
625 /* Each debackslashification pass converts \\ to \ and \" to ";
626 passes corresponding to `...` drop a lone " whereas passes
627 corresponding to "`...`" leave it alone. Therefore, the
628 minimum number of backslashes needed to get one double-quote
629 in the end is open_doublequotes_mask + 1. */
630 if (open_singlequote)
631 {
632 if (count > open_doublequotes_mask)
633 {
634 phase1_ungetc (c);
635 return '\\';
636 }
637 else
638 return (open_singlequote_terminator != c ? QUOTED (c) : c);
639 }
640 else
641 {
642 if (count > open_doublequotes_mask)
643 return QUOTED (c);
644 else
645 /* Some of the count values <= open_doublequotes_mask are
646 actually invalid here, but we assume a syntactically
647 correct input file anyway. */
648 return c;
649 }
650 }
651 else if (c == '`')
652 {
653 /* FIXME: This code looks fishy. */
654 if (count == expected_count - 1)
655 return c;
656 else
657 /* Some of the count values < expected_count - 1 are
658 actually invalid here, but we assume a syntactically
659 correct input file anyway. */
660 if (nested_backquotes > 0 && !open_singlequote
661 && count >= (expected_count >> 2))
662 return OPENING_BACKQUOTE;
663 else
664 return CLOSING_BACKQUOTE;
665 }
666 else if (c == '$')
667 {
668 if (open_singlequote)
669 return QUOTED (c);
670 if (count >= (expected_count >> 1))
671 return QUOTED (c);
672 else
673 return c;
674 }
675 else
676 {
677 /* When not followed by a quoting character or backslash or dollar,
678 a backslash survives a debackslashification pass unmodified.
679 Therefore each debackslashification pass performs a
680 count := (count + 1) >> 1
681 operation. Therefore the minimum number of backslashes needed
682 to get one backslash in the end is (expected_count >> 1) + 1. */
683 if (open_doublequote || open_singlequote)
684 {
685 if (count > 0)
686 {
687 phase1_ungetc (c);
688 return '\\';
689 }
690 else
691 return QUOTED (c);
692 }
693 else
694 {
695 if (count > (expected_count >> 1))
696 {
697 phase1_ungetc (c);
698 return '\\';
699 }
700 else if (count > 0)
701 return QUOTED (c);
702 else
703 return c;
704 }
705 }
706 }
707
708 return (open_singlequote || open_doublequote ? QUOTED (c) : c);
709 }
710
711 /* Supports 2 characters of pushback. */
712 static void
713 phase2_ungetc (int c)
714 {
715 switch (c)
716 {
717 case EOF:
718 break;
719
720 case '\n':
721 --line_number;
722 FALLTHROUGH;
723
724 default:
725 if (phase2_pushback_length == SIZEOF (phase2_pushback))
726 abort ();
727 phase2_pushback[phase2_pushback_length++] = c;
728 break;
729 }
730 }
731
732
733 /* Context lookup table. */
734 static flag_context_list_table_ty *flag_context_list_table;
735
736
737 /* Maximum supported nesting depth. */
738 #define MAX_NESTING_DEPTH 1000
739
740 /* Current nesting depth. */
741 static int nesting_depth;
742
743
744 /* Forward declaration of local functions. */
745 static enum word_type read_command_list (int looking_for,
746 flag_context_ty outer_context);
747
748
749
750 /* Read the next word.
751 'looking_for' denotes a parse terminator, either CLOSING_BACKQUOTE, ')'
752 or '\0'. */
753 static void
754 read_word (struct word *wp, int looking_for, flag_context_ty context)
755 {
756 int c;
757 bool all_unquoted_digits;
758 bool all_unquoted_name_characters;
759
760 do
761 {
762 c = phase2_getc ();
763 if (c == '#')
764 {
765 /* Skip a comment up to end of line. */
766 last_comment_line = line_number;
767 comment_start ();
768 for (;;)
769 {
770 c = phase1_getc ();
771 if (c == EOF || c == '\n')
772 break;
773 /* We skip all leading white space, but not EOLs. */
774 if (!(buflen == 0 && (c == ' ' || c == '\t')))
775 comment_add (c);
776 }
777 comment_line_end ();
778 }
779 if (c == '\n')
780 {
781 /* Comments assumed to be grouped with a message must immediately
782 precede it, with no non-whitespace token on a line between
783 both. */
784 if (last_non_comment_line > last_comment_line)
785 savable_comment_reset ();
786 wp->type = t_separator;
787 return;
788 }
789 }
790 while (is_whitespace (c));
791
792 if (c == EOF)
793 {
794 wp->type = t_eof;
795 return;
796 }
797
798 if (c == '<' || c == '>')
799 {
800 /* Recognize the redirection operators < > >| << <<- >> <> <& >&
801 But <( and >) are handled below, not here. */
802 int c2 = phase2_getc ();
803 if (c2 != '(')
804 {
805 if ((c == '<' ? c2 == '<' : c2 == '|') || c2 == '>' || c2 == '&')
806 {
807 if (c == '<' && c2 == '<')
808 {
809 int c3 = phase2_getc ();
810 if (c3 != '-')
811 phase2_ungetc (c3);
812 }
813 }
814 else
815 phase2_ungetc (c2);
816 wp->type = t_redirect;
817 return;
818 }
819 else
820 phase2_ungetc (c2);
821 }
822
823 if (c == CLOSING_BACKQUOTE)
824 {
825 if (looking_for == CLOSING_BACKQUOTE)
826 {
827 saw_closing_backquote ();
828 wp->type = t_backquote;
829 last_non_comment_line = line_number;
830 return;
831 }
832 else if (looking_for == ')')
833 {
834 /* The input is invalid syntax, such as `a<(`
835 Push back the closing backquote and pretend that we have seen a
836 closing parenthesis. */
837 phase2_ungetc (c);
838 wp->type = t_paren;
839 last_non_comment_line = line_number;
840 return;
841 }
842 else
843 /* We shouldn't be reading a CLOSING_BACKQUOTE when
844 looking_for == '\0'. */
845 abort ();
846 }
847
848 if (looking_for == ')' && c == ')')
849 {
850 wp->type = t_paren;
851 last_non_comment_line = line_number;
852 return;
853 }
854
855 if (is_operator_start (c))
856 {
857 wp->type = (c == ';' ? t_separator : t_other);
858 return;
859 }
860
861 wp->type = t_string;
862 wp->token = XMALLOC (struct token);
863 init_token (wp->token);
864 wp->line_number_at_start = line_number;
865 /* True while all characters in the token seen so far are digits. */
866 all_unquoted_digits = true;
867 /* True while all characters in the token seen so far form a "name":
868 all characters are unquoted underscores, digits, or alphabetics from the
869 portable character set, and the first character is not a digit. Cf.
870 <https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap03.html#tag_03_235>
871 */
872 all_unquoted_name_characters = true;
873
874 for (;; c = phase2_getc ())
875 {
876 if (c == EOF)
877 break;
878
879 if (all_unquoted_digits && (c == '<' || c == '>'))
880 {
881 /* Recognize the redirection operators < > >| << <<- >> <> <& >&
882 prefixed with a nonempty sequence of unquoted digits. */
883 int c2 = phase2_getc ();
884 if ((c == '<' ? c2 == '<' : c2 == '|') || c2 == '>' || c2 == '&')
885 {
886 if (c == '<' && c2 == '<')
887 {
888 int c3 = phase2_getc ();
889 if (c3 != '-')
890 phase2_ungetc (c3);
891 }
892 }
893 else
894 phase2_ungetc (c2);
895
896 wp->type = t_redirect;
897 free_token (wp->token);
898 free (wp->token);
899
900 last_non_comment_line = line_number;
901
902 return;
903 }
904
905 all_unquoted_digits = all_unquoted_digits && (c >= '0' && c <= '9');
906
907 if (all_unquoted_name_characters && wp->token->charcount > 0 && c == '=')
908 {
909 wp->type = t_assignment;
910 continue;
911 }
912
913 all_unquoted_name_characters =
914 all_unquoted_name_characters
915 && ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || c == '_'
916 || (wp->token->charcount > 0 && c >= '0' && c <= '9'));
917
918 if (c == '$')
919 {
920 int c2;
921
922 /* An unquoted dollar indicates we are not inside '...'. */
923 if (open_singlequote)
924 abort ();
925 /* After reading a dollar, we know that there is no pushed back
926 character from an earlier lookahead. */
927 if (phase2_pushback_length > 0)
928 abort ();
929 /* Therefore we can use phase1 without interfering with phase2.
930 We need to recognize $( outside and inside double-quotes.
931 It would be incorrect to do
932 c2 = phase2_getc ();
933 if (c2 == '(' || c2 == QUOTED ('('))
934 because that would also trigger for $\(. */
935 c2 = phase1_getc ();
936 if (c2 == '(')
937 {
938 bool saved_open_doublequote;
939 int c3;
940
941 phase1_ungetc (c2);
942
943 /* The entire inner command or arithmetic expression is read
944 ignoring possible surrounding double-quotes. */
945 saved_open_doublequote = open_doublequote;
946 open_doublequote = false;
947
948 c2 = phase2_getc ();
949 if (c2 != '(')
950 abort ();
951
952 c3 = phase2_getc ();
953 if (c3 == '(')
954 {
955 /* Arithmetic expression (Bash syntax). Skip until the
956 matching closing parenthesis. */
957 unsigned int depth = 2;
958
959 do
960 {
961 c = phase2_getc ();
962 if (c == '(')
963 depth++;
964 else if (c == ')')
965 if (--depth == 0)
966 break;
967 }
968 while (c != EOF);
969 }
970 else
971 {
972 /* Command substitution (Bash syntax). */
973 phase2_ungetc (c3);
974 ++nesting_depth;
975 read_command_list (')', context);
976 nesting_depth--;
977 }
978
979 open_doublequote = saved_open_doublequote;
980 }
981 else
982 {
983 phase1_ungetc (c2);
984 c2 = phase2_getc ();
985
986 if (c2 == '\'' && !open_singlequote)
987 {
988 /* Bash builtin for string with ANSI-C escape sequences. */
989 for (;;)
990 {
991 /* We have to use phase1 throughout this loop,
992 because phase2 does debackslashification,
993 which is undesirable when parsing ANSI-C
994 escape sequences. */
995 c = phase1_getc ();
996 if (c == EOF)
997 break;
998 if (c == '\'')
999 break;
1000 if (c == '\\')
1001 {
1002 c = phase1_getc ();
1003 switch (c)
1004 {
1005 default:
1006 phase1_ungetc (c);
1007 c = '\\';
1008 break;
1009
1010 case '\\':
1011 break;
1012 case '\'':
1013 break;
1014 case '"':
1015 break;
1016
1017 case 'a':
1018 c = '\a';
1019 break;
1020 case 'b':
1021 c = '\b';
1022 break;
1023 case 'e':
1024 case 'E':
1025 c = 0x1b; /* ESC */
1026 break;
1027 case 'f':
1028 c = '\f';
1029 break;
1030 case 'n':
1031 c = '\n';
1032 break;
1033 case 'r':
1034 c = '\r';
1035 break;
1036 case 't':
1037 c = '\t';
1038 break;
1039 case 'v':
1040 c = '\v';
1041 break;
1042
1043 case 'x':
1044 c = phase1_getc ();
1045 if ((c >= '0' && c <= '9')
1046 || (c >= 'A' && c <= 'F')
1047 || (c >= 'a' && c <= 'f'))
1048 {
1049 int n;
1050
1051 if (c >= '0' && c <= '9')
1052 n = c - '0';
1053 else if (c >= 'A' && c <= 'F')
1054 n = 10 + c - 'A';
1055 else if (c >= 'a' && c <= 'f')
1056 n = 10 + c - 'a';
1057 else
1058 abort ();
1059
1060 c = phase1_getc ();
1061 if ((c >= '0' && c <= '9')
1062 || (c >= 'A' && c <= 'F')
1063 || (c >= 'a' && c <= 'f'))
1064 {
1065 if (c >= '0' && c <= '9')
1066 n = n * 16 + c - '0';
1067 else if (c >= 'A' && c <= 'F')
1068 n = n * 16 + 10 + c - 'A';
1069 else if (c >= 'a' && c <= 'f')
1070 n = n * 16 + 10 + c - 'a';
1071 else
1072 abort ();
1073 }
1074 else
1075 phase1_ungetc (c);
1076
1077 c = n;
1078 }
1079 else
1080 {
1081 phase1_ungetc (c);
1082 phase1_ungetc ('x');
1083 c = '\\';
1084 }
1085 break;
1086
1087 case '0': case '1': case '2': case '3':
1088 case '4': case '5': case '6': case '7':
1089 {
1090 int n = c - '0';
1091
1092 c = phase1_getc ();
1093 if (c >= '0' && c <= '7')
1094 {
1095 n = n * 8 + c - '0';
1096
1097 c = phase1_getc ();
1098 if (c >= '0' && c <= '7')
1099 n = n * 8 + c - '0';
1100 else
1101 phase1_ungetc (c);
1102 }
1103 else
1104 phase1_ungetc (c);
1105
1106 c = n;
1107 }
1108 break;
1109 }
1110 }
1111 if (wp->type == t_string)
1112 {
1113 grow_token (wp->token);
1114 wp->token->chars[wp->token->charcount++] =
1115 (unsigned char) c;
1116 }
1117 }
1118 /* The result is a literal string. Don't change wp->type. */
1119 continue;
1120 }
1121 else if (c2 == '"' && !open_doublequote)
1122 {
1123 /* Bash builtin for internationalized string. */
1124 lex_pos_ty pos;
1125 struct token string;
1126
1127 saw_opening_singlequote ();
1128 open_singlequote_terminator = '"';
1129 pos.file_name = logical_file_name;
1130 pos.line_number = line_number;
1131 init_token (&string);
1132 for (;;)
1133 {
1134 c = phase2_getc ();
1135 if (c == EOF)
1136 break;
1137 if (c == '"')
1138 {
1139 saw_closing_singlequote ();
1140 break;
1141 }
1142 grow_token (&string);
1143 string.chars[string.charcount++] = (unsigned char) c;
1144 }
1145 remember_a_message (mlp, NULL, string_of_token (&string),
1146 false, false, context, &pos,
1147 NULL, savable_comment, false);
1148 free_token (&string);
1149
1150 error_with_progname = false;
1151 error (0, 0, _("%s:%lu: warning: the syntax $\"...\" is deprecated due to security reasons; use eval_gettext instead"),
1152 pos.file_name, (unsigned long) pos.line_number);
1153 error_with_progname = true;
1154
1155 /* The result at runtime is not constant. Therefore we
1156 change wp->type. */
1157 }
1158 else
1159 phase2_ungetc (c2);
1160 }
1161 wp->type = t_other;
1162 continue;
1163 }
1164
1165 if (c == '\'')
1166 {
1167 if (!open_singlequote)
1168 {
1169 /* Handle an opening single quote. */
1170 saw_opening_singlequote ();
1171 }
1172 else
1173 {
1174 /* Handle a closing single quote. */
1175 saw_closing_singlequote ();
1176 }
1177 continue;
1178 }
1179
1180 if (c == '"')
1181 {
1182 if (open_singlequote && open_singlequote_terminator == '"')
1183 {
1184 /* Handle a closing i18n quote. */
1185 saw_closing_singlequote ();
1186 }
1187 else if (!open_doublequote)
1188 {
1189 /* Handle an opening double quote. */
1190 saw_opening_doublequote ();
1191 }
1192 else
1193 {
1194 /* Handle a closing double quote. */
1195 saw_closing_doublequote ();
1196 }
1197 continue;
1198 }
1199
1200 if (c == OPENING_BACKQUOTE)
1201 {
1202 /* Handle an opening backquote. */
1203 saw_opening_backquote ();
1204
1205 ++nesting_depth;
1206 read_command_list (CLOSING_BACKQUOTE, context);
1207 nesting_depth--;
1208
1209 wp->type = t_other;
1210 continue;
1211 }
1212 if (c == CLOSING_BACKQUOTE)
1213 break;
1214
1215 if (c == '<' || c == '>')
1216 {
1217 int c2;
1218
1219 /* An unquoted c indicates we are not inside '...' nor "...". */
1220 if (open_singlequote || open_doublequote)
1221 abort ();
1222
1223 c2 = phase2_getc ();
1224 if (c2 == '(')
1225 {
1226 /* Process substitution (Bash syntax). */
1227 ++nesting_depth;
1228 read_command_list (')', context);
1229 nesting_depth--;
1230
1231 wp->type = t_other;
1232 continue;
1233 }
1234 else
1235 phase2_ungetc (c2);
1236 }
1237
1238 if (!open_singlequote && !open_doublequote
1239 && (is_whitespace (c) || is_operator_start (c)))
1240 break;
1241
1242 if (wp->type == t_string)
1243 {
1244 grow_token (wp->token);
1245 wp->token->chars[wp->token->charcount++] = (unsigned char) c;
1246 }
1247 }
1248
1249 phase2_ungetc (c);
1250
1251 if (wp->type != t_string)
1252 {
1253 free_token (wp->token);
1254 free (wp->token);
1255 }
1256 last_non_comment_line = line_number;
1257 }
1258
1259
1260 /* Read the next command.
1261 'looking_for' denotes a parse terminator, either CLOSING_BACKQUOTE, ')'
1262 or '\0'.
1263 Returns the type of the word that terminated the command. */
1264 static enum word_type
1265 read_command (int looking_for, flag_context_ty outer_context)
1266 {
1267 /* Read the words that make up the command.
1268 Here we completely ignore field splitting at whitespace and wildcard
1269 expansions; i.e. we assume that the source is written in such a way that
1270 every word in the program determines exactly one word in the resulting
1271 command.
1272 But we do not require that the 'gettext'/'ngettext' command is the
1273 first in the command; this is because 1. we want to allow for prefixes
1274 like "$verbose" that may expand to nothing, and 2. it's a big effort
1275 to know where a command starts in a $(for ...) or $(case ...) compound
1276 command. */
1277 int arg = 0; /* Current argument number. */
1278 bool arg_of_redirect = false; /* True right after a redirection operator. */
1279 bool must_expand_arg_strings = false; /* True if need to expand escape
1280 sequences in arguments. */
1281 flag_context_list_iterator_ty context_iter;
1282 const struct callshapes *shapes = NULL;
1283 struct arglist_parser *argparser = NULL;
1284
1285 for (;;)
1286 {
1287 struct word inner;
1288 flag_context_ty inner_context;
1289
1290 if (arg == 0)
1291 inner_context = null_context;
1292 else
1293 inner_context =
1294 inherited_context (outer_context,
1295 flag_context_list_iterator_advance (
1296 &context_iter));
1297
1298 read_word (&inner, looking_for, inner_context);
1299
1300 /* Recognize end of command. */
1301 if (inner.type == t_separator
1302 || inner.type == t_backquote || inner.type == t_paren
1303 || inner.type == t_eof)
1304 {
1305 if (argparser != NULL)
1306 arglist_parser_done (argparser, arg);
1307 return inner.type;
1308 }
1309
1310 if (extract_all)
1311 {
1312 if (inner.type == t_string)
1313 {
1314 lex_pos_ty pos;
1315
1316 pos.file_name = logical_file_name;
1317 pos.line_number = inner.line_number_at_start;
1318 remember_a_message (mlp, NULL, string_of_word (&inner), false,
1319 false, inner_context, &pos,
1320 NULL, savable_comment, false);
1321 }
1322 }
1323
1324 if (arg_of_redirect)
1325 {
1326 /* Ignore arguments of redirection operators. */
1327 arg_of_redirect = false;
1328 }
1329 else if (inner.type == t_redirect)
1330 {
1331 /* Ignore this word and the following one. */
1332 arg_of_redirect = true;
1333 }
1334 else
1335 {
1336 bool matters_for_argparser = true;
1337
1338 if (argparser == NULL)
1339 {
1340 /* This is the function position. */
1341 arg = 0;
1342 if (inner.type == t_assignment)
1343 {
1344 /* An assignment just sets an environment variable.
1345 Ignore it. */
1346 /* Don't increment arg in this round. */
1347 matters_for_argparser = false;
1348 }
1349 else if (inner.type == t_string)
1350 {
1351 char *function_name = string_of_word (&inner);
1352
1353 if (strcmp (function_name, "env") == 0)
1354 {
1355 /* The 'env' command just introduces more assignments.
1356 Ignore it. */
1357 /* Don't increment arg in this round. */
1358 matters_for_argparser = false;
1359 }
1360 else
1361 {
1362 void *keyword_value;
1363
1364 if (hash_find_entry (&keywords,
1365 function_name,
1366 strlen (function_name),
1367 &keyword_value)
1368 == 0)
1369 shapes = (const struct callshapes *) keyword_value;
1370
1371 argparser = arglist_parser_alloc (mlp, shapes);
1372
1373 context_iter =
1374 flag_context_list_iterator (
1375 flag_context_list_table_lookup (
1376 flag_context_list_table,
1377 function_name, strlen (function_name)));
1378 }
1379
1380 free (function_name);
1381 }
1382 else
1383 context_iter = null_context_list_iterator;
1384 }
1385 else
1386 {
1387 /* These are the argument positions. */
1388 if (inner.type == t_string)
1389 {
1390 bool accepts_context =
1391 ((argparser->keyword_len == 7
1392 && memcmp (argparser->keyword, "gettext", 7) == 0)
1393 || (argparser->keyword_len == 8
1394 && memcmp (argparser->keyword, "ngettext", 8) == 0));
1395 bool accepts_expand =
1396 ((argparser->keyword_len == 7
1397 && memcmp (argparser->keyword, "gettext", 7) == 0)
1398 || (argparser->keyword_len == 8
1399 && memcmp (argparser->keyword, "ngettext", 8) == 0));
1400 if (accepts_context && argparser->next_is_msgctxt)
1401 {
1402 char *s = string_of_word (&inner);
1403 mixed_string_ty *ms =
1404 mixed_string_alloc_simple (s, lc_string,
1405 logical_file_name,
1406 inner.line_number_at_start);
1407 free (s);
1408 argparser->next_is_msgctxt = false;
1409 arglist_parser_remember_msgctxt (argparser, ms,
1410 inner_context,
1411 logical_file_name,
1412 inner.line_number_at_start);
1413 matters_for_argparser = false;
1414 }
1415 else if (accepts_context
1416 && ((inner.token->charcount == 2
1417 && memcmp (inner.token->chars, "-c", 2) == 0)
1418 || (inner.token->charcount == 9
1419 && memcmp (inner.token->chars, "--context", 9) == 0)))
1420 {
1421 argparser->next_is_msgctxt = true;
1422 matters_for_argparser = false;
1423 }
1424 else if (accepts_context
1425 && (inner.token->charcount >= 10
1426 && memcmp (inner.token->chars, "--context=", 10) == 0))
1427 {
1428 char *s = substring_of_word (&inner, 10);
1429 mixed_string_ty *ms =
1430 mixed_string_alloc_simple (s, lc_string,
1431 logical_file_name,
1432 inner.line_number_at_start);
1433 free (s);
1434 argparser->next_is_msgctxt = false;
1435 arglist_parser_remember_msgctxt (argparser, ms,
1436 inner_context,
1437 logical_file_name,
1438 inner.line_number_at_start);
1439 matters_for_argparser = false;
1440 }
1441 else if (accepts_expand
1442 && inner.token->charcount == 2
1443 && memcmp (inner.token->chars, "-e", 2) == 0)
1444 {
1445 must_expand_arg_strings = true;
1446 matters_for_argparser = false;
1447 }
1448 else
1449 {
1450 char *s = string_of_word (&inner);
1451 mixed_string_ty *ms;
1452
1453 /* When '-e' was specified, expand escape sequences in s. */
1454 if (accepts_expand && must_expand_arg_strings)
1455 {
1456 bool expands_backslash_c =
1457 (argparser->keyword_len == 7
1458 && memcmp (argparser->keyword, "gettext", 7) == 0);
1459 bool backslash_c = false;
1460 char *expanded =
1461 (char *)
1462 expand_escapes (s, expands_backslash_c ? &backslash_c : NULL);
1463 /* We can ignore the value of expands_backslash_c, because
1464 here we don't support the gettext '-s' option. */
1465 if (expanded != s)
1466 free (s);
1467 s = expanded;
1468 }
1469
1470 ms = mixed_string_alloc_simple (s, lc_string,
1471 logical_file_name,
1472 inner.line_number_at_start);
1473 free (s);
1474 arglist_parser_remember (argparser, arg, ms,
1475 inner_context,
1476 logical_file_name,
1477 inner.line_number_at_start,
1478 savable_comment, false);
1479 }
1480 }
1481
1482 if (matters_for_argparser)
1483 if (arglist_parser_decidedp (argparser, arg))
1484 {
1485 /* Stop looking for arguments of the last function_name. */
1486 /* FIXME: What about context_iter? */
1487 arglist_parser_done (argparser, arg);
1488 shapes = NULL;
1489 argparser = NULL;
1490 }
1491 }
1492
1493 if (matters_for_argparser)
1494 arg++;
1495 }
1496
1497 free_word (&inner);
1498 }
1499 }
1500
1501
1502 /* Read a list of commands.
1503 'looking_for' denotes a parse terminator, either CLOSING_BACKQUOTE, ')'
1504 or '\0'.
1505 Returns the type of the word that terminated the command list. */
1506 static enum word_type
1507 read_command_list (int looking_for, flag_context_ty outer_context)
1508 {
1509 if (nesting_depth > MAX_NESTING_DEPTH)
1510 {
1511 error_with_progname = false;
1512 error (EXIT_FAILURE, 0, _("%s:%d: error: too deeply nested command list"),
1513 logical_file_name, line_number);
1514 }
1515 for (;;)
1516 {
1517 enum word_type terminator;
1518
1519 terminator = read_command (looking_for, outer_context);
1520 if (terminator != t_separator)
1521 return terminator;
1522 }
1523 }
1524
1525
1526 void
1527 extract_sh (FILE *f,
1528 const char *real_filename, const char *logical_filename,
1529 flag_context_list_table_ty *flag_table,
1530 msgdomain_list_ty *mdlp)
1531 {
1532 mlp = mdlp->item[0]->messages;
1533
1534 fp = f;
1535 real_file_name = real_filename;
1536 logical_file_name = xstrdup (logical_filename);
1537 line_number = 1;
1538
1539 phase1_pushback_length = 0;
1540
1541 last_comment_line = -1;
1542 last_non_comment_line = -1;
1543
1544 nested_backquotes = 0;
1545 open_doublequotes_mask = 0;
1546 open_doublequote = false;
1547 open_singlequote = false;
1548
1549 phase2_pushback_length = 0;
1550
1551 flag_context_list_table = flag_table;
1552 nesting_depth = 0;
1553
1554 init_keywords ();
1555
1556 /* Eat tokens until eof is seen. */
1557 read_command_list ('\0', null_context);
1558
1559 fp = NULL;
1560 real_file_name = NULL;
1561 logical_file_name = NULL;
1562 line_number = 0;
1563 }