1 /* Keeping track of the encoding of strings to be extracted.
2 Copyright (C) 2001-2023 Free Software Foundation, Inc.
3
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 3 of the License, or
7 (at your option) any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <https://www.gnu.org/licenses/>. */
16
17 #ifdef HAVE_CONFIG_H
18 # include <config.h>
19 #endif
20
21 /* Specification. */
22 #include "xg-encoding.h"
23
24 #include <stdio.h>
25 #include <stdlib.h>
26
27 #include "msgl-ascii.h"
28 #include "msgl-iconv.h"
29 #include "po-charset.h"
30 #include "unistr.h"
31 #include "xalloc.h"
32 #include "xerror.h"
33 #include "xvasprintf.h"
34
35 #include "gettext.h"
36 #define _(str) gettext (str)
37
38
/* Canonicalized encoding name for all input files.
   It can be NULL when the --from-code option has not been specified.  In this
   case, the default (ASCII or UTF-8) depends on the programming language.
   The variable is only defined here; no code visible in this file assigns
   it.  */
const char *xgettext_global_source_encoding;

#if HAVE_ICONV
/* Converter from xgettext_global_source_encoding to UTF-8 (except from
   ASCII or UTF-8, when this conversion is a no-op).
   Exists only when the package is built with iconv support.  */
iconv_t xgettext_global_source_iconv;
#endif

/* Canonicalized encoding name for the current input file.
   from_current_source_encoding and string_desc_from_current_source_encoding
   compare this pointer directly (by identity, not with strcmp) against
   po_charset_ascii and po_charset_utf8 in order to decide how a string is
   validated or converted.  */
const char *xgettext_current_source_encoding;

#if HAVE_ICONV
/* Converter from xgettext_current_source_encoding to UTF-8 (except from
   ASCII or UTF-8, when this conversion is a no-op).
   This is the descriptor handed to convert_string_directly and
   convert_string_desc_directly when the current encoding is neither ASCII
   nor UTF-8.  */
iconv_t xgettext_current_source_iconv;
#endif
58
59
60 /* Error message about non-ASCII character in a specific lexical context. */
61 char *
62 non_ascii_error_message (lexical_context_ty lcontext,
63 const char *file_name, size_t line_number)
64 {
65 char buffer[22];
66 char *errmsg;
67
68 if (line_number == (size_t)(-1))
69 buffer[0] = '\0';
70 else
71 sprintf (buffer, ":%ld", (long) line_number);
72
73 switch (lcontext)
74 {
75 case lc_outside:
76 case lc_xml_content:
77 errmsg =
78 xasprintf (_("Non-ASCII character at %s%s."), file_name, buffer);
79 break;
80 case lc_comment:
81 errmsg =
82 xasprintf (_("Non-ASCII comment at or before %s%s."),
83 file_name, buffer);
84 break;
85 case lc_string:
86 errmsg =
87 xasprintf (_("Non-ASCII string at %s%s."), file_name, buffer);
88 break;
89 case lc_xml_open_tag:
90 case lc_xml_close_tag:
91 errmsg =
92 xasprintf (_("Non-ASCII XML tag at %s%s."), file_name, buffer);
93 break;
94 default:
95 abort ();
96 }
97 return errmsg;
98 }
99
100 /* Error message about non-UTF-8 character in a specific lexical context. */
101 static char *
102 non_utf8_error_message (lexical_context_ty lcontext,
103 const char *file_name, size_t line_number)
104 {
105 char buffer[22];
106 char *errmsg;
107
108 if (line_number == (size_t)(-1))
109 buffer[0] = '\0';
110 else
111 sprintf (buffer, ":%ld", (long) line_number);
112
113 switch (lcontext)
114 {
115 case lc_outside:
116 case lc_xml_content:
117 errmsg =
118 xasprintf (_("Character at %s%s is not UTF-8 encoded."),
119 file_name, buffer);
120 break;
121 case lc_comment:
122 errmsg =
123 xasprintf (_("Comment at or before %s%s is not UTF-8 encoded."),
124 file_name, buffer);
125 break;
126 case lc_string:
127 errmsg =
128 xasprintf (_("String at %s%s is not UTF-8 encoded."),
129 file_name, buffer);
130 break;
131 case lc_xml_open_tag:
132 case lc_xml_close_tag:
133 errmsg =
134 xasprintf (_("XML tag at %s%s is not UTF-8 encoded."), file_name, buffer);
135 break;
136 default:
137 abort ();
138 }
139 return errmsg;
140 }
141
142 /* Convert the given string from xgettext_current_source_encoding to
143 the output file encoding (i.e. ASCII or UTF-8).
144 The resulting string is either the argument string, or freshly allocated.
145 The file_name and line_number are only used for error message purposes. */
146 char *
147 from_current_source_encoding (const char *string,
148 lexical_context_ty lcontext,
149 const char *file_name, size_t line_number)
150 {
151 if (xgettext_current_source_encoding == po_charset_ascii)
152 {
153 if (!is_ascii_string (string))
154 {
155 multiline_error (xstrdup (""),
156 xasprintf ("%s\n%s\n",
157 non_ascii_error_message (lcontext,
158 file_name,
159 line_number),
160 _("Please specify the source encoding through --from-code.")));
161 exit (EXIT_FAILURE);
162 }
163 }
164 else if (xgettext_current_source_encoding == po_charset_utf8)
165 {
166 if (u8_check ((const uint8_t *) string, strlen (string)) != NULL)
167 {
168 multiline_error (xstrdup (""),
169 xasprintf ("%s\n%s\n",
170 non_utf8_error_message (lcontext,
171 file_name,
172 line_number),
173 _("Please specify the source encoding through --from-code.")));
174 exit (EXIT_FAILURE);
175 }
176 }
177 else
178 {
179 #if HAVE_ICONV
180 struct conversion_context context;
181
182 context.from_code = xgettext_current_source_encoding;
183 context.to_code = po_charset_utf8;
184 context.from_filename = file_name;
185 context.message = NULL;
186
187 string = convert_string_directly (xgettext_current_source_iconv, string,
188 &context);
189 #else
190 /* If we don't have iconv(), the only supported values for
191 xgettext_global_source_encoding and thus also for
192 xgettext_current_source_encoding are ASCII and UTF-8.
193 convert_string_directly() should not be called in this case. */
194 abort ();
195 #endif
196 }
197
198 return (char *) string;
199 }
200
201 /* Like from_current_source_encoding, for a string that may contain NULs. */
202 string_desc_t
203 string_desc_from_current_source_encoding (string_desc_t string,
204 lexical_context_ty lcontext,
205 const char *file_name,
206 size_t line_number)
207 {
208 if (xgettext_current_source_encoding == po_charset_ascii)
209 {
210 if (!is_ascii_string_desc (string))
211 {
212 multiline_error (xstrdup (""),
213 xasprintf ("%s\n%s\n",
214 non_ascii_error_message (lcontext,
215 file_name,
216 line_number),
217 _("Please specify the source encoding through --from-code.")));
218 exit (EXIT_FAILURE);
219 }
220 }
221 else if (xgettext_current_source_encoding == po_charset_utf8)
222 {
223 if (u8_check ((const uint8_t *) string_desc_data (string),
224 string_desc_length (string))
225 != NULL)
226 {
227 multiline_error (xstrdup (""),
228 xasprintf ("%s\n%s\n",
229 non_utf8_error_message (lcontext,
230 file_name,
231 line_number),
232 _("Please specify the source encoding through --from-code.")));
233 exit (EXIT_FAILURE);
234 }
235 }
236 else
237 {
238 #if HAVE_ICONV
239 struct conversion_context context;
240
241 context.from_code = xgettext_current_source_encoding;
242 context.to_code = po_charset_utf8;
243 context.from_filename = file_name;
244 context.message = NULL;
245
246 string = convert_string_desc_directly (xgettext_current_source_iconv,
247 string, &context);
248 #else
249 /* If we don't have iconv(), the only supported values for
250 xgettext_global_source_encoding and thus also for
251 xgettext_current_source_encoding are ASCII and UTF-8.
252 convert_string_desc_directly() should not be called in this case. */
253 abort ();
254 #endif
255 }
256
257 return string;
258 }