1 /* Localization of proper names.
2 Copyright (C) 2006-2023 Free Software Foundation, Inc.
3 Written by Bruno Haible <bruno@clisp.org>, 2006.
4
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation, either version 3 of the License, or
8 (at your option) any later version.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with this program. If not, see <https://www.gnu.org/licenses/>. */
17
/* Without this pragma, gcc 4.7.0 20111124 mistakenly suggests that
   the proper_name function might be a candidate for attribute 'const'.  */
20 #if (__GNUC__ == 4 && 6 <= __GNUC_MINOR__) || 4 < __GNUC__
21 # pragma GCC diagnostic ignored "-Wsuggest-attribute=const"
22 #endif
23
24 #include <config.h>
25
26 /* Specification. */
27 #include "propername.h"
28
29 #include <ctype.h>
30 #include <stdio.h>
31 #include <stdlib.h>
32 #include <string.h>
33 #if HAVE_ICONV
34 # include <iconv.h>
35 #endif
36
37 #include "trim.h"
38 #if GNULIB_MCEL_PREFER
39 # include "mcel.h"
40 #else
41 # include "mbchar.h"
42 # include "mbuiter.h"
43 #endif
44 #include "localcharset.h"
45 #include "c-strcase.h"
46 #include "xstriconv.h"
47 #include "xalloc.h"
48 #include "gettext.h"
49
50
51 /* Tests whether STRING contains trim (SUB), starting and ending at word
52 boundaries.
53 Here, instead of implementing Unicode Standard Annex #29 for determining
54 word boundaries, we assume that trim (SUB) starts and ends with words and
55 only test whether the part before it ends with a non-word and the part
56 after it starts with a non-word. */
57 static bool
58 mbsstr_trimmed_wordbounded (const char *string, const char *sub)
59 {
60 char *tsub = trim (sub);
61 bool found = false;
62 bool multibyte_locale = MB_CUR_MAX > 1;
63 size_t tsublen;
64 if (! multibyte_locale)
65 tsublen = strlen (tsub);
66
67 while (*string != '\0')
68 {
69 const char *tsub_in_string = mbsstr (string, tsub);
70 if (tsub_in_string == NULL)
71 break;
72 else
73 {
74 if (multibyte_locale)
75 {
76 #if GNULIB_MCEL_PREFER
77 char const *string_iter = string;
78
79 char32_t last_char_before_tsub = 0;
80 while (string_iter < tsub_in_string)
81 {
82 mcel_t g = mcel_scanz (string_iter);
83 last_char_before_tsub = g.ch;
84 string_iter += g.len;
85 }
86
87 string_iter = tsub_in_string;
88 for (char const *tsub_iter = tsub; *tsub_iter;
89 tsub_iter += mcel_scanz (tsub_iter).len)
90 string_iter += mcel_scanz (string_iter).len;
91
92 if (!c32isalnum (last_char_before_tsub)
93 && !c32isalnum (mcel_scanz (string_iter).ch))
94 {
95 found = true;
96 break;
97 }
98
99 if (!*tsub_in_string)
100 break;
101 string = tsub_in_string + mcel_scanz (tsub_in_string).len;
102 #else
103 mbui_iterator_t string_iter;
104 bool word_boundary_before;
105 bool word_boundary_after;
106
107 mbui_init (string_iter, string);
108 word_boundary_before = true;
109 if (mbui_cur_ptr (string_iter) < tsub_in_string)
110 {
111 mbchar_t last_char_before_tsub;
112 do
113 {
114 if (!mbui_avail (string_iter))
115 abort ();
116 last_char_before_tsub = mbui_cur (string_iter);
117 mbui_advance (string_iter);
118 }
119 while (mbui_cur_ptr (string_iter) < tsub_in_string);
120 if (mb_isalnum (last_char_before_tsub))
121 word_boundary_before = false;
122 }
123
124 mbui_init (string_iter, tsub_in_string);
125 {
126 mbui_iterator_t tsub_iter;
127
128 for (mbui_init (tsub_iter, tsub);
129 mbui_avail (tsub_iter);
130 mbui_advance (tsub_iter))
131 {
132 if (!mbui_avail (string_iter))
133 abort ();
134 mbui_advance (string_iter);
135 }
136 }
137 word_boundary_after = true;
138 if (mbui_avail (string_iter))
139 {
140 mbchar_t first_char_after_tsub = mbui_cur (string_iter);
141 if (mb_isalnum (first_char_after_tsub))
142 word_boundary_after = false;
143 }
144
145 if (word_boundary_before && word_boundary_after)
146 {
147 found = true;
148 break;
149 }
150
151 mbui_init (string_iter, tsub_in_string);
152 if (!mbui_avail (string_iter))
153 break;
154 string = tsub_in_string + mb_len (mbui_cur (string_iter));
155 #endif
156 }
157 else
158 {
159 if ((string == tsub_in_string
160 || !isalnum ((unsigned char) tsub_in_string[-1]))
161 && !isalnum ((unsigned char) tsub_in_string[tsublen]))
162 {
163 found = true;
164 break;
165 }
166
167 if (*tsub_in_string == '\0')
168 break;
169 string = tsub_in_string + 1;
170 }
171 }
172 }
173 free (tsub);
174 return found;
175 }
176
177 /* Return the localization of NAME. NAME is written in ASCII. */
178
179 const char *
180 proper_name (const char *name)
181 {
182 /* See whether there is a translation. */
183 const char *translation = gettext (name);
184
185 if (translation != name)
186 {
187 /* See whether the translation contains the original name. */
188 if (mbsstr_trimmed_wordbounded (translation, name))
189 return translation;
190 else
191 {
192 /* Return "TRANSLATION (NAME)". */
193 char *result =
194 XNMALLOC (strlen (translation) + 2 + strlen (name) + 1 + 1, char);
195
196 sprintf (result, "%s (%s)", translation, name);
197 return result;
198 }
199 }
200 else
201 return name;
202 }
203
204 /* Return the localization of a name whose original writing is not ASCII.
205 NAME_UTF8 is the real name, written in UTF-8 with octal or hexadecimal
206 escape sequences. NAME_ASCII is a fallback written only with ASCII
207 characters. */
208
209 const char *
210 proper_name_utf8 (const char *name_ascii, const char *name_utf8)
211 {
212 /* See whether there is a translation. */
213 const char *translation = gettext (name_ascii);
214
215 /* Try to convert NAME_UTF8 to the locale encoding. */
216 const char *locale_code = locale_charset ();
217 char *alloc_name_converted = NULL;
218 char *alloc_name_converted_translit = NULL;
219 const char *name_converted = NULL;
220 const char *name_converted_translit = NULL;
221 const char *name;
222
223 if (c_strcasecmp (locale_code, "UTF-8") != 0)
224 {
225 #if HAVE_ICONV
226 name_converted = alloc_name_converted =
227 xstr_iconv (name_utf8, "UTF-8", locale_code);
228
229 # if (((__GLIBC__ == 2 && __GLIBC_MINOR__ >= 2) || __GLIBC__ > 2) \
230 && !defined __UCLIBC__) \
231 || _LIBICONV_VERSION >= 0x0105
232 {
233 char *converted_translit;
234
235 size_t len = strlen (locale_code);
236 char *locale_code_translit = XNMALLOC (len + 10 + 1, char);
237 memcpy (locale_code_translit, locale_code, len);
238 memcpy (locale_code_translit + len, "//TRANSLIT", 10 + 1);
239
240 converted_translit =
241 xstr_iconv (name_utf8, "UTF-8", locale_code_translit);
242
243 free (locale_code_translit);
244
245 if (converted_translit != NULL)
246 {
247 # if !_LIBICONV_VERSION
248 /* Don't use the transliteration if it added question marks.
249 glibc's transliteration falls back to question marks; libiconv's
250 transliteration does not.
251 mbschr is equivalent to strchr in this case. */
252 if (strchr (converted_translit, '?') != NULL)
253 free (converted_translit);
254 else
255 # endif
256 name_converted_translit = alloc_name_converted_translit =
257 converted_translit;
258 }
259 }
260 # endif
261 #endif
262 }
263 else
264 {
265 name_converted = name_utf8;
266 name_converted_translit = name_utf8;
267 }
268
269 /* The name in locale encoding. */
270 name = (name_converted != NULL ? name_converted :
271 name_converted_translit != NULL ? name_converted_translit :
272 name_ascii);
273
274 /* See whether we have a translation. Some translators have not understood
275 that they should use the UTF-8 form of the name, if possible. So if the
276 translator provided a no-op translation, we ignore it. */
277 if (strcmp (translation, name_ascii) != 0)
278 {
279 /* See whether the translation contains the original name. */
280 if (mbsstr_trimmed_wordbounded (translation, name_ascii)
281 || (name_converted != NULL
282 && mbsstr_trimmed_wordbounded (translation, name_converted))
283 || (name_converted_translit != NULL
284 && mbsstr_trimmed_wordbounded (translation, name_converted_translit)))
285 {
286 if (alloc_name_converted != NULL)
287 free (alloc_name_converted);
288 if (alloc_name_converted_translit != NULL)
289 free (alloc_name_converted_translit);
290 return translation;
291 }
292 else
293 {
294 /* Return "TRANSLATION (NAME)". */
295 char *result =
296 XNMALLOC (strlen (translation) + 2 + strlen (name) + 1 + 1, char);
297
298 sprintf (result, "%s (%s)", translation, name);
299
300 if (alloc_name_converted != NULL)
301 free (alloc_name_converted);
302 if (alloc_name_converted_translit != NULL)
303 free (alloc_name_converted_translit);
304 return result;
305 }
306 }
307 else
308 {
309 if (alloc_name_converted != NULL && alloc_name_converted != name)
310 free (alloc_name_converted);
311 if (alloc_name_converted_translit != NULL
312 && alloc_name_converted_translit != name)
313 free (alloc_name_converted_translit);
314 return name;
315 }
316 }
317
318 #ifdef TEST1
319 # include <locale.h>
/* Test driver for mbsstr_trimmed_wordbounded: argv[1] is the string to
   search in, argv[2] the name to look for.  Prints "found" when argv[1]
   contains the trimmed argv[2] at word boundaries.  Exits with status 2
   when an argument is missing.  */
int
main (int argc, char *argv[])
{
  /* Both arguments are dereferenced below; refuse to run without them.  */
  if (argc < 3)
    {
      fprintf (stderr, "Usage: %s STRING SUB\n",
               argc > 0 && argv[0] != NULL ? argv[0] : "propername");
      return 2;
    }

  setlocale (LC_ALL, "");
  if (mbsstr_trimmed_wordbounded (argv[1], argv[2]))
    printf ("found\n");
  return 0;
}
328 #endif
329
330 #ifdef TEST2
331 # include <locale.h>
332 # include <stdio.h>
/* Test driver for proper_name_utf8: prints the localized form of a fixed
   sample name, using the locale taken from the environment.  */
int
main (int argc, char *argv[])
{
  const char *localized;

  setlocale (LC_ALL, "");
  localized =
    proper_name_utf8 ("Franc,ois Pinard", "Fran\303\247ois Pinard");
  printf ("%s\n", localized);
  return 0;
}
340 #endif