1 /* Localization of proper names.
2 Copyright (C) 2006-2023 Free Software Foundation, Inc.
3 Written by Bruno Haible <bruno@clisp.org>, 2006.
4
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation, either version 3 of the License, or
8 (at your option) any later version.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with this program. If not, see <https://www.gnu.org/licenses/>. */
17
/* Without this pragma, gcc 4.7.0 20111124 mistakenly suggests that
   the proper_name function might be a candidate for attribute 'const'.  */
20 #if (__GNUC__ == 4 && 6 <= __GNUC_MINOR__) || 4 < __GNUC__
21 # pragma GCC diagnostic ignored "-Wsuggest-attribute=const"
22 #endif
23
24 #include <config.h>
25
26 /* Specification. */
27 #include "propername.h"
28
29 #include <ctype.h>
30 #include <stdio.h>
31 #include <stdlib.h>
32 #include <string.h>
33 #if HAVE_ICONV
34 # include <iconv.h>
35 #endif
36
37 #include "trim.h"
38 #include "mbchar.h"
39 #include "mbuiter.h"
40 #include "localcharset.h"
41 #include "c-strcase.h"
42 #include "xstriconv.h"
43 #include "xalloc.h"
44 #include "gettext.h"
45
46
47 /* Tests whether STRING contains trim (SUB), starting and ending at word
48 boundaries.
49 Here, instead of implementing Unicode Standard Annex #29 for determining
50 word boundaries, we assume that trim (SUB) starts and ends with words and
51 only test whether the part before it ends with a non-word and the part
52 after it starts with a non-word. */
53 static bool
54 mbsstr_trimmed_wordbounded (const char *string, const char *sub)
55 {
56 char *tsub = trim (sub);
57 bool found = false;
58
59 for (; *string != '\0';)
60 {
61 const char *tsub_in_string = mbsstr (string, tsub);
62 if (tsub_in_string == NULL)
63 break;
64 else
65 {
66 if (MB_CUR_MAX > 1)
67 {
68 mbui_iterator_t string_iter;
69 bool word_boundary_before;
70 bool word_boundary_after;
71
72 mbui_init (string_iter, string);
73 word_boundary_before = true;
74 if (mbui_cur_ptr (string_iter) < tsub_in_string)
75 {
76 mbchar_t last_char_before_tsub;
77 do
78 {
79 if (!mbui_avail (string_iter))
80 abort ();
81 last_char_before_tsub = mbui_cur (string_iter);
82 mbui_advance (string_iter);
83 }
84 while (mbui_cur_ptr (string_iter) < tsub_in_string);
85 if (mb_isalnum (last_char_before_tsub))
86 word_boundary_before = false;
87 }
88
89 mbui_init (string_iter, tsub_in_string);
90 {
91 mbui_iterator_t tsub_iter;
92
93 for (mbui_init (tsub_iter, tsub);
94 mbui_avail (tsub_iter);
95 mbui_advance (tsub_iter))
96 {
97 if (!mbui_avail (string_iter))
98 abort ();
99 mbui_advance (string_iter);
100 }
101 }
102 word_boundary_after = true;
103 if (mbui_avail (string_iter))
104 {
105 mbchar_t first_char_after_tsub = mbui_cur (string_iter);
106 if (mb_isalnum (first_char_after_tsub))
107 word_boundary_after = false;
108 }
109
110 if (word_boundary_before && word_boundary_after)
111 {
112 found = true;
113 break;
114 }
115
116 mbui_init (string_iter, tsub_in_string);
117 if (!mbui_avail (string_iter))
118 break;
119 string = tsub_in_string + mb_len (mbui_cur (string_iter));
120 }
121 else
122 {
123 bool word_boundary_before;
124 const char *p;
125 bool word_boundary_after;
126
127 word_boundary_before = true;
128 if (string < tsub_in_string)
129 if (isalnum ((unsigned char) tsub_in_string[-1]))
130 word_boundary_before = false;
131
132 p = tsub_in_string + strlen (tsub);
133 word_boundary_after = true;
134 if (*p != '\0')
135 if (isalnum ((unsigned char) *p))
136 word_boundary_after = false;
137
138 if (word_boundary_before && word_boundary_after)
139 {
140 found = true;
141 break;
142 }
143
144 if (*tsub_in_string == '\0')
145 break;
146 string = tsub_in_string + 1;
147 }
148 }
149 }
150 free (tsub);
151 return found;
152 }
153
154 /* Return the localization of NAME. NAME is written in ASCII. */
155
156 const char *
157 proper_name (const char *name)
158 {
159 /* See whether there is a translation. */
160 const char *translation = gettext (name);
161
162 if (translation != name)
163 {
164 /* See whether the translation contains the original name. */
165 if (mbsstr_trimmed_wordbounded (translation, name))
166 return translation;
167 else
168 {
169 /* Return "TRANSLATION (NAME)". */
170 char *result =
171 XNMALLOC (strlen (translation) + 2 + strlen (name) + 1 + 1, char);
172
173 sprintf (result, "%s (%s)", translation, name);
174 return result;
175 }
176 }
177 else
178 return name;
179 }
180
181 /* Return the localization of a name whose original writing is not ASCII.
182 NAME_UTF8 is the real name, written in UTF-8 with octal or hexadecimal
183 escape sequences. NAME_ASCII is a fallback written only with ASCII
184 characters. */
185
186 const char *
187 proper_name_utf8 (const char *name_ascii, const char *name_utf8)
188 {
189 /* See whether there is a translation. */
190 const char *translation = gettext (name_ascii);
191
192 /* Try to convert NAME_UTF8 to the locale encoding. */
193 const char *locale_code = locale_charset ();
194 char *alloc_name_converted = NULL;
195 char *alloc_name_converted_translit = NULL;
196 const char *name_converted = NULL;
197 const char *name_converted_translit = NULL;
198 const char *name;
199
200 if (c_strcasecmp (locale_code, "UTF-8") != 0)
201 {
202 #if HAVE_ICONV
203 name_converted = alloc_name_converted =
204 xstr_iconv (name_utf8, "UTF-8", locale_code);
205
206 # if (((__GLIBC__ == 2 && __GLIBC_MINOR__ >= 2) || __GLIBC__ > 2) \
207 && !defined __UCLIBC__) \
208 || _LIBICONV_VERSION >= 0x0105
209 {
210 char *converted_translit;
211
212 size_t len = strlen (locale_code);
213 char *locale_code_translit = XNMALLOC (len + 10 + 1, char);
214 memcpy (locale_code_translit, locale_code, len);
215 memcpy (locale_code_translit + len, "//TRANSLIT", 10 + 1);
216
217 converted_translit =
218 xstr_iconv (name_utf8, "UTF-8", locale_code_translit);
219
220 free (locale_code_translit);
221
222 if (converted_translit != NULL)
223 {
224 # if !_LIBICONV_VERSION
225 /* Don't use the transliteration if it added question marks.
226 glibc's transliteration falls back to question marks; libiconv's
227 transliteration does not.
228 mbschr is equivalent to strchr in this case. */
229 if (strchr (converted_translit, '?') != NULL)
230 free (converted_translit);
231 else
232 # endif
233 name_converted_translit = alloc_name_converted_translit =
234 converted_translit;
235 }
236 }
237 # endif
238 #endif
239 }
240 else
241 {
242 name_converted = name_utf8;
243 name_converted_translit = name_utf8;
244 }
245
246 /* The name in locale encoding. */
247 name = (name_converted != NULL ? name_converted :
248 name_converted_translit != NULL ? name_converted_translit :
249 name_ascii);
250
251 /* See whether we have a translation. Some translators have not understood
252 that they should use the UTF-8 form of the name, if possible. So if the
253 translator provided a no-op translation, we ignore it. */
254 if (strcmp (translation, name_ascii) != 0)
255 {
256 /* See whether the translation contains the original name. */
257 if (mbsstr_trimmed_wordbounded (translation, name_ascii)
258 || (name_converted != NULL
259 && mbsstr_trimmed_wordbounded (translation, name_converted))
260 || (name_converted_translit != NULL
261 && mbsstr_trimmed_wordbounded (translation, name_converted_translit)))
262 {
263 if (alloc_name_converted != NULL)
264 free (alloc_name_converted);
265 if (alloc_name_converted_translit != NULL)
266 free (alloc_name_converted_translit);
267 return translation;
268 }
269 else
270 {
271 /* Return "TRANSLATION (NAME)". */
272 char *result =
273 XNMALLOC (strlen (translation) + 2 + strlen (name) + 1 + 1, char);
274
275 sprintf (result, "%s (%s)", translation, name);
276
277 if (alloc_name_converted != NULL)
278 free (alloc_name_converted);
279 if (alloc_name_converted_translit != NULL)
280 free (alloc_name_converted_translit);
281 return result;
282 }
283 }
284 else
285 {
286 if (alloc_name_converted != NULL && alloc_name_converted != name)
287 free (alloc_name_converted);
288 if (alloc_name_converted_translit != NULL
289 && alloc_name_converted_translit != name)
290 free (alloc_name_converted_translit);
291 return name;
292 }
293 }
294
#ifdef TEST1
# include <locale.h>
/* Test driver for mbsstr_trimmed_wordbounded.
   Usage: PROGRAM STRING SUB
   Prints "found" if STRING contains the trimmed SUB at word boundaries.
   Returns 0 after performing the test, 1 if arguments are missing.  */
int
main (int argc, char *argv[])
{
  /* Both operands are required; without this check argv[1] or argv[2]
     could be a null pointer.  */
  if (argc < 3)
    {
      fprintf (stderr, "Usage: %s STRING SUB\n",
               argc > 0 && argv[0] != NULL ? argv[0] : "propername");
      return 1;
    }

  setlocale (LC_ALL, "");
  if (mbsstr_trimmed_wordbounded (argv[1], argv[2]))
    printf ("found\n");
  return 0;
}
#endif
306
#ifdef TEST2
# include <locale.h>
# include <stdio.h>
/* Test driver for proper_name_utf8: prints the localized form of a sample
   name, using the locale selected by the environment.  */
int
main (int argc, char *argv[])
{
  const char *localized;

  setlocale (LC_ALL, "");
  localized = proper_name_utf8 ("Franc,ois Pinard", "Fran\303\247ois Pinard");
  printf ("%s\n", localized);
  return 0;
}
#endif