1 /*
2 * encodings.c: locale and encoding handling for man
3 *
4 * Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011
5 * Colin Watson.
6 *
7 * This file is part of man-db.
8 *
9 * man-db is free software; you can redistribute it and/or modify it
10 * under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * man-db is distributed in the hope that it will be useful, but
15 * WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with man-db; if not, write to the Free Software Foundation,
21 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
22 */
23
24 #ifdef HAVE_CONFIG_H
25 # include "config.h"
26 #endif /* HAVE_CONFIG_H */
27
28 #include <stdbool.h>
29 #include <stdio.h>
30 #include <string.h>
31 #include <stdlib.h>
32 #include <unistd.h>
33 #include <locale.h>
34 #include <ctype.h>
35
36 #include "attribute.h"
37 #include "gettext.h"
38 #include "localcharset.h"
39 #include "xalloc.h"
40 #include "xstrndup.h"
41
42 #include "manconfig.h"
43
44 #include "debug.h"
45 #include "encodings.h"
46 #include "pathsearch.h"
47
48
49 /* Due to historical limitations in groff (which may be removed in the
50 * future), there is no mechanism for a man page to specify its own
51 * encoding. This means that each national language directory needs to carry
52 * with it information about its encoding, and each groff device needs to
53 * have a default encoding associated with it. Out of the box, groff
54 * formally allows only ISO-8859-1 on input; however, patches originating
55 * with Debian and imported by many other GNU/Linux distributions change
56 * this somewhat.
57 *
58 * Eventually, groff will support proper Unicode input, and much of this
59 * horror can go away.
60 *
61 * Do *not* confuse source encoding with groff encoding. The encoding
62 * specified in this table is the encoding in which the source man pages in
63 * each language directory are expected to be written. The groff encoding is
64 * determined by the selected groff device and sometimes also by the user's
65 * locale.
66 *
67 * The standard output encoding is the encoding assumed for cat pages for
68 * each language directory. It must *not* be used to discover the actual
69 * output encoding displayed to the user; that is determined by the locale.
70 * TODO: it would be useful to be able to change the standard output
71 * encoding in the configuration file.
72 *
73 * This table is expected to change over time, particularly as man pages
74 * begin to move towards UTF-8. Feel free to patch this for your
75 * distribution; send me updates for languages I've missed.
76 *
77 * Explicit encodings in the directory name (e.g. de_DE.UTF-8) override this
78 * table.
79 */
/* One row of the language-directory table: the encoding in which source man
 * pages found under a given language directory are assumed to be written.
 */
struct directory_entry {
	const char *lang_dir;		/* prefix of the language name */
	const char *source_encoding;	/* assumed encoding of its pages */
};

/* Rows are tried from the top and match when lang_dir is a prefix of the
 * language name, so a longer name must come before any shorter name that is
 * a prefix of it. The table ends with a row whose lang_dir is NULL. It is
 * only ever read, hence const.
 */
static const struct directory_entry directory_table[] = {
	{ "C",		"ISO-8859-1"	}, /* English */
	{ "POSIX",	"ISO-8859-1"	}, /* English */
	{ "da",		"ISO-8859-1"	}, /* Danish */
	{ "de",		"ISO-8859-1"	}, /* German */
	{ "en",		"ISO-8859-1"	}, /* English */
	{ "es",		"ISO-8859-1"	}, /* Spanish */
	{ "et",		"ISO-8859-1"	}, /* Estonian */
	{ "fi",		"ISO-8859-1"	}, /* Finnish */
	{ "fr",		"ISO-8859-1"	}, /* French */
	{ "ga",		"ISO-8859-1"	}, /* Irish */
	{ "gl",		"ISO-8859-1"	}, /* Galician */
	{ "id",		"ISO-8859-1"	}, /* Indonesian */
	{ "is",		"ISO-8859-1"	}, /* Icelandic */
	{ "it",		"ISO-8859-1"	}, /* Italian */
	{ "nb",		"ISO-8859-1"	}, /* Norwegian Bokmål */
	{ "nl",		"ISO-8859-1"	}, /* Dutch */
	{ "nn",		"ISO-8859-1"	}, /* Norwegian Nynorsk */
	{ "no",		"ISO-8859-1"	}, /* Norwegian */
	{ "pt",		"ISO-8859-1"	}, /* Portuguese */
	{ "sv",		"ISO-8859-1"	}, /* Swedish */

#ifdef MULTIBYTE_GROFF
	/* These languages require a patched version of groff with the
	 * ascii8 and nippon devices.
	 */
	{ "be",		"CP1251"	}, /* Belarusian */
	{ "bg",		"CP1251"	}, /* Bulgarian */
	{ "cs",		"ISO-8859-2"	}, /* Czech */
	{ "el",		"ISO-8859-7"	}, /* Greek */
	{ "hr",		"ISO-8859-2"	}, /* Croatian */
	{ "hu",		"ISO-8859-2"	}, /* Hungarian */
	{ "ja",		"EUC-JP"	}, /* Japanese */
	{ "ko",		"EUC-KR"	}, /* Korean */
	{ "lt",		"ISO-8859-13"	}, /* Lithuanian */
	{ "lv",		"ISO-8859-13"	}, /* Latvian */
	{ "mk",		"ISO-8859-5"	}, /* Macedonian */
	{ "pl",		"ISO-8859-2"	}, /* Polish */
	{ "ro",		"ISO-8859-2"	}, /* Romanian */
	{ "ru",		"KOI8-R"	}, /* Russian */
	{ "sk",		"ISO-8859-2"	}, /* Slovak */
	{ "sl",		"ISO-8859-2"	}, /* Slovenian */
	/* sr@latin must precede sr, due to top-down left-substring matching later */
	{ "sr@latin",	"ISO-8859-2"	}, /* Serbian Latin */
	{ "sr",		"ISO-8859-5"	}, /* Serbian */
	{ "tr",		"ISO-8859-9"	}, /* Turkish */
	{ "uk",		"KOI8-U"	}, /* Ukrainian */
	{ "vi",		"TCVN5712-1"	}, /* Vietnamese */
	{ "zh_CN",	"GBK"		}, /* Simplified Chinese */
	{ "zh_SG",	"GBK"		}, /* Simplified Chinese, Singapore */
	{ "zh_HK",	"BIG5HKSCS"	}, /* Traditional Chinese, Hong Kong */
	{ "zh_TW",	"BIG5"		}, /* Traditional Chinese */
#endif /* MULTIBYTE_GROFF */

	{ NULL,		NULL		}
};

/* Source encoding assumed when no directory_table row matches. */
static const char fallback_source_encoding[] = "ISO-8859-1";
143
144 /* Unfortunately, there is no portable way to inspect iconv's internal table
145 * of character set aliases. We copy the most interesting ones here so that
146 * we can deal with them if they appear in directory names. Note that all
147 * names will be converted to upper case before looking them up in this
148 * table.
149 */
/* One alias for a character set, together with the canonical name that
 * get_canonical_charset_name substitutes for it.
 */
struct charset_alias_entry {
	const char *alias;		/* upper-case alias to recognise */
	const char *canonical_name;	/* name returned in its place */
};

/* Aliases are stored in upper case because names are upper-cased before
 * being looked up here. The table ends with a row whose alias is NULL. It
 * is only ever read, hence const.
 */
static const struct charset_alias_entry charset_alias_table[] = {
	/* The FHS is silly and requires numeric-only aliases that iconv
	 * does not support.
	 */
	{ "88591",	"ISO-8859-1"	},
	{ "88592",	"ISO-8859-2"	},
	{ "88593",	"ISO-8859-3"	},
	{ "88594",	"ISO-8859-4"	},
	{ "88595",	"ISO-8859-5"	},
	{ "88596",	"ISO-8859-6"	},
	{ "88597",	"ISO-8859-7"	},
	{ "88598",	"ISO-8859-8"	},
	{ "88599",	"ISO-8859-9"	},
	{ "885910",	"ISO-8859-10"	},
	{ "885911",	"ISO-8859-11"	},
	{ "885913",	"ISO-8859-13"	},
	{ "885914",	"ISO-8859-14"	},
	{ "885915",	"ISO-8859-15"	},
	{ "885916",	"ISO-8859-16"	},

	{ "ASCII",	"ANSI_X3.4-1968"	},
	{ "BIG-5",	"BIG5"			},
	{ "BIG5-HKSCS",	"BIG5HKSCS"		},
	{ "EUCCN",	"EUC-CN"		},
	{ "EUCJP",	"EUC-JP"		},
	{ "EUCKR",	"EUC-KR"		},
	{ "EUCTW",	"EUC-TW"		},
	{ "GB2312",	"EUC-CN"		},
	{ "ISO8859-1",	"ISO-8859-1"		},
	{ "ISO8859-2",	"ISO-8859-2"		},
	{ "ISO8859-3",	"ISO-8859-3"		},
	{ "ISO8859-4",	"ISO-8859-4"		},
	{ "ISO8859-5",	"ISO-8859-5"		},
	{ "ISO8859-6",	"ISO-8859-6"		},
	{ "ISO8859-7",	"ISO-8859-7"		},
	{ "ISO8859-8",	"ISO-8859-8"		},
	{ "ISO8859-9",	"ISO-8859-9"		},
	{ "ISO8859-10",	"ISO-8859-10"		},
	{ "ISO8859-11",	"ISO-8859-11"		},
	{ "ISO8859-13",	"ISO-8859-13"		},
	{ "ISO8859-14",	"ISO-8859-14"		},
	{ "ISO8859-15",	"ISO-8859-15"		},
	{ "ISO8859-16",	"ISO-8859-16"		},
	{ "KOI8R",	"KOI8-R"		},
	{ "KOI8U",	"KOI8-U"		},
	{ "UJIS",	"EUC-JP"		},
	{ "US-ASCII",	"ANSI_X3.4-1968"	},
	{ "UTF8",	"UTF-8"			},

	{ NULL,		NULL			}
};
206
207 /* The default groff terminal output device to be used is determined based
208 * on locale_charset (), which returns the character set used by the current
209 * locale.
210 */
/* Associates a locale character set with the groff terminal device that is
 * used for it by default.
 */
struct charset_entry {
	const char *charset_from_locale;	/* locale character set name */
	const char *default_device;		/* groff device to select */
};

/* Terminated by a row whose charset_from_locale is NULL. Only ever read,
 * hence const.
 */
static const struct charset_entry charset_table[] = {
	{ "ANSI_X3.4-1968",	"ascii"		},
#ifndef HEIRLOOM_NROFF
	{ "ISO-8859-1",		"latin1"	},
#endif /* HEIRLOOM_NROFF */
	{ "UTF-8",		"utf8"		},

#ifndef HEIRLOOM_NROFF
# ifdef MULTIBYTE_GROFF
	{ "BIG5",		"nippon"	},
	{ "BIG5HKSCS",		"nippon"	},
	{ "EUC-CN",		"nippon"	},
	{ "EUC-JP",		"nippon"	},
	{ "EUC-TW",		"nippon"	},
	{ "GBK",		"nippon"	},
# else /* !MULTIBYTE_GROFF */
	/* If we have a smarter version of groff, this is better dealt with
	 * using either ascii8 (Debian multibyte patch) or preconv (as of
	 * groff 1.20). This is a not-quite-right stopgap in case we have
	 * neither.
	 */
	{ "ISO-8859-15",	"latin1"	},
# endif /* MULTIBYTE_GROFF */
#endif /* HEIRLOOM_NROFF */

	{ NULL,			NULL		}
};

/* Device used when the locale character set is unknown or has no compatible
 * row in charset_table. The pointer itself is const, like the other
 * fallback_* constants in this file.
 */
static const char *const fallback_default_device =
#ifdef MULTIBYTE_GROFF
	"ascii8"
#else /* !MULTIBYTE_GROFF */
	"ascii"
#endif /* MULTIBYTE_GROFF */
	;
251
252 /* The encoding used for the text passed to groff is a function of the
253 * selected groff device. Traditional devices expect ISO-8859-1 on input
254 * (yes, even the utf8 device); devices added in the Debian multibyte patch
255 * expect other encodings. The ascii8 device passes top-bit-set characters
256 * straight through so is (probably ...) encoding-agnostic. If this encoding
257 * does not match the source encoding, an iconv pipe is used (if available)
258 * to perform recoding.
259 */
/* Encodings associated with one *roff output device. A NULL encoding means
 * the device has no fixed encoding of its own: get_roff_encoding then falls
 * back to the page's source encoding, and get_output_encoding returns NULL.
 */
struct device_entry {
	const char *roff_device;	/* device name */
	const char *roff_encoding;	/* input encoding expected, or NULL */
	const char *output_encoding;	/* encoding produced, or NULL */
};

/* Terminated by a row whose roff_device is NULL. Only ever read, hence
 * const.
 */
static const struct device_entry device_table[] = {
	/* nroff devices */
	{ "ascii",	"ANSI_X3.4-1968",	"ANSI_X3.4-1968"	},
	{ "latin1",	"ISO-8859-1",		"ISO-8859-1"		},
	{ "utf8",	"ISO-8859-1",		"UTF-8"			},

#ifdef MULTIBYTE_GROFF
	{ "ascii8",	NULL,			NULL			},
	{ "nippon",	NULL,			NULL			},
#endif /* MULTIBYTE_GROFF */

#ifdef HEIRLOOM_NROFF
	/* Not strictly accurate, but we only use this in UTF-8 locales. */
	{ "locale",	"UTF-8",		"UTF-8"			},
#endif /* HEIRLOOM_NROFF */

	/* troff devices */
	{ "X75",	NULL,			NULL			},
	{ "X75-12",	NULL,			NULL			},
	{ "X100",	NULL,			NULL			},
	{ "X100-12",	NULL,			NULL			},
	{ "dvi",	NULL,			NULL			},
	{ "html",	NULL,			NULL			},
	{ "lbp",	NULL,			NULL			},
	{ "lj4",	NULL,			NULL			},
	{ "ps",		NULL,			NULL			},

	{ NULL,		NULL,			NULL			}
};

/* Input encoding assumed for a device that is not listed in device_table. */
static const char fallback_roff_encoding[] = "ISO-8859-1";
297
298 /* Setting less_charset to iso8859 tells the less pager that characters
299 * between 0xA0 and 0xFF are displayable, not that its input is encoded in
300 * ISO-8859-*. TODO: Perhaps using LESSCHARDEF would be better.
301 *
302 * Character set names compatible only with jless go in jless_charset.
303 */
/* Pager character set names to use for one locale character set. */
struct less_charset_entry {
	const char *charset_from_locale;	/* locale character set name */
	const char *less_charset;		/* name understood by less */
	const char *jless_charset;		/* jless-only name, or NULL */
};

/* Terminated by a row whose charset_from_locale is NULL. Only ever read,
 * hence const.
 */
static const struct less_charset_entry less_charset_table[] = {
	{ "ANSI_X3.4-1968",	"ascii",	NULL			},
	{ "ISO-8859-1",		"iso8859",	NULL			},
	{ "UTF-8",		"utf-8",	NULL			},

#ifdef MULTIBYTE_GROFF
	{ "CP1251",		"windows",	NULL			},
	{ "EUC-JP",		"iso8859",	"japanese-ujis"		},
	{ "KOI8-R",		"koi8-r",	NULL			},
	/* close enough? */
	{ "KOI8-U",		"koi8-r",	NULL			},
#endif /* MULTIBYTE_GROFF */

	{ NULL,			NULL,		NULL			}
};

/* less character set used when the locale character set has no row in
 * less_charset_table.
 */
static const char fallback_less_charset[] = "iso8859";
327
/* Cached answer for get_groff_preconv: NULL until the first lookup has been
 * made, "" once we know that no helper is installed, and otherwise the name
 * of the helper that was found.
 */
const char *groff_preconv = NULL;

/* Report which groff "preconv" helper is available, if any.
 *
 * Returns "gpreconv" or "preconv" (whichever is found first, in that
 * order), or NULL if neither can be found. The search is made at most once;
 * the outcome is remembered in groff_preconv.
 */
const char *get_groff_preconv (void)
{
	if (!groff_preconv) {
		/* First call: look for the helper and cache the outcome,
		 * using "" to record a negative result.
		 */
		if (pathsearch_executable ("gpreconv"))
			groff_preconv = "gpreconv";
		else if (pathsearch_executable ("preconv"))
			groff_preconv = "preconv";
		else
			groff_preconv = "";
	}

	return *groff_preconv ? groff_preconv : NULL;
}
354
355 /* Return the assumed encoding of the source man page, based on the
356 * directory in which it was found. The caller should attempt to recode from
357 * this to whatever encoding is expected by groff.
358 *
359 * The caller should free the returned string when it is finished with it.
360 */
361 char * ATTRIBUTE_MALLOC get_page_encoding (const char *lang)
362 {
363 const struct directory_entry *entry;
364 const char *dot;
365
366 if (!lang || !*lang) {
367 /* Guess based on the locale. */
368 lang = setlocale (LC_MESSAGES, NULL);
369 if (!lang)
370 return xstrdup (fallback_source_encoding);
371 }
372
373 dot = strchr (lang, '.');
374 if (dot) {
375 /* The FHS has the worst specification of what's supposed to
376 * go after the dot here that I've ever seen. To quote from
377 * version 2.1:
378 *
379 * "It is recommended that this be a numeric representation
380 * if possible (ISO standards, especially), not include
381 * additional punctuation symbols, and that any letters be
382 * in lowercase."
383 *
384 * Any sane standard would use directory names like
385 * de_DE.ISO-8859-1; the examples in the FHS recommend
386 * de_DE.88591 instead. Considering that there is no other
387 * conceivable use for encodings in directory names other
388 * than to pass them to iconv or similar, this is quite
389 * startlingly useless.
390 *
391 * While we now support this thanks to
392 * get_canonical_charset_name, the FHS specification is
393 * obviously wrong and I plan to petition to have it
394 * changed. I recommend ignoring this part of the FHS.
395 */
396 char *dir_encoding =
397 xstrndup (dot + 1, strcspn (dot + 1, ",@"));
398 char *canonical_dir_encoding =
399 xstrdup (get_canonical_charset_name (dir_encoding));
400 free (dir_encoding);
401 return canonical_dir_encoding;
402 }
403
404 for (entry = directory_table; entry->lang_dir; ++entry)
405 if (STRNEQ (entry->lang_dir, lang, strlen (entry->lang_dir)))
406 return xstrdup (entry->source_encoding);
407
408 return xstrdup (fallback_source_encoding);
409 }
410
411 /* Return the canonical encoding for source man pages in the specified
412 * language. This ignores any encoding specification in the language
413 * directory name. The source encoding should be used as a basis for
414 * determining the correct roff device to use: that is, the caller should
415 * behave as if it is recoding from the page encoding to the source encoding
416 * first, although in practice it should recode directly from the page
417 * encoding to the roff encoding.
418 *
419 * You should normally only call this function if the page encoding is
420 * UTF-8, in which case older versions of groff that lack preconv need to
421 * have the page recoded to some legacy encoding). If the page is in a
422 * legacy encoding, then attempting to recode from that to some other legacy
423 * encoding will probably do more harm than good.
424 *
425 * Here are a few concrete examples of why these distinctions are important:
426 *
427 * /usr/share/man/en_GB.UTF-8, locale C
428 * page encoding = UTF-8
429 * source encoding = ISO-8859-1
430 * roff encoding = ISO-8859-1
431 * output encoding = UTF-8
432 * UTF-8 -> iconv -> ISO-8859-1 -> groff -Tascii -> ANSI_X3.4-1968
433 *
434 * /usr/share/man/pl_PL.UTF-8, locale pl_PL.UTF-8
435 * page encoding = UTF-8
436 * source encoding = ISO-8859-2
437 * roff encoding = ISO-8859-2
438 * output encoding = ISO-8859-2
439 * UTF-8 -> iconv -> ISO-8859-2 -> groff -Tascii8
440 * -> ISO-8859-2 -> iconv -> UTF-8
441 *
442 * /usr/share/man/ja_JP.EUC-JP, locale ja_JP.UTF-8
443 * page encoding = EUC-JP
444 * source encoding = EUC-JP
445 * roff encoding = UTF-8
446 * output encoding = UTF-8
447 * EUC-JP -> iconv -> UTF-8 -> groff -Tutf8 -> UTF-8
448 *
449 * /usr/share/man/en_GB.ISO-8859-15, locale en_GB.UTF-8
450 * page encoding = ISO-8859-15
451 * source encoding = ISO-8859-15
452 * roff encoding = ISO-8859-15
453 * output encoding = ISO-8859-15
454 * ISO-8859-15 -> groff -Tascii8 -> ISO-8859-15 -> iconv -> UTF-8
455 */
456 const char *get_source_encoding (const char *lang)
457 {
458 const struct directory_entry *entry;
459
460 if (!lang || !*lang) {
461 /* Guess based on the locale. */
462 lang = setlocale (LC_MESSAGES, NULL);
463 if (!lang)
464 return fallback_source_encoding;
465 }
466
467 for (entry = directory_table; entry->lang_dir; ++entry)
468 if (STRNEQ (entry->lang_dir, lang, strlen (entry->lang_dir)))
469 return entry->source_encoding;
470
471 return fallback_source_encoding;
472 }
473
474 const char * ATTRIBUTE_NONNULL ((1)) ATTRIBUTE_RETURNS_NONNULL
475 get_canonical_charset_name (const char *charset)
476 {
477 const struct charset_alias_entry *entry;
478 char *charset_upper = xstrdup (charset);
479 char *p;
480
481 for (p = charset_upper; *p; ++p)
482 *p = CTYPE (toupper, *p);
483
484 for (entry = charset_alias_table; entry->alias; ++entry)
485 if (STREQ (entry->alias, charset_upper)) {
486 free (charset_upper);
487 return entry->canonical_name;
488 }
489
490 free (charset_upper);
491 return charset;
492 }
493
494 /* Return the current locale's character set. */
495 const char * ATTRIBUTE_RETURNS_NONNULL get_locale_charset (void)
496 {
497 const char *charset;
498 char *saved_locale;
499
500 /* We need to modify LC_CTYPE temporarily in order to look at the
501 * codeset, so save it first.
502 */
503 saved_locale = setlocale (LC_CTYPE, NULL);
504 if (saved_locale)
505 saved_locale = xstrdup (saved_locale);
506
507 setlocale (LC_CTYPE, "");
508
509 charset = locale_charset ();
510
511 /* Restore LC_CTYPE to its value on entry to this function. */
512 setlocale (LC_CTYPE, saved_locale);
513 free (saved_locale);
514
515 if (!charset || !*charset)
516 charset = "ANSI_X3.4-1968";
517 return get_canonical_charset_name (charset);
518 }
519
/* Find a locale with this character set. This is a non-portable operation,
 * but required to make col(1) work correctly with -E.
 *
 * charset may be given under any alias known to
 * get_canonical_charset_name; all comparisons are made on canonical names.
 *
 * Returns NULL if the current locale already uses this character set (so
 * nothing needs to be set) or if no installed locale with it can be found.
 * Otherwise returns the name of a usable locale in newly allocated memory;
 * the caller should free it when it is finished with it.
 *
 * LC_CTYPE is changed while candidates are being probed and is restored to
 * its value on entry before returning.
 */
char *find_charset_locale (const char *charset)
{
	const char *canonical_charset = get_canonical_charset_name (charset);
	const char supported_path[] = "/usr/share/i18n/SUPPORTED";
	char *saved_locale;
	FILE *supported;
	char *line = NULL;
	size_t n = 0;
	char *locale = NULL;

	/* get_locale_charset returns a canonical name, so it must be
	 * compared against the canonical form of the request; comparing the
	 * raw argument would miss aliases such as "utf8".
	 */
	if (STREQ (canonical_charset, get_locale_charset ()))
		return NULL;

	saved_locale = setlocale (LC_CTYPE, NULL);
	if (saved_locale)
		saved_locale = xstrdup (saved_locale);

	/* Each line is expected to read "<locale> <charset>". The getline
	 * buffer is reused from one line to the next and released at "out".
	 */
	supported = fopen (supported_path, "r");
	while (supported && getline (&line, &n, supported) >= 0) {
		char *space = strchr (line, ' ');
		char *encoding;

		if (!space)
			continue;

		encoding = space + 1;
		encoding[strcspn (encoding, "\n")] = '\0';
		if (!STREQ (canonical_charset,
			    get_canonical_charset_name (encoding)))
			continue;

		locale = xstrndup (line, space - line);
		/* Is this locale actually installed? */
		if (setlocale (LC_CTYPE, locale))
			goto out;
		free (locale);
		locale = NULL;
	}

	/* Nothing usable was listed. For UTF-8, try a couple of well-known
	 * locale names before giving up.
	 */
	if (STRNEQ (canonical_charset, "UTF-8", 5)) {
		static const char *const utf8_locales[] = {
			"C.UTF-8",
			"en_US.UTF-8"
		};
		size_t i;

		for (i = 0; i < sizeof utf8_locales / sizeof utf8_locales[0];
		     ++i) {
			if (setlocale (LC_CTYPE, utf8_locales[i])) {
				locale = xstrdup (utf8_locales[i]);
				break;
			}
		}
	}

out:
	free (line);
	/* Undo whatever the probing above did to LC_CTYPE. */
	setlocale (LC_CTYPE, saved_locale);
	free (saved_locale);
	if (supported)
		fclose (supported);
	return locale;
}
590
591 /* Can we take this input encoding and produce this output encoding, perhaps
592 * with the help of some iconv pipes? */
593 static bool ATTRIBUTE_PURE compatible_encodings (const char *input,
594 const char *output)
595 {
596 if (STREQ (input, output))
597 return true;
598
599 /* If the input is ASCII, recoding should be easy. Try it. */
600 if (STREQ (input, "ANSI_X3.4-1968"))
601 return true;
602
603 /* If the input is UTF-8, it's either a simple recoding of whatever
604 * we want or else it probably won't work at all no matter what we
605 * do. We might as well try it for now.
606 */
607 if (STREQ (input, "UTF-8"))
608 return true;
609
610 /* If the output is ASCII, this is probably because the caller
611 * explicitly asked for it, so we have little choice but to try.
612 */
613 if (STREQ (output, "ANSI_X3.4-1968"))
614 return true;
615
616 #ifdef MULTIBYTE_GROFF
617 /* Special case for some CJK UTF-8 locales, which take UTF-8 input
618 * recoded from EUC-JP (etc.) and produce UTF-8 output. This is
619 * rather filthy.
620 */
621 if ((STREQ (input, "BIG5") || STREQ (input, "BIG5HKSCS") ||
622 STREQ (input, "EUC-JP") ||
623 STREQ (input, "EUC-CN") || STREQ (input, "GBK") ||
624 STREQ (input, "EUC-KR") ||
625 STREQ (input, "EUC-TW")) &&
626 STREQ (output, "UTF-8"))
627 return true;
628 #endif /* MULTIBYTE_GROFF */
629
630 return false;
631 }
632
633 /* Return the default groff device for the given character set. This may be
634 * overridden by the user. The page's source encoding is needed to ensure
635 * that the device is compatible: consider ru_RU.UTF-8, which needs ascii8
636 * and a trailing iconv pipe to recode to UTF-8.
637 *
638 * All this encoding compatibility stuff feels like a slightly nasty hack,
639 * but I haven't yet come up with a cleaner way to do it.
640 */
641 const char *get_default_device (const char *charset_from_locale,
642 const char *source_encoding)
643 {
644 const struct charset_entry *entry;
645
646 if (get_groff_preconv ()) {
647 /* ASCII is a special case, and the only way we can get
648 * things like bullet marks to come out right is by using
649 * the ascii device. People using such a basic locale
650 * probably don't want anything fancy anyway.
651 */
652 if (charset_from_locale &&
653 STREQ (charset_from_locale, "ANSI_X3.4-1968"))
654 return "ascii";
655 else
656 return "utf8";
657 }
658
659 if (!charset_from_locale)
660 return fallback_default_device;
661
662 for (entry = charset_table; entry->charset_from_locale; ++entry) {
663 if (STREQ (entry->charset_from_locale, charset_from_locale)) {
664 const char *roff_encoding =
665 get_roff_encoding (entry->default_device,
666 source_encoding);
667 if (compatible_encodings (source_encoding,
668 roff_encoding))
669 return entry->default_device;
670 }
671 }
672
673 return fallback_default_device;
674 }
675
676 /* Is this a known *roff device name? */
677 bool ATTRIBUTE_PURE is_roff_device (const char *device)
678 {
679 const struct device_entry *entry;
680
681 for (entry = device_table; entry->roff_device; ++entry) {
682 if (STREQ (entry->roff_device, device))
683 return true;
684 }
685
686 return false;
687 }
688
689 /* Find the input encoding expected by groff, and set the LESSCHARSET
690 * environment variable appropriately.
691 */
692 const char *get_roff_encoding (const char *device, const char *source_encoding)
693 {
694 const struct device_entry *entry;
695 bool found = false;
696 const char *roff_encoding = NULL;
697
698 if (device) {
699 for (entry = device_table; entry->roff_device; ++entry) {
700 if (STREQ (entry->roff_device, device)) {
701 found = true;
702 roff_encoding = entry->roff_encoding;
703 break;
704 }
705 }
706 }
707
708 if (!found)
709 roff_encoding = fallback_roff_encoding;
710
711 #ifdef MULTIBYTE_GROFF
712 /* An ugly special case is needed here. The utf8 device normally
713 * takes ISO-8859-1 input. However, with the multibyte patch, when
714 * recoding from CJK character sets it takes UTF-8 input instead.
715 * This is evil, but there's not much that can be done about it
716 * apart from waiting for groff 2.0.
717 */
718 if (device && STREQ (device, "utf8") && !get_groff_preconv () &&
719 STREQ (get_locale_charset (), "UTF-8")) {
720 const char *ctype = setlocale (LC_CTYPE, NULL);
721 if (STRNEQ (ctype, "ja_JP", 5) ||
722 STRNEQ (ctype, "ko_KR", 5) ||
723 STRNEQ (ctype, "zh_CN", 5) ||
724 STRNEQ (ctype, "zh_HK", 5) ||
725 STRNEQ (ctype, "zh_SG", 5) ||
726 STRNEQ (ctype, "zh_TW", 5))
727 roff_encoding = "UTF-8";
728 }
729 #endif /* MULTIBYTE_GROFF */
730
731 return roff_encoding ? roff_encoding : source_encoding;
732 }
733
734 /* Find the output encoding that this device will produce, or NULL if it
735 * will simply pass through the input encoding.
736 */
737 const char * ATTRIBUTE_PURE get_output_encoding (const char *device)
738 {
739 const struct device_entry *entry;
740
741 for (entry = device_table; entry->roff_device; ++entry)
742 if (STREQ (entry->roff_device, device))
743 return entry->output_encoding;
744
745 return NULL;
746 }
747
748 /* Return the value of LESSCHARSET appropriate for this locale. */
749 const char * ATTRIBUTE_PURE get_less_charset (const char *charset_from_locale)
750 {
751 const struct less_charset_entry *entry;
752
753 if (charset_from_locale) {
754 for (entry = less_charset_table; entry->charset_from_locale;
755 ++entry)
756 if (STREQ (entry->charset_from_locale,
757 charset_from_locale))
758 return entry->less_charset;
759 }
760
761 return fallback_less_charset;
762 }
763
764 /* Return the value of JLESSCHARSET appropriate for this locale. May return
765 * NULL.
766 */
767 const char * ATTRIBUTE_PURE get_jless_charset (const char *charset_from_locale)
768 {
769 const struct less_charset_entry *entry;
770
771 if (charset_from_locale) {
772 for (entry = less_charset_table; entry->charset_from_locale;
773 ++entry)
774 if (STREQ (entry->charset_from_locale,
775 charset_from_locale))
776 return entry->jless_charset;
777 }
778
779 return NULL;
780 }