1  /*
       2   * encodings.c: locale and encoding handling for man
       3   *
       4   * Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011
       5   *               Colin Watson.
       6   *
       7   * This file is part of man-db.
       8   *
       9   * man-db is free software; you can redistribute it and/or modify it
      10   * under the terms of the GNU General Public License as published by
      11   * the Free Software Foundation; either version 2 of the License, or
      12   * (at your option) any later version.
      13   *
      14   * man-db is distributed in the hope that it will be useful, but
      15   * WITHOUT ANY WARRANTY; without even the implied warranty of
      16   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      17   * GNU General Public License for more details.
      18   *
      19   * You should have received a copy of the GNU General Public License
      20   * along with man-db; if not, write to the Free Software Foundation,
      21   * Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
      22   */
      23  
      24  #ifdef HAVE_CONFIG_H
      25  #  include "config.h"
      26  #endif /* HAVE_CONFIG_H */
      27  
      28  #include <stdbool.h>
      29  #include <stdio.h>
      30  #include <string.h>
      31  #include <stdlib.h>
      32  #include <unistd.h>
      33  #include <locale.h>
      34  #include <ctype.h>
      35  
      36  #include "attribute.h"
      37  #include "gettext.h"
      38  #include "localcharset.h"
      39  #include "xalloc.h"
      40  #include "xstrndup.h"
      41  
      42  #include "manconfig.h"
      43  
      44  #include "debug.h"
      45  #include "encodings.h"
      46  #include "pathsearch.h"
      47  
      48  
      49  /* Due to historical limitations in groff (which may be removed in the
      50   * future), there is no mechanism for a man page to specify its own
      51   * encoding. This means that each national language directory needs to carry
      52   * with it information about its encoding, and each groff device needs to
      53   * have a default encoding associated with it. Out of the box, groff
      54   * formally allows only ISO-8859-1 on input; however, patches originating
      55   * with Debian and imported by many other GNU/Linux distributions change
      56   * this somewhat.
      57   *
      58   * Eventually, groff will support proper Unicode input, and much of this
      59   * horror can go away.
      60   *
      61   * Do *not* confuse source encoding with groff encoding. The encoding
      62   * specified in this table is the encoding in which the source man pages in
      63   * each language directory are expected to be written. The groff encoding is
      64   * determined by the selected groff device and sometimes also by the user's
      65   * locale.
      66   *
      67   * The standard output encoding is the encoding assumed for cat pages for
      68   * each language directory. It must *not* be used to discover the actual
      69   * output encoding displayed to the user; that is determined by the locale.
      70   * TODO: it would be useful to be able to change the standard output
      71   * encoding in the configuration file.
      72   *
      73   * This table is expected to change over time, particularly as man pages
      74   * begin to move towards UTF-8. Feel free to patch this for your
      75   * distribution; send me updates for languages I've missed.
      76   *
      77   * Explicit encodings in the directory name (e.g. de_DE.UTF-8) override this
      78   * table.
      79   */
      80  struct directory_entry {
      81  	const char *lang_dir;
      82  	const char *source_encoding;
      83  };
      84  
      85  static struct directory_entry directory_table[] = {
      86  	{ "C",		"ISO-8859-1"	}, /* English */
      87  	{ "POSIX",	"ISO-8859-1"	}, /* English */
      88  	{ "da",		"ISO-8859-1"	}, /* Danish */
      89  	{ "de",		"ISO-8859-1"	}, /* German */
      90  	{ "en",		"ISO-8859-1"	}, /* English */
      91  	{ "es",		"ISO-8859-1"	}, /* Spanish */
      92  	{ "et",		"ISO-8859-1"	}, /* Estonian */
      93  	{ "fi",		"ISO-8859-1"	}, /* Finnish */
      94  	{ "fr",		"ISO-8859-1"	}, /* French */
      95  	{ "ga",		"ISO-8859-1"	}, /* Irish */
      96  	{ "gl",		"ISO-8859-1"	}, /* Galician */
      97  	{ "id",		"ISO-8859-1"	}, /* Indonesian */
      98  	{ "is",		"ISO-8859-1"	}, /* Icelandic */
      99  	{ "it",		"ISO-8859-1"	}, /* Italian */
     100  	{ "nb",		"ISO-8859-1"	}, /* Norwegian Bokmål */
     101  	{ "nl",		"ISO-8859-1"	}, /* Dutch */
     102  	{ "nn",		"ISO-8859-1"	}, /* Norwegian Nynorsk */
     103  	{ "no",		"ISO-8859-1"	}, /* Norwegian */
     104  	{ "pt",		"ISO-8859-1"	}, /* Portuguese */
     105  	{ "sv",		"ISO-8859-1"	}, /* Swedish */
     106  
     107  #ifdef MULTIBYTE_GROFF
     108  	/* These languages require a patched version of groff with the
     109  	 * ascii8 and nippon devices.
     110  	 */
     111  	{ "be",		"CP1251"	}, /* Belarusian */
     112  	{ "bg",		"CP1251"	}, /* Bulgarian */
     113  	{ "cs",		"ISO-8859-2"	}, /* Czech */
     114  	{ "el",		"ISO-8859-7"	}, /* Greek */
     115  	{ "hr",		"ISO-8859-2"	}, /* Croatian */
     116  	{ "hu",		"ISO-8859-2"	}, /* Hungarian */
     117  	{ "ja",		"EUC-JP"	}, /* Japanese */
     118  	{ "ko",		"EUC-KR"	}, /* Korean */
     119  	{ "lt",		"ISO-8859-13"	}, /* Lithuanian */
     120  	{ "lv",		"ISO-8859-13"	}, /* Latvian */
     121  	{ "mk",		"ISO-8859-5"	}, /* Macedonian */
     122  	{ "pl",		"ISO-8859-2"	}, /* Polish */
     123  	{ "ro",		"ISO-8859-2"	}, /* Romanian */
     124  	{ "ru",		"KOI8-R"	}, /* Russian */
     125  	{ "sk",		"ISO-8859-2"	}, /* Slovak */
     126  	{ "sl",		"ISO-8859-2"	}, /* Slovenian */
     127  	/* sr@latin must precede sr, due to top-down left-substring matching later */
     128  	{ "sr@latin",	"ISO-8859-2"	}, /* Serbian Latin */
     129  	{ "sr",		"ISO-8859-5"	}, /* Serbian */
     130  	{ "tr",		"ISO-8859-9"	}, /* Turkish */
     131  	{ "uk",		"KOI8-U"	}, /* Ukrainian */
     132  	{ "vi",		"TCVN5712-1"	}, /* Vietnamese */
     133  	{ "zh_CN",	"GBK"		}, /* Simplified Chinese */
     134  	{ "zh_SG",	"GBK"		}, /* Simplified Chinese, Singapore */
     135  	{ "zh_HK",	"BIG5HKSCS"	}, /* Traditional Chinese, Hong Kong */
     136  	{ "zh_TW",	"BIG5"		}, /* Traditional Chinese */
     137  #endif /* MULTIBYTE_GROFF */
     138  
     139  	{ NULL,		NULL		}
     140  };
     141  
     142  static const char fallback_source_encoding[] = "ISO-8859-1";
     143  
     144  /* Unfortunately, there is no portable way to inspect iconv's internal table
     145   * of character set aliases. We copy the most interesting ones here so that
     146   * we can deal with them if they appear in directory names. Note that all
     147   * names will be converted to upper case before looking them up in this
     148   * table.
     149   */
     150  struct charset_alias_entry {
     151  	const char *alias;
     152  	const char *canonical_name;
     153  };
     154  
     155  static struct charset_alias_entry charset_alias_table[] = {
     156  	/* The FHS is silly and requires numeric-only aliases that iconv
     157  	 * does not support.
     158  	 */
     159  	{ "88591",		"ISO-8859-1"		},
     160  	{ "88592",		"ISO-8859-2"		},
     161  	{ "88593",		"ISO-8859-3"		},
     162  	{ "88594",		"ISO-8859-4"		},
     163  	{ "88595",		"ISO-8859-5"		},
     164  	{ "88596",		"ISO-8859-6"		},
     165  	{ "88597",		"ISO-8859-7"		},
     166  	{ "88598",		"ISO-8859-8"		},
     167  	{ "88599",		"ISO-8859-9"		},
     168  	{ "885910",		"ISO-8859-10"		},
     169  	{ "885911",		"ISO-8859-11"		},
     170  	{ "885913",		"ISO-8859-13"		},
     171  	{ "885914",		"ISO-8859-14"		},
     172  	{ "885915",		"ISO-8859-15"		},
     173  	{ "885916",		"ISO-8859-16"		},
     174  
     175  	{ "ASCII",		"ANSI_X3.4-1968"	},
     176  	{ "BIG-5",		"BIG5"			},
     177  	{ "BIG5-HKSCS",		"BIG5HKSCS"		},
     178  	{ "EUCCN",		"EUC-CN"		},
     179  	{ "EUCJP",		"EUC-JP"		},
     180  	{ "EUCKR",		"EUC-KR"		},
     181  	{ "EUCTW",		"EUC-TW"		},
     182  	{ "GB2312",		"EUC-CN"		},
     183  	{ "ISO8859-1",		"ISO-8859-1"		},
     184  	{ "ISO8859-2",		"ISO-8859-2"		},
     185  	{ "ISO8859-3",		"ISO-8859-3"		},
     186  	{ "ISO8859-4",		"ISO-8859-4"		},
     187  	{ "ISO8859-5",		"ISO-8859-5"		},
     188  	{ "ISO8859-6",		"ISO-8859-6"		},
     189  	{ "ISO8859-7",		"ISO-8859-7"		},
     190  	{ "ISO8859-8",		"ISO-8859-8"		},
     191  	{ "ISO8859-9",		"ISO-8859-9"		},
     192  	{ "ISO8859-10",		"ISO-8859-10"		},
     193  	{ "ISO8859-11",		"ISO-8859-11"		},
     194  	{ "ISO8859-13",		"ISO-8859-13"		},
     195  	{ "ISO8859-14",		"ISO-8859-14"		},
     196  	{ "ISO8859-15",		"ISO-8859-15"		},
     197  	{ "ISO8859-16",		"ISO-8859-16"		},
     198  	{ "KOI8R",		"KOI8-R"		},
     199  	{ "KOI8U",		"KOI8-U"		},
     200  	{ "UJIS",		"EUC-JP"		},
     201  	{ "US-ASCII",		"ANSI_X3.4-1968"	},
     202  	{ "UTF8",		"UTF-8"			},
     203  
     204  	{ NULL,			NULL			}
     205  };
     206  
     207  /* The default groff terminal output device to be used is determined based
     208   * on locale_charset (), which returns the character set used by the current
     209   * locale.
     210   */
     211  struct charset_entry {
     212  	const char *charset_from_locale;
     213  	const char *default_device;
     214  };
     215  
     216  static struct charset_entry charset_table[] = {
     217  	{ "ANSI_X3.4-1968",	"ascii"		},
     218  #ifndef HEIRLOOM_NROFF
     219  	{ "ISO-8859-1",		"latin1"	},
     220  #endif /* HEIRLOOM_NROFF */
     221  	{ "UTF-8",		"utf8"		},
     222  
     223  #ifndef HEIRLOOM_NROFF
     224  # ifdef MULTIBYTE_GROFF
     225  	{ "BIG5",		"nippon"	},
     226  	{ "BIG5HKSCS",		"nippon"	},
     227  	{ "EUC-CN",		"nippon"	},
     228  	{ "EUC-JP",		"nippon"	},
     229  	{ "EUC-TW",		"nippon"	},
     230  	{ "GBK",		"nippon"	},
     231  # else /* !MULTIBYTE_GROFF */
     232  	/* If we have a smarter version of groff, this is better dealt with
     233  	 * using either ascii8 (Debian multibyte patch) or preconv (as of
     234  	 * groff 1.20). This is a not-quite-right stopgap in case we have
     235  	 * neither.
     236  	 */
     237  	{ "ISO-8859-15",    	"latin1"	},
     238  # endif /* MULTIBYTE_GROFF */
     239  #endif /* HEIRLOOM_NROFF */
     240  
     241  	{ NULL,			NULL		}
     242  };
     243  
     244  static const char *fallback_default_device =
     245  #ifdef MULTIBYTE_GROFF
     246  	"ascii8"
     247  #else /* !MULTIBYTE_GROFF */
     248  	"ascii"
     249  #endif /* MULTIBYTE_GROFF */
     250  	;
     251  
     252  /* The encoding used for the text passed to groff is a function of the
     253   * selected groff device. Traditional devices expect ISO-8859-1 on input
     254   * (yes, even the utf8 device); devices added in the Debian multibyte patch
     255   * expect other encodings. The ascii8 device passes top-bit-set characters
     256   * straight through so is (probably ...) encoding-agnostic. If this encoding
     257   * does not match the source encoding, an iconv pipe is used (if available)
     258   * to perform recoding.
     259   */
     260  struct device_entry {
     261  	const char *roff_device;
     262  	const char *roff_encoding;
     263  	const char *output_encoding;
     264  };
     265  
     266  static struct device_entry device_table[] = {
     267  	/* nroff devices */
     268  	{ "ascii",	"ANSI_X3.4-1968",	"ANSI_X3.4-1968"	},
     269  	{ "latin1",	"ISO-8859-1",		"ISO-8859-1"		},
     270  	{ "utf8",	"ISO-8859-1",		"UTF-8"			},
     271  
     272  #ifdef MULTIBYTE_GROFF
     273  	{ "ascii8",	NULL,			NULL			},
     274  	{ "nippon",	NULL,			NULL			},
     275  #endif /* MULTIBYTE_GROFF */
     276  
     277  #ifdef HEIRLOOM_NROFF
     278  	/* Not strictly accurate, but we only use this in UTF-8 locales. */
     279  	{ "locale",	"UTF-8",		"UTF-8"			},
     280  #endif /* HEIRLOOM_NROFF */
     281  
     282  	/* troff devices */
     283  	{ "X75",	NULL,			NULL			},
     284  	{ "X75-12",	NULL,			NULL			},
     285  	{ "X100",	NULL,			NULL			},
     286  	{ "X100-12",	NULL,			NULL			},
     287  	{ "dvi",	NULL,			NULL			},
     288  	{ "html",	NULL,			NULL			},
     289  	{ "lbp",	NULL,			NULL			},
     290  	{ "lj4",	NULL,			NULL			},
     291  	{ "ps",		NULL,			NULL			},
     292  
     293  	{ NULL,		NULL,			NULL			}
     294  };
     295  
     296  static const char fallback_roff_encoding[] = "ISO-8859-1";
     297  
     298  /* Setting less_charset to iso8859 tells the less pager that characters
     299   * between 0xA0 and 0xFF are displayable, not that its input is encoded in
     300   * ISO-8859-*. TODO: Perhaps using LESSCHARDEF would be better.
     301   *
     302   * Character set names compatible only with jless go in jless_charset.
     303   */
     304  struct less_charset_entry {
     305  	const char *charset_from_locale;
     306  	const char *less_charset;
     307  	const char *jless_charset;
     308  };
     309  
     310  static struct less_charset_entry less_charset_table[] = {
     311  	{ "ANSI_X3.4-1968",	"ascii",	NULL		},
     312  	{ "ISO-8859-1",		"iso8859",	NULL		},
     313  	{ "UTF-8",		"utf-8",	NULL		},
     314  
     315  #ifdef MULTIBYTE_GROFF
     316  	{ "CP1251",		"windows",	NULL		},
     317  	{ "EUC-JP",		"iso8859",	"japanese-ujis"	},
     318  	{ "KOI8-R",		"koi8-r",	NULL		},
     319  	/* close enough? */
     320  	{ "KOI8-U",		"koi8-r",	NULL		},
     321  #endif /* MULTIBYTE_GROFF */
     322  
     323  	{ NULL,			NULL,		NULL		}
     324  };
     325  
     326  static const char fallback_less_charset[] = "iso8859";
     327  
     328  const char *groff_preconv = NULL;
     329  
     330  /* Is the groff "preconv" helper available? If so, return its name.
     331   * Otherwise, return NULL.
     332   */
     333  const char *get_groff_preconv (void)
     334  {
     335  	if (groff_preconv) {
     336  		if (*groff_preconv)
     337  			return groff_preconv;
     338  		else
     339  			return NULL;
     340  	}
     341  
     342  	if (pathsearch_executable ("gpreconv"))
     343  		groff_preconv = "gpreconv";
     344  	else if (pathsearch_executable ("preconv"))
     345  		groff_preconv = "preconv";
     346  	else
     347  		groff_preconv = "";
     348  
     349  	if (*groff_preconv)
     350  		return groff_preconv;
     351  	else
     352  		return NULL;
     353  }
     354  
     355  /* Return the assumed encoding of the source man page, based on the
     356   * directory in which it was found. The caller should attempt to recode from
     357   * this to whatever encoding is expected by groff.
     358   *
     359   * The caller should free the returned string when it is finished with it.
     360   */
     361  char * ATTRIBUTE_MALLOC get_page_encoding (const char *lang)
     362  {
     363  	const struct directory_entry *entry;
     364  	const char *dot;
     365  
     366  	if (!lang || !*lang) {
     367  		/* Guess based on the locale. */
     368  		lang = setlocale (LC_MESSAGES, NULL);
     369  		if (!lang)
     370  			return xstrdup (fallback_source_encoding);
     371  	}
     372  
     373  	dot = strchr (lang, '.');
     374  	if (dot) {
     375  		/* The FHS has the worst specification of what's supposed to
     376  		 * go after the dot here that I've ever seen. To quote from
     377  		 * version 2.1:
     378  		 *
     379  		 * "It is recommended that this be a numeric representation
     380  		 * if possible (ISO standards, especially), not include
     381  		 * additional punctuation symbols, and that any letters be
     382  		 * in lowercase."
     383  		 *
     384  		 * Any sane standard would use directory names like
     385  		 * de_DE.ISO-8859-1; the examples in the FHS recommend
     386  		 * de_DE.88591 instead. Considering that there is no other
     387  		 * conceivable use for encodings in directory names other
     388  		 * than to pass them to iconv or similar, this is quite
     389  		 * startlingly useless.
     390  		 *
     391  		 * While we now support this thanks to
     392  		 * get_canonical_charset_name, the FHS specification is
     393  		 * obviously wrong and I plan to petition to have it
     394  		 * changed. I recommend ignoring this part of the FHS.
     395  		 */
     396  		char *dir_encoding =
     397  			xstrndup (dot + 1, strcspn (dot + 1, ",@"));
     398  		char *canonical_dir_encoding =
     399  			xstrdup (get_canonical_charset_name (dir_encoding));
     400  		free (dir_encoding);
     401  		return canonical_dir_encoding;
     402  	}
     403  
     404  	for (entry = directory_table; entry->lang_dir; ++entry)
     405  		if (STRNEQ (entry->lang_dir, lang, strlen (entry->lang_dir)))
     406  			return xstrdup (entry->source_encoding);
     407  
     408  	return xstrdup (fallback_source_encoding);
     409  }
     410  
     411  /* Return the canonical encoding for source man pages in the specified
     412   * language. This ignores any encoding specification in the language
     413   * directory name. The source encoding should be used as a basis for
     414   * determining the correct roff device to use: that is, the caller should
     415   * behave as if it is recoding from the page encoding to the source encoding
     416   * first, although in practice it should recode directly from the page
     417   * encoding to the roff encoding.
     418   *
 * You should normally only call this function if the page encoding is
 * UTF-8 (in which case older versions of groff that lack preconv need to
 * have the page recoded to some legacy encoding). If the page is in a
     422   * legacy encoding, then attempting to recode from that to some other legacy
     423   * encoding will probably do more harm than good.
     424   *
     425   * Here are a few concrete examples of why these distinctions are important:
     426   *
     427   *   /usr/share/man/en_GB.UTF-8, locale C
     428   *     page encoding = UTF-8
     429   *     source encoding = ISO-8859-1
     430   *     roff encoding = ISO-8859-1
     431   *     output encoding = UTF-8
     432   *     UTF-8 -> iconv -> ISO-8859-1 -> groff -Tascii -> ANSI_X3.4-1968
     433   *
     434   *   /usr/share/man/pl_PL.UTF-8, locale pl_PL.UTF-8
     435   *     page encoding = UTF-8
     436   *     source encoding = ISO-8859-2
     437   *     roff encoding = ISO-8859-2
     438   *     output encoding = ISO-8859-2
     439   *     UTF-8 -> iconv -> ISO-8859-2 -> groff -Tascii8
     440   *                    -> ISO-8859-2 -> iconv -> UTF-8
     441   *
     442   *   /usr/share/man/ja_JP.EUC-JP, locale ja_JP.UTF-8
     443   *     page encoding = EUC-JP
     444   *     source encoding = EUC-JP
     445   *     roff encoding = UTF-8
     446   *     output encoding = UTF-8
     447   *     EUC-JP -> iconv -> UTF-8 -> groff -Tutf8 -> UTF-8
     448   *
     449   *   /usr/share/man/en_GB.ISO-8859-15, locale en_GB.UTF-8
     450   *     page encoding = ISO-8859-15
     451   *     source encoding = ISO-8859-15
     452   *     roff encoding = ISO-8859-15
     453   *     output encoding = ISO-8859-15
     454   *     ISO-8859-15 -> groff -Tascii8 -> ISO-8859-15 -> iconv -> UTF-8
     455   */
     456  const char *get_source_encoding (const char *lang)
     457  {
     458  	const struct directory_entry *entry;
     459  
     460  	if (!lang || !*lang) {
     461  		/* Guess based on the locale. */
     462  		lang = setlocale (LC_MESSAGES, NULL);
     463  		if (!lang)
     464  			return fallback_source_encoding;
     465  	}
     466  
     467  	for (entry = directory_table; entry->lang_dir; ++entry)
     468  		if (STRNEQ (entry->lang_dir, lang, strlen (entry->lang_dir)))
     469  			return entry->source_encoding;
     470  
     471  	return fallback_source_encoding;
     472  }
     473  
     474  const char * ATTRIBUTE_NONNULL ((1)) ATTRIBUTE_RETURNS_NONNULL
     475  	get_canonical_charset_name (const char *charset)
     476  {
     477  	const struct charset_alias_entry *entry;
     478  	char *charset_upper = xstrdup (charset);
     479  	char *p;
     480  
     481  	for (p = charset_upper; *p; ++p)
     482  		*p = CTYPE (toupper, *p);
     483  
     484  	for (entry = charset_alias_table; entry->alias; ++entry)
     485  		if (STREQ (entry->alias, charset_upper)) {
     486  			free (charset_upper);
     487  			return entry->canonical_name;
     488  		}
     489  
     490  	free (charset_upper);
     491  	return charset;
     492  }
     493  
     494  /* Return the current locale's character set. */
     495  const char * ATTRIBUTE_RETURNS_NONNULL get_locale_charset (void)
     496  {
     497  	const char *charset;
     498  	char *saved_locale;
     499  
     500  	/* We need to modify LC_CTYPE temporarily in order to look at the
     501  	 * codeset, so save it first.
     502  	 */
     503  	saved_locale = setlocale (LC_CTYPE, NULL);
     504  	if (saved_locale)
     505  		saved_locale = xstrdup (saved_locale);
     506  
     507  	setlocale (LC_CTYPE, "");
     508  
     509  	charset = locale_charset ();
     510  
     511  	/* Restore LC_CTYPE to its value on entry to this function. */
     512  	setlocale (LC_CTYPE, saved_locale);
     513  	free (saved_locale);
     514  
     515  	if (!charset || !*charset)
     516  		charset = "ANSI_X3.4-1968";
     517  	return get_canonical_charset_name (charset);
     518  }
     519  
     520  /* Find a locale with this character set. This is a non-portable operation,
     521   * but required to make col(1) work correctly with -E. If no locale can be
     522   * found, or if none needs to be set, return NULL.
     523   *
     524   * The caller should free the returned string when it is finished with it.
     525   */
     526  char *find_charset_locale (const char *charset)
     527  {
     528  	const char *canonical_charset = get_canonical_charset_name (charset);
     529  	char *saved_locale;
     530  	const char supported_path[] = "/usr/share/i18n/SUPPORTED";
     531  	FILE *supported = NULL;
     532  	char *line = NULL;
     533  	size_t n = 0;
     534  	char *locale = NULL;
     535  
     536  	if (STREQ (charset, get_locale_charset ()))
     537  		return NULL;
     538  
     539  	saved_locale = setlocale (LC_CTYPE, NULL);
     540  	if (saved_locale)
     541  		saved_locale = xstrdup (saved_locale);
     542  
     543  	supported = fopen (supported_path, "r");
     544  	while (supported && getline (&line, &n, supported) >= 0) {
     545  		const char *space = strchr (line, ' ');
     546  		if (space) {
     547  			char *encoding = xstrdup (space + 1);
     548  			char *newline = strchr (encoding, '\n');
     549  			if (newline)
     550  				*newline = 0;
     551  			if (STREQ (canonical_charset,
     552  				   get_canonical_charset_name (encoding))) {
     553  				locale = xstrndup (line, space - line);
     554  				/* Is this locale actually installed? */
     555  				if (setlocale (LC_CTYPE, locale)) {
     556  					free (encoding);
     557  					goto out;
     558  				} else {
     559  					free (locale);
     560  					locale = NULL;
     561  				}
     562  			}
     563  			free (encoding);
     564  		}
     565  		free (line);
     566  		line = NULL;
     567  	}
     568  
     569  	if (strlen (canonical_charset) >= 5 &&
     570  	    STRNEQ (canonical_charset, "UTF-8", 5)) {
     571  		locale = xstrdup ("C.UTF-8");
     572  		if (setlocale (LC_CTYPE, locale))
     573  			goto out;
     574  		free (locale);
     575  		locale = xstrdup ("en_US.UTF-8");
     576  		if (setlocale (LC_CTYPE, locale))
     577  			goto out;
     578  		free (locale);
     579  		locale = NULL;
     580  	}
     581  
     582  out:
     583  	free (line);
     584  	setlocale (LC_CTYPE, saved_locale);
     585  	free (saved_locale);
     586  	if (supported)
     587  		fclose (supported);
     588  	return locale;
     589  }
     590  
     591  /* Can we take this input encoding and produce this output encoding, perhaps
     592   * with the help of some iconv pipes? */
     593  static bool ATTRIBUTE_PURE compatible_encodings (const char *input,
     594  						 const char *output)
     595  {
     596  	if (STREQ (input, output))
     597  		return true;
     598  
     599  	/* If the input is ASCII, recoding should be easy. Try it. */
     600  	if (STREQ (input, "ANSI_X3.4-1968"))
     601  		return true;
     602  
     603  	/* If the input is UTF-8, it's either a simple recoding of whatever
     604  	 * we want or else it probably won't work at all no matter what we
     605  	 * do. We might as well try it for now.
     606  	 */
     607  	if (STREQ (input, "UTF-8"))
     608  		return true;
     609  
     610  	/* If the output is ASCII, this is probably because the caller
     611  	 * explicitly asked for it, so we have little choice but to try.
     612  	 */
     613  	if (STREQ (output, "ANSI_X3.4-1968"))
     614  		return true;
     615  
     616  #ifdef MULTIBYTE_GROFF
     617  	/* Special case for some CJK UTF-8 locales, which take UTF-8 input
     618  	 * recoded from EUC-JP (etc.) and produce UTF-8 output. This is
     619  	 * rather filthy.
     620  	 */
     621  	if ((STREQ (input, "BIG5") || STREQ (input, "BIG5HKSCS") ||
     622  	     STREQ (input, "EUC-JP") ||
     623  	     STREQ (input, "EUC-CN") || STREQ (input, "GBK") ||
     624  	     STREQ (input, "EUC-KR") ||
     625  	     STREQ (input, "EUC-TW")) &&
     626  	    STREQ (output, "UTF-8"))
     627  		return true;
     628  #endif /* MULTIBYTE_GROFF */
     629  
     630  	return false;
     631  }
     632  
     633  /* Return the default groff device for the given character set. This may be
     634   * overridden by the user. The page's source encoding is needed to ensure
     635   * that the device is compatible: consider ru_RU.UTF-8, which needs ascii8
     636   * and a trailing iconv pipe to recode to UTF-8.
     637   *
     638   * All this encoding compatibility stuff feels like a slightly nasty hack,
     639   * but I haven't yet come up with a cleaner way to do it.
     640   */
     641  const char *get_default_device (const char *charset_from_locale,
     642  				const char *source_encoding)
     643  {
     644  	const struct charset_entry *entry;
     645  
     646  	if (get_groff_preconv ()) {
     647  		/* ASCII is a special case, and the only way we can get
     648  		 * things like bullet marks to come out right is by using
     649  		 * the ascii device. People using such a basic locale
     650  		 * probably don't want anything fancy anyway.
     651  		 */
     652  		if (charset_from_locale &&
     653  		    STREQ (charset_from_locale, "ANSI_X3.4-1968"))
     654  			return "ascii";
     655  		else
     656  			return "utf8";
     657  	}
     658  
     659  	if (!charset_from_locale)
     660  		return fallback_default_device;
     661  
     662  	for (entry = charset_table; entry->charset_from_locale; ++entry) {
     663  		if (STREQ (entry->charset_from_locale, charset_from_locale)) {
     664  			const char *roff_encoding =
     665  				get_roff_encoding (entry->default_device,
     666  						   source_encoding);
     667  			if (compatible_encodings (source_encoding,
     668  						  roff_encoding))
     669  				return entry->default_device;
     670  		}
     671  	}
     672  
     673  	return fallback_default_device;
     674  }
     675  
     676  /* Is this a known *roff device name? */
     677  bool ATTRIBUTE_PURE is_roff_device (const char *device)
     678  {
     679  	const struct device_entry *entry;
     680  
     681  	for (entry = device_table; entry->roff_device; ++entry) {
     682  		if (STREQ (entry->roff_device, device))
     683  			return true;
     684  	}
     685  
     686  	return false;
     687  }
     688  
     689  /* Find the input encoding expected by groff, and set the LESSCHARSET
     690   * environment variable appropriately.
     691   */
     692  const char *get_roff_encoding (const char *device, const char *source_encoding)
     693  {
     694  	const struct device_entry *entry;
     695  	bool found = false;
     696  	const char *roff_encoding = NULL;
     697  
     698  	if (device) {
     699  		for (entry = device_table; entry->roff_device; ++entry) {
     700  			if (STREQ (entry->roff_device, device)) {
     701  				found = true;
     702  				roff_encoding = entry->roff_encoding;
     703  				break;
     704  			}
     705  		}
     706  	}
     707  
     708  	if (!found)
     709  		roff_encoding = fallback_roff_encoding;
     710  
     711  #ifdef MULTIBYTE_GROFF
     712  	/* An ugly special case is needed here. The utf8 device normally
     713  	 * takes ISO-8859-1 input. However, with the multibyte patch, when
     714  	 * recoding from CJK character sets it takes UTF-8 input instead.
     715  	 * This is evil, but there's not much that can be done about it
     716  	 * apart from waiting for groff 2.0.
     717  	 */
     718  	if (device && STREQ (device, "utf8") && !get_groff_preconv () &&
     719  	    STREQ (get_locale_charset (), "UTF-8")) {
     720  		const char *ctype = setlocale (LC_CTYPE, NULL);
     721  		if (STRNEQ (ctype, "ja_JP", 5) ||
     722  		    STRNEQ (ctype, "ko_KR", 5) ||
     723  		    STRNEQ (ctype, "zh_CN", 5) ||
     724  		    STRNEQ (ctype, "zh_HK", 5) ||
     725  		    STRNEQ (ctype, "zh_SG", 5) ||
     726  		    STRNEQ (ctype, "zh_TW", 5))
     727  			roff_encoding = "UTF-8";
     728  	}
     729  #endif /* MULTIBYTE_GROFF */
     730  
     731  	return roff_encoding ? roff_encoding : source_encoding;
     732  }
     733  
     734  /* Find the output encoding that this device will produce, or NULL if it
     735   * will simply pass through the input encoding.
     736   */
     737  const char * ATTRIBUTE_PURE get_output_encoding (const char *device)
     738  {
     739  	const struct device_entry *entry;
     740  
     741  	for (entry = device_table; entry->roff_device; ++entry)
     742  		if (STREQ (entry->roff_device, device))
     743  			return entry->output_encoding;
     744  
     745  	return NULL;
     746  }
     747  
     748  /* Return the value of LESSCHARSET appropriate for this locale. */
     749  const char * ATTRIBUTE_PURE get_less_charset (const char *charset_from_locale)
     750  {
     751  	const struct less_charset_entry *entry;
     752  
     753  	if (charset_from_locale) {
     754  		for (entry = less_charset_table; entry->charset_from_locale;
     755  		     ++entry)
     756  			if (STREQ (entry->charset_from_locale,
     757  				   charset_from_locale))
     758  				return entry->less_charset;
     759  	}
     760  
     761  	return fallback_less_charset;
     762  }
     763  
     764  /* Return the value of JLESSCHARSET appropriate for this locale. May return
     765   * NULL.
     766   */
     767  const char * ATTRIBUTE_PURE get_jless_charset (const char *charset_from_locale)
     768  {
     769  	const struct less_charset_entry *entry;
     770  
     771  	if (charset_from_locale) {
     772  		for (entry = less_charset_table; entry->charset_from_locale;
     773  		     ++entry)
     774  			if (STREQ (entry->charset_from_locale,
     775  				   charset_from_locale))
     776  				return entry->jless_charset;
     777  	}
     778  
     779  	return NULL;
     780  }