1  /*
       2   * manconv.c: convert manual page from one encoding to another
       3   *
       4   * Copyright (C) 2007, 2008, 2009, 2010, 2012 Colin Watson.
       5   * Based loosely on parts of glibc's iconv_prog.c, which is:
       6   * Copyright (C) 1998-2004, 2005, 2006, 2007 Free Software Foundation, Inc.
       7   *
       8   * This file is part of man-db.
       9   *
      10   * man-db is free software; you can redistribute it and/or modify it
      11   * under the terms of the GNU General Public License as published by
      12   * the Free Software Foundation; either version 2 of the License, or
      13   * (at your option) any later version.
      14   *
      15   * man-db is distributed in the hope that it will be useful, but
      16   * WITHOUT ANY WARRANTY; without even the implied warranty of
      17   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      18   * GNU General Public License for more details.
      19   *
      20   * You should have received a copy of the GNU General Public License
      21   * along with man-db; if not, write to the Free Software Foundation,
      22   * Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
      23   */
      24  
      25  /* This program arose during a discussion with Adam Borowski. See:
      26   *   https://lists.debian.org/debian-mentors/2007/09/msg00245.html
      27   * It behaves like iconv, but allows multiple source encodings and
      28   * attempts to guess the first one that works. An Emacs-style
      29   * "-*- coding:" declaration overrides this.
      30   */
      31  
      32  #ifdef HAVE_CONFIG_H
      33  #  include "config.h"
      34  #endif /* HAVE_CONFIG_H */
      35  
      36  #include <assert.h>
      37  #include <stdio.h>
      38  #include <errno.h>
      39  #include <stdlib.h>
      40  #include <string.h>
      41  #include <stdbool.h>
      42  #include <stdint.h>
      43  #include <unistd.h>
      44  
      45  #ifdef HAVE_ICONV
      46  #  include <iconv.h>
      47  #endif /* HAVE_ICONV */
      48  
      49  #include "argp.h"
      50  #include "attribute.h"
      51  #include "error.h"
      52  #include "gl_list.h"
      53  #include "xalloc.h"
      54  #include "xstrndup.h"
      55  #include "xvasprintf.h"
      56  
      57  #include "gettext.h"
      58  #include <locale.h>
      59  #define _(String) gettext (String)
      60  
      61  #include "manconfig.h"
      62  
      63  #include "debug.h"
      64  #include "fatal.h"
      65  #include "glcontainers.h"
      66  
      67  #include "decompress.h"
      68  #include "manconv.h"
      69  
      70  /* Encoding conversions from groff-1.20/src/preproc/preconv/preconv.cpp.
      71   * I've only included those not already recognised by GNU libiconv.
      72   */
      73  struct conversion_entry {
      74  	const char *from;
      75  	const char *to;
      76  };
      77  
      78  static struct conversion_entry conversion_table[] = {
      79  	{ "chinese-big5",			"Big5" },
      80  	{ "chinese-euc",			"GB2312" },
      81  	{ "chinese-iso-8bit",			"GB2312" },
      82  	{ "cn-gb-2312",				"GB2312" },
      83  	{ "cp878",				"KOI8-R" },
      84  	{ "cyrillic-iso-8bit",			"ISO-8859-5" },
      85  	{ "cyrillic-koi8",			"KOI8-R" },
      86  	{ "euc-china",				"GB2312" },
      87  	{ "euc-japan",				"EUC-JP" },
      88  	{ "euc-japan-1990",			"EUC-JP" },
      89  	{ "euc-kr",				"EUC-KR" },
      90  	{ "greek-iso-8bit",			"ISO-8859-7" },
      91  	{ "iso-latin-1",			"ISO-8859-1" },
      92  	{ "iso-latin-2",			"ISO-8859-2" },
      93  	{ "iso-latin-5",			"ISO-8859-9" },
      94  	{ "iso-latin-7",			"ISO-8859-13" },
      95  	{ "iso-latin-9",			"ISO-8859-15" },
      96  	{ "japanese-iso-8bit",			"EUC-JP" },
      97  	{ "japanese-euc",			"EUC-JP" },
      98  	{ "jis8",				"EUC-JP" },
      99  	{ "korean-euc",				"EUC-KR" },
     100  	{ "korean-iso-8bit",			"EUC-KR" },
     101  	{ "latin-0",				"ISO-8859-15" },
     102  	{ "latin-1",				"ISO-8859-1" },
     103  	{ "latin-2",				"ISO-8859-2" },
     104  	{ "latin-5",				"ISO-8859-9" },
     105  	{ "latin-7",				"ISO-8859-13" },
     106  	{ "mule-utf-16",			"UTF-16" },
     107  	{ "mule-utf-16be",			"UTF-16BE" },
     108  	{ "mule-utf-16-be",			"UTF-16BE" },
     109  	{ "mule-utf-16be-with-signature",	"UTF-16" },
     110  	{ "mule-utf-16le",			"UTF-16LE" },
     111  	{ "mule-utf-16-le",			"UTF-16LE" },
     112  	{ "mule-utf-16le-with-signature",	"UTF-16" },
     113  	{ "mule-utf-8",				"UTF-8" },
     114  	{ "utf-16-be",				"UTF-16BE" },
     115  	{ "utf-16be-with-signature",		"UTF-16" },
     116  	{ "utf-16-be-with-signature",		"UTF-16" },
     117  	{ "utf-16-le",				"UTF-16LE" },
     118  	{ "utf-16le-with-signature",		"UTF-16" },
     119  	{ "utf-16-le-with-signature",		"UTF-16" },
     120  	{ NULL,					NULL }
     121  };
     122  
     123  /* Convert Emacs-style coding tags to ones that libiconv understands. */
     124  static char *convert_encoding (char *encoding)
     125  {
     126  	size_t encoding_len = strlen (encoding);
     127  	const struct conversion_entry *entry;
     128  
     129  #define STRIP(s, l) do { \
     130  	if (encoding_len > (l) && \
     131  	    !strcasecmp (encoding + encoding_len - (l), (s))) \
     132  		encoding[encoding_len - (l)] = '\0'; \
     133  } while (0)
     134  
     135  	STRIP ("-dos", 4);
     136  	STRIP ("-mac", 4);
     137  	STRIP ("-unix", 5);
     138  
     139  #undef STRIP
     140  
     141  	for (entry = conversion_table; entry->from; ++entry)
     142  		if (!strcasecmp (entry->from, encoding)) {
     143  			free (encoding);
     144  			return xstrdup (entry->to);
     145  		}
     146  
     147  	return encoding;
     148  }
     149  
     150  /* Inspect the first line of data from a decompressor for preprocessor
     151   * encoding declarations.
     152   *
     153   * If to_encoding and modified_line are both non-NULL, and if the encoding
     154   * declaration in the input does not match to_encoding, then return an
     155   * encoding declaration line modified to refer to the given to_encoding in
     156   * *modified_line.  The caller should free *modified_line.
     157   */
     158  char *check_preprocessor_encoding (decompress *decomp, const char *to_encoding,
     159  				   char **modified_line)
     160  {
     161  	char *pp_encoding = NULL;
     162  	const char *line = decompress_peekline (decomp);
     163  	const char *directive = NULL, *directive_end = NULL, *pp_search = NULL;
     164  	size_t pp_encoding_len = 0;
     165  
     166  	/* Some people use .\" incorrectly. We allow it for encoding
     167  	 * declarations but not for preprocessor declarations.
     168  	 */
     169  	if (line &&
     170  	    (STRNEQ (line, PP_COOKIE, 4) || STRNEQ (line, ".\\\" ", 4))) {
     171  		const char *newline = strchr (line, '\n');
     172  
     173  		directive = line + 4;
     174  		directive_end = newline ? newline : strchr (directive, '\0');
     175  		pp_search = memmem (directive, directive_end - directive,
     176  				    "-*-", 3);
     177  	}
     178  
     179  	if (directive && pp_search) {
     180  		pp_search += 3;
     181  		while (pp_search && pp_search < directive_end && *pp_search) {
     182  			while (*pp_search == ' ')
     183  				++pp_search;
     184  			if (STRNEQ (pp_search, "coding:", 7)) {
     185  				const char *pp_encoding_allow;
     186  				pp_search += 7;
     187  				while (*pp_search == ' ')
     188  					++pp_search;
     189  				pp_encoding_allow = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
     190  						    "abcdefghijklmnopqrstuvwxyz"
     191  						    "0123456789-_/:.()";
     192  				pp_encoding_len = strspn (pp_search,
     193  							  pp_encoding_allow);
     194  				pp_encoding = xstrndup (pp_search,
     195  							pp_encoding_len);
     196  				pp_encoding = convert_encoding (pp_encoding);
     197  				debug ("preprocessor encoding: %s\n",
     198  				       pp_encoding);
     199  				break;
     200  			} else {
     201  				pp_search = memchr (pp_search, ';',
     202  						    directive_end - pp_search);
     203  				if (pp_search)
     204  					++pp_search;
     205  			}
     206  		}
     207  	}
     208  
     209  	if (to_encoding && modified_line &&
     210  	    pp_encoding && strcasecmp (pp_encoding, to_encoding)) {
     211  		assert (directive_end);
     212  		assert (pp_search);
     213  		*modified_line = xasprintf
     214  			("%.*s%s%.*s\n",
     215  			 (int) (pp_search - line), line,
     216  			 to_encoding,
     217  			 (int) (directive_end - (pp_search + pp_encoding_len)),
     218  			 pp_search + pp_encoding_len);
     219  	}
     220  
     221  	return pp_encoding;
     222  }
     223  
     224  static int add_output (const char *inbuf, size_t inlen,
     225  		       struct manconv_outbuf *outbuf)
     226  {
     227  	int ret = 0;
     228  
     229  	if (outbuf) {
     230  		if (outbuf->len + inlen >= outbuf->max)
     231  			fatal (0, "out of space in output buffer");
     232  		memcpy (outbuf->buf + outbuf->len, inbuf, inlen);
     233  		outbuf->len += inlen;
     234  	} else {
     235  		int errno_save = errno;
     236  		if (fwrite (inbuf, 1, inlen, stdout) < inlen ||
     237  		    ferror (stdout)) {
     238  			error (0, 0, _("can't write to standard output"));
     239  			ret = -1;
     240  		}
     241  		errno = errno_save;
     242  	}
     243  
     244  	return ret;
     245  }
     246  
     247  #ifdef HAVE_ICONV
     248  
     249  /* When converting text containing an invalid multibyte sequence to
     250   * UTF-8//IGNORE, GNU libc's iconv returns EILSEQ but sets *inbuf to the end
     251   * of the input buffer.  I'm not sure whether this is a bug or not (it seems
     252   * to contradict the documentation), but work around it anyway by recoding
     253   * to UTF-8 so that we can accurately position the error.
     254   */
     255  static off_t locate_error (const char *try_from_code,
     256  			   const char *input, size_t input_size,
     257  			   char *utf8, size_t utf8_size)
     258  {
     259  	iconv_t cd_utf8_strict;
     260  	char *inptr = (char *) input, *utf8ptr = utf8;
     261  	size_t inleft = input_size, utf8left = utf8_size;
     262  	size_t n;
     263  	off_t ret;
     264  
     265  	cd_utf8_strict = iconv_open ("UTF-8", try_from_code);
     266  	if (cd_utf8_strict == (iconv_t) -1) {
     267  		error (0, errno, "iconv_open (\"UTF-8\", \"%s\")",
     268  		       try_from_code);
     269  		return 0;
     270  	}
     271  
     272  	n = iconv (cd_utf8_strict, (ICONV_CONST char **) &inptr, &inleft,
     273  		   &utf8ptr, &utf8left);
     274  	if (n == (size_t) -1)
     275  		ret = inptr - input;
     276  	else
     277  		ret = 0;
     278  
     279  	iconv_close (cd_utf8_strict);
     280  
     281  	return ret;
     282  }
     283  
/* Result of one try_iconv call for a single candidate source encoding. */
typedef enum {
	TRIED_ICONV_OK = 0,
	TRIED_ICONV_ERROR = -1,  /* can continue with another encoding */
	TRIED_ICONV_FATAL = -2   /* must give up */
} tried_iconv;
     289  
     290  static tried_iconv try_iconv (decompress *decomp, const char *try_from_code,
     291  			      const char *to, bool last,
     292  			      struct manconv_outbuf *outbuf)
     293  {
     294  	char *try_to_code = xstrdup (to);
     295  	static const size_t buf_size = 65536;
     296  	size_t input_size = buf_size;
     297  	off_t input_pos = 0;
     298  	const char *input;
     299  	static char *utf8 = NULL, *output = NULL;
     300  	size_t utf8left = 0;
     301  	iconv_t cd_utf8, cd = NULL;
     302  	bool to_utf8 = STREQ (try_to_code, "UTF-8") ||
     303  		       STRNEQ (try_to_code, "UTF-8//", 7);
     304  	const char *utf8_target = last ? "UTF-8//IGNORE" : "UTF-8";
     305  	bool ignore_errors = (strstr (try_to_code, "//IGNORE") != NULL);
     306  	tried_iconv ret = TRIED_ICONV_OK;
     307  
     308  	debug ("trying encoding %s -> %s\n", try_from_code, try_to_code);
     309  
     310  	cd_utf8 = iconv_open (utf8_target, try_from_code);
     311  	if (cd_utf8 == (iconv_t) -1) {
     312  		error (0, errno, "iconv_open (\"%s\", \"%s\")",
     313  		       utf8_target, try_from_code);
     314  		free (try_to_code);
     315  		return TRIED_ICONV_ERROR;
     316  	}
     317  
     318  	if (!to_utf8) {
     319  		cd = iconv_open (try_to_code, "UTF-8");
     320  		if (cd == (iconv_t) -1) {
     321  			error (0, errno, "iconv_open (\"%s\", \"UTF-8\")",
     322  			       try_to_code);
     323  			free (try_to_code);
     324  			return TRIED_ICONV_ERROR;
     325  		}
     326  	}
     327  
     328  	input = decompress_peek (decomp, &input_size);
     329  	if (input_size < buf_size) {
     330  		/* End of file, error, or just a short read? Repeat until we
     331  		 * have either a full buffer or EOF/error.
     332  		 */
     333  		while (input_size < buf_size) {
     334  			size_t old_input_size = input_size;
     335  			input_size = buf_size;
     336  			input = decompress_peek (decomp, &input_size);
     337  			if (input_size == old_input_size)
     338  				break;
     339  		}
     340  	}
     341  
     342  	if (!utf8)
     343  		utf8 = xmalloc (buf_size);
     344  	if (!output)
     345  		output = xmalloc (buf_size);
     346  
     347  	while (input_size || utf8left) {
     348  		int handle_iconv_errors = 0;
     349  		char *inptr = (char *) input, *utf8ptr = utf8, *outptr;
     350  		size_t inleft = input_size, outleft;
     351  		size_t n, n2 = -1;
     352  
     353  		if (!utf8left) {
     354  			/* First, convert the text to UTF-8. By assumption,
     355  			 * all validly-encoded text can be converted to
     356  			 * UTF-8 assuming that we picked the correct
     357  			 * encoding. Any errors at this stage are due to
     358  			 * selecting an incorrect encoding, or due to
     359  			 * misencoded source text.
     360  			 */
     361  			utf8left = buf_size;
     362  			n = iconv (cd_utf8, (ICONV_CONST char **) &inptr,
     363  				   &inleft, &utf8ptr, &utf8left);
     364  			utf8left = buf_size - utf8left;
     365  
     366  			/* If we need to try the next encoding, do that
     367  			 * before writing anything.
     368  			 */
     369  			if (!last && n == (size_t) -1 &&
     370  			    (errno == EILSEQ ||
     371  			     (errno == EINVAL && input_size < buf_size))) {
     372  				ret = TRIED_ICONV_ERROR;
     373  				break;
     374  			} else if (n == (size_t) -1)
     375  				handle_iconv_errors = errno;
     376  		}
     377  
     378  		/* If the target encoding is UTF-8 (the common case), then
     379  		 * we can just write out what we've got. Otherwise, we need
     380  		 * to convert to the target encoding. Any errors at this
     381  		 * stage are due to characters that are not representable in
     382  		 * the target encoding.
     383  		 */
     384  		if (handle_iconv_errors)
     385  			/* Fall back to error handling below.  If we have
     386  			 * anything to write out, we'll do it next time
     387  			 * round the loop.
     388  			 */
     389  			outptr = output;
     390  		else if (to_utf8) {
     391  			memcpy (output, utf8, utf8left);
     392  			outptr = output + utf8left;
     393  			outleft = utf8left;
     394  			utf8left = 0;
     395  		} else if (utf8left) {
     396  			outptr = output;
     397  			outleft = buf_size;
     398  			utf8ptr = utf8;
     399  			n2 = iconv (
     400  				cd, (ICONV_CONST char **) &utf8ptr, &utf8left,
     401  				&outptr, &outleft);
     402  			outleft = buf_size - outleft;
     403  			if (n2 == (size_t) -1)
     404  				handle_iconv_errors = errno;
     405  
     406  			if (n2 == (size_t) -1 &&
     407  			    errno == EILSEQ && ignore_errors)
     408  				errno = 0;
     409  		} else
     410  			/* We appear to have converted some input text, but
     411  			 * not actually ended up with any UTF-8 text.  This
     412  			 * is odd.  However, we can at least continue round
     413  			 * the loop, skip the input text we converted, and
     414  			 * then we should get a different result next time.
     415  			 */
     416  			outptr = output;
     417  
     418  		if (outptr != output) {
     419  			/* We have something to write out. */
     420  			if (add_output (output, outleft, outbuf) != 0) {
     421  				ret = TRIED_ICONV_FATAL;
     422  				goto out;
     423  			}
     424  		}
     425  
     426  		if (!to_utf8 && n2 != (size_t) -1) {
     427  			/* All the UTF-8 text we have so far was processed.
     428  			 * For state-dependent character sets we have to
     429  			 * flush the state now.
     430  			 */
     431  			outptr = output;
     432  			outleft = buf_size;
     433  			iconv (cd, NULL, NULL, &outptr, &outleft);
     434  			outleft = buf_size - outleft;
     435  
     436  			if (outptr != output) {
     437  				/* We have something to write out. */
     438  				if (add_output (output, outleft,
     439  						outbuf) != 0) {
     440  					ret = TRIED_ICONV_FATAL;
     441  					goto out;
     442  				}
     443  			}
     444  		} else if (handle_iconv_errors) {
     445  			intmax_t error_pos;
     446  
     447  			if (handle_iconv_errors == EILSEQ && !ignore_errors) {
     448  				if (!quiet) {
     449  					error_pos = input_pos + locate_error (
     450  						try_from_code,
     451  						input, input_size,
     452  						utf8, buf_size);
     453  					error (0, handle_iconv_errors,
     454  					       "byte %jd: iconv", error_pos);
     455  				}
     456  				ret = TRIED_ICONV_FATAL;
     457  				goto out;
     458  			} else if (handle_iconv_errors == EINVAL &&
     459  				   input_size < buf_size) {
     460  				if (!quiet) {
     461  					error_pos = input_pos + locate_error (
     462  						try_from_code,
     463  						input, input_size,
     464  						utf8, buf_size);
     465  					error (0, 0, "byte %jd: %s", error_pos,
     466  					       _("iconv: incomplete character "
     467  						 "at end of buffer"));
     468  				}
     469  				ret = TRIED_ICONV_FATAL;
     470  				goto out;
     471  			}
     472  		}
     473  
     474  		if (inptr != input) {
     475  			decompress_peek_skip (decomp, input_size - inleft);
     476  			input_pos += input_size - inleft;
     477  		}
     478  
     479  		/* Unless we have some UTF-8 text left (which will only
     480  		 * happen if the output encoding is more verbose than UTF-8,
     481  		 * so is unlikely for legacy encodings), we need to fetch
     482  		 * more input text now.
     483  		 */
     484  		if (!utf8left) {
     485  			input_size = buf_size;
     486  			input = decompress_peek (decomp, &input_size);
     487  			while (input_size < buf_size) {
     488  				size_t old_input_size = input_size;
     489  				input_size = buf_size;
     490  				input = decompress_peek (decomp, &input_size);
     491  				if (input_size == old_input_size)
     492  					break;
     493  			}
     494  		}
     495  	}
     496  
     497  out:
     498  	if (!to_utf8)
     499  		iconv_close (cd);
     500  	iconv_close (cd_utf8);
     501  	free (try_to_code);
     502  
     503  	return ret;
     504  }
     505  
     506  int manconv (decompress *decomp, gl_list_t from, const char *to,
     507  	     struct manconv_outbuf *outbuf)
     508  {
     509  	char *pp_encoding;
     510  	const char *try_from_code;
     511  	char *plain_to, *modified_pp_line = NULL;
     512  	tried_iconv tried;
     513  	int ret = 0;
     514  
     515  	plain_to = xstrndup (to, strcspn (to, "/"));
     516  	pp_encoding = check_preprocessor_encoding
     517  		(decomp, plain_to, &modified_pp_line);
     518  	if (pp_encoding) {
     519  		if (modified_pp_line) {
     520  			size_t len = strlen (modified_pp_line);
     521  			decompress_readline (decomp);
     522  			if (add_output (modified_pp_line, len, outbuf) != 0) {
     523  				ret = -1;
     524  				goto out;
     525  			}
     526  		}
     527  		tried = try_iconv (decomp, pp_encoding, to, 1, outbuf);
     528  		if (tried == TRIED_ICONV_FATAL)
     529  			ret = -1;
     530  	} else {
     531  		GL_LIST_FOREACH (from, try_from_code) {
     532  			bool last = !gl_list_next_node (from, from_node);
     533  			tried = try_iconv (decomp, try_from_code, to, last,
     534  					   outbuf);
     535  			if (tried == TRIED_ICONV_OK)
     536  				break;
     537  			else if (tried == TRIED_ICONV_FATAL) {
     538  				ret = -1;
     539  				goto out;
     540  			}
     541  		}
     542  	}
     543  
     544  out:
     545  	free (modified_pp_line);
     546  	free (pp_encoding);
     547  	free (plain_to);
     548  	return ret;
     549  }
     550  
     551  #else /* !HAVE_ICONV */
     552  
     553  /* If we don't have iconv, there isn't much we can do; just pass everything
     554   * through unchanged.
     555   */
     556  int manconv (decompress *decomp, gl_list_t from MAYBE_UNUSED,
     557  	     const char *to MAYBE_UNUSED, struct manconv_outbuf *outbuf)
     558  {
     559  	for (;;) {
     560  		size_t len = 4096;
     561  		const char *buffer = decompress_read (decomp, &len);
     562  		if (len == 0)
     563  			break;
     564  		if (add_output (buffer, len, outbuf) != 0)
     565  			return -1;
     566  	}
     567  	return 0;
     568  }
     569  
     570  #endif /* HAVE_ICONV */