1 /*
2 * manconv.c: convert manual page from one encoding to another
3 *
4 * Copyright (C) 2007, 2008, 2009, 2010, 2012 Colin Watson.
5 * Based loosely on parts of glibc's iconv_prog.c, which is:
6 * Copyright (C) 1998-2004, 2005, 2006, 2007 Free Software Foundation, Inc.
7 *
8 * This file is part of man-db.
9 *
10 * man-db is free software; you can redistribute it and/or modify it
11 * under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * man-db is distributed in the hope that it will be useful, but
16 * WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with man-db; if not, write to the Free Software Foundation,
22 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
23 */
24
25 /* This program arose during a discussion with Adam Borowski. See:
26 * https://lists.debian.org/debian-mentors/2007/09/msg00245.html
27 * It behaves like iconv, but allows multiple source encodings and
28 * attempts to guess the first one that works. An Emacs-style
29 * "-*- coding:" declaration overrides this.
30 */
31
32 #ifdef HAVE_CONFIG_H
33 # include "config.h"
34 #endif /* HAVE_CONFIG_H */
35
36 #include <assert.h>
37 #include <stdio.h>
38 #include <errno.h>
39 #include <stdlib.h>
40 #include <string.h>
41 #include <stdbool.h>
42 #include <stdint.h>
43 #include <unistd.h>
44
45 #ifdef HAVE_ICONV
46 # include <iconv.h>
47 #endif /* HAVE_ICONV */
48
49 #include "argp.h"
50 #include "attribute.h"
51 #include "error.h"
52 #include "gl_list.h"
53 #include "xalloc.h"
54 #include "xstrndup.h"
55 #include "xvasprintf.h"
56
57 #include "gettext.h"
58 #include <locale.h>
59 #define _(String) gettext (String)
60
61 #include "manconfig.h"
62
63 #include "debug.h"
64 #include "fatal.h"
65 #include "glcontainers.h"
66
67 #include "decompress.h"
68 #include "manconv.h"
69
/* Encoding conversions from groff-1.20/src/preproc/preconv/preconv.cpp.
 * I've only included those not already recognised by GNU libiconv.
 */

/* One alias mapping: an Emacs-style coding name and the encoding name
 * that is substituted for it.
 */
struct conversion_entry {
	const char *from;	/* alias; matched case-insensitively */
	const char *to;		/* replacement encoding name */
};

/* Alias table, scanned linearly by convert_encoding() and terminated by
 * an entry whose members are both NULL.  Nothing ever writes to it, so it
 * is const.
 */
static const struct conversion_entry conversion_table[] = {
	{ "chinese-big5",			"Big5"		},
	{ "chinese-euc",			"GB2312"	},
	{ "chinese-iso-8bit",			"GB2312"	},
	{ "cn-gb-2312",				"GB2312"	},
	{ "cp878",				"KOI8-R"	},
	{ "cyrillic-iso-8bit",			"ISO-8859-5"	},
	{ "cyrillic-koi8",			"KOI8-R"	},
	{ "euc-china",				"GB2312"	},
	{ "euc-japan",				"EUC-JP"	},
	{ "euc-japan-1990",			"EUC-JP"	},
	{ "euc-kr",				"EUC-KR"	},
	{ "greek-iso-8bit",			"ISO-8859-7"	},
	{ "iso-latin-1",			"ISO-8859-1"	},
	{ "iso-latin-2",			"ISO-8859-2"	},
	{ "iso-latin-5",			"ISO-8859-9"	},
	{ "iso-latin-7",			"ISO-8859-13"	},
	{ "iso-latin-9",			"ISO-8859-15"	},
	{ "japanese-iso-8bit",			"EUC-JP"	},
	{ "japanese-euc",			"EUC-JP"	},
	{ "jis8",				"EUC-JP"	},
	{ "korean-euc",				"EUC-KR"	},
	{ "korean-iso-8bit",			"EUC-KR"	},
	{ "latin-0",				"ISO-8859-15"	},
	{ "latin-1",				"ISO-8859-1"	},
	{ "latin-2",				"ISO-8859-2"	},
	{ "latin-5",				"ISO-8859-9"	},
	{ "latin-7",				"ISO-8859-13"	},
	{ "mule-utf-16",			"UTF-16"	},
	{ "mule-utf-16be",			"UTF-16BE"	},
	{ "mule-utf-16-be",			"UTF-16BE"	},
	{ "mule-utf-16be-with-signature",	"UTF-16"	},
	{ "mule-utf-16le",			"UTF-16LE"	},
	{ "mule-utf-16-le",			"UTF-16LE"	},
	{ "mule-utf-16le-with-signature",	"UTF-16"	},
	{ "mule-utf-8",				"UTF-8"		},
	{ "utf-16-be",				"UTF-16BE"	},
	{ "utf-16be-with-signature",		"UTF-16"	},
	{ "utf-16-be-with-signature",		"UTF-16"	},
	{ "utf-16-le",				"UTF-16LE"	},
	{ "utf-16le-with-signature",		"UTF-16"	},
	{ "utf-16-le-with-signature",		"UTF-16"	},
	{ NULL,					NULL		}
};
122
123 /* Convert Emacs-style coding tags to ones that libiconv understands. */
124 static char *convert_encoding (char *encoding)
125 {
126 size_t encoding_len = strlen (encoding);
127 const struct conversion_entry *entry;
128
129 #define STRIP(s, l) do { \
130 if (encoding_len > (l) && \
131 !strcasecmp (encoding + encoding_len - (l), (s))) \
132 encoding[encoding_len - (l)] = '\0'; \
133 } while (0)
134
135 STRIP ("-dos", 4);
136 STRIP ("-mac", 4);
137 STRIP ("-unix", 5);
138
139 #undef STRIP
140
141 for (entry = conversion_table; entry->from; ++entry)
142 if (!strcasecmp (entry->from, encoding)) {
143 free (encoding);
144 return xstrdup (entry->to);
145 }
146
147 return encoding;
148 }
149
150 /* Inspect the first line of data from a decompressor for preprocessor
151 * encoding declarations.
152 *
153 * If to_encoding and modified_line are both non-NULL, and if the encoding
154 * declaration in the input does not match to_encoding, then return an
155 * encoding declaration line modified to refer to the given to_encoding in
156 * *modified_line. The caller should free *modified_line.
157 */
158 char *check_preprocessor_encoding (decompress *decomp, const char *to_encoding,
159 char **modified_line)
160 {
161 char *pp_encoding = NULL;
162 const char *line = decompress_peekline (decomp);
163 const char *directive = NULL, *directive_end = NULL, *pp_search = NULL;
164 size_t pp_encoding_len = 0;
165
166 /* Some people use .\" incorrectly. We allow it for encoding
167 * declarations but not for preprocessor declarations.
168 */
169 if (line &&
170 (STRNEQ (line, PP_COOKIE, 4) || STRNEQ (line, ".\\\" ", 4))) {
171 const char *newline = strchr (line, '\n');
172
173 directive = line + 4;
174 directive_end = newline ? newline : strchr (directive, '\0');
175 pp_search = memmem (directive, directive_end - directive,
176 "-*-", 3);
177 }
178
179 if (directive && pp_search) {
180 pp_search += 3;
181 while (pp_search && pp_search < directive_end && *pp_search) {
182 while (*pp_search == ' ')
183 ++pp_search;
184 if (STRNEQ (pp_search, "coding:", 7)) {
185 const char *pp_encoding_allow;
186 pp_search += 7;
187 while (*pp_search == ' ')
188 ++pp_search;
189 pp_encoding_allow = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
190 "abcdefghijklmnopqrstuvwxyz"
191 "0123456789-_/:.()";
192 pp_encoding_len = strspn (pp_search,
193 pp_encoding_allow);
194 pp_encoding = xstrndup (pp_search,
195 pp_encoding_len);
196 pp_encoding = convert_encoding (pp_encoding);
197 debug ("preprocessor encoding: %s\n",
198 pp_encoding);
199 break;
200 } else {
201 pp_search = memchr (pp_search, ';',
202 directive_end - pp_search);
203 if (pp_search)
204 ++pp_search;
205 }
206 }
207 }
208
209 if (to_encoding && modified_line &&
210 pp_encoding && strcasecmp (pp_encoding, to_encoding)) {
211 assert (directive_end);
212 assert (pp_search);
213 *modified_line = xasprintf
214 ("%.*s%s%.*s\n",
215 (int) (pp_search - line), line,
216 to_encoding,
217 (int) (directive_end - (pp_search + pp_encoding_len)),
218 pp_search + pp_encoding_len);
219 }
220
221 return pp_encoding;
222 }
223
224 static int add_output (const char *inbuf, size_t inlen,
225 struct manconv_outbuf *outbuf)
226 {
227 int ret = 0;
228
229 if (outbuf) {
230 if (outbuf->len + inlen >= outbuf->max)
231 fatal (0, "out of space in output buffer");
232 memcpy (outbuf->buf + outbuf->len, inbuf, inlen);
233 outbuf->len += inlen;
234 } else {
235 int errno_save = errno;
236 if (fwrite (inbuf, 1, inlen, stdout) < inlen ||
237 ferror (stdout)) {
238 error (0, 0, _("can't write to standard output"));
239 ret = -1;
240 }
241 errno = errno_save;
242 }
243
244 return ret;
245 }
246
247 #ifdef HAVE_ICONV
248
249 /* When converting text containing an invalid multibyte sequence to
250 * UTF-8//IGNORE, GNU libc's iconv returns EILSEQ but sets *inbuf to the end
251 * of the input buffer. I'm not sure whether this is a bug or not (it seems
252 * to contradict the documentation), but work around it anyway by recoding
253 * to UTF-8 so that we can accurately position the error.
254 */
255 static off_t locate_error (const char *try_from_code,
256 const char *input, size_t input_size,
257 char *utf8, size_t utf8_size)
258 {
259 iconv_t cd_utf8_strict;
260 char *inptr = (char *) input, *utf8ptr = utf8;
261 size_t inleft = input_size, utf8left = utf8_size;
262 size_t n;
263 off_t ret;
264
265 cd_utf8_strict = iconv_open ("UTF-8", try_from_code);
266 if (cd_utf8_strict == (iconv_t) -1) {
267 error (0, errno, "iconv_open (\"UTF-8\", \"%s\")",
268 try_from_code);
269 return 0;
270 }
271
272 n = iconv (cd_utf8_strict, (ICONV_CONST char **) &inptr, &inleft,
273 &utf8ptr, &utf8left);
274 if (n == (size_t) -1)
275 ret = inptr - input;
276 else
277 ret = 0;
278
279 iconv_close (cd_utf8_strict);
280
281 return ret;
282 }
283
/* Result of one try_iconv() attempt, telling manconv() whether to stop,
 * move on to the next candidate source encoding, or fail.
 */
typedef enum {
	TRIED_ICONV_OK = 0,
	TRIED_ICONV_ERROR = -1,	/* can continue with another encoding */
	TRIED_ICONV_FATAL = -2	/* must give up */
} tried_iconv;
289
290 static tried_iconv try_iconv (decompress *decomp, const char *try_from_code,
291 const char *to, bool last,
292 struct manconv_outbuf *outbuf)
293 {
294 char *try_to_code = xstrdup (to);
295 static const size_t buf_size = 65536;
296 size_t input_size = buf_size;
297 off_t input_pos = 0;
298 const char *input;
299 static char *utf8 = NULL, *output = NULL;
300 size_t utf8left = 0;
301 iconv_t cd_utf8, cd = NULL;
302 bool to_utf8 = STREQ (try_to_code, "UTF-8") ||
303 STRNEQ (try_to_code, "UTF-8//", 7);
304 const char *utf8_target = last ? "UTF-8//IGNORE" : "UTF-8";
305 bool ignore_errors = (strstr (try_to_code, "//IGNORE") != NULL);
306 tried_iconv ret = TRIED_ICONV_OK;
307
308 debug ("trying encoding %s -> %s\n", try_from_code, try_to_code);
309
310 cd_utf8 = iconv_open (utf8_target, try_from_code);
311 if (cd_utf8 == (iconv_t) -1) {
312 error (0, errno, "iconv_open (\"%s\", \"%s\")",
313 utf8_target, try_from_code);
314 free (try_to_code);
315 return TRIED_ICONV_ERROR;
316 }
317
318 if (!to_utf8) {
319 cd = iconv_open (try_to_code, "UTF-8");
320 if (cd == (iconv_t) -1) {
321 error (0, errno, "iconv_open (\"%s\", \"UTF-8\")",
322 try_to_code);
323 free (try_to_code);
324 return TRIED_ICONV_ERROR;
325 }
326 }
327
328 input = decompress_peek (decomp, &input_size);
329 if (input_size < buf_size) {
330 /* End of file, error, or just a short read? Repeat until we
331 * have either a full buffer or EOF/error.
332 */
333 while (input_size < buf_size) {
334 size_t old_input_size = input_size;
335 input_size = buf_size;
336 input = decompress_peek (decomp, &input_size);
337 if (input_size == old_input_size)
338 break;
339 }
340 }
341
342 if (!utf8)
343 utf8 = xmalloc (buf_size);
344 if (!output)
345 output = xmalloc (buf_size);
346
347 while (input_size || utf8left) {
348 int handle_iconv_errors = 0;
349 char *inptr = (char *) input, *utf8ptr = utf8, *outptr;
350 size_t inleft = input_size, outleft;
351 size_t n, n2 = -1;
352
353 if (!utf8left) {
354 /* First, convert the text to UTF-8. By assumption,
355 * all validly-encoded text can be converted to
356 * UTF-8 assuming that we picked the correct
357 * encoding. Any errors at this stage are due to
358 * selecting an incorrect encoding, or due to
359 * misencoded source text.
360 */
361 utf8left = buf_size;
362 n = iconv (cd_utf8, (ICONV_CONST char **) &inptr,
363 &inleft, &utf8ptr, &utf8left);
364 utf8left = buf_size - utf8left;
365
366 /* If we need to try the next encoding, do that
367 * before writing anything.
368 */
369 if (!last && n == (size_t) -1 &&
370 (errno == EILSEQ ||
371 (errno == EINVAL && input_size < buf_size))) {
372 ret = TRIED_ICONV_ERROR;
373 break;
374 } else if (n == (size_t) -1)
375 handle_iconv_errors = errno;
376 }
377
378 /* If the target encoding is UTF-8 (the common case), then
379 * we can just write out what we've got. Otherwise, we need
380 * to convert to the target encoding. Any errors at this
381 * stage are due to characters that are not representable in
382 * the target encoding.
383 */
384 if (handle_iconv_errors)
385 /* Fall back to error handling below. If we have
386 * anything to write out, we'll do it next time
387 * round the loop.
388 */
389 outptr = output;
390 else if (to_utf8) {
391 memcpy (output, utf8, utf8left);
392 outptr = output + utf8left;
393 outleft = utf8left;
394 utf8left = 0;
395 } else if (utf8left) {
396 outptr = output;
397 outleft = buf_size;
398 utf8ptr = utf8;
399 n2 = iconv (
400 cd, (ICONV_CONST char **) &utf8ptr, &utf8left,
401 &outptr, &outleft);
402 outleft = buf_size - outleft;
403 if (n2 == (size_t) -1)
404 handle_iconv_errors = errno;
405
406 if (n2 == (size_t) -1 &&
407 errno == EILSEQ && ignore_errors)
408 errno = 0;
409 } else
410 /* We appear to have converted some input text, but
411 * not actually ended up with any UTF-8 text. This
412 * is odd. However, we can at least continue round
413 * the loop, skip the input text we converted, and
414 * then we should get a different result next time.
415 */
416 outptr = output;
417
418 if (outptr != output) {
419 /* We have something to write out. */
420 if (add_output (output, outleft, outbuf) != 0) {
421 ret = TRIED_ICONV_FATAL;
422 goto out;
423 }
424 }
425
426 if (!to_utf8 && n2 != (size_t) -1) {
427 /* All the UTF-8 text we have so far was processed.
428 * For state-dependent character sets we have to
429 * flush the state now.
430 */
431 outptr = output;
432 outleft = buf_size;
433 iconv (cd, NULL, NULL, &outptr, &outleft);
434 outleft = buf_size - outleft;
435
436 if (outptr != output) {
437 /* We have something to write out. */
438 if (add_output (output, outleft,
439 outbuf) != 0) {
440 ret = TRIED_ICONV_FATAL;
441 goto out;
442 }
443 }
444 } else if (handle_iconv_errors) {
445 intmax_t error_pos;
446
447 if (handle_iconv_errors == EILSEQ && !ignore_errors) {
448 if (!quiet) {
449 error_pos = input_pos + locate_error (
450 try_from_code,
451 input, input_size,
452 utf8, buf_size);
453 error (0, handle_iconv_errors,
454 "byte %jd: iconv", error_pos);
455 }
456 ret = TRIED_ICONV_FATAL;
457 goto out;
458 } else if (handle_iconv_errors == EINVAL &&
459 input_size < buf_size) {
460 if (!quiet) {
461 error_pos = input_pos + locate_error (
462 try_from_code,
463 input, input_size,
464 utf8, buf_size);
465 error (0, 0, "byte %jd: %s", error_pos,
466 _("iconv: incomplete character "
467 "at end of buffer"));
468 }
469 ret = TRIED_ICONV_FATAL;
470 goto out;
471 }
472 }
473
474 if (inptr != input) {
475 decompress_peek_skip (decomp, input_size - inleft);
476 input_pos += input_size - inleft;
477 }
478
479 /* Unless we have some UTF-8 text left (which will only
480 * happen if the output encoding is more verbose than UTF-8,
481 * so is unlikely for legacy encodings), we need to fetch
482 * more input text now.
483 */
484 if (!utf8left) {
485 input_size = buf_size;
486 input = decompress_peek (decomp, &input_size);
487 while (input_size < buf_size) {
488 size_t old_input_size = input_size;
489 input_size = buf_size;
490 input = decompress_peek (decomp, &input_size);
491 if (input_size == old_input_size)
492 break;
493 }
494 }
495 }
496
497 out:
498 if (!to_utf8)
499 iconv_close (cd);
500 iconv_close (cd_utf8);
501 free (try_to_code);
502
503 return ret;
504 }
505
506 int manconv (decompress *decomp, gl_list_t from, const char *to,
507 struct manconv_outbuf *outbuf)
508 {
509 char *pp_encoding;
510 const char *try_from_code;
511 char *plain_to, *modified_pp_line = NULL;
512 tried_iconv tried;
513 int ret = 0;
514
515 plain_to = xstrndup (to, strcspn (to, "/"));
516 pp_encoding = check_preprocessor_encoding
517 (decomp, plain_to, &modified_pp_line);
518 if (pp_encoding) {
519 if (modified_pp_line) {
520 size_t len = strlen (modified_pp_line);
521 decompress_readline (decomp);
522 if (add_output (modified_pp_line, len, outbuf) != 0) {
523 ret = -1;
524 goto out;
525 }
526 }
527 tried = try_iconv (decomp, pp_encoding, to, 1, outbuf);
528 if (tried == TRIED_ICONV_FATAL)
529 ret = -1;
530 } else {
531 GL_LIST_FOREACH (from, try_from_code) {
532 bool last = !gl_list_next_node (from, from_node);
533 tried = try_iconv (decomp, try_from_code, to, last,
534 outbuf);
535 if (tried == TRIED_ICONV_OK)
536 break;
537 else if (tried == TRIED_ICONV_FATAL) {
538 ret = -1;
539 goto out;
540 }
541 }
542 }
543
544 out:
545 free (modified_pp_line);
546 free (pp_encoding);
547 free (plain_to);
548 return ret;
549 }
550
551 #else /* !HAVE_ICONV */
552
553 /* If we don't have iconv, there isn't much we can do; just pass everything
554 * through unchanged.
555 */
556 int manconv (decompress *decomp, gl_list_t from MAYBE_UNUSED,
557 const char *to MAYBE_UNUSED, struct manconv_outbuf *outbuf)
558 {
559 for (;;) {
560 size_t len = 4096;
561 const char *buffer = decompress_read (decomp, &len);
562 if (len == 0)
563 break;
564 if (add_output (buffer, len, outbuf) != 0)
565 return -1;
566 }
567 return 0;
568 }
569
570 #endif /* HAVE_ICONV */