1 /* Character set conversion with error handling.
2 Copyright (C) 2001-2023 Free Software Foundation, Inc.
3 Written by Bruno Haible and Simon Josefsson.
4
5 This file is free software: you can redistribute it and/or modify
6 it under the terms of the GNU Lesser General Public License as
7 published by the Free Software Foundation; either version 2.1 of the
8 License, or (at your option) any later version.
9
10 This file is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public License
16 along with this program. If not, see <https://www.gnu.org/licenses/>. */
17
18 #include <config.h>
19
20 /* Specification. */
21 #include "striconveh.h"
22
23 #include <errno.h>
24 #include <stdlib.h>
25 #include <string.h>
26
27 #if HAVE_ICONV
28 # include <iconv.h>
29 # include "unistr.h"
30 #endif
31
32 #include "c-strcase.h"
33 #include "c-strcaseeq.h"
34
35 #ifndef SIZE_MAX
36 # define SIZE_MAX ((size_t) -1)
37 #endif
38
39
40 #if HAVE_ICONV
41
42 /* The caller must provide an iconveh_t, not just an iconv_t, because when a
43 conversion error occurs, we may have to determine the Unicode representation
44 of the inconvertible character. */
45
46 int
47 iconveh_open (const char *to_codeset, const char *from_codeset, iconveh_t *cdp)
48 {
49 iconv_t cd;
50 iconv_t cd1;
51 iconv_t cd2;
52
53 /* Avoid glibc-2.1 bug with EUC-KR. */
54 # if ((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
55 && !defined _LIBICONV_VERSION
56 if (c_strcasecmp (from_codeset, "EUC-KR") == 0
57 || c_strcasecmp (to_codeset, "EUC-KR") == 0)
58 {
59 errno = EINVAL;
60 return -1;
61 }
62 # endif
63
64 cd = iconv_open (to_codeset, from_codeset);
65
66 if (STRCASEEQ (from_codeset, "UTF-8", 'U','T','F','-','8',0,0,0,0))
67 cd1 = (iconv_t)(-1);
68 else
69 {
70 cd1 = iconv_open ("UTF-8", from_codeset);
71 if (cd1 == (iconv_t)(-1))
72 {
73 int saved_errno = errno;
74 if (cd != (iconv_t)(-1))
75 iconv_close (cd);
76 errno = saved_errno;
77 return -1;
78 }
79 }
80
81 if (STRCASEEQ (to_codeset, "UTF-8", 'U','T','F','-','8',0,0,0,0)
82 # if (((__GLIBC__ == 2 && __GLIBC_MINOR__ >= 2) || __GLIBC__ > 2) \
83 && !defined __UCLIBC__) \
84 || _LIBICONV_VERSION >= 0x0105
85 || c_strcasecmp (to_codeset, "UTF-8//TRANSLIT") == 0
86 # endif
87 )
88 cd2 = (iconv_t)(-1);
89 else
90 {
91 cd2 = iconv_open (to_codeset, "UTF-8");
92 if (cd2 == (iconv_t)(-1))
93 {
94 int saved_errno = errno;
95 if (cd1 != (iconv_t)(-1))
96 iconv_close (cd1);
97 if (cd != (iconv_t)(-1))
98 iconv_close (cd);
99 errno = saved_errno;
100 return -1;
101 }
102 }
103
104 cdp->cd = cd;
105 cdp->cd1 = cd1;
106 cdp->cd2 = cd2;
107 return 0;
108 }
109
110 int
111 iconveh_close (const iconveh_t *cd)
112 {
113 if (cd->cd2 != (iconv_t)(-1) && iconv_close (cd->cd2) < 0)
114 {
115 /* Return -1, but preserve the errno from iconv_close. */
116 int saved_errno = errno;
117 if (cd->cd1 != (iconv_t)(-1))
118 iconv_close (cd->cd1);
119 if (cd->cd != (iconv_t)(-1))
120 iconv_close (cd->cd);
121 errno = saved_errno;
122 return -1;
123 }
124 if (cd->cd1 != (iconv_t)(-1) && iconv_close (cd->cd1) < 0)
125 {
126 /* Return -1, but preserve the errno from iconv_close. */
127 int saved_errno = errno;
128 if (cd->cd != (iconv_t)(-1))
129 iconv_close (cd->cd);
130 errno = saved_errno;
131 return -1;
132 }
133 if (cd->cd != (iconv_t)(-1) && iconv_close (cd->cd) < 0)
134 return -1;
135 return 0;
136 }
137
/* iconv_carefully works like iconv, with two differences: it stops as soon
   as it encounters a conversion error, and it stores in *INCREMENTED a
   boolean telling whether the input pointers have already been moved past
   the error location.  */
# if !defined _LIBICONV_VERSION && !(defined __GLIBC__ && !defined __UCLIBC__)
/* Irix iconv() inserts a NUL byte if it cannot convert.
   NetBSD iconv() inserts a question mark if it cannot convert.
   Only GNU libiconv and GNU libc are known to prefer to fail rather
   than doing a lossy conversion.
   Here the input is therefore handed to iconv() in the smallest pieces it
   accepts, and a positive return value is turned into an EILSEQ failure.  */
static size_t
iconv_carefully (iconv_t cd,
                 const char **inbuf, size_t *inbytesleft,
                 char **outbuf, size_t *outbytesleft,
                 bool *incremented)
{
  const char *cursor = *inbuf;
  const char *const limit = cursor + *inbytesleft;
  char *dest = *outbuf;
  size_t destroom = *outbytesleft;
  const char *step_start;
  size_t status;

  do
    {
      size_t chunk = 1;

      step_start = cursor;
      status = (size_t)(-1);

      /* Offer a longer and longer piece of input, as long as iconv reports
         the piece as incomplete.  */
      while (cursor + chunk <= limit)
        {
          status = iconv (cd,
                          (ICONV_CONST char **) &cursor, &chunk,
                          &dest, &destroom);
          if (status != (size_t)(-1) || errno != EINVAL)
            break;
          /* iconv can eat up a shift sequence but give EINVAL while
             attempting to convert the first character.  E.g. libiconv does
             this.  Count the consumed shift sequence as a successful
             step.  */
          if (cursor > step_start)
            {
              status = 0;
              break;
            }
          chunk++;
        }

      /* Only a clean step is committed to the caller's output pointers.  */
      if (status == 0)
        {
          *outbuf = dest;
          *outbytesleft = destroom;
        }
    }
  while (status == 0 && cursor < limit);

  *inbuf = cursor;
  *inbytesleft = limit - cursor;

  if (status == 0 || status == (size_t)(-1))
    {
      *incremented = false;
      return status;
    }

  /* A positive count means a lossy conversion took place.
     iconv() has already incremented the input pointer.  We cannot go back
     to a previous position, otherwise the state inside CD would become
     invalid, if FROM_CODESET is a stateful encoding.  So, tell the caller
     that *INBUF has already been incremented.  */
  *incremented = (cursor > step_start);
  errno = EILSEQ;
  return (size_t)(-1);
}
# else
#  define iconv_carefully(cd, inbuf, inbytesleft, outbuf, outbytesleft, incremented) \
     (*(incremented) = false, \
      iconv (cd, (ICONV_CONST char **) (inbuf), inbytesleft, outbuf, outbytesleft))
# endif
213
214 /* iconv_carefully_1 is like iconv_carefully, except that it stops after
215 converting one character or one shift sequence. */
216 static size_t
217 iconv_carefully_1 (iconv_t cd,
218 const char **inbuf, size_t *inbytesleft,
219 char **outbuf, size_t *outbytesleft,
220 bool *incremented)
221 {
222 const char *inptr_before = *inbuf;
223 const char *inptr = inptr_before;
224 const char *inptr_end = inptr_before + *inbytesleft;
225 char *outptr = *outbuf;
226 size_t outsize = *outbytesleft;
227 size_t res = (size_t)(-1);
228 size_t insize;
229
230 for (insize = 1; inptr_before + insize <= inptr_end; insize++)
231 {
232 inptr = inptr_before;
233 res = iconv (cd,
234 (ICONV_CONST char **) &inptr, &insize,
235 &outptr, &outsize);
236 if (!(res == (size_t)(-1) && errno == EINVAL))
237 break;
238 /* iconv can eat up a shift sequence but give EINVAL while attempting
239 to convert the first character. E.g. libiconv does this. */
240 if (inptr > inptr_before)
241 {
242 res = 0;
243 break;
244 }
245 }
246
247 *inbuf = inptr;
248 *inbytesleft = inptr_end - inptr;
249 # if !defined _LIBICONV_VERSION && !(defined __GLIBC__ && !defined __UCLIBC__)
250 /* Irix iconv() inserts a NUL byte if it cannot convert.
251 NetBSD iconv() inserts a question mark if it cannot convert.
252 Only GNU libiconv and GNU libc are known to prefer to fail rather
253 than doing a lossy conversion. */
254 if (res != (size_t)(-1) && res > 0)
255 {
256 /* iconv() has already incremented INPTR. We cannot go back to a
257 previous INPTR, otherwise the state inside CD would become invalid,
258 if FROM_CODESET is a stateful encoding. So, tell the caller that
259 *INBUF has already been incremented. */
260 *incremented = (inptr > inptr_before);
261 errno = EILSEQ;
262 return (size_t)(-1);
263 }
264 # endif
265
266 if (res != (size_t)(-1))
267 {
268 *outbuf = outptr;
269 *outbytesleft = outsize;
270 }
271 *incremented = false;
272 return res;
273 }
274
275 /* utf8conv_carefully is like iconv, except that
276 - it converts from UTF-8 to UTF-8,
277 - it stops as soon as it encounters a conversion error, and it returns
278 in *INCREMENTED a boolean telling whether it has incremented the input
279 pointers past the error location,
280 - if one_character_only is true, it stops after converting one
281 character. */
282 static size_t
283 utf8conv_carefully (bool one_character_only,
284 const char **inbuf, size_t *inbytesleft,
285 char **outbuf, size_t *outbytesleft,
286 bool *incremented)
287 {
288 const char *inptr = *inbuf;
289 size_t insize = *inbytesleft;
290 char *outptr = *outbuf;
291 size_t outsize = *outbytesleft;
292 size_t res;
293
294 res = 0;
295 do
296 {
297 ucs4_t uc;
298 int n;
299 int m;
300
301 n = u8_mbtoucr (&uc, (const uint8_t *) inptr, insize);
302 if (n < 0)
303 {
304 errno = (n == -2 ? EINVAL : EILSEQ);
305 n = u8_mbtouc (&uc, (const uint8_t *) inptr, insize);
306 inptr += n;
307 insize -= n;
308 res = (size_t)(-1);
309 *incremented = true;
310 break;
311 }
312 if (outsize == 0)
313 {
314 errno = E2BIG;
315 res = (size_t)(-1);
316 *incremented = false;
317 break;
318 }
319 m = u8_uctomb ((uint8_t *) outptr, uc, outsize);
320 if (m == -2)
321 {
322 errno = E2BIG;
323 res = (size_t)(-1);
324 *incremented = false;
325 break;
326 }
327 inptr += n;
328 insize -= n;
329 if (m == -1)
330 {
331 errno = EILSEQ;
332 res = (size_t)(-1);
333 *incremented = true;
334 break;
335 }
336 outptr += m;
337 outsize -= m;
338 }
339 while (!one_character_only && insize > 0);
340
341 *inbuf = inptr;
342 *inbytesleft = insize;
343 *outbuf = outptr;
344 *outbytesleft = outsize;
345 return res;
346 }
347
348 static int
349 mem_cd_iconveh_internal (const char *src, size_t srclen,
350 iconv_t cd, iconv_t cd1, iconv_t cd2,
351 enum iconv_ilseq_handler handler,
352 size_t extra_alloc,
353 size_t *offsets,
354 char **resultp, size_t *lengthp)
355 {
356 /* When a conversion error occurs, we cannot start using CD1 and CD2 at
357 this point: FROM_CODESET may be a stateful encoding like ISO-2022-KR.
358 Instead, we have to start afresh from the beginning of SRC. */
359 /* Use a temporary buffer, so that for small strings, a single malloc()
360 call will be sufficient. */
361 # define tmpbufsize 4096
362 /* The alignment is needed when converting e.g. to glibc's WCHAR_T or
363 libiconv's UCS-4-INTERNAL encoding. */
364 union { unsigned int align; char buf[tmpbufsize]; } tmp;
365 # define tmpbuf tmp.buf
366
367 char *initial_result;
368 char *result;
369 size_t allocated;
370 size_t length;
371 size_t last_length = (size_t)(-1); /* only needed if offsets != NULL */
372
373 if (*resultp != NULL && *lengthp >= sizeof (tmpbuf))
374 {
375 initial_result = *resultp;
376 allocated = *lengthp;
377 }
378 else
379 {
380 initial_result = tmpbuf;
381 allocated = sizeof (tmpbuf);
382 }
383 result = initial_result;
384
385 /* Test whether a direct conversion is possible at all. */
386 if (cd == (iconv_t)(-1))
387 goto indirectly;
388
389 if (offsets != NULL)
390 {
391 size_t i;
392
393 for (i = 0; i < srclen; i++)
394 offsets[i] = (size_t)(-1);
395
396 last_length = (size_t)(-1);
397 }
398 length = 0;
399
400 /* First, try a direct conversion, and see whether a conversion error
401 occurs at all. */
402 {
403 const char *inptr = src;
404 size_t insize = srclen;
405
406 /* Avoid glibc-2.1 bug and Solaris 2.7-2.9 bug. */
407 # if defined _LIBICONV_VERSION \
408 || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
409 || defined __sun)
410 /* Set to the initial state. */
411 iconv (cd, NULL, NULL, NULL, NULL);
412 # endif
413
414 while (insize > 0)
415 {
416 char *outptr = result + length;
417 size_t outsize = allocated - extra_alloc - length;
418 bool incremented;
419 size_t res;
420 bool grow;
421
422 if (offsets != NULL)
423 {
424 if (length != last_length) /* ensure that offset[] be increasing */
425 {
426 offsets[inptr - src] = length;
427 last_length = length;
428 }
429 res = iconv_carefully_1 (cd,
430 &inptr, &insize,
431 &outptr, &outsize,
432 &incremented);
433 }
434 else
435 /* Use iconv_carefully instead of iconv here, because:
436 - If TO_CODESET is UTF-8, we can do the error handling in this
437 loop, no need for a second loop,
438 - With iconv() implementations other than GNU libiconv and GNU
439 libc, if we use iconv() in a big swoop, checking for an E2BIG
440 return, we lose the number of irreversible conversions. */
441 res = iconv_carefully (cd,
442 &inptr, &insize,
443 &outptr, &outsize,
444 &incremented);
445
446 length = outptr - result;
447 grow = (length + extra_alloc > allocated / 2);
448 if (res == (size_t)(-1))
449 {
450 if (errno == E2BIG)
451 grow = true;
452 else if (errno == EINVAL)
453 break;
454 else if (errno == EILSEQ && handler != iconveh_error)
455 {
456 if (cd2 == (iconv_t)(-1))
457 {
458 /* TO_CODESET is UTF-8. */
459 /* Error handling can produce up to 1 or 3 bytes of
460 output. */
461 size_t extra_need =
462 (handler == iconveh_replacement_character ? 3 : 1);
463 if (length + extra_need + extra_alloc > allocated)
464 {
465 char *memory;
466
467 allocated = 2 * allocated;
468 if (length + extra_need + extra_alloc > allocated)
469 allocated = 2 * allocated;
470 if (length + extra_need + extra_alloc > allocated)
471 abort ();
472 if (result == initial_result)
473 memory = (char *) malloc (allocated);
474 else
475 memory = (char *) realloc (result, allocated);
476 if (memory == NULL)
477 {
478 if (result != initial_result)
479 free (result);
480 errno = ENOMEM;
481 return -1;
482 }
483 if (result == initial_result)
484 memcpy (memory, initial_result, length);
485 result = memory;
486 grow = false;
487 }
488 /* The input is invalid in FROM_CODESET. Eat up one byte
489 and emit a replacement character or a question mark. */
490 if (!incremented)
491 {
492 if (insize == 0)
493 abort ();
494 inptr++;
495 insize--;
496 }
497 if (handler == iconveh_replacement_character)
498 {
499 /* U+FFFD in UTF-8 encoding. */
500 result[length+0] = '\357';
501 result[length+1] = '\277';
502 result[length+2] = '\275';
503 length += 3;
504 }
505 else
506 {
507 result[length] = '?';
508 length++;
509 }
510 }
511 else
512 goto indirectly;
513 }
514 else
515 {
516 if (result != initial_result)
517 free (result);
518 return -1;
519 }
520 }
521 if (insize == 0)
522 break;
523 if (grow)
524 {
525 char *memory;
526
527 allocated = 2 * allocated;
528 if (result == initial_result)
529 memory = (char *) malloc (allocated);
530 else
531 memory = (char *) realloc (result, allocated);
532 if (memory == NULL)
533 {
534 if (result != initial_result)
535 free (result);
536 errno = ENOMEM;
537 return -1;
538 }
539 if (result == initial_result)
540 memcpy (memory, initial_result, length);
541 result = memory;
542 }
543 }
544 }
545
546 /* Now get the conversion state back to the initial state.
547 But avoid glibc-2.1 bug and Solaris 2.7 bug. */
548 #if defined _LIBICONV_VERSION \
549 || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
550 || defined __sun)
551 for (;;)
552 {
553 char *outptr = result + length;
554 size_t outsize = allocated - extra_alloc - length;
555 size_t res;
556
557 res = iconv (cd, NULL, NULL, &outptr, &outsize);
558 length = outptr - result;
559 if (res == (size_t)(-1))
560 {
561 if (errno == E2BIG)
562 {
563 char *memory;
564
565 allocated = 2 * allocated;
566 if (result == initial_result)
567 memory = (char *) malloc (allocated);
568 else
569 memory = (char *) realloc (result, allocated);
570 if (memory == NULL)
571 {
572 if (result != initial_result)
573 free (result);
574 errno = ENOMEM;
575 return -1;
576 }
577 if (result == initial_result)
578 memcpy (memory, initial_result, length);
579 result = memory;
580 }
581 else
582 {
583 if (result != initial_result)
584 free (result);
585 return -1;
586 }
587 }
588 else
589 break;
590 }
591 #endif
592
593 /* The direct conversion succeeded. */
594 goto done;
595
596 indirectly:
597 /* The direct conversion failed.
598 Use a conversion through UTF-8. */
599 if (offsets != NULL)
600 {
601 size_t i;
602
603 for (i = 0; i < srclen; i++)
604 offsets[i] = (size_t)(-1);
605
606 last_length = (size_t)(-1);
607 }
608 length = 0;
609 {
610 const bool slowly = (offsets != NULL || handler == iconveh_error);
611 # define utf8bufsize 4096 /* may also be smaller or larger than tmpbufsize */
612 char utf8buf[utf8bufsize + 3];
613 size_t utf8len = 0;
614 const char *in1ptr = src;
615 size_t in1size = srclen;
616 bool do_final_flush1 = true;
617 bool do_final_flush2 = true;
618
619 /* Avoid glibc-2.1 bug and Solaris 2.7-2.9 bug. */
620 # if defined _LIBICONV_VERSION \
621 || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
622 || defined __sun)
623 /* Set to the initial state. */
624 if (cd1 != (iconv_t)(-1))
625 iconv (cd1, NULL, NULL, NULL, NULL);
626 if (cd2 != (iconv_t)(-1))
627 iconv (cd2, NULL, NULL, NULL, NULL);
628 # endif
629
630 while (in1size > 0 || do_final_flush1 || utf8len > 0 || do_final_flush2)
631 {
632 char *out1ptr = utf8buf + utf8len;
633 size_t out1size = utf8bufsize - utf8len;
634 bool incremented1;
635 size_t res1;
636 int errno1;
637
638 /* Conversion step 1: from FROM_CODESET to UTF-8. */
639 if (in1size > 0)
640 {
641 if (offsets != NULL
642 && length != last_length) /* ensure that offset[] be increasing */
643 {
644 offsets[in1ptr - src] = length;
645 last_length = length;
646 }
647 if (cd1 != (iconv_t)(-1))
648 {
649 if (slowly)
650 res1 = iconv_carefully_1 (cd1,
651 &in1ptr, &in1size,
652 &out1ptr, &out1size,
653 &incremented1);
654 else
655 res1 = iconv_carefully (cd1,
656 &in1ptr, &in1size,
657 &out1ptr, &out1size,
658 &incremented1);
659 }
660 else
661 {
662 /* FROM_CODESET is UTF-8. */
663 res1 = utf8conv_carefully (slowly,
664 &in1ptr, &in1size,
665 &out1ptr, &out1size,
666 &incremented1);
667 }
668 }
669 else if (do_final_flush1)
670 {
671 /* Now get the conversion state of CD1 back to the initial state.
672 But avoid glibc-2.1 bug and Solaris 2.7 bug. */
673 # if defined _LIBICONV_VERSION \
674 || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
675 || defined __sun)
676 if (cd1 != (iconv_t)(-1))
677 res1 = iconv (cd1, NULL, NULL, &out1ptr, &out1size);
678 else
679 # endif
680 res1 = 0;
681 do_final_flush1 = false;
682 incremented1 = true;
683 }
684 else
685 {
686 res1 = 0;
687 incremented1 = true;
688 }
689 if (res1 == (size_t)(-1)
690 && !(errno == E2BIG || errno == EINVAL || errno == EILSEQ))
691 {
692 if (result != initial_result)
693 free (result);
694 return -1;
695 }
696 if (res1 == (size_t)(-1)
697 && errno == EILSEQ && handler != iconveh_error)
698 {
699 /* The input is invalid in FROM_CODESET. Eat up one byte and
700 emit a U+FFFD character or a question mark. Room for this
701 character was allocated at the end of utf8buf. */
702 if (!incremented1)
703 {
704 if (in1size == 0)
705 abort ();
706 in1ptr++;
707 in1size--;
708 }
709 if (handler == iconveh_replacement_character)
710 {
711 /* U+FFFD in UTF-8 encoding. */
712 out1ptr[0] = '\357';
713 out1ptr[1] = '\277';
714 out1ptr[2] = '\275';
715 out1ptr += 3;
716 }
717 else
718 *out1ptr++ = '?';
719 res1 = 0;
720 }
721 errno1 = errno;
722 utf8len = out1ptr - utf8buf;
723
724 if (offsets != NULL
725 || in1size == 0
726 || utf8len > utf8bufsize / 2
727 || (res1 == (size_t)(-1) && errno1 == E2BIG))
728 {
729 /* Conversion step 2: from UTF-8 to TO_CODESET. */
730 const char *in2ptr = utf8buf;
731 size_t in2size = utf8len;
732
733 while (in2size > 0
734 || (in1size == 0 && !do_final_flush1 && do_final_flush2))
735 {
736 char *out2ptr = result + length;
737 size_t out2size = allocated - extra_alloc - length;
738 bool incremented2;
739 size_t res2;
740 bool grow;
741
742 if (in2size > 0)
743 {
744 if (cd2 != (iconv_t)(-1))
745 res2 = iconv_carefully (cd2,
746 &in2ptr, &in2size,
747 &out2ptr, &out2size,
748 &incremented2);
749 else
750 /* TO_CODESET is UTF-8. */
751 res2 = utf8conv_carefully (false,
752 &in2ptr, &in2size,
753 &out2ptr, &out2size,
754 &incremented2);
755 }
756 else /* in1size == 0 && !do_final_flush1
757 && in2size == 0 && do_final_flush2 */
758 {
759 /* Now get the conversion state of CD1 back to the initial
760 state. But avoid glibc-2.1 bug and Solaris 2.7 bug. */
761 # if defined _LIBICONV_VERSION \
762 || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
763 || defined __sun)
764 if (cd2 != (iconv_t)(-1))
765 res2 = iconv (cd2, NULL, NULL, &out2ptr, &out2size);
766 else
767 # endif
768 res2 = 0;
769 do_final_flush2 = false;
770 incremented2 = true;
771 }
772
773 length = out2ptr - result;
774 grow = (length + extra_alloc > allocated / 2);
775 if (res2 == (size_t)(-1))
776 {
777 if (errno == E2BIG)
778 grow = true;
779 else if (errno == EINVAL)
780 break;
781 else if (errno == EILSEQ && handler != iconveh_error)
782 {
783 /* Error handling can produce up to 10 bytes of UTF-8
784 output. But TO_CODESET may be UCS-2, UTF-16 or
785 UCS-4, so use CD2 here as well. */
786 char scratchbuf[10];
787 size_t scratchlen;
788 ucs4_t uc;
789 const char *inptr;
790 size_t insize;
791 size_t res;
792
793 if (incremented2)
794 {
795 if (u8_prev (&uc, (const uint8_t *) in2ptr,
796 (const uint8_t *) utf8buf)
797 == NULL)
798 abort ();
799 }
800 else
801 {
802 int n;
803 if (in2size == 0)
804 abort ();
805 n = u8_mbtouc_unsafe (&uc, (const uint8_t *) in2ptr,
806 in2size);
807 in2ptr += n;
808 in2size -= n;
809 }
810
811 if (handler == iconveh_escape_sequence)
812 {
813 static char const hex[16] = "0123456789ABCDEF";
814 scratchlen = 0;
815 scratchbuf[scratchlen++] = '\\';
816 if (uc < 0x10000)
817 scratchbuf[scratchlen++] = 'u';
818 else
819 {
820 scratchbuf[scratchlen++] = 'U';
821 scratchbuf[scratchlen++] = hex[(uc>>28) & 15];
822 scratchbuf[scratchlen++] = hex[(uc>>24) & 15];
823 scratchbuf[scratchlen++] = hex[(uc>>20) & 15];
824 scratchbuf[scratchlen++] = hex[(uc>>16) & 15];
825 }
826 scratchbuf[scratchlen++] = hex[(uc>>12) & 15];
827 scratchbuf[scratchlen++] = hex[(uc>>8) & 15];
828 scratchbuf[scratchlen++] = hex[(uc>>4) & 15];
829 scratchbuf[scratchlen++] = hex[uc & 15];
830 }
831 else if (handler == iconveh_replacement_character)
832 {
833 /* U+FFFD in UTF-8 encoding. */
834 scratchbuf[0] = '\357';
835 scratchbuf[1] = '\277';
836 scratchbuf[2] = '\275';
837 scratchlen = 3;
838 }
839 else
840 {
841 scratchbuf[0] = '?';
842 scratchlen = 1;
843 }
844
845 inptr = scratchbuf;
846 insize = scratchlen;
847 if (cd2 != (iconv_t)(-1))
848 {
849 char *out2ptr_try = out2ptr;
850 size_t out2size_try = out2size;
851 res = iconv (cd2,
852 (ICONV_CONST char **) &inptr, &insize,
853 &out2ptr_try, &out2size_try);
854 if (handler == iconveh_replacement_character
855 && (res == (size_t)(-1)
856 ? errno == EILSEQ
857 /* FreeBSD iconv(), NetBSD iconv(), and
858 Solaris 11 iconv() insert a '?' if they
859 cannot convert. This is what we want.
860 But IRIX iconv() inserts a NUL byte if it
861 cannot convert.
862 And musl libc iconv() inserts a '*' if it
863 cannot convert. */
864 : (res > 0
865 && !(out2ptr_try - out2ptr == 1
866 && *out2ptr == '?'))))
867 {
868 /* The iconv() call failed.
869 U+FFFD can't be converted to TO_CODESET.
870 Use '?' instead. */
871 scratchbuf[0] = '?';
872 scratchlen = 1;
873 inptr = scratchbuf;
874 insize = scratchlen;
875 res = iconv (cd2,
876 (ICONV_CONST char **) &inptr, &insize,
877 &out2ptr, &out2size);
878 }
879 else
880 {
881 /* Accept the results of the iconv() call. */
882 out2ptr = out2ptr_try;
883 out2size = out2size_try;
884 res = 0;
885 }
886 }
887 else
888 {
889 /* TO_CODESET is UTF-8. */
890 if (out2size >= insize)
891 {
892 memcpy (out2ptr, inptr, insize);
893 out2ptr += insize;
894 out2size -= insize;
895 inptr += insize;
896 insize = 0;
897 res = 0;
898 }
899 else
900 {
901 errno = E2BIG;
902 res = (size_t)(-1);
903 }
904 }
905 length = out2ptr - result;
906 if (res == (size_t)(-1) && errno == E2BIG)
907 {
908 char *memory;
909
910 allocated = 2 * allocated;
911 if (length + 1 + extra_alloc > allocated)
912 abort ();
913 if (result == initial_result)
914 memory = (char *) malloc (allocated);
915 else
916 memory = (char *) realloc (result, allocated);
917 if (memory == NULL)
918 {
919 if (result != initial_result)
920 free (result);
921 errno = ENOMEM;
922 return -1;
923 }
924 if (result == initial_result)
925 memcpy (memory, initial_result, length);
926 result = memory;
927 grow = false;
928
929 out2ptr = result + length;
930 out2size = allocated - extra_alloc - length;
931 if (cd2 != (iconv_t)(-1))
932 res = iconv (cd2,
933 (ICONV_CONST char **) &inptr,
934 &insize,
935 &out2ptr, &out2size);
936 else
937 {
938 /* TO_CODESET is UTF-8. */
939 if (!(out2size >= insize))
940 abort ();
941 memcpy (out2ptr, inptr, insize);
942 out2ptr += insize;
943 out2size -= insize;
944 inptr += insize;
945 insize = 0;
946 res = 0;
947 }
948 length = out2ptr - result;
949 }
950 # if !defined _LIBICONV_VERSION && !(defined __GLIBC__ && !defined __UCLIBC__)
951 /* IRIX iconv() inserts a NUL byte if it cannot convert.
952 FreeBSD iconv(), NetBSD iconv(), and Solaris 11
953 iconv() insert a '?' if they cannot convert.
954 musl libc iconv() inserts a '*' if it cannot convert.
955 Only GNU libiconv and GNU libc are known to prefer
956 to fail rather than doing a lossy conversion. */
957 if (res != (size_t)(-1) && res > 0)
958 {
959 errno = EILSEQ;
960 res = (size_t)(-1);
961 }
962 # endif
963 if (res == (size_t)(-1))
964 {
965 /* Failure converting the ASCII replacement. */
966 if (result != initial_result)
967 free (result);
968 return -1;
969 }
970 }
971 else
972 {
973 if (result != initial_result)
974 free (result);
975 return -1;
976 }
977 }
978 if (!(in2size > 0
979 || (in1size == 0 && !do_final_flush1 && do_final_flush2)))
980 break;
981 if (grow)
982 {
983 char *memory;
984
985 allocated = 2 * allocated;
986 if (result == initial_result)
987 memory = (char *) malloc (allocated);
988 else
989 memory = (char *) realloc (result, allocated);
990 if (memory == NULL)
991 {
992 if (result != initial_result)
993 free (result);
994 errno = ENOMEM;
995 return -1;
996 }
997 if (result == initial_result)
998 memcpy (memory, initial_result, length);
999 result = memory;
1000 }
1001 }
1002
1003 /* Move the remaining bytes to the beginning of utf8buf. */
1004 if (in2size > 0)
1005 memmove (utf8buf, in2ptr, in2size);
1006 utf8len = in2size;
1007 }
1008
1009 if (res1 == (size_t)(-1))
1010 {
1011 if (errno1 == EINVAL)
1012 in1size = 0;
1013 else if (errno1 == EILSEQ)
1014 {
1015 if (result != initial_result)
1016 free (result);
1017 errno = errno1;
1018 return -1;
1019 }
1020 }
1021 }
1022 # undef utf8bufsize
1023 }
1024
1025 done:
1026 /* Now the final memory allocation. */
1027 if (result == tmpbuf)
1028 {
1029 size_t memsize = length + extra_alloc;
1030
1031 if (*resultp != NULL && *lengthp >= memsize)
1032 result = *resultp;
1033 else
1034 {
1035 char *memory;
1036
1037 memory = (char *) malloc (memsize > 0 ? memsize : 1);
1038 if (memory != NULL)
1039 result = memory;
1040 else
1041 {
1042 errno = ENOMEM;
1043 return -1;
1044 }
1045 }
1046 memcpy (result, tmpbuf, length);
1047 }
1048 else if (result != *resultp && length + extra_alloc < allocated)
1049 {
1050 /* Shrink the allocated memory if possible. */
1051 size_t memsize = length + extra_alloc;
1052 char *memory;
1053
1054 memory = (char *) realloc (result, memsize > 0 ? memsize : 1);
1055 if (memory != NULL)
1056 result = memory;
1057 }
1058 *resultp = result;
1059 *lengthp = length;
1060 return 0;
1061 # undef tmpbuf
1062 # undef tmpbufsize
1063 }
1064
1065 int
1066 mem_cd_iconveh (const char *src, size_t srclen,
1067 const iconveh_t *cd,
1068 enum iconv_ilseq_handler handler,
1069 size_t *offsets,
1070 char **resultp, size_t *lengthp)
1071 {
1072 return mem_cd_iconveh_internal (src, srclen, cd->cd, cd->cd1, cd->cd2,
1073 handler, 0, offsets, resultp, lengthp);
1074 }
1075
1076 char *
1077 str_cd_iconveh (const char *src,
1078 const iconveh_t *cd,
1079 enum iconv_ilseq_handler handler)
1080 {
1081 /* For most encodings, a trailing NUL byte in the input will be converted
1082 to a trailing NUL byte in the output. But not for UTF-7. So that this
1083 function is usable for UTF-7, we have to exclude the NUL byte from the
1084 conversion and add it by hand afterwards. */
1085 char *result = NULL;
1086 size_t length = 0;
1087 int retval = mem_cd_iconveh_internal (src, strlen (src),
1088 cd->cd, cd->cd1, cd->cd2, handler, 1,
1089 NULL, &result, &length);
1090
1091 if (retval < 0)
1092 {
1093 free (result);
1094 return NULL;
1095 }
1096
1097 /* Add the terminating NUL byte. */
1098 result[length] = '\0';
1099
1100 return result;
1101 }
1102
1103 #endif
1104
1105 int
1106 mem_iconveh (const char *src, size_t srclen,
1107 const char *from_codeset, const char *to_codeset,
1108 enum iconv_ilseq_handler handler,
1109 size_t *offsets,
1110 char **resultp, size_t *lengthp)
1111 {
1112 if (srclen == 0)
1113 {
1114 /* Nothing to convert. */
1115 *lengthp = 0;
1116 return 0;
1117 }
1118 else if (offsets == NULL && c_strcasecmp (from_codeset, to_codeset) == 0)
1119 {
1120 char *result;
1121
1122 if (*resultp != NULL && *lengthp >= srclen)
1123 result = *resultp;
1124 else
1125 {
1126 result = (char *) malloc (srclen);
1127 if (result == NULL)
1128 {
1129 errno = ENOMEM;
1130 return -1;
1131 }
1132 }
1133 memcpy (result, src, srclen);
1134 *resultp = result;
1135 *lengthp = srclen;
1136 return 0;
1137 }
1138 else
1139 {
1140 #if HAVE_ICONV
1141 iconveh_t cd;
1142 char *result;
1143 size_t length;
1144 int retval;
1145
1146 if (iconveh_open (to_codeset, from_codeset, &cd) < 0)
1147 return -1;
1148
1149 result = *resultp;
1150 length = *lengthp;
1151 retval = mem_cd_iconveh (src, srclen, &cd, handler, offsets,
1152 &result, &length);
1153
1154 if (retval < 0)
1155 {
1156 /* Close cd, but preserve the errno from str_cd_iconv. */
1157 int saved_errno = errno;
1158 iconveh_close (&cd);
1159 errno = saved_errno;
1160 }
1161 else
1162 {
1163 if (iconveh_close (&cd) < 0)
1164 {
1165 if (result != *resultp)
1166 free (result);
1167 return -1;
1168 }
1169 *resultp = result;
1170 *lengthp = length;
1171 }
1172 return retval;
1173 #else
1174 /* This is a different error code than if iconv_open existed but didn't
1175 support from_codeset and to_codeset, so that the caller can emit
1176 an error message such as
1177 "iconv() is not supported. Installing GNU libiconv and
1178 then reinstalling this package would fix this." */
1179 errno = ENOSYS;
1180 return -1;
1181 #endif
1182 }
1183 }
1184
1185 char *
1186 str_iconveh (const char *src,
1187 const char *from_codeset, const char *to_codeset,
1188 enum iconv_ilseq_handler handler)
1189 {
1190 if (*src == '\0' || c_strcasecmp (from_codeset, to_codeset) == 0)
1191 {
1192 char *result = strdup (src);
1193
1194 if (result == NULL)
1195 errno = ENOMEM;
1196 return result;
1197 }
1198 else
1199 {
1200 #if HAVE_ICONV
1201 iconveh_t cd;
1202 char *result;
1203
1204 if (iconveh_open (to_codeset, from_codeset, &cd) < 0)
1205 return NULL;
1206
1207 result = str_cd_iconveh (src, &cd, handler);
1208
1209 if (result == NULL)
1210 {
1211 /* Close cd, but preserve the errno from str_cd_iconv. */
1212 int saved_errno = errno;
1213 iconveh_close (&cd);
1214 errno = saved_errno;
1215 }
1216 else
1217 {
1218 if (iconveh_close (&cd) < 0)
1219 {
1220 free (result);
1221 return NULL;
1222 }
1223 }
1224 return result;
1225 #else
1226 /* This is a different error code than if iconv_open existed but didn't
1227 support from_codeset and to_codeset, so that the caller can emit
1228 an error message such as
1229 "iconv() is not supported. Installing GNU libiconv and
1230 then reinstalling this package would fix this." */
1231 errno = ENOSYS;
1232 return NULL;
1233 #endif
1234 }
1235 }