1  /* bug 19727: Testing UTF conversions with UTF16 surrogates as input.
       2     Copyright (C) 2016-2023 Free Software Foundation, Inc.
       3     This file is part of the GNU C Library.
       4  
       5     The GNU C Library is free software; you can redistribute it and/or
       6     modify it under the terms of the GNU Lesser General Public
       7     License as published by the Free Software Foundation; either
       8     version 2.1 of the License, or (at your option) any later version.
       9  
      10     The GNU C Library is distributed in the hope that it will be useful,
      11     but WITHOUT ANY WARRANTY; without even the implied warranty of
      12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
      13     Lesser General Public License for more details.
      14  
      15     You should have received a copy of the GNU Lesser General Public
      16     License along with the GNU C Library; if not, see
      17     <https://www.gnu.org/licenses/>.  */
      18  
      19  #include <stdio.h>
      20  #include <stdlib.h>
      21  #include <errno.h>
      22  #include <string.h>
      23  #include <inttypes.h>
      24  #include <iconv.h>
      25  #include <byteswap.h>
      26  
      27  static int
      28  run_conversion (const char *from, const char *to, char *inbuf, size_t inbuflen,
      29  		int exp_errno, int line)
      30  {
      31    char outbuf[16];
      32    iconv_t cd;
      33    char *inptr;
      34    size_t inlen;
      35    char *outptr;
      36    size_t outlen;
      37    size_t n;
      38    int e;
      39    int fails = 0;
      40  
      41    cd = iconv_open (to, from);
      42    if (cd == (iconv_t) -1)
      43      {
      44        printf ("line %d: cannot convert from %s to %s: %m\n", line, from, to);
      45        return 1;
      46      }
      47  
      48    inptr = (char *) inbuf;
      49    inlen = inbuflen;
      50    outptr = outbuf;
      51    outlen = sizeof (outbuf);
      52  
      53    errno = 0;
      54    n = iconv (cd, &inptr, &inlen, &outptr, &outlen);
      55    e = errno;
      56  
      57    if (exp_errno == 0)
      58      {
      59        if (n == (size_t) -1)
      60  	{
      61  	  puts ("n should be >= 0, but n == -1");
      62  	  fails ++;
      63  	}
      64  
      65        if (e != 0)
      66  	{
      67  	  printf ("errno should be 0: 'Success', but errno == %d: '%s'\n"
      68  		  , e, strerror(e));
      69  	  fails ++;
      70  	}
      71      }
      72    else
      73      {
      74        if (n != (size_t) -1)
      75  	{
      76  	  printf ("n should be -1, but n == %zd\n", n);
      77  	  fails ++;
      78  	}
      79  
      80        if (e != exp_errno)
      81  	{
      82  	  printf ("errno should be %d: '%s', but errno == %d: '%s'\n"
      83  		  , exp_errno, strerror (exp_errno), e, strerror (e));
      84  	  fails ++;
      85  	}
      86      }
      87  
      88    iconv_close (cd);
      89  
      90    if (fails > 0)
      91      {
      92        printf ("Errors in line %d while converting %s to %s.\n\n"
      93  	      , line, from, to);
      94      }
      95  
      96    return fails;
      97  }
      98  
      99  static int
     100  do_test (void)
     101  {
     102    int fails = 0;
     103    char buf[4];
     104  
     105    /* This test runs iconv() with UTF character in range of an UTF16 surrogate.
     106       UTF-16 high surrogate is in range 0xD800..0xDBFF and
     107       UTF-16 low surrogate is in range 0xDC00..0xDFFF.
     108       Converting from or to UTF-xx has to report errors in those cases.
     109       In UTF-16, surrogate pairs with a high surrogate in front of a low
     110       surrogate is valid.  */
     111  
     112    /* Use RUN_UCS4_UTF32_INPUT to test conversion ...
     113  
     114       ... from INTERNAL to UTF-xx[LE|BE]:
     115       Converting from UCS4 to UTF-xx[LE|BE] first converts UCS4 to INTERNAL
     116       without checking for UTF-16 surrogate values
     117       and then converts from INTERNAL to UTF-xx[LE|BE].
     118       The latter conversion has to report an error in those cases.
     119  
     120       ... from UTF-32[LE|BE] to INTERNAL:
     121       Converting directly from UTF-32LE to UTF-8|16 is needed,
     122       because e.g. s390x has iconv-modules which converts directly.  */
     123  #define RUN_UCS4_UTF32_INPUT(b0, b1, b2, b3, err, line)			\
     124    buf[0] = b0;								\
     125    buf[1] = b1;								\
     126    buf[2] = b2;								\
     127    buf[3] = b3;								\
     128    fails += run_conversion ("UCS4", "UTF-8", buf, 4, err, line);		\
     129    fails += run_conversion ("UCS4", "UTF-16LE", buf, 4, err, line);	\
     130    fails += run_conversion ("UCS4", "UTF-16BE", buf, 4, err, line);	\
     131    fails += run_conversion ("UCS4", "UTF-32LE", buf, 4, err, line);	\
     132    fails += run_conversion ("UCS4", "UTF-32BE", buf, 4, err, line);	\
     133    fails += run_conversion ("UTF-32BE", "WCHAR_T", buf, 4, err, line);	\
     134    fails += run_conversion ("UTF-32BE", "UTF-8", buf, 4, err, line);	\
     135    fails += run_conversion ("UTF-32BE", "UTF-16LE", buf, 4, err, line);	\
     136    fails += run_conversion ("UTF-32BE", "UTF-16BE", buf, 4, err, line);	\
     137    buf[0] = b3;								\
     138    buf[1] = b2;								\
     139    buf[2] = b1;								\
     140    buf[3] = b0;								\
     141    fails += run_conversion ("UTF-32LE", "WCHAR_T", buf, 4, err, line);	\
     142    fails += run_conversion ("UTF-32LE", "UTF-8", buf, 4, err, line);	\
     143    fails += run_conversion ("UTF-32LE", "UTF-16LE", buf, 4, err, line);	\
     144    fails += run_conversion ("UTF-32LE", "UTF-16BE", buf, 4, err, line);
     145  
     146    /* Use UCS4/UTF32 input of 0xD7FF.  */
     147    RUN_UCS4_UTF32_INPUT (0x0, 0x0, 0xD7, 0xFF, 0, __LINE__);
     148  
     149    /* Use UCS4/UTF32 input of 0xD800.  */
     150    RUN_UCS4_UTF32_INPUT (0x0, 0x0, 0xD8, 0x00, EILSEQ, __LINE__);
     151  
     152    /* Use UCS4/UTF32 input of 0xDBFF.  */
     153    RUN_UCS4_UTF32_INPUT (0x0, 0x0, 0xDB, 0xFF, EILSEQ, __LINE__);
     154  
     155    /* Use UCS4/UTF32 input of 0xDC00.  */
     156    RUN_UCS4_UTF32_INPUT (0x0, 0x0, 0xDC, 0x00, EILSEQ, __LINE__);
     157  
     158    /* Use UCS4/UTF32 input of 0xDFFF.  */
     159    RUN_UCS4_UTF32_INPUT (0x0, 0x0, 0xDF, 0xFF, EILSEQ, __LINE__);
     160  
     161    /* Use UCS4/UTF32 input of 0xE000.  */
     162    RUN_UCS4_UTF32_INPUT (0x0, 0x0, 0xE0, 0x00, 0, __LINE__);
     163  
     164  
     165    /* Use RUN_UTF16_INPUT to test conversion from UTF16[LE|BE] to INTERNAL.
     166       Converting directly from UTF-16 to UTF-8|32 is needed,
     167       because e.g. s390x has iconv-modules which converts directly.
     168       Use len == 2 or 4 to specify one or two UTF-16 characters.  */
     169  #define RUN_UTF16_INPUT(b0, b1, b2, b3, len, err, line)			\
     170    buf[0] = b0;								\
     171    buf[1] = b1;								\
     172    buf[2] = b2;								\
     173    buf[3] = b3;								\
     174    fails += run_conversion ("UTF-16BE", "WCHAR_T", buf, len, err, line);	\
     175    fails += run_conversion ("UTF-16BE", "UTF-8", buf, len, err, line);	\
     176    fails += run_conversion ("UTF-16BE", "UTF-32LE", buf, len, err, line); \
     177    fails += run_conversion ("UTF-16BE", "UTF-32BE", buf, len, err, line); \
     178    buf[0] = b1;								\
     179    buf[1] = b0;								\
     180    buf[2] = b3;								\
     181    buf[3] = b2;								\
     182    fails += run_conversion ("UTF-16LE", "WCHAR_T", buf, len, err, line);	\
     183    fails += run_conversion ("UTF-16LE", "UTF-8", buf, len, err, line);	\
     184    fails += run_conversion ("UTF-16LE", "UTF-32LE", buf, len, err, line); \
     185    fails += run_conversion ("UTF-16LE", "UTF-32BE", buf, len, err, line);
     186  
     187    /* Use UTF16 input of 0xD7FF.  */
     188    RUN_UTF16_INPUT (0xD7, 0xFF, 0xD7, 0xFF, 4, 0, __LINE__);
     189  
     190    /* Use [single] UTF16 high surrogate 0xD800 [with a valid character behind].
     191       And check an UTF16 surrogate pair [without valid low surrogate].  */
     192    RUN_UTF16_INPUT (0xD8, 0x0, 0x0, 0x0, 2, EINVAL, __LINE__);
     193    RUN_UTF16_INPUT (0xD8, 0x0, 0xD7, 0xFF, 4, EILSEQ, __LINE__);
     194    RUN_UTF16_INPUT (0xD8, 0x0, 0xD8, 0x0, 4, EILSEQ, __LINE__);
     195    RUN_UTF16_INPUT (0xD8, 0x0, 0xE0, 0x0, 4, EILSEQ, __LINE__);
     196    RUN_UTF16_INPUT (0xD8, 0x0, 0xDC, 0x0, 4, 0, __LINE__);
     197  
     198    /* Use [single] UTF16 high surrogate 0xDBFF [with a valid character behind].
     199       And check an UTF16 surrogate pair [without valid low surrogate].  */
     200    RUN_UTF16_INPUT (0xDB, 0xFF, 0x0, 0x0, 2, EINVAL, __LINE__);
     201    RUN_UTF16_INPUT (0xDB, 0xFF, 0xD7, 0xFF, 4, EILSEQ, __LINE__);
     202    RUN_UTF16_INPUT (0xDB, 0xFF, 0xDB, 0xFF, 4, EILSEQ, __LINE__);
     203    RUN_UTF16_INPUT (0xDB, 0xFF, 0xE0, 0x0, 4, EILSEQ, __LINE__);
     204    RUN_UTF16_INPUT (0xDB, 0xFF, 0xDF, 0xFF, 4, 0, __LINE__);
     205  
     206    /* Use single UTF16 low surrogate 0xDC00 [with a valid character behind].
     207       And check an UTF16 surrogate pair [without valid high surrogate].   */
     208    RUN_UTF16_INPUT (0xDC, 0x0, 0x0, 0x0, 2, EILSEQ, __LINE__);
     209    RUN_UTF16_INPUT (0xDC, 0x0, 0xD7, 0xFF, 4, EILSEQ, __LINE__);
     210    RUN_UTF16_INPUT (0xD8, 0x0, 0xDC, 0x0, 4, 0, __LINE__);
     211    RUN_UTF16_INPUT (0xD7, 0xFF, 0xDC, 0x0, 4, EILSEQ, __LINE__);
     212    RUN_UTF16_INPUT (0xDC, 0x0, 0xDC, 0x0, 4, EILSEQ, __LINE__);
     213    RUN_UTF16_INPUT (0xE0, 0x0, 0xDC, 0x0, 4, EILSEQ, __LINE__);
     214  
     215    /* Use single UTF16 low surrogate 0xDFFF [with a valid character behind].
     216       And check an UTF16 surrogate pair [without valid high surrogate].   */
     217    RUN_UTF16_INPUT (0xDF, 0xFF, 0x0, 0x0, 2, EILSEQ, __LINE__);
     218    RUN_UTF16_INPUT (0xDF, 0xFF, 0xD7, 0xFF, 4, EILSEQ, __LINE__);
     219    RUN_UTF16_INPUT (0xDB, 0xFF, 0xDF, 0xFF, 4, 0, __LINE__);
     220    RUN_UTF16_INPUT (0xD7, 0xFF, 0xDF, 0xFF, 4, EILSEQ, __LINE__);
     221    RUN_UTF16_INPUT (0xDF, 0xFF, 0xDF, 0xFF, 4, EILSEQ, __LINE__);
     222    RUN_UTF16_INPUT (0xE0, 0x0, 0xDF, 0xFF, 4, EILSEQ, __LINE__);
     223  
     224    /* Use UCS4/UTF32 input of 0xE000.  */
     225    RUN_UTF16_INPUT (0xE0, 0x0, 0xE0, 0x0, 4, 0, __LINE__);
     226  
     227  
     228    /* Use RUN_UTF8_3BYTE_INPUT to test conversion from UTF-8 to INTERNAL.
     229       Converting directly from UTF-8 to UTF-16|32 is needed,
     230       because e.g. s390x has iconv-modules which converts directly.  */
     231  #define RUN_UTF8_3BYTE_INPUT(b0, b1, b2, err, line)			\
     232    buf[0] = b0;								\
     233    buf[1] = b1;								\
     234    buf[2] = b2;								\
     235    fails += run_conversion ("UTF-8", "WCHAR_T", buf, 3, err, line);	\
     236    fails += run_conversion ("UTF-8", "UTF-16LE", buf, 3, err, line);	\
     237    fails += run_conversion ("UTF-8", "UTF-16BE", buf, 3, err, line);	\
     238    fails += run_conversion ("UTF-8", "UTF-32LE", buf, 3, err, line);	\
     239    fails += run_conversion ("UTF-8", "UTF-32BE", buf, 3, err, line);
     240  
     241    /* Use UTF-8 input of 0xD7FF.  */
     242    RUN_UTF8_3BYTE_INPUT (0xED, 0x9F, 0xBF, 0, __LINE__);
     243  
     244    /* Use UTF-8 input of 0xD800.  */
     245    RUN_UTF8_3BYTE_INPUT (0xED, 0xA0, 0x80, EILSEQ, __LINE__);
     246  
     247    /* Use UTF-8 input of 0xDBFF.  */
     248    RUN_UTF8_3BYTE_INPUT (0xED, 0xAF, 0xBF, EILSEQ, __LINE__);
     249  
     250    /* Use UTF-8 input of 0xDC00.  */
     251    RUN_UTF8_3BYTE_INPUT (0xED, 0xB0, 0x80, EILSEQ, __LINE__);
     252  
     253    /* Use UTF-8 input of 0xDFFF.  */
     254    RUN_UTF8_3BYTE_INPUT (0xED, 0xBF, 0xBF, EILSEQ, __LINE__);
     255  
     256    /* Use UTF-8 input of 0xF000.  */
     257    RUN_UTF8_3BYTE_INPUT (0xEF, 0x80, 0x80, 0, __LINE__);
     258  
     259    return fails > 0 ? EXIT_FAILURE : EXIT_SUCCESS;
     260  }
     261  
/* Name do_test as the test entry point before pulling in the shared test
   skeleton.  NOTE(review): test-skeleton.c is not visible here; it
   presumably provides main and evaluates TEST_FUNCTION, so the define
   has to precede the include -- confirm against the skeleton.  */
#define TEST_FUNCTION do_test ()
#include "../test-skeleton.c"