(root)/
glibc-2.38/
iconv/
tst-iconv-opt.c
       1  /* Test iconv's TRANSLIT and IGNORE option handling
       2  
       3     Copyright (C) 2020-2023 Free Software Foundation, Inc.
       4     This file is part of the GNU C Library.
       5  
       6     The GNU C Library is free software; you can redistribute it and/or
       7     modify it under the terms of the GNU Lesser General Public
       8     License as published by the Free Software Foundation; either
       9     version 2.1 of the License, or (at your option) any later version.
      10  
      11     The GNU C Library is distributed in the hope that it will be useful,
      12     but WITHOUT ANY WARRANTY; without even the implied warranty of
      13     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
      14     Lesser General Public License for more details.
      15  
      16     You should have received a copy of the GNU Lesser General Public
      17     License along with the GNU C Library; if not, see
      18     <https://www.gnu.org/licenses/>.  */
      19  
      20  
      21  #include <iconv.h>
      22  #include <locale.h>
      23  #include <errno.h>
      24  #include <string.h>
      25  #include <support/support.h>
      26  #include <support/check.h>
      27  
      28  
      29  /* Run one iconv test.  Arguments:
      30     to: destination character set and options
      31     from: source character set
      32     input: input string to be converted
      33     exp_in: expected number of bytes consumed
      34     exp_ret: expected return value (error or number of irreversible conversions)
      35     exp_out: expected output string
      36     exp_err: expected value of `errno' after iconv returns.  */
      37  static void
      38  test_iconv (const char *to, const char *from, char *input, size_t exp_in,
      39              size_t exp_ret, const char *exp_out, int exp_err)
      40  {
      41    iconv_t cd;
      42    char outbuf[500];
      43    size_t inlen, outlen;
      44    char *inptr, *outptr;
      45    size_t n;
      46  
      47    cd = iconv_open (to, from);
      48    TEST_VERIFY (cd != (iconv_t) -1);
      49  
      50    inlen = strlen (input);
      51    outlen = sizeof (outbuf);
      52    inptr = input;
      53    outptr = outbuf;
      54  
      55    errno = 0;
      56    n = iconv (cd, &inptr, &inlen, &outptr, &outlen);
      57  
      58    TEST_COMPARE (n, exp_ret);
      59    TEST_VERIFY (inptr == input + exp_in);
      60    TEST_COMPARE (errno, exp_err);
      61    TEST_COMPARE_BLOB (outbuf, outptr - outbuf, exp_out, strlen (exp_out));
      62    TEST_VERIFY (iconv_close (cd) == 0);
      63  }
      64  
      65  
      66  /* We test option parsing by converting UTF-8 inputs to ASCII under various
      67     option combinations. The UTF-8 inputs fall into three categories:
      68     - ASCII-only,
      69     - non-ASCII,
      70     - non-ASCII with invalid UTF-8 characters.  */
      71  
      72  /* 1.  */
      73  char ascii[] = "Just some ASCII text";
      74  
      75  /* 2. Valid UTF-8 input and some corresponding expected outputs with various
      76     options.  The two non-ASCII characters below are accented alphabets:
      77     an `a' then an `o'.  */
      78  char utf8[] = "UTF-8 text with \u00E1 couple \u00F3f non-ASCII characters";
      79  char u2a[] = "UTF-8 text with ";
      80  char u2a_translit[] = "UTF-8 text with a couple of non-ASCII characters";
      81  char u2a_ignore[] = "UTF-8 text with  couple f non-ASCII characters";
      82  
      83  /* 3. Invalid UTF-8 input and some corresponding expected outputs.  \xff is
      84     invalid UTF-8. It's followed by some valid but non-ASCII UTF-8.  */
      85  char iutf8[] = "Invalid UTF-8 \xff\u27E6text\u27E7";
      86  char iu2a[] = "Invalid UTF-8 ";
      87  char iu2a_ignore[] = "Invalid UTF-8 text";
      88  char iu2a_both[] = "Invalid UTF-8 [|text|]";
      89  
      90  /* 4. Another invalid UTF-8 input and corresponding expected outputs. This time
      91     the valid non-ASCII UTF-8 characters appear before the invalid \xff.  */
      92  char jutf8[] = "Invalid \u27E6UTF-8\u27E7 \xfftext";
      93  char ju2a[] = "Invalid ";
      94  char ju2a_translit[] = "Invalid [|UTF-8|] ";
      95  char ju2a_ignore[] = "Invalid UTF-8 text";
      96  char ju2a_both[] = "Invalid [|UTF-8|] text";
      97  
      98  /* We also test option handling for character set names that have the form
      99     "A/B".  In this test, we test conversions "ISO-10646/UTF-8", and either
     100     ISO-8859-1 or ASCII.  */
     101  
     102  /* 5. Accented 'A' and 'a' characters in ISO-8859-1 and UTF-8, and an
     103     equivalent ASCII transliteration.  */
     104  char iso8859_1_a[] = {0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, /* Accented A's.  */
     105                        0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, /* Accented a's.  */
     106                        0x00};
     107  char utf8_a[] = "\u00C0\u00C1\u00C2\u00C3\u00C4\u00C5"
     108                  "\u00E0\u00E1\u00E2\u00E3\u00E4\u00E5";
     109  char ascii_a[] = "AAAAAAaaaaaa";
     110  
     111  /* 6. An invalid ASCII string where [0] is invalid and [1] is '~'.  */
     112  char iascii [] = {0x80, '~', '\0'};
     113  char empty[] = "";
     114  char ia2u_ignore[] = "~";
     115  
     116  static int
     117  do_test (void)
     118  {
     119    xsetlocale (LC_ALL, "en_US.UTF-8");
     120  
     121  
     122    /* 0. iconv_open should gracefully fail for invalid character sets.  */
     123  
     124    TEST_VERIFY (iconv_open ("INVALID", "UTF-8") == (iconv_t) -1);
     125    TEST_VERIFY (iconv_open ("UTF-8", "INVALID") == (iconv_t) -1);
     126    TEST_VERIFY (iconv_open ("INVALID", "INVALID") == (iconv_t) -1);
     127  
     128  
     129    /* 1. ASCII-only UTF-8 input should convert to ASCII with no changes:  */
     130  
     131    test_iconv ("ASCII", "UTF-8", ascii, strlen (ascii), 0, ascii, 0);
     132    test_iconv ("ASCII//", "UTF-8", ascii, strlen (ascii), 0, ascii, 0);
     133    test_iconv ("ASCII//TRANSLIT", "UTF-8", ascii, strlen (ascii), 0, ascii, 0);
     134    test_iconv ("ASCII//TRANSLIT//", "UTF-8", ascii, strlen (ascii), 0, ascii,
     135                0);
     136    test_iconv ("ASCII//IGNORE", "UTF-8", ascii, strlen (ascii), 0, ascii, 0);
     137    test_iconv ("ASCII//IGNORE//", "UTF-8", ascii, strlen (ascii), 0, ascii, 0);
     138  
     139  
     140    /* 2. Valid UTF-8 input with non-ASCII characters:  */
     141  
     142    /* EILSEQ when converted to ASCII.  */
     143    test_iconv ("ASCII", "UTF-8", utf8, strlen (u2a), (size_t) -1, u2a, EILSEQ);
     144  
     145    /* Converted without error with TRANSLIT enabled.  */
     146    test_iconv ("ASCII//TRANSLIT", "UTF-8", utf8, strlen (utf8), 2, u2a_translit,
     147                0);
     148  
     149    /* EILSEQ with IGNORE enabled.  Non-ASCII chars dropped from output.  */
     150    test_iconv ("ASCII//IGNORE", "UTF-8", utf8, strlen (utf8), (size_t) -1,
     151                u2a_ignore, EILSEQ);
     152  
     153    /* With TRANSLIT and IGNORE enabled, transliterated without error.  We test
     154       four combinations.  */
     155  
     156    test_iconv ("ASCII//TRANSLIT,IGNORE", "UTF-8", utf8, strlen (utf8), 2,
     157                u2a_translit, 0);
     158    test_iconv ("ASCII//TRANSLIT//IGNORE", "UTF-8", utf8, strlen (utf8), 2,
     159                u2a_translit, 0);
     160    test_iconv ("ASCII//IGNORE,TRANSLIT", "UTF-8", utf8, strlen (utf8), 2,
     161                u2a_translit, 0);
     162    /* Due to bug 19519, iconv was ignoring TRANSLIT for the following input.  */
     163    test_iconv ("ASCII//IGNORE//TRANSLIT", "UTF-8", utf8, strlen (utf8), 2,
     164                u2a_translit, 0);
     165  
     166    /* Misspellings of TRANSLIT and IGNORE are ignored, but conversion still
     167       works while respecting any other correctly spelled options.  */
     168  
     169    test_iconv ("ASCII//T", "UTF-8", utf8, strlen (u2a), (size_t) -1, u2a,
     170                EILSEQ);
     171    test_iconv ("ASCII//TRANSLITERATE", "UTF-8", utf8, strlen (u2a), (size_t) -1,
     172                u2a, EILSEQ);
     173    test_iconv ("ASCII//I", "UTF-8", utf8, strlen (u2a), (size_t) -1, u2a,
     174                EILSEQ);
     175    test_iconv ("ASCII//IGNORED", "UTF-8", utf8, strlen (u2a), (size_t) -1, u2a,
     176                EILSEQ);
     177    test_iconv ("ASCII//TRANSLITERATE//IGNORED", "UTF-8", utf8, strlen (u2a),
     178                (size_t) -1, u2a, EILSEQ);
     179    test_iconv ("ASCII//IGNORED,TRANSLITERATE", "UTF-8", utf8, strlen (u2a),
     180                (size_t) -1, u2a, EILSEQ);
     181    test_iconv ("ASCII//T//I", "UTF-8", utf8, strlen (u2a), (size_t) -1, u2a,
     182                EILSEQ);
     183  
     184    test_iconv ("ASCII//TRANSLIT//I", "UTF-8", utf8, strlen (utf8), 2,
     185                u2a_translit, 0);
     186    /* Due to bug 19519, iconv was ignoring TRANSLIT for the following input.  */
     187    test_iconv ("ASCII//I//TRANSLIT", "UTF-8", utf8, strlen (utf8), 2,
     188                u2a_translit, 0);
     189    test_iconv ("ASCII//IGNORED,TRANSLIT", "UTF-8", utf8, strlen (utf8), 2,
     190                u2a_translit, 0);
     191    test_iconv ("ASCII//TRANSLIT,IGNORED", "UTF-8", utf8, strlen (utf8), 2,
     192                u2a_translit, 0);
     193  
     194    test_iconv ("ASCII//IGNORE,T", "UTF-8", utf8, strlen (utf8), (size_t) -1,
     195                u2a_ignore, EILSEQ);
     196    test_iconv ("ASCII//T,IGNORE", "UTF-8", utf8, strlen (utf8), (size_t) -1,
     197                u2a_ignore, EILSEQ);
     198    /* Due to bug 19519, iconv was ignoring IGNORE for the following input.  */
     199    test_iconv ("ASCII//TRANSLITERATE//IGNORE", "UTF-8", utf8, strlen (utf8),
     200                (size_t) -1, u2a_ignore, EILSEQ);
     201    test_iconv ("ASCII//IGNORE//TRANSLITERATE", "UTF-8", utf8, strlen (utf8),
     202                (size_t) -1, u2a_ignore, EILSEQ);
     203  
     204  
     205    /* 3. Invalid UTF-8 followed by some valid non-ASCII UTF-8 characters:  */
     206  
     207    /* EILSEQ; output is truncated at the first invalid UTF-8 character.  */
     208    test_iconv ("ASCII", "UTF-8", iutf8, strlen (iu2a), (size_t) -1, iu2a,
     209                EILSEQ);
     210  
     211    /* With TRANSLIT enabled: EILSEQ; output still truncated at the first invalid
     212       UTF-8 character.  */
     213    test_iconv ("ASCII//TRANSLIT", "UTF-8", iutf8, strlen (iu2a), (size_t) -1,
     214                iu2a, EILSEQ);
     215  
     216    /* With IGNORE enabled: EILSEQ; output omits invalid UTF-8 characters and
     217       valid UTF-8 non-ASCII characters.  */
     218    test_iconv ("ASCII//IGNORE", "UTF-8", iutf8, strlen (iutf8), (size_t) -1,
     219                iu2a_ignore, EILSEQ);
     220  
     221    /* With TRANSLIT and IGNORE enabled, output omits only invalid UTF-8
     222       characters and transliterates valid non-ASCII UTF-8 characters.  We test
     223       four combinations.  */
     224  
     225    test_iconv ("ASCII//TRANSLIT,IGNORE", "UTF-8", iutf8, strlen (iutf8), 2,
     226                iu2a_both, 0);
     227    /* Due to bug 19519, iconv was ignoring IGNORE for the following input.  */
     228    test_iconv ("ASCII//TRANSLIT//IGNORE", "UTF-8", iutf8, strlen (iutf8), 2,
     229                iu2a_both, 0);
     230    test_iconv ("ASCII//IGNORE,TRANSLIT", "UTF-8", iutf8, strlen (iutf8), 2,
     231                iu2a_both, 0);
     232    /* Due to bug 19519, iconv was ignoring TRANSLIT for the following input.  */
     233    test_iconv ("ASCII//IGNORE//TRANSLIT", "UTF-8", iutf8, strlen (iutf8), 2,
     234                iu2a_both, 0);
     235  
     236  
     237    /* 4. Invalid UTF-8 with valid non-ASCII UTF-8 chars appearing first:  */
     238  
     239    /* EILSEQ; output is truncated at the first non-ASCII character.  */
     240    test_iconv ("ASCII", "UTF-8", jutf8, strlen (ju2a), (size_t) -1, ju2a,
     241                EILSEQ);
     242  
     243    /* With TRANSLIT enabled: EILSEQ; output now truncated at the first invalid
     244       UTF-8 character.  */
     245    test_iconv ("ASCII//TRANSLIT", "UTF-8", jutf8, strlen (jutf8) - 5,
     246                (size_t) -1, ju2a_translit, EILSEQ);
     247    test_iconv ("ASCII//translit", "UTF-8", jutf8, strlen (jutf8) - 5,
     248                (size_t) -1, ju2a_translit, EILSEQ);
     249  
     250    /* With IGNORE enabled: EILSEQ; output omits invalid UTF-8 characters and
     251       valid UTF-8 non-ASCII characters.  */
     252    test_iconv ("ASCII//IGNORE", "UTF-8", jutf8, strlen (jutf8), (size_t) -1,
     253                ju2a_ignore, EILSEQ);
     254    test_iconv ("ASCII//ignore", "UTF-8", jutf8, strlen (jutf8), (size_t) -1,
     255                ju2a_ignore, EILSEQ);
     256  
     257    /* With TRANSLIT and IGNORE enabled, output omits only invalid UTF-8
     258       characters and transliterates valid non-ASCII UTF-8 characters.  We test
     259       several combinations.  */
     260  
     261    test_iconv ("ASCII//TRANSLIT,IGNORE", "UTF-8", jutf8, strlen (jutf8), 2,
     262                ju2a_both, 0);
     263    /* Due to bug 19519, iconv was ignoring IGNORE for the following input.  */
     264    test_iconv ("ASCII//TRANSLIT//IGNORE", "UTF-8", jutf8, strlen (jutf8), 2,
     265                ju2a_both, 0);
     266    test_iconv ("ASCII//IGNORE,TRANSLIT", "UTF-8", jutf8, strlen (jutf8), 2,
     267                ju2a_both, 0);
     268    /* Due to bug 19519, iconv was ignoring TRANSLIT for the following input.  */
     269    test_iconv ("ASCII//IGNORE//TRANSLIT", "UTF-8", jutf8, strlen (jutf8), 2,
     270                ju2a_both, 0);
     271    test_iconv ("ASCII//translit,ignore", "UTF-8", jutf8, strlen (jutf8), 2,
     272                ju2a_both, 0);
     273    /* Trailing whitespace and separators should be ignored.  */
     274    test_iconv ("ASCII//IGNORE,TRANSLIT ", "UTF-8", jutf8, strlen (jutf8), 2,
     275                ju2a_both, 0);
     276    test_iconv ("ASCII//IGNORE,TRANSLIT/", "UTF-8", jutf8, strlen (jutf8), 2,
     277                ju2a_both, 0);
     278    test_iconv ("ASCII//IGNORE,TRANSLIT//", "UTF-8", jutf8, strlen (jutf8), 2,
     279                ju2a_both, 0);
     280    test_iconv ("ASCII//IGNORE,TRANSLIT,", "UTF-8", jutf8, strlen (jutf8), 2,
     281                ju2a_both, 0);
     282    test_iconv ("ASCII//IGNORE,TRANSLIT,,", "UTF-8", jutf8, strlen (jutf8), 2,
     283                ju2a_both, 0);
     284    test_iconv ("ASCII//IGNORE,TRANSLIT /,", "UTF-8", jutf8, strlen (jutf8), 2,
     285                ju2a_both, 0);
     286  
     287    /* TRANSLIT or IGNORE suffixes in fromcode should be ignored.  */
     288    test_iconv ("ASCII", "UTF-8//TRANSLIT", jutf8, strlen (ju2a), (size_t) -1,
     289                ju2a, EILSEQ);
     290    test_iconv ("ASCII", "UTF-8//IGNORE", jutf8, strlen (ju2a), (size_t) -1,
     291                ju2a, EILSEQ);
     292    test_iconv ("ASCII", "UTF-8//TRANSLIT,IGNORE", jutf8, strlen (ju2a),
     293                (size_t) -1, ju2a, EILSEQ);
     294  
     295  
     296    /* 5. Charset names of the form "A/B/":  */
     297  
     298    /* ISO-8859-1 is converted to UTF-8 without needing transliteration.  */
     299    test_iconv ("ISO-10646/UTF-8", "ISO-8859-1", iso8859_1_a,
     300                strlen (iso8859_1_a), 0, utf8_a, 0);
     301    test_iconv ("ISO-10646/UTF-8/", "ISO-8859-1", iso8859_1_a,
     302                strlen (iso8859_1_a), 0, utf8_a, 0);
     303    test_iconv ("ISO-10646/UTF-8/IGNORE", "ISO-8859-1", iso8859_1_a,
     304                strlen (iso8859_1_a), 0, utf8_a, 0);
     305    test_iconv ("ISO-10646/UTF-8//IGNORE", "ISO-8859-1", iso8859_1_a,
     306                strlen (iso8859_1_a), 0, utf8_a, 0);
     307    test_iconv ("ISO-10646/UTF-8/TRANSLIT", "ISO-8859-1", iso8859_1_a,
     308                strlen (iso8859_1_a), 0, utf8_a, 0);
     309    test_iconv ("ISO-10646/UTF-8//TRANSLIT", "ISO-8859-1", iso8859_1_a,
     310                strlen (iso8859_1_a), 0, utf8_a, 0);
     311    test_iconv ("ISO-10646/UTF-8//TRANSLIT/IGNORE", "ISO-8859-1", iso8859_1_a,
     312                strlen (iso8859_1_a), 0, utf8_a, 0);
     313    test_iconv ("ISO-10646/UTF-8//TRANSLIT//IGNORE", "ISO-8859-1", iso8859_1_a,
     314                strlen (iso8859_1_a), 0, utf8_a, 0);
     315    test_iconv ("ISO-10646/UTF-8/TRANSLIT,IGNORE", "ISO-8859-1", iso8859_1_a,
     316                strlen (iso8859_1_a), 0, utf8_a, 0);
     317  
     318    /* UTF-8 with accented A's is converted to ASCII with transliteration.  */
     319    test_iconv ("ASCII", "ISO-10646/UTF-8", utf8_a,
     320                0, (size_t) -1, empty, EILSEQ);
     321    test_iconv ("ASCII//IGNORE", "ISO-10646/UTF-8", utf8_a,
     322                strlen (utf8_a), (size_t) -1, empty, EILSEQ);
     323    test_iconv ("ASCII//TRANSLIT", "ISO-10646/UTF-8", utf8_a,
     324                strlen (utf8_a), 12, ascii_a, 0);
     325  
     326    /* Invalid ASCII is converted to UTF-8 only with IGNORE.  */
     327    test_iconv ("ISO-10646/UTF-8", "ASCII", iascii, strlen (empty), (size_t) -1,
     328                empty, EILSEQ);
     329    test_iconv ("ISO-10646/UTF-8/TRANSLIT", "ASCII", iascii, strlen (empty),
     330                (size_t) -1, empty, EILSEQ);
     331    test_iconv ("ISO-10646/UTF-8/IGNORE", "ASCII", iascii, strlen (iascii),
     332                (size_t) -1, ia2u_ignore, EILSEQ);
     333    test_iconv ("ISO-10646/UTF-8/TRANSLIT,IGNORE", "ASCII", iascii,
     334                strlen (iascii), (size_t) -1, ia2u_ignore, EILSEQ);
     335    /* Due to bug 19519, iconv was ignoring IGNORE for the following three
     336       inputs: */
     337    test_iconv ("ISO-10646/UTF-8/TRANSLIT/IGNORE", "ASCII", iascii,
     338                strlen (iascii), (size_t) -1, ia2u_ignore, EILSEQ);
     339    test_iconv ("ISO-10646/UTF-8//TRANSLIT,IGNORE", "ASCII", iascii,
     340                strlen (iascii), (size_t) -1, ia2u_ignore, EILSEQ);
     341    test_iconv ("ISO-10646/UTF-8//TRANSLIT//IGNORE", "ASCII", iascii,
     342                strlen (iascii), (size_t) -1, ia2u_ignore, EILSEQ);
     343  
     344    return 0;
     345  }
     346  
     347  #include <support/test-driver.c>