1  /* Auxiliary program to test mbrtowc(3) behaviour.
       2     Copyright 2016-2022 Free Software Foundation, Inc.
       3  
       4     This program is free software; you can redistribute it and/or modify
       5     it under the terms of the GNU General Public License as published by
       6     the Free Software Foundation; either version 3, or (at your option)
       7     any later version.
       8  
       9     This program is distributed in the hope that it will be useful,
      10     but WITHOUT ANY WARRANTY; without even the implied warranty of
      11     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      12     GNU General Public License for more details.
      13  
      14     You should have received a copy of the GNU General Public License
      15     along with this program; If not, see <https://www.gnu.org/licenses/>. */
      16  
      17  /* Test the operating-system's native mbrtowc(3) function,
      18     by feeding it multibyte sequences one byte at a time,
      19     and reporting the result.
      20  
      21     The program prints the following values after each mbrtowc invocation,
      22     separated by commas:
      23  
      24     -2  the octet contributes to a valid yet incomplete multibyte sequence
      25         in the current locale.
      26  
      27     -1  the octet causes an encoding error.
      28  
      29      0  the octet represents a NUL byte
      30  
      31      1  the octet is a valid single-byte character, OR
      32         completes a valid multibyte sequence.
      33  
      34    Because the program invokes mbrtowc(3) byte-by-byte, the reported
      35    result should never be larger than 1.
      36  
      37    Example of typical output with UTF-8 encoding
      38    ---------------------------------------------
      39  
      40    The unicode character 'N-ARY SUMMATION' (U+2211), encoded in UTF-8 as:
      41      hex: 0xE2 0x88 0x91
      42      oct:  342  210  211
      43  
      44    Decoding the valid sequence byte-by-byte gives:
      45      $ printf '\342\210\221' | LC_ALL=en_US.UTF-8 test-mbrtowc
      46      -2,-2,1
      47  
      48    '\210' is not a valid leading byte in UTF-8,
      49    thus the first byte gives -1, and the 'X' is treated
      50    as a valid single-byte character:
      51  
      52      $ printf '\210X' | LC_ALL=en_US.UTF-8 test-mbrtowc
      53      -1,1
      54  
      55    '\342' is a valid yet incomplete multibyte sequence.
      56    Passing it to mbrtowc results in value '-2'.
      57    The following value 'X' gives an encoding error '-1'
      58    (as 'X' is not a valid trailing byte in a multibyte UTF-8 sequence):
      59  
      60      $ printf '\342X' | LC_ALL=en_US.UTF-8 test-mbrtowc
      61      -2,-1
      62  
      63  
      64    Detecting implementation bugs in mbrtowc
      65    ----------------------------------------
      66  
      67    UTF-8 implementation is correct on most operating systems.
      68    Other multibyte locales might present more difficulties.
      69    An example is the Japanese SHIFT-JIS locale under Mac OS X.
      70    NOTE: The locale is 'ja_JP.SJIS' under Mac OS X, 'ja_JP.shiftjis'
      71    under Ubuntu. 'ja_JP.sjis' was also found on some systems.
      72  
      73    Using unicode character 'KATAKANA LETTER ZE' (U+30BC)
      74     UTF-8:    hex: 0xE3  0x82  0xBC
      75     Shift-jis hex: 0x83  0x5B
      76               oct:  203   133
      77  
      78    The following is a valid multibyte sequence in SHIFT-JIS,
      79    the first byte should result in '-2' (valid yet incomplete),
      80    and the second byte should result in '1' (a valid multibyte sequence
      81    completed):
      82  
      83      $ printf '\203\133' | LC_ALL=ja_JP.SJIS test-mbrtowc
      84      -2,1
      85  
      86    The following is an INVALID multibyte sequence in SHIFT-JIS
      87    (The byte ':' is not valid as a second octet).
      88    Buggy implementations will accept this as a valid multibyte sequence:
      89  
      90      # NOTE: this result indicates a buggy mbrtowc
      91      $ printf '\203:' | LC_ALL=ja_JP.SJIS test-mbrtowc
      92      -2,1
      93  
      94    A correct implementation should report '-1' for the second byte (i.e.
      95    an encoding error):
      96  
      97      $ printf '\203:' | LC_ALL=ja_JP.SJIS test-mbrtowc
      98      -2,-1
      99  
     100  
     101    Expected results with correct implementations
     102    ---------------------------------------------
     103  
     104    In GNU Sed some tests purposely use invalid multibyte sequences
     105    to test sed's behaviour. A buggy implementation of mbrtowc
     106    would result in false-alarm failures.
     107  
     108    The following are expected results in correct implementations:
     109    (locale names are from Mac OS X):
     110  
     111      $ printf '\203\133' | LC_ALL=ja_JP.SJIS test-mbrtowc
     112      -2,1
     113      $ printf '\203:' | LC_ALL=ja_JP.SJIS test-mbrtowc
     114      -2,-1
     115      $ printf '\262C' | LC_ALL=ja_JP.eucJP test-mbrtowc
     116      -2,-1
     117  */
     118  
     119  #include <config.h>
     120  #include <locale.h>
     121  #include <stdio.h>
     122  #include <stdlib.h>
     123  #include <wchar.h>
     124  
     125  #include "closeout.h"
     126  #include "error.h"
     127  #include "progname.h"
     128  
     129  /* stub replacement for non-standard err(3) */
     130  static int
     131  die (const char *msg)
     132  {
     133    error (0, 0, "%s: error: %s\n", program_name, msg);
     134    exit (EXIT_FAILURE);
     135  }
     136  
     137  int
     138  main (int argc, char **argv)
     139  {
     140    int c;
     141    int first = 1;
     142  
     143    set_program_name (argv[0]);
     144    if (!setlocale (LC_ALL, ""))
     145      die ("failed to set locale");
     146  
     147    while ((c = getchar ()) != EOF)
     148      {
     149        wchar_t wc;
     150        char ch = (unsigned char) c;
     151        int i = (int) mbrtowc (&wc, &ch, 1, NULL);
     152  
     153        if (!first)
     154          putchar (',');
     155        first = 0;
     156  
     157        printf ("%d", i);
     158      }
     159  
     160    if (first)
     161      die ("empty input");
     162  
     163    putchar ('\n');
     164  
     165    if (ferror (stdin))
     166      die ("read error");
     167    close_stdout ();
     168  
     169    exit (EXIT_SUCCESS);
     170  }