(root)/
sed-4.9/
sed/
mbcs.c
       1  /*  GNU SED, a batch stream editor.
       2      Copyright (C) 2003-2022 Free Software Foundation, Inc.
       3  
       4      This program is free software; you can redistribute it and/or modify
       5      it under the terms of the GNU General Public License as published by
       6      the Free Software Foundation; either version 3, or (at your option)
       7      any later version.
       8  
       9      This program is distributed in the hope that it will be useful,
      10      but WITHOUT ANY WARRANTY; without even the implied warranty of
      11      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      12      GNU General Public License for more details.
      13  
      14      You should have received a copy of the GNU General Public License
      15      along with this program; If not, see <https://www.gnu.org/licenses/>. */
      16  
      17  #include "sed.h"
      18  #include <stdlib.h>
      19  #include <string.h>
      20  
      21  #include "localcharset.h"
      22  
      23  int mb_cur_max;  /* Copy of MB_CUR_MAX; assigned by initialize_mbcs ().  */
      24  bool is_utf8;  /* Locale charset is "UTF-8"; assigned by initialize_mbcs ().  */
      25  
      26  /* Return non-zero if CH is part of a valid multibyte sequence:
      27     Either incomplete yet valid sequence (in case of a leading byte),
      28     or the last byte of a valid multibyte sequence.
      29  
      30     Return zero in all other cases:
      31      CH is a valid single-byte character (e.g. 0x01-0x7F in UTF-8 locales);
      32      CH is an invalid byte in a multibyte sequence for the currentl locale,
      33      CH is the NUL byte.
      34  
      35     Reset CUR_STAT in the case of an invalid byte.
      36  */
      37  int
      38  is_mb_char (int ch, mbstate_t *cur_stat)
      39  {
      40    const char c = ch ;
      41    const int mb_pending = !mbsinit (cur_stat);
      42    const int result = mbrtowc (NULL, &c, 1, cur_stat);
      43  
      44    switch (result)
      45      {
      46      case -2: /* Beginning or middle of valid multibyte sequence */
      47        return 1;
      48  
      49      case -1: /* Invalid sequence, byte treated like a single-byte character */
      50        memset (cur_stat, 0, sizeof (mbstate_t));
      51        return 0;
      52  
      53      case 1: /* A valid byte, check if part of on-going multibyte sequence */
      54        return mb_pending;
      55  
      56      case 0: /* Special case of mbrtowc(3): the NUL character */
      57        /* TODO: test this */
      58        return 1;
      59  
      60      default: /* Should never happen, as per mbrtowc(3) documentation */
      61        panic ("is_mb_char: mbrtowc (0x%x) returned %d",
      62               (unsigned int) ch, result);
      63      }
      64  }
      65  
      66  void
      67  initialize_mbcs (void)
      68  {
      69    /* For UTF-8, we know that the encoding is stateless.  */
      70    const char *codeset_name;
      71  
      72    codeset_name = locale_charset ();
      73    is_utf8 = (strcmp (codeset_name, "UTF-8") == 0);
      74  
      75    mb_cur_max = MB_CUR_MAX;
      76  }