1 /* GNU SED, a batch stream editor.
2 Copyright (C) 2003-2022 Free Software Foundation, Inc.
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 3, or (at your option)
7 any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; If not, see <https://www.gnu.org/licenses/>. */
16
17 #include "sed.h"
18 #include <stdlib.h>
19 #include <string.h>
20
21 #include "localcharset.h"
22
/* Copy of MB_CUR_MAX, stored by initialize_mbcs ().  */
int mb_cur_max;
/* True when the codeset name returned by locale_charset () is exactly
   "UTF-8"; set by initialize_mbcs ().  */
bool is_utf8;
25
/* Classify the byte CH by feeding it to mbrtowc(3) as the next input
   byte under the conversion state CUR_STAT.

   Return non-zero if CH is part of a valid multibyte sequence:
     - a byte of a sequence that is incomplete yet valid so far
       (e.g. a leading byte), or
     - the last byte of a valid multibyte sequence.

   Return zero in all other cases:
     - CH is a valid single-byte character (e.g. 0x01-0x7F in UTF-8 locales);
     - CH is an invalid byte in a multibyte sequence for the current locale;
     - CH is the NUL byte.

   CUR_STAT is advanced by mbrtowc(3); it is reset to the initial state
   in the case of an invalid byte.  */
int
is_mb_char (int ch, mbstate_t *cur_stat)
{
  const char c = ch;
  /* Sample the state *before* converting: a byte for which mbrtowc
     reports one consumed byte only counts as multibyte if a sequence
     was already in progress.  */
  const int mb_pending = !mbsinit (cur_stat);
  /* Keep the result as size_t so the (size_t) -1 and (size_t) -2
     sentinels are compared without narrowing them to int.  */
  const size_t result = mbrtowc (NULL, &c, 1, cur_stat);

  switch (result)
    {
    case (size_t) -2: /* Beginning or middle of valid multibyte sequence */
      return 1;

    case (size_t) -1: /* Invalid sequence, byte treated like a single-byte
                         character */
      memset (cur_stat, 0, sizeof *cur_stat);
      return 0;

    case 1: /* A valid byte, check if part of on-going multibyte sequence */
      return mb_pending;

    case 0: /* Special case of mbrtowc(3): the NUL character.  It is not
               part of a multibyte sequence, so report zero as documented
               above.  */
      return 0;

    default: /* Should never happen, as per mbrtowc(3) documentation */
      /* NOTE(review): control falls off the end if panic () returns;
         presumably it does not -- confirm against its declaration.  */
      panic ("is_mb_char: mbrtowc (0x%x) returned %d",
             (unsigned int) ch, (int) result);
    }
}
65
66 void
67 initialize_mbcs (void)
68 {
69 /* For UTF-8, we know that the encoding is stateless. */
70 const char *codeset_name;
71
72 codeset_name = locale_charset ();
73 is_utf8 = (strcmp (codeset_name, "UTF-8") == 0);
74
75 mb_cur_max = MB_CUR_MAX;
76 }