(root)/
coreutils-9.4/
lib/
mbuiterf.h
       1  /* Iterating through multibyte strings, faster: macros for multi-byte encodings.
       2     Copyright (C) 2001, 2005, 2007, 2009-2023 Free Software Foundation, Inc.
       3  
       4     This file is free software: you can redistribute it and/or modify
       5     it under the terms of the GNU Lesser General Public License as
       6     published by the Free Software Foundation, either version 3 of the
       7     License, or (at your option) any later version.
       8  
       9     This file is distributed in the hope that it will be useful,
      10     but WITHOUT ANY WARRANTY; without even the implied warranty of
      11     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      12     GNU Lesser General Public License for more details.
      13  
      14     You should have received a copy of the GNU Lesser General Public License
      15     along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
      16  
      17  /* Written by Bruno Haible <bruno@clisp.org>,
      18     with insights from Paul Eggert.  */
      19  
      20  /* The macros in this file implement forward iteration through a
      21     multi-byte string, without knowing its length a-priori.
      22  
      23     With these macros, an iteration loop that looks like
      24  
      25        char *iter;
      26        for (iter = buf; *iter != '\0'; iter++)
      27          {
      28            do_something (*iter);
      29          }
      30  
      31     becomes
      32  
      33        mbuif_state_t state;
      34        [const] char *iter;
      35        for (mbuif_init (state), iter = buf; mbuif_avail (state, iter); )
      36          {
      37            mbchar_t cur = mbuif_next (state, iter);
      38            // Note: Here always mb_ptr (cur) == iter.
      39            do_something (iter, mb_len (cur));
      40            iter += mb_len (cur);
      41          }
      42  
      43     The benefit of these macros over plain use of mbrtowc or mbrtoc32 is:
      44     - Handling of invalid multibyte sequences is possible without
      45       making the code more complicated, while still preserving the
      46       invalid multibyte sequences.
      47  
      48     Compared to mbiterf.h, the macros here don't need to know the string's
      49     length a-priori.  The downside is that at each step, the look-ahead
      50     that guards against overrunning the terminating '\0' is more expensive.
      51     The mbuif_* macros are therefore suitable when there is a high probability
      52     that only the first few multibyte characters need to be inspected.
      53     Whereas the mbif_* macros are better if usually the iteration runs
      54     through the entire string.
      55  
      56     The benefit of these macros over those from mbuiter.h is that they
      57     produce faster code with today's optimizing compilers (because mbuif_next
      58     returns its result by value).
      59  
      60     mbuif_state_t
      61       is a type usable for variable declarations.
      62  
      63     mbuif_init (state)
      64       initializes the state.
      65  
      66     mbuif_avail (state, iter)
      67       returns true if another loop round is needed.
      68  
      69     mbuif_next (state, iter)
      70       returns the next multibyte character.
      71       It assumes that the state is initialized and that *iter != '\0'.
      72  
      73     Here are the function prototypes of the macros.
      74  
      75     extern void      mbuif_init (mbuif_state_t state);
      76     extern bool      mbuif_avail (mbuif_state_t state, const char *iter);
      77     extern mbchar_t  mbuif_next (mbuif_state_t state, const char *iter);
      78   */
      79  
      80  #ifndef _MBUITERF_H
      81  #define _MBUITERF_H 1
      82  
      83  /* This file uses _GL_INLINE_HEADER_BEGIN, _GL_INLINE,
      84     _GL_ATTRIBUTE_ALWAYS_INLINE.  */
      85  #if !_GL_CONFIG_H_INCLUDED
      86   #error "Please include config.h first."
      87  #endif
      88  
      89  #include <assert.h>
      90  #include <stddef.h>
      91  #include <stdlib.h>
      92  #include <string.h>
      93  #include <uchar.h>
      94  #include <wchar.h>
      95  
      96  #include "mbchar.h"
      97  #include "strnlen1.h"
      98  
      99  _GL_INLINE_HEADER_BEGIN
     100  #ifndef MBUITERF_INLINE
     101  # define MBUITERF_INLINE _GL_INLINE _GL_ATTRIBUTE_ALWAYS_INLINE
     102  #endif
     103  
     104  struct mbuif_state    /* Iteration state used by the mbuif_* macros.  */
     105  {
     106    #if !GNULIB_MBRTOC32_REGULAR
     107    bool in_shift;        /* true if next byte may not be interpreted as ASCII */
     108                          /* If GNULIB_MBRTOC32_REGULAR, it would always be
     109                             false, so the field is omitted.  */
     110    #endif
     111    mbstate_t state;      /* if in_shift: current shift state */
     112                          /* If GNULIB_MBRTOC32_REGULAR, it is expected to be in
     113                             an initial state whenever mbuiterf_next starts to
     114                             decode a non-basic character (asserted there).  */
     115    unsigned int cur_max; /* MB_CUR_MAX, as cached by mbuif_init.  */
     116  };
     117  
     118  /* Return the multibyte character that starts at ITER, updating the
     119     initialized iteration state *PS.  An invalid byte yields a 1-byte
     120     character, and an incomplete character at the end of the string yields
     121     the whole rest of the string; both have wc_valid false.  */
     122  MBUITERF_INLINE mbchar_t
     123  mbuiterf_next (struct mbuif_state *ps, const char *iter)
     124  {
     125    #if !GNULIB_MBRTOC32_REGULAR
     126    if (ps->in_shift)
     127      goto with_shift;
     128    #endif
     129    /* Handle most ASCII characters quickly, without calling mbrtoc32().  */
     130    if (is_basic (*iter))
     131      {
     132        /* In all locale encodings in use, the (32-bit) wide character code
     133           of such a character is identical to its char code.  */
     134        return (mbchar_t) { .ptr = iter, .bytes = 1, .wc_valid = true, .wc = *iter };
     135      }
     136    else
     137      {
     138        assert (mbsinit (&ps->state));
     139        #if !GNULIB_MBRTOC32_REGULAR
     140        ps->in_shift = true;
     141      with_shift:;
     142        #endif
     143        size_t bytes;
     144        char32_t wc;
     145        bytes = mbrtoc32 (&wc, iter, strnlen1 (iter, ps->cur_max), &ps->state);
     146        if (bytes == (size_t) -1)
     147          {
     148            /* Invalid multibyte sequence.  Restart from a sane state.  */
     149            #if !GNULIB_MBRTOC32_REGULAR
     150            ps->in_shift = false;
     151            #endif
     152            mbszero (&ps->state);
     153            return (mbchar_t) { .ptr = iter, .bytes = 1, .wc_valid = false };
     154          }
     155        else if (bytes == (size_t) -2)
     156          {
     157            /* An incomplete multibyte character at the end.  Clear in_shift:
     158               otherwise mbuif_avail would still report a character to fetch
     159               once the caller has reached the terminating '\0'.  */
     160            #if !GNULIB_MBRTOC32_REGULAR
     161            ps->in_shift = false;
     162            #endif
     163            /* ps->state need not be reset: the string end is reached.  */
     164            return (mbchar_t) { .ptr = iter, .bytes = strlen (iter), .wc_valid = false };
     165          }
     166        else
     167          {
     168            if (bytes == 0)
     169              {
     170                /* A null wide character was encountered.  */
     171                bytes = 1;
     172                assert (*iter == '\0');
     173                assert (wc == 0);
     174              }
     175            #if !GNULIB_MBRTOC32_REGULAR
     176            else if (bytes == (size_t) -3)
     177              /* The previous multibyte sequence produced one more character.  */
     178              bytes = 0;
     179            #endif
     180            /* In an initial state, ASCII can be handled quickly again.  */
     181            #if !GNULIB_MBRTOC32_REGULAR
     182            if (mbsinit (&ps->state))
     183              ps->in_shift = false;
     184            #endif
     185            return (mbchar_t) { .ptr = iter, .bytes = bytes, .wc_valid = true, .wc = wc };
     186          }
     187      }
     188  }
     189  
     190  /* Iteration macros.  The variants without in_shift are the optimized ones
     191     for GNULIB_MBRTOC32_REGULAR.  */
     192  typedef struct mbuif_state mbuif_state_t;
     193  #if GNULIB_MBRTOC32_REGULAR
     194  /* Start an iteration: reset the conversion state, cache MB_CUR_MAX.  */
     195  # define mbuif_init(st) \
     196     (mbszero (&(st).state), (st).cur_max = MB_CUR_MAX)
     197  /* Another character is available unless ITER is at the terminating NUL.  */
     198  # define mbuif_avail(st, iter) (*(iter) != '\0')
     199  #else
     200  /* Start an iteration: additionally clear in_shift.  */
     201  # define mbuif_init(st) \
     202     ((st).in_shift = false, \
     203      mbszero (&(st).state), (st).cur_max = MB_CUR_MAX)
     204  /* Another character is available while in shift mode or before the NUL.  */
     205  # define mbuif_avail(st, iter) ((st).in_shift || (*(iter) != '\0'))
     206  #endif
     207  /* Fetch the character at ITER; the caller advances ITER by its length.  */
     208  #define mbuif_next(st, iter) \
     209    mbuiterf_next (&(st), (iter))
     210  
     211  _GL_INLINE_HEADER_END
     212  
     213  #endif /* _MBUITERF_H */