(root)/
m4-1.4.19/
lib/
mbiter.h
       1  /* Iterating through multibyte strings: macros for multi-byte encodings.
       2     Copyright (C) 2001, 2005, 2007, 2009-2021 Free Software Foundation, Inc.
       3  
       4     This program is free software: you can redistribute it and/or modify
       5     it under the terms of the GNU General Public License as published by
       6     the Free Software Foundation; either version 3 of the License, or
       7     (at your option) any later version.
       8  
       9     This program is distributed in the hope that it will be useful,
      10     but WITHOUT ANY WARRANTY; without even the implied warranty of
      11     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      12     GNU General Public License for more details.
      13  
      14     You should have received a copy of the GNU General Public License
      15     along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
      16  
      17  /* Written by Bruno Haible <bruno@clisp.org>.  */
      18  
      19  /* The macros in this file implement forward iteration through a
      20     multi-byte string.
      21  
      22     With these macros, an iteration loop that looks like
      23  
      24        char *iter;
      25        for (iter = buf; iter < buf + buflen; iter++)
      26          {
      27            do_something (*iter);
      28          }
      29  
      30     becomes
      31  
      32        mbi_iterator_t iter;
      33        for (mbi_init (iter, buf, buflen); mbi_avail (iter); mbi_advance (iter))
      34          {
      35            do_something (mbi_cur_ptr (iter), mb_len (mbi_cur (iter)));
      36          }
      37  
      38     The benefit of these macros over plain use of mbrtowc is:
      39     - Handling of invalid multibyte sequences is possible without
      40       making the code more complicated, while still preserving the
      41       invalid multibyte sequences.
      42  
      43     mbi_iterator_t
      44       is a type usable for variable declarations.
      45  
      46     mbi_init (iter, startptr, length)
      47       initializes the iterator, starting at startptr and crossing length bytes.
      48  
      49     mbi_avail (iter)
      50       returns true if there are more multibyte characters available before
      51       the end of string is reached. In this case, mbi_cur (iter) is
      52       initialized to the next multibyte character.
      53  
      54     mbi_advance (iter)
      55       advances the iterator by one multibyte character.
      56  
      57     mbi_cur (iter)
      58       returns the current multibyte character, of type mbchar_t.  All the
      59       macros defined in mbchar.h can be used on it.
      60  
      61     mbi_cur_ptr (iter)
      62       returns a pointer to the beginning of the current multibyte character.
      63  
      64     mbi_reloc (iter, ptrdiff)
      65       relocates iterator when the string is moved by ptrdiff bytes.
      66  
      67     mbi_copy (&destiter, &srciter)
      68       copies srciter to destiter.
      69  
      70     Here are the function prototypes of the macros.
      71  
      72     extern void          mbi_init (mbi_iterator_t iter,
      73                                    const char *startptr, size_t length);
      74     extern bool          mbi_avail (mbi_iterator_t iter);
      75     extern void          mbi_advance (mbi_iterator_t iter);
      76     extern mbchar_t      mbi_cur (mbi_iterator_t iter);
      77     extern const char *  mbi_cur_ptr (mbi_iterator_t iter);
      78     extern void          mbi_reloc (mbi_iterator_t iter, ptrdiff_t ptrdiff);
      79     extern void          mbi_copy (mbi_iterator_t *new, const mbi_iterator_t *old);
      80   */
      81  
      82  #ifndef _MBITER_H
      83  #define _MBITER_H 1
      84  
      85  #include <assert.h>
      86  #include <stdbool.h>
      87  #include <stddef.h>
      88  #include <string.h>
      89  #include <wchar.h>
      90  
      91  #include "mbchar.h"
      92  
      93  #ifndef _GL_INLINE_HEADER_BEGIN
      94   #error "Please include config.h first."
      95  #endif
      96  _GL_INLINE_HEADER_BEGIN
      97  #ifndef MBITER_INLINE
      98  # define MBITER_INLINE _GL_INLINE
      99  #endif
     100  
     101  struct mbiter_multi
     102  {
            /* State of one forward iteration over a multibyte string.
               mbi_init fills it in, mbi_avail (through mbiter_multi_next)
               decodes the character at cur.ptr on demand, and mbi_advance
               steps past that character.  */
     103    const char *limit;    /* pointer to end of string, i.e. start + length;
                                     iteration stops once cur.ptr reaches it */
     104    bool in_shift;        /* true if next byte may not be interpreted as ASCII,
                                     so it must be decoded by mbrtowc using 'state' */
     105    mbstate_t state;      /* if in_shift: current shift state;
                                     otherwise the initial state */
     106    bool next_done;       /* true if mbi_avail has already filled the following
                                     for the character at cur.ptr; cleared by
                                     mbi_advance */
     107    struct mbchar cur;    /* the current character:
     108          const char *cur.ptr             pointer to current character
     109          The following are only valid after mbi_avail.
     110          size_t cur.bytes                number of bytes of current character
     111          bool cur.wc_valid               true if wc is a valid wide character
     112          wchar_t cur.wc                  if wc_valid: the current character
     113          */
     114  };
     115  
     116  MBITER_INLINE void
     117  mbiter_multi_next (struct mbiter_multi *iter)
     118  {
            /* Decode the multibyte character that starts at iter->cur.ptr.
               On return iter->cur.bytes (always >= 1), iter->cur.wc_valid and,
               if wc_valid, iter->cur.wc are filled in and iter->next_done is
               set.  Does nothing when next_done is already set.
               This function does not compare cur.ptr against limit itself;
               mbi_avail performs that check before calling it.  */
     119    if (iter->next_done)
     120      return;
     121    if (iter->in_shift)
              /* A shift state is pending, so the ASCII shortcut below must
                 not be taken.  */
     122      goto with_shift;
     123    /* Handle most ASCII characters quickly, without calling mbrtowc().  */
     124    if (is_basic (*iter->cur.ptr))
     125      {
     126        /* These characters are part of the basic character set.  ISO C 99
     127           guarantees that their wide character code is identical to their
     128           char code.  */
     129        iter->cur.bytes = 1;
     130        iter->cur.wc = *iter->cur.ptr;
     131        iter->cur.wc_valid = true;
     132      }
     133    else
     134      {
     135        assert (mbsinit (&iter->state));
     136        iter->in_shift = true;
     137      with_shift:
     138        iter->cur.bytes = mbrtowc (&iter->cur.wc, iter->cur.ptr,
     139                                   iter->limit - iter->cur.ptr, &iter->state);
     140        if (iter->cur.bytes == (size_t) -1)
     141          {
     142            /* An invalid multibyte sequence was encountered.  Report its
                       first byte as a one-byte invalid character.  */
     143            iter->cur.bytes = 1;
     144            iter->cur.wc_valid = false;
     145            /* mbrtowc leaves the conversion state unspecified after an
     146               encoding error.  Restart from the initial state so that the
                       following characters are decoded from a well-defined state
                       and the ASCII shortcut becomes available again.  */
                    iter->in_shift = false;
                    memset (&iter->state, '\0', sizeof (mbstate_t));
     147          }
     148        else if (iter->cur.bytes == (size_t) -2)
     149          {
     150            /* An incomplete multibyte character at the end.  All the
                       remaining bytes form one invalid character.  */
     151            iter->cur.bytes = iter->limit - iter->cur.ptr;
     152            iter->cur.wc_valid = false;
     153            /* Whether to set iter->in_shift = false and reset iter->state
     154               or not is not important; the string end is reached anyway.  */
     155          }
     156        else
     157          {
     158            if (iter->cur.bytes == 0)
     159              {
     160                /* A null wide character was encountered.  mbrtowc reports
                           it as 0 bytes, but it occupies one byte in the string.  */
     161                iter->cur.bytes = 1;
     162                assert (*iter->cur.ptr == '\0');
     163                assert (iter->cur.wc == 0);
     164              }
     165            iter->cur.wc_valid = true;
     166  
     167            /* When in the initial state, we can go back treating ASCII
     168               characters more quickly.  */
     169            if (mbsinit (&iter->state))
     170              iter->in_shift = false;
     171          }
     172      }
     173    iter->next_done = true;
     174  }
     175  
     176  MBITER_INLINE void
     177  mbiter_multi_reloc (struct mbiter_multi *iter, ptrdiff_t ptrdiff)
     178  {
            /* Shift both pointers held by the iterator by PTRDIFF bytes, for
               use after the string being iterated has been moved by that
               amount.  No other field of *ITER is touched.  */
     179    iter->limit = iter->limit + ptrdiff;
     180    iter->cur.ptr = iter->cur.ptr + ptrdiff;
     181  }
     182  
     183  MBITER_INLINE void
     184  mbiter_multi_copy (struct mbiter_multi *new_iter, const struct mbiter_multi *old_iter)
     185  {
            /* Make *NEW_ITER a duplicate of *OLD_ITER.  The shift state is
               carried over only while the source is inside a shift sequence;
               otherwise the destination state is zero-filled.  The current
               character is transferred with mb_copy.  */
     186    new_iter->next_done = old_iter->next_done;
     187    new_iter->limit = old_iter->limit;
     188    if (!(new_iter->in_shift = old_iter->in_shift))
     189      memset (&new_iter->state, 0, sizeof new_iter->state);
     190    else
     191      memcpy (&new_iter->state, &old_iter->state, sizeof new_iter->state);
     192    mb_copy (&new_iter->cur, &old_iter->cur);
     193  }
     194  
     195  /* Iteration macros.  Each of them expands ITER more than once, so the
             argument must be an lvalue expression without side effects.  */
     196  typedef struct mbiter_multi mbi_iterator_t;
          /* mbi_init: point ITER at STARTPTR, set its limit LENGTH bytes further,
             leave the shift state zeroed (not in a shift sequence) and mark the
             current character as not yet decoded.  */
     197  #define mbi_init(iter, startptr, length) \
     198    ((iter).cur.ptr = (startptr), (iter).limit = (iter).cur.ptr + (length), \
     199     (iter).in_shift = false, memset (&(iter).state, '\0', sizeof (mbstate_t)), \
     200     (iter).next_done = false)
          /* mbi_avail: false once the current position has reached the limit;
             otherwise decodes the character at the current position (unless
             that was already done) and yields true.  */
     201  #define mbi_avail(iter) \
     202    ((iter).cur.ptr < (iter).limit && (mbiter_multi_next (&(iter)), true))
          /* mbi_advance: step over the current character.  It relies on
             cur.bytes, which is only valid after mbi_avail, and marks the new
             position as not yet decoded.  */
     203  #define mbi_advance(iter) \
     204    ((iter).cur.ptr += (iter).cur.bytes, (iter).next_done = false)
     205  
     206  /* Access to the current character.  mbi_cur expands to the iterator's
             'cur' member (a struct mbchar) and mbi_cur_ptr to the pointer to its
             first byte.  Apart from that pointer, the members of 'cur' are only
             valid after mbi_avail.  */
     207  #define mbi_cur(iter) (iter).cur
     208  #define mbi_cur_ptr(iter) (iter).cur.ptr
     209  
     210  /* Relocation.  Adjusts ITER after the string it iterates over has been
             moved by PTRDIFF bytes.  Both arguments are parenthesized in the
             expansion so that arbitrary expressions may be passed for them.  */
     211  #define mbi_reloc(iter, ptrdiff) mbiter_multi_reloc (&(iter), (ptrdiff))
     212  
     213  /* Copying an iterator.  mbi_copy is simply another name for the function
             mbiter_multi_copy, so both arguments are pointers:
             mbi_copy (&destiter, &srciter).  */
     214  #define mbi_copy mbiter_multi_copy
     215  
     216  _GL_INLINE_HEADER_END
     217  
     218  #endif /* _MBITER_H */