(root)/
grep-3.11/
lib/
mbiter.h
       1  /* Iterating through multibyte strings: macros for multi-byte encodings.
       2     Copyright (C) 2001, 2005, 2007, 2009-2023 Free Software Foundation, Inc.
       3  
       4     This file is free software: you can redistribute it and/or modify
       5     it under the terms of the GNU Lesser General Public License as
       6     published by the Free Software Foundation; either version 2.1 of the
       7     License, or (at your option) any later version.
       8  
       9     This file is distributed in the hope that it will be useful,
      10     but WITHOUT ANY WARRANTY; without even the implied warranty of
      11     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      12     GNU Lesser General Public License for more details.
      13  
      14     You should have received a copy of the GNU Lesser General Public License
      15     along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
      16  
      17  /* Written by Bruno Haible <bruno@clisp.org>.  */
      18  
      19  /* The macros in this file implement forward iteration through a
      20     multi-byte string.
      21  
      22     With these macros, an iteration loop that looks like
      23  
      24        char *iter;
      25        for (iter = buf; iter < buf + buflen; iter++)
      26          {
      27            do_something (*iter);
      28          }
      29  
      30     becomes
      31  
      32        mbi_iterator_t iter;
      33        for (mbi_init (iter, buf, buflen); mbi_avail (iter); mbi_advance (iter))
      34          {
      35            do_something (mbi_cur_ptr (iter), mb_len (mbi_cur (iter)));
      36          }
      37  
      38     The benefit of these macros over plain use of mbrtowc is:
      39     - Handling of invalid multibyte sequences is possible without
      40       making the code more complicated, while still preserving the
      41       invalid multibyte sequences.
      42  
      43     mbi_iterator_t
      44       is a type usable for variable declarations.
      45  
      46     mbi_init (iter, startptr, length)
      47       initializes the iterator, starting at startptr and crossing length bytes.
      48  
      49     mbi_avail (iter)
      50       returns true if there are more multibyte characters available before
      51       the end of string is reached. In this case, mbi_cur (iter) is
      52       initialized to the next multibyte character.
      53  
      54     mbi_advance (iter)
      55       advances the iterator by one multibyte character.
      56  
      57     mbi_cur (iter)
      58       returns the current multibyte character, of type mbchar_t.  All the
      59       macros defined in mbchar.h can be used on it.
      60  
      61     mbi_cur_ptr (iter)
      62       returns a pointer to the beginning of the current multibyte character.
      63  
      64     mbi_reloc (iter, ptrdiff)
      65       relocates iterator when the string is moved by ptrdiff bytes.
      66  
      67     mbi_copy (&destiter, &srciter)
      68       copies srciter to destiter.
      69  
      70     Here are the function prototypes of the macros.
      71  
      72     extern void          mbi_init (mbi_iterator_t iter,
      73                                    const char *startptr, size_t length);
      74     extern bool          mbi_avail (mbi_iterator_t iter);
      75     extern void          mbi_advance (mbi_iterator_t iter);
      76     extern mbchar_t      mbi_cur (mbi_iterator_t iter);
      77     extern const char *  mbi_cur_ptr (mbi_iterator_t iter);
      78     extern void          mbi_reloc (mbi_iterator_t iter, ptrdiff_t ptrdiff);
      79     extern void          mbi_copy (mbi_iterator_t *new, const mbi_iterator_t *old);
      80   */
      81  
      82  #ifndef _MBITER_H
      83  #define _MBITER_H 1
      84  
      85  /* This file uses _GL_INLINE_HEADER_BEGIN, _GL_INLINE.  */
      86  #if !_GL_CONFIG_H_INCLUDED
      87   #error "Please include config.h first."
      88  #endif
      89  
      90  #include <assert.h>
      91  #include <stddef.h>
      92  #include <string.h>
      93  #include <wchar.h>
      94  
      95  #include "mbchar.h"
      96  
      97  _GL_INLINE_HEADER_BEGIN
      98  #ifndef MBITER_INLINE
      99  # define MBITER_INLINE _GL_INLINE
     100  #endif
     101  
     102  struct mbiter_multi
     103  {
     104    const char *limit;    /* pointer to end of string */
     105    bool in_shift;        /* true if next byte may not be interpreted as ASCII */
     106    mbstate_t state;      /* if in_shift: current shift state */
     107    bool next_done;       /* true if mbi_avail has already filled the following */
     108    struct mbchar cur;    /* the current character:
     109          const char *cur.ptr             pointer to current character
     110          The following are only valid after mbi_avail.
     111          size_t cur.bytes                number of bytes of current character
     112          bool cur.wc_valid               true if wc is a valid wide character
     113          wchar_t cur.wc                  if wc_valid: the current character
     114          */
     115  };
     116  
     117  MBITER_INLINE void
     118  mbiter_multi_next (struct mbiter_multi *iter)
     119  {
     120    if (iter->next_done)
     121      return;
     122    if (iter->in_shift)
     123      goto with_shift;
     124    /* Handle most ASCII characters quickly, without calling mbrtowc().  */
     125    if (is_basic (*iter->cur.ptr))
     126      {
     127        /* These characters are part of the basic character set.  ISO C 99
     128           guarantees that their wide character code is identical to their
     129           char code.  */
     130        iter->cur.bytes = 1;
     131        iter->cur.wc = *iter->cur.ptr;
     132        iter->cur.wc_valid = true;
     133      }
     134    else
     135      {
     136        assert (mbsinit (&iter->state));
     137        iter->in_shift = true;
     138      with_shift:
     139        iter->cur.bytes = mbrtowc (&iter->cur.wc, iter->cur.ptr,
     140                                   iter->limit - iter->cur.ptr, &iter->state);
     141        if (iter->cur.bytes == (size_t) -1)
     142          {
     143            /* An invalid multibyte sequence was encountered.  */
     144            iter->cur.bytes = 1;
     145            iter->cur.wc_valid = false;
     146            /* Whether to set iter->in_shift = false and reset iter->state
     147               or not is not very important; the string is bogus anyway.  */
     148          }
     149        else if (iter->cur.bytes == (size_t) -2)
     150          {
     151            /* An incomplete multibyte character at the end.  */
     152            iter->cur.bytes = iter->limit - iter->cur.ptr;
     153            iter->cur.wc_valid = false;
     154            /* Whether to set iter->in_shift = false and reset iter->state
     155               or not is not important; the string end is reached anyway.  */
     156          }
     157        else
     158          {
     159            if (iter->cur.bytes == 0)
     160              {
     161                /* A null wide character was encountered.  */
     162                iter->cur.bytes = 1;
     163                assert (*iter->cur.ptr == '\0');
     164                assert (iter->cur.wc == 0);
     165              }
     166            iter->cur.wc_valid = true;
     167  
     168            /* When in the initial state, we can go back treating ASCII
     169               characters more quickly.  */
     170            if (mbsinit (&iter->state))
     171              iter->in_shift = false;
     172          }
     173      }
     174    iter->next_done = true;
     175  }
     176  
     177  MBITER_INLINE void
     178  mbiter_multi_reloc (struct mbiter_multi *iter, ptrdiff_t ptrdiff)
     179  {
     180    iter->cur.ptr += ptrdiff;
     181    iter->limit += ptrdiff;
     182  }
     183  
     184  MBITER_INLINE void
     185  mbiter_multi_copy (struct mbiter_multi *new_iter, const struct mbiter_multi *old_iter)
     186  {
     187    new_iter->limit = old_iter->limit;
     188    if ((new_iter->in_shift = old_iter->in_shift))
     189      memcpy (&new_iter->state, &old_iter->state, sizeof (mbstate_t));
     190    else
     191      memset (&new_iter->state, 0, sizeof (mbstate_t));
     192    new_iter->next_done = old_iter->next_done;
     193    mb_copy (&new_iter->cur, &old_iter->cur);
     194  }
     195  
     196  /* Iteration macros.  */
     197  typedef struct mbiter_multi mbi_iterator_t;
     198  #define mbi_init(iter, startptr, length) \
     199    ((iter).cur.ptr = (startptr), (iter).limit = (iter).cur.ptr + (length), \
     200     (iter).in_shift = false, memset (&(iter).state, '\0', sizeof (mbstate_t)), \
     201     (iter).next_done = false)
     202  #define mbi_avail(iter) \
     203    ((iter).cur.ptr < (iter).limit && (mbiter_multi_next (&(iter)), true))
     204  #define mbi_advance(iter) \
     205    ((iter).cur.ptr += (iter).cur.bytes, (iter).next_done = false)
     206  
     207  /* Access to the current character.  */
     208  #define mbi_cur(iter) (iter).cur
     209  #define mbi_cur_ptr(iter) (iter).cur.ptr
     210  
     211  /* Relocation.  */
     212  #define mbi_reloc(iter, ptrdiff) mbiter_multi_reloc (&iter, ptrdiff)
     213  
     214  /* Copying an iterator.  */
     215  #define mbi_copy mbiter_multi_copy
     216  
     217  _GL_INLINE_HEADER_END
     218  
     219  #endif /* _MBITER_H */