(root)/
bison-3.8.2/
lib/
mbfile.h
       1  /* Multibyte character I/O: macros for multi-byte encodings.
       2     Copyright (C) 2001, 2005, 2009-2021 Free Software Foundation, Inc.
       3  
       4     This file is free software: you can redistribute it and/or modify
       5     it under the terms of the GNU Lesser General Public License as
       6     published by the Free Software Foundation; either version 3 of the
       7     License, or (at your option) any later version.
       8  
       9     This file is distributed in the hope that it will be useful,
      10     but WITHOUT ANY WARRANTY; without even the implied warranty of
      11     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      12     GNU Lesser General Public License for more details.
      13  
      14     You should have received a copy of the GNU Lesser General Public License
      15     along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
      16  
      17  /* Written by Mitsuru Chinen <mchinen@yamato.ibm.com>
      18     and Bruno Haible <bruno@clisp.org>.  */
      19  
      20  /* The macros in this file implement multi-byte character input from a
      21     stream.
      22  
      23     mb_file_t
      24       is the type for multibyte character input stream, usable for variable
      25       declarations.
      26  
      27     mbf_char_t
      28       is the type for multibyte character or EOF, usable for variable
      29       declarations.
      30  
      31     mbf_init (mbf, stream)
      32       initializes the MB_FILE for reading from stream.
      33  
      34     mbf_getc (mbc, mbf)
      35       reads the next multibyte character from mbf and stores it in mbc.
      36  
      37     mb_iseof (mbc)
      38       returns true if mbc represents the EOF value.
      39  
      40     Here are the function prototypes of the macros.
      41  
      42     extern void          mbf_init (mb_file_t mbf, FILE *stream);
      43     extern void          mbf_getc (mbf_char_t mbc, mb_file_t mbf);
      44     extern bool          mb_iseof (const mbf_char_t mbc);
      45   */
      46  
      47  #ifndef _MBFILE_H
      48  #define _MBFILE_H 1
      49  
      50  #include <assert.h>
      51  #include <stdbool.h>
      52  #include <stdio.h>
      53  #include <string.h>
      54  #include <wchar.h>
      55  
      56  #include "mbchar.h"
      57  
/* _GL_INLINE_HEADER_BEGIN is expected to come from config.h; refuse to
   compile if it is missing rather than fail later in a confusing way.  */
#ifndef _GL_INLINE_HEADER_BEGIN
 #error "Please include config.h first."
#endif
_GL_INLINE_HEADER_BEGIN
/* MBFILE_INLINE is the storage/inline specifier used for the functions
   defined below.  It defaults to _GL_INLINE unless it was already defined
   before this header was included.  */
#ifndef MBFILE_INLINE
# define MBFILE_INLINE _GL_INLINE
#endif
      65  
/* State of a multibyte character input stream.  Initialize with mbf_init,
   read with mbf_getc, push back with mbf_ungetc.  */
struct mbfile_multi {
  FILE *fp;                     /* underlying stream, read with getc */
  bool eof_seen;                /* true once getc on fp has returned EOF */
  bool have_pushback;           /* true while 'pushback' holds a character */
  mbstate_t state;              /* conversion state passed to mbrtowc */
  unsigned int bufcount;        /* number of bytes currently held in buf */
  char buf[MBCHAR_BUF_SIZE];    /* bytes read from fp but not yet returned */
  struct mbchar pushback;       /* character stored by mbfile_multi_ungetc */
};
      75  
      76  MBFILE_INLINE void
      77  mbfile_multi_getc (struct mbchar *mbc, struct mbfile_multi *mbf)
      78  {
      79    size_t bytes;
      80  
      81    /* If EOF has already been seen, don't use getc.  This matters if
      82       mbf->fp is connected to an interactive tty.  */
      83    if (mbf->eof_seen)
      84      goto eof;
      85  
      86    /* Return character pushed back, if there is one.  */
      87    if (mbf->have_pushback)
      88      {
      89        mb_copy (mbc, &mbf->pushback);
      90        mbf->have_pushback = false;
      91        return;
      92      }
      93  
      94    /* Before using mbrtowc, we need at least one byte.  */
      95    if (mbf->bufcount == 0)
      96      {
      97        int c = getc (mbf->fp);
      98        if (c == EOF)
      99          {
     100            mbf->eof_seen = true;
     101            goto eof;
     102          }
     103        mbf->buf[0] = (unsigned char) c;
     104        mbf->bufcount++;
     105      }
     106  
     107    /* Handle most ASCII characters quickly, without calling mbrtowc().  */
     108    if (mbf->bufcount == 1 && mbsinit (&mbf->state) && is_basic (mbf->buf[0]))
     109      {
     110        /* These characters are part of the basic character set.  ISO C 99
     111           guarantees that their wide character code is identical to their
     112           char code.  */
     113        mbc->wc = mbc->buf[0] = mbf->buf[0];
     114        mbc->wc_valid = true;
     115        mbc->ptr = &mbc->buf[0];
     116        mbc->bytes = 1;
     117        mbf->bufcount = 0;
     118        return;
     119      }
     120  
     121    /* Use mbrtowc on an increasing number of bytes.  Read only as many bytes
     122       from mbf->fp as needed.  This is needed to give reasonable interactive
     123       behaviour when mbf->fp is connected to an interactive tty.  */
     124    for (;;)
     125      {
     126        /* We don't know whether the 'mbrtowc' function updates the state when
     127           it returns -2, - this is the ISO C 99 and glibc-2.2 behaviour - or
     128           not - amended ANSI C, glibc-2.1 and Solaris 2.7 behaviour.  We
     129           don't have an autoconf test for this, yet.
     130           The new behaviour would allow us to feed the bytes one by one into
     131           mbrtowc.  But the old behaviour forces us to feed all bytes since
     132           the end of the last character into mbrtowc.  Since we want to retry
     133           with more bytes when mbrtowc returns -2, we must backup the state
     134           before calling mbrtowc, because implementations with the new
     135           behaviour will clobber it.  */
     136        mbstate_t backup_state = mbf->state;
     137  
     138        bytes = mbrtowc (&mbc->wc, &mbf->buf[0], mbf->bufcount, &mbf->state);
     139  
     140        if (bytes == (size_t) -1)
     141          {
     142            /* An invalid multibyte sequence was encountered.  */
     143            /* Return a single byte.  */
     144            bytes = 1;
     145            mbc->wc_valid = false;
     146            break;
     147          }
     148        else if (bytes == (size_t) -2)
     149          {
     150            /* An incomplete multibyte character.  */
     151            mbf->state = backup_state;
     152            if (mbf->bufcount == MBCHAR_BUF_SIZE)
     153              {
     154                /* An overlong incomplete multibyte sequence was encountered.  */
     155                /* Return a single byte.  */
     156                bytes = 1;
     157                mbc->wc_valid = false;
     158                break;
     159              }
     160            else
     161              {
     162                /* Read one more byte and retry mbrtowc.  */
     163                int c = getc (mbf->fp);
     164                if (c == EOF)
     165                  {
     166                    /* An incomplete multibyte character at the end.  */
     167                    mbf->eof_seen = true;
     168                    bytes = mbf->bufcount;
     169                    mbc->wc_valid = false;
     170                    break;
     171                  }
     172                mbf->buf[mbf->bufcount] = (unsigned char) c;
     173                mbf->bufcount++;
     174              }
     175          }
     176        else
     177          {
     178            if (bytes == 0)
     179              {
     180                /* A null wide character was encountered.  */
     181                bytes = 1;
     182                assert (mbf->buf[0] == '\0');
     183                assert (mbc->wc == 0);
     184              }
     185            mbc->wc_valid = true;
     186            break;
     187          }
     188      }
     189  
     190    /* Return the multibyte sequence mbf->buf[0..bytes-1].  */
     191    mbc->ptr = &mbc->buf[0];
     192    memcpy (&mbc->buf[0], &mbf->buf[0], bytes);
     193    mbc->bytes = bytes;
     194  
     195    mbf->bufcount -= bytes;
     196    if (mbf->bufcount > 0)
     197      {
     198        /* It's not worth calling memmove() for so few bytes.  */
     199        unsigned int count = mbf->bufcount;
     200        char *p = &mbf->buf[0];
     201  
     202        do
     203          {
     204            *p = *(p + bytes);
     205            p++;
     206          }
     207        while (--count > 0);
     208      }
     209    return;
     210  
     211  eof:
     212    /* An mbchar_t with bytes == 0 is used to indicate EOF.  */
     213    mbc->ptr = NULL;
     214    mbc->bytes = 0;
     215    mbc->wc_valid = false;
     216    return;
     217  }
     218  
     219  MBFILE_INLINE void
     220  mbfile_multi_ungetc (const struct mbchar *mbc, struct mbfile_multi *mbf)
     221  {
     222    mb_copy (&mbf->pushback, mbc);
     223    mbf->have_pushback = true;
     224  }
     225  
/* Type of a multibyte character input stream, usable for variable
   declarations.  */
typedef struct mbfile_multi mb_file_t;

/* Type of a value stored by mbf_getc: a multibyte character, or the EOF
   value recognized by mb_iseof.  */
typedef mbchar_t mbf_char_t;
     229  
/* Initialize MBF for reading from STREAM: no EOF seen, no pushed-back
   character, zeroed conversion state, empty byte buffer.
   MBF must be an lvalue of type mb_file_t; it is evaluated several times,
   so it must not have side effects.  */
#define mbf_init(mbf, stream)                                           \
  ((mbf).fp = (stream),                                                 \
   (mbf).eof_seen = false,                                              \
   (mbf).have_pushback = false,                                         \
   memset (&(mbf).state, '\0', sizeof (mbstate_t)),                     \
   (mbf).bufcount = 0)
     236  
/* Read the next multibyte character from MBF and store it in MBC.
   Both arguments are lvalues; their addresses are passed on.  */
#define mbf_getc(mbc, mbf) mbfile_multi_getc (&(mbc), &(mbf))
     238  
/* Push the multibyte character MBC back onto MBF.
   Both arguments are lvalues; their addresses are passed on.  */
#define mbf_ungetc(mbc, mbf) mbfile_multi_ungetc (&(mbc), &(mbf))
     240  
/* True if MBC represents the EOF value, i.e. a character with zero bytes.  */
#define mb_iseof(mbc) ((mbc).bytes == 0)
     242  
     243  _GL_INLINE_HEADER_END
     244  
     245  #endif /* _MBFILE_H */