1  /* qmark.c -- quote 'dangerous' filenames
       2     Derived from coreutils' ls.c.
       3     Copyright (C) 1985-2022 Free Software Foundation, Inc.
       4  
       5     This program is free software: you can redistribute it and/or modify
       6     it under the terms of the GNU General Public License as published by
       7     the Free Software Foundation, either version 3 of the License, or
       8     (at your option) any later version.
       9  
      10     This program is distributed in the hope that it will be useful,
      11     but WITHOUT ANY WARRANTY; without even the implied warranty of
      12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      13     GNU General Public License for more details.
      14  
      15     You should have received a copy of the GNU General Public License
      16     along with this program.  If not, see <https://www.gnu.org/licenses/>.
      17  */
      18  /* config.h must be included first. */
      19  #include <config.h>
      20  
      21  /* system headers. */
      22  #include <ctype.h>
      23  #include <stdlib.h>
      24  #include <string.h>
      25  #include <wchar.h>
      26  
      27  /* gnulib headers would go here if any needed to be included. */
      28  
      29  /* find headers. */
      30  #include "printquoted.h"
      31  
      32  
      33  
      34  /*
      35     This comment, IN_CTYPE_DOMAIN and ISPRINT were borrowed from
      36     coreutils at Sun Jun  5 21:17:40 2005 UTC.
      37  
      38     Jim Meyering writes:
      39  
      40     "... Some ctype macros are valid only for character codes that
      41     isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when
      42     using /bin/cc or gcc but without giving an ansi option).  So, all
      43     ctype uses should be through macros like ISPRINT...  If
      44     STDC_HEADERS is defined, then autoconf has verified that the ctype
      45     macros don't need to be guarded with references to isascii. ...
      46     Defining isascii to 1 should let any compiler worth its salt
      47     eliminate the && through constant folding."
      48  
      49     Bruno Haible adds:
      50  
      51     "... Furthermore, isupper(c) etc. have an undefined result if c is
      52     outside the range -1 <= c <= 255. One is tempted to write isupper(c)
      53     with c being of type `char', but this is wrong if c is an 8-bit
      54     character >= 128 which gets sign-extended to a negative value.
      55     The macro ISUPPER protects against this as well."
      56  
   (Actually that role of ISUPPER is now taken by to_uchar).
      58  */
      59  
/* IN_CTYPE_DOMAIN (c) guards the <ctype.h> classification macros: it is
   the constant 1 when STDC_HEADERS is set, and isascii (c) otherwise.  */
#if STDC_HEADERS
# define IN_CTYPE_DOMAIN(c) 1
#else
# define IN_CTYPE_DOMAIN(c) isascii(c)
#endif

/* ISPRINT is defined in <sys/euc.h> on at least Solaris2.6 systems,
   so any earlier definition is dropped before ours is supplied.  */
#undef ISPRINT
/* ISPRINT (c) is nonzero only when C passes both IN_CTYPE_DOMAIN and
   isprint.  The && short-circuits, so isprint is not evaluated when the
   domain test fails.  C appears twice in the expansion; pass an
   argument without side effects.  */
#define ISPRINT(c) (IN_CTYPE_DOMAIN (c) && isprint (c))
      69  
      70  
      71  
      72  
      73  
      74  /* Convert a possibly-signed character to an unsigned character.  This is
      75   * a bit safer than casting to unsigned char, since it catches some type
      76   * errors that the cast doesn't.
      77   *
      78   * This code taken from coreutils' system.h header at
      79   * Sun Jun  5 21:05:21 2005 UTC.
      80   */
      81  static inline unsigned char to_uchar (char ch)
      82  {
      83    return ch;
      84  }
      85  
      86  
      87  static size_t
      88  unibyte_qmark_chars (char *buf, size_t len)
      89  {
      90    char *p = buf;
      91    char const *plimit = buf + len;
      92  
      93    while (p < plimit)
      94      {
      95        if (! ISPRINT (to_uchar (*p)))
      96  	*p = '?';
      97        p++;
      98      }
      99    return len;
     100  }
     101  
     102  
     103  
     104  
     105  
     106  /* Scan BUF, replacing any dangerous-looking characters with question
     107   * marks.  This code is taken from the ls.c file in coreutils as at
     108   * Sun Jun  5 20:51:54 2005 UTC.
     109   *
     110   * This function may shrink the buffer.   Either way, the new length
     111   * is returned.
     112   */
     113  size_t
     114  qmark_chars (char *buf, size_t len)
     115  {
     116    if (MB_CUR_MAX <= 1)
     117      {
     118        return unibyte_qmark_chars (buf, len);
     119      }
     120    else
     121      {
     122        char const *p = buf;
     123        char const *plimit = buf + len;
     124        char *q = buf;
     125  
     126        while (p < plimit)
     127  	switch (*p)
     128  	  {
     129  	  case ' ': case '!': case '"': case '#': case '%':
     130  	  case '&': case '\'': case '(': case ')': case '*':
     131  	  case '+': case ',': case '-': case '.': case '/':
     132  	  case '0': case '1': case '2': case '3': case '4':
     133  	  case '5': case '6': case '7': case '8': case '9':
     134  	  case ':': case ';': case '<': case '=': case '>':
     135  	  case '?':
     136  	  case 'A': case 'B': case 'C': case 'D': case 'E':
     137  	  case 'F': case 'G': case 'H': case 'I': case 'J':
     138  	  case 'K': case 'L': case 'M': case 'N': case 'O':
     139  	  case 'P': case 'Q': case 'R': case 'S': case 'T':
     140  	  case 'U': case 'V': case 'W': case 'X': case 'Y':
     141  	  case 'Z':
     142  	  case '[': case '\\': case ']': case '^': case '_':
     143  	  case 'a': case 'b': case 'c': case 'd': case 'e':
     144  	  case 'f': case 'g': case 'h': case 'i': case 'j':
     145  	  case 'k': case 'l': case 'm': case 'n': case 'o':
     146  	  case 'p': case 'q': case 'r': case 's': case 't':
     147  	  case 'u': case 'v': case 'w': case 'x': case 'y':
     148  	  case 'z': case '{': case '|': case '}': case '~':
     149  	    /* These characters are printable ASCII characters.  */
     150  	    *q++ = *p++;
     151  	    break;
     152  	  default:
     153  	    /* If we have a multibyte sequence, copy it until we
     154  	       reach its end, replacing each non-printable multibyte
     155  	       character with a single question mark.  */
     156  	    {
     157  	      mbstate_t mbstate;
     158  	      memset (&mbstate, 0, sizeof mbstate);
     159  	      do
     160  		{
     161  		  wchar_t wc;
     162  		  size_t bytes;
     163  		  int w;
     164  
     165  		  bytes = mbrtowc (&wc, p, plimit - p, &mbstate);
     166  
     167  		  if (bytes == (size_t) -1)
     168  		    {
     169  		      /* An invalid multibyte sequence was
     170  			 encountered.  Skip one input byte, and
     171  			 put a question mark.  */
     172  		      p++;
     173  		      *q++ = '?';
     174  		      break;
     175  		    }
     176  
     177  		  if (bytes == (size_t) -2)
     178  		    {
     179  		      /* An incomplete multibyte character
     180  			 at the end.  Replace it entirely with
     181  			 a question mark.  */
     182  		      p = plimit;
     183  		      *q++ = '?';
     184  		      break;
     185  		    }
     186  
     187  		  if (bytes == 0)
     188  		    /* A null wide character was encountered.  */
     189  		    bytes = 1;
     190  
     191  		  w = wcwidth (wc);
     192  		  if (w >= 0)
     193  		    {
     194  		      /* A printable multibyte character.
     195  			 Keep it.  */
     196  		      for (; bytes > 0; --bytes)
     197  			*q++ = *p++;
     198  		    }
     199  		  else
     200  		    {
     201  		      /* An unprintable multibyte character.
     202  			 Replace it entirely with a question
     203  			 mark.  */
     204  		      p += bytes;
     205  		      *q++ = '?';
     206  		    }
     207  		}
     208  	      while (! mbsinit (&mbstate));
     209  	    }
     210  	    break;
     211  	  }
     212  
     213        /* The buffer may have shrunk.  */
     214        len = q - buf;
     215        return len;
     216      }
     217  }