1  /* locale information
       2  
       3     Copyright 2016-2023 Free Software Foundation, Inc.
       4  
       5     This program is free software; you can redistribute it and/or modify
       6     it under the terms of the GNU General Public License as published by
       7     the Free Software Foundation, either version 3, or (at your option)
       8     any later version.
       9  
      10     This program is distributed in the hope that it will be useful,
      11     but WITHOUT ANY WARRANTY; without even the implied warranty of
      12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      13     GNU General Public License for more details.
      14  
      15     You should have received a copy of the GNU General Public License
      16     along with this program; if not, write to the Free Software
      17     Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
      18     02110-1301, USA.  */
      19  
      20  /* Written by Paul Eggert.  */
      21  
      22  #include <config.h>
      23  
      24  #include <localeinfo.h>
      25  
      26  #include <verify.h>
      27  
      28  #include <limits.h>
      29  #include <locale.h>
      30  #include <stdlib.h>
      31  #include <string.h>
      32  #include <wctype.h>
      33  
      34  /* The sbclen implementation relies on this.  */
      35  verify (MB_LEN_MAX <= SCHAR_MAX);
      36  
      37  /* Return true if the locale uses UTF-8.  */
      38  
      39  static bool
      40  is_using_utf8 (void)
      41  {
      42    wchar_t wc;
      43    mbstate_t mbs = {0};
      44    return mbrtowc (&wc, "\xc4\x80", 2, &mbs) == 2 && wc == 0x100;
      45  }
      46  
      47  /* Return true if the locale is compatible enough with the C locale so
      48     that the locale is single-byte, bytes are in collating-sequence
      49     order, and there are no multi-character collating elements.  */
      50  
      51  static bool
      52  using_simple_locale (bool multibyte)
      53  {
      54    /* The native character set is known to be compatible with
      55       the C locale.  The following test isn't perfect, but it's good
      56       enough in practice, as only ASCII and EBCDIC are in common use
      57       and this test correctly accepts ASCII and rejects EBCDIC.  */
      58    enum { native_c_charset =
      59      ('\b' == 8 && '\t' == 9 && '\n' == 10 && '\v' == 11 && '\f' == 12
      60       && '\r' == 13 && ' ' == 32 && '!' == 33 && '"' == 34 && '#' == 35
      61       && '%' == 37 && '&' == 38 && '\'' == 39 && '(' == 40 && ')' == 41
      62       && '*' == 42 && '+' == 43 && ',' == 44 && '-' == 45 && '.' == 46
      63       && '/' == 47 && '0' == 48 && '9' == 57 && ':' == 58 && ';' == 59
      64       && '<' == 60 && '=' == 61 && '>' == 62 && '?' == 63 && 'A' == 65
      65       && 'Z' == 90 && '[' == 91 && '\\' == 92 && ']' == 93 && '^' == 94
      66       && '_' == 95 && 'a' == 97 && 'z' == 122 && '{' == 123 && '|' == 124
      67       && '}' == 125 && '~' == 126)
      68    };
      69  
      70    if (!native_c_charset || multibyte)
      71      return false;
      72  
      73    /* As a heuristic, use strcoll to compare native character order.
      74       If this agrees with byte order the locale should be simple.
      75       This heuristic should work for all known practical locales,
      76       although it would be invalid for artificially-constructed locales
      77       where the native order is the collating-sequence order but there
      78       are multi-character collating elements.  */
      79    for (int i = 0; i < UCHAR_MAX; i++)
      80      if (0 <= strcoll (((char []) {i, 0}), ((char []) {i + 1, 0})))
      81        return false;
      82  
      83    return true;
      84  }
      85  
      86  /* Initialize *LOCALEINFO from the current locale.  */
      87  
      88  void
      89  init_localeinfo (struct localeinfo *localeinfo)
      90  {
      91    localeinfo->multibyte = MB_CUR_MAX > 1;
      92    localeinfo->simple = using_simple_locale (localeinfo->multibyte);
      93    localeinfo->using_utf8 = is_using_utf8 ();
      94  
      95    for (int i = CHAR_MIN; i <= CHAR_MAX; i++)
      96      {
      97        char c = i;
      98        unsigned char uc = i;
      99        mbstate_t s = {0};
     100        wchar_t wc;
     101        size_t len = mbrtowc (&wc, &c, 1, &s);
     102        localeinfo->sbclen[uc] = len <= 1 ? 1 : - (int) - len;
     103        localeinfo->sbctowc[uc] = len <= 1 ? wc : WEOF;
     104      }
     105  }
     106  
     107  /* The set of wchar_t values C such that there's a useful locale
     108     somewhere where C != towupper (C) && C != towlower (towupper (C)).
     109     For example, 0x00B5 (U+00B5 MICRO SIGN) is in this table, because
     110     towupper (0x00B5) == 0x039C (U+039C GREEK CAPITAL LETTER MU), and
     111     towlower (0x039C) == 0x03BC (U+03BC GREEK SMALL LETTER MU).  */
     112  static short const lonesome_lower[] =
     113    {
     114      0x00B5, 0x0131, 0x017F, 0x01C5, 0x01C8, 0x01CB, 0x01F2, 0x0345,
     115      0x03C2, 0x03D0, 0x03D1, 0x03D5, 0x03D6, 0x03F0, 0x03F1,
     116  
     117      /* U+03F2 GREEK LUNATE SIGMA SYMBOL lacks a specific uppercase
     118         counterpart in locales predating Unicode 4.0.0 (April 2003).  */
     119      0x03F2,
     120  
     121      0x03F5, 0x1E9B, 0x1FBE,
     122    };
     123  
     124  /* Verify that the worst case fits.  This is 1 for towupper, 1 for
     125     towlower, and 1 for each entry in LONESOME_LOWER.  */
     126  verify (1 + 1 + sizeof lonesome_lower / sizeof *lonesome_lower
     127                 <= CASE_FOLDED_BUFSIZE);
     128  
     129  /* Find the characters equal to C after case-folding, other than C
     130     itself, and store them into FOLDED.  Return the number of characters
     131     stored; this is zero if C is WEOF.  */
     132  
     133  int
     134  case_folded_counterparts (wint_t c, wchar_t folded[CASE_FOLDED_BUFSIZE])
     135  {
     136    int i;
     137    int n = 0;
     138    wint_t uc = towupper (c);
     139    wint_t lc = towlower (uc);
     140    if (uc != c)
     141      folded[n++] = uc;
     142    if (lc != uc && lc != c && towupper (lc) == uc)
     143      folded[n++] = lc;
     144    for (i = 0; i < sizeof lonesome_lower / sizeof *lonesome_lower; i++)
     145      {
     146        wint_t li = lonesome_lower[i];
     147        if (li != lc && li != uc && li != c && towupper (li) == uc)
     148          folded[n++] = li;
     149      }
     150    return n;
     151  }