1  /* locale information
       2  
       3     Copyright 2016-2022 Free Software Foundation, Inc.
       4  
       5     This program is free software; you can redistribute it and/or modify
       6     it under the terms of the GNU General Public License as published by
       7     the Free Software Foundation, either version 3, or (at your option)
       8     any later version.
       9  
      10     This program is distributed in the hope that it will be useful,
      11     but WITHOUT ANY WARRANTY; without even the implied warranty of
      12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      13     GNU General Public License for more details.
      14  
      15     You should have received a copy of the GNU General Public License
      16     along with this program; if not, write to the Free Software
      17     Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
      18     02110-1301, USA.  */
      19  
      20  /* Written by Paul Eggert.  */
      21  
      22  #include <config.h>
      23  
      24  #include <localeinfo.h>
      25  
      26  #include <limits.h>
      27  #include <locale.h>
      28  #include <stdlib.h>
      29  #include <string.h>
      30  #include <wctype.h>
      31  
      32  /* The sbclen implementation relies on this.  */
      33  static_assert (MB_LEN_MAX <= SCHAR_MAX);
      34  
      35  /* Return true if the locale uses UTF-8.  */
      36  
      37  static bool
      38  is_using_utf8 (void)
      39  {
      40    wchar_t wc;
      41    mbstate_t mbs = {0};
      42    return mbrtowc (&wc, "\xc4\x80", 2, &mbs) == 2 && wc == 0x100;
      43  }
      44  
      45  /* Return true if the locale is compatible enough with the C locale so
      46     that the locale is single-byte, bytes are in collating-sequence
      47     order, and there are no multi-character collating elements.  */
      48  
      49  static bool
      50  using_simple_locale (bool multibyte)
      51  {
      52    /* The native character set is known to be compatible with
      53       the C locale.  The following test isn't perfect, but it's good
      54       enough in practice, as only ASCII and EBCDIC are in common use
      55       and this test correctly accepts ASCII and rejects EBCDIC.  */
      56    enum { native_c_charset =
      57      ('\b' == 8 && '\t' == 9 && '\n' == 10 && '\v' == 11 && '\f' == 12
      58       && '\r' == 13 && ' ' == 32 && '!' == 33 && '"' == 34 && '#' == 35
      59       && '%' == 37 && '&' == 38 && '\'' == 39 && '(' == 40 && ')' == 41
      60       && '*' == 42 && '+' == 43 && ',' == 44 && '-' == 45 && '.' == 46
      61       && '/' == 47 && '0' == 48 && '9' == 57 && ':' == 58 && ';' == 59
      62       && '<' == 60 && '=' == 61 && '>' == 62 && '?' == 63 && 'A' == 65
      63       && 'Z' == 90 && '[' == 91 && '\\' == 92 && ']' == 93 && '^' == 94
      64       && '_' == 95 && 'a' == 97 && 'z' == 122 && '{' == 123 && '|' == 124
      65       && '}' == 125 && '~' == 126)
      66    };
      67  
      68    if (!native_c_charset || multibyte)
      69      return false;
      70  
      71    /* As a heuristic, use strcoll to compare native character order.
      72       If this agrees with byte order the locale should be simple.
      73       This heuristic should work for all known practical locales,
      74       although it would be invalid for artificially-constructed locales
      75       where the native order is the collating-sequence order but there
      76       are multi-character collating elements.  */
      77    for (int i = 0; i < UCHAR_MAX; i++)
      78      if (0 <= strcoll (((char []) {i, 0}), ((char []) {i + 1, 0})))
      79        return false;
      80  
      81    return true;
      82  }
      83  
      84  /* Initialize *LOCALEINFO from the current locale.  */
      85  
      86  void
      87  init_localeinfo (struct localeinfo *localeinfo)
      88  {
      89    localeinfo->multibyte = MB_CUR_MAX > 1;
      90    localeinfo->simple = using_simple_locale (localeinfo->multibyte);
      91    localeinfo->using_utf8 = is_using_utf8 ();
      92  
      93    for (int i = CHAR_MIN; i <= CHAR_MAX; i++)
      94      {
      95        char c = i;
      96        unsigned char uc = i;
      97        mbstate_t s = {0};
      98        wchar_t wc;
      99        size_t len = mbrtowc (&wc, &c, 1, &s);
     100        localeinfo->sbclen[uc] = len <= 1 ? 1 : - (int) - len;
     101        localeinfo->sbctowc[uc] = len <= 1 ? wc : WEOF;
     102      }
     103  }
     104  
     105  /* The set of wchar_t values C such that there's a useful locale
     106     somewhere where C != towupper (C) && C != towlower (towupper (C)).
     107     For example, 0x00B5 (U+00B5 MICRO SIGN) is in this table, because
     108     towupper (0x00B5) == 0x039C (U+039C GREEK CAPITAL LETTER MU), and
     109     towlower (0x039C) == 0x03BC (U+03BC GREEK SMALL LETTER MU).  */
     110  static short const lonesome_lower[] =
     111    {
     112      0x00B5, 0x0131, 0x017F, 0x01C5, 0x01C8, 0x01CB, 0x01F2, 0x0345,
     113      0x03C2, 0x03D0, 0x03D1, 0x03D5, 0x03D6, 0x03F0, 0x03F1,
     114  
     115      /* U+03F2 GREEK LUNATE SIGMA SYMBOL lacks a specific uppercase
     116         counterpart in locales predating Unicode 4.0.0 (April 2003).  */
     117      0x03F2,
     118  
     119      0x03F5, 0x1E9B, 0x1FBE,
     120    };
     121  
     122  /* Verify that the worst case fits.  This is 1 for towupper, 1 for
     123     towlower, and 1 for each entry in LONESOME_LOWER.  */
     124  static_assert (1 + 1 + sizeof lonesome_lower / sizeof *lonesome_lower
     125                 <= CASE_FOLDED_BUFSIZE);
     126  
     127  /* Find the characters equal to C after case-folding, other than C
     128     itself, and store them into FOLDED.  Return the number of characters
     129     stored; this is zero if C is WEOF.  */
     130  
     131  int
     132  case_folded_counterparts (wint_t c, wchar_t folded[CASE_FOLDED_BUFSIZE])
     133  {
     134    int i;
     135    int n = 0;
     136    wint_t uc = towupper (c);
     137    wint_t lc = towlower (uc);
     138    if (uc != c)
     139      folded[n++] = uc;
     140    if (lc != uc && lc != c && towupper (lc) == uc)
     141      folded[n++] = lc;
     142    for (i = 0; i < sizeof lonesome_lower / sizeof *lonesome_lower; i++)
     143      {
     144        wint_t li = lonesome_lower[i];
     145        if (li != lc && li != uc && li != c && towupper (li) == uc)
     146          folded[n++] = li;
     147      }
     148    return n;
     149  }