1  /* Association between Unicode characters and their names.
       2     Copyright (C) 2000-2002, 2005-2007, 2009-2023 Free Software Foundation, Inc.
       3  
       4     This file is free software.
       5     It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+".
       6     You can redistribute it and/or modify it under either
       7       - the terms of the GNU Lesser General Public License as published
       8         by the Free Software Foundation, either version 3, or (at your
       9         option) any later version, or
      10       - the terms of the GNU General Public License as published by the
      11         Free Software Foundation; either version 2, or (at your option)
      12         any later version, or
      13       - the same dual license "the GNU LGPLv3+ or the GNU GPLv2+".
      14  
      15     This file is distributed in the hope that it will be useful,
      16     but WITHOUT ANY WARRANTY; without even the implied warranty of
      17     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
      18     Lesser General Public License and the GNU General Public License
      19     for more details.
      20  
      21     You should have received a copy of the GNU Lesser General Public
      22     License and of the GNU General Public License along with this
      23     program.  If not, see <https://www.gnu.org/licenses/>.  */
      24  
      25  #include <config.h>
      26  
      27  /* Specification.  */
      28  #include "uniname.h"
      29  
      30  #include <assert.h>
      31  #include <stdint.h>
      32  #include <stdio.h>
      33  #include <string.h>
      34  
      35  #include "attribute.h"
      36  
      37  #define SIZEOF(a) (sizeof(a) / sizeof(a[0]))
      38  
      39  
      40  /* Table of Unicode character names, derived from UnicodeData.txt.
      41     This table is generated in a way to minimize the memory footprint:
      42       1. its compiled size is small (less than 350 KB),
      43       2. it resides entirely in the text or read-only data segment of the
      44          executable or shared library: the table contains only immediate
      45          integers, no pointers, and the functions don't do heap allocation.
      46   */
      47  #include "uninames.h"
      48  /* It contains:
      49    static const char unicode_name_words[36303] = ...;
      50    #define UNICODE_CHARNAME_NUM_WORDS 6260
      51    static const struct { uint16_t extra_offset; uint16_t ind_offset; } unicode_name_by_length[26] = ...;
      52    #define UNICODE_CHARNAME_WORD_HANGUL 3902
      53    #define UNICODE_CHARNAME_WORD_SYLLABLE 4978
      54    #define UNICODE_CHARNAME_WORD_CJK 417
      55    #define UNICODE_CHARNAME_WORD_COMPATIBILITY 6107
      56    static const uint16_t unicode_names[68940] = ...;
      57    static const struct { uint16_t index; uint32_t name:24; } unicode_name_to_index[16626] = ...;
      58    static const struct { uint16_t index; uint32_t name:24; } unicode_index_to_name[16626] = ...;
      59    #define UNICODE_CHARNAME_MAX_LENGTH 83
      60    #define UNICODE_CHARNAME_MAX_WORDS 13
      61    static const struct { uint32_t index; uint32_t gap; uint16_t length; } unicode_ranges[401] = ...;
      62  */
      63  
      64  /* Returns the word with a given index.  */
      65  static const char *
      66  unicode_name_word (unsigned int index, unsigned int *lengthp)
      67  {
      68    unsigned int i1;
      69    unsigned int i2;
      70  
      71    assert (index < UNICODE_CHARNAME_NUM_WORDS);
      72  
      73    /* Binary search for i with
      74         unicode_name_by_length[i].ind_offset <= index
      75       and
      76         index < unicode_name_by_length[i+1].ind_offset
      77     */
      78  
      79    i1 = 0;
      80    i2 = SIZEOF (unicode_name_by_length) - 1;
      81    while (i2 - i1 > 1)
      82      {
      83        unsigned int i = (i1 + i2) >> 1;
      84        if (unicode_name_by_length[i].ind_offset <= index)
      85          i1 = i;
      86        else
      87          i2 = i;
      88      }
      89    unsigned int i = i1;
      90    assert (unicode_name_by_length[i].ind_offset <= index
      91            && index < unicode_name_by_length[i+1].ind_offset);
      92    *lengthp = i;
      93    return &unicode_name_words[unicode_name_by_length[i].extra_offset
      94                               + (index-unicode_name_by_length[i].ind_offset)*i];
      95  }
      96  
      97  /* Looks up the index of a word.  */
      98  static int
      99  unicode_name_word_lookup (const char *word, size_t length)
     100  {
     101    if (length > 0 && length < SIZEOF (unicode_name_by_length) - 1)
     102      {
     103        /* Binary search among the words of given length.  */
     104        unsigned int extra_offset = unicode_name_by_length[length].extra_offset;
     105        unsigned int i0 = unicode_name_by_length[length].ind_offset;
     106        unsigned int i1 = i0;
     107        unsigned int i2 = unicode_name_by_length[length+1].ind_offset;
     108        while (i2 - i1 > 0)
     109          {
     110            unsigned int i = (i1 + i2) >> 1;
     111            const char *p = &unicode_name_words[extra_offset + (i-i0)*length];
     112            const char *w = word;
     113            unsigned int n = length;
     114            for (;;)
     115              {
     116                if (*p < *w)
     117                  {
     118                    if (i1 == i)
     119                      return -1;
     120                    /* Note here: i1 < i < i2.  */
     121                    i1 = i;
     122                    break;
     123                  }
     124                if (*p > *w)
     125                  {
     126                    /* Note here: i1 <= i < i2.  */
     127                    i2 = i;
     128                    break;
     129                  }
     130                p++; w++; n--;
     131                if (n == 0)
     132                  return i;
     133              }
     134          }
     135      }
     136    return -1;
     137  }
     138  
     139  #define UNINAME_INVALID_INDEX UINT16_MAX
     140  
     141  /* Looks up the internal index of a Unicode character.  */
     142  static uint16_t
     143  unicode_code_to_index (ucs4_t c)
     144  {
     145    /* Binary search in unicode_ranges.  */
     146    unsigned int i1 = 0;
     147    unsigned int i2 = SIZEOF (unicode_ranges);
     148  
     149    for (;;)
     150      {
     151        unsigned int i = (i1 + i2) >> 1;
     152        ucs4_t start_code =
     153          unicode_ranges[i].index + unicode_ranges[i].gap;
     154        ucs4_t end_code =
     155          start_code + unicode_ranges[i].length - 1;
     156  
     157        if (start_code <= c && c <= end_code)
     158          return c - unicode_ranges[i].gap;
     159  
     160        if (end_code < c)
     161          {
     162            if (i1 == i)
     163              break;
     164            /* Note here: i1 < i < i2.  */
     165            i1 = i;
     166          }
     167        else if (c < start_code)
     168          {
     169            if (i2 == i)
     170              break;
     171            /* Note here: i1 <= i < i2.  */
     172            i2 = i;
     173          }
     174      }
     175    return UNINAME_INVALID_INDEX;
     176  }
     177  
     178  /* Looks up the codepoint of a Unicode character, from the given
     179     internal index.  */
     180  static ucs4_t
     181  unicode_index_to_code (uint16_t index)
     182  {
     183    /* Binary search in unicode_ranges.  */
     184    unsigned int i1 = 0;
     185    unsigned int i2 = SIZEOF (unicode_ranges);
     186  
     187    for (;;)
     188      {
     189        unsigned int i = (i1 + i2) >> 1;
     190        uint16_t start_index = unicode_ranges[i].index;
     191        uint16_t end_index = start_index + unicode_ranges[i].length - 1;
     192  
     193        if (start_index <= index && index <= end_index)
     194          return index + unicode_ranges[i].gap;
     195  
     196        if (end_index < index)
     197          {
     198            if (i1 == i)
     199              break;
     200            /* Note here: i1 < i < i2.  */
     201            i1 = i;
     202          }
     203        else if (index < start_index)
     204          {
     205            if (i2 == i)
     206              break;
     207            /* Note here: i1 <= i < i2.  */
     208            i2 = i;
     209          }
     210      }
     211    return UNINAME_INVALID;
     212  }
     213  
     214  
     215  /* Auxiliary tables for Hangul syllable names, see the Unicode 3.0 book,
     216     sections 3.11 and 4.4.  */
     217  static const char jamo_initial_short_name[19][3] =
     218  {
     219    "G", "GG", "N", "D", "DD", "R", "M", "B", "BB", "S", "SS", "", "J", "JJ",
     220    "C", "K", "T", "P", "H"
     221  };
     222  static const char jamo_medial_short_name[21][4] =
     223  {
     224    "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O", "WA", "WAE", "OE", "YO",
     225    "U", "WEO", "WE", "WI", "YU", "EU", "YI", "I"
     226  };
     227  static const char jamo_final_short_name[28][3] =
     228  {
     229    "", "G", "GG", "GS", "N", "NI", "NH", "D", "L", "LG", "LM", "LB", "LS", "LT",
     230    "LP", "LH", "M", "B", "BS", "S", "SS", "NG", "J", "C", "K", "T", "P", "H"
     231  };
     232  
     233  /* Looks up the name of a Unicode character, in uppercase ASCII.
     234     Returns the filled buf, or NULL if the character does not have a name.  */
     235  char *
     236  unicode_character_name (ucs4_t c, char *buf)
     237  {
     238    if (c >= 0xAC00 && c <= 0xD7A3)
     239      {
     240        /* Special case for Hangul syllables. Keeps the tables small.  */
     241        char *ptr;
     242        unsigned int tmp;
     243        unsigned int index1;
     244        unsigned int index2;
     245        unsigned int index3;
     246        const char *q;
     247  
     248        /* buf needs to have at least 16 + 7 + 1 bytes here.  */
     249        memcpy (buf, "HANGUL SYLLABLE ", 16);
     250        ptr = buf + 16;
     251  
     252        tmp = c - 0xAC00;
     253        index3 = tmp % 28; tmp = tmp / 28;
     254        index2 = tmp % 21; tmp = tmp / 21;
     255        index1 = tmp;
     256  
     257        q = jamo_initial_short_name[index1];
     258        while (*q != '\0')
     259          *ptr++ = *q++;
     260        q = jamo_medial_short_name[index2];
     261        while (*q != '\0')
     262          *ptr++ = *q++;
     263        q = jamo_final_short_name[index3];
     264        while (*q != '\0')
     265          *ptr++ = *q++;
     266        *ptr = '\0';
     267        return buf;
     268      }
     269    else if ((c >= 0xF900 && c <= 0xFA2D) || (c >= 0xFA30 && c <= 0xFA6A)
     270             || (c >= 0xFA70 && c <= 0xFAD9) || (c >= 0x2F800 && c <= 0x2FA1D))
     271      {
     272        /* Special case for CJK compatibility ideographs. Keeps the tables
     273           small.  */
     274        char *ptr;
     275        int i;
     276  
     277        /* buf needs to have at least 28 + 5 + 1 bytes here.  */
     278        memcpy (buf, "CJK COMPATIBILITY IDEOGRAPH-", 28);
     279        ptr = buf + 28;
     280  
     281        for (i = (c < 0x10000 ? 12 : 16); i >= 0; i -= 4)
     282          {
     283            unsigned int x = (c >> i) & 0xf;
     284            *ptr++ = (x < 10 ? '0' : 'A' - 10) + x;
     285          }
     286        *ptr = '\0';
     287        return buf;
     288      }
     289    else if ((c >= 0xFE00 && c <= 0xFE0F) || (c >= 0xE0100 && c <= 0xE01EF))
     290      {
     291        /* Special case for variation selectors. Keeps the tables
     292           small.  */
     293  
     294        /* buf needs to have at least 19 + 3 + 1 bytes here.  */
     295        sprintf (buf, "VARIATION SELECTOR-%u",
     296                 c <= 0xFE0F ? c - 0xFE00 + 1 : c - 0xE0100 + 17);
     297        return buf;
     298      }
     299    else
     300      {
     301        uint16_t index = unicode_code_to_index (c);
     302        const uint16_t *words = NULL;
     303  
     304        if (index != UNINAME_INVALID_INDEX)
     305          {
     306            /* Binary search in unicode_code_to_name.  */
     307            unsigned int i1 = 0;
     308            unsigned int i2 = SIZEOF (unicode_index_to_name);
     309            for (;;)
     310              {
     311                unsigned int i = (i1 + i2) >> 1;
     312                if (unicode_index_to_name[i].index == index)
     313                  {
     314                    words = &unicode_names[unicode_index_to_name[i].name];
     315                    break;
     316                  }
     317                else if (unicode_index_to_name[i].index < index)
     318                  {
     319                    if (i1 == i)
     320                      {
     321                        words = NULL;
     322                        break;
     323                      }
     324                    /* Note here: i1 < i < i2.  */
     325                    i1 = i;
     326                  }
     327                else if (unicode_index_to_name[i].index > index)
     328                  {
     329                    if (i2 == i)
     330                      {
     331                        words = NULL;
     332                        break;
     333                      }
     334                    /* Note here: i1 <= i < i2.  */
     335                    i2 = i;
     336                  }
     337              }
     338          }
     339        if (words != NULL)
     340          {
     341            /* Found it in unicode_index_to_name. Now concatenate the words.  */
     342            /* buf needs to have at least UNICODE_CHARNAME_MAX_LENGTH + 1
     343               bytes.  */
     344            char *ptr = buf;
     345            for (;;)
     346              {
     347                unsigned int wordlen;
     348                const char *word = unicode_name_word (*words>>1, &wordlen);
     349                do
     350                  *ptr++ = *word++;
     351                while (--wordlen > 0);
     352                if ((*words & 1) == 0)
     353                  break;
     354                *ptr++ = ' ';
     355                words++;
     356              }
     357            *ptr = '\0';
     358            return buf;
     359          }
     360        return NULL;
     361      }
     362  }
     363  
     364  /* Looks up the Unicode character with a given name, in upper- or lowercase
     365     ASCII.  Returns the character if found, or UNINAME_INVALID if not found.  */
     366  ucs4_t
     367  unicode_name_character (const char *name)
     368  {
     369    size_t len = strlen (name);
     370    if (len > 1 && len <= UNICODE_CHARNAME_MAX_LENGTH)
     371      {
     372        /* Test for "word1 word2 ..." syntax.  */
     373        char buf[UNICODE_CHARNAME_MAX_LENGTH];
     374        char *ptr = buf;
     375        for (;;)
     376          {
     377            char c = *name++;
     378            if (!(c >= ' ' && c <= '~'))
     379              break;
     380            *ptr++ = (c >= 'a' && c <= 'z' ? c - 'a' + 'A' : c);
     381            if (--len == 0)
     382              goto filled_buf;
     383          }
     384        if (false)
     385        filled_buf:
     386          {
     387            {
     388              /* Special case for variation selector aliases. Keeps the
     389                 tables small.  */
     390              const char *p1 = buf;
     391              if (ptr >= buf + 3 && *p1++ == 'V')
     392                {
     393                  if (*p1++ == 'S')
     394                    {
     395                      if (*p1 != '0')
     396                        {
     397                          unsigned int c = 0;
     398                          for (;;)
     399                            {
     400                              if (*p1 >= '0' && *p1 <= '9')
     401                                c += (*p1 - '0');
     402                              p1++;
     403                              if (p1 == ptr)
     404                                {
     405                                  if (c >= 1 && c <= 16)
     406                                    return c - 1 + 0xFE00;
     407                                  else if (c >= 17 && c <= 256)
     408                                    return c - 17 + 0xE0100;
     409                                  else
     410                                    break;
     411                                }
     412                              c = c * 10;
     413                            }
     414                        }
     415                    }
     416                }
     417            }
     418            {
     419              /* Convert the constituents to uint16_t words.  */
     420              uint16_t words[UNICODE_CHARNAME_MAX_WORDS];
     421              uint16_t *wordptr = words;
     422              {
     423                const char *p1 = buf;
     424                for (;;)
     425                  {
     426                    {
     427                      int word;
     428                      const char *p2 = p1;
     429                      while (p2 < ptr && *p2 != ' ')
     430                        p2++;
     431                      word = unicode_name_word_lookup (p1, p2 - p1);
     432                      if (word < 0)
     433                        break;
     434                      if (wordptr == &words[UNICODE_CHARNAME_MAX_WORDS])
     435                        break;
     436                      *wordptr++ = word;
     437                      if (p2 == ptr)
     438                        goto filled_words;
     439                      p1 = p2 + 1;
     440                    }
     441                    /* Special case for Hangul syllables. Keeps the tables small. */
     442                    if (wordptr == &words[2]
     443                        && words[0] == UNICODE_CHARNAME_WORD_HANGUL
     444                        && words[1] == UNICODE_CHARNAME_WORD_SYLLABLE)
     445                      {
     446                        /* Split the last word [p1..ptr) into three parts:
     447                             1) [BCDGHJKMNPRST]
     448                             2) [AEIOUWY]
     449                             3) [BCDGHIJKLMNPST]
     450                         */
     451                        const char *p2;
     452                        const char *p3;
     453                        const char *p4;
     454  
     455                        p2 = p1;
     456                        while (p2 < ptr
     457                               && (*p2 == 'B' || *p2 == 'C' || *p2 == 'D'
     458                                   || *p2 == 'G' || *p2 == 'H' || *p2 == 'J'
     459                                   || *p2 == 'K' || *p2 == 'M' || *p2 == 'N'
     460                                   || *p2 == 'P' || *p2 == 'R' || *p2 == 'S'
     461                                   || *p2 == 'T'))
     462                          p2++;
     463                        p3 = p2;
     464                        while (p3 < ptr
     465                               && (*p3 == 'A' || *p3 == 'E' || *p3 == 'I'
     466                                   || *p3 == 'O' || *p3 == 'U' || *p3 == 'W'
     467                                   || *p3 == 'Y'))
     468                          p3++;
     469                        p4 = p3;
     470                        while (p4 < ptr
     471                               && (*p4 == 'B' || *p4 == 'C' || *p4 == 'D'
     472                                   || *p4 == 'G' || *p4 == 'H' || *p4 == 'I'
     473                                   || *p4 == 'J' || *p4 == 'K' || *p4 == 'L'
     474                                   || *p4 == 'M' || *p4 == 'N' || *p4 == 'P'
     475                                   || *p4 == 'S' || *p4 == 'T'))
     476                          p4++;
     477                        if (p4 == ptr)
     478                          {
     479                            size_t n1 = p2 - p1;
     480                            size_t n2 = p3 - p2;
     481                            size_t n3 = p4 - p3;
     482  
     483                            if (n1 <= 2 && (n2 >= 1 && n2 <= 3) && n3 <= 2)
     484                              {
     485                                unsigned int index1;
     486  
     487                                for (index1 = 0; index1 < 19; index1++)
     488                                  if (memcmp (jamo_initial_short_name[index1], p1, n1) == 0
     489                                      && jamo_initial_short_name[index1][n1] == '\0')
     490                                    {
     491                                      unsigned int index2;
     492  
     493                                      for (index2 = 0; index2 < 21; index2++)
     494                                        if (memcmp (jamo_medial_short_name[index2], p2, n2) == 0
     495                                            && jamo_medial_short_name[index2][n2] == '\0')
     496                                          {
     497                                            unsigned int index3;
     498  
     499                                            for (index3 = 0; index3 < 28; index3++)
     500                                              if (memcmp (jamo_final_short_name[index3], p3, n3) == 0
     501                                                  && jamo_final_short_name[index3][n3] == '\0')
     502                                                {
     503                                                  return 0xAC00 + (index1 * 21 + index2) * 28 + index3;
     504                                                }
     505                                            break;
     506                                          }
     507                                      break;
     508                                    }
     509                              }
     510                          }
     511                      }
     512                    /* Special case for CJK compatibility ideographs. Keeps the
     513                       tables small.  */
     514                    if (wordptr == &words[2]
     515                        && words[0] == UNICODE_CHARNAME_WORD_CJK
     516                        && words[1] == UNICODE_CHARNAME_WORD_COMPATIBILITY
     517                        && p1 + 14 <= ptr
     518                        && p1 + 15 >= ptr
     519                        && memcmp (p1, "IDEOGRAPH-", 10) == 0)
     520                      {
     521                        const char *p2 = p1 + 10;
     522  
     523                        if (*p2 != '0')
     524                          {
     525                            unsigned int c = 0;
     526  
     527                            for (;;)
     528                              {
     529                                if (*p2 >= '0' && *p2 <= '9')
     530                                  c += (*p2 - '0');
     531                                else if (*p2 >= 'A' && *p2 <= 'F')
     532                                  c += (*p2 - 'A' + 10);
     533                                else
     534                                  break;
     535                                p2++;
     536                                if (p2 == ptr)
     537                                  {
     538                                    if ((c >= 0xF900 && c <= 0xFA2D)
     539                                        || (c >= 0xFA30 && c <= 0xFA6A)
     540                                        || (c >= 0xFA70 && c <= 0xFAD9)
     541                                        || (c >= 0x2F800 && c <= 0x2FA1D))
     542                                      return c;
     543                                    else
     544                                      break;
     545                                  }
     546                                c = c << 4;
     547                              }
     548                          }
     549                      }
     550                    /* Special case for variation selectors. Keeps the
     551                       tables small.  */
     552                    if (wordptr == &words[1]
     553                        && words[0] == UNICODE_CHARNAME_WORD_VARIATION
     554                        && p1 + 10 <= ptr
     555                        && p1 + 12 >= ptr
     556                        && memcmp (p1, "SELECTOR-", 9) == 0)
     557                      {
     558                        const char *p2 = p1 + 9;
     559  
     560                        if (*p2 != '0')
     561                          {
     562                            unsigned int c = 0;
     563  
     564                            for (;;)
     565                              {
     566                                if (*p2 >= '0' && *p2 <= '9')
     567                                  c += (*p2 - '0');
     568                                p2++;
     569                                if (p2 == ptr)
     570                                  {
     571                                    if (c >= 1 && c <= 16)
     572                                      return c - 1 + 0xFE00;
     573                                    else if (c >= 17 && c <= 256)
     574                                      return c - 17 + 0xE0100;
     575                                    else
     576                                      break;
     577                                  }
     578                                c = c * 10;
     579                              }
     580                          }
     581                      }
     582                  }
     583              }
     584              if (false)
     585              filled_words:
     586                {
     587                  /* Multiply by 2, to simplify later comparisons.  */
     588                  size_t words_length = wordptr - words;
     589                  {
     590                    size_t i = words_length - 1;
     591                    words[i] = 2 * words[i];
     592                    for (; i > 0; )
     593                      {
     594                        --i;
     595                        words[i] = 2 * words[i] + 1;
     596                      }
     597                  }
     598                  /* Binary search in unicode_name_to_index.  */
     599                  {
     600                    unsigned int i1 = 0;
     601                    unsigned int i2 = SIZEOF (unicode_name_to_index);
     602                    for (;;)
     603                      {
     604                        unsigned int i = (i1 + i2) >> 1;
     605                        const uint16_t *w = words;
     606                        const uint16_t *p = &unicode_names[unicode_name_to_index[i].name];
     607                        size_t n = words_length;
     608                        for (;;)
     609                          {
     610                            if (*p < *w)
     611                              {
     612                                if (i1 == i)
     613                                  goto name_not_found;
     614                                /* Note here: i1 < i < i2.  */
     615                                i1 = i;
     616                                break;
     617                              }
     618                            else if (*p > *w)
     619                              {
     620                                if (i2 == i)
     621                                  goto name_not_found;
     622                                /* Note here: i1 <= i < i2.  */
     623                                i2 = i;
     624                                break;
     625                              }
     626                            p++; w++; n--;
     627                            if (n == 0)
     628                              return unicode_index_to_code (unicode_name_to_index[i].index);
     629                          }
     630                      }
     631                  }
     632                name_not_found: ;
     633                }
     634            }
     635          }
     636      }
     637    return UNINAME_INVALID;
     638  }