1  /* Determine display width of Unicode character.
       2     Copyright (C) 2001-2002, 2006-2023 Free Software Foundation, Inc.
       3     Written by Bruno Haible <bruno@clisp.org>, 2002.
       4  
       5     This file is free software: you can redistribute it and/or modify
       6     it under the terms of the GNU Lesser General Public License as
       7     published by the Free Software Foundation; either version 2.1 of the
       8     License, or (at your option) any later version.
       9  
      10     This file is distributed in the hope that it will be useful,
      11     but WITHOUT ANY WARRANTY; without even the implied warranty of
      12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      13     GNU Lesser General Public License for more details.
      14  
      15     You should have received a copy of the GNU Lesser General Public License
      16     along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
      17  
      18  #include <config.h>
      19  
      20  /* Specification.  */
      21  #include "uniwidth.h"
      22  
      23  #include "cjk.h"
      24  
      25  /* The non-spacing attribute table consists of:
      26     * Non-spacing characters; generated from PropList.txt or
      27       "grep '^[^;]*;[^;]*;[^;]*;[^;]*;NSM;' UnicodeData.txt"
      28     * Format control characters; generated from
      29       "grep '^[^;]*;[^;]*;Cf;' UnicodeData.txt"
      30     * Zero width characters; generated from
      31       "grep '^[^;]*;ZERO WIDTH ' UnicodeData.txt"
      32     * Hangul Jamo characters that have conjoining behaviour:
      33         - jungseong = syllable-middle vowels
      34         - jongseong = syllable-final consonants
      35       Rationale:
      36       1) These characters act like combining characters. They have no
      37       equivalent in legacy character sets. Therefore the EastAsianWidth.txt
      38       file does not really matter for them; UAX #11 East Asian Width
     <https://www.unicode.org/reports/tr11/> makes it clear that its focus
      40       is on compatibility with traditional Japanese layout.
      41       By contrast, the same glyphs without conjoining behaviour are available
      42       in the U+3130..U+318F block, and these characters are mapped to legacy
      43       character sets, and traditional Japanese layout matters for them.
      44       2) glibc does the same thing, see
      45       <https://sourceware.org/bugzilla/show_bug.cgi?id=21750>
      46       <https://sourceware.org/bugzilla/show_bug.cgi?id=26120>
      47   */
      48  #include "uniwidth/width0.h"
      49  
      50  #include "uniwidth/width2.h"
      51  #include "unictype/bitmap.h"
      52  
      53  #define SIZEOF(a) (sizeof(a) / sizeof(a[0]))
      54  
      55  
      56  /* Determine number of column positions required for UC.  */
      57  int
      58  uc_width (ucs4_t uc, const char *encoding)
      59  {
      60    /* Test for non-spacing or control character.  */
      61    if ((uc >> 9) < SIZEOF (nonspacing_table_ind))
      62      {
      63        int ind = nonspacing_table_ind[uc >> 9];
      64        if (ind >= 0)
      65          if ((nonspacing_table_data[64*ind + ((uc >> 3) & 63)] >> (uc & 7)) & 1)
      66            {
      67              if (uc > 0 && uc < 0xa0)
      68                return -1;
      69              else
      70                return 0;
      71            }
      72      }
      73    else if ((uc >> 9) == (0xe0000 >> 9))
      74      {
      75        if (uc >= 0xe0100)
      76          {
      77            if (uc <= 0xe01ef)
      78              return 0;
      79          }
      80        else
      81          {
      82            if (uc >= 0xe0020 ? uc <= 0xe007f : uc == 0xe0001)
      83              return 0;
      84          }
      85      }
      86    /* Test for double-width character.  */
      87    if (bitmap_lookup (&u_width2, uc))
      88      return 2;
      89    /* In ancient CJK encodings, Cyrillic and most other characters are
      90       double-width as well.  */
      91    if (uc >= 0x00A1 && uc < 0xFF61 && uc != 0x20A9
      92        && is_cjk_encoding (encoding))
      93      return 2;
      94    return 1;
      95  }