1  #!/usr/bin/env python3
       2  #
       3  # Script to generate tables for cpp_wcwidth, leveraging glibc's utf8_gen.py.
       4  #
       5  # This file is part of GCC.
       6  #
       7  # GCC is free software; you can redistribute it and/or modify it under
       8  # the terms of the GNU General Public License as published by the Free
       9  # Software Foundation; either version 3, or (at your option) any later
      10  # version.
      11  #
      12  # GCC is distributed in the hope that it will be useful, but WITHOUT ANY
      13  # WARRANTY; without even the implied warranty of MERCHANTABILITY or
      14  # FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
      15  # for more details.
      16  #
      17  # You should have received a copy of the GNU General Public License
      18  # along with GCC; see the file COPYING3.  If not see
# <http://www.gnu.org/licenses/>.
      20  
      21  import sys
      22  import os
      23  
      24  if len(sys.argv) != 2:
      25      print("usage: %s <unicode version>", file=sys.stderr)
      26      sys.exit(1)
      27  unicode_version = sys.argv[1]
      28  
      29  # Parse a codepoint in the format output by glibc tools.
      30  def parse_ucn(s):
      31      if not (s.startswith("<U") and s.endswith(">")):
      32          raise ValueError
      33      return int(s[2:-1], base=16)
      34  
      35  # Process a line of width output from utf_gen.py and update global array.
      36  widths = [1] * (1 + 0x10FFFF)
      37  def process_width(line):
      38      # Example lines:
      39      # <UA8FF>	0
      40      # <UA926>...<UA92D>	0
      41  
      42      s = line.split()
      43      width = int(s[1])
      44      r = s[0].split("...")
      45      if len(r) == 1:
      46          begin = parse_ucn(r[0])
      47          end = begin + 1
      48      elif len(r) == 2:
      49          begin = parse_ucn(r[0])
      50          end = parse_ucn(r[1]) + 1
      51      else:
      52          raise ValueError
      53      widths[begin:end] = [width] * (end - begin)
      54  
      55  # To keep things simple, we use glibc utf8_gen.py as-is.  It only outputs to a
      56  # file named UTF-8, which is not configurable.  Then we parse this into the form
      57  # we want it.
      58  os.system("from_glibc/utf8_gen.py --unicode_version %s" % unicode_version)
      59  processing = False
      60  for line in open("UTF-8", "r"):
      61      if processing:
      62          if line == "END WIDTH\n":
      63              processing = False
      64          else:
      65              try:
      66                  process_width(line)
      67              except (ValueError, IndexError):
      68                  print(e, "warning: ignored unexpected line: %s" % line,
      69                          file=sys.stderr, end="")
      70      elif line == "WIDTH\n":
      71          processing = True
      72  
      73  # All bytes < 256 we treat as width 1.
      74  widths[0:255] = [1] * 255
      75  
      76  # Condense the list to contiguous ranges.
      77  cur_range = [-1, 1]
      78  all_ranges = []
      79  for i, width in enumerate(widths):
      80      if width == cur_range[1]:
      81          cur_range[0] = i
      82      else:
      83          all_ranges.append(cur_range)
      84          cur_range = [i, width]
      85  
      86  # Output the arrays for generated_cpp_wcwidth.h
      87  print("/*  Generated by contrib/unicode/gen_wcwidth.py,",
      88            "with the help of glibc's")
      89  print("    utf8_gen.py, using version %s" % unicode_version,
      90            "of the Unicode standard.  */")
      91  print("\nstatic const cppchar_t wcwidth_range_ends[] = {", end="")
      92  for i, r in enumerate(all_ranges):
      93      if i % 8:
      94          print(" ", end="")
      95      else:
      96          print("\n  ", end="")
      97      print("0x%x," % (r[0]), end="")
      98  print("\n};\n")
      99  print("static const unsigned char wcwidth_widths[] = {", end="")
     100  for i, r in enumerate(all_ranges):
     101      if i % 24:
     102          print(" ", end="")
     103      else:
     104          print("\n  ", end="")
     105      print("%d," % r[1], end="")
     106  print("\n};")