(root)/
glibc-2.38/
locale/
gen-translit.py
       1  #!/usr/bin/python3
       2  # Generate the locale/C-translit.h file.
       3  # Copyright (C) 2018-2023 Free Software Foundation, Inc.
       4  # This file is part of the GNU C Library.
       5  #
       6  # The GNU C Library is free software; you can redistribute it and/or
       7  # modify it under the terms of the GNU Lesser General Public
       8  # License as published by the Free Software Foundation; either
       9  # version 2.1 of the License, or (at your option) any later version.
      10  #
      11  # The GNU C Library is distributed in the hope that it will be useful,
      12  # but WITHOUT ANY WARRANTY; without even the implied warranty of
      13  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
      14  # Lesser General Public License for more details.
      15  #
      16  # You should have received a copy of the GNU Lesser General Public
      17  # License along with the GNU C Library; if not, see
      18  # <https://www.gnu.org/licenses/>.
      19  
      20  import re
      21  import sys
      22  
      23  
      24  class ESC[4;38;5;81mStringLiteral:
      25      "Source of a string literal and its decomposition into code points."
      26      def __init__(self, s):
      27          # States:
      28          #  0 regular character sequence
      29          #  1 backslash seen
      30          #  2 in hexadecimal escape sequence
      31          state = 0
      32          result = []
      33          for ch in s:
      34              if state == 0:
      35                  if ch == '\\':
      36                      state = 1
      37                  else:
      38                      result.append(ord(ch))
      39              elif state == 1:
      40                  if ch in "\\\"":
      41                      result.append(ord(ch))
      42                      state = 0
      43                  elif ch == 'x':
      44                      state = 2
      45                      result.append(0)
      46                  else:
      47                      raise ValueError("invalid character {!r} in {!r}".format(
      48                          ch, s))
      49              elif state == 2:
      50                  if ch in "0123456789abcdefABCDEF":
      51                      result[-1] = result[-1] * 16 + int(ch, 16)
      52                  else:
      53                      if ch == '\\':
      54                          state = 1
      55                      else:
      56                          state = 0
      57          if state == 1:
      58              raise ValueError("trailing backslash in {!r}".format(s))
      59  
      60          self.source = s
      61          self.decoded = tuple(result)
      62  
      63  
      64  class ESC[4;38;5;81mTranslit:
      65      "Pair of transliteration and source."
      66  
      67      __RE_TRANSLIT = re.compile(
      68          r'^"((?:[^"\\]|\\x[0-9a-fA-F])+)"\s+'
      69          r'"((?:[^"\\]|\\["\\])*)"\s*(?:#.*)?$')
      70  
      71      def __init__(self, line):
      72          match = self.__RE_TRANSLIT.match(line)
      73          if not match:
      74              raise IOError("invalid line {}: {!r}".format(
      75                  lineno + 1, line))
      76          codepoints, replacement = match.groups()
      77          self.codepoints = StringLiteral(codepoints)
      78          self.replacement = StringLiteral(replacement)
      79  
      80  
      81  # List of Translit objects.
      82  translits = []
      83  
      84  # Read transliterations from standard input.
      85  for lineno, line in enumerate(sys.stdin):
      86      line = line.strip()
      87      # Skip empty lines and comments.
      88      if (not line) or line[0] == '#':
      89          continue
      90      translit = Translit(line)
      91      # Check ordering of codepoints.
      92      if translits \
      93         and translit.codepoints.decoded <= translits[-1].codepoints.decoded:
      94          raise IOError("unexpected codepoint {!r} on line {}: {!r}".format(
      95              translit.codepoints.decoded, lineno + 1, line))
      96      translits.append(translit)
      97  
      98  # Generate the C sources.
      99  write = sys.stdout.write
     100  write("#include <stdint.h>\n")
     101  write("#define NTRANSLIT {}\n".format(len(translits)))
     102  
     103  write("static const uint32_t translit_from_idx[] =\n{\n  ")
     104  col = 2
     105  total = 0
     106  for translit in translits:
     107      if total > 0:
     108          if col + 7 >= 79:
     109              write(",\n  ")
     110              col = 2
     111          else:
     112              write(", ")
     113              col += 2
     114      write("{:4}".format(total))
     115      total += len(translit.codepoints.decoded) + 1
     116      col += 4
     117  write("\n};\n")
     118  
     119  write("static const wchar_t translit_from_tbl[] =\n ")
     120  col = 1
     121  first = True
     122  for translit in translits:
     123      if first:
     124          first = False
     125      else:
     126          if col + 6 >= 79:
     127              write("\n ")
     128              col = 1
     129          write(" L\"\\0\"")
     130          col += 6
     131      if col > 2 and col + len(translit.codepoints.source) + 4 >= 79:
     132          write("\n  ")
     133          col = 2
     134      else:
     135          write(" ")
     136          col += 1
     137      write("L\"{}\"".format(translit.codepoints.source))
     138      col += len(translit.codepoints.source) + 3
     139  write(";\n")
     140  
     141  write("static const uint32_t translit_to_idx[] =\n{\n  ")
     142  col = 2
     143  total = 0
     144  for translit in translits:
     145      if total > 0:
     146          if col + 7 >= 79:
     147              write(",\n  ")
     148              col = 2
     149          else:
     150              write(", ")
     151              col += 2
     152      write("{:4}".format(total))
     153      total += len(translit.replacement.decoded) + 2
     154      col += 4
     155  write("\n};\n")
     156  
     157  write("static const wchar_t translit_to_tbl[] =\n ")
     158  col = 1
     159  first = True
     160  for translit in translits:
     161      if first:
     162          first = False
     163      else:
     164          if col + 6 >= 79:
     165              write("\n ")
     166              col = 1
     167          write(" L\"\\0\"")
     168          col += 6
     169      if col > 2 and col + len(translit.replacement.source) + 6 >= 79:
     170          write("\n  ")
     171          col = 2
     172      else:
     173          write(" ")
     174          col += 1
     175      write("L\"{}\\0\"".format(translit.replacement.source))
     176      col += len(translit.replacement.source) + 5
     177  write(";\n")