#! /usr/bin/env python3
# This script generates token related files from Grammar/Tokens:
#
#   Doc/library/token-list.inc
#   Include/internal/pycore_token.h
#   Parser/token.c
#   Lib/token.py
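#
# A typical invocation is sketched below (an illustration, not taken from the
# build files; it assumes the script is run from the CPython source root, and
# the op names 'h', 'c', 'rst' and 'py' are dispatched to the make_* functions
# via main() at the bottom of this file):
#
#   python3 Tools/build/generate_token.py py Grammar/Tokens Lib/token.py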


SCRIPT_NAME = 'Tools/build/generate_token.py'
AUTO_GENERATED_BY_SCRIPT = f'Auto-generated by {SCRIPT_NAME}'
NT_OFFSET = 256

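# A sketch of the input format that load_tokens() below expects (the real
# data lives in Grammar/Tokens): one token name per line, optionally followed
# by the quoted operator string, with '#' starting a comment.  For a fragment
# such as
#
#   ENDMARKER
#   NAME
#   LPAR  '('
#   RPAR  ')'
#
# load_tokens() would return (['ENDMARKER', 'NAME', 'LPAR', 'RPAR'], None,
# {'(': 2, ')': 3}); ERRORTOKEN is None only because this fragment omits it.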
def load_tokens(path):
    """Parse a Grammar/Tokens-style file.

    Return (tok_names, ERRORTOKEN, string_to_tok): the ordered list of token
    names, the index of ERRORTOKEN (or None if absent), and a mapping from
    operator strings such as '(' to their token values.
    """
    tok_names = []
    string_to_tok = {}
    ERRORTOKEN = None
    with open(path) as fp:
        for line in fp:
            line = line.strip()
            # strip comments
            i = line.find('#')
            if i >= 0:
                line = line[:i].strip()
            if not line:
                continue
            fields = line.split()
            name = fields[0]
            value = len(tok_names)
            if name == 'ERRORTOKEN':
                ERRORTOKEN = value
            string = fields[1] if len(fields) > 1 else None
            if string:
                string = eval(string)   # strip the quoting, e.g. "'('" -> '('
                string_to_tok[string] = value
            tok_names.append(name)
    return tok_names, ERRORTOKEN, string_to_tok


def update_file(file, content):
    """Write content to file; return True if the file changed.

    If the file already holds exactly this content, leave it untouched and
    return False.
    """
    try:
        with open(file, 'r') as fobj:
            if fobj.read() == content:
                return False
    except (OSError, ValueError):
        pass
    with open(file, 'w') as fobj:
        fobj.write(content)
    return True


token_h_template = f"""\
/* {AUTO_GENERATED_BY_SCRIPT} */
"""
token_h_template += """\

/* Token types */
#ifndef Py_INTERNAL_TOKEN_H
#define Py_INTERNAL_TOKEN_H
#ifdef __cplusplus
extern "C" {
#endif

#ifndef Py_BUILD_CORE
#  error "this header requires Py_BUILD_CORE define"
#endif

#undef TILDE   /* Prevent clash of our definition with system macro. Ex AIX, ioctl.h */

%s\
#define N_TOKENS        %d
#define NT_OFFSET       %d

/* Special definitions for cooperation with parser */

#define ISTERMINAL(x)           ((x) < NT_OFFSET)
#define ISNONTERMINAL(x)        ((x) >= NT_OFFSET)
#define ISEOF(x)                ((x) == ENDMARKER)
#define ISWHITESPACE(x)         ((x) == ENDMARKER || \\
                                 (x) == NEWLINE   || \\
                                 (x) == INDENT    || \\
                                 (x) == DEDENT)
#define ISSTRINGLIT(x)          ((x) == STRING          || \\
                                 (x) == FSTRING_MIDDLE)


// Symbols exported for test_peg_generator
PyAPI_DATA(const char * const) _PyParser_TokenNames[]; /* Token names */
PyAPI_FUNC(int) _PyToken_OneChar(int);
PyAPI_FUNC(int) _PyToken_TwoChars(int, int);
PyAPI_FUNC(int) _PyToken_ThreeChars(int, int, int);

#ifdef __cplusplus
}
#endif
#endif  // !Py_INTERNAL_TOKEN_H
"""

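# make_h() below fills the %s slot of token_h_template with one #define per
# token name up to and including ERRORTOKEN.  For the usual Grammar/Tokens
# the first few lines come out roughly as follows (a sketch based on the
# "#define %-15s %d" format; the exact names and numbering are whatever the
# input file provides):
#
#   #define ENDMARKER       0
#   #define NAME            1
#   #define NUMBER          2
#   #define STRING          3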
def make_h(infile, outfile='Include/internal/pycore_token.h'):
    tok_names, ERRORTOKEN, string_to_tok = load_tokens(infile)

    defines = []
    for value, name in enumerate(tok_names[:ERRORTOKEN + 1]):
        defines.append("#define %-15s %d\n" % (name, value))

    if update_file(outfile, token_h_template % (
            ''.join(defines),
            len(tok_names),
            NT_OFFSET
        )):
        print("%s regenerated from %s" % (outfile, infile))


token_c_template = f"""\
/* {AUTO_GENERATED_BY_SCRIPT} */
"""
token_c_template += """\

#include "Python.h"
#include "pycore_token.h"

/* Token names */

const char * const _PyParser_TokenNames[] = {
%s\
};

/* Return the token corresponding to a single character */

int
_PyToken_OneChar(int c1)
{
%s\
    return OP;
}

int
_PyToken_TwoChars(int c1, int c2)
{
%s\
    return OP;
}

int
_PyToken_ThreeChars(int c1, int c2, int c3)
{
%s\
    return OP;
}
"""

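# generate_chars_to_token() turns a nested {char: ...: token name} mapping
# into a nested C switch statement.  For example (a sketch; the names are
# illustrative entries for the '**' and '*=' operators), the mapping
# {'*': {'*': 'DOUBLESTAR', '=': 'STAREQUAL'}} with n=1 renders as:
#
#     switch (c1) {
#     case '*':
#         switch (c2) {
#         case '*': return DOUBLESTAR;
#         case '=': return STAREQUAL;
#         }
#         break;
#     }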
def generate_chars_to_token(mapping, n=1):
    """Emit a C switch on c<n>, recursing for multi-character operators."""
    result = []
    write = result.append
    indent = '    ' * n
    write(indent)
    write('switch (c%d) {\n' % (n,))
    for c in sorted(mapping):
        write(indent)
        value = mapping[c]
        if isinstance(value, dict):
            # Another character follows: recurse into a nested switch.
            write("case '%s':\n" % (c,))
            write(generate_chars_to_token(value, n + 1))
            write(indent)
            write('    break;\n')
        else:
            write("case '%s': return %s;\n" % (c, value))
    write(indent)
    write('}\n')
    return ''.join(result)

def make_c(infile, outfile='Parser/token.c'):
    tok_names, ERRORTOKEN, string_to_tok = load_tokens(infile)
    # '<>' is tokenized exactly like '!='.
    string_to_tok['<>'] = string_to_tok['!=']
    chars_to_token = {}
    for string, value in string_to_tok.items():
        assert 1 <= len(string) <= 3
        name = tok_names[value]
        m = chars_to_token.setdefault(len(string), {})
        for c in string[:-1]:
            m = m.setdefault(c, {})
        m[string[-1]] = name

    names = []
    for value, name in enumerate(tok_names):
        if value >= ERRORTOKEN:
            name = '<%s>' % name
        names.append('    "%s",\n' % name)
    names.append('    "<N_TOKENS>",\n')

    if update_file(outfile, token_c_template % (
            ''.join(names),
            generate_chars_to_token(chars_to_token[1]),
            generate_chars_to_token(chars_to_token[2]),
            generate_chars_to_token(chars_to_token[3])
        )):
        print("%s regenerated from %s" % (outfile, infile))


token_inc_template = f"""\
.. {AUTO_GENERATED_BY_SCRIPT}
%s
.. data:: N_TOKENS

.. data:: NT_OFFSET
"""

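# make_rst() below expands the %s slot of token_inc_template into one
# ".. data::" directive per token; operator tokens also get a short
# description.  A sketch of the output for a single operator token (LPAR is
# just an illustrative name):
#
#   .. data:: LPAR
#
#      Token value for ``"("``.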
def make_rst(infile, outfile='Doc/library/token-list.inc'):
    tok_names, ERRORTOKEN, string_to_tok = load_tokens(infile)
    tok_to_string = {value: s for s, value in string_to_tok.items()}

    names = []
    for value, name in enumerate(tok_names[:ERRORTOKEN + 1]):
        names.append('.. data:: %s' % (name,))
        if value in tok_to_string:
            names.append('')
            names.append('   Token value for ``"%s"``.' % tok_to_string[value])
        names.append('')

    if update_file(outfile, token_inc_template % '\n'.join(names)):
        print("%s regenerated from %s" % (outfile, infile))


token_py_template = f'''\
"""Token constants."""
# {AUTO_GENERATED_BY_SCRIPT}
'''
token_py_template += '''
__all__ = ['tok_name', 'ISTERMINAL', 'ISNONTERMINAL', 'ISEOF']

%s
N_TOKENS = %d
# Special definitions for cooperation with parser
NT_OFFSET = %d

tok_name = {value: name
            for name, value in globals().items()
            if isinstance(value, int) and not name.startswith('_')}
__all__.extend(tok_name.values())

EXACT_TOKEN_TYPES = {
%s
}

def ISTERMINAL(x):
    return x < NT_OFFSET

def ISNONTERMINAL(x):
    return x >= NT_OFFSET

def ISEOF(x):
    return x == ENDMARKER
'''

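# make_py() below fills the first %s slot with constant definitions such as
# "ENDMARKER = 0" and the EXACT_TOKEN_TYPES slot with entries such as
# "    '!=': NOTEQUAL,"  (a sketch; the concrete names and values come from
# whatever Grammar/Tokens defines).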
def make_py(infile, outfile='Lib/token.py'):
    tok_names, ERRORTOKEN, string_to_tok = load_tokens(infile)

    constants = []
    for value, name in enumerate(tok_names):
        constants.append('%s = %d' % (name, value))
    constants.insert(ERRORTOKEN,
                     "# These aren't used by the C tokenizer but are needed for tokenize.py")

    token_types = []
    for s, value in sorted(string_to_tok.items()):
        token_types.append('    %r: %s,' % (s, tok_names[value]))

    if update_file(outfile, token_py_template % (
            '\n'.join(constants),
            len(tok_names),
            NT_OFFSET,
            '\n'.join(token_types),
        )):
        print("%s regenerated from %s" % (outfile, infile))


def main(op, infile='Grammar/Tokens', *args):
    # op selects one of the make_* functions above: 'h', 'c', 'rst' or 'py'.
    make = globals()['make_' + op]
    make(infile, *args)


if __name__ == '__main__':
    import sys
    main(*sys.argv[1:])