(root)/
Python-3.12.0/
Tools/
build/
parse_html5_entities.py
       1  #!/usr/bin/env python3
       2  """
       3  Utility for parsing HTML5 entity definitions available from:
       4  
       5      https://html.spec.whatwg.org/entities.json
       6      https://html.spec.whatwg.org/multipage/named-characters.html
       7  
       8  The page now contains the following note:
       9  
      10      "This list is static and will not be expanded or changed in the future."
      11  
      12  Written by Ezio Melotti and Iuliia Proskurnia.
      13  """
      14  
      15  import os
      16  import sys
      17  import json
      18  from urllib.request import urlopen
      19  from html.entities import html5
      20  
# Path of this script as quoted in the "Generated by" line of the output.
SCRIPT_NAME = 'Tools/build/parse_html5_entities.py'
# Human-readable page listing the named character references; only cited
# in the generated header comment.
PAGE_URL = 'https://html.spec.whatwg.org/multipage/named-characters.html'
# JSON file that is downloaded and converted into the html5 dict; also
# cited in the generated header comment.
ENTITIES_URL = 'https://html.spec.whatwg.org/entities.json'
# First line of the generated section.  When patching, the line starting
# with this marker is where the old section begins and gets replaced.
HTML5_SECTION_START = '# HTML5 named character references'
      25  
      26  def get_json(url):
      27      """Download the json file from the url and returns a decoded object."""
      28      with urlopen(url) as f:
      29          data = f.read().decode('utf-8')
      30      return json.loads(data)
      31  
      32  def create_dict(entities):
      33      """Create the html5 dict from the decoded json object."""
      34      new_html5 = {}
      35      for name, value in entities.items():
      36          new_html5[name.lstrip('&')] = value['characters']
      37      return new_html5
      38  
      39  def compare_dicts(old, new):
      40      """Compare the old and new dicts and print the differences."""
      41      added = new.keys() - old.keys()
      42      if added:
      43          print('{} entitie(s) have been added:'.format(len(added)))
      44          for name in sorted(added):
      45              print('  {!r}: {!r}'.format(name, new[name]))
      46      removed = old.keys() - new.keys()
      47      if removed:
      48          print('{} entitie(s) have been removed:'.format(len(removed)))
      49          for name in sorted(removed):
      50              print('  {!r}: {!r}'.format(name, old[name]))
      51      changed = set()
      52      for name in (old.keys() & new.keys()):
      53          if old[name] != new[name]:
      54              changed.add((name, old[name], new[name]))
      55      if changed:
      56          print('{} entitie(s) have been modified:'.format(len(changed)))
      57          for item in sorted(changed):
      58              print('  {!r}: {!r} -> {!r}'.format(*item))
      59  
      60  def write_items(entities, file=sys.stdout):
      61      """Write the items of the dictionary in the specified file."""
      62      # The keys in the generated dictionary should be sorted
      63      # in a case-insensitive way, however, when two keys are equal,
      64      # the uppercase version should come first so that the result
      65      # looks like: ['Aacute', 'aacute', 'Aacute;', 'aacute;', ...]
      66      # To do this we first sort in a case-sensitive way (so all the
      67      # uppercase chars come first) and then sort with key=str.lower.
      68      # Since the sorting is stable the uppercase keys will eventually
      69      # be before their equivalent lowercase version.
      70      keys = sorted(entities.keys())
      71      keys = sorted(keys, key=str.lower)
      72      print(HTML5_SECTION_START, file=file)
      73      print(f'# Generated by {SCRIPT_NAME}\n'
      74            f'# from {ENTITIES_URL} and\n'
      75            f'# {PAGE_URL}.\n'
      76            f'# Map HTML5 named character references to the '
      77            f'equivalent Unicode character(s).', file=file)
      78      print('html5 = {', file=file)
      79      for name in keys:
      80          print(f'    {name!r}: {entities[name]!a},', file=file)
      81      print('}', file=file)
      82  
      83  
      84  if __name__ == '__main__':
      85      # without args print a diff between html.entities.html5 and new_html5
      86      # with --create print the new html5 dict
      87      # with --patch patch the Lib/html/entities.py file
      88      new_html5 = create_dict(get_json(ENTITIES_URL))
      89      if '--create' in sys.argv:
      90          write_items(new_html5)
      91      elif '--patch' in sys.argv:
      92          fname = 'Lib/html/entities.py'
      93          temp_fname = fname + '.temp'
      94          with open(fname) as f1, open(temp_fname, 'w') as f2:
      95              skip = False
      96              for line in f1:
      97                  if line.startswith(HTML5_SECTION_START):
      98                      write_items(new_html5, file=f2)
      99                      skip = True
     100                      continue
     101                  if skip:
     102                      # skip the old items until the }
     103                      if line.startswith('}'):
     104                          skip = False
     105                      continue
     106                  f2.write(line)
     107          os.remove(fname)
     108          os.rename(temp_fname, fname)
     109      else:
     110          if html5 == new_html5:
     111              print('The current dictionary is updated.')
     112          else:
     113              compare_dicts(html5, new_html5)
     114              print('Run "./python {0} --patch" to update Lib/html/entities.html '
     115                    'or "./python {0} --create" to see the generated ' 'dictionary.'.format(__file__))