1  #!/usr/bin/env python3
       2  #
       3  # Script to dump a UTF-8 file as a list of numbered lines (mimicking GCC's
       4  # diagnostic output format), interleaved with lines per character showing
       5  # the Unicode codepoints, the UTF-8 encoding bytes, the name of the
       6  # character, and, where printable, the characters themselves.
       7  # The lines are printed in logical order, which may help the reader to grok
       8  # the relationship between visual and logical ordering in bi-di files.
       9  #
      10  # SPDX-License-Identifier: MIT
      11  #
      12  # Copyright (C) 2021 David Malcolm <dmalcolm@redhat.com>.
      13  #
      14  # Permission is hereby granted, free of charge, to any person obtaining a
      15  # copy of this software and associated documentation files (the "Software"),
      16  # to deal in the Software without restriction, including without limitation
      17  # the rights to use, copy, modify, merge, publish, distribute, sublicense,
      18  # and/or sell copies of the Software, and to permit persons to whom the
      19  # Software is furnished to do so, subject to the following conditions:
      20  #
      21  # The above copyright notice and this permission notice shall be included
      22  # in all copies or substantial portions of the Software.
      23  #
      24  # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
      25  # OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
      26  # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
      27  # IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
      28  # CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
      29  # OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE
      30  # OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
      31  
      32  import sys
      33  import unicodedata
      34  
      35  
      36  def get_name(ch):
      37      try:
      38          return unicodedata.name(ch)
      39      except ValueError:
      40          if ch == '\n':
      41              return 'LINE FEED (LF)'
      42          return '(unknown)'
      43  
      44  
      45  def get_printable(ch):
      46      cat = unicodedata.category(ch)
      47      if cat == 'Cc':
      48          return '(control character)'
      49      elif cat == 'Cf':
      50          return '(format control)'
      51      elif cat[0] == 'Z':
      52          return '(separator)'
      53      return ch
      54  
      55  
      56  def dump_file(f_in):
      57      line_num = 1
      58      for line in f_in:
      59          print('%4i | %s' % (line_num, line.rstrip()))
      60          for ch in line:
      61              utf8_desc = '%15s' % (' '.join(['0x%02x' % b
      62                                              for b in ch.encode('utf-8')]))
      63              print('%4s |   U+%04X %s %40s %s'
      64                    % ('', ord(ch), utf8_desc, get_name(ch), get_printable(ch)))
      65          line_num += 1
      66  
      67  
      68  with open(sys.argv[1], mode='r') as f_in:
      69      dump_file(f_in)