1  #!/usr/bin/env python3
       2  #
       3  # Original script modified in November 2003 to take advantage of
       4  # the character-validation range routines, and updated to the
       5  # current Unicode information (Version 4.0.1)
       6  #
       7  # NOTE: there is an 'alias' facility for blocks which are not present in
       8  #	the current release, but are needed for ABI compatibility.  This
       9  #	must be accomplished MANUALLY!  Please see the comments below under
      10  #     'blockAliases'
      11  #
      12  import sys
      13  import string
      14  import time
      15  
      16  webpage = "http://www.unicode.org/Public/4.0-Update1/UCD-4.0.1.html"
      17  sources = "Blocks-4.0.1.txt UnicodeData-4.0.1.txt"
      18  
      19  #
      20  # blockAliases is a small hack - it is used for mapping block names which
      21  # were were used in the 3.1 release, but are missing or changed in the current
      22  # release.  The format is "OldBlockName:NewBlockName1[,NewBlockName2[,...]]"
      23  blockAliases = []
      24  blockAliases.append("CombiningMarksforSymbols:CombiningDiacriticalMarksforSymbols")
      25  blockAliases.append("Greek:GreekandCoptic")
      26  blockAliases.append("PrivateUse:PrivateUseArea,SupplementaryPrivateUseArea-A," + 
      27  	"SupplementaryPrivateUseArea-B")
      28  
      29  # minTableSize gives the minimum number of ranges which must be present
      30  # before a range table is produced.  If there are less than this
      31  # number, inline comparisons are generated
      32  minTableSize = 8
      33  
      34  (blockfile, catfile) = sources.split()
      35  
      36  
      37  #
      38  # Now process the "blocks" file, reducing it to a dictionary
      39  # indexed by blockname, containing a tuple with the applicable
      40  # block range
      41  #
      42  BlockNames = {}
      43  try:
      44      blocks = open(blockfile, "r")
      45  except:
      46      print("Missing %s, aborting ..." % blockfile)
      47      sys.exit(1)
      48  
      49  for line in blocks.readlines():
      50      if line[0] == '#':
      51          continue
      52      line = line.strip()
      53      if line == '':
      54          continue
      55      try:
      56          fields = line.split(';')
      57          range = fields[0].strip()
      58          (start, end) = range.split("..")
      59          name = fields[1].strip()
      60          name = name.replace(' ', '')
      61      except:
      62          print("Failed to process line: %s" % (line))
      63          continue
      64      start = "0x" + start
      65      end = "0x" + end
      66      try:
      67          BlockNames[name].append((start, end))
      68      except:
      69          BlockNames[name] = [(start, end)]
      70  blocks.close()
      71  print("Parsed %d blocks descriptions" % (len(BlockNames.keys())))
      72  
      73  for block in blockAliases:
      74      alias = block.split(':')
      75      alist = alias[1].split(',')
      76      for comp in alist:
      77          if comp in BlockNames:
      78              if alias[0] not in BlockNames:
      79                  BlockNames[alias[0]] = []
      80              for r in BlockNames[comp]:
      81                  BlockNames[alias[0]].append(r)
      82          else:
      83              print("Alias %s: %s not in Blocks" % (alias[0], comp))
      84              continue
      85  
      86  #
      87  # Next process the Categories file. This is more complex, since
      88  # the file is in code sequence, and we need to invert it.  We use
      89  # a dictionary with index category-name, with each entry containing
      90  # all the ranges (codepoints) of that category.  Note that category
      91  # names comprise two parts - the general category, and the "subclass"
      92  # within that category.  Therefore, both "general category" (which is
      93  # the first character of the 2-character category-name) and the full
      94  # (2-character) name are entered into this dictionary.
      95  #
      96  try:
      97      data = open(catfile, "r")
      98  except:
      99      print("Missing %s, aborting ..." % catfile)
     100      sys.exit(1)
     101  
     102  nbchar = 0;
     103  Categories = {}
     104  for line in data.readlines():
     105      if line[0] == '#':
     106          continue
     107      line = line.strip()
     108      if line == '':
     109          continue
     110      try:
     111          fields = line.split(';')
     112          point = fields[0].strip()
     113          value = 0
     114          while point != '':
     115              value = value * 16
     116              if point[0] >= '0' and point[0] <= '9':
     117                  value = value + ord(point[0]) - ord('0')
     118              elif point[0] >= 'A' and point[0] <= 'F':
     119                  value = value + 10 + ord(point[0]) - ord('A')
     120              elif point[0] >= 'a' and point[0] <= 'f':
     121                  value = value + 10 + ord(point[0]) - ord('a')
     122              point = point[1:]
     123          name = fields[2]
     124      except:
     125          print("Failed to process line: %s" % (line))
     126          continue
     127      
     128      nbchar = nbchar + 1
     129      # update entry for "full name"
     130      try:
     131          Categories[name].append(value)
     132      except:
     133          try:
     134              Categories[name] = [value]
     135          except:
     136              print("Failed to process line: %s" % (line))
     137      # update "general category" name
     138      try:
     139          Categories[name[0]].append(value)
     140      except:
     141          try:
     142              Categories[name[0]] = [value]
     143          except:
     144              print("Failed to process line: %s" % (line))
     145  
     146  blocks.close()
     147  print("Parsed %d char generating %d categories" % (nbchar, len(Categories.keys())))
     148  
     149  #
     150  # The data is now all read.  Time to process it into a more useful form.
     151  #
     152  # reduce the number list into ranges
     153  for cat in Categories.keys():
     154      list = Categories[cat]
     155      start = -1
     156      prev = -1
     157      end = -1
     158      ranges = []
     159      for val in list:
     160          if start == -1:
     161              start = val
     162              prev = val
     163              continue
     164          elif val == prev + 1:
     165              prev = val
     166              continue
     167          elif prev == start:
     168              ranges.append((prev, prev))
     169              start = val
     170              prev = val
     171              continue
     172          else:
     173              ranges.append((start, prev))
     174              start = val
     175              prev = val
     176              continue
     177      if prev == start:
     178          ranges.append((prev, prev))
     179      else:
     180          ranges.append((start, prev))
     181      Categories[cat] = ranges
     182  
     183  #
# Ensure all names are in alphabetic order, since the generated code
# performs binary searches on the tables.
     186  #
     187  bkeys = sorted(BlockNames.keys())
     188  
     189  ckeys = sorted(Categories.keys())
     190  
     191  #
     192  # Generate the resulting files
     193  #
     194  try:
     195      header = open("include/libxml/xmlunicode.h", "w")
     196  except:
     197      print("Failed to open include/libxml/xmlunicode.h")
     198      sys.exit(1)
     199  
     200  try:
     201      output = open("xmlunicode.c", "w")
     202  except:
     203      print("Failed to open xmlunicode.c")
     204      sys.exit(1)
     205  
     206  date = time.asctime(time.localtime(time.time()))
     207  
     208  header.write(
     209  """/*
     210   * Summary: Unicode character APIs
     211   * Description: API for the Unicode character APIs
     212   *
     213   * This file is automatically generated from the
     214   * UCS description files of the Unicode Character Database
     215   * %s
     216   * using the genUnicode.py Python script.
     217   *
     218   * Generation date: %s
     219   * Sources: %s
     220   * Author: Daniel Veillard
     221   */
     222  
     223  #ifndef __XML_UNICODE_H__
     224  #define __XML_UNICODE_H__
     225  
     226  #include <libxml/xmlversion.h>
     227  
     228  #ifdef LIBXML_UNICODE_ENABLED
     229  
     230  #ifdef __cplusplus
     231  extern "C" {
     232  #endif
     233  
     234  """ % (webpage, date, sources));
     235  
     236  output.write(
     237  """/*
     238   * xmlunicode.c: this module implements the Unicode character APIs
     239   *
     240   * This file is automatically generated from the
     241   * UCS description files of the Unicode Character Database
     242   * %s
     243   * using the genUnicode.py Python script.
     244   *
     245   * Generation date: %s
     246   * Sources: %s
     247   * Daniel Veillard <veillard@redhat.com>
     248   */
     249  
     250  #define IN_LIBXML
     251  #include "libxml.h"
     252  
     253  #ifdef LIBXML_UNICODE_ENABLED
     254  
     255  #include <string.h>
     256  #include <libxml/xmlversion.h>
     257  #include <libxml/xmlunicode.h>
     258  #include <libxml/chvalid.h>
     259  
     260  typedef int (xmlIntFunc)(int);	/* just to keep one's mind untwisted */
     261  
     262  typedef struct {
     263      const char *rangename;
     264      xmlIntFunc *func;
     265  } xmlUnicodeRange;
     266  
     267  typedef struct {
     268      const xmlUnicodeRange *table;
     269      int		    numentries;
     270  } xmlUnicodeNameTable;
     271  
     272  
     273  static xmlIntFunc *xmlUnicodeLookup(const xmlUnicodeNameTable *tptr, const char *tname);
     274  
     275  static const xmlUnicodeRange xmlUnicodeBlocks[] = {
     276  """ % (webpage, date, sources));
     277  
     278  flag = 0
     279  for block in bkeys:
     280      name = block.replace('-', '')
     281      if flag:
     282          output.write(',\n')
     283      else:
     284          flag = 1
     285      output.write('  {"%s", xmlUCSIs%s}' % (block, name))
     286  output.write('};\n\n')
     287  
     288  output.write('static const xmlUnicodeRange xmlUnicodeCats[] = {\n')
     289  flag = 0;
     290  for name in ckeys:
     291      if flag:
     292          output.write(',\n')
     293      else:
     294          flag = 1
     295      output.write('  {"%s", xmlUCSIsCat%s}' % (name, name))
     296  output.write('};\n\n')
     297  
     298  #
     299  # For any categories with more than minTableSize ranges we generate
     300  # a range table suitable for xmlCharInRange
     301  #
     302  for name in ckeys:
     303    if len(Categories[name]) > minTableSize:
     304      numshort = 0
     305      numlong = 0
     306      ranges = Categories[name]
     307      sptr = "NULL"
     308      lptr = "NULL"
     309      for range in ranges:
     310        (low, high) = range
     311        if high < 0x10000:
     312          if numshort == 0:
     313            pline = "static const xmlChSRange xml%sS[] = {" % name
     314            sptr = "xml%sS" % name
     315          else:
     316            pline += ","
     317          numshort += 1
     318        else:
     319          if numlong == 0:
     320            if numshort > 0:
     321              output.write(pline + " };\n")
     322            pline = "static const xmlChLRange xml%sL[] = {" % name
     323            lptr = "xml%sL" % name
     324          else:
     325            pline += ","
     326          numlong += 1
     327        if len(pline) > 60:
     328          output.write(pline + "\n")
     329          pline = "    "
     330        elif pline[-1:] == ",":
     331          pline += " "
     332        pline += "{%s, %s}" % (hex(low), hex(high))
     333      output.write(pline + " };\nstatic const xmlChRangeGroup xml%sG = {%s,%s,%s,%s};\n\n"
     334           % (name, numshort, numlong, sptr, lptr))
     335  
     336  
     337  output.write(
     338  """static const xmlUnicodeNameTable xmlUnicodeBlockTbl = {xmlUnicodeBlocks, %s};
     339  static const xmlUnicodeNameTable xmlUnicodeCatTbl = {xmlUnicodeCats, %s};
     340  
     341  /**
     342   * xmlUnicodeLookup:
     343   * @tptr: pointer to the name table
     344   * @name: name to be found
     345   *
     346   * binary table lookup for user-supplied name
     347   *
     348   * Returns pointer to range function if found, otherwise NULL
     349   */
     350  static xmlIntFunc
     351  *xmlUnicodeLookup(const xmlUnicodeNameTable *tptr, const char *tname) {
     352      int low, high, mid, cmp;
     353      const xmlUnicodeRange *sptr;
     354  
     355      if ((tptr == NULL) || (tname == NULL)) return(NULL);
     356  
     357      low = 0;
     358      high = tptr->numentries - 1;
     359      sptr = tptr->table;
     360      while (low <= high) {
     361  	mid = (low + high) / 2;
     362  	if ((cmp=strcmp(tname, sptr[mid].rangename)) == 0)
     363  	    return (sptr[mid].func);
     364  	if (cmp < 0)
     365  	    high = mid - 1;
     366  	else
     367  	    low = mid + 1;
     368      }
     369      return (NULL);
     370  }
     371  
     372  """ % (len(BlockNames), len(Categories)) )
     373  
     374  for block in bkeys:
     375      name = block.replace('-', '')
     376      header.write("XMLPUBFUN int xmlUCSIs%s\t(int code);\n" % name)
     377      output.write("/**\n * xmlUCSIs%s:\n * @code: UCS code point\n" % (name))
     378      output.write(" *\n * Check whether the character is part of %s UCS Block\n"%
     379                   (block))
     380      output.write(" *\n * Returns 1 if true 0 otherwise\n */\n");
     381      output.write("int\nxmlUCSIs%s(int code) {\n    return(" % name)
     382      flag = 0
     383      for (start, end) in BlockNames[block]:
     384          if flag:
     385              output.write(" ||\n           ")
     386          else:
     387              flag = 1
     388          output.write("((code >= %s) && (code <= %s))" % (start, end))
     389      output.write(");\n}\n\n")
     390  
     391  header.write("\nXMLPUBFUN int xmlUCSIsBlock\t(int code, const char *block);\n\n")
     392  output.write(
     393  """/**
     394   * xmlUCSIsBlock:
     395   * @code: UCS code point
     396   * @block: UCS block name
     397   *
     398   * Check whether the character is part of the UCS Block
     399   *
     400   * Returns 1 if true, 0 if false and -1 on unknown block
     401   */
     402  int
     403  xmlUCSIsBlock(int code, const char *block) {
     404      xmlIntFunc *func;
     405  
     406      func = xmlUnicodeLookup(&xmlUnicodeBlockTbl, block);
     407      if (func == NULL)
     408  	return (-1);
     409      return (func(code));
     410  }
     411  
     412  """)
     413  
     414  for name in ckeys:
     415      ranges = Categories[name]
     416      header.write("XMLPUBFUN int xmlUCSIsCat%s\t(int code);\n" % name)
     417      output.write("/**\n * xmlUCSIsCat%s:\n * @code: UCS code point\n" % (name))
     418      output.write(" *\n * Check whether the character is part of %s UCS Category\n"%
     419                   (name))
     420      output.write(" *\n * Returns 1 if true 0 otherwise\n */\n");
     421      output.write("int\nxmlUCSIsCat%s(int code) {\n" % name)
     422      if len(Categories[name]) > minTableSize:
     423          output.write("    return(xmlCharInRange((unsigned int)code, &xml%sG)"
     424              % name)
     425      else:
     426          start = 1
     427          for range in ranges:
     428              (begin, end) = range;
     429              if start:
     430                  output.write("    return(");
     431                  start = 0
     432              else:
     433                  output.write(" ||\n           ");
     434              if (begin == end):
     435                  output.write("(code == %s)" % (hex(begin)))
     436              else:
     437                  output.write("((code >= %s) && (code <= %s))" % (
     438                           hex(begin), hex(end)))
     439      output.write(");\n}\n\n")
     440  
     441  header.write("\nXMLPUBFUN int xmlUCSIsCat\t(int code, const char *cat);\n")
     442  output.write(
     443  """/**
     444   * xmlUCSIsCat:
     445   * @code: UCS code point
     446   * @cat: UCS Category name
     447   *
     448   * Check whether the character is part of the UCS Category
     449   *
     450   * Returns 1 if true, 0 if false and -1 on unknown category
     451   */
     452  int
     453  xmlUCSIsCat(int code, const char *cat) {
     454      xmlIntFunc *func;
     455  
     456      func = xmlUnicodeLookup(&xmlUnicodeCatTbl, cat);
     457      if (func == NULL)
     458  	return (-1);
     459      return (func(code));
     460  }
     461  
     462  #endif /* LIBXML_UNICODE_ENABLED */
     463  """)
     464  
     465  header.write("""
     466  #ifdef __cplusplus
     467  }
     468  #endif
     469  
     470  #endif /* LIBXML_UNICODE_ENABLED */
     471  
     472  #endif /* __XML_UNICODE_H__ */
     473  """);
     474  
     475  header.close()
     476  output.close()