(root)/
libxml2-2.12.3/
os400/
iconv/
bldcsndfa/
bldcsndfa.c
       1  /**
       2  ***     Build a deterministic finite automaton to associate CCSIDs with
       3  ***             character set names.
       4  ***
       5  ***     Compile on OS/400 with options SYSIFCOPT(*IFSIO).
       6  ***
       7  ***     See Copyright for the status of this software.
       8  ***
       9  ***     Author: Patrick Monnerat <pm@datasphere.ch>, DATASPHERE S.A.
      10  **/
      11  
      12  #include <stdio.h>
      13  #include <errno.h>
      14  #include <stdlib.h>
      15  #include <string.h>
      16  #include <fcntl.h>
      17  #include <ctype.h>
      18  
      19  #include <iconv.h>
      20  
      21  
      22  #ifdef OLDXML
      23  #include "xml.h"
      24  #else
      25  #include <libxml/hash.h>
      26  #include <libxml/parser.h>
      27  #include <libxml/xpath.h>
      28  #include <libxml/xpathInternals.h>
      29  #endif
      30  
      31  
      32  #ifdef __OS400__
      33  #define iconv_open_error(cd)            ((cd).return_value == -1)
      34  #define set_iconv_open_error(cd)        ((cd).return_value = -1)
      35  #else
      36  #define iconv_open_error(cd)            ((cd) == (iconv_t) -1)
      37  #define set_iconv_open_error(cd)        ((cd) = (iconv_t) -1)
      38  #endif
      39  
      40  
      41  #define C_SOURCE_CCSID          500
      42  #define C_UTF8_CCSID            1208
      43  
      44  
      45  #define UTF8_SPACE      0x20
      46  #define UTF8_HT         0x09
      47  #define UTF8_0          0x30
      48  #define UTF8_9          0x39
      49  #define UTF8_A          0x41
      50  #define UTF8_Z          0x5A
      51  #define UTF8_a          0x61
      52  #define UTF8_z          0x7A
      53  
      54  
      55  #define GRANULE         128             /* Memory allocation granule. */
      56  
      57  #define EPSILON         0x100           /* Token for empty transition. */
      58  
      59  
      60  #ifndef OFFSETOF
      61  #define OFFSETOF(t, f)  ((unsigned int) ((char *) &((t *) 0)->f - (char *) 0))
      62  #endif
      63  
      64  #ifndef OFFSETBY
      65  #define OFFSETBY(t, p, o)       ((t *) ((char *) (p) + (unsigned int) (o)))
      66  #endif
      67  
      68  
      69  typedef struct t_transition     t_transition;   /* NFA/DFA transition. */
      70  typedef struct t_state          t_state;        /* NFA/DFA state node. */
      71  typedef struct t_symlist        t_symlist;      /* Symbol (i.e.: name) list. */
      72  typedef struct t_chset          t_chset;        /* Character set. */
      73  typedef struct t_stategroup     t_stategroup;   /* Optimization group. */
      74  typedef unsigned char           utf8char;       /* UTF-8 character byte. */
      75  typedef unsigned char           byte;           /* Untyped data byte. */
      76  
      77  
      78  typedef struct {                        /* Set of pointers. */
      79          unsigned int    p_size;         /* Current allocated size. */
      80          unsigned int    p_card;         /* Current element count. */
      81          void *          p_set[1];       /* Element array. */
      82  }               t_powerset;
      83  
      84  
      85  struct t_transition {
      86          t_transition *  t_forwprev;     /* Head of forward transition list. */
      87          t_transition *  t_forwnext;     /* Tail of forward transition list. */
      88          t_transition *  t_backprev;     /* Head of backward transition list. */
      89          t_transition *  t_backnext;     /* Tail of backward transition list. */
      90          t_state *       t_from;         /* Incoming state. */
      91          t_state *       t_to;           /* Destination state. */
      92          unsigned short  t_token;        /* Transition token. */
      93          unsigned int    t_index;        /* Transition array index. */
      94  };
      95  
      96  
      97  struct t_state {
      98          t_state *       s_next;         /* Next state (for DFA construction). */
      99          t_state *       s_stack;        /* Unprocessed DFA states stack. */
     100          t_transition *  s_forward;      /* Forward transitions. */
     101          t_transition *  s_backward;     /* Backward transitions. */
     102          t_chset *       s_final;        /* Recognized character set. */
     103          t_powerset *    s_nfastates;    /* Corresponding NFA states. */
     104          unsigned int    s_index;        /* State index. */
     105  };
     106  
     107  
     108  struct t_symlist {
     109          t_symlist *     l_next;         /* Next name in list. */
     110          utf8char        l_symbol[1];    /* Name bytes. */
     111  };
     112  
     113  
     114  struct t_chset {
     115          t_chset *       c_next;         /* Next character set. */
     116          t_symlist *     c_names;        /* Character set name list. */
     117          iconv_t         c_fromUTF8;     /* Conversion from UTF-8. */
     118          unsigned int    c_ccsid;        /* IBM character set code. */
     119          unsigned int    c_mibenum;      /* IANA character code. */
     120  };
     121  
     122  
     123  struct t_stategroup {
     124          t_stategroup *  g_next;         /* Next group. */
     125          t_state *       g_member;       /* Group member (s_stack) list. */
     126          unsigned int    g_id;           /* Group ident. */
     127  };
     128  
     129  
     130  
     131  t_chset *       chset_list;             /* Character set list. */
     132  t_state *       initial_state;          /* Initial NFA state. */
     133  iconv_t         job2utf8;               /* Job CCSID to UTF-8 conversion. */
     134  iconv_t         utf82job;               /* UTF-8 to job CCSID conversion. */
     135  t_state *       dfa_states;             /* List of DFA states. */
     136  unsigned int    groupid;                /* Group ident counter. */
     137  
     138  
     139  /**
     140  ***     UTF-8 strings.
     141  **/
     142  
     143  #pragma convert(819)
     144  
     145  static const utf8char   utf8_MIBenum[] = "MIBenum";
     146  static const utf8char   utf8_mibenum[] = "mibenum";
     147  static const utf8char   utf8_ibm_[] = "ibm-";
     148  static const utf8char   utf8_IBMCCSID[] = "IBMCCSID";
     149  static const utf8char   utf8_iana_[] = "iana-";
     150  static const utf8char   utf8_Name[] = "Name";
     151  static const utf8char   utf8_Pref_MIME_Name[] = "Preferred MIME Name";
     152  static const utf8char   utf8_Aliases[] = "Aliases";
     153  static const utf8char   utf8_html[] = "html";
     154  static const utf8char   utf8_htmluri[] = "http://www.w3.org/1999/xhtml";
     155  static const utf8char   utf8_A[] = "A";
     156  static const utf8char   utf8_C[] = "C";
     157  static const utf8char   utf8_M[] = "M";
     158  static const utf8char   utf8_N[] = "N";
     159  static const utf8char   utf8_P[] = "P";
     160  static const utf8char   utf8_T[] = "T";
     161  static const utf8char   utf8_ccsid[] = "ccsid";
     162  static const utf8char   utf8_EBCDIC[] = "EBCDIC";
     163  static const utf8char   utf8_ASCII[] = "ASCII";
     164  static const utf8char   utf8_assocnodes[] = "/ccsid_mibenum/assoc[@ccsid]";
     165  static const utf8char   utf8_aliastext[] =
     166                                  "/ccsid_mibenum/assoc[@ccsid=$C]/alias/text()";
     167  #ifdef OLDXML
     168  static const utf8char   utf8_tablerows[] =
     169                          "//table[@id='table-character-sets-1']/*/tr";
     170  static const utf8char   utf8_headerpos[] =
     171                  "count(th[text()=$T]/preceding-sibling::th)+1";
     172  static const utf8char   utf8_getmibenum[] = "number(td[$M])";
     173  static const utf8char   utf8_getprefname[] = "string(td[$P])";
     174  static const utf8char   utf8_getname[] = "string(td[$N])";
     175  static const utf8char   utf8_getaliases[] = "td[$A]/text()";
     176  #else
     177  static const utf8char   utf8_tablerows[] =
     178                          "//html:table[@id='table-character-sets-1']/*/html:tr";
     179  static const utf8char   utf8_headerpos[] =
     180                  "count(html:th[text()=$T]/preceding-sibling::html:th)+1";
     181  static const utf8char   utf8_getmibenum[] = "number(html:td[$M])";
     182  static const utf8char   utf8_getprefname[] = "string(html:td[$P])";
     183  static const utf8char   utf8_getname[] = "string(html:td[$N])";
     184  static const utf8char   utf8_getaliases[] = "html:td[$A]/text()";
     185  #endif
     186  
     187  #pragma convert(0)
     188  
     189  
     190  /**
     191  ***     UTF-8 character length table.
     192  ***
     193  ***     Index is first character byte, value is the character byte count.
     194  **/
     195  
     196  static signed char      utf8_chlen[] = {
     197  /* 00-07 */     1,      1,      1,      1,      1,      1,      1,      1,
     198  /* 08-0F */     1,      1,      1,      1,      1,      1,      1,      1,
     199  /* 10-17 */     1,      1,      1,      1,      1,      1,      1,      1,
     200  /* 18-1F */     1,      1,      1,      1,      1,      1,      1,      1,
     201  /* 20-27 */     1,      1,      1,      1,      1,      1,      1,      1,
     202  /* 28-2F */     1,      1,      1,      1,      1,      1,      1,      1,
     203  /* 30-37 */     1,      1,      1,      1,      1,      1,      1,      1,
     204  /* 38-3F */     1,      1,      1,      1,      1,      1,      1,      1,
     205  /* 40-47 */     1,      1,      1,      1,      1,      1,      1,      1,
     206  /* 48-4F */     1,      1,      1,      1,      1,      1,      1,      1,
     207  /* 50-57 */     1,      1,      1,      1,      1,      1,      1,      1,
     208  /* 58-5F */     1,      1,      1,      1,      1,      1,      1,      1,
     209  /* 60-67 */     1,      1,      1,      1,      1,      1,      1,      1,
     210  /* 68-6F */     1,      1,      1,      1,      1,      1,      1,      1,
     211  /* 70-77 */     1,      1,      1,      1,      1,      1,      1,      1,
     212  /* 78-7F */     1,      1,      1,      1,      1,      1,      1,      1,
     213  /* 80-87 */     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
     214  /* 88-8F */     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
     215  /* 90-97 */     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
     216  /* 98-9F */     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
     217  /* A0-A7 */     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
     218  /* A8-AF */     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
     219  /* B0-B7 */     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
     220  /* B8-BF */     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
     221  /* C0-C7 */     2,      2,      2,      2,      2,      2,      2,      2,
     222  /* C8-CF */     2,      2,      2,      2,      2,      2,      2,      2,
     223  /* D0-D7 */     2,      2,      2,      2,      2,      2,      2,      2,
     224  /* D8-DF */     2,      2,      2,      2,      2,      2,      2,      2,
     225  /* E0-E7 */     3,      3,      3,      3,      3,      3,      3,      3,
     226  /* E8-EF */     3,      3,      3,      3,      3,      3,      3,      3,
     227  /* F0-F7 */     4,      4,      4,      4,      4,      4,      4,      4,
     228  /* F8-FF */     5,      5,      5,      5,      6,      6,      -1,     -1
     229  };
     230  
     231  
     232  
     233  void
     234  chknull(void * p)
     235  
     236  {
     237          if (p)
     238                  return;
     239  
     240          fprintf(stderr, "Not enough memory\n");
     241          exit(1);
     242  }
     243  
     244  
     245  void
     246  makecode(char * buf, unsigned int ccsid)
     247  
     248  {
     249          ccsid &= 0xFFFF;
     250          memset(buf, 0, 32);
     251          sprintf(buf, "IBMCCSID%05u0000000", ccsid);
     252  }
     253  
     254  
     255  iconv_t
     256  iconv_open_ccsid(unsigned int ccsidout,
     257                                  unsigned int ccsidin, unsigned int nullflag)
     258  
     259  {
     260          char fromcode[33];
     261          char tocode[33];
     262  
     263          makecode(fromcode, ccsidin);
     264          makecode(tocode, ccsidout);
     265          memset(tocode + 13, 0, sizeof tocode - 13);
     266  
     267          if (nullflag)
     268                  fromcode[18] = '1';
     269  
     270          return iconv_open(tocode, fromcode);
     271  }
     272  
     273  
     274  unsigned int
     275  getnum(char * * cpp)
     276  
     277  {
     278          unsigned int n;
     279          char * cp;
     280  
     281          cp = *cpp;
     282          n = 0;
     283  
     284          while (isdigit(*cp))
     285                  n = 10 * n + *cp++ - '0';
     286  
     287          *cpp = cp;
     288          return n;
     289  }
     290  
     291  
     292  const utf8char *
     293  hashBinaryKey(const byte * bytes, unsigned int len)
     294  
     295  {
     296          const byte * bp;
     297          utf8char * key;
     298          utf8char * cp;
     299          unsigned int n;
     300          unsigned int n4;
     301          unsigned int i;
     302  
     303          /**
     304          ***     Encode binary data in character form to be used as hash
     305          ***             table key.
     306          **/
     307  
     308          n = (4 * len + 2) / 3;
     309          key = (utf8char *) malloc(n + 1);
     310          chknull(key);
     311          bp = bytes;
     312          cp = key;
     313  
     314          for (n4 = n >> 2; n4; n4--) {
     315                  i = (bp[0] << 16) | (bp[1] << 8) | bp[2];
     316                  *cp++ = 0x21 + ((i >> 18) & 0x3F);
     317                  *cp++ = 0x21 + ((i >> 12) & 0x3F);
     318                  *cp++ = 0x21 + ((i >> 6) & 0x3F);
     319                  *cp++ = 0x21 + (i & 0x3F);
     320                  bp += 3;
     321                  }
     322  
     323          switch (n & 0x3) {
     324  
     325          case 2:
     326                  *cp++ = 0x21 + ((*bp >> 2) & 0x3F);
     327                  *cp++ = 0x21 + ((*bp << 4) & 0x3F);
     328                  break;
     329  
     330          case 3:
     331                  i = (bp[0] << 8) | bp[1];
     332                  *cp++ = 0x21 + ((i >> 10) & 0x3F);
     333                  *cp++ = 0x21 + ((i >> 4) & 0x3F);
     334                  *cp++ = 0x21 + ((i << 2) & 0x3F);
     335                  break;
     336                  }
     337  
     338          *cp = '\0';
     339          return key;
     340  }
     341  
     342  
     343  void *
     344  hash_get(xmlHashTablePtr h, const void * binkey, unsigned int len)
     345  
     346  {
     347          const utf8char * key;
     348          void * result;
     349  
     350          key = hashBinaryKey((const byte *) binkey, len);
     351          result = xmlHashLookup(h, key);
     352          free((char *) key);
     353          return result;
     354  }
     355  
     356  
     357  int
     358  hash_add(xmlHashTablePtr h, const void * binkey, unsigned int len, void * data)
     359  
     360  {
     361          const utf8char * key;
     362          int result;
     363  
     364          key = hashBinaryKey((const byte *) binkey, len);
     365          result = xmlHashAddEntry(h, key, data);
     366          free((char *) key);
     367          return result;
     368  }
     369  
     370  
     371  xmlDocPtr
     372  loadXMLFile(const char * filename)
     373  
     374  {
     375          struct stat sbuf;
     376          byte * databuf;
     377          int fd;
     378          int i;
     379          xmlDocPtr doc;
     380  
     381          if (stat(filename, &sbuf))
     382                  return (xmlDocPtr) NULL;
     383  
     384          databuf = malloc(sbuf.st_size + 4);
     385  
     386          if (!databuf)
     387                  return (xmlDocPtr) NULL;
     388  
     389          fd = open(filename, O_RDONLY
     390  #ifdef O_BINARY
     391                                           | O_BINARY
     392  #endif
     393                                                          );
     394  
     395          if (fd < 0) {
     396                  free((char *) databuf);
     397                  return (xmlDocPtr) NULL;
     398                  }
     399  
     400          i = read(fd, (char *) databuf, sbuf.st_size);
     401          close(fd);
     402  
     403          if (i != sbuf.st_size) {
     404                  free((char *) databuf);
     405                  return (xmlDocPtr) NULL;
     406                  }
     407  
     408          databuf[i] = databuf[i + 1] = databuf[i + 2] = databuf[i + 3] = 0;
     409          doc = xmlParseMemory((xmlChar *) databuf, i);
     410          free((char *) databuf);
     411          return doc;
     412  }
     413  
     414  
     415  int
     416  match(char * * cpp, char * s)
     417  
     418  {
     419          char * cp;
     420          int c1;
     421          int c2;
     422  
     423          cp = *cpp;
     424  
     425          for (cp = *cpp; c2 = *s++; cp++) {
     426                  c1 = *cp;
     427  
     428                  if (c1 != c2) {
     429                          if (isupper(c1))
     430                                  c1 = tolower(c1);
     431  
     432                          if (isupper(c2))
     433                                  c2 = tolower(c2);
     434                          }
     435  
     436                  if (c1 != c2)
     437                          return 0;
     438                  }
     439  
     440          c1 = *cp;
     441  
     442          while (c1 == ' ' || c1 == '\t')
     443                  c1 = *++cp;
     444  
     445          *cpp = cp;
     446          return 1;
     447  }
     448  
     449  
     450  t_state *
     451  newstate(void)
     452  
     453  {
     454          t_state * s;
     455  
     456          s = (t_state *) malloc(sizeof *s);
     457          chknull(s);
     458          memset((char *) s, 0, sizeof *s);
     459          return s;
     460  }
     461  
     462  
     463  void
     464  unlink_transition(t_transition * t)
     465  
     466  {
     467          if (t->t_backnext)
     468                  t->t_backnext->t_backprev = t->t_backprev;
     469  
     470          if (t->t_backprev)
     471                  t->t_backprev->t_backnext = t->t_backnext;
     472          else if (t->t_to)
     473                  t->t_to->s_backward = t->t_backnext;
     474  
     475          if (t->t_forwnext)
     476                  t->t_forwnext->t_forwprev = t->t_forwprev;
     477  
     478          if (t->t_forwprev)
     479                  t->t_forwprev->t_forwnext = t->t_forwnext;
     480          else if (t->t_from)
     481                  t->t_from->s_forward = t->t_forwnext;
     482  
     483          t->t_backprev = (t_transition *) NULL;
     484          t->t_backnext = (t_transition *) NULL;
     485          t->t_forwprev = (t_transition *) NULL;
     486          t->t_forwnext = (t_transition *) NULL;
     487          t->t_from = (t_state *) NULL;
     488          t->t_to = (t_state *) NULL;
     489  }
     490  
     491  
     492  void
     493  link_transition(t_transition * t, t_state * from, t_state * to)
     494  
     495  {
     496          if (!from)
     497                  from = t->t_from;
     498  
     499          if (!to)
     500                  to = t->t_to;
     501  
     502          unlink_transition(t);
     503  
     504          if ((t->t_from = from)) {
     505                  if ((t->t_forwnext = from->s_forward))
     506                          t->t_forwnext->t_forwprev = t;
     507  
     508                  from->s_forward = t;
     509                  }
     510  
     511          if ((t->t_to = to)) {
     512                  if ((t->t_backnext = to->s_backward))
     513                          t->t_backnext->t_backprev = t;
     514  
     515                  to->s_backward = t;
     516                  }
     517  }
     518  
     519  
     520  t_transition *
     521  newtransition(unsigned int token, t_state * from, t_state * to)
     522  
     523  {
     524          t_transition * t;
     525  
     526          t = (t_transition *) malloc(sizeof *t);
     527          chknull(t);
     528          memset((char *) t, 0, sizeof *t);
     529          t->t_token = token;
     530          link_transition(t, from, to);
     531          return t;
     532  }
     533  
     534  
     535  t_transition *
     536  uniquetransition(unsigned int token, t_state * from, t_state * to)
     537  
     538  {
     539          t_transition * t;
     540  
     541          for (t = from->s_forward; t; t = t->t_forwnext)
     542                  if (t->t_token == token && (t->t_to == to || !to))
     543                          return t;
     544  
     545          return to? newtransition(token, from, to): (t_transition *) NULL;
     546  }
     547  
     548  
     549  int
     550  set_position(t_powerset * s, void * e)
     551  
     552  {
     553          unsigned int l;
     554          unsigned int h;
     555          unsigned int m;
     556          int i;
     557  
     558          l = 0;
     559          h = s->p_card;
     560  
     561          while (l < h) {
     562                  m = (l + h) >> 1;
     563  
     564                  /**
     565                  ***     If both pointers belong to different allocation arenas,
     566                  ***             native comparison may find them neither
     567                  ***             equal, nor greater, nor smaller.
     568                  ***     We thus compare using memcmp() to get an orthogonal
     569                  ***             result.
     570                  **/
     571  
     572                  i = memcmp(&e, s->p_set + m, sizeof e);
     573  
     574                  if (i < 0)
     575                          h = m;
     576                  else if (!i)
     577                          return m;
     578                  else
     579                          l = m + 1;
     580                  }
     581  
     582          return l;
     583  }
     584  
     585  
     586  t_powerset *
     587  set_include(t_powerset * s, void * e)
     588  
     589  {
     590          unsigned int pos;
     591          unsigned int n;
     592  
     593          if (!s) {
     594                  s = (t_powerset *) malloc(sizeof *s +
     595                      GRANULE * sizeof s->p_set);
     596                  chknull(s);
     597                  s->p_size = GRANULE;
     598                  s->p_set[GRANULE] = (t_state *) NULL;
     599                  s->p_set[0] = e;
     600                  s->p_card = 1;
     601                  return s;
     602                  }
     603  
     604          pos = set_position(s, e);
     605  
     606          if (pos < s->p_card && s->p_set[pos] == e)
     607                  return s;
     608  
     609          if (s->p_card >= s->p_size) {
     610                  s->p_size += GRANULE;
     611                  s = (t_powerset *) realloc(s,
     612                      sizeof *s + s->p_size * sizeof s->p_set);
     613                  chknull(s);
     614                  s->p_set[s->p_size] = (t_state *) NULL;
     615                  }
     616  
     617          n = s->p_card - pos;
     618  
     619          if (n)
     620                  memmove((char *) (s->p_set + pos + 1),
     621                      (char *) (s->p_set + pos), n * sizeof s->p_set[0]);
     622  
     623          s->p_set[pos] = e;
     624          s->p_card++;
     625          return s;
     626  }
     627  
     628  
     629  t_state *
     630  nfatransition(t_state * to, byte token)
     631  
     632  {
     633          t_state * from;
     634  
     635          from = newstate();
     636          newtransition(token, from, to);
     637          return from;
     638  }
     639  
     640  
     641  static t_state *        nfadevelop(t_state * from, t_state * final, iconv_t icc,
     642                                  const utf8char * name, unsigned int len);
     643  
     644  
     645  void
     646  nfaslice(t_state * * from, t_state * * to, iconv_t icc,
     647                  const utf8char * chr, unsigned int chlen,
     648                  const utf8char * name, unsigned int len, t_state * final)
     649  
     650  {
     651          char * srcp;
     652          char * dstp;
     653          size_t srcc;
     654          size_t dstc;
     655          unsigned int cnt;
     656          t_state * f;
     657          t_state * t;
     658          t_transition * tp;
     659          byte bytebuf[8];
     660  
     661          srcp = (char *) chr;
     662          srcc = chlen;
     663          dstp = (char *) bytebuf;
     664          dstc = sizeof bytebuf;
     665          iconv(icc, &srcp, &srcc, &dstp, &dstc);
     666          dstp = (char *) bytebuf;
     667          cnt = sizeof bytebuf - dstc;
     668          t = *to;
     669          f = *from;
     670  
     671          /**
     672          ***     Check for end of string.
     673          **/
     674  
     675          if (!len)
     676                  if (t && t != final)
     677                          uniquetransition(EPSILON, t, final);
     678                  else
     679                          t = final;
     680  
     681          if (f)
     682                  while (cnt) {
     683                          tp = uniquetransition(*dstp, f, (t_state *) NULL);
     684  
     685                          if (!tp)
     686                                  break;
     687  
     688                          f = tp->t_to;
     689                          dstp++;
     690                          cnt--;
     691                          }
     692  
     693          if (!cnt) {
     694                  if (!t)
     695                          t = nfadevelop(f, final, icc, name, len);
     696  
     697                  *to = t;
     698                  return;
     699                  }
     700  
     701          if (!t) {
     702                  t = nfadevelop((t_state *) NULL, final, icc, name, len);
     703                  *to = t;
     704                  }
     705  
     706          if (!f)
     707                  *from = f = newstate();
     708  
     709          while (cnt > 1)
     710                  t = nfatransition(t, dstp[--cnt]);
     711  
     712          newtransition(*dstp, f, t);
     713  }
     714  
     715  
     716  t_state *
     717  nfadevelop(t_state * from, t_state * final, iconv_t icc,
     718                                          const utf8char * name, unsigned int len)
     719  
     720  {
     721          int chlen;
     722          int i;
     723          t_state * to;
     724          int uccnt;
     725          int lccnt;
     726          utf8char chr;
     727  
     728          chlen = utf8_chlen[*name];
     729  
     730          for (i = 1; i < chlen; i++)
     731                  if ((name[i] & 0xC0) != 0x80)
     732                          break;
     733  
     734          if (i != chlen) {
     735                  fprintf(stderr,
     736                      "Invalid UTF8 character in character set name\n");
     737                  return (t_state *) NULL;
     738                  }
     739  
     740          to = (t_state *) NULL;
     741          nfaslice(&from, &to,
     742              icc, name, chlen, name + chlen, len - chlen, final);
     743  
     744          if (*name >= UTF8_a && *name <= UTF8_z)
     745                  chr = *name - UTF8_a + UTF8_A;
     746          else if (*name >= UTF8_A && *name <= UTF8_Z)
     747                  chr = *name - UTF8_A + UTF8_a;
     748          else
     749                  return from;
     750  
     751          nfaslice(&from, &to, icc, &chr, 1, name + chlen, len - chlen, final);
     752          return from;
     753  }
     754  
     755  
     756  
     757  void
     758  nfaenter(const utf8char * name, int len, t_chset * charset)
     759  
     760  {
     761          t_chset * s;
     762          t_state * final;
     763          t_state * sp;
     764          t_symlist * lp;
     765  
     766          /**
     767          ***     Enter case-insensitive `name' in NFA in all known
     768          ***             character codes.
     769          ***     Redundant shift state changes as well as shift state
     770          ***             differences between uppercase and lowercase are
     771          ***             not handled.
     772          **/
     773  
     774          if (len < 0)
     775                  len = strlen(name) + 1;
     776  
     777          for (lp = charset->c_names; lp; lp = lp->l_next)
     778                  if (!memcmp(name, lp->l_symbol, len))
     779                          return;         /* Already entered. */
     780  
     781          lp = (t_symlist *) malloc(sizeof *lp + len);
     782          chknull(lp);
     783          memcpy(lp->l_symbol, name, len);
     784          lp->l_symbol[len] = '\0';
     785          lp->l_next = charset->c_names;
     786          charset->c_names = lp;
     787          final = newstate();
     788          final->s_final = charset;
     789  
     790          for (s = chset_list; s; s = s->c_next)
     791                  if (!iconv_open_error(s->c_fromUTF8))
     792                          sp = nfadevelop(initial_state, final,
     793                              s->c_fromUTF8, name, len);
     794  }
     795  
     796  
     797  unsigned int
     798  utf8_utostr(utf8char * s, unsigned int v)
     799  
     800  {
     801          unsigned int d;
     802          unsigned int i;
     803  
     804          d = v / 10;
     805          v -= d * 10;
     806          i = d? utf8_utostr(s, d): 0;
     807          s[i++] = v + UTF8_0;
     808          s[i] = '\0';
     809          return i;
     810  }
     811  
     812  
     813  unsigned int
     814  utf8_utostrpad(utf8char * s, unsigned int v, int digits)
     815  
     816  {
     817          unsigned int i = utf8_utostr(s, v);
     818          utf8char pad = UTF8_SPACE;
     819  
     820          if (digits < 0) {
     821                  pad = UTF8_0;
     822                  digits = -digits;
     823                  }
     824  
     825          if (i >= digits)
     826                  return i;
     827  
     828          memmove(s + digits - i, s, i + 1);
     829          memset(s, pad, digits - i);
     830          return digits;
     831  }
     832  
     833  
     834  unsigned int
     835  utf8_strtou(const utf8char * s)
     836  
     837  {
     838          unsigned int v;
     839  
     840          while (*s == UTF8_SPACE || *s == UTF8_HT)
     841                  s++;
     842  
     843          for (v = 0; *s >= UTF8_0 && *s <= UTF8_9;)
     844                  v = 10 * v + *s++ - UTF8_0;
     845  
     846          return v;
     847  }
     848  
     849  
     850  unsigned int
     851  getNumAttr(xmlNodePtr node, const xmlChar * name)
     852  
     853  {
     854          const xmlChar * s;
     855          unsigned int val;
     856  
     857          s = xmlGetProp(node, name);
     858  
     859          if (!s)
     860                  return 0;
     861  
     862          val = utf8_strtou(s);
     863          xmlFree((xmlChar *) s);
     864          return val;
     865  }
     866  
     867  
     868  void
     869  read_assocs(const char * filename)
     870  
     871  {
     872          xmlDocPtr doc;
     873          xmlXPathContextPtr ctxt;
     874          xmlXPathObjectPtr obj;
     875          xmlNodePtr node;
     876          t_chset * sp;
     877          int i;
     878          unsigned int ccsid;
     879          unsigned int mibenum;
     880          utf8char symbuf[32];
     881  
     882          doc = loadXMLFile(filename);
     883  
     884          if (!doc) {
     885                  fprintf(stderr, "Cannot load file %s\n", filename);
     886                  exit(1);
     887                  }
     888  
     889          ctxt = xmlXPathNewContext(doc);
     890          obj = xmlXPathEval(utf8_assocnodes, ctxt);
     891  
     892          if (!obj || obj->type != XPATH_NODESET || !obj->nodesetval ||
     893              !obj->nodesetval->nodeTab || !obj->nodesetval->nodeNr) {
     894                  fprintf(stderr, "No association found in %s\n", filename);
     895                  exit(1);
     896                  }
     897  
     898          for (i = 0; i < obj->nodesetval->nodeNr; i++) {
     899                  node = obj->nodesetval->nodeTab[i];
     900                  ccsid = getNumAttr(node, utf8_ccsid);
     901                  mibenum = getNumAttr(node, utf8_mibenum);
     902  
     903                  /**
     904                  ***     Check for duplicate.
     905                  **/
     906  
     907                  for (sp = chset_list; sp; sp = sp->c_next)
     908                          if (ccsid && ccsid == sp->c_ccsid ||
     909                              mibenum && mibenum == sp->c_mibenum) {
     910                                  fprintf(stderr, "Duplicate character set: ");
     911                                  fprintf(stderr, "CCSID = %u/%u, ",
     912                                      ccsid, sp->c_ccsid);
     913                                  fprintf(stderr, "MIBenum = %u/%u\n",
     914                                      mibenum, sp->c_mibenum);
     915                                  break;
     916                                  }
     917  
     918                  if (sp)
     919                          continue;
     920  
     921                  /**
     922                  ***     Allocate the new character set.
     923                  **/
     924  
     925                  sp = (t_chset *) malloc(sizeof *sp);
     926                  chknull(sp);
     927                  memset(sp, 0, sizeof *sp);
     928  
     929                  if (!ccsid)     /* Do not attempt with current job CCSID. */
     930                          set_iconv_open_error(sp->c_fromUTF8);
     931                  else {
     932                          sp->c_fromUTF8 =
     933                              iconv_open_ccsid(ccsid, C_UTF8_CCSID, 0);
     934  
     935                          if (iconv_open_error(sp->c_fromUTF8) == -1)
     936                                  fprintf(stderr,
     937                                      "Cannot convert into CCSID %u: ignored\n",
     938                                      ccsid);
     939                          }
     940  
     941                  sp->c_ccsid = ccsid;
     942                  sp->c_mibenum = mibenum;
     943                  sp->c_next = chset_list;
     944                  chset_list = sp;
     945                  }
     946  
     947          xmlXPathFreeObject(obj);
     948  
     949          /**
     950          ***     Enter aliases.
     951          **/
     952  
     953          for (sp = chset_list; sp; sp = sp->c_next) {
     954                  strcpy(symbuf, utf8_ibm_);
     955                  utf8_utostr(symbuf + 4, sp->c_ccsid);
     956                  nfaenter(symbuf, -1, sp);
     957                  strcpy(symbuf, utf8_IBMCCSID);
     958                  utf8_utostrpad(symbuf + 8, sp->c_ccsid, -5);
     959                  nfaenter(symbuf, 13, sp);       /* Not null-terminated. */
     960  
     961                  if (sp->c_mibenum) {
     962                          strcpy(symbuf, utf8_iana_);
     963                          utf8_utostr(symbuf + 5, sp->c_mibenum);
     964                          nfaenter(symbuf, -1, sp);
     965                          }
     966  
     967                  xmlXPathRegisterVariable(ctxt, utf8_C,
     968                      xmlXPathNewFloat((double) sp->c_ccsid));
     969                  obj = xmlXPathEval(utf8_aliastext, ctxt);
     970  
     971                  if (!obj || obj->type != XPATH_NODESET) {
     972                          fprintf(stderr, "getAlias failed in %s\n", filename);
     973                          exit(1);
     974                          }
     975  
     976                  if (obj->nodesetval &&
     977                      obj->nodesetval->nodeTab && obj->nodesetval->nodeNr) {
     978                          for (i = 0; i < obj->nodesetval->nodeNr; i++) {
     979                                  node = obj->nodesetval->nodeTab[i];
     980                                  nfaenter(node->content, -1, sp);
     981                                  }
     982                          }
     983  
     984                  xmlXPathFreeObject(obj);
     985                  }
     986  
     987          xmlXPathFreeContext(ctxt);
     988          xmlFreeDoc(doc);
     989  }
     990  
     991  
     992  unsigned int
     993  columnPosition(xmlXPathContextPtr ctxt, const xmlChar * header)
     994  
     995  {
     996          xmlXPathObjectPtr obj;
     997          unsigned int res = 0;
     998  
     999          xmlXPathRegisterVariable(ctxt, utf8_T, xmlXPathNewString(header));
    1000          obj = xmlXPathEval(utf8_headerpos, ctxt);
    1001  
    1002          if (obj) {
    1003                  if (obj->type == XPATH_NUMBER)
    1004                          res = (unsigned int) obj->floatval;
    1005  
    1006                  xmlXPathFreeObject(obj);
    1007                  }
    1008  
    1009          return res;
    1010  }
    1011  
    1012  
    1013  void
    1014  read_iana(const char * filename)
    1015  
    1016  {
    1017          xmlDocPtr doc;
    1018          xmlXPathContextPtr ctxt;
    1019          xmlXPathObjectPtr obj1;
    1020          xmlXPathObjectPtr obj2;
    1021          xmlNodePtr node;
    1022          int prefnamecol;
    1023          int namecol;
    1024          int mibenumcol;
    1025          int aliascol;
    1026          int mibenum;
    1027          t_chset * sp;
    1028          int n;
    1029          int i;
    1030  
    1031          doc = loadXMLFile(filename);
    1032  
    1033          if (!doc) {
    1034                  fprintf(stderr, "Cannot load file %s\n", filename);
    1035                  exit(1);
    1036                  }
    1037  
    1038          ctxt = xmlXPathNewContext(doc);
    1039  
    1040  #ifndef OLDXML
    1041          xmlXPathRegisterNs(ctxt, utf8_html, utf8_htmluri);
    1042  #endif
    1043  
    1044          obj1 = xmlXPathEval(utf8_tablerows, ctxt);
    1045  
    1046          if (!obj1 || obj1->type != XPATH_NODESET || !obj1->nodesetval ||
    1047              !obj1->nodesetval->nodeTab || obj1->nodesetval->nodeNr <= 1) {
    1048                  fprintf(stderr, "No data in %s\n", filename);
    1049                  exit(1);
    1050                  }
    1051  
    1052          /**
    1053          ***     Identify columns.
    1054          **/
    1055  
    1056          xmlXPathSetContextNode(obj1->nodesetval->nodeTab[0], ctxt);
    1057          prefnamecol = columnPosition(ctxt, utf8_Pref_MIME_Name);
    1058          namecol = columnPosition(ctxt, utf8_Name);
    1059          mibenumcol = columnPosition(ctxt, utf8_MIBenum);
    1060          aliascol = columnPosition(ctxt, utf8_Aliases);
    1061  
    1062          if (!prefnamecol || !namecol || !mibenumcol || !aliascol) {
    1063                  fprintf(stderr, "Key column(s) missing in %s\n", filename);
    1064                  exit(1);
    1065                  }
    1066  
    1067          xmlXPathRegisterVariable(ctxt, utf8_P,
    1068              xmlXPathNewFloat((double) prefnamecol));
    1069          xmlXPathRegisterVariable(ctxt, utf8_N,
    1070              xmlXPathNewFloat((double) namecol));
    1071          xmlXPathRegisterVariable(ctxt, utf8_M,
    1072              xmlXPathNewFloat((double) mibenumcol));
    1073          xmlXPathRegisterVariable(ctxt, utf8_A,
    1074              xmlXPathNewFloat((double) aliascol));
    1075  
    1076          /**
    1077          ***     Process each row.
    1078          **/
    1079  
    1080          for (n = 1; n < obj1->nodesetval->nodeNr; n++) {
    1081                  xmlXPathSetContextNode(obj1->nodesetval->nodeTab[n], ctxt);
    1082  
    1083                  /**
    1084                  ***     Get the MIBenum from current row.
    1085                  */
    1086  
    1087                  obj2 = xmlXPathEval(utf8_getmibenum, ctxt);
    1088  
    1089                  if (!obj2 || obj2->type != XPATH_NUMBER) {
    1090                          fprintf(stderr, "get MIBenum failed at row %u\n", n);
    1091                          exit(1);
    1092                          }
    1093  
    1094                  if (xmlXPathIsNaN(obj2->floatval) ||
    1095                      obj2->floatval < 1.0 || obj2->floatval > 65535.0 ||
    1096                      ((unsigned int) obj2->floatval) != obj2->floatval) {
    1097                          fprintf(stderr, "invalid MIBenum at row %u\n", n);
    1098                          xmlXPathFreeObject(obj2);
    1099                          continue;
    1100                          }
    1101  
    1102                  mibenum = obj2->floatval;
    1103                  xmlXPathFreeObject(obj2);
    1104  
    1105                  /**
    1106                  ***     Search the associations for a corresponding CCSID.
    1107                  **/
    1108  
    1109                  for (sp = chset_list; sp; sp = sp->c_next)
    1110                          if (sp->c_mibenum == mibenum)
    1111                                  break;
    1112  
    1113                  if (!sp)
    1114                          continue;       /* No CCSID for this MIBenum. */
    1115  
    1116                  /**
    1117                  ***     Process preferred MIME name.
    1118                  **/
    1119  
    1120                  obj2 = xmlXPathEval(utf8_getprefname, ctxt);
    1121  
    1122                  if (!obj2 || obj2->type != XPATH_STRING) {
    1123                          fprintf(stderr,
    1124                              "get Preferred_MIME_Name failed at row %u\n", n);
    1125                          exit(1);
    1126                          }
    1127  
    1128                  if (obj2->stringval && obj2->stringval[0])
    1129                          nfaenter(obj2->stringval, -1, sp);
    1130  
    1131                  xmlXPathFreeObject(obj2);
    1132  
    1133                  /**
    1134                  ***     Process name.
    1135                  **/
    1136  
    1137                  obj2 = xmlXPathEval(utf8_getname, ctxt);
    1138  
    1139                  if (!obj2 || obj2->type != XPATH_STRING) {
    1140                          fprintf(stderr, "get name failed at row %u\n", n);
    1141                          exit(1);
    1142                          }
    1143  
    1144                  if (obj2->stringval && obj2->stringval[0])
    1145                          nfaenter(obj2->stringval, -1, sp);
    1146  
    1147                  xmlXPathFreeObject(obj2);
    1148  
    1149                  /**
    1150                  ***     Process aliases.
    1151                  **/
    1152  
    1153                  obj2 = xmlXPathEval(utf8_getaliases, ctxt);
    1154  
    1155                  if (!obj2 || obj2->type != XPATH_NODESET) {
    1156                          fprintf(stderr, "get aliases failed at row %u\n", n);
    1157                          exit(1);
    1158                          }
    1159  
    1160                  if (obj2->nodesetval && obj2->nodesetval->nodeTab)
    1161                          for (i = 0; i < obj2->nodesetval->nodeNr; i++) {
    1162                                  node = obj2->nodesetval->nodeTab[i];
    1163  
    1164                                  if (node && node->content && node->content[0])
    1165                                          nfaenter(node->content, -1, sp);
    1166                                  }
    1167  
    1168                  xmlXPathFreeObject(obj2);
    1169                  }
    1170  
    1171          xmlXPathFreeObject(obj1);
    1172          xmlXPathFreeContext(ctxt);
    1173          xmlFreeDoc(doc);
    1174  }
    1175  
    1176  
    1177  t_powerset *    closureset(t_powerset * dst, t_powerset * src);
    1178  
    1179  
    1180  t_powerset *
    1181  closure(t_powerset * dst, t_state * src)
    1182  
    1183  {
    1184          t_transition * t;
    1185          unsigned int oldcard;
    1186  
    1187          if (src->s_nfastates) {
    1188                  /**
    1189                  ***     Is a DFA state: return closure of set of equivalent
    1190                  ***             NFA states.
    1191                  **/
    1192  
    1193                  return closureset(dst, src->s_nfastates);
    1194                  }
    1195  
    1196          /**
    1197          ***     Compute closure of NFA state.
    1198          **/
    1199  
    1200          dst = set_include(dst, src);
    1201  
    1202          for (t = src->s_forward; t; t = t->t_forwnext)
    1203                  if (t->t_token == EPSILON) {
    1204                          oldcard = dst->p_card;
    1205                          dst = set_include(dst, t->t_to);
    1206  
    1207                          if (oldcard != dst->p_card)
    1208                                  dst = closure(dst, t->t_to);
    1209                          }
    1210  
    1211          return dst;
    1212  }
    1213  
    1214  
    1215  t_powerset *
    1216  closureset(t_powerset * dst, t_powerset * src)
    1217  
    1218  {
    1219          unsigned int i;
    1220  
    1221          for (i = 0; i < src->p_card; i++)
    1222                  dst = closure(dst, (t_state *) src->p_set[i]);
    1223  
    1224          return dst;
    1225  }
    1226  
    1227  
    1228  t_state *
    1229  get_dfa_state(t_state * * stack,
    1230                          t_powerset * nfastates, xmlHashTablePtr sethash)
    1231  
    1232  {
    1233          t_state * s;
    1234  
    1235          if (s = hash_get(sethash, nfastates->p_set,
    1236              nfastates->p_card * sizeof nfastates->p_set[0])) {
    1237                  /**
    1238                  ***     DFA state already present.
    1239                  ***     Release the NFA state set and return
    1240                  ***             the address of the old DFA state.
    1241                  **/
    1242  
    1243                  free((char *) nfastates);
    1244                  return s;
    1245                  }
    1246  
    1247          /**
    1248          ***     Build the new state.
    1249          **/
    1250  
    1251          s = newstate();
    1252          s->s_nfastates = nfastates;
    1253          s->s_next = dfa_states;
    1254          dfa_states = s;
    1255          s->s_stack = *stack;
    1256          *stack = s;
    1257  
    1258          /**
    1259          ***     Enter it in hash.
    1260          **/
    1261  
    1262          if (hash_add(sethash, nfastates->p_set,
    1263              nfastates->p_card * sizeof nfastates->p_set[0], s))
    1264                  chknull(NULL);          /* Memory allocation error. */
    1265  
    1266          return s;
    1267  }
    1268  
    1269  
    1270  int
    1271  transcmp(const void * p1, const void * p2)
    1272  
    1273  {
    1274          t_transition * t1;
    1275          t_transition * t2;
    1276  
    1277          t1 = *(t_transition * *) p1;
    1278          t2 = *(t_transition * *) p2;
    1279          return ((int) t1->t_token) - ((int) t2->t_token);
    1280  }
    1281  
    1282  
    1283  void
    1284  builddfa(void)
    1285  
    1286  {
    1287          t_powerset * transset;
    1288          t_powerset * stateset;
    1289          t_state * s;
    1290          t_state * s2;
    1291          unsigned int n;
    1292          unsigned int i;
    1293          unsigned int token;
    1294          t_transition * t;
    1295          t_state * stack;
    1296          xmlHashTablePtr sethash;
    1297          unsigned int nst;
    1298  
    1299          transset = set_include(NULL, NULL);
    1300          chknull(transset);
    1301          stateset = set_include(NULL, NULL);
    1302          chknull(stateset);
    1303          sethash = xmlHashCreate(1);
    1304          chknull(sethash);
    1305          dfa_states = (t_state *) NULL;
    1306          stack = (t_state *) NULL;
    1307          nst = 0;
    1308  
    1309          /**
    1310          ***     Build the DFA initial state.
    1311          **/
    1312  
    1313          get_dfa_state(&stack, closure(NULL, initial_state), sethash);
    1314  
    1315          /**
    1316          ***     Build the other DFA states by looking at each
    1317          ***             possible transition from stacked DFA states.
    1318          **/
    1319  
    1320          do {
    1321                  if (!(++nst % 100))
    1322                          fprintf(stderr, "%u DFA states\n", nst);
    1323  
    1324                  s = stack;
    1325                  stack = s->s_stack;
    1326                  s->s_stack = (t_state *) NULL;
    1327  
    1328                  /**
    1329                  ***     Build a set of all non-epsilon transitions from this
    1330                  ***             state.
    1331                  **/
    1332  
    1333                  transset->p_card = 0;
    1334  
    1335                  for (n = 0; n < s->s_nfastates->p_card; n++) {
    1336                          s2 = s->s_nfastates->p_set[n];
    1337  
    1338                          for (t = s2->s_forward; t; t = t->t_forwnext)
    1339                                  if (t->t_token != EPSILON) {
    1340                                          transset = set_include(transset, t);
    1341                                          chknull(transset);
    1342                                          }
    1343                          }
    1344  
    1345                  /**
    1346                  ***     Sort transitions by token.
    1347                  **/
    1348  
    1349                  qsort(transset->p_set, transset->p_card,
    1350                      sizeof transset->p_set[0], transcmp);
    1351  
    1352                  /**
    1353                  ***     Process all transitions, grouping them by token.
    1354                  **/
    1355  
    1356                  stateset->p_card = 0;
    1357                  token = EPSILON;
    1358  
    1359                  for (i = 0; i < transset->p_card; i++) {
    1360                          t = transset->p_set[i];
    1361  
    1362                          if (token != t->t_token) {
    1363                                  if (stateset->p_card) {
    1364                                          /**
    1365                                          ***     Get the equivalent DFA state
    1366                                          ***             and create transition.
    1367                                          **/
    1368  
    1369                                          newtransition(token, s,
    1370                                              get_dfa_state(&stack,
    1371                                              closureset(NULL, stateset),
    1372                                              sethash));
    1373                                          stateset->p_card = 0;
    1374                                          }
    1375  
    1376                                  token = t->t_token;
    1377                                  }
    1378  
    1379                          stateset = set_include(stateset, t->t_to);
    1380                          }
    1381  
    1382                  if (stateset->p_card)
    1383                          newtransition(token, s, get_dfa_state(&stack,
    1384                              closureset(NULL, stateset), sethash));
    1385          } while (stack);
    1386  
    1387          free((char *) transset);
    1388          free((char *) stateset);
    1389          xmlHashFree(sethash, NULL);
    1390  
    1391          /**
    1392          ***     Reverse the state list to get the initial state first,
    1393          ***             check for ambiguous prefixes, determine final states,
    1394          ***             destroy NFA state sets.
    1395          **/
    1396  
    1397          while (s = dfa_states) {
    1398                  dfa_states = s->s_next;
    1399                  s->s_next = stack;
    1400                  stack = s;
    1401                  stateset = s->s_nfastates;
    1402                  s->s_nfastates = (t_powerset *) NULL;
    1403  
    1404                  for (n = 0; n < stateset->p_card; n++) {
    1405                          s2 = (t_state *) stateset->p_set[n];
    1406  
    1407                          if (s2->s_final) {
    1408                                  if (s->s_final && s->s_final != s2->s_final)
    1409                                          fprintf(stderr,
    1410                                              "Ambiguous name for CCSIDs %u/%u\n",
    1411                                              s->s_final->c_ccsid,
    1412                                              s2->s_final->c_ccsid);
    1413  
    1414                                  s->s_final = s2->s_final;
    1415                                  }
    1416                          }
    1417  
    1418                  free((char *) stateset);
    1419                  }
    1420  
    1421          dfa_states = stack;
    1422  }
    1423  
    1424  
    1425  void
    1426  deletenfa(void)
    1427  
    1428  {
    1429          t_transition * t;
    1430          t_state * s;
    1431          t_state * u;
    1432          t_state * stack;
    1433  
    1434          stack = initial_state;
    1435          stack->s_stack = (t_state *) NULL;
    1436  
    1437          while ((s = stack)) {
    1438                  stack = s->s_stack;
    1439  
    1440                  while ((t = s->s_forward)) {
    1441                          u = t->t_to;
    1442                          unlink_transition(t);
    1443                          free((char *) t);
    1444  
    1445                          if (!u->s_backward) {
    1446                                  u->s_stack = stack;
    1447                                  stack = u;
    1448                                  }
    1449                          }
    1450  
    1451                  free((char *) s);
    1452                  }
    1453  }
    1454  
    1455  
    1456  t_stategroup *
    1457  newgroup(void)
    1458  
    1459  {
    1460          t_stategroup * g;
    1461  
    1462          g = (t_stategroup *) malloc(sizeof *g);
    1463          chknull(g);
    1464          memset((char *) g, 0, sizeof *g);
    1465          g->g_id = groupid++;
    1466          return g;
    1467  }
    1468  
    1469  
    1470  void
    1471  optimizedfa(void)
    1472  
    1473  {
    1474          unsigned int i;
    1475          xmlHashTablePtr h;
    1476          t_state * s1;
    1477          t_state * s2;
    1478          t_state * finstates;
    1479          t_state * * sp;
    1480          t_stategroup * g1;
    1481          t_stategroup * g2;
    1482          t_stategroup * ghead;
    1483          t_transition * t1;
    1484          t_transition * t2;
    1485          unsigned int done;
    1486          unsigned int startgroup;
    1487          unsigned int gtrans[1 << (8 * sizeof(unsigned char))];
    1488  
    1489          /**
    1490          ***     Reduce DFA state count.
    1491          **/
    1492  
    1493          groupid = 0;
    1494          ghead = (t_stategroup *) NULL;
    1495  
    1496          /**
    1497          ***     First split: non-final and each distinct final states.
    1498          **/
    1499  
    1500          h = xmlHashCreate(4);
    1501          chknull(h);
    1502  
    1503          for (s1 = dfa_states; s1; s1 = s1->s_next) {
    1504                  if (!(g1 = hash_get(h, &s1->s_final, sizeof s1->s_final))) {
    1505                          g1 = newgroup();
    1506                          g1->g_next = ghead;
    1507                          ghead = g1;
    1508  
    1509                          if (hash_add(h, &s1->s_final, sizeof s1->s_final, g1))
    1510                                  chknull(NULL);  /* Memory allocation error. */
    1511                          }
    1512  
    1513                  s1->s_index = g1->g_id;
    1514                  s1->s_stack = g1->g_member;
    1515                  g1->g_member = s1;
    1516                  }
    1517  
    1518          xmlHashFree(h, NULL);
    1519  
    1520          /**
    1521          ***     Subsequent splits: states that have the same forward
    1522          ***             transition tokens to states in the same group.
    1523          **/
    1524  
    1525          do {
    1526                  done = 1;
    1527  
    1528                  for (g2 = ghead; g2; g2 = g2->g_next) {
    1529                          s1 = g2->g_member;
    1530  
    1531                          if (!s1->s_stack)
    1532                                  continue;
    1533  
    1534                          h = xmlHashCreate(1);
    1535                          chknull(h);
    1536  
    1537                          /**
    1538                          ***     Build the group transition map.
    1539                          **/
    1540  
    1541                          memset((char *) gtrans, ~0, sizeof gtrans);
    1542  
    1543                          for (t1 = s1->s_forward; t1; t1 = t1->t_forwnext)
    1544                                  gtrans[t1->t_token] = t1->t_to->s_index;
    1545  
    1546                          if (hash_add(h, gtrans, sizeof gtrans, g2))
    1547                                  chknull(NULL);
    1548  
    1549                          /**
    1550                          ***     Process other states in group.
    1551                          **/
    1552  
    1553                          sp = &s1->s_stack;
    1554                          s1 = *sp;
    1555  
    1556                          do {
    1557                                  *sp = s1->s_stack;
    1558  
    1559                                  /**
    1560                                  ***     Build the transition map.
    1561                                  **/
    1562  
    1563                                  memset((char *) gtrans, ~0, sizeof gtrans);
    1564  
    1565                                  for (t1 = s1->s_forward;
    1566                                      t1; t1 = t1->t_forwnext)
    1567                                          gtrans[t1->t_token] = t1->t_to->s_index;
    1568  
    1569                                  g1 = hash_get(h, gtrans, sizeof gtrans);
    1570  
    1571                                  if (g1 == g2) {
    1572                                          *sp = s1;
    1573                                          sp = &s1->s_stack;
    1574                                          }
    1575                                  else {
    1576                                          if (!g1) {
    1577                                                  g1 = newgroup();
    1578                                                  g1->g_next = ghead;
    1579                                                  ghead = g1;
    1580  
    1581                                                  if (hash_add(h, gtrans,
    1582                                                      sizeof gtrans, g1))
    1583                                                          chknull(NULL);
    1584                                                  }
    1585  
    1586                                          s1->s_index = g1->g_id;
    1587                                          s1->s_stack = g1->g_member;
    1588                                          g1->g_member = s1;
    1589                                          done = 0;
    1590                                          }
    1591                          } while (s1 = *sp);
    1592  
    1593                          xmlHashFree(h, NULL);
    1594                          }
    1595          } while (!done);
    1596  
    1597          /**
    1598          ***     Establish group leaders and remap transitions.
    1599          **/
    1600  
    1601          startgroup = dfa_states->s_index;
    1602  
    1603          for (g1 = ghead; g1; g1 = g1->g_next)
    1604                  for (s1 = g1->g_member->s_stack; s1; s1 = s1->s_stack)
    1605                          for (t1 = s1->s_backward; t1; t1 = t2) {
    1606                                  t2 = t1->t_backnext;
    1607                                  link_transition(t1, NULL, g1->g_member);
    1608                                  }
    1609  
    1610          /**
    1611          ***     Remove redundant states and transitions.
    1612          **/
    1613  
    1614          for (g1 = ghead; g1; g1 = g1->g_next) {
    1615                  g1->g_member->s_next = (t_state *) NULL;
    1616  
    1617                  while ((s1 = g1->g_member->s_stack)) {
    1618                          g1->g_member->s_stack = s1->s_stack;
    1619  
    1620                          for (t1 = s1->s_forward; t1; t1 = t2) {
    1621                                  t2 = t1->t_forwnext;
    1622                                  unlink_transition(t1);
    1623                                  free((char *) t1);
    1624                                  }
    1625  
    1626                          free((char *) s1);
    1627                          }
    1628                  }
    1629  
    1630          /**
    1631          ***     Remove group support and relink DFA states.
    1632          **/
    1633  
    1634          dfa_states = (t_state *) NULL;
    1635          s2 = (t_state *) NULL;
    1636          finstates = (t_state *) NULL;
    1637  
    1638          while (g1 = ghead) {
    1639                  ghead = g1->g_next;
    1640                  s1 = g1->g_member;
    1641  
    1642                  if (g1->g_id == startgroup)
    1643                          dfa_states = s1;        /* Keep start state first. */
    1644                  else if (s1->s_final) {         /* Then final states. */
    1645                          s1->s_next = finstates;
    1646                          finstates = s1;
    1647                          }
    1648                  else {                  /* Finish with non-final states. */
    1649                          s1->s_next = s2;
    1650                          s2 = s1;
    1651                          }
    1652  
    1653                  free((char *) g1);
    1654                  }
    1655  
    1656          for (dfa_states->s_next = finstates; finstates->s_next;)
    1657                  finstates = finstates->s_next;
    1658  
    1659          finstates->s_next = s2;
    1660  }
    1661  
    1662  
    1663  const char *
    1664  inttype(unsigned long max)
    1665  
    1666  {
    1667          int i;
    1668  
    1669          for (i = 0; max; i++)
    1670                  max >>= 1;
    1671  
    1672          if (i > 8 * sizeof(unsigned int))
    1673                  return "unsigned long";
    1674  
    1675          if (i > 8 * sizeof(unsigned short))
    1676                  return "unsigned int";
    1677  
    1678          if (i > 8 * sizeof(unsigned char))
    1679                  return "unsigned short";
    1680  
    1681          return "unsigned char";
    1682  }
    1683  
    1684  
    1685  listids(FILE * fp)
    1686  
    1687  {
    1688          unsigned int pos;
    1689          t_chset * cp;
    1690          t_symlist * lp;
    1691          char * srcp;
    1692          char * dstp;
    1693          size_t srcc;
    1694          size_t dstc;
    1695          char buf[80];
    1696  
    1697          fprintf(fp, "/**\n***     CCSID   For arg   Recognized name.\n");
    1698          pos = 0;
    1699  
    1700          for (cp = chset_list; cp; cp = cp->c_next) {
    1701                  if (pos) {
    1702                          fprintf(fp, "\n");
    1703                          pos = 0;
    1704                          }
    1705  
    1706                  if (!cp->c_names)
    1707                          continue;
    1708  
    1709                  pos = fprintf(fp, "***     %5u      %c     ", cp->c_ccsid,
    1710                      iconv_open_error(cp->c_fromUTF8)? ' ': 'X');
    1711  
    1712                  for (lp = cp->c_names; lp; lp = lp->l_next) {
    1713                          srcp = (char *) lp->l_symbol;
    1714                          srcc = strlen(srcp);
    1715                          dstp = buf;
    1716                          dstc = sizeof buf;
    1717                          iconv(utf82job, &srcp, &srcc, &dstp, &dstc);
    1718                          srcc = dstp - buf;
    1719  
    1720                          if (pos + srcc > 79) {
    1721                                  fprintf(fp, "\n***%22c", ' ');
    1722                                  pos = 25;
    1723                                  }
    1724  
    1725                          pos += fprintf(fp, " %.*s", srcc, buf);
    1726                          }
    1727                  }
    1728  
    1729          if (pos)
    1730                  fprintf(fp, "\n");
    1731  
    1732          fprintf(fp, "**/\n\n");
    1733  }
    1734  
    1735  
    1736  void
    1737  generate(FILE * fp)
    1738  
    1739  {
    1740          unsigned int nstates;
    1741          unsigned int ntrans;
    1742          unsigned int maxfinal;
    1743          t_state * s;
    1744          t_transition * t;
    1745          unsigned int i;
    1746          unsigned int pos;
    1747          char * ns;
    1748  
    1749          /**
    1750          ***     Assign indexes to states and transitions.
    1751          **/
    1752  
    1753          nstates = 0;
    1754          ntrans = 0;
    1755          maxfinal = 0;
    1756  
    1757          for (s = dfa_states; s; s = s->s_next) {
    1758                  s->s_index = nstates++;
    1759  
    1760                  if (s->s_final)
    1761                          maxfinal = nstates;
    1762  
    1763                  for (t = s->s_forward; t; t = t->t_forwnext)
    1764                          t->t_index = ntrans++;
    1765                  }
    1766  
    1767          fprintf(fp,
    1768              "/**\n***     %u states, %u finals, %u transitions.\n**/\n\n",
    1769              nstates, maxfinal, ntrans);
    1770          fprintf(stderr, "%u states, %u finals, %u transitions.\n",
    1771              nstates, maxfinal, ntrans);
    1772  
    1773          /**
    1774          ***     Generate types.
    1775          **/
    1776  
    1777          fprintf(fp, "typedef unsigned short          t_ccsid;\n");
    1778          fprintf(fp, "typedef %-23s t_staterange;\n", inttype(nstates));
    1779          fprintf(fp, "typedef %-23s t_transrange;\n\n", inttype(ntrans));
    1780  
    1781          /**
    1782          ***     Generate first transition index for each state.
    1783          **/
    1784  
    1785          fprintf(fp, "static t_transrange     trans_array[] = {\n");
    1786          pos = 0;
    1787          ntrans = 0;
    1788  
    1789          for (s = dfa_states; s; s = s->s_next) {
    1790                  pos += fprintf(fp, " %u,", ntrans);
    1791  
    1792                  if (pos > 72) {
    1793                          fprintf(fp, "\n");
    1794                          pos = 0;
    1795                          }
    1796  
    1797                  for (t = s->s_forward; t; t = t->t_forwnext)
    1798                          ntrans++;
    1799                  }
    1800  
    1801          fprintf(fp, " %u\n};\n\n", ntrans);
    1802  
    1803          /**
    1804          ***     Generate final state info.
    1805          **/
    1806  
    1807          fprintf(fp, "static t_ccsid          final_array[] = {\n");
    1808          pos = 0;
    1809          ns ="";
    1810          i = 0;
    1811  
    1812          for (s = dfa_states; s && i++ < maxfinal; s = s->s_next) {
    1813                  pos += fprintf(fp, "%s", ns);
    1814                  ns = ",";
    1815  
    1816                  if (pos > 72) {
    1817                          fprintf(fp, "\n");
    1818                          pos = 0;
    1819                          }
    1820  
    1821                  pos += fprintf(fp, " %u",
    1822                      s->s_final? s->s_final->c_ccsid + 1: 0);
    1823                  }
    1824  
    1825          fprintf(fp, "\n};\n\n");
    1826  
    1827          /**
    1828          ***     Generate goto table.
    1829          **/
    1830  
    1831          fprintf(fp, "static t_staterange     goto_array[] = {\n");
    1832          pos = 0;
    1833  
    1834          for (s = dfa_states; s; s = s->s_next)
    1835                  for (t = s->s_forward; t; t = t->t_forwnext) {
    1836                          pos += fprintf(fp, " %u,", t->t_to->s_index);
    1837  
    1838                          if (pos > 72) {
    1839                                  fprintf(fp, "\n");
    1840                                  pos = 0;
    1841                                  }
    1842                          }
    1843  
    1844          fprintf(fp, " %u\n};\n\n", nstates);
    1845  
    1846          /**
    1847          ***     Generate transition label table.
    1848          **/
    1849  
    1850          fprintf(fp, "static unsigned char    label_array[] = {\n");
    1851          pos = 0;
    1852          ns ="";
    1853  
    1854          for (s = dfa_states; s; s = s->s_next)
    1855                  for (t = s->s_forward; t; t = t->t_forwnext) {
    1856                          pos += fprintf(fp, "%s", ns);
    1857                          ns = ",";
    1858  
    1859                          if (pos > 72) {
    1860                                  fprintf(fp, "\n");
    1861                                  pos = 0;
    1862                                  }
    1863  
    1864                          pos += fprintf(fp, " 0x%02X", t->t_token);
    1865                          }
    1866  
    1867          fprintf(fp, "\n};\n", nstates);
    1868  }
    1869  
    1870  
    1871  main(argc, argv)
    1872  int argc;
    1873  char * * argv;
    1874  
    1875  {
    1876          FILE * fp;
    1877          t_chset * csp;
    1878          char symbuf[20];
    1879  
    1880          chset_list = (t_chset *) NULL;
    1881          initial_state = newstate();
    1882          job2utf8 = iconv_open_ccsid(C_UTF8_CCSID, C_SOURCE_CCSID, 0);
    1883          utf82job = iconv_open_ccsid(C_SOURCE_CCSID, C_UTF8_CCSID, 0);
    1884  
    1885          if (argc != 4) {
    1886                  fprintf(stderr, "Usage: %s <ccsid-mibenum file> ", *argv);
    1887                  fprintf(stderr, "<iana-character-set file> <output file>\n");
    1888                  exit(1);
    1889                  }
    1890  
    1891          /**
    1892          ***     Read CCSID/MIBenum associations. Define special names.
    1893          **/
    1894  
    1895          read_assocs(argv[1]);
    1896  
    1897          /**
    1898          ***     Read character set names and establish the case-independent
    1899          ***             name DFA in all possible CCSIDs.
    1900          **/
    1901  
    1902          read_iana(argv[2]);
    1903  
    1904          /**
    1905          ***     Build DFA from NFA.
    1906          **/
    1907  
    1908          builddfa();
    1909  
    1910          /**
    1911          ***     Delete NFA.
    1912          **/
    1913  
    1914          deletenfa();
    1915  
    1916          /**
    1917          ***     Minimize the DFA state count.
    1918          **/
    1919  
    1920          optimizedfa();
    1921  
    1922          /**
    1923          ***     Generate the table.
    1924          **/
    1925  
    1926          fp = fopen(argv[3], "w+");
    1927  
    1928          if (!fp) {
    1929                  perror(argv[3]);
    1930                  exit(1);
    1931                  }
    1932  
    1933          fprintf(fp, "/**\n");
    1934          fprintf(fp, "***     Character set names table.\n");
    1935          fprintf(fp, "***     Generated by program BLDCSNDFA from");
    1936          fprintf(fp, " IANA character set assignment file\n");
    1937          fprintf(fp, "***          and CCSID/MIBenum equivalence file.\n");
    1938          fprintf(fp, "***     *** Do not edit by hand ***\n");
    1939          fprintf(fp, "**/\n\n");
    1940          listids(fp);
    1941          generate(fp);
    1942  
    1943          if (ferror(fp)) {
    1944                  perror(argv[3]);
    1945                  fclose(fp);
    1946                  exit(1);
    1947                  }
    1948  
    1949          fclose(fp);
    1950          iconv_close(job2utf8);
    1951          iconv_close(utf82job);
    1952          exit(0);
    1953  }