1  /*
       2                              __  __            _
       3                           ___\ \/ /_ __   __ _| |_
       4                          / _ \\  /| '_ \ / _` | __|
       5                         |  __//  \| |_) | (_| | |_
       6                          \___/_/\_\ .__/ \__,_|\__|
       7                                   |_| XML parser
       8  
       9     Copyright (c) 1997-2000 Thai Open Source Software Center Ltd
      10     Copyright (c) 2000      Clark Cooper <coopercc@users.sourceforge.net>
      11     Copyright (c) 2001-2003 Fred L. Drake, Jr. <fdrake@users.sourceforge.net>
      12     Copyright (c) 2002      Greg Stein <gstein@users.sourceforge.net>
      13     Copyright (c) 2002-2016 Karl Waclawek <karl@waclawek.net>
      14     Copyright (c) 2005-2009 Steven Solie <steven@solie.ca>
      15     Copyright (c) 2016-2022 Sebastian Pipping <sebastian@pipping.org>
      16     Copyright (c) 2016      Pascal Cuoq <cuoq@trust-in-soft.com>
      17     Copyright (c) 2016      Don Lewis <truckman@apache.org>
      18     Copyright (c) 2017      Rhodri James <rhodri@wildebeest.org.uk>
      19     Copyright (c) 2017      Alexander Bluhm <alexander.bluhm@gmx.net>
      20     Copyright (c) 2017      Benbuck Nason <bnason@netflix.com>
      21     Copyright (c) 2017      José Gutiérrez de la Concha <jose@zeroc.com>
      22     Copyright (c) 2019      David Loffredo <loffredo@steptools.com>
      23     Copyright (c) 2021      Dong-hee Na <donghee.na@python.org>
      24     Copyright (c) 2022      Martin Ettl <ettl.martin78@googlemail.com>
      25     Licensed under the MIT license:
      26  
      27     Permission is  hereby granted,  free of charge,  to any  person obtaining
      28     a  copy  of  this  software   and  associated  documentation  files  (the
      29     "Software"),  to  deal in  the  Software  without restriction,  including
      30     without  limitation the  rights  to use,  copy,  modify, merge,  publish,
      31     distribute, sublicense, and/or sell copies of the Software, and to permit
      32     persons  to whom  the Software  is  furnished to  do so,  subject to  the
      33     following conditions:
      34  
      35     The above copyright  notice and this permission notice  shall be included
      36     in all copies or substantial portions of the Software.
      37  
      38     THE  SOFTWARE  IS  PROVIDED  "AS  IS",  WITHOUT  WARRANTY  OF  ANY  KIND,
      39     EXPRESS  OR IMPLIED,  INCLUDING  BUT  NOT LIMITED  TO  THE WARRANTIES  OF
      40     MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
      41     NO EVENT SHALL THE AUTHORS OR  COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
      42     DAMAGES OR  OTHER LIABILITY, WHETHER  IN AN  ACTION OF CONTRACT,  TORT OR
      43     OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
      44     USE OR OTHER DEALINGS IN THE SOFTWARE.
      45  */
      46  
      47  #include <expat_config.h>
      48  
      49  #include <stddef.h>
      50  #include <string.h> /* memcpy */
      51  #include <stdbool.h>
      52  
      53  #ifdef _WIN32
      54  #  include "winconfig.h"
      55  #endif
      56  
      57  #include "expat_external.h"
      58  #include "internal.h"
      59  #include "xmltok.h"
      60  #include "nametab.h"
      61  
/* Optional extra entry appended to the first initializer group of VTABLE1
   (after cdataSectionTok).  It names the ignoreSectionTok scanner when
   XML_DTD is defined and expands to nothing otherwise. */
#ifdef XML_DTD
#  define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok)
#else
#  define IGNORE_SECTION_TOK_VTABLE /* as nothing */
#endif
      67  
/* Initializer fragment naming the PREFIX-qualified tokenizer functions:
   a braced group of content scanners, a braced group of literal scanners,
   followed by the individual helper functions.  PREFIX must be defined at
   the point of expansion.
   NOTE(review): the order presumably mirrors the leading members of
   ENCODING as declared in xmltok.h -- confirm there before reordering. */
#define VTABLE1                                                                \
  {PREFIX(prologTok), PREFIX(contentTok),                                      \
   PREFIX(cdataSectionTok) IGNORE_SECTION_TOK_VTABLE},                         \
      {PREFIX(attributeValueTok), PREFIX(entityValueTok)},                     \
      PREFIX(nameMatchesAscii), PREFIX(nameLength), PREFIX(skipS),             \
      PREFIX(getAtts), PREFIX(charRefNumber), PREFIX(predefinedEntityName),    \
      PREFIX(updatePosition), PREFIX(isPublicId)

/* VTABLE1 plus the PREFIX-qualified toUtf8/toUtf16 converters. */
#define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)
      77  
      78  #define UCS2_GET_NAMING(pages, hi, lo)                                         \
      79    (namingBitmap[(pages[hi] << 3) + ((lo) >> 5)] & (1u << ((lo)&0x1F)))
      80  
/* A 2-byte UTF-8 sequence splits the character's 11 bits between the
   bottom 5 bits of the first byte and the bottom 6 bits of the second.
   The top 3 of those bits index into pages, the next 3 bits are added to
   the page entry (shifted left by 3) to select a namingBitmap word, and
   the low 5 bits generate the mask for the bit inside that word.
   `byte` must point to unsigned bytes.
*/
#define UTF8_GET_NAMING2(pages, byte)                                          \
  (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3)                         \
                + ((((byte)[0]) & 3) << 1) + ((((byte)[1]) >> 5) & 1)]         \
   & (1u << (((byte)[1]) & 0x1F)))
      89  
/* A 3-byte UTF-8 sequence splits the character's 16 bits between the
   bottom 4, 6 and 6 bits of the bytes.  The top 8 of those bits index
   into pages, the next 3 bits are added to the page entry (shifted left
   by 3) to select a namingBitmap word, and the low 5 bits generate the
   mask for the bit inside that word.  `byte` must point to unsigned bytes.
*/
#define UTF8_GET_NAMING3(pages, byte)                                          \
  (namingBitmap                                                                \
       [((pages)[((((byte)[0]) & 0xF) << 4) + ((((byte)[1]) >> 2) & 0xF)]      \
         << 3)                                                                 \
        + ((((byte)[1]) & 3) << 1) + ((((byte)[2]) >> 5) & 1)]                 \
   & (1u << (((byte)[2]) & 0x1F)))
     101  
     102  /* Detection of invalid UTF-8 sequences is based on Table 3.1B
     103     of Unicode 3.2: http://www.unicode.org/unicode/reports/tr28/
     104     with the additional restriction of not allowing the Unicode
     105     code points 0xFFFF and 0xFFFE (sequences EF,BF,BF and EF,BF,BE).
     106     Implementation details:
     107       (A & 0x80) == 0     means A < 0x80
     108     and
     109       (A & 0xC0) == 0xC0  means A > 0xBF
     110  */
     111  
     112  #define UTF8_INVALID2(p)                                                       \
     113    ((*p) < 0xC2 || ((p)[1] & 0x80) == 0 || ((p)[1] & 0xC0) == 0xC0)
     114  
     115  #define UTF8_INVALID3(p)                                                       \
     116    (((p)[2] & 0x80) == 0                                                        \
     117     || ((*p) == 0xEF && (p)[1] == 0xBF ? (p)[2] > 0xBD                          \
     118                                        : ((p)[2] & 0xC0) == 0xC0)               \
     119     || ((*p) == 0xE0                                                            \
     120             ? (p)[1] < 0xA0 || ((p)[1] & 0xC0) == 0xC0                          \
     121             : ((p)[1] & 0x80) == 0                                              \
     122                   || ((*p) == 0xED ? (p)[1] > 0x9F : ((p)[1] & 0xC0) == 0xC0)))
     123  
     124  #define UTF8_INVALID4(p)                                                       \
     125    (((p)[3] & 0x80) == 0 || ((p)[3] & 0xC0) == 0xC0 || ((p)[2] & 0x80) == 0     \
     126     || ((p)[2] & 0xC0) == 0xC0                                                  \
     127     || ((*p) == 0xF0                                                            \
     128             ? (p)[1] < 0x90 || ((p)[1] & 0xC0) == 0xC0                          \
     129             : ((p)[1] & 0x80) == 0                                              \
     130                   || ((*p) == 0xF4 ? (p)[1] > 0x8F : ((p)[1] & 0xC0) == 0xC0)))
     131  
     132  static int PTRFASTCALL
     133  isNever(const ENCODING *enc, const char *p) {
     134    UNUSED_P(enc);
     135    UNUSED_P(p);
     136    return 0;
     137  }
     138  
     139  static int PTRFASTCALL
     140  utf8_isName2(const ENCODING *enc, const char *p) {
     141    UNUSED_P(enc);
     142    return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
     143  }
     144  
     145  static int PTRFASTCALL
     146  utf8_isName3(const ENCODING *enc, const char *p) {
     147    UNUSED_P(enc);
     148    return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);
     149  }
     150  
/* 4-byte sequences are never reported as name characters. */
#define utf8_isName4 isNever
     152  
     153  static int PTRFASTCALL
     154  utf8_isNmstrt2(const ENCODING *enc, const char *p) {
     155    UNUSED_P(enc);
     156    return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p);
     157  }
     158  
     159  static int PTRFASTCALL
     160  utf8_isNmstrt3(const ENCODING *enc, const char *p) {
     161    UNUSED_P(enc);
     162    return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);
     163  }
     164  
/* 4-byte sequences are never reported as name-start characters. */
#define utf8_isNmstrt4 isNever
     166  
     167  static int PTRFASTCALL
     168  utf8_isInvalid2(const ENCODING *enc, const char *p) {
     169    UNUSED_P(enc);
     170    return UTF8_INVALID2((const unsigned char *)p);
     171  }
     172  
     173  static int PTRFASTCALL
     174  utf8_isInvalid3(const ENCODING *enc, const char *p) {
     175    UNUSED_P(enc);
     176    return UTF8_INVALID3((const unsigned char *)p);
     177  }
     178  
     179  static int PTRFASTCALL
     180  utf8_isInvalid4(const ENCODING *enc, const char *p) {
     181    UNUSED_P(enc);
     182    return UTF8_INVALID4((const unsigned char *)p);
     183  }
     184  
/* An ENCODING extended with a per-byte classification table and the hooks
   needed to handle multi-byte characters.  `enc` is the first member;
   AS_NORMAL_ENCODING() casts an ENCODING pointer to this type.  The members
   are initialized positionally by the encoding tables in this file, so
   their order must not change. */
struct normal_encoding {
  ENCODING enc;
  /* byte type for every possible byte value, indexed with the byte
     converted to unsigned char (see SB_BYTE_TYPE) */
  unsigned char type[256];
#ifdef XML_MIN_SIZE
  /* Hooks that exist only in XML_MIN_SIZE builds; they back the BYTE_TYPE,
     IS_NAME_CHAR_MINBPC, IS_NMSTRT_CHAR_MINBPC, BYTE_TO_ASCII and
     CHAR_MATCHES macros respectively. */
  int(PTRFASTCALL *byteType)(const ENCODING *, const char *);
  int(PTRFASTCALL *isNameMin)(const ENCODING *, const char *);
  int(PTRFASTCALL *isNmstrtMin)(const ENCODING *, const char *);
  int(PTRFASTCALL *byteToAscii)(const ENCODING *, const char *);
  int(PTRCALL *charMatches)(const ENCODING *, const char *, int);
#endif /* XML_MIN_SIZE */
  /* name-character tests for 2-, 3- and 4-byte sequences (IS_NAME_CHAR) */
  int(PTRFASTCALL *isName2)(const ENCODING *, const char *);
  int(PTRFASTCALL *isName3)(const ENCODING *, const char *);
  int(PTRFASTCALL *isName4)(const ENCODING *, const char *);
  /* name-start tests for 2-, 3- and 4-byte sequences (IS_NMSTRT_CHAR) */
  int(PTRFASTCALL *isNmstrt2)(const ENCODING *, const char *);
  int(PTRFASTCALL *isNmstrt3)(const ENCODING *, const char *);
  int(PTRFASTCALL *isNmstrt4)(const ENCODING *, const char *);
  /* malformed-sequence tests for 2-, 3- and 4-byte sequences
     (IS_INVALID_CHAR); these nine pointers are all NULL in encodings
     initialized with NULL_VTABLE */
  int(PTRFASTCALL *isInvalid2)(const ENCODING *, const char *);
  int(PTRFASTCALL *isInvalid3)(const ENCODING *, const char *);
  int(PTRFASTCALL *isInvalid4)(const ENCODING *, const char *);
};
     205  
/* Views an ENCODING pointer as a pointer to the (const) struct
   normal_encoding it is cast to; the caller must know that enc really is
   the `enc` member of such a struct. */
#define AS_NORMAL_ENCODING(enc) ((const struct normal_encoding *)(enc))
     207  
/* STANDARD_VTABLE(E) supplies the initializers for the XML_MIN_SIZE-only
   function pointers of struct normal_encoding, using functions whose names
   start with the prefix E.  Note the trailing comma: the expansion is meant
   to be followed directly by NORMAL_VTABLE or NULL_VTABLE.  Without
   XML_MIN_SIZE those members do not exist and the macro expands to
   nothing. */
#ifdef XML_MIN_SIZE

#  define STANDARD_VTABLE(E)                                                   \
    E##byteType, E##isNameMin, E##isNmstrtMin, E##byteToAscii, E##charMatches,

#else

#  define STANDARD_VTABLE(E) /* as nothing */

#endif
     218  
/* Initializers for the nine multi-byte hooks of struct normal_encoding
   (isName2..4, isNmstrt2..4, isInvalid2..4), taken from functions or
   macros whose names start with the prefix E. */
#define NORMAL_VTABLE(E)                                                       \
  E##isName2, E##isName3, E##isName4, E##isNmstrt2, E##isNmstrt3,              \
      E##isNmstrt4, E##isInvalid2, E##isInvalid3, E##isInvalid4
     222  
/* Initializers that set all nine multi-byte hooks of struct
   normal_encoding to NULL; used in this file by the latin1 and ascii
   encoding tables. */
#define NULL_VTABLE                                                            \
  /* isName2 */ NULL, /* isName3 */ NULL, /* isName4 */ NULL,                  \
      /* isNmstrt2 */ NULL, /* isNmstrt3 */ NULL, /* isNmstrt4 */ NULL,        \
      /* isInvalid2 */ NULL, /* isInvalid3 */ NULL, /* isInvalid4 */ NULL
     227  
/* Forward declaration; being static, the function is defined elsewhere in
   this translation unit.  It is declared ahead of the inclusion of
   xmltok_impl.h / xmltok_impl.c below. */
static int FASTCALL checkCharRefNumber(int);
     229  
     230  #include "xmltok_impl.h"
     231  #include "ascii.h"
     232  
/* In XML_MIN_SIZE builds STANDARD_VTABLE(sb_) needs sb_isNameMin and
   sb_isNmstrtMin; both are aliases of isNever, i.e. they always
   return 0. */
#ifdef XML_MIN_SIZE
#  define sb_isNameMin isNever
#  define sb_isNmstrtMin isNever
#endif
     237  
/* MINBPC(enc): minimum bytes per character.  XML_MIN_SIZE builds read it
   from the encoding's minBytesPerChar member; otherwise it is the constant
   1 for the code instantiated below. */
#ifdef XML_MIN_SIZE
#  define MINBPC(enc) ((enc)->minBytesPerChar)
#else
/* minimum bytes per character */
#  define MINBPC(enc) 1
#endif
     244  
     245  #define SB_BYTE_TYPE(enc, p)                                                   \
     246    (((struct normal_encoding *)(enc))->type[(unsigned char)*(p)])
     247  
/* BYTE_TYPE(enc, p): classification of the character at p.  XML_MIN_SIZE
   builds dispatch through the encoding's byteType function pointer, for
   which sb_byteType is the table-lookup implementation; other builds
   expand directly to the table lookup. */
#ifdef XML_MIN_SIZE
static int PTRFASTCALL
sb_byteType(const ENCODING *enc, const char *p) {
  return SB_BYTE_TYPE(enc, p);
}
#  define BYTE_TYPE(enc, p) (AS_NORMAL_ENCODING(enc)->byteType(enc, p))
#else
#  define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)
#endif
     257  
/* BYTE_TO_ASCII(enc, p): value of the character at p as a plain char/int.
   XML_MIN_SIZE builds dispatch through the encoding's byteToAscii function
   pointer, for which sb_byteToAscii simply returns *p; other builds expand
   directly to *p. */
#ifdef XML_MIN_SIZE
#  define BYTE_TO_ASCII(enc, p) (AS_NORMAL_ENCODING(enc)->byteToAscii(enc, p))
static int PTRFASTCALL
sb_byteToAscii(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return *p;
}
#else
#  define BYTE_TO_ASCII(enc, p) (*(p))
#endif
     268  
/* Multi-byte character tests, where n (2, 3 or 4) is pasted onto the
   member name to select the isName<n> / isNmstrt<n> / isInvalid<n> hook of
   the encoding.  IS_NAME_CHAR and IS_NMSTRT_CHAR call the hook
   unconditionally.  IS_INVALID_CHAR additionally tolerates a NULL hook in
   XML_MIN_SIZE builds (yielding 0); in other builds the hook is called
   without a NULL check. */
#define IS_NAME_CHAR(enc, p, n) (AS_NORMAL_ENCODING(enc)->isName##n(enc, p))
#define IS_NMSTRT_CHAR(enc, p, n) (AS_NORMAL_ENCODING(enc)->isNmstrt##n(enc, p))
#ifdef XML_MIN_SIZE
#  define IS_INVALID_CHAR(enc, p, n)                                           \
    (AS_NORMAL_ENCODING(enc)->isInvalid##n                                     \
     && AS_NORMAL_ENCODING(enc)->isInvalid##n(enc, p))
#else
#  define IS_INVALID_CHAR(enc, p, n)                                           \
    (AS_NORMAL_ENCODING(enc)->isInvalid##n(enc, p))
#endif
     279  
/* Name / name-start tests for a character of the minimum width.
   XML_MIN_SIZE builds dispatch through the isNameMin / isNmstrtMin hooks;
   other builds hard-wire the answer to 0. */
#ifdef XML_MIN_SIZE
#  define IS_NAME_CHAR_MINBPC(enc, p)                                          \
    (AS_NORMAL_ENCODING(enc)->isNameMin(enc, p))
#  define IS_NMSTRT_CHAR_MINBPC(enc, p)                                        \
    (AS_NORMAL_ENCODING(enc)->isNmstrtMin(enc, p))
#else
#  define IS_NAME_CHAR_MINBPC(enc, p) (0)
#  define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)
#endif
     289  
/* CHAR_MATCHES(enc, p, c): non-zero if the character at p equals c.
   XML_MIN_SIZE builds dispatch through the encoding's charMatches hook,
   for which sb_charMatches compares *p with c; other builds expand
   directly to that comparison. */
#ifdef XML_MIN_SIZE
#  define CHAR_MATCHES(enc, p, c)                                              \
    (AS_NORMAL_ENCODING(enc)->charMatches(enc, p, c))
static int PTRCALL
sb_charMatches(const ENCODING *enc, const char *p, int c) {
  UNUSED_P(enc);
  return *p == c;
}
#else
/* c is an ASCII character */
#  define CHAR_MATCHES(enc, p, c) (*(p) == (c))
#endif
     302  
/* Instantiate the tokenizer implementation from xmltok_impl.c with every
   PREFIX()-wrapped identifier named normal_<ident>, using the
   MINBPC / BYTE_TYPE / ... macros defined above.  XML_TOK_IMPL_C is
   defined only around the #include. */
#define PREFIX(ident) normal_##ident
#define XML_TOK_IMPL_C
#include "xmltok_impl.c"
#undef XML_TOK_IMPL_C

/* Drop the per-instantiation macros again.  PREFIX itself stays defined,
   so the VTABLE1 uses below resolve to the normal_ functions.
   NOTE(review): presumably these macros are redefined for further
   instantiations later in the file -- not visible here. */
#undef MINBPC
#undef BYTE_TYPE
#undef BYTE_TO_ASCII
#undef CHAR_MATCHES
#undef IS_NAME_CHAR
#undef IS_NAME_CHAR_MINBPC
#undef IS_NMSTRT_CHAR
#undef IS_NMSTRT_CHAR_MINBPC
#undef IS_INVALID_CHAR
     317  
/* Marker bits of the first byte of an N-byte UTF-8 sequence.  The encoders
   in this file OR the matching constant into the payload bits of the lead
   byte (e.g. `(c >> 6) | UTF8_cval2` in latin1_toUtf8). */
enum { /* UTF8_cvalN is value of masked first byte of N byte sequence */
       UTF8_cval1 = 0x00,
       UTF8_cval2 = 0xc0,
       UTF8_cval3 = 0xe0,
       UTF8_cval4 = 0xf0
};
     324  
     325  void
     326  _INTERNAL_trim_to_complete_utf8_characters(const char *from,
     327                                             const char **fromLimRef) {
     328    const char *fromLim = *fromLimRef;
     329    size_t walked = 0;
     330    for (; fromLim > from; fromLim--, walked++) {
     331      const unsigned char prev = (unsigned char)fromLim[-1];
     332      if ((prev & 0xf8u)
     333          == 0xf0u) { /* 4-byte character, lead by 0b11110xxx byte */
     334        if (walked + 1 >= 4) {
     335          fromLim += 4 - 1;
     336          break;
     337        } else {
     338          walked = 0;
     339        }
     340      } else if ((prev & 0xf0u)
     341                 == 0xe0u) { /* 3-byte character, lead by 0b1110xxxx byte */
     342        if (walked + 1 >= 3) {
     343          fromLim += 3 - 1;
     344          break;
     345        } else {
     346          walked = 0;
     347        }
     348      } else if ((prev & 0xe0u)
     349                 == 0xc0u) { /* 2-byte character, lead by 0b110xxxxx byte */
     350        if (walked + 1 >= 2) {
     351          fromLim += 2 - 1;
     352          break;
     353        } else {
     354          walked = 0;
     355        }
     356      } else if ((prev & 0x80u)
     357                 == 0x00u) { /* 1-byte character, matching 0b0xxxxxxx */
     358        break;
     359      }
     360    }
     361    *fromLimRef = fromLim;
     362  }
     363  
     364  static enum XML_Convert_Result PTRCALL
     365  utf8_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
     366              char **toP, const char *toLim) {
     367    bool input_incomplete = false;
     368    bool output_exhausted = false;
     369  
     370    /* Avoid copying partial characters (due to limited space). */
     371    const ptrdiff_t bytesAvailable = fromLim - *fromP;
     372    const ptrdiff_t bytesStorable = toLim - *toP;
     373    UNUSED_P(enc);
     374    if (bytesAvailable > bytesStorable) {
     375      fromLim = *fromP + bytesStorable;
     376      output_exhausted = true;
     377    }
     378  
     379    /* Avoid copying partial characters (from incomplete input). */
     380    {
     381      const char *const fromLimBefore = fromLim;
     382      _INTERNAL_trim_to_complete_utf8_characters(*fromP, &fromLim);
     383      if (fromLim < fromLimBefore) {
     384        input_incomplete = true;
     385      }
     386    }
     387  
     388    {
     389      const ptrdiff_t bytesToCopy = fromLim - *fromP;
     390      memcpy(*toP, *fromP, bytesToCopy);
     391      *fromP += bytesToCopy;
     392      *toP += bytesToCopy;
     393    }
     394  
     395    if (output_exhausted) /* needs to go first */
     396      return XML_CONVERT_OUTPUT_EXHAUSTED;
     397    else if (input_incomplete)
     398      return XML_CONVERT_INPUT_INCOMPLETE;
     399    else
     400      return XML_CONVERT_COMPLETED;
     401  }
     402  
     403  static enum XML_Convert_Result PTRCALL
     404  utf8_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim,
     405               unsigned short **toP, const unsigned short *toLim) {
     406    enum XML_Convert_Result res = XML_CONVERT_COMPLETED;
     407    unsigned short *to = *toP;
     408    const char *from = *fromP;
     409    while (from < fromLim && to < toLim) {
     410      switch (((struct normal_encoding *)enc)->type[(unsigned char)*from]) {
     411      case BT_LEAD2:
     412        if (fromLim - from < 2) {
     413          res = XML_CONVERT_INPUT_INCOMPLETE;
     414          goto after;
     415        }
     416        *to++ = (unsigned short)(((from[0] & 0x1f) << 6) | (from[1] & 0x3f));
     417        from += 2;
     418        break;
     419      case BT_LEAD3:
     420        if (fromLim - from < 3) {
     421          res = XML_CONVERT_INPUT_INCOMPLETE;
     422          goto after;
     423        }
     424        *to++ = (unsigned short)(((from[0] & 0xf) << 12) | ((from[1] & 0x3f) << 6)
     425                                 | (from[2] & 0x3f));
     426        from += 3;
     427        break;
     428      case BT_LEAD4: {
     429        unsigned long n;
     430        if (toLim - to < 2) {
     431          res = XML_CONVERT_OUTPUT_EXHAUSTED;
     432          goto after;
     433        }
     434        if (fromLim - from < 4) {
     435          res = XML_CONVERT_INPUT_INCOMPLETE;
     436          goto after;
     437        }
     438        n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12)
     439            | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f);
     440        n -= 0x10000;
     441        to[0] = (unsigned short)((n >> 10) | 0xD800);
     442        to[1] = (unsigned short)((n & 0x3FF) | 0xDC00);
     443        to += 2;
     444        from += 4;
     445      } break;
     446      default:
     447        *to++ = *from++;
     448        break;
     449      }
     450    }
     451    if (from < fromLim)
     452      res = XML_CONVERT_OUTPUT_EXHAUSTED;
     453  after:
     454    *fromP = from;
     455    *toP = to;
     456    return res;
     457  }
     458  
#ifdef XML_NS
/* UTF-8 encoding object for XML_NS builds: normal_ tokenizer functions,
   the UTF-8 converters, a byte type table assembled from asciitab.h and
   utf8tab.h, and the utf8_ multi-byte hooks.
   NOTE(review): the trailing 1, 1, 0 initialize the remaining ENCODING
   members declared in xmltok.h -- confirm their meaning there. */
static const struct normal_encoding utf8_encoding_ns
    = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
       {
#  include "asciitab.h"
#  include "utf8tab.h"
       },
       STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
#endif
     468  
/* UTF-8 encoding object: normal_ tokenizer functions, the UTF-8
   converters, a byte type table assembled from asciitab.h and utf8tab.h,
   and the utf8_ multi-byte hooks.  BT_COLON is defined as BT_NMSTRT while
   asciitab.h is included, so every table entry written as BT_COLON gets
   the BT_NMSTRT value in this table. */
static const struct normal_encoding utf8_encoding
    = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
       {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "utf8tab.h"
       },
       STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
     478  
#ifdef XML_NS

/* UTF-8 encoding object for XML_NS builds whose lower table half comes
   from iasciitab.h instead of asciitab.h; everything else matches
   utf8_encoding_ns.
   NOTE(review): judging by the name this is the variant used for
   internally generated text -- confirm against iasciitab.h and the
   callers. */
static const struct normal_encoding internal_utf8_encoding_ns
    = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
       {
#  include "iasciitab.h"
#  include "utf8tab.h"
       },
       STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};

#endif
     490  
/* UTF-8 encoding object whose lower table half comes from iasciitab.h
   instead of asciitab.h; everything else matches utf8_encoding, including
   BT_COLON being defined as BT_NMSTRT while the ASCII table is included.
   NOTE(review): judging by the name this is the variant used for
   internally generated text -- confirm against iasciitab.h and the
   callers. */
static const struct normal_encoding internal_utf8_encoding
    = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
       {
#define BT_COLON BT_NMSTRT
#include "iasciitab.h"
#undef BT_COLON
#include "utf8tab.h"
       },
       STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
     500  
     501  static enum XML_Convert_Result PTRCALL
     502  latin1_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
     503                char **toP, const char *toLim) {
     504    UNUSED_P(enc);
     505    for (;;) {
     506      unsigned char c;
     507      if (*fromP == fromLim)
     508        return XML_CONVERT_COMPLETED;
     509      c = (unsigned char)**fromP;
     510      if (c & 0x80) {
     511        if (toLim - *toP < 2)
     512          return XML_CONVERT_OUTPUT_EXHAUSTED;
     513        *(*toP)++ = (char)((c >> 6) | UTF8_cval2);
     514        *(*toP)++ = (char)((c & 0x3f) | 0x80);
     515        (*fromP)++;
     516      } else {
     517        if (*toP == toLim)
     518          return XML_CONVERT_OUTPUT_EXHAUSTED;
     519        *(*toP)++ = *(*fromP)++;
     520      }
     521    }
     522  }
     523  
     524  static enum XML_Convert_Result PTRCALL
     525  latin1_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim,
     526                 unsigned short **toP, const unsigned short *toLim) {
     527    UNUSED_P(enc);
     528    while (*fromP < fromLim && *toP < toLim)
     529      *(*toP)++ = (unsigned char)*(*fromP)++;
     530  
     531    if ((*toP == toLim) && (*fromP < fromLim))
     532      return XML_CONVERT_OUTPUT_EXHAUSTED;
     533    else
     534      return XML_CONVERT_COMPLETED;
     535  }
     536  
#ifdef XML_NS

/* Latin-1 encoding object for XML_NS builds: normal_ tokenizer functions,
   the latin1_ converters and a byte type table assembled from asciitab.h
   and latin1tab.h.  All multi-byte hooks are NULL.
   NOTE(review): the trailing 1, 0, 0 initialize the remaining ENCODING
   members declared in xmltok.h -- confirm their meaning there. */
static const struct normal_encoding latin1_encoding_ns
    = {{VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0},
       {
#  include "asciitab.h"
#  include "latin1tab.h"
       },
       STANDARD_VTABLE(sb_) NULL_VTABLE};

#endif
     548  
/* Latin-1 encoding object: normal_ tokenizer functions, the latin1_
   converters and a byte type table assembled from asciitab.h and
   latin1tab.h.  BT_COLON is defined as BT_NMSTRT while asciitab.h is
   included, so every table entry written as BT_COLON gets the BT_NMSTRT
   value.  All multi-byte hooks are NULL. */
static const struct normal_encoding latin1_encoding
    = {{VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0},
       {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "latin1tab.h"
       },
       STANDARD_VTABLE(sb_) NULL_VTABLE};
     558  
     559  static enum XML_Convert_Result PTRCALL
     560  ascii_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
     561               char **toP, const char *toLim) {
     562    UNUSED_P(enc);
     563    while (*fromP < fromLim && *toP < toLim)
     564      *(*toP)++ = *(*fromP)++;
     565  
     566    if ((*toP == toLim) && (*fromP < fromLim))
     567      return XML_CONVERT_OUTPUT_EXHAUSTED;
     568    else
     569      return XML_CONVERT_COMPLETED;
     570  }
     571  
#ifdef XML_NS

/* ASCII encoding object for XML_NS builds: normal_ tokenizer functions,
   ascii_toUtf8 together with latin1_toUtf16 as converters, and a byte type
   table that only lists the entries from asciitab.h.  The remaining table
   entries are implicitly zero, which is BT_NONXML per the comment below.
   All multi-byte hooks are NULL. */
static const struct normal_encoding ascii_encoding_ns
    = {{VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0},
       {
#  include "asciitab.h"
           /* BT_NONXML == 0 */
       },
       STANDARD_VTABLE(sb_) NULL_VTABLE};

#endif
     583  
/* ASCII encoding object: normal_ tokenizer functions, ascii_toUtf8
   together with latin1_toUtf16 as converters, and a byte type table that
   only lists the entries from asciitab.h (with BT_COLON defined as
   BT_NMSTRT while it is included).  The remaining table entries are
   implicitly zero, which is BT_NONXML per the comment below.  All
   multi-byte hooks are NULL. */
static const struct normal_encoding ascii_encoding
    = {{VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0},
       {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
           /* BT_NONXML == 0 */
       },
       STANDARD_VTABLE(sb_) NULL_VTABLE};
     593  
     594  static int PTRFASTCALL
     595  unicode_byte_type(char hi, char lo) {
     596    switch ((unsigned char)hi) {
     597    /* 0xD800-0xDBFF first 16-bit code unit or high surrogate (W1) */
     598    case 0xD8:
     599    case 0xD9:
     600    case 0xDA:
     601    case 0xDB:
     602      return BT_LEAD4;
     603    /* 0xDC00-0xDFFF second 16-bit code unit or low surrogate (W2) */
     604    case 0xDC:
     605    case 0xDD:
     606    case 0xDE:
     607    case 0xDF:
     608      return BT_TRAIL;
     609    case 0xFF:
     610      switch ((unsigned char)lo) {
     611      case 0xFF: /* noncharacter-FFFF */
     612      case 0xFE: /* noncharacter-FFFE */
     613        return BT_NONXML;
     614      }
     615      break;
     616    }
     617    return BT_NONASCII;
     618  }
     619  
/* Generates the static function E##toUtf8, which converts 16-bit code
   units, read two bytes at a time through GET_LO/GET_HI, into UTF-8.

   - The input length is first rounded down to a whole number of 2-byte
     units.
   - A unit with high byte 0 and low byte below 0x80 produces 1 byte; other
     units with high byte 0x00-0x07 produce 2 bytes; a unit with high byte
     0xD8-0xDB starts a surrogate pair, consumes the following unit as well
     and produces 4 bytes; every other unit produces 3 bytes.
   - Each case checks the remaining output space before writing and returns
     XML_CONVERT_OUTPUT_EXHAUSTED with *fromP left at the unit that did not
     fit.  A surrogate pair whose second unit is missing returns
     XML_CONVERT_INPUT_INCOMPLETE (checked after the output space).
   - *toP is advanced in place; *fromP is stored on every return path.

   GET_LO and GET_HI must be defined where the macro is expanded.
   NOTE(review): presumably they select the byte order -- confirm at the
   expansion sites, which are outside this view. */
#define DEFINE_UTF16_TO_UTF8(E)                                                \
  static enum XML_Convert_Result PTRCALL E##toUtf8(                            \
      const ENCODING *enc, const char **fromP, const char *fromLim,            \
      char **toP, const char *toLim) {                                         \
    const char *from = *fromP;                                                 \
    UNUSED_P(enc);                                                             \
    fromLim = from + (((fromLim - from) >> 1) << 1); /* shrink to even */      \
    for (; from < fromLim; from += 2) {                                        \
      int plane;                                                               \
      unsigned char lo2;                                                       \
      unsigned char lo = GET_LO(from);                                         \
      unsigned char hi = GET_HI(from);                                         \
      switch (hi) {                                                            \
      case 0:                                                                  \
        if (lo < 0x80) {                                                       \
          if (*toP == toLim) {                                                 \
            *fromP = from;                                                     \
            return XML_CONVERT_OUTPUT_EXHAUSTED;                               \
          }                                                                    \
          *(*toP)++ = lo;                                                      \
          break;                                                               \
        }                                                                      \
        /* fall through */                                                     \
      case 0x1:                                                                \
      case 0x2:                                                                \
      case 0x3:                                                                \
      case 0x4:                                                                \
      case 0x5:                                                                \
      case 0x6:                                                                \
      case 0x7:                                                                \
        if (toLim - *toP < 2) {                                                \
          *fromP = from;                                                       \
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
        }                                                                      \
        *(*toP)++ = ((lo >> 6) | (hi << 2) | UTF8_cval2);                      \
        *(*toP)++ = ((lo & 0x3f) | 0x80);                                      \
        break;                                                                 \
      default:                                                                 \
        if (toLim - *toP < 3) {                                                \
          *fromP = from;                                                       \
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
        }                                                                      \
        /* 16 bits divided 4, 6, 6 amongst 3 bytes */                          \
        *(*toP)++ = ((hi >> 4) | UTF8_cval3);                                  \
        *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80);                    \
        *(*toP)++ = ((lo & 0x3f) | 0x80);                                      \
        break;                                                                 \
      case 0xD8:                                                               \
      case 0xD9:                                                               \
      case 0xDA:                                                               \
      case 0xDB:                                                               \
        if (toLim - *toP < 4) {                                                \
          *fromP = from;                                                       \
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
        }                                                                      \
        if (fromLim - from < 4) {                                              \
          *fromP = from;                                                       \
          return XML_CONVERT_INPUT_INCOMPLETE;                                 \
        }                                                                      \
        plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1;                   \
        *(*toP)++ = (char)((plane >> 2) | UTF8_cval4);                         \
        *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80);         \
        from += 2;                                                             \
        lo2 = GET_LO(from);                                                    \
        *(*toP)++ = (((lo & 0x3) << 4) | ((GET_HI(from) & 0x3) << 2)           \
                     | (lo2 >> 6) | 0x80);                                     \
        *(*toP)++ = ((lo2 & 0x3f) | 0x80);                                     \
        break;                                                                 \
      }                                                                        \
    }                                                                          \
    *fromP = from;                                                             \
    if (from < fromLim)                                                        \
      return XML_CONVERT_INPUT_INCOMPLETE;                                     \
    else                                                                       \
      return XML_CONVERT_COMPLETED;                                            \
  }
     696  
/* Expands to a static function E##toUtf16 that copies 2-byte units from
   *fromP into unsigned shorts at *toP, assembling each value with whichever
   GET_HI/GET_LO definitions are active at the expansion site.

   The input length is first rounded down to a whole number of 2-byte units.
   If the input does not fit into the output and the unit just before the
   (even) input limit has a high byte in 0xD8..0xDF, that unit is left
   unconsumed and XML_CONVERT_INPUT_INCOMPLETE is reported in place of
   XML_CONVERT_COMPLETED.  XML_CONVERT_OUTPUT_EXHAUSTED is returned when the
   output is full while input remains.  Both *fromP and *toP are advanced
   past what was copied. */
#define DEFINE_UTF16_TO_UTF16(E)                                               \
  static enum XML_Convert_Result PTRCALL E##toUtf16(                           \
      const ENCODING *enc, const char **fromP, const char *fromLim,            \
      unsigned short **toP, const unsigned short *toLim) {                     \
    enum XML_Convert_Result res = XML_CONVERT_COMPLETED;                       \
    UNUSED_P(enc);                                                             \
    fromLim = *fromP + (((fromLim - *fromP) >> 1) << 1); /* shrink to even */  \
    /* Avoid copying first half only of surrogate */                           \
    if (fromLim - *fromP > ((toLim - *toP) << 1)                               \
        && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) {                             \
      fromLim -= 2;                                                            \
      res = XML_CONVERT_INPUT_INCOMPLETE;                                      \
    }                                                                          \
    for (; *fromP < fromLim && *toP < toLim; *fromP += 2)                      \
      *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP);                      \
    if ((*toP == toLim) && (*fromP < fromLim))                                 \
      return XML_CONVERT_OUTPUT_EXHAUSTED;                                     \
    else                                                                       \
      return res;                                                              \
  }
     717  
/* Byte accessors for little-endian 2-byte units: byte 0 is the low byte,
   byte 1 the high byte.  They are defined only while the little2_
   converters are expanded and are removed again right after. */
#define SET2(ptr, ch) (((ptr)[0] = ((ch)&0xff)), ((ptr)[1] = ((ch) >> 8)))
#define GET_LO(ptr) ((unsigned char)(ptr)[0])
#define GET_HI(ptr) ((unsigned char)(ptr)[1])

DEFINE_UTF16_TO_UTF8(little2_)
DEFINE_UTF16_TO_UTF16(little2_)

#undef SET2
#undef GET_LO
#undef GET_HI

/* Same accessors for big-endian units: byte 0 is the high byte, byte 1 the
   low byte.  Used to expand the big2_ converters. */
#define SET2(ptr, ch) (((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch)&0xFF)))
#define GET_LO(ptr) ((unsigned char)(ptr)[1])
#define GET_HI(ptr) ((unsigned char)(ptr)[0])

DEFINE_UTF16_TO_UTF8(big2_)
DEFINE_UTF16_TO_UTF16(big2_)

#undef SET2
#undef GET_LO
#undef GET_HI
     739  
     740  #define LITTLE2_BYTE_TYPE(enc, p)                                              \
     741    ((p)[1] == 0 ? ((struct normal_encoding *)(enc))->type[(unsigned char)*(p)]  \
     742                 : unicode_byte_type((p)[1], (p)[0]))
     743  #define LITTLE2_BYTE_TO_ASCII(p) ((p)[1] == 0 ? (p)[0] : -1)
     744  #define LITTLE2_CHAR_MATCHES(p, c) ((p)[1] == 0 && (p)[0] == (c))
     745  #define LITTLE2_IS_NAME_CHAR_MINBPC(p)                                         \
     746    UCS2_GET_NAMING(namePages, (unsigned char)p[1], (unsigned char)p[0])
     747  #define LITTLE2_IS_NMSTRT_CHAR_MINBPC(p)                                       \
     748    UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[1], (unsigned char)p[0])
     749  
#ifdef XML_MIN_SIZE

/* XML_MIN_SIZE build: the LITTLE2_* macros are wrapped in small functions
   instead of being expanded into a dedicated copy of xmltok_impl.c (which is
   what the #else branch below does). */

/* Byte type of the little-endian 2-byte unit at p. */
static int PTRFASTCALL
little2_byteType(const ENCODING *enc, const char *p) {
  return LITTLE2_BYTE_TYPE(enc, p);
}

/* Low byte of the unit at p when its high byte is zero, -1 otherwise. */
static int PTRFASTCALL
little2_byteToAscii(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return LITTLE2_BYTE_TO_ASCII(p);
}

/* Non-zero when the unit at p has a zero high byte and low byte equal to c. */
static int PTRCALL
little2_charMatches(const ENCODING *enc, const char *p, int c) {
  UNUSED_P(enc);
  return LITTLE2_CHAR_MATCHES(p, c);
}

/* Name-character lookup for the unit at p. */
static int PTRFASTCALL
little2_isNameMin(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return LITTLE2_IS_NAME_CHAR_MINBPC(p);
}

/* Name-start-character lookup for the unit at p. */
static int PTRFASTCALL
little2_isNmstrtMin(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return LITTLE2_IS_NMSTRT_CHAR_MINBPC(p);
}

/* From here on VTABLE expands with the little2_ converters. */
#  undef VTABLE
#  define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16

#else /* not XML_MIN_SIZE */

/* Instantiate xmltok_impl.c with the little2_ prefix, a minimum of 2 bytes
   per character and the LITTLE2_* classification macros. */
#  undef PREFIX
#  define PREFIX(ident) little2_##ident
#  define MINBPC(enc) 2
/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
#  define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p)
#  define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(p)
#  define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(p, c)
/* The n-byte name checks are constant 0 for this instantiation; only the
   MINBPC variants do real lookups. */
#  define IS_NAME_CHAR(enc, p, n) 0
#  define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(p)
#  define IS_NMSTRT_CHAR(enc, p, n) (0)
#  define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(p)

#  define XML_TOK_IMPL_C
#  include "xmltok_impl.c"
#  undef XML_TOK_IMPL_C

/* Remove the per-instantiation macros again.
   NOTE(review): IS_INVALID_CHAR is not defined in this section; the #undef
   is presumably defensive cleanup -- confirm. */
#  undef MINBPC
#  undef BYTE_TYPE
#  undef BYTE_TO_ASCII
#  undef CHAR_MATCHES
#  undef IS_NAME_CHAR
#  undef IS_NAME_CHAR_MINBPC
#  undef IS_NMSTRT_CHAR
#  undef IS_NMSTRT_CHAR_MINBPC
#  undef IS_INVALID_CHAR

#endif /* not XML_MIN_SIZE */
     813  
#ifdef XML_NS

/* Little-endian 2-byte encoding table, built only when XML_NS is defined.
   The byte-type table is asciitab.h followed by latin1tab.h.  The last
   scalar initializer is 1 when BYTEORDER is 1234 and 0 otherwise.
   NOTE(review): presumably that value is the isUtf16 flag and the preceding
   2 is minBytesPerChar -- confirm against the ENCODING struct definition. */
static const struct normal_encoding little2_encoding_ns
    = {{VTABLE, 2, 0,
#  if BYTEORDER == 1234
        1
#  else
        0
#  endif
       },
       {
#  include "asciitab.h"
#  include "latin1tab.h"
       },
       STANDARD_VTABLE(little2_) NULL_VTABLE};

#endif
     831  
/* Little-endian 2-byte encoding table.  Same layout as the _ns variant, but
   asciitab.h is included with BT_COLON temporarily defined as BT_NMSTRT.
   NOTE(review): presumably asciitab.h uses BT_COLON for ':' so that the
   colon counts as a name-start character here -- confirm in asciitab.h. */
static const struct normal_encoding little2_encoding
    = {{VTABLE, 2, 0,
#if BYTEORDER == 1234
        1
#else
        0
#endif
       },
       {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "latin1tab.h"
       },
       STANDARD_VTABLE(little2_) NULL_VTABLE};
     847  
/* Internal little-endian tables, compiled unless BYTEORDER is 4321.  They
   differ from the tables above in using iasciitab.h instead of asciitab.h
   and in always passing 1 as the last scalar initializer. */
#if BYTEORDER != 4321

#  ifdef XML_NS

static const struct normal_encoding internal_little2_encoding_ns
    = {{VTABLE, 2, 0, 1},
       {
#    include "iasciitab.h"
#    include "latin1tab.h"
       },
       STANDARD_VTABLE(little2_) NULL_VTABLE};

#  endif

/* Variant with BT_COLON defined as BT_NMSTRT while iasciitab.h is included. */
static const struct normal_encoding internal_little2_encoding
    = {{VTABLE, 2, 0, 1},
       {
#  define BT_COLON BT_NMSTRT
#  include "iasciitab.h"
#  undef BT_COLON
#  include "latin1tab.h"
       },
       STANDARD_VTABLE(little2_) NULL_VTABLE};

#endif
     873  
     874  #define BIG2_BYTE_TYPE(enc, p)                                                 \
     875    ((p)[0] == 0                                                                 \
     876         ? ((struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]]        \
     877         : unicode_byte_type((p)[0], (p)[1]))
     878  #define BIG2_BYTE_TO_ASCII(p) ((p)[0] == 0 ? (p)[1] : -1)
     879  #define BIG2_CHAR_MATCHES(p, c) ((p)[0] == 0 && (p)[1] == (c))
     880  #define BIG2_IS_NAME_CHAR_MINBPC(p)                                            \
     881    UCS2_GET_NAMING(namePages, (unsigned char)p[0], (unsigned char)p[1])
     882  #define BIG2_IS_NMSTRT_CHAR_MINBPC(p)                                          \
     883    UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[0], (unsigned char)p[1])
     884  
#ifdef XML_MIN_SIZE

/* XML_MIN_SIZE build: the BIG2_* macros are wrapped in small functions
   instead of being expanded into a dedicated copy of xmltok_impl.c (which is
   what the #else branch below does). */

/* Byte type of the big-endian 2-byte unit at p. */
static int PTRFASTCALL
big2_byteType(const ENCODING *enc, const char *p) {
  return BIG2_BYTE_TYPE(enc, p);
}

/* Low byte of the unit at p when its high byte is zero, -1 otherwise. */
static int PTRFASTCALL
big2_byteToAscii(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return BIG2_BYTE_TO_ASCII(p);
}

/* Non-zero when the unit at p has a zero high byte and low byte equal to c. */
static int PTRCALL
big2_charMatches(const ENCODING *enc, const char *p, int c) {
  UNUSED_P(enc);
  return BIG2_CHAR_MATCHES(p, c);
}

/* Name-character lookup for the unit at p. */
static int PTRFASTCALL
big2_isNameMin(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return BIG2_IS_NAME_CHAR_MINBPC(p);
}

/* Name-start-character lookup for the unit at p. */
static int PTRFASTCALL
big2_isNmstrtMin(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return BIG2_IS_NMSTRT_CHAR_MINBPC(p);
}

/* From here on VTABLE expands with the big2_ converters. */
#  undef VTABLE
#  define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16

#else /* not XML_MIN_SIZE */

/* Instantiate xmltok_impl.c with the big2_ prefix, a minimum of 2 bytes per
   character and the BIG2_* classification macros. */
#  undef PREFIX
#  define PREFIX(ident) big2_##ident
#  define MINBPC(enc) 2
/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
#  define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p)
#  define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(p)
#  define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(p, c)
/* The n-byte name checks are constant 0 for this instantiation; only the
   MINBPC variants do real lookups. */
#  define IS_NAME_CHAR(enc, p, n) 0
#  define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(p)
#  define IS_NMSTRT_CHAR(enc, p, n) (0)
#  define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(p)

#  define XML_TOK_IMPL_C
#  include "xmltok_impl.c"
#  undef XML_TOK_IMPL_C

/* Remove the per-instantiation macros again.
   NOTE(review): IS_INVALID_CHAR is not defined in this section; the #undef
   is presumably defensive cleanup -- confirm. */
#  undef MINBPC
#  undef BYTE_TYPE
#  undef BYTE_TO_ASCII
#  undef CHAR_MATCHES
#  undef IS_NAME_CHAR
#  undef IS_NAME_CHAR_MINBPC
#  undef IS_NMSTRT_CHAR
#  undef IS_NMSTRT_CHAR_MINBPC
#  undef IS_INVALID_CHAR

#endif /* not XML_MIN_SIZE */
     948  
#ifdef XML_NS

/* Big-endian 2-byte encoding table, built only when XML_NS is defined.  The
   byte-type table is asciitab.h followed by latin1tab.h.  The last scalar
   initializer is 1 when BYTEORDER is 4321 and 0 otherwise.
   NOTE(review): presumably that value is the isUtf16 flag and the preceding
   2 is minBytesPerChar -- confirm against the ENCODING struct definition. */
static const struct normal_encoding big2_encoding_ns
    = {{VTABLE, 2, 0,
#  if BYTEORDER == 4321
        1
#  else
        0
#  endif
       },
       {
#  include "asciitab.h"
#  include "latin1tab.h"
       },
       STANDARD_VTABLE(big2_) NULL_VTABLE};

#endif
     966  
/* Big-endian 2-byte encoding table.  Same layout as the _ns variant, but
   asciitab.h is included with BT_COLON temporarily defined as BT_NMSTRT.
   NOTE(review): presumably asciitab.h uses BT_COLON for ':' so that the
   colon counts as a name-start character here -- confirm in asciitab.h. */
static const struct normal_encoding big2_encoding
    = {{VTABLE, 2, 0,
#if BYTEORDER == 4321
        1
#else
        0
#endif
       },
       {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "latin1tab.h"
       },
       STANDARD_VTABLE(big2_) NULL_VTABLE};
     982  
/* Internal big-endian tables, compiled unless BYTEORDER is 1234.  They
   differ from the tables above in using iasciitab.h instead of asciitab.h
   and in always passing 1 as the last scalar initializer. */
#if BYTEORDER != 1234

#  ifdef XML_NS

static const struct normal_encoding internal_big2_encoding_ns
    = {{VTABLE, 2, 0, 1},
       {
#    include "iasciitab.h"
#    include "latin1tab.h"
       },
       STANDARD_VTABLE(big2_) NULL_VTABLE};

#  endif

/* Variant with BT_COLON defined as BT_NMSTRT while iasciitab.h is included. */
static const struct normal_encoding internal_big2_encoding
    = {{VTABLE, 2, 0, 1},
       {
#  define BT_COLON BT_NMSTRT
#  include "iasciitab.h"
#  undef BT_COLON
#  include "latin1tab.h"
       },
       STANDARD_VTABLE(big2_) NULL_VTABLE};

#endif
    1008  
    1009  #undef PREFIX
    1010  
    1011  static int FASTCALL
    1012  streqci(const char *s1, const char *s2) {
    1013    for (;;) {
    1014      char c1 = *s1++;
    1015      char c2 = *s2++;
    1016      if (ASCII_a <= c1 && c1 <= ASCII_z)
    1017        c1 += ASCII_A - ASCII_a;
    1018      if (ASCII_a <= c2 && c2 <= ASCII_z)
    1019        /* The following line will never get executed.  streqci() is
    1020         * only called from two places, both of which guarantee to put
    1021         * upper-case strings into s2.
    1022         */
    1023        c2 += ASCII_A - ASCII_a; /* LCOV_EXCL_LINE */
    1024      if (c1 != c2)
    1025        return 0;
    1026      if (! c1)
    1027        break;
    1028    }
    1029    return 1;
    1030  }
    1031  
/* Update *pos for the bytes in [ptr, end).  The enc argument is ignored; the
   work is delegated to normal_updatePosition() using utf8_encoding. */
static void PTRCALL
initUpdatePosition(const ENCODING *enc, const char *ptr, const char *end,
                   POSITION *pos) {
  UNUSED_P(enc);
  normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
}
    1038  
    1039  static int
    1040  toAscii(const ENCODING *enc, const char *ptr, const char *end) {
    1041    char buf[1];
    1042    char *p = buf;
    1043    XmlUtf8Convert(enc, &ptr, end, &p, p + 1);
    1044    if (p == buf)
    1045      return -1;
    1046    else
    1047      return buf[0];
    1048  }
    1049  
    1050  static int FASTCALL
    1051  isSpace(int c) {
    1052    switch (c) {
    1053    case 0x20:
    1054    case 0xD:
    1055    case 0xA:
    1056    case 0x9:
    1057      return 1;
    1058    }
    1059    return 0;
    1060  }
    1061  
/* Parse one pseudo-attribute (S name S? '=' S? quoted-value) from the range
   [ptr, end), stepping enc->minBytesPerChar bytes at a time.

   Return 1 if there's just optional white space or there's an S
   followed by name=val.

   - Empty or white-space-only input: returns 1 with *namePtr set to NULL;
     *nextTokPtr is not written in this case.
   - Successful parse: returns 1 with *namePtr / *nameEndPtr delimiting the
     name, *valPtr pointing just past the opening quote, and *nextTokPtr
     pointing just past the closing quote.
   - Anything else: returns 0 with *nextTokPtr at the offending position.
*/
static int
parsePseudoAttribute(const ENCODING *enc, const char *ptr, const char *end,
                     const char **namePtr, const char **nameEndPtr,
                     const char **valPtr, const char **nextTokPtr) {
  int c;
  char open;
  if (ptr == end) {
    *namePtr = NULL;
    return 1;
  }
  /* A pseudo-attribute must be preceded by at least one white-space char. */
  if (! isSpace(toAscii(enc, ptr, end))) {
    *nextTokPtr = ptr;
    return 0;
  }
  do {
    ptr += enc->minBytesPerChar;
  } while (isSpace(toAscii(enc, ptr, end)));
  if (ptr == end) {
    *namePtr = NULL;
    return 1;
  }
  *namePtr = ptr;
  /* Scan the name: it ends at '=' or at white space followed by '='. */
  for (;;) {
    c = toAscii(enc, ptr, end);
    if (c == -1) {
      /* toAscii produced no byte for this position. */
      *nextTokPtr = ptr;
      return 0;
    }
    if (c == ASCII_EQUALS) {
      *nameEndPtr = ptr;
      break;
    }
    if (isSpace(c)) {
      *nameEndPtr = ptr;
      do {
        ptr += enc->minBytesPerChar;
      } while (isSpace(c = toAscii(enc, ptr, end)));
      if (c != ASCII_EQUALS) {
        *nextTokPtr = ptr;
        return 0;
      }
      break;
    }
    ptr += enc->minBytesPerChar;
  }
  /* An empty name ('=' directly after the leading white space) is an error. */
  if (ptr == *namePtr) {
    *nextTokPtr = ptr;
    return 0;
  }
  /* Step over '=' and any white space before the opening quote. */
  ptr += enc->minBytesPerChar;
  c = toAscii(enc, ptr, end);
  while (isSpace(c)) {
    ptr += enc->minBytesPerChar;
    c = toAscii(enc, ptr, end);
  }
  if (c != ASCII_QUOT && c != ASCII_APOS) {
    *nextTokPtr = ptr;
    return 0;
  }
  open = (char)c;
  ptr += enc->minBytesPerChar;
  *valPtr = ptr;
  /* The value may contain only ASCII letters, digits, '.', '-' and '_', and
     must be closed by the same quote character that opened it. */
  for (;; ptr += enc->minBytesPerChar) {
    c = toAscii(enc, ptr, end);
    if (c == open)
      break;
    if (! (ASCII_a <= c && c <= ASCII_z) && ! (ASCII_A <= c && c <= ASCII_Z)
        && ! (ASCII_0 <= c && c <= ASCII_9) && c != ASCII_PERIOD
        && c != ASCII_MINUS && c != ASCII_UNDERSCORE) {
      *nextTokPtr = ptr;
      return 0;
    }
  }
  *nextTokPtr = ptr + enc->minBytesPerChar;
  return 1;
}
    1141  
/* Keyword strings spelled out with ASCII_* constants and NUL-terminated.
   doParseXmlDecl() compares pseudo-attribute names and the standalone value
   against them with XmlNameMatchesAscii(). */

/* "version" */
static const char KW_version[]
    = {ASCII_v, ASCII_e, ASCII_r, ASCII_s, ASCII_i, ASCII_o, ASCII_n, '\0'};

/* "encoding" */
static const char KW_encoding[] = {ASCII_e, ASCII_n, ASCII_c, ASCII_o, ASCII_d,
                                   ASCII_i, ASCII_n, ASCII_g, '\0'};

/* "standalone" */
static const char KW_standalone[]
    = {ASCII_s, ASCII_t, ASCII_a, ASCII_n, ASCII_d, ASCII_a,
       ASCII_l, ASCII_o, ASCII_n, ASCII_e, '\0'};

/* "yes" */
static const char KW_yes[] = {ASCII_y, ASCII_e, ASCII_s, '\0'};

/* "no" */
static const char KW_no[] = {ASCII_n, ASCII_o, '\0'};
    1155  
/* Parse the pseudo-attributes of a declaration spanning [ptr, end).

   encodingFinder       callback used to look up the encoding named by the
                        "encoding" pseudo-attribute
   isGeneralTextEntity  non-zero selects the text-declaration rules: the
                        version may be omitted, an encoding is mandatory and
                        "standalone" is rejected
   badPtr               receives the position of the error on failure
   versionPtr,
   versionEndPtr        optional; receive the start of the version value and
                        the position just past its closing quote
   encodingName         optional; receives the start of the encoding value
   encoding             optional; receives the encodingFinder() result
   standalone           optional; set to 1 for "yes" and 0 for "no"

   Returns 1 on success and 0 on failure (with *badPtr set).
*/
static int
doParseXmlDecl(const ENCODING *(*encodingFinder)(const ENCODING *, const char *,
                                                 const char *),
               int isGeneralTextEntity, const ENCODING *enc, const char *ptr,
               const char *end, const char **badPtr, const char **versionPtr,
               const char **versionEndPtr, const char **encodingName,
               const ENCODING **encoding, int *standalone) {
  const char *val = NULL;
  const char *name = NULL;
  const char *nameEnd = NULL;
  /* Skip five characters at the front and two at the back.
     NOTE(review): presumably the "<?xml" and "?>" delimiters -- confirm
     against the callers. */
  ptr += 5 * enc->minBytesPerChar;
  end -= 2 * enc->minBytesPerChar;
  /* At least one pseudo-attribute is required. */
  if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)
      || ! name) {
    *badPtr = ptr;
    return 0;
  }
  if (! XmlNameMatchesAscii(enc, name, nameEnd, KW_version)) {
    /* Only a text declaration may start with something other than
       "version"; in that case the same name is examined again below. */
    if (! isGeneralTextEntity) {
      *badPtr = name;
      return 0;
    }
  } else {
    if (versionPtr)
      *versionPtr = val;
    if (versionEndPtr)
      *versionEndPtr = ptr;
    if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
      *badPtr = ptr;
      return 0;
    }
    if (! name) {
      if (isGeneralTextEntity) {
        /* a TextDecl must have an EncodingDecl */
        *badPtr = ptr;
        return 0;
      }
      return 1;
    }
  }
  if (XmlNameMatchesAscii(enc, name, nameEnd, KW_encoding)) {
    /* The encoding name must begin with an ASCII letter. */
    int c = toAscii(enc, val, end);
    if (! (ASCII_a <= c && c <= ASCII_z) && ! (ASCII_A <= c && c <= ASCII_Z)) {
      *badPtr = val;
      return 0;
    }
    if (encodingName)
      *encodingName = val;
    /* ptr is just past the closing quote, so stepping back one character
       gives the end of the value itself. */
    if (encoding)
      *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar);
    if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
      *badPtr = ptr;
      return 0;
    }
    if (! name)
      return 1;
  }
  /* Whatever remains must be "standalone", which a text declaration does
     not allow. */
  if (! XmlNameMatchesAscii(enc, name, nameEnd, KW_standalone)
      || isGeneralTextEntity) {
    *badPtr = name;
    return 0;
  }
  if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_yes)) {
    if (standalone)
      *standalone = 1;
  } else if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_no)) {
    if (standalone)
      *standalone = 0;
  } else {
    *badPtr = val;
    return 0;
  }
  /* Only white space may follow the standalone pseudo-attribute. */
  while (isSpace(toAscii(enc, ptr, end)))
    ptr += enc->minBytesPerChar;
  if (ptr != end) {
    *badPtr = ptr;
    return 0;
  }
  return 1;
}
    1236  
    1237  static int FASTCALL
    1238  checkCharRefNumber(int result) {
    1239    switch (result >> 8) {
    1240    case 0xD8:
    1241    case 0xD9:
    1242    case 0xDA:
    1243    case 0xDB:
    1244    case 0xDC:
    1245    case 0xDD:
    1246    case 0xDE:
    1247    case 0xDF:
    1248      return -1;
    1249    case 0:
    1250      if (latin1_encoding.type[result] == BT_NONXML)
    1251        return -1;
    1252      break;
    1253    case 0xFF:
    1254      if (result == 0xFFFE || result == 0xFFFF)
    1255        return -1;
    1256      break;
    1257    }
    1258    return result;
    1259  }
    1260  
    1261  int FASTCALL
    1262  XmlUtf8Encode(int c, char *buf) {
    1263    enum {
    1264      /* minN is minimum legal resulting value for N byte sequence */
    1265      min2 = 0x80,
    1266      min3 = 0x800,
    1267      min4 = 0x10000
    1268    };
    1269  
    1270    if (c < 0)
    1271      return 0; /* LCOV_EXCL_LINE: this case is always eliminated beforehand */
    1272    if (c < min2) {
    1273      buf[0] = (char)(c | UTF8_cval1);
    1274      return 1;
    1275    }
    1276    if (c < min3) {
    1277      buf[0] = (char)((c >> 6) | UTF8_cval2);
    1278      buf[1] = (char)((c & 0x3f) | 0x80);
    1279      return 2;
    1280    }
    1281    if (c < min4) {
    1282      buf[0] = (char)((c >> 12) | UTF8_cval3);
    1283      buf[1] = (char)(((c >> 6) & 0x3f) | 0x80);
    1284      buf[2] = (char)((c & 0x3f) | 0x80);
    1285      return 3;
    1286    }
    1287    if (c < 0x110000) {
    1288      buf[0] = (char)((c >> 18) | UTF8_cval4);
    1289      buf[1] = (char)(((c >> 12) & 0x3f) | 0x80);
    1290      buf[2] = (char)(((c >> 6) & 0x3f) | 0x80);
    1291      buf[3] = (char)((c & 0x3f) | 0x80);
    1292      return 4;
    1293    }
    1294    return 0; /* LCOV_EXCL_LINE: this case too is eliminated before calling */
    1295  }
    1296  
    1297  int FASTCALL
    1298  XmlUtf16Encode(int charNum, unsigned short *buf) {
    1299    if (charNum < 0)
    1300      return 0;
    1301    if (charNum < 0x10000) {
    1302      buf[0] = (unsigned short)charNum;
    1303      return 1;
    1304    }
    1305    if (charNum < 0x110000) {
    1306      charNum -= 0x10000;
    1307      buf[0] = (unsigned short)((charNum >> 10) + 0xD800);
    1308      buf[1] = (unsigned short)((charNum & 0x3FF) + 0xDC00);
    1309      return 2;
    1310    }
    1311    return 0;
    1312  }
    1313  
/* State of a caller-supplied single/multi-byte encoding, filled in by
   XmlInitUnknownEncoding(). */
struct unknown_encoding {
  /* Starts as a copy of latin1_encoding; type table and converters are then
     overridden. */
  struct normal_encoding normal;
  /* Called as convert(userData, p) to obtain the character number of a
     multi-byte sequence starting at p. */
  CONVERTER convert;
  /* Opaque pointer handed back to convert. */
  void *userData;
  /* Per input byte: UTF-16 value; 0 means "ask convert". */
  unsigned short utf16[256];
  /* Per input byte: utf8[i][0] is the UTF-8 length (0 means "ask convert"),
     followed by the UTF-8 bytes themselves. */
  char utf8[256][4];
};

/* View an ENCODING pointer as the unknown_encoding that contains it. */
#define AS_UNKNOWN_ENCODING(enc) ((const struct unknown_encoding *)(enc))
    1323  
/* Return the size in bytes of struct unknown_encoding, i.e. of the memory
   block that XmlInitUnknownEncoding() treats its mem argument as. */
int
XmlSizeOfUnknownEncoding(void) {
  return sizeof(struct unknown_encoding);
}
    1328  
    1329  static int PTRFASTCALL
    1330  unknown_isName(const ENCODING *enc, const char *p) {
    1331    const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
    1332    int c = uenc->convert(uenc->userData, p);
    1333    if (c & ~0xFFFF)
    1334      return 0;
    1335    return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF);
    1336  }
    1337  
    1338  static int PTRFASTCALL
    1339  unknown_isNmstrt(const ENCODING *enc, const char *p) {
    1340    const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
    1341    int c = uenc->convert(uenc->userData, p);
    1342    if (c & ~0xFFFF)
    1343      return 0;
    1344    return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF);
    1345  }
    1346  
    1347  static int PTRFASTCALL
    1348  unknown_isInvalid(const ENCODING *enc, const char *p) {
    1349    const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
    1350    int c = uenc->convert(uenc->userData, p);
    1351    return (c & ~0xFFFF) || checkCharRefNumber(c) < 0;
    1352  }
    1353  
    1354  static enum XML_Convert_Result PTRCALL
    1355  unknown_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
    1356                 char **toP, const char *toLim) {
    1357    const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
    1358    char buf[XML_UTF8_ENCODE_MAX];
    1359    for (;;) {
    1360      const char *utf8;
    1361      int n;
    1362      if (*fromP == fromLim)
    1363        return XML_CONVERT_COMPLETED;
    1364      utf8 = uenc->utf8[(unsigned char)**fromP];
    1365      n = *utf8++;
    1366      if (n == 0) {
    1367        int c = uenc->convert(uenc->userData, *fromP);
    1368        n = XmlUtf8Encode(c, buf);
    1369        if (n > toLim - *toP)
    1370          return XML_CONVERT_OUTPUT_EXHAUSTED;
    1371        utf8 = buf;
    1372        *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
    1373                   - (BT_LEAD2 - 2));
    1374      } else {
    1375        if (n > toLim - *toP)
    1376          return XML_CONVERT_OUTPUT_EXHAUSTED;
    1377        (*fromP)++;
    1378      }
    1379      memcpy(*toP, utf8, n);
    1380      *toP += n;
    1381    }
    1382  }
    1383  
    1384  static enum XML_Convert_Result PTRCALL
    1385  unknown_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim,
    1386                  unsigned short **toP, const unsigned short *toLim) {
    1387    const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
    1388    while (*fromP < fromLim && *toP < toLim) {
    1389      unsigned short c = uenc->utf16[(unsigned char)**fromP];
    1390      if (c == 0) {
    1391        c = (unsigned short)uenc->convert(uenc->userData, *fromP);
    1392        *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
    1393                   - (BT_LEAD2 - 2));
    1394      } else
    1395        (*fromP)++;
    1396      *(*toP)++ = c;
    1397    }
    1398  
    1399    if ((*toP == toLim) && (*fromP < fromLim))
    1400      return XML_CONVERT_OUTPUT_EXHAUSTED;
    1401    else
    1402      return XML_CONVERT_COMPLETED;
    1403  }
    1404  
/* Build a user-defined encoding in the memory block mem (treated as a
   struct unknown_encoding) from a 256-entry byte table.

   table[i] describes input byte i:
     >= 0        the character number that byte maps to
     -1          the byte is malformed
     -2 .. -4    the byte starts a multi-byte sequence of 2 .. 4 bytes, which
                 requires a non-NULL convert callback
     < -4        rejected

   convert and userData are stored for use with multi-byte sequences.

   Returns a pointer to the embedded ENCODING on success and 0 on failure.
   Note that mem is overwritten with a copy of latin1_encoding before any
   validation takes place, so it may be partially initialized on failure.
*/
ENCODING *
XmlInitUnknownEncoding(void *mem, int *table, CONVERTER convert,
                       void *userData) {
  int i;
  struct unknown_encoding *e = (struct unknown_encoding *)mem;
  memcpy(mem, &latin1_encoding, sizeof(struct normal_encoding));
  /* In the lower half, every byte whose latin1 type is neither BT_OTHER nor
     BT_NONXML must map to itself. */
  for (i = 0; i < 128; i++)
    if (latin1_encoding.type[i] != BT_OTHER
        && latin1_encoding.type[i] != BT_NONXML && table[i] != i)
      return 0;
  for (i = 0; i < 256; i++) {
    int c = table[i];
    if (c == -1) {
      e->normal.type[i] = BT_MALFORM;
      /* This shouldn't really get used. */
      e->utf16[i] = 0xFFFF;
      e->utf8[i][0] = 1;
      e->utf8[i][1] = 0;
    } else if (c < 0) {
      if (c < -4)
        return 0;
      /* Multi-byte sequences need a converter function */
      if (! convert)
        return 0;
      /* -2 yields BT_LEAD2, -3 and -4 the next two type values; zero table
         entries tell the converters to call convert. */
      e->normal.type[i] = (unsigned char)(BT_LEAD2 - (c + 2));
      e->utf8[i][0] = 0;
      e->utf16[i] = 0;
    } else if (c < 0x80) {
      /* A byte may only map to a significant 7-bit character if it is that
         character. */
      if (latin1_encoding.type[c] != BT_OTHER
          && latin1_encoding.type[c] != BT_NONXML && c != i)
        return 0;
      e->normal.type[i] = latin1_encoding.type[c];
      e->utf8[i][0] = 1;
      e->utf8[i][1] = (char)c;
      /* 0 is reserved in utf16[] to mean "call convert", so a mapping to
         character 0 is stored as 0xFFFF. */
      e->utf16[i] = (unsigned short)(c == 0 ? 0xFFFF : c);
    } else if (checkCharRefNumber(c) < 0) {
      e->normal.type[i] = BT_NONXML;
      /* This shouldn't really get used. */
      e->utf16[i] = 0xFFFF;
      e->utf8[i][0] = 1;
      e->utf8[i][1] = 0;
    } else {
      /* Single bytes can only map into the 16-bit range. */
      if (c > 0xFFFF)
        return 0;
      if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff))
        e->normal.type[i] = BT_NMSTRT;
      else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff))
        e->normal.type[i] = BT_NAME;
      else
        e->normal.type[i] = BT_OTHER;
      /* Length goes in slot 0, the encoded bytes right after it. */
      e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1);
      e->utf16[i] = (unsigned short)c;
    }
  }
  e->userData = userData;
  e->convert = convert;
  if (convert) {
    /* All multi-byte lengths share the converter-based classifiers. */
    e->normal.isName2 = unknown_isName;
    e->normal.isName3 = unknown_isName;
    e->normal.isName4 = unknown_isName;
    e->normal.isNmstrt2 = unknown_isNmstrt;
    e->normal.isNmstrt3 = unknown_isNmstrt;
    e->normal.isNmstrt4 = unknown_isNmstrt;
    e->normal.isInvalid2 = unknown_isInvalid;
    e->normal.isInvalid3 = unknown_isInvalid;
    e->normal.isInvalid4 = unknown_isInvalid;
  }
  e->normal.enc.utf8Convert = unknown_toUtf8;
  e->normal.enc.utf16Convert = unknown_toUtf16;
  return &(e->normal.enc);
}
    1476  
/* Indices of the built-in encodings.

   If this enumeration is changed, getEncodingIndex and encodings
   must also be changed. */
enum {
  /* returned by getEncodingIndex() when no known name matches */
  UNKNOWN_ENC = -1,
  ISO_8859_1_ENC = 0,
  US_ASCII_ENC,
  UTF_8_ENC,
  UTF_16_ENC,
  UTF_16BE_ENC,
  UTF_16LE_ENC,
  /* must match encodingNames up to here */
  /* returned by getEncodingIndex() when no name was given (NULL) */
  NO_ENC
};
    1490  
/* Upper-case names of the built-in encodings, spelled out with ASCII_*
   constants and NUL-terminated.  getEncodingIndex() matches a requested name
   against them with streqci(). */

/* "ISO-8859-1" */
static const char KW_ISO_8859_1[]
    = {ASCII_I, ASCII_S, ASCII_O,     ASCII_MINUS, ASCII_8, ASCII_8,
       ASCII_5, ASCII_9, ASCII_MINUS, ASCII_1,     '\0'};
/* "US-ASCII" */
static const char KW_US_ASCII[]
    = {ASCII_U, ASCII_S, ASCII_MINUS, ASCII_A, ASCII_S,
       ASCII_C, ASCII_I, ASCII_I,     '\0'};
/* "UTF-8" */
static const char KW_UTF_8[]
    = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_8, '\0'};
/* "UTF-16" */
static const char KW_UTF_16[]
    = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, '\0'};
/* "UTF-16BE" */
static const char KW_UTF_16BE[]
    = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1,
       ASCII_6, ASCII_B, ASCII_E, '\0'};
/* "UTF-16LE" */
static const char KW_UTF_16LE[]
    = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1,
       ASCII_6, ASCII_L, ASCII_E, '\0'};
    1507  
    1508  static int FASTCALL
    1509  getEncodingIndex(const char *name) {
    1510    static const char *const encodingNames[] = {
    1511        KW_ISO_8859_1, KW_US_ASCII, KW_UTF_8, KW_UTF_16, KW_UTF_16BE, KW_UTF_16LE,
    1512    };
    1513    int i;
    1514    if (name == NULL)
    1515      return NO_ENC;
    1516    for (i = 0; i < (int)(sizeof(encodingNames) / sizeof(encodingNames[0])); i++)
    1517      if (streqci(name, encodingNames[i]))
    1518        return i;
    1519    return UNKNOWN_ENC;
    1520  }
    1521  
    /* For binary compatibility, we store the index of the encoding
       specified at initialization in the isUtf16 member.

       INIT_ENC_INDEX(enc) reads that index back as an int.
       SET_INIT_ENC_INDEX(enc, i) stores i, converted to char.  Both
       arguments are fully parenthesized so that any expression may be
       passed for i (e.g. "x >> 8") and is converted as a whole.
    */

    #define INIT_ENC_INDEX(enc) ((int)(enc)->initEnc.isUtf16)
    #define SET_INIT_ENC_INDEX(enc, i) ((enc)->initEnc.isUtf16 = (char)(i))
    1528  
    1529  /* This is what detects the encoding.  encodingTable maps from
    1530     encoding indices to encodings; INIT_ENC_INDEX(enc) is the index of
    1531     the external (protocol) specified encoding; state is
    1532     XML_CONTENT_STATE if we're parsing an external text entity, and
    1533     XML_PROLOG_STATE otherwise.
    1534  */
    1535  
    1536  static int
    1537  initScan(const ENCODING *const *encodingTable, const INIT_ENCODING *enc,
    1538           int state, const char *ptr, const char *end, const char **nextTokPtr) {
    1539    const ENCODING **encPtr;
    1540  
    1541    if (ptr >= end)
    1542      return XML_TOK_NONE;
    1543    encPtr = enc->encPtr;
    1544    if (ptr + 1 == end) {
    1545      /* only a single byte available for auto-detection */
    1546  #ifndef XML_DTD /* FIXME */
    1547      /* a well-formed document entity must have more than one byte */
    1548      if (state != XML_CONTENT_STATE)
    1549        return XML_TOK_PARTIAL;
    1550  #endif
    1551      /* so we're parsing an external text entity... */
    1552      /* if UTF-16 was externally specified, then we need at least 2 bytes */
    1553      switch (INIT_ENC_INDEX(enc)) {
    1554      case UTF_16_ENC:
    1555      case UTF_16LE_ENC:
    1556      case UTF_16BE_ENC:
    1557        return XML_TOK_PARTIAL;
    1558      }
    1559      switch ((unsigned char)*ptr) {
    1560      case 0xFE:
    1561      case 0xFF:
    1562      case 0xEF: /* possibly first byte of UTF-8 BOM */
    1563        if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE)
    1564          break;
    1565        /* fall through */
    1566      case 0x00:
    1567      case 0x3C:
    1568        return XML_TOK_PARTIAL;
    1569      }
    1570    } else {
    1571      switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) {
    1572      case 0xFEFF:
    1573        if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE)
    1574          break;
    1575        *nextTokPtr = ptr + 2;
    1576        *encPtr = encodingTable[UTF_16BE_ENC];
    1577        return XML_TOK_BOM;
    1578      /* 00 3C is handled in the default case */
    1579      case 0x3C00:
    1580        if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC
    1581             || INIT_ENC_INDEX(enc) == UTF_16_ENC)
    1582            && state == XML_CONTENT_STATE)
    1583          break;
    1584        *encPtr = encodingTable[UTF_16LE_ENC];
    1585        return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
    1586      case 0xFFFE:
    1587        if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE)
    1588          break;
    1589        *nextTokPtr = ptr + 2;
    1590        *encPtr = encodingTable[UTF_16LE_ENC];
    1591        return XML_TOK_BOM;
    1592      case 0xEFBB:
    1593        /* Maybe a UTF-8 BOM (EF BB BF) */
    1594        /* If there's an explicitly specified (external) encoding
    1595           of ISO-8859-1 or some flavour of UTF-16
    1596           and this is an external text entity,
    1597           don't look for the BOM,
    1598           because it might be a legal data.
    1599        */
    1600        if (state == XML_CONTENT_STATE) {
    1601          int e = INIT_ENC_INDEX(enc);
    1602          if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC || e == UTF_16LE_ENC
    1603              || e == UTF_16_ENC)
    1604            break;
    1605        }
    1606        if (ptr + 2 == end)
    1607          return XML_TOK_PARTIAL;
    1608        if ((unsigned char)ptr[2] == 0xBF) {
    1609          *nextTokPtr = ptr + 3;
    1610          *encPtr = encodingTable[UTF_8_ENC];
    1611          return XML_TOK_BOM;
    1612        }
    1613        break;
    1614      default:
    1615        if (ptr[0] == '\0') {
    1616          /* 0 isn't a legal data character. Furthermore a document
    1617             entity can only start with ASCII characters.  So the only
    1618             way this can fail to be big-endian UTF-16 if it it's an
    1619             external parsed general entity that's labelled as
    1620             UTF-16LE.
    1621          */
    1622          if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC)
    1623            break;
    1624          *encPtr = encodingTable[UTF_16BE_ENC];
    1625          return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
    1626        } else if (ptr[1] == '\0') {
    1627          /* We could recover here in the case:
    1628              - parsing an external entity
    1629              - second byte is 0
    1630              - no externally specified encoding
    1631              - no encoding declaration
    1632             by assuming UTF-16LE.  But we don't, because this would mean when
    1633             presented just with a single byte, we couldn't reliably determine
    1634             whether we needed further bytes.
    1635          */
    1636          if (state == XML_CONTENT_STATE)
    1637            break;
    1638          *encPtr = encodingTable[UTF_16LE_ENC];
    1639          return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
    1640        }
    1641        break;
    1642      }
    1643    }
    1644    *encPtr = encodingTable[INIT_ENC_INDEX(enc)];
    1645    return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
    1646  }
    1647  
    1648  #define NS(x) x
    1649  #define ns(x) x
    1650  #define XML_TOK_NS_C
    1651  #include "xmltok_ns.c"
    1652  #undef XML_TOK_NS_C
    1653  #undef NS
    1654  #undef ns
    1655  
    1656  #ifdef XML_NS
    1657  
    1658  #  define NS(x) x##NS
    1659  #  define ns(x) x##_ns
    1660  
    1661  #  define XML_TOK_NS_C
    1662  #  include "xmltok_ns.c"
    1663  #  undef XML_TOK_NS_C
    1664  
    1665  #  undef NS
    1666  #  undef ns
    1667  
    1668  ENCODING *
    1669  XmlInitUnknownEncodingNS(void *mem, int *table, CONVERTER convert,
    1670                           void *userData) {
    1671    ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData);
    1672    if (enc)
    1673      ((struct normal_encoding *)enc)->type[ASCII_COLON] = BT_COLON;
    1674    return enc;
    1675  }
    1676  
    1677  #endif /* XML_NS */