1  /* stringlib: codec implementations */
       2  
       3  #if !STRINGLIB_IS_UNICODE
       4  # error "codecs.h is specific to Unicode"
       5  #endif
       6  
       7  #include "pycore_bitutils.h"      // _Py_bswap32()
       8  
/* Mask to quickly check whether a C 'size_t' contains a
   non-ASCII, UTF8-encoded char.
   The mask has the 0x80 bit set in every byte, so ANDing it with a
   size_t-sized chunk of input gives a non-zero result if and only if at
   least one byte of the chunk is >= 0x80. */
#if (SIZEOF_SIZE_T == 8)
# define ASCII_CHAR_MASK 0x8080808080808080ULL
#elif (SIZEOF_SIZE_T == 4)
# define ASCII_CHAR_MASK 0x80808080U
#else
# error C 'size_t' size should be either 4 or 8!
#endif

/* True if ch has the bit pattern 10xxxxxx (0x80..0xBF), i.e. is a UTF-8
   continuation byte. The argument is evaluated twice. */
#define IS_CONTINUATION_BYTE(ch) ((ch) >= 0x80 && (ch) < 0xC0)
      21  
/* Decode UTF-8 bytes from *inptr up to (but excluding) end, storing the
   decoded characters into dest starting at index *outpos.

   Decoding stops at the first byte sequence that is invalid or whose code
   point cannot be stored in a STRINGLIB_CHAR. On return, *inptr and *outpos
   are updated to the positions reached, and the result tells the caller why
   decoding stopped:

     0      the loop ended: either all input was consumed, or the input ends
            in the middle of a so-far valid multi-byte sequence (*inptr is
            then left on the lead byte of that sequence, so *inptr < end)
     1      the byte at *inptr is not a valid start byte
     2,3,4  the byte at offset 1, 2 or 3 from *inptr is not acceptable as
            the 2nd, 3rd or 4th byte of the sequence starting at *inptr
     other  a validly decoded code point greater than STRINGLIB_MAX_CHAR;
            *inptr already points past its bytes and the code point has NOT
            been written to dest. Such a value is always > 0x7F, so it
            cannot be mistaken for one of the codes above.

   No bounds check is made on dest in this function.
   NOTE(review): presumably the caller sizes dest for the worst case of one
   output character per input byte -- confirm against the callers. */
Py_LOCAL_INLINE(Py_UCS4)
STRINGLIB(utf8_decode)(const char **inptr, const char *end,
                       STRINGLIB_CHAR *dest,
                       Py_ssize_t *outpos)
{
    Py_UCS4 ch;
    const char *s = *inptr;
    STRINGLIB_CHAR *p = dest + *outpos;

    while (s < end) {
        ch = (unsigned char)*s;

        if (ch < 0x80) {
            /* Fast path for runs of ASCII characters. Given that common UTF-8
               input will consist of an overwhelming majority of ASCII
               characters, we try to optimize for this case by checking
               as many characters as a C 'size_t' can contain.
               First, check if we can do an aligned read, as most CPUs have
               a penalty for unaligned reads.
            */
            if (_Py_IS_ALIGNED(s, ALIGNOF_SIZE_T)) {
                /* Help register allocation */
                const char *_s = s;
                STRINGLIB_CHAR *_p = p;
                while (_s + SIZEOF_SIZE_T <= end) {
                    /* Read a whole size_t at a time (either 4 or 8 bytes),
                       and do a fast unrolled copy if it only contains ASCII
                       characters. */
                    size_t value = *(const size_t *) _s;
                    if (value & ASCII_CHAR_MASK)
                        break;
                    /* The byte at the lowest address must go to _p[0]; which
                       bits of 'value' hold it depends on the byte order. */
#if PY_LITTLE_ENDIAN
                    _p[0] = (STRINGLIB_CHAR)(value & 0xFFu);
                    _p[1] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);
                    _p[2] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);
                    _p[3] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);
# if SIZEOF_SIZE_T == 8
                    _p[4] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu);
                    _p[5] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu);
                    _p[6] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu);
                    _p[7] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu);
# endif
#else
# if SIZEOF_SIZE_T == 8
                    _p[0] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu);
                    _p[1] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu);
                    _p[2] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu);
                    _p[3] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu);
                    _p[4] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);
                    _p[5] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);
                    _p[6] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);
                    _p[7] = (STRINGLIB_CHAR)(value & 0xFFu);
# else
                    _p[0] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);
                    _p[1] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);
                    _p[2] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);
                    _p[3] = (STRINGLIB_CHAR)(value & 0xFFu);
# endif
#endif
                    _s += SIZEOF_SIZE_T;
                    _p += SIZEOF_SIZE_T;
                }
                s = _s;
                p = _p;
                if (s == end)
                    break;
                /* The block loop may have stopped on a non-ASCII byte, so
                   reload the current byte and re-test it below. */
                ch = (unsigned char)*s;
            }
            if (ch < 0x80) {
                s++;
                *p++ = ch;
                continue;
            }
        }

        if (ch < 0xE0) {
            /* \xC2\x80-\xDF\xBF -- 0080-07FF */
            Py_UCS4 ch2;
            if (ch < 0xC2) {
                /* invalid sequence
                \x80-\xBF -- continuation byte
                \xC0-\xC1 -- fake 0000-007F */
                goto InvalidStart;
            }
            if (end - s < 2) {
                /* unexpected end of data: the caller will decide whether
                   it's an error or not */
                break;
            }
            ch2 = (unsigned char)s[1];
            if (!IS_CONTINUATION_BYTE(ch2))
                /* invalid continuation byte */
                goto InvalidContinuation1;
            /* Combine the payload bits; the subtraction removes the 110xxxxx
               and 10xxxxxx marker bits of both bytes in one step. */
            ch = (ch << 6) + ch2 -
                 ((0xC0 << 6) + 0x80);
            assert ((ch > 0x007F) && (ch <= 0x07FF));
            s += 2;
            /* The first test is a compile-time constant: kinds that can
               never hold a 2-byte code point always return it to the
               caller instead of storing it. */
            if (STRINGLIB_MAX_CHAR <= 0x007F ||
                (STRINGLIB_MAX_CHAR < 0x07FF && ch > STRINGLIB_MAX_CHAR))
                /* Out-of-range */
                goto Return;
            *p++ = ch;
            continue;
        }

        if (ch < 0xF0) {
            /* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */
            Py_UCS4 ch2, ch3;
            if (end - s < 3) {
                /* unexpected end of data: the caller will decide whether
                   it's an error or not */
                if (end - s < 2)
                    break;
                /* Two bytes are available: still report a bad second byte
                   right away rather than treating it as truncated input. */
                ch2 = (unsigned char)s[1];
                if (!IS_CONTINUATION_BYTE(ch2) ||
                    (ch2 < 0xA0 ? ch == 0xE0 : ch == 0xED))
                    /* for clarification see comments below */
                    goto InvalidContinuation1;
                break;
            }
            ch2 = (unsigned char)s[1];
            ch3 = (unsigned char)s[2];
            if (!IS_CONTINUATION_BYTE(ch2)) {
                /* invalid continuation byte */
                goto InvalidContinuation1;
            }
            if (ch == 0xE0) {
                if (ch2 < 0xA0)
                    /* invalid sequence
                       \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 */
                    goto InvalidContinuation1;
            } else if (ch == 0xED && ch2 >= 0xA0) {
                /* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF
                   will result in surrogates in range D800-DFFF. Surrogates are
                   not valid UTF-8 so they are rejected.
                   See https://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
                   (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
                goto InvalidContinuation1;
            }
            if (!IS_CONTINUATION_BYTE(ch3)) {
                /* invalid continuation byte */
                goto InvalidContinuation2;
            }
            ch = (ch << 12) + (ch2 << 6) + ch3 -
                 ((0xE0 << 12) + (0x80 << 6) + 0x80);
            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
            s += 3;
            if (STRINGLIB_MAX_CHAR <= 0x07FF ||
                (STRINGLIB_MAX_CHAR < 0xFFFF && ch > STRINGLIB_MAX_CHAR))
                /* Out-of-range */
                goto Return;
            *p++ = ch;
            continue;
        }

        if (ch < 0xF5) {
            /* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */
            Py_UCS4 ch2, ch3, ch4;
            if (end - s < 4) {
                /* unexpected end of data: the caller will decide whether
                   it's an error or not */
                if (end - s < 2)
                    break;
                /* Validate whatever continuation bytes are present so that
                   a bad byte is reported even in truncated input. */
                ch2 = (unsigned char)s[1];
                if (!IS_CONTINUATION_BYTE(ch2) ||
                    (ch2 < 0x90 ? ch == 0xF0 : ch == 0xF4))
                    /* for clarification see comments below */
                    goto InvalidContinuation1;
                if (end - s < 3)
                    break;
                ch3 = (unsigned char)s[2];
                if (!IS_CONTINUATION_BYTE(ch3))
                    goto InvalidContinuation2;
                break;
            }
            ch2 = (unsigned char)s[1];
            ch3 = (unsigned char)s[2];
            ch4 = (unsigned char)s[3];
            if (!IS_CONTINUATION_BYTE(ch2)) {
                /* invalid continuation byte */
                goto InvalidContinuation1;
            }
            if (ch == 0xF0) {
                if (ch2 < 0x90)
                    /* invalid sequence
                       \xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF */
                    goto InvalidContinuation1;
            } else if (ch == 0xF4 && ch2 >= 0x90) {
                /* invalid sequence
                   \xF4\x90\x80\x80- -- 110000- overflow */
                goto InvalidContinuation1;
            }
            if (!IS_CONTINUATION_BYTE(ch3)) {
                /* invalid continuation byte */
                goto InvalidContinuation2;
            }
            if (!IS_CONTINUATION_BYTE(ch4)) {
                /* invalid continuation byte */
                goto InvalidContinuation3;
            }
            ch = (ch << 18) + (ch2 << 12) + (ch3 << 6) + ch4 -
                 ((0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80);
            assert ((ch > 0xFFFF) && (ch <= 0x10FFFF));
            s += 4;
            if (STRINGLIB_MAX_CHAR <= 0xFFFF ||
                (STRINGLIB_MAX_CHAR < 0x10FFFF && ch > STRINGLIB_MAX_CHAR))
                /* Out-of-range */
                goto Return;
            *p++ = ch;
            continue;
        }
        /* \xF5-\xFF can never start a sequence */
        goto InvalidStart;
    }
    /* Normal loop exit: input exhausted or truncated sequence at the end */
    ch = 0;
Return:
    /* Common exit: publish the positions reached and the status in ch */
    *inptr = s;
    *outpos = p - dest;
    return ch;
InvalidStart:
    ch = 1;
    goto Return;
InvalidContinuation1:
    ch = 2;
    goto Return;
InvalidContinuation2:
    ch = 3;
    goto Return;
InvalidContinuation3:
    ch = 4;
    goto Return;
}
     253  
     254  #undef ASCII_CHAR_MASK
     255  
     256  
     257  /* UTF-8 encoder specialized for a Unicode kind to avoid the slow
     258     PyUnicode_READ() macro. Delete some parts of the code depending on the kind:
     259     UCS-1 strings don't need to handle surrogates for example. */
/* Encode the `size` characters at `data` as UTF-8 into `writer`.

   writer         bytes writer; it is initialized here and given an initial
                  allocation of size * max_char_size bytes
   unicode        the string object being encoded; only passed on to the
                  error-handling helpers
   data, size     the characters to encode and their count (size >= 0)
   error_handler  how to treat surrogate code points; when it is
                  _Py_ERROR_UNKNOWN it is looked up from `errors` the first
                  time a surrogate is met
   errors         error handler name, passed on to the helpers

   Returns a pointer to the next free byte in the writer's buffer on success,
   or NULL on failure. The writer is not deallocated in this function on
   failure.
   NOTE(review): presumably the caller releases the writer and relies on the
   failing helper having set an exception -- confirm against the callers. */
Py_LOCAL_INLINE(char *)
STRINGLIB(utf8_encoder)(_PyBytesWriter *writer,
                        PyObject *unicode,
                        const STRINGLIB_CHAR *data,
                        Py_ssize_t size,
                        _Py_error_handler error_handler,
                        const char *errors)
{
    Py_ssize_t i;                /* index into data of next input character */
    char *p;                     /* next free byte in output buffer */
#if STRINGLIB_SIZEOF_CHAR > 1
    /* Only kinds wider than one byte can contain surrogates, so only they
       need the error-handling state. */
    PyObject *error_handler_obj = NULL;
    PyObject *exc = NULL;
    PyObject *rep = NULL;
#endif
    /* Largest number of UTF-8 bytes one input character of this kind can
       produce; used to size the initial allocation. */
#if STRINGLIB_SIZEOF_CHAR == 1
    const Py_ssize_t max_char_size = 2;
#elif STRINGLIB_SIZEOF_CHAR == 2
    const Py_ssize_t max_char_size = 3;
#else /*  STRINGLIB_SIZEOF_CHAR == 4 */
    const Py_ssize_t max_char_size = 4;
#endif

    assert(size >= 0);
    if (size > PY_SSIZE_T_MAX / max_char_size) {
        /* integer overflow */
        PyErr_NoMemory();
        return NULL;
    }

    _PyBytesWriter_Init(writer);
    p = _PyBytesWriter_Alloc(writer, size * max_char_size);
    if (p == NULL)
        return NULL;

    for (i = 0; i < size;) {
        Py_UCS4 ch = data[i++];

        if (ch < 0x80) {
            /* Encode ASCII */
            *p++ = (char) ch;

        }
        else
#if STRINGLIB_SIZEOF_CHAR > 1
        if (ch < 0x0800)
#endif
        {
            /* Two-byte sequence. For 1-byte kinds the test above is compiled
               out, so this branch takes every non-ASCII character
               (Latin-1). */
            *p++ = (char)(0xc0 | (ch >> 6));
            *p++ = (char)(0x80 | (ch & 0x3f));
        }
#if STRINGLIB_SIZEOF_CHAR > 1
        else if (Py_UNICODE_IS_SURROGATE(ch)) {
            Py_ssize_t startpos, endpos, newpos;
            Py_ssize_t k;
            if (error_handler == _Py_ERROR_UNKNOWN) {
                error_handler = _Py_GetErrorHandler(errors);
            }

            /* [startpos, endpos) is the run of consecutive surrogates that
               begins with the character just read (i is already past it). */
            startpos = i-1;
            endpos = startpos+1;

            while ((endpos < size) && Py_UNICODE_IS_SURROGATE(data[endpos]))
                endpos++;

            /* Only overallocate the buffer if it's not the last write */
            writer->overallocate = (endpos < size);

            switch (error_handler)
            {
            case _Py_ERROR_REPLACE:
                /* one '?' per surrogate in the run */
                memset(p, '?', endpos - startpos);
                p += (endpos - startpos);
                /* fall through */
            case _Py_ERROR_IGNORE:
                /* skip the rest of the run; its first character has already
                   been consumed by i++ above */
                i += (endpos - startpos - 1);
                break;

            case _Py_ERROR_SURROGATEPASS:
                /* emit each surrogate as an ordinary three-byte sequence */
                for (k=startpos; k<endpos; k++) {
                    ch = data[k];
                    *p++ = (char)(0xe0 | (ch >> 12));
                    *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
                    *p++ = (char)(0x80 | (ch & 0x3f));
                }
                i += (endpos - startpos - 1);
                break;

            case _Py_ERROR_BACKSLASHREPLACE:
                /* subtract preallocated bytes */
                writer->min_size -= max_char_size * (endpos - startpos);
                p = backslashreplace(writer, p,
                                     unicode, startpos, endpos);
                if (p == NULL)
                    goto error;
                i += (endpos - startpos - 1);
                break;

            case _Py_ERROR_XMLCHARREFREPLACE:
                /* subtract preallocated bytes */
                writer->min_size -= max_char_size * (endpos - startpos);
                p = xmlcharrefreplace(writer, p,
                                      unicode, startpos, endpos);
                if (p == NULL)
                    goto error;
                i += (endpos - startpos - 1);
                break;

            case _Py_ERROR_SURROGATEESCAPE:
                /* U+DC80..U+DCFF are written as the single byte held in
                   their low 8 bits; stop at the first surrogate outside
                   that range. */
                for (k=startpos; k<endpos; k++) {
                    ch = data[k];
                    if (!(0xDC80 <= ch && ch <= 0xDCFF))
                        break;
                    *p++ = (char)(ch & 0xff);
                }
                if (k >= endpos) {
                    /* the whole run was escapable */
                    i += (endpos - startpos - 1);
                    break;
                }
                /* Hand the remainder of the run, starting at the first
                   surrogate that could not be escaped, to the generic
                   handler below. */
                startpos = k;
                assert(startpos < endpos);
                /* fall through */
            default:
                /* Generic path: call the error handler, which yields a
                   replacement object and the position (newpos) at which
                   encoding should resume. */
                rep = unicode_encode_call_errorhandler(
                      errors, &error_handler_obj, "utf-8", "surrogates not allowed",
                      unicode, &exc, startpos, endpos, &newpos);
                if (!rep)
                    goto error;

                if (newpos < startpos) {
                    /* Resuming before the run means some characters will be
                       encoded again: reserve room for them. */
                    writer->overallocate = 1;
                    p = _PyBytesWriter_Prepare(writer, p,
                                               max_char_size * (startpos - newpos));
                    if (p == NULL)
                        goto error;
                }
                else {
                    /* subtract preallocated bytes */
                    writer->min_size -= max_char_size * (newpos - startpos);
                    /* Only overallocate the buffer if it's not the last write */
                    writer->overallocate = (newpos < size);
                }

                if (PyBytes_Check(rep)) {
                    p = _PyBytesWriter_WriteBytes(writer, p,
                                                  PyBytes_AS_STRING(rep),
                                                  PyBytes_GET_SIZE(rep));
                }
                else {
                    /* rep is unicode */
                    if (PyUnicode_READY(rep) < 0)
                        goto error;

                    /* A str replacement is copied byte for byte, so it is
                       only accepted when it is pure ASCII. */
                    if (!PyUnicode_IS_ASCII(rep)) {
                        raise_encode_exception(&exc, "utf-8", unicode,
                                               startpos, endpos,
                                               "surrogates not allowed");
                        goto error;
                    }

                    p = _PyBytesWriter_WriteBytes(writer, p,
                                                  PyUnicode_DATA(rep),
                                                  PyUnicode_GET_LENGTH(rep));
                }

                if (p == NULL)
                    goto error;
                Py_CLEAR(rep);

                i = newpos;
            }

            /* If overallocation was disabled, ensure that it was the last
               write. Otherwise, we missed an optimization */
            assert(writer->overallocate || i == size);
        }
        else
#if STRINGLIB_SIZEOF_CHAR > 2
        if (ch < 0x10000)
#endif
        {
            /* Three-byte sequence (non-surrogate, 0x0800 and above). For
               2-byte kinds the test above is compiled out, so this branch
               takes every remaining character. */
            *p++ = (char)(0xe0 | (ch >> 12));
            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
            *p++ = (char)(0x80 | (ch & 0x3f));
        }
#if STRINGLIB_SIZEOF_CHAR > 2
        else /* ch >= 0x10000 */
        {
            assert(ch <= MAX_UNICODE);
            /* Encode UCS4 Unicode ordinals */
            *p++ = (char)(0xf0 | (ch >> 18));
            *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
            *p++ = (char)(0x80 | (ch & 0x3f));
        }
#endif /* STRINGLIB_SIZEOF_CHAR > 2 */
#endif /* STRINGLIB_SIZEOF_CHAR > 1 */
    }

#if STRINGLIB_SIZEOF_CHAR > 1
    Py_XDECREF(error_handler_obj);
    Py_XDECREF(exc);
#endif
    return p;

#if STRINGLIB_SIZEOF_CHAR > 1
 error:
    /* Drop every reference acquired during error handling */
    Py_XDECREF(rep);
    Py_XDECREF(error_handler_obj);
    Py_XDECREF(exc);
    return NULL;
#endif
}
     474  
/* The pattern for constructing UCS2-repeated masks: a C 'long' with the
   value 1 in each of its 16-bit units. Multiplying it by a 16-bit value
   replicates that value into every unit. */
#if SIZEOF_LONG == 8
# define UCS2_REPEAT_MASK 0x0001000100010001ul
#elif SIZEOF_LONG == 4
# define UCS2_REPEAT_MASK 0x00010001ul
#else
# error C 'long' size should be either 4 or 8!
#endif

/* The mask for fast checking. */
#if STRINGLIB_SIZEOF_CHAR == 1
/* The mask for fast checking of whether a C 'long' contains any
   UTF16-encoded character that does not fit the target kind (non-ASCII or
   non-Latin1, depending on STRINGLIB_MAX_CHAR): every bit that lies above
   STRINGLIB_MAX_CHAR is set in each 16-bit unit. */
# define FAST_CHAR_MASK         (UCS2_REPEAT_MASK * (0xFFFFu & ~STRINGLIB_MAX_CHAR))
#else
/* The mask for fast checking of whether a C 'long' may contain
   UTF16-encoded surrogate characters. This is an efficient heuristic,
   assuming that non-surrogate characters with a code point >= 0x8000 are
   rare in most input: only the top bit of each 16-bit unit is tested.
*/
# define FAST_CHAR_MASK         (UCS2_REPEAT_MASK * 0x8000u)
#endif
/* The mask for fast byte-swapping: selects the low byte of each 16-bit
   unit. */
#define STRIPPED_MASK           (UCS2_REPEAT_MASK * 0x00FFu)
/* Swap the two bytes of every 16-bit unit of 'value'. The argument is
   evaluated twice. */
#define SWAB(value)             ((((value) >> 8) & STRIPPED_MASK) | \
                                 (((value) & STRIPPED_MASK) << 8))
     502  
/* Decode UTF-16 code units from the bytes at *inptr up to (but excluding) e,
   storing the decoded characters into dest starting at index *outpos.

   native_ordering is non-zero when the input bytes are in the byte order of
   the machine, and zero when every 16-bit unit has to be byte-swapped.

   On return, *inptr and *outpos are updated to the positions reached (every
   code unit that was read, including an offending one, has been consumed)
   and the result tells the caller why decoding stopped:

     0      the loop ended: fewer than two bytes of input remain
     1      UnexpectedEnd: a high surrogate was read but fewer than two
            bytes follow it
     2      IllegalEncoding: a surrogate that is not a high surrogate was
            found where a character should start
     3      IllegalSurrogate: a high surrogate was followed by a unit that
            is not a low surrogate
     other  a decoded code point that cannot be stored in a STRINGLIB_CHAR;
            it has NOT been written to dest. Such a value is always greater
            than STRINGLIB_MAX_CHAR, so it cannot be mistaken for one of
            the codes above.

   No bounds check is made on dest in this function.
   NOTE(review): presumably the caller sizes dest for one character per
   input code unit -- confirm against the callers. */
Py_LOCAL_INLINE(Py_UCS4)
STRINGLIB(utf16_decode)(const unsigned char **inptr, const unsigned char *e,
                        STRINGLIB_CHAR *dest, Py_ssize_t *outpos,
                        int native_ordering)
{
    Py_UCS4 ch;
    const unsigned char *q = *inptr;
    STRINGLIB_CHAR *p = dest + *outpos;
    /* Offsets from q for retrieving byte pairs in the right order. */
#if PY_LITTLE_ENDIAN
    int ihi = !!native_ordering, ilo = !native_ordering;
#else
    int ihi = !native_ordering, ilo = !!native_ordering;
#endif
    /* Move e back by one byte so that "q < e" guarantees that both q[0]
       and q[1] lie inside the input. */
    --e;

    while (q < e) {
        Py_UCS4 ch2;
        /* First check for possible aligned read of a C 'long'. Unaligned
           reads are more expensive, better to defer to another iteration. */
        if (_Py_IS_ALIGNED(q, ALIGNOF_LONG)) {
            /* Fast path for runs of in-range non-surrogate chars. */
            const unsigned char *_q = q;
            while (_q + SIZEOF_LONG <= e) {
                unsigned long block = * (const unsigned long *) _q;
                if (native_ordering) {
                    /* Can use buffer directly */
                    if (block & FAST_CHAR_MASK)
                        break;
                }
                else {
                    /* Need to byte-swap */
                    if (block & SWAB(FAST_CHAR_MASK))
                        break;
#if STRINGLIB_SIZEOF_CHAR == 1
                    /* The test above established that, as loaded, the low
                       byte of every 16-bit unit is zero, so shifting the
                       whole block moves each character into the low byte of
                       its unit without a full swap. */
                    block >>= 8;
#else
                    block = SWAB(block);
#endif
                }
                /* The unit at the lowest address must go to p[0]; which
                   bits of 'block' hold it depends on the byte order. */
#if PY_LITTLE_ENDIAN
# if SIZEOF_LONG == 4
                p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu);
                p[1] = (STRINGLIB_CHAR)(block >> 16);
# elif SIZEOF_LONG == 8
                p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu);
                p[1] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu);
                p[2] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu);
                p[3] = (STRINGLIB_CHAR)(block >> 48);
# endif
#else
# if SIZEOF_LONG == 4
                p[0] = (STRINGLIB_CHAR)(block >> 16);
                p[1] = (STRINGLIB_CHAR)(block & 0xFFFFu);
# elif SIZEOF_LONG == 8
                p[0] = (STRINGLIB_CHAR)(block >> 48);
                p[1] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu);
                p[2] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu);
                p[3] = (STRINGLIB_CHAR)(block & 0xFFFFu);
# endif
#endif
                _q += SIZEOF_LONG;
                p += SIZEOF_LONG / 2;
            }
            q = _q;
            if (q >= e)
                break;
        }

        /* Slow path: assemble one code unit from its two bytes */
        ch = (q[ihi] << 8) | q[ilo];
        q += 2;
        if (!Py_UNICODE_IS_SURROGATE(ch)) {
#if STRINGLIB_SIZEOF_CHAR < 2
            if (ch > STRINGLIB_MAX_CHAR)
                /* Out-of-range */
                goto Return;
#endif
            *p++ = (STRINGLIB_CHAR)ch;
            continue;
        }

        /* UTF-16 code pair: */
        if (!Py_UNICODE_IS_HIGH_SURROGATE(ch))
            goto IllegalEncoding;
        if (q >= e)
            goto UnexpectedEnd;
        ch2 = (q[ihi] << 8) | q[ilo];
        q += 2;
        if (!Py_UNICODE_IS_LOW_SURROGATE(ch2))
            goto IllegalSurrogate;
        ch = Py_UNICODE_JOIN_SURROGATES(ch, ch2);
#if STRINGLIB_SIZEOF_CHAR < 4
        /* Out-of-range: a joined pair never fits a kind narrower than
           4 bytes, so it is always handed back to the caller. */
        goto Return;
#else
        *p++ = (STRINGLIB_CHAR)ch;
#endif
    }
    /* Normal loop exit */
    ch = 0;
Return:
    /* Common exit: publish the positions reached and the status in ch */
    *inptr = q;
    *outpos = p - dest;
    return ch;
UnexpectedEnd:
    ch = 1;
    goto Return;
IllegalEncoding:
    ch = 2;
    goto Return;
IllegalSurrogate:
    ch = 3;
    goto Return;
}
     616  #undef UCS2_REPEAT_MASK
     617  #undef FAST_CHAR_MASK
     618  #undef STRIPPED_MASK
     619  #undef SWAB
     620  
     621  
     622  #if STRINGLIB_MAX_CHAR >= 0x80
     623  Py_LOCAL_INLINE(Py_ssize_t)
     624  STRINGLIB(utf16_encode)(const STRINGLIB_CHAR *in,
     625                          Py_ssize_t len,
     626                          unsigned short **outptr,
     627                          int native_ordering)
     628  {
     629      unsigned short *out = *outptr;
     630      const STRINGLIB_CHAR *end = in + len;
     631  #if STRINGLIB_SIZEOF_CHAR == 1
     632      if (native_ordering) {
     633          const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
     634          while (in < unrolled_end) {
     635              out[0] = in[0];
     636              out[1] = in[1];
     637              out[2] = in[2];
     638              out[3] = in[3];
     639              in += 4; out += 4;
     640          }
     641          while (in < end) {
     642              *out++ = *in++;
     643          }
     644      } else {
     645  # define SWAB2(CH)  ((CH) << 8) /* high byte is zero */
     646          const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
     647          while (in < unrolled_end) {
     648              out[0] = SWAB2(in[0]);
     649              out[1] = SWAB2(in[1]);
     650              out[2] = SWAB2(in[2]);
     651              out[3] = SWAB2(in[3]);
     652              in += 4; out += 4;
     653          }
     654          while (in < end) {
     655              Py_UCS4 ch = *in++;
     656              *out++ = SWAB2((Py_UCS2)ch);
     657          }
     658  #undef SWAB2
     659      }
     660      *outptr = out;
     661      return len;
     662  #else
     663      if (native_ordering) {
     664  #if STRINGLIB_MAX_CHAR < 0x10000
     665          const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
     666          while (in < unrolled_end) {
     667              /* check if any character is a surrogate character */
     668              if (((in[0] ^ 0xd800) &
     669                   (in[1] ^ 0xd800) &
     670                   (in[2] ^ 0xd800) &
     671                   (in[3] ^ 0xd800) & 0xf800) == 0)
     672                  break;
     673              out[0] = in[0];
     674              out[1] = in[1];
     675              out[2] = in[2];
     676              out[3] = in[3];
     677              in += 4; out += 4;
     678          }
     679  #endif
     680          while (in < end) {
     681              Py_UCS4 ch;
     682              ch = *in++;
     683              if (ch < 0xd800)
     684                  *out++ = ch;
     685              else if (ch < 0xe000)
     686                  /* reject surrogate characters (U+D800-U+DFFF) */
     687                  goto fail;
     688  #if STRINGLIB_MAX_CHAR >= 0x10000
     689              else if (ch >= 0x10000) {
     690                  out[0] = Py_UNICODE_HIGH_SURROGATE(ch);
     691                  out[1] = Py_UNICODE_LOW_SURROGATE(ch);
     692                  out += 2;
     693              }
     694  #endif
     695              else
     696                  *out++ = ch;
     697          }
     698      } else {
     699  #define SWAB2(CH)  (((CH) << 8) | ((CH) >> 8))
     700  #if STRINGLIB_MAX_CHAR < 0x10000
     701          const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
     702          while (in < unrolled_end) {
     703              /* check if any character is a surrogate character */
     704              if (((in[0] ^ 0xd800) &
     705                   (in[1] ^ 0xd800) &
     706                   (in[2] ^ 0xd800) &
     707                   (in[3] ^ 0xd800) & 0xf800) == 0)
     708                  break;
     709              out[0] = SWAB2(in[0]);
     710              out[1] = SWAB2(in[1]);
     711              out[2] = SWAB2(in[2]);
     712              out[3] = SWAB2(in[3]);
     713              in += 4; out += 4;
     714          }
     715  #endif
     716          while (in < end) {
     717              Py_UCS4 ch = *in++;
     718              if (ch < 0xd800)
     719                  *out++ = SWAB2((Py_UCS2)ch);
     720              else if (ch < 0xe000)
     721                  /* reject surrogate characters (U+D800-U+DFFF) */
     722                  goto fail;
     723  #if STRINGLIB_MAX_CHAR >= 0x10000
     724              else if (ch >= 0x10000) {
     725                  Py_UCS2 ch1 = Py_UNICODE_HIGH_SURROGATE(ch);
     726                  Py_UCS2 ch2 = Py_UNICODE_LOW_SURROGATE(ch);
     727                  out[0] = SWAB2(ch1);
     728                  out[1] = SWAB2(ch2);
     729                  out += 2;
     730              }
     731  #endif
     732              else
     733                  *out++ = SWAB2((Py_UCS2)ch);
     734          }
     735  #undef SWAB2
     736      }
     737      *outptr = out;
     738      return len;
     739    fail:
     740      *outptr = out;
     741      return len - (end - in + 1);
     742  #endif
     743  }
     744  
     745  static inline uint32_t
     746  STRINGLIB(SWAB4)(STRINGLIB_CHAR ch)
     747  {
     748      uint32_t word = ch;
     749  #if STRINGLIB_SIZEOF_CHAR == 1
     750      /* high bytes are zero */
     751      return (word << 24);
     752  #elif STRINGLIB_SIZEOF_CHAR == 2
     753      /* high bytes are zero */
     754      return ((word & 0x00FFu) << 24) | ((word & 0xFF00u) << 8);
     755  #else
     756      return _Py_bswap32(word);
     757  #endif
     758  }
     759  
     760  Py_LOCAL_INLINE(Py_ssize_t)
     761  STRINGLIB(utf32_encode)(const STRINGLIB_CHAR *in,
     762                          Py_ssize_t len,
     763                          uint32_t **outptr,
     764                          int native_ordering)
     765  {
     766      uint32_t *out = *outptr;
     767      const STRINGLIB_CHAR *end = in + len;
     768      if (native_ordering) {
     769          const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
     770          while (in < unrolled_end) {
     771  #if STRINGLIB_SIZEOF_CHAR > 1
     772              /* check if any character is a surrogate character */
     773              if (((in[0] ^ 0xd800) &
     774                   (in[1] ^ 0xd800) &
     775                   (in[2] ^ 0xd800) &
     776                   (in[3] ^ 0xd800) & 0xf800) == 0)
     777                  break;
     778  #endif
     779              out[0] = in[0];
     780              out[1] = in[1];
     781              out[2] = in[2];
     782              out[3] = in[3];
     783              in += 4; out += 4;
     784          }
     785          while (in < end) {
     786              Py_UCS4 ch;
     787              ch = *in++;
     788  #if STRINGLIB_SIZEOF_CHAR > 1
     789              if (Py_UNICODE_IS_SURROGATE(ch)) {
     790                  /* reject surrogate characters (U+D800-U+DFFF) */
     791                  goto fail;
     792              }
     793  #endif
     794              *out++ = ch;
     795          }
     796      } else {
     797          const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
     798          while (in < unrolled_end) {
     799  #if STRINGLIB_SIZEOF_CHAR > 1
     800              /* check if any character is a surrogate character */
     801              if (((in[0] ^ 0xd800) &
     802                   (in[1] ^ 0xd800) &
     803                   (in[2] ^ 0xd800) &
     804                   (in[3] ^ 0xd800) & 0xf800) == 0)
     805                  break;
     806  #endif
     807              out[0] = STRINGLIB(SWAB4)(in[0]);
     808              out[1] = STRINGLIB(SWAB4)(in[1]);
     809              out[2] = STRINGLIB(SWAB4)(in[2]);
     810              out[3] = STRINGLIB(SWAB4)(in[3]);
     811              in += 4; out += 4;
     812          }
     813          while (in < end) {
     814              Py_UCS4 ch = *in++;
     815  #if STRINGLIB_SIZEOF_CHAR > 1
     816              if (Py_UNICODE_IS_SURROGATE(ch)) {
     817                  /* reject surrogate characters (U+D800-U+DFFF) */
     818                  goto fail;
     819              }
     820  #endif
     821              *out++ = STRINGLIB(SWAB4)(ch);
     822          }
     823      }
     824      *outptr = out;
     825      return len;
     826  #if STRINGLIB_SIZEOF_CHAR > 1
     827    fail:
     828      *outptr = out;
     829      return len - (end - in + 1);
     830  #endif
     831  }
     832  
     833  #endif