1  /* ------------------------------------------------------------------------
       2  
       3     Python Codec Registry and support functions
       4  
       5  Written by Marc-Andre Lemburg (mal@lemburg.com).
       6  
       7  Copyright (c) Corporation for National Research Initiatives.
       8  
       9     ------------------------------------------------------------------------ */
      10  
      11  #include "Python.h"
      12  #include "pycore_call.h"          // _PyObject_CallNoArgs()
      13  #include "pycore_interp.h"        // PyInterpreterState.codec_search_path
      14  #include "pycore_pystate.h"       // _PyInterpreterState_GET()
      15  #include "pycore_ucnhash.h"       // _PyUnicode_Name_CAPI
      16  #include <ctype.h>
      17  
/* The sixteen lowercase hexadecimal digit characters, indexed by digit
   value (0-15). */
const char *Py_hexdigits = "0123456789abcdef";
      19  
      20  /* --- Codec Registry ----------------------------------------------------- */
      21  
      22  /* Import the standard encodings package which will register the first
      23     codec search function.
      24  
      25     This is done in a lazy way so that the Unicode implementation does
      26     not downgrade startup time of scripts not needing it.
      27  
      28     ImportErrors are silently ignored by this function. Only one try is
      29     made.
      30  
      31  */
      32  
/* Forward declaration. The callers below treat a non-zero return value
   as failure. */
static int _PyCodecRegistry_Init(void);
      34  
      35  int PyCodec_Register(PyObject *search_function)
      36  {
      37      PyInterpreterState *interp = _PyInterpreterState_GET();
      38      if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
      39          goto onError;
      40      if (search_function == NULL) {
      41          PyErr_BadArgument();
      42          goto onError;
      43      }
      44      if (!PyCallable_Check(search_function)) {
      45          PyErr_SetString(PyExc_TypeError, "argument must be callable");
      46          goto onError;
      47      }
      48      return PyList_Append(interp->codec_search_path, search_function);
      49  
      50   onError:
      51      return -1;
      52  }
      53  
      54  int
      55  PyCodec_Unregister(PyObject *search_function)
      56  {
      57      PyInterpreterState *interp = PyInterpreterState_Get();
      58      PyObject *codec_search_path = interp->codec_search_path;
      59      /* Do nothing if codec_search_path is not created yet or was cleared. */
      60      if (codec_search_path == NULL) {
      61          return 0;
      62      }
      63  
      64      assert(PyList_CheckExact(codec_search_path));
      65      Py_ssize_t n = PyList_GET_SIZE(codec_search_path);
      66      for (Py_ssize_t i = 0; i < n; i++) {
      67          PyObject *item = PyList_GET_ITEM(codec_search_path, i);
      68          if (item == search_function) {
      69              if (interp->codec_search_cache != NULL) {
      70                  assert(PyDict_CheckExact(interp->codec_search_cache));
      71                  PyDict_Clear(interp->codec_search_cache);
      72              }
      73              return PyList_SetSlice(codec_search_path, i, i+1, NULL);
      74          }
      75      }
      76      return 0;
      77  }
      78  
/* Defined outside this section.  normalizestring() below calls it as
   (input string, output buffer, output buffer size) and treats a zero
   return value as failure. */
extern int _Py_normalize_encoding(const char *, char *, size_t);
      80  
/* Convert a string to a normalized Python string (decoded from UTF-8): all
   characters are converted to lower case, spaces and hyphens are replaced
   with underscores. */
      83  
      84  static
      85  PyObject *normalizestring(const char *string)
      86  {
      87      size_t len = strlen(string);
      88      char *encoding;
      89      PyObject *v;
      90  
      91      if (len > PY_SSIZE_T_MAX) {
      92          PyErr_SetString(PyExc_OverflowError, "string is too large");
      93          return NULL;
      94      }
      95  
      96      encoding = PyMem_Malloc(len + 1);
      97      if (encoding == NULL)
      98          return PyErr_NoMemory();
      99  
     100      if (!_Py_normalize_encoding(string, encoding, len + 1))
     101      {
     102          PyErr_SetString(PyExc_RuntimeError, "_Py_normalize_encoding() failed");
     103          PyMem_Free(encoding);
     104          return NULL;
     105      }
     106  
     107      v = PyUnicode_FromString(encoding);
     108      PyMem_Free(encoding);
     109      return v;
     110  }
     111  
     112  /* Lookup the given encoding and return a tuple providing the codec
     113     facilities.
     114  
     115     The encoding string is looked up converted to all lower-case
     116     characters. This makes encodings looked up through this mechanism
     117     effectively case-insensitive.
     118  
     119     If no codec is found, a LookupError is set and NULL returned.
     120  
     121     As side effect, this tries to load the encodings package, if not
     122     yet done. This is part of the lazy load strategy for the encodings
     123     package.
     124  
     125  */
     126  
     127  PyObject *_PyCodec_Lookup(const char *encoding)
     128  {
     129      if (encoding == NULL) {
     130          PyErr_BadArgument();
     131          return NULL;
     132      }
     133  
     134      PyInterpreterState *interp = _PyInterpreterState_GET();
     135      if (interp->codec_search_path == NULL && _PyCodecRegistry_Init()) {
     136          return NULL;
     137      }
     138  
     139      /* Convert the encoding to a normalized Python string: all
     140         characters are converted to lower case, spaces and hyphens are
     141         replaced with underscores. */
     142      PyObject *v = normalizestring(encoding);
     143      if (v == NULL) {
     144          return NULL;
     145      }
     146      PyUnicode_InternInPlace(&v);
     147  
     148      /* First, try to lookup the name in the registry dictionary */
     149      PyObject *result = PyDict_GetItemWithError(interp->codec_search_cache, v);
     150      if (result != NULL) {
     151          Py_INCREF(result);
     152          Py_DECREF(v);
     153          return result;
     154      }
     155      else if (PyErr_Occurred()) {
     156          goto onError;
     157      }
     158  
     159      /* Next, scan the search functions in order of registration */
     160      const Py_ssize_t len = PyList_Size(interp->codec_search_path);
     161      if (len < 0)
     162          goto onError;
     163      if (len == 0) {
     164          PyErr_SetString(PyExc_LookupError,
     165                          "no codec search functions registered: "
     166                          "can't find encoding");
     167          goto onError;
     168      }
     169  
     170      Py_ssize_t i;
     171      for (i = 0; i < len; i++) {
     172          PyObject *func;
     173  
     174          func = PyList_GetItem(interp->codec_search_path, i);
     175          if (func == NULL)
     176              goto onError;
     177          result = PyObject_CallOneArg(func, v);
     178          if (result == NULL)
     179              goto onError;
     180          if (result == Py_None) {
     181              Py_DECREF(result);
     182              continue;
     183          }
     184          if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 4) {
     185              PyErr_SetString(PyExc_TypeError,
     186                              "codec search functions must return 4-tuples");
     187              Py_DECREF(result);
     188              goto onError;
     189          }
     190          break;
     191      }
     192      if (i == len) {
     193          /* XXX Perhaps we should cache misses too ? */
     194          PyErr_Format(PyExc_LookupError,
     195                       "unknown encoding: %s", encoding);
     196          goto onError;
     197      }
     198  
     199      /* Cache and return the result */
     200      if (PyDict_SetItem(interp->codec_search_cache, v, result) < 0) {
     201          Py_DECREF(result);
     202          goto onError;
     203      }
     204      Py_DECREF(v);
     205      return result;
     206  
     207   onError:
     208      Py_DECREF(v);
     209      return NULL;
     210  }
     211  
     212  /* Codec registry encoding check API. */
     213  
     214  int PyCodec_KnownEncoding(const char *encoding)
     215  {
     216      PyObject *codecs;
     217  
     218      codecs = _PyCodec_Lookup(encoding);
     219      if (!codecs) {
     220          PyErr_Clear();
     221          return 0;
     222      }
     223      else {
     224          Py_DECREF(codecs);
     225          return 1;
     226      }
     227  }
     228  
     229  static
     230  PyObject *args_tuple(PyObject *object,
     231                       const char *errors)
     232  {
     233      PyObject *args;
     234  
     235      args = PyTuple_New(1 + (errors != NULL));
     236      if (args == NULL)
     237          return NULL;
     238      Py_INCREF(object);
     239      PyTuple_SET_ITEM(args,0,object);
     240      if (errors) {
     241          PyObject *v;
     242  
     243          v = PyUnicode_FromString(errors);
     244          if (v == NULL) {
     245              Py_DECREF(args);
     246              return NULL;
     247          }
     248          PyTuple_SET_ITEM(args, 1, v);
     249      }
     250      return args;
     251  }
     252  
     253  /* Helper function to get a codec item */
     254  
     255  static
     256  PyObject *codec_getitem(const char *encoding, int index)
     257  {
     258      PyObject *codecs;
     259      PyObject *v;
     260  
     261      codecs = _PyCodec_Lookup(encoding);
     262      if (codecs == NULL)
     263          return NULL;
     264      v = PyTuple_GET_ITEM(codecs, index);
     265      Py_DECREF(codecs);
     266      Py_INCREF(v);
     267      return v;
     268  }
     269  
     270  /* Helper functions to create an incremental codec. */
     271  static
     272  PyObject *codec_makeincrementalcodec(PyObject *codec_info,
     273                                       const char *errors,
     274                                       const char *attrname)
     275  {
     276      PyObject *ret, *inccodec;
     277  
     278      inccodec = PyObject_GetAttrString(codec_info, attrname);
     279      if (inccodec == NULL)
     280          return NULL;
     281      if (errors)
     282          ret = PyObject_CallFunction(inccodec, "s", errors);
     283      else
     284          ret = _PyObject_CallNoArgs(inccodec);
     285      Py_DECREF(inccodec);
     286      return ret;
     287  }
     288  
     289  static
     290  PyObject *codec_getincrementalcodec(const char *encoding,
     291                                      const char *errors,
     292                                      const char *attrname)
     293  {
     294      PyObject *codec_info, *ret;
     295  
     296      codec_info = _PyCodec_Lookup(encoding);
     297      if (codec_info == NULL)
     298          return NULL;
     299      ret = codec_makeincrementalcodec(codec_info, errors, attrname);
     300      Py_DECREF(codec_info);
     301      return ret;
     302  }
     303  
     304  /* Helper function to create a stream codec. */
     305  
     306  static
     307  PyObject *codec_getstreamcodec(const char *encoding,
     308                                 PyObject *stream,
     309                                 const char *errors,
     310                                 const int index)
     311  {
     312      PyObject *codecs, *streamcodec, *codeccls;
     313  
     314      codecs = _PyCodec_Lookup(encoding);
     315      if (codecs == NULL)
     316          return NULL;
     317  
     318      codeccls = PyTuple_GET_ITEM(codecs, index);
     319      if (errors != NULL)
     320          streamcodec = PyObject_CallFunction(codeccls, "Os", stream, errors);
     321      else
     322          streamcodec = PyObject_CallOneArg(codeccls, stream);
     323      Py_DECREF(codecs);
     324      return streamcodec;
     325  }
     326  
     327  /* Helpers to work with the result of _PyCodec_Lookup
     328  
     329   */
     330  PyObject *_PyCodecInfo_GetIncrementalDecoder(PyObject *codec_info,
     331                                               const char *errors)
     332  {
     333      return codec_makeincrementalcodec(codec_info, errors,
     334                                        "incrementaldecoder");
     335  }
     336  
     337  PyObject *_PyCodecInfo_GetIncrementalEncoder(PyObject *codec_info,
     338                                               const char *errors)
     339  {
     340      return codec_makeincrementalcodec(codec_info, errors,
     341                                        "incrementalencoder");
     342  }
     343  
     344  
     345  /* Convenience APIs to query the Codec registry.
     346  
     347     All APIs return a codec object with incremented refcount.
     348  
     349   */
     350  
     351  PyObject *PyCodec_Encoder(const char *encoding)
     352  {
     353      return codec_getitem(encoding, 0);
     354  }
     355  
     356  PyObject *PyCodec_Decoder(const char *encoding)
     357  {
     358      return codec_getitem(encoding, 1);
     359  }
     360  
     361  PyObject *PyCodec_IncrementalEncoder(const char *encoding,
     362                                       const char *errors)
     363  {
     364      return codec_getincrementalcodec(encoding, errors, "incrementalencoder");
     365  }
     366  
     367  PyObject *PyCodec_IncrementalDecoder(const char *encoding,
     368                                       const char *errors)
     369  {
     370      return codec_getincrementalcodec(encoding, errors, "incrementaldecoder");
     371  }
     372  
     373  PyObject *PyCodec_StreamReader(const char *encoding,
     374                                 PyObject *stream,
     375                                 const char *errors)
     376  {
     377      return codec_getstreamcodec(encoding, stream, errors, 2);
     378  }
     379  
     380  PyObject *PyCodec_StreamWriter(const char *encoding,
     381                                 PyObject *stream,
     382                                 const char *errors)
     383  {
     384      return codec_getstreamcodec(encoding, stream, errors, 3);
     385  }
     386  
     387  /* Helper that tries to ensure the reported exception chain indicates the
     388   * codec that was invoked to trigger the failure without changing the type
     389   * of the exception raised.
     390   */
     391  static void
     392  wrap_codec_error(const char *operation,
     393                   const char *encoding)
     394  {
     395      /* TrySetFromCause will replace the active exception with a suitably
     396       * updated clone if it can, otherwise it will leave the original
     397       * exception alone.
     398       */
     399      _PyErr_TrySetFromCause("%s with '%s' codec failed",
     400                             operation, encoding);
     401  }
     402  
     403  /* Encode an object (e.g. a Unicode object) using the given encoding
     404     and return the resulting encoded object (usually a Python string).
     405  
     406     errors is passed to the encoder factory as argument if non-NULL. */
     407  
     408  static PyObject *
     409  _PyCodec_EncodeInternal(PyObject *object,
     410                          PyObject *encoder,
     411                          const char *encoding,
     412                          const char *errors)
     413  {
     414      PyObject *args = NULL, *result = NULL;
     415      PyObject *v = NULL;
     416  
     417      args = args_tuple(object, errors);
     418      if (args == NULL)
     419          goto onError;
     420  
     421      result = PyObject_Call(encoder, args, NULL);
     422      if (result == NULL) {
     423          wrap_codec_error("encoding", encoding);
     424          goto onError;
     425      }
     426  
     427      if (!PyTuple_Check(result) ||
     428          PyTuple_GET_SIZE(result) != 2) {
     429          PyErr_SetString(PyExc_TypeError,
     430                          "encoder must return a tuple (object, integer)");
     431          goto onError;
     432      }
     433      v = PyTuple_GET_ITEM(result,0);
     434      Py_INCREF(v);
     435      /* We don't check or use the second (integer) entry. */
     436  
     437      Py_DECREF(args);
     438      Py_DECREF(encoder);
     439      Py_DECREF(result);
     440      return v;
     441  
     442   onError:
     443      Py_XDECREF(result);
     444      Py_XDECREF(args);
     445      Py_XDECREF(encoder);
     446      return NULL;
     447  }
     448  
     449  /* Decode an object (usually a Python string) using the given encoding
     450     and return an equivalent object (e.g. a Unicode object).
     451  
     452     errors is passed to the decoder factory as argument if non-NULL. */
     453  
     454  static PyObject *
     455  _PyCodec_DecodeInternal(PyObject *object,
     456                          PyObject *decoder,
     457                          const char *encoding,
     458                          const char *errors)
     459  {
     460      PyObject *args = NULL, *result = NULL;
     461      PyObject *v;
     462  
     463      args = args_tuple(object, errors);
     464      if (args == NULL)
     465          goto onError;
     466  
     467      result = PyObject_Call(decoder, args, NULL);
     468      if (result == NULL) {
     469          wrap_codec_error("decoding", encoding);
     470          goto onError;
     471      }
     472      if (!PyTuple_Check(result) ||
     473          PyTuple_GET_SIZE(result) != 2) {
     474          PyErr_SetString(PyExc_TypeError,
     475                          "decoder must return a tuple (object,integer)");
     476          goto onError;
     477      }
     478      v = PyTuple_GET_ITEM(result,0);
     479      Py_INCREF(v);
     480      /* We don't check or use the second (integer) entry. */
     481  
     482      Py_DECREF(args);
     483      Py_DECREF(decoder);
     484      Py_DECREF(result);
     485      return v;
     486  
     487   onError:
     488      Py_XDECREF(args);
     489      Py_XDECREF(decoder);
     490      Py_XDECREF(result);
     491      return NULL;
     492  }
     493  
     494  /* Generic encoding/decoding API */
     495  PyObject *PyCodec_Encode(PyObject *object,
     496                           const char *encoding,
     497                           const char *errors)
     498  {
     499      PyObject *encoder;
     500  
     501      encoder = PyCodec_Encoder(encoding);
     502      if (encoder == NULL)
     503          return NULL;
     504  
     505      return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
     506  }
     507  
     508  PyObject *PyCodec_Decode(PyObject *object,
     509                           const char *encoding,
     510                           const char *errors)
     511  {
     512      PyObject *decoder;
     513  
     514      decoder = PyCodec_Decoder(encoding);
     515      if (decoder == NULL)
     516          return NULL;
     517  
     518      return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
     519  }
     520  
     521  /* Text encoding/decoding API */
     522  PyObject * _PyCodec_LookupTextEncoding(const char *encoding,
     523                                         const char *alternate_command)
     524  {
     525      PyObject *codec;
     526      PyObject *attr;
     527      int is_text_codec;
     528  
     529      codec = _PyCodec_Lookup(encoding);
     530      if (codec == NULL)
     531          return NULL;
     532  
     533      /* Backwards compatibility: assume any raw tuple describes a text
     534       * encoding, and the same for anything lacking the private
     535       * attribute.
     536       */
     537      if (!PyTuple_CheckExact(codec)) {
     538          if (_PyObject_LookupAttr(codec, &_Py_ID(_is_text_encoding), &attr) < 0) {
     539              Py_DECREF(codec);
     540              return NULL;
     541          }
     542          if (attr != NULL) {
     543              is_text_codec = PyObject_IsTrue(attr);
     544              Py_DECREF(attr);
     545              if (is_text_codec <= 0) {
     546                  Py_DECREF(codec);
     547                  if (!is_text_codec)
     548                      PyErr_Format(PyExc_LookupError,
     549                                   "'%.400s' is not a text encoding; "
     550                                   "use %s to handle arbitrary codecs",
     551                                   encoding, alternate_command);
     552                  return NULL;
     553              }
     554          }
     555      }
     556  
     557      /* This appears to be a valid text encoding */
     558      return codec;
     559  }
     560  
     561  
     562  static
     563  PyObject *codec_getitem_checked(const char *encoding,
     564                                  const char *alternate_command,
     565                                  int index)
     566  {
     567      PyObject *codec;
     568      PyObject *v;
     569  
     570      codec = _PyCodec_LookupTextEncoding(encoding, alternate_command);
     571      if (codec == NULL)
     572          return NULL;
     573  
     574      v = PyTuple_GET_ITEM(codec, index);
     575      Py_INCREF(v);
     576      Py_DECREF(codec);
     577      return v;
     578  }
     579  
     580  static PyObject * _PyCodec_TextEncoder(const char *encoding)
     581  {
     582      return codec_getitem_checked(encoding, "codecs.encode()", 0);
     583  }
     584  
     585  static PyObject * _PyCodec_TextDecoder(const char *encoding)
     586  {
     587      return codec_getitem_checked(encoding, "codecs.decode()", 1);
     588  }
     589  
     590  PyObject *_PyCodec_EncodeText(PyObject *object,
     591                                const char *encoding,
     592                                const char *errors)
     593  {
     594      PyObject *encoder;
     595  
     596      encoder = _PyCodec_TextEncoder(encoding);
     597      if (encoder == NULL)
     598          return NULL;
     599  
     600      return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
     601  }
     602  
     603  PyObject *_PyCodec_DecodeText(PyObject *object,
     604                                const char *encoding,
     605                                const char *errors)
     606  {
     607      PyObject *decoder;
     608  
     609      decoder = _PyCodec_TextDecoder(encoding);
     610      if (decoder == NULL)
     611          return NULL;
     612  
     613      return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
     614  }
     615  
/* Register the error handling callback function error under the name
   name. This function will be called by the codec when it encounters
   unencodable characters/undecodable bytes and doesn't know the
   callback name, when name is specified as the error parameter
   in the call to the encode/decode function.
   Return 0 on success, -1 on error */
     622  int PyCodec_RegisterError(const char *name, PyObject *error)
     623  {
     624      PyInterpreterState *interp = _PyInterpreterState_GET();
     625      if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
     626          return -1;
     627      if (!PyCallable_Check(error)) {
     628          PyErr_SetString(PyExc_TypeError, "handler must be callable");
     629          return -1;
     630      }
     631      return PyDict_SetItemString(interp->codec_error_registry,
     632                                  name, error);
     633  }
     634  
     635  /* Lookup the error handling callback function registered under the
     636     name error. As a special case NULL can be passed, in which case
     637     the error handling callback for strict encoding will be returned. */
     638  PyObject *PyCodec_LookupError(const char *name)
     639  {
     640      PyObject *handler = NULL;
     641  
     642      PyInterpreterState *interp = _PyInterpreterState_GET();
     643      if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
     644          return NULL;
     645  
     646      if (name==NULL)
     647          name = "strict";
     648      handler = _PyDict_GetItemStringWithError(interp->codec_error_registry, name);
     649      if (handler) {
     650          Py_INCREF(handler);
     651      }
     652      else if (!PyErr_Occurred()) {
     653          PyErr_Format(PyExc_LookupError, "unknown error handler name '%.400s'", name);
     654      }
     655      return handler;
     656  }
     657  
     658  static void wrong_exception_type(PyObject *exc)
     659  {
     660      PyErr_Format(PyExc_TypeError,
     661                   "don't know how to handle %.200s in error callback",
     662                   Py_TYPE(exc)->tp_name);
     663  }
     664  
     665  PyObject *PyCodec_StrictErrors(PyObject *exc)
     666  {
     667      if (PyExceptionInstance_Check(exc))
     668          PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
     669      else
     670          PyErr_SetString(PyExc_TypeError, "codec must pass exception instance");
     671      return NULL;
     672  }
     673  
     674  
     675  PyObject *PyCodec_IgnoreErrors(PyObject *exc)
     676  {
     677      Py_ssize_t end;
     678  
     679      if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
     680          if (PyUnicodeEncodeError_GetEnd(exc, &end))
     681              return NULL;
     682      }
     683      else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
     684          if (PyUnicodeDecodeError_GetEnd(exc, &end))
     685              return NULL;
     686      }
     687      else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
     688          if (PyUnicodeTranslateError_GetEnd(exc, &end))
     689              return NULL;
     690      }
     691      else {
     692          wrong_exception_type(exc);
     693          return NULL;
     694      }
     695      return Py_BuildValue("(Nn)", PyUnicode_New(0, 0), end);
     696  }
     697  
     698  
     699  PyObject *PyCodec_ReplaceErrors(PyObject *exc)
     700  {
     701      Py_ssize_t start, end, i, len;
     702  
     703      if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
     704          PyObject *res;
     705          Py_UCS1 *outp;
     706          if (PyUnicodeEncodeError_GetStart(exc, &start))
     707              return NULL;
     708          if (PyUnicodeEncodeError_GetEnd(exc, &end))
     709              return NULL;
     710          len = end - start;
     711          res = PyUnicode_New(len, '?');
     712          if (res == NULL)
     713              return NULL;
     714          assert(PyUnicode_KIND(res) == PyUnicode_1BYTE_KIND);
     715          outp = PyUnicode_1BYTE_DATA(res);
     716          for (i = 0; i < len; ++i)
     717              outp[i] = '?';
     718          assert(_PyUnicode_CheckConsistency(res, 1));
     719          return Py_BuildValue("(Nn)", res, end);
     720      }
     721      else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
     722          if (PyUnicodeDecodeError_GetEnd(exc, &end))
     723              return NULL;
     724          return Py_BuildValue("(Cn)",
     725                               (int)Py_UNICODE_REPLACEMENT_CHARACTER,
     726                               end);
     727      }
     728      else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
     729          PyObject *res;
     730          Py_UCS2 *outp;
     731          if (PyUnicodeTranslateError_GetStart(exc, &start))
     732              return NULL;
     733          if (PyUnicodeTranslateError_GetEnd(exc, &end))
     734              return NULL;
     735          len = end - start;
     736          res = PyUnicode_New(len, Py_UNICODE_REPLACEMENT_CHARACTER);
     737          if (res == NULL)
     738              return NULL;
     739          assert(PyUnicode_KIND(res) == PyUnicode_2BYTE_KIND);
     740          outp = PyUnicode_2BYTE_DATA(res);
     741          for (i = 0; i < len; i++)
     742              outp[i] = Py_UNICODE_REPLACEMENT_CHARACTER;
     743          assert(_PyUnicode_CheckConsistency(res, 1));
     744          return Py_BuildValue("(Nn)", res, end);
     745      }
     746      else {
     747          wrong_exception_type(exc);
     748          return NULL;
     749      }
     750  }
     751  
     752  PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
     753  {
     754      if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
     755          PyObject *restuple;
     756          PyObject *object;
     757          Py_ssize_t i;
     758          Py_ssize_t start;
     759          Py_ssize_t end;
     760          PyObject *res;
     761          Py_UCS1 *outp;
     762          Py_ssize_t ressize;
     763          Py_UCS4 ch;
     764          if (PyUnicodeEncodeError_GetStart(exc, &start))
     765              return NULL;
     766          if (PyUnicodeEncodeError_GetEnd(exc, &end))
     767              return NULL;
     768          if (!(object = PyUnicodeEncodeError_GetObject(exc)))
     769              return NULL;
     770          if (end - start > PY_SSIZE_T_MAX / (2+7+1))
     771              end = start + PY_SSIZE_T_MAX / (2+7+1);
     772          for (i = start, ressize = 0; i < end; ++i) {
     773              /* object is guaranteed to be "ready" */
     774              ch = PyUnicode_READ_CHAR(object, i);
     775              if (ch<10)
     776                  ressize += 2+1+1;
     777              else if (ch<100)
     778                  ressize += 2+2+1;
     779              else if (ch<1000)
     780                  ressize += 2+3+1;
     781              else if (ch<10000)
     782                  ressize += 2+4+1;
     783              else if (ch<100000)
     784                  ressize += 2+5+1;
     785              else if (ch<1000000)
     786                  ressize += 2+6+1;
     787              else
     788                  ressize += 2+7+1;
     789          }
     790          /* allocate replacement */
     791          res = PyUnicode_New(ressize, 127);
     792          if (res == NULL) {
     793              Py_DECREF(object);
     794              return NULL;
     795          }
     796          outp = PyUnicode_1BYTE_DATA(res);
     797          /* generate replacement */
     798          for (i = start; i < end; ++i) {
     799              int digits;
     800              int base;
     801              ch = PyUnicode_READ_CHAR(object, i);
     802              *outp++ = '&';
     803              *outp++ = '#';
     804              if (ch<10) {
     805                  digits = 1;
     806                  base = 1;
     807              }
     808              else if (ch<100) {
     809                  digits = 2;
     810                  base = 10;
     811              }
     812              else if (ch<1000) {
     813                  digits = 3;
     814                  base = 100;
     815              }
     816              else if (ch<10000) {
     817                  digits = 4;
     818                  base = 1000;
     819              }
     820              else if (ch<100000) {
     821                  digits = 5;
     822                  base = 10000;
     823              }
     824              else if (ch<1000000) {
     825                  digits = 6;
     826                  base = 100000;
     827              }
     828              else {
     829                  digits = 7;
     830                  base = 1000000;
     831              }
     832              while (digits-->0) {
     833                  *outp++ = '0' + ch/base;
     834                  ch %= base;
     835                  base /= 10;
     836              }
     837              *outp++ = ';';
     838          }
     839          assert(_PyUnicode_CheckConsistency(res, 1));
     840          restuple = Py_BuildValue("(Nn)", res, end);
     841          Py_DECREF(object);
     842          return restuple;
     843      }
     844      else {
     845          wrong_exception_type(exc);
     846          return NULL;
     847      }
     848  }
     849  
     850  PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
     851  {
     852      PyObject *object;
     853      Py_ssize_t i;
     854      Py_ssize_t start;
     855      Py_ssize_t end;
     856      PyObject *res;
     857      Py_UCS1 *outp;
     858      int ressize;
     859      Py_UCS4 c;
     860  
     861      if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
     862          const unsigned char *p;
     863          if (PyUnicodeDecodeError_GetStart(exc, &start))
     864              return NULL;
     865          if (PyUnicodeDecodeError_GetEnd(exc, &end))
     866              return NULL;
     867          if (!(object = PyUnicodeDecodeError_GetObject(exc)))
     868              return NULL;
     869          p = (const unsigned char*)PyBytes_AS_STRING(object);
     870          res = PyUnicode_New(4 * (end - start), 127);
     871          if (res == NULL) {
     872              Py_DECREF(object);
     873              return NULL;
     874          }
     875          outp = PyUnicode_1BYTE_DATA(res);
     876          for (i = start; i < end; i++, outp += 4) {
     877              unsigned char c = p[i];
     878              outp[0] = '\\';
     879              outp[1] = 'x';
     880              outp[2] = Py_hexdigits[(c>>4)&0xf];
     881              outp[3] = Py_hexdigits[c&0xf];
     882          }
     883  
     884          assert(_PyUnicode_CheckConsistency(res, 1));
     885          Py_DECREF(object);
     886          return Py_BuildValue("(Nn)", res, end);
     887      }
     888      if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
     889          if (PyUnicodeEncodeError_GetStart(exc, &start))
     890              return NULL;
     891          if (PyUnicodeEncodeError_GetEnd(exc, &end))
     892              return NULL;
     893          if (!(object = PyUnicodeEncodeError_GetObject(exc)))
     894              return NULL;
     895      }
     896      else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
     897          if (PyUnicodeTranslateError_GetStart(exc, &start))
     898              return NULL;
     899          if (PyUnicodeTranslateError_GetEnd(exc, &end))
     900              return NULL;
     901          if (!(object = PyUnicodeTranslateError_GetObject(exc)))
     902              return NULL;
     903      }
     904      else {
     905          wrong_exception_type(exc);
     906          return NULL;
     907      }
     908  
     909      if (end - start > PY_SSIZE_T_MAX / (1+1+8))
     910          end = start + PY_SSIZE_T_MAX / (1+1+8);
     911      for (i = start, ressize = 0; i < end; ++i) {
     912          /* object is guaranteed to be "ready" */
     913          c = PyUnicode_READ_CHAR(object, i);
     914          if (c >= 0x10000) {
     915              ressize += 1+1+8;
     916          }
     917          else if (c >= 0x100) {
     918              ressize += 1+1+4;
     919          }
     920          else
     921              ressize += 1+1+2;
     922      }
     923      res = PyUnicode_New(ressize, 127);
     924      if (res == NULL) {
     925          Py_DECREF(object);
     926          return NULL;
     927      }
     928      outp = PyUnicode_1BYTE_DATA(res);
     929      for (i = start; i < end; ++i) {
     930          c = PyUnicode_READ_CHAR(object, i);
     931          *outp++ = '\\';
     932          if (c >= 0x00010000) {
     933              *outp++ = 'U';
     934              *outp++ = Py_hexdigits[(c>>28)&0xf];
     935              *outp++ = Py_hexdigits[(c>>24)&0xf];
     936              *outp++ = Py_hexdigits[(c>>20)&0xf];
     937              *outp++ = Py_hexdigits[(c>>16)&0xf];
     938              *outp++ = Py_hexdigits[(c>>12)&0xf];
     939              *outp++ = Py_hexdigits[(c>>8)&0xf];
     940          }
     941          else if (c >= 0x100) {
     942              *outp++ = 'u';
     943              *outp++ = Py_hexdigits[(c>>12)&0xf];
     944              *outp++ = Py_hexdigits[(c>>8)&0xf];
     945          }
     946          else
     947              *outp++ = 'x';
     948          *outp++ = Py_hexdigits[(c>>4)&0xf];
     949          *outp++ = Py_hexdigits[c&0xf];
     950      }
     951  
     952      assert(_PyUnicode_CheckConsistency(res, 1));
     953      Py_DECREF(object);
     954      return Py_BuildValue("(Nn)", res, end);
     955  }
     956  
     957  static _PyUnicode_Name_CAPI *ucnhash_capi = NULL;
     958  
     959  PyObject *PyCodec_NameReplaceErrors(PyObject *exc)
     960  {
     961      if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
     962          PyObject *restuple;
     963          PyObject *object;
     964          Py_ssize_t i;
     965          Py_ssize_t start;
     966          Py_ssize_t end;
     967          PyObject *res;
     968          Py_UCS1 *outp;
     969          Py_ssize_t ressize;
     970          int replsize;
     971          Py_UCS4 c;
     972          char buffer[256]; /* NAME_MAXLEN */
     973          if (PyUnicodeEncodeError_GetStart(exc, &start))
     974              return NULL;
     975          if (PyUnicodeEncodeError_GetEnd(exc, &end))
     976              return NULL;
     977          if (!(object = PyUnicodeEncodeError_GetObject(exc)))
     978              return NULL;
     979          if (!ucnhash_capi) {
     980              /* load the unicode data module */
     981              ucnhash_capi = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
     982                                              PyUnicodeData_CAPSULE_NAME, 1);
     983              if (!ucnhash_capi) {
     984                  return NULL;
     985              }
     986          }
     987          for (i = start, ressize = 0; i < end; ++i) {
     988              /* object is guaranteed to be "ready" */
     989              c = PyUnicode_READ_CHAR(object, i);
     990              if (ucnhash_capi->getname(c, buffer, sizeof(buffer), 1)) {
     991                  replsize = 1+1+1+(int)strlen(buffer)+1;
     992              }
     993              else if (c >= 0x10000) {
     994                  replsize = 1+1+8;
     995              }
     996              else if (c >= 0x100) {
     997                  replsize = 1+1+4;
     998              }
     999              else
    1000                  replsize = 1+1+2;
    1001              if (ressize > PY_SSIZE_T_MAX - replsize)
    1002                  break;
    1003              ressize += replsize;
    1004          }
    1005          end = i;
    1006          res = PyUnicode_New(ressize, 127);
    1007          if (res==NULL)
    1008              return NULL;
    1009          for (i = start, outp = PyUnicode_1BYTE_DATA(res);
    1010              i < end; ++i) {
    1011              c = PyUnicode_READ_CHAR(object, i);
    1012              *outp++ = '\\';
    1013              if (ucnhash_capi->getname(c, buffer, sizeof(buffer), 1)) {
    1014                  *outp++ = 'N';
    1015                  *outp++ = '{';
    1016                  strcpy((char *)outp, buffer);
    1017                  outp += strlen(buffer);
    1018                  *outp++ = '}';
    1019                  continue;
    1020              }
    1021              if (c >= 0x00010000) {
    1022                  *outp++ = 'U';
    1023                  *outp++ = Py_hexdigits[(c>>28)&0xf];
    1024                  *outp++ = Py_hexdigits[(c>>24)&0xf];
    1025                  *outp++ = Py_hexdigits[(c>>20)&0xf];
    1026                  *outp++ = Py_hexdigits[(c>>16)&0xf];
    1027                  *outp++ = Py_hexdigits[(c>>12)&0xf];
    1028                  *outp++ = Py_hexdigits[(c>>8)&0xf];
    1029              }
    1030              else if (c >= 0x100) {
    1031                  *outp++ = 'u';
    1032                  *outp++ = Py_hexdigits[(c>>12)&0xf];
    1033                  *outp++ = Py_hexdigits[(c>>8)&0xf];
    1034              }
    1035              else
    1036                  *outp++ = 'x';
    1037              *outp++ = Py_hexdigits[(c>>4)&0xf];
    1038              *outp++ = Py_hexdigits[c&0xf];
    1039          }
    1040  
    1041          assert(outp == PyUnicode_1BYTE_DATA(res) + ressize);
    1042          assert(_PyUnicode_CheckConsistency(res, 1));
    1043          restuple = Py_BuildValue("(Nn)", res, end);
    1044          Py_DECREF(object);
    1045          return restuple;
    1046      }
    1047      else {
    1048          wrong_exception_type(exc);
    1049          return NULL;
    1050      }
    1051  }
    1052  
    1053  #define ENC_UNKNOWN     -1
    1054  #define ENC_UTF8        0
    1055  #define ENC_UTF16BE     1
    1056  #define ENC_UTF16LE     2
    1057  #define ENC_UTF32BE     3
    1058  #define ENC_UTF32LE     4
    1059  
    1060  static int
    1061  get_standard_encoding(const char *encoding, int *bytelength)
    1062  {
    1063      if (Py_TOLOWER(encoding[0]) == 'u' &&
    1064          Py_TOLOWER(encoding[1]) == 't' &&
    1065          Py_TOLOWER(encoding[2]) == 'f') {
    1066          encoding += 3;
    1067          if (*encoding == '-' || *encoding == '_' )
    1068              encoding++;
    1069          if (encoding[0] == '8' && encoding[1] == '\0') {
    1070              *bytelength = 3;
    1071              return ENC_UTF8;
    1072          }
    1073          else if (encoding[0] == '1' && encoding[1] == '6') {
    1074              encoding += 2;
    1075              *bytelength = 2;
    1076              if (*encoding == '\0') {
    1077  #ifdef WORDS_BIGENDIAN
    1078                  return ENC_UTF16BE;
    1079  #else
    1080                  return ENC_UTF16LE;
    1081  #endif
    1082              }
    1083              if (*encoding == '-' || *encoding == '_' )
    1084                  encoding++;
    1085              if (Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') {
    1086                  if (Py_TOLOWER(encoding[0]) == 'b')
    1087                      return ENC_UTF16BE;
    1088                  if (Py_TOLOWER(encoding[0]) == 'l')
    1089                      return ENC_UTF16LE;
    1090              }
    1091          }
    1092          else if (encoding[0] == '3' && encoding[1] == '2') {
    1093              encoding += 2;
    1094              *bytelength = 4;
    1095              if (*encoding == '\0') {
    1096  #ifdef WORDS_BIGENDIAN
    1097                  return ENC_UTF32BE;
    1098  #else
    1099                  return ENC_UTF32LE;
    1100  #endif
    1101              }
    1102              if (*encoding == '-' || *encoding == '_' )
    1103                  encoding++;
    1104              if (Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') {
    1105                  if (Py_TOLOWER(encoding[0]) == 'b')
    1106                      return ENC_UTF32BE;
    1107                  if (Py_TOLOWER(encoding[0]) == 'l')
    1108                      return ENC_UTF32LE;
    1109              }
    1110          }
    1111      }
    1112      else if (strcmp(encoding, "CP_UTF8") == 0) {
    1113          *bytelength = 3;
    1114          return ENC_UTF8;
    1115      }
    1116      return ENC_UNKNOWN;
    1117  }
    1118  
    1119  /* This handler is declared static until someone demonstrates
    1120     a need to call it directly. */
    1121  static PyObject *
    1122  PyCodec_SurrogatePassErrors(PyObject *exc)
    1123  {
    1124      PyObject *restuple;
    1125      PyObject *object;
    1126      PyObject *encode;
    1127      const char *encoding;
    1128      int code;
    1129      int bytelength;
    1130      Py_ssize_t i;
    1131      Py_ssize_t start;
    1132      Py_ssize_t end;
    1133      PyObject *res;
    1134  
    1135      if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
    1136          unsigned char *outp;
    1137          if (PyUnicodeEncodeError_GetStart(exc, &start))
    1138              return NULL;
    1139          if (PyUnicodeEncodeError_GetEnd(exc, &end))
    1140              return NULL;
    1141          if (!(object = PyUnicodeEncodeError_GetObject(exc)))
    1142              return NULL;
    1143          if (!(encode = PyUnicodeEncodeError_GetEncoding(exc))) {
    1144              Py_DECREF(object);
    1145              return NULL;
    1146          }
    1147          if (!(encoding = PyUnicode_AsUTF8(encode))) {
    1148              Py_DECREF(object);
    1149              Py_DECREF(encode);
    1150              return NULL;
    1151          }
    1152          code = get_standard_encoding(encoding, &bytelength);
    1153          Py_DECREF(encode);
    1154          if (code == ENC_UNKNOWN) {
    1155              /* Not supported, fail with original exception */
    1156              PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
    1157              Py_DECREF(object);
    1158              return NULL;
    1159          }
    1160  
    1161          if (end - start > PY_SSIZE_T_MAX / bytelength)
    1162              end = start + PY_SSIZE_T_MAX / bytelength;
    1163          res = PyBytes_FromStringAndSize(NULL, bytelength*(end-start));
    1164          if (!res) {
    1165              Py_DECREF(object);
    1166              return NULL;
    1167          }
    1168          outp = (unsigned char*)PyBytes_AsString(res);
    1169          for (i = start; i < end; i++) {
    1170              /* object is guaranteed to be "ready" */
    1171              Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
    1172              if (!Py_UNICODE_IS_SURROGATE(ch)) {
    1173                  /* Not a surrogate, fail with original exception */
    1174                  PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
    1175                  Py_DECREF(res);
    1176                  Py_DECREF(object);
    1177                  return NULL;
    1178              }
    1179              switch (code) {
    1180              case ENC_UTF8:
    1181                  *outp++ = (unsigned char)(0xe0 | (ch >> 12));
    1182                  *outp++ = (unsigned char)(0x80 | ((ch >> 6) & 0x3f));
    1183                  *outp++ = (unsigned char)(0x80 | (ch & 0x3f));
    1184                  break;
    1185              case ENC_UTF16LE:
    1186                  *outp++ = (unsigned char) ch;
    1187                  *outp++ = (unsigned char)(ch >> 8);
    1188                  break;
    1189              case ENC_UTF16BE:
    1190                  *outp++ = (unsigned char)(ch >> 8);
    1191                  *outp++ = (unsigned char) ch;
    1192                  break;
    1193              case ENC_UTF32LE:
    1194                  *outp++ = (unsigned char) ch;
    1195                  *outp++ = (unsigned char)(ch >> 8);
    1196                  *outp++ = (unsigned char)(ch >> 16);
    1197                  *outp++ = (unsigned char)(ch >> 24);
    1198                  break;
    1199              case ENC_UTF32BE:
    1200                  *outp++ = (unsigned char)(ch >> 24);
    1201                  *outp++ = (unsigned char)(ch >> 16);
    1202                  *outp++ = (unsigned char)(ch >> 8);
    1203                  *outp++ = (unsigned char) ch;
    1204                  break;
    1205              }
    1206          }
    1207          restuple = Py_BuildValue("(On)", res, end);
    1208          Py_DECREF(res);
    1209          Py_DECREF(object);
    1210          return restuple;
    1211      }
    1212      else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
    1213          const unsigned char *p;
    1214          Py_UCS4 ch = 0;
    1215          if (PyUnicodeDecodeError_GetStart(exc, &start))
    1216              return NULL;
    1217          if (PyUnicodeDecodeError_GetEnd(exc, &end))
    1218              return NULL;
    1219          if (!(object = PyUnicodeDecodeError_GetObject(exc)))
    1220              return NULL;
    1221          p = (const unsigned char*)PyBytes_AS_STRING(object);
    1222          if (!(encode = PyUnicodeDecodeError_GetEncoding(exc))) {
    1223              Py_DECREF(object);
    1224              return NULL;
    1225          }
    1226          if (!(encoding = PyUnicode_AsUTF8(encode))) {
    1227              Py_DECREF(object);
    1228              Py_DECREF(encode);
    1229              return NULL;
    1230          }
    1231          code = get_standard_encoding(encoding, &bytelength);
    1232          Py_DECREF(encode);
    1233          if (code == ENC_UNKNOWN) {
    1234              /* Not supported, fail with original exception */
    1235              PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
    1236              Py_DECREF(object);
    1237              return NULL;
    1238          }
    1239  
    1240          /* Try decoding a single surrogate character. If
    1241             there are more, let the codec call us again. */
    1242          p += start;
    1243          if (PyBytes_GET_SIZE(object) - start >= bytelength) {
    1244              switch (code) {
    1245              case ENC_UTF8:
    1246                  if ((p[0] & 0xf0) == 0xe0 &&
    1247                      (p[1] & 0xc0) == 0x80 &&
    1248                      (p[2] & 0xc0) == 0x80) {
    1249                      /* it's a three-byte code */
    1250                      ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f);
    1251                  }
    1252                  break;
    1253              case ENC_UTF16LE:
    1254                  ch = p[1] << 8 | p[0];
    1255                  break;
    1256              case ENC_UTF16BE:
    1257                  ch = p[0] << 8 | p[1];
    1258                  break;
    1259              case ENC_UTF32LE:
    1260                  ch = (p[3] << 24) | (p[2] << 16) | (p[1] << 8) | p[0];
    1261                  break;
    1262              case ENC_UTF32BE:
    1263                  ch = (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3];
    1264                  break;
    1265              }
    1266          }
    1267  
    1268          Py_DECREF(object);
    1269          if (!Py_UNICODE_IS_SURROGATE(ch)) {
    1270              /* it's not a surrogate - fail */
    1271              PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
    1272              return NULL;
    1273          }
    1274          res = PyUnicode_FromOrdinal(ch);
    1275          if (res == NULL)
    1276              return NULL;
    1277          return Py_BuildValue("(Nn)", res, start + bytelength);
    1278      }
    1279      else {
    1280          wrong_exception_type(exc);
    1281          return NULL;
    1282      }
    1283  }
    1284  
    1285  static PyObject *
    1286  PyCodec_SurrogateEscapeErrors(PyObject *exc)
    1287  {
    1288      PyObject *restuple;
    1289      PyObject *object;
    1290      Py_ssize_t i;
    1291      Py_ssize_t start;
    1292      Py_ssize_t end;
    1293      PyObject *res;
    1294  
    1295      if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
    1296          char *outp;
    1297          if (PyUnicodeEncodeError_GetStart(exc, &start))
    1298              return NULL;
    1299          if (PyUnicodeEncodeError_GetEnd(exc, &end))
    1300              return NULL;
    1301          if (!(object = PyUnicodeEncodeError_GetObject(exc)))
    1302              return NULL;
    1303          res = PyBytes_FromStringAndSize(NULL, end-start);
    1304          if (!res) {
    1305              Py_DECREF(object);
    1306              return NULL;
    1307          }
    1308          outp = PyBytes_AsString(res);
    1309          for (i = start; i < end; i++) {
    1310              /* object is guaranteed to be "ready" */
    1311              Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
    1312              if (ch < 0xdc80 || ch > 0xdcff) {
    1313                  /* Not a UTF-8b surrogate, fail with original exception */
    1314                  PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
    1315                  Py_DECREF(res);
    1316                  Py_DECREF(object);
    1317                  return NULL;
    1318              }
    1319              *outp++ = ch - 0xdc00;
    1320          }
    1321          restuple = Py_BuildValue("(On)", res, end);
    1322          Py_DECREF(res);
    1323          Py_DECREF(object);
    1324          return restuple;
    1325      }
    1326      else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
    1327          PyObject *str;
    1328          const unsigned char *p;
    1329          Py_UCS2 ch[4]; /* decode up to 4 bad bytes. */
    1330          int consumed = 0;
    1331          if (PyUnicodeDecodeError_GetStart(exc, &start))
    1332              return NULL;
    1333          if (PyUnicodeDecodeError_GetEnd(exc, &end))
    1334              return NULL;
    1335          if (!(object = PyUnicodeDecodeError_GetObject(exc)))
    1336              return NULL;
    1337          p = (const unsigned char*)PyBytes_AS_STRING(object);
    1338          while (consumed < 4 && consumed < end-start) {
    1339              /* Refuse to escape ASCII bytes. */
    1340              if (p[start+consumed] < 128)
    1341                  break;
    1342              ch[consumed] = 0xdc00 + p[start+consumed];
    1343              consumed++;
    1344          }
    1345          Py_DECREF(object);
    1346          if (!consumed) {
    1347              /* codec complained about ASCII byte. */
    1348              PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
    1349              return NULL;
    1350          }
    1351          str = PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND, ch, consumed);
    1352          if (str == NULL)
    1353              return NULL;
    1354          return Py_BuildValue("(Nn)", str, start+consumed);
    1355      }
    1356      else {
    1357          wrong_exception_type(exc);
    1358          return NULL;
    1359      }
    1360  }
    1361  
    1362  
    1363  static PyObject *strict_errors(PyObject *self, PyObject *exc)
    1364  {
    1365      return PyCodec_StrictErrors(exc);
    1366  }
    1367  
    1368  
    1369  static PyObject *ignore_errors(PyObject *self, PyObject *exc)
    1370  {
    1371      return PyCodec_IgnoreErrors(exc);
    1372  }
    1373  
    1374  
    1375  static PyObject *replace_errors(PyObject *self, PyObject *exc)
    1376  {
    1377      return PyCodec_ReplaceErrors(exc);
    1378  }
    1379  
    1380  
    1381  static PyObject *xmlcharrefreplace_errors(PyObject *self, PyObject *exc)
    1382  {
    1383      return PyCodec_XMLCharRefReplaceErrors(exc);
    1384  }
    1385  
    1386  
    1387  static PyObject *backslashreplace_errors(PyObject *self, PyObject *exc)
    1388  {
    1389      return PyCodec_BackslashReplaceErrors(exc);
    1390  }
    1391  
    1392  static PyObject *namereplace_errors(PyObject *self, PyObject *exc)
    1393  {
    1394      return PyCodec_NameReplaceErrors(exc);
    1395  }
    1396  
    1397  static PyObject *surrogatepass_errors(PyObject *self, PyObject *exc)
    1398  {
    1399      return PyCodec_SurrogatePassErrors(exc);
    1400  }
    1401  
    1402  static PyObject *surrogateescape_errors(PyObject *self, PyObject *exc)
    1403  {
    1404      return PyCodec_SurrogateEscapeErrors(exc);
    1405  }
    1406  
/* Initialise the current interpreter's codec registry.

   Creates the search-path list, the search cache dict and the error-handler
   dict on the interpreter state, registers the eight built-in error
   handlers listed in methods[], and finally imports the "encodings"
   package.

   Returns 0 on success, or immediately if codec_search_path is already
   set.  Returns -1 on failure.

   NOTE(review): codec_search_path doubles as the "already initialised"
   marker and is assigned first.  If a later step fails, the state created
   so far is left in place, and because of the early check a subsequent
   call returns 0 without retrying the remaining steps. */
static int _PyCodecRegistry_Init(void)
{
    /* Built-in error handlers: registry name plus the method definition
       used to create the callable.  The last two entries give no docstring,
       so their ml_doc member is left zero-initialised. */
    static struct {
        const char *name;
        PyMethodDef def;
    } methods[] =
    {
        {
            "strict",
            {
                "strict_errors",
                strict_errors,
                METH_O,
                PyDoc_STR("Implements the 'strict' error handling, which "
                          "raises a UnicodeError on coding errors.")
            }
        },
        {
            "ignore",
            {
                "ignore_errors",
                ignore_errors,
                METH_O,
                PyDoc_STR("Implements the 'ignore' error handling, which "
                          "ignores malformed data and continues.")
            }
        },
        {
            "replace",
            {
                "replace_errors",
                replace_errors,
                METH_O,
                PyDoc_STR("Implements the 'replace' error handling, which "
                          "replaces malformed data with a replacement marker.")
            }
        },
        {
            "xmlcharrefreplace",
            {
                "xmlcharrefreplace_errors",
                xmlcharrefreplace_errors,
                METH_O,
                PyDoc_STR("Implements the 'xmlcharrefreplace' error handling, "
                          "which replaces an unencodable character with the "
                          "appropriate XML character reference.")
            }
        },
        {
            "backslashreplace",
            {
                "backslashreplace_errors",
                backslashreplace_errors,
                METH_O,
                PyDoc_STR("Implements the 'backslashreplace' error handling, "
                          "which replaces malformed data with a backslashed "
                          "escape sequence.")
            }
        },
        {
            "namereplace",
            {
                "namereplace_errors",
                namereplace_errors,
                METH_O,
                PyDoc_STR("Implements the 'namereplace' error handling, "
                          "which replaces an unencodable character with a "
                          "\\N{...} escape sequence.")
            }
        },
        {
            "surrogatepass",
            {
                "surrogatepass",
                surrogatepass_errors,
                METH_O
            }
        },
        {
            "surrogateescape",
            {
                "surrogateescape",
                surrogateescape_errors,
                METH_O
            }
        }
    };

    PyInterpreterState *interp = _PyInterpreterState_GET();
    PyObject *mod;

    /* Already initialised (or initialisation already attempted). */
    if (interp->codec_search_path != NULL)
        return 0;

    /* Set the marker first: PyCodec_Register() only calls back into this
       function while codec_search_path is NULL. */
    interp->codec_search_path = PyList_New(0);
    if (interp->codec_search_path == NULL) {
        return -1;
    }

    interp->codec_search_cache = PyDict_New();
    if (interp->codec_search_cache == NULL) {
        return -1;
    }

    interp->codec_error_registry = PyDict_New();
    if (interp->codec_error_registry == NULL) {
        return -1;
    }

    /* Wrap each C handler in a callable and register it under its name. */
    for (size_t i = 0; i < Py_ARRAY_LENGTH(methods); ++i) {
        PyObject *func = PyCFunction_NewEx(&methods[i].def, NULL, NULL);
        if (!func) {
            return -1;
        }

        int res = PyCodec_RegisterError(methods[i].name, func);
        /* Drop our reference whether or not registration succeeded;
           presumably the registry keeps its own on success -- confirm
           against PyCodec_RegisterError(). */
        Py_DECREF(func);
        if (res) {
            return -1;
        }
    }

    /* Per the comment at the top of this file, importing the encodings
       package registers the first codec search function.  The module
       object itself is not needed afterwards. */
    mod = PyImport_ImportModule("encodings");
    if (mod == NULL) {
        return -1;
    }
    Py_DECREF(mod);
    interp->codecs_initialized = 1;
    return 0;
}