(root)/
Python-3.11.7/
Objects/
stringlib/
split.h
       1  /* stringlib: split implementation */
       2  
       3  #ifndef STRINGLIB_FASTSEARCH_H
       4  #error must include "stringlib/fastsearch.h" before including this module
       5  #endif
       6  
       7  /* Overallocate the initial list to reduce the number of reallocs for small
       8     split sizes.  Eg, "A A A A A A A A A A".split() (10 elements) has three
       9     resizes, to sizes 4, 8, then 16.  Most observed string splits are for human
      10     text (roughly 11 words per line) and field delimited data (usually 1-10
      11     fields).  For large strings the split algorithms are bandwidth limited
      12     so increasing the preallocation likely will not improve things.*/
      13  
      14  #define MAX_PREALLOC 12
      15  
      16  /* 5 splits gives 6 elements */
      17  #define PREALLOC_SIZE(maxsplit) \
      18      (maxsplit >= MAX_PREALLOC ? MAX_PREALLOC : maxsplit+1)
      19  
      20  #define SPLIT_APPEND(data, left, right)         \
      21      sub = STRINGLIB_NEW((data) + (left),        \
      22                          (right) - (left));      \
      23      if (sub == NULL)                            \
      24          goto onError;                           \
      25      if (PyList_Append(list, sub)) {             \
      26          Py_DECREF(sub);                         \
      27          goto onError;                           \
      28      }                                           \
      29      else                                        \
      30          Py_DECREF(sub);
      31  
      32  #define SPLIT_ADD(data, left, right) {          \
      33      sub = STRINGLIB_NEW((data) + (left),        \
      34                          (right) - (left));      \
      35      if (sub == NULL)                            \
      36          goto onError;                           \
      37      if (count < MAX_PREALLOC) {                 \
      38          PyList_SET_ITEM(list, count, sub);      \
      39      } else {                                    \
      40          if (PyList_Append(list, sub)) {         \
      41              Py_DECREF(sub);                     \
      42              goto onError;                       \
      43          }                                       \
      44          else                                    \
      45              Py_DECREF(sub);                     \
      46      }                                           \
      47      count++; }
      48  
      49  
      50  /* Always force the list to the expected size. */
      51  #define FIX_PREALLOC_SIZE(list) Py_SET_SIZE(list, count)
      52  
      53  Py_LOCAL_INLINE(PyObject *)
      54  STRINGLIB(split_whitespace)(PyObject* str_obj,
      55                             const STRINGLIB_CHAR* str, Py_ssize_t str_len,
      56                             Py_ssize_t maxcount)
      57  {
      58      Py_ssize_t i, j, count=0;
      59      PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
      60      PyObject *sub;
      61  
      62      if (list == NULL)
      63          return NULL;
      64  
      65      i = j = 0;
      66      while (maxcount-- > 0) {
      67          while (i < str_len && STRINGLIB_ISSPACE(str[i]))
      68              i++;
      69          if (i == str_len) break;
      70          j = i; i++;
      71          while (i < str_len && !STRINGLIB_ISSPACE(str[i]))
      72              i++;
      73  #if !STRINGLIB_MUTABLE
      74          if (j == 0 && i == str_len && STRINGLIB_CHECK_EXACT(str_obj)) {
      75              /* No whitespace in str_obj, so just use it as list[0] */
      76              Py_INCREF(str_obj);
      77              PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
      78              count++;
      79              break;
      80          }
      81  #endif
      82          SPLIT_ADD(str, j, i);
      83      }
      84  
      85      if (i < str_len) {
      86          /* Only occurs when maxcount was reached */
      87          /* Skip any remaining whitespace and copy to end of string */
      88          while (i < str_len && STRINGLIB_ISSPACE(str[i]))
      89              i++;
      90          if (i != str_len)
      91              SPLIT_ADD(str, i, str_len);
      92      }
      93      FIX_PREALLOC_SIZE(list);
      94      return list;
      95  
      96    onError:
      97      Py_DECREF(list);
      98      return NULL;
      99  }
     100  
     101  Py_LOCAL_INLINE(PyObject *)
     102  STRINGLIB(split_char)(PyObject* str_obj,
     103                       const STRINGLIB_CHAR* str, Py_ssize_t str_len,
     104                       const STRINGLIB_CHAR ch,
     105                       Py_ssize_t maxcount)
     106  {
     107      Py_ssize_t i, j, count=0;
     108      PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
     109      PyObject *sub;
     110  
     111      if (list == NULL)
     112          return NULL;
     113  
     114      i = j = 0;
     115      while ((j < str_len) && (maxcount-- > 0)) {
     116          for(; j < str_len; j++) {
     117              /* I found that using memchr makes no difference */
     118              if (str[j] == ch) {
     119                  SPLIT_ADD(str, i, j);
     120                  i = j = j + 1;
     121                  break;
     122              }
     123          }
     124      }
     125  #if !STRINGLIB_MUTABLE
     126      if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
     127          /* ch not in str_obj, so just use str_obj as list[0] */
     128          Py_INCREF(str_obj);
     129          PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
     130          count++;
     131      } else
     132  #endif
     133      if (i <= str_len) {
     134          SPLIT_ADD(str, i, str_len);
     135      }
     136      FIX_PREALLOC_SIZE(list);
     137      return list;
     138  
     139    onError:
     140      Py_DECREF(list);
     141      return NULL;
     142  }
     143  
     144  Py_LOCAL_INLINE(PyObject *)
     145  STRINGLIB(split)(PyObject* str_obj,
     146                  const STRINGLIB_CHAR* str, Py_ssize_t str_len,
     147                  const STRINGLIB_CHAR* sep, Py_ssize_t sep_len,
     148                  Py_ssize_t maxcount)
     149  {
     150      Py_ssize_t i, j, pos, count=0;
     151      PyObject *list, *sub;
     152  
     153      if (sep_len == 0) {
     154          PyErr_SetString(PyExc_ValueError, "empty separator");
     155          return NULL;
     156      }
     157      else if (sep_len == 1)
     158          return STRINGLIB(split_char)(str_obj, str, str_len, sep[0], maxcount);
     159  
     160      list = PyList_New(PREALLOC_SIZE(maxcount));
     161      if (list == NULL)
     162          return NULL;
     163  
     164      i = j = 0;
     165      while (maxcount-- > 0) {
     166          pos = FASTSEARCH(str+i, str_len-i, sep, sep_len, -1, FAST_SEARCH);
     167          if (pos < 0)
     168              break;
     169          j = i + pos;
     170          SPLIT_ADD(str, i, j);
     171          i = j + sep_len;
     172      }
     173  #if !STRINGLIB_MUTABLE
     174      if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
     175          /* No match in str_obj, so just use it as list[0] */
     176          Py_INCREF(str_obj);
     177          PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
     178          count++;
     179      } else
     180  #endif
     181      {
     182          SPLIT_ADD(str, i, str_len);
     183      }
     184      FIX_PREALLOC_SIZE(list);
     185      return list;
     186  
     187    onError:
     188      Py_DECREF(list);
     189      return NULL;
     190  }
     191  
     192  Py_LOCAL_INLINE(PyObject *)
     193  STRINGLIB(rsplit_whitespace)(PyObject* str_obj,
     194                              const STRINGLIB_CHAR* str, Py_ssize_t str_len,
     195                              Py_ssize_t maxcount)
     196  {
     197      Py_ssize_t i, j, count=0;
     198      PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
     199      PyObject *sub;
     200  
     201      if (list == NULL)
     202          return NULL;
     203  
     204      i = j = str_len - 1;
     205      while (maxcount-- > 0) {
     206          while (i >= 0 && STRINGLIB_ISSPACE(str[i]))
     207              i--;
     208          if (i < 0) break;
     209          j = i; i--;
     210          while (i >= 0 && !STRINGLIB_ISSPACE(str[i]))
     211              i--;
     212  #if !STRINGLIB_MUTABLE
     213          if (j == str_len - 1 && i < 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
     214              /* No whitespace in str_obj, so just use it as list[0] */
     215              Py_INCREF(str_obj);
     216              PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
     217              count++;
     218              break;
     219          }
     220  #endif
     221          SPLIT_ADD(str, i + 1, j + 1);
     222      }
     223  
     224      if (i >= 0) {
     225          /* Only occurs when maxcount was reached */
     226          /* Skip any remaining whitespace and copy to beginning of string */
     227          while (i >= 0 && STRINGLIB_ISSPACE(str[i]))
     228              i--;
     229          if (i >= 0)
     230              SPLIT_ADD(str, 0, i + 1);
     231      }
     232      FIX_PREALLOC_SIZE(list);
     233      if (PyList_Reverse(list) < 0)
     234          goto onError;
     235      return list;
     236  
     237    onError:
     238      Py_DECREF(list);
     239      return NULL;
     240  }
     241  
     242  Py_LOCAL_INLINE(PyObject *)
     243  STRINGLIB(rsplit_char)(PyObject* str_obj,
     244                        const STRINGLIB_CHAR* str, Py_ssize_t str_len,
     245                        const STRINGLIB_CHAR ch,
     246                        Py_ssize_t maxcount)
     247  {
     248      Py_ssize_t i, j, count=0;
     249      PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
     250      PyObject *sub;
     251  
     252      if (list == NULL)
     253          return NULL;
     254  
     255      i = j = str_len - 1;
     256      while ((i >= 0) && (maxcount-- > 0)) {
     257          for(; i >= 0; i--) {
     258              if (str[i] == ch) {
     259                  SPLIT_ADD(str, i + 1, j + 1);
     260                  j = i = i - 1;
     261                  break;
     262              }
     263          }
     264      }
     265  #if !STRINGLIB_MUTABLE
     266      if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
     267          /* ch not in str_obj, so just use str_obj as list[0] */
     268          Py_INCREF(str_obj);
     269          PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
     270          count++;
     271      } else
     272  #endif
     273      if (j >= -1) {
     274          SPLIT_ADD(str, 0, j + 1);
     275      }
     276      FIX_PREALLOC_SIZE(list);
     277      if (PyList_Reverse(list) < 0)
     278          goto onError;
     279      return list;
     280  
     281    onError:
     282      Py_DECREF(list);
     283      return NULL;
     284  }
     285  
     286  Py_LOCAL_INLINE(PyObject *)
     287  STRINGLIB(rsplit)(PyObject* str_obj,
     288                   const STRINGLIB_CHAR* str, Py_ssize_t str_len,
     289                   const STRINGLIB_CHAR* sep, Py_ssize_t sep_len,
     290                   Py_ssize_t maxcount)
     291  {
     292      Py_ssize_t j, pos, count=0;
     293      PyObject *list, *sub;
     294  
     295      if (sep_len == 0) {
     296          PyErr_SetString(PyExc_ValueError, "empty separator");
     297          return NULL;
     298      }
     299      else if (sep_len == 1)
     300          return STRINGLIB(rsplit_char)(str_obj, str, str_len, sep[0], maxcount);
     301  
     302      list = PyList_New(PREALLOC_SIZE(maxcount));
     303      if (list == NULL)
     304          return NULL;
     305  
     306      j = str_len;
     307      while (maxcount-- > 0) {
     308          pos = FASTSEARCH(str, j, sep, sep_len, -1, FAST_RSEARCH);
     309          if (pos < 0)
     310              break;
     311          SPLIT_ADD(str, pos + sep_len, j);
     312          j = pos;
     313      }
     314  #if !STRINGLIB_MUTABLE
     315      if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
     316          /* No match in str_obj, so just use it as list[0] */
     317          Py_INCREF(str_obj);
     318          PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
     319          count++;
     320      } else
     321  #endif
     322      {
     323          SPLIT_ADD(str, 0, j);
     324      }
     325      FIX_PREALLOC_SIZE(list);
     326      if (PyList_Reverse(list) < 0)
     327          goto onError;
     328      return list;
     329  
     330    onError:
     331      Py_DECREF(list);
     332      return NULL;
     333  }
     334  
     335  Py_LOCAL_INLINE(PyObject *)
     336  STRINGLIB(splitlines)(PyObject* str_obj,
     337                       const STRINGLIB_CHAR* str, Py_ssize_t str_len,
     338                       int keepends)
     339  {
     340      /* This does not use the preallocated list because splitlines is
     341         usually run with hundreds of newlines.  The overhead of
     342         switching between PyList_SET_ITEM and append causes about a
     343         2-3% slowdown for that common case.  A smarter implementation
     344         could move the if check out, so the SET_ITEMs are done first
     345         and the appends only done when the prealloc buffer is full.
     346         That's too much work for little gain.*/
     347  
     348      Py_ssize_t i;
     349      Py_ssize_t j;
     350      PyObject *list = PyList_New(0);
     351      PyObject *sub;
     352  
     353      if (list == NULL)
     354          return NULL;
     355  
     356      for (i = j = 0; i < str_len; ) {
     357          Py_ssize_t eol;
     358  
     359          /* Find a line and append it */
     360          while (i < str_len && !STRINGLIB_ISLINEBREAK(str[i]))
     361              i++;
     362  
     363          /* Skip the line break reading CRLF as one line break */
     364          eol = i;
     365          if (i < str_len) {
     366              if (str[i] == '\r' && i + 1 < str_len && str[i+1] == '\n')
     367                  i += 2;
     368              else
     369                  i++;
     370              if (keepends)
     371                  eol = i;
     372          }
     373  #if !STRINGLIB_MUTABLE
     374          if (j == 0 && eol == str_len && STRINGLIB_CHECK_EXACT(str_obj)) {
     375              /* No linebreak in str_obj, so just use it as list[0] */
     376              if (PyList_Append(list, str_obj))
     377                  goto onError;
     378              break;
     379          }
     380  #endif
     381          SPLIT_APPEND(str, j, eol);
     382          j = i;
     383      }
     384      return list;
     385  
     386    onError:
     387      Py_DECREF(list);
     388      return NULL;
     389  }
     390