1  #if STRINGLIB_IS_UNICODE
       2  # error "transmogrify.h only compatible with byte-wise strings"
       3  #endif
       4  
/* The more complicated methods.  Parts of these should be pulled out into the
   shared code in bytes_methods.c to cut down on duplicate code bloat.  */
       7  
       8  /*[clinic input]
       9  class B "PyObject *" "&PyType_Type"
      10  [clinic start generated code]*/
      11  /*[clinic end generated code: output=da39a3ee5e6b4b0d input=2935558188d97c76]*/
      12  
      13  #include "clinic/transmogrify.h.h"
      14  
      15  static inline PyObject *
      16  return_self(PyObject *self)
      17  {
      18  #if !STRINGLIB_MUTABLE
      19      if (STRINGLIB_CHECK_EXACT(self)) {
      20          return Py_NewRef(self);
      21      }
      22  #endif
      23      return STRINGLIB_NEW(STRINGLIB_STR(self), STRINGLIB_LEN(self));
      24  }
      25  
      26  /*[clinic input]
      27  B.expandtabs as stringlib_expandtabs
      28  
      29      tabsize: int = 8
      30  
      31  Return a copy where all tab characters are expanded using spaces.
      32  
      33  If tabsize is not given, a tab size of 8 characters is assumed.
      34  [clinic start generated code]*/
      35  
      36  static PyObject *
      37  stringlib_expandtabs_impl(PyObject *self, int tabsize)
      38  /*[clinic end generated code: output=069cb7fae72e4c2b input=3c6d3b12aa3ccbea]*/
      39  {
      40      const char *e, *p;
      41      char *q;
      42      Py_ssize_t i, j;
      43      PyObject *u;
      44  
      45      /* First pass: determine size of output string */
      46      i = j = 0;
      47      e = STRINGLIB_STR(self) + STRINGLIB_LEN(self);
      48      for (p = STRINGLIB_STR(self); p < e; p++) {
      49          if (*p == '\t') {
      50              if (tabsize > 0) {
      51                  Py_ssize_t incr = tabsize - (j % tabsize);
      52                  if (j > PY_SSIZE_T_MAX - incr)
      53                      goto overflow;
      54                  j += incr;
      55              }
      56          }
      57          else {
      58              if (j > PY_SSIZE_T_MAX - 1)
      59                  goto overflow;
      60              j++;
      61              if (*p == '\n' || *p == '\r') {
      62                  if (i > PY_SSIZE_T_MAX - j)
      63                      goto overflow;
      64                  i += j;
      65                  j = 0;
      66              }
      67          }
      68      }
      69  
      70      if (i > PY_SSIZE_T_MAX - j)
      71          goto overflow;
      72  
      73      /* Second pass: create output string and fill it */
      74      u = STRINGLIB_NEW(NULL, i + j);
      75      if (!u)
      76          return NULL;
      77  
      78      j = 0;
      79      q = STRINGLIB_STR(u);
      80  
      81      for (p = STRINGLIB_STR(self); p < e; p++) {
      82          if (*p == '\t') {
      83              if (tabsize > 0) {
      84                  i = tabsize - (j % tabsize);
      85                  j += i;
      86                  while (i--)
      87                      *q++ = ' ';
      88              }
      89          }
      90          else {
      91              j++;
      92              *q++ = *p;
      93              if (*p == '\n' || *p == '\r')
      94                  j = 0;
      95          }
      96      }
      97  
      98      return u;
      99    overflow:
     100      PyErr_SetString(PyExc_OverflowError, "result too long");
     101      return NULL;
     102  }
     103  
     104  static inline PyObject *
     105  pad(PyObject *self, Py_ssize_t left, Py_ssize_t right, char fill)
     106  {
     107      PyObject *u;
     108  
     109      if (left < 0)
     110          left = 0;
     111      if (right < 0)
     112          right = 0;
     113  
     114      if (left == 0 && right == 0) {
     115          return return_self(self);
     116      }
     117  
     118      u = STRINGLIB_NEW(NULL, left + STRINGLIB_LEN(self) + right);
     119      if (u) {
     120          if (left)
     121              memset(STRINGLIB_STR(u), fill, left);
     122          memcpy(STRINGLIB_STR(u) + left,
     123                 STRINGLIB_STR(self),
     124                 STRINGLIB_LEN(self));
     125          if (right)
     126              memset(STRINGLIB_STR(u) + left + STRINGLIB_LEN(self),
     127                     fill, right);
     128      }
     129  
     130      return u;
     131  }
     132  
     133  /*[clinic input]
     134  B.ljust as stringlib_ljust
     135  
     136      width: Py_ssize_t
     137      fillchar: char = b' '
     138      /
     139  
     140  Return a left-justified string of length width.
     141  
     142  Padding is done using the specified fill character.
     143  [clinic start generated code]*/
     144  
     145  static PyObject *
     146  stringlib_ljust_impl(PyObject *self, Py_ssize_t width, char fillchar)
     147  /*[clinic end generated code: output=c79ca173c5ff8337 input=eff2d014bc7d80df]*/
     148  {
     149      if (STRINGLIB_LEN(self) >= width) {
     150          return return_self(self);
     151      }
     152  
     153      return pad(self, 0, width - STRINGLIB_LEN(self), fillchar);
     154  }
     155  
     156  
     157  /*[clinic input]
     158  B.rjust as stringlib_rjust
     159  
     160      width: Py_ssize_t
     161      fillchar: char = b' '
     162      /
     163  
     164  Return a right-justified string of length width.
     165  
     166  Padding is done using the specified fill character.
     167  [clinic start generated code]*/
     168  
     169  static PyObject *
     170  stringlib_rjust_impl(PyObject *self, Py_ssize_t width, char fillchar)
     171  /*[clinic end generated code: output=7df5d728a5439570 input=218b0bd31308955d]*/
     172  {
     173      if (STRINGLIB_LEN(self) >= width) {
     174          return return_self(self);
     175      }
     176  
     177      return pad(self, width - STRINGLIB_LEN(self), 0, fillchar);
     178  }
     179  
     180  
     181  /*[clinic input]
     182  B.center as stringlib_center
     183  
     184      width: Py_ssize_t
     185      fillchar: char = b' '
     186      /
     187  
     188  Return a centered string of length width.
     189  
     190  Padding is done using the specified fill character.
     191  [clinic start generated code]*/
     192  
     193  static PyObject *
     194  stringlib_center_impl(PyObject *self, Py_ssize_t width, char fillchar)
     195  /*[clinic end generated code: output=d8da2e055288b4c2 input=3776fd278765d89b]*/
     196  {
     197      Py_ssize_t marg, left;
     198  
     199      if (STRINGLIB_LEN(self) >= width) {
     200          return return_self(self);
     201      }
     202  
     203      marg = width - STRINGLIB_LEN(self);
     204      left = marg / 2 + (marg & width & 1);
     205  
     206      return pad(self, left, marg - left, fillchar);
     207  }
     208  
     209  /*[clinic input]
     210  B.zfill as stringlib_zfill
     211  
     212      width: Py_ssize_t
     213      /
     214  
     215  Pad a numeric string with zeros on the left, to fill a field of the given width.
     216  
     217  The original string is never truncated.
     218  [clinic start generated code]*/
     219  
     220  static PyObject *
     221  stringlib_zfill_impl(PyObject *self, Py_ssize_t width)
     222  /*[clinic end generated code: output=0b3c684a7f1b2319 input=2da6d7b8e9bcb19a]*/
     223  {
     224      Py_ssize_t fill;
     225      PyObject *s;
     226      char *p;
     227  
     228      if (STRINGLIB_LEN(self) >= width) {
     229          return return_self(self);
     230      }
     231  
     232      fill = width - STRINGLIB_LEN(self);
     233  
     234      s = pad(self, fill, 0, '0');
     235  
     236      if (s == NULL)
     237          return NULL;
     238  
     239      p = STRINGLIB_STR(s);
     240      if (p[fill] == '+' || p[fill] == '-') {
     241          /* move sign to beginning of string */
     242          p[0] = p[fill];
     243          p[fill] = '0';
     244      }
     245  
     246      return s;
     247  }
     248  
     249  
     250  /* find and count characters and substrings */
     251  
     252  #define findchar(target, target_len, c)                         \
     253    ((char *)memchr((const void *)(target), c, target_len))
     254  
     255  
     256  static Py_ssize_t
     257  countchar(const char *target, Py_ssize_t target_len, char c,
     258            Py_ssize_t maxcount)
     259  {
     260      Py_ssize_t count = 0;
     261      const char *start = target;
     262      const char *end = target + target_len;
     263  
     264      while ((start = findchar(start, end - start, c)) != NULL) {
     265          count++;
     266          if (count >= maxcount)
     267              break;
     268          start += 1;
     269      }
     270      return count;
     271  }
     272  
     273  
     274  /* Algorithms for different cases of string replacement */
     275  
     276  /* len(self)>=1, from="", len(to)>=1, maxcount>=1 */
     277  static PyObject *
     278  stringlib_replace_interleave(PyObject *self,
     279                               const char *to_s, Py_ssize_t to_len,
     280                               Py_ssize_t maxcount)
     281  {
     282      const char *self_s;
     283      char *result_s;
     284      Py_ssize_t self_len, result_len;
     285      Py_ssize_t count, i;
     286      PyObject *result;
     287  
     288      self_len = STRINGLIB_LEN(self);
     289  
     290      /* 1 at the end plus 1 after every character;
     291         count = min(maxcount, self_len + 1) */
     292      if (maxcount <= self_len) {
     293          count = maxcount;
     294      }
     295      else {
     296          /* Can't overflow: self_len + 1 <= maxcount <= PY_SSIZE_T_MAX. */
     297          count = self_len + 1;
     298      }
     299  
     300      /* Check for overflow */
     301      /*   result_len = count * to_len + self_len; */
     302      assert(count > 0);
     303      if (to_len > (PY_SSIZE_T_MAX - self_len) / count) {
     304          PyErr_SetString(PyExc_OverflowError,
     305                          "replace bytes is too long");
     306          return NULL;
     307      }
     308      result_len = count * to_len + self_len;
     309      result = STRINGLIB_NEW(NULL, result_len);
     310      if (result == NULL) {
     311          return NULL;
     312      }
     313  
     314      self_s = STRINGLIB_STR(self);
     315      result_s = STRINGLIB_STR(result);
     316  
     317      if (to_len > 1) {
     318          /* Lay the first one down (guaranteed this will occur) */
     319          memcpy(result_s, to_s, to_len);
     320          result_s += to_len;
     321          count -= 1;
     322  
     323          for (i = 0; i < count; i++) {
     324              *result_s++ = *self_s++;
     325              memcpy(result_s, to_s, to_len);
     326              result_s += to_len;
     327          }
     328      }
     329      else {
     330          result_s[0] = to_s[0];
     331          result_s += to_len;
     332          count -= 1;
     333          for (i = 0; i < count; i++) {
     334              *result_s++ = *self_s++;
     335              result_s[0] = to_s[0];
     336              result_s += to_len;
     337          }
     338      }
     339  
     340      /* Copy the rest of the original string */
     341      memcpy(result_s, self_s, self_len - i);
     342  
     343      return result;
     344  }
     345  
     346  /* Special case for deleting a single character */
     347  /* len(self)>=1, len(from)==1, to="", maxcount>=1 */
     348  static PyObject *
     349  stringlib_replace_delete_single_character(PyObject *self,
     350                                            char from_c, Py_ssize_t maxcount)
     351  {
     352      const char *self_s, *start, *next, *end;
     353      char *result_s;
     354      Py_ssize_t self_len, result_len;
     355      Py_ssize_t count;
     356      PyObject *result;
     357  
     358      self_len = STRINGLIB_LEN(self);
     359      self_s = STRINGLIB_STR(self);
     360  
     361      count = countchar(self_s, self_len, from_c, maxcount);
     362      if (count == 0) {
     363          return return_self(self);
     364      }
     365  
     366      result_len = self_len - count;  /* from_len == 1 */
     367      assert(result_len>=0);
     368  
     369      result = STRINGLIB_NEW(NULL, result_len);
     370      if (result == NULL) {
     371          return NULL;
     372      }
     373      result_s = STRINGLIB_STR(result);
     374  
     375      start = self_s;
     376      end = self_s + self_len;
     377      while (count-- > 0) {
     378          next = findchar(start, end - start, from_c);
     379          if (next == NULL)
     380              break;
     381          memcpy(result_s, start, next - start);
     382          result_s += (next - start);
     383          start = next + 1;
     384      }
     385      memcpy(result_s, start, end - start);
     386  
     387      return result;
     388  }
     389  
     390  /* len(self)>=1, len(from)>=2, to="", maxcount>=1 */
     391  
     392  static PyObject *
     393  stringlib_replace_delete_substring(PyObject *self,
     394                                     const char *from_s, Py_ssize_t from_len,
     395                                     Py_ssize_t maxcount)
     396  {
     397      const char *self_s, *start, *next, *end;
     398      char *result_s;
     399      Py_ssize_t self_len, result_len;
     400      Py_ssize_t count, offset;
     401      PyObject *result;
     402  
     403      self_len = STRINGLIB_LEN(self);
     404      self_s = STRINGLIB_STR(self);
     405  
     406      count = stringlib_count(self_s, self_len,
     407                              from_s, from_len,
     408                              maxcount);
     409  
     410      if (count == 0) {
     411          /* no matches */
     412          return return_self(self);
     413      }
     414  
     415      result_len = self_len - (count * from_len);
     416      assert (result_len>=0);
     417  
     418      result = STRINGLIB_NEW(NULL, result_len);
     419      if (result == NULL) {
     420          return NULL;
     421      }
     422      result_s = STRINGLIB_STR(result);
     423  
     424      start = self_s;
     425      end = self_s + self_len;
     426      while (count-- > 0) {
     427          offset = stringlib_find(start, end - start,
     428                                  from_s, from_len,
     429                                  0);
     430          if (offset == -1)
     431              break;
     432          next = start + offset;
     433  
     434          memcpy(result_s, start, next - start);
     435  
     436          result_s += (next - start);
     437          start = next + from_len;
     438      }
     439      memcpy(result_s, start, end - start);
     440      return result;
     441  }
     442  
     443  /* len(self)>=1, len(from)==len(to)==1, maxcount>=1 */
     444  static PyObject *
     445  stringlib_replace_single_character_in_place(PyObject *self,
     446                                              char from_c, char to_c,
     447                                              Py_ssize_t maxcount)
     448  {
     449      const char *self_s, *end;
     450      char *result_s, *start, *next;
     451      Py_ssize_t self_len;
     452      PyObject *result;
     453  
     454      /* The result string will be the same size */
     455      self_s = STRINGLIB_STR(self);
     456      self_len = STRINGLIB_LEN(self);
     457  
     458      next = findchar(self_s, self_len, from_c);
     459  
     460      if (next == NULL) {
     461          /* No matches; return the original bytes */
     462          return return_self(self);
     463      }
     464  
     465      /* Need to make a new bytes */
     466      result = STRINGLIB_NEW(NULL, self_len);
     467      if (result == NULL) {
     468          return NULL;
     469      }
     470      result_s = STRINGLIB_STR(result);
     471      memcpy(result_s, self_s, self_len);
     472  
     473      /* change everything in-place, starting with this one */
     474      start =  result_s + (next - self_s);
     475      *start = to_c;
     476      start++;
     477      end = result_s + self_len;
     478  
     479      while (--maxcount > 0) {
     480          next = findchar(start, end - start, from_c);
     481          if (next == NULL)
     482              break;
     483          *next = to_c;
     484          start = next + 1;
     485      }
     486  
     487      return result;
     488  }
     489  
     490  /* len(self)>=1, len(from)==len(to)>=2, maxcount>=1 */
     491  static PyObject *
     492  stringlib_replace_substring_in_place(PyObject *self,
     493                                       const char *from_s, Py_ssize_t from_len,
     494                                       const char *to_s, Py_ssize_t to_len,
     495                                       Py_ssize_t maxcount)
     496  {
     497      const char *self_s, *end;
     498      char *result_s, *start;
     499      Py_ssize_t self_len, offset;
     500      PyObject *result;
     501  
     502      /* The result bytes will be the same size */
     503  
     504      self_s = STRINGLIB_STR(self);
     505      self_len = STRINGLIB_LEN(self);
     506  
     507      offset = stringlib_find(self_s, self_len,
     508                              from_s, from_len,
     509                              0);
     510      if (offset == -1) {
     511          /* No matches; return the original bytes */
     512          return return_self(self);
     513      }
     514  
     515      /* Need to make a new bytes */
     516      result = STRINGLIB_NEW(NULL, self_len);
     517      if (result == NULL) {
     518          return NULL;
     519      }
     520      result_s = STRINGLIB_STR(result);
     521      memcpy(result_s, self_s, self_len);
     522  
     523      /* change everything in-place, starting with this one */
     524      start =  result_s + offset;
     525      memcpy(start, to_s, from_len);
     526      start += from_len;
     527      end = result_s + self_len;
     528  
     529      while ( --maxcount > 0) {
     530          offset = stringlib_find(start, end - start,
     531                                  from_s, from_len,
     532                                  0);
     533          if (offset == -1)
     534              break;
     535          memcpy(start + offset, to_s, from_len);
     536          start += offset + from_len;
     537      }
     538  
     539      return result;
     540  }
     541  
     542  /* len(self)>=1, len(from)==1, len(to)>=2, maxcount>=1 */
     543  static PyObject *
     544  stringlib_replace_single_character(PyObject *self,
     545                                     char from_c,
     546                                     const char *to_s, Py_ssize_t to_len,
     547                                     Py_ssize_t maxcount)
     548  {
     549      const char *self_s, *start, *next, *end;
     550      char *result_s;
     551      Py_ssize_t self_len, result_len;
     552      Py_ssize_t count;
     553      PyObject *result;
     554  
     555      self_s = STRINGLIB_STR(self);
     556      self_len = STRINGLIB_LEN(self);
     557  
     558      count = countchar(self_s, self_len, from_c, maxcount);
     559      if (count == 0) {
     560          /* no matches, return unchanged */
     561          return return_self(self);
     562      }
     563  
     564      /* use the difference between current and new, hence the "-1" */
     565      /*   result_len = self_len + count * (to_len-1)  */
     566      assert(count > 0);
     567      if (to_len - 1 > (PY_SSIZE_T_MAX - self_len) / count) {
     568          PyErr_SetString(PyExc_OverflowError, "replace bytes is too long");
     569          return NULL;
     570      }
     571      result_len = self_len + count * (to_len - 1);
     572  
     573      result = STRINGLIB_NEW(NULL, result_len);
     574      if (result == NULL) {
     575          return NULL;
     576      }
     577      result_s = STRINGLIB_STR(result);
     578  
     579      start = self_s;
     580      end = self_s + self_len;
     581      while (count-- > 0) {
     582          next = findchar(start, end - start, from_c);
     583          if (next == NULL)
     584              break;
     585  
     586          if (next == start) {
     587              /* replace with the 'to' */
     588              memcpy(result_s, to_s, to_len);
     589              result_s += to_len;
     590              start += 1;
     591          } else {
     592              /* copy the unchanged old then the 'to' */
     593              memcpy(result_s, start, next - start);
     594              result_s += (next - start);
     595              memcpy(result_s, to_s, to_len);
     596              result_s += to_len;
     597              start = next + 1;
     598          }
     599      }
     600      /* Copy the remainder of the remaining bytes */
     601      memcpy(result_s, start, end - start);
     602  
     603      return result;
     604  }
     605  
     606  /* len(self)>=1, len(from)>=2, len(to)>=2, maxcount>=1 */
     607  static PyObject *
     608  stringlib_replace_substring(PyObject *self,
     609                              const char *from_s, Py_ssize_t from_len,
     610                              const char *to_s, Py_ssize_t to_len,
     611                              Py_ssize_t maxcount)
     612  {
     613      const char *self_s, *start, *next, *end;
     614      char *result_s;
     615      Py_ssize_t self_len, result_len;
     616      Py_ssize_t count, offset;
     617      PyObject *result;
     618  
     619      self_s = STRINGLIB_STR(self);
     620      self_len = STRINGLIB_LEN(self);
     621  
     622      count = stringlib_count(self_s, self_len,
     623                              from_s, from_len,
     624                              maxcount);
     625  
     626      if (count == 0) {
     627          /* no matches, return unchanged */
     628          return return_self(self);
     629      }
     630  
     631      /* Check for overflow */
     632      /*    result_len = self_len + count * (to_len-from_len) */
     633      assert(count > 0);
     634      if (to_len - from_len > (PY_SSIZE_T_MAX - self_len) / count) {
     635          PyErr_SetString(PyExc_OverflowError, "replace bytes is too long");
     636          return NULL;
     637      }
     638      result_len = self_len + count * (to_len - from_len);
     639  
     640      result = STRINGLIB_NEW(NULL, result_len);
     641      if (result == NULL) {
     642          return NULL;
     643      }
     644      result_s = STRINGLIB_STR(result);
     645  
     646      start = self_s;
     647      end = self_s + self_len;
     648      while (count-- > 0) {
     649          offset = stringlib_find(start, end - start,
     650                                  from_s, from_len,
     651                                  0);
     652          if (offset == -1)
     653              break;
     654          next = start + offset;
     655          if (next == start) {
     656              /* replace with the 'to' */
     657              memcpy(result_s, to_s, to_len);
     658              result_s += to_len;
     659              start += from_len;
     660          } else {
     661              /* copy the unchanged old then the 'to' */
     662              memcpy(result_s, start, next - start);
     663              result_s += (next - start);
     664              memcpy(result_s, to_s, to_len);
     665              result_s += to_len;
     666              start = next + from_len;
     667          }
     668      }
     669      /* Copy the remainder of the remaining bytes */
     670      memcpy(result_s, start, end - start);
     671  
     672      return result;
     673  }
     674  
     675  
     676  static PyObject *
     677  stringlib_replace(PyObject *self,
     678                    const char *from_s, Py_ssize_t from_len,
     679                    const char *to_s, Py_ssize_t to_len,
     680                    Py_ssize_t maxcount)
     681  {
     682      if (STRINGLIB_LEN(self) < from_len) {
     683          /* nothing to do; return the original bytes */
     684          return return_self(self);
     685      }
     686      if (maxcount < 0) {
     687          maxcount = PY_SSIZE_T_MAX;
     688      } else if (maxcount == 0) {
     689          /* nothing to do; return the original bytes */
     690          return return_self(self);
     691      }
     692  
     693      /* Handle zero-length special cases */
     694      if (from_len == 0) {
     695          if (to_len == 0) {
     696              /* nothing to do; return the original bytes */
     697              return return_self(self);
     698          }
     699          /* insert the 'to' bytes everywhere.    */
     700          /*    >>> b"Python".replace(b"", b".")  */
     701          /*    b'.P.y.t.h.o.n.'                  */
     702          return stringlib_replace_interleave(self, to_s, to_len, maxcount);
     703      }
     704  
     705      if (to_len == 0) {
     706          /* delete all occurrences of 'from' bytes */
     707          if (from_len == 1) {
     708              return stringlib_replace_delete_single_character(
     709                  self, from_s[0], maxcount);
     710          } else {
     711              return stringlib_replace_delete_substring(
     712                  self, from_s, from_len, maxcount);
     713          }
     714      }
     715  
     716      /* Handle special case where both bytes have the same length */
     717  
     718      if (from_len == to_len) {
     719          if (from_len == 1) {
     720              return stringlib_replace_single_character_in_place(
     721                  self, from_s[0], to_s[0], maxcount);
     722          } else {
     723              return stringlib_replace_substring_in_place(
     724                  self, from_s, from_len, to_s, to_len, maxcount);
     725          }
     726      }
     727  
     728      /* Otherwise use the more generic algorithms */
     729      if (from_len == 1) {
     730          return stringlib_replace_single_character(
     731              self, from_s[0], to_s, to_len, maxcount);
     732      } else {
     733          /* len('from')>=2, len('to')>=1 */
     734          return stringlib_replace_substring(
     735              self, from_s, from_len, to_s, to_len, maxcount);
     736      }
     737  }
     738  
     739  #undef findchar