(root)/
glibc-2.38/
sysdeps/
x86_64/
multiarch/
strcspn-sse4.c
       1  /* strcspn with SSE4.2 intrinsics
       2     Copyright (C) 2009-2023 Free Software Foundation, Inc.
       3     This file is part of the GNU C Library.
       4  
       5     The GNU C Library is free software; you can redistribute it and/or
       6     modify it under the terms of the GNU Lesser General Public
       7     License as published by the Free Software Foundation; either
       8     version 2.1 of the License, or (at your option) any later version.
       9  
      10     The GNU C Library is distributed in the hope that it will be useful,
      11     but WITHOUT ANY WARRANTY; without even the implied warranty of
      12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
      13     Lesser General Public License for more details.
      14  
      15     You should have received a copy of the GNU Lesser General Public
      16     License along with the GNU C Library; if not, see
      17     <https://www.gnu.org/licenses/>.  */
      18  
      19  #include <isa-level.h>
      20  #if IS_IN (libc) || MINIMUM_X86_ISA_LEVEL >= 2
      21  
      22  # include <nmmintrin.h>
      23  # include <string.h>
      24  # include "varshift.h"
      25  
      26  /* We use 0x2:
      27  	_SIDD_SBYTE_OPS
      28  	| _SIDD_CMP_EQUAL_ANY
      29  	| _SIDD_POSITIVE_POLARITY
      30  	| _SIDD_LEAST_SIGNIFICANT
      31     on pcmpistri to compare xmm/mem128
      32  
      33     0 1 2 3 4 5 6 7 8 9 A B C D E F
      34     X X X X X X X X X X X X X X X X
      35  
      36     against xmm
      37  
      38     0 1 2 3 4 5 6 7 8 9 A B C D E F
      39     A A A A A A A A A A A A A A A A
      40  
      41     to find out if the first 16byte data element has any byte A and
      42     the offset of the first byte.  There are 3 cases:
      43  
      44     1. The first 16byte data element has the byte A at the offset X.
      45     2. The first 16byte data element has EOS and doesn't have the byte A.
      46     3. The first 16byte data element is valid and doesn't have the byte A.
      47  
      48     Here is the table of ECX, CFlag, ZFlag and SFlag for the 3 cases:
      49  
      50      1		 X	  1	 0/1	  0
      51      2		16	  0	  1	  0
      52      3		16	  0	  0	  0
      53  
      54     We exit from the loop for cases 1 and 2 with jbe which branches
      55     when either CFlag or ZFlag is 1.  If CFlag == 1, ECX has the offset
      56     X for case 1.  */
      57  
/* Name of the function defined by this file.  It defaults to
   __strcspn_sse42; a definition made before this point takes
   precedence.  */
# ifndef STRCSPN
#  define STRCSPN __strcspn_sse42
# endif
/* Name of the routine this file falls back to when the set of bytes to
   look for is longer than 16 bytes.  It defaults to __strcspn_generic
   and can be overridden the same way.  */
# ifndef STRCSPN_GENERIC
#  define STRCSPN_GENERIC __strcspn_generic
# endif

/* RETURN (val1, val2) expands to a `return' statement, so it leaves the
   enclosing function.  When USE_AS_STRPBRK is defined it returns VAL1
   (the pointer-valued result); otherwise it returns VAL2 (the
   length-valued result).  The argument that is not selected is dropped
   by the preprocessor and is therefore never evaluated.  */
# ifdef USE_AS_STRPBRK
#  define RETURN(val1, val2) return val1
# else
#  define RETURN(val1, val2) return val2
# endif
      70  
/* Declaration of the fallback implementation.  Its return type follows
   the same USE_AS_STRPBRK switch as the function defined in this file:
   char * when USE_AS_STRPBRK is defined, size_t otherwise.  */
extern
# ifdef USE_AS_STRPBRK
char *
# else
size_t
# endif
STRCSPN_GENERIC (const char *, const char *) attribute_hidden;
      78  
      79  
      80  # ifdef USE_AS_STRPBRK
      81  char *
      82  # else
      83  size_t
      84  # endif
      85  __attribute__ ((section (".text.sse4.2")))
      86  STRCSPN (const char *s, const char *a)
      87  {
      88    if (*a == 0)
      89      RETURN (NULL, strlen (s));
      90  
      91    const char *aligned;
      92    __m128i mask, maskz, zero;
      93    unsigned int maskz_bits;
      94    unsigned int offset = (unsigned int) ((size_t) a & 15);
      95    zero = _mm_set1_epi8 (0);
      96    if (offset != 0)
      97      {
      98        /* Load masks.  */
      99        aligned = (const char *) ((size_t) a & -16L);
     100        __m128i mask0 = _mm_load_si128 ((__m128i *) aligned);
     101        maskz = _mm_cmpeq_epi8 (mask0, zero);
     102  
     103        /* Find where the NULL terminator is.  */
     104        maskz_bits = _mm_movemask_epi8 (maskz) >> offset;
     105        if (maskz_bits != 0)
     106          {
     107            mask = __m128i_shift_right (mask0, offset);
     108            offset = (unsigned int) ((size_t) s & 15);
     109            if (offset)
     110              goto start_unaligned;
     111  
     112            aligned = s;
     113            goto start_loop;
     114          }
     115      }
     116  
     117    /* A is aligned.  */
     118    mask = _mm_loadu_si128 ((__m128i *) a);
     119    /* Find where the NULL terminator is.  */
     120    maskz = _mm_cmpeq_epi8 (mask, zero);
     121    maskz_bits = _mm_movemask_epi8 (maskz);
     122    if (maskz_bits == 0)
     123      {
     124        /* There is no NULL terminator.  Don't use SSE4.2 if the length
     125           of A > 16.  */
     126        if (a[16] != 0)
     127          return STRCSPN_GENERIC (s, a);
     128      }
     129  
     130    aligned = s;
     131    offset = (unsigned int) ((size_t) s & 15);
     132    if (offset != 0)
     133      {
     134      start_unaligned:
     135        /* Check partial string.  */
     136        aligned = (const char *) ((size_t) s & -16L);
     137        __m128i value = _mm_load_si128 ((__m128i *) aligned);
     138  
     139        value = __m128i_shift_right (value, offset);
     140  
     141        unsigned int length = _mm_cmpistri (mask, value, 0x2);
     142        /* No need to check ZFlag since ZFlag is always 1.  */
     143        unsigned int cflag = _mm_cmpistrc (mask, value, 0x2);
     144        if (cflag)
     145  	RETURN ((char *) (s + length), length);
     146        /* Find where the NULL terminator is.  */
     147        unsigned int index = _mm_cmpistri (value, value, 0x3a);
     148        if (index < 16 - offset)
     149  	RETURN (NULL, index);
     150        aligned += 16;
     151      }
     152  
     153  start_loop:
     154    while (1)
     155      {
     156        __m128i value = _mm_load_si128 ((__m128i *) aligned);
     157        unsigned int index = _mm_cmpistri (mask, value, 0x2);
     158        unsigned int cflag = _mm_cmpistrc (mask, value, 0x2);
     159        unsigned int zflag = _mm_cmpistrz (mask, value, 0x2);
     160        if (cflag)
     161  	RETURN ((char *) (aligned + index), (size_t) (aligned + index - s));
     162        if (zflag)
     163  	RETURN (NULL,
     164  		/* Find where the NULL terminator is.  */
     165  		(size_t) (aligned + _mm_cmpistri (value, value, 0x3a) - s));
     166        aligned += 16;
     167      }
     168  }
     169  #endif