(root)/
gcc-13.2.0/
gcc/
testsuite/
gcc.target/
i386/
reload-1.c
       1  /* { dg-do compile } */
       2  /* { dg-require-effective-target ia32 } */
       3  /* { dg-options "-O3 -msse2 -fdump-rtl-csa" } */
       4  /* { dg-skip-if "no stdint" { vxworks_kernel } } */
       5  
       6  #include <emmintrin.h>
       7  #include <stdint.h>
       8  
       /* Local type aliases used by the code below.  size_t is defined from
          the compiler's predefined __SIZE_TYPE__ macro.  vFloat and vDouble
          are GCC vector-extension types, each 16 bytes wide, with float and
          double elements respectively.  */
       9  typedef __SIZE_TYPE__ size_t;
      10  typedef float vFloat __attribute__ ((__vector_size__ (16)));
      11  typedef double vDouble __attribute__ ((__vector_size__ (16)));
      /* Buffer descriptor.  bar() receives one for its source and one for its
         destination; of these fields it reads only data (from both) and w
         (from the destination only).  h and bytes are not read by any code
         visible in this file.  */
      12  typedef struct buf
      13  {
          /* Start of the storage; bar() accesses it through a float pointer.  */
      14    void *data;
      15    unsigned long h;
          /* bar() uses dest->w as its loop bound, counted in float elements.  */
      16    unsigned long  w;
      17    size_t bytes;
      18  } buf;
      19  
      /* Job header, embedded as the first member of fj.  None of its fields
         is read by the code visible in this file.

         NOTE(review): next and the parameter of func are declared with the
         tag "struct Job" (capital J).  That is a different tag from this
         "struct job" and is never completed in the visible source --
         presumably an artifact of test-case reduction; confirm before
         changing it.  */
      20  typedef struct job
      21  {
      22    struct Job *next;
      23    void * info;
      24    long (*func)(struct Job *job);
      25    long error;
      26  } job;
      27  
      /* Job record whose fields foo() forwards to bar().  */
      28  typedef struct fj
      29  {
              /* Must stay the first member: foo() is handed a job * and casts
                 it straight to fj *.  */
      30      job hd;
              /* Passed to bar() as its src and dest arguments.  */
      31      buf src;
      32      buf dest;
              /* Passed to bar() as g and flags; bar() does not use either.  */
      33      float g;
      34      unsigned int flags;
      35  } fj;
      36  
      /* Lookup tables read by bar().  Both are static const with no
         initializer, so every element is zero.  bar() indexes them with
         (i & 0xff) and with (i >> 16).
         NOTE(review): nothing visible here bounds (i >> 16) to 256 entries;
         that is harmless only because this test is compile-only
         (dg-do compile) -- confirm before reusing the code elsewhere.  */
      37  static const double r[256], t[256];
      38  
      /* Compile-only SSE2 kernel; the RTL generated for it is what the
         dg-final directive at the end of the file inspects.

         SRC        buffer whose data is read as an array of float.
         DEST       buffer whose data is written as an array of float;
                    DEST->w is the number of floats to process.
         G, FLAGS   accepted but never used in the body.

         Handles eight floats per iteration for as long as a full group of
         eight fits below DEST->w; there is no tail loop, so any remainder is
         left untouched.  Always returns 0.

         NOTE(review): p0, b0..b3 and vi0..vi3 are read before anything has
         been stored in them, and m0, p, m and b are statics with no
         initializer (hence all-zero).  This looks like a deliberately reduced
         compiler test case rather than working numeric code -- do not "fix"
         it without confirming the test still exercises what it is meant to.  */
      39  long bar (const buf *src, const buf *dest, float g, unsigned int flags)
      40  {
      41    float *d0 = (float*) src->data;
      42    float *d1 = (float*) dest->data;
      43    uintptr_t w = dest->w;
      44    uintptr_t idx;
            /* Never assigned; used below as the value for zero-input lanes.  */
      45    vFloat p0;
      46    static const vFloat m0;
      47    static const vDouble p[3], m, b;
      48    float *sr = d0;
      49    float *dr = d1;
      50    for( idx = 0; idx + 8 <= w; idx += 8 )
      51    {
              /* Load eight consecutive input floats with unaligned loads.  */
      52      vFloat f0 = _mm_loadu_ps (sr);
      53      vFloat f1 = _mm_loadu_ps (sr + 4);
      54      sr += 8;
              /* ANDNOT yields (~m0) & fN.  */
      55      vFloat fa0 = _mm_andnot_ps (m0, f0);
      56      vFloat fa1 = _mm_andnot_ps (m0, f1);
              /* Widen to double two lanes at a time: the low pair directly, the
                 high pair after MOVEHL has copied it into the low half.  */
      57      vDouble v0 = _mm_cvtps_pd (fa0);
      58      vDouble v1 = _mm_cvtps_pd (_mm_movehl_ps (fa0, fa0));
      59      vDouble v2 = _mm_cvtps_pd (fa1);
      60      vDouble v3 = _mm_cvtps_pd (_mm_movehl_ps (fa1, fa1));
              /* Declared without initializers and read below as they are.  */
      61      vDouble  vi0, vi1, vi2, vi3;
      62      __m128i b0, b1, b2, b3;
              /* Combine b0..b3 with two levels of signed-saturating packs.  */
      63      b0 = _mm_packs_epi32 (_mm_packs_epi32 (b0, b1), _mm_packs_epi32 (b2, b3));
              /* b1 = b0 with each 64-bit lane shifted right by 32 bits.  */
      64      b1 = _mm_srli_epi64 (b0, 32);
              /* The low 32 bits of b0 and b1 supply the first two index words.  */
      65      unsigned int i0 = _mm_cvtsi128_si32 (b0); 
      66      unsigned int i2 = _mm_cvtsi128_si32 (b1);
              /* Gather two doubles from r[] -- low lane from index (i & 0xff),
                 high lane from index (i >> 16) -- and subtract them from v.  */
      67      v0 -= _mm_loadh_pd (_mm_load_sd (r + (i0 & 0xff)), r + (i0 >> 16));
      68      v1 -= _mm_loadh_pd (_mm_load_sd (r + (i2 & 0xff)), r + (i2 >> 16));
              /* Bring the upper 64 bits of b0 and b1 down into the low half to
                 obtain the next two index words.  */
      69      b0 = _mm_unpackhi_epi64 (b0, b0);
      70      b1 = _mm_unpackhi_epi64 (b1, b1);
      71      unsigned int i4 = _mm_cvtsi128_si32 (b0);
      72      unsigned int i6 = _mm_cvtsi128_si32 (b1);
      73      v2 -= _mm_loadh_pd (_mm_load_sd (r + (i4 & 0xff)), r + (i4 >> 16));
      74      v3 -= _mm_loadh_pd (_mm_load_sd (r + (i6 & 0xff)), r + (i6 >> 16));
              /* Quadratic in v with coefficients p[0..2], in Horner form.  */
      75      v0 = p[0] + (p[1] + p[2] * v0) * v0;
      76      v1 = p[0] + (p[1] + p[2] * v1) * v1;
      77      v2 = p[0] + (p[1] + p[2] * v2) * v2;
      78      v3 = p[0] + (p[1] + p[2] * v3) * v3;
              /* Add b and m, reinterpret the double bits as integers and shift
                 each 64-bit lane left by 52.  NOTE(review): 52 is the width of
                 the fraction field of an IEEE double, so this presumably moves
                 an integer into the exponent field -- confirm intent.  */
      79      vi0 = (vDouble) _mm_slli_epi64 ((__m128i)((vi0 + b) + m), 52);
      80      vi1 = (vDouble) _mm_slli_epi64 ((__m128i)((vi1 + b) + m), 52);
      81      vi2 = (vDouble) _mm_slli_epi64 ((__m128i)((vi2 + b) + m), 52);
      82      vi3 = (vDouble) _mm_slli_epi64 ((__m128i)((vi3 + b) + m), 52);
              /* Scale by entries of t[], gathered the same way as from r[].  */
      83      vi0 *= _mm_loadh_pd (_mm_load_sd (t + (i0 & 0xff)), t + (i0 >> 16));
      84      vi1 *= _mm_loadh_pd (_mm_load_sd (t + (i2 & 0xff)), t + (i2 >> 16));
      85      vi2 *= _mm_loadh_pd (_mm_load_sd (t + (i4 & 0xff)), t + (i4 >> 16));
      86      vi3 *= _mm_loadh_pd (_mm_load_sd (t + (i6 & 0xff)), t + (i6 >> 16));
              /* Multiply the polynomial values by the scale factors.  */
      87      v0 *= vi0;
      88      v1 *= vi1;
      89      v2 *= vi2;
      90      v3 *= vi3;
              /* Narrow back to float and merge each pair of two-lane results
                 into one four-lane vector.  */
      91      vFloat r0 = _mm_movelh_ps (_mm_cvtpd_ps( v0 ), _mm_cvtpd_ps (v1));
      92      vFloat r1 = _mm_movelh_ps (_mm_cvtpd_ps( v2 ), _mm_cvtpd_ps (v3));
              /* Lane select: lanes whose input compared equal to zero take
                 their bits from p0; every other lane keeps the computed
                 result.  */
      93      vFloat z0 = _mm_cmpeq_ps (f0, _mm_setzero_ps());
      94      vFloat z1 = _mm_cmpeq_ps (f1, _mm_setzero_ps());
      95      r0 = _mm_andnot_ps (z0, r0);
      96      r1 = _mm_andnot_ps (z1, r1);
      97      z0 = _mm_and_ps (z0, p0);
      98      z1 = _mm_and_ps (z1, p0);
      99      r0 = _mm_or_ps (r0, z0);
     100      r1 = _mm_or_ps (r1, z1);
              /* Store the eight results unaligned and advance the output.  */
     101      _mm_storeu_ps (dr, r0);
     102      _mm_storeu_ps (dr + 4, r1);
     103      dr += 8;
     104    }
     105    return 0;
     106  }
     107  
     /* Entry point of the test: treat J as the header of an enclosing fj
        record and run bar() on that record's src, dest, g and flags.  The
        cast relies on the job header being the first member of fj.  Returns
        whatever bar() returns, which in the visible code is always 0.  */
     108  long foo (job *j )
     109  {
     110    fj *jd = (fj*) j;
     111    return bar (&jd->src, &jd->dest, jd->g, jd->flags);
     112  }
     113  
     114  /* { dg-final { scan-rtl-dump-not "deleted 1 dead insns" "csa" } } */