1  extern void abort (void);
       2  
       3  typedef float v4flt __attribute__ ((vector_size (16)));
       4  
       5  void __attribute__ ((noinline)) foo (float *dst, float **src, int a, int n)
       6  {
       7    int i, j;
       8    int z = sizeof (v4flt) / sizeof (float);
       9    unsigned m = sizeof (v4flt) - 1;
      10  
      11    for (j = 0; j < n && (((unsigned long) dst + j) & m); ++j)
      12      {
      13        float t = src[0][j];
      14        for (i = 1; i < a; ++i)
      15  	t += src[i][j];
      16        dst[j] = t;
      17      }
      18  
      19    for (; j < (n - (4 * z - 1)); j += 4 * z)
      20      {
      21        v4flt t0 = *(v4flt *) (src[0] + j + 0 * z);
      22        v4flt t1 = *(v4flt *) (src[0] + j + 1 * z);
      23        v4flt t2 = *(v4flt *) (src[0] + j + 2 * z);
      24        v4flt t3 = *(v4flt *) (src[0] + j + 3 * z);
      25        for (i = 1; i < a; ++i)
      26  	{
      27  	  t0 += *(v4flt *) (src[i] + j + 0 * z);
      28  	  t1 += *(v4flt *) (src[i] + j + 1 * z);
      29  	  t2 += *(v4flt *) (src[i] + j + 2 * z);
      30  	  t3 += *(v4flt *) (src[i] + j + 3 * z);
      31  	}
      32        *(v4flt *) (dst + j + 0 * z) = t0;
      33        *(v4flt *) (dst + j + 1 * z) = t1;
      34        *(v4flt *) (dst + j + 2 * z) = t2;
      35        *(v4flt *) (dst + j + 3 * z) = t3;
      36      }
      37    for (; j < n; ++j)
      38      {
      39        float t = src[0][j];
      40        for (i = 1; i < a; ++i)
      41  	t += src[i][j];
      42        dst[j] = t;
      43      }
      44  }
      45  
      46  float buffer[64];
      47  
      48  int
      49  main (void)
      50  {
      51    int i;
      52    float *dst, *src[2];
      53    char *cptr;
      54  
      55    cptr = (char *)buffer;
      56    cptr += (-(long int) buffer & (16 * sizeof (float) - 1));
      57    dst = (float *)cptr;
      58    src[0] = dst + 16;
      59    src[1] = dst + 32;
      60    for (i = 0; i < 16; ++i)
      61      {
      62        src[0][i] = (float) i + 11 * (float) i;
      63        src[1][i] = (float) i + 12 * (float) i;
      64      }
      65    foo (dst, src, 2, 16);
      66    for (i = 0; i < 16; ++i)
      67      {
      68        float e = (float) i + 11 * (float) i + (float) i + 12 * (float) i;
      69        if (dst[i] != e)
      70  	abort ();
      71      }
      72    return 0;
      73  }