1  /* { dg-do compile } */
       2  /* { dg-require-effective-target powerpc_p9vector_ok } */
       3  /* { dg-options "-O3 -mdejagnu-cpu=power9" } */
       4  
       5  /* Verify that we vectorize this SAD loop using vabsduh. */
       6  
       /* abs is declared directly here; this test includes no header.  The
          attributes mark it as non-throwing, leaf, and const.  */
       7  extern int abs (int __x) __attribute__ ((__nothrow__, __leaf__)) __attribute__ ((__const__));
       8  
       /* Sum of absolute differences (SAD) kernel under test.

          Walks 16 rows of 8 unsigned short elements each.  For every row it
          adds abs (w[b] - x[b]) for b = 0..7 to the running total, then
          advances W by I elements and X by J elements to reach the next row.
          The unsigned short operands are promoted to int before the
          subtraction, so the difference may be negative and abs applies.

          Returns the accumulated total.

          The dg-final directives in this file count the exact instructions
          generated for this loop nest, so keep the source shape of the
          loops unchanged.  */
       9  static int
      10  foo (unsigned short *w, int i, unsigned short *x, int j)
      11  {
      12    int tot = 0;
      13    for (int a = 0; a < 16; a++)
      14      {
      15        for (int b = 0; b < 8; b++)
      16  	tot += abs (w[b] - x[b]);
      17        w += i;
      18        x += j;
      19      }
      20    return tot;
      21  }
      22  
      /* Non-static entry point that calls foo and stores its result through
         RESULT.  W is walked with a constant row stride of 8 elements, while
         X uses the caller-supplied stride I.  Note that the parameter order
         here (w, x, i) differs from foo's (w, i, x, j).  */
      23  void
      24  bar (unsigned short *w, unsigned short *x, int i, int *result)
      25  {
      26    *result = foo (w, 8, x, i);
      27  }
      28  
      29  /* { dg-final { scan-assembler-times "vabsduh" 16 } } */
      30  /* { dg-final { scan-assembler-times "vsum4shs" 16 } } */
      31  /* { dg-final { scan-assembler-times "vadduwm" 17 } } */
      32  
      33  /* Note: One of the 16 adds is optimized out (add with zero),
      34     leaving 15.  The extra two adds are for the final reduction.  */