/* { dg-do compile } */
#include <altivec.h>

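/* 8x8 discrete cosine transform on vectors of signed 16-bit
   elements, written with AltiVec (VMX) intrinsics.  */
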
inline void
transpose_vmx (vector signed short *input, vector signed short *output)
{
  vector signed short v0, v1, v2, v3, v4, v5, v6, v7;
  vector signed short x0, x1, x2, x3, x4, x5, x6, x7;

  /* Matrix transpose */
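  /* Three rounds of vec_mergeh/vec_mergel interleaving: round 1
     pairs row k with row k+4, round 2 pairs results built from rows
     at distance 2, round 3 from rows at distance 1.  After
     log2(8) = 3 rounds each output vector holds one column of the
     input.  */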
  v0 = vec_mergeh (input[0], input[4]);
  v1 = vec_mergel (input[0], input[4]);
  v2 = vec_mergeh (input[1], input[5]);
  v3 = vec_mergel (input[1], input[5]);
  v4 = vec_mergeh (input[2], input[6]);
  v5 = vec_mergel (input[2], input[6]);
  v6 = vec_mergeh (input[3], input[7]);
  v7 = vec_mergel (input[3], input[7]);

  x0 = vec_mergeh (v0, v4);
  x1 = vec_mergel (v0, v4);
  x2 = vec_mergeh (v1, v5);
  x3 = vec_mergel (v1, v5);
  x4 = vec_mergeh (v2, v6);
  x5 = vec_mergel (v2, v6);
  x6 = vec_mergeh (v3, v7);
  x7 = vec_mergel (v3, v7);

  output[0] = vec_mergeh (x0, x4);
  output[1] = vec_mergel (x0, x4);
  output[2] = vec_mergeh (x1, x5);
  output[3] = vec_mergel (x1, x5);
  output[4] = vec_mergeh (x2, x6);
  output[5] = vec_mergel (x2, x6);
  output[6] = vec_mergeh (x3, x7);
  output[7] = vec_mergel (x3, x7);
}

void
dct_vmx (vector signed short *input, vector signed short *output,
         vector signed short *postscale)
{
  vector signed short mul0, mul1, mul2, mul3, mul4, mul5, mul6, mul;
  vector signed short v0, v1, v2, v3, v4, v5, v6, v7, v8, v9;
  vector signed short v20, v21, v22, v23, v24, v25, v26, v27, v31;
  int i;
  vector signed short in[8], out[8];

  /* The eight rows of input data are read straight from input[0..7];
     input[8] supplies the multiplication constants.  */

  /* Splat multiplication constants */
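  /* vec_splat (v, i) replicates element i of v into all eight
     halfword lanes, so each mulN carries one constant per lane.  */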
  mul0 = vec_splat (input[8], 0);
  mul1 = vec_splat (input[8], 1);
  mul2 = vec_splat (input[8], 2);
  mul3 = vec_splat (input[8], 3);
  mul4 = vec_splat (input[8], 4);
  mul5 = vec_splat (input[8], 5);
  mul6 = vec_splat (input[8], 6);
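
  /* Judging from the Stage 3/4 comments below, the lanes of input[8]
     hold: mul0 = c4, mul1 = a2, mul2 = a0, mul3 = a1, mul4 = -c4,
     mul5 = -a2; mul6 appears only as an additive bias term.  */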

  /* Perform DCT on the eight columns */
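  /* Full scheme: 1-D DCT down the columns here, then a transpose,
     a 1-D DCT across the rows, and a final post-scale.  */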

  /*********** Stage 1 ***********/

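  /* Butterfly: saturating sums and differences of rows k and 7-k.  */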
  v8 = vec_adds (input[0], input[7]);
  v9 = vec_subs (input[0], input[7]);
  v0 = vec_adds (input[1], input[6]);
  v7 = vec_subs (input[1], input[6]);
  v1 = vec_adds (input[2], input[5]);
  v6 = vec_subs (input[2], input[5]);
  v2 = vec_adds (input[3], input[4]);
  v5 = vec_subs (input[3], input[4]);

  /*********** Stage 2 ***********/

  /* Top */
  v3 = vec_adds (v8, v2);               /* (V0+V7) + (V3+V4) */
  v4 = vec_subs (v8, v2);               /* (V0+V7) - (V3+V4) */
  v2 = vec_adds (v0, v1);               /* (V1+V6) + (V2+V5) */
  v8 = vec_subs (v0, v1);               /* (V1+V6) - (V2+V5) */

  /* Bottom */
  v0 = vec_subs (v7, v6);               /* (V1-V6) - (V2-V5) */
  v1 = vec_adds (v7, v6);               /* (V1-V6) + (V2-V5) */

  /*********** Stage 3 ***********/

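  /* vec_mradds (a, b, c) is a Q15 multiply-round-accumulate
     (vmhraddshs): per lane, ((a * b + 0x4000) >> 15) + c, with
     signed saturation on the final add.  */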
  /* Top */
  in[0] = vec_adds (v3, v2);            /* y0 = v3 + v2 */
  in[4] = vec_subs (v3, v2);            /* y4 = v3 - v2 */
  in[2] = vec_mradds (v8, mul2, v4);    /* y2 = v8 * a0 + v4 */
  v6 = vec_mradds (v4, mul2, mul6);     /* v6 = v4 * a0 + mul6 */
  in[6] = vec_subs (v6, v8);            /* y6 = v4 * a0 - v8 */

  /* Bottom */
  v6 = vec_mradds (v0, mul0, v5);       /* v6 = v0 * (c4) + v5 */
  v7 = vec_mradds (v0, mul4, v5);       /* v7 = v0 * (-c4) + v5 */
  v2 = vec_mradds (v1, mul4, v9);       /* v2 = v1 * (-c4) + v9 */
  v3 = vec_mradds (v1, mul0, v9);       /* v3 = v1 * (c4) + v9 */

  /*********** Stage 4 ***********/

  /* Bottom */
  in[1] = vec_mradds (v6, mul3, v3);    /* y1 = v6 * (a1) + v3 */
  v23 = vec_mradds (v3, mul3, mul6);    /* v23 = v3 * a1 + mul6 */
  in[7] = vec_subs (v23, v6);           /* y7 = v3 * (a1) - v6 */
  in[5] = vec_mradds (v2, mul1, v7);    /* y5 = v2 * (a2) + v7 */
  in[3] = vec_mradds (v7, mul5, v2);    /* y3 = v7 * (-a2) + v2 */

  transpose_vmx (in, out);

  /* Perform DCT on the eight rows */

  /*********** Stage 1 ***********/

  v8 = vec_adds (out[0], out[7]);
  v9 = vec_subs (out[0], out[7]);
  v0 = vec_adds (out[1], out[6]);
  v7 = vec_subs (out[1], out[6]);
  v1 = vec_adds (out[2], out[5]);
  v6 = vec_subs (out[2], out[5]);
  v2 = vec_adds (out[3], out[4]);
  v5 = vec_subs (out[3], out[4]);

  /*********** Stage 2 ***********/

  /* Top */
  v3 = vec_adds (v8, v2);               /* (V0+V7) + (V3+V4) */
  v4 = vec_subs (v8, v2);               /* (V0+V7) - (V3+V4) */
  v2 = vec_adds (v0, v1);               /* (V1+V6) + (V2+V5) */
  v8 = vec_subs (v0, v1);               /* (V1+V6) - (V2+V5) */

  /* Bottom */
  v0 = vec_subs (v7, v6);               /* (V1-V6) - (V2-V5) */
  v1 = vec_adds (v7, v6);               /* (V1-V6) + (V2-V5) */

  /*********** Stage 3 ***********/

  /* Top */
  v25 = vec_subs (v25, v25);            /* v25 = 0 (x - x zeroes the register) */

  v20 = vec_adds (v3, v2);              /* y0 = v3 + v2 */
  v24 = vec_subs (v3, v2);              /* y4 = v3 - v2 */
  v22 = vec_mradds (v8, mul2, v4);      /* y2 = v8 * a0 + v4 */
  v6 = vec_mradds (v4, mul2, v25);      /* v6 = v4 * a0 */
  v26 = vec_subs (v6, v8);              /* y6 = v4 * a0 - v8 */

  /* Bottom */
  v6 = vec_mradds (v0, mul0, v5);       /* v6 = v0 * (c4) + v5 */
  v7 = vec_mradds (v0, mul4, v5);       /* v7 = v0 * (-c4) + v5 */
  v2 = vec_mradds (v1, mul4, v9);       /* v2 = v1 * (-c4) + v9 */
  v3 = vec_mradds (v1, mul0, v9);       /* v3 = v1 * (c4) + v9 */

  /*********** Stage 4 ***********/

  /* Bottom */
  v21 = vec_mradds (v6, mul3, v3);      /* y1 = v6 * (a1) + v3 */
  v23 = vec_mradds (v3, mul3, v25);     /* v23 = v3 * a1 (v25 is 0) */
  v27 = vec_subs (v23, v6);             /* y7 = v3 * (a1) - v6 */
  v25 = vec_mradds (v2, mul1, v7);      /* y5 = v2 * (a2) + v7 */
  v23 = vec_mradds (v7, mul5, v2);      /* y3 = v7 * (-a2) + v2 */

  /* Post-scale and store results */

  v31 = vec_subs (v31, v31);            /* v31 = 0 */

  output[0] = vec_mradds (postscale[0], v20, v31);
  output[2] = vec_mradds (postscale[2], v22, v31);
  output[4] = vec_mradds (postscale[4], v24, v31);
  output[6] = vec_mradds (postscale[6], v26, v31);
  output[1] = vec_mradds (postscale[1], v21, v31);
  output[3] = vec_mradds (postscale[3], v23, v31);
  output[5] = vec_mradds (postscale[5], v25, v31);
  output[7] = vec_mradds (postscale[7], v27, v31);
}
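
/* A minimal, hypothetical caller sketch (not part of the original
   test), illustrating the expected argument layout: input[0..7] hold
   the 8x8 block, input[8] holds the constants consumed by vec_splat
   above, and postscale[0..7] hold the per-row post-multipliers.  */
void
dct_vmx_example (void)
{
  vector signed short input[9], output[8], postscale[8];
  int i;

  for (i = 0; i < 9; i++)
    input[i] = vec_splat_s16 (0);       /* all-zero block and constants */
  for (i = 0; i < 8; i++)
    postscale[i] = vec_splat_s16 (1);   /* placeholder post-scale */

  dct_vmx (input, output, postscale);
}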