1  /* { dg-do run } */
       2  /* { dg-options "-mips3d forbid_cpu=octeon.* (REQUIRES_STDLIB)" } */
       3  
       4  /* Matrix Multiplications */
       5  #include <stdlib.h>
       6  #include <stdio.h>
       7  
       8  typedef float v2sf __attribute__((vector_size(8)));
       9  
      10  float a[4] = {1.1, 2.2, 3.3, 4.4};
      11  float b[4][4] = {{1, 2, 3, 4},
      12                   {5, 6, 7, 8},
      13                   {9, 10, 11, 12},
      14                   {13, 14, 15, 16}};
      15  
      16  float c[4]; /* Result for matrix_multiply1() */
      17  float d[4]; /* Result for matrix_multiply2() */
      18  float e[4]; /* Result for matrix_multiply3() */
      19  float f[4]; /* Result for matrix_multiply4() */
      20  
      21  void matrix_multiply1();
      22  NOMIPS16 void matrix_multiply2();
      23  NOMIPS16 void matrix_multiply3();
      24  NOMIPS16 void matrix_multiply4();
      25  
      26  int main ()
      27  {
      28    int i;
      29  
      30    /* Version 1. Use float calculations */
      31    matrix_multiply1();
      32  
      33    /* Version 2. Use paired-single instructions inside the inner loop*/
      34    matrix_multiply2();
      35    for (i = 0; i < 4; i++)
      36      if (d[i] != c[i])
      37        abort();
      38  
      39    /* Version 3. Use paired-single instructions and unroll the inner loop */
      40    matrix_multiply3();
      41    for (i = 0; i < 4; i++)
      42      if (e[i] != c[i])
      43        abort();
      44  
      45    /* Version 4. Use paired-single instructions and unroll all loops */
      46    matrix_multiply4();
      47    for (i = 0; i < 4; i++)
      48      if (f[i] != c[i])
      49        abort();
      50  
      51    printf ("Test Passes\n");
      52    exit (0);
      53  }
      54  
      55  void matrix_multiply1()
      56  {
      57    int i, j;
      58  
      59    for (i = 0; i < 4; i++)
      60     {
      61       c[i] = 0.0;
      62  
      63       for (j = 0; j < 4; j ++)
      64         c[i] += a[j] * b[j][i]; 
      65     }
      66  }
      67  
      68  NOMIPS16 void matrix_multiply2()
      69  {
      70    int i, j;
      71    v2sf m1, m2;
      72    v2sf result, temp;
      73  
      74    for (i = 0; i < 4; i++)
      75     {
      76       result = (v2sf) {0.0, 0.0};
      77  
      78       for (j = 0; j < 4; j+=2)
      79       {
      80         /* Load two float values into m1 */
      81         m1 = (v2sf) {a[j], a[j+1]};
      82         m2 = (v2sf) {b[j][i], b[j+1][i]};
      83  
      84         /* Multiply and add */
      85         result += m1 * m2;
      86       }
      87       
      88       /* Reduction add at the end */
      89       temp = __builtin_mips_addr_ps (result, result);
      90       d[i] = __builtin_mips_cvt_s_pl (temp);
      91     }
      92  }
      93  
      94  NOMIPS16 void matrix_multiply3()
      95  {
      96    int i;
      97    v2sf m1, m2, n1, n2;
      98    v2sf result, temp;
      99  
     100    m1 = (v2sf) {a[0], a[1]};
     101    m2 = (v2sf) {a[2], a[3]};
     102  
     103    for (i = 0; i < 4; i++)
     104     {
     105       n1 = (v2sf) {b[0][i], b[1][i]};
     106       n2 = (v2sf) {b[2][i], b[3][i]};
     107  
     108       /* Multiply and add */
     109       result = m1 * n1 + m2 * n2;
     110       
     111       /* Reduction add at the end */
     112       temp = __builtin_mips_addr_ps (result, result);
     113       e[i] = __builtin_mips_cvt_s_pl (temp);
     114     }
     115  }
     116  
     117  NOMIPS16 void matrix_multiply4()
     118  {
     119    v2sf m1, m2;
     120    v2sf n1, n2, n3, n4, n5, n6, n7, n8;
     121    v2sf temp1, temp2, temp3, temp4;
     122    v2sf result1, result2;
     123  
     124    /* Load a[0] a[1] values into m1
     125       Load a[2] a[3] values into m2 */
     126    m1 = (v2sf) {a[0], a[1]};
     127    m2 = (v2sf) {a[2], a[3]};
     128  
     129    /* Load b[0][0] b[1][0] values into n1
     130       Load b[2][0] b[3][0] values into n2
     131       Load b[0][1] b[1][1] values into n3
     132       Load b[2][1] b[3][1] values into n4
     133       Load b[0][2] b[1][2] values into n5
     134       Load b[2][2] b[3][2] values into n6
     135       Load b[0][3] b[1][3] values into n7
     136       Load b[2][3] b[3][3] values into n8 */
     137    n1 = (v2sf) {b[0][0], b[1][0]};
     138    n2 = (v2sf) {b[2][0], b[3][0]};
     139    n3 = (v2sf) {b[0][1], b[1][1]};
     140    n4 = (v2sf) {b[2][1], b[3][1]};
     141    n5 = (v2sf) {b[0][2], b[1][2]};
     142    n6 = (v2sf) {b[2][2], b[3][2]};
     143    n7 = (v2sf) {b[0][3], b[1][3]};
     144    n8 = (v2sf) {b[2][3], b[3][3]};
     145  
     146    temp1 = m1 * n1 + m2 * n2;
     147    temp2 = m1 * n3 + m2 * n4;
     148    temp3 = m1 * n5 + m2 * n6;
     149    temp4 = m1 * n7 + m2 * n8;
     150  
     151    result1 = __builtin_mips_addr_ps (temp1, temp2);
     152    result2 = __builtin_mips_addr_ps (temp3, temp4);
     153    
     154    f[0] = __builtin_mips_cvt_s_pu (result1);
     155    f[1] = __builtin_mips_cvt_s_pl (result1);
     156    f[2] = __builtin_mips_cvt_s_pu (result2);
     157    f[3] = __builtin_mips_cvt_s_pl (result2);
     158  }