(root)/
gcc-13.2.0/
libgfortran/
generated/
matmulavx128_r10.c
       1  /* Implementation of the MATMUL intrinsic
       2     Copyright (C) 2002-2023 Free Software Foundation, Inc.
       3     Contributed by Thomas Koenig <tkoenig@gcc.gnu.org>.
       4  
       5  This file is part of the GNU Fortran runtime library (libgfortran).
       6  
       7  Libgfortran is free software; you can redistribute it and/or
       8  modify it under the terms of the GNU General Public
       9  License as published by the Free Software Foundation; either
      10  version 3 of the License, or (at your option) any later version.
      11  
      12  Libgfortran is distributed in the hope that it will be useful,
      13  but WITHOUT ANY WARRANTY; without even the implied warranty of
      14  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      15  GNU General Public License for more details.
      16  
      17  Under Section 7 of GPL version 3, you are granted additional
      18  permissions described in the GCC Runtime Library Exception, version
      19  3.1, as published by the Free Software Foundation.
      20  
      21  You should have received a copy of the GNU General Public License and
      22  a copy of the GCC Runtime Library Exception along with this program;
      23  see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
      24  <http://www.gnu.org/licenses/>.  */
      25  
      26  #include "libgfortran.h"
      27  #include <string.h>
      28  #include <assert.h>
      29  
      30  
      31  /* These are the specific versions of matmul with -mprefer-avx128.  */
      32  
      33  #if defined (HAVE_GFC_REAL_10)
      34  
      35  /* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
      36     passed to us by the front-end, in which case we call it for large
      37     matrices.  */
      38  
      39  typedef void (*blas_call)(const char *, const char *, const int *, const int *,
      40                            const int *, const GFC_REAL_10 *, const GFC_REAL_10 *,
      41                            const int *, const GFC_REAL_10 *, const int *,
      42                            const GFC_REAL_10 *, GFC_REAL_10 *, const int *,
      43                            int, int);
      44  
      45  #if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
      46  void
      47  matmul_r10_avx128_fma3 (gfc_array_r10 * const restrict retarray, 
      48  	gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas,
      49  	int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
      50  internal_proto(matmul_r10_avx128_fma3);
      51  void
      52  matmul_r10_avx128_fma3 (gfc_array_r10 * const restrict retarray, 
      53  	gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas,
      54  	int blas_limit, blas_call gemm)
      55  {
      56    const GFC_REAL_10 * restrict abase;
      57    const GFC_REAL_10 * restrict bbase;
      58    GFC_REAL_10 * restrict dest;
      59  
      60    index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
      61    index_type x, y, n, count, xcount, ycount;
      62  
      63    assert (GFC_DESCRIPTOR_RANK (a) == 2
      64            || GFC_DESCRIPTOR_RANK (b) == 2);
      65  
      66  /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
      67  
      68     Either A or B (but not both) can be rank 1:
      69  
      70     o One-dimensional argument A is implicitly treated as a row matrix
      71       dimensioned [1,count], so xcount=1.
      72  
      73     o One-dimensional argument B is implicitly treated as a column matrix
      74       dimensioned [count, 1], so ycount=1.
      75  */
      76  
      77    if (retarray->base_addr == NULL)
      78      {
      79        if (GFC_DESCRIPTOR_RANK (a) == 1)
      80          {
      81  	  GFC_DIMENSION_SET(retarray->dim[0], 0,
      82  	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
      83          }
      84        else if (GFC_DESCRIPTOR_RANK (b) == 1)
      85          {
      86  	  GFC_DIMENSION_SET(retarray->dim[0], 0,
      87  	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
      88          }
      89        else
      90          {
      91  	  GFC_DIMENSION_SET(retarray->dim[0], 0,
      92  	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
      93  
      94            GFC_DIMENSION_SET(retarray->dim[1], 0,
      95  	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1,
      96  			    GFC_DESCRIPTOR_EXTENT(retarray,0));
      97          }
      98  
      99        retarray->base_addr
     100  	= xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_10));
     101        retarray->offset = 0;
     102      }
     103    else if (unlikely (compile_options.bounds_check))
     104      {
     105        index_type ret_extent, arg_extent;
     106  
     107        if (GFC_DESCRIPTOR_RANK (a) == 1)
     108  	{
     109  	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
     110  	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
     111  	  if (arg_extent != ret_extent)
     112  	    runtime_error ("Array bound mismatch for dimension 1 of "
     113  	    		   "array (%ld/%ld) ",
     114  			   (long int) ret_extent, (long int) arg_extent);
     115  	}
     116        else if (GFC_DESCRIPTOR_RANK (b) == 1)
     117  	{
     118  	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
     119  	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
     120  	  if (arg_extent != ret_extent)
     121  	    runtime_error ("Array bound mismatch for dimension 1 of "
     122  	    		   "array (%ld/%ld) ",
     123  			   (long int) ret_extent, (long int) arg_extent);
     124  	}
     125        else
     126  	{
     127  	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
     128  	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
     129  	  if (arg_extent != ret_extent)
     130  	    runtime_error ("Array bound mismatch for dimension 1 of "
     131  	    		   "array (%ld/%ld) ",
     132  			   (long int) ret_extent, (long int) arg_extent);
     133  
     134  	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
     135  	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
     136  	  if (arg_extent != ret_extent)
     137  	    runtime_error ("Array bound mismatch for dimension 2 of "
     138  	    		   "array (%ld/%ld) ",
     139  			   (long int) ret_extent, (long int) arg_extent);
     140  	}
     141      }
     142  
     143  
     144    if (GFC_DESCRIPTOR_RANK (retarray) == 1)
     145      {
     146        /* One-dimensional result may be addressed in the code below
     147  	 either as a row or a column matrix. We want both cases to
     148  	 work. */
     149        rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
     150      }
     151    else
     152      {
     153        rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
     154        rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
     155      }
     156  
     157  
     158    if (GFC_DESCRIPTOR_RANK (a) == 1)
     159      {
     160        /* Treat it as a a row matrix A[1,count]. */
     161        axstride = GFC_DESCRIPTOR_STRIDE(a,0);
     162        aystride = 1;
     163  
     164        xcount = 1;
     165        count = GFC_DESCRIPTOR_EXTENT(a,0);
     166      }
     167    else
     168      {
     169        axstride = GFC_DESCRIPTOR_STRIDE(a,0);
     170        aystride = GFC_DESCRIPTOR_STRIDE(a,1);
     171  
     172        count = GFC_DESCRIPTOR_EXTENT(a,1);
     173        xcount = GFC_DESCRIPTOR_EXTENT(a,0);
     174      }
     175  
     176    if (count != GFC_DESCRIPTOR_EXTENT(b,0))
     177      {
     178        if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
     179  	runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
     180  		       "in dimension 1: is %ld, should be %ld",
     181  		       (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
     182      }
     183  
     184    if (GFC_DESCRIPTOR_RANK (b) == 1)
     185      {
     186        /* Treat it as a column matrix B[count,1] */
     187        bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
     188  
     189        /* bystride should never be used for 1-dimensional b.
     190           The value is only used for calculation of the
     191           memory by the buffer.  */
     192        bystride = 256;
     193        ycount = 1;
     194      }
     195    else
     196      {
     197        bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
     198        bystride = GFC_DESCRIPTOR_STRIDE(b,1);
     199        ycount = GFC_DESCRIPTOR_EXTENT(b,1);
     200      }
     201  
     202    abase = a->base_addr;
     203    bbase = b->base_addr;
     204    dest = retarray->base_addr;
     205  
     206    /* Now that everything is set up, we perform the multiplication
     207       itself.  */
     208  
     209  #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
     210  #define min(a,b) ((a) <= (b) ? (a) : (b))
     211  #define max(a,b) ((a) >= (b) ? (a) : (b))
     212  
     213    if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
     214        && (bxstride == 1 || bystride == 1)
     215        && (((float) xcount) * ((float) ycount) * ((float) count)
     216            > POW3(blas_limit)))
     217      {
     218        const int m = xcount, n = ycount, k = count, ldc = rystride;
     219        const GFC_REAL_10 one = 1, zero = 0;
     220        const int lda = (axstride == 1) ? aystride : axstride,
     221  		ldb = (bxstride == 1) ? bystride : bxstride;
     222  
     223        if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
     224  	{
     225  	  assert (gemm != NULL);
     226  	  const char *transa, *transb;
     227  	  if (try_blas & 2)
     228  	    transa = "C";
     229  	  else
     230  	    transa = axstride == 1 ? "N" : "T";
     231  
     232  	  if (try_blas & 4)
     233  	    transb = "C";
     234  	  else
     235  	    transb = bxstride == 1 ? "N" : "T";
     236  
     237  	  gemm (transa, transb , &m,
     238  		&n, &k,	&one, abase, &lda, bbase, &ldb, &zero, dest,
     239  		&ldc, 1, 1);
     240  	  return;
     241  	}
     242      }
     243  
     244    if (rxstride == 1 && axstride == 1 && bxstride == 1
     245        && GFC_DESCRIPTOR_RANK (b) != 1)
     246      {
     247        /* This block of code implements a tuned matmul, derived from
     248           Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
     249  
     250                 Bo Kagstrom and Per Ling
     251                 Department of Computing Science
     252                 Umea University
     253                 S-901 87 Umea, Sweden
     254  
     255  	 from netlib.org, translated to C, and modified for matmul.m4.  */
     256  
     257        const GFC_REAL_10 *a, *b;
     258        GFC_REAL_10 *c;
     259        const index_type m = xcount, n = ycount, k = count;
     260  
     261        /* System generated locals */
     262        index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
     263  		 i1, i2, i3, i4, i5, i6;
     264  
     265        /* Local variables */
     266        GFC_REAL_10 f11, f12, f21, f22, f31, f32, f41, f42,
     267  		 f13, f14, f23, f24, f33, f34, f43, f44;
     268        index_type i, j, l, ii, jj, ll;
     269        index_type isec, jsec, lsec, uisec, ujsec, ulsec;
     270        GFC_REAL_10 *t1;
     271  
     272        a = abase;
     273        b = bbase;
     274        c = retarray->base_addr;
     275  
     276        /* Parameter adjustments */
     277        c_dim1 = rystride;
     278        c_offset = 1 + c_dim1;
     279        c -= c_offset;
     280        a_dim1 = aystride;
     281        a_offset = 1 + a_dim1;
     282        a -= a_offset;
     283        b_dim1 = bystride;
     284        b_offset = 1 + b_dim1;
     285        b -= b_offset;
     286  
     287        /* Empty c first.  */
     288        for (j=1; j<=n; j++)
     289  	for (i=1; i<=m; i++)
     290  	  c[i + j * c_dim1] = (GFC_REAL_10)0;
     291  
     292        /* Early exit if possible */
     293        if (m == 0 || n == 0 || k == 0)
     294  	return;
     295  
     296        /* Adjust size of t1 to what is needed.  */
     297        index_type t1_dim, a_sz;
     298        if (aystride == 1)
     299          a_sz = rystride;
     300        else
     301          a_sz = a_dim1;
     302  
     303        t1_dim = a_sz * 256 + b_dim1;
     304        if (t1_dim > 65536)
     305  	t1_dim = 65536;
     306  
     307        t1 = malloc (t1_dim * sizeof(GFC_REAL_10));
     308  
     309        /* Start turning the crank. */
     310        i1 = n;
     311        for (jj = 1; jj <= i1; jj += 512)
     312  	{
     313  	  /* Computing MIN */
     314  	  i2 = 512;
     315  	  i3 = n - jj + 1;
     316  	  jsec = min(i2,i3);
     317  	  ujsec = jsec - jsec % 4;
     318  	  i2 = k;
     319  	  for (ll = 1; ll <= i2; ll += 256)
     320  	    {
     321  	      /* Computing MIN */
     322  	      i3 = 256;
     323  	      i4 = k - ll + 1;
     324  	      lsec = min(i3,i4);
     325  	      ulsec = lsec - lsec % 2;
     326  
     327  	      i3 = m;
     328  	      for (ii = 1; ii <= i3; ii += 256)
     329  		{
     330  		  /* Computing MIN */
     331  		  i4 = 256;
     332  		  i5 = m - ii + 1;
     333  		  isec = min(i4,i5);
     334  		  uisec = isec - isec % 2;
     335  		  i4 = ll + ulsec - 1;
     336  		  for (l = ll; l <= i4; l += 2)
     337  		    {
     338  		      i5 = ii + uisec - 1;
     339  		      for (i = ii; i <= i5; i += 2)
     340  			{
     341  			  t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
     342  					a[i + l * a_dim1];
     343  			  t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
     344  					a[i + (l + 1) * a_dim1];
     345  			  t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
     346  					a[i + 1 + l * a_dim1];
     347  			  t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
     348  					a[i + 1 + (l + 1) * a_dim1];
     349  			}
     350  		      if (uisec < isec)
     351  			{
     352  			  t1[l - ll + 1 + (isec << 8) - 257] =
     353  				    a[ii + isec - 1 + l * a_dim1];
     354  			  t1[l - ll + 2 + (isec << 8) - 257] =
     355  				    a[ii + isec - 1 + (l + 1) * a_dim1];
     356  			}
     357  		    }
     358  		  if (ulsec < lsec)
     359  		    {
     360  		      i4 = ii + isec - 1;
     361  		      for (i = ii; i<= i4; ++i)
     362  			{
     363  			  t1[lsec + ((i - ii + 1) << 8) - 257] =
     364  				    a[i + (ll + lsec - 1) * a_dim1];
     365  			}
     366  		    }
     367  
     368  		  uisec = isec - isec % 4;
     369  		  i4 = jj + ujsec - 1;
     370  		  for (j = jj; j <= i4; j += 4)
     371  		    {
     372  		      i5 = ii + uisec - 1;
     373  		      for (i = ii; i <= i5; i += 4)
     374  			{
     375  			  f11 = c[i + j * c_dim1];
     376  			  f21 = c[i + 1 + j * c_dim1];
     377  			  f12 = c[i + (j + 1) * c_dim1];
     378  			  f22 = c[i + 1 + (j + 1) * c_dim1];
     379  			  f13 = c[i + (j + 2) * c_dim1];
     380  			  f23 = c[i + 1 + (j + 2) * c_dim1];
     381  			  f14 = c[i + (j + 3) * c_dim1];
     382  			  f24 = c[i + 1 + (j + 3) * c_dim1];
     383  			  f31 = c[i + 2 + j * c_dim1];
     384  			  f41 = c[i + 3 + j * c_dim1];
     385  			  f32 = c[i + 2 + (j + 1) * c_dim1];
     386  			  f42 = c[i + 3 + (j + 1) * c_dim1];
     387  			  f33 = c[i + 2 + (j + 2) * c_dim1];
     388  			  f43 = c[i + 3 + (j + 2) * c_dim1];
     389  			  f34 = c[i + 2 + (j + 3) * c_dim1];
     390  			  f44 = c[i + 3 + (j + 3) * c_dim1];
     391  			  i6 = ll + lsec - 1;
     392  			  for (l = ll; l <= i6; ++l)
     393  			    {
     394  			      f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
     395  				      * b[l + j * b_dim1];
     396  			      f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
     397  				      * b[l + j * b_dim1];
     398  			      f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
     399  				      * b[l + (j + 1) * b_dim1];
     400  			      f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
     401  				      * b[l + (j + 1) * b_dim1];
     402  			      f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
     403  				      * b[l + (j + 2) * b_dim1];
     404  			      f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
     405  				      * b[l + (j + 2) * b_dim1];
     406  			      f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
     407  				      * b[l + (j + 3) * b_dim1];
     408  			      f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
     409  				      * b[l + (j + 3) * b_dim1];
     410  			      f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
     411  				      * b[l + j * b_dim1];
     412  			      f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
     413  				      * b[l + j * b_dim1];
     414  			      f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
     415  				      * b[l + (j + 1) * b_dim1];
     416  			      f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
     417  				      * b[l + (j + 1) * b_dim1];
     418  			      f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
     419  				      * b[l + (j + 2) * b_dim1];
     420  			      f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
     421  				      * b[l + (j + 2) * b_dim1];
     422  			      f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
     423  				      * b[l + (j + 3) * b_dim1];
     424  			      f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
     425  				      * b[l + (j + 3) * b_dim1];
     426  			    }
     427  			  c[i + j * c_dim1] = f11;
     428  			  c[i + 1 + j * c_dim1] = f21;
     429  			  c[i + (j + 1) * c_dim1] = f12;
     430  			  c[i + 1 + (j + 1) * c_dim1] = f22;
     431  			  c[i + (j + 2) * c_dim1] = f13;
     432  			  c[i + 1 + (j + 2) * c_dim1] = f23;
     433  			  c[i + (j + 3) * c_dim1] = f14;
     434  			  c[i + 1 + (j + 3) * c_dim1] = f24;
     435  			  c[i + 2 + j * c_dim1] = f31;
     436  			  c[i + 3 + j * c_dim1] = f41;
     437  			  c[i + 2 + (j + 1) * c_dim1] = f32;
     438  			  c[i + 3 + (j + 1) * c_dim1] = f42;
     439  			  c[i + 2 + (j + 2) * c_dim1] = f33;
     440  			  c[i + 3 + (j + 2) * c_dim1] = f43;
     441  			  c[i + 2 + (j + 3) * c_dim1] = f34;
     442  			  c[i + 3 + (j + 3) * c_dim1] = f44;
     443  			}
     444  		      if (uisec < isec)
     445  			{
     446  			  i5 = ii + isec - 1;
     447  			  for (i = ii + uisec; i <= i5; ++i)
     448  			    {
     449  			      f11 = c[i + j * c_dim1];
     450  			      f12 = c[i + (j + 1) * c_dim1];
     451  			      f13 = c[i + (j + 2) * c_dim1];
     452  			      f14 = c[i + (j + 3) * c_dim1];
     453  			      i6 = ll + lsec - 1;
     454  			      for (l = ll; l <= i6; ++l)
     455  				{
     456  				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
     457  					  257] * b[l + j * b_dim1];
     458  				  f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
     459  					  257] * b[l + (j + 1) * b_dim1];
     460  				  f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
     461  					  257] * b[l + (j + 2) * b_dim1];
     462  				  f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
     463  					  257] * b[l + (j + 3) * b_dim1];
     464  				}
     465  			      c[i + j * c_dim1] = f11;
     466  			      c[i + (j + 1) * c_dim1] = f12;
     467  			      c[i + (j + 2) * c_dim1] = f13;
     468  			      c[i + (j + 3) * c_dim1] = f14;
     469  			    }
     470  			}
     471  		    }
     472  		  if (ujsec < jsec)
     473  		    {
     474  		      i4 = jj + jsec - 1;
     475  		      for (j = jj + ujsec; j <= i4; ++j)
     476  			{
     477  			  i5 = ii + uisec - 1;
     478  			  for (i = ii; i <= i5; i += 4)
     479  			    {
     480  			      f11 = c[i + j * c_dim1];
     481  			      f21 = c[i + 1 + j * c_dim1];
     482  			      f31 = c[i + 2 + j * c_dim1];
     483  			      f41 = c[i + 3 + j * c_dim1];
     484  			      i6 = ll + lsec - 1;
     485  			      for (l = ll; l <= i6; ++l)
     486  				{
     487  				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
     488  					  257] * b[l + j * b_dim1];
     489  				  f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
     490  					  257] * b[l + j * b_dim1];
     491  				  f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
     492  					  257] * b[l + j * b_dim1];
     493  				  f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
     494  					  257] * b[l + j * b_dim1];
     495  				}
     496  			      c[i + j * c_dim1] = f11;
     497  			      c[i + 1 + j * c_dim1] = f21;
     498  			      c[i + 2 + j * c_dim1] = f31;
     499  			      c[i + 3 + j * c_dim1] = f41;
     500  			    }
     501  			  i5 = ii + isec - 1;
     502  			  for (i = ii + uisec; i <= i5; ++i)
     503  			    {
     504  			      f11 = c[i + j * c_dim1];
     505  			      i6 = ll + lsec - 1;
     506  			      for (l = ll; l <= i6; ++l)
     507  				{
     508  				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
     509  					  257] * b[l + j * b_dim1];
     510  				}
     511  			      c[i + j * c_dim1] = f11;
     512  			    }
     513  			}
     514  		    }
     515  		}
     516  	    }
     517  	}
     518        free(t1);
     519        return;
     520      }
     521    else if (rxstride == 1 && aystride == 1 && bxstride == 1)
     522      {
     523        if (GFC_DESCRIPTOR_RANK (a) != 1)
     524  	{
     525  	  const GFC_REAL_10 *restrict abase_x;
     526  	  const GFC_REAL_10 *restrict bbase_y;
     527  	  GFC_REAL_10 *restrict dest_y;
     528  	  GFC_REAL_10 s;
     529  
     530  	  for (y = 0; y < ycount; y++)
     531  	    {
     532  	      bbase_y = &bbase[y*bystride];
     533  	      dest_y = &dest[y*rystride];
     534  	      for (x = 0; x < xcount; x++)
     535  		{
     536  		  abase_x = &abase[x*axstride];
     537  		  s = (GFC_REAL_10) 0;
     538  		  for (n = 0; n < count; n++)
     539  		    s += abase_x[n] * bbase_y[n];
     540  		  dest_y[x] = s;
     541  		}
     542  	    }
     543  	}
     544        else
     545  	{
     546  	  const GFC_REAL_10 *restrict bbase_y;
     547  	  GFC_REAL_10 s;
     548  
     549  	  for (y = 0; y < ycount; y++)
     550  	    {
     551  	      bbase_y = &bbase[y*bystride];
     552  	      s = (GFC_REAL_10) 0;
     553  	      for (n = 0; n < count; n++)
     554  		s += abase[n*axstride] * bbase_y[n];
     555  	      dest[y*rystride] = s;
     556  	    }
     557  	}
     558      }
     559    else if (GFC_DESCRIPTOR_RANK (a) == 1)
     560      {
     561        const GFC_REAL_10 *restrict bbase_y;
     562        GFC_REAL_10 s;
     563  
     564        for (y = 0; y < ycount; y++)
     565  	{
     566  	  bbase_y = &bbase[y*bystride];
     567  	  s = (GFC_REAL_10) 0;
     568  	  for (n = 0; n < count; n++)
     569  	    s += abase[n*axstride] * bbase_y[n*bxstride];
     570  	  dest[y*rxstride] = s;
     571  	}
     572      }
     573    else if (axstride < aystride)
     574      {
     575        for (y = 0; y < ycount; y++)
     576  	for (x = 0; x < xcount; x++)
     577  	  dest[x*rxstride + y*rystride] = (GFC_REAL_10)0;
     578  
     579        for (y = 0; y < ycount; y++)
     580  	for (n = 0; n < count; n++)
     581  	  for (x = 0; x < xcount; x++)
     582  	    /* dest[x,y] += a[x,n] * b[n,y] */
     583  	    dest[x*rxstride + y*rystride] +=
     584  					abase[x*axstride + n*aystride] *
     585  					bbase[n*bxstride + y*bystride];
     586      }
     587    else
     588      {
     589        const GFC_REAL_10 *restrict abase_x;
     590        const GFC_REAL_10 *restrict bbase_y;
     591        GFC_REAL_10 *restrict dest_y;
     592        GFC_REAL_10 s;
     593  
     594        for (y = 0; y < ycount; y++)
     595  	{
     596  	  bbase_y = &bbase[y*bystride];
     597  	  dest_y = &dest[y*rystride];
     598  	  for (x = 0; x < xcount; x++)
     599  	    {
     600  	      abase_x = &abase[x*axstride];
     601  	      s = (GFC_REAL_10) 0;
     602  	      for (n = 0; n < count; n++)
     603  		s += abase_x[n*aystride] * bbase_y[n*bxstride];
     604  	      dest_y[x*rxstride] = s;
     605  	    }
     606  	}
     607      }
     608  }
     609  #undef POW3
     610  #undef min
     611  #undef max
     612  
     613  #endif
     614  
     615  #if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
     616  void
     617  matmul_r10_avx128_fma4 (gfc_array_r10 * const restrict retarray, 
     618  	gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas,
     619  	int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
     620  internal_proto(matmul_r10_avx128_fma4);
     621  void
     622  matmul_r10_avx128_fma4 (gfc_array_r10 * const restrict retarray, 
     623  	gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas,
     624  	int blas_limit, blas_call gemm)
     625  {
     626    const GFC_REAL_10 * restrict abase;
     627    const GFC_REAL_10 * restrict bbase;
     628    GFC_REAL_10 * restrict dest;
     629  
     630    index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
     631    index_type x, y, n, count, xcount, ycount;
     632  
     633    assert (GFC_DESCRIPTOR_RANK (a) == 2
     634            || GFC_DESCRIPTOR_RANK (b) == 2);
     635  
     636  /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
     637  
     638     Either A or B (but not both) can be rank 1:
     639  
     640     o One-dimensional argument A is implicitly treated as a row matrix
     641       dimensioned [1,count], so xcount=1.
     642  
     643     o One-dimensional argument B is implicitly treated as a column matrix
     644       dimensioned [count, 1], so ycount=1.
     645  */
     646  
     647    if (retarray->base_addr == NULL)
     648      {
     649        if (GFC_DESCRIPTOR_RANK (a) == 1)
     650          {
     651  	  GFC_DIMENSION_SET(retarray->dim[0], 0,
     652  	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
     653          }
     654        else if (GFC_DESCRIPTOR_RANK (b) == 1)
     655          {
     656  	  GFC_DIMENSION_SET(retarray->dim[0], 0,
     657  	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
     658          }
     659        else
     660          {
     661  	  GFC_DIMENSION_SET(retarray->dim[0], 0,
     662  	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
     663  
     664            GFC_DIMENSION_SET(retarray->dim[1], 0,
     665  	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1,
     666  			    GFC_DESCRIPTOR_EXTENT(retarray,0));
     667          }
     668  
     669        retarray->base_addr
     670  	= xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_10));
     671        retarray->offset = 0;
     672      }
     673    else if (unlikely (compile_options.bounds_check))
     674      {
     675        index_type ret_extent, arg_extent;
     676  
     677        if (GFC_DESCRIPTOR_RANK (a) == 1)
     678  	{
     679  	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
     680  	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
     681  	  if (arg_extent != ret_extent)
     682  	    runtime_error ("Array bound mismatch for dimension 1 of "
     683  	    		   "array (%ld/%ld) ",
     684  			   (long int) ret_extent, (long int) arg_extent);
     685  	}
     686        else if (GFC_DESCRIPTOR_RANK (b) == 1)
     687  	{
     688  	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
     689  	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
     690  	  if (arg_extent != ret_extent)
     691  	    runtime_error ("Array bound mismatch for dimension 1 of "
     692  	    		   "array (%ld/%ld) ",
     693  			   (long int) ret_extent, (long int) arg_extent);
     694  	}
     695        else
     696  	{
     697  	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
     698  	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
     699  	  if (arg_extent != ret_extent)
     700  	    runtime_error ("Array bound mismatch for dimension 1 of "
     701  	    		   "array (%ld/%ld) ",
     702  			   (long int) ret_extent, (long int) arg_extent);
     703  
     704  	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
     705  	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
     706  	  if (arg_extent != ret_extent)
     707  	    runtime_error ("Array bound mismatch for dimension 2 of "
     708  	    		   "array (%ld/%ld) ",
     709  			   (long int) ret_extent, (long int) arg_extent);
     710  	}
     711      }
     712  
     713  
     714    if (GFC_DESCRIPTOR_RANK (retarray) == 1)
     715      {
     716        /* One-dimensional result may be addressed in the code below
     717  	 either as a row or a column matrix. We want both cases to
     718  	 work. */
     719        rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
     720      }
     721    else
     722      {
     723        rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
     724        rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
     725      }
     726  
     727  
     728    if (GFC_DESCRIPTOR_RANK (a) == 1)
     729      {
     730        /* Treat it as a a row matrix A[1,count]. */
     731        axstride = GFC_DESCRIPTOR_STRIDE(a,0);
     732        aystride = 1;
     733  
     734        xcount = 1;
     735        count = GFC_DESCRIPTOR_EXTENT(a,0);
     736      }
     737    else
     738      {
     739        axstride = GFC_DESCRIPTOR_STRIDE(a,0);
     740        aystride = GFC_DESCRIPTOR_STRIDE(a,1);
     741  
     742        count = GFC_DESCRIPTOR_EXTENT(a,1);
     743        xcount = GFC_DESCRIPTOR_EXTENT(a,0);
     744      }
     745  
     746    if (count != GFC_DESCRIPTOR_EXTENT(b,0))
     747      {
     748        if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
     749  	runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
     750  		       "in dimension 1: is %ld, should be %ld",
     751  		       (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
     752      }
     753  
     754    if (GFC_DESCRIPTOR_RANK (b) == 1)
     755      {
     756        /* Treat it as a column matrix B[count,1] */
     757        bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
     758  
     759        /* bystride should never be used for 1-dimensional b.
     760           The value is only used for calculation of the
     761           memory by the buffer.  */
     762        bystride = 256;
     763        ycount = 1;
     764      }
     765    else
     766      {
     767        bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
     768        bystride = GFC_DESCRIPTOR_STRIDE(b,1);
     769        ycount = GFC_DESCRIPTOR_EXTENT(b,1);
     770      }
     771  
     772    abase = a->base_addr;
     773    bbase = b->base_addr;
     774    dest = retarray->base_addr;
     775  
     776    /* Now that everything is set up, we perform the multiplication
     777       itself.  */
     778  
     779  #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
     780  #define min(a,b) ((a) <= (b) ? (a) : (b))
     781  #define max(a,b) ((a) >= (b) ? (a) : (b))
     782  
     783    if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
     784        && (bxstride == 1 || bystride == 1)
     785        && (((float) xcount) * ((float) ycount) * ((float) count)
     786            > POW3(blas_limit)))
     787      {
     788        const int m = xcount, n = ycount, k = count, ldc = rystride;
     789        const GFC_REAL_10 one = 1, zero = 0;
     790        const int lda = (axstride == 1) ? aystride : axstride,
     791  		ldb = (bxstride == 1) ? bystride : bxstride;
     792  
     793        if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
     794  	{
     795  	  assert (gemm != NULL);
     796  	  const char *transa, *transb;
     797  	  if (try_blas & 2)
     798  	    transa = "C";
     799  	  else
     800  	    transa = axstride == 1 ? "N" : "T";
     801  
     802  	  if (try_blas & 4)
     803  	    transb = "C";
     804  	  else
     805  	    transb = bxstride == 1 ? "N" : "T";
     806  
     807  	  gemm (transa, transb , &m,
     808  		&n, &k,	&one, abase, &lda, bbase, &ldb, &zero, dest,
     809  		&ldc, 1, 1);
     810  	  return;
     811  	}
     812      }
     813  
     814    if (rxstride == 1 && axstride == 1 && bxstride == 1
     815        && GFC_DESCRIPTOR_RANK (b) != 1)
     816      {
     817        /* This block of code implements a tuned matmul, derived from
     818           Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
     819  
     820                 Bo Kagstrom and Per Ling
     821                 Department of Computing Science
     822                 Umea University
     823                 S-901 87 Umea, Sweden
     824  
     825  	 from netlib.org, translated to C, and modified for matmul.m4.  */
     826  
     827        const GFC_REAL_10 *a, *b;
     828        GFC_REAL_10 *c;
     829        const index_type m = xcount, n = ycount, k = count;
     830  
     831        /* System generated locals */
     832        index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
     833  		 i1, i2, i3, i4, i5, i6;
     834  
     835        /* Local variables */
     836        GFC_REAL_10 f11, f12, f21, f22, f31, f32, f41, f42,
     837  		 f13, f14, f23, f24, f33, f34, f43, f44;
     838        index_type i, j, l, ii, jj, ll;
     839        index_type isec, jsec, lsec, uisec, ujsec, ulsec;
     840        GFC_REAL_10 *t1;
     841  
     842        a = abase;
     843        b = bbase;
     844        c = retarray->base_addr;
     845  
     846        /* Parameter adjustments */
     847        c_dim1 = rystride;
     848        c_offset = 1 + c_dim1;
     849        c -= c_offset;
     850        a_dim1 = aystride;
     851        a_offset = 1 + a_dim1;
     852        a -= a_offset;
     853        b_dim1 = bystride;
     854        b_offset = 1 + b_dim1;
     855        b -= b_offset;
     856  
     857        /* Empty c first.  */
     858        for (j=1; j<=n; j++)
     859  	for (i=1; i<=m; i++)
     860  	  c[i + j * c_dim1] = (GFC_REAL_10)0;
     861  
     862        /* Early exit if possible */
     863        if (m == 0 || n == 0 || k == 0)
     864  	return;
     865  
     866        /* Adjust size of t1 to what is needed.  */
     867        index_type t1_dim, a_sz;
     868        if (aystride == 1)
     869          a_sz = rystride;
     870        else
     871          a_sz = a_dim1;
     872  
     873        t1_dim = a_sz * 256 + b_dim1;
     874        if (t1_dim > 65536)
     875  	t1_dim = 65536;
     876  
     877        t1 = malloc (t1_dim * sizeof(GFC_REAL_10));
     878  
     879        /* Start turning the crank. */
     880        i1 = n;
     881        for (jj = 1; jj <= i1; jj += 512)
     882  	{
     883  	  /* Computing MIN */
     884  	  i2 = 512;
     885  	  i3 = n - jj + 1;
     886  	  jsec = min(i2,i3);
     887  	  ujsec = jsec - jsec % 4;
     888  	  i2 = k;
     889  	  for (ll = 1; ll <= i2; ll += 256)
     890  	    {
     891  	      /* Computing MIN */
     892  	      i3 = 256;
     893  	      i4 = k - ll + 1;
     894  	      lsec = min(i3,i4);
     895  	      ulsec = lsec - lsec % 2;
     896  
     897  	      i3 = m;
     898  	      for (ii = 1; ii <= i3; ii += 256)
     899  		{
     900  		  /* Computing MIN */
     901  		  i4 = 256;
     902  		  i5 = m - ii + 1;
     903  		  isec = min(i4,i5);
     904  		  uisec = isec - isec % 2;
     905  		  i4 = ll + ulsec - 1;
     906  		  for (l = ll; l <= i4; l += 2)
     907  		    {
     908  		      i5 = ii + uisec - 1;
     909  		      for (i = ii; i <= i5; i += 2)
     910  			{
     911  			  t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
     912  					a[i + l * a_dim1];
     913  			  t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
     914  					a[i + (l + 1) * a_dim1];
     915  			  t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
     916  					a[i + 1 + l * a_dim1];
     917  			  t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
     918  					a[i + 1 + (l + 1) * a_dim1];
     919  			}
     920  		      if (uisec < isec)
     921  			{
     922  			  t1[l - ll + 1 + (isec << 8) - 257] =
     923  				    a[ii + isec - 1 + l * a_dim1];
     924  			  t1[l - ll + 2 + (isec << 8) - 257] =
     925  				    a[ii + isec - 1 + (l + 1) * a_dim1];
     926  			}
     927  		    }
     928  		  if (ulsec < lsec)
     929  		    {
     930  		      i4 = ii + isec - 1;
     931  		      for (i = ii; i<= i4; ++i)
     932  			{
     933  			  t1[lsec + ((i - ii + 1) << 8) - 257] =
     934  				    a[i + (ll + lsec - 1) * a_dim1];
     935  			}
     936  		    }
     937  
     938  		  uisec = isec - isec % 4;
     939  		  i4 = jj + ujsec - 1;
     940  		  for (j = jj; j <= i4; j += 4)
     941  		    {
     942  		      i5 = ii + uisec - 1;
     943  		      for (i = ii; i <= i5; i += 4)
     944  			{
     945  			  f11 = c[i + j * c_dim1];
     946  			  f21 = c[i + 1 + j * c_dim1];
     947  			  f12 = c[i + (j + 1) * c_dim1];
     948  			  f22 = c[i + 1 + (j + 1) * c_dim1];
     949  			  f13 = c[i + (j + 2) * c_dim1];
     950  			  f23 = c[i + 1 + (j + 2) * c_dim1];
     951  			  f14 = c[i + (j + 3) * c_dim1];
     952  			  f24 = c[i + 1 + (j + 3) * c_dim1];
     953  			  f31 = c[i + 2 + j * c_dim1];
     954  			  f41 = c[i + 3 + j * c_dim1];
     955  			  f32 = c[i + 2 + (j + 1) * c_dim1];
     956  			  f42 = c[i + 3 + (j + 1) * c_dim1];
     957  			  f33 = c[i + 2 + (j + 2) * c_dim1];
     958  			  f43 = c[i + 3 + (j + 2) * c_dim1];
     959  			  f34 = c[i + 2 + (j + 3) * c_dim1];
     960  			  f44 = c[i + 3 + (j + 3) * c_dim1];
     961  			  i6 = ll + lsec - 1;
     962  			  for (l = ll; l <= i6; ++l)
     963  			    {
     964  			      f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
     965  				      * b[l + j * b_dim1];
     966  			      f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
     967  				      * b[l + j * b_dim1];
     968  			      f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
     969  				      * b[l + (j + 1) * b_dim1];
     970  			      f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
     971  				      * b[l + (j + 1) * b_dim1];
     972  			      f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
     973  				      * b[l + (j + 2) * b_dim1];
     974  			      f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
     975  				      * b[l + (j + 2) * b_dim1];
     976  			      f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
     977  				      * b[l + (j + 3) * b_dim1];
     978  			      f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
     979  				      * b[l + (j + 3) * b_dim1];
     980  			      f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
     981  				      * b[l + j * b_dim1];
     982  			      f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
     983  				      * b[l + j * b_dim1];
     984  			      f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
     985  				      * b[l + (j + 1) * b_dim1];
     986  			      f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
     987  				      * b[l + (j + 1) * b_dim1];
     988  			      f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
     989  				      * b[l + (j + 2) * b_dim1];
     990  			      f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
     991  				      * b[l + (j + 2) * b_dim1];
     992  			      f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
     993  				      * b[l + (j + 3) * b_dim1];
     994  			      f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
     995  				      * b[l + (j + 3) * b_dim1];
     996  			    }
     997  			  c[i + j * c_dim1] = f11;
     998  			  c[i + 1 + j * c_dim1] = f21;
     999  			  c[i + (j + 1) * c_dim1] = f12;
    1000  			  c[i + 1 + (j + 1) * c_dim1] = f22;
    1001  			  c[i + (j + 2) * c_dim1] = f13;
    1002  			  c[i + 1 + (j + 2) * c_dim1] = f23;
    1003  			  c[i + (j + 3) * c_dim1] = f14;
    1004  			  c[i + 1 + (j + 3) * c_dim1] = f24;
    1005  			  c[i + 2 + j * c_dim1] = f31;
    1006  			  c[i + 3 + j * c_dim1] = f41;
    1007  			  c[i + 2 + (j + 1) * c_dim1] = f32;
    1008  			  c[i + 3 + (j + 1) * c_dim1] = f42;
    1009  			  c[i + 2 + (j + 2) * c_dim1] = f33;
    1010  			  c[i + 3 + (j + 2) * c_dim1] = f43;
    1011  			  c[i + 2 + (j + 3) * c_dim1] = f34;
    1012  			  c[i + 3 + (j + 3) * c_dim1] = f44;
    1013  			}
    1014  		      if (uisec < isec)
    1015  			{
    1016  			  i5 = ii + isec - 1;
    1017  			  for (i = ii + uisec; i <= i5; ++i)
    1018  			    {
    1019  			      f11 = c[i + j * c_dim1];
    1020  			      f12 = c[i + (j + 1) * c_dim1];
    1021  			      f13 = c[i + (j + 2) * c_dim1];
    1022  			      f14 = c[i + (j + 3) * c_dim1];
    1023  			      i6 = ll + lsec - 1;
    1024  			      for (l = ll; l <= i6; ++l)
    1025  				{
    1026  				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
    1027  					  257] * b[l + j * b_dim1];
    1028  				  f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
    1029  					  257] * b[l + (j + 1) * b_dim1];
    1030  				  f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
    1031  					  257] * b[l + (j + 2) * b_dim1];
    1032  				  f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
    1033  					  257] * b[l + (j + 3) * b_dim1];
    1034  				}
    1035  			      c[i + j * c_dim1] = f11;
    1036  			      c[i + (j + 1) * c_dim1] = f12;
    1037  			      c[i + (j + 2) * c_dim1] = f13;
    1038  			      c[i + (j + 3) * c_dim1] = f14;
    1039  			    }
    1040  			}
    1041  		    }
    1042  		  if (ujsec < jsec)
    1043  		    {
    1044  		      i4 = jj + jsec - 1;
    1045  		      for (j = jj + ujsec; j <= i4; ++j)
    1046  			{
    1047  			  i5 = ii + uisec - 1;
    1048  			  for (i = ii; i <= i5; i += 4)
    1049  			    {
    1050  			      f11 = c[i + j * c_dim1];
    1051  			      f21 = c[i + 1 + j * c_dim1];
    1052  			      f31 = c[i + 2 + j * c_dim1];
    1053  			      f41 = c[i + 3 + j * c_dim1];
    1054  			      i6 = ll + lsec - 1;
    1055  			      for (l = ll; l <= i6; ++l)
    1056  				{
    1057  				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
    1058  					  257] * b[l + j * b_dim1];
    1059  				  f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
    1060  					  257] * b[l + j * b_dim1];
    1061  				  f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
    1062  					  257] * b[l + j * b_dim1];
    1063  				  f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
    1064  					  257] * b[l + j * b_dim1];
    1065  				}
    1066  			      c[i + j * c_dim1] = f11;
    1067  			      c[i + 1 + j * c_dim1] = f21;
    1068  			      c[i + 2 + j * c_dim1] = f31;
    1069  			      c[i + 3 + j * c_dim1] = f41;
    1070  			    }
    1071  			  i5 = ii + isec - 1;
    1072  			  for (i = ii + uisec; i <= i5; ++i)
    1073  			    {
    1074  			      f11 = c[i + j * c_dim1];
    1075  			      i6 = ll + lsec - 1;
    1076  			      for (l = ll; l <= i6; ++l)
    1077  				{
    1078  				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
    1079  					  257] * b[l + j * b_dim1];
    1080  				}
    1081  			      c[i + j * c_dim1] = f11;
    1082  			    }
    1083  			}
    1084  		    }
    1085  		}
    1086  	    }
    1087  	}
    1088        free(t1);
    1089        return;
    1090      }
    1091    else if (rxstride == 1 && aystride == 1 && bxstride == 1)
    1092      {
    1093        if (GFC_DESCRIPTOR_RANK (a) != 1)
    1094  	{
    1095  	  const GFC_REAL_10 *restrict abase_x;
    1096  	  const GFC_REAL_10 *restrict bbase_y;
    1097  	  GFC_REAL_10 *restrict dest_y;
    1098  	  GFC_REAL_10 s;
    1099  
    1100  	  for (y = 0; y < ycount; y++)
    1101  	    {
    1102  	      bbase_y = &bbase[y*bystride];
    1103  	      dest_y = &dest[y*rystride];
    1104  	      for (x = 0; x < xcount; x++)
    1105  		{
    1106  		  abase_x = &abase[x*axstride];
    1107  		  s = (GFC_REAL_10) 0;
    1108  		  for (n = 0; n < count; n++)
    1109  		    s += abase_x[n] * bbase_y[n];
    1110  		  dest_y[x] = s;
    1111  		}
    1112  	    }
    1113  	}
    1114        else
    1115  	{
    1116  	  const GFC_REAL_10 *restrict bbase_y;
    1117  	  GFC_REAL_10 s;
    1118  
    1119  	  for (y = 0; y < ycount; y++)
    1120  	    {
    1121  	      bbase_y = &bbase[y*bystride];
    1122  	      s = (GFC_REAL_10) 0;
    1123  	      for (n = 0; n < count; n++)
    1124  		s += abase[n*axstride] * bbase_y[n];
    1125  	      dest[y*rystride] = s;
    1126  	    }
    1127  	}
    1128      }
    1129    else if (GFC_DESCRIPTOR_RANK (a) == 1)
    1130      {
    1131        const GFC_REAL_10 *restrict bbase_y;
    1132        GFC_REAL_10 s;
    1133  
    1134        for (y = 0; y < ycount; y++)
    1135  	{
    1136  	  bbase_y = &bbase[y*bystride];
    1137  	  s = (GFC_REAL_10) 0;
    1138  	  for (n = 0; n < count; n++)
    1139  	    s += abase[n*axstride] * bbase_y[n*bxstride];
    1140  	  dest[y*rxstride] = s;
    1141  	}
    1142      }
    1143    else if (axstride < aystride)
    1144      {
    1145        for (y = 0; y < ycount; y++)
    1146  	for (x = 0; x < xcount; x++)
    1147  	  dest[x*rxstride + y*rystride] = (GFC_REAL_10)0;
    1148  
    1149        for (y = 0; y < ycount; y++)
    1150  	for (n = 0; n < count; n++)
    1151  	  for (x = 0; x < xcount; x++)
    1152  	    /* dest[x,y] += a[x,n] * b[n,y] */
    1153  	    dest[x*rxstride + y*rystride] +=
    1154  					abase[x*axstride + n*aystride] *
    1155  					bbase[n*bxstride + y*bystride];
    1156      }
    1157    else
    1158      {
    1159        const GFC_REAL_10 *restrict abase_x;
    1160        const GFC_REAL_10 *restrict bbase_y;
    1161        GFC_REAL_10 *restrict dest_y;
    1162        GFC_REAL_10 s;
    1163  
    1164        for (y = 0; y < ycount; y++)
    1165  	{
    1166  	  bbase_y = &bbase[y*bystride];
    1167  	  dest_y = &dest[y*rystride];
    1168  	  for (x = 0; x < xcount; x++)
    1169  	    {
    1170  	      abase_x = &abase[x*axstride];
    1171  	      s = (GFC_REAL_10) 0;
    1172  	      for (n = 0; n < count; n++)
    1173  		s += abase_x[n*aystride] * bbase_y[n*bxstride];
    1174  	      dest_y[x*rxstride] = s;
    1175  	    }
    1176  	}
    1177      }
    1178  }
    1179  #undef POW3
    1180  #undef min
    1181  #undef max
    1182  
    1183  #endif
    1184  
    1185  #endif
    1186