1  /* matrix22_mul.c.
       2  
       3     Contributed by Niels Möller and Marco Bodrato.
       4  
       5     THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
       6     SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
       7     GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
       8  
       9  Copyright 2003-2005, 2008, 2009 Free Software Foundation, Inc.
      10  
      11  This file is part of the GNU MP Library.
      12  
      13  The GNU MP Library is free software; you can redistribute it and/or modify
      14  it under the terms of either:
      15  
      16    * the GNU Lesser General Public License as published by the Free
      17      Software Foundation; either version 3 of the License, or (at your
      18      option) any later version.
      19  
      20  or
      21  
      22    * the GNU General Public License as published by the Free Software
      23      Foundation; either version 2 of the License, or (at your option) any
      24      later version.
      25  
      26  or both in parallel, as here.
      27  
      28  The GNU MP Library is distributed in the hope that it will be useful, but
      29  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
      30  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
      31  for more details.
      32  
      33  You should have received copies of the GNU General Public License and the
      34  GNU Lesser General Public License along with the GNU MP Library.  If not,
      35  see https://www.gnu.org/licenses/.  */
      36  
      37  #include "gmp-impl.h"
      38  #include "longlong.h"
      39  
      40  #define MUL(rp, ap, an, bp, bn) do {		\
      41    if (an >= bn)					\
      42      mpn_mul (rp, ap, an, bp, bn);		\
      43    else						\
      44      mpn_mul (rp, bp, bn, ap, an);		\
      45  } while (0)
      46  
      47  /* Inputs are unsigned. */
      48  static int
      49  abs_sub_n (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t n)
      50  {
      51    int c;
      52    MPN_CMP (c, ap, bp, n);
      53    if (c >= 0)
      54      {
      55        mpn_sub_n (rp, ap, bp, n);
      56        return 0;
      57      }
      58    else
      59      {
      60        mpn_sub_n (rp, bp, ap, n);
      61        return 1;
      62      }
      63  }
      64  
      65  static int
      66  add_signed_n (mp_ptr rp,
      67  	      mp_srcptr ap, int as, mp_srcptr bp, int bs, mp_size_t n)
      68  {
      69    if (as != bs)
      70      return as ^ abs_sub_n (rp, ap, bp, n);
      71    else
      72      {
      73        ASSERT_NOCARRY (mpn_add_n (rp, ap, bp, n));
      74        return as;
      75      }
      76  }
      77  
      78  mp_size_t
      79  mpn_matrix22_mul_itch (mp_size_t rn, mp_size_t mn)
      80  {
      81    if (BELOW_THRESHOLD (rn, MATRIX22_STRASSEN_THRESHOLD)
      82        || BELOW_THRESHOLD (mn, MATRIX22_STRASSEN_THRESHOLD))
      83      return 3*rn + 2*mn;
      84    else
      85      return 3*(rn + mn) + 5;
      86  }
      87  
      88  /* Algorithm:
      89  
      90      / s0 \   /  1  0  0  0 \ / r0 \
      91      | s1 |   |  0  1  0  1 | | r1 |
      92      | s2 |   |  0  0 -1  1 | | r2 |
      93      | s3 | = |  0  1 -1  1 | \ r3 /
      94      | s4 |   | -1  1 -1  1 |
      95      | s5 |   |  0  1  0  0 |
      96      \ s6 /   \  0  0  1  0 /
      97  
      98      / t0 \   /  1  0  0  0 \ / m0 \
      99      | t1 |   |  0  1  0  1 | | m1 |
     100      | t2 |   |  0  0 -1  1 | | m2 |
     101      | t3 | = |  0  1 -1  1 | \ m3 /
     102      | t4 |   | -1  1 -1  1 |
     103      | t5 |   |  0  1  0  0 |
     104      \ t6 /   \  0  0  1  0 /
     105  
     106    Note: the two matrices above are the same, but s_i and t_i are used
     107    in the same product, only for i<4, see "A Strassen-like Matrix
     108    Multiplication suited for squaring and higher power computation" by
     109    M. Bodrato, in Proceedings of ISSAC 2010.
     110  
     111      / r0 \   / 1 0  0  0  0  1  0 \ / s0*t0 \
     112      | r1 | = | 0 0 -1  1 -1  1  0 | | s1*t1 |
     113      | r2 |   | 0 1  0 -1  0 -1 -1 | | s2*t2 |
     114      \ r3 /   \ 0 1  1 -1  0 -1  0 / | s3*t3 |
     115  				    | s4*t5 |
     116  				    | s5*t6 |
     117  				    \ s6*t4 /
     118  
     119    The scheduling uses two temporaries U0 and U1 to store products, and
     120    two, S0 and T0, to store combinations of entries of the two
     121    operands.
     122  */
     123  
     124  /* Computes R = R * M. Elements are numbers R = (r0, r1; r2, r3).
     125   *
     126   * Resulting elements are of size up to rn + mn + 1.
     127   *
     128   * Temporary storage: 3 rn + 3 mn + 5. */
     129  static void
     130  mpn_matrix22_mul_strassen (mp_ptr r0, mp_ptr r1, mp_ptr r2, mp_ptr r3, mp_size_t rn,
     131  			   mp_srcptr m0, mp_srcptr m1, mp_srcptr m2, mp_srcptr m3, mp_size_t mn,
     132  			   mp_ptr tp)
     133  {
     134    mp_ptr s0, t0, u0, u1;
     135    int r1s, r3s, s0s, t0s, u1s;
     136    s0 = tp; tp += rn + 1;
     137    t0 = tp; tp += mn + 1;
     138    u0 = tp; tp += rn + mn + 1;
     139    u1 = tp; /* rn + mn + 2 */
     140  
     141    MUL (u0, r1, rn, m2, mn);		/* u5 = s5 * t6 */
     142    r3s = abs_sub_n (r3, r3, r2, rn);	/* r3 - r2 */
     143    if (r3s)
     144      {
     145        r1s = abs_sub_n (r1, r1, r3, rn);
     146        r1[rn] = 0;
     147      }
     148    else
     149      {
     150        r1[rn] = mpn_add_n (r1, r1, r3, rn);
     151        r1s = 0;				/* r1 - r2 + r3  */
     152      }
     153    if (r1s)
     154      {
     155        s0[rn] = mpn_add_n (s0, r1, r0, rn);
     156        s0s = 0;
     157      }
     158    else if (r1[rn] != 0)
     159      {
     160        s0[rn] = r1[rn] - mpn_sub_n (s0, r1, r0, rn);
     161        s0s = 1;				/* s4 = -r0 + r1 - r2 + r3 */
     162  					/* Reverse sign! */
     163      }
     164    else
     165      {
     166        s0s = abs_sub_n (s0, r0, r1, rn);
     167        s0[rn] = 0;
     168      }
     169    MUL (u1, r0, rn, m0, mn);		/* u0 = s0 * t0 */
     170    r0[rn+mn] = mpn_add_n (r0, u0, u1, rn + mn);
     171    ASSERT (r0[rn+mn] < 2);		/* u0 + u5 */
     172  
     173    t0s = abs_sub_n (t0, m3, m2, mn);
     174    u1s = r3s^t0s^1;			/* Reverse sign! */
     175    MUL (u1, r3, rn, t0, mn);		/* u2 = s2 * t2 */
     176    u1[rn+mn] = 0;
     177    if (t0s)
     178      {
     179        t0s = abs_sub_n (t0, m1, t0, mn);
     180        t0[mn] = 0;
     181      }
     182    else
     183      {
     184        t0[mn] = mpn_add_n (t0, t0, m1, mn);
     185      }
     186  
     187    /* FIXME: Could be simplified if we had space for rn + mn + 2 limbs
     188       at r3. I'd expect that for matrices of random size, the high
     189       words t0[mn] and r1[rn] are non-zero with a pretty small
     190       probability. If that can be confirmed this should be done as an
     191       unconditional rn x (mn+1) followed by an if (UNLIKELY (r1[rn]))
     192       add_n. */
     193    if (t0[mn] != 0)
     194      {
     195        MUL (r3, r1, rn, t0, mn + 1);	/* u3 = s3 * t3 */
     196        ASSERT (r1[rn] < 2);
     197        if (r1[rn] != 0)
     198  	mpn_add_n (r3 + rn, r3 + rn, t0, mn + 1);
     199      }
     200    else
     201      {
     202        MUL (r3, r1, rn + 1, t0, mn);
     203      }
     204  
     205    ASSERT (r3[rn+mn] < 4);
     206  
     207    u0[rn+mn] = 0;
     208    if (r1s^t0s)
     209      {
     210        r3s = abs_sub_n (r3, u0, r3, rn + mn + 1);
     211      }
     212    else
     213      {
     214        ASSERT_NOCARRY (mpn_add_n (r3, r3, u0, rn + mn + 1));
     215        r3s = 0;				/* u3 + u5 */
     216      }
     217  
     218    if (t0s)
     219      {
     220        t0[mn] = mpn_add_n (t0, t0, m0, mn);
     221      }
     222    else if (t0[mn] != 0)
     223      {
     224        t0[mn] -= mpn_sub_n (t0, t0, m0, mn);
     225      }
     226    else
     227      {
     228        t0s = abs_sub_n (t0, t0, m0, mn);
     229      }
     230    MUL (u0, r2, rn, t0, mn + 1);		/* u6 = s6 * t4 */
     231    ASSERT (u0[rn+mn] < 2);
     232    if (r1s)
     233      {
     234        ASSERT_NOCARRY (mpn_sub_n (r1, r2, r1, rn));
     235      }
     236    else
     237      {
     238        r1[rn] += mpn_add_n (r1, r1, r2, rn);
     239      }
     240    rn++;
     241    t0s = add_signed_n (r2, r3, r3s, u0, t0s, rn + mn);
     242  					/* u3 + u5 + u6 */
     243    ASSERT (r2[rn+mn-1] < 4);
     244    r3s = add_signed_n (r3, r3, r3s, u1, u1s, rn + mn);
     245  					/* -u2 + u3 + u5  */
     246    ASSERT (r3[rn+mn-1] < 3);
     247    MUL (u0, s0, rn, m1, mn);		/* u4 = s4 * t5 */
     248    ASSERT (u0[rn+mn-1] < 2);
     249    t0[mn] = mpn_add_n (t0, m3, m1, mn);
     250    MUL (u1, r1, rn, t0, mn + 1);		/* u1 = s1 * t1 */
     251    mn += rn;
     252    ASSERT (u1[mn-1] < 4);
     253    ASSERT (u1[mn] == 0);
     254    ASSERT_NOCARRY (add_signed_n (r1, r3, r3s, u0, s0s, mn));
     255  					/* -u2 + u3 - u4 + u5  */
     256    ASSERT (r1[mn-1] < 2);
     257    if (r3s)
     258      {
     259        ASSERT_NOCARRY (mpn_add_n (r3, u1, r3, mn));
     260      }
     261    else
     262      {
     263        ASSERT_NOCARRY (mpn_sub_n (r3, u1, r3, mn));
     264  					/* u1 + u2 - u3 - u5  */
     265      }
     266    ASSERT (r3[mn-1] < 2);
     267    if (t0s)
     268      {
     269        ASSERT_NOCARRY (mpn_add_n (r2, u1, r2, mn));
     270      }
     271    else
     272      {
     273        ASSERT_NOCARRY (mpn_sub_n (r2, u1, r2, mn));
     274  					/* u1 - u3 - u5 - u6  */
     275      }
     276    ASSERT (r2[mn-1] < 2);
     277  }
     278  
     279  void
     280  mpn_matrix22_mul (mp_ptr r0, mp_ptr r1, mp_ptr r2, mp_ptr r3, mp_size_t rn,
     281  		  mp_srcptr m0, mp_srcptr m1, mp_srcptr m2, mp_srcptr m3, mp_size_t mn,
     282  		  mp_ptr tp)
     283  {
     284    if (BELOW_THRESHOLD (rn, MATRIX22_STRASSEN_THRESHOLD)
     285        || BELOW_THRESHOLD (mn, MATRIX22_STRASSEN_THRESHOLD))
     286      {
     287        mp_ptr p0, p1;
     288        unsigned i;
     289  
     290        /* Temporary storage: 3 rn + 2 mn */
     291        p0 = tp + rn;
     292        p1 = p0 + rn + mn;
     293  
     294        for (i = 0; i < 2; i++)
     295  	{
     296  	  MPN_COPY (tp, r0, rn);
     297  
     298  	  if (rn >= mn)
     299  	    {
     300  	      mpn_mul (p0, r0, rn, m0, mn);
     301  	      mpn_mul (p1, r1, rn, m3, mn);
     302  	      mpn_mul (r0, r1, rn, m2, mn);
     303  	      mpn_mul (r1, tp, rn, m1, mn);
     304  	    }
     305  	  else
     306  	    {
     307  	      mpn_mul (p0, m0, mn, r0, rn);
     308  	      mpn_mul (p1, m3, mn, r1, rn);
     309  	      mpn_mul (r0, m2, mn, r1, rn);
     310  	      mpn_mul (r1, m1, mn, tp, rn);
     311  	    }
     312  	  r0[rn+mn] = mpn_add_n (r0, r0, p0, rn + mn);
     313  	  r1[rn+mn] = mpn_add_n (r1, r1, p1, rn + mn);
     314  
     315  	  r0 = r2; r1 = r3;
     316  	}
     317      }
     318    else
     319      mpn_matrix22_mul_strassen (r0, r1, r2, r3, rn,
     320  			       m0, m1, m2, m3, mn, tp);
     321  }