gmp-6.3.0/mpn/generic/toom3_sqr.c
/* mpn_toom3_sqr -- Square {ap,an}.

   Contributed to the GNU project by Torbjorn Granlund.
   Additional improvements by Marco Bodrato.

   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

Copyright 2006-2010, 2012, 2015, 2021 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */


#include "gmp-impl.h"

/* Evaluate in: -1, 0, +1, +2, +inf

  <-s--><--n--><--n-->
   ____ ______ ______
  |_a2_|___a1_|___a0_|

  v0  =  a0         ^2 #   A(0)^2
  v1  = (a0+ a1+ a2)^2 #   A(1)^2    ah  <= 2
  vm1 = (a0- a1+ a2)^2 #  A(-1)^2   |ah| <= 1
  v2  = (a0+2a1+4a2)^2 #   A(2)^2    ah  <= 6
  vinf=          a2 ^2 # A(inf)^2
*/
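
/* Here A(x) = a2 x^2 + a1 x + a0 is the polynomial whose coefficients are
   the three n-limb (resp. s-limb) chunks of {ap,an}, so the input value is
   A(B^n) with B the limb base.  Its square B(x) = A(x)^2 has degree 4 and
   hence five coefficients, which the five sampled values above determine
   uniquely; mpn_toom_interpolate_5pts recombines them at the end.  The
   "ah" bounds give the largest value the extra high limb of each
   evaluation can take, e.g. a0+2a1+4a2 < 7 B^n so its high limb is at
   most 6.  */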

#if TUNE_PROGRAM_BUILD || WANT_FAT_BINARY
#define MAYBE_sqr_basecase 1
#define MAYBE_sqr_toom3   1
#else
#define MAYBE_sqr_basecase						\
  (SQR_TOOM3_THRESHOLD < 3 * SQR_TOOM2_THRESHOLD)
#define MAYBE_sqr_toom3							\
  (SQR_TOOM4_THRESHOLD >= 3 * SQR_TOOM3_THRESHOLD)
#endif

#define TOOM3_SQR_REC(p, a, n, ws)					\
  do {									\
    if (MAYBE_sqr_basecase						\
	&& BELOW_THRESHOLD (n, SQR_TOOM2_THRESHOLD))			\
      mpn_sqr_basecase (p, a, n);					\
    else if (! MAYBE_sqr_toom3						\
	     || BELOW_THRESHOLD (n, SQR_TOOM3_THRESHOLD))		\
      mpn_toom2_sqr (p, a, n, ws);					\
    else								\
      mpn_toom3_sqr (p, a, n, ws);					\
  } while (0)
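
/* The MAYBE_* flags let the compiler discard statically unreachable
   branches of the recursion above.  The recursive operands have roughly
   an/3 limbs, and Toom-3 is chosen roughly for SQR_TOOM3_THRESHOLD <= an
   < SQR_TOOM4_THRESHOLD, so a recursive call can reach sqr_basecase only
   if SQR_TOOM3_THRESHOLD / 3 falls below SQR_TOOM2_THRESHOLD, and can
   recurse into toom3_sqr only if SQR_TOOM4_THRESHOLD / 3 reaches
   SQR_TOOM3_THRESHOLD.  When tuning or in a fat binary the thresholds are
   not compile-time constants, so both possibilities must be kept.  */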

void
mpn_toom3_sqr (mp_ptr pp,
	       mp_srcptr ap, mp_size_t an,
	       mp_ptr scratch)
{
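  /* In fat-binary builds the SQR_*_THRESHOLD macros used below expand to
     fields of __gmpn_cpuvec, each access guarded by the global flag
     __gmpn_cpuvec_initialized.  Shadowing that flag with a local constant
     1 promises the compiler that the vector is already set up by the time
     this internal function is reached, letting it drop the guard tests
     (this appears to be the intent of the declaration below).  */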
  const int __gmpn_cpuvec_initialized = 1;
  mp_size_t n, s;
  mp_limb_t cy, vinf0;
  mp_ptr gp;
  mp_ptr as1, asm1, as2;

#define a0  ap
#define a1  (ap + n)
#define a2  (ap + 2*n)

  n = (an + 2) / (size_t) 3;

  s = an - 2 * n;

  ASSERT (0 < s && s <= n);

  as1 = scratch + 4 * n + 4;
  asm1 = scratch + 2 * n + 2;
  as2 = pp + n + 1;

  gp = scratch;
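  /* Pointer layout: gp is an n-limb staging area at the bottom of scratch
     for a0 + a2; it is dead once as1 and asm1 are formed, and the same
     space is later reused for vm1.  as2 is parked in the product area at
     pp + n + 1, which is safe because it is consumed (squared into v2)
     before v1 and v0 are written there.  */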

  /* Compute as1 and asm1.  */
  cy = mpn_add (gp, a0, n, a2, s);
#if HAVE_NATIVE_mpn_add_n_sub_n
  if (cy == 0 && mpn_cmp (gp, a1, n) < 0)
    {
      cy = mpn_add_n_sub_n (as1, asm1, a1, gp, n);
      as1[n] = cy >> 1;
      asm1[n] = 0;
    }
  else
    {
      mp_limb_t cy2;
      cy2 = mpn_add_n_sub_n (as1, asm1, gp, a1, n);
      as1[n] = cy + (cy2 >> 1);
      asm1[n] = cy - (cy2 & 1);
    }
#else
  as1[n] = cy + mpn_add_n (as1, gp, a1, n);
  if (cy == 0 && mpn_cmp (gp, a1, n) < 0)
    {
      mpn_sub_n (asm1, a1, gp, n);
      asm1[n] = 0;
    }
  else
    {
      cy -= mpn_sub_n (asm1, gp, a1, n);
      asm1[n] = cy;
    }
#endif
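
  /* Now as1 = a0 + a1 + a2 = A(1) and asm1 = |a0 - a1 + a2| = |A(-1)|,
     each n limbs plus the high limbs as1[n] <= 2 and asm1[n] <= 1.  Since
     vm1 is a square, the sign of A(-1) is irrelevant and only the absolute
     value is kept.  */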

  /* Compute as2.  */
#if HAVE_NATIVE_mpn_rsblsh1_n
  cy = mpn_add_n (as2, a2, as1, s);
  if (s != n)
    cy = mpn_add_1 (as2 + s, as1 + s, n - s, cy);
  cy += as1[n];
  cy = 2 * cy + mpn_rsblsh1_n (as2, a0, as2, n);
#else
#if HAVE_NATIVE_mpn_addlsh1_n
  cy  = mpn_addlsh1_n (as2, a1, a2, s);
  if (s != n)
    cy = mpn_add_1 (as2 + s, a1 + s, n - s, cy);
  cy = 2 * cy + mpn_addlsh1_n (as2, a0, as2, n);
#else
  cy = mpn_add_n (as2, a2, as1, s);
  if (s != n)
    cy = mpn_add_1 (as2 + s, as1 + s, n - s, cy);
  cy += as1[n];
  cy = 2 * cy + mpn_lshift (as2, as2, n, 1);
  cy -= mpn_sub_n (as2, as2, a0, n);
#endif
#endif
  as2[n] = cy;
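
  /* All three variants compute as2 = a0 + 2 a1 + 4 a2 = A(2): with
     rsblsh1_n as 2 (as1 + a2) - a0, with addlsh1_n as a0 + 2 (a1 + 2 a2),
     and otherwise by an explicit shift and subtract.  */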

  ASSERT (as1[n] <= 2);
  ASSERT (asm1[n] <= 1);

#define v0    pp				/* 2n */
#define v1    (pp + 2 * n)			/* 2n+1 */
#define vinf  (pp + 4 * n)			/* s+s */
#define vm1   scratch				/* 2n+1 */
#define v2    (scratch + 2 * n + 1)		/* 2n+2 */
#define scratch_out  (scratch + 5 * n + 5)
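
  /* v0 and vinf land directly in their final positions in the product
     area; v1 sits between them, its top limb overlapping vinf[0].  vm1
     and v2 live in scratch; note that v2 overlaps asm1, so vm1 (which
     reads asm1) must be computed before v2.  */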

  /* vm1, 2n+1 limbs */
#ifdef SMALLER_RECURSION
  TOOM3_SQR_REC (vm1, asm1, n, scratch_out);
  cy = asm1[n];
  if (cy != 0)
    {
#if HAVE_NATIVE_mpn_addlsh1_n_ip1
      cy += mpn_addlsh1_n_ip1 (vm1 + n, asm1, n);
#else
      cy += mpn_addmul_1 (vm1 + n, asm1, n, CNST_LIMB(2));
#endif
    }
  vm1[2 * n] = cy;
#else
  vm1[2 * n] = 0;
  TOOM3_SQR_REC (vm1, asm1, n + asm1[n], scratch_out);
#endif
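
  /* With SMALLER_RECURSION, only the low n limbs of asm1 are squared and
     the high limb h = asm1[n] is folded in afterwards, using
     (asm1 + h B^n)^2 = asm1^2 + 2 h asm1 B^n + h^2 B^2n, where h <= 1 so
     h^2 = h.  Otherwise the recursion simply squares all n + asm1[n]
     limbs.  */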

  TOOM3_SQR_REC (v2, as2, n + 1, scratch_out);	/* v2, 2n+1 limbs
						   (2n+2 written, top zero) */

  TOOM3_SQR_REC (vinf, a2, s, scratch_out);	/* vinf, s+s limbs */

  vinf0 = vinf[0];				/* v1 overlaps with this */
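
  /* vinf was just written at pp + 4n, and v1's top limb (v1[2n], i.e.
     pp[4n]) will land on vinf[0], so save that limb before computing
     v1.  */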

#ifdef SMALLER_RECURSION
  /* v1, 2n+1 limbs */
  TOOM3_SQR_REC (v1, as1, n, scratch_out);
  cy = as1[n];
  if (cy == 1)
    {
#if HAVE_NATIVE_mpn_addlsh1_n_ip1
      cy += mpn_addlsh1_n_ip1 (v1 + n, as1, n);
#else
      cy += mpn_addmul_1 (v1 + n, as1, n, CNST_LIMB(2));
#endif
    }
  else if (cy != 0)
    {
#if HAVE_NATIVE_mpn_addlsh2_n_ip1
      cy = 4 + mpn_addlsh2_n_ip1 (v1 + n, as1, n);
#else
      cy = 4 + mpn_addmul_1 (v1 + n, as1, n, CNST_LIMB(4));
#endif
    }
  v1[2 * n] = cy;
#else
  cy = vinf[1];
  TOOM3_SQR_REC (v1, as1, n + 1, scratch_out);
  vinf[1] = cy;
#endif
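
  /* Same high-limb fixup as for vm1, now with h = as1[n] <= 2: h = 1 adds
     2 as1 and h^2 = 1, h = 2 adds 4 as1 and h^2 = 4.  In the alternative
     branch, the (n+1)-limb square spills one limb past v1's 2n+1,
     clobbering vinf[1], which is therefore saved and restored around the
     call.  */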

  TOOM3_SQR_REC (v0, ap, n, scratch_out);	/* v0, 2n limbs */

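  /* Recombine: mpn_toom_interpolate_5pts solves for the five coefficients
     of A(x)^2 from v0, v1, v2, vm1 and vinf (v0, v1 and vinf already
     sitting in pp) and accumulates them at their B^n-shifted positions.
     The sign argument is 0 because vm1 = A(-1)^2 is never negative for a
     square; vinf0 passes in the saved low limb of vinf.  */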
  mpn_toom_interpolate_5pts (pp, v2, vm1, n, s + s, 0, vinf0);
}
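
/* Usage sketch (illustrative only, not part of the library).  This
   function is internal and is normally reached through mpn_sqr, which
   selects it when an falls in the Toom-3 range.  A direct call must
   supply scratch space; gmp-impl.h provides mpn_toom3_sqr_itch for sizing
   it.  Assuming {ap,an} is already set up, a hypothetical call looks
   like:

       mp_ptr pp = TMP_ALLOC_LIMBS (2 * an);
       mp_ptr scratch = TMP_ALLOC_LIMBS (mpn_toom3_sqr_itch (an));
       mpn_toom3_sqr (pp, ap, an, scratch);   // {pp, 2*an} = {ap,an}^2

   (TMP_ALLOC_LIMBS requires the usual TMP_DECL/TMP_MARK/TMP_FREE
   bracketing.)  */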