1  /* Copyright (C) 2007-2023 Free Software Foundation, Inc.
       2  
       3  This file is part of GCC.
       4  
       5  GCC is free software; you can redistribute it and/or modify it under
       6  the terms of the GNU General Public License as published by the Free
       7  Software Foundation; either version 3, or (at your option) any later
       8  version.
       9  
      10  GCC is distributed in the hope that it will be useful, but WITHOUT ANY
      11  WARRANTY; without even the implied warranty of MERCHANTABILITY or
      12  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
      13  for more details.
      14  
      15  Under Section 7 of GPL version 3, you are granted additional
      16  permissions described in the GCC Runtime Library Exception, version
      17  3.1, as published by the Free Software Foundation.
      18  
      19  You should have received a copy of the GNU General Public License and
      20  a copy of the GCC Runtime Library Exception along with this program;
      21  see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
      22  <http://www.gnu.org/licenses/>.  */
      23  
      24  /*****************************************************************************
      25   *    BID64 square root
      26   *****************************************************************************
      27   *
      28   *  Algorithm description:
      29   *
      30   *  if(exponent_x is odd)
      31   *     scale coefficient_x by 10, adjust exponent
      32   *  - get lower estimate for number of digits in coefficient_x
      33   *  - scale coefficient x to between 31 and 33 decimal digits
      34   *  - in parallel, check for exact case and return if true
      35   *  - get high part of result coefficient using double precision sqrt
      36   *  - compute remainder and refine coefficient in one iteration (which 
      37   *                                 modifies it by at most 1)
      38   *  - result exponent is easy to compute from the adjusted arg. exponent 
      39   *
      40   ****************************************************************************/
      41  
      42  #include "bid_internal.h"
      43  #include "bid_sqrt_macros.h"
      44  #ifdef UNCHANGED_BINARY_STATUS_FLAGS
      45  #include <fenv.h>
      46  
      47  #define FE_ALL_FLAGS FE_INVALID|FE_DIVBYZERO|FE_OVERFLOW|FE_UNDERFLOW|FE_INEXACT
      48  #endif
      49  
      50  extern double sqrt (double);
      51  
      52  #if DECIMAL_CALL_BY_REFERENCE
      53  
      54  void
      55  bid64_sqrt (UINT64 * pres,
      56  	    UINT64 *
      57  	    px _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
      58  	    _EXC_INFO_PARAM) {
      59    UINT64 x;
      60  #else
      61  
      62  UINT64
      63  bid64_sqrt (UINT64 x _RND_MODE_PARAM _EXC_FLAGS_PARAM
      64  	    _EXC_MASKS_PARAM _EXC_INFO_PARAM) {
      65  #endif
            /* bid64_sqrt: square root of the BID64 operand x (loaded from *px when
               DECIMAL_CALL_BY_REFERENCE is set).  Every exit goes through BID_RETURN.

               Paths visible in this body:
               - Infinity/NaN encodings: -Infinity yields NAN_MASK64 and sets
                 INVALID_EXCEPTION; any other value returns what unpack_BID64 left in
                 coefficient_x, masked with QUIET_MASK64.  An sNaN operand also sets
                 INVALID_EXCEPTION.
               - zero: the sign bit is kept and the exponent field is halved.
               - negative non-zero operand: NAN_MASK64 with INVALID_EXCEPTION.
               - otherwise: a root that squares back exactly is returned at once;
                 failing that, the coefficient is scaled by a power of ten, a 64-bit
                 estimate Q is taken from sqrt() and adjusted according to rnd_mode,
                 and INEXACT_EXCEPTION is set.

               Status flags are written only when SET_STATUS_FLAGS is defined.  With
               UNCHANGED_BINARY_STATUS_FLAGS the binary floating-point exception flags
               are saved before the float/double work and restored before returning.  */
      66    UINT128 CA, CT;
      67    UINT64 sign_x, coefficient_x;
      68    UINT64 Q, Q2, A10, C4, R, R2, QE, res;
      69    SINT64 D;
      70    int_double t_scale;
      71    int_float tempx;
      72    double da, dq, da_h, da_l, dqe;
      73    int exponent_x, exponent_q, bin_expon_cx;
      74    int digits_x;
      75    int scale;
      76  #ifdef UNCHANGED_BINARY_STATUS_FLAGS
      77    fexcept_t binaryflags = 0;
      78  #endif
      79  
            // by-reference build: fetch the rounding mode (unless it is global)
            // and the operand through their pointers
      80  #if DECIMAL_CALL_BY_REFERENCE
      81  #if !DECIMAL_GLOBAL_ROUNDING
      82    _IDEC_round rnd_mode = *prnd_mode;
      83  #endif
      84    x = *px;
      85  #endif
      86  
      87    // unpack arguments, check for NaN or Infinity
      88    if (!unpack_BID64 (&sign_x, &exponent_x, &coefficient_x, x)) {
      89      // x is Inf. or NaN or 0
      90      if ((x & INFINITY_MASK64) == INFINITY_MASK64) {
                // start from the value unpack_BID64 stored in coefficient_x
      91        res = coefficient_x;
      92        if ((coefficient_x & SSNAN_MASK64) == SINFINITY_MASK64)	// -Infinity
      93        {
      94  	res = NAN_MASK64;
      95  #ifdef SET_STATUS_FLAGS
      96  	__set_status_flags (pfpsf, INVALID_EXCEPTION);
      97  #endif
      98        }
      99  #ifdef SET_STATUS_FLAGS
     100        if ((x & SNAN_MASK64) == SNAN_MASK64)	// sNaN
     101  	__set_status_flags (pfpsf, INVALID_EXCEPTION);
     102  #endif
                // QUIET_MASK64 is applied to whatever is returned on this path
     103        BID_RETURN (res & QUIET_MASK64);
     104      }
     105      // x is 0
              // NOTE(review): presumably exponent_x is still biased here, so adding the
              // bias and halving gives the biased form of half the true exponent;
              // confirm against unpack_BID64.  The exponent field is placed at bit 53.
     106      exponent_x = (exponent_x + DECIMAL_EXPONENT_BIAS) >> 1;
     107      res = sign_x | (((UINT64) exponent_x) << 53);
     108      BID_RETURN (res);
     109    }
     110    // x<0?
     111    if (sign_x && coefficient_x) {
     112      res = NAN_MASK64;
     113  #ifdef SET_STATUS_FLAGS
     114      __set_status_flags (pfpsf, INVALID_EXCEPTION);
     115  #endif
     116      BID_RETURN (res);
     117    }
            // save the binary FP exception flags; each return below restores them
     118  #ifdef UNCHANGED_BINARY_STATUS_FLAGS
     119    (void) fegetexceptflag (&binaryflags, FE_ALL_FLAGS);
     120  #endif
     121    //--- get number of bits in the coefficient of x ---
            // bits 23..30 of the float image minus 0x7f give the binary exponent of
            // coefficient_x, which indexes the digit-count tables
     122    tempx.d = (float) coefficient_x;
     123    bin_expon_cx = ((tempx.i >> 23) & 0xff) - 0x7f;
     124    digits_x = estimate_decimal_digits[bin_expon_cx];
     125    // the table value is a lower estimate: bump it when the coefficient
            // reaches the threshold stored for this binary exponent
     126    if (coefficient_x >= power10_index_binexp[bin_expon_cx])
     127      digits_x++;
     128  
            // A10 = coefficient_x, or 10 * coefficient_x when exponent_x is odd
            // ((A10 << 2) + A10 is 5 * A10, which is then doubled)
     129    A10 = coefficient_x;
     130    if (exponent_x & 1) {
     131      A10 = (A10 << 2) + A10;
     132      A10 += A10;
     133    }
     134  
            // exact case: if the truncated double-precision root squares back to A10,
            // return it with the halved exponent and skip all rounding logic
     135    dqe = sqrt ((double) A10);
     136    QE = (UINT32) dqe;
     137    if (QE * QE == A10) {
     138      res =
     139        very_fast_get_BID64 (0, (exponent_x + DECIMAL_EXPONENT_BIAS) >> 1,
     140  			   QE);
     141  #ifdef UNCHANGED_BINARY_STATUS_FLAGS
     142      (void) fesetexceptflag (&binaryflags, FE_ALL_FLAGS);
     143  #endif
     144      BID_RETURN (res);
     145    }
     146    // pick the scaling power: 31 - digits_x, plus one when that would leave
            // exponent_q odd
     147    scale = 31 - digits_x;
     148    exponent_q = exponent_x - scale;
     149    scale += (exponent_q & 1);	// exp. bias is even
     150  
            // CA = coefficient_x * power10_table_128[scale], kept as a 128-bit value
     151    CT = power10_table_128[scale];
     152    __mul_64x128_short (CA, coefficient_x, CT);
     153  
     154    // 2^64
     155    t_scale.i = 0x43f0000000000000ull;
     156    // convert CA to DP
     157    da_h = CA.w[1];
     158    da_l = CA.w[0];
     159    da = da_h * t_scale.d + da_l;
     160  
     161    dq = sqrt (da);
     162  
            // Q is the double-precision root truncated to an integer
     163    Q = (UINT64) dq;
     164  
     165    // get sign(sqrt(CA)-Q)
            // only the low 64 bits of CA - Q*Q are formed; the shift by 63 turns R into
            // 0 or all ones, so D = R + R + 1 is +1 or -1
     166    R = CA.w[0] - Q * Q;
     167    R = ((SINT64) R) >> 63;
     168    D = R + R + 1;
     169  
            // halve the exponent of the scaled operand (bias added before the shift)
     170    exponent_q = (exponent_q + DECIMAL_EXPONENT_BIAS) >> 1;
     171  
            // roots that square back exactly returned above, so this path always
            // reports INEXACT_EXCEPTION
     172  #ifdef SET_STATUS_FLAGS
     173    __set_status_flags (pfpsf, INEXACT_EXCEPTION);
     174  #endif
     175  
            // NOTE(review): (rnd_mode & 3) == 0 presumably selects the round-to-nearest
            // modes; the rounding-mode encodings are defined outside this file.
            // When IEEE_ROUND_NEAREST or IEEE_ROUND_NEAREST_TIES_AWAY is defined, the
            // test and the else branch are compiled out and only the block below remains.
     176  #ifndef IEEE_ROUND_NEAREST
     177  #ifndef IEEE_ROUND_NEAREST_TIES_AWAY
     178    if (!((rnd_mode) & 3)) {
     179  #endif
     180  #endif
     181  
     182      // midpoint to check
              // Q2 = 2*Q + D is twice the midpoint between Q and Q + D; C4 = 4*CA (low word)
     183      Q2 = Q + Q + D;
     184      C4 = CA.w[0] << 2;
     185  
     186      // get sign(-sqrt(CA)+Midpoint)
     187      R2 = Q2 * Q2 - C4;
     188      R2 = ((SINT64) R2) >> 63;
     189  
     190      // adjust Q if R!=R2
              // R ^ R2 is all ones when the two signs differ, so D is added; otherwise 0
     191      Q += (D & (R ^ R2));
     192  #ifndef IEEE_ROUND_NEAREST
     193  #ifndef IEEE_ROUND_NEAREST_TIES_AWAY
     194    } else {
              // remaining rounding modes: step Q by D, back off by one if its square
              // (low 64 bits, compared as signed) exceeds CA, then add one for ROUNDING_UP
     195      C4 = CA.w[0];
     196      Q += D;
     197      if ((SINT64) (Q * Q - C4) > 0)
     198        Q--;
     199      if (rnd_mode == ROUNDING_UP)
     200        Q++;
     201    }
     202  #endif
     203  #endif
     204  
            // pack the non-negative result, restore the saved binary flags, return
     205    res = fast_get_BID64 (0, exponent_q, Q);
     206  #ifdef UNCHANGED_BINARY_STATUS_FLAGS
     207    (void) fesetexceptflag (&binaryflags, FE_ALL_FLAGS);
     208  #endif
     209    BID_RETURN (res);
     210  }
     211  
     212  
     213  TYPE0_FUNCTION_ARG1 (UINT64, bid64q_sqrt, x)
          /* bid64q_sqrt: square root of a 128-bit BID operand x (unpacked with
             unpack_BID128_value), delivered as a 64-bit BID value built by get_BID64.
             NOTE(review): the signature, and presumably the names rnd_mode and pfpsf
             used below, come from the TYPE0_FUNCTION_ARG1 macro defined outside this
             file.

             Paths visible in this body:
             - NaN: payload rescaled and merged under the top six bits of CX.w[1];
               an sNaN sets INVALID_EXCEPTION.
             - Infinity: returned from CX.w[1]; -Infinity becomes a NaN with
               INVALID_EXCEPTION.
             - zero: exponent halved, rebiased for the 64-bit format and clamped.
             - negative non-zero operand: NaN with INVALID_EXCEPTION.
             - otherwise: a 64-bit root estimate CS is formed by short_sqrt128, checked
               for exactness, rescaled where needed, corrected by comparing squares
               against the operand according to rnd_mode, and packed by get_BID64.

             done   - set once CS and exponent_q are final and no rescaling of CX is needed
             exact  - cleared as soon as any step shows the root is not exact
             mul_factor, mul_factor2, mul_factor2_long - power of ten removed from CS,
               and its square, needed to compare CS against the unscaled operand C256  */
     214  
     215       UINT256 M256, C4, C8;
     216       UINT128 CX, CX2, A10, S2, T128, CS, CSM, CS2, C256, CS1,
     217         mul_factor2_long = { {0x0ull, 0x0ull} }, QH, Tmp, TP128, Qh, Ql;
     218  UINT64 sign_x, Carry, B10, res, mul_factor, mul_factor2 = 0x0ull, CS0;
     219  SINT64 D;
     220  int_float fx, f64;
     221  int exponent_x, bin_expon_cx, done = 0;
     222  int digits, scale, exponent_q = 0, exact = 1, amount, extra_digits;
     223  #ifdef UNCHANGED_BINARY_STATUS_FLAGS
     224  fexcept_t binaryflags = 0;
     225  #endif
     226  
     227  	// unpack arguments, check for NaN or Infinity
     228  if (!unpack_BID128_value (&sign_x, &exponent_x, &CX, x)) {
            // default result: the high word produced by unpack_BID128_value
     229    res = CX.w[1];
     230    // NaN ?
     231    if ((x.w[1] & 0x7c00000000000000ull) == 0x7c00000000000000ull) {
     232  #ifdef SET_STATUS_FLAGS
     233      if ((x.w[1] & 0x7e00000000000000ull) == 0x7e00000000000000ull)	// sNaN
     234        __set_status_flags (pfpsf, INVALID_EXCEPTION);
     235  #endif
              // keep the low 46 bits of the high word plus the low word, multiply by
              // reciprocals10_128[18] and shift the high product by recip_scale[18]
              // (NOTE(review): by the table names this is a division by 10^18, shrinking
              // the payload for the narrower format - confirm), then merge the low word
              // of the quotient under the top six bits of CX.w[1]
     236      Tmp.w[1] = (CX.w[1] & 0x00003fffffffffffull);
     237      Tmp.w[0] = CX.w[0];
     238      TP128 = reciprocals10_128[18];
     239      __mul_128x128_full (Qh, Ql, Tmp, TP128);
     240      amount = recip_scale[18];
     241      __shr_128 (Tmp, Qh, amount);
     242      res = (CX.w[1] & 0xfc00000000000000ull) | Tmp.w[0];
     243      BID_RETURN (res);
     244    }
     245    // x is Infinity?
     246    if ((x.w[1] & 0x7800000000000000ull) == 0x7800000000000000ull) {
     247      if (sign_x) {
     248        // -Inf, return NaN
     249        res = 0x7c00000000000000ull;
     250  #ifdef SET_STATUS_FLAGS
     251        __set_status_flags (pfpsf, INVALID_EXCEPTION);
     252  #endif
     253      }
     254      BID_RETURN (res);
     255    }
     256    // x is 0 otherwise
     257  
            // halve the exponent relative to the 128-bit bias, rebias for the 64-bit
            // format, and clamp into [0, DECIMAL_MAX_EXPON_64]
     258    exponent_x =
     259      ((exponent_x - DECIMAL_EXPONENT_BIAS_128) >> 1) +
     260      DECIMAL_EXPONENT_BIAS;
     261    if (exponent_x < 0)
     262      exponent_x = 0;
     263    if (exponent_x > DECIMAL_MAX_EXPON_64)
     264      exponent_x = DECIMAL_MAX_EXPON_64;
     265    //res= sign_x | (((UINT64)exponent_x)<<53);
     266    res = get_BID64 (sign_x, exponent_x, 0, rnd_mode, pfpsf);
     267    BID_RETURN (res);
     268  }
          // negative operand (zero was handled above): NaN with INVALID_EXCEPTION
     269  if (sign_x) {
     270    res = 0x7c00000000000000ull;
     271  #ifdef SET_STATUS_FLAGS
     272    __set_status_flags (pfpsf, INVALID_EXCEPTION);
     273  #endif
     274    BID_RETURN (res);
     275  }
          // save the binary FP exception flags; each return below restores them
     276  #ifdef UNCHANGED_BINARY_STATUS_FLAGS
     277  (void) fegetexceptflag (&binaryflags, FE_ALL_FLAGS);
     278  #endif
     279  
     280  	   // 2^64
     281  f64.i = 0x5f800000;
     282  
     283  	   // fx ~ CX
          // the float exponent field of fx indexes the digit-count estimate table
     284  fx.d = (float) CX.w[1] * f64.d + (float) CX.w[0];
     285  bin_expon_cx = ((fx.i >> 23) & 0xff) - 0x7f;
     286  digits = estimate_decimal_digits[bin_expon_cx];
     287  
          // A10 = CX, or 10 * CX (8 * CX + 2 * CX) when exponent_x is odd
     288  A10 = CX;
     289  if (exponent_x & 1) {
     290    A10.w[1] = (CX.w[1] << 3) | (CX.w[0] >> 61);
     291    A10.w[0] = CX.w[0] << 3;
     292    CX2.w[1] = (CX.w[1] << 1) | (CX.w[0] >> 63);
     293    CX2.w[0] = CX.w[0] << 1;
     294    __add_128_128 (A10, A10, CX2);
     295  }
     296  
          // C256 starts out as A10; CS holds the 64-bit root estimate that
          // short_sqrt128 (defined elsewhere) returns for A10
     297  C256.w[1] = A10.w[1];
     298  C256.w[0] = A10.w[0];
     299  CS.w[0] = short_sqrt128 (A10);
     300  CS.w[1] = 0;
     301  mul_factor = 0;
     302  	   // check for exact result  
          // root estimate below 10^16
     303  if (CS.w[0] < 10000000000000000ull) {
            // cheap test on the low words first, then the high word of the full square
     304    if (CS.w[0] * CS.w[0] == A10.w[0]) {
     305      __sqr64_fast (S2, CS.w[0]);
     306      if (S2.w[1] == A10.w[1])	// && S2.w[0]==A10.w[0])
     307      {
     308        res =
     309  	get_BID64 (0,
     310  		   ((exponent_x - DECIMAL_EXPONENT_BIAS_128) >> 1) +
     311  		   DECIMAL_EXPONENT_BIAS, CS.w[0], rnd_mode, pfpsf);
     312  #ifdef UNCHANGED_BINARY_STATUS_FLAGS
     313        (void) fesetexceptflag (&binaryflags, FE_ALL_FLAGS);
     314  #endif
     315        BID_RETURN (res);
     316      }
     317    }
            // estimate is already in [10^15, 10^16): use A10 as is, no rescaling of CX
     318    if (CS.w[0] >= 1000000000000000ull) {
     319      done = 1;
     320      exponent_q = exponent_x;
     321      C256.w[1] = A10.w[1];
     322      C256.w[0] = A10.w[0];
     323    }
     324  #ifdef SET_STATUS_FLAGS
     325    __set_status_flags (pfpsf, INEXACT_EXCEPTION);
     326  #endif
     327    exact = 0;
     328  } else {
            // estimate is 10^16 or more: drop one trailing digit, or two if still too big.
            // CS0 = CS / 10 via the high word of CS * 0x3333333333333334 shifted right
            // by one; (CS0 << 3) + (CS0 << 1) is 10 * CS0, so a mismatch means a
            // non-zero digit was dropped
     329    B10 = 0x3333333333333334ull;
     330    __mul_64x64_to_128_full (CS2, CS.w[0], B10);
     331    CS0 = CS2.w[1] >> 1;
     332    if (CS.w[0] != ((CS0 << 3) + (CS0 << 1))) {
     333  #ifdef SET_STATUS_FLAGS
     334      __set_status_flags (pfpsf, INEXACT_EXCEPTION);
     335  #endif
     336      exact = 0;
     337    }
            // one root digit removed: exponent of the operand grows by 2;
            // mul_factor2 is mul_factor squared
     338    done = 1;
     339    CS.w[0] = CS0;
     340    exponent_q = exponent_x + 2;
     341    mul_factor = 10;
     342    mul_factor2 = 100;
     343    if (CS.w[0] >= 10000000000000000ull) {
              // second digit removed the same way
     344      __mul_64x64_to_128_full (CS2, CS.w[0], B10);
     345      CS0 = CS2.w[1] >> 1;
     346      if (CS.w[0] != ((CS0 << 3) + (CS0 << 1))) {
     347  #ifdef SET_STATUS_FLAGS
     348        __set_status_flags (pfpsf, INEXACT_EXCEPTION);
     349  #endif
     350        exact = 0;
     351      }
     352      exponent_q += 2;
     353      CS.w[0] = CS0;
     354      mul_factor = 100;
     355      mul_factor2 = 10000;
     356    }
            // only zero digits were dropped: still confirm that (CS * mul_factor)^2
            // reproduces A10 before treating the root as exact
     357    if (exact) {
     358      CS0 = CS.w[0] * mul_factor;
     359      __sqr64_fast (CS1, CS0)
     360        if ((CS1.w[0] != A10.w[0]) || (CS1.w[1] != A10.w[1])) {
     361  #ifdef SET_STATUS_FLAGS
     362        __set_status_flags (pfpsf, INEXACT_EXCEPTION);
     363  #endif
     364        exact = 0;
     365      }
     366    }
     367  }
     368  
          // estimate was below 10^15: scale CX up and take the root again
     369  if (!done) {
     370    // get number of digits in CX
            // the table value is a lower estimate: bump it when CX reaches the 128-bit
            // threshold stored for this binary exponent
     371    D = CX.w[1] - power10_index_binexp_128[bin_expon_cx].w[1];
     372    if (D > 0
     373        || (!D && CX.w[0] >= power10_index_binexp_128[bin_expon_cx].w[0]))
     374      digits++;
     375  
     376    // pick the scaling power: 31 - digits, plus one when that would leave
            // exponent_q odd
     377    scale = 31 - digits;
     378    exponent_q = exponent_x - scale;
     379    scale += (exponent_q & 1);	// exp. bias is even
     380  
            // C256 = low 128 bits of CX * power10_table_128[scale]
     381    T128 = power10_table_128[scale];
     382    __mul_128x128_low (C256, CX, T128);
     383  
     384  
     385    CS.w[0] = short_sqrt128 (C256);
     386  }
     387     //printf("CS=%016I64x\n",CS.w[0]);
     388  
          // halve the exponent relative to the 128-bit bias and rebias for the result
     389  exponent_q =
     390    ((exponent_q - DECIMAL_EXPONENT_BIAS_128) >> 1) +
     391    DECIMAL_EXPONENT_BIAS;
          // exponent below zero but by no more than MAX_FORMAT_DIGITS: pin it at zero
          // and remove extra_digits digits from CS instead
     392  if ((exponent_q < 0) && (exponent_q + MAX_FORMAT_DIGITS >= 0)) {
     393    extra_digits = -exponent_q;
     394    exponent_q = 0;
     395  
     396    // get coeff*(2^M[extra_digits])/10^extra_digits
     397    __mul_64x64_to_128 (QH, CS.w[0], reciprocals10_64[extra_digits]);
     398  
     399    // now get P/10^extra_digits: shift Q_high right by M[extra_digits]-128
     400    amount = short_recip_scale[extra_digits];
     401  
     402    CS0 = QH.w[1] >> amount;
     403  
            // the shortened root stays exact only if multiplying back restores CS;
            // otherwise UNDERFLOW_EXCEPTION is set together with INEXACT_EXCEPTION
     404  #ifdef SET_STATUS_FLAGS
     405    if (exact) {
     406      if (CS.w[0] != CS0 * power10_table_128[extra_digits].w[0])
     407        exact = 0;
     408    }
     409    if (!exact)
     410      __set_status_flags (pfpsf, UNDERFLOW_EXCEPTION | INEXACT_EXCEPTION);
     411  #endif
     412  
            // fold the removed power of ten into mul_factor and recompute its square
     413    CS.w[0] = CS0;
     414    if (!mul_factor)
     415      mul_factor = 1;
     416    mul_factor *= power10_table_128[extra_digits].w[0];
     417    __mul_64x64_to_128 (mul_factor2_long, mul_factor, mul_factor);
            // NOTE(review): both branches leave mul_factor2 at 0 (the else arm assigns
            // w[1], which is zero there), so the code below always multiplies by
            // mul_factor2_long after this block; w[0] may have been intended - confirm
     418    if (mul_factor2_long.w[1])
     419      mul_factor2 = 0;
     420    else
     421      mul_factor2 = mul_factor2_long.w[1];
     422  }
     423  	   // 4*C256
     424  C4.w[1] = (C256.w[1] << 2) | (C256.w[0] >> 62);
     425  C4.w[0] = C256.w[0] << 2;
     426  
          // NOTE(review): (rnd_mode & 3) == 0 presumably selects the round-to-nearest
          // modes; the rounding-mode encodings are defined outside this file.
          // When IEEE_ROUND_NEAREST or IEEE_ROUND_NEAREST_TIES_AWAY is defined, the
          // test and the else branch are compiled out and only the block below remains.
     427  #ifndef IEEE_ROUND_NEAREST
     428  #ifndef IEEE_ROUND_NEAREST_TIES_AWAY
     429  if (!((rnd_mode) & 3)) {
     430  #endif
     431  #endif
     432    // compare to midpoints
            // CSM = 2*CS + 1, scaled by mul_factor when one is in use
     433    CSM.w[0] = (CS.w[0] + CS.w[0]) | 1;
     434    //printf("C256=%016I64x %016I64x, CSM=%016I64x %016I64x %016I64x\n",C4.w[1],C4.w[0],CSM.w[1],CSM.w[0], CS.w[0]);
     435    if (mul_factor)
     436      CSM.w[0] *= mul_factor;
     437    // CSM^2
     438    __mul_64x64_to_128 (M256, CSM.w[0], CSM.w[0]);
     439    //__mul_128x128_to_256(M256, CSM, CSM);
     440  
            // 4*C256 above CSM^2: the operand lies beyond the upper midpoint
     441    if (C4.w[1] > M256.w[1] ||
     442        (C4.w[1] == M256.w[1] && C4.w[0] > M256.w[0])) {
     443      // round up
     444      CS.w[0]++;
     445    } else {
              // C8 = 8*CS, multiplied by the squared factor when one is in use
     446      C8.w[0] = CS.w[0] << 3;
     447      C8.w[1] = 0;
     448      if (mul_factor) {
     449        if (mul_factor2) {
     450  	__mul_64x64_to_128 (C8, C8.w[0], mul_factor2);
     451        } else {
     452  	__mul_64x128_low (C8, C8.w[0], mul_factor2_long);
     453        }
     454      }
     455      // M256 - C8: algebraically (2*CS+1)^2 - 8*CS = (2*CS-1)^2, times the squared
              // factor, i.e. the square of the lower midpoint
     456      __sub_borrow_out (M256.w[0], Carry, M256.w[0], C8.w[0]);
     457      M256.w[1] = M256.w[1] - C8.w[1] - Carry;
     458  
     459      // if the lower midpoint squared exceeds 4*C256, the estimate is too large
     460      if (M256.w[1] > C4.w[1] ||
     461  	(M256.w[1] == C4.w[1] && M256.w[0] > C4.w[0])) {
     462        // round down
     463        if (CS.w[0])
     464  	CS.w[0]--;
     465      }
     466    }
     467  #ifndef IEEE_ROUND_NEAREST
     468  #ifndef IEEE_ROUND_NEAREST_TIES_AWAY
     469  } else {
            // remaining rounding modes: try CS + 1.  M256 = ((CS+1) * factor)^2 and
            // C8 = 2 * (CS+1), multiplied by the squared factor when one is in use
     470    CS.w[0]++;
     471    CSM.w[0] = CS.w[0];
     472    C8.w[0] = CSM.w[0] << 1;
     473    if (mul_factor)
     474      CSM.w[0] *= mul_factor;
     475    __mul_64x64_to_128 (M256, CSM.w[0], CSM.w[0]);
     476    C8.w[1] = 0;
     477    if (mul_factor) {
     478      if (mul_factor2) {
     479        __mul_64x64_to_128 (C8, C8.w[0], mul_factor2);
     480      } else {
     481        __mul_64x128_low (C8, C8.w[0], mul_factor2_long);
     482      }
     483    }
     484    //printf("C256=%016I64x %016I64x, CSM=%016I64x %016I64x %016I64x\n",C256.w[1],C256.w[0],M256.w[1],M256.w[0], CS.w[0]);
     485  
            // candidate square exceeds the operand: form M256 - C8 + 1 and compare
            // again, lowering CS by one each time the reduced value is still too large
            // (at most twice, and never below 1)
     486    if (M256.w[1] > C256.w[1] ||
     487        (M256.w[1] == C256.w[1] && M256.w[0] > C256.w[0])) {
     488      __sub_borrow_out (M256.w[0], Carry, M256.w[0], C8.w[0]);
     489      M256.w[1] = M256.w[1] - Carry - C8.w[1];
     490      M256.w[0]++;
     491      if (!M256.w[0]) {
     492        M256.w[1]++;
     493  
     494      }
     495  
     496      if ((M256.w[1] > C256.w[1] ||
     497  	 (M256.w[1] == C256.w[1] && M256.w[0] > C256.w[0]))
     498  	&& (CS.w[0] > 1)) {
     499  
     500        CS.w[0]--;
     501  
     502        if (CS.w[0] > 1) {
     503  	__sub_borrow_out (M256.w[0], Carry, M256.w[0], C8.w[0]);
     504  	M256.w[1] = M256.w[1] - Carry - C8.w[1];
     505  	M256.w[0]++;
     506  	if (!M256.w[0]) {
     507  	  M256.w[1]++;
     508  	}
     509  
     510  	if (M256.w[1] > C256.w[1] ||
     511  	    (M256.w[1] == C256.w[1] && M256.w[0] > C256.w[0]))
     512  	  CS.w[0]--;
     513        }
     514      }
     515    }
     516  
            // candidate square does not exceed the operand: step up once more
            // (the disabled block below is an older, more elaborate variant)
     517    else {
     518  				/*__add_carry_out(M256.w[0], Carry, M256.w[0], C8.w[0]);
     519  				M256.w[1] = M256.w[1] + Carry + C8.w[1];
     520  				M256.w[0]++;
     521  				if(!M256.w[0]) 
     522  				{
     523  					M256.w[1]++;
     524  				}
     525  				CS.w[0]++;
     526  			if(M256.w[1]<C256.w[1] ||
     527  				(M256.w[1]==C256.w[1] && M256.w[0]<=C256.w[0]))
     528  			{
     529  				CS.w[0]++;
     530  			}*/
     531      CS.w[0]++;
     532    }
     533    //printf("C256=%016I64x %016I64x, CSM=%016I64x %016I64x %016I64x %d\n",C4.w[1],C4.w[0],M256.w[1],M256.w[0], CS.w[0], exact);
     534    // RU?
            // CS was left one step high: keep it only for ROUNDING_UP with an inexact
            // root, otherwise step back down (never below zero)
     535    if (((rnd_mode) != ROUNDING_UP) || exact) {
     536      if (CS.w[0])
     537        CS.w[0]--;
     538    }
     539  
     540  }
     541  #endif
     542  #endif
     543   //printf("C256=%016I64x %016I64x, CSM=%016I64x %016I64x %016I64x %d\n",C4.w[1],C4.w[0],M256.w[1],M256.w[0], CS.w[0], exact);
     544  
          // pack the non-negative result, restore the saved binary flags, return
     545  res = get_BID64 (0, exponent_q, CS.w[0], rnd_mode, pfpsf);
     546  #ifdef UNCHANGED_BINARY_STATUS_FLAGS
     547  (void) fesetexceptflag (&binaryflags, FE_ALL_FLAGS);
     548  #endif
     549  BID_RETURN (res);
     550  
     551  
     552  }