1  /* Compute x * y + z as ternary operation.
       2     Copyright (C) 2010-2023 Free Software Foundation, Inc.
       3     This file is part of the GNU C Library.
       4  
       5     The GNU C Library is free software; you can redistribute it and/or
       6     modify it under the terms of the GNU Lesser General Public
       7     License as published by the Free Software Foundation; either
       8     version 2.1 of the License, or (at your option) any later version.
       9  
      10     The GNU C Library is distributed in the hope that it will be useful,
      11     but WITHOUT ANY WARRANTY; without even the implied warranty of
      12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
      13     Lesser General Public License for more details.
      14  
      15     You should have received a copy of the GNU Lesser General Public
      16     License along with the GNU C Library; if not, see
      17     <https://www.gnu.org/licenses/>.  */
      18  
      19  #define NO_MATH_REDIRECT
      20  #include <float.h>
      21  #define f64xfmaf128 __hide_f64xfmaf128
      22  #include <math.h>
      23  #undef f64xfmaf128
      24  #include <fenv.h>
      25  #include <ieee754.h>
      26  #include <math-barriers.h>
      27  #include <math_private.h>
      28  #include <libm-alias-ldouble.h>
      29  #include <math-narrow-alias.h>
      30  #include <tininess.h>
      31  #include <math-use-builtins.h>
      32  
      33  /* This implementation uses rounding to odd to avoid problems with
      34     double rounding.  See a paper by Boldo and Melquiond:
      35     http://www.lri.fr/~melquion/doc/08-tc.pdf  */
      36  
      37  _Float128
      38  __fmal (_Float128 x, _Float128 y, _Float128 z)
      39  {
      40  #if USE_FMAL_BUILTIN
      41    return __builtin_fmal (x, y, z);
      42  #else
      43    union ieee854_long_double u, v, w;
      44    int adjust = 0;
      45    u.d = x;
      46    v.d = y;
      47    w.d = z;
      48    if (__builtin_expect (u.ieee.exponent + v.ieee.exponent
      49  			>= 0x7fff + IEEE854_LONG_DOUBLE_BIAS
      50  			   - LDBL_MANT_DIG, 0)
      51        || __builtin_expect (u.ieee.exponent >= 0x7fff - LDBL_MANT_DIG, 0)
      52        || __builtin_expect (v.ieee.exponent >= 0x7fff - LDBL_MANT_DIG, 0)
      53        || __builtin_expect (w.ieee.exponent >= 0x7fff - LDBL_MANT_DIG, 0)
      54        || __builtin_expect (u.ieee.exponent + v.ieee.exponent
      55  			   <= IEEE854_LONG_DOUBLE_BIAS + LDBL_MANT_DIG, 0))
      56      {
      57        /* If z is Inf, but x and y are finite, the result should be
      58  	 z rather than NaN.  */
      59        if (w.ieee.exponent == 0x7fff
      60  	  && u.ieee.exponent != 0x7fff
      61            && v.ieee.exponent != 0x7fff)
      62  	return (z + x) + y;
      63        /* If z is zero and x are y are nonzero, compute the result
      64  	 as x * y to avoid the wrong sign of a zero result if x * y
      65  	 underflows to 0.  */
      66        if (z == 0 && x != 0 && y != 0)
      67  	return x * y;
      68        /* If x or y or z is Inf/NaN, or if x * y is zero, compute as
      69  	 x * y + z.  */
      70        if (u.ieee.exponent == 0x7fff
      71  	  || v.ieee.exponent == 0x7fff
      72  	  || w.ieee.exponent == 0x7fff
      73  	  || x == 0
      74  	  || y == 0)
      75  	return x * y + z;
      76        /* If fma will certainly overflow, compute as x * y.  */
      77        if (u.ieee.exponent + v.ieee.exponent
      78  	  > 0x7fff + IEEE854_LONG_DOUBLE_BIAS)
      79  	return x * y;
      80        /* If x * y is less than 1/4 of LDBL_TRUE_MIN, neither the
      81  	 result nor whether there is underflow depends on its exact
      82  	 value, only on its sign.  */
      83        if (u.ieee.exponent + v.ieee.exponent
      84  	  < IEEE854_LONG_DOUBLE_BIAS - LDBL_MANT_DIG - 2)
      85  	{
      86  	  int neg = u.ieee.negative ^ v.ieee.negative;
      87  	  _Float128 tiny = neg ? L(-0x1p-16494) : L(0x1p-16494);
      88  	  if (w.ieee.exponent >= 3)
      89  	    return tiny + z;
      90  	  /* Scaling up, adding TINY and scaling down produces the
      91  	     correct result, because in round-to-nearest mode adding
      92  	     TINY has no effect and in other modes double rounding is
      93  	     harmless.  But it may not produce required underflow
      94  	     exceptions.  */
      95  	  v.d = z * L(0x1p114) + tiny;
      96  	  if (TININESS_AFTER_ROUNDING
      97  	      ? v.ieee.exponent < 115
      98  	      : (w.ieee.exponent == 0
      99  		 || (w.ieee.exponent == 1
     100  		     && w.ieee.negative != neg
     101  		     && w.ieee.mantissa3 == 0
     102  		     && w.ieee.mantissa2 == 0
     103  		     && w.ieee.mantissa1 == 0
     104  		     && w.ieee.mantissa0 == 0)))
     105  	    {
     106  	      _Float128 force_underflow = x * y;
     107  	      math_force_eval (force_underflow);
     108  	    }
     109  	  return v.d * L(0x1p-114);
     110  	}
     111        if (u.ieee.exponent + v.ieee.exponent
     112  	  >= 0x7fff + IEEE854_LONG_DOUBLE_BIAS - LDBL_MANT_DIG)
     113  	{
     114  	  /* Compute 1p-113 times smaller result and multiply
     115  	     at the end.  */
     116  	  if (u.ieee.exponent > v.ieee.exponent)
     117  	    u.ieee.exponent -= LDBL_MANT_DIG;
     118  	  else
     119  	    v.ieee.exponent -= LDBL_MANT_DIG;
     120  	  /* If x + y exponent is very large and z exponent is very small,
     121  	     it doesn't matter if we don't adjust it.  */
     122  	  if (w.ieee.exponent > LDBL_MANT_DIG)
     123  	    w.ieee.exponent -= LDBL_MANT_DIG;
     124  	  adjust = 1;
     125  	}
     126        else if (w.ieee.exponent >= 0x7fff - LDBL_MANT_DIG)
     127  	{
     128  	  /* Similarly.
     129  	     If z exponent is very large and x and y exponents are
     130  	     very small, adjust them up to avoid spurious underflows,
     131  	     rather than down.  */
     132  	  if (u.ieee.exponent + v.ieee.exponent
     133  	      <= IEEE854_LONG_DOUBLE_BIAS + 2 * LDBL_MANT_DIG)
     134  	    {
     135  	      if (u.ieee.exponent > v.ieee.exponent)
     136  		u.ieee.exponent += 2 * LDBL_MANT_DIG + 2;
     137  	      else
     138  		v.ieee.exponent += 2 * LDBL_MANT_DIG + 2;
     139  	    }
     140  	  else if (u.ieee.exponent > v.ieee.exponent)
     141  	    {
     142  	      if (u.ieee.exponent > LDBL_MANT_DIG)
     143  		u.ieee.exponent -= LDBL_MANT_DIG;
     144  	    }
     145  	  else if (v.ieee.exponent > LDBL_MANT_DIG)
     146  	    v.ieee.exponent -= LDBL_MANT_DIG;
     147  	  w.ieee.exponent -= LDBL_MANT_DIG;
     148  	  adjust = 1;
     149  	}
     150        else if (u.ieee.exponent >= 0x7fff - LDBL_MANT_DIG)
     151  	{
     152  	  u.ieee.exponent -= LDBL_MANT_DIG;
     153  	  if (v.ieee.exponent)
     154  	    v.ieee.exponent += LDBL_MANT_DIG;
     155  	  else
     156  	    v.d *= L(0x1p113);
     157  	}
     158        else if (v.ieee.exponent >= 0x7fff - LDBL_MANT_DIG)
     159  	{
     160  	  v.ieee.exponent -= LDBL_MANT_DIG;
     161  	  if (u.ieee.exponent)
     162  	    u.ieee.exponent += LDBL_MANT_DIG;
     163  	  else
     164  	    u.d *= L(0x1p113);
     165  	}
     166        else /* if (u.ieee.exponent + v.ieee.exponent
     167  		  <= IEEE854_LONG_DOUBLE_BIAS + LDBL_MANT_DIG) */
     168  	{
     169  	  if (u.ieee.exponent > v.ieee.exponent)
     170  	    u.ieee.exponent += 2 * LDBL_MANT_DIG + 2;
     171  	  else
     172  	    v.ieee.exponent += 2 * LDBL_MANT_DIG + 2;
     173  	  if (w.ieee.exponent <= 4 * LDBL_MANT_DIG + 6)
     174  	    {
     175  	      if (w.ieee.exponent)
     176  		w.ieee.exponent += 2 * LDBL_MANT_DIG + 2;
     177  	      else
     178  		w.d *= L(0x1p228);
     179  	      adjust = -1;
     180  	    }
     181  	  /* Otherwise x * y should just affect inexact
     182  	     and nothing else.  */
     183  	}
     184        x = u.d;
     185        y = v.d;
     186        z = w.d;
     187      }
     188  
     189    /* Ensure correct sign of exact 0 + 0.  */
     190    if (__glibc_unlikely ((x == 0 || y == 0) && z == 0))
     191      {
     192        x = math_opt_barrier (x);
     193        return x * y + z;
     194      }
     195  
     196    fenv_t env;
     197    feholdexcept (&env);
     198    fesetround (FE_TONEAREST);
     199  
     200    /* Multiplication m1 + m2 = x * y using Dekker's algorithm.  */
     201  #define C ((1LL << (LDBL_MANT_DIG + 1) / 2) + 1)
     202    _Float128 x1 = x * C;
     203    _Float128 y1 = y * C;
     204    _Float128 m1 = x * y;
     205    x1 = (x - x1) + x1;
     206    y1 = (y - y1) + y1;
     207    _Float128 x2 = x - x1;
     208    _Float128 y2 = y - y1;
     209    _Float128 m2 = (((x1 * y1 - m1) + x1 * y2) + x2 * y1) + x2 * y2;
     210  
     211    /* Addition a1 + a2 = z + m1 using Knuth's algorithm.  */
     212    _Float128 a1 = z + m1;
     213    _Float128 t1 = a1 - z;
     214    _Float128 t2 = a1 - t1;
     215    t1 = m1 - t1;
     216    t2 = z - t2;
     217    _Float128 a2 = t1 + t2;
     218    /* Ensure the arithmetic is not scheduled after feclearexcept call.  */
     219    math_force_eval (m2);
     220    math_force_eval (a2);
     221    feclearexcept (FE_INEXACT);
     222  
     223    /* If the result is an exact zero, ensure it has the correct sign.  */
     224    if (a1 == 0 && m2 == 0)
     225      {
     226        feupdateenv (&env);
     227        /* Ensure that round-to-nearest value of z + m1 is not reused.  */
     228        z = math_opt_barrier (z);
     229        return z + m1;
     230      }
     231  
     232    fesetround (FE_TOWARDZERO);
     233    /* Perform m2 + a2 addition with round to odd.  */
     234    u.d = a2 + m2;
     235  
     236    if (__glibc_likely (adjust == 0))
     237      {
     238        if ((u.ieee.mantissa3 & 1) == 0 && u.ieee.exponent != 0x7fff)
     239  	u.ieee.mantissa3 |= fetestexcept (FE_INEXACT) != 0;
     240        feupdateenv (&env);
     241        /* Result is a1 + u.d.  */
     242        return a1 + u.d;
     243      }
     244    else if (__glibc_likely (adjust > 0))
     245      {
     246        if ((u.ieee.mantissa3 & 1) == 0 && u.ieee.exponent != 0x7fff)
     247  	u.ieee.mantissa3 |= fetestexcept (FE_INEXACT) != 0;
     248        feupdateenv (&env);
     249        /* Result is a1 + u.d, scaled up.  */
     250        return (a1 + u.d) * L(0x1p113);
     251      }
     252    else
     253      {
     254        if ((u.ieee.mantissa3 & 1) == 0)
     255  	u.ieee.mantissa3 |= fetestexcept (FE_INEXACT) != 0;
     256        v.d = a1 + u.d;
     257        /* Ensure the addition is not scheduled after fetestexcept call.  */
     258        math_force_eval (v.d);
     259        int j = fetestexcept (FE_INEXACT) != 0;
     260        feupdateenv (&env);
     261        /* Ensure the following computations are performed in default rounding
     262  	 mode instead of just reusing the round to zero computation.  */
     263        asm volatile ("" : "=m" (u) : "m" (u));
     264        /* If a1 + u.d is exact, the only rounding happens during
     265  	 scaling down.  */
     266        if (j == 0)
     267  	return v.d * L(0x1p-228);
     268        /* If result rounded to zero is not subnormal, no double
     269  	 rounding will occur.  */
     270        if (v.ieee.exponent > 228)
     271  	return (a1 + u.d) * L(0x1p-228);
     272        /* If v.d * 0x1p-228L with round to zero is a subnormal above
     273  	 or equal to LDBL_MIN / 2, then v.d * 0x1p-228L shifts mantissa
     274  	 down just by 1 bit, which means v.ieee.mantissa3 |= j would
     275  	 change the round bit, not sticky or guard bit.
     276  	 v.d * 0x1p-228L never normalizes by shifting up,
     277  	 so round bit plus sticky bit should be already enough
     278  	 for proper rounding.  */
     279        if (v.ieee.exponent == 228)
     280  	{
     281  	  /* If the exponent would be in the normal range when
     282  	     rounding to normal precision with unbounded exponent
     283  	     range, the exact result is known and spurious underflows
     284  	     must be avoided on systems detecting tininess after
     285  	     rounding.  */
     286  	  if (TININESS_AFTER_ROUNDING)
     287  	    {
     288  	      w.d = a1 + u.d;
     289  	      if (w.ieee.exponent == 229)
     290  		return w.d * L(0x1p-228);
     291  	    }
     292  	  /* v.ieee.mantissa3 & 2 is LSB bit of the result before rounding,
     293  	     v.ieee.mantissa3 & 1 is the round bit and j is our sticky
     294  	     bit.  */
     295  	  w.d = 0;
     296  	  w.ieee.mantissa3 = ((v.ieee.mantissa3 & 3) << 1) | j;
     297  	  w.ieee.negative = v.ieee.negative;
     298  	  v.ieee.mantissa3 &= ~3U;
     299  	  v.d *= L(0x1p-228);
     300  	  w.d *= L(0x1p-2);
     301  	  return v.d + w.d;
     302  	}
     303        v.ieee.mantissa3 |= j;
     304        return v.d * L(0x1p-228);
     305      }
     306  #endif /* ! USE_FMAL_BUILTIN  */
     307  }
     308  libm_alias_ldouble (__fma, fma)
     309  libm_alias_ldouble_narrow (__fma, fma)