1  /* Compute x * y + z as ternary operation.
       2     Copyright (C) 2010-2023 Free Software Foundation, Inc.
       3     This file is part of the GNU C Library.
       4  
       5     The GNU C Library is free software; you can redistribute it and/or
       6     modify it under the terms of the GNU Lesser General Public
       7     License as published by the Free Software Foundation; either
       8     version 2.1 of the License, or (at your option) any later version.
       9  
      10     The GNU C Library is distributed in the hope that it will be useful,
      11     but WITHOUT ANY WARRANTY; without even the implied warranty of
      12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
      13     Lesser General Public License for more details.
      14  
      15     You should have received a copy of the GNU Lesser General Public
      16     License along with the GNU C Library; if not, see
      17     <https://www.gnu.org/licenses/>.  */
      18  
      19  #define NO_MATH_REDIRECT
      20  #include <float.h>
      21  #include <math.h>
      22  #include <fenv.h>
      23  #include <ieee754.h>
      24  #include <math-barriers.h>
      25  #include <libm-alias-ldouble.h>
      26  #include <tininess.h>
      27  
      28  /* This implementation uses rounding to odd to avoid problems with
      29     double rounding.  See a paper by Boldo and Melquiond:
      30     http://www.lri.fr/~melquion/doc/08-tc.pdf  */
      31  
      32  long double
      33  __fmal (long double x, long double y, long double z)
      34  {
      35    union ieee854_long_double u, v, w;
      36    int adjust = 0;
      37    u.d = x;
      38    v.d = y;
      39    w.d = z;
      40    if (__builtin_expect (u.ieee.exponent + v.ieee.exponent
      41  			>= 0x7fff + IEEE854_LONG_DOUBLE_BIAS
      42  			   - LDBL_MANT_DIG, 0)
      43        || __builtin_expect (u.ieee.exponent >= 0x7fff - LDBL_MANT_DIG, 0)
      44        || __builtin_expect (v.ieee.exponent >= 0x7fff - LDBL_MANT_DIG, 0)
      45        || __builtin_expect (w.ieee.exponent >= 0x7fff - LDBL_MANT_DIG, 0)
      46        || __builtin_expect (u.ieee.exponent + v.ieee.exponent
      47  			   <= IEEE854_LONG_DOUBLE_BIAS + LDBL_MANT_DIG, 0))
      48      {
      49        /* If z is Inf, but x and y are finite, the result should be
      50  	 z rather than NaN.  */
      51        if (w.ieee.exponent == 0x7fff
      52  	  && u.ieee.exponent != 0x7fff
      53            && v.ieee.exponent != 0x7fff)
      54  	return (z + x) + y;
      55        /* If z is zero and x are y are nonzero, compute the result
      56  	 as x * y to avoid the wrong sign of a zero result if x * y
      57  	 underflows to 0.  */
      58        if (z == 0 && x != 0 && y != 0)
      59  	return x * y;
      60        /* If x or y or z is Inf/NaN, or if x * y is zero, compute as
      61  	 x * y + z.  */
      62        if (u.ieee.exponent == 0x7fff
      63  	  || v.ieee.exponent == 0x7fff
      64  	  || w.ieee.exponent == 0x7fff
      65  	  || x == 0
      66  	  || y == 0)
      67  	return x * y + z;
      68        /* If fma will certainly overflow, compute as x * y.  */
      69        if (u.ieee.exponent + v.ieee.exponent
      70  	  > 0x7fff + IEEE854_LONG_DOUBLE_BIAS)
      71  	return x * y;
      72        /* If x * y is less than 1/4 of LDBL_TRUE_MIN, neither the
      73  	 result nor whether there is underflow depends on its exact
      74  	 value, only on its sign.  */
      75        if (u.ieee.exponent + v.ieee.exponent
      76  	  < IEEE854_LONG_DOUBLE_BIAS - LDBL_MANT_DIG - 2)
      77  	{
      78  	  int neg = u.ieee.negative ^ v.ieee.negative;
      79  	  long double tiny = neg ? -0x1p-16445L : 0x1p-16445L;
      80  	  if (w.ieee.exponent >= 3)
      81  	    return tiny + z;
      82  	  /* Scaling up, adding TINY and scaling down produces the
      83  	     correct result, because in round-to-nearest mode adding
      84  	     TINY has no effect and in other modes double rounding is
      85  	     harmless.  But it may not produce required underflow
      86  	     exceptions.  */
      87  	  v.d = z * 0x1p65L + tiny;
      88  	  if (TININESS_AFTER_ROUNDING
      89  	      ? v.ieee.exponent < 66
      90  	      : (w.ieee.exponent == 0
      91  		 || (w.ieee.exponent == 1
      92  		     && w.ieee.negative != neg
      93  		     && w.ieee.mantissa1 == 0
      94  		     && w.ieee.mantissa0 == 0x80000000)))
      95  	    {
      96  	      long double force_underflow = x * y;
      97  	      math_force_eval (force_underflow);
      98  	    }
      99  	  return v.d * 0x1p-65L;
     100  	}
     101        if (u.ieee.exponent + v.ieee.exponent
     102  	  >= 0x7fff + IEEE854_LONG_DOUBLE_BIAS - LDBL_MANT_DIG)
     103  	{
     104  	  /* Compute 1p-64 times smaller result and multiply
     105  	     at the end.  */
     106  	  if (u.ieee.exponent > v.ieee.exponent)
     107  	    u.ieee.exponent -= LDBL_MANT_DIG;
     108  	  else
     109  	    v.ieee.exponent -= LDBL_MANT_DIG;
     110  	  /* If x + y exponent is very large and z exponent is very small,
     111  	     it doesn't matter if we don't adjust it.  */
     112  	  if (w.ieee.exponent > LDBL_MANT_DIG)
     113  	    w.ieee.exponent -= LDBL_MANT_DIG;
     114  	  adjust = 1;
     115  	}
     116        else if (w.ieee.exponent >= 0x7fff - LDBL_MANT_DIG)
     117  	{
     118  	  /* Similarly.
     119  	     If z exponent is very large and x and y exponents are
     120  	     very small, adjust them up to avoid spurious underflows,
     121  	     rather than down.  */
     122  	  if (u.ieee.exponent + v.ieee.exponent
     123  	      <= IEEE854_LONG_DOUBLE_BIAS + 2 * LDBL_MANT_DIG)
     124  	    {
     125  	      if (u.ieee.exponent > v.ieee.exponent)
     126  		u.ieee.exponent += 2 * LDBL_MANT_DIG + 2;
     127  	      else
     128  		v.ieee.exponent += 2 * LDBL_MANT_DIG + 2;
     129  	    }
     130  	  else if (u.ieee.exponent > v.ieee.exponent)
     131  	    {
     132  	      if (u.ieee.exponent > LDBL_MANT_DIG)
     133  		u.ieee.exponent -= LDBL_MANT_DIG;
     134  	    }
     135  	  else if (v.ieee.exponent > LDBL_MANT_DIG)
     136  	    v.ieee.exponent -= LDBL_MANT_DIG;
     137  	  w.ieee.exponent -= LDBL_MANT_DIG;
     138  	  adjust = 1;
     139  	}
     140        else if (u.ieee.exponent >= 0x7fff - LDBL_MANT_DIG)
     141  	{
     142  	  u.ieee.exponent -= LDBL_MANT_DIG;
     143  	  if (v.ieee.exponent)
     144  	    v.ieee.exponent += LDBL_MANT_DIG;
     145  	  else
     146  	    v.d *= 0x1p64L;
     147  	}
     148        else if (v.ieee.exponent >= 0x7fff - LDBL_MANT_DIG)
     149  	{
     150  	  v.ieee.exponent -= LDBL_MANT_DIG;
     151  	  if (u.ieee.exponent)
     152  	    u.ieee.exponent += LDBL_MANT_DIG;
     153  	  else
     154  	    u.d *= 0x1p64L;
     155  	}
     156        else /* if (u.ieee.exponent + v.ieee.exponent
     157  		  <= IEEE854_LONG_DOUBLE_BIAS + LDBL_MANT_DIG) */
     158  	{
     159  	  if (u.ieee.exponent > v.ieee.exponent)
     160  	    u.ieee.exponent += 2 * LDBL_MANT_DIG + 2;
     161  	  else
     162  	    v.ieee.exponent += 2 * LDBL_MANT_DIG + 2;
     163  	  if (w.ieee.exponent <= 4 * LDBL_MANT_DIG + 6)
     164  	    {
     165  	      if (w.ieee.exponent)
     166  		w.ieee.exponent += 2 * LDBL_MANT_DIG + 2;
     167  	      else
     168  		w.d *= 0x1p130L;
     169  	      adjust = -1;
     170  	    }
     171  	  /* Otherwise x * y should just affect inexact
     172  	     and nothing else.  */
     173  	}
     174        x = u.d;
     175        y = v.d;
     176        z = w.d;
     177      }
     178  
     179    /* Ensure correct sign of exact 0 + 0.  */
     180    if (__glibc_unlikely ((x == 0 || y == 0) && z == 0))
     181      {
     182        x = math_opt_barrier (x);
     183        return x * y + z;
     184      }
     185  
     186    fenv_t env;
     187    feholdexcept (&env);
     188    fesetround (FE_TONEAREST);
     189  
     190    /* Multiplication m1 + m2 = x * y using Dekker's algorithm.  */
     191  #define C ((1LL << (LDBL_MANT_DIG + 1) / 2) + 1)
     192    long double x1 = x * C;
     193    long double y1 = y * C;
     194    long double m1 = x * y;
     195    x1 = (x - x1) + x1;
     196    y1 = (y - y1) + y1;
     197    long double x2 = x - x1;
     198    long double y2 = y - y1;
     199    long double m2 = (((x1 * y1 - m1) + x1 * y2) + x2 * y1) + x2 * y2;
     200  
     201    /* Addition a1 + a2 = z + m1 using Knuth's algorithm.  */
     202    long double a1 = z + m1;
     203    long double t1 = a1 - z;
     204    long double t2 = a1 - t1;
     205    t1 = m1 - t1;
     206    t2 = z - t2;
     207    long double a2 = t1 + t2;
     208    /* Ensure the arithmetic is not scheduled after feclearexcept call.  */
     209    math_force_eval (m2);
     210    math_force_eval (a2);
     211    feclearexcept (FE_INEXACT);
     212  
     213    /* If the result is an exact zero, ensure it has the correct sign.  */
     214    if (a1 == 0 && m2 == 0)
     215      {
     216        feupdateenv (&env);
     217        /* Ensure that round-to-nearest value of z + m1 is not reused.  */
     218        z = math_opt_barrier (z);
     219        return z + m1;
     220      }
     221  
     222    fesetround (FE_TOWARDZERO);
     223    /* Perform m2 + a2 addition with round to odd.  */
     224    u.d = a2 + m2;
     225  
     226    if (__glibc_likely (adjust == 0))
     227      {
     228        if ((u.ieee.mantissa1 & 1) == 0 && u.ieee.exponent != 0x7fff)
     229  	u.ieee.mantissa1 |= fetestexcept (FE_INEXACT) != 0;
     230        feupdateenv (&env);
     231        /* Result is a1 + u.d.  */
     232        return a1 + u.d;
     233      }
     234    else if (__glibc_likely (adjust > 0))
     235      {
     236        if ((u.ieee.mantissa1 & 1) == 0 && u.ieee.exponent != 0x7fff)
     237  	u.ieee.mantissa1 |= fetestexcept (FE_INEXACT) != 0;
     238        feupdateenv (&env);
     239        /* Result is a1 + u.d, scaled up.  */
     240        return (a1 + u.d) * 0x1p64L;
     241      }
     242    else
     243      {
     244        if ((u.ieee.mantissa1 & 1) == 0)
     245  	u.ieee.mantissa1 |= fetestexcept (FE_INEXACT) != 0;
     246        v.d = a1 + u.d;
     247        /* Ensure the addition is not scheduled after fetestexcept call.  */
     248        math_force_eval (v.d);
     249        int j = fetestexcept (FE_INEXACT) != 0;
     250        feupdateenv (&env);
     251        /* Ensure the following computations are performed in default rounding
     252  	 mode instead of just reusing the round to zero computation.  */
     253        asm volatile ("" : "=m" (u) : "m" (u));
     254        /* If a1 + u.d is exact, the only rounding happens during
     255  	 scaling down.  */
     256        if (j == 0)
     257  	return v.d * 0x1p-130L;
     258        /* If result rounded to zero is not subnormal, no double
     259  	 rounding will occur.  */
     260        if (v.ieee.exponent > 130)
     261  	return (a1 + u.d) * 0x1p-130L;
     262        /* If v.d * 0x1p-130L with round to zero is a subnormal above
     263  	 or equal to LDBL_MIN / 2, then v.d * 0x1p-130L shifts mantissa
     264  	 down just by 1 bit, which means v.ieee.mantissa1 |= j would
     265  	 change the round bit, not sticky or guard bit.
     266  	 v.d * 0x1p-130L never normalizes by shifting up,
     267  	 so round bit plus sticky bit should be already enough
     268  	 for proper rounding.  */
     269        if (v.ieee.exponent == 130)
     270  	{
     271  	  /* If the exponent would be in the normal range when
     272  	     rounding to normal precision with unbounded exponent
     273  	     range, the exact result is known and spurious underflows
     274  	     must be avoided on systems detecting tininess after
     275  	     rounding.  */
     276  	  if (TININESS_AFTER_ROUNDING)
     277  	    {
     278  	      w.d = a1 + u.d;
     279  	      if (w.ieee.exponent == 131)
     280  		return w.d * 0x1p-130L;
     281  	    }
     282  	  /* v.ieee.mantissa1 & 2 is LSB bit of the result before rounding,
     283  	     v.ieee.mantissa1 & 1 is the round bit and j is our sticky
     284  	     bit.  */
     285  	  w.d = 0.0L;
     286  	  w.ieee.mantissa1 = ((v.ieee.mantissa1 & 3) << 1) | j;
     287  	  w.ieee.negative = v.ieee.negative;
     288  	  v.ieee.mantissa1 &= ~3U;
     289  	  v.d *= 0x1p-130L;
     290  	  w.d *= 0x1p-2L;
     291  	  return v.d + w.d;
     292  	}
     293        v.ieee.mantissa1 |= j;
     294        return v.d * 0x1p-130L;
     295      }
     296  }
     297  libm_alias_ldouble (__fma, fma)