1  /* Implement powl for x86 using extra-precision log.
       2     Copyright (C) 2012-2023 Free Software Foundation, Inc.
       3     This file is part of the GNU C Library.
       4  
       5     The GNU C Library is free software; you can redistribute it and/or
       6     modify it under the terms of the GNU Lesser General Public
       7     License as published by the Free Software Foundation; either
       8     version 2.1 of the License, or (at your option) any later version.
       9  
      10     The GNU C Library is distributed in the hope that it will be useful,
      11     but WITHOUT ANY WARRANTY; without even the implied warranty of
      12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
      13     Lesser General Public License for more details.
      14  
      15     You should have received a copy of the GNU Lesser General Public
      16     License along with the GNU C Library; if not, see
      17     <https://www.gnu.org/licenses/>.  */
      18  
      19  #include <math.h>
      20  #include <math_private.h>
      21  #include <math-underflow.h>
      22  #include <stdbool.h>
      23  
      24  /* High parts and low parts of -log (k/16), for integer k from 12 to
      25     24.  */
      26  
      27  static const long double powl_log_table[] =
      28    {
      29      0x4.9a58844d36e49e1p-4L, -0x1.0522624fd558f574p-68L,
      30      0x3.527da7915b3c6de4p-4L, 0x1.7d4ef4b901b99b9ep-68L,
      31      0x2.22f1d044fc8f7bc8p-4L, -0x1.8e97c071a42fc388p-68L,
      32      0x1.08598b59e3a0688ap-4L, 0x3.fd9bf503372c12fcp-72L,
      33      -0x0p+0L, 0x0p+0L,
      34      -0xf.85186008b15330cp-8L, 0x1.9b47488a6687672cp-72L,
      35      -0x1.e27076e2af2e5e9ep-4L, -0xa.87ffe1fe9e155dcp-72L,
      36      -0x2.bfe60e14f27a791p-4L, 0x1.83bebf1bdb88a032p-68L,
      37      -0x3.91fef8f353443584p-4L, -0xb.b03de5ff734495cp-72L,
      38      -0x4.59d72aeae98380e8p-4L, 0xc.e0aa3be4747dc1p-72L,
      39      -0x5.1862f08717b09f4p-4L, -0x2.decdeccf1cd10578p-68L,
      40      -0x5.ce75fdaef401a738p-4L, -0x9.314feb4fbde5aaep-72L,
      41      -0x6.7cc8fb2fe612fcbp-4L, 0x2.5ca2642feb779f98p-68L,
      42    };
      43  
      44  /* High 32 bits of log2 (e), and remainder rounded to 64 bits.  */
      45  static const long double log2e_hi = 0x1.71547652p+0L;
      46  static const long double log2e_lo = 0xb.82fe1777d0ffda1p-36L;
      47  
      48  /* Given a number with high part HI and low part LO, add the number X
      49     to it and store the result in *RHI and *RLO.  It is given that
      50     either |X| < |0.7 * HI|, or HI == LO == 0, and that the values are
      51     small enough that no overflow occurs.  The result does not need to
      52     be exact to 128 bits; 78-bit accuracy of the final accumulated
      53     result suffices.  */
      54  
      55  static inline void
      56  acc_split (long double *rhi, long double *rlo, long double hi, long double lo,
      57  	   long double x)
      58  {
      59    long double thi = hi + x;
      60    long double tlo = (hi - thi) + x + lo;
      61    *rhi = thi + tlo;
      62    *rlo = (thi - *rhi) + tlo;
      63  }
      64  
      65  extern long double __powl_helper (long double x, long double y);
      66  libm_hidden_proto (__powl_helper)
      67  
      68  /* Given X a value that is finite and nonzero, or a NaN, and Y a
      69     finite nonzero value with 0x1p-79 <= |Y| <= 0x1p78, compute X to
      70     the power Y.  */
      71  
      72  long double
      73  __powl_helper (long double x, long double y)
      74  {
      75    if (isnan (x))
      76      return __ieee754_expl (y * __ieee754_logl (x));
      77    bool negate;
      78    if (x < 0)
      79      {
      80        long double absy = fabsl (y);
      81        if (absy >= 0x1p64L)
      82  	negate = false;
      83        else
      84  	{
      85  	  unsigned long long yll = absy;
      86  	  if (yll != absy)
      87  	    return __ieee754_expl (y * __ieee754_logl (x));
      88  	  negate = (yll & 1) != 0;
      89  	}
      90        x = fabsl (x);
      91      }
      92    else
      93      negate = false;
      94  
      95    /* We need to compute Y * log2 (X) to at least 64 bits after the
      96       point for normal results (that is, to at least 78 bits
      97       precision).  */
      98    int x_int_exponent;
      99    long double x_frac;
     100    x_frac = __frexpl (x, &x_int_exponent);
     101    if (x_frac <= 0x0.aaaaaaaaaaaaaaaap0L) /* 2.0L / 3.0L, rounded down */
     102      {
     103        x_frac *= 2.0;
     104        x_int_exponent--;
     105      }
     106  
     107    long double log_x_frac_hi, log_x_frac_lo;
     108    /* Determine an initial approximation to log (X_FRAC) using
     109       POWL_LOG_TABLE, and multiply by a value K/16 to reduce to an
     110       interval (24/25, 26/25).  */
     111    int k = (int) ((16.0L / x_frac) + 0.5L);
     112    log_x_frac_hi = powl_log_table[2 * k - 24];
     113    log_x_frac_lo = powl_log_table[2 * k - 23];
     114    long double x_frac_low;
     115    if (k == 16)
     116      x_frac_low = 0.0L;
     117    else
     118      {
     119        /* Mask off low 5 bits of X_FRAC so the multiplication by K/16
     120  	 is exact.  These bits are small enough that they can be
     121  	 corrected for by adding log2 (e) * X_FRAC_LOW to the final
     122  	 result.  */
     123        int32_t se;
     124        uint32_t i0, i1;
     125        GET_LDOUBLE_WORDS (se, i0, i1, x_frac);
     126        x_frac_low = x_frac;
     127        i1 &= 0xffffffe0;
     128        SET_LDOUBLE_WORDS (x_frac, se, i0, i1);
     129        x_frac_low -= x_frac;
     130        x_frac_low /= x_frac;
     131        x_frac *= k / 16.0L;
     132      }
     133  
     134    /* Now compute log (X_FRAC) for X_FRAC in (24/25, 26/25).  Separate
     135       W = X_FRAC - 1 into high 16 bits and remaining bits, so that
     136       multiplications for low-order power series terms are exact.  The
     137       remaining bits are small enough that adding a 64-bit value of
     138       log2 (1 + W_LO / (1 + W_HI)) will be a sufficient correction for
     139       them.  */
     140    long double w = x_frac - 1;
     141    long double w_hi, w_lo;
     142    int32_t se;
     143    uint32_t i0, i1;
     144    GET_LDOUBLE_WORDS (se, i0, i1, w);
     145    i0 &= 0xffff0000;
     146    i1 = 0;
     147    SET_LDOUBLE_WORDS (w_hi, se, i0, i1);
     148    w_lo = w - w_hi;
     149    long double wp = w_hi;
     150    acc_split (&log_x_frac_hi, &log_x_frac_lo, log_x_frac_hi, log_x_frac_lo, wp);
     151    wp *= -w_hi;
     152    acc_split (&log_x_frac_hi, &log_x_frac_lo, log_x_frac_hi, log_x_frac_lo,
     153  	     wp / 2.0L);
     154    wp *= -w_hi;
     155    acc_split (&log_x_frac_hi, &log_x_frac_lo, log_x_frac_hi, log_x_frac_lo,
     156  	     wp * 0x0.5555p0L); /* -W_HI**3 / 3, high part.  */
     157    acc_split (&log_x_frac_hi, &log_x_frac_lo, log_x_frac_hi, log_x_frac_lo,
     158  	     wp * 0x0.5555555555555555p-16L); /* -W_HI**3 / 3, low part.  */
     159    wp *= -w_hi;
     160    acc_split (&log_x_frac_hi, &log_x_frac_lo, log_x_frac_hi, log_x_frac_lo,
     161  	     wp / 4.0L);
     162    /* Subsequent terms are small enough that they only need be computed
     163       to 64 bits.  */
     164    for (int i = 5; i <= 17; i++)
     165      {
     166        wp *= -w_hi;
     167        acc_split (&log_x_frac_hi, &log_x_frac_lo, log_x_frac_hi, log_x_frac_lo,
     168  		 wp / i);
     169      }
     170  
     171    /* Convert LOG_X_FRAC_HI + LOG_X_FRAC_LO to a base-2 logarithm.  */
     172    long double log2_x_frac_hi, log2_x_frac_lo;
     173    long double log_x_frac_hi32, log_x_frac_lo64;
     174    GET_LDOUBLE_WORDS (se, i0, i1, log_x_frac_hi);
     175    i1 = 0;
     176    SET_LDOUBLE_WORDS (log_x_frac_hi32, se, i0, i1);
     177    log_x_frac_lo64 = (log_x_frac_hi - log_x_frac_hi32) + log_x_frac_lo;
     178    long double log2_x_frac_hi1 = log_x_frac_hi32 * log2e_hi;
     179    long double log2_x_frac_lo1
     180      = log_x_frac_lo64 * log2e_hi + log_x_frac_hi * log2e_lo;
     181    log2_x_frac_hi = log2_x_frac_hi1 + log2_x_frac_lo1;
     182    log2_x_frac_lo = (log2_x_frac_hi1 - log2_x_frac_hi) + log2_x_frac_lo1;
     183  
     184    /* Correct for the masking off of W_LO.  */
     185    long double log2_1p_w_lo;
     186    asm ("fyl2xp1"
     187         : "=t" (log2_1p_w_lo)
     188         : "0" (w_lo / (1.0L + w_hi)), "u" (1.0L)
     189         : "st(1)");
     190    acc_split (&log2_x_frac_hi, &log2_x_frac_lo, log2_x_frac_hi, log2_x_frac_lo,
     191  	     log2_1p_w_lo);
     192  
     193    /* Correct for the masking off of X_FRAC_LOW.  */
     194    acc_split (&log2_x_frac_hi, &log2_x_frac_lo, log2_x_frac_hi, log2_x_frac_lo,
     195  	     x_frac_low * M_LOG2El);
     196  
     197    /* Add the integer and fractional parts of the base-2 logarithm.  */
     198    long double log2_x_hi, log2_x_lo;
     199    log2_x_hi = x_int_exponent + log2_x_frac_hi;
     200    log2_x_lo = ((x_int_exponent - log2_x_hi) + log2_x_frac_hi) + log2_x_frac_lo;
     201  
     202    /* Compute the base-2 logarithm of the result.  */
     203    long double log2_res_hi, log2_res_lo;
     204    long double log2_x_hi32, log2_x_lo64;
     205    GET_LDOUBLE_WORDS (se, i0, i1, log2_x_hi);
     206    i1 = 0;
     207    SET_LDOUBLE_WORDS (log2_x_hi32, se, i0, i1);
     208    log2_x_lo64 = (log2_x_hi - log2_x_hi32) + log2_x_lo;
     209    long double y_hi32, y_lo32;
     210    GET_LDOUBLE_WORDS (se, i0, i1, y);
     211    i1 = 0;
     212    SET_LDOUBLE_WORDS (y_hi32, se, i0, i1);
     213    y_lo32 = y - y_hi32;
     214    log2_res_hi = log2_x_hi32 * y_hi32;
     215    log2_res_lo = log2_x_hi32 * y_lo32 + log2_x_lo64 * y;
     216  
     217    /* Split the base-2 logarithm of the result into integer and
     218       fractional parts.  */
     219    long double log2_res_int = roundl (log2_res_hi);
     220    long double log2_res_frac = log2_res_hi - log2_res_int + log2_res_lo;
     221    /* If the integer part is very large, the computed fractional part
     222       may be outside the valid range for f2xm1.  */
     223    if (fabsl (log2_res_int) > 16500)
     224      log2_res_frac = 0;
     225  
     226    /* Compute the final result.  */
     227    long double res;
     228    asm ("f2xm1" : "=t" (res) : "0" (log2_res_frac));
     229    res += 1.0L;
     230    if (negate)
     231      res = -res;
     232    asm ("fscale" : "=t" (res) : "0" (res), "u" (log2_res_int));
     233    math_check_force_underflow (res);
     234    return res;
     235  }
     236  
     237  libm_hidden_def (__powl_helper)