1  /* @(#)k_rem_pio2.c 5.1 93/09/24 */
       2  /*
       3   * ====================================================
       4   * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
       5   *
       6   * Developed at SunPro, a Sun Microsystems, Inc. business.
       7   * Permission to use, copy, modify, and distribute this
       8   * software is freely granted, provided that this notice
       9   * is preserved.
      10   * ====================================================
      11   */
      12  
      13  #if defined(LIBM_SCCS) && !defined(lint)
      14  static char rcsid[] = "$NetBSD: k_rem_pio2.c,v 1.7 1995/05/10 20:46:25 jtc Exp $";
      15  #endif
      16  
      17  /*
      18   * __kernel_rem_pio2(x,y,e0,nx,prec,ipio2)
      19   * double x[],y[]; int e0,nx,prec; int ipio2[];
      20   *
      21   * __kernel_rem_pio2 return the last three digits of N with
      22   *		y = x - N*pi/2
      23   * so that |y| < pi/2.
      24   *
      25   * The method is to compute the integer (mod 8) and fraction parts of
      26   * (2/pi)*x without doing the full multiplication. In general we
      27   * skip the part of the product that are known to be a huge integer (
      28   * more accurately, = 0 mod 8 ). Thus the number of operations are
      29   * independent of the exponent of the input.
      30   *
      31   * (2/pi) is represented by an array of 24-bit integers in ipio2[].
      32   *
      33   * Input parameters:
      34   * 	x[]	The input value (must be positive) is broken into nx
      35   *		pieces of 24-bit integers in double precision format.
      36   *		x[i] will be the i-th 24 bit of x. The scaled exponent
      37   *		of x[0] is given in input parameter e0 (i.e., x[0]*2^e0
      38   *		match x's up to 24 bits.
      39   *
      40   *		Example of breaking a double positive z into x[0]+x[1]+x[2]:
      41   *			e0 = ilogb(z)-23
      42   *			z  = scalbn(z,-e0)
      43   *		for i = 0,1,2
      44   *			x[i] = floor(z)
      45   *			z    = (z-x[i])*2**24
      46   *
      47   *
      48   *	y[]	output result in an array of double precision numbers.
      49   *		The dimension of y[] is:
      50   *			24-bit  precision	1
      51   *			53-bit  precision	2
      52   *			64-bit  precision	2
      53   *			113-bit precision	3
      54   *		The actual value is the sum of them. Thus for 113-bit
      55   *		precision, one may have to do something like:
      56   *
      57   *		long double t,w,r_head, r_tail;
      58   *		t = (long double)y[2] + (long double)y[1];
      59   *		w = (long double)y[0];
      60   *		r_head = t+w;
      61   *		r_tail = w - (r_head - t);
      62   *
      63   *	e0	The exponent of x[0]
      64   *
      65   *	nx	dimension of x[]
      66   *
      67   *  	prec	an integer indicating the precision:
      68   *			0	24  bits (single)
      69   *			1	53  bits (double)
      70   *			2	64  bits (extended)
      71   *			3	113 bits (quad)
      72   *
      73   *	ipio2[]
      74   *		integer array, contains the (24*i)-th to (24*i+23)-th
      75   *		bit of 2/pi after binary point. The corresponding
      76   *		floating value is
      77   *
      78   *			ipio2[i] * 2^(-24(i+1)).
      79   *
      80   * External function:
      81   *	double scalbn(), floor();
      82   *
      83   *
      84   * Here is the description of some local variables:
      85   *
      86   * 	jk	jk+1 is the initial number of terms of ipio2[] needed
      87   *		in the computation. The recommended value is 2,3,4,
      88   *		6 for single, double, extended,and quad.
      89   *
      90   * 	jz	local integer variable indicating the number of
      91   *		terms of ipio2[] used.
      92   *
      93   *	jx	nx - 1
      94   *
      95   *	jv	index for pointing to the suitable ipio2[] for the
      96   *		computation. In general, we want
      97   *			( 2^e0*x[0] * ipio2[jv-1]*2^(-24jv) )/8
      98   *		is an integer. Thus
      99   *			e0-3-24*jv >= 0 or (e0-3)/24 >= jv
     100   *		Hence jv = max(0,(e0-3)/24).
     101   *
     102   *	jp	jp+1 is the number of terms in PIo2[] needed, jp = jk.
     103   *
     104   * 	q[]	double array with integral value, representing the
     105   *		24-bits chunk of the product of x and 2/pi.
     106   *
     107   *	q0	the corresponding exponent of q[0]. Note that the
     108   *		exponent for q[i] would be q0-24*i.
     109   *
     110   *	PIo2[]	double precision array, obtained by cutting pi/2
     111   *		into 24 bits chunks.
     112   *
     113   *	f[]	ipio2[] in floating point
     114   *
     115   *	iq[]	integer array by breaking up q[] in 24-bits chunk.
     116   *
     117   *	fq[]	final product of x*(2/pi) in fq[0],..,fq[jk]
     118   *
     119   *	ih	integer. If >0 it indicates q[] is >= 0.5, hence
     120   *		it also indicates the *sign* of the result.
     121   *
     122   */
     123  
     124  
     125  /*
     126   * Constants:
     127   * The hexadecimal values are the intended ones for the following
     128   * constants. The decimal values may be used, provided that the
     129   * compiler will convert from decimal to binary accurately enough
     130   * to produce the hexadecimal values shown.
     131   */
     132  
     133  #include <math.h>
     134  #include <math-narrow-eval.h>
     135  #include <math_private.h>
     136  #include <libc-diag.h>
     137  
     138  static const int init_jk[] = {2,3,4,6}; /* initial value for jk */
     139  
     140  static const double PIo2[] = {
     141    1.57079625129699707031e+00, /* 0x3FF921FB, 0x40000000 */
     142    7.54978941586159635335e-08, /* 0x3E74442D, 0x00000000 */
     143    5.39030252995776476554e-15, /* 0x3CF84698, 0x80000000 */
     144    3.28200341580791294123e-22, /* 0x3B78CC51, 0x60000000 */
     145    1.27065575308067607349e-29, /* 0x39F01B83, 0x80000000 */
     146    1.22933308981111328932e-36, /* 0x387A2520, 0x40000000 */
     147    2.73370053816464559624e-44, /* 0x36E38222, 0x80000000 */
     148    2.16741683877804819444e-51, /* 0x3569F31D, 0x00000000 */
     149  };
     150  
     151  static const double
     152    zero   = 0.0,
     153    one    = 1.0,
     154    two24  = 1.67772160000000000000e+07, /* 0x41700000, 0x00000000 */
     155    twon24 = 5.96046447753906250000e-08; /* 0x3E700000, 0x00000000 */
     156  
     157  int
     158  __kernel_rem_pio2 (double *x, double *y, int e0, int nx, int prec,
     159                     const int32_t *ipio2)
     160  {
     161    int32_t jz, jx, jv, jp, jk, carry, n, iq[20], i, j, k, m, q0, ih;
     162    double z, fw, f[20], fq[20], q[20];
     163  
     164    /* initialize jk*/
     165    jk = init_jk[prec];
     166    jp = jk;
     167  
     168    /* determine jx,jv,q0, note that 3>q0 */
     169    jx = nx - 1;
     170    jv = (e0 - 3) / 24; if (jv < 0)
     171      jv = 0;
     172    q0 = e0 - 24 * (jv + 1);
     173  
     174    /* set up f[0] to f[jx+jk] where f[jx+jk] = ipio2[jv+jk] */
     175    j = jv - jx; m = jx + jk;
     176    for (i = 0; i <= m; i++, j++)
     177      f[i] = (j < 0) ? zero : (double) ipio2[j];
     178  
     179    /* compute q[0],q[1],...q[jk] */
     180    for (i = 0; i <= jk; i++)
     181      {
     182        for (j = 0, fw = 0.0; j <= jx; j++)
     183  	fw += x[j] * f[jx + i - j];
     184        q[i] = fw;
     185      }
     186  
     187    jz = jk;
     188  recompute:
     189    /* distill q[] into iq[] reversingly */
     190    for (i = 0, j = jz, z = q[jz]; j > 0; i++, j--)
     191      {
     192        fw = (double) ((int32_t) (twon24 * z));
     193        iq[i] = (int32_t) (z - two24 * fw);
     194        z = q[j - 1] + fw;
     195      }
     196  
     197    /* compute n */
     198    z = __scalbn (z, q0);                 /* actual value of z */
     199    z -= 8.0 * floor (z * 0.125);               /* trim off integer >= 8 */
     200    n = (int32_t) z;
     201    z -= (double) n;
     202    ih = 0;
     203    if (q0 > 0)           /* need iq[jz-1] to determine n */
     204      {
     205        i = (iq[jz - 1] >> (24 - q0)); n += i;
     206        iq[jz - 1] -= i << (24 - q0);
     207        ih = iq[jz - 1] >> (23 - q0);
     208      }
     209    else if (q0 == 0)
     210      ih = iq[jz - 1] >> 23;
     211    else if (z >= 0.5)
     212      ih = 2;
     213  
     214    if (ih > 0)           /* q > 0.5 */
     215      {
     216        n += 1; carry = 0;
     217        for (i = 0; i < jz; i++)          /* compute 1-q */
     218  	{
     219  	  j = iq[i];
     220  	  if (carry == 0)
     221  	    {
     222  	      if (j != 0)
     223  		{
     224  		  carry = 1; iq[i] = 0x1000000 - j;
     225  		}
     226  	    }
     227  	  else
     228  	    iq[i] = 0xffffff - j;
     229  	}
     230        if (q0 > 0)               /* rare case: chance is 1 in 12 */
     231  	{
     232  	  switch (q0)
     233  	    {
     234  	    case 1:
     235  	      iq[jz - 1] &= 0x7fffff; break;
     236  	    case 2:
     237  	      iq[jz - 1] &= 0x3fffff; break;
     238  	    }
     239  	}
     240        if (ih == 2)
     241  	{
     242  	  z = one - z;
     243  	  if (carry != 0)
     244  	    z -= __scalbn (one, q0);
     245  	}
     246      }
     247  
     248    /* check if recomputation is needed */
     249    if (z == zero)
     250      {
     251        j = 0;
     252        for (i = jz - 1; i >= jk; i--)
     253  	j |= iq[i];
     254        if (j == 0)      /* need recomputation */
     255  	{
     256  	  /* On s390x gcc 6.1 -O3 produces the warning "array subscript is below
     257  	     array bounds [-Werror=array-bounds]".  Only __ieee754_rem_pio2l
     258  	     calls __kernel_rem_pio2 for normal numbers and |x| > pi/4 in case
     259  	     of ldbl-96 and |x| > 3pi/4 in case of ldbl-128[ibm].
     260  	     Thus x can't be zero and ipio2 is not zero, too.  Thus not all iq[]
     261  	     values can't be zero.  */
     262  	  DIAG_PUSH_NEEDS_COMMENT;
     263  	  DIAG_IGNORE_NEEDS_COMMENT (6.1, "-Warray-bounds");
     264  	  for (k = 1; iq[jk - k] == 0; k++)
     265  	    ;                               /* k = no. of terms needed */
     266  	  DIAG_POP_NEEDS_COMMENT;
     267  
     268  	  for (i = jz + 1; i <= jz + k; i++) /* add q[jz+1] to q[jz+k] */
     269  	    {
     270  	      f[jx + i] = (double) ipio2[jv + i];
     271  	      for (j = 0, fw = 0.0; j <= jx; j++)
     272  		fw += x[j] * f[jx + i - j];
     273  	      q[i] = fw;
     274  	    }
     275  	  jz += k;
     276  	  goto recompute;
     277  	}
     278      }
     279  
     280    /* chop off zero terms */
     281    if (z == 0.0)
     282      {
     283        jz -= 1; q0 -= 24;
     284        while (iq[jz] == 0)
     285  	{
     286  	  jz--; q0 -= 24;
     287  	}
     288      }
     289    else           /* break z into 24-bit if necessary */
     290      {
     291        z = __scalbn (z, -q0);
     292        if (z >= two24)
     293  	{
     294  	  fw = (double) ((int32_t) (twon24 * z));
     295  	  iq[jz] = (int32_t) (z - two24 * fw);
     296  	  jz += 1; q0 += 24;
     297  	  iq[jz] = (int32_t) fw;
     298  	}
     299        else
     300  	iq[jz] = (int32_t) z;
     301      }
     302  
     303    /* convert integer "bit" chunk to floating-point value */
     304    fw = __scalbn (one, q0);
     305    for (i = jz; i >= 0; i--)
     306      {
     307        q[i] = fw * (double) iq[i]; fw *= twon24;
     308      }
     309  
     310    /* compute PIo2[0,...,jp]*q[jz,...,0] */
     311    for (i = jz; i >= 0; i--)
     312      {
     313        for (fw = 0.0, k = 0; k <= jp && k <= jz - i; k++)
     314  	fw += PIo2[k] * q[i + k];
     315        fq[jz - i] = fw;
     316      }
     317  
     318    /* compress fq[] into y[] */
     319    switch (prec)
     320      {
     321      case 0:
     322        fw = 0.0;
     323        for (i = jz; i >= 0; i--)
     324  	fw += fq[i];
     325        y[0] = (ih == 0) ? fw : -fw;
     326        break;
     327      case 1:
     328      case 2:;
     329        double fv = 0.0;
     330        for (i = jz; i >= 0; i--)
     331  	fv = math_narrow_eval (fv + fq[i]);
     332        y[0] = (ih == 0) ? fv : -fv;
     333        /* GCC mainline (to be GCC 9), as of 2018-05-22 on i686, warns
     334  	 that fq[0] may be used uninitialized.  This is not possible
     335  	 because jz is always nonnegative when the above loop
     336  	 initializing fq is executed, because the result is never zero
     337  	 to full precision (this function is not called for zero
     338  	 arguments).  */
     339        DIAG_PUSH_NEEDS_COMMENT;
     340        DIAG_IGNORE_NEEDS_COMMENT (9, "-Wmaybe-uninitialized");
     341        fv = math_narrow_eval (fq[0] - fv);
     342        DIAG_POP_NEEDS_COMMENT;
     343        for (i = 1; i <= jz; i++)
     344  	fv = math_narrow_eval (fv + fq[i]);
     345        y[1] = (ih == 0) ? fv : -fv;
     346        break;
     347      case 3:             /* painful */
     348        for (i = jz; i > 0; i--)
     349  	{
     350  	  double fv = math_narrow_eval (fq[i - 1] + fq[i]);
     351  	  fq[i] += fq[i - 1] - fv;
     352  	  fq[i - 1] = fv;
     353  	}
     354        for (i = jz; i > 1; i--)
     355  	{
     356  	  double fv = math_narrow_eval (fq[i - 1] + fq[i]);
     357  	  fq[i] += fq[i - 1] - fv;
     358  	  fq[i - 1] = fv;
     359  	}
     360        for (fw = 0.0, i = jz; i >= 2; i--)
     361  	fw += fq[i];
     362        if (ih == 0)
     363  	{
     364  	  y[0] = fq[0]; y[1] = fq[1]; y[2] = fw;
     365  	}
     366        else
     367  	{
     368  	  y[0] = -fq[0]; y[1] = -fq[1]; y[2] = -fw;
     369  	}
     370      }
     371    return n & 7;
     372  }