(root)/
gcc-13.2.0/
libgcc/
config/
libbid/
bid128_add.c
       1  /* Copyright (C) 2007-2023 Free Software Foundation, Inc.
       2  
       3  This file is part of GCC.
       4  
       5  GCC is free software; you can redistribute it and/or modify it under
       6  the terms of the GNU General Public License as published by the Free
       7  Software Foundation; either version 3, or (at your option) any later
       8  version.
       9  
      10  GCC is distributed in the hope that it will be useful, but WITHOUT ANY
      11  WARRANTY; without even the implied warranty of MERCHANTABILITY or
      12  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
      13  for more details.
      14  
      15  Under Section 7 of GPL version 3, you are granted additional
      16  permissions described in the GCC Runtime Library Exception, version
      17  3.1, as published by the Free Software Foundation.
      18  
      19  You should have received a copy of the GNU General Public License and
      20  a copy of the GCC Runtime Library Exception along with this program;
      21  see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
      22  <http://www.gnu.org/licenses/>.  */
      23  
      24  #include "bid_internal.h"
      25  
      26  
// bid64dq_add: BID64 + BID128 -> BID64
// Widens the 64-bit decimal operand x to 128 bits with bid64_to_bid128,
// then delegates the addition (and the rounding to a 64-bit result) to
// bid64qq_add.
#if DECIMAL_CALL_BY_REFERENCE
void
bid64dq_add (UINT64 * pres, UINT64 * px, UINT128 * py
	     _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
	     _EXC_INFO_PARAM) {
  UINT64 x = *px;
#if !DECIMAL_GLOBAL_ROUNDING
  // local copy of the rounding mode (presumably consumed by the
  // _RND_MODE_ARG macro below -- see bid_internal.h)
  unsigned int rnd_mode = *prnd_mode;
#endif
#else
UINT64
bid64dq_add (UINT64 x, UINT128 y
	     _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
	     _EXC_INFO_PARAM) {
#endif
  UINT64 res = 0xbaddbaddbaddbaddull;	// debug sentinel; always overwritten
  UINT128 x1;	// x converted to BID128

#if DECIMAL_CALL_BY_REFERENCE
  bid64_to_bid128 (&x1, &x _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
  bid64qq_add (&res, &x1, py
	       _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
	       _EXC_INFO_ARG);
#else
  x1 = bid64_to_bid128 (x _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
  res = bid64qq_add (x1, y
		     _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
		     _EXC_INFO_ARG);
#endif
  BID_RETURN (res);
}
      58  
      59  
// bid64qd_add: BID128 + BID64 -> BID64
// Widens the 64-bit decimal operand y to 128 bits with bid64_to_bid128,
// then delegates the addition (and the rounding to a 64-bit result) to
// bid64qq_add.
#if DECIMAL_CALL_BY_REFERENCE
void
bid64qd_add (UINT64 * pres, UINT128 * px, UINT64 * py
	     _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
	     _EXC_INFO_PARAM) {
  UINT64 y = *py;
#if !DECIMAL_GLOBAL_ROUNDING
  // local copy of the rounding mode (presumably consumed by the
  // _RND_MODE_ARG macro below -- see bid_internal.h)
  unsigned int rnd_mode = *prnd_mode;
#endif
#else
UINT64
bid64qd_add (UINT128 x, UINT64 y
	     _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
	     _EXC_INFO_PARAM) {
#endif
  UINT64 res = 0xbaddbaddbaddbaddull;	// debug sentinel; always overwritten
  UINT128 y1;	// y converted to BID128

#if DECIMAL_CALL_BY_REFERENCE
  bid64_to_bid128 (&y1, &y _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
  bid64qq_add (&res, px, &y1
	       _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
	       _EXC_INFO_ARG);
#else
  y1 = bid64_to_bid128 (y _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
  res = bid64qq_add (x, y1
		     _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
		     _EXC_INFO_ARG);
#endif
  BID_RETURN (res);
}
      91  
      92  
// bid64qq_add: BID128 + BID128 -> BID64
// Implemented as the fused multiply-add 1 * x + y; bid64qqq_fma performs
// the operation with a single final rounding to the 64-bit destination
// format and raises any exception flags.
#if DECIMAL_CALL_BY_REFERENCE
void
bid64qq_add (UINT64 * pres, UINT128 * px, UINT128 * py
	     _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
	     _EXC_INFO_PARAM) {
  UINT128 x = *px, y = *py;
#if !DECIMAL_GLOBAL_ROUNDING
  // local copy of the rounding mode (presumably consumed by the
  // _RND_MODE_ARG macro below -- see bid_internal.h)
  unsigned int rnd_mode = *prnd_mode;
#endif
#else
UINT64
bid64qq_add (UINT128 x, UINT128 y
	     _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
	     _EXC_INFO_PARAM) {
#endif

  // BID128 constant +1: coefficient 1 in the low word, biased exponent
  // for 10^0 (0x1820 << 49) in the high word
  UINT128 one = { {0x0000000000000001ull, 0x3040000000000000ull}
  };
  UINT64 res = 0xbaddbaddbaddbaddull;	// debug sentinel; always overwritten

  // arrange the two 64-bit halves of `one` in the platform's word order
  BID_SWAP128 (one);
#if DECIMAL_CALL_BY_REFERENCE
  bid64qqq_fma (&res, &one, &x, &y
		_RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
		_EXC_INFO_ARG);
#else
  res = bid64qqq_fma (one, x, y
		      _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
		      _EXC_INFO_ARG);
#endif
  BID_RETURN (res);
}
     125  
     126  
// bid128dd_add: BID64 + BID64 -> BID128
// Widens both 64-bit decimal operands to 128 bits (exact conversions),
// then delegates to bid128_add.
#if DECIMAL_CALL_BY_REFERENCE
void
bid128dd_add (UINT128 * pres, UINT64 * px, UINT64 * py
	      _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
	      _EXC_INFO_PARAM) {
  UINT64 x = *px, y = *py;
#if !DECIMAL_GLOBAL_ROUNDING
  // local copy of the rounding mode (presumably consumed by the
  // _RND_MODE_ARG macro below -- see bid_internal.h)
  unsigned int rnd_mode = *prnd_mode;
#endif
#else
UINT128
bid128dd_add (UINT64 x, UINT64 y
	      _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
	      _EXC_INFO_PARAM) {
#endif
  UINT128 res = { {0xbaddbaddbaddbaddull, 0xbaddbaddbaddbaddull}
  };	// debug sentinel; always overwritten
  UINT128 x1, y1;	// x and y converted to BID128

#if DECIMAL_CALL_BY_REFERENCE
  bid64_to_bid128 (&x1, &x _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
  bid64_to_bid128 (&y1, &y _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
  bid128_add (&res, &x1, &y1
	      _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
	      _EXC_INFO_ARG);
#else
  x1 = bid64_to_bid128 (x _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
  y1 = bid64_to_bid128 (y _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
  res = bid128_add (x1, y1
		    _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
		    _EXC_INFO_ARG);
#endif
  BID_RETURN (res);
}
     161  
     162  
// bid128dq_add: BID64 + BID128 -> BID128
// Widens the 64-bit decimal operand x to 128 bits, then delegates to
// bid128_add.
#if DECIMAL_CALL_BY_REFERENCE
void
bid128dq_add (UINT128 * pres, UINT64 * px, UINT128 * py
	      _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
	      _EXC_INFO_PARAM) {
  UINT64 x = *px;
#if !DECIMAL_GLOBAL_ROUNDING
  // local copy of the rounding mode (presumably consumed by the
  // _RND_MODE_ARG macro below -- see bid_internal.h)
  unsigned int rnd_mode = *prnd_mode;
#endif
#else
UINT128
bid128dq_add (UINT64 x, UINT128 y
	      _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
	      _EXC_INFO_PARAM) {
#endif
  UINT128 res = { {0xbaddbaddbaddbaddull, 0xbaddbaddbaddbaddull}
  };	// debug sentinel; always overwritten
  UINT128 x1;	// x converted to BID128

#if DECIMAL_CALL_BY_REFERENCE
  bid64_to_bid128 (&x1, &x _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
  bid128_add (&res, &x1, py
	      _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
	      _EXC_INFO_ARG);
#else
  x1 = bid64_to_bid128 (x _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
  res = bid128_add (x1, y
		    _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
		    _EXC_INFO_ARG);
#endif
  BID_RETURN (res);
}
     195  
     196  
// bid128qd_add: BID128 + BID64 -> BID128
// Widens the 64-bit decimal operand y to 128 bits, then delegates to
// bid128_add.
#if DECIMAL_CALL_BY_REFERENCE
void
bid128qd_add (UINT128 * pres, UINT128 * px, UINT64 * py
	      _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
	      _EXC_INFO_PARAM) {
  UINT64 y = *py;
#if !DECIMAL_GLOBAL_ROUNDING
  // local copy of the rounding mode (presumably consumed by the
  // _RND_MODE_ARG macro below -- see bid_internal.h)
  unsigned int rnd_mode = *prnd_mode;
#endif
#else
UINT128
bid128qd_add (UINT128 x, UINT64 y
	      _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
	      _EXC_INFO_PARAM) {
#endif
  UINT128 res = { {0xbaddbaddbaddbaddull, 0xbaddbaddbaddbaddull}
  };	// debug sentinel; always overwritten
  UINT128 y1;	// y converted to BID128

#if DECIMAL_CALL_BY_REFERENCE
  bid64_to_bid128 (&y1, &y _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
  bid128_add (&res, px, &y1
	      _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
	      _EXC_INFO_ARG);
#else
  y1 = bid64_to_bid128 (y _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
  res = bid128_add (x, y1
		    _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
		    _EXC_INFO_ARG);
#endif
  BID_RETURN (res);
}
     229  
     230  
     231  // bid128_add stands for bid128qq_add
     232  
     233  
     234  /*****************************************************************************
     235   *  BID64/BID128 sub
     236   ****************************************************************************/
     237  
// bid64dq_sub: BID64 - BID128 -> BID64
// Widens the 64-bit decimal operand x to 128 bits, then delegates the
// subtraction (and the rounding to a 64-bit result) to bid64qq_sub.
#if DECIMAL_CALL_BY_REFERENCE
void
bid64dq_sub (UINT64 * pres, UINT64 * px, UINT128 * py
	     _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
	     _EXC_INFO_PARAM) {
  UINT64 x = *px;
#if !DECIMAL_GLOBAL_ROUNDING
  // local copy of the rounding mode (presumably consumed by the
  // _RND_MODE_ARG macro below -- see bid_internal.h)
  unsigned int rnd_mode = *prnd_mode;
#endif
#else
UINT64
bid64dq_sub (UINT64 x, UINT128 y
	     _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
	     _EXC_INFO_PARAM) {
#endif
  UINT64 res = 0xbaddbaddbaddbaddull;	// debug sentinel; always overwritten
  UINT128 x1;	// x converted to BID128

#if DECIMAL_CALL_BY_REFERENCE
  bid64_to_bid128 (&x1, &x _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
  bid64qq_sub (&res, &x1, py
	       _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
	       _EXC_INFO_ARG);
#else
  x1 = bid64_to_bid128 (x _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
  res = bid64qq_sub (x1, y
		     _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
		     _EXC_INFO_ARG);
#endif
  BID_RETURN (res);
}
     269  
     270  
// bid64qd_sub: BID128 - BID64 -> BID64
// Widens the 64-bit decimal operand y to 128 bits, then delegates the
// subtraction (and the rounding to a 64-bit result) to bid64qq_sub.
#if DECIMAL_CALL_BY_REFERENCE
void
bid64qd_sub (UINT64 * pres, UINT128 * px, UINT64 * py
	     _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
	     _EXC_INFO_PARAM) {
  UINT64 y = *py;
#if !DECIMAL_GLOBAL_ROUNDING
  // local copy of the rounding mode (presumably consumed by the
  // _RND_MODE_ARG macro below -- see bid_internal.h)
  unsigned int rnd_mode = *prnd_mode;
#endif
#else
UINT64
bid64qd_sub (UINT128 x, UINT64 y
	     _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
	     _EXC_INFO_PARAM) {
#endif
  UINT64 res = 0xbaddbaddbaddbaddull;	// debug sentinel; always overwritten
  UINT128 y1;	// y converted to BID128

#if DECIMAL_CALL_BY_REFERENCE
  bid64_to_bid128 (&y1, &y _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
  bid64qq_sub (&res, px, &y1
	       _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
	       _EXC_INFO_ARG);
#else
  y1 = bid64_to_bid128 (y _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
  res = bid64qq_sub (x, y1
		     _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
		     _EXC_INFO_ARG);
#endif
  BID_RETURN (res);
}
     302  
     303  
// bid64qq_sub: BID128 - BID128 -> BID64
// Implemented as the fused multiply-add 1 * x + (-y): the sign of y is
// flipped first (unless y is a NaN, whose encoding is left untouched),
// then bid64qqq_fma performs the operation with a single final rounding
// to the 64-bit destination format.
#if DECIMAL_CALL_BY_REFERENCE
void
bid64qq_sub (UINT64 * pres, UINT128 * px, UINT128 * py
	     _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
	     _EXC_INFO_PARAM) {
  UINT128 x = *px, y = *py;
#if !DECIMAL_GLOBAL_ROUNDING
  // local copy of the rounding mode (presumably consumed by the
  // _RND_MODE_ARG macro below -- see bid_internal.h)
  unsigned int rnd_mode = *prnd_mode;
#endif
#else
UINT64
bid64qq_sub (UINT128 x, UINT128 y
	     _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
	     _EXC_INFO_PARAM) {
#endif

  // BID128 constant +1: coefficient 1 in the low word, biased exponent
  // for 10^0 (0x1820 << 49) in the high word
  UINT128 one = { {0x0000000000000001ull, 0x3040000000000000ull}
  };
  UINT64 res = 0xbaddbaddbaddbaddull;	// debug sentinel; always overwritten
  UINT64 y_sign;

  // arrange the two 64-bit halves of `one` in the platform's word order
  BID_SWAP128 (one);
  if ((y.w[HIGH_128W] & MASK_NAN) != MASK_NAN) {	// y is not NAN
    // change its sign
    y_sign = y.w[HIGH_128W] & MASK_SIGN;	// 0 for positive, MASK_SIGN for negative
    if (y_sign)
      y.w[HIGH_128W] = y.w[HIGH_128W] & 0x7fffffffffffffffull;	// clear sign bit
    else
      y.w[HIGH_128W] = y.w[HIGH_128W] | 0x8000000000000000ull;	// set sign bit
  }
#if DECIMAL_CALL_BY_REFERENCE
  bid64qqq_fma (&res, &one, &x, &y
		_RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
		_EXC_INFO_ARG);
#else
  res = bid64qqq_fma (one, x, y
		      _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
		      _EXC_INFO_ARG);
#endif
  BID_RETURN (res);
}
     345  
     346  
// bid128dd_sub: BID64 - BID64 -> BID128
// Widens both 64-bit decimal operands to 128 bits (exact conversions),
// then delegates to bid128_sub.
#if DECIMAL_CALL_BY_REFERENCE
void
bid128dd_sub (UINT128 * pres, UINT64 * px, UINT64 * py
	      _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
	      _EXC_INFO_PARAM) {
  UINT64 x = *px, y = *py;
#if !DECIMAL_GLOBAL_ROUNDING
  // local copy of the rounding mode (presumably consumed by the
  // _RND_MODE_ARG macro below -- see bid_internal.h)
  unsigned int rnd_mode = *prnd_mode;
#endif
#else
UINT128
bid128dd_sub (UINT64 x, UINT64 y
	      _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
	      _EXC_INFO_PARAM) {
#endif
  UINT128 res = { {0xbaddbaddbaddbaddull, 0xbaddbaddbaddbaddull}
  };	// debug sentinel; always overwritten
  UINT128 x1, y1;	// x and y converted to BID128

#if DECIMAL_CALL_BY_REFERENCE
  bid64_to_bid128 (&x1, &x _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
  bid64_to_bid128 (&y1, &y _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
  bid128_sub (&res, &x1, &y1
	      _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
	      _EXC_INFO_ARG);
#else
  x1 = bid64_to_bid128 (x _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
  y1 = bid64_to_bid128 (y _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
  res = bid128_sub (x1, y1
		    _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
		    _EXC_INFO_ARG);
#endif
  BID_RETURN (res);
}
     381  
     382  
// bid128dq_sub: BID64 - BID128 -> BID128
// Widens the 64-bit decimal operand x to 128 bits, then delegates to
// bid128_sub.
#if DECIMAL_CALL_BY_REFERENCE
void
bid128dq_sub (UINT128 * pres, UINT64 * px, UINT128 * py
	      _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
	      _EXC_INFO_PARAM) {
  UINT64 x = *px;
#if !DECIMAL_GLOBAL_ROUNDING
  // local copy of the rounding mode (presumably consumed by the
  // _RND_MODE_ARG macro below -- see bid_internal.h)
  unsigned int rnd_mode = *prnd_mode;
#endif
#else
UINT128
bid128dq_sub (UINT64 x, UINT128 y
	      _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
	      _EXC_INFO_PARAM) {
#endif
  UINT128 res = { {0xbaddbaddbaddbaddull, 0xbaddbaddbaddbaddull}
  };	// debug sentinel; always overwritten
  UINT128 x1;	// x converted to BID128

#if DECIMAL_CALL_BY_REFERENCE
  bid64_to_bid128 (&x1, &x _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
  bid128_sub (&res, &x1, py
	      _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
	      _EXC_INFO_ARG);
#else
  x1 = bid64_to_bid128 (x _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
  res = bid128_sub (x1, y
		    _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
		    _EXC_INFO_ARG);
#endif
  BID_RETURN (res);
}
     415  
     416  
// bid128qd_sub: BID128 - BID64 -> BID128
// Widens the 64-bit decimal operand y to 128 bits, then delegates to
// bid128_sub.
#if DECIMAL_CALL_BY_REFERENCE
void
bid128qd_sub (UINT128 * pres, UINT128 * px, UINT64 * py
	      _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
	      _EXC_INFO_PARAM) {
  UINT64 y = *py;
#if !DECIMAL_GLOBAL_ROUNDING
  // local copy of the rounding mode (presumably consumed by the
  // _RND_MODE_ARG macro below -- see bid_internal.h)
  unsigned int rnd_mode = *prnd_mode;
#endif
#else
UINT128
bid128qd_sub (UINT128 x, UINT64 y
	      _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
	      _EXC_INFO_PARAM) {
#endif
  UINT128 res = { {0xbaddbaddbaddbaddull, 0xbaddbaddbaddbaddull}
  };	// debug sentinel; always overwritten
  UINT128 y1;	// y converted to BID128

#if DECIMAL_CALL_BY_REFERENCE
  bid64_to_bid128 (&y1, &y _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
  bid128_sub (&res, px, &y1
	      _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
	      _EXC_INFO_ARG);
#else
  y1 = bid64_to_bid128 (y _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
  res = bid128_sub (x, y1
		    _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
		    _EXC_INFO_ARG);
#endif
  BID_RETURN (res);
}
     449  
     450  #if DECIMAL_CALL_BY_REFERENCE
     451  void
     452  bid128_add (UINT128 * pres, UINT128 * px, UINT128 * py
     453  	    _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
     454  	    _EXC_INFO_PARAM) {
     455    UINT128 x = *px, y = *py;
     456  #if !DECIMAL_GLOBAL_ROUNDING
     457    unsigned int rnd_mode = *prnd_mode;
     458  #endif
     459  #else
     460  UINT128
     461  bid128_add (UINT128 x, UINT128 y
     462  	    _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
     463  	    _EXC_INFO_PARAM) {
     464  #endif
     465    UINT128 res = { {0xbaddbaddbaddbaddull, 0xbaddbaddbaddbaddull}
     466    };
     467    UINT64 x_sign, y_sign, tmp_sign;
     468    UINT64 x_exp, y_exp, tmp_exp;	// e1 = x_exp, e2 = y_exp
     469    UINT64 C1_hi, C2_hi, tmp_signif_hi;
     470    UINT64 C1_lo, C2_lo, tmp_signif_lo;
     471    // Note: C1.w[1], C1.w[0] represent C1_hi, C1_lo (all UINT64)
     472    // Note: C2.w[1], C2.w[0] represent C2_hi, C2_lo (all UINT64)
     473    UINT64 tmp64, tmp64A, tmp64B;
     474    BID_UI64DOUBLE tmp1, tmp2;
     475    int x_nr_bits, y_nr_bits;
     476    int q1, q2, delta, scale, x1, ind, shift, tmp_inexact = 0;
     477    UINT64 halfulp64;
     478    UINT128 halfulp128;
     479    UINT128 C1, C2;
     480    UINT128 ten2m1;
     481    UINT128 highf2star;		// top 128 bits in f2*; low 128 bits in R256[1], R256[0]
     482    UINT256 P256, Q256, R256;
     483    int is_inexact = 0, is_midpoint_lt_even = 0, is_midpoint_gt_even = 0;
     484    int is_inexact_lt_midpoint = 0, is_inexact_gt_midpoint = 0;
     485    int second_pass = 0;
     486  
     487    BID_SWAP128 (x);
     488    BID_SWAP128 (y);
     489    x_sign = x.w[1] & MASK_SIGN;	// 0 for positive, MASK_SIGN for negative
     490    y_sign = y.w[1] & MASK_SIGN;	// 0 for positive, MASK_SIGN for negative
     491  
     492    // check for NaN or Infinity
     493    if (((x.w[1] & MASK_SPECIAL) == MASK_SPECIAL)
     494        || ((y.w[1] & MASK_SPECIAL) == MASK_SPECIAL)) {
     495      // x is special or y is special
     496      if ((x.w[1] & MASK_NAN) == MASK_NAN) {	// x is NAN
     497        // check first for non-canonical NaN payload
     498        if (((x.w[1] & 0x00003fffffffffffull) > 0x0000314dc6448d93ull) ||
     499  	  (((x.w[1] & 0x00003fffffffffffull) == 0x0000314dc6448d93ull)
     500  	   && (x.w[0] > 0x38c15b09ffffffffull))) {
     501  	x.w[1] = x.w[1] & 0xffffc00000000000ull;
     502  	x.w[0] = 0x0ull;
     503        }
     504        if ((x.w[1] & MASK_SNAN) == MASK_SNAN) {	// x is SNAN
     505  	// set invalid flag
     506  	*pfpsf |= INVALID_EXCEPTION;
     507  	// return quiet (x)
     508  	res.w[1] = x.w[1] & 0xfc003fffffffffffull;
     509  	// clear out also G[6]-G[16]
     510  	res.w[0] = x.w[0];
     511        } else {	// x is QNaN
     512  	// return x
     513  	res.w[1] = x.w[1] & 0xfc003fffffffffffull;
     514  	// clear out G[6]-G[16]
     515  	res.w[0] = x.w[0];
     516  	// if y = SNaN signal invalid exception
     517  	if ((y.w[1] & MASK_SNAN) == MASK_SNAN) {
     518  	  // set invalid flag
     519  	  *pfpsf |= INVALID_EXCEPTION;
     520  	}
     521        }
     522        BID_SWAP128 (res);
     523        BID_RETURN (res);
     524      } else if ((y.w[1] & MASK_NAN) == MASK_NAN) {	// y is NAN
     525        // check first for non-canonical NaN payload
     526        if (((y.w[1] & 0x00003fffffffffffull) > 0x0000314dc6448d93ull) ||
     527  	  (((y.w[1] & 0x00003fffffffffffull) == 0x0000314dc6448d93ull)
     528  	   && (y.w[0] > 0x38c15b09ffffffffull))) {
     529  	y.w[1] = y.w[1] & 0xffffc00000000000ull;
     530  	y.w[0] = 0x0ull;
     531        }
     532        if ((y.w[1] & MASK_SNAN) == MASK_SNAN) {	// y is SNAN
     533  	// set invalid flag
     534  	*pfpsf |= INVALID_EXCEPTION;
     535  	// return quiet (y)
     536  	res.w[1] = y.w[1] & 0xfc003fffffffffffull;
     537  	// clear out also G[6]-G[16]
     538  	res.w[0] = y.w[0];
     539        } else {	// y is QNaN
     540  	// return y
     541  	res.w[1] = y.w[1] & 0xfc003fffffffffffull;
     542  	// clear out G[6]-G[16]
     543  	res.w[0] = y.w[0];
     544        }
     545        BID_SWAP128 (res);
     546        BID_RETURN (res);
    } else {	// neither x nor y is NaN; at least one is infinity
     548        if ((x.w[1] & MASK_ANY_INF) == MASK_INF) {	// x is infinity
     549  	if ((y.w[1] & MASK_ANY_INF) == MASK_INF) {	// y is infinity
     550  	  // if same sign, return either of them
     551  	  if ((x.w[1] & MASK_SIGN) == (y.w[1] & MASK_SIGN)) {
     552  	    res.w[1] = x_sign | MASK_INF;
     553  	    res.w[0] = 0x0ull;
     554  	  } else {	// x and y are infinities of opposite signs
     555  	    // set invalid flag
     556  	    *pfpsf |= INVALID_EXCEPTION;
     557  	    // return QNaN Indefinite
     558  	    res.w[1] = 0x7c00000000000000ull;
     559  	    res.w[0] = 0x0000000000000000ull;
     560  	  }
     561  	} else {	// y is 0 or finite
     562  	  // return x
     563  	  res.w[1] = x_sign | MASK_INF;
     564  	  res.w[0] = 0x0ull;
     565  	}
     566        } else {	// x is not NaN or infinity, so y must be infinity
     567  	res.w[1] = y_sign | MASK_INF;
     568  	res.w[0] = 0x0ull;
     569        }
     570        BID_SWAP128 (res);
     571        BID_RETURN (res);
     572      }
     573    }
     574    // unpack the arguments
     575  
     576    // unpack x 
     577    C1_hi = x.w[1] & MASK_COEFF;
     578    C1_lo = x.w[0];
     579    // test for non-canonical values:
     580    // - values whose encoding begins with x00, x01, or x10 and whose 
     581    //   coefficient is larger than 10^34 -1, or
     582    // - values whose encoding begins with x1100, x1101, x1110 (if NaNs 
  //   and infinities were eliminated already this test is reduced to 
     584    //   checking for x10x) 
     585  
     586    // x is not infinity; check for non-canonical values - treated as zero
     587    if ((x.w[1] & 0x6000000000000000ull) == 0x6000000000000000ull) {
     588      // G0_G1=11; non-canonical
     589      x_exp = (x.w[1] << 2) & MASK_EXP;	// biased and shifted left 49 bits
     590      C1_hi = 0;	// significand high
     591      C1_lo = 0;	// significand low
     592    } else {	// G0_G1 != 11
     593      x_exp = x.w[1] & MASK_EXP;	// biased and shifted left 49 bits
     594      if (C1_hi > 0x0001ed09bead87c0ull ||
     595  	(C1_hi == 0x0001ed09bead87c0ull
     596  	 && C1_lo > 0x378d8e63ffffffffull)) {
     597        // x is non-canonical if coefficient is larger than 10^34 -1
     598        C1_hi = 0;
     599        C1_lo = 0;
     600      } else {	// canonical
     601        ;
     602      }
     603    }
     604  
     605    // unpack y  
     606    C2_hi = y.w[1] & MASK_COEFF;
     607    C2_lo = y.w[0];
     608    // y is not infinity; check for non-canonical values - treated as zero 
     609    if ((y.w[1] & 0x6000000000000000ull) == 0x6000000000000000ull) {
     610      // G0_G1=11; non-canonical 
     611      y_exp = (y.w[1] << 2) & MASK_EXP;	// biased and shifted left 49 bits
     612      C2_hi = 0;	// significand high
     613      C2_lo = 0;	// significand low 
     614    } else {	// G0_G1 != 11 
     615      y_exp = y.w[1] & MASK_EXP;	// biased and shifted left 49 bits
     616      if (C2_hi > 0x0001ed09bead87c0ull ||
     617  	(C2_hi == 0x0001ed09bead87c0ull
     618  	 && C2_lo > 0x378d8e63ffffffffull)) {
     619        // y is non-canonical if coefficient is larger than 10^34 -1 
     620        C2_hi = 0;
     621        C2_lo = 0;
     622      } else {	// canonical
     623        ;
     624      }
     625    }
     626  
     627    if ((C1_hi == 0x0ull) && (C1_lo == 0x0ull)) {
     628      // x is 0 and y is not special
     629      // if y is 0 return 0 with the smaller exponent
     630      if ((C2_hi == 0x0ull) && (C2_lo == 0x0ull)) {
     631        if (x_exp < y_exp)
     632  	res.w[1] = x_exp;
     633        else
     634  	res.w[1] = y_exp;
     635        if (x_sign && y_sign)
     636  	res.w[1] = res.w[1] | x_sign;	// both negative
     637        else if (rnd_mode == ROUNDING_DOWN && x_sign != y_sign)
     638  	res.w[1] = res.w[1] | 0x8000000000000000ull;	// -0
     639        // else; // res = +0
     640        res.w[0] = 0;
     641      } else {
     642        // for 0 + y return y, with the preferred exponent
     643        if (y_exp <= x_exp) {
     644  	res.w[1] = y.w[1];
     645  	res.w[0] = y.w[0];
     646        } else {	// if y_exp > x_exp
     647  	// return (C2 * 10^scale) * 10^(y_exp - scale)
     648  	// where scale = min (P34-q2, y_exp-x_exp)
     649  	// determine q2 = nr. of decimal digits in y
     650  	//  determine first the nr. of bits in y (y_nr_bits)
     651  
     652  	if (C2_hi == 0) {	// y_bits is the nr. of bits in C2_lo
     653  	  if (C2_lo >= 0x0020000000000000ull) {	// y >= 2^53
     654  	    // split the 64-bit value in two 32-bit halves to avoid 
     655  	    // rounding errors
     656  	    if (C2_lo >= 0x0000000100000000ull) {	// y >= 2^32
     657  	      tmp2.d = (double) (C2_lo >> 32);	// exact conversion
     658  	      y_nr_bits =
     659  		32 +
     660  		((((unsigned int) (tmp2.ui64 >> 52)) & 0x7ff) - 0x3ff);
     661  	    } else {	// y < 2^32
     662  	      tmp2.d = (double) (C2_lo);	// exact conversion
     663  	      y_nr_bits =
     664  		((((unsigned int) (tmp2.ui64 >> 52)) & 0x7ff) - 0x3ff);
     665  	    }
     666  	  } else {	// if y < 2^53
     667  	    tmp2.d = (double) C2_lo;	// exact conversion
     668  	    y_nr_bits =
     669  	      ((((unsigned int) (tmp2.ui64 >> 52)) & 0x7ff) - 0x3ff);
     670  	  }
     671  	} else {	// C2_hi != 0 => nr. bits = 64 + nr_bits (C2_hi)
     672  	  tmp2.d = (double) C2_hi;	// exact conversion
     673  	  y_nr_bits =
     674  	    64 + ((((unsigned int) (tmp2.ui64 >> 52)) & 0x7ff) - 0x3ff);
     675  	}
     676  	q2 = nr_digits[y_nr_bits].digits;
     677  	if (q2 == 0) {
     678  	  q2 = nr_digits[y_nr_bits].digits1;
     679  	  if (C2_hi > nr_digits[y_nr_bits].threshold_hi ||
     680  	      (C2_hi == nr_digits[y_nr_bits].threshold_hi &&
     681  	       C2_lo >= nr_digits[y_nr_bits].threshold_lo))
     682  	    q2++;
     683  	}
     684  	// return (C2 * 10^scale) * 10^(y_exp - scale)
     685  	// where scale = min (P34-q2, y_exp-x_exp)
     686  	scale = P34 - q2;
     687  	ind = (y_exp - x_exp) >> 49;
     688  	if (ind < scale)
     689  	  scale = ind;
     690  	if (scale == 0) {
     691  	  res.w[1] = y.w[1];
     692  	  res.w[0] = y.w[0];
     693  	} else if (q2 <= 19) {	// y fits in 64 bits 
     694  	  if (scale <= 19) {	// 10^scale fits in 64 bits
     695  	    // 64 x 64 C2_lo * ten2k64[scale]
     696  	    __mul_64x64_to_128MACH (res, C2_lo, ten2k64[scale]);
     697  	  } else {	// 10^scale fits in 128 bits
     698  	    // 64 x 128 C2_lo * ten2k128[scale - 20]
     699  	    __mul_128x64_to_128 (res, C2_lo, ten2k128[scale - 20]);
     700  	  }
     701  	} else {	// y fits in 128 bits, but 10^scale must fit in 64 bits 
     702  	  // 64 x 128 ten2k64[scale] * C2
     703  	  C2.w[1] = C2_hi;
     704  	  C2.w[0] = C2_lo;
     705  	  __mul_128x64_to_128 (res, ten2k64[scale], C2);
     706  	}
     707  	// subtract scale from the exponent
     708  	y_exp = y_exp - ((UINT64) scale << 49);
     709  	res.w[1] = res.w[1] | y_sign | y_exp;
     710        }
     711      }
     712      BID_SWAP128 (res);
     713      BID_RETURN (res);
     714    } else if ((C2_hi == 0x0ull) && (C2_lo == 0x0ull)) {
     715      // y is 0 and x is not special, and not zero
     716      // for x + 0 return x, with the preferred exponent
     717      if (x_exp <= y_exp) {
     718        res.w[1] = x.w[1];
     719        res.w[0] = x.w[0];
     720      } else {	// if x_exp > y_exp
     721        // return (C1 * 10^scale) * 10^(x_exp - scale)
     722        // where scale = min (P34-q1, x_exp-y_exp)
     723        // determine q1 = nr. of decimal digits in x
     724        //  determine first the nr. of bits in x
     725        if (C1_hi == 0) {	// x_bits is the nr. of bits in C1_lo
     726  	if (C1_lo >= 0x0020000000000000ull) {	// x >= 2^53
     727  	  // split the 64-bit value in two 32-bit halves to avoid 
     728  	  // rounding errors
     729  	  if (C1_lo >= 0x0000000100000000ull) {	// x >= 2^32
     730  	    tmp1.d = (double) (C1_lo >> 32);	// exact conversion
     731  	    x_nr_bits =
     732  	      32 + ((((unsigned int) (tmp1.ui64 >> 52)) & 0x7ff) -
     733  		    0x3ff);
     734  	  } else {	// x < 2^32
     735  	    tmp1.d = (double) (C1_lo);	// exact conversion
     736  	    x_nr_bits =
     737  	      ((((unsigned int) (tmp1.ui64 >> 52)) & 0x7ff) - 0x3ff);
     738  	  }
     739  	} else {	// if x < 2^53
     740  	  tmp1.d = (double) C1_lo;	// exact conversion
     741  	  x_nr_bits =
     742  	    ((((unsigned int) (tmp1.ui64 >> 52)) & 0x7ff) - 0x3ff);
     743  	}
     744        } else {	// C1_hi != 0 => nr. bits = 64 + nr_bits (C1_hi)
     745  	tmp1.d = (double) C1_hi;	// exact conversion
     746  	x_nr_bits =
     747  	  64 + ((((unsigned int) (tmp1.ui64 >> 52)) & 0x7ff) - 0x3ff);
     748        }
     749        q1 = nr_digits[x_nr_bits].digits;
     750        if (q1 == 0) {
     751  	q1 = nr_digits[x_nr_bits].digits1;
     752  	if (C1_hi > nr_digits[x_nr_bits].threshold_hi ||
     753  	    (C1_hi == nr_digits[x_nr_bits].threshold_hi &&
     754  	     C1_lo >= nr_digits[x_nr_bits].threshold_lo))
     755  	  q1++;
     756        }
     757        // return (C1 * 10^scale) * 10^(x_exp - scale)
     758        // where scale = min (P34-q1, x_exp-y_exp)  
     759        scale = P34 - q1;
     760        ind = (x_exp - y_exp) >> 49;
     761        if (ind < scale)
     762  	scale = ind;
     763        if (scale == 0) {
     764  	res.w[1] = x.w[1];
     765  	res.w[0] = x.w[0];
     766        } else if (q1 <= 19) {	// x fits in 64 bits  
     767  	if (scale <= 19) {	// 10^scale fits in 64 bits
     768  	  // 64 x 64 C1_lo * ten2k64[scale] 
     769  	  __mul_64x64_to_128MACH (res, C1_lo, ten2k64[scale]);
     770  	} else {	// 10^scale fits in 128 bits
     771  	  // 64 x 128 C1_lo * ten2k128[scale - 20]
     772  	  __mul_128x64_to_128 (res, C1_lo, ten2k128[scale - 20]);
     773  	}
     774        } else {	// x fits in 128 bits, but 10^scale must fit in 64 bits
     775  	// 64 x 128 ten2k64[scale] * C1
     776  	C1.w[1] = C1_hi;
     777  	C1.w[0] = C1_lo;
     778  	__mul_128x64_to_128 (res, ten2k64[scale], C1);
     779        }
     780        // subtract scale from the exponent
     781        x_exp = x_exp - ((UINT64) scale << 49);
     782        res.w[1] = res.w[1] | x_sign | x_exp;
     783      }
     784      BID_SWAP128 (res);
     785      BID_RETURN (res);
  } else {	// x and y are canonical, not special, and are not zero
     787      // note that the result may still be zero, and then it has to have the
     788      // preferred exponent
     789      if (x_exp < y_exp) {	// if exp_x < exp_y then swap x and y 
     790        tmp_sign = x_sign;
     791        tmp_exp = x_exp;
     792        tmp_signif_hi = C1_hi;
     793        tmp_signif_lo = C1_lo;
     794        x_sign = y_sign;
     795        x_exp = y_exp;
     796        C1_hi = C2_hi;
     797        C1_lo = C2_lo;
     798        y_sign = tmp_sign;
     799        y_exp = tmp_exp;
     800        C2_hi = tmp_signif_hi;
     801        C2_lo = tmp_signif_lo;
     802      }
     803      // q1 = nr. of decimal digits in x
     804      //  determine first the nr. of bits in x
     805      if (C1_hi == 0) {	// x_bits is the nr. of bits in C1_lo
     806        if (C1_lo >= 0x0020000000000000ull) {	// x >= 2^53
     807  	//split the 64-bit value in two 32-bit halves to avoid rounding errors
     808  	if (C1_lo >= 0x0000000100000000ull) {	// x >= 2^32
     809  	  tmp1.d = (double) (C1_lo >> 32);	// exact conversion
     810  	  x_nr_bits =
     811  	    32 + ((((unsigned int) (tmp1.ui64 >> 52)) & 0x7ff) - 0x3ff);
     812  	} else {	// x < 2^32
     813  	  tmp1.d = (double) (C1_lo);	// exact conversion
     814  	  x_nr_bits =
     815  	    ((((unsigned int) (tmp1.ui64 >> 52)) & 0x7ff) - 0x3ff);
     816  	}
     817        } else {	// if x < 2^53
     818  	tmp1.d = (double) C1_lo;	// exact conversion
     819  	x_nr_bits =
     820  	  ((((unsigned int) (tmp1.ui64 >> 52)) & 0x7ff) - 0x3ff);
     821        }
     822      } else {	// C1_hi != 0 => nr. bits = 64 + nr_bits (C1_hi)
     823        tmp1.d = (double) C1_hi;	// exact conversion
     824        x_nr_bits =
     825  	64 + ((((unsigned int) (tmp1.ui64 >> 52)) & 0x7ff) - 0x3ff);
     826      }
     827  
     828      q1 = nr_digits[x_nr_bits].digits;
     829      if (q1 == 0) {
     830        q1 = nr_digits[x_nr_bits].digits1;
     831        if (C1_hi > nr_digits[x_nr_bits].threshold_hi ||
     832  	  (C1_hi == nr_digits[x_nr_bits].threshold_hi &&
     833  	   C1_lo >= nr_digits[x_nr_bits].threshold_lo))
     834  	q1++;
     835      }
     836      // q2 = nr. of decimal digits in y
     837      //  determine first the nr. of bits in y (y_nr_bits)
     838      if (C2_hi == 0) {	// y_bits is the nr. of bits in C2_lo
     839        if (C2_lo >= 0x0020000000000000ull) {	// y >= 2^53
     840  	//split the 64-bit value in two 32-bit halves to avoid rounding errors
     841  	if (C2_lo >= 0x0000000100000000ull) {	// y >= 2^32
     842  	  tmp2.d = (double) (C2_lo >> 32);	// exact conversion
     843  	  y_nr_bits =
     844  	    32 + ((((unsigned int) (tmp2.ui64 >> 52)) & 0x7ff) - 0x3ff);
     845  	} else {	// y < 2^32
     846  	  tmp2.d = (double) (C2_lo);	// exact conversion
     847  	  y_nr_bits =
     848  	    ((((unsigned int) (tmp2.ui64 >> 52)) & 0x7ff) - 0x3ff);
     849  	}
     850        } else {	// if y < 2^53
     851  	tmp2.d = (double) C2_lo;	// exact conversion
     852  	y_nr_bits =
     853  	  ((((unsigned int) (tmp2.ui64 >> 52)) & 0x7ff) - 0x3ff);
     854        }
     855      } else {	// C2_hi != 0 => nr. bits = 64 + nr_bits (C2_hi)
     856        tmp2.d = (double) C2_hi;	// exact conversion
     857        y_nr_bits =
     858  	64 + ((((unsigned int) (tmp2.ui64 >> 52)) & 0x7ff) - 0x3ff);
     859      }
     860  
     861      q2 = nr_digits[y_nr_bits].digits;
     862      if (q2 == 0) {
     863        q2 = nr_digits[y_nr_bits].digits1;
     864        if (C2_hi > nr_digits[y_nr_bits].threshold_hi ||
     865  	  (C2_hi == nr_digits[y_nr_bits].threshold_hi &&
     866  	   C2_lo >= nr_digits[y_nr_bits].threshold_lo))
     867  	q2++;
     868      }
     869  
     870      delta = q1 + (int) (x_exp >> 49) - q2 - (int) (y_exp >> 49);
     871  
     872      if (delta >= P34) {
     873        // round the result directly because 0 < C2 < ulp (C1 * 10^(x_exp-e2))
     874        // n = C1 * 10^e1 or n = C1 +/- 10^(q1-P34)) * 10^e1
     875        // the result is inexact; the preferred exponent is the least possible
     876  
     877        if (delta >= P34 + 1) {
     878  	// for RN the result is the operand with the larger magnitude,
     879  	// possibly scaled up by 10^(P34-q1)
     880  	// an overflow cannot occur in this case (rounding to nearest)
     881  	if (q1 < P34) {	// scale C1 up by 10^(P34-q1)
     882  	  // Note: because delta >= P34+1 it is certain that 
     883  	  //     x_exp - ((UINT64)scale << 49) will stay above e_min
     884  	  scale = P34 - q1;
     885  	  if (q1 <= 19) {	// C1 fits in 64 bits
     886  	    // 1 <= q1 <= 19 => 15 <= scale <= 33
     887  	    if (scale <= 19) {	// 10^scale fits in 64 bits
     888  	      __mul_64x64_to_128MACH (C1, ten2k64[scale], C1_lo);
     889  	    } else {	// if 20 <= scale <= 33
     890  	      // C1 * 10^scale = (C1 * 10^(scale-19)) * 10^19 where
     891  	      // (C1 * 10^(scale-19)) fits in 64 bits
     892  	      C1_lo = C1_lo * ten2k64[scale - 19];
     893  	      __mul_64x64_to_128MACH (C1, ten2k64[19], C1_lo);
     894  	    }
     895  	  } else {	//if 20 <= q1 <= 33=P34-1 then C1 fits only in 128 bits
     896  	    // => 1 <= P34 - q1 <= 14 so 10^(P34-q1) fits in 64 bits
     897  	    C1.w[1] = C1_hi;
     898  	    C1.w[0] = C1_lo;
     899  	    // C1 = ten2k64[P34 - q1] * C1
     900  	    __mul_128x64_to_128 (C1, ten2k64[P34 - q1], C1);
     901  	  }
     902  	  x_exp = x_exp - ((UINT64) scale << 49);
     903  	  C1_hi = C1.w[1];
     904  	  C1_lo = C1.w[0];
     905  	}
     906  	// some special cases arise: if delta = P34 + 1 and C1 = 10^(P34-1) 
     907  	// (after scaling) and x_sign != y_sign and C2 > 5*10^(q2-1) => 
     908  	// subtract 1 ulp
     909  	// Note: do this only for rounding to nearest; for other rounding 
     910  	// modes the correction will be applied next
     911  	if ((rnd_mode == ROUNDING_TO_NEAREST
     912  	     || rnd_mode == ROUNDING_TIES_AWAY) && delta == (P34 + 1)
     913  	    && C1_hi == 0x0000314dc6448d93ull
     914  	    && C1_lo == 0x38c15b0a00000000ull && x_sign != y_sign
     915  	    && ((q2 <= 19 && C2_lo > midpoint64[q2 - 1]) || (q2 >= 20
     916  							     && (C2_hi >
     917  								 midpoint128
     918  								 [q2 -
     919  								  20].
     920  								 w[1]
     921  								 ||
     922  								 (C2_hi
     923  								  ==
     924  								  midpoint128
     925  								  [q2 -
     926  								   20].
     927  								  w[1]
     928  								  &&
     929  								  C2_lo
     930  								  >
     931  								  midpoint128
     932  								  [q2 -
     933  								   20].
     934  								  w
     935  								  [0])))))
     936  	{
     937  	  // C1 = 10^34 - 1 and decrement x_exp by 1 (no underflow possible)
     938  	  C1_hi = 0x0001ed09bead87c0ull;
     939  	  C1_lo = 0x378d8e63ffffffffull;
     940  	  x_exp = x_exp - EXP_P1;
     941  	}
     942  	if (rnd_mode != ROUNDING_TO_NEAREST) {
     943  	  if ((rnd_mode == ROUNDING_DOWN && x_sign && y_sign) ||
     944  	      (rnd_mode == ROUNDING_UP && !x_sign && !y_sign)) {
     945  	    // add 1 ulp and then check for overflow
     946  	    C1_lo = C1_lo + 1;
     947  	    if (C1_lo == 0) {	// rounding overflow in the low 64 bits
     948  	      C1_hi = C1_hi + 1;
     949  	    }
     950  	    if (C1_hi == 0x0001ed09bead87c0ull
     951  		&& C1_lo == 0x378d8e6400000000ull) {
     952  	      // C1 = 10^34 => rounding overflow
     953  	      C1_hi = 0x0000314dc6448d93ull;
     954  	      C1_lo = 0x38c15b0a00000000ull;	// 10^33
     955  	      x_exp = x_exp + EXP_P1;
     956  	      if (x_exp == EXP_MAX_P1) {	// overflow
     957  		C1_hi = 0x7800000000000000ull;	// +inf
     958  		C1_lo = 0x0ull;
     959  		x_exp = 0;	// x_sign is preserved
     960  		// set overflow flag (the inexact flag was set too)
     961  		*pfpsf |= OVERFLOW_EXCEPTION;
     962  	      }
     963  	    }
     964  	  } else if ((rnd_mode == ROUNDING_DOWN && !x_sign && y_sign) ||
     965  		     (rnd_mode == ROUNDING_UP && x_sign && !y_sign) ||
     966  		     (rnd_mode == ROUNDING_TO_ZERO
     967  		      && x_sign != y_sign)) {
     968  	    // subtract 1 ulp from C1
     969  	    // Note: because delta >= P34 + 1 the result cannot be zero
     970  	    C1_lo = C1_lo - 1;
     971  	    if (C1_lo == 0xffffffffffffffffull)
     972  	      C1_hi = C1_hi - 1;
     973  	    // if the coefficient is 10^33 - 1 then make it 10^34 - 1 and 
     974  	    // decrease the exponent by 1 (because delta >= P34 + 1 the
     975  	    // exponent will not become less than e_min)
     976  	    // 10^33 - 1 = 0x0000314dc6448d9338c15b09ffffffff
     977  	    // 10^34 - 1 = 0x0001ed09bead87c0378d8e63ffffffff
     978  	    if (C1_hi == 0x0000314dc6448d93ull
     979  		&& C1_lo == 0x38c15b09ffffffffull) {
     980  	      // make C1 = 10^34  - 1
     981  	      C1_hi = 0x0001ed09bead87c0ull;
     982  	      C1_lo = 0x378d8e63ffffffffull;
     983  	      x_exp = x_exp - EXP_P1;
     984  	    }
     985  	  } else {
     986  	    ;	// the result is already correct
     987  	  }
     988  	}
     989  	// set the inexact flag
     990  	*pfpsf |= INEXACT_EXCEPTION;
     991  	// assemble the result
     992  	res.w[1] = x_sign | x_exp | C1_hi;
     993  	res.w[0] = C1_lo;
     994        } else {	// delta = P34 
     995  	// in most cases, the smaller operand may be < or = or > 1/2 ulp of the
     996  	// larger operand
     997  	// however, the case C1 = 10^(q1-1) and x_sign != y_sign is special due
     998  	// to accuracy loss after subtraction, and will be treated separately
     999  	if (x_sign == y_sign || (q1 <= 20
    1000  				 && (C1_hi != 0
    1001  				     || C1_lo != ten2k64[q1 - 1]))
    1002  	    || (q1 >= 21 && (C1_hi != ten2k128[q1 - 21].w[1]
    1003  			     || C1_lo != ten2k128[q1 - 21].w[0]))) {
    1004  	  // if x_sign == y_sign or C1 != 10^(q1-1)
    1005  	  // compare C2 with 1/2 ulp = 5 * 10^(q2-1), the latter read from table
    1006  	  // Note: cases q1<=19 and q1>=20 can be coalesced at some latency cost
    1007  	  if (q2 <= 19) {	// C2 and 5*10^(q2-1) both fit in 64 bits
    1008  	    halfulp64 = midpoint64[q2 - 1];	// 5 * 10^(q2-1)
    1009  	    if (C2_lo < halfulp64) {	// n2 < 1/2 ulp (n1)
    1010  	      // for RN the result is the operand with the larger magnitude, 
    1011  	      // possibly scaled up by 10^(P34-q1)
    1012  	      // an overflow cannot occur in this case (rounding to nearest)
    1013  	      if (q1 < P34) {	// scale C1 up by 10^(P34-q1)
    1014  		// Note: because delta = P34 it is certain that
    1015  		//     x_exp - ((UINT64)scale << 49) will stay above e_min
    1016  		scale = P34 - q1;
    1017  		if (q1 <= 19) {	// C1 fits in 64 bits
    1018  		  // 1 <= q1 <= 19 => 15 <= scale <= 33
    1019  		  if (scale <= 19) {	// 10^scale fits in 64 bits
    1020  		    __mul_64x64_to_128MACH (C1, ten2k64[scale], C1_lo);
    1021  		  } else {	// if 20 <= scale <= 33
    1022  		    // C1 * 10^scale = (C1 * 10^(scale-19)) * 10^19 where
    1023  		    // (C1 * 10^(scale-19)) fits in 64 bits
    1024  		    C1_lo = C1_lo * ten2k64[scale - 19];
    1025  		    __mul_64x64_to_128MACH (C1, ten2k64[19], C1_lo);
    1026  		  }
    1027  		} else {	//if 20 <= q1 <= 33=P34-1 then C1 fits only in 128 bits
    1028  		  // => 1 <= P34 - q1 <= 14 so 10^(P34-q1) fits in 64 bits
    1029  		  C1.w[1] = C1_hi;
    1030  		  C1.w[0] = C1_lo;
    1031  		  // C1 = ten2k64[P34 - q1] * C1
    1032  		  __mul_128x64_to_128 (C1, ten2k64[P34 - q1], C1);
    1033  		}
    1034  		x_exp = x_exp - ((UINT64) scale << 49);
    1035  		C1_hi = C1.w[1];
    1036  		C1_lo = C1.w[0];
    1037  	      }
    1038  	      if (rnd_mode != ROUNDING_TO_NEAREST) {
    1039  		if ((rnd_mode == ROUNDING_DOWN && x_sign && y_sign) ||
    1040  		    (rnd_mode == ROUNDING_UP && !x_sign && !y_sign)) {
    1041  		  // add 1 ulp and then check for overflow
    1042  		  C1_lo = C1_lo + 1;
    1043  		  if (C1_lo == 0) {	// rounding overflow in the low 64 bits
    1044  		    C1_hi = C1_hi + 1;
    1045  		  }
    1046  		  if (C1_hi == 0x0001ed09bead87c0ull
    1047  		      && C1_lo == 0x378d8e6400000000ull) {
    1048  		    // C1 = 10^34 => rounding overflow
    1049  		    C1_hi = 0x0000314dc6448d93ull;
    1050  		    C1_lo = 0x38c15b0a00000000ull;	// 10^33
    1051  		    x_exp = x_exp + EXP_P1;
    1052  		    if (x_exp == EXP_MAX_P1) {	// overflow
    1053  		      C1_hi = 0x7800000000000000ull;	// +inf
    1054  		      C1_lo = 0x0ull;
    1055  		      x_exp = 0;	// x_sign is preserved
    1056  		      // set overflow flag (the inexact flag was set too)
    1057  		      *pfpsf |= OVERFLOW_EXCEPTION;
    1058  		    }
    1059  		  }
    1060  		} else
    1061  		  if ((rnd_mode == ROUNDING_DOWN && !x_sign && y_sign)
    1062  		      || (rnd_mode == ROUNDING_UP && x_sign && !y_sign)
    1063  		      || (rnd_mode == ROUNDING_TO_ZERO
    1064  			  && x_sign != y_sign)) {
    1065  		  // subtract 1 ulp from C1
    1066  		  // Note: because delta >= P34 + 1 the result cannot be zero
    1067  		  C1_lo = C1_lo - 1;
    1068  		  if (C1_lo == 0xffffffffffffffffull)
    1069  		    C1_hi = C1_hi - 1;
    1070  		  // if the coefficient is 10^33-1 then make it 10^34-1 and 
    1071  		  // decrease the exponent by 1 (because delta >= P34 + 1 the
    1072  		  // exponent will not become less than e_min)
    1073  		  // 10^33 - 1 = 0x0000314dc6448d9338c15b09ffffffff
    1074  		  // 10^34 - 1 = 0x0001ed09bead87c0378d8e63ffffffff
    1075  		  if (C1_hi == 0x0000314dc6448d93ull
    1076  		      && C1_lo == 0x38c15b09ffffffffull) {
    1077  		    // make C1 = 10^34  - 1
    1078  		    C1_hi = 0x0001ed09bead87c0ull;
    1079  		    C1_lo = 0x378d8e63ffffffffull;
    1080  		    x_exp = x_exp - EXP_P1;
    1081  		  }
    1082  		} else {
    1083  		  ;	// the result is already correct
    1084  		}
    1085  	      }
    1086  	      // set the inexact flag
    1087  	      *pfpsf |= INEXACT_EXCEPTION;
    1088  	      // assemble the result
    1089  	      res.w[1] = x_sign | x_exp | C1_hi;
    1090  	      res.w[0] = C1_lo;
    1091  	    } else if ((C2_lo == halfulp64)
    1092  		       && (q1 < P34 || ((C1_lo & 0x1) == 0))) {
    1093  	      // n2 = 1/2 ulp (n1) and C1 is even
    1094  	      // the result is the operand with the larger magnitude,
    1095  	      // possibly scaled up by 10^(P34-q1)
    1096  	      // an overflow cannot occur in this case (rounding to nearest)
    1097  	      if (q1 < P34) {	// scale C1 up by 10^(P34-q1)
    1098  		// Note: because delta = P34 it is certain that
    1099  		//     x_exp - ((UINT64)scale << 49) will stay above e_min
    1100  		scale = P34 - q1;
    1101  		if (q1 <= 19) {	// C1 fits in 64 bits
    1102  		  // 1 <= q1 <= 19 => 15 <= scale <= 33
    1103  		  if (scale <= 19) {	// 10^scale fits in 64 bits
    1104  		    __mul_64x64_to_128MACH (C1, ten2k64[scale], C1_lo);
    1105  		  } else {	// if 20 <= scale <= 33 
    1106  		    // C1 * 10^scale = (C1 * 10^(scale-19)) * 10^19 where
    1107  		    // (C1 * 10^(scale-19)) fits in 64 bits  
    1108  		    C1_lo = C1_lo * ten2k64[scale - 19];
    1109  		    __mul_64x64_to_128MACH (C1, ten2k64[19], C1_lo);
    1110  		  }
    1111  		} else {	//if 20 <= q1 <= 33=P34-1 then C1 fits only in 128 bits
    1112  		  // => 1 <= P34 - q1 <= 14 so 10^(P34-q1) fits in 64 bits 
    1113  		  C1.w[1] = C1_hi;
    1114  		  C1.w[0] = C1_lo;
    1115  		  // C1 = ten2k64[P34 - q1] * C1 
    1116  		  __mul_128x64_to_128 (C1, ten2k64[P34 - q1], C1);
    1117  		}
    1118  		x_exp = x_exp - ((UINT64) scale << 49);
    1119  		C1_hi = C1.w[1];
    1120  		C1_lo = C1.w[0];
    1121  	      }
    1122  	      if ((rnd_mode == ROUNDING_TO_NEAREST && x_sign == y_sign
    1123  		   && (C1_lo & 0x01)) || (rnd_mode == ROUNDING_TIES_AWAY
    1124  					  && x_sign == y_sign)
    1125  		  || (rnd_mode == ROUNDING_UP && !x_sign && !y_sign)
    1126  		  || (rnd_mode == ROUNDING_DOWN && x_sign && y_sign)) {
    1127  		// add 1 ulp and then check for overflow
    1128  		C1_lo = C1_lo + 1;
    1129  		if (C1_lo == 0) {	// rounding overflow in the low 64 bits
    1130  		  C1_hi = C1_hi + 1;
    1131  		}
    1132  		if (C1_hi == 0x0001ed09bead87c0ull
    1133  		    && C1_lo == 0x378d8e6400000000ull) {
    1134  		  // C1 = 10^34 => rounding overflow
    1135  		  C1_hi = 0x0000314dc6448d93ull;
    1136  		  C1_lo = 0x38c15b0a00000000ull;	// 10^33
    1137  		  x_exp = x_exp + EXP_P1;
    1138  		  if (x_exp == EXP_MAX_P1) {	// overflow
    1139  		    C1_hi = 0x7800000000000000ull;	// +inf
    1140  		    C1_lo = 0x0ull;
    1141  		    x_exp = 0;	// x_sign is preserved
    1142  		    // set overflow flag (the inexact flag was set too)
    1143  		    *pfpsf |= OVERFLOW_EXCEPTION;
    1144  		  }
    1145  		}
    1146  	      } else
    1147  		if ((rnd_mode == ROUNDING_TO_NEAREST && x_sign != y_sign
    1148  		     && (C1_lo & 0x01)) || (rnd_mode == ROUNDING_DOWN
    1149  					    && !x_sign && y_sign)
    1150  		    || (rnd_mode == ROUNDING_UP && x_sign && !y_sign)
    1151  		    || (rnd_mode == ROUNDING_TO_ZERO
    1152  			&& x_sign != y_sign)) {
    1153  		// subtract 1 ulp from C1
    1154  		// Note: because delta >= P34 + 1 the result cannot be zero
    1155  		C1_lo = C1_lo - 1;
    1156  		if (C1_lo == 0xffffffffffffffffull)
    1157  		  C1_hi = C1_hi - 1;
    1158  		// if the coefficient is 10^33 - 1 then make it 10^34 - 1
    1159  		// and decrease the exponent by 1 (because delta >= P34 + 1
    1160  		// the exponent will not become less than e_min)
    1161  		// 10^33 - 1 = 0x0000314dc6448d9338c15b09ffffffff
    1162  		// 10^34 - 1 = 0x0001ed09bead87c0378d8e63ffffffff
    1163  		if (C1_hi == 0x0000314dc6448d93ull
    1164  		    && C1_lo == 0x38c15b09ffffffffull) {
    1165  		  // make C1 = 10^34  - 1
    1166  		  C1_hi = 0x0001ed09bead87c0ull;
    1167  		  C1_lo = 0x378d8e63ffffffffull;
    1168  		  x_exp = x_exp - EXP_P1;
    1169  		}
    1170  	      } else {
    1171  		;	// the result is already correct
    1172  	      }
    1173  	      // set the inexact flag
    1174  	      *pfpsf |= INEXACT_EXCEPTION;
    1175  	      // assemble the result 
    1176  	      res.w[1] = x_sign | x_exp | C1_hi;
    1177  	      res.w[0] = C1_lo;
    1178  	    } else {	// if C2_lo > halfulp64 || 
    1179  	      // (C2_lo == halfulp64 && q1 == P34 && ((C1_lo & 0x1) == 1)), i.e.
    1180  	      // 1/2 ulp(n1) < n2 < 1 ulp(n1) or n2 = 1/2 ulp(n1) and C1 odd
    1181  	      // res = x+1 ulp if n1*n2 > 0 and res = x-1 ulp if n1*n2 < 0
    1182  	      if (q1 < P34) {	// then 1 ulp = 10^(e1+q1-P34) < 10^e1
    1183  		// Note: if (q1 == P34) then 1 ulp = 10^(e1+q1-P34) = 10^e1
    1184  		// because q1 < P34 we must first replace C1 by 
    1185  		// C1 * 10^(P34-q1), and must decrease the exponent by 
    1186  		// (P34-q1) (it will still be at least e_min)
    1187  		scale = P34 - q1;
    1188  		if (q1 <= 19) {	// C1 fits in 64 bits
    1189  		  // 1 <= q1 <= 19 => 15 <= scale <= 33
    1190  		  if (scale <= 19) {	// 10^scale fits in 64 bits
    1191  		    __mul_64x64_to_128MACH (C1, ten2k64[scale], C1_lo);
    1192  		  } else {	// if 20 <= scale <= 33
    1193  		    // C1 * 10^scale = (C1 * 10^(scale-19)) * 10^19 where
    1194  		    // (C1 * 10^(scale-19)) fits in 64 bits
    1195  		    C1_lo = C1_lo * ten2k64[scale - 19];
    1196  		    __mul_64x64_to_128MACH (C1, ten2k64[19], C1_lo);
    1197  		  }
    1198  		} else {	//if 20 <= q1 <= 33=P34-1 then C1 fits only in 128 bits
    1199  		  // => 1 <= P34 - q1 <= 14 so 10^(P34-q1) fits in 64 bits
    1200  		  C1.w[1] = C1_hi;
    1201  		  C1.w[0] = C1_lo;
    1202  		  // C1 = ten2k64[P34 - q1] * C1
    1203  		  __mul_128x64_to_128 (C1, ten2k64[P34 - q1], C1);
    1204  		}
    1205  		x_exp = x_exp - ((UINT64) scale << 49);
    1206  		C1_hi = C1.w[1];
    1207  		C1_lo = C1.w[0];
    1208  		// check for rounding overflow
    1209  		if (C1_hi == 0x0001ed09bead87c0ull
    1210  		    && C1_lo == 0x378d8e6400000000ull) {
    1211  		  // C1 = 10^34 => rounding overflow 
    1212  		  C1_hi = 0x0000314dc6448d93ull;
    1213  		  C1_lo = 0x38c15b0a00000000ull;	// 10^33
    1214  		  x_exp = x_exp + EXP_P1;
    1215  		}
    1216  	      }
    1217  	      if ((rnd_mode == ROUNDING_TO_NEAREST && x_sign != y_sign)
    1218  		  || (rnd_mode == ROUNDING_TIES_AWAY && x_sign != y_sign
    1219  		      && C2_lo != halfulp64)
    1220  		  || (rnd_mode == ROUNDING_DOWN && !x_sign && y_sign)
    1221  		  || (rnd_mode == ROUNDING_UP && x_sign && !y_sign)
    1222  		  || (rnd_mode == ROUNDING_TO_ZERO
    1223  		      && x_sign != y_sign)) {
    1224  		// the result is x - 1
    1225  		// for RN n1 * n2 < 0; underflow not possible
    1226  		C1_lo = C1_lo - 1;
    1227  		if (C1_lo == 0xffffffffffffffffull)
    1228  		  C1_hi--;
    1229  		// check if we crossed into the lower decade
    1230  		if (C1_hi == 0x0000314dc6448d93ull && C1_lo == 0x38c15b09ffffffffull) {	// 10^33 - 1
    1231  		  C1_hi = 0x0001ed09bead87c0ull;	// 10^34 - 1
    1232  		  C1_lo = 0x378d8e63ffffffffull;
    1233  		  x_exp = x_exp - EXP_P1;	// no underflow, because n1 >> n2
    1234  		}
    1235  	      } else
    1236  		if ((rnd_mode == ROUNDING_TO_NEAREST
    1237  		     && x_sign == y_sign)
    1238  		    || (rnd_mode == ROUNDING_TIES_AWAY
    1239  			&& x_sign == y_sign)
    1240  		    || (rnd_mode == ROUNDING_DOWN && x_sign && y_sign)
    1241  		    || (rnd_mode == ROUNDING_UP && !x_sign
    1242  			&& !y_sign)) {
    1243  		// the result is x + 1
    1244  		// for RN x_sign = y_sign, i.e. n1*n2 > 0
    1245  		C1_lo = C1_lo + 1;
    1246  		if (C1_lo == 0) {	// rounding overflow in the low 64 bits
    1247  		  C1_hi = C1_hi + 1;
    1248  		}
    1249  		if (C1_hi == 0x0001ed09bead87c0ull
    1250  		    && C1_lo == 0x378d8e6400000000ull) {
    1251  		  // C1 = 10^34 => rounding overflow
    1252  		  C1_hi = 0x0000314dc6448d93ull;
    1253  		  C1_lo = 0x38c15b0a00000000ull;	// 10^33
    1254  		  x_exp = x_exp + EXP_P1;
    1255  		  if (x_exp == EXP_MAX_P1) {	// overflow
    1256  		    C1_hi = 0x7800000000000000ull;	// +inf
    1257  		    C1_lo = 0x0ull;
    1258  		    x_exp = 0;	// x_sign is preserved
    1259  		    // set the overflow flag
    1260  		    *pfpsf |= OVERFLOW_EXCEPTION;
    1261  		  }
    1262  		}
    1263  	      } else {
    1264  		;	// the result is x
    1265  	      }
    1266  	      // set the inexact flag
    1267  	      *pfpsf |= INEXACT_EXCEPTION;
    1268  	      // assemble the result
    1269  	      res.w[1] = x_sign | x_exp | C1_hi;
    1270  	      res.w[0] = C1_lo;
    1271  	    }
    1272  	  } else {	// if q2 >= 20 then 5*10^(q2-1) and C2 (the latter in 
    1273  	    // most cases) fit only in more than 64 bits
    1274  	    halfulp128 = midpoint128[q2 - 20];	// 5 * 10^(q2-1)
    1275  	    if ((C2_hi < halfulp128.w[1])
    1276  		|| (C2_hi == halfulp128.w[1]
    1277  		    && C2_lo < halfulp128.w[0])) {
    1278  	      // n2 < 1/2 ulp (n1)
    1279  	      // the result is the operand with the larger magnitude,
    1280  	      // possibly scaled up by 10^(P34-q1)
    1281  	      // an overflow cannot occur in this case (rounding to nearest)
    1282  	      if (q1 < P34) {	// scale C1 up by 10^(P34-q1)
    1283  		// Note: because delta = P34 it is certain that
    1284  		//     x_exp - ((UINT64)scale << 49) will stay above e_min
    1285  		scale = P34 - q1;
    1286  		if (q1 <= 19) {	// C1 fits in 64 bits
    1287  		  // 1 <= q1 <= 19 => 15 <= scale <= 33
    1288  		  if (scale <= 19) {	// 10^scale fits in 64 bits
    1289  		    __mul_64x64_to_128MACH (C1, ten2k64[scale], C1_lo);
    1290  		  } else {	// if 20 <= scale <= 33 
    1291  		    // C1 * 10^scale = (C1 * 10^(scale-19)) * 10^19 where
    1292  		    // (C1 * 10^(scale-19)) fits in 64 bits  
    1293  		    C1_lo = C1_lo * ten2k64[scale - 19];
    1294  		    __mul_64x64_to_128MACH (C1, ten2k64[19], C1_lo);
    1295  		  }
    1296  		} else {	//if 20 <= q1 <= 33=P34-1 then C1 fits only in 128 bits
    1297  		  // => 1 <= P34 - q1 <= 14 so 10^(P34-q1) fits in 64 bits 
    1298  		  C1.w[1] = C1_hi;
    1299  		  C1.w[0] = C1_lo;
    1300  		  // C1 = ten2k64[P34 - q1] * C1 
    1301  		  __mul_128x64_to_128 (C1, ten2k64[P34 - q1], C1);
    1302  		}
    1303  		C1_hi = C1.w[1];
    1304  		C1_lo = C1.w[0];
    1305  		x_exp = x_exp - ((UINT64) scale << 49);
    1306  	      }
    1307  	      if (rnd_mode != ROUNDING_TO_NEAREST) {
    1308  		if ((rnd_mode == ROUNDING_DOWN && x_sign && y_sign) ||
    1309  		    (rnd_mode == ROUNDING_UP && !x_sign && !y_sign)) {
    1310  		  // add 1 ulp and then check for overflow
    1311  		  C1_lo = C1_lo + 1;
    1312  		  if (C1_lo == 0) {	// rounding overflow in the low 64 bits
    1313  		    C1_hi = C1_hi + 1;
    1314  		  }
    1315  		  if (C1_hi == 0x0001ed09bead87c0ull
    1316  		      && C1_lo == 0x378d8e6400000000ull) {
    1317  		    // C1 = 10^34 => rounding overflow
    1318  		    C1_hi = 0x0000314dc6448d93ull;
    1319  		    C1_lo = 0x38c15b0a00000000ull;	// 10^33
    1320  		    x_exp = x_exp + EXP_P1;
    1321  		    if (x_exp == EXP_MAX_P1) {	// overflow
    1322  		      C1_hi = 0x7800000000000000ull;	// +inf
    1323  		      C1_lo = 0x0ull;
    1324  		      x_exp = 0;	// x_sign is preserved
    1325  		      // set overflow flag (the inexact flag was set too)
    1326  		      *pfpsf |= OVERFLOW_EXCEPTION;
    1327  		    }
    1328  		  }
    1329  		} else
    1330  		  if ((rnd_mode == ROUNDING_DOWN && !x_sign && y_sign)
    1331  		      || (rnd_mode == ROUNDING_UP && x_sign && !y_sign)
    1332  		      || (rnd_mode == ROUNDING_TO_ZERO
    1333  			  && x_sign != y_sign)) {
    1334  		  // subtract 1 ulp from C1
    1335  		  // Note: because delta >= P34 + 1 the result cannot be zero
    1336  		  C1_lo = C1_lo - 1;
    1337  		  if (C1_lo == 0xffffffffffffffffull)
    1338  		    C1_hi = C1_hi - 1;
    1339  		  // if the coefficient is 10^33-1 then make it 10^34-1 and
    1340  		  // decrease the exponent by 1 (because delta >= P34 + 1 the
    1341  		  // exponent will not become less than e_min)
    1342  		  // 10^33 - 1 = 0x0000314dc6448d9338c15b09ffffffff
    1343  		  // 10^34 - 1 = 0x0001ed09bead87c0378d8e63ffffffff
    1344  		  if (C1_hi == 0x0000314dc6448d93ull
    1345  		      && C1_lo == 0x38c15b09ffffffffull) {
    1346  		    // make C1 = 10^34  - 1
    1347  		    C1_hi = 0x0001ed09bead87c0ull;
    1348  		    C1_lo = 0x378d8e63ffffffffull;
    1349  		    x_exp = x_exp - EXP_P1;
    1350  		  }
    1351  		} else {
    1352  		  ;	// the result is already correct
    1353  		}
    1354  	      }
    1355  	      // set the inexact flag 
    1356  	      *pfpsf |= INEXACT_EXCEPTION;
    1357  	      // assemble the result 
    1358  	      res.w[1] = x_sign | x_exp | C1_hi;
    1359  	      res.w[0] = C1_lo;
    1360  	    } else if ((C2_hi == halfulp128.w[1]
    1361  			&& C2_lo == halfulp128.w[0])
    1362  		       && (q1 < P34 || ((C1_lo & 0x1) == 0))) {
    1363  	      // midpoint & lsb in C1 is 0
    1364  	      // n2 = 1/2 ulp (n1) and C1 is even
    1365  	      // the result is the operand with the larger magnitude,
    1366  	      // possibly scaled up by 10^(P34-q1)
    1367  	      // an overflow cannot occur in this case (rounding to nearest)
    1368  	      if (q1 < P34) {	// scale C1 up by 10^(P34-q1)
    1369  		// Note: because delta = P34 it is certain that
    1370  		//     x_exp - ((UINT64)scale << 49) will stay above e_min
    1371  		scale = P34 - q1;
    1372  		if (q1 <= 19) {	// C1 fits in 64 bits
    1373  		  // 1 <= q1 <= 19 => 15 <= scale <= 33
    1374  		  if (scale <= 19) {	// 10^scale fits in 64 bits
    1375  		    __mul_64x64_to_128MACH (C1, ten2k64[scale], C1_lo);
    1376  		  } else {	// if 20 <= scale <= 33
    1377  		    // C1 * 10^scale = (C1 * 10^(scale-19)) * 10^19 where
    1378  		    // (C1 * 10^(scale-19)) fits in 64 bits
    1379  		    C1_lo = C1_lo * ten2k64[scale - 19];
    1380  		    __mul_64x64_to_128MACH (C1, ten2k64[19], C1_lo);
    1381  		  }
    1382  		} else {	//if 20 <= q1 <= 33=P34-1 then C1 fits only in 128 bits
    1383  		  // => 1 <= P34 - q1 <= 14 so 10^(P34-q1) fits in 64 bits
    1384  		  C1.w[1] = C1_hi;
    1385  		  C1.w[0] = C1_lo;
    1386  		  // C1 = ten2k64[P34 - q1] * C1
    1387  		  __mul_128x64_to_128 (C1, ten2k64[P34 - q1], C1);
    1388  		}
    1389  		x_exp = x_exp - ((UINT64) scale << 49);
    1390  		C1_hi = C1.w[1];
    1391  		C1_lo = C1.w[0];
    1392  	      }
    1393  	      if (rnd_mode != ROUNDING_TO_NEAREST) {
    1394  		if ((rnd_mode == ROUNDING_TIES_AWAY && x_sign == y_sign)
    1395  		    || (rnd_mode == ROUNDING_UP && !y_sign)) {
    1396  		  // add 1 ulp and then check for overflow
    1397  		  C1_lo = C1_lo + 1;
    1398  		  if (C1_lo == 0) {	// rounding overflow in the low 64 bits
    1399  		    C1_hi = C1_hi + 1;
    1400  		  }
    1401  		  if (C1_hi == 0x0001ed09bead87c0ull
    1402  		      && C1_lo == 0x378d8e6400000000ull) {
    1403  		    // C1 = 10^34 => rounding overflow
    1404  		    C1_hi = 0x0000314dc6448d93ull;
    1405  		    C1_lo = 0x38c15b0a00000000ull;	// 10^33
    1406  		    x_exp = x_exp + EXP_P1;
    1407  		    if (x_exp == EXP_MAX_P1) {	// overflow
    1408  		      C1_hi = 0x7800000000000000ull;	// +inf
    1409  		      C1_lo = 0x0ull;
    1410  		      x_exp = 0;	// x_sign is preserved
    1411  		      // set overflow flag (the inexact flag was set too)
    1412  		      *pfpsf |= OVERFLOW_EXCEPTION;
    1413  		    }
    1414  		  }
    1415  		} else if ((rnd_mode == ROUNDING_DOWN && y_sign)
    1416  			   || (rnd_mode == ROUNDING_TO_ZERO
    1417  			       && x_sign != y_sign)) {
    1418  		  // subtract 1 ulp from C1
    1419  		  // Note: because delta >= P34 + 1 the result cannot be zero
    1420  		  C1_lo = C1_lo - 1;
    1421  		  if (C1_lo == 0xffffffffffffffffull)
    1422  		    C1_hi = C1_hi - 1;
    1423  		  // if the coefficient is 10^33 - 1 then make it 10^34 - 1
    1424  		  // and decrease the exponent by 1 (because delta >= P34 + 1
    1425  		  // the exponent will not become less than e_min)
    1426  		  // 10^33 - 1 = 0x0000314dc6448d9338c15b09ffffffff
    1427  		  // 10^34 - 1 = 0x0001ed09bead87c0378d8e63ffffffff
    1428  		  if (C1_hi == 0x0000314dc6448d93ull
    1429  		      && C1_lo == 0x38c15b09ffffffffull) {
    1430  		    // make C1 = 10^34  - 1
    1431  		    C1_hi = 0x0001ed09bead87c0ull;
    1432  		    C1_lo = 0x378d8e63ffffffffull;
    1433  		    x_exp = x_exp - EXP_P1;
    1434  		  }
    1435  		} else {
    1436  		  ;	// the result is already correct
    1437  		}
    1438  	      }
    1439  	      // set the inexact flag
    1440  	      *pfpsf |= INEXACT_EXCEPTION;
    1441  	      // assemble the result
    1442  	      res.w[1] = x_sign | x_exp | C1_hi;
    1443  	      res.w[0] = C1_lo;
    1444  	    } else {	// if C2 > halfulp128 ||
    1445  	      // (C2 == halfulp128 && q1 == P34 && ((C1 & 0x1) == 1)), i.e.
    1446  	      // 1/2 ulp(n1) < n2 < 1 ulp(n1) or n2 = 1/2 ulp(n1) and C1 odd
    1447  	      // res = x+1 ulp if n1*n2 > 0 and res = x-1 ulp if n1*n2 < 0
    1448  	      if (q1 < P34) {	// then 1 ulp = 10^(e1+q1-P34) < 10^e1
    1449  		// Note: if (q1 == P34) then 1 ulp = 10^(e1+q1-P34) = 10^e1
    1450  		// because q1 < P34 we must first replace C1 by C1*10^(P34-q1),
    1451  		// and must decrease the exponent by (P34-q1) (it will still be
    1452  		// at least e_min)
    1453  		scale = P34 - q1;
    1454  		if (q1 <= 19) {	// C1 fits in 64 bits
    1455  		  // 1 <= q1 <= 19 => 15 <= scale <= 33
    1456  		  if (scale <= 19) {	// 10^scale fits in 64 bits
    1457  		    __mul_64x64_to_128MACH (C1, ten2k64[scale], C1_lo);
    1458  		  } else {	// if 20 <= scale <= 33
    1459  		    // C1 * 10^scale = (C1 * 10^(scale-19)) * 10^19 where
    1460  		    // (C1 * 10^(scale-19)) fits in 64 bits
    1461  		    C1_lo = C1_lo * ten2k64[scale - 19];
    1462  		    __mul_64x64_to_128MACH (C1, ten2k64[19], C1_lo);
    1463  		  }
    1464  		} else {	//if 20 <= q1 <= 33=P34-1 then C1 fits only in 128 bits
    1465  		  // => 1 <= P34 - q1 <= 14 so 10^(P34-q1) fits in 64 bits
    1466  		  C1.w[1] = C1_hi;
    1467  		  C1.w[0] = C1_lo;
    1468  		  // C1 = ten2k64[P34 - q1] * C1
    1469  		  __mul_128x64_to_128 (C1, ten2k64[P34 - q1], C1);
    1470  		}
    1471  		C1_hi = C1.w[1];
    1472  		C1_lo = C1.w[0];
    1473  		x_exp = x_exp - ((UINT64) scale << 49);
    1474  	      }
    1475  	      if ((rnd_mode == ROUNDING_TO_NEAREST && x_sign != y_sign)
    1476  		  || (rnd_mode == ROUNDING_TIES_AWAY && x_sign != y_sign
    1477  		      && (C2_hi != halfulp128.w[1]
    1478  			  || C2_lo != halfulp128.w[0]))
    1479  		  || (rnd_mode == ROUNDING_DOWN && !x_sign && y_sign)
    1480  		  || (rnd_mode == ROUNDING_UP && x_sign && !y_sign)
    1481  		  || (rnd_mode == ROUNDING_TO_ZERO
    1482  		      && x_sign != y_sign)) {
    1483  		// the result is x - 1
    1484  		// for RN n1 * n2 < 0; underflow not possible
    1485  		C1_lo = C1_lo - 1;
    1486  		if (C1_lo == 0xffffffffffffffffull)
    1487  		  C1_hi--;
    1488  		// check if we crossed into the lower decade
    1489  		if (C1_hi == 0x0000314dc6448d93ull && C1_lo == 0x38c15b09ffffffffull) {	// 10^33 - 1
    1490  		  C1_hi = 0x0001ed09bead87c0ull;	// 10^34 - 1
    1491  		  C1_lo = 0x378d8e63ffffffffull;
    1492  		  x_exp = x_exp - EXP_P1;	// no underflow, because n1 >> n2
    1493  		}
    1494  	      } else
    1495  		if ((rnd_mode == ROUNDING_TO_NEAREST
    1496  		     && x_sign == y_sign)
    1497  		    || (rnd_mode == ROUNDING_TIES_AWAY
    1498  			&& x_sign == y_sign)
    1499  		    || (rnd_mode == ROUNDING_DOWN && x_sign && y_sign)
    1500  		    || (rnd_mode == ROUNDING_UP && !x_sign
    1501  			&& !y_sign)) {
    1502  		// the result is x + 1
    1503  		// for RN x_sign = y_sign, i.e. n1*n2 > 0
    1504  		C1_lo = C1_lo + 1;
    1505  		if (C1_lo == 0) {	// rounding overflow in the low 64 bits
    1506  		  C1_hi = C1_hi + 1;
    1507  		}
    1508  		if (C1_hi == 0x0001ed09bead87c0ull
    1509  		    && C1_lo == 0x378d8e6400000000ull) {
    1510  		  // C1 = 10^34 => rounding overflow
    1511  		  C1_hi = 0x0000314dc6448d93ull;
    1512  		  C1_lo = 0x38c15b0a00000000ull;	// 10^33
    1513  		  x_exp = x_exp + EXP_P1;
    1514  		  if (x_exp == EXP_MAX_P1) {	// overflow
    1515  		    C1_hi = 0x7800000000000000ull;	// +inf
    1516  		    C1_lo = 0x0ull;
    1517  		    x_exp = 0;	// x_sign is preserved
    1518  		    // set the overflow flag
    1519  		    *pfpsf |= OVERFLOW_EXCEPTION;
    1520  		  }
    1521  		}
    1522  	      } else {
    1523  		;	// the result is x
    1524  	      }
    1525  	      // set the inexact flag
    1526  	      *pfpsf |= INEXACT_EXCEPTION;
    1527  	      // assemble the result
    1528  	      res.w[1] = x_sign | x_exp | C1_hi;
    1529  	      res.w[0] = C1_lo;
    1530  	    }
    1531  	  }	// end q1 >= 20
    1532  	  // end case where C1 != 10^(q1-1)
    1533  	} else {	// C1 = 10^(q1-1) and x_sign != y_sign
    1534  	  // instead of C' = (C1 * 10^(e1-e2) + C2)rnd,P34
    1535  	  // calculate C' = C1 * 10^(e1-e2-x1) + (C2 * 10^(-x1))rnd,P34 
    1536  	  // where x1 = q2 - 1, 0 <= x1 <= P34 - 1
    1537  	  // Because C1 = 10^(q1-1) and x_sign != y_sign, C' will have P34 
    1538  	  // digits and n = C' * 10^(e2+x1)
    1539  	  // If the result has P34+1 digits, redo the steps above with x1+1
    1540  	  // If the result has P34-1 digits or less, redo the steps above with 
    1541  	  // x1-1 but only if initially x1 >= 1
    1542  	  // NOTE: these two steps can be improved, e.g we could guess if
    1543  	  // P34+1 or P34-1 digits will be obtained by adding/subtracting 
    1544  	  // just the top 64 bits of the two operands
    1545  	  // The result cannot be zero, and it cannot overflow
    1546  	  x1 = q2 - 1;	// 0 <= x1 <= P34-1
    1547  	  // Calculate C1 * 10^(e1-e2-x1) where 1 <= e1-e2-x1 <= P34
    1548  	  // scale = (int)(e1 >> 49) - (int)(e2 >> 49) - x1; 0 <= scale <= P34-1
    1549  	  scale = P34 - q1 + 1;	// scale=e1-e2-x1 = P34+1-q1; 1<=scale<=P34
	  // either C1 or 10^(e1-e2-x1) may not fit in 64 bits,
    1551  	  // but their product fits with certainty in 128 bits
    1552  	  if (scale >= 20) {	//10^(e1-e2-x1) doesn't fit in 64 bits, but C1 does
    1553  	    __mul_128x64_to_128 (C1, C1_lo, ten2k128[scale - 20]);
    1554  	  } else {	// if (scale >= 1
    1555  	    // if 1 <= scale <= 19 then 10^(e1-e2-x1) fits in 64 bits
    1556  	    if (q1 <= 19) {	// C1 fits in 64 bits
    1557  	      __mul_64x64_to_128MACH (C1, C1_lo, ten2k64[scale]);
    1558  	    } else {	// q1 >= 20
    1559  	      C1.w[1] = C1_hi;
    1560  	      C1.w[0] = C1_lo;
    1561  	      __mul_128x64_to_128 (C1, ten2k64[scale], C1);
    1562  	    }
    1563  	  }
    1564  	  tmp64 = C1.w[0];	// C1.w[1], C1.w[0] contains C1 * 10^(e1-e2-x1)
    1565  
    1566  	  // now round C2 to q2-x1 = 1 decimal digit
    1567  	  // C2' = C2 + 1/2 * 10^x1 = C2 + 5 * 10^(x1-1)
    1568  	  ind = x1 - 1;	// -1 <= ind <= P34 - 2
    1569  	  if (ind >= 0) {	// if (x1 >= 1)
    1570  	    C2.w[0] = C2_lo;
    1571  	    C2.w[1] = C2_hi;
    1572  	    if (ind <= 18) {
    1573  	      C2.w[0] = C2.w[0] + midpoint64[ind];
    1574  	      if (C2.w[0] < C2_lo)
    1575  		C2.w[1]++;
    1576  	    } else {	// 19 <= ind <= 32
    1577  	      C2.w[0] = C2.w[0] + midpoint128[ind - 19].w[0];
    1578  	      C2.w[1] = C2.w[1] + midpoint128[ind - 19].w[1];
    1579  	      if (C2.w[0] < C2_lo)
    1580  		C2.w[1]++;
    1581  	    }
    1582  	    // the approximation of 10^(-x1) was rounded up to 118 bits
    1583  	    __mul_128x128_to_256 (R256, C2, ten2mk128[ind]);	// R256 = C2*, f2*
    1584  	    // calculate C2* and f2*
    1585  	    // C2* is actually floor(C2*) in this case
    1586  	    // C2* and f2* need shifting and masking, as shown by
    1587  	    // shiftright128[] and maskhigh128[]
    1588  	    // the top Ex bits of 10^(-x1) are T* = ten2mk128trunc[ind], e.g.
    1589  	    // if x1=1, T*=ten2mk128trunc[0]=0x19999999999999999999999999999999
    1590  	    // if (0 < f2* < 10^(-x1)) then
    1591  	    //   if floor(C1+C2*) is even then C2* = floor(C2*) - logical right
    1592  	    //       shift; C2* has p decimal digits, correct by Prop. 1)
    1593  	    //   else if floor(C1+C2*) is odd C2* = floor(C2*)-1 (logical right
    1594  	    //       shift; C2* has p decimal digits, correct by Pr. 1)
    1595  	    // else
    1596  	    //   C2* = floor(C2*) (logical right shift; C has p decimal digits,
    1597  	    //       correct by Property 1)
    1598  	    // n = C2* * 10^(e2+x1)
    1599  
    1600  	    if (ind <= 2) {
    1601  	      highf2star.w[1] = 0x0;
    1602  	      highf2star.w[0] = 0x0;	// low f2* ok
    1603  	    } else if (ind <= 21) {
    1604  	      highf2star.w[1] = 0x0;
    1605  	      highf2star.w[0] = R256.w[2] & maskhigh128[ind];	// low f2* ok
    1606  	    } else {
    1607  	      highf2star.w[1] = R256.w[3] & maskhigh128[ind];
    1608  	      highf2star.w[0] = R256.w[2];	// low f2* is ok
    1609  	    }
    1610  	    // shift right C2* by Ex-128 = shiftright128[ind]
    1611  	    if (ind >= 3) {
    1612  	      shift = shiftright128[ind];
    1613  	      if (shift < 64) {	// 3 <= shift <= 63
    1614  		R256.w[2] =
    1615  		  (R256.w[2] >> shift) | (R256.w[3] << (64 - shift));
    1616  		R256.w[3] = (R256.w[3] >> shift);
    1617  	      } else {	// 66 <= shift <= 102
    1618  		R256.w[2] = (R256.w[3] >> (shift - 64));
    1619  		R256.w[3] = 0x0ULL;
    1620  	      }
    1621  	    }
    1622  	    // redundant
    1623  	    is_inexact_lt_midpoint = 0;
    1624  	    is_inexact_gt_midpoint = 0;
    1625  	    is_midpoint_lt_even = 0;
    1626  	    is_midpoint_gt_even = 0;
    1627  	    // determine inexactness of the rounding of C2*
    1628  	    // (cannot be followed by a second rounding)
    1629  	    // if (0 < f2* - 1/2 < 10^(-x1)) then
    1630  	    //   the result is exact
    1631  	    // else (if f2* - 1/2 > T* then)
	    //   the result is inexact
    1633  	    if (ind <= 2) {
    1634  	      if (R256.w[1] > 0x8000000000000000ull ||
    1635  		  (R256.w[1] == 0x8000000000000000ull
    1636  		   && R256.w[0] > 0x0ull)) {
    1637  		// f2* > 1/2 and the result may be exact
    1638  		tmp64A = R256.w[1] - 0x8000000000000000ull;	// f* - 1/2
    1639  		if ((tmp64A > ten2mk128trunc[ind].w[1]
    1640  		     || (tmp64A == ten2mk128trunc[ind].w[1]
    1641  			 && R256.w[0] >= ten2mk128trunc[ind].w[0]))) {
    1642  		  // set the inexact flag
    1643  		  *pfpsf |= INEXACT_EXCEPTION;
    1644  		  // this rounding is applied to C2 only!
    1645  		  // x_sign != y_sign
    1646  		  is_inexact_gt_midpoint = 1;
    1647  		}	// else the result is exact
    1648  		// rounding down, unless a midpoint in [ODD, EVEN]
    1649  	      } else {	// the result is inexact; f2* <= 1/2
    1650  		// set the inexact flag
    1651  		*pfpsf |= INEXACT_EXCEPTION;
    1652  		// this rounding is applied to C2 only!
    1653  		// x_sign != y_sign
    1654  		is_inexact_lt_midpoint = 1;
    1655  	      }
    1656  	    } else if (ind <= 21) {	// if 3 <= ind <= 21
    1657  	      if (highf2star.w[1] > 0x0 || (highf2star.w[1] == 0x0
    1658  					    && highf2star.w[0] >
    1659  					    onehalf128[ind])
    1660  		  || (highf2star.w[1] == 0x0
    1661  		      && highf2star.w[0] == onehalf128[ind]
    1662  		      && (R256.w[1] || R256.w[0]))) {
    1663  		// f2* > 1/2 and the result may be exact
    1664  		// Calculate f2* - 1/2
    1665  		tmp64A = highf2star.w[0] - onehalf128[ind];
    1666  		tmp64B = highf2star.w[1];
    1667  		if (tmp64A > highf2star.w[0])
    1668  		  tmp64B--;
    1669  		if (tmp64B || tmp64A
    1670  		    || R256.w[1] > ten2mk128trunc[ind].w[1]
    1671  		    || (R256.w[1] == ten2mk128trunc[ind].w[1]
    1672  			&& R256.w[0] > ten2mk128trunc[ind].w[0])) {
    1673  		  // set the inexact flag
    1674  		  *pfpsf |= INEXACT_EXCEPTION;
    1675  		  // this rounding is applied to C2 only!
    1676  		  // x_sign != y_sign
    1677  		  is_inexact_gt_midpoint = 1;
    1678  		}	// else the result is exact
    1679  	      } else {	// the result is inexact; f2* <= 1/2
    1680  		// set the inexact flag
    1681  		*pfpsf |= INEXACT_EXCEPTION;
    1682  		// this rounding is applied to C2 only!
    1683  		// x_sign != y_sign
    1684  		is_inexact_lt_midpoint = 1;
    1685  	      }
    1686  	    } else {	// if 22 <= ind <= 33
    1687  	      if (highf2star.w[1] > onehalf128[ind]
    1688  		  || (highf2star.w[1] == onehalf128[ind]
    1689  		      && (highf2star.w[0] || R256.w[1]
    1690  			  || R256.w[0]))) {
    1691  		// f2* > 1/2 and the result may be exact
    1692  		// Calculate f2* - 1/2
    1693  		// tmp64A = highf2star.w[0];
    1694  		tmp64B = highf2star.w[1] - onehalf128[ind];
    1695  		if (tmp64B || highf2star.w[0]
    1696  		    || R256.w[1] > ten2mk128trunc[ind].w[1]
    1697  		    || (R256.w[1] == ten2mk128trunc[ind].w[1]
    1698  			&& R256.w[0] > ten2mk128trunc[ind].w[0])) {
    1699  		  // set the inexact flag
    1700  		  *pfpsf |= INEXACT_EXCEPTION;
    1701  		  // this rounding is applied to C2 only!
    1702  		  // x_sign != y_sign
    1703  		  is_inexact_gt_midpoint = 1;
    1704  		}	// else the result is exact
    1705  	      } else {	// the result is inexact; f2* <= 1/2
    1706  		// set the inexact flag
    1707  		*pfpsf |= INEXACT_EXCEPTION;
    1708  		// this rounding is applied to C2 only!
    1709  		// x_sign != y_sign
    1710  		is_inexact_lt_midpoint = 1;
    1711  	      }
    1712  	    }
    1713  	    // check for midpoints after determining inexactness
    1714  	    if ((R256.w[1] || R256.w[0]) && (highf2star.w[1] == 0)
    1715  		&& (highf2star.w[0] == 0)
    1716  		&& (R256.w[1] < ten2mk128trunc[ind].w[1]
    1717  		    || (R256.w[1] == ten2mk128trunc[ind].w[1]
    1718  			&& R256.w[0] <= ten2mk128trunc[ind].w[0]))) {
    1719  	      // the result is a midpoint
    1720  	      if ((tmp64 + R256.w[2]) & 0x01) {	// MP in [EVEN, ODD]
    1721  		// if floor(C2*) is odd C = floor(C2*) - 1; the result may be 0
    1722  		R256.w[2]--;
    1723  		if (R256.w[2] == 0xffffffffffffffffull)
    1724  		  R256.w[3]--;
    1725  		// this rounding is applied to C2 only!
    1726  		// x_sign != y_sign
    1727  		is_midpoint_lt_even = 1;
    1728  		is_inexact_lt_midpoint = 0;
    1729  		is_inexact_gt_midpoint = 0;
    1730  	      } else {
    1731  		// else MP in [ODD, EVEN]
    1732  		// this rounding is applied to C2 only!
    1733  		// x_sign != y_sign
    1734  		is_midpoint_gt_even = 1;
    1735  		is_inexact_lt_midpoint = 0;
    1736  		is_inexact_gt_midpoint = 0;
    1737  	      }
    1738  	    }
    1739  	  } else {	// if (ind == -1) only when x1 = 0
    1740  	    R256.w[2] = C2_lo;
    1741  	    R256.w[3] = C2_hi;
    1742  	    is_midpoint_lt_even = 0;
    1743  	    is_midpoint_gt_even = 0;
    1744  	    is_inexact_lt_midpoint = 0;
    1745  	    is_inexact_gt_midpoint = 0;
    1746  	  }
    1747  	  // and now subtract C1 * 10^(e1-e2-x1) - (C2 * 10^(-x1))rnd,P34
    1748  	  // because x_sign != y_sign this last operation is exact
    1749  	  C1.w[0] = C1.w[0] - R256.w[2];
    1750  	  C1.w[1] = C1.w[1] - R256.w[3];
    1751  	  if (C1.w[0] > tmp64)
    1752  	    C1.w[1]--;	// borrow
    1753  	  if (C1.w[1] >= 0x8000000000000000ull) {	// negative coefficient!
    1754  	    C1.w[0] = ~C1.w[0];
    1755  	    C1.w[0]++;
    1756  	    C1.w[1] = ~C1.w[1];
    1757  	    if (C1.w[0] == 0x0)
    1758  	      C1.w[1]++;
    1759  	    tmp_sign = y_sign;	// the result will have the sign of y
    1760  	  } else {
    1761  	    tmp_sign = x_sign;
    1762  	  }
    1763  	  // the difference has exactly P34 digits
    1764  	  x_sign = tmp_sign;
    1765  	  if (x1 >= 1)
    1766  	    y_exp = y_exp + ((UINT64) x1 << 49);
    1767  	  C1_hi = C1.w[1];
    1768  	  C1_lo = C1.w[0];
    1769  	  // general correction from RN to RA, RM, RP, RZ; result uses y_exp
    1770  	  if (rnd_mode != ROUNDING_TO_NEAREST) {
    1771  	    if ((!x_sign
    1772  		 && ((rnd_mode == ROUNDING_UP && is_inexact_lt_midpoint)
    1773  		     ||
    1774  		     ((rnd_mode == ROUNDING_TIES_AWAY
    1775  		       || rnd_mode == ROUNDING_UP)
    1776  		      && is_midpoint_gt_even))) || (x_sign
    1777  						    &&
    1778  						    ((rnd_mode ==
    1779  						      ROUNDING_DOWN
    1780  						      &&
    1781  						      is_inexact_lt_midpoint)
    1782  						     ||
    1783  						     ((rnd_mode ==
    1784  						       ROUNDING_TIES_AWAY
    1785  						       || rnd_mode ==
    1786  						       ROUNDING_DOWN)
    1787  						      &&
    1788  						      is_midpoint_gt_even))))
    1789  	    {
    1790  	      // C1 = C1 + 1
    1791  	      C1_lo = C1_lo + 1;
    1792  	      if (C1_lo == 0) {	// rounding overflow in the low 64 bits
    1793  		C1_hi = C1_hi + 1;
    1794  	      }
    1795  	      if (C1_hi == 0x0001ed09bead87c0ull
    1796  		  && C1_lo == 0x378d8e6400000000ull) {
    1797  		// C1 = 10^34 => rounding overflow
    1798  		C1_hi = 0x0000314dc6448d93ull;
    1799  		C1_lo = 0x38c15b0a00000000ull;	// 10^33
    1800  		y_exp = y_exp + EXP_P1;
    1801  	      }
    1802  	    } else if ((is_midpoint_lt_even || is_inexact_gt_midpoint)
    1803  		       &&
    1804  		       ((x_sign
    1805  			 && (rnd_mode == ROUNDING_UP
    1806  			     || rnd_mode == ROUNDING_TO_ZERO))
    1807  			|| (!x_sign
    1808  			    && (rnd_mode == ROUNDING_DOWN
    1809  				|| rnd_mode == ROUNDING_TO_ZERO)))) {
    1810  	      // C1 = C1 - 1
    1811  	      C1_lo = C1_lo - 1;
    1812  	      if (C1_lo == 0xffffffffffffffffull)
    1813  		C1_hi--;
    1814  	      // check if we crossed into the lower decade
    1815  	      if (C1_hi == 0x0000314dc6448d93ull && C1_lo == 0x38c15b09ffffffffull) {	// 10^33 - 1
    1816  		C1_hi = 0x0001ed09bead87c0ull;	// 10^34 - 1
    1817  		C1_lo = 0x378d8e63ffffffffull;
    1818  		y_exp = y_exp - EXP_P1;
    1819  		// no underflow, because delta + q2 >= P34 + 1
    1820  	      }
    1821  	    } else {
    1822  	      ;	// exact, the result is already correct
    1823  	    }
    1824  	  }
    1825  	  // assemble the result
    1826  	  res.w[1] = x_sign | y_exp | C1_hi;
    1827  	  res.w[0] = C1_lo;
    1828  	}
    1829        }	// end delta = P34
    1830      } else {	// if (|delta| <= P34 - 1)
    1831        if (delta >= 0) {	// if (0 <= delta <= P34 - 1)
    1832  	if (delta <= P34 - 1 - q2) {
    1833  	  // calculate C' directly; the result is exact
    1834  	  // in this case 1<=q1<=P34-1, 1<=q2<=P34-1 and 0 <= e1-e2 <= P34-2
    1835  	  // The coefficient of the result is C1 * 10^(e1-e2) + C2 and the
	  // exponent is e2; either C1 or 10^(e1-e2) may not fit in 64 bits,
    1837  	  // but their product fits with certainty in 128 bits (actually in 113)
    1838  	  scale = delta - q1 + q2;	// scale = (int)(e1 >> 49) - (int)(e2 >> 49) 
    1839  
    1840  	  if (scale >= 20) {	// 10^(e1-e2) does not fit in 64 bits, but C1 does
    1841  	    __mul_128x64_to_128 (C1, C1_lo, ten2k128[scale - 20]);
    1842  	    C1_hi = C1.w[1];
    1843  	    C1_lo = C1.w[0];
    1844  	  } else if (scale >= 1) {
    1845  	    // if 1 <= scale <= 19 then 10^(e1-e2) fits in 64 bits 
    1846  	    if (q1 <= 19) {	// C1 fits in 64 bits
    1847  	      __mul_64x64_to_128MACH (C1, C1_lo, ten2k64[scale]);
    1848  	    } else {	// q1 >= 20
    1849  	      C1.w[1] = C1_hi;
    1850  	      C1.w[0] = C1_lo;
    1851  	      __mul_128x64_to_128 (C1, ten2k64[scale], C1);
    1852  	    }
    1853  	    C1_hi = C1.w[1];
    1854  	    C1_lo = C1.w[0];
    1855  	  } else {	// if (scale == 0) C1 is unchanged
    1856  	    C1.w[0] = C1_lo;	// C1.w[1] = C1_hi; 
    1857  	  }
    1858  	  // now add C2
    1859  	  if (x_sign == y_sign) {
    1860  	    // the result cannot overflow
    1861  	    C1_lo = C1_lo + C2_lo;
    1862  	    C1_hi = C1_hi + C2_hi;
    1863  	    if (C1_lo < C1.w[0])
    1864  	      C1_hi++;
    1865  	  } else {	// if x_sign != y_sign
    1866  	    C1_lo = C1_lo - C2_lo;
    1867  	    C1_hi = C1_hi - C2_hi;
    1868  	    if (C1_lo > C1.w[0])
    1869  	      C1_hi--;
    1870  	    // the result can be zero, but it cannot overflow
    1871  	    if (C1_lo == 0 && C1_hi == 0) {
    1872  	      // assemble the result
    1873  	      if (x_exp < y_exp)
    1874  		res.w[1] = x_exp;
    1875  	      else
    1876  		res.w[1] = y_exp;
    1877  	      res.w[0] = 0;
    1878  	      if (rnd_mode == ROUNDING_DOWN) {
    1879  		res.w[1] |= 0x8000000000000000ull;
    1880  	      }
    1881  	      BID_SWAP128 (res);
    1882  	      BID_RETURN (res);
    1883  	    }
    1884  	    if (C1_hi >= 0x8000000000000000ull) {	// negative coefficient!
    1885  	      C1_lo = ~C1_lo;
    1886  	      C1_lo++;
    1887  	      C1_hi = ~C1_hi;
    1888  	      if (C1_lo == 0x0)
    1889  		C1_hi++;
    1890  	      x_sign = y_sign;	// the result will have the sign of y
    1891  	    }
    1892  	  }
    1893  	  // assemble the result
    1894  	  res.w[1] = x_sign | y_exp | C1_hi;
    1895  	  res.w[0] = C1_lo;
    1896  	} else if (delta == P34 - q2) {
    1897  	  // calculate C' directly; the result may be inexact if it requires 
    1898  	  // P34+1 decimal digits; in this case the 'cutoff' point for addition
    1899  	  // is at the position of the lsb of C2, so 0 <= e1-e2 <= P34-1
    1900  	  // The coefficient of the result is C1 * 10^(e1-e2) + C2 and the
	  // exponent is e2; either C1 or 10^(e1-e2) may not fit in 64 bits,
    1902  	  // but their product fits with certainty in 128 bits (actually in 113)
    1903  	  scale = delta - q1 + q2;	// scale = (int)(e1 >> 49) - (int)(e2 >> 49)
    1904  	  if (scale >= 20) {	// 10^(e1-e2) does not fit in 64 bits, but C1 does
    1905  	    __mul_128x64_to_128 (C1, C1_lo, ten2k128[scale - 20]);
    1906  	  } else if (scale >= 1) {
    1907  	    // if 1 <= scale <= 19 then 10^(e1-e2) fits in 64 bits
    1908  	    if (q1 <= 19) {	// C1 fits in 64 bits
    1909  	      __mul_64x64_to_128MACH (C1, C1_lo, ten2k64[scale]);
    1910  	    } else {	// q1 >= 20
    1911  	      C1.w[1] = C1_hi;
    1912  	      C1.w[0] = C1_lo;
    1913  	      __mul_128x64_to_128 (C1, ten2k64[scale], C1);
    1914  	    }
    1915  	  } else {	// if (scale == 0) C1 is unchanged
    1916  	    C1.w[1] = C1_hi;
    1917  	    C1.w[0] = C1_lo;	// only the low part is necessary
    1918  	  }
    1919  	  C1_hi = C1.w[1];
    1920  	  C1_lo = C1.w[0];
    1921  	  // now add C2
    1922  	  if (x_sign == y_sign) {
    1923  	    // the result can overflow!
    1924  	    C1_lo = C1_lo + C2_lo;
    1925  	    C1_hi = C1_hi + C2_hi;
    1926  	    if (C1_lo < C1.w[0])
    1927  	      C1_hi++;
    1928  	    // test for overflow, possible only when C1 >= 10^34
    1929  	    if (C1_hi > 0x0001ed09bead87c0ull || (C1_hi == 0x0001ed09bead87c0ull && C1_lo >= 0x378d8e6400000000ull)) {	// C1 >= 10^34
    1930  	      // in this case q = P34 + 1 and x = q - P34 = 1, so multiply 
    1931  	      // C'' = C'+ 5 = C1 + 5 by k1 ~ 10^(-1) calculated for P34 + 1 
    1932  	      // decimal digits
    1933  	      // Calculate C'' = C' + 1/2 * 10^x
    1934  	      if (C1_lo >= 0xfffffffffffffffbull) {	// low half add has carry
    1935  		C1_lo = C1_lo + 5;
    1936  		C1_hi = C1_hi + 1;
    1937  	      } else {
    1938  		C1_lo = C1_lo + 5;
    1939  	      }
    1940  	      // the approximation of 10^(-1) was rounded up to 118 bits
    1941  	      // 10^(-1) =~ 33333333333333333333333333333400 * 2^-129
    1942  	      // 10^(-1) =~ 19999999999999999999999999999a00 * 2^-128
    1943  	      C1.w[1] = C1_hi;
    1944  	      C1.w[0] = C1_lo;	// C''
    1945  	      ten2m1.w[1] = 0x1999999999999999ull;
    1946  	      ten2m1.w[0] = 0x9999999999999a00ull;
    1947  	      __mul_128x128_to_256 (P256, C1, ten2m1);	// P256 = C*, f*
    1948  	      // C* is actually floor(C*) in this case
    1949  	      // the top Ex = 128 bits of 10^(-1) are 
    1950  	      // T* = 0x00199999999999999999999999999999
    1951  	      // if (0 < f* < 10^(-x)) then
    1952  	      //   if floor(C*) is even then C = floor(C*) - logical right 
    1953  	      //       shift; C has p decimal digits, correct by Prop. 1)
    1954  	      //   else if floor(C*) is odd C = floor(C*) - 1 (logical right
    1955  	      //       shift; C has p decimal digits, correct by Pr. 1)
    1956  	      // else
    1957  	      //   C = floor(C*) (logical right shift; C has p decimal digits,
    1958  	      //       correct by Property 1)
    1959  	      // n = C * 10^(e2+x)
    1960  	      if ((P256.w[1] || P256.w[0])
    1961  		  && (P256.w[1] < 0x1999999999999999ull
    1962  		      || (P256.w[1] == 0x1999999999999999ull
    1963  			  && P256.w[0] <= 0x9999999999999999ull))) {
    1964  		// the result is a midpoint
    1965  		if (P256.w[2] & 0x01) {
    1966  		  is_midpoint_gt_even = 1;
    1967  		  // if floor(C*) is odd C = floor(C*) - 1; the result is not 0
    1968  		  P256.w[2]--;
    1969  		  if (P256.w[2] == 0xffffffffffffffffull)
    1970  		    P256.w[3]--;
    1971  		} else {
    1972  		  is_midpoint_lt_even = 1;
    1973  		}
    1974  	      }
    1975  	      // n = Cstar * 10^(e2+1)
    1976  	      y_exp = y_exp + EXP_P1;
    1977  	      // C* != 10^P because C* has P34 digits
    1978  	      // check for overflow
    1979  	      if (y_exp == EXP_MAX_P1
    1980  		  && (rnd_mode == ROUNDING_TO_NEAREST
    1981  		      || rnd_mode == ROUNDING_TIES_AWAY)) {
    1982  		// overflow for RN
    1983  		res.w[1] = x_sign | 0x7800000000000000ull;	// +/-inf
    1984  		res.w[0] = 0x0ull;
    1985  		// set the inexact flag
    1986  		*pfpsf |= INEXACT_EXCEPTION;
    1987  		// set the overflow flag
    1988  		*pfpsf |= OVERFLOW_EXCEPTION;
    1989  		BID_SWAP128 (res);
    1990  		BID_RETURN (res);
    1991  	      }
    1992  	      // if (0 < f* - 1/2 < 10^(-x)) then 
    1993  	      //   the result of the addition is exact 
    1994  	      // else 
    1995  	      //   the result of the addition is inexact
    1996  	      if (P256.w[1] > 0x8000000000000000ull || (P256.w[1] == 0x8000000000000000ull && P256.w[0] > 0x0ull)) {	// the result may be exact
    1997  		tmp64 = P256.w[1] - 0x8000000000000000ull;	// f* - 1/2
    1998  		if ((tmp64 > 0x1999999999999999ull
    1999  		     || (tmp64 == 0x1999999999999999ull
    2000  			 && P256.w[0] >= 0x9999999999999999ull))) {
    2001  		  // set the inexact flag
    2002  		  *pfpsf |= INEXACT_EXCEPTION;
    2003  		  is_inexact = 1;
    2004  		}	// else the result is exact
    2005  	      } else {	// the result is inexact
    2006  		// set the inexact flag
    2007  		*pfpsf |= INEXACT_EXCEPTION;
    2008  		is_inexact = 1;
    2009  	      }
    2010  	      C1_hi = P256.w[3];
    2011  	      C1_lo = P256.w[2];
    2012  	      if (!is_midpoint_gt_even && !is_midpoint_lt_even) {
    2013  		is_inexact_lt_midpoint = is_inexact
    2014  		  && (P256.w[1] & 0x8000000000000000ull);
    2015  		is_inexact_gt_midpoint = is_inexact
    2016  		  && !(P256.w[1] & 0x8000000000000000ull);
    2017  	      }
    2018  	      // general correction from RN to RA, RM, RP, RZ; 
    2019  	      // result uses y_exp
    2020  	      if (rnd_mode != ROUNDING_TO_NEAREST) {
    2021  		if ((!x_sign
    2022  		     &&
    2023  		     ((rnd_mode == ROUNDING_UP
    2024  		       && is_inexact_lt_midpoint)
    2025  		      ||
    2026  		      ((rnd_mode == ROUNDING_TIES_AWAY
    2027  			|| rnd_mode == ROUNDING_UP)
    2028  		       && is_midpoint_gt_even))) || (x_sign
    2029  						     &&
    2030  						     ((rnd_mode ==
    2031  						       ROUNDING_DOWN
    2032  						       &&
    2033  						       is_inexact_lt_midpoint)
    2034  						      ||
    2035  						      ((rnd_mode ==
    2036  							ROUNDING_TIES_AWAY
    2037  							|| rnd_mode ==
    2038  							ROUNDING_DOWN)
    2039  						       &&
    2040  						       is_midpoint_gt_even))))
    2041  		{
    2042  		  // C1 = C1 + 1
    2043  		  C1_lo = C1_lo + 1;
    2044  		  if (C1_lo == 0) {	// rounding overflow in the low 64 bits
    2045  		    C1_hi = C1_hi + 1;
    2046  		  }
    2047  		  if (C1_hi == 0x0001ed09bead87c0ull
    2048  		      && C1_lo == 0x378d8e6400000000ull) {
    2049  		    // C1 = 10^34 => rounding overflow
    2050  		    C1_hi = 0x0000314dc6448d93ull;
    2051  		    C1_lo = 0x38c15b0a00000000ull;	// 10^33
    2052  		    y_exp = y_exp + EXP_P1;
    2053  		  }
    2054  		} else
    2055  		  if ((is_midpoint_lt_even || is_inexact_gt_midpoint)
    2056  		      &&
    2057  		      ((x_sign
    2058  			&& (rnd_mode == ROUNDING_UP
    2059  			    || rnd_mode == ROUNDING_TO_ZERO))
    2060  		       || (!x_sign
    2061  			   && (rnd_mode == ROUNDING_DOWN
    2062  			       || rnd_mode == ROUNDING_TO_ZERO)))) {
    2063  		  // C1 = C1 - 1
    2064  		  C1_lo = C1_lo - 1;
    2065  		  if (C1_lo == 0xffffffffffffffffull)
    2066  		    C1_hi--;
    2067  		  // check if we crossed into the lower decade
    2068  		  if (C1_hi == 0x0000314dc6448d93ull && C1_lo == 0x38c15b09ffffffffull) {	// 10^33 - 1
    2069  		    C1_hi = 0x0001ed09bead87c0ull;	// 10^34 - 1
    2070  		    C1_lo = 0x378d8e63ffffffffull;
    2071  		    y_exp = y_exp - EXP_P1;
    2072  		    // no underflow, because delta + q2 >= P34 + 1
    2073  		  }
    2074  		} else {
    2075  		  ;	// exact, the result is already correct
    2076  		}
    2077  		// in all cases check for overflow (RN and RA solved already)
    2078  		if (y_exp == EXP_MAX_P1) {	// overflow
    2079  		  if ((rnd_mode == ROUNDING_DOWN && x_sign) ||	// RM and res < 0
    2080  		      (rnd_mode == ROUNDING_UP && !x_sign)) {	// RP and res > 0
    2081  		    C1_hi = 0x7800000000000000ull;	// +inf
    2082  		    C1_lo = 0x0ull;
    2083  		  } else {	// RM and res > 0, RP and res < 0, or RZ
    2084  		    C1_hi = 0x5fffed09bead87c0ull;
    2085  		    C1_lo = 0x378d8e63ffffffffull;
    2086  		  }
    2087  		  y_exp = 0;	// x_sign is preserved
    2088  		  // set the inexact flag (in case the exact addition was exact)
    2089  		  *pfpsf |= INEXACT_EXCEPTION;
    2090  		  // set the overflow flag
    2091  		  *pfpsf |= OVERFLOW_EXCEPTION;
    2092  		}
    2093  	      }
    2094  	    }	// else if (C1 < 10^34) then C1 is the coeff.; the result is exact
    2095  	  } else {	// if x_sign != y_sign the result is exact
    2096  	    C1_lo = C1_lo - C2_lo;
    2097  	    C1_hi = C1_hi - C2_hi;
    2098  	    if (C1_lo > C1.w[0])
    2099  	      C1_hi--;
    2100  	    // the result can be zero, but it cannot overflow
    2101  	    if (C1_lo == 0 && C1_hi == 0) {
    2102  	      // assemble the result
    2103  	      if (x_exp < y_exp)
    2104  		res.w[1] = x_exp;
    2105  	      else
    2106  		res.w[1] = y_exp;
    2107  	      res.w[0] = 0;
    2108  	      if (rnd_mode == ROUNDING_DOWN) {
    2109  		res.w[1] |= 0x8000000000000000ull;
    2110  	      }
    2111  	      BID_SWAP128 (res);
    2112  	      BID_RETURN (res);
    2113  	    }
    2114  	    if (C1_hi >= 0x8000000000000000ull) {	// negative coefficient!
    2115  	      C1_lo = ~C1_lo;
    2116  	      C1_lo++;
    2117  	      C1_hi = ~C1_hi;
    2118  	      if (C1_lo == 0x0)
    2119  		C1_hi++;
    2120  	      x_sign = y_sign;	// the result will have the sign of y
    2121  	    }
    2122  	  }
    2123  	  // assemble the result
    2124  	  res.w[1] = x_sign | y_exp | C1_hi;
    2125  	  res.w[0] = C1_lo;
    2126  	} else {	// if (delta >= P34 + 1 - q2)
    2127  	  // instead of C' = (C1 * 10^(e1-e2) + C2)rnd,P34
    2128  	  // calculate C' = C1 * 10^(e1-e2-x1) + (C2 * 10^(-x1))rnd,P34 
    2129  	  // where x1 = q1 + e1 - e2 - P34, 1 <= x1 <= P34 - 1
    2130  	  // In most cases C' will have P34 digits, and n = C' * 10^(e2+x1)
    2131  	  // If the result has P34+1 digits, redo the steps above with x1+1
    2132  	  // If the result has P34-1 digits or less, redo the steps above with 
    2133  	  // x1-1 but only if initially x1 >= 1
    2134  	  // NOTE: these two steps can be improved, e.g we could guess if
    2135  	  // P34+1 or P34-1 digits will be obtained by adding/subtracting just
    2136  	  // the top 64 bits of the two operands
    2137  	  // The result cannot be zero, but it can overflow
    2138  	  x1 = delta + q2 - P34;	// 1 <= x1 <= P34-1
    2139  	roundC2:
    2140  	  // Calculate C1 * 10^(e1-e2-x1) where 0 <= e1-e2-x1 <= P34 - 1
    2141  	  // scale = (int)(e1 >> 49) - (int)(e2 >> 49) - x1; 0 <= scale <= P34-1
    2142  	  scale = delta - q1 + q2 - x1;	// scale = e1 - e2 - x1 = P34 - q1
	  // either C1 or 10^(e1-e2-x1) may not fit in 64 bits,
    2144  	  // but their product fits with certainty in 128 bits (actually in 113)
    2145  	  if (scale >= 20) {	//10^(e1-e2-x1) doesn't fit in 64 bits, but C1 does
    2146  	    __mul_128x64_to_128 (C1, C1_lo, ten2k128[scale - 20]);
    2147  	  } else if (scale >= 1) {
    2148  	    // if 1 <= scale <= 19 then 10^(e1-e2-x1) fits in 64 bits
    2149  	    if (q1 <= 19) {	// C1 fits in 64 bits
    2150  	      __mul_64x64_to_128MACH (C1, C1_lo, ten2k64[scale]);
    2151  	    } else {	// q1 >= 20
    2152  	      C1.w[1] = C1_hi;
    2153  	      C1.w[0] = C1_lo;
    2154  	      __mul_128x64_to_128 (C1, ten2k64[scale], C1);
    2155  	    }
    2156  	  } else {	// if (scale == 0) C1 is unchanged
    2157  	    C1.w[1] = C1_hi;
    2158  	    C1.w[0] = C1_lo;
    2159  	  }
    2160  	  tmp64 = C1.w[0];	// C1.w[1], C1.w[0] contains C1 * 10^(e1-e2-x1)
    2161  
    2162  	  // now round C2 to q2-x1 decimal digits, where 1<=x1<=q2-1<=P34-1
    2163  	  // (but if we got here a second time after x1 = x1 - 1, then 
    2164  	  // x1 >= 0; note that for x1 = 0 C2 is unchanged)
    2165  	  // C2' = C2 + 1/2 * 10^x1 = C2 + 5 * 10^(x1-1)
    2166  	  ind = x1 - 1;	// 0 <= ind <= q2-2<=P34-2=32; but note that if x1 = 0
    2167  	  // during a second pass, then ind = -1
    2168  	  if (ind >= 0) {	// if (x1 >= 1)
    2169  	    C2.w[0] = C2_lo;
    2170  	    C2.w[1] = C2_hi;
    2171  	    if (ind <= 18) {
    2172  	      C2.w[0] = C2.w[0] + midpoint64[ind];
    2173  	      if (C2.w[0] < C2_lo)
    2174  		C2.w[1]++;
    2175  	    } else {	// 19 <= ind <= 32
    2176  	      C2.w[0] = C2.w[0] + midpoint128[ind - 19].w[0];
    2177  	      C2.w[1] = C2.w[1] + midpoint128[ind - 19].w[1];
    2178  	      if (C2.w[0] < C2_lo)
    2179  		C2.w[1]++;
    2180  	    }
    2181  	    // the approximation of 10^(-x1) was rounded up to 118 bits
    2182  	    __mul_128x128_to_256 (R256, C2, ten2mk128[ind]);	// R256 = C2*, f2*
    2183  	    // calculate C2* and f2*
    2184  	    // C2* is actually floor(C2*) in this case
    2185  	    // C2* and f2* need shifting and masking, as shown by
    2186  	    // shiftright128[] and maskhigh128[]
    2187  	    // the top Ex bits of 10^(-x1) are T* = ten2mk128trunc[ind], e.g.
    2188  	    // if x1=1, T*=ten2mk128trunc[0]=0x19999999999999999999999999999999
    2189  	    // if (0 < f2* < 10^(-x1)) then
    2190  	    //   if floor(C1+C2*) is even then C2* = floor(C2*) - logical right
    2191  	    //       shift; C2* has p decimal digits, correct by Prop. 1)
    2192  	    //   else if floor(C1+C2*) is odd C2* = floor(C2*)-1 (logical right
    2193  	    //       shift; C2* has p decimal digits, correct by Pr. 1)
    2194  	    // else
    2195  	    //   C2* = floor(C2*) (logical right shift; C has p decimal digits,
    2196  	    //       correct by Property 1)
    2197  	    // n = C2* * 10^(e2+x1)
    2198  
    2199  	    if (ind <= 2) {
    2200  	      highf2star.w[1] = 0x0;
    2201  	      highf2star.w[0] = 0x0;	// low f2* ok
    2202  	    } else if (ind <= 21) {
    2203  	      highf2star.w[1] = 0x0;
    2204  	      highf2star.w[0] = R256.w[2] & maskhigh128[ind];	// low f2* ok
    2205  	    } else {
    2206  	      highf2star.w[1] = R256.w[3] & maskhigh128[ind];
    2207  	      highf2star.w[0] = R256.w[2];	// low f2* is ok
    2208  	    }
    2209  	    // shift right C2* by Ex-128 = shiftright128[ind]
    2210  	    if (ind >= 3) {
    2211  	      shift = shiftright128[ind];
    2212  	      if (shift < 64) {	// 3 <= shift <= 63
    2213  		R256.w[2] =
    2214  		  (R256.w[2] >> shift) | (R256.w[3] << (64 - shift));
    2215  		R256.w[3] = (R256.w[3] >> shift);
    2216  	      } else {	// 66 <= shift <= 102
    2217  		R256.w[2] = (R256.w[3] >> (shift - 64));
    2218  		R256.w[3] = 0x0ULL;
    2219  	      }
    2220  	    }
    2221  	    if (second_pass) {
    2222  	      is_inexact_lt_midpoint = 0;
    2223  	      is_inexact_gt_midpoint = 0;
    2224  	      is_midpoint_lt_even = 0;
    2225  	      is_midpoint_gt_even = 0;
    2226  	    }
    2227  	    // determine inexactness of the rounding of C2* (this may be 
    2228  	    // followed by a second rounding only if we get P34+1 
    2229  	    // decimal digits)
    2230  	    // if (0 < f2* - 1/2 < 10^(-x1)) then
    2231  	    //   the result is exact
    2232  	    // else (if f2* - 1/2 > T* then)
    2233  	    //   the result is inexact
    2234  	    if (ind <= 2) {
    2235  	      if (R256.w[1] > 0x8000000000000000ull ||
    2236  		  (R256.w[1] == 0x8000000000000000ull
    2237  		   && R256.w[0] > 0x0ull)) {
    2238  		// f2* > 1/2 and the result may be exact
    2239  		tmp64A = R256.w[1] - 0x8000000000000000ull;	// f* - 1/2
    2240  		if ((tmp64A > ten2mk128trunc[ind].w[1]
    2241  		     || (tmp64A == ten2mk128trunc[ind].w[1]
    2242  			 && R256.w[0] >= ten2mk128trunc[ind].w[0]))) {
    2243  		  // set the inexact flag
    2244  		  // *pfpsf |= INEXACT_EXCEPTION;
    2245  		  tmp_inexact = 1;	// may be set again during a second pass
    2246  		  // this rounding is applied to C2 only!
    2247  		  if (x_sign == y_sign)
    2248  		    is_inexact_lt_midpoint = 1;
    2249  		  else	// if (x_sign != y_sign)
    2250  		    is_inexact_gt_midpoint = 1;
    2251  		}	// else the result is exact
    2252  		// rounding down, unless a midpoint in [ODD, EVEN]
    2253  	      } else {	// the result is inexact; f2* <= 1/2
    2254  		// set the inexact flag
    2255  		// *pfpsf |= INEXACT_EXCEPTION;
    2256  		tmp_inexact = 1;	// just in case we will round a second time
    2257  		// rounding up, unless a midpoint in [EVEN, ODD]
    2258  		// this rounding is applied to C2 only!
    2259  		if (x_sign == y_sign)
    2260  		  is_inexact_gt_midpoint = 1;
    2261  		else	// if (x_sign != y_sign)
    2262  		  is_inexact_lt_midpoint = 1;
    2263  	      }
    2264  	    } else if (ind <= 21) {	// if 3 <= ind <= 21
    2265  	      if (highf2star.w[1] > 0x0 || (highf2star.w[1] == 0x0
    2266  					    && highf2star.w[0] >
    2267  					    onehalf128[ind])
    2268  		  || (highf2star.w[1] == 0x0
    2269  		      && highf2star.w[0] == onehalf128[ind]
    2270  		      && (R256.w[1] || R256.w[0]))) {
    2271  		// f2* > 1/2 and the result may be exact
    2272  		// Calculate f2* - 1/2
    2273  		tmp64A = highf2star.w[0] - onehalf128[ind];
    2274  		tmp64B = highf2star.w[1];
    2275  		if (tmp64A > highf2star.w[0])
    2276  		  tmp64B--;
    2277  		if (tmp64B || tmp64A
    2278  		    || R256.w[1] > ten2mk128trunc[ind].w[1]
    2279  		    || (R256.w[1] == ten2mk128trunc[ind].w[1]
    2280  			&& R256.w[0] > ten2mk128trunc[ind].w[0])) {
    2281  		  // set the inexact flag
    2282  		  // *pfpsf |= INEXACT_EXCEPTION;
    2283  		  tmp_inexact = 1;	// may be set again during a second pass
    2284  		  // this rounding is applied to C2 only!
    2285  		  if (x_sign == y_sign)
    2286  		    is_inexact_lt_midpoint = 1;
    2287  		  else	// if (x_sign != y_sign)
    2288  		    is_inexact_gt_midpoint = 1;
    2289  		}	// else the result is exact
    2290  	      } else {	// the result is inexact; f2* <= 1/2
    2291  		// set the inexact flag
    2292  		// *pfpsf |= INEXACT_EXCEPTION;
    2293  		tmp_inexact = 1;	// may be set again during a second pass
    2294  		// rounding up, unless a midpoint in [EVEN, ODD]
    2295  		// this rounding is applied to C2 only!
    2296  		if (x_sign == y_sign)
    2297  		  is_inexact_gt_midpoint = 1;
    2298  		else	// if (x_sign != y_sign)
    2299  		  is_inexact_lt_midpoint = 1;
    2300  	      }
    2301  	    } else {	// if 22 <= ind <= 33
    2302  	      if (highf2star.w[1] > onehalf128[ind]
    2303  		  || (highf2star.w[1] == onehalf128[ind]
    2304  		      && (highf2star.w[0] || R256.w[1]
    2305  			  || R256.w[0]))) {
    2306  		// f2* > 1/2 and the result may be exact
    2307  		// Calculate f2* - 1/2
    2308  		// tmp64A = highf2star.w[0];
    2309  		tmp64B = highf2star.w[1] - onehalf128[ind];
    2310  		if (tmp64B || highf2star.w[0]
    2311  		    || R256.w[1] > ten2mk128trunc[ind].w[1]
    2312  		    || (R256.w[1] == ten2mk128trunc[ind].w[1]
    2313  			&& R256.w[0] > ten2mk128trunc[ind].w[0])) {
    2314  		  // set the inexact flag
    2315  		  // *pfpsf |= INEXACT_EXCEPTION;
    2316  		  tmp_inexact = 1;	// may be set again during a second pass
    2317  		  // this rounding is applied to C2 only!
    2318  		  if (x_sign == y_sign)
    2319  		    is_inexact_lt_midpoint = 1;
    2320  		  else	// if (x_sign != y_sign)
    2321  		    is_inexact_gt_midpoint = 1;
    2322  		}	// else the result is exact
    2323  	      } else {	// the result is inexact; f2* <= 1/2
    2324  		// set the inexact flag
    2325  		// *pfpsf |= INEXACT_EXCEPTION;
    2326  		tmp_inexact = 1;	// may be set again during a second pass
    2327  		// rounding up, unless a midpoint in [EVEN, ODD]
    2328  		// this rounding is applied to C2 only!
    2329  		if (x_sign == y_sign)
    2330  		  is_inexact_gt_midpoint = 1;
    2331  		else	// if (x_sign != y_sign)
    2332  		  is_inexact_lt_midpoint = 1;
    2333  	      }
    2334  	    }
    2335  	    // check for midpoints
    2336  	    if ((R256.w[1] || R256.w[0]) && (highf2star.w[1] == 0)
    2337  		&& (highf2star.w[0] == 0)
    2338  		&& (R256.w[1] < ten2mk128trunc[ind].w[1]
    2339  		    || (R256.w[1] == ten2mk128trunc[ind].w[1]
    2340  			&& R256.w[0] <= ten2mk128trunc[ind].w[0]))) {
    2341  	      // the result is a midpoint
    2342  	      if ((tmp64 + R256.w[2]) & 0x01) {	// MP in [EVEN, ODD]
    2343  		// if floor(C2*) is odd C = floor(C2*) - 1; the result may be 0
    2344  		R256.w[2]--;
    2345  		if (R256.w[2] == 0xffffffffffffffffull)
    2346  		  R256.w[3]--;
    2347  		// this rounding is applied to C2 only!
    2348  		if (x_sign == y_sign)
    2349  		  is_midpoint_gt_even = 1;
    2350  		else	// if (x_sign != y_sign)
    2351  		  is_midpoint_lt_even = 1;
    2352  		is_inexact_lt_midpoint = 0;
    2353  		is_inexact_gt_midpoint = 0;
    2354  	      } else {
    2355  		// else MP in [ODD, EVEN]
    2356  		// this rounding is applied to C2 only!
    2357  		if (x_sign == y_sign)
    2358  		  is_midpoint_lt_even = 1;
    2359  		else	// if (x_sign != y_sign)
    2360  		  is_midpoint_gt_even = 1;
    2361  		is_inexact_lt_midpoint = 0;
    2362  		is_inexact_gt_midpoint = 0;
    2363  	      }
    2364  	    }
    2365  	    // end if (ind >= 0)
    2366  	  } else {	// if (ind == -1); only during a 2nd pass, and when x1 = 0
    2367  	    R256.w[2] = C2_lo;
    2368  	    R256.w[3] = C2_hi;
    2369  	    tmp_inexact = 0;
    2370  	    // to correct a possible setting to 1 from 1st pass
    2371  	    if (second_pass) {
    2372  	      is_midpoint_lt_even = 0;
    2373  	      is_midpoint_gt_even = 0;
    2374  	      is_inexact_lt_midpoint = 0;
    2375  	      is_inexact_gt_midpoint = 0;
    2376  	    }
    2377  	  }
    2378  	  // and now add/subtract C1 * 10^(e1-e2-x1) +/- (C2 * 10^(-x1))rnd,P34
    2379  	  if (x_sign == y_sign) {	// addition; could overflow
    2380  	    // no second pass is possible this way (only for x_sign != y_sign)
    2381  	    C1.w[0] = C1.w[0] + R256.w[2];
    2382  	    C1.w[1] = C1.w[1] + R256.w[3];
    2383  	    if (C1.w[0] < tmp64)
    2384  	      C1.w[1]++;	// carry
    2385  	    // if the sum has P34+1 digits, i.e. C1>=10^34 redo the calculation
    2386  	    // with x1=x1+1 
    2387  	    if (C1.w[1] > 0x0001ed09bead87c0ull || (C1.w[1] == 0x0001ed09bead87c0ull && C1.w[0] >= 0x378d8e6400000000ull)) {	// C1 >= 10^34
    2388  	      // chop off one more digit from the sum, but make sure there is
    2389  	      // no double-rounding error (see table - double rounding logic)
    2390  	      // now round C1 from P34+1 to P34 decimal digits
    2391  	      // C1' = C1 + 1/2 * 10 = C1 + 5
    2392  	      if (C1.w[0] >= 0xfffffffffffffffbull) {	// low half add has carry
    2393  		C1.w[0] = C1.w[0] + 5;
    2394  		C1.w[1] = C1.w[1] + 1;
    2395  	      } else {
    2396  		C1.w[0] = C1.w[0] + 5;
    2397  	      }
    2398  	      // the approximation of 10^(-1) was rounded up to 118 bits
    2399  	      __mul_128x128_to_256 (Q256, C1, ten2mk128[0]);	// Q256 = C1*, f1*
    2400  	      // C1* is actually floor(C1*) in this case
    2401  	      // the top 128 bits of 10^(-1) are
    2402  	      // T* = ten2mk128trunc[0]=0x19999999999999999999999999999999
    2403  	      // if (0 < f1* < 10^(-1)) then
    2404  	      //   if floor(C1*) is even then C1* = floor(C1*) - logical right
    2405  	      //       shift; C1* has p decimal digits, correct by Prop. 1)
    2406  	      //   else if floor(C1*) is odd C1* = floor(C1*) - 1 (logical right
    2407  	      //       shift; C1* has p decimal digits, correct by Pr. 1)
    2408  	      // else
    2409  	      //   C1* = floor(C1*) (logical right shift; C has p decimal digits
    2410  	      //       correct by Property 1)
    2411  	      // n = C1* * 10^(e2+x1+1)
    2412  	      if ((Q256.w[1] || Q256.w[0])
    2413  		  && (Q256.w[1] < ten2mk128trunc[0].w[1]
    2414  		      || (Q256.w[1] == ten2mk128trunc[0].w[1]
    2415  			  && Q256.w[0] <= ten2mk128trunc[0].w[0]))) {
    2416  		// the result is a midpoint
    2417  		if (is_inexact_lt_midpoint) {	// for the 1st rounding
    2418  		  is_inexact_gt_midpoint = 1;
    2419  		  is_inexact_lt_midpoint = 0;
    2420  		  is_midpoint_gt_even = 0;
    2421  		  is_midpoint_lt_even = 0;
    2422  		} else if (is_inexact_gt_midpoint) {	// for the 1st rounding
    2423  		  Q256.w[2]--;
    2424  		  if (Q256.w[2] == 0xffffffffffffffffull)
    2425  		    Q256.w[3]--;
    2426  		  is_inexact_gt_midpoint = 0;
    2427  		  is_inexact_lt_midpoint = 1;
    2428  		  is_midpoint_gt_even = 0;
    2429  		  is_midpoint_lt_even = 0;
    2430  		} else if (is_midpoint_gt_even) {	// for the 1st rounding
    2431  		  // Note: cannot have is_midpoint_lt_even
    2432  		  is_inexact_gt_midpoint = 0;
    2433  		  is_inexact_lt_midpoint = 1;
    2434  		  is_midpoint_gt_even = 0;
    2435  		  is_midpoint_lt_even = 0;
    2436  		} else {	// the first rounding must have been exact
    2437  		  if (Q256.w[2] & 0x01) {	// MP in [EVEN, ODD]
    2438  		    // the truncated result is correct
    2439  		    Q256.w[2]--;
    2440  		    if (Q256.w[2] == 0xffffffffffffffffull)
    2441  		      Q256.w[3]--;
    2442  		    is_inexact_gt_midpoint = 0;
    2443  		    is_inexact_lt_midpoint = 0;
    2444  		    is_midpoint_gt_even = 1;
    2445  		    is_midpoint_lt_even = 0;
    2446  		  } else {	// MP in [ODD, EVEN]
    2447  		    is_inexact_gt_midpoint = 0;
    2448  		    is_inexact_lt_midpoint = 0;
    2449  		    is_midpoint_gt_even = 0;
    2450  		    is_midpoint_lt_even = 1;
    2451  		  }
    2452  		}
    2453  		tmp_inexact = 1;	// in all cases
    2454  	      } else {	// the result is not a midpoint 
    2455  		// determine inexactness of the rounding of C1 (the sum C1+C2*)
    2456  		// if (0 < f1* - 1/2 < 10^(-1)) then
    2457  		//   the result is exact
    2458  		// else (if f1* - 1/2 > T* then)
    2459  		//   the result is inexact
    2460  		// ind = 0
    2461  		if (Q256.w[1] > 0x8000000000000000ull
    2462  		    || (Q256.w[1] == 0x8000000000000000ull
    2463  			&& Q256.w[0] > 0x0ull)) {
    2464  		  // f1* > 1/2 and the result may be exact
    2465  		  Q256.w[1] = Q256.w[1] - 0x8000000000000000ull;	// f1* - 1/2
    2466  		  if ((Q256.w[1] > ten2mk128trunc[0].w[1]
    2467  		       || (Q256.w[1] == ten2mk128trunc[0].w[1]
    2468  			   && Q256.w[0] > ten2mk128trunc[0].w[0]))) {
    2469  		    is_inexact_gt_midpoint = 0;
    2470  		    is_inexact_lt_midpoint = 1;
    2471  		    is_midpoint_gt_even = 0;
    2472  		    is_midpoint_lt_even = 0;
    2473  		    // set the inexact flag
    2474  		    tmp_inexact = 1;
    2475  		    // *pfpsf |= INEXACT_EXCEPTION;
    2476  		  } else {	// else the result is exact for the 2nd rounding
    2477  		    if (tmp_inexact) {	// if the previous rounding was inexact
    2478  		      if (is_midpoint_lt_even) {
    2479  			is_inexact_gt_midpoint = 1;
    2480  			is_midpoint_lt_even = 0;
    2481  		      } else if (is_midpoint_gt_even) {
    2482  			is_inexact_lt_midpoint = 1;
    2483  			is_midpoint_gt_even = 0;
    2484  		      } else {
    2485  			;	// no change
    2486  		      }
    2487  		    }
    2488  		  }
    2489  		  // rounding down, unless a midpoint in [ODD, EVEN]
    2490  		} else {	// the result is inexact; f1* <= 1/2
    2491  		  is_inexact_gt_midpoint = 1;
    2492  		  is_inexact_lt_midpoint = 0;
    2493  		  is_midpoint_gt_even = 0;
    2494  		  is_midpoint_lt_even = 0;
    2495  		  // set the inexact flag
    2496  		  tmp_inexact = 1;
    2497  		  // *pfpsf |= INEXACT_EXCEPTION;
    2498  		}
    2499  	      }	// end 'the result is not a midpoint'
    2500  	      // n = C1 * 10^(e2+x1)
    2501  	      C1.w[1] = Q256.w[3];
    2502  	      C1.w[0] = Q256.w[2];
    2503  	      y_exp = y_exp + ((UINT64) (x1 + 1) << 49);
    2504  	    } else {	// C1 < 10^34
    2505  	      // C1.w[1] and C1.w[0] already set
    2506  	      // n = C1 * 10^(e2+x1)
    2507  	      y_exp = y_exp + ((UINT64) x1 << 49);
    2508  	    }
    2509  	    // check for overflow
    2510  	    if (y_exp == EXP_MAX_P1
    2511  		&& (rnd_mode == ROUNDING_TO_NEAREST
    2512  		    || rnd_mode == ROUNDING_TIES_AWAY)) {
    2513  	      res.w[1] = 0x7800000000000000ull | x_sign;	// +/-inf
    2514  	      res.w[0] = 0x0ull;
    2515  	      // set the inexact flag
    2516  	      *pfpsf |= INEXACT_EXCEPTION;
    2517  	      // set the overflow flag
    2518  	      *pfpsf |= OVERFLOW_EXCEPTION;
    2519  	      BID_SWAP128 (res);
    2520  	      BID_RETURN (res);
    2521  	    }	// else no overflow
    2522  	  } else {	// if x_sign != y_sign the result of this subtract. is exact
    2523  	    C1.w[0] = C1.w[0] - R256.w[2];
    2524  	    C1.w[1] = C1.w[1] - R256.w[3];
    2525  	    if (C1.w[0] > tmp64)
    2526  	      C1.w[1]--;	// borrow
    2527  	    if (C1.w[1] >= 0x8000000000000000ull) {	// negative coefficient!
    2528  	      C1.w[0] = ~C1.w[0];
    2529  	      C1.w[0]++;
    2530  	      C1.w[1] = ~C1.w[1];
    2531  	      if (C1.w[0] == 0x0)
    2532  		C1.w[1]++;
    2533  	      tmp_sign = y_sign;
    2534  	      // the result will have the sign of y if last rnd
    2535  	    } else {
    2536  	      tmp_sign = x_sign;
    2537  	    }
    2538  	    // if the difference has P34-1 digits or less, i.e. C1 < 10^33 then
    2539  	    //   redo the calculation with x1=x1-1;
    2540  	    // redo the calculation also if C1 = 10^33 and 
    2541  	    //   (is_inexact_gt_midpoint or is_midpoint_lt_even);
    2542  	    //   (the last part should have really been 
    2543  	    //   (is_inexact_lt_midpoint or is_midpoint_gt_even) from
    2544  	    //    the rounding of C2, but the position flags have been reversed)
    2545  	    // 10^33 = 0x0000314dc6448d93 0x38c15b0a00000000
    2546  	    if ((C1.w[1] < 0x0000314dc6448d93ull || (C1.w[1] == 0x0000314dc6448d93ull && C1.w[0] < 0x38c15b0a00000000ull)) || (C1.w[1] == 0x0000314dc6448d93ull && C1.w[0] == 0x38c15b0a00000000ull && (is_inexact_gt_midpoint || is_midpoint_lt_even))) {	// C1=10^33
    2547  	      x1 = x1 - 1;	// x1 >= 0
    2548  	      if (x1 >= 0) {
    2549  		// clear position flags and tmp_inexact
    2550  		is_midpoint_lt_even = 0;
    2551  		is_midpoint_gt_even = 0;
    2552  		is_inexact_lt_midpoint = 0;
    2553  		is_inexact_gt_midpoint = 0;
    2554  		tmp_inexact = 0;
    2555  		second_pass = 1;
    2556  		goto roundC2;	// else result has less than P34 digits
    2557  	      }
    2558  	    }
    2559  	    // if the coefficient of the result is 10^34 it means that this
    2560  	    // must be the second pass, and we are done 
    2561  	    if (C1.w[1] == 0x0001ed09bead87c0ull && C1.w[0] == 0x378d8e6400000000ull) {	// if  C1 = 10^34
    2562  	      C1.w[1] = 0x0000314dc6448d93ull;	// C1 = 10^33
    2563  	      C1.w[0] = 0x38c15b0a00000000ull;
    2564  	      y_exp = y_exp + ((UINT64) 1 << 49);
    2565  	    }
    2566  	    x_sign = tmp_sign;
    2567  	    if (x1 >= 1)
    2568  	      y_exp = y_exp + ((UINT64) x1 << 49);
    2569  	    // x1 = -1 is possible at the end of a second pass when the 
    2570  	    // first pass started with x1 = 1 
    2571  	  }
    2572  	  C1_hi = C1.w[1];
    2573  	  C1_lo = C1.w[0];
    2574  	  // general correction from RN to RA, RM, RP, RZ; result uses y_exp
    2575  	  if (rnd_mode != ROUNDING_TO_NEAREST) {
    2576  	    if ((!x_sign
    2577  		 && ((rnd_mode == ROUNDING_UP && is_inexact_lt_midpoint)
    2578  		     ||
    2579  		     ((rnd_mode == ROUNDING_TIES_AWAY
    2580  		       || rnd_mode == ROUNDING_UP)
    2581  		      && is_midpoint_gt_even))) || (x_sign
    2582  						    &&
    2583  						    ((rnd_mode ==
    2584  						      ROUNDING_DOWN
    2585  						      &&
    2586  						      is_inexact_lt_midpoint)
    2587  						     ||
    2588  						     ((rnd_mode ==
    2589  						       ROUNDING_TIES_AWAY
    2590  						       || rnd_mode ==
    2591  						       ROUNDING_DOWN)
    2592  						      &&
    2593  						      is_midpoint_gt_even))))
    2594  	    {
    2595  	      // C1 = C1 + 1
    2596  	      C1_lo = C1_lo + 1;
    2597  	      if (C1_lo == 0) {	// rounding overflow in the low 64 bits
    2598  		C1_hi = C1_hi + 1;
    2599  	      }
    2600  	      if (C1_hi == 0x0001ed09bead87c0ull
    2601  		  && C1_lo == 0x378d8e6400000000ull) {
    2602  		// C1 = 10^34 => rounding overflow
    2603  		C1_hi = 0x0000314dc6448d93ull;
    2604  		C1_lo = 0x38c15b0a00000000ull;	// 10^33
    2605  		y_exp = y_exp + EXP_P1;
    2606  	      }
    2607  	    } else if ((is_midpoint_lt_even || is_inexact_gt_midpoint)
    2608  		       &&
    2609  		       ((x_sign
    2610  			 && (rnd_mode == ROUNDING_UP
    2611  			     || rnd_mode == ROUNDING_TO_ZERO))
    2612  			|| (!x_sign
    2613  			    && (rnd_mode == ROUNDING_DOWN
    2614  				|| rnd_mode == ROUNDING_TO_ZERO)))) {
    2615  	      // C1 = C1 - 1
    2616  	      C1_lo = C1_lo - 1;
    2617  	      if (C1_lo == 0xffffffffffffffffull)
    2618  		C1_hi--;
    2619  	      // check if we crossed into the lower decade
    2620  	      if (C1_hi == 0x0000314dc6448d93ull && C1_lo == 0x38c15b09ffffffffull) {	// 10^33 - 1
    2621  		C1_hi = 0x0001ed09bead87c0ull;	// 10^34 - 1
    2622  		C1_lo = 0x378d8e63ffffffffull;
    2623  		y_exp = y_exp - EXP_P1;
    2624  		// no underflow, because delta + q2 >= P34 + 1
    2625  	      }
    2626  	    } else {
    2627  	      ;	// exact, the result is already correct
    2628  	    }
    2629  	    // in all cases check for overflow (RN and RA solved already)
    2630  	    if (y_exp == EXP_MAX_P1) {	// overflow
    2631  	      if ((rnd_mode == ROUNDING_DOWN && x_sign) ||	// RM and res < 0
    2632  		  (rnd_mode == ROUNDING_UP && !x_sign)) {	// RP and res > 0
    2633  		C1_hi = 0x7800000000000000ull;	// +inf
    2634  		C1_lo = 0x0ull;
    2635  	      } else {	// RM and res > 0, RP and res < 0, or RZ
    2636  		C1_hi = 0x5fffed09bead87c0ull;
    2637  		C1_lo = 0x378d8e63ffffffffull;
    2638  	      }
    2639  	      y_exp = 0;	// x_sign is preserved
    2640  	      // set the inexact flag (in case the exact addition was exact)
    2641  	      *pfpsf |= INEXACT_EXCEPTION;
    2642  	      // set the overflow flag
    2643  	      *pfpsf |= OVERFLOW_EXCEPTION;
    2644  	    }
    2645  	  }
    2646  	  // assemble the result
    2647  	  res.w[1] = x_sign | y_exp | C1_hi;
    2648  	  res.w[0] = C1_lo;
    2649  	  if (tmp_inexact)
    2650  	    *pfpsf |= INEXACT_EXCEPTION;
    2651  	}
    2652        } else {	// if (-P34 + 1 <= delta <= -1) <=> 1 <= -delta <= P34 - 1
    2653  	// NOTE: the following, up to "} else { // if x_sign != y_sign 
    2654  	// the result is exact" is identical to "else if (delta == P34 - q2) {"
    2655  	// from above; also, the code is not symmetric: a+b and b+a may take
    2656  	// different paths (need to unify eventually!) 
    2657  	// calculate C' = C2 + C1 * 10^(e1-e2) directly; the result may be 
    2658  	// inexact if it requires P34 + 1 decimal digits; in either case the 
    2659  	// 'cutoff' point for addition is at the position of the lsb of C2
    2660  	// The coefficient of the result is C1 * 10^(e1-e2) + C2 and the
    2661  	// exponent is e2; either C1 or 10^(e1-e2) may not fit in 64 bits,
    2662  	// but their product fits with certainty in 128 bits (actually in 113)
    2663  	// Note that 0 <= e1 - e2 <= P34 - 2
    2664  	//   -P34 + 1 <= delta <= -1 <=> -P34 + 1 <= delta <= -1 <=>
    2665  	//   -P34 + 1 <= q1 + e1 - q2 - e2 <= -1 <=>
    2666  	//   q2 - q1 - P34 + 1 <= e1 - e2 <= q2 - q1 - 1 <=>
    2667  	//   1 - P34 - P34 + 1 <= e1-e2 <= P34 - 1 - 1 => 0 <= e1-e2 <= P34 - 2
    2668  	scale = delta - q1 + q2;	// scale = (int)(e1 >> 49) - (int)(e2 >> 49)
    2669  	if (scale >= 20) {	// 10^(e1-e2) does not fit in 64 bits, but C1 does
    2670  	  __mul_128x64_to_128 (C1, C1_lo, ten2k128[scale - 20]);
    2671  	} else if (scale >= 1) {
    2672  	  // if 1 <= scale <= 19 then 10^(e1-e2) fits in 64 bits
    2673  	  if (q1 <= 19) {	// C1 fits in 64 bits
    2674  	    __mul_64x64_to_128MACH (C1, C1_lo, ten2k64[scale]);
    2675  	  } else {	// q1 >= 20
    2676  	    C1.w[1] = C1_hi;
    2677  	    C1.w[0] = C1_lo;
    2678  	    __mul_128x64_to_128 (C1, ten2k64[scale], C1);
    2679  	  }
    2680  	} else {	// if (scale == 0) C1 is unchanged
    2681  	  C1.w[1] = C1_hi;
    2682  	  C1.w[0] = C1_lo;	// only the low part is necessary
    2683  	}
    2684  	C1_hi = C1.w[1];
    2685  	C1_lo = C1.w[0];
    2686  	// now add C2
    2687  	if (x_sign == y_sign) {
    2688  	  // the result can overflow!
    2689  	  C1_lo = C1_lo + C2_lo;
    2690  	  C1_hi = C1_hi + C2_hi;
    2691  	  if (C1_lo < C1.w[0])
    2692  	    C1_hi++;
    2693  	  // test for overflow, possible only when C1 >= 10^34
    2694  	  if (C1_hi > 0x0001ed09bead87c0ull || (C1_hi == 0x0001ed09bead87c0ull && C1_lo >= 0x378d8e6400000000ull)) {	// C1 >= 10^34
    2695  	    // in this case q = P34 + 1 and x = q - P34 = 1, so multiply 
    2696  	    // C'' = C'+ 5 = C1 + 5 by k1 ~ 10^(-1) calculated for P34 + 1 
    2697  	    // decimal digits
    2698  	    // Calculate C'' = C' + 1/2 * 10^x
    2699  	    if (C1_lo >= 0xfffffffffffffffbull) {	// low half add has carry
    2700  	      C1_lo = C1_lo + 5;
    2701  	      C1_hi = C1_hi + 1;
    2702  	    } else {
    2703  	      C1_lo = C1_lo + 5;
    2704  	    }
    2705  	    // the approximation of 10^(-1) was rounded up to 118 bits
    2706  	    // 10^(-1) =~ 33333333333333333333333333333400 * 2^-129
    2707  	    // 10^(-1) =~ 19999999999999999999999999999a00 * 2^-128
    2708  	    C1.w[1] = C1_hi;
    2709  	    C1.w[0] = C1_lo;	// C''
    2710  	    ten2m1.w[1] = 0x1999999999999999ull;
    2711  	    ten2m1.w[0] = 0x9999999999999a00ull;
    2712  	    __mul_128x128_to_256 (P256, C1, ten2m1);	// P256 = C*, f*
    2713  	    // C* is actually floor(C*) in this case
    2714  	    // the top Ex = 128 bits of 10^(-1) are 
    2715  	    // T* = 0x00199999999999999999999999999999
    2716  	    // if (0 < f* < 10^(-x)) then
    2717  	    //   if floor(C*) is even then C = floor(C*) - logical right 
    2718  	    //       shift; C has p decimal digits, correct by Prop. 1)
    2719  	    //   else if floor(C*) is odd C = floor(C*) - 1 (logical right
    2720  	    //       shift; C has p decimal digits, correct by Pr. 1)
    2721  	    // else
    2722  	    //   C = floor(C*) (logical right shift; C has p decimal digits,
    2723  	    //       correct by Property 1)
    2724  	    // n = C * 10^(e2+x)
    2725  	    if ((P256.w[1] || P256.w[0])
    2726  		&& (P256.w[1] < 0x1999999999999999ull
    2727  		    || (P256.w[1] == 0x1999999999999999ull
    2728  			&& P256.w[0] <= 0x9999999999999999ull))) {
    2729  	      // the result is a midpoint
    2730  	      if (P256.w[2] & 0x01) {
    2731  		is_midpoint_gt_even = 1;
    2732  		// if floor(C*) is odd C = floor(C*) - 1; the result is not 0
    2733  		P256.w[2]--;
    2734  		if (P256.w[2] == 0xffffffffffffffffull)
    2735  		  P256.w[3]--;
    2736  	      } else {
    2737  		is_midpoint_lt_even = 1;
    2738  	      }
    2739  	    }
    2740  	    // n = Cstar * 10^(e2+1)
    2741  	    y_exp = y_exp + EXP_P1;
    2742  	    // C* != 10^P34 because C* has P34 digits
    2743  	    // check for overflow
    2744  	    if (y_exp == EXP_MAX_P1
    2745  		&& (rnd_mode == ROUNDING_TO_NEAREST
    2746  		    || rnd_mode == ROUNDING_TIES_AWAY)) {
    2747  	      // overflow for RN
    2748  	      res.w[1] = x_sign | 0x7800000000000000ull;	// +/-inf
    2749  	      res.w[0] = 0x0ull;
    2750  	      // set the inexact flag
    2751  	      *pfpsf |= INEXACT_EXCEPTION;
    2752  	      // set the overflow flag
    2753  	      *pfpsf |= OVERFLOW_EXCEPTION;
    2754  	      BID_SWAP128 (res);
    2755  	      BID_RETURN (res);
    2756  	    }
    2757  	    // if (0 < f* - 1/2 < 10^(-x)) then 
    2758  	    //   the result of the addition is exact 
    2759  	    // else 
    2760  	    //   the result of the addition is inexact
    2761  	    if (P256.w[1] > 0x8000000000000000ull || (P256.w[1] == 0x8000000000000000ull && P256.w[0] > 0x0ull)) {	// the result may be exact
    2762  	      tmp64 = P256.w[1] - 0x8000000000000000ull;	// f* - 1/2
    2763  	      if ((tmp64 > 0x1999999999999999ull
    2764  		   || (tmp64 == 0x1999999999999999ull
    2765  		       && P256.w[0] >= 0x9999999999999999ull))) {
    2766  		// set the inexact flag
    2767  		*pfpsf |= INEXACT_EXCEPTION;
    2768  		is_inexact = 1;
    2769  	      }	// else the result is exact
    2770  	    } else {	// the result is inexact
    2771  	      // set the inexact flag
    2772  	      *pfpsf |= INEXACT_EXCEPTION;
    2773  	      is_inexact = 1;
    2774  	    }
    2775  	    C1_hi = P256.w[3];
    2776  	    C1_lo = P256.w[2];
    2777  	    if (!is_midpoint_gt_even && !is_midpoint_lt_even) {
    2778  	      is_inexact_lt_midpoint = is_inexact
    2779  		&& (P256.w[1] & 0x8000000000000000ull);
    2780  	      is_inexact_gt_midpoint = is_inexact
    2781  		&& !(P256.w[1] & 0x8000000000000000ull);
    2782  	    }
    2783  	    // general correction from RN to RA, RM, RP, RZ; result uses y_exp
    2784  	    if (rnd_mode != ROUNDING_TO_NEAREST) {
    2785  	      if ((!x_sign
    2786  		   && ((rnd_mode == ROUNDING_UP
    2787  			&& is_inexact_lt_midpoint)
    2788  		       || ((rnd_mode == ROUNDING_TIES_AWAY
    2789  			    || rnd_mode == ROUNDING_UP)
    2790  			   && is_midpoint_gt_even))) || (x_sign
    2791  							 &&
    2792  							 ((rnd_mode ==
    2793  							   ROUNDING_DOWN
    2794  							   &&
    2795  							   is_inexact_lt_midpoint)
    2796  							  ||
    2797  							  ((rnd_mode ==
    2798  							    ROUNDING_TIES_AWAY
    2799  							    || rnd_mode
    2800  							    ==
    2801  							    ROUNDING_DOWN)
    2802  							   &&
    2803  							   is_midpoint_gt_even))))
    2804  	      {
    2805  		// C1 = C1 + 1
    2806  		C1_lo = C1_lo + 1;
    2807  		if (C1_lo == 0) {	// rounding overflow in the low 64 bits
    2808  		  C1_hi = C1_hi + 1;
    2809  		}
    2810  		if (C1_hi == 0x0001ed09bead87c0ull
    2811  		    && C1_lo == 0x378d8e6400000000ull) {
    2812  		  // C1 = 10^34 => rounding overflow
    2813  		  C1_hi = 0x0000314dc6448d93ull;
    2814  		  C1_lo = 0x38c15b0a00000000ull;	// 10^33
    2815  		  y_exp = y_exp + EXP_P1;
    2816  		}
    2817  	      } else
    2818  		if ((is_midpoint_lt_even || is_inexact_gt_midpoint) &&
    2819  		    ((x_sign && (rnd_mode == ROUNDING_UP ||
    2820  				 rnd_mode == ROUNDING_TO_ZERO)) ||
    2821  		     (!x_sign && (rnd_mode == ROUNDING_DOWN ||
    2822  				  rnd_mode == ROUNDING_TO_ZERO)))) {
    2823  		// C1 = C1 - 1
    2824  		C1_lo = C1_lo - 1;
    2825  		if (C1_lo == 0xffffffffffffffffull)
    2826  		  C1_hi--;
    2827  		// check if we crossed into the lower decade
    2828  		if (C1_hi == 0x0000314dc6448d93ull && C1_lo == 0x38c15b09ffffffffull) {	// 10^33 - 1
    2829  		  C1_hi = 0x0001ed09bead87c0ull;	// 10^34 - 1
    2830  		  C1_lo = 0x378d8e63ffffffffull;
    2831  		  y_exp = y_exp - EXP_P1;
    2832  		  // no underflow, because delta + q2 >= P34 + 1
    2833  		}
    2834  	      } else {
    2835  		;	// exact, the result is already correct
    2836  	      }
    2837  	      // in all cases check for overflow (RN and RA solved already)
    2838  	      if (y_exp == EXP_MAX_P1) {	// overflow
    2839  		if ((rnd_mode == ROUNDING_DOWN && x_sign) ||	// RM and res < 0
    2840  		    (rnd_mode == ROUNDING_UP && !x_sign)) {	// RP and res > 0
    2841  		  C1_hi = 0x7800000000000000ull;	// +inf
    2842  		  C1_lo = 0x0ull;
    2843  		} else {	// RM and res > 0, RP and res < 0, or RZ
    2844  		  C1_hi = 0x5fffed09bead87c0ull;
    2845  		  C1_lo = 0x378d8e63ffffffffull;
    2846  		}
    2847  		y_exp = 0;	// x_sign is preserved
    2848  		// set the inexact flag (in case the exact addition was exact)
    2849  		*pfpsf |= INEXACT_EXCEPTION;
    2850  		// set the overflow flag
    2851  		*pfpsf |= OVERFLOW_EXCEPTION;
    2852  	      }
    2853  	    }
    2854  	  }	// else if (C1 < 10^34) then C1 is the coeff.; the result is exact
    2855  	  // assemble the result
    2856  	  res.w[1] = x_sign | y_exp | C1_hi;
    2857  	  res.w[0] = C1_lo;
    2858  	} else {	// if x_sign != y_sign the result is exact
    2859  	  C1_lo = C2_lo - C1_lo;
    2860  	  C1_hi = C2_hi - C1_hi;
    2861  	  if (C1_lo > C2_lo)
    2862  	    C1_hi--;
    2863  	  if (C1_hi >= 0x8000000000000000ull) {	// negative coefficient!
    2864  	    C1_lo = ~C1_lo;
    2865  	    C1_lo++;
    2866  	    C1_hi = ~C1_hi;
    2867  	    if (C1_lo == 0x0)
    2868  	      C1_hi++;
    2869  	    x_sign = y_sign;	// the result will have the sign of y
    2870  	  }
    2871  	  // the result can be zero, but it cannot overflow
    2872  	  if (C1_lo == 0 && C1_hi == 0) {
    2873  	    // assemble the result
    2874  	    if (x_exp < y_exp)
    2875  	      res.w[1] = x_exp;
    2876  	    else
    2877  	      res.w[1] = y_exp;
    2878  	    res.w[0] = 0;
    2879  	    if (rnd_mode == ROUNDING_DOWN) {
    2880  	      res.w[1] |= 0x8000000000000000ull;
    2881  	    }
    2882  	    BID_SWAP128 (res);
    2883  	    BID_RETURN (res);
    2884  	  }
    2885  	  // assemble the result
    2886  	  res.w[1] = y_sign | y_exp | C1_hi;
    2887  	  res.w[0] = C1_lo;
    2888  	}
    2889        }
    2890      }
    2891      BID_SWAP128 (res);
    2892      BID_RETURN (res)
    2893    }
    2894  }
    2895  
    2896  
    2897  
    2898  // bid128_sub stands for bid128qq_sub
    2899  
    2900  /*****************************************************************************
    2901   *  BID128 sub
    2902   ****************************************************************************/
    2903  
    2904  #if DECIMAL_CALL_BY_REFERENCE
    2905  void
    2906  bid128_sub (UINT128 * pres, UINT128 * px, UINT128 * py
    2907  	    _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
    2908  	    _EXC_INFO_PARAM) {
    2909    UINT128 x = *px, y = *py;
    2910  #if !DECIMAL_GLOBAL_ROUNDING
    2911    unsigned int rnd_mode = *prnd_mode;
    2912  #endif
    2913  #else
    2914  UINT128
    2915  bid128_sub (UINT128 x, UINT128 y
    2916  	    _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
    2917  	    _EXC_INFO_PARAM) {
    2918  #endif
    2919  
    2920    UINT128 res;
    2921    UINT64 y_sign;
    2922  
    2923    if ((y.w[HIGH_128W] & MASK_NAN) != MASK_NAN) {	// y is not NAN
    2924      // change its sign
    2925      y_sign = y.w[HIGH_128W] & MASK_SIGN;	// 0 for positive, MASK_SIGN for negative
    2926      if (y_sign)
    2927        y.w[HIGH_128W] = y.w[HIGH_128W] & 0x7fffffffffffffffull;
    2928      else
    2929        y.w[HIGH_128W] = y.w[HIGH_128W] | 0x8000000000000000ull;
    2930    }
    2931  #if DECIMAL_CALL_BY_REFERENCE
    2932    bid128_add (&res, &x, &y
    2933  	      _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
    2934  	      _EXC_INFO_ARG);
    2935  #else
    2936    res = bid128_add (x, y
    2937  		    _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
    2938  		    _EXC_INFO_ARG);
    2939  #endif
    2940    BID_RETURN (res);
    2941  }