// Copyright 2018 Ulf Adams
//
// The contents of this file may be used under the terms of the Apache License,
// Version 2.0.
//
//    (See accompanying file LICENSE-Apache or copy at
//     http://www.apache.org/licenses/LICENSE-2.0)
//
// Alternatively, the contents of this file may be used under the terms of
// the Boost Software License, Version 1.0.
//    (See accompanying file LICENSE-Boost or copy at
//     https://www.boost.org/LICENSE_1_0.txt)
//
// Unless required by applicable law or agreed to in writing, this software
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.
#ifndef RYU_D2S_INTRINSICS_H
#define RYU_D2S_INTRINSICS_H

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

// Defines RYU_32_BIT_PLATFORM if applicable.
#include "ryu/common.h"

// ABSL avoids uint128_t on Win32 even if __SIZEOF_INT128__ is defined.
// Let's do the same for now.
#if defined(__SIZEOF_INT128__) && !defined(_MSC_VER) && !defined(RYU_ONLY_64_BIT_OPS)
#define HAS_UINT128
typedef __uint128_t uint128_t;
#elif defined(_MSC_VER) && !defined(RYU_ONLY_64_BIT_OPS) && defined(_M_X64)
#define HAS_64_BIT_INTRINSICS
#endif

#if defined(HAS_64_BIT_INTRINSICS)

#include <intrin.h>

static inline uint64_t umul128(const uint64_t a, const uint64_t b, uint64_t* const productHi) {
  return _umul128(a, b, productHi);
}

// Returns the lower 64 bits of (hi*2^64 + lo) >> dist, with 0 < dist < 64.
static inline uint64_t shiftright128(const uint64_t lo, const uint64_t hi, const uint32_t dist) {
  // For the __shiftright128 intrinsic, the shift value is always
  // modulo 64.
  // In the current implementation of the double-precision version
  // of Ryu, the shift value is always < 64. (In the case
  // RYU_OPTIMIZE_SIZE == 0, the shift value is in the range [49, 58].
  // Otherwise in the range [2, 59].)
  // However, this function is now also called by s2d, which requires supporting
  // the larger shift range (TODO: what is the actual range?).
  // Check this here in case a future change requires larger shift
  // values. In this case this function needs to be adjusted.
  assert(dist < 64);
  return __shiftright128(lo, hi, (unsigned char) dist);
}

#else // defined(HAS_64_BIT_INTRINSICS)

static inline uint64_t umul128(const uint64_t a, const uint64_t b, uint64_t* const productHi) {
  // The casts here help MSVC to avoid calls to the __allmul library function.
  const uint32_t aLo = (uint32_t)a;
  const uint32_t aHi = (uint32_t)(a >> 32);
  const uint32_t bLo = (uint32_t)b;
  const uint32_t bHi = (uint32_t)(b >> 32);

  const uint64_t b00 = (uint64_t)aLo * bLo;
  const uint64_t b01 = (uint64_t)aLo * bHi;
  const uint64_t b10 = (uint64_t)aHi * bLo;
  const uint64_t b11 = (uint64_t)aHi * bHi;

  const uint32_t b00Lo = (uint32_t)b00;
  const uint32_t b00Hi = (uint32_t)(b00 >> 32);

  const uint64_t mid1 = b10 + b00Hi;
  const uint32_t mid1Lo = (uint32_t)(mid1);
  const uint32_t mid1Hi = (uint32_t)(mid1 >> 32);

  const uint64_t mid2 = b01 + mid1Lo;
  const uint32_t mid2Lo = (uint32_t)(mid2);
  const uint32_t mid2Hi = (uint32_t)(mid2 >> 32);

  const uint64_t pHi = b11 + mid1Hi + mid2Hi;
  const uint64_t pLo = ((uint64_t)mid2Lo << 32) | b00Lo;

  *productHi = pHi;
  return pLo;
}
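// The code above implements the schoolbook identity
//   a * b = b11 * 2^64 + (b01 + b10) * 2^32 + b00,
// with mid1 and mid2 accumulating the shifted partial products so that carries
// out of the low 64 bits end up in mid1Hi/mid2Hi and finally in pHi.
// Worked example: a = 2^32 + 1, b = 2^32 + 2 gives b00 = 2, b01 = 1, b10 = 2,
// b11 = 1, hence pLo = 3 * 2^32 + 2 and pHi = 1, i.e. a * b = 2^64 + 3 * 2^32 + 2.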

static inline uint64_t shiftright128(const uint64_t lo, const uint64_t hi, const uint32_t dist) {
  // We don't need to handle the case dist >= 64 here (see above).
  assert(dist < 64);
  assert(dist > 0);
  return (hi << (64 - dist)) | (lo >> dist);
}

#endif // defined(HAS_64_BIT_INTRINSICS)

#if defined(RYU_32_BIT_PLATFORM)

// Returns the high 64 bits of the 128-bit product of a and b.
static inline uint64_t umulh(const uint64_t a, const uint64_t b) {
  // Reuse the umul128 implementation.
  // Optimizers will likely eliminate the instructions used to compute the
  // low part of the product.
  uint64_t hi;
  umul128(a, b, &hi);
  return hi;
}

// On 32-bit platforms, compilers typically generate calls to library
// functions for 64-bit divisions, even if the divisor is a constant.
//
// E.g.:
// https://bugs.llvm.org/show_bug.cgi?id=37932
// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=17958
// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=37443
//
// The functions here perform division-by-constant using multiplications
// in the same way as 64-bit compilers would do.
//
// NB:
// The multipliers and shift values are the ones generated by clang x64
// for expressions like x/5, x/10, etc.
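//
// For example, 0xCCCCCCCCCCCCCCCD == ceil(2^66 / 5) == (2^66 + 1) / 5, so for
// any 64-bit x
//   umulh(x, 0xCCCCCCCCCCCCCCCD) >> 2 == floor(x * (2^66 + 1) / (5 * 2^66))
//                                     == floor(x / 5 + x / (5 * 2^66)) == x / 5,
// since the error term x / (5 * 2^66) stays below 1/5. The same constant with a
// shift by 3 divides by 10; the remaining constants follow the same pattern,
// with a pre-shift of x that splits off the power-of-two part of the divisor
// (e.g. 100 == 4 * 25, 10^9 == 2^9 * 1953125).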

static inline uint64_t div5(const uint64_t x) {
  return umulh(x, 0xCCCCCCCCCCCCCCCDu) >> 2;
}

static inline uint64_t div10(const uint64_t x) {
  return umulh(x, 0xCCCCCCCCCCCCCCCDu) >> 3;
}

static inline uint64_t div100(const uint64_t x) {
  return umulh(x >> 2, 0x28F5C28F5C28F5C3u) >> 2;
}

static inline uint64_t div1e8(const uint64_t x) {
  return umulh(x, 0xABCC77118461CEFDu) >> 26;
}

static inline uint64_t div1e9(const uint64_t x) {
  return umulh(x >> 9, 0x44B82FA09B5A53u) >> 11;
}

static inline uint32_t mod1e9(const uint64_t x) {
  // Avoid 64-bit math as much as possible.
  // Returning (uint32_t) (x - 1000000000 * div1e9(x)) would
  // perform 32x64-bit multiplication and 64-bit subtraction.
  // x and 1000000000 * div1e9(x) are guaranteed to differ by
  // less than 10^9, so their highest 32 bits must be identical,
  // so we can truncate both sides to uint32_t before subtracting.
  // We can also simplify (uint32_t) (1000000000 * div1e9(x)).
  // We can truncate before multiplying instead of after, as multiplying
  // the highest 32 bits of div1e9(x) can't affect the lowest 32 bits.
  return ((uint32_t) x) - 1000000000 * ((uint32_t) div1e9(x));
}

#else // defined(RYU_32_BIT_PLATFORM)

static inline uint64_t div5(const uint64_t x) {
  return x / 5;
}

static inline uint64_t div10(const uint64_t x) {
  return x / 10;
}

static inline uint64_t div100(const uint64_t x) {
  return x / 100;
}

static inline uint64_t div1e8(const uint64_t x) {
  return x / 100000000;
}

static inline uint64_t div1e9(const uint64_t x) {
  return x / 1000000000;
}

static inline uint32_t mod1e9(const uint64_t x) {
  return (uint32_t) (x - 1000000000 * div1e9(x));
}

#endif // defined(RYU_32_BIT_PLATFORM)
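
// Illustrative sketch only (not part of the original header): a minimal example
// of how div1e9 and mod1e9 split a 64-bit value into its high part and its low
// nine decimal digits, e.g. 12345678987654321 == 12345678 * 10^9 + 987654321.
// Guarded by a hypothetical RYU_D2S_INTRINSICS_EXAMPLE macro so it is not
// compiled by default.
#if defined(RYU_D2S_INTRINSICS_EXAMPLE)
static inline void example_split1e9(const uint64_t x, uint64_t* const high, uint32_t* const low9) {
  *high = div1e9(x); // x / 10^9
  *low9 = mod1e9(x); // x % 10^9, always fits in 32 bits
}
#endif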

static inline uint32_t pow5Factor(uint64_t value) {
  const uint64_t m_inv_5 = 14757395258967641293u; // 5 * m_inv_5 = 1 (mod 2^64)
  const uint64_t n_div_5 = 3689348814741910323u;  // #{ n | n = 0 (mod 5) } = 2^64 / 5
  uint32_t count = 0;
  for (;;) {
    assert(value != 0);
    value *= m_inv_5;
    if (value > n_div_5)
      break;
    ++count;
  }
  return count;
}
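// The loop above counts how often 5 divides value: multiplying by the modular
// inverse of 5 maps multiples of 5 to values at most n_div_5 and non-multiples
// to larger values. Worked example: pow5Factor(25): 25 * m_inv_5 == 5 (mod 2^64),
// count becomes 1; 5 * m_inv_5 == 1 (mod 2^64), count becomes 2;
// 1 * m_inv_5 == m_inv_5 > n_div_5, so the loop stops and 2 is returned.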

// Returns true if value is divisible by 5^p.
static inline bool multipleOfPowerOf5(const uint64_t value, const uint32_t p) {
  // I tried a case distinction on p, but there was no performance difference.
  return pow5Factor(value) >= p;
}

// Returns true if value is divisible by 2^p.
static inline bool multipleOfPowerOf2(const uint64_t value, const uint32_t p) {
  assert(value != 0);
  assert(p < 64);
  // __builtin_ctzll doesn't appear to be faster here.
  return (value & ((1ull << p) - 1)) == 0;
}
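// Example: multipleOfPowerOf2(40, 3) is true, since 40 == 5 * 2^3 and the low
// three bits of 40 are therefore all zero.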

// We need a 64x128-bit multiplication and a subsequent 128-bit shift.
// Multiplication:
//   The 64-bit factor is variable and passed in, the 128-bit factor comes
//   from a lookup table. We know that the 64-bit factor only has 55
//   significant bits (i.e., the 9 topmost bits are zeros). The 128-bit
//   factor only has 124 significant bits (i.e., the 4 topmost bits are
//   zeros).
// Shift:
//   In principle, the multiplication result requires 55 + 124 = 179 bits to
//   represent. However, we then shift this value to the right by j, which is
//   at least j >= 115, so the result is guaranteed to fit into 179 - 115 = 64
//   bits. This means that we only need the topmost 64 significant bits of
//   the 64x128-bit multiplication.
//
// There are several ways to do this:
// 1. Best case: the compiler exposes a 128-bit type.
//    We perform two 64x64-bit multiplications, add the higher 64 bits of the
//    lower result to the higher result, and shift by j - 64 bits.
//
//    We explicitly cast from 64-bit to 128-bit, so the compiler can tell
//    that these are only 64-bit inputs, and can map these to the best
//    possible sequence of assembly instructions.
//    x64 machines happen to have matching assembly instructions for
//    64x64-bit multiplications and 128-bit shifts.
//
// 2. Second best case: the compiler exposes intrinsics for the x64 assembly
//    instructions mentioned in 1.
//
// 3. We only have 64x64 bit instructions that return the lower 64 bits of
//    the result, i.e., we have to use plain C.
//    Our inputs are less than the full width, so we have three options:
//    a. Ignore this fact and just implement the intrinsics manually.
//    b. Split both into 31-bit pieces, which guarantees no internal overflow,
//       but requires extra work upfront (unless we change the lookup table).
//    c. Split only the first factor into 31-bit pieces, which also guarantees
//       no internal overflow, but requires extra work since the intermediate
//       results are not perfectly aligned.
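//
// In all three variants below, mulShift64(m, mul, j) computes the lower 64 bits
// of (m * (mul[1] * 2^64 + mul[0])) >> j; by the argument above, the true value
// already fits into 64 bits for the shifts used here, so nothing is lost.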
#if defined(HAS_UINT128)

// Best case: use 128-bit type.
static inline uint64_t mulShift64(const uint64_t m, const uint64_t* const mul, const int32_t j) {
  const uint128_t b0 = ((uint128_t) m) * mul[0];
  const uint128_t b2 = ((uint128_t) m) * mul[1];
  return (uint64_t) (((b0 >> 64) + b2) >> (j - 64));
}

static inline uint64_t mulShiftAll64(const uint64_t m, const uint64_t* const mul, const int32_t j,
  uint64_t* const vp, uint64_t* const vm, const uint32_t mmShift) {
//  m <<= 2;
//  uint128_t b0 = ((uint128_t) m) * mul[0]; // 0
//  uint128_t b2 = ((uint128_t) m) * mul[1]; // 64
//
//  uint128_t hi = (b0 >> 64) + b2;
//  uint128_t lo = b0 & 0xffffffffffffffffull;
//  uint128_t factor = (((uint128_t) mul[1]) << 64) + mul[0];
//  uint128_t vpLo = lo + (factor << 1);
//  *vp = (uint64_t) ((hi + (vpLo >> 64)) >> (j - 64));
//  uint128_t vmLo = lo - (factor << mmShift);
//  *vm = (uint64_t) ((hi + (vmLo >> 64) - (((uint128_t) 1ull) << 64)) >> (j - 64));
//  return (uint64_t) (hi >> (j - 64));
  *vp = mulShift64(4 * m + 2, mul, j);
  *vm = mulShift64(4 * m - 1 - mmShift, mul, j);
  return mulShift64(4 * m, mul, j);
}
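// The three calls compute the scaled midpoint (4 * m) and the upper and lower
// bounds (4 * m + 2 and 4 * m - 1 - mmShift) of the interval that the
// shortest-representation search in d2s operates on.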

#elif defined(HAS_64_BIT_INTRINSICS)

static inline uint64_t mulShift64(const uint64_t m, const uint64_t* const mul, const int32_t j) {
  // m is maximum 55 bits
  uint64_t high1;                                   // 128
  const uint64_t low1 = umul128(m, mul[1], &high1); // 64
  uint64_t high0;                                   // 64
  umul128(m, mul[0], &high0);                       // 0
  const uint64_t sum = high0 + low1;
  if (sum < high0) {
    ++high1; // overflow into high1
  }
  return shiftright128(sum, high1, j - 64);
}

static inline uint64_t mulShiftAll64(const uint64_t m, const uint64_t* const mul, const int32_t j,
  uint64_t* const vp, uint64_t* const vm, const uint32_t mmShift) {
  *vp = mulShift64(4 * m + 2, mul, j);
  *vm = mulShift64(4 * m - 1 - mmShift, mul, j);
  return mulShift64(4 * m, mul, j);
}

#else // !defined(HAS_UINT128) && !defined(HAS_64_BIT_INTRINSICS)

static inline uint64_t mulShift64(const uint64_t m, const uint64_t* const mul, const int32_t j) {
  // m is maximum 55 bits
  uint64_t high1;                                   // 128
  const uint64_t low1 = umul128(m, mul[1], &high1); // 64
  uint64_t high0;                                   // 64
  umul128(m, mul[0], &high0);                       // 0
  const uint64_t sum = high0 + low1;
  if (sum < high0) {
    ++high1; // overflow into high1
  }
  return shiftright128(sum, high1, j - 64);
}

// This is faster if we don't have a 64x64->128-bit multiplication.
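// m is doubled once up front, so the partial products below represent 2 * m
// times the 128-bit multiplier; shifting by (j - 64 - 1) instead of (j - 64)
// then yields the same results as mulShift64(4 * m, mul, j),
// mulShift64(4 * m + 2, mul, j) (by adding the multiplier once) and
// mulShift64(4 * m - 1 - mmShift, mul, j) (via the mmShift branch), while
// calling umul128 only twice instead of six times.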
static inline uint64_t mulShiftAll64(uint64_t m, const uint64_t* const mul, const int32_t j,
  uint64_t* const vp, uint64_t* const vm, const uint32_t mmShift) {
  m <<= 1;
  // m is maximum 55 bits
  uint64_t tmp;
  const uint64_t lo = umul128(m, mul[0], &tmp);
  uint64_t hi;
  const uint64_t mid = tmp + umul128(m, mul[1], &hi);
  hi += mid < tmp; // overflow into hi

  const uint64_t lo2 = lo + mul[0];
  const uint64_t mid2 = mid + mul[1] + (lo2 < lo);
  const uint64_t hi2 = hi + (mid2 < mid);
  *vp = shiftright128(mid2, hi2, (uint32_t) (j - 64 - 1));

  if (mmShift == 1) {
    const uint64_t lo3 = lo - mul[0];
    const uint64_t mid3 = mid - mul[1] - (lo3 > lo);
    const uint64_t hi3 = hi - (mid3 > mid);
    *vm = shiftright128(mid3, hi3, (uint32_t) (j - 64 - 1));
  } else {
    const uint64_t lo3 = lo + lo;
    const uint64_t mid3 = mid + mid + (lo3 < lo);
    const uint64_t hi3 = hi + hi + (mid3 < mid);
    const uint64_t lo4 = lo3 - mul[0];
    const uint64_t mid4 = mid3 - mul[1] - (lo4 > lo3);
    const uint64_t hi4 = hi3 - (mid4 > mid3);
    *vm = shiftright128(mid4, hi4, (uint32_t) (j - 64));
  }

  return shiftright128(mid, hi, (uint32_t) (j - 64 - 1));
}

#endif // HAS_64_BIT_INTRINSICS

#endif // RYU_D2S_INTRINSICS_H