1  /*
       2   * Copyright (c) 2008-2020 Stefan Krah. All rights reserved.
       3   *
       4   * Redistribution and use in source and binary forms, with or without
       5   * modification, are permitted provided that the following conditions
       6   * are met:
       7   *
       8   * 1. Redistributions of source code must retain the above copyright
       9   *    notice, this list of conditions and the following disclaimer.
      10   *
      11   * 2. Redistributions in binary form must reproduce the above copyright
      12   *    notice, this list of conditions and the following disclaimer in the
      13   *    documentation and/or other materials provided with the distribution.
      14   *
      15   * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS "AS IS" AND
      16   * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
      17   * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
      18   * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
      19   * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
      20   * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
      21   * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
      22   * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
      23   * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
      24   * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
      25   * SUCH DAMAGE.
      26   */
      27  
      28  
      29  #ifndef LIBMPDEC_UMODARITH_H_
      30  #define LIBMPDEC_UMODARITH_H_
      31  
      32  
      33  #include "mpdecimal.h"
      34  
      35  #include "constants.h"
      36  #include "typearith.h"
      37  
      38  
      39  /* Bignum: Low level routines for unsigned modular arithmetic. These are
      40     used in the fast convolution functions for very large coefficients. */
      41  
      42  
      43  /**************************************************************************/
      44  /*                        ANSI modular arithmetic                         */
      45  /**************************************************************************/
      46  
      47  
      48  /*
      49   * Restrictions: a < m and b < m
      50   * ACL2 proof: umodarith.lisp: addmod-correct
      51   */
      52  static inline mpd_uint_t
      53  addmod(mpd_uint_t a, mpd_uint_t b, mpd_uint_t m)
      54  {
      55      mpd_uint_t s;
      56  
      57      s = a + b;
      58      s = (s < a) ? s - m : s;
      59      s = (s >= m) ? s - m : s;
      60  
      61      return s;
      62  }
      63  
      64  /*
      65   * Restrictions: a < m and b < m
      66   * ACL2 proof: umodarith.lisp: submod-2-correct
      67   */
      68  static inline mpd_uint_t
      69  submod(mpd_uint_t a, mpd_uint_t b, mpd_uint_t m)
      70  {
      71      mpd_uint_t d;
      72  
      73      d = a - b;
      74      d = (a < b) ? d + m : d;
      75  
      76      return d;
      77  }
      78  
      79  /*
      80   * Restrictions: a < 2m and b < 2m
      81   * ACL2 proof: umodarith.lisp: section ext-submod
      82   */
      83  static inline mpd_uint_t
      84  ext_submod(mpd_uint_t a, mpd_uint_t b, mpd_uint_t m)
      85  {
      86      mpd_uint_t d;
      87  
      88      a = (a >= m) ? a - m : a;
      89      b = (b >= m) ? b - m : b;
      90  
      91      d = a - b;
      92      d = (a < b) ? d + m : d;
      93  
      94      return d;
      95  }
      96  
      97  /*
      98   * Reduce double word modulo m.
      99   * Restrictions: m != 0
     100   * ACL2 proof: umodarith.lisp: section dw-reduce
     101   */
     102  static inline mpd_uint_t
     103  dw_reduce(mpd_uint_t hi, mpd_uint_t lo, mpd_uint_t m)
     104  {
     105      mpd_uint_t r1, r2, w;
     106  
     107      _mpd_div_word(&w, &r1, hi, m);
     108      _mpd_div_words(&w, &r2, r1, lo, m);
     109  
     110      return r2;
     111  }
     112  
     113  /*
     114   * Subtract double word from a.
     115   * Restrictions: a < m
     116   * ACL2 proof: umodarith.lisp: section dw-submod
     117   */
     118  static inline mpd_uint_t
     119  dw_submod(mpd_uint_t a, mpd_uint_t hi, mpd_uint_t lo, mpd_uint_t m)
     120  {
     121      mpd_uint_t d, r;
     122  
     123      r = dw_reduce(hi, lo, m);
     124      d = a - r;
     125      d = (a < r) ? d + m : d;
     126  
     127      return d;
     128  }
     129  
     130  #ifdef CONFIG_64
     131  
     132  /**************************************************************************/
     133  /*                        64-bit modular arithmetic                       */
     134  /**************************************************************************/
     135  
     136  /*
     137   * A proof of the algorithm is in literature/mulmod-64.txt. An ACL2
     138   * proof is in umodarith.lisp: section "Fast modular reduction".
     139   *
     140   * Algorithm: calculate (a * b) % p:
     141   *
     142   *   a) hi, lo <- a * b       # Calculate a * b.
     143   *
     144   *   b) hi, lo <-  R(hi, lo)  # Reduce modulo p.
     145   *
     146   *   c) Repeat step b) until 0 <= hi * 2**64 + lo < 2*p.
     147   *
     148   *   d) If the result is less than p, return lo. Otherwise return lo - p.
     149   */
     150  
     151  static inline mpd_uint_t
     152  x64_mulmod(mpd_uint_t a, mpd_uint_t b, mpd_uint_t m)
     153  {
     154      mpd_uint_t hi, lo, x, y;
     155  
     156  
     157      _mpd_mul_words(&hi, &lo, a, b);
     158  
     159      if (m & (1ULL<<32)) { /* P1 */
     160  
     161          /* first reduction */
     162          x = y = hi;
     163          hi >>= 32;
     164  
     165          x = lo - x;
     166          if (x > lo) hi--;
     167  
     168          y <<= 32;
     169          lo = y + x;
     170          if (lo < y) hi++;
     171  
     172          /* second reduction */
     173          x = y = hi;
     174          hi >>= 32;
     175  
     176          x = lo - x;
     177          if (x > lo) hi--;
     178  
     179          y <<= 32;
     180          lo = y + x;
     181          if (lo < y) hi++;
     182  
     183          return (hi || lo >= m ? lo - m : lo);
     184      }
     185      else if (m & (1ULL<<34)) { /* P2 */
     186  
     187          /* first reduction */
     188          x = y = hi;
     189          hi >>= 30;
     190  
     191          x = lo - x;
     192          if (x > lo) hi--;
     193  
     194          y <<= 34;
     195          lo = y + x;
     196          if (lo < y) hi++;
     197  
     198          /* second reduction */
     199          x = y = hi;
     200          hi >>= 30;
     201  
     202          x = lo - x;
     203          if (x > lo) hi--;
     204  
     205          y <<= 34;
     206          lo = y + x;
     207          if (lo < y) hi++;
     208  
     209          /* third reduction */
     210          x = y = hi;
     211          hi >>= 30;
     212  
     213          x = lo - x;
     214          if (x > lo) hi--;
     215  
     216          y <<= 34;
     217          lo = y + x;
     218          if (lo < y) hi++;
     219  
     220          return (hi || lo >= m ? lo - m : lo);
     221      }
     222      else { /* P3 */
     223  
     224          /* first reduction */
     225          x = y = hi;
     226          hi >>= 24;
     227  
     228          x = lo - x;
     229          if (x > lo) hi--;
     230  
     231          y <<= 40;
     232          lo = y + x;
     233          if (lo < y) hi++;
     234  
     235          /* second reduction */
     236          x = y = hi;
     237          hi >>= 24;
     238  
     239          x = lo - x;
     240          if (x > lo) hi--;
     241  
     242          y <<= 40;
     243          lo = y + x;
     244          if (lo < y) hi++;
     245  
     246          /* third reduction */
     247          x = y = hi;
     248          hi >>= 24;
     249  
     250          x = lo - x;
     251          if (x > lo) hi--;
     252  
     253          y <<= 40;
     254          lo = y + x;
     255          if (lo < y) hi++;
     256  
     257          return (hi || lo >= m ? lo - m : lo);
     258      }
     259  }
     260  
     261  static inline void
     262  x64_mulmod2c(mpd_uint_t *a, mpd_uint_t *b, mpd_uint_t w, mpd_uint_t m)
     263  {
     264      *a = x64_mulmod(*a, w, m);
     265      *b = x64_mulmod(*b, w, m);
     266  }
     267  
     268  static inline void
     269  x64_mulmod2(mpd_uint_t *a0, mpd_uint_t b0, mpd_uint_t *a1, mpd_uint_t b1,
     270              mpd_uint_t m)
     271  {
     272      *a0 = x64_mulmod(*a0, b0, m);
     273      *a1 = x64_mulmod(*a1, b1, m);
     274  }
     275  
     276  static inline mpd_uint_t
     277  x64_powmod(mpd_uint_t base, mpd_uint_t exp, mpd_uint_t umod)
     278  {
     279      mpd_uint_t r = 1;
     280  
     281      while (exp > 0) {
     282          if (exp & 1)
     283              r = x64_mulmod(r, base, umod);
     284          base = x64_mulmod(base, base, umod);
     285          exp >>= 1;
     286      }
     287  
     288      return r;
     289  }
     290  
     291  /* END CONFIG_64 */
     292  #else /* CONFIG_32 */
     293  
     294  
     295  /**************************************************************************/
     296  /*                        32-bit modular arithmetic                       */
     297  /**************************************************************************/
     298  
     299  #if defined(ANSI)
     300  #if !defined(LEGACY_COMPILER)
     301  /* HAVE_UINT64_T */
     302  static inline mpd_uint_t
     303  std_mulmod(mpd_uint_t a, mpd_uint_t b, mpd_uint_t m)
     304  {
     305      return ((mpd_uuint_t) a * b) % m;
     306  }
     307  
     308  static inline void
     309  std_mulmod2c(mpd_uint_t *a, mpd_uint_t *b, mpd_uint_t w, mpd_uint_t m)
     310  {
     311      *a = ((mpd_uuint_t) *a * w) % m;
     312      *b = ((mpd_uuint_t) *b * w) % m;
     313  }
     314  
     315  static inline void
     316  std_mulmod2(mpd_uint_t *a0, mpd_uint_t b0, mpd_uint_t *a1, mpd_uint_t b1,
     317              mpd_uint_t m)
     318  {
     319      *a0 = ((mpd_uuint_t) *a0 * b0) % m;
     320      *a1 = ((mpd_uuint_t) *a1 * b1) % m;
     321  }
     322  /* END HAVE_UINT64_T */
     323  #else
     324  /* LEGACY_COMPILER */
     325  static inline mpd_uint_t
     326  std_mulmod(mpd_uint_t a, mpd_uint_t b, mpd_uint_t m)
     327  {
     328      mpd_uint_t hi, lo, q, r;
     329      _mpd_mul_words(&hi, &lo, a, b);
     330      _mpd_div_words(&q, &r, hi, lo, m);
     331      return r;
     332  }
     333  
     334  static inline void
     335  std_mulmod2c(mpd_uint_t *a, mpd_uint_t *b, mpd_uint_t w, mpd_uint_t m)
     336  {
     337      *a = std_mulmod(*a, w, m);
     338      *b = std_mulmod(*b, w, m);
     339  }
     340  
     341  static inline void
     342  std_mulmod2(mpd_uint_t *a0, mpd_uint_t b0, mpd_uint_t *a1, mpd_uint_t b1,
     343              mpd_uint_t m)
     344  {
     345      *a0 = std_mulmod(*a0, b0, m);
     346      *a1 = std_mulmod(*a1, b1, m);
     347  }
     348  /* END LEGACY_COMPILER */
     349  #endif
     350  
     351  static inline mpd_uint_t
     352  std_powmod(mpd_uint_t base, mpd_uint_t exp, mpd_uint_t umod)
     353  {
     354      mpd_uint_t r = 1;
     355  
     356      while (exp > 0) {
     357          if (exp & 1)
     358              r = std_mulmod(r, base, umod);
     359          base = std_mulmod(base, base, umod);
     360          exp >>= 1;
     361      }
     362  
     363      return r;
     364  }
     365  #endif /* ANSI CONFIG_32 */
     366  
     367  
     368  /**************************************************************************/
     369  /*                    Pentium Pro modular arithmetic                      */
     370  /**************************************************************************/
     371  
     372  /*
     373   * A proof of the algorithm is in literature/mulmod-ppro.txt. The FPU
     374   * control word must be set to 64-bit precision and truncation mode
     375   * prior to using these functions.
     376   *
     377   * Algorithm: calculate (a * b) % p:
     378   *
     379   *   p    := prime < 2**31
     380   *   pinv := (long double)1.0 / p (precalculated)
     381   *
     382   *   a) n = a * b              # Calculate exact product.
     383   *   b) qest = n * pinv        # Calculate estimate for q = n / p.
     384   *   c) q = (qest+2**63)-2**63 # Truncate qest to the exact quotient.
     385   *   d) r = n - q * p          # Calculate remainder.
     386   *
     387   * Remarks:
     388   *
     389   *   - p = dmod and pinv = dinvmod.
     390   *   - dinvmod points to an array of three uint32_t, which is interpreted
     391   *     as an 80 bit long double by fldt.
     392   *   - Intel compilers prior to version 11 do not seem to handle the
     393   *     __GNUC__ inline assembly correctly.
     394   *   - random tests are provided in tests/extended/ppro_mulmod.c
     395   */
     396  
     397  #if defined(PPRO)
     398  #if defined(ASM)
     399  
/*
 * Return (a * b) % dmod.
 *
 * Carries out steps a) - d) of the algorithm above on the x87 stack:
 *   a, b     factors, loaded as 32-bit integers (fildl)
 *   dmod     points to p as a double (fldl)
 *   dinvmod  points to pinv, read as an 80-bit long double (fldt)
 *
 * The FPU control word must already be set up as described above.
 */
static inline mpd_uint_t
ppro_mulmod(mpd_uint_t a, mpd_uint_t b, double *dmod, uint32_t *dinvmod)
{
    mpd_uint_t retval;

    __asm__ (
            "fildl  %2\n\t"                 /* push b */
            "fildl  %1\n\t"                 /* push a */
            "fmulp  %%st, %%st(1)\n\t"      /* a) n = a * b */
            "fldt   (%4)\n\t"               /* push pinv */
            "fmul   %%st(1), %%st\n\t"      /* b) qest = n * pinv */
            "flds   %5\n\t"                 /* push MPD_TWO63 (single precision load) */
            "fadd   %%st, %%st(1)\n\t"      /* c) qest + 2**63 ... */
            "fsubrp %%st, %%st(1)\n\t"      /*    ... - 2**63 = q */
            "fldl   (%3)\n\t"               /* push p */
            "fmulp  %%st, %%st(1)\n\t"      /* q * p */
            "fsubrp %%st, %%st(1)\n\t"      /* d) r = n - q * p */
            "fistpl %0\n\t"                 /* store r as 32-bit integer, pop */
            : "=m" (retval)
            : "m" (a), "m" (b), "r" (dmod), "r" (dinvmod), "m" (MPD_TWO63)
            : "st", "memory"
    );

    return retval;
}
     426  
/*
 * Two modular multiplications in parallel:
 *      *a0 = (*a0 * w) % dmod
 *      *a1 = (*a1 * w) % dmod
 *
 * Same algorithm as ppro_mulmod(), with the two computations interleaved
 * on the x87 stack. In the comments below n0 = *a0 * w and n1 = *a1 * w.
 * The results are stored through a0 and a1; the asm statement has no
 * output operands and declares a "memory" clobber instead.
 */
static inline void
ppro_mulmod2c(mpd_uint_t *a0, mpd_uint_t *a1, mpd_uint_t w,
              double *dmod, uint32_t *dinvmod)
{
    __asm__ (
            "fildl  %2\n\t"                 /* push w */
            "fildl  (%1)\n\t"               /* push *a1 */
            "fmul   %%st(1), %%st\n\t"      /* n1 = *a1 * w */
            "fxch   %%st(1)\n\t"            /* bring w back to the top */
            "fildl  (%0)\n\t"               /* push *a0 */
            "fmulp  %%st, %%st(1) \n\t"     /* n0 = *a0 * w; stack: n0, n1 */
            "fldt   (%4)\n\t"               /* push pinv */
            "flds   %5\n\t"                 /* push MPD_TWO63 */
            "fld    %%st(2)\n\t"            /* push a copy of n0 */
            "fmul   %%st(2)\n\t"            /* qest0 = n0 * pinv */
            "fadd   %%st(1)\n\t"            /* qest0 + 2**63 ... */
            "fsub   %%st(1)\n\t"            /* ... - 2**63 = q0 */
            "fmull  (%3)\n\t"               /* q0 * p */
            "fsubrp %%st, %%st(3)\n\t"      /* r0 = n0 - q0 * p, replaces n0 */
            "fxch   %%st(2)\n\t"            /* bring r0 to the top */
            "fistpl (%0)\n\t"               /* *a0 = r0; stack: pinv, 2**63, n1 */
            "fmul   %%st(2)\n\t"            /* qest1 = n1 * pinv */
            "fadd   %%st(1)\n\t"            /* qest1 + 2**63 ... */
            "fsubp  %%st, %%st(1)\n\t"      /* ... - 2**63 = q1 */
            "fmull  (%3)\n\t"               /* q1 * p */
            "fsubrp %%st, %%st(1)\n\t"      /* r1 = n1 - q1 * p */
            "fistpl (%1)\n\t"               /* *a1 = r1 */
            : : "r" (a0), "r" (a1), "m" (w),
                "r" (dmod), "r" (dinvmod),
                "m" (MPD_TWO63)
            : "st", "memory"
    );
}
     465  
/*
 * Two modular multiplications in parallel:
 *      *a0 = (*a0 * b0) % dmod
 *      *a1 = (*a1 * b1) % dmod
 *
 * Same algorithm as ppro_mulmod(), with both computations interleaved on
 * the x87 stack. In the comments below n0 = *a0 * b0 and n1 = *a1 * b1.
 * The results are stored through a0 and a1; the asm statement has no
 * output operands and declares a "memory" clobber instead.
 */
static inline void
ppro_mulmod2(mpd_uint_t *a0, mpd_uint_t b0, mpd_uint_t *a1, mpd_uint_t b1,
             double *dmod, uint32_t *dinvmod)
{
    __asm__ (
            "fildl  %3\n\t"                 /* push b1 */
            "fildl  (%2)\n\t"               /* push *a1 */
            "fmulp  %%st, %%st(1)\n\t"      /* n1 = *a1 * b1 */
            "fildl  %1\n\t"                 /* push b0 */
            "fildl  (%0)\n\t"               /* push *a0 */
            "fmulp  %%st, %%st(1)\n\t"      /* n0 = *a0 * b0; stack: n0, n1 */
            "fldt   (%5)\n\t"               /* push pinv */
            "fld    %%st(2)\n\t"            /* push a copy of n1 */
            "fmul   %%st(1), %%st\n\t"      /* qest1 = n1 * pinv */
            "fxch   %%st(1)\n\t"            /* bring pinv to the top */
            "fmul   %%st(2), %%st\n\t"      /* qest0 = pinv * n0 */
            "flds   %6\n\t"                 /* push MPD_TWO63 */
            "fldl   (%4)\n\t"               /* push p */
            "fxch   %%st(3)\n\t"            /* swap p with qest1 */
            "fadd   %%st(1), %%st\n\t"      /* qest1 + 2**63 */
            "fxch   %%st(2)\n\t"            /* bring qest0 to the top */
            "fadd   %%st(1), %%st\n\t"      /* qest0 + 2**63 */
            "fxch   %%st(2)\n\t"            /* bring qest1 + 2**63 to the top */
            "fsub   %%st(1), %%st\n\t"      /* ... - 2**63 = q1 */
            "fxch   %%st(2)\n\t"            /* bring qest0 + 2**63 to the top */
            "fsubp  %%st, %%st(1)\n\t"      /* ... - 2**63 = q0; stack: q0, q1, p, n0, n1 */
            "fxch   %%st(1)\n\t"            /* bring q1 to the top */
            "fmul   %%st(2), %%st\n\t"      /* q1 * p */
            "fxch   %%st(1)\n\t"            /* bring q0 to the top */
            "fmulp  %%st, %%st(2)\n\t"      /* q0 * p; stack: q1*p, q0*p, n0, n1 */
            "fsubrp %%st, %%st(3)\n\t"      /* r1 = n1 - q1 * p */
            "fsubrp %%st, %%st(1)\n\t"      /* r0 = n0 - q0 * p; stack: r0, r1 */
            "fxch   %%st(1)\n\t"            /* bring r1 to the top */
            "fistpl (%2)\n\t"               /* *a1 = r1 */
            "fistpl (%0)\n\t"               /* *a0 = r0 */
            : : "r" (a0), "m" (b0), "r" (a1), "m" (b1),
                "r" (dmod), "r" (dinvmod),
                "m" (MPD_TWO63)
            : "st", "memory"
    );
}
     512  /* END PPRO GCC ASM */
     513  #elif defined(MASM)
     514  
/*
 * Return (a * b) % dmod.
 *
 * MSVC inline assembly version. Carries out steps a) - d) of the
 * algorithm above on the x87 stack:
 *   a, b     factors, loaded as integers (fild)
 *   dmod     points to p as a double (QWORD PTR)
 *   dinvmod  points to pinv, read as an 80-bit value (TBYTE PTR)
 *
 * The FPU control word must already be set up as described above.
 */
static inline mpd_uint_t __cdecl
ppro_mulmod(mpd_uint_t a, mpd_uint_t b, double *dmod, uint32_t *dinvmod)
{
    mpd_uint_t retval;

    __asm {
        mov     eax, dinvmod            /* eax = address of pinv */
        mov     edx, dmod               /* edx = address of p */
        fild    b                       /* push b */
        fild    a                       /* push a */
        fmulp   st(1), st               /* a) n = a * b */
        fld     TBYTE PTR [eax]         /* push pinv */
        fmul    st, st(1)               /* b) qest = n * pinv */
        fld     MPD_TWO63               /* push MPD_TWO63 */
        fadd    st(1), st               /* c) qest + 2**63 ... */
        fsubp   st(1), st               /*    ... - 2**63 = q */
        fld     QWORD PTR [edx]         /* push p */
        fmulp   st(1), st               /* q * p */
        fsubp   st(1), st               /* d) r = n - q * p */
        fistp   retval                  /* store r as integer, pop */
    }

    return retval;
}
     540  
     541  /*
     542   * Two modular multiplications in parallel:
     543   *      *a0 = (*a0 * w) % dmod
     544   *      *a1 = (*a1 * w) % dmod
     545   */
     546  static inline mpd_uint_t __cdecl
     547  ppro_mulmod2c(mpd_uint_t *a0, mpd_uint_t *a1, mpd_uint_t w,
     548                double *dmod, uint32_t *dinvmod)
     549  {
     550      __asm {
     551          mov     ecx, dmod
     552          mov     edx, a1
     553          mov     ebx, dinvmod
     554          mov     eax, a0
     555          fild    w
     556          fild    DWORD PTR [edx]
     557          fmul    st, st(1)
     558          fxch    st(1)
     559          fild    DWORD PTR [eax]
     560          fmulp   st(1), st
     561          fld     TBYTE PTR [ebx]
     562          fld     MPD_TWO63
     563          fld     st(2)
     564          fmul    st, st(2)
     565          fadd    st, st(1)
     566          fsub    st, st(1)
     567          fmul    QWORD PTR [ecx]
     568          fsubp   st(3), st
     569          fxch    st(2)
     570          fistp   DWORD PTR [eax]
     571          fmul    st, st(2)
     572          fadd    st, st(1)
     573          fsubrp  st(1), st
     574          fmul    QWORD PTR [ecx]
     575          fsubp   st(1), st
     576          fistp   DWORD PTR [edx]
     577      }
     578  }
     579  
/*
 * Two modular multiplications in parallel:
 *      *a0 = (*a0 * b0) % dmod
 *      *a1 = (*a1 * b1) % dmod
 *
 * MSVC inline assembly version; same algorithm as ppro_mulmod(), with
 * both computations interleaved on the x87 stack. In the comments below
 * n0 = *a0 * b0 and n1 = *a1 * b1. The results are stored through a0
 * and a1.
 */
static inline void __cdecl
ppro_mulmod2(mpd_uint_t *a0, mpd_uint_t b0, mpd_uint_t *a1, mpd_uint_t b1,
             double *dmod, uint32_t *dinvmod)
{
    __asm {
        mov     ecx, dmod               /* ecx = address of p */
        mov     edx, a1
        mov     ebx, dinvmod            /* ebx = address of pinv */
        mov     eax, a0
        fild    b1                      /* push b1 */
        fild    DWORD PTR [edx]         /* push *a1 */
        fmulp   st(1), st               /* n1 = *a1 * b1 */
        fild    b0                      /* push b0 */
        fild    DWORD PTR [eax]         /* push *a0 */
        fmulp   st(1), st               /* n0 = *a0 * b0; stack: n0, n1 */
        fld     TBYTE PTR [ebx]         /* push pinv */
        fld     st(2)                   /* push a copy of n1 */
        fmul    st, st(1)               /* qest1 = n1 * pinv */
        fxch    st(1)                   /* bring pinv to the top */
        fmul    st, st(2)               /* qest0 = pinv * n0 */
        fld     DWORD PTR MPD_TWO63     /* push MPD_TWO63 (32-bit load) */
        fld     QWORD PTR [ecx]         /* push p */
        fxch    st(3)                   /* swap p with qest1 */
        fadd    st, st(1)               /* qest1 + 2**63 */
        fxch    st(2)                   /* bring qest0 to the top */
        fadd    st, st(1)               /* qest0 + 2**63 */
        fxch    st(2)                   /* bring qest1 + 2**63 to the top */
        fsub    st, st(1)               /* ... - 2**63 = q1 */
        fxch    st(2)                   /* bring qest0 + 2**63 to the top */
        fsubrp  st(1), st               /* ... - 2**63 = q0; stack: q0, q1, p, n0, n1 */
        fxch    st(1)                   /* bring q1 to the top */
        fmul    st, st(2)               /* q1 * p */
        fxch    st(1)                   /* bring q0 to the top */
        fmulp   st(2), st               /* q0 * p; stack: q1*p, q0*p, n0, n1 */
        fsubp   st(3), st               /* r1 = n1 - q1 * p */
        fsubp   st(1), st               /* r0 = n0 - q0 * p; stack: r0, r1 */
        fxch    st(1)                   /* bring r1 to the top */
        fistp   DWORD PTR [edx]         /* *a1 = r1 */
        fistp   DWORD PTR [eax]         /* *a0 = r0 */
    }
}
     626  #endif /* PPRO MASM (_MSC_VER) */
     627  
     628  
     629  /* Return (base ** exp) % dmod */
     630  static inline mpd_uint_t
     631  ppro_powmod(mpd_uint_t base, mpd_uint_t exp, double *dmod, uint32_t *dinvmod)
     632  {
     633      mpd_uint_t r = 1;
     634  
     635      while (exp > 0) {
     636          if (exp & 1)
     637              r = ppro_mulmod(r, base, dmod, dinvmod);
     638          base = ppro_mulmod(base, base, dmod, dinvmod);
     639          exp >>= 1;
     640      }
     641  
     642      return r;
     643  }
     644  #endif /* PPRO */
     645  #endif /* CONFIG_32 */
     646  
     647  
     648  #endif /* LIBMPDEC_UMODARITH_H_ */