/* Addmul_2 / mul_2 for IBM z13 or later

Copyright 2021 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */

#include "gmp-impl.h"

#include "s390_64/z13/common-vec.h"

#undef FUNCNAME

#ifdef DO_INLINE
#  ifdef OPERATION_addmul_2
#    define ADD
#    define FUNCNAME inline_addmul_2
#  elif defined(OPERATION_mul_2)
#    define FUNCNAME inline_mul_2
#  else
#    error Missing define for operation to perform
#  endif
#else
#  ifdef OPERATION_addmul_2
#    define ADD
#    define FUNCNAME mpn_addmul_2
#  elif defined(OPERATION_mul_2)
#    define FUNCNAME mpn_mul_2
#  else
#    error Missing define for operation to perform
#  endif
#endif

#ifdef DO_INLINE
static inline mp_limb_t
FUNCNAME (mp_limb_t *rp, const mp_limb_t *up, mp_size_t n, const mp_limb_t *vp)
    __attribute__ ((always_inline));

static inline
#endif
mp_limb_t
FUNCNAME (mp_limb_t *rp, const mp_limb_t *up, mp_size_t n,
          const mp_limb_t *vp)
{

  /* Combine 64x64 multiplication into GPR pairs (MLGR) with 128-bit adds in
     VRs (using each VR as a single 128-bit accumulator).
     The inner loop is unrolled to four limbs, with two blocks of four
     multiplications each.  Since MLGR operates on even/odd GPR pairs, pin
     the products appropriately. */
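
  /* For reference only, a portable sketch of what mul_2/addmul_2 compute
   * (assuming the usual GMP convention that the n+1 least significant result
   * limbs go to {rp, n+1} and the most significant limb is returned; the
   * "ref_" name is illustrative and this code is not part of the build):
   *
   *   mp_limb_t
   *   ref_aormul_2 (mp_limb_t *rp, const mp_limb_t *up, mp_size_t n,
   *                 const mp_limb_t *vp)
   *   {
   *     mp_limb_t c_lo = 0, c_hi = 0;    // two-limb carry between columns
   *     for (mp_size_t i = 0; i < n; i++)
   *       {
   *         unsigned __int128 p0 = (unsigned __int128) up[i] * vp[0];
   *         unsigned __int128 p1 = (unsigned __int128) up[i] * vp[1];
   *         unsigned __int128 s = p0 + c_lo;   // cannot overflow 128 bits
   * #ifdef ADD
   *         s += rp[i];                        // still cannot overflow
   * #endif
   *         rp[i] = (mp_limb_t) s;
   *         // carry into the next two columns: high limb of s, plus p1
   *         unsigned __int128 c = (s >> 64) + p1 + c_hi;
   *         c_lo = (mp_limb_t) c;
   *         c_hi = (mp_limb_t) (c >> 64);
   *       }
   *     rp[n] = c_lo;
   *     return c_hi;
   *   }
   */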

  register mp_limb_t p0_high asm("r0");
  register mp_limb_t p0_low asm("r1");

  register mp_limb_t p1_high asm("r8");
  register mp_limb_t p1_low asm("r9");

  register mp_limb_t p2_high asm("r6");
  register mp_limb_t p2_low asm("r7");

  register mp_limb_t p3_high asm("r10");
  register mp_limb_t p3_low asm("r11");

  vec_t carry_prod = { .dw = vec_splat_u64 (0) };
  vec_t zero = { .dw = vec_splat_u64 (0) };

  /* two carry-bits for the 128-bit VR adds - stored in VRs */
#ifdef ADD
  vec_t carry_vec0 = { .dw = vec_splat_u64 (0) };
#endif
  vec_t carry_vec1 = { .dw = vec_splat_u64 (0) };

  vec_t tmp;

  vec_t sum0, sum1;

  /* products transferred into VRs for accumulating there */
  vec_t pv0, pv3;
  vec_t pv1_low, pv1_high, pv2_low, pv2_high;
  vec_t low, middle, high;
#ifdef ADD
  vec_t rp0, rp1;
#endif

  register mp_limb_t v0 asm("r12");
  register mp_limb_t v1 asm("r5");
  v0 = vp[0];
  v1 = vp[1];

  /* The scalar multiplications compete with pointer and index increments for
   * issue ports.  Thus, increment the loop index in the middle of the loop so
   * that the operations for the next iteration's multiplications can be
   * loaded in time (looks horrible, yet helps performance) and make sure we
   * use addressing with base reg + index reg + immediate displacement
   * (so that only the single index needs incrementing, instead of multiple
   * pointers). */
#undef LOOP_ADVANCE
#define LOOP_ADVANCE (4 * sizeof (mp_limb_t))
#define IDX_OFFSET (LOOP_ADVANCE)
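
  /* For example, in the unrolled loop below the second limb of a block is
   * read at byte offset idx + (8 + IDX_OFFSET) from up, so every limb access
   * is a single base + index + displacement load and only idx has to be
   * bumped once per iteration.  */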

  register ssize_t idx = 0 - IDX_OFFSET;
#ifdef BRCTG
  ssize_t iterations = (size_t)n / 4;
#else
  ssize_t const idx_bound = n * sizeof (mp_limb_t) - IDX_OFFSET;
#endif

  /*
   * To minimize latency in the carry chain, accumulate in VRs with 128-bit
   * adds with carry in and out.  As a downside, these require two insns for
   * each add - one to calculate the sum, one to deliver the carry out.
   * To reduce the overall number of insns to execute, first add up product
   * limbs in combinations that cannot produce a carry out, and only then
   * feed the results into one (for mul) or two (for addmul) adds with carry
   * chains.
   *
   * Since (2^64-1) * (2^64-1) = (2^128-1) - 2 * (2^64-1), we can add two
   * limbs into each 128-bit product without causing carry out.
   *
   * For each block of 2 limbs * 2 limbs
   *
   *                         | u[i] * v[0]   (p2) |
   *              | u[i] * v[1]   (p0) |
   *              | u[i+1] * v[0] (p1) |
   *   | u[i+1] * v[1] (p3) |
   *   <      128 bits      ><      128 bits      >
   *
   * we can begin accumulating with "simple" carry-oblivious 128-bit adds:
   * - p0 + low limb of p1
   *      + high limb of p2
   *   and combine the resulting low limb with p2's low limb
   * - p3 + high limb of p1
   *      + high limb of the sum above
   * ... which results in two 128-bit limbs to be fed into the carry
   * chain(s).
   * Overall, that scheme saves instructions and improves performance, despite
   * slightly increasing latency between the multiplications and the carry
   * chain (yet not within the carry chain).
   */
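
  /* Concretely, with B = 2^64 each of the carry-oblivious sums above is
   * bounded by (B-1)^2 + (B-1) + (B-1) = B^2 - 1, so it always fits in
   * 128 bits without a carry out.  */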

#define LOAD_LOW_LIMB(VEC, LIMB) \
  do \
    { \
      asm("vzero\t%[vec]\n\t" \
          "vlvgg\t%[vec],%[limb],1" \
          : [vec] "=v"(VEC) \
          : [limb] "r"(LIMB)); \
    } \
  while (0)
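
/* LOAD_LOW_LIMB zeroes the target VR and places LIMB in its low doubleword
 * (element 1), so the limb can be used directly as a zero-extended 128-bit
 * addend in the vector adds below.  */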

/* for the 128-bit adds in the carry chain, to calculate a + b + carry-in we
 * need paired vec_adde_u128 (delivers sum) and vec_addec_u128 (delivers new
 * carry) */
#define ADD_UP2_CARRY_INOUT(SUMIDX, CARRYIDX, ADDEND1, ADDEND2) \
  do \
    { \
      sum##SUMIDX.sw \
          = vec_adde_u128 (ADDEND1.sw, ADDEND2.sw, carry_vec##CARRYIDX.sw); \
      carry_vec##CARRYIDX.sw \
          = vec_addec_u128 (ADDEND1.sw, ADDEND2.sw, carry_vec##CARRYIDX.sw); \
    } \
  while (0)

#define ADD_UP_CARRY_INOUT(SUMIDX, ADDEND1, ADDEND2) \
  ADD_UP2_CARRY_INOUT (SUMIDX, SUMIDX, ADDEND1, ADDEND2)
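
/* For example, ADD_UP_CARRY_INOUT (1, sum0, low) performs a full 128-bit
 * add-with-carry:
 *   sum1       = sum0 + low + carry_vec1   (carry-in, a 128-bit 0 or 1)
 *   carry_vec1 = carry-out of that same addition (again 0 or 1)
 * using one vector add for the sum and one for the carry.  */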

/* variant without carry-in for prologue */
#define ADD_UP2_CARRY_OUT(SUMIDX, CARRYIDX, ADDEND1, ADDEND2) \
  do \
    { \
      sum##SUMIDX.sw = vec_add_u128 (ADDEND1.sw, ADDEND2.sw); \
      carry_vec##CARRYIDX.sw = vec_addc_u128 (ADDEND1.sw, ADDEND2.sw); \
    } \
  while (0)

#define ADD_UP_CARRY_OUT(SUMIDX, ADDEND1, ADDEND2) \
  ADD_UP2_CARRY_OUT (SUMIDX, SUMIDX, ADDEND1, ADDEND2)

  /* prologue for 4x-unrolled main loop */
  switch ((size_t)n % 4)
    {
    case 1:
      ASM_LOADGPR_BASE (p0_low, up, 0);
      ASM_LOADGPR_BASE (p1_low, up, 0);
      s390_double_umul_ppmm_distinct (p0_high, p0_low, p1_high, p1_low, v0, v1);
      carry_prod.dw = vec_load_2di_as_pair (p1_high, p1_low);

/* gcc tries to be too clever and uses a VLR from a register that already
 * holds zero; VZERO is cheaper. */
# define NEW_CARRY(VEC, LIMB) \
  do \
    { \
      asm("vzero\t%[vec]\n\t" \
          "vlvgg\t%[vec],%[limb],1" \
          : [vec] "=v"(VEC) \
          : [limb] "r"(LIMB)); \
    } \
  while (0)

      NEW_CARRY (tmp, p0_high);

      carry_prod.sw = vec_add_u128 (carry_prod.sw, tmp.sw);
#ifdef ADD
      carry_vec1.dw[1] = __builtin_add_overflow (rp[0], p0_low, rp);
#else
      rp[0] = p0_low;
#endif
      idx += sizeof (mp_limb_t);
      break;

    case 2:
      ASM_LOADGPR_BASE (p0_low, up, 0);
      ASM_LOADGPR_BASE (p1_low, up, 8);
      ASM_LOADGPR_BASE (p2_low, up, 0);
      ASM_LOADGPR_BASE (p3_low, up, 8);

      asm(""
          : "=r"(p0_low), "=r"(p2_low)
          : "r"(p3_low), "0"(p0_low), "r"(p1_low), "1"(p2_low));
      s390_double_umul_ppmm_distinct (p0_high, p0_low, p1_high, p1_low, v1, v0);
      s390_double_umul_ppmm_distinct (p2_high, p2_low, p3_high, p3_low, v0, v1);

      pv0.dw = vec_load_2di_as_pair (p0_high, p0_low);
      LOAD_LOW_LIMB (pv1_low, p1_low);
      LOAD_LOW_LIMB (pv1_high, p1_high);
      pv0.sw = vec_add_u128 (pv0.sw, pv1_low.sw);
      LOAD_LOW_LIMB (pv2_high, p2_high);
      pv3.dw = vec_load_2di_as_pair (p3_high, p3_low);
      LOAD_LOW_LIMB (pv2_low, p2_low);
      pv3.sw = vec_add_u128 (pv3.sw, pv1_high.sw);
      middle.sw = vec_add_u128 (pv0.sw, pv2_high.sw);
      low.dw = vec_permi (middle.dw, pv2_low.dw, 3);
      middle.dw = vec_permi (zero.dw, middle.dw, 0);
      high.sw = vec_add_u128 (middle.sw, pv3.sw);
#ifdef ADD
      rp0 = vec_load_elements_reversed (rp, 0);
      ADD_UP_CARRY_OUT (0, rp0, carry_prod);
#else
      sum0 = carry_prod;
#endif
      ADD_UP_CARRY_OUT (1, sum0, low);
      vec_store_elements_reversed (rp, 0, sum1);
      carry_prod = high;

      idx += 2 * sizeof (mp_limb_t);
      break;

    case 3:
      ASM_LOADGPR_BASE (p0_low, up, 0);
      ASM_LOADGPR_BASE (p1_low, up, 0);
      s390_double_umul_ppmm_distinct (p0_high, p0_low, p1_high, p1_low, v0, v1);
      carry_prod.dw = vec_load_2di_as_pair (p1_high, p1_low);
      NEW_CARRY (tmp, p0_high);
      carry_prod.sw = vec_add_u128 (carry_prod.sw, tmp.sw);

#ifdef ADD
      carry_vec1.dw[1] = __builtin_add_overflow (rp[0], p0_low, rp);
#else
      rp[0] = p0_low;
#endif

      ASM_LOADGPR_BASE (p0_low, up, 8);
      ASM_LOADGPR_BASE (p1_low, up, 16);
      ASM_LOADGPR_BASE (p2_low, up, 8);
      ASM_LOADGPR_BASE (p3_low, up, 16);

      asm(""
          : "=r"(p0_low), "=r"(p2_low)
          : "r"(p3_low), "0"(p0_low), "r"(p1_low), "1"(p2_low));
      s390_double_umul_ppmm_distinct (p0_high, p0_low, p1_high, p1_low, v1, v0);
      s390_double_umul_ppmm_distinct (p2_high, p2_low, p3_high, p3_low, v0, v1);

      pv0.dw = vec_load_2di_as_pair (p0_high, p0_low);

      LOAD_LOW_LIMB (pv1_low, p1_low);
      LOAD_LOW_LIMB (pv1_high, p1_high);

      pv0.sw = vec_add_u128 (pv0.sw, pv1_low.sw);
      LOAD_LOW_LIMB (pv2_high, p2_high);
      pv3.dw = vec_load_2di_as_pair (p3_high, p3_low);

      LOAD_LOW_LIMB (pv2_low, p2_low);

      pv3.sw = vec_add_u128 (pv3.sw, pv1_high.sw);
      middle.sw = vec_add_u128 (pv0.sw, pv2_high.sw);

      low.dw = vec_permi (middle.dw, pv2_low.dw, 3);
      middle.dw = vec_permi (zero.dw, middle.dw, 0);
      high.sw = vec_add_u128 (middle.sw, pv3.sw);

#ifdef ADD
      vec_t rp0 = vec_load_elements_reversed (rp, 8);
      ADD_UP_CARRY_OUT (0, rp0, carry_prod);
#else
      sum0 = carry_prod;
#endif
      ADD_UP_CARRY_INOUT (1, sum0, low);

      vec_store_elements_reversed (rp, 8, sum1);

      carry_prod = high;

      idx += 3 * sizeof (mp_limb_t);
      break;
    }
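
  /* Example of the index bookkeeping: for n = 7, case 3 above has handled
   * up[0..2] and leaves idx = 3*8 - IDX_OFFSET = -8; with idx_bound =
   * 7*8 - IDX_OFFSET = 24 (or iterations = 7/4 = 1 for BRCTG), the main loop
   * below runs exactly once and covers up[3..6].  */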

  /*
   * Branch-on-count implicitly hints to the branch predictor that the branch
   * is taken, while compare-and-branch hints that it is not taken.
   * Currently, using branch-on-count has a performance advantage, but it is
   * not clear that it is generally the better choice (e.g., branch-on-count
   * requires decrementing the separate counter).  So, allow switching the
   * loop condition (by defining BRCTG) to enable either category of branch
   * instructions:
   * - idx is less than an upper bound, for compare-and-branch
   * - iteration counter greater than zero, for branch-on-count
   */
#ifdef BRCTG
  for (; iterations > 0; iterations--)
    {
#else
  while (idx < idx_bound)
    {
#endif
      /* The 64x64->128 MLGR multiplies two factors in GPRs and stores the
       * result in a GPR pair.  One of the factors is taken from the GPR pair
       * and gets overwritten.
       * To reuse factors, it turned out cheaper to load limbs multiple times
       * than to copy GPR contents.  Enforce that, and the use of addressing
       * by base reg + index reg + immediate displacement, via inline asm.
       */
      ASM_LOADGPR (p0_low, up, idx, 0 + IDX_OFFSET);
      ASM_LOADGPR (p1_low, up, idx, 8 + IDX_OFFSET);
      ASM_LOADGPR (p2_low, up, idx, 0 + IDX_OFFSET);
      ASM_LOADGPR (p3_low, up, idx, 8 + IDX_OFFSET);

      s390_double_umul_ppmm_distinct (p0_high, p0_low, p1_high, p1_low, v1, v0);

      pv0.dw = vec_load_2di_as_pair (p0_high, p0_low);

      LOAD_LOW_LIMB (pv1_low, p1_low);
      LOAD_LOW_LIMB (pv1_high, p1_high);

      s390_double_umul_ppmm_distinct (p2_high, p2_low, p3_high, p3_low, v0, v1);

      pv0.sw = vec_add_u128 (pv0.sw, pv1_low.sw);
      LOAD_LOW_LIMB (pv2_high, p2_high);
      pv3.dw = vec_load_2di_as_pair (p3_high, p3_low);

      LOAD_LOW_LIMB (pv2_low, p2_low);

      ASM_LOADGPR (p0_low, up, idx, 16 + IDX_OFFSET);
      ASM_LOADGPR (p1_low, up, idx, 24 + IDX_OFFSET);
      ASM_LOADGPR (p2_low, up, idx, 16 + IDX_OFFSET);
      ASM_LOADGPR (p3_low, up, idx, 24 + IDX_OFFSET);

      idx += LOOP_ADVANCE;

      /*
       * "barrier" to enforce scheduling the index increment before the second
       * block of multiplications.  Not required for clang.
       */
#ifndef __clang__
      asm(""
          : "=r"(idx), "=r"(p0_high), "=r"(p2_high)
          : "0"(idx), "1"(p0_high), "2"(p2_high));
#endif

      s390_double_umul_ppmm_distinct (p0_high, p0_low, p1_high, p1_low, v1, v0);
      s390_double_umul_ppmm_distinct (p2_high, p2_low, p3_high, p3_low, v0, v1);

      /*
       * "barrier" to enforce scheduling all MLGRs first, before any adding
       * up.  Note that clang produces better code without it.
       */
#ifndef __clang__
      asm(""
          : "=v"(pv0.sw), "=v"(pv3.sw)
          : "1"(pv3.sw), "0"(pv0.sw), "r"(p0_high), "r"(p2_high));
#endif

      pv3.sw = vec_add_u128 (pv3.sw, pv1_high.sw);
      middle.sw = vec_add_u128 (pv0.sw, pv2_high.sw);

      low.dw = vec_permi (middle.dw, pv2_low.dw,
                          3); /* least-significant doubleword from both vectors */
      middle.dw = vec_permi (zero.dw, middle.dw, 0);
      high.sw = vec_add_u128 (middle.sw, pv3.sw);

#ifdef ADD
      rp0 = vec_load_elements_reversed_idx (rp, idx,
                                            0 + IDX_OFFSET - LOOP_ADVANCE);
      ADD_UP_CARRY_INOUT (0, rp0, carry_prod);
#else
      sum0 = carry_prod;
#endif
      ADD_UP_CARRY_INOUT (1, sum0, low);

      vec_store_elements_reversed_idx (rp, idx, 0 + IDX_OFFSET - LOOP_ADVANCE,
                                       sum1);

      carry_prod = high;

      vec_t pv0_2, pv3_2;
      vec_t pv1_low_2, pv1_high_2, pv2_low_2, pv2_high_2;
      vec_t low_2, middle_2, high_2;
      vec_t sum2, sum3;

      pv0_2.dw = vec_load_2di_as_pair (p0_high, p0_low);
      LOAD_LOW_LIMB (pv1_low_2, p1_low);
      LOAD_LOW_LIMB (pv1_high_2, p1_high);

      pv0_2.sw = vec_add_u128 (pv0_2.sw, pv1_low_2.sw);
      LOAD_LOW_LIMB (pv2_high_2, p2_high);
      pv3_2.dw = vec_load_2di_as_pair (p3_high, p3_low);
      pv3_2.sw = vec_add_u128 (pv3_2.sw, pv1_high_2.sw);
      middle_2.sw = vec_add_u128 (pv0_2.sw, pv2_high_2.sw);

      LOAD_LOW_LIMB (pv2_low_2, p2_low);
      low_2.dw
          = vec_permi (middle_2.dw, pv2_low_2.dw,
                       3); /* least-significant doubleword from both vectors */
      middle_2.dw = vec_permi (zero.dw, middle_2.dw, 0);
      high_2.sw = vec_add_u128 (middle_2.sw, pv3_2.sw);

      /*
       * Another "barrier" to influence scheduling (this one also helps
       * clang).
       */
      asm("" : : "v"(pv0_2.sw), "r"(p2_high), "r"(p3_high), "v"(pv3_2.sw));

#ifdef ADD
      rp1 = vec_load_elements_reversed_idx (rp, idx,
                                            16 + IDX_OFFSET - LOOP_ADVANCE);
      ADD_UP2_CARRY_INOUT (2, 0, rp1, carry_prod);
#else
      sum2 = carry_prod;
#endif
      ADD_UP2_CARRY_INOUT (3, 1, sum2, low_2);

      vec_store_elements_reversed_idx (rp, idx, 16 + IDX_OFFSET - LOOP_ADVANCE,
                                       sum3);

      carry_prod = high_2;
    }

#ifdef ADD
  sum0.sw = vec_adde_u128 (carry_prod.sw, carry_vec0.sw, carry_vec1.sw);
#else
  sum0.sw = vec_add_u128 (carry_prod.sw, carry_vec1.sw);
#endif

  *(mp_ptr) (((char *)rp) + idx + 0 + IDX_OFFSET) = (mp_limb_t)sum0.dw[1];

  return (mp_limb_t)sum0.dw[0];
}
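
/* Usage sketch (illustrative only, not part of the build): the n+1 least
 * significant result limbs end up in {rp, n+1} and the most significant limb
 * is returned, so a basecase-style caller that wants the full n+2-limb
 * result typically stores the returned top limb itself, e.g.
 *
 *   rp[n + 1] = mpn_mul_2 (rp, up, n, vp);    // likewise for mpn_addmul_2
 */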