1  /* mpn_mul_basecase for IBM z13 and later -- Internal routine to multiply two
       2     natural numbers of length m and n.
       3  
       4     THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE.  IT IS ONLY
       5     SAFE TO REACH THIS FUNCTION THROUGH DOCUMENTED INTERFACES.
       6  
       7  Copyright 2021 Free Software Foundation, Inc.
       8  
       9  This file is part of the GNU MP Library.
      10  
      11  The GNU MP Library is free software; you can redistribute it and/or modify
      12  it under the terms of either:
      13  
      14    * the GNU Lesser General Public License as published by the Free
      15      Software Foundation; either version 3 of the License, or (at your
      16      option) any later version.
      17  
      18  or
      19  
      20    * the GNU General Public License as published by the Free Software
      21      Foundation; either version 2 of the License, or (at your option) any
      22      later version.
      23  
      24  or both in parallel, as here.
      25  
      26  The GNU MP Library is distributed in the hope that it will be useful, but
      27  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
      28  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
      29  for more details.
      30  
      31  You should have received copies of the GNU General Public License and the
      32  GNU Lesser General Public License along with the GNU MP Library.  If not,
      33  see https://www.gnu.org/licenses/.  */
      34  
      35  #include <stdlib.h>
      36  
      37  #include "gmp-impl.h"
      38  
/* Note: we explicitly inline all mul and addmul routines here to reduce the
 * number of branches in prologues of unrolled functions. That comes at the
 * cost of duplicating common loop bodies in object code. */
      42  #define DO_INLINE
      43  
/*
 * Tweak loop conditions in addmul subroutines to enable use of
 * branch-relative-on-count (BRCTG) instructions, which currently results in
 * better performance.
 */
      49  #define BRCTG
      50  
      51  #include "s390_64/z13/common-vec.h"
      52  
      53  #define OPERATION_mul_1
      54  #include "s390_64/z13/addmul_1.c"
      55  #undef OPERATION_mul_1
      56  
      57  #define OPERATION_addmul_1
      58  #include "s390_64/z13/addmul_1.c"
      59  #undef OPERATION_addmul_1
      60  
      61  #define OPERATION_mul_2
      62  #include "s390_64/z13/aormul_2.c"
      63  #undef OPERATION_mul_2
      64  
      65  #define OPERATION_addmul_2
      66  #include "s390_64/z13/aormul_2.c"
      67  #undef OPERATION_addmul_2
      68  
      69  void
      70  mpn_mul_basecase (mp_ptr rp, mp_srcptr up, mp_size_t un, mp_srcptr vp,
      71                    mp_size_t vn)
      72  {
      73    ASSERT (un >= vn);
      74    ASSERT (vn >= 1);
      75    ASSERT (!MPN_OVERLAP_P (rp, un + vn, up, un));
      76    ASSERT (!MPN_OVERLAP_P (rp, un + vn, vp, vn));
      77  
      78    /* The implementations of (add)mul_1/2 are 4x-unrolled. Pull out the branch
      79     * for un%4 and inline specific variants. */
      80  
      81  #define BRANCH_FOR_MOD(N)                                                     \
      82    do                                                                          \
      83      {                                                                         \
      84        if (vn >= 2)                                                            \
      85          {                                                                     \
      86            rp[un + 1] = inline_mul_2 (rp, up, un, vp);                         \
      87            rp += 2, vp += 2, vn -= 2;                                          \
      88          }                                                                     \
      89        else                                                                    \
      90          {                                                                     \
      91            rp[un] = inline_mul_1 (rp, up, un, vp[0]);                          \
      92            return;                                                             \
      93          }                                                                     \
      94                                                                                \
      95        while (vn >= 2)                                                         \
      96          {                                                                     \
      97            rp[un + 2 - 1] = inline_addmul_2 (rp, up, un, vp);                  \
      98            rp += 2, vp += 2, vn -= 2;                                          \
      99          }                                                                     \
     100                                                                                \
     101        while (vn >= 1)                                                         \
     102          {                                                                     \
     103            rp[un] = inline_addmul_1 (rp, up, un, vp[0]);                       \
     104            rp += 1, vp += 1, vn -= 1;                                          \
     105          }                                                                     \
     106      }                                                                         \
     107    while (0);
     108  
     109    switch (((size_t)un) % 4)
     110      {
     111      case 0:
     112        BRANCH_FOR_MOD (0);
     113        break;
     114      case 1:
     115        BRANCH_FOR_MOD (1);
     116        break;
     117      case 2:
     118        BRANCH_FOR_MOD (2);
     119        break;
     120      case 3:
     121        BRANCH_FOR_MOD (3);
     122        break;
     123      }
     124  }