/* Common vector helpers and macros for IBM z13 and later

Copyright 2021 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */

#ifndef __S390_64_Z13_COMMON_VEC_H
#define __S390_64_Z13_COMMON_VEC_H

#include <unistd.h>
#include <vecintrin.h>

/*
 * Vector intrinsics use vector element types that kind-of make sense for the
 * specific operation (e.g., vec_permi permutes doublewords). To use VRs
 * interchangeably with different intrinsics, typedef the two variants and wrap
 * them in a union.
 */
#define VLEN_BYTES 16
typedef unsigned long long v2di __attribute__ ((vector_size (VLEN_BYTES)));
typedef unsigned char v16qi __attribute__ ((vector_size (VLEN_BYTES)));

/*
 * The Z vector intrinsics use vectors with different element types (e.g.,
 * v16qi for the 128-bit adds and v2di for vec_permi).
 */
union vec
{
  v2di dw;
  v16qi sw;
};

typedef union vec vec_t;
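
/*
 * For illustration only: a value held in one VR can feed both kinds of
 * intrinsics through the two union members, e.g. (a, b and sum being
 * placeholder vec_t variables)
 *
 *   sum.sw = vec_add_u128 (a.sw, b.sw);      128-bit add wants v16qi
 *   sum.dw = vec_permi (sum.dw, sum.dw, 2);  vec_permi wants v2di
 */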

/*
 * single-instruction combine of two GPRs into a VR
 */
static inline v2di
vec_load_2di_as_pair (unsigned long a, unsigned long b)
{
  v2di res;
  __asm__("vlvgp\t%0,%1,%2" : "=v"(res) : "r"(a), "r"(b));
  return res;
}
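
/*
 * Typical use: combine two limbs that already sit in GPRs into one VR
 * (doubleword element 0 receives the first argument, element 1 the second)
 * without a round trip through memory.  The names below are placeholders:
 *
 *   vec_t carry;
 *   carry.dw = vec_load_2di_as_pair (hi_limb, lo_limb);
 */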

/*
 * 64x64 mult where caller needs to care about proper register allocation:
 * multiply xl with m1, treating both as unsigned, and place the result in
 * xh:xl.
 * mlgr operates on register pairs, so xh must be an even gpr followed by xl
 */
#define s390_umul_ppmm(xh, xl, m1) \
  do \
    { \
      asm("mlgr\t%0,%3" : "=r"(xh), "=r"(xl) : "%1"(xl), "r"(m1)); \
    } \
  while (0);
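
/*
 * Because mlgr writes an even/odd register pair, callers usually pin the
 * operands with explicit register variables; r10/r11 below are only an
 * example of such a pair:
 *
 *   register mp_limb_t hi asm ("r10");
 *   register mp_limb_t lo asm ("r11");
 *   lo = u;
 *   s390_umul_ppmm (hi, lo, v);    now hi:lo holds the 128-bit product u * v
 */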

/*
 * two 64x64 multiplications, scheduled so that they will dispatch and issue to
 * different sides: each mlgr is dispatched alone in an instruction group and
 * subsequent groups will issue on different execution sides.
 * there is a variant where both products use the same multiplicand and one
 * that uses two different multiplicands. constraints from s390_umul_ppmm apply
 * here.
 */
#define s390_double_umul_ppmm(X0H, X0L, X1H, X1L, MX) \
  do \
    { \
      asm("mlgr\t%[x0h],%[mx]\n\t" \
          "mlgr\t%[x1h],%[mx]" \
          : [x0h] "=&r"(X0H), [x0l] "=&r"(X0L), [x1h] "=r"(X1H), \
            [x1l] "=r"(X1L) \
          : "[x0l]"(X0L), "[x1l]"(X1L), [mx] "r"(MX)); \
    } \
  while (0);

#define s390_double_umul_ppmm_distinct(X0H, X0L, X1H, X1L, MX0, MX1) \
  do \
    { \
      asm("mlgr\t%[x0h],%[mx0]\n\t" \
          "mlgr\t%[x1h],%[mx1]" \
          : [x0h] "=&r"(X0H), [x0l] "=&r"(X0L), [x1h] "=r"(X1H), \
            [x1l] "=r"(X1L) \
          : "[x0l]"(X0L), "[x1l]"(X1L), [mx0] "r"(MX0), [mx1] "r"(MX1)); \
    } \
  while (0);
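
/*
 * Sketch of computing two independent products of the same multiplicand;
 * the register numbers are purely illustrative, but each low half must live
 * in the odd register that follows its high half:
 *
 *   register mp_limb_t p0h asm ("r10");
 *   register mp_limb_t p0l asm ("r11");
 *   register mp_limb_t p1h asm ("r12");
 *   register mp_limb_t p1l asm ("r13");
 *   p0l = up[0];
 *   p1l = up[1];
 *   s390_double_umul_ppmm (p0h, p0l, p1h, p1l, v);
 *   so that p0h:p0l = up[0] * v and p1h:p1l = up[1] * v.
 */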

/*
 * Load one 64-bit limb into a GPR with an explicit lg, either from
 * BASE + OFFSET (ASM_LOADGPR_BASE) or from BASE + INDEX + OFFSET
 * (ASM_LOADGPR).
 */
#define ASM_LOADGPR_BASE(DST, BASE, OFFSET) \
  asm volatile("lg\t%[r],%[off](%[b])" \
               : [r] "=r"(DST) \
               : [b] "a"(BASE), [off] "L"(OFFSET) \
               : "memory");

#define ASM_LOADGPR(DST, BASE, INDEX, OFFSET) \
  asm volatile("lg\t%[r],%[off](%[b],%[x])" \
               : [r] "=r"(DST) \
               : [b] "a"(BASE), [x] "a"(INDEX), [off] "L"(OFFSET) \
               : "memory");
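
/*
 * E.g., fetching two consecutive limbs of an operand with explicit loads
 * (up is an illustrative source pointer; offsets are in bytes):
 *
 *   mp_limb_t u0, u1;
 *   ASM_LOADGPR_BASE (u0, up, 0);
 *   ASM_LOADGPR_BASE (u1, up, 8);
 */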

/*
 * Load a vector register from memory and swap the two 64-bit doubleword
 * elements.
 */
static inline vec_t
vec_load_elements_reversed_idx (mp_limb_t const *base, ssize_t const index,
                                ssize_t const offset)
{
  vec_t res;
  char *ptr = (char *)base;

  res.sw = *(v16qi *)(ptr + index + offset);
  res.dw = vec_permi (res.dw, res.dw, 2);

  return res;
}

static inline vec_t
vec_load_elements_reversed (mp_limb_t const *base, ssize_t const offset)
{
  return vec_load_elements_reversed_idx (base, 0, offset);
}
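
/*
 * After such a load, the limb at the lower address sits in doubleword
 * element 1 and the following limb in element 0.  A hypothetical caller
 * reading the limb pair rp[0], rp[1]:
 *
 *   vec_t lo_pair = vec_load_elements_reversed (rp, 0);
 */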

/*
 * Store a vector register to memory and swap the two 64-bit doubleword
 * elements.
 */
static inline void
vec_store_elements_reversed_idx (mp_limb_t *base, ssize_t const index,
                                 ssize_t const offset, vec_t vec)
{
  char *ptr = (char *)base;

  vec.dw = vec_permi (vec.dw, vec.dw, 2);
  *(v16qi *)(ptr + index + offset) = vec.sw;
}

static inline void
vec_store_elements_reversed (mp_limb_t *base, ssize_t const offset, vec_t vec)
{
  vec_store_elements_reversed_idx (base, 0, offset, vec);
}
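
/*
 * Together with the reversed loads above this gives a read-modify-write of a
 * limb pair that keeps the element order consistent; rp and carry are
 * placeholders for a caller's pointer and 128-bit addend:
 *
 *   vec_t t = vec_load_elements_reversed (rp, 0);
 *   t.sw = vec_add_u128 (t.sw, carry.sw);
 *   vec_store_elements_reversed (rp, 0, t);
 */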

/*
 * Set VEC to all zeroes with a single vzero.
 */
#define ASM_VZERO(VEC) \
  do \
    { \
      asm("vzero\t%[vec]" : [vec] "=v"(VEC)); \
    } \
  while (0)

#endif