(root)/
glibc-2.38/
sysdeps/
x86_64/
fpu/
svml_sd_wrapper_impl.h
       1  /* Common float/double wrapper implementations of vector math
       2     functions.
       3     Copyright (C) 2022-2023 Free Software Foundation, Inc.
       4     This file is part of the GNU C Library.
       5  
       6     The GNU C Library is free software; you can redistribute it and/or
       7     modify it under the terms of the GNU Lesser General Public
       8     License as published by the Free Software Foundation; either
       9     version 2.1 of the License, or (at your option) any later version.
      10  
      11     The GNU C Library is distributed in the hope that it will be useful,
      12     but WITHOUT ANY WARRANTY; without even the implied warranty of
      13     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
      14     Lesser General Public License for more details.
      15  
      16     You should have received a copy of the GNU Lesser General Public
      17     License along with the GNU C Library; if not, see
      18     <https://www.gnu.org/licenses/>.  */
      19  
      20  /* AVX/AVX2 ISA version as wrapper to SSE ISA version.  */
      21  .macro WRAPPER_IMPL_AVX callee
      22  	pushq	%rbp
      23  	cfi_adjust_cfa_offset (8)
      24  	cfi_rel_offset (%rbp, 0)
      25  	movq	%rsp, %rbp
      26  	cfi_def_cfa_register (%rbp)
      27  	andq	$-32, %rsp
      28  	subq	$32, %rsp
      29  	vmovaps	%ymm0, (%rsp)
      30  	vzeroupper
      31  	call	HIDDEN_JUMPTARGET(\callee)
      32  	vmovaps	%xmm0, (%rsp)
      33  	vmovaps	16(%rsp), %xmm0
      34  	call	HIDDEN_JUMPTARGET(\callee)
      35  	/* combine xmm0 (return of second call) with result of first
      36  	   call (saved on stack). Might be worth exploring logic that
      37  	   uses `vpblend` and reads in ymm1 using -16(rsp).  */
      38  	vmovaps	(%rsp), %xmm1
      39  	vinsertf128 $1, %xmm0, %ymm1, %ymm0
      40  	movq	%rbp, %rsp
      41  	cfi_def_cfa_register (%rsp)
      42  	popq	%rbp
      43  	cfi_adjust_cfa_offset (-8)
      44  	cfi_restore (%rbp)
      45  	ret
      46  .endm
      47  
      48  /* 2 argument AVX/AVX2 ISA version as wrapper to SSE ISA version.  */
      49  .macro WRAPPER_IMPL_AVX_ff callee
      50  	pushq	%rbp
      51  	cfi_adjust_cfa_offset (8)
      52  	cfi_rel_offset (%rbp, 0)
      53  	movq	%rsp, %rbp
      54  	cfi_def_cfa_register (%rbp)
      55  	andq	$-32, %rsp
      56  	subq	$64, %rsp
      57  	vmovaps	%ymm0, (%rsp)
      58  	vmovaps	%ymm1, 32(%rsp)
      59  	vzeroupper
      60  	call	HIDDEN_JUMPTARGET(\callee)
      61  	vmovaps	48(%rsp), %xmm1
      62  	vmovaps	%xmm0, (%rsp)
      63  	vmovaps	16(%rsp), %xmm0
      64  	call	HIDDEN_JUMPTARGET(\callee)
      65  	/* combine xmm0 (return of second call) with result of first
      66  	   call (saved on stack). Might be worth exploring logic that
      67  	   uses `vpblend` and reads in ymm1 using -16(rsp).  */
      68  	vmovaps	(%rsp), %xmm1
      69  	vinsertf128 $1, %xmm0, %ymm1, %ymm0
      70  	movq	%rbp, %rsp
      71  	cfi_def_cfa_register (%rsp)
      72  	popq	%rbp
      73  	cfi_adjust_cfa_offset (-8)
      74  	cfi_restore (%rbp)
      75  	ret
      76  .endm
      77  
      78  /* 3 argument AVX/AVX2 ISA version as wrapper to SSE ISA version.  */
      79  .macro WRAPPER_IMPL_AVX_fFF callee
      80  	pushq	%rbp
      81  	cfi_adjust_cfa_offset (8)
      82  	cfi_rel_offset (%rbp, 0)
      83  	movq	%rsp, %rbp
      84  	andq	$-32, %rsp
      85  	subq	$32, %rsp
      86  	vmovaps	%ymm0, (%rsp)
      87  	pushq	%rbx
      88  	pushq	%r14
      89  	movq	%rdi, %rbx
      90  	movq	%rsi, %r14
      91  	vzeroupper
      92  	call	HIDDEN_JUMPTARGET(\callee)
      93  	vmovaps	32(%rsp), %xmm0
      94  	leaq	16(%rbx), %rdi
      95  	leaq	16(%r14), %rsi
      96  	call	HIDDEN_JUMPTARGET(\callee)
      97  	popq	%r14
      98  	popq	%rbx
      99  	movq	%rbp, %rsp
     100  	cfi_def_cfa_register (%rsp)
     101  	popq	%rbp
     102  	cfi_adjust_cfa_offset (-8)
     103  	cfi_restore (%rbp)
     104  	ret
     105  .endm
     106  
     107  /* AVX512 ISA version as wrapper to AVX2 ISA version.  */
     108  .macro WRAPPER_IMPL_AVX512 callee
     109  	pushq	%rbp
     110  	cfi_adjust_cfa_offset (8)
     111  	cfi_rel_offset (%rbp, 0)
     112  	movq	%rsp, %rbp
     113  	cfi_def_cfa_register (%rbp)
     114  	andq	$-64, %rsp
     115  	subq	$64, %rsp
     116  	vmovups	%zmm0, (%rsp)
     117  	call	HIDDEN_JUMPTARGET(\callee)
     118  	vmovupd	%ymm0, (%rsp)
     119  	vmovupd	32(%rsp), %ymm0
     120  	call	HIDDEN_JUMPTARGET(\callee)
     121  	/* combine ymm0 (return of second call) with result of first
     122  	   call (saved on stack).  */
     123  	vmovaps	(%rsp), %ymm1
     124  	vinserti64x4 $0x1, %ymm0, %zmm1, %zmm0
     125  	movq	%rbp, %rsp
     126  	cfi_def_cfa_register (%rsp)
     127  	popq	%rbp
     128  	cfi_adjust_cfa_offset (-8)
     129  	cfi_restore (%rbp)
     130  	ret
     131  .endm
     132  
     133  /* 2 argument AVX512 ISA version as wrapper to AVX2 ISA version.  */
     134  .macro WRAPPER_IMPL_AVX512_ff callee
     135  	pushq	%rbp
     136  	cfi_adjust_cfa_offset (8)
     137  	cfi_rel_offset (%rbp, 0)
     138  	movq	%rsp, %rbp
     139  	cfi_def_cfa_register (%rbp)
     140  	andq	$-64, %rsp
     141  	addq	$-128, %rsp
     142  	vmovups	%zmm0, (%rsp)
     143  	vmovups	%zmm1, 64(%rsp)
     144  	/* ymm0 and ymm1 are already set.  */
     145  	call	HIDDEN_JUMPTARGET(\callee)
     146  	vmovups	96(%rsp), %ymm1
     147  	vmovaps	%ymm0, (%rsp)
     148  	vmovups	32(%rsp), %ymm0
     149  	call	HIDDEN_JUMPTARGET(\callee)
     150  	/* combine ymm0 (return of second call) with result of first
     151  	   call (saved on stack).  */
     152  	vmovaps	(%rsp), %ymm1
     153  	vinserti64x4 $0x1, %ymm0, %zmm1, %zmm0
     154  	movq	%rbp, %rsp
     155  	cfi_def_cfa_register (%rsp)
     156  	popq	%rbp
     157  	cfi_adjust_cfa_offset (-8)
     158  	cfi_restore (%rbp)
     159  	ret
     160  .endm
     161  
     162  /* 3 argument AVX512 ISA version as wrapper to AVX2 ISA version.  */
     163  .macro WRAPPER_IMPL_AVX512_fFF callee
     164  	pushq	%rbp
     165  	cfi_adjust_cfa_offset (8)
     166  	cfi_rel_offset (%rbp, 0)
     167  	movq	%rsp, %rbp
     168  	cfi_def_cfa_register (%rbp)
     169  	andq	$-64, %rsp
     170  	subq	$64, %rsp
     171  	vmovaps	%zmm0, (%rsp)
     172  	pushq	%rbx
     173  	pushq	%r14
     174  	movq	%rdi, %rbx
     175  	movq	%rsi, %r14
     176  	/* ymm0 is already set.  */
     177  	call	HIDDEN_JUMPTARGET(\callee)
     178  	vmovaps	48(%rsp), %ymm0
     179  	leaq	32(%rbx), %rdi
     180  	leaq	32(%r14), %rsi
     181  	call	HIDDEN_JUMPTARGET(\callee)
     182  	popq	%r14
     183  	popq	%rbx
     184  	movq	%rbp, %rsp
     185  	cfi_def_cfa_register (%rsp)
     186  	popq	%rbp
     187  	cfi_adjust_cfa_offset (-8)
     188  	cfi_restore (%rbp)
     189  	ret
     190  .endm