/* { dg-do assemble { target aarch64_asm_sve_ok } } */
/* { dg-options "-O -msve-vector-bits=2048 -mlittle-endian --save-temps" } */
/* { dg-final { check-function-bodies "**" "" } } */
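/* Check that a __builtin_shuffle whose constant selector repeats a single
   index compiles to one DUP (indexed) on an unpacked SVE vector.  The
   2048-bit vectors are passed and returned by reference, so each function
   loads its argument from x0 and stores the result through x8; tests such
   as qi_dup_h_31, qi_dup_s_15 and qi_dup_d_7 use the highest index that
   DUP (indexed) accepts for the given element size.  */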

typedef unsigned char v128qi __attribute__((vector_size(128)));
typedef unsigned char v64qi __attribute__((vector_size(64)));
typedef unsigned char v32qi __attribute__((vector_size(32)));
typedef unsigned short v64hi __attribute__((vector_size(128)));
typedef unsigned short v32hi __attribute__((vector_size(64)));
typedef _Float16 v64hf __attribute__((vector_size(128)));
typedef _Float16 v32hf __attribute__((vector_size(64)));
typedef __bf16 v64bf __attribute__((vector_size(128)));
typedef __bf16 v32bf __attribute__((vector_size(64)));
typedef unsigned int v32si __attribute__((vector_size(128)));
typedef float v32sf __attribute__((vector_size(128)));

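/* PERMn (B) expands to 2**(n + 1) copies of B, so e.g. PERM6 (1) yields
   the 128 identical selector indices needed to broadcast lane 1 of a
   v128qi.  */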
#define PERM0(B) B, B
#define PERM1(B) PERM0 (B), PERM0 (B)
#define PERM2(B) PERM1 (B), PERM1 (B)
#define PERM3(B) PERM2 (B), PERM2 (B)
#define PERM4(B) PERM3 (B), PERM3 (B)
#define PERM5(B) PERM4 (B), PERM4 (B)
#define PERM6(B) PERM5 (B), PERM5 (B)

/*
** qi_dup_h_1:
**	ptrue	(p[0-7])\.b, vl256
**	ld1b	(z[0-9]+)\.h, \1/z, \[x0\]
**	dup	(z[0-9]+)\.h, \2\.h\[1\]
**	st1b	\3\.h, \1, \[x8\]
**	ret
*/
v128qi
qi_dup_h_1 (v128qi x)
{
  return __builtin_shuffle (x, x, (v128qi) { PERM6 (1) });
}

/*
** qi_dup_h_31:
**	ptrue	(p[0-7])\.b, vl256
**	ld1b	(z[0-9]+)\.h, \1/z, \[x0\]
**	dup	(z[0-9]+)\.h, \2\.h\[31\]
**	st1b	\3\.h, \1, \[x8\]
**	ret
*/
v128qi
qi_dup_h_31 (v128qi x)
{
  return __builtin_shuffle (x, x, (v128qi) { PERM6 (31) });
}

/*
** qi_dup_s_1:
**	ptrue	(p[0-7])\.b, vl256
**	ld1b	(z[0-9]+)\.s, \1/z, \[x0\]
**	dup	(z[0-9]+)\.s, \2\.s\[1\]
**	st1b	\3\.s, \1, \[x8\]
**	ret
*/
v64qi
qi_dup_s_1 (v64qi x)
{
  return __builtin_shuffle (x, x, (v64qi) { PERM5 (1) });
}

/*
** qi_dup_s_15:
**	ptrue	(p[0-7])\.b, vl256
**	ld1b	(z[0-9]+)\.s, \1/z, \[x0\]
**	dup	(z[0-9]+)\.s, \2\.s\[15\]
**	st1b	\3\.s, \1, \[x8\]
**	ret
*/
v64qi
qi_dup_s_15 (v64qi x)
{
  return __builtin_shuffle (x, x, (v64qi) { PERM5 (15) });
}

/*
** qi_dup_d_1:
**	ptrue	(p[0-7])\.b, vl256
**	ld1b	(z[0-9]+)\.d, \1/z, \[x0\]
**	dup	(z[0-9]+)\.d, \2\.d\[1\]
**	st1b	\3\.d, \1, \[x8\]
**	ret
*/
v32qi
qi_dup_d_1 (v32qi x)
{
  return __builtin_shuffle (x, x, (v32qi) { PERM4 (1) });
}

/*
** qi_dup_d_7:
**	ptrue	(p[0-7])\.b, vl256
**	ld1b	(z[0-9]+)\.d, \1/z, \[x0\]
**	dup	(z[0-9]+)\.d, \2\.d\[7\]
**	st1b	\3\.d, \1, \[x8\]
**	ret
*/
v32qi
qi_dup_d_7 (v32qi x)
{
  return __builtin_shuffle (x, x, (v32qi) { PERM4 (7) });
}

/*
** hi_dup_s_1:
**	ptrue	(p[0-7])\.b, vl256
**	ld1h	(z[0-9]+)\.s, \1/z, \[x0\]
**	dup	(z[0-9]+)\.s, \2\.s\[1\]
**	st1h	\3\.s, \1, \[x8\]
**	ret
*/
v64hi
hi_dup_s_1 (v64hi x)
{
  return __builtin_shuffle (x, x, (v64hi) { PERM5 (1) });
}

/*
** hi_dup_s_15:
**	ptrue	(p[0-7])\.b, vl256
**	ld1h	(z[0-9]+)\.s, \1/z, \[x0\]
**	dup	(z[0-9]+)\.s, \2\.s\[15\]
**	st1h	\3\.s, \1, \[x8\]
**	ret
*/
v64hi
hi_dup_s_15 (v64hi x)
{
  return __builtin_shuffle (x, x, (v64hi) { PERM5 (15) });
}

/*
** hf_dup_s_1:
**	ptrue	(p[0-7])\.b, vl256
**	ld1h	(z[0-9]+)\.s, \1/z, \[x0\]
**	dup	(z[0-9]+)\.s, \2\.s\[1\]
**	st1h	\3\.s, \1, \[x8\]
**	ret
*/
v64hf
hf_dup_s_1 (v64hf x)
{
  return __builtin_shuffle (x, x, (v64hi) { PERM5 (1) });
}

/*
** hf_dup_s_11:
**	ptrue	(p[0-7])\.b, vl256
**	ld1h	(z[0-9]+)\.s, \1/z, \[x0\]
**	dup	(z[0-9]+)\.s, \2\.s\[11\]
**	st1h	\3\.s, \1, \[x8\]
**	ret
*/
v64hf
hf_dup_s_11 (v64hf x)
{
  return __builtin_shuffle (x, x, (v64hi) { PERM5 (11) });
}

/*
** bf_dup_s_1:
**	ptrue	(p[0-7])\.b, vl256
**	ld1h	(z[0-9]+)\.s, \1/z, \[x0\]
**	dup	(z[0-9]+)\.s, \2\.s\[1\]
**	st1h	\3\.s, \1, \[x8\]
**	ret
*/
v64bf
bf_dup_s_1 (v64bf x)
{
  return __builtin_shuffle (x, x, (v64hi) { PERM5 (1) });
}

/*
** bf_dup_s_13:
**	ptrue	(p[0-7])\.b, vl256
**	ld1h	(z[0-9]+)\.s, \1/z, \[x0\]
**	dup	(z[0-9]+)\.s, \2\.s\[13\]
**	st1h	\3\.s, \1, \[x8\]
**	ret
*/
v64bf
bf_dup_s_13 (v64bf x)
{
  return __builtin_shuffle (x, x, (v64hi) { PERM5 (13) });
}

/*
** hi_dup_d_1:
**	ptrue	(p[0-7])\.b, vl256
**	ld1h	(z[0-9]+)\.d, \1/z, \[x0\]
**	dup	(z[0-9]+)\.d, \2\.d\[1\]
**	st1h	\3\.d, \1, \[x8\]
**	ret
*/
v32hi
hi_dup_d_1 (v32hi x)
{
  return __builtin_shuffle (x, x, (v32hi) { PERM4 (1) });
}

/*
** hi_dup_d_7:
**	ptrue	(p[0-7])\.b, vl256
**	ld1h	(z[0-9]+)\.d, \1/z, \[x0\]
**	dup	(z[0-9]+)\.d, \2\.d\[7\]
**	st1h	\3\.d, \1, \[x8\]
**	ret
*/
v32hi
hi_dup_d_7 (v32hi x)
{
  return __builtin_shuffle (x, x, (v32hi) { PERM4 (7) });
}

/*
** hf_dup_d_1:
**	ptrue	(p[0-7])\.b, vl256
**	ld1h	(z[0-9]+)\.d, \1/z, \[x0\]
**	dup	(z[0-9]+)\.d, \2\.d\[1\]
**	st1h	\3\.d, \1, \[x8\]
**	ret
*/
v32hf
hf_dup_d_1 (v32hf x)
{
  return __builtin_shuffle (x, x, (v32hi) { PERM4 (1) });
}

/*
** hf_dup_d_5:
**	ptrue	(p[0-7])\.b, vl256
**	ld1h	(z[0-9]+)\.d, \1/z, \[x0\]
**	dup	(z[0-9]+)\.d, \2\.d\[5\]
**	st1h	\3\.d, \1, \[x8\]
**	ret
*/
v32hf
hf_dup_d_5 (v32hf x)
{
  return __builtin_shuffle (x, x, (v32hi) { PERM4 (5) });
}

/*
** bf_dup_d_1:
**	ptrue	(p[0-7])\.b, vl256
**	ld1h	(z[0-9]+)\.d, \1/z, \[x0\]
**	dup	(z[0-9]+)\.d, \2\.d\[1\]
**	st1h	\3\.d, \1, \[x8\]
**	ret
*/
v32bf
bf_dup_d_1 (v32bf x)
{
  return __builtin_shuffle (x, x, (v32hi) { PERM4 (1) });
}

/*
** bf_dup_d_6:
**	ptrue	(p[0-7])\.b, vl256
**	ld1h	(z[0-9]+)\.d, \1/z, \[x0\]
**	dup	(z[0-9]+)\.d, \2\.d\[6\]
**	st1h	\3\.d, \1, \[x8\]
**	ret
*/
v32bf
bf_dup_d_6 (v32bf x)
{
  return __builtin_shuffle (x, x, (v32hi) { PERM4 (6) });
}

/*
** si_dup_d_1:
**	ptrue	(p[0-7])\.b, vl256
**	ld1w	(z[0-9]+)\.d, \1/z, \[x0\]
**	dup	(z[0-9]+)\.d, \2\.d\[1\]
**	st1w	\3\.d, \1, \[x8\]
**	ret
*/
v32si
si_dup_d_1 (v32si x)
{
  return __builtin_shuffle (x, x, (v32si) { PERM4 (1) });
}

/*
** si_dup_d_7:
**	ptrue	(p[0-7])\.b, vl256
**	ld1w	(z[0-9]+)\.d, \1/z, \[x0\]
**	dup	(z[0-9]+)\.d, \2\.d\[7\]
**	st1w	\3\.d, \1, \[x8\]
**	ret
*/
v32si
si_dup_d_7 (v32si x)
{
  return __builtin_shuffle (x, x, (v32si) { PERM4 (7) });
}

/*
** sf_dup_d_1:
**	ptrue	(p[0-7])\.b, vl256
**	ld1w	(z[0-9]+)\.d, \1/z, \[x0\]
**	dup	(z[0-9]+)\.d, \2\.d\[1\]
**	st1w	\3\.d, \1, \[x8\]
**	ret
*/
v32sf
sf_dup_d_1 (v32sf x)
{
  return __builtin_shuffle (x, x, (v32si) { PERM4 (1) });
}

/*
** sf_dup_d_7:
**	ptrue	(p[0-7])\.b, vl256
**	ld1w	(z[0-9]+)\.d, \1/z, \[x0\]
**	dup	(z[0-9]+)\.d, \2\.d\[7\]
**	st1w	\3\.d, \1, \[x8\]
**	ret
*/
v32sf
sf_dup_d_7 (v32sf x)
{
  return __builtin_shuffle (x, x, (v32si) { PERM4 (7) });
}