/* { dg-do assemble { target aarch64_asm_sve_ok } } */
/* { dg-options "-O -msve-vector-bits=2048 -mlittle-endian --save-temps" } */
/* { dg-final { check-function-bodies "**" "" } } */
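/* Test that __builtin_shuffle permutations which interleave the even
   elements of their operands are recognized as a single TRN1.  The
   vector types below are too big to be returned in registers, so each
   function stores its result through the address in x8.  */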

typedef unsigned char v128qi __attribute__((vector_size(128)));
typedef unsigned char v64qi __attribute__((vector_size(64)));
typedef unsigned char v32qi __attribute__((vector_size(32)));
typedef unsigned short v64hi __attribute__((vector_size(128)));
typedef unsigned short v32hi __attribute__((vector_size(64)));
typedef _Float16 v64hf __attribute__((vector_size(128)));
typedef _Float16 v32hf __attribute__((vector_size(64)));
typedef __bf16 v64bf __attribute__((vector_size(128)));
typedef __bf16 v32bf __attribute__((vector_size(64)));
typedef unsigned int v32si __attribute__((vector_size(128)));
typedef float v32sf __attribute__((vector_size(128)));

#define PERM0(B, C) B, B + C
#define PERM1(B, C) PERM0 (B, C), PERM0 (B + 2, C)
#define PERM2(B, C) PERM1 (B, C), PERM1 (B + 4, C)
#define PERM3(B, C) PERM2 (B, C), PERM2 (B + 8, C)
#define PERM4(B, C) PERM3 (B, C), PERM3 (B + 16, C)
#define PERM5(B, C) PERM4 (B, C), PERM4 (B + 32, C)
#define PERM6(B, C) PERM5 (B, C), PERM5 (B + 64, C)

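/* PERMn (B, C) expands to the 2^(n+1) shuffle indices B + 2k and
   B + 2k + C for k = 0, 1, 2, ....  With C equal to the number of
   elements, even result positions take the even elements of the first
   shuffle operand and odd positions the even elements of the second,
   which is exactly the TRN1 permutation.

   A 128-byte vector fills only half of a 256-byte SVE register, so its
   bytes are held unpacked in 16-bit containers (hence the extending
   ld1b and truncating st1b forms with .h elements) and the permutation
   is performed as a TRN1 on .h elements.  The _a, _b and _c variants
   encode the same permutation with indices taken only from the first
   operand, from both operands, and only from the second operand; since
   both operands are X, all three should produce the same TRN1.  */
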
/*
** qi_trn1_h_a:
**	ptrue	(p[0-7])\.b, vl256
**	ld1b	(z[0-9]+)\.h, \1/z, \[x0\]
**	trn1	(z[0-9]+)\.h, \2\.h, \2\.h
**	st1b	\3\.h, \1, \[x8\]
**	ret
*/
v128qi
qi_trn1_h_a (v128qi x)
{
  return __builtin_shuffle (x, x, (v128qi) { PERM6 (0, 0) });
}

/*
** qi_trn1_h_b:
**	ptrue	(p[0-7])\.b, vl256
**	ld1b	(z[0-9]+)\.h, \1/z, \[x0\]
**	trn1	(z[0-9]+)\.h, \2\.h, \2\.h
**	st1b	\3\.h, \1, \[x8\]
**	ret
*/
v128qi
qi_trn1_h_b (v128qi x)
{
  return __builtin_shuffle (x, x, (v128qi) { PERM6 (0, 128) });
}

/*
** qi_trn1_h_c:
**	ptrue	(p[0-7])\.b, vl256
**	ld1b	(z[0-9]+)\.h, \1/z, \[x0\]
**	trn1	(z[0-9]+)\.h, \2\.h, \2\.h
**	st1b	\3\.h, \1, \[x8\]
**	ret
*/
v128qi
qi_trn1_h_c (v128qi x)
{
  return __builtin_shuffle (x, x, (v128qi) { PERM6 (128, 0) });
}

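/* In the two-operand tests the two loads may be emitted in either
   order, so the expected body accepts both via the ( ... | ... )
   alternation.  Capture groups are numbered across the whole pattern,
   which is why the second alternative's loads are \4 and \5.  */
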
/*
** qi_trn1_h_two_op:
**	ptrue	(p[0-7])\.b, vl256
** (
**	ld1b	(z[0-9]+)\.h, \1/z, \[x1\]
**	ld1b	(z[0-9]+)\.h, \1/z, \[x0\]
**	trn1	\3\.h, \3\.h, \2\.h
**	st1b	\3\.h, \1, \[x8\]
** |
**	ld1b	(z[0-9]+)\.h, \1/z, \[x0\]
**	ld1b	(z[0-9]+)\.h, \1/z, \[x1\]
**	trn1	\4\.h, \4\.h, \5\.h
**	st1b	\4\.h, \1, \[x8\]
** )
**	ret
*/
v128qi
qi_trn1_h_two_op (v128qi x, v128qi y)
{
  return __builtin_shuffle (x, y, (v128qi) { PERM6 (0, 128) });
}

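/* The tests below repeat the same checks for byte, halfword and word
   data held in .s and .d containers, each time using the extending
   load and truncating store that match the container size (ld1b/st1b,
   ld1h/st1h, ld1w/st1w).  */
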
/*
** qi_trn1_s:
**	ptrue	(p[0-7])\.b, vl256
**	ld1b	(z[0-9]+)\.s, \1/z, \[x0\]
**	trn1	(z[0-9]+)\.s, \2\.s, \2\.s
**	st1b	\3\.s, \1, \[x8\]
**	ret
*/
v64qi
qi_trn1_s (v64qi x)
{
  return __builtin_shuffle (x, x, (v64qi) { PERM5 (0, 64) });
}

/*
** qi_trn1_s_two_op:
**	ptrue	(p[0-7])\.b, vl256
** (
**	ld1b	(z[0-9]+)\.s, \1/z, \[x1\]
**	ld1b	(z[0-9]+)\.s, \1/z, \[x0\]
**	trn1	\3\.s, \3\.s, \2\.s
**	st1b	\3\.s, \1, \[x8\]
** |
**	ld1b	(z[0-9]+)\.s, \1/z, \[x0\]
**	ld1b	(z[0-9]+)\.s, \1/z, \[x1\]
**	trn1	\4\.s, \4\.s, \5\.s
**	st1b	\4\.s, \1, \[x8\]
** )
**	ret
*/
v64qi
qi_trn1_s_two_op (v64qi x, v64qi y)
{
  return __builtin_shuffle (x, y, (v64qi) { PERM5 (0, 64) });
}

/*
** qi_trn1_d:
**	ptrue	(p[0-7])\.b, vl256
**	ld1b	(z[0-9]+)\.d, \1/z, \[x0\]
**	trn1	(z[0-9]+)\.d, \2\.d, \2\.d
**	st1b	\3\.d, \1, \[x8\]
**	ret
*/
v32qi
qi_trn1_d (v32qi x)
{
  return __builtin_shuffle (x, x, (v32qi) { PERM4 (0, 32) });
}

/*
** qi_trn1_d_two_op:
**	ptrue	(p[0-7])\.b, vl256
** (
**	ld1b	(z[0-9]+)\.d, \1/z, \[x1\]
**	ld1b	(z[0-9]+)\.d, \1/z, \[x0\]
**	trn1	\3\.d, \3\.d, \2\.d
**	st1b	\3\.d, \1, \[x8\]
** |
**	ld1b	(z[0-9]+)\.d, \1/z, \[x0\]
**	ld1b	(z[0-9]+)\.d, \1/z, \[x1\]
**	trn1	\4\.d, \4\.d, \5\.d
**	st1b	\4\.d, \1, \[x8\]
** )
**	ret
*/
v32qi
qi_trn1_d_two_op (v32qi x, v32qi y)
{
  return __builtin_shuffle (x, y, (v32qi) { PERM4 (0, 32) });
}

/*
** hi_trn1_s:
**	ptrue	(p[0-7])\.b, vl256
**	ld1h	(z[0-9]+)\.s, \1/z, \[x0\]
**	trn1	(z[0-9]+)\.s, \2\.s, \2\.s
**	st1h	\3\.s, \1, \[x8\]
**	ret
*/
v64hi
hi_trn1_s (v64hi x)
{
  return __builtin_shuffle (x, x, (v64hi) { PERM5 (0, 64) });
}

/*
** hi_trn1_s_two_op:
**	ptrue	(p[0-7])\.b, vl256
** (
**	ld1h	(z[0-9]+)\.s, \1/z, \[x1\]
**	ld1h	(z[0-9]+)\.s, \1/z, \[x0\]
**	trn1	\3\.s, \3\.s, \2\.s
**	st1h	\3\.s, \1, \[x8\]
** |
**	ld1h	(z[0-9]+)\.s, \1/z, \[x0\]
**	ld1h	(z[0-9]+)\.s, \1/z, \[x1\]
**	trn1	\4\.s, \4\.s, \5\.s
**	st1h	\4\.s, \1, \[x8\]
** )
**	ret
*/
v64hi
hi_trn1_s_two_op (v64hi x, v64hi y)
{
  return __builtin_shuffle (x, y, (v64hi) { PERM5 (0, 64) });
}

/*
** hf_trn1_s:
**	ptrue	(p[0-7])\.b, vl256
**	ld1h	(z[0-9]+)\.s, \1/z, \[x0\]
**	trn1	(z[0-9]+)\.s, \2\.s, \2\.s
**	st1h	\3\.s, \1, \[x8\]
**	ret
*/
v64hf
hf_trn1_s (v64hf x)
{
  return __builtin_shuffle (x, x, (v64hi) { PERM5 (0, 64) });
}

/*
** hf_trn1_s_two_op:
**	ptrue	(p[0-7])\.b, vl256
** (
**	ld1h	(z[0-9]+)\.s, \1/z, \[x1\]
**	ld1h	(z[0-9]+)\.s, \1/z, \[x0\]
**	trn1	\3\.s, \3\.s, \2\.s
**	st1h	\3\.s, \1, \[x8\]
** |
**	ld1h	(z[0-9]+)\.s, \1/z, \[x0\]
**	ld1h	(z[0-9]+)\.s, \1/z, \[x1\]
**	trn1	\4\.s, \4\.s, \5\.s
**	st1h	\4\.s, \1, \[x8\]
** )
**	ret
*/
v64hf
hf_trn1_s_two_op (v64hf x, v64hf y)
{
  return __builtin_shuffle (x, y, (v64hi) { PERM5 (0, 64) });
}

/*
** bf_trn1_s:
**	ptrue	(p[0-7])\.b, vl256
**	ld1h	(z[0-9]+)\.s, \1/z, \[x0\]
**	trn1	(z[0-9]+)\.s, \2\.s, \2\.s
**	st1h	\3\.s, \1, \[x8\]
**	ret
*/
v64bf
bf_trn1_s (v64bf x)
{
  return __builtin_shuffle (x, x, (v64hi) { PERM5 (0, 64) });
}

/*
** bf_trn1_s_two_op:
**	ptrue	(p[0-7])\.b, vl256
** (
**	ld1h	(z[0-9]+)\.s, \1/z, \[x1\]
**	ld1h	(z[0-9]+)\.s, \1/z, \[x0\]
**	trn1	\3\.s, \3\.s, \2\.s
**	st1h	\3\.s, \1, \[x8\]
** |
**	ld1h	(z[0-9]+)\.s, \1/z, \[x0\]
**	ld1h	(z[0-9]+)\.s, \1/z, \[x1\]
**	trn1	\4\.s, \4\.s, \5\.s
**	st1h	\4\.s, \1, \[x8\]
** )
**	ret
*/
v64bf
bf_trn1_s_two_op (v64bf x, v64bf y)
{
  return __builtin_shuffle (x, y, (v64hi) { PERM5 (0, 64) });
}

/*
** hi_trn1_d:
**	ptrue	(p[0-7])\.b, vl256
**	ld1h	(z[0-9]+)\.d, \1/z, \[x0\]
**	trn1	(z[0-9]+)\.d, \2\.d, \2\.d
**	st1h	\3\.d, \1, \[x8\]
**	ret
*/
v32hi
hi_trn1_d (v32hi x)
{
  return __builtin_shuffle (x, x, (v32hi) { PERM4 (0, 32) });
}

/*
** hi_trn1_d_two_op:
**	ptrue	(p[0-7])\.b, vl256
** (
**	ld1h	(z[0-9]+)\.d, \1/z, \[x1\]
**	ld1h	(z[0-9]+)\.d, \1/z, \[x0\]
**	trn1	\3\.d, \3\.d, \2\.d
**	st1h	\3\.d, \1, \[x8\]
** |
**	ld1h	(z[0-9]+)\.d, \1/z, \[x0\]
**	ld1h	(z[0-9]+)\.d, \1/z, \[x1\]
**	trn1	\4\.d, \4\.d, \5\.d
**	st1h	\4\.d, \1, \[x8\]
** )
**	ret
*/
v32hi
hi_trn1_d_two_op (v32hi x, v32hi y)
{
  return __builtin_shuffle (x, y, (v32hi) { PERM4 (0, 32) });
}

/*
** hf_trn1_d:
**	ptrue	(p[0-7])\.b, vl256
**	ld1h	(z[0-9]+)\.d, \1/z, \[x0\]
**	trn1	(z[0-9]+)\.d, \2\.d, \2\.d
**	st1h	\3\.d, \1, \[x8\]
**	ret
*/
v32hf
hf_trn1_d (v32hf x)
{
  return __builtin_shuffle (x, x, (v32hi) { PERM4 (0, 32) });
}

/*
** hf_trn1_d_two_op:
**	ptrue	(p[0-7])\.b, vl256
** (
**	ld1h	(z[0-9]+)\.d, \1/z, \[x1\]
**	ld1h	(z[0-9]+)\.d, \1/z, \[x0\]
**	trn1	\3\.d, \3\.d, \2\.d
**	st1h	\3\.d, \1, \[x8\]
** |
**	ld1h	(z[0-9]+)\.d, \1/z, \[x0\]
**	ld1h	(z[0-9]+)\.d, \1/z, \[x1\]
**	trn1	\4\.d, \4\.d, \5\.d
**	st1h	\4\.d, \1, \[x8\]
** )
**	ret
*/
v32hf
hf_trn1_d_two_op (v32hf x, v32hf y)
{
  return __builtin_shuffle (x, y, (v32hi) { PERM4 (0, 32) });
}

/*
** bf_trn1_d:
**	ptrue	(p[0-7])\.b, vl256
**	ld1h	(z[0-9]+)\.d, \1/z, \[x0\]
**	trn1	(z[0-9]+)\.d, \2\.d, \2\.d
**	st1h	\3\.d, \1, \[x8\]
**	ret
*/
v32bf
bf_trn1_d (v32bf x)
{
  return __builtin_shuffle (x, x, (v32hi) { PERM4 (0, 32) });
}

/*
** bf_trn1_d_two_op:
**	ptrue	(p[0-7])\.b, vl256
** (
**	ld1h	(z[0-9]+)\.d, \1/z, \[x1\]
**	ld1h	(z[0-9]+)\.d, \1/z, \[x0\]
**	trn1	\3\.d, \3\.d, \2\.d
**	st1h	\3\.d, \1, \[x8\]
** |
**	ld1h	(z[0-9]+)\.d, \1/z, \[x0\]
**	ld1h	(z[0-9]+)\.d, \1/z, \[x1\]
**	trn1	\4\.d, \4\.d, \5\.d
**	st1h	\4\.d, \1, \[x8\]
** )
**	ret
*/
v32bf
bf_trn1_d_two_op (v32bf x, v32bf y)
{
  return __builtin_shuffle (x, y, (v32hi) { PERM4 (0, 32) });
}

/*
** si_trn1_d:
**	ptrue	(p[0-7])\.b, vl256
**	ld1w	(z[0-9]+)\.d, \1/z, \[x0\]
**	trn1	(z[0-9]+)\.d, \2\.d, \2\.d
**	st1w	\3\.d, \1, \[x8\]
**	ret
*/
v32si
si_trn1_d (v32si x)
{
  return __builtin_shuffle (x, x, (v32si) { PERM4 (0, 32) });
}

/*
** sf_trn1_d:
**	ptrue	(p[0-7])\.b, vl256
**	ld1w	(z[0-9]+)\.d, \1/z, \[x0\]
**	trn1	(z[0-9]+)\.d, \2\.d, \2\.d
**	st1w	\3\.d, \1, \[x8\]
**	ret
*/
v32sf
sf_trn1_d (v32sf x)
{
  return __builtin_shuffle (x, x, (v32si) { PERM4 (0, 32) });
}