/* { dg-do assemble { target aarch64_asm_sve_ok } } */
/* { dg-options "-O -msve-vector-bits=2048 -mlittle-endian --save-temps" } */
/* { dg-final { check-function-bodies "**" "" } } */
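
/* The shuffles below select the even-indexed elements of their inputs, so
   the expected code for vectors that only partially fill an SVE register
   is a single UZP1 on the wider .h, .s or .d containers, for both the
   one-operand and the two-operand forms.  */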

typedef unsigned char v128qi __attribute__((vector_size(128)));
typedef unsigned char v64qi __attribute__((vector_size(64)));
typedef unsigned char v32qi __attribute__((vector_size(32)));
typedef unsigned short v64hi __attribute__((vector_size(128)));
typedef unsigned short v32hi __attribute__((vector_size(64)));
typedef _Float16 v64hf __attribute__((vector_size(128)));
typedef _Float16 v32hf __attribute__((vector_size(64)));
typedef __bf16 v64bf __attribute__((vector_size(128)));
typedef __bf16 v32bf __attribute__((vector_size(64)));
typedef unsigned int v32si __attribute__((vector_size(128)));
typedef float v32sf __attribute__((vector_size(128)));

#define PERM0(B) B, B + 2
#define PERM1(B) PERM0 (B), PERM0 (B + 4)
#define PERM2(B) PERM1 (B), PERM1 (B + 8)
#define PERM3(B) PERM2 (B), PERM2 (B + 16)
#define PERM4(B) PERM3 (B), PERM3 (B + 32)
#define PERM5(B) PERM4 (B), PERM4 (B + 64)
#define PERM6(B) PERM5 (B), PERM5 (B + 128)
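
/* PERMn (B) expands to 2^(n + 1) indices B, B + 2, B + 4, ..., i.e. every
   other element starting at B.  PERM6 (0), PERM5 (0) and PERM4 (0)
   therefore select the even-indexed elements of 128-, 64- and 32-element
   shuffles respectively.  */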

/*
** qi_uzp1_h:
**	ptrue	(p[0-7])\.b, vl256
**	ld1b	(z[0-9]+)\.h, \1/z, \[x0\]
**	uzp1	(z[0-9]+)\.h, \2\.h, \2\.h
**	st1b	\3\.h, \1, \[x8\]
**	ret
*/
v128qi
qi_uzp1_h (v128qi x)
{
  return __builtin_shuffle (x, x, (v128qi) { PERM6 (0) });
}

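/* In the two-operand cases the operands may be loaded in either order,
   so the patterns below accept both.  */
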
/*
** qi_uzp1_h_two_op:
**	ptrue	(p[0-7])\.b, vl256
** (
**	ld1b	(z[0-9]+)\.h, \1/z, \[x1\]
**	ld1b	(z[0-9]+)\.h, \1/z, \[x0\]
**	uzp1	\3\.h, \3\.h, \2\.h
**	st1b	\3\.h, \1, \[x8\]
** |
**	ld1b	(z[0-9]+)\.h, \1/z, \[x0\]
**	ld1b	(z[0-9]+)\.h, \1/z, \[x1\]
**	uzp1	\4\.h, \4\.h, \5\.h
**	st1b	\4\.h, \1, \[x8\]
** )
**	ret
*/
v128qi
qi_uzp1_h_two_op (v128qi x, v128qi y)
{
  return __builtin_shuffle (x, y, (v128qi) { PERM6 (0) });
}

/*
** qi_uzp1_s:
**	ptrue	(p[0-7])\.b, vl256
**	ld1b	(z[0-9]+)\.s, \1/z, \[x0\]
**	uzp1	(z[0-9]+)\.s, \2\.s, \2\.s
**	st1b	\3\.s, \1, \[x8\]
**	ret
*/
v64qi
qi_uzp1_s (v64qi x)
{
  return __builtin_shuffle (x, x, (v64qi) { PERM5 (0) });
}

/*
** qi_uzp1_s_two_op:
**	ptrue	(p[0-7])\.b, vl256
** (
**	ld1b	(z[0-9]+)\.s, \1/z, \[x1\]
**	ld1b	(z[0-9]+)\.s, \1/z, \[x0\]
**	uzp1	\3\.s, \3\.s, \2\.s
**	st1b	\3\.s, \1, \[x8\]
** |
**	ld1b	(z[0-9]+)\.s, \1/z, \[x0\]
**	ld1b	(z[0-9]+)\.s, \1/z, \[x1\]
**	uzp1	\4\.s, \4\.s, \5\.s
**	st1b	\4\.s, \1, \[x8\]
** )
**	ret
*/
v64qi
qi_uzp1_s_two_op (v64qi x, v64qi y)
{
  return __builtin_shuffle (x, y, (v64qi) { PERM5 (0) });
}

/*
** qi_uzp1_d:
**	ptrue	(p[0-7])\.b, vl256
**	ld1b	(z[0-9]+)\.d, \1/z, \[x0\]
**	uzp1	(z[0-9]+)\.d, \2\.d, \2\.d
**	st1b	\3\.d, \1, \[x8\]
**	ret
*/
v32qi
qi_uzp1_d (v32qi x)
{
  return __builtin_shuffle (x, x, (v32qi) { PERM4 (0) });
}

/*
** qi_uzp1_d_two_op:
**	ptrue	(p[0-7])\.b, vl256
** (
**	ld1b	(z[0-9]+)\.d, \1/z, \[x1\]
**	ld1b	(z[0-9]+)\.d, \1/z, \[x0\]
**	uzp1	\3\.d, \3\.d, \2\.d
**	st1b	\3\.d, \1, \[x8\]
** |
**	ld1b	(z[0-9]+)\.d, \1/z, \[x0\]
**	ld1b	(z[0-9]+)\.d, \1/z, \[x1\]
**	uzp1	\4\.d, \4\.d, \5\.d
**	st1b	\4\.d, \1, \[x8\]
** )
**	ret
*/
v32qi
qi_uzp1_d_two_op (v32qi x, v32qi y)
{
  return __builtin_shuffle (x, y, (v32qi) { PERM4 (0) });
}

/*
** hi_uzp1_s:
**	ptrue	(p[0-7])\.b, vl256
**	ld1h	(z[0-9]+)\.s, \1/z, \[x0\]
**	uzp1	(z[0-9]+)\.s, \2\.s, \2\.s
**	st1h	\3\.s, \1, \[x8\]
**	ret
*/
v64hi
hi_uzp1_s (v64hi x)
{
  return __builtin_shuffle (x, x, (v64hi) { PERM5 (0) });
}

/*
** hi_uzp1_s_two_op:
**	ptrue	(p[0-7])\.b, vl256
** (
**	ld1h	(z[0-9]+)\.s, \1/z, \[x1\]
**	ld1h	(z[0-9]+)\.s, \1/z, \[x0\]
**	uzp1	\3\.s, \3\.s, \2\.s
**	st1h	\3\.s, \1, \[x8\]
** |
**	ld1h	(z[0-9]+)\.s, \1/z, \[x0\]
**	ld1h	(z[0-9]+)\.s, \1/z, \[x1\]
**	uzp1	\4\.s, \4\.s, \5\.s
**	st1h	\4\.s, \1, \[x8\]
** )
**	ret
*/
v64hi
hi_uzp1_s_two_op (v64hi x, v64hi y)
{
  return __builtin_shuffle (x, y, (v64hi) { PERM5 (0) });
}

/*
** hf_uzp1_s:
**	ptrue	(p[0-7])\.b, vl256
**	ld1h	(z[0-9]+)\.s, \1/z, \[x0\]
**	uzp1	(z[0-9]+)\.s, \2\.s, \2\.s
**	st1h	\3\.s, \1, \[x8\]
**	ret
*/
v64hf
hf_uzp1_s (v64hf x)
{
  return __builtin_shuffle (x, x, (v64hi) { PERM5 (0) });
}

/*
** hf_uzp1_s_two_op:
**	ptrue	(p[0-7])\.b, vl256
** (
**	ld1h	(z[0-9]+)\.s, \1/z, \[x1\]
**	ld1h	(z[0-9]+)\.s, \1/z, \[x0\]
**	uzp1	\3\.s, \3\.s, \2\.s
**	st1h	\3\.s, \1, \[x8\]
** |
**	ld1h	(z[0-9]+)\.s, \1/z, \[x0\]
**	ld1h	(z[0-9]+)\.s, \1/z, \[x1\]
**	uzp1	\4\.s, \4\.s, \5\.s
**	st1h	\4\.s, \1, \[x8\]
** )
**	ret
*/
v64hf
hf_uzp1_s_two_op (v64hf x, v64hf y)
{
  return __builtin_shuffle (x, y, (v64hi) { PERM5 (0) });
}

/*
** bf_uzp1_s:
**	ptrue	(p[0-7])\.b, vl256
**	ld1h	(z[0-9]+)\.s, \1/z, \[x0\]
**	uzp1	(z[0-9]+)\.s, \2\.s, \2\.s
**	st1h	\3\.s, \1, \[x8\]
**	ret
*/
v64bf
bf_uzp1_s (v64bf x)
{
  return __builtin_shuffle (x, x, (v64hi) { PERM5 (0) });
}

/*
** bf_uzp1_s_two_op:
**	ptrue	(p[0-7])\.b, vl256
** (
**	ld1h	(z[0-9]+)\.s, \1/z, \[x1\]
**	ld1h	(z[0-9]+)\.s, \1/z, \[x0\]
**	uzp1	\3\.s, \3\.s, \2\.s
**	st1h	\3\.s, \1, \[x8\]
** |
**	ld1h	(z[0-9]+)\.s, \1/z, \[x0\]
**	ld1h	(z[0-9]+)\.s, \1/z, \[x1\]
**	uzp1	\4\.s, \4\.s, \5\.s
**	st1h	\4\.s, \1, \[x8\]
** )
**	ret
*/
v64bf
bf_uzp1_s_two_op (v64bf x, v64bf y)
{
  return __builtin_shuffle (x, y, (v64hi) { PERM5 (0) });
}

/*
** hi_uzp1_d:
**	ptrue	(p[0-7])\.b, vl256
**	ld1h	(z[0-9]+)\.d, \1/z, \[x0\]
**	uzp1	(z[0-9]+)\.d, \2\.d, \2\.d
**	st1h	\3\.d, \1, \[x8\]
**	ret
*/
v32hi
hi_uzp1_d (v32hi x)
{
  return __builtin_shuffle (x, x, (v32hi) { PERM4 (0) });
}

/*
** hi_uzp1_d_two_op:
**	ptrue	(p[0-7])\.b, vl256
** (
**	ld1h	(z[0-9]+)\.d, \1/z, \[x1\]
**	ld1h	(z[0-9]+)\.d, \1/z, \[x0\]
**	uzp1	\3\.d, \3\.d, \2\.d
**	st1h	\3\.d, \1, \[x8\]
** |
**	ld1h	(z[0-9]+)\.d, \1/z, \[x0\]
**	ld1h	(z[0-9]+)\.d, \1/z, \[x1\]
**	uzp1	\4\.d, \4\.d, \5\.d
**	st1h	\4\.d, \1, \[x8\]
** )
**	ret
*/
v32hi
hi_uzp1_d_two_op (v32hi x, v32hi y)
{
  return __builtin_shuffle (x, y, (v32hi) { PERM4 (0) });
}

/*
** hf_uzp1_d:
**	ptrue	(p[0-7])\.b, vl256
**	ld1h	(z[0-9]+)\.d, \1/z, \[x0\]
**	uzp1	(z[0-9]+)\.d, \2\.d, \2\.d
**	st1h	\3\.d, \1, \[x8\]
**	ret
*/
v32hf
hf_uzp1_d (v32hf x)
{
  return __builtin_shuffle (x, x, (v32hi) { PERM4 (0) });
}

/*
** hf_uzp1_d_two_op:
**	ptrue	(p[0-7])\.b, vl256
** (
**	ld1h	(z[0-9]+)\.d, \1/z, \[x1\]
**	ld1h	(z[0-9]+)\.d, \1/z, \[x0\]
**	uzp1	\3\.d, \3\.d, \2\.d
**	st1h	\3\.d, \1, \[x8\]
** |
**	ld1h	(z[0-9]+)\.d, \1/z, \[x0\]
**	ld1h	(z[0-9]+)\.d, \1/z, \[x1\]
**	uzp1	\4\.d, \4\.d, \5\.d
**	st1h	\4\.d, \1, \[x8\]
** )
**	ret
*/
v32hf
hf_uzp1_d_two_op (v32hf x, v32hf y)
{
  return __builtin_shuffle (x, y, (v32hi) { PERM4 (0) });
}

/*
** bf_uzp1_d:
**	ptrue	(p[0-7])\.b, vl256
**	ld1h	(z[0-9]+)\.d, \1/z, \[x0\]
**	uzp1	(z[0-9]+)\.d, \2\.d, \2\.d
**	st1h	\3\.d, \1, \[x8\]
**	ret
*/
v32bf
bf_uzp1_d (v32bf x)
{
  return __builtin_shuffle (x, x, (v32hi) { PERM4 (0) });
}

/*
** bf_uzp1_d_two_op:
**	ptrue	(p[0-7])\.b, vl256
** (
**	ld1h	(z[0-9]+)\.d, \1/z, \[x1\]
**	ld1h	(z[0-9]+)\.d, \1/z, \[x0\]
**	uzp1	\3\.d, \3\.d, \2\.d
**	st1h	\3\.d, \1, \[x8\]
** |
**	ld1h	(z[0-9]+)\.d, \1/z, \[x0\]
**	ld1h	(z[0-9]+)\.d, \1/z, \[x1\]
**	uzp1	\4\.d, \4\.d, \5\.d
**	st1h	\4\.d, \1, \[x8\]
** )
**	ret
*/
v32bf
bf_uzp1_d_two_op (v32bf x, v32bf y)
{
  return __builtin_shuffle (x, y, (v32hi) { PERM4 (0) });
}

/*
** si_uzp1_d:
**	ptrue	(p[0-7])\.b, vl256
**	ld1w	(z[0-9]+)\.d, \1/z, \[x0\]
**	uzp1	(z[0-9]+)\.d, \2\.d, \2\.d
**	st1w	\3\.d, \1, \[x8\]
**	ret
*/
v32si
si_uzp1_d (v32si x)
{
  return __builtin_shuffle (x, x, (v32si) { PERM4 (0) });
}

/*
** sf_uzp1_d:
**	ptrue	(p[0-7])\.b, vl256
**	ld1w	(z[0-9]+)\.d, \1/z, \[x0\]
**	uzp1	(z[0-9]+)\.d, \2\.d, \2\.d
**	st1w	\3\.d, \1, \[x8\]
**	ret
*/
v32sf
sf_uzp1_d (v32sf x)
{
  return __builtin_shuffle (x, x, (v32si) { PERM4 (0) });
}