(root)/
gcc-13.2.0/
gcc/
testsuite/
gcc.target/
aarch64/
advsimd-intrinsics/
p64_p128.c
       1  /* This file contains tests for all the *p64 intrinsics, except for
       2     vreinterpret which have their own testcase.  */
       3  
       4  /* { dg-require-effective-target arm_crypto_ok { target { arm*-*-* } } } */
       5  /* { dg-add-options arm_crypto } */
       6  /* { dg-additional-options "-march=armv8-a+crypto" { target { aarch64*-*-* } } }*/
       7  
       8  #include <arm_neon.h>
       9  #include "arm-neon-ref.h"
      10  #include "compute-ref-data.h"
      11  
/* Expected results: vbsl.  Selection between the buffer contents and a
   VDUP'd constant; the operands only differ in their low 32 bits, so
   the merged lanes are 32-bit patterns.  */
VECT_VAR_DECL(vbsl_expected,poly,64,1) [] = { 0xfffffff1 };
VECT_VAR_DECL(vbsl_expected,poly,64,2) [] = { 0xfffffff1,
					      0xfffffff1 };

/* Expected results: vceq.  Comparisons on poly64 produce uint64
   all-ones (equal) / all-zeros (not equal) lane masks.  */
VECT_VAR_DECL(vceq_expected,uint,64,1) [] = { 0x0 };
VECT_VAR_DECL(vceq_expected,uint,64,2) [] = { 0x0, 0xffffffffffffffff };

/* Expected results: vceqz.  Lane 1 of the 128-bit input is forced to
   zero before the test, hence the all-ones mask there.  */
VECT_VAR_DECL(vceqz_expected,uint,64,1) [] = { 0x0 };
VECT_VAR_DECL(vceqz_expected,uint,64,2) [] = { 0x0, 0xffffffffffffffff };

/* Expected results: vcombine.  Low half comes from the input buffer,
   high half from a vector VDUP'd with 0x88.  */
VECT_VAR_DECL(vcombine_expected,poly,64,2) [] = { 0xfffffffffffffff0, 0x88 };

/* Expected results: vcreate.  Matches the literal passed to
   vcreate_p64 in the test body.  */
VECT_VAR_DECL(vcreate_expected,poly,64,1) [] = { 0x123456789abcdef0 };

/* Expected results: vdup_lane.  Lane 0 of the buffer replicated.  */
VECT_VAR_DECL(vdup_lane_expected,poly,64,1) [] = { 0xfffffffffffffff0 };
VECT_VAR_DECL(vdup_lane_expected,poly,64,2) [] = { 0xfffffffffffffff0,
						   0xfffffffffffffff0 };

/* Expected results: vdup_n.  One set per iteration of the test loop,
   which duplicates successive elements of buffer_dup.  */
VECT_VAR_DECL(vdup_n_expected0,poly,64,1) [] = { 0xfffffffffffffff0 };
VECT_VAR_DECL(vdup_n_expected0,poly,64,2) [] = { 0xfffffffffffffff0,
						 0xfffffffffffffff0 };
VECT_VAR_DECL(vdup_n_expected1,poly,64,1) [] = { 0xfffffffffffffff1 };
VECT_VAR_DECL(vdup_n_expected1,poly,64,2) [] = { 0xfffffffffffffff1,
						 0xfffffffffffffff1 };
VECT_VAR_DECL(vdup_n_expected2,poly,64,1) [] = { 0xfffffffffffffff2 };
VECT_VAR_DECL(vdup_n_expected2,poly,64,2) [] = { 0xfffffffffffffff2,
						 0xfffffffffffffff2 };

/* Expected results: vext.  The 128-bit case uses shift amount 1, so
   lane 1 of the first operand and lane 0 of the 0x88-filled second
   operand appear in the result.  */
VECT_VAR_DECL(vext_expected,poly,64,1) [] = { 0xfffffffffffffff0 };
VECT_VAR_DECL(vext_expected,poly,64,2) [] = { 0xfffffffffffffff1, 0x88 };
      50  
/* Expected results: vget_low.  */
VECT_VAR_DECL(vget_low_expected,poly,64,1) [] = { 0xfffffffffffffff0 };

/* Expected results: vget_high.  */
VECT_VAR_DECL(vget_high_expected,poly,64,1) [] = { 0xfffffffffffffff1 };

/* Expected results: vld1.  Straight copy of the input buffer.  */
VECT_VAR_DECL(vld1_expected,poly,64,1) [] = { 0xfffffffffffffff0 };
VECT_VAR_DECL(vld1_expected,poly,64,2) [] = { 0xfffffffffffffff0,
					      0xfffffffffffffff1 };

/* Expected results: vld1_dup.  One set per iteration of the test
   loop, which duplicates successive elements of buffer_dup.  */
VECT_VAR_DECL(vld1_dup_expected0,poly,64,1) [] = { 0xfffffffffffffff0 };
VECT_VAR_DECL(vld1_dup_expected0,poly,64,2) [] = { 0xfffffffffffffff0,
						   0xfffffffffffffff0 };
VECT_VAR_DECL(vld1_dup_expected1,poly,64,1) [] = { 0xfffffffffffffff1 };
VECT_VAR_DECL(vld1_dup_expected1,poly,64,2) [] = { 0xfffffffffffffff1,
						   0xfffffffffffffff1 };
VECT_VAR_DECL(vld1_dup_expected2,poly,64,1) [] = { 0xfffffffffffffff2 };
VECT_VAR_DECL(vld1_dup_expected2,poly,64,2) [] = { 0xfffffffffffffff2,
						   0xfffffffffffffff2 };

/* Expected results: vld1_lane.  Lane 0 is loaded from the buffer;
   the remaining lane keeps the 0xAA memset fill of the source
   vector.  */
VECT_VAR_DECL(vld1_lane_expected,poly,64,1) [] = { 0xfffffffffffffff0 };
VECT_VAR_DECL(vld1_lane_expected,poly,64,2) [] = { 0xfffffffffffffff0,
						   0xaaaaaaaaaaaaaaaa };

/* Expected results: vldX.  One declaration per register of the
   de-interleaved vldX result (chunk 0, 1, ...).  */
VECT_VAR_DECL(vld2_expected_0,poly,64,1) [] = { 0xfffffffffffffff0 };
VECT_VAR_DECL(vld2_expected_1,poly,64,1) [] = { 0xfffffffffffffff1 };
VECT_VAR_DECL(vld3_expected_0,poly,64,1) [] = { 0xfffffffffffffff0 };
VECT_VAR_DECL(vld3_expected_1,poly,64,1) [] = { 0xfffffffffffffff1 };
VECT_VAR_DECL(vld3_expected_2,poly,64,1) [] = { 0xfffffffffffffff2 };
VECT_VAR_DECL(vld4_expected_0,poly,64,1) [] = { 0xfffffffffffffff0 };
VECT_VAR_DECL(vld4_expected_1,poly,64,1) [] = { 0xfffffffffffffff1 };
VECT_VAR_DECL(vld4_expected_2,poly,64,1) [] = { 0xfffffffffffffff2 };
VECT_VAR_DECL(vld4_expected_3,poly,64,1) [] = { 0xfffffffffffffff3 };

/* Expected results: vldX_dup.  */
VECT_VAR_DECL(vld2_dup_expected_0,poly,64,1) [] = { 0xfffffffffffffff0 };
VECT_VAR_DECL(vld2_dup_expected_1,poly,64,1) [] = { 0xfffffffffffffff1 };
VECT_VAR_DECL(vld3_dup_expected_0,poly,64,1) [] = { 0xfffffffffffffff0 };
VECT_VAR_DECL(vld3_dup_expected_1,poly,64,1) [] = { 0xfffffffffffffff1 };
VECT_VAR_DECL(vld3_dup_expected_2,poly,64,1) [] = { 0xfffffffffffffff2 };
VECT_VAR_DECL(vld4_dup_expected_0,poly,64,1) [] = { 0xfffffffffffffff0 };
VECT_VAR_DECL(vld4_dup_expected_1,poly,64,1) [] = { 0xfffffffffffffff1 };
VECT_VAR_DECL(vld4_dup_expected_2,poly,64,1) [] = { 0xfffffffffffffff2 };
VECT_VAR_DECL(vld4_dup_expected_3,poly,64,1) [] = { 0xfffffffffffffff3 };
      99  
/* Expected results: vsli.  Shift-left-and-insert results; the
   "max_shift" variants use the largest legal immediate (63).  */
VECT_VAR_DECL(vsli_expected,poly,64,1) [] = { 0x10 };
VECT_VAR_DECL(vsli_expected,poly,64,2) [] = { 0x7ffffffffffff0,
					      0x7ffffffffffff1 };
VECT_VAR_DECL(vsli_expected_max_shift,poly,64,1) [] = { 0x7ffffffffffffff0 };
VECT_VAR_DECL(vsli_expected_max_shift,poly,64,2) [] = { 0xfffffffffffffff0,
							0xfffffffffffffff1 };

/* Expected results: vsri.  Shift-right-and-insert results; the
   "max_shift" variants use the largest legal immediate (64).  */
VECT_VAR_DECL(vsri_expected,poly,64,1) [] = { 0xe000000000000000 };
VECT_VAR_DECL(vsri_expected,poly,64,2) [] = { 0xfffffffffffff800,
					      0xfffffffffffff800 };
VECT_VAR_DECL(vsri_expected_max_shift,poly,64,1) [] = { 0xfffffffffffffff0 };
VECT_VAR_DECL(vsri_expected_max_shift,poly,64,2) [] = { 0xfffffffffffffff0,
							0xfffffffffffffff1 };

/* Expected results: vst1_lane.  Only lane 0 is stored; the rest of
   the result buffer keeps the CLEAN fill pattern.  */
VECT_VAR_DECL(vst1_lane_expected,poly,64,1) [] = { 0xfffffffffffffff0 };
VECT_VAR_DECL(vst1_lane_expected,poly,64,2) [] = { 0xfffffffffffffff0,
						   0x3333333333333333 };

/* Expected results: vget_lane.  Scalars, not arrays: vget_lane
   extracts a single poly64 element.  */
VECT_VAR_DECL(vget_lane_expected,poly,64,1) = 0xfffffffffffffff0;
VECT_VAR_DECL(vget_lane_expected,poly,64,2) = 0xfffffffffffffff0;

/* Expected results: vset_lane.  */
VECT_VAR_DECL(vset_lane_expected,poly,64,1) [] = { 0x88 };
VECT_VAR_DECL(vset_lane_expected,poly,64,2) [] = { 0xfffffffffffff0, 0x11 };

/* Expected results: vtst.  The 128-bit variant's expected values live
   in the __aarch64__-only section below.  */
VECT_VAR_DECL(vtst_expected,uint,64,1) [] = { 0x0 };
     131  
#ifdef __aarch64__
/* vmov_n, the vldX_lane tables and the 128-bit vtst expected values
   are only used by the AArch64-specific part of the test.  */

/* Expected results: vmov_n.  One set per iteration of the test loop,
   mirroring the vdup_n tables above.  */
VECT_VAR_DECL(vmov_n_expected0,poly,64,1) [] = { 0xfffffffffffffff0 };
VECT_VAR_DECL(vmov_n_expected0,poly,64,2) [] = { 0xfffffffffffffff0,
						 0xfffffffffffffff0 };
VECT_VAR_DECL(vmov_n_expected1,poly,64,1) [] = { 0xfffffffffffffff1 };
VECT_VAR_DECL(vmov_n_expected1,poly,64,2) [] = { 0xfffffffffffffff1,
						 0xfffffffffffffff1 };
VECT_VAR_DECL(vmov_n_expected2,poly,64,1) [] = { 0xfffffffffffffff2 };
VECT_VAR_DECL(vmov_n_expected2,poly,64,2) [] = { 0xfffffffffffffff2,
						 0xfffffffffffffff2 };

/* Expected results: vldX_lane.  Lanes not written by the lane-load
   keep the 0xAA fill pattern of the source vectors.  */
VECT_VAR_DECL(expected_vld_st2_0,poly,64,1) [] = { 0xfffffffffffffff0 };
VECT_VAR_DECL(expected_vld_st2_0,poly,64,2) [] = { 0xfffffffffffffff0,
						   0xfffffffffffffff1 };
VECT_VAR_DECL(expected_vld_st2_1,poly,64,1) [] = { 0xfffffffffffffff1 };
VECT_VAR_DECL(expected_vld_st2_1,poly,64,2) [] = { 0xaaaaaaaaaaaaaaaa,
						   0xaaaaaaaaaaaaaaaa };
VECT_VAR_DECL(expected_vld_st3_0,poly,64,1) [] = { 0xfffffffffffffff0 };
VECT_VAR_DECL(expected_vld_st3_0,poly,64,2) [] = { 0xfffffffffffffff0,
						   0xfffffffffffffff1 };
VECT_VAR_DECL(expected_vld_st3_1,poly,64,1) [] = { 0xfffffffffffffff1 };
VECT_VAR_DECL(expected_vld_st3_1,poly,64,2) [] = { 0xfffffffffffffff2,
						   0xaaaaaaaaaaaaaaaa };
VECT_VAR_DECL(expected_vld_st3_2,poly,64,1) [] = { 0xfffffffffffffff2 };
VECT_VAR_DECL(expected_vld_st3_2,poly,64,2) [] = { 0xaaaaaaaaaaaaaaaa,
						   0xaaaaaaaaaaaaaaaa };
VECT_VAR_DECL(expected_vld_st4_0,poly,64,1) [] = { 0xfffffffffffffff0 };
VECT_VAR_DECL(expected_vld_st4_0,poly,64,2) [] = { 0xfffffffffffffff0,
						   0xfffffffffffffff1 };
VECT_VAR_DECL(expected_vld_st4_1,poly,64,1) [] = { 0xfffffffffffffff1 };
VECT_VAR_DECL(expected_vld_st4_1,poly,64,2) [] = { 0xfffffffffffffff2,
						   0xfffffffffffffff3 };
VECT_VAR_DECL(expected_vld_st4_2,poly,64,1) [] = { 0xfffffffffffffff2 };
VECT_VAR_DECL(expected_vld_st4_2,poly,64,2) [] = { 0xaaaaaaaaaaaaaaaa,
						   0xaaaaaaaaaaaaaaaa };
VECT_VAR_DECL(expected_vld_st4_3,poly,64,1) [] = { 0xfffffffffffffff3 };
VECT_VAR_DECL(expected_vld_st4_3,poly,64,2) [] = { 0xaaaaaaaaaaaaaaaa,
						   0xaaaaaaaaaaaaaaaa };

/* Expected results: vtst (128-bit variant; 64-bit variant above).  */
VECT_VAR_DECL(vtst_expected,uint,64,2) [] = { 0x0, 0xffffffffffffffff };
#endif
     176  
     177  int main (void)
     178  {
     179    int i;
     180  
     181    /* vbsl_p64 tests.  */
     182  #define TEST_MSG "VBSL/VBSLQ"
     183  
     184  #define TEST_VBSL(T3, Q, T1, T2, W, N)					\
     185    VECT_VAR(vbsl_vector_res, T1, W, N) =					\
     186      vbsl##Q##_##T2##W(VECT_VAR(vbsl_vector_first, T3, W, N),		\
     187  		      VECT_VAR(vbsl_vector, T1, W, N),			\
     188  		      VECT_VAR(vbsl_vector2, T1, W, N));		\
     189    vst1##Q##_##T2##W(VECT_VAR(result, T1, W, N), VECT_VAR(vbsl_vector_res, T1, W, N))
     190  
     191    DECL_VARIABLE(vbsl_vector, poly, 64, 1);
     192    DECL_VARIABLE(vbsl_vector, poly, 64, 2);
     193    DECL_VARIABLE(vbsl_vector2, poly, 64, 1);
     194    DECL_VARIABLE(vbsl_vector2, poly, 64, 2);
     195    DECL_VARIABLE(vbsl_vector_res, poly, 64, 1);
     196    DECL_VARIABLE(vbsl_vector_res, poly, 64, 2);
     197  
     198    DECL_VARIABLE(vbsl_vector_first, uint, 64, 1);
     199    DECL_VARIABLE(vbsl_vector_first, uint, 64, 2);
     200  
     201    CLEAN(result, poly, 64, 1);
     202    CLEAN(result, poly, 64, 2);
     203  
     204    VLOAD(vbsl_vector, buffer, , poly, p, 64, 1);
     205    VLOAD(vbsl_vector, buffer, q, poly, p, 64, 2);
     206  
     207    VDUP(vbsl_vector2, , poly, p, 64, 1, 0xFFFFFFF3);
     208    VDUP(vbsl_vector2, q, poly, p, 64, 2, 0xFFFFFFF3);
     209  
     210    VDUP(vbsl_vector_first, , uint, u, 64, 1, 0xFFFFFFF2);
     211    VDUP(vbsl_vector_first, q, uint, u, 64, 2, 0xFFFFFFF2);
     212  
     213    TEST_VBSL(uint, , poly, p, 64, 1);
     214    TEST_VBSL(uint, q, poly, p, 64, 2);
     215  
     216    CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vbsl_expected, "");
     217    CHECK_POLY(TEST_MSG, poly, 64, 2, PRIx64, vbsl_expected, "");
     218  
     219    /* vceq_p64 tests. */
     220  #undef TEST_MSG
     221  #define TEST_MSG "VCEQ/VCEQQ"
     222  
     223  #define TEST_VCOMP1(INSN, Q, T1, T2, T3, W, N)				\
     224    VECT_VAR(vceq_vector_res, T3, W, N) =					\
     225      INSN##Q##_##T2##W(VECT_VAR(vceq_vector, T1, W, N),			\
     226  		      VECT_VAR(vceq_vector2, T1, W, N));		\
     227    vst1##Q##_u##W(VECT_VAR(result, T3, W, N), VECT_VAR(vceq_vector_res, T3, W, N))
     228  
     229  #define TEST_VCOMP(INSN, Q, T1, T2, T3, W, N)				\
     230    TEST_VCOMP1(INSN, Q, T1, T2, T3, W, N)
     231  
     232    DECL_VARIABLE(vceq_vector, poly, 64, 1);
     233    DECL_VARIABLE(vceq_vector2, poly, 64, 1);
     234    DECL_VARIABLE(vceq_vector_res, uint, 64, 1);
     235    DECL_VARIABLE(vceq_vector, poly, 64, 2);
     236    DECL_VARIABLE(vceq_vector2, poly, 64, 2);
     237    DECL_VARIABLE(vceq_vector_res, uint, 64, 2);
     238  
     239    CLEAN(result, uint, 64, 1);
     240    CLEAN(result, uint, 64, 2);
     241  
     242    VLOAD(vceq_vector, buffer, , poly, p, 64, 1);
     243    VLOAD(vceq_vector, buffer, q, poly, p, 64, 2);
     244  
     245    VDUP(vceq_vector2, , poly, p, 64, 1, 0x88);
     246    VSET_LANE(vceq_vector2, q, poly, p, 64, 2, 0, 0x88);
     247    VSET_LANE(vceq_vector2, q, poly, p, 64, 2, 1, 0xFFFFFFFFFFFFFFF1);
     248  
     249    TEST_VCOMP(vceq, , poly, p, uint, 64, 1);
     250    TEST_VCOMP(vceq, q, poly, p, uint, 64, 2);
     251  
     252    CHECK(TEST_MSG, uint, 64, 1, PRIx64, vceq_expected, "");
     253    CHECK(TEST_MSG, uint, 64, 2, PRIx64, vceq_expected, "");
     254  
     255    /* vceqz_p64 tests. */
     256  #undef TEST_MSG
     257  #define TEST_MSG "VCEQZ/VCEQZQ"
     258  
     259  #define TEST_VCOMPZ1(INSN, Q, T1, T2, T3, W, N)				\
     260    VECT_VAR(vceqz_vector_res, T3, W, N) =				\
     261      INSN##Q##_##T2##W(VECT_VAR(vceqz_vector, T1, W, N));		\
     262    vst1##Q##_u##W(VECT_VAR(result, T3, W, N), VECT_VAR(vceqz_vector_res, T3, W, N))
     263  
     264  #define TEST_VCOMPZ(INSN, Q, T1, T2, T3, W, N)				\
     265    TEST_VCOMPZ1(INSN, Q, T1, T2, T3, W, N)
     266  
     267    DECL_VARIABLE(vceqz_vector, poly, 64, 1);
     268    DECL_VARIABLE(vceqz_vector_res, uint, 64, 1);
     269    DECL_VARIABLE(vceqz_vector, poly, 64, 2);
     270    DECL_VARIABLE(vceqz_vector_res, uint, 64, 2);
     271  
     272    CLEAN(result, uint, 64, 1);
     273    CLEAN(result, uint, 64, 2);
     274  
     275    VLOAD(vceqz_vector, buffer, , poly, p, 64, 1);
     276    VLOAD(vceqz_vector, buffer, q, poly, p, 64, 2);
     277    VSET_LANE(vceqz_vector, q, poly, p, 64, 2, 1, 0);
     278  
     279    TEST_VCOMPZ(vceqz, , poly, p, uint, 64, 1);
     280    TEST_VCOMPZ(vceqz, q, poly, p, uint, 64, 2);
     281  
     282    CHECK(TEST_MSG, uint, 64, 1, PRIx64, vceqz_expected, "");
     283    CHECK(TEST_MSG, uint, 64, 2, PRIx64, vceqz_expected, "");
     284  
     285    /* vcombine_p64 tests.  */
     286  #undef TEST_MSG
     287  #define TEST_MSG "VCOMBINE"
     288  
     289  #define TEST_VCOMBINE(T1, T2, W, N, N2)					\
     290    VECT_VAR(vcombine_vector128, T1, W, N2) =				\
     291      vcombine_##T2##W(VECT_VAR(vcombine_vector64_a, T1, W, N),		\
     292  		     VECT_VAR(vcombine_vector64_b, T1, W, N));		\
     293    vst1q_##T2##W(VECT_VAR(result, T1, W, N2), VECT_VAR(vcombine_vector128, T1, W, N2))
     294  
     295    DECL_VARIABLE(vcombine_vector64_a, poly, 64, 1);
     296    DECL_VARIABLE(vcombine_vector64_b, poly, 64, 1);
     297    DECL_VARIABLE(vcombine_vector128, poly, 64, 2);
     298  
     299    CLEAN(result, poly, 64, 2);
     300  
     301    VLOAD(vcombine_vector64_a, buffer, , poly, p, 64, 1);
     302  
     303    VDUP(vcombine_vector64_b, , poly, p, 64, 1, 0x88);
     304  
     305    TEST_VCOMBINE(poly, p, 64, 1, 2);
     306  
     307    CHECK_POLY(TEST_MSG, poly, 64, 2, PRIx64, vcombine_expected, "");
     308  
     309    /* vcreate_p64 tests.  */
     310  #undef TEST_MSG
     311  #define TEST_MSG "VCREATE"
     312  
     313  #define TEST_VCREATE(T1, T2, W, N)					\
     314    VECT_VAR(vcreate_vector_res, T1, W, N) =				\
     315      vcreate_##T2##W(VECT_VAR(vcreate_val, T1, W, N));			\
     316    vst1_##T2##W(VECT_VAR(result, T1, W, N), VECT_VAR(vcreate_vector_res, T1, W, N))
     317  
     318  #define DECL_VAL(VAR, T1, W, N)			\
     319    uint64_t VECT_VAR(VAR, T1, W, N)
     320  
     321    DECL_VAL(vcreate_val, poly, 64, 1);
     322    DECL_VARIABLE(vcreate_vector_res, poly, 64, 1);
     323  
     324    CLEAN(result, poly, 64, 2);
     325  
     326    VECT_VAR(vcreate_val, poly, 64, 1) = 0x123456789abcdef0ULL;
     327  
     328    TEST_VCREATE(poly, p, 64, 1);
     329  
     330    CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vcreate_expected, "");
     331  
     332    /* vdup_lane_p64 tests.  */
     333  #undef TEST_MSG
     334  #define TEST_MSG "VDUP_LANE/VDUP_LANEQ"
     335  
     336  #define TEST_VDUP_LANE(Q, T1, T2, W, N, N2, L)				\
     337    VECT_VAR(vdup_lane_vector_res, T1, W, N) =				\
     338      vdup##Q##_lane_##T2##W(VECT_VAR(vdup_lane_vector, T1, W, N2), L);	\
     339    vst1##Q##_##T2##W(VECT_VAR(result, T1, W, N), VECT_VAR(vdup_lane_vector_res, T1, W, N))
     340  
     341    DECL_VARIABLE(vdup_lane_vector, poly, 64, 1);
     342    DECL_VARIABLE(vdup_lane_vector, poly, 64, 2);
     343    DECL_VARIABLE(vdup_lane_vector_res, poly, 64, 1);
     344    DECL_VARIABLE(vdup_lane_vector_res, poly, 64, 2);
     345  
     346    CLEAN(result, poly, 64, 1);
     347    CLEAN(result, poly, 64, 2);
     348  
     349    VLOAD(vdup_lane_vector, buffer, , poly, p, 64, 1);
     350  
     351    TEST_VDUP_LANE(, poly, p, 64, 1, 1, 0);
     352    TEST_VDUP_LANE(q, poly, p, 64, 2, 1, 0);
     353  
     354    CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vdup_lane_expected, "");
     355    CHECK_POLY(TEST_MSG, poly, 64, 2, PRIx64, vdup_lane_expected, "");
     356  
     357    /* vdup_n_p64 tests.  */
     358  #undef TEST_MSG
     359  #define TEST_MSG "VDUP/VDUPQ"
     360  
     361  #define TEST_VDUP(Q, T1, T2, W, N)					\
     362    VECT_VAR(vdup_n_vector, T1, W, N) =					\
     363      vdup##Q##_n_##T2##W(VECT_VAR(buffer_dup, T1, W, N)[i]);		\
     364    vst1##Q##_##T2##W(VECT_VAR(result, T1, W, N), VECT_VAR(vdup_n_vector, T1, W, N))
     365  
     366    DECL_VARIABLE(vdup_n_vector, poly, 64, 1);
     367    DECL_VARIABLE(vdup_n_vector, poly, 64, 2);
     368  
     369    /* Try to read different places from the input buffer.  */
     370    for (i=0; i< 3; i++) {
     371      CLEAN(result, poly, 64, 1);
     372      CLEAN(result, poly, 64, 2);
     373  
     374      TEST_VDUP(, poly, p, 64, 1);
     375      TEST_VDUP(q, poly, p, 64, 2);
     376  
     377      switch (i) {
     378      case 0:
     379        CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vdup_n_expected0, "");
     380        CHECK_POLY(TEST_MSG, poly, 64, 2, PRIx64, vdup_n_expected0, "");
     381        break;
     382      case 1:
     383        CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vdup_n_expected1, "");
     384        CHECK_POLY(TEST_MSG, poly, 64, 2, PRIx64, vdup_n_expected1, "");
     385        break;
     386      case 2:
     387        CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vdup_n_expected2, "");
     388        CHECK_POLY(TEST_MSG, poly, 64, 2, PRIx64, vdup_n_expected2, "");
     389        break;
     390      default:
     391        abort();
     392      }
     393    }
     394  
  /* vext_p64 tests.  */
     396  #undef TEST_MSG
     397  #define TEST_MSG "VEXT/VEXTQ"
     398  
     399  #define TEST_VEXT(Q, T1, T2, W, N, V)					\
     400    VECT_VAR(vext_vector_res, T1, W, N) =					\
     401      vext##Q##_##T2##W(VECT_VAR(vext_vector1, T1, W, N),			\
     402  		      VECT_VAR(vext_vector2, T1, W, N),			\
     403  		      V);						\
     404    vst1##Q##_##T2##W(VECT_VAR(result, T1, W, N), VECT_VAR(vext_vector_res, T1, W, N))
     405  
     406    DECL_VARIABLE(vext_vector1, poly, 64, 1);
     407    DECL_VARIABLE(vext_vector1, poly, 64, 2);
     408    DECL_VARIABLE(vext_vector2, poly, 64, 1);
     409    DECL_VARIABLE(vext_vector2, poly, 64, 2);
     410    DECL_VARIABLE(vext_vector_res, poly, 64, 1);
     411    DECL_VARIABLE(vext_vector_res, poly, 64, 2);
     412  
     413    CLEAN(result, poly, 64, 1);
     414    CLEAN(result, poly, 64, 2);
     415  
     416    VLOAD(vext_vector1, buffer, , poly, p, 64, 1);
     417    VLOAD(vext_vector1, buffer, q, poly, p, 64, 2);
     418  
     419    VDUP(vext_vector2, , poly, p, 64, 1, 0x88);
     420    VDUP(vext_vector2, q, poly, p, 64, 2, 0x88);
     421  
     422    TEST_VEXT(, poly, p, 64, 1, 0);
     423    TEST_VEXT(q, poly, p, 64, 2, 1);
     424  
     425    CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vext_expected, "");
     426    CHECK_POLY(TEST_MSG, poly, 64, 2, PRIx64, vext_expected, "");
     427  
     428    /* vget_low_p64 tests.  */
     429  #undef TEST_MSG
     430  #define TEST_MSG "VGET_LOW"
     431  
     432  #define TEST_VGET_LOW(T1, T2, W, N, N2)					\
     433    VECT_VAR(vget_low_vector64, T1, W, N) =				\
     434      vget_low_##T2##W(VECT_VAR(vget_low_vector128, T1, W, N2));		\
     435    vst1_##T2##W(VECT_VAR(result, T1, W, N), VECT_VAR(vget_low_vector64, T1, W, N))
     436  
     437    DECL_VARIABLE(vget_low_vector64, poly, 64, 1);
     438    DECL_VARIABLE(vget_low_vector128, poly, 64, 2);
     439  
     440    CLEAN(result, poly, 64, 1);
     441  
     442    VLOAD(vget_low_vector128, buffer, q, poly, p, 64, 2);
     443  
     444    TEST_VGET_LOW(poly, p, 64, 1, 2);
     445  
     446    CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vget_low_expected, "");
     447  
     448    /* vget_high_p64 tests.  */
     449  #undef TEST_MSG
     450  #define TEST_MSG "VGET_HIGH"
     451  
     452  #define TEST_VGET_HIGH(T1, T2, W, N, N2)					\
     453    VECT_VAR(vget_high_vector64, T1, W, N) =				\
     454      vget_high_##T2##W(VECT_VAR(vget_high_vector128, T1, W, N2));		\
     455    vst1_##T2##W(VECT_VAR(result, T1, W, N), VECT_VAR(vget_high_vector64, T1, W, N))
     456  
     457    DECL_VARIABLE(vget_high_vector64, poly, 64, 1);
     458    DECL_VARIABLE(vget_high_vector128, poly, 64, 2);
     459  
     460    CLEAN(result, poly, 64, 1);
     461  
     462    VLOAD(vget_high_vector128, buffer, q, poly, p, 64, 2);
     463  
     464    TEST_VGET_HIGH(poly, p, 64, 1, 2);
     465  
     466    CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vget_high_expected, "");
     467  
     468    /* vld1_p64 tests.  */
     469  #undef TEST_MSG
     470  #define TEST_MSG "VLD1/VLD1Q"
     471  
     472  #define TEST_VLD1(VAR, BUF, Q, T1, T2, W, N)				\
     473    VECT_VAR(VAR, T1, W, N) = vld1##Q##_##T2##W(VECT_VAR(BUF, T1, W, N)); \
     474    vst1##Q##_##T2##W(VECT_VAR(result, T1, W, N), VECT_VAR(VAR, T1, W, N))
     475  
     476    DECL_VARIABLE(vld1_vector, poly, 64, 1);
     477    DECL_VARIABLE(vld1_vector, poly, 64, 2);
     478  
     479    CLEAN(result, poly, 64, 1);
     480    CLEAN(result, poly, 64, 2);
     481  
     482    VLOAD(vld1_vector, buffer, , poly, p, 64, 1);
     483    VLOAD(vld1_vector, buffer, q, poly, p, 64, 2);
     484  
     485    TEST_VLD1(vld1_vector, buffer, , poly, p, 64, 1);
     486    TEST_VLD1(vld1_vector, buffer, q, poly, p, 64, 2);
     487  
     488    CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vld1_expected, "");
     489    CHECK_POLY(TEST_MSG, poly, 64, 2, PRIx64, vld1_expected, "");
     490  
     491    /* vld1_dup_p64 tests.  */
     492  #undef TEST_MSG
     493  #define TEST_MSG "VLD1_DUP/VLD1_DUPQ"
     494  
     495  #define TEST_VLD1_DUP(VAR, BUF, Q, T1, T2, W, N)			\
     496    VECT_VAR(VAR, T1, W, N) =						\
     497      vld1##Q##_dup_##T2##W(&VECT_VAR(BUF, T1, W, N)[i]);			\
     498    vst1##Q##_##T2##W(VECT_VAR(result, T1, W, N), VECT_VAR(VAR, T1, W, N))
     499  
     500    DECL_VARIABLE(vld1_dup_vector, poly, 64, 1);
     501    DECL_VARIABLE(vld1_dup_vector, poly, 64, 2);
     502  
     503    /* Try to read different places from the input buffer.  */
     504    for (i=0; i<3; i++) {
     505      CLEAN(result, poly, 64, 1);
     506      CLEAN(result, poly, 64, 2);
     507  
     508      TEST_VLD1_DUP(vld1_dup_vector, buffer_dup, , poly, p, 64, 1);
     509      TEST_VLD1_DUP(vld1_dup_vector, buffer_dup, q, poly, p, 64, 2);
     510  
     511      switch (i) {
     512      case 0:
     513        CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vld1_dup_expected0, "");
     514        CHECK_POLY(TEST_MSG, poly, 64, 2, PRIx64, vld1_dup_expected0, "");
     515        break;
     516      case 1:
     517        CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vld1_dup_expected1, "");
     518        CHECK_POLY(TEST_MSG, poly, 64, 2, PRIx64, vld1_dup_expected1, "");
     519        break;
     520      case 2:
     521        CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vld1_dup_expected2, "");
     522        CHECK_POLY(TEST_MSG, poly, 64, 2, PRIx64, vld1_dup_expected2, "");
     523        break;
     524      default:
     525        abort();
     526      }
     527    }
     528  
     529    /* vld1_lane_p64 tests.  */
     530  #undef TEST_MSG
     531  #define TEST_MSG "VLD1_LANE/VLD1_LANEQ"
     532  
     533  #define TEST_VLD1_LANE(Q, T1, T2, W, N, L)				\
     534    memset (VECT_VAR(vld1_lane_buffer_src, T1, W, N), 0xAA, W/8*N);	\
     535    VECT_VAR(vld1_lane_vector_src, T1, W, N) =				\
     536      vld1##Q##_##T2##W(VECT_VAR(vld1_lane_buffer_src, T1, W, N));	\
     537    VECT_VAR(vld1_lane_vector, T1, W, N) =				\
     538      vld1##Q##_lane_##T2##W(VECT_VAR(buffer, T1, W, N),			\
     539  			   VECT_VAR(vld1_lane_vector_src, T1, W, N), L); \
     540    vst1##Q##_##T2##W(VECT_VAR(result, T1, W, N), VECT_VAR(vld1_lane_vector, T1, W, N))
     541  
     542    DECL_VARIABLE(vld1_lane_vector, poly, 64, 1);
     543    DECL_VARIABLE(vld1_lane_vector, poly, 64, 2);
     544    DECL_VARIABLE(vld1_lane_vector_src, poly, 64, 1);
     545    DECL_VARIABLE(vld1_lane_vector_src, poly, 64, 2);
     546  
     547    ARRAY(vld1_lane_buffer_src, poly, 64, 1);
     548    ARRAY(vld1_lane_buffer_src, poly, 64, 2);
     549  
     550    CLEAN(result, poly, 64, 1);
     551    CLEAN(result, poly, 64, 2);
     552  
     553    TEST_VLD1_LANE(, poly, p, 64, 1, 0);
     554    TEST_VLD1_LANE(q, poly, p, 64, 2, 0);
     555  
     556    CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vld1_lane_expected, "");
     557    CHECK_POLY(TEST_MSG, poly, 64, 2, PRIx64, vld1_lane_expected, "");
     558  
     559    /* vldX_p64 tests.  */
     560  #define DECL_VLDX(T1, W, N, X)						\
     561    VECT_ARRAY_TYPE(T1, W, N, X) VECT_ARRAY_VAR(vldX_vector, T1, W, N, X); \
     562    VECT_VAR_DECL(vldX_result_bis_##X, T1, W, N)[X * N]
     563  
     564  #define TEST_VLDX(Q, T1, T2, W, N, X)					\
     565    VECT_ARRAY_VAR(vldX_vector, T1, W, N, X) =				\
     566      /* Use dedicated init buffer, of size X */				\
     567      vld##X##Q##_##T2##W(VECT_ARRAY_VAR(buffer_vld##X, T1, W, N, X));	\
     568    vst##X##Q##_##T2##W(VECT_VAR(vldX_result_bis_##X, T1, W, N),		\
     569  		      VECT_ARRAY_VAR(vldX_vector, T1, W, N, X));	\
     570    memcpy(VECT_VAR(result, T1, W, N), VECT_VAR(vldX_result_bis_##X, T1, W, N), \
     571  	 sizeof(VECT_VAR(result, T1, W, N)));
     572  
     573    /* Overwrite "result" with the contents of "result_bis"[Y].  */
     574  #define TEST_EXTRA_CHUNK(T1, W, N, X,Y)				\
     575    memcpy(VECT_VAR(result, T1, W, N),				\
     576  	 &(VECT_VAR(vldX_result_bis_##X, T1, W, N)[Y*N]),	\
     577  	 sizeof(VECT_VAR(result, T1, W, N)));
     578  
     579    DECL_VLDX(poly, 64, 1, 2);
     580    DECL_VLDX(poly, 64, 1, 3);
     581    DECL_VLDX(poly, 64, 1, 4);
     582  
     583    VECT_ARRAY_INIT2(buffer_vld2, poly, 64, 1);
     584    PAD(buffer_vld2_pad, poly, 64, 1);
     585    VECT_ARRAY_INIT3(buffer_vld3, poly, 64, 1);
     586    PAD(buffer_vld3_pad, poly, 64, 1);
     587    VECT_ARRAY_INIT4(buffer_vld4, poly, 64, 1);
     588    PAD(buffer_vld4_pad, poly, 64, 1);
     589  
     590  #undef TEST_MSG
     591  #define TEST_MSG "VLD2/VLD2Q"
     592    CLEAN(result, poly, 64, 1);
     593    TEST_VLDX(, poly, p, 64, 1, 2);
     594    CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vld2_expected_0, "chunk 0");
     595    CLEAN(result, poly, 64, 1);
     596    TEST_EXTRA_CHUNK(poly, 64, 1, 2, 1);
     597    CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vld2_expected_1, "chunk 1");
     598  
     599  #undef TEST_MSG
     600  #define TEST_MSG "VLD3/VLD3Q"
     601    CLEAN(result, poly, 64, 1);
     602    TEST_VLDX(, poly, p, 64, 1, 3);
     603    CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vld3_expected_0, "chunk 0");
     604    CLEAN(result, poly, 64, 1);
     605    TEST_EXTRA_CHUNK(poly, 64, 1, 3, 1);
     606    CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vld3_expected_1, "chunk 1");
     607    CLEAN(result, poly, 64, 1);
     608    TEST_EXTRA_CHUNK(poly, 64, 1, 3, 2);
     609    CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vld3_expected_2, "chunk 2");
     610  
     611  #undef TEST_MSG
     612  #define TEST_MSG "VLD4/VLD4Q"
     613    CLEAN(result, poly, 64, 1);
     614    TEST_VLDX(, poly, p, 64, 1, 4);
     615    CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vld4_expected_0, "chunk 0");
     616    CLEAN(result, poly, 64, 1);
     617    TEST_EXTRA_CHUNK(poly, 64, 1, 4, 1);
     618    CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vld4_expected_1, "chunk 1");
     619    CLEAN(result, poly, 64, 1);
     620    TEST_EXTRA_CHUNK(poly, 64, 1, 4, 2);
     621    CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vld4_expected_2, "chunk 2");
     622    CLEAN(result, poly, 64, 1);
     623    TEST_EXTRA_CHUNK(poly, 64, 1, 4, 3);
     624    CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vld4_expected_3, "chunk 3");
     625  
     626    /* vldX_dup_p64 tests.  */
     627  #define DECL_VLDX_DUP(T1, W, N, X)					\
     628    VECT_ARRAY_TYPE(T1, W, N, X) VECT_ARRAY_VAR(vldX_dup_vector, T1, W, N, X); \
     629    VECT_VAR_DECL(vldX_dup_result_bis_##X, T1, W, N)[X * N]
     630  
     631  #define TEST_VLDX_DUP(Q, T1, T2, W, N, X)				\
     632    VECT_ARRAY_VAR(vldX_dup_vector, T1, W, N, X) =			\
     633      vld##X##Q##_dup_##T2##W(&VECT_VAR(buffer_dup, T1, W, N)[0]);	\
     634      									\
     635    vst##X##Q##_##T2##W(VECT_VAR(vldX_dup_result_bis_##X, T1, W, N),	\
     636  		      VECT_ARRAY_VAR(vldX_dup_vector, T1, W, N, X));	\
     637    memcpy(VECT_VAR(result, T1, W, N), VECT_VAR(vldX_dup_result_bis_##X, T1, W, N), \
     638  	 sizeof(VECT_VAR(result, T1, W, N)));
     639  
     640    /* Overwrite "result" with the contents of "result_bis"[Y].  */
     641  #define TEST_VLDX_DUP_EXTRA_CHUNK(T1, W, N, X,Y)		\
     642    memcpy(VECT_VAR(result, T1, W, N),				\
     643  	 &(VECT_VAR(vldX_dup_result_bis_##X, T1, W, N)[Y*N]),	\
     644  	 sizeof(VECT_VAR(result, T1, W, N)));
     645  
  /* Working variables for the vld2/vld3/vld4_dup tests on poly64x1.  */
  DECL_VLDX_DUP(poly, 64, 1, 2);
  DECL_VLDX_DUP(poly, 64, 1, 3);
  DECL_VLDX_DUP(poly, 64, 1, 4);


#undef TEST_MSG
#define TEST_MSG "VLD2_DUP/VLD2Q_DUP"
  CLEAN(result, poly, 64, 1);
  TEST_VLDX_DUP(, poly, p, 64, 1, 2);
  CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vld2_dup_expected_0, "chunk 0");
  /* Check the remaining chunk of the stored group.  */
  CLEAN(result, poly, 64, 1);
  TEST_VLDX_DUP_EXTRA_CHUNK(poly, 64, 1, 2, 1);
  CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vld2_dup_expected_1, "chunk 1");

#undef TEST_MSG
#define TEST_MSG "VLD3_DUP/VLD3Q_DUP"
  CLEAN(result, poly, 64, 1);
  TEST_VLDX_DUP(, poly, p, 64, 1, 3);
  CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vld3_dup_expected_0, "chunk 0");
  CLEAN(result, poly, 64, 1);
  TEST_VLDX_DUP_EXTRA_CHUNK(poly, 64, 1, 3, 1);
  CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vld3_dup_expected_1, "chunk 1");
  CLEAN(result, poly, 64, 1);
  TEST_VLDX_DUP_EXTRA_CHUNK(poly, 64, 1, 3, 2);
  CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vld3_dup_expected_2, "chunk 2");

#undef TEST_MSG
#define TEST_MSG "VLD4_DUP/VLD4Q_DUP"
  CLEAN(result, poly, 64, 1);
  TEST_VLDX_DUP(, poly, p, 64, 1, 4);
  CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vld4_dup_expected_0, "chunk 0");
  CLEAN(result, poly, 64, 1);
  TEST_VLDX_DUP_EXTRA_CHUNK(poly, 64, 1, 4, 1);
  CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vld4_dup_expected_1, "chunk 1");
  CLEAN(result, poly, 64, 1);
  TEST_VLDX_DUP_EXTRA_CHUNK(poly, 64, 1, 4, 2);
  CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vld4_dup_expected_2, "chunk 2");
  CLEAN(result, poly, 64, 1);
  TEST_VLDX_DUP_EXTRA_CHUNK(poly, 64, 1, 4, 3);
  CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vld4_dup_expected_3, "chunk 3");
     686  
     687    /* vsli_p64 tests.  */
     688  #undef TEST_MSG
     689  #define TEST_MSG "VSLI"
     690  
/* Apply shift-insert intrinsic INSN (vsli/vsri) with immediate V to
   vsXi_vector/vsXi_vector2 and store the result into "result".  */
#define TEST_VSXI1(INSN, Q, T1, T2, W, N, V)				\
  VECT_VAR(vsXi_vector_res, T1, W, N) =					\
    INSN##Q##_n_##T2##W(VECT_VAR(vsXi_vector, T1, W, N),		\
		      VECT_VAR(vsXi_vector2, T1, W, N),			\
		      V);						\
  vst1##Q##_##T2##W(VECT_VAR(result, T1, W, N), VECT_VAR(vsXi_vector_res, T1, W, N))

/* Extra macro level so the arguments are expanded before
   token-pasting in TEST_VSXI1.  */
#define TEST_VSXI(INSN, Q, T1, T2, W, N, V)	\
  TEST_VSXI1(INSN, Q, T1, T2, W, N, V)
     700  
  /* Inputs/outputs shared by the vsli and vsri tests.  */
  DECL_VARIABLE(vsXi_vector, poly, 64, 1);
  DECL_VARIABLE(vsXi_vector, poly, 64, 2);
  DECL_VARIABLE(vsXi_vector2, poly, 64, 1);
  DECL_VARIABLE(vsXi_vector2, poly, 64, 2);
  DECL_VARIABLE(vsXi_vector_res, poly, 64, 1);
  DECL_VARIABLE(vsXi_vector_res, poly, 64, 2);

  CLEAN(result, poly, 64, 1);
  CLEAN(result, poly, 64, 2);

  VLOAD(vsXi_vector, buffer, , poly, p, 64, 1);
  VLOAD(vsXi_vector, buffer, q, poly, p, 64, 2);

  VDUP(vsXi_vector2, , poly, p, 64, 1, 2);
  VDUP(vsXi_vector2, q, poly, p, 64, 2, 3);

  TEST_VSXI(vsli, , poly, p, 64, 1, 3);
  TEST_VSXI(vsli, q, poly, p, 64, 2, 53);

  CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vsli_expected, "");
  CHECK_POLY(TEST_MSG, poly, 64, 2, PRIx64, vsli_expected, "");

  /* Test cases with maximum shift amount.  */
  /* For vsli_n the immediate range is 0..63 on 64-bit elements.  */
  CLEAN(result, poly, 64, 1);
  CLEAN(result, poly, 64, 2);

  TEST_VSXI(vsli, , poly, p, 64, 1, 63);
  TEST_VSXI(vsli, q, poly, p, 64, 2, 63);

#define COMMENT "(max shift amount)"
  CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vsli_expected_max_shift, COMMENT);
  CHECK_POLY(TEST_MSG, poly, 64, 2, PRIx64, vsli_expected_max_shift, COMMENT);

  /* vsri_p64 tests.  */
#undef TEST_MSG
#define TEST_MSG "VSRI"

  CLEAN(result, poly, 64, 1);
  CLEAN(result, poly, 64, 2);

  VLOAD(vsXi_vector, buffer, , poly, p, 64, 1);
  VLOAD(vsXi_vector, buffer, q, poly, p, 64, 2);

  VDUP(vsXi_vector2, , poly, p, 64, 1, 2);
  VDUP(vsXi_vector2, q, poly, p, 64, 2, 3);

  TEST_VSXI(vsri, , poly, p, 64, 1, 3);
  TEST_VSXI(vsri, q, poly, p, 64, 2, 53);

  CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vsri_expected, "");
  CHECK_POLY(TEST_MSG, poly, 64, 2, PRIx64, vsri_expected, "");

  /* Test cases with maximum shift amount.  */
  /* For vsri_n the immediate range is 1..64 on 64-bit elements.  */
  CLEAN(result, poly, 64, 1);
  CLEAN(result, poly, 64, 2);

  TEST_VSXI(vsri, , poly, p, 64, 1, 64);
  TEST_VSXI(vsri, q, poly, p, 64, 2, 64);

/* Identical redefinition of COMMENT is allowed by the standard.  */
#define COMMENT "(max shift amount)"
  CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vsri_expected_max_shift, COMMENT);
  CHECK_POLY(TEST_MSG, poly, 64, 2, PRIx64, vsri_expected_max_shift, COMMENT);
     763  
     764    /* vst1_lane_p64 tests.  */
     765  #undef TEST_MSG
     766  #define TEST_MSG "VST1_LANE/VST1_LANEQ"
     767  
/* Load a vector from "buffer", then store only lane L of it into
   "result" with vst1[q]_lane.  */
#define TEST_VST1_LANE(Q, T1, T2, W, N, L)				\
  VECT_VAR(vst1_lane_vector, T1, W, N) =				\
    vld1##Q##_##T2##W(VECT_VAR(buffer, T1, W, N));			\
  vst1##Q##_lane_##T2##W(VECT_VAR(result, T1, W, N),			\
			 VECT_VAR(vst1_lane_vector, T1, W, N), L);

  DECL_VARIABLE(vst1_lane_vector, poly, 64, 1);
  DECL_VARIABLE(vst1_lane_vector, poly, 64, 2);

  CLEAN(result, poly, 64, 1);
  CLEAN(result, poly, 64, 2);

  /* Store lane 0 for both vector widths.  */
  TEST_VST1_LANE(, poly, p, 64, 1, 0);
  TEST_VST1_LANE(q, poly, p, 64, 2, 0);

  CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vst1_lane_expected, "");
  CHECK_POLY(TEST_MSG, poly, 64, 2, PRIx64, vst1_lane_expected, "");
     785  
     786    /* vget_lane_p64 tests.  */
     787  #undef TEST_MSG
     788  #define TEST_MSG "VGET_LANE/VGETQ_LANE"
     789  
/* Extract lane L with vget[q]_lane into a scalar and compare it
   directly against the expected scalar; print a diagnostic and abort
   on mismatch (no "result" buffer involved here).  */
#define TEST_VGET_LANE(Q, T1, T2, W, N, L)				   \
  VECT_VAR(vget_lane_vector, T1, W, N) = vget##Q##_lane_##T2##W(VECT_VAR(vget_lane_vector1, T1, W, N), L); \
  if (VECT_VAR(vget_lane_vector, T1, W, N) != VECT_VAR(vget_lane_expected, T1, W, N)) {		\
    fprintf(stderr,							   \
	    "ERROR in %s (%s line %d in result '%s') at type %s "	   \
	    "got 0x%" PRIx##W " != 0x%" PRIx##W "\n",			   \
	    TEST_MSG, __FILE__, __LINE__,				   \
	    STR(VECT_VAR(vget_lane_expected, T1, W, N)),		   \
	    STR(VECT_NAME(T1, W, N)),					   \
	    VECT_VAR(vget_lane_vector, T1, W, N),			   \
	    VECT_VAR(vget_lane_expected, T1, W, N));			   \
    abort ();								   \
  }

  /* Initialize input values.  */
  DECL_VARIABLE(vget_lane_vector1, poly, 64, 1);
  DECL_VARIABLE(vget_lane_vector1, poly, 64, 2);

  VLOAD(vget_lane_vector1, buffer,  , poly, p, 64, 1);
  VLOAD(vget_lane_vector1, buffer, q, poly, p, 64, 2);

  /* Scalar destinations for the extracted lane values.  */
  VECT_VAR_DECL(vget_lane_vector, poly, 64, 1);
  VECT_VAR_DECL(vget_lane_vector, poly, 64, 2);

  TEST_VGET_LANE( , poly, p, 64, 1, 0);
  TEST_VGET_LANE(q, poly, p, 64, 2, 0);
     816  
     817  
     818    /* vset_lane_p64 tests.  */
     819  #undef TEST_MSG
     820  #define TEST_MSG "VSET_LANE/VSETQ_LANE"
     821  
/* Insert scalar V into lane L with vset[q]_lane, then store the
   whole vector to "result" for checking.  */
#define TEST_VSET_LANE(Q, T1, T2, W, N, V, L)				\
  VECT_VAR(vset_lane_vector, T1, W, N) =						\
    vset##Q##_lane_##T2##W(V,						\
			   VECT_VAR(vset_lane_vector, T1, W, N),			\
			   L);						\
  vst1##Q##_##T2##W(VECT_VAR(result, T1, W, N), VECT_VAR(vset_lane_vector, T1, W, N))
     828  
     829    /* Initialize input values.  */
     830    DECL_VARIABLE(vset_lane_vector, poly, 64, 1);
     831    DECL_VARIABLE(vset_lane_vector, poly, 64, 2);
     832  
     833    CLEAN(result, uint, 64, 1);
     834    CLEAN(result, uint, 64, 2);
     835  
     836    VLOAD(vset_lane_vector, buffer, , poly, p, 64, 1);
     837    VLOAD(vset_lane_vector, buffer, q, poly, p, 64, 2);
     838  
     839    /* Choose value and lane arbitrarily.  */
     840    TEST_VSET_LANE(, poly, p, 64, 1, 0x88, 0);
     841    TEST_VSET_LANE(q, poly, p, 64, 2, 0x11, 1);
     842  
     843    CHECK(TEST_MSG, poly, 64, 1, PRIx64, vset_lane_expected, "");
     844    CHECK(TEST_MSG, poly, 64, 2, PRIx64, vset_lane_expected, "");
     845  
     846  
     847    /* vtst_p64 tests.  */
     848  #undef TEST_MSG
     849  #define TEST_MSG "VTST"
     850    
/* vtst on poly inputs produces an unsigned result: compute it into
   vtst_vector_res (uint) and store it with vst1[q]_u##W.  */
#define TEST_VTST1(INSN, Q, T1, T2, W, N)			\
  VECT_VAR(vtst_vector_res, uint, W, N) =			\
    INSN##Q##_##T2##W(VECT_VAR(vtst_vector, T1, W, N),		\
		      VECT_VAR(vtst_vector2, T1, W, N));	\
    vst1##Q##_u##W(VECT_VAR(result, uint, W, N),		\
		   VECT_VAR(vtst_vector_res, uint, W, N))

/* Extra macro level so the arguments are expanded before
   token-pasting in TEST_VTST1.  */
#define TEST_VTST(INSN, Q, T1, T2, W, N)	\
  TEST_VTST1(INSN, Q, T1, T2, W, N)		\

  /* Initialize input values.  */
  DECL_VARIABLE(vtst_vector, poly, 64, 1);
  DECL_VARIABLE(vtst_vector2, poly, 64, 1);
  DECL_VARIABLE(vtst_vector_res, uint, 64, 1);

  CLEAN(result, uint, 64, 1);

  VLOAD(vtst_vector, buffer,  , poly, p, 64, 1);
  VDUP(vtst_vector2, , poly, p, 64, 1, 5);

  TEST_VTST(vtst, , poly, p, 64, 1);

  /* The result is unsigned, so plain CHECK (not CHECK_POLY) is used.  */
  CHECK(TEST_MSG, uint, 64, 1, PRIx64, vtst_expected, "");
     874  
     875    /* vtstq_p64 is supported by aarch64 only.  */
     876  #ifdef __aarch64__
  /* 128-bit variant of the vtst test above.  */
  DECL_VARIABLE(vtst_vector, poly, 64, 2);
  DECL_VARIABLE(vtst_vector2, poly, 64, 2);
  DECL_VARIABLE(vtst_vector_res, uint, 64, 2);
  CLEAN(result, uint, 64, 2);
  VLOAD(vtst_vector, buffer, q, poly, p, 64, 2);
  VDUP(vtst_vector2, q, poly, p, 64, 2, 5);
  TEST_VTST(vtst, q, poly, p, 64, 2);
  CHECK(TEST_MSG, uint, 64, 2, PRIx64, vtst_expected, "");

  /* vmov_n_p64 tests.  */
#undef TEST_MSG
#define TEST_MSG "VMOV/VMOVQ"

/* Broadcast element i of buffer_dup to all lanes with vmov[q]_n and
   store the vector to "result".  Uses the surrounding loop counter i
   (declared earlier in this function, outside this excerpt).  */
#define TEST_VMOV(Q, T1, T2, W, N)					\
  VECT_VAR(vmov_n_vector, T1, W, N) =					\
    vmov##Q##_n_##T2##W(VECT_VAR(buffer_dup, T1, W, N)[i]);		\
  vst1##Q##_##T2##W(VECT_VAR(result, T1, W, N), VECT_VAR(vmov_n_vector, T1, W, N))

  DECL_VARIABLE(vmov_n_vector, poly, 64, 1);
  DECL_VARIABLE(vmov_n_vector, poly, 64, 2);

  /* Try to read different places from the input buffer.  */
  for (i=0; i< 3; i++) {
    CLEAN(result, poly, 64, 1);
    CLEAN(result, poly, 64, 2);

    TEST_VMOV(, poly, p, 64, 1);
    TEST_VMOV(q, poly, p, 64, 2);

    /* Each source position has its own expected vector.  */
    switch (i) {
    case 0:
      CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vmov_n_expected0, "");
      CHECK_POLY(TEST_MSG, poly, 64, 2, PRIx64, vmov_n_expected0, "");
      break;
    case 1:
      CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vmov_n_expected1, "");
      CHECK_POLY(TEST_MSG, poly, 64, 2, PRIx64, vmov_n_expected1, "");
      break;
    case 2:
      CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vmov_n_expected2, "");
      CHECK_POLY(TEST_MSG, poly, 64, 2, PRIx64, vmov_n_expected2, "");
      break;
    default:
      abort();
    }
  }
     923  
     924    /* vldx_lane_p64 tests.  */
     925  #undef TEST_MSG
     926  #define TEST_MSG "VLDX_LANE/VLDXQ_LANE"
     927  
/* Dedicated input buffers for the vldX_lane/vstX_lane tests, one per
   interleave factor X = 2, 3, 4.  */
VECT_VAR_DECL_INIT(buffer_vld2_lane, poly, 64, 2);
VECT_VAR_DECL_INIT(buffer_vld3_lane, poly, 64, 3);
VECT_VAR_DECL_INIT(buffer_vld4_lane, poly, 64, 4);

  /* In this case, input variables are arrays of vectors.  */
#define DECL_VLD_STX_LANE(T1, W, N, X)					\
  VECT_ARRAY_TYPE(T1, W, N, X) VECT_ARRAY_VAR(vector, T1, W, N, X);	\
  VECT_ARRAY_TYPE(T1, W, N, X) VECT_ARRAY_VAR(vector_src, T1, W, N, X);	\
  VECT_VAR_DECL(result_bis_##X, T1, W, N)[X * N]
     937  
     938    /* We need to use a temporary result buffer (result_bis), because
     939       the one used for other tests is not large enough. A subset of the
     940       result data is moved from result_bis to result, and it is this
     941       subset which is used to check the actual behavior. The next
     942       macro enables to move another chunk of data from result_bis to
     943       result.  */
  /* We also use another extra input buffer (buffer_src), which we
     fill with 0xAA, and which is used to load a vector from which we
     read a given lane.  */
     947  
/* Fill buffer_src with 0xAA and load it as a group of X vectors,
   then use vldX[q]_lane to overwrite lane L of that group from the
   dedicated init buffer.  Store the group to result_bis and copy the
   first chunk into "result" for checking.  */
#define TEST_VLDX_LANE(Q, T1, T2, W, N, X, L)				\
  memset (VECT_VAR(buffer_src, T1, W, N), 0xAA,				\
	  sizeof(VECT_VAR(buffer_src, T1, W, N)));			\
									\
  VECT_ARRAY_VAR(vector_src, T1, W, N, X) =				\
    vld##X##Q##_##T2##W(VECT_VAR(buffer_src, T1, W, N));		\
									\
  VECT_ARRAY_VAR(vector, T1, W, N, X) =					\
    /* Use dedicated init buffer, of size X.  */			\
    vld##X##Q##_lane_##T2##W(VECT_VAR(buffer_vld##X##_lane, T1, W, X),	\
			     VECT_ARRAY_VAR(vector_src, T1, W, N, X),	\
			     L);					\
  vst##X##Q##_##T2##W(VECT_VAR(result_bis_##X, T1, W, N),		\
		      VECT_ARRAY_VAR(vector, T1, W, N, X));		\
  memcpy(VECT_VAR(result, T1, W, N), VECT_VAR(result_bis_##X, T1, W, N), \
	 sizeof(VECT_VAR(result, T1, W, N)))
     964  
     965    /* Overwrite "result" with the contents of "result_bis"[Y].  */
#undef TEST_EXTRA_CHUNK
#define TEST_EXTRA_CHUNK(T1, W, N, X, Y)		\
  memcpy(VECT_VAR(result, T1, W, N),			\
	 &(VECT_VAR(result_bis_##X, T1, W, N)[Y*N]),	\
	 sizeof(VECT_VAR(result, T1, W, N)));

  /* Add some padding to try to catch out of bound accesses.  */
/* ARRAY1 declares a one-element guard array; DUMMY_ARRAY declares
   the working buffer plus a trailing "_pad" guard right after it.  */
#define ARRAY1(V, T, W, N) VECT_VAR_DECL(V,T,W,N)[1]={42}
#define DUMMY_ARRAY(V, T, W, N, L) \
  VECT_VAR_DECL(V,T,W,N)[N*L]={0}; \
  ARRAY1(V##_pad,T,W,N)
     977  
/* Declare the vldX/vstX_lane working variables for both poly64x1
   and poly64x2.  */
#define DECL_ALL_VLD_STX_LANE(X)     \
  DECL_VLD_STX_LANE(poly, 64, 1, X); \
  DECL_VLD_STX_LANE(poly, 64, 2, X);

/* Run the vldX_lane test on lane 0 for both vector widths.  */
#define TEST_ALL_VLDX_LANE(X)		  \
  TEST_VLDX_LANE(, poly, p, 64, 1, X, 0); \
  TEST_VLDX_LANE(q, poly, p, 64, 2, X, 0);

/* Move chunk Y of result_bis into "result" for both widths.  */
#define TEST_ALL_EXTRA_CHUNKS(X,Y)	     \
  TEST_EXTRA_CHUNK(poly, 64, 1, X, Y) \
  TEST_EXTRA_CHUNK(poly, 64, 2, X, Y)

/* Check both poly64 widths of "result" against EXPECTED.  */
#define CHECK_RESULTS_VLD_STX_LANE(test_name,EXPECTED,comment)		\
  CHECK_POLY(test_name, poly, 64, 1, PRIx64, EXPECTED, comment);	\
  CHECK_POLY(test_name, poly, 64, 2, PRIx64, EXPECTED, comment);
     993  
  /* Declare the temporary buffers / variables.  */
  DECL_ALL_VLD_STX_LANE(2);
  DECL_ALL_VLD_STX_LANE(3);
  DECL_ALL_VLD_STX_LANE(4);

  /* Source buffers (with guard padding) large enough for 4 vectors.  */
  DUMMY_ARRAY(buffer_src, poly, 64, 1, 4);
  DUMMY_ARRAY(buffer_src, poly, 64, 2, 4);

  /* Check vld2_lane/vld2q_lane.  */
  clean_results ();
#undef TEST_MSG
#define TEST_MSG "VLD2_LANE/VLD2Q_LANE"
  TEST_ALL_VLDX_LANE(2);
  CHECK_RESULTS_VLD_STX_LANE (TEST_MSG, expected_vld_st2_0, " chunk 0");

  TEST_ALL_EXTRA_CHUNKS(2, 1);
  CHECK_RESULTS_VLD_STX_LANE (TEST_MSG, expected_vld_st2_1, " chunk 1");

  /* Check vld3_lane/vld3q_lane.  */
  clean_results ();
#undef TEST_MSG
#define TEST_MSG "VLD3_LANE/VLD3Q_LANE"
  TEST_ALL_VLDX_LANE(3);
  CHECK_RESULTS_VLD_STX_LANE (TEST_MSG, expected_vld_st3_0, " chunk 0");

  TEST_ALL_EXTRA_CHUNKS(3, 1);
  CHECK_RESULTS_VLD_STX_LANE (TEST_MSG, expected_vld_st3_1, " chunk 1");

  TEST_ALL_EXTRA_CHUNKS(3, 2);
  CHECK_RESULTS_VLD_STX_LANE (TEST_MSG, expected_vld_st3_2, " chunk 2");

  /* Check vld4_lane/vld4q_lane.  */
  clean_results ();
#undef TEST_MSG
#define TEST_MSG "VLD4_LANE/VLD4Q_LANE"
  TEST_ALL_VLDX_LANE(4);
  CHECK_RESULTS_VLD_STX_LANE (TEST_MSG, expected_vld_st4_0, " chunk 0");

  TEST_ALL_EXTRA_CHUNKS(4, 1);
  CHECK_RESULTS_VLD_STX_LANE (TEST_MSG, expected_vld_st4_1, " chunk 1");

  TEST_ALL_EXTRA_CHUNKS(4, 2);
  CHECK_RESULTS_VLD_STX_LANE (TEST_MSG, expected_vld_st4_2, " chunk 2");

  TEST_ALL_EXTRA_CHUNKS(4, 3);
  CHECK_RESULTS_VLD_STX_LANE (TEST_MSG, expected_vld_st4_3, " chunk 3");
    1040  
    1041    /* In this case, input variables are arrays of vectors.  */
/* Same declarations as DECL_VLD_STX_LANE.  NOTE(review): this macro
   appears unused below — the variables declared via
   DECL_ALL_VLD_STX_LANE above are reused by the vstX_lane tests;
   confirm before removing.  */
#define DECL_VSTX_LANE(T1, W, N, X)					\
  VECT_ARRAY_TYPE(T1, W, N, X) VECT_ARRAY_VAR(vector, T1, W, N, X);	\
  VECT_ARRAY_TYPE(T1, W, N, X) VECT_ARRAY_VAR(vector_src, T1, W, N, X);	\
  VECT_VAR_DECL(result_bis_##X, T1, W, N)[X * N]
    1046  
    1047    /* We need to use a temporary result buffer (result_bis), because
    1048       the one used for other tests is not large enough. A subset of the
    1049       result data is moved from result_bis to result, and it is this
    1050       subset which is used to check the actual behavior. The next
    1051       macro enables to move another chunk of data from result_bis to
    1052       result.  */
  /* We also use another extra input buffer (buffer_src), which we
     fill with 0xAA, and which is used to load a vector from which we
     read a given lane.  */
/* Like TEST_VLDX_LANE, but after loading lane L the group is stored
   back with vstX[q]_lane into result_bis (zeroed first), then the
   first chunk is copied into "result" for checking.  */
#define TEST_VSTX_LANE(Q, T1, T2, W, N, X, L)				 \
  memset (VECT_VAR(buffer_src, T1, W, N), 0xAA,				 \
	  sizeof(VECT_VAR(buffer_src, T1, W, N)));			 \
  memset (VECT_VAR(result_bis_##X, T1, W, N), 0,			 \
	  sizeof(VECT_VAR(result_bis_##X, T1, W, N)));			 \
									 \
  VECT_ARRAY_VAR(vector_src, T1, W, N, X) =				 \
    vld##X##Q##_##T2##W(VECT_VAR(buffer_src, T1, W, N));		 \
									 \
  VECT_ARRAY_VAR(vector, T1, W, N, X) =					 \
    /* Use dedicated init buffer, of size X.  */			 \
    vld##X##Q##_lane_##T2##W(VECT_VAR(buffer_vld##X##_lane, T1, W, X),	 \
			     VECT_ARRAY_VAR(vector_src, T1, W, N, X),	 \
			     L);					 \
  vst##X##Q##_lane_##T2##W(VECT_VAR(result_bis_##X, T1, W, N),		 \
			   VECT_ARRAY_VAR(vector, T1, W, N, X),		 \
			   L);						 \
  memcpy(VECT_VAR(result, T1, W, N), VECT_VAR(result_bis_##X, T1, W, N), \
	 sizeof(VECT_VAR(result, T1, W, N)));
    1075  
/* Run the vstX_lane test on lane 0 for both vector widths.  */
#define TEST_ALL_VSTX_LANE(X)		  \
  TEST_VSTX_LANE(, poly, p, 64, 1, X, 0); \
  TEST_VSTX_LANE(q, poly, p, 64, 2, X, 0);
    1079  
    1080    /* Check vst2_lane/vst2q_lane.  */
    1081    clean_results ();
    1082  #undef TEST_MSG
    1083  #define TEST_MSG "VST2_LANE/VST2Q_LANE"
    1084    TEST_ALL_VSTX_LANE(2);
    1085  
    1086  #define CMT " (chunk 0)"
    1087    CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, expected_vld_st2_0, CMT);
    1088  
    1089    TEST_ALL_EXTRA_CHUNKS(2, 1);
    1090  #undef CMT
    1091  #define CMT " chunk 1"
    1092    CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, expected_vld_st2_1, CMT);
    1093  
    1094    /* Check vst3_lane/vst3q_lane.  */
    1095    clean_results ();
    1096  #undef TEST_MSG
    1097  #define TEST_MSG "VST3_LANE/VST3Q_LANE"
    1098    TEST_ALL_VSTX_LANE(3);
    1099  
    1100  #undef CMT
    1101  #define CMT " (chunk 0)"
    1102    CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, expected_vld_st3_0, CMT);
    1103  
    1104    TEST_ALL_EXTRA_CHUNKS(3, 1);
    1105  
    1106  #undef CMT
    1107  #define CMT " (chunk 1)"
    1108    CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, expected_vld_st3_1, CMT);
    1109  
    1110    TEST_ALL_EXTRA_CHUNKS(3, 2);
    1111  
    1112  #undef CMT
    1113  #define CMT " (chunk 2)"
    1114    CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, expected_vld_st3_2, CMT);
    1115  
    1116    /* Check vst4_lane/vst4q_lane.  */
    1117    clean_results ();
    1118  #undef TEST_MSG
    1119  #define TEST_MSG "VST4_LANE/VST4Q_LANE"
    1120    TEST_ALL_VSTX_LANE(4);
    1121  
    1122  #undef CMT
    1123  #define CMT " (chunk 0)"
    1124    CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, expected_vld_st4_0, CMT);
    1125  
    1126    TEST_ALL_EXTRA_CHUNKS(4, 1);
    1127  
    1128  #undef CMT
    1129  #define CMT " (chunk 1)"
    1130    CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, expected_vld_st4_1, CMT);
    1131  
    1132    TEST_ALL_EXTRA_CHUNKS(4, 2);
    1133  
    1134  #undef CMT
    1135  #define CMT " (chunk 2)"
    1136    CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, expected_vld_st4_2, CMT);
    1137  
    1138    TEST_ALL_EXTRA_CHUNKS(4, 3);
    1139  
    1140  #undef CMT
    1141  #define CMT " (chunk 3)"
    1142    CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, expected_vld_st4_3, CMT);
    1143  
    1144  #endif /* __aarch64__.  */
    1145  
    1146    return 0;
    1147  }