(root)/
gcc-13.2.0/
gcc/
testsuite/
gcc.target/
aarch64/
advsimd-intrinsics/
bf16_vstN_lane_1.c
       1  /* { dg-do run { target { aarch64*-*-* } } } */
       2  /* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
       3  /* { dg-add-options arm_v8_2a_bf16_neon }  */
       4  
       5  #include <arm_neon.h>
       6  #include "arm-neon-ref.h"
       7  #include "compute-ref-data.h"
       8  
       9  /* Expected results for vst2, chunk 0: the bit patterns 0xABAB and
             0x3210 assigned in exec_vstX_lane, followed by zeros.  */
      10  VECT_VAR_DECL(expected_st2_0,hbfloat,16,4) [] = { 0xABAB, 0x3210, 0x0, 0x0 };
      11  VECT_VAR_DECL(expected_st2_0,hbfloat,16,8) [] = { 0xABAB, 0x3210, 0x0, 0x0,
      12  						  0x0, 0x0, 0x0, 0x0 };
      13  
      14  /* Expected results for vst2, chunk 1: all zeros, i.e. the value the
             scratch buffer is filled with before the store.  */
      15  VECT_VAR_DECL(expected_st2_1,hbfloat,16,4) [] = { 0x0, 0x0, 0x0, 0x0 };
      16  VECT_VAR_DECL(expected_st2_1,hbfloat,16,8) [] = { 0x0, 0x0, 0x0, 0x0,
      17  						  0x0, 0x0, 0x0, 0x0 };
      18  
      19  /* Expected results for vst3, chunk 0: the bit patterns 0xABAB, 0x3210
             and 0xCAFE assigned in exec_vstX_lane, followed by zeros.  */
      20  VECT_VAR_DECL(expected_st3_0,hbfloat,16,4) [] = { 0xABAB, 0x3210, 0xCAFE, 0x0 };
      21  VECT_VAR_DECL(expected_st3_0,hbfloat,16,8) [] = { 0xABAB, 0x3210, 0xCAFE, 0x0,
      22  						  0x0, 0x0, 0x0, 0x0 };
      23  
      24  /* Expected results for vst3, chunk 1: all zeros, i.e. the value the
             scratch buffer is filled with before the store.  */
      25  VECT_VAR_DECL(expected_st3_1,hbfloat,16,4) [] = { 0x0, 0x0, 0x0, 0x0 };
      26  VECT_VAR_DECL(expected_st3_1,hbfloat,16,8) [] = { 0x0, 0x0, 0x0, 0x0,
      27  						 0x0, 0x0, 0x0, 0x0 };
      28  
      29  /* Expected results for vst3, chunk 2: all zeros, as for chunk 1.  */
      30  VECT_VAR_DECL(expected_st3_2,hbfloat,16,4) [] = { 0x0, 0x0, 0x0, 0x0 };
      31  VECT_VAR_DECL(expected_st3_2,hbfloat,16,8) [] = { 0x0, 0x0, 0x0, 0x0,
      32  						  0x0, 0x0, 0x0, 0x0 };
      33  
      34  /* Expected results for vst4, chunk 0: the four bit patterns assigned
             in exec_vstX_lane (0xABAB, 0x3210, 0xCAFE, 0x1234); the 8-element
             variant is then padded with zeros.  */
      35  VECT_VAR_DECL(expected_st4_0,hbfloat,16,4) [] =
      36    { 0xABAB, 0x3210, 0xCAFE, 0x1234 };
      37  VECT_VAR_DECL(expected_st4_0,hbfloat,16,8) [] =
      38    { 0xABAB, 0x3210, 0xCAFE, 0x1234, 0x0, 0x0, 0x0, 0x0 };
      39  
      40  /* Expected results for vst4, chunk 1: all zeros, i.e. the value the
             scratch buffer is filled with before the store.  */
      41  VECT_VAR_DECL(expected_st4_1,hbfloat,16,4) [] = { 0x0, 0x0, 0x0, 0x0 };
      42  VECT_VAR_DECL(expected_st4_1,hbfloat,16,8) [] = { 0x0, 0x0, 0x0, 0x0,
      43  						  0x0, 0x0, 0x0, 0x0 };
      44  
      45  /* Expected results for vst4, chunk 2: all zeros, as for chunk 1.  */
      46  VECT_VAR_DECL(expected_st4_2,hbfloat,16,4) [] = { 0x0, 0x0, 0x0, 0x0 };
      47  VECT_VAR_DECL(expected_st4_2,hbfloat,16,8) [] = { 0x0, 0x0, 0x0, 0x0,
      48  					          0x0, 0x0, 0x0, 0x0 };
      49  
      50  /* Expected results for vst4, chunk 3: all zeros, as for chunk 1.  */
      51  VECT_VAR_DECL(expected_st4_3,hbfloat,16,4) [] = { 0x0, 0x0, 0x0, 0x0 };
      52  VECT_VAR_DECL(expected_st4_3,hbfloat,16,8) [] = { 0x0, 0x0, 0x0, 0x0,
      53  						  0x0, 0x0, 0x0, 0x0 };
      54  
      55  typedef union
      56  {
            /* Overlays a bfloat16_t with a uint16_t.  exec_vstX_lane writes
               the u16 member and reads the bf16 member, so that bfloat16
               values can be built from raw 16-bit patterns.  */
      57    bfloat16_t bf16;
      58    uint16_t u16;
      59  } bfloat16_u_t;
      60  
      61  static bfloat16_t result_bfloat16x4[4];	/* 4-element result buffer.  */
      62  static bfloat16_t result_bfloat16x8[8];	/* 8-element result buffer.  */
          /* NOTE(review): presumably these are the objects named by
             VECT_VAR(result, bfloat, 16, 4) and VECT_VAR(result, bfloat, 16, 8)
             in exec_vstX_lane; confirm against the macro definitions in
             arm-neon-ref.h.  */
      63  
      64  void exec_vstX_lane (void)
      65  {
            /* For X in {2, 3, 4}: load one lane with vldX[q]_lane_bf16, store
               it with vstX[q]_lane_bf16 into a zeroed scratch buffer, then hand
               each chunk of that buffer to CHECK_FP together with the matching
               expected_stX_<chunk> table.  */
      66    bfloat16_u_t bfloat16_data[4];
            /* Raw 16-bit patterns; they are read back below through the bf16
               member to initialize the bfloat16 buffers.  */
      67    bfloat16_data[0].u16 = 0xABAB;
      68    bfloat16_data[1].u16 = 0x3210;
      69    bfloat16_data[2].u16 = 0xCAFE;
      70    bfloat16_data[3].u16 = 0x1234;
      71  
            /* Init buffers holding 2, 3 and 4 elements.  NOTE(review):
               presumably these are what
               VECT_VAR(buffer_vld##X##_lane, T1, W, X) names inside
               TEST_VSTX_LANE; confirm against arm-neon-ref.h.  */
      72    bfloat16_t buffer_vld2_lane_bfloat16x2 [2] =
      73      { bfloat16_data[0].bf16,
      74        bfloat16_data[1].bf16 };
      75    bfloat16_t buffer_vld3_lane_bfloat16x3 [3] =
      76      { bfloat16_data[0].bf16,
      77        bfloat16_data[1].bf16,
      78        bfloat16_data[2].bf16 };
      79    bfloat16_t buffer_vld4_lane_bfloat16x4 [4] =
      80      { bfloat16_data[0].bf16,
      81        bfloat16_data[1].bf16,
      82        bfloat16_data[2].bf16,
      83        bfloat16_data[3].bf16 };
      84  
      85    /* In this case, input variables are arrays of vectors.  Each use
               also declares result_bis_X, a scratch buffer of X * N
               elements.  */
      86  #define DECL_VSTX_LANE(T1, W, N, X)					\
      87    VECT_ARRAY_TYPE(T1, W, N, X) VECT_ARRAY_VAR(vector, T1, W, N, X);	\
      88    VECT_ARRAY_TYPE(T1, W, N, X) VECT_ARRAY_VAR(vector_src, T1, W, N, X);	\
      89    VECT_VAR_DECL(result_bis_##X, T1, W, N)[X * N]
      90  
      91    /* We need to use a temporary result buffer (result_bis), because
      92       the one used for other tests is not large enough.  A subset of
      93       the result data is copied from result_bis to result, and it is
      94       this subset which is used to check the actual behavior.  The
      95       next macro makes it possible to copy another chunk of data from
      96       result_bis to result.  */
      97    /* We also use another extra input buffer (buffer_src), which we
      98       fill with 0xAA, and which is used to load a vector from which we
      99       read a given lane.  */
     100  #define TEST_VSTX_LANE(Q, T1, T2, W, N, X, L)				 \
     101    memset (VECT_VAR(buffer_src, T1, W, N), 0xAA,				 \
     102  	  sizeof(VECT_VAR(buffer_src, T1, W, N)));			 \
     103    memset (VECT_VAR(result_bis_##X, T1, W, N), 0,			 \
     104  	  sizeof(VECT_VAR(result_bis_##X, T1, W, N)));			 \
     105  									 \
     106    VECT_ARRAY_VAR(vector_src, T1, W, N, X) =				 \
     107      vld##X##Q##_##T2##W(VECT_VAR(buffer_src, T1, W, N));		 \
     108  									 \
     109    VECT_ARRAY_VAR(vector, T1, W, N, X) =					 \
     110      /* Use dedicated init buffer, of size X.  */			 \
     111      vld##X##Q##_lane_##T2##W(VECT_VAR(buffer_vld##X##_lane, T1, W, X),	 \
     112  			     VECT_ARRAY_VAR(vector_src, T1, W, N, X),	 \
     113  			     L);					 \
     114    vst##X##Q##_lane_##T2##W(VECT_VAR(result_bis_##X, T1, W, N),		 \
     115  			   VECT_ARRAY_VAR(vector, T1, W, N, X),		 \
     116  			   L);						 \
     117    memcpy(VECT_VAR(result, T1, W, N), VECT_VAR(result_bis_##X, T1, W, N), \
     118  	 sizeof(VECT_VAR(result, T1, W, N)));
     119  
     120    /* Overwrite "result" with chunk Y of "result_bis", i.e. the data
               starting at element Y*N; sizeof "result" bytes are copied.  */
     121  #define TEST_EXTRA_CHUNK(T1, W, N, X, Y)		\
     122    memcpy(VECT_VAR(result, T1, W, N),			\
     123  	 &(VECT_VAR(result_bis_##X, T1, W, N)[Y*N]),	\
     124  	 sizeof(VECT_VAR(result, T1, W, N)));
     125  
            /* Declare an array of N*L elements under the name produced by
               VECT_VAR_DECL(V,T,W,N).  */
     126  #define DUMMY_ARRAY(V, T, W, N, L) VECT_VAR_DECL(V,T,W,N)[N*L]
     127  
            /* Vector arrays and scratch buffers for every (N, X) combination
               exercised below.  */
     128    DECL_VSTX_LANE(bfloat, 16, 4, 2);
     129    DECL_VSTX_LANE(bfloat, 16, 8, 2);
     130    DECL_VSTX_LANE(bfloat, 16, 4, 3);
     131    DECL_VSTX_LANE(bfloat, 16, 8, 3);
     132    DECL_VSTX_LANE(bfloat, 16, 4, 4);
     133    DECL_VSTX_LANE(bfloat, 16, 8, 4);
     134  
            /* buffer_src is sized N*4; presumably so that one buffer can feed
               the vld2, vld3 and vld4 loads alike (TODO confirm).  */
     135    DUMMY_ARRAY(buffer_src, bfloat, 16, 4, 4);
     136    DUMMY_ARRAY(buffer_src, bfloat, 16, 8, 4);
     137  
     138    /* Check vst2_lane/vst2q_lane, using lane 2 of the 4-element vectors
               and lane 6 of the 8-element vectors.  */
     139    clean_results ();
     140    TEST_VSTX_LANE(, bfloat, bf, 16, 4, 2, 2);
     141    TEST_VSTX_LANE(q, bfloat, bf, 16, 8, 2, 6);
     142  
     143  #undef CMT
     144  #define CMT " (chunk 0)"
     145  #undef TEST_MSG
     146  #define TEST_MSG "VST2_LANE/VST2Q_LANE"
     147    CHECK_FP(TEST_MSG, bfloat, 16, 4, PRIx16, expected_st2_0, CMT);
     148    CHECK_FP(TEST_MSG, bfloat, 16, 8, PRIx16, expected_st2_0, CMT);
            /* Bring chunk 1 of the scratch buffers into "result" for the next
               pair of checks.  */
     149    TEST_EXTRA_CHUNK(bfloat, 16, 4, 2, 1);
     150    TEST_EXTRA_CHUNK(bfloat, 16, 8, 2, 1);
     151  
     152  #undef CMT
     153  #define CMT " (chunk 1)"
     154    CHECK_FP(TEST_MSG, bfloat, 16, 4, PRIx16, expected_st2_1, CMT);
     155    CHECK_FP(TEST_MSG, bfloat, 16, 8, PRIx16, expected_st2_1, CMT);
     156  
     157    /* Check vst3_lane/vst3q_lane.  */
     158    clean_results ();
     159  #undef TEST_MSG
     160  #define TEST_MSG "VST3_LANE/VST3Q_LANE"
     161    TEST_VSTX_LANE(, bfloat, bf, 16, 4, 3, 2);
     162    TEST_VSTX_LANE(q, bfloat, bf, 16, 8, 3, 6);
     163  
     164  #undef CMT
     165  #define CMT " (chunk 0)"
     166    CHECK_FP(TEST_MSG, bfloat, 16, 4, PRIx16, expected_st3_0, CMT);
     167    CHECK_FP(TEST_MSG, bfloat, 16, 8, PRIx16, expected_st3_0, CMT);
     168  
     169    TEST_EXTRA_CHUNK(bfloat, 16, 4, 3, 1);
     170    TEST_EXTRA_CHUNK(bfloat, 16, 8, 3, 1);
     171  
     172  
     173  #undef CMT
     174  #define CMT " (chunk 1)"
     175    CHECK_FP(TEST_MSG, bfloat, 16, 4, PRIx16, expected_st3_1, CMT);
     176    CHECK_FP(TEST_MSG, bfloat, 16, 8, PRIx16, expected_st3_1, CMT);
     177  
     178    TEST_EXTRA_CHUNK(bfloat, 16, 4, 3, 2);
     179    TEST_EXTRA_CHUNK(bfloat, 16, 8, 3, 2);
     180  
     181  #undef CMT
     182  #define CMT " (chunk 2)"
     183    CHECK_FP(TEST_MSG, bfloat, 16, 4, PRIx16, expected_st3_2, CMT);
     184    CHECK_FP(TEST_MSG, bfloat, 16, 8, PRIx16, expected_st3_2, CMT);
     185  
     186    /* Check vst4_lane/vst4q_lane.  */
     187    clean_results ();
     188  #undef TEST_MSG
     189  #define TEST_MSG "VST4_LANE/VST4Q_LANE"
     190    TEST_VSTX_LANE(, bfloat, bf, 16, 4, 4, 2);
     191    TEST_VSTX_LANE(q, bfloat, bf, 16, 8, 4, 6);
     192  
     193  #undef CMT
     194  #define CMT " (chunk 0)"
     195    CHECK_FP(TEST_MSG, bfloat, 16, 4, PRIx16, expected_st4_0, CMT);
     196    CHECK_FP(TEST_MSG, bfloat, 16, 8, PRIx16, expected_st4_0, CMT);
     197  
     198    TEST_EXTRA_CHUNK(bfloat, 16, 4, 4, 1);
     199    TEST_EXTRA_CHUNK(bfloat, 16, 8, 4, 1);
     200  
     201  #undef CMT
     202  #define CMT " (chunk 1)"
     203    CHECK_FP(TEST_MSG, bfloat, 16, 4, PRIx16, expected_st4_1, CMT);
     204    CHECK_FP(TEST_MSG, bfloat, 16, 8, PRIx16, expected_st4_1, CMT);
     205  
     206    TEST_EXTRA_CHUNK(bfloat, 16, 4, 4, 2);
     207    TEST_EXTRA_CHUNK(bfloat, 16, 8, 4, 2);
     208  
     209  #undef CMT
     210  #define CMT " (chunk 2)"
     211    CHECK_FP(TEST_MSG, bfloat, 16, 4, PRIx16, expected_st4_2, CMT);
     212    CHECK_FP(TEST_MSG, bfloat, 16, 8, PRIx16, expected_st4_2, CMT);
     213  
     214    TEST_EXTRA_CHUNK(bfloat, 16, 4, 4, 3);
     215    TEST_EXTRA_CHUNK(bfloat, 16, 8, 4, 3);
     216  
     217  #undef CMT
     218  #define CMT " (chunk 3)"
     219    CHECK_FP(TEST_MSG, bfloat, 16, 4, PRIx16, expected_st4_3, CMT);
     220    CHECK_FP(TEST_MSG, bfloat, 16, 8, PRIx16, expected_st4_3, CMT);
     221  }
     222  
     223  int main (void)
     224  {
            /* Run all vst2/vst3/vst4 lane checks.  The return value below is
               unconditional, so any failure must be signalled from inside
               exec_vstX_lane.  NOTE(review): presumably by CHECK_FP, which is
               defined outside this file; confirm.  */
     225    exec_vstX_lane ();
     226    return 0;
     227  }