1  /* { dg-do assemble { target { aarch64*-*-* } } } */
       2  /* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
       3  /* { dg-add-options arm_v8_2a_bf16_neon }  */
       4  /* { dg-additional-options "-save-temps" } */
       5  
       6  #include <arm_neon.h>
       7  
       8  float32x2_t
       9  test_vbfdot_f32_s8 (float32x2_t r, int8x8_t a, int8x8_t b)
      10  {
      11    bfloat16x4_t _a = vreinterpret_bf16_s8(a);
      12    bfloat16x4_t _b = vreinterpret_bf16_s8(b);
      13  
      14    return vbfdot_f32 (r, _a, _b);
      15  }
      16  
      17  float32x2_t
      18  test_vbfdot_f32_s16 (float32x2_t r, int16x4_t a, int16x4_t b)
      19  {
      20    bfloat16x4_t _a = vreinterpret_bf16_s16(a);
      21    bfloat16x4_t _b = vreinterpret_bf16_s16(b);
      22  
      23    return vbfdot_f32 (r, _a, _b);
      24  }
      25  
      26  float32x2_t
      27  test_vbfdot_f32_s32 (float32x2_t r, int32x2_t a, int32x2_t b)
      28  {
      29    bfloat16x4_t _a = vreinterpret_bf16_s32(a);
      30    bfloat16x4_t _b = vreinterpret_bf16_s32(b);
      31  
      32    return vbfdot_f32 (r, _a, _b);
      33  }
      34  
      35  float32x2_t
      36  test_vbfdot_f32_s64 (float32x2_t r, int64x1_t a, int64x1_t b)
      37  {
      38    bfloat16x4_t _a = vreinterpret_bf16_s64(a);
      39    bfloat16x4_t _b = vreinterpret_bf16_s64(b);
      40  
      41    return vbfdot_f32 (r, _a, _b);
      42  }
      43  
      44  float32x2_t
      45  test_vbfdot_f32_u8 (float32x2_t r, uint8x8_t a, uint8x8_t b)
      46  {
      47    bfloat16x4_t _a = vreinterpret_bf16_u8(a);
      48    bfloat16x4_t _b = vreinterpret_bf16_u8(b);
      49  
      50    return vbfdot_f32 (r, _a, _b);
      51  }
      52  
      53  float32x2_t
      54  test_vbfdot_f32_u16 (float32x2_t r, uint16x4_t a, uint16x4_t b)
      55  {
      56    bfloat16x4_t _a = vreinterpret_bf16_u16(a);
      57    bfloat16x4_t _b = vreinterpret_bf16_u16(b);
      58  
      59    return vbfdot_f32 (r, _a, _b);
      60  }
      61  
      62  float32x2_t
      63  test_vbfdot_f32_u32 (float32x2_t r, uint32x2_t a, uint32x2_t b)
      64  {
      65    bfloat16x4_t _a = vreinterpret_bf16_u32(a);
      66    bfloat16x4_t _b = vreinterpret_bf16_u32(b);
      67  
      68    return vbfdot_f32 (r, _a, _b);
      69  }
      70  
      71  float32x2_t
      72  test_vbfdot_f32_u64 (float32x2_t r, uint64x1_t a, uint64x1_t b)
      73  {
      74    bfloat16x4_t _a = vreinterpret_bf16_u64(a);
      75    bfloat16x4_t _b = vreinterpret_bf16_u64(b);
      76  
      77    return vbfdot_f32 (r, _a, _b);
      78  }
      79  
      80  float32x2_t
      81  test_vbfdot_f32_p8 (float32x2_t r, poly8x8_t a, poly8x8_t b)
      82  {
      83    bfloat16x4_t _a = vreinterpret_bf16_p8(a);
      84    bfloat16x4_t _b = vreinterpret_bf16_p8(b);
      85  
      86    return vbfdot_f32 (r, _a, _b);
      87  }
      88  
      89  float32x2_t
      90  test_vbfdot_f32_p16 (float32x2_t r, poly16x4_t a, poly16x4_t b)
      91  {
      92    bfloat16x4_t _a = vreinterpret_bf16_p16(a);
      93    bfloat16x4_t _b = vreinterpret_bf16_p16(b);
      94  
      95    return vbfdot_f32 (r, _a, _b);
      96  }
      97  
      98  float32x2_t
      99  test_vbfdot_f32_p64 (float32x2_t r, poly64x1_t a, poly64x1_t b)
     100  {
     101    bfloat16x4_t _a = vreinterpret_bf16_p64(a);
     102    bfloat16x4_t _b = vreinterpret_bf16_p64(b);
     103  
     104    return vbfdot_f32 (r, _a, _b);
     105  }
     106  
     107  float32x2_t
     108  test_vbfdot_f32_f16 (float32x2_t r, float16x4_t a, float16x4_t b)
     109  {
     110    bfloat16x4_t _a = vreinterpret_bf16_f16(a);
     111    bfloat16x4_t _b = vreinterpret_bf16_f16(b);
     112  
     113    return vbfdot_f32 (r, _a, _b);
     114  }
     115  
     116  float32x2_t
     117  test_vbfdot_f32_f32 (float32x2_t r, float32x2_t a, float32x2_t b)
     118  {
     119    bfloat16x4_t _a = vreinterpret_bf16_f32(a);
     120    bfloat16x4_t _b = vreinterpret_bf16_f32(b);
     121  
     122    return vbfdot_f32 (r, _a, _b);
     123  }
     124  
     125  float32x2_t
     126  test_vbfdot_f32_f64 (float32x2_t r, float64x1_t a, float64x1_t b)
     127  {
     128    bfloat16x4_t _a = vreinterpret_bf16_f64(a);
     129    bfloat16x4_t _b = vreinterpret_bf16_f64(b);
     130  
     131    return vbfdot_f32 (r, _a, _b);
     132  }
     133  
     134  float32x4_t
     135  test_vbfdotq_f32_s8 (float32x4_t r, int8x16_t a, int8x16_t b)
     136  {
     137    bfloat16x8_t _a = vreinterpretq_bf16_s8(a);
     138    bfloat16x8_t _b = vreinterpretq_bf16_s8(b);
     139  
     140    return vbfdotq_f32 (r, _a, _b);
     141  }
     142  
     143  float32x4_t
     144  test_vbfdotq_f32_s16 (float32x4_t r, int16x8_t a, int16x8_t b)
     145  {
     146    bfloat16x8_t _a = vreinterpretq_bf16_s16(a);
     147    bfloat16x8_t _b = vreinterpretq_bf16_s16(b);
     148  
     149    return vbfdotq_f32 (r, _a, _b);
     150  }
     151  
     152  float32x4_t
     153  test_vbfdotq_f32_s32 (float32x4_t r, int32x4_t a, int32x4_t b)
     154  {
     155    bfloat16x8_t _a = vreinterpretq_bf16_s32(a);
     156    bfloat16x8_t _b = vreinterpretq_bf16_s32(b);
     157  
     158    return vbfdotq_f32 (r, _a, _b);
     159  }
     160  
     161  float32x4_t
     162  test_vbfdotq_f32_s64 (float32x4_t r, int64x2_t a, int64x2_t b)
     163  {
     164    bfloat16x8_t _a = vreinterpretq_bf16_s64(a);
     165    bfloat16x8_t _b = vreinterpretq_bf16_s64(b);
     166  
     167    return vbfdotq_f32 (r, _a, _b);
     168  }
     169  
     170  float32x4_t
     171  test_vbfdotq_f32_u8 (float32x4_t r, uint8x16_t a, uint8x16_t b)
     172  {
     173    bfloat16x8_t _a = vreinterpretq_bf16_u8(a);
     174    bfloat16x8_t _b = vreinterpretq_bf16_u8(b);
     175  
     176    return vbfdotq_f32 (r, _a, _b);
     177  }
     178  
     179  float32x4_t
     180  test_vbfdotq_f32_u16 (float32x4_t r, uint16x8_t a, uint16x8_t b)
     181  {
     182    bfloat16x8_t _a = vreinterpretq_bf16_u16(a);
     183    bfloat16x8_t _b = vreinterpretq_bf16_u16(b);
     184  
     185    return vbfdotq_f32 (r, _a, _b);
     186  }
     187  
     188  float32x4_t
     189  test_vbfdotq_f32_u32 (float32x4_t r, uint32x4_t a, uint32x4_t b)
     190  {
     191    bfloat16x8_t _a = vreinterpretq_bf16_u32(a);
     192    bfloat16x8_t _b = vreinterpretq_bf16_u32(b);
     193  
     194    return vbfdotq_f32 (r, _a, _b);
     195  }
     196  
     197  float32x4_t
     198  test_vbfdotq_f32_u64 (float32x4_t r, uint64x2_t a, uint64x2_t b)
     199  {
     200    bfloat16x8_t _a = vreinterpretq_bf16_u64(a);
     201    bfloat16x8_t _b = vreinterpretq_bf16_u64(b);
     202  
     203    return vbfdotq_f32 (r, _a, _b);
     204  }
     205  
     206  float32x4_t
     207  test_vbfdotq_f32_p8 (float32x4_t r, poly8x16_t a, poly8x16_t b)
     208  {
     209    bfloat16x8_t _a = vreinterpretq_bf16_p8(a);
     210    bfloat16x8_t _b = vreinterpretq_bf16_p8(b);
     211  
     212    return vbfdotq_f32 (r, _a, _b);
     213  }
     214  
     215  float32x4_t
     216  test_vbfdotq_f32_p16 (float32x4_t r, poly16x8_t a, poly16x8_t b)
     217  {
     218    bfloat16x8_t _a = vreinterpretq_bf16_p16(a);
     219    bfloat16x8_t _b = vreinterpretq_bf16_p16(b);
     220  
     221    return vbfdotq_f32 (r, _a, _b);
     222  }
     223  
     224  float32x4_t
     225  test_vbfdotq_f32_p64 (float32x4_t r, poly64x2_t a, poly64x2_t b)
     226  {
     227    bfloat16x8_t _a = vreinterpretq_bf16_p64(a);
     228    bfloat16x8_t _b = vreinterpretq_bf16_p64(b);
     229  
     230    return vbfdotq_f32 (r, _a, _b);
     231  }
     232  
     233  float32x4_t
     234  test_vbfdotq_f32_p128 (float32x4_t r, poly128_t a, poly128_t b)
     235  {
     236    bfloat16x8_t _a = vreinterpretq_bf16_p128(a);
     237    bfloat16x8_t _b = vreinterpretq_bf16_p128(b);
     238  
     239    return vbfdotq_f32 (r, _a, _b);
     240  }
     241  
     242  float32x4_t
     243  test_vbfdotq_f32_f16 (float32x4_t r, float16x8_t a, float16x8_t b)
     244  {
     245    bfloat16x8_t _a = vreinterpretq_bf16_f16(a);
     246    bfloat16x8_t _b = vreinterpretq_bf16_f16(b);
     247  
     248    return vbfdotq_f32 (r, _a, _b);
     249  }
     250  
     251  float32x4_t
     252  test_vbfdotq_f32_f32 (float32x4_t r, float32x4_t a, float32x4_t b)
     253  {
     254    bfloat16x8_t _a = vreinterpretq_bf16_f32(a);
     255    bfloat16x8_t _b = vreinterpretq_bf16_f32(b);
     256  
     257    return vbfdotq_f32 (r, _a, _b);
     258  }
     259  
     260  float32x4_t
     261  test_vbfdotq_f32_f64 (float32x4_t r, float64x2_t a, float64x2_t b)
     262  {
     263    bfloat16x8_t _a = vreinterpretq_bf16_f64(a);
     264    bfloat16x8_t _b = vreinterpretq_bf16_f64(b);
     265  
     266    return vbfdotq_f32 (r, _a, _b);
     267  }
     268  
     269  /* { dg-final { scan-assembler-times {bfdot\tv[0-9]+.2s, v[0-9]+.4h, v[0-9]+.4h} 14 } } */
     270  /* { dg-final { scan-assembler-times {bfdot\tv[0-9]+.4s, v[0-9]+.8h, v[0-9]+.8h} 15 } } */
     271  
     272  int8x8_t test_vreinterpret_s8_bf16 (bfloat16x4_t a, int8x8_t b)
     273  {
     274    int8x8_t _a = vreinterpret_s8_bf16 (a);
     275    return vadd_s8 (_a, b);
     276  }
     277  
     278  int16x4_t test_vreinterpret_s16_bf16 (bfloat16x4_t a, int16x4_t b)
     279  {
     280    int16x4_t _a = vreinterpret_s16_bf16 (a);
     281    return vadd_s16 (_a, b);
     282  }
     283  
     284  int32x2_t test_vreinterpret_s32_bf16 (bfloat16x4_t a, int32x2_t b)
     285  {
     286    int32x2_t _a = vreinterpret_s32_bf16 (a);
     287    return vadd_s32 (_a, b);
     288  }
     289  
     290  int64x1_t test_vreinterpret_s64_bf16 (bfloat16x4_t a, int64x1_t b)
     291  {
     292    int64x1_t _a = vreinterpret_s64_bf16 (a);
     293    return vrshl_s64 (_a, b);
     294  }
     295  
     296  uint8x8_t test_vreinterpret_u8_bf16 (bfloat16x4_t a, uint8x8_t b)
     297  {
     298    uint8x8_t _a = vreinterpret_u8_bf16 (a);
     299    return vadd_u8 (_a, b);
     300  }
     301  
     302  uint16x4_t test_vreinterpret_u16_bf16 (bfloat16x4_t a, uint16x4_t b)
     303  {
     304    uint16x4_t _a = vreinterpret_u16_bf16 (a);
     305    return vadd_u16 (_a, b);
     306  }
     307  
     308  uint32x2_t test_vreinterpret_u32_bf16 (bfloat16x4_t a, uint32x2_t b)
     309  {
     310    uint32x2_t _a = vreinterpret_u32_bf16 (a);
     311    return vadd_u32 (_a, b);
     312  }
     313  
     314  uint64x1_t test_vreinterpret_u64_bf16 (bfloat16x4_t a, int64x1_t b)
     315  {
     316    uint64x1_t _a = vreinterpret_u64_bf16 (a);
     317    return vrshl_u64 (_a, b);
     318  }
     319  
     320  poly8x8_t test_vreinterpret_p8_bf16 (bfloat16x4_t a, poly8x8_t b)
     321  {
     322    poly8x8_t _a = vreinterpret_p8_bf16 (a);
     323    return vzip1_p8 (_a, b);
     324  }
     325  
     326  poly16x4_t test_vreinterpret_p16_bf16 (bfloat16x4_t a, poly16x4_t b)
     327  {
     328    poly16x4_t _a = vreinterpret_p16_bf16 (a);
     329    return vzip1_p16 (_a, b);
     330  }
     331  
     332  poly64x1_t test_vreinterpret_p64_bf16 (bfloat16x4_t a, poly64x1_t b)
     333  {
     334    poly64x1_t _a = vreinterpret_p64_bf16 (a);
     335    return vsli_n_p64 (_a, b, 3);
     336  }
     337  
     338  float32x2_t test_vreinterpret_f32_bf16 (bfloat16x4_t a, float32x2_t b)
     339  {
     340    float32x2_t _a = vreinterpret_f32_bf16 (a);
     341    return vsub_f32 (_a, b);
     342  }
     343  
     344  float64x1_t test_vreinterpret_f64_bf16 (bfloat16x4_t a, float64x1_t b)
     345  {
     346    float64x1_t _a = vreinterpret_f64_bf16 (a);
     347    return vsub_f64 (_a, b);
     348  }
     349  
     350  int8x16_t test_vreinterpretq_s8_bf16 (bfloat16x8_t a, int8x16_t b)
     351  {
     352    int8x16_t _a = vreinterpretq_s8_bf16 (a);
     353    return vaddq_s8 (_a, b);
     354  }
     355  
     356  int16x8_t test_vreinterpretq_s16_bf16 (bfloat16x8_t a, int16x8_t b)
     357  {
     358    int16x8_t _a = vreinterpretq_s16_bf16 (a);
     359    return vaddq_s16 (_a, b);
     360  }
     361  
     362  int32x4_t test_vreinterpretq_s32_bf16 (bfloat16x8_t a, int32x4_t b)
     363  {
     364    int32x4_t _a = vreinterpretq_s32_bf16 (a);
     365    return vaddq_s32 (_a, b);
     366  }
     367  
     368  int64x2_t test_vreinterpretq_s64_bf16 (bfloat16x8_t a, int64x2_t b)
     369  {
     370    int64x2_t _a = vreinterpretq_s64_bf16 (a);
     371    return vaddq_s64 (_a, b);
     372  }
     373  
     374  uint8x16_t test_vreinterpretq_u8_bf16 (bfloat16x8_t a, uint8x16_t b)
     375  {
     376    uint8x16_t _a = vreinterpretq_u8_bf16 (a);
     377    return vaddq_u8 (_a, b);
     378  }
     379  
     380  uint16x8_t test_vreinterpretq_u16_bf16 (bfloat16x8_t a, uint16x8_t b)
     381  {
     382    uint16x8_t _a = vreinterpretq_u16_bf16 (a);
     383    return vaddq_u16 (_a, b);
     384  }
     385  
     386  uint32x4_t test_vreinterpretq_u32_bf16 (bfloat16x8_t a, uint32x4_t b)
     387  {
     388    uint32x4_t _a = vreinterpretq_u32_bf16 (a);
     389    return vaddq_u32 (_a, b);
     390  }
     391  
     392  uint64x2_t test_vreinterpretq_u64_bf16 (bfloat16x8_t a, uint64x2_t b)
     393  {
     394    uint64x2_t _a = vreinterpretq_u64_bf16 (a);
     395    return vaddq_u64 (_a, b);
     396  }
     397  
     398  poly8x16_t test_vreinterpretq_p8_bf16 (bfloat16x8_t a, poly8x16_t b)
     399  {
     400    poly8x16_t _a = vreinterpretq_p8_bf16 (a);
     401    return vzip1q_p8 (_a, b);
     402  }
     403  
     404  poly16x8_t test_vreinterpretq_p16_bf16 (bfloat16x8_t a, poly16x8_t b)
     405  {
     406    poly16x8_t _a = vreinterpretq_p16_bf16 (a);
     407    return vzip1q_p16 (_a, b);
     408  }
     409  
     410  poly64x2_t test_vreinterpretq_p64_bf16 (bfloat16x8_t a, poly64x2_t b)
     411  {
     412    poly64x2_t _a = vreinterpretq_p64_bf16 (a);
     413    return vsliq_n_p64 (_a, b, 3);
     414  }
     415  
     416  poly128_t test_vreinterpretq_p128_bf16 (bfloat16x8_t a, poly16x8_t b)
     417  {
     418    poly128_t _a = vreinterpretq_p128_bf16 (a);
     419    return _a;
     420  }
     421  
     422  float32x4_t test_vreinterpretq_f32_bf16 (bfloat16x8_t a, float32x4_t b)
     423  {
     424    float32x4_t _a = vreinterpretq_f32_bf16 (a);
     425    return vsubq_f32 (_a, b);
     426  }
     427  
     428  float64x2_t test_vreinterpretq_f64_bf16 (bfloat16x8_t a, float64x2_t b)
     429  {
     430    float64x2_t _a = vreinterpretq_f64_bf16 (a);
     431    return vsubq_f64 (_a, b);
     432  }
     433  
     434  float16x4_t test_vreinterpret_f16_bf16 (bfloat16x4_t a)
     435  {
     436    return vreinterpret_f16_bf16 (a);
     437  }
     438  
     439  float16x8_t test_vreinterpretq_f16_bf16 (bfloat16x8_t a)
     440  {
     441    return vreinterpretq_f16_bf16 (a);
     442  }
     443  
     444  /* { dg-final { scan-assembler-times {add\tv[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s} 2 } } */
     445  /* { dg-final { scan-assembler-times {add\tv[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h} 2 } } */
     446  /* { dg-final { scan-assembler-times {add\tv[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b} 2 } } */
     447  
     448  /* { dg-final { scan-assembler-times {add\tv[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s} 2 } } */
     449  /* { dg-final { scan-assembler-times {add\tv[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h} 2 } } */
     450  /* { dg-final { scan-assembler-times {add\tv[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b} 2 } } */
     451  
     452  /* { dg-final { scan-assembler {fsub\tv[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s} } } */
     453  /* { dg-final { scan-assembler {fsub\tv[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s} } } */
     454  /* { dg-final { scan-assembler {fsub\tv[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d} } } */
     455  /* { dg-final { scan-assembler {fsub\td[0-9]+, d[0-9]+, d[0-9]+} } } */
     456  
     457  /* { dg-final { scan-assembler {zip1\tv[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b} } } */
     458  /* { dg-final { scan-assembler {zip1\tv[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b} } } */
     459  /* { dg-final { scan-assembler {zip1\tv[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h} } } */
     460  /* { dg-final { scan-assembler {zip1\tv[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h} } } */
     461  
     462  /* { dg-final { scan-assembler {sli\tv[0-9]+.2d, v[0-9]+.2d, 3} } } */
     463  /* { dg-final { scan-assembler {sli\td[0-9]+, d[0-9]+, 3} } } */
     464  
     465  /* { dg-final { scan-assembler {urshl\td[0-9]+, d[0-9]+, d[0-9]+} } } */
     466  /* { dg-final { scan-assembler {srshl\td[0-9]+, d[0-9]+, d[0-9]+} } } */