(root)/
Python-3.11.7/
Modules/
_blake2/
impl/
blake2s-load-sse41.h
       1  /*
       2     BLAKE2 reference source code package - optimized C implementations
       3  
       4     Written in 2012 by Samuel Neves <sneves@dei.uc.pt>
       5  
       6     To the extent possible under law, the author(s) have dedicated all copyright
       7     and related and neighboring rights to this software to the public domain
       8     worldwide. This software is distributed without any warranty.
       9  
      10     You should have received a copy of the CC0 Public Domain Dedication along with
      11     this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
      12  */
      13  #pragma once
      14  #ifndef __BLAKE2S_LOAD_SSE41_H__
      15  #define __BLAKE2S_LOAD_SSE41_H__
      16  
      /* buf = { m0[0], m0[2], m1[0], m1[2] } (32-bit lanes, lowest first),
         provided TOF/TOI (defined outside this header) are bit-preserving
         casts - TODO confirm. Single statement; no scratch variables.
         NOTE(review): with m0..m3 = message words 0-3, 4-7, 8-11, 12-15
         (presumed; set up by the includer), that is words 0, 2, 4, 6. */
      17  #define LOAD_MSG_0_1(buf) \
      18  buf = TOI(_mm_shuffle_ps(TOF(m0), TOF(m1), _MM_SHUFFLE(2,0,2,0)));
      19  
      /* buf = { m0[1], m0[3], m1[1], m1[3] } (32-bit lanes, lowest first),
         provided TOF/TOI (defined outside this header) are bit-preserving
         casts - TODO confirm. Single statement; no scratch variables.
         NOTE(review): with m0..m3 = message words 0-3, 4-7, 8-11, 12-15
         (presumed; set up by the includer), that is words 1, 3, 5, 7. */
      20  #define LOAD_MSG_0_2(buf) \
      21  buf = TOI(_mm_shuffle_ps(TOF(m0), TOF(m1), _MM_SHUFFLE(3,1,3,1)));
      22  
      /* buf = { m2[0], m2[2], m3[0], m3[2] } (32-bit lanes, lowest first),
         provided TOF/TOI (defined outside this header) are bit-preserving
         casts - TODO confirm. Single statement; no scratch variables.
         NOTE(review): with m0..m3 = message words 0-3, 4-7, 8-11, 12-15
         (presumed; set up by the includer), that is words 8, 10, 12, 14. */
      23  #define LOAD_MSG_0_3(buf) \
      24  buf = TOI(_mm_shuffle_ps(TOF(m2), TOF(m3), _MM_SHUFFLE(2,0,2,0)));
      25  
      /* buf = { m2[1], m2[3], m3[1], m3[3] } (32-bit lanes, lowest first),
         provided TOF/TOI (defined outside this header) are bit-preserving
         casts - TODO confirm. Single statement; no scratch variables.
         NOTE(review): with m0..m3 = message words 0-3, 4-7, 8-11, 12-15
         (presumed; set up by the includer), that is words 9, 11, 13, 15. */
      26  #define LOAD_MSG_0_4(buf) \
      27  buf = TOI(_mm_shuffle_ps(TOF(m2), TOF(m3), _MM_SHUFFLE(3,1,3,1)));
      28  
      /* buf = { m3[2], m1[0], m2[1], m3[1] } (32-bit lanes, lowest first).
         Expands to 4 statements and overwrites t0, t1, t2 in the expanding
         scope. The zero lane shifted in by _mm_slli_si128 is not selected
         by the blend that follows it.
         NOTE(review): with m0..m3 = message words 0-3, 4-7, 8-11, 12-15
         (presumed; set up by the includer), that is words 14, 4, 9, 13. */
      29  #define LOAD_MSG_1_1(buf) \
      30  t0 = _mm_blend_epi16(m1, m2, 0x0C); \
      31  t1 = _mm_slli_si128(m3, 4); \
      32  t2 = _mm_blend_epi16(t0, t1, 0xF0); \
      33  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,1,0,3));
      34  
      /* buf = { m2[2], m2[0], m3[3], m1[2] } (32-bit lanes, lowest first).
         Expands to 4 statements and overwrites t0, t1, t2 in the expanding
         scope.
         NOTE(review): with m0..m3 = message words 0-3, 4-7, 8-11, 12-15
         (presumed; set up by the includer), that is words 10, 8, 15, 6. */
      35  #define LOAD_MSG_1_2(buf) \
      36  t0 = _mm_shuffle_epi32(m2,_MM_SHUFFLE(0,0,2,0)); \
      37  t1 = _mm_blend_epi16(m1,m3,0xC0); \
      38  t2 = _mm_blend_epi16(t0, t1, 0xF0); \
      39  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,3,0,1));
      40  
      /* buf = { m0[1], m0[0], m2[3], m1[1] } (32-bit lanes, lowest first).
         Expands to 4 statements and overwrites t0, t1, t2 in the expanding
         scope.
         NOTE(review): with m0..m3 = message words 0-3, 4-7, 8-11, 12-15
         (presumed; set up by the includer), that is words 1, 0, 11, 5. */
      41  #define LOAD_MSG_1_3(buf) \
      42  t0 = _mm_slli_si128(m1, 4); \
      43  t1 = _mm_blend_epi16(m2, t0, 0x30); \
      44  t2 = _mm_blend_epi16(m0, t1, 0xF0); \
      45  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,3,0,1));
      46  
      /* buf = { m3[0], m0[2], m1[3], m0[3] } (32-bit lanes, lowest first).
         Expands to 4 statements and overwrites t0, t1, t2 in the expanding
         scope.
         NOTE(review): with m0..m3 = message words 0-3, 4-7, 8-11, 12-15
         (presumed; set up by the includer), that is words 12, 2, 7, 3. */
      47  #define LOAD_MSG_1_4(buf) \
      48  t0 = _mm_unpackhi_epi32(m0,m1); \
      49  t1 = _mm_slli_si128(m3, 4); \
      50  t2 = _mm_blend_epi16(t0, t1, 0x0C); \
      51  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,3,0,1));
      52  
      /* buf = { m2[3], m3[0], m1[1], m3[3] } (32-bit lanes, lowest first).
         Expands to 4 statements and overwrites t0, t1, t2 in the expanding
         scope.
         NOTE(review): with m0..m3 = message words 0-3, 4-7, 8-11, 12-15
         (presumed; set up by the includer), that is words 11, 12, 5, 15. */
      53  #define LOAD_MSG_2_1(buf) \
      54  t0 = _mm_unpackhi_epi32(m2,m3); \
      55  t1 = _mm_blend_epi16(m3,m1,0x0C); \
      56  t2 = _mm_blend_epi16(t0, t1, 0x0F); \
      57  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3,1,0,2));
      58  
      /* buf = { m2[0], m0[0], m0[2], m3[1] } (32-bit lanes, lowest first).
         Expands to 4 statements and overwrites t0, t1, t2 in the expanding
         scope. No final lane shuffle is needed: the last blend already
         leaves the lanes in order.
         NOTE(review): with m0..m3 = message words 0-3, 4-7, 8-11, 12-15
         (presumed; set up by the includer), that is words 8, 0, 2, 13. */
      59  #define LOAD_MSG_2_2(buf) \
      60  t0 = _mm_unpacklo_epi32(m2,m0); \
      61  t1 = _mm_blend_epi16(t0, m0, 0xF0); \
      62  t2 = _mm_slli_si128(m3, 8); \
      63  buf = _mm_blend_epi16(t1, t2, 0xC0);
      64  
      /* buf = { m2[2], m0[3], m1[3], m2[1] } (32-bit lanes, lowest first).
         Expands to 4 statements and overwrites t0, t1, t2 in the expanding
         scope. _mm_srli_si128(m1, 12) moves m1[3] down to lane 0 so the
         0x03 blend can pick it up.
         NOTE(review): with m0..m3 = message words 0-3, 4-7, 8-11, 12-15
         (presumed; set up by the includer), that is words 10, 3, 7, 9. */
      65  #define LOAD_MSG_2_3(buf) \
      66  t0 = _mm_blend_epi16(m0, m2, 0x3C); \
      67  t1 = _mm_srli_si128(m1, 12); \
      68  t2 = _mm_blend_epi16(t0,t1,0x03); \
      69  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,0,3,2));
      70  
      /* buf = { m3[2], m1[2], m0[1], m1[0] } (32-bit lanes, lowest first).
         Expands to 4 statements and overwrites t0, t1, t2 in the expanding
         scope. The final _MM_SHUFFLE(0,1,2,3) reverses the lane order.
         NOTE(review): with m0..m3 = message words 0-3, 4-7, 8-11, 12-15
         (presumed; set up by the includer), that is words 14, 6, 1, 4. */
      71  #define LOAD_MSG_2_4(buf) \
      72  t0 = _mm_slli_si128(m3, 4); \
      73  t1 = _mm_blend_epi16(m0, m1, 0x33); \
      74  t2 = _mm_blend_epi16(t1, t0, 0xC0); \
      75  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(0,1,2,3));
      76  
      /* buf = { m1[3], m0[3], m3[1], m2[3] } (32-bit lanes, lowest first).
         Expands to 4 statements and overwrites t0, t1, t2 in the expanding
         scope.
         NOTE(review): with m0..m3 = message words 0-3, 4-7, 8-11, 12-15
         (presumed; set up by the includer), that is words 7, 3, 13, 11. */
      77  #define LOAD_MSG_3_1(buf) \
      78  t0 = _mm_unpackhi_epi32(m0,m1); \
      79  t1 = _mm_unpackhi_epi32(t0, m2); \
      80  t2 = _mm_blend_epi16(t1, m3, 0x0C); \
      81  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3,1,0,2));
      82  
      /* buf = { m2[1], m0[1], m3[0], m3[2] } (32-bit lanes, lowest first).
         Expands to 4 statements and overwrites t0, t1, t2 in the expanding
         scope.
         NOTE(review): with m0..m3 = message words 0-3, 4-7, 8-11, 12-15
         (presumed; set up by the includer), that is words 9, 1, 12, 14. */
      83  #define LOAD_MSG_3_2(buf) \
      84  t0 = _mm_slli_si128(m2, 8); \
      85  t1 = _mm_blend_epi16(m3,m0,0x0C); \
      86  t2 = _mm_blend_epi16(t1, t0, 0xC0); \
      87  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,0,1,3));
      88  
      /* buf = { m0[2], m1[1], m1[0], m3[3] } (32-bit lanes, lowest first).
         Expands to 3 statements and overwrites t0, t1 in the expanding
         scope (t2 is not touched).
         NOTE(review): with m0..m3 = message words 0-3, 4-7, 8-11, 12-15
         (presumed; set up by the includer), that is words 2, 5, 4, 15. */
      89  #define LOAD_MSG_3_3(buf) \
      90  t0 = _mm_blend_epi16(m0,m1,0x0F); \
      91  t1 = _mm_blend_epi16(t0, m3, 0xC0); \
      92  buf = _mm_shuffle_epi32(t1, _MM_SHUFFLE(3,0,1,2));
      93  
      /* buf = { m1[2], m2[2], m0[0], m2[0] } (32-bit lanes, lowest first).
         Expands to 3 statements and overwrites t0, t1 in the expanding
         scope (t2 is not touched). Note the operand order of the final
         unpack: t1 supplies the low half, t0 the high half.
         NOTE(review): with m0..m3 = message words 0-3, 4-7, 8-11, 12-15
         (presumed; set up by the includer), that is words 6, 10, 0, 8. */
      94  #define LOAD_MSG_3_4(buf) \
      95  t0 = _mm_unpacklo_epi32(m0,m2); \
      96  t1 = _mm_unpackhi_epi32(m1,m2); \
      97  buf = _mm_unpacklo_epi64(t1,t0);
      98  
      /* buf = { m2[1], m1[1], m0[2], m2[2] } (32-bit lanes, lowest first).
         Expands to 4 statements and overwrites t0, t1, t2 in the expanding
         scope.
         NOTE(review): with m0..m3 = message words 0-3, 4-7, 8-11, 12-15
         (presumed; set up by the includer), that is words 9, 5, 2, 10. */
      99  #define LOAD_MSG_4_1(buf) \
     100  t0 = _mm_unpacklo_epi64(m1,m2); \
     101  t1 = _mm_unpackhi_epi64(m0,m2); \
     102  t2 = _mm_blend_epi16(t0,t1,0x33); \
     103  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,0,1,3));
     104  
     /* buf = { m0[0], m1[3], m1[0], m3[3] } (32-bit lanes, lowest first).
        Expands to 3 statements and overwrites t0, t1 in the expanding
        scope (t2 is not touched).
        NOTE(review): with m0..m3 = message words 0-3, 4-7, 8-11, 12-15
        (presumed; set up by the includer), that is words 0, 7, 4, 15. */
     105  #define LOAD_MSG_4_2(buf) \
     106  t0 = _mm_unpackhi_epi64(m1,m3); \
     107  t1 = _mm_unpacklo_epi64(m0,m1); \
     108  buf = _mm_blend_epi16(t0,t1,0x33);
     109  
     /* buf = { m3[2], m2[3], m1[2], m0[3] } (32-bit lanes, lowest first).
        Expands to 3 statements and overwrites t0, t1 in the expanding
        scope (t2 is not touched). The blend takes t1 as its base and t0
        for lanes 0 and 2.
        NOTE(review): with m0..m3 = message words 0-3, 4-7, 8-11, 12-15
        (presumed; set up by the includer), that is words 14, 11, 6, 3. */
     110  #define LOAD_MSG_4_3(buf) \
     111  t0 = _mm_unpackhi_epi64(m3,m1); \
     112  t1 = _mm_unpackhi_epi64(m2,m0); \
     113  buf = _mm_blend_epi16(t1,t0,0x33);
     114  
     /* buf = { m0[1], m3[0], m2[0], m3[1] } (32-bit lanes, lowest first).
        Expands to 4 statements and overwrites t0, t1, t2 in the expanding
        scope. The two zero lanes produced by _mm_slli_si128(t0, 8) are
        replaced by m3[0], m3[1] in the following blend.
        NOTE(review): with m0..m3 = message words 0-3, 4-7, 8-11, 12-15
        (presumed; set up by the includer), that is words 1, 12, 8, 13. */
     115  #define LOAD_MSG_4_4(buf) \
     116  t0 = _mm_blend_epi16(m0,m2,0x03); \
     117  t1 = _mm_slli_si128(t0, 8); \
     118  t2 = _mm_blend_epi16(t1,m3,0x0F); \
     119  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,2,0,3));
     120  
     /* buf = { m0[2], m1[2], m0[0], m2[0] } (32-bit lanes, lowest first).
        Expands to 3 statements and overwrites t0, t1 in the expanding
        scope (t2 is not touched).
        NOTE(review): with m0..m3 = message words 0-3, 4-7, 8-11, 12-15
        (presumed; set up by the includer), that is words 2, 6, 0, 8. */
     121  #define LOAD_MSG_5_1(buf) \
     122  t0 = _mm_unpackhi_epi32(m0,m1); \
     123  t1 = _mm_unpacklo_epi32(m0,m2); \
     124  buf = _mm_unpacklo_epi64(t0,t1);
     125  
     /* buf = { m3[0], m2[2], m2[3], m0[3] } (32-bit lanes, lowest first).
        Expands to 3 statements and overwrites t0, t1 in the expanding
        scope (t2 is not touched). _mm_srli_si128(m2, 4) moves m2[2], m2[3]
        down into lanes 1 and 2, which the 0x3C blend selects.
        NOTE(review): with m0..m3 = message words 0-3, 4-7, 8-11, 12-15
        (presumed; set up by the includer), that is words 12, 10, 11, 3. */
     126  #define LOAD_MSG_5_2(buf) \
     127  t0 = _mm_srli_si128(m2, 4); \
     128  t1 = _mm_blend_epi16(m0,m3,0x03); \
     129  buf = _mm_blend_epi16(t1,t0,0x3C);
     130  
     /* buf = { m1[0], m1[3], m3[3], m0[1] } (32-bit lanes, lowest first).
        Expands to 4 statements and overwrites t0, t1, t2 in the expanding
        scope.
        NOTE(review): with m0..m3 = message words 0-3, 4-7, 8-11, 12-15
        (presumed; set up by the includer), that is words 4, 7, 15, 1. */
     131  #define LOAD_MSG_5_3(buf) \
     132  t0 = _mm_blend_epi16(m1,m0,0x0C); \
     133  t1 = _mm_srli_si128(m3, 4); \
     134  t2 = _mm_blend_epi16(t0,t1,0x30); \
     135  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,2,3,0));
     136  
     /* buf = { m3[1], m1[1], m3[2], m2[1] } (32-bit lanes, lowest first).
        Expands to 3 statements and overwrites t0, t1 in the expanding
        scope (t2 is not touched). Only lanes 0 and 2 of the shuffled m3
        are used by the 0x33 blend.
        NOTE(review): with m0..m3 = message words 0-3, 4-7, 8-11, 12-15
        (presumed; set up by the includer), that is words 13, 5, 14, 9. */
     137  #define LOAD_MSG_5_4(buf) \
     138  t0 = _mm_unpacklo_epi64(m1,m2); \
     139  t1= _mm_shuffle_epi32(m3, _MM_SHUFFLE(0,2,0,1)); \
     140  buf = _mm_blend_epi16(t0,t1,0x33);
     141  
     /* buf = { m3[0], m0[1], m3[2], m1[0] } (32-bit lanes, lowest first).
        Expands to 3 statements and overwrites t0, t1 in the expanding
        scope (t2 is not touched). _mm_slli_si128(m1, 12) moves m1[0] up
        to lane 3; the zero lanes below it are not selected by the blend.
        NOTE(review): with m0..m3 = message words 0-3, 4-7, 8-11, 12-15
        (presumed; set up by the includer), that is words 12, 1, 14, 4. */
     142  #define LOAD_MSG_6_1(buf) \
     143  t0 = _mm_slli_si128(m1, 12); \
     144  t1 = _mm_blend_epi16(m0,m3,0x33); \
     145  buf = _mm_blend_epi16(t1,t0,0xC0);
     146  
     /* buf = { m1[1], m3[3], m3[1], m2[2] } (32-bit lanes, lowest first).
        Expands to 4 statements and overwrites t0, t1, t2 in the expanding
        scope.
        NOTE(review): with m0..m3 = message words 0-3, 4-7, 8-11, 12-15
        (presumed; set up by the includer), that is words 5, 15, 13, 10. */
     147  #define LOAD_MSG_6_2(buf) \
     148  t0 = _mm_blend_epi16(m3,m2,0x30); \
     149  t1 = _mm_srli_si128(m1, 4); \
     150  t2 = _mm_blend_epi16(t0,t1,0x03); \
     151  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,1,3,0));
     152  
     /* buf = { m0[0], m1[2], m2[1], m2[0] } (32-bit lanes, lowest first).
        Expands to 3 statements and overwrites t0, t1 in the expanding
        scope (t2 is not touched; the blend result is fed straight into
        the final shuffle).
        NOTE(review): with m0..m3 = message words 0-3, 4-7, 8-11, 12-15
        (presumed; set up by the includer), that is words 0, 6, 9, 8. */
     153  #define LOAD_MSG_6_3(buf) \
     154  t0 = _mm_unpacklo_epi64(m0,m2); \
     155  t1 = _mm_srli_si128(m1, 4); \
     156  buf = _mm_shuffle_epi32(_mm_blend_epi16(t0,t1,0x0C), _MM_SHUFFLE(2,3,1,0));
     157  
     /* buf = { m1[3], m0[3], m0[2], m2[3] } (32-bit lanes, lowest first).
        Expands to 3 statements and overwrites t0, t1 in the expanding
        scope (t2 is not touched).
        NOTE(review): with m0..m3 = message words 0-3, 4-7, 8-11, 12-15
        (presumed; set up by the includer), that is words 7, 3, 2, 11. */
     158  #define LOAD_MSG_6_4(buf) \
     159  t0 = _mm_unpackhi_epi32(m1,m2); \
     160  t1 = _mm_unpackhi_epi64(m0,t0); \
     161  buf = _mm_shuffle_epi32(t1, _MM_SHUFFLE(3,0,1,2));
     162  
     /* buf = { m3[1], m1[3], m3[0], m0[3] } (32-bit lanes, lowest first).
        Expands to 3 statements and overwrites t0, t1 in the expanding
        scope (t2 is not touched).
        NOTE(review): with m0..m3 = message words 0-3, 4-7, 8-11, 12-15
        (presumed; set up by the includer), that is words 13, 7, 12, 3. */
     163  #define LOAD_MSG_7_1(buf) \
     164  t0 = _mm_unpackhi_epi32(m0,m1); \
     165  t1 = _mm_blend_epi16(t0,m3,0x0F); \
     166  buf = _mm_shuffle_epi32(t1,_MM_SHUFFLE(2,0,3,1));
     167  
     /* buf = { m2[3], m3[2], m0[1], m2[1] } (32-bit lanes, lowest first).
        Expands to 4 statements and overwrites t0, t1, t2 in the expanding
        scope.
        NOTE(review): with m0..m3 = message words 0-3, 4-7, 8-11, 12-15
        (presumed; set up by the includer), that is words 11, 14, 1, 9. */
     168  #define LOAD_MSG_7_2(buf) \
     169  t0 = _mm_blend_epi16(m2,m3,0x30); \
     170  t1 = _mm_srli_si128(m0,4); \
     171  t2 = _mm_blend_epi16(t0,t1,0x03); \
     172  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,0,2,3));
     173  
     /* buf = { m1[1], m3[3], m2[0], m0[2] } (32-bit lanes, lowest first).
        Expands to 4 statements and overwrites t0, t1, t2 in the expanding
        scope.
        NOTE(review): with m0..m3 = message words 0-3, 4-7, 8-11, 12-15
        (presumed; set up by the includer), that is words 5, 15, 8, 2. */
     174  #define LOAD_MSG_7_3(buf) \
     175  t0 = _mm_unpackhi_epi64(m0,m3); \
     176  t1 = _mm_unpacklo_epi64(m1,m2); \
     177  t2 = _mm_blend_epi16(t0,t1,0x3C); \
     178  buf = _mm_shuffle_epi32(t2,_MM_SHUFFLE(0,2,3,1));
     179  
     /* buf = { m0[0], m1[0], m1[2], m2[2] } (32-bit lanes, lowest first).
        Expands to 3 statements and overwrites t0, t1 in the expanding
        scope (t2 is not touched).
        NOTE(review): with m0..m3 = message words 0-3, 4-7, 8-11, 12-15
        (presumed; set up by the includer), that is words 0, 4, 6, 10. */
     180  #define LOAD_MSG_7_4(buf) \
     181  t0 = _mm_unpacklo_epi32(m0,m1); \
     182  t1 = _mm_unpackhi_epi32(m1,m2); \
     183  buf = _mm_unpacklo_epi64(t0,t1);
     184  
     /* buf = { m1[2], m3[2], m2[3], m0[0] } (32-bit lanes, lowest first).
        Expands to 4 statements and overwrites t0, t1, t2 in the expanding
        scope. The final _mm_shufflehi_epi16 with _MM_SHUFFLE(1,0,3,2)
        works on 16-bit elements of the upper 64 bits only: it swaps the
        two upper 32-bit lanes and leaves the lower two lanes as they are.
        NOTE(review): with m0..m3 = message words 0-3, 4-7, 8-11, 12-15
        (presumed; set up by the includer), that is words 6, 14, 11, 0. */
     185  #define LOAD_MSG_8_1(buf) \
     186  t0 = _mm_unpackhi_epi32(m1,m3); \
     187  t1 = _mm_unpacklo_epi64(t0,m0); \
     188  t2 = _mm_blend_epi16(t1,m2,0xC0); \
     189  buf = _mm_shufflehi_epi16(t2,_MM_SHUFFLE(1,0,3,2));
     190  
     /* buf = { m3[3], m2[1], m0[3], m2[0] } (32-bit lanes, lowest first).
        Expands to 3 statements and overwrites t0, t1 in the expanding
        scope (t2 is not touched).
        NOTE(review): with m0..m3 = message words 0-3, 4-7, 8-11, 12-15
        (presumed; set up by the includer), that is words 15, 9, 3, 8. */
     191  #define LOAD_MSG_8_2(buf) \
     192  t0 = _mm_unpackhi_epi32(m0,m3); \
     193  t1 = _mm_blend_epi16(m2,t0,0xF0); \
     194  buf = _mm_shuffle_epi32(t1,_MM_SHUFFLE(0,2,1,3));
     195  
     /* buf = { m3[0], m3[1], m0[1], m2[2] } (32-bit lanes, lowest first).
        Expands to 3 statements and overwrites t0, t1 in the expanding
        scope (t2 is not touched). After the 4-byte left shift, lanes 2
        and 3 of t1 hold the old lanes 1 and 2 of t0; lanes 0 and 1 are
        then taken from m3.
        NOTE(review): with m0..m3 = message words 0-3, 4-7, 8-11, 12-15
        (presumed; set up by the includer), that is words 12, 13, 1, 10. */
     196  #define LOAD_MSG_8_3(buf) \
     197  t0 = _mm_blend_epi16(m2,m0,0x0C); \
     198  t1 = _mm_slli_si128(t0,4); \
     199  buf = _mm_blend_epi16(t1,m3,0x0F);
     200  
     /* buf = { m0[2], m1[3], m1[0], m1[1] } (32-bit lanes, lowest first).
        Expands to 2 statements and overwrites only t0 in the expanding
        scope. The shuffle swaps the lower and upper 64-bit halves of t0.
        NOTE(review): with m0..m3 = message words 0-3, 4-7, 8-11, 12-15
        (presumed; set up by the includer), that is words 2, 7, 4, 5. */
     201  #define LOAD_MSG_8_4(buf) \
     202  t0 = _mm_blend_epi16(m1,m0,0x30); \
     203  buf = _mm_shuffle_epi32(t0,_MM_SHUFFLE(1,0,3,2));
     204  
     /* buf = { m2[2], m2[0], m1[3], m0[1] } (32-bit lanes, lowest first).
        Expands to 4 statements and overwrites t0, t1, t2 in the expanding
        scope.
        NOTE(review): with m0..m3 = message words 0-3, 4-7, 8-11, 12-15
        (presumed; set up by the includer), that is words 10, 8, 7, 1. */
     205  #define LOAD_MSG_9_1(buf) \
     206  t0 = _mm_blend_epi16(m0,m2,0x03); \
     207  t1 = _mm_blend_epi16(m1,m2,0x30); \
     208  t2 = _mm_blend_epi16(t1,t0,0x0F); \
     209  buf = _mm_shuffle_epi32(t2,_MM_SHUFFLE(1,3,0,2));
     210  
     /* buf = { m0[2], m1[0], m1[2], m1[1] } (32-bit lanes, lowest first).
        Expands to 3 statements and overwrites t0, t1 in the expanding
        scope (t2 is not touched). _mm_slli_si128(m0, 4) moves m0[2] up to
        lane 3, the only lane the 0xC0 blend takes from t0.
        NOTE(review): with m0..m3 = message words 0-3, 4-7, 8-11, 12-15
        (presumed; set up by the includer), that is words 2, 4, 6, 5. */
     211  #define LOAD_MSG_9_2(buf) \
     212  t0 = _mm_slli_si128(m0,4); \
     213  t1 = _mm_blend_epi16(m1,t0,0xC0); \
     214  buf = _mm_shuffle_epi32(t1,_MM_SHUFFLE(1,2,0,3));
     215  
     /* buf = { m3[3], m2[1], m0[3], m3[1] } (32-bit lanes, lowest first).
        Expands to 4 statements and overwrites t0, t1, t2 in the expanding
        scope.
        NOTE(review): with m0..m3 = message words 0-3, 4-7, 8-11, 12-15
        (presumed; set up by the includer), that is words 15, 9, 3, 13. */
     216  #define LOAD_MSG_9_3(buf) \
     217  t0 = _mm_unpackhi_epi32(m0,m3); \
     218  t1 = _mm_unpacklo_epi32(m2,m3); \
     219  t2 = _mm_unpackhi_epi64(t0,t1); \
     220  buf = _mm_shuffle_epi32(t2,_MM_SHUFFLE(3,0,2,1));
     221  
     /* buf = { m2[3], m3[2], m3[0], m0[0] } (32-bit lanes, lowest first).
        Expands to 4 statements and overwrites t0, t1, t2 in the expanding
        scope. The final _MM_SHUFFLE(0,1,2,3) reverses the lane order.
        NOTE(review): with m0..m3 = message words 0-3, 4-7, 8-11, 12-15
        (presumed; set up by the includer), that is words 11, 14, 12, 0. */
     222  #define LOAD_MSG_9_4(buf) \
     223  t0 = _mm_blend_epi16(m3,m2,0xC0); \
     224  t1 = _mm_unpacklo_epi32(m0,m3); \
     225  t2 = _mm_blend_epi16(t0,t1,0x0F); \
     226  buf = _mm_shuffle_epi32(t2,_MM_SHUFFLE(0,1,2,3));
     227  
     228  #endif
     229