1  /*
       2     BLAKE2 reference source code package - optimized C implementations
       3  
       4     Written in 2012 by Samuel Neves <sneves@dei.uc.pt>
       5  
       6     To the extent possible under law, the author(s) have dedicated all copyright
       7     and related and neighboring rights to this software to the public domain
       8     worldwide. This software is distributed without any warranty.
       9  
      10     You should have received a copy of the CC0 Public Domain Dedication along with
      11     this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
      12  */
      13  #pragma once
      14  #ifndef __BLAKE2B_ROUND_H__
      15  #define __BLAKE2B_ROUND_H__
      16  
      17  #define LOAD(p)  _mm_load_si128( (__m128i *)(p) )
      18  #define STORE(p,r) _mm_store_si128((__m128i *)(p), r)
      19  
      20  #define LOADU(p)  _mm_loadu_si128( (__m128i *)(p) )
      21  #define STOREU(p,r) _mm_storeu_si128((__m128i *)(p), r)
      22  
      23  #define TOF(reg) _mm_castsi128_ps((reg))
      24  #define TOI(reg) _mm_castps_si128((reg))
      25  
      26  #define LIKELY(x) __builtin_expect((x),1)
      27  
      28  
      29  /* Microarchitecture-specific macros */
      30  #ifndef HAVE_XOP
      31  #ifdef HAVE_SSSE3
      32  #define _mm_roti_epi64(x, c) \
      33      (-(c) == 32) ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2,3,0,1))  \
      34      : (-(c) == 24) ? _mm_shuffle_epi8((x), r24) \
      35      : (-(c) == 16) ? _mm_shuffle_epi8((x), r16) \
      36      : (-(c) == 63) ? _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_add_epi64((x), (x)))  \
      37      : _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_slli_epi64((x), 64-(-(c))))
      38  #else
      39  #define _mm_roti_epi64(r, c) _mm_xor_si128(_mm_srli_epi64( (r), -(c) ),_mm_slli_epi64( (r), 64-(-(c)) ))
      40  #endif
      41  #else
      42  /* ... */
      43  #endif
      44  
      45  
      46  
      47  #define G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
      48    row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \
      49    row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \
      50    \
      51    row4l = _mm_xor_si128(row4l, row1l); \
      52    row4h = _mm_xor_si128(row4h, row1h); \
      53    \
      54    row4l = _mm_roti_epi64(row4l, -32); \
      55    row4h = _mm_roti_epi64(row4h, -32); \
      56    \
      57    row3l = _mm_add_epi64(row3l, row4l); \
      58    row3h = _mm_add_epi64(row3h, row4h); \
      59    \
      60    row2l = _mm_xor_si128(row2l, row3l); \
      61    row2h = _mm_xor_si128(row2h, row3h); \
      62    \
      63    row2l = _mm_roti_epi64(row2l, -24); \
      64    row2h = _mm_roti_epi64(row2h, -24); \
      65  
      66  #define G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
      67    row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \
      68    row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \
      69    \
      70    row4l = _mm_xor_si128(row4l, row1l); \
      71    row4h = _mm_xor_si128(row4h, row1h); \
      72    \
      73    row4l = _mm_roti_epi64(row4l, -16); \
      74    row4h = _mm_roti_epi64(row4h, -16); \
      75    \
      76    row3l = _mm_add_epi64(row3l, row4l); \
      77    row3h = _mm_add_epi64(row3h, row4h); \
      78    \
      79    row2l = _mm_xor_si128(row2l, row3l); \
      80    row2h = _mm_xor_si128(row2h, row3h); \
      81    \
      82    row2l = _mm_roti_epi64(row2l, -63); \
      83    row2h = _mm_roti_epi64(row2h, -63); \
      84  
      85  #if defined(HAVE_SSSE3)
      86  #define DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
      87    t0 = _mm_alignr_epi8(row2h, row2l, 8); \
      88    t1 = _mm_alignr_epi8(row2l, row2h, 8); \
      89    row2l = t0; \
      90    row2h = t1; \
      91    \
      92    t0 = row3l; \
      93    row3l = row3h; \
      94    row3h = t0;    \
      95    \
      96    t0 = _mm_alignr_epi8(row4h, row4l, 8); \
      97    t1 = _mm_alignr_epi8(row4l, row4h, 8); \
      98    row4l = t1; \
      99    row4h = t0;
     100  
     101  #define UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
     102    t0 = _mm_alignr_epi8(row2l, row2h, 8); \
     103    t1 = _mm_alignr_epi8(row2h, row2l, 8); \
     104    row2l = t0; \
     105    row2h = t1; \
     106    \
     107    t0 = row3l; \
     108    row3l = row3h; \
     109    row3h = t0; \
     110    \
     111    t0 = _mm_alignr_epi8(row4l, row4h, 8); \
     112    t1 = _mm_alignr_epi8(row4h, row4l, 8); \
     113    row4l = t1; \
     114    row4h = t0;
     115  #else
     116  
     117  #define DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
     118    t0 = row4l;\
     119    t1 = row2l;\
     120    row4l = row3l;\
     121    row3l = row3h;\
     122    row3h = row4l;\
     123    row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0)); \
     124    row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h)); \
     125    row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h)); \
     126    row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1))
     127  
     128  #define UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
     129    t0 = row3l;\
     130    row3l = row3h;\
     131    row3h = t0;\
     132    t0 = row2l;\
     133    t1 = row4l;\
     134    row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l)); \
     135    row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h)); \
     136    row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h)); \
     137    row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1))
     138  
     139  #endif
     140  
     141  #if defined(HAVE_SSE4_1)
     142  #include "blake2b-load-sse41.h"
     143  #else
     144  #include "blake2b-load-sse2.h"
     145  #endif
     146  
     147  #define ROUND(r) \
     148    LOAD_MSG_ ##r ##_1(b0, b1); \
     149    G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
     150    LOAD_MSG_ ##r ##_2(b0, b1); \
     151    G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
     152    DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
     153    LOAD_MSG_ ##r ##_3(b0, b1); \
     154    G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
     155    LOAD_MSG_ ##r ##_4(b0, b1); \
     156    G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
     157    UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h);
     158  
     159  #endif
     160