Crypto++  8.2
Free C++ class library of cryptographic schemes
blake2b_simd.cpp
// blake2b_simd.cpp - written and placed in the public domain by
//                    Samuel Neves, Jeffrey Walton, Uri Blumenthal
//                    and Marcel Raad.
//
// This source file uses intrinsics to gain access to ARMv7a/ARMv8a
// NEON, Power8 and SSE4.1 instructions. A separate source file is
// needed because additional CXXFLAGS are required to enable the
// appropriate instruction sets in some build configurations.
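//
// For example (an illustrative sketch, not the library's actual build
// recipe; exact flags depend on the build system and target), a
// GNU-style build might compile only this translation unit with
//
//   g++ -msse4.1 -c blake2b_simd.cpp
//
// while the rest of the library is built without -msse4.1, keeping the
// SSE4.1 code paths isolated behind runtime CPU feature checks.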

#include "pch.h"
#include "config.h"
#include "misc.h"
#include "blake2.h"

// Uncomment for benchmarking C++ against SSE4.1, NEON or Altivec.
// Do so in both blake2.cpp and blake2b_simd.cpp.
// #undef CRYPTOPP_SSE41_AVAILABLE
// #undef CRYPTOPP_ARM_NEON_AVAILABLE
// #undef CRYPTOPP_ALTIVEC_AVAILABLE

// Disable NEON/ASIMD for Cortex-A53 and A57. The shifts are too slow and C/C++ is about
// 3 cpb faster than NEON/ASIMD. Also see http://github.com/weidai11/cryptopp/issues/367.
#if (defined(__aarch32__) || defined(__aarch64__)) && defined(CRYPTOPP_SLOW_ARMV8_SHIFT)
# undef CRYPTOPP_ARM_NEON_AVAILABLE
#endif

// BLAKE2s bug on AIX 7.1 (POWER7) with XLC 12.01
// https://github.com/weidai11/cryptopp/issues/743
#if defined(__xlC__) && (__xlC__ < 0x0d01)
# define CRYPTOPP_DISABLE_ALTIVEC 1
# undef CRYPTOPP_POWER7_AVAILABLE
# undef CRYPTOPP_POWER8_AVAILABLE
# undef CRYPTOPP_ALTIVEC_AVAILABLE
#endif

#if (CRYPTOPP_SSE41_AVAILABLE)
# include <emmintrin.h>
# include <tmmintrin.h>
# include <smmintrin.h>
#endif

// C1189: error: This header is specific to ARM targets
#if (CRYPTOPP_ARM_NEON_AVAILABLE) && !defined(_M_ARM64)
# include <arm_neon.h>
#endif

#if (CRYPTOPP_ARM_ACLE_AVAILABLE)
# include <stdint.h>
# include <arm_acle.h>
#endif

#if (CRYPTOPP_POWER8_AVAILABLE)
# include "ppc_simd.h"
#endif

// Squash MS LNK4221 and libtool warnings
extern const char BLAKE2B_SIMD_FNAME[] = __FILE__;

NAMESPACE_BEGIN(CryptoPP)

// Exported by blake2.cpp
extern const word32 BLAKE2S_IV[8];
extern const word64 BLAKE2B_IV[8];

#if CRYPTOPP_SSE41_AVAILABLE

#define LOADU(p) _mm_loadu_si128((const __m128i *)(const void*)(p))
#define STOREU(p,r) _mm_storeu_si128((__m128i *)(void*)(p), r)
#define TOF(reg) _mm_castsi128_ps((reg))
#define TOI(reg) _mm_castps_si128((reg))

void BLAKE2_Compress64_SSE4(const byte* input, BLAKE2b_State& state)
{
    #define BLAKE2B_LOAD_MSG_0_1(b0, b1) \
    do { \
    b0 = _mm_unpacklo_epi64(m0, m1); \
    b1 = _mm_unpacklo_epi64(m2, m3); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_0_2(b0, b1) \
    do { \
    b0 = _mm_unpackhi_epi64(m0, m1); \
    b1 = _mm_unpackhi_epi64(m2, m3); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_0_3(b0, b1) \
    do { \
    b0 = _mm_unpacklo_epi64(m4, m5); \
    b1 = _mm_unpacklo_epi64(m6, m7); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_0_4(b0, b1) \
    do { \
    b0 = _mm_unpackhi_epi64(m4, m5); \
    b1 = _mm_unpackhi_epi64(m6, m7); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_1_1(b0, b1) \
    do { \
    b0 = _mm_unpacklo_epi64(m7, m2); \
    b1 = _mm_unpackhi_epi64(m4, m6); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_1_2(b0, b1) \
    do { \
    b0 = _mm_unpacklo_epi64(m5, m4); \
    b1 = _mm_alignr_epi8(m3, m7, 8); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_1_3(b0, b1) \
    do { \
    b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2)); \
    b1 = _mm_unpackhi_epi64(m5, m2); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_1_4(b0, b1) \
    do { \
    b0 = _mm_unpacklo_epi64(m6, m1); \
    b1 = _mm_unpackhi_epi64(m3, m1); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_2_1(b0, b1) \
    do { \
    b0 = _mm_alignr_epi8(m6, m5, 8); \
    b1 = _mm_unpackhi_epi64(m2, m7); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_2_2(b0, b1) \
    do { \
    b0 = _mm_unpacklo_epi64(m4, m0); \
    b1 = _mm_blend_epi16(m1, m6, 0xF0); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_2_3(b0, b1) \
    do { \
    b0 = _mm_blend_epi16(m5, m1, 0xF0); \
    b1 = _mm_unpackhi_epi64(m3, m4); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_2_4(b0, b1) \
    do { \
    b0 = _mm_unpacklo_epi64(m7, m3); \
    b1 = _mm_alignr_epi8(m2, m0, 8); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_3_1(b0, b1) \
    do { \
    b0 = _mm_unpackhi_epi64(m3, m1); \
    b1 = _mm_unpackhi_epi64(m6, m5); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_3_2(b0, b1) \
    do { \
    b0 = _mm_unpackhi_epi64(m4, m0); \
    b1 = _mm_unpacklo_epi64(m6, m7); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_3_3(b0, b1) \
    do { \
    b0 = _mm_blend_epi16(m1, m2, 0xF0); \
    b1 = _mm_blend_epi16(m2, m7, 0xF0); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_3_4(b0, b1) \
    do { \
    b0 = _mm_unpacklo_epi64(m3, m5); \
    b1 = _mm_unpacklo_epi64(m0, m4); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_4_1(b0, b1) \
    do { \
    b0 = _mm_unpackhi_epi64(m4, m2); \
    b1 = _mm_unpacklo_epi64(m1, m5); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_4_2(b0, b1) \
    do { \
    b0 = _mm_blend_epi16(m0, m3, 0xF0); \
    b1 = _mm_blend_epi16(m2, m7, 0xF0); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_4_3(b0, b1) \
    do { \
    b0 = _mm_blend_epi16(m7, m5, 0xF0); \
    b1 = _mm_blend_epi16(m3, m1, 0xF0); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_4_4(b0, b1) \
    do { \
    b0 = _mm_alignr_epi8(m6, m0, 8); \
    b1 = _mm_blend_epi16(m4, m6, 0xF0); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_5_1(b0, b1) \
    do { \
    b0 = _mm_unpacklo_epi64(m1, m3); \
    b1 = _mm_unpacklo_epi64(m0, m4); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_5_2(b0, b1) \
    do { \
    b0 = _mm_unpacklo_epi64(m6, m5); \
    b1 = _mm_unpackhi_epi64(m5, m1); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_5_3(b0, b1) \
    do { \
    b0 = _mm_blend_epi16(m2, m3, 0xF0); \
    b1 = _mm_unpackhi_epi64(m7, m0); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_5_4(b0, b1) \
    do { \
    b0 = _mm_unpackhi_epi64(m6, m2); \
    b1 = _mm_blend_epi16(m7, m4, 0xF0); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_6_1(b0, b1) \
    do { \
    b0 = _mm_blend_epi16(m6, m0, 0xF0); \
    b1 = _mm_unpacklo_epi64(m7, m2); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_6_2(b0, b1) \
    do { \
    b0 = _mm_unpackhi_epi64(m2, m7); \
    b1 = _mm_alignr_epi8(m5, m6, 8); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_6_3(b0, b1) \
    do { \
    b0 = _mm_unpacklo_epi64(m0, m3); \
    b1 = _mm_shuffle_epi32(m4, _MM_SHUFFLE(1,0,3,2)); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_6_4(b0, b1) \
    do { \
    b0 = _mm_unpackhi_epi64(m3, m1); \
    b1 = _mm_blend_epi16(m1, m5, 0xF0); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_7_1(b0, b1) \
    do { \
    b0 = _mm_unpackhi_epi64(m6, m3); \
    b1 = _mm_blend_epi16(m6, m1, 0xF0); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_7_2(b0, b1) \
    do { \
    b0 = _mm_alignr_epi8(m7, m5, 8); \
    b1 = _mm_unpackhi_epi64(m0, m4); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_7_3(b0, b1) \
    do { \
    b0 = _mm_unpackhi_epi64(m2, m7); \
    b1 = _mm_unpacklo_epi64(m4, m1); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_7_4(b0, b1) \
    do { \
    b0 = _mm_unpacklo_epi64(m0, m2); \
    b1 = _mm_unpacklo_epi64(m3, m5); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_8_1(b0, b1) \
    do { \
    b0 = _mm_unpacklo_epi64(m3, m7); \
    b1 = _mm_alignr_epi8(m0, m5, 8); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_8_2(b0, b1) \
    do { \
    b0 = _mm_unpackhi_epi64(m7, m4); \
    b1 = _mm_alignr_epi8(m4, m1, 8); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_8_3(b0, b1) \
    do { \
    b0 = m6; \
    b1 = _mm_alignr_epi8(m5, m0, 8); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_8_4(b0, b1) \
    do { \
    b0 = _mm_blend_epi16(m1, m3, 0xF0); \
    b1 = m2; \
    } while(0)

    #define BLAKE2B_LOAD_MSG_9_1(b0, b1) \
    do { \
    b0 = _mm_unpacklo_epi64(m5, m4); \
    b1 = _mm_unpackhi_epi64(m3, m0); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_9_2(b0, b1) \
    do { \
    b0 = _mm_unpacklo_epi64(m1, m2); \
    b1 = _mm_blend_epi16(m3, m2, 0xF0); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_9_3(b0, b1) \
    do { \
    b0 = _mm_unpackhi_epi64(m7, m4); \
    b1 = _mm_unpackhi_epi64(m1, m6); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_9_4(b0, b1) \
    do { \
    b0 = _mm_alignr_epi8(m7, m5, 8); \
    b1 = _mm_unpacklo_epi64(m6, m0); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_10_1(b0, b1) \
    do { \
    b0 = _mm_unpacklo_epi64(m0, m1); \
    b1 = _mm_unpacklo_epi64(m2, m3); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_10_2(b0, b1) \
    do { \
    b0 = _mm_unpackhi_epi64(m0, m1); \
    b1 = _mm_unpackhi_epi64(m2, m3); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_10_3(b0, b1) \
    do { \
    b0 = _mm_unpacklo_epi64(m4, m5); \
    b1 = _mm_unpacklo_epi64(m6, m7); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_10_4(b0, b1) \
    do { \
    b0 = _mm_unpackhi_epi64(m4, m5); \
    b1 = _mm_unpackhi_epi64(m6, m7); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_11_1(b0, b1) \
    do { \
    b0 = _mm_unpacklo_epi64(m7, m2); \
    b1 = _mm_unpackhi_epi64(m4, m6); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_11_2(b0, b1) \
    do { \
    b0 = _mm_unpacklo_epi64(m5, m4); \
    b1 = _mm_alignr_epi8(m3, m7, 8); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_11_3(b0, b1) \
    do { \
    b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2)); \
    b1 = _mm_unpackhi_epi64(m5, m2); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_11_4(b0, b1) \
    do { \
    b0 = _mm_unpacklo_epi64(m6, m1); \
    b1 = _mm_unpackhi_epi64(m3, m1); \
    } while(0)

#ifdef __XOP__
# define MM_ROTI_EPI64(r, c) \
    _mm_roti_epi64(r, c)
#else
# define MM_ROTI_EPI64(x, c) \
    (-(c) == 32) ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2,3,0,1)) \
    : (-(c) == 24) ? _mm_shuffle_epi8((x), r24) \
    : (-(c) == 16) ? _mm_shuffle_epi8((x), r16) \
    : (-(c) == 63) ? _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_add_epi64((x), (x))) \
    : _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_slli_epi64((x), 64-(-(c))))
#endif
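
// For reference (a sketch, not code used by this file): each
// MM_ROTI_EPI64(x, -c) computes a 64-bit rotate-right by c in both
// lanes of x, i.e. the scalar operation
//
//   word64 rotr64(word64 x, unsigned int c) {
//       return (x >> c) | (x << (64 - c));
//   }
//
// The 16- and 24-bit cases are byte-aligned, so they use PSHUFB with
// the r16 and r24 masks defined in the function body below; the
// 63-bit case uses x + x for the 1-bit left shift.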

#define BLAKE2B_G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
    row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \
    row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \
    \
    row4l = _mm_xor_si128(row4l, row1l); \
    row4h = _mm_xor_si128(row4h, row1h); \
    \
    row4l = MM_ROTI_EPI64(row4l, -32); \
    row4h = MM_ROTI_EPI64(row4h, -32); \
    \
    row3l = _mm_add_epi64(row3l, row4l); \
    row3h = _mm_add_epi64(row3h, row4h); \
    \
    row2l = _mm_xor_si128(row2l, row3l); \
    row2h = _mm_xor_si128(row2h, row3h); \
    \
    row2l = MM_ROTI_EPI64(row2l, -24); \
    row2h = MM_ROTI_EPI64(row2h, -24);

#define BLAKE2B_G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
    row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \
    row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \
    \
    row4l = _mm_xor_si128(row4l, row1l); \
    row4h = _mm_xor_si128(row4h, row1h); \
    \
    row4l = MM_ROTI_EPI64(row4l, -16); \
    row4h = MM_ROTI_EPI64(row4h, -16); \
    \
    row3l = _mm_add_epi64(row3l, row4l); \
    row3h = _mm_add_epi64(row3h, row4h); \
    \
    row2l = _mm_xor_si128(row2l, row3l); \
    row2h = _mm_xor_si128(row2h, row3h); \
    \
    row2l = MM_ROTI_EPI64(row2l, -63); \
    row2h = MM_ROTI_EPI64(row2h, -63);

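// The two macros above are the halves of the BLAKE2b G function from
// RFC 7693, vectorized two columns (or two diagonals) per 128-bit
// register pair. In scalar form (for reference), G(a, b, c, d, x, y)
// performs:
//
//   a = a + b + x;  d = rotr64(d ^ a, 32);
//   c = c + d;      b = rotr64(b ^ c, 24);
//   a = a + b + y;  d = rotr64(d ^ a, 16);
//   c = c + d;      b = rotr64(b ^ c, 63);
//
// BLAKE2B_G1 is the first half (rotates by 32 and 24), BLAKE2B_G2 the
// second (rotates by 16 and 63).
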
#define BLAKE2B_DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
    t0 = row4l;\
    t1 = row2l;\
    row4l = row3l;\
    row3l = row3h;\
    row3h = row4l;\
    row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0)); \
    row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h)); \
    row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h)); \
    row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1))

#define BLAKE2B_UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
    t0 = row3l;\
    row3l = row3h;\
    row3h = t0;\
    t0 = row2l;\
    t1 = row4l;\
    row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l)); \
    row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h)); \
    row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h)); \
    row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1))

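// BLAKE2b operates on a 4x4 matrix of 64-bit words, held here as four
// register pairs (rowNl, rowNh). DIAGONALIZE rotates rows 2, 3 and 4
// left by one, two and three words respectively so that BLAKE2B_G1/G2
// can process the matrix diagonals with the same column-wise code;
// UNDIAGONALIZE applies the inverse rotation.
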
#define BLAKE2B_ROUND(r) \
    BLAKE2B_LOAD_MSG_ ##r ##_1(b0, b1); \
    BLAKE2B_G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
    BLAKE2B_LOAD_MSG_ ##r ##_2(b0, b1); \
    BLAKE2B_G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
    BLAKE2B_DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
    BLAKE2B_LOAD_MSG_ ##r ##_3(b0, b1); \
    BLAKE2B_G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
    BLAKE2B_LOAD_MSG_ ##r ##_4(b0, b1); \
    BLAKE2B_G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
    BLAKE2B_UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h);
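
// The BLAKE2B_LOAD_MSG_r_i macros hard-code the BLAKE2b message
// schedule: round r reads the sixteen message words in the order given
// by the sigma permutation table of RFC 7693, e.g. (an excerpt)
//
//   sigma[0] = {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 }
//   sigma[1] = { 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 }
//
// Rounds 10 and 11 reuse sigma[0] and sigma[1], which is why the
// LOAD_MSG_10_* and LOAD_MSG_11_* macros repeat LOAD_MSG_0_* and
// LOAD_MSG_1_*.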

    __m128i row1l, row1h;
    __m128i row2l, row2h;
    __m128i row3l, row3h;
    __m128i row4l, row4h;
    __m128i b0, b1;
    __m128i t0, t1;

    const __m128i r16 = _mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9);
    const __m128i r24 = _mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10);

    const __m128i m0 = LOADU(input + 00);
    const __m128i m1 = LOADU(input + 16);
    const __m128i m2 = LOADU(input + 32);
    const __m128i m3 = LOADU(input + 48);
    const __m128i m4 = LOADU(input + 64);
    const __m128i m5 = LOADU(input + 80);
    const __m128i m6 = LOADU(input + 96);
    const __m128i m7 = LOADU(input + 112);

    row1l = LOADU(state.h()+0);
    row1h = LOADU(state.h()+2);
    row2l = LOADU(state.h()+4);
    row2h = LOADU(state.h()+6);
    row3l = LOADU(BLAKE2B_IV+0);
    row3h = LOADU(BLAKE2B_IV+2);
    row4l = _mm_xor_si128(LOADU(BLAKE2B_IV+4), LOADU(state.t()+0));
    row4h = _mm_xor_si128(LOADU(BLAKE2B_IV+6), LOADU(state.f()+0));

    BLAKE2B_ROUND(0);
    BLAKE2B_ROUND(1);
    BLAKE2B_ROUND(2);
    BLAKE2B_ROUND(3);
    BLAKE2B_ROUND(4);
    BLAKE2B_ROUND(5);
    BLAKE2B_ROUND(6);
    BLAKE2B_ROUND(7);
    BLAKE2B_ROUND(8);
    BLAKE2B_ROUND(9);
    BLAKE2B_ROUND(10);
    BLAKE2B_ROUND(11);

    row1l = _mm_xor_si128(row3l, row1l);
    row1h = _mm_xor_si128(row3h, row1h);
    STOREU(state.h()+0, _mm_xor_si128(LOADU(state.h()+0), row1l));
    STOREU(state.h()+2, _mm_xor_si128(LOADU(state.h()+2), row1h));
    row2l = _mm_xor_si128(row4l, row2l);
    row2h = _mm_xor_si128(row4h, row2h);
    STOREU(state.h()+4, _mm_xor_si128(LOADU(state.h()+4), row2l));
    STOREU(state.h()+6, _mm_xor_si128(LOADU(state.h()+6), row2h));
}
#endif // CRYPTOPP_SSE41_AVAILABLE

#if CRYPTOPP_ARM_NEON_AVAILABLE
void BLAKE2_Compress64_NEON(const byte* input, BLAKE2b_State& state)
{
    #define BLAKE2B_LOAD_MSG_0_1(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m0), vget_low_u64(m1)); b1 = vcombine_u64(vget_low_u64(m2), vget_low_u64(m3)); } while(0)

    #define BLAKE2B_LOAD_MSG_0_2(b0, b1) \
    do { b0 = vcombine_u64(vget_high_u64(m0), vget_high_u64(m1)); b1 = vcombine_u64(vget_high_u64(m2), vget_high_u64(m3)); } while(0)

    #define BLAKE2B_LOAD_MSG_0_3(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m4), vget_low_u64(m5)); b1 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m7)); } while(0)

    #define BLAKE2B_LOAD_MSG_0_4(b0, b1) \
    do { b0 = vcombine_u64(vget_high_u64(m4), vget_high_u64(m5)); b1 = vcombine_u64(vget_high_u64(m6), vget_high_u64(m7)); } while(0)

    #define BLAKE2B_LOAD_MSG_1_1(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m7), vget_low_u64(m2)); b1 = vcombine_u64(vget_high_u64(m4), vget_high_u64(m6)); } while(0)

    #define BLAKE2B_LOAD_MSG_1_2(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m5), vget_low_u64(m4)); b1 = vextq_u64(m7, m3, 1); } while(0)

    #define BLAKE2B_LOAD_MSG_1_3(b0, b1) \
    do { b0 = vextq_u64(m0, m0, 1); b1 = vcombine_u64(vget_high_u64(m5), vget_high_u64(m2)); } while(0)

    #define BLAKE2B_LOAD_MSG_1_4(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m1)); b1 = vcombine_u64(vget_high_u64(m3), vget_high_u64(m1)); } while(0)

    #define BLAKE2B_LOAD_MSG_2_1(b0, b1) \
    do { b0 = vextq_u64(m5, m6, 1); b1 = vcombine_u64(vget_high_u64(m2), vget_high_u64(m7)); } while(0)

    #define BLAKE2B_LOAD_MSG_2_2(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m4), vget_low_u64(m0)); b1 = vcombine_u64(vget_low_u64(m1), vget_high_u64(m6)); } while(0)

    #define BLAKE2B_LOAD_MSG_2_3(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m5), vget_high_u64(m1)); b1 = vcombine_u64(vget_high_u64(m3), vget_high_u64(m4)); } while(0)

    #define BLAKE2B_LOAD_MSG_2_4(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m7), vget_low_u64(m3)); b1 = vextq_u64(m0, m2, 1); } while(0)

    #define BLAKE2B_LOAD_MSG_3_1(b0, b1) \
    do { b0 = vcombine_u64(vget_high_u64(m3), vget_high_u64(m1)); b1 = vcombine_u64(vget_high_u64(m6), vget_high_u64(m5)); } while(0)

    #define BLAKE2B_LOAD_MSG_3_2(b0, b1) \
    do { b0 = vcombine_u64(vget_high_u64(m4), vget_high_u64(m0)); b1 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m7)); } while(0)

    #define BLAKE2B_LOAD_MSG_3_3(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m1), vget_high_u64(m2)); b1 = vcombine_u64(vget_low_u64(m2), vget_high_u64(m7)); } while(0)

    #define BLAKE2B_LOAD_MSG_3_4(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m3), vget_low_u64(m5)); b1 = vcombine_u64(vget_low_u64(m0), vget_low_u64(m4)); } while(0)

    #define BLAKE2B_LOAD_MSG_4_1(b0, b1) \
    do { b0 = vcombine_u64(vget_high_u64(m4), vget_high_u64(m2)); b1 = vcombine_u64(vget_low_u64(m1), vget_low_u64(m5)); } while(0)

    #define BLAKE2B_LOAD_MSG_4_2(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m0), vget_high_u64(m3)); b1 = vcombine_u64(vget_low_u64(m2), vget_high_u64(m7)); } while(0)

    #define BLAKE2B_LOAD_MSG_4_3(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m7), vget_high_u64(m5)); b1 = vcombine_u64(vget_low_u64(m3), vget_high_u64(m1)); } while(0)

    #define BLAKE2B_LOAD_MSG_4_4(b0, b1) \
    do { b0 = vextq_u64(m0, m6, 1); b1 = vcombine_u64(vget_low_u64(m4), vget_high_u64(m6)); } while(0)

    #define BLAKE2B_LOAD_MSG_5_1(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m1), vget_low_u64(m3)); b1 = vcombine_u64(vget_low_u64(m0), vget_low_u64(m4)); } while(0)

    #define BLAKE2B_LOAD_MSG_5_2(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m5)); b1 = vcombine_u64(vget_high_u64(m5), vget_high_u64(m1)); } while(0)

    #define BLAKE2B_LOAD_MSG_5_3(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m2), vget_high_u64(m3)); b1 = vcombine_u64(vget_high_u64(m7), vget_high_u64(m0)); } while(0)

    #define BLAKE2B_LOAD_MSG_5_4(b0, b1) \
    do { b0 = vcombine_u64(vget_high_u64(m6), vget_high_u64(m2)); b1 = vcombine_u64(vget_low_u64(m7), vget_high_u64(m4)); } while(0)

    #define BLAKE2B_LOAD_MSG_6_1(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m6), vget_high_u64(m0)); b1 = vcombine_u64(vget_low_u64(m7), vget_low_u64(m2)); } while(0)

    #define BLAKE2B_LOAD_MSG_6_2(b0, b1) \
    do { b0 = vcombine_u64(vget_high_u64(m2), vget_high_u64(m7)); b1 = vextq_u64(m6, m5, 1); } while(0)

    #define BLAKE2B_LOAD_MSG_6_3(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m0), vget_low_u64(m3)); b1 = vextq_u64(m4, m4, 1); } while(0)

    #define BLAKE2B_LOAD_MSG_6_4(b0, b1) \
    do { b0 = vcombine_u64(vget_high_u64(m3), vget_high_u64(m1)); b1 = vcombine_u64(vget_low_u64(m1), vget_high_u64(m5)); } while(0)

    #define BLAKE2B_LOAD_MSG_7_1(b0, b1) \
    do { b0 = vcombine_u64(vget_high_u64(m6), vget_high_u64(m3)); b1 = vcombine_u64(vget_low_u64(m6), vget_high_u64(m1)); } while(0)

    #define BLAKE2B_LOAD_MSG_7_2(b0, b1) \
    do { b0 = vextq_u64(m5, m7, 1); b1 = vcombine_u64(vget_high_u64(m0), vget_high_u64(m4)); } while(0)

    #define BLAKE2B_LOAD_MSG_7_3(b0, b1) \
    do { b0 = vcombine_u64(vget_high_u64(m2), vget_high_u64(m7)); b1 = vcombine_u64(vget_low_u64(m4), vget_low_u64(m1)); } while(0)

    #define BLAKE2B_LOAD_MSG_7_4(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m0), vget_low_u64(m2)); b1 = vcombine_u64(vget_low_u64(m3), vget_low_u64(m5)); } while(0)

    #define BLAKE2B_LOAD_MSG_8_1(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m3), vget_low_u64(m7)); b1 = vextq_u64(m5, m0, 1); } while(0)

    #define BLAKE2B_LOAD_MSG_8_2(b0, b1) \
    do { b0 = vcombine_u64(vget_high_u64(m7), vget_high_u64(m4)); b1 = vextq_u64(m1, m4, 1); } while(0)

    #define BLAKE2B_LOAD_MSG_8_3(b0, b1) \
    do { b0 = m6; b1 = vextq_u64(m0, m5, 1); } while(0)

    #define BLAKE2B_LOAD_MSG_8_4(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m1), vget_high_u64(m3)); b1 = m2; } while(0)

    #define BLAKE2B_LOAD_MSG_9_1(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m5), vget_low_u64(m4)); b1 = vcombine_u64(vget_high_u64(m3), vget_high_u64(m0)); } while(0)

    #define BLAKE2B_LOAD_MSG_9_2(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m1), vget_low_u64(m2)); b1 = vcombine_u64(vget_low_u64(m3), vget_high_u64(m2)); } while(0)

    #define BLAKE2B_LOAD_MSG_9_3(b0, b1) \
    do { b0 = vcombine_u64(vget_high_u64(m7), vget_high_u64(m4)); b1 = vcombine_u64(vget_high_u64(m1), vget_high_u64(m6)); } while(0)

    #define BLAKE2B_LOAD_MSG_9_4(b0, b1) \
    do { b0 = vextq_u64(m5, m7, 1); b1 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m0)); } while(0)

    #define BLAKE2B_LOAD_MSG_10_1(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m0), vget_low_u64(m1)); b1 = vcombine_u64(vget_low_u64(m2), vget_low_u64(m3)); } while(0)

    #define BLAKE2B_LOAD_MSG_10_2(b0, b1) \
    do { b0 = vcombine_u64(vget_high_u64(m0), vget_high_u64(m1)); b1 = vcombine_u64(vget_high_u64(m2), vget_high_u64(m3)); } while(0)

    #define BLAKE2B_LOAD_MSG_10_3(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m4), vget_low_u64(m5)); b1 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m7)); } while(0)

    #define BLAKE2B_LOAD_MSG_10_4(b0, b1) \
    do { b0 = vcombine_u64(vget_high_u64(m4), vget_high_u64(m5)); b1 = vcombine_u64(vget_high_u64(m6), vget_high_u64(m7)); } while(0)

    #define BLAKE2B_LOAD_MSG_11_1(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m7), vget_low_u64(m2)); b1 = vcombine_u64(vget_high_u64(m4), vget_high_u64(m6)); } while(0)

    #define BLAKE2B_LOAD_MSG_11_2(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m5), vget_low_u64(m4)); b1 = vextq_u64(m7, m3, 1); } while(0)

    #define BLAKE2B_LOAD_MSG_11_3(b0, b1) \
    do { b0 = vextq_u64(m0, m0, 1); b1 = vcombine_u64(vget_high_u64(m5), vget_high_u64(m2)); } while(0)

    #define BLAKE2B_LOAD_MSG_11_4(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m1)); b1 = vcombine_u64(vget_high_u64(m3), vget_high_u64(m1)); } while(0)

    #define vrorq_n_u64_32(x) vreinterpretq_u64_u32(vrev64q_u32(vreinterpretq_u32_u64((x))))

    #define vrorq_n_u64_24(x) vcombine_u64( \
        vreinterpret_u64_u8(vext_u8(vreinterpret_u8_u64(vget_low_u64(x)), vreinterpret_u8_u64(vget_low_u64(x)), 3)), \
        vreinterpret_u64_u8(vext_u8(vreinterpret_u8_u64(vget_high_u64(x)), vreinterpret_u8_u64(vget_high_u64(x)), 3)))

    #define vrorq_n_u64_16(x) vcombine_u64( \
        vreinterpret_u64_u8(vext_u8(vreinterpret_u8_u64(vget_low_u64(x)), vreinterpret_u8_u64(vget_low_u64(x)), 2)), \
        vreinterpret_u64_u8(vext_u8(vreinterpret_u8_u64(vget_high_u64(x)), vreinterpret_u8_u64(vget_high_u64(x)), 2)))

    #define vrorq_n_u64_63(x) veorq_u64(vaddq_u64(x, x), vshrq_n_u64(x, 63))
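
    // A note for reference: each vrorq_n_u64_c(x) emulates a 64-bit
    // rotate-right by c in both lanes. Rotation by 32 is a 32-bit
    // element reversal within each 64-bit lane (vrev64q_u32); 16 and
    // 24 are byte rotations done with vext_u8; and rotation by 63
    // equals a rotate-left by 1, computed as (x + x) ^ (x >> 63).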

    #define BLAKE2B_G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
    do { \
      row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l); \
      row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h); \
      row4l = veorq_u64(row4l, row1l); row4h = veorq_u64(row4h, row1h); \
      row4l = vrorq_n_u64_32(row4l); row4h = vrorq_n_u64_32(row4h); \
      row3l = vaddq_u64(row3l, row4l); row3h = vaddq_u64(row3h, row4h); \
      row2l = veorq_u64(row2l, row3l); row2h = veorq_u64(row2h, row3h); \
      row2l = vrorq_n_u64_24(row2l); row2h = vrorq_n_u64_24(row2h); \
    } while(0)

    #define BLAKE2B_G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
    do { \
      row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l); \
      row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h); \
      row4l = veorq_u64(row4l, row1l); row4h = veorq_u64(row4h, row1h); \
      row4l = vrorq_n_u64_16(row4l); row4h = vrorq_n_u64_16(row4h); \
      row3l = vaddq_u64(row3l, row4l); row3h = vaddq_u64(row3h, row4h); \
      row2l = veorq_u64(row2l, row3l); row2h = veorq_u64(row2h, row3h); \
      row2l = vrorq_n_u64_63(row2l); row2h = vrorq_n_u64_63(row2h); \
    } while(0)

    #define BLAKE2B_DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
    do { \
      uint64x2_t t0 = vextq_u64(row2l, row2h, 1); \
      uint64x2_t t1 = vextq_u64(row2h, row2l, 1); \
      row2l = t0; row2h = t1; t0 = row3l; row3l = row3h; row3h = t0; \
      t0 = vextq_u64(row4h, row4l, 1); t1 = vextq_u64(row4l, row4h, 1); \
      row4l = t0; row4h = t1; \
    } while(0)

    #define BLAKE2B_UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
    do { \
      uint64x2_t t0 = vextq_u64(row2h, row2l, 1); \
      uint64x2_t t1 = vextq_u64(row2l, row2h, 1); \
      row2l = t0; row2h = t1; t0 = row3l; row3l = row3h; row3h = t0; \
      t0 = vextq_u64(row4l, row4h, 1); t1 = vextq_u64(row4h, row4l, 1); \
      row4l = t0; row4h = t1; \
    } while(0)
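
    // For reference: vextq_u64(a, b, 1) concatenates a and b and
    // extracts the middle two lanes, i.e. { a[1], b[0] }. The
    // diagonalize/undiagonalize steps use it to rotate the row 2 and
    // row 4 lane pairs directly, without the unpack sequences needed
    // in the SSE4.1 version.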

    #define BLAKE2B_ROUND(r) \
    do { \
      uint64x2_t b0, b1; \
      BLAKE2B_LOAD_MSG_ ##r ##_1(b0, b1); \
      BLAKE2B_G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
      BLAKE2B_LOAD_MSG_ ##r ##_2(b0, b1); \
      BLAKE2B_G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
      BLAKE2B_DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
      BLAKE2B_LOAD_MSG_ ##r ##_3(b0, b1); \
      BLAKE2B_G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
      BLAKE2B_LOAD_MSG_ ##r ##_4(b0, b1); \
      BLAKE2B_G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
      BLAKE2B_UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
    } while(0)

    const uint64x2_t m0 = vreinterpretq_u64_u8(vld1q_u8(input + 00));
    const uint64x2_t m1 = vreinterpretq_u64_u8(vld1q_u8(input + 16));
    const uint64x2_t m2 = vreinterpretq_u64_u8(vld1q_u8(input + 32));
    const uint64x2_t m3 = vreinterpretq_u64_u8(vld1q_u8(input + 48));
    const uint64x2_t m4 = vreinterpretq_u64_u8(vld1q_u8(input + 64));
    const uint64x2_t m5 = vreinterpretq_u64_u8(vld1q_u8(input + 80));
    const uint64x2_t m6 = vreinterpretq_u64_u8(vld1q_u8(input + 96));
    const uint64x2_t m7 = vreinterpretq_u64_u8(vld1q_u8(input + 112));

    uint64x2_t row1l, row1h, row2l, row2h;
    uint64x2_t row3l, row3h, row4l, row4h;

    const uint64x2_t h0 = row1l = vld1q_u64(state.h()+0);
    const uint64x2_t h1 = row1h = vld1q_u64(state.h()+2);
    const uint64x2_t h2 = row2l = vld1q_u64(state.h()+4);
    const uint64x2_t h3 = row2h = vld1q_u64(state.h()+6);

    row3l = vld1q_u64(BLAKE2B_IV+0);
    row3h = vld1q_u64(BLAKE2B_IV+2);
    row4l = veorq_u64(vld1q_u64(BLAKE2B_IV+4), vld1q_u64(state.t()+0));
    row4h = veorq_u64(vld1q_u64(BLAKE2B_IV+6), vld1q_u64(state.f()+0));

    BLAKE2B_ROUND(0);
    BLAKE2B_ROUND(1);
    BLAKE2B_ROUND(2);
    BLAKE2B_ROUND(3);
    BLAKE2B_ROUND(4);
    BLAKE2B_ROUND(5);
    BLAKE2B_ROUND(6);
    BLAKE2B_ROUND(7);
    BLAKE2B_ROUND(8);
    BLAKE2B_ROUND(9);
    BLAKE2B_ROUND(10);
    BLAKE2B_ROUND(11);

    vst1q_u64(state.h()+0, veorq_u64(h0, veorq_u64(row1l, row3l)));
    vst1q_u64(state.h()+2, veorq_u64(h1, veorq_u64(row1h, row3h)));
    vst1q_u64(state.h()+4, veorq_u64(h2, veorq_u64(row2l, row4l)));
    vst1q_u64(state.h()+6, veorq_u64(h3, veorq_u64(row2h, row4h)));
}
#endif // CRYPTOPP_ARM_NEON_AVAILABLE

#if (CRYPTOPP_POWER8_AVAILABLE)

inline uint64x2_p VecLoad64(const void* p)
{
#if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
    return (uint64x2_p)vec_xl(0, (uint8_t*)p);
#else
    return (uint64x2_p)vec_vsx_ld(0, (uint8_t*)p);
#endif
}

inline uint64x2_p VecLoad64LE(const void* p)
{
#if __BIG_ENDIAN__
    const uint8x16_p m = {7,6,5,4, 3,2,1,0, 15,14,13,12, 11,10,9,8};
    const uint64x2_p v = VecLoad64(p);
    return VecPermute(v, v, m);
#else
    return VecLoad64(p);
#endif
}
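
// BLAKE2 is specified with little-endian word order, so on big-endian
// POWER targets the LE load and store helpers byte-swap each 64-bit
// lane with a permute; on little-endian targets they reduce to plain
// loads and stores.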

inline void VecStore64(void* p, const uint64x2_p x)
{
#if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
    vec_xst((uint8x16_p)x,0,(uint8_t*)p);
#else
    vec_vsx_st((uint8x16_p)x,0,(uint8_t*)p);
#endif
}

inline void VecStore64LE(void* p, const uint64x2_p x)
{
#if __BIG_ENDIAN__
    const uint8x16_p m = {7,6,5,4, 3,2,1,0, 15,14,13,12, 11,10,9,8};
    VecStore64(p, VecPermute(x, x, m));
#else
    VecStore64(p, x);
#endif
}

template <unsigned int C>
inline uint64x2_p VecShiftLeftOctet(const uint64x2_p a, const uint64x2_p b)
{
#if __BIG_ENDIAN__
    return (uint64x2_p)vec_sld((uint8x16_p)a, (uint8x16_p)b, C);
#else
    return (uint64x2_p)vec_sld((uint8x16_p)b, (uint8x16_p)a, 16-C);
#endif
}

#define vec_shl_octet(a,b,c) VecShiftLeftOctet<c*8>(a, b)
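
// A note on vec_shl_octet(a, b, c): roughly speaking, it shifts the
// 32-byte concatenation a:b left by c*8 bytes and keeps 16 bytes, so
// with c == 1 it yields the logical 64-bit words { a[1], b[0] }. It
// plays the role of _mm_alignr_epi8 in the SSE4.1 path, with the
// operand order and shift amount adjusted for endianness inside
// VecShiftLeftOctet.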

// vec_mergeh(a,b) is equivalent to VecPermute(a,b,HH_MASK); and
// vec_mergel(a,b) is equivalent to VecPermute(a,b,LL_MASK). Benchmarks
// show vec_mergeh and vec_mergel are faster on little-endian
// machines by 0.4 cpb, while VecPermute is faster on big-endian
// machines by 1.5 cpb. The code that uses vec_mergeh and
// vec_mergel is also about 880 bytes shorter.

#if defined(__GNUC__) && (__BIG_ENDIAN__)
# define vec_merge_hi(a,b) VecPermute(a,b, HH_MASK)
# define vec_merge_lo(a,b) VecPermute(a,b, LL_MASK)
#else
# define vec_merge_hi(a,b) vec_mergeh(a,b)
# define vec_merge_lo(a,b) vec_mergel(a,b)
#endif

void BLAKE2_Compress64_POWER8(const byte* input, BLAKE2b_State& state)
{
    // Permute masks. High is element 0 (most significant),
    // low is element 1 (least significant).

#if defined(__GNUC__) && (__BIG_ENDIAN__)
    const uint8x16_p HH_MASK = { 0,1,2,3,4,5,6,7, 16,17,18,19,20,21,22,23 };
    const uint8x16_p LL_MASK = { 8,9,10,11,12,13,14,15, 24,25,26,27,28,29,30,31 };
#endif

    const uint8x16_p HL_MASK = { 0,1,2,3,4,5,6,7, 24,25,26,27,28,29,30,31 };
    const uint8x16_p LH_MASK = { 8,9,10,11,12,13,14,15, 16,17,18,19,20,21,22,23 };

    #define BLAKE2B_LOAD_MSG_0_1(b0, b1) \
    do { \
      b0 = vec_merge_hi(m0, m1); \
      b1 = vec_merge_hi(m2, m3); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_0_2(b0, b1) \
    do { \
      b0 = vec_merge_lo(m0, m1); \
      b1 = vec_merge_lo(m2, m3); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_0_3(b0, b1) \
    do { \
      b0 = vec_merge_hi(m4, m5); \
      b1 = vec_merge_hi(m6, m7); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_0_4(b0, b1) \
    do { \
      b0 = vec_merge_lo(m4, m5); \
      b1 = vec_merge_lo(m6, m7); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_1_1(b0, b1) \
    do { \
      b0 = vec_merge_hi(m7, m2); \
      b1 = vec_merge_lo(m4, m6); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_1_2(b0, b1) \
    do { \
      b0 = vec_merge_hi(m5, m4); \
      b1 = vec_shl_octet(m7, m3, 1); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_1_3(b0, b1) \
    do { \
      b0 = vec_shl_octet(m0, m0, 1); \
      b1 = vec_merge_lo(m5, m2); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_1_4(b0, b1) \
    do { \
      b0 = vec_merge_hi(m6, m1); \
      b1 = vec_merge_lo(m3, m1); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_2_1(b0, b1) \
    do { \
      b0 = vec_shl_octet(m5, m6, 1); \
      b1 = vec_merge_lo(m2, m7); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_2_2(b0, b1) \
    do { \
      b0 = vec_merge_hi(m4, m0); \
      b1 = VecPermute(m1, m6, HL_MASK); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_2_3(b0, b1) \
    do { \
      b0 = VecPermute(m5, m1, HL_MASK); \
      b1 = vec_merge_lo(m3, m4); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_2_4(b0, b1) \
    do { \
      b0 = vec_merge_hi(m7, m3); \
      b1 = vec_shl_octet(m0, m2, 1); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_3_1(b0, b1) \
    do { \
      b0 = vec_merge_lo(m3, m1); \
      b1 = vec_merge_lo(m6, m5); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_3_2(b0, b1) \
    do { \
      b0 = vec_merge_lo(m4, m0); \
      b1 = vec_merge_hi(m6, m7); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_3_3(b0, b1) \
    do { \
      b0 = VecPermute(m1, m2, HL_MASK); \
      b1 = VecPermute(m2, m7, HL_MASK); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_3_4(b0, b1) \
    do { \
      b0 = vec_merge_hi(m3, m5); \
      b1 = vec_merge_hi(m0, m4); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_4_1(b0, b1) \
    do { \
      b0 = vec_merge_lo(m4, m2); \
      b1 = vec_merge_hi(m1, m5); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_4_2(b0, b1) \
    do { \
      b0 = VecPermute(m0, m3, HL_MASK); \
      b1 = VecPermute(m2, m7, HL_MASK); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_4_3(b0, b1) \
    do { \
      b0 = VecPermute(m7, m5, HL_MASK); \
      b1 = VecPermute(m3, m1, HL_MASK); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_4_4(b0, b1) \
    do { \
      b0 = vec_shl_octet(m0, m6, 1); \
      b1 = VecPermute(m4, m6, HL_MASK); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_5_1(b0, b1) \
    do { \
      b0 = vec_merge_hi(m1, m3); \
      b1 = vec_merge_hi(m0, m4); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_5_2(b0, b1) \
    do { \
      b0 = vec_merge_hi(m6, m5); \
      b1 = vec_merge_lo(m5, m1); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_5_3(b0, b1) \
    do { \
      b0 = VecPermute(m2, m3, HL_MASK); \
      b1 = vec_merge_lo(m7, m0); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_5_4(b0, b1) \
    do { \
      b0 = vec_merge_lo(m6, m2); \
      b1 = VecPermute(m7, m4, HL_MASK); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_6_1(b0, b1) \
    do { \
      b0 = VecPermute(m6, m0, HL_MASK); \
      b1 = vec_merge_hi(m7, m2); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_6_2(b0, b1) \
    do { \
      b0 = vec_merge_lo(m2, m7); \
      b1 = vec_shl_octet(m6, m5, 1); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_6_3(b0, b1) \
    do { \
      b0 = vec_merge_hi(m0, m3); \
      b1 = vec_shl_octet(m4, m4, 1); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_6_4(b0, b1) \
    do { \
      b0 = vec_merge_lo(m3, m1); \
      b1 = VecPermute(m1, m5, HL_MASK); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_7_1(b0, b1) \
    do { \
      b0 = vec_merge_lo(m6, m3); \
      b1 = VecPermute(m6, m1, HL_MASK); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_7_2(b0, b1) \
    do { \
      b0 = vec_shl_octet(m5, m7, 1); \
      b1 = vec_merge_lo(m0, m4); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_7_3(b0, b1) \
    do { \
      b0 = vec_merge_lo(m2, m7); \
      b1 = vec_merge_hi(m4, m1); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_7_4(b0, b1) \
    do { \
      b0 = vec_merge_hi(m0, m2); \
      b1 = vec_merge_hi(m3, m5); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_8_1(b0, b1) \
    do { \
      b0 = vec_merge_hi(m3, m7); \
      b1 = vec_shl_octet(m5, m0, 1); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_8_2(b0, b1) \
    do { \
      b0 = vec_merge_lo(m7, m4); \
      b1 = vec_shl_octet(m1, m4, 1); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_8_3(b0, b1) \
    do { \
      b0 = m6; \
      b1 = vec_shl_octet(m0, m5, 1); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_8_4(b0, b1) \
    do { \
      b0 = VecPermute(m1, m3, HL_MASK); \
      b1 = m2; \
    } while(0)

    #define BLAKE2B_LOAD_MSG_9_1(b0, b1) \
    do { \
      b0 = vec_merge_hi(m5, m4); \
      b1 = vec_merge_lo(m3, m0); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_9_2(b0, b1) \
    do { \
      b0 = vec_merge_hi(m1, m2); \
      b1 = VecPermute(m3, m2, HL_MASK); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_9_3(b0, b1) \
    do { \
      b0 = vec_merge_lo(m7, m4); \
      b1 = vec_merge_lo(m1, m6); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_9_4(b0, b1) \
    do { \
      b0 = vec_shl_octet(m5, m7, 1); \
      b1 = vec_merge_hi(m6, m0); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_10_1(b0, b1) \
    do { \
      b0 = vec_merge_hi(m0, m1); \
      b1 = vec_merge_hi(m2, m3); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_10_2(b0, b1) \
    do { \
      b0 = vec_merge_lo(m0, m1); \
      b1 = vec_merge_lo(m2, m3); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_10_3(b0, b1) \
    do { \
      b0 = vec_merge_hi(m4, m5); \
      b1 = vec_merge_hi(m6, m7); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_10_4(b0, b1) \
    do { \
      b0 = vec_merge_lo(m4, m5); \
      b1 = vec_merge_lo(m6, m7); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_11_1(b0, b1) \
    do { \
      b0 = vec_merge_hi(m7, m2); \
      b1 = vec_merge_lo(m4, m6); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_11_2(b0, b1) \
    do { \
      b0 = vec_merge_hi(m5, m4); \
      b1 = vec_shl_octet(m7, m3, 1); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_11_3(b0, b1) \
    do { \
      b0 = vec_shl_octet(m0, m0, 1); \
      b1 = vec_merge_lo(m5, m2); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_11_4(b0, b1) \
    do { \
      b0 = vec_merge_hi(m6, m1); \
      b1 = vec_merge_lo(m3, m1); \
    } while(0)

    // POWER8 vec_rl provides a packed 64-bit rotate, but only as a
    // left rotate; a rotate-right by c is expressed as a rotate-left
    // by 64-c.
    const uint64x2_p ROR16_MASK = { 64-16, 64-16 };
    const uint64x2_p ROR24_MASK = { 64-24, 64-24 };
    const uint64x2_p ROR32_MASK = { 64-32, 64-32 };
    const uint64x2_p ROR63_MASK = { 64-63, 64-63 };

    #define vec_ror_32(x) vec_rl(x, ROR32_MASK)
    #define vec_ror_24(x) vec_rl(x, ROR24_MASK)
    #define vec_ror_16(x) vec_rl(x, ROR16_MASK)
    #define vec_ror_63(x) vec_rl(x, ROR63_MASK)
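
    // For example, vec_ror_16(x) rotates each 64-bit lane left by 48
    // bits, which is the same as rotating it right by 16, matching
    // MM_ROTI_EPI64(x, -16) in the SSE4.1 path and vrorq_n_u64_16 in
    // the NEON path.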

    #define BLAKE2B_G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
    do { \
      row1l = VecAdd(VecAdd(row1l, b0), row2l); \
      row1h = VecAdd(VecAdd(row1h, b1), row2h); \
      row4l = VecXor(row4l, row1l); row4h = VecXor(row4h, row1h); \
      row4l = vec_ror_32(row4l); row4h = vec_ror_32(row4h); \
      row3l = VecAdd(row3l, row4l); row3h = VecAdd(row3h, row4h); \
      row2l = VecXor(row2l, row3l); row2h = VecXor(row2h, row3h); \
      row2l = vec_ror_24(row2l); row2h = vec_ror_24(row2h); \
    } while(0)

    #define BLAKE2B_G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
    do { \
      row1l = VecAdd(VecAdd(row1l, b0), row2l); \
      row1h = VecAdd(VecAdd(row1h, b1), row2h); \
      row4l = VecXor(row4l, row1l); row4h = VecXor(row4h, row1h); \
      row4l = vec_ror_16(row4l); row4h = vec_ror_16(row4h); \
      row3l = VecAdd(row3l, row4l); row3h = VecAdd(row3h, row4h); \
      row2l = VecXor(row2l, row3l); row2h = VecXor(row2h, row3h); \
      row2l = vec_ror_63(row2l); row2h = vec_ror_63(row2h); \
    } while(0)

    #define BLAKE2B_DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
    do { \
      uint64x2_p t0 = vec_shl_octet(row2l, row2h, 1); \
      uint64x2_p t1 = vec_shl_octet(row2h, row2l, 1); \
      row2l = t0; row2h = t1; t0 = row3l; row3l = row3h; row3h = t0; \
      t0 = vec_shl_octet(row4h, row4l, 1); t1 = vec_shl_octet(row4l, row4h, 1); \
      row4l = t0; row4h = t1; \
    } while(0)

    #define BLAKE2B_UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
    do { \
      uint64x2_p t0 = vec_shl_octet(row2h, row2l, 1); \
      uint64x2_p t1 = vec_shl_octet(row2l, row2h, 1); \
      row2l = t0; row2h = t1; t0 = row3l; row3l = row3h; row3h = t0; \
      t0 = vec_shl_octet(row4l, row4h, 1); t1 = vec_shl_octet(row4h, row4l, 1); \
      row4l = t0; row4h = t1; \
    } while(0)

    #define BLAKE2B_ROUND(r) \
    do { \
      uint64x2_p b0, b1; \
      BLAKE2B_LOAD_MSG_ ##r ##_1(b0, b1); \
      BLAKE2B_G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
      BLAKE2B_LOAD_MSG_ ##r ##_2(b0, b1); \
      BLAKE2B_G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
      BLAKE2B_DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
      BLAKE2B_LOAD_MSG_ ##r ##_3(b0, b1); \
      BLAKE2B_G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
      BLAKE2B_LOAD_MSG_ ##r ##_4(b0, b1); \
      BLAKE2B_G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
      BLAKE2B_UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
    } while(0)

    const uint64x2_p m0 = VecLoad64LE(input + 00);
    const uint64x2_p m1 = VecLoad64LE(input + 16);
    const uint64x2_p m2 = VecLoad64LE(input + 32);
    const uint64x2_p m3 = VecLoad64LE(input + 48);
    const uint64x2_p m4 = VecLoad64LE(input + 64);
    const uint64x2_p m5 = VecLoad64LE(input + 80);
    const uint64x2_p m6 = VecLoad64LE(input + 96);
    const uint64x2_p m7 = VecLoad64LE(input + 112);

    uint64x2_p row1l, row1h, row2l, row2h;
    uint64x2_p row3l, row3h, row4l, row4h;

    const uint64x2_p h0 = row1l = VecLoad64LE(state.h()+0);
    const uint64x2_p h1 = row1h = VecLoad64LE(state.h()+2);
    const uint64x2_p h2 = row2l = VecLoad64LE(state.h()+4);
    const uint64x2_p h3 = row2h = VecLoad64LE(state.h()+6);

    row3l = VecLoad64(BLAKE2B_IV+0);
    row3h = VecLoad64(BLAKE2B_IV+2);
    row4l = VecXor(VecLoad64(BLAKE2B_IV+4), VecLoad64(state.t()+0));
    row4h = VecXor(VecLoad64(BLAKE2B_IV+6), VecLoad64(state.f()+0));

    BLAKE2B_ROUND(0);
    BLAKE2B_ROUND(1);
    BLAKE2B_ROUND(2);
    BLAKE2B_ROUND(3);
    BLAKE2B_ROUND(4);
    BLAKE2B_ROUND(5);
    BLAKE2B_ROUND(6);
    BLAKE2B_ROUND(7);
    BLAKE2B_ROUND(8);
    BLAKE2B_ROUND(9);
    BLAKE2B_ROUND(10);
    BLAKE2B_ROUND(11);

    VecStore64LE(state.h()+0, VecXor(h0, VecXor(row1l, row3l)));
    VecStore64LE(state.h()+2, VecXor(h1, VecXor(row1h, row3h)));
    VecStore64LE(state.h()+4, VecXor(h2, VecXor(row2l, row4l)));
    VecStore64LE(state.h()+6, VecXor(h3, VecXor(row2h, row4h)));
}
#endif // CRYPTOPP_POWER8_AVAILABLE

NAMESPACE_END