#if (defined(__aarch32__) || defined(__aarch64__)) && defined(CRYPTOPP_SLOW_ARMV8_SHIFT)
# undef CRYPTOPP_ARM_NEON_AVAILABLE
#endif

#if defined(__xlC__) && (__xlC__ < 0x0d01)
# define CRYPTOPP_DISABLE_ALTIVEC 1
# undef CRYPTOPP_POWER7_AVAILABLE
# undef CRYPTOPP_POWER8_AVAILABLE
# undef CRYPTOPP_ALTIVEC_AVAILABLE
#endif

#if (CRYPTOPP_SSE41_AVAILABLE)
# include <emmintrin.h>
# include <tmmintrin.h>
# include <smmintrin.h>
#endif

#if (CRYPTOPP_ARM_NEON_AVAILABLE) && !defined(_M_ARM64)
# include <arm_neon.h>
#endif

#if (CRYPTOPP_ARM_ACLE_AVAILABLE)
# include <arm_acle.h>
#endif

#if (CRYPTOPP_POWER8_AVAILABLE)
# include "ppc_simd.h"
#endif

extern const char BLAKE2B_SIMD_FNAME[] = __FILE__;
extern const word32 BLAKE2S_IV[8];
extern const word64 BLAKE2B_IV[8];
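
// Each routine below implements the BLAKE2b compression function for a single
// 128-byte message block, one implementation per instruction set: SSE4.1 (x86/x64),
// NEON (ARM), and POWER8 (PowerPC). The 4x4 matrix of 64-bit state words is held in
// eight 128-bit registers, split into low/high halves (row1l/row1h ... row4l/row4h).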

#if CRYPTOPP_SSE41_AVAILABLE

#define LOADU(p) _mm_loadu_si128((const __m128i *)(const void*)(p))
#define STOREU(p,r) _mm_storeu_si128((__m128i *)(void*)(p), r)
#define TOF(reg) _mm_castsi128_ps((reg))
#define TOI(reg) _mm_castps_si128((reg))

void BLAKE2_Compress64_SSE4(const byte* input, BLAKE2b_State& state)
{
// Message schedule: BLAKE2B_LOAD_MSG_<r>_<i> gathers the message words used by
// round r, step i, following the BLAKE2b sigma permutation.
#define BLAKE2B_LOAD_MSG_0_1(b0, b1) \
    do { b0 = _mm_unpacklo_epi64(m0, m1); b1 = _mm_unpacklo_epi64(m2, m3); } while(0)
#define BLAKE2B_LOAD_MSG_0_2(b0, b1) \
    do { b0 = _mm_unpackhi_epi64(m0, m1); b1 = _mm_unpackhi_epi64(m2, m3); } while(0)
#define BLAKE2B_LOAD_MSG_0_3(b0, b1) \
    do { b0 = _mm_unpacklo_epi64(m4, m5); b1 = _mm_unpacklo_epi64(m6, m7); } while(0)
#define BLAKE2B_LOAD_MSG_0_4(b0, b1) \
    do { b0 = _mm_unpackhi_epi64(m4, m5); b1 = _mm_unpackhi_epi64(m6, m7); } while(0)
#define BLAKE2B_LOAD_MSG_1_1(b0, b1) \
    do { b0 = _mm_unpacklo_epi64(m7, m2); b1 = _mm_unpackhi_epi64(m4, m6); } while(0)
#define BLAKE2B_LOAD_MSG_1_2(b0, b1) \
    do { b0 = _mm_unpacklo_epi64(m5, m4); b1 = _mm_alignr_epi8(m3, m7, 8); } while(0)
#define BLAKE2B_LOAD_MSG_1_3(b0, b1) \
    do { b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2)); b1 = _mm_unpackhi_epi64(m5, m2); } while(0)
#define BLAKE2B_LOAD_MSG_1_4(b0, b1) \
    do { b0 = _mm_unpacklo_epi64(m6, m1); b1 = _mm_unpackhi_epi64(m3, m1); } while(0)
#define BLAKE2B_LOAD_MSG_2_1(b0, b1) \
    do { b0 = _mm_alignr_epi8(m6, m5, 8); b1 = _mm_unpackhi_epi64(m2, m7); } while(0)
#define BLAKE2B_LOAD_MSG_2_2(b0, b1) \
    do { b0 = _mm_unpacklo_epi64(m4, m0); b1 = _mm_blend_epi16(m1, m6, 0xF0); } while(0)
#define BLAKE2B_LOAD_MSG_2_3(b0, b1) \
    do { b0 = _mm_blend_epi16(m5, m1, 0xF0); b1 = _mm_unpackhi_epi64(m3, m4); } while(0)
#define BLAKE2B_LOAD_MSG_2_4(b0, b1) \
    do { b0 = _mm_unpacklo_epi64(m7, m3); b1 = _mm_alignr_epi8(m2, m0, 8); } while(0)
#define BLAKE2B_LOAD_MSG_3_1(b0, b1) \
    do { b0 = _mm_unpackhi_epi64(m3, m1); b1 = _mm_unpackhi_epi64(m6, m5); } while(0)
#define BLAKE2B_LOAD_MSG_3_2(b0, b1) \
    do { b0 = _mm_unpackhi_epi64(m4, m0); b1 = _mm_unpacklo_epi64(m6, m7); } while(0)
#define BLAKE2B_LOAD_MSG_3_3(b0, b1) \
    do { b0 = _mm_blend_epi16(m1, m2, 0xF0); b1 = _mm_blend_epi16(m2, m7, 0xF0); } while(0)
#define BLAKE2B_LOAD_MSG_3_4(b0, b1) \
    do { b0 = _mm_unpacklo_epi64(m3, m5); b1 = _mm_unpacklo_epi64(m0, m4); } while(0)
#define BLAKE2B_LOAD_MSG_4_1(b0, b1) \
    do { b0 = _mm_unpackhi_epi64(m4, m2); b1 = _mm_unpacklo_epi64(m1, m5); } while(0)
#define BLAKE2B_LOAD_MSG_4_2(b0, b1) \
    do { b0 = _mm_blend_epi16(m0, m3, 0xF0); b1 = _mm_blend_epi16(m2, m7, 0xF0); } while(0)
#define BLAKE2B_LOAD_MSG_4_3(b0, b1) \
    do { b0 = _mm_blend_epi16(m7, m5, 0xF0); b1 = _mm_blend_epi16(m3, m1, 0xF0); } while(0)
#define BLAKE2B_LOAD_MSG_4_4(b0, b1) \
    do { b0 = _mm_alignr_epi8(m6, m0, 8); b1 = _mm_blend_epi16(m4, m6, 0xF0); } while(0)
#define BLAKE2B_LOAD_MSG_5_1(b0, b1) \
    do { b0 = _mm_unpacklo_epi64(m1, m3); b1 = _mm_unpacklo_epi64(m0, m4); } while(0)
#define BLAKE2B_LOAD_MSG_5_2(b0, b1) \
    do { b0 = _mm_unpacklo_epi64(m6, m5); b1 = _mm_unpackhi_epi64(m5, m1); } while(0)
#define BLAKE2B_LOAD_MSG_5_3(b0, b1) \
    do { b0 = _mm_blend_epi16(m2, m3, 0xF0); b1 = _mm_unpackhi_epi64(m7, m0); } while(0)
#define BLAKE2B_LOAD_MSG_5_4(b0, b1) \
    do { b0 = _mm_unpackhi_epi64(m6, m2); b1 = _mm_blend_epi16(m7, m4, 0xF0); } while(0)
#define BLAKE2B_LOAD_MSG_6_1(b0, b1) \
    do { b0 = _mm_blend_epi16(m6, m0, 0xF0); b1 = _mm_unpacklo_epi64(m7, m2); } while(0)
#define BLAKE2B_LOAD_MSG_6_2(b0, b1) \
    do { b0 = _mm_unpackhi_epi64(m2, m7); b1 = _mm_alignr_epi8(m5, m6, 8); } while(0)
#define BLAKE2B_LOAD_MSG_6_3(b0, b1) \
    do { b0 = _mm_unpacklo_epi64(m0, m3); b1 = _mm_shuffle_epi32(m4, _MM_SHUFFLE(1,0,3,2)); } while(0)
#define BLAKE2B_LOAD_MSG_6_4(b0, b1) \
    do { b0 = _mm_unpackhi_epi64(m3, m1); b1 = _mm_blend_epi16(m1, m5, 0xF0); } while(0)
#define BLAKE2B_LOAD_MSG_7_1(b0, b1) \
    do { b0 = _mm_unpackhi_epi64(m6, m3); b1 = _mm_blend_epi16(m6, m1, 0xF0); } while(0)
#define BLAKE2B_LOAD_MSG_7_2(b0, b1) \
    do { b0 = _mm_alignr_epi8(m7, m5, 8); b1 = _mm_unpackhi_epi64(m0, m4); } while(0)
#define BLAKE2B_LOAD_MSG_7_3(b0, b1) \
    do { b0 = _mm_unpackhi_epi64(m2, m7); b1 = _mm_unpacklo_epi64(m4, m1); } while(0)
#define BLAKE2B_LOAD_MSG_7_4(b0, b1) \
    do { b0 = _mm_unpacklo_epi64(m0, m2); b1 = _mm_unpacklo_epi64(m3, m5); } while(0)
#define BLAKE2B_LOAD_MSG_8_1(b0, b1) \
    do { b0 = _mm_unpacklo_epi64(m3, m7); b1 = _mm_alignr_epi8(m0, m5, 8); } while(0)
#define BLAKE2B_LOAD_MSG_8_2(b0, b1) \
    do { b0 = _mm_unpackhi_epi64(m7, m4); b1 = _mm_alignr_epi8(m4, m1, 8); } while(0)
#define BLAKE2B_LOAD_MSG_8_3(b0, b1) \
    do { b0 = m6; b1 = _mm_alignr_epi8(m5, m0, 8); } while(0)
#define BLAKE2B_LOAD_MSG_8_4(b0, b1) \
    do { b0 = _mm_blend_epi16(m1, m3, 0xF0); b1 = m2; } while(0)
#define BLAKE2B_LOAD_MSG_9_1(b0, b1) \
    do { b0 = _mm_unpacklo_epi64(m5, m4); b1 = _mm_unpackhi_epi64(m3, m0); } while(0)
#define BLAKE2B_LOAD_MSG_9_2(b0, b1) \
    do { b0 = _mm_unpacklo_epi64(m1, m2); b1 = _mm_blend_epi16(m3, m2, 0xF0); } while(0)
#define BLAKE2B_LOAD_MSG_9_3(b0, b1) \
    do { b0 = _mm_unpackhi_epi64(m7, m4); b1 = _mm_unpackhi_epi64(m1, m6); } while(0)
#define BLAKE2B_LOAD_MSG_9_4(b0, b1) \
    do { b0 = _mm_alignr_epi8(m7, m5, 8); b1 = _mm_unpacklo_epi64(m6, m0); } while(0)
#define BLAKE2B_LOAD_MSG_10_1(b0, b1) \
    do { b0 = _mm_unpacklo_epi64(m0, m1); b1 = _mm_unpacklo_epi64(m2, m3); } while(0)
#define BLAKE2B_LOAD_MSG_10_2(b0, b1) \
    do { b0 = _mm_unpackhi_epi64(m0, m1); b1 = _mm_unpackhi_epi64(m2, m3); } while(0)
#define BLAKE2B_LOAD_MSG_10_3(b0, b1) \
    do { b0 = _mm_unpacklo_epi64(m4, m5); b1 = _mm_unpacklo_epi64(m6, m7); } while(0)
#define BLAKE2B_LOAD_MSG_10_4(b0, b1) \
    do { b0 = _mm_unpackhi_epi64(m4, m5); b1 = _mm_unpackhi_epi64(m6, m7); } while(0)
#define BLAKE2B_LOAD_MSG_11_1(b0, b1) \
    do { b0 = _mm_unpacklo_epi64(m7, m2); b1 = _mm_unpackhi_epi64(m4, m6); } while(0)
#define BLAKE2B_LOAD_MSG_11_2(b0, b1) \
    do { b0 = _mm_unpacklo_epi64(m5, m4); b1 = _mm_alignr_epi8(m3, m7, 8); } while(0)
#define BLAKE2B_LOAD_MSG_11_3(b0, b1) \
    do { b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2)); b1 = _mm_unpackhi_epi64(m5, m2); } while(0)
#define BLAKE2B_LOAD_MSG_11_4(b0, b1) \
    do { b0 = _mm_unpacklo_epi64(m6, m1); b1 = _mm_unpackhi_epi64(m3, m1); } while(0)

// 64-bit right rotation: byte shuffles for 16/24 bits, a lane swap for 32 bits,
// and shift+xor for the remaining amounts.
#if defined(__XOP__)
# define MM_ROTI_EPI64(r, c) \
    _mm_roti_epi64(r, c)
#else
# define MM_ROTI_EPI64(x, c) \
    (-(c) == 32) ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2,3,0,1)) \
    : (-(c) == 24) ? _mm_shuffle_epi8((x), r24) \
    : (-(c) == 16) ? _mm_shuffle_epi8((x), r16) \
    : (-(c) == 63) ? _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_add_epi64((x), (x))) \
    : _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_slli_epi64((x), 64-(-(c))))
#endif

#define BLAKE2B_G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
    row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \
    row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \
    row4l = _mm_xor_si128(row4l, row1l); \
    row4h = _mm_xor_si128(row4h, row1h); \
    row4l = MM_ROTI_EPI64(row4l, -32); \
    row4h = MM_ROTI_EPI64(row4h, -32); \
    row3l = _mm_add_epi64(row3l, row4l); \
    row3h = _mm_add_epi64(row3h, row4h); \
    row2l = _mm_xor_si128(row2l, row3l); \
    row2h = _mm_xor_si128(row2h, row3h); \
    row2l = MM_ROTI_EPI64(row2l, -24); \
    row2h = MM_ROTI_EPI64(row2h, -24);

#define BLAKE2B_G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
    row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \
    row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \
    row4l = _mm_xor_si128(row4l, row1l); \
    row4h = _mm_xor_si128(row4h, row1h); \
    row4l = MM_ROTI_EPI64(row4l, -16); \
    row4h = MM_ROTI_EPI64(row4h, -16); \
    row3l = _mm_add_epi64(row3l, row4l); \
    row3h = _mm_add_epi64(row3h, row4h); \
    row2l = _mm_xor_si128(row2l, row3l); \
    row2h = _mm_xor_si128(row2h, row3h); \
    row2l = MM_ROTI_EPI64(row2l, -63); \
    row2h = MM_ROTI_EPI64(row2h, -63);

#define BLAKE2B_DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
    t0 = row4l; \
    t1 = row2l; \
    row4l = row3l; \
    row3l = row3h; \
    row3h = row4l; \
    row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0)); \
    row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h)); \
    row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h)); \
    row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1))

#define BLAKE2B_UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
    t0 = row3l; \
    row3l = row3h; \
    row3h = t0; \
    t0 = row2l; \
    t1 = row4l; \
    row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l)); \
    row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h)); \
    row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h)); \
    row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1))

#define BLAKE2B_ROUND(r) \
    BLAKE2B_LOAD_MSG_ ##r ##_1(b0, b1); \
    BLAKE2B_G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
    BLAKE2B_LOAD_MSG_ ##r ##_2(b0, b1); \
    BLAKE2B_G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
    BLAKE2B_DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
    BLAKE2B_LOAD_MSG_ ##r ##_3(b0, b1); \
    BLAKE2B_G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
    BLAKE2B_LOAD_MSG_ ##r ##_4(b0, b1); \
    BLAKE2B_G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
    BLAKE2B_UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h);

    __m128i row1l, row1h;
    __m128i row2l, row2h;
    __m128i row3l, row3h;
    __m128i row4l, row4h;
    __m128i b0, b1;
    __m128i t0, t1;

    // Byte-shuffle masks used by MM_ROTI_EPI64 for the 16- and 24-bit rotations.
    const __m128i r16 = _mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9);
    const __m128i r24 = _mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10);

    // Load the 128-byte message block as eight 16-byte vectors.
    const __m128i m0 = LOADU(input + 00);
    const __m128i m1 = LOADU(input + 16);
    const __m128i m2 = LOADU(input + 32);
    const __m128i m3 = LOADU(input + 48);
    const __m128i m4 = LOADU(input + 64);
    const __m128i m5 = LOADU(input + 80);
    const __m128i m6 = LOADU(input + 96);
    const __m128i m7 = LOADU(input + 112);

    // Initialize the working state from the chaining value, the IV, and the counters.
    row1l = LOADU(state.h()+0);
    row1h = LOADU(state.h()+2);
    row2l = LOADU(state.h()+4);
    row2h = LOADU(state.h()+6);
    row3l = LOADU(BLAKE2B_IV+0);
    row3h = LOADU(BLAKE2B_IV+2);
    row4l = _mm_xor_si128(LOADU(BLAKE2B_IV+4), LOADU(state.t()+0));
    row4h = _mm_xor_si128(LOADU(BLAKE2B_IV+6), LOADU(state.f()+0));

    BLAKE2B_ROUND(0);
    BLAKE2B_ROUND(1);
    BLAKE2B_ROUND(2);
    BLAKE2B_ROUND(3);
    BLAKE2B_ROUND(4);
    BLAKE2B_ROUND(5);
    BLAKE2B_ROUND(6);
    BLAKE2B_ROUND(7);
    BLAKE2B_ROUND(8);
    BLAKE2B_ROUND(9);
    BLAKE2B_ROUND(10);
    BLAKE2B_ROUND(11);

    // Feed-forward the result into the chaining value.
    row1l = _mm_xor_si128(row3l, row1l);
    row1h = _mm_xor_si128(row3h, row1h);
    STOREU(state.h()+0, _mm_xor_si128(LOADU(state.h()+0), row1l));
    STOREU(state.h()+2, _mm_xor_si128(LOADU(state.h()+2), row1h));
    row2l = _mm_xor_si128(row4l, row2l);
    row2h = _mm_xor_si128(row4h, row2h);
    STOREU(state.h()+4, _mm_xor_si128(LOADU(state.h()+4), row2l));
    STOREU(state.h()+6, _mm_xor_si128(LOADU(state.h()+6), row2h));
}
#endif // CRYPTOPP_SSE41_AVAILABLE

#if CRYPTOPP_ARM_NEON_AVAILABLE

void BLAKE2_Compress64_NEON(const byte* input, BLAKE2b_State& state)
{
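// The NEON load macros reproduce the same per-round message schedule as the
// SSE4.1 macros above: vcombine_u64/vget_{low,high}_u64 assemble each pair of
// 64-bit message words, and vextq_u64 stands in for _mm_alignr_epi8.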
#define BLAKE2B_LOAD_MSG_0_1(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m0), vget_low_u64(m1)); b1 = vcombine_u64(vget_low_u64(m2), vget_low_u64(m3)); } while(0)
#define BLAKE2B_LOAD_MSG_0_2(b0, b1) \
    do { b0 = vcombine_u64(vget_high_u64(m0), vget_high_u64(m1)); b1 = vcombine_u64(vget_high_u64(m2), vget_high_u64(m3)); } while(0)
#define BLAKE2B_LOAD_MSG_0_3(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m4), vget_low_u64(m5)); b1 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m7)); } while(0)
#define BLAKE2B_LOAD_MSG_0_4(b0, b1) \
    do { b0 = vcombine_u64(vget_high_u64(m4), vget_high_u64(m5)); b1 = vcombine_u64(vget_high_u64(m6), vget_high_u64(m7)); } while(0)
#define BLAKE2B_LOAD_MSG_1_1(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m7), vget_low_u64(m2)); b1 = vcombine_u64(vget_high_u64(m4), vget_high_u64(m6)); } while(0)
#define BLAKE2B_LOAD_MSG_1_2(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m5), vget_low_u64(m4)); b1 = vextq_u64(m7, m3, 1); } while(0)
#define BLAKE2B_LOAD_MSG_1_3(b0, b1) \
    do { b0 = vextq_u64(m0, m0, 1); b1 = vcombine_u64(vget_high_u64(m5), vget_high_u64(m2)); } while(0)
#define BLAKE2B_LOAD_MSG_1_4(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m1)); b1 = vcombine_u64(vget_high_u64(m3), vget_high_u64(m1)); } while(0)
#define BLAKE2B_LOAD_MSG_2_1(b0, b1) \
    do { b0 = vextq_u64(m5, m6, 1); b1 = vcombine_u64(vget_high_u64(m2), vget_high_u64(m7)); } while(0)
#define BLAKE2B_LOAD_MSG_2_2(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m4), vget_low_u64(m0)); b1 = vcombine_u64(vget_low_u64(m1), vget_high_u64(m6)); } while(0)
#define BLAKE2B_LOAD_MSG_2_3(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m5), vget_high_u64(m1)); b1 = vcombine_u64(vget_high_u64(m3), vget_high_u64(m4)); } while(0)
#define BLAKE2B_LOAD_MSG_2_4(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m7), vget_low_u64(m3)); b1 = vextq_u64(m0, m2, 1); } while(0)
#define BLAKE2B_LOAD_MSG_3_1(b0, b1) \
    do { b0 = vcombine_u64(vget_high_u64(m3), vget_high_u64(m1)); b1 = vcombine_u64(vget_high_u64(m6), vget_high_u64(m5)); } while(0)
#define BLAKE2B_LOAD_MSG_3_2(b0, b1) \
    do { b0 = vcombine_u64(vget_high_u64(m4), vget_high_u64(m0)); b1 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m7)); } while(0)
#define BLAKE2B_LOAD_MSG_3_3(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m1), vget_high_u64(m2)); b1 = vcombine_u64(vget_low_u64(m2), vget_high_u64(m7)); } while(0)
#define BLAKE2B_LOAD_MSG_3_4(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m3), vget_low_u64(m5)); b1 = vcombine_u64(vget_low_u64(m0), vget_low_u64(m4)); } while(0)
#define BLAKE2B_LOAD_MSG_4_1(b0, b1) \
    do { b0 = vcombine_u64(vget_high_u64(m4), vget_high_u64(m2)); b1 = vcombine_u64(vget_low_u64(m1), vget_low_u64(m5)); } while(0)
#define BLAKE2B_LOAD_MSG_4_2(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m0), vget_high_u64(m3)); b1 = vcombine_u64(vget_low_u64(m2), vget_high_u64(m7)); } while(0)
#define BLAKE2B_LOAD_MSG_4_3(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m7), vget_high_u64(m5)); b1 = vcombine_u64(vget_low_u64(m3), vget_high_u64(m1)); } while(0)
#define BLAKE2B_LOAD_MSG_4_4(b0, b1) \
    do { b0 = vextq_u64(m0, m6, 1); b1 = vcombine_u64(vget_low_u64(m4), vget_high_u64(m6)); } while(0)
#define BLAKE2B_LOAD_MSG_5_1(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m1), vget_low_u64(m3)); b1 = vcombine_u64(vget_low_u64(m0), vget_low_u64(m4)); } while(0)
#define BLAKE2B_LOAD_MSG_5_2(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m5)); b1 = vcombine_u64(vget_high_u64(m5), vget_high_u64(m1)); } while(0)
#define BLAKE2B_LOAD_MSG_5_3(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m2), vget_high_u64(m3)); b1 = vcombine_u64(vget_high_u64(m7), vget_high_u64(m0)); } while(0)
#define BLAKE2B_LOAD_MSG_5_4(b0, b1) \
    do { b0 = vcombine_u64(vget_high_u64(m6), vget_high_u64(m2)); b1 = vcombine_u64(vget_low_u64(m7), vget_high_u64(m4)); } while(0)
#define BLAKE2B_LOAD_MSG_6_1(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m6), vget_high_u64(m0)); b1 = vcombine_u64(vget_low_u64(m7), vget_low_u64(m2)); } while(0)
#define BLAKE2B_LOAD_MSG_6_2(b0, b1) \
    do { b0 = vcombine_u64(vget_high_u64(m2), vget_high_u64(m7)); b1 = vextq_u64(m6, m5, 1); } while(0)
#define BLAKE2B_LOAD_MSG_6_3(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m0), vget_low_u64(m3)); b1 = vextq_u64(m4, m4, 1); } while(0)
#define BLAKE2B_LOAD_MSG_6_4(b0, b1) \
    do { b0 = vcombine_u64(vget_high_u64(m3), vget_high_u64(m1)); b1 = vcombine_u64(vget_low_u64(m1), vget_high_u64(m5)); } while(0)
#define BLAKE2B_LOAD_MSG_7_1(b0, b1) \
    do { b0 = vcombine_u64(vget_high_u64(m6), vget_high_u64(m3)); b1 = vcombine_u64(vget_low_u64(m6), vget_high_u64(m1)); } while(0)
#define BLAKE2B_LOAD_MSG_7_2(b0, b1) \
    do { b0 = vextq_u64(m5, m7, 1); b1 = vcombine_u64(vget_high_u64(m0), vget_high_u64(m4)); } while(0)
#define BLAKE2B_LOAD_MSG_7_3(b0, b1) \
    do { b0 = vcombine_u64(vget_high_u64(m2), vget_high_u64(m7)); b1 = vcombine_u64(vget_low_u64(m4), vget_low_u64(m1)); } while(0)
#define BLAKE2B_LOAD_MSG_7_4(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m0), vget_low_u64(m2)); b1 = vcombine_u64(vget_low_u64(m3), vget_low_u64(m5)); } while(0)
#define BLAKE2B_LOAD_MSG_8_1(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m3), vget_low_u64(m7)); b1 = vextq_u64(m5, m0, 1); } while(0)
#define BLAKE2B_LOAD_MSG_8_2(b0, b1) \
    do { b0 = vcombine_u64(vget_high_u64(m7), vget_high_u64(m4)); b1 = vextq_u64(m1, m4, 1); } while(0)
#define BLAKE2B_LOAD_MSG_8_3(b0, b1) \
    do { b0 = m6; b1 = vextq_u64(m0, m5, 1); } while(0)
#define BLAKE2B_LOAD_MSG_8_4(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m1), vget_high_u64(m3)); b1 = m2; } while(0)
#define BLAKE2B_LOAD_MSG_9_1(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m5), vget_low_u64(m4)); b1 = vcombine_u64(vget_high_u64(m3), vget_high_u64(m0)); } while(0)
#define BLAKE2B_LOAD_MSG_9_2(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m1), vget_low_u64(m2)); b1 = vcombine_u64(vget_low_u64(m3), vget_high_u64(m2)); } while(0)
#define BLAKE2B_LOAD_MSG_9_3(b0, b1) \
    do { b0 = vcombine_u64(vget_high_u64(m7), vget_high_u64(m4)); b1 = vcombine_u64(vget_high_u64(m1), vget_high_u64(m6)); } while(0)
#define BLAKE2B_LOAD_MSG_9_4(b0, b1) \
    do { b0 = vextq_u64(m5, m7, 1); b1 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m0)); } while(0)
#define BLAKE2B_LOAD_MSG_10_1(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m0), vget_low_u64(m1)); b1 = vcombine_u64(vget_low_u64(m2), vget_low_u64(m3)); } while(0)
#define BLAKE2B_LOAD_MSG_10_2(b0, b1) \
    do { b0 = vcombine_u64(vget_high_u64(m0), vget_high_u64(m1)); b1 = vcombine_u64(vget_high_u64(m2), vget_high_u64(m3)); } while(0)
#define BLAKE2B_LOAD_MSG_10_3(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m4), vget_low_u64(m5)); b1 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m7)); } while(0)
#define BLAKE2B_LOAD_MSG_10_4(b0, b1) \
    do { b0 = vcombine_u64(vget_high_u64(m4), vget_high_u64(m5)); b1 = vcombine_u64(vget_high_u64(m6), vget_high_u64(m7)); } while(0)
#define BLAKE2B_LOAD_MSG_11_1(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m7), vget_low_u64(m2)); b1 = vcombine_u64(vget_high_u64(m4), vget_high_u64(m6)); } while(0)
#define BLAKE2B_LOAD_MSG_11_2(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m5), vget_low_u64(m4)); b1 = vextq_u64(m7, m3, 1); } while(0)
#define BLAKE2B_LOAD_MSG_11_3(b0, b1) \
    do { b0 = vextq_u64(m0, m0, 1); b1 = vcombine_u64(vget_high_u64(m5), vget_high_u64(m2)); } while(0)
#define BLAKE2B_LOAD_MSG_11_4(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m1)); b1 = vcombine_u64(vget_high_u64(m3), vget_high_u64(m1)); } while(0)

// 64-bit right rotations: a 32-bit element reverse for 32, byte extracts for 24/16,
// and add+shift for 63.
#define vrorq_n_u64_32(x) vreinterpretq_u64_u32(vrev64q_u32(vreinterpretq_u32_u64((x))))

#define vrorq_n_u64_24(x) vcombine_u64( \
    vreinterpret_u64_u8(vext_u8(vreinterpret_u8_u64(vget_low_u64(x)), vreinterpret_u8_u64(vget_low_u64(x)), 3)), \
    vreinterpret_u64_u8(vext_u8(vreinterpret_u8_u64(vget_high_u64(x)), vreinterpret_u8_u64(vget_high_u64(x)), 3)))

#define vrorq_n_u64_16(x) vcombine_u64( \
    vreinterpret_u64_u8(vext_u8(vreinterpret_u8_u64(vget_low_u64(x)), vreinterpret_u8_u64(vget_low_u64(x)), 2)), \
    vreinterpret_u64_u8(vext_u8(vreinterpret_u8_u64(vget_high_u64(x)), vreinterpret_u8_u64(vget_high_u64(x)), 2)))

#define vrorq_n_u64_63(x) veorq_u64(vaddq_u64(x, x), vshrq_n_u64(x, 63))

#define BLAKE2B_G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
    do { \
      row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l); \
      row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h); \
      row4l = veorq_u64(row4l, row1l); row4h = veorq_u64(row4h, row1h); \
      row4l = vrorq_n_u64_32(row4l); row4h = vrorq_n_u64_32(row4h); \
      row3l = vaddq_u64(row3l, row4l); row3h = vaddq_u64(row3h, row4h); \
      row2l = veorq_u64(row2l, row3l); row2h = veorq_u64(row2h, row3h); \
      row2l = vrorq_n_u64_24(row2l); row2h = vrorq_n_u64_24(row2h); \
    } while(0)

#define BLAKE2B_G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
    do { \
      row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l); \
      row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h); \
      row4l = veorq_u64(row4l, row1l); row4h = veorq_u64(row4h, row1h); \
      row4l = vrorq_n_u64_16(row4l); row4h = vrorq_n_u64_16(row4h); \
      row3l = vaddq_u64(row3l, row4l); row3h = vaddq_u64(row3h, row4h); \
      row2l = veorq_u64(row2l, row3l); row2h = veorq_u64(row2h, row3h); \
      row2l = vrorq_n_u64_63(row2l); row2h = vrorq_n_u64_63(row2h); \
    } while(0)

#define BLAKE2B_DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
    do { \
      uint64x2_t t0 = vextq_u64(row2l, row2h, 1); \
      uint64x2_t t1 = vextq_u64(row2h, row2l, 1); \
      row2l = t0; row2h = t1; t0 = row3l; row3l = row3h; row3h = t0; \
      t0 = vextq_u64(row4h, row4l, 1); t1 = vextq_u64(row4l, row4h, 1); \
      row4l = t0; row4h = t1; \
    } while(0)

#define BLAKE2B_UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
    do { \
      uint64x2_t t0 = vextq_u64(row2h, row2l, 1); \
      uint64x2_t t1 = vextq_u64(row2l, row2h, 1); \
      row2l = t0; row2h = t1; t0 = row3l; row3l = row3h; row3h = t0; \
      t0 = vextq_u64(row4l, row4h, 1); t1 = vextq_u64(row4h, row4l, 1); \
      row4l = t0; row4h = t1; \
    } while(0)

#define BLAKE2B_ROUND(r) \
    do { \
      uint64x2_t b0, b1; \
      BLAKE2B_LOAD_MSG_ ##r ##_1(b0, b1); \
      BLAKE2B_G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
      BLAKE2B_LOAD_MSG_ ##r ##_2(b0, b1); \
      BLAKE2B_G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
      BLAKE2B_DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
      BLAKE2B_LOAD_MSG_ ##r ##_3(b0, b1); \
      BLAKE2B_G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
      BLAKE2B_LOAD_MSG_ ##r ##_4(b0, b1); \
      BLAKE2B_G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
      BLAKE2B_UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
    } while(0)

    const uint64x2_t m0 = vreinterpretq_u64_u8(vld1q_u8(input + 00));
    const uint64x2_t m1 = vreinterpretq_u64_u8(vld1q_u8(input + 16));
    const uint64x2_t m2 = vreinterpretq_u64_u8(vld1q_u8(input + 32));
    const uint64x2_t m3 = vreinterpretq_u64_u8(vld1q_u8(input + 48));
    const uint64x2_t m4 = vreinterpretq_u64_u8(vld1q_u8(input + 64));
    const uint64x2_t m5 = vreinterpretq_u64_u8(vld1q_u8(input + 80));
    const uint64x2_t m6 = vreinterpretq_u64_u8(vld1q_u8(input + 96));
    const uint64x2_t m7 = vreinterpretq_u64_u8(vld1q_u8(input + 112));
    uint64x2_t row1l, row1h, row2l, row2h;
    uint64x2_t row3l, row3h, row4l, row4h;

    const uint64x2_t h0 = row1l = vld1q_u64(state.h()+0);
    const uint64x2_t h1 = row1h = vld1q_u64(state.h()+2);
    const uint64x2_t h2 = row2l = vld1q_u64(state.h()+4);
    const uint64x2_t h3 = row2h = vld1q_u64(state.h()+6);

    row3l = vld1q_u64(BLAKE2B_IV+0);
    row3h = vld1q_u64(BLAKE2B_IV+2);
    row4l = veorq_u64(vld1q_u64(BLAKE2B_IV+4), vld1q_u64(state.t()+0));
    row4h = veorq_u64(vld1q_u64(BLAKE2B_IV+6), vld1q_u64(state.f()+0));

    BLAKE2B_ROUND(0);
    BLAKE2B_ROUND(1);
    BLAKE2B_ROUND(2);
    BLAKE2B_ROUND(3);
    BLAKE2B_ROUND(4);
    BLAKE2B_ROUND(5);
    BLAKE2B_ROUND(6);
    BLAKE2B_ROUND(7);
    BLAKE2B_ROUND(8);
    BLAKE2B_ROUND(9);
    BLAKE2B_ROUND(10);
    BLAKE2B_ROUND(11);

    vst1q_u64(state.h()+0, veorq_u64(h0, veorq_u64(row1l, row3l)));
    vst1q_u64(state.h()+2, veorq_u64(h1, veorq_u64(row1h, row3h)));
    vst1q_u64(state.h()+4, veorq_u64(h2, veorq_u64(row2l, row4l)));
    vst1q_u64(state.h()+6, veorq_u64(h3, veorq_u64(row2h, row4h)));
}
#endif // CRYPTOPP_ARM_NEON_AVAILABLE

#if (CRYPTOPP_POWER8_AVAILABLE)

inline uint64x2_p VecLoad64(const void* p)
{
#if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
    return (uint64x2_p)vec_xl(0, (uint8_t*)p);
#else
    return (uint64x2_p)vec_vsx_ld(0, (uint8_t*)p);
#endif
}

inline uint64x2_p VecLoad64LE(const void* p)
{
#if (__BIG_ENDIAN__)
    const uint8x16_p m = {7,6,5,4, 3,2,1,0, 15,14,13,12, 11,10,9,8};
    const uint64x2_p v = VecLoad64(p);
    return VecPermute(v, v, m);
#else
    return VecLoad64(p);
#endif
}

inline void VecStore64(void* p, const uint64x2_p x)
{
#if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
    vec_xst((uint8x16_p)x, 0, (uint8_t*)p);
#else
    vec_vsx_st((uint8x16_p)x, 0, (uint8_t*)p);
#endif
}

inline void VecStore64LE(void* p, const uint64x2_p x)
{
#if (__BIG_ENDIAN__)
    const uint8x16_p m = {7,6,5,4, 3,2,1,0, 15,14,13,12, 11,10,9,8};
    VecStore64(p, VecPermute(x, x, m));
#else
    VecStore64(p, x);
#endif
}

template <unsigned int C>
inline uint64x2_p VecShiftLeftOctet(const uint64x2_p a, const uint64x2_p b)
{
#if (__BIG_ENDIAN__)
    return (uint64x2_p)vec_sld((uint8x16_p)a, (uint8x16_p)b, C);
#else
    return (uint64x2_p)vec_sld((uint8x16_p)b, (uint8x16_p)a, 16-C);
#endif
}

#define vec_shl_octet(a,b,c) VecShiftLeftOctet<c*8>(a, b)

#if defined(__GNUC__) && (__BIG_ENDIAN__)
# define vec_merge_hi(a,b) VecPermute(a,b, HH_MASK)
# define vec_merge_lo(a,b) VecPermute(a,b, LL_MASK)
#else
# define vec_merge_hi(a,b) vec_mergeh(a,b)
# define vec_merge_lo(a,b) vec_mergel(a,b)
#endif

void BLAKE2_Compress64_POWER8(const byte* input, BLAKE2b_State& state)
{
#if defined(__GNUC__) && (__BIG_ENDIAN__)
    const uint8x16_p HH_MASK = { 0,1,2,3,4,5,6,7, 16,17,18,19,20,21,22,23 };
    const uint8x16_p LL_MASK = { 8,9,10,11,12,13,14,15, 24,25,26,27,28,29,30,31 };
#endif

    const uint8x16_p HL_MASK = { 0,1,2,3,4,5,6,7, 24,25,26,27,28,29,30,31 };
    const uint8x16_p LH_MASK = { 8,9,10,11,12,13,14,15, 16,17,18,19,20,21,22,23 };
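    // vec_merge_hi/vec_merge_lo pair the first/second 64-bit words of two vectors
    // (the POWER8 counterparts of _mm_unpacklo_epi64/_mm_unpackhi_epi64 above);
    // HL_MASK and LH_MASK select crossed word pairs through VecPermute.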
#define BLAKE2B_LOAD_MSG_0_1(b0, b1) \
    do { b0 = vec_merge_hi(m0, m1); b1 = vec_merge_hi(m2, m3); } while(0)
#define BLAKE2B_LOAD_MSG_0_2(b0, b1) \
    do { b0 = vec_merge_lo(m0, m1); b1 = vec_merge_lo(m2, m3); } while(0)
#define BLAKE2B_LOAD_MSG_0_3(b0, b1) \
    do { b0 = vec_merge_hi(m4, m5); b1 = vec_merge_hi(m6, m7); } while(0)
#define BLAKE2B_LOAD_MSG_0_4(b0, b1) \
    do { b0 = vec_merge_lo(m4, m5); b1 = vec_merge_lo(m6, m7); } while(0)
#define BLAKE2B_LOAD_MSG_1_1(b0, b1) \
    do { b0 = vec_merge_hi(m7, m2); b1 = vec_merge_lo(m4, m6); } while(0)
#define BLAKE2B_LOAD_MSG_1_2(b0, b1) \
    do { b0 = vec_merge_hi(m5, m4); b1 = vec_shl_octet(m7, m3, 1); } while(0)
#define BLAKE2B_LOAD_MSG_1_3(b0, b1) \
    do { b0 = vec_shl_octet(m0, m0, 1); b1 = vec_merge_lo(m5, m2); } while(0)
#define BLAKE2B_LOAD_MSG_1_4(b0, b1) \
    do { b0 = vec_merge_hi(m6, m1); b1 = vec_merge_lo(m3, m1); } while(0)
#define BLAKE2B_LOAD_MSG_2_1(b0, b1) \
    do { b0 = vec_shl_octet(m5, m6, 1); b1 = vec_merge_lo(m2, m7); } while(0)
#define BLAKE2B_LOAD_MSG_2_2(b0, b1) \
    do { b0 = vec_merge_hi(m4, m0); b1 = VecPermute(m1, m6, HL_MASK); } while(0)
#define BLAKE2B_LOAD_MSG_2_3(b0, b1) \
    do { b0 = VecPermute(m5, m1, HL_MASK); b1 = vec_merge_lo(m3, m4); } while(0)
#define BLAKE2B_LOAD_MSG_2_4(b0, b1) \
    do { b0 = vec_merge_hi(m7, m3); b1 = vec_shl_octet(m0, m2, 1); } while(0)
#define BLAKE2B_LOAD_MSG_3_1(b0, b1) \
    do { b0 = vec_merge_lo(m3, m1); b1 = vec_merge_lo(m6, m5); } while(0)
#define BLAKE2B_LOAD_MSG_3_2(b0, b1) \
    do { b0 = vec_merge_lo(m4, m0); b1 = vec_merge_hi(m6, m7); } while(0)
#define BLAKE2B_LOAD_MSG_3_3(b0, b1) \
    do { b0 = VecPermute(m1, m2, HL_MASK); b1 = VecPermute(m2, m7, HL_MASK); } while(0)
#define BLAKE2B_LOAD_MSG_3_4(b0, b1) \
    do { b0 = vec_merge_hi(m3, m5); b1 = vec_merge_hi(m0, m4); } while(0)
#define BLAKE2B_LOAD_MSG_4_1(b0, b1) \
    do { b0 = vec_merge_lo(m4, m2); b1 = vec_merge_hi(m1, m5); } while(0)
#define BLAKE2B_LOAD_MSG_4_2(b0, b1) \
    do { b0 = VecPermute(m0, m3, HL_MASK); b1 = VecPermute(m2, m7, HL_MASK); } while(0)
#define BLAKE2B_LOAD_MSG_4_3(b0, b1) \
    do { b0 = VecPermute(m7, m5, HL_MASK); b1 = VecPermute(m3, m1, HL_MASK); } while(0)
#define BLAKE2B_LOAD_MSG_4_4(b0, b1) \
    do { b0 = vec_shl_octet(m0, m6, 1); b1 = VecPermute(m4, m6, HL_MASK); } while(0)
#define BLAKE2B_LOAD_MSG_5_1(b0, b1) \
    do { b0 = vec_merge_hi(m1, m3); b1 = vec_merge_hi(m0, m4); } while(0)
#define BLAKE2B_LOAD_MSG_5_2(b0, b1) \
    do { b0 = vec_merge_hi(m6, m5); b1 = vec_merge_lo(m5, m1); } while(0)
#define BLAKE2B_LOAD_MSG_5_3(b0, b1) \
    do { b0 = VecPermute(m2, m3, HL_MASK); b1 = vec_merge_lo(m7, m0); } while(0)
#define BLAKE2B_LOAD_MSG_5_4(b0, b1) \
    do { b0 = vec_merge_lo(m6, m2); b1 = VecPermute(m7, m4, HL_MASK); } while(0)
#define BLAKE2B_LOAD_MSG_6_1(b0, b1) \
    do { b0 = VecPermute(m6, m0, HL_MASK); b1 = vec_merge_hi(m7, m2); } while(0)
#define BLAKE2B_LOAD_MSG_6_2(b0, b1) \
    do { b0 = vec_merge_lo(m2, m7); b1 = vec_shl_octet(m6, m5, 1); } while(0)
#define BLAKE2B_LOAD_MSG_6_3(b0, b1) \
    do { b0 = vec_merge_hi(m0, m3); b1 = vec_shl_octet(m4, m4, 1); } while(0)
#define BLAKE2B_LOAD_MSG_6_4(b0, b1) \
    do { b0 = vec_merge_lo(m3, m1); b1 = VecPermute(m1, m5, HL_MASK); } while(0)
#define BLAKE2B_LOAD_MSG_7_1(b0, b1) \
    do { b0 = vec_merge_lo(m6, m3); b1 = VecPermute(m6, m1, HL_MASK); } while(0)
#define BLAKE2B_LOAD_MSG_7_2(b0, b1) \
    do { b0 = vec_shl_octet(m5, m7, 1); b1 = vec_merge_lo(m0, m4); } while(0)
#define BLAKE2B_LOAD_MSG_7_3(b0, b1) \
    do { b0 = vec_merge_lo(m2, m7); b1 = vec_merge_hi(m4, m1); } while(0)
#define BLAKE2B_LOAD_MSG_7_4(b0, b1) \
    do { b0 = vec_merge_hi(m0, m2); b1 = vec_merge_hi(m3, m5); } while(0)
#define BLAKE2B_LOAD_MSG_8_1(b0, b1) \
    do { b0 = vec_merge_hi(m3, m7); b1 = vec_shl_octet(m5, m0, 1); } while(0)
#define BLAKE2B_LOAD_MSG_8_2(b0, b1) \
    do { b0 = vec_merge_lo(m7, m4); b1 = vec_shl_octet(m1, m4, 1); } while(0)
#define BLAKE2B_LOAD_MSG_8_3(b0, b1) \
    do { b0 = m6; b1 = vec_shl_octet(m0, m5, 1); } while(0)
#define BLAKE2B_LOAD_MSG_8_4(b0, b1) \
    do { b0 = VecPermute(m1, m3, HL_MASK); b1 = m2; } while(0)
#define BLAKE2B_LOAD_MSG_9_1(b0, b1) \
    do { b0 = vec_merge_hi(m5, m4); b1 = vec_merge_lo(m3, m0); } while(0)
#define BLAKE2B_LOAD_MSG_9_2(b0, b1) \
    do { b0 = vec_merge_hi(m1, m2); b1 = VecPermute(m3, m2, HL_MASK); } while(0)
#define BLAKE2B_LOAD_MSG_9_3(b0, b1) \
    do { b0 = vec_merge_lo(m7, m4); b1 = vec_merge_lo(m1, m6); } while(0)
#define BLAKE2B_LOAD_MSG_9_4(b0, b1) \
    do { b0 = vec_shl_octet(m5, m7, 1); b1 = vec_merge_hi(m6, m0); } while(0)
#define BLAKE2B_LOAD_MSG_10_1(b0, b1) \
    do { b0 = vec_merge_hi(m0, m1); b1 = vec_merge_hi(m2, m3); } while(0)
#define BLAKE2B_LOAD_MSG_10_2(b0, b1) \
    do { b0 = vec_merge_lo(m0, m1); b1 = vec_merge_lo(m2, m3); } while(0)
#define BLAKE2B_LOAD_MSG_10_3(b0, b1) \
    do { b0 = vec_merge_hi(m4, m5); b1 = vec_merge_hi(m6, m7); } while(0)
#define BLAKE2B_LOAD_MSG_10_4(b0, b1) \
    do { b0 = vec_merge_lo(m4, m5); b1 = vec_merge_lo(m6, m7); } while(0)
#define BLAKE2B_LOAD_MSG_11_1(b0, b1) \
    do { b0 = vec_merge_hi(m7, m2); b1 = vec_merge_lo(m4, m6); } while(0)
#define BLAKE2B_LOAD_MSG_11_2(b0, b1) \
    do { b0 = vec_merge_hi(m5, m4); b1 = vec_shl_octet(m7, m3, 1); } while(0)
#define BLAKE2B_LOAD_MSG_11_3(b0, b1) \
    do { b0 = vec_shl_octet(m0, m0, 1); b1 = vec_merge_lo(m5, m2); } while(0)
#define BLAKE2B_LOAD_MSG_11_4(b0, b1) \
    do { b0 = vec_merge_hi(m6, m1); b1 = vec_merge_lo(m3, m1); } while(0)

    const uint64x2_p ROR16_MASK = { 64-16, 64-16 };
    const uint64x2_p ROR24_MASK = { 64-24, 64-24 };
    const uint64x2_p ROR32_MASK = { 64-32, 64-32 };
    const uint64x2_p ROR63_MASK = { 64-63, 64-63 };
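    // vec_rl rotates each 64-bit lane left; a left rotation by 64-n equals a right
    // rotation by n, so the masks above implement the 16-, 24-, 32- and 63-bit
    // right rotations of the BLAKE2b G function.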
#define vec_ror_32(x) vec_rl(x, ROR32_MASK)
#define vec_ror_24(x) vec_rl(x, ROR24_MASK)
#define vec_ror_16(x) vec_rl(x, ROR16_MASK)
#define vec_ror_63(x) vec_rl(x, ROR63_MASK)

#define BLAKE2B_G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
    do { \
      row1l = VecAdd(VecAdd(row1l, b0), row2l); \
      row1h = VecAdd(VecAdd(row1h, b1), row2h); \
      row4l = VecXor(row4l, row1l); row4h = VecXor(row4h, row1h); \
      row4l = vec_ror_32(row4l); row4h = vec_ror_32(row4h); \
      row3l = VecAdd(row3l, row4l); row3h = VecAdd(row3h, row4h); \
      row2l = VecXor(row2l, row3l); row2h = VecXor(row2h, row3h); \
      row2l = vec_ror_24(row2l); row2h = vec_ror_24(row2h); \
    } while(0)

#define BLAKE2B_G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
    do { \
      row1l = VecAdd(VecAdd(row1l, b0), row2l); \
      row1h = VecAdd(VecAdd(row1h, b1), row2h); \
      row4l = VecXor(row4l, row1l); row4h = VecXor(row4h, row1h); \
      row4l = vec_ror_16(row4l); row4h = vec_ror_16(row4h); \
      row3l = VecAdd(row3l, row4l); row3h = VecAdd(row3h, row4h); \
      row2l = VecXor(row2l, row3l); row2h = VecXor(row2h, row3h); \
      row2l = vec_ror_63(row2l); row2h = vec_ror_63(row2h); \
    } while(0)

#define BLAKE2B_DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
    do { \
      uint64x2_p t0 = vec_shl_octet(row2l, row2h, 1); \
      uint64x2_p t1 = vec_shl_octet(row2h, row2l, 1); \
      row2l = t0; row2h = t1; t0 = row3l; row3l = row3h; row3h = t0; \
      t0 = vec_shl_octet(row4h, row4l, 1); t1 = vec_shl_octet(row4l, row4h, 1); \
      row4l = t0; row4h = t1; \
    } while(0)

#define BLAKE2B_UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
    do { \
      uint64x2_p t0 = vec_shl_octet(row2h, row2l, 1); \
      uint64x2_p t1 = vec_shl_octet(row2l, row2h, 1); \
      row2l = t0; row2h = t1; t0 = row3l; row3l = row3h; row3h = t0; \
      t0 = vec_shl_octet(row4l, row4h, 1); t1 = vec_shl_octet(row4h, row4l, 1); \
      row4l = t0; row4h = t1; \
    } while(0)

#define BLAKE2B_ROUND(r) \
    do { \
      uint64x2_p b0, b1; \
      BLAKE2B_LOAD_MSG_ ##r ##_1(b0, b1); \
      BLAKE2B_G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
      BLAKE2B_LOAD_MSG_ ##r ##_2(b0, b1); \
      BLAKE2B_G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
      BLAKE2B_DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
      BLAKE2B_LOAD_MSG_ ##r ##_3(b0, b1); \
      BLAKE2B_G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
      BLAKE2B_LOAD_MSG_ ##r ##_4(b0, b1); \
      BLAKE2B_G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
      BLAKE2B_UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
    } while(0)

    const uint64x2_p m0 = VecLoad64LE(input + 00);
    const uint64x2_p m1 = VecLoad64LE(input + 16);
    const uint64x2_p m2 = VecLoad64LE(input + 32);
    const uint64x2_p m3 = VecLoad64LE(input + 48);
    const uint64x2_p m4 = VecLoad64LE(input + 64);
    const uint64x2_p m5 = VecLoad64LE(input + 80);
    const uint64x2_p m6 = VecLoad64LE(input + 96);
    const uint64x2_p m7 = VecLoad64LE(input + 112);
    uint64x2_p row1l, row1h, row2l, row2h;
    uint64x2_p row3l, row3h, row4l, row4h;

    const uint64x2_p h0 = row1l = VecLoad64LE(state.h()+0);
    const uint64x2_p h1 = row1h = VecLoad64LE(state.h()+2);
    const uint64x2_p h2 = row2l = VecLoad64LE(state.h()+4);
    const uint64x2_p h3 = row2h = VecLoad64LE(state.h()+6);

    row3l = VecLoad64(BLAKE2B_IV+0);
    row3h = VecLoad64(BLAKE2B_IV+2);
    row4l = VecXor(VecLoad64(BLAKE2B_IV+4), VecLoad64(state.t()+0));
    row4h = VecXor(VecLoad64(BLAKE2B_IV+6), VecLoad64(state.f()+0));

    BLAKE2B_ROUND(0);
    BLAKE2B_ROUND(1);
    BLAKE2B_ROUND(2);
    BLAKE2B_ROUND(3);
    BLAKE2B_ROUND(4);
    BLAKE2B_ROUND(5);
    BLAKE2B_ROUND(6);
    BLAKE2B_ROUND(7);
    BLAKE2B_ROUND(8);
    BLAKE2B_ROUND(9);
    BLAKE2B_ROUND(10);
    BLAKE2B_ROUND(11);
    VecStore64LE(state.h()+0, VecXor(h0, VecXor(row1l, row3l)));
    VecStore64LE(state.h()+2, VecXor(h1, VecXor(row1h, row3h)));
    VecStore64LE(state.h()+4, VecXor(h2, VecXor(row2l, row4l)));
    VecStore64LE(state.h()+6, VecXor(h3, VecXor(row2h, row4h)));
}
#endif // CRYPTOPP_POWER8_AVAILABLE