#if defined(__GNUC__) && (__GNUC__ < 4)
# undef CRYPTOPP_SSE2_INTRIN_AVAILABLE
#endif

#if (CRYPTOPP_SSE2_INTRIN_AVAILABLE)
# include <xmmintrin.h>
# include <emmintrin.h>
#endif

#if defined(__SSSE3__)
# include <tmmintrin.h>
#endif

#if defined(__XOP__)
# include <ammintrin.h>
#endif

#if (CRYPTOPP_ARM_NEON_AVAILABLE) && !defined(_M_ARM64)
# include <arm_neon.h>
#endif

#if (CRYPTOPP_ARM_ACLE_AVAILABLE)
# include <arm_acle.h>
#endif

#if defined(CRYPTOPP_ALTIVEC_AVAILABLE)
# include "ppc_simd.h"
#endif

// Squash MS LNK4221 and libtool warnings
extern const char CHACHA_SIMD_FNAME[] = __FILE__;
ANONYMOUS_NAMESPACE_BEGIN
#if (CRYPTOPP_ARM_NEON_AVAILABLE)

template <unsigned int R>
inline uint32x4_t RotateLeft(const uint32x4_t& val)
{
    return vorrq_u32(vshlq_n_u32(val, R), vshrq_n_u32(val, 32 - R));
}
template <unsigned int R>
inline uint32x4_t RotateRight(const uint32x4_t& val)
{
    return vorrq_u32(vshlq_n_u32(val, 32 - R), vshrq_n_u32(val, R));
}
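// The generic rotates above use a shift/shift/or sequence. Rotations by 8 and
// 16 bits have cheaper encodings on ARMv8 (a byte-table lookup and a 16-bit
// element reverse, respectively), so they get explicit specializations below.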
template <>
inline uint32x4_t RotateLeft<8>(const uint32x4_t& val)
{
#if defined(__aarch32__) || defined(__aarch64__)
    const uint8_t maskb[16] = { 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 };
    const uint8x16_t mask = vld1q_u8(maskb);

    return vreinterpretq_u32_u8(
        vqtbl1q_u8(vreinterpretq_u8_u32(val), mask));
#else
    return vorrq_u32(vshlq_n_u32(val, 8),
        vshrq_n_u32(val, 32 - 8));
#endif
}
template <>
inline uint32x4_t RotateLeft<16>(const uint32x4_t& val)
{
#if defined(__aarch32__) || defined(__aarch64__)
    return vreinterpretq_u32_u16(
        vrev32q_u16(vreinterpretq_u16_u32(val)));
#else
    return vorrq_u32(vshlq_n_u32(val, 16),
        vshrq_n_u32(val, 32 - 16));
#endif
}
template <>
inline uint32x4_t RotateRight<8>(const uint32x4_t& val)
{
#if defined(__aarch32__) || defined(__aarch64__)
    const uint8_t maskb[16] = { 1,2,3,0, 5,6,7,4, 9,10,11,8, 13,14,15,12 };
    const uint8x16_t mask = vld1q_u8(maskb);

    return vreinterpretq_u32_u8(
        vqtbl1q_u8(vreinterpretq_u8_u32(val), mask));
#else
    return vorrq_u32(vshrq_n_u32(val, 8),
        vshlq_n_u32(val, 32 - 8));
#endif
}
template <>
inline uint32x4_t RotateRight<16>(const uint32x4_t& val)
{
#if defined(__aarch32__) || defined(__aarch64__)
    return vreinterpretq_u32_u16(
        vrev32q_u16(vreinterpretq_u16_u32(val)));
#else
    return vorrq_u32(vshrq_n_u32(val, 16),
        vshlq_n_u32(val, 32 - 16));
#endif
}
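// ChaCha's move between the column round and the diagonal round is a rotation
// of the 32-bit lanes within each row:
//   * [3,2,1,0] => [0,3,2,1] is Extract<1>(x)
//   * [3,2,1,0] => [1,0,3,2] is Extract<2>(x)
//   * [3,2,1,0] => [2,1,0,3] is Extract<3>(x)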
template <unsigned int S>
inline uint32x4_t Extract(const uint32x4_t& val)
{
    return vextq_u32(val, val, S);
}
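// The ChaCha block counter spans two 32-bit state words, so the per-block
// counter increment below is performed as a full 64-bit lane addition.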
inline uint32x4_t Add64(const uint32x4_t& a, const uint32x4_t& b)
{
    return vreinterpretq_u32_u64(
        vaddq_u64(
            vreinterpretq_u64_u32(a),
            vreinterpretq_u64_u32(b)));
}
#endif // CRYPTOPP_ARM_NEON_AVAILABLE

#if (CRYPTOPP_SSE2_INTRIN_AVAILABLE)

template <unsigned int R>
inline __m128i RotateLeft(const __m128i val)
{
#if defined(__XOP__)
    return _mm_roti_epi32(val, R);
#else
    return _mm_or_si128(_mm_slli_epi32(val, R), _mm_srli_epi32(val, 32-R));
#endif
}
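// The 8- and 16-bit rotations below prefer a single-instruction form when one
// is available: XOP's _mm_roti_epi32, or an SSSE3 byte shuffle. Otherwise they
// fall back to the generic shift/or sequence.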
template <>
inline __m128i RotateLeft<8>(const __m128i val)
{
#if defined(__XOP__)
    return _mm_roti_epi32(val, 8);
#elif defined(__SSSE3__)
    const __m128i mask = _mm_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3);
    return _mm_shuffle_epi8(val, mask);
#else
    return _mm_or_si128(_mm_slli_epi32(val, 8), _mm_srli_epi32(val, 32-8));
#endif
}
template <>
inline __m128i RotateLeft<16>(const __m128i val)
{
#if defined(__XOP__)
    return _mm_roti_epi32(val, 16);
#elif defined(__SSSE3__)
    const __m128i mask = _mm_set_epi8(13,12,15,14, 9,8,11,10, 5,4,7,6, 1,0,3,2);
    return _mm_shuffle_epi8(val, mask);
#else
    return _mm_or_si128(_mm_slli_epi32(val, 16), _mm_srli_epi32(val, 32-16));
#endif
}
#endif // CRYPTOPP_SSE2_INTRIN_AVAILABLE

#if (CRYPTOPP_ALTIVEC_AVAILABLE)

// ChaCha is a little-endian algorithm, so loads and stores byte-reverse
// the 32-bit words on big-endian PowerPC.
inline uint32x4_p VecLoad32LE(const uint8_t src[16])
{
#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p mask = {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12};
    const uint32x4_p val = VecLoad(src);
    return VecPermute(val, val, mask);
#else
    return VecLoad(src);
#endif
}

inline void VecStore32LE(uint8_t dest[16], const uint32x4_p& val)
{
#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p mask = {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12};
    VecStore(VecPermute(val, val, mask), dest);
#else
    VecStore(val, dest);
#endif
}

// Rotate the 32-bit lanes within a row; the Altivec analog of Extract<S>.
template <unsigned int S>
inline uint32x4_p Shuffle(const uint32x4_p& val)
{
    CRYPTOPP_ASSERT(0);
    return val;
}

template <>
inline uint32x4_p Shuffle<1>(const uint32x4_p& val)
{
    const uint8x16_p mask = {4,5,6,7, 8,9,10,11, 12,13,14,15, 0,1,2,3};
    return VecPermute(val, val, mask);
}

template <>
inline uint32x4_p Shuffle<2>(const uint32x4_p& val)
{
    const uint8x16_p mask = {8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7};
    return VecPermute(val, val, mask);
}

template <>
inline uint32x4_p Shuffle<3>(const uint32x4_p& val)
{
    const uint8x16_p mask = {12,13,14,15, 0,1,2,3, 4,5,6,7, 8,9,10,11};
    return VecPermute(val, val, mask);
}

#endif // CRYPTOPP_ALTIVEC_AVAILABLE

ANONYMOUS_NAMESPACE_END
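// Each ChaCha_OperateKeystream_* routine below interleaves four independent
// 64-byte ChaCha blocks, producing 256 bytes of keystream per call. When
// input is non-NULL the keystream is XORed with it; otherwise the raw
// keystream is written to output.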
NAMESPACE_BEGIN(CryptoPP)

#if (CRYPTOPP_ARM_NEON_AVAILABLE)

void ChaCha_OperateKeystream_NEON(const word32 *state, const byte* input, byte *output, unsigned int rounds)
{
    const uint32x4_t state0 = vld1q_u32(state + 0*4);
    const uint32x4_t state1 = vld1q_u32(state + 1*4);
    const uint32x4_t state2 = vld1q_u32(state + 2*4);
    const uint32x4_t state3 = vld1q_u32(state + 3*4);

    const unsigned int w[] = {1,0,0,0, 2,0,0,0, 3,0,0,0};
    const uint32x4_t CTRS[3] = {
        vld1q_u32(w+0), vld1q_u32(w+4), vld1q_u32(w+8)
    };

    uint32x4_t r0_0 = state0;
    uint32x4_t r0_1 = state1;
    uint32x4_t r0_2 = state2;
    uint32x4_t r0_3 = state3;

    uint32x4_t r1_0 = state0;
    uint32x4_t r1_1 = state1;
    uint32x4_t r1_2 = state2;
    uint32x4_t r1_3 = Add64(r0_3, CTRS[0]);

    uint32x4_t r2_0 = state0;
    uint32x4_t r2_1 = state1;
    uint32x4_t r2_2 = state2;
    uint32x4_t r2_3 = Add64(r0_3, CTRS[1]);

    uint32x4_t r3_0 = state0;
    uint32x4_t r3_1 = state1;
    uint32x4_t r3_2 = state2;
    uint32x4_t r3_3 = Add64(r0_3, CTRS[2]);
    for (int i = static_cast<int>(rounds); i > 0; i -= 2)
    {
        r0_0 = vaddq_u32(r0_0, r0_1);
        r1_0 = vaddq_u32(r1_0, r1_1);
        r2_0 = vaddq_u32(r2_0, r2_1);
        r3_0 = vaddq_u32(r3_0, r3_1);

        r0_3 = veorq_u32(r0_3, r0_0);
        r1_3 = veorq_u32(r1_3, r1_0);
        r2_3 = veorq_u32(r2_3, r2_0);
        r3_3 = veorq_u32(r3_3, r3_0);

        r0_3 = RotateLeft<16>(r0_3);
        r1_3 = RotateLeft<16>(r1_3);
        r2_3 = RotateLeft<16>(r2_3);
        r3_3 = RotateLeft<16>(r3_3);

        r0_2 = vaddq_u32(r0_2, r0_3);
        r1_2 = vaddq_u32(r1_2, r1_3);
        r2_2 = vaddq_u32(r2_2, r2_3);
        r3_2 = vaddq_u32(r3_2, r3_3);

        r0_1 = veorq_u32(r0_1, r0_2);
        r1_1 = veorq_u32(r1_1, r1_2);
        r2_1 = veorq_u32(r2_1, r2_2);
        r3_1 = veorq_u32(r3_1, r3_2);

        r0_1 = RotateLeft<12>(r0_1);
        r1_1 = RotateLeft<12>(r1_1);
        r2_1 = RotateLeft<12>(r2_1);
        r3_1 = RotateLeft<12>(r3_1);

        r0_0 = vaddq_u32(r0_0, r0_1);
        r1_0 = vaddq_u32(r1_0, r1_1);
        r2_0 = vaddq_u32(r2_0, r2_1);
        r3_0 = vaddq_u32(r3_0, r3_1);

        r0_3 = veorq_u32(r0_3, r0_0);
        r1_3 = veorq_u32(r1_3, r1_0);
        r2_3 = veorq_u32(r2_3, r2_0);
        r3_3 = veorq_u32(r3_3, r3_0);

        r0_3 = RotateLeft<8>(r0_3);
        r1_3 = RotateLeft<8>(r1_3);
        r2_3 = RotateLeft<8>(r2_3);
        r3_3 = RotateLeft<8>(r3_3);

        r0_2 = vaddq_u32(r0_2, r0_3);
        r1_2 = vaddq_u32(r1_2, r1_3);
        r2_2 = vaddq_u32(r2_2, r2_3);
        r3_2 = vaddq_u32(r3_2, r3_3);

        r0_1 = veorq_u32(r0_1, r0_2);
        r1_1 = veorq_u32(r1_1, r1_2);
        r2_1 = veorq_u32(r2_1, r2_2);
        r3_1 = veorq_u32(r3_1, r3_2);

        r0_1 = RotateLeft<7>(r0_1);
        r1_1 = RotateLeft<7>(r1_1);
        r2_1 = RotateLeft<7>(r2_1);
        r3_1 = RotateLeft<7>(r3_1);
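        // Rotate rows 1..3 left by 1, 2 and 3 lanes so the next four
        // quarter-rounds operate on the diagonals; the rotation is undone
        // at the end of the iteration.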
        r0_1 = Extract<1>(r0_1);
        r0_2 = Extract<2>(r0_2);
        r0_3 = Extract<3>(r0_3);

        r1_1 = Extract<1>(r1_1);
        r1_2 = Extract<2>(r1_2);
        r1_3 = Extract<3>(r1_3);

        r2_1 = Extract<1>(r2_1);
        r2_2 = Extract<2>(r2_2);
        r2_3 = Extract<3>(r2_3);

        r3_1 = Extract<1>(r3_1);
        r3_2 = Extract<2>(r3_2);
        r3_3 = Extract<3>(r3_3);

        r0_0 = vaddq_u32(r0_0, r0_1);
        r1_0 = vaddq_u32(r1_0, r1_1);
        r2_0 = vaddq_u32(r2_0, r2_1);
        r3_0 = vaddq_u32(r3_0, r3_1);

        r0_3 = veorq_u32(r0_3, r0_0);
        r1_3 = veorq_u32(r1_3, r1_0);
        r2_3 = veorq_u32(r2_3, r2_0);
        r3_3 = veorq_u32(r3_3, r3_0);

        r0_3 = RotateLeft<16>(r0_3);
        r1_3 = RotateLeft<16>(r1_3);
        r2_3 = RotateLeft<16>(r2_3);
        r3_3 = RotateLeft<16>(r3_3);

        r0_2 = vaddq_u32(r0_2, r0_3);
        r1_2 = vaddq_u32(r1_2, r1_3);
        r2_2 = vaddq_u32(r2_2, r2_3);
        r3_2 = vaddq_u32(r3_2, r3_3);

        r0_1 = veorq_u32(r0_1, r0_2);
        r1_1 = veorq_u32(r1_1, r1_2);
        r2_1 = veorq_u32(r2_1, r2_2);
        r3_1 = veorq_u32(r3_1, r3_2);

        r0_1 = RotateLeft<12>(r0_1);
        r1_1 = RotateLeft<12>(r1_1);
        r2_1 = RotateLeft<12>(r2_1);
        r3_1 = RotateLeft<12>(r3_1);

        r0_0 = vaddq_u32(r0_0, r0_1);
        r1_0 = vaddq_u32(r1_0, r1_1);
        r2_0 = vaddq_u32(r2_0, r2_1);
        r3_0 = vaddq_u32(r3_0, r3_1);

        r0_3 = veorq_u32(r0_3, r0_0);
        r1_3 = veorq_u32(r1_3, r1_0);
        r2_3 = veorq_u32(r2_3, r2_0);
        r3_3 = veorq_u32(r3_3, r3_0);

        r0_3 = RotateLeft<8>(r0_3);
        r1_3 = RotateLeft<8>(r1_3);
        r2_3 = RotateLeft<8>(r2_3);
        r3_3 = RotateLeft<8>(r3_3);

        r0_2 = vaddq_u32(r0_2, r0_3);
        r1_2 = vaddq_u32(r1_2, r1_3);
        r2_2 = vaddq_u32(r2_2, r2_3);
        r3_2 = vaddq_u32(r3_2, r3_3);

        r0_1 = veorq_u32(r0_1, r0_2);
        r1_1 = veorq_u32(r1_1, r1_2);
        r2_1 = veorq_u32(r2_1, r2_2);
        r3_1 = veorq_u32(r3_1, r3_2);

        r0_1 = RotateLeft<7>(r0_1);
        r1_1 = RotateLeft<7>(r1_1);
        r2_1 = RotateLeft<7>(r2_1);
        r3_1 = RotateLeft<7>(r3_1);

        r0_1 = Extract<3>(r0_1);
        r0_2 = Extract<2>(r0_2);
        r0_3 = Extract<1>(r0_3);

        r1_1 = Extract<3>(r1_1);
        r1_2 = Extract<2>(r1_2);
        r1_3 = Extract<1>(r1_3);

        r2_1 = Extract<3>(r2_1);
        r2_2 = Extract<2>(r2_2);
        r2_3 = Extract<1>(r2_3);

        r3_1 = Extract<3>(r3_1);
        r3_2 = Extract<2>(r3_2);
        r3_3 = Extract<1>(r3_3);
    }
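    // Feed-forward: add the input state back into each block. Blocks r1..r3
    // also re-apply their counter offsets, matching the +1/+2/+3 used above.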
    r0_0 = vaddq_u32(r0_0, state0);
    r0_1 = vaddq_u32(r0_1, state1);
    r0_2 = vaddq_u32(r0_2, state2);
    r0_3 = vaddq_u32(r0_3, state3);

    r1_0 = vaddq_u32(r1_0, state0);
    r1_1 = vaddq_u32(r1_1, state1);
    r1_2 = vaddq_u32(r1_2, state2);
    r1_3 = vaddq_u32(r1_3, state3);
    r1_3 = Add64(r1_3, CTRS[0]);

    r2_0 = vaddq_u32(r2_0, state0);
    r2_1 = vaddq_u32(r2_1, state1);
    r2_2 = vaddq_u32(r2_2, state2);
    r2_3 = vaddq_u32(r2_3, state3);
    r2_3 = Add64(r2_3, CTRS[1]);

    r3_0 = vaddq_u32(r3_0, state0);
    r3_1 = vaddq_u32(r3_1, state1);
    r3_2 = vaddq_u32(r3_2, state2);
    r3_3 = vaddq_u32(r3_3, state3);
    r3_3 = Add64(r3_3, CTRS[2]);
    if (input)
    {
        r0_0 = veorq_u32(vreinterpretq_u32_u8(vld1q_u8(input + 0*16)), r0_0);
        r0_1 = veorq_u32(vreinterpretq_u32_u8(vld1q_u8(input + 1*16)), r0_1);
        r0_2 = veorq_u32(vreinterpretq_u32_u8(vld1q_u8(input + 2*16)), r0_2);
        r0_3 = veorq_u32(vreinterpretq_u32_u8(vld1q_u8(input + 3*16)), r0_3);
    }

    vst1q_u8(output + 0*16, vreinterpretq_u8_u32(r0_0));
    vst1q_u8(output + 1*16, vreinterpretq_u8_u32(r0_1));
    vst1q_u8(output + 2*16, vreinterpretq_u8_u32(r0_2));
    vst1q_u8(output + 3*16, vreinterpretq_u8_u32(r0_3));
    if (input)
    {
        r1_0 = veorq_u32(vreinterpretq_u32_u8(vld1q_u8(input + 4*16)), r1_0);
        r1_1 = veorq_u32(vreinterpretq_u32_u8(vld1q_u8(input + 5*16)), r1_1);
        r1_2 = veorq_u32(vreinterpretq_u32_u8(vld1q_u8(input + 6*16)), r1_2);
        r1_3 = veorq_u32(vreinterpretq_u32_u8(vld1q_u8(input + 7*16)), r1_3);
    }

    vst1q_u8(output + 4*16, vreinterpretq_u8_u32(r1_0));
    vst1q_u8(output + 5*16, vreinterpretq_u8_u32(r1_1));
    vst1q_u8(output + 6*16, vreinterpretq_u8_u32(r1_2));
    vst1q_u8(output + 7*16, vreinterpretq_u8_u32(r1_3));
    if (input)
    {
        r2_0 = veorq_u32(vreinterpretq_u32_u8(vld1q_u8(input + 8*16)), r2_0);
        r2_1 = veorq_u32(vreinterpretq_u32_u8(vld1q_u8(input + 9*16)), r2_1);
        r2_2 = veorq_u32(vreinterpretq_u32_u8(vld1q_u8(input + 10*16)), r2_2);
        r2_3 = veorq_u32(vreinterpretq_u32_u8(vld1q_u8(input + 11*16)), r2_3);
    }

    vst1q_u8(output + 8*16, vreinterpretq_u8_u32(r2_0));
    vst1q_u8(output + 9*16, vreinterpretq_u8_u32(r2_1));
    vst1q_u8(output + 10*16, vreinterpretq_u8_u32(r2_2));
    vst1q_u8(output + 11*16, vreinterpretq_u8_u32(r2_3));
    if (input)
    {
        r3_0 = veorq_u32(vreinterpretq_u32_u8(vld1q_u8(input + 12*16)), r3_0);
        r3_1 = veorq_u32(vreinterpretq_u32_u8(vld1q_u8(input + 13*16)), r3_1);
        r3_2 = veorq_u32(vreinterpretq_u32_u8(vld1q_u8(input + 14*16)), r3_2);
        r3_3 = veorq_u32(vreinterpretq_u32_u8(vld1q_u8(input + 15*16)), r3_3);
    }

    vst1q_u8(output + 12*16, vreinterpretq_u8_u32(r3_0));
    vst1q_u8(output + 13*16, vreinterpretq_u8_u32(r3_1));
    vst1q_u8(output + 14*16, vreinterpretq_u8_u32(r3_2));
    vst1q_u8(output + 15*16, vreinterpretq_u8_u32(r3_3));
}
#endif // CRYPTOPP_ARM_NEON_AVAILABLE

#if (CRYPTOPP_SSE2_INTRIN_AVAILABLE)

void ChaCha_OperateKeystream_SSE2(const word32 *state, const byte* input, byte *output, unsigned int rounds)
{
    const __m128i* state_mm = reinterpret_cast<const __m128i*>(state);
    const __m128i* input_mm = reinterpret_cast<const __m128i*>(input);
    __m128i* output_mm = reinterpret_cast<__m128i*>(output);
    const __m128i state0 = _mm_load_si128(state_mm + 0);
    const __m128i state1 = _mm_load_si128(state_mm + 1);
    const __m128i state2 = _mm_load_si128(state_mm + 2);
    const __m128i state3 = _mm_load_si128(state_mm + 3);

    __m128i r0_0 = state0;
    __m128i r0_1 = state1;
    __m128i r0_2 = state2;
    __m128i r0_3 = state3;

    __m128i r1_0 = state0;
    __m128i r1_1 = state1;
    __m128i r1_2 = state2;
    __m128i r1_3 = _mm_add_epi64(r0_3, _mm_set_epi32(0, 0, 0, 1));

    __m128i r2_0 = state0;
    __m128i r2_1 = state1;
    __m128i r2_2 = state2;
    __m128i r2_3 = _mm_add_epi64(r0_3, _mm_set_epi32(0, 0, 0, 2));

    __m128i r3_0 = state0;
    __m128i r3_1 = state1;
    __m128i r3_2 = state2;
    __m128i r3_3 = _mm_add_epi64(r0_3, _mm_set_epi32(0, 0, 0, 3));
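    // _mm_add_epi64 bumps the 64-bit block counter held in the low lane of
    // row 3, so a carry out of the low 32-bit word propagates correctly.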
    for (int i = static_cast<int>(rounds); i > 0; i -= 2)
    {
        r0_0 = _mm_add_epi32(r0_0, r0_1);
        r1_0 = _mm_add_epi32(r1_0, r1_1);
        r2_0 = _mm_add_epi32(r2_0, r2_1);
        r3_0 = _mm_add_epi32(r3_0, r3_1);

        r0_3 = _mm_xor_si128(r0_3, r0_0);
        r1_3 = _mm_xor_si128(r1_3, r1_0);
        r2_3 = _mm_xor_si128(r2_3, r2_0);
        r3_3 = _mm_xor_si128(r3_3, r3_0);

        r0_3 = RotateLeft<16>(r0_3);
        r1_3 = RotateLeft<16>(r1_3);
        r2_3 = RotateLeft<16>(r2_3);
        r3_3 = RotateLeft<16>(r3_3);

        r0_2 = _mm_add_epi32(r0_2, r0_3);
        r1_2 = _mm_add_epi32(r1_2, r1_3);
        r2_2 = _mm_add_epi32(r2_2, r2_3);
        r3_2 = _mm_add_epi32(r3_2, r3_3);

        r0_1 = _mm_xor_si128(r0_1, r0_2);
        r1_1 = _mm_xor_si128(r1_1, r1_2);
        r2_1 = _mm_xor_si128(r2_1, r2_2);
        r3_1 = _mm_xor_si128(r3_1, r3_2);

        r0_1 = RotateLeft<12>(r0_1);
        r1_1 = RotateLeft<12>(r1_1);
        r2_1 = RotateLeft<12>(r2_1);
        r3_1 = RotateLeft<12>(r3_1);

        r0_0 = _mm_add_epi32(r0_0, r0_1);
        r1_0 = _mm_add_epi32(r1_0, r1_1);
        r2_0 = _mm_add_epi32(r2_0, r2_1);
        r3_0 = _mm_add_epi32(r3_0, r3_1);

        r0_3 = _mm_xor_si128(r0_3, r0_0);
        r1_3 = _mm_xor_si128(r1_3, r1_0);
        r2_3 = _mm_xor_si128(r2_3, r2_0);
        r3_3 = _mm_xor_si128(r3_3, r3_0);

        r0_3 = RotateLeft<8>(r0_3);
        r1_3 = RotateLeft<8>(r1_3);
        r2_3 = RotateLeft<8>(r2_3);
        r3_3 = RotateLeft<8>(r3_3);

        r0_2 = _mm_add_epi32(r0_2, r0_3);
        r1_2 = _mm_add_epi32(r1_2, r1_3);
        r2_2 = _mm_add_epi32(r2_2, r2_3);
        r3_2 = _mm_add_epi32(r3_2, r3_3);

        r0_1 = _mm_xor_si128(r0_1, r0_2);
        r1_1 = _mm_xor_si128(r1_1, r1_2);
        r2_1 = _mm_xor_si128(r2_1, r2_2);
        r3_1 = _mm_xor_si128(r3_1, r3_2);

        r0_1 = RotateLeft<7>(r0_1);
        r1_1 = RotateLeft<7>(r1_1);
        r2_1 = RotateLeft<7>(r2_1);
        r3_1 = RotateLeft<7>(r3_1);
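        // _mm_shuffle_epi32 rotates rows 1..3 by one, two and three lanes to
        // line up the diagonals for the next four quarter-rounds; the inverse
        // shuffles at the end of the iteration undo it.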
        r0_1 = _mm_shuffle_epi32(r0_1, _MM_SHUFFLE(0, 3, 2, 1));
        r0_2 = _mm_shuffle_epi32(r0_2, _MM_SHUFFLE(1, 0, 3, 2));
        r0_3 = _mm_shuffle_epi32(r0_3, _MM_SHUFFLE(2, 1, 0, 3));

        r1_1 = _mm_shuffle_epi32(r1_1, _MM_SHUFFLE(0, 3, 2, 1));
        r1_2 = _mm_shuffle_epi32(r1_2, _MM_SHUFFLE(1, 0, 3, 2));
        r1_3 = _mm_shuffle_epi32(r1_3, _MM_SHUFFLE(2, 1, 0, 3));

        r2_1 = _mm_shuffle_epi32(r2_1, _MM_SHUFFLE(0, 3, 2, 1));
        r2_2 = _mm_shuffle_epi32(r2_2, _MM_SHUFFLE(1, 0, 3, 2));
        r2_3 = _mm_shuffle_epi32(r2_3, _MM_SHUFFLE(2, 1, 0, 3));

        r3_1 = _mm_shuffle_epi32(r3_1, _MM_SHUFFLE(0, 3, 2, 1));
        r3_2 = _mm_shuffle_epi32(r3_2, _MM_SHUFFLE(1, 0, 3, 2));
        r3_3 = _mm_shuffle_epi32(r3_3, _MM_SHUFFLE(2, 1, 0, 3));

        r0_0 = _mm_add_epi32(r0_0, r0_1);
        r1_0 = _mm_add_epi32(r1_0, r1_1);
        r2_0 = _mm_add_epi32(r2_0, r2_1);
        r3_0 = _mm_add_epi32(r3_0, r3_1);

        r0_3 = _mm_xor_si128(r0_3, r0_0);
        r1_3 = _mm_xor_si128(r1_3, r1_0);
        r2_3 = _mm_xor_si128(r2_3, r2_0);
        r3_3 = _mm_xor_si128(r3_3, r3_0);

        r0_3 = RotateLeft<16>(r0_3);
        r1_3 = RotateLeft<16>(r1_3);
        r2_3 = RotateLeft<16>(r2_3);
        r3_3 = RotateLeft<16>(r3_3);

        r0_2 = _mm_add_epi32(r0_2, r0_3);
        r1_2 = _mm_add_epi32(r1_2, r1_3);
        r2_2 = _mm_add_epi32(r2_2, r2_3);
        r3_2 = _mm_add_epi32(r3_2, r3_3);

        r0_1 = _mm_xor_si128(r0_1, r0_2);
        r1_1 = _mm_xor_si128(r1_1, r1_2);
        r2_1 = _mm_xor_si128(r2_1, r2_2);
        r3_1 = _mm_xor_si128(r3_1, r3_2);

        r0_1 = RotateLeft<12>(r0_1);
        r1_1 = RotateLeft<12>(r1_1);
        r2_1 = RotateLeft<12>(r2_1);
        r3_1 = RotateLeft<12>(r3_1);

        r0_0 = _mm_add_epi32(r0_0, r0_1);
        r1_0 = _mm_add_epi32(r1_0, r1_1);
        r2_0 = _mm_add_epi32(r2_0, r2_1);
        r3_0 = _mm_add_epi32(r3_0, r3_1);

        r0_3 = _mm_xor_si128(r0_3, r0_0);
        r1_3 = _mm_xor_si128(r1_3, r1_0);
        r2_3 = _mm_xor_si128(r2_3, r2_0);
        r3_3 = _mm_xor_si128(r3_3, r3_0);

        r0_3 = RotateLeft<8>(r0_3);
        r1_3 = RotateLeft<8>(r1_3);
        r2_3 = RotateLeft<8>(r2_3);
        r3_3 = RotateLeft<8>(r3_3);

        r0_2 = _mm_add_epi32(r0_2, r0_3);
        r1_2 = _mm_add_epi32(r1_2, r1_3);
        r2_2 = _mm_add_epi32(r2_2, r2_3);
        r3_2 = _mm_add_epi32(r3_2, r3_3);

        r0_1 = _mm_xor_si128(r0_1, r0_2);
        r1_1 = _mm_xor_si128(r1_1, r1_2);
        r2_1 = _mm_xor_si128(r2_1, r2_2);
        r3_1 = _mm_xor_si128(r3_1, r3_2);

        r0_1 = RotateLeft<7>(r0_1);
        r1_1 = RotateLeft<7>(r1_1);
        r2_1 = RotateLeft<7>(r2_1);
        r3_1 = RotateLeft<7>(r3_1);

        r0_1 = _mm_shuffle_epi32(r0_1, _MM_SHUFFLE(2, 1, 0, 3));
        r0_2 = _mm_shuffle_epi32(r0_2, _MM_SHUFFLE(1, 0, 3, 2));
        r0_3 = _mm_shuffle_epi32(r0_3, _MM_SHUFFLE(0, 3, 2, 1));

        r1_1 = _mm_shuffle_epi32(r1_1, _MM_SHUFFLE(2, 1, 0, 3));
        r1_2 = _mm_shuffle_epi32(r1_2, _MM_SHUFFLE(1, 0, 3, 2));
        r1_3 = _mm_shuffle_epi32(r1_3, _MM_SHUFFLE(0, 3, 2, 1));

        r2_1 = _mm_shuffle_epi32(r2_1, _MM_SHUFFLE(2, 1, 0, 3));
        r2_2 = _mm_shuffle_epi32(r2_2, _MM_SHUFFLE(1, 0, 3, 2));
        r2_3 = _mm_shuffle_epi32(r2_3, _MM_SHUFFLE(0, 3, 2, 1));

        r3_1 = _mm_shuffle_epi32(r3_1, _MM_SHUFFLE(2, 1, 0, 3));
        r3_2 = _mm_shuffle_epi32(r3_2, _MM_SHUFFLE(1, 0, 3, 2));
        r3_3 = _mm_shuffle_epi32(r3_3, _MM_SHUFFLE(0, 3, 2, 1));
    }
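    // Feed-forward: add the input state back into each block, then re-apply
    // the +1/+2/+3 counter offsets for blocks r1..r3.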
    r0_0 = _mm_add_epi32(r0_0, state0);
    r0_1 = _mm_add_epi32(r0_1, state1);
    r0_2 = _mm_add_epi32(r0_2, state2);
    r0_3 = _mm_add_epi32(r0_3, state3);

    r1_0 = _mm_add_epi32(r1_0, state0);
    r1_1 = _mm_add_epi32(r1_1, state1);
    r1_2 = _mm_add_epi32(r1_2, state2);
    r1_3 = _mm_add_epi32(r1_3, state3);
    r1_3 = _mm_add_epi64(r1_3, _mm_set_epi32(0, 0, 0, 1));

    r2_0 = _mm_add_epi32(r2_0, state0);
    r2_1 = _mm_add_epi32(r2_1, state1);
    r2_2 = _mm_add_epi32(r2_2, state2);
    r2_3 = _mm_add_epi32(r2_3, state3);
    r2_3 = _mm_add_epi64(r2_3, _mm_set_epi32(0, 0, 0, 2));

    r3_0 = _mm_add_epi32(r3_0, state0);
    r3_1 = _mm_add_epi32(r3_1, state1);
    r3_2 = _mm_add_epi32(r3_2, state2);
    r3_3 = _mm_add_epi32(r3_3, state3);
    r3_3 = _mm_add_epi64(r3_3, _mm_set_epi32(0, 0, 0, 3));
    if (input)
    {
        r0_0 = _mm_xor_si128(_mm_loadu_si128(input_mm + 0), r0_0);
        r0_1 = _mm_xor_si128(_mm_loadu_si128(input_mm + 1), r0_1);
        r0_2 = _mm_xor_si128(_mm_loadu_si128(input_mm + 2), r0_2);
        r0_3 = _mm_xor_si128(_mm_loadu_si128(input_mm + 3), r0_3);
    }

    _mm_storeu_si128(output_mm + 0, r0_0);
    _mm_storeu_si128(output_mm + 1, r0_1);
    _mm_storeu_si128(output_mm + 2, r0_2);
    _mm_storeu_si128(output_mm + 3, r0_3);
    if (input)
    {
        r1_0 = _mm_xor_si128(_mm_loadu_si128(input_mm + 4), r1_0);
        r1_1 = _mm_xor_si128(_mm_loadu_si128(input_mm + 5), r1_1);
        r1_2 = _mm_xor_si128(_mm_loadu_si128(input_mm + 6), r1_2);
        r1_3 = _mm_xor_si128(_mm_loadu_si128(input_mm + 7), r1_3);
    }

    _mm_storeu_si128(output_mm + 4, r1_0);
    _mm_storeu_si128(output_mm + 5, r1_1);
    _mm_storeu_si128(output_mm + 6, r1_2);
    _mm_storeu_si128(output_mm + 7, r1_3);
    if (input)
    {
        r2_0 = _mm_xor_si128(_mm_loadu_si128(input_mm + 8), r2_0);
        r2_1 = _mm_xor_si128(_mm_loadu_si128(input_mm + 9), r2_1);
        r2_2 = _mm_xor_si128(_mm_loadu_si128(input_mm + 10), r2_2);
        r2_3 = _mm_xor_si128(_mm_loadu_si128(input_mm + 11), r2_3);
    }

    _mm_storeu_si128(output_mm + 8, r2_0);
    _mm_storeu_si128(output_mm + 9, r2_1);
    _mm_storeu_si128(output_mm + 10, r2_2);
    _mm_storeu_si128(output_mm + 11, r2_3);
    if (input)
    {
        r3_0 = _mm_xor_si128(_mm_loadu_si128(input_mm + 12), r3_0);
        r3_1 = _mm_xor_si128(_mm_loadu_si128(input_mm + 13), r3_1);
        r3_2 = _mm_xor_si128(_mm_loadu_si128(input_mm + 14), r3_2);
        r3_3 = _mm_xor_si128(_mm_loadu_si128(input_mm + 15), r3_3);
    }

    _mm_storeu_si128(output_mm + 12, r3_0);
    _mm_storeu_si128(output_mm + 13, r3_1);
    _mm_storeu_si128(output_mm + 14, r3_2);
    _mm_storeu_si128(output_mm + 15, r3_3);
}
#endif // CRYPTOPP_SSE2_INTRIN_AVAILABLE

#if (CRYPTOPP_POWER8_AVAILABLE || CRYPTOPP_ALTIVEC_AVAILABLE)

// ChaCha_OperateKeystream_CORE is shared by the POWER8 and Altivec paths below.
inline void ChaCha_OperateKeystream_CORE(const word32 *state, const byte* input, byte *output, unsigned int rounds)
{
    const uint32x4_p state0 = VecLoad(state + 0*4);
    const uint32x4_p state1 = VecLoad(state + 1*4);
    const uint32x4_p state2 = VecLoad(state + 2*4);
    const uint32x4_p state3 = VecLoad(state + 3*4);

    const uint32x4_p CTRS[3] = {
        {1,0,0,0}, {2,0,0,0}, {3,0,0,0}
    };

    uint32x4_p r0_0 = state0;
    uint32x4_p r0_1 = state1;
    uint32x4_p r0_2 = state2;
    uint32x4_p r0_3 = state3;

    uint32x4_p r1_0 = state0;
    uint32x4_p r1_1 = state1;
    uint32x4_p r1_2 = state2;
    uint32x4_p r1_3 = VecAdd64(r0_3, CTRS[0]);

    uint32x4_p r2_0 = state0;
    uint32x4_p r2_1 = state1;
    uint32x4_p r2_2 = state2;
    uint32x4_p r2_3 = VecAdd64(r0_3, CTRS[1]);

    uint32x4_p r3_0 = state0;
    uint32x4_p r3_1 = state1;
    uint32x4_p r3_2 = state2;
    uint32x4_p r3_3 = VecAdd64(r0_3, CTRS[2]);
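    // Same four-block interleave as the NEON and SSE2 kernels: r0 keeps the
    // caller's counter, r1..r3 advance the 64-bit counter by 1, 2 and 3.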
    for (int i = static_cast<int>(rounds); i > 0; i -= 2)
    {
        r0_0 = VecAdd(r0_0, r0_1);
        r1_0 = VecAdd(r1_0, r1_1);
        r2_0 = VecAdd(r2_0, r2_1);
        r3_0 = VecAdd(r3_0, r3_1);

        r0_3 = VecXor(r0_3, r0_0);
        r1_3 = VecXor(r1_3, r1_0);
        r2_3 = VecXor(r2_3, r2_0);
        r3_3 = VecXor(r3_3, r3_0);

        r0_3 = VecRotateLeft<16>(r0_3);
        r1_3 = VecRotateLeft<16>(r1_3);
        r2_3 = VecRotateLeft<16>(r2_3);
        r3_3 = VecRotateLeft<16>(r3_3);

        r0_2 = VecAdd(r0_2, r0_3);
        r1_2 = VecAdd(r1_2, r1_3);
        r2_2 = VecAdd(r2_2, r2_3);
        r3_2 = VecAdd(r3_2, r3_3);

        r0_1 = VecXor(r0_1, r0_2);
        r1_1 = VecXor(r1_1, r1_2);
        r2_1 = VecXor(r2_1, r2_2);
        r3_1 = VecXor(r3_1, r3_2);

        r0_1 = VecRotateLeft<12>(r0_1);
        r1_1 = VecRotateLeft<12>(r1_1);
        r2_1 = VecRotateLeft<12>(r2_1);
        r3_1 = VecRotateLeft<12>(r3_1);

        r0_0 = VecAdd(r0_0, r0_1);
        r1_0 = VecAdd(r1_0, r1_1);
        r2_0 = VecAdd(r2_0, r2_1);
        r3_0 = VecAdd(r3_0, r3_1);

        r0_3 = VecXor(r0_3, r0_0);
        r1_3 = VecXor(r1_3, r1_0);
        r2_3 = VecXor(r2_3, r2_0);
        r3_3 = VecXor(r3_3, r3_0);

        r0_3 = VecRotateLeft<8>(r0_3);
        r1_3 = VecRotateLeft<8>(r1_3);
        r2_3 = VecRotateLeft<8>(r2_3);
        r3_3 = VecRotateLeft<8>(r3_3);

        r0_2 = VecAdd(r0_2, r0_3);
        r1_2 = VecAdd(r1_2, r1_3);
        r2_2 = VecAdd(r2_2, r2_3);
        r3_2 = VecAdd(r3_2, r3_3);

        r0_1 = VecXor(r0_1, r0_2);
        r1_1 = VecXor(r1_1, r1_2);
        r2_1 = VecXor(r2_1, r2_2);
        r3_1 = VecXor(r3_1, r3_2);

        r0_1 = VecRotateLeft<7>(r0_1);
        r1_1 = VecRotateLeft<7>(r1_1);
        r2_1 = VecRotateLeft<7>(r2_1);
        r3_1 = VecRotateLeft<7>(r3_1);
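        // Shuffle<1..3> rotate rows 1..3 by 4, 8 and 12 bytes (VecPermute),
        // the Altivec analog of Extract<S> and _mm_shuffle_epi32 above.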
        r0_1 = Shuffle<1>(r0_1);
        r0_2 = Shuffle<2>(r0_2);
        r0_3 = Shuffle<3>(r0_3);

        r1_1 = Shuffle<1>(r1_1);
        r1_2 = Shuffle<2>(r1_2);
        r1_3 = Shuffle<3>(r1_3);

        r2_1 = Shuffle<1>(r2_1);
        r2_2 = Shuffle<2>(r2_2);
        r2_3 = Shuffle<3>(r2_3);

        r3_1 = Shuffle<1>(r3_1);
        r3_2 = Shuffle<2>(r3_2);
        r3_3 = Shuffle<3>(r3_3);

        r0_0 = VecAdd(r0_0, r0_1);
        r1_0 = VecAdd(r1_0, r1_1);
        r2_0 = VecAdd(r2_0, r2_1);
        r3_0 = VecAdd(r3_0, r3_1);

        r0_3 = VecXor(r0_3, r0_0);
        r1_3 = VecXor(r1_3, r1_0);
        r2_3 = VecXor(r2_3, r2_0);
        r3_3 = VecXor(r3_3, r3_0);

        r0_3 = VecRotateLeft<16>(r0_3);
        r1_3 = VecRotateLeft<16>(r1_3);
        r2_3 = VecRotateLeft<16>(r2_3);
        r3_3 = VecRotateLeft<16>(r3_3);

        r0_2 = VecAdd(r0_2, r0_3);
        r1_2 = VecAdd(r1_2, r1_3);
        r2_2 = VecAdd(r2_2, r2_3);
        r3_2 = VecAdd(r3_2, r3_3);

        r0_1 = VecXor(r0_1, r0_2);
        r1_1 = VecXor(r1_1, r1_2);
        r2_1 = VecXor(r2_1, r2_2);
        r3_1 = VecXor(r3_1, r3_2);

        r0_1 = VecRotateLeft<12>(r0_1);
        r1_1 = VecRotateLeft<12>(r1_1);
        r2_1 = VecRotateLeft<12>(r2_1);
        r3_1 = VecRotateLeft<12>(r3_1);

        r0_0 = VecAdd(r0_0, r0_1);
        r1_0 = VecAdd(r1_0, r1_1);
        r2_0 = VecAdd(r2_0, r2_1);
        r3_0 = VecAdd(r3_0, r3_1);

        r0_3 = VecXor(r0_3, r0_0);
        r1_3 = VecXor(r1_3, r1_0);
        r2_3 = VecXor(r2_3, r2_0);
        r3_3 = VecXor(r3_3, r3_0);

        r0_3 = VecRotateLeft<8>(r0_3);
        r1_3 = VecRotateLeft<8>(r1_3);
        r2_3 = VecRotateLeft<8>(r2_3);
        r3_3 = VecRotateLeft<8>(r3_3);

        r0_2 = VecAdd(r0_2, r0_3);
        r1_2 = VecAdd(r1_2, r1_3);
        r2_2 = VecAdd(r2_2, r2_3);
        r3_2 = VecAdd(r3_2, r3_3);

        r0_1 = VecXor(r0_1, r0_2);
        r1_1 = VecXor(r1_1, r1_2);
        r2_1 = VecXor(r2_1, r2_2);
        r3_1 = VecXor(r3_1, r3_2);

        r0_1 = VecRotateLeft<7>(r0_1);
        r1_1 = VecRotateLeft<7>(r1_1);
        r2_1 = VecRotateLeft<7>(r2_1);
        r3_1 = VecRotateLeft<7>(r3_1);

        r0_1 = Shuffle<3>(r0_1);
        r0_2 = Shuffle<2>(r0_2);
        r0_3 = Shuffle<1>(r0_3);

        r1_1 = Shuffle<3>(r1_1);
        r1_2 = Shuffle<2>(r1_2);
        r1_3 = Shuffle<1>(r1_3);

        r2_1 = Shuffle<3>(r2_1);
        r2_2 = Shuffle<2>(r2_2);
        r2_3 = Shuffle<1>(r2_3);

        r3_1 = Shuffle<3>(r3_1);
        r3_2 = Shuffle<2>(r3_2);
        r3_3 = Shuffle<1>(r3_3);
    }
    r0_0 = VecAdd(r0_0, state0);
    r0_1 = VecAdd(r0_1, state1);
    r0_2 = VecAdd(r0_2, state2);
    r0_3 = VecAdd(r0_3, state3);

    r1_0 = VecAdd(r1_0, state0);
    r1_1 = VecAdd(r1_1, state1);
    r1_2 = VecAdd(r1_2, state2);
    r1_3 = VecAdd(r1_3, state3);
    r1_3 = VecAdd64(r1_3, CTRS[0]);

    r2_0 = VecAdd(r2_0, state0);
    r2_1 = VecAdd(r2_1, state1);
    r2_2 = VecAdd(r2_2, state2);
    r2_3 = VecAdd(r2_3, state3);
    r2_3 = VecAdd64(r2_3, CTRS[1]);

    r3_0 = VecAdd(r3_0, state0);
    r3_1 = VecAdd(r3_1, state1);
    r3_2 = VecAdd(r3_2, state2);
    r3_3 = VecAdd(r3_3, state3);
    r3_3 = VecAdd64(r3_3, CTRS[2]);
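    // VecLoad32LE/VecStore32LE byte-swap on big-endian targets so the
    // keystream bytes match ChaCha's little-endian specification.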
    if (input)
    {
        r0_0 = VecXor(VecLoad32LE(input + 0*16), r0_0);
        r0_1 = VecXor(VecLoad32LE(input + 1*16), r0_1);
        r0_2 = VecXor(VecLoad32LE(input + 2*16), r0_2);
        r0_3 = VecXor(VecLoad32LE(input + 3*16), r0_3);
    }

    VecStore32LE(output + 0*16, r0_0);
    VecStore32LE(output + 1*16, r0_1);
    VecStore32LE(output + 2*16, r0_2);
    VecStore32LE(output + 3*16, r0_3);
    if (input)
    {
        r1_0 = VecXor(VecLoad32LE(input + 4*16), r1_0);
        r1_1 = VecXor(VecLoad32LE(input + 5*16), r1_1);
        r1_2 = VecXor(VecLoad32LE(input + 6*16), r1_2);
        r1_3 = VecXor(VecLoad32LE(input + 7*16), r1_3);
    }

    VecStore32LE(output + 4*16, r1_0);
    VecStore32LE(output + 5*16, r1_1);
    VecStore32LE(output + 6*16, r1_2);
    VecStore32LE(output + 7*16, r1_3);
    if (input)
    {
        r2_0 = VecXor(VecLoad32LE(input + 8*16), r2_0);
        r2_1 = VecXor(VecLoad32LE(input + 9*16), r2_1);
        r2_2 = VecXor(VecLoad32LE(input + 10*16), r2_2);
        r2_3 = VecXor(VecLoad32LE(input + 11*16), r2_3);
    }

    VecStore32LE(output + 8*16, r2_0);
    VecStore32LE(output + 9*16, r2_1);
    VecStore32LE(output + 10*16, r2_2);
    VecStore32LE(output + 11*16, r2_3);
    if (input)
    {
        r3_0 = VecXor(VecLoad32LE(input + 12*16), r3_0);
        r3_1 = VecXor(VecLoad32LE(input + 13*16), r3_1);
        r3_2 = VecXor(VecLoad32LE(input + 14*16), r3_2);
        r3_3 = VecXor(VecLoad32LE(input + 15*16), r3_3);
    }

    VecStore32LE(output + 12*16, r3_0);
    VecStore32LE(output + 13*16, r3_1);
    VecStore32LE(output + 14*16, r3_2);
    VecStore32LE(output + 15*16, r3_3);
}
#endif // CRYPTOPP_POWER8_AVAILABLE || CRYPTOPP_ALTIVEC_AVAILABLE

#if (CRYPTOPP_POWER8_AVAILABLE)

void ChaCha_OperateKeystream_POWER8(const word32 *state, const byte* input, byte *output, unsigned int rounds)
{
    ChaCha_OperateKeystream_CORE(state, input, output, rounds);
}

#elif (CRYPTOPP_ALTIVEC_AVAILABLE)

void ChaCha_OperateKeystream_ALTIVEC(const word32 *state, const byte* input, byte *output, unsigned int rounds)
{
    ChaCha_OperateKeystream_CORE(state, input, output, rounds);
}

#endif

NAMESPACE_END