Crypto++ 8.2
Free C++ class library of cryptographic schemes
chacha_simd.cpp
1 // chacha_simd.cpp - written and placed in the public domain by
2 // Jack Lloyd and Jeffrey Walton
3 //
4 // This source file uses intrinsics and built-ins to gain access to
5 // SSE2, ARM NEON and ARMv8a, Power7 and Altivec instructions. A separate
6 // source file is needed because additional CXXFLAGS are required to enable
7 // the appropriate instruction sets in some build configurations.
8 //
9 // SSE2 implementation based on Botan's chacha_sse2.cpp. Many thanks
10 // to Jack Lloyd and the Botan team for allowing us to use it.
11 //
12 // The SSE2 implementation is somewhat unusual among Crypto++ algorithms.
13 // We guard on CRYPTOPP_SSE2_AVAILABLE and use HasSSE2() at runtime. However,
14 // if the compiler says the target machine has SSSE3 or XOP available (say, by
15 // way of -march=native), then we can pull another 150 to 800 MB/s out of
16 // ChaCha. To capture SSSE3 and XOP we use the compiler defines __SSSE3__ and
17 // __XOP__ and forgo the runtime tests.
18 //
19 // Runtime tests for HasSSSE3() and HasXop() are too expensive to make a
20 // sub-case of SSE2. The rotates are on a critical path and the runtime tests
21 // crush performance.
22 //
23 // Here are some relative numbers for ChaCha8:
24 // * Intel Skylake, 3.0 GHz: SSE2 at 2160 MB/s; SSSE3 at 2310 MB/s.
25 // * AMD Bulldozer, 3.3 GHz: SSE2 at 1680 MB/s; XOP at 2510 MB/s.
26 
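// For reference, the SIMD kernels below are 4-way vectorizations of the
// scalar ChaCha quarter round (a minimal sketch, shown only as a comment;
// helper names are illustrative and the portable implementation lives in
// chacha.cpp):
//
//   inline word32 RotL32(word32 v, unsigned int r) {
//       return (v << r) | (v >> (32 - r));
//   }
//   inline void QuarterRound(word32& a, word32& b, word32& c, word32& d) {
//       a += b; d ^= a; d = RotL32(d, 16);
//       c += d; b ^= c; b = RotL32(b, 12);
//       a += b; d ^= a; d = RotL32(d, 8);
//       c += d; b ^= c; b = RotL32(b, 7);
//   }
//
// Each vector register holds one 4-word row of the state, so a single
// add/xor/rotate sequence applies the quarter round to four columns (or,
// after the shuffles, four diagonals) at once, and the four register sets
// r0_* .. r3_* process four 64-byte blocks in parallel.
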
27 #include "pch.h"
28 #include "config.h"
29 
30 #include "chacha.h"
31 #include "misc.h"
32 
33 // Internal compiler error in GCC 3.3 and below
34 #if defined(__GNUC__) && (__GNUC__ < 4)
35 # undef CRYPTOPP_SSE2_INTRIN_AVAILABLE
36 #endif
37 
38 #if (CRYPTOPP_SSE2_INTRIN_AVAILABLE)
39 # include <xmmintrin.h>
40 # include <emmintrin.h>
41 #endif
42 
43 #if defined(__SSSE3__)
44 # include <tmmintrin.h>
45 #endif
46 
47 #if defined(__XOP__)
48 # include <ammintrin.h>
49 #endif
50 
51 // C1189: error: This header is specific to ARM targets
52 #if (CRYPTOPP_ARM_NEON_AVAILABLE) && !defined(_M_ARM64)
53 # include <arm_neon.h>
54 #endif
55 
56 #if (CRYPTOPP_ARM_ACLE_AVAILABLE)
57 # include <stdint.h>
58 # include <arm_acle.h>
59 #endif
60 
61 #if defined(CRYPTOPP_ALTIVEC_AVAILABLE)
62 # include "ppc_simd.h"
63 #endif
64 
65 // Squash MS LNK4221 and libtool warnings
66 extern const char CHACHA_SIMD_FNAME[] = __FILE__;
67 
68 ANONYMOUS_NAMESPACE_BEGIN
69 
70 // ***************************** NEON ***************************** //
71 
72 #if (CRYPTOPP_ARM_NEON_AVAILABLE)
73 
74 template <unsigned int R>
75 inline uint32x4_t RotateLeft(const uint32x4_t& val)
76 {
77  return vorrq_u32(vshlq_n_u32(val, R), vshrq_n_u32(val, 32 - R));
78 }
79 
80 template <unsigned int R>
81 inline uint32x4_t RotateRight(const uint32x4_t& val)
82 {
83  return vorrq_u32(vshlq_n_u32(val, 32 - R), vshrq_n_u32(val, R));
84 }
85 
86 template <>
87 inline uint32x4_t RotateLeft<8>(const uint32x4_t& val)
88 {
89 #if defined(__aarch32__) || defined(__aarch64__)
90  const uint8_t maskb[16] = { 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 };
91  const uint8x16_t mask = vld1q_u8(maskb);
92 
93  return vreinterpretq_u32_u8(
94  vqtbl1q_u8(vreinterpretq_u8_u32(val), mask));
95 #else
96  // Fall back to the slower C++ rotation.
97  return vorrq_u32(vshlq_n_u32(val, 8),
98  vshrq_n_u32(val, 32 - 8));
99 #endif
100 }
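
// A worked example of the table-lookup rotate above (illustrative only):
// the 32-bit word 0x0A0B0C0D has little-endian byte layout {0D,0C,0B,0A}.
// Rotating it left by 8 gives 0x0B0C0D0A, with byte layout {0A,0D,0C,0B},
// so output byte i is input byte {3,0,1,2}[i]. Repeating the pattern for
// the other three words yields maskb, and one vqtbl1q_u8 replaces the
// shift/shift/or sequence.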
101 
102 template <>
103 inline uint32x4_t RotateLeft<16>(const uint32x4_t& val)
104 {
105 #if defined(__aarch32__) || defined(__aarch64__)
106  return vreinterpretq_u32_u16(
107  vrev32q_u16(vreinterpretq_u16_u32(val)));
108 #else
109  // Fall back to the slower C++ rotation.
110  return vorrq_u32(vshlq_n_u32(val, 16),
111  vshrq_n_u32(val, 32 - 16));
112 #endif
113 }
114 
115 template <>
116 inline uint32x4_t RotateRight<8>(const uint32x4_t& val)
117 {
118 #if defined(__aarch32__) || defined(__aarch64__)
119  const uint8_t maskb[16] = { 1,2,3,0, 5,6,7,4, 9,10,11,8, 13,14,15,12 };
120  const uint8x16_t mask = vld1q_u8(maskb);
121 
122  return vreinterpretq_u32_u8(
123  vqtbl1q_u8(vreinterpretq_u8_u32(val), mask));
124 #else
125  // Fall back to the slower C++ rotation.
126  return vorrq_u32(vshrq_n_u32(val, 8),
127  vshlq_n_u32(val, 32 - 8));
128 #endif
129 }
130 
131 template <>
132 inline uint32x4_t RotateRight<16>(const uint32x4_t& val)
133 {
134 #if defined(__aarch32__) || defined(__aarch64__)
135  return vreinterpretq_u32_u16(
136  vrev32q_u16(vreinterpretq_u16_u32(val)));
137 #else
138  // Fall back to the slower C++ rotation.
139  return vorrq_u32(vshrq_n_u32(val, 16),
140  vshlq_n_u32(val, 32 - 16));
141 #endif
142 }
143 
144 // ChaCha's use of x86 shuffle is really a 4, 8, or 12 byte
145 // rotation on the 128-bit vector word:
146 // * [3,2,1,0] => [0,3,2,1] is Extract<1>(x)
147 // * [3,2,1,0] => [1,0,3,2] is Extract<2>(x)
148 // * [3,2,1,0] => [2,1,0,3] is Extract<3>(x)
149 template <unsigned int S>
150 inline uint32x4_t Extract(const uint32x4_t& val)
151 {
152  return vextq_u32(val, val, S);
153 }
154 
155 // Helper to perform 64-bit additions across pairs of 32-bit elements
156 inline uint32x4_t Add64(const uint32x4_t& a, const uint32x4_t& b)
157 {
158  return vreinterpretq_u32_u64(
159  vaddq_u64(
160  vreinterpretq_u64_u32(a),
161  vreinterpretq_u64_u32(b)));
162 }
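
// Add64 exists because the fourth row of the ChaCha state carries a 64-bit
// block counter in words 12-13, followed by the 64-bit IV (assumed layout
// for this sketch). Adding the counter vectors {1,0,0,0}, {2,0,0,0} and
// {3,0,0,0} with a 64-bit element add lets the carry propagate from word 12
// into word 13, for example:
//
//   // state3 = {0xFFFFFFFF, 0, iv0, iv1}
//   // Add64(state3, {1,0,0,0}) -> {0x00000000, 1, iv0, iv1}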
163 
164 #endif // CRYPTOPP_ARM_NEON_AVAILABLE
165 
166 // ***************************** SSE2 ***************************** //
167 
168 #if (CRYPTOPP_SSE2_INTRIN_AVAILABLE)
169 
170 template <unsigned int R>
171 inline __m128i RotateLeft(const __m128i val)
172 {
173 #ifdef __XOP__
174  return _mm_roti_epi32(val, R);
175 #else
176  return _mm_or_si128(_mm_slli_epi32(val, R), _mm_srli_epi32(val, 32-R));
177 #endif
178 }
179 
180 template <>
181 inline __m128i RotateLeft<8>(const __m128i val)
182 {
183 #if defined(__XOP__)
184  return _mm_roti_epi32(val, 8);
185 #elif defined(__SSSE3__)
186  const __m128i mask = _mm_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3);
187  return _mm_shuffle_epi8(val, mask);
188 #else
189  return _mm_or_si128(_mm_slli_epi32(val, 8), _mm_srli_epi32(val, 32-8));
190 #endif
191 }
192 
193 template <>
194 inline __m128i RotateLeft<16>(const __m128i val)
195 {
196 #if defined(__XOP__)
197  return _mm_roti_epi32(val, 16);
198 #elif defined(__SSSE3__)
199  const __m128i mask = _mm_set_epi8(13,12,15,14, 9,8,11,10, 5,4,7,6, 1,0,3,2);
200  return _mm_shuffle_epi8(val, mask);
201 #else
202  return _mm_or_si128(_mm_slli_epi32(val, 16), _mm_srli_epi32(val, 32-16));
203 #endif
204 }
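
// Note on the SSSE3 masks above: _mm_set_epi8 takes its arguments from the
// most significant byte down to the least, so the RotateLeft<8> mask written
// as (14,13,12,15, ..., 2,1,0,3) is byte-for-byte the same shuffle as the
// NEON maskb {3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14}. _mm_shuffle_epi8
// selects output byte i from input byte mask[i], which rotates each 32-bit
// lane left by 8; the RotateLeft<16> mask rotates each lane by 16 in the
// same way.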
205 
206 #endif // CRYPTOPP_SSE2_INTRIN_AVAILABLE
207 
208 // **************************** Altivec **************************** //
209 
210 #if (CRYPTOPP_ALTIVEC_AVAILABLE)
211 
212 // ChaCha_OperateKeystream_POWER8 is optimized for POWER7 and above. However,
213 // Altivec is supported by using vec_ld and vec_st, and by using a composite
214 // VecAdd64 for the 64-bit element adds. vec_ld and vec_st add significant
215 // overhead when memory is not aligned. Despite the drawbacks, Altivec
216 // is profitable. The numbers for ChaCha8 are:
217 //
218 // PowerMac, C++, 2.0 GHz: 205 MB/s, 9.29 cpb
219 // PowerMac, Altivec, 2.0 GHz: 471 MB/s, 4.09 cpb
220 
221 using CryptoPP::uint8x16_p;
222 using CryptoPP::uint32x4_p;
223 using CryptoPP::VecLoad;
224 using CryptoPP::VecStore;
225 using CryptoPP::VecPermute;
226 
227 // Permutes bytes in packed 32-bit words to little endian.
228 // State is already in proper endian order. Input and
229 // output must be permuted during load and save.
230 inline uint32x4_p VecLoad32LE(const uint8_t src[16])
231 {
232 #if (CRYPTOPP_BIG_ENDIAN)
233  const uint8x16_p mask = {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12};
234  const uint32x4_p val = VecLoad(src);
235  return VecPermute(val, val, mask);
236 #else
237  return VecLoad(src);
238 #endif
239 }
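
// A worked example for the big-endian path (illustrative only): the input
// bytes {0D,0C,0B,0A} are the little-endian encoding of 0x0A0B0C0D. On a
// big-endian target VecLoad produces the lane value 0x0D0C0B0A, and the
// permute with {3,2,1,0, ...} reverses the bytes within each lane to give
// 0x0A0B0C0D, which is what the keystream arithmetic expects. VecStore32LE
// applies the same permutation on the way back out.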
240 
241 // Permutes bytes in packed 32-bit words to little endian.
242 // State is already in proper endian order. Input and
243 // output must be permuted during load and save.
244 inline void VecStore32LE(uint8_t dest[16], const uint32x4_p& val)
245 {
246 #if (CRYPTOPP_BIG_ENDIAN)
247  const uint8x16_p mask = {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12};
248  VecStore(VecPermute(val, val, mask), dest);
249 #else
250  VecStore(val, dest);
251 #endif
252 }
253 
254 // ChaCha's use of x86 shuffle is really a 4, 8, or 12 byte
255 // rotation on the 128-bit vector word:
256 // * [3,2,1,0] => [0,3,2,1] is Shuffle<1>(x)
257 // * [3,2,1,0] => [1,0,3,2] is Shuffle<2>(x)
258 // * [3,2,1,0] => [2,1,0,3] is Shuffle<3>(x)
259 template <unsigned int S>
260 inline uint32x4_p Shuffle(const uint32x4_p& val)
261 {
262  CRYPTOPP_ASSERT(0);
263  return val;
264 }
265 
266 template <>
267 inline uint32x4_p Shuffle<1>(const uint32x4_p& val)
268 {
269  const uint8x16_p mask = {4,5,6,7, 8,9,10,11, 12,13,14,15, 0,1,2,3};
270  return VecPermute(val, val, mask);
271 }
272 
273 template <>
274 inline uint32x4_p Shuffle<2>(const uint32x4_p& val)
275 {
276  const uint8x16_p mask = {8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7};
277  return VecPermute(val, val, mask);
278 }
279 
280 template <>
281 inline uint32x4_p Shuffle<3>(const uint32x4_p& val)
282 {
283  const uint8x16_p mask = {12,13,14,15, 0,1,2,3, 4,5,6,7, 8,9,10,11};
284  return VecPermute(val, val, mask);
285 }
286 
287 #endif // CRYPTOPP_ALTIVEC_AVAILABLE
288 
289 ANONYMOUS_NAMESPACE_END
290 
291 NAMESPACE_BEGIN(CryptoPP)
292 
293 // ***************************** NEON ***************************** //
294 
295 #if (CRYPTOPP_ARM_NEON_AVAILABLE)
296 
297 void ChaCha_OperateKeystream_NEON(const word32 *state, const byte* input, byte *output, unsigned int rounds)
298 {
299  const uint32x4_t state0 = vld1q_u32(state + 0*4);
300  const uint32x4_t state1 = vld1q_u32(state + 1*4);
301  const uint32x4_t state2 = vld1q_u32(state + 2*4);
302  const uint32x4_t state3 = vld1q_u32(state + 3*4);
303 
304  const unsigned int w[] = {1,0,0,0, 2,0,0,0, 3,0,0,0};
305  const uint32x4_t CTRS[3] = {
306  vld1q_u32(w+0), vld1q_u32(w+4), vld1q_u32(w+8)
307  };
308 
309  uint32x4_t r0_0 = state0;
310  uint32x4_t r0_1 = state1;
311  uint32x4_t r0_2 = state2;
312  uint32x4_t r0_3 = state3;
313 
314  uint32x4_t r1_0 = state0;
315  uint32x4_t r1_1 = state1;
316  uint32x4_t r1_2 = state2;
317  uint32x4_t r1_3 = Add64(r0_3, CTRS[0]);
318 
319  uint32x4_t r2_0 = state0;
320  uint32x4_t r2_1 = state1;
321  uint32x4_t r2_2 = state2;
322  uint32x4_t r2_3 = Add64(r0_3, CTRS[1]);
323 
324  uint32x4_t r3_0 = state0;
325  uint32x4_t r3_1 = state1;
326  uint32x4_t r3_2 = state2;
327  uint32x4_t r3_3 = Add64(r0_3, CTRS[2]);
328 
329  for (int i = static_cast<int>(rounds); i > 0; i -= 2)
330  {
331  r0_0 = vaddq_u32(r0_0, r0_1);
332  r1_0 = vaddq_u32(r1_0, r1_1);
333  r2_0 = vaddq_u32(r2_0, r2_1);
334  r3_0 = vaddq_u32(r3_0, r3_1);
335 
336  r0_3 = veorq_u32(r0_3, r0_0);
337  r1_3 = veorq_u32(r1_3, r1_0);
338  r2_3 = veorq_u32(r2_3, r2_0);
339  r3_3 = veorq_u32(r3_3, r3_0);
340 
341  r0_3 = RotateLeft<16>(r0_3);
342  r1_3 = RotateLeft<16>(r1_3);
343  r2_3 = RotateLeft<16>(r2_3);
344  r3_3 = RotateLeft<16>(r3_3);
345 
346  r0_2 = vaddq_u32(r0_2, r0_3);
347  r1_2 = vaddq_u32(r1_2, r1_3);
348  r2_2 = vaddq_u32(r2_2, r2_3);
349  r3_2 = vaddq_u32(r3_2, r3_3);
350 
351  r0_1 = veorq_u32(r0_1, r0_2);
352  r1_1 = veorq_u32(r1_1, r1_2);
353  r2_1 = veorq_u32(r2_1, r2_2);
354  r3_1 = veorq_u32(r3_1, r3_2);
355 
356  r0_1 = RotateLeft<12>(r0_1);
357  r1_1 = RotateLeft<12>(r1_1);
358  r2_1 = RotateLeft<12>(r2_1);
359  r3_1 = RotateLeft<12>(r3_1);
360 
361  r0_0 = vaddq_u32(r0_0, r0_1);
362  r1_0 = vaddq_u32(r1_0, r1_1);
363  r2_0 = vaddq_u32(r2_0, r2_1);
364  r3_0 = vaddq_u32(r3_0, r3_1);
365 
366  r0_3 = veorq_u32(r0_3, r0_0);
367  r1_3 = veorq_u32(r1_3, r1_0);
368  r2_3 = veorq_u32(r2_3, r2_0);
369  r3_3 = veorq_u32(r3_3, r3_0);
370 
371  r0_3 = RotateLeft<8>(r0_3);
372  r1_3 = RotateLeft<8>(r1_3);
373  r2_3 = RotateLeft<8>(r2_3);
374  r3_3 = RotateLeft<8>(r3_3);
375 
376  r0_2 = vaddq_u32(r0_2, r0_3);
377  r1_2 = vaddq_u32(r1_2, r1_3);
378  r2_2 = vaddq_u32(r2_2, r2_3);
379  r3_2 = vaddq_u32(r3_2, r3_3);
380 
381  r0_1 = veorq_u32(r0_1, r0_2);
382  r1_1 = veorq_u32(r1_1, r1_2);
383  r2_1 = veorq_u32(r2_1, r2_2);
384  r3_1 = veorq_u32(r3_1, r3_2);
385 
386  r0_1 = RotateLeft<7>(r0_1);
387  r1_1 = RotateLeft<7>(r1_1);
388  r2_1 = RotateLeft<7>(r2_1);
389  r3_1 = RotateLeft<7>(r3_1);
390 
391  r0_1 = Extract<1>(r0_1);
392  r0_2 = Extract<2>(r0_2);
393  r0_3 = Extract<3>(r0_3);
394 
395  r1_1 = Extract<1>(r1_1);
396  r1_2 = Extract<2>(r1_2);
397  r1_3 = Extract<3>(r1_3);
398 
399  r2_1 = Extract<1>(r2_1);
400  r2_2 = Extract<2>(r2_2);
401  r2_3 = Extract<3>(r2_3);
402 
403  r3_1 = Extract<1>(r3_1);
404  r3_2 = Extract<2>(r3_2);
405  r3_3 = Extract<3>(r3_3);
406 
407  r0_0 = vaddq_u32(r0_0, r0_1);
408  r1_0 = vaddq_u32(r1_0, r1_1);
409  r2_0 = vaddq_u32(r2_0, r2_1);
410  r3_0 = vaddq_u32(r3_0, r3_1);
411 
412  r0_3 = veorq_u32(r0_3, r0_0);
413  r1_3 = veorq_u32(r1_3, r1_0);
414  r2_3 = veorq_u32(r2_3, r2_0);
415  r3_3 = veorq_u32(r3_3, r3_0);
416 
417  r0_3 = RotateLeft<16>(r0_3);
418  r1_3 = RotateLeft<16>(r1_3);
419  r2_3 = RotateLeft<16>(r2_3);
420  r3_3 = RotateLeft<16>(r3_3);
421 
422  r0_2 = vaddq_u32(r0_2, r0_3);
423  r1_2 = vaddq_u32(r1_2, r1_3);
424  r2_2 = vaddq_u32(r2_2, r2_3);
425  r3_2 = vaddq_u32(r3_2, r3_3);
426 
427  r0_1 = veorq_u32(r0_1, r0_2);
428  r1_1 = veorq_u32(r1_1, r1_2);
429  r2_1 = veorq_u32(r2_1, r2_2);
430  r3_1 = veorq_u32(r3_1, r3_2);
431 
432  r0_1 = RotateLeft<12>(r0_1);
433  r1_1 = RotateLeft<12>(r1_1);
434  r2_1 = RotateLeft<12>(r2_1);
435  r3_1 = RotateLeft<12>(r3_1);
436 
437  r0_0 = vaddq_u32(r0_0, r0_1);
438  r1_0 = vaddq_u32(r1_0, r1_1);
439  r2_0 = vaddq_u32(r2_0, r2_1);
440  r3_0 = vaddq_u32(r3_0, r3_1);
441 
442  r0_3 = veorq_u32(r0_3, r0_0);
443  r1_3 = veorq_u32(r1_3, r1_0);
444  r2_3 = veorq_u32(r2_3, r2_0);
445  r3_3 = veorq_u32(r3_3, r3_0);
446 
447  r0_3 = RotateLeft<8>(r0_3);
448  r1_3 = RotateLeft<8>(r1_3);
449  r2_3 = RotateLeft<8>(r2_3);
450  r3_3 = RotateLeft<8>(r3_3);
451 
452  r0_2 = vaddq_u32(r0_2, r0_3);
453  r1_2 = vaddq_u32(r1_2, r1_3);
454  r2_2 = vaddq_u32(r2_2, r2_3);
455  r3_2 = vaddq_u32(r3_2, r3_3);
456 
457  r0_1 = veorq_u32(r0_1, r0_2);
458  r1_1 = veorq_u32(r1_1, r1_2);
459  r2_1 = veorq_u32(r2_1, r2_2);
460  r3_1 = veorq_u32(r3_1, r3_2);
461 
462  r0_1 = RotateLeft<7>(r0_1);
463  r1_1 = RotateLeft<7>(r1_1);
464  r2_1 = RotateLeft<7>(r2_1);
465  r3_1 = RotateLeft<7>(r3_1);
466 
467  r0_1 = Extract<3>(r0_1);
468  r0_2 = Extract<2>(r0_2);
469  r0_3 = Extract<1>(r0_3);
470 
471  r1_1 = Extract<3>(r1_1);
472  r1_2 = Extract<2>(r1_2);
473  r1_3 = Extract<1>(r1_3);
474 
475  r2_1 = Extract<3>(r2_1);
476  r2_2 = Extract<2>(r2_2);
477  r2_3 = Extract<1>(r2_3);
478 
479  r3_1 = Extract<3>(r3_1);
480  r3_2 = Extract<2>(r3_2);
481  r3_3 = Extract<1>(r3_3);
482  }
483 
484  r0_0 = vaddq_u32(r0_0, state0);
485  r0_1 = vaddq_u32(r0_1, state1);
486  r0_2 = vaddq_u32(r0_2, state2);
487  r0_3 = vaddq_u32(r0_3, state3);
488 
489  r1_0 = vaddq_u32(r1_0, state0);
490  r1_1 = vaddq_u32(r1_1, state1);
491  r1_2 = vaddq_u32(r1_2, state2);
492  r1_3 = vaddq_u32(r1_3, state3);
493  r1_3 = Add64(r1_3, CTRS[0]);
494 
495  r2_0 = vaddq_u32(r2_0, state0);
496  r2_1 = vaddq_u32(r2_1, state1);
497  r2_2 = vaddq_u32(r2_2, state2);
498  r2_3 = vaddq_u32(r2_3, state3);
499  r2_3 = Add64(r2_3, CTRS[1]);
500 
501  r3_0 = vaddq_u32(r3_0, state0);
502  r3_1 = vaddq_u32(r3_1, state1);
503  r3_2 = vaddq_u32(r3_2, state2);
504  r3_3 = vaddq_u32(r3_3, state3);
505  r3_3 = Add64(r3_3, CTRS[2]);
506 
507  if (input)
508  {
509  r0_0 = veorq_u32(vreinterpretq_u32_u8(vld1q_u8(input + 0*16)), r0_0);
510  r0_1 = veorq_u32(vreinterpretq_u32_u8(vld1q_u8(input + 1*16)), r0_1);
511  r0_2 = veorq_u32(vreinterpretq_u32_u8(vld1q_u8(input + 2*16)), r0_2);
512  r0_3 = veorq_u32(vreinterpretq_u32_u8(vld1q_u8(input + 3*16)), r0_3);
513  }
514 
515  vst1q_u8(output + 0*16, vreinterpretq_u8_u32(r0_0));
516  vst1q_u8(output + 1*16, vreinterpretq_u8_u32(r0_1));
517  vst1q_u8(output + 2*16, vreinterpretq_u8_u32(r0_2));
518  vst1q_u8(output + 3*16, vreinterpretq_u8_u32(r0_3));
519 
520  if (input)
521  {
522  r1_0 = veorq_u32(vreinterpretq_u32_u8(vld1q_u8(input + 4*16)), r1_0);
523  r1_1 = veorq_u32(vreinterpretq_u32_u8(vld1q_u8(input + 5*16)), r1_1);
524  r1_2 = veorq_u32(vreinterpretq_u32_u8(vld1q_u8(input + 6*16)), r1_2);
525  r1_3 = veorq_u32(vreinterpretq_u32_u8(vld1q_u8(input + 7*16)), r1_3);
526  }
527 
528  vst1q_u8(output + 4*16, vreinterpretq_u8_u32(r1_0));
529  vst1q_u8(output + 5*16, vreinterpretq_u8_u32(r1_1));
530  vst1q_u8(output + 6*16, vreinterpretq_u8_u32(r1_2));
531  vst1q_u8(output + 7*16, vreinterpretq_u8_u32(r1_3));
532 
533  if (input)
534  {
535  r2_0 = veorq_u32(vreinterpretq_u32_u8(vld1q_u8(input + 8*16)), r2_0);
536  r2_1 = veorq_u32(vreinterpretq_u32_u8(vld1q_u8(input + 9*16)), r2_1);
537  r2_2 = veorq_u32(vreinterpretq_u32_u8(vld1q_u8(input + 10*16)), r2_2);
538  r2_3 = veorq_u32(vreinterpretq_u32_u8(vld1q_u8(input + 11*16)), r2_3);
539  }
540 
541  vst1q_u8(output + 8*16, vreinterpretq_u8_u32(r2_0));
542  vst1q_u8(output + 9*16, vreinterpretq_u8_u32(r2_1));
543  vst1q_u8(output + 10*16, vreinterpretq_u8_u32(r2_2));
544  vst1q_u8(output + 11*16, vreinterpretq_u8_u32(r2_3));
545 
546  if (input)
547  {
548  r3_0 = veorq_u32(vreinterpretq_u32_u8(vld1q_u8(input + 12*16)), r3_0);
549  r3_1 = veorq_u32(vreinterpretq_u32_u8(vld1q_u8(input + 13*16)), r3_1);
550  r3_2 = veorq_u32(vreinterpretq_u32_u8(vld1q_u8(input + 14*16)), r3_2);
551  r3_3 = veorq_u32(vreinterpretq_u32_u8(vld1q_u8(input + 15*16)), r3_3);
552  }
553 
554  vst1q_u8(output + 12*16, vreinterpretq_u8_u32(r3_0));
555  vst1q_u8(output + 13*16, vreinterpretq_u8_u32(r3_1));
556  vst1q_u8(output + 14*16, vreinterpretq_u8_u32(r3_2));
557  vst1q_u8(output + 15*16, vreinterpretq_u8_u32(r3_3));
558 }
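
// A minimal usage sketch (the library normally reaches this function through
// the dispatch in chacha.cpp rather than calling it directly): one call reads
// a 16-word ChaCha state, writes 4 x 64 = 256 bytes of keystream for counters
// n .. n+3, and XORs 256 bytes of input when a non-NULL input pointer is
// supplied. The state is not modified, so the caller advances the 64-bit
// counter in words 12-13 by 4 afterwards.
//
//   // word32 state[16] = { /* constants, key, counter, iv */ };
//   // byte keystream[256];
//   // ChaCha_OperateKeystream_NEON(state, NULLPTR, keystream, 20);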
559 
560 #endif // CRYPTOPP_ARM_NEON_AVAILABLE
561 
562 // ***************************** SSE2 ***************************** //
563 
564 #if (CRYPTOPP_SSE2_INTRIN_AVAILABLE)
565 
566 void ChaCha_OperateKeystream_SSE2(const word32 *state, const byte* input, byte *output, unsigned int rounds)
567 {
568  const __m128i* state_mm = reinterpret_cast<const __m128i*>(state);
569  const __m128i* input_mm = reinterpret_cast<const __m128i*>(input);
570  __m128i* output_mm = reinterpret_cast<__m128i*>(output);
571 
572  const __m128i state0 = _mm_load_si128(state_mm + 0);
573  const __m128i state1 = _mm_load_si128(state_mm + 1);
574  const __m128i state2 = _mm_load_si128(state_mm + 2);
575  const __m128i state3 = _mm_load_si128(state_mm + 3);
576 
577  __m128i r0_0 = state0;
578  __m128i r0_1 = state1;
579  __m128i r0_2 = state2;
580  __m128i r0_3 = state3;
581 
582  __m128i r1_0 = state0;
583  __m128i r1_1 = state1;
584  __m128i r1_2 = state2;
585  __m128i r1_3 = _mm_add_epi64(r0_3, _mm_set_epi32(0, 0, 0, 1));
586 
587  __m128i r2_0 = state0;
588  __m128i r2_1 = state1;
589  __m128i r2_2 = state2;
590  __m128i r2_3 = _mm_add_epi64(r0_3, _mm_set_epi32(0, 0, 0, 2));
591 
592  __m128i r3_0 = state0;
593  __m128i r3_1 = state1;
594  __m128i r3_2 = state2;
595  __m128i r3_3 = _mm_add_epi64(r0_3, _mm_set_epi32(0, 0, 0, 3));
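
 // _mm_set_epi32 takes arguments from the most significant element down to
 // the least, so _mm_set_epi32(0, 0, 0, N) is the lane pattern {N,0,0,0} and
 // _mm_add_epi64 adds N to the low 64-bit counter lane of the fourth state
 // row, mirroring Add64 and CTRS in the NEON code above.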
596 
597  for (int i = static_cast<int>(rounds); i > 0; i -= 2)
598  {
599  r0_0 = _mm_add_epi32(r0_0, r0_1);
600  r1_0 = _mm_add_epi32(r1_0, r1_1);
601  r2_0 = _mm_add_epi32(r2_0, r2_1);
602  r3_0 = _mm_add_epi32(r3_0, r3_1);
603 
604  r0_3 = _mm_xor_si128(r0_3, r0_0);
605  r1_3 = _mm_xor_si128(r1_3, r1_0);
606  r2_3 = _mm_xor_si128(r2_3, r2_0);
607  r3_3 = _mm_xor_si128(r3_3, r3_0);
608 
609  r0_3 = RotateLeft<16>(r0_3);
610  r1_3 = RotateLeft<16>(r1_3);
611  r2_3 = RotateLeft<16>(r2_3);
612  r3_3 = RotateLeft<16>(r3_3);
613 
614  r0_2 = _mm_add_epi32(r0_2, r0_3);
615  r1_2 = _mm_add_epi32(r1_2, r1_3);
616  r2_2 = _mm_add_epi32(r2_2, r2_3);
617  r3_2 = _mm_add_epi32(r3_2, r3_3);
618 
619  r0_1 = _mm_xor_si128(r0_1, r0_2);
620  r1_1 = _mm_xor_si128(r1_1, r1_2);
621  r2_1 = _mm_xor_si128(r2_1, r2_2);
622  r3_1 = _mm_xor_si128(r3_1, r3_2);
623 
624  r0_1 = RotateLeft<12>(r0_1);
625  r1_1 = RotateLeft<12>(r1_1);
626  r2_1 = RotateLeft<12>(r2_1);
627  r3_1 = RotateLeft<12>(r3_1);
628 
629  r0_0 = _mm_add_epi32(r0_0, r0_1);
630  r1_0 = _mm_add_epi32(r1_0, r1_1);
631  r2_0 = _mm_add_epi32(r2_0, r2_1);
632  r3_0 = _mm_add_epi32(r3_0, r3_1);
633 
634  r0_3 = _mm_xor_si128(r0_3, r0_0);
635  r1_3 = _mm_xor_si128(r1_3, r1_0);
636  r2_3 = _mm_xor_si128(r2_3, r2_0);
637  r3_3 = _mm_xor_si128(r3_3, r3_0);
638 
639  r0_3 = RotateLeft<8>(r0_3);
640  r1_3 = RotateLeft<8>(r1_3);
641  r2_3 = RotateLeft<8>(r2_3);
642  r3_3 = RotateLeft<8>(r3_3);
643 
644  r0_2 = _mm_add_epi32(r0_2, r0_3);
645  r1_2 = _mm_add_epi32(r1_2, r1_3);
646  r2_2 = _mm_add_epi32(r2_2, r2_3);
647  r3_2 = _mm_add_epi32(r3_2, r3_3);
648 
649  r0_1 = _mm_xor_si128(r0_1, r0_2);
650  r1_1 = _mm_xor_si128(r1_1, r1_2);
651  r2_1 = _mm_xor_si128(r2_1, r2_2);
652  r3_1 = _mm_xor_si128(r3_1, r3_2);
653 
654  r0_1 = RotateLeft<7>(r0_1);
655  r1_1 = RotateLeft<7>(r1_1);
656  r2_1 = RotateLeft<7>(r2_1);
657  r3_1 = RotateLeft<7>(r3_1);
658 
659  r0_1 = _mm_shuffle_epi32(r0_1, _MM_SHUFFLE(0, 3, 2, 1));
660  r0_2 = _mm_shuffle_epi32(r0_2, _MM_SHUFFLE(1, 0, 3, 2));
661  r0_3 = _mm_shuffle_epi32(r0_3, _MM_SHUFFLE(2, 1, 0, 3));
662 
663  r1_1 = _mm_shuffle_epi32(r1_1, _MM_SHUFFLE(0, 3, 2, 1));
664  r1_2 = _mm_shuffle_epi32(r1_2, _MM_SHUFFLE(1, 0, 3, 2));
665  r1_3 = _mm_shuffle_epi32(r1_3, _MM_SHUFFLE(2, 1, 0, 3));
666 
667  r2_1 = _mm_shuffle_epi32(r2_1, _MM_SHUFFLE(0, 3, 2, 1));
668  r2_2 = _mm_shuffle_epi32(r2_2, _MM_SHUFFLE(1, 0, 3, 2));
669  r2_3 = _mm_shuffle_epi32(r2_3, _MM_SHUFFLE(2, 1, 0, 3));
670 
671  r3_1 = _mm_shuffle_epi32(r3_1, _MM_SHUFFLE(0, 3, 2, 1));
672  r3_2 = _mm_shuffle_epi32(r3_2, _MM_SHUFFLE(1, 0, 3, 2));
673  r3_3 = _mm_shuffle_epi32(r3_3, _MM_SHUFFLE(2, 1, 0, 3));
674 
675  r0_0 = _mm_add_epi32(r0_0, r0_1);
676  r1_0 = _mm_add_epi32(r1_0, r1_1);
677  r2_0 = _mm_add_epi32(r2_0, r2_1);
678  r3_0 = _mm_add_epi32(r3_0, r3_1);
679 
680  r0_3 = _mm_xor_si128(r0_3, r0_0);
681  r1_3 = _mm_xor_si128(r1_3, r1_0);
682  r2_3 = _mm_xor_si128(r2_3, r2_0);
683  r3_3 = _mm_xor_si128(r3_3, r3_0);
684 
685  r0_3 = RotateLeft<16>(r0_3);
686  r1_3 = RotateLeft<16>(r1_3);
687  r2_3 = RotateLeft<16>(r2_3);
688  r3_3 = RotateLeft<16>(r3_3);
689 
690  r0_2 = _mm_add_epi32(r0_2, r0_3);
691  r1_2 = _mm_add_epi32(r1_2, r1_3);
692  r2_2 = _mm_add_epi32(r2_2, r2_3);
693  r3_2 = _mm_add_epi32(r3_2, r3_3);
694 
695  r0_1 = _mm_xor_si128(r0_1, r0_2);
696  r1_1 = _mm_xor_si128(r1_1, r1_2);
697  r2_1 = _mm_xor_si128(r2_1, r2_2);
698  r3_1 = _mm_xor_si128(r3_1, r3_2);
699 
700  r0_1 = RotateLeft<12>(r0_1);
701  r1_1 = RotateLeft<12>(r1_1);
702  r2_1 = RotateLeft<12>(r2_1);
703  r3_1 = RotateLeft<12>(r3_1);
704 
705  r0_0 = _mm_add_epi32(r0_0, r0_1);
706  r1_0 = _mm_add_epi32(r1_0, r1_1);
707  r2_0 = _mm_add_epi32(r2_0, r2_1);
708  r3_0 = _mm_add_epi32(r3_0, r3_1);
709 
710  r0_3 = _mm_xor_si128(r0_3, r0_0);
711  r1_3 = _mm_xor_si128(r1_3, r1_0);
712  r2_3 = _mm_xor_si128(r2_3, r2_0);
713  r3_3 = _mm_xor_si128(r3_3, r3_0);
714 
715  r0_3 = RotateLeft<8>(r0_3);
716  r1_3 = RotateLeft<8>(r1_3);
717  r2_3 = RotateLeft<8>(r2_3);
718  r3_3 = RotateLeft<8>(r3_3);
719 
720  r0_2 = _mm_add_epi32(r0_2, r0_3);
721  r1_2 = _mm_add_epi32(r1_2, r1_3);
722  r2_2 = _mm_add_epi32(r2_2, r2_3);
723  r3_2 = _mm_add_epi32(r3_2, r3_3);
724 
725  r0_1 = _mm_xor_si128(r0_1, r0_2);
726  r1_1 = _mm_xor_si128(r1_1, r1_2);
727  r2_1 = _mm_xor_si128(r2_1, r2_2);
728  r3_1 = _mm_xor_si128(r3_1, r3_2);
729 
730  r0_1 = RotateLeft<7>(r0_1);
731  r1_1 = RotateLeft<7>(r1_1);
732  r2_1 = RotateLeft<7>(r2_1);
733  r3_1 = RotateLeft<7>(r3_1);
734 
735  r0_1 = _mm_shuffle_epi32(r0_1, _MM_SHUFFLE(2, 1, 0, 3));
736  r0_2 = _mm_shuffle_epi32(r0_2, _MM_SHUFFLE(1, 0, 3, 2));
737  r0_3 = _mm_shuffle_epi32(r0_3, _MM_SHUFFLE(0, 3, 2, 1));
738 
739  r1_1 = _mm_shuffle_epi32(r1_1, _MM_SHUFFLE(2, 1, 0, 3));
740  r1_2 = _mm_shuffle_epi32(r1_2, _MM_SHUFFLE(1, 0, 3, 2));
741  r1_3 = _mm_shuffle_epi32(r1_3, _MM_SHUFFLE(0, 3, 2, 1));
742 
743  r2_1 = _mm_shuffle_epi32(r2_1, _MM_SHUFFLE(2, 1, 0, 3));
744  r2_2 = _mm_shuffle_epi32(r2_2, _MM_SHUFFLE(1, 0, 3, 2));
745  r2_3 = _mm_shuffle_epi32(r2_3, _MM_SHUFFLE(0, 3, 2, 1));
746 
747  r3_1 = _mm_shuffle_epi32(r3_1, _MM_SHUFFLE(2, 1, 0, 3));
748  r3_2 = _mm_shuffle_epi32(r3_2, _MM_SHUFFLE(1, 0, 3, 2));
749  r3_3 = _mm_shuffle_epi32(r3_3, _MM_SHUFFLE(0, 3, 2, 1));
750  }
751 
752  r0_0 = _mm_add_epi32(r0_0, state0);
753  r0_1 = _mm_add_epi32(r0_1, state1);
754  r0_2 = _mm_add_epi32(r0_2, state2);
755  r0_3 = _mm_add_epi32(r0_3, state3);
756 
757  r1_0 = _mm_add_epi32(r1_0, state0);
758  r1_1 = _mm_add_epi32(r1_1, state1);
759  r1_2 = _mm_add_epi32(r1_2, state2);
760  r1_3 = _mm_add_epi32(r1_3, state3);
761  r1_3 = _mm_add_epi64(r1_3, _mm_set_epi32(0, 0, 0, 1));
762 
763  r2_0 = _mm_add_epi32(r2_0, state0);
764  r2_1 = _mm_add_epi32(r2_1, state1);
765  r2_2 = _mm_add_epi32(r2_2, state2);
766  r2_3 = _mm_add_epi32(r2_3, state3);
767  r2_3 = _mm_add_epi64(r2_3, _mm_set_epi32(0, 0, 0, 2));
768 
769  r3_0 = _mm_add_epi32(r3_0, state0);
770  r3_1 = _mm_add_epi32(r3_1, state1);
771  r3_2 = _mm_add_epi32(r3_2, state2);
772  r3_3 = _mm_add_epi32(r3_3, state3);
773  r3_3 = _mm_add_epi64(r3_3, _mm_set_epi32(0, 0, 0, 3));
774 
775  if (input_mm)
776  {
777  r0_0 = _mm_xor_si128(_mm_loadu_si128(input_mm + 0), r0_0);
778  r0_1 = _mm_xor_si128(_mm_loadu_si128(input_mm + 1), r0_1);
779  r0_2 = _mm_xor_si128(_mm_loadu_si128(input_mm + 2), r0_2);
780  r0_3 = _mm_xor_si128(_mm_loadu_si128(input_mm + 3), r0_3);
781  }
782 
783  _mm_storeu_si128(output_mm + 0, r0_0);
784  _mm_storeu_si128(output_mm + 1, r0_1);
785  _mm_storeu_si128(output_mm + 2, r0_2);
786  _mm_storeu_si128(output_mm + 3, r0_3);
787 
788  if (input_mm)
789  {
790  r1_0 = _mm_xor_si128(_mm_loadu_si128(input_mm + 4), r1_0);
791  r1_1 = _mm_xor_si128(_mm_loadu_si128(input_mm + 5), r1_1);
792  r1_2 = _mm_xor_si128(_mm_loadu_si128(input_mm + 6), r1_2);
793  r1_3 = _mm_xor_si128(_mm_loadu_si128(input_mm + 7), r1_3);
794  }
795 
796  _mm_storeu_si128(output_mm + 4, r1_0);
797  _mm_storeu_si128(output_mm + 5, r1_1);
798  _mm_storeu_si128(output_mm + 6, r1_2);
799  _mm_storeu_si128(output_mm + 7, r1_3);
800 
801  if (input_mm)
802  {
803  r2_0 = _mm_xor_si128(_mm_loadu_si128(input_mm + 8), r2_0);
804  r2_1 = _mm_xor_si128(_mm_loadu_si128(input_mm + 9), r2_1);
805  r2_2 = _mm_xor_si128(_mm_loadu_si128(input_mm + 10), r2_2);
806  r2_3 = _mm_xor_si128(_mm_loadu_si128(input_mm + 11), r2_3);
807  }
808 
809  _mm_storeu_si128(output_mm + 8, r2_0);
810  _mm_storeu_si128(output_mm + 9, r2_1);
811  _mm_storeu_si128(output_mm + 10, r2_2);
812  _mm_storeu_si128(output_mm + 11, r2_3);
813 
814  if (input_mm)
815  {
816  r3_0 = _mm_xor_si128(_mm_loadu_si128(input_mm + 12), r3_0);
817  r3_1 = _mm_xor_si128(_mm_loadu_si128(input_mm + 13), r3_1);
818  r3_2 = _mm_xor_si128(_mm_loadu_si128(input_mm + 14), r3_2);
819  r3_3 = _mm_xor_si128(_mm_loadu_si128(input_mm + 15), r3_3);
820  }
821 
822  _mm_storeu_si128(output_mm + 12, r3_0);
823  _mm_storeu_si128(output_mm + 13, r3_1);
824  _mm_storeu_si128(output_mm + 14, r3_2);
825  _mm_storeu_si128(output_mm + 15, r3_3);
826 }
827 
828 #endif // CRYPTOPP_SSE2_INTRIN_AVAILABLE
829 
830 #if (CRYPTOPP_POWER8_AVAILABLE || CRYPTOPP_ALTIVEC_AVAILABLE)
831 
832 // ChaCha_OperateKeystream_CORE will use either POWER7 or ALTIVEC,
833 // depending on the flags used to compile this source file. The
834 // abstractions are handled in VecLoad, VecStore and friends. In
835 // the future we may provide both POWER7 and ALTIVEC at the same
836 // time to better support distros.
837 inline void ChaCha_OperateKeystream_CORE(const word32 *state, const byte* input, byte *output, unsigned int rounds)
838 {
839  const uint32x4_p state0 = VecLoad(state + 0*4);
840  const uint32x4_p state1 = VecLoad(state + 1*4);
841  const uint32x4_p state2 = VecLoad(state + 2*4);
842  const uint32x4_p state3 = VecLoad(state + 3*4);
843 
844  const uint32x4_p CTRS[3] = {
845  {1,0,0,0}, {2,0,0,0}, {3,0,0,0}
846  };
847 
848  uint32x4_p r0_0 = state0;
849  uint32x4_p r0_1 = state1;
850  uint32x4_p r0_2 = state2;
851  uint32x4_p r0_3 = state3;
852 
853  uint32x4_p r1_0 = state0;
854  uint32x4_p r1_1 = state1;
855  uint32x4_p r1_2 = state2;
856  uint32x4_p r1_3 = VecAdd64(r0_3, CTRS[0]);
857 
858  uint32x4_p r2_0 = state0;
859  uint32x4_p r2_1 = state1;
860  uint32x4_p r2_2 = state2;
861  uint32x4_p r2_3 = VecAdd64(r0_3, CTRS[1]);
862 
863  uint32x4_p r3_0 = state0;
864  uint32x4_p r3_1 = state1;
865  uint32x4_p r3_2 = state2;
866  uint32x4_p r3_3 = VecAdd64(r0_3, CTRS[2]);
867 
868  for (int i = static_cast<int>(rounds); i > 0; i -= 2)
869  {
870  r0_0 = VecAdd(r0_0, r0_1);
871  r1_0 = VecAdd(r1_0, r1_1);
872  r2_0 = VecAdd(r2_0, r2_1);
873  r3_0 = VecAdd(r3_0, r3_1);
874 
875  r0_3 = VecXor(r0_3, r0_0);
876  r1_3 = VecXor(r1_3, r1_0);
877  r2_3 = VecXor(r2_3, r2_0);
878  r3_3 = VecXor(r3_3, r3_0);
879 
880  r0_3 = VecRotateLeft<16>(r0_3);
881  r1_3 = VecRotateLeft<16>(r1_3);
882  r2_3 = VecRotateLeft<16>(r2_3);
883  r3_3 = VecRotateLeft<16>(r3_3);
884 
885  r0_2 = VecAdd(r0_2, r0_3);
886  r1_2 = VecAdd(r1_2, r1_3);
887  r2_2 = VecAdd(r2_2, r2_3);
888  r3_2 = VecAdd(r3_2, r3_3);
889 
890  r0_1 = VecXor(r0_1, r0_2);
891  r1_1 = VecXor(r1_1, r1_2);
892  r2_1 = VecXor(r2_1, r2_2);
893  r3_1 = VecXor(r3_1, r3_2);
894 
895  r0_1 = VecRotateLeft<12>(r0_1);
896  r1_1 = VecRotateLeft<12>(r1_1);
897  r2_1 = VecRotateLeft<12>(r2_1);
898  r3_1 = VecRotateLeft<12>(r3_1);
899 
900  r0_0 = VecAdd(r0_0, r0_1);
901  r1_0 = VecAdd(r1_0, r1_1);
902  r2_0 = VecAdd(r2_0, r2_1);
903  r3_0 = VecAdd(r3_0, r3_1);
904 
905  r0_3 = VecXor(r0_3, r0_0);
906  r1_3 = VecXor(r1_3, r1_0);
907  r2_3 = VecXor(r2_3, r2_0);
908  r3_3 = VecXor(r3_3, r3_0);
909 
910  r0_3 = VecRotateLeft<8>(r0_3);
911  r1_3 = VecRotateLeft<8>(r1_3);
912  r2_3 = VecRotateLeft<8>(r2_3);
913  r3_3 = VecRotateLeft<8>(r3_3);
914 
915  r0_2 = VecAdd(r0_2, r0_3);
916  r1_2 = VecAdd(r1_2, r1_3);
917  r2_2 = VecAdd(r2_2, r2_3);
918  r3_2 = VecAdd(r3_2, r3_3);
919 
920  r0_1 = VecXor(r0_1, r0_2);
921  r1_1 = VecXor(r1_1, r1_2);
922  r2_1 = VecXor(r2_1, r2_2);
923  r3_1 = VecXor(r3_1, r3_2);
924 
925  r0_1 = VecRotateLeft<7>(r0_1);
926  r1_1 = VecRotateLeft<7>(r1_1);
927  r2_1 = VecRotateLeft<7>(r2_1);
928  r3_1 = VecRotateLeft<7>(r3_1);
929 
930  r0_1 = Shuffle<1>(r0_1);
931  r0_2 = Shuffle<2>(r0_2);
932  r0_3 = Shuffle<3>(r0_3);
933 
934  r1_1 = Shuffle<1>(r1_1);
935  r1_2 = Shuffle<2>(r1_2);
936  r1_3 = Shuffle<3>(r1_3);
937 
938  r2_1 = Shuffle<1>(r2_1);
939  r2_2 = Shuffle<2>(r2_2);
940  r2_3 = Shuffle<3>(r2_3);
941 
942  r3_1 = Shuffle<1>(r3_1);
943  r3_2 = Shuffle<2>(r3_2);
944  r3_3 = Shuffle<3>(r3_3);
945 
946  r0_0 = VecAdd(r0_0, r0_1);
947  r1_0 = VecAdd(r1_0, r1_1);
948  r2_0 = VecAdd(r2_0, r2_1);
949  r3_0 = VecAdd(r3_0, r3_1);
950 
951  r0_3 = VecXor(r0_3, r0_0);
952  r1_3 = VecXor(r1_3, r1_0);
953  r2_3 = VecXor(r2_3, r2_0);
954  r3_3 = VecXor(r3_3, r3_0);
955 
956  r0_3 = VecRotateLeft<16>(r0_3);
957  r1_3 = VecRotateLeft<16>(r1_3);
958  r2_3 = VecRotateLeft<16>(r2_3);
959  r3_3 = VecRotateLeft<16>(r3_3);
960 
961  r0_2 = VecAdd(r0_2, r0_3);
962  r1_2 = VecAdd(r1_2, r1_3);
963  r2_2 = VecAdd(r2_2, r2_3);
964  r3_2 = VecAdd(r3_2, r3_3);
965 
966  r0_1 = VecXor(r0_1, r0_2);
967  r1_1 = VecXor(r1_1, r1_2);
968  r2_1 = VecXor(r2_1, r2_2);
969  r3_1 = VecXor(r3_1, r3_2);
970 
971  r0_1 = VecRotateLeft<12>(r0_1);
972  r1_1 = VecRotateLeft<12>(r1_1);
973  r2_1 = VecRotateLeft<12>(r2_1);
974  r3_1 = VecRotateLeft<12>(r3_1);
975 
976  r0_0 = VecAdd(r0_0, r0_1);
977  r1_0 = VecAdd(r1_0, r1_1);
978  r2_0 = VecAdd(r2_0, r2_1);
979  r3_0 = VecAdd(r3_0, r3_1);
980 
981  r0_3 = VecXor(r0_3, r0_0);
982  r1_3 = VecXor(r1_3, r1_0);
983  r2_3 = VecXor(r2_3, r2_0);
984  r3_3 = VecXor(r3_3, r3_0);
985 
986  r0_3 = VecRotateLeft<8>(r0_3);
987  r1_3 = VecRotateLeft<8>(r1_3);
988  r2_3 = VecRotateLeft<8>(r2_3);
989  r3_3 = VecRotateLeft<8>(r3_3);
990 
991  r0_2 = VecAdd(r0_2, r0_3);
992  r1_2 = VecAdd(r1_2, r1_3);
993  r2_2 = VecAdd(r2_2, r2_3);
994  r3_2 = VecAdd(r3_2, r3_3);
995 
996  r0_1 = VecXor(r0_1, r0_2);
997  r1_1 = VecXor(r1_1, r1_2);
998  r2_1 = VecXor(r2_1, r2_2);
999  r3_1 = VecXor(r3_1, r3_2);
1000 
1001  r0_1 = VecRotateLeft<7>(r0_1);
1002  r1_1 = VecRotateLeft<7>(r1_1);
1003  r2_1 = VecRotateLeft<7>(r2_1);
1004  r3_1 = VecRotateLeft<7>(r3_1);
1005 
1006  r0_1 = Shuffle<3>(r0_1);
1007  r0_2 = Shuffle<2>(r0_2);
1008  r0_3 = Shuffle<1>(r0_3);
1009 
1010  r1_1 = Shuffle<3>(r1_1);
1011  r1_2 = Shuffle<2>(r1_2);
1012  r1_3 = Shuffle<1>(r1_3);
1013 
1014  r2_1 = Shuffle<3>(r2_1);
1015  r2_2 = Shuffle<2>(r2_2);
1016  r2_3 = Shuffle<1>(r2_3);
1017 
1018  r3_1 = Shuffle<3>(r3_1);
1019  r3_2 = Shuffle<2>(r3_2);
1020  r3_3 = Shuffle<1>(r3_3);
1021  }
1022 
1023  r0_0 = VecAdd(r0_0, state0);
1024  r0_1 = VecAdd(r0_1, state1);
1025  r0_2 = VecAdd(r0_2, state2);
1026  r0_3 = VecAdd(r0_3, state3);
1027 
1028  r1_0 = VecAdd(r1_0, state0);
1029  r1_1 = VecAdd(r1_1, state1);
1030  r1_2 = VecAdd(r1_2, state2);
1031  r1_3 = VecAdd(r1_3, state3);
1032  r1_3 = VecAdd64(r1_3, CTRS[0]);
1033 
1034  r2_0 = VecAdd(r2_0, state0);
1035  r2_1 = VecAdd(r2_1, state1);
1036  r2_2 = VecAdd(r2_2, state2);
1037  r2_3 = VecAdd(r2_3, state3);
1038  r2_3 = VecAdd64(r2_3, CTRS[1]);
1039 
1040  r3_0 = VecAdd(r3_0, state0);
1041  r3_1 = VecAdd(r3_1, state1);
1042  r3_2 = VecAdd(r3_2, state2);
1043  r3_3 = VecAdd(r3_3, state3);
1044  r3_3 = VecAdd64(r3_3, CTRS[2]);
1045 
1046  if (input)
1047  {
1048  r0_0 = VecXor(VecLoad32LE(input + 0*16), r0_0);
1049  r0_1 = VecXor(VecLoad32LE(input + 1*16), r0_1);
1050  r0_2 = VecXor(VecLoad32LE(input + 2*16), r0_2);
1051  r0_3 = VecXor(VecLoad32LE(input + 3*16), r0_3);
1052  }
1053 
1054  VecStore32LE(output + 0*16, r0_0);
1055  VecStore32LE(output + 1*16, r0_1);
1056  VecStore32LE(output + 2*16, r0_2);
1057  VecStore32LE(output + 3*16, r0_3);
1058 
1059  if (input)
1060  {
1061  r1_0 = VecXor(VecLoad32LE(input + 4*16), r1_0);
1062  r1_1 = VecXor(VecLoad32LE(input + 5*16), r1_1);
1063  r1_2 = VecXor(VecLoad32LE(input + 6*16), r1_2);
1064  r1_3 = VecXor(VecLoad32LE(input + 7*16), r1_3);
1065  }
1066 
1067  VecStore32LE(output + 4*16, r1_0);
1068  VecStore32LE(output + 5*16, r1_1);
1069  VecStore32LE(output + 6*16, r1_2);
1070  VecStore32LE(output + 7*16, r1_3);
1071 
1072  if (input)
1073  {
1074  r2_0 = VecXor(VecLoad32LE(input + 8*16), r2_0);
1075  r2_1 = VecXor(VecLoad32LE(input + 9*16), r2_1);
1076  r2_2 = VecXor(VecLoad32LE(input + 10*16), r2_2);
1077  r2_3 = VecXor(VecLoad32LE(input + 11*16), r2_3);
1078  }
1079 
1080  VecStore32LE(output + 8*16, r2_0);
1081  VecStore32LE(output + 9*16, r2_1);
1082  VecStore32LE(output + 10*16, r2_2);
1083  VecStore32LE(output + 11*16, r2_3);
1084 
1085  if (input)
1086  {
1087  r3_0 = VecXor(VecLoad32LE(input + 12*16), r3_0);
1088  r3_1 = VecXor(VecLoad32LE(input + 13*16), r3_1);
1089  r3_2 = VecXor(VecLoad32LE(input + 14*16), r3_2);
1090  r3_3 = VecXor(VecLoad32LE(input + 15*16), r3_3);
1091  }
1092 
1093  VecStore32LE(output + 12*16, r3_0);
1094  VecStore32LE(output + 13*16, r3_1);
1095  VecStore32LE(output + 14*16, r3_2);
1096  VecStore32LE(output + 15*16, r3_3);
1097 }
1098 
1099 #endif // CRYPTOPP_POWER8_AVAILABLE || CRYPTOPP_ALTIVEC_AVAILABLE
1100 
1101 #if (CRYPTOPP_POWER8_AVAILABLE)
1102 
1103 void ChaCha_OperateKeystream_POWER8(const word32 *state, const byte* input, byte *output, unsigned int rounds)
1104 {
1105  ChaCha_OperateKeystream_CORE(state, input, output, rounds);
1106 }
1107 
1108 #elif (CRYPTOPP_ALTIVEC_AVAILABLE)
1109 
1110 void ChaCha_OperateKeystream_ALTIVEC(const word32 *state, const byte* input, byte *output, unsigned int rounds)
1111 {
1112  ChaCha_OperateKeystream_CORE(state, input, output, rounds);
1113 }
1114 
1115 #endif
1116 
1117 NAMESPACE_END