#if defined(CRYPTOPP_AVX2_AVAILABLE)
# include <xmmintrin.h>
# include <emmintrin.h>
# include <immintrin.h>
#endif

// Squash MS LNK4221 and libtool warnings about an empty translation unit.
extern const char CHACHA_AVX_FNAME[] = __FILE__;
// Sun Studio 12.5 and 12.6 (__SUNPRO_CC 0x5140-0x5150) choke on the const
// pointer casts below, so MAYBE_CONST is empty for those compilers.
#if (__SUNPRO_CC >= 0x5140) && (__SUNPRO_CC <= 0x5150)
# define MAYBE_CONST
#else
# define MAYBE_CONST const
#endif

// Work around a VS2017 global optimization bug by restricting the
// optimizations applied to this translation unit in release builds.
#if (_MSC_VER >= 1910)
# ifndef CRYPTOPP_DEBUG
#  pragma optimize("", off)
#  pragma optimize("ts", on)
# endif
#endif

// The data is aligned, but the compiler warns based on the pointer type
// rather than the actual alignment of the data.
#if CRYPTOPP_GCC_DIAGNOSTIC_AVAILABLE
# pragma GCC diagnostic ignored "-Wcast-align"
#endif

ANONYMOUS_NAMESPACE_BEGIN
#if (CRYPTOPP_AVX2_AVAILABLE)

// Rotate each 32-bit word left by R bits.
template <unsigned int R>
inline __m256i RotateLeft(const __m256i val)
{
    return _mm256_or_si256(_mm256_slli_epi32(val, R), _mm256_srli_epi32(val, 32-R));
}
// Rotation by 8 is a whole-byte rotate, so a single byte shuffle
// replaces the shift/shift/or sequence.
template <>
inline __m256i RotateLeft<8>(const __m256i val)
{
    const __m256i mask = _mm256_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3,
        14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3);
    return _mm256_shuffle_epi8(val, mask);
}
// Rotation by 16 likewise reduces to a byte shuffle.
template <>
inline __m256i RotateLeft<16>(const __m256i val)
{
    const __m256i mask = _mm256_set_epi8(13,12,15,14, 9,8,11,10, 5,4,7,6, 1,0,3,2,
        13,12,15,14, 9,8,11,10, 5,4,7,6, 1,0,3,2);
    return _mm256_shuffle_epi8(val, mask);
}
#endif  // CRYPTOPP_AVX2_AVAILABLE

ANONYMOUS_NAMESPACE_END

NAMESPACE_BEGIN(CryptoPP)
#if (CRYPTOPP_AVX2_AVAILABLE)

// Generates eight 64-byte blocks (512 bytes) of ChaCha keystream per call.
// If 'input' is non-NULL the keystream is XORed into it; otherwise the raw
// keystream is written to 'output'.
void ChaCha_OperateKeystream_AVX2(const word32 *state, const byte* input, byte *output, unsigned int rounds)
{
    MAYBE_CONST __m128i* state_mm = (MAYBE_CONST __m128i*)(state);
    MAYBE_CONST __m256i* input_mm = (MAYBE_CONST __m256i*)(input);
    __m256i* output_mm = reinterpret_cast<__m256i*>(output);
    // Broadcast each ChaCha state row (four 32-bit words) into both
    // 128-bit lanes of a 256-bit register.
    const __m256i state0 = _mm256_broadcastsi128_si256(_mm_loadu_si128(state_mm + 0));
    const __m256i state1 = _mm256_broadcastsi128_si256(_mm_loadu_si128(state_mm + 1));
    const __m256i state2 = _mm256_broadcastsi128_si256(_mm_loadu_si128(state_mm + 2));
    const __m256i state3 = _mm256_broadcastsi128_si256(_mm_loadu_si128(state_mm + 3));
    const __m256i CTR0 = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, 4);
    const __m256i CTR1 = _mm256_set_epi32(0, 0, 0, 1, 0, 0, 0, 5);
    const __m256i CTR2 = _mm256_set_epi32(0, 0, 0, 2, 0, 0, 0, 6);
    const __m256i CTR3 = _mm256_set_epi32(0, 0, 0, 3, 0, 0, 0, 7);
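
    // Each 256-bit register holds one state row for two blocks, one block
    // per 128-bit lane, so the four X-sets below cover eight 64-byte blocks.
    // The CTRk constants give each lane its block-counter offset: 0..3 in
    // the upper lanes and 4..7 in the lower lanes. The counter in state
    // words 12-13 is a 64-bit value, hence the _mm256_add_epi64.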
    __m256i X0_0 = state0;
    __m256i X0_1 = state1;
    __m256i X0_2 = state2;
    __m256i X0_3 = _mm256_add_epi64(state3, CTR0);

    __m256i X1_0 = state0;
    __m256i X1_1 = state1;
    __m256i X1_2 = state2;
    __m256i X1_3 = _mm256_add_epi64(state3, CTR1);

    __m256i X2_0 = state0;
    __m256i X2_1 = state1;
    __m256i X2_2 = state2;
    __m256i X2_3 = _mm256_add_epi64(state3, CTR2);

    __m256i X3_0 = state0;
    __m256i X3_1 = state1;
    __m256i X3_2 = state2;
    __m256i X3_3 = _mm256_add_epi64(state3, CTR3);
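
    // Each iteration performs two ChaCha rounds: a column round followed
    // by a diagonal round, so 'rounds' (8, 12 or 20) is consumed two at a time.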
    for (int i = static_cast<int>(rounds); i > 0; i -= 2)
    {
        X0_0 = _mm256_add_epi32(X0_0, X0_1);
        X1_0 = _mm256_add_epi32(X1_0, X1_1);
        X2_0 = _mm256_add_epi32(X2_0, X2_1);
        X3_0 = _mm256_add_epi32(X3_0, X3_1);

        X0_3 = _mm256_xor_si256(X0_3, X0_0);
        X1_3 = _mm256_xor_si256(X1_3, X1_0);
        X2_3 = _mm256_xor_si256(X2_3, X2_0);
        X3_3 = _mm256_xor_si256(X3_3, X3_0);

        X0_3 = RotateLeft<16>(X0_3);
        X1_3 = RotateLeft<16>(X1_3);
        X2_3 = RotateLeft<16>(X2_3);
        X3_3 = RotateLeft<16>(X3_3);

        X0_2 = _mm256_add_epi32(X0_2, X0_3);
        X1_2 = _mm256_add_epi32(X1_2, X1_3);
        X2_2 = _mm256_add_epi32(X2_2, X2_3);
        X3_2 = _mm256_add_epi32(X3_2, X3_3);

        X0_1 = _mm256_xor_si256(X0_1, X0_2);
        X1_1 = _mm256_xor_si256(X1_1, X1_2);
        X2_1 = _mm256_xor_si256(X2_1, X2_2);
        X3_1 = _mm256_xor_si256(X3_1, X3_2);

        X0_1 = RotateLeft<12>(X0_1);
        X1_1 = RotateLeft<12>(X1_1);
        X2_1 = RotateLeft<12>(X2_1);
        X3_1 = RotateLeft<12>(X3_1);

        X0_0 = _mm256_add_epi32(X0_0, X0_1);
        X1_0 = _mm256_add_epi32(X1_0, X1_1);
        X2_0 = _mm256_add_epi32(X2_0, X2_1);
        X3_0 = _mm256_add_epi32(X3_0, X3_1);

        X0_3 = _mm256_xor_si256(X0_3, X0_0);
        X1_3 = _mm256_xor_si256(X1_3, X1_0);
        X2_3 = _mm256_xor_si256(X2_3, X2_0);
        X3_3 = _mm256_xor_si256(X3_3, X3_0);

        X0_3 = RotateLeft<8>(X0_3);
        X1_3 = RotateLeft<8>(X1_3);
        X2_3 = RotateLeft<8>(X2_3);
        X3_3 = RotateLeft<8>(X3_3);

        X0_2 = _mm256_add_epi32(X0_2, X0_3);
        X1_2 = _mm256_add_epi32(X1_2, X1_3);
        X2_2 = _mm256_add_epi32(X2_2, X2_3);
        X3_2 = _mm256_add_epi32(X3_2, X3_3);

        X0_1 = _mm256_xor_si256(X0_1, X0_2);
        X1_1 = _mm256_xor_si256(X1_1, X1_2);
        X2_1 = _mm256_xor_si256(X2_1, X2_2);
        X3_1 = _mm256_xor_si256(X3_1, X3_2);

        X0_1 = RotateLeft<7>(X0_1);
        X1_1 = RotateLeft<7>(X1_1);
        X2_1 = RotateLeft<7>(X2_1);
        X3_1 = RotateLeft<7>(X3_1);
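
        // Rearrange rows 1-3 within each 128-bit lane so the quarter-round
        // below operates on the diagonals of the 4x4 state.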
        X0_1 = _mm256_shuffle_epi32(X0_1, _MM_SHUFFLE(0, 3, 2, 1));
        X0_2 = _mm256_shuffle_epi32(X0_2, _MM_SHUFFLE(1, 0, 3, 2));
        X0_3 = _mm256_shuffle_epi32(X0_3, _MM_SHUFFLE(2, 1, 0, 3));

        X1_1 = _mm256_shuffle_epi32(X1_1, _MM_SHUFFLE(0, 3, 2, 1));
        X1_2 = _mm256_shuffle_epi32(X1_2, _MM_SHUFFLE(1, 0, 3, 2));
        X1_3 = _mm256_shuffle_epi32(X1_3, _MM_SHUFFLE(2, 1, 0, 3));

        X2_1 = _mm256_shuffle_epi32(X2_1, _MM_SHUFFLE(0, 3, 2, 1));
        X2_2 = _mm256_shuffle_epi32(X2_2, _MM_SHUFFLE(1, 0, 3, 2));
        X2_3 = _mm256_shuffle_epi32(X2_3, _MM_SHUFFLE(2, 1, 0, 3));

        X3_1 = _mm256_shuffle_epi32(X3_1, _MM_SHUFFLE(0, 3, 2, 1));
        X3_2 = _mm256_shuffle_epi32(X3_2, _MM_SHUFFLE(1, 0, 3, 2));
        X3_3 = _mm256_shuffle_epi32(X3_3, _MM_SHUFFLE(2, 1, 0, 3));
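
        // Diagonal round: the same add/XOR/rotate(16,12,8,7) sequence as above.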
        X0_0 = _mm256_add_epi32(X0_0, X0_1);
        X1_0 = _mm256_add_epi32(X1_0, X1_1);
        X2_0 = _mm256_add_epi32(X2_0, X2_1);
        X3_0 = _mm256_add_epi32(X3_0, X3_1);

        X0_3 = _mm256_xor_si256(X0_3, X0_0);
        X1_3 = _mm256_xor_si256(X1_3, X1_0);
        X2_3 = _mm256_xor_si256(X2_3, X2_0);
        X3_3 = _mm256_xor_si256(X3_3, X3_0);

        X0_3 = RotateLeft<16>(X0_3);
        X1_3 = RotateLeft<16>(X1_3);
        X2_3 = RotateLeft<16>(X2_3);
        X3_3 = RotateLeft<16>(X3_3);

        X0_2 = _mm256_add_epi32(X0_2, X0_3);
        X1_2 = _mm256_add_epi32(X1_2, X1_3);
        X2_2 = _mm256_add_epi32(X2_2, X2_3);
        X3_2 = _mm256_add_epi32(X3_2, X3_3);

        X0_1 = _mm256_xor_si256(X0_1, X0_2);
        X1_1 = _mm256_xor_si256(X1_1, X1_2);
        X2_1 = _mm256_xor_si256(X2_1, X2_2);
        X3_1 = _mm256_xor_si256(X3_1, X3_2);

        X0_1 = RotateLeft<12>(X0_1);
        X1_1 = RotateLeft<12>(X1_1);
        X2_1 = RotateLeft<12>(X2_1);
        X3_1 = RotateLeft<12>(X3_1);

        X0_0 = _mm256_add_epi32(X0_0, X0_1);
        X1_0 = _mm256_add_epi32(X1_0, X1_1);
        X2_0 = _mm256_add_epi32(X2_0, X2_1);
        X3_0 = _mm256_add_epi32(X3_0, X3_1);

        X0_3 = _mm256_xor_si256(X0_3, X0_0);
        X1_3 = _mm256_xor_si256(X1_3, X1_0);
        X2_3 = _mm256_xor_si256(X2_3, X2_0);
        X3_3 = _mm256_xor_si256(X3_3, X3_0);

        X0_3 = RotateLeft<8>(X0_3);
        X1_3 = RotateLeft<8>(X1_3);
        X2_3 = RotateLeft<8>(X2_3);
        X3_3 = RotateLeft<8>(X3_3);

        X0_2 = _mm256_add_epi32(X0_2, X0_3);
        X1_2 = _mm256_add_epi32(X1_2, X1_3);
        X2_2 = _mm256_add_epi32(X2_2, X2_3);
        X3_2 = _mm256_add_epi32(X3_2, X3_3);

        X0_1 = _mm256_xor_si256(X0_1, X0_2);
        X1_1 = _mm256_xor_si256(X1_1, X1_2);
        X2_1 = _mm256_xor_si256(X2_1, X2_2);
        X3_1 = _mm256_xor_si256(X3_1, X3_2);

        X0_1 = RotateLeft<7>(X0_1);
        X1_1 = RotateLeft<7>(X1_1);
        X2_1 = RotateLeft<7>(X2_1);
        X3_1 = RotateLeft<7>(X3_1);
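
        // Rotate rows 1-3 back to column order before the next iteration.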
        X0_1 = _mm256_shuffle_epi32(X0_1, _MM_SHUFFLE(2, 1, 0, 3));
        X0_2 = _mm256_shuffle_epi32(X0_2, _MM_SHUFFLE(1, 0, 3, 2));
        X0_3 = _mm256_shuffle_epi32(X0_3, _MM_SHUFFLE(0, 3, 2, 1));

        X1_1 = _mm256_shuffle_epi32(X1_1, _MM_SHUFFLE(2, 1, 0, 3));
        X1_2 = _mm256_shuffle_epi32(X1_2, _MM_SHUFFLE(1, 0, 3, 2));
        X1_3 = _mm256_shuffle_epi32(X1_3, _MM_SHUFFLE(0, 3, 2, 1));

        X2_1 = _mm256_shuffle_epi32(X2_1, _MM_SHUFFLE(2, 1, 0, 3));
        X2_2 = _mm256_shuffle_epi32(X2_2, _MM_SHUFFLE(1, 0, 3, 2));
        X2_3 = _mm256_shuffle_epi32(X2_3, _MM_SHUFFLE(0, 3, 2, 1));

        X3_1 = _mm256_shuffle_epi32(X3_1, _MM_SHUFFLE(2, 1, 0, 3));
        X3_2 = _mm256_shuffle_epi32(X3_2, _MM_SHUFFLE(1, 0, 3, 2));
        X3_3 = _mm256_shuffle_epi32(X3_3, _MM_SHUFFLE(0, 3, 2, 1));
    }
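
    // Feed-forward: add the original state (and the per-lane counter
    // offsets) back into the working state to form the keystream blocks.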
    X0_0 = _mm256_add_epi32(X0_0, state0);
    X0_1 = _mm256_add_epi32(X0_1, state1);
    X0_2 = _mm256_add_epi32(X0_2, state2);
    X0_3 = _mm256_add_epi32(X0_3, state3);
    X0_3 = _mm256_add_epi64(X0_3, CTR0);

    X1_0 = _mm256_add_epi32(X1_0, state0);
    X1_1 = _mm256_add_epi32(X1_1, state1);
    X1_2 = _mm256_add_epi32(X1_2, state2);
    X1_3 = _mm256_add_epi32(X1_3, state3);
    X1_3 = _mm256_add_epi64(X1_3, CTR1);

    X2_0 = _mm256_add_epi32(X2_0, state0);
    X2_1 = _mm256_add_epi32(X2_1, state1);
    X2_2 = _mm256_add_epi32(X2_2, state2);
    X2_3 = _mm256_add_epi32(X2_3, state3);
    X2_3 = _mm256_add_epi64(X2_3, CTR2);

    X3_0 = _mm256_add_epi32(X3_0, state0);
    X3_1 = _mm256_add_epi32(X3_1, state1);
    X3_2 = _mm256_add_epi32(X3_2, state2);
    X3_3 = _mm256_add_epi32(X3_3, state3);
    X3_3 = _mm256_add_epi64(X3_3, CTR3);
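
    // Store the eight keystream blocks. The permute2x128 selectors 0x31 and
    // 0x20 stitch matching 128-bit lanes of two row registers back into
    // contiguous 64-byte blocks in counter order. When an input buffer is
    // provided the keystream is XORed into it (encrypt/decrypt); otherwise
    // the raw keystream is written.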
    if (input_mm)
    {
        _mm256_storeu_si256(output_mm + 0, _mm256_xor_si256(_mm256_loadu_si256(input_mm + 0),
            _mm256_permute2x128_si256(X0_0, X0_1, 1 + (3 << 4))));
        _mm256_storeu_si256(output_mm + 1, _mm256_xor_si256(_mm256_loadu_si256(input_mm + 1),
            _mm256_permute2x128_si256(X0_2, X0_3, 1 + (3 << 4))));
        _mm256_storeu_si256(output_mm + 2, _mm256_xor_si256(_mm256_loadu_si256(input_mm + 2),
            _mm256_permute2x128_si256(X1_0, X1_1, 1 + (3 << 4))));
        _mm256_storeu_si256(output_mm + 3, _mm256_xor_si256(_mm256_loadu_si256(input_mm + 3),
            _mm256_permute2x128_si256(X1_2, X1_3, 1 + (3 << 4))));
    }
    else
    {
        _mm256_storeu_si256(output_mm + 0, _mm256_permute2x128_si256(X0_0, X0_1, 1 + (3 << 4)));
        _mm256_storeu_si256(output_mm + 1, _mm256_permute2x128_si256(X0_2, X0_3, 1 + (3 << 4)));
        _mm256_storeu_si256(output_mm + 2, _mm256_permute2x128_si256(X1_0, X1_1, 1 + (3 << 4)));
        _mm256_storeu_si256(output_mm + 3, _mm256_permute2x128_si256(X1_2, X1_3, 1 + (3 << 4)));
    }
    if (input_mm)
    {
        _mm256_storeu_si256(output_mm + 4, _mm256_xor_si256(_mm256_loadu_si256(input_mm + 4),
            _mm256_permute2x128_si256(X2_0, X2_1, 1 + (3 << 4))));
        _mm256_storeu_si256(output_mm + 5, _mm256_xor_si256(_mm256_loadu_si256(input_mm + 5),
            _mm256_permute2x128_si256(X2_2, X2_3, 1 + (3 << 4))));
        _mm256_storeu_si256(output_mm + 6, _mm256_xor_si256(_mm256_loadu_si256(input_mm + 6),
            _mm256_permute2x128_si256(X3_0, X3_1, 1 + (3 << 4))));
        _mm256_storeu_si256(output_mm + 7, _mm256_xor_si256(_mm256_loadu_si256(input_mm + 7),
            _mm256_permute2x128_si256(X3_2, X3_3, 1 + (3 << 4))));
    }
    else
    {
        _mm256_storeu_si256(output_mm + 4, _mm256_permute2x128_si256(X2_0, X2_1, 1 + (3 << 4)));
        _mm256_storeu_si256(output_mm + 5, _mm256_permute2x128_si256(X2_2, X2_3, 1 + (3 << 4)));
        _mm256_storeu_si256(output_mm + 6, _mm256_permute2x128_si256(X3_0, X3_1, 1 + (3 << 4)));
        _mm256_storeu_si256(output_mm + 7, _mm256_permute2x128_si256(X3_2, X3_3, 1 + (3 << 4)));
    }
    if (input_mm)
    {
        _mm256_storeu_si256(output_mm + 8, _mm256_xor_si256(_mm256_loadu_si256(input_mm + 8),
            _mm256_permute2x128_si256(X0_0, X0_1, 0 + (2 << 4))));
        _mm256_storeu_si256(output_mm + 9, _mm256_xor_si256(_mm256_loadu_si256(input_mm + 9),
            _mm256_permute2x128_si256(X0_2, X0_3, 0 + (2 << 4))));
        _mm256_storeu_si256(output_mm + 10, _mm256_xor_si256(_mm256_loadu_si256(input_mm + 10),
            _mm256_permute2x128_si256(X1_0, X1_1, 0 + (2 << 4))));
        _mm256_storeu_si256(output_mm + 11, _mm256_xor_si256(_mm256_loadu_si256(input_mm + 11),
            _mm256_permute2x128_si256(X1_2, X1_3, 0 + (2 << 4))));
    }
    else
    {
        _mm256_storeu_si256(output_mm + 8, _mm256_permute2x128_si256(X0_0, X0_1, 0 + (2 << 4)));
        _mm256_storeu_si256(output_mm + 9, _mm256_permute2x128_si256(X0_2, X0_3, 0 + (2 << 4)));
        _mm256_storeu_si256(output_mm + 10, _mm256_permute2x128_si256(X1_0, X1_1, 0 + (2 << 4)));
        _mm256_storeu_si256(output_mm + 11, _mm256_permute2x128_si256(X1_2, X1_3, 0 + (2 << 4)));
    }
    if (input_mm)
    {
        _mm256_storeu_si256(output_mm + 12, _mm256_xor_si256(_mm256_loadu_si256(input_mm + 12),
            _mm256_permute2x128_si256(X2_0, X2_1, 0 + (2 << 4))));
        _mm256_storeu_si256(output_mm + 13, _mm256_xor_si256(_mm256_loadu_si256(input_mm + 13),
            _mm256_permute2x128_si256(X2_2, X2_3, 0 + (2 << 4))));
        _mm256_storeu_si256(output_mm + 14, _mm256_xor_si256(_mm256_loadu_si256(input_mm + 14),
            _mm256_permute2x128_si256(X3_0, X3_1, 0 + (2 << 4))));
        _mm256_storeu_si256(output_mm + 15, _mm256_xor_si256(_mm256_loadu_si256(input_mm + 15),
            _mm256_permute2x128_si256(X3_2, X3_3, 0 + (2 << 4))));
    }
    else
    {
        _mm256_storeu_si256(output_mm + 12, _mm256_permute2x128_si256(X2_0, X2_1, 0 + (2 << 4)));
        _mm256_storeu_si256(output_mm + 13, _mm256_permute2x128_si256(X2_2, X2_3, 0 + (2 << 4)));
        _mm256_storeu_si256(output_mm + 14, _mm256_permute2x128_si256(X3_0, X3_1, 0 + (2 << 4)));
        _mm256_storeu_si256(output_mm + 15, _mm256_permute2x128_si256(X3_2, X3_3, 0 + (2 << 4)));
    }
}
#endif  // CRYPTOPP_AVX2_AVAILABLE

NAMESPACE_END