#if (CRYPTOPP_SSSE3_AVAILABLE)
# include "adv_simd.h"
# include <pmmintrin.h>
# include <tmmintrin.h>
#endif

#if defined(__XOP__)
# include <ammintrin.h>
#endif

#if defined(__AVX512F__)
# define CRYPTOPP_AVX512_ROTATE 1
# include <immintrin.h>
#endif

// Squash MS LNK4221 and libtool warnings
extern const char CHAM_SIMD_FNAME[] = __FILE__;
ANONYMOUS_NAMESPACE_BEGIN

using CryptoPP::word16;
using CryptoPP::word32;
#if (CRYPTOPP_SSSE3_AVAILABLE)

NAMESPACE_BEGIN(W16)  // CHAM64, 16-bit word size

template <unsigned int R>
inline __m128i RotateLeft16(const __m128i& val)
{
#if defined(__XOP__)
    return _mm_roti_epi16(val, R);
#else
    return _mm_or_si128(
        _mm_slli_epi16(val, R), _mm_srli_epi16(val, 16-R));
#endif
}

template <unsigned int R>
inline __m128i RotateRight16(const __m128i& val)
{
#if defined(__XOP__)
    return _mm_roti_epi16(val, 16-R);
#else
    return _mm_or_si128(
        _mm_slli_epi16(val, 16-R), _mm_srli_epi16(val, R));
#endif
}
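// For reference only: a minimal scalar sketch of what each 16-bit lane of
// the templates above computes. The helper is illustrative, not part of the
// library API, and is not referenced elsewhere in this file.
template <unsigned int R>
inline word16 RotateLeft16_Sketch(word16 v)
{
    // Rotate a 16-bit value left by R bits.
    return static_cast<word16>((v << R) | (v >> (16 - R)));
}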
// Faster than two Shifts and an Or.
template <>
inline __m128i RotateLeft16<8>(const __m128i& val)
{
#if defined(__XOP__)
    return _mm_roti_epi16(val, 8);
#else
    const __m128i mask = _mm_set_epi8(14,15, 12,13, 10,11, 8,9, 6,7, 4,5, 2,3, 0,1);
    return _mm_shuffle_epi8(val, mask);
#endif
}

// Faster than two Shifts and an Or.
template <>
inline __m128i RotateRight16<8>(const __m128i& val)
{
#if defined(__XOP__)
    return _mm_roti_epi16(val, 16-8);
#else
    const __m128i mask = _mm_set_epi8(14,15, 12,13, 10,11, 8,9, 6,7, 4,5, 2,3, 0,1);
    return _mm_shuffle_epi8(val, mask);
#endif
}
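// Note on the specializations above: rotating a 16-bit lane by 8 in either
// direction is the same operation, a byte swap within the lane, so both
// specializations use the identical _mm_shuffle_epi8 mask and finish in a
// single instruction rather than two shifts and an OR.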
template <unsigned int IDX>
inline __m128i UnpackXMM(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d,
    const __m128i& e, const __m128i& f, const __m128i& g, const __m128i& h)
{
    // Should not be instantiated
    CRYPTOPP_UNUSED(a); CRYPTOPP_UNUSED(b);
    CRYPTOPP_UNUSED(c); CRYPTOPP_UNUSED(d);
    CRYPTOPP_UNUSED(e); CRYPTOPP_UNUSED(f);
    CRYPTOPP_UNUSED(g); CRYPTOPP_UNUSED(h);
    CRYPTOPP_ASSERT(0);
    return _mm_setzero_si128();
}
template <>
inline __m128i UnpackXMM<0>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d,
    const __m128i& e, const __m128i& f, const __m128i& g, const __m128i& h)
{
    const __m128i r1 = _mm_unpacklo_epi16(a, b);
    const __m128i r2 = _mm_unpacklo_epi16(c, d);
    const __m128i r3 = _mm_unpacklo_epi16(e, f);
    const __m128i r4 = _mm_unpacklo_epi16(g, h);

    const __m128i r5 = _mm_unpacklo_epi32(r1, r2);
    const __m128i r6 = _mm_unpacklo_epi32(r3, r4);
    return _mm_shuffle_epi8(_mm_unpacklo_epi64(r5, r6),
        _mm_set_epi8(14,15,12,13, 10,11,8,9, 6,7,4,5, 2,3,0,1));
}

template <>
inline __m128i UnpackXMM<1>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d,
    const __m128i& e, const __m128i& f, const __m128i& g, const __m128i& h)
{
    const __m128i r1 = _mm_unpacklo_epi16(a, b);
    const __m128i r2 = _mm_unpacklo_epi16(c, d);
    const __m128i r3 = _mm_unpacklo_epi16(e, f);
    const __m128i r4 = _mm_unpacklo_epi16(g, h);

    const __m128i r5 = _mm_unpacklo_epi32(r1, r2);
    const __m128i r6 = _mm_unpacklo_epi32(r3, r4);
    return _mm_shuffle_epi8(_mm_unpackhi_epi64(r5, r6),
        _mm_set_epi8(14,15,12,13, 10,11,8,9, 6,7,4,5, 2,3,0,1));
}
template <>
inline __m128i UnpackXMM<2>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d,
    const __m128i& e, const __m128i& f, const __m128i& g, const __m128i& h)
{
    const __m128i r1 = _mm_unpacklo_epi16(a, b);
    const __m128i r2 = _mm_unpacklo_epi16(c, d);
    const __m128i r3 = _mm_unpacklo_epi16(e, f);
    const __m128i r4 = _mm_unpacklo_epi16(g, h);

    const __m128i r5 = _mm_unpackhi_epi32(r1, r2);
    const __m128i r6 = _mm_unpackhi_epi32(r3, r4);
    return _mm_shuffle_epi8(_mm_unpacklo_epi64(r5, r6),
        _mm_set_epi8(14,15,12,13, 10,11,8,9, 6,7,4,5, 2,3,0,1));
}

template <>
inline __m128i UnpackXMM<3>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d,
    const __m128i& e, const __m128i& f, const __m128i& g, const __m128i& h)
{
    const __m128i r1 = _mm_unpacklo_epi16(a, b);
    const __m128i r2 = _mm_unpacklo_epi16(c, d);
    const __m128i r3 = _mm_unpacklo_epi16(e, f);
    const __m128i r4 = _mm_unpacklo_epi16(g, h);

    const __m128i r5 = _mm_unpackhi_epi32(r1, r2);
    const __m128i r6 = _mm_unpackhi_epi32(r3, r4);
    return _mm_shuffle_epi8(_mm_unpackhi_epi64(r5, r6),
        _mm_set_epi8(14,15,12,13, 10,11,8,9, 6,7,4,5, 2,3,0,1));
}
template <>
inline __m128i UnpackXMM<4>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d,
    const __m128i& e, const __m128i& f, const __m128i& g, const __m128i& h)
{
    const __m128i r1 = _mm_unpackhi_epi16(a, b);
    const __m128i r2 = _mm_unpackhi_epi16(c, d);
    const __m128i r3 = _mm_unpackhi_epi16(e, f);
    const __m128i r4 = _mm_unpackhi_epi16(g, h);

    const __m128i r5 = _mm_unpacklo_epi32(r1, r2);
    const __m128i r6 = _mm_unpacklo_epi32(r3, r4);
    return _mm_shuffle_epi8(_mm_unpacklo_epi64(r5, r6),
        _mm_set_epi8(14,15,12,13, 10,11,8,9, 6,7,4,5, 2,3,0,1));
}

template <>
inline __m128i UnpackXMM<5>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d,
    const __m128i& e, const __m128i& f, const __m128i& g, const __m128i& h)
{
    const __m128i r1 = _mm_unpackhi_epi16(a, b);
    const __m128i r2 = _mm_unpackhi_epi16(c, d);
    const __m128i r3 = _mm_unpackhi_epi16(e, f);
    const __m128i r4 = _mm_unpackhi_epi16(g, h);

    const __m128i r5 = _mm_unpacklo_epi32(r1, r2);
    const __m128i r6 = _mm_unpacklo_epi32(r3, r4);
    return _mm_shuffle_epi8(_mm_unpackhi_epi64(r5, r6),
        _mm_set_epi8(14,15,12,13, 10,11,8,9, 6,7,4,5, 2,3,0,1));
}
template <>
inline __m128i UnpackXMM<6>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d,
    const __m128i& e, const __m128i& f, const __m128i& g, const __m128i& h)
{
    const __m128i r1 = _mm_unpackhi_epi16(a, b);
    const __m128i r2 = _mm_unpackhi_epi16(c, d);
    const __m128i r3 = _mm_unpackhi_epi16(e, f);
    const __m128i r4 = _mm_unpackhi_epi16(g, h);

    const __m128i r5 = _mm_unpackhi_epi32(r1, r2);
    const __m128i r6 = _mm_unpackhi_epi32(r3, r4);
    return _mm_shuffle_epi8(_mm_unpacklo_epi64(r5, r6),
        _mm_set_epi8(14,15,12,13, 10,11,8,9, 6,7,4,5, 2,3,0,1));
}

template <>
inline __m128i UnpackXMM<7>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d,
    const __m128i& e, const __m128i& f, const __m128i& g, const __m128i& h)
{
    const __m128i r1 = _mm_unpackhi_epi16(a, b);
    const __m128i r2 = _mm_unpackhi_epi16(c, d);
    const __m128i r3 = _mm_unpackhi_epi16(e, f);
    const __m128i r4 = _mm_unpackhi_epi16(g, h);

    const __m128i r5 = _mm_unpackhi_epi32(r1, r2);
    const __m128i r6 = _mm_unpackhi_epi32(r3, r4);
    return _mm_shuffle_epi8(_mm_unpackhi_epi64(r5, r6),
        _mm_set_epi8(14,15,12,13, 10,11,8,9, 6,7,4,5, 2,3,0,1));
}
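// Descriptive note on the specializations above: UnpackXMM<IDX> gathers the
// IDX-th 16-bit word of each of the eight inputs a..h into one register.
// The three unpack stages amount to an 8x8 transpose of 16-bit words, and
// the trailing _mm_shuffle_epi8 byte-swaps every 16-bit lane of the result.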
template <unsigned int IDX>
inline __m128i UnpackXMM(const __m128i& v)
{
    // Should not be instantiated
    CRYPTOPP_UNUSED(v); CRYPTOPP_ASSERT(0);
    return _mm_setzero_si128();
}

template <>
inline __m128i UnpackXMM<0>(const __m128i& v)
{
    return _mm_shuffle_epi8(v, _mm_set_epi8(0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1));
}

template <>
inline __m128i UnpackXMM<1>(const __m128i& v)
{
    return _mm_shuffle_epi8(v, _mm_set_epi8(2,3, 2,3, 2,3, 2,3, 2,3, 2,3, 2,3, 2,3));
}

template <>
inline __m128i UnpackXMM<2>(const __m128i& v)
{
    return _mm_shuffle_epi8(v, _mm_set_epi8(4,5, 4,5, 4,5, 4,5, 4,5, 4,5, 4,5, 4,5));
}

template <>
inline __m128i UnpackXMM<3>(const __m128i& v)
{
    return _mm_shuffle_epi8(v, _mm_set_epi8(6,7, 6,7, 6,7, 6,7, 6,7, 6,7, 6,7, 6,7));
}

template <>
inline __m128i UnpackXMM<4>(const __m128i& v)
{
    return _mm_shuffle_epi8(v, _mm_set_epi8(8,9, 8,9, 8,9, 8,9, 8,9, 8,9, 8,9, 8,9));
}

template <>
inline __m128i UnpackXMM<5>(const __m128i& v)
{
    return _mm_shuffle_epi8(v, _mm_set_epi8(10,11, 10,11, 10,11, 10,11, 10,11, 10,11, 10,11, 10,11));
}

template <>
inline __m128i UnpackXMM<6>(const __m128i& v)
{
    return _mm_shuffle_epi8(v, _mm_set_epi8(12,13, 12,13, 12,13, 12,13, 12,13, 12,13, 12,13, 12,13));
}

template <>
inline __m128i UnpackXMM<7>(const __m128i& v)
{
    return _mm_shuffle_epi8(v, _mm_set_epi8(14,15, 14,15, 14,15, 14,15, 14,15, 14,15, 14,15, 14,15));
}
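// Descriptive note: the single-input specializations above broadcast the
// IDX-th 16-bit word of the lone input to all eight lanes (each byte pair in
// the _mm_set_epi8 mask repeats), so the one-block code path below can reuse
// the same eight-register round code as the multi-block path.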
template <unsigned int IDX>
inline __m128i UnpackXMM(const __m128i& a, const __m128i& b)
{
    const __m128i& z = _mm_setzero_si128();
    return UnpackXMM<IDX>(a, b, z, z, z, z, z, z);
}
template <unsigned int IDX>
inline __m128i RepackXMM(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d,
    const __m128i& e, const __m128i& f, const __m128i& g, const __m128i& h)
{
    return UnpackXMM<IDX>(a, b, c, d, e, f, g, h);
}

template <unsigned int IDX>
inline __m128i RepackXMM(const __m128i& v)
{
    return UnpackXMM<IDX>(v);
}
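// Descriptive note: RepackXMM is UnpackXMM under another name. The word
// transpose and the per-lane byte swap are both involutions and act at
// different granularities, so applying the same permutation a second time
// restores the original in-memory block layout.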
inline void CHAM64_Enc_Block(__m128i &block0,
    const word16 *subkeys, unsigned int /*rounds*/)
{
    // Rearrange the data for vectorization. Each of a..h receives one
    // 16-bit word position of the input.
    __m128i a = UnpackXMM<0>(block0);
    __m128i b = UnpackXMM<1>(block0);
    __m128i c = UnpackXMM<2>(block0);
    __m128i d = UnpackXMM<3>(block0);
    __m128i e = UnpackXMM<4>(block0);
    __m128i f = UnpackXMM<5>(block0);
    __m128i g = UnpackXMM<6>(block0);
    __m128i h = UnpackXMM<7>(block0);

    const unsigned int rounds = 80;
    __m128i counter = _mm_set_epi16(0,0,0,0,0,0,0,0);
    __m128i increment = _mm_set_epi16(1,1,1,1,1,1,1,1);

    const unsigned int MASK = 15;
    for (int i=0; i<static_cast<int>(rounds); i+=4)
    {
        __m128i k, kr, t1, t2, t3, t4;
        // Each 64-bit load brings in four consecutive 16-bit subkeys;
        // kr broadcasts one of them to every lane.
        k = _mm_castpd_si128(_mm_load_sd((const double*)(&subkeys[(i+0) & MASK])));

        kr = _mm_shuffle_epi8(k, _mm_set_epi8(1,0,1,0, 1,0,1,0, 1,0,1,0, 1,0,1,0));

        t1 = _mm_xor_si128(a, counter);
        t3 = _mm_xor_si128(e, counter);
        t2 = _mm_xor_si128(RotateLeft16<1>(b), kr);
        t4 = _mm_xor_si128(RotateLeft16<1>(f), kr);
        a = RotateLeft16<8>(_mm_add_epi16(t1, t2));
        e = RotateLeft16<8>(_mm_add_epi16(t3, t4));

        counter = _mm_add_epi16(counter, increment);
        kr = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,3,2, 3,2,3,2, 3,2,3,2, 3,2,3,2));

        t1 = _mm_xor_si128(b, counter);
        t3 = _mm_xor_si128(f, counter);
        t2 = _mm_xor_si128(RotateLeft16<8>(c), kr);
        t4 = _mm_xor_si128(RotateLeft16<8>(g), kr);
        b = RotateLeft16<1>(_mm_add_epi16(t1, t2));
        f = RotateLeft16<1>(_mm_add_epi16(t3, t4));

        counter = _mm_add_epi16(counter, increment);
        kr = _mm_shuffle_epi8(k, _mm_set_epi8(5,4,5,4, 5,4,5,4, 5,4,5,4, 5,4,5,4));

        t1 = _mm_xor_si128(c, counter);
        t3 = _mm_xor_si128(g, counter);
        t2 = _mm_xor_si128(RotateLeft16<1>(d), kr);
        t4 = _mm_xor_si128(RotateLeft16<1>(h), kr);
        c = RotateLeft16<8>(_mm_add_epi16(t1, t2));
        g = RotateLeft16<8>(_mm_add_epi16(t3, t4));

        counter = _mm_add_epi16(counter, increment);
        kr = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,7,6, 7,6,7,6, 7,6,7,6, 7,6,7,6));

        t1 = _mm_xor_si128(d, counter);
        t3 = _mm_xor_si128(h, counter);
        t2 = _mm_xor_si128(RotateLeft16<8>(a), kr);
        t4 = _mm_xor_si128(RotateLeft16<8>(e), kr);
        d = RotateLeft16<1>(_mm_add_epi16(t1, t2));
        h = RotateLeft16<1>(_mm_add_epi16(t3, t4));

        counter = _mm_add_epi16(counter, increment);
    }

    // Repack the registers into the original block layout.
    block0 = RepackXMM<0>(a,b,c,d,e,f,g,h);
}
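// Illustrative scalar sketch (not used by the library) of the first two of
// the four rounds performed per loop iteration above, for an even round
// number i. Even rounds rotate the mixed-in word left by 1 and the result
// left by 8; odd rounds swap those amounts. rk[] is the 16-entry CHAM-64/128
// round-key table, addressed modulo 16 exactly as MASK does above. Function
// and parameter names here are illustrative only.
inline void CHAM64_TwoRounds_Sketch(word16 x[4], const word16 rk[16], unsigned int i)
{
    const word16 r0 = static_cast<word16>((x[1] << 1) | (x[1] >> 15));   // ROL1(x1)
    const word16 s0 = static_cast<word16>((x[0] ^ i) + (r0 ^ rk[i % 16]));
    x[0] = static_cast<word16>((s0 << 8) | (s0 >> 8));                   // ROL8

    const word16 r1 = static_cast<word16>((x[2] << 8) | (x[2] >> 8));    // ROL8(x2)
    const word16 s1 = static_cast<word16>((x[1] ^ (i+1)) + (r1 ^ rk[(i+1) % 16]));
    x[1] = static_cast<word16>((s1 << 1) | (s1 >> 15));                  // ROL1
}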
inline void CHAM64_Dec_Block(__m128i &block0,
    const word16 *subkeys, unsigned int /*rounds*/)
{
    // Rearrange the data for vectorization.
    __m128i a = UnpackXMM<0>(block0);
    __m128i b = UnpackXMM<1>(block0);
    __m128i c = UnpackXMM<2>(block0);
    __m128i d = UnpackXMM<3>(block0);
    __m128i e = UnpackXMM<4>(block0);
    __m128i f = UnpackXMM<5>(block0);
    __m128i g = UnpackXMM<6>(block0);
    __m128i h = UnpackXMM<7>(block0);

    const unsigned int rounds = 80;
    __m128i counter = _mm_set_epi16(rounds-1,rounds-1,rounds-1,rounds-1, rounds-1,rounds-1,rounds-1,rounds-1);
    __m128i decrement = _mm_set_epi16(1,1,1,1,1,1,1,1);

    const unsigned int MASK = 15;
    for (int i = static_cast<int>(rounds)-1; i >= 0; i-=4)
    {
        __m128i k, kr, t1, t2, t3, t4;
        // Round keys are loaded four at a time and consumed in reverse order.
        k = _mm_castpd_si128(_mm_load_sd((const double*)(&subkeys[(i-3) & MASK])));

        kr = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,7,6, 7,6,7,6, 7,6,7,6, 7,6,7,6));

        t1 = RotateRight16<1>(d);
        t3 = RotateRight16<1>(h);
        t2 = _mm_xor_si128(RotateLeft16<8>(a), kr);
        t4 = _mm_xor_si128(RotateLeft16<8>(e), kr);
        d = _mm_xor_si128(_mm_sub_epi16(t1, t2), counter);
        h = _mm_xor_si128(_mm_sub_epi16(t3, t4), counter);

        counter = _mm_sub_epi16(counter, decrement);
        kr = _mm_shuffle_epi8(k, _mm_set_epi8(5,4,5,4, 5,4,5,4, 5,4,5,4, 5,4,5,4));

        t1 = RotateRight16<8>(c);
        t3 = RotateRight16<8>(g);
        t2 = _mm_xor_si128(RotateLeft16<1>(d), kr);
        t4 = _mm_xor_si128(RotateLeft16<1>(h), kr);
        c = _mm_xor_si128(_mm_sub_epi16(t1, t2), counter);
        g = _mm_xor_si128(_mm_sub_epi16(t3, t4), counter);

        counter = _mm_sub_epi16(counter, decrement);
        kr = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,3,2, 3,2,3,2, 3,2,3,2, 3,2,3,2));

        t1 = RotateRight16<1>(b);
        t3 = RotateRight16<1>(f);
        t2 = _mm_xor_si128(RotateLeft16<8>(c), kr);
        t4 = _mm_xor_si128(RotateLeft16<8>(g), kr);
        b = _mm_xor_si128(_mm_sub_epi16(t1, t2), counter);
        f = _mm_xor_si128(_mm_sub_epi16(t3, t4), counter);

        counter = _mm_sub_epi16(counter, decrement);
        kr = _mm_shuffle_epi8(k, _mm_set_epi8(1,0,1,0, 1,0,1,0, 1,0,1,0, 1,0,1,0));

        t1 = RotateRight16<8>(a);
        t3 = RotateRight16<8>(e);
        t2 = _mm_xor_si128(RotateLeft16<1>(b), kr);
        t4 = _mm_xor_si128(RotateLeft16<1>(f), kr);
        a = _mm_xor_si128(_mm_sub_epi16(t1, t2), counter);
        e = _mm_xor_si128(_mm_sub_epi16(t3, t4), counter);

        counter = _mm_sub_epi16(counter, decrement);
    }

    // Repack the registers into the original block layout.
    block0 = RepackXMM<0>(a,b,c,d,e,f,g,h);
}
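// Descriptive note: decryption replays the schedule in reverse. The counter
// starts at rounds-1 and decrements, modular subtraction undoes the modular
// addition, and each round's rotation pair is inverted by rotating right by
// the amount the forward round rotated left.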
inline void CHAM64_Enc_2_Blocks(__m128i &block0,
    __m128i &block1, const word16 *subkeys, unsigned int /*rounds*/)
{
    // Rearrange the data for vectorization across both inputs.
    __m128i a = UnpackXMM<0>(block0, block1);
    __m128i b = UnpackXMM<1>(block0, block1);
    __m128i c = UnpackXMM<2>(block0, block1);
    __m128i d = UnpackXMM<3>(block0, block1);
    __m128i e = UnpackXMM<4>(block0, block1);
    __m128i f = UnpackXMM<5>(block0, block1);
    __m128i g = UnpackXMM<6>(block0, block1);
    __m128i h = UnpackXMM<7>(block0, block1);

    const unsigned int rounds = 80;
    __m128i counter = _mm_set_epi16(0,0,0,0,0,0,0,0);
    __m128i increment = _mm_set_epi16(1,1,1,1,1,1,1,1);

    const unsigned int MASK = 15;
    for (int i=0; i<static_cast<int>(rounds); i+=4)
    {
        __m128i k, kr, t1, t2, t3, t4;
        k = _mm_castpd_si128(_mm_load_sd((const double*)(&subkeys[i & MASK])));

        kr = _mm_shuffle_epi8(k, _mm_set_epi8(1,0,1,0, 1,0,1,0, 1,0,1,0, 1,0,1,0));

        t1 = _mm_xor_si128(a, counter);
        t3 = _mm_xor_si128(e, counter);
        t2 = _mm_xor_si128(RotateLeft16<1>(b), kr);
        t4 = _mm_xor_si128(RotateLeft16<1>(f), kr);
        a = RotateLeft16<8>(_mm_add_epi16(t1, t2));
        e = RotateLeft16<8>(_mm_add_epi16(t3, t4));

        counter = _mm_add_epi16(counter, increment);
        kr = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,3,2, 3,2,3,2, 3,2,3,2, 3,2,3,2));

        t1 = _mm_xor_si128(b, counter);
        t3 = _mm_xor_si128(f, counter);
        t2 = _mm_xor_si128(RotateLeft16<8>(c), kr);
        t4 = _mm_xor_si128(RotateLeft16<8>(g), kr);
        b = RotateLeft16<1>(_mm_add_epi16(t1, t2));
        f = RotateLeft16<1>(_mm_add_epi16(t3, t4));

        counter = _mm_add_epi16(counter, increment);
        kr = _mm_shuffle_epi8(k, _mm_set_epi8(5,4,5,4, 5,4,5,4, 5,4,5,4, 5,4,5,4));

        t1 = _mm_xor_si128(c, counter);
        t3 = _mm_xor_si128(g, counter);
        t2 = _mm_xor_si128(RotateLeft16<1>(d), kr);
        t4 = _mm_xor_si128(RotateLeft16<1>(h), kr);
        c = RotateLeft16<8>(_mm_add_epi16(t1, t2));
        g = RotateLeft16<8>(_mm_add_epi16(t3, t4));

        counter = _mm_add_epi16(counter, increment);
        kr = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,7,6, 7,6,7,6, 7,6,7,6, 7,6,7,6));

        t1 = _mm_xor_si128(d, counter);
        t3 = _mm_xor_si128(h, counter);
        t2 = _mm_xor_si128(RotateLeft16<8>(a), kr);
        t4 = _mm_xor_si128(RotateLeft16<8>(e), kr);
        d = RotateLeft16<1>(_mm_add_epi16(t1, t2));
        h = RotateLeft16<1>(_mm_add_epi16(t3, t4));

        counter = _mm_add_epi16(counter, increment);
    }

    // Repack the registers into the original block layout.
    block0 = RepackXMM<0>(a,b,c,d,e,f,g,h);
    block1 = RepackXMM<1>(a,b,c,d,e,f,g,h);
}
inline void CHAM64_Dec_2_Blocks(__m128i &block0,
    __m128i &block1, const word16 *subkeys, unsigned int /*rounds*/)
{
    // Rearrange the data for vectorization across both inputs.
    __m128i a = UnpackXMM<0>(block0, block1);
    __m128i b = UnpackXMM<1>(block0, block1);
    __m128i c = UnpackXMM<2>(block0, block1);
    __m128i d = UnpackXMM<3>(block0, block1);
    __m128i e = UnpackXMM<4>(block0, block1);
    __m128i f = UnpackXMM<5>(block0, block1);
    __m128i g = UnpackXMM<6>(block0, block1);
    __m128i h = UnpackXMM<7>(block0, block1);

    const unsigned int rounds = 80;
    __m128i counter = _mm_set_epi16(rounds-1,rounds-1,rounds-1,rounds-1, rounds-1,rounds-1,rounds-1,rounds-1);
    __m128i decrement = _mm_set_epi16(1,1,1,1,1,1,1,1);

    const unsigned int MASK = 15;
    for (int i = static_cast<int>(rounds)-1; i >= 0; i-=4)
    {
        __m128i k, kr, t1, t2, t3, t4;
        // Round keys are loaded four at a time and consumed in reverse order.
        k = _mm_castpd_si128(_mm_load_sd((const double*)(&subkeys[(i-3) & MASK])));

        kr = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,7,6, 7,6,7,6, 7,6,7,6, 7,6,7,6));

        t1 = RotateRight16<1>(d);
        t3 = RotateRight16<1>(h);
        t2 = _mm_xor_si128(RotateLeft16<8>(a), kr);
        t4 = _mm_xor_si128(RotateLeft16<8>(e), kr);
        d = _mm_xor_si128(_mm_sub_epi16(t1, t2), counter);
        h = _mm_xor_si128(_mm_sub_epi16(t3, t4), counter);

        counter = _mm_sub_epi16(counter, decrement);
        kr = _mm_shuffle_epi8(k, _mm_set_epi8(5,4,5,4, 5,4,5,4, 5,4,5,4, 5,4,5,4));

        t1 = RotateRight16<8>(c);
        t3 = RotateRight16<8>(g);
        t2 = _mm_xor_si128(RotateLeft16<1>(d), kr);
        t4 = _mm_xor_si128(RotateLeft16<1>(h), kr);
        c = _mm_xor_si128(_mm_sub_epi16(t1, t2), counter);
        g = _mm_xor_si128(_mm_sub_epi16(t3, t4), counter);

        counter = _mm_sub_epi16(counter, decrement);
        kr = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,3,2, 3,2,3,2, 3,2,3,2, 3,2,3,2));

        t1 = RotateRight16<1>(b);
        t3 = RotateRight16<1>(f);
        t2 = _mm_xor_si128(RotateLeft16<8>(c), kr);
        t4 = _mm_xor_si128(RotateLeft16<8>(g), kr);
        b = _mm_xor_si128(_mm_sub_epi16(t1, t2), counter);
        f = _mm_xor_si128(_mm_sub_epi16(t3, t4), counter);

        counter = _mm_sub_epi16(counter, decrement);
        kr = _mm_shuffle_epi8(k, _mm_set_epi8(1,0,1,0, 1,0,1,0, 1,0,1,0, 1,0,1,0));

        t1 = RotateRight16<8>(a);
        t3 = RotateRight16<8>(e);
        t2 = _mm_xor_si128(RotateLeft16<1>(b), kr);
        t4 = _mm_xor_si128(RotateLeft16<1>(f), kr);
        a = _mm_xor_si128(_mm_sub_epi16(t1, t2), counter);
        e = _mm_xor_si128(_mm_sub_epi16(t3, t4), counter);

        counter = _mm_sub_epi16(counter, decrement);
    }

    // Repack the registers into the original block layout.
    block0 = RepackXMM<0>(a,b,c,d,e,f,g,h);
    block1 = RepackXMM<1>(a,b,c,d,e,f,g,h);
}

NAMESPACE_END  // W16

NAMESPACE_BEGIN(W32)  // CHAM128, 32-bit word size
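// The W32 helpers below mirror the W16 helpers above for CHAM-128: 32-bit
// words, four words per 128-bit block, and _mm_*_epi32 operations in place
// of _mm_*_epi16. (Descriptive note only.)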
template <unsigned int R>
inline __m128i RotateLeft32(const __m128i& val)
{
#if defined(CRYPTOPP_AVX512_ROTATE)
    return _mm_rol_epi32(val, R);
#elif defined(__XOP__)
    return _mm_roti_epi32(val, R);
#else
    return _mm_or_si128(
        _mm_slli_epi32(val, R), _mm_srli_epi32(val, 32-R));
#endif
}

template <unsigned int R>
inline __m128i RotateRight32(const __m128i& val)
{
#if defined(CRYPTOPP_AVX512_ROTATE)
    return _mm_ror_epi32(val, R);
#elif defined(__XOP__)
    return _mm_roti_epi32(val, 32-R);
#else
    return _mm_or_si128(
        _mm_slli_epi32(val, 32-R), _mm_srli_epi32(val, R));
#endif
}
// Faster than two Shifts and an Or.
template <>
inline __m128i RotateLeft32<8>(const __m128i& val)
{
#if defined(__XOP__)
    return _mm_roti_epi32(val, 8);
#else
    const __m128i mask = _mm_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3);
    return _mm_shuffle_epi8(val, mask);
#endif
}

// Faster than two Shifts and an Or.
template <>
inline __m128i RotateRight32<8>(const __m128i& val)
{
#if defined(__XOP__)
    return _mm_roti_epi32(val, 32-8);
#else
    const __m128i mask = _mm_set_epi8(12,15,14,13, 8,11,10,9, 4,7,6,5, 0,3,2,1);
    return _mm_shuffle_epi8(val, mask);
#endif
}
template <unsigned int IDX>
inline __m128i UnpackXMM(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
{
    // Should not be instantiated
    CRYPTOPP_UNUSED(a); CRYPTOPP_UNUSED(b);
    CRYPTOPP_UNUSED(c); CRYPTOPP_UNUSED(d);
    CRYPTOPP_ASSERT(0);
    return _mm_setzero_si128();
}
template <>
inline __m128i UnpackXMM<0>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
{
    const __m128i r1 = _mm_unpacklo_epi32(a, b);
    const __m128i r2 = _mm_unpacklo_epi32(c, d);
    return _mm_shuffle_epi8(_mm_unpacklo_epi64(r1, r2),
        _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3));
}

template <>
inline __m128i UnpackXMM<1>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
{
    const __m128i r1 = _mm_unpacklo_epi32(a, b);
    const __m128i r2 = _mm_unpacklo_epi32(c, d);
    return _mm_shuffle_epi8(_mm_unpackhi_epi64(r1, r2),
        _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3));
}

template <>
inline __m128i UnpackXMM<2>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
{
    const __m128i r1 = _mm_unpackhi_epi32(a, b);
    const __m128i r2 = _mm_unpackhi_epi32(c, d);
    return _mm_shuffle_epi8(_mm_unpacklo_epi64(r1, r2),
        _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3));
}

template <>
inline __m128i UnpackXMM<3>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
{
    const __m128i r1 = _mm_unpackhi_epi32(a, b);
    const __m128i r2 = _mm_unpackhi_epi32(c, d);
    return _mm_shuffle_epi8(_mm_unpackhi_epi64(r1, r2),
        _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3));
}
template <unsigned int IDX>
inline __m128i UnpackXMM(const __m128i& v)
{
    // Should not be instantiated
    CRYPTOPP_UNUSED(v); CRYPTOPP_ASSERT(0);
    return _mm_setzero_si128();
}

template <>
inline __m128i UnpackXMM<0>(const __m128i& v)
{
    return _mm_shuffle_epi8(v, _mm_set_epi8(0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3));
}

template <>
inline __m128i UnpackXMM<1>(const __m128i& v)
{
    return _mm_shuffle_epi8(v, _mm_set_epi8(4,5,6,7, 4,5,6,7, 4,5,6,7, 4,5,6,7));
}

template <>
inline __m128i UnpackXMM<2>(const __m128i& v)
{
    return _mm_shuffle_epi8(v, _mm_set_epi8(8,9,10,11, 8,9,10,11, 8,9,10,11, 8,9,10,11));
}

template <>
inline __m128i UnpackXMM<3>(const __m128i& v)
{
    return _mm_shuffle_epi8(v, _mm_set_epi8(12,13,14,15, 12,13,14,15, 12,13,14,15, 12,13,14,15));
}
template <unsigned int IDX>
inline __m128i RepackXMM(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
{
    return UnpackXMM<IDX>(a, b, c, d);
}

template <unsigned int IDX>
inline __m128i RepackXMM(const __m128i& v)
{
    return UnpackXMM<IDX>(v);
}
inline void CHAM128_Enc_Block(__m128i &block0,
    const word32 *subkeys, unsigned int rounds)
{
    // Rearrange the data for vectorization. Each of a..d receives one
    // 32-bit word position of the block.
    __m128i a = UnpackXMM<0>(block0);
    __m128i b = UnpackXMM<1>(block0);
    __m128i c = UnpackXMM<2>(block0);
    __m128i d = UnpackXMM<3>(block0);

    __m128i counter = _mm_set_epi32(0,0,0,0);
    __m128i increment = _mm_set_epi32(1,1,1,1);

    const unsigned int MASK = (rounds == 80 ? 7 : 15);
    for (int i=0; i<static_cast<int>(rounds); i+=4)
    {
        __m128i k, k1, k2, t1, t2;
        // Each 64-bit load brings in two consecutive 32-bit subkeys;
        // k1 and k2 broadcast them to every lane.
        k = _mm_castpd_si128(_mm_load_sd((const double*)(&subkeys[(i+0) & MASK])));

        k1 = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0));
        k2 = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4));

        t1 = _mm_xor_si128(a, counter);
        t2 = _mm_xor_si128(RotateLeft32<1>(b), k1);
        a = RotateLeft32<8>(_mm_add_epi32(t1, t2));

        counter = _mm_add_epi32(counter, increment);

        t1 = _mm_xor_si128(b, counter);
        t2 = _mm_xor_si128(RotateLeft32<8>(c), k2);
        b = RotateLeft32<1>(_mm_add_epi32(t1, t2));

        counter = _mm_add_epi32(counter, increment);

        k = _mm_castpd_si128(_mm_load_sd((const double*)(&subkeys[(i+2) & MASK])));

        k1 = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0));
        k2 = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4));

        t1 = _mm_xor_si128(c, counter);
        t2 = _mm_xor_si128(RotateLeft32<1>(d), k1);
        c = RotateLeft32<8>(_mm_add_epi32(t1, t2));

        counter = _mm_add_epi32(counter, increment);

        t1 = _mm_xor_si128(d, counter);
        t2 = _mm_xor_si128(RotateLeft32<8>(a), k2);
        d = RotateLeft32<1>(_mm_add_epi32(t1, t2));

        counter = _mm_add_epi32(counter, increment);
    }

    // Repack the registers into the original block layout.
    block0 = RepackXMM<0>(a,b,c,d);
}
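// Descriptive note: the MASK computed above distinguishes the two CHAM-128
// key sizes handled here. rounds == 80 corresponds to the 8-entry round-key
// table of CHAM-128/128 (MASK = 7); otherwise the 16-entry table of
// CHAM-128/256 is assumed (MASK = 15).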
inline void CHAM128_Dec_Block(__m128i &block0,
    const word32 *subkeys, unsigned int rounds)
{
    // Rearrange the data for vectorization.
    __m128i a = UnpackXMM<0>(block0);
    __m128i b = UnpackXMM<1>(block0);
    __m128i c = UnpackXMM<2>(block0);
    __m128i d = UnpackXMM<3>(block0);

    __m128i counter = _mm_set_epi32(rounds-1,rounds-1,rounds-1,rounds-1);
    __m128i decrement = _mm_set_epi32(1,1,1,1);

    const unsigned int MASK = (rounds == 80 ? 7 : 15);
    for (int i = static_cast<int>(rounds)-1; i >= 0; i-=4)
    {
        __m128i k, k1, k2, t1, t2;
        // Round keys are loaded two at a time and consumed in reverse order.
        k = _mm_castpd_si128(_mm_load_sd((const double*)(&subkeys[(i-1) & MASK])));

        k1 = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4));
        k2 = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0));

        t1 = RotateRight32<1>(d);
        t2 = _mm_xor_si128(RotateLeft32<8>(a), k1);
        d = _mm_xor_si128(_mm_sub_epi32(t1, t2), counter);

        counter = _mm_sub_epi32(counter, decrement);

        t1 = RotateRight32<8>(c);
        t2 = _mm_xor_si128(RotateLeft32<1>(d), k2);
        c = _mm_xor_si128(_mm_sub_epi32(t1, t2), counter);

        counter = _mm_sub_epi32(counter, decrement);

        k = _mm_castpd_si128(_mm_load_sd((const double*)(&subkeys[(i-3) & MASK])));

        k1 = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4));
        k2 = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0));

        t1 = RotateRight32<1>(b);
        t2 = _mm_xor_si128(RotateLeft32<8>(c), k1);
        b = _mm_xor_si128(_mm_sub_epi32(t1, t2), counter);

        counter = _mm_sub_epi32(counter, decrement);

        t1 = RotateRight32<8>(a);
        t2 = _mm_xor_si128(RotateLeft32<1>(b), k2);
        a = _mm_xor_si128(_mm_sub_epi32(t1, t2), counter);

        counter = _mm_sub_epi32(counter, decrement);
    }

    // Repack the registers into the original block layout.
    block0 = RepackXMM<0>(a,b,c,d);
}
inline void CHAM128_Enc_4_Blocks(__m128i &block0, __m128i &block1,
    __m128i &block2, __m128i &block3, const word32 *subkeys, unsigned int rounds)
{
    // Rearrange the data for vectorization across the four inputs.
    __m128i a = UnpackXMM<0>(block0, block1, block2, block3);
    __m128i b = UnpackXMM<1>(block0, block1, block2, block3);
    __m128i c = UnpackXMM<2>(block0, block1, block2, block3);
    __m128i d = UnpackXMM<3>(block0, block1, block2, block3);

    __m128i counter = _mm_set_epi32(0,0,0,0);
    __m128i increment = _mm_set_epi32(1,1,1,1);

    const unsigned int MASK = (rounds == 80 ? 7 : 15);
    for (int i=0; i<static_cast<int>(rounds); i+=4)
    {
        __m128i k, k1, k2, t1, t2;
        k = _mm_castpd_si128(_mm_load_sd((const double*)(&subkeys[(i+0) & MASK])));

        k1 = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0));
        k2 = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4));

        t1 = _mm_xor_si128(a, counter);
        t2 = _mm_xor_si128(RotateLeft32<1>(b), k1);
        a = RotateLeft32<8>(_mm_add_epi32(t1, t2));

        counter = _mm_add_epi32(counter, increment);

        t1 = _mm_xor_si128(b, counter);
        t2 = _mm_xor_si128(RotateLeft32<8>(c), k2);
        b = RotateLeft32<1>(_mm_add_epi32(t1, t2));

        counter = _mm_add_epi32(counter, increment);
        k = _mm_castpd_si128(_mm_load_sd((const double*)(&subkeys[(i+2) & MASK])));

        k1 = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0));
        k2 = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4));

        t1 = _mm_xor_si128(c, counter);
        t2 = _mm_xor_si128(RotateLeft32<1>(d), k1);
        c = RotateLeft32<8>(_mm_add_epi32(t1, t2));

        counter = _mm_add_epi32(counter, increment);

        t1 = _mm_xor_si128(d, counter);
        t2 = _mm_xor_si128(RotateLeft32<8>(a), k2);
        d = RotateLeft32<1>(_mm_add_epi32(t1, t2));

        counter = _mm_add_epi32(counter, increment);
    }

    // Repack the registers into the original block layout.
    block0 = RepackXMM<0>(a,b,c,d);
    block1 = RepackXMM<1>(a,b,c,d);
    block2 = RepackXMM<2>(a,b,c,d);
    block3 = RepackXMM<3>(a,b,c,d);
}
inline void CHAM128_Dec_4_Blocks(__m128i &block0, __m128i &block1,
    __m128i &block2, __m128i &block3, const word32 *subkeys, unsigned int rounds)
{
    // Rearrange the data for vectorization across the four inputs.
    __m128i a = UnpackXMM<0>(block0, block1, block2, block3);
    __m128i b = UnpackXMM<1>(block0, block1, block2, block3);
    __m128i c = UnpackXMM<2>(block0, block1, block2, block3);
    __m128i d = UnpackXMM<3>(block0, block1, block2, block3);

    __m128i counter = _mm_set_epi32(rounds-1,rounds-1,rounds-1,rounds-1);
    __m128i decrement = _mm_set_epi32(1,1,1,1);

    const unsigned int MASK = (rounds == 80 ? 7 : 15);
    for (int i = static_cast<int>(rounds)-1; i >= 0; i-=4)
    {
        __m128i k, k1, k2, t1, t2;
        k = _mm_castpd_si128(_mm_load_sd((const double*)(&subkeys[(i-1) & MASK])));

        k1 = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4));
        k2 = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0));

        t1 = RotateRight32<1>(d);
        t2 = _mm_xor_si128(RotateLeft32<8>(a), k1);
        d = _mm_xor_si128(_mm_sub_epi32(t1, t2), counter);

        counter = _mm_sub_epi32(counter, decrement);

        t1 = RotateRight32<8>(c);
        t2 = _mm_xor_si128(RotateLeft32<1>(d), k2);
        c = _mm_xor_si128(_mm_sub_epi32(t1, t2), counter);

        counter = _mm_sub_epi32(counter, decrement);
        k = _mm_castpd_si128(_mm_load_sd((const double*)(&subkeys[(i-3) & MASK])));

        k1 = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4));
        k2 = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0));

        t1 = RotateRight32<1>(b);
        t2 = _mm_xor_si128(RotateLeft32<8>(c), k1);
        b = _mm_xor_si128(_mm_sub_epi32(t1, t2), counter);

        counter = _mm_sub_epi32(counter, decrement);

        t1 = RotateRight32<8>(a);
        t2 = _mm_xor_si128(RotateLeft32<1>(b), k2);
        a = _mm_xor_si128(_mm_sub_epi32(t1, t2), counter);

        counter = _mm_sub_epi32(counter, decrement);
    }

    // Repack the registers into the original block layout.
    block0 = RepackXMM<0>(a,b,c,d);
    block1 = RepackXMM<1>(a,b,c,d);
    block2 = RepackXMM<2>(a,b,c,d);
    block3 = RepackXMM<3>(a,b,c,d);
}
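// Descriptive note: the 4-block routines above transpose four independent
// 128-bit blocks so register a holds word 0 of every block, b holds word 1,
// and so on. One pass over the round loop therefore processes all four
// blocks in parallel.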
NAMESPACE_END  // W32

#endif  // CRYPTOPP_SSSE3_AVAILABLE

ANONYMOUS_NAMESPACE_END

NAMESPACE_BEGIN(CryptoPP)
#if defined(CRYPTOPP_SSSE3_AVAILABLE)
size_t CHAM64_Enc_AdvancedProcessBlocks_SSSE3(const word16* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks64_2x1_SSE(W16::CHAM64_Enc_Block, W16::CHAM64_Enc_2_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
size_t CHAM64_Dec_AdvancedProcessBlocks_SSSE3(const word16* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks64_2x1_SSE(W16::CHAM64_Dec_Block, W16::CHAM64_Dec_2_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
size_t CHAM128_Enc_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks128_4x1_SSE(W32::CHAM128_Enc_Block, W32::CHAM128_Enc_4_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
size_t CHAM128_Dec_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks128_4x1_SSE(W32::CHAM128_Dec_Block, W32::CHAM128_Dec_4_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
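// Usage note (a description of the expected library wiring, not behavior
// defined in this file): these four functions are the SSSE3 fast paths the
// portable CHAM implementation can dispatch to when the CPU reports SSSE3.
// The AdvancedProcessBlocks64_2x1_SSE and AdvancedProcessBlocks128_4x1_SSE
// templates from adv_simd.h handle block I/O, partial lengths and the
// xorBlocks/flags conventions, and call back into the W16/W32 workers above.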
#endif  // CRYPTOPP_SSSE3_AVAILABLE

NAMESPACE_END