#include "pch.h"
#include "config.h"

#include "simon.h"
#include "misc.h"
#include "adv_simd.h"

#if (CRYPTOPP_SSSE3_AVAILABLE)
# include <pmmintrin.h>
# include <tmmintrin.h>
#endif

#if (CRYPTOPP_SSE41_AVAILABLE)
# include <smmintrin.h>
#endif

#if defined(__XOP__)
# include <ammintrin.h>
#endif

#if defined(__AVX512F__)
# define CRYPTOPP_AVX512_ROTATE 1
# include <immintrin.h>
#endif

#if (CRYPTOPP_ARM_NEON_AVAILABLE)
# include <arm_neon.h>
#endif

#if (CRYPTOPP_ARM_ACLE_AVAILABLE)
# include <stdint.h>
# include <arm_acle.h>
#endif

#if defined(CRYPTOPP_ALTIVEC_AVAILABLE)
# include "ppc_simd.h"
#endif

// Squash MS LNK4221 and libtool warnings
extern const char SIMON64_SIMD_FNAME[] = __FILE__;
ANONYMOUS_NAMESPACE_BEGIN

using CryptoPP::word32;
using CryptoPP::word64;
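// For orientation: one SIMON64 encryption round on the 32-bit halves (x, y)
// with round key k can be written in scalar form roughly as the sketch
// below. The sketch is illustrative only; the authoritative definition of
// the cipher lives in simon.h/simon.cpp.
//
//   word32 f = (rotlConstant<1>(x) & rotlConstant<8>(x)) ^ rotlConstant<2>(x);
//   y ^= f ^ k;
//   std::swap(x, y);
//
// The SIMD kernels below apply this round to several blocks in parallel and
// fold the swap away by alternating the y- and x-updates (two rounds per
// loop iteration), with a trailing fix-up when the round count is odd.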
// ***************************** ARM NEON ***************************** //

#if (CRYPTOPP_ARM_NEON_AVAILABLE)

template <class T>
inline T UnpackHigh32(const T& a, const T& b)
{
    const uint32x2_t x(vget_high_u32((uint32x4_t)a));
    const uint32x2_t y(vget_high_u32((uint32x4_t)b));
    const uint32x2x2_t r = vzip_u32(x, y);
    return (T)vcombine_u32(r.val[0], r.val[1]);
}

template <class T>
inline T UnpackLow32(const T& a, const T& b)
{
    const uint32x2_t x(vget_low_u32((uint32x4_t)a));
    const uint32x2_t y(vget_low_u32((uint32x4_t)b));
    const uint32x2x2_t r = vzip_u32(x, y);
    return (T)vcombine_u32(r.val[0], r.val[1]);
}
template <unsigned int R>
inline uint32x4_t RotateLeft32(const uint32x4_t& val)
{
    const uint32x4_t a(vshlq_n_u32(val, R));
    const uint32x4_t b(vshrq_n_u32(val, 32 - R));
    return vorrq_u32(a, b);
}

template <unsigned int R>
inline uint32x4_t RotateRight32(const uint32x4_t& val)
{
    const uint32x4_t a(vshlq_n_u32(val, 32 - R));
    const uint32x4_t b(vshrq_n_u32(val, R));
    return vorrq_u32(a, b);
}
#if defined(__aarch32__) || defined(__aarch64__)
// Faster than two Shifts and an Or.
template <>
inline uint32x4_t RotateLeft32<8>(const uint32x4_t& val)
{
    const uint8_t maskb[16] = { 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 };
    const uint8x16_t mask = vld1q_u8(maskb);

    return vreinterpretq_u32_u8(
        vqtbl1q_u8(vreinterpretq_u8_u32(val), mask));
}

// Faster than two Shifts and an Or.
template <>
inline uint32x4_t RotateRight32<8>(const uint32x4_t& val)
{
    const uint8_t maskb[16] = { 1,2,3,0, 5,6,7,4, 9,10,11,8, 13,14,15,12 };
    const uint8x16_t mask = vld1q_u8(maskb);

    return vreinterpretq_u32_u8(
        vqtbl1q_u8(vreinterpretq_u8_u32(val), mask));
}
#endif
inline uint32x4_t SIMON64_f(const uint32x4_t& val)
{
    return veorq_u32(RotateLeft32<2>(val),
        vandq_u32(RotateLeft32<1>(val), RotateLeft32<8>(val)));
}
inline void SIMON64_Enc_Block(uint32x4_t &block1, uint32x4_t &block0,
    const word32 *subkeys, unsigned int rounds)
{
    // [A1 A2 A3 A4][B1 B2 B3 B4] => [A1 A3 B1 B3][A2 A4 B2 B4]
    uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
    uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];

    for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
    {
        const uint32x4_t rk1 = vld1q_dup_u32(subkeys+i);
        y1 = veorq_u32(veorq_u32(y1, SIMON64_f(x1)), rk1);

        const uint32x4_t rk2 = vld1q_dup_u32(subkeys+i+1);
        x1 = veorq_u32(veorq_u32(x1, SIMON64_f(y1)), rk2);
    }

    if (rounds & 1)
    {
        const uint32x4_t rk = vld1q_dup_u32(subkeys+rounds-1);

        y1 = veorq_u32(veorq_u32(y1, SIMON64_f(x1)), rk);
        std::swap(x1, y1);
    }

    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
    block0 = UnpackLow32(y1, x1);
    block1 = UnpackHigh32(y1, x1);
}
inline void SIMON64_Dec_Block(uint32x4_t &block0, uint32x4_t &block1,
    const word32 *subkeys, unsigned int rounds)
{
    // [A1 A2 A3 A4][B1 B2 B3 B4] => [A1 A3 B1 B3][A2 A4 B2 B4]
    uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
    uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];

    if (rounds & 1)
    {
        std::swap(x1, y1);
        const uint32x4_t rk = vld1q_dup_u32(subkeys + rounds - 1);

        y1 = veorq_u32(veorq_u32(y1, rk), SIMON64_f(x1));
        rounds--;
    }

    for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
    {
        const uint32x4_t rk1 = vld1q_dup_u32(subkeys+i+1);
        x1 = veorq_u32(veorq_u32(x1, SIMON64_f(y1)), rk1);

        const uint32x4_t rk2 = vld1q_dup_u32(subkeys+i);
        y1 = veorq_u32(veorq_u32(y1, SIMON64_f(x1)), rk2);
    }

    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
    block0 = UnpackLow32(y1, x1);
    block1 = UnpackHigh32(y1, x1);
}
inline void SIMON64_Enc_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
    uint32x4_t &block2, uint32x4_t &block3, uint32x4_t &block4, uint32x4_t &block5,
    const word32 *subkeys, unsigned int rounds)
{
    // [A1 A2 A3 A4][B1 B2 B3 B4] => [A1 A3 B1 B3][A2 A4 B2 B4]
    uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
    uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];
    uint32x4_t x2 = vuzpq_u32(block2, block3).val[1];
    uint32x4_t y2 = vuzpq_u32(block2, block3).val[0];
    uint32x4_t x3 = vuzpq_u32(block4, block5).val[1];
    uint32x4_t y3 = vuzpq_u32(block4, block5).val[0];

    for (int i = 0; i < static_cast<int>(rounds & ~1) - 1; i += 2)
    {
        const uint32x4_t rk1 = vld1q_dup_u32(subkeys+i);
        y1 = veorq_u32(veorq_u32(y1, SIMON64_f(x1)), rk1);
        y2 = veorq_u32(veorq_u32(y2, SIMON64_f(x2)), rk1);
        y3 = veorq_u32(veorq_u32(y3, SIMON64_f(x3)), rk1);

        const uint32x4_t rk2 = vld1q_dup_u32(subkeys+i+1);
        x1 = veorq_u32(veorq_u32(x1, SIMON64_f(y1)), rk2);
        x2 = veorq_u32(veorq_u32(x2, SIMON64_f(y2)), rk2);
        x3 = veorq_u32(veorq_u32(x3, SIMON64_f(y3)), rk2);
    }

    if (rounds & 1)
    {
        const uint32x4_t rk = vld1q_dup_u32(subkeys + rounds - 1);

        y1 = veorq_u32(veorq_u32(y1, SIMON64_f(x1)), rk);
        y2 = veorq_u32(veorq_u32(y2, SIMON64_f(x2)), rk);
        y3 = veorq_u32(veorq_u32(y3, SIMON64_f(x3)), rk);
        std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3);
    }

    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
    block0 = UnpackLow32(y1, x1);
    block1 = UnpackHigh32(y1, x1);
    block2 = UnpackLow32(y2, x2);
    block3 = UnpackHigh32(y2, x2);
    block4 = UnpackLow32(y3, x3);
    block5 = UnpackHigh32(y3, x3);
}
inline void SIMON64_Dec_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
    uint32x4_t &block2, uint32x4_t &block3, uint32x4_t &block4, uint32x4_t &block5,
    const word32 *subkeys, unsigned int rounds)
{
    // [A1 A2 A3 A4][B1 B2 B3 B4] => [A1 A3 B1 B3][A2 A4 B2 B4]
    uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
    uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];
    uint32x4_t x2 = vuzpq_u32(block2, block3).val[1];
    uint32x4_t y2 = vuzpq_u32(block2, block3).val[0];
    uint32x4_t x3 = vuzpq_u32(block4, block5).val[1];
    uint32x4_t y3 = vuzpq_u32(block4, block5).val[0];

    if (rounds & 1)
    {
        std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3);
        const uint32x4_t rk = vld1q_dup_u32(subkeys + rounds - 1);

        y1 = veorq_u32(veorq_u32(y1, rk), SIMON64_f(x1));
        y2 = veorq_u32(veorq_u32(y2, rk), SIMON64_f(x2));
        y3 = veorq_u32(veorq_u32(y3, rk), SIMON64_f(x3));
        rounds--;
    }

    for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
    {
        const uint32x4_t rk1 = vld1q_dup_u32(subkeys + i + 1);
        x1 = veorq_u32(veorq_u32(x1, SIMON64_f(y1)), rk1);
        x2 = veorq_u32(veorq_u32(x2, SIMON64_f(y2)), rk1);
        x3 = veorq_u32(veorq_u32(x3, SIMON64_f(y3)), rk1);

        const uint32x4_t rk2 = vld1q_dup_u32(subkeys + i);
        y1 = veorq_u32(veorq_u32(y1, SIMON64_f(x1)), rk2);
        y2 = veorq_u32(veorq_u32(y2, SIMON64_f(x2)), rk2);
        y3 = veorq_u32(veorq_u32(y3, SIMON64_f(x3)), rk2);
    }

    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
    block0 = UnpackLow32(y1, x1);
    block1 = UnpackHigh32(y1, x1);
    block2 = UnpackLow32(y2, x2);
    block3 = UnpackHigh32(y2, x2);
    block4 = UnpackLow32(y3, x3);
    block5 = UnpackHigh32(y3, x3);
}
#endif // CRYPTOPP_ARM_NEON_AVAILABLE

// ***************************** IA-32 ***************************** //

#if defined(CRYPTOPP_SSE41_AVAILABLE)

inline void Swap128(__m128i& a, __m128i& b)
{
#if defined(__SUNPRO_CC) && (__SUNPRO_CC <= 0x5120)
    // __m128i is an unsigned long long[2]. Swapping it was not added until
    // C++11, and older SunCC fails to consume std::swap here.
    vec_swap(a, b);
#else
    std::swap(a, b);
#endif
}

template <unsigned int R>
inline __m128i RotateLeft32(const __m128i& val)
{
#if defined(__XOP__)
    return _mm_roti_epi32(val, R);
#else
    return _mm_or_si128(
        _mm_slli_epi32(val, R), _mm_srli_epi32(val, 32-R));
#endif
}

template <unsigned int R>
inline __m128i RotateRight32(const __m128i& val)
{
#if defined(__XOP__)
    return _mm_roti_epi32(val, 32-R);
#else
    return _mm_or_si128(
        _mm_slli_epi32(val, 32-R), _mm_srli_epi32(val, R));
#endif
}
// Faster than two Shifts and an Or.
template <>
__m128i RotateLeft32<8>(const __m128i& val)
{
#if defined(__XOP__)
    return _mm_roti_epi32(val, 8);
#else
    const __m128i mask = _mm_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3);
    return _mm_shuffle_epi8(val, mask);
#endif
}

// Faster than two Shifts and an Or.
template <>
__m128i RotateRight32<8>(const __m128i& val)
{
#if defined(__XOP__)
    return _mm_roti_epi32(val, 32-8);
#else
    const __m128i mask = _mm_set_epi8(12,15,14,13, 8,11,10,9, 4,7,6,5, 0,3,2,1);
    return _mm_shuffle_epi8(val, mask);
#endif
}
inline __m128i SIMON64_f(const __m128i& v)
{
    return _mm_xor_si128(RotateLeft32<2>(v),
        _mm_and_si128(RotateLeft32<1>(v), RotateLeft32<8>(v)));
}
inline void SIMON64_Enc_Block(__m128i &block0, __m128i &block1,
    const word32 *subkeys, unsigned int rounds)
{
    // [A1 A2 A3 A4][B1 B2 B3 B4] => [A1 A3 B1 B3][A2 A4 B2 B4]
    const __m128 t0 = _mm_castsi128_ps(block0);
    const __m128 t1 = _mm_castsi128_ps(block1);
    __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
    __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));

    for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
    {
        const __m128i rk1 = _mm_set1_epi32(subkeys[i]);
        y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON64_f(x1)), rk1);

        const __m128i rk2 = _mm_set1_epi32(subkeys[i+1]);
        x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON64_f(y1)), rk2);
    }

    if (rounds & 1)
    {
        const __m128i rk = _mm_set1_epi32(subkeys[rounds-1]);
        y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON64_f(x1)), rk);
        Swap128(x1, y1);
    }

    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
    block0 = _mm_unpacklo_epi32(y1, x1);
    block1 = _mm_unpackhi_epi32(y1, x1);
}
inline void SIMON64_Dec_Block(__m128i &block0, __m128i &block1,
    const word32 *subkeys, unsigned int rounds)
{
    // [A1 A2 A3 A4][B1 B2 B3 B4] => [A1 A3 B1 B3][A2 A4 B2 B4]
    const __m128 t0 = _mm_castsi128_ps(block0);
    const __m128 t1 = _mm_castsi128_ps(block1);
    __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
    __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));

    if (rounds & 1)
    {
        Swap128(x1, y1);
        const __m128i rk = _mm_set1_epi32(subkeys[rounds-1]);
        y1 = _mm_xor_si128(_mm_xor_si128(y1, rk), SIMON64_f(x1));
        rounds--;
    }

    for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
    {
        const __m128i rk1 = _mm_set1_epi32(subkeys[i+1]);
        x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON64_f(y1)), rk1);

        const __m128i rk2 = _mm_set1_epi32(subkeys[i]);
        y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON64_f(x1)), rk2);
    }

    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
    block0 = _mm_unpacklo_epi32(y1, x1);
    block1 = _mm_unpackhi_epi32(y1, x1);
}
inline void SIMON64_Enc_6_Blocks(__m128i &block0, __m128i &block1,
    __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
    const word32 *subkeys, unsigned int rounds)
{
    // [A1 A2 A3 A4][B1 B2 B3 B4] => [A1 A3 B1 B3][A2 A4 B2 B4]
    const __m128 t0 = _mm_castsi128_ps(block0);
    const __m128 t1 = _mm_castsi128_ps(block1);
    __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
    __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));

    const __m128 t2 = _mm_castsi128_ps(block2);
    const __m128 t3 = _mm_castsi128_ps(block3);
    __m128i x2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(3,1,3,1)));
    __m128i y2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(2,0,2,0)));

    const __m128 t4 = _mm_castsi128_ps(block4);
    const __m128 t5 = _mm_castsi128_ps(block5);
    __m128i x3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(3,1,3,1)));
    __m128i y3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(2,0,2,0)));

    for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
    {
        const __m128i rk1 = _mm_set1_epi32(subkeys[i]);
        y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON64_f(x1)), rk1);
        y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON64_f(x2)), rk1);
        y3 = _mm_xor_si128(_mm_xor_si128(y3, SIMON64_f(x3)), rk1);

        const __m128i rk2 = _mm_set1_epi32(subkeys[i+1]);
        x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON64_f(y1)), rk2);
        x2 = _mm_xor_si128(_mm_xor_si128(x2, SIMON64_f(y2)), rk2);
        x3 = _mm_xor_si128(_mm_xor_si128(x3, SIMON64_f(y3)), rk2);
    }

    if (rounds & 1)
    {
        const __m128i rk = _mm_set1_epi32(subkeys[rounds-1]);
        y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON64_f(x1)), rk);
        y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON64_f(x2)), rk);
        y3 = _mm_xor_si128(_mm_xor_si128(y3, SIMON64_f(x3)), rk);
        Swap128(x1, y1); Swap128(x2, y2); Swap128(x3, y3);
    }

    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
    block0 = _mm_unpacklo_epi32(y1, x1);
    block1 = _mm_unpackhi_epi32(y1, x1);
    block2 = _mm_unpacklo_epi32(y2, x2);
    block3 = _mm_unpackhi_epi32(y2, x2);
    block4 = _mm_unpacklo_epi32(y3, x3);
    block5 = _mm_unpackhi_epi32(y3, x3);
}
inline void SIMON64_Dec_6_Blocks(__m128i &block0, __m128i &block1,
    __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
    const word32 *subkeys, unsigned int rounds)
{
    // [A1 A2 A3 A4][B1 B2 B3 B4] => [A1 A3 B1 B3][A2 A4 B2 B4]
    const __m128 t0 = _mm_castsi128_ps(block0);
    const __m128 t1 = _mm_castsi128_ps(block1);
    __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
    __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));

    const __m128 t2 = _mm_castsi128_ps(block2);
    const __m128 t3 = _mm_castsi128_ps(block3);
    __m128i x2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(3,1,3,1)));
    __m128i y2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(2,0,2,0)));

    const __m128 t4 = _mm_castsi128_ps(block4);
    const __m128 t5 = _mm_castsi128_ps(block5);
    __m128i x3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(3,1,3,1)));
    __m128i y3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(2,0,2,0)));

    if (rounds & 1)
    {
        Swap128(x1, y1); Swap128(x2, y2); Swap128(x3, y3);
        const __m128i rk = _mm_set1_epi32(subkeys[rounds-1]);
        y1 = _mm_xor_si128(_mm_xor_si128(y1, rk), SIMON64_f(x1));
        y2 = _mm_xor_si128(_mm_xor_si128(y2, rk), SIMON64_f(x2));
        y3 = _mm_xor_si128(_mm_xor_si128(y3, rk), SIMON64_f(x3));
        rounds--;
    }

    for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
    {
        const __m128i rk1 = _mm_set1_epi32(subkeys[i+1]);
        x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON64_f(y1)), rk1);
        x2 = _mm_xor_si128(_mm_xor_si128(x2, SIMON64_f(y2)), rk1);
        x3 = _mm_xor_si128(_mm_xor_si128(x3, SIMON64_f(y3)), rk1);

        const __m128i rk2 = _mm_set1_epi32(subkeys[i]);
        y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON64_f(x1)), rk2);
        y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON64_f(x2)), rk2);
        y3 = _mm_xor_si128(_mm_xor_si128(y3, SIMON64_f(x3)), rk2);
    }

    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
    block0 = _mm_unpacklo_epi32(y1, x1);
    block1 = _mm_unpackhi_epi32(y1, x1);
    block2 = _mm_unpacklo_epi32(y2, x2);
    block3 = _mm_unpackhi_epi32(y2, x2);
    block4 = _mm_unpacklo_epi32(y3, x3);
    block5 = _mm_unpackhi_epi32(y3, x3);
}
#endif // CRYPTOPP_SSE41_AVAILABLE

// ***************************** Altivec ***************************** //

#if defined(CRYPTOPP_ALTIVEC_AVAILABLE)

using CryptoPP::uint8x16_p;
using CryptoPP::uint32x4_p;

using CryptoPP::VecAnd;
using CryptoPP::VecXor;
using CryptoPP::VecLoad;
using CryptoPP::VecLoadBE;
using CryptoPP::VecPermute;

// Rotate left by bit count
template<unsigned int C>
inline uint32x4_p RotateLeft32(const uint32x4_p val)
{
    const uint32x4_p m = {C, C, C, C};
    return vec_rl(val, m);
}

// Rotate right by bit count
template<unsigned int C>
inline uint32x4_p RotateRight32(const uint32x4_p val)
{
    const uint32x4_p m = {32-C, 32-C, 32-C, 32-C};
    return vec_rl(val, m);
}

inline uint32x4_p SIMON64_f(const uint32x4_p val)
{
    return VecXor(RotateLeft32<2>(val),
        VecAnd(RotateLeft32<1>(val), RotateLeft32<8>(val)));
}
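// The Altivec kernels below mirror the NEON and SSE code paths above. The
// endian-dependent byte masks m1/m2 split each pair of input vectors into
// the x/y working halves, and m3/m4 merge them back after the rounds. On
// POWER8 and later the 32-bit subkeys are broadcast with vec_splats(); the
// pre-POWER8 path broadcasts the key word with a load followed by a byte
// permute instead.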
inline void SIMON64_Enc_Block(uint32x4_p &block0, uint32x4_p &block1,
    const word32 *subkeys, unsigned int rounds)
{
#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m1 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
    const uint8x16_p m2 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
#else
    const uint8x16_p m1 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
    const uint8x16_p m2 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
#endif

    // [A1 A2 A3 A4][B1 B2 B3 B4] => [A1 A3 B1 B3][A2 A4 B2 B4]
    uint32x4_p x1 = VecPermute(block0, block1, m1);
    uint32x4_p y1 = VecPermute(block0, block1, m2);

    for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
    {
#if CRYPTOPP_POWER8_AVAILABLE
        const uint32x4_p rk1 = vec_splats(subkeys[i]);
        const uint32x4_p rk2 = vec_splats(subkeys[i+1]);
#else
        // subkeys has extra elements so memory backs the last subkey
        const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
        const uint32x4_p rk1 = VecPermute(VecLoad(subkeys+i), m);
        const uint32x4_p rk2 = VecPermute(VecLoad(subkeys+i+1), m);
#endif

        y1 = VecXor(VecXor(y1, SIMON64_f(x1)), rk1);
        x1 = VecXor(VecXor(x1, SIMON64_f(y1)), rk2);
    }

    if (rounds & 1)
    {
#if CRYPTOPP_POWER8_AVAILABLE
        const uint32x4_p rk = vec_splats(subkeys[rounds-1]);
#else
        // subkeys has extra elements so memory backs the last subkey
        const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
        const uint32x4_p rk = VecPermute(VecLoad(subkeys+rounds-1), m);
#endif

        y1 = VecXor(VecXor(y1, SIMON64_f(x1)), rk);
        std::swap(x1, y1);
    }

#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m3 = {19,18,17,16, 3,2,1,0, 23,22,21,20, 7,6,5,4};
    const uint8x16_p m4 = {27,26,25,24, 11,10,9,8, 31,30,29,28, 15,14,13,12};
#else
    const uint8x16_p m3 = {3,2,1,0, 19,18,17,16, 7,6,5,4, 23,22,21,20};
    const uint8x16_p m4 = {11,10,9,8, 27,26,25,24, 15,14,13,12, 31,30,29,28};
#endif

    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
    block0 = VecPermute(x1, y1, m3);
    block1 = VecPermute(x1, y1, m4);
}
inline void SIMON64_Dec_Block(uint32x4_p &block0, uint32x4_p &block1,
    const word32 *subkeys, unsigned int rounds)
{
#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m1 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
    const uint8x16_p m2 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
#else
    const uint8x16_p m1 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
    const uint8x16_p m2 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
#endif

    // [A1 A2 A3 A4][B1 B2 B3 B4] => [A1 A3 B1 B3][A2 A4 B2 B4]
    uint32x4_p x1 = VecPermute(block0, block1, m1);
    uint32x4_p y1 = VecPermute(block0, block1, m2);

    if (rounds & 1)
    {
        std::swap(x1, y1);
#if CRYPTOPP_POWER8_AVAILABLE
        const uint32x4_p rk = vec_splats(subkeys[rounds-1]);
#else
        // subkeys has extra elements so memory backs the last subkey
        const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
        const uint32x4_p rk = VecPermute(VecLoad(subkeys+rounds-1), m);
#endif

        y1 = VecXor(VecXor(y1, rk), SIMON64_f(x1));
        rounds--;
    }

    for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
    {
#if CRYPTOPP_POWER8_AVAILABLE
        const uint32x4_p rk1 = vec_splats(subkeys[i+1]);
        const uint32x4_p rk2 = vec_splats(subkeys[i]);
#else
        // subkeys has extra elements so memory backs the last subkey
        const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
        const uint32x4_p rk1 = VecPermute(VecLoad(subkeys+i+1), m);
        const uint32x4_p rk2 = VecPermute(VecLoad(subkeys+i), m);
#endif

        x1 = VecXor(VecXor(x1, SIMON64_f(y1)), rk1);
        y1 = VecXor(VecXor(y1, SIMON64_f(x1)), rk2);
    }

#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m3 = {19,18,17,16, 3,2,1,0, 23,22,21,20, 7,6,5,4};
    const uint8x16_p m4 = {27,26,25,24, 11,10,9,8, 31,30,29,28, 15,14,13,12};
#else
    const uint8x16_p m3 = {3,2,1,0, 19,18,17,16, 7,6,5,4, 23,22,21,20};
    const uint8x16_p m4 = {11,10,9,8, 27,26,25,24, 15,14,13,12, 31,30,29,28};
#endif

    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
    block0 = VecPermute(x1, y1, m3);
    block1 = VecPermute(x1, y1, m4);
}
inline void SIMON64_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
    uint32x4_p &block2, uint32x4_p &block3, uint32x4_p &block4,
    uint32x4_p &block5, const word32 *subkeys, unsigned int rounds)
{
#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m1 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
    const uint8x16_p m2 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
#else
    const uint8x16_p m1 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
    const uint8x16_p m2 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
#endif

    // [A1 A2 A3 A4][B1 B2 B3 B4] => [A1 A3 B1 B3][A2 A4 B2 B4]
    uint32x4_p x1 = VecPermute(block0, block1, m1);
    uint32x4_p y1 = VecPermute(block0, block1, m2);
    uint32x4_p x2 = VecPermute(block2, block3, m1);
    uint32x4_p y2 = VecPermute(block2, block3, m2);
    uint32x4_p x3 = VecPermute(block4, block5, m1);
    uint32x4_p y3 = VecPermute(block4, block5, m2);

    for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
    {
#if CRYPTOPP_POWER8_AVAILABLE
        const uint32x4_p rk1 = vec_splats(subkeys[i]);
        const uint32x4_p rk2 = vec_splats(subkeys[i+1]);
#else
        // subkeys has extra elements so memory backs the last subkey
        const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
        const uint32x4_p rk1 = VecPermute(VecLoad(subkeys+i), m);
        const uint32x4_p rk2 = VecPermute(VecLoad(subkeys+i+1), m);
#endif

        y1 = VecXor(VecXor(y1, SIMON64_f(x1)), rk1);
        y2 = VecXor(VecXor(y2, SIMON64_f(x2)), rk1);
        y3 = VecXor(VecXor(y3, SIMON64_f(x3)), rk1);

        x1 = VecXor(VecXor(x1, SIMON64_f(y1)), rk2);
        x2 = VecXor(VecXor(x2, SIMON64_f(y2)), rk2);
        x3 = VecXor(VecXor(x3, SIMON64_f(y3)), rk2);
    }

    if (rounds & 1)
    {
#if CRYPTOPP_POWER8_AVAILABLE
        const uint32x4_p rk = vec_splats(subkeys[rounds-1]);
#else
        // subkeys has extra elements so memory backs the last subkey
        const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
        const uint32x4_p rk = VecPermute(VecLoad(subkeys+rounds-1), m);
#endif

        y1 = VecXor(VecXor(y1, SIMON64_f(x1)), rk);
        y2 = VecXor(VecXor(y2, SIMON64_f(x2)), rk);
        y3 = VecXor(VecXor(y3, SIMON64_f(x3)), rk);
        std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3);
    }

#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m3 = {19,18,17,16, 3,2,1,0, 23,22,21,20, 7,6,5,4};
    const uint8x16_p m4 = {27,26,25,24, 11,10,9,8, 31,30,29,28, 15,14,13,12};
#else
    const uint8x16_p m3 = {3,2,1,0, 19,18,17,16, 7,6,5,4, 23,22,21,20};
    const uint8x16_p m4 = {11,10,9,8, 27,26,25,24, 15,14,13,12, 31,30,29,28};
#endif

    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
    block0 = VecPermute(x1, y1, m3);
    block1 = VecPermute(x1, y1, m4);
    block2 = VecPermute(x2, y2, m3);
    block3 = VecPermute(x2, y2, m4);
    block4 = VecPermute(x3, y3, m3);
    block5 = VecPermute(x3, y3, m4);
}
inline void SIMON64_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
    uint32x4_p &block2, uint32x4_p &block3, uint32x4_p &block4,
    uint32x4_p &block5, const word32 *subkeys, unsigned int rounds)
{
#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m1 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
    const uint8x16_p m2 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
#else
    const uint8x16_p m1 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
    const uint8x16_p m2 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
#endif

    // [A1 A2 A3 A4][B1 B2 B3 B4] => [A1 A3 B1 B3][A2 A4 B2 B4]
    uint32x4_p x1 = VecPermute(block0, block1, m1);
    uint32x4_p y1 = VecPermute(block0, block1, m2);
    uint32x4_p x2 = VecPermute(block2, block3, m1);
    uint32x4_p y2 = VecPermute(block2, block3, m2);
    uint32x4_p x3 = VecPermute(block4, block5, m1);
    uint32x4_p y3 = VecPermute(block4, block5, m2);

    if (rounds & 1)
    {
        std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3);
#if CRYPTOPP_POWER8_AVAILABLE
        const uint32x4_p rk = vec_splats(subkeys[rounds-1]);
#else
        // subkeys has extra elements so memory backs the last subkey
        const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
        const uint32x4_p rk = VecPermute(VecLoad(subkeys+rounds-1), m);
#endif

        y1 = VecXor(VecXor(y1, rk), SIMON64_f(x1));
        y2 = VecXor(VecXor(y2, rk), SIMON64_f(x2));
        y3 = VecXor(VecXor(y3, rk), SIMON64_f(x3));
        rounds--;
    }

    for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
    {
#if CRYPTOPP_POWER8_AVAILABLE
        const uint32x4_p rk1 = vec_splats(subkeys[i+1]);
        const uint32x4_p rk2 = vec_splats(subkeys[i]);
#else
        // subkeys has extra elements so memory backs the last subkey
        const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
        const uint32x4_p rk1 = VecPermute(VecLoad(subkeys+i+1), m);
        const uint32x4_p rk2 = VecPermute(VecLoad(subkeys+i), m);
#endif

        x1 = VecXor(VecXor(x1, SIMON64_f(y1)), rk1);
        x2 = VecXor(VecXor(x2, SIMON64_f(y2)), rk1);
        x3 = VecXor(VecXor(x3, SIMON64_f(y3)), rk1);

        y1 = VecXor(VecXor(y1, SIMON64_f(x1)), rk2);
        y2 = VecXor(VecXor(y2, SIMON64_f(x2)), rk2);
        y3 = VecXor(VecXor(y3, SIMON64_f(x3)), rk2);
    }

#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m3 = {19,18,17,16, 3,2,1,0, 23,22,21,20, 7,6,5,4};
    const uint8x16_p m4 = {27,26,25,24, 11,10,9,8, 31,30,29,28, 15,14,13,12};
#else
    const uint8x16_p m3 = {3,2,1,0, 19,18,17,16, 7,6,5,4, 23,22,21,20};
    const uint8x16_p m4 = {11,10,9,8, 27,26,25,24, 15,14,13,12, 31,30,29,28};
#endif

    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
    block0 = VecPermute(x1, y1, m3);
    block1 = VecPermute(x1, y1, m4);
    block2 = VecPermute(x2, y2, m3);
    block3 = VecPermute(x2, y2, m4);
    block4 = VecPermute(x3, y3, m3);
    block5 = VecPermute(x3, y3, m4);
}

#endif // CRYPTOPP_ALTIVEC_AVAILABLE

ANONYMOUS_NAMESPACE_END

///////////////////////////////////////////////////////////////////////

NAMESPACE_BEGIN(CryptoPP)
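// The exported functions below are thin shims that bind the block kernels
// above to the shared AdvancedProcessBlocks64_6x2_* drivers from adv_simd.h.
// A hedged sketch of a typical call site follows; the member names m_rkeys
// and m_rounds are illustrative assumptions, and the real wiring lives in
// simon.cpp:
//
//   #if (CRYPTOPP_ARM_NEON_AVAILABLE)
//       if (HasNEON())
//           return SIMON64_Enc_AdvancedProcessBlocks_NEON(m_rkeys, m_rounds,
//               inBlocks, xorBlocks, outBlocks, length, flags);
//   #endif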
#if (CRYPTOPP_ARM_NEON_AVAILABLE)
size_t SIMON64_Enc_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks64_6x2_NEON(SIMON64_Enc_Block, SIMON64_Enc_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}

size_t SIMON64_Dec_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks64_6x2_NEON(SIMON64_Dec_Block, SIMON64_Dec_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
#endif // CRYPTOPP_ARM_NEON_AVAILABLE

#if defined(CRYPTOPP_SSE41_AVAILABLE)
size_t SIMON64_Enc_AdvancedProcessBlocks_SSE41(const word32* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks64_6x2_SSE(SIMON64_Enc_Block, SIMON64_Enc_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}

size_t SIMON64_Dec_AdvancedProcessBlocks_SSE41(const word32* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks64_6x2_SSE(SIMON64_Dec_Block, SIMON64_Dec_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
#endif // CRYPTOPP_SSE41_AVAILABLE

#if defined(CRYPTOPP_ALTIVEC_AVAILABLE)
size_t SIMON64_Enc_AdvancedProcessBlocks_ALTIVEC(const word32* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks64_6x2_ALTIVEC(SIMON64_Enc_Block, SIMON64_Enc_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}

size_t SIMON64_Dec_AdvancedProcessBlocks_ALTIVEC(const word32* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks64_6x2_ALTIVEC(SIMON64_Dec_Block, SIMON64_Dec_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
#endif // CRYPTOPP_ALTIVEC_AVAILABLE

NAMESPACE_END