Crypto++  8.2
Free C++ class library of cryptographic schemes
simon128_simd.cpp
// simon128_simd.cpp - written and placed in the public domain by Jeffrey Walton
//
// This source file uses intrinsics and built-ins to gain access to
// SSSE3, ARM NEON and ARMv8a, and Power8 Altivec instructions. A separate
// source file is needed because additional CXXFLAGS are required to enable
// the appropriate instruction sets in some build configurations.

#include "pch.h"
#include "config.h"

#include "simon.h"
#include "misc.h"

// Uncomment for benchmarking C++ against SSE or NEON.
// Do so in both simon.cpp and simon128_simd.cpp.
// #undef CRYPTOPP_SSSE3_AVAILABLE
// #undef CRYPTOPP_ARM_NEON_AVAILABLE

#if (CRYPTOPP_SSSE3_AVAILABLE)
# include "adv_simd.h"
# include <pmmintrin.h>
# include <tmmintrin.h>
#endif

#if defined(__XOP__)
# include <ammintrin.h>
#endif

#if defined(__AVX512F__)
# define CRYPTOPP_AVX512_ROTATE 1
# include <immintrin.h>
#endif
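
// Note: the 128-bit rotates _mm_rol_epi64/_mm_ror_epi64 used below are
// EVEX-encoded and need AVX-512VL in addition to AVX-512F. Toolchains that
// target an AVX-512 machine normally define both macros, which is why the
// __AVX512F__ test suffices in practice; add an __AVX512VL__ check when
// building with -mavx512f alone.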

// C1189: error: This header is specific to ARM targets
#if (CRYPTOPP_ARM_NEON_AVAILABLE)
# include "adv_simd.h"
# ifndef _M_ARM64
#  include <arm_neon.h>
# endif
#endif

#if (CRYPTOPP_ARM_ACLE_AVAILABLE)
# include <stdint.h>
# include <arm_acle.h>
#endif

#if defined(CRYPTOPP_POWER8_AVAILABLE)
# include "adv_simd.h"
# include "ppc_simd.h"
#endif

// Squash MS LNK4221 and libtool warnings
extern const char SIMON128_SIMD_FNAME[] = __FILE__;

ANONYMOUS_NAMESPACE_BEGIN

using CryptoPP::byte;
using CryptoPP::word32;
using CryptoPP::word64;
using CryptoPP::vec_swap;  // SunCC

// *************************** ARM NEON ************************** //

#if (CRYPTOPP_ARM_NEON_AVAILABLE)

// Missing from Microsoft's ARM A-32 implementation
#if defined(_MSC_VER) && !defined(_M_ARM64)
inline uint64x2_t vld1q_dup_u64(const uint64_t* ptr)
{
    return vmovq_n_u64(*ptr);
}
#endif

template <class T>
inline T UnpackHigh64(const T& a, const T& b)
{
    const uint64x1_t x(vget_high_u64((uint64x2_t)a));
    const uint64x1_t y(vget_high_u64((uint64x2_t)b));
    return (T)vcombine_u64(x, y);
}

template <class T>
inline T UnpackLow64(const T& a, const T& b)
{
    const uint64x1_t x(vget_low_u64((uint64x2_t)a));
    const uint64x1_t y(vget_low_u64((uint64x2_t)b));
    return (T)vcombine_u64(x, y);
}

template <unsigned int R>
inline uint64x2_t RotateLeft64(const uint64x2_t& val)
{
    const uint64x2_t a(vshlq_n_u64(val, R));
    const uint64x2_t b(vshrq_n_u64(val, 64 - R));
    return vorrq_u64(a, b);
}

template <unsigned int R>
inline uint64x2_t RotateRight64(const uint64x2_t& val)
{
    const uint64x2_t a(vshlq_n_u64(val, 64 - R));
    const uint64x2_t b(vshrq_n_u64(val, R));
    return vorrq_u64(a, b);
}

#if defined(__aarch32__) || defined(__aarch64__)
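// A rotation by 8 bits moves whole bytes, so it can be done with a single
// byte shuffle (a vqtbl1q_u8 table lookup) instead of shift+shift+or. Each
// mask entry below names the source byte for that destination byte, and the
// two 64-bit lanes rotate independently.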
// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
template <>
inline uint64x2_t RotateLeft64<8>(const uint64x2_t& val)
{
    const uint8_t maskb[16] = { 7,0,1,2, 3,4,5,6, 15,8,9,10, 11,12,13,14 };
    const uint8x16_t mask = vld1q_u8(maskb);

    return vreinterpretq_u64_u8(
        vqtbl1q_u8(vreinterpretq_u8_u64(val), mask));
}

// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
template <>
inline uint64x2_t RotateRight64<8>(const uint64x2_t& val)
{
    const uint8_t maskb[16] = { 1,2,3,4, 5,6,7,0, 9,10,11,12, 13,14,15,8 };
    const uint8x16_t mask = vld1q_u8(maskb);

    return vreinterpretq_u64_u8(
        vqtbl1q_u8(vreinterpretq_u8_u64(val), mask));
}
#endif

inline uint64x2_t SIMON128_f(const uint64x2_t& val)
{
    return veorq_u64(RotateLeft64<2>(val),
        vandq_u64(RotateLeft64<1>(val), RotateLeft64<8>(val)));
}
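
// SIMON128_f computes the Simon round function from the Simon and Speck
// paper: f(x) = (x <<< 1 & x <<< 8) ^ (x <<< 2). Each round below applies
// y ^= f(x) ^ rk; unrolling two rounds per loop iteration lets x and y
// trade roles without an explicit swap.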

inline void SIMON128_Enc_Block(uint64x2_t &block0, uint64x2_t &block1,
    const word64 *subkeys, unsigned int rounds)
{
    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    uint64x2_t x1 = UnpackHigh64(block0, block1);
    uint64x2_t y1 = UnpackLow64(block0, block1);

    for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
    {
        const uint64x2_t rk1 = vld1q_dup_u64(subkeys+i);
        y1 = veorq_u64(veorq_u64(y1, SIMON128_f(x1)), rk1);

        const uint64x2_t rk2 = vld1q_dup_u64(subkeys+i+1);
        x1 = veorq_u64(veorq_u64(x1, SIMON128_f(y1)), rk2);
    }

    if (rounds & 1)
    {
        const uint64x2_t rk = vld1q_dup_u64(subkeys+rounds-1);

        y1 = veorq_u64(veorq_u64(y1, SIMON128_f(x1)), rk);
        std::swap(x1, y1);
    }

    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
    block0 = UnpackLow64(y1, x1);
    block1 = UnpackHigh64(y1, x1);
}

inline void SIMON128_Enc_6_Blocks(uint64x2_t &block0, uint64x2_t &block1,
    uint64x2_t &block2, uint64x2_t &block3, uint64x2_t &block4, uint64x2_t &block5,
    const word64 *subkeys, unsigned int rounds)
{
    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    uint64x2_t x1 = UnpackHigh64(block0, block1);
    uint64x2_t y1 = UnpackLow64(block0, block1);
    uint64x2_t x2 = UnpackHigh64(block2, block3);
    uint64x2_t y2 = UnpackLow64(block2, block3);
    uint64x2_t x3 = UnpackHigh64(block4, block5);
    uint64x2_t y3 = UnpackLow64(block4, block5);

    for (int i = 0; i < static_cast<int>(rounds & ~1) - 1; i += 2)
    {
        const uint64x2_t rk1 = vld1q_dup_u64(subkeys+i);
        y1 = veorq_u64(veorq_u64(y1, SIMON128_f(x1)), rk1);
        y2 = veorq_u64(veorq_u64(y2, SIMON128_f(x2)), rk1);
        y3 = veorq_u64(veorq_u64(y3, SIMON128_f(x3)), rk1);

        const uint64x2_t rk2 = vld1q_dup_u64(subkeys+i+1);
        x1 = veorq_u64(veorq_u64(x1, SIMON128_f(y1)), rk2);
        x2 = veorq_u64(veorq_u64(x2, SIMON128_f(y2)), rk2);
        x3 = veorq_u64(veorq_u64(x3, SIMON128_f(y3)), rk2);
    }

    if (rounds & 1)
    {
        const uint64x2_t rk = vld1q_dup_u64(subkeys + rounds - 1);

        y1 = veorq_u64(veorq_u64(y1, SIMON128_f(x1)), rk);
        y2 = veorq_u64(veorq_u64(y2, SIMON128_f(x2)), rk);
        y3 = veorq_u64(veorq_u64(y3, SIMON128_f(x3)), rk);
        std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3);
    }

    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
    block0 = UnpackLow64(y1, x1);
    block1 = UnpackHigh64(y1, x1);
    block2 = UnpackLow64(y2, x2);
    block3 = UnpackHigh64(y2, x2);
    block4 = UnpackLow64(y3, x3);
    block5 = UnpackHigh64(y3, x3);
}
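
// Processing six blocks per call keeps three independent x/y dependency
// chains in flight, which hides the latency of the rotate/AND/XOR sequence
// on superscalar cores. The round keys are loaded once per pair of rounds
// and shared across all three chains.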

inline void SIMON128_Dec_Block(uint64x2_t &block0, uint64x2_t &block1,
    const word64 *subkeys, unsigned int rounds)
{
    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    uint64x2_t x1 = UnpackHigh64(block0, block1);
    uint64x2_t y1 = UnpackLow64(block0, block1);

    if (rounds & 1)
    {
        std::swap(x1, y1);
        const uint64x2_t rk = vld1q_dup_u64(subkeys + rounds - 1);

        y1 = veorq_u64(veorq_u64(y1, rk), SIMON128_f(x1));
        rounds--;
    }

    for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
    {
        const uint64x2_t rk1 = vld1q_dup_u64(subkeys+i+1);
        x1 = veorq_u64(veorq_u64(x1, SIMON128_f(y1)), rk1);

        const uint64x2_t rk2 = vld1q_dup_u64(subkeys+i);
        y1 = veorq_u64(veorq_u64(y1, SIMON128_f(x1)), rk2);
    }

    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
    block0 = UnpackLow64(y1, x1);
    block1 = UnpackHigh64(y1, x1);
}

inline void SIMON128_Dec_6_Blocks(uint64x2_t &block0, uint64x2_t &block1,
    uint64x2_t &block2, uint64x2_t &block3, uint64x2_t &block4, uint64x2_t &block5,
    const word64 *subkeys, unsigned int rounds)
{
    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    uint64x2_t x1 = UnpackHigh64(block0, block1);
    uint64x2_t y1 = UnpackLow64(block0, block1);
    uint64x2_t x2 = UnpackHigh64(block2, block3);
    uint64x2_t y2 = UnpackLow64(block2, block3);
    uint64x2_t x3 = UnpackHigh64(block4, block5);
    uint64x2_t y3 = UnpackLow64(block4, block5);

    if (rounds & 1)
    {
        std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3);
        const uint64x2_t rk = vld1q_dup_u64(subkeys + rounds - 1);

        y1 = veorq_u64(veorq_u64(y1, rk), SIMON128_f(x1));
        y2 = veorq_u64(veorq_u64(y2, rk), SIMON128_f(x2));
        y3 = veorq_u64(veorq_u64(y3, rk), SIMON128_f(x3));
        rounds--;
    }

    for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
    {
        const uint64x2_t rk1 = vld1q_dup_u64(subkeys + i + 1);
        x1 = veorq_u64(veorq_u64(x1, SIMON128_f(y1)), rk1);
        x2 = veorq_u64(veorq_u64(x2, SIMON128_f(y2)), rk1);
        x3 = veorq_u64(veorq_u64(x3, SIMON128_f(y3)), rk1);

        const uint64x2_t rk2 = vld1q_dup_u64(subkeys + i);
        y1 = veorq_u64(veorq_u64(y1, SIMON128_f(x1)), rk2);
        y2 = veorq_u64(veorq_u64(y2, SIMON128_f(x2)), rk2);
        y3 = veorq_u64(veorq_u64(y3, SIMON128_f(x3)), rk2);
    }

    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
    block0 = UnpackLow64(y1, x1);
    block1 = UnpackHigh64(y1, x1);
    block2 = UnpackLow64(y2, x2);
    block3 = UnpackHigh64(y2, x2);
    block4 = UnpackLow64(y3, x3);
    block5 = UnpackHigh64(y3, x3);
}

#endif // CRYPTOPP_ARM_NEON_AVAILABLE

// ***************************** IA-32 ***************************** //

#if defined(CRYPTOPP_SSSE3_AVAILABLE)

// Clang __m128i casts, http://bugs.llvm.org/show_bug.cgi?id=20670
#ifndef M128_CAST
# define M128_CAST(x) ((__m128i *)(void *)(x))
#endif
#ifndef CONST_M128_CAST
# define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x))
#endif

// GCC double casts, https://www.spinics.net/lists/gcchelp/msg47735.html
#ifndef DOUBLE_CAST
# define DOUBLE_CAST(x) ((double *)(void *)(x))
#endif
#ifndef CONST_DOUBLE_CAST
# define CONST_DOUBLE_CAST(x) ((const double *)(const void *)(x))
#endif

inline void Swap128(__m128i& a, __m128i& b)
{
#if defined(__SUNPRO_CC) && (__SUNPRO_CC <= 0x5120)
    // __m128i is an unsigned long long[2], and support for swapping it was
    // not added until C++11. SunCC 12.1 - 12.3 fail to consume the swap,
    // while SunCC 12.4 consumes it even without -std=c++11.
    vec_swap(a, b);
#else
    std::swap(a, b);
#endif
}

template <unsigned int R>
inline __m128i RotateLeft64(const __m128i& val)
{
#if defined(CRYPTOPP_AVX512_ROTATE)
    return _mm_rol_epi64(val, R);
#elif defined(__XOP__)
    return _mm_roti_epi64(val, R);
#else
    return _mm_or_si128(
        _mm_slli_epi64(val, R), _mm_srli_epi64(val, 64-R));
#endif
}

template <unsigned int R>
inline __m128i RotateRight64(const __m128i& val)
{
#if defined(CRYPTOPP_AVX512_ROTATE)
    return _mm_ror_epi64(val, R);
#elif defined(__XOP__)
    return _mm_roti_epi64(val, 64-R);
#else
    return _mm_or_si128(
        _mm_slli_epi64(val, 64-R), _mm_srli_epi64(val, R));
#endif
}

// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
template <>
__m128i RotateLeft64<8>(const __m128i& val)
{
#if defined(__XOP__)
    return _mm_roti_epi64(val, 8);
#else
    const __m128i mask = _mm_set_epi8(14,13,12,11, 10,9,8,15, 6,5,4,3, 2,1,0,7);
    return _mm_shuffle_epi8(val, mask);
#endif
}

// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
template <>
__m128i RotateRight64<8>(const __m128i& val)
{
#if defined(__XOP__)
    return _mm_roti_epi64(val, 64-8);
#else
    const __m128i mask = _mm_set_epi8(8,15,14,13, 12,11,10,9, 0,7,6,5, 4,3,2,1);
    return _mm_shuffle_epi8(val, mask);
#endif
}
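
// Note the argument order: _mm_set_epi8 takes bytes from most- to
// least-significant, so these masks are the NEON tables above written in
// reverse. _mm_shuffle_epi8 (PSHUFB) then performs the same per-lane byte
// permute as the NEON table lookup.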

inline __m128i SIMON128_f(const __m128i& v)
{
    return _mm_xor_si128(RotateLeft64<2>(v),
        _mm_and_si128(RotateLeft64<1>(v), RotateLeft64<8>(v)));
}

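// _mm_loaddup_pd loads one 64-bit value and duplicates it into both lanes,
// broadcasting a round key across the vector. The double<->integer casts
// (_mm_castpd_si128) only reinterpret bits, and CONST_DOUBLE_CAST avoids
// strict-aliasing warnings when treating the word64 keys as doubles.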
inline void SIMON128_Enc_Block(__m128i &block0, __m128i &block1,
    const word64 *subkeys, unsigned int rounds)
{
    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    __m128i x1 = _mm_unpackhi_epi64(block0, block1);
    __m128i y1 = _mm_unpacklo_epi64(block0, block1);

    for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
    {
        const __m128i rk1 = _mm_castpd_si128(
            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys+i)));
        y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON128_f(x1)), rk1);

        const __m128i rk2 = _mm_castpd_si128(
            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys+i+1)));
        x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON128_f(y1)), rk2);
    }

    if (rounds & 1)
    {
        const __m128i rk = _mm_castpd_si128(
            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys+rounds-1)));

        y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON128_f(x1)), rk);
        Swap128(x1, y1);
    }

    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
    block0 = _mm_unpacklo_epi64(y1, x1);
    block1 = _mm_unpackhi_epi64(y1, x1);
}

inline void SIMON128_Enc_6_Blocks(__m128i &block0, __m128i &block1,
    __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
    const word64 *subkeys, unsigned int rounds)
{
    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    __m128i x1 = _mm_unpackhi_epi64(block0, block1);
    __m128i y1 = _mm_unpacklo_epi64(block0, block1);
    __m128i x2 = _mm_unpackhi_epi64(block2, block3);
    __m128i y2 = _mm_unpacklo_epi64(block2, block3);
    __m128i x3 = _mm_unpackhi_epi64(block4, block5);
    __m128i y3 = _mm_unpacklo_epi64(block4, block5);

    for (int i = 0; i < static_cast<int>(rounds & ~1) - 1; i += 2)
    {
        const __m128i rk1 = _mm_castpd_si128(
            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys + i)));
        y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON128_f(x1)), rk1);
        y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON128_f(x2)), rk1);
        y3 = _mm_xor_si128(_mm_xor_si128(y3, SIMON128_f(x3)), rk1);

        const __m128i rk2 = _mm_castpd_si128(
            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys + i + 1)));
        x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON128_f(y1)), rk2);
        x2 = _mm_xor_si128(_mm_xor_si128(x2, SIMON128_f(y2)), rk2);
        x3 = _mm_xor_si128(_mm_xor_si128(x3, SIMON128_f(y3)), rk2);
    }

    if (rounds & 1)
    {
        const __m128i rk = _mm_castpd_si128(
            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys + rounds - 1)));
        y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON128_f(x1)), rk);
        y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON128_f(x2)), rk);
        y3 = _mm_xor_si128(_mm_xor_si128(y3, SIMON128_f(x3)), rk);
        Swap128(x1, y1); Swap128(x2, y2); Swap128(x3, y3);
    }

    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
    block0 = _mm_unpacklo_epi64(y1, x1);
    block1 = _mm_unpackhi_epi64(y1, x1);
    block2 = _mm_unpacklo_epi64(y2, x2);
    block3 = _mm_unpackhi_epi64(y2, x2);
    block4 = _mm_unpacklo_epi64(y3, x3);
    block5 = _mm_unpackhi_epi64(y3, x3);
}

inline void SIMON128_Dec_Block(__m128i &block0, __m128i &block1,
    const word64 *subkeys, unsigned int rounds)
{
    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    __m128i x1 = _mm_unpackhi_epi64(block0, block1);
    __m128i y1 = _mm_unpacklo_epi64(block0, block1);

    if (rounds & 1)
    {
        const __m128i rk = _mm_castpd_si128(
            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys + rounds - 1)));

        Swap128(x1, y1);
        y1 = _mm_xor_si128(_mm_xor_si128(y1, rk), SIMON128_f(x1));
        rounds--;
    }

    for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
    {
        const __m128i rk1 = _mm_castpd_si128(
            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys+i+1)));
        x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON128_f(y1)), rk1);

        const __m128i rk2 = _mm_castpd_si128(
            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys+i)));
        y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON128_f(x1)), rk2);
    }

    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
    block0 = _mm_unpacklo_epi64(y1, x1);
    block1 = _mm_unpackhi_epi64(y1, x1);
}

inline void SIMON128_Dec_6_Blocks(__m128i &block0, __m128i &block1,
    __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
    const word64 *subkeys, unsigned int rounds)
{
    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    __m128i x1 = _mm_unpackhi_epi64(block0, block1);
    __m128i y1 = _mm_unpacklo_epi64(block0, block1);
    __m128i x2 = _mm_unpackhi_epi64(block2, block3);
    __m128i y2 = _mm_unpacklo_epi64(block2, block3);
    __m128i x3 = _mm_unpackhi_epi64(block4, block5);
    __m128i y3 = _mm_unpacklo_epi64(block4, block5);

    if (rounds & 1)
    {
        const __m128i rk = _mm_castpd_si128(
            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys + rounds - 1)));

        Swap128(x1, y1); Swap128(x2, y2); Swap128(x3, y3);
        y1 = _mm_xor_si128(_mm_xor_si128(y1, rk), SIMON128_f(x1));
        y2 = _mm_xor_si128(_mm_xor_si128(y2, rk), SIMON128_f(x2));
        y3 = _mm_xor_si128(_mm_xor_si128(y3, rk), SIMON128_f(x3));
        rounds--;
    }

    for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
    {
        const __m128i rk1 = _mm_castpd_si128(
            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys + i + 1)));
        x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON128_f(y1)), rk1);
        x2 = _mm_xor_si128(_mm_xor_si128(x2, SIMON128_f(y2)), rk1);
        x3 = _mm_xor_si128(_mm_xor_si128(x3, SIMON128_f(y3)), rk1);

        const __m128i rk2 = _mm_castpd_si128(
            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys + i)));
        y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON128_f(x1)), rk2);
        y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON128_f(x2)), rk2);
        y3 = _mm_xor_si128(_mm_xor_si128(y3, SIMON128_f(x3)), rk2);
    }

    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
    block0 = _mm_unpacklo_epi64(y1, x1);
    block1 = _mm_unpackhi_epi64(y1, x1);
    block2 = _mm_unpacklo_epi64(y2, x2);
    block3 = _mm_unpackhi_epi64(y2, x2);
    block4 = _mm_unpacklo_epi64(y3, x3);
    block5 = _mm_unpackhi_epi64(y3, x3);
}

#endif // CRYPTOPP_SSSE3_AVAILABLE

// ***************************** Power8 ***************************** //

#if defined(CRYPTOPP_POWER8_AVAILABLE)

using CryptoPP::uint8x16_p;
using CryptoPP::uint32x4_p;
using CryptoPP::uint64x2_p;

using CryptoPP::VecAnd;
using CryptoPP::VecXor;
using CryptoPP::VecPermute;

// Rotate left by bit count
template<unsigned int C>
inline uint64x2_p RotateLeft64(const uint64x2_p val)
{
    const uint64x2_p m = {C, C};
    return vec_rl(val, m);
}

// Rotate right by bit count
template<unsigned int C>
inline uint64x2_p RotateRight64(const uint64x2_p val)
{
    const uint64x2_p m = {64-C, 64-C};
    return vec_rl(val, m);
}
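
// Altivec supplies only a left rotate (vec_rl), so a right rotation by C
// is expressed as a left rotation by 64-C. The rotate counts are given
// per element, hence the {C, C} vectors above.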

inline uint64x2_p SIMON128_f(const uint64x2_p val)
{
    return VecXor(RotateLeft64<2>(val),
        VecAnd(RotateLeft64<1>(val), RotateLeft64<8>(val)));
}

inline void SIMON128_Enc_Block(uint32x4_p &block, const word64 *subkeys, unsigned int rounds)
{
#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m1 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8};
    const uint8x16_p m2 = {23,22,21,20,19,18,17,16, 7,6,5,4,3,2,1,0};
#else
    const uint8x16_p m1 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16};
    const uint8x16_p m2 = {15,14,13,12,11,10,9,8, 31,30,29,28,27,26,25,24};
#endif

    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    uint64x2_p x1 = (uint64x2_p)VecPermute(block, block, m1);
    uint64x2_p y1 = (uint64x2_p)VecPermute(block, block, m2);

    for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
    {
        const uint64x2_p rk1 = vec_splats((unsigned long long)subkeys[i]);
        y1 = VecXor(VecXor(y1, SIMON128_f(x1)), rk1);

        const uint64x2_p rk2 = vec_splats((unsigned long long)subkeys[i+1]);
        x1 = VecXor(VecXor(x1, SIMON128_f(y1)), rk2);
    }

    if (rounds & 1)
    {
        const uint64x2_p rk = vec_splats((unsigned long long)subkeys[rounds-1]);
        y1 = VecXor(VecXor(y1, SIMON128_f(x1)), rk);
        std::swap(x1, y1);
    }

#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m3 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8};
    //const uint8x16_p m4 = {23,22,21,20,19,18,17,16, 7,6,5,4,3,2,1,0};
#else
    const uint8x16_p m3 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16};
    //const uint8x16_p m4 = {15,14,13,12,11,10,9,8, 31,30,29,28,27,26,25,24};
#endif

    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
    block = (uint32x4_p)VecPermute(x1, y1, m3);
}

inline void SIMON128_Dec_Block(uint32x4_p &block, const word64 *subkeys, unsigned int rounds)
{
#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m1 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8};
    const uint8x16_p m2 = {23,22,21,20,19,18,17,16, 7,6,5,4,3,2,1,0};
#else
    const uint8x16_p m1 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16};
    const uint8x16_p m2 = {15,14,13,12,11,10,9,8, 31,30,29,28,27,26,25,24};
#endif

    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    uint64x2_p x1 = (uint64x2_p)VecPermute(block, block, m1);
    uint64x2_p y1 = (uint64x2_p)VecPermute(block, block, m2);

    if (rounds & 1)
    {
        std::swap(x1, y1);
        const uint64x2_p rk = vec_splats((unsigned long long)subkeys[rounds-1]);
        y1 = VecXor(VecXor(y1, rk), SIMON128_f(x1));
        rounds--;
    }

    for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
    {
        const uint64x2_p rk1 = vec_splats((unsigned long long)subkeys[i+1]);
        x1 = VecXor(VecXor(x1, SIMON128_f(y1)), rk1);

        const uint64x2_p rk2 = vec_splats((unsigned long long)subkeys[i]);
        y1 = VecXor(VecXor(y1, SIMON128_f(x1)), rk2);
    }

#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m3 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8};
    //const uint8x16_p m4 = {23,22,21,20,19,18,17,16, 7,6,5,4,3,2,1,0};
#else
    const uint8x16_p m3 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16};
    //const uint8x16_p m4 = {15,14,13,12,11,10,9,8, 31,30,29,28,27,26,25,24};
#endif

    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
    block = (uint32x4_p)VecPermute(x1, y1, m3);
}

inline void SIMON128_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
    uint32x4_p &block2, uint32x4_p &block3, uint32x4_p &block4,
    uint32x4_p &block5, const word64 *subkeys, unsigned int rounds)
{
#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m1 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8};
    const uint8x16_p m2 = {23,22,21,20,19,18,17,16, 7,6,5,4,3,2,1,0};
#else
    const uint8x16_p m1 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16};
    const uint8x16_p m2 = {15,14,13,12,11,10,9,8, 31,30,29,28,27,26,25,24};
#endif

    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    uint64x2_p x1 = (uint64x2_p)VecPermute(block0, block1, m1);
    uint64x2_p y1 = (uint64x2_p)VecPermute(block0, block1, m2);
    uint64x2_p x2 = (uint64x2_p)VecPermute(block2, block3, m1);
    uint64x2_p y2 = (uint64x2_p)VecPermute(block2, block3, m2);
    uint64x2_p x3 = (uint64x2_p)VecPermute(block4, block5, m1);
    uint64x2_p y3 = (uint64x2_p)VecPermute(block4, block5, m2);

    for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
    {
        const uint64x2_p rk1 = vec_splats((unsigned long long)subkeys[i]);
        y1 = VecXor(VecXor(y1, SIMON128_f(x1)), rk1);
        y2 = VecXor(VecXor(y2, SIMON128_f(x2)), rk1);
        y3 = VecXor(VecXor(y3, SIMON128_f(x3)), rk1);

        const uint64x2_p rk2 = vec_splats((unsigned long long)subkeys[i+1]);
        x1 = VecXor(VecXor(x1, SIMON128_f(y1)), rk2);
        x2 = VecXor(VecXor(x2, SIMON128_f(y2)), rk2);
        x3 = VecXor(VecXor(x3, SIMON128_f(y3)), rk2);
    }

    if (rounds & 1)
    {
        const uint64x2_p rk = vec_splats((unsigned long long)subkeys[rounds-1]);
        y1 = VecXor(VecXor(y1, SIMON128_f(x1)), rk);
        y2 = VecXor(VecXor(y2, SIMON128_f(x2)), rk);
        y3 = VecXor(VecXor(y3, SIMON128_f(x3)), rk);
        std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3);
    }

#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m3 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8};
    const uint8x16_p m4 = {23,22,21,20,19,18,17,16, 7,6,5,4,3,2,1,0};
#else
    const uint8x16_p m3 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16};
    const uint8x16_p m4 = {15,14,13,12,11,10,9,8, 31,30,29,28,27,26,25,24};
#endif

    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
    block0 = (uint32x4_p)VecPermute(x1, y1, m3);
    block1 = (uint32x4_p)VecPermute(x1, y1, m4);
    block2 = (uint32x4_p)VecPermute(x2, y2, m3);
    block3 = (uint32x4_p)VecPermute(x2, y2, m4);
    block4 = (uint32x4_p)VecPermute(x3, y3, m3);
    block5 = (uint32x4_p)VecPermute(x3, y3, m4);
}

inline void SIMON128_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
    uint32x4_p &block2, uint32x4_p &block3, uint32x4_p &block4,
    uint32x4_p &block5, const word64 *subkeys, unsigned int rounds)
{
#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m1 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8};
    const uint8x16_p m2 = {23,22,21,20,19,18,17,16, 7,6,5,4,3,2,1,0};
#else
    const uint8x16_p m1 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16};
    const uint8x16_p m2 = {15,14,13,12,11,10,9,8, 31,30,29,28,27,26,25,24};
#endif

    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    uint64x2_p x1 = (uint64x2_p)VecPermute(block0, block1, m1);
    uint64x2_p y1 = (uint64x2_p)VecPermute(block0, block1, m2);
    uint64x2_p x2 = (uint64x2_p)VecPermute(block2, block3, m1);
    uint64x2_p y2 = (uint64x2_p)VecPermute(block2, block3, m2);
    uint64x2_p x3 = (uint64x2_p)VecPermute(block4, block5, m1);
    uint64x2_p y3 = (uint64x2_p)VecPermute(block4, block5, m2);

    if (rounds & 1)
    {
        std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3);
        const uint64x2_p rk = vec_splats((unsigned long long)subkeys[rounds-1]);
        y1 = VecXor(VecXor(y1, rk), SIMON128_f(x1));
        y2 = VecXor(VecXor(y2, rk), SIMON128_f(x2));
        y3 = VecXor(VecXor(y3, rk), SIMON128_f(x3));
        rounds--;
    }

    for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
    {
        const uint64x2_p rk1 = vec_splats((unsigned long long)subkeys[i+1]);
        x1 = VecXor(VecXor(x1, SIMON128_f(y1)), rk1);
        x2 = VecXor(VecXor(x2, SIMON128_f(y2)), rk1);
        x3 = VecXor(VecXor(x3, SIMON128_f(y3)), rk1);

        const uint64x2_p rk2 = vec_splats((unsigned long long)subkeys[i]);
        y1 = VecXor(VecXor(y1, SIMON128_f(x1)), rk2);
        y2 = VecXor(VecXor(y2, SIMON128_f(x2)), rk2);
        y3 = VecXor(VecXor(y3, SIMON128_f(x3)), rk2);
    }

#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m3 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8};
    const uint8x16_p m4 = {23,22,21,20,19,18,17,16, 7,6,5,4,3,2,1,0};
#else
    const uint8x16_p m3 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16};
    const uint8x16_p m4 = {15,14,13,12,11,10,9,8, 31,30,29,28,27,26,25,24};
#endif

    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
    block0 = (uint32x4_p)VecPermute(x1, y1, m3);
    block1 = (uint32x4_p)VecPermute(x1, y1, m4);
    block2 = (uint32x4_p)VecPermute(x2, y2, m3);
    block3 = (uint32x4_p)VecPermute(x2, y2, m4);
    block4 = (uint32x4_p)VecPermute(x3, y3, m3);
    block5 = (uint32x4_p)VecPermute(x3, y3, m4);
}

#endif // CRYPTOPP_POWER8_AVAILABLE

ANONYMOUS_NAMESPACE_END

///////////////////////////////////////////////////////////////////////

NAMESPACE_BEGIN(CryptoPP)

// *************************** ARM NEON **************************** //

#if (CRYPTOPP_ARM_NEON_AVAILABLE)
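// The AdvancedProcessBlocks128_6x2 adapters from adv_simd.h drive the
// kernels above: roughly, they walk the input in groups of six blocks,
// fall back to the narrower kernel for the tail, and apply the xorBlocks
// and flags semantics (in-place, xor-input, counter increments).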
size_t SIMON128_Enc_AdvancedProcessBlocks_NEON(const word64* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks128_6x2_NEON(SIMON128_Enc_Block, SIMON128_Enc_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}

size_t SIMON128_Dec_AdvancedProcessBlocks_NEON(const word64* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks128_6x2_NEON(SIMON128_Dec_Block, SIMON128_Dec_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
#endif // CRYPTOPP_ARM_NEON_AVAILABLE

// ***************************** IA-32 ***************************** //

#if defined(CRYPTOPP_SSSE3_AVAILABLE)
size_t SIMON128_Enc_AdvancedProcessBlocks_SSSE3(const word64* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks128_6x2_SSE(SIMON128_Enc_Block, SIMON128_Enc_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}

size_t SIMON128_Dec_AdvancedProcessBlocks_SSSE3(const word64* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks128_6x2_SSE(SIMON128_Dec_Block, SIMON128_Dec_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
#endif // CRYPTOPP_SSSE3_AVAILABLE

// ***************************** Power8 ***************************** //

#if defined(CRYPTOPP_POWER8_AVAILABLE)
size_t SIMON128_Enc_AdvancedProcessBlocks_POWER8(const word64* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks128_6x1_ALTIVEC(SIMON128_Enc_Block, SIMON128_Enc_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}

size_t SIMON128_Dec_AdvancedProcessBlocks_POWER8(const word64* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks128_6x1_ALTIVEC(SIMON128_Dec_Block, SIMON128_Dec_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
#endif // CRYPTOPP_POWER8_AVAILABLE

NAMESPACE_END