// speck128_simd.cpp - written and placed in the public domain by Jeffrey Walton
//
// This source file uses intrinsics and built-ins to gain access to
// SSSE3, ARM NEON and ARMv8a, and Power8 Altivec instructions. A separate
// source file is needed because additional CXXFLAGS are required to enable
// the appropriate instruction sets in some build configurations.

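// For illustration only (the authoritative options live in the library's
// build files): GCC and Clang builds of this file typically add flags along
// the lines of -mssse3 on x86, -mfpu=neon or -march=armv8-a on ARM, and
// -mcpu=power8 on PowerPC to make the intrinsics below available.
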
#include "pch.h"
#include "config.h"

#include "speck.h"
#include "misc.h"

// Uncomment for benchmarking C++ against SSE or NEON.
// Do so in both speck.cpp and speck128_simd.cpp.
// #undef CRYPTOPP_SSSE3_AVAILABLE
// #undef CRYPTOPP_ARM_NEON_AVAILABLE

#if (CRYPTOPP_SSSE3_AVAILABLE)
# include "adv_simd.h"
# include <pmmintrin.h>
# include <tmmintrin.h>
#endif

#if defined(__XOP__)
# include <ammintrin.h>
#endif

#if defined(__AVX512F__)
# define CRYPTOPP_AVX512_ROTATE 1
# include <immintrin.h>
#endif

// C1189: error: This header is specific to ARM targets
#if (CRYPTOPP_ARM_NEON_AVAILABLE)
# include "adv_simd.h"
# ifndef _M_ARM64
#  include <arm_neon.h>
# endif
#endif

#if (CRYPTOPP_ARM_ACLE_AVAILABLE)
# include <stdint.h>
# include <arm_acle.h>
#endif

#if defined(CRYPTOPP_POWER8_AVAILABLE)
# include "adv_simd.h"
# include "ppc_simd.h"
#endif

// Squash MS LNK4221 and libtool warnings
extern const char SPECK128_SIMD_FNAME[] = __FILE__;

ANONYMOUS_NAMESPACE_BEGIN

using CryptoPP::byte;
using CryptoPP::word32;
using CryptoPP::word64;

// *************************** ARM NEON ************************** //

#if (CRYPTOPP_ARM_NEON_AVAILABLE)

// Missing from Microsoft's ARM A-32 implementation
#if defined(_MSC_VER) && !defined(_M_ARM64)
inline uint64x2_t vld1q_dup_u64(const uint64_t* ptr)
{
    return vmovq_n_u64(*ptr);
}
#endif

template <class T>
inline T UnpackHigh64(const T& a, const T& b)
{
    const uint64x1_t x(vget_high_u64((uint64x2_t)a));
    const uint64x1_t y(vget_high_u64((uint64x2_t)b));
    return (T)vcombine_u64(x, y);
}

template <class T>
inline T UnpackLow64(const T& a, const T& b)
{
    const uint64x1_t x(vget_low_u64((uint64x2_t)a));
    const uint64x1_t y(vget_low_u64((uint64x2_t)b));
    return (T)vcombine_u64(x, y);
}

template <unsigned int R>
inline uint64x2_t RotateLeft64(const uint64x2_t& val)
{
    const uint64x2_t a(vshlq_n_u64(val, R));
    const uint64x2_t b(vshrq_n_u64(val, 64 - R));
    return vorrq_u64(a, b);
}

template <unsigned int R>
inline uint64x2_t RotateRight64(const uint64x2_t& val)
{
    const uint64x2_t a(vshlq_n_u64(val, 64 - R));
    const uint64x2_t b(vshrq_n_u64(val, R));
    return vorrq_u64(a, b);
}

#if defined(__aarch32__) || defined(__aarch64__)
// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
template <>
inline uint64x2_t RotateLeft64<8>(const uint64x2_t& val)
{
    const uint8_t maskb[16] = { 7,0,1,2, 3,4,5,6, 15,8,9,10, 11,12,13,14 };
    const uint8x16_t mask = vld1q_u8(maskb);

    return vreinterpretq_u64_u8(
        vqtbl1q_u8(vreinterpretq_u8_u64(val), mask));
}

// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
template <>
inline uint64x2_t RotateRight64<8>(const uint64x2_t& val)
{
    const uint8_t maskb[16] = { 1,2,3,4, 5,6,7,0, 9,10,11,12, 13,14,15,8 };
    const uint8x16_t mask = vld1q_u8(maskb);

    return vreinterpretq_u64_u8(
        vqtbl1q_u8(vreinterpretq_u8_u64(val), mask));
}
#endif
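
// Illustrative scalar models, not used by the cipher code: rotating each
// 64-bit lane by a whole byte is just a byte permutation, which is why the
// TBL masks above work. The helpers below (names are ours) show the
// equivalent scalar rotations for a single word64.
inline word64 RotL64By8_ScalarModel(word64 v)
{
    return (v << 8) | (v >> 56);  // matches the { 7,0,1,2, 3,4,5,6, ... } mask
}

inline word64 RotR64By8_ScalarModel(word64 v)
{
    return (v >> 8) | (v << 56);  // matches the { 1,2,3,4, 5,6,7,0, ... } mask
}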

inline void SPECK128_Enc_Block(uint64x2_t &block0, uint64x2_t &block1,
    const word64 *subkeys, unsigned int rounds)
{
    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    uint64x2_t x1 = UnpackHigh64(block0, block1);
    uint64x2_t y1 = UnpackLow64(block0, block1);

    for (int i=0; i < static_cast<int>(rounds); ++i)
    {
        const uint64x2_t rk = vld1q_dup_u64(subkeys+i);

        x1 = RotateRight64<8>(x1);
        x1 = vaddq_u64(x1, y1);
        x1 = veorq_u64(x1, rk);
        y1 = RotateLeft64<3>(y1);
        y1 = veorq_u64(y1, x1);
    }

    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
    block0 = UnpackLow64(y1, x1);
    block1 = UnpackHigh64(y1, x1);
}
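
// Illustrative scalar model, not used by the vector code: one SPECK-128
// encryption round as applied by the loop above, written for a single pair
// of 64-bit words (the helper name is ours).
inline void SPECK128_EncRound_ScalarModel(word64& x, word64& y, word64 k)
{
    x = (x >> 8) | (x << 56);  // RotateRight64<8>
    x += y;                    // vaddq_u64
    x ^= k;                    // veorq_u64 with the round key
    y = (y << 3) | (y >> 61);  // RotateLeft64<3>
    y ^= x;                    // veorq_u64
}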

inline void SPECK128_Enc_6_Blocks(uint64x2_t &block0, uint64x2_t &block1,
    uint64x2_t &block2, uint64x2_t &block3, uint64x2_t &block4, uint64x2_t &block5,
    const word64 *subkeys, unsigned int rounds)
{
    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    uint64x2_t x1 = UnpackHigh64(block0, block1);
    uint64x2_t y1 = UnpackLow64(block0, block1);
    uint64x2_t x2 = UnpackHigh64(block2, block3);
    uint64x2_t y2 = UnpackLow64(block2, block3);
    uint64x2_t x3 = UnpackHigh64(block4, block5);
    uint64x2_t y3 = UnpackLow64(block4, block5);

    for (int i=0; i < static_cast<int>(rounds); ++i)
    {
        const uint64x2_t rk = vld1q_dup_u64(subkeys+i);

        x1 = RotateRight64<8>(x1);
        x2 = RotateRight64<8>(x2);
        x3 = RotateRight64<8>(x3);
        x1 = vaddq_u64(x1, y1);
        x2 = vaddq_u64(x2, y2);
        x3 = vaddq_u64(x3, y3);
        x1 = veorq_u64(x1, rk);
        x2 = veorq_u64(x2, rk);
        x3 = veorq_u64(x3, rk);
        y1 = RotateLeft64<3>(y1);
        y2 = RotateLeft64<3>(y2);
        y3 = RotateLeft64<3>(y3);
        y1 = veorq_u64(y1, x1);
        y2 = veorq_u64(y2, x2);
        y3 = veorq_u64(y3, x3);
    }

    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
    block0 = UnpackLow64(y1, x1);
    block1 = UnpackHigh64(y1, x1);
    block2 = UnpackLow64(y2, x2);
    block3 = UnpackHigh64(y2, x2);
    block4 = UnpackLow64(y3, x3);
    block5 = UnpackHigh64(y3, x3);
}

inline void SPECK128_Dec_Block(uint64x2_t &block0, uint64x2_t &block1,
    const word64 *subkeys, unsigned int rounds)
{
    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    uint64x2_t x1 = UnpackHigh64(block0, block1);
    uint64x2_t y1 = UnpackLow64(block0, block1);

    for (int i = static_cast<int>(rounds-1); i >= 0; --i)
    {
        const uint64x2_t rk = vld1q_dup_u64(subkeys+i);

        y1 = veorq_u64(y1, x1);
        y1 = RotateRight64<3>(y1);
        x1 = veorq_u64(x1, rk);
        x1 = vsubq_u64(x1, y1);
        x1 = RotateLeft64<8>(x1);
    }

    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
    block0 = UnpackLow64(y1, x1);
    block1 = UnpackHigh64(y1, x1);
}
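
// Illustrative scalar model, not used by the vector code: one SPECK-128
// decryption round as applied by the loop above; it undoes the encryption
// round step by step (the helper name is ours).
inline void SPECK128_DecRound_ScalarModel(word64& x, word64& y, word64 k)
{
    y ^= x;                    // veorq_u64
    y = (y >> 3) | (y << 61);  // RotateRight64<3>
    x ^= k;                    // veorq_u64 with the round key
    x -= y;                    // vsubq_u64
    x = (x << 8) | (x >> 56);  // RotateLeft64<8>
}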

inline void SPECK128_Dec_6_Blocks(uint64x2_t &block0, uint64x2_t &block1,
    uint64x2_t &block2, uint64x2_t &block3, uint64x2_t &block4, uint64x2_t &block5,
    const word64 *subkeys, unsigned int rounds)
{
    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    uint64x2_t x1 = UnpackHigh64(block0, block1);
    uint64x2_t y1 = UnpackLow64(block0, block1);
    uint64x2_t x2 = UnpackHigh64(block2, block3);
    uint64x2_t y2 = UnpackLow64(block2, block3);
    uint64x2_t x3 = UnpackHigh64(block4, block5);
    uint64x2_t y3 = UnpackLow64(block4, block5);

    for (int i = static_cast<int>(rounds-1); i >= 0; --i)
    {
        const uint64x2_t rk = vld1q_dup_u64(subkeys+i);

        y1 = veorq_u64(y1, x1);
        y2 = veorq_u64(y2, x2);
        y3 = veorq_u64(y3, x3);
        y1 = RotateRight64<3>(y1);
        y2 = RotateRight64<3>(y2);
        y3 = RotateRight64<3>(y3);
        x1 = veorq_u64(x1, rk);
        x2 = veorq_u64(x2, rk);
        x3 = veorq_u64(x3, rk);
        x1 = vsubq_u64(x1, y1);
        x2 = vsubq_u64(x2, y2);
        x3 = vsubq_u64(x3, y3);
        x1 = RotateLeft64<8>(x1);
        x2 = RotateLeft64<8>(x2);
        x3 = RotateLeft64<8>(x3);
    }

    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
    block0 = UnpackLow64(y1, x1);
    block1 = UnpackHigh64(y1, x1);
    block2 = UnpackLow64(y2, x2);
    block3 = UnpackHigh64(y2, x2);
    block4 = UnpackLow64(y3, x3);
    block5 = UnpackHigh64(y3, x3);
}

#endif // CRYPTOPP_ARM_NEON_AVAILABLE

// ***************************** IA-32 ***************************** //

#if defined(CRYPTOPP_SSSE3_AVAILABLE)

// Clang __m128i casts, http://bugs.llvm.org/show_bug.cgi?id=20670
#ifndef M128_CAST
# define M128_CAST(x) ((__m128i *)(void *)(x))
#endif
#ifndef CONST_M128_CAST
# define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x))
#endif

// GCC double casts, https://www.spinics.net/lists/gcchelp/msg47735.html
#ifndef DOUBLE_CAST
# define DOUBLE_CAST(x) ((double *)(void *)(x))
#endif
#ifndef CONST_DOUBLE_CAST
# define CONST_DOUBLE_CAST(x) ((const double *)(const void *)(x))
#endif

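// Note on the casts: _mm_loaddup_pd below expects a 'const double*' while the
// round keys are word64, so CONST_DOUBLE_CAST routes the pointer through
// 'void*' to sidestep type-punning warnings; the loaded bits are immediately
// reinterpreted as an integer vector with _mm_castpd_si128, so no
// floating-point arithmetic ever touches the key material.
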
template <unsigned int R>
inline __m128i RotateLeft64(const __m128i& val)
{
#if defined(CRYPTOPP_AVX512_ROTATE)
    return _mm_rol_epi64(val, R);
#elif defined(__XOP__)
    return _mm_roti_epi64(val, R);
#else
    return _mm_or_si128(
        _mm_slli_epi64(val, R), _mm_srli_epi64(val, 64-R));
#endif
}

template <unsigned int R>
inline __m128i RotateRight64(const __m128i& val)
{
#if defined(CRYPTOPP_AVX512_ROTATE)
    return _mm_ror_epi64(val, R);
#elif defined(__XOP__)
    return _mm_roti_epi64(val, 64-R);
#else
    return _mm_or_si128(
        _mm_slli_epi64(val, 64-R), _mm_srli_epi64(val, R));
#endif
}

// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
template <>
__m128i RotateLeft64<8>(const __m128i& val)
{
#if defined(__XOP__)
    return _mm_roti_epi64(val, 8);
#else
    const __m128i mask = _mm_set_epi8(14,13,12,11, 10,9,8,15, 6,5,4,3, 2,1,0,7);
    return _mm_shuffle_epi8(val, mask);
#endif
}

// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
template <>
__m128i RotateRight64<8>(const __m128i& val)
{
#if defined(__XOP__)
    return _mm_roti_epi64(val, 64-8);
#else
    const __m128i mask = _mm_set_epi8(8,15,14,13, 12,11,10,9, 0,7,6,5, 4,3,2,1);
    return _mm_shuffle_epi8(val, mask);
#endif
}

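// Note: _mm_set_epi8 takes its arguments from the most significant byte down
// to byte 0, so the masks above are the NEON maskb[] tables written in
// reverse; both encode the same per-lane byte rotation for _mm_shuffle_epi8
// (PSHUFB).
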
inline void SPECK128_Enc_Block(__m128i &block0, __m128i &block1,
    const word64 *subkeys, unsigned int rounds)
{
    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    __m128i x1 = _mm_unpackhi_epi64(block0, block1);
    __m128i y1 = _mm_unpacklo_epi64(block0, block1);

    for (int i=0; i < static_cast<int>(rounds); ++i)
    {
        const __m128i rk = _mm_castpd_si128(
            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys+i)));

        x1 = RotateRight64<8>(x1);
        x1 = _mm_add_epi64(x1, y1);
        x1 = _mm_xor_si128(x1, rk);
        y1 = RotateLeft64<3>(y1);
        y1 = _mm_xor_si128(y1, x1);
    }

    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
    block0 = _mm_unpacklo_epi64(y1, x1);
    block1 = _mm_unpackhi_epi64(y1, x1);
}
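
// _mm_loaddup_pd above compiles to MOVDDUP and broadcasts one 64-bit round
// key into both lanes. A sketch of an alternative, on toolchains that provide
// the intrinsic, would be:
//
//   const __m128i rk = _mm_set1_epi64x(static_cast<long long>(subkeys[i]));
//
// The load-and-duplicate form is the one this file uses throughout.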

inline void SPECK128_Enc_6_Blocks(__m128i &block0, __m128i &block1,
    __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
    const word64 *subkeys, unsigned int rounds)
{
    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    __m128i x1 = _mm_unpackhi_epi64(block0, block1);
    __m128i y1 = _mm_unpacklo_epi64(block0, block1);
    __m128i x2 = _mm_unpackhi_epi64(block2, block3);
    __m128i y2 = _mm_unpacklo_epi64(block2, block3);
    __m128i x3 = _mm_unpackhi_epi64(block4, block5);
    __m128i y3 = _mm_unpacklo_epi64(block4, block5);

    for (int i=0; i < static_cast<int>(rounds); ++i)
    {
        const __m128i rk = _mm_castpd_si128(
            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys+i)));

        x1 = RotateRight64<8>(x1);
        x2 = RotateRight64<8>(x2);
        x3 = RotateRight64<8>(x3);
        x1 = _mm_add_epi64(x1, y1);
        x2 = _mm_add_epi64(x2, y2);
        x3 = _mm_add_epi64(x3, y3);
        x1 = _mm_xor_si128(x1, rk);
        x2 = _mm_xor_si128(x2, rk);
        x3 = _mm_xor_si128(x3, rk);
        y1 = RotateLeft64<3>(y1);
        y2 = RotateLeft64<3>(y2);
        y3 = RotateLeft64<3>(y3);
        y1 = _mm_xor_si128(y1, x1);
        y2 = _mm_xor_si128(y2, x2);
        y3 = _mm_xor_si128(y3, x3);
    }

    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
    block0 = _mm_unpacklo_epi64(y1, x1);
    block1 = _mm_unpackhi_epi64(y1, x1);
    block2 = _mm_unpacklo_epi64(y2, x2);
    block3 = _mm_unpackhi_epi64(y2, x2);
    block4 = _mm_unpacklo_epi64(y3, x3);
    block5 = _mm_unpackhi_epi64(y3, x3);
}

inline void SPECK128_Dec_Block(__m128i &block0, __m128i &block1,
    const word64 *subkeys, unsigned int rounds)
{
    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    __m128i x1 = _mm_unpackhi_epi64(block0, block1);
    __m128i y1 = _mm_unpacklo_epi64(block0, block1);

    for (int i = static_cast<int>(rounds-1); i >= 0; --i)
    {
        const __m128i rk = _mm_castpd_si128(
            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys+i)));

        y1 = _mm_xor_si128(y1, x1);
        y1 = RotateRight64<3>(y1);
        x1 = _mm_xor_si128(x1, rk);
        x1 = _mm_sub_epi64(x1, y1);
        x1 = RotateLeft64<8>(x1);
    }

    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
    block0 = _mm_unpacklo_epi64(y1, x1);
    block1 = _mm_unpackhi_epi64(y1, x1);
}

inline void SPECK128_Dec_6_Blocks(__m128i &block0, __m128i &block1,
    __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
    const word64 *subkeys, unsigned int rounds)
{
    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    __m128i x1 = _mm_unpackhi_epi64(block0, block1);
    __m128i y1 = _mm_unpacklo_epi64(block0, block1);
    __m128i x2 = _mm_unpackhi_epi64(block2, block3);
    __m128i y2 = _mm_unpacklo_epi64(block2, block3);
    __m128i x3 = _mm_unpackhi_epi64(block4, block5);
    __m128i y3 = _mm_unpacklo_epi64(block4, block5);

    for (int i = static_cast<int>(rounds-1); i >= 0; --i)
    {
        const __m128i rk = _mm_castpd_si128(
            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys+i)));

        y1 = _mm_xor_si128(y1, x1);
        y2 = _mm_xor_si128(y2, x2);
        y3 = _mm_xor_si128(y3, x3);
        y1 = RotateRight64<3>(y1);
        y2 = RotateRight64<3>(y2);
        y3 = RotateRight64<3>(y3);
        x1 = _mm_xor_si128(x1, rk);
        x2 = _mm_xor_si128(x2, rk);
        x3 = _mm_xor_si128(x3, rk);
        x1 = _mm_sub_epi64(x1, y1);
        x2 = _mm_sub_epi64(x2, y2);
        x3 = _mm_sub_epi64(x3, y3);
        x1 = RotateLeft64<8>(x1);
        x2 = RotateLeft64<8>(x2);
        x3 = RotateLeft64<8>(x3);
    }

    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
    block0 = _mm_unpacklo_epi64(y1, x1);
    block1 = _mm_unpackhi_epi64(y1, x1);
    block2 = _mm_unpacklo_epi64(y2, x2);
    block3 = _mm_unpackhi_epi64(y2, x2);
    block4 = _mm_unpacklo_epi64(y3, x3);
    block5 = _mm_unpackhi_epi64(y3, x3);
}

#endif // CRYPTOPP_SSSE3_AVAILABLE

// ***************************** Power8 ***************************** //

#if defined(CRYPTOPP_POWER8_AVAILABLE)

using CryptoPP::uint8x16_p;
using CryptoPP::uint32x4_p;
using CryptoPP::uint64x2_p;

using CryptoPP::VecAdd;
using CryptoPP::VecSub;
using CryptoPP::VecXor;
using CryptoPP::VecPermute;

// Rotate left by bit count
template<unsigned int C>
inline uint64x2_p RotateLeft64(const uint64x2_p val)
{
    const uint64x2_p m = {C, C};
    return vec_rl(val, m);
}

// Rotate right by bit count
template<unsigned int C>
inline uint64x2_p RotateRight64(const uint64x2_p val)
{
    const uint64x2_p m = {64-C, 64-C};
    return vec_rl(val, m);
}

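// Altivec/VSX exposes only a rotate-left (vec_rl), so RotateRight64<C> above
// is expressed as a left rotation by 64-C bits; RotateRight64<8>, for
// example, rotates each lane left by 56 bits.
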
void SPECK128_Enc_Block(uint32x4_p &block, const word64 *subkeys, unsigned int rounds)
{
#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m1 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8};
    const uint8x16_p m2 = {23,22,21,20,19,18,17,16, 7,6,5,4,3,2,1,0};
#else
    const uint8x16_p m1 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16};
    const uint8x16_p m2 = {15,14,13,12,11,10,9,8, 31,30,29,28,27,26,25,24};
#endif

    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    uint64x2_p x1 = (uint64x2_p)VecPermute(block, block, m1);
    uint64x2_p y1 = (uint64x2_p)VecPermute(block, block, m2);

    for (int i=0; i < static_cast<int>(rounds); ++i)
    {
        const uint64x2_p rk = vec_splats((unsigned long long)subkeys[i]);

        x1 = RotateRight64<8>(x1);
        x1 = VecAdd(x1, y1);
        x1 = VecXor(x1, rk);

        y1 = RotateLeft64<3>(y1);
        y1 = VecXor(y1, x1);
    }

#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m3 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8};
    //const uint8x16_p m4 = {23,22,21,20,19,18,17,16, 7,6,5,4,3,2,1,0};
#else
    const uint8x16_p m3 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16};
    //const uint8x16_p m4 = {15,14,13,12,11,10,9,8, 31,30,29,28,27,26,25,24};
#endif

    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
    block = (uint32x4_p)VecPermute(x1, y1, m3);
}

void SPECK128_Dec_Block(uint32x4_p &block, const word64 *subkeys, unsigned int rounds)
{
#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m1 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8};
    const uint8x16_p m2 = {23,22,21,20,19,18,17,16, 7,6,5,4,3,2,1,0};
#else
    const uint8x16_p m1 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16};
    const uint8x16_p m2 = {15,14,13,12,11,10,9,8, 31,30,29,28,27,26,25,24};
#endif

    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    uint64x2_p x1 = (uint64x2_p)VecPermute(block, block, m1);
    uint64x2_p y1 = (uint64x2_p)VecPermute(block, block, m2);

    for (int i = static_cast<int>(rounds-1); i >= 0; --i)
    {
        const uint64x2_p rk = vec_splats((unsigned long long)subkeys[i]);

        y1 = VecXor(y1, x1);
        y1 = RotateRight64<3>(y1);
        x1 = VecXor(x1, rk);
        x1 = VecSub(x1, y1);
        x1 = RotateLeft64<8>(x1);
    }

#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m3 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8};
    //const uint8x16_p m4 = {23,22,21,20,19,18,17,16, 7,6,5,4,3,2,1,0};
#else
    const uint8x16_p m3 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16};
    //const uint8x16_p m4 = {15,14,13,12,11,10,9,8, 31,30,29,28,27,26,25,24};
#endif

    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
    block = (uint32x4_p)VecPermute(x1, y1, m3);
}

void SPECK128_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
    uint32x4_p &block2, uint32x4_p &block3, uint32x4_p &block4,
    uint32x4_p &block5, const word64 *subkeys, unsigned int rounds)
{
#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m1 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8};
    const uint8x16_p m2 = {23,22,21,20,19,18,17,16, 7,6,5,4,3,2,1,0};
#else
    const uint8x16_p m1 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16};
    const uint8x16_p m2 = {15,14,13,12,11,10,9,8, 31,30,29,28,27,26,25,24};
#endif

    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    uint64x2_p x1 = (uint64x2_p)VecPermute(block0, block1, m1);
    uint64x2_p y1 = (uint64x2_p)VecPermute(block0, block1, m2);
    uint64x2_p x2 = (uint64x2_p)VecPermute(block2, block3, m1);
    uint64x2_p y2 = (uint64x2_p)VecPermute(block2, block3, m2);
    uint64x2_p x3 = (uint64x2_p)VecPermute(block4, block5, m1);
    uint64x2_p y3 = (uint64x2_p)VecPermute(block4, block5, m2);

    for (int i=0; i < static_cast<int>(rounds); ++i)
    {
        const uint64x2_p rk = vec_splats((unsigned long long)subkeys[i]);

        x1 = RotateRight64<8>(x1);
        x2 = RotateRight64<8>(x2);
        x3 = RotateRight64<8>(x3);
        x1 = VecAdd(x1, y1);
        x2 = VecAdd(x2, y2);
        x3 = VecAdd(x3, y3);
        x1 = VecXor(x1, rk);
        x2 = VecXor(x2, rk);
        x3 = VecXor(x3, rk);

        y1 = RotateLeft64<3>(y1);
        y2 = RotateLeft64<3>(y2);
        y3 = RotateLeft64<3>(y3);
        y1 = VecXor(y1, x1);
        y2 = VecXor(y2, x2);
        y3 = VecXor(y3, x3);
    }

#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m3 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8};
    const uint8x16_p m4 = {23,22,21,20,19,18,17,16, 7,6,5,4,3,2,1,0};
#else
    const uint8x16_p m3 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16};
    const uint8x16_p m4 = {15,14,13,12,11,10,9,8, 31,30,29,28,27,26,25,24};
#endif

    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
    block0 = (uint32x4_p)VecPermute(x1, y1, m3);
    block1 = (uint32x4_p)VecPermute(x1, y1, m4);
    block2 = (uint32x4_p)VecPermute(x2, y2, m3);
    block3 = (uint32x4_p)VecPermute(x2, y2, m4);
    block4 = (uint32x4_p)VecPermute(x3, y3, m3);
    block5 = (uint32x4_p)VecPermute(x3, y3, m4);
}

void SPECK128_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
    uint32x4_p &block2, uint32x4_p &block3, uint32x4_p &block4,
    uint32x4_p &block5, const word64 *subkeys, unsigned int rounds)
{
#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m1 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8};
    const uint8x16_p m2 = {23,22,21,20,19,18,17,16, 7,6,5,4,3,2,1,0};
#else
    const uint8x16_p m1 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16};
    const uint8x16_p m2 = {15,14,13,12,11,10,9,8, 31,30,29,28,27,26,25,24};
#endif

    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    uint64x2_p x1 = (uint64x2_p)VecPermute(block0, block1, m1);
    uint64x2_p y1 = (uint64x2_p)VecPermute(block0, block1, m2);
    uint64x2_p x2 = (uint64x2_p)VecPermute(block2, block3, m1);
    uint64x2_p y2 = (uint64x2_p)VecPermute(block2, block3, m2);
    uint64x2_p x3 = (uint64x2_p)VecPermute(block4, block5, m1);
    uint64x2_p y3 = (uint64x2_p)VecPermute(block4, block5, m2);

    for (int i = static_cast<int>(rounds-1); i >= 0; --i)
    {
        const uint64x2_p rk = vec_splats((unsigned long long)subkeys[i]);

        y1 = VecXor(y1, x1);
        y2 = VecXor(y2, x2);
        y3 = VecXor(y3, x3);
        y1 = RotateRight64<3>(y1);
        y2 = RotateRight64<3>(y2);
        y3 = RotateRight64<3>(y3);

        x1 = VecXor(x1, rk);
        x2 = VecXor(x2, rk);
        x3 = VecXor(x3, rk);
        x1 = VecSub(x1, y1);
        x2 = VecSub(x2, y2);
        x3 = VecSub(x3, y3);
        x1 = RotateLeft64<8>(x1);
        x2 = RotateLeft64<8>(x2);
        x3 = RotateLeft64<8>(x3);
    }

#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m3 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8};
    const uint8x16_p m4 = {23,22,21,20,19,18,17,16, 7,6,5,4,3,2,1,0};
#else
    const uint8x16_p m3 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16};
    const uint8x16_p m4 = {15,14,13,12,11,10,9,8, 31,30,29,28,27,26,25,24};
#endif

    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
    block0 = (uint32x4_p)VecPermute(x1, y1, m3);
    block1 = (uint32x4_p)VecPermute(x1, y1, m4);
    block2 = (uint32x4_p)VecPermute(x2, y2, m3);
    block3 = (uint32x4_p)VecPermute(x2, y2, m4);
    block4 = (uint32x4_p)VecPermute(x3, y3, m3);
    block5 = (uint32x4_p)VecPermute(x3, y3, m4);
}

#endif // CRYPTOPP_POWER8_AVAILABLE

ANONYMOUS_NAMESPACE_END

///////////////////////////////////////////////////////////////////////

NAMESPACE_BEGIN(CryptoPP)

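// The functions below are the entry points visible to the rest of the
// library. Each forwards to an AdvancedProcessBlocks128_6x2 (6x1 for Altivec)
// template from adv_simd.h, which, as the paired workers suggest, processes
// six blocks at a time where possible and falls back to the single-block
// workers for any remainder. Choosing between these SIMD paths and the
// portable C++ implementation is done by the caller (see speck.cpp, noted in
// the benchmarking comment near the top of the file).
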
// *************************** ARM NEON **************************** //

#if (CRYPTOPP_ARM_NEON_AVAILABLE)
size_t SPECK128_Enc_AdvancedProcessBlocks_NEON(const word64* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks128_6x2_NEON(SPECK128_Enc_Block, SPECK128_Enc_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}

size_t SPECK128_Dec_AdvancedProcessBlocks_NEON(const word64* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks128_6x2_NEON(SPECK128_Dec_Block, SPECK128_Dec_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
#endif // CRYPTOPP_ARM_NEON_AVAILABLE

// ***************************** IA-32 ***************************** //

#if defined(CRYPTOPP_SSSE3_AVAILABLE)
size_t SPECK128_Enc_AdvancedProcessBlocks_SSSE3(const word64* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks128_6x2_SSE(SPECK128_Enc_Block, SPECK128_Enc_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}

size_t SPECK128_Dec_AdvancedProcessBlocks_SSSE3(const word64* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks128_6x2_SSE(SPECK128_Dec_Block, SPECK128_Dec_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
#endif // CRYPTOPP_SSSE3_AVAILABLE

// ***************************** Power8 ***************************** //

#if defined(CRYPTOPP_POWER8_AVAILABLE)
size_t SPECK128_Enc_AdvancedProcessBlocks_POWER8(const word64* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks128_6x1_ALTIVEC(SPECK128_Enc_Block, SPECK128_Enc_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}

size_t SPECK128_Dec_AdvancedProcessBlocks_POWER8(const word64* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks128_6x1_ALTIVEC(SPECK128_Dec_Block, SPECK128_Dec_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
#endif // CRYPTOPP_POWER8_AVAILABLE

NAMESPACE_END