speck64_simd.cpp
// speck64_simd.cpp - written and placed in the public domain by Jeffrey Walton
//
// This source file uses intrinsics and built-ins to gain access to
// SSSE3, ARM NEON and ARMv8a, and Altivec instructions. A separate
// source file is needed because additional CXXFLAGS are required to enable
// the appropriate instruction sets in some build configurations.

#include "pch.h"
#include "config.h"

#include "speck.h"
#include "misc.h"

// Uncomment for benchmarking C++ against SSE or NEON.
// Do so in both speck.cpp and speck64_simd.cpp.
// #undef CRYPTOPP_SSE41_AVAILABLE
// #undef CRYPTOPP_ARM_NEON_AVAILABLE

#if (CRYPTOPP_SSSE3_AVAILABLE)
# include "adv_simd.h"
# include <pmmintrin.h>
# include <tmmintrin.h>
#endif

#if (CRYPTOPP_SSE41_AVAILABLE)
# include <smmintrin.h>
#endif

#if defined(__XOP__)
# include <ammintrin.h>
#endif

#if defined(__AVX512F__)
# define CRYPTOPP_AVX512_ROTATE 1
# include <immintrin.h>
#endif

// C1189: error: This header is specific to ARM targets
#if (CRYPTOPP_ARM_NEON_AVAILABLE)
# include "adv_simd.h"
# ifndef _M_ARM64
#  include <arm_neon.h>
# endif
#endif

#if (CRYPTOPP_ARM_ACLE_AVAILABLE)
# include <stdint.h>
# include <arm_acle.h>
#endif

#if defined(CRYPTOPP_ALTIVEC_AVAILABLE)
# include "adv_simd.h"
# include "ppc_simd.h"
#endif

// Squash MS LNK4221 and libtool warnings
extern const char SPECK64_SIMD_FNAME[] = __FILE__;

ANONYMOUS_NAMESPACE_BEGIN

using CryptoPP::byte;
using CryptoPP::word32;
using CryptoPP::word64;

// *************************** ARM NEON ************************** //

#if (CRYPTOPP_ARM_NEON_AVAILABLE)

template <class T>
inline T UnpackHigh32(const T& a, const T& b)
{
    const uint32x2_t x(vget_high_u32((uint32x4_t)a));
    const uint32x2_t y(vget_high_u32((uint32x4_t)b));
    const uint32x2x2_t r = vzip_u32(x, y);
    return (T)vcombine_u32(r.val[0], r.val[1]);
}

template <class T>
inline T UnpackLow32(const T& a, const T& b)
{
    const uint32x2_t x(vget_low_u32((uint32x4_t)a));
    const uint32x2_t y(vget_low_u32((uint32x4_t)b));
    const uint32x2x2_t r = vzip_u32(x, y);
    return (T)vcombine_u32(r.val[0], r.val[1]);
}

template <unsigned int R>
inline uint32x4_t RotateLeft32(const uint32x4_t& val)
{
    const uint32x4_t a(vshlq_n_u32(val, R));
    const uint32x4_t b(vshrq_n_u32(val, 32 - R));
    return vorrq_u32(a, b);
}

template <unsigned int R>
inline uint32x4_t RotateRight32(const uint32x4_t& val)
{
    const uint32x4_t a(vshlq_n_u32(val, 32 - R));
    const uint32x4_t b(vshrq_n_u32(val, R));
    return vorrq_u32(a, b);
}

#if defined(__aarch32__) || defined(__aarch64__)
// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
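// Rotation by 8 is a whole-byte rotation of each 32-bit lane, so a single
// TBL byte shuffle can replace the shift/shift/or sequence used above.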
template <>
inline uint32x4_t RotateLeft32<8>(const uint32x4_t& val)
{
    const uint8_t maskb[16] = { 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 };
    const uint8x16_t mask = vld1q_u8(maskb);

    return vreinterpretq_u32_u8(
        vqtbl1q_u8(vreinterpretq_u8_u32(val), mask));
}

// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
template <>
inline uint32x4_t RotateRight32<8>(const uint32x4_t& val)
{
    const uint8_t maskb[16] = { 1,2,3,0, 5,6,7,4, 9,10,11,8, 13,14,15,12 };
    const uint8x16_t mask = vld1q_u8(maskb);

    return vreinterpretq_u32_u8(
        vqtbl1q_u8(vreinterpretq_u8_u32(val), mask));
}
#endif // Aarch32 or Aarch64

inline void SPECK64_Enc_Block(uint32x4_t &block0, uint32x4_t &block1,
    const word32 *subkeys, unsigned int rounds)
{
    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
    uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
    uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];

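    // Each iteration applies one SPECK64 round to the blocks held in the
    // two registers: x = ((x >>> 8) + y) ^ k;  y = (y <<< 3) ^ x.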
    for (int i=0; i < static_cast<int>(rounds); ++i)
    {
        const uint32x4_t rk = vdupq_n_u32(subkeys[i]);

        x1 = RotateRight32<8>(x1);
        x1 = vaddq_u32(x1, y1);
        x1 = veorq_u32(x1, rk);
        y1 = RotateLeft32<3>(y1);
        y1 = veorq_u32(y1, x1);
    }

    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
    block0 = UnpackLow32(y1, x1);
    block1 = UnpackHigh32(y1, x1);
}

inline void SPECK64_Dec_Block(uint32x4_t &block0, uint32x4_t &block1,
    const word32 *subkeys, unsigned int rounds)
{
    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
    uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
    uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];

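    // Each iteration undoes one round: y = (y ^ x) >>> 3;  x = ((x ^ k) - y) <<< 8.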
    for (int i = static_cast<int>(rounds-1); i >= 0; --i)
    {
        const uint32x4_t rk = vdupq_n_u32(subkeys[i]);

        y1 = veorq_u32(y1, x1);
        y1 = RotateRight32<3>(y1);
        x1 = veorq_u32(x1, rk);
        x1 = vsubq_u32(x1, y1);
        x1 = RotateLeft32<8>(x1);
    }

    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
    block0 = UnpackLow32(y1, x1);
    block1 = UnpackHigh32(y1, x1);
}

inline void SPECK64_Enc_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
    uint32x4_t &block2, uint32x4_t &block3, uint32x4_t &block4, uint32x4_t &block5,
    const word32 *subkeys, unsigned int rounds)
{
    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
    uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
    uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];
    uint32x4_t x2 = vuzpq_u32(block2, block3).val[1];
    uint32x4_t y2 = vuzpq_u32(block2, block3).val[0];
    uint32x4_t x3 = vuzpq_u32(block4, block5).val[1];
    uint32x4_t y3 = vuzpq_u32(block4, block5).val[0];

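    // Same round function as SPECK64_Enc_Block, applied to three independent
    // register pairs so the additions and rotations can overlap in the pipeline.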
    for (int i=0; i < static_cast<int>(rounds); ++i)
    {
        const uint32x4_t rk = vdupq_n_u32(subkeys[i]);

        x1 = RotateRight32<8>(x1);
        x2 = RotateRight32<8>(x2);
        x3 = RotateRight32<8>(x3);
        x1 = vaddq_u32(x1, y1);
        x2 = vaddq_u32(x2, y2);
        x3 = vaddq_u32(x3, y3);
        x1 = veorq_u32(x1, rk);
        x2 = veorq_u32(x2, rk);
        x3 = veorq_u32(x3, rk);
        y1 = RotateLeft32<3>(y1);
        y2 = RotateLeft32<3>(y2);
        y3 = RotateLeft32<3>(y3);
        y1 = veorq_u32(y1, x1);
        y2 = veorq_u32(y2, x2);
        y3 = veorq_u32(y3, x3);
    }

    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
    block0 = UnpackLow32(y1, x1);
    block1 = UnpackHigh32(y1, x1);
    block2 = UnpackLow32(y2, x2);
    block3 = UnpackHigh32(y2, x2);
    block4 = UnpackLow32(y3, x3);
    block5 = UnpackHigh32(y3, x3);
}

inline void SPECK64_Dec_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
    uint32x4_t &block2, uint32x4_t &block3, uint32x4_t &block4, uint32x4_t &block5,
    const word32 *subkeys, unsigned int rounds)
{
    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
    uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
    uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];
    uint32x4_t x2 = vuzpq_u32(block2, block3).val[1];
    uint32x4_t y2 = vuzpq_u32(block2, block3).val[0];
    uint32x4_t x3 = vuzpq_u32(block4, block5).val[1];
    uint32x4_t y3 = vuzpq_u32(block4, block5).val[0];

    for (int i = static_cast<int>(rounds-1); i >= 0; --i)
    {
        const uint32x4_t rk = vdupq_n_u32(subkeys[i]);

        y1 = veorq_u32(y1, x1);
        y2 = veorq_u32(y2, x2);
        y3 = veorq_u32(y3, x3);
        y1 = RotateRight32<3>(y1);
        y2 = RotateRight32<3>(y2);
        y3 = RotateRight32<3>(y3);
        x1 = veorq_u32(x1, rk);
        x2 = veorq_u32(x2, rk);
        x3 = veorq_u32(x3, rk);
        x1 = vsubq_u32(x1, y1);
        x2 = vsubq_u32(x2, y2);
        x3 = vsubq_u32(x3, y3);
        x1 = RotateLeft32<8>(x1);
        x2 = RotateLeft32<8>(x2);
        x3 = RotateLeft32<8>(x3);
    }

    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
    block0 = UnpackLow32(y1, x1);
    block1 = UnpackHigh32(y1, x1);
    block2 = UnpackLow32(y2, x2);
    block3 = UnpackHigh32(y2, x2);
    block4 = UnpackLow32(y3, x3);
    block5 = UnpackHigh32(y3, x3);
}

#endif // CRYPTOPP_ARM_NEON_AVAILABLE

// ***************************** IA-32 ***************************** //

#if defined(CRYPTOPP_SSE41_AVAILABLE)

template <unsigned int R>
inline __m128i RotateLeft32(const __m128i& val)
{
#if defined(__XOP__)
    return _mm_roti_epi32(val, R);
#else
    return _mm_or_si128(
        _mm_slli_epi32(val, R), _mm_srli_epi32(val, 32-R));
#endif
}

template <unsigned int R>
inline __m128i RotateRight32(const __m128i& val)
{
#if defined(__XOP__)
    return _mm_roti_epi32(val, 32-R);
#else
    return _mm_or_si128(
        _mm_slli_epi32(val, 32-R), _mm_srli_epi32(val, R));
#endif
}

// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
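// A single PSHUFB byte shuffle (SSSE3) rotates every 32-bit lane by 8 bits,
// avoiding the shift/shift/or sequence above.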
template <>
__m128i RotateLeft32<8>(const __m128i& val)
{
#if defined(__XOP__)
    return _mm_roti_epi32(val, 8);
#else
    const __m128i mask = _mm_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3);
    return _mm_shuffle_epi8(val, mask);
#endif
}

// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
template <>
__m128i RotateRight32<8>(const __m128i& val)
{
#if defined(__XOP__)
    return _mm_roti_epi32(val, 32-8);
#else
    const __m128i mask = _mm_set_epi8(12,15,14,13, 8,11,10,9, 4,7,6,5, 0,3,2,1);
    return _mm_shuffle_epi8(val, mask);
#endif
}

inline void SPECK64_Enc_Block(__m128i &block0, __m128i &block1,
    const word32 *subkeys, unsigned int rounds)
{
    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
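    // _mm_shuffle_ps over float-cast data plays the role of NEON vuzpq_u32
    // here: it gathers the odd-indexed words into x1 and the even-indexed
    // words into y1.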
    const __m128 t0 = _mm_castsi128_ps(block0);
    const __m128 t1 = _mm_castsi128_ps(block1);
    __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
    __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));

    for (int i=0; i < static_cast<int>(rounds); ++i)
    {
        const __m128i rk = _mm_set1_epi32(subkeys[i]);

        x1 = RotateRight32<8>(x1);
        x1 = _mm_add_epi32(x1, y1);
        x1 = _mm_xor_si128(x1, rk);
        y1 = RotateLeft32<3>(y1);
        y1 = _mm_xor_si128(y1, x1);
    }

    // This is roughly the SSE equivalent to ARM vzip_u32
    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
    block0 = _mm_unpacklo_epi32(y1, x1);
    block1 = _mm_unpackhi_epi32(y1, x1);
}

inline void SPECK64_Dec_Block(__m128i &block0, __m128i &block1,
    const word32 *subkeys, unsigned int rounds)
{
    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
    const __m128 t0 = _mm_castsi128_ps(block0);
    const __m128 t1 = _mm_castsi128_ps(block1);
    __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
    __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));

    for (int i = static_cast<int>(rounds-1); i >= 0; --i)
    {
        const __m128i rk = _mm_set1_epi32(subkeys[i]);

        y1 = _mm_xor_si128(y1, x1);
        y1 = RotateRight32<3>(y1);
        x1 = _mm_xor_si128(x1, rk);
        x1 = _mm_sub_epi32(x1, y1);
        x1 = RotateLeft32<8>(x1);
    }

    // This is roughly the SSE equivalent to ARM vzip_u32
    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
    block0 = _mm_unpacklo_epi32(y1, x1);
    block1 = _mm_unpackhi_epi32(y1, x1);
}

inline void SPECK64_Enc_6_Blocks(__m128i &block0, __m128i &block1,
    __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
    const word32 *subkeys, unsigned int rounds)
{
    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
    const __m128 t0 = _mm_castsi128_ps(block0);
    const __m128 t1 = _mm_castsi128_ps(block1);
    __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
    __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));

    const __m128 t2 = _mm_castsi128_ps(block2);
    const __m128 t3 = _mm_castsi128_ps(block3);
    __m128i x2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(3,1,3,1)));
    __m128i y2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(2,0,2,0)));

    const __m128 t4 = _mm_castsi128_ps(block4);
    const __m128 t5 = _mm_castsi128_ps(block5);
    __m128i x3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(3,1,3,1)));
    __m128i y3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(2,0,2,0)));

    for (int i=0; i < static_cast<int>(rounds); ++i)
    {
        const __m128i rk = _mm_set1_epi32(subkeys[i]);

        x1 = RotateRight32<8>(x1);
        x2 = RotateRight32<8>(x2);
        x3 = RotateRight32<8>(x3);
        x1 = _mm_add_epi32(x1, y1);
        x2 = _mm_add_epi32(x2, y2);
        x3 = _mm_add_epi32(x3, y3);
        x1 = _mm_xor_si128(x1, rk);
        x2 = _mm_xor_si128(x2, rk);
        x3 = _mm_xor_si128(x3, rk);
        y1 = RotateLeft32<3>(y1);
        y2 = RotateLeft32<3>(y2);
        y3 = RotateLeft32<3>(y3);
        y1 = _mm_xor_si128(y1, x1);
        y2 = _mm_xor_si128(y2, x2);
        y3 = _mm_xor_si128(y3, x3);
    }

    // This is roughly the SSE equivalent to ARM vzip_u32
    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
    block0 = _mm_unpacklo_epi32(y1, x1);
    block1 = _mm_unpackhi_epi32(y1, x1);
    block2 = _mm_unpacklo_epi32(y2, x2);
    block3 = _mm_unpackhi_epi32(y2, x2);
    block4 = _mm_unpacklo_epi32(y3, x3);
    block5 = _mm_unpackhi_epi32(y3, x3);
}

inline void SPECK64_Dec_6_Blocks(__m128i &block0, __m128i &block1,
    __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
    const word32 *subkeys, unsigned int rounds)
{
    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
    const __m128 t0 = _mm_castsi128_ps(block0);
    const __m128 t1 = _mm_castsi128_ps(block1);
    __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
    __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));

    const __m128 t2 = _mm_castsi128_ps(block2);
    const __m128 t3 = _mm_castsi128_ps(block3);
    __m128i x2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(3,1,3,1)));
    __m128i y2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(2,0,2,0)));

    const __m128 t4 = _mm_castsi128_ps(block4);
    const __m128 t5 = _mm_castsi128_ps(block5);
    __m128i x3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(3,1,3,1)));
    __m128i y3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(2,0,2,0)));

    for (int i = static_cast<int>(rounds-1); i >= 0; --i)
    {
        const __m128i rk = _mm_set1_epi32(subkeys[i]);

        y1 = _mm_xor_si128(y1, x1);
        y2 = _mm_xor_si128(y2, x2);
        y3 = _mm_xor_si128(y3, x3);
        y1 = RotateRight32<3>(y1);
        y2 = RotateRight32<3>(y2);
        y3 = RotateRight32<3>(y3);
        x1 = _mm_xor_si128(x1, rk);
        x2 = _mm_xor_si128(x2, rk);
        x3 = _mm_xor_si128(x3, rk);
        x1 = _mm_sub_epi32(x1, y1);
        x2 = _mm_sub_epi32(x2, y2);
        x3 = _mm_sub_epi32(x3, y3);
        x1 = RotateLeft32<8>(x1);
        x2 = RotateLeft32<8>(x2);
        x3 = RotateLeft32<8>(x3);
    }

    // This is roughly the SSE equivalent to ARM vzip_u32
    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
    block0 = _mm_unpacklo_epi32(y1, x1);
    block1 = _mm_unpackhi_epi32(y1, x1);
    block2 = _mm_unpacklo_epi32(y2, x2);
    block3 = _mm_unpackhi_epi32(y2, x2);
    block4 = _mm_unpacklo_epi32(y3, x3);
    block5 = _mm_unpackhi_epi32(y3, x3);
}

#endif // CRYPTOPP_SSE41_AVAILABLE

// ***************************** Altivec ***************************** //

#if defined(CRYPTOPP_ALTIVEC_AVAILABLE)

using CryptoPP::uint8x16_p;
using CryptoPP::uint32x4_p;

using CryptoPP::VecAdd;
using CryptoPP::VecSub;
using CryptoPP::VecXor;
using CryptoPP::VecLoad;
using CryptoPP::VecPermute;

// Rotate left by bit count
template<unsigned int C>
inline uint32x4_p RotateLeft32(const uint32x4_p val)
{
    const uint32x4_p m = {C, C, C, C};
    return vec_rl(val, m);
}

// Rotate right by bit count
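// Altivec only provides a rotate-left (vec_rl), so a right rotation by C is
// performed as a left rotation by 32-C.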
template<unsigned int C>
inline uint32x4_p RotateRight32(const uint32x4_p val)
{
    const uint32x4_p m = {32-C, 32-C, 32-C, 32-C};
    return vec_rl(val, m);
}

void SPECK64_Enc_Block(uint32x4_p &block0, uint32x4_p &block1,
    const word32 *subkeys, unsigned int rounds)
{
#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m1 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
    const uint8x16_p m2 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
#else
    const uint8x16_p m1 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
    const uint8x16_p m2 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
#endif

    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
    uint32x4_p x1 = VecPermute(block0, block1, m1);
    uint32x4_p y1 = VecPermute(block0, block1, m2);

    for (int i=0; i < static_cast<int>(rounds); ++i)
    {
#if CRYPTOPP_POWER8_AVAILABLE
        const uint32x4_p rk = vec_splats(subkeys[i]);
#else
        // subkeys has extra elements so memory backs the last subkey
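        // The permute mask repeats byte offsets 0..3, broadcasting the first
        // 32-bit lane (subkeys[i]) into every element of rk.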
        const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
        uint32x4_p rk = VecLoad(subkeys+i);
        rk = VecPermute(rk, rk, m);
#endif

        x1 = RotateRight32<8>(x1);
        x1 = VecAdd(x1, y1);
        x1 = VecXor(x1, rk);

        y1 = RotateLeft32<3>(y1);
        y1 = VecXor(y1, x1);
    }

#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m3 = {19,18,17,16, 3,2,1,0, 23,22,21,20, 7,6,5,4};
    const uint8x16_p m4 = {27,26,25,24, 11,10,9,8, 31,30,29,28, 15,14,13,12};
#else
    const uint8x16_p m3 = {3,2,1,0, 19,18,17,16, 7,6,5,4, 23,22,21,20};
    const uint8x16_p m4 = {11,10,9,8, 27,26,25,24, 15,14,13,12, 31,30,29,28};
#endif

    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
    block0 = (uint32x4_p)VecPermute(x1, y1, m3);
    block1 = (uint32x4_p)VecPermute(x1, y1, m4);
}

void SPECK64_Dec_Block(uint32x4_p &block0, uint32x4_p &block1,
    const word32 *subkeys, unsigned int rounds)
{
#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m1 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
    const uint8x16_p m2 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
#else
    const uint8x16_p m1 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
    const uint8x16_p m2 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
#endif

    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
    uint32x4_p x1 = VecPermute(block0, block1, m1);
    uint32x4_p y1 = VecPermute(block0, block1, m2);

    for (int i = static_cast<int>(rounds-1); i >= 0; --i)
    {
#if CRYPTOPP_POWER8_AVAILABLE
        const uint32x4_p rk = vec_splats(subkeys[i]);
#else
        // subkeys has extra elements so memory backs the last subkey
        const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
        uint32x4_p rk = VecLoad(subkeys+i);
        rk = VecPermute(rk, rk, m);
#endif

        y1 = VecXor(y1, x1);
        y1 = RotateRight32<3>(y1);

        x1 = VecXor(x1, rk);
        x1 = VecSub(x1, y1);
        x1 = RotateLeft32<8>(x1);
    }

#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m3 = {19,18,17,16, 3,2,1,0, 23,22,21,20, 7,6,5,4};
    const uint8x16_p m4 = {27,26,25,24, 11,10,9,8, 31,30,29,28, 15,14,13,12};
#else
    const uint8x16_p m3 = {3,2,1,0, 19,18,17,16, 7,6,5,4, 23,22,21,20};
    const uint8x16_p m4 = {11,10,9,8, 27,26,25,24, 15,14,13,12, 31,30,29,28};
#endif

    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
    block0 = (uint32x4_p)VecPermute(x1, y1, m3);
    block1 = (uint32x4_p)VecPermute(x1, y1, m4);
}

void SPECK64_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
    uint32x4_p &block2, uint32x4_p &block3, uint32x4_p &block4,
    uint32x4_p &block5, const word32 *subkeys, unsigned int rounds)
{
#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m1 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
    const uint8x16_p m2 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
#else
    const uint8x16_p m1 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
    const uint8x16_p m2 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
#endif

    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
    uint32x4_p x1 = (uint32x4_p)VecPermute(block0, block1, m1);
    uint32x4_p y1 = (uint32x4_p)VecPermute(block0, block1, m2);
    uint32x4_p x2 = (uint32x4_p)VecPermute(block2, block3, m1);
    uint32x4_p y2 = (uint32x4_p)VecPermute(block2, block3, m2);
    uint32x4_p x3 = (uint32x4_p)VecPermute(block4, block5, m1);
    uint32x4_p y3 = (uint32x4_p)VecPermute(block4, block5, m2);

    for (int i=0; i < static_cast<int>(rounds); ++i)
    {
#if CRYPTOPP_POWER8_AVAILABLE
        const uint32x4_p rk = vec_splats(subkeys[i]);
#else
        // subkeys has extra elements so memory backs the last subkey
        const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
        uint32x4_p rk = VecLoad(subkeys+i);
        rk = VecPermute(rk, rk, m);
#endif

        x1 = RotateRight32<8>(x1);
        x2 = RotateRight32<8>(x2);
        x3 = RotateRight32<8>(x3);

        x1 = VecAdd(x1, y1);
        x2 = VecAdd(x2, y2);
        x3 = VecAdd(x3, y3);

        x1 = VecXor(x1, rk);
        x2 = VecXor(x2, rk);
        x3 = VecXor(x3, rk);

        y1 = RotateLeft32<3>(y1);
        y2 = RotateLeft32<3>(y2);
        y3 = RotateLeft32<3>(y3);

        y1 = VecXor(y1, x1);
        y2 = VecXor(y2, x2);
        y3 = VecXor(y3, x3);
    }

#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m3 = {19,18,17,16, 3,2,1,0, 23,22,21,20, 7,6,5,4};
    const uint8x16_p m4 = {27,26,25,24, 11,10,9,8, 31,30,29,28, 15,14,13,12};
#else
    const uint8x16_p m3 = {3,2,1,0, 19,18,17,16, 7,6,5,4, 23,22,21,20};
    const uint8x16_p m4 = {11,10,9,8, 27,26,25,24, 15,14,13,12, 31,30,29,28};
#endif

    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
    block0 = (uint32x4_p)VecPermute(x1, y1, m3);
    block1 = (uint32x4_p)VecPermute(x1, y1, m4);
    block2 = (uint32x4_p)VecPermute(x2, y2, m3);
    block3 = (uint32x4_p)VecPermute(x2, y2, m4);
    block4 = (uint32x4_p)VecPermute(x3, y3, m3);
    block5 = (uint32x4_p)VecPermute(x3, y3, m4);
}

void SPECK64_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
    uint32x4_p &block2, uint32x4_p &block3, uint32x4_p &block4,
    uint32x4_p &block5, const word32 *subkeys, unsigned int rounds)
{
#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m1 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
    const uint8x16_p m2 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
#else
    const uint8x16_p m1 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
    const uint8x16_p m2 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
#endif

    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
    uint32x4_p x1 = (uint32x4_p)VecPermute(block0, block1, m1);
    uint32x4_p y1 = (uint32x4_p)VecPermute(block0, block1, m2);
    uint32x4_p x2 = (uint32x4_p)VecPermute(block2, block3, m1);
    uint32x4_p y2 = (uint32x4_p)VecPermute(block2, block3, m2);
    uint32x4_p x3 = (uint32x4_p)VecPermute(block4, block5, m1);
    uint32x4_p y3 = (uint32x4_p)VecPermute(block4, block5, m2);

    for (int i = static_cast<int>(rounds-1); i >= 0; --i)
    {
#if CRYPTOPP_POWER8_AVAILABLE
        const uint32x4_p rk = vec_splats(subkeys[i]);
#else
        // subkeys has extra elements so memory backs the last subkey
        const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
        uint32x4_p rk = VecLoad(subkeys+i);
        rk = VecPermute(rk, rk, m);
#endif

        y1 = VecXor(y1, x1);
        y2 = VecXor(y2, x2);
        y3 = VecXor(y3, x3);

        y1 = RotateRight32<3>(y1);
        y2 = RotateRight32<3>(y2);
        y3 = RotateRight32<3>(y3);

        x1 = VecXor(x1, rk);
        x2 = VecXor(x2, rk);
        x3 = VecXor(x3, rk);

        x1 = VecSub(x1, y1);
        x2 = VecSub(x2, y2);
        x3 = VecSub(x3, y3);

        x1 = RotateLeft32<8>(x1);
        x2 = RotateLeft32<8>(x2);
        x3 = RotateLeft32<8>(x3);
    }

#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m3 = {19,18,17,16, 3,2,1,0, 23,22,21,20, 7,6,5,4};
    const uint8x16_p m4 = {27,26,25,24, 11,10,9,8, 31,30,29,28, 15,14,13,12};
#else
    const uint8x16_p m3 = {3,2,1,0, 19,18,17,16, 7,6,5,4, 23,22,21,20};
    const uint8x16_p m4 = {11,10,9,8, 27,26,25,24, 15,14,13,12, 31,30,29,28};
#endif

    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
    block0 = (uint32x4_p)VecPermute(x1, y1, m3);
    block1 = (uint32x4_p)VecPermute(x1, y1, m4);
    block2 = (uint32x4_p)VecPermute(x2, y2, m3);
    block3 = (uint32x4_p)VecPermute(x2, y2, m4);
    block4 = (uint32x4_p)VecPermute(x3, y3, m3);
    block5 = (uint32x4_p)VecPermute(x3, y3, m4);
}

#endif // CRYPTOPP_ALTIVEC_AVAILABLE

ANONYMOUS_NAMESPACE_END

///////////////////////////////////////////////////////////////////////

NAMESPACE_BEGIN(CryptoPP)

// *************************** ARM NEON **************************** //

#if (CRYPTOPP_ARM_NEON_AVAILABLE)
size_t SPECK64_Enc_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
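    // AdvancedProcessBlocks64_6x2_NEON (adv_simd.h) handles input/output
    // layout, the xorBlocks masking and partial tails, dispatching between
    // the 6-register and 2-register routines above.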
    return AdvancedProcessBlocks64_6x2_NEON(SPECK64_Enc_Block, SPECK64_Enc_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}

size_t SPECK64_Dec_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks64_6x2_NEON(SPECK64_Dec_Block, SPECK64_Dec_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
#endif

// ***************************** IA-32 ***************************** //

#if defined(CRYPTOPP_SSE41_AVAILABLE)
size_t SPECK64_Enc_AdvancedProcessBlocks_SSE41(const word32* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks64_6x2_SSE(SPECK64_Enc_Block, SPECK64_Enc_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}

size_t SPECK64_Dec_AdvancedProcessBlocks_SSE41(const word32* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks64_6x2_SSE(SPECK64_Dec_Block, SPECK64_Dec_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
#endif

// ***************************** Altivec ***************************** //

#if defined(CRYPTOPP_ALTIVEC_AVAILABLE)
size_t SPECK64_Enc_AdvancedProcessBlocks_ALTIVEC(const word32* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks64_6x2_ALTIVEC(SPECK64_Enc_Block, SPECK64_Enc_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}

size_t SPECK64_Dec_AdvancedProcessBlocks_ALTIVEC(const word32* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks64_6x2_ALTIVEC(SPECK64_Dec_Block, SPECK64_Dec_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
#endif

NAMESPACE_END