simon64_simd.cpp
1 // simon64_simd.cpp - written and placed in the public domain by Jeffrey Walton
2 //
3 // This source file uses intrinsics and built-ins to gain access to
4 // SSSE3, ARM NEON and ARMv8a, and Altivec instructions. A separate
5 // source file is needed because additional CXXFLAGS are required to enable
6 // the appropriate instruction sets in some build configurations.
7 
8 #include "pch.h"
9 #include "config.h"
10 
11 #include "simon.h"
12 #include "misc.h"
13 
14 // Uncomment for benchmarking C++ against SSE or NEON.
15 // Do so in both simon.cpp and simon64_simd.cpp.
16 // #undef CRYPTOPP_SSE41_AVAILABLE
17 // #undef CRYPTOPP_ARM_NEON_AVAILABLE
18 
19 #if (CRYPTOPP_SSSE3_AVAILABLE)
20 # include "adv_simd.h"
21 # include <pmmintrin.h>
22 # include <tmmintrin.h>
23 #endif
24 
25 #if (CRYPTOPP_SSE41_AVAILABLE)
26 # include <smmintrin.h>
27 #endif
28 
29 #if defined(__XOP__)
30 # include <ammintrin.h>
31 #endif
32 
33 #if defined(__AVX512F__)
34 # define CRYPTOPP_AVX512_ROTATE 1
35 # include <immintrin.h>
36 #endif
37 
38 // C1189: error: This header is specific to ARM targets
39 #if (CRYPTOPP_ARM_NEON_AVAILABLE)
40 # include "adv_simd.h"
41 # ifndef _M_ARM64
42 # include <arm_neon.h>
43 # endif
44 #endif
45 
46 #if (CRYPTOPP_ARM_ACLE_AVAILABLE)
47 # include <stdint.h>
48 # include <arm_acle.h>
49 #endif
50 
51 #if defined(CRYPTOPP_ALTIVEC_AVAILABLE)
52 # include "adv_simd.h"
53 # include "ppc_simd.h"
54 #endif
55 
56 // Squash MS LNK4221 and libtool warnings
57 extern const char SIMON64_SIMD_FNAME[] = __FILE__;
58 
59 ANONYMOUS_NAMESPACE_BEGIN
60 
61 using CryptoPP::byte;
62 using CryptoPP::word32;
63 using CryptoPP::word64;
64 using CryptoPP::vec_swap; // SunCC
65 
66 // *************************** ARM NEON ************************** //
67 
68 #if (CRYPTOPP_ARM_NEON_AVAILABLE)
69 
70 template <class T>
71 inline T UnpackHigh32(const T& a, const T& b)
72 {
73  const uint32x2_t x(vget_high_u32((uint32x4_t)a));
74  const uint32x2_t y(vget_high_u32((uint32x4_t)b));
75  const uint32x2x2_t r = vzip_u32(x, y);
76  return (T)vcombine_u32(r.val[0], r.val[1]);
77 }
78 
79 template <class T>
80 inline T UnpackLow32(const T& a, const T& b)
81 {
82  const uint32x2_t x(vget_low_u32((uint32x4_t)a));
83  const uint32x2_t y(vget_low_u32((uint32x4_t)b));
84  const uint32x2x2_t r = vzip_u32(x, y);
85  return (T)vcombine_u32(r.val[0], r.val[1]);
86 }
87 
88 template <unsigned int R>
89 inline uint32x4_t RotateLeft32(const uint32x4_t& val)
90 {
91  const uint32x4_t a(vshlq_n_u32(val, R));
92  const uint32x4_t b(vshrq_n_u32(val, 32 - R));
93  return vorrq_u32(a, b);
94 }
95 
96 template <unsigned int R>
97 inline uint32x4_t RotateRight32(const uint32x4_t& val)
98 {
99  const uint32x4_t a(vshlq_n_u32(val, 32 - R));
100  const uint32x4_t b(vshrq_n_u32(val, R));
101  return vorrq_u32(a, b);
102 }
103 
104 #if defined(__aarch32__) || defined(__aarch64__)
105 // Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
106 template <>
107 inline uint32x4_t RotateLeft32<8>(const uint32x4_t& val)
108 {
109  const uint8_t maskb[16] = { 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 };
110  const uint8x16_t mask = vld1q_u8(maskb);
111 
112  return vreinterpretq_u32_u8(
113  vqtbl1q_u8(vreinterpretq_u8_u32(val), mask));
114 }
115 
116 // Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
117 template <>
118 inline uint32x4_t RotateRight32<8>(const uint32x4_t& val)
119 {
120  const uint8_t maskb[16] = { 1,2,3,0, 5,6,7,4, 9,10,11,8, 13,14,15,12 };
121  const uint8x16_t mask = vld1q_u8(maskb);
122 
123  return vreinterpretq_u32_u8(
124  vqtbl1q_u8(vreinterpretq_u8_u32(val), mask));
125 }
126 #endif
127 
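// SIMON round function: f(x) = (x <<< 1 & x <<< 8) ^ (x <<< 2), evaluated on four
// 32-bit words at a time using the NEON rotations defined above.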
128 inline uint32x4_t SIMON64_f(const uint32x4_t& val)
129 {
130  return veorq_u32(RotateLeft32<2>(val),
131  vandq_u32(RotateLeft32<1>(val), RotateLeft32<8>(val)));
132 }
133 
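// Encrypts two 128-bit vectors holding four 64-bit SIMON64 blocks. The loop applies
// two round keys per iteration; an odd trailing round is handled after the loop and
// is followed by a swap of the halves.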
134 inline void SIMON64_Enc_Block(uint32x4_t &block1, uint32x4_t &block0,
135  const word32 *subkeys, unsigned int rounds)
136 {
137  // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
138  uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
139  uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];
140 
141  for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
142  {
143  const uint32x4_t rk1 = vld1q_dup_u32(subkeys+i);
144  y1 = veorq_u32(veorq_u32(y1, SIMON64_f(x1)), rk1);
145 
146  const uint32x4_t rk2 = vld1q_dup_u32(subkeys+i+1);
147  x1 = veorq_u32(veorq_u32(x1, SIMON64_f(y1)), rk2);
148  }
149 
150  if (rounds & 1)
151  {
152  const uint32x4_t rk = vld1q_dup_u32(subkeys+rounds-1);
153 
154  y1 = veorq_u32(veorq_u32(y1, SIMON64_f(x1)), rk);
155  std::swap(x1, y1);
156  }
157 
158  // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
159  block0 = UnpackLow32(y1, x1);
160  block1 = UnpackHigh32(y1, x1);
161 }
162 
163 inline void SIMON64_Dec_Block(uint32x4_t &block0, uint32x4_t &block1,
164  const word32 *subkeys, unsigned int rounds)
165 {
166  // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
167  uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
168  uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];
169 
170  if (rounds & 1)
171  {
172  std::swap(x1, y1);
173  const uint32x4_t rk = vld1q_dup_u32(subkeys + rounds - 1);
174 
175  y1 = veorq_u32(veorq_u32(y1, rk), SIMON64_f(x1));
176  rounds--;
177  }
178 
179  for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
180  {
181  const uint32x4_t rk1 = vld1q_dup_u32(subkeys+i+1);
182  x1 = veorq_u32(veorq_u32(x1, SIMON64_f(y1)), rk1);
183 
184  const uint32x4_t rk2 = vld1q_dup_u32(subkeys+i);
185  y1 = veorq_u32(veorq_u32(y1, SIMON64_f(x1)), rk2);
186  }
187 
188  // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
189  block0 = UnpackLow32(y1, x1);
190  block1 = UnpackHigh32(y1, x1);
191 }
192 
193 inline void SIMON64_Enc_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
194  uint32x4_t &block2, uint32x4_t &block3, uint32x4_t &block4, uint32x4_t &block5,
195  const word32 *subkeys, unsigned int rounds)
196 {
197  // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
198  uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
199  uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];
200  uint32x4_t x2 = vuzpq_u32(block2, block3).val[1];
201  uint32x4_t y2 = vuzpq_u32(block2, block3).val[0];
202  uint32x4_t x3 = vuzpq_u32(block4, block5).val[1];
203  uint32x4_t y3 = vuzpq_u32(block4, block5).val[0];
204 
205  for (int i = 0; i < static_cast<int>(rounds & ~1) - 1; i += 2)
206  {
207  const uint32x4_t rk1 = vld1q_dup_u32(subkeys+i);
208  y1 = veorq_u32(veorq_u32(y1, SIMON64_f(x1)), rk1);
209  y2 = veorq_u32(veorq_u32(y2, SIMON64_f(x2)), rk1);
210  y3 = veorq_u32(veorq_u32(y3, SIMON64_f(x3)), rk1);
211 
212  const uint32x4_t rk2 = vld1q_dup_u32(subkeys+i+1);
213  x1 = veorq_u32(veorq_u32(x1, SIMON64_f(y1)), rk2);
214  x2 = veorq_u32(veorq_u32(x2, SIMON64_f(y2)), rk2);
215  x3 = veorq_u32(veorq_u32(x3, SIMON64_f(y3)), rk2);
216  }
217 
218  if (rounds & 1)
219  {
220  const uint32x4_t rk = vld1q_dup_u32(subkeys + rounds - 1);
221 
222  y1 = veorq_u32(veorq_u32(y1, SIMON64_f(x1)), rk);
223  y2 = veorq_u32(veorq_u32(y2, SIMON64_f(x2)), rk);
224  y3 = veorq_u32(veorq_u32(y3, SIMON64_f(x3)), rk);
225  std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3);
226  }
227 
228  // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
229  block0 = UnpackLow32(y1, x1);
230  block1 = UnpackHigh32(y1, x1);
231  block2 = UnpackLow32(y2, x2);
232  block3 = UnpackHigh32(y2, x2);
233  block4 = UnpackLow32(y3, x3);
234  block5 = UnpackHigh32(y3, x3);
235 }
236 
237 inline void SIMON64_Dec_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
238  uint32x4_t &block2, uint32x4_t &block3, uint32x4_t &block4, uint32x4_t &block5,
239  const word32 *subkeys, unsigned int rounds)
240 {
241  // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
242  uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
243  uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];
244  uint32x4_t x2 = vuzpq_u32(block2, block3).val[1];
245  uint32x4_t y2 = vuzpq_u32(block2, block3).val[0];
246  uint32x4_t x3 = vuzpq_u32(block4, block5).val[1];
247  uint32x4_t y3 = vuzpq_u32(block4, block5).val[0];
248 
249  if (rounds & 1)
250  {
251  std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3);
252  const uint32x4_t rk = vld1q_dup_u32(subkeys + rounds - 1);
253 
254  y1 = veorq_u32(veorq_u32(y1, rk), SIMON64_f(x1));
255  y2 = veorq_u32(veorq_u32(y2, rk), SIMON64_f(x2));
256  y3 = veorq_u32(veorq_u32(y3, rk), SIMON64_f(x3));
257  rounds--;
258  }
259 
260  for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
261  {
262  const uint32x4_t rk1 = vld1q_dup_u32(subkeys + i + 1);
263  x1 = veorq_u32(veorq_u32(x1, SIMON64_f(y1)), rk1);
264  x2 = veorq_u32(veorq_u32(x2, SIMON64_f(y2)), rk1);
265  x3 = veorq_u32(veorq_u32(x3, SIMON64_f(y3)), rk1);
266 
267  const uint32x4_t rk2 = vld1q_dup_u32(subkeys + i);
268  y1 = veorq_u32(veorq_u32(y1, SIMON64_f(x1)), rk2);
269  y2 = veorq_u32(veorq_u32(y2, SIMON64_f(x2)), rk2);
270  y3 = veorq_u32(veorq_u32(y3, SIMON64_f(x3)), rk2);
271  }
272 
273  // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
274  block0 = UnpackLow32(y1, x1);
275  block1 = UnpackHigh32(y1, x1);
276  block2 = UnpackLow32(y2, x2);
277  block3 = UnpackHigh32(y2, x2);
278  block4 = UnpackLow32(y3, x3);
279  block5 = UnpackHigh32(y3, x3);
280 }
281 
282 #endif // CRYPTOPP_ARM_NEON_AVAILABLE
283 
284 // ***************************** IA-32 ***************************** //
285 
286 #if defined(CRYPTOPP_SSE41_AVAILABLE)
287 
288 inline void Swap128(__m128i& a,__m128i& b)
289 {
290 #if defined(__SUNPRO_CC) && (__SUNPRO_CC <= 0x5120)
291  // __m128i is an unsigned long long[2], and support for swapping it was not added until C++11.
292  // SunCC 12.1 - 12.3 fail to consume the swap, while SunCC 12.4 consumes it even without -std=c++11.
293  vec_swap(a, b);
294 #else
295  std::swap(a, b);
296 #endif
297 }
298 
299 template <unsigned int R>
300 inline __m128i RotateLeft32(const __m128i& val)
301 {
302 #if defined(__XOP__)
303  return _mm_roti_epi32(val, R);
304 #else
305  return _mm_or_si128(
306  _mm_slli_epi32(val, R), _mm_srli_epi32(val, 32-R));
307 #endif
308 }
309 
310 template <unsigned int R>
311 inline __m128i RotateRight32(const __m128i& val)
312 {
313 #if defined(__XOP__)
314  return _mm_roti_epi32(val, 32-R);
315 #else
316  return _mm_or_si128(
317  _mm_slli_epi32(val, 32-R), _mm_srli_epi32(val, R));
318 #endif
319 }
320 
321 // Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
322 template <>
323 __m128i RotateLeft32<8>(const __m128i& val)
324 {
325 #if defined(__XOP__)
326  return _mm_roti_epi32(val, 8);
327 #else
328  const __m128i mask = _mm_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3);
329  return _mm_shuffle_epi8(val, mask);
330 #endif
331 }
332 
333 // Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
334 template <>
335 __m128i RotateRight32<8>(const __m128i& val)
336 {
337 #if defined(__XOP__)
338  return _mm_roti_epi32(val, 32-8);
339 #else
340  const __m128i mask = _mm_set_epi8(12,15,14,13, 8,11,10,9, 4,7,6,5, 0,3,2,1);
341  return _mm_shuffle_epi8(val, mask);
342 #endif
343 }
344 
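// Same SIMON round function as the NEON path above, expressed with SSE intrinsics.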
345 inline __m128i SIMON64_f(const __m128i& v)
346 {
347  return _mm_xor_si128(RotateLeft32<2>(v),
348  _mm_and_si128(RotateLeft32<1>(v), RotateLeft32<8>(v)));
349 }
350 
351 inline void SIMON64_Enc_Block(__m128i &block0, __m128i &block1,
352  const word32 *subkeys, unsigned int rounds)
353 {
354  // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
355  const __m128 t0 = _mm_castsi128_ps(block0);
356  const __m128 t1 = _mm_castsi128_ps(block1);
357  __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
358  __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));
359 
360  for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
361  {
362  const __m128i rk1 = _mm_set1_epi32(subkeys[i]);
363  y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON64_f(x1)), rk1);
364 
365  const __m128i rk2 = _mm_set1_epi32(subkeys[i+1]);
366  x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON64_f(y1)), rk2);
367  }
368 
369  if (rounds & 1)
370  {
371  const __m128i rk = _mm_set1_epi32(subkeys[rounds-1]);
372  y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON64_f(x1)), rk);
373  Swap128(x1, y1);
374  }
375 
376  // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
377  block0 = _mm_unpacklo_epi32(y1, x1);
378  block1 = _mm_unpackhi_epi32(y1, x1);
379 }
380 
381 inline void SIMON64_Dec_Block(__m128i &block0, __m128i &block1,
382  const word32 *subkeys, unsigned int rounds)
383 {
384  // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
385  const __m128 t0 = _mm_castsi128_ps(block0);
386  const __m128 t1 = _mm_castsi128_ps(block1);
387  __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
388  __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));
389 
390  if (rounds & 1)
391  {
392  Swap128(x1, y1);
393  const __m128i rk = _mm_set1_epi32(subkeys[rounds-1]);
394  y1 = _mm_xor_si128(_mm_xor_si128(y1, rk), SIMON64_f(x1));
395  rounds--;
396  }
397 
398  for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
399  {
400  const __m128i rk1 = _mm_set1_epi32(subkeys[i+1]);
401  x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON64_f(y1)), rk1);
402 
403  const __m128i rk2 = _mm_set1_epi32(subkeys[i]);
404  y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON64_f(x1)), rk2);
405  }
406 
407  // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
408  block0 = _mm_unpacklo_epi32(y1, x1);
409  block1 = _mm_unpackhi_epi32(y1, x1);
410 }
411 
412 inline void SIMON64_Enc_6_Blocks(__m128i &block0, __m128i &block1,
413  __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
414  const word32 *subkeys, unsigned int rounds)
415 {
416  // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
417  const __m128 t0 = _mm_castsi128_ps(block0);
418  const __m128 t1 = _mm_castsi128_ps(block1);
419  __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
420  __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));
421 
422  const __m128 t2 = _mm_castsi128_ps(block2);
423  const __m128 t3 = _mm_castsi128_ps(block3);
424  __m128i x2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(3,1,3,1)));
425  __m128i y2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(2,0,2,0)));
426 
427  const __m128 t4 = _mm_castsi128_ps(block4);
428  const __m128 t5 = _mm_castsi128_ps(block5);
429  __m128i x3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(3,1,3,1)));
430  __m128i y3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(2,0,2,0)));
431 
432  for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
433  {
434  const __m128i rk1 = _mm_set1_epi32(subkeys[i]);
435  y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON64_f(x1)), rk1);
436  y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON64_f(x2)), rk1);
437  y3 = _mm_xor_si128(_mm_xor_si128(y3, SIMON64_f(x3)), rk1);
438 
439  const __m128i rk2 = _mm_set1_epi32(subkeys[i+1]);
440  x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON64_f(y1)), rk2);
441  x2 = _mm_xor_si128(_mm_xor_si128(x2, SIMON64_f(y2)), rk2);
442  x3 = _mm_xor_si128(_mm_xor_si128(x3, SIMON64_f(y3)), rk2);
443  }
444 
445  if (rounds & 1)
446  {
447  const __m128i rk = _mm_set1_epi32(subkeys[rounds-1]);
448  y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON64_f(x1)), rk);
449  y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON64_f(x2)), rk);
450  y3 = _mm_xor_si128(_mm_xor_si128(y3, SIMON64_f(x3)), rk);
451  Swap128(x1, y1); Swap128(x2, y2); Swap128(x3, y3);
452  }
453 
454  // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
455  block0 = _mm_unpacklo_epi32(y1, x1);
456  block1 = _mm_unpackhi_epi32(y1, x1);
457  block2 = _mm_unpacklo_epi32(y2, x2);
458  block3 = _mm_unpackhi_epi32(y2, x2);
459  block4 = _mm_unpacklo_epi32(y3, x3);
460  block5 = _mm_unpackhi_epi32(y3, x3);
461 }
462 
463 inline void SIMON64_Dec_6_Blocks(__m128i &block0, __m128i &block1,
464  __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
465  const word32 *subkeys, unsigned int rounds)
466 {
467  // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
468  const __m128 t0 = _mm_castsi128_ps(block0);
469  const __m128 t1 = _mm_castsi128_ps(block1);
470  __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
471  __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));
472 
473  const __m128 t2 = _mm_castsi128_ps(block2);
474  const __m128 t3 = _mm_castsi128_ps(block3);
475  __m128i x2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(3,1,3,1)));
476  __m128i y2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(2,0,2,0)));
477 
478  const __m128 t4 = _mm_castsi128_ps(block4);
479  const __m128 t5 = _mm_castsi128_ps(block5);
480  __m128i x3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(3,1,3,1)));
481  __m128i y3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(2,0,2,0)));
482 
483  if (rounds & 1)
484  {
485  Swap128(x1, y1); Swap128(x2, y2); Swap128(x3, y3);
486  const __m128i rk = _mm_set1_epi32(subkeys[rounds-1]);
487  y1 = _mm_xor_si128(_mm_xor_si128(y1, rk), SIMON64_f(x1));
488  y2 = _mm_xor_si128(_mm_xor_si128(y2, rk), SIMON64_f(x2));
489  y3 = _mm_xor_si128(_mm_xor_si128(y3, rk), SIMON64_f(x3));
490  rounds--;
491  }
492 
493  for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
494  {
495  const __m128i rk1 = _mm_set1_epi32(subkeys[i+1]);
496  x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON64_f(y1)), rk1);
497  x2 = _mm_xor_si128(_mm_xor_si128(x2, SIMON64_f(y2)), rk1);
498  x3 = _mm_xor_si128(_mm_xor_si128(x3, SIMON64_f(y3)), rk1);
499 
500  const __m128i rk2 = _mm_set1_epi32(subkeys[i]);
501  y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON64_f(x1)), rk2);
502  y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON64_f(x2)), rk2);
503  y3 = _mm_xor_si128(_mm_xor_si128(y3, SIMON64_f(x3)), rk2);
504  }
505 
506  // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
507  block0 = _mm_unpacklo_epi32(y1, x1);
508  block1 = _mm_unpackhi_epi32(y1, x1);
509  block2 = _mm_unpacklo_epi32(y2, x2);
510  block3 = _mm_unpackhi_epi32(y2, x2);
511  block4 = _mm_unpacklo_epi32(y3, x3);
512  block5 = _mm_unpackhi_epi32(y3, x3);
513 }
514 
515 #endif // CRYPTOPP_SSE41_AVAILABLE
516 
517 // ***************************** Altivec ***************************** //
518 
519 #if defined(CRYPTOPP_ALTIVEC_AVAILABLE)
520 
521 using CryptoPP::uint8x16_p;
522 using CryptoPP::uint32x4_p;
523 
524 using CryptoPP::VecAnd;
525 using CryptoPP::VecXor;
526 using CryptoPP::VecLoad;
527 using CryptoPP::VecLoadBE;
528 using CryptoPP::VecPermute;
529 
530 // Rotate left by bit count
531 template<unsigned int C>
532 inline uint32x4_p RotateLeft32(const uint32x4_p val)
533 {
534  const uint32x4_p m = {C, C, C, C};
535  return vec_rl(val, m);
536 }
537 
538 // Rotate right by bit count
539 template<unsigned int C>
540 inline uint32x4_p RotateRight32(const uint32x4_p val)
541 {
542  const uint32x4_p m = {32-C, 32-C, 32-C, 32-C};
543  return vec_rl(val, m);
544 }
545 
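// Same SIMON round function, built from the Altivec rotate and logical wrappers.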
546 inline uint32x4_p SIMON64_f(const uint32x4_p val)
547 {
548  return VecXor(RotateLeft32<2>(val),
549  VecAnd(RotateLeft32<1>(val), RotateLeft32<8>(val)));
550 }
551 
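// The permute masks below gather the 32-bit halves of each block pair into separate
// vectors and reverse the bytes within each word; big- and little-endian targets
// need different byte indices.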
552 inline void SIMON64_Enc_Block(uint32x4_p &block0, uint32x4_p &block1,
553  const word32 *subkeys, unsigned int rounds)
554 {
555 #if (CRYPTOPP_BIG_ENDIAN)
556  const uint8x16_p m1 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
557  const uint8x16_p m2 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
558 #else
559  const uint8x16_p m1 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
560  const uint8x16_p m2 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
561 #endif
562 
563  // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
564  uint32x4_p x1 = VecPermute(block0, block1, m1);
565  uint32x4_p y1 = VecPermute(block0, block1, m2);
566 
567  for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
568  {
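 // With CRYPTOPP_POWER8_AVAILABLE the round keys are splatted directly; otherwise
 // each subkey is loaded and broadcast to all four lanes with a permute.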
569 #if CRYPTOPP_POWER8_AVAILABLE
570  const uint32x4_p rk1 = vec_splats(subkeys[i]);
571  const uint32x4_p rk2 = vec_splats(subkeys[i+1]);
572 #else
573  const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
574  uint32x4_p rk1 = VecLoad(subkeys+i);
575  uint32x4_p rk2 = VecLoad(subkeys+i+1);
576  rk1 = VecPermute(rk1, rk1, m);
577  rk2 = VecPermute(rk2, rk2, m);
578 #endif
579  y1 = VecXor(VecXor(y1, SIMON64_f(x1)), rk1);
580  x1 = VecXor(VecXor(x1, SIMON64_f(y1)), rk2);
581  }
582 
583  if (rounds & 1)
584  {
585 #if CRYPTOPP_POWER8_AVAILABLE
586  const uint32x4_p rk = vec_splats(subkeys[rounds-1]);
587 #else
588  const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
589  uint32x4_p rk = VecLoad(subkeys+rounds-1);
590  rk = VecPermute(rk, rk, m);
591 #endif
592  y1 = VecXor(VecXor(y1, SIMON64_f(x1)), rk);
593  std::swap(x1, y1);
594  }
595 
596 #if (CRYPTOPP_BIG_ENDIAN)
597  const uint8x16_p m3 = {19,18,17,16, 3,2,1,0, 23,22,21,20, 7,6,5,4};
598  const uint8x16_p m4 = {27,26,25,24, 11,10,9,8, 31,30,29,28, 15,14,13,12};
599 #else
600  const uint8x16_p m3 = {3,2,1,0, 19,18,17,16, 7,6,5,4, 23,22,21,20};
601  const uint8x16_p m4 = {11,10,9,8, 27,26,25,24, 15,14,13,12, 31,30,29,28};
602 #endif
603 
604  // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
605  block0 = (uint32x4_p)VecPermute(x1, y1, m3);
606  block1 = (uint32x4_p)VecPermute(x1, y1, m4);
607 }
608 
609 inline void SIMON64_Dec_Block(uint32x4_p &block0, uint32x4_p &block1,
610  const word32 *subkeys, unsigned int rounds)
611 {
612 #if (CRYPTOPP_BIG_ENDIAN)
613  const uint8x16_p m1 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
614  const uint8x16_p m2 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
615 #else
616  const uint8x16_p m1 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
617  const uint8x16_p m2 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
618 #endif
619 
620  // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
621  uint32x4_p x1 = VecPermute(block0, block1, m1);
622  uint32x4_p y1 = VecPermute(block0, block1, m2);
623 
624  if (rounds & 1)
625  {
626  std::swap(x1, y1);
627 #if CRYPTOPP_POWER8_AVAILABLE
628  const uint32x4_p rk = vec_splats(subkeys[rounds-1]);
629 #else
630  const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
631  uint32x4_p rk = VecLoad(subkeys+rounds-1);
632  rk = VecPermute(rk, rk, m);
633 #endif
634  y1 = VecXor(VecXor(y1, rk), SIMON64_f(x1));
635  rounds--;
636  }
637 
638  for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
639  {
640 #if CRYPTOPP_POWER8_AVAILABLE
641  const uint32x4_p rk1 = vec_splats(subkeys[i+1]);
642  const uint32x4_p rk2 = vec_splats(subkeys[i]);
643 #else
644  const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
645  uint32x4_p rk1 = VecLoad(subkeys+i+1);
646  uint32x4_p rk2 = VecLoad(subkeys+i);
647  rk1 = VecPermute(rk1, rk1, m);
648  rk2 = VecPermute(rk2, rk2, m);
649 #endif
650  x1 = VecXor(VecXor(x1, SIMON64_f(y1)), rk1);
651  y1 = VecXor(VecXor(y1, SIMON64_f(x1)), rk2);
652  }
653 
654 #if (CRYPTOPP_BIG_ENDIAN)
655  const uint8x16_p m3 = {19,18,17,16, 3,2,1,0, 23,22,21,20, 7,6,5,4};
656  const uint8x16_p m4 = {27,26,25,24, 11,10,9,8, 31,30,29,28, 15,14,13,12};
657 #else
658  const uint8x16_p m3 = {3,2,1,0, 19,18,17,16, 7,6,5,4, 23,22,21,20};
659  const uint8x16_p m4 = {11,10,9,8, 27,26,25,24, 15,14,13,12, 31,30,29,28};
660 #endif
661 
662  // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
663  block0 = (uint32x4_p)VecPermute(x1, y1, m3);
664  block1 = (uint32x4_p)VecPermute(x1, y1, m4);
665 }
666 
667 inline void SIMON64_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
668  uint32x4_p &block2, uint32x4_p &block3, uint32x4_p &block4,
669  uint32x4_p &block5, const word32 *subkeys, unsigned int rounds)
670 {
671 #if (CRYPTOPP_BIG_ENDIAN)
672  const uint8x16_p m1 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
673  const uint8x16_p m2 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
674 #else
675  const uint8x16_p m1 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
676  const uint8x16_p m2 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
677 #endif
678 
679  // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
680  uint32x4_p x1 = (uint32x4_p)VecPermute(block0, block1, m1);
681  uint32x4_p y1 = (uint32x4_p)VecPermute(block0, block1, m2);
682  uint32x4_p x2 = (uint32x4_p)VecPermute(block2, block3, m1);
683  uint32x4_p y2 = (uint32x4_p)VecPermute(block2, block3, m2);
684  uint32x4_p x3 = (uint32x4_p)VecPermute(block4, block5, m1);
685  uint32x4_p y3 = (uint32x4_p)VecPermute(block4, block5, m2);
686 
687  for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
688  {
689 #if CRYPTOPP_POWER8_AVAILABLE
690  const uint32x4_p rk1 = vec_splats(subkeys[i]);
691  const uint32x4_p rk2 = vec_splats(subkeys[i+1]);
692 #else
693  const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
694  uint32x4_p rk1 = VecLoad(subkeys+i);
695  uint32x4_p rk2 = VecLoad(subkeys+i+1);
696  rk1 = VecPermute(rk1, rk1, m);
697  rk2 = VecPermute(rk2, rk2, m);
698 #endif
699  y1 = VecXor(VecXor(y1, SIMON64_f(x1)), rk1);
700  y2 = VecXor(VecXor(y2, SIMON64_f(x2)), rk1);
701  y3 = VecXor(VecXor(y3, SIMON64_f(x3)), rk1);
702 
703  x1 = VecXor(VecXor(x1, SIMON64_f(y1)), rk2);
704  x2 = VecXor(VecXor(x2, SIMON64_f(y2)), rk2);
705  x3 = VecXor(VecXor(x3, SIMON64_f(y3)), rk2);
706  }
707 
708  if (rounds & 1)
709  {
710 #if CRYPTOPP_POWER8_AVAILABLE
711  const uint32x4_p rk = vec_splats(subkeys[rounds-1]);
712 #else
713  const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
714  uint32x4_p rk = VecLoad(subkeys+rounds-1);
715  rk = VecPermute(rk, rk, m);
716 #endif
717  y1 = VecXor(VecXor(y1, SIMON64_f(x1)), rk);
718  y2 = VecXor(VecXor(y2, SIMON64_f(x2)), rk);
719  y3 = VecXor(VecXor(y3, SIMON64_f(x3)), rk);
720  std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3);
721  }
722 
723 #if (CRYPTOPP_BIG_ENDIAN)
724  const uint8x16_p m3 = {19,18,17,16, 3,2,1,0, 23,22,21,20, 7,6,5,4};
725  const uint8x16_p m4 = {27,26,25,24, 11,10,9,8, 31,30,29,28, 15,14,13,12};
726 #else
727  const uint8x16_p m3 = {3,2,1,0, 19,18,17,16, 7,6,5,4, 23,22,21,20};
728  const uint8x16_p m4 = {11,10,9,8, 27,26,25,24, 15,14,13,12, 31,30,29,28};
729 #endif
730 
731  // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
732  block0 = (uint32x4_p)VecPermute(x1, y1, m3);
733  block1 = (uint32x4_p)VecPermute(x1, y1, m4);
734  block2 = (uint32x4_p)VecPermute(x2, y2, m3);
735  block3 = (uint32x4_p)VecPermute(x2, y2, m4);
736  block4 = (uint32x4_p)VecPermute(x3, y3, m3);
737  block5 = (uint32x4_p)VecPermute(x3, y3, m4);
738 }
739 
740 inline void SIMON64_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
741  uint32x4_p &block2, uint32x4_p &block3, uint32x4_p &block4,
742  uint32x4_p &block5, const word32 *subkeys, unsigned int rounds)
743 {
744 #if (CRYPTOPP_BIG_ENDIAN)
745  const uint8x16_p m1 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
746  const uint8x16_p m2 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
747 #else
748  const uint8x16_p m1 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
749  const uint8x16_p m2 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
750 #endif
751 
752  // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
753  uint32x4_p x1 = (uint32x4_p)VecPermute(block0, block1, m1);
754  uint32x4_p y1 = (uint32x4_p)VecPermute(block0, block1, m2);
755  uint32x4_p x2 = (uint32x4_p)VecPermute(block2, block3, m1);
756  uint32x4_p y2 = (uint32x4_p)VecPermute(block2, block3, m2);
757  uint32x4_p x3 = (uint32x4_p)VecPermute(block4, block5, m1);
758  uint32x4_p y3 = (uint32x4_p)VecPermute(block4, block5, m2);
759 
760  if (rounds & 1)
761  {
762  std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3);
763 
764 #if CRYPTOPP_POWER8_AVAILABLE
765  const uint32x4_p rk = vec_splats(subkeys[rounds-1]);
766 #else
767  const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
768  uint32x4_p rk = VecLoad(subkeys+rounds-1);
769  rk = VecPermute(rk, rk, m);
770 #endif
771  y1 = VecXor(VecXor(y1, rk), SIMON64_f(x1));
772  y2 = VecXor(VecXor(y2, rk), SIMON64_f(x2));
773  y3 = VecXor(VecXor(y3, rk), SIMON64_f(x3));
774  rounds--;
775  }
776 
777  for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
778  {
779 #if CRYPTOPP_POWER8_AVAILABLE
780  const uint32x4_p rk1 = vec_splats(subkeys[i+1]);
781  const uint32x4_p rk2 = vec_splats(subkeys[i]);
782 #else
783  const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
784  uint32x4_p rk1 = VecLoad(subkeys+i+1);
785  uint32x4_p rk2 = VecLoad(subkeys+i);
786  rk1 = VecPermute(rk1, rk1, m);
787  rk2 = VecPermute(rk2, rk2, m);
788 #endif
789  x1 = VecXor(VecXor(x1, SIMON64_f(y1)), rk1);
790  x2 = VecXor(VecXor(x2, SIMON64_f(y2)), rk1);
791  x3 = VecXor(VecXor(x3, SIMON64_f(y3)), rk1);
792 
793  y1 = VecXor(VecXor(y1, SIMON64_f(x1)), rk2);
794  y2 = VecXor(VecXor(y2, SIMON64_f(x2)), rk2);
795  y3 = VecXor(VecXor(y3, SIMON64_f(x3)), rk2);
796  }
797 
798 #if (CRYPTOPP_BIG_ENDIAN)
799  const uint8x16_p m3 = {19,18,17,16, 3,2,1,0, 23,22,21,20, 7,6,5,4};
800  const uint8x16_p m4 = {27,26,25,24, 11,10,9,8, 31,30,29,28, 15,14,13,12};
801 #else
802  const uint8x16_p m3 = {3,2,1,0, 19,18,17,16, 7,6,5,4, 23,22,21,20};
803  const uint8x16_p m4 = {11,10,9,8, 27,26,25,24, 15,14,13,12, 31,30,29,28};
804 #endif
805 
806  // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
807  block0 = (uint32x4_p)VecPermute(x1, y1, m3);
808  block1 = (uint32x4_p)VecPermute(x1, y1, m4);
809  block2 = (uint32x4_p)VecPermute(x2, y2, m3);
810  block3 = (uint32x4_p)VecPermute(x2, y2, m4);
811  block4 = (uint32x4_p)VecPermute(x3, y3, m3);
812  block5 = (uint32x4_p)VecPermute(x3, y3, m4);
813 }
814 
815 #endif // CRYPTOPP_ALTIVEC_AVAILABLE
816 
817 ANONYMOUS_NAMESPACE_END
818 
819 ///////////////////////////////////////////////////////////////////////
820 
821 NAMESPACE_BEGIN(CryptoPP)
822 
823 // *************************** ARM NEON **************************** //
824 
825 #if (CRYPTOPP_ARM_NEON_AVAILABLE)
826 size_t SIMON64_Enc_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds,
827  const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
828 {
829  return AdvancedProcessBlocks64_6x2_NEON(SIMON64_Enc_Block, SIMON64_Enc_6_Blocks,
830  subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
831 }
832 
833 size_t SIMON64_Dec_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds,
834  const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
835 {
836  return AdvancedProcessBlocks64_6x2_NEON(SIMON64_Dec_Block, SIMON64_Dec_6_Blocks,
837  subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
838 }
839 #endif // CRYPTOPP_ARM_NEON_AVAILABLE
840 
841 // ***************************** IA-32 ***************************** //
842 
843 #if defined(CRYPTOPP_SSE41_AVAILABLE)
844 size_t SIMON64_Enc_AdvancedProcessBlocks_SSE41(const word32* subKeys, size_t rounds,
845  const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
846 {
847  return AdvancedProcessBlocks64_6x2_SSE(SIMON64_Enc_Block, SIMON64_Enc_6_Blocks,
848  subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
849 }
850 
851 size_t SIMON64_Dec_AdvancedProcessBlocks_SSE41(const word32* subKeys, size_t rounds,
852  const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
853 {
854  return AdvancedProcessBlocks64_6x2_SSE(SIMON64_Dec_Block, SIMON64_Dec_6_Blocks,
855  subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
856 }
857 #endif
858 
859 // ***************************** Altivec ***************************** //
860 
861 #if defined(CRYPTOPP_ALTIVEC_AVAILABLE)
862 size_t SIMON64_Enc_AdvancedProcessBlocks_ALTIVEC(const word32* subKeys, size_t rounds,
863  const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
864 {
865  return AdvancedProcessBlocks64_6x2_ALTIVEC(SIMON64_Enc_Block, SIMON64_Enc_6_Blocks,
866  subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
867 }
868 
869 size_t SIMON64_Dec_AdvancedProcessBlocks_ALTIVEC(const word32* subKeys, size_t rounds,
870  const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
871 {
872  return AdvancedProcessBlocks64_6x2_ALTIVEC(SIMON64_Dec_Block, SIMON64_Dec_6_Blocks,
873  subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
874 }
875 #endif
876 
877 NAMESPACE_END