Crypto++  8.2
Free C++ class library of cryptographic schemes
adv_simd.h
Go to the documentation of this file.
1 // adv_simd.h - written and placed in the public domain by Jeffrey Walton
2 
3 /// \file adv_simd.h
4 /// \brief Template for AdvancedProcessBlocks and SIMD processing
5 
6 // The SIMD based implementations for ciphers that use SSE, NEON and Power7
7 // have a commom pattern. Namely, they have a specialized implementation of
8 // AdvancedProcessBlocks which processes multiple block using hardware
9 // acceleration. After several implementations we noticed a lot of copy and
10 // paste occuring. adv_simd.h provides a template to avoid the copy and paste.
11 //
12 // There are 11 templates provided in this file. The number following the
13 // function name, 64 or 128, is the block size. The name following the block
14 // size is the arrangement and acceleration. For example 4x1_SSE means Intel
15 // SSE using two encrypt (or decrypt) functions: one that operates on 4 SIMD
16 // words, and one that operates on 1 SIMD words.
17 //
18 // The distinction between SIMD words versus cipher blocks is important
19 // because 64-bit ciphers use one SIMD word for two cipher blocks. For
20 // example, AdvancedProcessBlocks64_6x2_ALTIVEC operates on 6 and 2 SIMD
21 // words, which is 12 and 4 cipher blocks. The function will do the right
22 // thing even if there is only one 64-bit block to encrypt.
23 //
24 // * AdvancedProcessBlocks64_2x1_SSE
25 // * AdvancedProcessBlocks64_4x1_SSE
26 // * AdvancedProcessBlocks128_4x1_SSE
27 // * AdvancedProcessBlocks64_6x2_SSE
28 // * AdvancedProcessBlocks128_6x2_SSE
29 // * AdvancedProcessBlocks64_6x2_NEON
30 // * AdvancedProcessBlocks128_4x1_NEON
31 // * AdvancedProcessBlocks128_6x2_NEON
32 // * AdvancedProcessBlocks64_6x2_ALTIVEC
33 // * AdvancedProcessBlocks128_4x1_ALTIVEC
34 // * AdvancedProcessBlocks128_6x1_ALTIVEC
35 //
36 // If an arrangement ends in 2, like 6x2, then the template will handle the
37 // single block case by padding with 0's and using the two SIMD word
38 // function. This happens at most one time when processing multiple blocks.
39 // The extra processing of a zero block is trivial and worth the tradeoff.
40 //
41 // The MAYBE_CONST macro present on x86 is a SunCC workaround. Some versions
42 // of SunCC lose/drop the const-ness in the F1 and F4 functions. It eventually
43 // results in a failed link due to the const/non-const mismatch.
44 
45 #ifndef CRYPTOPP_ADVANCED_SIMD_TEMPLATES
46 #define CRYPTOPP_ADVANCED_SIMD_TEMPLATES
47 
48 #include "config.h"
49 #include "misc.h"
50 #include "stdcpp.h"
51 
52 // C1189: error: This header is specific to ARM targets
53 #if (CRYPTOPP_ARM_NEON_AVAILABLE) && !defined(_M_ARM64)
54 # include <arm_neon.h>
55 #endif
56 
57 #if (CRYPTOPP_ARM_ACLE_AVAILABLE)
58 # include <stdint.h>
59 # include <arm_acle.h>
60 #endif
61 
62 #if (CRYPTOPP_SSE2_INTRIN_AVAILABLE)
63 # include <emmintrin.h>
64 # include <xmmintrin.h>
65 #endif
66 
67 // SunCC needs CRYPTOPP_SSSE3_AVAILABLE, too
68 #if (CRYPTOPP_SSSE3_AVAILABLE)
69 # include <emmintrin.h>
70 # include <pmmintrin.h>
71 # include <xmmintrin.h>
72 #endif
73 
74 #if defined(__ALTIVEC__)
75 # include "ppc_simd.h"
76 #endif
77 
78 // ************************ All block ciphers *********************** //
79 
80 ANONYMOUS_NAMESPACE_BEGIN
81 
82 using CryptoPP::BlockTransformation;
83 
84 CRYPTOPP_CONSTANT(BT_XorInput = BlockTransformation::BT_XorInput)
85 CRYPTOPP_CONSTANT(BT_AllowParallel = BlockTransformation::BT_AllowParallel)
86 CRYPTOPP_CONSTANT(BT_InBlockIsCounter = BlockTransformation::BT_InBlockIsCounter)
87 CRYPTOPP_CONSTANT(BT_ReverseDirection = BlockTransformation::BT_ReverseDirection)
88 CRYPTOPP_CONSTANT(BT_DontIncrementInOutPointers = BlockTransformation::BT_DontIncrementInOutPointers)
89 
90 ANONYMOUS_NAMESPACE_END
91 
92 // *************************** ARM NEON ************************** //
93 
94 #if (CRYPTOPP_ARM_NEON_AVAILABLE)
95 
96 NAMESPACE_BEGIN(CryptoPP)
97 
98 /// \brief AdvancedProcessBlocks for 2 and 6 blocks
99 /// \tparam F2 function to process 2 64-bit blocks
100 /// \tparam F6 function to process 6 64-bit blocks
101 /// \tparam W word type of the subkey table
102 /// \details AdvancedProcessBlocks64_6x2_NEON processes 6 and 2 NEON SIMD words
103 /// at a time. For a single block the template uses F2 with a zero block.
104 /// \details The subkey type is usually word32 or word64. F2 and F6 must use the
105 /// same word type.
106 template <typename F2, typename F6, typename W>
107 inline size_t AdvancedProcessBlocks64_6x2_NEON(F2 func2, F6 func6,
108  const W *subKeys, size_t rounds, const byte *inBlocks,
109  const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
110 {
111  CRYPTOPP_ASSERT(subKeys);
112  CRYPTOPP_ASSERT(inBlocks);
113  CRYPTOPP_ASSERT(outBlocks);
114  CRYPTOPP_ASSERT(length >= 8);
115 
116  const unsigned int w_one[] = {0, 0<<24, 0, 1<<24};
117  const unsigned int w_two[] = {0, 2<<24, 0, 2<<24};
118  const uint32x4_t s_one = vld1q_u32(w_one);
119  const uint32x4_t s_two = vld1q_u32(w_two);
120 
121  const size_t blockSize = 8;
122  const size_t neonBlockSize = 16;
123 
124  size_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : neonBlockSize;
125  size_t xorIncrement = (xorBlocks != NULLPTR) ? neonBlockSize : 0;
126  size_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : neonBlockSize;
127 
128  // Clang and Coverity are generating findings using xorBlocks as a flag.
129  const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
130  const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);
131 
132  if (flags & BT_ReverseDirection)
133  {
134  inBlocks = PtrAdd(inBlocks, length - neonBlockSize);
135  xorBlocks = PtrAdd(xorBlocks, length - neonBlockSize);
136  outBlocks = PtrAdd(outBlocks, length - neonBlockSize);
137  inIncrement = 0-inIncrement;
138  xorIncrement = 0-xorIncrement;
139  outIncrement = 0-outIncrement;
140  }
141 
142  if (flags & BT_AllowParallel)
143  {
144  while (length >= 6*neonBlockSize)
145  {
146  uint32x4_t block0, block1, block2, block3, block4, block5;
147  if (flags & BT_InBlockIsCounter)
148  {
149  // For 64-bit block ciphers we need to load the CTR block, which is 8 bytes.
150  // After the dup load we have two counters in the NEON word. Then we need
151  // to increment the low ctr by 0 and the high ctr by 1.
152  const uint8x8_t ctr = vld1_u8(inBlocks);
153  block0 = vaddq_u32(s_one, vreinterpretq_u32_u8(vcombine_u8(ctr,ctr)));
154 
155  // After initial increment of {0,1} remaining counters increment by {2,2}.
156  block1 = vaddq_u32(s_two, block0);
157  block2 = vaddq_u32(s_two, block1);
158  block3 = vaddq_u32(s_two, block2);
159  block4 = vaddq_u32(s_two, block3);
160  block5 = vaddq_u32(s_two, block4);
161 
162  vst1_u8(const_cast<byte*>(inBlocks), vget_low_u8(
163  vreinterpretq_u8_u32(vaddq_u32(s_two, block5))));
164  }
165  else
166  {
167  block0 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
168  inBlocks = PtrAdd(inBlocks, inIncrement);
169  block1 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
170  inBlocks = PtrAdd(inBlocks, inIncrement);
171  block2 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
172  inBlocks = PtrAdd(inBlocks, inIncrement);
173  block3 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
174  inBlocks = PtrAdd(inBlocks, inIncrement);
175  block4 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
176  inBlocks = PtrAdd(inBlocks, inIncrement);
177  block5 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
178  inBlocks = PtrAdd(inBlocks, inIncrement);
179  }
180 
181  if (xorInput)
182  {
183  block0 = veorq_u32(block0, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
184  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
185  block1 = veorq_u32(block1, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
186  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
187  block2 = veorq_u32(block2, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
188  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
189  block3 = veorq_u32(block3, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
190  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
191  block4 = veorq_u32(block4, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
192  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
193  block5 = veorq_u32(block5, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
194  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
195  }
196 
197  func6(block0, block1, block2, block3, block4, block5, subKeys, static_cast<unsigned int>(rounds));
198 
199  if (xorOutput)
200  {
201  block0 = veorq_u32(block0, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
202  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
203  block1 = veorq_u32(block1, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
204  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
205  block2 = veorq_u32(block2, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
206  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
207  block3 = veorq_u32(block3, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
208  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
209  block4 = veorq_u32(block4, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
210  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
211  block5 = veorq_u32(block5, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
212  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
213  }
214 
215  vst1q_u8(outBlocks, vreinterpretq_u8_u32(block0));
216  outBlocks = PtrAdd(outBlocks, outIncrement);
217  vst1q_u8(outBlocks, vreinterpretq_u8_u32(block1));
218  outBlocks = PtrAdd(outBlocks, outIncrement);
219  vst1q_u8(outBlocks, vreinterpretq_u8_u32(block2));
220  outBlocks = PtrAdd(outBlocks, outIncrement);
221  vst1q_u8(outBlocks, vreinterpretq_u8_u32(block3));
222  outBlocks = PtrAdd(outBlocks, outIncrement);
223  vst1q_u8(outBlocks, vreinterpretq_u8_u32(block4));
224  outBlocks = PtrAdd(outBlocks, outIncrement);
225  vst1q_u8(outBlocks, vreinterpretq_u8_u32(block5));
226  outBlocks = PtrAdd(outBlocks, outIncrement);
227 
228  length -= 6*neonBlockSize;
229  }
230 
231  while (length >= 2*neonBlockSize)
232  {
233  uint32x4_t block0, block1;
234  if (flags & BT_InBlockIsCounter)
235  {
236  // For 64-bit block ciphers we need to load the CTR block, which is 8 bytes.
237  // After the dup load we have two counters in the NEON word. Then we need
238  // to increment the low ctr by 0 and the high ctr by 1.
239  const uint8x8_t ctr = vld1_u8(inBlocks);
240  block0 = vaddq_u32(s_one, vreinterpretq_u32_u8(vcombine_u8(ctr,ctr)));
241 
242  // After initial increment of {0,1} remaining counters increment by {2,2}.
243  block1 = vaddq_u32(s_two, block0);
244 
245  vst1_u8(const_cast<byte*>(inBlocks), vget_low_u8(
246  vreinterpretq_u8_u32(vaddq_u32(s_two, block1))));
247  }
248  else
249  {
250  block0 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
251  inBlocks = PtrAdd(inBlocks, inIncrement);
252  block1 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
253  inBlocks = PtrAdd(inBlocks, inIncrement);
254  }
255 
256  if (xorInput)
257  {
258  block0 = veorq_u32(block0, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
259  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
260  block1 = veorq_u32(block1, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
261  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
262  }
263 
264  func2(block0, block1, subKeys, static_cast<unsigned int>(rounds));
265 
266  if (xorOutput)
267  {
268  block0 = veorq_u32(block0, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
269  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
270  block1 = veorq_u32(block1, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
271  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
272  }
273 
274  vst1q_u8(outBlocks, vreinterpretq_u8_u32(block0));
275  outBlocks = PtrAdd(outBlocks, outIncrement);
276  vst1q_u8(outBlocks, vreinterpretq_u8_u32(block1));
277  outBlocks = PtrAdd(outBlocks, outIncrement);
278 
279  length -= 2*neonBlockSize;
280  }
281  }
282 
283  if (length)
284  {
285  // Adjust to real block size
286  if (flags & BT_ReverseDirection)
287  {
288  inIncrement += inIncrement ? blockSize : 0;
289  xorIncrement += xorIncrement ? blockSize : 0;
290  outIncrement += outIncrement ? blockSize : 0;
291  inBlocks = PtrSub(inBlocks, inIncrement);
292  xorBlocks = PtrSub(xorBlocks, xorIncrement);
293  outBlocks = PtrSub(outBlocks, outIncrement);
294  }
295  else
296  {
297  inIncrement -= inIncrement ? blockSize : 0;
298  xorIncrement -= xorIncrement ? blockSize : 0;
299  outIncrement -= outIncrement ? blockSize : 0;
300  }
301 
302  while (length >= blockSize)
303  {
304  uint32x4_t block, zero = {0};
305 
306  const uint8x8_t v = vld1_u8(inBlocks);
307  block = vreinterpretq_u32_u8(vcombine_u8(v,v));
308 
309  if (xorInput)
310  {
311  const uint8x8_t x = vld1_u8(xorBlocks);
312  block = veorq_u32(block, vreinterpretq_u32_u8(vcombine_u8(x,x)));
313  }
314 
315  if (flags & BT_InBlockIsCounter)
316  const_cast<byte *>(inBlocks)[7]++;
317 
318  func2(block, zero, subKeys, static_cast<unsigned int>(rounds));
319 
320  if (xorOutput)
321  {
322  const uint8x8_t x = vld1_u8(xorBlocks);
323  block = veorq_u32(block, vreinterpretq_u32_u8(vcombine_u8(x,x)));
324  }
325 
326  vst1_u8(const_cast<byte*>(outBlocks),
327  vget_low_u8(vreinterpretq_u8_u32(block)));
328 
329  inBlocks = PtrAdd(inBlocks, inIncrement);
330  outBlocks = PtrAdd(outBlocks, outIncrement);
331  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
332  length -= blockSize;
333  }
334  }
335 
336  return length;
337 }
338 
339 /// \brief AdvancedProcessBlocks for 1 and 6 blocks
340 /// \tparam F1 function to process 1 128-bit block
341 /// \tparam F6 function to process 6 128-bit blocks
342 /// \tparam W word type of the subkey table
343 /// \details AdvancedProcessBlocks128_6x1_NEON processes 6 and 2 NEON SIMD words
344 /// at a time.
345 /// \details The subkey type is usually word32 or word64. F1 and F6 must use the
346 /// same word type.
347 template <typename F1, typename F6, typename W>
348 inline size_t AdvancedProcessBlocks128_6x1_NEON(F1 func1, F6 func6,
349  const W *subKeys, size_t rounds, const byte *inBlocks,
350  const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
351 {
352  CRYPTOPP_ASSERT(subKeys);
353  CRYPTOPP_ASSERT(inBlocks);
354  CRYPTOPP_ASSERT(outBlocks);
355  CRYPTOPP_ASSERT(length >= 16);
356 
357  const unsigned int w_one[] = {0, 0<<24, 0, 1<<24};
358  const unsigned int w_two[] = {0, 2<<24, 0, 2<<24};
359  const uint32x4_t s_one = vld1q_u32(w_one);
360  const uint32x4_t s_two = vld1q_u32(w_two);
361 
362  const size_t blockSize = 16;
363  // const size_t neonBlockSize = 16;
364 
365  size_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : blockSize;
366  size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
367  size_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : blockSize;
368 
369  // Clang and Coverity are generating findings using xorBlocks as a flag.
370  const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
371  const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);
372 
373  if (flags & BT_ReverseDirection)
374  {
375  inBlocks = PtrAdd(inBlocks, length - blockSize);
376  xorBlocks = PtrAdd(xorBlocks, length - blockSize);
377  outBlocks = PtrAdd(outBlocks, length - blockSize);
378  inIncrement = 0-inIncrement;
379  xorIncrement = 0-xorIncrement;
380  outIncrement = 0-outIncrement;
381  }
382 
383  if (flags & BT_AllowParallel)
384  {
385  while (length >= 6*blockSize)
386  {
387  uint64x2_t block0, block1, block2, block3, block4, block5;
388  if (flags & BT_InBlockIsCounter)
389  {
390  const uint64x2_t one = vreinterpretq_u64_u32(s_one);
391  block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
392  block1 = vaddq_u64(block0, one);
393  block2 = vaddq_u64(block1, one);
394  block3 = vaddq_u64(block2, one);
395  block4 = vaddq_u64(block3, one);
396  block5 = vaddq_u64(block4, one);
397  vst1q_u8(const_cast<byte*>(inBlocks),
398  vreinterpretq_u8_u64(vaddq_u64(block5, one)));
399  }
400  else
401  {
402  block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
403  inBlocks = PtrAdd(inBlocks, inIncrement);
404  block1 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
405  inBlocks = PtrAdd(inBlocks, inIncrement);
406  block2 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
407  inBlocks = PtrAdd(inBlocks, inIncrement);
408  block3 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
409  inBlocks = PtrAdd(inBlocks, inIncrement);
410  block4 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
411  inBlocks = PtrAdd(inBlocks, inIncrement);
412  block5 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
413  inBlocks = PtrAdd(inBlocks, inIncrement);
414  }
415 
416  if (xorInput)
417  {
418  block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
419  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
420  block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
421  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
422  block2 = veorq_u64(block2, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
423  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
424  block3 = veorq_u64(block3, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
425  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
426  block4 = veorq_u64(block4, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
427  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
428  block5 = veorq_u64(block5, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
429  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
430  }
431 
432  func6(block0, block1, block2, block3, block4, block5, subKeys, static_cast<unsigned int>(rounds));
433 
434  if (xorOutput)
435  {
436  block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
437  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
438  block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
439  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
440  block2 = veorq_u64(block2, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
441  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
442  block3 = veorq_u64(block3, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
443  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
444  block4 = veorq_u64(block4, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
445  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
446  block5 = veorq_u64(block5, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
447  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
448  }
449 
450  vst1q_u8(outBlocks, vreinterpretq_u8_u64(block0));
451  outBlocks = PtrAdd(outBlocks, outIncrement);
452  vst1q_u8(outBlocks, vreinterpretq_u8_u64(block1));
453  outBlocks = PtrAdd(outBlocks, outIncrement);
454  vst1q_u8(outBlocks, vreinterpretq_u8_u64(block2));
455  outBlocks = PtrAdd(outBlocks, outIncrement);
456  vst1q_u8(outBlocks, vreinterpretq_u8_u64(block3));
457  outBlocks = PtrAdd(outBlocks, outIncrement);
458  vst1q_u8(outBlocks, vreinterpretq_u8_u64(block4));
459  outBlocks = PtrAdd(outBlocks, outIncrement);
460  vst1q_u8(outBlocks, vreinterpretq_u8_u64(block5));
461  outBlocks = PtrAdd(outBlocks, outIncrement);
462 
463  length -= 6*blockSize;
464  }
465  }
466 
467  while (length >= blockSize)
468  {
469  uint64x2_t block;
470  block = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
471 
472  if (xorInput)
473  block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
474 
475  if (flags & BT_InBlockIsCounter)
476  const_cast<byte *>(inBlocks)[15]++;
477 
478  func1(block, subKeys, static_cast<unsigned int>(rounds));
479 
480  if (xorOutput)
481  block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
482 
483  vst1q_u8(outBlocks, vreinterpretq_u8_u64(block));
484 
485  inBlocks = PtrAdd(inBlocks, inIncrement);
486  outBlocks = PtrAdd(outBlocks, outIncrement);
487  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
488  length -= blockSize;
489  }
490 
491  return length;
492 }
493 
494 /// \brief AdvancedProcessBlocks for 1 and 4 blocks
495 /// \tparam F1 function to process 1 128-bit block
496 /// \tparam F4 function to process 4 128-bit blocks
497 /// \tparam W word type of the subkey table
498 /// \details AdvancedProcessBlocks128_4x1_NEON processes 4 and 1 NEON SIMD words
499 /// at a time.
500 /// \details The subkey type is usually word32 or word64. V is the vector type and it is
501 /// usually uint32x4_t or uint32x4_t. F1, F4, and W must use the same word and
502 /// vector type.
503 template <typename F1, typename F4, typename W>
504 inline size_t AdvancedProcessBlocks128_4x1_NEON(F1 func1, F4 func4,
505  const W *subKeys, size_t rounds, const byte *inBlocks,
506  const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
507 {
508  CRYPTOPP_ASSERT(subKeys);
509  CRYPTOPP_ASSERT(inBlocks);
510  CRYPTOPP_ASSERT(outBlocks);
511  CRYPTOPP_ASSERT(length >= 16);
512 
513  const unsigned int w_one[] = {0, 0<<24, 0, 1<<24};
514  const unsigned int w_two[] = {0, 2<<24, 0, 2<<24};
515  const uint32x4_t s_one = vld1q_u32(w_one);
516  const uint32x4_t s_two = vld1q_u32(w_two);
517 
518  const size_t blockSize = 16;
519  // const size_t neonBlockSize = 16;
520 
521  size_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : blockSize;
522  size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
523  size_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : blockSize;
524 
525  // Clang and Coverity are generating findings using xorBlocks as a flag.
526  const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
527  const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);
528 
529  if (flags & BT_ReverseDirection)
530  {
531  inBlocks = PtrAdd(inBlocks, length - blockSize);
532  xorBlocks = PtrAdd(xorBlocks, length - blockSize);
533  outBlocks = PtrAdd(outBlocks, length - blockSize);
534  inIncrement = 0-inIncrement;
535  xorIncrement = 0-xorIncrement;
536  outIncrement = 0-outIncrement;
537  }
538 
539  if (flags & BT_AllowParallel)
540  {
541  while (length >= 4*blockSize)
542  {
543  uint32x4_t block0, block1, block2, block3;
544  if (flags & BT_InBlockIsCounter)
545  {
546  const uint32x4_t one = s_one;
547  block0 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
548  block1 = vreinterpretq_u32_u64(vaddq_u64(vreinterpretq_u64_u32(block0), vreinterpretq_u64_u32(one)));
549  block2 = vreinterpretq_u32_u64(vaddq_u64(vreinterpretq_u64_u32(block1), vreinterpretq_u64_u32(one)));
550  block3 = vreinterpretq_u32_u64(vaddq_u64(vreinterpretq_u64_u32(block2), vreinterpretq_u64_u32(one)));
551  vst1q_u8(const_cast<byte*>(inBlocks), vreinterpretq_u8_u64(vaddq_u64(
552  vreinterpretq_u64_u32(block3), vreinterpretq_u64_u32(one))));
553  }
554  else
555  {
556  block0 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
557  inBlocks = PtrAdd(inBlocks, inIncrement);
558  block1 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
559  inBlocks = PtrAdd(inBlocks, inIncrement);
560  block2 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
561  inBlocks = PtrAdd(inBlocks, inIncrement);
562  block3 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
563  inBlocks = PtrAdd(inBlocks, inIncrement);
564  }
565 
566  if (xorInput)
567  {
568  block0 = veorq_u32(block0, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
569  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
570  block1 = veorq_u32(block1, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
571  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
572  block2 = veorq_u32(block2, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
573  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
574  block3 = veorq_u32(block3, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
575  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
576  }
577 
578  func4(block0, block1, block2, block3, subKeys, static_cast<unsigned int>(rounds));
579 
580  if (xorOutput)
581  {
582  block0 = veorq_u32(block0, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
583  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
584  block1 = veorq_u32(block1, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
585  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
586  block2 = veorq_u32(block2, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
587  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
588  block3 = veorq_u32(block3, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
589  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
590  }
591 
592  vst1q_u8(outBlocks, vreinterpretq_u8_u32(block0));
593  outBlocks = PtrAdd(outBlocks, outIncrement);
594  vst1q_u8(outBlocks, vreinterpretq_u8_u32(block1));
595  outBlocks = PtrAdd(outBlocks, outIncrement);
596  vst1q_u8(outBlocks, vreinterpretq_u8_u32(block2));
597  outBlocks = PtrAdd(outBlocks, outIncrement);
598  vst1q_u8(outBlocks, vreinterpretq_u8_u32(block3));
599  outBlocks = PtrAdd(outBlocks, outIncrement);
600 
601  length -= 4*blockSize;
602  }
603  }
604 
605  while (length >= blockSize)
606  {
607  uint32x4_t block = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
608 
609  if (xorInput)
610  block = veorq_u32(block, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
611 
612  if (flags & BT_InBlockIsCounter)
613  const_cast<byte *>(inBlocks)[15]++;
614 
615  func1(block, subKeys, static_cast<unsigned int>(rounds));
616 
617  if (xorOutput)
618  block = veorq_u32(block, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
619 
620  vst1q_u8(outBlocks, vreinterpretq_u8_u32(block));
621 
622  inBlocks = PtrAdd(inBlocks, inIncrement);
623  outBlocks = PtrAdd(outBlocks, outIncrement);
624  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
625  length -= blockSize;
626  }
627 
628  return length;
629 }
630 
631 /// \brief AdvancedProcessBlocks for 2 and 6 blocks
632 /// \tparam F2 function to process 2 128-bit blocks
633 /// \tparam F6 function to process 6 128-bit blocks
634 /// \tparam W word type of the subkey table
635 /// \details AdvancedProcessBlocks128_6x2_NEON processes 6 and 2 NEON SIMD words
636 /// at a time. For a single block the template uses F2 with a zero block.
637 /// \details The subkey type is usually word32 or word64. F2 and F6 must use the
638 /// same word type.
639 template <typename F2, typename F6, typename W>
640 inline size_t AdvancedProcessBlocks128_6x2_NEON(F2 func2, F6 func6,
641  const W *subKeys, size_t rounds, const byte *inBlocks,
642  const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
643 {
644  CRYPTOPP_ASSERT(subKeys);
645  CRYPTOPP_ASSERT(inBlocks);
646  CRYPTOPP_ASSERT(outBlocks);
647  CRYPTOPP_ASSERT(length >= 16);
648 
649  const unsigned int w_one[] = {0, 0<<24, 0, 1<<24};
650  const unsigned int w_two[] = {0, 2<<24, 0, 2<<24};
651  const uint32x4_t s_one = vld1q_u32(w_one);
652  const uint32x4_t s_two = vld1q_u32(w_two);
653 
654  const size_t blockSize = 16;
655  // const size_t neonBlockSize = 16;
656 
657  size_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : blockSize;
658  size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
659  size_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : blockSize;
660 
661  // Clang and Coverity are generating findings using xorBlocks as a flag.
662  const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
663  const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);
664 
665  if (flags & BT_ReverseDirection)
666  {
667  inBlocks = PtrAdd(inBlocks, length - blockSize);
668  xorBlocks = PtrAdd(xorBlocks, length - blockSize);
669  outBlocks = PtrAdd(outBlocks, length - blockSize);
670  inIncrement = 0-inIncrement;
671  xorIncrement = 0-xorIncrement;
672  outIncrement = 0-outIncrement;
673  }
674 
675  if (flags & BT_AllowParallel)
676  {
677  while (length >= 6*blockSize)
678  {
679  uint64x2_t block0, block1, block2, block3, block4, block5;
680  if (flags & BT_InBlockIsCounter)
681  {
682  const uint64x2_t one = vreinterpretq_u64_u32(s_one);
683  block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
684  block1 = vaddq_u64(block0, one);
685  block2 = vaddq_u64(block1, one);
686  block3 = vaddq_u64(block2, one);
687  block4 = vaddq_u64(block3, one);
688  block5 = vaddq_u64(block4, one);
689  vst1q_u8(const_cast<byte*>(inBlocks),
690  vreinterpretq_u8_u64(vaddq_u64(block5, one)));
691  }
692  else
693  {
694  block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
695  inBlocks = PtrAdd(inBlocks, inIncrement);
696  block1 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
697  inBlocks = PtrAdd(inBlocks, inIncrement);
698  block2 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
699  inBlocks = PtrAdd(inBlocks, inIncrement);
700  block3 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
701  inBlocks = PtrAdd(inBlocks, inIncrement);
702  block4 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
703  inBlocks = PtrAdd(inBlocks, inIncrement);
704  block5 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
705  inBlocks = PtrAdd(inBlocks, inIncrement);
706  }
707 
708  if (xorInput)
709  {
710  block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
711  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
712  block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
713  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
714  block2 = veorq_u64(block2, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
715  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
716  block3 = veorq_u64(block3, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
717  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
718  block4 = veorq_u64(block4, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
719  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
720  block5 = veorq_u64(block5, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
721  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
722  }
723 
724  func6(block0, block1, block2, block3, block4, block5, subKeys, static_cast<unsigned int>(rounds));
725 
726  if (xorOutput)
727  {
728  block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
729  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
730  block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
731  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
732  block2 = veorq_u64(block2, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
733  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
734  block3 = veorq_u64(block3, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
735  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
736  block4 = veorq_u64(block4, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
737  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
738  block5 = veorq_u64(block5, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
739  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
740  }
741 
742  vst1q_u8(outBlocks, vreinterpretq_u8_u64(block0));
743  outBlocks = PtrAdd(outBlocks, outIncrement);
744  vst1q_u8(outBlocks, vreinterpretq_u8_u64(block1));
745  outBlocks = PtrAdd(outBlocks, outIncrement);
746  vst1q_u8(outBlocks, vreinterpretq_u8_u64(block2));
747  outBlocks = PtrAdd(outBlocks, outIncrement);
748  vst1q_u8(outBlocks, vreinterpretq_u8_u64(block3));
749  outBlocks = PtrAdd(outBlocks, outIncrement);
750  vst1q_u8(outBlocks, vreinterpretq_u8_u64(block4));
751  outBlocks = PtrAdd(outBlocks, outIncrement);
752  vst1q_u8(outBlocks, vreinterpretq_u8_u64(block5));
753  outBlocks = PtrAdd(outBlocks, outIncrement);
754 
755  length -= 6*blockSize;
756  }
757 
758  while (length >= 2*blockSize)
759  {
760  uint64x2_t block0, block1;
761  if (flags & BT_InBlockIsCounter)
762  {
763  const uint64x2_t one = vreinterpretq_u64_u32(s_one);
764  block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
765  block1 = vaddq_u64(block0, one);
766  vst1q_u8(const_cast<byte*>(inBlocks),
767  vreinterpretq_u8_u64(vaddq_u64(block1, one)));
768  }
769  else
770  {
771  block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
772  inBlocks = PtrAdd(inBlocks, inIncrement);
773  block1 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
774  inBlocks = PtrAdd(inBlocks, inIncrement);
775  }
776 
777  if (xorInput)
778  {
779  block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
780  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
781  block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
782  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
783  }
784 
785  func2(block0, block1, subKeys, static_cast<unsigned int>(rounds));
786 
787  if (xorOutput)
788  {
789  block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
790  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
791  block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
792  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
793  }
794 
795  vst1q_u8(outBlocks, vreinterpretq_u8_u64(block0));
796  outBlocks = PtrAdd(outBlocks, outIncrement);
797  vst1q_u8(outBlocks, vreinterpretq_u8_u64(block1));
798  outBlocks = PtrAdd(outBlocks, outIncrement);
799 
800  length -= 2*blockSize;
801  }
802  }
803 
804  while (length >= blockSize)
805  {
806  uint64x2_t block, zero = {0,0};
807  block = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
808 
809  if (xorInput)
810  block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
811 
812  if (flags & BT_InBlockIsCounter)
813  const_cast<byte *>(inBlocks)[15]++;
814 
815  func2(block, zero, subKeys, static_cast<unsigned int>(rounds));
816 
817  if (xorOutput)
818  block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
819 
820  vst1q_u8(outBlocks, vreinterpretq_u8_u64(block));
821 
822  inBlocks = PtrAdd(inBlocks, inIncrement);
823  outBlocks = PtrAdd(outBlocks, outIncrement);
824  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
825  length -= blockSize;
826  }
827 
828  return length;
829 }
830 
831 NAMESPACE_END // CryptoPP
832 
833 #endif // CRYPTOPP_ARM_NEON_AVAILABLE
834 
835 // *************************** Intel SSE ************************** //
836 
837 #if defined(CRYPTOPP_SSSE3_AVAILABLE)
838 
839 // Hack for SunCC, http://github.com/weidai11/cryptopp/issues/224
840 #if (__SUNPRO_CC >= 0x5130)
841 # define MAYBE_CONST
842 # define MAYBE_UNCONST_CAST(T, x) const_cast<MAYBE_CONST T>(x)
843 #else
844 # define MAYBE_CONST const
845 # define MAYBE_UNCONST_CAST(T, x) (x)
846 #endif
847 
848 // Clang __m128i casts, http://bugs.llvm.org/show_bug.cgi?id=20670
849 #ifndef M128_CAST
850 # define M128_CAST(x) ((__m128i *)(void *)(x))
851 #endif
852 #ifndef CONST_M128_CAST
853 # define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x))
854 #endif
855 
856 NAMESPACE_BEGIN(CryptoPP)
857 
858 /// \brief AdvancedProcessBlocks for 1 and 2 blocks
859 /// \tparam F1 function to process 1 64-bit block
860 /// \tparam F2 function to process 2 64-bit blocks
861 /// \tparam W word type of the subkey table
862 /// \details AdvancedProcessBlocks64_2x1_SSE processes 2 and 1 SSE SIMD words
863 /// at a time.
864 /// \details The subkey type is usually word32 or word64. F1 and F2 must use the
865 /// same word type.
866 template <typename F1, typename F2, typename W>
867 inline size_t AdvancedProcessBlocks64_2x1_SSE(F1 func1, F2 func2,
868  MAYBE_CONST W *subKeys, size_t rounds, const byte *inBlocks,
869  const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
870 {
871  CRYPTOPP_ASSERT(subKeys);
872  CRYPTOPP_ASSERT(inBlocks);
873  CRYPTOPP_ASSERT(outBlocks);
874  CRYPTOPP_ASSERT(length >= 8);
875 
876  const size_t blockSize = 8;
877  const size_t xmmBlockSize = 16;
878 
879  size_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : xmmBlockSize;
880  size_t xorIncrement = (xorBlocks != NULLPTR) ? xmmBlockSize : 0;
881  size_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : xmmBlockSize;
882 
883  // Clang and Coverity are generating findings using xorBlocks as a flag.
884  const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
885  const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);
886 
887  if (flags & BT_ReverseDirection)
888  {
889  inBlocks = PtrAdd(inBlocks, length - xmmBlockSize);
890  xorBlocks = PtrAdd(xorBlocks, length - xmmBlockSize);
891  outBlocks = PtrAdd(outBlocks, length - xmmBlockSize);
892  inIncrement = 0-inIncrement;
893  xorIncrement = 0-xorIncrement;
894  outIncrement = 0-outIncrement;
895  }
896 
897  if (flags & BT_AllowParallel)
898  {
899  double temp[2];
900  while (length >= 2*xmmBlockSize)
901  {
902  __m128i block0, block1;
903  if (flags & BT_InBlockIsCounter)
904  {
905  // Increment of 1 and 2 in big-endian compatible with the ctr byte array.
906  const __m128i s_one = _mm_set_epi32(1<<24, 0, 0, 0);
907  const __m128i s_two = _mm_set_epi32(2<<24, 0, 2<<24, 0);
908 
909  // For 64-bit block ciphers we need to load the CTR block, which is 8 bytes.
910  // After the dup load we have two counters in the XMM word. Then we need
911  // to increment the low ctr by 0 and the high ctr by 1.
912  std::memcpy(temp, inBlocks, blockSize);
913  block0 = _mm_add_epi32(s_one, _mm_castpd_si128(_mm_loaddup_pd(temp)));
914 
915  // After initial increment of {0,1} remaining counters increment by {2,2}.
916  block1 = _mm_add_epi32(s_two, block0);
917 
918  // Store the next counter. When BT_InBlockIsCounter is set then
919  // inBlocks is backed by m_counterArray which is non-const.
920  _mm_store_sd(temp, _mm_castsi128_pd(_mm_add_epi64(s_two, block1)));
921  std::memcpy(const_cast<byte*>(inBlocks), temp, blockSize);
922  }
923  else
924  {
925  block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
926  inBlocks = PtrAdd(inBlocks, inIncrement);
927  block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
928  inBlocks = PtrAdd(inBlocks, inIncrement);
929  }
930 
931  if (xorInput)
932  {
933  block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
934  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
935  block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
936  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
937  }
938 
939  func2(block0, block1, subKeys, static_cast<unsigned int>(rounds));
940 
941  if (xorOutput)
942  {
943  block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
944  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
945  block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
946  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
947  }
948 
949  _mm_storeu_si128(M128_CAST(outBlocks), block0);
950  outBlocks = PtrAdd(outBlocks, outIncrement);
951  _mm_storeu_si128(M128_CAST(outBlocks), block1);
952  outBlocks = PtrAdd(outBlocks, outIncrement);
953 
954  length -= 2*xmmBlockSize;
955  }
956  }
957 
958  if (length)
959  {
960  // Adjust to real block size
961  if (flags & BT_ReverseDirection)
962  {
963  inIncrement += inIncrement ? blockSize : 0;
964  xorIncrement += xorIncrement ? blockSize : 0;
965  outIncrement += outIncrement ? blockSize : 0;
966  inBlocks = PtrSub(inBlocks, inIncrement);
967  xorBlocks = PtrSub(xorBlocks, xorIncrement);
968  outBlocks = PtrSub(outBlocks, outIncrement);
969  }
970  else
971  {
972  inIncrement -= inIncrement ? blockSize : 0;
973  xorIncrement -= xorIncrement ? blockSize : 0;
974  outIncrement -= outIncrement ? blockSize : 0;
975  }
976 
977  while (length >= blockSize)
978  {
979  double temp[2];
980  std::memcpy(temp, inBlocks, blockSize);
981  __m128i block = _mm_castpd_si128(_mm_load_sd(temp));
982 
983  if (xorInput)
984  {
985  std::memcpy(temp, xorBlocks, blockSize);
986  block = _mm_xor_si128(block, _mm_castpd_si128(_mm_load_sd(temp)));
987  }
988 
989  if (flags & BT_InBlockIsCounter)
990  const_cast<byte *>(inBlocks)[7]++;
991 
992  func1(block, subKeys, static_cast<unsigned int>(rounds));
993 
994  if (xorOutput)
995  {
996  std::memcpy(temp, xorBlocks, blockSize);
997  block = _mm_xor_si128(block, _mm_castpd_si128(_mm_load_sd(temp)));
998  }
999 
1000  _mm_store_sd(temp, _mm_castsi128_pd(block));
1001  std::memcpy(outBlocks, temp, blockSize);
1002 
1003  inBlocks = PtrAdd(inBlocks, inIncrement);
1004  outBlocks = PtrAdd(outBlocks, outIncrement);
1005  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1006  length -= blockSize;
1007  }
1008  }
1009 
1010  return length;
1011 }
1012 
1013 /// \brief AdvancedProcessBlocks for 2 and 6 blocks
1014 /// \tparam F2 function to process 2 64-bit blocks
1015 /// \tparam F6 function to process 6 64-bit blocks
1016 /// \tparam W word type of the subkey table
1017 /// \details AdvancedProcessBlocks64_6x2_SSE processes 6 and 2 SSE SIMD words
1018 /// at a time. For a single block the template uses F2 with a zero block.
1019 /// \details The subkey type is usually word32 or word64. F2 and F6 must use the
1020 /// same word type.
1021 template <typename F2, typename F6, typename W>
1022 inline size_t AdvancedProcessBlocks64_6x2_SSE(F2 func2, F6 func6,
1023  MAYBE_CONST W *subKeys, size_t rounds, const byte *inBlocks,
1024  const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
1025 {
1026  CRYPTOPP_ASSERT(subKeys);
1027  CRYPTOPP_ASSERT(inBlocks);
1028  CRYPTOPP_ASSERT(outBlocks);
1029  CRYPTOPP_ASSERT(length >= 8);
1030 
1031  const size_t blockSize = 8;
1032  const size_t xmmBlockSize = 16;
1033 
1034  size_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : xmmBlockSize;
1035  size_t xorIncrement = (xorBlocks != NULLPTR) ? xmmBlockSize : 0;
1036  size_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : xmmBlockSize;
1037 
1038  // Clang and Coverity are generating findings using xorBlocks as a flag.
1039  const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
1040  const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);
1041 
1042  if (flags & BT_ReverseDirection)
1043  {
1044  inBlocks = PtrAdd(inBlocks, length - xmmBlockSize);
1045  xorBlocks = PtrAdd(xorBlocks, length - xmmBlockSize);
1046  outBlocks = PtrAdd(outBlocks, length - xmmBlockSize);
1047  inIncrement = 0-inIncrement;
1048  xorIncrement = 0-xorIncrement;
1049  outIncrement = 0-outIncrement;
1050  }
1051 
1052  if (flags & BT_AllowParallel)
1053  {
1054  double temp[2];
1055  while (length >= 6*xmmBlockSize)
1056  {
1057  __m128i block0, block1, block2, block3, block4, block5;
1058  if (flags & BT_InBlockIsCounter)
1059  {
1060  // Increment of 1 and 2 in big-endian compatible with the ctr byte array.
1061  const __m128i s_one = _mm_set_epi32(1<<24, 0, 0, 0);
1062  const __m128i s_two = _mm_set_epi32(2<<24, 0, 2<<24, 0);
1063 
1064  // For 64-bit block ciphers we need to load the CTR block, which is 8 bytes.
1065  // After the dup load we have two counters in the XMM word. Then we need
1066  // to increment the low ctr by 0 and the high ctr by 1.
1067  std::memcpy(temp, inBlocks, blockSize);
1068  block0 = _mm_add_epi32(s_one, _mm_castpd_si128(_mm_loaddup_pd(temp)));
1069 
1070  // After initial increment of {0,1} remaining counters increment by {2,2}.
1071  block1 = _mm_add_epi32(s_two, block0);
1072  block2 = _mm_add_epi32(s_two, block1);
1073  block3 = _mm_add_epi32(s_two, block2);
1074  block4 = _mm_add_epi32(s_two, block3);
1075  block5 = _mm_add_epi32(s_two, block4);
1076 
1077  // Store the next counter. When BT_InBlockIsCounter is set then
1078  // inBlocks is backed by m_counterArray which is non-const.
1079  _mm_store_sd(temp, _mm_castsi128_pd(_mm_add_epi32(s_two, block5)));
1080  std::memcpy(const_cast<byte*>(inBlocks), temp, blockSize);
1081  }
1082  else
1083  {
1084  block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1085  inBlocks = PtrAdd(inBlocks, inIncrement);
1086  block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1087  inBlocks = PtrAdd(inBlocks, inIncrement);
1088  block2 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1089  inBlocks = PtrAdd(inBlocks, inIncrement);
1090  block3 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1091  inBlocks = PtrAdd(inBlocks, inIncrement);
1092  block4 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1093  inBlocks = PtrAdd(inBlocks, inIncrement);
1094  block5 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1095  inBlocks = PtrAdd(inBlocks, inIncrement);
1096  }
1097 
1098  if (xorInput)
1099  {
1100  block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1101  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1102  block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1103  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1104  block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1105  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1106  block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1107  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1108  block4 = _mm_xor_si128(block4, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1109  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1110  block5 = _mm_xor_si128(block5, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1111  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1112  }
1113 
1114  func6(block0, block1, block2, block3, block4, block5, subKeys, static_cast<unsigned int>(rounds));
1115 
1116  if (xorOutput)
1117  {
1118  block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1119  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1120  block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1121  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1122  block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1123  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1124  block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1125  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1126  block4 = _mm_xor_si128(block4, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1127  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1128  block5 = _mm_xor_si128(block5, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1129  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1130  }
1131 
1132  _mm_storeu_si128(M128_CAST(outBlocks), block0);
1133  outBlocks = PtrAdd(outBlocks, outIncrement);
1134  _mm_storeu_si128(M128_CAST(outBlocks), block1);
1135  outBlocks = PtrAdd(outBlocks, outIncrement);
1136  _mm_storeu_si128(M128_CAST(outBlocks), block2);
1137  outBlocks = PtrAdd(outBlocks, outIncrement);
1138  _mm_storeu_si128(M128_CAST(outBlocks), block3);
1139  outBlocks = PtrAdd(outBlocks, outIncrement);
1140  _mm_storeu_si128(M128_CAST(outBlocks), block4);
1141  outBlocks = PtrAdd(outBlocks, outIncrement);
1142  _mm_storeu_si128(M128_CAST(outBlocks), block5);
1143  outBlocks = PtrAdd(outBlocks, outIncrement);
1144 
1145  length -= 6*xmmBlockSize;
1146  }
1147 
1148  while (length >= 2*xmmBlockSize)
1149  {
1150  __m128i block0, block1;
1151  if (flags & BT_InBlockIsCounter)
1152  {
1153  // Increment of 1 and 2 in big-endian compatible with the ctr byte array.
1154  const __m128i s_one = _mm_set_epi32(1<<24, 0, 0, 0);
1155  const __m128i s_two = _mm_set_epi32(2<<24, 0, 2<<24, 0);
1156 
1157  // For 64-bit block ciphers we need to load the CTR block, which is 8 bytes.
1158  // After the dup load we have two counters in the XMM word. Then we need
1159  // to increment the low ctr by 0 and the high ctr by 1.
1160  std::memcpy(temp, inBlocks, blockSize);
1161  block0 = _mm_add_epi32(s_one, _mm_castpd_si128(_mm_loaddup_pd(temp)));
1162 
1163  // After initial increment of {0,1} remaining counters increment by {2,2}.
1164  block1 = _mm_add_epi32(s_two, block0);
1165 
1166  // Store the next counter. When BT_InBlockIsCounter is set then
1167  // inBlocks is backed by m_counterArray which is non-const.
1168  _mm_store_sd(temp, _mm_castsi128_pd(_mm_add_epi64(s_two, block1)));
1169  std::memcpy(const_cast<byte*>(inBlocks), temp, blockSize);
1170  }
1171  else
1172  {
1173  block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1174  inBlocks = PtrAdd(inBlocks, inIncrement);
1175  block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1176  inBlocks = PtrAdd(inBlocks, inIncrement);
1177  }
1178 
1179  if (xorInput)
1180  {
1181  block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1182  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1183  block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1184  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1185  }
1186 
1187  func2(block0, block1, subKeys, static_cast<unsigned int>(rounds));
1188 
1189  if (xorOutput)
1190  {
1191  block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1192  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1193  block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1194  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1195  }
1196 
1197  _mm_storeu_si128(M128_CAST(outBlocks), block0);
1198  outBlocks = PtrAdd(outBlocks, outIncrement);
1199  _mm_storeu_si128(M128_CAST(outBlocks), block1);
1200  outBlocks = PtrAdd(outBlocks, outIncrement);
1201 
1202  length -= 2*xmmBlockSize;
1203  }
1204  }
1205 
1206  if (length)
1207  {
1208  // Adjust to real block size
1209  if (flags & BT_ReverseDirection)
1210  {
1211  inIncrement += inIncrement ? blockSize : 0;
1212  xorIncrement += xorIncrement ? blockSize : 0;
1213  outIncrement += outIncrement ? blockSize : 0;
1214  inBlocks = PtrSub(inBlocks, inIncrement);
1215  xorBlocks = PtrSub(xorBlocks, xorIncrement);
1216  outBlocks = PtrSub(outBlocks, outIncrement);
1217  }
1218  else
1219  {
1220  inIncrement -= inIncrement ? blockSize : 0;
1221  xorIncrement -= xorIncrement ? blockSize : 0;
1222  outIncrement -= outIncrement ? blockSize : 0;
1223  }
1224 
1225  while (length >= blockSize)
1226  {
1227  double temp[2];
1228  __m128i block, zero = _mm_setzero_si128();
1229  std::memcpy(temp, inBlocks, blockSize);
1230  block = _mm_castpd_si128(_mm_load_sd(temp));
1231 
1232  if (xorInput)
1233  {
1234  std::memcpy(temp, xorBlocks, blockSize);
1235  block = _mm_xor_si128(block,
1236  _mm_castpd_si128(_mm_load_sd(temp)));
1237  }
1238 
1239  if (flags & BT_InBlockIsCounter)
1240  const_cast<byte *>(inBlocks)[7]++;
1241 
1242  func2(block, zero, subKeys, static_cast<unsigned int>(rounds));
1243 
1244  if (xorOutput)
1245  {
1246  std::memcpy(temp, xorBlocks, blockSize);
1247  block = _mm_xor_si128(block,
1248  _mm_castpd_si128(_mm_load_sd(temp)));
1249  }
1250 
1251  _mm_store_sd(temp, _mm_castsi128_pd(block));
1252  std::memcpy(outBlocks, temp, blockSize);
1253 
1254  inBlocks = PtrAdd(inBlocks, inIncrement);
1255  outBlocks = PtrAdd(outBlocks, outIncrement);
1256  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1257  length -= blockSize;
1258  }
1259  }
1260 
1261  return length;
1262 }
1263 
1264 /// \brief AdvancedProcessBlocks for 2 and 6 blocks
1265 /// \tparam F2 function to process 2 128-bit blocks
1266 /// \tparam F6 function to process 6 128-bit blocks
1267 /// \tparam W word type of the subkey table
1268 /// \details AdvancedProcessBlocks128_6x2_SSE processes 6 and 2 SSE SIMD words
1269 /// at a time. For a single block the template uses F2 with a zero block.
1270 /// \details The subkey type is usually word32 or word64. F2 and F6 must use the
1271 /// same word type.
1272 template <typename F2, typename F6, typename W>
1273 inline size_t AdvancedProcessBlocks128_6x2_SSE(F2 func2, F6 func6,
1274  MAYBE_CONST W *subKeys, size_t rounds, const byte *inBlocks,
1275  const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
1276 {
1277  CRYPTOPP_ASSERT(subKeys);
1278  CRYPTOPP_ASSERT(inBlocks);
1279  CRYPTOPP_ASSERT(outBlocks);
1280  CRYPTOPP_ASSERT(length >= 16);
1281 
1282  const size_t blockSize = 16;
1283  // const size_t xmmBlockSize = 16;
1284 
1285  size_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : blockSize;
1286  size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
1287  size_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : blockSize;
1288 
1289  // Clang and Coverity are generating findings using xorBlocks as a flag.
1290  const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
1291  const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);
1292 
1293  if (flags & BT_ReverseDirection)
1294  {
1295  inBlocks = PtrAdd(inBlocks, length - blockSize);
1296  xorBlocks = PtrAdd(xorBlocks, length - blockSize);
1297  outBlocks = PtrAdd(outBlocks, length - blockSize);
1298  inIncrement = 0-inIncrement;
1299  xorIncrement = 0-xorIncrement;
1300  outIncrement = 0-outIncrement;
1301  }
1302 
1303  if (flags & BT_AllowParallel)
1304  {
1305  while (length >= 6*blockSize)
1306  {
1307  __m128i block0, block1, block2, block3, block4, block5;
1308  if (flags & BT_InBlockIsCounter)
1309  {
1310  // Increment of 1 in big-endian compatible with the ctr byte array.
1311  const __m128i s_one = _mm_set_epi32(1<<24, 0, 0, 0);
1312  block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1313  block1 = _mm_add_epi32(block0, s_one);
1314  block2 = _mm_add_epi32(block1, s_one);
1315  block3 = _mm_add_epi32(block2, s_one);
1316  block4 = _mm_add_epi32(block3, s_one);
1317  block5 = _mm_add_epi32(block4, s_one);
1318  _mm_storeu_si128(M128_CAST(inBlocks), _mm_add_epi32(block5, s_one));
1319  }
1320  else
1321  {
1322  block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1323  inBlocks = PtrAdd(inBlocks, inIncrement);
1324  block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1325  inBlocks = PtrAdd(inBlocks, inIncrement);
1326  block2 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1327  inBlocks = PtrAdd(inBlocks, inIncrement);
1328  block3 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1329  inBlocks = PtrAdd(inBlocks, inIncrement);
1330  block4 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1331  inBlocks = PtrAdd(inBlocks, inIncrement);
1332  block5 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1333  inBlocks = PtrAdd(inBlocks, inIncrement);
1334  }
1335 
1336  if (xorInput)
1337  {
1338  block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1339  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1340  block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1341  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1342  block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1343  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1344  block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1345  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1346  block4 = _mm_xor_si128(block4, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1347  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1348  block5 = _mm_xor_si128(block5, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1349  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1350  }
1351 
1352  func6(block0, block1, block2, block3, block4, block5, subKeys, static_cast<unsigned int>(rounds));
1353 
1354  if (xorOutput)
1355  {
1356  block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1357  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1358  block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1359  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1360  block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1361  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1362  block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1363  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1364  block4 = _mm_xor_si128(block4, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1365  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1366  block5 = _mm_xor_si128(block5, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1367  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1368  }
1369 
1370  _mm_storeu_si128(M128_CAST(outBlocks), block0);
1371  outBlocks = PtrAdd(outBlocks, outIncrement);
1372  _mm_storeu_si128(M128_CAST(outBlocks), block1);
1373  outBlocks = PtrAdd(outBlocks, outIncrement);
1374  _mm_storeu_si128(M128_CAST(outBlocks), block2);
1375  outBlocks = PtrAdd(outBlocks, outIncrement);
1376  _mm_storeu_si128(M128_CAST(outBlocks), block3);
1377  outBlocks = PtrAdd(outBlocks, outIncrement);
1378  _mm_storeu_si128(M128_CAST(outBlocks), block4);
1379  outBlocks = PtrAdd(outBlocks, outIncrement);
1380  _mm_storeu_si128(M128_CAST(outBlocks), block5);
1381  outBlocks = PtrAdd(outBlocks, outIncrement);
1382 
1383  length -= 6*blockSize;
1384  }
1385 
1386  while (length >= 2*blockSize)
1387  {
1388  __m128i block0, block1;
1389  if (flags & BT_InBlockIsCounter)
1390  {
1391  // Increment of 1 in big-endian compatible with the ctr byte array.
1392  const __m128i s_one = _mm_set_epi32(1<<24, 0, 0, 0);
1393  block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1394  block1 = _mm_add_epi32(block0, s_one);
1395  _mm_storeu_si128(M128_CAST(inBlocks), _mm_add_epi32(block1, s_one));
1396  }
1397  else
1398  {
1399  block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1400  inBlocks = PtrAdd(inBlocks, inIncrement);
1401  block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1402  inBlocks = PtrAdd(inBlocks, inIncrement);
1403  }
1404 
1405  if (xorInput)
1406  {
1407  block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1408  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1409  block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1410  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1411  }
1412 
1413  func2(block0, block1, subKeys, static_cast<unsigned int>(rounds));
1414 
1415  if (xorOutput)
1416  {
1417  block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1418  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1419  block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1420  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1421  }
1422 
1423  _mm_storeu_si128(M128_CAST(outBlocks), block0);
1424  outBlocks = PtrAdd(outBlocks, outIncrement);
1425  _mm_storeu_si128(M128_CAST(outBlocks), block1);
1426  outBlocks = PtrAdd(outBlocks, outIncrement);
1427 
1428  length -= 2*blockSize;
1429  }
1430  }
1431 
1432  while (length >= blockSize)
1433  {
1434  __m128i block, zero = _mm_setzero_si128();
1435  block = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1436 
1437  if (xorInput)
1438  block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1439 
1440  if (flags & BT_InBlockIsCounter)
1441  const_cast<byte *>(inBlocks)[15]++;
1442 
1443  func2(block, zero, subKeys, static_cast<unsigned int>(rounds));
1444 
1445  if (xorOutput)
1446  block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1447 
1448  _mm_storeu_si128(M128_CAST(outBlocks), block);
1449 
1450  inBlocks = PtrAdd(inBlocks, inIncrement);
1451  outBlocks = PtrAdd(outBlocks, outIncrement);
1452  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1453  length -= blockSize;
1454  }
1455 
1456  return length;
1457 }
1458 
1459 /// \brief AdvancedProcessBlocks for 1 and 4 blocks
1460 /// \tparam F1 function to process 1 128-bit block
1461 /// \tparam F4 function to process 4 128-bit blocks
1462 /// \tparam W word type of the subkey table
1463 /// \details AdvancedProcessBlocks128_4x1_SSE processes 4 and 1 SSE SIMD words
1464 /// at a time.
1465 /// \details The subkey type is usually word32 or word64. F1 and F4 must use the
1466 /// same word type.
1467 template <typename F1, typename F4, typename W>
1468 inline size_t AdvancedProcessBlocks128_4x1_SSE(F1 func1, F4 func4,
1469  MAYBE_CONST W *subKeys, size_t rounds, const byte *inBlocks,
1470  const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
1471 {
1472  CRYPTOPP_ASSERT(subKeys);
1473  CRYPTOPP_ASSERT(inBlocks);
1474  CRYPTOPP_ASSERT(outBlocks);
1475  CRYPTOPP_ASSERT(length >= 16);
1476 
1477  const size_t blockSize = 16;
1478  // const size_t xmmBlockSize = 16;
1479 
1480  size_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : blockSize;
1481  size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
1482  size_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : blockSize;
1483 
1484  // Clang and Coverity are generating findings using xorBlocks as a flag.
1485  const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
1486  const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);
1487 
1488  if (flags & BT_ReverseDirection)
1489  {
1490  inBlocks = PtrAdd(inBlocks, length - blockSize);
1491  xorBlocks = PtrAdd(xorBlocks, length - blockSize);
1492  outBlocks = PtrAdd(outBlocks, length - blockSize);
1493  inIncrement = 0-inIncrement;
1494  xorIncrement = 0-xorIncrement;
1495  outIncrement = 0-outIncrement;
1496  }
1497 
1498  if (flags & BT_AllowParallel)
1499  {
1500  while (length >= 4*blockSize)
1501  {
1502  __m128i block0, block1, block2, block3;
1503  if (flags & BT_InBlockIsCounter)
1504  {
1505  // Increment of 1 in big-endian compatible with the ctr byte array.
1506  const __m128i s_one = _mm_set_epi32(1<<24, 0, 0, 0);
1507  block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1508  block1 = _mm_add_epi32(block0, s_one);
1509  block2 = _mm_add_epi32(block1, s_one);
1510  block3 = _mm_add_epi32(block2, s_one);
1511  _mm_storeu_si128(M128_CAST(inBlocks), _mm_add_epi32(block3, s_one));
1512  }
1513  else
1514  {
1515  block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1516  inBlocks = PtrAdd(inBlocks, inIncrement);
1517  block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1518  inBlocks = PtrAdd(inBlocks, inIncrement);
1519  block2 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1520  inBlocks = PtrAdd(inBlocks, inIncrement);
1521  block3 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1522  inBlocks = PtrAdd(inBlocks, inIncrement);
1523  }
1524 
1525  if (xorInput)
1526  {
1527  block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1528  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1529  block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1530  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1531  block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1532  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1533  block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1534  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1535  }
1536 
1537  func4(block0, block1, block2, block3, subKeys, static_cast<unsigned int>(rounds));
1538 
1539  if (xorOutput)
1540  {
1541  block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1542  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1543  block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1544  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1545  block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1546  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1547  block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1548  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1549  }
1550 
1551  _mm_storeu_si128(M128_CAST(outBlocks), block0);
1552  outBlocks = PtrAdd(outBlocks, outIncrement);
1553  _mm_storeu_si128(M128_CAST(outBlocks), block1);
1554  outBlocks = PtrAdd(outBlocks, outIncrement);
1555  _mm_storeu_si128(M128_CAST(outBlocks), block2);
1556  outBlocks = PtrAdd(outBlocks, outIncrement);
1557  _mm_storeu_si128(M128_CAST(outBlocks), block3);
1558  outBlocks = PtrAdd(outBlocks, outIncrement);
1559 
1560  length -= 4*blockSize;
1561  }
1562  }
1563 
1564  while (length >= blockSize)
1565  {
1566  __m128i block = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1567 
1568  if (xorInput)
1569  block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1570 
1571  if (flags & BT_InBlockIsCounter)
1572  const_cast<byte *>(inBlocks)[15]++;
1573 
1574  func1(block, subKeys, static_cast<unsigned int>(rounds));
1575 
1576  if (xorOutput)
1577  block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1578 
1579  _mm_storeu_si128(M128_CAST(outBlocks), block);
1580 
1581  inBlocks = PtrAdd(inBlocks, inIncrement);
1582  outBlocks = PtrAdd(outBlocks, outIncrement);
1583  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1584  length -= blockSize;
1585  }
1586 
1587  return length;
1588 }
1589 
1590 /// \brief AdvancedProcessBlocks for 1 and 4 blocks
1591 /// \tparam F1 function to process 1 64-bit block
1592 /// \tparam F4 function to process 6 64-bit blocks
1593 /// \tparam W word type of the subkey table
1594 /// \details AdvancedProcessBlocks64_4x1_SSE processes 4 and 1 SSE SIMD words
1595 /// at a time.
1596 /// \details The subkey type is usually word32 or word64. F1 and F4 must use the
1597 /// same word type.
1598 template <typename F1, typename F4, typename W>
1599 inline size_t AdvancedProcessBlocks64_4x1_SSE(F1 func1, F4 func4,
1600  MAYBE_CONST W *subKeys, size_t rounds, const byte *inBlocks,
1601  const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
1602 {
1603  CRYPTOPP_ASSERT(subKeys);
1604  CRYPTOPP_ASSERT(inBlocks);
1605  CRYPTOPP_ASSERT(outBlocks);
1606  CRYPTOPP_ASSERT(length >= 8);
1607 
1608  const size_t blockSize = 8;
1609  const size_t xmmBlockSize = 16;
1610 
1611  size_t inIncrement = (flags & (BT_InBlockIsCounter | BT_DontIncrementInOutPointers)) ? 0 : xmmBlockSize;
1612  size_t xorIncrement = (xorBlocks != NULLPTR) ? xmmBlockSize : 0;
1613  size_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : xmmBlockSize;
1614 
1615  // Clang and Coverity are generating findings using xorBlocks as a flag.
1616  const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
1617  const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);
1618 
1619  if (flags & BT_ReverseDirection)
1620  {
1621  inBlocks = PtrAdd(inBlocks, length - xmmBlockSize);
1622  xorBlocks = PtrAdd(xorBlocks, length - xmmBlockSize);
1623  outBlocks = PtrAdd(outBlocks, length - xmmBlockSize);
1624  inIncrement = 0 - inIncrement;
1625  xorIncrement = 0 - xorIncrement;
1626  outIncrement = 0 - outIncrement;
1627  }
1628 
1629  if (flags & BT_AllowParallel)
1630  {
1631  while (length >= 4*xmmBlockSize)
1632  {
1633  __m128i block0, block1, block2, block3;
1634  if (flags & BT_InBlockIsCounter)
1635  {
1636  // Increment of 1 and 2 in big-endian compatible with the ctr byte array.
1637  const __m128i s_one = _mm_set_epi32(1<<24, 0, 0, 0);
1638  const __m128i s_two = _mm_set_epi32(2<<24, 0, 2<<24, 0);
1639  double temp[2];
1640 
1641  // For 64-bit block ciphers we need to load the CTR block, which is 8 bytes.
1642  // After the dup load we have two counters in the XMM word. Then we need
1643  // to increment the low ctr by 0 and the high ctr by 1.
1644  std::memcpy(temp, inBlocks, blockSize);
1645  block0 = _mm_add_epi32(s_one, _mm_castpd_si128(_mm_loaddup_pd(temp)));
1646 
1647  // After initial increment of {0,1} remaining counters increment by {2,2}.
1648  block1 = _mm_add_epi32(s_two, block0);
1649  block2 = _mm_add_epi32(s_two, block1);
1650  block3 = _mm_add_epi32(s_two, block2);
1651 
1652  // Store the next counter. When BT_InBlockIsCounter is set then
1653  // inBlocks is backed by m_counterArray which is non-const.
1654  _mm_store_sd(temp, _mm_castsi128_pd(_mm_add_epi64(s_two, block3)));
1655  std::memcpy(const_cast<byte*>(inBlocks), temp, blockSize);
1656  }
1657  else
1658  {
1659  block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1660  inBlocks = PtrAdd(inBlocks, inIncrement);
1661  block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1662  inBlocks = PtrAdd(inBlocks, inIncrement);
1663  block2 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1664  inBlocks = PtrAdd(inBlocks, inIncrement);
1665  block3 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1666  inBlocks = PtrAdd(inBlocks, inIncrement);
1667  }
1668 
1669  if (xorInput)
1670  {
1671  block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1672  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1673  block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1674  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1675  block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1676  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1677  block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1678  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1679  }
1680 
1681  func4(block0, block1, block2, block3, subKeys, static_cast<unsigned int>(rounds));
1682 
1683  if (xorOutput)
1684  {
1685  block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1686  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1687  block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1688  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1689  block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1690  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1691  block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1692  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1693  }
1694 
1695  _mm_storeu_si128(M128_CAST(outBlocks), block0);
1696  outBlocks = PtrAdd(outBlocks, outIncrement);
1697  _mm_storeu_si128(M128_CAST(outBlocks), block1);
1698  outBlocks = PtrAdd(outBlocks, outIncrement);
1699  _mm_storeu_si128(M128_CAST(outBlocks), block2);
1700  outBlocks = PtrAdd(outBlocks, outIncrement);
1701  _mm_storeu_si128(M128_CAST(outBlocks), block3);
1702  outBlocks = PtrAdd(outBlocks, outIncrement);
1703 
1704  length -= 4*xmmBlockSize;
1705  }
1706  }
1707 
1708  if (length)
1709  {
1710  // Adjust to real block size
1711  if (flags & BT_ReverseDirection)
1712  {
1713  inIncrement += inIncrement ? blockSize : 0;
1714  xorIncrement += xorIncrement ? blockSize : 0;
1715  outIncrement += outIncrement ? blockSize : 0;
1716  inBlocks = PtrSub(inBlocks, inIncrement);
1717  xorBlocks = PtrSub(xorBlocks, xorIncrement);
1718  outBlocks = PtrSub(outBlocks, outIncrement);
1719  }
1720  else
1721  {
1722  inIncrement -= inIncrement ? blockSize : 0;
1723  xorIncrement -= xorIncrement ? blockSize : 0;
1724  outIncrement -= outIncrement ? blockSize : 0;
1725  }
1726 
1727  while (length >= blockSize)
1728  {
1729  double temp[2];
1730  std::memcpy(temp, inBlocks, blockSize);
1731  __m128i block = _mm_castpd_si128(_mm_load_sd(temp));
1732 
1733  if (xorInput)
1734  {
1735  std::memcpy(temp, xorBlocks, blockSize);
1736  block = _mm_xor_si128(block, _mm_castpd_si128(_mm_load_sd(temp)));
1737  }
1738 
1739  if (flags & BT_InBlockIsCounter)
1740  const_cast<byte *>(inBlocks)[7]++;
1741 
1742  func1(block, subKeys, static_cast<unsigned int>(rounds));
1743 
1744  if (xorOutput)
1745  {
1746  std::memcpy(temp, xorBlocks, blockSize);
1747  block = _mm_xor_si128(block, _mm_castpd_si128(_mm_load_sd(temp)));
1748  }
1749 
1750  _mm_store_sd(temp, _mm_castsi128_pd(block));
1751  std::memcpy(outBlocks, temp, blockSize);
1752 
1753  inBlocks = PtrAdd(inBlocks, inIncrement);
1754  outBlocks = PtrAdd(outBlocks, outIncrement);
1755  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1756  length -= blockSize;
1757  }
1758  }
1759 
1760  return length;
1761 }
1762 
1763 NAMESPACE_END // CryptoPP
1764 
1765 #endif // CRYPTOPP_SSSE3_AVAILABLE
1766 
1767 // *********************** Altivec/Power 4 ********************** //
1768 
1769 #if defined(__ALTIVEC__)
1770 
1771 NAMESPACE_BEGIN(CryptoPP)
1772 
1773 /// \brief AdvancedProcessBlocks for 2 and 6 blocks
1774 /// \tparam F2 function to process 2 128-bit blocks
1775 /// \tparam F6 function to process 6 128-bit blocks
1776 /// \tparam W word type of the subkey table
1777 /// \details AdvancedProcessBlocks64_6x2_Altivec processes 6 and 2 Altivec SIMD words
1778 /// at a time. For a single block the template uses F2 with a zero block.
1779 /// \details The subkey type is usually word32 or word64. F2 and F6 must use the
1780 /// same word type.
1781 template <typename F2, typename F6, typename W>
1782 inline size_t AdvancedProcessBlocks64_6x2_ALTIVEC(F2 func2, F6 func6,
1783  const W *subKeys, size_t rounds, const byte *inBlocks,
1784  const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
1785 {
1786  CRYPTOPP_ASSERT(subKeys);
1787  CRYPTOPP_ASSERT(inBlocks);
1788  CRYPTOPP_ASSERT(outBlocks);
1789  CRYPTOPP_ASSERT(length >= 8);
1790 
1791 #if (CRYPTOPP_LITTLE_ENDIAN)
1792  enum {LowOffset=8, HighOffset=0};
1793  const uint32x4_p s_one = {1,0,0,0};
1794  const uint32x4_p s_two = {2,0,2,0};
1795 #else
1796  enum {LowOffset=8, HighOffset=0};
1797  const uint32x4_p s_one = {0,0,0,1};
1798  const uint32x4_p s_two = {0,2,0,2};
1799 #endif
1800 
1801  const size_t blockSize = 8;
1802  const size_t vsxBlockSize = 16;
1803  CRYPTOPP_ALIGN_DATA(16) uint8_t temp[16];
1804 
1805  size_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : vsxBlockSize;
1806  size_t xorIncrement = (xorBlocks != NULLPTR) ? vsxBlockSize : 0;
1807  size_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : vsxBlockSize;
1808 
1809  // Clang and Coverity are generating findings using xorBlocks as a flag.
1810  const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
1811  const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);
1812 
1813  if (flags & BT_ReverseDirection)
1814  {
1815  inBlocks = PtrAdd(inBlocks, length - vsxBlockSize);
1816  xorBlocks = PtrAdd(xorBlocks, length - vsxBlockSize);
1817  outBlocks = PtrAdd(outBlocks, length - vsxBlockSize);
1818  inIncrement = 0-inIncrement;
1819  xorIncrement = 0-xorIncrement;
1820  outIncrement = 0-outIncrement;
1821  }
1822 
1823  if (flags & BT_AllowParallel)
1824  {
1825  while (length >= 6*vsxBlockSize)
1826  {
1827  uint32x4_p block0, block1, block2, block3, block4, block5;
1828  if (flags & BT_InBlockIsCounter)
1829  {
1830  // There is no easy way to load 8-bytes into a vector. It is
1831  // even harder without POWER8 due to lack of 64-bit elements.
1832  std::memcpy(temp+LowOffset, inBlocks, 8);
1833  std::memcpy(temp+HighOffset, inBlocks, 8);
1834  uint32x4_p ctr = (uint32x4_p)VecLoadBE(temp);
1835 
1836  // For 64-bit block ciphers we need to load the CTR block,
1837  // which is 8 bytes. After the dup load we have two counters
1838  // in the Altivec word. Then we need to increment the low ctr
1839  // by 0 and the high ctr by 1.
1840  block0 = VecAdd(s_one, ctr);
1841 
1842  // After initial increment of {0,1} remaining counters
1843  // increment by {2,2}.
1844  block1 = VecAdd(s_two, block0);
1845  block2 = VecAdd(s_two, block1);
1846  block3 = VecAdd(s_two, block2);
1847  block4 = VecAdd(s_two, block3);
1848  block5 = VecAdd(s_two, block4);
1849 
1850  // Update the counter in the caller.
1851  const_cast<byte*>(inBlocks)[7] += 12;
1852  }
1853  else
1854  {
1855  block0 = VecLoadBE(inBlocks);
1856  inBlocks = PtrAdd(inBlocks, inIncrement);
1857  block1 = VecLoadBE(inBlocks);
1858  inBlocks = PtrAdd(inBlocks, inIncrement);
1859  block2 = VecLoadBE(inBlocks);
1860  inBlocks = PtrAdd(inBlocks, inIncrement);
1861  block3 = VecLoadBE(inBlocks);
1862  inBlocks = PtrAdd(inBlocks, inIncrement);
1863  block4 = VecLoadBE(inBlocks);
1864  inBlocks = PtrAdd(inBlocks, inIncrement);
1865  block5 = VecLoadBE(inBlocks);
1866  inBlocks = PtrAdd(inBlocks, inIncrement);
1867  }
1868 
1869  if (xorInput)
1870  {
1871  block0 = VecXor(block0, VecLoadBE(xorBlocks));
1872  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1873  block1 = VecXor(block1, VecLoadBE(xorBlocks));
1874  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1875  block2 = VecXor(block2, VecLoadBE(xorBlocks));
1876  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1877  block3 = VecXor(block3, VecLoadBE(xorBlocks));
1878  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1879  block4 = VecXor(block4, VecLoadBE(xorBlocks));
1880  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1881  block5 = VecXor(block5, VecLoadBE(xorBlocks));
1882  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1883  }
1884 
1885  func6(block0, block1, block2, block3, block4, block5, subKeys, static_cast<unsigned int>(rounds));
1886 
1887  if (xorOutput)
1888  {
1889  block0 = VecXor(block0, VecLoadBE(xorBlocks));
1890  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1891  block1 = VecXor(block1, VecLoadBE(xorBlocks));
1892  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1893  block2 = VecXor(block2, VecLoadBE(xorBlocks));
1894  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1895  block3 = VecXor(block3, VecLoadBE(xorBlocks));
1896  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1897  block4 = VecXor(block4, VecLoadBE(xorBlocks));
1898  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1899  block5 = VecXor(block5, VecLoadBE(xorBlocks));
1900  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1901  }
1902 
1903  VecStoreBE(block0, outBlocks);
1904  outBlocks = PtrAdd(outBlocks, outIncrement);
1905  VecStoreBE(block1, outBlocks);
1906  outBlocks = PtrAdd(outBlocks, outIncrement);
1907  VecStoreBE(block2, outBlocks);
1908  outBlocks = PtrAdd(outBlocks, outIncrement);
1909  VecStoreBE(block3, outBlocks);
1910  outBlocks = PtrAdd(outBlocks, outIncrement);
1911  VecStoreBE(block4, outBlocks);
1912  outBlocks = PtrAdd(outBlocks, outIncrement);
1913  VecStoreBE(block5, outBlocks);
1914  outBlocks = PtrAdd(outBlocks, outIncrement);
1915 
1916  length -= 6*vsxBlockSize;
1917  }
1918 
1919  while (length >= 2*vsxBlockSize)
1920  {
1921  uint32x4_p block0, block1;
1922  if (flags & BT_InBlockIsCounter)
1923  {
1924  // There is no easy way to load 8-bytes into a vector. It is
1925  // even harder without POWER8 due to lack of 64-bit elements.
1926  std::memcpy(temp+LowOffset, inBlocks, 8);
1927  std::memcpy(temp+HighOffset, inBlocks, 8);
1928  uint32x4_p ctr = (uint32x4_p)VecLoadBE(temp);
1929 
1930  // For 64-bit block ciphers we need to load the CTR block,
1931  // which is 8 bytes. After the dup load we have two counters
1932  // in the Altivec word. Then we need to increment the low ctr
1933  // by 0 and the high ctr by 1.
1934  block0 = VecAdd(s_one, ctr);
1935 
1936  // After initial increment of {0,1} remaining counters
1937  // increment by {2,2}.
1938  block1 = VecAdd(s_two, block0);
1939 
1940  // Update the counter in the caller.
1941  const_cast<byte*>(inBlocks)[7] += 4;
1942  }
1943  else
1944  {
1945  block0 = VecLoadBE(inBlocks);
1946  inBlocks = PtrAdd(inBlocks, inIncrement);
1947  block1 = VecLoadBE(inBlocks);
1948  inBlocks = PtrAdd(inBlocks, inIncrement);
1949  }
1950 
1951  if (xorInput)
1952  {
1953  block0 = VecXor(block0, VecLoadBE(xorBlocks));
1954  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1955  block1 = VecXor(block1, VecLoadBE(xorBlocks));
1956  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1957  }
1958 
1959  func2(block0, block1, subKeys, static_cast<unsigned int>(rounds));
1960 
1961  if (xorOutput)
1962  {
1963  block0 = VecXor(block0, VecLoadBE(xorBlocks));
1964  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1965  block1 = VecXor(block1, VecLoadBE(xorBlocks));
1966  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1967  }
1968 
1969  VecStoreBE(block0, outBlocks);
1970  outBlocks = PtrAdd(outBlocks, outIncrement);
1971  VecStoreBE(block1, outBlocks);
1972  outBlocks = PtrAdd(outBlocks, outIncrement);
1973 
1974  length -= 2*vsxBlockSize;
1975  }
1976  }
1977 
1978  if (length)
1979  {
1980  // Adjust to real block size
1981  if (flags & BT_ReverseDirection)
1982  {
1983  inIncrement += inIncrement ? blockSize : 0;
1984  xorIncrement += xorIncrement ? blockSize : 0;
1985  outIncrement += outIncrement ? blockSize : 0;
1986  inBlocks = PtrSub(inBlocks, inIncrement);
1987  xorBlocks = PtrSub(xorBlocks, xorIncrement);
1988  outBlocks = PtrSub(outBlocks, outIncrement);
1989  }
1990  else
1991  {
1992  inIncrement -= inIncrement ? blockSize : 0;
1993  xorIncrement -= xorIncrement ? blockSize : 0;
1994  outIncrement -= outIncrement ? blockSize : 0;
1995  }
1996 
1997  while (length >= blockSize)
1998  {
1999  uint32x4_p block, zero = {0};
2000 
2001  // There is no easy way to load 8-bytes into a vector. It is
2002  // even harder without POWER8 due to lack of 64-bit elements.
2003  // The high 8 bytes are "don't care" but it if we don't
2004  // initialize the block then it generates warnings.
2005  std::memcpy(temp+LowOffset, inBlocks, 8);
2006  std::memcpy(temp+HighOffset, inBlocks, 8); // don't care
2007  block = (uint32x4_p)VecLoadBE(temp);
2008 
2009  if (xorInput)
2010  {
2011  std::memcpy(temp+LowOffset, xorBlocks, 8);
2012  std::memcpy(temp+HighOffset, xorBlocks, 8); // don't care
2013  uint32x4_p x = (uint32x4_p)VecLoadBE(temp);
2014  block = VecXor(block, x);
2015  }
2016 
2017  // Update the counter in the caller.
2018  if (flags & BT_InBlockIsCounter)
2019  const_cast<byte *>(inBlocks)[7]++;
2020 
2021  func2(block, zero, subKeys, static_cast<unsigned int>(rounds));
2022 
2023  if (xorOutput)
2024  {
2025  std::memcpy(temp+LowOffset, xorBlocks, 8);
2026  std::memcpy(temp+HighOffset, xorBlocks, 8); // don't care
2027  uint32x4_p x = (uint32x4_p)VecLoadBE(temp);
2028  block = VecXor(block, x);
2029  }
2030 
2031  VecStoreBE(block, temp);
2032  std::memcpy(outBlocks, temp+LowOffset, 8);
2033 
2034  inBlocks = PtrAdd(inBlocks, inIncrement);
2035  outBlocks = PtrAdd(outBlocks, outIncrement);
2036  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
2037  length -= blockSize;
2038  }
2039  }
2040 
2041  return length;
2042 }
2043 
2044 /// \brief AdvancedProcessBlocks for 1 and 4 blocks
2045 /// \tparam F1 function to process 1 128-bit block
2046 /// \tparam F4 function to process 4 128-bit blocks
2047 /// \tparam W word type of the subkey table
2048 /// \details AdvancedProcessBlocks128_4x1_ALTIVEC processes 4 and 1 Altivec SIMD words
2049 /// at a time.
2050 /// \details The subkey type is usually word32 or word64. F1 and F4 must use the
2051 /// same word type.
2052 template <typename F1, typename F4, typename W>
2053 inline size_t AdvancedProcessBlocks128_4x1_ALTIVEC(F1 func1, F4 func4,
2054  const W *subKeys, size_t rounds, const byte *inBlocks,
2055  const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
2056 {
2057  CRYPTOPP_ASSERT(subKeys);
2058  CRYPTOPP_ASSERT(inBlocks);
2059  CRYPTOPP_ASSERT(outBlocks);
2060  CRYPTOPP_ASSERT(length >= 16);
2061 
2062 #if (CRYPTOPP_LITTLE_ENDIAN)
2063  const uint32x4_p s_one = {1,0,0,0};
2064 #else
2065  const uint32x4_p s_one = {0,0,0,1};
2066 #endif
2067 
2068  const size_t blockSize = 16;
2069  // const size_t vsxBlockSize = 16;
2070 
2071  size_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : blockSize;
2072  size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
2073  size_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : blockSize;
2074 
2075  // Clang and Coverity are generating findings using xorBlocks as a flag.
2076  const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
2077  const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);
2078 
2079  if (flags & BT_ReverseDirection)
2080  {
2081  inBlocks = PtrAdd(inBlocks, length - blockSize);
2082  xorBlocks = PtrAdd(xorBlocks, length - blockSize);
2083  outBlocks = PtrAdd(outBlocks, length - blockSize);
2084  inIncrement = 0-inIncrement;
2085  xorIncrement = 0-xorIncrement;
2086  outIncrement = 0-outIncrement;
2087  }
2088 
2089  if (flags & BT_AllowParallel)
2090  {
2091  while (length >= 4*blockSize)
2092  {
2093  uint32x4_p block0, block1, block2, block3;
2094 
2095  if (flags & BT_InBlockIsCounter)
2096  {
2097  block0 = VecLoadBE(inBlocks);
2098  block1 = VecAdd(block0, s_one);
2099  block2 = VecAdd(block1, s_one);
2100  block3 = VecAdd(block2, s_one);
2101 
2102  // Hack due to big-endian loads used by POWER8 (and maybe ARM-BE).
2103  // CTR_ModePolicy::OperateKeystream is wired such that after
2104  // returning from this function CTR_ModePolicy will detect wrap on
2105  // on the last counter byte and increment the next to last byte.
2106  // The problem is, with a big-endian load, inBlocks[15] is really
2107  // located at index 15. The vector addition using a 32-bit element
2108  // generates a carry into inBlocks[14] and then CTR_ModePolicy
2109  // increments inBlocks[14] too.
2110  const_cast<byte*>(inBlocks)[15] += 6;
2111  }
2112  else
2113  {
2114  block0 = VecLoadBE(inBlocks);
2115  inBlocks = PtrAdd(inBlocks, inIncrement);
2116  block1 = VecLoadBE(inBlocks);
2117  inBlocks = PtrAdd(inBlocks, inIncrement);
2118  block2 = VecLoadBE(inBlocks);
2119  inBlocks = PtrAdd(inBlocks, inIncrement);
2120  block3 = VecLoadBE(inBlocks);
2121  inBlocks = PtrAdd(inBlocks, inIncrement);
2122  }
2123 
2124  if (xorInput)
2125  {
2126  block0 = VecXor(block0, VecLoadBE(xorBlocks));
2127  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
2128  block1 = VecXor(block1, VecLoadBE(xorBlocks));
2129  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
2130  block2 = VecXor(block2, VecLoadBE(xorBlocks));
2131  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
2132  block3 = VecXor(block3, VecLoadBE(xorBlocks));
2133  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
2134  }
2135 
2136  func4(block0, block1, block2, block3, subKeys, rounds);
2137 
2138  if (xorOutput)
2139  {
2140  block0 = VecXor(block0, VecLoadBE(xorBlocks));
2141  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
2142  block1 = VecXor(block1, VecLoadBE(xorBlocks));
2143  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
2144  block2 = VecXor(block2, VecLoadBE(xorBlocks));
2145  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
2146  block3 = VecXor(block3, VecLoadBE(xorBlocks));
2147  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
2148  }
2149 
2150  VecStoreBE(block0, outBlocks);
2151  outBlocks = PtrAdd(outBlocks, outIncrement);
2152  VecStoreBE(block1, outBlocks);
2153  outBlocks = PtrAdd(outBlocks, outIncrement);
2154  VecStoreBE(block2, outBlocks);
2155  outBlocks = PtrAdd(outBlocks, outIncrement);
2156  VecStoreBE(block3, outBlocks);
2157  outBlocks = PtrAdd(outBlocks, outIncrement);
2158 
2159  length -= 4*blockSize;
2160  }
2161  }
2162 
2163  while (length >= blockSize)
2164  {
2165  uint32x4_p block = VecLoadBE(inBlocks);
2166 
2167  if (xorInput)
2168  block = VecXor(block, VecLoadBE(xorBlocks));
2169 
2170  if (flags & BT_InBlockIsCounter)
2171  const_cast<byte *>(inBlocks)[15]++;
2172 
2173  func1(block, subKeys, rounds);
2174 
2175  if (xorOutput)
2176  block = VecXor(block, VecLoadBE(xorBlocks));
2177 
2178  VecStoreBE(block, outBlocks);
2179 
2180  inBlocks = PtrAdd(inBlocks, inIncrement);
2181  outBlocks = PtrAdd(outBlocks, outIncrement);
2182  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
2183  length -= blockSize;
2184  }
2185 
2186  return length;
2187 }
2188 
2189 /// \brief AdvancedProcessBlocks for 1 and 6 blocks
2190 /// \tparam F1 function to process 1 128-bit block
2191 /// \tparam F6 function to process 6 128-bit blocks
2192 /// \tparam W word type of the subkey table
2193 /// \details AdvancedProcessBlocks128_6x1_ALTIVEC processes 6 and 1 Altivec SIMD words
2194 /// at a time.
2195 /// \details The subkey type is usually word32 or word64. F1 and F6 must use the
2196 /// same word type.
2197 template <typename F1, typename F6, typename W>
2198 inline size_t AdvancedProcessBlocks128_6x1_ALTIVEC(F1 func1, F6 func6,
2199  const W *subKeys, size_t rounds, const byte *inBlocks,
2200  const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
2201 {
2202  CRYPTOPP_ASSERT(subKeys);
2203  CRYPTOPP_ASSERT(inBlocks);
2204  CRYPTOPP_ASSERT(outBlocks);
2205  CRYPTOPP_ASSERT(length >= 16);
2206 
2207 #if (CRYPTOPP_LITTLE_ENDIAN)
2208  const uint32x4_p s_one = {1,0,0,0};
2209 #else
2210  const uint32x4_p s_one = {0,0,0,1};
2211 #endif
2212 
2213  const size_t blockSize = 16;
2214  // const size_t vsxBlockSize = 16;
2215 
2216  size_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : blockSize;
2217  size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
2218  size_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : blockSize;
2219 
2220  // Clang and Coverity are generating findings using xorBlocks as a flag.
2221  const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
2222  const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);
2223 
2224  if (flags & BT_ReverseDirection)
2225  {
2226  inBlocks = PtrAdd(inBlocks, length - blockSize);
2227  xorBlocks = PtrAdd(xorBlocks, length - blockSize);
2228  outBlocks = PtrAdd(outBlocks, length - blockSize);
2229  inIncrement = 0-inIncrement;
2230  xorIncrement = 0-xorIncrement;
2231  outIncrement = 0-outIncrement;
2232  }
2233 
2234  if (flags & BT_AllowParallel)
2235  {
2236  while (length >= 6*blockSize)
2237  {
2238  uint32x4_p block0, block1, block2, block3, block4, block5;
2239 
2240  if (flags & BT_InBlockIsCounter)
2241  {
2242  block0 = VecLoadBE(inBlocks);
2243  block1 = VecAdd(block0, s_one);
2244  block2 = VecAdd(block1, s_one);
2245  block3 = VecAdd(block2, s_one);
2246  block4 = VecAdd(block3, s_one);
2247  block5 = VecAdd(block4, s_one);
2248 
2249  // Hack due to big-endian loads used by POWER8 (and maybe ARM-BE).
2250  // CTR_ModePolicy::OperateKeystream is wired such that after
2251  // returning from this function CTR_ModePolicy will detect wrap on
2252  // on the last counter byte and increment the next to last byte.
2253  // The problem is, with a big-endian load, inBlocks[15] is really
2254  // located at index 15. The vector addition using a 32-bit element
2255  // generates a carry into inBlocks[14] and then CTR_ModePolicy
2256  // increments inBlocks[14] too.
2257  //
2258  // To find this bug we needed a test case with a ctr of 0xNN...FA.
2259  // The last octet is 0xFA and adding 6 creates the wrap to trigger
2260  // the issue. If the last octet was 0xFC then 4 would trigger it.
2261  // We dumb-lucked into the test with SPECK-128. The test case of
2262  // interest is the one with IV 348ECA9766C09F04 826520DE47A212FA.
2263  uint8x16_p temp = VecAdd((uint8x16_p)block5, (uint8x16_p)s_one);
2264  VecStoreBE(temp, const_cast<byte*>(inBlocks));
2265  }
2266  else
2267  {
2268  block0 = VecLoadBE(inBlocks);
2269  inBlocks = PtrAdd(inBlocks, inIncrement);
2270  block1 = VecLoadBE(inBlocks);
2271  inBlocks = PtrAdd(inBlocks, inIncrement);
2272  block2 = VecLoadBE(inBlocks);
2273  inBlocks = PtrAdd(inBlocks, inIncrement);
2274  block3 = VecLoadBE(inBlocks);
2275  inBlocks = PtrAdd(inBlocks, inIncrement);
2276  block4 = VecLoadBE(inBlocks);
2277  inBlocks = PtrAdd(inBlocks, inIncrement);
2278  block5 = VecLoadBE(inBlocks);
2279  inBlocks = PtrAdd(inBlocks, inIncrement);
2280  }
2281 
2282  if (xorInput)
2283  {
2284  block0 = VecXor(block0, VecLoadBE(xorBlocks));
2285  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
2286  block1 = VecXor(block1, VecLoadBE(xorBlocks));
2287  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
2288  block2 = VecXor(block2, VecLoadBE(xorBlocks));
2289  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
2290  block3 = VecXor(block3, VecLoadBE(xorBlocks));
2291  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
2292  block4 = VecXor(block4, VecLoadBE(xorBlocks));
2293  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
2294  block5 = VecXor(block5, VecLoadBE(xorBlocks));
2295  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
2296  }
2297 
2298  func6(block0, block1, block2, block3, block4, block5, subKeys, rounds);
2299 
2300  if (xorOutput)
2301  {
2302  block0 = VecXor(block0, VecLoadBE(xorBlocks));
2303  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
2304  block1 = VecXor(block1, VecLoadBE(xorBlocks));
2305  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
2306  block2 = VecXor(block2, VecLoadBE(xorBlocks));
2307  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
2308  block3 = VecXor(block3, VecLoadBE(xorBlocks));
2309  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
2310  block4 = VecXor(block4, VecLoadBE(xorBlocks));
2311  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
2312  block5 = VecXor(block5, VecLoadBE(xorBlocks));
2313  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
2314  }
2315 
2316  VecStoreBE(block0, outBlocks);
2317  outBlocks = PtrAdd(outBlocks, outIncrement);
2318  VecStoreBE(block1, outBlocks);
2319  outBlocks = PtrAdd(outBlocks, outIncrement);
2320  VecStoreBE(block2, outBlocks);
2321  outBlocks = PtrAdd(outBlocks, outIncrement);
2322  VecStoreBE(block3, outBlocks);
2323  outBlocks = PtrAdd(outBlocks, outIncrement);
2324  VecStoreBE(block4, outBlocks);
2325  outBlocks = PtrAdd(outBlocks, outIncrement);
2326  VecStoreBE(block5, outBlocks);
2327  outBlocks = PtrAdd(outBlocks, outIncrement);
2328 
2329  length -= 6*blockSize;
2330  }
2331  }
2332 
2333  while (length >= blockSize)
2334  {
2335  uint32x4_p block = VecLoadBE(inBlocks);
2336 
2337  if (xorInput)
2338  block = VecXor(block, VecLoadBE(xorBlocks));
2339 
2340  if (flags & BT_InBlockIsCounter)
2341  const_cast<byte *>(inBlocks)[15]++;
2342 
2343  func1(block, subKeys, rounds);
2344 
2345  if (xorOutput)
2346  block = VecXor(block, VecLoadBE(xorBlocks));
2347 
2348  VecStoreBE(block, outBlocks);
2349 
2350  inBlocks = PtrAdd(inBlocks, inIncrement);
2351  outBlocks = PtrAdd(outBlocks, outIncrement);
2352  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
2353  length -= blockSize;
2354  }
2355 
2356  return length;
2357 }
2358 
2359 NAMESPACE_END // CryptoPP
2360 
2361 #endif // __ALTIVEC__
2362 
2363 #endif // CRYPTOPP_ADVANCED_SIMD_TEMPLATES
Allow parallel transformations.
Definition: cryptlib.h:897
Utility functions for the Crypto++ library.
Library configuration file.
should not modify block pointers
Definition: cryptlib.h:891
Common C++ header files.
T1 VecAdd(const T1 vec1, const T2 vec2)
Add two vectors.
Definition: ppc_simd.h:939
__vector unsigned int uint32x4_p
Vector of 32-bit elements.
Definition: ppc_simd.h:129
Support functions for PowerPC and vector operations.
void VecStoreBE(const T data, byte dest[16])
Stores a vector to a byte array.
Definition: ppc_simd.h:751
#define CRYPTOPP_ASSERT(exp)
Debugging and diagnostic assertion.
Definition: trap.h:69
T1 VecXor(const T1 vec1, const T2 vec2)
XOR two vectors.
Definition: ppc_simd.h:916
PTR PtrSub(PTR pointer, OFF offset)
Create a pointer with an offset.
Definition: misc.h:356
PTR PtrAdd(PTR pointer, OFF offset)
Create a pointer with an offset.
Definition: misc.h:343
Xor inputs before transformation.
Definition: cryptlib.h:893
uint32x4_p VecLoadBE(const byte src[16])
Loads a vector from a byte array.
Definition: ppc_simd.h:440
perform the transformation in reverse
Definition: cryptlib.h:895
Crypto++ library namespace.
__vector unsigned char uint8x16_p
Vector of 8-bit elements.
Definition: ppc_simd.h:119