// adv_simd.h - written and placed in the public domain by Jeffrey Walton

#ifndef CRYPTOPP_ADVANCED_SIMD_TEMPLATES
#define CRYPTOPP_ADVANCED_SIMD_TEMPLATES

#include "config.h"
#include "misc.h"
#include "stdcpp.h"

#if (CRYPTOPP_ARM_NEON_AVAILABLE) && !defined(_M_ARM64)
# include <arm_neon.h>
#endif

#if (CRYPTOPP_ARM_ACLE_AVAILABLE)
# include <stdint.h>
# include <arm_acle.h>
#endif

#if (CRYPTOPP_SSE2_INTRIN_AVAILABLE)
# include <emmintrin.h>
# include <xmmintrin.h>
#endif

#if (CRYPTOPP_SSSE3_AVAILABLE)
# include <emmintrin.h>
# include <pmmintrin.h>
# include <xmmintrin.h>
#endif

#if defined(__ALTIVEC__)
# include "ppc_simd.h"
#endif

// ************************ All block ciphers *********************** //

ANONYMOUS_NAMESPACE_BEGIN

using CryptoPP::BlockTransformation;

CRYPTOPP_CONSTANT(BT_XorInput = BlockTransformation::BT_XorInput);
CRYPTOPP_CONSTANT(BT_AllowParallel = BlockTransformation::BT_AllowParallel);
CRYPTOPP_CONSTANT(BT_InBlockIsCounter = BlockTransformation::BT_InBlockIsCounter);
CRYPTOPP_CONSTANT(BT_ReverseDirection = BlockTransformation::BT_ReverseDirection);
CRYPTOPP_CONSTANT(BT_DontIncrementInOutPointers = BlockTransformation::BT_DontIncrementInOutPointers);

ANONYMOUS_NAMESPACE_END
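
// Usage sketch (illustrative only, not part of the original header): a cipher
// implementation supplies SIMD workhorses and forwards its
// AdvancedProcessBlocks overload to one of the templates below. The
// MyCipher64 names are hypothetical stand-ins for what ciphers such as SIMON
// and SPECK provide in their *_simd.cpp files.
//
//   inline void MyCipher64_Enc_Block(uint32x4_t &block0, uint32x4_t &block1,
//       const word32 *subkeys, unsigned int rounds);
//   inline void MyCipher64_Enc_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
//       uint32x4_t &block2, uint32x4_t &block3, uint32x4_t &block4,
//       uint32x4_t &block5, const word32 *subkeys, unsigned int rounds);
//
//   size_t MyCipher64_Enc_AdvancedProcessBlocks_NEON(const word32* subKeys,
//       size_t rounds, const byte *inBlocks, const byte *xorBlocks,
//       byte *outBlocks, size_t length, word32 flags)
//   {
//       return AdvancedProcessBlocks64_6x2_NEON(MyCipher64_Enc_Block,
//           MyCipher64_Enc_6_Blocks, subKeys, rounds, inBlocks, xorBlocks,
//           outBlocks, length, flags);
//   }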

// *************************** ARM NEON **************************** //

#if (CRYPTOPP_ARM_NEON_AVAILABLE)

NAMESPACE_BEGIN(CryptoPP)

/// \brief AdvancedProcessBlocks for 2 and 6 blocks
/// \tparam F2 function to process 2 64-bit blocks
/// \tparam F6 function to process 6 64-bit blocks
/// \tparam W word type of the subkey table
/// \details AdvancedProcessBlocks64_6x2_NEON processes 6 and 2 NEON SIMD
///   words at a time. For a single block the template uses func2 with a
///   zero block.
template <typename F2, typename F6, typename W>
inline size_t AdvancedProcessBlocks64_6x2_NEON(F2 func2, F6 func6,
    const W *subKeys, size_t rounds, const byte *inBlocks,
    const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    CRYPTOPP_ASSERT(subKeys);
    CRYPTOPP_ASSERT(inBlocks);
    CRYPTOPP_ASSERT(outBlocks);
    CRYPTOPP_ASSERT(length >= 16);

    const unsigned int w_one[] = {0, 0<<24, 0, 1<<24};
    const unsigned int w_two[] = {0, 2<<24, 0, 2<<24};
    const uint32x4_t s_one = vld1q_u32(w_one);
    const uint32x4_t s_two = vld1q_u32(w_two);

    const size_t blockSize = 8;
    const size_t neonBlockSize = 16;

    size_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : neonBlockSize;
    size_t xorIncrement = (xorBlocks != NULLPTR) ? neonBlockSize : 0;
    size_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : neonBlockSize;

    // Clang and Coverity are generating findings using xorBlocks as a flag.
    const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
    const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);

    if (flags & BT_ReverseDirection)
    {
        inBlocks = PtrAdd(inBlocks, length - neonBlockSize);
        xorBlocks = PtrAdd(xorBlocks, length - neonBlockSize);
        outBlocks = PtrAdd(outBlocks, length - neonBlockSize);
        inIncrement = 0-inIncrement;
        xorIncrement = 0-xorIncrement;
        outIncrement = 0-outIncrement;
    }

    if (flags & BT_AllowParallel)
    {
        while (length >= 6*neonBlockSize)
        {
            uint32x4_t block0, block1, block2, block3, block4, block5;
            if (flags & BT_InBlockIsCounter)
            {
                // For 64-bit block ciphers we need to load the CTR block,
                // which is 8 bytes. After the dup load we have two counters
                // in the NEON word. Then we need to increment the low ctr by
                // 0 and the high ctr by 1.
                const uint8x8_t ctr = vld1_u8(inBlocks);
                block0 = vaddq_u32(s_one, vreinterpretq_u32_u8(vcombine_u8(ctr,ctr)));

                // After initial increment of {0,1} remaining counters
                // increment by {2,2}.
                block1 = vaddq_u32(s_two, block0);
                block2 = vaddq_u32(s_two, block1);
                block3 = vaddq_u32(s_two, block2);
                block4 = vaddq_u32(s_two, block3);
                block5 = vaddq_u32(s_two, block4);

                // Update the counter in the caller.
                vst1_u8(const_cast<byte*>(inBlocks), vget_low_u8(
                    vreinterpretq_u8_u32(vaddq_u32(s_two, block5))));
            }
            else
            {
                block0 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block2 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block3 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block4 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block5 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = veorq_u32(block0, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = veorq_u32(block1, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = veorq_u32(block2, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = veorq_u32(block3, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block4 = veorq_u32(block4, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block5 = veorq_u32(block5, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func6(block0, block1, block2, block3, block4, block5, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = veorq_u32(block0, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = veorq_u32(block1, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = veorq_u32(block2, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = veorq_u32(block3, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block4 = veorq_u32(block4, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block5 = veorq_u32(block5, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            vst1q_u8(outBlocks, vreinterpretq_u8_u32(block0));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u32(block1));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u32(block2));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u32(block3));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u32(block4));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u32(block5));
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 6*neonBlockSize;
        }

        while (length >= 2*neonBlockSize)
        {
            uint32x4_t block0, block1;
            if (flags & BT_InBlockIsCounter)
            {
                // Load the 8-byte counter and duplicate it into both halves
                // of the NEON word, then increment the halves by {0,1}.
                const uint8x8_t ctr = vld1_u8(inBlocks);
                block0 = vaddq_u32(s_one, vreinterpretq_u32_u8(vcombine_u8(ctr,ctr)));

                // After initial increment of {0,1} remaining counters
                // increment by {2,2}.
                block1 = vaddq_u32(s_two, block0);

                // Update the counter in the caller.
                vst1_u8(const_cast<byte*>(inBlocks), vget_low_u8(
                    vreinterpretq_u8_u32(vaddq_u32(s_two, block1))));
            }
            else
            {
                block0 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = veorq_u32(block0, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = veorq_u32(block1, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func2(block0, block1, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = veorq_u32(block0, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = veorq_u32(block1, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            vst1q_u8(outBlocks, vreinterpretq_u8_u32(block0));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u32(block1));
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 2*neonBlockSize;
        }
    }

    if (length)
    {
        // Adjust to real block size
        if (flags & BT_ReverseDirection)
        {
            inIncrement += inIncrement ? blockSize : 0;
            xorIncrement += xorIncrement ? blockSize : 0;
            outIncrement += outIncrement ? blockSize : 0;
            inBlocks = PtrSub(inBlocks, inIncrement);
            xorBlocks = PtrSub(xorBlocks, xorIncrement);
            outBlocks = PtrSub(outBlocks, outIncrement);
        }
        else
        {
            inIncrement -= inIncrement ? blockSize : 0;
            xorIncrement -= xorIncrement ? blockSize : 0;
            outIncrement -= outIncrement ? blockSize : 0;
        }

        while (length >= blockSize)
        {
            uint32x4_t block, zero = {0};

            const uint8x8_t v = vld1_u8(inBlocks);
            block = vreinterpretq_u32_u8(vcombine_u8(v,v));

            if (xorInput)
            {
                const uint8x8_t x = vld1_u8(xorBlocks);
                block = veorq_u32(block, vreinterpretq_u32_u8(vcombine_u8(x,x)));
            }

            if (flags & BT_InBlockIsCounter)
                const_cast<byte *>(inBlocks)[7]++;

            func2(block, zero, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                const uint8x8_t x = vld1_u8(xorBlocks);
                block = veorq_u32(block, vreinterpretq_u32_u8(vcombine_u8(x,x)));
            }

            vst1_u8(const_cast<byte*>(outBlocks),
                vget_low_u8(vreinterpretq_u8_u32(block)));

            inBlocks = PtrAdd(inBlocks, inIncrement);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            length -= blockSize;
        }
    }

    return length;
}
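
// A worked example of the counter packing used by the 64-bit templates
// (editorial note): the 8-byte big-endian counter C is dup-loaded into both
// halves of a 16-byte vector. The counters' low-order bytes live at byte
// offsets 7 and 15 of the vector, and a value of 1<<24 in a 32-bit lane
// lands on the top byte of that lane, which is exactly byte 7 or byte 15.
// So s_one = {0, 0<<24, 0, 1<<24} turns the dup'd {C, C} into {C, C+1}, and
// s_two = {0, 2<<24, 0, 2<<24} then advances both packed counters by 2 per
// vector word. Any carry out of the low-order counter byte is dropped here;
// the caller (CTR_ModePolicy) detects the wrap and fixes the higher bytes.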

/// \brief AdvancedProcessBlocks for 1 and 6 blocks
/// \tparam F1 function to process 1 128-bit block
/// \tparam F6 function to process 6 128-bit blocks
/// \tparam W word type of the subkey table
/// \details AdvancedProcessBlocks128_6x1_NEON processes 6 and 1 NEON SIMD
///   words at a time.
template <typename F1, typename F6, typename W>
inline size_t AdvancedProcessBlocks128_6x1_NEON(F1 func1, F6 func6,
    const W *subKeys, size_t rounds, const byte *inBlocks,
    const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    CRYPTOPP_ASSERT(subKeys);
    CRYPTOPP_ASSERT(inBlocks);
    CRYPTOPP_ASSERT(outBlocks);
    CRYPTOPP_ASSERT(length >= 16);

    const unsigned int w_one[] = {0, 0<<24, 0, 1<<24};
    const unsigned int w_two[] = {0, 2<<24, 0, 2<<24};
    const uint32x4_t s_one = vld1q_u32(w_one);
    const uint32x4_t s_two = vld1q_u32(w_two);

    const size_t blockSize = 16;

    size_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : blockSize;
    size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
    size_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : blockSize;

    // Clang and Coverity are generating findings using xorBlocks as a flag.
    const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
    const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);

    if (flags & BT_ReverseDirection)
    {
        inBlocks = PtrAdd(inBlocks, length - blockSize);
        xorBlocks = PtrAdd(xorBlocks, length - blockSize);
        outBlocks = PtrAdd(outBlocks, length - blockSize);
        inIncrement = 0-inIncrement;
        xorIncrement = 0-xorIncrement;
        outIncrement = 0-outIncrement;
    }

    if (flags & BT_AllowParallel)
    {
        while (length >= 6*blockSize)
        {
            uint64x2_t block0, block1, block2, block3, block4, block5;
            if (flags & BT_InBlockIsCounter)
            {
                // For 128-bit block ciphers the counter is the full 16-byte
                // block; s_one increments the last byte of the big-endian value.
                const uint64x2_t one = vreinterpretq_u64_u32(s_one);
                block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                block1 = vaddq_u64(block0, one);
                block2 = vaddq_u64(block1, one);
                block3 = vaddq_u64(block2, one);
                block4 = vaddq_u64(block3, one);
                block5 = vaddq_u64(block4, one);
                vst1q_u8(const_cast<byte*>(inBlocks),
                    vreinterpretq_u8_u64(vaddq_u64(block5, one)));
            }
            else
            {
                block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block2 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block3 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block4 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block5 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = veorq_u64(block2, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = veorq_u64(block3, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block4 = veorq_u64(block4, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block5 = veorq_u64(block5, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func6(block0, block1, block2, block3, block4, block5, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = veorq_u64(block2, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = veorq_u64(block3, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block4 = veorq_u64(block4, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block5 = veorq_u64(block5, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block0));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block1));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block2));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block3));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block4));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block5));
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 6*blockSize;
        }
    }

    while (length >= blockSize)
    {
        uint64x2_t block;
        block = vreinterpretq_u64_u8(vld1q_u8(inBlocks));

        if (xorInput)
            block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));

        if (flags & BT_InBlockIsCounter)
            const_cast<byte *>(inBlocks)[15]++;

        func1(block, subKeys, static_cast<unsigned int>(rounds));

        if (xorOutput)
            block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));

        vst1q_u8(outBlocks, vreinterpretq_u8_u64(block));

        inBlocks = PtrAdd(inBlocks, inIncrement);
        outBlocks = PtrAdd(outBlocks, outIncrement);
        xorBlocks = PtrAdd(xorBlocks, xorIncrement);
        length -= blockSize;
    }

    return length;
}
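
// Editorial note: the _6x2 templates in this file have no dedicated
// single-block function, so a lone trailing block is processed by calling
// func2 with a zero-filled second block and discarding that lane. The _6x1
// and _4x1 templates, such as the one above and below, take a separate
// func1 and avoid the wasted work.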

/// \brief AdvancedProcessBlocks for 1 and 4 blocks
/// \tparam F1 function to process 1 128-bit block
/// \tparam F4 function to process 4 128-bit blocks
/// \tparam W word type of the subkey table
template <typename F1, typename F4, typename W>
inline size_t AdvancedProcessBlocks128_4x1_NEON(F1 func1, F4 func4,
    const W *subKeys, size_t rounds, const byte *inBlocks,
    const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    CRYPTOPP_ASSERT(subKeys);
    CRYPTOPP_ASSERT(inBlocks);
    CRYPTOPP_ASSERT(outBlocks);
    CRYPTOPP_ASSERT(length >= 16);

    const unsigned int w_one[] = {0, 0<<24, 0, 1<<24};
    const unsigned int w_two[] = {0, 2<<24, 0, 2<<24};
    const uint32x4_t s_one = vld1q_u32(w_one);
    const uint32x4_t s_two = vld1q_u32(w_two);

    const size_t blockSize = 16;

    size_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : blockSize;
    size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
    size_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : blockSize;

    // Clang and Coverity are generating findings using xorBlocks as a flag.
    const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
    const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);

    if (flags & BT_ReverseDirection)
    {
        inBlocks = PtrAdd(inBlocks, length - blockSize);
        xorBlocks = PtrAdd(xorBlocks, length - blockSize);
        outBlocks = PtrAdd(outBlocks, length - blockSize);
        inIncrement = 0-inIncrement;
        xorIncrement = 0-xorIncrement;
        outIncrement = 0-outIncrement;
    }

    if (flags & BT_AllowParallel)
    {
        while (length >= 4*blockSize)
        {
            uint32x4_t block0, block1, block2, block3;
            if (flags & BT_InBlockIsCounter)
            {
                const uint32x4_t one = s_one;
                block0 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
                block1 = vreinterpretq_u32_u64(vaddq_u64(vreinterpretq_u64_u32(block0), vreinterpretq_u64_u32(one)));
                block2 = vreinterpretq_u32_u64(vaddq_u64(vreinterpretq_u64_u32(block1), vreinterpretq_u64_u32(one)));
                block3 = vreinterpretq_u32_u64(vaddq_u64(vreinterpretq_u64_u32(block2), vreinterpretq_u64_u32(one)));
                vst1q_u8(const_cast<byte*>(inBlocks), vreinterpretq_u8_u64(vaddq_u64(
                    vreinterpretq_u64_u32(block3), vreinterpretq_u64_u32(one))));
            }
            else
            {
                block0 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block2 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block3 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = veorq_u32(block0, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = veorq_u32(block1, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = veorq_u32(block2, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = veorq_u32(block3, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func4(block0, block1, block2, block3, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = veorq_u32(block0, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = veorq_u32(block1, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = veorq_u32(block2, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = veorq_u32(block3, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            vst1q_u8(outBlocks, vreinterpretq_u8_u32(block0));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u32(block1));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u32(block2));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u32(block3));
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 4*blockSize;
        }
    }

    while (length >= blockSize)
    {
        uint32x4_t block = vreinterpretq_u32_u8(vld1q_u8(inBlocks));

        if (xorInput)
            block = veorq_u32(block, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));

        if (flags & BT_InBlockIsCounter)
            const_cast<byte *>(inBlocks)[15]++;

        func1(block, subKeys, static_cast<unsigned int>(rounds));

        if (xorOutput)
            block = veorq_u32(block, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));

        vst1q_u8(outBlocks, vreinterpretq_u8_u32(block));

        inBlocks = PtrAdd(inBlocks, inIncrement);
        outBlocks = PtrAdd(outBlocks, outIncrement);
        xorBlocks = PtrAdd(xorBlocks, xorIncrement);
        length -= blockSize;
    }

    return length;
}

/// \brief AdvancedProcessBlocks for 2 and 6 blocks
/// \tparam F2 function to process 2 128-bit blocks
/// \tparam F6 function to process 6 128-bit blocks
/// \tparam W word type of the subkey table
/// \details AdvancedProcessBlocks128_6x2_NEON processes 6 and 2 NEON SIMD
///   words at a time. For a single block the template uses func2 with a
///   zero block.
template <typename F2, typename F6, typename W>
inline size_t AdvancedProcessBlocks128_6x2_NEON(F2 func2, F6 func6,
    const W *subKeys, size_t rounds, const byte *inBlocks,
    const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    CRYPTOPP_ASSERT(subKeys);
    CRYPTOPP_ASSERT(inBlocks);
    CRYPTOPP_ASSERT(outBlocks);
    CRYPTOPP_ASSERT(length >= 16);

    const unsigned int w_one[] = {0, 0<<24, 0, 1<<24};
    const unsigned int w_two[] = {0, 2<<24, 0, 2<<24};
    const uint32x4_t s_one = vld1q_u32(w_one);
    const uint32x4_t s_two = vld1q_u32(w_two);

    const size_t blockSize = 16;

    size_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : blockSize;
    size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
    size_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : blockSize;

    // Clang and Coverity are generating findings using xorBlocks as a flag.
    const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
    const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);

    if (flags & BT_ReverseDirection)
    {
        inBlocks = PtrAdd(inBlocks, length - blockSize);
        xorBlocks = PtrAdd(xorBlocks, length - blockSize);
        outBlocks = PtrAdd(outBlocks, length - blockSize);
        inIncrement = 0-inIncrement;
        xorIncrement = 0-xorIncrement;
        outIncrement = 0-outIncrement;
    }

    if (flags & BT_AllowParallel)
    {
        while (length >= 6*blockSize)
        {
            uint64x2_t block0, block1, block2, block3, block4, block5;
            if (flags & BT_InBlockIsCounter)
            {
                const uint64x2_t one = vreinterpretq_u64_u32(s_one);
                block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                block1 = vaddq_u64(block0, one);
                block2 = vaddq_u64(block1, one);
                block3 = vaddq_u64(block2, one);
                block4 = vaddq_u64(block3, one);
                block5 = vaddq_u64(block4, one);
                vst1q_u8(const_cast<byte*>(inBlocks),
                    vreinterpretq_u8_u64(vaddq_u64(block5, one)));
            }
            else
            {
                block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block2 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block3 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block4 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block5 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = veorq_u64(block2, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = veorq_u64(block3, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block4 = veorq_u64(block4, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block5 = veorq_u64(block5, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func6(block0, block1, block2, block3, block4, block5, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = veorq_u64(block2, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = veorq_u64(block3, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block4 = veorq_u64(block4, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block5 = veorq_u64(block5, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block0));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block1));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block2));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block3));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block4));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block5));
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 6*blockSize;
        }

        while (length >= 2*blockSize)
        {
            uint64x2_t block0, block1;
            if (flags & BT_InBlockIsCounter)
            {
                const uint64x2_t one = vreinterpretq_u64_u32(s_one);
                block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                block1 = vaddq_u64(block0, one);
                vst1q_u8(const_cast<byte*>(inBlocks),
                    vreinterpretq_u8_u64(vaddq_u64(block1, one)));
            }
            else
            {
                block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func2(block0, block1, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block0));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block1));
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 2*blockSize;
        }
    }

    while (length >= blockSize)
    {
        uint64x2_t block, zero = {0,0};
        block = vreinterpretq_u64_u8(vld1q_u8(inBlocks));

        if (xorInput)
            block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));

        if (flags & BT_InBlockIsCounter)
            const_cast<byte *>(inBlocks)[15]++;

        func2(block, zero, subKeys, static_cast<unsigned int>(rounds));

        if (xorOutput)
            block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));

        vst1q_u8(outBlocks, vreinterpretq_u8_u64(block));

        inBlocks = PtrAdd(inBlocks, inIncrement);
        outBlocks = PtrAdd(outBlocks, outIncrement);
        xorBlocks = PtrAdd(xorBlocks, xorIncrement);
        length -= blockSize;
    }

    return length;
}

NAMESPACE_END  // CryptoPP

#endif  // CRYPTOPP_ARM_NEON_AVAILABLE

// *************************** Intel SSE ************************** //

#if defined(CRYPTOPP_SSSE3_AVAILABLE)

// Hack for SunCC, which has trouble with const subkey tables in these templates.
#if (__SUNPRO_CC >= 0x5130)
# define MAYBE_CONST
# define MAYBE_UNCONST_CAST(T, x) const_cast<MAYBE_CONST T>(x)
#else
# define MAYBE_CONST const
# define MAYBE_UNCONST_CAST(T, x) (x)
#endif

// Clang intrinsic casts
#ifndef M128_CAST
# define M128_CAST(x) ((__m128i *)(void *)(x))
#endif
#ifndef CONST_M128_CAST
# define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x))
#endif

NAMESPACE_BEGIN(CryptoPP)

/// \brief AdvancedProcessBlocks for 1 and 2 blocks
/// \tparam F1 function to process 1 64-bit block
/// \tparam F2 function to process 2 64-bit blocks
/// \tparam W word type of the subkey table
template <typename F1, typename F2, typename W>
inline size_t AdvancedProcessBlocks64_2x1_SSE(F1 func1, F2 func2,
    MAYBE_CONST W *subKeys, size_t rounds, const byte *inBlocks,
    const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    CRYPTOPP_ASSERT(subKeys);
    CRYPTOPP_ASSERT(inBlocks);
    CRYPTOPP_ASSERT(outBlocks);
    CRYPTOPP_ASSERT(length >= 16);

    const size_t blockSize = 8;
    const size_t xmmBlockSize = 16;

    size_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : xmmBlockSize;
    size_t xorIncrement = (xorBlocks != NULLPTR) ? xmmBlockSize : 0;
    size_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : xmmBlockSize;

    // Clang and Coverity are generating findings using xorBlocks as a flag.
    const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
    const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);

    if (flags & BT_ReverseDirection)
    {
        inBlocks = PtrAdd(inBlocks, length - xmmBlockSize);
        xorBlocks = PtrAdd(xorBlocks, length - xmmBlockSize);
        outBlocks = PtrAdd(outBlocks, length - xmmBlockSize);
        inIncrement = 0-inIncrement;
        xorIncrement = 0-xorIncrement;
        outIncrement = 0-outIncrement;
    }

    if (flags & BT_AllowParallel)
    {
        while (length >= 2*xmmBlockSize)
        {
            __m128i block0, block1;
            if (flags & BT_InBlockIsCounter)
            {
                // Increments of 1 and 2 in big-endian compatible with the ctr byte array.
                const __m128i s_one = _mm_set_epi32(1<<24, 0, 0, 0);
                const __m128i s_two = _mm_set_epi32(2<<24, 0, 2<<24, 0);

                // For 64-bit block ciphers we need to load the CTR block,
                // which is 8 bytes. After the dup load we have two counters
                // in the XMM word. Then we need to increment the low ctr by
                // 0 and the high ctr by 1.
                double temp[2];
                std::memcpy(temp, inBlocks, blockSize);
                block0 = _mm_add_epi32(s_one, _mm_castpd_si128(_mm_loaddup_pd(temp)));

                // After initial increment of {0,1} remaining counters increment by {2,2}.
                block1 = _mm_add_epi32(s_two, block0);

                // Update the counter in the caller.
                _mm_store_sd(temp, _mm_castsi128_pd(_mm_add_epi64(s_two, block1)));
                std::memcpy(const_cast<byte*>(inBlocks), temp, blockSize);
            }
            else
            {
                block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func2(block0, block1, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            _mm_storeu_si128(M128_CAST(outBlocks), block0);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block1);
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 2*xmmBlockSize;
        }
    }

    if (length)
    {
        // Adjust to real block size
        if (flags & BT_ReverseDirection)
        {
            inIncrement += inIncrement ? blockSize : 0;
            xorIncrement += xorIncrement ? blockSize : 0;
            outIncrement += outIncrement ? blockSize : 0;
            inBlocks = PtrSub(inBlocks, inIncrement);
            xorBlocks = PtrSub(xorBlocks, xorIncrement);
            outBlocks = PtrSub(outBlocks, outIncrement);
        }
        else
        {
            inIncrement -= inIncrement ? blockSize : 0;
            xorIncrement -= xorIncrement ? blockSize : 0;
            outIncrement -= outIncrement ? blockSize : 0;
        }

        while (length >= blockSize)
        {
            double temp[2];
            std::memcpy(temp, inBlocks, blockSize);
            __m128i block = _mm_castpd_si128(_mm_load_sd(temp));

            if (xorInput)
            {
                std::memcpy(temp, xorBlocks, blockSize);
                block = _mm_xor_si128(block, _mm_castpd_si128(_mm_load_sd(temp)));
            }

            if (flags & BT_InBlockIsCounter)
                const_cast<byte *>(inBlocks)[7]++;

            func1(block, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                std::memcpy(temp, xorBlocks, blockSize);
                block = _mm_xor_si128(block, _mm_castpd_si128(_mm_load_sd(temp)));
            }

            _mm_store_sd(temp, _mm_castsi128_pd(block));
            std::memcpy(outBlocks, temp, blockSize);

            inBlocks = PtrAdd(inBlocks, inIncrement);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            length -= blockSize;
        }
    }

    return length;
}
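
// Editorial note on the double[2] staging above: _mm_loaddup_pd (SSE3) is
// the cheapest way to broadcast 8 unaligned bytes into both 64-bit lanes of
// an XMM register, but it takes a double*. The code therefore memcpy's the
// counter bytes into a double[2] and converts with _mm_castpd_si128 and
// _mm_castsi128_pd; the casts are pure reinterpretations that generate no
// instructions, while the memcpy keeps the byte accesses well-defined.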

/// \brief AdvancedProcessBlocks for 2 and 6 blocks
/// \tparam F2 function to process 2 64-bit blocks
/// \tparam F6 function to process 6 64-bit blocks
/// \tparam W word type of the subkey table
/// \details AdvancedProcessBlocks64_6x2_SSE processes 6 and 2 XMM words at a
///   time. For a single block the template uses func2 with a zero block.
template <typename F2, typename F6, typename W>
inline size_t AdvancedProcessBlocks64_6x2_SSE(F2 func2, F6 func6,
    MAYBE_CONST W *subKeys, size_t rounds, const byte *inBlocks,
    const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    CRYPTOPP_ASSERT(subKeys);
    CRYPTOPP_ASSERT(inBlocks);
    CRYPTOPP_ASSERT(outBlocks);
    CRYPTOPP_ASSERT(length >= 16);

    const size_t blockSize = 8;
    const size_t xmmBlockSize = 16;

    size_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : xmmBlockSize;
    size_t xorIncrement = (xorBlocks != NULLPTR) ? xmmBlockSize : 0;
    size_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : xmmBlockSize;

    // Clang and Coverity are generating findings using xorBlocks as a flag.
    const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
    const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);

    if (flags & BT_ReverseDirection)
    {
        inBlocks = PtrAdd(inBlocks, length - xmmBlockSize);
        xorBlocks = PtrAdd(xorBlocks, length - xmmBlockSize);
        outBlocks = PtrAdd(outBlocks, length - xmmBlockSize);
        inIncrement = 0-inIncrement;
        xorIncrement = 0-xorIncrement;
        outIncrement = 0-outIncrement;
    }

    if (flags & BT_AllowParallel)
    {
        while (length >= 6*xmmBlockSize)
        {
            __m128i block0, block1, block2, block3, block4, block5;
            if (flags & BT_InBlockIsCounter)
            {
                // Increments of 1 and 2 in big-endian compatible with the ctr byte array.
                const __m128i s_one = _mm_set_epi32(1<<24, 0, 0, 0);
                const __m128i s_two = _mm_set_epi32(2<<24, 0, 2<<24, 0);

                // Load the 8-byte counter, duplicate it into both halves of
                // the XMM word, then increment the halves by {0,1}.
                double temp[2];
                std::memcpy(temp, inBlocks, blockSize);
                block0 = _mm_add_epi32(s_one, _mm_castpd_si128(_mm_loaddup_pd(temp)));

                // After initial increment of {0,1} remaining counters increment by {2,2}.
                block1 = _mm_add_epi32(s_two, block0);
                block2 = _mm_add_epi32(s_two, block1);
                block3 = _mm_add_epi32(s_two, block2);
                block4 = _mm_add_epi32(s_two, block3);
                block5 = _mm_add_epi32(s_two, block4);

                // Update the counter in the caller.
                _mm_store_sd(temp, _mm_castsi128_pd(_mm_add_epi32(s_two, block5)));
                std::memcpy(const_cast<byte*>(inBlocks), temp, blockSize);
            }
            else
            {
                block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block2 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block3 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block4 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block5 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block4 = _mm_xor_si128(block4, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block5 = _mm_xor_si128(block5, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func6(block0, block1, block2, block3, block4, block5, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block4 = _mm_xor_si128(block4, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block5 = _mm_xor_si128(block5, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            _mm_storeu_si128(M128_CAST(outBlocks), block0);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block1);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block2);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block3);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block4);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block5);
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 6*xmmBlockSize;
        }

        while (length >= 2*xmmBlockSize)
        {
            __m128i block0, block1;
            if (flags & BT_InBlockIsCounter)
            {
                // Increments of 1 and 2 in big-endian compatible with the ctr byte array.
                const __m128i s_one = _mm_set_epi32(1<<24, 0, 0, 0);
                const __m128i s_two = _mm_set_epi32(2<<24, 0, 2<<24, 0);

                // Load the 8-byte counter, duplicate it into both halves of
                // the XMM word, then increment the halves by {0,1}.
                double temp[2];
                std::memcpy(temp, inBlocks, blockSize);
                block0 = _mm_add_epi32(s_one, _mm_castpd_si128(_mm_loaddup_pd(temp)));

                // After initial increment of {0,1} remaining counters increment by {2,2}.
                block1 = _mm_add_epi32(s_two, block0);

                // Update the counter in the caller.
                _mm_store_sd(temp, _mm_castsi128_pd(_mm_add_epi64(s_two, block1)));
                std::memcpy(const_cast<byte*>(inBlocks), temp, blockSize);
            }
            else
            {
                block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func2(block0, block1, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            _mm_storeu_si128(M128_CAST(outBlocks), block0);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block1);
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 2*xmmBlockSize;
        }
    }

    if (length)
    {
        // Adjust to real block size
        if (flags & BT_ReverseDirection)
        {
            inIncrement += inIncrement ? blockSize : 0;
            xorIncrement += xorIncrement ? blockSize : 0;
            outIncrement += outIncrement ? blockSize : 0;
            inBlocks = PtrSub(inBlocks, inIncrement);
            xorBlocks = PtrSub(xorBlocks, xorIncrement);
            outBlocks = PtrSub(outBlocks, outIncrement);
        }
        else
        {
            inIncrement -= inIncrement ? blockSize : 0;
            xorIncrement -= xorIncrement ? blockSize : 0;
            outIncrement -= outIncrement ? blockSize : 0;
        }

        while (length >= blockSize)
        {
            double temp[2];
            __m128i block, zero = _mm_setzero_si128();
            std::memcpy(temp, inBlocks, blockSize);
            block = _mm_castpd_si128(_mm_load_sd(temp));

            if (xorInput)
            {
                std::memcpy(temp, xorBlocks, blockSize);
                block = _mm_xor_si128(block,
                    _mm_castpd_si128(_mm_load_sd(temp)));
            }

            if (flags & BT_InBlockIsCounter)
                const_cast<byte *>(inBlocks)[7]++;

            func2(block, zero, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                std::memcpy(temp, xorBlocks, blockSize);
                block = _mm_xor_si128(block,
                    _mm_castpd_si128(_mm_load_sd(temp)));
            }

            _mm_store_sd(temp, _mm_castsi128_pd(block));
            std::memcpy(outBlocks, temp, blockSize);

            inBlocks = PtrAdd(inBlocks, inIncrement);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            length -= blockSize;
        }
    }

    return length;
}

/// \brief AdvancedProcessBlocks for 2 and 6 blocks
/// \tparam F2 function to process 2 128-bit blocks
/// \tparam F6 function to process 6 128-bit blocks
/// \tparam W word type of the subkey table
/// \details AdvancedProcessBlocks128_6x2_SSE processes 6 and 2 XMM words at a
///   time. For a single block the template uses func2 with a zero block.
template <typename F2, typename F6, typename W>
inline size_t AdvancedProcessBlocks128_6x2_SSE(F2 func2, F6 func6,
    MAYBE_CONST W *subKeys, size_t rounds, const byte *inBlocks,
    const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    CRYPTOPP_ASSERT(subKeys);
    CRYPTOPP_ASSERT(inBlocks);
    CRYPTOPP_ASSERT(outBlocks);
    CRYPTOPP_ASSERT(length >= 16);

    const size_t blockSize = 16;

    size_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : blockSize;
    size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
    size_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : blockSize;

    // Clang and Coverity are generating findings using xorBlocks as a flag.
    const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
    const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);

    if (flags & BT_ReverseDirection)
    {
        inBlocks = PtrAdd(inBlocks, length - blockSize);
        xorBlocks = PtrAdd(xorBlocks, length - blockSize);
        outBlocks = PtrAdd(outBlocks, length - blockSize);
        inIncrement = 0-inIncrement;
        xorIncrement = 0-xorIncrement;
        outIncrement = 0-outIncrement;
    }

    if (flags & BT_AllowParallel)
    {
        while (length >= 6*blockSize)
        {
            __m128i block0, block1, block2, block3, block4, block5;
            if (flags & BT_InBlockIsCounter)
            {
                // Increment of 1 in big-endian compatible with the ctr byte array.
                const __m128i s_one = _mm_set_epi32(1<<24, 0, 0, 0);
                block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                block1 = _mm_add_epi32(block0, s_one);
                block2 = _mm_add_epi32(block1, s_one);
                block3 = _mm_add_epi32(block2, s_one);
                block4 = _mm_add_epi32(block3, s_one);
                block5 = _mm_add_epi32(block4, s_one);
                _mm_storeu_si128(M128_CAST(inBlocks), _mm_add_epi32(block5, s_one));
            }
            else
            {
                block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block2 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block3 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block4 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block5 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block4 = _mm_xor_si128(block4, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block5 = _mm_xor_si128(block5, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func6(block0, block1, block2, block3, block4, block5, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block4 = _mm_xor_si128(block4, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block5 = _mm_xor_si128(block5, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            _mm_storeu_si128(M128_CAST(outBlocks), block0);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block1);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block2);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block3);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block4);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block5);
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 6*blockSize;
        }

        while (length >= 2*blockSize)
        {
            __m128i block0, block1;
            if (flags & BT_InBlockIsCounter)
            {
                // Increment of 1 in big-endian compatible with the ctr byte array.
                const __m128i s_one = _mm_set_epi32(1<<24, 0, 0, 0);
                block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                block1 = _mm_add_epi32(block0, s_one);
                _mm_storeu_si128(M128_CAST(inBlocks), _mm_add_epi32(block1, s_one));
            }
            else
            {
                block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func2(block0, block1, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            _mm_storeu_si128(M128_CAST(outBlocks), block0);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block1);
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 2*blockSize;
        }
    }

    while (length >= blockSize)
    {
        __m128i block, zero = _mm_setzero_si128();
        block = _mm_loadu_si128(CONST_M128_CAST(inBlocks));

        if (xorInput)
            block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));

        if (flags & BT_InBlockIsCounter)
            const_cast<byte *>(inBlocks)[15]++;

        func2(block, zero, subKeys, static_cast<unsigned int>(rounds));

        if (xorOutput)
            block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));

        _mm_storeu_si128(M128_CAST(outBlocks), block);

        inBlocks = PtrAdd(inBlocks, inIncrement);
        outBlocks = PtrAdd(outBlocks, outIncrement);
        xorBlocks = PtrAdd(xorBlocks, xorIncrement);
        length -= blockSize;
    }

    return length;
}

/// \brief AdvancedProcessBlocks for 1 and 4 blocks
/// \tparam F1 function to process 1 128-bit block
/// \tparam F4 function to process 4 128-bit blocks
/// \tparam W word type of the subkey table
template <typename F1, typename F4, typename W>
inline size_t AdvancedProcessBlocks128_4x1_SSE(F1 func1, F4 func4,
    MAYBE_CONST W *subKeys, size_t rounds, const byte *inBlocks,
    const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    CRYPTOPP_ASSERT(subKeys);
    CRYPTOPP_ASSERT(inBlocks);
    CRYPTOPP_ASSERT(outBlocks);
    CRYPTOPP_ASSERT(length >= 16);

    const size_t blockSize = 16;

    size_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : blockSize;
    size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
    size_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : blockSize;

    // Clang and Coverity are generating findings using xorBlocks as a flag.
    const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
    const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);

    if (flags & BT_ReverseDirection)
    {
        inBlocks = PtrAdd(inBlocks, length - blockSize);
        xorBlocks = PtrAdd(xorBlocks, length - blockSize);
        outBlocks = PtrAdd(outBlocks, length - blockSize);
        inIncrement = 0-inIncrement;
        xorIncrement = 0-xorIncrement;
        outIncrement = 0-outIncrement;
    }

    if (flags & BT_AllowParallel)
    {
        while (length >= 4*blockSize)
        {
            __m128i block0, block1, block2, block3;
            if (flags & BT_InBlockIsCounter)
            {
                // Increment of 1 in big-endian compatible with the ctr byte array.
                const __m128i s_one = _mm_set_epi32(1<<24, 0, 0, 0);
                block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                block1 = _mm_add_epi32(block0, s_one);
                block2 = _mm_add_epi32(block1, s_one);
                block3 = _mm_add_epi32(block2, s_one);
                _mm_storeu_si128(M128_CAST(inBlocks), _mm_add_epi32(block3, s_one));
            }
            else
            {
                block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block2 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block3 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func4(block0, block1, block2, block3, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            _mm_storeu_si128(M128_CAST(outBlocks), block0);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block1);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block2);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block3);
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 4*blockSize;
        }
    }

    while (length >= blockSize)
    {
        __m128i block = _mm_loadu_si128(CONST_M128_CAST(inBlocks));

        if (xorInput)
            block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));

        if (flags & BT_InBlockIsCounter)
            const_cast<byte *>(inBlocks)[15]++;

        func1(block, subKeys, static_cast<unsigned int>(rounds));

        if (xorOutput)
            block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));

        _mm_storeu_si128(M128_CAST(outBlocks), block);

        inBlocks = PtrAdd(inBlocks, inIncrement);
        outBlocks = PtrAdd(outBlocks, outIncrement);
        xorBlocks = PtrAdd(xorBlocks, xorIncrement);
        length -= blockSize;
    }

    return length;
}

/// \brief AdvancedProcessBlocks for 1 and 4 blocks
/// \tparam F1 function to process 1 64-bit block
/// \tparam F4 function to process 4 64-bit blocks
/// \tparam W word type of the subkey table
template <typename F1, typename F4, typename W>
inline size_t AdvancedProcessBlocks64_4x1_SSE(F1 func1, F4 func4,
    MAYBE_CONST W *subKeys, size_t rounds, const byte *inBlocks,
    const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    CRYPTOPP_ASSERT(subKeys);
    CRYPTOPP_ASSERT(inBlocks);
    CRYPTOPP_ASSERT(outBlocks);
    CRYPTOPP_ASSERT(length >= 16);

    const size_t blockSize = 8;
    const size_t xmmBlockSize = 16;

    size_t inIncrement = (flags & (BT_InBlockIsCounter | BT_DontIncrementInOutPointers)) ? 0 : xmmBlockSize;
    size_t xorIncrement = (xorBlocks != NULLPTR) ? xmmBlockSize : 0;
    size_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : xmmBlockSize;

    // Clang and Coverity are generating findings using xorBlocks as a flag.
    const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
    const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);

    if (flags & BT_ReverseDirection)
    {
        inBlocks = PtrAdd(inBlocks, length - xmmBlockSize);
        xorBlocks = PtrAdd(xorBlocks, length - xmmBlockSize);
        outBlocks = PtrAdd(outBlocks, length - xmmBlockSize);
        inIncrement = 0 - inIncrement;
        xorIncrement = 0 - xorIncrement;
        outIncrement = 0 - outIncrement;
    }

    if (flags & BT_AllowParallel)
    {
        while (length >= 4*xmmBlockSize)
        {
            __m128i block0, block1, block2, block3;
            if (flags & BT_InBlockIsCounter)
            {
                // Increments of 1 and 2 in big-endian compatible with the ctr byte array.
                const __m128i s_one = _mm_set_epi32(1<<24, 0, 0, 0);
                const __m128i s_two = _mm_set_epi32(2<<24, 0, 2<<24, 0);

                // Load the 8-byte counter, duplicate it into both halves of
                // the XMM word, then increment the halves by {0,1}.
                double temp[2];
                std::memcpy(temp, inBlocks, blockSize);
                block0 = _mm_add_epi32(s_one, _mm_castpd_si128(_mm_loaddup_pd(temp)));

                // After initial increment of {0,1} remaining counters increment by {2,2}.
                block1 = _mm_add_epi32(s_two, block0);
                block2 = _mm_add_epi32(s_two, block1);
                block3 = _mm_add_epi32(s_two, block2);

                // Update the counter in the caller.
                _mm_store_sd(temp, _mm_castsi128_pd(_mm_add_epi64(s_two, block3)));
                std::memcpy(const_cast<byte*>(inBlocks), temp, blockSize);
            }
            else
            {
                block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block2 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block3 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func4(block0, block1, block2, block3, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            _mm_storeu_si128(M128_CAST(outBlocks), block0);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block1);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block2);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block3);
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 4*xmmBlockSize;
        }
    }

    if (length)
    {
        // Adjust to real block size
        if (flags & BT_ReverseDirection)
        {
            inIncrement += inIncrement ? blockSize : 0;
            xorIncrement += xorIncrement ? blockSize : 0;
            outIncrement += outIncrement ? blockSize : 0;
            inBlocks = PtrSub(inBlocks, inIncrement);
            xorBlocks = PtrSub(xorBlocks, xorIncrement);
            outBlocks = PtrSub(outBlocks, outIncrement);
        }
        else
        {
            inIncrement -= inIncrement ? blockSize : 0;
            xorIncrement -= xorIncrement ? blockSize : 0;
            outIncrement -= outIncrement ? blockSize : 0;
        }

        while (length >= blockSize)
        {
            double temp[2];
            std::memcpy(temp, inBlocks, blockSize);
            __m128i block = _mm_castpd_si128(_mm_load_sd(temp));

            if (xorInput)
            {
                std::memcpy(temp, xorBlocks, blockSize);
                block = _mm_xor_si128(block, _mm_castpd_si128(_mm_load_sd(temp)));
            }

            if (flags & BT_InBlockIsCounter)
                const_cast<byte *>(inBlocks)[7]++;

            func1(block, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                std::memcpy(temp, xorBlocks, blockSize);
                block = _mm_xor_si128(block, _mm_castpd_si128(_mm_load_sd(temp)));
            }

            _mm_store_sd(temp, _mm_castsi128_pd(block));
            std::memcpy(outBlocks, temp, blockSize);

            inBlocks = PtrAdd(inBlocks, inIncrement);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            length -= blockSize;
        }
    }

    return length;
}

NAMESPACE_END  // CryptoPP
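
// Editorial note: MAYBE_CONST and MAYBE_UNCONST_CAST in the SSE signatures
// above exist as a workaround for old SunCC, which had trouble with const
// subkey tables in these templates. On SunCC, MAYBE_CONST expands to nothing
// and the cast strips const at the call site; elsewhere MAYBE_CONST is plain
// 'const' and the cast is the identity. A hypothetical caller-side sketch:
//
//   return AdvancedProcessBlocks128_4x1_SSE(MyCipher_Enc_Block,
//       MyCipher_Enc_4_Blocks, MAYBE_UNCONST_CAST(word32*, subKeys),
//       rounds, inBlocks, xorBlocks, outBlocks, length, flags);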

#endif  // CRYPTOPP_SSSE3_AVAILABLE

// ************************** Altivec/Power 4 ************************** //

#if defined(__ALTIVEC__)

NAMESPACE_BEGIN(CryptoPP)

/// \brief AdvancedProcessBlocks for 2 and 6 blocks
/// \tparam F2 function to process 2 64-bit blocks
/// \tparam F6 function to process 6 64-bit blocks
/// \tparam W word type of the subkey table
/// \details AdvancedProcessBlocks64_6x2_ALTIVEC processes 6 and 2 VSX SIMD
///   words at a time. For a single block the template uses func2 with a
///   zero block.
template <typename F2, typename F6, typename W>
inline size_t AdvancedProcessBlocks64_6x2_ALTIVEC(F2 func2, F6 func6,
    const W *subKeys, size_t rounds, const byte *inBlocks,
    const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    CRYPTOPP_ASSERT(subKeys);
    CRYPTOPP_ASSERT(inBlocks);
    CRYPTOPP_ASSERT(outBlocks);
    CRYPTOPP_ASSERT(length >= 16);

#if (CRYPTOPP_LITTLE_ENDIAN)
    enum {LowOffset=8, HighOffset=0};
    const uint32x4_p s_one = {1,0,0,0};
    const uint32x4_p s_two = {2,0,2,0};
#else
    enum {LowOffset=8, HighOffset=0};
    const uint32x4_p s_one = {0,0,0,1};
    const uint32x4_p s_two = {0,2,0,2};
#endif

    const size_t blockSize = 8;
    const size_t vsxBlockSize = 16;
    CRYPTOPP_ALIGN_DATA(16) uint8_t temp[16];

    size_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : vsxBlockSize;
    size_t xorIncrement = (xorBlocks != NULLPTR) ? vsxBlockSize : 0;
    size_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : vsxBlockSize;

    // Clang and Coverity are generating findings using xorBlocks as a flag.
    const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
    const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);

    if (flags & BT_ReverseDirection)
    {
        inBlocks = PtrAdd(inBlocks, length - vsxBlockSize);
        xorBlocks = PtrAdd(xorBlocks, length - vsxBlockSize);
        outBlocks = PtrAdd(outBlocks, length - vsxBlockSize);
        inIncrement = 0-inIncrement;
        xorIncrement = 0-xorIncrement;
        outIncrement = 0-outIncrement;
    }

    if (flags & BT_AllowParallel)
    {
        while (length >= 6*vsxBlockSize)
        {
            uint32x4_p block0, block1, block2, block3, block4, block5;
            if (flags & BT_InBlockIsCounter)
            {
                // There is no easy way to load 8 bytes into a vector, so the
                // 8-byte counter is staged in an aligned 16-byte buffer and
                // duplicated into both halves.
                std::memcpy(temp+LowOffset, inBlocks, 8);
                std::memcpy(temp+HighOffset, inBlocks, 8);
                const uint32x4_p ctr = VecLoadBE(temp);

                // After the dup load we have two counters in the VSX word.
                // Increment the low ctr by 0 and the high ctr by 1.
                block0 = VecAdd(s_one, ctr);

                // After initial increment of {0,1} remaining counters
                // increment by {2,2}.
                block1 = VecAdd(s_two, block0);
                block2 = VecAdd(s_two, block1);
                block3 = VecAdd(s_two, block2);
                block4 = VecAdd(s_two, block3);
                block5 = VecAdd(s_two, block4);

                // Update the counter in the caller. Six vector words hold
                // twelve 64-bit blocks.
                const_cast<byte*>(inBlocks)[7] += 12;
            }
            else
            {
                block0 = VecLoadBE(inBlocks);
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = VecLoadBE(inBlocks);
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block2 = VecLoadBE(inBlocks);
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block3 = VecLoadBE(inBlocks);
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block4 = VecLoadBE(inBlocks);
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block5 = VecLoadBE(inBlocks);
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = VecXor(block0, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = VecXor(block1, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = VecXor(block2, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = VecXor(block3, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block4 = VecXor(block4, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block5 = VecXor(block5, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func6(block0, block1, block2, block3, block4, block5, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = VecXor(block0, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = VecXor(block1, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = VecXor(block2, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = VecXor(block3, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block4 = VecXor(block4, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block5 = VecXor(block5, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            VecStoreBE(block0, outBlocks);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            VecStoreBE(block1, outBlocks);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            VecStoreBE(block2, outBlocks);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            VecStoreBE(block3, outBlocks);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            VecStoreBE(block4, outBlocks);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            VecStoreBE(block5, outBlocks);
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 6*vsxBlockSize;
        }

        while (length >= 2*vsxBlockSize)
        {
            uint32x4_p block0, block1;
            if (flags & BT_InBlockIsCounter)
            {
                // Stage the 8-byte counter into both halves of the buffer.
                std::memcpy(temp+LowOffset, inBlocks, 8);
                std::memcpy(temp+HighOffset, inBlocks, 8);
                const uint32x4_p ctr = VecLoadBE(temp);

                // Increment the low ctr by 0 and the high ctr by 1.
                block0 = VecAdd(s_one, ctr);

                // After initial increment of {0,1} the counters increment by {2,2}.
                block1 = VecAdd(s_two, block0);

                // Update the counter in the caller. Two vector words hold
                // four 64-bit blocks.
                const_cast<byte*>(inBlocks)[7] += 4;
            }
            else
            {
                block0 = VecLoadBE(inBlocks);
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = VecLoadBE(inBlocks);
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = VecXor(block0, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = VecXor(block1, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func2(block0, block1, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = VecXor(block0, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = VecXor(block1, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            VecStoreBE(block0, outBlocks);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            VecStoreBE(block1, outBlocks);
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 2*vsxBlockSize;
        }
    }

    if (length)
    {
        // Adjust to real block size
        if (flags & BT_ReverseDirection)
        {
            inIncrement += inIncrement ? blockSize : 0;
            xorIncrement += xorIncrement ? blockSize : 0;
            outIncrement += outIncrement ? blockSize : 0;
            inBlocks = PtrSub(inBlocks, inIncrement);
            xorBlocks = PtrSub(xorBlocks, xorIncrement);
            outBlocks = PtrSub(outBlocks, outIncrement);
        }
        else
        {
            inIncrement -= inIncrement ? blockSize : 0;
            xorIncrement -= xorIncrement ? blockSize : 0;
            outIncrement -= outIncrement ? blockSize : 0;
        }

        while (length >= blockSize)
        {
            uint32x4_p block, zero = {0,0,0,0};

            std::memcpy(temp+LowOffset, inBlocks, 8);
            std::memcpy(temp+HighOffset, inBlocks, 8);
            block = VecLoadBE(temp);

            if (xorInput)
            {
                std::memcpy(temp+LowOffset, xorBlocks, 8);
                std::memcpy(temp+HighOffset, xorBlocks, 8);
                const uint32x4_p x = VecLoadBE(temp);
                block = VecXor(block, x);
            }

            if (flags & BT_InBlockIsCounter)
                const_cast<byte *>(inBlocks)[7]++;

            func2(block, zero, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                std::memcpy(temp+LowOffset, xorBlocks, 8);
                std::memcpy(temp+HighOffset, xorBlocks, 8);
                const uint32x4_p x = VecLoadBE(temp);
                block = VecXor(block, x);
            }

            VecStoreBE(block, temp);
            std::memcpy(outBlocks, temp+LowOffset, 8);

            inBlocks = PtrAdd(inBlocks, inIncrement);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            length -= blockSize;
        }
    }

    return length;
}
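
// Editorial note on the byte staging above: VecLoadBE always presents the 16
// staged bytes as a big-endian value, which is why the LowOffset/HighOffset
// pair is {8,0} in both the little- and big-endian branches; only the vector
// literals s_one/s_two differ, because the element order of an initializer
// like {1,0,0,0} follows host endianness. Copying the 8-byte counter to both
// temp+LowOffset and temp+HighOffset packs two 64-bit blocks into one vector.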

/// \brief AdvancedProcessBlocks for 1 and 4 blocks
/// \tparam F1 function to process 1 128-bit block
/// \tparam F4 function to process 4 128-bit blocks
/// \tparam W word type of the subkey table
template <typename F1, typename F4, typename W>
inline size_t AdvancedProcessBlocks128_4x1_ALTIVEC(F1 func1, F4 func4,
    const W *subKeys, size_t rounds, const byte *inBlocks,
    const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    CRYPTOPP_ASSERT(subKeys);
    CRYPTOPP_ASSERT(inBlocks);
    CRYPTOPP_ASSERT(outBlocks);
    CRYPTOPP_ASSERT(length >= 16);

#if (CRYPTOPP_LITTLE_ENDIAN)
    const uint32x4_p s_one = {1,0,0,0};
#else
    const uint32x4_p s_one = {0,0,0,1};
#endif

    const size_t blockSize = 16;

    size_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : blockSize;
    size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
    size_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : blockSize;

    // Clang and Coverity are generating findings using xorBlocks as a flag.
    const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
    const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);

    if (flags & BT_ReverseDirection)
    {
        inBlocks = PtrAdd(inBlocks, length - blockSize);
        xorBlocks = PtrAdd(xorBlocks, length - blockSize);
        outBlocks = PtrAdd(outBlocks, length - blockSize);
        inIncrement = 0-inIncrement;
        xorIncrement = 0-xorIncrement;
        outIncrement = 0-outIncrement;
    }

    if (flags & BT_AllowParallel)
    {
        while (length >= 4*blockSize)
        {
            uint32x4_p block0, block1, block2, block3;

            if (flags & BT_InBlockIsCounter)
            {
                block0 = VecLoadBE(inBlocks);
                block1 = VecAdd(block0, s_one);
                block2 = VecAdd(block1, s_one);
                block3 = VecAdd(block2, s_one);
                // Update the counter in the caller. This loop consumes four
                // blocks, so the last counter byte advances by 4; a wrap of
                // that byte is detected and handled by CTR_ModePolicy after
                // this function returns.
                const_cast<byte*>(inBlocks)[15] += 4;
            }
            else
            {
                block0 = VecLoadBE(inBlocks);
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = VecLoadBE(inBlocks);
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block2 = VecLoadBE(inBlocks);
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block3 = VecLoadBE(inBlocks);
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = VecXor(block0, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = VecXor(block1, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = VecXor(block2, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = VecXor(block3, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func4(block0, block1, block2, block3, subKeys, rounds);

            if (xorOutput)
            {
                block0 = VecXor(block0, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = VecXor(block1, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = VecXor(block2, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = VecXor(block3, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            VecStoreBE(block0, outBlocks);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            VecStoreBE(block1, outBlocks);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            VecStoreBE(block2, outBlocks);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            VecStoreBE(block3, outBlocks);
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 4*blockSize;
        }
    }

    while (length >= blockSize)
    {
        uint32x4_p block = VecLoadBE(inBlocks);

        if (xorInput)
            block = VecXor(block, VecLoadBE(xorBlocks));

        if (flags & BT_InBlockIsCounter)
            const_cast<byte *>(inBlocks)[15]++;

        func1(block, subKeys, rounds);

        if (xorOutput)
            block = VecXor(block, VecLoadBE(xorBlocks));

        VecStoreBE(block, outBlocks);

        inBlocks = PtrAdd(inBlocks, inIncrement);
        outBlocks = PtrAdd(outBlocks, outIncrement);
        xorBlocks = PtrAdd(xorBlocks, xorIncrement);
        length -= blockSize;
    }

    return length;
}

/// \brief AdvancedProcessBlocks for 1 and 6 blocks
/// \tparam F1 function to process 1 128-bit block
/// \tparam F6 function to process 6 128-bit blocks
/// \tparam W word type of the subkey table
template <typename F1, typename F6, typename W>
inline size_t AdvancedProcessBlocks128_6x1_ALTIVEC(F1 func1, F6 func6,
    const W *subKeys, size_t rounds, const byte *inBlocks,
    const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    CRYPTOPP_ASSERT(subKeys);
    CRYPTOPP_ASSERT(inBlocks);
    CRYPTOPP_ASSERT(outBlocks);
    CRYPTOPP_ASSERT(length >= 16);

#if (CRYPTOPP_LITTLE_ENDIAN)
    const uint32x4_p s_one = {1,0,0,0};
#else
    const uint32x4_p s_one = {0,0,0,1};
#endif

    const size_t blockSize = 16;

    size_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : blockSize;
    size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
    size_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : blockSize;

    // Clang and Coverity are generating findings using xorBlocks as a flag.
    const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
    const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);

    if (flags & BT_ReverseDirection)
    {
        inBlocks = PtrAdd(inBlocks, length - blockSize);
        xorBlocks = PtrAdd(xorBlocks, length - blockSize);
        outBlocks = PtrAdd(outBlocks, length - blockSize);
        inIncrement = 0-inIncrement;
        xorIncrement = 0-xorIncrement;
        outIncrement = 0-outIncrement;
    }

    if (flags & BT_AllowParallel)
    {
        while (length >= 6*blockSize)
        {
            uint32x4_p block0, block1, block2, block3, block4, block5;

            if (flags & BT_InBlockIsCounter)
            {
                block0 = VecLoadBE(inBlocks);
                block1 = VecAdd(block0, s_one);
                block2 = VecAdd(block1, s_one);
                block3 = VecAdd(block2, s_one);
                block4 = VecAdd(block3, s_one);
                block5 = VecAdd(block4, s_one);

                // Hack due to big-endian loads used by POWER8 (and maybe
                // ARM-BE). CTR_ModePolicy::OperateKeystream detects a wrap of
                // the last counter byte and increments the next-to-last byte
                // after this function returns. A 32-bit element vector add
                // would generate that carry here too, and the byte would be
                // incremented twice. Adding with 8-bit elements drops the
                // carry and leaves the wrap handling to the caller.
                const uint8x16_p temp = VecAdd((uint8x16_p)block5, (uint8x16_p)s_one);
                VecStoreBE(temp, const_cast<byte*>(inBlocks));
            }
            else
            {
                block0 = VecLoadBE(inBlocks);
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = VecLoadBE(inBlocks);
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block2 = VecLoadBE(inBlocks);
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block3 = VecLoadBE(inBlocks);
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block4 = VecLoadBE(inBlocks);
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block5 = VecLoadBE(inBlocks);
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = VecXor(block0, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = VecXor(block1, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = VecXor(block2, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = VecXor(block3, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block4 = VecXor(block4, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block5 = VecXor(block5, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func6(block0, block1, block2, block3, block4, block5, subKeys, rounds);

            if (xorOutput)
            {
                block0 = VecXor(block0, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = VecXor(block1, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = VecXor(block2, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = VecXor(block3, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block4 = VecXor(block4, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block5 = VecXor(block5, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            VecStoreBE(block0, outBlocks);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            VecStoreBE(block1, outBlocks);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            VecStoreBE(block2, outBlocks);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            VecStoreBE(block3, outBlocks);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            VecStoreBE(block4, outBlocks);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            VecStoreBE(block5, outBlocks);
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 6*blockSize;
        }
    }

    while (length >= blockSize)
    {
        uint32x4_p block = VecLoadBE(inBlocks);

        if (xorInput)
            block = VecXor(block, VecLoadBE(xorBlocks));

        if (flags & BT_InBlockIsCounter)
            const_cast<byte *>(inBlocks)[15]++;

        func1(block, subKeys, rounds);

        if (xorOutput)
            block = VecXor(block, VecLoadBE(xorBlocks));

        VecStoreBE(block, outBlocks);

        inBlocks = PtrAdd(inBlocks, inIncrement);
        outBlocks = PtrAdd(outBlocks, outIncrement);
        xorBlocks = PtrAdd(xorBlocks, xorIncrement);
        length -= blockSize;
    }

    return length;
}

NAMESPACE_END  // CryptoPP

#endif  // __ALTIVEC__

#endif  // CRYPTOPP_ADVANCED_SIMD_TEMPLATES