Crypto++  8.2
Free C++ class library of cryptographic schemes
lea_simd.cpp
1 // lea_simd.cpp - written and placed in the public domain by Jeffrey Walton
2 //
3 // This source file uses intrinsics and built-ins to gain access to
4 // SSSE3, ARM NEON and ARMv8a, and Power8 Altivec instructions. A separate
5 // source file is needed because additional CXXFLAGS are required to enable
6 // the appropriate instruction sets in some build configurations.
7 
8 #include "pch.h"
9 #include "config.h"
10 
11 #include "lea.h"
12 #include "misc.h"
13 
14 // Uncomment for benchmarking C++ against SSE or NEON.
15 // Do so in both lea.cpp and lea_simd.cpp.
16 // #undef CRYPTOPP_SSSE3_AVAILABLE
17 // #undef CRYPTOPP_ARM_NEON_AVAILABLE
18 
19 #if (CRYPTOPP_SSSE3_AVAILABLE)
20 # include "adv_simd.h"
21 # include <pmmintrin.h>
22 # include <tmmintrin.h>
23 #endif
24 
25 #if defined(__XOP__)
26 # include <ammintrin.h>
27 #endif
28 
29 #if defined(__AVX512F__)
30 # define CRYPTOPP_AVX512_ROTATE 1
31 # include <immintrin.h>
32 #endif
33 
34 // C1189: error: This header is specific to ARM targets
35 #if (CRYPTOPP_ARM_NEON_AVAILABLE)
36 # include "adv_simd.h"
37 # ifndef _M_ARM64
38 # include <arm_neon.h>
39 # endif
40 #endif
41 
42 #if (CRYPTOPP_ARM_ACLE_AVAILABLE)
43 # include <stdint.h>
44 # include <arm_acle.h>
45 #endif
46 
47 // Do not port this to POWER architecture. Naively we hoped
48 // for a 2x to 3x speedup. The result was a 5x slow down.
49 // The table below shows MiB/s and cpb.
50 //
51 // C++:
52 // <TD>LEA-128(128)/CTR (128-bit key)<TD>C++<TD>207<TD>15.64
53 // <TD>LEA-128(192)/CTR (192-bit key)<TD>C++<TD>186<TD>17.48
54 // <TD>LEA-128(256)/CTR (256-bit key)<TD>C++<TD>124<TD>26.2
55 //
56 // Power8:
57 // <TD>LEA-128(128)/CTR (128-bit key)<TD>Power8<TD>37<TD>88.7
58 // <TD>LEA-128(192)/CTR (192-bit key)<TD>Power8<TD>40<TD>82.1
59 // <TD>LEA-128(256)/CTR (256-bit key)<TD>Power8<TD>28<TD>116.0
60 
61 #undef CRYPTOPP_POWER8_AVAILABLE
62 #if defined(CRYPTOPP_POWER8_AVAILABLE)
63 # include "adv_simd.h"
64 # include "ppc_simd.h"
65 #endif
66 
67 // Squash MS LNK4221 and libtool warnings
68 extern const char LEA_SIMD_FNAME[] = __FILE__;
69 
70 ANONYMOUS_NAMESPACE_BEGIN
71 
72 using CryptoPP::word32;
73 
74 // *************************** ARM NEON ***************************//
75 
76 #if (CRYPTOPP_ARM_NEON_AVAILABLE)
77 
78 inline uint32x4_t Xor(const uint32x4_t& a, const uint32x4_t& b)
79 {
80  return veorq_u32(a, b);
81 }
82 
83 inline uint32x4_t Add(const uint32x4_t& a, const uint32x4_t& b)
84 {
85  return vaddq_u32(a, b);
86 }
87 
88 inline uint32x4_t Sub(const uint32x4_t& a, const uint32x4_t& b)
89 {
90  return vsubq_u32(a, b);
91 }
92 
93 template <unsigned int R>
94 inline uint32x4_t RotateLeft(const uint32x4_t& val)
95 {
96  const uint32x4_t a(vshlq_n_u32(val, R));
97  const uint32x4_t b(vshrq_n_u32(val, 32 - R));
98  return vorrq_u32(a, b);
99 }
100 
101 template <unsigned int R>
102 inline uint32x4_t RotateRight(const uint32x4_t& val)
103 {
104  const uint32x4_t a(vshlq_n_u32(val, 32 - R));
105  const uint32x4_t b(vshrq_n_u32(val, R));
106  return vorrq_u32(a, b);
107 }
108 
109 #if defined(__aarch32__) || defined(__aarch64__)
110 template <>
111 inline uint32x4_t RotateLeft<8>(const uint32x4_t& val)
112 {
113 #if (CRYPTOPP_BIG_ENDIAN)
114  const uint8_t maskb[16] = { 14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3 };
115  const uint8x16_t mask = vld1q_u8(maskb);
116 #else
117  const uint8_t maskb[16] = { 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 };
118  const uint8x16_t mask = vld1q_u8(maskb);
119 #endif
120 
121  return vreinterpretq_u32_u8(
122  vqtbl1q_u8(vreinterpretq_u8_u32(val), mask));
123 }
124 
125 template <>
126 inline uint32x4_t RotateRight<8>(const uint32x4_t& val)
127 {
128 #if (CRYPTOPP_BIG_ENDIAN)
129  const uint8_t maskb[16] = { 12,15,14,13, 8,11,10,9, 4,7,6,5, 0,3,2,1 };
130  const uint8x16_t mask = vld1q_u8(maskb);
131 #else
132  const uint8_t maskb[16] = { 1,2,3,0, 5,6,7,4, 9,10,11,8, 13,14,14,12 };
133  const uint8x16_t mask = vld1q_u8(maskb);
134 #endif
135 
136  return vreinterpretq_u32_u8(
137  vqtbl1q_u8(vreinterpretq_u8_u32(val), mask));
138 }
139 #endif
140 
141 uint32x4_t UnpackLow32(uint32x4_t a, uint32x4_t b)
142 {
143  uint32x2_t a1 = vget_low_u32(a);
144  uint32x2_t b1 = vget_low_u32(b);
145  uint32x2x2_t result = vzip_u32(a1, b1);
146  return vcombine_u32(result.val[0], result.val[1]);
147 }
148 
149 uint32x4_t UnpackHigh32(uint32x4_t a, uint32x4_t b)
150 {
151  uint32x2_t a1 = vget_high_u32(a);
152  uint32x2_t b1 = vget_high_u32(b);
153  uint32x2x2_t result = vzip_u32(a1, b1);
154  return vcombine_u32(result.val[0], result.val[1]);
155 }
156 
157 uint32x4_t UnpackLow64(uint32x4_t a, uint32x4_t b)
158 {
159  uint64x1_t a1 = vget_low_u64((uint64x2_t)a);
160  uint64x1_t b1 = vget_low_u64((uint64x2_t)b);
161  return (uint32x4_t)vcombine_u64(a1, b1);
162 }
163 
164 uint32x4_t UnpackHigh64(uint32x4_t a, uint32x4_t b)
165 {
166  uint64x1_t a1 = vget_high_u64((uint64x2_t)a);
167  uint64x1_t b1 = vget_high_u64((uint64x2_t)b);
168  return (uint32x4_t)vcombine_u64(a1, b1);
169 }
170 
171 template <unsigned int IDX>
172 inline uint32x4_t LoadKey(const word32 rkey[])
173 {
174  return vdupq_n_u32(rkey[IDX]);
175 }
176 
177 template <unsigned int IDX>
178 inline uint32x4_t UnpackNEON(const uint32x4_t& a, const uint32x4_t& b, const uint32x4_t& c, const uint32x4_t& d)
179 {
180  // Should not be instantiated
181  CRYPTOPP_ASSERT(0);;
182  return vmovq_n_u32(0);
183 }
184 
185 template <>
186 inline uint32x4_t UnpackNEON<0>(const uint32x4_t& a, const uint32x4_t& b, const uint32x4_t& c, const uint32x4_t& d)
187 {
188  const uint32x4_t r1 = UnpackLow32(a, b);
189  const uint32x4_t r2 = UnpackLow32(c, d);
190  return UnpackLow64(r1, r2);
191 }
192 
193 template <>
194 inline uint32x4_t UnpackNEON<1>(const uint32x4_t& a, const uint32x4_t& b, const uint32x4_t& c, const uint32x4_t& d)
195 {
196  const uint32x4_t r1 = UnpackLow32(a, b);
197  const uint32x4_t r2 = UnpackLow32(c, d);
198  return UnpackHigh64(r1, r2);
199 }
200 
201 template <>
202 inline uint32x4_t UnpackNEON<2>(const uint32x4_t& a, const uint32x4_t& b, const uint32x4_t& c, const uint32x4_t& d)
203 {
204  const uint32x4_t r1 = UnpackHigh32(a, b);
205  const uint32x4_t r2 = UnpackHigh32(c, d);
206  return UnpackLow64(r1, r2);
207 }
208 
209 template <>
210 inline uint32x4_t UnpackNEON<3>(const uint32x4_t& a, const uint32x4_t& b, const uint32x4_t& c, const uint32x4_t& d)
211 {
212  const uint32x4_t r1 = UnpackHigh32(a, b);
213  const uint32x4_t r2 = UnpackHigh32(c, d);
214  return UnpackHigh64(r1, r2);
215 }
216 
217 template <unsigned int IDX>
218 inline uint32x4_t UnpackNEON(const uint32x4_t& v)
219 {
220  // Should not be instantiated
221  CRYPTOPP_ASSERT(0);;
222  return vmovq_n_u32(0);
223 }
224 
225 template <>
226 inline uint32x4_t UnpackNEON<0>(const uint32x4_t& v)
227 {
228  // Splat to all lanes
229  return vdupq_n_u32(vgetq_lane_u32(v, 0));
230 }
231 
232 template <>
233 inline uint32x4_t UnpackNEON<1>(const uint32x4_t& v)
234 {
235  // Splat to all lanes
236  return vdupq_n_u32(vgetq_lane_u32(v, 1));
237 }
238 
239 template <>
240 inline uint32x4_t UnpackNEON<2>(const uint32x4_t& v)
241 {
242  // Splat to all lanes
243  return vdupq_n_u32(vgetq_lane_u32(v, 2));
244 }
245 
246 template <>
247 inline uint32x4_t UnpackNEON<3>(const uint32x4_t& v)
248 {
249  // Splat to all lanes
250  return vdupq_n_u32(vgetq_lane_u32(v, 3));
251 }
252 
253 template <unsigned int IDX>
254 inline uint32x4_t RepackNEON(const uint32x4_t& a, const uint32x4_t& b, const uint32x4_t& c, const uint32x4_t& d)
255 {
256  return UnpackNEON<IDX>(a, b, c, d);
257 }
258 
259 template <unsigned int IDX>
260 inline uint32x4_t RepackNEON(const uint32x4_t& v)
261 {
262  return UnpackNEON<IDX>(v);
263 }
264 
265 #endif // CRYPTOPP_ARM_NEON_AVAILABLE
266 
267 // *************************** IA-32 ***************************//
268 
269 #if (CRYPTOPP_SSSE3_AVAILABLE)
270 
271 inline __m128i Xor(const __m128i& a, const __m128i& b)
272 {
273  return _mm_xor_si128(a, b);
274 }
275 
276 inline __m128i Add(const __m128i& a, const __m128i& b)
277 {
278  return _mm_add_epi32(a, b);
279 }
280 
281 inline __m128i Sub(const __m128i& a, const __m128i& b)
282 {
283  return _mm_sub_epi32(a, b);
284 }
285 
286 template <unsigned int R>
287 inline __m128i RotateLeft(const __m128i& val)
288 {
289 #if defined(__XOP__)
290  return _mm_roti_epi32(val, R);
291 #else
292  return _mm_or_si128(
293  _mm_slli_epi32(val, R), _mm_srli_epi32(val, 32-R));
294 #endif
295 }
296 
297 template <unsigned int R>
298 inline __m128i RotateRight(const __m128i& val)
299 {
300 #if defined(__XOP__)
301  return _mm_roti_epi32(val, 32-R);
302 #else
303  return _mm_or_si128(
304  _mm_slli_epi32(val, 32-R), _mm_srli_epi32(val, R));
305 #endif
306 }
307 
308 // Faster than two Shifts and an Or.
309 template <>
310 inline __m128i RotateLeft<8>(const __m128i& val)
311 {
312 #if defined(__XOP__)
313  return _mm_roti_epi32(val, 8);
314 #else
315  const __m128i mask = _mm_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3);
316  return _mm_shuffle_epi8(val, mask);
317 #endif
318 }
319 
320 // Faster than two Shifts and an Or.
321 template <>
322 inline __m128i RotateRight<8>(const __m128i& val)
323 {
324 #if defined(__XOP__)
325  return _mm_roti_epi32(val, 32-8);
326 #else
327  const __m128i mask = _mm_set_epi8(12,15,14,13, 8,11,10,9, 4,7,6,5, 0,3,2,1);
328  return _mm_shuffle_epi8(val, mask);
329 #endif
330 }
331 
332 template <unsigned int IDX>
333 inline __m128i LoadKey(const word32 rkey[])
334 {
335  float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk));
336  return _mm_castps_si128(_mm_load_ps1(&rk));
337 }
338 
339 template <unsigned int IDX>
340 inline __m128i UnpackXMM(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
341 {
342  // Should not be instantiated
343  CRYPTOPP_UNUSED(a); CRYPTOPP_UNUSED(b);
344  CRYPTOPP_UNUSED(c); CRYPTOPP_UNUSED(d);
345  CRYPTOPP_ASSERT(0);
346  return _mm_setzero_si128();
347 }
348 
349 template <>
350 inline __m128i UnpackXMM<0>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
351 {
352  // LEA is little-endian oriented, so there is no need for a separate shuffle.
353  const __m128i r1 = _mm_unpacklo_epi32(a, b);
354  const __m128i r2 = _mm_unpacklo_epi32(c, d);
355  return _mm_unpacklo_epi64(r1, r2);
356 }
357 
358 template <>
359 inline __m128i UnpackXMM<1>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
360 {
361  // LEA is little-endian oriented, so there is no need for a separate shuffle.
362  const __m128i r1 = _mm_unpacklo_epi32(a, b);
363  const __m128i r2 = _mm_unpacklo_epi32(c, d);
364  return _mm_unpackhi_epi64(r1, r2);
365 }
366 
367 template <>
368 inline __m128i UnpackXMM<2>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
369 {
370  // LEA is little-endian oriented, so there is no need for a separate shuffle.
371  const __m128i r1 = _mm_unpackhi_epi32(a, b);
372  const __m128i r2 = _mm_unpackhi_epi32(c, d);
373  return _mm_unpacklo_epi64(r1, r2);
374 }
375 
376 template <>
377 inline __m128i UnpackXMM<3>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
378 {
379  // LEA is little-endian oriented, so there is no need for a separate shuffle.
380  const __m128i r1 = _mm_unpackhi_epi32(a, b);
381  const __m128i r2 = _mm_unpackhi_epi32(c, d);
382  return _mm_unpackhi_epi64(r1, r2);
383 }
384 
385 template <unsigned int IDX>
386 inline __m128i UnpackXMM(const __m128i& v)
387 {
388  // Should not be instantiated
389  CRYPTOPP_UNUSED(v); CRYPTOPP_ASSERT(0);
390  return _mm_setzero_si128();
391 }
392 
393 template <>
394 inline __m128i UnpackXMM<0>(const __m128i& v)
395 {
396  // Splat to all lanes
397  return _mm_shuffle_epi8(v, _mm_set_epi8(3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0));
398 }
399 
400 template <>
401 inline __m128i UnpackXMM<1>(const __m128i& v)
402 {
403  // Splat to all lanes
404  return _mm_shuffle_epi8(v, _mm_set_epi8(7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4));
405 }
406 
407 template <>
408 inline __m128i UnpackXMM<2>(const __m128i& v)
409 {
410  // Splat to all lanes
411  return _mm_shuffle_epi8(v, _mm_set_epi8(11,10,9,8, 11,10,9,8, 11,10,9,8, 11,10,9,8));
412 }
413 
414 template <>
415 inline __m128i UnpackXMM<3>(const __m128i& v)
416 {
417  // Splat to all lanes
418  return _mm_shuffle_epi8(v, _mm_set_epi8(15,14,13,12, 15,14,13,12, 15,14,13,12, 15,14,13,12));
419 }
420 
421 template <unsigned int IDX>
422 inline __m128i RepackXMM(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
423 {
424  return UnpackXMM<IDX>(a, b, c, d);
425 }
426 
427 template <unsigned int IDX>
428 inline __m128i RepackXMM(const __m128i& v)
429 {
430  return UnpackXMM<IDX>(v);
431 }
432 
433 #endif // CRYPTOPP_SSSE3_AVAILABLE
434 
435 // *************************** Power8 ***************************//
436 
437 #if (CRYPTOPP_POWER8_AVAILABLE)
438 
442 
443 inline uint32x4_p Xor(const uint32x4_p& a, const uint32x4_p& b)
444 {
445  return VecXor(a, b);
446 }
447 
448 inline uint32x4_p Add(const uint32x4_p& a, const uint32x4_p& b)
449 {
450  return VecAdd(a, b);
451 }
452 
453 inline uint32x4_p Sub(const uint32x4_p& a, const uint32x4_p& b)
454 {
455  return VecSub(a, b);
456 }
457 
458 template <unsigned int R>
459 inline uint32x4_p RotateLeft(const uint32x4_p& val)
460 {
461  const uint32x4_p m = {R, R, R, R};
462  return vec_rl(val, m);
463 }
464 
465 template <unsigned int R>
466 inline uint32x4_p RotateRight(const uint32x4_p& val)
467 {
468  const uint32x4_p m = {32-R, 32-R, 32-R, 32-R};
469  return vec_rl(val, m);
470 }
471 
472 template <unsigned int IDX>
473 inline uint32x4_p LoadKey(const word32 rkey[])
474 {
475  return vec_splats(rkey[IDX]);
476 }
477 
478 template <unsigned int IDX>
479 inline uint32x4_p UnpackSIMD(const uint32x4_p& a, const uint32x4_p& b, const uint32x4_p& c, const uint32x4_p& d)
480 {
481  // Should not be instantiated
482  CRYPTOPP_UNUSED(a); CRYPTOPP_UNUSED(b);
483  CRYPTOPP_UNUSED(c); CRYPTOPP_UNUSED(d);
484  CRYPTOPP_ASSERT(0);
485  return VecXor(a, a);
486 }
487 
488 template <>
489 inline uint32x4_p UnpackSIMD<0>(const uint32x4_p& a, const uint32x4_p& b, const uint32x4_p& c, const uint32x4_p& d)
490 {
491  const uint64x2_p r1 = (uint64x2_p)vec_mergel(a, b);
492  const uint64x2_p r2 = (uint64x2_p)vec_mergel(c, d);
493  return (uint32x4_p)vec_mergel(r1, r2);
494 }
495 
496 template <>
497 inline uint32x4_p UnpackSIMD<1>(const uint32x4_p& a, const uint32x4_p& b, const uint32x4_p& c, const uint32x4_p& d)
498 {
499  const uint64x2_p r1 = (uint64x2_p)vec_mergel(a, b);
500  const uint64x2_p r2 = (uint64x2_p)vec_mergel(c, d);
501  return (uint32x4_p)vec_mergeh(r1, r2);
502 }
503 
504 template <>
505 inline uint32x4_p UnpackSIMD<2>(const uint32x4_p& a, const uint32x4_p& b, const uint32x4_p& c, const uint32x4_p& d)
506 {
507  const uint64x2_p r1 = (uint64x2_p)vec_mergeh(a, b);
508  const uint64x2_p r2 = (uint64x2_p)vec_mergeh(c, d);
509  return (uint32x4_p)vec_mergel(r1, r2);
510 }
511 
512 template <>
513 inline uint32x4_p UnpackSIMD<3>(const uint32x4_p& a, const uint32x4_p& b, const uint32x4_p& c, const uint32x4_p& d)
514 {
515  const uint64x2_p r1 = (uint64x2_p)vec_mergeh(a, b);
516  const uint64x2_p r2 = (uint64x2_p)vec_mergeh(c, d);
517  return (uint32x4_p)vec_mergeh(r1, r2);
518 }
519 
520 template <unsigned int IDX>
521 inline uint32x4_p UnpackSIMD(const uint32x4_p& v)
522 {
523  // Should not be instantiated
524  CRYPTOPP_ASSERT(0);
525  return VecXor(v, v);
526 }
527 
528 template <>
529 inline uint32x4_p UnpackSIMD<0>(const uint32x4_p& v)
530 {
531  // Splat to all lanes
532  const uint8x16_p m = {3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0};
533  return (uint32x4_p)VecPermute(v, v, m);
534 }
535 
536 template <>
537 inline uint32x4_p UnpackSIMD<1>(const uint32x4_p& v)
538 {
539  // Splat to all lanes
540  const uint8x16_p m = {7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4};
541  return (uint32x4_p)VecPermute(v, v, m);
542 }
543 
544 template <>
545 inline uint32x4_p UnpackSIMD<2>(const uint32x4_p& v)
546 {
547  // Splat to all lanes
548  const uint8x16_p m = {11,10,9,8, 11,10,9,8, 11,10,9,8, 11,10,9,8};
549  return (uint32x4_p)VecPermute(v, v, m);
550 }
551 
552 template <>
553 inline uint32x4_p UnpackSIMD<3>(const uint32x4_p& v)
554 {
555  // Splat to all lanes
556  const uint8x16_p m = {15,14,13,12, 15,14,13,12, 15,14,13,12, 15,14,13,12};
557  return (uint32x4_p)VecPermute(v, v, m);
558 }
559 
560 template <unsigned int IDX>
561 inline uint32x4_p RepackSIMD(const uint32x4_p& a, const uint32x4_p& b, const uint32x4_p& c, const uint32x4_p& d)
562 {
563  return UnpackSIMD<IDX>(a, b, c, d);
564 }
565 
566 template <unsigned int IDX>
567 inline uint32x4_p RepackSIMD(const uint32x4_p& v)
568 {
569  return UnpackSIMD<IDX>(v);
570 }
571 
572 #endif // CRYPTOPP_POWER8_AVAILABLE
573 
574 // *************************** LEA Encryption ***************************//
575 
576 #if (CRYPTOPP_ARM_NEON_AVAILABLE || CRYPTOPP_SSSE3_AVAILABLE)
577 
578 template <class W>
579 inline void LEA_Encryption(W temp[4], const word32 *subkeys, unsigned int rounds)
580 {
581  temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<4>(subkeys)), Xor(temp[3], LoadKey<5>(subkeys))));
582  temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<2>(subkeys)), Xor(temp[2], LoadKey<3>(subkeys))));
583  temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<0>(subkeys)), Xor(temp[1], LoadKey<1>(subkeys))));
584  temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<10>(subkeys)), Xor(temp[0], LoadKey<11>(subkeys))));
585  temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<8>(subkeys)), Xor(temp[3], LoadKey<9>(subkeys))));
586  temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<6>(subkeys)), Xor(temp[2], LoadKey<7>(subkeys))));
587  temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<16>(subkeys)), Xor(temp[1], LoadKey<17>(subkeys))));
588  temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<14>(subkeys)), Xor(temp[0], LoadKey<15>(subkeys))));
589  temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<12>(subkeys)), Xor(temp[3], LoadKey<13>(subkeys))));
590  temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<22>(subkeys)), Xor(temp[2], LoadKey<23>(subkeys))));
591  temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<20>(subkeys)), Xor(temp[1], LoadKey<21>(subkeys))));
592  temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<18>(subkeys)), Xor(temp[0], LoadKey<19>(subkeys))));
593 
594  temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<28>(subkeys)), Xor(temp[3], LoadKey<29>(subkeys))));
595  temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<26>(subkeys)), Xor(temp[2], LoadKey<27>(subkeys))));
596  temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<24>(subkeys)), Xor(temp[1], LoadKey<25>(subkeys))));
597  temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<34>(subkeys)), Xor(temp[0], LoadKey<35>(subkeys))));
598  temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<32>(subkeys)), Xor(temp[3], LoadKey<33>(subkeys))));
599  temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<30>(subkeys)), Xor(temp[2], LoadKey<31>(subkeys))));
600  temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<40>(subkeys)), Xor(temp[1], LoadKey<41>(subkeys))));
601  temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<38>(subkeys)), Xor(temp[0], LoadKey<39>(subkeys))));
602  temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<36>(subkeys)), Xor(temp[3], LoadKey<37>(subkeys))));
603  temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<46>(subkeys)), Xor(temp[2], LoadKey<47>(subkeys))));
604  temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<44>(subkeys)), Xor(temp[1], LoadKey<45>(subkeys))));
605  temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<42>(subkeys)), Xor(temp[0], LoadKey<43>(subkeys))));
606 
607  temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<52>(subkeys)), Xor(temp[3], LoadKey<53>(subkeys))));
608  temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<50>(subkeys)), Xor(temp[2], LoadKey<51>(subkeys))));
609  temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<48>(subkeys)), Xor(temp[1], LoadKey<49>(subkeys))));
610  temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<58>(subkeys)), Xor(temp[0], LoadKey<59>(subkeys))));
611  temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<56>(subkeys)), Xor(temp[3], LoadKey<57>(subkeys))));
612  temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<54>(subkeys)), Xor(temp[2], LoadKey<55>(subkeys))));
613  temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<64>(subkeys)), Xor(temp[1], LoadKey<65>(subkeys))));
614  temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<62>(subkeys)), Xor(temp[0], LoadKey<63>(subkeys))));
615  temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<60>(subkeys)), Xor(temp[3], LoadKey<61>(subkeys))));
616  temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<70>(subkeys)), Xor(temp[2], LoadKey<71>(subkeys))));
617  temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<68>(subkeys)), Xor(temp[1], LoadKey<69>(subkeys))));
618  temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<66>(subkeys)), Xor(temp[0], LoadKey<67>(subkeys))));
619 
620  temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<76>(subkeys)), Xor(temp[3], LoadKey<77>(subkeys))));
621  temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<74>(subkeys)), Xor(temp[2], LoadKey<75>(subkeys))));
622  temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<72>(subkeys)), Xor(temp[1], LoadKey<73>(subkeys))));
623  temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<82>(subkeys)), Xor(temp[0], LoadKey<83>(subkeys))));
624  temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<80>(subkeys)), Xor(temp[3], LoadKey<81>(subkeys))));
625  temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<78>(subkeys)), Xor(temp[2], LoadKey<79>(subkeys))));
626  temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<88>(subkeys)), Xor(temp[1], LoadKey<89>(subkeys))));
627  temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<86>(subkeys)), Xor(temp[0], LoadKey<87>(subkeys))));
628  temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<84>(subkeys)), Xor(temp[3], LoadKey<85>(subkeys))));
629  temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<94>(subkeys)), Xor(temp[2], LoadKey<95>(subkeys))));
630  temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<92>(subkeys)), Xor(temp[1], LoadKey<93>(subkeys))));
631  temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<90>(subkeys)), Xor(temp[0], LoadKey<91>(subkeys))));
632 
633  temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<100>(subkeys)), Xor(temp[3], LoadKey<101>(subkeys))));
634  temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<98>(subkeys)), Xor(temp[2], LoadKey<99>(subkeys))));
635  temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<96>(subkeys)), Xor(temp[1], LoadKey<97>(subkeys))));
636  temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<106>(subkeys)), Xor(temp[0], LoadKey<107>(subkeys))));
637  temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<104>(subkeys)), Xor(temp[3], LoadKey<105>(subkeys))));
638  temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<102>(subkeys)), Xor(temp[2], LoadKey<103>(subkeys))));
639  temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<112>(subkeys)), Xor(temp[1], LoadKey<113>(subkeys))));
640  temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<110>(subkeys)), Xor(temp[0], LoadKey<111>(subkeys))));
641  temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<108>(subkeys)), Xor(temp[3], LoadKey<109>(subkeys))));
642  temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<118>(subkeys)), Xor(temp[2], LoadKey<119>(subkeys))));
643  temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<116>(subkeys)), Xor(temp[1], LoadKey<117>(subkeys))));
644  temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<114>(subkeys)), Xor(temp[0], LoadKey<115>(subkeys))));
645 
646  temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<124>(subkeys)), Xor(temp[3], LoadKey<125>(subkeys))));
647  temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<122>(subkeys)), Xor(temp[2], LoadKey<123>(subkeys))));
648  temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<120>(subkeys)), Xor(temp[1], LoadKey<121>(subkeys))));
649  temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<130>(subkeys)), Xor(temp[0], LoadKey<131>(subkeys))));
650  temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<128>(subkeys)), Xor(temp[3], LoadKey<129>(subkeys))));
651  temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<126>(subkeys)), Xor(temp[2], LoadKey<127>(subkeys))));
652  temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<136>(subkeys)), Xor(temp[1], LoadKey<137>(subkeys))));
653  temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<134>(subkeys)), Xor(temp[0], LoadKey<135>(subkeys))));
654  temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<132>(subkeys)), Xor(temp[3], LoadKey<133>(subkeys))));
655  temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<142>(subkeys)), Xor(temp[2], LoadKey<143>(subkeys))));
656  temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<140>(subkeys)), Xor(temp[1], LoadKey<141>(subkeys))));
657  temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<138>(subkeys)), Xor(temp[0], LoadKey<139>(subkeys))));
658 
659  if(rounds > 24)
660  {
661  temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<148>(subkeys)), Xor(temp[3], LoadKey<149>(subkeys))));
662  temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<146>(subkeys)), Xor(temp[2], LoadKey<147>(subkeys))));
663  temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<144>(subkeys)), Xor(temp[1], LoadKey<145>(subkeys))));
664  temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<154>(subkeys)), Xor(temp[0], LoadKey<155>(subkeys))));
665  temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<152>(subkeys)), Xor(temp[3], LoadKey<153>(subkeys))));
666  temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<150>(subkeys)), Xor(temp[2], LoadKey<151>(subkeys))));
667  temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<160>(subkeys)), Xor(temp[1], LoadKey<161>(subkeys))));
668  temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<158>(subkeys)), Xor(temp[0], LoadKey<159>(subkeys))));
669  temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<156>(subkeys)), Xor(temp[3], LoadKey<157>(subkeys))));
670  temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<166>(subkeys)), Xor(temp[2], LoadKey<167>(subkeys))));
671  temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<164>(subkeys)), Xor(temp[1], LoadKey<165>(subkeys))));
672  temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<162>(subkeys)), Xor(temp[0], LoadKey<163>(subkeys))));
673  }
674 
675  if(rounds > 28)
676  {
677  temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<172>(subkeys)), Xor(temp[3], LoadKey<173>(subkeys))));
678  temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<170>(subkeys)), Xor(temp[2], LoadKey<171>(subkeys))));
679  temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<168>(subkeys)), Xor(temp[1], LoadKey<169>(subkeys))));
680  temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<178>(subkeys)), Xor(temp[0], LoadKey<179>(subkeys))));
681  temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<176>(subkeys)), Xor(temp[3], LoadKey<177>(subkeys))));
682  temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<174>(subkeys)), Xor(temp[2], LoadKey<175>(subkeys))));
683  temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<184>(subkeys)), Xor(temp[1], LoadKey<185>(subkeys))));
684  temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<182>(subkeys)), Xor(temp[0], LoadKey<183>(subkeys))));
685  temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<180>(subkeys)), Xor(temp[3], LoadKey<181>(subkeys))));
686  temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<190>(subkeys)), Xor(temp[2], LoadKey<191>(subkeys))));
687  temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<188>(subkeys)), Xor(temp[1], LoadKey<189>(subkeys))));
688  temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<186>(subkeys)), Xor(temp[0], LoadKey<187>(subkeys))));
689  }
690 }
691 
692 // *************************** LEA Decryption ***************************//
693 
694 template <class W>
695 inline void LEA_Decryption(W temp[4], const word32 *subkeys, unsigned int rounds)
696 {
697  if(rounds > 28)
698  {
699  temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<186>(subkeys))), LoadKey<187>(subkeys));
700  temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<188>(subkeys))), LoadKey<189>(subkeys));
701  temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<190>(subkeys))), LoadKey<191>(subkeys));
702  temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<180>(subkeys))), LoadKey<181>(subkeys));
703  temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<182>(subkeys))), LoadKey<183>(subkeys));
704  temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<184>(subkeys))), LoadKey<185>(subkeys));
705  temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<174>(subkeys))), LoadKey<175>(subkeys));
706  temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<176>(subkeys))), LoadKey<177>(subkeys));
707  temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<178>(subkeys))), LoadKey<179>(subkeys));
708  temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<168>(subkeys))), LoadKey<169>(subkeys));
709  temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<170>(subkeys))), LoadKey<171>(subkeys));
710  temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<172>(subkeys))), LoadKey<173>(subkeys));
711  }
712 
713  if(rounds > 24)
714  {
715  temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<162>(subkeys))), LoadKey<163>(subkeys));
716  temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<164>(subkeys))), LoadKey<165>(subkeys));
717  temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<166>(subkeys))), LoadKey<167>(subkeys));
718  temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<156>(subkeys))), LoadKey<157>(subkeys));
719  temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<158>(subkeys))), LoadKey<159>(subkeys));
720  temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<160>(subkeys))), LoadKey<161>(subkeys));
721  temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<150>(subkeys))), LoadKey<151>(subkeys));
722  temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<152>(subkeys))), LoadKey<153>(subkeys));
723  temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<154>(subkeys))), LoadKey<155>(subkeys));
724  temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<144>(subkeys))), LoadKey<145>(subkeys));
725  temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<146>(subkeys))), LoadKey<147>(subkeys));
726  temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<148>(subkeys))), LoadKey<149>(subkeys));
727  }
728 
729  temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<138>(subkeys))), LoadKey<139>(subkeys));
730  temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<140>(subkeys))), LoadKey<141>(subkeys));
731  temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<142>(subkeys))), LoadKey<143>(subkeys));
732  temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<132>(subkeys))), LoadKey<133>(subkeys));
733  temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<134>(subkeys))), LoadKey<135>(subkeys));
734  temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<136>(subkeys))), LoadKey<137>(subkeys));
735  temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<126>(subkeys))), LoadKey<127>(subkeys));
736  temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<128>(subkeys))), LoadKey<129>(subkeys));
737  temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<130>(subkeys))), LoadKey<131>(subkeys));
738  temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<120>(subkeys))), LoadKey<121>(subkeys));
739  temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<122>(subkeys))), LoadKey<123>(subkeys));
740  temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<124>(subkeys))), LoadKey<125>(subkeys));
741 
742  temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<114>(subkeys))), LoadKey<115>(subkeys));
743  temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<116>(subkeys))), LoadKey<117>(subkeys));
744  temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<118>(subkeys))), LoadKey<119>(subkeys));
745  temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<108>(subkeys))), LoadKey<109>(subkeys));
746  temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<110>(subkeys))), LoadKey<111>(subkeys));
747  temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<112>(subkeys))), LoadKey<113>(subkeys));
748  temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<102>(subkeys))), LoadKey<103>(subkeys));
749  temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<104>(subkeys))), LoadKey<105>(subkeys));
750  temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<106>(subkeys))), LoadKey<107>(subkeys));
751  temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<96>(subkeys))), LoadKey<97>(subkeys));
752  temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<98>(subkeys))), LoadKey<99>(subkeys));
753  temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<100>(subkeys))), LoadKey<101>(subkeys));
754 
755  temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<90>(subkeys))), LoadKey<91>(subkeys));
756  temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<92>(subkeys))), LoadKey<93>(subkeys));
757  temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<94>(subkeys))), LoadKey<95>(subkeys));
758  temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<84>(subkeys))), LoadKey<85>(subkeys));
759  temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<86>(subkeys))), LoadKey<87>(subkeys));
760  temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<88>(subkeys))), LoadKey<89>(subkeys));
761  temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<78>(subkeys))), LoadKey<79>(subkeys));
762  temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<80>(subkeys))), LoadKey<81>(subkeys));
763  temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<82>(subkeys))), LoadKey<83>(subkeys));
764  temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<72>(subkeys))), LoadKey<73>(subkeys));
765  temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<74>(subkeys))), LoadKey<75>(subkeys));
766  temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<76>(subkeys))), LoadKey<77>(subkeys));
767 
768  temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<66>(subkeys))), LoadKey<67>(subkeys));
769  temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<68>(subkeys))), LoadKey<69>(subkeys));
770  temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<70>(subkeys))), LoadKey<71>(subkeys));
771  temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<60>(subkeys))), LoadKey<61>(subkeys));
772  temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<62>(subkeys))), LoadKey<63>(subkeys));
773  temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<64>(subkeys))), LoadKey<65>(subkeys));
774  temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<54>(subkeys))), LoadKey<55>(subkeys));
775  temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<56>(subkeys))), LoadKey<57>(subkeys));
776  temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<58>(subkeys))), LoadKey<59>(subkeys));
777  temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<48>(subkeys))), LoadKey<49>(subkeys));
778  temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<50>(subkeys))), LoadKey<51>(subkeys));
779  temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<52>(subkeys))), LoadKey<53>(subkeys));
780 
781  temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<42>(subkeys))), LoadKey<43>(subkeys));
782  temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<44>(subkeys))), LoadKey<45>(subkeys));
783  temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<46>(subkeys))), LoadKey<47>(subkeys));
784  temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<36>(subkeys))), LoadKey<37>(subkeys));
785  temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<38>(subkeys))), LoadKey<39>(subkeys));
786  temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<40>(subkeys))), LoadKey<41>(subkeys));
787  temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<30>(subkeys))), LoadKey<31>(subkeys));
788  temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<32>(subkeys))), LoadKey<33>(subkeys));
789  temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<34>(subkeys))), LoadKey<35>(subkeys));
790  temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<24>(subkeys))), LoadKey<25>(subkeys));
791  temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<26>(subkeys))), LoadKey<27>(subkeys));
792  temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<28>(subkeys))), LoadKey<29>(subkeys));
793 
794  temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<18>(subkeys))), LoadKey<19>(subkeys));
795  temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<20>(subkeys))), LoadKey<21>(subkeys));
796  temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<22>(subkeys))), LoadKey<23>(subkeys));
797  temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<12>(subkeys))), LoadKey<13>(subkeys));
798  temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<14>(subkeys))), LoadKey<15>(subkeys));
799  temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<16>(subkeys))), LoadKey<17>(subkeys));
800  temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<6>(subkeys))), LoadKey<7>(subkeys));
801  temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<8>(subkeys))), LoadKey<9>(subkeys));
802  temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<10>(subkeys))), LoadKey<11>(subkeys));
803  temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<0>(subkeys))), LoadKey<1>(subkeys));
804  temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<2>(subkeys))), LoadKey<3>(subkeys));
805  temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<4>(subkeys))), LoadKey<5>(subkeys));
806 }
807 
808 #endif // LEA Encryption and Decryption
809 
810 // *************************** ARM NEON ***************************//
811 
812 #if (CRYPTOPP_ARM_NEON_AVAILABLE)
813 
814 inline void LEA_Enc_Block(uint32x4_t &block0,
815  const word32 *subkeys, unsigned int rounds)
816 {
817  uint32x4_t temp[4];
818  temp[0] = UnpackNEON<0>(block0);
819  temp[1] = UnpackNEON<1>(block0);
820  temp[2] = UnpackNEON<2>(block0);
821  temp[3] = UnpackNEON<3>(block0);
822 
823  LEA_Encryption(temp, subkeys, rounds);
824 
825  block0 = RepackNEON<0>(temp[0], temp[1], temp[2], temp[3]);
826 }
827 
828 inline void LEA_Dec_Block(uint32x4_t &block0,
829  const word32 *subkeys, unsigned int rounds)
830 {
831  uint32x4_t temp[4];
832  temp[0] = UnpackNEON<0>(block0);
833  temp[1] = UnpackNEON<1>(block0);
834  temp[2] = UnpackNEON<2>(block0);
835  temp[3] = UnpackNEON<3>(block0);
836 
837  LEA_Decryption(temp, subkeys, rounds);
838 
839  block0 = RepackNEON<0>(temp[0], temp[1], temp[2], temp[3]);
840 }
841 
842 inline void LEA_Enc_4_Blocks(uint32x4_t &block0, uint32x4_t &block1,
843  uint32x4_t &block2, uint32x4_t &block3, const word32 *subkeys, unsigned int rounds)
844 {
845  uint32x4_t temp[4];
846  temp[0] = UnpackNEON<0>(block0, block1, block2, block3);
847  temp[1] = UnpackNEON<1>(block0, block1, block2, block3);
848  temp[2] = UnpackNEON<2>(block0, block1, block2, block3);
849  temp[3] = UnpackNEON<3>(block0, block1, block2, block3);
850 
851  LEA_Encryption(temp, subkeys, rounds);
852 
853  block0 = RepackNEON<0>(temp[0], temp[1], temp[2], temp[3]);
854  block1 = RepackNEON<1>(temp[0], temp[1], temp[2], temp[3]);
855  block2 = RepackNEON<2>(temp[0], temp[1], temp[2], temp[3]);
856  block3 = RepackNEON<3>(temp[0], temp[1], temp[2], temp[3]);
857 }
858 
859 inline void LEA_Dec_4_Blocks(uint32x4_t &block0, uint32x4_t &block1,
860  uint32x4_t &block2, uint32x4_t &block3, const word32 *subkeys, unsigned int rounds)
861 {
862  uint32x4_t temp[4];
863  temp[0] = UnpackNEON<0>(block0, block1, block2, block3);
864  temp[1] = UnpackNEON<1>(block0, block1, block2, block3);
865  temp[2] = UnpackNEON<2>(block0, block1, block2, block3);
866  temp[3] = UnpackNEON<3>(block0, block1, block2, block3);
867 
868  LEA_Decryption(temp, subkeys, rounds);
869 
870  block0 = RepackNEON<0>(temp[0], temp[1], temp[2], temp[3]);
871  block1 = RepackNEON<1>(temp[0], temp[1], temp[2], temp[3]);
872  block2 = RepackNEON<2>(temp[0], temp[1], temp[2], temp[3]);
873  block3 = RepackNEON<3>(temp[0], temp[1], temp[2], temp[3]);
874 }
875 
876 #endif // CRYPTOPP_ARM_NEON_AVAILABLE
877 
878 // *************************** IA-32 ***************************//
879 
880 #if (CRYPTOPP_SSSE3_AVAILABLE)
881 
882 inline void LEA_Enc_Block(__m128i &block0,
883  const word32 *subkeys, unsigned int rounds)
884 {
885  __m128i temp[4];
886  temp[0] = UnpackXMM<0>(block0);
887  temp[1] = UnpackXMM<1>(block0);
888  temp[2] = UnpackXMM<2>(block0);
889  temp[3] = UnpackXMM<3>(block0);
890 
891  LEA_Encryption(temp, subkeys, rounds);
892 
893  block0 = RepackXMM<0>(temp[0], temp[1], temp[2], temp[3]);
894 }
895 
896 inline void LEA_Dec_Block(__m128i &block0,
897  const word32 *subkeys, unsigned int rounds)
898 {
899  __m128i temp[4];
900  temp[0] = UnpackXMM<0>(block0);
901  temp[1] = UnpackXMM<1>(block0);
902  temp[2] = UnpackXMM<2>(block0);
903  temp[3] = UnpackXMM<3>(block0);
904 
905  LEA_Decryption(temp, subkeys, rounds);
906 
907  block0 = RepackXMM<0>(temp[0], temp[1], temp[2], temp[3]);
908 }
909 
910 inline void LEA_Enc_4_Blocks(__m128i &block0, __m128i &block1,
911  __m128i &block2, __m128i &block3, const word32 *subkeys, unsigned int rounds)
912 {
913  __m128i temp[4];
914  temp[0] = UnpackXMM<0>(block0, block1, block2, block3);
915  temp[1] = UnpackXMM<1>(block0, block1, block2, block3);
916  temp[2] = UnpackXMM<2>(block0, block1, block2, block3);
917  temp[3] = UnpackXMM<3>(block0, block1, block2, block3);
918 
919  LEA_Encryption(temp, subkeys, rounds);
920 
921  block0 = RepackXMM<0>(temp[0], temp[1], temp[2], temp[3]);
922  block1 = RepackXMM<1>(temp[0], temp[1], temp[2], temp[3]);
923  block2 = RepackXMM<2>(temp[0], temp[1], temp[2], temp[3]);
924  block3 = RepackXMM<3>(temp[0], temp[1], temp[2], temp[3]);
925 }
926 
927 inline void LEA_Dec_4_Blocks(__m128i &block0, __m128i &block1,
928  __m128i &block2, __m128i &block3, const word32 *subkeys, unsigned int rounds)
929 {
930  __m128i temp[4];
931  temp[0] = UnpackXMM<0>(block0, block1, block2, block3);
932  temp[1] = UnpackXMM<1>(block0, block1, block2, block3);
933  temp[2] = UnpackXMM<2>(block0, block1, block2, block3);
934  temp[3] = UnpackXMM<3>(block0, block1, block2, block3);
935 
936  LEA_Decryption(temp, subkeys, rounds);
937 
938  block0 = RepackXMM<0>(temp[0], temp[1], temp[2], temp[3]);
939  block1 = RepackXMM<1>(temp[0], temp[1], temp[2], temp[3]);
940  block2 = RepackXMM<2>(temp[0], temp[1], temp[2], temp[3]);
941  block3 = RepackXMM<3>(temp[0], temp[1], temp[2], temp[3]);
942 }
943 
944 #endif // CRYPTOPP_SSSE3_AVAILABLE
945 
946 // *************************** Power8 ***************************//
947 
948 #if (CRYPTOPP_POWER8_AVAILABLE)
949 
950 inline void LEA_Enc_Block(uint32x4_p &block0,
951  const word32 *subkeys, unsigned int rounds)
952 {
953  uint32x4_p temp[4];
954  temp[0] = UnpackSIMD<0>(block0);
955  temp[1] = UnpackSIMD<1>(block0);
956  temp[2] = UnpackSIMD<2>(block0);
957  temp[3] = UnpackSIMD<3>(block0);
958 
959  LEA_Encryption(temp, subkeys, rounds);
960 
961  block0 = RepackSIMD<0>(temp[0], temp[1], temp[2], temp[3]);
962 }
963 
964 inline void LEA_Dec_Block(uint32x4_p &block0,
965  const word32 *subkeys, unsigned int rounds)
966 {
967  uint32x4_p temp[4];
968  temp[0] = UnpackSIMD<0>(block0);
969  temp[1] = UnpackSIMD<1>(block0);
970  temp[2] = UnpackSIMD<2>(block0);
971  temp[3] = UnpackSIMD<3>(block0);
972 
973  LEA_Decryption(temp, subkeys, rounds);
974 
975  block0 = RepackSIMD<0>(temp[0], temp[1], temp[2], temp[3]);
976 }
977 
978 inline void LEA_Enc_4_Blocks(uint32x4_p &block0, uint32x4_p &block1,
979  uint32x4_p &block2, uint32x4_p &block3, const word32 *subkeys, unsigned int rounds)
980 {
981  uint32x4_p temp[4];
982  temp[0] = UnpackSIMD<0>(block0, block1, block2, block3);
983  temp[1] = UnpackSIMD<1>(block0, block1, block2, block3);
984  temp[2] = UnpackSIMD<2>(block0, block1, block2, block3);
985  temp[3] = UnpackSIMD<3>(block0, block1, block2, block3);
986 
987  LEA_Encryption(temp, subkeys, rounds);
988 
989  block0 = RepackSIMD<0>(temp[0], temp[1], temp[2], temp[3]);
990  block1 = RepackSIMD<1>(temp[0], temp[1], temp[2], temp[3]);
991  block2 = RepackSIMD<2>(temp[0], temp[1], temp[2], temp[3]);
992  block3 = RepackSIMD<3>(temp[0], temp[1], temp[2], temp[3]);
993 }
994 
995 inline void LEA_Dec_4_Blocks(uint32x4_p &block0, uint32x4_p &block1,
996  uint32x4_p &block2, uint32x4_p &block3, const word32 *subkeys, unsigned int rounds)
997 {
998  uint32x4_p temp[4];
999  temp[0] = UnpackSIMD<0>(block0, block1, block2, block3);
1000  temp[1] = UnpackSIMD<1>(block0, block1, block2, block3);
1001  temp[2] = UnpackSIMD<2>(block0, block1, block2, block3);
1002  temp[3] = UnpackSIMD<3>(block0, block1, block2, block3);
1003 
1004  LEA_Decryption(temp, subkeys, rounds);
1005 
1006  block0 = RepackSIMD<0>(temp[0], temp[1], temp[2], temp[3]);
1007  block1 = RepackSIMD<1>(temp[0], temp[1], temp[2], temp[3]);
1008  block2 = RepackSIMD<2>(temp[0], temp[1], temp[2], temp[3]);
1009  block3 = RepackSIMD<3>(temp[0], temp[1], temp[2], temp[3]);
1010 }
1011 
1012 #endif // CRYPTOPP_POWER8_AVAILABLE
1013 
1014 ANONYMOUS_NAMESPACE_END
1015 
1016 // *************************** SIMD Templates ***************************//
1017 
1018 NAMESPACE_BEGIN(CryptoPP)
1019 
1020 #if defined(CRYPTOPP_SSSE3_AVAILABLE)
1021 size_t LEA_Enc_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t rounds,
1022  const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
1023 {
1024  return AdvancedProcessBlocks128_4x1_SSE(LEA_Enc_Block, LEA_Enc_4_Blocks,
1025  subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1026 }
1027 
1028 size_t LEA_Dec_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t rounds,
1029  const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
1030 {
1031  return AdvancedProcessBlocks128_4x1_SSE(LEA_Dec_Block, LEA_Dec_4_Blocks,
1032  subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1033 }
1034 #endif // CRYPTOPP_SSSE3_AVAILABLE
1035 
1036 #if defined(CRYPTOPP_ARM_NEON_AVAILABLE)
1037 size_t LEA_Enc_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds,
1038  const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
1039 {
1040  return AdvancedProcessBlocks128_4x1_NEON(LEA_Enc_Block, LEA_Enc_4_Blocks,
1041  subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1042 }
1043 
1044 size_t LEA_Dec_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds,
1045  const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
1046 {
1047  return AdvancedProcessBlocks128_4x1_NEON(LEA_Dec_Block, LEA_Dec_4_Blocks,
1048  subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1049 }
1050 #endif // CRYPTOPP_ARM_NEON_AVAILABLE
1051 
1052 #if defined(CRYPTOPP_POWER8_AVAILABLE)
1053 size_t LEA_Enc_AdvancedProcessBlocks_POWER8(const word32* subKeys, size_t rounds,
1054  const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
1055 {
1056  return AdvancedProcessBlocks128_4x1_ALTIVEC(LEA_Enc_Block, LEA_Enc_4_Blocks,
1057  subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1058 }
1059 
1060 size_t LEA_Dec_AdvancedProcessBlocks_POWER8(const word32* subKeys, size_t rounds,
1061  const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
1062 {
1063  return AdvancedProcessBlocks128_4x1_ALTIVEC(LEA_Dec_Block, LEA_Dec_4_Blocks,
1064  subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1065 }
1066 #endif // CRYPTOPP_POWER8_AVAILABLE
1067 
1068 NAMESPACE_END
Utility functions for the Crypto++ library.
Classes for the LEA block cipher.
T1 VecSub(const T1 vec1, const T2 vec2)
Subtract two vectors.
Definition: ppc_simd.h:956
Library configuration file.
T1 VecAdd(const T1 vec1, const T2 vec2)
Add two vectors.
Definition: ppc_simd.h:939
T1 VecPermute(const T1 vec, const T2 mask)
Permutes a vector.
Definition: ppc_simd.h:1010
__vector unsigned int uint32x4_p
Vector of 32-bit elements.
Definition: ppc_simd.h:129
Support functions for PowerPC and vector operations.
Template for AdvancedProcessBlocks and SIMD processing.
Precompiled header file.
#define CRYPTOPP_ASSERT(exp)
Debugging and diagnostic assertion.
Definition: trap.h:69
T1 VecXor(const T1 vec1, const T2 vec2)
XOR two vectors.
Definition: ppc_simd.h:916
__vector unsigned long long uint64x2_p
Vector of 64-bit elements.
Definition: ppc_simd.h:139
Crypto++ library namespace.
__vector unsigned char uint8x16_p
Vector of 8-bit elements.
Definition: ppc_simd.h:119