Crypto++  8.2
Free C++ class library of cryptographic schemes
sha_simd.cpp
1 // sha_simd.cpp - written and placed in the public domain by
2 // Jeffrey Walton, Uri Blumenthal and Marcel Raad.
3 //
4 // This source file uses intrinsics to gain access to SHA-NI,
5 // ARMv8a SHA and POWER8 SHA instructions. A separate source file
6 // is needed because additional CXXFLAGS are required to enable the
7 // appropriate instruction sets in some build configurations.
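//
// As a sketch (the exact flags depend on the compiler and the build
// files): GCC and Clang typically need -msse4.2 -msha for the SHA-NI
// code, -march=armv8-a+crc+crypto for the ARMv8 code, and -mcpu=power8
// (or -qarch=pwr8 with IBM XL C/C++) for the POWER8 code.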
8 
9 #include "pch.h"
10 #include "config.h"
11 #include "sha.h"
12 #include "misc.h"
13 
14 #if defined(CRYPTOPP_DISABLE_SHA_ASM)
15 # undef CRYPTOPP_X86_ASM_AVAILABLE
16 # undef CRYPTOPP_X32_ASM_AVAILABLE
17 # undef CRYPTOPP_X64_ASM_AVAILABLE
18 # undef CRYPTOPP_SSE2_ASM_AVAILABLE
19 #endif
20 
21 #if (CRYPTOPP_SHANI_AVAILABLE)
22 # include <nmmintrin.h>
23 # include <immintrin.h>
24 #endif
25 
26 // C1189: error: This header is specific to ARM targets
27 #if (CRYPTOPP_ARM_NEON_AVAILABLE) && !defined(_M_ARM64)
28 # include <arm_neon.h>
29 #endif
30 
31 #if (CRYPTOPP_ARM_ACLE_AVAILABLE)
32 # include <stdint.h>
33 # include <arm_acle.h>
34 #endif
35 
36 #if CRYPTOPP_POWER8_SHA_AVAILABLE
37 # include "ppc_simd.h"
38 #endif
39 
40 #ifdef CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY
41 # include <signal.h>
42 # include <setjmp.h>
43 #endif
44 
45 #ifndef EXCEPTION_EXECUTE_HANDLER
46 # define EXCEPTION_EXECUTE_HANDLER 1
47 #endif
48 
49 // Clang __m128i casts
50 #define M128_CAST(x) ((__m128i *)(void *)(x))
51 #define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x))
52 
53 // Squash MS LNK4221 and libtool warnings
54 extern const char SHA_SIMD_FNAME[] = __FILE__;
55 
56 NAMESPACE_BEGIN(CryptoPP)
57 
58 // ***************** SHA key tables ********************
59 
60 extern const word32 SHA256_K[64];
61 extern const word64 SHA512_K[80];
62 
63 // ***************** SIGILL probes ********************
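//
// Each probe executes a candidate instruction inside a guard: SEH
// (__try/__except) under MSVC, and a SIGILL handler with setjmp/longjmp
// elsewhere. cpu.cpp uses these probes, typically as a fallback when the
// OS does not provide a reliable CPU feature query.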
64 
65 #ifdef CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY
66 extern "C" {
67  typedef void (*SigHandler)(int);
68 
69  static jmp_buf s_jmpSIGILL;
70  static void SigIllHandler(int)
71  {
72  longjmp(s_jmpSIGILL, 1);
73  }
74 }
75 #endif // CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY
76 
77 #if (CRYPTOPP_BOOL_ARM32 || CRYPTOPP_BOOL_ARMV8)
78 bool CPU_ProbeSHA1()
79 {
80 #if defined(CRYPTOPP_NO_CPU_FEATURE_PROBES)
81  return false;
82 #elif (CRYPTOPP_ARM_SHA1_AVAILABLE)
83 # if defined(CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY)
84  volatile bool result = true;
85  __try
86  {
87  unsigned int w[] = {1,2,3,4, 5,6,7,8, 9,10,11,12};
88  uint32x4_t data1 = vld1q_u32(w+0);
89  uint32x4_t data2 = vld1q_u32(w+4);
90  uint32x4_t data3 = vld1q_u32(w+8);
91 
92  uint32x4_t r1 = vsha1cq_u32 (data1, 0, data2);
93  uint32x4_t r2 = vsha1mq_u32 (data1, 0, data2);
94  uint32x4_t r3 = vsha1pq_u32 (data1, 0, data2);
95  uint32x4_t r4 = vsha1su0q_u32 (data1, data2, data3);
96  uint32x4_t r5 = vsha1su1q_u32 (data1, data2);
97 
98  result = !!(vgetq_lane_u32(r1,0) | vgetq_lane_u32(r2,1) | vgetq_lane_u32(r3,2) | vgetq_lane_u32(r4,3) | vgetq_lane_u32(r5,0));
99  }
100  __except (EXCEPTION_EXECUTE_HANDLER)
101  {
102  return false;
103  }
104  return result;
105 # else
106 
107  // longjmp and clobber warnings. Volatile is required.
108  // http://github.com/weidai11/cryptopp/issues/24 and http://stackoverflow.com/q/7721854
109  volatile bool result = true;
110 
111  volatile SigHandler oldHandler = signal(SIGILL, SigIllHandler);
112  if (oldHandler == SIG_ERR)
113  return false;
114 
115  volatile sigset_t oldMask;
116  if (sigprocmask(0, NULLPTR, (sigset_t*)&oldMask))
117  return false;
118 
119  if (setjmp(s_jmpSIGILL))
120  result = false;
121  else
122  {
123  unsigned int w[] = {1,2,3,4, 5,6,7,8, 9,10,11,12};
124  uint32x4_t data1 = vld1q_u32(w+0);
125  uint32x4_t data2 = vld1q_u32(w+4);
126  uint32x4_t data3 = vld1q_u32(w+8);
127 
128  uint32x4_t r1 = vsha1cq_u32 (data1, 0, data2);
129  uint32x4_t r2 = vsha1mq_u32 (data1, 0, data2);
130  uint32x4_t r3 = vsha1pq_u32 (data1, 0, data2);
131  uint32x4_t r4 = vsha1su0q_u32 (data1, data2, data3);
132  uint32x4_t r5 = vsha1su1q_u32 (data1, data2);
133 
134  result = !!(vgetq_lane_u32(r1,0) | vgetq_lane_u32(r2,1) | vgetq_lane_u32(r3,2) | vgetq_lane_u32(r4,3) | vgetq_lane_u32(r5,0));
135  }
136 
137  sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR);
138  signal(SIGILL, oldHandler);
139  return result;
140 # endif
141 #else
142  return false;
143 #endif // CRYPTOPP_ARM_SHA1_AVAILABLE
144 }
145 
146 bool CPU_ProbeSHA2()
147 {
148 #if defined(CRYPTOPP_NO_CPU_FEATURE_PROBES)
149  return false;
150 #elif (CRYPTOPP_ARM_SHA2_AVAILABLE)
151 # if defined(CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY)
152  volatile bool result = true;
153  __try
154  {
155  unsigned int w[] = {1,2,3,4, 5,6,7,8, 9,10,11,12};
156  uint32x4_t data1 = vld1q_u32(w+0);
157  uint32x4_t data2 = vld1q_u32(w+4);
158  uint32x4_t data3 = vld1q_u32(w+8);
159 
160  uint32x4_t r1 = vsha256hq_u32 (data1, data2, data3);
161  uint32x4_t r2 = vsha256h2q_u32 (data1, data2, data3);
162  uint32x4_t r3 = vsha256su0q_u32 (data1, data2);
163  uint32x4_t r4 = vsha256su1q_u32 (data1, data2, data3);
164 
165  result = !!(vgetq_lane_u32(r1,0) | vgetq_lane_u32(r2,1) | vgetq_lane_u32(r3,2) | vgetq_lane_u32(r4,3));
166  }
167  __except (EXCEPTION_EXECUTE_HANDLER)
168  {
169  return false;
170  }
171  return result;
172 # else
173 
174  // longjmp and clobber warnings. Volatile is required.
175  // http://github.com/weidai11/cryptopp/issues/24 and http://stackoverflow.com/q/7721854
176  volatile bool result = true;
177 
178  volatile SigHandler oldHandler = signal(SIGILL, SigIllHandler);
179  if (oldHandler == SIG_ERR)
180  return false;
181 
182  volatile sigset_t oldMask;
183  if (sigprocmask(0, NULLPTR, (sigset_t*)&oldMask))
184  return false;
185 
186  if (setjmp(s_jmpSIGILL))
187  result = false;
188  else
189  {
190  unsigned int w[] = {1,2,3,4, 5,6,7,8, 9,10,11,12};
191  uint32x4_t data1 = vld1q_u32(w+0);
192  uint32x4_t data2 = vld1q_u32(w+4);
193  uint32x4_t data3 = vld1q_u32(w+8);
194 
195  uint32x4_t r1 = vsha256hq_u32 (data1, data2, data3);
196  uint32x4_t r2 = vsha256h2q_u32 (data1, data2, data3);
197  uint32x4_t r3 = vsha256su0q_u32 (data1, data2);
198  uint32x4_t r4 = vsha256su1q_u32 (data1, data2, data3);
199 
200  result = !!(vgetq_lane_u32(r1,0) | vgetq_lane_u32(r2,1) | vgetq_lane_u32(r3,2) | vgetq_lane_u32(r4,3));
201  }
202 
203  sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR);
204  signal(SIGILL, oldHandler);
205  return result;
206 # endif
207 #else
208  return false;
209 #endif // CRYPTOPP_ARM_SHA2_AVAILABLE
210 }
211 #endif // ARM32 or ARM64
212 
213 // ***************** Intel x86 SHA ********************
214 
215 /////////////////////////////////////
216 // start of Walton and Gulley code //
217 /////////////////////////////////////
218 
219 #if CRYPTOPP_SHANI_AVAILABLE
220 // Based on http://software.intel.com/en-us/articles/intel-sha-extensions and code by Sean Gulley.
221 void SHA1_HashMultipleBlocks_SHANI(word32 *state, const word32 *data, size_t length, ByteOrder order)
222 {
223  CRYPTOPP_ASSERT(state);
224  CRYPTOPP_ASSERT(data);
225  CRYPTOPP_ASSERT(length >= SHA1::BLOCKSIZE);
226 
227  __m128i ABCD, ABCD_SAVE, E0, E0_SAVE, E1;
228  __m128i MASK, MSG0, MSG1, MSG2, MSG3;
229 
230  // Load initial values
231  ABCD = _mm_loadu_si128(CONST_M128_CAST(state));
232  E0 = _mm_set_epi32(state[4], 0, 0, 0);
233  ABCD = _mm_shuffle_epi32(ABCD, 0x1B);
234 
235  // IA-32 SHA is little endian, SHA::Transform is big endian,
236  // and SHA::HashMultipleBlocks can be either. ByteOrder
237  // allows us to avoid extra endian reversals. It saves 1.0 cpb.
238  MASK = order == BIG_ENDIAN_ORDER ? // Data arrangement
239  _mm_set_epi8(0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15) :
240  _mm_set_epi8(3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12) ;
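 // With BIG_ENDIAN_ORDER the mask reverses all 16 bytes (the dword order
 // and the bytes within each dword); with LITTLE_ENDIAN_ORDER it reverses
 // only the dword order and leaves the bytes within each dword alone.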
241 
242  while (length >= SHA1::BLOCKSIZE)
243  {
244  // Save current hash
245  ABCD_SAVE = ABCD;
246  E0_SAVE = E0;
247 
248  // Rounds 0-3
249  MSG0 = _mm_loadu_si128(CONST_M128_CAST(data+0));
250  MSG0 = _mm_shuffle_epi8(MSG0, MASK);
251  E0 = _mm_add_epi32(E0, MSG0);
252  E1 = ABCD;
253  ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0);
254 
255  // Rounds 4-7
256  MSG1 = _mm_loadu_si128(CONST_M128_CAST(data+4));
257  MSG1 = _mm_shuffle_epi8(MSG1, MASK);
258  E1 = _mm_sha1nexte_epu32(E1, MSG1);
259  E0 = ABCD;
260  ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 0);
261  MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
262 
263  // Rounds 8-11
264  MSG2 = _mm_loadu_si128(CONST_M128_CAST(data+8));
265  MSG2 = _mm_shuffle_epi8(MSG2, MASK);
266  E0 = _mm_sha1nexte_epu32(E0, MSG2);
267  E1 = ABCD;
268  ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0);
269  MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
270  MSG0 = _mm_xor_si128(MSG0, MSG2);
271 
272  // Rounds 12-15
273  MSG3 = _mm_loadu_si128(CONST_M128_CAST(data+12));
274  MSG3 = _mm_shuffle_epi8(MSG3, MASK);
275  E1 = _mm_sha1nexte_epu32(E1, MSG3);
276  E0 = ABCD;
277  MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
278  ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 0);
279  MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
280  MSG1 = _mm_xor_si128(MSG1, MSG3);
281 
282  // Rounds 16-19
283  E0 = _mm_sha1nexte_epu32(E0, MSG0);
284  E1 = ABCD;
285  MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
286  ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0);
287  MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
288  MSG2 = _mm_xor_si128(MSG2, MSG0);
289 
290  // Rounds 20-23
291  E1 = _mm_sha1nexte_epu32(E1, MSG1);
292  E0 = ABCD;
293  MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
294  ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
295  MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
296  MSG3 = _mm_xor_si128(MSG3, MSG1);
297 
298  // Rounds 24-27
299  E0 = _mm_sha1nexte_epu32(E0, MSG2);
300  E1 = ABCD;
301  MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
302  ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 1);
303  MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
304  MSG0 = _mm_xor_si128(MSG0, MSG2);
305 
306  // Rounds 28-31
307  E1 = _mm_sha1nexte_epu32(E1, MSG3);
308  E0 = ABCD;
309  MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
310  ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
311  MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
312  MSG1 = _mm_xor_si128(MSG1, MSG3);
313 
314  // Rounds 32-35
315  E0 = _mm_sha1nexte_epu32(E0, MSG0);
316  E1 = ABCD;
317  MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
318  ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 1);
319  MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
320  MSG2 = _mm_xor_si128(MSG2, MSG0);
321 
322  // Rounds 36-39
323  E1 = _mm_sha1nexte_epu32(E1, MSG1);
324  E0 = ABCD;
325  MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
326  ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
327  MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
328  MSG3 = _mm_xor_si128(MSG3, MSG1);
329 
330  // Rounds 40-43
331  E0 = _mm_sha1nexte_epu32(E0, MSG2);
332  E1 = ABCD;
333  MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
334  ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
335  MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
336  MSG0 = _mm_xor_si128(MSG0, MSG2);
337 
338  // Rounds 44-47
339  E1 = _mm_sha1nexte_epu32(E1, MSG3);
340  E0 = ABCD;
341  MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
342  ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 2);
343  MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
344  MSG1 = _mm_xor_si128(MSG1, MSG3);
345 
346  // Rounds 48-51
347  E0 = _mm_sha1nexte_epu32(E0, MSG0);
348  E1 = ABCD;
349  MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
350  ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
351  MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
352  MSG2 = _mm_xor_si128(MSG2, MSG0);
353 
354  // Rounds 52-55
355  E1 = _mm_sha1nexte_epu32(E1, MSG1);
356  E0 = ABCD;
357  MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
358  ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 2);
359  MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
360  MSG3 = _mm_xor_si128(MSG3, MSG1);
361 
362  // Rounds 56-59
363  E0 = _mm_sha1nexte_epu32(E0, MSG2);
364  E1 = ABCD;
365  MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
366  ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
367  MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
368  MSG0 = _mm_xor_si128(MSG0, MSG2);
369 
370  // Rounds 60-63
371  E1 = _mm_sha1nexte_epu32(E1, MSG3);
372  E0 = ABCD;
373  MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
374  ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);
375  MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
376  MSG1 = _mm_xor_si128(MSG1, MSG3);
377 
378  // Rounds 64-67
379  E0 = _mm_sha1nexte_epu32(E0, MSG0);
380  E1 = ABCD;
381  MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
382  ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 3);
383  MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
384  MSG2 = _mm_xor_si128(MSG2, MSG0);
385 
386  // Rounds 68-71
387  E1 = _mm_sha1nexte_epu32(E1, MSG1);
388  E0 = ABCD;
389  MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
390  ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);
391  MSG3 = _mm_xor_si128(MSG3, MSG1);
392 
393  // Rounds 72-75
394  E0 = _mm_sha1nexte_epu32(E0, MSG2);
395  E1 = ABCD;
396  MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
397  ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 3);
398 
399  // Rounds 76-79
400  E1 = _mm_sha1nexte_epu32(E1, MSG3);
401  E0 = ABCD;
402  ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);
403 
404  // Add values back to state
405  E0 = _mm_sha1nexte_epu32(E0, E0_SAVE);
406  ABCD = _mm_add_epi32(ABCD, ABCD_SAVE);
407 
408  data += SHA1::BLOCKSIZE/sizeof(word32);
409  length -= SHA1::BLOCKSIZE;
410  }
411 
412  // Save state
413  ABCD = _mm_shuffle_epi32(ABCD, 0x1B);
414  _mm_storeu_si128(M128_CAST(state), ABCD);
415  state[4] = _mm_extract_epi32(E0, 3);
416 }
417 
418 // Based on http://software.intel.com/en-us/articles/intel-sha-extensions and code by Sean Gulley.
419 void SHA256_HashMultipleBlocks_SHANI(word32 *state, const word32 *data, size_t length, ByteOrder order)
420 {
421  CRYPTOPP_ASSERT(state);
422  CRYPTOPP_ASSERT(data);
423  CRYPTOPP_ASSERT(length >= SHA256::BLOCKSIZE);
424 
425  __m128i STATE0, STATE1;
426  __m128i MSG, TMP, MASK;
427  __m128i TMSG0, TMSG1, TMSG2, TMSG3;
428  __m128i ABEF_SAVE, CDGH_SAVE;
429 
430  // Load initial values
431  TMP = _mm_loadu_si128(M128_CAST(&state[0]));
432  STATE1 = _mm_loadu_si128(M128_CAST(&state[4]));
433 
434  // IA-32 SHA is little endian, SHA::Transform is big endian,
435  // and SHA::HashMultipleBlocks can be either. ByteOrder
436  // allows us to avoid extra endian reversals. It saves 1.0 cpb.
437  MASK = order == BIG_ENDIAN_ORDER ? // Data arrangement
438  _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3) :
439  _mm_set_epi8(15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0) ;
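 // With BIG_ENDIAN_ORDER the mask byte-swaps each 32-bit word and keeps
 // the dword order; with LITTLE_ENDIAN_ORDER it is an identity shuffle.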
440 
441  TMP = _mm_shuffle_epi32(TMP, 0xB1); // CDAB
442  STATE1 = _mm_shuffle_epi32(STATE1, 0x1B); // EFGH
443  STATE0 = _mm_alignr_epi8(TMP, STATE1, 8); // ABEF
444  STATE1 = _mm_blend_epi16(STATE1, TMP, 0xF0); // CDGH
445 
446  while (length >= SHA256::BLOCKSIZE)
447  {
448  // Save current hash
449  ABEF_SAVE = STATE0;
450  CDGH_SAVE = STATE1;
451 
452  // Rounds 0-3
453  MSG = _mm_loadu_si128(CONST_M128_CAST(data+0));
454  TMSG0 = _mm_shuffle_epi8(MSG, MASK);
455  MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(W64LIT(0xE9B5DBA5B5C0FBCF), W64LIT(0x71374491428A2F98)));
456  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
457  MSG = _mm_shuffle_epi32(MSG, 0x0E);
458  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
459 
460  // Rounds 4-7
461  TMSG1 = _mm_loadu_si128(CONST_M128_CAST(data+4));
462  TMSG1 = _mm_shuffle_epi8(TMSG1, MASK);
463  MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(W64LIT(0xAB1C5ED5923F82A4), W64LIT(0x59F111F13956C25B)));
464  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
465  MSG = _mm_shuffle_epi32(MSG, 0x0E);
466  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
467  TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
468 
469  // Rounds 8-11
470  TMSG2 = _mm_loadu_si128(CONST_M128_CAST(data+8));
471  TMSG2 = _mm_shuffle_epi8(TMSG2, MASK);
472  MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(W64LIT(0x550C7DC3243185BE), W64LIT(0x12835B01D807AA98)));
473  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
474  MSG = _mm_shuffle_epi32(MSG, 0x0E);
475  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
476  TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
477 
478  // Rounds 12-15
479  TMSG3 = _mm_loadu_si128(CONST_M128_CAST(data+12));
480  TMSG3 = _mm_shuffle_epi8(TMSG3, MASK);
481  MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(W64LIT(0xC19BF1749BDC06A7), W64LIT(0x80DEB1FE72BE5D74)));
482  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
483  TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
484  TMSG0 = _mm_add_epi32(TMSG0, TMP);
485  TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
486  MSG = _mm_shuffle_epi32(MSG, 0x0E);
487  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
488  TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
489 
490  // Rounds 16-19
491  MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(W64LIT(0x240CA1CC0FC19DC6), W64LIT(0xEFBE4786E49B69C1)));
492  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
493  TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
494  TMSG1 = _mm_add_epi32(TMSG1, TMP);
495  TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
496  MSG = _mm_shuffle_epi32(MSG, 0x0E);
497  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
498  TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
499 
500  // Rounds 20-23
501  MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(W64LIT(0x76F988DA5CB0A9DC), W64LIT(0x4A7484AA2DE92C6F)));
502  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
503  TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
504  TMSG2 = _mm_add_epi32(TMSG2, TMP);
505  TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
506  MSG = _mm_shuffle_epi32(MSG, 0x0E);
507  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
508  TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
509 
510  // Rounds 24-27
511  MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(W64LIT(0xBF597FC7B00327C8), W64LIT(0xA831C66D983E5152)));
512  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
513  TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
514  TMSG3 = _mm_add_epi32(TMSG3, TMP);
515  TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
516  MSG = _mm_shuffle_epi32(MSG, 0x0E);
517  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
518  TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
519 
520  // Rounds 28-31
521  MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(W64LIT(0x1429296706CA6351), W64LIT(0xD5A79147C6E00BF3)));
522  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
523  TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
524  TMSG0 = _mm_add_epi32(TMSG0, TMP);
525  TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
526  MSG = _mm_shuffle_epi32(MSG, 0x0E);
527  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
528  TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
529 
530  // Rounds 32-35
531  MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(W64LIT(0x53380D134D2C6DFC), W64LIT(0x2E1B213827B70A85)));
532  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
533  TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
534  TMSG1 = _mm_add_epi32(TMSG1, TMP);
535  TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
536  MSG = _mm_shuffle_epi32(MSG, 0x0E);
537  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
538  TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
539 
540  // Rounds 36-39
541  MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(W64LIT(0x92722C8581C2C92E), W64LIT(0x766A0ABB650A7354)));
542  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
543  TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
544  TMSG2 = _mm_add_epi32(TMSG2, TMP);
545  TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
546  MSG = _mm_shuffle_epi32(MSG, 0x0E);
547  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
548  TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
549 
550  // Rounds 40-43
551  MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(W64LIT(0xC76C51A3C24B8B70), W64LIT(0xA81A664BA2BFE8A1)));
552  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
553  TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
554  TMSG3 = _mm_add_epi32(TMSG3, TMP);
555  TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
556  MSG = _mm_shuffle_epi32(MSG, 0x0E);
557  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
558  TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
559 
560  // Rounds 44-47
561  MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(W64LIT(0x106AA070F40E3585), W64LIT(0xD6990624D192E819)));
562  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
563  TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
564  TMSG0 = _mm_add_epi32(TMSG0, TMP);
565  TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
566  MSG = _mm_shuffle_epi32(MSG, 0x0E);
567  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
568  TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
569 
570  // Rounds 48-51
571  MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(W64LIT(0x34B0BCB52748774C), W64LIT(0x1E376C0819A4C116)));
572  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
573  TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
574  TMSG1 = _mm_add_epi32(TMSG1, TMP);
575  TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
576  MSG = _mm_shuffle_epi32(MSG, 0x0E);
577  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
578  TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
579 
580  // Rounds 52-55
581  MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(W64LIT(0x682E6FF35B9CCA4F), W64LIT(0x4ED8AA4A391C0CB3)));
582  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
583  TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
584  TMSG2 = _mm_add_epi32(TMSG2, TMP);
585  TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
586  MSG = _mm_shuffle_epi32(MSG, 0x0E);
587  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
588 
589  // Rounds 56-59
590  MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(W64LIT(0x8CC7020884C87814), W64LIT(0x78A5636F748F82EE)));
591  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
592  TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
593  TMSG3 = _mm_add_epi32(TMSG3, TMP);
594  TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
595  MSG = _mm_shuffle_epi32(MSG, 0x0E);
596  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
597 
598  // Rounds 60-63
599  MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(W64LIT(0xC67178F2BEF9A3F7), W64LIT(0xA4506CEB90BEFFFA)));
600  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
601  MSG = _mm_shuffle_epi32(MSG, 0x0E);
602  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
603 
604  // Add values back to state
605  STATE0 = _mm_add_epi32(STATE0, ABEF_SAVE);
606  STATE1 = _mm_add_epi32(STATE1, CDGH_SAVE);
607 
608  data += SHA256::BLOCKSIZE/sizeof(word32);
609  length -= SHA256::BLOCKSIZE;
610  }
611 
612  TMP = _mm_shuffle_epi32(STATE0, 0x1B); // FEBA
613  STATE1 = _mm_shuffle_epi32(STATE1, 0xB1); // DCHG
614  STATE0 = _mm_blend_epi16(TMP, STATE1, 0xF0); // DCBA
615  STATE1 = _mm_alignr_epi8(STATE1, TMP, 8); // HGFE
616 
617  // Save state
618  _mm_storeu_si128(M128_CAST(&state[0]), STATE0);
619  _mm_storeu_si128(M128_CAST(&state[4]), STATE1);
620 }
621 #endif // CRYPTOPP_SHANI_AVAILABLE
622 
623 ///////////////////////////////////
624 // end of Walton and Gulley code //
625 ///////////////////////////////////
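
// The SHA-NI, ARMv8 and POWER8 routines in this file are not called
// directly by user code. The SHA classes in sha.cpp select them at run
// time when the CPU reports the feature. A minimal sketch of the public
// API that ends up exercising these code paths:
//
//   #include "sha.h"
//   CryptoPP::SHA256 hash;
//   CryptoPP::byte digest[CryptoPP::SHA256::DIGESTSIZE];
//   hash.Update(reinterpret_cast<const CryptoPP::byte*>("abc"), 3);
//   hash.Final(digest);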
626 
627 // ***************** ARMV8 SHA ********************
628 
629 /////////////////////////////////////////////////////////////
630 // start of Walton, Schneiders, O'Rourke and Hovsmith code //
631 /////////////////////////////////////////////////////////////
632 
633 #if CRYPTOPP_ARM_SHA1_AVAILABLE
634 void SHA1_HashMultipleBlocks_ARMV8(word32 *state, const word32 *data, size_t length, ByteOrder order)
635 {
636  CRYPTOPP_ASSERT(state);
637  CRYPTOPP_ASSERT(data);
638  CRYPTOPP_ASSERT(length >= SHA1::BLOCKSIZE);
639 
640  uint32x4_t C0, C1, C2, C3;
641  uint32x4_t ABCD, ABCD_SAVED;
642  uint32x4_t MSG0, MSG1, MSG2, MSG3;
643  uint32x4_t TMP0, TMP1;
644  uint32_t E0, E0_SAVED, E1;
645 
646  // Load initial values
647  C0 = vdupq_n_u32(0x5A827999);
648  C1 = vdupq_n_u32(0x6ED9EBA1);
649  C2 = vdupq_n_u32(0x8F1BBCDC);
650  C3 = vdupq_n_u32(0xCA62C1D6);
651 
652  ABCD = vld1q_u32(&state[0]);
653  E0 = state[4];
654 
655  while (length >= SHA1::BLOCKSIZE)
656  {
657  // Save current hash
658  ABCD_SAVED = ABCD;
659  E0_SAVED = E0;
660 
661  MSG0 = vld1q_u32(data + 0);
662  MSG1 = vld1q_u32(data + 4);
663  MSG2 = vld1q_u32(data + 8);
664  MSG3 = vld1q_u32(data + 12);
665 
666  if (order == BIG_ENDIAN_ORDER) // Data arrangement
667  {
668  MSG0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG0)));
669  MSG1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG1)));
670  MSG2 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG2)));
671  MSG3 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG3)));
672  }
673 
674  TMP0 = vaddq_u32(MSG0, C0);
675  TMP1 = vaddq_u32(MSG1, C0);
676 
677  // Rounds 0-3
678  E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
679  ABCD = vsha1cq_u32(ABCD, E0, TMP0);
680  TMP0 = vaddq_u32(MSG2, C0);
681  MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);
682 
683  // Rounds 4-7
684  E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
685  ABCD = vsha1cq_u32(ABCD, E1, TMP1);
686  TMP1 = vaddq_u32(MSG3, C0);
687  MSG0 = vsha1su1q_u32(MSG0, MSG3);
688  MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3);
689 
690  // Rounds 8-11
691  E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
692  ABCD = vsha1cq_u32(ABCD, E0, TMP0);
693  TMP0 = vaddq_u32(MSG0, C0);
694  MSG1 = vsha1su1q_u32(MSG1, MSG0);
695  MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0);
696 
697  // Rounds 12-15
698  E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
699  ABCD = vsha1cq_u32(ABCD, E1, TMP1);
700  TMP1 = vaddq_u32(MSG1, C1);
701  MSG2 = vsha1su1q_u32(MSG2, MSG1);
702  MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1);
703 
704  // Rounds 16-19
705  E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
706  ABCD = vsha1cq_u32(ABCD, E0, TMP0);
707  TMP0 = vaddq_u32(MSG2, C1);
708  MSG3 = vsha1su1q_u32(MSG3, MSG2);
709  MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);
710 
711  // Rounds 20-23
712  E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
713  ABCD = vsha1pq_u32(ABCD, E1, TMP1);
714  TMP1 = vaddq_u32(MSG3, C1);
715  MSG0 = vsha1su1q_u32(MSG0, MSG3);
716  MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3);
717 
718  // Rounds 24-27
719  E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
720  ABCD = vsha1pq_u32(ABCD, E0, TMP0);
721  TMP0 = vaddq_u32(MSG0, C1);
722  MSG1 = vsha1su1q_u32(MSG1, MSG0);
723  MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0);
724 
725  // Rounds 28-31
726  E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
727  ABCD = vsha1pq_u32(ABCD, E1, TMP1);
728  TMP1 = vaddq_u32(MSG1, C1);
729  MSG2 = vsha1su1q_u32(MSG2, MSG1);
730  MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1);
731 
732  // Rounds 32-35
733  E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
734  ABCD = vsha1pq_u32(ABCD, E0, TMP0);
735  TMP0 = vaddq_u32(MSG2, C2);
736  MSG3 = vsha1su1q_u32(MSG3, MSG2);
737  MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);
738 
739  // Rounds 36-39
740  E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
741  ABCD = vsha1pq_u32(ABCD, E1, TMP1);
742  TMP1 = vaddq_u32(MSG3, C2);
743  MSG0 = vsha1su1q_u32(MSG0, MSG3);
744  MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3);
745 
746  // Rounds 40-43
747  E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
748  ABCD = vsha1mq_u32(ABCD, E0, TMP0);
749  TMP0 = vaddq_u32(MSG0, C2);
750  MSG1 = vsha1su1q_u32(MSG1, MSG0);
751  MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0);
752 
753  // Rounds 44-47
754  E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
755  ABCD = vsha1mq_u32(ABCD, E1, TMP1);
756  TMP1 = vaddq_u32(MSG1, C2);
757  MSG2 = vsha1su1q_u32(MSG2, MSG1);
758  MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1);
759 
760  // Rounds 48-51
761  E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
762  ABCD = vsha1mq_u32(ABCD, E0, TMP0);
763  TMP0 = vaddq_u32(MSG2, C2);
764  MSG3 = vsha1su1q_u32(MSG3, MSG2);
765  MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);
766 
767  // Rounds 52-55
768  E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
769  ABCD = vsha1mq_u32(ABCD, E1, TMP1);
770  TMP1 = vaddq_u32(MSG3, C3);
771  MSG0 = vsha1su1q_u32(MSG0, MSG3);
772  MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3);
773 
774  // Rounds 56-59
775  E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
776  ABCD = vsha1mq_u32(ABCD, E0, TMP0);
777  TMP0 = vaddq_u32(MSG0, C3);
778  MSG1 = vsha1su1q_u32(MSG1, MSG0);
779  MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0);
780 
781  // Rounds 60-63
782  E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
783  ABCD = vsha1pq_u32(ABCD, E1, TMP1);
784  TMP1 = vaddq_u32(MSG1, C3);
785  MSG2 = vsha1su1q_u32(MSG2, MSG1);
786  MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1);
787 
788  // Rounds 64-67
789  E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
790  ABCD = vsha1pq_u32(ABCD, E0, TMP0);
791  TMP0 = vaddq_u32(MSG2, C3);
792  MSG3 = vsha1su1q_u32(MSG3, MSG2);
793  MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);
794 
795  // Rounds 68-71
796  E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
797  ABCD = vsha1pq_u32(ABCD, E1, TMP1);
798  TMP1 = vaddq_u32(MSG3, C3);
799  MSG0 = vsha1su1q_u32(MSG0, MSG3);
800 
801  // Rounds 72-75
802  E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
803  ABCD = vsha1pq_u32(ABCD, E0, TMP0);
804 
805  // Rounds 76-79
806  E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
807  ABCD = vsha1pq_u32(ABCD, E1, TMP1);
808 
809  E0 += E0_SAVED;
810  ABCD = vaddq_u32(ABCD_SAVED, ABCD);
811 
812  data += SHA1::BLOCKSIZE/sizeof(word32);
813  length -= SHA1::BLOCKSIZE;
814  }
815 
816  // Save state
817  vst1q_u32(&state[0], ABCD);
818  state[4] = E0;
819 }
820 #endif // CRYPTOPP_ARM_SHA1_AVAILABLE
821 
822 #if CRYPTOPP_ARM_SHA2_AVAILABLE
823 void SHA256_HashMultipleBlocks_ARMV8(word32 *state, const word32 *data, size_t length, ByteOrder order)
824 {
825  CRYPTOPP_ASSERT(state);
826  CRYPTOPP_ASSERT(data);
827  CRYPTOPP_ASSERT(length >= SHA256::BLOCKSIZE);
828 
829  uint32x4_t STATE0, STATE1, ABEF_SAVE, CDGH_SAVE;
830  uint32x4_t MSG0, MSG1, MSG2, MSG3;
831  uint32x4_t TMP0, TMP1, TMP2;
832 
833  // Load initial values
834  STATE0 = vld1q_u32(&state[0]);
835  STATE1 = vld1q_u32(&state[4]);
836 
837  while (length >= SHA256::BLOCKSIZE)
838  {
839  // Save current hash
840  ABEF_SAVE = STATE0;
841  CDGH_SAVE = STATE1;
842 
843  // Load message
844  MSG0 = vld1q_u32(data + 0);
845  MSG1 = vld1q_u32(data + 4);
846  MSG2 = vld1q_u32(data + 8);
847  MSG3 = vld1q_u32(data + 12);
848 
849  if (order == BIG_ENDIAN_ORDER) // Data arrangement
850  {
851  MSG0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG0)));
852  MSG1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG1)));
853  MSG2 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG2)));
854  MSG3 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG3)));
855  }
856 
857  TMP0 = vaddq_u32(MSG0, vld1q_u32(&SHA256_K[0x00]));
858 
859  // Rounds 0-3
860  MSG0 = vsha256su0q_u32(MSG0, MSG1);
861  TMP2 = STATE0;
862  TMP1 = vaddq_u32(MSG1, vld1q_u32(&SHA256_K[0x04]));
863  STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
864  STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
865  MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);
866 
867  // Rounds 4-7
868  MSG1 = vsha256su0q_u32(MSG1, MSG2);
869  TMP2 = STATE0;
870  TMP0 = vaddq_u32(MSG2, vld1q_u32(&SHA256_K[0x08]));
871  STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
872  STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
873  MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);
874 
875  // Rounds 8-11
876  MSG2 = vsha256su0q_u32(MSG2, MSG3);
877  TMP2 = STATE0;
878  TMP1 = vaddq_u32(MSG3, vld1q_u32(&SHA256_K[0x0c]));
879  STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
880  STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
881  MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);
882 
883  // Rounds 12-15
884  MSG3 = vsha256su0q_u32(MSG3, MSG0);
885  TMP2 = STATE0;
886  TMP0 = vaddq_u32(MSG0, vld1q_u32(&SHA256_K[0x10]));
887  STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
888  STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
889  MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);
890 
891  // Rounds 16-19
892  MSG0 = vsha256su0q_u32(MSG0, MSG1);
893  TMP2 = STATE0;
894  TMP1 = vaddq_u32(MSG1, vld1q_u32(&SHA256_K[0x14]));
895  STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
896  STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
897  MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);
898 
899  // Rounds 20-23
900  MSG1 = vsha256su0q_u32(MSG1, MSG2);
901  TMP2 = STATE0;
902  TMP0 = vaddq_u32(MSG2, vld1q_u32(&SHA256_K[0x18]));
903  STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
904  STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
905  MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);
906 
907  // Rounds 24-27
908  MSG2 = vsha256su0q_u32(MSG2, MSG3);
909  TMP2 = STATE0;
910  TMP1 = vaddq_u32(MSG3, vld1q_u32(&SHA256_K[0x1c]));
911  STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
912  STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
913  MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);
914 
915  // Rounds 28-31
916  MSG3 = vsha256su0q_u32(MSG3, MSG0);
917  TMP2 = STATE0;
918  TMP0 = vaddq_u32(MSG0, vld1q_u32(&SHA256_K[0x20]));
919  STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
920  STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
921  MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);
922 
923  // Rounds 32-35
924  MSG0 = vsha256su0q_u32(MSG0, MSG1);
925  TMP2 = STATE0;
926  TMP1 = vaddq_u32(MSG1, vld1q_u32(&SHA256_K[0x24]));
927  STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
928  STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
929  MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);
930 
931  // Rounds 36-39
932  MSG1 = vsha256su0q_u32(MSG1, MSG2);
933  TMP2 = STATE0;
934  TMP0 = vaddq_u32(MSG2, vld1q_u32(&SHA256_K[0x28]));
935  STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
936  STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
937  MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);
938 
939  // Rounds 40-43
940  MSG2 = vsha256su0q_u32(MSG2, MSG3);
941  TMP2 = STATE0;
942  TMP1 = vaddq_u32(MSG3, vld1q_u32(&SHA256_K[0x2c]));
943  STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
944  STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
945  MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);
946 
947  // Rounds 44-47
948  MSG3 = vsha256su0q_u32(MSG3, MSG0);
949  TMP2 = STATE0;
950  TMP0 = vaddq_u32(MSG0, vld1q_u32(&SHA256_K[0x30]));
951  STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
952  STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
953  MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);
954 
955  // Rounds 48-51
956  TMP2 = STATE0;
957  TMP1 = vaddq_u32(MSG1, vld1q_u32(&SHA256_K[0x34]));
958  STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
959  STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
960 
961  // Rounds 52-55
962  TMP2 = STATE0;
963  TMP0 = vaddq_u32(MSG2, vld1q_u32(&SHA256_K[0x38]));
964  STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
965  STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
966 
967  // Rounds 56-59
968  TMP2 = STATE0;
969  TMP1 = vaddq_u32(MSG3, vld1q_u32(&SHA256_K[0x3c]));
970  STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
971  STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
972 
973  // Rounds 60-63
974  TMP2 = STATE0;
975  STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
976  STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
977 
978  // Add back to state
979  STATE0 = vaddq_u32(STATE0, ABEF_SAVE);
980  STATE1 = vaddq_u32(STATE1, CDGH_SAVE);
981 
982  data += SHA256::BLOCKSIZE/sizeof(word32);
983  length -= SHA256::BLOCKSIZE;
984  }
985 
986  // Save state
987  vst1q_u32(&state[0], STATE0);
988  vst1q_u32(&state[4], STATE1);
989 }
990 #endif // CRYPTOPP_ARM_SHA2_AVAILABLE
991 
992 ///////////////////////////////////////////////////////////
993 // end of Walton, Schneiders, O'Rourke and Hovsmith code //
994 ///////////////////////////////////////////////////////////
995 
996 // ***************** Power8 SHA ********************
997 
998 //////////////////////////////////////////////////
999 // start Gustavo, Serra, Scalet and Walton code //
1000 //////////////////////////////////////////////////
1001 
1002 #if CRYPTOPP_POWER8_SHA_AVAILABLE
1003 
1004 // Indexes into the S[] array
1005 enum {A=0, B=1, C, D, E, F, G, H};
1006 
1007 inline
1008 uint32x4_p VecLoad32(const word32* data, int offset)
1009 {
1010 #if (CRYPTOPP_LITTLE_ENDIAN)
1011  const uint8x16_p mask = {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12};
1012  const uint32x4_p val = VecLoad(offset, data);
1013  return (uint32x4_p)VecPermute(val, val, mask);
1014 #else
1015  return VecLoad(offset, data);
1016 #endif
1017 }
1018 
1019 template<class T> inline
1020 void VecStore32(const T data, word32 dest[4])
1021 {
1022  VecStore(data, dest);
1023 }
1024 
1025 inline
1026 uint32x4_p VectorCh(const uint32x4_p x, const uint32x4_p y, const uint32x4_p z)
1027 {
1028  // The trick below is due to Andy Polyakov and Jack Lloyd
1029  return vec_sel(z,y,x);
1030 }
1031 
1032 inline
1033 uint32x4_p VectorMaj(const uint32x4_p x, const uint32x4_p y, const uint32x4_p z)
1034 {
1035  // The trick below is due to Andy Polyakov and Jack Lloyd
1036  return vec_sel(y, z, VecXor(x, y));
1037 }
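// vec_sel(z,y,x) takes each bit from y where the corresponding bit of x is
// 1 and from z where it is 0, which is exactly Ch(x,y,z) = (x&y)^(~x&z).
// For the majority function, vec_sel(y, z, x^y) returns the common value of
// x and y where they agree and z where they differ, which equals Maj(x,y,z).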
1038 
1039 inline
1040 uint32x4_p Vector_sigma0(const uint32x4_p val)
1041 {
1042  return VecSHA256<0,0>(val);
1043 }
1044 
1045 inline
1046 uint32x4_p Vector_sigma1(const uint32x4_p val)
1047 {
1048  return VecSHA256<0,0xf>(val);
1049 }
1050 
1051 inline
1052 uint32x4_p VectorSigma0(const uint32x4_p val)
1053 {
1054  return VecSHA256<1,0>(val);
1055 }
1056 
1057 inline
1058 uint32x4_p VectorSigma1(const uint32x4_p val)
1059 {
1060  return VecSHA256<1,0xf>(val);
1061 }
1062 
1063 inline
1064 uint32x4_p VectorPack(const uint32x4_p a, const uint32x4_p b,
1065  const uint32x4_p c, const uint32x4_p d)
1066 {
1067  const uint8x16_p m1 = {0,1,2,3, 16,17,18,19, 0,0,0,0, 0,0,0,0};
1068  const uint8x16_p m2 = {0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23};
1069  return VecPermute(VecPermute(a,b,m1), VecPermute(c,d,m1), m2);
1070 }
1071 
1072 template <unsigned int R> inline
1073 void SHA256_ROUND1(uint32x4_p W[16], uint32x4_p S[8], const uint32x4_p K, const uint32x4_p M)
1074 {
1075  uint32x4_p T1, T2;
1076 
1077  W[R] = M;
1078  T1 = S[H] + VectorSigma1(S[E]) + VectorCh(S[E],S[F],S[G]) + K + M;
1079  T2 = VectorSigma0(S[A]) + VectorMaj(S[A],S[B],S[C]);
1080 
1081  S[H] = S[G]; S[G] = S[F]; S[F] = S[E];
1082  S[E] = S[D] + T1;
1083  S[D] = S[C]; S[C] = S[B]; S[B] = S[A];
1084  S[A] = T1 + T2;
1085 }
1086 
1087 template <unsigned int R> inline
1088 void SHA256_ROUND2(uint32x4_p W[16], uint32x4_p S[8], const uint32x4_p K)
1089 {
1090  // Indexes into the W[] array
1091  enum {IDX0=(R+0)&0xf, IDX1=(R+1)&0xf, IDX9=(R+9)&0xf, IDX14=(R+14)&0xf};
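 // This updates the 16-entry circular message schedule per FIPS 180-4:
 //   W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16]
 // where IDX14 = t-2, IDX9 = t-7, IDX1 = t-15 and IDX0 = t-16 (mod 16).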
1092 
1093  const uint32x4_p s0 = Vector_sigma0(W[IDX1]);
1094  const uint32x4_p s1 = Vector_sigma1(W[IDX14]);
1095 
1096  uint32x4_p T1 = (W[IDX0] += s0 + s1 + W[IDX9]);
1097  T1 += S[H] + VectorSigma1(S[E]) + VectorCh(S[E],S[F],S[G]) + K;
1098  uint32x4_p T2 = VectorSigma0(S[A]) + VectorMaj(S[A],S[B],S[C]);
1099 
1100  S[H] = S[G]; S[G] = S[F]; S[F] = S[E];
1101  S[E] = S[D] + T1;
1102  S[D] = S[C]; S[C] = S[B]; S[B] = S[A];
1103  S[A] = T1 + T2;
1104 }
1105 
1106 void SHA256_HashMultipleBlocks_POWER8(word32 *state, const word32 *data, size_t length, ByteOrder order)
1107 {
1108  CRYPTOPP_ASSERT(state); CRYPTOPP_ASSERT(data);
1109  CRYPTOPP_ASSERT(length >= SHA256::BLOCKSIZE);
1110  CRYPTOPP_UNUSED(order);
1111 
1112  const uint32_t* k = reinterpret_cast<const uint32_t*>(SHA256_K);
1113  const uint32_t* m = reinterpret_cast<const uint32_t*>(data);
1114 
1115  uint32x4_p abcd = VecLoad(state+0);
1116  uint32x4_p efgh = VecLoad(state+4);
1117  uint32x4_p W[16], S[8], vm, vk;
1118 
1119  size_t blocks = length / SHA256::BLOCKSIZE;
1120  while (blocks--)
1121  {
1122  unsigned int offset=0;
1123 
1124  S[A] = abcd; S[E] = efgh;
1125  S[B] = VecShiftLeftOctet<4>(S[A]);
1126  S[F] = VecShiftLeftOctet<4>(S[E]);
1127  S[C] = VecShiftLeftOctet<4>(S[B]);
1128  S[G] = VecShiftLeftOctet<4>(S[F]);
1129  S[D] = VecShiftLeftOctet<4>(S[C]);
1130  S[H] = VecShiftLeftOctet<4>(S[G]);
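
 // Each working variable is carried in its own vector: the octet shifts
 // peel b, c and d out of the packed abcd value (and f, g, h out of efgh),
 // and VectorPack() gathers a..d and e..h back together after the rounds.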
1131 
1132  // Rounds 0-15
1133  vk = VecLoad(offset, k);
1134  vm = VecLoad32(m, offset);
1135  SHA256_ROUND1<0>(W,S, vk,vm);
1136  offset+=16;
1137 
1138  vk = VecShiftLeftOctet<4>(vk);
1139  vm = VecShiftLeftOctet<4>(vm);
1140  SHA256_ROUND1<1>(W,S, vk,vm);
1141 
1142  vk = VecShiftLeftOctet<4>(vk);
1143  vm = VecShiftLeftOctet<4>(vm);
1144  SHA256_ROUND1<2>(W,S, vk,vm);
1145 
1146  vk = VecShiftLeftOctet<4>(vk);
1147  vm = VecShiftLeftOctet<4>(vm);
1148  SHA256_ROUND1<3>(W,S, vk,vm);
1149 
1150  vk = VecLoad(offset, k);
1151  vm = VecLoad32(m, offset);
1152  SHA256_ROUND1<4>(W,S, vk,vm);
1153  offset+=16;
1154 
1155  vk = VecShiftLeftOctet<4>(vk);
1156  vm = VecShiftLeftOctet<4>(vm);
1157  SHA256_ROUND1<5>(W,S, vk,vm);
1158 
1159  vk = VecShiftLeftOctet<4>(vk);
1160  vm = VecShiftLeftOctet<4>(vm);
1161  SHA256_ROUND1<6>(W,S, vk,vm);
1162 
1163  vk = VecShiftLeftOctet<4>(vk);
1164  vm = VecShiftLeftOctet<4>(vm);
1165  SHA256_ROUND1<7>(W,S, vk,vm);
1166 
1167  vk = VecLoad(offset, k);
1168  vm = VecLoad32(m, offset);
1169  SHA256_ROUND1<8>(W,S, vk,vm);
1170  offset+=16;
1171 
1172  vk = VecShiftLeftOctet<4>(vk);
1173  vm = VecShiftLeftOctet<4>(vm);
1174  SHA256_ROUND1<9>(W,S, vk,vm);
1175 
1176  vk = VecShiftLeftOctet<4>(vk);
1177  vm = VecShiftLeftOctet<4>(vm);
1178  SHA256_ROUND1<10>(W,S, vk,vm);
1179 
1180  vk = VecShiftLeftOctet<4>(vk);
1181  vm = VecShiftLeftOctet<4>(vm);
1182  SHA256_ROUND1<11>(W,S, vk,vm);
1183 
1184  vk = VecLoad(offset, k);
1185  vm = VecLoad32(m, offset);
1186  SHA256_ROUND1<12>(W,S, vk,vm);
1187  offset+=16;
1188 
1189  vk = VecShiftLeftOctet<4>(vk);
1190  vm = VecShiftLeftOctet<4>(vm);
1191  SHA256_ROUND1<13>(W,S, vk,vm);
1192 
1193  vk = VecShiftLeftOctet<4>(vk);
1194  vm = VecShiftLeftOctet<4>(vm);
1195  SHA256_ROUND1<14>(W,S, vk,vm);
1196 
1197  vk = VecShiftLeftOctet<4>(vk);
1198  vm = VecShiftLeftOctet<4>(vm);
1199  SHA256_ROUND1<15>(W,S, vk,vm);
1200 
1201  m += 16; // 32-bit words, not bytes
1202 
1203  // Rounds 16-63
1204  for (unsigned int i=16; i<64; i+=16)
1205  {
1206  vk = VecLoad(offset, k);
1207  SHA256_ROUND2<0>(W,S, vk);
1208  SHA256_ROUND2<1>(W,S, VecShiftLeftOctet<4>(vk));
1209  SHA256_ROUND2<2>(W,S, VecShiftLeftOctet<8>(vk));
1210  SHA256_ROUND2<3>(W,S, VecShiftLeftOctet<12>(vk));
1211  offset+=16;
1212 
1213  vk = VecLoad(offset, k);
1214  SHA256_ROUND2<4>(W,S, vk);
1215  SHA256_ROUND2<5>(W,S, VecShiftLeftOctet<4>(vk));
1216  SHA256_ROUND2<6>(W,S, VecShiftLeftOctet<8>(vk));
1217  SHA256_ROUND2<7>(W,S, VecShiftLeftOctet<12>(vk));
1218  offset+=16;
1219 
1220  vk = VecLoad(offset, k);
1221  SHA256_ROUND2<8>(W,S, vk);
1222  SHA256_ROUND2<9>(W,S, VecShiftLeftOctet<4>(vk));
1223  SHA256_ROUND2<10>(W,S, VecShiftLeftOctet<8>(vk));
1224  SHA256_ROUND2<11>(W,S, VecShiftLeftOctet<12>(vk));
1225  offset+=16;
1226 
1227  vk = VecLoad(offset, k);
1228  SHA256_ROUND2<12>(W,S, vk);
1229  SHA256_ROUND2<13>(W,S, VecShiftLeftOctet<4>(vk));
1230  SHA256_ROUND2<14>(W,S, VecShiftLeftOctet<8>(vk));
1231  SHA256_ROUND2<15>(W,S, VecShiftLeftOctet<12>(vk));
1232  offset+=16;
1233  }
1234 
1235  abcd += VectorPack(S[A],S[B],S[C],S[D]);
1236  efgh += VectorPack(S[E],S[F],S[G],S[H]);
1237  }
1238 
1239  VecStore32(abcd, state+0);
1240  VecStore32(efgh, state+4);
1241 }
1242 
1243 inline
1244 void VecStore64(const uint64x2_p val, word64* data)
1245 {
1246  VecStore(val, data);
1247 }
1248 
1249 inline
1250 uint64x2_p VecLoad64(const word64* data, int offset)
1251 {
1252 #if (CRYPTOPP_LITTLE_ENDIAN)
1253  const uint8x16_p mask = {0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15};
1254  return VecPermute(VecLoad(offset, data), mask);
1255 #else
1256  return VecLoad(offset, data);
1257 #endif
1258 }
1259 
1260 inline
1261 uint64x2_p VectorCh(const uint64x2_p x, const uint64x2_p y, const uint64x2_p z)
1262 {
1263  // The trick below is due to Andy Polyakov and Jack Lloyd
1264  return vec_sel(z,y,x);
1265 }
1266 
1267 inline
1268 uint64x2_p VectorMaj(const uint64x2_p x, const uint64x2_p y, const uint64x2_p z)
1269 {
1270  // The trick below is due to Andy Polyakov and Jack Lloyd
1271  return vec_sel(y, z, VecXor(x, y));
1272 }
1273 
1274 inline
1275 uint64x2_p Vector_sigma0(const uint64x2_p val)
1276 {
1277  return VecSHA512<0,0>(val);
1278 }
1279 
1280 inline
1281 uint64x2_p Vector_sigma1(const uint64x2_p val)
1282 {
1283  return VecSHA512<0,0xf>(val);
1284 }
1285 
1286 inline
1287 uint64x2_p VectorSigma0(const uint64x2_p val)
1288 {
1289  return VecSHA512<1,0>(val);
1290 }
1291 
1292 inline
1293 uint64x2_p VectorSigma1(const uint64x2_p val)
1294 {
1295  return VecSHA512<1,0xf>(val);
1296 }
1297 
1298 inline
1299 uint64x2_p VectorPack(const uint64x2_p x, const uint64x2_p y)
1300 {
1301  const uint8x16_p m = {0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23};
1302  return VecPermute(x,y,m);
1303 }
1304 
1305 template <unsigned int R> inline
1306 void SHA512_ROUND1(uint64x2_p W[16], uint64x2_p S[8], const uint64x2_p K, const uint64x2_p M)
1307 {
1308  uint64x2_p T1, T2;
1309 
1310  W[R] = M;
1311  T1 = S[H] + VectorSigma1(S[E]) + VectorCh(S[E],S[F],S[G]) + K + M;
1312  T2 = VectorSigma0(S[A]) + VectorMaj(S[A],S[B],S[C]);
1313 
1314  S[H] = S[G]; S[G] = S[F]; S[F] = S[E];
1315  S[E] = S[D] + T1;
1316  S[D] = S[C]; S[C] = S[B]; S[B] = S[A];
1317  S[A] = T1 + T2;
1318 }
1319 
1320 template <unsigned int R> inline
1321 void SHA512_ROUND2(uint64x2_p W[16], uint64x2_p S[8], const uint64x2_p K)
1322 {
1323  // Indexes into the W[] array
1324  enum {IDX0=(R+0)&0xf, IDX1=(R+1)&0xf, IDX9=(R+9)&0xf, IDX14=(R+14)&0xf};
1325 
1326  const uint64x2_p s0 = Vector_sigma0(W[IDX1]);
1327  const uint64x2_p s1 = Vector_sigma1(W[IDX14]);
1328 
1329  uint64x2_p T1 = (W[IDX0] += s0 + s1 + W[IDX9]);
1330  T1 += S[H] + VectorSigma1(S[E]) + VectorCh(S[E],S[F],S[G]) + K;
1331  uint64x2_p T2 = VectorSigma0(S[A]) + VectorMaj(S[A],S[B],S[C]);
1332 
1333  S[H] = S[G]; S[G] = S[F]; S[F] = S[E];
1334  S[E] = S[D] + T1;
1335  S[D] = S[C]; S[C] = S[B]; S[B] = S[A];
1336  S[A] = T1 + T2;
1337 }
1338 
1339 void SHA512_HashMultipleBlocks_POWER8(word64 *state, const word64 *data, size_t length, ByteOrder order)
1340 {
1341  CRYPTOPP_ASSERT(state); CRYPTOPP_ASSERT(data);
1342  CRYPTOPP_ASSERT(length >= SHA512::BLOCKSIZE);
1343  CRYPTOPP_UNUSED(order);
1344 
1345  const uint64_t* k = reinterpret_cast<const uint64_t*>(SHA512_K);
1346  const uint64_t* m = reinterpret_cast<const uint64_t*>(data);
1347 
1348  uint64x2_p ab = VecLoad(state+0);
1349  uint64x2_p cd = VecLoad(state+2);
1350  uint64x2_p ef = VecLoad(state+4);
1351  uint64x2_p gh = VecLoad(state+6);
1352  uint64x2_p W[16], S[8], vm, vk;
1353 
1354  size_t blocks = length / SHA512::BLOCKSIZE;
1355  while (blocks--)
1356  {
1357  unsigned int offset=0;
1358 
1359  S[A] = ab; S[C] = cd;
1360  S[E] = ef; S[G] = gh;
1361  S[B] = VecShiftLeftOctet<8>(S[A]);
1362  S[D] = VecShiftLeftOctet<8>(S[C]);
1363  S[F] = VecShiftLeftOctet<8>(S[E]);
1364  S[H] = VecShiftLeftOctet<8>(S[G]);
1365 
1366  // Rounds 0-15
1367  vk = VecLoad(offset, k);
1368  vm = VecLoad64(m, offset);
1369  SHA512_ROUND1<0>(W,S, vk,vm);
1370  offset+=16;
1371 
1372  vk = VecShiftLeftOctet<8>(vk);
1373  vm = VecShiftLeftOctet<8>(vm);
1374  SHA512_ROUND1<1>(W,S, vk,vm);
1375 
1376  vk = VecLoad(offset, k);
1377  vm = VecLoad64(m, offset);
1378  SHA512_ROUND1<2>(W,S, vk,vm);
1379  offset+=16;
1380 
1381  vk = VecShiftLeftOctet<8>(vk);
1382  vm = VecShiftLeftOctet<8>(vm);
1383  SHA512_ROUND1<3>(W,S, vk,vm);
1384 
1385  vk = VecLoad(offset, k);
1386  vm = VecLoad64(m, offset);
1387  SHA512_ROUND1<4>(W,S, vk,vm);
1388  offset+=16;
1389 
1390  vk = VecShiftLeftOctet<8>(vk);
1391  vm = VecShiftLeftOctet<8>(vm);
1392  SHA512_ROUND1<5>(W,S, vk,vm);
1393 
1394  vk = VecLoad(offset, k);
1395  vm = VecLoad64(m, offset);
1396  SHA512_ROUND1<6>(W,S, vk,vm);
1397  offset+=16;
1398 
1399  vk = VecShiftLeftOctet<8>(vk);
1400  vm = VecShiftLeftOctet<8>(vm);
1401  SHA512_ROUND1<7>(W,S, vk,vm);
1402 
1403  vk = VecLoad(offset, k);
1404  vm = VecLoad64(m, offset);
1405  SHA512_ROUND1<8>(W,S, vk,vm);
1406  offset+=16;
1407 
1408  vk = VecShiftLeftOctet<8>(vk);
1409  vm = VecShiftLeftOctet<8>(vm);
1410  SHA512_ROUND1<9>(W,S, vk,vm);
1411 
1412  vk = VecLoad(offset, k);
1413  vm = VecLoad64(m, offset);
1414  SHA512_ROUND1<10>(W,S, vk,vm);
1415  offset+=16;
1416 
1417  vk = VecShiftLeftOctet<8>(vk);
1418  vm = VecShiftLeftOctet<8>(vm);
1419  SHA512_ROUND1<11>(W,S, vk,vm);
1420 
1421  vk = VecLoad(offset, k);
1422  vm = VecLoad64(m, offset);
1423  SHA512_ROUND1<12>(W,S, vk,vm);
1424  offset+=16;
1425 
1426  vk = VecShiftLeftOctet<8>(vk);
1427  vm = VecShiftLeftOctet<8>(vm);
1428  SHA512_ROUND1<13>(W,S, vk,vm);
1429 
1430  vk = VecLoad(offset, k);
1431  vm = VecLoad64(m, offset);
1432  SHA512_ROUND1<14>(W,S, vk,vm);
1433  offset+=16;
1434 
1435  vk = VecShiftLeftOctet<8>(vk);
1436  vm = VecShiftLeftOctet<8>(vm);
1437  SHA512_ROUND1<15>(W,S, vk,vm);
1438 
1439  m += 16; // 64-bit words, not bytes
1440 
1441  // Rounds 16-79
1442  for (unsigned int i=16; i<80; i+=16)
1443  {
1444  vk = VecLoad(offset, k);
1445  SHA512_ROUND2<0>(W,S, vk);
1446  SHA512_ROUND2<1>(W,S, VecShiftLeftOctet<8>(vk));
1447  offset+=16;
1448 
1449  vk = VecLoad(offset, k);
1450  SHA512_ROUND2<2>(W,S, vk);
1451  SHA512_ROUND2<3>(W,S, VecShiftLeftOctet<8>(vk));
1452  offset+=16;
1453 
1454  vk = VecLoad(offset, k);
1455  SHA512_ROUND2<4>(W,S, vk);
1456  SHA512_ROUND2<5>(W,S, VecShiftLeftOctet<8>(vk));
1457  offset+=16;
1458 
1459  vk = VecLoad(offset, k);
1460  SHA512_ROUND2<6>(W,S, vk);
1461  SHA512_ROUND2<7>(W,S, VecShiftLeftOctet<8>(vk));
1462  offset+=16;
1463 
1464  vk = VecLoad(offset, k);
1465  SHA512_ROUND2<8>(W,S, vk);
1466  SHA512_ROUND2<9>(W,S, VecShiftLeftOctet<8>(vk));
1467  offset+=16;
1468 
1469  vk = VecLoad(offset, k);
1470  SHA512_ROUND2<10>(W,S, vk);
1471  SHA512_ROUND2<11>(W,S, VecShiftLeftOctet<8>(vk));
1472  offset+=16;
1473 
1474  vk = VecLoad(offset, k);
1475  SHA512_ROUND2<12>(W,S, vk);
1476  SHA512_ROUND2<13>(W,S, VecShiftLeftOctet<8>(vk));
1477  offset+=16;
1478 
1479  vk = VecLoad(offset, k);
1480  SHA512_ROUND2<14>(W,S, vk);
1481  SHA512_ROUND2<15>(W,S, VecShiftLeftOctet<8>(vk));
1482  offset+=16;
1483  }
1484 
1485  ab += VectorPack(S[A],S[B]);
1486  cd += VectorPack(S[C],S[D]);
1487  ef += VectorPack(S[E],S[F]);
1488  gh += VectorPack(S[G],S[H]);
1489  }
1490 
1491  VecStore64(ab, state+0);
1492  VecStore64(cd, state+2);
1493  VecStore64(ef, state+4);
1494  VecStore64(gh, state+6);
1495 }
1496 
1497 #endif // CRYPTOPP_POWER8_SHA_AVAILABLE
1498 
1499 ////////////////////////////////////////////////
1500 // end Gustavo, Serra, Scalet and Walton code //
1501 ////////////////////////////////////////////////
1502 
1503 NAMESPACE_END