Crypto++  8.2
Free C++ class library of cryptographic schemes
gf2n_simd.cpp
// gf2n_simd.cpp - written and placed in the public domain by Jeffrey Walton.
// It is also based on PCLMULQDQ code by Jankowski, Laurent and
// O'Mahony from Intel (see the reference below).
//
// This source file uses intrinsics and built-ins to gain access to
// CLMUL, ARMv8a, and Power8 instructions. A separate source file is
// needed because additional CXXFLAGS are required to enable the
// appropriate instruction sets in some build configurations.
//
// Several speedups were taken from "Intel Polynomial Multiplication
// Instruction and its Usage for Elliptic Curve Cryptography" by
// Krzysztof Jankowski, Pierre Laurent and Aidan O'Mahony,
// https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/polynomial-multiplication-instructions-paper.pdf
// There may be more speedups available; see https://eprint.iacr.org/2011/589.pdf.
// The IACR paper performs some optimizations by hand that the compiler is
// expected to perform, like Common Subexpression Elimination to save
// on variables (among others). The compiler may miss those optimizations,
// so the IACR paper is still useful. However, that code is GPLv3, which
// makes it unusable for some users of the library...

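// Overview: each ISA-specific section below provides the same three building
// blocks, which the exported functions at the bottom of the file combine:
// a 128x128 -> 256-bit carryless multiply built from three 64x64 products
// (a Karatsuba-style decomposition), a 256x256 -> 512-bit carryless multiply
// built from three of those 128x128 multiplies, and a reduction of the
// 512-bit product modulo the GF(2^233) field polynomial, which appears to be
// the trinomial x^233 + x^74 + 1 given the shift counts used below.
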
#include "pch.h"
#include "config.h"

#ifndef CRYPTOPP_IMPORTS

#include "gf2n.h"

#if (CRYPTOPP_CLMUL_AVAILABLE)
# include <emmintrin.h>
# include <wmmintrin.h>
#endif

#if (CRYPTOPP_ARM_PMULL_AVAILABLE)
# include "arm_simd.h"
#endif

#if defined(CRYPTOPP_ALTIVEC_AVAILABLE)
# include "ppc_simd.h"
#endif

ANONYMOUS_NAMESPACE_BEGIN

// ************************** ARMv8 ************************** //

using CryptoPP::word;

#if (CRYPTOPP_ARM_PMULL_AVAILABLE)

// c1c0 = a * b
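// Note: PMULL_00 multiplies the low 64-bit lanes and PMULL_11 the high
// 64-bit lanes. The third product, of the XORed halves, supplies the
// Karatsuba middle term; the two vextq_u64 calls shift that 128-bit term
// left and right by 64 bits so it can be XORed across the c1:c0 boundary.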
inline void
F2N_Multiply_128x128_ARMv8(uint64x2_t& c1, uint64x2_t& c0, const uint64x2_t& a, const uint64x2_t& b)
{
    uint64x2_t t1, t2, z0={0};

    c0 = PMULL_00(a, b);
    c1 = PMULL_11(a, b);
    t1 = vmovq_n_u64(vgetq_lane_u64(a, 1));
    t1 = veorq_u64(a, t1);
    t2 = vmovq_n_u64(vgetq_lane_u64(b, 1));
    t2 = veorq_u64(b, t2);
    t1 = PMULL_00(t1, t2);
    t1 = veorq_u64(c0, t1);
    t1 = veorq_u64(c1, t1);
    t2 = t1;
    t1 = vextq_u64(z0, t1, 1);
    t2 = vextq_u64(t2, z0, 1);
    c0 = veorq_u64(c0, t1);
    c1 = veorq_u64(c1, t2);
}

// c3c2c1c0 = a1a0 * b1b0
inline void
F2N_Multiply_256x256_ARMv8(uint64x2_t& c3, uint64x2_t& c2, uint64x2_t& c1, uint64x2_t& c0,
    const uint64x2_t& b1, const uint64x2_t& b0, const uint64x2_t& a1, const uint64x2_t& a0)
{
    uint64x2_t c4, c5;
    uint64x2_t x0=a0, x1=a1, y0=b0, y1=b1;

    F2N_Multiply_128x128_ARMv8(c1, c0, x0, y0);
    F2N_Multiply_128x128_ARMv8(c3, c2, x1, y1);

    x0 = veorq_u64(x0, x1);
    y0 = veorq_u64(y0, y1);

    F2N_Multiply_128x128_ARMv8(c5, c4, x0, y0);

    c4 = veorq_u64(c4, c0);
    c4 = veorq_u64(c4, c2);
    c5 = veorq_u64(c5, c1);
    c5 = veorq_u64(c5, c3);
    c1 = veorq_u64(c1, c4);
    c2 = veorq_u64(c2, c5);
}

// c3c2c1c0 = a1a0 * a1a0
inline void
F2N_Square_256_ARMv8(uint64x2_t& c3, uint64x2_t& c2, uint64x2_t& c1,
    uint64x2_t& c0, const uint64x2_t& a1, const uint64x2_t& a0)
{
    c0 = PMULL_00(a0, a0);
    c1 = PMULL_11(a0, a0);
    c2 = PMULL_00(a1, a1);
    c3 = PMULL_11(a1, a1);
}

// x = (x << n), z = 0
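// Note: this helper performs a full 128-bit left shift by N (N < 64): both
// 64-bit lanes are shifted, and the bits that fall out of the low lane are
// recovered with a right shift and merged into the high lane.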
template <unsigned int N>
inline uint64x2_t ShiftLeft128_ARMv8(uint64x2_t x)
{
    uint64x2_t u=x, v, z={0};
    x = vshlq_n_u64(x, N);
    u = vshrq_n_u64(u, (64-N));
    v = vcombine_u64(vget_low_u64(z), vget_low_u64(u));
    x = vorrq_u64(x, v);
    return x;
}

// c1c0 = c3c2c1c0 MOD p. This is a Barrett reduction. See the Intel
// paper or https://github.com/antonblanchard/crc32-vpmsum for details.
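// Note: the constants below are consistent with the trinomial
// x^233 + x^74 + 1. Bit 233 sits at bit 41 of the fourth 64-bit word
// (233 mod 64 = 41, hence the shifts by 64-41 = 23), the x^74 term accounts
// for the shifts by 10 (74 mod 64 = 10), and the final mask keeps exactly
// 233 bits of the c1c0 result.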
inline void
GF2NT_233_Reduce_ARMv8(uint64x2_t& c3, uint64x2_t& c2, uint64x2_t& c1, uint64x2_t& c0)
{
    const unsigned int mask[4] = {
        0xffffffff, 0xffffffff, 0xffffffff, 0x000001ff,
    };

    uint64x2_t b3, b2, b1, /*b0,*/ a1, a0, m0, z0={0};
    m0 = vreinterpretq_u64_u32(vld1q_u32(mask));
    b1 = c1; a1 = c1;
    a0 = vcombine_u64(vget_low_u64(c1), vget_low_u64(z0));
    a1 = vshlq_n_u64(a1, 23);
    a1 = vshrq_n_u64(a1, 23);
    c1 = vorrq_u64(a1, a0);
    b2 = vshrq_n_u64(c2, (64-23));
    c3 = ShiftLeft128_ARMv8<23>(c3);
    a0 = vcombine_u64(vget_high_u64(b2), vget_high_u64(z0));
    c3 = vorrq_u64(c3, a0);
    b1 = vshrq_n_u64(b1, (64-23));
    c2 = ShiftLeft128_ARMv8<23>(c2);
    a0 = vcombine_u64(vget_high_u64(b1), vget_high_u64(z0));
    c2 = vorrq_u64(c2, a0);
    b3 = c3;
    b2 = vshrq_n_u64(c2, (64-10));
    b3 = ShiftLeft128_ARMv8<10>(b3);
    a0 = vcombine_u64(vget_high_u64(b2), vget_high_u64(z0));
    b3 = vorrq_u64(b3, a0);
    a0 = vcombine_u64(vget_high_u64(c3), vget_high_u64(z0));
    b3 = veorq_u64(b3, a0);
    b1 = vshrq_n_u64(b3, (64-23));
    b3 = ShiftLeft128_ARMv8<23>(b3);
    b3 = vcombine_u64(vget_high_u64(b3), vget_high_u64(z0));
    b3 = vorrq_u64(b3, b1);
    c2 = veorq_u64(c2, b3);
    b3 = c3;
    b2 = vshrq_n_u64(c2, (64-10));
    b3 = ShiftLeft128_ARMv8<10>(b3);
    b2 = vcombine_u64(vget_high_u64(b2), vget_high_u64(z0));
    b3 = vorrq_u64(b3, b2);
    b2 = c2;
    b2 = ShiftLeft128_ARMv8<10>(b2);
    a0 = vcombine_u64(vget_low_u64(z0), vget_low_u64(b2));
    c2 = veorq_u64(c2, a0);
    a0 = vcombine_u64(vget_low_u64(z0), vget_low_u64(b3));
    a1 = vcombine_u64(vget_high_u64(b2), vget_high_u64(z0));
    a0 = vorrq_u64(a0, a1);
    c3 = veorq_u64(c3, a0);
    c0 = veorq_u64(c0, c2);
    c1 = veorq_u64(c1, c3);
    c1 = vandq_u64(c1, m0);
}

#endif

// ************************** SSE ************************** //

#if (CRYPTOPP_CLMUL_AVAILABLE)

using CryptoPP::word;

// c1c0 = a * b
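// Note: the _mm_clmulepi64_si128 immediate selects the 64-bit halves that
// are multiplied: 0x00 is low(a)*low(b) and 0x11 is high(a)*high(b).
// _mm_shuffle_epi32 with 0xEE broadcasts the high 64-bit half so it can be
// XORed with the low half, and the 8-byte _mm_slli_si128/_mm_srli_si128
// shifts place the Karatsuba middle term across the c1:c0 boundary,
// mirroring the ARMv8 version above.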
inline void
F2N_Multiply_128x128_CLMUL(__m128i& c1, __m128i& c0, const __m128i& a, const __m128i& b)
{
    __m128i t1, t2;

    c0 = _mm_clmulepi64_si128(a, b, 0x00);
    c1 = _mm_clmulepi64_si128(a, b, 0x11);
    t1 = _mm_shuffle_epi32(a, 0xEE);
    t1 = _mm_xor_si128(a, t1);
    t2 = _mm_shuffle_epi32(b, 0xEE);
    t2 = _mm_xor_si128(b, t2);
    t1 = _mm_clmulepi64_si128(t1, t2, 0x00);
    t1 = _mm_xor_si128(c0, t1);
    t1 = _mm_xor_si128(c1, t1);
    t2 = t1;
    t1 = _mm_slli_si128(t1, 8);
    t2 = _mm_srli_si128(t2, 8);
    c0 = _mm_xor_si128(c0, t1);
    c1 = _mm_xor_si128(c1, t2);
}

// c3c2c1c0 = a1a0 * b1b0
inline void
F2N_Multiply_256x256_CLMUL(__m128i& c3, __m128i& c2, __m128i& c1, __m128i& c0,
    const __m128i& b1, const __m128i& b0, const __m128i& a1, const __m128i& a0)
{
    __m128i c4, c5;
    __m128i x0=a0, x1=a1, y0=b0, y1=b1;

    F2N_Multiply_128x128_CLMUL(c1, c0, x0, y0);
    F2N_Multiply_128x128_CLMUL(c3, c2, x1, y1);

    x0 = _mm_xor_si128(x0, x1);
    y0 = _mm_xor_si128(y0, y1);

    F2N_Multiply_128x128_CLMUL(c5, c4, x0, y0);

    c4 = _mm_xor_si128(c4, c0);
    c4 = _mm_xor_si128(c4, c2);
    c5 = _mm_xor_si128(c5, c1);
    c5 = _mm_xor_si128(c5, c3);
    c1 = _mm_xor_si128(c1, c4);
    c2 = _mm_xor_si128(c2, c5);
}

// c3c2c1c0 = a1a0 * a1a0
inline void
F2N_Square_256_CLMUL(__m128i& c3, __m128i& c2, __m128i& c1,
    __m128i& c0, const __m128i& a1, const __m128i& a0)
{
    c0 = _mm_clmulepi64_si128(a0, a0, 0x00);
    c1 = _mm_clmulepi64_si128(a0, a0, 0x11);
    c2 = _mm_clmulepi64_si128(a1, a1, 0x00);
    c3 = _mm_clmulepi64_si128(a1, a1, 0x11);
}

// x = (x << n), z = 0
template <unsigned int N>
inline __m128i ShiftLeft128_SSE(__m128i x, const __m128i& z)
{
    __m128i u=x, v;
    x = _mm_slli_epi64(x, N);
    u = _mm_srli_epi64(u, (64-N));
    v = _mm_unpacklo_epi64(z, u);
    x = _mm_or_si128(x, v);
    return x;
}

// c1c0 = c3c2c1c0 MOD p. This is a Barrett reduction. See the Intel
// paper or https://github.com/antonblanchard/crc32-vpmsum for details.
inline void
GF2NT_233_Reduce_CLMUL(__m128i& c3, __m128i& c2, __m128i& c1, __m128i& c0)
{
    const unsigned int m[4] = {
        0xffffffff, 0xffffffff, 0xffffffff, 0x000001ff
    };

    __m128i b3, b2, b1, /*b0,*/ a1, a0, m0, z0;
    m0 = _mm_set_epi32(m[3], m[2], m[1], m[0]);
    z0 = _mm_setzero_si128();
    b1 = c1; a1 = c1;
    a0 = _mm_move_epi64(c1);
    a1 = _mm_slli_epi64(a1, 23);
    a1 = _mm_srli_epi64(a1, 23);
    c1 = _mm_or_si128(a1, a0);
    b2 = _mm_srli_epi64(c2, (64-23));
    c3 = ShiftLeft128_SSE<23>(c3, z0);
    a0 = _mm_unpackhi_epi64(b2, z0);
    c3 = _mm_or_si128(c3, a0);
    b1 = _mm_srli_epi64(b1, (64-23));
    c2 = ShiftLeft128_SSE<23>(c2, z0);
    a0 = _mm_unpackhi_epi64(b1, z0);
    c2 = _mm_or_si128(c2, a0);
    b3 = c3;
    b2 = _mm_srli_epi64(c2, (64-10));
    b3 = ShiftLeft128_SSE<10>(b3, z0);
    a0 = _mm_unpackhi_epi64(b2, z0);
    b3 = _mm_or_si128(b3, a0);
    a0 = _mm_unpackhi_epi64(c3, z0);
    b3 = _mm_xor_si128(b3, a0);
    b1 = _mm_srli_epi64(b3, (64-23));
    b3 = ShiftLeft128_SSE<23>(b3, z0);
    b3 = _mm_unpackhi_epi64(b3, z0);
    b3 = _mm_or_si128(b3, b1);
    c2 = _mm_xor_si128(c2, b3);
    b3 = c3;
    b2 = _mm_srli_epi64(c2, (64-10));
    b3 = ShiftLeft128_SSE<10>(b3, z0);
    b2 = _mm_unpackhi_epi64(b2, z0);
    b3 = _mm_or_si128(b3, b2);
    b2 = c2;
    b2 = ShiftLeft128_SSE<10>(b2, z0);
    a0 = _mm_unpacklo_epi64(z0, b2);
    c2 = _mm_xor_si128(c2, a0);
    a0 = _mm_unpacklo_epi64(z0, b3);
    a1 = _mm_unpackhi_epi64(b2, z0);
    a0 = _mm_or_si128(a0, a1);
    c3 = _mm_xor_si128(c3, a0);
    c0 = _mm_xor_si128(c0, c2);
    c1 = _mm_xor_si128(c1, c3);
    c1 = _mm_and_si128(c1, m0);
}

#endif

// ************************* Power8 ************************* //

#if (CRYPTOPP_POWER8_VMULL_AVAILABLE)

using CryptoPP::byte;
using CryptoPP::word;
using CryptoPP::uint8x16_p;
using CryptoPP::uint64x2_p;

using CryptoPP::VecLoad;
using CryptoPP::VecStore;

using CryptoPP::VecOr;
using CryptoPP::VecXor;
using CryptoPP::VecAnd;

using CryptoPP::VecPermute;
using CryptoPP::VecMergeLow;
using CryptoPP::VecMergeHigh;
using CryptoPP::VecShiftLeft;
using CryptoPP::VecShiftRight;

using CryptoPP::VecPolyMultiply00LE;
using CryptoPP::VecPolyMultiply11LE;

// c1c0 = a * b
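// Note: same Karatsuba scheme as the ARMv8 and SSE versions above.
// VecPolyMultiply00LE and VecPolyMultiply11LE multiply the low and high
// 64-bit lanes, and VecMergeHigh/VecMergeLow with a zero vector perform the
// 64-bit shifts that place the middle term across the c1:c0 boundary.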
inline void
F2N_Multiply_128x128_POWER8(uint64x2_p& c1, uint64x2_p& c0, const uint64x2_p& a, const uint64x2_p& b)
{
    uint64x2_p t1, t2;
    const uint64x2_p z0={0};

    c0 = VecPolyMultiply00LE(a, b);
    c1 = VecPolyMultiply11LE(a, b);
    t1 = VecMergeLow(a, a);
    t1 = VecXor(a, t1);
    t2 = VecMergeLow(b, b);
    t2 = VecXor(b, t2);
    t1 = VecPolyMultiply00LE(t1, t2);
    t1 = VecXor(c0, t1);
    t1 = VecXor(c1, t1);
    t2 = t1;
    t1 = VecMergeHigh(z0, t1);
    t2 = VecMergeLow(t2, z0);
    c0 = VecXor(c0, t1);
    c1 = VecXor(c1, t2);
}

// c3c2c1c0 = a1a0 * b1b0
inline void
F2N_Multiply_256x256_POWER8(uint64x2_p& c3, uint64x2_p& c2, uint64x2_p& c1, uint64x2_p& c0,
    const uint64x2_p& b1, const uint64x2_p& b0, const uint64x2_p& a1, const uint64x2_p& a0)
{
    uint64x2_p c4, c5;
    uint64x2_p x0=a0, x1=a1, y0=b0, y1=b1;

    F2N_Multiply_128x128_POWER8(c1, c0, x0, y0);
    F2N_Multiply_128x128_POWER8(c3, c2, x1, y1);

    x0 = VecXor(x0, x1);
    y0 = VecXor(y0, y1);

    F2N_Multiply_128x128_POWER8(c5, c4, x0, y0);

    c4 = VecXor(c4, c0);
    c4 = VecXor(c4, c2);
    c5 = VecXor(c5, c1);
    c5 = VecXor(c5, c3);
    c1 = VecXor(c1, c4);
    c2 = VecXor(c2, c5);
}

// c3c2c1c0 = a1a0 * a1a0
inline void
F2N_Square_256_POWER8(uint64x2_p& c3, uint64x2_p& c2, uint64x2_p& c1,
    uint64x2_p& c0, const uint64x2_p& a1, const uint64x2_p& a0)
{
    c0 = VecPolyMultiply00LE(a0, a0);
    c1 = VecPolyMultiply11LE(a0, a0);
    c2 = VecPolyMultiply00LE(a1, a1);
    c3 = VecPolyMultiply11LE(a1, a1);
}

// x = (x << n), z = 0
template <unsigned int N>
inline uint64x2_p ShiftLeft128_POWER8(uint64x2_p x)
{
    uint64x2_p u=x, v;
    const uint64x2_p z={0};

    x = VecShiftLeft<N>(x);
    u = VecShiftRight<64-N>(u);
    v = VecMergeHigh(z, u);
    x = VecOr(x, v);
    return x;
}

// c1c0 = c3c2c1c0 MOD p. This is a Barrett reduction. See the Intel
// paper or https://github.com/antonblanchard/crc32-vpmsum for details.
inline void
GF2NT_233_Reduce_POWER8(uint64x2_p& c3, uint64x2_p& c2, uint64x2_p& c1, uint64x2_p& c0)
{
    const uint64_t mod[] = {W64LIT(0xffffffffffffffff), W64LIT(0x01ffffffffff)};
    const uint64x2_p m0 = (uint64x2_p)VecLoad(mod);

    uint64x2_p b3, b2, b1, /*b0,*/ a1, a0;
    const uint64x2_p z0={0};

    b1 = c1; a1 = c1;
    a0 = VecMergeHigh(c1, z0);
    a1 = VecShiftLeft<23>(a1);
    a1 = VecShiftRight<23>(a1);
    c1 = VecOr(a1, a0);
    b2 = VecShiftRight<64-23>(c2);
    c3 = ShiftLeft128_POWER8<23>(c3);
    a0 = VecMergeLow(b2, z0);
    c3 = VecOr(c3, a0);
    b1 = VecShiftRight<64-23>(b1);
    c2 = ShiftLeft128_POWER8<23>(c2);
    a0 = VecMergeLow(b1, z0);
    c2 = VecOr(c2, a0);
    b3 = c3;
    b2 = VecShiftRight<64-10>(c2);
    b3 = ShiftLeft128_POWER8<10>(b3);
    a0 = VecMergeLow(b2, z0);
    b3 = VecOr(b3, a0);
    a0 = VecMergeLow(c3, z0);
    b3 = VecXor(b3, a0);
    b1 = VecShiftRight<64-23>(b3);
    b3 = ShiftLeft128_POWER8<23>(b3);
    b3 = VecMergeLow(b3, z0);
    b3 = VecOr(b3, b1);
    c2 = VecXor(c2, b3);
    b3 = c3;
    b2 = VecShiftRight<64-10>(c2);
    b3 = ShiftLeft128_POWER8<10>(b3);
    b2 = VecMergeLow(b2, z0);
    b3 = VecOr(b3, b2);
    b2 = c2;
    b2 = ShiftLeft128_POWER8<10>(b2);
    a0 = VecMergeHigh(z0, b2);
    c2 = VecXor(c2, a0);
    a0 = VecMergeHigh(z0, b3);
    a1 = VecMergeLow(b2, z0);
    a0 = VecOr(a0, a1);
    c3 = VecXor(c3, a0);
    c0 = VecXor(c0, c2);
    c1 = VecXor(c1, c3);
    c1 = VecAnd(c1, m0);
}

#endif

ANONYMOUS_NAMESPACE_END

NAMESPACE_BEGIN(CryptoPP)

#if (CRYPTOPP_CLMUL_AVAILABLE)

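// Each operand is read and written as two unaligned 128-bit vectors, so
// callers are expected to supply 32-byte buffers even though only 233 bits
// are significant. A minimal sketch of how this entry point is presumably
// dispatched (the actual call sites live in gf2n.cpp and are not shown in
// this file; HasCLMUL() is the CPU feature query from cpu.h):
//
//   // a, b, c point to 32-byte buffers holding the operands
//   if (HasCLMUL())
//       GF2NT_233_Multiply_Reduce_CLMUL(a, b, c);
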
void
GF2NT_233_Multiply_Reduce_CLMUL(const word* pA, const word* pB, word* pC)
{
    const __m128i* pAA = reinterpret_cast<const __m128i*>(pA);
    const __m128i* pBB = reinterpret_cast<const __m128i*>(pB);
    __m128i a0 = _mm_loadu_si128(pAA+0);
    __m128i a1 = _mm_loadu_si128(pAA+1);
    __m128i b0 = _mm_loadu_si128(pBB+0);
    __m128i b1 = _mm_loadu_si128(pBB+1);

    __m128i c0, c1, c2, c3;
    F2N_Multiply_256x256_CLMUL(c3, c2, c1, c0, a1, a0, b1, b0);
    GF2NT_233_Reduce_CLMUL(c3, c2, c1, c0);

    __m128i* pCC = reinterpret_cast<__m128i*>(pC);
    _mm_storeu_si128(pCC+0, c0);
    _mm_storeu_si128(pCC+1, c1);
}

void
GF2NT_233_Square_Reduce_CLMUL(const word* pA, word* pC)
{
    const __m128i* pAA = reinterpret_cast<const __m128i*>(pA);
    __m128i a0 = _mm_loadu_si128(pAA+0);
    __m128i a1 = _mm_loadu_si128(pAA+1);

    __m128i c0, c1, c2, c3;
    F2N_Square_256_CLMUL(c3, c2, c1, c0, a1, a0);
    GF2NT_233_Reduce_CLMUL(c3, c2, c1, c0);

    __m128i* pCC = reinterpret_cast<__m128i*>(pC);
    _mm_storeu_si128(pCC+0, c0);
    _mm_storeu_si128(pCC+1, c1);
}

#elif (CRYPTOPP_ARM_PMULL_AVAILABLE)

void
GF2NT_233_Multiply_Reduce_ARMv8(const word* pA, const word* pB, word* pC)
{
    // word is either 32-bit or 64-bit, depending on the platform.
    // Load using a 32-bit pointer to avoid possible alignment issues.
    const uint32_t* pAA = reinterpret_cast<const uint32_t*>(pA);
    const uint32_t* pBB = reinterpret_cast<const uint32_t*>(pB);

    uint64x2_t a0 = vreinterpretq_u64_u32(vld1q_u32(pAA+0));
    uint64x2_t a1 = vreinterpretq_u64_u32(vld1q_u32(pAA+4));
    uint64x2_t b0 = vreinterpretq_u64_u32(vld1q_u32(pBB+0));
    uint64x2_t b1 = vreinterpretq_u64_u32(vld1q_u32(pBB+4));

    uint64x2_t c0, c1, c2, c3;
    F2N_Multiply_256x256_ARMv8(c3, c2, c1, c0, a1, a0, b1, b0);
    GF2NT_233_Reduce_ARMv8(c3, c2, c1, c0);

    uint32_t* pCC = reinterpret_cast<uint32_t*>(pC);
    vst1q_u32(pCC+0, vreinterpretq_u32_u64(c0));
    vst1q_u32(pCC+4, vreinterpretq_u32_u64(c1));
}

void
GF2NT_233_Square_Reduce_ARMv8(const word* pA, word* pC)
{
    // word is either 32-bit or 64-bit, depending on the platform.
    // Load using a 32-bit pointer to avoid possible alignment issues.
    const uint32_t* pAA = reinterpret_cast<const uint32_t*>(pA);
    uint64x2_t a0 = vreinterpretq_u64_u32(vld1q_u32(pAA+0));
    uint64x2_t a1 = vreinterpretq_u64_u32(vld1q_u32(pAA+4));

    uint64x2_t c0, c1, c2, c3;
    F2N_Square_256_ARMv8(c3, c2, c1, c0, a1, a0);
    GF2NT_233_Reduce_ARMv8(c3, c2, c1, c0);

    uint32_t* pCC = reinterpret_cast<uint32_t*>(pC);
    vst1q_u32(pCC+0, vreinterpretq_u32_u64(c0));
    vst1q_u32(pCC+4, vreinterpretq_u32_u64(c1));
}

#elif (CRYPTOPP_POWER8_VMULL_AVAILABLE)

void
GF2NT_233_Multiply_Reduce_POWER8(const word* pA, const word* pB, word* pC)
{
    // word is either 32-bit or 64-bit, depending on the platform.
    // Load using a byte pointer to avoid possible alignment issues.
    const byte* pAA = reinterpret_cast<const byte*>(pA);
    const byte* pBB = reinterpret_cast<const byte*>(pB);

    uint64x2_p a0 = (uint64x2_p)VecLoad(pAA+0);
    uint64x2_p a1 = (uint64x2_p)VecLoad(pAA+16);
    uint64x2_p b0 = (uint64x2_p)VecLoad(pBB+0);
    uint64x2_p b1 = (uint64x2_p)VecLoad(pBB+16);

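    // Note: on big-endian targets the permute below swaps the two 32-bit
    // halves of each 64-bit lane, presumably so the carryless multiplies
    // see the same 64-bit lane values as on little-endian targets.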
#if (CRYPTOPP_BIG_ENDIAN)
    const uint8_t mb[] = {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11};
    const uint8x16_p m = (uint8x16_p)VecLoad(mb);
    a0 = VecPermute(a0, m);
    a1 = VecPermute(a1, m);
    b0 = VecPermute(b0, m);
    b1 = VecPermute(b1, m);
#endif

    uint64x2_p c0, c1, c2, c3;
    F2N_Multiply_256x256_POWER8(c3, c2, c1, c0, a1, a0, b1, b0);
    GF2NT_233_Reduce_POWER8(c3, c2, c1, c0);

#if (CRYPTOPP_BIG_ENDIAN)
    c0 = VecPermute(c0, m);
    c1 = VecPermute(c1, m);
#endif

    byte* pCC = reinterpret_cast<byte*>(pC);
    VecStore(c0, pCC+0);
    VecStore(c1, pCC+16);
}

void
GF2NT_233_Square_Reduce_POWER8(const word* pA, word* pC)
{
    // word is either 32-bit or 64-bit, depending on the platform.
    // Load using a byte pointer to avoid possible alignment issues.
    const byte* pAA = reinterpret_cast<const byte*>(pA);
    uint64x2_p a0 = (uint64x2_p)VecLoad(pAA+0);
    uint64x2_p a1 = (uint64x2_p)VecLoad(pAA+16);

#if (CRYPTOPP_BIG_ENDIAN)
    const uint8_t mb[] = {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11};
    const uint8x16_p m = (uint8x16_p)VecLoad(mb);
    a0 = VecPermute(a0, m);
    a1 = VecPermute(a1, m);
#endif

    uint64x2_p c0, c1, c2, c3;
    F2N_Square_256_POWER8(c3, c2, c1, c0, a1, a0);
    GF2NT_233_Reduce_POWER8(c3, c2, c1, c0);

#if (CRYPTOPP_BIG_ENDIAN)
    c0 = VecPermute(c0, m);
    c1 = VecPermute(c1, m);
#endif

    byte* pCC = reinterpret_cast<byte*>(pC);
    VecStore(c0, pCC+0);
    VecStore(c1, pCC+16);
}

#endif

NAMESPACE_END

#endif  // CRYPTOPP_IMPORTS