Crypto++  8.2
Free C++ class library of cryptographic schemes
ppc_simd.h
Go to the documentation of this file.
1 // ppc_simd.h - written and placed in public domain by Jeffrey Walton
2 
3 /// \file ppc_simd.h
4 /// \brief Support functions for PowerPC and vector operations
5 /// \details This header provides an agnostic interface into Clang, GCC
6 /// and IBM XL C/C++ compilers modulo their different built-in functions
7 /// for accessing vector instructions.
8 /// \details The abstractions are necessary to support back to GCC 4.8 and
9 /// XLC 11 and 12. GCC 4.8 and 4.9 are still popular, and they are the
10 /// default compiler for GCC112, GCC118 and others on the compile farm.
11 /// Older IBM XL C/C++ compilers also experience it due to lack of
12 /// <tt>vec_xl</tt> and <tt>vec_xst</tt> support on some platforms. Modern
13 /// compilers provide best support and don't need many of the hacks
14 /// below.
15 /// \details The library is tested with the following PowerPC machines and
16 /// compilers. GCC110, GCC111, GCC112, GCC119 and GCC135 are provided by
17 /// the <A HREF="https://cfarm.tetaneutral.net/">GCC Compile Farm</A>
18 /// - PowerMac G5, OSX 10.5, POWER4, Apple GCC 4.0
19 /// - PowerMac G5, OSX 10.5, POWER4, Macports GCC 5.0
20 /// - GCC110, Linux, POWER7, GCC 4.8.5
21 /// - GCC110, Linux, POWER7, XLC 12.01
22 /// - GCC111, AIX, POWER7, GCC 4.8.1
23 /// - GCC111, AIX, POWER7, XLC 12.01
24 /// - GCC112, Linux, POWER8, GCC 4.8.5
25 /// - GCC112, Linux, POWER8, XLC 13.01
26 /// - GCC112, Linux, POWER8, Clang 7.0
27 /// - GCC119, AIX, POWER8, GCC 7.2.0
28 /// - GCC119, AIX, POWER8, XLC 13.01
29 /// - GCC135, Linux, POWER9, GCC 7.0
30 /// \details 12 machines are used for testing because the three compilers form
31 /// five profiles. The profiles are listed below.
32 /// - GCC (Linux GCC, Macports GCC, etc. Consistent across machines)
33 /// - XLC 13.0 and earlier (all IBM components)
34 /// - XLC 13.1 and later on Linux (LLVM front-end, no compatibility macros)
35 /// - XLC 13.1 and later on Linux (LLVM front-end, -qxlcompatmacros option)
36 /// - LLVM Clang (traditional Clang compiler)
37 /// \details The LLVM front-end makes it tricky to write portable code because
38 /// LLVM pretends to be other compilers but cannot consume other compiler's
39 /// builtins. When using XLC with -qxlcompatmacros the compiler pretends to
40 /// be GCC, Clang and XLC all at once but it can only consume its variety
41 /// of builtins.
42 /// \details At Crypto++ 8.0 the various <tt>Vector{FuncName}</tt> were
43 /// renamed to <tt>Vec{FuncName}</tt>. For example, <tt>VectorAnd</tt> was
44 /// changed to <tt>VecAnd</tt>. The name change helped consolidate two
45 /// slightly different implementations.
46 /// \since Crypto++ 6.0, LLVM Clang compiler support since Crypto++ 8.0
47 
48 // Use __ALTIVEC__, _ARCH_PWR7 and _ARCH_PWR8 when detecting actual
49 // availability of the feature for the source file being compiled. The
50 // preprocessor macros depend on compiler options like -maltivec; and
51 // not compiler versions.
52 
53 // DO NOT USE this pattern in VecLoad and VecStore. We have to use the
54 // spaghetti code tangled in preprocessor macros because XLC 12 generates
55 // bad code in some places. To verify the bad code generation test on
56 // GCC111 with XLC 12.01 installed. XLC 13.01 on GCC112 and GCC119 are OK.
57 //
58 // inline uint32x4_p VecLoad(const byte src[16])
59 // {
60 // #if defined(_ARCH_PWR8)
61 // return (uint32x4_p) *(uint8x16_p*)((byte*)src);
62 // #else
63 // return VecLoad_ALTIVEC(src);
64 // #endif
65 // }
66 
67 #ifndef CRYPTOPP_PPC_CRYPTO_H
68 #define CRYPTOPP_PPC_CRYPTO_H
69 
70 #include "config.h"
71 #include "misc.h"
72 
73 #if defined(__ALTIVEC__)
74 # include <altivec.h>
75 # undef vector
76 # undef pixel
77 # undef bool
78 #endif
79 
80 // IBM XLC on AIX does not define __CRYPTO__ like it should with -qarch=pwr8.
81 // Crypto is available in XLC 13.1 and above. More LLVM front-end goodness.
82 #if defined(_AIX) && defined(_ARCH_PWR8) && (__xlC__ >= 0xd01)
83 # undef __CRYPTO__
84 # define __CRYPTO__ 1
85 #endif
86 
87 // Hack to detect early XLC compilers. XLC compilers for POWER7 use
88 // vec_xlw4 and vec_xstw4 (and ld2 variants); not vec_xl and vec_st.
89 // Some XLC compilers for POWER7 and above use vec_xl and vec_xst.
90 // The way to tell the difference is, XLC compilers version 13.0 and
91 // earlier use vec_xlw4 and vec_xstw4. XLC compilers 13.1 and later
92 // use vec_xl and vec_xst. The open question is, how to handle
93 // early Clang compilers for POWER7. We know the latest Clang
94 // compilers support vec_xl and vec_xst. Also see
95 // https://www-01.ibm.com/support/docview.wss?uid=swg21683541.
96 
97 #if defined(__xlc__) && (__xlc__ < 0x0d01)
98 # define __early_xlc__ 1
99 #endif
100 #if defined(__xlC__) && (__xlC__ < 0x0d01)
101 # define __early_xlC__ 1
102 #endif
103 
104 // VecLoad_ALTIVEC and VecStore_ALTIVEC are
105 // too noisy on modern compilers
106 #if CRYPTOPP_GCC_DIAGNOSTIC_AVAILABLE
107 # pragma GCC diagnostic push
108 # pragma GCC diagnostic ignored "-Wdeprecated"
109 #endif
110 
111 NAMESPACE_BEGIN(CryptoPP)
112 
113 #if defined(__ALTIVEC__) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
114 
115 /// \brief Vector of 8-bit elements
116 /// \par Wraps
117 /// __vector unsigned char
118 /// \since Crypto++ 6.0
119 typedef __vector unsigned char uint8x16_p;
120 /// \brief Vector of 16-bit elements
121 /// \par Wraps
122 /// __vector unsigned short
123 /// \since Crypto++ 6.0
124 typedef __vector unsigned short uint16x8_p;
125 /// \brief Vector of 32-bit elements
126 /// \par Wraps
127 /// __vector unsigned int
128 /// \since Crypto++ 6.0
129 typedef __vector unsigned int uint32x4_p;
130 
131 #if defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
132 /// \brief Vector of 64-bit elements
133 /// \details uint64x2_p is available on POWER7 and above. Some supporting
134 /// functions, like 64-bit <tt>vec_add</tt> (<tt>vaddudm</tt>), did not
135 /// arrive until POWER8.
136 /// \par Wraps
137 /// __vector unsigned long long
138 /// \since Crypto++ 6.0
139 typedef __vector unsigned long long uint64x2_p;
140 #endif // _ARCH_PWR8
141 
142 /// \brief The 0 vector
143 /// \returns a 32-bit vector of 0's
144 /// \since Crypto++ 8.0
146 {
147  const uint32x4_p v = {0,0,0,0};
148  return v;
149 }
150 
151 /// \brief The 1 vector
152 /// \returns a 32-bit vector of 1's
153 /// \since Crypto++ 8.0
155 {
156  const uint32x4_p v = {1,1,1,1};
157  return v;
158 }
159 
160 /// \brief Reverse bytes in a vector
161 /// \tparam T vector type
162 /// \param data the vector
163 /// \returns vector
164 /// \details VecReverse() reverses the bytes in a vector
165 /// \par Wraps
166 /// vec_perm
167 /// \since Crypto++ 6.0
168 template <class T>
169 inline T VecReverse(const T data)
170 {
171 #if (_ARCH_PWR9)
172  return (T)vec_revb((uint8x16_p)data);
173 #else
174  const uint8x16_p mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
175  return (T)vec_perm(data, data, mask);
176 #endif
177 }
178 
179 /// \name LOAD OPERATIONS
180 //@{
181 
182 /// \brief Loads a vector from a byte array
183 /// \param src the byte array
184 /// \details Loads a vector in native endian format from a byte array.
185 /// \details VecLoad_ALTIVEC() uses <tt>vec_ld</tt> if the effective address
186 /// of <tt>src</tt> is aligned. If unaligned it uses <tt>vec_lvsl</tt>,
187 /// <tt>vec_ld</tt>, <tt>vec_perm</tt> and <tt>src</tt>. The fixups using
188 /// <tt>vec_lvsl</tt> and <tt>vec_perm</tt> are relatively expensive so
189 /// you should provide aligned memory adresses.
190 /// \par Wraps
191 /// vec_ld, vec_lvsl, vec_perm
192 /// \since Crypto++ 6.0
193 inline uint32x4_p VecLoad_ALTIVEC(const byte src[16])
194 {
195  // Avoid IsAlignedOn for convenience.
196  uintptr_t eff = reinterpret_cast<uintptr_t>(src)+0;
197  if (eff % 16 == 0)
198  {
199  return (uint32x4_p)vec_ld(0, src);
200  }
201  else
202  {
203  // http://www.nxp.com/docs/en/reference-manual/ALTIVECPEM.pdf
204  const uint8x16_p perm = vec_lvsl(0, src);
205  const uint8x16_p low = vec_ld(0, src);
206  const uint8x16_p high = vec_ld(15, src);
207  return (uint32x4_p)vec_perm(low, high, perm);
208  }
209 }
210 
211 /// \brief Loads a vector from a byte array
212 /// \param src the byte array
213 /// \param off offset into the src byte array
214 /// \details Loads a vector in native endian format from a byte array.
215 /// \details VecLoad_ALTIVEC() uses <tt>vec_ld</tt> if the effective address
216 /// of <tt>src</tt> is aligned. If unaligned it uses <tt>vec_lvsl</tt>,
217 /// <tt>vec_ld</tt>, <tt>vec_perm</tt> and <tt>src</tt>.
218 /// \details The fixups using <tt>vec_lvsl</tt> and <tt>vec_perm</tt> are
219 /// relatively expensive so you should provide aligned memory adresses.
220 /// \par Wraps
221 /// vec_ld, vec_lvsl, vec_perm
222 /// \since Crypto++ 6.0
223 inline uint32x4_p VecLoad_ALTIVEC(int off, const byte src[16])
224 {
225  // Avoid IsAlignedOn for convenience.
226  uintptr_t eff = reinterpret_cast<uintptr_t>(src)+off;
227  if (eff % 16 == 0)
228  {
229  return (uint32x4_p)vec_ld(off, src);
230  }
231  else
232  {
233  // http://www.nxp.com/docs/en/reference-manual/ALTIVECPEM.pdf
234  const uint8x16_p perm = vec_lvsl(off, src);
235  const uint8x16_p low = vec_ld(off, src);
236  const uint8x16_p high = vec_ld(15, src);
237  return (uint32x4_p)vec_perm(low, high, perm);
238  }
239 }
240 
/// \brief Loads a vector from a byte array
/// \param src the byte array
/// \details VecLoad() loads a vector in from a byte array.
/// \details VecLoad() uses POWER7's <tt>vec_xl</tt> or
/// <tt>vec_vsx_ld</tt> if available. The instructions do not require
/// aligned effective memory addresses. VecLoad_ALTIVEC() is used if POWER7
/// is not available. VecLoad_ALTIVEC() can be relatively expensive if
/// extra instructions are required to fix up unaligned memory
/// addresses.
/// \par Wraps
/// vec_xlw4, vec_xld2, vec_xl, vec_vsx_ld (and Altivec load)
/// \since Crypto++ 6.0
inline uint32x4_p VecLoad(const byte src[16])
{
// NOTE: the preprocessor layout below is deliberate; the file header
// warns that restructuring it breaks code generation on XLC 12.
#if defined(_ARCH_PWR8)
# if defined(__early_xlc__) || defined(__early_xlC__)
    // XLC 13.0 and earlier spell the unaligned VSX load vec_xlw4
    return (uint32x4_p)vec_xlw4(0, (byte*)src);
# elif defined(__xlc__) || defined(__xlC__) || defined(__clang__)
    // Modern XLC and Clang provide vec_xl
    return (uint32x4_p)vec_xl(0, (byte*)src);
# else
    // GCC spells the unaligned VSX load vec_vsx_ld
    return (uint32x4_p)vec_vsx_ld(0, (byte*)src);
# endif
#else
    // No unaligned VSX load available; use the Altivec fallback
    return VecLoad_ALTIVEC(src);
#endif
}
267 
/// \brief Loads a vector from a byte array
/// \param src the byte array
/// \param off offset into the byte array
/// \details VecLoad() loads a vector in from a byte array.
/// \details VecLoad() uses POWER7's <tt>vec_xl</tt> or
/// <tt>vec_vsx_ld</tt> if available. The instructions do not require
/// aligned effective memory addresses. VecLoad_ALTIVEC() is used if POWER7
/// is not available. VecLoad_ALTIVEC() can be relatively expensive if
/// extra instructions are required to fix up unaligned memory
/// addresses.
/// \par Wraps
/// vec_xlw4, vec_xld2, vec_xl, vec_vsx_ld (and Altivec load)
/// \since Crypto++ 6.0
inline uint32x4_p VecLoad(int off, const byte src[16])
{
// See the non-offset overload for why this preprocessor layout is kept.
#if defined(_ARCH_PWR8)
# if defined(__early_xlc__) || defined(__early_xlC__)
    return (uint32x4_p)vec_xlw4(off, (byte*)src);
# elif defined(__xlc__) || defined(__xlC__) || defined(__clang__)
    return (uint32x4_p)vec_xl(off, (byte*)src);
# else
    return (uint32x4_p)vec_vsx_ld(off, (byte*)src);
# endif
#else
    return VecLoad_ALTIVEC(off, src);
#endif
}
295 
296 /// \brief Loads a vector from a word array
297 /// \param src the word array
298 /// \details VecLoad() loads a vector in from a word array.
299 /// \details VecLoad() uses POWER7's <tt>vec_xl</tt> or
300 /// <tt>vec_vsx_ld</tt> if available. The instructions do not require
301 /// aligned effective memory addresses. VecLoad_ALTIVEC() is used if POWER7
302 /// is not available. VecLoad_ALTIVEC() can be relatively expensive if
303 /// extra instructions are required to fix up unaligned memory
304 /// addresses.
305 /// \par Wraps
306 /// vec_xlw4, vec_xld2, vec_xl, vec_vsx_ld (and Altivec load)
307 /// \since Crypto++ 8.0
308 inline uint32x4_p VecLoad(const word32 src[4])
309 {
310  return VecLoad((const byte*)src);
311 }
312 
313 /// \brief Loads a vector from a word array
314 /// \param src the word array
315 /// \param off offset into the word array
316 /// \details VecLoad() loads a vector in from a word array.
317 /// \details VecLoad() uses POWER7's <tt>vec_xl</tt> or
318 /// <tt>vec_vsx_ld</tt> if available. The instructions do not require
319 /// aligned effective memory addresses. VecLoad_ALTIVEC() is used if POWER7
320 /// is not available. VecLoad_ALTIVEC() can be relatively expensive if
321 /// extra instructions are required to fix up unaligned memory
322 /// addresses.
323 /// \par Wraps
324 /// vec_xlw4, vec_xld2, vec_xl, vec_vsx_ld (and Altivec load)
325 /// \since Crypto++ 8.0
326 inline uint32x4_p VecLoad(int off, const word32 src[4])
327 {
328  return VecLoad(off, (const byte*)src);
329 }
330 
331 #if defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
332 
333 /// \brief Loads a vector from a word array
334 /// \param src the word array
335 /// \details VecLoad() loads a vector in from a word array.
336 /// \details VecLoad() uses POWER7's <tt>vec_xl</tt> or
337 /// <tt>vec_vsx_ld</tt> if available. The instructions do not require
338 /// aligned effective memory addresses. VecLoad_ALTIVEC() is used if POWER7
339 /// is not available. VecLoad_ALTIVEC() can be relatively expensive if
340 /// extra instructions are required to fix up unaligned memory
341 /// addresses.
342 /// \details VecLoad() with 64-bit elements is available on POWER7 and above.
343 /// \par Wraps
344 /// vec_xlw4, vec_xld2, vec_xl, vec_vsx_ld (and Altivec load)
345 /// \since Crypto++ 8.0
346 inline uint64x2_p VecLoad(const word64 src[2])
347 {
348  return (uint64x2_p)VecLoad((const byte*)src);
349 }
350 
351 /// \brief Loads a vector from a word array
352 /// \param src the word array
353 /// \param off offset into the word array
354 /// \details VecLoad() loads a vector in from a word array.
355 /// \details VecLoad() uses POWER7's <tt>vec_xl</tt> or
356 /// <tt>vec_vsx_ld</tt> if available. The instructions do not require
357 /// aligned effective memory addresses. VecLoad_ALTIVEC() is used if POWER7
358 /// is not available. VecLoad_ALTIVEC() can be relatively expensive if
359 /// extra instructions are required to fix up unaligned memory
360 /// addresses.
361 /// \details VecLoad() with 64-bit elements is available on POWER8 and above.
362 /// \par Wraps
363 /// vec_xlw4, vec_xld2, vec_xl, vec_vsx_ld (and Altivec load)
364 /// \since Crypto++ 8.0
365 inline uint64x2_p VecLoad(int off, const word64 src[2])
366 {
367  return (uint64x2_p)VecLoad(off, (const byte*)src);
368 }
369 
370 #endif // _ARCH_PWR8
371 
/// \brief Loads a vector from an aligned byte array
/// \param src the byte array
/// \details VecLoadAligned() loads a vector in from an aligned byte array.
/// \details VecLoadAligned() uses POWER7's <tt>vec_xl</tt> or
/// <tt>vec_vsx_ld</tt> if available. The instructions do not require
/// aligned effective memory addresses. Altivec's <tt>vec_ld</tt> is used
/// if POWER7 is not available. The effective address of <tt>src</tt> must
/// be aligned.
/// \par Wraps
/// vec_ld, vec_xlw4, vec_xld2, vec_xl, vec_vsx_ld
/// \since Crypto++ 8.0
inline uint32x4_p VecLoadAligned(const byte src[16])
{
#if defined(_ARCH_PWR8)
# if defined(__early_xlc__) || defined(__early_xlC__)
    return (uint32x4_p)vec_xlw4(0, (byte*)src);
# elif defined(__xlc__) || defined(__xlC__) || defined(__clang__)
    return (uint32x4_p)vec_xl(0, (byte*)src);
# else
    return (uint32x4_p)vec_vsx_ld(0, (byte*)src);
# endif
#else // _ARCH_PWR8
    // vec_ld silently masks the low address bits, so assert alignment
    // rather than returning data from the wrong location.
    CRYPTOPP_ASSERT(((uintptr_t)src) % 16 == 0);
    return (uint32x4_p)vec_ld(0, (byte*)src);
#endif // _ARCH_PWR8
}
398 
/// \brief Loads a vector from an aligned byte array
/// \param src the byte array
/// \param off offset into the byte array
/// \details VecLoadAligned() loads a vector in from an aligned byte array.
/// \details VecLoadAligned() uses POWER7's <tt>vec_xl</tt> or
/// <tt>vec_vsx_ld</tt> if available. The instructions do not require
/// aligned effective memory addresses. Altivec's <tt>vec_ld</tt> is used
/// if POWER7 is not available. The effective address of <tt>src+off</tt>
/// must be aligned.
/// \par Wraps
/// vec_ld, vec_xlw4, vec_xld2, vec_xl, vec_vsx_ld
/// \since Crypto++ 8.0
inline uint32x4_p VecLoadAligned(int off, const byte src[16])
{
#if defined(_ARCH_PWR8)
# if defined(__early_xlc__) || defined(__early_xlC__)
    return (uint32x4_p)vec_xlw4(off, (byte*)src);
# elif defined(__xlc__) || defined(__xlC__) || defined(__clang__)
    return (uint32x4_p)vec_xl(off, (byte*)src);
# else
    return (uint32x4_p)vec_vsx_ld(off, (byte*)src);
# endif
#else // _ARCH_PWR8
    // vec_ld silently masks the low address bits, so assert the
    // effective address (src+off) is aligned.
    CRYPTOPP_ASSERT((((uintptr_t)src)+off) % 16 == 0);
    return (uint32x4_p)vec_ld(off, (byte*)src);
#endif // _ARCH_PWR8
}
426 
/// \brief Loads a vector from a byte array
/// \param src the byte array
/// \details VecLoadBE() loads a vector in from a byte array. VecLoadBE
/// will reverse all bytes in the array on a little endian system.
/// \details VecLoadBE() uses POWER7's <tt>vec_xl</tt> or
/// <tt>vec_vsx_ld</tt> if available. The instructions do not require
/// aligned effective memory addresses. VecLoad_ALTIVEC() is used if POWER7
/// is not available. VecLoad_ALTIVEC() can be relatively expensive if
/// extra instructions are required to fix up unaligned memory
/// addresses.
/// \par Wraps
/// vec_xlw4, vec_xld2, vec_xl, vec_vsx_ld (and Altivec load)
/// \since Crypto++ 6.0
inline uint32x4_p VecLoadBE(const byte src[16])
{
#if defined(_ARCH_PWR8)
# if defined(__early_xlc__) || defined(__early_xlC__)
// Early XLC lacks a byte-swapping load, so reverse manually on LE
# if (CRYPTOPP_BIG_ENDIAN)
    return (uint32x4_p)vec_xlw4(0, (byte*)src);
# else
    return (uint32x4_p)VecReverse(vec_xlw4(0, (byte*)src));
# endif
# elif defined(__xlc__) || defined(__xlC__) || defined(__clang__)
    // vec_xl_be performs the endian fixup itself
    return (uint32x4_p)vec_xl_be(0, (byte*)src);
# else
# if (CRYPTOPP_BIG_ENDIAN)
    return (uint32x4_p)vec_vsx_ld(0, (byte*)src);
# else
    return (uint32x4_p)VecReverse(vec_vsx_ld(0, (byte*)src));
# endif
# endif
#else // _ARCH_PWR8
# if (CRYPTOPP_BIG_ENDIAN)
    return (uint32x4_p)VecLoad((const byte*)src);
# else
    return (uint32x4_p)VecReverse(VecLoad((const byte*)src));
# endif
#endif // _ARCH_PWR8
}
466 
/// \brief Loads a vector from a byte array
/// \param src the byte array
/// \param off offset into the src byte array
/// \details VecLoadBE() loads a vector in from a byte array. VecLoadBE
/// will reverse all bytes in the array on a little endian system.
/// \details VecLoadBE() uses POWER7's <tt>vec_xl</tt> or
/// <tt>vec_vsx_ld</tt> if available. The instructions do not require
/// aligned effective memory addresses. VecLoad_ALTIVEC() is used if POWER7
/// is not available. VecLoad_ALTIVEC() can be relatively expensive if
/// extra instructions are required to fix up unaligned memory
/// addresses.
/// \par Wraps
/// vec_xlw4, vec_xld2, vec_xl, vec_vsx_ld (and Altivec load)
/// \since Crypto++ 6.0
inline uint32x4_p VecLoadBE(int off, const byte src[16])
{
#if defined(_ARCH_PWR8)
# if defined(__early_xlc__) || defined(__early_xlC__)
// Early XLC lacks a byte-swapping load, so reverse manually on LE
# if (CRYPTOPP_BIG_ENDIAN)
    return (uint32x4_p)vec_xlw4(off, (byte*)src);
# else
    return (uint32x4_p)VecReverse(vec_xlw4(off, (byte*)src));
# endif
# elif defined(__xlc__) || defined(__xlC__) || defined(__clang__)
    // vec_xl_be performs the endian fixup itself
    return (uint32x4_p)vec_xl_be(off, (byte*)src);
# else
# if (CRYPTOPP_BIG_ENDIAN)
    return (uint32x4_p)vec_vsx_ld(off, (byte*)src);
# else
    return (uint32x4_p)VecReverse(vec_vsx_ld(off, (byte*)src));
# endif
# endif
#else // _ARCH_PWR8
# if (CRYPTOPP_BIG_ENDIAN)
    return (uint32x4_p)VecLoad(off, (const byte*)src);
# else
    return (uint32x4_p)VecReverse(VecLoad(off, (const byte*)src));
# endif
#endif // _ARCH_PWR8
}
507 
508 //@}
509 
510 /// \name STORE OPERATIONS
511 //@{
512 
/// \brief Stores a vector to a byte array
/// \tparam T vector type
/// \param data the vector
/// \param dest the byte array
/// \details VecStore_ALTIVEC() stores a vector to a byte array.
/// \details VecStore_ALTIVEC() uses <tt>vec_st</tt> if the effective address
/// of <tt>dest</tt> is aligned, and uses <tt>vec_ste</tt> otherwise.
/// <tt>vec_ste</tt> is relatively expensive so you should provide aligned
/// memory addresses.
/// \details VecStore_ALTIVEC() is used automatically when POWER7 or above
/// and unaligned loads is not available.
/// \par Wraps
/// vec_st, vec_ste, vec_lvsr, vec_perm
/// \since Crypto++ 8.0
template<class T>
inline void VecStore_ALTIVEC(const T data, byte dest[16])
{
    // Avoid IsAlignedOn for convenience.
    uintptr_t eff = reinterpret_cast<uintptr_t>(dest)+0;
    if (eff % 16 == 0)
    {
        vec_st((uint8x16_p)data, 0, dest);
    }
    else
    {
        // Rotate the data right with vec_lvsr/vec_perm so each element
        // lands at its unaligned target address, then scatter it with a
        // sequence of element stores. This is the unaligned-store
        // technique from the AltiVec PEM; the byte/halfword/word mix and
        // offsets are deliberate — do not reorder.
        // http://www.nxp.com/docs/en/reference-manual/ALTIVECPEM.pdf
        uint8x16_p perm = (uint8x16_p)vec_perm(data, data, vec_lvsr(0, dest));
        vec_ste((uint8x16_p) perm, 0, (unsigned char*) dest);
        vec_ste((uint16x8_p) perm, 1, (unsigned short*)dest);
        vec_ste((uint32x4_p) perm, 3, (unsigned int*) dest);
        vec_ste((uint32x4_p) perm, 4, (unsigned int*) dest);
        vec_ste((uint32x4_p) perm, 8, (unsigned int*) dest);
        vec_ste((uint32x4_p) perm, 12, (unsigned int*) dest);
        vec_ste((uint16x8_p) perm, 14, (unsigned short*)dest);
        vec_ste((uint8x16_p) perm, 15, (unsigned char*) dest);
    }
}
550 
551 /// \brief Stores a vector to a byte array
552 /// \tparam T vector type
553 /// \param data the vector
554 /// \param off the byte offset into the array
555 /// \param dest the byte array
556 /// \details VecStore_ALTIVEC() stores a vector to a byte array.
557 /// \details VecStore_ALTIVEC() uses <tt>vec_st</tt> if the effective address
558 /// of <tt>dest</tt> is aligned, and uses <tt>vec_ste</tt> otherwise.
559 /// <tt>vec_ste</tt> is relatively expensive so you should provide aligned
560 /// memory adresses.
561 /// \details VecStore_ALTIVEC() is used automatically when POWER7 or above
562 /// and unaligned loads is not available.
563 /// \par Wraps
564 /// vec_st, vec_ste, vec_lvsr, vec_perm
565 /// \since Crypto++ 8.0
566 template<class T>
567 inline void VecStore_ALTIVEC(const T data, int off, byte dest[16])
568 {
569  // Avoid IsAlignedOn for convenience.
570  uintptr_t eff = reinterpret_cast<uintptr_t>(dest)+off;
571  if (eff % 16 == 0)
572  {
573  vec_st((uint8x16_p)data, off, dest);
574  }
575  else
576  {
577  // http://www.nxp.com/docs/en/reference-manual/ALTIVECPEM.pdf
578  uint8x16_p perm = (uint8x16_p)vec_perm(data, data, vec_lvsr(off, dest));
579  vec_ste((uint8x16_p) perm, 0, (unsigned char*) dest);
580  vec_ste((uint16x8_p) perm, 1, (unsigned short*)dest);
581  vec_ste((uint32x4_p) perm, 3, (unsigned int*) dest);
582  vec_ste((uint32x4_p) perm, 4, (unsigned int*) dest);
583  vec_ste((uint32x4_p) perm, 8, (unsigned int*) dest);
584  vec_ste((uint32x4_p) perm, 12, (unsigned int*) dest);
585  vec_ste((uint16x8_p) perm, 14, (unsigned short*)dest);
586  vec_ste((uint8x16_p) perm, 15, (unsigned char*) dest);
587  }
588 }
589 
/// \brief Stores a vector to a byte array
/// \tparam T vector type
/// \param data the vector
/// \param dest the byte array
/// \details VecStore() stores a vector to a byte array.
/// \details VecStore() uses POWER7's <tt>vec_xst</tt> or
/// <tt>vec_vsx_st</tt> if available. The instructions do not require
/// aligned effective memory addresses. VecStore_ALTIVEC() is used if POWER7
/// is not available. VecStore_ALTIVEC() can be relatively expensive if
/// extra instructions are required to fix up unaligned memory
/// addresses.
/// \par Wraps
/// vec_xstw4, vec_xstld2, vec_xst, vec_vsx_st (and Altivec store)
/// \since Crypto++ 6.0
template<class T>
inline void VecStore(const T data, byte dest[16])
{
// The preprocessor layout mirrors VecLoad(); the file header warns that
// restructuring it breaks code generation on XLC 12.
#if defined(_ARCH_PWR8)
# if defined(__early_xlc__) || defined(__early_xlC__)
    // XLC 13.0 and earlier spell the unaligned VSX store vec_xstw4
    vec_xstw4((uint8x16_p)data, 0, (byte*)dest);
# elif defined(__xlc__) || defined(__xlC__) || defined(__clang__)
    vec_xst((uint8x16_p)data, 0, (byte*)dest);
# else
    // GCC spells the unaligned VSX store vec_vsx_st
    vec_vsx_st((uint8x16_p)data, 0, (byte*)dest);
# endif
#else
    VecStore_ALTIVEC((uint8x16_p)data, 0, (byte*)dest);
#endif
}
619 
/// \brief Stores a vector to a byte array
/// \tparam T vector type
/// \param data the vector
/// \param off the byte offset into the array
/// \param dest the byte array
/// \details VecStore() stores a vector to a byte array.
/// \details VecStore() uses POWER7's <tt>vec_xst</tt> or
/// <tt>vec_vsx_st</tt> if available. The instructions do not require
/// aligned effective memory addresses. VecStore_ALTIVEC() is used if POWER7
/// is not available. VecStore_ALTIVEC() can be relatively expensive if
/// extra instructions are required to fix up unaligned memory
/// addresses.
/// \par Wraps
/// vec_xstw4, vec_xstld2, vec_xst, vec_vsx_st (and Altivec store)
/// \since Crypto++ 6.0
template<class T>
inline void VecStore(const T data, int off, byte dest[16])
{
// See the non-offset overload for why this preprocessor layout is kept.
#if defined(_ARCH_PWR8)
# if defined(__early_xlc__) || defined(__early_xlC__)
    vec_xstw4((uint8x16_p)data, off, (byte*)dest);
# elif defined(__xlc__) || defined(__xlC__) || defined(__clang__)
    vec_xst((uint8x16_p)data, off, (byte*)dest);
# else
    vec_vsx_st((uint8x16_p)data, off, (byte*)dest);
# endif
#else
    VecStore_ALTIVEC((uint8x16_p)data, off, (byte*)dest);
#endif
}
650 
651 /// \brief Stores a vector to a word array
652 /// \tparam T vector type
653 /// \param data the vector
654 /// \param dest the word array
655 /// \details VecStore() stores a vector to a word array.
656 /// \details VecStore() uses POWER7's <tt>vec_xst</tt> or
657 /// <tt>vec_vsx_st</tt> if available. The instructions do not require
658 /// aligned effective memory addresses. VecStore_ALTIVEC() is used if POWER7
659 /// is not available. VecStore_ALTIVEC() can be relatively expensive if
660 /// extra instructions are required to fix up unaligned memory
661 /// addresses.
662 /// \par Wraps
663 /// vec_xstw4, vec_xstld2, vec_xst, vec_vsx_st (and Altivec store)
664 /// \since Crypto++ 8.0
665 template<class T>
666 inline void VecStore(const T data, word32 dest[4])
667 {
668  VecStore((uint8x16_p)data, 0, (byte*)dest);
669 }
670 
671 /// \brief Stores a vector to a word array
672 /// \tparam T vector type
673 /// \param data the vector
674 /// \param off the byte offset into the array
675 /// \param dest the word array
676 /// \details VecStore() stores a vector to a word array.
677 /// \details VecStore() uses POWER7's <tt>vec_xst</tt> or
678 /// <tt>vec_vsx_st</tt> if available. The instructions do not require
679 /// aligned effective memory addresses. VecStore_ALTIVEC() is used if POWER7
680 /// is not available. VecStore_ALTIVEC() can be relatively expensive if
681 /// extra instructions are required to fix up unaligned memory
682 /// addresses.
683 /// \par Wraps
684 /// vec_xstw4, vec_xstld2, vec_xst, vec_vsx_st (and Altivec store)
685 /// \since Crypto++ 8.0
686 template<class T>
687 inline void VecStore(const T data, int off, word32 dest[4])
688 {
689  VecStore((uint8x16_p)data, off, (byte*)dest);
690 }
691 
692 /// \brief Stores a vector to a word array
693 /// \tparam T vector type
694 /// \param data the vector
695 /// \param dest the word array
696 /// \details VecStore() stores a vector to a word array.
697 /// \details VecStore() uses POWER7's <tt>vec_xst</tt> or
698 /// <tt>vec_vsx_st</tt> if available. The instructions do not require
699 /// aligned effective memory addresses. VecStore_ALTIVEC() is used if POWER7
700 /// is not available. VecStore_ALTIVEC() can be relatively expensive if
701 /// extra instructions are required to fix up unaligned memory
702 /// addresses.
703 /// \details VecStore() with 64-bit elements is available on POWER8 and above.
704 /// \par Wraps
705 /// vec_xstw4, vec_xstld2, vec_xst, vec_vsx_st (and Altivec store)
706 /// \since Crypto++ 8.0
707 template<class T>
708 inline void VecStore(const T data, word64 dest[2])
709 {
710  VecStore((uint8x16_p)data, 0, (byte*)dest);
711 }
712 
713 /// \brief Stores a vector to a word array
714 /// \tparam T vector type
715 /// \param data the vector
716 /// \param off the byte offset into the array
717 /// \param dest the word array
718 /// \details VecStore() stores a vector to a word array.
719 /// \details VecStore() uses POWER7's <tt>vec_xst</tt> or
720 /// <tt>vec_vsx_st</tt> if available. The instructions do not require
721 /// aligned effective memory addresses. VecStore_ALTIVEC() is used if POWER7
722 /// is not available. VecStore_ALTIVEC() can be relatively expensive if
723 /// extra instructions are required to fix up unaligned memory
724 /// addresses.
725 /// \details VecStore() with 64-bit elements is available on POWER8 and above.
726 /// \par Wraps
727 /// vec_xstw4, vec_xstld2, vec_xst, vec_vsx_st (and Altivec store)
728 /// \since Crypto++ 8.0
729 template<class T>
730 inline void VecStore(const T data, int off, word64 dest[2])
731 {
732  VecStore((uint8x16_p)data, off, (byte*)dest);
733 }
734 
/// \brief Stores a vector to a byte array
/// \tparam T vector type
/// \param data the vector
/// \param dest the byte array
/// \details VecStoreBE() stores a vector to a byte array. VecStoreBE
/// will reverse all bytes in the array on a little endian system.
/// \details VecStoreBE() uses POWER7's <tt>vec_xst</tt> or
/// <tt>vec_vsx_st</tt> if available. The instructions do not require
/// aligned effective memory addresses. VecStore_ALTIVEC() is used if POWER7
/// is not available. VecStore_ALTIVEC() can be relatively expensive if
/// extra instructions are required to fix up unaligned memory
/// addresses.
/// \par Wraps
/// vec_xstw4, vec_xstld2, vec_xst, vec_vsx_st (and Altivec store)
/// \since Crypto++ 6.0
template <class T>
inline void VecStoreBE(const T data, byte dest[16])
{
#if defined(_ARCH_PWR8)
# if defined(__early_xlc__) || defined(__early_xlC__)
// Early XLC lacks a byte-swapping store, so reverse manually on LE
# if (CRYPTOPP_BIG_ENDIAN)
    vec_xstw4((uint8x16_p)data, 0, (byte*)dest);
# else
    vec_xstw4((uint8x16_p)VecReverse(data), 0, (byte*)dest);
# endif
# elif defined(__xlc__) || defined(__xlC__) || defined(__clang__)
    // vec_xst_be performs the endian fixup itself
    vec_xst_be((uint8x16_p)data, 0, (byte*)dest);
# else
# if (CRYPTOPP_BIG_ENDIAN)
    vec_vsx_st((uint8x16_p)data, 0, (byte*)dest);
# else
    vec_vsx_st((uint8x16_p)VecReverse(data), 0, (byte*)dest);
# endif
# endif
#else // _ARCH_PWR8
# if (CRYPTOPP_BIG_ENDIAN)
    VecStore_ALTIVEC((uint8x16_p)data, 0, (byte*)dest);
# else
    VecStore_ALTIVEC((uint8x16_p)VecReverse(data), 0, (byte*)dest);
# endif
#endif // _ARCH_PWR8
}
777 
778 /// \brief Stores a vector to a byte array
779 /// \tparam T vector type
780 /// \param data the vector
781 /// \param off offset into the dest byte array
782 /// \param dest the byte array
783 /// \details VecStoreBE() stores a vector to a byte array. VecStoreBE
784 /// will reverse all bytes in the array on a little endian system.
785 /// \details VecStoreBE() uses POWER7's <tt>vec_xst</tt> or
786 /// <tt>vec_vsx_st</tt> if available. The instructions do not require
787 /// aligned effective memory addresses. VecStore_ALTIVEC() is used if POWER7
788 /// is not available. VecStore_ALTIVEC() can be relatively expensive if
789 /// extra instructions are required to fix up unaligned memory
790 /// addresses.
791 /// \par Wraps
792 /// vec_xstw4, vec_xstld2, vec_xst, vec_vsx_st (and Altivec store)
793 /// \since Crypto++ 6.0
template <class T>
inline void VecStoreBE(const T data, int off, byte dest[16])
{
#if defined(_ARCH_PWR8)
# if defined(__early_xlc__) || defined(__early_xlC__)
    // Early IBM XL C lacks vec_xst_be; store with vec_xstw4 and byte-reverse
    // the vector first on little endian so memory is big endian order.
# if (CRYPTOPP_BIG_ENDIAN)
    vec_xstw4((uint8x16_p)data, off, (byte*)dest);
# else
    vec_xstw4((uint8x16_p)VecReverse(data), off, (byte*)dest);
# endif
# elif defined(__xlc__) || defined(__xlC__) || defined(__clang__)
    // vec_xst_be performs the endian fixup itself.
    vec_xst_be((uint8x16_p)data, off, (byte*)dest);
# else
    // GCC path: vec_vsx_st is endian-neutral, so reverse manually on LE.
# if (CRYPTOPP_BIG_ENDIAN)
    vec_vsx_st((uint8x16_p)data, off, (byte*)dest);
# else
    vec_vsx_st((uint8x16_p)VecReverse(data), off, (byte*)dest);
# endif
# endif
#else // _ARCH_PWR8
    // Pre-POWER8 fallback; may be slow for unaligned dest.
# if (CRYPTOPP_BIG_ENDIAN)
    VecStore_ALTIVEC((uint8x16_p)data, off, (byte*)dest);
# else
    VecStore_ALTIVEC((uint8x16_p)VecReverse(data), off, (byte*)dest);
# endif
#endif // _ARCH_PWR8
}
821 
822 /// \brief Stores a vector to a word array
823 /// \tparam T vector type
824 /// \param data the vector
825 /// \param dest the word array
826 /// \details VecStoreBE() stores a vector to a word array. VecStoreBE
827 /// will reverse all bytes in the array on a little endian system.
828 /// \details VecStoreBE() uses POWER7's <tt>vec_xst</tt> or
829 /// <tt>vec_vsx_st</tt> if available. The instructions do not require
830 /// aligned effective memory addresses. VecStore_ALTIVEC() is used if POWER7
831 /// is not available. VecStore_ALTIVEC() can be relatively expensive if
832 /// extra instructions are required to fix up unaligned memory
833 /// addresses.
834 /// \par Wraps
835 /// vec_xstw4, vec_xstld2, vec_xst, vec_vsx_st (and Altivec store)
836 /// \since Crypto++ 8.0
837 template <class T>
838 inline void VecStoreBE(const T data, word32 dest[4])
839 {
840  return VecStoreBE((uint8x16_p)data, (byte*)dest);
841 }
842 
843 /// \brief Stores a vector to a word array
844 /// \tparam T vector type
845 /// \param data the vector
846 /// \param off offset into the dest word array
847 /// \param dest the word array
848 /// \details VecStoreBE() stores a vector to a word array. VecStoreBE
849 /// will reverse all words in the array on a little endian system.
850 /// \details VecStoreBE() uses POWER7's <tt>vec_xst</tt> or
851 /// <tt>vec_vsx_st</tt> if available. The instructions do not require
852 /// aligned effective memory addresses. VecStore_ALTIVEC() is used if POWER7
853 /// is not available. VecStore_ALTIVEC() can be relatively expensive if
854 /// extra instructions are required to fix up unaligned memory
855 /// addresses.
856 /// \par Wraps
857 /// vec_xstw4, vec_xstld2, vec_xst, vec_vsx_st (and Altivec store)
858 /// \since Crypto++ 8.0
859 template <class T>
860 inline void VecStoreBE(const T data, int off, word32 dest[4])
861 {
862  return VecStoreBE((uint8x16_p)data, off, (byte*)dest);
863 }
864 
865 //@}
866 
867 /// \name LOGICAL OPERATIONS
868 //@{
869 
870 /// \brief AND two vectors
871 /// \tparam T1 vector type
872 /// \tparam T2 vector type
873 /// \param vec1 the first vector
874 /// \param vec2 the second vector
875 /// \returns vector
876 /// \details VecAnd() returns a new vector from vec1 and vec2. The return
877 /// vector is the same type as vec1.
878 /// \par Wraps
879 /// vec_and
880 /// \since Crypto++ 6.0
template <class T1, class T2>
inline T1 VecAnd(const T1 vec1, const T2 vec2)
{
    // Cast the second operand to T1 so the result keeps vec1's type.
    const T1 rhs = (T1)vec2;
    return (T1)vec_and(vec1, rhs);
}
886 
887 /// \brief OR two vectors
888 /// \tparam T1 vector type
889 /// \tparam T2 vector type
890 /// \param vec1 the first vector
891 /// \param vec2 the second vector
892 /// \returns vector
893 /// \details VecOr() returns a new vector from vec1 and vec2. The return
894 /// vector is the same type as vec1.
895 /// \par Wraps
896 /// vec_or
897 /// \since Crypto++ 6.0
template <class T1, class T2>
inline T1 VecOr(const T1 vec1, const T2 vec2)
{
    // Cast the second operand to T1 so the result keeps vec1's type.
    const T1 rhs = (T1)vec2;
    return (T1)vec_or(vec1, rhs);
}
903 
904 /// \brief XOR two vectors
905 /// \tparam T1 vector type
906 /// \tparam T2 vector type
907 /// \param vec1 the first vector
908 /// \param vec2 the second vector
909 /// \returns vector
910 /// \details VecXor() returns a new vector from vec1 and vec2. The return
911 /// vector is the same type as vec1.
912 /// \par Wraps
913 /// vec_xor
914 /// \since Crypto++ 6.0
template <class T1, class T2>
inline T1 VecXor(const T1 vec1, const T2 vec2)
{
    // Cast the second operand to T1 so the result keeps vec1's type.
    const T1 rhs = (T1)vec2;
    return (T1)vec_xor(vec1, rhs);
}
920 
921 //@}
922 
923 /// \name ARITHMETIC OPERATIONS
924 //@{
925 
926 /// \brief Add two vectors
927 /// \tparam T1 vector type
928 /// \tparam T2 vector type
929 /// \param vec1 the first vector
930 /// \param vec2 the second vector
931 /// \returns vector
932 /// \details VecAdd() returns a new vector from vec1 and vec2.
933 /// vec2 is cast to the same type as vec1. The return vector
934 /// is the same type as vec1.
935 /// \par Wraps
936 /// vec_add
937 /// \since Crypto++ 6.0
template <class T1, class T2>
inline T1 VecAdd(const T1 vec1, const T2 vec2)
{
    // Element-wise modular addition; vec2 is reinterpreted as T1.
    const T1 rhs = (T1)vec2;
    return (T1)vec_add(vec1, rhs);
}
943 
944 /// \brief Subtract two vectors
945 /// \tparam T1 vector type
946 /// \tparam T2 vector type
947 /// \param vec1 the first vector
948 /// \param vec2 the second vector
949 /// \details VecSub() returns a new vector from vec1 and vec2.
950 /// vec2 is cast to the same type as vec1. The return vector
951 /// is the same type as vec1.
952 /// \par Wraps
953 /// vec_sub
954 /// \since Crypto++ 6.0
template <class T1, class T2>
inline T1 VecSub(const T1 vec1, const T2 vec2)
{
    // Element-wise modular subtraction; vec2 is reinterpreted as T1.
    const T1 rhs = (T1)vec2;
    return (T1)vec_sub(vec1, rhs);
}
960 
961 /// \brief Add two vectors
962 /// \tparam T1 vector type
963 /// \tparam T2 vector type
964 /// \param vec1 the first vector
965 /// \param vec2 the second vector
966 /// \returns vector
967 /// \details VecAdd64() returns a new vector from vec1 and vec2.
968 /// vec1 and vec2 are added as if uint64x2_p vectors. On POWER7
969 /// and below VecAdd64() manages the carries from two elements in
970 /// a uint32x4_p vector.
971 /// \par Wraps
972 /// vec_add for POWER8, vec_addc, vec_perm, vec_add for Altivec
973 /// \since Crypto++ 8.0
inline uint32x4_p VecAdd64(const uint32x4_p& vec1, const uint32x4_p& vec2)
{
    // 64-bit elements available at POWER7, but addudm requires POWER8
#if defined(_ARCH_PWR8)
    // Native 64-bit lane add; reinterpret 4x32 as 2x64.
    return (uint32x4_p)vec_add((uint64x2_p)vec1, (uint64x2_p)vec2);
#else
    // The carry mask selects carries from elements 1 and 3 and sets remaining
    // elements to 0. The mask also shifts the carried values left by 4 bytes
    // so the carries are added to elements 0 and 2.
    // Index 16 selects from the second operand (zero), producing 0 lanes.
    const uint8x16_p cmask = {4,5,6,7, 16,16,16,16, 12,13,14,15, 16,16,16,16};
    const uint32x4_p zero = {0, 0, 0, 0};

    // vec_addc yields a 1 in each 32-bit lane that carried out.
    uint32x4_p cy = vec_addc(vec1, vec2);
    cy = vec_perm(cy, zero, cmask);
    // Sum the 32-bit lanes, then fold the carries into the high lanes.
    return vec_add(vec_add(vec1, vec2), cy);
#endif
}
991 
992 //@}
993 
994 /// \name OTHER OPERATIONS
995 //@{
996 
997 /// \brief Permutes a vector
998 /// \tparam T1 vector type
999 /// \tparam T2 vector type
1000 /// \param vec the vector
1001 /// \param mask vector mask
1002 /// \returns vector
1003 /// \details VecPermute() returns a new vector from vec based on
1004 /// mask. mask is an uint8x16_p type vector. The return
1005 /// vector is the same type as vec.
1006 /// \par Wraps
1007 /// vec_perm
1008 /// \since Crypto++ 6.0
1009 template <class T1, class T2>
1010 inline T1 VecPermute(const T1 vec, const T2 mask)
1011 {
1012  return (T1)vec_perm(vec, vec, (uint8x16_p)mask);
1013 }
1014 
1015 /// \brief Permutes two vectors
1016 /// \tparam T1 vector type
1017 /// \tparam T2 vector type
1018 /// \param vec1 the first vector
1019 /// \param vec2 the second vector
1020 /// \param mask vector mask
1021 /// \returns vector
1022 /// \details VecPermute() returns a new vector from vec1 and vec2
1023 /// based on mask. mask is an uint8x16_p type vector. The return
1024 /// vector is the same type as vec1.
1025 /// \par Wraps
1026 /// vec_perm
1027 /// \since Crypto++ 6.0
1028 template <class T1, class T2>
1029 inline T1 VecPermute(const T1 vec1, const T1 vec2, const T2 mask)
1030 {
1031  return (T1)vec_perm(vec1, (T1)vec2, (uint8x16_p)mask);
1032 }
1033 
1034 /// \brief Shift a vector left
1035 /// \tparam C shift byte count
1036 /// \tparam T vector type
1037 /// \param vec the vector
1038 /// \returns vector
1039 /// \details VecShiftLeftOctet() returns a new vector after shifting the
1040 /// concatenation of the zero vector and the source vector by the specified
1041 /// number of bytes. The return vector is the same type as vec.
1042 /// \details On big endian machines VecShiftLeftOctet() is <tt>vec_sld(a, z,
1043 /// c)</tt>. On little endian machines VecShiftLeftOctet() is translated to
1044 /// <tt>vec_sld(z, a, 16-c)</tt>. You should always call the function as
1045 /// if on a big endian machine as shown below.
1046 /// <pre>
1047 /// uint8x16_p x = VecLoad(ptr);
1048 /// uint8x16_p y = VecShiftLeftOctet<12>(x);
1049 /// </pre>
1050 /// \par Wraps
1051 /// vec_sld
1052 /// \sa <A HREF="https://stackoverflow.com/q/46341923/608639">Is vec_sld
1053 /// endian sensitive?</A> on Stack Overflow
1054 /// \since Crypto++ 6.0
template <unsigned int C, class T>
inline T VecShiftLeftOctet(const T vec)
{
    const T zero = {0};
    if (C >= 16)
    {
        // Out of range: every byte shifted out, result is zero.
        return zero;
    }
    else if (C == 0)
    {
        // Noop; also avoids vec_sld with a 0 literal, which is invalid.
        return vec;
    }
    else
    {
        // vec_sld concatenates operands and extracts 16 bytes; operand
        // order and count must be mirrored on little endian machines.
#if (CRYPTOPP_BIG_ENDIAN)
        enum { R=C&0xf };
        return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)zero, R);
#else
        enum { R=(16-C)&0xf }; // Linux xlC 13.1 workaround in Debug builds
        return (T)vec_sld((uint8x16_p)zero, (uint8x16_p)vec, R);
#endif
    }
}
1080 
1081 /// \brief Shift a vector right
1082 /// \tparam C shift byte count
1083 /// \tparam T vector type
1084 /// \param vec the vector
1085 /// \returns vector
1086 /// \details VecShiftRightOctet() returns a new vector after shifting the
1087 /// concatenation of the zero vector and the source vector by the specified
1088 /// number of bytes. The return vector is the same type as vec.
1089 /// \details On big endian machines VecShiftRightOctet() is <tt>vec_sld(a, z,
1090 /// c)</tt>. On little endian machines VecShiftRightOctet() is translated to
1091 /// <tt>vec_sld(z, a, 16-c)</tt>. You should always call the function as
1092 /// if on a big endian machine as shown below.
1093 /// <pre>
1094 /// uint8x16_p x = VecLoad(ptr);
/// uint8x16_p y = VecShiftRightOctet<12>(x);
1096 /// </pre>
1097 /// \par Wraps
1098 /// vec_sld
1099 /// \sa <A HREF="https://stackoverflow.com/q/46341923/608639">Is vec_sld
1100 /// endian sensitive?</A> on Stack Overflow
1101 /// \since Crypto++ 6.0
template <unsigned int C, class T>
inline T VecShiftRightOctet(const T vec)
{
    const T zero = {0};
    if (C >= 16)
    {
        // Out of range: every byte shifted out, result is zero.
        return zero;
    }
    else if (C == 0)
    {
        // Noop; also avoids vec_sld with a 0 literal, which is invalid.
        return vec;
    }
    else
    {
        // Right shift is the big-endian mirror of VecShiftLeftOctet.
#if (CRYPTOPP_BIG_ENDIAN)
        enum { R=(16-C)&0xf }; // Linux xlC 13.1 workaround in Debug builds
        return (T)vec_sld((uint8x16_p)zero, (uint8x16_p)vec, R);
#else
        enum { R=C&0xf };
        return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)zero, R);
#endif
    }
}
1127 
1128 /// \brief Rotate a vector left
1129 /// \tparam C shift byte count
1130 /// \tparam T vector type
1131 /// \param vec the vector
1132 /// \returns vector
1133 /// \details VecRotateLeftOctet() returns a new vector after rotating the
1134 /// concatenation of the source vector with itself by the specified
1135 /// number of bytes. The return vector is the same type as vec.
1136 /// \par Wraps
1137 /// vec_sld
1138 /// \sa <A HREF="https://stackoverflow.com/q/46341923/608639">Is vec_sld
1139 /// endian sensitive?</A> on Stack Overflow
1140 /// \since Crypto++ 6.0
template <unsigned int C, class T>
inline T VecRotateLeftOctet(const T vec)
{
    // Rotating a vector against itself with vec_sld; the byte count is
    // complemented on little endian machines.
    // NOTE(review): unlike the shift functions there is no C==0 guard here;
    // presumably callers never instantiate with C==0 or C>=16 — confirm.
#if (CRYPTOPP_BIG_ENDIAN)
    enum { R = C&0xf };
    return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)vec, R);
#else
    enum { R=(16-C)&0xf }; // Linux xlC 13.1 workaround in Debug builds
    return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)vec, R);
#endif
}
1152 
1153 /// \brief Rotate a vector right
1154 /// \tparam C shift byte count
1155 /// \tparam T vector type
1156 /// \param vec the vector
1157 /// \returns vector
1158 /// \details VecRotateRightOctet() returns a new vector after rotating the
1159 /// concatenation of the source vector with itself by the specified
1160 /// number of bytes. The return vector is the same type as vec.
1161 /// \par Wraps
1162 /// vec_sld
1163 /// \sa <A HREF="https://stackoverflow.com/q/46341923/608639">Is vec_sld
1164 /// endian sensitive?</A> on Stack Overflow
1165 /// \since Crypto++ 6.0
template <unsigned int C, class T>
inline T VecRotateRightOctet(const T vec)
{
    // Right rotation is the big-endian mirror of VecRotateLeftOctet.
    // NOTE(review): no C==0 guard; presumably callers never instantiate
    // with C==0 or C>=16 — confirm.
#if (CRYPTOPP_BIG_ENDIAN)
    enum { R=(16-C)&0xf }; // Linux xlC 13.1 workaround in Debug builds
    return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)vec, R);
#else
    enum { R = C&0xf };
    return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)vec, R);
#endif
}
1177 
1178 /// \brief Rotate a packed vector left
1179 /// \tparam C shift bit count
1180 /// \param vec the vector
1181 /// \returns vector
1182 /// \details VecRotateLeft() rotates each element in a packed vector by bit count.
1183 /// \par Wraps
1184 /// vec_rl
1185 /// \since Crypto++ 7.0
1186 template<unsigned int C>
1188 {
1189  const uint32x4_p m = {C, C, C, C};
1190  return vec_rl(vec, m);
1191 }
1192 
1193 /// \brief Shift a packed vector left
1194 /// \tparam C shift bit count
1195 /// \param vec the vector
1196 /// \returns vector
/// \details VecShiftLeft() shifts each element in a packed vector by bit count.
1198 /// \par Wraps
1199 /// vec_sl
1200 /// \since Crypto++ 8.1
1201 template<unsigned int C>
1203 {
1204  const uint32x4_p m = {C, C, C, C};
1205  return vec_sl(vec, m);
1206 }
1207 
1208 /// \brief Merge two vectors
1209 /// \tparam T vector type
1210 /// \param vec1 the first vector
1211 /// \param vec2 the second vector
1212 /// \returns vector
1213 /// \par Wraps
1214 /// vec_mergeh
1215 /// \since Crypto++ 8.1
template <class T>
inline T VecMergeHigh(const T vec1, const T vec2)
{
    // Interleave the high halves of the two vectors.
    const T merged = vec_mergeh(vec1, vec2);
    return merged;
}
1221 
1222 /// \brief Merge two vectors
1223 /// \tparam T vector type
1224 /// \param vec1 the first vector
1225 /// \param vec2 the second vector
1226 /// \returns vector
1227 /// \par Wraps
1228 /// vec_mergel
1229 /// \since Crypto++ 8.1
template <class T>
inline T VecMergeLow(const T vec1, const T vec2)
{
    // Interleave the low halves of the two vectors.
    const T merged = vec_mergel(vec1, vec2);
    return merged;
}
1235 
1236 #if defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
1237 
1238 /// \brief Rotate a packed vector left
1239 /// \tparam C shift bit count
1240 /// \param vec the vector
1241 /// \returns vector
1242 /// \details VecRotateLeft() rotates each element in a packed vector by bit count.
1243 /// \details VecRotateLeft() with 64-bit elements is available on POWER8 and above.
1244 /// \par Wraps
1245 /// vec_rl
1246 /// \since Crypto++ 8.0
1247 template<unsigned int C>
1249 {
1250  const uint64x2_p m = {C, C};
1251  return vec_rl(vec, m);
1252 }
1253 
1254 /// \brief Shift a packed vector left
1255 /// \tparam C shift bit count
1256 /// \param vec the vector
1257 /// \returns vector
/// \details VecShiftLeft() shifts each element in a packed vector by bit count.
1259 /// \details VecShiftLeft() with 64-bit elements is available on POWER8 and above.
1260 /// \par Wraps
1261 /// vec_sl
1262 /// \since Crypto++ 8.1
1263 template<unsigned int C>
1265 {
1266  const uint64x2_p m = {C, C};
1267  return vec_sl(vec, m);
1268 }
1269 
1270 #endif
1271 
1272 /// \brief Rotate a packed vector right
1273 /// \tparam C shift bit count
1274 /// \param vec the vector
1275 /// \returns vector
1276 /// \details VecRotateRight() rotates each element in a packed vector by bit count.
1277 /// \par Wraps
1278 /// vec_rl
1279 /// \since Crypto++ 7.0
1280 template<unsigned int C>
1282 {
1283  const uint32x4_p m = {32-C, 32-C, 32-C, 32-C};
1284  return vec_rl(vec, m);
1285 }
1286 
1287 /// \brief Shift a packed vector right
1288 /// \tparam C shift bit count
1289 /// \param vec the vector
1290 /// \returns vector
/// \details VecShiftRight() shifts each element in a packed vector by bit count.
/// \par Wraps
/// vec_sr
1294 /// \since Crypto++ 8.1
1295 template<unsigned int C>
1297 {
1298  const uint32x4_p m = {C, C, C, C};
1299  return vec_sr(vec, m);
1300 }
1301 
1302 #if defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
1303 
1304 /// \brief Rotate a packed vector right
1305 /// \tparam C shift bit count
1306 /// \param vec the vector
1307 /// \returns vector
1308 /// \details VecRotateRight() rotates each element in a packed vector by bit count.
1309 /// \details VecRotateRight() with 64-bit elements is available on POWER8 and above.
1310 /// \par Wraps
1311 /// vec_rl
1312 /// \since Crypto++ 8.0
1313 template<unsigned int C>
1315 {
1316  const uint64x2_p m = {64-C, 64-C};
1317  return vec_rl(vec, m);
1318 }
1319 
1320 /// \brief Shift a packed vector right
1321 /// \tparam C shift bit count
1322 /// \param vec the vector
1323 /// \returns vector
/// \details VecShiftRight() shifts each element in a packed vector by bit count.
1325 /// \details VecShiftRight() with 64-bit elements is available on POWER8 and above.
1326 /// \par Wraps
1327 /// vec_sr
1328 /// \since Crypto++ 8.1
1329 template<unsigned int C>
1331 {
1332  const uint64x2_p m = {C, C};
1333  return vec_sr(vec, m);
1334 }
1335 
1336 #endif
1337 
1338 /// \brief Exchange high and low double words
1339 /// \tparam T vector type
1340 /// \param vec the vector
1341 /// \returns vector
1342 /// \par Wraps
1343 /// vec_sld
1344 /// \since Crypto++ 7.0
1345 template <class T>
1346 inline T VecSwapWords(const T vec)
1347 {
1348  return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)vec, 8);
1349 }
1350 
1351 /// \brief Extract a dword from a vector
1352 /// \tparam T vector type
1353 /// \param val the vector
1354 /// \returns vector created from low dword
1355 /// \details VecGetLow() extracts the low dword from a vector. The low dword
1356 /// is composed of the least significant bits and occupies bytes 8 through 15
1357 /// when viewed as a big endian array. The return vector is the same type as
1358 /// the original vector and padded with 0's in the most significant bit positions.
1359 /// \par Wraps
1360 /// vec_sld
1361 /// \since Crypto++ 7.0
template <class T>
inline T VecGetLow(const T val)
{
#if (CRYPTOPP_BIG_ENDIAN) && (_ARCH_PWR8)
    // Merge zero into the high dword, keeping the low dword of val.
    const T zero = {0};
    return (T)VecMergeLow((uint64x2_p)zero, (uint64x2_p)val);
#else
    // Shift left then right by 8 bytes to clear the high dword.
    return VecShiftRightOctet<8>(VecShiftLeftOctet<8>(val));
#endif
}
1372 
1373 /// \brief Extract a dword from a vector
1374 /// \tparam T vector type
1375 /// \param val the vector
1376 /// \returns vector created from high dword
1377 /// \details VecGetHigh() extracts the high dword from a vector. The high dword
1378 /// is composed of the most significant bits and occupies bytes 0 through 7
1379 /// when viewed as a big endian array. The return vector is the same type as
1380 /// the original vector and padded with 0's in the most significant bit positions.
1381 /// \par Wraps
1382 /// vec_sld
1383 /// \since Crypto++ 7.0
template <class T>
inline T VecGetHigh(const T val)
{
#if (CRYPTOPP_BIG_ENDIAN) && (_ARCH_PWR8)
    // Merge zero into the high dword, keeping the high dword of val low.
    const T zero = {0};
    return (T)VecMergeHigh((uint64x2_p)zero, (uint64x2_p)val);
#else
    // Shift the high dword down into the low position, zero-filling above.
    return VecShiftRightOctet<8>(val);
#endif
}
1394 
1395 /// \brief Compare two vectors
1396 /// \tparam T1 vector type
1397 /// \tparam T2 vector type
1398 /// \param vec1 the first vector
1399 /// \param vec2 the second vector
1400 /// \returns true if vec1 equals vec2, false otherwise
1401 /// \details VecEqual() performs a bitwise compare. The vector element types do
1402 /// not matter.
1403 /// \par Wraps
1404 /// vec_all_eq
1405 /// \since Crypto++ 8.0
1406 template <class T1, class T2>
1407 inline bool VecEqual(const T1 vec1, const T2 vec2)
1408 {
1409  return 1 == vec_all_eq((uint32x4_p)vec1, (uint32x4_p)vec2);
1410 }
1411 
1412 /// \brief Compare two vectors
1413 /// \tparam T1 vector type
1414 /// \tparam T2 vector type
1415 /// \param vec1 the first vector
1416 /// \param vec2 the second vector
1417 /// \returns true if vec1 does not equal vec2, false otherwise
1418 /// \details VecNotEqual() performs a bitwise compare. The vector element types do
1419 /// not matter.
1420 /// \par Wraps
1421 /// vec_all_eq
1422 /// \since Crypto++ 8.0
1423 template <class T1, class T2>
1424 inline bool VecNotEqual(const T1 vec1, const T2 vec2)
1425 {
1426  return 0 == vec_all_eq((uint32x4_p)vec1, (uint32x4_p)vec2);
1427 }
1428 
1429 //@}
1430 
1431 //////////////////////// Power8 Crypto ////////////////////////
1432 
1433 #if defined(__CRYPTO__) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
1434 
1435 /// \name POLYNOMIAL MULTIPLICATION
1436 //@{
1437 
1438 /// \brief Polynomial multiplication
1439 /// \param a the first term
1440 /// \param b the second term
1441 /// \returns vector product
1442 /// \details VecPolyMultiply() performs polynomial multiplication. POWER8
1443 /// polynomial multiplication multiplies the high and low terms, and then
1444 /// XOR's the high and low products. That is, the result is <tt>ah*bh XOR
1445 /// al*bl</tt>. It is different behavior than Intel polynomial
1446 /// multiplication. To obtain a single product without the XOR, then set
1447 /// one of the high or low terms to 0. For example, setting <tt>ah=0</tt>
1448 /// results in <tt>0*bh XOR al*bl = al*bl</tt>.
1449 /// \par Wraps
1450 /// __vpmsumw, __builtin_altivec_crypto_vpmsumw and __builtin_crypto_vpmsumw.
1451 /// \since Crypto++ 8.1
1453 {
1454 #if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
1455  return __vpmsumw (a, b);
1456 #elif defined(__clang__)
1457  return __builtin_altivec_crypto_vpmsumw (a, b);
1458 #else
1459  return __builtin_crypto_vpmsumw (a, b);
1460 #endif
1461 }
1462 
1463 /// \brief Polynomial multiplication
1464 /// \param a the first term
1465 /// \param b the second term
1466 /// \returns vector product
1467 /// \details VecPolyMultiply() performs polynomial multiplication. POWER8
1468 /// polynomial multiplication multiplies the high and low terms, and then
1469 /// XOR's the high and low products. That is, the result is <tt>ah*bh XOR
1470 /// al*bl</tt>. It is different behavior than Intel polynomial
1471 /// multiplication. To obtain a single product without the XOR, then set
1472 /// one of the high or low terms to 0. For example, setting <tt>ah=0</tt>
1473 /// results in <tt>0*bh XOR al*bl = al*bl</tt>.
1474 /// \par Wraps
1475 /// __vpmsumd, __builtin_altivec_crypto_vpmsumd and __builtin_crypto_vpmsumd.
1476 /// \since Crypto++ 8.1
1478 {
1479 #if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
1480  return __vpmsumd (a, b);
1481 #elif defined(__clang__)
1482  return __builtin_altivec_crypto_vpmsumd (a, b);
1483 #else
1484  return __builtin_crypto_vpmsumd (a, b);
1485 #endif
1486 }
1487 
1488 /// \brief Polynomial multiplication
1489 /// \param a the first term
1490 /// \param b the second term
1491 /// \returns vector product
1492 /// \details VecPolyMultiply00LE() performs polynomial multiplication and presents
1493 /// the result like Intel's <tt>c = _mm_clmulepi64_si128(a, b, 0x00)</tt>.
1494 /// The <tt>0x00</tt> indicates the low 64-bits of <tt>a</tt> and <tt>b</tt>
1495 /// are multiplied.
1496 /// \note An Intel XMM register is composed of 128-bits. The leftmost bit
1497 /// is MSB and numbered 127, while the the rightmost bit is LSB and numbered 0.
1498 /// \par Wraps
1499 /// __vpmsumd, __builtin_altivec_crypto_vpmsumd and __builtin_crypto_vpmsumd.
1500 /// \since Crypto++ 8.0
1502 {
1503 #if (CRYPTOPP_BIG_ENDIAN)
1505 #else
1506  return VecPolyMultiply(VecGetHigh(a), VecGetHigh(b));
1507 #endif
1508 }
1509 
1510 /// \brief Polynomial multiplication
1511 /// \param a the first term
1512 /// \param b the second term
1513 /// \returns vector product
/// \details VecPolyMultiply01LE() performs polynomial multiplication and presents
1515 /// the result like Intel's <tt>c = _mm_clmulepi64_si128(a, b, 0x01)</tt>.
1516 /// The <tt>0x01</tt> indicates the low 64-bits of <tt>a</tt> and high
1517 /// 64-bits of <tt>b</tt> are multiplied.
1518 /// \note An Intel XMM register is composed of 128-bits. The leftmost bit
1519 /// is MSB and numbered 127, while the the rightmost bit is LSB and numbered 0.
1520 /// \par Wraps
1521 /// __vpmsumd, __builtin_altivec_crypto_vpmsumd and __builtin_crypto_vpmsumd.
1522 /// \since Crypto++ 8.0
1524 {
1525 #if (CRYPTOPP_BIG_ENDIAN)
1526  return VecSwapWords(VecPolyMultiply(a, VecGetHigh(b)));
1527 #else
1528  return VecPolyMultiply(a, VecGetHigh(b));
1529 #endif
1530 }
1531 
1532 /// \brief Polynomial multiplication
1533 /// \param a the first term
1534 /// \param b the second term
1535 /// \returns vector product
1536 /// \details VecPolyMultiply10LE() performs polynomial multiplication and presents
1537 /// the result like Intel's <tt>c = _mm_clmulepi64_si128(a, b, 0x10)</tt>.
1538 /// The <tt>0x10</tt> indicates the high 64-bits of <tt>a</tt> and low
1539 /// 64-bits of <tt>b</tt> are multiplied.
1540 /// \note An Intel XMM register is composed of 128-bits. The leftmost bit
1541 /// is MSB and numbered 127, while the the rightmost bit is LSB and numbered 0.
1542 /// \par Wraps
1543 /// __vpmsumd, __builtin_altivec_crypto_vpmsumd and __builtin_crypto_vpmsumd.
1544 /// \since Crypto++ 8.0
1546 {
1547 #if (CRYPTOPP_BIG_ENDIAN)
1548  return VecSwapWords(VecPolyMultiply(VecGetHigh(a), b));
1549 #else
1550  return VecPolyMultiply(VecGetHigh(a), b);
1551 #endif
1552 }
1553 
1554 /// \brief Polynomial multiplication
1555 /// \param a the first term
1556 /// \param b the second term
1557 /// \returns vector product
1558 /// \details VecPolyMultiply11LE() performs polynomial multiplication and presents
1559 /// the result like Intel's <tt>c = _mm_clmulepi64_si128(a, b, 0x11)</tt>.
1560 /// The <tt>0x11</tt> indicates the high 64-bits of <tt>a</tt> and <tt>b</tt>
1561 /// are multiplied.
1562 /// \note An Intel XMM register is composed of 128-bits. The leftmost bit
1563 /// is MSB and numbered 127, while the the rightmost bit is LSB and numbered 0.
1564 /// \par Wraps
1565 /// __vpmsumd, __builtin_altivec_crypto_vpmsumd and __builtin_crypto_vpmsumd.
1566 /// \since Crypto++ 8.0
1568 {
1569 #if (CRYPTOPP_BIG_ENDIAN)
1570  return VecSwapWords(VecPolyMultiply(VecGetLow(a), b));
1571 #else
1572  return VecPolyMultiply(VecGetLow(a), b);
1573 #endif
1574 }
1575 
1576 //@}
1577 
1578 /// \name AES ENCRYPTION
1579 //@{
1580 
1581 /// \brief One round of AES encryption
1582 /// \tparam T1 vector type
1583 /// \tparam T2 vector type
1584 /// \param state the state vector
1585 /// \param key the subkey vector
1586 /// \details VecEncrypt() performs one round of AES encryption of state
1587 /// using subkey key. The return vector is the same type as vec1.
1588 /// \details VecEncrypt() is available on POWER8 and above.
1589 /// \par Wraps
1590 /// __vcipher, __builtin_altivec_crypto_vcipher, __builtin_crypto_vcipher
1591 /// \since GCC and XLC since Crypto++ 6.0, LLVM Clang since Crypto++ 8.0
template <class T1, class T2>
inline T1 VecEncrypt(const T1 state, const T2 key)
{
    // Dispatch on compiler: IBM XL, Clang and GCC each spell the
    // vcipher builtin differently.
#if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
    return (T1)__vcipher((uint8x16_p)state, (uint8x16_p)key);
#elif defined(__clang__)
    return (T1)__builtin_altivec_crypto_vcipher((uint64x2_p)state, (uint64x2_p)key);
#elif defined(__GNUC__)
    return (T1)__builtin_crypto_vcipher((uint64x2_p)state, (uint64x2_p)key);
#else
    // Unsupported compiler. NOTE(review): this branch asserts but does not
    // return a value — relies on never being compiled on supported toolchains.
    CRYPTOPP_ASSERT(0);
#endif
}
1605 
1606 /// \brief Final round of AES encryption
1607 /// \tparam T1 vector type
1608 /// \tparam T2 vector type
1609 /// \param state the state vector
1610 /// \param key the subkey vector
1611 /// \details VecEncryptLast() performs the final round of AES encryption
1612 /// of state using subkey key. The return vector is the same type as vec1.
1613 /// \details VecEncryptLast() is available on POWER8 and above.
1614 /// \par Wraps
1615 /// __vcipherlast, __builtin_altivec_crypto_vcipherlast, __builtin_crypto_vcipherlast
1616 /// \since GCC and XLC since Crypto++ 6.0, LLVM Clang since Crypto++ 8.0
template <class T1, class T2>
inline T1 VecEncryptLast(const T1 state, const T2 key)
{
    // Dispatch on compiler: IBM XL, Clang and GCC each spell the
    // vcipherlast builtin differently.
#if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
    return (T1)__vcipherlast((uint8x16_p)state, (uint8x16_p)key);
#elif defined(__clang__)
    return (T1)__builtin_altivec_crypto_vcipherlast((uint64x2_p)state, (uint64x2_p)key);
#elif defined(__GNUC__)
    return (T1)__builtin_crypto_vcipherlast((uint64x2_p)state, (uint64x2_p)key);
#else
    // Unsupported compiler. NOTE(review): asserts without returning a value.
    CRYPTOPP_ASSERT(0);
#endif
}
1630 
1631 /// \brief One round of AES decryption
1632 /// \tparam T1 vector type
1633 /// \tparam T2 vector type
1634 /// \param state the state vector
1635 /// \param key the subkey vector
1636 /// \details VecDecrypt() performs one round of AES decryption of state
1637 /// using subkey key. The return vector is the same type as vec1.
1638 /// \details VecDecrypt() is available on POWER8 and above.
1639 /// \par Wraps
1640 /// __vncipher, __builtin_altivec_crypto_vncipher, __builtin_crypto_vncipher
1641 /// \since GCC and XLC since Crypto++ 6.0, LLVM Clang since Crypto++ 8.0
template <class T1, class T2>
inline T1 VecDecrypt(const T1 state, const T2 key)
{
    // Dispatch on compiler: IBM XL, Clang and GCC each spell the
    // vncipher builtin differently.
#if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
    return (T1)__vncipher((uint8x16_p)state, (uint8x16_p)key);
#elif defined(__clang__)
    return (T1)__builtin_altivec_crypto_vncipher((uint64x2_p)state, (uint64x2_p)key);
#elif defined(__GNUC__)
    return (T1)__builtin_crypto_vncipher((uint64x2_p)state, (uint64x2_p)key);
#else
    // Unsupported compiler. NOTE(review): asserts without returning a value.
    CRYPTOPP_ASSERT(0);
#endif
}
1655 
1656 /// \brief Final round of AES decryption
1657 /// \tparam T1 vector type
1658 /// \tparam T2 vector type
1659 /// \param state the state vector
1660 /// \param key the subkey vector
1661 /// \details VecDecryptLast() performs the final round of AES decryption
1662 /// of state using subkey key. The return vector is the same type as vec1.
1663 /// \details VecDecryptLast() is available on POWER8 and above.
1664 /// \par Wraps
1665 /// __vncipherlast, __builtin_altivec_crypto_vncipherlast, __builtin_crypto_vncipherlast
1666 /// \since GCC and XLC since Crypto++ 6.0, LLVM Clang since Crypto++ 8.0
template <class T1, class T2>
inline T1 VecDecryptLast(const T1 state, const T2 key)
{
    // Dispatch on compiler: IBM XL, Clang and GCC each spell the
    // vncipherlast builtin differently.
#if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
    return (T1)__vncipherlast((uint8x16_p)state, (uint8x16_p)key);
#elif defined(__clang__)
    return (T1)__builtin_altivec_crypto_vncipherlast((uint64x2_p)state, (uint64x2_p)key);
#elif defined(__GNUC__)
    return (T1)__builtin_crypto_vncipherlast((uint64x2_p)state, (uint64x2_p)key);
#else
    // Unsupported compiler. NOTE(review): asserts without returning a value.
    CRYPTOPP_ASSERT(0);
#endif
}
1680 
1681 //@}
1682 
1683 /// \name SHA DIGESTS
1684 //@{
1685 
1686 /// \brief SHA256 Sigma functions
1687 /// \tparam func function
1688 /// \tparam fmask function mask
1689 /// \tparam T vector type
1690 /// \param vec the block to transform
1691 /// \details VecSHA256() selects sigma0, sigma1, Sigma0, Sigma1 based on
1692 /// func and fmask. The return vector is the same type as vec.
1693 /// \details VecSHA256() is available on POWER8 and above.
1694 /// \par Wraps
1695 /// __vshasigmaw, __builtin_altivec_crypto_vshasigmaw, __builtin_crypto_vshasigmaw
1696 /// \since GCC and XLC since Crypto++ 6.0, LLVM Clang since Crypto++ 8.0
1697 template <int func, int fmask, class T>
1698 inline T VecSHA256(const T vec)
1699 {
1700 #if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
1701  return (T)__vshasigmaw((uint32x4_p)vec, func, fmask);
1702 #elif defined(__clang__)
1703  return (T)__builtin_altivec_crypto_vshasigmaw((uint32x4_p)vec, func, fmask);
1704 #elif defined(__GNUC__)
1705  return (T)__builtin_crypto_vshasigmaw((uint32x4_p)vec, func, fmask);
1706 #else
1707  CRYPTOPP_ASSERT(0);
1708 #endif
1709 }
1710 
1711 /// \brief SHA512 Sigma functions
1712 /// \tparam func function
1713 /// \tparam fmask function mask
1714 /// \tparam T vector type
1715 /// \param vec the block to transform
1716 /// \details VecSHA512() selects sigma0, sigma1, Sigma0, Sigma1 based on
1717 /// func and fmask. The return vector is the same type as vec.
1718 /// \details VecSHA512() is available on POWER8 and above.
1719 /// \par Wraps
1720 /// __vshasigmad, __builtin_altivec_crypto_vshasigmad, __builtin_crypto_vshasigmad
1721 /// \since GCC and XLC since Crypto++ 6.0, LLVM Clang since Crypto++ 8.0
1722 template <int func, int fmask, class T>
1723 inline T VecSHA512(const T vec)
1724 {
1725 #if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
1726  return (T)__vshasigmad((uint64x2_p)vec, func, fmask);
1727 #elif defined(__clang__)
1728  return (T)__builtin_altivec_crypto_vshasigmad((uint64x2_p)vec, func, fmask);
1729 #elif defined(__GNUC__)
1730  return (T)__builtin_crypto_vshasigmad((uint64x2_p)vec, func, fmask);
1731 #else
1732  CRYPTOPP_ASSERT(0);
1733 #endif
1734 }
1735 
1736 //@}
1737 
1738 #endif // __CRYPTO__
1739 
1740 #endif // _ALTIVEC_
1741 
1742 NAMESPACE_END
1743 
1744 #if CRYPTOPP_GCC_DIAGNOSTIC_AVAILABLE
1745 # pragma GCC diagnostic pop
1746 #endif
1747 
1748 #endif // CRYPTOPP_PPC_CRYPTO_H
T1 VecDecryptLast(const T1 state, const T2 key)
Final round of AES decryption.
Definition: ppc_simd.h:1668
Utility functions for the Crypto++ library.
T VecSHA512(const T vec)
SHA512 Sigma functions.
Definition: ppc_simd.h:1723
T VecShiftRightOctet(const T vec)
Shift a vector right.
Definition: ppc_simd.h:1103
uint32x4_p VecPolyMultiply(const uint32x4_p &a, const uint32x4_p &b)
Polynomial multiplication.
Definition: ppc_simd.h:1452
uint32x4_p VecLoadAligned(const byte src[16])
Loads a vector from an aligned byte array.
Definition: ppc_simd.h:383
T VecReverse(const T data)
Reverse bytes in a vector.
Definition: ppc_simd.h:169
T VecGetLow(const T val)
Extract a dword from a vector.
Definition: ppc_simd.h:1363
T1 VecSub(const T1 vec1, const T2 vec2)
Subtract two vectors.
Definition: ppc_simd.h:956
uint32x4_p VecAdd64(const uint32x4_p &vec1, const uint32x4_p &vec2)
Add two vectors.
Definition: ppc_simd.h:974
uint64x2_p VecPolyMultiply01LE(const uint64x2_p &a, const uint64x2_p &b)
Polynomial multiplication.
Definition: ppc_simd.h:1523
Library configuration file.
T1 VecAdd(const T1 vec1, const T2 vec2)
Add two vectors.
Definition: ppc_simd.h:939
T VecGetHigh(const T val)
Extract a dword from a vector.
Definition: ppc_simd.h:1385
uint64x2_p VecPolyMultiply10LE(const uint64x2_p &a, const uint64x2_p &b)
Polynomial multiplication.
Definition: ppc_simd.h:1545
uint32x4_p VecLoad_ALTIVEC(const byte src[16])
Loads a vector from a byte array.
Definition: ppc_simd.h:193
uint32x4_p VecShiftLeft(const uint32x4_p vec)
Shift a packed vector left.
Definition: ppc_simd.h:1202
T1 VecPermute(const T1 vec, const T2 mask)
Permutes a vector.
Definition: ppc_simd.h:1010
uint64x2_p VecPolyMultiply00LE(const uint64x2_p &a, const uint64x2_p &b)
Polynomial multiplication.
Definition: ppc_simd.h:1501
uint32x4_p VecOne()
The 1 vector.
Definition: ppc_simd.h:154
T VecSwapWords(const T vec)
Exchange high and low double words.
Definition: ppc_simd.h:1346
__vector unsigned int uint32x4_p
Vector of 32-bit elements.
Definition: ppc_simd.h:129
bool VecNotEqual(const T1 vec1, const T2 vec2)
Compare two vectors.
Definition: ppc_simd.h:1424
T VecRotateLeftOctet(const T vec)
Rotate a vector left.
Definition: ppc_simd.h:1142
T VecMergeHigh(const T vec1, const T vec2)
Merge two vectors.
Definition: ppc_simd.h:1217
void VecStoreBE(const T data, byte dest[16])
Stores a vector to a byte array.
Definition: ppc_simd.h:751
void VecStore(const T data, byte dest[16])
Stores a vector to a byte array.
Definition: ppc_simd.h:605
T VecMergeLow(const T vec1, const T vec2)
Merge two vectors.
Definition: ppc_simd.h:1231
#define CRYPTOPP_ASSERT(exp)
Debugging and diagnostic assertion.
Definition: trap.h:69
void VecStore_ALTIVEC(const T data, byte dest[16])
Stores a vector to a byte array.
Definition: ppc_simd.h:528
T VecSHA256(const T vec)
SHA256 Sigma functions.
Definition: ppc_simd.h:1698
T1 VecXor(const T1 vec1, const T2 vec2)
XOR two vectors.
Definition: ppc_simd.h:916
bool VecEqual(const T1 vec1, const T2 vec2)
Compare two vectors.
Definition: ppc_simd.h:1407
uint32x4_p VecZero()
The 0 vector.
Definition: ppc_simd.h:145
__vector unsigned short uint16x8_p
Vector of 16-bit elements.
Definition: ppc_simd.h:124
uint32x4_p VecShiftRight(const uint32x4_p vec)
Shift a packed vector right.
Definition: ppc_simd.h:1296
uint32x4_p VecRotateLeft(const uint32x4_p vec)
Rotate a packed vector left.
Definition: ppc_simd.h:1187
uint32x4_p VecRotateRight(const uint32x4_p vec)
Rotate a packed vector right.
Definition: ppc_simd.h:1281
__vector unsigned long long uint64x2_p
Vector of 64-bit elements.
Definition: ppc_simd.h:139
uint32x4_p VecLoadBE(const byte src[16])
Loads a vector from a byte array.
Definition: ppc_simd.h:440
T1 VecOr(const T1 vec1, const T2 vec2)
OR two vectors.
Definition: ppc_simd.h:899
T1 VecEncryptLast(const T1 state, const T2 key)
Final round of AES encryption.
Definition: ppc_simd.h:1618
Crypto++ library namespace.
T1 VecDecrypt(const T1 state, const T2 key)
One round of AES decryption.
Definition: ppc_simd.h:1643
uint32x4_p VecLoad(const byte src[16])
Loads a vector from a byte array.
Definition: ppc_simd.h:253
T VecRotateRightOctet(const T vec)
Rotate a vector right.
Definition: ppc_simd.h:1167
__vector unsigned char uint8x16_p
Vector of 8-bit elements.
Definition: ppc_simd.h:119
T VecShiftLeftOctet(const T vec)
Shift a vector left.
Definition: ppc_simd.h:1056
T1 VecAnd(const T1 vec1, const T2 vec2)
AND two vectors.
Definition: ppc_simd.h:882
uint64x2_p VecPolyMultiply11LE(const uint64x2_p &a, const uint64x2_p &b)
Polynomial multiplication.
Definition: ppc_simd.h:1567
T1 VecEncrypt(const T1 state, const T2 key)
One round of AES encryption.
Definition: ppc_simd.h:1593