Crypto++  8.2
Free C++ class library of cryptographic schemes
rijndael.cpp
1 // rijndael.cpp - modified by Chris Morgan <cmorgan@wpi.edu>
2 // and Wei Dai from Paulo Barreto's Rijndael implementation
3 // The original code and all modifications are in the public domain.
4 
5 // use "cl /EP /P /DCRYPTOPP_GENERATE_X64_MASM rijndael.cpp" to generate MASM code
6 
7 /*
8 July 2018: Added support for ARMv7 AES instructions via Cryptogams ASM.
9  See the head notes in aes_armv4.S for copyright and license.
10 */
11 
12 /*
13 September 2017: Added support for Power8 AES instructions via compiler intrinsics.
14 */
15 
16 /*
17 July 2017: Added support for ARMv8 AES instructions via compiler intrinsics.
18 */
19 
20 /*
21 July 2010: Added support for AES-NI instructions via compiler intrinsics.
22 */
23 
24 /*
25 Feb 2009: The x86/x64 assembly code was rewritten by Wei Dai to do counter mode
26 caching, which was invented by Hongjun Wu and popularized by Daniel J. Bernstein
27 and Peter Schwabe in their paper "New AES software speed records". The round
28 function was also modified to include a trick similar to one in Brian Gladman's
29 x86 assembly code, doing an 8-bit register move to minimize the number of
30 register spills. Also switched to compressed tables and copying round keys to
31 the stack.
32 
33 The C++ implementation uses compressed tables if
34 CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS is defined.
35 It is defined on x86 platforms by default but no others.
36 */
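/*
Counter mode caching, in brief: consecutive CTR counter blocks differ only in
their low-order byte(s), so the table lookups of the first round and most of
the second round depend on bytes that do not change from block to block and
can be computed once and reused. The assembly below keeps those partial
results around (see the "counter mode one-time setup" and "counter-mode
per-block setup" sections) and redoes only the work that involves the
changing counter byte.
*/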
37 
38 /*
39 July 2006: Defense against timing attacks was added by Wei Dai.
40 
41 The code now uses smaller tables in the first and last rounds,
42 and preloads them into L1 cache before usage (by loading at least
43 one element in each cache line).
44 
45 We try to delay subsequent accesses to each table (used in the first
46 and last rounds) until all of the table has been preloaded. Hopefully
47 the compiler isn't smart enough to optimize that code away.
48 
49 After preloading the table, we also try not to access any memory location
50 other than the table and the stack, in order to prevent table entries from
51 being unloaded from L1 cache, until that round is finished.
52 (Some popular CPUs have 2-way associative caches.)
53 */
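/*
Sketch of the preload (simplified from Rijndael::Enc::ProcessAndXorBlock
below): read one word from each cache line of Te, fold the reads into a value
that is always zero, and OR that zero into the state so the compiler cannot
discard the loads.

    const int cacheLineSize = GetCacheLineSize();
    volatile word32 _u = 0;
    word32 u = _u;
    for (unsigned int i=0; i<sizeof(Te); i+=cacheLineSize)
        u &= *(const word32 *)(const void *)(((const byte *)Te)+i);
    u &= Te[255];
    s0 |= u; s1 |= u; s2 |= u; s3 |= u;
*/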
54 
55 // This is the original introductory comment:
56 
57 /**
58  * version 3.0 (December 2000)
59  *
60  * Optimised ANSI C code for the Rijndael cipher (now AES)
61  *
62  * author Vincent Rijmen <vincent.rijmen@esat.kuleuven.ac.be>
63  * author Antoon Bosselaers <antoon.bosselaers@esat.kuleuven.ac.be>
64  * author Paulo Barreto <paulo.barreto@terra.com.br>
65  *
66  * This code is hereby placed in the public domain.
67  *
68  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
69  * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
70  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
71  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
72  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
73  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
74  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
75  * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
76  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
77  * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
78  * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
79  */
80 
81 #include "pch.h"
82 #include "config.h"
83 
84 #ifndef CRYPTOPP_IMPORTS
85 #ifndef CRYPTOPP_GENERATE_X64_MASM
86 
87 #include "rijndael.h"
88 #include "misc.h"
89 #include "cpu.h"
90 
91 // VS2017 and global optimization bug. TODO: figure out when
92 // we can re-enable full optimizations for VS2017. Also see
93 // https://github.com/weidai11/cryptopp/issues/649
94 #if (_MSC_VER >= 1910)
95 # ifndef CRYPTOPP_DEBUG
96 # pragma optimize("", off)
97 # pragma optimize("ts", on)
98 # endif
99 #endif
100 
101 NAMESPACE_BEGIN(CryptoPP)
102 
103 // Hack for http://github.com/weidai11/cryptopp/issues/42 and http://github.com/weidai11/cryptopp/issues/132
104 #if (CRYPTOPP_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE))
105 # define CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS 1
106 #endif
107 
108 // Clang __m128i casts
109 #define M128I_CAST(x) ((__m128i *)(void *)(x))
110 #define CONST_M128I_CAST(x) ((const __m128i *)(const void *)(x))
111 
112 #if defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
113 # if (CRYPTOPP_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
114 namespace rdtable {CRYPTOPP_ALIGN_DATA(16) word64 Te[256+2];}
115 using namespace rdtable;
116 # else
117 static word64 Te[256];
118 # endif
119 static word64 Td[256];
120 #else // Not CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS
121 # if defined(CRYPTOPP_X64_MASM_AVAILABLE)
122 // Unused; avoids linker error on Microsoft X64 non-AESNI platforms
123 namespace rdtable {CRYPTOPP_ALIGN_DATA(16) word64 Te[256+2];}
124 # endif
125 CRYPTOPP_ALIGN_DATA(16) static word32 Te[256*4];
126 CRYPTOPP_ALIGN_DATA(16) static word32 Td[256*4];
127 #endif // CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS
128 
129 static volatile bool s_TeFilled = false, s_TdFilled = false;
130 
131 ANONYMOUS_NAMESPACE_BEGIN
132 
133 #if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86
134 
135 // Determine whether the range between begin and end overlaps
136 // with the same 4k block offsets as the Te table. Logically,
137 // the code is trying to create the condition:
138 //
139 // Two separate memory pages:
140 //
141 // +-----+ +-----+
142 // |XXXXX| |YYYYY|
143 // |XXXXX| |YYYYY|
144 // | | | |
145 // | | | |
146 // +-----+ +-----+
147 // Te Table Locals
148 //
149 // Have a logical cache view of (X and Y may be inverted):
150 //
151 // +-----+
152 // |XXXXX|
153 // |XXXXX|
154 // |YYYYY|
155 // |YYYYY|
156 // +-----+
157 //
158 static inline bool AliasedWithTable(const byte *begin, const byte *end)
159 {
160  ptrdiff_t s0 = uintptr_t(begin)%4096, s1 = uintptr_t(end)%4096;
161  ptrdiff_t t0 = uintptr_t(Te)%4096, t1 = (uintptr_t(Te)+sizeof(Te))%4096;
162  if (t1 > t0)
163  return (s0 >= t0 && s0 < t1) || (s1 > t0 && s1 <= t1);
164  else
165  return (s0 < t1 || s1 <= t1) || (s0 >= t0 || s1 > t0);
166 }
167 
168 struct Locals
169 {
170  word32 subkeys[4*12], workspace[8];
171  const byte *inBlocks, *inXorBlocks, *outXorBlocks;
172  byte *outBlocks;
173  size_t inIncrement, inXorIncrement, outXorIncrement, outIncrement;
174  size_t regSpill, lengthAndCounterFlag, keysBegin;
175 };
176 
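// The workspace is over-allocated by one 4 KB page plus one 256-byte block so
// that it can be slid to a 256-byte boundary whose page offsets do not collide
// with the Te table (see AliasedWithTable above).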
177 const size_t s_aliasPageSize = 4096;
178 const size_t s_aliasBlockSize = 256;
179 const size_t s_sizeToAllocate = s_aliasPageSize + s_aliasBlockSize + sizeof(Locals);
180 
181 #endif // CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86
182 
183 ANONYMOUS_NAMESPACE_END
184 
185 // ************************* Portable Code ************************************
186 
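// QUARTER_ROUND processes one 32-bit state word: each byte of t selects an
// entry of table T through the accessor macro L (TL_F in the first round,
// TL_M in the middle rounds) and the result is XORed into one of the output
// words a, b, c, d. The *_LE and *_LD variants implement the last encryption
// and decryption rounds, which extract single S-box output bytes rather than
// XORing full MixColumns table words.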
187 #define QUARTER_ROUND(L, T, t, a, b, c, d) \
188  a ^= L(T, 3, byte(t)); t >>= 8;\
189  b ^= L(T, 2, byte(t)); t >>= 8;\
190  c ^= L(T, 1, byte(t)); t >>= 8;\
191  d ^= L(T, 0, t);
192 
193 #define QUARTER_ROUND_LE(t, a, b, c, d) \
194  tempBlock[a] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
195  tempBlock[b] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
196  tempBlock[c] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
197  tempBlock[d] = ((byte *)(Te+t))[1];
198 
199 #if defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
200  #define QUARTER_ROUND_LD(t, a, b, c, d) \
201  tempBlock[a] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\
202  tempBlock[b] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\
203  tempBlock[c] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\
204  tempBlock[d] = ((byte *)(Td+t))[GetNativeByteOrder()*7];
205 #else
206  #define QUARTER_ROUND_LD(t, a, b, c, d) \
207  tempBlock[a] = Sd[byte(t)]; t >>= 8;\
208  tempBlock[b] = Sd[byte(t)]; t >>= 8;\
209  tempBlock[c] = Sd[byte(t)]; t >>= 8;\
210  tempBlock[d] = Sd[t];
211 #endif
212 
213 #define QUARTER_ROUND_E(t, a, b, c, d) QUARTER_ROUND(TL_M, Te, t, a, b, c, d)
214 #define QUARTER_ROUND_D(t, a, b, c, d) QUARTER_ROUND(TL_M, Td, t, a, b, c, d)
215 
216 #if (CRYPTOPP_LITTLE_ENDIAN)
217  #define QUARTER_ROUND_FE(t, a, b, c, d) QUARTER_ROUND(TL_F, Te, t, d, c, b, a)
218  #define QUARTER_ROUND_FD(t, a, b, c, d) QUARTER_ROUND(TL_F, Td, t, d, c, b, a)
219  #if defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
220  #define TL_F(T, i, x) (*(word32 *)(void *)((byte *)T + x*8 + (6-i)%4+1))
221  #define TL_M(T, i, x) (*(word32 *)(void *)((byte *)T + x*8 + (i+3)%4+1))
222  #else
223  #define TL_F(T, i, x) rotrFixed(T[x], (3-i)*8)
224  #define TL_M(T, i, x) T[i*256 + x]
225  #endif
226 #else
227  #define QUARTER_ROUND_FE(t, a, b, c, d) QUARTER_ROUND(TL_F, Te, t, a, b, c, d)
228  #define QUARTER_ROUND_FD(t, a, b, c, d) QUARTER_ROUND(TL_F, Td, t, a, b, c, d)
229  #if defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
230  #define TL_F(T, i, x) (*(word32 *)(void *)((byte *)T + x*8 + (4-i)%4))
231  #define TL_M TL_F
232  #else
233  #define TL_F(T, i, x) rotrFixed(T[x], i*8)
234  #define TL_M(T, i, x) T[i*256 + x]
235  #endif
236 #endif
237 
238 
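// f2..fe multiply a byte by 2, 4, 8, 3, 9, 0x0b, 0x0d and 0x0e in GF(2^8)
// modulo the AES polynomial x^8 + x^4 + x^3 + x + 1 (0x11b). FillEncTable and
// FillDecTable use them to build the MixColumns and InvMixColumns tables.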
239 #define f2(x) ((x<<1)^(((x>>7)&1)*0x11b))
240 #define f4(x) ((x<<2)^(((x>>6)&1)*0x11b)^(((x>>6)&2)*0x11b))
241 #define f8(x) ((x<<3)^(((x>>5)&1)*0x11b)^(((x>>5)&2)*0x11b)^(((x>>5)&4)*0x11b))
242 
243 #define f3(x) (f2(x) ^ x)
244 #define f9(x) (f8(x) ^ x)
245 #define fb(x) (f8(x) ^ f2(x) ^ x)
246 #define fd(x) (f8(x) ^ f4(x) ^ x)
247 #define fe(x) (f8(x) ^ f4(x) ^ f2(x))
248 
249 unsigned int Rijndael::Base::OptimalDataAlignment() const
250 {
251 #if (CRYPTOPP_AESNI_AVAILABLE)
252  if (HasAESNI())
253  return 1;
254 #endif
255 #if (CRYPTOPP_ARM_AES_AVAILABLE)
256  if (HasAES())
257  return 1;
258 #endif
259 #if (CRYPTOGAMS_ARM_AES)
260  if (HasARMv7())
261  return 1;
262 #endif
263 #if (CRYPTOPP_POWER8_AES_AVAILABLE)
264  if (HasAES())
265  return 1;
266 #endif
267  return BlockTransformation::OptimalDataAlignment();
268 }
269 
270 void Rijndael::Base::FillEncTable()
271 {
272  for (int i=0; i<256; i++)
273  {
274  byte x = Se[i];
275 #if defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
276  word32 y = word32(x)<<8 | word32(x)<<16 | word32(f2(x))<<24;
277  Te[i] = word64(y | f3(x))<<32 | y;
278 #else
279  word32 y = f3(x) | word32(x)<<8 | word32(x)<<16 | word32(f2(x))<<24;
280  for (int j=0; j<4; j++)
281  {
282  Te[i+j*256] = y;
283  y = rotrConstant<8>(y);
284  }
285 #endif
286  }
287 #if (CRYPTOPP_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
288  Te[256] = Te[257] = 0;
289 #endif
290  s_TeFilled = true;
291 }
292 
293 void Rijndael::Base::FillDecTable()
294 {
295  for (int i=0; i<256; i++)
296  {
297  byte x = Sd[i];
298 #if defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
299  word32 y = word32(fd(x))<<8 | word32(f9(x))<<16 | word32(fe(x))<<24;
300  Td[i] = word64(y | fb(x))<<32 | y | x;
301 #else
302  word32 y = fb(x) | word32(fd(x))<<8 | word32(f9(x))<<16 | word32(fe(x))<<24;
303  for (int j=0; j<4; j++)
304  {
305  Td[i+j*256] = y;
306  y = rotrConstant<8>(y);
307  }
308 #endif
309  }
310  s_TdFilled = true;
311 }
312 
313 #if (CRYPTOPP_AESNI_AVAILABLE)
314 extern void Rijndael_UncheckedSetKey_SSE4_AESNI(const byte *userKey, size_t keyLen, word32* rk);
315 extern void Rijndael_UncheckedSetKeyRev_AESNI(word32 *key, unsigned int rounds);
316 
317 extern size_t Rijndael_Enc_AdvancedProcessBlocks_AESNI(const word32 *subkeys, size_t rounds,
318  const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
319 extern size_t Rijndael_Dec_AdvancedProcessBlocks_AESNI(const word32 *subkeys, size_t rounds,
320  const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
321 #endif
322 
323 #if (CRYPTOPP_ARM_AES_AVAILABLE)
324 extern size_t Rijndael_Enc_AdvancedProcessBlocks_ARMV8(const word32 *subkeys, size_t rounds,
325  const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
326 extern size_t Rijndael_Dec_AdvancedProcessBlocks_ARMV8(const word32 *subkeys, size_t rounds,
327  const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
328 #endif
329 
330 #if (CRYPTOGAMS_ARM_AES)
331 extern "C" int AES_set_encrypt_key(const unsigned char *userKey, const int bitLen, word32 *rkey);
332 extern "C" int AES_set_decrypt_key(const unsigned char *userKey, const int bitLen, word32 *rkey);
333 extern "C" void AES_encrypt(const unsigned char in[16], unsigned char out[16], const word32 *rkey);
334 extern "C" void AES_decrypt(const unsigned char in[16], unsigned char out[16], const word32 *rkey);
335 #endif
336 
337 #if (CRYPTOPP_POWER8_AES_AVAILABLE)
338 extern void Rijndael_UncheckedSetKey_POWER8(const byte* userKey, size_t keyLen,
339  word32* rk, const byte* Se);
340 
341 extern size_t Rijndael_Enc_AdvancedProcessBlocks128_6x1_ALTIVEC(const word32 *subkeys, size_t rounds,
342  const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
343 extern size_t Rijndael_Dec_AdvancedProcessBlocks128_6x1_ALTIVEC(const word32 *subkeys, size_t rounds,
344  const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
345 #endif
346 
347 #if (CRYPTOGAMS_ARM_AES)
348 int CRYPTOGAMS_set_encrypt_key(const byte *userKey, const int bitLen, word32 *rkey)
349 {
350  return AES_set_encrypt_key(userKey, bitLen, rkey);
351 }
352 int CRYPTOGAMS_set_decrypt_key(const byte *userKey, const int bitLen, word32 *rkey)
353 {
354  return AES_set_decrypt_key(userKey, bitLen, rkey);
355 }
356 void CRYPTOGAMS_encrypt(const byte *inBlock, const byte *xorBlock, byte *outBlock, const word32 *rkey)
357 {
358  AES_encrypt(inBlock, outBlock, rkey);
359  if (xorBlock)
360  xorbuf (outBlock, xorBlock, 16);
361 }
362 void CRYPTOGAMS_decrypt(const byte *inBlock, const byte *xorBlock, byte *outBlock, const word32 *rkey)
363 {
364  AES_decrypt(inBlock, outBlock, rkey);
365  if (xorBlock)
366  xorbuf (outBlock, xorBlock, 16);
367 }
368 #endif
369 
370 std::string Rijndael::Base::AlgorithmProvider() const
371 {
372 #if (CRYPTOPP_AESNI_AVAILABLE)
373  if (HasAESNI())
374  return "AESNI";
375 #endif
376 #if CRYPTOPP_SSE2_ASM_AVAILABLE && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
377  if (HasSSE2())
378  return "SSE2";
379 #endif
380 #if (CRYPTOPP_ARM_AES_AVAILABLE)
381  if (HasAES())
382  return "ARMv8";
383 #endif
384 #if (CRYPTOGAMS_ARM_AES)
385  if (HasARMv7())
386  return "ARMv7";
387 #endif
388 #if (CRYPTOPP_POWER8_AES_AVAILABLE)
389  if (HasAES())
390  return "Power8";
391 #endif
392  return "C++";
393 }
394 
395 void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keyLen, const NameValuePairs &)
396 {
397  AssertValidKeyLength(keyLen);
398 
399 #if (CRYPTOGAMS_ARM_AES)
400  if (HasARMv7())
401  {
402  m_rounds = keyLen/4 + 6;
403  m_key.New(4*(15+1)+4);
404 
405  if (IsForwardTransformation())
406  CRYPTOGAMS_set_encrypt_key(userKey, keyLen*8, m_key.begin());
407  else
408  CRYPTOGAMS_set_decrypt_key(userKey, keyLen*8, m_key.begin());
409  return;
410  }
411 #endif
412 
413 #if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86
414  m_aliasBlock.New(s_sizeToAllocate);
415  // The alias block is only used on IA-32 when unaligned data access is in effect.
416  // Setting the low water mark to 0 avoids zeroization when m_aliasBlock is unused.
417  m_aliasBlock.SetMark(0);
418 #endif
419 
420  m_rounds = keyLen/4 + 6;
421  m_key.New(4*(m_rounds+1));
422  word32 *rk = m_key;
423 
424 #if (CRYPTOPP_AESNI_AVAILABLE && CRYPTOPP_SSE41_AVAILABLE && (!defined(_MSC_VER) || _MSC_VER >= 1600 || CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32))
425  // MSVC 2008 SP1 generates bad code for _mm_extract_epi32() when compiling for X64
426  if (HasAESNI() && HasSSE41())
427  {
428  // TODO: Add non-SSE4.1 variant for low-end Atoms. The low-end
429  // Atoms have SSE2-SSSE3 and AES-NI, but not SSE4.1 or SSE4.2.
430  Rijndael_UncheckedSetKey_SSE4_AESNI(userKey, keyLen, rk);
431  if (!IsForwardTransformation())
432  Rijndael_UncheckedSetKeyRev_AESNI(m_key, m_rounds);
433 
434  return;
435  }
436 #endif
437 
438 #if CRYPTOPP_POWER8_AES_AVAILABLE
439  if (HasAES())
440  {
441  // We still need rcon and Se to fall back to C/C++ for AES-192 and AES-256.
442  // The IBM docs on AES suck. Intel's docs on AESNI put IBM to shame.
443  Rijndael_UncheckedSetKey_POWER8(userKey, keyLen, rk, Se);
444  return;
445  }
446 #endif
447 
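// Standard AES key expansion (FIPS 197): each new round-key word is the word
// one key length back XORed with the previous word; every (keyLen/4)-th word
// is first rotated, run through the S-box and XORed with the round constant,
// and 256-bit keys get an additional SubWord step halfway through each group.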
448  GetUserKey(BIG_ENDIAN_ORDER, rk, keyLen/4, userKey, keyLen);
449  const word32 *rc = rcon;
450  word32 temp;
451 
452  while (true)
453  {
454  temp = rk[keyLen/4-1];
455  word32 x = (word32(Se[GETBYTE(temp, 2)]) << 24) ^ (word32(Se[GETBYTE(temp, 1)]) << 16) ^
456  (word32(Se[GETBYTE(temp, 0)]) << 8) ^ Se[GETBYTE(temp, 3)];
457  rk[keyLen/4] = rk[0] ^ x ^ *(rc++);
458  rk[keyLen/4+1] = rk[1] ^ rk[keyLen/4];
459  rk[keyLen/4+2] = rk[2] ^ rk[keyLen/4+1];
460  rk[keyLen/4+3] = rk[3] ^ rk[keyLen/4+2];
461 
462  if (rk + keyLen/4 + 4 == m_key.end())
463  break;
464 
465  if (keyLen == 24)
466  {
467  rk[10] = rk[ 4] ^ rk[ 9];
468  rk[11] = rk[ 5] ^ rk[10];
469  }
470  else if (keyLen == 32)
471  {
472  temp = rk[11];
473  rk[12] = rk[ 4] ^ (word32(Se[GETBYTE(temp, 3)]) << 24) ^ (word32(Se[GETBYTE(temp, 2)]) << 16) ^ (word32(Se[GETBYTE(temp, 1)]) << 8) ^ Se[GETBYTE(temp, 0)];
474  rk[13] = rk[ 5] ^ rk[12];
475  rk[14] = rk[ 6] ^ rk[13];
476  rk[15] = rk[ 7] ^ rk[14];
477  }
478  rk += keyLen/4;
479  }
480 
481  rk = m_key;
482 
483  if (IsForwardTransformation())
484  {
485  if (!s_TeFilled)
486  FillEncTable();
487 
488  ConditionalByteReverse(BIG_ENDIAN_ORDER, rk, rk, 16);
489  ConditionalByteReverse(BIG_ENDIAN_ORDER, rk + m_rounds*4, rk + m_rounds*4, 16);
490  }
491  else
492  {
493  if (!s_TdFilled)
494  FillDecTable();
495 
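// For decryption the schedule becomes the "equivalent inverse cipher" key
// schedule: the round keys are reversed and the inner ones are passed through
// InvMixColumns (computed below from the Td tables via the encryption S-box),
// so decryption can run its rounds in the same forward order as encryption.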
496  #define InverseMixColumn(x) \
497  TL_M(Td, 0, Se[GETBYTE(x, 3)]) ^ TL_M(Td, 1, Se[GETBYTE(x, 2)]) ^ \
498  TL_M(Td, 2, Se[GETBYTE(x, 1)]) ^ TL_M(Td, 3, Se[GETBYTE(x, 0)])
499 
500  unsigned int i, j;
501  for (i = 4, j = 4*m_rounds-4; i < j; i += 4, j -= 4)
502  {
503  temp = InverseMixColumn(rk[i ]); rk[i ] = InverseMixColumn(rk[j ]); rk[j ] = temp;
504  temp = InverseMixColumn(rk[i + 1]); rk[i + 1] = InverseMixColumn(rk[j + 1]); rk[j + 1] = temp;
505  temp = InverseMixColumn(rk[i + 2]); rk[i + 2] = InverseMixColumn(rk[j + 2]); rk[j + 2] = temp;
506  temp = InverseMixColumn(rk[i + 3]); rk[i + 3] = InverseMixColumn(rk[j + 3]); rk[j + 3] = temp;
507  }
508 
509  rk[i+0] = InverseMixColumn(rk[i+0]);
510  rk[i+1] = InverseMixColumn(rk[i+1]);
511  rk[i+2] = InverseMixColumn(rk[i+2]);
512  rk[i+3] = InverseMixColumn(rk[i+3]);
513 
514  temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[0]); rk[0] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+0]); rk[4*m_rounds+0] = temp;
515  temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[1]); rk[1] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+1]); rk[4*m_rounds+1] = temp;
516  temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[2]); rk[2] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+2]); rk[4*m_rounds+2] = temp;
517  temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[3]); rk[3] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+3]); rk[4*m_rounds+3] = temp;
518  }
519 
520 #if CRYPTOPP_AESNI_AVAILABLE
521  if (HasAESNI())
522  ConditionalByteReverse(BIG_ENDIAN_ORDER, rk+4, rk+4, (m_rounds-1)*16);
523 #endif
524 #if CRYPTOPP_ARM_AES_AVAILABLE
525  if (HasAES())
526  ConditionalByteReverse(BIG_ENDIAN_ORDER, rk+4, rk+4, (m_rounds-1)*16);
527 #endif
528 }
529 
530 void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
531 {
532 #if CRYPTOPP_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE) || CRYPTOPP_AESNI_AVAILABLE
533 # if (CRYPTOPP_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
534  if (HasSSE2())
535 # else
536  if (HasAESNI())
537 # endif
538  {
539  (void)Rijndael::Enc::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
540  return;
541  }
542 #endif
543 
544 #if (CRYPTOPP_ARM_AES_AVAILABLE)
545  if (HasAES())
546  {
547  (void)Rijndael::Enc::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
548  return;
549  }
550 #endif
551 
552 #if (CRYPTOGAMS_ARM_AES)
553  if (HasARMv7())
554  {
555  CRYPTOGAMS_encrypt(inBlock, xorBlock, outBlock, m_key.begin());
556  return;
557  }
558 #endif
559 
560 #if (CRYPTOPP_POWER8_AES_AVAILABLE)
561  if (HasAES())
562  {
563  (void)Rijndael::Enc::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
564  return;
565  }
566 #endif
567 
568  typedef BlockGetAndPut<word32, NativeByteOrder> Block;
569 
570  word32 s0, s1, s2, s3, t0, t1, t2, t3;
571  Block::Get(inBlock)(s0)(s1)(s2)(s3);
572 
573  const word32 *rk = m_key;
574  s0 ^= rk[0];
575  s1 ^= rk[1];
576  s2 ^= rk[2];
577  s3 ^= rk[3];
578  t0 = rk[4];
579  t1 = rk[5];
580  t2 = rk[6];
581  t3 = rk[7];
582  rk += 8;
583 
584  // timing attack countermeasure. see comments at top for more details.
585  // also see http://github.com/weidai11/cryptopp/issues/146
586  const int cacheLineSize = GetCacheLineSize();
587  unsigned int i;
588  volatile word32 _u = 0;
589  word32 u = _u;
590 #if defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
591  for (i=0; i<2048; i+=cacheLineSize)
592 #else
593  for (i=0; i<1024; i+=cacheLineSize)
594 #endif
595  u &= *(const word32 *)(const void *)(((const byte *)Te)+i);
596  u &= Te[255];
597  s0 |= u; s1 |= u; s2 |= u; s3 |= u;
598 
599  QUARTER_ROUND_FE(s3, t0, t1, t2, t3)
600  QUARTER_ROUND_FE(s2, t3, t0, t1, t2)
601  QUARTER_ROUND_FE(s1, t2, t3, t0, t1)
602  QUARTER_ROUND_FE(s0, t1, t2, t3, t0)
603 
604  // Nr - 2 full rounds:
605  unsigned int r = m_rounds/2 - 1;
606  do
607  {
608  s0 = rk[0]; s1 = rk[1]; s2 = rk[2]; s3 = rk[3];
609 
610  QUARTER_ROUND_E(t3, s0, s1, s2, s3)
611  QUARTER_ROUND_E(t2, s3, s0, s1, s2)
612  QUARTER_ROUND_E(t1, s2, s3, s0, s1)
613  QUARTER_ROUND_E(t0, s1, s2, s3, s0)
614 
615  t0 = rk[4]; t1 = rk[5]; t2 = rk[6]; t3 = rk[7];
616 
617  QUARTER_ROUND_E(s3, t0, t1, t2, t3)
618  QUARTER_ROUND_E(s2, t3, t0, t1, t2)
619  QUARTER_ROUND_E(s1, t2, t3, t0, t1)
620  QUARTER_ROUND_E(s0, t1, t2, t3, t0)
621 
622  rk += 8;
623  } while (--r);
624 
625  word32 tbw[4];
626  byte *const tempBlock = (byte *)tbw;
627 
628  QUARTER_ROUND_LE(t2, 15, 2, 5, 8)
629  QUARTER_ROUND_LE(t1, 11, 14, 1, 4)
630  QUARTER_ROUND_LE(t0, 7, 10, 13, 0)
631  QUARTER_ROUND_LE(t3, 3, 6, 9, 12)
632 
633  Block::Put(xorBlock, outBlock)(tbw[0]^rk[0])(tbw[1]^rk[1])(tbw[2]^rk[2])(tbw[3]^rk[3]);
634 }
635 
636 void Rijndael::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
637 {
638 #if CRYPTOPP_AESNI_AVAILABLE
639  if (HasAESNI())
640  {
641  (void)Rijndael::Dec::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
642  return;
643  }
644 #endif
645 
646 #if (CRYPTOPP_ARM_AES_AVAILABLE)
647  if (HasAES())
648  {
649  (void)Rijndael::Dec::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
650  return;
651  }
652 #endif
653 
654 #if (CRYPTOGAMS_ARM_AES)
655  if (HasARMv7())
656  {
657  CRYPTOGAMS_decrypt(inBlock, xorBlock, outBlock, m_key.begin());
658  return;
659  }
660 #endif
661 
662 #if (CRYPTOPP_POWER8_AES_AVAILABLE)
663  if (HasAES())
664  {
665  (void)Rijndael::Dec::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
666  return;
667  }
668 #endif
669 
670  typedef BlockGetAndPut<word32, NativeByteOrder> Block;
671 
672  word32 s0, s1, s2, s3, t0, t1, t2, t3;
673  Block::Get(inBlock)(s0)(s1)(s2)(s3);
674 
675  const word32 *rk = m_key;
676  s0 ^= rk[0];
677  s1 ^= rk[1];
678  s2 ^= rk[2];
679  s3 ^= rk[3];
680  t0 = rk[4];
681  t1 = rk[5];
682  t2 = rk[6];
683  t3 = rk[7];
684  rk += 8;
685 
686  // timing attack countermeasure. see comments at top for more details.
687  // also see http://github.com/weidai11/cryptopp/issues/146
688  const int cacheLineSize = GetCacheLineSize();
689  unsigned int i;
690  volatile word32 _u = 0;
691  word32 u = _u;
692 #if defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
693  for (i=0; i<2048; i+=cacheLineSize)
694 #else
695  for (i=0; i<1024; i+=cacheLineSize)
696 #endif
697  u &= *(const word32 *)(const void *)(((const byte *)Td)+i);
698  u &= Td[255];
699  s0 |= u; s1 |= u; s2 |= u; s3 |= u;
700 
701  QUARTER_ROUND_FD(s3, t2, t1, t0, t3)
702  QUARTER_ROUND_FD(s2, t1, t0, t3, t2)
703  QUARTER_ROUND_FD(s1, t0, t3, t2, t1)
704  QUARTER_ROUND_FD(s0, t3, t2, t1, t0)
705 
706  // Nr - 2 full rounds:
707  unsigned int r = m_rounds/2 - 1;
708  do
709  {
710  s0 = rk[0]; s1 = rk[1]; s2 = rk[2]; s3 = rk[3];
711 
712  QUARTER_ROUND_D(t3, s2, s1, s0, s3)
713  QUARTER_ROUND_D(t2, s1, s0, s3, s2)
714  QUARTER_ROUND_D(t1, s0, s3, s2, s1)
715  QUARTER_ROUND_D(t0, s3, s2, s1, s0)
716 
717  t0 = rk[4]; t1 = rk[5]; t2 = rk[6]; t3 = rk[7];
718 
719  QUARTER_ROUND_D(s3, t2, t1, t0, t3)
720  QUARTER_ROUND_D(s2, t1, t0, t3, t2)
721  QUARTER_ROUND_D(s1, t0, t3, t2, t1)
722  QUARTER_ROUND_D(s0, t3, t2, t1, t0)
723 
724  rk += 8;
725  } while (--r);
726 
727 #if !(defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS))
728  // timing attack countermeasure. see comments at top for more details
729  // If CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS is defined,
730  // QUARTER_ROUND_LD will use Td, which is already preloaded.
731  u = _u;
732  for (i=0; i<256; i+=cacheLineSize)
733  u &= *(const word32 *)(const void *)(Sd+i);
734  u &= *(const word32 *)(const void *)(Sd+252);
735  t0 |= u; t1 |= u; t2 |= u; t3 |= u;
736 #endif
737 
738  word32 tbw[4];
739  byte *const tempBlock = (byte *)tbw;
740 
741  QUARTER_ROUND_LD(t2, 7, 2, 13, 8)
742  QUARTER_ROUND_LD(t1, 3, 14, 9, 4)
743  QUARTER_ROUND_LD(t0, 15, 10, 5, 0)
744  QUARTER_ROUND_LD(t3, 11, 6, 1, 12)
745 
746  Block::Put(xorBlock, outBlock)(tbw[0]^rk[0])(tbw[1]^rk[1])(tbw[2]^rk[2])(tbw[3]^rk[3]);
747 }
748 
749 // ************************* Assembly Code ************************************
750 
751 #if CRYPTOPP_MSC_VERSION
752 # pragma warning(disable: 4731) // frame pointer register 'ebp' modified by inline assembly code
753 #endif
754 
755 #endif // #ifndef CRYPTOPP_GENERATE_X64_MASM
756 
757 #if CRYPTOPP_SSE2_ASM_AVAILABLE && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
758 
759 CRYPTOPP_NAKED void CRYPTOPP_FASTCALL Rijndael_Enc_AdvancedProcessBlocks_SSE2(void *locals, const word32 *k)
760 {
761  CRYPTOPP_UNUSED(locals); CRYPTOPP_UNUSED(k);
762 
763 #if CRYPTOPP_BOOL_X86
764 
765 #define L_REG esp
766 #define L_INDEX(i) (L_REG+768+i)
767 #define L_INXORBLOCKS L_INBLOCKS+4
768 #define L_OUTXORBLOCKS L_INBLOCKS+8
769 #define L_OUTBLOCKS L_INBLOCKS+12
770 #define L_INCREMENTS L_INDEX(16*15)
771 #define L_SP L_INDEX(16*16)
772 #define L_LENGTH L_INDEX(16*16+4)
773 #define L_KEYS_BEGIN L_INDEX(16*16+8)
774 
775 #define MOVD movd
776 #define MM(i) mm##i
777 
778 #define MXOR(a,b,c) \
779  AS2( movzx esi, b)\
780  AS2( movd mm7, DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
781  AS2( pxor MM(a), mm7)\
782 
783 #define MMOV(a,b,c) \
784  AS2( movzx esi, b)\
785  AS2( movd MM(a), DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
786 
787 #else
788 
789 #define L_REG r8
790 #define L_INDEX(i) (L_REG+i)
791 #define L_INXORBLOCKS L_INBLOCKS+8
792 #define L_OUTXORBLOCKS L_INBLOCKS+16
793 #define L_OUTBLOCKS L_INBLOCKS+24
794 #define L_INCREMENTS L_INDEX(16*16)
795 #define L_LENGTH L_INDEX(16*18+8)
796 #define L_KEYS_BEGIN L_INDEX(16*19)
797 
798 #define MOVD mov
799 #define MM_0 r9d
800 #define MM_1 r12d
801 #ifdef __GNUC__
802 #define MM_2 r11d
803 #else
804 #define MM_2 r10d
805 #endif
806 #define MM(i) MM_##i
807 
808 #define MXOR(a,b,c) \
809  AS2( movzx esi, b)\
810  AS2( xor MM(a), DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
811 
812 #define MMOV(a,b,c) \
813  AS2( movzx esi, b)\
814  AS2( mov MM(a), DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
815 
816 #endif
817 
818 #define L_SUBKEYS L_INDEX(0)
819 #define L_SAVED_X L_SUBKEYS
820 #define L_KEY12 L_INDEX(16*12)
821 #define L_LASTROUND L_INDEX(16*13)
822 #define L_INBLOCKS L_INDEX(16*14)
823 #define MAP0TO4(i) (ASM_MOD(i+3,4)+1)
824 
825 #define XOR(a,b,c) \
826  AS2( movzx esi, b)\
827  AS2( xor a, DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
828 
829 #define MOV(a,b,c) \
830  AS2( movzx esi, b)\
831  AS2( mov a, DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
832 
833 #ifdef CRYPTOPP_GENERATE_X64_MASM
834  ALIGN 8
835  Rijndael_Enc_AdvancedProcessBlocks PROC FRAME
836  rex_push_reg rsi
837  push_reg rdi
838  push_reg rbx
839  push_reg r12
840  .endprolog
841  mov L_REG, rcx
842  mov AS_REG_7, ?Te@rdtable@CryptoPP@@3PA_KA
843  mov edi, DWORD PTR [?g_cacheLineSize@CryptoPP@@3IA]
844 #elif defined(__GNUC__)
845  __asm__ __volatile__
846  (
847  INTEL_NOPREFIX
848  #if CRYPTOPP_BOOL_X64
849  AS2( mov L_REG, rcx)
850  #endif
851  AS_PUSH_IF86(bx)
852  AS_PUSH_IF86(bp)
853  AS2( mov AS_REG_7, WORD_REG(si))
854 #else
855  AS_PUSH_IF86(si)
856  AS_PUSH_IF86(di)
857  AS_PUSH_IF86(bx)
858  AS_PUSH_IF86(bp)
859  AS2( lea AS_REG_7, [Te])
860  AS2( mov edi, [g_cacheLineSize])
861 #endif
862 
863 #if CRYPTOPP_BOOL_X86
864  AS2( mov [ecx+16*12+16*4], esp) // save esp to L_SP
865  AS2( lea esp, [ecx-768])
866 #endif
867 
868  // copy subkeys to stack
869  AS2( mov WORD_REG(si), [L_KEYS_BEGIN])
870  AS2( mov WORD_REG(ax), 16)
871  AS2( and WORD_REG(ax), WORD_REG(si))
872  AS2( movdqa xmm3, XMMWORD_PTR [WORD_REG(dx)+16+WORD_REG(ax)]) // subkey 1 (non-counter) or 2 (counter)
873  AS2( movdqa [L_KEY12], xmm3)
874  AS2( lea WORD_REG(ax), [WORD_REG(dx)+WORD_REG(ax)+2*16])
875  AS2( sub WORD_REG(ax), WORD_REG(si))
876  ASL(0)
877  AS2( movdqa xmm0, [WORD_REG(ax)+WORD_REG(si)])
878  AS2( movdqa XMMWORD_PTR [L_SUBKEYS+WORD_REG(si)], xmm0)
879  AS2( add WORD_REG(si), 16)
880  AS2( cmp WORD_REG(si), 16*12)
881  ATT_NOPREFIX
882  ASJ( jl, 0, b)
883  INTEL_NOPREFIX
884 
885  // read subkeys 0, 1 and last
886  AS2( movdqa xmm4, [WORD_REG(ax)+WORD_REG(si)]) // last subkey
887  AS2( movdqa xmm1, [WORD_REG(dx)]) // subkey 0
888  AS2( MOVD MM(1), [WORD_REG(dx)+4*4]) // 0,1,2,3
889  AS2( mov ebx, [WORD_REG(dx)+5*4]) // 4,5,6,7
890  AS2( mov ecx, [WORD_REG(dx)+6*4]) // 8,9,10,11
891  AS2( mov edx, [WORD_REG(dx)+7*4]) // 12,13,14,15
892 
893  // load table into cache
894  AS2( xor WORD_REG(ax), WORD_REG(ax))
895  ASL(9)
896  AS2( mov esi, [AS_REG_7+WORD_REG(ax)])
897  AS2( add WORD_REG(ax), WORD_REG(di))
898  AS2( mov esi, [AS_REG_7+WORD_REG(ax)])
899  AS2( add WORD_REG(ax), WORD_REG(di))
900  AS2( mov esi, [AS_REG_7+WORD_REG(ax)])
901  AS2( add WORD_REG(ax), WORD_REG(di))
902  AS2( mov esi, [AS_REG_7+WORD_REG(ax)])
903  AS2( add WORD_REG(ax), WORD_REG(di))
904  AS2( cmp WORD_REG(ax), 2048)
905  ATT_NOPREFIX
906  ASJ( jl, 9, b)
907  INTEL_NOPREFIX
908  AS1( lfence)
909 
910  AS2( test DWORD PTR [L_LENGTH], 1)
911  ATT_NOPREFIX
912  ASJ( jz, 8, f)
913  INTEL_NOPREFIX
914 
915  // counter mode one-time setup
916  AS2( mov WORD_REG(si), [L_INBLOCKS])
917  AS2( movdqu xmm2, [WORD_REG(si)]) // counter
918  AS2( pxor xmm2, xmm1)
919  AS2( psrldq xmm1, 14)
920  AS2( movd eax, xmm1)
921  AS2( mov al, BYTE PTR [WORD_REG(si)+15])
922  AS2( MOVD MM(2), eax)
923 #if CRYPTOPP_BOOL_X86
924  AS2( mov eax, 1)
925  AS2( movd mm3, eax)
926 #endif
927 
928  // partial first round, in: xmm2(15,14,13,12;11,10,9,8;7,6,5,4;3,2,1,0), out: mm1, ebx, ecx, edx
929  AS2( movd eax, xmm2)
930  AS2( psrldq xmm2, 4)
931  AS2( movd edi, xmm2)
932  AS2( psrldq xmm2, 4)
933  MXOR( 1, al, 0) // 0
934  XOR( edx, ah, 1) // 1
935  AS2( shr eax, 16)
936  XOR( ecx, al, 2) // 2
937  XOR( ebx, ah, 3) // 3
938  AS2( mov eax, edi)
939  AS2( movd edi, xmm2)
940  AS2( psrldq xmm2, 4)
941  XOR( ebx, al, 0) // 4
942  MXOR( 1, ah, 1) // 5
943  AS2( shr eax, 16)
944  XOR( edx, al, 2) // 6
945  XOR( ecx, ah, 3) // 7
946  AS2( mov eax, edi)
947  AS2( movd edi, xmm2)
948  XOR( ecx, al, 0) // 8
949  XOR( ebx, ah, 1) // 9
950  AS2( shr eax, 16)
951  MXOR( 1, al, 2) // 10
952  XOR( edx, ah, 3) // 11
953  AS2( mov eax, edi)
954  XOR( edx, al, 0) // 12
955  XOR( ecx, ah, 1) // 13
956  AS2( shr eax, 16)
957  XOR( ebx, al, 2) // 14
958  AS2( psrldq xmm2, 3)
959 
960  // partial second round, in: ebx(4,5,6,7), ecx(8,9,10,11), edx(12,13,14,15), out: eax, ebx, edi, mm0
961  AS2( mov eax, [L_KEY12+0*4])
962  AS2( mov edi, [L_KEY12+2*4])
963  AS2( MOVD MM(0), [L_KEY12+3*4])
964  MXOR( 0, cl, 3) /* 11 */
965  XOR( edi, bl, 3) /* 7 */
966  MXOR( 0, bh, 2) /* 6 */
967  AS2( shr ebx, 16) /* 4,5 */
968  XOR( eax, bl, 1) /* 5 */
969  MOV( ebx, bh, 0) /* 4 */
970  AS2( xor ebx, [L_KEY12+1*4])
971  XOR( eax, ch, 2) /* 10 */
972  AS2( shr ecx, 16) /* 8,9 */
973  XOR( eax, dl, 3) /* 15 */
974  XOR( ebx, dh, 2) /* 14 */
975  AS2( shr edx, 16) /* 12,13 */
976  XOR( edi, ch, 0) /* 8 */
977  XOR( ebx, cl, 1) /* 9 */
978  XOR( edi, dl, 1) /* 13 */
979  MXOR( 0, dh, 0) /* 12 */
980 
981  AS2( movd ecx, xmm2)
982  AS2( MOVD edx, MM(1))
983  AS2( MOVD [L_SAVED_X+3*4], MM(0))
984  AS2( mov [L_SAVED_X+0*4], eax)
985  AS2( mov [L_SAVED_X+1*4], ebx)
986  AS2( mov [L_SAVED_X+2*4], edi)
987  ATT_NOPREFIX
988  ASJ( jmp, 5, f)
989  INTEL_NOPREFIX
990  ASL(3)
991  // non-counter mode per-block setup
992  AS2( MOVD MM(1), [L_KEY12+0*4]) // 0,1,2,3
993  AS2( mov ebx, [L_KEY12+1*4]) // 4,5,6,7
994  AS2( mov ecx, [L_KEY12+2*4]) // 8,9,10,11
995  AS2( mov edx, [L_KEY12+3*4]) // 12,13,14,15
996  ASL(8)
997  AS2( mov WORD_REG(ax), [L_INBLOCKS])
998  AS2( movdqu xmm2, [WORD_REG(ax)])
999  AS2( mov WORD_REG(si), [L_INXORBLOCKS])
1000  AS2( movdqu xmm5, [WORD_REG(si)])
1001  AS2( pxor xmm2, xmm1)
1002  AS2( pxor xmm2, xmm5)
1003 
1004  // first round, in: xmm2(15,14,13,12;11,10,9,8;7,6,5,4;3,2,1,0), out: eax, ebx, ecx, edx
1005  AS2( movd eax, xmm2)
1006  AS2( psrldq xmm2, 4)
1007  AS2( movd edi, xmm2)
1008  AS2( psrldq xmm2, 4)
1009  MXOR( 1, al, 0) // 0
1010  XOR( edx, ah, 1) // 1
1011  AS2( shr eax, 16)
1012  XOR( ecx, al, 2) // 2
1013  XOR( ebx, ah, 3) // 3
1014  AS2( mov eax, edi)
1015  AS2( movd edi, xmm2)
1016  AS2( psrldq xmm2, 4)
1017  XOR( ebx, al, 0) // 4
1018  MXOR( 1, ah, 1) // 5
1019  AS2( shr eax, 16)
1020  XOR( edx, al, 2) // 6
1021  XOR( ecx, ah, 3) // 7
1022  AS2( mov eax, edi)
1023  AS2( movd edi, xmm2)
1024  XOR( ecx, al, 0) // 8
1025  XOR( ebx, ah, 1) // 9
1026  AS2( shr eax, 16)
1027  MXOR( 1, al, 2) // 10
1028  XOR( edx, ah, 3) // 11
1029  AS2( mov eax, edi)
1030  XOR( edx, al, 0) // 12
1031  XOR( ecx, ah, 1) // 13
1032  AS2( shr eax, 16)
1033  XOR( ebx, al, 2) // 14
1034  MXOR( 1, ah, 3) // 15
1035  AS2( MOVD eax, MM(1))
1036 
1037  AS2( add L_REG, [L_KEYS_BEGIN])
1038  AS2( add L_REG, 4*16)
1039  ATT_NOPREFIX
1040  ASJ( jmp, 2, f)
1041  INTEL_NOPREFIX
1042  ASL(1)
1043  // counter-mode per-block setup
1044  AS2( MOVD ecx, MM(2))
1045  AS2( MOVD edx, MM(1))
1046  AS2( mov eax, [L_SAVED_X+0*4])
1047  AS2( mov ebx, [L_SAVED_X+1*4])
1048  AS2( xor cl, ch)
1049  AS2( and WORD_REG(cx), 255)
1050  ASL(5)
1051 #if CRYPTOPP_BOOL_X86
1052  AS2( paddb MM(2), mm3)
1053 #else
1054  AS2( add MM(2), 1)
1055 #endif
1056  // remaining part of second round, in: edx(previous round),esi(keyed counter byte) eax,ebx,[L_SAVED_X+2*4],[L_SAVED_X+3*4], out: eax,ebx,ecx,edx
1057  AS2( xor edx, DWORD PTR [AS_REG_7+WORD_REG(cx)*8+3])
1058  XOR( ebx, dl, 3)
1059  MOV( ecx, dh, 2)
1060  AS2( shr edx, 16)
1061  AS2( xor ecx, [L_SAVED_X+2*4])
1062  XOR( eax, dh, 0)
1063  MOV( edx, dl, 1)
1064  AS2( xor edx, [L_SAVED_X+3*4])
1065 
1066  AS2( add L_REG, [L_KEYS_BEGIN])
1067  AS2( add L_REG, 3*16)
1068  ATT_NOPREFIX
1069  ASJ( jmp, 4, f)
1070  INTEL_NOPREFIX
1071 
1072 // in: eax(0,1,2,3), ebx(4,5,6,7), ecx(8,9,10,11), edx(12,13,14,15)
1073 // out: eax, ebx, edi, mm0
1074 #define ROUND() \
1075  MXOR( 0, cl, 3) /* 11 */\
1076  AS2( mov cl, al) /* 8,9,10,3 */\
1077  XOR( edi, ah, 2) /* 2 */\
1078  AS2( shr eax, 16) /* 0,1 */\
1079  XOR( edi, bl, 3) /* 7 */\
1080  MXOR( 0, bh, 2) /* 6 */\
1081  AS2( shr ebx, 16) /* 4,5 */\
1082  MXOR( 0, al, 1) /* 1 */\
1083  MOV( eax, ah, 0) /* 0 */\
1084  XOR( eax, bl, 1) /* 5 */\
1085  MOV( ebx, bh, 0) /* 4 */\
1086  XOR( eax, ch, 2) /* 10 */\
1087  XOR( ebx, cl, 3) /* 3 */\
1088  AS2( shr ecx, 16) /* 8,9 */\
1089  XOR( eax, dl, 3) /* 15 */\
1090  XOR( ebx, dh, 2) /* 14 */\
1091  AS2( shr edx, 16) /* 12,13 */\
1092  XOR( edi, ch, 0) /* 8 */\
1093  XOR( ebx, cl, 1) /* 9 */\
1094  XOR( edi, dl, 1) /* 13 */\
1095  MXOR( 0, dh, 0) /* 12 */\
1096 
1097  ASL(2) // 2-round loop
1098  AS2( MOVD MM(0), [L_SUBKEYS-4*16+3*4])
1099  AS2( mov edi, [L_SUBKEYS-4*16+2*4])
1100  ROUND()
1101  AS2( mov ecx, edi)
1102  AS2( xor eax, [L_SUBKEYS-4*16+0*4])
1103  AS2( xor ebx, [L_SUBKEYS-4*16+1*4])
1104  AS2( MOVD edx, MM(0))
1105 
1106  ASL(4)
1107  AS2( MOVD MM(0), [L_SUBKEYS-4*16+7*4])
1108  AS2( mov edi, [L_SUBKEYS-4*16+6*4])
1109  ROUND()
1110  AS2( mov ecx, edi)
1111  AS2( xor eax, [L_SUBKEYS-4*16+4*4])
1112  AS2( xor ebx, [L_SUBKEYS-4*16+5*4])
1113  AS2( MOVD edx, MM(0))
1114 
1115  AS2( add L_REG, 32)
1116  AS2( test L_REG, 255)
1117  ATT_NOPREFIX
1118  ASJ( jnz, 2, b)
1119  INTEL_NOPREFIX
1120  AS2( sub L_REG, 16*16)
1121 
1122 #define LAST(a, b, c) \
1123  AS2( movzx esi, a )\
1124  AS2( movzx edi, BYTE PTR [AS_REG_7+WORD_REG(si)*8+1] )\
1125  AS2( movzx esi, b )\
1126  AS2( xor edi, DWORD PTR [AS_REG_7+WORD_REG(si)*8+0] )\
1127  AS2( mov WORD PTR [L_LASTROUND+c], di )\
1128 
1129  // last round
1130  LAST(ch, dl, 2)
1131  LAST(dh, al, 6)
1132  AS2( shr edx, 16)
1133  LAST(ah, bl, 10)
1134  AS2( shr eax, 16)
1135  LAST(bh, cl, 14)
1136  AS2( shr ebx, 16)
1137  LAST(dh, al, 12)
1138  AS2( shr ecx, 16)
1139  LAST(ah, bl, 0)
1140  LAST(bh, cl, 4)
1141  LAST(ch, dl, 8)
1142 
1143  AS2( mov WORD_REG(ax), [L_OUTXORBLOCKS])
1144  AS2( mov WORD_REG(bx), [L_OUTBLOCKS])
1145 
1146  AS2( mov WORD_REG(cx), [L_LENGTH])
1147  AS2( sub WORD_REG(cx), 16)
1148 
1149  AS2( movdqu xmm2, [WORD_REG(ax)])
1150  AS2( pxor xmm2, xmm4)
1151 
1152 #if CRYPTOPP_BOOL_X86
1153  AS2( movdqa xmm0, [L_INCREMENTS])
1154  AS2( paddd xmm0, [L_INBLOCKS])
1155  AS2( movdqa [L_INBLOCKS], xmm0)
1156 #else
1157  AS2( movdqa xmm0, [L_INCREMENTS+16])
1158  AS2( paddq xmm0, [L_INBLOCKS+16])
1159  AS2( movdqa [L_INBLOCKS+16], xmm0)
1160 #endif
1161 
1162  AS2( pxor xmm2, [L_LASTROUND])
1163  AS2( movdqu [WORD_REG(bx)], xmm2)
1164 
1165  ATT_NOPREFIX
1166  ASJ( jle, 7, f)
1167  INTEL_NOPREFIX
1168  AS2( mov [L_LENGTH], WORD_REG(cx))
1169  AS2( test WORD_REG(cx), 1)
1170  ATT_NOPREFIX
1171  ASJ( jnz, 1, b)
1172  INTEL_NOPREFIX
1173 #if CRYPTOPP_BOOL_X64
1174  AS2( movdqa xmm0, [L_INCREMENTS])
1175  AS2( paddq xmm0, [L_INBLOCKS])
1176  AS2( movdqa [L_INBLOCKS], xmm0)
1177 #endif
1178  ATT_NOPREFIX
1179  ASJ( jmp, 3, b)
1180  INTEL_NOPREFIX
1181 
1182  ASL(7)
1183  // erase keys on stack
1184  AS2( xorps xmm0, xmm0)
1185  AS2( lea WORD_REG(ax), [L_SUBKEYS+7*16])
1186  AS2( movaps [WORD_REG(ax)-7*16], xmm0)
1187  AS2( movaps [WORD_REG(ax)-6*16], xmm0)
1188  AS2( movaps [WORD_REG(ax)-5*16], xmm0)
1189  AS2( movaps [WORD_REG(ax)-4*16], xmm0)
1190  AS2( movaps [WORD_REG(ax)-3*16], xmm0)
1191  AS2( movaps [WORD_REG(ax)-2*16], xmm0)
1192  AS2( movaps [WORD_REG(ax)-1*16], xmm0)
1193  AS2( movaps [WORD_REG(ax)+0*16], xmm0)
1194  AS2( movaps [WORD_REG(ax)+1*16], xmm0)
1195  AS2( movaps [WORD_REG(ax)+2*16], xmm0)
1196  AS2( movaps [WORD_REG(ax)+3*16], xmm0)
1197  AS2( movaps [WORD_REG(ax)+4*16], xmm0)
1198  AS2( movaps [WORD_REG(ax)+5*16], xmm0)
1199  AS2( movaps [WORD_REG(ax)+6*16], xmm0)
1200 #if CRYPTOPP_BOOL_X86
1201  AS2( mov esp, [L_SP])
1202  AS1( emms)
1203 #endif
1204  AS_POP_IF86(bp)
1205  AS_POP_IF86(bx)
1206 #if defined(_MSC_VER) && CRYPTOPP_BOOL_X86
1207  AS_POP_IF86(di)
1208  AS_POP_IF86(si)
1209  AS1(ret)
1210 #endif
1211 #ifdef CRYPTOPP_GENERATE_X64_MASM
1212  pop r12
1213  pop rbx
1214  pop rdi
1215  pop rsi
1216  ret
1217  Rijndael_Enc_AdvancedProcessBlocks ENDP
1218 #endif
1219 #ifdef __GNUC__
1220  ATT_PREFIX
1221  :
1222  : "c" (locals), "d" (k), "S" (Te), "D" (g_cacheLineSize)
1223  : "memory", "cc", "%eax"
1224  #if CRYPTOPP_BOOL_X64
1225  , "%rbx", "%r8", "%r9", "%r10", "%r11", "%r12"
1226  #endif
1227  );
1228 #endif
1229 }
1230 
1231 #endif
1232 
1233 #ifndef CRYPTOPP_GENERATE_X64_MASM
1234 
1235 #ifdef CRYPTOPP_X64_MASM_AVAILABLE
1236 extern "C" {
1237 void Rijndael_Enc_AdvancedProcessBlocks_SSE2(void *locals, const word32 *k);
1238 }
1239 #endif
1240 
1241 #if CRYPTOPP_RIJNDAEL_ADVANCED_PROCESS_BLOCKS
1242 size_t Rijndael::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const
1243 {
1244 #if CRYPTOPP_AESNI_AVAILABLE
1245  if (HasAESNI())
1246  return Rijndael_Enc_AdvancedProcessBlocks_AESNI(m_key, m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1247 #endif
1248 #if CRYPTOPP_ARM_AES_AVAILABLE
1249  if (HasAES())
1250  return Rijndael_Enc_AdvancedProcessBlocks_ARMV8(m_key, m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1251 #endif
1252 #if CRYPTOPP_POWER8_AES_AVAILABLE
1253  if (HasAES())
1254  return Rijndael_Enc_AdvancedProcessBlocks128_6x1_ALTIVEC(m_key, m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1255 #endif
1256 
1257 #if (CRYPTOPP_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
1258  if (HasSSE2())
1259  {
1260  if (length < BLOCKSIZE)
1261  return length;
1262 
1263  static const byte *zeros = (const byte*)(Te+256);
1264  m_aliasBlock.SetMark(m_aliasBlock.size());
1265  byte *space = NULLPTR, *originalSpace = const_cast<byte*>(m_aliasBlock.data());
1266 
1267  // round up to nearest 256 byte boundary
1268  space = originalSpace + (s_aliasBlockSize - (uintptr_t)originalSpace % s_aliasBlockSize) % s_aliasBlockSize;
1269  while (AliasedWithTable(space, space + sizeof(Locals)))
1270  {
1271  space += 256;
1272  CRYPTOPP_ASSERT(space < (originalSpace + s_aliasPageSize));
1273  }
1274 
1275  size_t increment = BLOCKSIZE;
1276  if (flags & BT_ReverseDirection)
1277  {
1278  CRYPTOPP_ASSERT(length % BLOCKSIZE == 0);
1279  inBlocks += length - BLOCKSIZE;
1280  xorBlocks += length - BLOCKSIZE;
1281  outBlocks += length - BLOCKSIZE;
1282  increment = 0-increment;
1283  }
1284 
1285  Locals &locals = *(Locals *)(void *)space;
1286 
1287  locals.inBlocks = inBlocks;
1288  locals.inXorBlocks = (flags & BT_XorInput) && xorBlocks ? xorBlocks : zeros;
1289  locals.outXorBlocks = (flags & BT_XorInput) || !xorBlocks ? zeros : xorBlocks;
1290  locals.outBlocks = outBlocks;
1291 
1292  locals.inIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : increment;
1293  locals.inXorIncrement = (flags & BT_XorInput) && xorBlocks ? increment : 0;
1294  locals.outXorIncrement = (flags & BT_XorInput) || !xorBlocks ? 0 : increment;
1295  locals.outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : increment;
1296 
1297  locals.lengthAndCounterFlag = length - (length%16) - bool(flags & BT_InBlockIsCounter);
1298  int keysToCopy = m_rounds - (flags & BT_InBlockIsCounter ? 3 : 2);
1299  locals.keysBegin = (12-keysToCopy)*16;
1300 
1301  Rijndael_Enc_AdvancedProcessBlocks_SSE2(&locals, m_key);
1302 
1303  return length % BLOCKSIZE;
1304  }
1305 #endif
1306 
1307  return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
1308 }
1309 
1310 size_t Rijndael::Dec::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const
1311 {
1312 #if CRYPTOPP_AESNI_AVAILABLE
1313  if (HasAESNI())
1314  return Rijndael_Dec_AdvancedProcessBlocks_AESNI(m_key, m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1315 #endif
1316 #if CRYPTOPP_ARM_AES_AVAILABLE
1317  if (HasAES())
1318  return Rijndael_Dec_AdvancedProcessBlocks_ARMV8(m_key, m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1319 #endif
1320 #if CRYPTOPP_POWER8_AES_AVAILABLE
1321  if (HasAES())
1322  return Rijndael_Dec_AdvancedProcessBlocks128_6x1_ALTIVEC(m_key, m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1323 #endif
1324 
1325  return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
1326 }
1327 #endif // CRYPTOPP_RIJNDAEL_ADVANCED_PROCESS_BLOCKS
1328 
1329 NAMESPACE_END
1330 
1331 #endif
1332 #endif