Crypto++  8.2
Free C++ class library of cryptographic schemes
cham_simd.cpp
1 // cham_simd.cpp - written and placed in the public domain by Jeffrey Walton
2 //
3 // This source file uses intrinsics and built-ins to gain access to
4 // SSSE3, ARM NEON and ARMv8a, and Power7 Altivec instructions. A separate
5 // source file is needed because additional CXXFLAGS are required to enable
6 // the appropriate instruction sets in some build configurations.
7 
8 #include "pch.h"
9 #include "config.h"
10 
11 #include "cham.h"
12 #include "misc.h"
13 
14 // Uncomment for benchmarking C++ against SSE or NEON.
15 // Do so in both cham.cpp and cham_simd.cpp.
16 // #undef CRYPTOPP_SSSE3_AVAILABLE
17 // #undef CRYPTOPP_ARM_NEON_AVAILABLE
18 
19 #if (CRYPTOPP_SSSE3_AVAILABLE)
20 #include "adv_simd.h"
21 # include <pmmintrin.h>
22 # include <tmmintrin.h>
23 #endif
24 
25 #if defined(__XOP__)
26 # include <ammintrin.h>
27 #endif
28 
29 #if defined(__AVX512F__)
30 # define CRYPTOPP_AVX512_ROTATE 1
31 # include <immintrin.h>
32 #endif
33 
34 // Squash MS LNK4221 and libtool warnings
35 extern const char CHAM_SIMD_FNAME[] = __FILE__;
36 
37 ANONYMOUS_NAMESPACE_BEGIN
38 
39 using CryptoPP::word16;
40 using CryptoPP::word32;
41 
42 #if (CRYPTOPP_SSSE3_AVAILABLE)
43 
44 //////////////////////////////////////////////////////////////////////////
45 
46 NAMESPACE_BEGIN(W16) // CHAM64, 16-bit word size
47 
48 template <unsigned int R>
49 inline __m128i RotateLeft16(const __m128i& val)
50 {
51 #if defined(__XOP__)
52  return _mm_roti_epi16(val, R);
53 #else
54  return _mm_or_si128(
55  _mm_slli_epi16(val, R), _mm_srli_epi16(val, 16-R));
56 #endif
57 }
58 
59 template <unsigned int R>
60 inline __m128i RotateRight16(const __m128i& val)
61 {
62 #if defined(__XOP__)
63  return _mm_roti_epi16(val, 16-R);
64 #else
65  return _mm_or_si128(
66  _mm_slli_epi16(val, 16-R), _mm_srli_epi16(val, R));
67 #endif
68 }
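// The generic rotations above fall back to the usual shift-and-OR form: each
// 16-bit lane becomes roughly (x << R) | (x >> (16-R)). XOP builds use the
// native _mm_roti_epi16 rotate instead, so the two-instruction sequence is
// only a fallback.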
69 
70 template <>
71 inline __m128i RotateLeft16<8>(const __m128i& val)
72 {
73 #if defined(__XOP__)
74  return _mm_roti_epi16(val, 8);
75 #else
76  const __m128i mask = _mm_set_epi8(14,15, 12,13, 10,11, 8,9, 6,7, 4,5, 2,3, 0,1);
77  return _mm_shuffle_epi8(val, mask);
78 #endif
79 }
80 
81 template <>
82 inline __m128i RotateRight16<8>(const __m128i& val)
83 {
84 #if defined(__XOP__)
85  return _mm_roti_epi16(val, 16-8);
86 #else
87  const __m128i mask = _mm_set_epi8(14,15, 12,13, 10,11, 8,9, 6,7, 4,5, 2,3, 0,1);
88  return _mm_shuffle_epi8(val, mask);
89 #endif
90 }
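// Rotating a 16-bit lane by 8 in either direction simply swaps its two bytes,
// which is why the left and right specializations above can share the same
// _mm_shuffle_epi8 mask and avoid the shift-and-OR sequence entirely.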
91 
92 template <unsigned int IDX>
93 inline __m128i UnpackXMM(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d,
94  const __m128i& e, const __m128i& f, const __m128i& g, const __m128i& h)
95 {
96  // Should not be instantiated
97  CRYPTOPP_UNUSED(a); CRYPTOPP_UNUSED(b);
98  CRYPTOPP_UNUSED(c); CRYPTOPP_UNUSED(d);
99  CRYPTOPP_UNUSED(e); CRYPTOPP_UNUSED(f);
100  CRYPTOPP_UNUSED(g); CRYPTOPP_UNUSED(h);
101  CRYPTOPP_ASSERT(0);
102  return _mm_setzero_si128();
103 }
104 
105 template <>
106 inline __m128i UnpackXMM<0>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d,
107  const __m128i& e, const __m128i& f, const __m128i& g, const __m128i& h)
108 {
109  // The shuffle converts to and from little-endian for SSE. A specialized
110  // CHAM implementation can avoid the shuffle by framing the data for
111  // encryption, decryption and benchmarks. The library cannot take the
112  // speed-up because of the byte oriented API.
113  const __m128i r1 = _mm_unpacklo_epi16(a, b);
114  const __m128i r2 = _mm_unpacklo_epi16(c, d);
115  const __m128i r3 = _mm_unpacklo_epi16(e, f);
116  const __m128i r4 = _mm_unpacklo_epi16(g, h);
117 
118  const __m128i r5 = _mm_unpacklo_epi32(r1, r2);
119  const __m128i r6 = _mm_unpacklo_epi32(r3, r4);
120  return _mm_shuffle_epi8(_mm_unpacklo_epi64(r5, r6),
121  _mm_set_epi8(14,15,12,13, 10,11,8,9, 6,7,4,5, 2,3,0,1));
122 }
123 
124 template <>
125 inline __m128i UnpackXMM<1>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d,
126  const __m128i& e, const __m128i& f, const __m128i& g, const __m128i& h)
127 {
128  // The shuffle converts to and from little-endian for SSE. A specialized
129  // CHAM implementation can avoid the shuffle by framing the data for
130  // encryption, decryption and benchmarks. The library cannot take the
131  // speed-up because of the byte oriented API.
132  const __m128i r1 = _mm_unpacklo_epi16(a, b);
133  const __m128i r2 = _mm_unpacklo_epi16(c, d);
134  const __m128i r3 = _mm_unpacklo_epi16(e, f);
135  const __m128i r4 = _mm_unpacklo_epi16(g, h);
136 
137  const __m128i r5 = _mm_unpacklo_epi32(r1, r2);
138  const __m128i r6 = _mm_unpacklo_epi32(r3, r4);
139  return _mm_shuffle_epi8(_mm_unpackhi_epi64(r5, r6),
140  _mm_set_epi8(14,15,12,13, 10,11,8,9, 6,7,4,5, 2,3,0,1));
141 }
142 
143 template <>
144 inline __m128i UnpackXMM<2>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d,
145  const __m128i& e, const __m128i& f, const __m128i& g, const __m128i& h)
146 {
147  // The shuffle converts to and from little-endian for SSE. A specialized
148  // CHAM implementation can avoid the shuffle by framing the data for
149  // encryption, decryption and benchmarks. The library cannot take the
150  // speed-up because of the byte oriented API.
151  const __m128i r1 = _mm_unpacklo_epi16(a, b);
152  const __m128i r2 = _mm_unpacklo_epi16(c, d);
153  const __m128i r3 = _mm_unpacklo_epi16(e, f);
154  const __m128i r4 = _mm_unpacklo_epi16(g, h);
155 
156  const __m128i r5 = _mm_unpackhi_epi32(r1, r2);
157  const __m128i r6 = _mm_unpackhi_epi32(r3, r4);
158  return _mm_shuffle_epi8(_mm_unpacklo_epi64(r5, r6),
159  _mm_set_epi8(14,15,12,13, 10,11,8,9, 6,7,4,5, 2,3,0,1));
160 }
161 
162 template <>
163 inline __m128i UnpackXMM<3>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d,
164  const __m128i& e, const __m128i& f, const __m128i& g, const __m128i& h)
165 {
166  // The shuffle converts to and from little-endian for SSE. A specialized
167  // CHAM implementation can avoid the shuffle by framing the data for
168  // encryption, decryption and benchmarks. The library cannot take the
169  // speed-up because of the byte oriented API.
170  const __m128i r1 = _mm_unpacklo_epi16(a, b);
171  const __m128i r2 = _mm_unpacklo_epi16(c, d);
172  const __m128i r3 = _mm_unpacklo_epi16(e, f);
173  const __m128i r4 = _mm_unpacklo_epi16(g, h);
174 
175  const __m128i r5 = _mm_unpackhi_epi32(r1, r2);
176  const __m128i r6 = _mm_unpackhi_epi32(r3, r4);
177  return _mm_shuffle_epi8(_mm_unpackhi_epi64(r5, r6),
178  _mm_set_epi8(14,15,12,13, 10,11,8,9, 6,7,4,5, 2,3,0,1));
179 }
180 
181 template <>
182 inline __m128i UnpackXMM<4>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d,
183  const __m128i& e, const __m128i& f, const __m128i& g, const __m128i& h)
184 {
185  // The shuffle converts to and from little-endian for SSE. A specialized
186  // CHAM implementation can avoid the shuffle by framing the data for
187  // encryption, decryption and benchmarks. The library cannot take the
188  // speed-up because of the byte oriented API.
189  const __m128i r1 = _mm_unpackhi_epi16(a, b);
190  const __m128i r2 = _mm_unpackhi_epi16(c, d);
191  const __m128i r3 = _mm_unpackhi_epi16(e, f);
192  const __m128i r4 = _mm_unpackhi_epi16(g, h);
193 
194  const __m128i r5 = _mm_unpacklo_epi32(r1, r2);
195  const __m128i r6 = _mm_unpacklo_epi32(r3, r4);
196  return _mm_shuffle_epi8(_mm_unpacklo_epi64(r5, r6),
197  _mm_set_epi8(14,15,12,13, 10,11,8,9, 6,7,4,5, 2,3,0,1));
198 }
199 
200 template <>
201 inline __m128i UnpackXMM<5>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d,
202  const __m128i& e, const __m128i& f, const __m128i& g, const __m128i& h)
203 {
204  // The shuffle converts to and from little-endian for SSE. A specialized
205  // CHAM implementation can avoid the shuffle by framing the data for
206  // encryption, decryption and benchmarks. The library cannot take the
207  // speed-up because of the byte oriented API.
208  const __m128i r1 = _mm_unpackhi_epi16(a, b);
209  const __m128i r2 = _mm_unpackhi_epi16(c, d);
210  const __m128i r3 = _mm_unpackhi_epi16(e, f);
211  const __m128i r4 = _mm_unpackhi_epi16(g, h);
212 
213  const __m128i r5 = _mm_unpacklo_epi32(r1, r2);
214  const __m128i r6 = _mm_unpacklo_epi32(r3, r4);
215  return _mm_shuffle_epi8(_mm_unpackhi_epi64(r5, r6),
216  _mm_set_epi8(14,15,12,13, 10,11,8,9, 6,7,4,5, 2,3,0,1));
217 }
218 
219 template <>
220 inline __m128i UnpackXMM<6>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d,
221  const __m128i& e, const __m128i& f, const __m128i& g, const __m128i& h)
222 {
223  // The shuffle converts to and from little-endian for SSE. A specialized
224  // CHAM implementation can avoid the shuffle by framing the data for
225  // encryption, decryption and benchmarks. The library cannot take the
226  // speed-up because of the byte oriented API.
227  const __m128i r1 = _mm_unpackhi_epi16(a, b);
228  const __m128i r2 = _mm_unpackhi_epi16(c, d);
229  const __m128i r3 = _mm_unpackhi_epi16(e, f);
230  const __m128i r4 = _mm_unpackhi_epi16(g, h);
231 
232  const __m128i r5 = _mm_unpackhi_epi32(r1, r2);
233  const __m128i r6 = _mm_unpackhi_epi32(r3, r4);
234  return _mm_shuffle_epi8(_mm_unpacklo_epi64(r5, r6),
235  _mm_set_epi8(14,15,12,13, 10,11,8,9, 6,7,4,5, 2,3,0,1));
236 }
237 
238 template <>
239 inline __m128i UnpackXMM<7>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d,
240  const __m128i& e, const __m128i& f, const __m128i& g, const __m128i& h)
241 {
242  // The shuffle converts to and from little-endian for SSE. A specialized
243  // CHAM implementation can avoid the shuffle by framing the data for
244  // encryption, decryption and benchmarks. The library cannot take the
245  // speed-up because of the byte oriented API.
246  const __m128i r1 = _mm_unpackhi_epi16(a, b);
247  const __m128i r2 = _mm_unpackhi_epi16(c, d);
248  const __m128i r3 = _mm_unpackhi_epi16(e, f);
249  const __m128i r4 = _mm_unpackhi_epi16(g, h);
250 
251  const __m128i r5 = _mm_unpackhi_epi32(r1, r2);
252  const __m128i r6 = _mm_unpackhi_epi32(r3, r4);
253  return _mm_shuffle_epi8(_mm_unpackhi_epi64(r5, r6),
254  _mm_set_epi8(14,15,12,13, 10,11,8,9, 6,7,4,5, 2,3,0,1));
255 }
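// Taken together, the eight specializations above are an 8x8 transpose of
// 16-bit words: UnpackXMM<IDX> gathers word IDX from each of the inputs
// a..h, so the round code can operate on eight word-slices in parallel.
// The trailing byte shuffle swaps the bytes of every word, which is the
// little-endian conversion mentioned in the comments.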
256 
257 template <unsigned int IDX>
258 inline __m128i UnpackXMM(const __m128i& v)
259 {
260  // Should not be instantiated
261  CRYPTOPP_UNUSED(v); CRYPTOPP_ASSERT(0);
262 
263  return _mm_setzero_si128();
264 }
265 
266 template <>
267 inline __m128i UnpackXMM<0>(const __m128i& v)
268 {
269  return _mm_shuffle_epi8(v, _mm_set_epi8(0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1));
270 }
271 
272 template <>
273 inline __m128i UnpackXMM<1>(const __m128i& v)
274 {
275  return _mm_shuffle_epi8(v, _mm_set_epi8(2,3, 2,3, 2,3, 2,3, 2,3, 2,3, 2,3, 2,3));
276 }
277 
278 template <>
279 inline __m128i UnpackXMM<2>(const __m128i& v)
280 {
281  return _mm_shuffle_epi8(v, _mm_set_epi8(4,5, 4,5, 4,5, 4,5, 4,5, 4,5, 4,5, 4,5));
282 }
283 
284 template <>
285 inline __m128i UnpackXMM<3>(const __m128i& v)
286 {
287  return _mm_shuffle_epi8(v, _mm_set_epi8(6,7, 6,7, 6,7, 6,7, 6,7, 6,7, 6,7, 6,7));
288 }
289 
290 template <>
291 inline __m128i UnpackXMM<4>(const __m128i& v)
292 {
293  return _mm_shuffle_epi8(v, _mm_set_epi8(8,9, 8,9, 8,9, 8,9, 8,9, 8,9, 8,9, 8,9));
294 }
295 
296 template <>
297 inline __m128i UnpackXMM<5>(const __m128i& v)
298 {
299  return _mm_shuffle_epi8(v, _mm_set_epi8(10,11, 10,11, 10,11, 10,11, 10,11, 10,11, 10,11, 10,11));
300 }
301 
302 template <>
303 inline __m128i UnpackXMM<6>(const __m128i& v)
304 {
305  return _mm_shuffle_epi8(v, _mm_set_epi8(12,13, 12,13, 12,13, 12,13, 12,13, 12,13, 12,13, 12,13));
306 }
307 
308 template <>
309 inline __m128i UnpackXMM<7>(const __m128i& v)
310 {
311  return _mm_shuffle_epi8(v, _mm_set_epi8(14,15, 14,15, 14,15, 14,15, 14,15, 14,15, 14,15, 14,15));
312 }
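// The single-register overloads broadcast word IDX of v into every 16-bit
// lane, byte-swapping it at the same time. Only lane 0 of each result is kept
// by RepackXMM<0> at the end, so the extra lanes are effectively redundant
// work that keeps the single-block path uniform with the multi-block path.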
313 
314 template <unsigned int IDX>
315 inline __m128i UnpackXMM(const __m128i& a, const __m128i& b)
316 {
317  const __m128i& z = _mm_setzero_si128();
318  return UnpackXMM<IDX>(a, b, z, z, z, z, z, z);
319 }
320 
321 template <unsigned int IDX>
322 inline __m128i RepackXMM(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d,
323  const __m128i& e, const __m128i& f, const __m128i& g, const __m128i& h)
324 {
325  return UnpackXMM<IDX>(a, b, c, d, e, f, g, h);
326 }
327 
328 template <unsigned int IDX>
329 inline __m128i RepackXMM(const __m128i& v)
330 {
331  return UnpackXMM<IDX>(v);
332 }
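// RepackXMM simply forwards to UnpackXMM: the word-gathering transpose is its
// own inverse, so applying it again after the rounds restores the original
// block layout and byte order.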
333 
334 inline void CHAM64_Enc_Block(__m128i &block0,
335  const word16 *subkeys, unsigned int /*rounds*/)
336 {
337  // Rearrange the data for vectorization. UnpackXMM includes a
338  // little-endian swap for SSE. Thanks to Peter Cordes for help
339  // with packing and unpacking.
340  // [A1 A2 .. A6 A7][B1 B2 .. B6 B7] ... => [A1 B1 .. G1 H1][A2 B2 .. G2 H2] ...
341  __m128i a = UnpackXMM<0>(block0);
342  __m128i b = UnpackXMM<1>(block0);
343  __m128i c = UnpackXMM<2>(block0);
344  __m128i d = UnpackXMM<3>(block0);
345  __m128i e = UnpackXMM<4>(block0);
346  __m128i f = UnpackXMM<5>(block0);
347  __m128i g = UnpackXMM<6>(block0);
348  __m128i h = UnpackXMM<7>(block0);
349 
350  const unsigned int rounds = 80;
351  __m128i counter = _mm_set_epi16(0,0,0,0,0,0,0,0);
352  __m128i increment = _mm_set_epi16(1,1,1,1,1,1,1,1);
353 
354  const unsigned int MASK = 15;
355  for (int i=0; i<static_cast<int>(rounds); i+=4)
356  {
357  __m128i k, kr, t1, t2, t3, t4;
358  k = _mm_castpd_si128(_mm_load_sd((const double*)(&subkeys[(i+0) & MASK])));
359 
360  // Shuffle out key
361  kr = _mm_shuffle_epi8(k, _mm_set_epi8(1,0,1,0, 1,0,1,0, 1,0,1,0, 1,0,1,0));
362 
363  t1 = _mm_xor_si128(a, counter);
364  t3 = _mm_xor_si128(e, counter);
365  t2 = _mm_xor_si128(RotateLeft16<1>(b), kr);
366  t4 = _mm_xor_si128(RotateLeft16<1>(f), kr);
367  a = RotateLeft16<8>(_mm_add_epi16(t1, t2));
368  e = RotateLeft16<8>(_mm_add_epi16(t3, t4));
369 
370  counter = _mm_add_epi16(counter, increment);
371  kr = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,3,2, 3,2,3,2, 3,2,3,2, 3,2,3,2));
372 
373  t1 = _mm_xor_si128(b, counter);
374  t3 = _mm_xor_si128(f, counter);
375  t2 = _mm_xor_si128(RotateLeft16<8>(c), kr);
376  t4 = _mm_xor_si128(RotateLeft16<8>(g), kr);
377  b = RotateLeft16<1>(_mm_add_epi16(t1, t2));
378  f = RotateLeft16<1>(_mm_add_epi16(t3, t4));
379 
380  counter = _mm_add_epi16(counter, increment);
381  kr = _mm_shuffle_epi8(k, _mm_set_epi8(5,4,5,4, 5,4,5,4, 5,4,5,4, 5,4,5,4));
382 
383  t1 = _mm_xor_si128(c, counter);
384  t3 = _mm_xor_si128(g, counter);
385  t2 = _mm_xor_si128(RotateLeft16<1>(d), kr);
386  t4 = _mm_xor_si128(RotateLeft16<1>(h), kr);
387  c = RotateLeft16<8>(_mm_add_epi16(t1, t2));
388  g = RotateLeft16<8>(_mm_add_epi16(t3, t4));
389 
390  counter = _mm_add_epi16(counter, increment);
391  kr = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,7,6, 7,6,7,6, 7,6,7,6, 7,6,7,6));
392 
393  t1 = _mm_xor_si128(d, counter);
394  t3 = _mm_xor_si128(h, counter);
395  t2 = _mm_xor_si128(RotateLeft16<8>(a), kr);
396  t4 = _mm_xor_si128(RotateLeft16<8>(e), kr);
397  d = RotateLeft16<1>(_mm_add_epi16(t1, t2));
398  h = RotateLeft16<1>(_mm_add_epi16(t3, t4));
399 
400  counter = _mm_add_epi16(counter, increment);
401  }
402 
403  // [A1 B1 .. G1 H1][A2 B2 .. G2 H2] ... => [A1 A2 .. A6 A7][B1 B2 .. B6 B7] ...
404  block0 = RepackXMM<0>(a,b,c,d,e,f,g,h);
405 }
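// In scalar terms the loop above applies the CHAM round function to eight
// 16-bit lanes at once. For the first two rounds of each iteration this is
// roughly
//   a = ROL8((a ^ i)     + (ROL1(b) ^ rk[i % 16]));
//   b = ROL1((b ^ (i+1)) + (ROL8(c) ^ rk[(i+1) % 16]));
// with the round number carried in `counter` and the sixteen CHAM-64/128
// round keys indexed through MASK.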
406 
407 inline void CHAM64_Dec_Block(__m128i &block0,
408  const word16 *subkeys, unsigned int /*rounds*/)
409 {
410  // Rearrange the data for vectorization. UnpackXMM includes a
411  // little-endian swap for SSE. Thanks to Peter Cordes for help
412  // with packing and unpacking.
413  // [A1 A2 .. A6 A7][B1 B2 .. B6 B7] ... => [A1 B1 .. G1 H1][A2 B2 .. G2 H2] ...
414  __m128i a = UnpackXMM<0>(block0);
415  __m128i b = UnpackXMM<1>(block0);
416  __m128i c = UnpackXMM<2>(block0);
417  __m128i d = UnpackXMM<3>(block0);
418  __m128i e = UnpackXMM<4>(block0);
419  __m128i f = UnpackXMM<5>(block0);
420  __m128i g = UnpackXMM<6>(block0);
421  __m128i h = UnpackXMM<7>(block0);
422 
423  const unsigned int rounds = 80;
424  __m128i counter = _mm_set_epi16(rounds-1,rounds-1,rounds-1,rounds-1, rounds-1,rounds-1,rounds-1,rounds-1);
425  __m128i decrement = _mm_set_epi16(1,1,1,1,1,1,1,1);
426 
427  const unsigned int MASK = 15;
428  for (int i = static_cast<int>(rounds)-1; i >= 0; i-=4)
429  {
430  __m128i k, kr, t1, t2, t3, t4;
431  k = _mm_castpd_si128(_mm_load_sd((const double*)(&subkeys[(i-3) & MASK])));
432 
433  // Shuffle out key
434  kr = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,7,6, 7,6,7,6, 7,6,7,6, 7,6,7,6));
435 
436  // Odd round
437  t1 = RotateRight16<1>(d);
438  t3 = RotateRight16<1>(h);
439  t2 = _mm_xor_si128(RotateLeft16<8>(a), kr);
440  t4 = _mm_xor_si128(RotateLeft16<8>(e), kr);
441  d = _mm_xor_si128(_mm_sub_epi16(t1, t2), counter);
442  h = _mm_xor_si128(_mm_sub_epi16(t3, t4), counter);
443 
444  counter = _mm_sub_epi16(counter, decrement);
445  kr = _mm_shuffle_epi8(k, _mm_set_epi8(5,4,5,4, 5,4,5,4, 5,4,5,4, 5,4,5,4));
446 
447  // Even round
448  t1 = RotateRight16<8>(c);
449  t3 = RotateRight16<8>(g);
450  t2 = _mm_xor_si128(RotateLeft16<1>(d), kr);
451  t4 = _mm_xor_si128(RotateLeft16<1>(h), kr);
452  c = _mm_xor_si128(_mm_sub_epi16(t1, t2), counter);
453  g = _mm_xor_si128(_mm_sub_epi16(t3, t4), counter);
454 
455  counter = _mm_sub_epi16(counter, decrement);
456  kr = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,3,2, 3,2,3,2, 3,2,3,2, 3,2,3,2));
457 
458  // Odd round
459  t1 = RotateRight16<1>(b);
460  t3 = RotateRight16<1>(f);
461  t2 = _mm_xor_si128(RotateLeft16<8>(c), kr);
462  t4 = _mm_xor_si128(RotateLeft16<8>(g), kr);
463  b = _mm_xor_si128(_mm_sub_epi16(t1, t2), counter);
464  f = _mm_xor_si128(_mm_sub_epi16(t3, t4), counter);
465 
466  counter = _mm_sub_epi16(counter, decrement);
467  kr = _mm_shuffle_epi8(k, _mm_set_epi8(1,0,1,0, 1,0,1,0, 1,0,1,0, 1,0,1,0));
468 
469  // Even round
470  t1 = RotateRight16<8>(a);
471  t3 = RotateRight16<8>(e);
472  t2 = _mm_xor_si128(RotateLeft16<1>(b), kr);
473  t4 = _mm_xor_si128(RotateLeft16<1>(f), kr);
474  a = _mm_xor_si128(_mm_sub_epi16(t1, t2), counter);
475  e = _mm_xor_si128(_mm_sub_epi16(t3, t4), counter);
476 
477  counter = _mm_sub_epi16(counter, decrement);
478  }
479 
480  // [A1 B1 .. G1 H1][A2 B2 .. G2 H2] ... => [A1 A2 .. A6 A7][B1 B2 .. B6 B7] ...
481  block0 = RepackXMM<0>(a,b,c,d,e,f,g,h);
482 }
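// Decryption walks the rounds backwards with `counter` starting at rounds-1.
// Each step is the inverse of an encryption round, roughly
//   d = (ROR1(d) - (ROL8(a) ^ rk[i % 16])) ^ i;
// so the rotate, the modular addition and the counter XOR are undone in
// reverse order.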
483 
484 inline void CHAM64_Enc_2_Blocks(__m128i &block0,
485  __m128i &block1, const word16 *subkeys, unsigned int /*rounds*/)
486 {
487  // Rearrange the data for vectorization. UnpackXMM includes a
488  // little-endian swap for SSE. Thanks to Peter Cordes for help
489  // with packing and unpacking.
490  // [A1 A2 .. A6 A7][B1 B2 .. B6 B7] ... => [A1 B1 .. G1 H1][A2 B2 .. G2 H2] ...
491  __m128i a = UnpackXMM<0>(block0, block1);
492  __m128i b = UnpackXMM<1>(block0, block1);
493  __m128i c = UnpackXMM<2>(block0, block1);
494  __m128i d = UnpackXMM<3>(block0, block1);
495  __m128i e = UnpackXMM<4>(block0, block1);
496  __m128i f = UnpackXMM<5>(block0, block1);
497  __m128i g = UnpackXMM<6>(block0, block1);
498  __m128i h = UnpackXMM<7>(block0, block1);
499 
500  const unsigned int rounds = 80;
501  __m128i counter = _mm_set_epi16(0,0,0,0,0,0,0,0);
502  __m128i increment = _mm_set_epi16(1,1,1,1,1,1,1,1);
503 
504  const unsigned int MASK = 15;
505  for (int i=0; i<static_cast<int>(rounds); i+=4)
506  {
507  __m128i k, kr, t1, t2, t3, t4;
508  k = _mm_castpd_si128(_mm_load_sd((const double*)(&subkeys[i & MASK])));
509 
510  // Shuffle out key
511  kr = _mm_shuffle_epi8(k, _mm_set_epi8(1,0,1,0, 1,0,1,0, 1,0,1,0, 1,0,1,0));
512 
513  t1 = _mm_xor_si128(a, counter);
514  t3 = _mm_xor_si128(e, counter);
515  t2 = _mm_xor_si128(RotateLeft16<1>(b), kr);
516  t4 = _mm_xor_si128(RotateLeft16<1>(f), kr);
517  a = RotateLeft16<8>(_mm_add_epi16(t1, t2));
518  e = RotateLeft16<8>(_mm_add_epi16(t3, t4));
519 
520  counter = _mm_add_epi16(counter, increment);
521  kr = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,3,2, 3,2,3,2, 3,2,3,2, 3,2,3,2));
522 
523  t1 = _mm_xor_si128(b, counter);
524  t3 = _mm_xor_si128(f, counter);
525  t2 = _mm_xor_si128(RotateLeft16<8>(c), kr);
526  t4 = _mm_xor_si128(RotateLeft16<8>(g), kr);
527  b = RotateLeft16<1>(_mm_add_epi16(t1, t2));
528  f = RotateLeft16<1>(_mm_add_epi16(t3, t4));
529 
530  counter = _mm_add_epi16(counter, increment);
531  kr = _mm_shuffle_epi8(k, _mm_set_epi8(5,4,5,4, 5,4,5,4, 5,4,5,4, 5,4,5,4));
532 
533  t1 = _mm_xor_si128(c, counter);
534  t3 = _mm_xor_si128(g, counter);
535  t2 = _mm_xor_si128(RotateLeft16<1>(d), kr);
536  t4 = _mm_xor_si128(RotateLeft16<1>(h), kr);
537  c = RotateLeft16<8>(_mm_add_epi16(t1, t2));
538  g = RotateLeft16<8>(_mm_add_epi16(t3, t4));
539 
540  counter = _mm_add_epi16(counter, increment);
541  kr = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,7,6, 7,6,7,6, 7,6,7,6, 7,6,7,6));
542 
543  t1 = _mm_xor_si128(d, counter);
544  t3 = _mm_xor_si128(h, counter);
545  t2 = _mm_xor_si128(RotateLeft16<8>(a), kr);
546  t4 = _mm_xor_si128(RotateLeft16<8>(e), kr);
547  d = RotateLeft16<1>(_mm_add_epi16(t1, t2));
548  h = RotateLeft16<1>(_mm_add_epi16(t3, t4));
549 
550  counter = _mm_add_epi16(counter, increment);
551  }
552 
553  // [A1 B1 .. G1 H1][A2 B2 .. G2 H2] ... => [A1 A2 .. A6 A7][B1 B2 .. B6 B7] ...
554  block0 = RepackXMM<0>(a,b,c,d,e,f,g,h);
555  block1 = RepackXMM<1>(a,b,c,d,e,f,g,h);
556 }
557 
558 inline void CHAM64_Dec_2_Blocks(__m128i &block0,
559  __m128i &block1, const word16 *subkeys, unsigned int /*rounds*/)
560 {
561  // Rearrange the data for vectorization. UnpackXMM includes a
562  // little-endian swap for SSE. Thanks to Peter Cordes for help
563  // with packing and unpacking.
564  // [A1 A2 .. A6 A7][B1 B2 .. B6 B7] ... => [A1 B1 .. G1 H1][A2 B2 .. G2 H2] ...
565  __m128i a = UnpackXMM<0>(block0, block1);
566  __m128i b = UnpackXMM<1>(block0, block1);
567  __m128i c = UnpackXMM<2>(block0, block1);
568  __m128i d = UnpackXMM<3>(block0, block1);
569  __m128i e = UnpackXMM<4>(block0, block1);
570  __m128i f = UnpackXMM<5>(block0, block1);
571  __m128i g = UnpackXMM<6>(block0, block1);
572  __m128i h = UnpackXMM<7>(block0, block1);
573 
574  const unsigned int rounds = 80;
575  __m128i counter = _mm_set_epi16(rounds-1,rounds-1,rounds-1,rounds-1, rounds-1,rounds-1,rounds-1,rounds-1);
576  __m128i decrement = _mm_set_epi16(1,1,1,1,1,1,1,1);
577 
578  const unsigned int MASK = 15;
579  for (int i = static_cast<int>(rounds)-1; i >= 0; i-=4)
580  {
581  __m128i k, kr, t1, t2, t3, t4;
582  k = _mm_castpd_si128(_mm_load_sd((const double*)(&subkeys[(i-3) & MASK])));
583 
584  // Shuffle out key
585  kr = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,7,6, 7,6,7,6, 7,6,7,6, 7,6,7,6));
586 
587  // Odd round
588  t1 = RotateRight16<1>(d);
589  t3 = RotateRight16<1>(h);
590  t2 = _mm_xor_si128(RotateLeft16<8>(a), kr);
591  t4 = _mm_xor_si128(RotateLeft16<8>(e), kr);
592  d = _mm_xor_si128(_mm_sub_epi16(t1, t2), counter);
593  h = _mm_xor_si128(_mm_sub_epi16(t3, t4), counter);
594 
595  counter = _mm_sub_epi16(counter, decrement);
596  kr = _mm_shuffle_epi8(k, _mm_set_epi8(5,4,5,4, 5,4,5,4, 5,4,5,4, 5,4,5,4));
597 
598  // Even round
599  t1 = RotateRight16<8>(c);
600  t3 = RotateRight16<8>(g);
601  t2 = _mm_xor_si128(RotateLeft16<1>(d), kr);
602  t4 = _mm_xor_si128(RotateLeft16<1>(h), kr);
603  c = _mm_xor_si128(_mm_sub_epi16(t1, t2), counter);
604  g = _mm_xor_si128(_mm_sub_epi16(t3, t4), counter);
605 
606  counter = _mm_sub_epi16(counter, decrement);
607  kr = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,3,2, 3,2,3,2, 3,2,3,2, 3,2,3,2));
608 
609  // Odd round
610  t1 = RotateRight16<1>(b);
611  t3 = RotateRight16<1>(f);
612  t2 = _mm_xor_si128(RotateLeft16<8>(c), kr);
613  t4 = _mm_xor_si128(RotateLeft16<8>(g), kr);
614  b = _mm_xor_si128(_mm_sub_epi16(t1, t2), counter);
615  f = _mm_xor_si128(_mm_sub_epi16(t3, t4), counter);
616 
617  counter = _mm_sub_epi16(counter, decrement);
618  kr = _mm_shuffle_epi8(k, _mm_set_epi8(1,0,1,0, 1,0,1,0, 1,0,1,0, 1,0,1,0));
619 
620  // Even round
621  t1 = RotateRight16<8>(a);
622  t3 = RotateRight16<8>(e);
623  t2 = _mm_xor_si128(RotateLeft16<1>(b), kr);
624  t4 = _mm_xor_si128(RotateLeft16<1>(f), kr);
625  a = _mm_xor_si128(_mm_sub_epi16(t1, t2), counter);
626  e = _mm_xor_si128(_mm_sub_epi16(t3, t4), counter);
627 
628  counter = _mm_sub_epi16(counter, decrement);
629  }
630 
631  // [A1 B1 .. G1 H1][A2 B2 .. G2 H2] ... => [A1 A2 .. A6 A7][B1 B2 .. B6 B7] ...
632  block0 = RepackXMM<0>(a,b,c,d,e,f,g,h);
633  block1 = RepackXMM<1>(a,b,c,d,e,f,g,h);
634 }
635 
636 NAMESPACE_END // W16
637 
638 //////////////////////////////////////////////////////////////////////////
639 
640 NAMESPACE_BEGIN(W32) // CHAM128, 32-bit word size
641 
642 template <unsigned int R>
643 inline __m128i RotateLeft32(const __m128i& val)
644 {
645 #if defined(CRYPTOPP_AVX512_ROTATE)
646  return _mm_rol_epi32(val, R);
647 #elif defined(__XOP__)
648  return _mm_roti_epi32(val, R);
649 #else
650  return _mm_or_si128(
651  _mm_slli_epi32(val, R), _mm_srli_epi32(val, 32-R));
652 #endif
653 }
654 
655 template <unsigned int R>
656 inline __m128i RotateRight32(const __m128i& val)
657 {
658 #if defined(CRYPTOPP_AVX512_ROTATE)
659  return _mm_ror_epi32(val, R);
660 #elif defined(__XOP__)
661  return _mm_roti_epi32(val, 32-R);
662 #else
663  return _mm_or_si128(
664  _mm_slli_epi32(val, 32-R), _mm_srli_epi32(val, R));
665 #endif
666 }
667 
668 // Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
669 template <>
670 inline __m128i RotateLeft32<8>(const __m128i& val)
671 {
672 #if defined(__XOP__)
673  return _mm_roti_epi32(val, 8);
674 #else
675  const __m128i mask = _mm_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3);
676  return _mm_shuffle_epi8(val, mask);
677 #endif
678 }
679 
680 // Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
681 template <>
682 inline __m128i RotateRight32<8>(const __m128i& val)
683 {
684 #if defined(__XOP__)
685  return _mm_roti_epi32(val, 32-8);
686 #else
687  const __m128i mask = _mm_set_epi8(12,15,14,13, 8,11,10,9, 4,7,6,5, 0,3,2,1);
688  return _mm_shuffle_epi8(val, mask);
689 #endif
690 }
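// As in the 16-bit code, a 32-bit rotate by 8 is a byte permutation, so one
// _mm_shuffle_epi8 replaces the shift-and-OR pair. AVX-512 and XOP builds
// take the native rotate paths selected above instead.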
691 
692 template <unsigned int IDX>
693 inline __m128i UnpackXMM(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
694 {
695  // Should not be instantiated
696  CRYPTOPP_UNUSED(a); CRYPTOPP_UNUSED(b);
697  CRYPTOPP_UNUSED(c); CRYPTOPP_UNUSED(d);
698  CRYPTOPP_ASSERT(0);
699  return _mm_setzero_si128();
700 }
701 
702 template <>
703 inline __m128i UnpackXMM<0>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
704 {
705  // The shuffle converts to and from little-endian for SSE. A specialized
706  // CHAM implementation can avoid the shuffle by framing the data for
707  // encryption, decryption and benchmarks. The library cannot take the
708  // speed-up because of the byte oriented API.
709  const __m128i r1 = _mm_unpacklo_epi32(a, b);
710  const __m128i r2 = _mm_unpacklo_epi32(c, d);
711  return _mm_shuffle_epi8(_mm_unpacklo_epi64(r1, r2),
712  _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3));
713 }
714 
715 template <>
716 inline __m128i UnpackXMM<1>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
717 {
718  // The shuffle converts to and from little-endian for SSE. A specialized
719  // CHAM implementation can avoid the shuffle by framing the data for
720  // encryption, decryption and benchmarks. The library cannot take the
721  // speed-up because of the byte oriented API.
722  const __m128i r1 = _mm_unpacklo_epi32(a, b);
723  const __m128i r2 = _mm_unpacklo_epi32(c, d);
724  return _mm_shuffle_epi8(_mm_unpackhi_epi64(r1, r2),
725  _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3));
726 }
727 
728 template <>
729 inline __m128i UnpackXMM<2>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
730 {
731  // The shuffle converts to and from little-endian for SSE. A specialized
732  // CHAM implementation can avoid the shuffle by framing the data for
733  // encryption, decryption and benchmarks. The library cannot take the
734  // speed-up because of the byte oriented API.
735  const __m128i r1 = _mm_unpackhi_epi32(a, b);
736  const __m128i r2 = _mm_unpackhi_epi32(c, d);
737  return _mm_shuffle_epi8(_mm_unpacklo_epi64(r1, r2),
738  _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3));
739 }
740 
741 template <>
742 inline __m128i UnpackXMM<3>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
743 {
744  // The shuffle converts to and from little-endian for SSE. A specialized
745  // CHAM implementation can avoid the shuffle by framing the data for
746  // encryption, decryption and benchmarks. The library cannot take the
747  // speed-up because of the byte oriented API.
748  const __m128i r1 = _mm_unpackhi_epi32(a, b);
749  const __m128i r2 = _mm_unpackhi_epi32(c, d);
750  return _mm_shuffle_epi8(_mm_unpackhi_epi64(r1, r2),
751  _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3));
752 }
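// These four specializations transpose 32-bit words across the inputs
// (UnpackXMM<IDX> gathers word IDX of a, b, c and d) and byte-swap each word,
// mirroring the 16-bit UnpackXMM routines earlier in the file.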
753 
754 template <unsigned int IDX>
755 inline __m128i UnpackXMM(const __m128i& v)
756 {
757  // Should not be instantiated
758  CRYPTOPP_UNUSED(v); CRYPTOPP_ASSERT(0);
759  return _mm_setzero_si128();
760 }
761 
762 template <>
763 inline __m128i UnpackXMM<0>(const __m128i& v)
764 {
765  return _mm_shuffle_epi8(v, _mm_set_epi8(0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3));
766 }
767 
768 template <>
769 inline __m128i UnpackXMM<1>(const __m128i& v)
770 {
771  return _mm_shuffle_epi8(v, _mm_set_epi8(4,5,6,7, 4,5,6,7, 4,5,6,7, 4,5,6,7));
772 }
773 
774 template <>
775 inline __m128i UnpackXMM<2>(const __m128i& v)
776 {
777  return _mm_shuffle_epi8(v, _mm_set_epi8(8,9,10,11, 8,9,10,11, 8,9,10,11, 8,9,10,11));
778 }
779 
780 template <>
781 inline __m128i UnpackXMM<3>(const __m128i& v)
782 {
783  return _mm_shuffle_epi8(v, _mm_set_epi8(12,13,14,15, 12,13,14,15, 12,13,14,15, 12,13,14,15));
784 }
785 
786 template <unsigned int IDX>
787 inline __m128i RepackXMM(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
788 {
789  return UnpackXMM<IDX>(a, b, c, d);
790 }
791 
792 template <unsigned int IDX>
793 inline __m128i RepackXMM(const __m128i& v)
794 {
795  return UnpackXMM<IDX>(v);
796 }
797 
798 inline void CHAM128_Enc_Block(__m128i &block0,
799  const word32 *subkeys, unsigned int rounds)
800 {
801  // Rearrange the data for vectorization. UnpackXMM includes a
802  // little-endian swap for SSE. Thanks to Peter Cordes for help
803  // with packing and unpacking.
804  // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 B1 C1 D1][A2 B2 C2 D2] ...
805  __m128i a = UnpackXMM<0>(block0);
806  __m128i b = UnpackXMM<1>(block0);
807  __m128i c = UnpackXMM<2>(block0);
808  __m128i d = UnpackXMM<3>(block0);
809 
810  __m128i counter = _mm_set_epi32(0,0,0,0);
811  __m128i increment = _mm_set_epi32(1,1,1,1);
812 
813  const unsigned int MASK = (rounds == 80 ? 7 : 15);
814  for (int i=0; i<static_cast<int>(rounds); i+=4)
815  {
816  __m128i k, k1, k2, t1, t2;
817  k = _mm_castpd_si128(_mm_load_sd((const double*)(&subkeys[(i+0) & MASK])));
818 
819  // Shuffle out two subkeys
820  k1 = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0));
821  k2 = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4));
822 
823  t1 = _mm_xor_si128(a, counter);
824  t2 = _mm_xor_si128(RotateLeft32<1>(b), k1);
825  a = RotateLeft32<8>(_mm_add_epi32(t1, t2));
826 
827  counter = _mm_add_epi32(counter, increment);
828 
829  t1 = _mm_xor_si128(b, counter);
830  t2 = _mm_xor_si128(RotateLeft32<8>(c), k2);
831  b = RotateLeft32<1>(_mm_add_epi32(t1, t2));
832 
833  counter = _mm_add_epi32(counter, increment);
834 
835  k = _mm_castpd_si128(_mm_load_sd((const double*)(&subkeys[(i+2) & MASK])));
836 
837  // Shuffle out two subkeys
838  k1 = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0));
839  k2 = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4));
840 
841  t1 = _mm_xor_si128(c, counter);
842  t2 = _mm_xor_si128(RotateLeft32<1>(d), k1);
843  c = RotateLeft32<8>(_mm_add_epi32(t1, t2));
844 
845  counter = _mm_add_epi32(counter, increment);
846 
847  t1 = _mm_xor_si128(d, counter);
848  t2 = _mm_xor_si128(RotateLeft32<8>(a), k2);
849  d = RotateLeft32<1>(_mm_add_epi32(t1, t2));
850 
851  counter = _mm_add_epi32(counter, increment);
852  }
853 
854  // [A1 B1 C1 D1][A2 B2 C2 D2] ... => [A1 A2 A3 A4][B1 B2 B3 B4] ...
855  block0 = RepackXMM<0>(a,b,c,d);
856 }
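// The MASK value reflects the CHAM-128 key schedules: CHAM-128/128 (80 rounds)
// has eight 32-bit round keys and CHAM-128/256 has sixteen, so the subkey
// index wraps at 7 or 15. Each _mm_load_sd pulls two consecutive round keys,
// which are then broadcast to all four lanes as k1 and k2.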
857 
858 inline void CHAM128_Dec_Block(__m128i &block0,
859  const word32 *subkeys, unsigned int rounds)
860 {
861  // Rearrange the data for vectorization. UnpackXMM includes a
862  // little-endian swap for SSE. Thanks to Peter Cordes for help
863  // with packing and unpacking.
864  // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 B1 C1 D1][A2 B2 C2 D2] ...
865  __m128i a = UnpackXMM<0>(block0);
866  __m128i b = UnpackXMM<1>(block0);
867  __m128i c = UnpackXMM<2>(block0);
868  __m128i d = UnpackXMM<3>(block0);
869 
870  __m128i counter = _mm_set_epi32(rounds-1,rounds-1,rounds-1,rounds-1);
871  __m128i decrement = _mm_set_epi32(1,1,1,1);
872 
873  const unsigned int MASK = (rounds == 80 ? 7 : 15);
874  for (int i = static_cast<int>(rounds)-1; i >= 0; i-=4)
875  {
876  __m128i k, k1, k2, t1, t2;
877  k = _mm_castpd_si128(_mm_load_sd((const double*)(&subkeys[(i-1) & MASK])));
878 
879  // Shuffle out two subkeys
880  k1 = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4));
881  k2 = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0));
882 
883  // Odd round
884  t1 = RotateRight32<1>(d);
885  t2 = _mm_xor_si128(RotateLeft32<8>(a), k1);
886  d = _mm_xor_si128(_mm_sub_epi32(t1, t2), counter);
887 
888  counter = _mm_sub_epi32(counter, decrement);
889 
890  // Even round
891  t1 = RotateRight32<8>(c);
892  t2 = _mm_xor_si128(RotateLeft32<1>(d), k2);
893  c = _mm_xor_si128(_mm_sub_epi32(t1, t2), counter);
894 
895  counter = _mm_sub_epi32(counter, decrement);
896  k = _mm_castpd_si128(_mm_load_sd((const double*)(&subkeys[(i-3) & MASK])));
897 
898  // Shuffle out two subkeys
899  k1 = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4));
900  k2 = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0));
901 
902  // Odd round
903  t1 = RotateRight32<1>(b);
904  t2 = _mm_xor_si128(RotateLeft32<8>(c), k1);
905  b = _mm_xor_si128(_mm_sub_epi32(t1, t2), counter);
906 
907  counter = _mm_sub_epi32(counter, decrement);
908 
909  // Even round
910  t1 = RotateRight32<8>(a);
911  t2 = _mm_xor_si128(RotateLeft32<1>(b), k2);
912  a = _mm_xor_si128(_mm_sub_epi32(t1, t2), counter);
913 
914  counter = _mm_sub_epi32(counter, decrement);
915  }
916 
917  // [A1 B1 C1 D1][A2 B2 C2 D2] ... => [A1 A2 A3 A4][B1 B2 B3 B4] ...
918  block0 = RepackXMM<0>(a,b,c,d);
919 }
920 
921 inline void CHAM128_Enc_4_Blocks(__m128i &block0, __m128i &block1,
922  __m128i &block2, __m128i &block3, const word32 *subkeys, unsigned int rounds)
923 {
924  // Rearrange the data for vectorization. UnpackXMM includes a
925  // little-endian swap for SSE. Thanks to Peter Cordes for help
926  // with packing and unpacking.
927  // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 B1 C1 D1][A2 B2 C2 D2] ...
928  __m128i a = UnpackXMM<0>(block0, block1, block2, block3);
929  __m128i b = UnpackXMM<1>(block0, block1, block2, block3);
930  __m128i c = UnpackXMM<2>(block0, block1, block2, block3);
931  __m128i d = UnpackXMM<3>(block0, block1, block2, block3);
932 
933  __m128i counter = _mm_set_epi32(0,0,0,0);
934  __m128i increment = _mm_set_epi32(1,1,1,1);
935 
936  const unsigned int MASK = (rounds == 80 ? 7 : 15);
937  for (int i=0; i<static_cast<int>(rounds); i+=4)
938  {
939  __m128i k, k1, k2, t1, t2;
940  k = _mm_castpd_si128(_mm_load_sd((const double*)(&subkeys[(i+0) & MASK])));
941 
942  // Shuffle out two subkeys
943  k1 = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0));
944  k2 = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4));
945 
946  t1 = _mm_xor_si128(a, counter);
947  t2 = _mm_xor_si128(RotateLeft32<1>(b), k1);
948  a = RotateLeft32<8>(_mm_add_epi32(t1, t2));
949 
950  counter = _mm_add_epi32(counter, increment);
951 
952  t1 = _mm_xor_si128(b, counter);
953  t2 = _mm_xor_si128(RotateLeft32<8>(c), k2);
954  b = RotateLeft32<1>(_mm_add_epi32(t1, t2));
955 
956  counter = _mm_add_epi32(counter, increment);
957  k = _mm_castpd_si128(_mm_load_sd((const double*)(&subkeys[(i+2) & MASK])));
958 
959  // Shuffle out two subkeys
960  k1 = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0));
961  k2 = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4));
962 
963  t1 = _mm_xor_si128(c, counter);
964  t2 = _mm_xor_si128(RotateLeft32<1>(d), k1);
965  c = RotateLeft32<8>(_mm_add_epi32(t1, t2));
966 
967  counter = _mm_add_epi32(counter, increment);
968 
969  t1 = _mm_xor_si128(d, counter);
970  t2 = _mm_xor_si128(RotateLeft32<8>(a), k2);
971  d = RotateLeft32<1>(_mm_add_epi32(t1, t2));
972 
973  counter = _mm_add_epi32(counter, increment);
974  }
975 
976  // [A1 B1 C1 D1][A2 B2 C2 D2] ... => [A1 A2 A3 A4][B1 B2 B3 B4] ...
977  block0 = RepackXMM<0>(a,b,c,d);
978  block1 = RepackXMM<1>(a,b,c,d);
979  block2 = RepackXMM<2>(a,b,c,d);
980  block3 = RepackXMM<3>(a,b,c,d);
981 }
982 
983 inline void CHAM128_Dec_4_Blocks(__m128i &block0, __m128i &block1,
984  __m128i &block2, __m128i &block3, const word32 *subkeys, unsigned int rounds)
985 {
986  // Rearrange the data for vectorization. UnpackXMM includes a
987  // little-endian swap for SSE. Thanks to Peter Cordes for help
988  // with packing and unpacking.
989  // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 B1 C1 D1][A2 B2 C2 D2] ...
990  __m128i a = UnpackXMM<0>(block0, block1, block2, block3);
991  __m128i b = UnpackXMM<1>(block0, block1, block2, block3);
992  __m128i c = UnpackXMM<2>(block0, block1, block2, block3);
993  __m128i d = UnpackXMM<3>(block0, block1, block2, block3);
994 
995  __m128i counter = _mm_set_epi32(rounds-1,rounds-1,rounds-1,rounds-1);
996  __m128i decrement = _mm_set_epi32(1,1,1,1);
997 
998  const unsigned int MASK = (rounds == 80 ? 7 : 15);
999  for (int i = static_cast<int>(rounds)-1; i >= 0; i-=4)
1000  {
1001  __m128i k, k1, k2, t1, t2;
1002  k = _mm_castpd_si128(_mm_load_sd((const double*)(&subkeys[(i-1) & MASK])));
1003 
1004  // Shuffle out two subkeys
1005  k1 = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4));
1006  k2 = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0));
1007 
1008  // Odd round
1009  t1 = RotateRight32<1>(d);
1010  t2 = _mm_xor_si128(RotateLeft32<8>(a), k1);
1011  d = _mm_xor_si128(_mm_sub_epi32(t1, t2), counter);
1012 
1013  counter = _mm_sub_epi32(counter, decrement);
1014 
1015  // Even round
1016  t1 = RotateRight32<8>(c);
1017  t2 = _mm_xor_si128(RotateLeft32<1>(d), k2);
1018  c = _mm_xor_si128(_mm_sub_epi32(t1, t2), counter);
1019 
1020  counter = _mm_sub_epi32(counter, decrement);
1021  k = _mm_castpd_si128(_mm_load_sd((const double*)(&subkeys[(i-3) & MASK])));
1022 
1023  // Shuffle out two subkeys
1024  k1 = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4));
1025  k2 = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0));
1026 
1027  // Odd round
1028  t1 = RotateRight32<1>(b);
1029  t2 = _mm_xor_si128(RotateLeft32<8>(c), k1);
1030  b = _mm_xor_si128(_mm_sub_epi32(t1, t2), counter);
1031 
1032  counter = _mm_sub_epi32(counter, decrement);
1033 
1034  // Even round
1035  t1 = RotateRight32<8>(a);
1036  t2 = _mm_xor_si128(RotateLeft32<1>(b), k2);
1037  a = _mm_xor_si128(_mm_sub_epi32(t1, t2), counter);
1038 
1039  counter = _mm_sub_epi32(counter, decrement);
1040  }
1041 
1042  // [A1 B1 C1 D1][A2 B2 C2 D2] ... => [A1 A2 A3 A4][B1 B2 B3 B4] ...
1043  block0 = RepackXMM<0>(a,b,c,d);
1044  block1 = RepackXMM<1>(a,b,c,d);
1045  block2 = RepackXMM<2>(a,b,c,d);
1046  block3 = RepackXMM<3>(a,b,c,d);
1047 }
1048 
1049 //////////////////////////////////////////////////////////////////////////
1050 
1051 NAMESPACE_END // W32
1052 
1053 #endif // CRYPTOPP_SSSE3_AVAILABLE
1054 
1055 ANONYMOUS_NAMESPACE_END
1056 
1057 NAMESPACE_BEGIN(CryptoPP)
1058 
1059 #if defined(CRYPTOPP_SSSE3_AVAILABLE)
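// The entry points below hand the block workers above to the
// AdvancedProcessBlocks adapters from adv_simd.h, which take care of loading
// the input blocks, applying xorBlocks and honoring the caller's flags before
// invoking the single-block or multi-block routines.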
1060 size_t CHAM64_Enc_AdvancedProcessBlocks_SSSE3(const word16* subKeys, size_t rounds,
1061  const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
1062 {
1063  return AdvancedProcessBlocks64_2x1_SSE(W16::CHAM64_Enc_Block, W16::CHAM64_Enc_2_Blocks,
1064  subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1065 }
1066 
1067 size_t CHAM64_Dec_AdvancedProcessBlocks_SSSE3(const word16* subKeys, size_t rounds,
1068  const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
1069 {
1070  return AdvancedProcessBlocks64_2x1_SSE(W16::CHAM64_Dec_Block, W16::CHAM64_Dec_2_Blocks,
1071  subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1072 }
1073 
1074 size_t CHAM128_Enc_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t rounds,
1075  const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
1076 {
1077  return AdvancedProcessBlocks128_4x1_SSE(W32::CHAM128_Enc_Block, W32::CHAM128_Enc_4_Blocks,
1078  subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1079 }
1080 
1081 size_t CHAM128_Dec_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t rounds,
1082  const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
1083 {
1084  return AdvancedProcessBlocks128_4x1_SSE(W32::CHAM128_Dec_Block, W32::CHAM128_Dec_4_Blocks,
1085  subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1086 }
1087 #endif // CRYPTOPP_SSSE3_AVAILABLE
1088 
1089 NAMESPACE_END