#if CRYPTOPP_GCC_DIAGNOSTIC_AVAILABLE
# pragma GCC diagnostic ignored "-Wcast-align"
# pragma GCC diagnostic ignored "-Wunused-function"
#endif

// Squash MS LNK4221 and libtool warnings
extern const char DONNA_SSE_FNAME[] = __FILE__;

#if (CRYPTOPP_CURVE25519_SSE2)

#include "donna_sse.h"

ANONYMOUS_NAMESPACE_BEGIN
using CryptoPP::byte;
using CryptoPP::word32;
using CryptoPP::sword32;
using CryptoPP::word64;
using CryptoPP::sword64;
using CryptoPP::GetBlock;
using CryptoPP::LittleEndian;

using namespace CryptoPP::Donna::ArchSSE;
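
// A bignum25519 holds a field element mod 2^255-19 in ten limbs that
// alternate 26 and 25 bits, packed into three 16-byte aligned vectors.

// out = in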
void
curve25519_copy(bignum25519 out, const bignum25519 in) {
    xmmi x0,x1,x2;
    x0 = _mm_load_si128((xmmi*)in + 0);
    x1 = _mm_load_si128((xmmi*)in + 1);
    x2 = _mm_load_si128((xmmi*)in + 2);
    _mm_store_si128((xmmi*)out + 0, x0);
    _mm_store_si128((xmmi*)out + 1, x1);
    _mm_store_si128((xmmi*)out + 2, x2);
}
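
// Expand a 32-byte little-endian string into the ten-limb representation.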
void
curve25519_expand(bignum25519 out, const byte in[32]) {
    word32 x0,x1,x2,x3,x4,x5,x6,x7;

    x0 = *(word32 *)(in + 0);
    x1 = *(word32 *)(in + 4);
    x2 = *(word32 *)(in + 8);
    x3 = *(word32 *)(in + 12);
    x4 = *(word32 *)(in + 16);
    x5 = *(word32 *)(in + 20);
    x6 = *(word32 *)(in + 24);
    x7 = *(word32 *)(in + 28);

    out[0] = (                      x0       ) & reduce_mask_26;
    out[1] = ((((word64)x1 << 32) | x0) >> 26) & reduce_mask_25;
    out[2] = ((((word64)x2 << 32) | x1) >> 19) & reduce_mask_26;
    out[3] = ((((word64)x3 << 32) | x2) >> 13) & reduce_mask_25;
    out[4] = ((                     x3) >>  6) & reduce_mask_26;
    out[5] = (                      x4       ) & reduce_mask_25;
    out[6] = ((((word64)x5 << 32) | x4) >> 25) & reduce_mask_26;
    out[7] = ((((word64)x6 << 32) | x5) >> 19) & reduce_mask_25;
    out[8] = ((((word64)x7 << 32) | x6) >> 12) & reduce_mask_26;
    out[9] = ((                     x7) >>  6) & reduce_mask_25;
}
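
// Contract the ten limbs back into a canonical (fully reduced mod
// 2^255-19) 32-byte little-endian string.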
void
curve25519_contract(byte out[32], const bignum25519 in) {
    ALIGN(16) bignum25519 f;

    curve25519_copy(f, in);

    #define carry_pass() \
        f[1] += f[0] >> 26; f[0] &= reduce_mask_26; \
        f[2] += f[1] >> 25; f[1] &= reduce_mask_25; \
        f[3] += f[2] >> 26; f[2] &= reduce_mask_26; \
        f[4] += f[3] >> 25; f[3] &= reduce_mask_25; \
        f[5] += f[4] >> 26; f[4] &= reduce_mask_26; \
        f[6] += f[5] >> 25; f[5] &= reduce_mask_25; \
        f[7] += f[6] >> 26; f[6] &= reduce_mask_26; \
        f[8] += f[7] >> 25; f[7] &= reduce_mask_25; \
        f[9] += f[8] >> 26; f[8] &= reduce_mask_26;

    #define carry_pass_full() \
        carry_pass() \
        f[0] += 19 * (f[9] >> 25); f[9] &= reduce_mask_25;

    #define carry_pass_final() \
        carry_pass() \
        f[9] &= reduce_mask_25;

    carry_pass_full()
    carry_pass_full()

    /* now t is between 0 and 2^255-1, properly carried. */
    /* case 1: between 0 and 2^255-20. case 2: between 2^255-19 and 2^255-1. */
    f[0] += 19;
    carry_pass_full()

    /* now between 19 and 2^255-1 in both cases, and offset by 19. */
    f[0] += (1 << 26) - 19;
    f[1] += (1 << 25) - 1;
    f[2] += (1 << 26) - 1;
    f[3] += (1 << 25) - 1;
    f[4] += (1 << 26) - 1;
    f[5] += (1 << 25) - 1;
    f[6] += (1 << 26) - 1;
    f[7] += (1 << 25) - 1;
    f[8] += (1 << 26) - 1;
    f[9] += (1 << 25) - 1;
    /* now between 2^255 and 2^256-20, and offset by 2^255. */
    carry_pass_final()

    #undef carry_pass
    #undef carry_pass_full
    #undef carry_pass_final

    *(word32 *)(out +  0) = ((f[0]      ) | (f[1] << 26));
    *(word32 *)(out +  4) = ((f[1] >>  6) | (f[2] << 19));
    *(word32 *)(out +  8) = ((f[2] >> 13) | (f[3] << 13));
    *(word32 *)(out + 12) = ((f[3] >> 19) | (f[4] <<  6));
    *(word32 *)(out + 16) = ((f[5]      ) | (f[6] << 25));
    *(word32 *)(out + 20) = ((f[6] >>  7) | (f[7] << 19));
    *(word32 *)(out + 24) = ((f[7] >> 13) | (f[8] << 12));
    *(word32 *)(out + 28) = ((f[8] >> 20) | (f[9] <<  6));
}
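
// Constant-time conditional swap: exchange a and b when iswap is 1,
// leave both unchanged when iswap is 0. The mask/xor pattern avoids
// secret-dependent branches.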
void
curve25519_swap_conditional(bignum25519 a, bignum25519 b, word32 iswap) {
    const word32 swap = (word32)(-(sword32)iswap);
    xmmi a0,a1,a2,b0,b1,b2,x0,x1,x2;
    xmmi mask = _mm_cvtsi32_si128(swap);
    mask = _mm_shuffle_epi32(mask, 0);
    a0 = _mm_load_si128((xmmi *)a + 0);
    a1 = _mm_load_si128((xmmi *)a + 1);
    a2 = _mm_load_si128((xmmi *)a + 2);
    b0 = _mm_load_si128((xmmi *)b + 0);
    b1 = _mm_load_si128((xmmi *)b + 1);
    b2 = _mm_load_si128((xmmi *)b + 2);
    b0 = _mm_xor_si128(a0, b0);
    b1 = _mm_xor_si128(a1, b1);
    b2 = _mm_xor_si128(a2, b2);
    x0 = _mm_and_si128(b0, mask);
    x1 = _mm_and_si128(b1, mask);
    x2 = _mm_and_si128(b2, mask);
    x0 = _mm_xor_si128(x0, a0);
    x1 = _mm_xor_si128(x1, a1);
    x2 = _mm_xor_si128(x2, a2);
    a0 = _mm_xor_si128(x0, b0);
    a1 = _mm_xor_si128(x1, b1);
    a2 = _mm_xor_si128(x2, b2);
    _mm_store_si128((xmmi *)a + 0, x0);
    _mm_store_si128((xmmi *)a + 1, x1);
    _mm_store_si128((xmmi *)a + 2, x2);
    _mm_store_si128((xmmi *)b + 0, a0);
    _mm_store_si128((xmmi *)b + 1, a1);
    _mm_store_si128((xmmi *)b + 2, a2);
}
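
// Interleave the limbs of x and z into "tangled" packed form, so one
// SSE2 instruction operates on both field elements at once.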
void
curve25519_tangle32(packedelem32 *out, const bignum25519 x, const bignum25519 z) {
    xmmi x0,x1,x2,z0,z1,z2;

    x0 = _mm_load_si128((xmmi *)(x + 0));
    x1 = _mm_load_si128((xmmi *)(x + 4));
    x2 = _mm_load_si128((xmmi *)(x + 8));
    z0 = _mm_load_si128((xmmi *)(z + 0));
    z1 = _mm_load_si128((xmmi *)(z + 4));
    z2 = _mm_load_si128((xmmi *)(z + 8));

    out[0].v = _mm_unpacklo_epi32(x0, z0);
    out[1].v = _mm_unpackhi_epi32(x0, z0);
    out[2].v = _mm_unpacklo_epi32(x1, z1);
    out[3].v = _mm_unpackhi_epi32(x1, z1);
    out[4].v = _mm_unpacklo_epi32(x2, z2);
}
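
// Inverse of the tangle: split packed 64-bit lanes back into the
// separate x and z field elements.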
void
curve25519_untangle64(bignum25519 x, bignum25519 z, const packedelem64 *in) {
    _mm_store_si128((xmmi *)(x + 0), _mm_unpacklo_epi64(_mm_unpacklo_epi32(in[0].v, in[1].v), _mm_unpacklo_epi32(in[2].v, in[3].v)));
    _mm_store_si128((xmmi *)(x + 4), _mm_unpacklo_epi64(_mm_unpacklo_epi32(in[4].v, in[5].v), _mm_unpacklo_epi32(in[6].v, in[7].v)));
    _mm_store_si128((xmmi *)(x + 8), _mm_unpacklo_epi32(in[8].v, in[9].v));
    _mm_store_si128((xmmi *)(z + 0), _mm_unpacklo_epi64(_mm_unpackhi_epi32(in[0].v, in[1].v), _mm_unpackhi_epi32(in[2].v, in[3].v)));
    _mm_store_si128((xmmi *)(z + 4), _mm_unpacklo_epi64(_mm_unpackhi_epi32(in[4].v, in[5].v), _mm_unpackhi_epi32(in[6].v, in[7].v)));
    _mm_store_si128((xmmi *)(z + 8), _mm_unpackhi_epi32(in[8].v, in[9].v));
}
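
// Lazy packed addition: out = r + s with no carry; callers rely on
// limb headroom.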
void
curve25519_add_packed32(packedelem32 *out, const packedelem32 *r, const packedelem32 *s) {
    out[0].v = _mm_add_epi32(r[0].v, s[0].v);
    out[1].v = _mm_add_epi32(r[1].v, s[1].v);
    out[2].v = _mm_add_epi32(r[2].v, s[2].v);
    out[3].v = _mm_add_epi32(r[3].v, s[3].v);
    out[4].v = _mm_add_epi32(r[4].v, s[4].v);
}
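
// Packed subtraction: out = r - s. A multiple of p is added first so
// no limb can go negative, then one carry pass re-normalizes.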
void
curve25519_sub_packed32(packedelem32 *out, const packedelem32 *r, const packedelem32 *s) {
    xmmi r0,r1,r2,r3,r4;
    xmmi s0,s1,s2,s3;
    xmmi c1,c2;

    r0 = _mm_add_epi32(r[0].v, packed32zeromodp0.v);
    r1 = _mm_add_epi32(r[1].v, packed32zeromodp1.v);
    r2 = _mm_add_epi32(r[2].v, packed32zeromodp1.v);
    r3 = _mm_add_epi32(r[3].v, packed32zeromodp1.v);
    r4 = _mm_add_epi32(r[4].v, packed32zeromodp1.v);
    r0 = _mm_sub_epi32(r0, s[0].v);
    r1 = _mm_sub_epi32(r1, s[1].v);
    r2 = _mm_sub_epi32(r2, s[2].v);
    r3 = _mm_sub_epi32(r3, s[3].v);
    r4 = _mm_sub_epi32(r4, s[4].v);

    s0 = _mm_unpacklo_epi64(r0, r2);
    s1 = _mm_unpackhi_epi64(r0, r2);
    s2 = _mm_unpacklo_epi64(r1, r3);
    s3 = _mm_unpackhi_epi64(r1, r3);

    c1 = _mm_srli_epi32(s0, 26); c2 = _mm_srli_epi32(s2, 26); s0 = _mm_and_si128(s0, packedmask26262626.v); s2 = _mm_and_si128(s2, packedmask26262626.v); s1 = _mm_add_epi32(s1, c1); s3 = _mm_add_epi32(s3, c2);
    c1 = _mm_srli_epi32(s1, 25); c2 = _mm_srli_epi32(s3, 25); s1 = _mm_and_si128(s1, packedmask25252525.v); s3 = _mm_and_si128(s3, packedmask25252525.v); s2 = _mm_add_epi32(s2, c1); r4 = _mm_add_epi32(r4, _mm_srli_si128(c2, 8)); s0 = _mm_add_epi32(s0, _mm_slli_si128(c2, 8));

    out[0].v = _mm_unpacklo_epi64(s0, s1);
    out[1].v = _mm_unpacklo_epi64(s2, s3);
    out[2].v = _mm_unpackhi_epi64(s0, s1);
    out[3].v = _mm_unpackhi_epi64(s2, s3);
    out[4].v = r4;
}
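
// Packed multiplication of two tangled elements: schoolbook partial
// products, with the high limbs folded back in scaled by 19 (since
// 2^255 = 19 mod p), followed by carry passes to restore 26/25-bit limbs.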
void
curve25519_mul_packed64(packedelem64 *out, const packedelem64 *r, const packedelem64 *s) {
    xmmi r1,r2,r3,r4,r5,r6,r7,r8,r9;
    xmmi r1_2,r3_2,r5_2,r7_2,r9_2;
    xmmi c1,c2;

    out[0].v = _mm_mul_epu32(r[0].v, s[0].v);
    out[1].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[1].v), _mm_mul_epu32(r[1].v, s[0].v));
    r1_2 = _mm_slli_epi32(r[1].v, 1);
    out[2].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r1_2 , s[1].v), _mm_mul_epu32(r[2].v, s[0].v)));
    out[3].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[3].v), _mm_add_epi64(_mm_mul_epu32(r[1].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[1].v), _mm_mul_epu32(r[3].v, s[0].v))));
    r3_2 = _mm_slli_epi32(r[3].v, 1);
    out[4].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[4].v), _mm_add_epi64(_mm_mul_epu32(r1_2 , s[3].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r3_2 , s[1].v), _mm_mul_epu32(r[4].v, s[0].v)))));
    out[5].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[5].v), _mm_add_epi64(_mm_mul_epu32(r[1].v, s[4].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[3].v), _mm_add_epi64(_mm_mul_epu32(r[3].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r[4].v, s[1].v), _mm_mul_epu32(r[5].v, s[0].v))))));
    r5_2 = _mm_slli_epi32(r[5].v, 1);
    out[6].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[6].v), _mm_add_epi64(_mm_mul_epu32(r1_2 , s[5].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[4].v), _mm_add_epi64(_mm_mul_epu32(r3_2 , s[3].v), _mm_add_epi64(_mm_mul_epu32(r[4].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r5_2 , s[1].v), _mm_mul_epu32(r[6].v, s[0].v)))))));
    out[7].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[7].v), _mm_add_epi64(_mm_mul_epu32(r[1].v, s[6].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[5].v), _mm_add_epi64(_mm_mul_epu32(r[3].v, s[4].v), _mm_add_epi64(_mm_mul_epu32(r[4].v, s[3].v), _mm_add_epi64(_mm_mul_epu32(r[5].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r[6].v, s[1].v), _mm_mul_epu32(r[7].v , s[0].v))))))));
    r7_2 = _mm_slli_epi32(r[7].v, 1);
    out[8].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[8].v), _mm_add_epi64(_mm_mul_epu32(r1_2 , s[7].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[6].v), _mm_add_epi64(_mm_mul_epu32(r3_2 , s[5].v), _mm_add_epi64(_mm_mul_epu32(r[4].v, s[4].v), _mm_add_epi64(_mm_mul_epu32(r5_2 , s[3].v), _mm_add_epi64(_mm_mul_epu32(r[6].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r7_2 , s[1].v), _mm_mul_epu32(r[8].v, s[0].v)))))))));
    out[9].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[9].v), _mm_add_epi64(_mm_mul_epu32(r[1].v, s[8].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[7].v), _mm_add_epi64(_mm_mul_epu32(r[3].v, s[6].v), _mm_add_epi64(_mm_mul_epu32(r[4].v, s[5].v), _mm_add_epi64(_mm_mul_epu32(r[5].v, s[4].v), _mm_add_epi64(_mm_mul_epu32(r[6].v, s[3].v), _mm_add_epi64(_mm_mul_epu32(r[7].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r[8].v, s[1].v), _mm_mul_epu32(r[9].v, s[0].v))))))))));

    r1 = _mm_mul_epu32(r[1].v, packednineteen.v);
    r2 = _mm_mul_epu32(r[2].v, packednineteen.v);
    r1_2 = _mm_slli_epi32(r1, 1);
    r3 = _mm_mul_epu32(r[3].v, packednineteen.v);
    r4 = _mm_mul_epu32(r[4].v, packednineteen.v);
    r3_2 = _mm_slli_epi32(r3, 1);
    r5 = _mm_mul_epu32(r[5].v, packednineteen.v);
    r6 = _mm_mul_epu32(r[6].v, packednineteen.v);
    r5_2 = _mm_slli_epi32(r5, 1);
    r7 = _mm_mul_epu32(r[7].v, packednineteen.v);
    r8 = _mm_mul_epu32(r[8].v, packednineteen.v);
    r7_2 = _mm_slli_epi32(r7, 1);
    r9 = _mm_mul_epu32(r[9].v, packednineteen.v);
    r9_2 = _mm_slli_epi32(r9, 1);

    out[0].v = _mm_add_epi64(out[0].v, _mm_add_epi64(_mm_mul_epu32(r9_2, s[1].v), _mm_add_epi64(_mm_mul_epu32(r8, s[2].v), _mm_add_epi64(_mm_mul_epu32(r7_2, s[3].v), _mm_add_epi64(_mm_mul_epu32(r6, s[4].v), _mm_add_epi64(_mm_mul_epu32(r5_2, s[5].v), _mm_add_epi64(_mm_mul_epu32(r4, s[6].v), _mm_add_epi64(_mm_mul_epu32(r3_2, s[7].v), _mm_add_epi64(_mm_mul_epu32(r2, s[8].v), _mm_mul_epu32(r1_2, s[9].v))))))))));
    out[1].v = _mm_add_epi64(out[1].v, _mm_add_epi64(_mm_mul_epu32(r9 , s[2].v), _mm_add_epi64(_mm_mul_epu32(r8, s[3].v), _mm_add_epi64(_mm_mul_epu32(r7 , s[4].v), _mm_add_epi64(_mm_mul_epu32(r6, s[5].v), _mm_add_epi64(_mm_mul_epu32(r5 , s[6].v), _mm_add_epi64(_mm_mul_epu32(r4, s[7].v), _mm_add_epi64(_mm_mul_epu32(r3 , s[8].v), _mm_mul_epu32(r2, s[9].v)))))))));
    out[2].v = _mm_add_epi64(out[2].v, _mm_add_epi64(_mm_mul_epu32(r9_2, s[3].v), _mm_add_epi64(_mm_mul_epu32(r8, s[4].v), _mm_add_epi64(_mm_mul_epu32(r7_2, s[5].v), _mm_add_epi64(_mm_mul_epu32(r6, s[6].v), _mm_add_epi64(_mm_mul_epu32(r5_2, s[7].v), _mm_add_epi64(_mm_mul_epu32(r4, s[8].v), _mm_mul_epu32(r3_2, s[9].v))))))));
    out[3].v = _mm_add_epi64(out[3].v, _mm_add_epi64(_mm_mul_epu32(r9 , s[4].v), _mm_add_epi64(_mm_mul_epu32(r8, s[5].v), _mm_add_epi64(_mm_mul_epu32(r7 , s[6].v), _mm_add_epi64(_mm_mul_epu32(r6, s[7].v), _mm_add_epi64(_mm_mul_epu32(r5 , s[8].v), _mm_mul_epu32(r4, s[9].v)))))));
    out[4].v = _mm_add_epi64(out[4].v, _mm_add_epi64(_mm_mul_epu32(r9_2, s[5].v), _mm_add_epi64(_mm_mul_epu32(r8, s[6].v), _mm_add_epi64(_mm_mul_epu32(r7_2, s[7].v), _mm_add_epi64(_mm_mul_epu32(r6, s[8].v), _mm_mul_epu32(r5_2, s[9].v))))));
    out[5].v = _mm_add_epi64(out[5].v, _mm_add_epi64(_mm_mul_epu32(r9 , s[6].v), _mm_add_epi64(_mm_mul_epu32(r8, s[7].v), _mm_add_epi64(_mm_mul_epu32(r7 , s[8].v), _mm_mul_epu32(r6, s[9].v)))));
    out[6].v = _mm_add_epi64(out[6].v, _mm_add_epi64(_mm_mul_epu32(r9_2, s[7].v), _mm_add_epi64(_mm_mul_epu32(r8, s[8].v), _mm_mul_epu32(r7_2, s[9].v))));
    out[7].v = _mm_add_epi64(out[7].v, _mm_add_epi64(_mm_mul_epu32(r9 , s[8].v), _mm_mul_epu32(r8, s[9].v)));
    out[8].v = _mm_add_epi64(out[8].v, _mm_mul_epu32(r9_2, s[9].v));

    c1 = _mm_srli_epi64(out[0].v, 26); c2 = _mm_srli_epi64(out[4].v, 26); out[0].v = _mm_and_si128(out[0].v, packedmask26.v); out[4].v = _mm_and_si128(out[4].v, packedmask26.v); out[1].v = _mm_add_epi64(out[1].v, c1); out[5].v = _mm_add_epi64(out[5].v, c2);
    c1 = _mm_srli_epi64(out[1].v, 25); c2 = _mm_srli_epi64(out[5].v, 25); out[1].v = _mm_and_si128(out[1].v, packedmask25.v); out[5].v = _mm_and_si128(out[5].v, packedmask25.v); out[2].v = _mm_add_epi64(out[2].v, c1); out[6].v = _mm_add_epi64(out[6].v, c2);
    c1 = _mm_srli_epi64(out[2].v, 26); c2 = _mm_srli_epi64(out[6].v, 26); out[2].v = _mm_and_si128(out[2].v, packedmask26.v); out[6].v = _mm_and_si128(out[6].v, packedmask26.v); out[3].v = _mm_add_epi64(out[3].v, c1); out[7].v = _mm_add_epi64(out[7].v, c2);
    c1 = _mm_srli_epi64(out[3].v, 25); c2 = _mm_srli_epi64(out[7].v, 25); out[3].v = _mm_and_si128(out[3].v, packedmask25.v); out[7].v = _mm_and_si128(out[7].v, packedmask25.v); out[4].v = _mm_add_epi64(out[4].v, c1); out[8].v = _mm_add_epi64(out[8].v, c2);
    c2 = _mm_srli_epi64(out[8].v, 26); out[8].v = _mm_and_si128(out[8].v, packedmask26.v); out[9].v = _mm_add_epi64(out[9].v, c2);
    c2 = _mm_srli_epi64(out[9].v, 25); out[9].v = _mm_and_si128(out[9].v, packedmask25.v); out[0].v = _mm_add_epi64(out[0].v, _mm_mul_epu32(c2, packednineteen.v));
    c1 = _mm_srli_epi64(out[0].v, 26); c2 = _mm_srli_epi64(out[4].v, 26); out[0].v = _mm_and_si128(out[0].v, packedmask26.v); out[4].v = _mm_and_si128(out[4].v, packedmask26.v); out[1].v = _mm_add_epi64(out[1].v, c1); out[5].v = _mm_add_epi64(out[5].v, c2);
}
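
// out = r * s on single (untangled) field elements; same structure as
// the packed multiply, but one element is spread across the xmm lanes.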
void
curve25519_mul(bignum25519 out, const bignum25519 r, const bignum25519 s) {
    xmmi m01,m23,m45,m67,m89;
    xmmi m0123,m4567;
    xmmi s0123,s4567;
    xmmi s01,s23,s45,s67,s89;
    xmmi s12,s34,s56,s78,s9;
    xmmi r0,r1,r2,r3,r4,r5,r6,r7,r8,r9;
    xmmi r119,r219,r319,r419,r519,r619,r719,r819,r919;
    xmmi c1,c2,c3;
    s0123 = _mm_load_si128((xmmi*)s + 0);
    s01 = _mm_shuffle_epi32(s0123,_MM_SHUFFLE(3,1,2,0));
    s12 = _mm_shuffle_epi32(s0123, _MM_SHUFFLE(2,2,1,1));
    s23 = _mm_shuffle_epi32(s0123,_MM_SHUFFLE(3,3,2,2));
    s4567 = _mm_load_si128((xmmi*)s + 1);
    s34 = _mm_unpacklo_epi64(_mm_srli_si128(s0123,12),s4567);
    s45 = _mm_shuffle_epi32(s4567,_MM_SHUFFLE(3,1,2,0));
    s56 = _mm_shuffle_epi32(s4567, _MM_SHUFFLE(2,2,1,1));
    s67 = _mm_shuffle_epi32(s4567,_MM_SHUFFLE(3,3,2,2));
    s89 = _mm_load_si128((xmmi*)s + 2);
    s78 = _mm_unpacklo_epi64(_mm_srli_si128(s4567,12),s89);
    s89 = _mm_shuffle_epi32(s89,_MM_SHUFFLE(3,1,2,0));
    s9 = _mm_shuffle_epi32(s89, _MM_SHUFFLE(3,3,2,2));

    r0 = _mm_load_si128((xmmi*)r + 0);
    r1 = _mm_shuffle_epi32(r0, _MM_SHUFFLE(1,1,1,1));
    r1 = _mm_add_epi64(r1, _mm_and_si128(r1, sse2_top64bitmask.v));
    r2 = _mm_shuffle_epi32(r0, _MM_SHUFFLE(2,2,2,2));
    r3 = _mm_shuffle_epi32(r0, _MM_SHUFFLE(3,3,3,3));
    r3 = _mm_add_epi64(r3, _mm_and_si128(r3, sse2_top64bitmask.v));
    r0 = _mm_shuffle_epi32(r0, _MM_SHUFFLE(0,0,0,0));
    r4 = _mm_load_si128((xmmi*)r + 1);
    r5 = _mm_shuffle_epi32(r4, _MM_SHUFFLE(1,1,1,1));
    r5 = _mm_add_epi64(r5, _mm_and_si128(r5, sse2_top64bitmask.v));
    r6 = _mm_shuffle_epi32(r4, _MM_SHUFFLE(2,2,2,2));
    r7 = _mm_shuffle_epi32(r4, _MM_SHUFFLE(3,3,3,3));
    r7 = _mm_add_epi64(r7, _mm_and_si128(r7, sse2_top64bitmask.v));
    r4 = _mm_shuffle_epi32(r4, _MM_SHUFFLE(0,0,0,0));
    r8 = _mm_load_si128((xmmi*)r + 2);
    r9 = _mm_shuffle_epi32(r8, _MM_SHUFFLE(3,1,3,1));
    r9 = _mm_add_epi64(r9, _mm_and_si128(r9, sse2_top64bitmask.v));
    r8 = _mm_shuffle_epi32(r8, _MM_SHUFFLE(3,0,3,0));
    m01 = _mm_mul_epu32(r1,s01);
    m23 = _mm_mul_epu32(r1,s23);
    m45 = _mm_mul_epu32(r1,s45);
    m67 = _mm_mul_epu32(r1,s67);
    m23 = _mm_add_epi64(m23,_mm_mul_epu32(r3,s01));
    m45 = _mm_add_epi64(m45,_mm_mul_epu32(r3,s23));
    m67 = _mm_add_epi64(m67,_mm_mul_epu32(r3,s45));
    m89 = _mm_mul_epu32(r1,s89);
    m45 = _mm_add_epi64(m45,_mm_mul_epu32(r5,s01));
    m67 = _mm_add_epi64(m67,_mm_mul_epu32(r5,s23));
    m89 = _mm_add_epi64(m89,_mm_mul_epu32(r3,s67));
    m67 = _mm_add_epi64(m67,_mm_mul_epu32(r7,s01));
    m89 = _mm_add_epi64(m89,_mm_mul_epu32(r5,s45));
    m89 = _mm_add_epi64(m89,_mm_mul_epu32(r7,s23));
    m89 = _mm_add_epi64(m89,_mm_mul_epu32(r9,s01));

    m89 = _mm_unpackhi_epi64(m67,_mm_slli_si128(m89,8));
    m67 = _mm_unpackhi_epi64(m45,_mm_slli_si128(m67,8));
    m45 = _mm_unpackhi_epi64(m23,_mm_slli_si128(m45,8));
    m23 = _mm_unpackhi_epi64(m01,_mm_slli_si128(m23,8));
    m01 = _mm_unpackhi_epi64(_mm_setzero_si128(),_mm_slli_si128(m01,8));
    m01 = _mm_add_epi64(m01,_mm_mul_epu32(r0,s01));
    m23 = _mm_add_epi64(m23,_mm_mul_epu32(r0,s23));
    m45 = _mm_add_epi64(m45,_mm_mul_epu32(r0,s45));
    m67 = _mm_add_epi64(m67,_mm_mul_epu32(r0,s67));
    m23 = _mm_add_epi64(m23,_mm_mul_epu32(r2,s01));
    m45 = _mm_add_epi64(m45,_mm_mul_epu32(r2,s23));
    m67 = _mm_add_epi64(m67,_mm_mul_epu32(r4,s23));
    m89 = _mm_add_epi64(m89,_mm_mul_epu32(r0,s89));
    m45 = _mm_add_epi64(m45,_mm_mul_epu32(r4,s01));
    m67 = _mm_add_epi64(m67,_mm_mul_epu32(r2,s45));
    m89 = _mm_add_epi64(m89,_mm_mul_epu32(r2,s67));
    m67 = _mm_add_epi64(m67,_mm_mul_epu32(r6,s01));
    m89 = _mm_add_epi64(m89,_mm_mul_epu32(r4,s45));
    m89 = _mm_add_epi64(m89,_mm_mul_epu32(r6,s23));
    m89 = _mm_add_epi64(m89,_mm_mul_epu32(r8,s01));

    r219 = _mm_mul_epu32(r2, packednineteen.v);
    r419 = _mm_mul_epu32(r4, packednineteen.v);
    r619 = _mm_mul_epu32(r6, packednineteen.v);
    r819 = _mm_mul_epu32(r8, packednineteen.v);
    r119 = _mm_shuffle_epi32(r1,_MM_SHUFFLE(0,0,2,2)); r119 = _mm_mul_epu32(r119, packednineteen.v);
    r319 = _mm_shuffle_epi32(r3,_MM_SHUFFLE(0,0,2,2)); r319 = _mm_mul_epu32(r319, packednineteen.v);
    r519 = _mm_shuffle_epi32(r5,_MM_SHUFFLE(0,0,2,2)); r519 = _mm_mul_epu32(r519, packednineteen.v);
    r719 = _mm_shuffle_epi32(r7,_MM_SHUFFLE(0,0,2,2)); r719 = _mm_mul_epu32(r719, packednineteen.v);
    r919 = _mm_shuffle_epi32(r9,_MM_SHUFFLE(0,0,2,2)); r919 = _mm_mul_epu32(r919, packednineteen.v);
    m01 = _mm_add_epi64(m01,_mm_mul_epu32(r919,s12));
    m23 = _mm_add_epi64(m23,_mm_mul_epu32(r919,s34));
    m45 = _mm_add_epi64(m45,_mm_mul_epu32(r919,s56));
    m67 = _mm_add_epi64(m67,_mm_mul_epu32(r919,s78));
    m01 = _mm_add_epi64(m01,_mm_mul_epu32(r719,s34));
    m23 = _mm_add_epi64(m23,_mm_mul_epu32(r719,s56));
    m45 = _mm_add_epi64(m45,_mm_mul_epu32(r719,s78));
    m67 = _mm_add_epi64(m67,_mm_mul_epu32(r719,s9));
    m01 = _mm_add_epi64(m01,_mm_mul_epu32(r519,s56));
    m23 = _mm_add_epi64(m23,_mm_mul_epu32(r519,s78));
    m45 = _mm_add_epi64(m45,_mm_mul_epu32(r519,s9));
    m67 = _mm_add_epi64(m67,_mm_mul_epu32(r819,s89));
    m01 = _mm_add_epi64(m01,_mm_mul_epu32(r319,s78));
    m23 = _mm_add_epi64(m23,_mm_mul_epu32(r319,s9));
    m45 = _mm_add_epi64(m45,_mm_mul_epu32(r619,s89));
    m89 = _mm_add_epi64(m89,_mm_mul_epu32(r919,s9));
    m01 = _mm_add_epi64(m01,_mm_mul_epu32(r819,s23));
    m23 = _mm_add_epi64(m23,_mm_mul_epu32(r819,s45));
    m45 = _mm_add_epi64(m45,_mm_mul_epu32(r819,s67));
    m01 = _mm_add_epi64(m01,_mm_mul_epu32(r619,s45));
    m23 = _mm_add_epi64(m23,_mm_mul_epu32(r619,s67));
    m01 = _mm_add_epi64(m01,_mm_mul_epu32(r419,s67));
    m23 = _mm_add_epi64(m23,_mm_mul_epu32(r419,s89));
    m01 = _mm_add_epi64(m01,_mm_mul_epu32(r219,s89));
    m01 = _mm_add_epi64(m01,_mm_mul_epu32(r119,s9));
    r0 = _mm_unpacklo_epi64(m01, m45);
    r1 = _mm_unpackhi_epi64(m01, m45);
    r2 = _mm_unpacklo_epi64(m23, m67);
    r3 = _mm_unpackhi_epi64(m23, m67);
    r4 = _mm_unpacklo_epi64(m89, m89);
    r5 = _mm_unpackhi_epi64(m89, m89);

    c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);
    c1 = _mm_srli_epi64(r1, 25); c2 = _mm_srli_epi64(r3, 25); r1 = _mm_and_si128(r1, packedmask25.v); r3 = _mm_and_si128(r3, packedmask25.v); r2 = _mm_add_epi64(r2, c1); r4 = _mm_add_epi64(r4, c2); c3 = _mm_slli_si128(c2, 8);
    c1 = _mm_srli_epi64(r4, 26); r4 = _mm_and_si128(r4, packedmask26.v); r5 = _mm_add_epi64(r5, c1);
    c1 = _mm_srli_epi64(r5, 25); r5 = _mm_and_si128(r5, packedmask25.v); r0 = _mm_add_epi64(r0, _mm_unpackhi_epi64(_mm_mul_epu32(c1, packednineteen.v), c3));
    c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);

    m0123 = _mm_unpacklo_epi32(r0, r1);
    m4567 = _mm_unpackhi_epi32(r0, r1);
    m0123 = _mm_unpacklo_epi64(m0123, _mm_unpacklo_epi32(r2, r3));
    m4567 = _mm_unpacklo_epi64(m4567, _mm_unpackhi_epi32(r2, r3));
    m89 = _mm_unpackhi_epi32(r4, r5);

    _mm_store_si128((xmmi*)out + 0, m0123);
    _mm_store_si128((xmmi*)out + 1, m4567);
    _mm_store_si128((xmmi*)out + 2, m89);
}
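
// Precomputed form of a multiplicand: the broadcast limb registers and
// their 19-multiples, so repeated multiplications by a fixed r (here,
// the other party's x-coordinate) skip the per-call setup.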
typedef struct bignum25519mulprecomp_t {
    xmmi r0,r1,r2,r3,r4,r5,r6,r7,r8,r9;
    xmmi r119,r219,r319,r419,r519,r619,r719,r819,r919;
} bignum25519mulprecomp;
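
// Fill a bignum25519mulprecomp from r.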
void
curve25519_mul_precompute(bignum25519mulprecomp *pre, const bignum25519 r) {
    pre->r0 = _mm_load_si128((xmmi*)r + 0);
    pre->r1 = _mm_shuffle_epi32(pre->r0, _MM_SHUFFLE(1,1,1,1));
    pre->r1 = _mm_add_epi64(pre->r1, _mm_and_si128(pre->r1, sse2_top64bitmask.v));
    pre->r2 = _mm_shuffle_epi32(pre->r0, _MM_SHUFFLE(2,2,2,2));
    pre->r3 = _mm_shuffle_epi32(pre->r0, _MM_SHUFFLE(3,3,3,3));
    pre->r3 = _mm_add_epi64(pre->r3, _mm_and_si128(pre->r3, sse2_top64bitmask.v));
    pre->r0 = _mm_shuffle_epi32(pre->r0, _MM_SHUFFLE(0,0,0,0));
    pre->r4 = _mm_load_si128((xmmi*)r + 1);
    pre->r5 = _mm_shuffle_epi32(pre->r4, _MM_SHUFFLE(1,1,1,1));
    pre->r5 = _mm_add_epi64(pre->r5, _mm_and_si128(pre->r5, sse2_top64bitmask.v));
    pre->r6 = _mm_shuffle_epi32(pre->r4, _MM_SHUFFLE(2,2,2,2));
    pre->r7 = _mm_shuffle_epi32(pre->r4, _MM_SHUFFLE(3,3,3,3));
    pre->r7 = _mm_add_epi64(pre->r7, _mm_and_si128(pre->r7, sse2_top64bitmask.v));
    pre->r4 = _mm_shuffle_epi32(pre->r4, _MM_SHUFFLE(0,0,0,0));
    pre->r8 = _mm_load_si128((xmmi*)r + 2);
    pre->r9 = _mm_shuffle_epi32(pre->r8, _MM_SHUFFLE(3,1,3,1));
    pre->r9 = _mm_add_epi64(pre->r9, _mm_and_si128(pre->r9, sse2_top64bitmask.v));
    pre->r8 = _mm_shuffle_epi32(pre->r8, _MM_SHUFFLE(3,0,3,0));

    pre->r219 = _mm_mul_epu32(pre->r2, packednineteen.v);
    pre->r419 = _mm_mul_epu32(pre->r4, packednineteen.v);
    pre->r619 = _mm_mul_epu32(pre->r6, packednineteen.v);
    pre->r819 = _mm_mul_epu32(pre->r8, packednineteen.v);
    pre->r119 = _mm_shuffle_epi32(pre->r1,_MM_SHUFFLE(0,0,2,2)); pre->r119 = _mm_mul_epu32(pre->r119, packednineteen.v);
    pre->r319 = _mm_shuffle_epi32(pre->r3,_MM_SHUFFLE(0,0,2,2)); pre->r319 = _mm_mul_epu32(pre->r319, packednineteen.v);
    pre->r519 = _mm_shuffle_epi32(pre->r5,_MM_SHUFFLE(0,0,2,2)); pre->r519 = _mm_mul_epu32(pre->r519, packednineteen.v);
    pre->r719 = _mm_shuffle_epi32(pre->r7,_MM_SHUFFLE(0,0,2,2)); pre->r719 = _mm_mul_epu32(pre->r719, packednineteen.v);
    pre->r919 = _mm_shuffle_epi32(pre->r9,_MM_SHUFFLE(0,0,2,2)); pre->r919 = _mm_mul_epu32(pre->r919, packednineteen.v);
}
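
// out = r * s, where r was prepared by curve25519_mul_precompute.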
void
curve25519_mul_precomputed(bignum25519 out, const bignum25519 s, const bignum25519mulprecomp *r) {
    xmmi m01,m23,m45,m67,m89;
    xmmi m0123,m4567;
    xmmi s0123,s4567;
    xmmi s01,s23,s45,s67,s89;
    xmmi s12,s34,s56,s78,s9;
    xmmi r0,r1,r2,r3,r4,r5;
    xmmi c1,c2,c3;
    s0123 = _mm_load_si128((xmmi*)s + 0);
    s01 = _mm_shuffle_epi32(s0123,_MM_SHUFFLE(3,1,2,0));
    s12 = _mm_shuffle_epi32(s0123, _MM_SHUFFLE(2,2,1,1));
    s23 = _mm_shuffle_epi32(s0123,_MM_SHUFFLE(3,3,2,2));
    s4567 = _mm_load_si128((xmmi*)s + 1);
    s34 = _mm_unpacklo_epi64(_mm_srli_si128(s0123,12),s4567);
    s45 = _mm_shuffle_epi32(s4567,_MM_SHUFFLE(3,1,2,0));
    s56 = _mm_shuffle_epi32(s4567, _MM_SHUFFLE(2,2,1,1));
    s67 = _mm_shuffle_epi32(s4567,_MM_SHUFFLE(3,3,2,2));
    s89 = _mm_load_si128((xmmi*)s + 2);
    s78 = _mm_unpacklo_epi64(_mm_srli_si128(s4567,12),s89);
    s89 = _mm_shuffle_epi32(s89,_MM_SHUFFLE(3,1,2,0));
    s9 = _mm_shuffle_epi32(s89, _MM_SHUFFLE(3,3,2,2));

    m01 = _mm_mul_epu32(r->r1,s01);
    m23 = _mm_mul_epu32(r->r1,s23);
    m45 = _mm_mul_epu32(r->r1,s45);
    m67 = _mm_mul_epu32(r->r1,s67);
    m23 = _mm_add_epi64(m23,_mm_mul_epu32(r->r3,s01));
    m45 = _mm_add_epi64(m45,_mm_mul_epu32(r->r3,s23));
    m67 = _mm_add_epi64(m67,_mm_mul_epu32(r->r3,s45));
    m89 = _mm_mul_epu32(r->r1,s89);
    m45 = _mm_add_epi64(m45,_mm_mul_epu32(r->r5,s01));
    m67 = _mm_add_epi64(m67,_mm_mul_epu32(r->r5,s23));
    m89 = _mm_add_epi64(m89,_mm_mul_epu32(r->r3,s67));
    m67 = _mm_add_epi64(m67,_mm_mul_epu32(r->r7,s01));
    m89 = _mm_add_epi64(m89,_mm_mul_epu32(r->r5,s45));
    m89 = _mm_add_epi64(m89,_mm_mul_epu32(r->r7,s23));
    m89 = _mm_add_epi64(m89,_mm_mul_epu32(r->r9,s01));

    m89 = _mm_unpackhi_epi64(m67,_mm_slli_si128(m89,8));
    m67 = _mm_unpackhi_epi64(m45,_mm_slli_si128(m67,8));
    m45 = _mm_unpackhi_epi64(m23,_mm_slli_si128(m45,8));
    m23 = _mm_unpackhi_epi64(m01,_mm_slli_si128(m23,8));
    m01 = _mm_unpackhi_epi64(_mm_setzero_si128(),_mm_slli_si128(m01,8));
    m01 = _mm_add_epi64(m01,_mm_mul_epu32(r->r0,s01));
    m23 = _mm_add_epi64(m23,_mm_mul_epu32(r->r0,s23));
    m45 = _mm_add_epi64(m45,_mm_mul_epu32(r->r0,s45));
    m67 = _mm_add_epi64(m67,_mm_mul_epu32(r->r0,s67));
    m23 = _mm_add_epi64(m23,_mm_mul_epu32(r->r2,s01));
    m45 = _mm_add_epi64(m45,_mm_mul_epu32(r->r2,s23));
    m67 = _mm_add_epi64(m67,_mm_mul_epu32(r->r4,s23));
    m89 = _mm_add_epi64(m89,_mm_mul_epu32(r->r0,s89));
    m45 = _mm_add_epi64(m45,_mm_mul_epu32(r->r4,s01));
    m67 = _mm_add_epi64(m67,_mm_mul_epu32(r->r2,s45));
    m89 = _mm_add_epi64(m89,_mm_mul_epu32(r->r2,s67));
    m67 = _mm_add_epi64(m67,_mm_mul_epu32(r->r6,s01));
    m89 = _mm_add_epi64(m89,_mm_mul_epu32(r->r4,s45));
    m89 = _mm_add_epi64(m89,_mm_mul_epu32(r->r6,s23));
    m89 = _mm_add_epi64(m89,_mm_mul_epu32(r->r8,s01));
    m01 = _mm_add_epi64(m01,_mm_mul_epu32(r->r919,s12));
    m23 = _mm_add_epi64(m23,_mm_mul_epu32(r->r919,s34));
    m45 = _mm_add_epi64(m45,_mm_mul_epu32(r->r919,s56));
    m67 = _mm_add_epi64(m67,_mm_mul_epu32(r->r919,s78));
    m01 = _mm_add_epi64(m01,_mm_mul_epu32(r->r719,s34));
    m23 = _mm_add_epi64(m23,_mm_mul_epu32(r->r719,s56));
    m45 = _mm_add_epi64(m45,_mm_mul_epu32(r->r719,s78));
    m67 = _mm_add_epi64(m67,_mm_mul_epu32(r->r719,s9));
    m01 = _mm_add_epi64(m01,_mm_mul_epu32(r->r519,s56));
    m23 = _mm_add_epi64(m23,_mm_mul_epu32(r->r519,s78));
    m45 = _mm_add_epi64(m45,_mm_mul_epu32(r->r519,s9));
    m67 = _mm_add_epi64(m67,_mm_mul_epu32(r->r819,s89));
    m01 = _mm_add_epi64(m01,_mm_mul_epu32(r->r319,s78));
    m23 = _mm_add_epi64(m23,_mm_mul_epu32(r->r319,s9));
    m45 = _mm_add_epi64(m45,_mm_mul_epu32(r->r619,s89));
    m89 = _mm_add_epi64(m89,_mm_mul_epu32(r->r919,s9));
    m01 = _mm_add_epi64(m01,_mm_mul_epu32(r->r819,s23));
    m23 = _mm_add_epi64(m23,_mm_mul_epu32(r->r819,s45));
    m45 = _mm_add_epi64(m45,_mm_mul_epu32(r->r819,s67));
    m01 = _mm_add_epi64(m01,_mm_mul_epu32(r->r619,s45));
    m23 = _mm_add_epi64(m23,_mm_mul_epu32(r->r619,s67));
    m01 = _mm_add_epi64(m01,_mm_mul_epu32(r->r419,s67));
    m23 = _mm_add_epi64(m23,_mm_mul_epu32(r->r419,s89));
    m01 = _mm_add_epi64(m01,_mm_mul_epu32(r->r219,s89));
    m01 = _mm_add_epi64(m01,_mm_mul_epu32(r->r119,s9));
    r0 = _mm_unpacklo_epi64(m01, m45);
    r1 = _mm_unpackhi_epi64(m01, m45);
    r2 = _mm_unpacklo_epi64(m23, m67);
    r3 = _mm_unpackhi_epi64(m23, m67);
    r4 = _mm_unpacklo_epi64(m89, m89);
    r5 = _mm_unpackhi_epi64(m89, m89);

    c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);
    c1 = _mm_srli_epi64(r1, 25); c2 = _mm_srli_epi64(r3, 25); r1 = _mm_and_si128(r1, packedmask25.v); r3 = _mm_and_si128(r3, packedmask25.v); r2 = _mm_add_epi64(r2, c1); r4 = _mm_add_epi64(r4, c2); c3 = _mm_slli_si128(c2, 8);
    c1 = _mm_srli_epi64(r4, 26); r4 = _mm_and_si128(r4, packedmask26.v); r5 = _mm_add_epi64(r5, c1);
    c1 = _mm_srli_epi64(r5, 25); r5 = _mm_and_si128(r5, packedmask25.v); r0 = _mm_add_epi64(r0, _mm_unpackhi_epi64(_mm_mul_epu32(c1, packednineteen.v), c3));
    c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);

    m0123 = _mm_unpacklo_epi32(r0, r1);
    m4567 = _mm_unpackhi_epi32(r0, r1);
    m0123 = _mm_unpacklo_epi64(m0123, _mm_unpacklo_epi32(r2, r3));
    m4567 = _mm_unpacklo_epi64(m4567, _mm_unpackhi_epi32(r2, r3));
    m89 = _mm_unpackhi_epi32(r4, r5);

    _mm_store_si128((xmmi*)out + 0, m0123);
    _mm_store_si128((xmmi*)out + 1, m4567);
    _mm_store_si128((xmmi*)out + 2, m89);
}
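
// r = in^(2^count): repeated squaring; the intermediate limbs stay in
// registers across iterations. curve25519_square(r,x) below is the
// single-squaring case.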
#define curve25519_square(r,x) curve25519_square_times(r,x,1)

void
curve25519_square_times(bignum25519 r, const bignum25519 in, int count) {
    xmmi m01,m23,m45,m67,m89;
    xmmi r0,r1,r2,r3,r4,r5,r6,r7,r8,r9;
    xmmi r0a,r1a,r2a,r3a,r7a,r9a;
    xmmi r0123,r4567;
    xmmi r01,r23,r45,r67,r6x,r89,r8x;
    xmmi r12,r34,r56,r78,r9x;
    xmmi r5619;
    xmmi c1,c2,c3;
    r0123 = _mm_load_si128((xmmi*)in + 0);
    r01 = _mm_shuffle_epi32(r0123,_MM_SHUFFLE(3,1,2,0));
    r23 = _mm_shuffle_epi32(r0123,_MM_SHUFFLE(3,3,2,2));
    r4567 = _mm_load_si128((xmmi*)in + 1);
    r45 = _mm_shuffle_epi32(r4567,_MM_SHUFFLE(3,1,2,0));
    r67 = _mm_shuffle_epi32(r4567,_MM_SHUFFLE(3,3,2,2));
    r89 = _mm_load_si128((xmmi*)in + 2);
    r89 = _mm_shuffle_epi32(r89,_MM_SHUFFLE(3,1,2,0));

    do {
        r12 = _mm_unpackhi_epi64(r01, _mm_slli_si128(r23, 8));
        r0 = _mm_shuffle_epi32(r01, _MM_SHUFFLE(0,0,0,0));
        r0 = _mm_add_epi64(r0, _mm_and_si128(r0, sse2_top64bitmask.v));
        r0a = _mm_shuffle_epi32(r0,_MM_SHUFFLE(3,2,1,2));
        r1 = _mm_shuffle_epi32(r01, _MM_SHUFFLE(2,2,2,2));
        r2 = _mm_shuffle_epi32(r23, _MM_SHUFFLE(0,0,0,0));
        r2 = _mm_add_epi64(r2, _mm_and_si128(r2, sse2_top64bitmask.v));
        r2a = _mm_shuffle_epi32(r2,_MM_SHUFFLE(3,2,1,2));
        r3 = _mm_shuffle_epi32(r23, _MM_SHUFFLE(2,2,2,2));
        r34 = _mm_unpackhi_epi64(r23, _mm_slli_si128(r45, 8));
        r4 = _mm_shuffle_epi32(r45, _MM_SHUFFLE(0,0,0,0));
        r4 = _mm_add_epi64(r4, _mm_and_si128(r4, sse2_top64bitmask.v));
        r56 = _mm_unpackhi_epi64(r45, _mm_slli_si128(r67, 8));
        r5619 = _mm_mul_epu32(r56, packednineteen.v);
        r5 = _mm_shuffle_epi32(r5619, _MM_SHUFFLE(1,1,1,0));
        r6 = _mm_shuffle_epi32(r5619, _MM_SHUFFLE(3,2,3,2));
        r78 = _mm_unpackhi_epi64(r67, _mm_slli_si128(r89, 8));
        r6x = _mm_unpacklo_epi64(r67, _mm_setzero_si128());
        r7 = _mm_shuffle_epi32(r67, _MM_SHUFFLE(2,2,2,2));
        r7 = _mm_mul_epu32(r7, packed3819.v);
        r7a = _mm_shuffle_epi32(r7, _MM_SHUFFLE(3,3,3,2));
        r8x = _mm_unpacklo_epi64(r89, _mm_setzero_si128());
        r8 = _mm_shuffle_epi32(r89, _MM_SHUFFLE(0,0,0,0));
        r8 = _mm_mul_epu32(r8, packednineteen.v);
        r9 = _mm_shuffle_epi32(r89, _MM_SHUFFLE(2,2,2,2));
        r9x = _mm_slli_epi32(_mm_shuffle_epi32(r89, _MM_SHUFFLE(3,3,3,2)), 1);
        r9 = _mm_mul_epu32(r9, packed3819.v);
        r9a = _mm_shuffle_epi32(r9, _MM_SHUFFLE(2,2,2,2));
        m01 = _mm_mul_epu32(r01, r0);
        m23 = _mm_mul_epu32(r23, r0a);
        m45 = _mm_mul_epu32(r45, r0a);
        m45 = _mm_add_epi64(m45, _mm_mul_epu32(r23, r2));
        r23 = _mm_slli_epi32(r23, 1);
        m67 = _mm_mul_epu32(r67, r0a);
        m67 = _mm_add_epi64(m67, _mm_mul_epu32(r45, r2a));
        m89 = _mm_mul_epu32(r89, r0a);
        m89 = _mm_add_epi64(m89, _mm_mul_epu32(r67, r2a));
        r67 = _mm_slli_epi32(r67, 1);
        m89 = _mm_add_epi64(m89, _mm_mul_epu32(r45, r4));
        r45 = _mm_slli_epi32(r45, 1);

        r1 = _mm_slli_epi32(r1, 1);
        r3 = _mm_slli_epi32(r3, 1);
        r1a = _mm_add_epi64(r1, _mm_and_si128(r1, sse2_bot64bitmask.v));
        r3a = _mm_add_epi64(r3, _mm_and_si128(r3, sse2_bot64bitmask.v));

        m23 = _mm_add_epi64(m23, _mm_mul_epu32(r12, r1));
        m45 = _mm_add_epi64(m45, _mm_mul_epu32(r34, r1a));
        m67 = _mm_add_epi64(m67, _mm_mul_epu32(r56, r1a));
        m67 = _mm_add_epi64(m67, _mm_mul_epu32(r34, r3));
        r34 = _mm_slli_epi32(r34, 1);
        m89 = _mm_add_epi64(m89, _mm_mul_epu32(r78, r1a));
        r78 = _mm_slli_epi32(r78, 1);
        m89 = _mm_add_epi64(m89, _mm_mul_epu32(r56, r3a));
        r56 = _mm_slli_epi32(r56, 1);

        m01 = _mm_add_epi64(m01, _mm_mul_epu32(_mm_slli_epi32(r12, 1), r9));
        m01 = _mm_add_epi64(m01, _mm_mul_epu32(r34, r7));
        m23 = _mm_add_epi64(m23, _mm_mul_epu32(r34, r9));
        m01 = _mm_add_epi64(m01, _mm_mul_epu32(r56, r5));
        m23 = _mm_add_epi64(m23, _mm_mul_epu32(r56, r7));
        m45 = _mm_add_epi64(m45, _mm_mul_epu32(r56, r9));
        m01 = _mm_add_epi64(m01, _mm_mul_epu32(r23, r8));
        m01 = _mm_add_epi64(m01, _mm_mul_epu32(r45, r6));
        m23 = _mm_add_epi64(m23, _mm_mul_epu32(r45, r8));
        m23 = _mm_add_epi64(m23, _mm_mul_epu32(r6x, r6));
        m45 = _mm_add_epi64(m45, _mm_mul_epu32(r78, r7a));
        m67 = _mm_add_epi64(m67, _mm_mul_epu32(r78, r9));
        m45 = _mm_add_epi64(m45, _mm_mul_epu32(r67, r8));
        m67 = _mm_add_epi64(m67, _mm_mul_epu32(r8x, r8));
        m89 = _mm_add_epi64(m89, _mm_mul_epu32(r9x, r9a));
        r0 = _mm_unpacklo_epi64(m01, m45);
        r1 = _mm_unpackhi_epi64(m01, m45);
        r2 = _mm_unpacklo_epi64(m23, m67);
        r3 = _mm_unpackhi_epi64(m23, m67);
        r4 = _mm_unpacklo_epi64(m89, m89);
        r5 = _mm_unpackhi_epi64(m89, m89);

        c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);
        c1 = _mm_srli_epi64(r1, 25); c2 = _mm_srli_epi64(r3, 25); r1 = _mm_and_si128(r1, packedmask25.v); r3 = _mm_and_si128(r3, packedmask25.v); r2 = _mm_add_epi64(r2, c1); r4 = _mm_add_epi64(r4, c2); c3 = _mm_slli_si128(c2, 8);
        c1 = _mm_srli_epi64(r4, 26); r4 = _mm_and_si128(r4, packedmask26.v); r5 = _mm_add_epi64(r5, c1);
        c1 = _mm_srli_epi64(r5, 25); r5 = _mm_and_si128(r5, packedmask25.v); r0 = _mm_add_epi64(r0, _mm_unpackhi_epi64(_mm_mul_epu32(c1, packednineteen.v), c3));
        c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);
        r01 = _mm_unpacklo_epi64(r0, r1);
        r45 = _mm_unpackhi_epi64(r0, r1);
        r23 = _mm_unpacklo_epi64(r2, r3);
        r67 = _mm_unpackhi_epi64(r2, r3);
        r89 = _mm_unpackhi_epi64(r4, r5);

        r0123 = _mm_shuffle_epi32(r23, _MM_SHUFFLE(2,0,3,3));
        r4567 = _mm_shuffle_epi32(r67, _MM_SHUFFLE(2,0,3,3));
        r0123 = _mm_or_si128(r0123, _mm_shuffle_epi32(r01, _MM_SHUFFLE(3,3,2,0)));
        r4567 = _mm_or_si128(r4567, _mm_shuffle_epi32(r45, _MM_SHUFFLE(3,3,2,0)));
        r89 = _mm_shuffle_epi32(r89, _MM_SHUFFLE(3,3,2,0));
    } while (--count);

    _mm_store_si128((xmmi*)r + 0, r0123);
    _mm_store_si128((xmmi*)r + 1, r4567);
    _mm_store_si128((xmmi*)r + 2, r89);
}
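
// Squaring of a packed (tangled) element; the symmetry of squaring cuts
// the partial products roughly in half compared to a general multiply.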
void
curve25519_square_packed64(packedelem64 *out, const packedelem64 *r) {
    xmmi r0,r1,r2,r3;
    xmmi r1_2,r3_2,r4_2,r5_2,r6_2,r7_2;
    xmmi d5,d6,d7,d8,d9;
    xmmi c1,c2;

    r0 = r[0].v;
    r1 = r[1].v;
    r2 = r[2].v;
    r3 = r[3].v;
    out[0].v = _mm_mul_epu32(r0, r0);
    r0 = _mm_slli_epi32(r0, 1);
    out[1].v = _mm_mul_epu32(r0, r1);
    r1_2 = _mm_slli_epi32(r1, 1);
    out[2].v = _mm_add_epi64(_mm_mul_epu32(r0, r2 ), _mm_mul_epu32(r1, r1_2));
    out[3].v = _mm_add_epi64(_mm_mul_epu32(r0, r3 ), _mm_mul_epu32(r1, r2 ));
    r3_2 = _mm_slli_epi32(r3, 1);
    out[4].v = _mm_add_epi64(_mm_mul_epu32(r0, r[4].v), _mm_add_epi64(_mm_mul_epu32(r1, r3_2 ), _mm_mul_epu32(r2, r2)));
    r2 = _mm_slli_epi32(r2, 1);
    out[5].v = _mm_add_epi64(_mm_mul_epu32(r0, r[5].v), _mm_add_epi64(_mm_mul_epu32(r1, r[4].v), _mm_mul_epu32(r2, r3)));
    r5_2 = _mm_slli_epi32(r[5].v, 1);
    out[6].v = _mm_add_epi64(_mm_mul_epu32(r0, r[6].v), _mm_add_epi64(_mm_mul_epu32(r1, r5_2 ), _mm_add_epi64(_mm_mul_epu32(r2, r[4].v), _mm_mul_epu32(r3, r3_2 ))));
    out[7].v = _mm_add_epi64(_mm_mul_epu32(r0, r[7].v), _mm_add_epi64(_mm_mul_epu32(r1, r[6].v), _mm_add_epi64(_mm_mul_epu32(r2, r[5].v), _mm_mul_epu32(r3, r[4].v))));
    r7_2 = _mm_slli_epi32(r[7].v, 1);
    out[8].v = _mm_add_epi64(_mm_mul_epu32(r0, r[8].v), _mm_add_epi64(_mm_mul_epu32(r1, r7_2 ), _mm_add_epi64(_mm_mul_epu32(r2, r[6].v), _mm_add_epi64(_mm_mul_epu32(r3, r5_2 ), _mm_mul_epu32(r[4].v, r[4].v)))));
    out[9].v = _mm_add_epi64(_mm_mul_epu32(r0, r[9].v), _mm_add_epi64(_mm_mul_epu32(r1, r[8].v), _mm_add_epi64(_mm_mul_epu32(r2, r[7].v), _mm_add_epi64(_mm_mul_epu32(r3, r[6].v), _mm_mul_epu32(r[4].v, r5_2 )))));
    d5 = _mm_mul_epu32(r[5].v, packedthirtyeight.v);
    d6 = _mm_mul_epu32(r[6].v, packednineteen.v);
    d7 = _mm_mul_epu32(r[7].v, packedthirtyeight.v);
    d8 = _mm_mul_epu32(r[8].v, packednineteen.v);
    d9 = _mm_mul_epu32(r[9].v, packedthirtyeight.v);

    r4_2 = _mm_slli_epi32(r[4].v, 1);
    r6_2 = _mm_slli_epi32(r[6].v, 1);
    out[0].v = _mm_add_epi64(out[0].v, _mm_add_epi64(_mm_mul_epu32(d9, r1 ), _mm_add_epi64(_mm_mul_epu32(d8, r2 ), _mm_add_epi64(_mm_mul_epu32(d7, r3 ), _mm_add_epi64(_mm_mul_epu32(d6, r4_2), _mm_mul_epu32(d5, r[5].v))))));
    out[1].v = _mm_add_epi64(out[1].v, _mm_add_epi64(_mm_mul_epu32(d9, _mm_srli_epi32(r2, 1)), _mm_add_epi64(_mm_mul_epu32(d8, r3 ), _mm_add_epi64(_mm_mul_epu32(d7, r[4].v), _mm_mul_epu32(d6, r5_2 )))));
    out[2].v = _mm_add_epi64(out[2].v, _mm_add_epi64(_mm_mul_epu32(d9, r3 ), _mm_add_epi64(_mm_mul_epu32(d8, r4_2), _mm_add_epi64(_mm_mul_epu32(d7, r5_2 ), _mm_mul_epu32(d6, r[6].v)))));
    out[3].v = _mm_add_epi64(out[3].v, _mm_add_epi64(_mm_mul_epu32(d9, r[4].v ), _mm_add_epi64(_mm_mul_epu32(d8, r5_2), _mm_mul_epu32(d7, r[6].v))));
    out[4].v = _mm_add_epi64(out[4].v, _mm_add_epi64(_mm_mul_epu32(d9, r5_2 ), _mm_add_epi64(_mm_mul_epu32(d8, r6_2), _mm_mul_epu32(d7, r[7].v))));
    out[5].v = _mm_add_epi64(out[5].v, _mm_add_epi64(_mm_mul_epu32(d9, r[6].v ), _mm_mul_epu32(d8, r7_2 )));
    out[6].v = _mm_add_epi64(out[6].v, _mm_add_epi64(_mm_mul_epu32(d9, r7_2 ), _mm_mul_epu32(d8, r[8].v)));
    out[7].v = _mm_add_epi64(out[7].v, _mm_mul_epu32(d9, r[8].v));
    out[8].v = _mm_add_epi64(out[8].v, _mm_mul_epu32(d9, r[9].v));
    c1 = _mm_srli_epi64(out[0].v, 26); c2 = _mm_srli_epi64(out[4].v, 26); out[0].v = _mm_and_si128(out[0].v, packedmask26.v); out[4].v = _mm_and_si128(out[4].v, packedmask26.v); out[1].v = _mm_add_epi64(out[1].v, c1); out[5].v = _mm_add_epi64(out[5].v, c2);
    c1 = _mm_srli_epi64(out[1].v, 25); c2 = _mm_srli_epi64(out[5].v, 25); out[1].v = _mm_and_si128(out[1].v, packedmask25.v); out[5].v = _mm_and_si128(out[5].v, packedmask25.v); out[2].v = _mm_add_epi64(out[2].v, c1); out[6].v = _mm_add_epi64(out[6].v, c2);
    c1 = _mm_srli_epi64(out[2].v, 26); c2 = _mm_srli_epi64(out[6].v, 26); out[2].v = _mm_and_si128(out[2].v, packedmask26.v); out[6].v = _mm_and_si128(out[6].v, packedmask26.v); out[3].v = _mm_add_epi64(out[3].v, c1); out[7].v = _mm_add_epi64(out[7].v, c2);
    c1 = _mm_srli_epi64(out[3].v, 25); c2 = _mm_srli_epi64(out[7].v, 25); out[3].v = _mm_and_si128(out[3].v, packedmask25.v); out[7].v = _mm_and_si128(out[7].v, packedmask25.v); out[4].v = _mm_add_epi64(out[4].v, c1); out[8].v = _mm_add_epi64(out[8].v, c2);
    c2 = _mm_srli_epi64(out[8].v, 26); out[8].v = _mm_and_si128(out[8].v, packedmask26.v); out[9].v = _mm_add_epi64(out[9].v, c2);
    c2 = _mm_srli_epi64(out[9].v, 25); out[9].v = _mm_and_si128(out[9].v, packedmask25.v); out[0].v = _mm_add_epi64(out[0].v, _mm_mul_epu32(c2, packednineteen.v));
    c1 = _mm_srli_epi64(out[0].v, 26); c2 = _mm_srli_epi64(out[4].v, 26); out[0].v = _mm_and_si128(out[0].v, packedmask26.v); out[4].v = _mm_and_si128(out[4].v, packedmask26.v); out[1].v = _mm_add_epi64(out[1].v, c1); out[5].v = _mm_add_epi64(out[5].v, c2);
}
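
// Build the packed operands for the next ladder multiply by duplicating
// the 32-bit lanes of pqx and pqz into 64-bit lanes.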
void
curve25519_make_nqpq(packedelem64 *primex, packedelem64 *primez, const packedelem32 *pqx, const packedelem32 *pqz) {
    primex[0].v = _mm_shuffle_epi32(pqx[0].v, _MM_SHUFFLE(1,1,0,0));
    primex[1].v = _mm_shuffle_epi32(pqx[0].v, _MM_SHUFFLE(3,3,2,2));
    primex[2].v = _mm_shuffle_epi32(pqx[1].v, _MM_SHUFFLE(1,1,0,0));
    primex[3].v = _mm_shuffle_epi32(pqx[1].v, _MM_SHUFFLE(3,3,2,2));
    primex[4].v = _mm_shuffle_epi32(pqx[2].v, _MM_SHUFFLE(1,1,0,0));
    primex[5].v = _mm_shuffle_epi32(pqx[2].v, _MM_SHUFFLE(3,3,2,2));
    primex[6].v = _mm_shuffle_epi32(pqx[3].v, _MM_SHUFFLE(1,1,0,0));
    primex[7].v = _mm_shuffle_epi32(pqx[3].v, _MM_SHUFFLE(3,3,2,2));
    primex[8].v = _mm_shuffle_epi32(pqx[4].v, _MM_SHUFFLE(1,1,0,0));
    primex[9].v = _mm_shuffle_epi32(pqx[4].v, _MM_SHUFFLE(3,3,2,2));
    primez[0].v = _mm_shuffle_epi32(pqz[0].v, _MM_SHUFFLE(0,0,1,1));
    primez[1].v = _mm_shuffle_epi32(pqz[0].v, _MM_SHUFFLE(2,2,3,3));
    primez[2].v = _mm_shuffle_epi32(pqz[1].v, _MM_SHUFFLE(0,0,1,1));
    primez[3].v = _mm_shuffle_epi32(pqz[1].v, _MM_SHUFFLE(2,2,3,3));
    primez[4].v = _mm_shuffle_epi32(pqz[2].v, _MM_SHUFFLE(0,0,1,1));
    primez[5].v = _mm_shuffle_epi32(pqz[2].v, _MM_SHUFFLE(2,2,3,3));
    primez[6].v = _mm_shuffle_epi32(pqz[3].v, _MM_SHUFFLE(0,0,1,1));
    primez[7].v = _mm_shuffle_epi32(pqz[3].v, _MM_SHUFFLE(2,2,3,3));
    primez[8].v = _mm_shuffle_epi32(pqz[4].v, _MM_SHUFFLE(0,0,1,1));
    primez[9].v = _mm_shuffle_epi32(pqz[4].v, _MM_SHUFFLE(2,2,3,3));
}
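
// Combine the packed x and z halves into the nq operand layout.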
void
curve25519_make_nq(packedelem64 *nq, const packedelem32 *pqx, const packedelem32 *pqz) {
    nq[0].v = _mm_unpacklo_epi64(pqx[0].v, pqz[0].v);
    nq[1].v = _mm_unpackhi_epi64(pqx[0].v, pqz[0].v);
    nq[2].v = _mm_unpacklo_epi64(pqx[1].v, pqz[1].v);
    nq[3].v = _mm_unpackhi_epi64(pqx[1].v, pqz[1].v);
    nq[4].v = _mm_unpacklo_epi64(pqx[2].v, pqz[2].v);
    nq[5].v = _mm_unpackhi_epi64(pqx[2].v, pqz[2].v);
    nq[6].v = _mm_unpacklo_epi64(pqx[3].v, pqz[3].v);
    nq[7].v = _mm_unpackhi_epi64(pqx[3].v, pqz[3].v);
    nq[8].v = _mm_unpacklo_epi64(pqx[4].v, pqz[4].v);
    nq[9].v = _mm_unpackhi_epi64(pqx[4].v, pqz[4].v);
}
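
// Compute the packed operand for the final doublings directly from
// plain nqx/nqz: lanes hold (x + z) and (x - z), with a small carry
// fix-up on the low limbs of the subtraction.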
void
curve25519_compute_nq(packedelem64 *nq, const bignum25519 nqx, const bignum25519 nqz) {
    xmmi x0,x1,x2;
    xmmi z0,z1,z2;
    xmmi a0,a1,a2;
    xmmi s0,s1,s2;
    xmmi r0,r1;
    xmmi c1,c2;
    x0 = _mm_load_si128((xmmi*)nqx + 0);
    x1 = _mm_load_si128((xmmi*)nqx + 1);
    x2 = _mm_load_si128((xmmi*)nqx + 2);
    z0 = _mm_load_si128((xmmi*)nqz + 0);
    z1 = _mm_load_si128((xmmi*)nqz + 1);
    z2 = _mm_load_si128((xmmi*)nqz + 2);
    a0 = _mm_add_epi32(x0, z0);
    a1 = _mm_add_epi32(x1, z1);
    a2 = _mm_add_epi32(x2, z2);
    s0 = _mm_add_epi32(x0, packed2p0.v);
    s1 = _mm_add_epi32(x1, packed2p1.v);
    s2 = _mm_add_epi32(x2, packed2p2.v);
    s0 = _mm_sub_epi32(s0, z0);
    s1 = _mm_sub_epi32(s1, z1);
    s2 = _mm_sub_epi32(s2, z2);
    r0 = _mm_and_si128(_mm_shuffle_epi32(s0, _MM_SHUFFLE(2,2,0,0)), sse2_bot32bitmask.v);
    r1 = _mm_and_si128(_mm_shuffle_epi32(s0, _MM_SHUFFLE(3,3,1,1)), sse2_bot32bitmask.v);
    c1 = _mm_srli_epi32(r0, 26);
    c2 = _mm_srli_epi32(r1, 25);
    r0 = _mm_and_si128(r0, packedmask26.v);
    r1 = _mm_and_si128(r1, packedmask25.v);
    r0 = _mm_add_epi32(r0, _mm_slli_si128(c2, 8));
    r1 = _mm_add_epi32(r1, c1);
    s0 = _mm_unpacklo_epi64(_mm_unpacklo_epi32(r0, r1), _mm_unpackhi_epi32(r0, r1));
    s1 = _mm_add_epi32(s1, _mm_srli_si128(c2, 8));
    nq[0].v = _mm_unpacklo_epi64(a0, s0);
    nq[2].v = _mm_unpackhi_epi64(a0, s0);
    nq[4].v = _mm_unpacklo_epi64(a1, s1);
    nq[6].v = _mm_unpackhi_epi64(a1, s1);
    nq[8].v = _mm_unpacklo_epi64(a2, s2);
    nq[1].v = _mm_shuffle_epi32(nq[0].v, _MM_SHUFFLE(3,3,1,1));
    nq[3].v = _mm_shuffle_epi32(nq[2].v, _MM_SHUFFLE(3,3,1,1));
    nq[5].v = _mm_shuffle_epi32(nq[4].v, _MM_SHUFFLE(3,3,1,1));
    nq[7].v = _mm_shuffle_epi32(nq[6].v, _MM_SHUFFLE(3,3,1,1));
    nq[9].v = _mm_shuffle_epi32(nq[8].v, _MM_SHUFFLE(3,3,1,1));
}
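
// In place: even lanes become x + z, odd lanes become x - z.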
void
curve25519_addsub_packed64(packedelem64 *r) {
    packed32bignum25519 x,z,add,sub;

    x[0].v = _mm_unpacklo_epi64(r[0].v, r[1].v);
    z[0].v = _mm_unpackhi_epi64(r[0].v, r[1].v);
    x[1].v = _mm_unpacklo_epi64(r[2].v, r[3].v);
    z[1].v = _mm_unpackhi_epi64(r[2].v, r[3].v);
    x[2].v = _mm_unpacklo_epi64(r[4].v, r[5].v);
    z[2].v = _mm_unpackhi_epi64(r[4].v, r[5].v);
    x[3].v = _mm_unpacklo_epi64(r[6].v, r[7].v);
    z[3].v = _mm_unpackhi_epi64(r[6].v, r[7].v);
    x[4].v = _mm_unpacklo_epi64(r[8].v, r[9].v);
    z[4].v = _mm_unpackhi_epi64(r[8].v, r[9].v);

    curve25519_add_packed32(add, x, z);
    curve25519_sub_packed32(sub, x, z);

    r[0].v = _mm_unpacklo_epi64(add[0].v, sub[0].v);
    r[1].v = _mm_unpackhi_epi64(add[0].v, sub[0].v);
    r[2].v = _mm_unpacklo_epi64(add[1].v, sub[1].v);
    r[3].v = _mm_unpackhi_epi64(add[1].v, sub[1].v);
    r[4].v = _mm_unpacklo_epi64(add[2].v, sub[2].v);
    r[5].v = _mm_unpackhi_epi64(add[2].v, sub[2].v);
    r[6].v = _mm_unpacklo_epi64(add[3].v, sub[3].v);
    r[7].v = _mm_unpackhi_epi64(add[3].v, sub[3].v);
    r[8].v = _mm_unpacklo_epi64(add[4].v, sub[4].v);
    r[9].v = _mm_unpackhi_epi64(add[4].v, sub[4].v);
}
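
// Multiply packed lanes by the curve constants (121666 in one lane,
// 121665 in the other, per the packed121666121665 constant), then carry.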
void
curve25519_121665_packed64(packedelem64 *out, const packedelem64 *in) {
    xmmi c1,c2;

    out[0].v = _mm_mul_epu32(in[0].v, packed121666121665.v);
    out[1].v = _mm_mul_epu32(in[1].v, packed121666121665.v);
    out[2].v = _mm_mul_epu32(in[2].v, packed121666121665.v);
    out[3].v = _mm_mul_epu32(in[3].v, packed121666121665.v);
    out[4].v = _mm_mul_epu32(in[4].v, packed121666121665.v);
    out[5].v = _mm_mul_epu32(in[5].v, packed121666121665.v);
    out[6].v = _mm_mul_epu32(in[6].v, packed121666121665.v);
    out[7].v = _mm_mul_epu32(in[7].v, packed121666121665.v);
    out[8].v = _mm_mul_epu32(in[8].v, packed121666121665.v);
    out[9].v = _mm_mul_epu32(in[9].v, packed121666121665.v);

    c1 = _mm_srli_epi64(out[0].v, 26); c2 = _mm_srli_epi64(out[4].v, 26); out[0].v = _mm_and_si128(out[0].v, packedmask26.v); out[4].v = _mm_and_si128(out[4].v, packedmask26.v); out[1].v = _mm_add_epi64(out[1].v, c1); out[5].v = _mm_add_epi64(out[5].v, c2);
    c1 = _mm_srli_epi64(out[1].v, 25); c2 = _mm_srli_epi64(out[5].v, 25); out[1].v = _mm_and_si128(out[1].v, packedmask25.v); out[5].v = _mm_and_si128(out[5].v, packedmask25.v); out[2].v = _mm_add_epi64(out[2].v, c1); out[6].v = _mm_add_epi64(out[6].v, c2);
    c1 = _mm_srli_epi64(out[2].v, 26); c2 = _mm_srli_epi64(out[6].v, 26); out[2].v = _mm_and_si128(out[2].v, packedmask26.v); out[6].v = _mm_and_si128(out[6].v, packedmask26.v); out[3].v = _mm_add_epi64(out[3].v, c1); out[7].v = _mm_add_epi64(out[7].v, c2);
    c1 = _mm_srli_epi64(out[3].v, 25); c2 = _mm_srli_epi64(out[7].v, 25); out[3].v = _mm_and_si128(out[3].v, packedmask25.v); out[7].v = _mm_and_si128(out[7].v, packedmask25.v); out[4].v = _mm_add_epi64(out[4].v, c1); out[8].v = _mm_add_epi64(out[8].v, c2);
    c2 = _mm_srli_epi64(out[8].v, 26); out[8].v = _mm_and_si128(out[8].v, packedmask26.v); out[9].v = _mm_add_epi64(out[9].v, c2);
    c2 = _mm_srli_epi64(out[9].v, 25); out[9].v = _mm_and_si128(out[9].v, packedmask25.v); out[0].v = _mm_add_epi64(out[0].v, _mm_mul_epu32(c2, packednineteen.v));
    c1 = _mm_srli_epi64(out[0].v, 26); c2 = _mm_srli_epi64(out[4].v, 26); out[0].v = _mm_and_si128(out[0].v, packedmask26.v); out[4].v = _mm_and_si128(out[4].v, packedmask26.v); out[1].v = _mm_add_epi64(out[1].v, c1); out[5].v = _mm_add_epi64(out[5].v, c2);
}
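
// Assemble the operands of the last multiply of a ladder step from the
// squared values (sq) and their 121665/121666 products (sq121665), per
// the Montgomery doubling formulas, then compute nq = nqa * nqb.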
void
curve25519_final_nq(packedelem64 *nq, const packedelem64 *sq, const packedelem64 *sq121665) {
    packed32bignum25519 x, z, sub;
    packed64bignum25519 t, nqa, nqb;
    x[0].v = _mm_or_si128(_mm_unpacklo_epi64(sq[0].v, sq[1].v), _mm_slli_si128(_mm_unpacklo_epi64(sq121665[0].v, sq121665[1].v), 4));
    z[0].v = _mm_or_si128(_mm_unpackhi_epi64(sq[0].v, sq[1].v), _mm_slli_si128(_mm_unpackhi_epi64(sq121665[0].v, sq121665[1].v), 4));
    x[1].v = _mm_or_si128(_mm_unpacklo_epi64(sq[2].v, sq[3].v), _mm_slli_si128(_mm_unpacklo_epi64(sq121665[2].v, sq121665[3].v), 4));
    z[1].v = _mm_or_si128(_mm_unpackhi_epi64(sq[2].v, sq[3].v), _mm_slli_si128(_mm_unpackhi_epi64(sq121665[2].v, sq121665[3].v), 4));
    x[2].v = _mm_or_si128(_mm_unpacklo_epi64(sq[4].v, sq[5].v), _mm_slli_si128(_mm_unpacklo_epi64(sq121665[4].v, sq121665[5].v), 4));
    z[2].v = _mm_or_si128(_mm_unpackhi_epi64(sq[4].v, sq[5].v), _mm_slli_si128(_mm_unpackhi_epi64(sq121665[4].v, sq121665[5].v), 4));
    x[3].v = _mm_or_si128(_mm_unpacklo_epi64(sq[6].v, sq[7].v), _mm_slli_si128(_mm_unpacklo_epi64(sq121665[6].v, sq121665[7].v), 4));
    z[3].v = _mm_or_si128(_mm_unpackhi_epi64(sq[6].v, sq[7].v), _mm_slli_si128(_mm_unpackhi_epi64(sq121665[6].v, sq121665[7].v), 4));
    x[4].v = _mm_or_si128(_mm_unpacklo_epi64(sq[8].v, sq[9].v), _mm_slli_si128(_mm_unpacklo_epi64(sq121665[8].v, sq121665[9].v), 4));
    z[4].v = _mm_or_si128(_mm_unpackhi_epi64(sq[8].v, sq[9].v), _mm_slli_si128(_mm_unpackhi_epi64(sq121665[8].v, sq121665[9].v), 4));

    curve25519_sub_packed32(sub, x, z);

    t[0].v = _mm_shuffle_epi32(sub[0].v, _MM_SHUFFLE(1,1,0,0));
    t[1].v = _mm_shuffle_epi32(sub[0].v, _MM_SHUFFLE(3,3,2,2));
    t[2].v = _mm_shuffle_epi32(sub[1].v, _MM_SHUFFLE(1,1,0,0));
    t[3].v = _mm_shuffle_epi32(sub[1].v, _MM_SHUFFLE(3,3,2,2));
    t[4].v = _mm_shuffle_epi32(sub[2].v, _MM_SHUFFLE(1,1,0,0));
    t[5].v = _mm_shuffle_epi32(sub[2].v, _MM_SHUFFLE(3,3,2,2));
    t[6].v = _mm_shuffle_epi32(sub[3].v, _MM_SHUFFLE(1,1,0,0));
    t[7].v = _mm_shuffle_epi32(sub[3].v, _MM_SHUFFLE(3,3,2,2));
    t[8].v = _mm_shuffle_epi32(sub[4].v, _MM_SHUFFLE(1,1,0,0));
    t[9].v = _mm_shuffle_epi32(sub[4].v, _MM_SHUFFLE(3,3,2,2));
    nqa[0].v = _mm_unpacklo_epi64(sq[0].v, t[0].v);
    nqb[0].v = _mm_unpackhi_epi64(sq[0].v, t[0].v);
    nqa[1].v = _mm_unpacklo_epi64(sq[1].v, t[1].v);
    nqb[1].v = _mm_unpackhi_epi64(sq[1].v, t[1].v);
    nqa[2].v = _mm_unpacklo_epi64(sq[2].v, t[2].v);
    nqb[2].v = _mm_unpackhi_epi64(sq[2].v, t[2].v);
    nqa[3].v = _mm_unpacklo_epi64(sq[3].v, t[3].v);
    nqb[3].v = _mm_unpackhi_epi64(sq[3].v, t[3].v);
    nqa[4].v = _mm_unpacklo_epi64(sq[4].v, t[4].v);
    nqb[4].v = _mm_unpackhi_epi64(sq[4].v, t[4].v);
    nqa[5].v = _mm_unpacklo_epi64(sq[5].v, t[5].v);
    nqb[5].v = _mm_unpackhi_epi64(sq[5].v, t[5].v);
    nqa[6].v = _mm_unpacklo_epi64(sq[6].v, t[6].v);
    nqb[6].v = _mm_unpackhi_epi64(sq[6].v, t[6].v);
    nqa[7].v = _mm_unpacklo_epi64(sq[7].v, t[7].v);
    nqb[7].v = _mm_unpackhi_epi64(sq[7].v, t[7].v);
    nqa[8].v = _mm_unpacklo_epi64(sq[8].v, t[8].v);
    nqb[8].v = _mm_unpackhi_epi64(sq[8].v, t[8].v);
    nqa[9].v = _mm_unpacklo_epi64(sq[9].v, t[9].v);
    nqb[9].v = _mm_unpackhi_epi64(sq[9].v, t[9].v);

    curve25519_mul_packed64(nq, nqa, nqb);
}
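
// In-place exponentiation ladder: given b = z^(2^5 - 2^0), leave
// b = z^(2^250 - 2^0). Used by curve25519_recip.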
void
curve25519_pow_two5mtwo0_two250mtwo0(bignum25519 b) {
    ALIGN(16) bignum25519 t0,c;

    /* 2^10 - 2^5  */ curve25519_square_times(t0, b, 5);
    /* 2^10 - 2^0  */ curve25519_mul(b, t0, b);
    /* 2^20 - 2^10 */ curve25519_square_times(t0, b, 10);
    /* 2^20 - 2^0  */ curve25519_mul(c, t0, b);
    /* 2^40 - 2^20 */ curve25519_square_times(t0, c, 20);
    /* 2^40 - 2^0  */ curve25519_mul(t0, t0, c);
    /* 2^50 - 2^10 */ curve25519_square_times(t0, t0, 10);
    /* 2^50 - 2^0  */ curve25519_mul(b, t0, b);
    /* 2^100 - 2^50  */ curve25519_square_times(t0, b, 50);
    /* 2^100 - 2^0   */ curve25519_mul(c, t0, b);
    /* 2^200 - 2^100 */ curve25519_square_times(t0, c, 100);
    /* 2^200 - 2^0   */ curve25519_mul(t0, t0, c);
    /* 2^250 - 2^50  */ curve25519_square_times(t0, t0, 50);
    /* 2^250 - 2^0   */ curve25519_mul(b, t0, b);
}
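
// Field inversion by Fermat's little theorem: out = z^(p - 2), where
// p = 2^255 - 19, so p - 2 = 2^255 - 21.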
void
curve25519_recip(bignum25519 out, const bignum25519 z) {
    ALIGN(16) bignum25519 a, t0, b;

    /* 2 */               curve25519_square(a, z);
    /* 8 */               curve25519_square_times(t0, a, 2);
    /* 9 */               curve25519_mul(b, t0, z);
    /* 11 */              curve25519_mul(a, b, a);
    /* 22 */              curve25519_square(t0, a);
    /* 2^5 - 2^0 = 31 */  curve25519_mul(b, t0, b);
    /* 2^250 - 2^0 */     curve25519_pow_two5mtwo0_two250mtwo0(b);
    /* 2^255 - 2^5 */     curve25519_square_times(b, b, 5);
    /* 2^255 - 21 */      curve25519_mul(out, b, a);
}
ANONYMOUS_NAMESPACE_END

NAMESPACE_BEGIN(CryptoPP)
NAMESPACE_BEGIN(Donna)
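
// Scalar multiplication over curve25519 using the SSE2 code paths:
// clamp the secret scalar, run a Montgomery ladder over the other
// party's public x-coordinate, then normalize (x/z) and contract the
// result. A typical call is curve25519_mult_SSE2(shared, myPrivateKey,
// theirPublicKey), with all three arguments 32-byte little-endian values.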
int curve25519_mult_SSE2(byte sharedKey[32], const byte secretKey[32], const byte othersKey[32])
{
    FixedSizeSecBlock<byte, 32> e;
    for (size_t i = 0;i < 32;++i)
        e[i] = secretKey[i];

    /* Clamp the scalar */
    e[0] &= 0xf8; e[31] &= 0x7f; e[31] |= 0x40;
    ALIGN(16) bignum25519 nqx = {1}, nqpqz = {1}, nqz = {0}, nqpqx, zmone;
    packed32bignum25519 qx, qz, pqz, pqx;
    packed64bignum25519 nq, sq, sqscalar, prime, primex, primez, nqpq;
    bignum25519mulprecomp preq;
    size_t i=0, bit=0, lastbit=0;

    curve25519_expand(nqpqx, othersKey);
    curve25519_mul_precompute(&preq, nqpqx);
    /* do bits 254..3 */
    for (i = 254, lastbit=0; i >= 3; i--) {
        bit = (e[i/8] >> (i & 7)) & 1;
        curve25519_swap_conditional(nqx, nqpqx, (word32)(bit ^ lastbit));
        curve25519_swap_conditional(nqz, nqpqz, (word32)(bit ^ lastbit));

        curve25519_tangle32(qx, nqx, nqpqx);
        curve25519_tangle32(qz, nqz, nqpqz);

        curve25519_add_packed32(pqx, qx, qz);
        curve25519_sub_packed32(pqz, qx, qz);

        curve25519_make_nqpq(primex, primez, pqx, pqz);
        curve25519_mul_packed64(prime, primex, primez);
        curve25519_addsub_packed64(prime);
        curve25519_square_packed64(nqpq, prime);
        curve25519_untangle64(nqpqx, nqpqz, nqpq);
        curve25519_mul_precomputed(nqpqz, nqpqz, &preq);
        curve25519_make_nq(nq, pqx, pqz);
        curve25519_square_packed64(sq, nq);
        curve25519_121665_packed64(sqscalar, sq);
        curve25519_final_nq(nq, sq, sqscalar);
        curve25519_untangle64(nqx, nqz, nq);

        lastbit = bit;
    }
    /* undo the last conditional swap */
    curve25519_swap_conditional(nqx, nqpqx, (word32)bit);
    curve25519_swap_conditional(nqz, nqpqz, (word32)bit);
    /* the final 3 bits are always zero for a clamped scalar, so only
       doublings remain */
    for (i = 0; i < 3; i++) {
        curve25519_compute_nq(nq, nqx, nqz);
        curve25519_square_packed64(sq, nq);
        curve25519_121665_packed64(sqscalar, sq);
        curve25519_final_nq(nq, sq, sqscalar);
        curve25519_untangle64(nqx, nqz, nq);
    }
    curve25519_recip(zmone, nqz);
    curve25519_mul(nqz, nqx, zmone);
    curve25519_contract(sharedKey, nqz);

    return 0;
}

NAMESPACE_END  // Donna
NAMESPACE_END  // CryptoPP

#endif  // CRYPTOPP_CURVE25519_SSE2