Crypto++  8.2
Free C++ class library of cryptographic schemes
rdrand.s
1 ;; rdrand.asm - written and placed in public domain by Jeffrey Walton and Uri Blumenthal.
2 ;; Copyright assigned to the Crypto++ project.
3 
4 ;; This ASM file provides RDRAND and RDSEED to downlevel Unix and Linux tool
5 ;; chains. You will need a modern Nasm, however. You can also use it in place
6 ;; of intrinsics. The routines below run a little faster than the intrinsic
7 ;; based routines.
8 
9 ;; nasm -f elf32 rdrand.s -DX86 -g -o rdrand-x86.o
10 ;; nasm -f elfx32 rdrand.s -DX32 -g -o rdrand-x32.o
11 ;; nasm -f elf64 rdrand.s -DX64 -g -o rdrand-x64.o
12 
13 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
14 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
15 
16 ;; C/C++ Function prototypes
17 ;; X86, X32 and X64:
18 ;; extern "C" void NASM_RDRAND_GenerateBlock(byte* ptr, size_t size);
19 ;; extern "C" void NASM_RDSEED_GenerateBlock(byte* ptr, size_t size);
20 
21 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
22 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
23 
;; Per-target register and constant selection. Exactly one of X86, X32
;; or X64 has to be defined on the nasm command line.
;;
;;   arg1, arg2  stack operands holding the two arguments (X86 only)
;;   buffer      register that holds the output pointer
;;   bsize       register that holds the remaining byte count
;;   lsize       low part of bsize, used once only tail bytes remain
;;   MWSIZE      bytes written by one full-word store in the tail loop

%ifdef X86                      ;; Set via the command line
%define arg1    [esp+04h]
%define arg2    [esp+08h]
%define buffer  ecx
%define bsize   edx
%define lsize   dl              ;; Used for tail bytes, 1-byte constants
%define MWSIZE  04h             ;; the X86 routines store eax

%elifdef X32                    ;; Set via the command line
%define buffer  edi             ;; Linux ABI
%define bsize   esi             ;; Linux ABI
%define lsize   si              ;; Used for tail bytes, 2-byte constants
;; 8, not 4: the X32 build assembles the same routines as X64, and those
;; store rax. With a word size of 4 the tail loop would write 8 bytes
;; while accounting for only 4, running past the end of the buffer.
%define MWSIZE  08h

%elifdef X64                    ;; Set via the command line
%ifdef CYGWIN                   ;; Cygwin follows Windows ABI here, not Linux ABI
%define buffer  rcx             ;; Windows ABI
%define bsize   rdx             ;; Windows ABI
%define lsize   dx              ;; Used for tail bytes, 2-byte constants
%else
%define buffer  rdi             ;; Linux ABI
%define bsize   rsi             ;; Linux ABI
%define lsize   si              ;; Used for tail bytes, 2-byte constants
%endif
%define MWSIZE  08h             ;; the X64 routines store rax

%else
%error Missing or unknown architecture
%endif
53 
54 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
55 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
56 
57 ;; Fixups
58 
;; Exported-name fixups. When DARWIN is defined, or when CYGWIN and X86
;; are both defined, the two public symbols are emitted with a leading
;; underscore (presumably to match the C symbol naming of those targets).

%ifdef DARWIN
  %define NASM_RDRAND_GenerateBlock _NASM_RDRAND_GenerateBlock
  %define NASM_RDSEED_GenerateBlock _NASM_RDSEED_GenerateBlock
%elifdef CYGWIN
  %ifdef X86
    %define NASM_RDRAND_GenerateBlock _NASM_RDRAND_GenerateBlock
    %define NASM_RDSEED_GenerateBlock _NASM_RDSEED_GenerateBlock
  %endif
%endif
70 
71 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
72 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
73 
%ifdef X86 ;; Set via the command line

;;---------------------------------------------------------------------
;; extern "C" void NASM_RDRAND_GenerateBlock(byte* ptr, size_t size);
;;
;; 32-bit build. Fills ptr[0..size) with values drawn from RDRAND.
;;
;; In:    arg1 = ptr, arg2 = size (read from the stack)
;; Out:   eax = 0
;; Uses:  eax, buffer, bsize (lsize is its low byte), flags
;;
;; Each RDRAND is re-issued until it comes back with the carry flag
;; set; there is no retry limit.
;;---------------------------------------------------------------------

global NASM_RDRAND_GenerateBlock
section .text
align 8

NASM_RDRAND_GenerateBlock:

        mov     buffer, arg1            ;; buffer = ptr
        mov     bsize, arg2             ;; bsize  = size

        ;; A block of 16-bytes appears to be optimal. Adding
        ;; more rdrand calls degrades performance.
        cmp     bsize, 16
        jb      .tail

        ;; Four 32-bit draws per pass, 16 bytes stored
.block16:

.draw_d0:
        rdrand  eax
        jnc     .draw_d0
        mov     [buffer+0], eax

.draw_d1:
        rdrand  eax
        jnc     .draw_d1
        mov     [buffer+4], eax

.draw_d2:
        rdrand  eax
        jnc     .draw_d2
        mov     [buffer+8], eax

.draw_d3:
        rdrand  eax
        jnc     .draw_d3
        mov     [buffer+12], eax

        add     buffer, 16
        sub     bsize, 16

        cmp     bsize, 16
        jae     .block16

        ;; Fewer than 16 bytes remain, so the count fits in lsize
.tail:
        test    lsize, lsize
        jz      .done

.draw_tail:
        rdrand  eax
        jnc     .draw_tail

        cmp     lsize, MWSIZE
        jb      .partial

        ;; A whole machine word still fits
        mov     [buffer], eax
        add     buffer, MWSIZE
        sub     lsize, MWSIZE
        jmp     .tail

        ;; 1, 2 or 3 bytes remain; bits 1 and 0 of lsize select a
        ;; 2-byte and a 1-byte store
.partial:
        test    lsize, 2
        jz      .no_word

        mov     [buffer], ax
        shr     eax, 16                 ;; expose unused random bits
        add     buffer, 2

.no_word:
        test    lsize, 1
        jz      .done

        mov     [buffer], al

.done:
        xor     eax, eax
        ret

%endif ;; X86
172 
173 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
174 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
175 
%ifdef X64 or X32 ;; Set via the command line

;;---------------------------------------------------------------------
;; extern "C" void NASM_RDRAND_GenerateBlock(byte* ptr, size_t size);
;;
;; X64 and X32 builds. Fills ptr[0..size) with values drawn from
;; RDRAND, 64 bits per draw.
;;
;; In:    buffer = ptr, bsize = size (both arrive in registers)
;; Out:   rax = 0
;; Uses:  rax, buffer, bsize (lsize is its low 16 bits), flags
;;
;; Each RDRAND is re-issued until it comes back with the carry flag
;; set; there is no retry limit.
;;---------------------------------------------------------------------

global NASM_RDRAND_GenerateBlock
section .text
align 16

NASM_RDRAND_GenerateBlock:

        ;; Arguments are already in buffer and bsize; nothing to load.

        ;; A block of 32-bytes appears to be optimal. Adding
        ;; more rdrand calls degrades performance.
        cmp     bsize, 32
        jb      .tail

        ;; Four 64-bit draws per pass, 32 bytes stored
.block32:

.draw_q0:
        rdrand  rax
        jnc     .draw_q0
        mov     [buffer+0], rax

.draw_q1:
        rdrand  rax
        jnc     .draw_q1
        mov     [buffer+8], rax

.draw_q2:
        rdrand  rax
        jnc     .draw_q2
        mov     [buffer+16], rax

.draw_q3:
        rdrand  rax
        jnc     .draw_q3
        mov     [buffer+24], rax

        sub     bsize, 32
        add     buffer, 32

        cmp     bsize, 32
        jae     .block32

        ;; Fewer than 32 bytes remain, so the count fits in lsize
.tail:
        test    lsize, lsize
        jz      .done

.draw_tail:
        rdrand  rax
        jnc     .draw_tail

        ;; The full-word store below always writes the 8 bytes of rax,
        ;; so the threshold and the step are 8 regardless of MWSIZE.
        ;; Stepping by a MWSIZE of 4 would let the 8-byte store run up
        ;; to 4 bytes past the end of the buffer.
        cmp     lsize, 8
        jb      .partial

        mov     [buffer], rax
        add     buffer, 8
        sub     lsize, 8
        jmp     .tail

        ;; 1..7 bytes remain; bits 2, 1 and 0 of lsize select a 4-byte,
        ;; a 2-byte and a 1-byte store
.partial:
        test    lsize, 4
        jz      .no_dword

        mov     [buffer], eax
        shr     rax, 32                 ;; expose unused random bits
        add     buffer, 4

.no_dword:
        test    lsize, 2
        jz      .no_word

        mov     [buffer], ax
        shr     eax, 16                 ;; expose unused random bits
        add     buffer, 2

.no_word:
        test    lsize, 1
        jz      .done

        mov     [buffer], al

.done:
        xor     eax, eax                ;; 32-bit write clears all of rax
        ret

%endif ;; X64 or X32
280 
281 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
282 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
283 
%ifdef X86 ;; Set via the command line

;;---------------------------------------------------------------------
;; extern "C" void NASM_RDSEED_GenerateBlock(byte* ptr, size_t size);
;;
;; 32-bit build. Fills ptr[0..size) with values drawn from RDSEED.
;;
;; In:    arg1 = ptr, arg2 = size (read from the stack)
;; Out:   eax = 0
;; Uses:  eax, buffer, bsize (lsize is its low byte), flags
;;
;; Each RDSEED is re-issued until it comes back with the carry flag
;; set; there is no retry limit.
;;---------------------------------------------------------------------

global NASM_RDSEED_GenerateBlock
section .text
align 8

NASM_RDSEED_GenerateBlock:

        mov     buffer, arg1            ;; buffer = ptr
        mov     bsize, arg2             ;; bsize  = size

        ;; A block of 16-bytes appears to be optimal. Adding
        ;; more rdseed calls degrades performance.
        cmp     bsize, 16
        jb      .tail

        ;; Four 32-bit draws per pass, 16 bytes stored
.block16:

.draw_d0:
        rdseed  eax
        jnc     .draw_d0
        mov     [buffer+0], eax

.draw_d1:
        rdseed  eax
        jnc     .draw_d1
        mov     [buffer+4], eax

.draw_d2:
        rdseed  eax
        jnc     .draw_d2
        mov     [buffer+8], eax

.draw_d3:
        rdseed  eax
        jnc     .draw_d3
        mov     [buffer+12], eax

        add     buffer, 16
        sub     bsize, 16

        cmp     bsize, 16
        jae     .block16

        ;; Fewer than 16 bytes remain, so the count fits in lsize
.tail:
        test    lsize, lsize
        jz      .done

.draw_tail:
        rdseed  eax
        jnc     .draw_tail

        cmp     lsize, MWSIZE
        jb      .partial

        ;; A whole machine word still fits
        mov     [buffer], eax
        add     buffer, MWSIZE
        sub     lsize, MWSIZE
        jmp     .tail

        ;; 1, 2 or 3 bytes remain; bits 1 and 0 of lsize select a
        ;; 2-byte and a 1-byte store
.partial:
        test    lsize, 2
        jz      .no_word

        mov     [buffer], ax
        shr     eax, 16                 ;; expose unused random bits
        add     buffer, 2

.no_word:
        test    lsize, 1
        jz      .done

        mov     [buffer], al

.done:
        xor     eax, eax
        ret

%endif ;; X86
382 
383 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
384 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
385 
%ifdef X64 or X32 ;; Set via the command line

;;---------------------------------------------------------------------
;; extern "C" void NASM_RDSEED_GenerateBlock(byte* ptr, size_t size);
;;
;; X64 and X32 builds. Fills ptr[0..size) with values drawn from
;; RDSEED, 64 bits per draw.
;;
;; In:    buffer = ptr, bsize = size (both arrive in registers)
;; Out:   rax = 0
;; Uses:  rax, buffer, bsize (lsize is its low 16 bits), flags
;;
;; Each RDSEED is re-issued until it comes back with the carry flag
;; set; there is no retry limit.
;;---------------------------------------------------------------------

global NASM_RDSEED_GenerateBlock
section .text
align 16

NASM_RDSEED_GenerateBlock:

        ;; Arguments are already in buffer and bsize; nothing to load.

        ;; A block of 32-bytes appears to be optimal. Adding
        ;; more rdseed calls degrades performance.
        cmp     bsize, 32
        jb      .tail

        ;; Four 64-bit draws per pass, 32 bytes stored
.block32:

.draw_q0:
        rdseed  rax
        jnc     .draw_q0
        mov     [buffer+0], rax

.draw_q1:
        rdseed  rax
        jnc     .draw_q1
        mov     [buffer+8], rax

.draw_q2:
        rdseed  rax
        jnc     .draw_q2
        mov     [buffer+16], rax

.draw_q3:
        rdseed  rax
        jnc     .draw_q3
        mov     [buffer+24], rax

        sub     bsize, 32
        add     buffer, 32

        cmp     bsize, 32
        jae     .block32

        ;; Fewer than 32 bytes remain, so the count fits in lsize
.tail:
        test    lsize, lsize
        jz      .done

.draw_tail:
        rdseed  rax
        jnc     .draw_tail

        ;; The full-word store below always writes the 8 bytes of rax,
        ;; so the threshold and the step are 8 regardless of MWSIZE.
        ;; Stepping by a MWSIZE of 4 would let the 8-byte store run up
        ;; to 4 bytes past the end of the buffer.
        cmp     lsize, 8
        jb      .partial

        mov     [buffer], rax
        add     buffer, 8
        sub     lsize, 8
        jmp     .tail

        ;; 1..7 bytes remain; bits 2, 1 and 0 of lsize select a 4-byte,
        ;; a 2-byte and a 1-byte store
.partial:
        test    lsize, 4
        jz      .no_dword

        mov     [buffer], eax
        shr     rax, 32                 ;; expose unused random bits
        add     buffer, 4

.no_dword:
        test    lsize, 2
        jz      .no_word

        mov     [buffer], ax
        shr     eax, 16                 ;; expose unused random bits
        add     buffer, 2

.no_word:
        test    lsize, 1
        jz      .done

        mov     [buffer], al

.done:
        xor     eax, eax                ;; 32-bit write clears all of rax
        ret

%endif ;; X64 or X32
490 
491 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
492 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;