;; ext-cryptopp/rdrand.s
;;
;; rdrand.s - written and placed in public domain by Jeffrey Walton and Uri Blumenthal.
;;            Copyright assigned to the Crypto++ project.

;; This ASM file provides RDRAND and RDSEED to downlevel Unix and Linux tool
;; chains. You will need a modern Nasm, however. You can also use it in place
;; of intrinsics. The routines below run a little faster than the
;; intrinsic-based routines.

;; nasm -f elf32 rdrand.s -DX86 -g -o rdrand-x86.o
;; nasm -f elfx32 rdrand.s -DX32 -g -o rdrand-x32.o
;; nasm -f elf64 rdrand.s -DX64 -g -o rdrand-x64.o

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; C/C++ Function prototypes
;;   X86, X32 and X64:
;;      extern "C" void NASM_RDRAND_GenerateBlock(byte* ptr, size_t size);
;;      extern "C" void NASM_RDSEED_GenerateBlock(byte* ptr, size_t size);

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Per-architecture aliases used by every routine in this file:
;;   buffer - register holding the output pointer
;;   bsize  - register holding the number of bytes still to produce
;;   lsize  - narrow view of bsize, used once only a small tail remains
;;   MWSIZE - machine word size in bytes
;; Exactly one of X86, X32 or X64 must be defined on the command line.

%ifdef X86         ;; Set via the command line
  %define arg1    [esp+04h]   ;; first stack argument  (ptr)
  %define arg2    [esp+08h]   ;; second stack argument (size)
  %define buffer  ecx
  %define bsize   edx
  %define lsize   dl          ;; Used for tail bytes, 1-byte constants
  %define MWSIZE  04h         ;; machine word size

%elifdef X32       ;; Set via the command line
  %define buffer  edi         ;; Linux ABI
  %define bsize   esi         ;; Linux ABI
  %define lsize   si          ;; Used for tail bytes, 2-byte constants
  %define MWSIZE  04h         ;; machine word size

%elifdef X64       ;; Set via the command line
  %ifdef CYGWIN               ;; Cygwin follows Windows ABI here, not Linux ABI
    %define buffer  rcx       ;; Windows ABI
    %define bsize   rdx       ;; Windows ABI
    %define lsize   dx        ;; Used for tail bytes, 2-byte constants
  %else
    %define buffer  rdi       ;; Linux ABI
    %define bsize   rsi       ;; Linux ABI
    %define lsize   si        ;; Used for tail bytes, 2-byte constants
  %endif
  %define MWSIZE  08h         ;; machine word size

%else
  %error Missing or unknown architecture
%endif

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Fixups
;; On the targets below the exported labels are renamed with a leading
;; underscore, so the `global` declarations and labels further down emit
;; _NASM_RDRAND_GenerateBlock / _NASM_RDSEED_GenerateBlock instead.

%ifdef DARWIN
  %define NASM_RDRAND_GenerateBlock _NASM_RDRAND_GenerateBlock
  %define NASM_RDSEED_GenerateBlock _NASM_RDSEED_GenerateBlock
%endif

;; Cygwin: only the 32-bit (X86) build gets the underscore prefix.
%ifdef CYGWIN
  %ifdef X86
    %define NASM_RDRAND_GenerateBlock _NASM_RDRAND_GenerateBlock
    %define NASM_RDSEED_GenerateBlock _NASM_RDSEED_GenerateBlock
  %endif
%endif

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%ifdef X86      ;; Set via the command line

;;---------------------------------------------------------------------
;; void NASM_RDRAND_GenerateBlock(byte* ptr, size_t size)     (32-bit)
;;   In:       arg1 = ptr, arg2 = size, both read from the stack
;;   Out:      eax = 0; size bytes at ptr filled from RDRAND
;;   Modifies: eax, buffer, bsize, flags
;;   Every RDRAND is reissued, with no retry limit, until it sets CF.
;;---------------------------------------------------------------------

global NASM_RDRAND_GenerateBlock
section .text
align 8

NASM_RDRAND_GenerateBlock:

.Load_Arguments:
    mov     buffer, arg1
    mov     bsize, arg2

    ;; A block of 16-bytes appears to be optimal. Adding
    ;; more rdrand calls degrades performance.
    cmp     bsize, 16
    jb      .GenerateBlock_4

    ;; Main loop: four 32-bit draws (16 bytes) per pass
.GenerateBlock_16:

.Call_RDRAND_EAX_4:
    rdrand  eax
    jnc     .Call_RDRAND_EAX_4      ;; CF clear: nothing delivered, try again
    mov     [buffer+0], eax

.Call_RDRAND_EAX_3:
    rdrand  eax
    jnc     .Call_RDRAND_EAX_3
    mov     [buffer+4], eax

.Call_RDRAND_EAX_2:
    rdrand  eax
    jnc     .Call_RDRAND_EAX_2
    mov     [buffer+8], eax

.Call_RDRAND_EAX_1:
    rdrand  eax
    jnc     .Call_RDRAND_EAX_1
    mov     [buffer+12], eax

    sub     bsize, 16
    add     buffer, 16

    cmp     bsize, 16
    jae     .GenerateBlock_16

    ;; Fewer than 16 bytes remain, so the whole count fits in lsize
.GenerateBlock_4:
    test    lsize, lsize
    jz      .GenerateBlock_Return

.Call_RDRAND_EAX_0:
    rdrand  eax
    jnc     .Call_RDRAND_EAX_0

    cmp     lsize, MWSIZE
    jb      .Partial_Machine_Word

.Full_Machine_Word:
    mov     [buffer], eax
    add     buffer, MWSIZE
    sub     lsize, MWSIZE

    ;; Continue
    jmp     .GenerateBlock_4

    ;; 1,2,3 bytes remain
.Partial_Machine_Word:
    ;; Test bit 1 to see if size is at least 2
    test    lsize, 2
    jz      .Bit_1_Not_Set

    mov     [buffer], ax
    shr     eax, 16                 ;; move the unused upper half down into ax
    add     buffer, 2

.Bit_1_Not_Set:
    ;; Test bit 0 to see if one last byte is owed
    test    lsize, 1
    jz      .Bit_0_Not_Set

    mov     [buffer], al

.Bit_0_Not_Set:
    ;; We've hit all the bits

.GenerateBlock_Return:
    xor     eax, eax
    ret

%endif  ;; X86

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; NOTE(review): NASM's %ifdef appears to accept a list of identifiers and to
;; be true when any of them is defined; "or" is then just one more (undefined)
;; name in that list. Confirm against the NASM version in use.
%ifdef X64 or X32  ;; Set via the command line

;; Every store in this routine is a full 64-bit rax, on X64 and on X32 alike.
;; The tail loop must therefore step by 8, not by MWSIZE (which is 04h under
;; X32 and would let an 8-byte store run past the end of the buffer).
%define QWORD_BYTES 08h

;;---------------------------------------------------------------------
;; void NASM_RDRAND_GenerateBlock(byte* ptr, size_t size)     (64-bit mode)
;;   In:       buffer = ptr, bsize = size (argument registers, see defines)
;;   Out:      rax = 0; size bytes at ptr filled from RDRAND
;;   Modifies: rax, buffer, bsize, flags
;;   Every RDRAND is reissued, with no retry limit, until it sets CF.
;;---------------------------------------------------------------------

global NASM_RDRAND_GenerateBlock
section .text
align 16

NASM_RDRAND_GenerateBlock:

    ;; No Load_Arguments step: the arguments already sit in buffer/bsize

    ;; A block of 32-bytes appears to be optimal. Adding
    ;; more rdrand calls degrades performance.
    cmp     bsize, 32
    jb      .GenerateBlock_8

    ;; Main loop: four 64-bit draws (32 bytes) per pass
.GenerateBlock_32:

.Call_RDRAND_RAX_4:
    rdrand  rax
    jnc     .Call_RDRAND_RAX_4      ;; CF clear: nothing delivered, try again
    mov     [buffer+0], rax

.Call_RDRAND_RAX_3:
    rdrand  rax
    jnc     .Call_RDRAND_RAX_3
    mov     [buffer+8], rax

.Call_RDRAND_RAX_2:
    rdrand  rax
    jnc     .Call_RDRAND_RAX_2
    mov     [buffer+16], rax

.Call_RDRAND_RAX_1:
    rdrand  rax
    jnc     .Call_RDRAND_RAX_1
    mov     [buffer+24], rax

    sub     bsize, 32
    add     buffer, 32

    cmp     bsize, 32
    jae     .GenerateBlock_32

    ;; Fewer than 32 bytes remain, so the whole count fits in lsize
.GenerateBlock_8:
    test    lsize, lsize
    jz      .GenerateBlock_Return

.Call_RDRAND_RAX_0:
    rdrand  rax
    jnc     .Call_RDRAND_RAX_0

    ;; Only store the whole qword when at least 8 bytes are still owed
    cmp     lsize, QWORD_BYTES
    jb      .Partial_Machine_Word

.Full_Machine_Word:
    mov     [buffer], rax
    add     buffer, QWORD_BYTES
    sub     lsize, QWORD_BYTES

    ;; Continue
    jmp     .GenerateBlock_8

    ;; 1,2,3,4,5,6,7 bytes remain
.Partial_Machine_Word:
    ;; Test bit 2 to see if size is at least 4
    test    lsize, 4
    jz      .Bit_2_Not_Set

    mov     [buffer], eax
    shr     rax, 32                 ;; move the unused upper half down into eax
    add     buffer, 4

.Bit_2_Not_Set:
    ;; Test bit 1 to see if a 2-byte piece is owed
    test    lsize, 2
    jz      .Bit_1_Not_Set

    mov     [buffer], ax
    shr     eax, 16                 ;; move the next unused bits down into ax
    add     buffer, 2

.Bit_1_Not_Set:
    ;; Test bit 0 to see if one last byte is owed
    test    lsize, 1
    jz      .Bit_0_Not_Set

    mov     [buffer], al

.Bit_0_Not_Set:
    ;; We've hit all the bits

.GenerateBlock_Return:
    xor     eax, eax                ;; 32-bit write also clears the top of rax
    ret

%endif  ;; X64 or X32
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%ifdef X86      ;; Set via the command line

;;---------------------------------------------------------------------
;; void NASM_RDSEED_GenerateBlock(byte* ptr, size_t size)     (32-bit)
;;   In:       arg1 = ptr, arg2 = size, both read from the stack
;;   Out:      eax = 0; size bytes at ptr filled from RDSEED
;;   Modifies: eax, buffer, bsize, flags
;;   Every RDSEED is reissued, with no retry limit, until it sets CF.
;;---------------------------------------------------------------------

global NASM_RDSEED_GenerateBlock
section .text
align 8

NASM_RDSEED_GenerateBlock:

.Load_Arguments:
    mov     buffer, arg1
    mov     bsize, arg2

    ;; A block of 16-bytes appears to be optimal. Adding
    ;; more rdseed calls degrades performance.
    cmp     bsize, 16
    jb      .GenerateBlock_4

    ;; Main loop: four 32-bit draws (16 bytes) per pass
.GenerateBlock_16:

.Call_RDSEED_EAX_4:
    rdseed  eax
    jnc     .Call_RDSEED_EAX_4      ;; CF clear: nothing delivered, try again
    mov     [buffer+0], eax

.Call_RDSEED_EAX_3:
    rdseed  eax
    jnc     .Call_RDSEED_EAX_3
    mov     [buffer+4], eax

.Call_RDSEED_EAX_2:
    rdseed  eax
    jnc     .Call_RDSEED_EAX_2
    mov     [buffer+8], eax

.Call_RDSEED_EAX_1:
    rdseed  eax
    jnc     .Call_RDSEED_EAX_1
    mov     [buffer+12], eax

    sub     bsize, 16
    add     buffer, 16

    cmp     bsize, 16
    jae     .GenerateBlock_16

    ;; Fewer than 16 bytes remain, so the whole count fits in lsize
.GenerateBlock_4:
    test    lsize, lsize
    jz      .GenerateBlock_Return

.Call_RDSEED_EAX_0:
    rdseed  eax
    jnc     .Call_RDSEED_EAX_0

    cmp     lsize, MWSIZE
    jb      .Partial_Machine_Word

.Full_Machine_Word:
    mov     [buffer], eax
    add     buffer, MWSIZE
    sub     lsize, MWSIZE

    ;; Continue
    jmp     .GenerateBlock_4

    ;; 1,2,3 bytes remain
.Partial_Machine_Word:
    ;; Test bit 1 to see if size is at least 2
    test    lsize, 2
    jz      .Bit_1_Not_Set

    mov     [buffer], ax
    shr     eax, 16                 ;; move the unused upper half down into ax
    add     buffer, 2

.Bit_1_Not_Set:
    ;; Test bit 0 to see if one last byte is owed
    test    lsize, 1
    jz      .Bit_0_Not_Set

    mov     [buffer], al

.Bit_0_Not_Set:
    ;; We've hit all the bits

.GenerateBlock_Return:
    xor     eax, eax
    ret

%endif  ;; X86

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; NOTE(review): NASM's %ifdef appears to accept a list of identifiers and to
;; be true when any of them is defined; "or" is then just one more (undefined)
;; name in that list. Confirm against the NASM version in use.
%ifdef X64 or X32  ;; Set via the command line

;; Every store in this routine is a full 64-bit rax, on X64 and on X32 alike.
;; The tail loop must therefore step by 8, not by MWSIZE (which is 04h under
;; X32 and would let an 8-byte store run past the end of the buffer).
%define QWORD_BYTES 08h

;;---------------------------------------------------------------------
;; void NASM_RDSEED_GenerateBlock(byte* ptr, size_t size)     (64-bit mode)
;;   In:       buffer = ptr, bsize = size (argument registers, see defines)
;;   Out:      rax = 0; size bytes at ptr filled from RDSEED
;;   Modifies: rax, buffer, bsize, flags
;;   Every RDSEED is reissued, with no retry limit, until it sets CF.
;;---------------------------------------------------------------------

global NASM_RDSEED_GenerateBlock
section .text
align 16

NASM_RDSEED_GenerateBlock:

    ;; No Load_Arguments step: the arguments already sit in buffer/bsize

    ;; A block of 32-bytes appears to be optimal. Adding
    ;; more rdseed calls degrades performance.
    cmp     bsize, 32
    jb      .GenerateBlock_8

    ;; Main loop: four 64-bit draws (32 bytes) per pass
.GenerateBlock_32:

.Call_RDSEED_RAX_4:
    rdseed  rax
    jnc     .Call_RDSEED_RAX_4      ;; CF clear: nothing delivered, try again
    mov     [buffer+0], rax

.Call_RDSEED_RAX_3:
    rdseed  rax
    jnc     .Call_RDSEED_RAX_3
    mov     [buffer+8], rax

.Call_RDSEED_RAX_2:
    rdseed  rax
    jnc     .Call_RDSEED_RAX_2
    mov     [buffer+16], rax

.Call_RDSEED_RAX_1:
    rdseed  rax
    jnc     .Call_RDSEED_RAX_1
    mov     [buffer+24], rax

    sub     bsize, 32
    add     buffer, 32

    cmp     bsize, 32
    jae     .GenerateBlock_32

    ;; Fewer than 32 bytes remain, so the whole count fits in lsize
.GenerateBlock_8:
    test    lsize, lsize
    jz      .GenerateBlock_Return

.Call_RDSEED_RAX_0:
    rdseed  rax
    jnc     .Call_RDSEED_RAX_0

    ;; Only store the whole qword when at least 8 bytes are still owed
    cmp     lsize, QWORD_BYTES
    jb      .Partial_Machine_Word

.Full_Machine_Word:
    mov     [buffer], rax
    add     buffer, QWORD_BYTES
    sub     lsize, QWORD_BYTES

    ;; Continue
    jmp     .GenerateBlock_8

    ;; 1,2,3,4,5,6,7 bytes remain
.Partial_Machine_Word:
    ;; Test bit 2 to see if size is at least 4
    test    lsize, 4
    jz      .Bit_2_Not_Set

    mov     [buffer], eax
    shr     rax, 32                 ;; move the unused upper half down into eax
    add     buffer, 4

.Bit_2_Not_Set:
    ;; Test bit 1 to see if a 2-byte piece is owed
    test    lsize, 2
    jz      .Bit_1_Not_Set

    mov     [buffer], ax
    shr     eax, 16                 ;; move the next unused bits down into ax
    add     buffer, 2

.Bit_1_Not_Set:
    ;; Test bit 0 to see if one last byte is owed
    test    lsize, 1
    jz      .Bit_0_Not_Set

    mov     [buffer], al

.Bit_0_Not_Set:
    ;; We've hit all the bits

.GenerateBlock_Return:
    xor     eax, eax                ;; 32-bit write also clears the top of rax
    ret

%endif  ;; X64 or X32

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;