;; Mirror of https://github.com/shadps4-emu/ext-cryptopp.git
;; (synced 2024-11-30 05:10:40 +00:00). 493 lines, 11 KiB.
;; The hosting page labels this file "ArmAsm"; the content is x86 NASM source.
;; rdrand.asm - written and placed in public domain by Jeffrey Walton and Uri Blumenthal.
|
|
;; Copyright assigned to the Crypto++ project.
|
|
|
|
;; This ASM file provides RDRAND and RDSEED to downlevel Unix and Linux tool
|
|
;; chains. You will need a modern Nasm, however. You can also use it in place
|
|
;; of intrinsics. The routines below run a little faster than the intrinsic
|
|
;; based routines.
|
|
|
|
;; nasm -f elf32 rdrand.s -DX86 -g -o rdrand-x86.o
|
|
;; nasm -f elfx32 rdrand.s -DX32 -g -o rdrand-x32.o
|
|
;; nasm -f elf64 rdrand.s -DX64 -g -o rdrand-x64.o
|
|
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
;; C/C++ Function prototypes
|
|
;; X86, X32 and X64:
|
|
;; extern "C" void NASM_RDRAND_GenerateBlock(byte* ptr, size_t size);
|
|
;; extern "C" void NASM_RDSEED_GenerateBlock(byte* ptr, size_t size);
|
|
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
;; Register and size aliases, selected by the architecture macro given on the
;; NASM command line (one of -DX86, -DX32 or -DX64; the first match wins):
;;   arg1, arg2  stack operands the X86 routines load buffer/bsize from
;;   buffer      register holding the current output address
;;   bsize       register holding the count of bytes still to generate
;;   lsize       low part of bsize, used by the tail code once fewer bytes
;;               than one unrolled block remain
;;   MWSIZE      number of bytes one full-word store in the tail loop writes

%ifdef X86                        ;; Set via the command line
  %define arg1     [esp+04h]
  %define arg2     [esp+08h]
  %define buffer   ecx
  %define bsize    edx
  %define lsize    dl             ;; Used for tail bytes, 1-byte constants
  %define MWSIZE   04h            ;; the X86 tail loop stores eax (4 bytes)

%elifdef X32                      ;; Set via the command line
  %define buffer   edi            ;; Linux ABI
  %define bsize    esi            ;; Linux ABI
  %define lsize    si             ;; Used for tail bytes, 2-byte constants
  ;; X32 assembles the same routines as X64, and their tail loop stores all
  ;; of rax. The step must therefore be 8 bytes: with 04h, a request that
  ;; leaves 4..7 tail bytes would have 8 bytes written to it, running past
  ;; the end of the caller's buffer.
  %define MWSIZE   08h

%elifdef X64                      ;; Set via the command line
  %ifdef CYGWIN                   ;; Cygwin follows Windows ABI here, not Linux ABI
    %define buffer rcx            ;; Windows ABI
    %define bsize  rdx            ;; Windows ABI
    %define lsize  dx             ;; Used for tail bytes, 2-byte constants
  %else
    %define buffer rdi            ;; Linux ABI
    %define bsize  rsi            ;; Linux ABI
    %define lsize  si             ;; Used for tail bytes, 2-byte constants
  %endif
  %define MWSIZE   08h            ;; the 64-bit tail loop stores rax (8 bytes)

%else
  %error Missing or unknown architecture
%endif
|
|
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
;; Fixups
|
|
|
|
;; Exported-name fixups: DARWIN builds, and CYGWIN builds that are also X86,
;; export both routines with a leading underscore (presumably to match the
;; C symbol naming of those targets -- confirm against the toolchain).

%ifdef X86
  %ifdef CYGWIN
    %define NASM_RDSEED_GenerateBlock _NASM_RDSEED_GenerateBlock
    %define NASM_RDRAND_GenerateBlock _NASM_RDRAND_GenerateBlock
  %endif
%endif

%ifdef DARWIN
  %define NASM_RDSEED_GenerateBlock _NASM_RDSEED_GenerateBlock
  %define NASM_RDRAND_GenerateBlock _NASM_RDRAND_GenerateBlock
%endif
|
|
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
%ifdef X86                        ;; Set via the command line

;;---------------------------------------------------------------------
;; extern "C" void NASM_RDRAND_GenerateBlock(byte* ptr, size_t size);
;;
;; 32-bit build. Writes `size` bytes of RDRAND output starting at `ptr`.
;;   In:    arg1 = ptr, arg2 = size (both read from the stack)
;;   Out:   eax = 0 on return
;;   Uses:  eax, buffer, bsize (lsize is the low byte of bsize), flags
;; Every RDRAND is repeated until it sets CF; there is no retry limit,
;; so the routine only returns once all bytes have been produced.
;;---------------------------------------------------------------------

global NASM_RDRAND_GenerateBlock
section .text
align 8

NASM_RDRAND_GenerateBlock:

.read_args:
        mov     buffer, arg1
        mov     bsize, arg2

        ;; Sixteen bytes per pass appears to be the sweet spot; unrolling
        ;; further rdrand calls degrades performance.
        cmp     bsize, 16
        jb      .tail

.block16:                               ;; here bsize >= 16

.dword0:
        rdrand  eax
        jnc     .dword0                 ;; CF clear: no data yet, ask again
        mov     [buffer+0], eax

.dword1:
        rdrand  eax
        jnc     .dword1
        mov     [buffer+4], eax

.dword2:
        rdrand  eax
        jnc     .dword2
        mov     [buffer+8], eax

.dword3:
        rdrand  eax
        jnc     .dword3
        mov     [buffer+12], eax

        add     buffer, 16
        sub     bsize, 16

        cmp     bsize, 16
        jae     .block16

        ;; Fewer than 16 bytes remain, so the count fits in lsize.
.tail:
        test    lsize, lsize
        jz      .finish

.tail_draw:
        rdrand  eax
        jnc     .tail_draw

        cmp     lsize, MWSIZE
        jb      .tail_partial

        ;; A whole machine word still fits: store it and go around again.
        mov     [buffer], eax
        add     buffer, MWSIZE
        sub     lsize, MWSIZE
        jmp     .tail

        ;; 1, 2 or 3 bytes remain; bits 1 and 0 of lsize say which pieces.
.tail_partial:
        test    lsize, 2                ;; bit 1 set: at least 2 bytes left
        jz      .no_word

        mov     [buffer], ax
        shr     eax, 16                 ;; bring unused random bits into al
        add     buffer, 2

.no_word:
        test    lsize, 1                ;; bit 0 set: one last byte
        jz      .finish

        mov     [buffer], al

.finish:
        xor     eax, eax                ;; leave no random data in eax
        ret

%endif  ;; X86
|
|
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
;; NOTE(review): the condition below relies on %ifdef accepting a list of
;; identifiers (the word `or` is then just one more identifier that gets
;; tested) -- confirm against the NASM version in use.
%ifdef X64 or X32                 ;; Set via the command line

;;---------------------------------------------------------------------
;; extern "C" void NASM_RDRAND_GenerateBlock(byte* ptr, size_t size);
;;
;; 64-bit register build (X64 and X32). Writes `size` bytes of RDRAND
;; output starting at `ptr`.
;;   In:    buffer = ptr, bsize = size (already in registers on entry)
;;   Out:   rax = 0 on return
;;   Uses:  rax, buffer, bsize (lsize is the low word of bsize), flags
;; Every RDRAND is repeated until it sets CF; there is no retry limit,
;; so the routine only returns once all bytes have been produced.
;;---------------------------------------------------------------------

global NASM_RDRAND_GenerateBlock
section .text
align 16

NASM_RDRAND_GenerateBlock:

        ;; Nothing to load: both arguments arrive in registers.

        ;; Thirty-two bytes per pass appears to be the sweet spot;
        ;; unrolling further rdrand calls degrades performance.
        cmp     bsize, 32
        jb      .tail

.block32:                               ;; here bsize >= 32

.qword0:
        rdrand  rax
        jnc     .qword0                 ;; CF clear: no data yet, ask again
        mov     [buffer+0], rax

.qword1:
        rdrand  rax
        jnc     .qword1
        mov     [buffer+8], rax

.qword2:
        rdrand  rax
        jnc     .qword2
        mov     [buffer+16], rax

.qword3:
        rdrand  rax
        jnc     .qword3
        mov     [buffer+24], rax

        add     buffer, 32
        sub     bsize, 32

        cmp     bsize, 32
        jae     .block32

        ;; Fewer than 32 bytes remain, so the count fits in lsize.
.tail:
        test    lsize, lsize
        jz      .finish

.tail_draw:
        rdrand  rax
        jnc     .tail_draw

        cmp     lsize, MWSIZE
        jb      .tail_partial

        ;; This store always writes 8 bytes, so MWSIZE has to be 08h for
        ;; every configuration that assembles this routine (X64 and X32).
        mov     [buffer], rax
        add     buffer, MWSIZE
        sub     lsize, MWSIZE
        jmp     .tail

        ;; 1..7 bytes remain; bits 2, 1 and 0 of lsize say which pieces.
.tail_partial:
        test    lsize, 4                ;; bit 2 set: at least 4 bytes left
        jz      .no_dword

        mov     [buffer], eax
        shr     rax, 32                 ;; bring unused random bits into eax
        add     buffer, 4

.no_dword:
        test    lsize, 2                ;; bit 1 set: at least 2 bytes left
        jz      .no_word

        mov     [buffer], ax
        shr     eax, 16                 ;; bring unused random bits into al
        add     buffer, 2

.no_word:
        test    lsize, 1                ;; bit 0 set: one last byte
        jz      .finish

        mov     [buffer], al

.finish:
        xor     eax, eax                ;; clears all of rax; no random data left
        ret

%endif  ;; X64 or X32
|
|
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
%ifdef X86                        ;; Set via the command line

;;---------------------------------------------------------------------
;; RDSEED_RETRY32 dst
;; Executes RDSEED on register `dst` until the instruction reports
;; success (CF=1). A PAUSE is issued between attempts, as Intel's DRNG
;; guidance recommends for RDSEED retry loops. Clobbers: dst, flags.
;; The %%-labels are macro-local and do not disturb the routine's own
;; dot-labels.
;;---------------------------------------------------------------------
%macro RDSEED_RETRY32 1
%%again:
        rdseed  %1
        jc      %%ready
        pause
        jmp     %%again
%%ready:
%endmacro

;;---------------------------------------------------------------------
;; extern "C" void NASM_RDSEED_GenerateBlock(byte* ptr, size_t size);
;;
;; 32-bit build. Writes `size` bytes of RDSEED output starting at `ptr`.
;;   In:    arg1 = ptr, arg2 = size (both read from the stack)
;;   Out:   eax = 0 on return
;;   Uses:  eax, buffer, bsize (lsize is the low byte of bsize), flags
;; Every RDSEED is repeated until it succeeds; there is no retry limit,
;; so the routine only returns once all bytes have been produced.
;;---------------------------------------------------------------------

global NASM_RDSEED_GenerateBlock
section .text
align 8

NASM_RDSEED_GenerateBlock:

.read_args:
        mov     buffer, arg1
        mov     bsize, arg2

        ;; Same 16-byte unrolling as the RDRAND routine.
        cmp     bsize, 16
        jb      .tail

.block16:                               ;; here bsize >= 16
        RDSEED_RETRY32 eax
        mov     [buffer+0], eax

        RDSEED_RETRY32 eax
        mov     [buffer+4], eax

        RDSEED_RETRY32 eax
        mov     [buffer+8], eax

        RDSEED_RETRY32 eax
        mov     [buffer+12], eax

        sub     bsize, 16
        add     buffer, 16

        cmp     bsize, 16
        jae     .block16

        ;; Fewer than 16 bytes remain, so the count fits in lsize.
.tail:
        test    lsize, lsize
        jz      .finish

        RDSEED_RETRY32 eax

        cmp     lsize, MWSIZE
        jb      .tail_partial

        ;; A whole machine word still fits: store it and go around again.
        mov     [buffer], eax
        add     buffer, MWSIZE
        sub     lsize, MWSIZE
        jmp     .tail

        ;; 1, 2 or 3 bytes remain; bits 1 and 0 of lsize say which pieces.
.tail_partial:
        test    lsize, 2                ;; bit 1 set: at least 2 bytes left
        jz      .no_word

        mov     [buffer], ax
        shr     eax, 16                 ;; bring unused seed bits into al
        add     buffer, 2

.no_word:
        test    lsize, 1                ;; bit 0 set: one last byte
        jz      .finish

        mov     [buffer], al

.finish:
        xor     eax, eax                ;; leave no seed data in eax
        ret

%endif  ;; X86
|
|
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
;; NOTE(review): the condition below relies on %ifdef accepting a list of
;; identifiers (the word `or` is then just one more identifier that gets
;; tested) -- confirm against the NASM version in use.
%ifdef X64 or X32                 ;; Set via the command line

;;---------------------------------------------------------------------
;; RDSEED_RETRY64 dst
;; Executes RDSEED on register `dst` until the instruction reports
;; success (CF=1). A PAUSE is issued between attempts, as Intel's DRNG
;; guidance recommends for RDSEED retry loops. Clobbers: dst, flags.
;; The %%-labels are macro-local and do not disturb the routine's own
;; dot-labels.
;;---------------------------------------------------------------------
%macro RDSEED_RETRY64 1
%%again:
        rdseed  %1
        jc      %%ready
        pause
        jmp     %%again
%%ready:
%endmacro

;;---------------------------------------------------------------------
;; extern "C" void NASM_RDSEED_GenerateBlock(byte* ptr, size_t size);
;;
;; 64-bit register build (X64 and X32). Writes `size` bytes of RDSEED
;; output starting at `ptr`.
;;   In:    buffer = ptr, bsize = size (already in registers on entry)
;;   Out:   rax = 0 on return
;;   Uses:  rax, buffer, bsize (lsize is the low word of bsize), flags
;; Every RDSEED is repeated until it succeeds; there is no retry limit,
;; so the routine only returns once all bytes have been produced.
;;---------------------------------------------------------------------

global NASM_RDSEED_GenerateBlock
section .text
align 16

NASM_RDSEED_GenerateBlock:

        ;; Nothing to load: both arguments arrive in registers.

        ;; Same 32-byte unrolling as the RDRAND routine.
        cmp     bsize, 32
        jb      .tail

.block32:                               ;; here bsize >= 32
        RDSEED_RETRY64 rax
        mov     [buffer+0], rax

        RDSEED_RETRY64 rax
        mov     [buffer+8], rax

        RDSEED_RETRY64 rax
        mov     [buffer+16], rax

        RDSEED_RETRY64 rax
        mov     [buffer+24], rax

        sub     bsize, 32
        add     buffer, 32

        cmp     bsize, 32
        jae     .block32

        ;; Fewer than 32 bytes remain, so the count fits in lsize.
.tail:
        test    lsize, lsize
        jz      .finish

        RDSEED_RETRY64 rax

        cmp     lsize, MWSIZE
        jb      .tail_partial

        ;; This store always writes 8 bytes, so MWSIZE has to be 08h for
        ;; every configuration that assembles this routine (X64 and X32).
        mov     [buffer], rax
        add     buffer, MWSIZE
        sub     lsize, MWSIZE
        jmp     .tail

        ;; 1..7 bytes remain; bits 2, 1 and 0 of lsize say which pieces.
.tail_partial:
        test    lsize, 4                ;; bit 2 set: at least 4 bytes left
        jz      .no_dword

        mov     [buffer], eax
        shr     rax, 32                 ;; bring unused seed bits into eax
        add     buffer, 4

.no_dword:
        test    lsize, 2                ;; bit 1 set: at least 2 bytes left
        jz      .no_word

        mov     [buffer], ax
        shr     eax, 16                 ;; bring unused seed bits into al
        add     buffer, 2

.no_word:
        test    lsize, 1                ;; bit 0 set: one last byte
        jz      .finish

        mov     [buffer], al

.finish:
        xor     eax, eax                ;; clears all of rax; no seed data left
        ret

%endif  ;; X64 or X32
|
|
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|