GS: We'll use assembly where needed, and...

Inline with a maximum of 5 consecutive lines
This commit is contained in:
Correia 2024-04-23 23:18:52 -03:00
parent eee64ca34d
commit 373ac55e0a
4 changed files with 64 additions and 102 deletions

View File

@ -1,12 +1,12 @@
# Minimum CMake version needed to configure this project.
cmake_minimum_required(VERSION 3.22.1)
# NOTE(review): the two project() calls below look like the before/after pair of this diff hunk;
# the second one additionally enables the ASM language. Confirm only one remains in the real file.
project(CosmicEmu LANGUAGES CXX VERSION 0.0)
project(CosmicEmu LANGUAGES CXX ASM VERSION 0.0)
# Request C++20 and make the standard a hard requirement instead of letting CMake fall back to an older one.
set(CMAKE_CXX_STANDARD 20)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
# Flags applied to every C++ translation unit regardless of build type.
# NOTE(review): the two assignments below also look like a before/after pair; the second changes
# -march from "armv8-a+crc" to "armv8". Each set() overwrites any previous CMAKE_CXX_FLAGS value.
set(CMAKE_CXX_FLAGS "-Wall -Wno-sign-conversion -march=armv8-a+crc -fvisibility=hidden")
set(CMAKE_CXX_FLAGS "-Wall -Wno-sign-conversion -march=armv8 -fvisibility=hidden")
# Release builds: define NDEBUG, optimize with -Ofast, drop stack protector and frame pointer, use full LTO.
set(CMAKE_CXX_FLAGS_RELEASE "-DNDEBUG -Ofast -fno-stack-protector -fomit-frame-pointer -flto=full")
# Debug builds: LLDB-tuned debug info, strong stack protector, no optimization.
set(CMAKE_CXX_FLAGS_DEBUG "-glldb -fstack-protector-strong -O0")
@ -56,7 +56,7 @@ target_sources(cosmic PRIVATE
${COSMIC_DIR}/gs/synth_tables.cpp
${COSMIC_DIR}/gs/synth_engine.cpp
${COSMIC_DIR}/gs/gif_bridge.cpp
${COSMIC_DIR}/gs/transfer_queue.cpp
${COSMIC_DIR}/gs/transfer_queue.S
${COSMIC_DIR}/gs/gif_packed.cpp
${COSMIC_DIR}/vu/vecu.cpp
${COSMIC_DIR}/vu/vu_time.cpp

View File

@ -1,30 +0,0 @@
#pragma once
// Forward declaration only; no definition of Asm appears in this header.
class [[maybe_unused]] Asm;
// Empty class with no members.
// NOTE(review): judging by its name, it presumably exists only so that GitHub's language
// detection classifies this header as C++ — confirm before relying on it.
class [[maybe_unused]] HeyGithubThisIsACPPFILE {
};
// PROLOGUE_ASM(alloc): emits `stp x29, x30, [sp, #-alloc]!`.
// The pre-indexed store first lowers sp by `alloc` bytes, then saves x29 and x30 at the new sp.
// `alloc` is stringified into the instruction, so it must be a literal the assembler accepts.
#define PROLOGUE_ASM(alloc)\
__asm volatile("stp x29, x30, [sp, #-" #alloc "]!\n")
// EPILOGUE_ASM(free): emits `ldp x29, x30, [sp], #free`.
// The post-indexed load restores x29 and x30 from sp, then raises sp by `free` bytes.
// Pair it with a PROLOGUE_ASM of the same size to leave sp unchanged.
#define EPILOGUE_ASM(free)\
__asm volatile("ldp x29, x30, [sp], #" #free "\n")
// Straight from the Linux kernel: https://github.com/torvalds/linux/blob/master/arch/arm64/include/asm/assembler.h
// ADR_LTO_A64(dst, sym): materializes the address of `sym` in register `dst`.
// `adrp` loads the 4 KiB page address of the symbol, and the `add` with `:lo12:` adds the
// low 12 bits of its address. Both arguments are stringified into the instruction text.
#define ADR_LTO_A64(dst, sym)\
__asm volatile( \
"adrp "#dst", "#sym"\n" \
"add "#dst", "#dst", :lo12:" #sym)
/*
#define LDR_LTO_A64(dst, sym, tmp)\
__asm volatile( \
"adrp "#tmp", "#sym"\n" \
"ldr "#dst", ["#tmp", :lo12:" #sym "]")
*/
// STR_LTO_A64(src, sym, tmp): stores register `src` to the memory location named `sym`.
// `adrp` puts the symbol's 4 KiB page address in the scratch register `tmp`, and the `str`
// adds the low 12 bits through the `:lo12:` offset. `tmp` is overwritten.
#define STR_LTO_A64(src, sym, tmp)\
__asm volatile( \
"adrp "#tmp", "#sym"\n" \
"str "#src", ["#tmp", :lo12:" #sym"]\n")

View File

@ -0,0 +1,61 @@
// State of the GIF transfer queue.
//   gSize      : 32-bit element count, starts at 0
//   gResetDone : one-byte flag, starts at 0
//   gBackPtr   : 64-bit value, starts at 0
//   gFrontPtr  : 64-bit value, starts at 0
//   gQueue     : 16 slots of 16 bytes each, zero-initialized (.bss)
//
// `.balign` takes a byte count. On AArch64 a plain `.align n` is interpreted as
// 2^n, so the former `.align 8` / `.align 16` requested 256-byte and 64 KiB
// alignment instead of 8 and 16 bytes.
    .data
gSize:      .int 0
gResetDone: .byte 0
    .balign 8
gBackPtr:   .quad 0
gFrontPtr:  .quad 0
    .bss
    .balign 16
gQueue:     .space 16 * 16
// void gifQueueReset(void)
// Clears gBackPtr and gSize, zero-fills all 16 slots (16 bytes each) of gQueue,
// then sets gResetDone to 1.
// Clobbers: x10, x11, x12, v0 and the condition flags.
// NOTE(review): the symbol is exported unmangled, so C++ callers need an
// `extern "C"` declaration — confirm against the call sites.
    .text                               // code must not be assembled into .bss
    .balign 4
    .global gifQueueReset
    .type gifQueueReset, %function
gifQueueReset:
    stp x29, x30, [sp, #-16]!           // pre-index writeback keeps sp balanced with the ldp below
    movi v0.16b, #0                     // v0 = 128 bits of zero

    adrp x10, gBackPtr                  // adrp/add reaches symbols beyond adr's +/-1 MiB range
    add x10, x10, :lo12:gBackPtr
    str xzr, [x10]                      // gBackPtr = 0

    adrp x10, gSize
    add x10, x10, :lo12:gSize
    str wzr, [x10]                      // gSize = 0

    adrp x10, gQueue
    add x10, x10, :lo12:gQueue
    mov x12, #0                         // slot index
.LgifQueueResetClear:
    add x11, x10, x12, lsl #4           // x11 = &gQueue[x12 * 16]
    st1 {v0.16b}, [x11]                 // ((u128*)gQueue)[x12] = 0
    add x12, x12, #1
    cmp x12, #16
    b.ne .LgifQueueResetClear           // loop until all 16 slots are cleared

    adrp x10, gResetDone
    add x10, x10, :lo12:gResetDone
    mov w11, #1
    strb w11, [x10]                     // gResetDone = 1

    ldp x29, x30, [sp], #16
    ret
// gifQueueSize(): returns the 32-bit value stored in gSize in w0.
// Before reading it, prefetches gQueue at four consecutive 64-byte strides
// (4 * 64 = 256 bytes, the whole queue) with a PLDL2KEEP hint, since the queue
// is about to be accessed.
// Clobbers: x9, x11, x12 and the condition flags.
// NOTE(review): the symbol is exported unmangled, so C++ callers need an
// `extern "C"` declaration — confirm against the call sites.
    .text                               // code must not be assembled into .bss
    .balign 4
    .global gifQueueSize
    .type gifQueueSize, %function
gifQueueSize:
    stp x29, x30, [sp, #-16]!           // pre-index writeback keeps sp balanced with the ldp below

    adrp x9, gQueue                     // adrp/add reaches symbols beyond adr's +/-1 MiB range
    add x9, x9, :lo12:gQueue
    mov x11, #0                         // stride index
.LgifQueueSizePrefetch:
    add x12, x9, x11, lsl #6            // x12 = (u8*)gQueue + x11 * 64
    prfm pldl2keep, [x12]
    add x11, x11, #1
    cmp x11, #4
    b.ne .LgifQueueSizePrefetch         // four strides cover the queue

    adrp x0, gSize
    add x0, x0, :lo12:gSize
    ldr w0, [x0]                        // return gSize

    ldp x29, x30, [sp], #16
    ret

View File

@ -1,69 +0,0 @@
#include <common/types.h>
#include <os/neon_simd.h>
#include <common/asm_c++out.h>
namespace cosmic::gs {
// File-scope assembly that defines the queue state as raw assembler symbols:
//   gSize                - 32-bit value (.int), initially 0
//   gResetDone           - single byte, initially 0
//   gBackPtr / gFrontPtr - 64-bit values (.quad), initially 0
//   gQueue               - 16 * 16 = 256 bytes reserved in .bss
// NOTE(review): AArch64 assemblers normally treat `.align n` as a power-of-two exponent, so
// `.align 8` / `.align 16` would mean 256-byte / 64 KiB alignment — confirm the intent.
// NOTE(review): the block ends with `.bss` still selected as the current section.
__asm(
".data\n"
"gSize: .int 0\n"
"gResetDone: .byte 0\n"
".align 8\n"
"gBackPtr: .quad 0\n"
"gFrontPtr: .quad 0\n"
".bss\n"
".align 16\n"
"gQueue: .space 16 * 16");
// Resets the queue state: stores 0 to gBackPtr and gSize, writes a zeroed 16-byte vector to
// each of the 16 slots of gQueue, then stores 1 to gResetDone.
// The function is naked, so the body consists solely of the assembly below, including its own
// prologue, epilogue and `ret`.
[[gnu::naked]] void gifQueueReset() {
// Save x29/x30 and lower sp by 16.
PROLOGUE_ASM(16);
__asm("eor v0.16b, v0.16b, v0.16b\n" // v0 = all-zero 128-bit vector
"mov x9, 0\n"
"mov x0, 0\n");
// gBackPtr = x9 (0); x0 is used as the scratch address register.
STR_LTO_A64(x9, gBackPtr, x0);
// x10 = &gSize
ADR_LTO_A64(x10, gSize);
__asm("eor x0, x0, x0\n"
"str w0, [x10]\n" // gSize = 0
"mov w12, 0\n"); // w12 = slot index
// x10 = &gQueue
ADR_LTO_A64(x10, gQueue);
__asm("cleanUp:\n"
"mov w9, w12\n"
"lsl x9, x9, #4\n" // byte offset = index * 16
"add x11, x10, x9\n"
"st1 {v0.16b}, [x11]\n" // ((u128*)gQueue)[w12] = v0
"add w12, w12, #1\n"
"sub w11, w12, #16\n" // w11 == 0 once 16 slots have been written
// NOTE(review): the branch target is an immediate; presumably meant as PC+8 to skip the
// following `b cleanUp` and leave the loop — assembler-dependent, confirm.
"cbz w11, #8\n"
"b cleanUp\n");
// x10 = &gResetDone
ADR_LTO_A64(x10, gResetDone);
__asm("mov w11, #1\n"
"strb w11, [x10]\n"); // gResetDone = 1
// Restore x29/x30 and raise sp by 16.
EPILOGUE_ASM(16);
__asm("ret");
}
// Issues four L2 prefetch hints over gQueue at 64-byte strides, then returns the value loaded
// from gSize in w0. Naked function: the body consists solely of the assembly below.
[[gnu::naked]] u8 gifQueueSize() {
// Save x29/x30 and lower sp by 16.
PROLOGUE_ASM(16);
// We can pre-load the array values into the L2 cache since we'll be accessing it shortly
__asm("mov x10, 0\n"
"mov w11, 0\n"); // w11 = stride index
// x9 = &gQueue
ADR_LTO_A64(x9, gQueue);
__asm("loadIntoL2:\n"
// prefetch address = (u8*)gQueue + w11 * 64
"mov x10, x11\n"
"lsl x10, x10, #6\n"
"add x12, x9, x10\n"
"prfm pldl2keep, [x12]\n"
"add w11, w11, #1\n"
"sub w10, w11, #4\n" // w10 == 0 after four prefetches
// NOTE(review): immediate branch target, presumably PC+8 to skip `b loadIntoL2` — confirm.
"cbz w10, #0x8\n"
"b loadIntoL2\n");
// x0 = &gSize
ADR_LTO_A64(x0, gSize);
__asm("ldr w0, [x0]\n"); // return value: 32-bit load of gSize
// Restore x29/x30 and raise sp by 16.
EPILOGUE_ASM(16);
__asm("ret");
}
}