From 21014eb5f25c1f39a9acf22762fa4715464bdf37 Mon Sep 17 00:00:00 2001 From: Erik Abair Date: Sun, 19 Jun 2022 09:24:15 -0700 Subject: [PATCH] Switches to using a dispatch table for emulated instructions. --- src/nv2a_vsh_cpu.c | 204 ++++++++++++++++++++-------------------- src/nv2a_vsh_cpu.h | 52 +++++----- src/nv2a_vsh_emulator.c | 69 +++++++------- 3 files changed, 160 insertions(+), 165 deletions(-) diff --git a/src/nv2a_vsh_cpu.c b/src/nv2a_vsh_cpu.c index 7e14b8d..edc076b 100644 --- a/src/nv2a_vsh_cpu.c +++ b/src/nv2a_vsh_cpu.c @@ -3,138 +3,139 @@ #include #include -void nv2a_vsh_cpu_mov(Nv2aVshRegister *out, const Nv2aVshRegister *a) { - memcpy(out, a, sizeof(*out)); +void nv2a_vsh_cpu_mov(Nv2aVshRegister *out, const Nv2aVshRegister *inputs) { + memcpy(out, inputs, sizeof(*out)); } -void nv2a_vsh_cpu_arl(Nv2aVshRegister *out, const Nv2aVshRegister *a) { - float val = floorf(a->reg.x + 0.001f); +void nv2a_vsh_cpu_arl(Nv2aVshRegister *out, const Nv2aVshRegister *inputs) { + float val = floorf(inputs->reg.x + 0.001f); out->reg.x = val; out->reg.y = val; out->reg.z = val; out->reg.w = val; } -void nv2a_vsh_cpu_mul(Nv2aVshRegister *out, const Nv2aVshRegister *a, - const Nv2aVshRegister *b) { - out->reg.x = a->reg.x * b->reg.x; - out->reg.y = a->reg.y * b->reg.y; - out->reg.z = a->reg.z * b->reg.z; - out->reg.w = a->reg.w * b->reg.w; +void nv2a_vsh_cpu_mul(Nv2aVshRegister *out, const Nv2aVshRegister *inputs) { + out->reg.x = inputs[0].reg.x * inputs[1].reg.x; + out->reg.y = inputs[0].reg.y * inputs[1].reg.y; + out->reg.z = inputs[0].reg.z * inputs[1].reg.z; + out->reg.w = inputs[0].reg.w * inputs[1].reg.w; } -void nv2a_vsh_cpu_add(Nv2aVshRegister *out, const Nv2aVshRegister *a, - const Nv2aVshRegister *b) { - out->reg.x = a->reg.x + b->reg.x; - out->reg.y = a->reg.y + b->reg.y; - out->reg.z = a->reg.z + b->reg.z; - out->reg.w = a->reg.w + b->reg.w; +void nv2a_vsh_cpu_add(Nv2aVshRegister *out, const Nv2aVshRegister *inputs) { + out->reg.x = inputs[0].reg.x + 
inputs[1].reg.x; + out->reg.y = inputs[0].reg.y + inputs[1].reg.y; + out->reg.z = inputs[0].reg.z + inputs[1].reg.z; + out->reg.w = inputs[0].reg.w + inputs[1].reg.w; } -void nv2a_vsh_cpu_mad(Nv2aVshRegister *out, const Nv2aVshRegister *a, - const Nv2aVshRegister *b, const Nv2aVshRegister *c) { - out->reg.x = a->reg.x * b->reg.x + c->reg.x; - out->reg.y = a->reg.y * b->reg.y + c->reg.y; - out->reg.z = a->reg.z * b->reg.z + c->reg.z; - out->reg.w = a->reg.w * b->reg.w + c->reg.w; +void nv2a_vsh_cpu_mad(Nv2aVshRegister *out, const Nv2aVshRegister *inputs) { + out->reg.x = inputs[0].reg.x * inputs[1].reg.x + inputs[2].reg.x; + out->reg.y = inputs[0].reg.y * inputs[1].reg.y + inputs[2].reg.y; + out->reg.z = inputs[0].reg.z * inputs[1].reg.z + inputs[2].reg.z; + out->reg.w = inputs[0].reg.w * inputs[1].reg.w + inputs[2].reg.w; } -void nv2a_vsh_cpu_dp3(Nv2aVshRegister *out, const Nv2aVshRegister *a, - const Nv2aVshRegister *b) { +void nv2a_vsh_cpu_dp3(Nv2aVshRegister *out, const Nv2aVshRegister *inputs) { + float result = inputs[0].reg.x * inputs[1].reg.x + + inputs[0].reg.y * inputs[1].reg.y + + inputs[0].reg.z * inputs[1].reg.z; + out->reg.x = result; + out->reg.y = result; + out->reg.z = result; + out->reg.w = result; +} + +void nv2a_vsh_cpu_dph(Nv2aVshRegister *out, const Nv2aVshRegister *inputs) { + float result = inputs[0].reg.x * inputs[1].reg.x + + inputs[0].reg.y * inputs[1].reg.y + + inputs[0].reg.z * inputs[1].reg.z + inputs[1].reg.w; + out->reg.x = result; + out->reg.y = result; + out->reg.z = result; + out->reg.w = result; +} + +void nv2a_vsh_cpu_dp4(Nv2aVshRegister *out, const Nv2aVshRegister *inputs) { float result = - a->reg.x * b->reg.x + a->reg.y * b->reg.y + a->reg.z * b->reg.z; + inputs[0].reg.x * inputs[1].reg.x + inputs[0].reg.y * inputs[1].reg.y + + inputs[0].reg.z * inputs[1].reg.z + inputs[0].reg.w * inputs[1].reg.w; out->reg.x = result; out->reg.y = result; out->reg.z = result; out->reg.w = result; } -void nv2a_vsh_cpu_dph(Nv2aVshRegister *out, 
const Nv2aVshRegister *a, - const Nv2aVshRegister *b) { - float result = a->reg.x * b->reg.x + a->reg.y * b->reg.y + - a->reg.z * b->reg.z + b->reg.w; - out->reg.x = result; - out->reg.y = result; - out->reg.z = result; - out->reg.w = result; -} - -void nv2a_vsh_cpu_dp4(Nv2aVshRegister *out, const Nv2aVshRegister *a, - const Nv2aVshRegister *b) { - float result = a->reg.x * b->reg.x + a->reg.y * b->reg.y + - a->reg.z * b->reg.z + a->reg.w * b->reg.w; - out->reg.x = result; - out->reg.y = result; - out->reg.z = result; - out->reg.w = result; -} - -void nv2a_vsh_cpu_dst(Nv2aVshRegister *out, const Nv2aVshRegister *a, - const Nv2aVshRegister *b) { +void nv2a_vsh_cpu_dst(Nv2aVshRegister *out, const Nv2aVshRegister *inputs) { out->reg.x = 1.0f; - out->reg.y = a->reg.y * b->reg.y; - out->reg.z = a->reg.z; - out->reg.w = b->reg.w; + out->reg.y = inputs[0].reg.y * inputs[1].reg.y; + out->reg.z = inputs[0].reg.z; + out->reg.w = inputs[1].reg.w; } -void nv2a_vsh_cpu_min(Nv2aVshRegister *out, const Nv2aVshRegister *a, - const Nv2aVshRegister *b) { - out->reg.x = a->reg.x < b->reg.x ? a->reg.x : b->reg.x; - out->reg.y = a->reg.y < b->reg.y ? a->reg.y : b->reg.y; - out->reg.z = a->reg.z < b->reg.z ? a->reg.z : b->reg.z; - out->reg.w = a->reg.w < b->reg.w ? a->reg.w : b->reg.w; +void nv2a_vsh_cpu_min(Nv2aVshRegister *out, const Nv2aVshRegister *inputs) { + out->reg.x = + inputs[0].reg.x < inputs[1].reg.x ? inputs[0].reg.x : inputs[1].reg.x; + out->reg.y = + inputs[0].reg.y < inputs[1].reg.y ? inputs[0].reg.y : inputs[1].reg.y; + out->reg.z = + inputs[0].reg.z < inputs[1].reg.z ? inputs[0].reg.z : inputs[1].reg.z; + out->reg.w = + inputs[0].reg.w < inputs[1].reg.w ? inputs[0].reg.w : inputs[1].reg.w; } -void nv2a_vsh_cpu_max(Nv2aVshRegister *out, const Nv2aVshRegister *a, - const Nv2aVshRegister *b) { - out->reg.x = a->reg.x > b->reg.x ? a->reg.x : b->reg.x; - out->reg.y = a->reg.y > b->reg.y ? a->reg.y : b->reg.y; - out->reg.z = a->reg.z > b->reg.z ? 
a->reg.z : b->reg.z; - out->reg.w = a->reg.w > b->reg.w ? a->reg.w : b->reg.w; +void nv2a_vsh_cpu_max(Nv2aVshRegister *out, const Nv2aVshRegister *inputs) { + out->reg.x = + inputs[0].reg.x > inputs[1].reg.x ? inputs[0].reg.x : inputs[1].reg.x; + out->reg.y = + inputs[0].reg.y > inputs[1].reg.y ? inputs[0].reg.y : inputs[1].reg.y; + out->reg.z = + inputs[0].reg.z > inputs[1].reg.z ? inputs[0].reg.z : inputs[1].reg.z; + out->reg.w = + inputs[0].reg.w > inputs[1].reg.w ? inputs[0].reg.w : inputs[1].reg.w; } -void nv2a_vsh_cpu_slt(Nv2aVshRegister *out, const Nv2aVshRegister *a, - const Nv2aVshRegister *b) { - out->reg.x = a->reg.x < b->reg.x ? 1.0f : 0.0f; - out->reg.y = a->reg.y < b->reg.y ? 1.0f : 0.0f; - out->reg.z = a->reg.z < b->reg.z ? 1.0f : 0.0f; - out->reg.w = a->reg.w < b->reg.w ? 1.0f : 0.0f; +void nv2a_vsh_cpu_slt(Nv2aVshRegister *out, const Nv2aVshRegister *inputs) { + out->reg.x = inputs[0].reg.x < inputs[1].reg.x ? 1.0f : 0.0f; + out->reg.y = inputs[0].reg.y < inputs[1].reg.y ? 1.0f : 0.0f; + out->reg.z = inputs[0].reg.z < inputs[1].reg.z ? 1.0f : 0.0f; + out->reg.w = inputs[0].reg.w < inputs[1].reg.w ? 1.0f : 0.0f; } -void nv2a_vsh_cpu_sge(Nv2aVshRegister *out, const Nv2aVshRegister *a, - const Nv2aVshRegister *b) { - out->reg.x = a->reg.x >= b->reg.x ? 1.0f : 0.0f; - out->reg.y = a->reg.y >= b->reg.y ? 1.0f : 0.0f; - out->reg.z = a->reg.z >= b->reg.z ? 1.0f : 0.0f; - out->reg.w = a->reg.w >= b->reg.w ? 1.0f : 0.0f; +void nv2a_vsh_cpu_sge(Nv2aVshRegister *out, const Nv2aVshRegister *inputs) { + out->reg.x = inputs[0].reg.x >= inputs[1].reg.x ? 1.0f : 0.0f; + out->reg.y = inputs[0].reg.y >= inputs[1].reg.y ? 1.0f : 0.0f; + out->reg.z = inputs[0].reg.z >= inputs[1].reg.z ? 1.0f : 0.0f; + out->reg.w = inputs[0].reg.w >= inputs[1].reg.w ? 1.0f : 0.0f; } -void nv2a_vsh_cpu_rcp(Nv2aVshRegister *out, const Nv2aVshRegister *a) { +void nv2a_vsh_cpu_rcp(Nv2aVshRegister *out, const Nv2aVshRegister *inputs) { float result = - (a->reg.x == 1.0f ? 
1.0f - : (a->reg.x == 0.0f ? INFINITY : 1.0f / a->reg.x)); + (inputs[0].reg.x == 1.0f + ? 1.0f + : (inputs[0].reg.x == 0.0f ? INFINITY : 1.0f / inputs[0].reg.x)); out->reg.x = result; out->reg.y = result; out->reg.z = result; out->reg.w = result; } -void nv2a_vsh_cpu_rcc(Nv2aVshRegister *out, const Nv2aVshRegister *a) { +void nv2a_vsh_cpu_rcc(Nv2aVshRegister *out, const Nv2aVshRegister *inputs) { // TODO: Validate this on HW. float result; - if (a->reg.x == 1.0f) { + if (inputs[0].reg.x == 1.0f) { result = 1.0f; } else { - if (a->reg.x < -1.84467e19f) { + if (inputs[0].reg.x < -1.84467e19f) { result = 1.0f / -1.84467e19f; - } else if (a->reg.x > -5.42101e-20f && a->reg.x < 0.0f) { + } else if (inputs[0].reg.x > -5.42101e-20f && inputs[0].reg.x < 0.0f) { result = 1.0f / -5.42101e-020f; - } else if (a->reg.x >= 0 && a->reg.x < 5.42101e-20f) { + } else if (inputs[0].reg.x >= 0 && inputs[0].reg.x < 5.42101e-20f) { result = 1.0f / 5.42101e-20f; - } else if (a->reg.x > 1.84467e+19f) { + } else if (inputs[0].reg.x > 1.84467e+19f) { result = 1.0f / 1.84467e+19f; } else { - result = 1.0f / a->reg.x; + result = 1.0f / inputs[0].reg.x; } } out->reg.x = result; @@ -143,27 +144,28 @@ void nv2a_vsh_cpu_rcc(Nv2aVshRegister *out, const Nv2aVshRegister *a) { out->reg.w = result; } -void nv2a_vsh_cpu_rsq(Nv2aVshRegister *out, const Nv2aVshRegister *a) { - float result = (a->reg.x == 1.0f - ? 1.0f - : (a->reg.x == 0.0f ? INFINITY : 1.0f / sqrtf(a->reg.x))); +void nv2a_vsh_cpu_rsq(Nv2aVshRegister *out, const Nv2aVshRegister *inputs) { + float result = + (inputs->reg.x == 1.0f + ? 1.0f + : (inputs->reg.x == 0.0f ? 
INFINITY : 1.0f / sqrtf(inputs->reg.x))); out->reg.x = result; out->reg.y = result; out->reg.z = result; out->reg.w = result; } -void nv2a_vsh_cpu_exp(Nv2aVshRegister *out, const Nv2aVshRegister *a) { - float tmp = floorf(a->reg.x); +void nv2a_vsh_cpu_exp(Nv2aVshRegister *out, const Nv2aVshRegister *inputs) { + float tmp = floorf(inputs->reg.x); out->reg.x = powf(2.0f, tmp); - out->reg.y = a->reg.x - tmp; - out->reg.z = powf(2.0f, a->reg.x); + out->reg.y = inputs->reg.x - tmp; + out->reg.z = powf(2.0f, inputs->reg.x); out->reg.w = 1.0f; } -void nv2a_vsh_cpu_log(Nv2aVshRegister *out, const Nv2aVshRegister *a) { +void nv2a_vsh_cpu_log(Nv2aVshRegister *out, const Nv2aVshRegister *inputs) { // TODO: Validate this on HW. - float tmp = fabsf(a->reg.x); + float tmp = fabsf(inputs->reg.x); if (tmp == 0.0f) { out->reg.x = -INFINITY; out->reg.y = 1.0f; @@ -182,7 +184,7 @@ void nv2a_vsh_cpu_log(Nv2aVshRegister *out, const Nv2aVshRegister *a) { out->reg.w = 1.0f; } -void nv2a_vsh_cpu_lit(Nv2aVshRegister *out, const Nv2aVshRegister *a) { +void nv2a_vsh_cpu_lit(Nv2aVshRegister *out, const Nv2aVshRegister *inputs) { static const float kMax = 127.9961f; out->reg.x = 1.0f; @@ -190,11 +192,13 @@ void nv2a_vsh_cpu_lit(Nv2aVshRegister *out, const Nv2aVshRegister *a) { out->reg.z = 0.0f; out->reg.w = 1.0f; - float power = a->reg.w < -kMax ? -kMax : (a->reg.w > kMax ? kMax : a->reg.w); - if (a->reg.x > 0.0f) { - out->reg.y = a->reg.x; - if (a->reg.y > 0.0f) { - out->reg.z = powf(a->reg.y, power); + float power = inputs->reg.w < -kMax + ? -kMax + : (inputs->reg.w > kMax ? 
kMax : inputs->reg.w); + if (inputs->reg.x > 0.0f) { + out->reg.y = inputs->reg.x; + if (inputs->reg.y > 0.0f) { + out->reg.z = powf(inputs->reg.y, power); } } } diff --git a/src/nv2a_vsh_cpu.h b/src/nv2a_vsh_cpu.h index 9befb18..f2497ca 100644 --- a/src/nv2a_vsh_cpu.h +++ b/src/nv2a_vsh_cpu.h @@ -17,38 +17,28 @@ typedef union Nv2aVshRegister_ { float raw[4]; } Nv2aVshRegister; -#define OP_1(name) \ - void nv2a_vsh_cpu_##name(Nv2aVshRegister *out, const Nv2aVshRegister *a) -#define OP_2(name) \ - void nv2a_vsh_cpu_##name(Nv2aVshRegister *out, const Nv2aVshRegister *a, \ - const Nv2aVshRegister *b) -#define OP_3(name) \ - void nv2a_vsh_cpu_##name(Nv2aVshRegister *out, const Nv2aVshRegister *a, \ - const Nv2aVshRegister *b, const Nv2aVshRegister *c) +typedef void (*Nv2aVshCpuFunc)(Nv2aVshRegister *out, + const Nv2aVshRegister *inputs); -OP_1(mov); -OP_1(arl); -OP_2(mul); -OP_2(add); -OP_3(mad); -OP_2(dp3); -OP_2(dph); -OP_2(dp4); -OP_2(dst); -OP_2(min); -OP_2(max); -OP_2(slt); -OP_2(sge); -OP_1(rcp); -OP_1(rcc); -OP_1(rsq); -OP_1(exp); -OP_1(log); -OP_1(lit); - -#undef OP_1 -#undef OP_2 -#undef OP_3 +void nv2a_vsh_cpu_mov(Nv2aVshRegister *out, const Nv2aVshRegister *inputs); +void nv2a_vsh_cpu_arl(Nv2aVshRegister *out, const Nv2aVshRegister *inputs); +void nv2a_vsh_cpu_mul(Nv2aVshRegister *out, const Nv2aVshRegister *inputs); +void nv2a_vsh_cpu_add(Nv2aVshRegister *out, const Nv2aVshRegister *inputs); +void nv2a_vsh_cpu_mad(Nv2aVshRegister *out, const Nv2aVshRegister *inputs); +void nv2a_vsh_cpu_dp3(Nv2aVshRegister *out, const Nv2aVshRegister *inputs); +void nv2a_vsh_cpu_dph(Nv2aVshRegister *out, const Nv2aVshRegister *inputs); +void nv2a_vsh_cpu_dp4(Nv2aVshRegister *out, const Nv2aVshRegister *inputs); +void nv2a_vsh_cpu_dst(Nv2aVshRegister *out, const Nv2aVshRegister *inputs); +void nv2a_vsh_cpu_min(Nv2aVshRegister *out, const Nv2aVshRegister *inputs); +void nv2a_vsh_cpu_max(Nv2aVshRegister *out, const Nv2aVshRegister *inputs); +void 
nv2a_vsh_cpu_slt(Nv2aVshRegister *out, const Nv2aVshRegister *inputs); +void nv2a_vsh_cpu_sge(Nv2aVshRegister *out, const Nv2aVshRegister *inputs); +void nv2a_vsh_cpu_rcp(Nv2aVshRegister *out, const Nv2aVshRegister *inputs); +void nv2a_vsh_cpu_rcc(Nv2aVshRegister *out, const Nv2aVshRegister *inputs); +void nv2a_vsh_cpu_rsq(Nv2aVshRegister *out, const Nv2aVshRegister *inputs); +void nv2a_vsh_cpu_exp(Nv2aVshRegister *out, const Nv2aVshRegister *inputs); +void nv2a_vsh_cpu_log(Nv2aVshRegister *out, const Nv2aVshRegister *inputs); +void nv2a_vsh_cpu_lit(Nv2aVshRegister *out, const Nv2aVshRegister *inputs); #ifdef __cplusplus }; // extern "C" diff --git a/src/nv2a_vsh_emulator.c b/src/nv2a_vsh_emulator.c index 2394fa2..695f477 100644 --- a/src/nv2a_vsh_emulator.c +++ b/src/nv2a_vsh_emulator.c @@ -3,6 +3,31 @@ #include #include +// clang-format off +static Nv2aVshCpuFunc kDispatchTable[] = { + NULL, + nv2a_vsh_cpu_mov, + nv2a_vsh_cpu_mul, + nv2a_vsh_cpu_add, + nv2a_vsh_cpu_mad, + nv2a_vsh_cpu_dp3, + nv2a_vsh_cpu_dph, + nv2a_vsh_cpu_dp4, + nv2a_vsh_cpu_dst, + nv2a_vsh_cpu_min, + nv2a_vsh_cpu_max, + nv2a_vsh_cpu_slt, + nv2a_vsh_cpu_sge, + nv2a_vsh_cpu_arl, + nv2a_vsh_cpu_rcp, + nv2a_vsh_cpu_rcc, + nv2a_vsh_cpu_rsq, + nv2a_vsh_cpu_exp, + nv2a_vsh_cpu_log, + nv2a_vsh_cpu_lit, +}; +// clang-format on + static inline void set_register(Nv2aVshRegister *out, const Nv2aVshRegister *in, const uint8_t *swizzle, bool negate) { float mult = negate ? 
-1.0f : 1.0f; @@ -48,37 +73,13 @@ static inline void fetch_value(Nv2aVshRegister *out, static inline void apply_operation(Nv2aVshExecutionState *state, const Nv2aVshOperation *op, const Nv2aVshRegister *inputs) { - Nv2aVshRegister output; - - switch (op->opcode) { - case NV2AOP_NOP: - return; - - case NV2AOP_MOV: - nv2a_vsh_cpu_mov(&output, inputs); - break; - - case NV2AOP_MUL: - case NV2AOP_ADD: - case NV2AOP_MAD: - case NV2AOP_DP3: - case NV2AOP_DPH: - case NV2AOP_DP4: - case NV2AOP_DST: - case NV2AOP_MIN: - case NV2AOP_MAX: - case NV2AOP_SLT: - case NV2AOP_SGE: - case NV2AOP_ARL: - case NV2AOP_RCP: - case NV2AOP_RCC: - case NV2AOP_RSQ: - case NV2AOP_EXP: - case NV2AOP_LOG: - case NV2AOP_LIT: - break; + if (op->opcode == NV2AOP_NOP) { + return; } + Nv2aVshRegister result; + kDispatchTable[op->opcode](&result, inputs); + const Nv2aVshOutput *out = op->outputs; for (uint32_t i = 0; i < 2; ++i, ++out) { Nv2aVshRegister *outreg; @@ -90,7 +91,7 @@ static inline void apply_operation(Nv2aVshExecutionState *state, continue; case NV2ART_OUTPUT: - assert(out->index < 13 && "Invalid output register target."); + assert(out->index < 13 && "Invalid result register target."); outreg = (Nv2aVshRegister *)(state->output_regs + out->index * 4); break; @@ -110,16 +111,16 @@ static inline void apply_operation(Nv2aVshExecutionState *state, } if (out->writemask & NV2AWM_X) { - outreg->reg.x = output.reg.x; + outreg->reg.x = result.reg.x; } if (out->writemask & NV2AWM_Y) { - outreg->reg.y = output.reg.y; + outreg->reg.y = result.reg.y; } if (out->writemask & NV2AWM_Z) { - outreg->reg.z = output.reg.z; + outreg->reg.z = result.reg.z; } if (out->writemask & NV2AWM_W) { - outreg->reg.w = output.reg.w; + outreg->reg.w = result.reg.w; } } }