Switches to using a dispatch table for emulated instructions.

This commit is contained in:
Erik Abair 2022-06-19 09:24:15 -07:00
parent 25a9fa6835
commit 21014eb5f2
3 changed files with 160 additions and 165 deletions

View File

@ -3,138 +3,139 @@
#include <math.h>
#include <string.h>
void nv2a_vsh_cpu_mov(Nv2aVshRegister *out, const Nv2aVshRegister *a) {
memcpy(out, a, sizeof(*out));
// MOV: copies the first input register into `out` unchanged.
//
// out:    destination register, fully overwritten.
// inputs: operand array; only inputs[0] is read.
void nv2a_vsh_cpu_mov(Nv2aVshRegister *out, const Nv2aVshRegister *inputs) {
  const Nv2aVshRegister *source = &inputs[0];
  memcpy(out, source, sizeof *out);
}
void nv2a_vsh_cpu_arl(Nv2aVshRegister *out, const Nv2aVshRegister *a) {
float val = floorf(a->reg.x + 0.001f);
// ARL: writes floor(inputs[0].x + 0.001f) to every component of `out`.
//
// out:    destination register, fully overwritten.
// inputs: operand array; only inputs[0].x is read.
//
// NOTE(review): the 0.001f bias is kept exactly as found; presumably it mirrors
// hardware rounding behaviour - confirm against HW.
void nv2a_vsh_cpu_arl(Nv2aVshRegister *out, const Nv2aVshRegister *inputs) {
  const float address = floorf(inputs[0].reg.x + 0.001f);
  out->reg.x = out->reg.y = out->reg.z = out->reg.w = address;
}
void nv2a_vsh_cpu_mul(Nv2aVshRegister *out, const Nv2aVshRegister *a,
const Nv2aVshRegister *b) {
out->reg.x = a->reg.x * b->reg.x;
out->reg.y = a->reg.y * b->reg.y;
out->reg.z = a->reg.z * b->reg.z;
out->reg.w = a->reg.w * b->reg.w;
// MUL: component-wise product of inputs[0] and inputs[1].
//
// out:    destination register, fully overwritten.
// inputs: operand array; inputs[0] and inputs[1] are read.
void nv2a_vsh_cpu_mul(Nv2aVshRegister *out, const Nv2aVshRegister *inputs) {
  const Nv2aVshRegister *lhs = inputs;
  const Nv2aVshRegister *rhs = inputs + 1;

  out->reg.x = lhs->reg.x * rhs->reg.x;
  out->reg.y = lhs->reg.y * rhs->reg.y;
  out->reg.z = lhs->reg.z * rhs->reg.z;
  out->reg.w = lhs->reg.w * rhs->reg.w;
}
void nv2a_vsh_cpu_add(Nv2aVshRegister *out, const Nv2aVshRegister *a,
const Nv2aVshRegister *b) {
out->reg.x = a->reg.x + b->reg.x;
out->reg.y = a->reg.y + b->reg.y;
out->reg.z = a->reg.z + b->reg.z;
out->reg.w = a->reg.w + b->reg.w;
// ADD: component-wise sum of inputs[0] and inputs[1].
//
// out:    destination register, fully overwritten.
// inputs: operand array; inputs[0] and inputs[1] are read.
void nv2a_vsh_cpu_add(Nv2aVshRegister *out, const Nv2aVshRegister *inputs) {
  const Nv2aVshRegister *lhs = inputs;
  const Nv2aVshRegister *rhs = inputs + 1;

  out->reg.x = lhs->reg.x + rhs->reg.x;
  out->reg.y = lhs->reg.y + rhs->reg.y;
  out->reg.z = lhs->reg.z + rhs->reg.z;
  out->reg.w = lhs->reg.w + rhs->reg.w;
}
void nv2a_vsh_cpu_mad(Nv2aVshRegister *out, const Nv2aVshRegister *a,
const Nv2aVshRegister *b, const Nv2aVshRegister *c) {
out->reg.x = a->reg.x * b->reg.x + c->reg.x;
out->reg.y = a->reg.y * b->reg.y + c->reg.y;
out->reg.z = a->reg.z * b->reg.z + c->reg.z;
out->reg.w = a->reg.w * b->reg.w + c->reg.w;
// MAD: component-wise multiply-add, inputs[0] * inputs[1] + inputs[2].
//
// out:    destination register, fully overwritten.
// inputs: operand array; inputs[0], inputs[1] and inputs[2] are read.
void nv2a_vsh_cpu_mad(Nv2aVshRegister *out, const Nv2aVshRegister *inputs) {
  const Nv2aVshRegister *factor_a = inputs;
  const Nv2aVshRegister *factor_b = inputs + 1;
  const Nv2aVshRegister *addend = inputs + 2;

  out->reg.x = factor_a->reg.x * factor_b->reg.x + addend->reg.x;
  out->reg.y = factor_a->reg.y * factor_b->reg.y + addend->reg.y;
  out->reg.z = factor_a->reg.z * factor_b->reg.z + addend->reg.z;
  out->reg.w = factor_a->reg.w * factor_b->reg.w + addend->reg.w;
}
void nv2a_vsh_cpu_dp3(Nv2aVshRegister *out, const Nv2aVshRegister *a,
const Nv2aVshRegister *b) {
// DP3: three-component dot product of inputs[0] and inputs[1] (x, y, z only),
// broadcast to every component of `out`.
//
// out:    destination register, fully overwritten.
// inputs: operand array; inputs[0] and inputs[1] are read.
void nv2a_vsh_cpu_dp3(Nv2aVshRegister *out, const Nv2aVshRegister *inputs) {
  const Nv2aVshRegister *lhs = inputs;
  const Nv2aVshRegister *rhs = inputs + 1;

  // The dot product is computed in full before any component is stored.
  const float dot = lhs->reg.x * rhs->reg.x +
                    lhs->reg.y * rhs->reg.y +
                    lhs->reg.z * rhs->reg.z;
  out->reg.x = out->reg.y = out->reg.z = out->reg.w = dot;
}
// DPH: dot product of the x/y/z components of inputs[0] and inputs[1], plus
// inputs[1].w, broadcast to every component of `out`.
//
// out:    destination register, fully overwritten.
// inputs: operand array; inputs[0] and inputs[1] are read.
void nv2a_vsh_cpu_dph(Nv2aVshRegister *out, const Nv2aVshRegister *inputs) {
  const Nv2aVshRegister *lhs = inputs;
  const Nv2aVshRegister *rhs = inputs + 1;

  // inputs[0].w does not take part; inputs[1].w is added as-is.
  const float dot = lhs->reg.x * rhs->reg.x +
                    lhs->reg.y * rhs->reg.y +
                    lhs->reg.z * rhs->reg.z + rhs->reg.w;
  out->reg.x = out->reg.y = out->reg.z = out->reg.w = dot;
}
// DP4: four-component dot product of inputs[0] and inputs[1], broadcast to
// every component of `out`.
//
// out:    destination register, fully overwritten.
// inputs: operand array; inputs[0] and inputs[1] are read.
void nv2a_vsh_cpu_dp4(Nv2aVshRegister *out, const Nv2aVshRegister *inputs) {
  // All four components contribute; the result is computed before any store.
  float result =
      inputs[0].reg.x * inputs[1].reg.x + inputs[0].reg.y * inputs[1].reg.y +
      inputs[0].reg.z * inputs[1].reg.z + inputs[0].reg.w * inputs[1].reg.w;
  out->reg.x = result;
  out->reg.y = result;
  out->reg.z = result;
  out->reg.w = result;
}
// DPH (separate-operand form): a.xyz . b.xyz + b.w, broadcast to every
// component of `out`.
//
// NOTE(review): this variant takes `a` and `b` as separate pointers, whereas
// the Nv2aVshCpuFunc typedef in this listing takes a single `inputs` array;
// presumably this is the superseded pre-change form - confirm before using.
void nv2a_vsh_cpu_dph(Nv2aVshRegister *out, const Nv2aVshRegister *a,
const Nv2aVshRegister *b) {
// a->reg.w is not used; b->reg.w is added to the three-component product.
float result = a->reg.x * b->reg.x + a->reg.y * b->reg.y +
a->reg.z * b->reg.z + b->reg.w;
out->reg.x = result;
out->reg.y = result;
out->reg.z = result;
out->reg.w = result;
}
// DP4 (separate-operand form): four-component dot product of `a` and `b`,
// broadcast to every component of `out`.
//
// NOTE(review): this variant takes `a` and `b` as separate pointers, whereas
// the Nv2aVshCpuFunc typedef in this listing takes a single `inputs` array;
// presumably this is the superseded pre-change form - confirm before using.
void nv2a_vsh_cpu_dp4(Nv2aVshRegister *out, const Nv2aVshRegister *a,
const Nv2aVshRegister *b) {
float result = a->reg.x * b->reg.x + a->reg.y * b->reg.y +
a->reg.z * b->reg.z + a->reg.w * b->reg.w;
out->reg.x = result;
out->reg.y = result;
out->reg.z = result;
out->reg.w = result;
}
void nv2a_vsh_cpu_dst(Nv2aVshRegister *out, const Nv2aVshRegister *a,
const Nv2aVshRegister *b) {
// DST: builds a vector from two operands:
//   x = 1.0
//   y = inputs[0].y * inputs[1].y
//   z = inputs[0].z
//   w = inputs[1].w
//
// out:    destination register, fully overwritten.
// inputs: operand array; inputs[0] and inputs[1] are read.
void nv2a_vsh_cpu_dst(Nv2aVshRegister *out, const Nv2aVshRegister *inputs) {
  out->reg.x = 1.0f;
  out->reg.y = inputs[0].reg.y * inputs[1].reg.y;
  out->reg.z = inputs[0].reg.z;
  out->reg.w = inputs[1].reg.w;
}
void nv2a_vsh_cpu_min(Nv2aVshRegister *out, const Nv2aVshRegister *a,
const Nv2aVshRegister *b) {
out->reg.x = a->reg.x < b->reg.x ? a->reg.x : b->reg.x;
out->reg.y = a->reg.y < b->reg.y ? a->reg.y : b->reg.y;
out->reg.z = a->reg.z < b->reg.z ? a->reg.z : b->reg.z;
out->reg.w = a->reg.w < b->reg.w ? a->reg.w : b->reg.w;
// MIN: component-wise minimum of inputs[0] and inputs[1].
//
// out:    destination register, fully overwritten.
// inputs: operand array; inputs[0] and inputs[1] are read.
//
// Each component takes inputs[0]'s value only when it compares strictly less
// than inputs[1]'s; in every other case (including unordered comparisons) the
// inputs[1] value is used.
void nv2a_vsh_cpu_min(Nv2aVshRegister *out, const Nv2aVshRegister *inputs) {
  const Nv2aVshRegister *lhs = inputs;
  const Nv2aVshRegister *rhs = inputs + 1;

  out->reg.x = lhs->reg.x < rhs->reg.x ? lhs->reg.x : rhs->reg.x;
  out->reg.y = lhs->reg.y < rhs->reg.y ? lhs->reg.y : rhs->reg.y;
  out->reg.z = lhs->reg.z < rhs->reg.z ? lhs->reg.z : rhs->reg.z;
  out->reg.w = lhs->reg.w < rhs->reg.w ? lhs->reg.w : rhs->reg.w;
}
void nv2a_vsh_cpu_max(Nv2aVshRegister *out, const Nv2aVshRegister *a,
const Nv2aVshRegister *b) {
out->reg.x = a->reg.x > b->reg.x ? a->reg.x : b->reg.x;
out->reg.y = a->reg.y > b->reg.y ? a->reg.y : b->reg.y;
out->reg.z = a->reg.z > b->reg.z ? a->reg.z : b->reg.z;
out->reg.w = a->reg.w > b->reg.w ? a->reg.w : b->reg.w;
// MAX: component-wise maximum of inputs[0] and inputs[1].
//
// out:    destination register, fully overwritten.
// inputs: operand array; inputs[0] and inputs[1] are read.
//
// Each component takes inputs[0]'s value only when it compares strictly
// greater than inputs[1]'s; in every other case (including unordered
// comparisons) the inputs[1] value is used.
void nv2a_vsh_cpu_max(Nv2aVshRegister *out, const Nv2aVshRegister *inputs) {
  const Nv2aVshRegister *lhs = inputs;
  const Nv2aVshRegister *rhs = inputs + 1;

  out->reg.x = lhs->reg.x > rhs->reg.x ? lhs->reg.x : rhs->reg.x;
  out->reg.y = lhs->reg.y > rhs->reg.y ? lhs->reg.y : rhs->reg.y;
  out->reg.z = lhs->reg.z > rhs->reg.z ? lhs->reg.z : rhs->reg.z;
  out->reg.w = lhs->reg.w > rhs->reg.w ? lhs->reg.w : rhs->reg.w;
}
void nv2a_vsh_cpu_slt(Nv2aVshRegister *out, const Nv2aVshRegister *a,
const Nv2aVshRegister *b) {
out->reg.x = a->reg.x < b->reg.x ? 1.0f : 0.0f;
out->reg.y = a->reg.y < b->reg.y ? 1.0f : 0.0f;
out->reg.z = a->reg.z < b->reg.z ? 1.0f : 0.0f;
out->reg.w = a->reg.w < b->reg.w ? 1.0f : 0.0f;
// SLT: per-component "set on less than". Each component of `out` becomes 1.0
// when inputs[0] < inputs[1] for that component, otherwise 0.0.
//
// out:    destination register, fully overwritten.
// inputs: operand array; inputs[0] and inputs[1] are read.
void nv2a_vsh_cpu_slt(Nv2aVshRegister *out, const Nv2aVshRegister *inputs) {
  const Nv2aVshRegister *lhs = inputs;
  const Nv2aVshRegister *rhs = inputs + 1;

  // A C comparison yields the int 0 or 1, which converts to 0.0f / 1.0f.
  out->reg.x = (float)(lhs->reg.x < rhs->reg.x);
  out->reg.y = (float)(lhs->reg.y < rhs->reg.y);
  out->reg.z = (float)(lhs->reg.z < rhs->reg.z);
  out->reg.w = (float)(lhs->reg.w < rhs->reg.w);
}
void nv2a_vsh_cpu_sge(Nv2aVshRegister *out, const Nv2aVshRegister *a,
const Nv2aVshRegister *b) {
out->reg.x = a->reg.x >= b->reg.x ? 1.0f : 0.0f;
out->reg.y = a->reg.y >= b->reg.y ? 1.0f : 0.0f;
out->reg.z = a->reg.z >= b->reg.z ? 1.0f : 0.0f;
out->reg.w = a->reg.w >= b->reg.w ? 1.0f : 0.0f;
// SGE: per-component "set on greater or equal". Each component of `out`
// becomes 1.0 when inputs[0] >= inputs[1] for that component, otherwise 0.0.
//
// out:    destination register, fully overwritten.
// inputs: operand array; inputs[0] and inputs[1] are read.
void nv2a_vsh_cpu_sge(Nv2aVshRegister *out, const Nv2aVshRegister *inputs) {
  const Nv2aVshRegister *lhs = inputs;
  const Nv2aVshRegister *rhs = inputs + 1;

  // A C comparison yields the int 0 or 1, which converts to 0.0f / 1.0f.
  out->reg.x = (float)(lhs->reg.x >= rhs->reg.x);
  out->reg.y = (float)(lhs->reg.y >= rhs->reg.y);
  out->reg.z = (float)(lhs->reg.z >= rhs->reg.z);
  out->reg.w = (float)(lhs->reg.w >= rhs->reg.w);
}
void nv2a_vsh_cpu_rcp(Nv2aVshRegister *out, const Nv2aVshRegister *a) {
// RCP: reciprocal of inputs[0].x, broadcast to every component of `out`.
//
// out:    destination register, fully overwritten.
// inputs: operand array; only inputs[0].x is read.
//
// Special cases, as coded:
//   - an input of exactly 1.0 yields exactly 1.0;
//   - an input that compares equal to 0.0 (which includes -0.0) yields
//     +INFINITY rather than a signed infinity from the division.
void nv2a_vsh_cpu_rcp(Nv2aVshRegister *out, const Nv2aVshRegister *inputs) {
  float result =
      (inputs[0].reg.x == 1.0f
           ? 1.0f
           : (inputs[0].reg.x == 0.0f ? INFINITY : 1.0f / inputs[0].reg.x));
  out->reg.x = result;
  out->reg.y = result;
  out->reg.z = result;
  out->reg.w = result;
}
void nv2a_vsh_cpu_rcc(Nv2aVshRegister *out, const Nv2aVshRegister *a) {
void nv2a_vsh_cpu_rcc(Nv2aVshRegister *out, const Nv2aVshRegister *inputs) {
// TODO: Validate this on HW.
float result;
if (a->reg.x == 1.0f) {
if (inputs[0].reg.x == 1.0f) {
result = 1.0f;
} else {
if (a->reg.x < -1.84467e19f) {
if (inputs[0].reg.x < -1.84467e19f) {
result = 1.0f / -1.84467e19f;
} else if (a->reg.x > -5.42101e-20f && a->reg.x < 0.0f) {
} else if (inputs[0].reg.x > -5.42101e-20f && inputs[0].reg.x < 0.0f) {
result = 1.0f / -5.42101e-020f;
} else if (a->reg.x >= 0 && a->reg.x < 5.42101e-20f) {
} else if (inputs[0].reg.x >= 0 && inputs[0].reg.x < 5.42101e-20f) {
result = 1.0f / 5.42101e-20f;
} else if (a->reg.x > 1.84467e+19f) {
} else if (inputs[0].reg.x > 1.84467e+19f) {
result = 1.0f / 1.84467e+19f;
} else {
result = 1.0f / a->reg.x;
result = 1.0f / inputs[0].reg.x;
}
}
out->reg.x = result;
@ -143,27 +144,28 @@ void nv2a_vsh_cpu_rcc(Nv2aVshRegister *out, const Nv2aVshRegister *a) {
out->reg.w = result;
}
void nv2a_vsh_cpu_rsq(Nv2aVshRegister *out, const Nv2aVshRegister *a) {
float result = (a->reg.x == 1.0f
? 1.0f
: (a->reg.x == 0.0f ? INFINITY : 1.0f / sqrtf(a->reg.x)));
// RSQ: reciprocal square root of inputs[0].x, broadcast to every component of
// `out`.
//
// out:    destination register, fully overwritten.
// inputs: operand array; only inputs[0].x is read.
//
// Special cases, as coded: an input of exactly 1.0 yields 1.0, and an input
// that compares equal to 0.0 yields +INFINITY. Everything else is passed to
// 1.0f / sqrtf(x) unchanged.
void nv2a_vsh_cpu_rsq(Nv2aVshRegister *out, const Nv2aVshRegister *inputs) {
  const float operand = inputs[0].reg.x;
  float inverse_root;

  if (operand == 1.0f) {
    inverse_root = 1.0f;
  } else if (operand == 0.0f) {
    inverse_root = INFINITY;
  } else {
    inverse_root = 1.0f / sqrtf(operand);
  }

  out->reg.x = out->reg.y = out->reg.z = out->reg.w = inverse_root;
}
void nv2a_vsh_cpu_exp(Nv2aVshRegister *out, const Nv2aVshRegister *a) {
float tmp = floorf(a->reg.x);
// EXP: base-2 exponential pieces of inputs[0].x:
//   x = 2^floor(v)
//   y = v - floor(v)   (fractional part)
//   z = 2^v
//   w = 1.0
// where v is inputs[0].x.
//
// out:    destination register, fully overwritten.
// inputs: operand array; only inputs[0].x is read.
void nv2a_vsh_cpu_exp(Nv2aVshRegister *out, const Nv2aVshRegister *inputs) {
  // Read the operand once, before any store, so the y and z results are still
  // derived from the original value if `out` happens to alias `inputs`.
  const float value = inputs[0].reg.x;
  const float whole = floorf(value);

  out->reg.x = powf(2.0f, whole);
  out->reg.y = value - whole;
  out->reg.z = powf(2.0f, value);
  out->reg.w = 1.0f;
}
void nv2a_vsh_cpu_log(Nv2aVshRegister *out, const Nv2aVshRegister *a) {
void nv2a_vsh_cpu_log(Nv2aVshRegister *out, const Nv2aVshRegister *inputs) {
// TODO: Validate this on HW.
float tmp = fabsf(a->reg.x);
float tmp = fabsf(inputs->reg.x);
if (tmp == 0.0f) {
out->reg.x = -INFINITY;
out->reg.y = 1.0f;
@ -182,7 +184,7 @@ void nv2a_vsh_cpu_log(Nv2aVshRegister *out, const Nv2aVshRegister *a) {
out->reg.w = 1.0f;
}
void nv2a_vsh_cpu_lit(Nv2aVshRegister *out, const Nv2aVshRegister *a) {
void nv2a_vsh_cpu_lit(Nv2aVshRegister *out, const Nv2aVshRegister *inputs) {
static const float kMax = 127.9961f;
out->reg.x = 1.0f;
@ -190,11 +192,13 @@ void nv2a_vsh_cpu_lit(Nv2aVshRegister *out, const Nv2aVshRegister *a) {
out->reg.z = 0.0f;
out->reg.w = 1.0f;
float power = a->reg.w < -kMax ? -kMax : (a->reg.w > kMax ? kMax : a->reg.w);
if (a->reg.x > 0.0f) {
out->reg.y = a->reg.x;
if (a->reg.y > 0.0f) {
out->reg.z = powf(a->reg.y, power);
float power = inputs->reg.w < -kMax
? -kMax
: (inputs->reg.w > kMax ? kMax : inputs->reg.w);
if (inputs->reg.x > 0.0f) {
out->reg.y = inputs->reg.x;
if (inputs->reg.y > 0.0f) {
out->reg.z = powf(inputs->reg.y, power);
}
}
}

View File

@ -17,38 +17,28 @@ typedef union Nv2aVshRegister_ {
float raw[4];
} Nv2aVshRegister;
#define OP_1(name) \
void nv2a_vsh_cpu_##name(Nv2aVshRegister *out, const Nv2aVshRegister *a)
#define OP_2(name) \
void nv2a_vsh_cpu_##name(Nv2aVshRegister *out, const Nv2aVshRegister *a, \
const Nv2aVshRegister *b)
#define OP_3(name) \
void nv2a_vsh_cpu_##name(Nv2aVshRegister *out, const Nv2aVshRegister *a, \
const Nv2aVshRegister *b, const Nv2aVshRegister *c)
// Common signature shared by every nv2a_vsh_cpu_* operation so they can be
// stored in and called through a table of function pointers.
//
// out:    register that receives the operation's result.
// inputs: pointer to the operation's operand registers; each operation reads
//         inputs[0] and, where it takes more operands, inputs[1] / inputs[2].
typedef void (*Nv2aVshCpuFunc)(Nv2aVshRegister *out,
const Nv2aVshRegister *inputs);
OP_1(mov);
OP_1(arl);
OP_2(mul);
OP_2(add);
OP_3(mad);
OP_2(dp3);
OP_2(dph);
OP_2(dp4);
OP_2(dst);
OP_2(min);
OP_2(max);
OP_2(slt);
OP_2(sge);
OP_1(rcp);
OP_1(rcc);
OP_1(rsq);
OP_1(exp);
OP_1(log);
OP_1(lit);
#undef OP_1
#undef OP_2
#undef OP_3
// CPU implementations of the individual vertex shader operations. All share
// the Nv2aVshCpuFunc signature: the result is written to `out`, and operands
// are read from consecutive entries of `inputs`.

// Single-operand operations (read inputs[0] only) and two/three-operand
// arithmetic (mul/add read inputs[0..1]; mad reads inputs[0..2]).
void nv2a_vsh_cpu_mov(Nv2aVshRegister *out, const Nv2aVshRegister *inputs);
void nv2a_vsh_cpu_arl(Nv2aVshRegister *out, const Nv2aVshRegister *inputs);
void nv2a_vsh_cpu_mul(Nv2aVshRegister *out, const Nv2aVshRegister *inputs);
void nv2a_vsh_cpu_add(Nv2aVshRegister *out, const Nv2aVshRegister *inputs);
void nv2a_vsh_cpu_mad(Nv2aVshRegister *out, const Nv2aVshRegister *inputs);
// Dot products and other two-operand operations (read inputs[0..1]).
void nv2a_vsh_cpu_dp3(Nv2aVshRegister *out, const Nv2aVshRegister *inputs);
void nv2a_vsh_cpu_dph(Nv2aVshRegister *out, const Nv2aVshRegister *inputs);
void nv2a_vsh_cpu_dp4(Nv2aVshRegister *out, const Nv2aVshRegister *inputs);
void nv2a_vsh_cpu_dst(Nv2aVshRegister *out, const Nv2aVshRegister *inputs);
void nv2a_vsh_cpu_min(Nv2aVshRegister *out, const Nv2aVshRegister *inputs);
void nv2a_vsh_cpu_max(Nv2aVshRegister *out, const Nv2aVshRegister *inputs);
void nv2a_vsh_cpu_slt(Nv2aVshRegister *out, const Nv2aVshRegister *inputs);
void nv2a_vsh_cpu_sge(Nv2aVshRegister *out, const Nv2aVshRegister *inputs);
// Single-operand scalar-style operations (read inputs[0] only).
void nv2a_vsh_cpu_rcp(Nv2aVshRegister *out, const Nv2aVshRegister *inputs);
void nv2a_vsh_cpu_rcc(Nv2aVshRegister *out, const Nv2aVshRegister *inputs);
void nv2a_vsh_cpu_rsq(Nv2aVshRegister *out, const Nv2aVshRegister *inputs);
void nv2a_vsh_cpu_exp(Nv2aVshRegister *out, const Nv2aVshRegister *inputs);
void nv2a_vsh_cpu_log(Nv2aVshRegister *out, const Nv2aVshRegister *inputs);
void nv2a_vsh_cpu_lit(Nv2aVshRegister *out, const Nv2aVshRegister *inputs);
#ifdef __cplusplus
}; // extern "C"

View File

@ -3,6 +3,31 @@
#include <assert.h>
#include <string.h>
// Maps an opcode value to the routine that emulates it. apply_operation
// indexes this table directly with op->opcode, so the entry order must match
// the numeric order of the opcode enum.
// NOTE(review): the enum itself is not visible from here; the order below
// follows NOP, MOV, MUL, ADD, MAD, DP3, DPH, DP4, DST, MIN, MAX, SLT, SGE, ARL,
// RCP, RCC, RSQ, EXP, LOG, LIT - confirm against the NV2AOP_* definition.
//
// The table is never modified at runtime, hence `const`.
// clang-format off
static const Nv2aVshCpuFunc kDispatchTable[] = {
    NULL,  // NOP: the caller returns before dispatching, so this is never called.
    nv2a_vsh_cpu_mov,
    nv2a_vsh_cpu_mul,
    nv2a_vsh_cpu_add,
    nv2a_vsh_cpu_mad,
    nv2a_vsh_cpu_dp3,
    nv2a_vsh_cpu_dph,
    nv2a_vsh_cpu_dp4,
    nv2a_vsh_cpu_dst,
    nv2a_vsh_cpu_min,
    nv2a_vsh_cpu_max,
    nv2a_vsh_cpu_slt,
    nv2a_vsh_cpu_sge,
    nv2a_vsh_cpu_arl,
    nv2a_vsh_cpu_rcp,
    nv2a_vsh_cpu_rcc,
    nv2a_vsh_cpu_rsq,
    nv2a_vsh_cpu_exp,
    nv2a_vsh_cpu_log,
    nv2a_vsh_cpu_lit,
};
// clang-format on
// clang format on
static inline void set_register(Nv2aVshRegister *out, const Nv2aVshRegister *in,
const uint8_t *swizzle, bool negate) {
float mult = negate ? -1.0f : 1.0f;
@ -48,37 +73,13 @@ static inline void fetch_value(Nv2aVshRegister *out,
static inline void apply_operation(Nv2aVshExecutionState *state,
const Nv2aVshOperation *op,
const Nv2aVshRegister *inputs) {
Nv2aVshRegister output;
switch (op->opcode) {
case NV2AOP_NOP:
return;
case NV2AOP_MOV:
nv2a_vsh_cpu_mov(&output, inputs);
break;
case NV2AOP_MUL:
case NV2AOP_ADD:
case NV2AOP_MAD:
case NV2AOP_DP3:
case NV2AOP_DPH:
case NV2AOP_DP4:
case NV2AOP_DST:
case NV2AOP_MIN:
case NV2AOP_MAX:
case NV2AOP_SLT:
case NV2AOP_SGE:
case NV2AOP_ARL:
case NV2AOP_RCP:
case NV2AOP_RCC:
case NV2AOP_RSQ:
case NV2AOP_EXP:
case NV2AOP_LOG:
case NV2AOP_LIT:
break;
if (op->opcode == NV2AOP_NOP) {
return;
}
Nv2aVshRegister result;
kDispatchTable[op->opcode](&result, inputs);
const Nv2aVshOutput *out = op->outputs;
for (uint32_t i = 0; i < 2; ++i, ++out) {
Nv2aVshRegister *outreg;
@ -90,7 +91,7 @@ static inline void apply_operation(Nv2aVshExecutionState *state,
continue;
case NV2ART_OUTPUT:
assert(out->index < 13 && "Invalid output register target.");
assert(out->index < 13 && "Invalid result register target.");
outreg = (Nv2aVshRegister *)(state->output_regs + out->index * 4);
break;
@ -110,16 +111,16 @@ static inline void apply_operation(Nv2aVshExecutionState *state,
}
if (out->writemask & NV2AWM_X) {
outreg->reg.x = output.reg.x;
outreg->reg.x = result.reg.x;
}
if (out->writemask & NV2AWM_Y) {
outreg->reg.y = output.reg.y;
outreg->reg.y = result.reg.y;
}
if (out->writemask & NV2AWM_Z) {
outreg->reg.z = output.reg.z;
outreg->reg.z = result.reg.z;
}
if (out->writemask & NV2AWM_W) {
outreg->reg.w = output.reg.w;
outreg->reg.w = result.reg.w;
}
}
}