ARM: NEON-optimize software skinning

parent 87e81f05b4
commit 030e6460cc
@@ -1226,6 +1226,17 @@ ARMReg DScalar(ARMReg dreg, int subScalar) {
 	return ret;
 }
 
+// Convert to a DScalar
+ARMReg QScalar(ARMReg qreg, int subScalar) {
+	int dr = (int)(SubBase(qreg)) & 0xF;
+	if (subScalar & 2) {
+		dr++;
+	}
+	int scalar = (((subScalar & 1) << 4) | dr);
+	ARMReg ret = (ARMReg)(D0 + scalar);
+	return ret;
+}
+
 void ARMXEmitter::WriteVFPDataOp(u32 Op, ARMReg Vd, ARMReg Vn, ARMReg Vm)
 {
 	bool quad_reg = Vd >= Q0;
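Note (not part of the diff): a minimal sketch of the lane mapping QScalar encodes, assuming SubBase(Qn) resolves to the even-numbered D register (Qn aliases D(2n):D(2n+1)), as the (ARMReg)(D0 + scalar) construction above suggests. Lanes 0-1 live in the even D register, lanes 2-3 in the odd one, and bit 4 of the encoding selects the high half of that D register:

#include <cassert>

// Hypothetical stand-in for QScalar's arithmetic, for illustration only.
static int EncodeQScalar(int qIndex, int lane) {
	int dr = (qIndex * 2) & 0xF;    // even D register holding lanes 0-1
	if (lane & 2)
		dr++;                       // lanes 2-3 live in the odd D register
	return ((lane & 1) << 4) | dr;  // bit 4 = high lane within the D register
}

int main() {
	assert(EncodeQScalar(2, 0) == 4);         // Q2 lane 0 -> D4[0]
	assert(EncodeQScalar(2, 1) == (16 | 4));  // Q2 lane 1 -> D4[1]
	assert(EncodeQScalar(2, 2) == 5);         // Q2 lane 2 -> D5[0]
	assert(EncodeQScalar(2, 3) == (16 | 5));  // Q2 lane 3 -> D5[1]
	return 0;
}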
@@ -371,6 +371,7 @@ ARMReg SubBase(ARMReg Reg);
 // See A.7.1 in the ARMv7-A
 // VMUL F32 scalars can only be up to D15[0], D15[1] - higher scalars cannot be individually addressed
 ARMReg DScalar(ARMReg dreg, int subScalar);
+ARMReg QScalar(ARMReg qreg, int subScalar);
 
 enum NEONAlignment {
 	ALIGN_NONE = 0,
@@ -15,6 +15,7 @@
 // Official git repository and contact information can be found at
 // https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
 
+#include "base/logging.h"
 #include "Common/ChunkFile.h"
 #include "Core/Reporting.h"
 #include "Core/Core.h"
@@ -44,13 +45,13 @@ void DisassembleArm(const u8 *data, int size) {
 			int reg1 = (next & 0x0000F000) >> 12;
 			if (reg0 == reg1) {
 				sprintf(temp, "%08x MOV32? %s, %04x%04x", (u32)inst, ArmRegName(reg0), hi, low);
-				INFO_LOG(JIT, "A: %s", temp);
+				ILOG("A: %s", temp);
 				i += 4;
 				continue;
 			}
 		}
 		ArmDis((u32)codePtr, inst, temp);
-		INFO_LOG(JIT, "A: %s", temp);
+		ILOG("A: %s", temp);
 	}
 }
 
@@ -27,8 +27,6 @@
 #include "VertexDecoder.h"
 #include "VertexShaderGenerator.h"
 
-extern void DisassembleArm(const u8 *data, int size);
-
 static const u8 tcsize[4] = {0,2,4,8}, tcalign[4] = {0,1,2,4};
 static const u8 colsize[8] = {0,0,0,0,2,2,2,4}, colalign[8] = {0,0,0,0,2,2,2,4};
 static const u8 nrmsize[4] = {0,3,6,12}, nrmalign[4] = {0,1,2,4};
@@ -15,14 +15,20 @@
 // Official git repository and contact information can be found at
 // https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
 
+#include "base/logging.h"
 #include "Common/CPUDetect.h"
 #include "Core/Config.h"
 #include "GPU/GLES/VertexDecoder.h"
 
+extern void DisassembleArm(const u8 *data, int size);
+
+bool NEONSkinning = false;
+
+// Used only in non-NEON mode.
 static float MEMORY_ALIGNED16(skinMatrix[12]);
 
 // Will be used only in NEON mode.
-static float MEMORY_ALIGNED16(bones[16 * 6]); // First two are kept in registers.
+static float MEMORY_ALIGNED16(bones[16 * 8]); // First two will be kept in registers later
 
 // NEON register allocation:
 // Q0: Texture scaling parameters
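Note (not part of the diff): the growth from 16 * 6 to 16 * 8 floats reserves a full padded 4x4 slot for each of the PSP's eight bone matrices, even though the first two are meant to stay in registers. A quick check of the sizes this implies:

// Illustrative only: one padded matrix is 16 floats = four 128-bit quad
// registers, and MEMORY_ALIGNED16 provides the 16-byte alignment that the
// ALIGN_128-hinted VLD1/VST1 in the hunks below rely on.
static_assert(16 * sizeof(float) == 4 * 16, "one padded matrix = 4 x 128-bit transfers");
static_assert(8 * 16 * sizeof(float) == 512, "bones[] spans 512 bytes");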
@@ -74,6 +80,9 @@ static const ARMReg neonScratchRegQ = Q1; // Overlaps with all the scratch regs
 static const ARMReg src[3] = {S8, S9, S10};  // skin source
 static const ARMReg acc[3] = {S11, S12, S13};  // skin accumulator
 
+static const ARMReg srcNEON = Q2;
+static const ARMReg accNEON = Q3;
+
 static const JitLookup jitLookup[] = {
 	{&VertexDecoder::Step_WeightsU8, &VertexDecoderJitCache::Jit_WeightsU8},
 	{&VertexDecoder::Step_WeightsU16, &VertexDecoderJitCache::Jit_WeightsU16},
@@ -129,6 +138,8 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec) {
 	bool prescaleStep = false;
 	bool skinning = false;
 
+	NEONSkinning = cpu_info.bNEON;
+
 	// Look for prescaled texcoord steps
 	for (int i = 0; i < dec.numSteps_; i++) {
 		if (dec.steps_[i] == &VertexDecoder::Step_TcU8Prescale ||
@@ -166,6 +177,49 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec) {
 		}
 	}
 
+	// Add code to convert matrices to 4x4.
+	// Later we might want to do this when the matrices are loaded instead.
+	int boneCount = 0;
+	if (NEONSkinning && dec.weighttype && g_Config.bSoftwareSkinning) {
+		// Copying from R3 to R4
+		MOVP2R(R3, gstate.boneMatrix);
+		MOVP2R(R4, bones);
+		MOVI2F(fpScratchReg, 0.0f, scratchReg);
+		for (int i = 0; i < 8; i++) {
+			VLD1(F_32, Q4, R3, 2);  // Load 128 bits even though we just want 96
+			VMOV(S19, fpScratchReg);
+			ADD(R3, R3, 12);
+			VLD1(F_32, Q5, R3, 2);
+			VMOV(S23, fpScratchReg);
+			ADD(R3, R3, 12);
+			VLD1(F_32, Q6, R3, 2);
+			VMOV(S27, fpScratchReg);
+			ADD(R3, R3, 12);
+			VLD1(F_32, Q7, R3, 2);
+			VMOV(S31, fpScratchReg);
+			ADD(R3, R3, 12);
+			// First two matrices are in registers.
+			if (i == 0) {
+				VMOV(Q8, Q4);
+				VMOV(Q9, Q5);
+				VMOV(Q10, Q6);
+				VMOV(Q11, Q7);
+				ADD(R4, R4, 16 * 4);
+			} else if (i == 1) {
+				VMOV(Q12, Q4);
+				VMOV(Q13, Q5);
+				VMOV(Q14, Q6);
+				VMOV(Q15, Q7);
+				ADD(R4, R4, 16 * 4);
+			} else {
+				VST1(F_32, Q4, R4, 2, ALIGN_128, REG_UPDATE);
+				VST1(F_32, Q5, R4, 2, ALIGN_128, REG_UPDATE);
+				VST1(F_32, Q6, R4, 2, ALIGN_128, REG_UPDATE);
+				VST1(F_32, Q7, R4, 2, ALIGN_128, REG_UPDATE);
+			}
+		}
+	}
+
 	// TODO: NEON skinning register mapping
 	// The matrix will be built in Q12-Q15.
 	// The temporary matrix to be added to the built matrix will be in Q8-Q11.
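Note (not part of the diff): in plain C, the loop above emits code that computes roughly the following at decode time, expanding each packed 4x3 bone matrix (12 consecutive floats per matrix, matching the ADD(R3, R3, 12) column steps) into a zero-padded 4x4 slot. The real code moves whole columns through Q4-Q7 and, for matrices 0 and 1, skips the store and keeps them live in Q8-Q11 and Q12-Q15 instead:

// Sketch of the equivalent scalar computation (assumes column-major packing,
// as the 12-byte column stride in the emitted code implies).
static void ExpandBoneMatricesRef(float dst[8][16], const float src[8 * 12]) {
	for (int i = 0; i < 8; i++) {
		for (int col = 0; col < 4; col++) {
			for (int row = 0; row < 3; row++) {
				dst[i][col * 4 + row] = src[i * 12 + col * 3 + row];
			}
			dst[i][col * 4 + 3] = 0.0f;  // padding lane (S19/S23/S27/S31 above)
		}
	}
}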
@@ -197,10 +251,13 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec) {
 
 	FlushLitPool();
 	FlushIcache();
-	// DisassembleArm(start, GetCodePtr() - start);
-	// char temp[1024] = {0};
-	// dec.ToString(temp);
-	// INFO_LOG(HLE, "%s", temp);
+
+	/*
+	DisassembleArm(start, GetCodePtr() - start);
+	char temp[1024] = {0};
+	dec.ToString(temp);
+	INFO_LOG(HLE, "%s", temp);
+	*/
 
 	return (JittedVertexDecoder)start;
 }
@@ -252,82 +309,134 @@ void VertexDecoderJitCache::Jit_WeightsFloat() {
 }
 
 static const ARMReg weightRegs[8] = { S8, S9, S10, S11, S12, S13, S14, S15 };
+static const ARMReg neonWeightRegs[2] = { Q2, Q3 };
 
 void VertexDecoderJitCache::Jit_ApplyWeights() {
-	MOVI2R(tempReg2, (u32)skinMatrix, scratchReg);
-#if 1
-	// This approach saves a few stores but accesses the matrices in a more
-	// sparse order.
-	const float *bone = &gstate.boneMatrix[0];
-	MOVI2R(tempReg1, (u32)bone, scratchReg);
-	for (int i = 0; i < 12; i++) {
-		VLDR(fpScratchReg3, tempReg1, i * 4);
-		VMUL(fpScratchReg3, fpScratchReg3, weightRegs[0]);
-		for (int j = 1; j < dec_->nweights; j++) {
-			VLDR(fpScratchReg2, tempReg1, i * 4 + j * 4 * 12);
-			VMLA(fpScratchReg3, fpScratchReg2, weightRegs[j]);
-		}
-		VSTR(fpScratchReg3, tempReg2, i * 4);
-	}
-#else
-	// This one does accesses in linear order but wastes time storing, loading, storing.
-	for (int j = 0; j < dec_->nweights; j++) {
-		const float *bone = &gstate.boneMatrix[j * 12];
-		MOVI2R(tempReg1, (u32)bone, scratchReg);
-		// Okay, we have the weight.
-		if (j == 0) {
-			for (int i = 0; i < 12; i++) {
-				VLDR(fpScratchReg2, tempReg1, i * 4);
-				VMUL(fpScratchReg2, fpScratchReg2, weightRegs[j]);
-				VSTR(fpScratchReg2, tempReg2, i * 4);
-			}
-		} else {
-			for (int i = 0; i < 12; i++) {
-				VLDR(fpScratchReg2, tempReg1, i * 4);
-				VLDR(fpScratchReg3, tempReg2, i * 4);
-				VMLA(fpScratchReg3, fpScratchReg2, weightRegs[j]);
-				VSTR(fpScratchReg3, tempReg2, i * 4);
-			}
-		}
-	}
-#endif
+	if (NEONSkinning) {
+		// We construct a matrix in Q4-Q7
+		// We can use Q1 as temp.
+		MOVP2R(scratchReg, bones);
+		for (int i = 0; i < dec_->nweights; i++) {
+			switch (i) {
+			case 0:
+				VMUL_scalar(F_32, Q4, Q8, QScalar(neonWeightRegs[0], 0));
+				VMUL_scalar(F_32, Q5, Q9, QScalar(neonWeightRegs[0], 0));
+				VMUL_scalar(F_32, Q6, Q10, QScalar(neonWeightRegs[0], 0));
+				VMUL_scalar(F_32, Q7, Q11, QScalar(neonWeightRegs[0], 0));
+				ADD(scratchReg, scratchReg, 16 * 4);
+				break;
+			case 1:
+				VMLA_scalar(F_32, Q4, Q12, QScalar(neonWeightRegs[0], 1));
+				VMLA_scalar(F_32, Q5, Q13, QScalar(neonWeightRegs[0], 1));
+				VMLA_scalar(F_32, Q6, Q14, QScalar(neonWeightRegs[0], 1));
+				VMLA_scalar(F_32, Q7, Q15, QScalar(neonWeightRegs[0], 1));
+				ADD(scratchReg, scratchReg, 16 * 4);
+				break;
+			default:
+				// Matrices 2+ need to be loaded from memory.
+				// Wonder if we can free up one more register so we could get some parallelism.
+				VLD1(F_32, Q1, scratchReg, 2, ALIGN_128, REG_UPDATE);
+				VMLA_scalar(F_32, Q4, Q1, QScalar(neonWeightRegs[i >> 2], i & 3));
+				VLD1(F_32, Q1, scratchReg, 2, ALIGN_128, REG_UPDATE);
+				VMLA_scalar(F_32, Q5, Q1, QScalar(neonWeightRegs[i >> 2], i & 3));
+				VLD1(F_32, Q1, scratchReg, 2, ALIGN_128, REG_UPDATE);
+				VMLA_scalar(F_32, Q6, Q1, QScalar(neonWeightRegs[i >> 2], i & 3));
+				VLD1(F_32, Q1, scratchReg, 2, ALIGN_128, REG_UPDATE);
+				VMLA_scalar(F_32, Q7, Q1, QScalar(neonWeightRegs[i >> 2], i & 3));
+				break;
+			}
+		}
+	} else {
+		MOVI2R(tempReg2, (u32)skinMatrix, scratchReg);
+		// This approach saves a few stores but accesses the matrices in a more
+		// sparse order.
+		const float *bone = &gstate.boneMatrix[0];
+		MOVI2R(tempReg1, (u32)bone, scratchReg);
+		for (int i = 0; i < 12; i++) {
+			VLDR(fpScratchReg3, tempReg1, i * 4);
+			VMUL(fpScratchReg3, fpScratchReg3, weightRegs[0]);
+			for (int j = 1; j < dec_->nweights; j++) {
+				VLDR(fpScratchReg2, tempReg1, i * 4 + j * 4 * 12);
+				VMLA(fpScratchReg3, fpScratchReg2, weightRegs[j]);
+			}
+			VSTR(fpScratchReg3, tempReg2, i * 4);
+		}
+	}
 }
 
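Note (not part of the diff): per output element, the NEON path computes a weighted sum of the padded bone matrices; weights 0-3 come from the lanes of Q2 and weights 4-7 from Q3, which is what the neonWeightRegs[i >> 2], i & 3 indexing selects. A scalar reference of the same blend:

// Reference sketch; skinMat corresponds to Q4-Q7, boneMats[0] and
// boneMats[1] to the register-resident Q8-Q11 and Q12-Q15.
static void ApplyWeightsRef(float skinMat[16], const float boneMats[][16],
                            const float *weights, int nweights) {
	for (int k = 0; k < 16; k++) {
		float sum = boneMats[0][k] * weights[0];  // VMUL_scalar, case 0
		for (int i = 1; i < nweights; i++) {
			sum += boneMats[i][k] * weights[i];   // VMLA_scalar, case 1 / default
		}
		skinMat[k] = sum;
	}
}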
 void VertexDecoderJitCache::Jit_WeightsU8Skin() {
-	// No need to zero skinMatrix, we'll just STR to it in the first lap,
-	// then VLDR/VADD/VSTR in subsequent laps.
-	for (int j = 0; j < dec_->nweights; j++) {
-		LDRB(tempReg1, srcReg, dec_->weightoff + j);
-		VMOV(fpScratchReg, tempReg1);
-		VCVT(fpScratchReg, fpScratchReg, TO_FLOAT);
-		MOVI2F(fpScratchReg2, by128, scratchReg);
-		VMUL(weightRegs[j], fpScratchReg, fpScratchReg2);
+	if (NEONSkinning && dec_->nweights <= 4) {
+		// Most common cases.
+		// Weight is first so srcReg is correct.
+		switch (dec_->nweights) {
+		case 1: LDRB(scratchReg2, srcReg, 0); break;
+		case 2: LDRH(scratchReg2, srcReg, 0); break;
+		case 3:
+			LDR(scratchReg2, srcReg, 0);
+			ANDI2R(scratchReg2, scratchReg2, 0xFFFFFF, scratchReg);
+			break;
+		case 4:
+			LDR(scratchReg2, srcReg, 0);
+			break;
+		}
+		VMOV(fpScratchReg, scratchReg2);
+		MOVI2F(S12, by128, scratchReg);
+		VMOVL(I_8 | I_UNSIGNED, neonScratchRegQ, neonScratchReg);
+		VMOVL(I_16 | I_UNSIGNED, neonScratchRegQ, neonScratchReg);
+		VCVT(F_32 | I_UNSIGNED, neonScratchRegQ, neonScratchRegQ);
+		VMUL_scalar(F_32, neonWeightRegs[0], neonScratchRegQ, DScalar(D6, 0));
+	} else {
+		// Fallback and non-neon
+		for (int j = 0; j < dec_->nweights; j++) {
+			LDRB(tempReg1, srcReg, dec_->weightoff + j);
+			VMOV(fpScratchReg, tempReg1);
+			VCVT(fpScratchReg, fpScratchReg, TO_FLOAT);
+			MOVI2F(fpScratchReg2, by128, scratchReg);
+			VMUL(weightRegs[j], fpScratchReg, fpScratchReg2);
+		}
 	}
 
 	Jit_ApplyWeights();
 }
 
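Note (not part of the diff): the fast path above pulls up to four u8 weights in with a single scalar load (masking off the garbage byte in the 3-weight case), widens u8 -> u16 -> u32 with the two VMOVL steps, converts to float, and scales all four lanes with one multiply against by128 held in S12 (= D6[0], hence DScalar(D6, 0)). Assuming by128 is 1.0f / 128.0f, as the scalar fallback's use of it implies, this is the computation:

#include <cstdint>

// Reference sketch of the packed-weight conversion. Lanes at or past
// nweights end up zero (zero-extended narrow loads, or the 0xFFFFFF mask).
static void WeightsU8Ref(float out[4], const uint8_t *src, int nweights) {
	for (int j = 0; j < 4; j++) {
		uint32_t w = (j < nweights) ? src[j] : 0;
		out[j] = (float)w * (1.0f / 128.0f);  // assumed value of by128
	}
}

The U16 variant below is the same idea with one fewer widening step and a by32768 scale.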
 void VertexDecoderJitCache::Jit_WeightsU16Skin() {
-	// No need to zero skinMatrix, we'll just STR to it in the first lap,
-	// then VLDR/VADD/VSTR in subsequent laps.
-	for (int j = 0; j < dec_->nweights; j++) {
-		LDRH(tempReg1, srcReg, dec_->weightoff + j * 2);
-		VMOV(fpScratchReg, tempReg1);
-		VCVT(fpScratchReg, fpScratchReg, TO_FLOAT);
-		MOVI2F(fpScratchReg2, 1.0f / 32768.0f, scratchReg);
-		VMUL(weightRegs[j], fpScratchReg, fpScratchReg2);
+	if (NEONSkinning && dec_->nweights <= 4) {
+		// Most common cases.
+		switch (dec_->nweights) {
+		case 1: LDRH(scratchReg, srcReg, 0); break;
+		case 2: LDR(scratchReg, srcReg, 0); break;
+		case 3:
+			LDR(scratchReg, srcReg, 0);
+			LDRH(scratchReg2, srcReg, 4);
+			break;
+		case 4:
+			LDR(scratchReg, srcReg, 0);
+			LDR(scratchReg2, srcReg, 4);
+			break;
+		}
+		VMOV(fpScratchReg, scratchReg);
+		VMOV(fpScratchReg2, scratchReg2);
+		MOVI2F(S12, by32768, scratchReg);
+		VMOVL(I_16 | I_UNSIGNED, neonScratchRegQ, neonScratchReg);
+		VCVT(F_32 | I_UNSIGNED, neonScratchRegQ, neonScratchRegQ);
+		VMUL_scalar(F_32, neonWeightRegs[0], neonScratchRegQ, DScalar(D6, 0));
+	} else {
+		// Fallback and non-neon
+		for (int j = 0; j < dec_->nweights; j++) {
+			LDRH(tempReg1, srcReg, dec_->weightoff + j * 2);
+			VMOV(fpScratchReg, tempReg1);
+			VCVT(fpScratchReg, fpScratchReg, TO_FLOAT);
+			MOVI2F(fpScratchReg2, 1.0f / 32768.0f, scratchReg);
+			VMUL(weightRegs[j], fpScratchReg, fpScratchReg2);
+		}
 	}
 
 	Jit_ApplyWeights();
 }
 
 void VertexDecoderJitCache::Jit_WeightsFloatSkin() {
-	// No need to zero skinMatrix, we'll just STR to it in the first lap,
-	// then VLDR/VADD/VSTR in subsequent laps.
+	// TODO: NEON-ize (barely worth)
 	for (int j = 0; j < dec_->nweights; j++) {
		VLDR(weightRegs[j], srcReg, dec_->weightoff + j * 4);
 	}
 
 	Jit_ApplyWeights();
 }
@@ -671,27 +780,39 @@ void VertexDecoderJitCache::Jit_NormalFloatSkin() {
 }
 
 void VertexDecoderJitCache::Jit_WriteMatrixMul(int outOff, bool pos) {
-	MOVI2R(tempReg1, (u32)skinMatrix, scratchReg);
-	for (int i = 0; i < 3; i++) {
-		VLDR(fpScratchReg, tempReg1, 4 * i);
-		VMUL(acc[i], fpScratchReg, src[0]);
-	}
-	for (int i = 0; i < 3; i++) {
-		VLDR(fpScratchReg, tempReg1, 12 + 4 * i);
-		VMLA(acc[i], fpScratchReg, src[1]);
-	}
-	for (int i = 0; i < 3; i++) {
-		VLDR(fpScratchReg, tempReg1, 24 + 4 * i);
-		VMLA(acc[i], fpScratchReg, src[2]);
-	}
-	if (pos) {
-		for (int i = 0; i < 3; i++) {
-			VLDR(fpScratchReg, tempReg1, 36 + 4 * i);
-			VADD(acc[i], acc[i], fpScratchReg);
+	if (NEONSkinning) {
+		// Multiply with the matrix sitting in Q4-Q7.
+		ADD(scratchReg, dstReg, outOff);
+		VMUL_scalar(F_32, accNEON, Q4, QScalar(srcNEON, 0));
+		VMLA_scalar(F_32, accNEON, Q5, QScalar(srcNEON, 1));
+		VMLA_scalar(F_32, accNEON, Q6, QScalar(srcNEON, 2));
+		if (pos) {
+			VADD(F_32, accNEON, accNEON, Q7);
+		}
+		VST1(F_32, accNEON, scratchReg, 2);
+	} else {
+		MOVI2R(tempReg1, (u32)skinMatrix, scratchReg);
+		for (int i = 0; i < 3; i++) {
+			VLDR(fpScratchReg, tempReg1, 4 * i);
+			VMUL(acc[i], fpScratchReg, src[0]);
+		}
+		for (int i = 0; i < 3; i++) {
+			VLDR(fpScratchReg, tempReg1, 12 + 4 * i);
+			VMLA(acc[i], fpScratchReg, src[1]);
+		}
+		for (int i = 0; i < 3; i++) {
+			VLDR(fpScratchReg, tempReg1, 24 + 4 * i);
+			VMLA(acc[i], fpScratchReg, src[2]);
+		}
+		if (pos) {
+			for (int i = 0; i < 3; i++) {
+				VLDR(fpScratchReg, tempReg1, 36 + 4 * i);
+				VADD(acc[i], acc[i], fpScratchReg);
+			}
+		}
+		for (int i = 0; i < 3; i++) {
+			VSTR(acc[i], dstReg, outOff + i * 4);
 		}
 	}
-	for (int i = 0; i < 3; i++) {
-		VSTR(acc[i], dstReg, outOff + i * 4);
-	}
 }
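Note (not part of the diff): with the blended matrix held column-wise in Q4-Q7 and the source vector in srcNEON (Q2), the NEON path is three multiply-accumulates by the source lanes plus, for positions, one add of the translation column. A scalar reference:

// Reference sketch; m[c] is matrix column c (Q4-Q7), src is the vector in
// srcNEON. Note the NEON path stores a full 128 bits via VST1 (a fourth
// lane), where the VFP path stores only the three result floats.
static void WriteMatrixMulRef(float out[4], const float m[4][4],
                              const float src[3], bool pos) {
	for (int k = 0; k < 4; k++) {
		out[k] = m[0][k] * src[0]   // VMUL_scalar with Q4
		       + m[1][k] * src[1]   // VMLA_scalar with Q5
		       + m[2][k] * src[2];  // VMLA_scalar with Q6
		if (pos) {
			out[k] += m[3][k];      // VADD with Q7 (translation column)
		}
	}
}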