ARM: NEON-optimize software skinning

Henrik Rydgard 2013-11-24 18:03:08 +01:00
parent 87e81f05b4
commit 030e6460cc
5 changed files with 213 additions and 81 deletions

View File

@@ -1226,6 +1226,17 @@ ARMReg DScalar(ARMReg dreg, int subScalar) {
return ret;
}
// Convert to a DScalar
ARMReg QScalar(ARMReg qreg, int subScalar) {
int dr = (int)(SubBase(qreg)) & 0xF;
if (subScalar & 2) {
dr++;
}
int scalar = (((subScalar & 1) << 4) | dr);
ARMReg ret = (ARMReg)(D0 + scalar);
return ret;
}
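As a quick sketch of the mapping (illustrative helper, not part of the emitter): a Qn register occupies D(2n) and D(2n+1), so lane 0-3 of a Q register becomes lane 0-1 of one of its two D halves, encoded exactly the way DScalar encodes it.
// Illustrative sketch of the same mapping (assumes Qn == D(2n):D(2n+1)):
static int QScalarEncoding(int qIndex, int lane) {
	int dr = (qIndex * 2 + (lane >> 1)) & 0xF;  // which D register holds the lane
	return ((lane & 1) << 4) | dr;              // same packing as DScalar
}
// Example: QScalarEncoding(2, 3) == 21, i.e. D5[1], matching QScalar(Q2, 3).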
void ARMXEmitter::WriteVFPDataOp(u32 Op, ARMReg Vd, ARMReg Vn, ARMReg Vm)
{
bool quad_reg = Vd >= Q0;

View File

@@ -371,6 +371,7 @@ ARMReg SubBase(ARMReg Reg);
// See A.7.1 in the ARMv7-A
// VMUL F32 scalars can only be up to D15[0], D15[1] - higher scalars cannot be individually addressed
ARMReg DScalar(ARMReg dreg, int subScalar);
ARMReg QScalar(ARMReg qreg, int subScalar);
enum NEONAlignment {
ALIGN_NONE = 0,

View File

@@ -15,6 +15,7 @@
// Official git repository and contact information can be found at
// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
#include "base/logging.h"
#include "Common/ChunkFile.h"
#include "Core/Reporting.h"
#include "Core/Core.h"
@@ -44,13 +45,13 @@ void DisassembleArm(const u8 *data, int size) {
int reg1 = (next & 0x0000F000) >> 12;
if (reg0 == reg1) {
sprintf(temp, "%08x MOV32? %s, %04x%04x", (u32)inst, ArmRegName(reg0), hi, low);
INFO_LOG(JIT, "A: %s", temp);
ILOG("A: %s", temp);
i += 4;
continue;
}
}
ArmDis((u32)codePtr, inst, temp);
INFO_LOG(JIT, "A: %s", temp);
ILOG("A: %s", temp);
}
}

View File

@@ -27,8 +27,6 @@
#include "VertexDecoder.h"
#include "VertexShaderGenerator.h"
extern void DisassembleArm(const u8 *data, int size);
static const u8 tcsize[4] = {0,2,4,8}, tcalign[4] = {0,1,2,4};
static const u8 colsize[8] = {0,0,0,0,2,2,2,4}, colalign[8] = {0,0,0,0,2,2,2,4};
static const u8 nrmsize[4] = {0,3,6,12}, nrmalign[4] = {0,1,2,4};

View File

@@ -15,14 +15,20 @@
// Official git repository and contact information can be found at
// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
#include "base/logging.h"
#include "Common/CPUDetect.h"
#include "Core/Config.h"
#include "GPU/GLES/VertexDecoder.h"
extern void DisassembleArm(const u8 *data, int size);
bool NEONSkinning = false;
// Used only in non-NEON mode.
static float MEMORY_ALIGNED16(skinMatrix[12]);
// Will be used only in NEON mode.
static float MEMORY_ALIGNED16(bones[16 * 6]); // First two are kept in registers.
static float MEMORY_ALIGNED16(bones[16 * 8]); // First two will be kept in registers later
// NEON register allocation:
// Q0: Texture scaling parameters
@@ -74,6 +80,9 @@ static const ARMReg neonScratchRegQ = Q1; // Overlaps with all the scratch regs
static const ARMReg src[3] = {S8, S9, S10}; // skin source
static const ARMReg acc[3] = {S11, S12, S13}; // skin accumulator
static const ARMReg srcNEON = Q2;
static const ARMReg accNEON = Q3;
static const JitLookup jitLookup[] = {
{&VertexDecoder::Step_WeightsU8, &VertexDecoderJitCache::Jit_WeightsU8},
{&VertexDecoder::Step_WeightsU16, &VertexDecoderJitCache::Jit_WeightsU16},
@@ -129,6 +138,8 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec) {
bool prescaleStep = false;
bool skinning = false;
NEONSkinning = cpu_info.bNEON;
// Look for prescaled texcoord steps
for (int i = 0; i < dec.numSteps_; i++) {
if (dec.steps_[i] == &VertexDecoder::Step_TcU8Prescale ||
@@ -166,6 +177,49 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec) {
}
}
// Add code to convert matrices to 4x4.
// Later we might want to do this when the matrices are loaded instead.
int boneCount = 0;
if (NEONSkinning && dec.weighttype && g_Config.bSoftwareSkinning) {
// Copy the bone matrices from gstate.boneMatrix (R3) to the bones[] staging buffer (R4).
MOVP2R(R3, gstate.boneMatrix);
MOVP2R(R4, bones);
MOVI2F(fpScratchReg, 0.0f, scratchReg);
for (int i = 0; i < 8; i++) {
VLD1(F_32, Q4, R3, 2); // Load 128 bits even though we just want 96
VMOV(S19, fpScratchReg);
ADD(R3, R3, 12);
VLD1(F_32, Q5, R3, 2);
VMOV(S23, fpScratchReg);
ADD(R3, R3, 12);
VLD1(F_32, Q6, R3, 2);
VMOV(S27, fpScratchReg);
ADD(R3, R3, 12);
VLD1(F_32, Q7, R3, 2);
VMOV(S31, fpScratchReg);
ADD(R3, R3, 12);
// First two matrices are in registers.
if (i == 0) {
VMOV(Q8, Q4);
VMOV(Q9, Q5);
VMOV(Q10, Q6);
VMOV(Q11, Q7);
ADD(R4, R4, 16 * 4);
} else if (i == 1) {
VMOV(Q12, Q4);
VMOV(Q13, Q5);
VMOV(Q14, Q6);
VMOV(Q15, Q7);
ADD(R4, R4, 16 * 4);
} else {
VST1(F_32, Q4, R4, 2, ALIGN_128, REG_UPDATE);
VST1(F_32, Q5, R4, 2, ALIGN_128, REG_UPDATE);
VST1(F_32, Q6, R4, 2, ALIGN_128, REG_UPDATE);
VST1(F_32, Q7, R4, 2, ALIGN_128, REG_UPDATE);
}
}
}
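In plain C++ terms, the code emitted above has roughly this effect at runtime (a sketch; src and padded are illustrative names, and the 12-float bone matrices are assumed to be stored as four consecutive columns of three floats, the three basis vectors plus the translation):
// Sketch of the runtime effect of the emitted conversion loop:
for (int m = 0; m < 8; m++) {
	const float *src = &gstate.boneMatrix[m * 12];
	float padded[16];
	for (int col = 0; col < 4; col++) {
		padded[col * 4 + 0] = src[col * 3 + 0];
		padded[col * 4 + 1] = src[col * 3 + 1];
		padded[col * 4 + 2] = src[col * 3 + 2];
		padded[col * 4 + 3] = 0.0f;  // pad each column to a full 128-bit quad
	}
	// Matrices 0 and 1 stay live in Q8-Q11 / Q12-Q15; the rest are stored to bones[].
	if (m >= 2)
		memcpy(&bones[m * 16], padded, sizeof(padded));
}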
// TODO: NEON skinning register mapping
// The matrix will be built in Q12-Q15.
// The temporary matrix to be added to the built matrix will be in Q8-Q11.
@@ -197,10 +251,13 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec) {
FlushLitPool();
FlushIcache();
// DisassembleArm(start, GetCodePtr() - start);
// char temp[1024] = {0};
// dec.ToString(temp);
// INFO_LOG(HLE, "%s", temp);
/*
DisassembleArm(start, GetCodePtr() - start);
char temp[1024] = {0};
dec.ToString(temp);
INFO_LOG(HLE, "%s", temp);
*/
return (JittedVertexDecoder)start;
}
@@ -252,82 +309,134 @@ void VertexDecoderJitCache::Jit_WeightsFloat() {
}
static const ARMReg weightRegs[8] = { S8, S9, S10, S11, S12, S13, S14, S15 };
static const ARMReg neonWeightRegs[2] = { Q2, Q3 };
void VertexDecoderJitCache::Jit_ApplyWeights() {
MOVI2R(tempReg2, (u32)skinMatrix, scratchReg);
#if 1
// This approach saves a few stores but accesses the matrices in a more
// sparse order.
const float *bone = &gstate.boneMatrix[0];
MOVI2R(tempReg1, (u32)bone, scratchReg);
for (int i = 0; i < 12; i++) {
VLDR(fpScratchReg3, tempReg1, i * 4);
VMUL(fpScratchReg3, fpScratchReg3, weightRegs[0]);
for (int j = 1; j < dec_->nweights; j++) {
VLDR(fpScratchReg2, tempReg1, i * 4 + j * 4 * 12);
VMLA(fpScratchReg3, fpScratchReg2, weightRegs[j]);
if (NEONSkinning) {
// We construct a matrix in Q4-Q7
// We can use Q1 as temp.
MOVP2R(scratchReg, bones);
for (int i = 0; i < dec_->nweights; i++) {
switch (i) {
case 0:
VMUL_scalar(F_32, Q4, Q8, QScalar(neonWeightRegs[0], 0));
VMUL_scalar(F_32, Q5, Q9, QScalar(neonWeightRegs[0], 0));
VMUL_scalar(F_32, Q6, Q10, QScalar(neonWeightRegs[0], 0));
VMUL_scalar(F_32, Q7, Q11, QScalar(neonWeightRegs[0], 0));
ADD(scratchReg, scratchReg, 16 * 4);
break;
case 1:
VMLA_scalar(F_32, Q4, Q12, QScalar(neonWeightRegs[0], 1));
VMLA_scalar(F_32, Q5, Q13, QScalar(neonWeightRegs[0], 1));
VMLA_scalar(F_32, Q6, Q14, QScalar(neonWeightRegs[0], 1));
VMLA_scalar(F_32, Q7, Q15, QScalar(neonWeightRegs[0], 1));
ADD(scratchReg, scratchReg, 16 * 4);
break;
default:
// Matrices 2+ need to be loaded from memory.
// Wonder if we can free up one more register so we could get some parallelism.
VLD1(F_32, Q1, scratchReg, 2, ALIGN_128, REG_UPDATE);
VMLA_scalar(F_32, Q4, Q1, QScalar(neonWeightRegs[i >> 2], i & 3));
VLD1(F_32, Q1, scratchReg, 2, ALIGN_128, REG_UPDATE);
VMLA_scalar(F_32, Q5, Q1, QScalar(neonWeightRegs[i >> 2], i & 3));
VLD1(F_32, Q1, scratchReg, 2, ALIGN_128, REG_UPDATE);
VMLA_scalar(F_32, Q6, Q1, QScalar(neonWeightRegs[i >> 2], i & 3));
VLD1(F_32, Q1, scratchReg, 2, ALIGN_128, REG_UPDATE);
VMLA_scalar(F_32, Q7, Q1, QScalar(neonWeightRegs[i >> 2], i & 3));
break;
}
}
VSTR(fpScratchReg3, tempReg2, i * 4);
}
#else
// This one does accesses in linear order but wastes time storing, loading, storing.
for (int j = 0; j < dec_->nweights; j++) {
const float *bone = &gstate.boneMatrix[j * 12];
} else {
MOVI2R(tempReg2, (u32)skinMatrix, scratchReg);
// This approach saves a few stores but accesses the matrices in a more
// sparse order.
const float *bone = &gstate.boneMatrix[0];
MOVI2R(tempReg1, (u32)bone, scratchReg);
// Okay, we have the weight.
if (j == 0) {
for (int i = 0; i < 12; i++) {
VLDR(fpScratchReg2, tempReg1, i * 4);
VMUL(fpScratchReg2, fpScratchReg2, weightRegs[j]);
VSTR(fpScratchReg2, tempReg2, i * 4);
}
} else {
for (int i = 0; i < 12; i++) {
VLDR(fpScratchReg2, tempReg1, i * 4);
VLDR(fpScratchReg3, tempReg2, i * 4);
for (int i = 0; i < 12; i++) {
VLDR(fpScratchReg3, tempReg1, i * 4);
VMUL(fpScratchReg3, fpScratchReg3, weightRegs[0]);
for (int j = 1; j < dec_->nweights; j++) {
VLDR(fpScratchReg2, tempReg1, i * 4 + j * 4 * 12);
VMLA(fpScratchReg3, fpScratchReg2, weightRegs[j]);
VSTR(fpScratchReg3, tempReg2, i * 4);
}
VSTR(fpScratchReg3, tempReg2, i * 4);
}
}
#endif
}
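In scalar terms, the NEON branch computes the same weighted blend as the VFP branch, just on the padded 4x4 matrices and with the result kept in Q4-Q7 instead of being stored to skinMatrix. A rough equivalent (weights and nweights are illustrative names):
// Sketch: blended = sum over j of weights[j] * boneMatrix[j], column by column.
float blended[16] = {};
for (int j = 0; j < nweights; j++) {
	for (int k = 0; k < 16; k++)
		blended[k] += weights[j] * bones[j * 16 + k];
}
// In the emitted code, j == 0 multiplies the matrix cached in Q8-Q11, j == 1
// accumulates the one cached in Q12-Q15, and j >= 2 reloads columns from bones[] with VLD1.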
void VertexDecoderJitCache::Jit_WeightsU8Skin() {
// No need to zero skinMatrix, we'll just STR to it in the first lap,
// then VLDR/VADD/VSTR in subsequent laps.
for (int j = 0; j < dec_->nweights; j++) {
LDRB(tempReg1, srcReg, dec_->weightoff + j);
VMOV(fpScratchReg, tempReg1);
VCVT(fpScratchReg, fpScratchReg, TO_FLOAT);
MOVI2F(fpScratchReg2, by128, scratchReg);
VMUL(weightRegs[j], fpScratchReg, fpScratchReg2);
if (NEONSkinning && dec_->nweights <= 4) {
// Most common cases.
// Weight is first so srcReg is correct.
switch (dec_->nweights) {
case 1: LDRB(scratchReg2, srcReg, 0); break;
case 2: LDRH(scratchReg2, srcReg, 0); break;
case 3:
LDR(scratchReg2, srcReg, 0);
ANDI2R(scratchReg2, scratchReg2, 0xFFFFFF, scratchReg);
break;
case 4:
LDR(scratchReg2, srcReg, 0);
break;
}
VMOV(fpScratchReg, scratchReg2);
MOVI2F(S12, by128, scratchReg);
VMOVL(I_8 | I_UNSIGNED, neonScratchRegQ, neonScratchReg);
VMOVL(I_16 | I_UNSIGNED, neonScratchRegQ, neonScratchReg);
VCVT(F_32 | I_UNSIGNED, neonScratchRegQ, neonScratchRegQ);
VMUL_scalar(F_32, neonWeightRegs[0], neonScratchRegQ, DScalar(D6, 0));
} else {
// Fallback and non-NEON path.
for (int j = 0; j < dec_->nweights; j++) {
LDRB(tempReg1, srcReg, dec_->weightoff + j);
VMOV(fpScratchReg, tempReg1);
VCVT(fpScratchReg, fpScratchReg, TO_FLOAT);
MOVI2F(fpScratchReg2, by128, scratchReg);
VMUL(weightRegs[j], fpScratchReg, fpScratchReg2);
}
}
Jit_ApplyWeights();
}
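The fast path above is a vectorized version of the per-weight loop; for up to four u8 weights it is roughly equivalent to the following (weightPtr is an illustrative name, by128 being 1/128):
// Sketch of what the u8 NEON path computes:
for (int j = 0; j < nweights; j++)
	weights[j] = weightPtr[j] * (1.0f / 128.0f);
// The emitted code fetches all the bytes with a single LDRB/LDRH/LDR (masking off
// the extra byte in the 3-weight case), widens them u8 -> u16 -> u32 with VMOVL,
// converts to float, and scales by D6[0] in one VMUL_scalar.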
void VertexDecoderJitCache::Jit_WeightsU16Skin() {
// No need to zero skinMatrix, we'll just STR to it in the first lap,
// then VLDR/VADD/VSTR in subsequent laps.
for (int j = 0; j < dec_->nweights; j++) {
LDRH(tempReg1, srcReg, dec_->weightoff + j * 2);
VMOV(fpScratchReg, tempReg1);
VCVT(fpScratchReg, fpScratchReg, TO_FLOAT);
MOVI2F(fpScratchReg2, 1.0f / 32768.0f, scratchReg);
VMUL(weightRegs[j], fpScratchReg, fpScratchReg2);
if (NEONSkinning && dec_->nweights <= 4) {
// Most common cases.
switch (dec_->nweights) {
case 1: LDRH(scratchReg, srcReg, 0); break;
case 2: LDR(scratchReg, srcReg, 0); break;
case 3:
LDR(scratchReg, srcReg, 0);
LDRH(scratchReg2, srcReg, 4);
break;
case 4:
LDR(scratchReg, srcReg, 0);
LDR(scratchReg2, srcReg, 4);
break;
}
VMOV(fpScratchReg, scratchReg);
VMOV(fpScratchReg2, scratchReg2);
MOVI2F(S12, by32768, scratchReg);
VMOVL(I_16 | I_UNSIGNED, neonScratchRegQ, neonScratchReg);
VCVT(F_32 | I_UNSIGNED, neonScratchRegQ, neonScratchRegQ);
VMUL_scalar(F_32, neonWeightRegs[0], neonScratchRegQ, DScalar(D6, 0));
} else {
// Fallback and non-NEON path.
for (int j = 0; j < dec_->nweights; j++) {
LDRH(tempReg1, srcReg, dec_->weightoff + j * 2);
VMOV(fpScratchReg, tempReg1);
VCVT(fpScratchReg, fpScratchReg, TO_FLOAT);
MOVI2F(fpScratchReg2, 1.0f / 32768.0f, scratchReg);
VMUL(weightRegs[j], fpScratchReg, fpScratchReg2);
}
}
Jit_ApplyWeights();
}
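The u16 variant follows the same pattern with by32768 (1/32768); since three or four u16 weights span more than 32 bits, they arrive via two GPR loads:
// Sketch of what the u16 NEON path computes:
for (int j = 0; j < nweights; j++)
	weights[j] = weightPtr[j] * (1.0f / 32768.0f);
// Weights 0-1 land in scratchReg and weights 2-3 in scratchReg2; the two VMOVs
// place them as four u16 lanes in the low half of the scratch quad before
// VMOVL/VCVT widen and convert them.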
void VertexDecoderJitCache::Jit_WeightsFloatSkin() {
// No need to zero skinMatrix, we'll just STR to it in the first lap,
// then VLDR/VADD/VSTR in subsequent laps.
// TODO: NEON-ize (barely worth)
for (int j = 0; j < dec_->nweights; j++) {
VLDR(weightRegs[j], srcReg, dec_->weightoff + j * 4);
}
Jit_ApplyWeights();
}
@@ -671,27 +780,39 @@ void VertexDecoderJitCache::Jit_NormalFloatSkin() {
}
void VertexDecoderJitCache::Jit_WriteMatrixMul(int outOff, bool pos) {
MOVI2R(tempReg1, (u32)skinMatrix, scratchReg);
for (int i = 0; i < 3; i++) {
VLDR(fpScratchReg, tempReg1, 4 * i);
VMUL(acc[i], fpScratchReg, src[0]);
}
for (int i = 0; i < 3; i++) {
VLDR(fpScratchReg, tempReg1, 12 + 4 * i);
VMLA(acc[i], fpScratchReg, src[1]);
}
for (int i = 0; i < 3; i++) {
VLDR(fpScratchReg, tempReg1, 24 + 4 * i);
VMLA(acc[i], fpScratchReg, src[2]);
}
if (pos) {
for (int i = 0; i < 3; i++) {
VLDR(fpScratchReg, tempReg1, 36 + 4 * i);
VADD(acc[i], acc[i], fpScratchReg);
if (NEONSkinning) {
// Multiply with the matrix sitting in Q4-Q7.
ADD(scratchReg, dstReg, outOff);
VMUL_scalar(F_32, accNEON, Q4, QScalar(srcNEON, 0));
VMLA_scalar(F_32, accNEON, Q5, QScalar(srcNEON, 1));
VMLA_scalar(F_32, accNEON, Q6, QScalar(srcNEON, 2));
if (pos) {
VADD(F_32, accNEON, accNEON, Q7);
}
VST1(F_32, accNEON, scratchReg, 2);
} else {
MOVI2R(tempReg1, (u32)skinMatrix, scratchReg);
for (int i = 0; i < 3; i++) {
VLDR(fpScratchReg, tempReg1, 4 * i);
VMUL(acc[i], fpScratchReg, src[0]);
}
for (int i = 0; i < 3; i++) {
VLDR(fpScratchReg, tempReg1, 12 + 4 * i);
VMLA(acc[i], fpScratchReg, src[1]);
}
for (int i = 0; i < 3; i++) {
VLDR(fpScratchReg, tempReg1, 24 + 4 * i);
VMLA(acc[i], fpScratchReg, src[2]);
}
if (pos) {
for (int i = 0; i < 3; i++) {
VLDR(fpScratchReg, tempReg1, 36 + 4 * i);
VADD(acc[i], acc[i], fpScratchReg);
}
}
for (int i = 0; i < 3; i++) {
VSTR(acc[i], dstReg, outOff + i * 4);
}
}
for (int i = 0; i < 3; i++) {
VSTR(acc[i], dstReg, outOff + i * 4);
}
}
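For reference, the NEON branch performs the same transform as the VFP loop in the else branch, one column at a time against the blended matrix in Q4-Q7. A sketch (out, src and blended are illustrative names; note that the VST1 writes the full 16-byte quad, so the fourth lane is stored as well):
// Sketch of the transform done by the NEON path:
for (int i = 0; i < 3; i++) {
	out[i] = blended[0 + i] * src[0]     // column 0 (Q4) * x
	       + blended[4 + i] * src[1]     // column 1 (Q5) * y
	       + blended[8 + i] * src[2];    // column 2 (Q6) * z
	if (pos)
		out[i] += blended[12 + i];       // translation column (Q7), positions only
}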