ARM: Add NEON widening and narrowing moves, and float/int convert.

Experiment a little in the vertex decoder.
Henrik Rydgard 2013-11-24 13:29:56 +01:00
parent 52d4ede2f6
commit f650b23c90
8 changed files with 177 additions and 46 deletions

View File

@@ -2325,7 +2325,7 @@ void ARMXEmitter::VSWP(ARMReg Vd, ARMReg Vm)
}
void ARMXEmitter::VTRN(u32 Size, ARMReg Vd, ARMReg Vm)
{
_dbg_assert_msg_(JIT, Vd >= Q0, "Pass invalid register to " __FUNCTION__);
_dbg_assert_msg_(JIT, Vd >= D0, "Pass invalid register to " __FUNCTION__);
_dbg_assert_msg_(JIT, cpu_info.bNEON, "Can't use " __FUNCTION__ " when CPU doesn't support it");
bool register_quad = Vd >= Q0;
@@ -2335,7 +2335,7 @@ void ARMXEmitter::VTRN(u32 Size, ARMReg Vd, ARMReg Vm)
}
void ARMXEmitter::VTST(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
{
_dbg_assert_msg_(JIT, Vd >= Q0, "Pass invalid register to " __FUNCTION__);
_dbg_assert_msg_(JIT, Vd >= D0, "Pass invalid register to " __FUNCTION__);
_dbg_assert_msg_(JIT, cpu_info.bNEON, "Can't use " __FUNCTION__ " when CPU doesn't support it");
bool register_quad = Vd >= Q0;
@@ -2345,7 +2345,7 @@ void ARMXEmitter::VTST(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
}
void ARMXEmitter::VUZP(u32 Size, ARMReg Vd, ARMReg Vm)
{
_dbg_assert_msg_(JIT, Vd >= Q0, "Pass invalid register to " __FUNCTION__);
_dbg_assert_msg_(JIT, Vd >= D0, "Pass invalid register to " __FUNCTION__);
_dbg_assert_msg_(JIT, cpu_info.bNEON, "Can't use " __FUNCTION__ " when CPU doesn't support it");
bool register_quad = Vd >= Q0;
@@ -2355,7 +2355,7 @@ void ARMXEmitter::VUZP(u32 Size, ARMReg Vd, ARMReg Vm)
}
void ARMXEmitter::VZIP(u32 Size, ARMReg Vd, ARMReg Vm)
{
_dbg_assert_msg_(JIT, Vd >= Q0, "Pass invalid register to " __FUNCTION__);
_dbg_assert_msg_(JIT, Vd >= D0, "Pass invalid register to " __FUNCTION__);
_dbg_assert_msg_(JIT, cpu_info.bNEON, "Can't use " __FUNCTION__ " when CPU doesn't support it");
bool register_quad = Vd >= Q0;
@@ -2364,6 +2364,45 @@ void ARMXEmitter::VZIP(u32 Size, ARMReg Vd, ARMReg Vm)
(0x18 << 4) | (register_quad << 6) | EncodeVm(Vm));
}
void ARMXEmitter::VMOVL(u32 Size, ARMReg Vd, ARMReg Vm)
{
_dbg_assert_msg_(JIT, Vd >= Q0, "Pass invalid register to " __FUNCTION__);
_dbg_assert_msg_(JIT, Vm >= D0 && Vm <= D31, "Pass invalid register to " __FUNCTION__);
_dbg_assert_msg_(JIT, cpu_info.bNEON, "Can't use " __FUNCTION__ " when CPU doesn't support it");
_dbg_assert_msg_(JIT, (Size & (I_UNSIGNED | I_SIGNED)) != 0, "Must specify I_SIGNED or I_UNSIGNED in VMOVL");
bool unsign = (Size & I_UNSIGNED) != 0;
int imm3 = 0;
if (Size & I_8) imm3 = 1;
if (Size & I_16) imm3 = 2;
if (Size & I_32) imm3 = 4;
Write32((0xF2 << 24) | (unsign << 24) | (1 << 23) | (imm3 << 19) | EncodeVd(Vd) | \
(0xA1 << 4) | EncodeVm(Vm));
}
void ARMXEmitter::VMOVN(u32 Size, ARMReg Vd, ARMReg Vm)
{
_dbg_assert_msg_(JIT, Vm >= Q0, "Pass invalid register to " __FUNCTION__);
_dbg_assert_msg_(JIT, Vd >= D0 && Vd <= D31, "Pass invalid register to " __FUNCTION__);
_dbg_assert_msg_(JIT, cpu_info.bNEON, "Can't use " __FUNCTION__ " when CPU doesn't support it");
bool register_quad = Vd >= Q0;
Write32((0xF3B << 20) | (encodedSize(Size) << 18) | (1 << 17) | EncodeVd(Vd) | (1 << 9) | EncodeVm(Vm));
}
void ARMXEmitter::VCVT(u32 Size, ARMReg Vd, ARMReg Vm)
{
_dbg_assert_msg_(JIT, (Size & (I_UNSIGNED | I_SIGNED)) != 0, "Must specify I_SIGNED or I_UNSIGNED in VCVT NEON");
bool register_quad = Vd >= Q0;
bool toInteger = (Size & I_32) != 0;
bool isUnsigned = (Size & I_UNSIGNED) != 0;
int op = (toInteger << 1) | (int)isUnsigned;
Write32((0xF3 << 24) | (0xBB << 16) | EncodeVd(Vd) | (0x3 << 9) | (op << 7) | (register_quad << 6) | EncodeVm(Vm));
}
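As a quick orientation for the three new emitters, here is a minimal usage sketch (not part of this commit; the helper name, the register choices, and the assumption that R0/R1 hold source and destination pointers are all illustrative). It widens two unsigned bytes with VMOVL, converts them to float with the new vector VCVT, and stores the result, mirroring the vertex decoder experiment further down.

#include "Common/ArmEmitter.h"

// Sketch only: assumes an ArmGen::ARMXEmitter already set up to write to executable memory.
void EmitU8PairToFloat(ArmGen::ARMXEmitter &emit) {
	using namespace ArmGen;
	emit.VLD1_lane(I_16, D0, R0, 0, false);   // load two bytes as a single 16-bit lane from [R0]
	emit.VMOVL(I_8 | I_UNSIGNED, Q0, D0);     // u8  -> u16, low results land in D0 (low half of Q0)
	emit.VMOVL(I_16 | I_UNSIGNED, Q0, D0);    // u16 -> u32
	emit.VCVT(F_32 | I_UNSIGNED, Q0, Q0);     // u32 -> f32 (the new vector VCVT)
	emit.VST1(F_32, D0, R1, 1, ALIGN_NONE);   // store the two converted floats to [R1]
}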
static int RegCountToType(int nRegs, NEONAlignment align) {
switch (nRegs) {

View File

@@ -726,6 +726,15 @@ public:
void VREV32(u32 Size, ARMReg Vd, ARMReg Vm);
void VREV16(u32 Size, ARMReg Vd, ARMReg Vm);
// Widening and narrowing moves
void VMOVL(u32 Size, ARMReg Vd, ARMReg Vm);
void VMOVN(u32 Size, ARMReg Vd, ARMReg Vm);
// Vector VCVT
void VCVT(u32 DestSize, ARMReg Dest, ARMReg Src);
// Notes:
// Rm == _PC is interpreted as no offset, otherwise, effective address is sum of Rn and Rm
// Rm == R13 is interpreted as VLD1, .... [Rn]! Added a REG_UPDATE pseudo register.
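A hypothetical call-site sketch of the two addressing conventions described in the notes above (assuming the VLD1 overload used elsewhere in this commit, with alignment and Rm as trailing optional arguments):

#include "Common/ArmEmitter.h"

void AddressingExamples(ArmGen::ARMXEmitter &emit) {
	using namespace ArmGen;
	// Rm omitted (defaults to _PC): plain [R0] addressing, no offset, no writeback.
	emit.VLD1(F_32, D0, R0, 2);
	// Rm == REG_UPDATE (R13): emits the writeback form [R0]!, advancing R0 past the loaded data.
	emit.VLD1(F_32, D0, R0, 2, ALIGN_NONE, REG_UPDATE);
}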

View File

@@ -59,6 +59,10 @@ static const ARMReg *GetMIPSAllocationOrder(int &count) {
// With NEON, we have many more.
// In the future I plan to use S0-S7 (Q0-Q1) for FPU and S8 forwards (Q2-Q15, yes, 15) for VFPU.
// VFPU will use NEON to do SIMD and it will be awkward to mix with FPU.
// We should attempt to map scalars to low Q registers and wider things to high registers:
// the NEON instructions are all 2-vector or 4-vector and don't do scalar, and keeping scalars
// in the low registers lets us still use regular VFP instructions on them.
static const ARMReg allocationOrderNEON[] = {
// Reserve four temp registers. Useful when building quads until we really figure out
// how to do that best.

View File

@@ -48,7 +48,6 @@ struct FPURegMIPS {
// If loc == ML_MEM, it's back in its location in the CPU context struct.
};
class ArmRegCacheFPU
{
public:

View File

@@ -18,6 +18,7 @@
#include "base/basictypes.h"
#include "base/logging.h"
#include "Common/CPUDetect.h"
#include "Core/Config.h"
#include "Core/MemMap.h"
#include "GPU/ge_constants.h"
@@ -46,6 +47,19 @@ static float MEMORY_ALIGNED16(skinMatrix[12]);
// using SSE / NEON and store them here.
static float MEMORY_ALIGNED16(bones[16 * 8]);
// The rest will be dumped to bones as on x86.
// NEON register allocation:
// Q0: Texture scaling parameters
// Q1: Temp storage
// Q2: Vector-by-matrix accumulator
// Q3: Unused
//
// We'll use Q4-Q7 as the "matrix accumulator".
// First two matrices will be preloaded into Q8-Q11 and Q12-Q15 to reduce
// memory bandwidth requirements.
inline int align(int n, int align) {
return (n + (align - 1)) & ~(align - 1);
}
@@ -924,11 +938,18 @@ static const ARMReg counterReg = R2;
static const ARMReg fpScratchReg = S4;
static const ARMReg fpScratchReg2 = S5;
static const ARMReg fpScratchReg3 = S6;
static const ARMReg fpScratchReg4 = S7;
static const ARMReg fpUscaleReg = S0;
static const ARMReg fpVscaleReg = S1;
static const ARMReg fpUoffsetReg = S2;
static const ARMReg fpVoffsetReg = S3;
// Simpler aliases for NEON. Overlaps with corresponding VFP regs.
static const ARMReg neonUVScaleReg = D0;
static const ARMReg neonUVOffsetReg = D1;
static const ARMReg neonScratchReg = D2;
static const ARMReg neonScratchRegQ = Q1; // Overlaps with all the scratch regs
// Everything above S6 is fair game for skinning
// S8-S15 are used during matrix generation
@@ -1244,48 +1265,79 @@ void VertexDecoderJitCache::Jit_TcU16ThroughDouble() {
}
void VertexDecoderJitCache::Jit_TcU8Prescale() {
// TODO: SIMD
LDRB(tempReg1, srcReg, dec_->tcoff);
LDRB(tempReg2, srcReg, dec_->tcoff + 1);
VMOV(fpScratchReg, tempReg1);
VMOV(fpScratchReg2, tempReg2);
VCVT(fpScratchReg, fpScratchReg, TO_FLOAT);
VCVT(fpScratchReg2, fpScratchReg2, TO_FLOAT);
// Could replace VMUL + VADD with VMLA but would require 2 more regs as we don't want to destroy fp*offsetReg. Later.
VMUL(fpScratchReg, fpScratchReg, fpUscaleReg);
VMUL(fpScratchReg2, fpScratchReg2, fpVscaleReg);
VADD(fpScratchReg, fpScratchReg, fpUoffsetReg);
VADD(fpScratchReg2, fpScratchReg2, fpVoffsetReg);
VSTR(fpScratchReg, dstReg, dec_->decFmt.uvoff);
VSTR(fpScratchReg2, dstReg, dec_->decFmt.uvoff + 4);
if (false && cpu_info.bNEON) {
// TODO: Needs testing
ADD(scratchReg, srcReg, dec_->tcoff);
VLD1_lane(I_16, neonScratchReg, scratchReg, 0, false);
VMOVL(I_8 | I_UNSIGNED, neonScratchRegQ, neonScratchReg); // Widen to 16-bit
VMOVL(I_16 | I_UNSIGNED, neonScratchRegQ, neonScratchReg); // Widen to 32-bit
VCVT(F_32 | I_UNSIGNED, neonScratchRegQ, neonScratchRegQ);
VMUL(F_32, neonScratchReg, neonScratchReg, neonUVScaleReg);
VADD(F_32, neonScratchReg, neonScratchReg, neonUVOffsetReg);
ADD(scratchReg2, dstReg, dec_->decFmt.uvoff); // point scratchReg2 at the output UV pair
VST1(F_32, neonScratchReg, scratchReg2, 1, ALIGN_NONE);
} else {
// TODO: SIMD
LDRB(tempReg1, srcReg, dec_->tcoff);
LDRB(tempReg2, srcReg, dec_->tcoff + 1);
VMOV(fpScratchReg, tempReg1);
VMOV(fpScratchReg2, tempReg2);
VCVT(fpScratchReg, fpScratchReg, TO_FLOAT);
VCVT(fpScratchReg2, fpScratchReg2, TO_FLOAT);
// Could replace VMUL + VADD with VMLA but would require 2 more regs as we don't want to destroy fp*offsetReg. Later.
VMUL(fpScratchReg, fpScratchReg, fpUscaleReg);
VMUL(fpScratchReg2, fpScratchReg2, fpVscaleReg);
VADD(fpScratchReg, fpScratchReg, fpUoffsetReg);
VADD(fpScratchReg2, fpScratchReg2, fpVoffsetReg);
VSTR(fpScratchReg, dstReg, dec_->decFmt.uvoff);
VSTR(fpScratchReg2, dstReg, dec_->decFmt.uvoff + 4);
}
}
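For comparison, a plain C++ reference of what both branches above compute per vertex (the names below are illustrative, not from the commit): load the two unsigned bytes, convert to float, multiply by the prescale factors and add the offsets. The U16 and float variants that follow differ only in how the UV pair is loaded.

// Reference only: uscale/vscale and uoffset/voffset stand in for the values kept in
// fpUscaleReg/fpVscaleReg and fpUoffsetReg/fpVoffsetReg (D0/D1 in the NEON path).
static void TcU8PrescaleRef(const unsigned char *uv, float *out,
                            float uscale, float vscale,
                            float uoffset, float voffset) {
	out[0] = uv[0] * uscale + uoffset;
	out[1] = uv[1] * vscale + voffset;
}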
void VertexDecoderJitCache::Jit_TcU16Prescale() {
// TODO: SIMD
LDRH(tempReg1, srcReg, dec_->tcoff);
LDRH(tempReg2, srcReg, dec_->tcoff + 2);
VMOV(fpScratchReg, tempReg1);
VMOV(fpScratchReg2, tempReg2);
VCVT(fpScratchReg, fpScratchReg, TO_FLOAT);
VCVT(fpScratchReg2, fpScratchReg2, TO_FLOAT);
VMUL(fpScratchReg, fpScratchReg, fpUscaleReg);
VMUL(fpScratchReg2, fpScratchReg2, fpVscaleReg);
VADD(fpScratchReg, fpScratchReg, fpUoffsetReg);
VADD(fpScratchReg2, fpScratchReg2, fpVoffsetReg);
VSTR(fpScratchReg, dstReg, dec_->decFmt.uvoff);
VSTR(fpScratchReg2, dstReg, dec_->decFmt.uvoff + 4);
if (false && cpu_info.bNEON) {
// TODO: Needs testing
ADD(scratchReg, srcReg, dec_->tcoff);
VLD1_lane(I_32, neonScratchReg, scratchReg, 0, false);
VMOVL(I_16 | I_UNSIGNED, neonScratchRegQ, neonScratchReg); // Widen to 32-bit
VCVT(F_32 | I_UNSIGNED, neonScratchRegQ, neonScratchRegQ);
VMUL(F_32, neonScratchReg, neonScratchReg, neonUVScaleReg);
VADD(F_32, neonScratchReg, neonScratchReg, neonUVOffsetReg);
ADD(scratchReg2, dstReg, dec_->decFmt.uvoff); // point scratchReg2 at the output UV pair
VST1(F_32, neonScratchReg, scratchReg2, 1, ALIGN_NONE);
} else {
LDRH(tempReg1, srcReg, dec_->tcoff);
LDRH(tempReg2, srcReg, dec_->tcoff + 2);
VMOV(fpScratchReg, tempReg1);
VMOV(fpScratchReg2, tempReg2);
VCVT(fpScratchReg, fpScratchReg, TO_FLOAT);
VCVT(fpScratchReg2, fpScratchReg2, TO_FLOAT);
VMUL(fpScratchReg, fpScratchReg, fpUscaleReg);
VMUL(fpScratchReg2, fpScratchReg2, fpVscaleReg);
VADD(fpScratchReg, fpScratchReg, fpUoffsetReg);
VADD(fpScratchReg2, fpScratchReg2, fpVoffsetReg);
VSTR(fpScratchReg, dstReg, dec_->decFmt.uvoff);
VSTR(fpScratchReg2, dstReg, dec_->decFmt.uvoff + 4);
}
}
void VertexDecoderJitCache::Jit_TcFloatPrescale() {
// TODO: SIMD
VLDR(fpScratchReg, srcReg, dec_->tcoff);
VLDR(fpScratchReg2, srcReg, dec_->tcoff + 4);
VMUL(fpScratchReg, fpScratchReg, fpUscaleReg);
VMUL(fpScratchReg2, fpScratchReg2, fpVscaleReg);
VADD(fpScratchReg, fpScratchReg, fpUoffsetReg);
VADD(fpScratchReg2, fpScratchReg2, fpVoffsetReg);
VSTR(fpScratchReg, dstReg, dec_->decFmt.uvoff);
VSTR(fpScratchReg2, dstReg, dec_->decFmt.uvoff + 4);
if (cpu_info.bNEON) {
ADD(scratchReg, srcReg, dec_->tcoff);
VLD1(F_32, neonScratchReg, scratchReg, 1, ALIGN_NONE);
ADD(scratchReg2, dstReg, dec_->decFmt.uvoff);
VMUL(F_32, neonScratchReg, neonScratchReg, neonUVScaleReg);
VADD(F_32, neonScratchReg, neonScratchReg, neonUVOffsetReg);
VST1(F_32, neonScratchReg, scratchReg2, 1, ALIGN_NONE);
} else {
// TODO: SIMD
VLDR(fpScratchReg, srcReg, dec_->tcoff);
VLDR(fpScratchReg2, srcReg, dec_->tcoff + 4);
VMUL(fpScratchReg, fpScratchReg, fpUscaleReg);
VMUL(fpScratchReg2, fpScratchReg2, fpVscaleReg);
VADD(fpScratchReg, fpScratchReg, fpUoffsetReg);
VADD(fpScratchReg2, fpScratchReg2, fpVoffsetReg);
VSTR(fpScratchReg, dstReg, dec_->decFmt.uvoff);
VSTR(fpScratchReg2, dstReg, dec_->decFmt.uvoff + 4);
}
}
void VertexDecoderJitCache::Jit_Color8888() {

View File

@@ -73,8 +73,6 @@ private:
UI::EventReturn OnLoadState(UI::EventParams &e);
UI::EventReturn OnRewind(UI::EventParams &e);
UI::EventReturn OnLanguageChange(UI::EventParams &e);
UI::EventReturn OnStateSelected(UI::EventParams &e);
UI::EventReturn OnCwCheat(UI::EventParams &e);

View File

@@ -1,6 +1,6 @@
APP_STL := gnustl_static
#APP_ABI := armeabi-v7a x86
APP_ABI := armeabi-v7a armeabi x86
#APP_ABI := armeabi-v7a
#APP_ABI := armeabi-v7a armeabi x86
APP_ABI := armeabi-v7a
APP_GNUSTL_CPP_FEATURES :=
NDK_TOOLCHAIN_VERSION := 4.8

View File

@@ -2,6 +2,7 @@
#include "ArmEmitterTest.h"
#include "Common/ArmEmitter.h"
#include "Common/CPUDetect.h"
static bool functionWasCalled;
@@ -26,6 +27,9 @@ static float a[4] = {1.0f, 2.0f, 3.0f, 4.5f};
static float b[4] = {1.0f, 1.0f, 1.0f, 0.5f};
static float c[4] = {0.0f, 0.0f, 0.0f, 0.0f};
static u32 x[4] = {0x04030201, 0x08070605, 0x0, 0x0};
static u32 y[4] = {0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF};
static u32 z[4] = {0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF};
void TestCode::Generate()
{
@@ -34,6 +38,7 @@ void TestCode::Generate()
PUSH(2, R11, _LR);
// Load the three pointers
/*
MOVP2R(R0, a);
MOVP2R(R1, b);
MOVP2R(R2, c);
@@ -43,10 +48,28 @@
VLD1(F_32, D2, R1, 2); // Load another 2 doubles
// VADD(F_32, Q2, Q0, Q1); // Add them, seeing them as floating point quads
VMUL_scalar(F_32, Q2, Q0, DScalar(D3, 1)); // Multiply a quad by a scalar (ultra efficient for matrix mul! limitation: Scalar has to come out of D0-D15)
ADD(R1, R1, 12);
VLD1_all_lanes(F_32, Q2, R1, true);
ADD(R0, R0, 12);
VLD1_lane(F_32, D4, R0, 1, true);
u32 word = *(u32 *)(GetCodePtr() - 4);
ILOG("Instruction Word: %08x", word);
// VMUL(F_32, Q2, Q0, Q1);
VST1(F_32, D4, R2, 2);
*/
// Let's try some integer stuff
MOVP2R(R0, x);
MOVP2R(R1, y);
MOVP2R(R2, z);
MOVP2R(R3, c);
VLD1(I_32, D0, R0, 1); // Load 1 double
VMOVL(I_8 | I_UNSIGNED, Q1, D0);
VMOVL(I_16 | I_UNSIGNED, Q2, D2);
VCVT(F_32 | I_SIGNED, Q3, Q2);
VST1(I_32, D2, R1, 2);
VST1(I_32, D4, R2, 2);
VST1(I_32, D6, R3, 2);
// This works!
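For reference, a plain C++ cross-check of the integer sequence above, with the values it should leave behind (hand-derived here, not output captured from the commit; assumes ARM's little-endian byte layout):

#include <cstdint>
#include <cstring>

static void ReferenceWiden(const uint32_t *x, uint32_t *y, uint32_t *z, float *c) {
	const uint8_t *bytes = (const uint8_t *)x;   // D0 after the VLD1: bytes 01 02 03 04 05 06 07 08
	uint16_t w16[8];
	for (int i = 0; i < 8; i++)
		w16[i] = bytes[i];                       // VMOVL I_8 | I_UNSIGNED: u8 -> u16 into Q1 (D2:D3)
	memcpy(y, w16, 16);                          // VST1 of D2,D3 into y
	for (int i = 0; i < 4; i++)
		z[i] = w16[i];                           // VMOVL I_16 | I_UNSIGNED of D2: u16 -> u32 into Q2 (D4:D5)
	for (int i = 0; i < 4; i++)
		c[i] = (float)(int32_t)z[i];             // VCVT F_32 | I_SIGNED: s32 -> f32 into Q3 (D6:D7)
}
// With x = {0x04030201, 0x08070605, 0, 0} this yields
// y = {0x00020001, 0x00040003, 0x00060005, 0x00080007}, z = {1, 2, 3, 4}, c = {1.0f, 2.0f, 3.0f, 4.0f}.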
@@ -92,6 +115,10 @@ void ArmEmitterTest()
// Disabled for now.
return;
// If I commit with it enabled by accident, let's not blow up.
if (!cpu_info.bNEON)
return;
for (int i = 0; i < 6; i++) {
ILOG("--------------------------");
}
@@ -106,6 +133,9 @@
u32 retval = CallPtr(gen.testCodePtr);
// ILOG("ARM emitter test 1 passed if %f == 3.0! retval = %08x", abc[32 + 31], retval);
ILOG("x: %08x %08x %08x %08x", x[0], x[1], x[2], x[3]);
ILOG("y: %08x %08x %08x %08x", y[0], y[1], y[2], y[3]);
ILOG("z: %08x %08x %08x %08x", z[0], z[1], z[2], z[3]);
ILOG("c: %f %f %f %f", c[0], c[1], c[2], c[3]);
for (int i = 0; i < 6; i++) {
ILOG("--------------------------");