ARM: Use PLD (cache preload) in vertex decoder loop.

This commit is contained in:
Henrik Rydgard 2013-11-24 15:08:47 +01:00
parent f650b23c90
commit dfea160491
4 changed files with 23 additions and 5 deletions

View File

@ -801,6 +801,17 @@ void ARMXEmitter::CLZ(ARMReg rd, ARMReg rm)
Write32(condition | (0x16F << 16) | (rd << 12) | (0xF1 << 4) | rm);
}
// Emits an ARM PLD (immediate) cache preload hint for the address [rn, #offset].
//   rn       - base register of the address to preload.
//   offset   - signed byte offset. The instruction stores the magnitude in a
//              12-bit immediate and the sign in the U bit, so the encodable
//              range is -4095..4095.
//   forWrite - when true, the R bit is cleared, which per the ARM ARM selects
//              the preload-for-write form (PLDW) instead of plain PLD.
void ARMXEmitter::PLD(ARMReg rn, int offset, bool forWrite) {
	// The immediate field really is 12 bits wide, so accept the full range the
	// message advertises rather than rejecting valid offsets above 0x3fe.
	_dbg_assert_msg_(JIT, offset <= 0xfff && offset >= -0xfff, "PLD: Max 12 bits of offset allowed");

	// U = 1 means the offset is added to rn, U = 0 means it is subtracted.
	const bool U = offset >= 0;
	// Mask the magnitude so that, when asserts are compiled out, an oversized
	// offset cannot spill into the register/opcode fields of the instruction.
	const u32 imm12 = (u32)(U ? offset : -offset) & 0xfff;
	const bool R = !forWrite;
	// Conditions not allowed: the top nibble is fixed at 0xF (unconditional space).
	// Unsigned constants keep the 0xF5 << 24 shift out of signed-overflow territory.
	Write32((0xF5u << 24) | ((u32)U << 23) | ((u32)R << 22) | (1u << 20) | ((u32)rn << 16) | (0xFu << 12) | imm12);
}
void ARMXEmitter::BFI(ARMReg rd, ARMReg rn, u8 lsb, u8 width)
{
u32 msb = (lsb + width - 1);

View File

@ -541,6 +541,7 @@ public:
void BFI(ARMReg rd, ARMReg rn, u8 lsb, u8 width);
void UBFX(ARMReg dest, ARMReg op2, u8 lsb, u8 width);
void CLZ(ARMReg rd, ARMReg rm);
// Emits a PLD (cache preload hint) for the address rn + offset.
// forWrite selects the preload-for-write form of the instruction.
void PLD(ARMReg rn, int offset, bool forWrite = false);
// Using just MSR here messes with our defines on the PPC side of stuff (when this code was in dolphin...)
// Just need to put an underscore here, bit annoying.

View File

@ -1058,6 +1058,8 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec) {
}
JumpTarget loopStart = GetCodePtr();
// Preload data cache ahead of reading. TODO: Experiment with the offset.
PLD(srcReg, 64);
for (int i = 0; i < dec.numSteps_; i++) {
if (!CompileStep(dec, i)) {
// Reset the code ptr and return zero to indicate that we failed.
@ -1265,13 +1267,14 @@ void VertexDecoderJitCache::Jit_TcU16ThroughDouble() {
}
void VertexDecoderJitCache::Jit_TcU8Prescale() {
if (false && cpu_info.bNEON) {
if (cpu_info.bNEON) {
// TODO: Needs testing
ADD(scratchReg, srcReg, dec_->tcoff);
VLD1_lane(I_16, neonScratchReg, scratchReg, 0, false);
VMOVL(I_8 | I_UNSIGNED, neonScratchRegQ, neonScratchReg); // Widen to 32-bit
VMOVL(I_8 | I_UNSIGNED, neonScratchRegQ, neonScratchReg); // Widen to 16-bit
VMOVL(I_16 | I_UNSIGNED, neonScratchRegQ, neonScratchReg); // Widen to 32-bit
VCVT(F_32 | I_UNSIGNED, neonScratchRegQ, neonScratchRegQ);
ADD(scratchReg2, dstReg, dec_->decFmt.uvoff);
VMUL(F_32, neonScratchReg, neonScratchReg, neonUVScaleReg);
VADD(F_32, neonScratchReg, neonScratchReg, neonUVOffsetReg);
VST1(F_32, neonScratchReg, scratchReg2, 1, ALIGN_NONE);
@ -1294,12 +1297,13 @@ void VertexDecoderJitCache::Jit_TcU8Prescale() {
}
void VertexDecoderJitCache::Jit_TcU16Prescale() {
if (false && cpu_info.bNEON) {
if (cpu_info.bNEON) {
// TODO: Needs testing
ADD(scratchReg, srcReg, dec_->tcoff);
VLD1_lane(I_32, neonScratchReg, scratchReg, 0, false);
VMOVL(I_16 | I_UNSIGNED, neonScratchRegQ, neonScratchReg); // Widen to 32-bit
VCVT(F_32 | I_UNSIGNED, neonScratchRegQ, neonScratchRegQ);
ADD(scratchReg2, dstReg, dec_->decFmt.uvoff);
VMUL(F_32, neonScratchReg, neonScratchReg, neonUVScaleReg);
VADD(F_32, neonScratchReg, neonScratchReg, neonUVOffsetReg);
VST1(F_32, neonScratchReg, scratchReg2, 1, ALIGN_NONE);

View File

@ -52,8 +52,6 @@ void TestCode::Generate()
VLD1_all_lanes(F_32, Q2, R1, true);
ADD(R0, R0, 12);
VLD1_lane(F_32, D4, R0, 1, true);
u32 word = *(u32 *)(GetCodePtr() - 4);
ILOG("Instruction Word: %08x", word);
// VMUL(F_32, Q2, Q0, Q1);
VST1(F_32, D4, R2, 2);
*/
@ -70,6 +68,10 @@ void TestCode::Generate()
VST1(I_32, D2, R1, 2);
VST1(I_32, D4, R2, 2);
VST1(I_32, D6, R3, 2);
PLD(R1, 32);
u32 word = *(u32 *)(GetCodePtr() - 4);
ILOG("Instruction Word: %08x", word);
// This works!