ARM vtx dec: Avoid all unaligned accesses entirely.

There's so much contradictory information about ARM's support for unaligned
accesses, and about their performance, that it seems safest to avoid them entirely.
Henrik Rydgård 2013-11-06 12:17:41 +01:00
parent 1e158fa652
commit b3fdfc01c8

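The change is mechanical throughout the diff: every word load (LDR) from vertex data that is only guaranteed 1- or 2-byte alignment is replaced by byte (LDRB) or halfword (LDRH) loads merged together with shifted ORRs, since those narrower loads match the elements' natural alignment. A minimal C++ sketch of the two merge patterns (function names are illustrative, not from the codebase):

#include <cstdint>

// Two halfword loads merged with a shifted OR -- the LDRH/LDRH/ORR
// pattern below. Safe whenever p is 2-byte aligned, which u16 vertex
// elements always are.
static uint32_t load_u32_via_halves(const uint16_t *p) {
	return uint32_t(p[0]) | (uint32_t(p[1]) << 16);
}

// Three byte loads merged into a word whose top byte stays zero -- the
// LDRB/LDRB/LDRB/ORR/ORR pattern used for the "3 bytes and then a zero"
// copies below. Safe at any alignment.
static uint32_t load_u24_via_bytes(const uint8_t *p) {
	return uint32_t(p[0]) | (uint32_t(p[1]) << 8) | (uint32_t(p[2]) << 16);
}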

@@ -936,7 +936,9 @@ void VertexDecoderJitCache::Jit_TcU8() {
 }
 
 void VertexDecoderJitCache::Jit_TcU16() {
-	LDR(tempReg1, srcReg, dec_->tcoff);
+	LDRH(tempReg1, srcReg, dec_->tcoff);
+	LDRH(tempReg2, srcReg, dec_->tcoff + 2);
+	ORR(tempReg1, tempReg1, Operand2(tempReg2, ST_LSL, 16));
 	STR(tempReg1, dstReg, dec_->decFmt.uvoff);
 }
 
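Read as C++ (a sketch; src, tcoff, dst and uvoff stand in for srcReg, dec_->tcoff, dstReg and dec_->decFmt.uvoff), the new Jit_TcU16 body fetches the two u16 texture coordinates separately and repacks them into the word the old single LDR used to fetch in one potentially unaligned access:

#include <cstdint>
#include <cstring>

static void decode_tc_u16(const uint8_t *src, int tcoff,
                          uint8_t *dst, int uvoff) {
	uint16_t u, v;
	memcpy(&u, src + tcoff, 2);                       // LDRH tempReg1
	memcpy(&v, src + tcoff + 2, 2);                   // LDRH tempReg2
	uint32_t uv = uint32_t(u) | (uint32_t(v) << 16);  // ORR ..., LSL #16
	memcpy(dst + uvoff, &uv, 4);                      // STR (unchanged)
}

The store side is untouched: the decoded-vertex buffer appears to be word-aligned, as the unchanged STR implies.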
@@ -948,7 +950,9 @@ void VertexDecoderJitCache::Jit_TcFloat() {
 }
 
 void VertexDecoderJitCache::Jit_TcU16Through() {
-	LDR(tempReg1, srcReg, dec_->tcoff);  // possibly unaligned access
+	LDRH(tempReg1, srcReg, dec_->tcoff);
+	LDRH(tempReg2, srcReg, dec_->tcoff + 2);
+	ORR(tempReg1, tempReg1, Operand2(tempReg2, ST_LSL, 16));
 	STR(tempReg1, dstReg, dec_->decFmt.uvoff);
 }
 
@@ -1033,23 +1037,33 @@ void VertexDecoderJitCache::Jit_Color5551() {
 	STR(tempReg2, dstReg, dec_->decFmt.c0off);
 }
 
-// Copy 3 bytes and then a zero. Might as well copy four.
 void VertexDecoderJitCache::Jit_NormalS8() {
-	LDR(tempReg1, srcReg, dec_->nrmoff);
-	ANDI2R(tempReg1, tempReg1, 0x00FFFFFF, scratchReg);
+	LDRB(tempReg1, srcReg, dec_->nrmoff);
+	LDRB(tempReg2, srcReg, dec_->nrmoff + 1);
+	LDRB(tempReg3, srcReg, dec_->nrmoff + 2);
+	ORR(tempReg1, tempReg1, Operand2(tempReg2, ST_LSL, 8));
+	ORR(tempReg1, tempReg1, Operand2(tempReg3, ST_LSL, 16));
 	STR(tempReg1, dstReg, dec_->decFmt.nrmoff);
+
+	// Copy 3 bytes and then a zero. Might as well copy four.
+	// LDR(tempReg1, srcReg, dec_->nrmoff);
+	// ANDI2R(tempReg1, tempReg1, 0x00FFFFFF, scratchReg);
+	// STR(tempReg1, dstReg, dec_->decFmt.nrmoff);
 }
 
 // Copy 6 bytes and then 2 zeroes.
 void VertexDecoderJitCache::Jit_NormalS16() {
-	LDR(tempReg1, srcReg, dec_->nrmoff);
-	LDRH(tempReg2, srcReg, dec_->nrmoff + 4);
+	LDRH(tempReg1, srcReg, dec_->nrmoff);
+	LDRH(tempReg2, srcReg, dec_->nrmoff + 2);
+	LDRH(tempReg3, srcReg, dec_->nrmoff + 4);
+	ORR(tempReg1, tempReg1, Operand2(tempReg2, ST_LSL, 16));
 	STR(tempReg1, dstReg, dec_->decFmt.nrmoff);
-	STR(tempReg2, dstReg, dec_->decFmt.nrmoff + 4);
+	STR(tempReg3, dstReg, dec_->decFmt.nrmoff + 4);
 }
 
 void VertexDecoderJitCache::Jit_NormalFloat() {
 	// Might not be aligned to 4, so we can't use LDMIA.
+	// Actually - not true: This will always be aligned. TODO
 	LDR(tempReg1, srcReg, dec_->nrmoff);
 	LDR(tempReg2, srcReg, dec_->nrmoff + 4);
 	LDR(tempReg3, srcReg, dec_->nrmoff + 8);
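The two normal decoders are instances of the patterns sketched above. A hedged C++ rendering of what the new sequences compute (names and signatures are illustrative):

#include <cstdint>
#include <cstring>

// Jit_NormalS8: three byte loads merged into a word with a zero top
// byte, replacing the old word load + AND with 0x00FFFFFF.
static void decode_normal_s8(const uint8_t *src, uint8_t *dst) {
	uint32_t n = uint32_t(src[0])
	           | (uint32_t(src[1]) << 8)
	           | (uint32_t(src[2]) << 16);  // byte 3 stays zero
	memcpy(dst, &n, 4);
}

// Jit_NormalS16: "6 bytes and then 2 zeroes" -- x and y pack into one
// word; z is zero-extended (as LDRH does) and stored as the second word.
static void decode_normal_s16(const uint8_t *src, uint8_t *dst) {
	uint16_t x, y, z;
	memcpy(&x, src, 2);
	memcpy(&y, src + 2, 2);
	memcpy(&z, src + 4, 2);
	uint32_t xy = uint32_t(x) | (uint32_t(y) << 16);
	uint32_t zw = z;  // upper halfword is the two zero bytes
	memcpy(dst, &xy, 4);
	memcpy(dst + 4, &zw, 4);
}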
@@ -1061,9 +1075,12 @@ void VertexDecoderJitCache::Jit_NormalFloat() {
 // Through expands into floats, always. Might want to look at changing this.
 void VertexDecoderJitCache::Jit_PosS8Through() {
 	// TODO: SIMD
+	LDRSB(tempReg1, srcReg, dec_->posoff);
+	LDRSB(tempReg2, srcReg, dec_->posoff + 1);
+	LDRSB(tempReg3, srcReg, dec_->posoff + 2);
+	static const ARMReg tr[3] = { tempReg1, tempReg2, tempReg3 };
 	for (int i = 0; i < 3; i++) {
-		LDRSB(tempReg1, srcReg, dec_->posoff + i);
-		VMOV(S0, tempReg1);
+		VMOV(S0, tr[i]);
 		VCVT(S0, S0, TO_FLOAT | IS_SIGNED);
 		VSTR(S0, dstReg, dec_->decFmt.posoff + i * 4);
 	}
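Worth noting for the two Through variants: the C++ for loop runs at code-generation time, so the emitted code was already unrolled; this change just hoists the three loads ahead of the VFP convert/store sequence, with the static tr[] array selecting which temp register each iteration uses. LDRSB has no alignment requirement at all, and the LDRSH variant's halfword loads match the s16 data's natural alignment. Roughly, the generated code now computes (illustrative name and signature):

#include <cstdint>

static void decode_pos_s8_through(const int8_t *src, float *dst) {
	// LDRSB x3: all sign-extending loads issued back to back.
	int32_t v[3] = { src[0], src[1], src[2] };
	for (int i = 0; i < 3; i++) {
		dst[i] = float(v[i]);  // VMOV + VCVT + VSTR per component
	}
}

Jit_PosS16Through, just below, is the same shape with LDRSH and a stride of 2.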
@@ -1071,10 +1088,13 @@ void VertexDecoderJitCache::Jit_PosS8Through() {
 
 // Through expands into floats, always. Might want to look at changing this.
 void VertexDecoderJitCache::Jit_PosS16Through() {
+	LDRSH(tempReg1, srcReg, dec_->posoff);
+	LDRSH(tempReg2, srcReg, dec_->posoff + 2);
+	LDRSH(tempReg3, srcReg, dec_->posoff + 4);
+	static const ARMReg tr[3] = { tempReg1, tempReg2, tempReg3 };
 	// TODO: SIMD
 	for (int i = 0; i < 3; i++) {
-		LDRSH(tempReg1, srcReg, dec_->posoff + i * 2);
-		VMOV(S0, tempReg1);
+		VMOV(S0, tr[i]);
 		VCVT(S0, S0, TO_FLOAT | IS_SIGNED);
 		VSTR(S0, dstReg, dec_->decFmt.posoff + i * 4);
 	}
@@ -1082,17 +1102,22 @@ void VertexDecoderJitCache::Jit_PosS16Through() {
 
 // Copy 3 bytes and then a zero. Might as well copy four.
 void VertexDecoderJitCache::Jit_PosS8() {
-	LDR(tempReg1, srcReg, dec_->posoff);
-	ANDI2R(tempReg1, tempReg1, 0x00FFFFFF, scratchReg);
+	LDRB(tempReg1, srcReg, dec_->posoff);
+	LDRB(tempReg2, srcReg, dec_->posoff + 1);
+	LDRB(tempReg3, srcReg, dec_->posoff + 2);
+	ORR(tempReg1, tempReg1, Operand2(tempReg2, ST_LSL, 8));
+	ORR(tempReg1, tempReg1, Operand2(tempReg3, ST_LSL, 16));
 	STR(tempReg1, dstReg, dec_->decFmt.posoff);
 }
 
 // Copy 6 bytes and then 2 zeroes.
 void VertexDecoderJitCache::Jit_PosS16() {
-	LDR(tempReg1, srcReg, dec_->posoff);
-	LDRH(tempReg2, srcReg, dec_->posoff + 4);
+	LDRH(tempReg1, srcReg, dec_->posoff);
+	LDRH(tempReg2, srcReg, dec_->posoff + 2);
+	LDRH(tempReg3, srcReg, dec_->posoff + 4);
+	ORR(tempReg1, tempReg1, Operand2(tempReg2, ST_LSL, 16));
 	STR(tempReg1, dstReg, dec_->decFmt.posoff);
-	STR(tempReg2, dstReg, dec_->decFmt.posoff + 4);
+	STR(tempReg3, dstReg, dec_->decFmt.posoff + 4);
 }
 
 // Just copy 12 bytes.