vertexjit: Try to avoid a few more VFP switches.

This commit is contained in:
Unknown W. Brackets 2014-03-21 21:17:41 -07:00
parent 58fe022ecd
commit 4c48031724
2 changed files with 7 additions and 19 deletions

View File

@ -1117,8 +1117,8 @@ void VertexDecoderJitCache::Jit_PosFloat() {
void VertexDecoderJitCache::Jit_NormalS8Skin() {
if (NEONSkinning) {
ADD(scratchReg, srcReg, dec_->nrmoff);
MOVI2F(S15, 1.0f/128.0f, scratchReg2);
VLD1_lane(I_32, neonScratchReg, scratchReg, 0, false);
MOVI2F(S15, 1.0f/128.0f, scratchReg);
VMOVL(I_8 | I_SIGNED, neonScratchRegQ, neonScratchReg); // Widen to 16-bit
VMOVL(I_16 | I_SIGNED, neonScratchRegQ, neonScratchReg); // Widen to 32-bit
VCVT(F_32 | I_SIGNED, neonScratchRegQ, neonScratchRegQ);
@ -1144,8 +1144,8 @@ void VertexDecoderJitCache::Jit_NormalS8Skin() {
void VertexDecoderJitCache::Jit_NormalS16Skin() {
if (NEONSkinning) {
ADD(scratchReg, srcReg, dec_->nrmoff);
MOVI2F(S15, 1.0f/32768, scratchReg2);
VLD1(I_32, neonScratchReg, scratchReg, 1, ALIGN_NONE);
MOVI2F(S15, 1.0f/32768, scratchReg);
VMOVL(I_16 | I_SIGNED, neonScratchRegQ, neonScratchReg); // Widen to 32-bit
VCVT(F_32 | I_SIGNED, neonScratchRegQ, neonScratchRegQ);
VMUL_scalar(F_32, srcNEON, neonScratchReg, QScalar(Q3, 3)); // S15
@ -1219,8 +1219,8 @@ void VertexDecoderJitCache::Jit_WriteMatrixMul(int outOff, bool pos) {
void VertexDecoderJitCache::Jit_PosS8Skin() {
if (NEONSkinning) {
ADD(scratchReg, srcReg, dec_->posoff);
MOVI2F(S15, 1.0f/128.0f, scratchReg2);
VLD1_lane(I_32, neonScratchReg, scratchReg, 0, false);
MOVI2F(S15, 1.0f/128.0f, scratchReg);
VMOVL(I_8 | I_SIGNED, neonScratchRegQ, neonScratchReg); // Widen to 16-bit
VMOVL(I_16 | I_SIGNED, neonScratchRegQ, neonScratchReg); // Widen to 32-bit
VCVT(F_32 | I_SIGNED, neonScratchRegQ, neonScratchRegQ);
@ -1246,8 +1246,8 @@ void VertexDecoderJitCache::Jit_PosS8Skin() {
void VertexDecoderJitCache::Jit_PosS16Skin() {
if (NEONSkinning) {
ADD(scratchReg, srcReg, dec_->posoff);
MOVI2F(S15, 1.0f/32768, scratchReg2);
VLD1(I_32, neonScratchReg, scratchReg, 1, ALIGN_NONE);
MOVI2F(S15, 1.0f/32768, scratchReg);
VMOVL(I_16 | I_SIGNED, neonScratchRegQ, neonScratchReg); // Widen to 32-bit
VCVT(F_32 | I_SIGNED, neonScratchRegQ, neonScratchRegQ);
VMUL_scalar(F_32, srcNEON, neonScratchReg, QScalar(Q3, 3)); // S15

View File

@ -361,11 +361,7 @@ void VertexDecoderJitCache::Jit_WeightsFloat() {
}
void VertexDecoderJitCache::Jit_WeightsU8Skin() {
#ifdef _M_X64
MOV(PTRBITS, R(tempReg2), Imm64((uintptr_t)&bones));
#else
MOV(PTRBITS, R(tempReg2), Imm32((uintptr_t)&bones));
#endif
MOV(PTRBITS, R(tempReg2), ImmPtr(&bones));
for (int j = 0; j < dec_->nweights; j++) {
MOVZX(32, 8, tempReg1, MDisp(srcReg, dec_->weightoff + j));
CVTSI2SS(XMM1, R(tempReg1));
@ -399,11 +395,7 @@ void VertexDecoderJitCache::Jit_WeightsU8Skin() {
}
void VertexDecoderJitCache::Jit_WeightsU16Skin() {
#ifdef _M_X64
MOV(PTRBITS, R(tempReg2), Imm64((uintptr_t)&bones));
#else
MOV(PTRBITS, R(tempReg2), Imm32((uintptr_t)&bones));
#endif
MOV(PTRBITS, R(tempReg2), ImmPtr(&bones));
for (int j = 0; j < dec_->nweights; j++) {
MOVZX(32, 16, tempReg1, MDisp(srcReg, dec_->weightoff + j * 2));
CVTSI2SS(XMM1, R(tempReg1));
@ -437,11 +429,7 @@ void VertexDecoderJitCache::Jit_WeightsU16Skin() {
}
void VertexDecoderJitCache::Jit_WeightsFloatSkin() {
#ifdef _M_X64
MOV(PTRBITS, R(tempReg2), Imm64((uintptr_t)&bones));
#else
MOV(PTRBITS, R(tempReg2), Imm32((uintptr_t)&bones));
#endif
MOV(PTRBITS, R(tempReg2), ImmPtr(&bones));
for (int j = 0; j < dec_->nweights; j++) {
MOVSS(XMM1, MDisp(srcReg, dec_->weightoff + j * 4));
SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(0, 0, 0, 0));