From 61f5d3d360ef5d60ca94e685ec6f3e7971f744dd Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Sun, 23 Mar 2014 20:37:51 -0700 Subject: [PATCH 1/6] Initial stab at tracking vertex alpha. Not sure what efficient method to use on x86... --- GPU/GLES/VertexDecoder.cpp | 9 +++++++ GPU/GLES/VertexDecoder.h | 2 +- GPU/GLES/VertexDecoderArm.cpp | 49 ++++++++++++++++++++++++++++++----- GPU/GLES/VertexDecoderX86.cpp | 24 +++++++++++++++-- 4 files changed, 75 insertions(+), 9 deletions(-) diff --git a/GPU/GLES/VertexDecoder.cpp b/GPU/GLES/VertexDecoder.cpp index 7bcab4bc35..f8cae9348b 100644 --- a/GPU/GLES/VertexDecoder.cpp +++ b/GPU/GLES/VertexDecoder.cpp @@ -219,6 +219,7 @@ void VertexDecoder::Step_Color565() const c[1] = Convert6To8((cdata>>5) & 0x3f); c[2] = Convert5To8((cdata>>11) & 0x1f); c[3] = 255; + // Always full alpha. } void VertexDecoder::Step_Color5551() const @@ -229,6 +230,7 @@ void VertexDecoder::Step_Color5551() const c[1] = Convert5To8((cdata>>5) & 0x1f); c[2] = Convert5To8((cdata>>10) & 0x1f); c[3] = (cdata >> 15) ? 255 : 0; + gstate_c.vertexFullAlpha = gstate_c.vertexFullAlpha && c[3] != 0; } void VertexDecoder::Step_Color4444() const @@ -237,6 +239,7 @@ void VertexDecoder::Step_Color4444() const u16 cdata = *(u16*)(ptr_ + coloff); for (int j = 0; j < 4; j++) c[j] = Convert4To8((cdata >> (j * 4)) & 0xF); + gstate_c.vertexFullAlpha = gstate_c.vertexFullAlpha && c[3] == 255; } void VertexDecoder::Step_Color8888() const @@ -244,6 +247,7 @@ void VertexDecoder::Step_Color8888() const u8 *c = decoded_ + decFmt.c0off; const u8 *cdata = (const u8*)(ptr_ + coloff); memcpy(c, cdata, sizeof(u8) * 4); + gstate_c.vertexFullAlpha = gstate_c.vertexFullAlpha && c[3] == 255; } void VertexDecoder::Step_Color565Morph() const @@ -262,6 +266,7 @@ void VertexDecoder::Step_Color565Morph() const c[i] = (u8)col[i]; } c[3] = 255; + // Always full alpha. } void VertexDecoder::Step_Color5551Morph() const @@ -280,6 +285,7 @@ void VertexDecoder::Step_Color5551Morph() const for (int i = 0; i < 4; i++) { c[i] = (u8)col[i]; } + gstate_c.vertexFullAlpha = gstate_c.vertexFullAlpha && c[3] == 255; } void VertexDecoder::Step_Color4444Morph() const @@ -296,6 +302,7 @@ void VertexDecoder::Step_Color4444Morph() const for (int i = 0; i < 4; i++) { c[i] = (u8)col[i]; } + gstate_c.vertexFullAlpha = gstate_c.vertexFullAlpha && c[3] == 255; } void VertexDecoder::Step_Color8888Morph() const @@ -312,6 +319,7 @@ void VertexDecoder::Step_Color8888Morph() const for (int i = 0; i < 4; i++) { c[i] = (u8)(col[i]); } + gstate_c.vertexFullAlpha = gstate_c.vertexFullAlpha && c[3] == 255; } void VertexDecoder::Step_NormalS8() const @@ -841,6 +849,7 @@ void VertexDecoder::DecodeVerts(u8 *decodedptr, const void *verts, int indexLowe jitted_(ptr_, decoded_, count); } else { // Interpret the decode steps + // TODO: Init gstate_c.vertexFullAlpha here? Or in Setup? When is it reset? for (; count; count--) { for (int i = 0; i < numSteps_; i++) { ((*this).*steps_[i])(); diff --git a/GPU/GLES/VertexDecoder.h b/GPU/GLES/VertexDecoder.h index 92643bb5ae..dad0e142c3 100644 --- a/GPU/GLES/VertexDecoder.h +++ b/GPU/GLES/VertexDecoder.h @@ -266,6 +266,6 @@ private: bool CompileStep(const VertexDecoder &dec, int i); void Jit_ApplyWeights(); void Jit_WriteMatrixMul(int outOff, bool pos); - void Jit_WriteMorphColor(int outOff); + void Jit_WriteMorphColor(int outOff, bool checkAlpha = true); const VertexDecoder *dec_; }; diff --git a/GPU/GLES/VertexDecoderArm.cpp b/GPU/GLES/VertexDecoderArm.cpp index 0efba2780a..f702246bf1 100644 --- a/GPU/GLES/VertexDecoderArm.cpp +++ b/GPU/GLES/VertexDecoderArm.cpp @@ -61,7 +61,8 @@ static const ARMReg tempReg2 = R4; static const ARMReg tempReg3 = R5; static const ARMReg scratchReg = R6; static const ARMReg scratchReg2 = R7; -static const ARMReg scratchReg3 = R12; +static const ARMReg scratchReg3 = R8; +static const ARMReg hasAlphaReg = R12; static const ARMReg srcReg = R0; static const ARMReg dstReg = R1; static const ARMReg counterReg = R2; @@ -262,6 +263,10 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec) { // TODO: Preload scale factors } + if (dec.col) { + MOV(hasAlphaReg, 0); + } + JumpTarget loopStart = GetCodePtr(); // Preload data cache ahead of reading. This offset seems pretty good. PLD(srcReg, 64); @@ -281,6 +286,11 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec) { SUBS(counterReg, counterReg, 1); B_CC(CC_NEQ, loopStart); + // TODO: Do something with hasAlphaReg. + if (dec.col) { + + } + if (NEONSkinning || NEONMorphing) { VPOP(D8, 8); } @@ -664,7 +674,12 @@ void VertexDecoderJitCache::Jit_TcFloatPrescale() { void VertexDecoderJitCache::Jit_Color8888() { LDR(tempReg1, srcReg, dec_->coloff); + // Set flags to determine if alpha != 0xFF. + MVNS(tempReg2, Operand2(tempReg1, ST_ASR, 24)); STR(tempReg1, dstReg, dec_->decFmt.c0off); + SetCC(CC_NEQ); + ORR(hasAlphaReg, hasAlphaReg, IMM(1)); + SetCC(CC_AL); } void VertexDecoderJitCache::Jit_Color4444() { @@ -679,10 +694,16 @@ void VertexDecoderJitCache::Jit_Color4444() { ANDI2R(tempReg3, tempReg1, 0xF000, scratchReg); ORR(tempReg2, tempReg2, Operand2(tempReg3, ST_LSL, 12)); - // And saturate. + // And expand to 8 bits. ORR(tempReg1, tempReg2, Operand2(tempReg2, ST_LSL, 4)); STR(tempReg1, dstReg, dec_->decFmt.c0off); + + // Set flags to determine if alpha != 0xFF. + MVNS(tempReg2, Operand2(tempReg1, ST_ASR, 24)); + SetCC(CC_NEQ); + ORR(hasAlphaReg, hasAlphaReg, IMM(1)); + SetCC(CC_AL); } void VertexDecoderJitCache::Jit_Color565() { @@ -706,7 +727,7 @@ void VertexDecoderJitCache::Jit_Color565() { ORR(tempReg3, tempReg3, Operand2(tempReg1, ST_LSR, 4)); ORR(tempReg2, tempReg2, Operand2(tempReg3, ST_LSL, 8)); - // Add in full alpha. + // Add in full alpha. No need to update hasAlphaReg. ORI2R(tempReg1, tempReg2, 0xFF000000, scratchReg); STR(tempReg1, dstReg, dec_->decFmt.c0off); @@ -731,8 +752,13 @@ void VertexDecoderJitCache::Jit_Color5551() { // Now we just need alpha. Since we loaded as signed, it'll be extended. ANDI2R(tempReg1, tempReg1, 0xFF000000, scratchReg); ORR(tempReg2, tempReg2, tempReg1); - + + // Set flags to determine if alpha != 0xFF. + MVNS(tempReg3, Operand2(tempReg1, ST_ASR, 24)); STR(tempReg2, dstReg, dec_->decFmt.c0off); + SetCC(CC_NEQ); + ORR(hasAlphaReg, hasAlphaReg, IMM(1)); + SetCC(CC_AL); } void VertexDecoderJitCache::Jit_Color8888Morph() { @@ -957,7 +983,7 @@ void VertexDecoderJitCache::Jit_Color565Morph() { } else { VMOV(S11, tempReg3); } - Jit_WriteMorphColor(dec_->decFmt.c0off); + Jit_WriteMorphColor(dec_->decFmt.c0off, false); } // First is the left shift, second is the right shift (against walls, to get the RGBA values.) @@ -1045,13 +1071,16 @@ void VertexDecoderJitCache::Jit_Color5551Morph() { } // Expects RGBA color in S8 - S11, which is Q2. -void VertexDecoderJitCache::Jit_WriteMorphColor(int outOff) { +void VertexDecoderJitCache::Jit_WriteMorphColor(int outOff, bool checkAlpha) { if (NEONMorphing) { ADDI2R(tempReg1, dstReg, outOff, scratchReg); VCVT(I_32 | I_UNSIGNED, neonScratchRegQ, neonScratchRegQ); VQMOVN(I_32 | I_UNSIGNED, neonScratchReg, neonScratchRegQ); VQMOVN(I_16 | I_UNSIGNED, neonScratchReg, neonScratchRegQ); VST1_lane(I_32, neonScratchReg, tempReg1, 0, true); + if (checkAlpha) { + VMOV_neon(I_32, scratchReg, neonScratchReg, 0); + } } else { VCVT(S8, S8, TO_INT); VCVT(S9, S9, TO_INT); @@ -1066,6 +1095,14 @@ void VertexDecoderJitCache::Jit_WriteMorphColor(int outOff) { ORR(scratchReg, scratchReg, Operand2(tempReg3, ST_LSL, 24)); STR(scratchReg, dstReg, outOff); } + + // Set flags to determine if alpha != 0xFF. + if (checkAlpha) { + MVNS(tempReg2, Operand2(scratchReg, ST_ASR, 24)); + SetCC(CC_NEQ); + ORR(hasAlphaReg, hasAlphaReg, IMM(1)); + SetCC(CC_AL); + } } void VertexDecoderJitCache::Jit_NormalS8() { diff --git a/GPU/GLES/VertexDecoderX86.cpp b/GPU/GLES/VertexDecoderX86.cpp index 32223cf42e..d1128144f7 100644 --- a/GPU/GLES/VertexDecoderX86.cpp +++ b/GPU/GLES/VertexDecoderX86.cpp @@ -54,6 +54,7 @@ static const X64Reg tempReg3 = R10; static const X64Reg srcReg = RCX; static const X64Reg dstReg = RDX; static const X64Reg counterReg = R8; +static const OpArg hasAlphaArg = R(R14); #else static const X64Reg tempReg1 = RAX; static const X64Reg tempReg2 = R9; @@ -61,6 +62,7 @@ static const X64Reg tempReg3 = R10; static const X64Reg srcReg = RDI; static const X64Reg dstReg = RSI; static const X64Reg counterReg = RDX; +static const OpArg hasAlphaArg = R(R14); #endif #else static const X64Reg tempReg1 = EAX; @@ -69,6 +71,8 @@ static const X64Reg tempReg3 = EDX; static const X64Reg srcReg = ESI; static const X64Reg dstReg = EDI; static const X64Reg counterReg = ECX; +static u32 hasAlphaValue; +static const OpArg hasAlphaArg = M(&hasAlphaValue); #endif // XMM0-XMM5 are volatile on Windows X64 @@ -234,6 +238,10 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec) { UNPCKLPD(fpScaleOffsetReg, R(fpScratchReg)); } + if (dec.col) { + MOV(32, hasAlphaArg, Imm32(0)); + } + // Let's not bother with a proper stack frame. We just grab the arguments and go. JumpTarget loopStart = GetCodePtr(); for (int i = 0; i < dec.numSteps_; i++) { @@ -249,6 +257,11 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec) { SUB(32, R(counterReg), Imm8(1)); J_CC(CC_NZ, loopStart, true); + // TODO: Do something with hasAlphaArg from EAX. + if (dec.col) { + //MOV(32, R(EAX), hasAlphaArg); + } + MOVUPS(XMM4, MDisp(ESP, 0)); MOVUPS(XMM5, MDisp(ESP, 16)); MOVUPS(XMM6, MDisp(ESP, 32)); @@ -556,6 +569,7 @@ void VertexDecoderJitCache::Jit_TcFloatThrough() { void VertexDecoderJitCache::Jit_Color8888() { MOV(32, R(tempReg1), MDisp(srcReg, dec_->coloff)); MOV(32, MDisp(dstReg, dec_->decFmt.c0off), R(tempReg1)); + // TODO: hasAlphaArg. } static const u32 MEMORY_ALIGNED16(nibbles[4]) = { 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, }; @@ -625,6 +639,7 @@ void VertexDecoderJitCache::Jit_Color4444() { OR(32, R(tempReg2), R(tempReg3)); MOV(32, MDisp(dstReg, dec_->decFmt.c0off), R(tempReg2)); + // TODO: hasAlphaArg. } void VertexDecoderJitCache::Jit_Color565() { @@ -661,6 +676,7 @@ void VertexDecoderJitCache::Jit_Color565() { OR(32, R(tempReg2), R(tempReg1)); MOV(32, MDisp(dstReg, dec_->decFmt.c0off), R(tempReg2)); + // Never has alpha, no need to update hasAlphaArg. } void VertexDecoderJitCache::Jit_Color5551() { @@ -696,6 +712,7 @@ void VertexDecoderJitCache::Jit_Color5551() { OR(32, R(tempReg2), R(tempReg1)); MOV(32, MDisp(dstReg, dec_->decFmt.c0off), R(tempReg2)); + // TODO: hasAlphaArg. } void VertexDecoderJitCache::Jit_Color8888Morph() { @@ -825,7 +842,7 @@ void VertexDecoderJitCache::Jit_Color565Morph() { } } - Jit_WriteMorphColor(dec_->decFmt.c0off); + Jit_WriteMorphColor(dec_->decFmt.c0off, false); } // Intentionally in reverse order. @@ -884,12 +901,15 @@ void VertexDecoderJitCache::Jit_Color5551Morph() { Jit_WriteMorphColor(dec_->decFmt.c0off); } -void VertexDecoderJitCache::Jit_WriteMorphColor(int outOff) { +void VertexDecoderJitCache::Jit_WriteMorphColor(int outOff, bool checkAlpha) { // Pack back into a u32. CVTPS2DQ(fpScratchReg, R(fpScratchReg)); PACKSSDW(fpScratchReg, R(fpScratchReg)); PACKUSWB(fpScratchReg, R(fpScratchReg)); MOVD_xmm(MDisp(dstReg, outOff), fpScratchReg); + if (checkAlpha) { + // TODO: hasAlphaArg. + } } // Copy 3 bytes and then a zero. Might as well copy four. From 604160f60c90da4eba41961ce689de1e9e4aec2c Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Mon, 24 Mar 2014 01:05:01 -0700 Subject: [PATCH 2/6] Switch to fullAlphaReg not hasAlphaReg. --- GPU/GLES/VertexDecoderArm.cpp | 22 +++++++++++++--------- GPU/GLES/VertexDecoderX86.cpp | 22 +++++++++++----------- 2 files changed, 24 insertions(+), 20 deletions(-) diff --git a/GPU/GLES/VertexDecoderArm.cpp b/GPU/GLES/VertexDecoderArm.cpp index f702246bf1..619d0ebc1d 100644 --- a/GPU/GLES/VertexDecoderArm.cpp +++ b/GPU/GLES/VertexDecoderArm.cpp @@ -62,7 +62,7 @@ static const ARMReg tempReg3 = R5; static const ARMReg scratchReg = R6; static const ARMReg scratchReg2 = R7; static const ARMReg scratchReg3 = R8; -static const ARMReg hasAlphaReg = R12; +static const ARMReg fullAlphaReg = R12; static const ARMReg srcReg = R0; static const ARMReg dstReg = R1; static const ARMReg counterReg = R2; @@ -264,7 +264,8 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec) { } if (dec.col) { - MOV(hasAlphaReg, 0); + // Or LDB and skip the conditional? This is probably cheaper. + MOV(fullAlphaReg, 0xFF); } JumpTarget loopStart = GetCodePtr(); @@ -286,9 +287,12 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec) { SUBS(counterReg, counterReg, 1); B_CC(CC_NEQ, loopStart); - // TODO: Do something with hasAlphaReg. if (dec.col) { - + MOVP2R(tempReg, &gstate_c.textureFullAlpha); + CMP(fullAlphaReg, 0); + SetCC(CC_EQ); + STRB(fullAlphaReg, tempReg, 0); + SetCC(CC_AL); } if (NEONSkinning || NEONMorphing) { @@ -678,7 +682,7 @@ void VertexDecoderJitCache::Jit_Color8888() { MVNS(tempReg2, Operand2(tempReg1, ST_ASR, 24)); STR(tempReg1, dstReg, dec_->decFmt.c0off); SetCC(CC_NEQ); - ORR(hasAlphaReg, hasAlphaReg, IMM(1)); + MOV(fullAlphaReg, 0); SetCC(CC_AL); } @@ -702,7 +706,7 @@ void VertexDecoderJitCache::Jit_Color4444() { // Set flags to determine if alpha != 0xFF. MVNS(tempReg2, Operand2(tempReg1, ST_ASR, 24)); SetCC(CC_NEQ); - ORR(hasAlphaReg, hasAlphaReg, IMM(1)); + MOV(fullAlphaReg, 0); SetCC(CC_AL); } @@ -727,7 +731,7 @@ void VertexDecoderJitCache::Jit_Color565() { ORR(tempReg3, tempReg3, Operand2(tempReg1, ST_LSR, 4)); ORR(tempReg2, tempReg2, Operand2(tempReg3, ST_LSL, 8)); - // Add in full alpha. No need to update hasAlphaReg. + // Add in full alpha. No need to update fullAlphaReg. ORI2R(tempReg1, tempReg2, 0xFF000000, scratchReg); STR(tempReg1, dstReg, dec_->decFmt.c0off); @@ -757,7 +761,7 @@ void VertexDecoderJitCache::Jit_Color5551() { MVNS(tempReg3, Operand2(tempReg1, ST_ASR, 24)); STR(tempReg2, dstReg, dec_->decFmt.c0off); SetCC(CC_NEQ); - ORR(hasAlphaReg, hasAlphaReg, IMM(1)); + MOV(fullAlphaReg, 0); SetCC(CC_AL); } @@ -1100,7 +1104,7 @@ void VertexDecoderJitCache::Jit_WriteMorphColor(int outOff, bool checkAlpha) { if (checkAlpha) { MVNS(tempReg2, Operand2(scratchReg, ST_ASR, 24)); SetCC(CC_NEQ); - ORR(hasAlphaReg, hasAlphaReg, IMM(1)); + MOV(fullAlphaReg, 0); SetCC(CC_AL); } } diff --git a/GPU/GLES/VertexDecoderX86.cpp b/GPU/GLES/VertexDecoderX86.cpp index d1128144f7..7151e7c944 100644 --- a/GPU/GLES/VertexDecoderX86.cpp +++ b/GPU/GLES/VertexDecoderX86.cpp @@ -54,7 +54,7 @@ static const X64Reg tempReg3 = R10; static const X64Reg srcReg = RCX; static const X64Reg dstReg = RDX; static const X64Reg counterReg = R8; -static const OpArg hasAlphaArg = R(R14); +static const OpArg fullAlphaArg = R(R14); #else static const X64Reg tempReg1 = RAX; static const X64Reg tempReg2 = R9; @@ -62,7 +62,7 @@ static const X64Reg tempReg3 = R10; static const X64Reg srcReg = RDI; static const X64Reg dstReg = RSI; static const X64Reg counterReg = RDX; -static const OpArg hasAlphaArg = R(R14); +static const OpArg fullAlphaArg = R(R14); #endif #else static const X64Reg tempReg1 = EAX; @@ -72,7 +72,7 @@ static const X64Reg srcReg = ESI; static const X64Reg dstReg = EDI; static const X64Reg counterReg = ECX; static u32 hasAlphaValue; -static const OpArg hasAlphaArg = M(&hasAlphaValue); +static const OpArg fullAlphaArg = M(&hasAlphaValue); #endif // XMM0-XMM5 are volatile on Windows X64 @@ -239,7 +239,7 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec) { } if (dec.col) { - MOV(32, hasAlphaArg, Imm32(0)); + MOV(32, fullAlphaArg, Imm32(0xFF)); } // Let's not bother with a proper stack frame. We just grab the arguments and go. @@ -257,9 +257,9 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec) { SUB(32, R(counterReg), Imm8(1)); J_CC(CC_NZ, loopStart, true); - // TODO: Do something with hasAlphaArg from EAX. + // TODO: Do something with fullAlphaArg from EAX. if (dec.col) { - //MOV(32, R(EAX), hasAlphaArg); + //MOV(32, R(EAX), fullAlphaArg); } MOVUPS(XMM4, MDisp(ESP, 0)); @@ -569,7 +569,7 @@ void VertexDecoderJitCache::Jit_TcFloatThrough() { void VertexDecoderJitCache::Jit_Color8888() { MOV(32, R(tempReg1), MDisp(srcReg, dec_->coloff)); MOV(32, MDisp(dstReg, dec_->decFmt.c0off), R(tempReg1)); - // TODO: hasAlphaArg. + // TODO: fullAlphaArg. } static const u32 MEMORY_ALIGNED16(nibbles[4]) = { 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, }; @@ -639,7 +639,7 @@ void VertexDecoderJitCache::Jit_Color4444() { OR(32, R(tempReg2), R(tempReg3)); MOV(32, MDisp(dstReg, dec_->decFmt.c0off), R(tempReg2)); - // TODO: hasAlphaArg. + // TODO: fullAlphaArg. } void VertexDecoderJitCache::Jit_Color565() { @@ -676,7 +676,7 @@ void VertexDecoderJitCache::Jit_Color565() { OR(32, R(tempReg2), R(tempReg1)); MOV(32, MDisp(dstReg, dec_->decFmt.c0off), R(tempReg2)); - // Never has alpha, no need to update hasAlphaArg. + // Never has alpha, no need to update fullAlphaArg. } void VertexDecoderJitCache::Jit_Color5551() { @@ -712,7 +712,7 @@ void VertexDecoderJitCache::Jit_Color5551() { OR(32, R(tempReg2), R(tempReg1)); MOV(32, MDisp(dstReg, dec_->decFmt.c0off), R(tempReg2)); - // TODO: hasAlphaArg. + // TODO: fullAlphaArg. } void VertexDecoderJitCache::Jit_Color8888Morph() { @@ -908,7 +908,7 @@ void VertexDecoderJitCache::Jit_WriteMorphColor(int outOff, bool checkAlpha) { PACKUSWB(fpScratchReg, R(fpScratchReg)); MOVD_xmm(MDisp(dstReg, outOff), fpScratchReg); if (checkAlpha) { - // TODO: hasAlphaArg. + // TODO: fullAlphaArg. } } From 2c76e6d023f1c4c4a5d4e19ca00f7df2c26c9f2b Mon Sep 17 00:00:00 2001 From: Henrik Rydgard Date: Mon, 24 Mar 2014 11:19:11 +0100 Subject: [PATCH 3/6] Correctly keep track of "full alpha" in vertices (x86 jit only). --- GPU/GLES/TransformPipeline.cpp | 13 ++++++++----- GPU/GLES/TransformPipeline.h | 6 ++++++ GPU/GLES/VertexDecoderX86.cpp | 22 ++++++++++++++++++++++ 3 files changed, 36 insertions(+), 5 deletions(-) diff --git a/GPU/GLES/TransformPipeline.cpp b/GPU/GLES/TransformPipeline.cpp index 0b01d00a3a..263fdfb4b1 100644 --- a/GPU/GLES/TransformPipeline.cpp +++ b/GPU/GLES/TransformPipeline.cpp @@ -265,11 +265,6 @@ void TransformDrawEngine::SetupVertexDecoder(u32 vertType) { if (vertTypeID != lastVType_) { dec_ = GetVertexDecoder(vertTypeID); lastVType_ = vertTypeID; - - // TODO: Add functionality to VertexDecoder to scan for non-full alpha in the two other formats, - // which are quite common. - int colorType = vertTypeID & GE_VTYPE_COL_MASK; - gstate_c.vertexFullAlpha = colorType == GE_VTYPE_COL_NONE || colorType == GE_VTYPE_COL_565; } } @@ -566,6 +561,8 @@ void TransformDrawEngine::DoFlush() { vai->numVerts = indexGen.VertexCount(); vai->prim = indexGen.Prim(); vai->maxIndex = indexGen.MaxIndex(); + vai->flags = gstate_c.vertexFullAlpha ? VAI_FLAG_VERTEXFULLALPHA : 0; + goto rotateVBO; } @@ -645,6 +642,8 @@ void TransformDrawEngine::DoFlush() { vertexCount = vai->numVerts; maxIndex = vai->maxIndex; prim = static_cast(vai->prim); + + gstate_c.vertexFullAlpha = vai->flags & VAI_FLAG_VERTEXFULLALPHA; break; } @@ -665,6 +664,8 @@ void TransformDrawEngine::DoFlush() { vertexCount = vai->numVerts; maxIndex = vai->maxIndex; prim = static_cast(vai->prim); + + gstate_c.vertexFullAlpha = vai->flags & VAI_FLAG_VERTEXFULLALPHA; break; } @@ -717,6 +718,7 @@ rotateVBO: glBindBuffer(GL_ARRAY_BUFFER, 0); } else { DecodeVerts(); + LinkedShader *program = shaderManager_->ApplyFragmentShader(vshader, prim, lastVType_); gpuStats.numUncachedVertsDrawn += indexGen.VertexCount(); prim = indexGen.Prim(); @@ -737,6 +739,7 @@ rotateVBO: decodeCounter_ = 0; dcid_ = 0; prevPrim_ = GE_PRIM_INVALID; + gstate_c.vertexFullAlpha = true; #ifndef MOBILE_DEVICE host->GPUNotifyDraw(); diff --git a/GPU/GLES/TransformPipeline.h b/GPU/GLES/TransformPipeline.h index a9c16eaca8..bee7621bd3 100644 --- a/GPU/GLES/TransformPipeline.h +++ b/GPU/GLES/TransformPipeline.h @@ -43,6 +43,10 @@ struct DecVtxFormat; // DRAWN_ONCE -> death // DRAWN_RELIABLE -> death +enum { + VAI_FLAG_VERTEXFULLALPHA = 1, +}; + // Try to keep this POD. class VertexArrayInfo { public: @@ -57,6 +61,7 @@ public: lastFrame = gpuStats.numFlips; numVerts = 0; drawsUntilNextFullHash = 0; + flags = 0; } ~VertexArrayInfo(); @@ -85,6 +90,7 @@ public: int numFrames; int lastFrame; // So that we can forget. u16 drawsUntilNextFullHash; + u8 flags; }; // Handles transform, lighting and drawing. diff --git a/GPU/GLES/VertexDecoderX86.cpp b/GPU/GLES/VertexDecoderX86.cpp index 32223cf42e..0c770903da 100644 --- a/GPU/GLES/VertexDecoderX86.cpp +++ b/GPU/GLES/VertexDecoderX86.cpp @@ -556,6 +556,11 @@ void VertexDecoderJitCache::Jit_TcFloatThrough() { void VertexDecoderJitCache::Jit_Color8888() { MOV(32, R(tempReg1), MDisp(srcReg, dec_->coloff)); MOV(32, MDisp(dstReg, dec_->decFmt.c0off), R(tempReg1)); + + CMP(32, R(tempReg1), Imm32(0xFF000000)); + FixupBranch skip = J_CC(CC_GE, false); + MOV(8, M(&gstate_c.textureFullAlpha), Imm8(0)); + SetJumpTarget(skip); } static const u32 MEMORY_ALIGNED16(nibbles[4]) = { 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, }; @@ -625,6 +630,11 @@ void VertexDecoderJitCache::Jit_Color4444() { OR(32, R(tempReg2), R(tempReg3)); MOV(32, MDisp(dstReg, dec_->decFmt.c0off), R(tempReg2)); + + CMP(32, R(tempReg2), Imm32(0xFF000000)); + FixupBranch skip = J_CC(CC_AE, false); + MOV(8, M(&gstate_c.textureFullAlpha), Imm8(0)); + SetJumpTarget(skip); } void VertexDecoderJitCache::Jit_Color565() { @@ -696,6 +706,11 @@ void VertexDecoderJitCache::Jit_Color5551() { OR(32, R(tempReg2), R(tempReg1)); MOV(32, MDisp(dstReg, dec_->decFmt.c0off), R(tempReg2)); + + CMP(32, R(tempReg2), Imm32(0xFF000000)); + FixupBranch skip = J_CC(CC_AE, false); + MOV(8, M(&gstate_c.textureFullAlpha), Imm8(0)); + SetJumpTarget(skip); } void VertexDecoderJitCache::Jit_Color8888Morph() { @@ -890,6 +905,13 @@ void VertexDecoderJitCache::Jit_WriteMorphColor(int outOff) { PACKSSDW(fpScratchReg, R(fpScratchReg)); PACKUSWB(fpScratchReg, R(fpScratchReg)); MOVD_xmm(MDisp(dstReg, outOff), fpScratchReg); + + // TODO: May be a faster way to do this without the MOVD. + MOVD_xmm(R(tempReg1), fpScratchReg); + CMP(32, R(tempReg1), Imm32(0xFF000000)); + FixupBranch skip = J_CC(CC_AE, false); + MOV(8, M(&gstate_c.textureFullAlpha), Imm8(0)); + SetJumpTarget(skip); } // Copy 3 bytes and then a zero. Might as well copy four. From f33ddad364509b681494e3862d500dd9f7db2daa Mon Sep 17 00:00:00 2001 From: Henrik Rydgard Date: Mon, 24 Mar 2014 12:41:33 +0100 Subject: [PATCH 4/6] Eliminate further alpha tests --- GPU/GLES/FragmentShaderGenerator.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/GPU/GLES/FragmentShaderGenerator.cpp b/GPU/GLES/FragmentShaderGenerator.cpp index e6ceb8e145..206aa2abd1 100644 --- a/GPU/GLES/FragmentShaderGenerator.cpp +++ b/GPU/GLES/FragmentShaderGenerator.cpp @@ -64,6 +64,8 @@ static bool IsAlphaTestTriviallyTrue() { return true; case GE_COMP_GEQUAL: + if (gstate_c.vertexFullAlpha && (gstate_c.textureFullAlpha || !gstate.isTextureAlphaUsed())) + return true; // If alpha is full, it doesn't matter what the ref value is. return gstate.getAlphaTestRef() == 0; // Non-zero check. If we have no depth testing (and thus no depth writing), and an alpha func that will result in no change if zero alpha, get rid of the alpha test. From f30d0d810eb01b20860a4d546a41cc644108bf25 Mon Sep 17 00:00:00 2001 From: Henrik Rydgard Date: Mon, 24 Mar 2014 14:44:46 +0100 Subject: [PATCH 5/6] ARM buildfix --- GPU/GLES/VertexDecoderArm.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/GPU/GLES/VertexDecoderArm.cpp b/GPU/GLES/VertexDecoderArm.cpp index 619d0ebc1d..d3ab5048b1 100644 --- a/GPU/GLES/VertexDecoderArm.cpp +++ b/GPU/GLES/VertexDecoderArm.cpp @@ -288,10 +288,10 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec) { B_CC(CC_NEQ, loopStart); if (dec.col) { - MOVP2R(tempReg, &gstate_c.textureFullAlpha); + MOVP2R(tempReg1, &gstate_c.textureFullAlpha); CMP(fullAlphaReg, 0); SetCC(CC_EQ); - STRB(fullAlphaReg, tempReg, 0); + STRB(fullAlphaReg, tempReg1, 0); SetCC(CC_AL); } From dc07d3410a5e7519a98dc40c9d3bb08273207e37 Mon Sep 17 00:00:00 2001 From: Henrik Rydgard Date: Mon, 24 Mar 2014 17:33:20 +0100 Subject: [PATCH 6/6] More checks for alpha test elimination --- GPU/GLES/FragmentShaderGenerator.cpp | 7 +++++++ GPU/GLES/TransformPipeline.cpp | 12 ++++++++++++ 2 files changed, 19 insertions(+) diff --git a/GPU/GLES/FragmentShaderGenerator.cpp b/GPU/GLES/FragmentShaderGenerator.cpp index 206aa2abd1..65b76c2a0b 100644 --- a/GPU/GLES/FragmentShaderGenerator.cpp +++ b/GPU/GLES/FragmentShaderGenerator.cpp @@ -71,6 +71,13 @@ static bool IsAlphaTestTriviallyTrue() { // Non-zero check. If we have no depth testing (and thus no depth writing), and an alpha func that will result in no change if zero alpha, get rid of the alpha test. // Speeds up Lumines by a LOT on PowerVR. case GE_COMP_NOTEQUAL: + if ((gstate_c.vertexFullAlpha && (gstate_c.textureFullAlpha || !gstate.isTextureAlphaUsed())) && gstate.getAlphaTestRef() == 255) { + // Likely to be rare. Let's just have the alpha test take care of this instead of adding + // complicated code to discard the draw or whatnot. + return false; + } + // Fallthrough on purpose + case GE_COMP_GREATER: { #if 0 diff --git a/GPU/GLES/TransformPipeline.cpp b/GPU/GLES/TransformPipeline.cpp index 263fdfb4b1..bd9456463d 100644 --- a/GPU/GLES/TransformPipeline.cpp +++ b/GPU/GLES/TransformPipeline.cpp @@ -699,6 +699,12 @@ rotateVBO: } VERBOSE_LOG(G3D, "Flush prim %i! %i verts in one go", prim, vertexCount); + bool hasColor = (lastVType_ & GE_VTYPE_COL_MASK) != GE_VTYPE_COL_NONE; + if (gstate.isModeThrough()) { + gstate_c.vertexFullAlpha = gstate_c.vertexFullAlpha && (hasColor || gstate.getMaterialAmbientA() == 255); + } else { + gstate_c.vertexFullAlpha = gstate_c.vertexFullAlpha && ((hasColor && (gstate.materialupdate & 1)) || gstate.getMaterialAmbientA() == 255) && (!gstate.isLightingEnabled() || gstate.getAmbientA() == 255); + } LinkedShader *program = shaderManager_->ApplyFragmentShader(vshader, prim, lastVType_); SetupDecFmtForDraw(program, dec_->GetDecVtxFmt(), vbo ? 0 : decoded); @@ -718,6 +724,12 @@ rotateVBO: glBindBuffer(GL_ARRAY_BUFFER, 0); } else { DecodeVerts(); + bool hasColor = (lastVType_ & GE_VTYPE_COL_MASK) != GE_VTYPE_COL_NONE; + if (gstate.isModeThrough()) { + gstate_c.vertexFullAlpha = gstate_c.vertexFullAlpha && (hasColor || gstate.getMaterialAmbientA() == 255); + } else { + gstate_c.vertexFullAlpha = gstate_c.vertexFullAlpha && ((hasColor && (gstate.materialupdate & 1)) || gstate.getMaterialAmbientA() == 255) && (!gstate.isLightingEnabled() || gstate.getAmbientA() == 255); + } LinkedShader *program = shaderManager_->ApplyFragmentShader(vshader, prim, lastVType_); gpuStats.numUncachedVertsDrawn += indexGen.VertexCount();