Initial stab at tracking vertex alpha.

Not sure what efficient method to use on x86...
Unknown W. Brackets 2014-03-23 20:37:51 -07:00
parent 893a719c4e
commit 61f5d3d360
4 changed files with 75 additions and 9 deletions
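
The idea behind the change: start each draw assuming vertex alpha is fully opaque, and let every color-decode step AND gstate_c.vertexFullAlpha down whenever it sees an alpha below 255. If the flag survives decoding, the backend knows the vertex colors contribute a constant 0xFF alpha and, combined with texture alpha information, can potentially skip alpha-test/blend work. A minimal standalone sketch of that accumulation, using the 4444 format as an example (illustration only, not code from this commit):

#include <cstdint>
#include <cstdio>

// Same expansion as the decoder's Convert4To8: 0xA -> 0xAA.
static uint8_t Convert4To8(uint8_t v) { return (v << 4) | v; }

int main() {
	const uint16_t colors4444[] = { 0xFABC, 0xFFFF, 0x7123 };  // alpha is the top nibble
	bool vertexFullAlpha = true;                               // reset once per draw
	for (uint16_t c : colors4444) {
		uint8_t a = Convert4To8((c >> 12) & 0xF);
		vertexFullAlpha = vertexFullAlpha && a == 255;         // same test Step_Color4444 adds below
	}
	printf("vertexFullAlpha = %d\n", vertexFullAlpha);         // prints 0: alpha 0x7 expands to 0x77
}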

View File

@@ -219,6 +219,7 @@ void VertexDecoder::Step_Color565() const
c[1] = Convert6To8((cdata>>5) & 0x3f);
c[2] = Convert5To8((cdata>>11) & 0x1f);
c[3] = 255;
// Always full alpha.
}
void VertexDecoder::Step_Color5551() const
@@ -229,6 +230,7 @@ void VertexDecoder::Step_Color5551() const
c[1] = Convert5To8((cdata>>5) & 0x1f);
c[2] = Convert5To8((cdata>>10) & 0x1f);
c[3] = (cdata >> 15) ? 255 : 0;
gstate_c.vertexFullAlpha = gstate_c.vertexFullAlpha && c[3] != 0;
}
void VertexDecoder::Step_Color4444() const
@@ -237,6 +239,7 @@ void VertexDecoder::Step_Color4444() const
u16 cdata = *(u16*)(ptr_ + coloff);
for (int j = 0; j < 4; j++)
c[j] = Convert4To8((cdata >> (j * 4)) & 0xF);
gstate_c.vertexFullAlpha = gstate_c.vertexFullAlpha && c[3] == 255;
}
void VertexDecoder::Step_Color8888() const
@@ -244,6 +247,7 @@ void VertexDecoder::Step_Color8888() const
u8 *c = decoded_ + decFmt.c0off;
const u8 *cdata = (const u8*)(ptr_ + coloff);
memcpy(c, cdata, sizeof(u8) * 4);
gstate_c.vertexFullAlpha = gstate_c.vertexFullAlpha && c[3] == 255;
}
void VertexDecoder::Step_Color565Morph() const
@@ -262,6 +266,7 @@ void VertexDecoder::Step_Color565Morph() const
c[i] = (u8)col[i];
}
c[3] = 255;
// Always full alpha.
}
void VertexDecoder::Step_Color5551Morph() const
@@ -280,6 +285,7 @@ void VertexDecoder::Step_Color5551Morph() const
for (int i = 0; i < 4; i++) {
c[i] = (u8)col[i];
}
gstate_c.vertexFullAlpha = gstate_c.vertexFullAlpha && c[3] == 255;
}
void VertexDecoder::Step_Color4444Morph() const
@@ -296,6 +302,7 @@ void VertexDecoder::Step_Color4444Morph() const
for (int i = 0; i < 4; i++) {
c[i] = (u8)col[i];
}
gstate_c.vertexFullAlpha = gstate_c.vertexFullAlpha && c[3] == 255;
}
void VertexDecoder::Step_Color8888Morph() const
@@ -312,6 +319,7 @@ void VertexDecoder::Step_Color8888Morph() const
for (int i = 0; i < 4; i++) {
c[i] = (u8)(col[i]);
}
gstate_c.vertexFullAlpha = gstate_c.vertexFullAlpha && c[3] == 255;
}
void VertexDecoder::Step_NormalS8() const
@@ -841,6 +849,7 @@ void VertexDecoder::DecodeVerts(u8 *decodedptr, const void *verts, int indexLowe
jitted_(ptr_, decoded_, count);
} else {
// Interpret the decode steps
// TODO: Init gstate_c.vertexFullAlpha here? Or in Setup? When is it reset?
for (; count; count--) {
for (int i = 0; i < numSteps_; i++) {
((*this).*steps_[i])();
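
One possible answer to the TODO above, sketched from the caller's side; the local names are placeholders, and resetting at the top of DecodeVerts itself would do the same job as long as a draw is decoded in a single call:

	// Reset once per draw, before decoding, so the jitted and interpreted
	// paths start from the same "everything opaque" state.
	gstate_c.vertexFullAlpha = true;
	dec->DecodeVerts(decoded, verts, lower, upper);
	// Still true afterwards only if every color step saw alpha == 255.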

View File

@@ -266,6 +266,6 @@ private:
bool CompileStep(const VertexDecoder &dec, int i);
void Jit_ApplyWeights();
void Jit_WriteMatrixMul(int outOff, bool pos);
void Jit_WriteMorphColor(int outOff);
void Jit_WriteMorphColor(int outOff, bool checkAlpha = true);
const VertexDecoder *dec_;
};

View File

@@ -61,7 +61,8 @@ static const ARMReg tempReg2 = R4;
static const ARMReg tempReg3 = R5;
static const ARMReg scratchReg = R6;
static const ARMReg scratchReg2 = R7;
static const ARMReg scratchReg3 = R12;
static const ARMReg scratchReg3 = R8;
static const ARMReg hasAlphaReg = R12;
static const ARMReg srcReg = R0;
static const ARMReg dstReg = R1;
static const ARMReg counterReg = R2;
@@ -262,6 +263,10 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec) {
// TODO: Preload scale factors
}
if (dec.col) {
MOV(hasAlphaReg, 0);
}
JumpTarget loopStart = GetCodePtr();
// Preload data cache ahead of reading. This offset seems pretty good.
PLD(srcReg, 64);
@@ -281,6 +286,11 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec) {
SUBS(counterReg, counterReg, 1);
B_CC(CC_NEQ, loopStart);
// TODO: Do something with hasAlphaReg.
if (dec.col) {
}
if (NEONSkinning || NEONMorphing) {
VPOP(D8, 8);
}
@@ -664,7 +674,12 @@ void VertexDecoderJitCache::Jit_TcFloatPrescale() {
void VertexDecoderJitCache::Jit_Color8888() {
LDR(tempReg1, srcReg, dec_->coloff);
// Set flags to determine if alpha != 0xFF.
MVNS(tempReg2, Operand2(tempReg1, ST_ASR, 24));
STR(tempReg1, dstReg, dec_->decFmt.c0off);
SetCC(CC_NEQ);
ORR(hasAlphaReg, hasAlphaReg, IMM(1));
SetCC(CC_AL);
}
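
The MVNS above is the entire alpha check: an arithmetic shift right by 24 leaves the sign-extended alpha byte, and its bitwise NOT is zero exactly when alpha was 0xFF, so the NE condition means "this vertex is not fully opaque" and the conditional ORR latches that into hasAlphaReg. The same predicate in plain C++, for reference:

#include <cstdint>

// Mirrors MVNS(tmp, Operand2(color, ST_ASR, 24)): nonzero (NE) iff the top byte != 0xFF.
static bool AlphaNotFull(uint32_t color) {
	return ~(int32_t(color) >> 24) != 0;
}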
void VertexDecoderJitCache::Jit_Color4444() {
@@ -679,10 +694,16 @@ void VertexDecoderJitCache::Jit_Color4444() {
ANDI2R(tempReg3, tempReg1, 0xF000, scratchReg);
ORR(tempReg2, tempReg2, Operand2(tempReg3, ST_LSL, 12));
// And saturate.
// And expand to 8 bits.
ORR(tempReg1, tempReg2, Operand2(tempReg2, ST_LSL, 4));
STR(tempReg1, dstReg, dec_->decFmt.c0off);
// Set flags to determine if alpha != 0xFF.
MVNS(tempReg2, Operand2(tempReg1, ST_ASR, 24));
SetCC(CC_NEQ);
ORR(hasAlphaReg, hasAlphaReg, IMM(1));
SetCC(CC_AL);
}
void VertexDecoderJitCache::Jit_Color565() {
@@ -706,7 +727,7 @@ void VertexDecoderJitCache::Jit_Color565() {
ORR(tempReg3, tempReg3, Operand2(tempReg1, ST_LSR, 4));
ORR(tempReg2, tempReg2, Operand2(tempReg3, ST_LSL, 8));
// Add in full alpha.
// Add in full alpha. No need to update hasAlphaReg.
ORI2R(tempReg1, tempReg2, 0xFF000000, scratchReg);
STR(tempReg1, dstReg, dec_->decFmt.c0off);
@@ -731,8 +752,13 @@ void VertexDecoderJitCache::Jit_Color5551() {
// Now we just need alpha. Since we loaded as signed, it'll be extended.
ANDI2R(tempReg1, tempReg1, 0xFF000000, scratchReg);
ORR(tempReg2, tempReg2, tempReg1);
// Set flags to determine if alpha != 0xFF.
MVNS(tempReg3, Operand2(tempReg1, ST_ASR, 24));
STR(tempReg2, dstReg, dec_->decFmt.c0off);
SetCC(CC_NEQ);
ORR(hasAlphaReg, hasAlphaReg, IMM(1));
SetCC(CC_AL);
}
void VertexDecoderJitCache::Jit_Color8888Morph() {
@@ -957,7 +983,7 @@ void VertexDecoderJitCache::Jit_Color565Morph() {
} else {
VMOV(S11, tempReg3);
}
Jit_WriteMorphColor(dec_->decFmt.c0off);
Jit_WriteMorphColor(dec_->decFmt.c0off, false);
}
// First is the left shift, second is the right shift (against walls, to get the RGBA values.)
@@ -1045,13 +1071,16 @@ void VertexDecoderJitCache::Jit_Color5551Morph() {
}
// Expects RGBA color in S8 - S11, which is Q2.
void VertexDecoderJitCache::Jit_WriteMorphColor(int outOff) {
void VertexDecoderJitCache::Jit_WriteMorphColor(int outOff, bool checkAlpha) {
if (NEONMorphing) {
ADDI2R(tempReg1, dstReg, outOff, scratchReg);
VCVT(I_32 | I_UNSIGNED, neonScratchRegQ, neonScratchRegQ);
VQMOVN(I_32 | I_UNSIGNED, neonScratchReg, neonScratchRegQ);
VQMOVN(I_16 | I_UNSIGNED, neonScratchReg, neonScratchRegQ);
VST1_lane(I_32, neonScratchReg, tempReg1, 0, true);
if (checkAlpha) {
VMOV_neon(I_32, scratchReg, neonScratchReg, 0);
}
} else {
VCVT(S8, S8, TO_INT);
VCVT(S9, S9, TO_INT);
@@ -1066,6 +1095,14 @@ void VertexDecoderJitCache::Jit_WriteMorphColor(int outOff) {
ORR(scratchReg, scratchReg, Operand2(tempReg3, ST_LSL, 24));
STR(scratchReg, dstReg, outOff);
}
// Set flags to determine if alpha != 0xFF.
if (checkAlpha) {
MVNS(tempReg2, Operand2(scratchReg, ST_ASR, 24));
SetCC(CC_NEQ);
ORR(hasAlphaReg, hasAlphaReg, IMM(1));
SetCC(CC_AL);
}
}
void VertexDecoderJitCache::Jit_NormalS8() {

View File

@@ -54,6 +54,7 @@ static const X64Reg tempReg3 = R10;
static const X64Reg srcReg = RCX;
static const X64Reg dstReg = RDX;
static const X64Reg counterReg = R8;
static const OpArg hasAlphaArg = R(R14);
#else
static const X64Reg tempReg1 = RAX;
static const X64Reg tempReg2 = R9;
@@ -61,6 +62,7 @@ static const X64Reg tempReg3 = R10;
static const X64Reg srcReg = RDI;
static const X64Reg dstReg = RSI;
static const X64Reg counterReg = RDX;
static const OpArg hasAlphaArg = R(R14);
#endif
#else
static const X64Reg tempReg1 = EAX;
@@ -69,6 +71,8 @@ static const X64Reg tempReg3 = EDX;
static const X64Reg srcReg = ESI;
static const X64Reg dstReg = EDI;
static const X64Reg counterReg = ECX;
static u32 hasAlphaValue;
static const OpArg hasAlphaArg = M(&hasAlphaValue);
#endif
// XMM0-XMM5 are volatile on Windows X64
@@ -234,6 +238,10 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec) {
UNPCKLPD(fpScaleOffsetReg, R(fpScratchReg));
}
if (dec.col) {
MOV(32, hasAlphaArg, Imm32(0));
}
// Let's not bother with a proper stack frame. We just grab the arguments and go.
JumpTarget loopStart = GetCodePtr();
for (int i = 0; i < dec.numSteps_; i++) {
@@ -249,6 +257,11 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec) {
SUB(32, R(counterReg), Imm8(1));
J_CC(CC_NZ, loopStart, true);
// TODO: Do something with hasAlphaArg from EAX.
if (dec.col) {
//MOV(32, R(EAX), hasAlphaArg);
}
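
The commented-out MOV suggests one way to close this TODO: return the accumulator in EAX and let the C++ caller fold it into the per-draw flag (the alternative being a store to gstate_c.vertexFullAlpha directly in this epilogue). The caller-side fold would amount to something like the following, with jitted_ changed to return its accumulator; a hypothetical sketch, not code from the commit:

	// 0 means every decoded alpha was 0xFF.
	u32 hasAlpha = jitted_(ptr_, decoded_, count);
	gstate_c.vertexFullAlpha = gstate_c.vertexFullAlpha && hasAlpha == 0;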
MOVUPS(XMM4, MDisp(ESP, 0));
MOVUPS(XMM5, MDisp(ESP, 16));
MOVUPS(XMM6, MDisp(ESP, 32));
@@ -556,6 +569,7 @@ void VertexDecoderJitCache::Jit_TcFloatThrough() {
void VertexDecoderJitCache::Jit_Color8888() {
MOV(32, R(tempReg1), MDisp(srcReg, dec_->coloff));
MOV(32, MDisp(dstReg, dec_->decFmt.c0off), R(tempReg1));
// TODO: hasAlphaArg.
}
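
One inexpensive way to resolve this TODO on x86, shown as plain C++ rather than emitter calls: accumulate the complement of the alpha byte, which stays zero only while every alpha has been 0xFF. Per vertex that is just a shift, an XOR against 0xFF and an OR into the accumulator, with no branches and no flag juggling:

#include <cstdint>
#include <cstdio>

int main() {
	const uint32_t colors[] = { 0xFF0000FFu, 0xFF00FF00u, 0x80FF0000u };  // alpha is the top byte
	uint32_t notFullAlpha = 0;                    // plays the role of hasAlphaArg
	for (uint32_t c : colors) {
		notFullAlpha |= (c >> 24) ^ 0xFFu;        // nonzero iff this alpha != 0xFF
	}
	bool vertexFullAlpha = (notFullAlpha == 0);
	printf("vertexFullAlpha = %d\n", vertexFullAlpha);  // prints 0: the last alpha is 0x80
}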
static const u32 MEMORY_ALIGNED16(nibbles[4]) = { 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, };
@@ -625,6 +639,7 @@ void VertexDecoderJitCache::Jit_Color4444() {
OR(32, R(tempReg2), R(tempReg3));
MOV(32, MDisp(dstReg, dec_->decFmt.c0off), R(tempReg2));
// TODO: hasAlphaArg.
}
void VertexDecoderJitCache::Jit_Color565() {
@@ -661,6 +676,7 @@ void VertexDecoderJitCache::Jit_Color565() {
OR(32, R(tempReg2), R(tempReg1));
MOV(32, MDisp(dstReg, dec_->decFmt.c0off), R(tempReg2));
// Never has alpha, no need to update hasAlphaArg.
}
void VertexDecoderJitCache::Jit_Color5551() {
@@ -696,6 +712,7 @@ void VertexDecoderJitCache::Jit_Color5551() {
OR(32, R(tempReg2), R(tempReg1));
MOV(32, MDisp(dstReg, dec_->decFmt.c0off), R(tempReg2));
// TODO: hasAlphaArg.
}
void VertexDecoderJitCache::Jit_Color8888Morph() {
@@ -825,7 +842,7 @@ void VertexDecoderJitCache::Jit_Color565Morph() {
}
}
Jit_WriteMorphColor(dec_->decFmt.c0off);
Jit_WriteMorphColor(dec_->decFmt.c0off, false);
}
// Intentionally in reverse order.
@@ -884,12 +901,15 @@ void VertexDecoderJitCache::Jit_Color5551Morph() {
Jit_WriteMorphColor(dec_->decFmt.c0off);
}
void VertexDecoderJitCache::Jit_WriteMorphColor(int outOff) {
void VertexDecoderJitCache::Jit_WriteMorphColor(int outOff, bool checkAlpha) {
// Pack back into a u32.
CVTPS2DQ(fpScratchReg, R(fpScratchReg));
PACKSSDW(fpScratchReg, R(fpScratchReg));
PACKUSWB(fpScratchReg, R(fpScratchReg));
MOVD_xmm(MDisp(dstReg, outOff), fpScratchReg);
if (checkAlpha) {
// TODO: hasAlphaArg.
}
}
// Copy 3 bytes and then a zero. Might as well copy four.