Merge pull request #15273 from unknownbrackets/softjit-bloom

Optimize software renderer handling of common bloom operations
This commit is contained in:
Henrik Rydgård 2022-01-02 18:11:07 +01:00 committed by GitHub
commit d3f0af7458
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 298 additions and 136 deletions

View File

@ -517,6 +517,7 @@ PixelJitCache::PixelJitCache()
{
// 256k should be plenty of space for plenty of variations.
AllocCodeSpace(1024 * 64 * 4);
ClearCodeSpace(0);
// Add some random code to "help" MSVC's buggy disassembler :(
#if defined(_WIN32) && (PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64)) && !PPSSPP_PLATFORM(UWP)
@ -591,47 +592,73 @@ void ComputePixelBlendState(PixelBlendState &state, const PixelFuncID &id) {
if (state.usesFactors) {
switch (id.AlphaBlendSrc()) {
case GE_SRCBLEND_DSTALPHA:
case GE_SRCBLEND_INVDSTALPHA:
case GE_SRCBLEND_DOUBLEDSTALPHA:
case GE_SRCBLEND_DOUBLEINVDSTALPHA:
case PixelBlendFactor::DSTALPHA:
case PixelBlendFactor::INVDSTALPHA:
case PixelBlendFactor::DOUBLEDSTALPHA:
case PixelBlendFactor::DOUBLEINVDSTALPHA:
state.usesDstAlpha = true;
break;
case PixelBlendFactor::OTHERCOLOR:
case PixelBlendFactor::INVOTHERCOLOR:
state.dstColorAsFactor = true;
break;
case PixelBlendFactor::SRCALPHA:
case PixelBlendFactor::INVSRCALPHA:
case PixelBlendFactor::DOUBLESRCALPHA:
case PixelBlendFactor::DOUBLEINVSRCALPHA:
state.srcColorAsFactor = true;
break;
default:
break;
}
switch (id.AlphaBlendDst()) {
case GE_DSTBLEND_INVSRCALPHA:
state.dstFactorIsInverse = id.AlphaBlendSrc() == GE_SRCBLEND_SRCALPHA;
case PixelBlendFactor::INVSRCALPHA:
state.dstFactorIsInverse = id.AlphaBlendSrc() == PixelBlendFactor::SRCALPHA;
state.srcColorAsFactor = true;
break;
case GE_DSTBLEND_DOUBLEINVSRCALPHA:
state.dstFactorIsInverse = id.AlphaBlendSrc() == GE_SRCBLEND_DOUBLESRCALPHA;
case PixelBlendFactor::DOUBLEINVSRCALPHA:
state.dstFactorIsInverse = id.AlphaBlendSrc() == PixelBlendFactor::DOUBLESRCALPHA;
state.srcColorAsFactor = true;
break;
case GE_DSTBLEND_DSTALPHA:
case PixelBlendFactor::DSTALPHA:
state.usesDstAlpha = true;
break;
case GE_DSTBLEND_INVDSTALPHA:
state.dstFactorIsInverse = id.AlphaBlendSrc() == GE_SRCBLEND_DSTALPHA;
case PixelBlendFactor::INVDSTALPHA:
state.dstFactorIsInverse = id.AlphaBlendSrc() == PixelBlendFactor::DSTALPHA;
state.usesDstAlpha = true;
break;
case GE_DSTBLEND_DOUBLEDSTALPHA:
case PixelBlendFactor::DOUBLEDSTALPHA:
state.usesDstAlpha = true;
break;
case GE_DSTBLEND_DOUBLEINVDSTALPHA:
state.dstFactorIsInverse = id.AlphaBlendSrc() == GE_SRCBLEND_DOUBLEDSTALPHA;
case PixelBlendFactor::DOUBLEINVDSTALPHA:
state.dstFactorIsInverse = id.AlphaBlendSrc() == PixelBlendFactor::DOUBLEDSTALPHA;
state.usesDstAlpha = true;
break;
case PixelBlendFactor::OTHERCOLOR:
case PixelBlendFactor::INVOTHERCOLOR:
state.dstColorAsFactor = true;
break;
case PixelBlendFactor::SRCALPHA:
case PixelBlendFactor::DOUBLESRCALPHA:
state.srcColorAsFactor = true;
break;
default:
break;
}
state.dstColorAsFactor = state.dstColorAsFactor || state.usesDstAlpha;
}
}

View File

@ -47,6 +47,8 @@ struct PixelBlendState {
bool usesFactors = false;
bool usesDstAlpha = false;
bool dstFactorIsInverse = false;
bool srcColorAsFactor = false;
bool dstColorAsFactor = false;
};
void ComputePixelBlendState(PixelBlendState &state, const PixelFuncID &id);
@ -88,7 +90,7 @@ private:
bool Jit_DepthTest(const PixelFuncID &id);
bool Jit_WriteDepth(const PixelFuncID &id);
bool Jit_AlphaBlend(const PixelFuncID &id);
bool Jit_BlendFactor(const PixelFuncID &id, RegCache::Reg factorReg, RegCache::Reg dstReg, GEBlendSrcFactor factor);
bool Jit_BlendFactor(const PixelFuncID &id, RegCache::Reg factorReg, RegCache::Reg dstReg, PixelBlendFactor factor);
bool Jit_DstBlendFactor(const PixelFuncID &id, RegCache::Reg srcFactorReg, RegCache::Reg dstFactorReg, RegCache::Reg dstReg);
bool Jit_Dither(const PixelFuncID &id);
bool Jit_WriteColor(const PixelFuncID &id);

View File

@ -217,10 +217,15 @@ RegCache::Reg PixelJitCache::GetColorOff(const PixelFuncID &id) {
MOV(32, R(r), R(argYReg));
SHL(32, R(r), Imm8(9));
} else {
X64Reg gstateReg = GetGState();
r = regCache_.Alloc(RegCache::GEN_COLOR_OFF);
MOVZX(32, 16, r, MDisp(gstateReg, offsetof(GPUgstate, fbwidth)));
regCache_.Unlock(gstateReg, RegCache::GEN_GSTATE);
if (RipAccessible(&gstate.fbwidth)) {
r = regCache_.Alloc(RegCache::GEN_COLOR_OFF);
MOVZX(32, 16, r, M(&gstate.fbwidth));
} else {
X64Reg gstateReg = GetGState();
r = regCache_.Alloc(RegCache::GEN_COLOR_OFF);
MOVZX(32, 16, r, MDisp(gstateReg, offsetof(GPUgstate, fbwidth)));
regCache_.Unlock(gstateReg, RegCache::GEN_GSTATE);
}
AND(16, R(r), Imm16(0x07FC));
IMUL(32, r, R(argYReg));
@ -259,10 +264,15 @@ RegCache::Reg PixelJitCache::GetDepthOff(const PixelFuncID &id) {
MOV(32, R(r), R(argYReg));
SHL(32, R(r), Imm8(9));
} else {
X64Reg gstateReg = GetGState();
r = regCache_.Alloc(RegCache::GEN_DEPTH_OFF);
MOVZX(32, 16, r, MDisp(gstateReg, offsetof(GPUgstate, zbwidth)));
regCache_.Unlock(gstateReg, RegCache::GEN_GSTATE);
if (RipAccessible(&gstate.zbwidth)) {
r = regCache_.Alloc(RegCache::GEN_DEPTH_OFF);
MOVZX(32, 16, r, M(&gstate.zbwidth));
} else {
X64Reg gstateReg = GetGState();
r = regCache_.Alloc(RegCache::GEN_DEPTH_OFF);
MOVZX(32, 16, r, MDisp(gstateReg, offsetof(GPUgstate, zbwidth)));
regCache_.Unlock(gstateReg, RegCache::GEN_GSTATE);
}
AND(16, R(r), Imm16(0x07FC));
IMUL(32, r, R(argYReg));
@ -516,9 +526,13 @@ bool PixelJitCache::Jit_ApplyFog(const PixelFuncID &id) {
// Load a set of 255s at 16 bit into a reg for later...
X64Reg invertReg = regCache_.Alloc(RegCache::VEC_TEMP2);
X64Reg constReg = GetConstBase();
MOVDQA(invertReg, MConstDisp(constReg, &const255_16s[0]));
regCache_.Unlock(constReg, RegCache::GEN_CONST_BASE);
if (RipAccessible(&const255_16s[0])) {
MOVDQA(invertReg, M(&const255_16s[0]));
} else {
X64Reg constReg = GetConstBase();
MOVDQA(invertReg, MConstDisp(constReg, &const255_16s[0]));
regCache_.Unlock(constReg, RegCache::GEN_CONST_BASE);
}
// Expand (we clamped) color to 16 bit as well, so we can multiply with fog.
X64Reg argColorReg = regCache_.Find(RegCache::VEC_ARG_COLOR);
@ -563,9 +577,13 @@ bool PixelJitCache::Jit_ApplyFog(const PixelFuncID &id) {
regCache_.Release(fogMultReg, RegCache::VEC_TEMP3);
// Now to divide by 255, we use bit tricks: multiply by 0x8081, and shift right by 16+7.
constReg = GetConstBase();
PMULHUW(argColorReg, MConstDisp(constReg, &by255i));
regCache_.Unlock(constReg, RegCache::GEN_CONST_BASE);
if (RipAccessible(&by255i[0])) {
PMULHUW(argColorReg, M(&by255i[0]));
} else {
X64Reg constReg = GetConstBase();
PMULHUW(argColorReg, MConstDisp(constReg, &by255i[0]));
regCache_.Unlock(constReg, RegCache::GEN_CONST_BASE);
}
// Now shift right by 7 (PMULHUW already did 16 of the shift.)
PSRLW(argColorReg, 7);
@ -1081,34 +1099,66 @@ bool PixelJitCache::Jit_AlphaBlend(const PixelFuncID &id) {
}
colorIs16Bit_ = true;
// Skip multiplying by factors if we can.
bool multiplySrc = id.AlphaBlendSrc() != PixelBlendFactor::ZERO && id.AlphaBlendSrc() != PixelBlendFactor::ONE;
bool multiplyDst = id.AlphaBlendDst() != PixelBlendFactor::ZERO && id.AlphaBlendDst() != PixelBlendFactor::ONE;
// We also shift left by 4, so mulhi gives us a free shift
// We also need to add a half bit later, so this gives us space.
PSLLW(argColorReg, 4);
PSLLW(dstReg, 4);
if (multiplySrc || blendState.srcColorAsFactor)
PSLLW(argColorReg, 4);
if (multiplyDst || blendState.dstColorAsFactor)
PSLLW(dstReg, 4);
// Okay, now grab our factors.
success = success && Jit_BlendFactor(id, srcFactorReg, dstReg, id.AlphaBlendSrc());
success = success && Jit_DstBlendFactor(id, srcFactorReg, dstFactorReg, dstReg);
// Okay, now grab our factors. Don't bother if they're known values.
if (id.AlphaBlendSrc() < PixelBlendFactor::ZERO)
success = success && Jit_BlendFactor(id, srcFactorReg, dstReg, id.AlphaBlendSrc());
if (id.AlphaBlendDst() < PixelBlendFactor::ZERO)
success = success && Jit_DstBlendFactor(id, srcFactorReg, dstFactorReg, dstReg);
X64Reg constReg = GetConstBase();
X64Reg halfReg = regCache_.Alloc(RegCache::VEC_TEMP3);
// We'll use this several times, so load into a reg.
MOVDQA(halfReg, MConstDisp(constReg, &blendHalf_11_4s[0]));
regCache_.Unlock(constReg, RegCache::GEN_CONST_BASE);
X64Reg halfReg = INVALID_REG;
if (multiplySrc || multiplyDst) {
halfReg = regCache_.Alloc(RegCache::VEC_TEMP3);
// We'll use this several times, so load into a reg.
if (RipAccessible(&blendHalf_11_4s[0])) {
MOVDQA(halfReg, M(&blendHalf_11_4s[0]));
} else {
X64Reg constReg = GetConstBase();
MOVDQA(halfReg, MConstDisp(constReg, &blendHalf_11_4s[0]));
regCache_.Unlock(constReg, RegCache::GEN_CONST_BASE);
}
}
// Add in the half bit to the factors and color values, then multiply.
// We take the high 16 bits to get a free right shift by 16.
POR(srcFactorReg, R(halfReg));
POR(argColorReg, R(halfReg));
PMULHUW(argColorReg, R(srcFactorReg));
if (multiplySrc) {
POR(srcFactorReg, R(halfReg));
POR(argColorReg, R(halfReg));
PMULHUW(argColorReg, R(srcFactorReg));
} else if (id.AlphaBlendSrc() == PixelBlendFactor::ZERO) {
PXOR(argColorReg, R(argColorReg));
} else if (id.AlphaBlendSrc() == PixelBlendFactor::ONE) {
if (blendState.srcColorAsFactor)
PSRLW(argColorReg, 4);
}
POR(dstFactorReg, R(halfReg));
POR(dstReg, R(halfReg));
PMULHUW(dstReg, R(dstFactorReg));
if (multiplyDst) {
POR(dstFactorReg, R(halfReg));
POR(dstReg, R(halfReg));
PMULHUW(dstReg, R(dstFactorReg));
} else if (id.AlphaBlendDst() == PixelBlendFactor::ZERO) {
// No need to add or subtract zero, unless we're negating.
// This is common for bloom preparation.
if (id.AlphaBlendEq() == GE_BLENDMODE_MUL_AND_SUBTRACT_REVERSE)
PXOR(dstReg, R(dstReg));
} else if (id.AlphaBlendDst() == PixelBlendFactor::ONE) {
if (blendState.dstColorAsFactor)
PSRLW(dstReg, 4);
}
regCache_.Release(srcFactorReg, RegCache::VEC_TEMP1);
regCache_.Release(dstFactorReg, RegCache::VEC_TEMP2);
regCache_.Release(halfReg, RegCache::VEC_TEMP3);
if (halfReg != INVALID_REG)
regCache_.Release(halfReg, RegCache::VEC_TEMP3);
} else if (colorIs16Bit_) {
// If it's expanded, shrink and clamp for our min/max/absdiff handling.
PACKUSWB(argColorReg, R(argColorReg));
@ -1121,11 +1171,13 @@ bool PixelJitCache::Jit_AlphaBlend(const PixelFuncID &id) {
X64Reg tempReg = regCache_.Alloc(RegCache::VEC_TEMP1);
switch (id.AlphaBlendEq()) {
case GE_BLENDMODE_MUL_AND_ADD:
PADDUSW(argColorReg, R(dstReg));
if (id.AlphaBlendDst() != PixelBlendFactor::ZERO)
PADDUSW(argColorReg, R(dstReg));
break;
case GE_BLENDMODE_MUL_AND_SUBTRACT:
PSUBUSW(argColorReg, R(dstReg));
if (id.AlphaBlendDst() != PixelBlendFactor::ZERO)
PSUBUSW(argColorReg, R(dstReg));
break;
case GE_BLENDMODE_MUL_AND_SUBTRACT_REVERSE:
@ -1165,7 +1217,7 @@ bool PixelJitCache::Jit_AlphaBlend(const PixelFuncID &id) {
}
bool PixelJitCache::Jit_BlendFactor(const PixelFuncID &id, RegCache::Reg factorReg, RegCache::Reg dstReg, GEBlendSrcFactor factor) {
bool PixelJitCache::Jit_BlendFactor(const PixelFuncID &id, RegCache::Reg factorReg, RegCache::Reg dstReg, PixelBlendFactor factor) {
X64Reg constReg = INVALID_REG;
X64Reg gstateReg = INVALID_REG;
X64Reg tempReg = INVALID_REG;
@ -1177,74 +1229,96 @@ bool PixelJitCache::Jit_BlendFactor(const PixelFuncID &id, RegCache::Reg factorR
// Between source and dest factors, only DSTCOLOR, INVDSTCOLOR, and FIXA differ.
// In those cases, it uses SRCCOLOR, INVSRCCOLOR, and FIXB respectively.
// Load the invert constant first off, if needed.
switch (factor) {
case GE_SRCBLEND_DSTCOLOR:
case PixelBlendFactor::INVOTHERCOLOR:
case PixelBlendFactor::INVSRCALPHA:
case PixelBlendFactor::INVDSTALPHA:
case PixelBlendFactor::DOUBLEINVSRCALPHA:
case PixelBlendFactor::DOUBLEINVDSTALPHA:
if (RipAccessible(&blendInvert_11_4s[0])) {
MOVDQA(factorReg, M(&blendInvert_11_4s[0]));
} else {
constReg = GetConstBase();
MOVDQA(factorReg, MConstDisp(constReg, &blendInvert_11_4s[0]));
regCache_.Unlock(constReg, RegCache::GEN_CONST_BASE);
}
break;
default:
break;
}
switch (factor) {
case PixelBlendFactor::OTHERCOLOR:
MOVDQA(factorReg, R(dstReg));
break;
case GE_SRCBLEND_INVDSTCOLOR:
constReg = GetConstBase();
MOVDQA(factorReg, MConstDisp(constReg, &blendInvert_11_4s[0]));
case PixelBlendFactor::INVOTHERCOLOR:
PSUBUSW(factorReg, R(dstReg));
break;
case GE_SRCBLEND_SRCALPHA:
case PixelBlendFactor::SRCALPHA:
PSHUFLW(factorReg, R(argColorReg), _MM_SHUFFLE(3, 3, 3, 3));
break;
case GE_SRCBLEND_INVSRCALPHA:
constReg = GetConstBase();
case PixelBlendFactor::INVSRCALPHA:
tempReg = regCache_.Alloc(RegCache::VEC_TEMP3);
MOVDQA(factorReg, MConstDisp(constReg, &blendInvert_11_4s[0]));
PSHUFLW(tempReg, R(argColorReg), _MM_SHUFFLE(3, 3, 3, 3));
PSUBUSW(factorReg, R(tempReg));
break;
case GE_SRCBLEND_DSTALPHA:
case PixelBlendFactor::DSTALPHA:
PSHUFLW(factorReg, R(dstReg), _MM_SHUFFLE(3, 3, 3, 3));
break;
case GE_SRCBLEND_INVDSTALPHA:
constReg = GetConstBase();
case PixelBlendFactor::INVDSTALPHA:
tempReg = regCache_.Alloc(RegCache::VEC_TEMP3);
MOVDQA(factorReg, MConstDisp(constReg, &blendInvert_11_4s[0]));
PSHUFLW(tempReg, R(dstReg), _MM_SHUFFLE(3, 3, 3, 3));
PSUBUSW(factorReg, R(tempReg));
break;
case GE_SRCBLEND_DOUBLESRCALPHA:
case PixelBlendFactor::DOUBLESRCALPHA:
PSHUFLW(factorReg, R(argColorReg), _MM_SHUFFLE(3, 3, 3, 3));
PSLLW(factorReg, 1);
break;
case GE_SRCBLEND_DOUBLEINVSRCALPHA:
constReg = GetConstBase();
case PixelBlendFactor::DOUBLEINVSRCALPHA:
tempReg = regCache_.Alloc(RegCache::VEC_TEMP3);
MOVDQA(factorReg, MConstDisp(constReg, &blendInvert_11_4s[0]));
PSHUFLW(tempReg, R(argColorReg), _MM_SHUFFLE(3, 3, 3, 3));
PSLLW(tempReg, 1);
PSUBUSW(factorReg, R(tempReg));
break;
case GE_SRCBLEND_DOUBLEDSTALPHA:
case PixelBlendFactor::DOUBLEDSTALPHA:
PSHUFLW(factorReg, R(dstReg), _MM_SHUFFLE(3, 3, 3, 3));
PSLLW(factorReg, 1);
break;
case GE_SRCBLEND_DOUBLEINVDSTALPHA:
constReg = GetConstBase();
case PixelBlendFactor::DOUBLEINVDSTALPHA:
tempReg = regCache_.Alloc(RegCache::VEC_TEMP3);
MOVDQA(factorReg, MConstDisp(constReg, &blendInvert_11_4s[0]));
PSHUFLW(tempReg, R(dstReg), _MM_SHUFFLE(3, 3, 3, 3));
PSLLW(tempReg, 1);
PSUBUSW(factorReg, R(tempReg));
break;
case GE_SRCBLEND_FIXA:
case PixelBlendFactor::ZERO:
// Special value meaning zero.
PXOR(factorReg, R(factorReg));
break;
case PixelBlendFactor::ONE:
// Special value meaning all 255s.
PCMPEQD(factorReg, R(factorReg));
PSLLW(factorReg, 8);
PSRLW(factorReg, 4);
break;
case PixelBlendFactor::FIX:
default:
gstateReg = GetGState();
if (cpu_info.bSSE4_1) {
@ -1260,8 +1334,6 @@ bool PixelJitCache::Jit_BlendFactor(const PixelFuncID &id, RegCache::Reg factorR
break;
}
if (constReg != INVALID_REG)
regCache_.Unlock(constReg, RegCache::GEN_CONST_BASE);
if (gstateReg != INVALID_REG)
regCache_.Unlock(gstateReg, RegCache::GEN_GSTATE);
if (tempReg != INVALID_REG)
@ -1285,37 +1357,47 @@ bool PixelJitCache::Jit_DstBlendFactor(const PixelFuncID &id, RegCache::Reg srcF
// We might be able to reuse srcFactorReg for dst, in some cases.
switch (id.AlphaBlendDst()) {
case GE_DSTBLEND_SRCCOLOR:
case PixelBlendFactor::OTHERCOLOR:
MOVDQA(dstFactorReg, R(argColorReg));
break;
case GE_DSTBLEND_INVSRCCOLOR:
constReg = GetConstBase();
MOVDQA(dstFactorReg, MConstDisp(constReg, &blendInvert_11_4s[0]));
case PixelBlendFactor::INVOTHERCOLOR:
if (RipAccessible(&blendInvert_11_4s[0])) {
MOVDQA(dstFactorReg, M(&blendInvert_11_4s[0]));
} else {
constReg = GetConstBase();
MOVDQA(dstFactorReg, MConstDisp(constReg, &blendInvert_11_4s[0]));
}
PSUBUSW(dstFactorReg, R(argColorReg));
break;
case GE_DSTBLEND_SRCALPHA:
case GE_DSTBLEND_INVSRCALPHA:
case GE_DSTBLEND_DSTALPHA:
case GE_DSTBLEND_INVDSTALPHA:
case GE_DSTBLEND_DOUBLESRCALPHA:
case GE_DSTBLEND_DOUBLEINVSRCALPHA:
case GE_DSTBLEND_DOUBLEDSTALPHA:
case GE_DSTBLEND_DOUBLEINVDSTALPHA:
case PixelBlendFactor::SRCALPHA:
case PixelBlendFactor::INVSRCALPHA:
case PixelBlendFactor::DSTALPHA:
case PixelBlendFactor::INVDSTALPHA:
case PixelBlendFactor::DOUBLESRCALPHA:
case PixelBlendFactor::DOUBLEINVSRCALPHA:
case PixelBlendFactor::DOUBLEDSTALPHA:
case PixelBlendFactor::DOUBLEINVDSTALPHA:
case PixelBlendFactor::ZERO:
case PixelBlendFactor::ONE:
// These are all equivalent for src factor, so reuse that logic.
if (id.AlphaBlendSrc() == GEBlendSrcFactor(id.AlphaBlendDst())) {
if (id.AlphaBlendSrc() == id.AlphaBlendDst()) {
MOVDQA(dstFactorReg, R(srcFactorReg));
} else if (blendState.dstFactorIsInverse) {
constReg = GetConstBase();
MOVDQA(dstFactorReg, MConstDisp(constReg, &blendInvert_11_4s[0]));
if (RipAccessible(&blendInvert_11_4s[0])) {
MOVDQA(dstFactorReg, M(&blendInvert_11_4s[0]));
} else {
constReg = GetConstBase();
MOVDQA(dstFactorReg, MConstDisp(constReg, &blendInvert_11_4s[0]));
}
PSUBUSW(dstFactorReg, R(srcFactorReg));
} else {
success = success && Jit_BlendFactor(id, dstFactorReg, dstReg, GEBlendSrcFactor(id.AlphaBlendDst()));
success = success && Jit_BlendFactor(id, dstFactorReg, dstReg, id.AlphaBlendDst());
}
break;
case GE_DSTBLEND_FIXB:
case PixelBlendFactor::FIX:
default:
gstateReg = GetGState();
if (cpu_info.bSSE4_1) {
@ -1697,11 +1779,17 @@ bool PixelJitCache::Jit_WriteColor(const PixelFuncID &id) {
}
bool PixelJitCache::Jit_ApplyLogicOp(const PixelFuncID &id, RegCache::Reg colorReg, RegCache::Reg maskReg) {
X64Reg gstateReg = GetGState();
X64Reg logicOpReg = regCache_.Alloc(RegCache::GEN_TEMP4);
MOVZX(32, 8, logicOpReg, MDisp(gstateReg, offsetof(GPUgstate, lop)));
X64Reg logicOpReg = INVALID_REG;
if (RipAccessible(&gstate.lop)) {
logicOpReg = regCache_.Alloc(RegCache::GEN_TEMP4);
MOVZX(32, 8, logicOpReg, M(&gstate.lop));
} else {
X64Reg gstateReg = GetGState();
logicOpReg = regCache_.Alloc(RegCache::GEN_TEMP4);
MOVZX(32, 8, logicOpReg, MDisp(gstateReg, offsetof(GPUgstate, lop)));
regCache_.Unlock(gstateReg, RegCache::GEN_GSTATE);
}
AND(8, R(logicOpReg), Imm8(0x0F));
regCache_.Unlock(gstateReg, RegCache::GEN_GSTATE);
X64Reg stencilReg = INVALID_REG;
if (regCache_.Has(RegCache::GEN_STENCIL))

View File

@ -42,6 +42,14 @@ static inline GEComparison OptimizeRefByteCompare(GEComparison func, u8 ref) {
return func;
}
// Picks the blend factor to use for a fixed blend color.
// An all-zero color becomes ZERO, 0x00FFFFFF becomes ONE, and every other
// value stays as the generic FIX factor.
static inline PixelBlendFactor OptimizeAlphaFactor(uint32_t color) {
	switch (color) {
	case 0x00000000:
		return PixelBlendFactor::ZERO;
	case 0x00FFFFFF:
		return PixelBlendFactor::ONE;
	default:
		return PixelBlendFactor::FIX;
	}
}
void ComputePixelFuncID(PixelFuncID *id) {
id->fullKey = 0;
@ -143,10 +151,16 @@ void ComputePixelFuncID(PixelFuncID *id) {
if (srcFixedOne && dstFixedZero)
id->alphaBlend = false;
}
if (id->alphaBlend) {
if (id->alphaBlend)
id->alphaBlendEq = gstate.getBlendEq();
if (id->alphaBlend && id->alphaBlendEq <= GE_BLENDMODE_MUL_AND_SUBTRACT_REVERSE) {
id->alphaBlendSrc = gstate.getBlendFuncA();
id->alphaBlendDst = gstate.getBlendFuncB();
// Special values.
if (id->alphaBlendSrc == GE_SRCBLEND_FIXA)
id->alphaBlendSrc = (uint8_t)OptimizeAlphaFactor(gstate.getFixA());
if (id->alphaBlendDst == GE_DSTBLEND_FIXB)
id->alphaBlendDst = (uint8_t)OptimizeAlphaFactor(gstate.getFixB());
}
id->applyLogicOp = gstate.isLogicOpEnabled() && gstate.getLogicOp() != GE_LOGIC_COPY;
@ -301,30 +315,34 @@ std::string DescribePixelFuncID(const PixelFuncID &id) {
case GE_BLENDMODE_ABSDIFF: desc += "BlendDiff<"; break;
}
switch (id.AlphaBlendSrc()) {
case GE_SRCBLEND_DSTCOLOR: desc += "DstRGB,"; break;
case GE_SRCBLEND_INVDSTCOLOR: desc += "1-DstRGB,"; break;
case GE_SRCBLEND_SRCALPHA: desc += "SrcA,"; break;
case GE_SRCBLEND_INVSRCALPHA: desc += "1-SrcA,"; break;
case GE_SRCBLEND_DSTALPHA: desc += "DstA,"; break;
case GE_SRCBLEND_INVDSTALPHA: desc += "1-DstA,"; break;
case GE_SRCBLEND_DOUBLESRCALPHA: desc += "2*SrcA,"; break;
case GE_SRCBLEND_DOUBLEINVSRCALPHA: desc += "1-2*SrcA,"; break;
case GE_SRCBLEND_DOUBLEDSTALPHA: desc += "2*DstA,"; break;
case GE_SRCBLEND_DOUBLEINVDSTALPHA: desc += "1-2*DstA,"; break;
case GE_SRCBLEND_FIXA: desc += "Fix,"; break;
case PixelBlendFactor::OTHERCOLOR: desc += "DstRGB,"; break;
case PixelBlendFactor::INVOTHERCOLOR: desc += "1-DstRGB,"; break;
case PixelBlendFactor::SRCALPHA: desc += "SrcA,"; break;
case PixelBlendFactor::INVSRCALPHA: desc += "1-SrcA,"; break;
case PixelBlendFactor::DSTALPHA: desc += "DstA,"; break;
case PixelBlendFactor::INVDSTALPHA: desc += "1-DstA,"; break;
case PixelBlendFactor::DOUBLESRCALPHA: desc += "2*SrcA,"; break;
case PixelBlendFactor::DOUBLEINVSRCALPHA: desc += "1-2*SrcA,"; break;
case PixelBlendFactor::DOUBLEDSTALPHA: desc += "2*DstA,"; break;
case PixelBlendFactor::DOUBLEINVDSTALPHA: desc += "1-2*DstA,"; break;
case PixelBlendFactor::FIX: desc += "Fix,"; break;
case PixelBlendFactor::ZERO: desc += "0,"; break;
case PixelBlendFactor::ONE: desc += "1,"; break;
}
switch (id.AlphaBlendDst()) {
case GE_DSTBLEND_SRCCOLOR: desc += "SrcRGB>:"; break;
case GE_DSTBLEND_INVSRCCOLOR: desc += "1-SrcRGB>:"; break;
case GE_DSTBLEND_SRCALPHA: desc += "SrcA>:"; break;
case GE_DSTBLEND_INVSRCALPHA: desc += "1-SrcA>:"; break;
case GE_DSTBLEND_DSTALPHA: desc += "DstA>:"; break;
case GE_DSTBLEND_INVDSTALPHA: desc += "1-DstA>:"; break;
case GE_DSTBLEND_DOUBLESRCALPHA: desc += "2*SrcA>:"; break;
case GE_DSTBLEND_DOUBLEINVSRCALPHA: desc += "1-2*SrcA>:"; break;
case GE_DSTBLEND_DOUBLEDSTALPHA: desc += "2*DstA>:"; break;
case GE_DSTBLEND_DOUBLEINVDSTALPHA: desc += "1-2*DstA>:"; break;
case GE_DSTBLEND_FIXB: desc += "Fix>:"; break;
case PixelBlendFactor::OTHERCOLOR: desc += "SrcRGB>:"; break;
case PixelBlendFactor::INVOTHERCOLOR: desc += "1-SrcRGB>:"; break;
case PixelBlendFactor::SRCALPHA: desc += "SrcA>:"; break;
case PixelBlendFactor::INVSRCALPHA: desc += "1-SrcA>:"; break;
case PixelBlendFactor::DSTALPHA: desc += "DstA>:"; break;
case PixelBlendFactor::INVDSTALPHA: desc += "1-DstA>:"; break;
case PixelBlendFactor::DOUBLESRCALPHA: desc += "2*SrcA>:"; break;
case PixelBlendFactor::DOUBLEINVSRCALPHA: desc += "1-2*SrcA>:"; break;
case PixelBlendFactor::DOUBLEDSTALPHA: desc += "2*DstA>:"; break;
case PixelBlendFactor::DOUBLEINVDSTALPHA: desc += "1-2*DstA>:"; break;
case PixelBlendFactor::FIX: desc += "Fix>:"; break;
case PixelBlendFactor::ZERO: desc += "0>:"; break;
case PixelBlendFactor::ONE: desc += "1>:"; break;
}
}

View File

@ -25,6 +25,24 @@
#define SOFTPIXEL_USE_CACHE 1
// Blend factor selector.
// Values 0 through 10 match GEBlendSrcFactor/GEBlendDstFactor numerically.
enum class PixelBlendFactor {
	OTHERCOLOR = 0,
	INVOTHERCOLOR = 1,
	SRCALPHA = 2,
	INVSRCALPHA = 3,
	DSTALPHA = 4,
	INVDSTALPHA = 5,
	DOUBLESRCALPHA = 6,
	DOUBLEINVSRCALPHA = 7,
	DOUBLEDSTALPHA = 8,
	DOUBLEINVDSTALPHA = 9,
	FIX = 10,

	// Invented values (not part of the GE enums) for commonly seen FIX colors.
	ZERO = 11,
	ONE = 12,
};
#pragma pack(push, 1)
struct PixelFuncID {
@ -110,11 +128,11 @@ struct PixelFuncID {
GEBlendMode AlphaBlendEq() const {
return GEBlendMode(alphaBlendEq);
}
GEBlendSrcFactor AlphaBlendSrc() const {
return GEBlendSrcFactor(alphaBlendSrc);
PixelBlendFactor AlphaBlendSrc() const {
return PixelBlendFactor(alphaBlendSrc);
}
GEBlendDstFactor AlphaBlendDst() const {
return GEBlendDstFactor(alphaBlendDst);
PixelBlendFactor AlphaBlendDst() const {
return PixelBlendFactor(alphaBlendDst);
}
GEStencilOp SFail() const {

View File

@ -381,8 +381,8 @@ static inline Vec3<int> GetDestFactor(GEBlendDstFactor factor, const Vec4<int> &
Vec3<int> AlphaBlendingResult(const PixelFuncID &pixelID, const Vec4<int> &source, const Vec4<int> &dst)
{
// Note: These factors cannot go below 0, but they can go above 255 when doubling.
Vec3<int> srcfactor = GetSourceFactor(pixelID.AlphaBlendSrc(), source, dst);
Vec3<int> dstfactor = GetDestFactor(pixelID.AlphaBlendDst(), source, dst);
Vec3<int> srcfactor = GetSourceFactor(GEBlendSrcFactor(pixelID.AlphaBlendSrc()), source, dst);
Vec3<int> dstfactor = GetDestFactor(GEBlendDstFactor(pixelID.AlphaBlendDst()), source, dst);
switch (pixelID.AlphaBlendEq()) {
case GE_BLENDMODE_MUL_AND_ADD:

View File

@ -94,6 +94,7 @@ void DrawSprite(const VertexData& v0, const VertexData& v1) {
ScreenCoords pprime(v0.screenpos.x, v0.screenpos.y, 0);
Sampler::FetchFunc fetchFunc = Sampler::GetFetchFunc(samplerID);
Sampler::NearestFunc nearestFunc = Sampler::GetNearestFunc(samplerID);
Rasterizer::SingleFunc drawPixel = Rasterizer::GetSingleFunc(pixelID);
DrawingCoords pos0 = TransformUnit::ScreenToDrawing(v0.screenpos);
@ -189,19 +190,25 @@ void DrawSprite(const VertexData& v0, const VertexData& v1) {
}, pos0.y, pos1.y, MIN_LINES_PER_THREAD);
}
} else {
int xoff = ((v0.screenpos.x & 15) + 1) / 2;
int yoff = ((v0.screenpos.y & 15) + 1) / 2;
float dsf = 1.0f / (float)gstate.getTextureWidth(0);
float dtf = 1.0f / (float)gstate.getTextureHeight(0);
float sf_start = s_start * dsf;
float tf_start = t_start * dtf;
ParallelRangeLoop(&g_threadManager, [=](int y1, int y2) {
int t = t_start + (y1 - pos0.y) * dt;
float t = tf_start + (y1 - pos0.y) * dtf;
for (int y = y1; y < y2; y++) {
int s = s_start;
float s = sf_start;
// Not really that fast but faster than triangle.
for (int x = pos0.x; x < pos1.x; x++) {
Vec4<int> prim_color = v1.color0;
Vec4<int> tex_color = fetchFunc(s, t, texptr, texbufw, 0);
prim_color = GetTextureFunctionOutput(ToVec4IntArg(prim_color), ToVec4IntArg(tex_color));
Vec4<int> prim_color = nearestFunc(s, t, xoff, yoff, ToVec4IntArg(v1.color0), &texptr, &texbufw, 0, 0);
drawPixel(x, y, z, 255, ToVec4IntArg(prim_color), pixelID);
s += ds;
s += dsf;
}
t += dt;
t += dtf;
}
}, pos0.y, pos1.y, MIN_LINES_PER_THREAD);
}
@ -292,6 +299,7 @@ bool RectangleFastPath(const VertexData &v0, const VertexData &v1) {
bool orient_check = xdiff >= 0 && ydiff >= 0;
// We already have a fast path for clear in ClearRectangle.
bool state_check = !gstate.isModeClear() && NoClampOrWrap(v0.texturecoords) && NoClampOrWrap(v1.texturecoords);
// TODO: No mipmap levels? Might be a font at level 1...
if ((coord_check || !gstate.isTextureMapEnabled()) && orient_check && state_check) {
Rasterizer::DrawSprite(v0, v1);
return true;

View File

@ -39,8 +39,8 @@ extern u32 clut[4096];
namespace Sampler {
static Vec4IntResult SOFTRAST_CALL SampleNearest(float s, float t, int x, int y, Vec4IntArg prim_color, const u8 **tptr, const int *bufw, int level, int levelFrac);
static Vec4IntResult SOFTRAST_CALL SampleLinear(float s, float t, int x, int y, Vec4IntArg prim_color, const u8 **tptr, const int *bufw, int level, int levelFrac);
static Vec4IntResult SOFTRAST_CALL SampleNearest(float s, float t, int x, int y, Vec4IntArg prim_color, const u8 *const *tptr, const int *bufw, int level, int levelFrac);
static Vec4IntResult SOFTRAST_CALL SampleLinear(float s, float t, int x, int y, Vec4IntArg prim_color, const u8 *const *tptr, const int *bufw, int level, int levelFrac);
static Vec4IntResult SOFTRAST_CALL SampleFetch(int u, int v, const u8 *tptr, int bufw, int level);
std::mutex jitCacheLock;
@ -101,6 +101,7 @@ SamplerJitCache::SamplerJitCache()
{
// 256k should be enough.
AllocCodeSpace(1024 * 64 * 4);
ClearCodeSpace(0);
// Add some random code to "help" MSVC's buggy disassembler :(
#if defined(_WIN32) && (PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64)) && !PPSSPP_PLATFORM(UWP)
@ -443,7 +444,7 @@ static inline void GetTexelCoordinates(int level, float s, float t, int &out_u,
ApplyTexelClamp<1>(&out_u, &out_v, &base_u, &base_v, width, height);
}
static Vec4IntResult SOFTRAST_CALL SampleNearest(float s, float t, int x, int y, Vec4IntArg prim_color, const u8 **tptr, const int *bufw, int level, int levelFrac) {
static Vec4IntResult SOFTRAST_CALL SampleNearest(float s, float t, int x, int y, Vec4IntArg prim_color, const u8 *const *tptr, const int *bufw, int level, int levelFrac) {
int u, v;
// Nearest filtering only. Round texcoords.
@ -539,7 +540,7 @@ static inline Vec4IntResult SOFTRAST_CALL GetTexelCoordinatesQuadT(int level, fl
return ApplyTexelClampQuadT(gstate.isTexCoordClampedT(), base_v, height);
}
static Vec4IntResult SOFTRAST_CALL SampleLinearLevel(float s, float t, int x, int y, const u8 **tptr, const int *bufw, int texlevel) {
static Vec4IntResult SOFTRAST_CALL SampleLinearLevel(float s, float t, int x, int y, const u8 *const *tptr, const int *bufw, int texlevel) {
int frac_u, frac_v;
const Vec4<int> u = GetTexelCoordinatesQuadS(texlevel, s, frac_u, x);
const Vec4<int> v = GetTexelCoordinatesQuadT(texlevel, t, frac_v, y);
@ -554,7 +555,7 @@ static Vec4IntResult SOFTRAST_CALL SampleLinearLevel(float s, float t, int x, in
return ToVec4IntResult((top * (0x10 - frac_v) + bot * frac_v) / (16 * 16));
}
static Vec4IntResult SOFTRAST_CALL SampleLinear(float s, float t, int x, int y, Vec4IntArg prim_color, const u8 **tptr, const int *bufw, int texlevel, int levelFrac) {
static Vec4IntResult SOFTRAST_CALL SampleLinear(float s, float t, int x, int y, Vec4IntArg prim_color, const u8 *const *tptr, const int *bufw, int texlevel, int levelFrac) {
Vec4<int> c0 = SampleLinearLevel(s, t, x, y, tptr, bufw, texlevel);
if (levelFrac) {
const Vec4<int> c1 = SampleLinearLevel(s, t, x, y, tptr + 1, bufw + 1, texlevel + 1);

View File

@ -36,10 +36,10 @@ namespace Sampler {
typedef Rasterizer::Vec4IntResult(SOFTRAST_CALL *FetchFunc)(int u, int v, const u8 *tptr, int bufw, int level);
FetchFunc GetFetchFunc(SamplerID id);
typedef Rasterizer::Vec4IntResult (SOFTRAST_CALL *NearestFunc)(float s, float t, int x, int y, Rasterizer::Vec4IntArg prim_color, const u8 **tptr, const int *bufw, int level, int levelFrac);
typedef Rasterizer::Vec4IntResult (SOFTRAST_CALL *NearestFunc)(float s, float t, int x, int y, Rasterizer::Vec4IntArg prim_color, const u8 *const *tptr, const int *bufw, int level, int levelFrac);
NearestFunc GetNearestFunc(SamplerID id);
typedef Rasterizer::Vec4IntResult (SOFTRAST_CALL *LinearFunc)(float s, float t, int x, int y, Rasterizer::Vec4IntArg prim_color, const u8 **tptr, const int *bufw, int level, int levelFrac);
typedef Rasterizer::Vec4IntResult (SOFTRAST_CALL *LinearFunc)(float s, float t, int x, int y, Rasterizer::Vec4IntArg prim_color, const u8 *const *tptr, const int *bufw, int level, int levelFrac);
LinearFunc GetLinearFunc(SamplerID id);
struct Funcs {