Merge pull request #10691 from hrydgard/more-drawprim-opt

More DrawPrim optimizations
This commit is contained in:
Henrik Rydgård 2018-03-05 18:23:08 +01:00 committed by GitHub
commit 90dbd9a725
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 90 additions and 47 deletions

View File

@ -93,7 +93,7 @@ int DrawEngineCommon::ComputeNumVertsToDecode() const {
void DrawEngineCommon::DecodeVerts(u8 *dest) {
const UVScale origUV = gstate_c.uv;
for (; decodeCounter_ < numDrawCalls; decodeCounter_++) {
gstate_c.uv = uvScale[decodeCounter_];
gstate_c.uv = drawCalls[decodeCounter_].uvScale;
DecodeVertsStep(dest, decodeCounter_, decodedVerts_); // NOTE! DecodeVertsStep can modify decodeCounter_!
}
gstate_c.uv = origUV;
@ -601,11 +601,12 @@ ReliableHashType DrawEngineCommon::ComputeHash() {
}
}
fullhash += DoReliableHash(&uvScale[0], sizeof(uvScale[0]) * numDrawCalls, 0x0123e658);
fullhash += DoReliableHash(&drawCalls[0].uvScale, sizeof(drawCalls[0].uvScale) * numDrawCalls, 0x0123e658);
return fullhash;
}
void DrawEngineCommon::SubmitPrim(void *verts, void *inds, GEPrimitiveType prim, int vertexCount, u32 vertType, int *bytesRead) {
// vertTypeID is the vertex type but with the UVGen mode smashed into the top bits.
void DrawEngineCommon::SubmitPrim(void *verts, void *inds, GEPrimitiveType prim, int vertexCount, u32 vertTypeID, int *bytesRead) {
if (!indexGen.PrimCompatible(prevPrim_, prim) || numDrawCalls >= MAX_DEFERRED_DRAW_CALLS || vertexCountInDrawCalls_ + vertexCount > VERTEX_BUFFER_MAX) {
DispatchFlush();
}
@ -617,9 +618,6 @@ void DrawEngineCommon::SubmitPrim(void *verts, void *inds, GEPrimitiveType prim,
prevPrim_ = prim;
}
// As the decoder depends on the UVGenMode when we use UV prescale, we simply mash it
// into the top of the verttype where there are unused bits.
const u32 vertTypeID = (vertType & 0xFFFFFF) | (gstate.getUVGenMode() << 24);
// If vtype has changed, setup the vertex decoder.
if (vertTypeID != lastVType_) {
dec_ = GetVertexDecoder(vertTypeID);
@ -630,35 +628,34 @@ void DrawEngineCommon::SubmitPrim(void *verts, void *inds, GEPrimitiveType prim,
if ((vertexCount < 2 && prim > 0) || (vertexCount < 3 && prim > 2 && prim != GE_PRIM_RECTANGLES))
return;
DeferredDrawCall &dc = drawCalls[numDrawCalls];
dc.verts = verts;
dc.inds = inds;
dc.indexType = (vertType & GE_VTYPE_IDX_MASK) >> GE_VTYPE_IDX_SHIFT;
dc.prim = prim;
dc.vertexCount = vertexCount;
if (g_Config.bVertexCache) {
u32 dhash = dcid_;
dhash = __rotl(dhash ^ (u32)(uintptr_t)verts, 13);
dhash = __rotl(dhash ^ (u32)(uintptr_t)inds, 13);
dhash = __rotl(dhash ^ (u32)vertType, 13);
dhash = __rotl(dhash ^ (u32)vertTypeID, 13);
dhash = __rotl(dhash ^ (u32)vertexCount, 13);
dcid_ = dhash ^ (u32)prim;
}
DeferredDrawCall &dc = drawCalls[numDrawCalls];
dc.verts = verts;
dc.inds = inds;
dc.indexType = (vertTypeID & GE_VTYPE_IDX_MASK) >> GE_VTYPE_IDX_SHIFT;
dc.prim = prim;
dc.vertexCount = vertexCount;
dc.uvScale = gstate_c.uv;
if (inds) {
GetIndexBounds(inds, vertexCount, vertType, &dc.indexLowerBound, &dc.indexUpperBound);
GetIndexBounds(inds, vertexCount, vertTypeID, &dc.indexLowerBound, &dc.indexUpperBound);
} else {
dc.indexLowerBound = 0;
dc.indexUpperBound = vertexCount - 1;
}
uvScale[numDrawCalls] = gstate_c.uv;
numDrawCalls++;
vertexCountInDrawCalls_ += vertexCount;
if (vertType & GE_VTYPE_WEIGHT_MASK) {
if (vertTypeID & GE_VTYPE_WEIGHT_MASK) {
DecodeVertsStep(decoded, decodeCounter_, decodedVerts_);
decodeCounter_++;
}

View File

@ -44,6 +44,12 @@ typedef u64 ReliableHashType;
typedef u32 ReliableHashType;
#endif
// Builds the "vertex type ID" that is passed to SubmitPrim/DispatchSubmitPrim.
// As the decoder depends on the UVGenMode when we use UV prescale, we simply mash it
// into the top of the verttype where there are unused bits.
//   vertType:  raw vertex type; only the low 24 bits are kept.
//   uvGenMode: UV generation mode; placed in bits 24 and up.
// Returns the combined ID.
inline uint32_t GetVertTypeID(uint32_t vertType, int uvGenMode) {
	// Shift as unsigned: left-shifting a signed int into or past the sign bit
	// (or shifting a negative value) is not well-defined, and the result is
	// combined with an unsigned value anyway.
	return (vertType & 0xFFFFFF) | (static_cast<uint32_t>(uvGenMode) << 24);
}
class DrawEngineCommon {
public:
DrawEngineCommon();
@ -59,13 +65,14 @@ public:
// This would seem to be unnecessary now, but is still required for splines/beziers to work in the software backend since SubmitPrim
// is different. Should probably refactor that.
virtual void DispatchSubmitPrim(void *verts, void *inds, GEPrimitiveType prim, int vertexCount, u32 vertType, int *bytesRead) {
SubmitPrim(verts, inds, prim, vertexCount, vertType, bytesRead);
// Note that vertTypeID should be computed using GetVertTypeID().
virtual void DispatchSubmitPrim(void *verts, void *inds, GEPrimitiveType prim, int vertexCount, u32 vertTypeID, int *bytesRead) {
SubmitPrim(verts, inds, prim, vertexCount, vertTypeID, bytesRead);
}
bool TestBoundingBox(void* control_points, int vertexCount, u32 vertType, int *bytesRead);
void SubmitPrim(void *verts, void *inds, GEPrimitiveType prim, int vertexCount, u32 vertType, int *bytesRead);
void SubmitPrim(void *verts, void *inds, GEPrimitiveType prim, int vertexCount, u32 vertTypeID, int *bytesRead);
void SubmitSpline(const void *control_points, const void *indices, int tess_u, int tess_v, int count_u, int count_v, int type_u, int type_v, GEPatchPrimType prim_type, bool computeNormals, bool patchFacing, u32 vertType, int *bytesRead);
void SubmitBezier(const void *control_points, const void *indices, int tess_u, int tess_v, int count_u, int count_v, GEPatchPrimType prim_type, bool computeNormals, bool patchFacing, u32 vertType, int *bytesRead);
@ -135,13 +142,13 @@ protected:
u32 vertexCount;
u16 indexLowerBound;
u16 indexUpperBound;
UVScale uvScale;
};
enum { MAX_DEFERRED_DRAW_CALLS = 128 };
DeferredDrawCall drawCalls[MAX_DEFERRED_DRAW_CALLS];
int numDrawCalls = 0;
int vertexCountInDrawCalls_ = 0;
UVScale uvScale[MAX_DEFERRED_DRAW_CALLS];
int decimationCounter_ = 0;
int decodeCounter_ = 0;

View File

@ -949,8 +949,10 @@ void DrawEngineCommon::SubmitSpline(const void *control_points, const void *indi
gstate_c.uv.vOff = 0.0f;
}
uint32_t vertTypeID = GetVertTypeID(vertTypeWithIndex16, gstate.getUVGenMode());
int generatedBytesRead;
DispatchSubmitPrim(splineBuffer, quadIndices_, primType[prim_type], count, vertTypeWithIndex16, &generatedBytesRead);
DispatchSubmitPrim(splineBuffer, quadIndices_, primType[prim_type], count, vertTypeID, &generatedBytesRead);
DispatchFlush();
@ -1091,8 +1093,9 @@ void DrawEngineCommon::SubmitBezier(const void *control_points, const void *indi
gstate_c.uv.vOff = 0;
}
uint32_t vertTypeID = GetVertTypeID(vertTypeWithIndex16, gstate.getUVGenMode());
int generatedBytesRead;
DispatchSubmitPrim(splineBuffer, quadIndices_, primType[prim_type], count, vertTypeWithIndex16, &generatedBytesRead);
DispatchSubmitPrim(splineBuffer, quadIndices_, primType[prim_type], count, vertTypeID, &generatedBytesRead);
DispatchFlush();

View File

@ -39,6 +39,13 @@ alignas(16) static const float by32768[4] = {
1.0f / 32768.0f, 1.0f / 32768.0f, 1.0f / 32768.0f, 1.0f / 32768.0f,
};
// Per-lane multipliers: 1/128 in the first two lanes, 1.0 in the last two (so a
// multiply leaves those lanes unchanged). The JIT multiplies the four floats loaded
// from gstate_c.uv by this table when the vertex type uses 8-bit texcoords.
// NOTE(review): presumably gstate_c.uv is laid out as (uScale, vScale, uOff, vOff),
// so only the scales get normalized - confirm against the UVScale definition.
alignas(16) static const float by128_11[4] = {
1.0f / 128.0f, 1.0f / 128.0f, 1.0f, 1.0f,
};
// Per-lane multipliers: 1/32768 in the first two lanes, 1.0 in the last two (so a
// multiply leaves those lanes unchanged). The JIT multiplies the four floats loaded
// from gstate_c.uv by this table when the vertex type uses 16-bit texcoords.
// NOTE(review): presumably gstate_c.uv is laid out as (uScale, vScale, uOff, vOff),
// so only the scales get normalized - confirm against the UVScale definition.
alignas(16) static const float by32768_11[4] = {
1.0f / 32768.0f, 1.0f / 32768.0f, 1.0f, 1.0f,
};
// Lane mask: all bits set in the first three 32-bit lanes, zero in the fourth.
alignas(16) static const u32 threeMasks[4] = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0 };
// Zero in the first three lanes; 0x3F800000 (the IEEE-754 single-precision bit
// pattern of 1.0f) in the fourth.
alignas(16) static const u32 aOne[4] = {0, 0, 0, 0x3F800000};
@ -222,20 +229,14 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int
// Keep the scale/offset in a few fp registers if we need it.
if (prescaleStep) {
MOV(PTRBITS, R(tempReg1), ImmPtr(&gstate_c.uv));
MOVSS(fpScaleOffsetReg, MDisp(tempReg1, 0));
MOVSS(fpScratchReg, MDisp(tempReg1, 4));
UNPCKLPS(fpScaleOffsetReg, R(fpScratchReg));
MOVUPS(fpScaleOffsetReg, MatR(tempReg1));
if ((dec.VertexType() & GE_VTYPE_TC_MASK) == GE_VTYPE_TC_8BIT) {
MOV(PTRBITS, R(tempReg2), ImmPtr(&by128));
MOV(PTRBITS, R(tempReg2), ImmPtr(&by128_11));
MULPS(fpScaleOffsetReg, MatR(tempReg2));
} else if ((dec.VertexType() & GE_VTYPE_TC_MASK) == GE_VTYPE_TC_16BIT) {
MOV(PTRBITS, R(tempReg2), ImmPtr(&by32768));
MOV(PTRBITS, R(tempReg2), ImmPtr(&by32768_11));
MULPS(fpScaleOffsetReg, MatR(tempReg2));
}
MOVSS(fpScratchReg, MDisp(tempReg1, 8));
MOVSS(fpScratchReg2, MDisp(tempReg1, 12));
UNPCKLPS(fpScratchReg, R(fpScratchReg2));
UNPCKLPD(fpScaleOffsetReg, R(fpScratchReg));
}
// Let's not bother with a proper stack frame. We just grab the arguments and go.

View File

@ -1498,7 +1498,8 @@ void GPUCommon::Execute_Prim(u32 op, u32 diff) {
int bytesRead = 0;
UpdateUVScaleOffset();
drawEngineCommon_->SubmitPrim(verts, inds, prim, count, vertexType, &bytesRead);
uint32_t vertTypeID = GetVertTypeID(vertexType, gstate.getUVGenMode());
drawEngineCommon_->SubmitPrim(verts, inds, prim, count, vertTypeID, &bytesRead);
// After drawing, we advance the vertexAddr (when non indexed) or indexAddr (when indexed).
// Some games rely on this, they don't bother reloading VADDR and IADDR.
// The VADDR/IADDR registers are NOT updated.
@ -1527,6 +1528,7 @@ void GPUCommon::Execute_Prim(u32 op, u32 diff) {
{
u32 count = data & 0xFFFF;
if (count == 0) {
// Ignore.
break;
}
@ -1535,34 +1537,65 @@ void GPUCommon::Execute_Prim(u32 op, u32 diff) {
verts = Memory::GetPointerUnchecked(gstate_c.vertexAddr);
inds = 0;
if ((vertexType & GE_VTYPE_IDX_MASK) != GE_VTYPE_IDX_NONE) {
u32 indexAddr = gstate_c.indexAddr;
if (!Memory::IsValidAddress(indexAddr)) {
ERROR_LOG_REPORT(G3D, "Bad index address %08x!", indexAddr);
return;
}
inds = Memory::GetPointerUnchecked(indexAddr);
inds = Memory::GetPointerUnchecked(gstate_c.indexAddr);
}
drawEngineCommon_->SubmitPrim(verts, inds, newPrim, count, vertexType, &bytesRead);
drawEngineCommon_->SubmitPrim(verts, inds, newPrim, count, vertTypeID, &bytesRead);
AdvanceVerts(vertexType, count, bytesRead);
totalVertCount += count;
break;
}
case GE_CMD_VERTEXTYPE:
// Some games spam redundant GE_CMD_VERTEXTYPE
if (data != vertexType) { // don't mask data, vertexType is unmasked
{
uint32_t diff = data ^ vertexType;
// don't mask upper bits, vertexType is unmasked
if (diff & ~GE_VTYPE_WEIGHTCOUNT_MASK) {
goto bail;
} else {
vertexType = data;
vertTypeID = GetVertTypeID(vertexType, gstate.getUVGenMode());
}
break;
}
case GE_CMD_VADDR:
gstate_c.vertexAddr = gstate_c.getRelativeAddress(data & 0x00FFFFFF);
break;
case GE_CMD_OFFSETADDR:
gstate.cmdmem[GE_CMD_OFFSETADDR] = data;
gstate_c.offsetAddr = data << 8;
break;
case GE_CMD_BASE:
gstate.cmdmem[GE_CMD_BASE] = data;
break;
case GE_CMD_NOP:
case GE_CMD_NOP_FF:
break;
case GE_CMD_BONEMATRIXNUMBER:
gstate.cmdmem[GE_CMD_BONEMATRIXNUMBER] = data;
break;
case GE_CMD_TEXSCALEU:
gstate.cmdmem[GE_CMD_TEXSCALEU] = data;
gstate_c.uv.uScale = getFloat24(data);
break;
case GE_CMD_TEXSCALEV:
gstate.cmdmem[GE_CMD_TEXSCALEV] = data;
gstate_c.uv.vScale = getFloat24(data);
break;
case GE_CMD_CALL:
{
// A bone matrix probably. If not we bail.
const u32 target = gstate_c.getRelativeAddress(data & 0x00FFFFFC);
if ((Memory::ReadUnchecked_U32(target) >> 24) == GE_CMD_BONEMATRIXDATA &&
(Memory::ReadUnchecked_U32(target + 11 * 4) >> 24) == GE_CMD_BONEMATRIXDATA &&
(Memory::ReadUnchecked_U32(target + 12 * 4) >> 24) == GE_CMD_RET &&
(target > currentList->stall || target + 12 * 4 < currentList->stall)) {
FastLoadBoneMatrix(target);
} else {
goto bail;
}
break;
}
default:
// All other commands might need a flush or something, stop this inner loop.
goto bail;
@ -1572,6 +1605,7 @@ void GPUCommon::Execute_Prim(u32 op, u32 diff) {
}
bail:
gstate.cmdmem[GE_CMD_VERTEXTYPE] = vertexType;
// Skip over the commands we just read out manually.
if (cmdCount > 0) {
UpdatePC(currentList->pc, currentList->pc + cmdCount * 4);
@ -2047,7 +2081,8 @@ void GPUCommon::FlushImm() {
int vtype = GE_VTYPE_POS_FLOAT | GE_VTYPE_COL_8888 | GE_VTYPE_THROUGH;
int bytesRead;
drawEngineCommon_->DispatchSubmitPrim(temp, nullptr, immPrim_, immCount_, vtype, &bytesRead);
uint32_t vertTypeID = GetVertTypeID(vtype, 0);
drawEngineCommon_->DispatchSubmitPrim(temp, nullptr, immPrim_, immCount_, vertTypeID, &bytesRead);
drawEngineCommon_->DispatchFlush();
// TODO: In the future, make a special path for these.
// drawEngineCommon_->DispatchSubmitImm(immBuffer_, immCount_);

View File

@ -29,7 +29,7 @@ class NullDrawEngine : public DrawEngineCommon {
public:
void DispatchFlush() override {
}
void DispatchSubmitPrim(void *verts, void *inds, GEPrimitiveType prim, int vertexCount, u32 vertType, int *bytesRead) override {
void DispatchSubmitPrim(void *verts, void *inds, GEPrimitiveType prim, int vertexCount, u32 vertTypeID, int *bytesRead) override {
}
};

View File

@ -53,8 +53,8 @@ SoftwareDrawEngine::~SoftwareDrawEngine() {
// Flush hook for the software draw engine. The body is empty, so calling it does
// nothing here.
void SoftwareDrawEngine::DispatchFlush() {
}
void SoftwareDrawEngine::DispatchSubmitPrim(void *verts, void *inds, GEPrimitiveType prim, int vertexCount, u32 vertType, int *bytesRead) {
transformUnit.SubmitPrimitive(verts, inds, prim, vertexCount, vertType, bytesRead, this);
void SoftwareDrawEngine::DispatchSubmitPrim(void *verts, void *inds, GEPrimitiveType prim, int vertexCount, u32 vertTypeID, int *bytesRead) {
transformUnit.SubmitPrimitive(verts, inds, prim, vertexCount, vertTypeID, bytesRead, this);
}
VertexDecoder *SoftwareDrawEngine::FindVertexDecoder(u32 vtype) {