Merge pull request #10454 from unknownbrackets/gpu-minor

Vulkan: Use depth clamping, where available
This commit is contained in:
Henrik Rydgård 2017-12-27 11:11:18 +01:00 committed by GitHub
commit ea50561c80
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
12 changed files with 123 additions and 39 deletions

View File

@ -508,6 +508,9 @@ float DepthSliceFactor() {
if (gstate_c.Supports(GPU_SCALE_DEPTH_FROM_24BIT_TO_16BIT)) {
return DEPTH_SLICE_FACTOR_16BIT;
}
if (gstate_c.Supports(GPU_SUPPORTS_DEPTH_CLAMP)) {
return 1.0f;
}
return DEPTH_SLICE_FACTOR_HIGH;
}
@ -681,6 +684,7 @@ void ConvertViewportAndScissor(bool useBufferedRendering, float renderWidth, flo
// So, we apply the depth range as minz/maxz, and transform for the viewport.
float vpZScale = gstate.getViewportZScale();
float vpZCenter = gstate.getViewportZCenter();
// TODO: This clip the entire draw if minz > maxz.
float minz = gstate.getDepthRangeMin();
float maxz = gstate.getDepthRangeMax();

View File

@ -182,7 +182,7 @@ const CommonCommandTableEntry commonCommandTable[] = {
{ GE_CMD_VIEWPORTYCENTER, FLAG_FLUSHBEFOREONCHANGE, DIRTY_FRAMEBUF | DIRTY_TEXTURE_PARAMS | DIRTY_VIEWPORTSCISSOR_STATE },
{ GE_CMD_VIEWPORTZSCALE, FLAG_FLUSHBEFOREONCHANGE, DIRTY_FRAMEBUF | DIRTY_TEXTURE_PARAMS | DIRTY_DEPTHRANGE | DIRTY_PROJMATRIX | DIRTY_VIEWPORTSCISSOR_STATE },
{ GE_CMD_VIEWPORTZCENTER, FLAG_FLUSHBEFOREONCHANGE, DIRTY_FRAMEBUF | DIRTY_TEXTURE_PARAMS | DIRTY_DEPTHRANGE | DIRTY_PROJMATRIX | DIRTY_VIEWPORTSCISSOR_STATE },
{ GE_CMD_CLIPENABLE, FLAG_FLUSHBEFOREONCHANGE, DIRTY_VIEWPORTSCISSOR_STATE },
{ GE_CMD_CLIPENABLE, FLAG_FLUSHBEFOREONCHANGE, DIRTY_VIEWPORTSCISSOR_STATE | DIRTY_RASTER_STATE },
// Z clip
{ GE_CMD_MINZ, FLAG_FLUSHBEFOREONCHANGE, DIRTY_DEPTHRANGE | DIRTY_VIEWPORTSCISSOR_STATE },
@ -1739,7 +1739,6 @@ void GPUCommon::Execute_BoneMtxNum(u32 op, u32 diff) {
const int end = 12 * 8 - (op & 0x7F);
int i = 0;
// TODO: Validate what should happen when explicitly setting num to 96 or higher.
bool fastLoad = !debugRecording_ && end > 0;
if (currentList->pc < currentList->stall && currentList->pc + end * 4 >= currentList->stall) {
fastLoad = false;

View File

@ -37,6 +37,9 @@ alignas(16) GPUgstate gstate;
// Let's align this one too for good measure.
alignas(16) GPUStateCache gstate_c;
// For save state compatibility.
static int savedContextVersion = 1;
struct CmdRange {
u8 start;
u8 end;
@ -77,6 +80,25 @@ static const CmdRange contextCmdRanges[] = {
// Skip: {0xFA, 0xFF},
};
static u32_le *SaveMatrix(u32_le *cmds, const float *mtx, int sz, int numcmd, int datacmd) {
*cmds++ = numcmd << 24;
for (int i = 0; i < sz; ++i) {
*cmds++ = (datacmd << 24) | toFloat24(mtx[i]);
}
return cmds;
}
static const u32_le *LoadMatrix(const u32_le *cmds, float *mtx, int sz) {
// Skip the reset.
cmds++;
for (int i = 0; i < sz; ++i) {
mtx[i] = getFloat24(*cmds++);
}
return cmds;
}
void GPUgstate::Reset() {
memset(gstate.cmdmem, 0, sizeof(gstate.cmdmem));
for (int i = 0; i < 256; i++) {
@ -89,6 +111,8 @@ void GPUgstate::Reset() {
memset(gstate.projMatrix, 0, sizeof(gstate.projMatrix));
memset(gstate.tgenMatrix, 0, sizeof(gstate.tgenMatrix));
memset(gstate.boneMatrix, 0, sizeof(gstate.boneMatrix));
savedContextVersion = 1;
}
void GPUgstate::Save(u32_le *ptr) {
@ -105,22 +129,37 @@ void GPUgstate::Save(u32_le *ptr) {
}
}
if (Memory::IsValidAddress(getClutAddress()))
*cmds++ = loadclut;
if (savedContextVersion == 0) {
if (Memory::IsValidAddress(getClutAddress()))
*cmds++ = loadclut;
// Seems like it actually writes commands to load the matrices and then reset the counts.
*cmds++ = boneMatrixNumber;
*cmds++ = worldmtxnum;
*cmds++ = viewmtxnum;
*cmds++ = projmtxnum;
*cmds++ = texmtxnum;
// Seems like it actually writes commands to load the matrices and then reset the counts.
*cmds++ = boneMatrixNumber;
*cmds++ = worldmtxnum;
*cmds++ = viewmtxnum;
*cmds++ = projmtxnum;
*cmds++ = texmtxnum;
u8 *matrices = (u8 *)cmds;
memcpy(matrices, boneMatrix, sizeof(boneMatrix)); matrices += sizeof(boneMatrix);
memcpy(matrices, worldMatrix, sizeof(worldMatrix)); matrices += sizeof(worldMatrix);
memcpy(matrices, viewMatrix, sizeof(viewMatrix)); matrices += sizeof(viewMatrix);
memcpy(matrices, projMatrix, sizeof(projMatrix)); matrices += sizeof(projMatrix);
memcpy(matrices, tgenMatrix, sizeof(tgenMatrix)); matrices += sizeof(tgenMatrix);
u8 *matrices = (u8 *)cmds;
memcpy(matrices, boneMatrix, sizeof(boneMatrix)); matrices += sizeof(boneMatrix);
memcpy(matrices, worldMatrix, sizeof(worldMatrix)); matrices += sizeof(worldMatrix);
memcpy(matrices, viewMatrix, sizeof(viewMatrix)); matrices += sizeof(viewMatrix);
memcpy(matrices, projMatrix, sizeof(projMatrix)); matrices += sizeof(projMatrix);
memcpy(matrices, tgenMatrix, sizeof(tgenMatrix)); matrices += sizeof(tgenMatrix);
} else {
cmds = SaveMatrix(cmds, boneMatrix, ARRAY_SIZE(boneMatrix), GE_CMD_BONEMATRIXNUMBER, GE_CMD_BONEMATRIXDATA);
cmds = SaveMatrix(cmds, worldMatrix, ARRAY_SIZE(worldMatrix), GE_CMD_WORLDMATRIXNUMBER, GE_CMD_WORLDMATRIXDATA);
cmds = SaveMatrix(cmds, viewMatrix, ARRAY_SIZE(viewMatrix), GE_CMD_VIEWMATRIXNUMBER, GE_CMD_VIEWMATRIXDATA);
cmds = SaveMatrix(cmds, projMatrix, ARRAY_SIZE(projMatrix), GE_CMD_PROJMATRIXNUMBER, GE_CMD_PROJMATRIXDATA);
cmds = SaveMatrix(cmds, tgenMatrix, ARRAY_SIZE(tgenMatrix), GE_CMD_TGENMATRIXNUMBER, GE_CMD_TGENMATRIXDATA);
*cmds++ = boneMatrixNumber;
*cmds++ = worldmtxnum;
*cmds++ = viewmtxnum;
*cmds++ = projmtxnum;
*cmds++ = texmtxnum;
*cmds++ = GE_CMD_END << 24;
}
}
void GPUgstate::FastLoadBoneMatrix(u32 addr) {
@ -165,27 +204,41 @@ void GPUgstate::Restore(u32_le *ptr) {
gstate_c.offsetAddr = ptr[7];
// Command values start 17 ints in.
u32_le *cmds = ptr + 17;
const u32_le *cmds = ptr + 17;
for (size_t i = 0; i < ARRAY_SIZE(contextCmdRanges); ++i) {
for (int n = contextCmdRanges[i].start; n <= contextCmdRanges[i].end; ++n) {
cmdmem[n] = *cmds++;
}
}
if (Memory::IsValidAddress(getClutAddress()))
loadclut = *cmds++;
boneMatrixNumber = *cmds++;
worldmtxnum = *cmds++;
viewmtxnum = *cmds++;
projmtxnum = *cmds++;
texmtxnum = *cmds++;
if (savedContextVersion == 0) {
if (Memory::IsValidAddress(getClutAddress()))
loadclut = *cmds++;
boneMatrixNumber = *cmds++;
worldmtxnum = *cmds++;
viewmtxnum = *cmds++;
projmtxnum = *cmds++;
texmtxnum = *cmds++;
u8 *matrices = (u8 *)cmds;
memcpy(boneMatrix, matrices, sizeof(boneMatrix)); matrices += sizeof(boneMatrix);
memcpy(worldMatrix, matrices, sizeof(worldMatrix)); matrices += sizeof(worldMatrix);
memcpy(viewMatrix, matrices, sizeof(viewMatrix)); matrices += sizeof(viewMatrix);
memcpy(projMatrix, matrices, sizeof(projMatrix)); matrices += sizeof(projMatrix);
memcpy(tgenMatrix, matrices, sizeof(tgenMatrix)); matrices += sizeof(tgenMatrix);
u8 *matrices = (u8 *)cmds;
memcpy(boneMatrix, matrices, sizeof(boneMatrix)); matrices += sizeof(boneMatrix);
memcpy(worldMatrix, matrices, sizeof(worldMatrix)); matrices += sizeof(worldMatrix);
memcpy(viewMatrix, matrices, sizeof(viewMatrix)); matrices += sizeof(viewMatrix);
memcpy(projMatrix, matrices, sizeof(projMatrix)); matrices += sizeof(projMatrix);
memcpy(tgenMatrix, matrices, sizeof(tgenMatrix)); matrices += sizeof(tgenMatrix);
} else {
cmds = LoadMatrix(cmds, boneMatrix, ARRAY_SIZE(boneMatrix));
cmds = LoadMatrix(cmds, worldMatrix, ARRAY_SIZE(worldMatrix));
cmds = LoadMatrix(cmds, viewMatrix, ARRAY_SIZE(viewMatrix));
cmds = LoadMatrix(cmds, projMatrix, ARRAY_SIZE(projMatrix));
cmds = LoadMatrix(cmds, tgenMatrix, ARRAY_SIZE(tgenMatrix));
boneMatrixNumber = *cmds++;
worldmtxnum = *cmds++;
viewmtxnum = *cmds++;
projmtxnum = *cmds++;
texmtxnum = *cmds++;
}
}
bool vertTypeIsSkinningEnabled(u32 vertType) {
@ -217,7 +270,7 @@ void GPUStateCache::Reset() {
}
void GPUStateCache::DoState(PointerWrap &p) {
auto s = p.Section("GPUStateCache", 0, 4);
auto s = p.Section("GPUStateCache", 0, 5);
if (!s) {
// Old state, this was not versioned.
GPUStateCache_v0 old;
@ -231,6 +284,8 @@ void GPUStateCache::DoState(PointerWrap &p) {
vertexFullAlpha = old.vertexFullAlpha;
skipDrawReason = old.skipDrawReason;
uv = old.uv;
savedContextVersion = 0;
} else {
p.Do(vertexAddr);
p.Do(indexAddr);
@ -290,4 +345,9 @@ void GPUStateCache::DoState(PointerWrap &p) {
p.Do(curRTHeight);
// curRTBufferWidth, curRTBufferHeight, and cutRTOffsetX don't need to be saved.
if (s < 5) {
savedContextVersion = 0;
} else {
p.Do(savedContextVersion);
}
}

View File

@ -477,6 +477,7 @@ enum {
GPU_SUPPORTS_VERTEX_TEXTURE_FETCH = FLAG_BIT(11),
GPU_SUPPORTS_TEXTURE_FLOAT = FLAG_BIT(12),
GPU_SUPPORTS_16BIT_FORMATS = FLAG_BIT(13),
GPU_SUPPORTS_DEPTH_CLAMP = FLAG_BIT(14),
GPU_SUPPORTS_LARGE_VIEWPORTS = FLAG_BIT(16),
GPU_SUPPORTS_ACCURATE_DEPTH = FLAG_BIT(17),
GPU_SUPPORTS_VAO = FLAG_BIT(18),

View File

@ -255,7 +255,7 @@ void ProcessLine(VertexData& v0, VertexData& v1)
return;
}
if (mask && (gstate.clipEnable & 0x1)) {
if (mask && gstate.isClippingEnabled()) {
// discard if any vertex is outside the near clipping plane
if (mask & CLIP_NEG_Z_BIT)
return;
@ -303,7 +303,7 @@ void ProcessTriangle(VertexData& v0, VertexData& v1, VertexData& v2)
mask |= CalcClipMask(v1.clippos);
mask |= CalcClipMask(v2.clippos);
if (mask && (gstate.clipEnable & 0x1)) {
if (mask && gstate.isClippingEnabled()) {
// discard if any vertex is outside the near clipping plane
if (mask & CLIP_NEG_Z_BIT)
return;

View File

@ -104,8 +104,8 @@ static inline ScreenCoords ClipToScreenInternal(const ClipCoords& coords, bool *
float y = coords.y * yScale / coords.w + yCenter;
float z = coords.z * zScale / coords.w + zCenter;
// Is this really right?
if (gstate.clipEnable & 0x1) {
// This matches hardware tests - depth is clamped when this flag is on.
if (gstate.isClippingEnabled()) {
if (z < 0.f)
z = 0.f;
if (z > 65535.f)

View File

@ -456,8 +456,20 @@ bool GenerateVulkanGLSLFragmentShader(const FShaderID &id, char *buffer) {
}
if (gstate_c.Supports(GPU_ROUND_FRAGMENT_DEPTH_TO_16BIT)) {
const double scale = DepthSliceFactor() * 65535.0;
WRITE(p, " highp float z = gl_FragCoord.z;\n");
WRITE(p, " z = (1.0/65535.0) * floor(z * 65535.0);\n");
if (gstate_c.Supports(GPU_SUPPORTS_ACCURATE_DEPTH)) {
// We center the depth with an offset, but only its fraction matters.
// When (DepthSliceFactor() - 1) is odd, it will be 0.5, otherwise 0.
if (((int)(DepthSliceFactor() - 1.0f) & 1) == 1) {
WRITE(p, " z = (floor((z * %f) - (1.0 / 2.0)) + (1.0 / 2.0)) * (1.0 / %f);\n", scale, scale);
} else {
WRITE(p, " z = floor(z * %f) * (1.0 / %f);\n", scale, scale);
}
} else {
WRITE(p, " z = (1.0/65535.0) * floor(z * 65535.0);\n");
}
WRITE(p, " gl_FragDepth = z;\n");
}

View File

@ -209,6 +209,9 @@ void GPU_Vulkan::CheckGPUFeatures() {
if (vulkan_->GetFeaturesEnabled().wideLines) {
features |= GPU_SUPPORTS_WIDE_LINES;
}
if (vulkan_->GetFeaturesEnabled().depthClamp) {
features |= GPU_SUPPORTS_DEPTH_CLAMP;
}
if (vulkan_->GetFeaturesEnabled().dualSrcBlend) {
switch (vulkan_->GetPhysicalDeviceProperties().vendorID) {
case VULKAN_VENDOR_NVIDIA:

View File

@ -188,7 +188,7 @@ static VulkanPipeline *CreateVulkanPipeline(VkDevice device, VkPipelineCache pip
rs.lineWidth = lineWidth;
rs.rasterizerDiscardEnable = false;
rs.polygonMode = VK_POLYGON_MODE_FILL;
rs.depthClampEnable = false;
rs.depthClampEnable = key.depthClampEnable;
VkPipelineMultisampleStateCreateInfo ms = { VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO };
ms.pSampleMask = nullptr;

View File

@ -241,10 +241,13 @@ void DrawEngineVulkan::ConvertStateToVulkanKey(FramebufferManagerVulkan &fbManag
if (gstate_c.IsDirty(DIRTY_RASTER_STATE)) {
if (gstate.isModeClear()) {
key.cullMode = VK_CULL_MODE_NONE;
// TODO: Or does it always clamp?
key.depthClampEnable = false;
} else {
// Set cull
bool wantCull = !gstate.isModeThrough() && prim != GE_PRIM_RECTANGLES && gstate.isCullEnabled();
key.cullMode = wantCull ? (gstate.getCullMode() ? VK_CULL_MODE_FRONT_BIT : VK_CULL_MODE_BACK_BIT) : VK_CULL_MODE_NONE;
key.depthClampEnable = gstate.isClippingEnabled() && gstate_c.Supports(GPU_SUPPORTS_DEPTH_CLAMP);
}
}

View File

@ -20,7 +20,7 @@ struct VulkanDynamicState {
// Let's pack this tight using bitfields.
// If an enable flag is set to 0, all the data fields for that section should
// also be set to 0.
// ~54 bits.
// ~64 bits.
// Can't use enums unfortunately, they end up signed and breaking values above half their ranges.
struct VulkanPipelineRasterStateKey {
// Blend
@ -37,6 +37,7 @@ struct VulkanPipelineRasterStateKey {
unsigned int colorWriteMask : 4;
// Depth/Stencil
unsigned int depthClampEnable : 1;
unsigned int depthTestEnable : 1;
unsigned int depthWriteEnable : 1;
unsigned int depthCompareOp : 3; // VkCompareOp
@ -57,4 +58,4 @@ struct VulkanPipelineRasterStateKey {
size_t size = sizeof(VulkanPipelineRasterStateKey);
return memcmp(this, &other, size) < 0;
}
};
};

View File

@ -373,6 +373,7 @@ int main(int argc, const char* argv[])
g_Config.bVertexDecoderJit = true;
g_Config.bBlockTransferGPU = true;
g_Config.iSplineBezierQuality = 2;
g_Config.bHighQualityDepth = true;
#ifdef _WIN32
InitSysDirectories();