shader_recompiler: VS clip distance emulation for NVIDIA GPUs (#3958)

This commit is contained in:
psucien
2026-01-26 21:17:51 +01:00
committed by GitHub
parent fa497f6bfd
commit 1e99c4b506
10 changed files with 116 additions and 24 deletions

View File

@@ -916,6 +916,7 @@ set(SHADER_RECOMPILER src/shader_recompiler/profile.h
src/shader_recompiler/ir/passes/flatten_extended_userdata_pass.cpp
src/shader_recompiler/ir/passes/hull_shader_transform.cpp
src/shader_recompiler/ir/passes/identity_removal_pass.cpp
src/shader_recompiler/ir/passes/inject_clip_distance_attributes.cpp
src/shader_recompiler/ir/passes/ir_passes.h
src/shader_recompiler/ir/passes/lower_buffer_format_to_raw.cpp
src/shader_recompiler/ir/passes/lower_fp64_to_fp32.cpp

View File

@@ -204,6 +204,7 @@ add_subdirectory(tracy)
# pugixml
if (NOT TARGET pugixml::pugixml)
option(PUGIXML_NO_EXCEPTIONS "" ON)
add_subdirectory(pugixml)
endif()

View File

@@ -364,7 +364,7 @@ void EmitContext::DefineInputs() {
}
break;
}
case LogicalStage::Fragment:
case LogicalStage::Fragment: {
if (info.loads.GetAny(IR::Attribute::FragCoord)) {
frag_coord = DefineVariable(F32[4], spv::BuiltIn::FragCoord, spv::StorageClass::Input);
}
@@ -418,7 +418,13 @@ void EmitContext::DefineInputs() {
spv::StorageClass::Input);
}
}
for (s32 i = 0; i < runtime_info.fs_info.num_inputs; i++) {
const bool has_clip_distance_inputs = runtime_info.fs_info.clip_distance_emulation;
// Clip distances attribute vector is the last in inputs array
const auto num_inputs =
runtime_info.fs_info.num_inputs - (has_clip_distance_inputs ? 1 : 0);
for (s32 i = 0; i < num_inputs; i++) {
const auto& input = runtime_info.fs_info.inputs[i];
if (input.IsDefault()) {
continue;
@@ -428,12 +434,13 @@ void EmitContext::DefineInputs() {
const auto [primary, auxiliary] = info.fs_interpolation[i];
const Id type = F32[num_components];
const Id attr_id = [&] {
const auto bind_location = input.param_index + (has_clip_distance_inputs ? 1 : 0);
if (primary == Qualifier::PerVertex &&
profile.supports_fragment_shader_barycentric) {
return Name(DefineInput(TypeArray(type, ConstU32(3U)), input.param_index),
return Name(DefineInput(TypeArray(type, ConstU32(3U)), bind_location),
fmt::format("fs_in_attr{}_p", i));
}
return Name(DefineInput(type, input.param_index), fmt::format("fs_in_attr{}", i));
return Name(DefineInput(type, bind_location), fmt::format("fs_in_attr{}", i));
}();
if (primary == Qualifier::PerVertex) {
Decorate(attr_id, profile.supports_amd_shader_explicit_vertex_parameter
@@ -450,7 +457,15 @@ void EmitContext::DefineInputs() {
input_params[i] = GetAttributeInfo(AmdGpu::NumberFormat::Float, attr_id, num_components,
false, false, primary == Qualifier::PerVertex);
}
if (has_clip_distance_inputs) {
const auto type = F32[MaxEmulatedClipDistances];
const auto attr_id = Name(DefineInput(type, 0), fmt::format("cldist_attr{}", 0));
input_params[num_inputs] = GetAttributeInfo(AmdGpu::NumberFormat::Float, attr_id,
MaxEmulatedClipDistances, false);
}
break;
}
case LogicalStage::Compute:
if (info.loads.GetAny(IR::Attribute::WorkgroupIndex) ||
info.loads.GetAny(IR::Attribute::WorkgroupId)) {
@@ -546,11 +561,16 @@ void EmitContext::DefineVertexBlock() {
const std::array<Id, 8> zero{f32_zero_value, f32_zero_value, f32_zero_value, f32_zero_value,
f32_zero_value, f32_zero_value, f32_zero_value, f32_zero_value};
output_position = DefineVariable(F32[4], spv::BuiltIn::Position, spv::StorageClass::Output);
if (info.stores.GetAny(IR::Attribute::ClipDistance)) {
const Id type{TypeArray(F32[1], ConstU32(8U))};
const Id initializer{ConstantComposite(type, zero)};
clip_distances = DefineVariable(type, spv::BuiltIn::ClipDistance, spv::StorageClass::Output,
initializer);
const bool needs_clip_distance_emulation = l_stage == LogicalStage::Vertex &&
stage == Stage::Vertex &&
profile.needs_clip_distance_emulation;
if (!needs_clip_distance_emulation) {
if (info.stores.GetAny(IR::Attribute::ClipDistance)) {
const Id type{TypeArray(F32[1], ConstU32(8U))};
const Id initializer{ConstantComposite(type, zero)};
clip_distances = DefineVariable(type, spv::BuiltIn::ClipDistance,
spv::StorageClass::Output, initializer);
}
}
if (info.stores.GetAny(IR::Attribute::CullDistance)) {
const Id type{TypeArray(F32[1], ConstU32(8U))};
@@ -583,16 +603,27 @@ void EmitContext::DefineOutputs() {
Name(output_attr_array, "out_attrs");
}
} else {
const auto has_clip_distance_outputs = info.stores.GetAny(IR::Attribute::ClipDistance);
u32 num_attrs = 0u;
for (u32 i = 0; i < IR::NumParams; i++) {
const IR::Attribute param{IR::Attribute::Param0 + i};
if (!info.stores.GetAny(param)) {
continue;
}
const u32 num_components = info.stores.NumComponents(param);
const Id id{DefineOutput(F32[num_components], i)};
const Id id{
DefineOutput(F32[num_components], i + (has_clip_distance_outputs ? 1 : 0))};
Name(id, fmt::format("out_attr{}", i));
output_params[i] =
GetAttributeInfo(AmdGpu::NumberFormat::Float, id, num_components, true);
++num_attrs;
}
if (has_clip_distance_outputs) {
clip_distances = Id{DefineOutput(F32[MaxEmulatedClipDistances], 0)};
output_params[num_attrs] = GetAttributeInfo(
AmdGpu::NumberFormat::Float, clip_distances, MaxEmulatedClipDistances, true);
Name(clip_distances, fmt::format("cldist_attr{}", 0));
}
}
break;

View File

@@ -101,7 +101,7 @@ std::string NameOf(Attribute attribute) {
case Attribute::Param31:
return "Param31";
case Attribute::ClipDistance:
return "ClipDistanace";
return "ClipDistance";
case Attribute::CullDistance:
return "CullDistance";
case Attribute::RenderTargetIndex:

View File

@@ -0,0 +1,41 @@
// SPDX-FileCopyrightText: Copyright 2026 shadPS4 Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
#include "shader_recompiler/info.h"
#include "shader_recompiler/ir/basic_block.h"
#include "shader_recompiler/ir/ir_emitter.h"
#include "shader_recompiler/ir/program.h"
namespace Shader {
/// Injects the fragment-shader half of clip distance emulation into `program`.
///
/// Does nothing unless `program` is a fragment shader and
/// `runtime_info.fs_info.clip_distance_emulation` is set. Otherwise one extra input attribute is
/// read at slot `Param0 + fs_info.num_inputs`, and for each of its first
/// `MaxEmulatedClipDistances` components a discard conditioned on "component < 0.0" is emitted.
/// `fs_info.num_inputs` is incremented afterwards, so the emulated attribute becomes the last
/// entry of the inputs array.
///
/// @param program      IR program to patch in place.
/// @param runtime_info Runtime info whose `fs_info` is read and updated.
void InjectClipDistanceAttributes(IR::Program& program, RuntimeInfo& runtime_info) {
    // Check the stage first so that fs_info is only consulted for fragment programs.
    if (program.info.l_stage != LogicalStage::Fragment) {
        return;
    }
    auto& info = runtime_info.fs_info;
    if (!info.clip_distance_emulation) {
        return;
    }

    // The emulated attribute occupies slot `num_inputs`; it must still fit into the inputs array,
    // otherwise the attribute index computed below would leave the valid range.
    ASSERT(info.num_inputs < info.inputs.size());
    ASSERT(!program.blocks.empty());

    auto* first_block = *program.blocks.begin();
    auto it = std::ranges::find_if(first_block->Instructions(), [](const IR::Inst& inst) {
        return inst.GetOpcode() == IR::Opcode::Prologue;
    });
    // Step over the Prologue instruction and the instruction that follows it; the injected code
    // is inserted right after those two.
    // NOTE(review): presumably the instruction following Prologue has to stay ahead of the
    // injected reads - confirm against the translator's prologue sequence.
    ASSERT(it != first_block->end());
    ++it;
    ASSERT(it != first_block->end());
    ++it;

    IR::IREmitter ir{*first_block, it};

    // The VS has not been processed at this point, so the number of clip distances it exports is
    // unknown. We assume there are at most 4 of them (the maximum is 8), which lets them fit into
    // a single attribute export slot.
    const auto attrib = IR::Attribute::Param0 + info.num_inputs;
    for (u32 comp = 0; comp < MaxEmulatedClipDistances; ++comp) {
        const auto attr_read = ir.GetAttribute(attrib, comp);
        const auto cond_id = ir.FPLessThan(attr_read, ir.Imm32(0.0f));
        ir.Discard(cond_id);
    }
    ++info.num_inputs;
}
} // namespace Shader

View File

@@ -8,7 +8,8 @@
namespace Shader {
struct Profile;
}
void InjectClipDistanceAttributes(IR::Program& program, RuntimeInfo& runtime_info);
} // namespace Shader
namespace Shader::Optimization {

View File

@@ -41,7 +41,7 @@ struct Profile {
bool needs_lds_barriers{};
bool needs_buffer_offsets{};
bool needs_unorm_fixup{};
bool _pad0{};
bool needs_clip_distance_emulation{};
};
} // namespace Shader

View File

@@ -13,17 +13,16 @@ namespace Shader {
IR::BlockList GenerateBlocks(const IR::AbstractSyntaxList& syntax_list) {
size_t num_syntax_blocks{};
for (const auto& node : syntax_list) {
if (node.type == IR::AbstractSyntaxNode::Type::Block) {
for (const auto& [_, type] : syntax_list) {
if (type == IR::AbstractSyntaxNode::Type::Block) {
++num_syntax_blocks;
}
}
IR::BlockList blocks;
IR::BlockList blocks{};
blocks.reserve(num_syntax_blocks);
u32 order_index{};
for (const auto& node : syntax_list) {
if (node.type == IR::AbstractSyntaxNode::Type::Block) {
blocks.push_back(node.data.block);
for (const auto& [data, type] : syntax_list) {
if (type == IR::AbstractSyntaxNode::Type::Block) {
blocks.push_back(data.block);
}
}
return blocks;
@@ -60,6 +59,10 @@ IR::Program TranslateProgram(const std::span<const u32>& code, Pools& pools, Inf
program.blocks = GenerateBlocks(program.syntax_list);
program.post_order_blocks = Shader::IR::PostOrder(program.syntax_list.front());
// On NVIDIA GPUs HW interpolation of clip distance values seems broken, and we need to emulate
// it with expensive discard in PS.
Shader::InjectClipDistanceAttributes(program, runtime_info);
// Run optimization passes
if (!profile.support_float64) {
Shader::Optimization::LowerFp64ToFp32(program);

View File

@@ -34,6 +34,7 @@ enum class LogicalStage : u32 {
};
constexpr u32 MaxStageTypes = static_cast<u32>(LogicalStage::NumLogicalStages);
constexpr auto MaxEmulatedClipDistances = 4u;
constexpr Stage StageFromIndex(size_t index) noexcept {
return static_cast<Stage>(index);
@@ -201,14 +202,16 @@ struct FragmentRuntimeInfo {
std::array<PsInput, 32> inputs;
std::array<PsColorBuffer, MaxColorBuffers> color_buffers;
AmdGpu::ShaderExportFormat z_export_format;
u8 mrtz_mask;
bool dual_source_blending;
u8 mrtz_mask{};
bool dual_source_blending{false};
bool clip_distance_emulation{false};
bool operator==(const FragmentRuntimeInfo& other) const noexcept {
return std::ranges::equal(color_buffers, other.color_buffers) &&
en_flags == other.en_flags && addr_flags == other.addr_flags &&
num_inputs == other.num_inputs && z_export_format == other.z_export_format &&
mrtz_mask == other.mrtz_mask && dual_source_blending == other.dual_source_blending &&
clip_distance_emulation == other.clip_distance_emulation &&
std::ranges::equal(inputs.begin(), inputs.begin() + num_inputs, other.inputs.begin(),
other.inputs.begin() + num_inputs);
}

View File

@@ -101,7 +101,7 @@ const Shader::RuntimeInfo& PipelineCache::BuildRuntimeInfo(Stage stage, LogicalS
switch (stage) {
case Stage::Local: {
BuildCommon(regs.ls_program);
Shader::TessellationDataConstantBuffer tess_constants;
Shader::TessellationDataConstantBuffer tess_constants{};
const auto* hull_info = infos[u32(Shader::LogicalStage::TessellationControl)];
hull_info->ReadTessConstantBuffer(tess_constants);
info.ls_info.ls_stride = tess_constants.ls_stride;
@@ -199,6 +199,10 @@ const Shader::RuntimeInfo& PipelineCache::BuildRuntimeInfo(Stage stage, LogicalS
for (u32 i = 0; i < Shader::MaxColorBuffers; i++) {
info.fs_info.color_buffers[i] = graphics_key.color_buffers[i];
}
info.fs_info.clip_distance_emulation =
regs.vs_output_control.clip_distance_enable &&
!regs.stage_enable.IsStageEnabled(static_cast<u32>(Stage::Local)) &&
profile.needs_clip_distance_emulation;
break;
}
case Stage::Compute: {
@@ -266,6 +270,7 @@ PipelineCache::PipelineCache(const Instance& instance_, Scheduler& scheduler_,
instance.GetDriverID() == vk::DriverId::eMoltenvk,
.needs_buffer_offsets = instance.StorageMinAlignment() > 4,
.needs_unorm_fixup = instance.GetDriverID() == vk::DriverId::eMoltenvk,
.needs_clip_distance_emulation = instance.GetDriverID() == vk::DriverId::eNvidiaProprietary,
};
WarmUp();
@@ -460,7 +465,13 @@ bool PipelineCache::RefreshGraphicsStages() {
infos.fill(nullptr);
modules.fill(nullptr);
bind_stage(Stage::Fragment, LogicalStage::Fragment);
// Bind the fragment stage and keep the result so a missing fragment shader can be reported.
const auto result = bind_stage(Stage::Fragment, LogicalStage::Fragment);
if (!result && regs.vs_output_control.clip_distance_enable &&
    profile.needs_clip_distance_emulation) {
    // The emulation discards in the fragment shader, so it cannot take effect without one.
    // TODO: need to implement a discard only fallback shader
    LOG_WARNING(Render_Vulkan,
                "Clip distance emulation is ineffective due to absence of fragment shader");
}
const auto* fs_info = infos[static_cast<u32>(LogicalStage::Fragment)];
key.mrt_mask = fs_info ? fs_info->mrt_mask : 0u;