mirror of
https://github.com/shadps4-emu/shadPS4.git
synced 2026-01-31 00:55:19 +01:00
shader_recompiler: VS clip distance emulation for NVIDIA GPUs (#3958)
This commit is contained in:
@@ -916,6 +916,7 @@ set(SHADER_RECOMPILER src/shader_recompiler/profile.h
|
||||
src/shader_recompiler/ir/passes/flatten_extended_userdata_pass.cpp
|
||||
src/shader_recompiler/ir/passes/hull_shader_transform.cpp
|
||||
src/shader_recompiler/ir/passes/identity_removal_pass.cpp
|
||||
src/shader_recompiler/ir/passes/inject_clip_distance_attributes.cpp
|
||||
src/shader_recompiler/ir/passes/ir_passes.h
|
||||
src/shader_recompiler/ir/passes/lower_buffer_format_to_raw.cpp
|
||||
src/shader_recompiler/ir/passes/lower_fp64_to_fp32.cpp
|
||||
|
||||
1
externals/CMakeLists.txt
vendored
1
externals/CMakeLists.txt
vendored
@@ -204,6 +204,7 @@ add_subdirectory(tracy)
|
||||
|
||||
# pugixml
|
||||
if (NOT TARGET pugixml::pugixml)
|
||||
option(PUGIXML_NO_EXCEPTIONS "" ON)
|
||||
add_subdirectory(pugixml)
|
||||
endif()
|
||||
|
||||
|
||||
@@ -364,7 +364,7 @@ void EmitContext::DefineInputs() {
|
||||
}
|
||||
break;
|
||||
}
|
||||
case LogicalStage::Fragment:
|
||||
case LogicalStage::Fragment: {
|
||||
if (info.loads.GetAny(IR::Attribute::FragCoord)) {
|
||||
frag_coord = DefineVariable(F32[4], spv::BuiltIn::FragCoord, spv::StorageClass::Input);
|
||||
}
|
||||
@@ -418,7 +418,13 @@ void EmitContext::DefineInputs() {
|
||||
spv::StorageClass::Input);
|
||||
}
|
||||
}
|
||||
for (s32 i = 0; i < runtime_info.fs_info.num_inputs; i++) {
|
||||
|
||||
const bool has_clip_distance_inputs = runtime_info.fs_info.clip_distance_emulation;
|
||||
// Clip distances attribute vector is the last in inputs array
|
||||
const auto num_inputs =
|
||||
runtime_info.fs_info.num_inputs - (has_clip_distance_inputs ? 1 : 0);
|
||||
|
||||
for (s32 i = 0; i < num_inputs; i++) {
|
||||
const auto& input = runtime_info.fs_info.inputs[i];
|
||||
if (input.IsDefault()) {
|
||||
continue;
|
||||
@@ -428,12 +434,13 @@ void EmitContext::DefineInputs() {
|
||||
const auto [primary, auxiliary] = info.fs_interpolation[i];
|
||||
const Id type = F32[num_components];
|
||||
const Id attr_id = [&] {
|
||||
const auto bind_location = input.param_index + (has_clip_distance_inputs ? 1 : 0);
|
||||
if (primary == Qualifier::PerVertex &&
|
||||
profile.supports_fragment_shader_barycentric) {
|
||||
return Name(DefineInput(TypeArray(type, ConstU32(3U)), input.param_index),
|
||||
return Name(DefineInput(TypeArray(type, ConstU32(3U)), bind_location),
|
||||
fmt::format("fs_in_attr{}_p", i));
|
||||
}
|
||||
return Name(DefineInput(type, input.param_index), fmt::format("fs_in_attr{}", i));
|
||||
return Name(DefineInput(type, bind_location), fmt::format("fs_in_attr{}", i));
|
||||
}();
|
||||
if (primary == Qualifier::PerVertex) {
|
||||
Decorate(attr_id, profile.supports_amd_shader_explicit_vertex_parameter
|
||||
@@ -450,7 +457,15 @@ void EmitContext::DefineInputs() {
|
||||
input_params[i] = GetAttributeInfo(AmdGpu::NumberFormat::Float, attr_id, num_components,
|
||||
false, false, primary == Qualifier::PerVertex);
|
||||
}
|
||||
|
||||
if (has_clip_distance_inputs) {
|
||||
const auto type = F32[MaxEmulatedClipDistances];
|
||||
const auto attr_id = Name(DefineInput(type, 0), fmt::format("cldist_attr{}", 0));
|
||||
input_params[num_inputs] = GetAttributeInfo(AmdGpu::NumberFormat::Float, attr_id,
|
||||
MaxEmulatedClipDistances, false);
|
||||
}
|
||||
break;
|
||||
}
|
||||
case LogicalStage::Compute:
|
||||
if (info.loads.GetAny(IR::Attribute::WorkgroupIndex) ||
|
||||
info.loads.GetAny(IR::Attribute::WorkgroupId)) {
|
||||
@@ -546,11 +561,16 @@ void EmitContext::DefineVertexBlock() {
|
||||
const std::array<Id, 8> zero{f32_zero_value, f32_zero_value, f32_zero_value, f32_zero_value,
|
||||
f32_zero_value, f32_zero_value, f32_zero_value, f32_zero_value};
|
||||
output_position = DefineVariable(F32[4], spv::BuiltIn::Position, spv::StorageClass::Output);
|
||||
const bool needs_clip_distance_emulation = l_stage == LogicalStage::Vertex &&
|
||||
stage == Stage::Vertex &&
|
||||
profile.needs_clip_distance_emulation;
|
||||
if (!needs_clip_distance_emulation) {
|
||||
if (info.stores.GetAny(IR::Attribute::ClipDistance)) {
|
||||
const Id type{TypeArray(F32[1], ConstU32(8U))};
|
||||
const Id initializer{ConstantComposite(type, zero)};
|
||||
clip_distances = DefineVariable(type, spv::BuiltIn::ClipDistance, spv::StorageClass::Output,
|
||||
initializer);
|
||||
clip_distances = DefineVariable(type, spv::BuiltIn::ClipDistance,
|
||||
spv::StorageClass::Output, initializer);
|
||||
}
|
||||
}
|
||||
if (info.stores.GetAny(IR::Attribute::CullDistance)) {
|
||||
const Id type{TypeArray(F32[1], ConstU32(8U))};
|
||||
@@ -583,16 +603,27 @@ void EmitContext::DefineOutputs() {
|
||||
Name(output_attr_array, "out_attrs");
|
||||
}
|
||||
} else {
|
||||
const auto has_clip_distance_outputs = info.stores.GetAny(IR::Attribute::ClipDistance);
|
||||
u32 num_attrs = 0u;
|
||||
for (u32 i = 0; i < IR::NumParams; i++) {
|
||||
const IR::Attribute param{IR::Attribute::Param0 + i};
|
||||
if (!info.stores.GetAny(param)) {
|
||||
continue;
|
||||
}
|
||||
const u32 num_components = info.stores.NumComponents(param);
|
||||
const Id id{DefineOutput(F32[num_components], i)};
|
||||
const Id id{
|
||||
DefineOutput(F32[num_components], i + (has_clip_distance_outputs ? 1 : 0))};
|
||||
Name(id, fmt::format("out_attr{}", i));
|
||||
output_params[i] =
|
||||
GetAttributeInfo(AmdGpu::NumberFormat::Float, id, num_components, true);
|
||||
++num_attrs;
|
||||
}
|
||||
|
||||
if (has_clip_distance_outputs) {
|
||||
clip_distances = Id{DefineOutput(F32[MaxEmulatedClipDistances], 0)};
|
||||
output_params[num_attrs] = GetAttributeInfo(
|
||||
AmdGpu::NumberFormat::Float, clip_distances, MaxEmulatedClipDistances, true);
|
||||
Name(clip_distances, fmt::format("cldist_attr{}", 0));
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
@@ -101,7 +101,7 @@ std::string NameOf(Attribute attribute) {
|
||||
case Attribute::Param31:
|
||||
return "Param31";
|
||||
case Attribute::ClipDistance:
|
||||
return "ClipDistanace";
|
||||
return "ClipDistance";
|
||||
case Attribute::CullDistance:
|
||||
return "CullDistance";
|
||||
case Attribute::RenderTargetIndex:
|
||||
|
||||
@@ -0,0 +1,41 @@
|
||||
// SPDX-FileCopyrightText: Copyright 2026 shadPS4 Emulator Project
|
||||
// SPDX-License-Identifier: GPL-2.0-or-later
|
||||
|
||||
#include "shader_recompiler/info.h"
|
||||
#include "shader_recompiler/ir/basic_block.h"
|
||||
#include "shader_recompiler/ir/ir_emitter.h"
|
||||
#include "shader_recompiler/ir/program.h"
|
||||
|
||||
namespace Shader {
|
||||
|
||||
// Injects clip distance emulation into a fragment program.
//
// When the program is a fragment-stage program and fs_info.clip_distance_emulation is set, this
// reads MaxEmulatedClipDistances components of one extra input attribute located right after the
// existing fragment inputs and emits a conditional discard for every component that is negative.
// fs_info.num_inputs is incremented to account for the appended attribute. In every other case
// the call leaves both the program and the runtime info untouched.
//
// program:      program to patch; the new instructions are inserted into its first block.
// runtime_info: runtime info whose fs_info is read and, when injection happens, updated.
void InjectClipDistanceAttributes(IR::Program& program, RuntimeInfo& runtime_info) {
    // Check the stage first so that fs_info is only consulted for fragment programs.
    if (program.info.l_stage != LogicalStage::Fragment) {
        return;
    }

    auto& info = runtime_info.fs_info;
    if (!info.clip_distance_emulation) {
        return;
    }

    // The extra attribute is addressed as Param0 + num_inputs. If every input slot were already
    // in use, that would step past the last Param attribute, so require one free slot.
    ASSERT(info.num_inputs < info.inputs.size());

    auto* first_block = *program.blocks.begin();
    auto it = std::ranges::find_if(first_block->Instructions(), [](const IR::Inst& inst) {
        return inst.GetOpcode() == IR::Opcode::Prologue;
    });
    // The insertion point is two instructions past the Prologue; both must exist.
    // NOTE(review): the instruction directly after Prologue is skipped as well - presumably it
    // has to stay ahead of the injected code; confirm against the translator.
    ASSERT(it != first_block->end());
    ++it;
    ASSERT(it != first_block->end());
    ++it;

    IR::IREmitter ir{*first_block, it};

    // The vertex shader has not been processed at this point, so the number of clip distances it
    // exports is unknown. Assume no more than MaxEmulatedClipDistances of them (the maximum is 8)
    // so that a single attribute export slot is enough.
    const auto attrib = IR::Attribute::Param0 + info.num_inputs;
    for (u32 comp = 0; comp < MaxEmulatedClipDistances; ++comp) {
        const auto attr_read = ir.GetAttribute(attrib, comp);
        // Discard the fragment when this clip distance component is negative.
        const auto cond_id = ir.FPLessThan(attr_read, ir.Imm32(0.0f));
        ir.Discard(cond_id);
    }

    // Account for the clip distance attribute appended after the regular inputs.
    ++info.num_inputs;
}
|
||||
|
||||
} // namespace Shader
|
||||
@@ -8,7 +8,8 @@
|
||||
|
||||
namespace Shader {
|
||||
struct Profile;
|
||||
}
|
||||
void InjectClipDistanceAttributes(IR::Program& program, RuntimeInfo& runtime_info);
|
||||
} // namespace Shader
|
||||
|
||||
namespace Shader::Optimization {
|
||||
|
||||
|
||||
@@ -41,7 +41,7 @@ struct Profile {
|
||||
bool needs_lds_barriers{};
|
||||
bool needs_buffer_offsets{};
|
||||
bool needs_unorm_fixup{};
|
||||
bool _pad0{};
|
||||
bool needs_clip_distance_emulation{};
|
||||
};
|
||||
|
||||
} // namespace Shader
|
||||
|
||||
@@ -13,17 +13,16 @@ namespace Shader {
|
||||
|
||||
IR::BlockList GenerateBlocks(const IR::AbstractSyntaxList& syntax_list) {
|
||||
size_t num_syntax_blocks{};
|
||||
for (const auto& node : syntax_list) {
|
||||
if (node.type == IR::AbstractSyntaxNode::Type::Block) {
|
||||
for (const auto& [_, type] : syntax_list) {
|
||||
if (type == IR::AbstractSyntaxNode::Type::Block) {
|
||||
++num_syntax_blocks;
|
||||
}
|
||||
}
|
||||
IR::BlockList blocks;
|
||||
IR::BlockList blocks{};
|
||||
blocks.reserve(num_syntax_blocks);
|
||||
u32 order_index{};
|
||||
for (const auto& node : syntax_list) {
|
||||
if (node.type == IR::AbstractSyntaxNode::Type::Block) {
|
||||
blocks.push_back(node.data.block);
|
||||
for (const auto& [data, type] : syntax_list) {
|
||||
if (type == IR::AbstractSyntaxNode::Type::Block) {
|
||||
blocks.push_back(data.block);
|
||||
}
|
||||
}
|
||||
return blocks;
|
||||
@@ -60,6 +59,10 @@ IR::Program TranslateProgram(const std::span<const u32>& code, Pools& pools, Inf
|
||||
program.blocks = GenerateBlocks(program.syntax_list);
|
||||
program.post_order_blocks = Shader::IR::PostOrder(program.syntax_list.front());
|
||||
|
||||
// On NVIDIA GPUs HW interpolation of clip distance values seems broken, and we need to emulate
|
||||
// it with expensive discard in PS.
|
||||
Shader::InjectClipDistanceAttributes(program, runtime_info);
|
||||
|
||||
// Run optimization passes
|
||||
if (!profile.support_float64) {
|
||||
Shader::Optimization::LowerFp64ToFp32(program);
|
||||
|
||||
@@ -34,6 +34,7 @@ enum class LogicalStage : u32 {
|
||||
};
|
||||
|
||||
constexpr u32 MaxStageTypes = static_cast<u32>(LogicalStage::NumLogicalStages);
|
||||
constexpr auto MaxEmulatedClipDistances = 4u;
|
||||
|
||||
constexpr Stage StageFromIndex(size_t index) noexcept {
|
||||
return static_cast<Stage>(index);
|
||||
@@ -201,14 +202,16 @@ struct FragmentRuntimeInfo {
|
||||
std::array<PsInput, 32> inputs;
|
||||
std::array<PsColorBuffer, MaxColorBuffers> color_buffers;
|
||||
AmdGpu::ShaderExportFormat z_export_format;
|
||||
u8 mrtz_mask;
|
||||
bool dual_source_blending;
|
||||
u8 mrtz_mask{};
|
||||
bool dual_source_blending{false};
|
||||
bool clip_distance_emulation{false};
|
||||
|
||||
bool operator==(const FragmentRuntimeInfo& other) const noexcept {
|
||||
return std::ranges::equal(color_buffers, other.color_buffers) &&
|
||||
en_flags == other.en_flags && addr_flags == other.addr_flags &&
|
||||
num_inputs == other.num_inputs && z_export_format == other.z_export_format &&
|
||||
mrtz_mask == other.mrtz_mask && dual_source_blending == other.dual_source_blending &&
|
||||
clip_distance_emulation == other.clip_distance_emulation &&
|
||||
std::ranges::equal(inputs.begin(), inputs.begin() + num_inputs, other.inputs.begin(),
|
||||
other.inputs.begin() + num_inputs);
|
||||
}
|
||||
|
||||
@@ -101,7 +101,7 @@ const Shader::RuntimeInfo& PipelineCache::BuildRuntimeInfo(Stage stage, LogicalS
|
||||
switch (stage) {
|
||||
case Stage::Local: {
|
||||
BuildCommon(regs.ls_program);
|
||||
Shader::TessellationDataConstantBuffer tess_constants;
|
||||
Shader::TessellationDataConstantBuffer tess_constants{};
|
||||
const auto* hull_info = infos[u32(Shader::LogicalStage::TessellationControl)];
|
||||
hull_info->ReadTessConstantBuffer(tess_constants);
|
||||
info.ls_info.ls_stride = tess_constants.ls_stride;
|
||||
@@ -199,6 +199,10 @@ const Shader::RuntimeInfo& PipelineCache::BuildRuntimeInfo(Stage stage, LogicalS
|
||||
for (u32 i = 0; i < Shader::MaxColorBuffers; i++) {
|
||||
info.fs_info.color_buffers[i] = graphics_key.color_buffers[i];
|
||||
}
|
||||
info.fs_info.clip_distance_emulation =
|
||||
regs.vs_output_control.clip_distance_enable &&
|
||||
!regs.stage_enable.IsStageEnabled(static_cast<u32>(Stage::Local)) &&
|
||||
profile.needs_clip_distance_emulation;
|
||||
break;
|
||||
}
|
||||
case Stage::Compute: {
|
||||
@@ -266,6 +270,7 @@ PipelineCache::PipelineCache(const Instance& instance_, Scheduler& scheduler_,
|
||||
instance.GetDriverID() == vk::DriverId::eMoltenvk,
|
||||
.needs_buffer_offsets = instance.StorageMinAlignment() > 4,
|
||||
.needs_unorm_fixup = instance.GetDriverID() == vk::DriverId::eMoltenvk,
|
||||
.needs_clip_distance_emulation = instance.GetDriverID() == vk::DriverId::eNvidiaProprietary,
|
||||
};
|
||||
|
||||
WarmUp();
|
||||
@@ -460,7 +465,13 @@ bool PipelineCache::RefreshGraphicsStages() {
|
||||
|
||||
infos.fill(nullptr);
|
||||
modules.fill(nullptr);
|
||||
bind_stage(Stage::Fragment, LogicalStage::Fragment);
|
||||
const auto result = bind_stage(Stage::Fragment, LogicalStage::Fragment);
|
||||
if (!result && regs.vs_output_control.clip_distance_enable &&
|
||||
profile.needs_clip_distance_emulation) {
|
||||
// TODO: need to implement a discard only fallback shader
|
||||
LOG_WARNING(Render_Vulkan,
|
||||
"Clip distance emulation is ineffective due to absense of fragment shader");
|
||||
}
|
||||
|
||||
const auto* fs_info = infos[static_cast<u32>(LogicalStage::Fragment)];
|
||||
key.mrt_mask = fs_info ? fs_info->mrt_mask : 0u;
|
||||
|
||||
Reference in New Issue
Block a user