chore: widen NVN bias offset range used for storage buffer tracking

- Updated `nvn_bias` structure to extend the offset range.
- Adjusted `offset_begin` from 0x110 to 0x100 and `offset_end` from 0x610 to 0x700.
- The wider range lets the pass accept constant buffer 0 offsets in [0x100, 0x700) as candidate NVN storage buffer descriptors; the required alignment of 16 is unchanged.
This commit is contained in:
Phoenix 2024-09-08 13:32:40 +10:00
parent c55a39d199
commit bac46e649e

View File

@ -1,4 +1,4 @@
// SPDX-FileCopyrightText: Copyright 2021 uzuy Emulator Project
// SPDX-FileCopyrightText: Copyright 2021 yuzu Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
#include <optional>
@ -15,509 +15,509 @@
#include "shader_recompiler/ir_opt/passes.h"
namespace Shader::Optimization {
namespace {
namespace {
/// Address in constant buffers to the storage buffer descriptor
struct StorageBufferAddr {
    /// Defaulted three-way comparison: members are compared in declaration order
    /// (index first, then offset). This also supplies the ordering std::less relies on.
    auto operator<=>(const StorageBufferAddr&) const noexcept = default;

    u32 index;  ///< Constant buffer index holding the descriptor
    u32 offset; ///< Offset of the descriptor inside that constant buffer
};
/// A global memory instruction together with the storage buffer it was tracked to
struct StorageInst {
    StorageBufferAddr storage_buffer; ///< Descriptor address the instruction resolves to
    IR::Inst* inst;                   ///< The global memory instruction itself
    IR::Block* block;                 ///< Block the instruction was collected from
};
/// Bias towards a certain range of constant buffers when looking for storage buffers
struct Bias {
    u32 index;        ///< Constant buffer index a candidate must use
    u32 offset_begin; ///< First accepted offset (inclusive, see MeetsBias)
    u32 offset_end;   ///< End of the accepted offsets (exclusive, see MeetsBias)
    u32 alignment;    ///< Alignment a candidate offset must satisfy (checked in Track)
};
using boost::container::flat_set;
using boost::container::small_vector;

/// Ordered set of storage buffer descriptor addresses, backed by a small_vector sized for 16
using StorageBufferSet =
    flat_set<StorageBufferAddr, std::less<StorageBufferAddr>, small_vector<StorageBufferAddr, 16>>;
/// List of global memory instructions with their storage buffer, sized for 24 entries
using StorageInstVector = small_vector<StorageInst, 24>;
/// Ordered set of storage buffer descriptor addresses that are written to
/// (same underlying container type as StorageBufferSet)
using StorageWritesSet =
    flat_set<StorageBufferAddr, std::less<StorageBufferAddr>, small_vector<StorageBufferAddr, 16>>;
/// State accumulated while collecting global memory instructions
struct StorageInfo {
    StorageBufferSet set;         ///< Every storage buffer a collected instruction resolved to
    StorageInstVector to_replace; ///< Collected instructions paired with their storage buffer
    StorageWritesSet writes;      ///< Storage buffers used by a write or atomic instruction
};
/// Returns true when the instruction is a global memory instruction
/// (any global load, global write or global atomic opcode listed below)
bool IsGlobalMemory(const IR::Inst& inst) {
    switch (inst.GetOpcode()) {
    // Loads
    case IR::Opcode::LoadGlobalS8:
    case IR::Opcode::LoadGlobalU8:
    case IR::Opcode::LoadGlobalS16:
    case IR::Opcode::LoadGlobalU16:
    case IR::Opcode::LoadGlobal32:
    case IR::Opcode::LoadGlobal64:
    case IR::Opcode::LoadGlobal128:
    // Writes
    case IR::Opcode::WriteGlobalS8:
    case IR::Opcode::WriteGlobalU8:
    case IR::Opcode::WriteGlobalS16:
    case IR::Opcode::WriteGlobalU16:
    case IR::Opcode::WriteGlobal32:
    case IR::Opcode::WriteGlobal64:
    case IR::Opcode::WriteGlobal128:
    // Atomics
    case IR::Opcode::GlobalAtomicIAdd32:
    case IR::Opcode::GlobalAtomicSMin32:
    case IR::Opcode::GlobalAtomicUMin32:
    case IR::Opcode::GlobalAtomicSMax32:
    case IR::Opcode::GlobalAtomicUMax32:
    case IR::Opcode::GlobalAtomicInc32:
    case IR::Opcode::GlobalAtomicDec32:
    case IR::Opcode::GlobalAtomicAnd32:
    case IR::Opcode::GlobalAtomicOr32:
    case IR::Opcode::GlobalAtomicXor32:
    case IR::Opcode::GlobalAtomicExchange32:
    case IR::Opcode::GlobalAtomicIAdd64:
    case IR::Opcode::GlobalAtomicSMin64:
    case IR::Opcode::GlobalAtomicUMin64:
    case IR::Opcode::GlobalAtomicSMax64:
    case IR::Opcode::GlobalAtomicUMax64:
    case IR::Opcode::GlobalAtomicAnd64:
    case IR::Opcode::GlobalAtomicOr64:
    case IR::Opcode::GlobalAtomicXor64:
    case IR::Opcode::GlobalAtomicExchange64:
    case IR::Opcode::GlobalAtomicIAdd32x2:
    case IR::Opcode::GlobalAtomicSMin32x2:
    case IR::Opcode::GlobalAtomicUMin32x2:
    case IR::Opcode::GlobalAtomicSMax32x2:
    case IR::Opcode::GlobalAtomicUMax32x2:
    case IR::Opcode::GlobalAtomicAnd32x2:
    case IR::Opcode::GlobalAtomicOr32x2:
    case IR::Opcode::GlobalAtomicXor32x2:
    case IR::Opcode::GlobalAtomicExchange32x2:
    case IR::Opcode::GlobalAtomicAddF32:
    case IR::Opcode::GlobalAtomicAddF16x2:
    case IR::Opcode::GlobalAtomicAddF32x2:
    case IR::Opcode::GlobalAtomicMinF16x2:
    case IR::Opcode::GlobalAtomicMinF32x2:
    case IR::Opcode::GlobalAtomicMaxF16x2:
    case IR::Opcode::GlobalAtomicMaxF32x2:
        return true;
    default:
        return false;
    }
}
/// Returns true when the instruction modifies global memory, i.e. it is a global write or a
/// global atomic operation. Global loads return false.
bool IsGlobalMemoryWrite(const IR::Inst& inst) {
    switch (inst.GetOpcode()) {
    // Writes
    case IR::Opcode::WriteGlobalS8:
    case IR::Opcode::WriteGlobalU8:
    case IR::Opcode::WriteGlobalS16:
    case IR::Opcode::WriteGlobalU16:
    case IR::Opcode::WriteGlobal32:
    case IR::Opcode::WriteGlobal64:
    case IR::Opcode::WriteGlobal128:
    // Atomics
    case IR::Opcode::GlobalAtomicIAdd32:
    case IR::Opcode::GlobalAtomicSMin32:
    case IR::Opcode::GlobalAtomicUMin32:
    case IR::Opcode::GlobalAtomicSMax32:
    case IR::Opcode::GlobalAtomicUMax32:
    case IR::Opcode::GlobalAtomicInc32:
    case IR::Opcode::GlobalAtomicDec32:
    case IR::Opcode::GlobalAtomicAnd32:
    case IR::Opcode::GlobalAtomicOr32:
    case IR::Opcode::GlobalAtomicXor32:
    case IR::Opcode::GlobalAtomicExchange32:
    case IR::Opcode::GlobalAtomicIAdd64:
    case IR::Opcode::GlobalAtomicSMin64:
    case IR::Opcode::GlobalAtomicUMin64:
    case IR::Opcode::GlobalAtomicSMax64:
    case IR::Opcode::GlobalAtomicUMax64:
    case IR::Opcode::GlobalAtomicAnd64:
    case IR::Opcode::GlobalAtomicOr64:
    case IR::Opcode::GlobalAtomicXor64:
    case IR::Opcode::GlobalAtomicExchange64:
    case IR::Opcode::GlobalAtomicIAdd32x2:
    case IR::Opcode::GlobalAtomicSMin32x2:
    case IR::Opcode::GlobalAtomicUMin32x2:
    case IR::Opcode::GlobalAtomicSMax32x2:
    case IR::Opcode::GlobalAtomicUMax32x2:
    case IR::Opcode::GlobalAtomicAnd32x2:
    case IR::Opcode::GlobalAtomicOr32x2:
    case IR::Opcode::GlobalAtomicXor32x2:
    case IR::Opcode::GlobalAtomicExchange32x2:
    case IR::Opcode::GlobalAtomicAddF32:
    case IR::Opcode::GlobalAtomicAddF16x2:
    case IR::Opcode::GlobalAtomicAddF32x2:
    case IR::Opcode::GlobalAtomicMinF16x2:
    case IR::Opcode::GlobalAtomicMinF32x2:
    case IR::Opcode::GlobalAtomicMaxF16x2:
    case IR::Opcode::GlobalAtomicMaxF32x2:
        return true;
    default:
        return false;
    }
}
/// Converts a global memory opcode to its storage buffer equivalent
/// @param opcode Global load, write or atomic opcode
/// @return The matching LoadStorage*, WriteStorage* or StorageAtomic* opcode
/// @throws InvalidArgument when the opcode is not one of the global memory opcodes below
IR::Opcode GlobalToStorage(IR::Opcode opcode) {
    switch (opcode) {
    // Loads
    case IR::Opcode::LoadGlobalS8:
        return IR::Opcode::LoadStorageS8;
    case IR::Opcode::LoadGlobalU8:
        return IR::Opcode::LoadStorageU8;
    case IR::Opcode::LoadGlobalS16:
        return IR::Opcode::LoadStorageS16;
    case IR::Opcode::LoadGlobalU16:
        return IR::Opcode::LoadStorageU16;
    case IR::Opcode::LoadGlobal32:
        return IR::Opcode::LoadStorage32;
    case IR::Opcode::LoadGlobal64:
        return IR::Opcode::LoadStorage64;
    case IR::Opcode::LoadGlobal128:
        return IR::Opcode::LoadStorage128;
    // Writes
    case IR::Opcode::WriteGlobalS8:
        return IR::Opcode::WriteStorageS8;
    case IR::Opcode::WriteGlobalU8:
        return IR::Opcode::WriteStorageU8;
    case IR::Opcode::WriteGlobalS16:
        return IR::Opcode::WriteStorageS16;
    case IR::Opcode::WriteGlobalU16:
        return IR::Opcode::WriteStorageU16;
    case IR::Opcode::WriteGlobal32:
        return IR::Opcode::WriteStorage32;
    case IR::Opcode::WriteGlobal64:
        return IR::Opcode::WriteStorage64;
    case IR::Opcode::WriteGlobal128:
        return IR::Opcode::WriteStorage128;
    // 32-bit atomics
    case IR::Opcode::GlobalAtomicIAdd32:
        return IR::Opcode::StorageAtomicIAdd32;
    case IR::Opcode::GlobalAtomicSMin32:
        return IR::Opcode::StorageAtomicSMin32;
    case IR::Opcode::GlobalAtomicUMin32:
        return IR::Opcode::StorageAtomicUMin32;
    case IR::Opcode::GlobalAtomicSMax32:
        return IR::Opcode::StorageAtomicSMax32;
    case IR::Opcode::GlobalAtomicUMax32:
        return IR::Opcode::StorageAtomicUMax32;
    case IR::Opcode::GlobalAtomicInc32:
        return IR::Opcode::StorageAtomicInc32;
    case IR::Opcode::GlobalAtomicDec32:
        return IR::Opcode::StorageAtomicDec32;
    case IR::Opcode::GlobalAtomicAnd32:
        return IR::Opcode::StorageAtomicAnd32;
    case IR::Opcode::GlobalAtomicOr32:
        return IR::Opcode::StorageAtomicOr32;
    case IR::Opcode::GlobalAtomicXor32:
        return IR::Opcode::StorageAtomicXor32;
    case IR::Opcode::GlobalAtomicExchange32:
        return IR::Opcode::StorageAtomicExchange32;
    // 64-bit atomics
    case IR::Opcode::GlobalAtomicIAdd64:
        return IR::Opcode::StorageAtomicIAdd64;
    case IR::Opcode::GlobalAtomicSMin64:
        return IR::Opcode::StorageAtomicSMin64;
    case IR::Opcode::GlobalAtomicUMin64:
        return IR::Opcode::StorageAtomicUMin64;
    case IR::Opcode::GlobalAtomicSMax64:
        return IR::Opcode::StorageAtomicSMax64;
    case IR::Opcode::GlobalAtomicUMax64:
        return IR::Opcode::StorageAtomicUMax64;
    case IR::Opcode::GlobalAtomicAnd64:
        return IR::Opcode::StorageAtomicAnd64;
    case IR::Opcode::GlobalAtomicOr64:
        return IR::Opcode::StorageAtomicOr64;
    case IR::Opcode::GlobalAtomicXor64:
        return IR::Opcode::StorageAtomicXor64;
    case IR::Opcode::GlobalAtomicExchange64:
        return IR::Opcode::StorageAtomicExchange64;
    // 32x2 atomics
    case IR::Opcode::GlobalAtomicIAdd32x2:
        return IR::Opcode::StorageAtomicIAdd32x2;
    case IR::Opcode::GlobalAtomicSMin32x2:
        return IR::Opcode::StorageAtomicSMin32x2;
    case IR::Opcode::GlobalAtomicUMin32x2:
        return IR::Opcode::StorageAtomicUMin32x2;
    case IR::Opcode::GlobalAtomicSMax32x2:
        return IR::Opcode::StorageAtomicSMax32x2;
    case IR::Opcode::GlobalAtomicUMax32x2:
        return IR::Opcode::StorageAtomicUMax32x2;
    case IR::Opcode::GlobalAtomicAnd32x2:
        return IR::Opcode::StorageAtomicAnd32x2;
    case IR::Opcode::GlobalAtomicOr32x2:
        return IR::Opcode::StorageAtomicOr32x2;
    case IR::Opcode::GlobalAtomicXor32x2:
        return IR::Opcode::StorageAtomicXor32x2;
    case IR::Opcode::GlobalAtomicExchange32x2:
        return IR::Opcode::StorageAtomicExchange32x2;
    // Floating point atomics
    case IR::Opcode::GlobalAtomicAddF32:
        return IR::Opcode::StorageAtomicAddF32;
    case IR::Opcode::GlobalAtomicAddF16x2:
        return IR::Opcode::StorageAtomicAddF16x2;
    case IR::Opcode::GlobalAtomicMinF16x2:
        return IR::Opcode::StorageAtomicMinF16x2;
    case IR::Opcode::GlobalAtomicMaxF16x2:
        return IR::Opcode::StorageAtomicMaxF16x2;
    case IR::Opcode::GlobalAtomicAddF32x2:
        return IR::Opcode::StorageAtomicAddF32x2;
    case IR::Opcode::GlobalAtomicMinF32x2:
        return IR::Opcode::StorageAtomicMinF32x2;
    case IR::Opcode::GlobalAtomicMaxF32x2:
        return IR::Opcode::StorageAtomicMaxF32x2;
    default:
        throw InvalidArgument("Invalid global memory opcode {}", opcode);
    }
}
/// Returns true when a storage buffer address satisfies a bias: same constant buffer index and
/// an offset inside the half-open range [offset_begin, offset_end). The bias alignment is not
/// checked here.
bool MeetsBias(const StorageBufferAddr& storage_buffer, const Bias& bias) noexcept {
    return storage_buffer.index == bias.index && storage_buffer.offset >= bias.offset_begin &&
           storage_buffer.offset < bias.offset_end;
}
/// Result of tracking the low 32 bits of a global memory address
struct LowAddrInfo {
    IR::U32 value;  ///< Low 32-bit half of the address (first element of the U32x2 composite)
    s32 imm_offset; ///< Immediate added on top by an IAdd64, or 0 when there is none
};
/// Tries to track the first 32-bits of a global memory instruction
/// @param inst Global memory instruction whose first argument is the address
/// @return The low address value and the immediate offset applied on top of it, or std::nullopt
///         when the address does not follow the expected
///         [IAdd64 ->] [PackUint2x32 ->] CompositeConstructU32x2 shape
std::optional<LowAddrInfo> TrackLowAddress(IR::Inst* inst) {
    // The first argument is the low level GPU pointer to the global memory instruction
    const IR::Value addr{inst->Arg(0)};
    if (addr.IsImmediate()) {
        // Not much we can do if it's an immediate
        return std::nullopt;
    }
    // This address is expected to either be a PackUint2x32, a IAdd64, or a CompositeConstructU32x2
    IR::Inst* addr_inst{addr.InstRecursive()};
    s32 imm_offset{0};
    if (addr_inst->GetOpcode() == IR::Opcode::IAdd64) {
        // If it's an IAdd64, get the immediate offset it is applying and grab the address
        // instruction. This expects for the instruction to be canonicalized having the address on
        // the first argument and the immediate offset on the second one.
        const IR::U64 imm_offset_value{addr_inst->Arg(1)};
        if (!imm_offset_value.IsImmediate()) {
            return std::nullopt;
        }
        // Only the low 32 bits of the 64-bit immediate are kept
        imm_offset = static_cast<s32>(static_cast<s64>(imm_offset_value.U64()));
        const IR::U64 iadd_addr{addr_inst->Arg(0)};
        if (iadd_addr.IsImmediate()) {
            return std::nullopt;
        }
        addr_inst = iadd_addr.InstRecursive();
    }
    // With IAdd64 handled, now PackUint2x32 is expected
    if (addr_inst->GetOpcode() == IR::Opcode::PackUint2x32) {
        // PackUint2x32 is expected to be generated from a vector
        const IR::Value vector{addr_inst->Arg(0)};
        if (vector.IsImmediate()) {
            return std::nullopt;
        }
        addr_inst = vector.InstRecursive();
    }
    // The vector is expected to be a CompositeConstructU32x2
    if (addr_inst->GetOpcode() != IR::Opcode::CompositeConstructU32x2) {
        return std::nullopt;
    }
    // Grab the first argument from the CompositeConstructU32x2, this is the low address.
    return LowAddrInfo{
        .value{IR::U32{addr_inst->Arg(0)}},
        .imm_offset = imm_offset,
    };
}
/// Tries to track the storage buffer address used by a global memory instruction
/// @param value Value to start the breadth-first search from
/// @param bias  Optional constant buffer range to restrict the search to; when null any
///              constant buffer read with an offset aligned to 8 is accepted
/// @return The constant buffer index/offset of the first matching GetCbufU32/GetCbufU32x2, or
///         whatever BreadthFirstSearch yields when no instruction satisfies the predicate
std::optional<StorageBufferAddr> Track(const IR::Value& value, const Bias* bias) {
    const auto pred{[bias](const IR::Inst* inst) -> std::optional<StorageBufferAddr> {
        if (inst->GetOpcode() != IR::Opcode::GetCbufU32 &&
            inst->GetOpcode() != IR::Opcode::GetCbufU32x2) {
            return std::nullopt;
        }
        const IR::Value index{inst->Arg(0)};
        const IR::Value offset{inst->Arg(1)};
        if (!index.IsImmediate()) {
            // Definitely not a storage buffer if it's read from a
            // non-immediate index
            return std::nullopt;
        }
        if (!offset.IsImmediate()) {
            // TODO: Support SSBO arrays
            return std::nullopt;
        }
        const StorageBufferAddr storage_buffer{
            .index = index.U32(),
            .offset = offset.U32(),
        };
        // Without a bias fall back to an alignment of 8
        const u32 alignment{bias ? bias->alignment : 8U};
        if (!Common::IsAligned(storage_buffer.offset, alignment)) {
            // The SSBO pointer has to be aligned
            return std::nullopt;
        }
        if (bias && !MeetsBias(storage_buffer, *bias)) {
            // We have to blacklist some addresses in case we wrongly
            // point to them
            return std::nullopt;
        }
        return storage_buffer;
    }};
    return BreadthFirstSearch(value, pred);
}
/// Collects the storage buffer used by a global memory instruction and the instruction itself
/// @param block Block that is recorded together with the instruction
/// @param inst  Global memory instruction to analyze
/// @param info  Receives the storage buffer, the instruction to replace and, for writes and
///              atomics, the written storage buffer. Left untouched when tracking fails.
void CollectStorageBuffers(IR::Block& block, IR::Inst& inst, StorageInfo& info) {
    // NVN puts storage buffers in a specific range, we have to bias towards these addresses to
    // avoid getting false positives
    static constexpr Bias nvn_bias{
        .index = 0,
        .offset_begin = 0x100, // Inclusive lower bound of the accepted offsets
        .offset_end = 0x700,   // Exclusive upper bound of the accepted offsets
        .alignment = 16,       // Candidate offsets must be aligned to this value
    };
    // Track the low address of the instruction
    const std::optional<LowAddrInfo> low_addr_info{TrackLowAddress(&inst)};
    if (!low_addr_info) {
        // Failed to track the low address, nothing is collected for this instruction
        return;
    }
    // First try to find storage buffers in the NVN address
    const IR::U32 low_addr{low_addr_info->value};
    std::optional<StorageBufferAddr> storage_buffer{Track(low_addr, &nvn_bias)};
    if (!storage_buffer) {
        // If it fails, track without a bias
        storage_buffer = Track(low_addr, nullptr);
        if (!storage_buffer) {
            // If that also fails, nothing is collected for this instruction
            LOG_WARNING(Shader, "Storage buffer failed to track, using global memory fallbacks");
            return;
        }
        LOG_WARNING(Shader, "Storage buffer tracked without bias, index {} offset {}",
                    storage_buffer->index, storage_buffer->offset);
    }
    // Collect storage buffer and the instruction
    if (IsGlobalMemoryWrite(inst)) {
        info.writes.insert(*storage_buffer);
    }
    info.set.insert(*storage_buffer);
    info.to_replace.push_back(StorageInst{
        .storage_buffer{*storage_buffer},
        .inst = &inst,
        .block = &block,
    });
}
/// Emits IR computing the offset of a global memory access relative to its storage buffer base
/// @param block     Block containing the instruction; new IR is emitted before `inst`
/// @param inst      Global memory instruction whose address is being rebased
/// @param buffer    Constant buffer location holding the storage buffer base address
/// @param alignment Host alignment the base address is rounded down to; the mask below assumes
///                  it is a power of two
/// @return The accessed address minus the aligned base address
/// NOTE(review): this header used to say "offset in indices (not bytes)" while the comment in
/// the body says bytes; the code is a plain address subtraction - confirm the intended unit.
IR::U32 StorageOffset(IR::Block& block, IR::Inst& inst, StorageBufferAddr buffer, u32 alignment) {
    IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)};
    IR::U32 offset;
    if (const std::optional<LowAddrInfo> low_addr{TrackLowAddress(&inst)}) {
        offset = low_addr->value;
        if (low_addr->imm_offset != 0) {
            offset = ir.IAdd(offset, ir.Imm32(low_addr->imm_offset));
        }
    } else {
        // Could not track the low half, truncate the full 64-bit address instead
        offset = ir.UConvert(32, IR::U64{inst.Arg(0)});
    }
    // Subtract the least significant 32 bits from the guest offset. The result is the storage
    // buffer offset in bytes.
    IR::U32 low_cbuf{ir.GetCbuf(ir.Imm32(buffer.index), ir.Imm32(buffer.offset))};

    // Align the offset base to match the host alignment requirements
    low_cbuf = ir.BitwiseAnd(low_cbuf, ir.Imm32(~(alignment - 1U)));
    return ir.ISub(offset, low_cbuf);
}
/// Replace a global memory load instruction with its storage buffer equivalent
/// @param block         Block containing the instruction
/// @param inst          Global load; its uses are redirected to the new storage load
/// @param storage_index Storage buffer index passed as first argument of the new instruction
/// @param offset        Offset passed as second argument of the new instruction
void ReplaceLoad(IR::Block& block, IR::Inst& inst, const IR::U32& storage_index,
                 const IR::U32& offset) {
    const IR::Opcode new_opcode{GlobalToStorage(inst.GetOpcode())};
    const auto it{IR::Block::InstructionList::s_iterator_to(inst)};
    // Insert the storage load right before the global load and forward its result
    const IR::Value value{&*block.PrependNewInst(it, new_opcode, {storage_index, offset})};
    inst.ReplaceUsesWith(value);
}
/// Replace a global memory write instruction with its storage buffer equivalent
/// @param block         Block containing the instruction
/// @param inst          Global write; invalidated once the storage write has been inserted
/// @param storage_index Storage buffer index passed as first argument of the new instruction
/// @param offset        Offset passed as second argument of the new instruction
void ReplaceWrite(IR::Block& block, IR::Inst& inst, const IR::U32& storage_index,
                  const IR::U32& offset) {
    const IR::Opcode new_opcode{GlobalToStorage(inst.GetOpcode())};
    const auto it{IR::Block::InstructionList::s_iterator_to(inst)};
    // The second argument of the global write is forwarded as the third storage argument
    block.PrependNewInst(it, new_opcode, {storage_index, offset, inst.Arg(1)});
    inst.Invalidate();
}
/// Replace an atomic operation on global memory instruction with its storage buffer equivalent
/// @param block         Block containing the instruction
/// @param inst          Global atomic; its uses are redirected to the new storage atomic
/// @param storage_index Storage buffer index passed as first argument of the new instruction
/// @param offset        Offset passed as second argument of the new instruction
void ReplaceAtomic(IR::Block& block, IR::Inst& inst, const IR::U32& storage_index,
                   const IR::U32& offset) {
    const IR::Opcode new_opcode{GlobalToStorage(inst.GetOpcode())};
    const auto it{IR::Block::InstructionList::s_iterator_to(inst)};
    // The second argument of the global atomic is forwarded as the third storage argument
    const IR::Value value{
        &*block.PrependNewInst(it, new_opcode, {storage_index, offset, inst.Arg(1)})};
    inst.ReplaceUsesWith(value);
}
/// Replace a global memory instruction with its storage buffer equivalent
///
/// Dispatches on the opcode of `inst` to ReplaceLoad, ReplaceWrite or ReplaceAtomic, forwarding all
/// arguments unchanged.
///
/// block:         block that owns `inst`
/// inst:          global memory instruction to replace
/// storage_index: forwarded to the selected helper
/// offset:        forwarded to the selected helper
///
/// Throws InvalidArgument when the opcode is not one of the global memory opcodes listed below.
void Replace(IR::Block& block, IR::Inst& inst, const IR::U32& storage_index,
             const IR::U32& offset) {
    switch (inst.GetOpcode()) {
    // Loads
    case IR::Opcode::LoadGlobalS8:
    case IR::Opcode::LoadGlobalU8:
    case IR::Opcode::LoadGlobalS16:
    case IR::Opcode::LoadGlobalU16:
    case IR::Opcode::LoadGlobal32:
    case IR::Opcode::LoadGlobal64:
    case IR::Opcode::LoadGlobal128:
        return ReplaceLoad(block, inst, storage_index, offset);
    // Writes
    case IR::Opcode::WriteGlobalS8:
    case IR::Opcode::WriteGlobalU8:
    case IR::Opcode::WriteGlobalS16:
    case IR::Opcode::WriteGlobalU16:
    case IR::Opcode::WriteGlobal32:
    case IR::Opcode::WriteGlobal64:
    case IR::Opcode::WriteGlobal128:
        return ReplaceWrite(block, inst, storage_index, offset);
    // Atomics
    case IR::Opcode::GlobalAtomicIAdd32:
    case IR::Opcode::GlobalAtomicSMin32:
    case IR::Opcode::GlobalAtomicUMin32:
    case IR::Opcode::GlobalAtomicSMax32:
    case IR::Opcode::GlobalAtomicUMax32:
    case IR::Opcode::GlobalAtomicInc32:
    case IR::Opcode::GlobalAtomicDec32:
    case IR::Opcode::GlobalAtomicAnd32:
    case IR::Opcode::GlobalAtomicOr32:
    case IR::Opcode::GlobalAtomicXor32:
    case IR::Opcode::GlobalAtomicExchange32:
    case IR::Opcode::GlobalAtomicIAdd64:
    case IR::Opcode::GlobalAtomicSMin64:
    case IR::Opcode::GlobalAtomicUMin64:
    case IR::Opcode::GlobalAtomicSMax64:
    case IR::Opcode::GlobalAtomicUMax64:
    case IR::Opcode::GlobalAtomicAnd64:
    case IR::Opcode::GlobalAtomicOr64:
    case IR::Opcode::GlobalAtomicXor64:
    case IR::Opcode::GlobalAtomicExchange64:
    case IR::Opcode::GlobalAtomicIAdd32x2:
    case IR::Opcode::GlobalAtomicSMin32x2:
    case IR::Opcode::GlobalAtomicUMin32x2:
    case IR::Opcode::GlobalAtomicSMax32x2:
    case IR::Opcode::GlobalAtomicUMax32x2:
    case IR::Opcode::GlobalAtomicAnd32x2:
    case IR::Opcode::GlobalAtomicOr32x2:
    case IR::Opcode::GlobalAtomicXor32x2:
    case IR::Opcode::GlobalAtomicExchange32x2:
    case IR::Opcode::GlobalAtomicAddF32:
    case IR::Opcode::GlobalAtomicAddF16x2:
    case IR::Opcode::GlobalAtomicAddF32x2:
    case IR::Opcode::GlobalAtomicMinF16x2:
    case IR::Opcode::GlobalAtomicMinF32x2:
    case IR::Opcode::GlobalAtomicMaxF16x2:
    case IR::Opcode::GlobalAtomicMaxF32x2:
        return ReplaceAtomic(block, inst, storage_index, offset);
    default:
        // Any other opcode reaching this point is treated as a caller error.
        throw InvalidArgument("Invalid global memory opcode {}", inst.GetOpcode());
    }
}
} // Anonymous namespace
/// Pass entry point. The visible parts register storage buffer descriptors on `program.info` and
/// rewrite the collected global memory instructions through Replace.
// NOTE(review): the two lines starting with "@ -" inside this body are diff hunk headers; the code
// they stand in for (including the declarations of `info` and `index`) is not shown here.
void GlobalMemoryToStorageBufferPass(IR::Program& program, const HostTranslateInfo& host_info) {
@ -532,11 +532,11 @@ void GlobalMemoryToStorageBufferPass(IR::Program& program, const HostTranslateIn
}
// Record one descriptor per address in info.set: count is always 1 and is_written reflects
// whether the same address is also present in info.writes.
for (const StorageBufferAddr& storage_buffer : info.set) {
program.info.storage_buffers_descriptors.push_back({
.cbuf_index = storage_buffer.index,
.cbuf_offset = storage_buffer.offset,
.count = 1,
.is_written = info.writes.contains(storage_buffer),
});
// NOTE(review): the next five lines repeat the initializer above verbatim - looks like the
// removed/added sides of a whitespace-only diff; confirm and keep a single copy.
.cbuf_index = storage_buffer.index,
.cbuf_offset = storage_buffer.offset,
.count = 1,
.is_written = info.writes.contains(storage_buffer),
});
}
// Rewrite every instruction queued in info.to_replace.
for (const StorageInst& storage_inst : info.to_replace) {
const StorageBufferAddr storage_buffer{storage_inst.storage_buffer};
@ -545,7 +545,7 @@ void GlobalMemoryToStorageBufferPass(IR::Program& program, const HostTranslateIn
IR::Block* const block{storage_inst.block};
IR::Inst* const inst{storage_inst.inst};
// The offset is computed by StorageOffset, which also receives the host's minimum SSBO alignment.
const IR::U32 offset{
StorageOffset(*block, *inst, storage_buffer, host_info.min_ssbo_alignment)};
// NOTE(review): duplicate of the line above - same diff artefact as in the descriptor loop.
StorageOffset(*block, *inst, storage_buffer, host_info.min_ssbo_alignment)};
// `index` comes from the elided part of this loop body.
Replace(*block, *inst, index, offset);
}
}