diff --git a/lib/Target/AMDGPU/AMDGPU.td b/lib/Target/AMDGPU/AMDGPU.td index b842ba17675..5321fe14873 100644 --- a/lib/Target/AMDGPU/AMDGPU.td +++ b/lib/Target/AMDGPU/AMDGPU.td @@ -175,6 +175,18 @@ def FeatureFPExceptions : SubtargetFeature<"fp-exceptions", "Enable floating point exceptions" >; +class FeatureMaxPrivateElementSize<int size> : SubtargetFeature< + "max-private-element-size-"#size, + "MaxPrivateElementSize", + !cast<string>(size), + "Maximum private access size may be "#size +>; + +def FeatureMaxPrivateElementSize4 : FeatureMaxPrivateElementSize<4>; +def FeatureMaxPrivateElementSize8 : FeatureMaxPrivateElementSize<8>; +def FeatureMaxPrivateElementSize16 : FeatureMaxPrivateElementSize<16>; + + def FeatureEnableHugeScratchBuffer : SubtargetFeature< "huge-scratch-buffer", "EnableHugeScratchBuffer", diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index 913f3f53ff1..2af7a2ccbfe 100644 --- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -593,6 +593,20 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF, } } +// This is supposed to be log2(Size) +static amd_element_byte_size_t getElementByteSizeValue(unsigned Size) { + switch (Size) { + case 4: + return AMD_ELEMENT_4_BYTES; + case 8: + return AMD_ELEMENT_8_BYTES; + case 16: + return AMD_ELEMENT_16_BYTES; + default: + llvm_unreachable("invalid private_element_size"); + } +} + void AMDGPUAsmPrinter::EmitAmdKernelCodeT(const MachineFunction &MF, const SIProgramInfo &KernelInfo) const { const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); @@ -606,6 +620,11 @@ void AMDGPUAsmPrinter::EmitAmdKernelCodeT(const MachineFunction &MF, (KernelInfo.ComputePGMRSrc2 << 32); header.code_properties = AMD_CODE_PROPERTY_IS_PTR64; + + AMD_HSA_BITS_SET(header.code_properties, + AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE, + getElementByteSizeValue(STM.getMaxPrivateElementSize())); + if (MFI->hasPrivateSegmentBuffer()) { header.code_properties |= 
AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER; diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index 39b7030aa84..8e58aae9b79 100644 --- a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -58,6 +58,11 @@ AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT, FP32Denormals = false; FP64Denormals = false; } + + // Set defaults if needed. + if (MaxPrivateElementSize == 0) + MaxPrivateElementSize = 16; + return *this; } @@ -74,7 +79,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS, EnableUnsafeDSOffsetFolding(false), EnableXNACK(false), WavefrontSize(0), CFALUBug(false), - LocalMemorySize(0), + LocalMemorySize(0), MaxPrivateElementSize(0), EnableVGPRSpilling(false), SGPRInitBug(false), IsGCN(false), GCN1Encoding(false), GCN3Encoding(false), CIInsts(false), LDSBankCount(0), IsaVersion(ISAVersion0_0_0), EnableHugeScratchBuffer(false), diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h index 109ca9f9ce1..c943b2cf6f4 100644 --- a/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -81,6 +81,7 @@ private: unsigned WavefrontSize; bool CFALUBug; int LocalMemorySize; + unsigned MaxPrivateElementSize; bool EnableVGPRSpilling; bool SGPRInitBug; bool IsGCN; @@ -253,6 +254,10 @@ public: return LocalMemorySize; } + unsigned getMaxPrivateElementSize() const { + return MaxPrivateElementSize; + } + bool hasSGPRInitBug() const { return SGPRInitBug; } diff --git a/lib/Target/AMDGPU/AMDKernelCodeT.h b/lib/Target/AMDGPU/AMDKernelCodeT.h index a9ba60c8cba..425261c15f9 100644 --- a/lib/Target/AMDGPU/AMDKernelCodeT.h +++ b/lib/Target/AMDGPU/AMDKernelCodeT.h @@ -44,6 +44,15 @@ enum amd_code_version_t { AMD_CODE_VERSION_MINOR = 1 }; +// Sets val bits for specified mask in specified dst packed instance. 
+#define AMD_HSA_BITS_SET(dst, mask, val) \ + dst &= (~(1 << mask ## _SHIFT) & ~mask); \ + dst |= (((val) << mask ## _SHIFT) & mask) + +// Gets bits for specified mask from specified src packed instance. +#define AMD_HSA_BITS_GET(src, mask) \ + ((src & mask) >> mask ## _SHIFT) \ + /// The values used to define the number of bytes to use for the /// swizzle element size. enum amd_element_byte_size_t { diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp index 952b2d6fd72..7228d40e611 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -3059,6 +3059,10 @@ uint64_t SIInstrInfo::getScratchRsrcWords23() const { AMDGPU::RSRC_TID_ENABLE | 0xffffffff; // Size; + uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize()) - 1; + + Rsrc23 |= (EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT); + // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17]. // Clear them unless we want a huge stride. if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) diff --git a/lib/Target/AMDGPU/SIInstrInfo.h b/lib/Target/AMDGPU/SIInstrInfo.h index 66e80c3f272..5fab4abf2e7 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.h +++ b/lib/Target/AMDGPU/SIInstrInfo.h @@ -489,7 +489,7 @@ namespace AMDGPU { const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL; const uint64_t RSRC_TID_ENABLE = 1LL << 55; - + const uint64_t RSRC_ELEMENT_SIZE_SHIFT = 51; } // End namespace AMDGPU namespace SI { diff --git a/test/CodeGen/AMDGPU/large-alloca-compute.ll b/test/CodeGen/AMDGPU/large-alloca-compute.ll index 84380b42105..da40c8593e9 100644 --- a/test/CodeGen/AMDGPU/large-alloca-compute.ll +++ b/test/CodeGen/AMDGPU/large-alloca-compute.ll @@ -10,8 +10,8 @@ ; GCN: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 ; GCN: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GCN: s_mov_b32 s10, -1 -; CI: s_mov_b32 s11, 0x80f000 -; VI: s_mov_b32 s11, 0x800000 +; CI: s_mov_b32 s11, 0x98f000 +; VI: s_mov_b32 s11, 0x980000 ; GCNHSA: .amd_kernel_code_t diff --git 
a/test/CodeGen/AMDGPU/large-alloca-graphics.ll b/test/CodeGen/AMDGPU/large-alloca-graphics.ll index b6f8093313c..7e48b16e523 100644 --- a/test/CodeGen/AMDGPU/large-alloca-graphics.ll +++ b/test/CodeGen/AMDGPU/large-alloca-graphics.ll @@ -5,8 +5,8 @@ ; GCN: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 ; GCN: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GCN: s_mov_b32 s10, -1 -; CI: s_mov_b32 s11, 0x80f000 -; VI: s_mov_b32 s11, 0x800000 +; CI: s_mov_b32 s11, 0x98f000 +; VI: s_mov_b32 s11, 0x980000 ; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s1 offen ; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s1 offen @@ -26,8 +26,8 @@ define void @large_alloca_pixel_shader(i32 %x, i32 %y) #1 { ; GCN: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 ; GCN: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GCN: s_mov_b32 s10, -1 -; CI: s_mov_b32 s11, 0x80f000 -; VI: s_mov_b32 s11, 0x800000 +; CI: s_mov_b32 s11, 0x98f000 +; VI: s_mov_b32 s11, 0x980000 ; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s1 offen ; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s1 offen diff --git a/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll index 390daf8caf5..69147071348 100644 --- a/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll +++ b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll @@ -17,8 +17,8 @@ ; GCN: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GCN-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN-NEXT: s_mov_b32 s14, -1 -; SI-NEXT: s_mov_b32 s15, 0x80f000 -; VI-NEXT: s_mov_b32 s15, 0x800000 +; SI-NEXT: s_mov_b32 s15, 0x98f000 +; VI-NEXT: s_mov_b32 s15, 0x980000 ; GCN: buffer_store_dword {{v[0-9]+}}, s[12:15], s16 offset:{{[0-9]+}} ; 4-byte Folded Spill diff --git a/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll index 16abb89bb0b..eeecf6d2398 100644 --- a/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll +++ 
b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll @@ -14,8 +14,8 @@ ; GCN: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GCN-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN-NEXT: s_mov_b32 s14, -1 -; SI-NEXT: s_mov_b32 s15, 0x80f000 -; VI-NEXT: s_mov_b32 s15, 0x800000 +; SI-NEXT: s_mov_b32 s15, 0x98f000 +; VI-NEXT: s_mov_b32 s15, 0x980000 ; s12 is offset user SGPR ; GCN: buffer_store_dword {{v[0-9]+}}, s[12:15], s11 offset:{{[0-9]+}} ; 4-byte Folded Spill