mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2025-04-12 12:33:17 +00:00
AMDGPU: Select DS insts without m0 initialization
GFX9 stopped using m0 for most DS instructions. Select a different instruction without the use. I think this will be less error prone than trying to manually maintain m0 uses as needed. llvm-svn: 319270
This commit is contained in:
parent
f2169e2887
commit
6fa14c4ed9
@ -721,6 +721,10 @@ def HasFlatScratchInsts : Predicate<"Subtarget->hasFlatScratchInsts()">,
|
||||
def HasD16LoadStore : Predicate<"Subtarget->hasD16LoadStore()">,
|
||||
AssemblerPredicate<"FeatureGFX9Insts">;
|
||||
|
||||
|
||||
def LDSRequiresM0Init : Predicate<"Subtarget->ldsRequiresM0Init()">;
|
||||
def NotLDSRequiresM0Init : Predicate<"!Subtarget->ldsRequiresM0Init()">;
|
||||
|
||||
def HasDSAddTid : Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::GFX9">,
|
||||
AssemblerPredicate<"FeatureGFX9Insts">;
|
||||
|
||||
|
@ -337,7 +337,8 @@ const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
|
||||
}
|
||||
|
||||
SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N) const {
|
||||
if (cast<MemSDNode>(N)->getAddressSpace() != AMDGPUASI.LOCAL_ADDRESS)
|
||||
if (cast<MemSDNode>(N)->getAddressSpace() != AMDGPUASI.LOCAL_ADDRESS ||
|
||||
!Subtarget->ldsRequiresM0Init())
|
||||
return N;
|
||||
|
||||
const SITargetLowering& Lowering =
|
||||
|
@ -462,6 +462,12 @@ public:
|
||||
return getGeneration() >= GFX9;
|
||||
}
|
||||
|
||||
/// Return true if most LDS instructions have an m0 use that requires m0 to be
|
||||
/// initialized.
|
||||
bool ldsRequiresM0Init() const {
|
||||
return getGeneration() < GFX9;
|
||||
}
|
||||
|
||||
bool hasAddNoCarry() const {
|
||||
return AddNoCarryInsts;
|
||||
}
|
||||
|
@ -600,6 +600,20 @@ class DSReadPat <DS_Pseudo inst, ValueType vt, PatFrag frag> : GCNPat <
|
||||
(inst $ptr, (as_i16imm $offset), (i1 0))
|
||||
>;
|
||||
|
||||
// FIXME: Passing the name of the PatFrag as a string is a workaround. Why doesn't
|
||||
// !cast<PatFrag>(frag.NAME#"_m0") work!?
|
||||
multiclass DSReadPat_mc<DS_Pseudo inst, ValueType vt, string frag> {
|
||||
|
||||
let OtherPredicates = [LDSRequiresM0Init] in {
|
||||
def : DSReadPat<inst, vt, !cast<PatFrag>(frag#"_m0")>;
|
||||
}
|
||||
|
||||
let OtherPredicates = [NotLDSRequiresM0Init] in {
|
||||
def : DSReadPat<!cast<DS_Pseudo>(inst.NAME#"_gfx9"), vt, !cast<PatFrag>(frag)>;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
multiclass DSReadPat_Hi16 <DS_Pseudo inst, PatFrag frag, ValueType vt = i16> {
|
||||
def : GCNPat <
|
||||
(build_vector vt:$lo, (vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset)))),
|
||||
@ -624,30 +638,22 @@ multiclass DSReadPat_Lo16 <DS_Pseudo inst, PatFrag frag, ValueType vt = i16> {
|
||||
>;
|
||||
}
|
||||
|
||||
|
||||
def : DSReadPat <DS_READ_I8, i32, sextloadi8_local_m0>;
|
||||
def : DSReadPat <DS_READ_U8, i32, az_extloadi8_local_m0>;
|
||||
def : DSReadPat <DS_READ_I8, i16, sextloadi8_local_m0>;
|
||||
def : DSReadPat <DS_READ_U8, i16, az_extloadi8_local_m0>;
|
||||
def : DSReadPat <DS_READ_I16, i32, sextloadi16_local_m0>;
|
||||
def : DSReadPat <DS_READ_I16, i32, sextloadi16_local_m0>;
|
||||
def : DSReadPat <DS_READ_U16, i32, az_extloadi16_local_m0>;
|
||||
def : DSReadPat <DS_READ_U16, i16, load_local_m0>;
|
||||
def : DSReadPat <DS_READ_B32, i32, load_local_m0>;
|
||||
defm : DSReadPat_mc <DS_READ_I8, i32, "sextloadi8_local">;
|
||||
defm : DSReadPat_mc <DS_READ_U8, i32, "az_extloadi8_local">;
|
||||
defm : DSReadPat_mc <DS_READ_I8, i16, "sextloadi8_local">;
|
||||
defm : DSReadPat_mc <DS_READ_U8, i16, "az_extloadi8_local">;
|
||||
defm : DSReadPat_mc <DS_READ_I16, i32, "sextloadi16_local">;
|
||||
defm : DSReadPat_mc <DS_READ_I16, i32, "sextloadi16_local">;
|
||||
defm : DSReadPat_mc <DS_READ_U16, i32, "az_extloadi16_local">;
|
||||
defm : DSReadPat_mc <DS_READ_U16, i16, "load_local">;
|
||||
defm : DSReadPat_mc <DS_READ_B32, i32, "load_local">;
|
||||
|
||||
let AddedComplexity = 100 in {
|
||||
|
||||
def : DSReadPat <DS_READ_B64, v2i32, load_align8_local_m0>;
|
||||
defm : DSReadPat_mc <DS_READ_B64, v2i32, "load_align8_local">;
|
||||
|
||||
} // End AddedComplexity = 100
|
||||
|
||||
def : GCNPat <
|
||||
(v2i32 (load_local_m0 (DS64Bit4ByteAligned i32:$ptr, i8:$offset0,
|
||||
i8:$offset1))),
|
||||
(DS_READ2_B32 $ptr, $offset0, $offset1, (i1 0))
|
||||
>;
|
||||
|
||||
|
||||
let OtherPredicates = [HasD16LoadStore] in {
|
||||
let AddedComplexity = 100 in {
|
||||
defm : DSReadPat_Hi16<DS_READ_U16_D16_HI, load_local>;
|
||||
@ -666,71 +672,119 @@ class DSWritePat <DS_Pseudo inst, ValueType vt, PatFrag frag> : GCNPat <
|
||||
(inst $ptr, $value, (as_i16imm $offset), (i1 0))
|
||||
>;
|
||||
|
||||
def : DSWritePat <DS_WRITE_B8, i32, truncstorei8_local_m0>;
|
||||
def : DSWritePat <DS_WRITE_B16, i32, truncstorei16_local_m0>;
|
||||
def : DSWritePat <DS_WRITE_B8, i16, truncstorei8_local_m0>;
|
||||
def : DSWritePat <DS_WRITE_B16, i16, store_local_m0>;
|
||||
def : DSWritePat <DS_WRITE_B32, i32, store_local_m0>;
|
||||
multiclass DSWritePat_mc <DS_Pseudo inst, ValueType vt, string frag> {
|
||||
let OtherPredicates = [LDSRequiresM0Init] in {
|
||||
def : DSWritePat<inst, vt, !cast<PatFrag>(frag#"_m0")>;
|
||||
}
|
||||
|
||||
let OtherPredicates = [NotLDSRequiresM0Init] in {
|
||||
def : DSWritePat<!cast<DS_Pseudo>(inst.NAME#"_gfx9"), vt, !cast<PatFrag>(frag)>;
|
||||
}
|
||||
}
|
||||
|
||||
defm : DSWritePat_mc <DS_WRITE_B8, i32, "truncstorei8_local">;
|
||||
defm : DSWritePat_mc <DS_WRITE_B16, i32, "truncstorei16_local">;
|
||||
defm : DSWritePat_mc <DS_WRITE_B8, i16, "truncstorei8_local">;
|
||||
defm : DSWritePat_mc <DS_WRITE_B16, i16, "store_local">;
|
||||
defm : DSWritePat_mc <DS_WRITE_B32, i32, "store_local">;
|
||||
|
||||
let OtherPredicates = [HasD16LoadStore] in {
|
||||
def : DSWritePat <DS_WRITE_B16_D16_HI, i32, store_local_hi16>;
|
||||
def : DSWritePat <DS_WRITE_B8_D16_HI, i32, truncstorei8_local_hi16>;
|
||||
}
|
||||
|
||||
let AddedComplexity = 100 in {
|
||||
|
||||
def : DSWritePat <DS_WRITE_B64, v2i32, store_align8_local_m0>;
|
||||
} // End AddedComplexity = 100
|
||||
|
||||
def : GCNPat <
|
||||
(store_local_m0 v2i32:$value, (DS64Bit4ByteAligned i32:$ptr, i8:$offset0,
|
||||
i8:$offset1)),
|
||||
(DS_WRITE2_B32 $ptr, (i32 (EXTRACT_SUBREG $value, sub0)),
|
||||
(i32 (EXTRACT_SUBREG $value, sub1)), $offset0, $offset1,
|
||||
(i1 0))
|
||||
class DS64Bit4ByteAlignedReadPat<DS_Pseudo inst, PatFrag frag> : GCNPat <
|
||||
(v2i32 (frag (DS64Bit4ByteAligned i32:$ptr, i8:$offset0, i8:$offset1))),
|
||||
(inst $ptr, $offset0, $offset1, (i1 0))
|
||||
>;
|
||||
|
||||
class DS64Bit4ByteAlignedWritePat<DS_Pseudo inst, PatFrag frag> : GCNPat<
|
||||
(frag v2i32:$value, (DS64Bit4ByteAligned i32:$ptr, i8:$offset0, i8:$offset1)),
|
||||
(inst $ptr, (i32 (EXTRACT_SUBREG $value, sub0)),
|
||||
(i32 (EXTRACT_SUBREG $value, sub1)), $offset0, $offset1,
|
||||
(i1 0))
|
||||
>;
|
||||
|
||||
let OtherPredicates = [LDSRequiresM0Init] in {
|
||||
def : DS64Bit4ByteAlignedReadPat<DS_READ2_B32, load_local_m0>;
|
||||
def : DS64Bit4ByteAlignedWritePat<DS_WRITE2_B32, store_local_m0>;
|
||||
}
|
||||
|
||||
let OtherPredicates = [NotLDSRequiresM0Init] in {
|
||||
def : DS64Bit4ByteAlignedReadPat<DS_READ2_B32_gfx9, load_local>;
|
||||
def : DS64Bit4ByteAlignedWritePat<DS_WRITE2_B32_gfx9, store_local>;
|
||||
}
|
||||
|
||||
|
||||
let AddedComplexity = 100 in {
|
||||
|
||||
defm : DSWritePat_mc <DS_WRITE_B64, v2i32, "store_align8_local">;
|
||||
} // End AddedComplexity = 100
|
||||
class DSAtomicRetPat<DS_Pseudo inst, ValueType vt, PatFrag frag> : GCNPat <
|
||||
(frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$value),
|
||||
(inst $ptr, $value, (as_i16imm $offset), (i1 0))
|
||||
>;
|
||||
|
||||
multiclass DSAtomicRetPat_mc<DS_Pseudo inst, ValueType vt, string frag> {
|
||||
let OtherPredicates = [LDSRequiresM0Init] in {
|
||||
def : DSAtomicRetPat<inst, vt, !cast<PatFrag>(frag#"_m0")>;
|
||||
}
|
||||
|
||||
let OtherPredicates = [NotLDSRequiresM0Init] in {
|
||||
def : DSAtomicRetPat<!cast<DS_Pseudo>(inst.NAME#"_gfx9"), vt, !cast<PatFrag>(frag)>;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
class DSAtomicCmpXChg<DS_Pseudo inst, ValueType vt, PatFrag frag> : GCNPat <
|
||||
(frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$cmp, vt:$swap),
|
||||
(inst $ptr, $cmp, $swap, (as_i16imm $offset), (i1 0))
|
||||
>;
|
||||
|
||||
multiclass DSAtomicCmpXChg_mc<DS_Pseudo inst, ValueType vt, string frag> {
|
||||
let OtherPredicates = [LDSRequiresM0Init] in {
|
||||
def : DSAtomicCmpXChg<inst, vt, !cast<PatFrag>(frag#"_m0")>;
|
||||
}
|
||||
|
||||
let OtherPredicates = [NotLDSRequiresM0Init] in {
|
||||
def : DSAtomicCmpXChg<!cast<DS_Pseudo>(inst.NAME#"_gfx9"), vt, !cast<PatFrag>(frag)>;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
// 32-bit atomics.
|
||||
def : DSAtomicRetPat<DS_WRXCHG_RTN_B32, i32, atomic_swap_local_m0>;
|
||||
def : DSAtomicRetPat<DS_ADD_RTN_U32, i32, atomic_load_add_local_m0>;
|
||||
def : DSAtomicRetPat<DS_SUB_RTN_U32, i32, atomic_load_sub_local_m0>;
|
||||
def : DSAtomicRetPat<DS_INC_RTN_U32, i32, atomic_inc_local_m0>;
|
||||
def : DSAtomicRetPat<DS_DEC_RTN_U32, i32, atomic_dec_local_m0>;
|
||||
def : DSAtomicRetPat<DS_AND_RTN_B32, i32, atomic_load_and_local_m0>;
|
||||
def : DSAtomicRetPat<DS_OR_RTN_B32, i32, atomic_load_or_local_m0>;
|
||||
def : DSAtomicRetPat<DS_XOR_RTN_B32, i32, atomic_load_xor_local_m0>;
|
||||
def : DSAtomicRetPat<DS_MIN_RTN_I32, i32, atomic_load_min_local_m0>;
|
||||
def : DSAtomicRetPat<DS_MAX_RTN_I32, i32, atomic_load_max_local_m0>;
|
||||
def : DSAtomicRetPat<DS_MIN_RTN_U32, i32, atomic_load_umin_local_m0>;
|
||||
def : DSAtomicRetPat<DS_MAX_RTN_U32, i32, atomic_load_umax_local_m0>;
|
||||
def : DSAtomicCmpXChg<DS_CMPST_RTN_B32, i32, atomic_cmp_swap_local_m0>;
|
||||
defm : DSAtomicRetPat_mc<DS_WRXCHG_RTN_B32, i32, "atomic_swap_local">;
|
||||
defm : DSAtomicRetPat_mc<DS_ADD_RTN_U32, i32, "atomic_load_add_local">;
|
||||
defm : DSAtomicRetPat_mc<DS_SUB_RTN_U32, i32, "atomic_load_sub_local">;
|
||||
defm : DSAtomicRetPat_mc<DS_INC_RTN_U32, i32, "atomic_inc_local">;
|
||||
defm : DSAtomicRetPat_mc<DS_DEC_RTN_U32, i32, "atomic_dec_local">;
|
||||
defm : DSAtomicRetPat_mc<DS_AND_RTN_B32, i32, "atomic_load_and_local">;
|
||||
defm : DSAtomicRetPat_mc<DS_OR_RTN_B32, i32, "atomic_load_or_local">;
|
||||
defm : DSAtomicRetPat_mc<DS_XOR_RTN_B32, i32, "atomic_load_xor_local">;
|
||||
defm : DSAtomicRetPat_mc<DS_MIN_RTN_I32, i32, "atomic_load_min_local">;
|
||||
defm : DSAtomicRetPat_mc<DS_MAX_RTN_I32, i32, "atomic_load_max_local">;
|
||||
defm : DSAtomicRetPat_mc<DS_MIN_RTN_U32, i32, "atomic_load_umin_local">;
|
||||
defm : DSAtomicRetPat_mc<DS_MAX_RTN_U32, i32, "atomic_load_umax_local">;
|
||||
defm : DSAtomicCmpXChg_mc<DS_CMPST_RTN_B32, i32, "atomic_cmp_swap_local">;
|
||||
|
||||
// 64-bit atomics.
|
||||
def : DSAtomicRetPat<DS_WRXCHG_RTN_B64, i64, atomic_swap_local_m0>;
|
||||
def : DSAtomicRetPat<DS_ADD_RTN_U64, i64, atomic_load_add_local_m0>;
|
||||
def : DSAtomicRetPat<DS_SUB_RTN_U64, i64, atomic_load_sub_local_m0>;
|
||||
def : DSAtomicRetPat<DS_INC_RTN_U64, i64, atomic_inc_local_m0>;
|
||||
def : DSAtomicRetPat<DS_DEC_RTN_U64, i64, atomic_dec_local_m0>;
|
||||
def : DSAtomicRetPat<DS_AND_RTN_B64, i64, atomic_load_and_local_m0>;
|
||||
def : DSAtomicRetPat<DS_OR_RTN_B64, i64, atomic_load_or_local_m0>;
|
||||
def : DSAtomicRetPat<DS_XOR_RTN_B64, i64, atomic_load_xor_local_m0>;
|
||||
def : DSAtomicRetPat<DS_MIN_RTN_I64, i64, atomic_load_min_local_m0>;
|
||||
def : DSAtomicRetPat<DS_MAX_RTN_I64, i64, atomic_load_max_local_m0>;
|
||||
def : DSAtomicRetPat<DS_MIN_RTN_U64, i64, atomic_load_umin_local_m0>;
|
||||
def : DSAtomicRetPat<DS_MAX_RTN_U64, i64, atomic_load_umax_local_m0>;
|
||||
defm : DSAtomicRetPat_mc<DS_WRXCHG_RTN_B64, i64, "atomic_swap_local">;
|
||||
defm : DSAtomicRetPat_mc<DS_ADD_RTN_U64, i64, "atomic_load_add_local">;
|
||||
defm : DSAtomicRetPat_mc<DS_SUB_RTN_U64, i64, "atomic_load_sub_local">;
|
||||
defm : DSAtomicRetPat_mc<DS_INC_RTN_U64, i64, "atomic_inc_local">;
|
||||
defm : DSAtomicRetPat_mc<DS_DEC_RTN_U64, i64, "atomic_dec_local">;
|
||||
defm : DSAtomicRetPat_mc<DS_AND_RTN_B64, i64, "atomic_load_and_local">;
|
||||
defm : DSAtomicRetPat_mc<DS_OR_RTN_B64, i64, "atomic_load_or_local">;
|
||||
defm : DSAtomicRetPat_mc<DS_XOR_RTN_B64, i64, "atomic_load_xor_local">;
|
||||
defm : DSAtomicRetPat_mc<DS_MIN_RTN_I64, i64, "atomic_load_min_local">;
|
||||
defm : DSAtomicRetPat_mc<DS_MAX_RTN_I64, i64, "atomic_load_max_local">;
|
||||
defm : DSAtomicRetPat_mc<DS_MIN_RTN_U64, i64, "atomic_load_umin_local">;
|
||||
defm : DSAtomicRetPat_mc<DS_MAX_RTN_U64, i64, "atomic_load_umax_local">;
|
||||
|
||||
def : DSAtomicCmpXChg<DS_CMPST_RTN_B64, i64, atomic_cmp_swap_local_m0>;
|
||||
defm : DSAtomicCmpXChg_mc<DS_CMPST_RTN_B64, i64, "atomic_cmp_swap_local">;
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// Real instructions
|
||||
|
@ -161,6 +161,9 @@ def SIpc_add_rel_offset : SDNode<"AMDGPUISD::PC_ADD_REL_OFFSET",
|
||||
defm atomic_inc_global : global_binary_atomic_op<SIatomic_inc>;
|
||||
defm atomic_dec_global : global_binary_atomic_op<SIatomic_dec>;
|
||||
|
||||
def atomic_inc_local : local_binary_atomic_op<SIatomic_inc>;
|
||||
def atomic_dec_local : local_binary_atomic_op<SIatomic_dec>;
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// SDNodes PatFrags for loads/stores with a glue input.
|
||||
// This is for SDNodes and PatFrag for local loads and stores to
|
||||
|
@ -112,7 +112,13 @@ private:
|
||||
static bool offsetsCanBeCombined(CombineInfo &CI);
|
||||
|
||||
bool findMatchingInst(CombineInfo &CI);
|
||||
|
||||
unsigned read2Opcode(unsigned EltSize) const;
|
||||
unsigned read2ST64Opcode(unsigned EltSize) const;
|
||||
MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI);
|
||||
|
||||
unsigned write2Opcode(unsigned EltSize) const;
|
||||
unsigned write2ST64Opcode(unsigned EltSize) const;
|
||||
MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI);
|
||||
MachineBasicBlock::iterator mergeSBufferLoadImmPair(CombineInfo &CI);
|
||||
MachineBasicBlock::iterator mergeBufferLoadPair(CombineInfo &CI);
|
||||
@ -436,6 +442,20 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Pick the DS_READ2 opcode for a merged read: an EltSize of 4 selects the B32
// form, any other value the B64 form. When the subtarget's
// ldsRequiresM0Init() is false, the "_gfx9" variant of the opcode is returned.
unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
|
||||
if (STM->ldsRequiresM0Init())
|
||||
return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
|
||||
return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
|
||||
}
|
||||
|
||||
// Pick the DS_READ2ST64 opcode for a merged read: an EltSize of 4 selects the
// B32 form, any other value the B64 form. When the subtarget's
// ldsRequiresM0Init() is false, the "_gfx9" variant of the opcode is returned.
unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
|
||||
if (STM->ldsRequiresM0Init())
|
||||
return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
|
||||
|
||||
return (EltSize == 4) ?
|
||||
AMDGPU::DS_READ2ST64_B32_gfx9 : AMDGPU::DS_READ2ST64_B64_gfx9;
|
||||
}
|
||||
|
||||
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
|
||||
CombineInfo &CI) {
|
||||
MachineBasicBlock *MBB = CI.I->getParent();
|
||||
@ -449,12 +469,8 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
|
||||
|
||||
unsigned NewOffset0 = CI.Offset0;
|
||||
unsigned NewOffset1 = CI.Offset1;
|
||||
unsigned Opc = (CI.EltSize == 4) ? AMDGPU::DS_READ2_B32
|
||||
: AMDGPU::DS_READ2_B64;
|
||||
|
||||
if (CI.UseST64)
|
||||
Opc = (CI.EltSize == 4) ? AMDGPU::DS_READ2ST64_B32
|
||||
: AMDGPU::DS_READ2ST64_B64;
|
||||
unsigned Opc = CI.UseST64 ?
|
||||
read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);
|
||||
|
||||
unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
|
||||
unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;
|
||||
@ -517,6 +533,20 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
|
||||
return Next;
|
||||
}
|
||||
|
||||
// Pick the DS_WRITE2 opcode for a merged write: an EltSize of 4 selects the
// B32 form, any other value the B64 form. When the subtarget's
// ldsRequiresM0Init() is false, the "_gfx9" variant of the opcode is returned.
unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
|
||||
if (STM->ldsRequiresM0Init())
|
||||
return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
|
||||
return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9 : AMDGPU::DS_WRITE2_B64_gfx9;
|
||||
}
|
||||
|
||||
// Pick the DS_WRITE2ST64 opcode for a merged write: an EltSize of 4 selects
// the B32 form, any other value the B64 form. When the subtarget's
// ldsRequiresM0Init() is false, the "_gfx9" variant of the opcode is returned.
unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
|
||||
if (STM->ldsRequiresM0Init())
|
||||
return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 : AMDGPU::DS_WRITE2ST64_B64;
|
||||
|
||||
return (EltSize == 4) ?
|
||||
AMDGPU::DS_WRITE2ST64_B32_gfx9 : AMDGPU::DS_WRITE2ST64_B64_gfx9;
|
||||
}
|
||||
|
||||
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
|
||||
CombineInfo &CI) {
|
||||
MachineBasicBlock *MBB = CI.I->getParent();
|
||||
@ -530,12 +560,8 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
|
||||
|
||||
unsigned NewOffset0 = CI.Offset0;
|
||||
unsigned NewOffset1 = CI.Offset1;
|
||||
unsigned Opc = (CI.EltSize == 4) ? AMDGPU::DS_WRITE2_B32
|
||||
: AMDGPU::DS_WRITE2_B64;
|
||||
|
||||
if (CI.UseST64)
|
||||
Opc = (CI.EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
|
||||
: AMDGPU::DS_WRITE2ST64_B64;
|
||||
unsigned Opc = CI.UseST64 ?
|
||||
write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);
|
||||
|
||||
if (NewOffset0 > NewOffset1) {
|
||||
// Canonicalize the merged instruction so the smaller offset comes first.
|
||||
@ -786,9 +812,13 @@ bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
|
||||
CombineInfo CI;
|
||||
CI.I = I;
|
||||
unsigned Opc = MI.getOpcode();
|
||||
if (Opc == AMDGPU::DS_READ_B32 || Opc == AMDGPU::DS_READ_B64) {
|
||||
if (Opc == AMDGPU::DS_READ_B32 || Opc == AMDGPU::DS_READ_B64 ||
|
||||
Opc == AMDGPU::DS_READ_B32_gfx9 || Opc == AMDGPU::DS_READ_B64_gfx9) {
|
||||
|
||||
CI.InstClass = DS_READ_WRITE;
|
||||
CI.EltSize = (Opc == AMDGPU::DS_READ_B64) ? 8 : 4;
|
||||
CI.EltSize =
|
||||
(Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8 : 4;
|
||||
|
||||
if (findMatchingInst(CI)) {
|
||||
Modified = true;
|
||||
I = mergeRead2Pair(CI);
|
||||
@ -797,10 +827,13 @@ bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
|
||||
}
|
||||
|
||||
continue;
|
||||
}
|
||||
if (Opc == AMDGPU::DS_WRITE_B32 || Opc == AMDGPU::DS_WRITE_B64) {
|
||||
} else if (Opc == AMDGPU::DS_WRITE_B32 || Opc == AMDGPU::DS_WRITE_B64 ||
|
||||
Opc == AMDGPU::DS_WRITE_B32_gfx9 ||
|
||||
Opc == AMDGPU::DS_WRITE_B64_gfx9) {
|
||||
CI.InstClass = DS_READ_WRITE;
|
||||
CI.EltSize = (Opc == AMDGPU::DS_WRITE_B64) ? 8 : 4;
|
||||
CI.EltSize
|
||||
= (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8 : 4;
|
||||
|
||||
if (findMatchingInst(CI)) {
|
||||
Modified = true;
|
||||
I = mergeWrite2Pair(CI);
|
||||
|
@ -1,13 +1,17 @@
|
||||
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=SICI -check-prefix=GCN -check-prefix=FUNC %s
|
||||
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=SICI -check-prefix=CIVI -check-prefix=GCN -check-prefix=FUNC %s
|
||||
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=CIVI -check-prefix=GCN -check-prefix=FUNC %s
|
||||
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI,SICI,SICIVI,GCN %s
|
||||
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SICI,CIVI,SICIVI,GCN %s
|
||||
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI,CIVI,SICIVI,GFX89,GCN %s
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX89,GCN %s
|
||||
|
||||
; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_ret_i32_offset:
|
||||
; SICI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
|
||||
; SICI: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
|
||||
; VI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
|
||||
; VI: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
|
||||
; GCN: v_mov_b32_e32 [[VCMP:v[0-9]+]], 7
|
||||
; GCN-LABEL: {{^}}lds_atomic_cmpxchg_ret_i32_offset:
|
||||
; GFX9-NOT: m0
|
||||
; SICIVI-DAG: s_mov_b32 m0
|
||||
|
||||
; SICI-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
|
||||
; SICI-DAG: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
|
||||
; GFX89-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
|
||||
; GFX89-DAG: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
|
||||
; GCN-DAG: v_mov_b32_e32 [[VCMP:v[0-9]+]], 7
|
||||
; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
|
||||
; GCN-DAG: v_mov_b32_e32 [[VSWAP:v[0-9]+]], [[SWAP]]
|
||||
; GCN: ds_cmpst_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[VCMP]], [[VSWAP]] offset:16
|
||||
@ -20,18 +24,21 @@ define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i32_offset(i32 addrspace(1)* %
|
||||
ret void
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_ret_i64_offset:
|
||||
; GCN-LABEL: {{^}}lds_atomic_cmpxchg_ret_i64_offset:
|
||||
; GFX9-NOT: m0
|
||||
; SICIVI-DAG: s_mov_b32 m0
|
||||
|
||||
; SICI-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
|
||||
; SICI-DAG: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd
|
||||
; VI-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
|
||||
; VI-DAG: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x34
|
||||
; GFX89-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
|
||||
; GFX89-DAG: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x34
|
||||
; GCN-DAG: v_mov_b32_e32 v[[LOVCMP:[0-9]+]], 7
|
||||
; GCN-DAG: v_mov_b32_e32 v[[HIVCMP:[0-9]+]], 0
|
||||
; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
|
||||
; GCN-DAG: v_mov_b32_e32 v[[LOSWAPV:[0-9]+]], s[[LOSWAP]]
|
||||
; GCN-DAG: v_mov_b32_e32 v[[HISWAPV:[0-9]+]], s[[HISWAP]]
|
||||
; GCN: ds_cmpst_rtn_b64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVCMP]]:[[HIVCMP]]{{\]}}, v{{\[}}[[LOSWAPV]]:[[HISWAPV]]{{\]}} offset:32
|
||||
; GCN: buffer_store_dwordx2 [[RESULT]],
|
||||
; GCN: [[RESULT]]
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr, i64 %swap) nounwind {
|
||||
%gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
|
||||
@ -41,9 +48,11 @@ define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i64_offset(i64 addrspace(1)* %
|
||||
ret void
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_ret_i32_bad_si_offset
|
||||
; GCN-LABEL: {{^}}lds_atomic_cmpxchg_ret_i32_bad_si_offset
|
||||
; GFX9-NOT: m0
|
||||
; SI: ds_cmpst_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; CIVI: ds_cmpst_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
|
||||
; GFX9: ds_cmpst_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %swap, i32 %a, i32 %b) nounwind {
|
||||
%sub = sub i32 %a, %b
|
||||
@ -55,11 +64,15 @@ define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i32_bad_si_offset(i32 addrspac
|
||||
ret void
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_noret_i32_offset:
|
||||
; SICI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x9
|
||||
; SICI: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xa
|
||||
; VI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x24
|
||||
; VI: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x28
|
||||
; GCN-LABEL: {{^}}lds_atomic_cmpxchg_noret_i32_offset:
|
||||
; GFX9-NOT: m0
|
||||
; SICIVI-DAG: s_mov_b32 m0
|
||||
|
||||
|
||||
; SICI-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x9
|
||||
; SICI-DAG: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xa
|
||||
; GFX89-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x24
|
||||
; GFX89-DAG: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x28
|
||||
; GCN-DAG: v_mov_b32_e32 [[VCMP:v[0-9]+]], 7
|
||||
; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
|
||||
; GCN-DAG: v_mov_b32_e32 [[VSWAP:v[0-9]+]], [[SWAP]]
|
||||
@ -72,11 +85,14 @@ define amdgpu_kernel void @lds_atomic_cmpxchg_noret_i32_offset(i32 addrspace(3)*
|
||||
ret void
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_noret_i64_offset:
|
||||
; SICI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x9
|
||||
; SICI: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
|
||||
; VI-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x24
|
||||
; VI-DAG: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
|
||||
; GCN-LABEL: {{^}}lds_atomic_cmpxchg_noret_i64_offset:
|
||||
; GFX9-NOT: m0
|
||||
; SICIVI-DAG: s_mov_b32 m0
|
||||
|
||||
; SICI-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x9
|
||||
; SICI-DAG: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
|
||||
; GFX89-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x24
|
||||
; GFX89-DAG: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
|
||||
; GCN-DAG: v_mov_b32_e32 v[[LOVCMP:[0-9]+]], 7
|
||||
; GCN-DAG: v_mov_b32_e32 v[[HIVCMP:[0-9]+]], 0
|
||||
; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
|
||||
|
@ -1,18 +1,24 @@
|
||||
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s -check-prefix=SI -check-prefix=FUNC
|
||||
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -check-prefix=SI -check-prefix=FUNC
|
||||
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
|
||||
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
|
||||
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,FUNC %s
|
||||
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefixes=R600,FUNC %s
|
||||
|
||||
; FUNC-LABEL: {{^}}atomic_add_local:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
; R600: LDS_ADD *
|
||||
; SI: ds_add_u32
|
||||
; GCN: ds_add_u32
|
||||
define amdgpu_kernel void @atomic_add_local(i32 addrspace(3)* %local) {
|
||||
%unused = atomicrmw volatile add i32 addrspace(3)* %local, i32 5 seq_cst
|
||||
ret void
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}atomic_add_local_const_offset:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; R600: LDS_ADD *
|
||||
; SI: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
|
||||
; GCN: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
|
||||
define amdgpu_kernel void @atomic_add_local_const_offset(i32 addrspace(3)* %local) {
|
||||
%gep = getelementptr i32, i32 addrspace(3)* %local, i32 4
|
||||
%val = atomicrmw volatile add i32 addrspace(3)* %gep, i32 5 seq_cst
|
||||
@ -20,8 +26,11 @@ define amdgpu_kernel void @atomic_add_local_const_offset(i32 addrspace(3)* %loca
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}atomic_add_ret_local:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; R600: LDS_ADD_RET *
|
||||
; SI: ds_add_rtn_u32
|
||||
; GCN: ds_add_rtn_u32
|
||||
define amdgpu_kernel void @atomic_add_ret_local(i32 addrspace(1)* %out, i32 addrspace(3)* %local) {
|
||||
%val = atomicrmw volatile add i32 addrspace(3)* %local, i32 5 seq_cst
|
||||
store i32 %val, i32 addrspace(1)* %out
|
||||
@ -29,8 +38,11 @@ define amdgpu_kernel void @atomic_add_ret_local(i32 addrspace(1)* %out, i32 addr
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}atomic_add_ret_local_const_offset:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; R600: LDS_ADD_RET *
|
||||
; SI: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:20
|
||||
; GCN: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:20
|
||||
define amdgpu_kernel void @atomic_add_ret_local_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %local) {
|
||||
%gep = getelementptr i32, i32 addrspace(3)* %local, i32 5
|
||||
%val = atomicrmw volatile add i32 addrspace(3)* %gep, i32 5 seq_cst
|
||||
|
@ -1,18 +1,25 @@
|
||||
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
|
||||
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
|
||||
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
|
||||
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SICIVI,FUNC %s
|
||||
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SICIVI,FUNC %s
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,FUNC %s
|
||||
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -enable-var-scope -check-prefixes=R600,FUNC %s
|
||||
|
||||
; FUNC-LABEL: {{^}}atomic_sub_local:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; R600: LDS_SUB *
|
||||
; SI: ds_sub_u32
|
||||
; GCN: ds_sub_u32
|
||||
define amdgpu_kernel void @atomic_sub_local(i32 addrspace(3)* %local) {
|
||||
%unused = atomicrmw volatile sub i32 addrspace(3)* %local, i32 5 seq_cst
|
||||
ret void
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}atomic_sub_local_const_offset:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; R600: LDS_SUB *
|
||||
; SI: ds_sub_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
|
||||
; GCN: ds_sub_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
|
||||
define amdgpu_kernel void @atomic_sub_local_const_offset(i32 addrspace(3)* %local) {
|
||||
%gep = getelementptr i32, i32 addrspace(3)* %local, i32 4
|
||||
%val = atomicrmw volatile sub i32 addrspace(3)* %gep, i32 5 seq_cst
|
||||
@ -20,8 +27,11 @@ define amdgpu_kernel void @atomic_sub_local_const_offset(i32 addrspace(3)* %loca
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}atomic_sub_ret_local:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; R600: LDS_SUB_RET *
|
||||
; SI: ds_sub_rtn_u32
|
||||
; GCN: ds_sub_rtn_u32
|
||||
define amdgpu_kernel void @atomic_sub_ret_local(i32 addrspace(1)* %out, i32 addrspace(3)* %local) {
|
||||
%val = atomicrmw volatile sub i32 addrspace(3)* %local, i32 5 seq_cst
|
||||
store i32 %val, i32 addrspace(1)* %out
|
||||
@ -29,8 +39,11 @@ define amdgpu_kernel void @atomic_sub_ret_local(i32 addrspace(1)* %out, i32 addr
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}atomic_sub_ret_local_const_offset:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; R600: LDS_SUB_RET *
|
||||
; SI: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:20
|
||||
; GCN: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:20
|
||||
define amdgpu_kernel void @atomic_sub_ret_local_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %local) {
|
||||
%gep = getelementptr i32, i32 addrspace(3)* %local, i32 5
|
||||
%val = atomicrmw volatile sub i32 addrspace(3)* %gep, i32 5 seq_cst
|
||||
|
@ -1,4 +1,5 @@
|
||||
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -strict-whitespace -check-prefix=SI %s
|
||||
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -strict-whitespace -check-prefixes=GCN,CI %s
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -strict-whitespace -check-prefixes=GCN,GFX9 %s
|
||||
|
||||
; FIXME: We don't get cases where the address was an SGPR because we
|
||||
; get a copy to the address register for each one.
|
||||
@ -6,12 +7,16 @@
|
||||
@lds = addrspace(3) global [512 x float] undef, align 4
|
||||
@lds.f64 = addrspace(3) global [512 x double] undef, align 8
|
||||
|
||||
; SI-LABEL: @simple_read2_f32
|
||||
; SI: ds_read2_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:8
|
||||
; SI: s_waitcnt lgkmcnt(0)
|
||||
; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[LO_VREG]], v[[HI_VREG]]
|
||||
; SI: buffer_store_dword [[RESULT]]
|
||||
; SI: s_endpgm
|
||||
; GCN-LABEL: {{^}}simple_read2_f32:
|
||||
; CI-DAG: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_read2_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:8
|
||||
; GCN: s_waitcnt lgkmcnt(0)
|
||||
; GCN: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[LO_VREG]], v[[HI_VREG]]
|
||||
; CI: buffer_store_dword [[RESULT]]
|
||||
; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @simple_read2_f32(float addrspace(1)* %out) #0 {
|
||||
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
%arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
|
||||
@ -25,12 +30,16 @@ define amdgpu_kernel void @simple_read2_f32(float addrspace(1)* %out) #0 {
|
||||
ret void
|
||||
}
|
||||
|
||||
; SI-LABEL: @simple_read2_f32_max_offset
|
||||
; SI: ds_read2_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:255
|
||||
; SI: s_waitcnt lgkmcnt(0)
|
||||
; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[LO_VREG]], v[[HI_VREG]]
|
||||
; SI: buffer_store_dword [[RESULT]]
|
||||
; SI: s_endpgm
|
||||
; GCN-LABEL: {{^}}simple_read2_f32_max_offset:
|
||||
; CI-DAG: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_read2_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:255
|
||||
; GCN: s_waitcnt lgkmcnt(0)
|
||||
; GCN: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[LO_VREG]], v[[HI_VREG]]
|
||||
|
||||
; CI: buffer_store_dword [[RESULT]]
|
||||
; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
|
||||
define amdgpu_kernel void @simple_read2_f32_max_offset(float addrspace(1)* %out) #0 {
|
||||
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
%arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
|
||||
@ -44,11 +53,14 @@ define amdgpu_kernel void @simple_read2_f32_max_offset(float addrspace(1)* %out)
|
||||
ret void
|
||||
}
|
||||
|
||||
; SI-LABEL: @simple_read2_f32_too_far
|
||||
; SI-NOT: ds_read2_b32
|
||||
; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:1028
|
||||
; SI: s_endpgm
|
||||
; GCN-LABEL: @simple_read2_f32_too_far
|
||||
; CI-DAG: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN-NOT: ds_read2_b32
|
||||
; GCN: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; GCN: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:1028
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @simple_read2_f32_too_far(float addrspace(1)* %out) #0 {
|
||||
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
%arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
|
||||
@ -62,10 +74,13 @@ define amdgpu_kernel void @simple_read2_f32_too_far(float addrspace(1)* %out) #0
|
||||
ret void
|
||||
}
|
||||
|
||||
; SI-LABEL: @simple_read2_f32_x2
|
||||
; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR:v[0-9]+]] offset1:8
|
||||
; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR]] offset0:11 offset1:27
|
||||
; SI: s_endpgm
|
||||
; GCN-LABEL: @simple_read2_f32_x2
|
||||
; CI-DAG: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR:v[0-9]+]] offset1:8
|
||||
; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR]] offset0:11 offset1:27
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @simple_read2_f32_x2(float addrspace(1)* %out) #0 {
|
||||
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
%idx.0 = add nsw i32 %tid.x, 0
|
||||
@ -93,11 +108,14 @@ define amdgpu_kernel void @simple_read2_f32_x2(float addrspace(1)* %out) #0 {
|
||||
}
|
||||
|
||||
; Make sure there is an instruction between the two sets of reads.
|
||||
; SI-LABEL: @simple_read2_f32_x2_barrier
|
||||
; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR:v[0-9]+]] offset1:8
|
||||
; SI: s_barrier
|
||||
; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR]] offset0:11 offset1:27
|
||||
; SI: s_endpgm
|
||||
; GCN-LABEL: @simple_read2_f32_x2_barrier
|
||||
; CI-DAG: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR:v[0-9]+]] offset1:8
|
||||
; GCN: s_barrier
|
||||
; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR]] offset0:11 offset1:27
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @simple_read2_f32_x2_barrier(float addrspace(1)* %out) #0 {
|
||||
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
%idx.0 = add nsw i32 %tid.x, 0
|
||||
@ -129,10 +147,13 @@ define amdgpu_kernel void @simple_read2_f32_x2_barrier(float addrspace(1)* %out)
|
||||
; For some reason adding something to the base address for the first
|
||||
; element results in only folding the inner pair.
|
||||
|
||||
; SI-LABEL: @simple_read2_f32_x2_nonzero_base
|
||||
; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR:v[0-9]+]] offset0:2 offset1:8
|
||||
; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR]] offset0:11 offset1:27
|
||||
; SI: s_endpgm
|
||||
; GCN-LABEL: @simple_read2_f32_x2_nonzero_base
|
||||
; CI-DAG: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR:v[0-9]+]] offset0:2 offset1:8
|
||||
; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR]] offset0:11 offset1:27
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @simple_read2_f32_x2_nonzero_base(float addrspace(1)* %out) #0 {
|
||||
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
%idx.0 = add nsw i32 %tid.x, 2
|
||||
@ -165,11 +186,14 @@ define amdgpu_kernel void @simple_read2_f32_x2_nonzero_base(float addrspace(1)*
|
||||
; Base pointers come from different subregister of same super
|
||||
; register. We can't safely merge this.
|
||||
|
||||
; SI-LABEL: @read2_ptr_is_subreg_arg_f32
|
||||
; SI-NOT: ds_read2_b32
|
||||
; SI: ds_read_b32
|
||||
; SI: ds_read_b32
|
||||
; SI: s_endpgm
|
||||
; GCN-LABEL: @read2_ptr_is_subreg_arg_f32
|
||||
; CI-DAG: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN-NOT: ds_read2_b32
|
||||
; GCN: ds_read_b32
|
||||
; GCN: ds_read_b32
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @read2_ptr_is_subreg_arg_f32(float addrspace(1)* %out, <2 x float addrspace(3)*> %lds.ptr) #0 {
|
||||
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
%index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0
|
||||
@ -191,11 +215,14 @@ define amdgpu_kernel void @read2_ptr_is_subreg_arg_f32(float addrspace(1)* %out,
|
||||
; sure we are really rejecting it because of the different
|
||||
; subregisters.
|
||||
|
||||
; SI-LABEL: @read2_ptr_is_subreg_arg_offset_f32
|
||||
; SI-NOT: ds_read2_b32
|
||||
; SI: ds_read_b32
|
||||
; SI: ds_read_b32
|
||||
; SI: s_endpgm
|
||||
; GCN-LABEL: @read2_ptr_is_subreg_arg_offset_f32
|
||||
; CI-DAG: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN-NOT: ds_read2_b32
|
||||
; GCN: ds_read_b32
|
||||
; GCN: ds_read_b32
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @read2_ptr_is_subreg_arg_offset_f32(float addrspace(1)* %out, <2 x float addrspace(3)*> %lds.ptr) #0 {
|
||||
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
%index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0
|
||||
@ -216,9 +243,12 @@ define amdgpu_kernel void @read2_ptr_is_subreg_arg_offset_f32(float addrspace(1)
|
||||
ret void
|
||||
}
|
||||
|
||||
; SI-LABEL: {{^}}read2_ptr_is_subreg_f32:
|
||||
; SI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset1:8{{$}}
|
||||
; SI: s_endpgm
|
||||
; GCN-LABEL: {{^}}read2_ptr_is_subreg_f32:
|
||||
; CI-DAG: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset1:8{{$}}
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @read2_ptr_is_subreg_f32(float addrspace(1)* %out) #0 {
|
||||
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
%ptr.0 = insertelement <2 x [512 x float] addrspace(3)*> undef, [512 x float] addrspace(3)* @lds, i32 0
|
||||
@ -238,11 +268,14 @@ define amdgpu_kernel void @read2_ptr_is_subreg_f32(float addrspace(1)* %out) #0
|
||||
ret void
|
||||
}
|
||||
|
||||
; SI-LABEL: @simple_read2_f32_volatile_0
|
||||
; SI-NOT: ds_read2_b32
|
||||
; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:32
|
||||
; SI: s_endpgm
|
||||
; GCN-LABEL: @simple_read2_f32_volatile_0
|
||||
; CI-DAG: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN-NOT: ds_read2_b32
|
||||
; GCN: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; GCN: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:32
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @simple_read2_f32_volatile_0(float addrspace(1)* %out) #0 {
|
||||
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
%arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
|
||||
@ -256,11 +289,14 @@ define amdgpu_kernel void @simple_read2_f32_volatile_0(float addrspace(1)* %out)
|
||||
ret void
|
||||
}
|
||||
|
||||
; SI-LABEL: @simple_read2_f32_volatile_1
|
||||
; SI-NOT: ds_read2_b32
|
||||
; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:32
|
||||
; SI: s_endpgm
|
||||
; GCN-LABEL: @simple_read2_f32_volatile_1
|
||||
; CI-DAG: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN-NOT: ds_read2_b32
|
||||
; GCN: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; GCN: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:32
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @simple_read2_f32_volatile_1(float addrspace(1)* %out) #0 {
|
||||
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
%arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
|
||||
@ -277,9 +313,12 @@ define amdgpu_kernel void @simple_read2_f32_volatile_1(float addrspace(1)* %out)
|
||||
; Can't fold since not correctly aligned.
|
||||
; XXX: This isn't really testing anything useful now. I think CI
|
||||
; allows unaligned LDS accesses, which would be a problem here.
|
||||
; SI-LABEL: @unaligned_read2_f32
|
||||
; SI-NOT: ds_read2_b32
|
||||
; SI: s_endpgm
|
||||
; GCN-LABEL: @unaligned_read2_f32
|
||||
; CI-DAG: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN-NOT: ds_read2_b32
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @unaligned_read2_f32(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
|
||||
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
%arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %x.i
|
||||
@ -293,9 +332,12 @@ define amdgpu_kernel void @unaligned_read2_f32(float addrspace(1)* %out, float a
|
||||
ret void
|
||||
}
|
||||
|
||||
; SI-LABEL: @misaligned_2_simple_read2_f32
|
||||
; SI-NOT: ds_read2_b32
|
||||
; SI: s_endpgm
|
||||
; GCN-LABEL: @misaligned_2_simple_read2_f32
|
||||
; CI-DAG: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN-NOT: ds_read2_b32
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @misaligned_2_simple_read2_f32(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
|
||||
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
%arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %x.i
|
||||
@ -309,12 +351,16 @@ define amdgpu_kernel void @misaligned_2_simple_read2_f32(float addrspace(1)* %ou
|
||||
ret void
|
||||
}
|
||||
|
||||
; SI-LABEL: @simple_read2_f64
|
||||
; SI: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, {{v[0-9]+}}
|
||||
; SI: ds_read2_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, [[VPTR]] offset1:8
|
||||
; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}}
|
||||
; SI: buffer_store_dwordx2 [[RESULT]]
|
||||
; SI: s_endpgm
|
||||
; GCN-LABEL: @simple_read2_f64
|
||||
; CI-DAG: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, {{v[0-9]+}}
|
||||
; GCN: ds_read2_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, [[VPTR]] offset1:8
|
||||
; GCN: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}}
|
||||
|
||||
; CI: buffer_store_dwordx2 [[RESULT]]
|
||||
; GFX9: global_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
|
||||
define amdgpu_kernel void @simple_read2_f64(double addrspace(1)* %out) #0 {
|
||||
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
%arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
|
||||
@ -328,9 +374,12 @@ define amdgpu_kernel void @simple_read2_f64(double addrspace(1)* %out) #0 {
|
||||
ret void
|
||||
}
|
||||
|
||||
; SI-LABEL: @simple_read2_f64_max_offset
|
||||
; SI: ds_read2_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:255
|
||||
; SI: s_endpgm
|
||||
; GCN-LABEL: @simple_read2_f64_max_offset
|
||||
; CI-DAG: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_read2_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:255
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @simple_read2_f64_max_offset(double addrspace(1)* %out) #0 {
|
||||
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
%arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
|
||||
@ -344,11 +393,14 @@ define amdgpu_kernel void @simple_read2_f64_max_offset(double addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; SI-LABEL: @simple_read2_f64_too_far
|
||||
; SI-NOT: ds_read2_b64
|
||||
; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}
|
||||
; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:2056
|
||||
; SI: s_endpgm
|
||||
; GCN-LABEL: @simple_read2_f64_too_far
|
||||
; CI-DAG: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN-NOT: ds_read2_b64
|
||||
; GCN: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}
|
||||
; GCN: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:2056
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @simple_read2_f64_too_far(double addrspace(1)* %out) #0 {
|
||||
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
%arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
|
||||
@ -363,10 +415,13 @@ define amdgpu_kernel void @simple_read2_f64_too_far(double addrspace(1)* %out) #
|
||||
}
|
||||
|
||||
; Alignment only 4
|
||||
; SI-LABEL: @misaligned_read2_f64
|
||||
; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset1:1
|
||||
; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:14 offset1:15
|
||||
; SI: s_endpgm
|
||||
; GCN-LABEL: @misaligned_read2_f64
|
||||
; CI-DAG: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset1:1
|
||||
; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:14 offset1:15
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @misaligned_read2_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
|
||||
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
%arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i
|
||||
@ -382,9 +437,12 @@ define amdgpu_kernel void @misaligned_read2_f64(double addrspace(1)* %out, doubl
|
||||
|
||||
@foo = addrspace(3) global [4 x i32] undef, align 4
|
||||
|
||||
; SI-LABEL: @load_constant_adjacent_offsets
|
||||
; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
|
||||
; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:1
|
||||
; GCN-LABEL: @load_constant_adjacent_offsets
|
||||
; CI-DAG: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
|
||||
; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:1
|
||||
define amdgpu_kernel void @load_constant_adjacent_offsets(i32 addrspace(1)* %out) {
|
||||
%val0 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
|
||||
%val1 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 1), align 4
|
||||
@ -393,9 +451,12 @@ define amdgpu_kernel void @load_constant_adjacent_offsets(i32 addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; SI-LABEL: @load_constant_disjoint_offsets
|
||||
; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
|
||||
; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:2
|
||||
; GCN-LABEL: @load_constant_disjoint_offsets
|
||||
; CI-DAG: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
|
||||
; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:2
|
||||
define amdgpu_kernel void @load_constant_disjoint_offsets(i32 addrspace(1)* %out) {
|
||||
%val0 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
|
||||
%val1 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 2), align 4
|
||||
@ -406,10 +467,13 @@ define amdgpu_kernel void @load_constant_disjoint_offsets(i32 addrspace(1)* %out
|
||||
|
||||
@bar = addrspace(3) global [4 x i64] undef, align 4
|
||||
|
||||
; SI-LABEL: @load_misaligned64_constant_offsets
|
||||
; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
|
||||
; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:1
|
||||
; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset0:2 offset1:3
|
||||
; GCN-LABEL: @load_misaligned64_constant_offsets
|
||||
; CI-DAG: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
|
||||
; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:1
|
||||
; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset0:2 offset1:3
|
||||
define amdgpu_kernel void @load_misaligned64_constant_offsets(i64 addrspace(1)* %out) {
|
||||
%val0 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4
|
||||
%val1 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4
|
||||
@ -420,12 +484,15 @@ define amdgpu_kernel void @load_misaligned64_constant_offsets(i64 addrspace(1)*
|
||||
|
||||
@bar.large = addrspace(3) global [4096 x i64] undef, align 4
|
||||
|
||||
; SI-LABEL: @load_misaligned64_constant_large_offsets
|
||||
; SI-DAG: v_mov_b32_e32 [[BASE0:v[0-9]+]], 0x7ff8{{$}}
|
||||
; SI-DAG: v_mov_b32_e32 [[BASE1:v[0-9]+]], 0x4000
|
||||
; SI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASE0]] offset1:1
|
||||
; SI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASE1]] offset1:1
|
||||
; SI: s_endpgm
|
||||
; GCN-LABEL: @load_misaligned64_constant_large_offsets
|
||||
; CI-DAG: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN-DAG: v_mov_b32_e32 [[BASE0:v[0-9]+]], 0x7ff8{{$}}
|
||||
; GCN-DAG: v_mov_b32_e32 [[BASE1:v[0-9]+]], 0x4000
|
||||
; GCN-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASE0]] offset1:1
|
||||
; GCN-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASE1]] offset1:1
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @load_misaligned64_constant_large_offsets(i64 addrspace(1)* %out) {
|
||||
%val0 = load i64, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 2048), align 4
|
||||
%val1 = load i64, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 4095), align 4
|
||||
@ -437,6 +504,10 @@ define amdgpu_kernel void @load_misaligned64_constant_large_offsets(i64 addrspac
|
||||
@sgemm.lA = internal unnamed_addr addrspace(3) global [264 x float] undef, align 4
|
||||
@sgemm.lB = internal unnamed_addr addrspace(3) global [776 x float] undef, align 4
|
||||
|
||||
; GCN-LABEL: {{^}}sgemm_inner_loop_read2_sequence:
|
||||
; CI-DAG: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
define amdgpu_kernel void @sgemm_inner_loop_read2_sequence(float addrspace(1)* %C, i32 %lda, i32 %ldb) #0 {
|
||||
%x.i = tail call i32 @llvm.amdgcn.workgroup.id.x() #1
|
||||
%y.i = tail call i32 @llvm.amdgcn.workitem.id.y() #1
|
||||
@ -481,20 +552,29 @@ define amdgpu_kernel void @sgemm_inner_loop_read2_sequence(float addrspace(1)* %
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}misaligned_read2_v2i32:
|
||||
; CI-DAG: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
; Kernel under test: copies one <2 x i32> from %in (addrspace(3)) to %out
; (addrspace(1)). The checks preceding this function only pin whether m0 is
; initialized; they do not pin which DS load instruction is selected.
define amdgpu_kernel void @misaligned_read2_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(3)* %in) #0 {
|
||||
; The load is only 4-byte aligned although the loaded value is 8 bytes wide.
%load = load <2 x i32>, <2 x i32> addrspace(3)* %in, align 4
|
||||
store <2 x i32> %load, <2 x i32> addrspace(1)* %out, align 8
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}misaligned_read2_i64:
|
||||
; CI-DAG: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
; Kernel under test: copies one i64 from %in (addrspace(3)) to %out
; (addrspace(1)). The checks preceding this function only pin whether m0 is
; initialized; they do not pin which DS load instruction is selected.
define amdgpu_kernel void @misaligned_read2_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %in) #0 {
|
||||
; The load is only 4-byte aligned although the loaded value is 8 bytes wide.
%load = load i64, i64 addrspace(3)* %in, align 4
|
||||
store i64 %load, i64 addrspace(1)* %out, align 8
|
||||
ret void
|
||||
}
|
||||
|
||||
; SI-LABEL: ds_read_diff_base_interleaving
|
||||
; SI-NOT: ds_read_b32
|
||||
; GCN-LABEL: ds_read_diff_base_interleaving
|
||||
; CI-DAG: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN-NOT: ds_read_b32
|
||||
define amdgpu_kernel void @ds_read_diff_base_interleaving(
|
||||
float addrspace(1)* nocapture %arg,
|
||||
[4 x [4 x float]] addrspace(3)* %arg1,
|
||||
@ -533,19 +613,10 @@ bb:
|
||||
ret void
|
||||
}
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare i32 @llvm.amdgcn.workgroup.id.x() #1
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare i32 @llvm.amdgcn.workgroup.id.y() #1
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare i32 @llvm.amdgcn.workitem.id.y() #1
|
||||
|
||||
; Function Attrs: convergent nounwind
|
||||
declare void @llvm.amdgcn.s.barrier() #2
|
||||
|
||||
attributes #0 = { nounwind }
|
||||
|
@ -1,15 +1,19 @@
|
||||
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -check-prefix=SI %s
|
||||
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -check-prefixes=GCN,CI %s
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -check-prefixes=GCN,GFX9 %s
|
||||
|
||||
@lds = addrspace(3) global [512 x float] undef, align 4
|
||||
@lds.f64 = addrspace(3) global [512 x double] undef, align 8
|
||||
|
||||
|
||||
; SI-LABEL: @simple_read2st64_f32_0_1
|
||||
; SI: ds_read2st64_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1
|
||||
; SI: s_waitcnt lgkmcnt(0)
|
||||
; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[LO_VREG]], v[[HI_VREG]]
|
||||
; SI: buffer_store_dword [[RESULT]]
|
||||
; SI: s_endpgm
|
||||
; GCN-LABEL: @simple_read2st64_f32_0_1
|
||||
; CI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_read2st64_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1
|
||||
; GCN: s_waitcnt lgkmcnt(0)
|
||||
; GCN: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[LO_VREG]], v[[HI_VREG]]
|
||||
; CI: buffer_store_dword [[RESULT]]
|
||||
; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
|
||||
define amdgpu_kernel void @simple_read2st64_f32_0_1(float addrspace(1)* %out) #0 {
|
||||
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
%arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
|
||||
@ -23,12 +27,15 @@ define amdgpu_kernel void @simple_read2st64_f32_0_1(float addrspace(1)* %out) #0
|
||||
ret void
|
||||
}
|
||||
|
||||
; SI-LABEL: @simple_read2st64_f32_1_2
|
||||
; SI: ds_read2st64_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1 offset1:2
|
||||
; SI: s_waitcnt lgkmcnt(0)
|
||||
; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[LO_VREG]], v[[HI_VREG]]
|
||||
; SI: buffer_store_dword [[RESULT]]
|
||||
; SI: s_endpgm
|
||||
; GCN-LABEL: @simple_read2st64_f32_1_2
|
||||
; CI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_read2st64_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1 offset1:2
|
||||
; GCN: s_waitcnt lgkmcnt(0)
|
||||
; GCN: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[LO_VREG]], v[[HI_VREG]]
|
||||
; CI: buffer_store_dword [[RESULT]]
|
||||
; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
|
||||
define amdgpu_kernel void @simple_read2st64_f32_1_2(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
|
||||
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
%add.x.0 = add nsw i32 %x.i, 64
|
||||
@ -43,12 +50,15 @@ define amdgpu_kernel void @simple_read2st64_f32_1_2(float addrspace(1)* %out, fl
|
||||
ret void
|
||||
}
|
||||
|
||||
; SI-LABEL: @simple_read2st64_f32_max_offset
|
||||
; SI: ds_read2st64_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1 offset1:255
|
||||
; SI: s_waitcnt lgkmcnt(0)
|
||||
; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[LO_VREG]], v[[HI_VREG]]
|
||||
; SI: buffer_store_dword [[RESULT]]
|
||||
; SI: s_endpgm
|
||||
; GCN-LABEL: @simple_read2st64_f32_max_offset
|
||||
; CI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_read2st64_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1 offset1:255
|
||||
; GCN: s_waitcnt lgkmcnt(0)
|
||||
; GCN: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[LO_VREG]], v[[HI_VREG]]
|
||||
; CI: buffer_store_dword [[RESULT]]
|
||||
; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
|
||||
define amdgpu_kernel void @simple_read2st64_f32_max_offset(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
|
||||
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
%add.x.0 = add nsw i32 %x.i, 64
|
||||
@ -63,12 +73,15 @@ define amdgpu_kernel void @simple_read2st64_f32_max_offset(float addrspace(1)* %
|
||||
ret void
|
||||
}
|
||||
|
||||
; SI-LABEL: @simple_read2st64_f32_over_max_offset
|
||||
; SI-NOT: ds_read2st64_b32
|
||||
; SI-DAG: v_add_i32_e32 [[BIGADD:v[0-9]+]], vcc, 0x10000, {{v[0-9]+}}
|
||||
; SI-DAG: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:256
|
||||
; SI-DAG: ds_read_b32 {{v[0-9]+}}, [[BIGADD]]{{$}}
|
||||
; SI: s_endpgm
|
||||
; GCN-LABEL: @simple_read2st64_f32_over_max_offset
|
||||
; CI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN-NOT: ds_read2st64_b32
|
||||
; GCN-DAG: v_add{{(_co)?}}_{{i|u}}32_e32 [[BIGADD:v[0-9]+]], vcc, 0x10000, {{v[0-9]+}}
|
||||
; GCN-DAG: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:256
|
||||
; GCN-DAG: ds_read_b32 {{v[0-9]+}}, [[BIGADD]]{{$}}
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @simple_read2st64_f32_over_max_offset(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
|
||||
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
%add.x.0 = add nsw i32 %x.i, 64
|
||||
@ -83,9 +96,12 @@ define amdgpu_kernel void @simple_read2st64_f32_over_max_offset(float addrspace(
|
||||
ret void
|
||||
}
|
||||
|
||||
; SI-LABEL: @odd_invalid_read2st64_f32_0
|
||||
; SI-NOT: ds_read2st64_b32
|
||||
; SI: s_endpgm
|
||||
; GCN-LABEL: @odd_invalid_read2st64_f32_0
|
||||
; CI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN-NOT: ds_read2st64_b32
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @odd_invalid_read2st64_f32_0(float addrspace(1)* %out) #0 {
|
||||
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
%arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
|
||||
@ -99,9 +115,12 @@ define amdgpu_kernel void @odd_invalid_read2st64_f32_0(float addrspace(1)* %out)
|
||||
ret void
|
||||
}
|
||||
|
||||
; SI-LABEL: @odd_invalid_read2st64_f32_1
|
||||
; SI-NOT: ds_read2st64_b32
|
||||
; SI: s_endpgm
|
||||
; GCN-LABEL: @odd_invalid_read2st64_f32_1
|
||||
; CI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN-NOT: ds_read2st64_b32
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @odd_invalid_read2st64_f32_1(float addrspace(1)* %out) #0 {
|
||||
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
%add.x.0 = add nsw i32 %x.i, 64
|
||||
@ -116,12 +135,15 @@ define amdgpu_kernel void @odd_invalid_read2st64_f32_1(float addrspace(1)* %out)
|
||||
ret void
|
||||
}
|
||||
|
||||
; SI-LABEL: @simple_read2st64_f64_0_1
|
||||
; SI: ds_read2st64_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1
|
||||
; SI: s_waitcnt lgkmcnt(0)
|
||||
; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}}
|
||||
; SI: buffer_store_dwordx2 [[RESULT]]
|
||||
; SI: s_endpgm
|
||||
; GCN-LABEL: @simple_read2st64_f64_0_1
|
||||
; CI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_read2st64_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1
|
||||
; GCN: s_waitcnt lgkmcnt(0)
|
||||
; GCN: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}}
|
||||
; CI: buffer_store_dwordx2 [[RESULT]]
|
||||
; GFX9: global_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
|
||||
define amdgpu_kernel void @simple_read2st64_f64_0_1(double addrspace(1)* %out) #0 {
|
||||
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
%arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
|
||||
@ -135,12 +157,16 @@ define amdgpu_kernel void @simple_read2st64_f64_0_1(double addrspace(1)* %out) #
|
||||
ret void
|
||||
}
|
||||
|
||||
; SI-LABEL: @simple_read2st64_f64_1_2
|
||||
; SI: ds_read2st64_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1 offset1:2
|
||||
; SI: s_waitcnt lgkmcnt(0)
|
||||
; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}}
|
||||
; SI: buffer_store_dwordx2 [[RESULT]]
|
||||
; SI: s_endpgm
|
||||
; GCN-LABEL: @simple_read2st64_f64_1_2
|
||||
; CI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_read2st64_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1 offset1:2
|
||||
; GCN: s_waitcnt lgkmcnt(0)
|
||||
; GCN: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}}
|
||||
|
||||
; CI: buffer_store_dwordx2 [[RESULT]]
|
||||
; GFX9: global_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
|
||||
define amdgpu_kernel void @simple_read2st64_f64_1_2(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
|
||||
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
%add.x.0 = add nsw i32 %x.i, 64
|
||||
@ -157,10 +183,13 @@ define amdgpu_kernel void @simple_read2st64_f64_1_2(double addrspace(1)* %out, d
|
||||
|
||||
; Alignment only
|
||||
|
||||
; SI-LABEL: @misaligned_read2st64_f64
|
||||
; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset1:1
|
||||
; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:128 offset1:129
|
||||
; SI: s_endpgm
|
||||
; GCN-LABEL: @misaligned_read2st64_f64
|
||||
; CI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset1:1
|
||||
; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:128 offset1:129
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @misaligned_read2st64_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
|
||||
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
%arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i
|
||||
@ -175,12 +204,16 @@ define amdgpu_kernel void @misaligned_read2st64_f64(double addrspace(1)* %out, d
|
||||
}
|
||||
|
||||
; The maximum is not the usual 0xff because 0xff * 8 * 64 > 0xffff
|
||||
; SI-LABEL: @simple_read2st64_f64_max_offset
|
||||
; SI: ds_read2st64_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:4 offset1:127
|
||||
; SI: s_waitcnt lgkmcnt(0)
|
||||
; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}}
|
||||
; SI: buffer_store_dwordx2 [[RESULT]]
|
||||
; SI: s_endpgm
|
||||
; GCN-LABEL: @simple_read2st64_f64_max_offset
|
||||
; CI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_read2st64_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:4 offset1:127
|
||||
; GCN: s_waitcnt lgkmcnt(0)
|
||||
; GCN: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}}
|
||||
|
||||
; CI: buffer_store_dwordx2 [[RESULT]]
|
||||
; GFX9: global_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
|
||||
define amdgpu_kernel void @simple_read2st64_f64_max_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
|
||||
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
%add.x.0 = add nsw i32 %x.i, 256
|
||||
@ -195,12 +228,15 @@ define amdgpu_kernel void @simple_read2st64_f64_max_offset(double addrspace(1)*
|
||||
ret void
|
||||
}
|
||||
|
||||
; SI-LABEL: @simple_read2st64_f64_over_max_offset
|
||||
; SI-NOT: ds_read2st64_b64
|
||||
; SI-DAG: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset:512
|
||||
; SI-DAG: v_add_i32_e32 [[BIGADD:v[0-9]+]], vcc, 0x10000, {{v[0-9]+}}
|
||||
; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, [[BIGADD]]
|
||||
; SI: s_endpgm
|
||||
; GCN-LABEL: @simple_read2st64_f64_over_max_offset
|
||||
; CI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN-NOT: ds_read2st64_b64
|
||||
; GCN-DAG: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset:512
|
||||
; GCN-DAG: v_add_{{(co_)?}}{{i|u}}32_e32 [[BIGADD:v[0-9]+]], vcc, 0x10000, {{v[0-9]+}}
|
||||
; GCN: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, [[BIGADD]]
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @simple_read2st64_f64_over_max_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
|
||||
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
%add.x.0 = add nsw i32 %x.i, 64
|
||||
@ -215,9 +251,12 @@ define amdgpu_kernel void @simple_read2st64_f64_over_max_offset(double addrspace
|
||||
ret void
|
||||
}
|
||||
|
||||
; SI-LABEL: @invalid_read2st64_f64_odd_offset
|
||||
; SI-NOT: ds_read2st64_b64
|
||||
; SI: s_endpgm
|
||||
; GCN-LABEL: @invalid_read2st64_f64_odd_offset
|
||||
; CI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN-NOT: ds_read2st64_b64
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @invalid_read2st64_f64_odd_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
|
||||
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
%add.x.0 = add nsw i32 %x.i, 64
|
||||
@ -235,10 +274,13 @@ define amdgpu_kernel void @invalid_read2st64_f64_odd_offset(double addrspace(1)*
|
||||
; The stride of 8 elements is 8 * 8 bytes. We need to make sure the
|
||||
; stride in elements, not bytes, is a multiple of 64.
|
||||
|
||||
; SI-LABEL: @byte_size_only_divisible_64_read2_f64
|
||||
; SI-NOT: ds_read2st_b64
|
||||
; SI: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:8
|
||||
; SI: s_endpgm
|
||||
; GCN-LABEL: @byte_size_only_divisible_64_read2_f64
|
||||
; CI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN-NOT: ds_read2st64_b64
|
||||
; GCN: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:8
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @byte_size_only_divisible_64_read2_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
|
||||
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
%arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i
|
||||
@ -252,10 +294,7 @@ define amdgpu_kernel void @byte_size_only_divisible_64_read2_f64(double addrspac
|
||||
ret void
|
||||
}
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare i32 @llvm.amdgcn.workitem.id.y() #1
|
||||
|
||||
attributes #0 = { nounwind }
|
||||
|
@ -1,14 +1,18 @@
|
||||
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -strict-whitespace -check-prefix=SI %s
|
||||
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -strict-whitespace -check-prefixes=GCN,CI %s
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -strict-whitespace -check-prefixes=GCN,GFX9 %s
|
||||
|
||||
@lds = addrspace(3) global [512 x float] undef, align 4
|
||||
@lds.f64 = addrspace(3) global [512 x double] undef, align 8
|
||||
|
||||
|
||||
; SI-LABEL: @simple_write2_one_val_f32
|
||||
; SI-DAG: buffer_load_dword [[VAL:v[0-9]+]]
|
||||
; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
|
||||
; SI: ds_write2_b32 [[VPTR]], [[VAL]], [[VAL]] offset1:8
|
||||
; SI: s_endpgm
|
||||
; GCN-LABEL: {{^}}simple_write2_one_val_f32:
|
||||
; CI-DAG: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN-DAG: {{buffer|global}}_load_dword [[VAL:v[0-9]+]]
|
||||
; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
|
||||
; GCN: ds_write2_b32 [[VPTR]], [[VAL]], [[VAL]] offset1:8
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @simple_write2_one_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
|
||||
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
%in.gep = getelementptr float, float addrspace(1)* %in, i32 %x.i
|
||||
@ -21,12 +25,19 @@ define amdgpu_kernel void @simple_write2_one_val_f32(float addrspace(1)* %C, flo
|
||||
ret void
|
||||
}
|
||||
|
||||
; SI-LABEL: @simple_write2_two_val_f32
|
||||
; SI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; SI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
|
||||
; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
|
||||
; SI: ds_write2_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:8
|
||||
; SI: s_endpgm
|
||||
; GCN-LABEL: {{^}}simple_write2_two_val_f32:
|
||||
; CI-DAG: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; CI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; CI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
|
||||
|
||||
; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off{{$}}
|
||||
; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off offset:4
|
||||
|
||||
; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
|
||||
; GCN: ds_write2_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:8
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @simple_write2_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
|
||||
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
%in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i
|
||||
@ -41,11 +52,14 @@ define amdgpu_kernel void @simple_write2_two_val_f32(float addrspace(1)* %C, flo
|
||||
ret void
|
||||
}
|
||||
|
||||
; SI-LABEL: @simple_write2_two_val_f32_volatile_0
|
||||
; SI-NOT: ds_write2_b32
|
||||
; SI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}}
|
||||
; SI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:32
|
||||
; SI: s_endpgm
|
||||
; GCN-LABEL: @simple_write2_two_val_f32_volatile_0
|
||||
; CI-DAG: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN-NOT: ds_write2_b32
|
||||
; GCN: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}}
|
||||
; GCN: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:32
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @simple_write2_two_val_f32_volatile_0(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
|
||||
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
%in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i
|
||||
@ -60,11 +74,14 @@ define amdgpu_kernel void @simple_write2_two_val_f32_volatile_0(float addrspace(
|
||||
ret void
|
||||
}
|
||||
|
||||
; SI-LABEL: @simple_write2_two_val_f32_volatile_1
|
||||
; SI-NOT: ds_write2_b32
|
||||
; SI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}}
|
||||
; SI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:32
|
||||
; SI: s_endpgm
|
||||
; GCN-LABEL: @simple_write2_two_val_f32_volatile_1
|
||||
; CI-DAG: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN-NOT: ds_write2_b32
|
||||
; GCN: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}}
|
||||
; GCN: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:32
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @simple_write2_two_val_f32_volatile_1(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
|
||||
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
%in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i
|
||||
@ -80,12 +97,19 @@ define amdgpu_kernel void @simple_write2_two_val_f32_volatile_1(float addrspace(
|
||||
}
|
||||
|
||||
; 2 data subregisters from different super registers.
|
||||
; SI-LABEL: @simple_write2_two_val_subreg2_mixed_f32
|
||||
; SI: buffer_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:{{[0-9]+\]}}
|
||||
; SI: buffer_load_dwordx2 v{{\[[0-9]+}}:[[VAL1:[0-9]+]]{{\]}}
|
||||
; SI: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
|
||||
; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8
|
||||
; SI: s_endpgm
|
||||
; GCN-LABEL: {{^}}simple_write2_two_val_subreg2_mixed_f32:
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; CI: buffer_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:{{[0-9]+\]}}
|
||||
; CI: buffer_load_dwordx2 v{{\[[0-9]+}}:[[VAL1:[0-9]+]]{{\]}}
|
||||
; CI-DAG: s_mov_b32 m0
|
||||
|
||||
; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
|
||||
|
||||
; GFX9: global_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:{{[0-9]+\]}}
|
||||
; GFX9: global_load_dwordx2 v{{\[[0-9]+}}:[[VAL1:[0-9]+]]{{\]}}
|
||||
; GCN: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @simple_write2_two_val_subreg2_mixed_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 {
|
||||
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
%in.gep.0 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in, i32 %x.i
|
||||
@ -102,11 +126,14 @@ define amdgpu_kernel void @simple_write2_two_val_subreg2_mixed_f32(float addrspa
|
||||
ret void
|
||||
}
|
||||
|
||||
; SI-LABEL: @simple_write2_two_val_subreg2_f32
|
||||
; SI-DAG: buffer_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}}
|
||||
; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
|
||||
; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8
|
||||
; SI: s_endpgm
|
||||
; GCN-LABEL: @simple_write2_two_val_subreg2_f32
|
||||
; CI-DAG: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN-DAG: {{buffer|global}}_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}}
|
||||
; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
|
||||
; GCN: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @simple_write2_two_val_subreg2_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 {
|
||||
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
%in.gep = getelementptr <2 x float>, <2 x float> addrspace(1)* %in, i32 %x.i
|
||||
@ -121,11 +148,14 @@ define amdgpu_kernel void @simple_write2_two_val_subreg2_f32(float addrspace(1)*
|
||||
ret void
|
||||
}
|
||||
|
||||
; SI-LABEL: @simple_write2_two_val_subreg4_f32
|
||||
; SI-DAG: buffer_load_dwordx4 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}}
|
||||
; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
|
||||
; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8
|
||||
; SI: s_endpgm
|
||||
; GCN-LABEL: @simple_write2_two_val_subreg4_f32
|
||||
; CI-DAG: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN-DAG: {{buffer|global}}_load_dwordx4 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}}
|
||||
; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
|
||||
; GCN: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @simple_write2_two_val_subreg4_f32(float addrspace(1)* %C, <4 x float> addrspace(1)* %in) #0 {
|
||||
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
%in.gep = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 %x.i
|
||||
@ -140,12 +170,19 @@ define amdgpu_kernel void @simple_write2_two_val_subreg4_f32(float addrspace(1)*
|
||||
ret void
|
||||
}
|
||||
|
||||
; SI-LABEL: @simple_write2_two_val_max_offset_f32
|
||||
; SI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; SI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
|
||||
; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
|
||||
; SI: ds_write2_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:255
|
||||
; SI: s_endpgm
|
||||
; GCN-LABEL: @simple_write2_two_val_max_offset_f32
|
||||
; CI-DAG: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; CI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; CI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
|
||||
|
||||
; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off{{$}}
|
||||
; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off offset:4
|
||||
|
||||
; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
|
||||
; GCN: ds_write2_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:255
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @simple_write2_two_val_max_offset_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
|
||||
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
%in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i
|
||||
@ -160,10 +197,13 @@ define amdgpu_kernel void @simple_write2_two_val_max_offset_f32(float addrspace(
|
||||
ret void
|
||||
}
|
||||
|
||||
; SI-LABEL: @simple_write2_two_val_too_far_f32
|
||||
; SI: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; SI: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:1028
|
||||
; SI: s_endpgm
|
||||
; GCN-LABEL: @simple_write2_two_val_too_far_f32
|
||||
; CI-DAG: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; GCN: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:1028
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @simple_write2_two_val_too_far_f32(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
|
||||
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
%in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i
|
||||
@ -178,10 +218,13 @@ define amdgpu_kernel void @simple_write2_two_val_too_far_f32(float addrspace(1)*
|
||||
ret void
|
||||
}
|
||||
|
||||
; SI-LABEL: @simple_write2_two_val_f32_x2
|
||||
; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL1:v[0-9]+]] offset1:8
|
||||
; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0]], [[VAL1]] offset0:11 offset1:27
|
||||
; SI: s_endpgm
|
||||
; GCN-LABEL: @simple_write2_two_val_f32_x2
|
||||
; CI-DAG: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL1:v[0-9]+]] offset1:8
|
||||
; GCN: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0]], [[VAL1]] offset0:11 offset1:27
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @simple_write2_two_val_f32_x2(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
|
||||
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
%in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %tid.x
|
||||
@ -208,10 +251,13 @@ define amdgpu_kernel void @simple_write2_two_val_f32_x2(float addrspace(1)* %C,
|
||||
ret void
|
||||
}
|
||||
|
||||
; SI-LABEL: @simple_write2_two_val_f32_x2_nonzero_base
|
||||
; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL1:v[0-9]+]] offset0:3 offset1:8
|
||||
; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0]], [[VAL1]] offset0:11 offset1:27
|
||||
; SI: s_endpgm
|
||||
; GCN-LABEL: @simple_write2_two_val_f32_x2_nonzero_base
|
||||
; CI-DAG: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL1:v[0-9]+]] offset0:3 offset1:8
|
||||
; GCN: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0]], [[VAL1]] offset0:11 offset1:27
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @simple_write2_two_val_f32_x2_nonzero_base(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
|
||||
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
%in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %tid.x
|
||||
@ -238,11 +284,14 @@ define amdgpu_kernel void @simple_write2_two_val_f32_x2_nonzero_base(float addrs
|
||||
ret void
|
||||
}
|
||||
|
||||
; SI-LABEL: @write2_ptr_subreg_arg_two_val_f32
|
||||
; SI-NOT: ds_write2_b32
|
||||
; SI: ds_write_b32
|
||||
; SI: ds_write_b32
|
||||
; SI: s_endpgm
|
||||
; GCN-LABEL: @write2_ptr_subreg_arg_two_val_f32
|
||||
; CI-DAG: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN-NOT: ds_write2_b32
|
||||
; GCN: ds_write_b32
|
||||
; GCN: ds_write_b32
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @write2_ptr_subreg_arg_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1, <2 x float addrspace(3)*> %lds.ptr) #0 {
|
||||
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
%in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i
|
||||
@ -265,11 +314,14 @@ define amdgpu_kernel void @write2_ptr_subreg_arg_two_val_f32(float addrspace(1)*
|
||||
ret void
|
||||
}
|
||||
|
||||
; SI-LABEL: @simple_write2_one_val_f64
|
||||
; SI-DAG: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]],
|
||||
; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}}
|
||||
; SI: ds_write2_b64 [[VPTR]], [[VAL]], [[VAL]] offset1:8
|
||||
; SI: s_endpgm
|
||||
; GCN-LABEL: @simple_write2_one_val_f64
|
||||
; CI-DAG: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN-DAG: {{buffer|global}}_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]],
|
||||
; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}}
|
||||
; GCN: ds_write2_b64 [[VPTR]], [[VAL]], [[VAL]] offset1:8
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 {
|
||||
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
%in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i
|
||||
@ -282,12 +334,15 @@ define amdgpu_kernel void @simple_write2_one_val_f64(double addrspace(1)* %C, do
|
||||
ret void
|
||||
}
|
||||
|
||||
; SI-LABEL: @misaligned_simple_write2_one_val_f64
|
||||
; SI-DAG: buffer_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}}
|
||||
; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}}
|
||||
; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:1
|
||||
; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset0:14 offset1:15
|
||||
; SI: s_endpgm
|
||||
; GCN-LABEL: @misaligned_simple_write2_one_val_f64
|
||||
; CI-DAG: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN-DAG: {{buffer|global}}_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}}
|
||||
; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}}
|
||||
; GCN: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:1
|
||||
; GCN: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset0:14 offset1:15
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @misaligned_simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 {
|
||||
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
%in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i
|
||||
@ -300,12 +355,20 @@ define amdgpu_kernel void @misaligned_simple_write2_one_val_f64(double addrspace
|
||||
ret void
|
||||
}
|
||||
|
||||
; SI-LABEL: @simple_write2_two_val_f64
|
||||
; SI-DAG: buffer_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8
|
||||
; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}}
|
||||
; SI: ds_write2_b64 [[VPTR]], [[VAL0]], [[VAL1]] offset1:8
|
||||
; SI: s_endpgm
|
||||
; GCN-LABEL: @simple_write2_two_val_f64
|
||||
; CI-DAG: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; CI-DAG: buffer_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; CI-DAG: buffer_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8
|
||||
|
||||
; GFX9-DAG: global_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, off{{$}}
|
||||
; GFX9-DAG: global_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, off offset:8
|
||||
|
||||
|
||||
; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}}
|
||||
; GCN: ds_write2_b64 [[VPTR]], [[VAL0]], [[VAL1]] offset1:8
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @simple_write2_two_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 {
|
||||
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
%in.gep.0 = getelementptr double, double addrspace(1)* %in, i32 %x.i
|
||||
@ -322,19 +385,25 @@ define amdgpu_kernel void @simple_write2_two_val_f64(double addrspace(1)* %C, do
|
||||
|
||||
@foo = addrspace(3) global [4 x i32] undef, align 4
|
||||
|
||||
; SI-LABEL: @store_constant_adjacent_offsets
|
||||
; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
|
||||
; SI: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
|
||||
; GCN-LABEL: @store_constant_adjacent_offsets
|
||||
; CI-DAG: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
|
||||
; GCN: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
|
||||
; Stores the constant 123 to elements 0 and 1 of @foo, two adjacent i32
; slots in addrspace(3). The check lines above expect a ds_write2_b32
; with offset1:1 for this pair of stores.
define amdgpu_kernel void @store_constant_adjacent_offsets() {
|
||||
store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
|
||||
store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 1), align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; SI-LABEL: @store_constant_disjoint_offsets
|
||||
; SI-DAG: v_mov_b32_e32 [[VAL:v[0-9]+]], 0x7b{{$}}
|
||||
; SI-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
|
||||
; SI: ds_write2_b32 [[ZERO]], [[VAL]], [[VAL]] offset1:2
|
||||
; GCN-LABEL: @store_constant_disjoint_offsets
|
||||
; CI-DAG: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN-DAG: v_mov_b32_e32 [[VAL:v[0-9]+]], 0x7b{{$}}
|
||||
; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
|
||||
; GCN: ds_write2_b32 [[ZERO]], [[VAL]], [[VAL]] offset1:2
|
||||
define amdgpu_kernel void @store_constant_disjoint_offsets() {
|
||||
store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
|
||||
store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 2), align 4
|
||||
@ -343,11 +412,14 @@ define amdgpu_kernel void @store_constant_disjoint_offsets() {
|
||||
|
||||
@bar = addrspace(3) global [4 x i64] undef, align 4
|
||||
|
||||
; SI-LABEL: @store_misaligned64_constant_offsets
|
||||
; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
|
||||
; SI-DAG: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
|
||||
; SI-DAG: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset0:2 offset1:3
|
||||
; SI: s_endpgm
|
||||
; GCN-LABEL: @store_misaligned64_constant_offsets
|
||||
; CI-DAG: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
|
||||
; GCN-DAG: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
|
||||
; GCN-DAG: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset0:2 offset1:3
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @store_misaligned64_constant_offsets() {
|
||||
store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4
|
||||
store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4
|
||||
@ -356,12 +428,15 @@ define amdgpu_kernel void @store_misaligned64_constant_offsets() {
|
||||
|
||||
@bar.large = addrspace(3) global [4096 x i64] undef, align 4
|
||||
|
||||
; SI-LABEL: @store_misaligned64_constant_large_offsets
|
||||
; SI-DAG: v_mov_b32_e32 [[BASE0:v[0-9]+]], 0x7ff8{{$}}
|
||||
; SI-DAG: v_mov_b32_e32 [[BASE1:v[0-9]+]], 0x4000{{$}}
|
||||
; SI-DAG: ds_write2_b32 [[BASE0]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
|
||||
; SI-DAG: ds_write2_b32 [[BASE1]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
|
||||
; SI: s_endpgm
|
||||
; GCN-LABEL: @store_misaligned64_constant_large_offsets
|
||||
; CI-DAG: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN-DAG: v_mov_b32_e32 [[BASE0:v[0-9]+]], 0x7ff8{{$}}
|
||||
; GCN-DAG: v_mov_b32_e32 [[BASE1:v[0-9]+]], 0x4000{{$}}
|
||||
; GCN-DAG: ds_write2_b32 [[BASE0]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
|
||||
; GCN-DAG: ds_write2_b32 [[BASE1]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @store_misaligned64_constant_large_offsets() {
|
||||
store i64 123, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 2048), align 4
|
||||
store i64 123, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 4095), align 4
|
||||
@ -406,10 +481,12 @@ define amdgpu_kernel void @write2_sgemm_sequence(float addrspace(1)* %C, i32 %ld
|
||||
ret void
|
||||
}
|
||||
|
||||
; CI-LABEL: {{^}}simple_write2_v4f32_superreg_align4:
|
||||
; CI: ds_write2_b32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} offset0:3 offset1:2{{$}}
|
||||
; CI: ds_write2_b32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} offset0:1{{$}}
|
||||
; CI: s_endpgm
|
||||
; GCN-LABEL: {{^}}simple_write2_v4f32_superreg_align4:
|
||||
; CI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_write2_b32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} offset0:2 offset1:3{{$}}
|
||||
; GCN: ds_write2_b32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} offset1:1{{$}}
|
||||
define amdgpu_kernel void @simple_write2_v4f32_superreg_align4(<4 x float> addrspace(3)* %out, <4 x float> addrspace(1)* %in) #0 {
|
||||
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
%in.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in
|
||||
@ -419,16 +496,9 @@ define amdgpu_kernel void @simple_write2_v4f32_superreg_align4(<4 x float> addrs
|
||||
ret void
|
||||
}
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare i32 @llvm.amdgcn.workgroup.id.x() #1
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare i32 @llvm.amdgcn.workgroup.id.y() #1
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare i32 @llvm.amdgcn.workitem.id.y() #1
|
||||
|
||||
attributes #0 = { nounwind }
|
||||
|
@ -1,12 +1,16 @@
|
||||
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -check-prefix=SI %s
|
||||
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
|
||||
|
||||
@lds = addrspace(3) global [512 x float] undef, align 4
|
||||
|
||||
; SI-LABEL: @simple_write2st64_one_val_f32_0_1
|
||||
; SI-DAG: buffer_load_dword [[VAL:v[0-9]+]]
|
||||
; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
|
||||
; SI: ds_write2st64_b32 [[VPTR]], [[VAL]], [[VAL]] offset1:1
|
||||
; SI: s_endpgm
|
||||
; GCN-LABEL: @simple_write2st64_one_val_f32_0_1
|
||||
; CI-DAG: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN-DAG: {{buffer|global}}_load_dword [[VAL:v[0-9]+]]
|
||||
; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
|
||||
; GCN: ds_write2st64_b32 [[VPTR]], [[VAL]], [[VAL]] offset1:1
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @simple_write2st64_one_val_f32_0_1(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
|
||||
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
%in.gep = getelementptr float, float addrspace(1)* %in, i32 %x.i
|
||||
@ -19,12 +23,20 @@ define amdgpu_kernel void @simple_write2st64_one_val_f32_0_1(float addrspace(1)*
|
||||
ret void
|
||||
}
|
||||
|
||||
; SI-LABEL: @simple_write2st64_two_val_f32_2_5
|
||||
; SI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; SI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
|
||||
; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
|
||||
; SI: ds_write2st64_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset0:2 offset1:5
|
||||
; SI: s_endpgm
|
||||
; GCN-LABEL: @simple_write2st64_two_val_f32_2_5
|
||||
; CI-DAG: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; CI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; CI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
|
||||
|
||||
; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off{{$}}
|
||||
; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off offset:4
|
||||
|
||||
|
||||
; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
|
||||
; GCN: ds_write2st64_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset0:2 offset1:5
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @simple_write2st64_two_val_f32_2_5(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
|
||||
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
%in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i
|
||||
@ -40,12 +52,20 @@ define amdgpu_kernel void @simple_write2st64_two_val_f32_2_5(float addrspace(1)*
|
||||
ret void
|
||||
}
|
||||
|
||||
; SI-LABEL: @simple_write2st64_two_val_max_offset_f32
|
||||
; SI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; SI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
|
||||
; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
|
||||
; SI: ds_write2st64_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:255
|
||||
; SI: s_endpgm
|
||||
; GCN-LABEL: @simple_write2st64_two_val_max_offset_f32
|
||||
; CI-DAG: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; CI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; CI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
|
||||
|
||||
; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off{{$}}
|
||||
; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off offset:4
|
||||
|
||||
; GCN-DAG: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 2, v{{[0-9]+}}
|
||||
; GCN: v_add{{(_co)?}}_{{i|u}}32_e32 [[VPTR:v[0-9]+]], vcc, s{{[0-9]+}}, [[SHL]]
|
||||
; GCN: ds_write2st64_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:255
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @simple_write2st64_two_val_max_offset_f32(float addrspace(1)* %C, float addrspace(1)* %in, float addrspace(3)* %lds) #0 {
|
||||
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
%in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i
|
||||
@ -60,12 +80,20 @@ define amdgpu_kernel void @simple_write2st64_two_val_max_offset_f32(float addrsp
|
||||
ret void
|
||||
}
|
||||
|
||||
; SI-LABEL: @simple_write2st64_two_val_max_offset_f64
|
||||
; SI-DAG: buffer_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8
|
||||
; SI-DAG: v_add_i32_e32 [[VPTR:v[0-9]+]],
|
||||
; SI: ds_write2st64_b64 [[VPTR]], [[VAL0]], [[VAL1]] offset0:4 offset1:127
|
||||
; SI: s_endpgm
|
||||
; GCN-LABEL: @simple_write2st64_two_val_max_offset_f64
|
||||
; CI-DAG: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; CI-DAG: buffer_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; CI-DAG: buffer_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8
|
||||
|
||||
; GFX9-DAG: global_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, off{{$}}
|
||||
; GFX9-DAG: global_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, off offset:8
|
||||
|
||||
; GCN-DAG: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 3, v{{[0-9]+}}
|
||||
; GCN: v_add_{{(co_)?}}{{i|u}}32_e32 [[VPTR:v[0-9]+]], vcc, s{{[0-9]+}}, [[SHL]]
|
||||
; GCN: ds_write2st64_b64 [[VPTR]], [[VAL0]], [[VAL1]] offset0:4 offset1:127
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @simple_write2st64_two_val_max_offset_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 {
|
||||
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
%in.gep.0 = getelementptr double, double addrspace(1)* %in, i32 %x.i
|
||||
@ -81,10 +109,13 @@ define amdgpu_kernel void @simple_write2st64_two_val_max_offset_f64(double addrs
|
||||
ret void
|
||||
}
|
||||
|
||||
; SI-LABEL: @byte_size_only_divisible_64_write2st64_f64
|
||||
; SI-NOT: ds_write2st64_b64
|
||||
; SI: ds_write2_b64 {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset1:8
|
||||
; SI: s_endpgm
|
||||
; GCN-LABEL: @byte_size_only_divisible_64_write2st64_f64
|
||||
; CI-DAG: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN-NOT: ds_write2st64_b64
|
||||
; GCN: ds_write2_b64 {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset1:8
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @byte_size_only_divisible_64_write2st64_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 {
|
||||
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
%in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i
|
||||
@ -97,10 +128,7 @@ define amdgpu_kernel void @byte_size_only_divisible_64_write2st64_f64(double add
|
||||
ret void
|
||||
}
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare i32 @llvm.amdgcn.workitem.id.y() #1
|
||||
|
||||
attributes #0 = { nounwind }
|
||||
|
@ -16,7 +16,7 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata)
|
||||
; CHECK: ReservedNumVGPRs: 4
|
||||
; GFX700: ReservedFirstVGPR: 8
|
||||
; GFX800: ReservedFirstVGPR: 8
|
||||
; GFX900: ReservedFirstVGPR: 11
|
||||
; GFX900: ReservedFirstVGPR: 10
|
||||
; CHECK: PrivateSegmentBufferSGPR: 0
|
||||
; CHECK: WavefrontPrivateSegmentOffsetSGPR: 11
|
||||
define amdgpu_kernel void @test(i32 addrspace(1)* %A) #0 !dbg !7 !kernel_arg_addr_space !12 !kernel_arg_access_qual !13 !kernel_arg_type !14 !kernel_arg_base_type !14 !kernel_arg_type_qual !15 {
|
||||
|
@ -1,7 +1,7 @@
|
||||
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=MOVREL %s
|
||||
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=MOVREL %s
|
||||
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-vgpr-index-mode -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=IDXMODE %s
|
||||
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=IDXMODE %s
|
||||
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MOVREL,PREGFX9 %s
|
||||
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MOVREL,PREGFX9 %s
|
||||
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-vgpr-index-mode -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,IDXMODE,PREGFX9 %s
|
||||
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,IDXMODE,GFX9 %s
|
||||
|
||||
; Tests for indirect addressing on SI, which is implemented using dynamic
|
||||
; indexing of vectors.
|
||||
@ -603,7 +603,8 @@ bb7: ; preds = %bb4, %bb1
|
||||
; IDXMODE: v_mov_b32_e32 v[[VEC0_ELT2]], -4.0
|
||||
; IDXMODE: s_set_gpr_idx_off
|
||||
|
||||
; GCN: s_mov_b32 m0, -1
|
||||
; PREGFX9: s_mov_b32 m0, -1
|
||||
; GFX9-NOT: s_mov_b32 m0
|
||||
; GCN: ds_write_b32
|
||||
; GCN: ds_write_b32
|
||||
; GCN: s_endpgm
|
||||
|
@ -14,6 +14,8 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
|
||||
; Make sure no crash on invalid non-constant
|
||||
; GCN-LABEL: {{^}}invalid_variable_order_lds_atomic_dec_ret_i32:
|
||||
; CIVI-DAG: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
define amdgpu_kernel void @invalid_variable_order_lds_atomic_dec_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %order.var) #0 {
|
||||
%result = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %ptr, i32 42, i32 %order.var, i32 0, i1 false)
|
||||
store i32 %result, i32 addrspace(1)* %out
|
||||
@ -22,6 +24,8 @@ define amdgpu_kernel void @invalid_variable_order_lds_atomic_dec_ret_i32(i32 add
|
||||
|
||||
; Make sure no crash on invalid non-constant
|
||||
; GCN-LABEL: {{^}}invalid_variable_scope_lds_atomic_dec_ret_i32:
|
||||
; CIVI-DAG: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
define amdgpu_kernel void @invalid_variable_scope_lds_atomic_dec_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %scope.var) #0 {
|
||||
%result = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %ptr, i32 42, i32 0, i32 %scope.var, i1 false)
|
||||
store i32 %result, i32 addrspace(1)* %out
|
||||
@ -37,7 +41,10 @@ define amdgpu_kernel void @invalid_variable_volatile_lds_atomic_dec_ret_i32(i32
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}lds_atomic_dec_ret_i32:
|
||||
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
|
||||
; CIVI-DAG: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 42
|
||||
; GCN: ds_dec_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[K]]
|
||||
define amdgpu_kernel void @lds_atomic_dec_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) #0 {
|
||||
%result = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %ptr, i32 42, i32 0, i32 0, i1 false)
|
||||
@ -46,7 +53,10 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32(i32 addrspace(1)* %out, i32 ad
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}lds_atomic_dec_ret_i32_offset:
|
||||
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
|
||||
; CIVI-DAG: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 42
|
||||
; GCN: ds_dec_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[K]] offset:16
|
||||
define amdgpu_kernel void @lds_atomic_dec_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) #0 {
|
||||
%gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
|
||||
@ -56,9 +66,12 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32_offset(i32 addrspace(1)* %out,
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}lds_atomic_dec_noret_i32:
|
||||
; GCN: s_load_dword [[SPTR:s[0-9]+]],
|
||||
; GCN: v_mov_b32_e32 [[DATA:v[0-9]+]], 4
|
||||
; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
|
||||
; CIVI-DAG: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN-DAG: s_load_dword [[SPTR:s[0-9]+]],
|
||||
; GCN-DAG: v_mov_b32_e32 [[DATA:v[0-9]+]], 4
|
||||
; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
|
||||
; GCN: ds_dec_u32 [[VPTR]], [[DATA]]
|
||||
define amdgpu_kernel void @lds_atomic_dec_noret_i32(i32 addrspace(3)* %ptr) nounwind {
|
||||
%result = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %ptr, i32 42, i32 0, i32 0, i1 false)
|
||||
@ -66,7 +79,10 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i32(i32 addrspace(3)* %ptr) noun
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}lds_atomic_dec_noret_i32_offset:
|
||||
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
|
||||
; CIVI-DAG: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 42
|
||||
; GCN: ds_dec_u32 v{{[0-9]+}}, [[K]] offset:16
|
||||
define amdgpu_kernel void @lds_atomic_dec_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
|
||||
%gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
|
||||
@ -277,7 +293,10 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(i64 addrspace
|
||||
@lds0 = addrspace(3) global [512 x i32] undef
|
||||
|
||||
; GCN-LABEL: {{^}}atomic_dec_shl_base_lds_0:
|
||||
; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
|
||||
; CIVI-DAG: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN-DAG: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
|
||||
; GCN: ds_dec_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
|
||||
define amdgpu_kernel void @atomic_dec_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
|
||||
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
@ -290,6 +309,9 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0(i32 addrspace(1)* %out, i32
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}lds_atomic_dec_ret_i64:
|
||||
; CIVI-DAG: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
|
||||
; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
|
||||
; GCN: ds_dec_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}{{$}}
|
||||
@ -300,6 +322,9 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64(i64 addrspace(1)* %out, i64 ad
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}lds_atomic_dec_ret_i64_offset:
|
||||
; CIVI-DAG: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
|
||||
; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
|
||||
; GCN: ds_dec_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} offset:32
|
||||
@ -311,6 +336,9 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64_offset(i64 addrspace(1)* %out,
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}lds_atomic_dec_noret_i64:
|
||||
; CIVI-DAG: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
|
||||
; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
|
||||
; GCN: ds_dec_u64 v{{[0-9]+}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}{{$}}
|
||||
@ -320,6 +348,9 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i64(i64 addrspace(3)* %ptr) noun
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}lds_atomic_dec_noret_i64_offset:
|
||||
; CIVI-DAG: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
|
||||
; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
|
||||
; GCN: ds_dec_u64 v{{[0-9]+}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} offset:32{{$}}
|
||||
@ -406,7 +437,10 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(i64 addrspa
|
||||
@lds1 = addrspace(3) global [512 x i64] undef, align 8
|
||||
|
||||
; GCN-LABEL: {{^}}atomic_dec_shl_base_lds_0_i64:
|
||||
; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 3, {{v[0-9]+}}
|
||||
; CIVI-DAG: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN-DAG: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 3, {{v[0-9]+}}
|
||||
; GCN: ds_dec_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]], v{{\[[0-9]+:[0-9]+\]}} offset:16
|
||||
define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
|
||||
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
|
@ -13,7 +13,10 @@ declare i64 @llvm.amdgcn.atomic.inc.i64.p4i64(i64 addrspace(4)* nocapture, i64,
|
||||
declare i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
|
||||
; GCN-LABEL: {{^}}lds_atomic_inc_ret_i32:
|
||||
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
|
||||
; CIVI-DAG: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 42
|
||||
; GCN: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[K]]
|
||||
define amdgpu_kernel void @lds_atomic_inc_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) #0 {
|
||||
%result = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %ptr, i32 42, i32 0, i32 0, i1 false)
|
||||
@ -22,7 +25,10 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32(i32 addrspace(1)* %out, i32 ad
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}lds_atomic_inc_ret_i32_offset:
|
||||
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
|
||||
; CIVI-DAG: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 42
|
||||
; GCN: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[K]] offset:16
|
||||
define amdgpu_kernel void @lds_atomic_inc_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) #0 {
|
||||
%gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
|
||||
@ -32,9 +38,12 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32_offset(i32 addrspace(1)* %out,
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}lds_atomic_inc_noret_i32:
|
||||
; GCN: s_load_dword [[SPTR:s[0-9]+]],
|
||||
; GCN: v_mov_b32_e32 [[DATA:v[0-9]+]], 4
|
||||
; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
|
||||
; CIVI-DAG: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN-DAG: s_load_dword [[SPTR:s[0-9]+]],
|
||||
; GCN-DAG: v_mov_b32_e32 [[DATA:v[0-9]+]], 4
|
||||
; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
|
||||
; GCN: ds_inc_u32 [[VPTR]], [[DATA]]
|
||||
define amdgpu_kernel void @lds_atomic_inc_noret_i32(i32 addrspace(3)* %ptr) nounwind {
|
||||
%result = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %ptr, i32 42, i32 0, i32 0, i1 false)
|
||||
@ -42,7 +51,10 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i32(i32 addrspace(3)* %ptr) noun
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}lds_atomic_inc_noret_i32_offset:
|
||||
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
|
||||
; CIVI-DAG: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 42
|
||||
; GCN: ds_inc_u32 v{{[0-9]+}}, [[K]] offset:16
|
||||
define amdgpu_kernel void @lds_atomic_inc_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
|
||||
%gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
|
||||
|
@ -69,7 +69,6 @@ entry:
|
||||
; FIXME: Remove m0 initialization
|
||||
; GCN-LABEL: {{^}}load_local_hi_v2i16_zerolo_shift:
|
||||
; GCN: s_waitcnt
|
||||
; GFX9-NEXT: s_mov_b32 m0, -1
|
||||
; GFX9-NEXT: ds_read_u16 v0, v0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
|
||||
@ -563,7 +562,6 @@ entry:
|
||||
; FIXME: Is there a cost to using the extload over not?
|
||||
; GCN-LABEL: {{^}}load_local_v2i16_split:
|
||||
; GCN: s_waitcnt
|
||||
; GFX9-NEXT: s_mov_b32 m0, -1
|
||||
; GFX9-NEXT: ds_read_u16 v1, v0
|
||||
; GFX9-NEXT: s_waitcnt
|
||||
; GFX9-NEXT: ds_read_u16_d16_hi v1, v0 offset:2
|
||||
|
@ -1,9 +1,10 @@
|
||||
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
|
||||
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
|
||||
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
|
||||
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
|
||||
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
|
||||
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefixes=EG,FUNC %s
|
||||
|
||||
; FUNC-LABEL: {{^}}load_f32_local:
|
||||
; GCN: s_mov_b32 m0
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
; GCN: ds_read_b32
|
||||
|
||||
; EG: LDS_READ_RET
|
||||
@ -15,7 +16,9 @@ entry:
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}load_v2f32_local:
|
||||
; GCN: s_mov_b32 m0
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_read_b64
|
||||
|
||||
; EG: LDS_READ_RET
|
||||
@ -29,6 +32,9 @@ entry:
|
||||
|
||||
; FIXME: should this do a read2_b64?
|
||||
; FUNC-LABEL: {{^}}local_load_v3f32:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN-DAG: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:8
|
||||
; GCN-DAG: ds_read_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+$}}
|
||||
; GCN: s_waitcnt
|
||||
@ -46,6 +52,9 @@ entry:
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_load_v4f32:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_read2_b64
|
||||
|
||||
; EG: LDS_READ_RET
|
||||
@ -60,6 +69,9 @@ entry:
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_load_v8f32:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_read2_b64
|
||||
; GCN: ds_read2_b64
|
||||
|
||||
@ -79,6 +91,9 @@ entry:
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_load_v16f32:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_read2_b64
|
||||
; GCN: ds_read2_b64
|
||||
; GCN: ds_read2_b64
|
||||
|
@ -1,9 +1,13 @@
|
||||
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
|
||||
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
|
||||
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
|
||||
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
|
||||
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
|
||||
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
|
||||
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,FUNC %s
|
||||
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefixes=EG,FUNC %s
|
||||
|
||||
; FUNC-LABEL: {{^}}local_load_f64:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_read_b64 [[VAL:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}{{$}}
|
||||
; GCN: ds_write_b64 v{{[0-9]+}}, [[VAL]]
|
||||
|
||||
@ -16,6 +20,9 @@ define amdgpu_kernel void @local_load_f64(double addrspace(3)* %out, double addr
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_load_v2f64:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_read2_b64
|
||||
|
||||
; EG: LDS_READ_RET
|
||||
@ -30,6 +37,9 @@ entry:
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_load_v3f64:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN-DAG: ds_read2_b64
|
||||
; GCN-DAG: ds_read_b64
|
||||
|
||||
@ -47,6 +57,9 @@ entry:
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_load_v4f64:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_read2_b64
|
||||
; GCN: ds_read2_b64
|
||||
|
||||
@ -67,6 +80,9 @@ entry:
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_load_v8f64:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_read2_b64
|
||||
; GCN: ds_read2_b64
|
||||
; GCN: ds_read2_b64
|
||||
@ -96,6 +112,9 @@ entry:
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_load_v16f64:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_read2_b64
|
||||
; GCN: ds_read2_b64
|
||||
; GCN: ds_read2_b64
|
||||
|
@ -1,8 +1,12 @@
|
||||
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
|
||||
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
|
||||
; RUN: llc -march=r600 -mtriple=r600---amdgiz -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
|
||||
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
|
||||
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
|
||||
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,FUNC %s
|
||||
; RUN: llc -march=r600 -mtriple=r600---amdgiz -mcpu=cypress < %s | FileCheck -check-prefixes=EG,FUNC %s
|
||||
|
||||
; FUNC-LABEL: {{^}}local_load_i1:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_read_u8
|
||||
; GCN: v_and_b32_e32 v{{[0-9]+}}, 1
|
||||
; GCN: ds_write_b8
|
||||
@ -17,6 +21,8 @@ define amdgpu_kernel void @local_load_i1(i1 addrspace(3)* %out, i1 addrspace(3)*
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_load_v2i1:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
define amdgpu_kernel void @local_load_v2i1(<2 x i1> addrspace(3)* %out, <2 x i1> addrspace(3)* %in) #0 {
|
||||
%load = load <2 x i1>, <2 x i1> addrspace(3)* %in
|
||||
store <2 x i1> %load, <2 x i1> addrspace(3)* %out
|
||||
@ -24,6 +30,8 @@ define amdgpu_kernel void @local_load_v2i1(<2 x i1> addrspace(3)* %out, <2 x i1>
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_load_v3i1:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
define amdgpu_kernel void @local_load_v3i1(<3 x i1> addrspace(3)* %out, <3 x i1> addrspace(3)* %in) #0 {
|
||||
%load = load <3 x i1>, <3 x i1> addrspace(3)* %in
|
||||
store <3 x i1> %load, <3 x i1> addrspace(3)* %out
|
||||
@ -31,6 +39,8 @@ define amdgpu_kernel void @local_load_v3i1(<3 x i1> addrspace(3)* %out, <3 x i1>
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_load_v4i1:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
define amdgpu_kernel void @local_load_v4i1(<4 x i1> addrspace(3)* %out, <4 x i1> addrspace(3)* %in) #0 {
|
||||
%load = load <4 x i1>, <4 x i1> addrspace(3)* %in
|
||||
store <4 x i1> %load, <4 x i1> addrspace(3)* %out
|
||||
@ -38,6 +48,8 @@ define amdgpu_kernel void @local_load_v4i1(<4 x i1> addrspace(3)* %out, <4 x i1>
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_load_v8i1:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
define amdgpu_kernel void @local_load_v8i1(<8 x i1> addrspace(3)* %out, <8 x i1> addrspace(3)* %in) #0 {
|
||||
%load = load <8 x i1>, <8 x i1> addrspace(3)* %in
|
||||
store <8 x i1> %load, <8 x i1> addrspace(3)* %out
|
||||
@ -45,6 +57,8 @@ define amdgpu_kernel void @local_load_v8i1(<8 x i1> addrspace(3)* %out, <8 x i1>
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_load_v16i1:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
define amdgpu_kernel void @local_load_v16i1(<16 x i1> addrspace(3)* %out, <16 x i1> addrspace(3)* %in) #0 {
|
||||
%load = load <16 x i1>, <16 x i1> addrspace(3)* %in
|
||||
store <16 x i1> %load, <16 x i1> addrspace(3)* %out
|
||||
@ -52,6 +66,8 @@ define amdgpu_kernel void @local_load_v16i1(<16 x i1> addrspace(3)* %out, <16 x
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_load_v32i1:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
define amdgpu_kernel void @local_load_v32i1(<32 x i1> addrspace(3)* %out, <32 x i1> addrspace(3)* %in) #0 {
|
||||
%load = load <32 x i1>, <32 x i1> addrspace(3)* %in
|
||||
store <32 x i1> %load, <32 x i1> addrspace(3)* %out
|
||||
@ -59,6 +75,8 @@ define amdgpu_kernel void @local_load_v32i1(<32 x i1> addrspace(3)* %out, <32 x
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_load_v64i1:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
define amdgpu_kernel void @local_load_v64i1(<64 x i1> addrspace(3)* %out, <64 x i1> addrspace(3)* %in) #0 {
|
||||
%load = load <64 x i1>, <64 x i1> addrspace(3)* %in
|
||||
store <64 x i1> %load, <64 x i1> addrspace(3)* %out
|
||||
@ -66,6 +84,9 @@ define amdgpu_kernel void @local_load_v64i1(<64 x i1> addrspace(3)* %out, <64 x
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_zextload_i1_to_i32:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_read_u8
|
||||
; GCN: ds_write_b32
|
||||
define amdgpu_kernel void @local_zextload_i1_to_i32(i32 addrspace(3)* %out, i1 addrspace(3)* %in) #0 {
|
||||
@ -76,6 +97,9 @@ define amdgpu_kernel void @local_zextload_i1_to_i32(i32 addrspace(3)* %out, i1 a
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_sextload_i1_to_i32:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_read_u8
|
||||
; GCN: v_bfe_i32 {{v[0-9]+}}, {{v[0-9]+}}, 0, 1{{$}}
|
||||
; GCN: ds_write_b32
|
||||
@ -90,6 +114,8 @@ define amdgpu_kernel void @local_sextload_i1_to_i32(i32 addrspace(3)* %out, i1 a
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_zextload_v1i1_to_v1i32:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
define amdgpu_kernel void @local_zextload_v1i1_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i1> addrspace(3)* %in) #0 {
|
||||
%load = load <1 x i1>, <1 x i1> addrspace(3)* %in
|
||||
%ext = zext <1 x i1> %load to <1 x i32>
|
||||
@ -98,6 +124,8 @@ define amdgpu_kernel void @local_zextload_v1i1_to_v1i32(<1 x i32> addrspace(3)*
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_sextload_v1i1_to_v1i32:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
define amdgpu_kernel void @local_sextload_v1i1_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i1> addrspace(3)* %in) #0 {
|
||||
%load = load <1 x i1>, <1 x i1> addrspace(3)* %in
|
||||
%ext = sext <1 x i1> %load to <1 x i32>
|
||||
@ -106,6 +134,8 @@ define amdgpu_kernel void @local_sextload_v1i1_to_v1i32(<1 x i32> addrspace(3)*
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_zextload_v2i1_to_v2i32:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
define amdgpu_kernel void @local_zextload_v2i1_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i1> addrspace(3)* %in) #0 {
|
||||
%load = load <2 x i1>, <2 x i1> addrspace(3)* %in
|
||||
%ext = zext <2 x i1> %load to <2 x i32>
|
||||
@ -114,6 +144,8 @@ define amdgpu_kernel void @local_zextload_v2i1_to_v2i32(<2 x i32> addrspace(3)*
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_sextload_v2i1_to_v2i32:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
define amdgpu_kernel void @local_sextload_v2i1_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i1> addrspace(3)* %in) #0 {
|
||||
%load = load <2 x i1>, <2 x i1> addrspace(3)* %in
|
||||
%ext = sext <2 x i1> %load to <2 x i32>
|
||||
@ -122,6 +154,8 @@ define amdgpu_kernel void @local_sextload_v2i1_to_v2i32(<2 x i32> addrspace(3)*
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_zextload_v3i1_to_v3i32:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
define amdgpu_kernel void @local_zextload_v3i1_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i1> addrspace(3)* %in) #0 {
|
||||
%load = load <3 x i1>, <3 x i1> addrspace(3)* %in
|
||||
%ext = zext <3 x i1> %load to <3 x i32>
|
||||
@ -130,6 +164,8 @@ define amdgpu_kernel void @local_zextload_v3i1_to_v3i32(<3 x i32> addrspace(3)*
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_sextload_v3i1_to_v3i32:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
define amdgpu_kernel void @local_sextload_v3i1_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i1> addrspace(3)* %in) #0 {
|
||||
%load = load <3 x i1>, <3 x i1> addrspace(3)* %in
|
||||
%ext = sext <3 x i1> %load to <3 x i32>
|
||||
@ -138,6 +174,8 @@ define amdgpu_kernel void @local_sextload_v3i1_to_v3i32(<3 x i32> addrspace(3)*
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_zextload_v4i1_to_v4i32:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
define amdgpu_kernel void @local_zextload_v4i1_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i1> addrspace(3)* %in) #0 {
|
||||
%load = load <4 x i1>, <4 x i1> addrspace(3)* %in
|
||||
%ext = zext <4 x i1> %load to <4 x i32>
|
||||
@ -146,6 +184,8 @@ define amdgpu_kernel void @local_zextload_v4i1_to_v4i32(<4 x i32> addrspace(3)*
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_sextload_v4i1_to_v4i32:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
define amdgpu_kernel void @local_sextload_v4i1_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i1> addrspace(3)* %in) #0 {
|
||||
%load = load <4 x i1>, <4 x i1> addrspace(3)* %in
|
||||
%ext = sext <4 x i1> %load to <4 x i32>
|
||||
@ -154,6 +194,8 @@ define amdgpu_kernel void @local_sextload_v4i1_to_v4i32(<4 x i32> addrspace(3)*
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_zextload_v8i1_to_v8i32:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
define amdgpu_kernel void @local_zextload_v8i1_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i1> addrspace(3)* %in) #0 {
|
||||
%load = load <8 x i1>, <8 x i1> addrspace(3)* %in
|
||||
%ext = zext <8 x i1> %load to <8 x i32>
|
||||
@ -162,6 +204,8 @@ define amdgpu_kernel void @local_zextload_v8i1_to_v8i32(<8 x i32> addrspace(3)*
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_sextload_v8i1_to_v8i32:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
define amdgpu_kernel void @local_sextload_v8i1_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i1> addrspace(3)* %in) #0 {
|
||||
%load = load <8 x i1>, <8 x i1> addrspace(3)* %in
|
||||
%ext = sext <8 x i1> %load to <8 x i32>
|
||||
@ -170,6 +214,8 @@ define amdgpu_kernel void @local_sextload_v8i1_to_v8i32(<8 x i32> addrspace(3)*
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_zextload_v16i1_to_v16i32:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
define amdgpu_kernel void @local_zextload_v16i1_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i1> addrspace(3)* %in) #0 {
|
||||
%load = load <16 x i1>, <16 x i1> addrspace(3)* %in
|
||||
%ext = zext <16 x i1> %load to <16 x i32>
|
||||
@ -178,6 +224,8 @@ define amdgpu_kernel void @local_zextload_v16i1_to_v16i32(<16 x i32> addrspace(3
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_sextload_v16i1_to_v16i32:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
define amdgpu_kernel void @local_sextload_v16i1_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i1> addrspace(3)* %in) #0 {
|
||||
%load = load <16 x i1>, <16 x i1> addrspace(3)* %in
|
||||
%ext = sext <16 x i1> %load to <16 x i32>
|
||||
@ -186,6 +234,8 @@ define amdgpu_kernel void @local_sextload_v16i1_to_v16i32(<16 x i32> addrspace(3
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_zextload_v32i1_to_v32i32:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
define amdgpu_kernel void @local_zextload_v32i1_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i1> addrspace(3)* %in) #0 {
|
||||
%load = load <32 x i1>, <32 x i1> addrspace(3)* %in
|
||||
%ext = zext <32 x i1> %load to <32 x i32>
|
||||
@ -194,6 +244,8 @@ define amdgpu_kernel void @local_zextload_v32i1_to_v32i32(<32 x i32> addrspace(3
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_sextload_v32i1_to_v32i32:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
define amdgpu_kernel void @local_sextload_v32i1_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i1> addrspace(3)* %in) #0 {
|
||||
%load = load <32 x i1>, <32 x i1> addrspace(3)* %in
|
||||
%ext = sext <32 x i1> %load to <32 x i32>
|
||||
@ -202,6 +254,8 @@ define amdgpu_kernel void @local_sextload_v32i1_to_v32i32(<32 x i32> addrspace(3
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_zextload_v64i1_to_v64i32:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
define amdgpu_kernel void @local_zextload_v64i1_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i1> addrspace(3)* %in) #0 {
|
||||
%load = load <64 x i1>, <64 x i1> addrspace(3)* %in
|
||||
%ext = zext <64 x i1> %load to <64 x i32>
|
||||
@ -210,6 +264,8 @@ define amdgpu_kernel void @local_zextload_v64i1_to_v64i32(<64 x i32> addrspace(3
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_sextload_v64i1_to_v64i32:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
define amdgpu_kernel void @local_sextload_v64i1_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i1> addrspace(3)* %in) #0 {
|
||||
%load = load <64 x i1>, <64 x i1> addrspace(3)* %in
|
||||
%ext = sext <64 x i1> %load to <64 x i32>
|
||||
@ -218,6 +274,9 @@ define amdgpu_kernel void @local_sextload_v64i1_to_v64i32(<64 x i32> addrspace(3
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_zextload_i1_to_i64:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN-DAG: ds_read_u8 [[LOAD:v[0-9]+]],
|
||||
; GCN-DAG: v_mov_b32_e32 {{v[0-9]+}}, 0{{$}}
|
||||
; GCN: ds_write_b64
|
||||
@ -229,6 +288,9 @@ define amdgpu_kernel void @local_zextload_i1_to_i64(i64 addrspace(3)* %out, i1 a
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_sextload_i1_to_i64:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_read_u8 [[LOAD:v[0-9]+]],
|
||||
; GCN: v_bfe_i32 [[BFE:v[0-9]+]], {{v[0-9]+}}, 0, 1{{$}}
|
||||
; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, [[BFE]]
|
||||
@ -241,6 +303,8 @@ define amdgpu_kernel void @local_sextload_i1_to_i64(i64 addrspace(3)* %out, i1 a
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_zextload_v1i1_to_v1i64:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
define amdgpu_kernel void @local_zextload_v1i1_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i1> addrspace(3)* %in) #0 {
|
||||
%load = load <1 x i1>, <1 x i1> addrspace(3)* %in
|
||||
%ext = zext <1 x i1> %load to <1 x i64>
|
||||
@ -249,6 +313,8 @@ define amdgpu_kernel void @local_zextload_v1i1_to_v1i64(<1 x i64> addrspace(3)*
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_sextload_v1i1_to_v1i64:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
define amdgpu_kernel void @local_sextload_v1i1_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i1> addrspace(3)* %in) #0 {
|
||||
%load = load <1 x i1>, <1 x i1> addrspace(3)* %in
|
||||
%ext = sext <1 x i1> %load to <1 x i64>
|
||||
@ -257,6 +323,8 @@ define amdgpu_kernel void @local_sextload_v1i1_to_v1i64(<1 x i64> addrspace(3)*
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_zextload_v2i1_to_v2i64:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
define amdgpu_kernel void @local_zextload_v2i1_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i1> addrspace(3)* %in) #0 {
|
||||
%load = load <2 x i1>, <2 x i1> addrspace(3)* %in
|
||||
%ext = zext <2 x i1> %load to <2 x i64>
|
||||
@ -265,6 +333,8 @@ define amdgpu_kernel void @local_zextload_v2i1_to_v2i64(<2 x i64> addrspace(3)*
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_sextload_v2i1_to_v2i64:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
define amdgpu_kernel void @local_sextload_v2i1_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i1> addrspace(3)* %in) #0 {
|
||||
%load = load <2 x i1>, <2 x i1> addrspace(3)* %in
|
||||
%ext = sext <2 x i1> %load to <2 x i64>
|
||||
@ -273,6 +343,8 @@ define amdgpu_kernel void @local_sextload_v2i1_to_v2i64(<2 x i64> addrspace(3)*
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_zextload_v3i1_to_v3i64:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
define amdgpu_kernel void @local_zextload_v3i1_to_v3i64(<3 x i64> addrspace(3)* %out, <3 x i1> addrspace(3)* %in) #0 {
|
||||
%load = load <3 x i1>, <3 x i1> addrspace(3)* %in
|
||||
%ext = zext <3 x i1> %load to <3 x i64>
|
||||
@ -281,6 +353,8 @@ define amdgpu_kernel void @local_zextload_v3i1_to_v3i64(<3 x i64> addrspace(3)*
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_sextload_v3i1_to_v3i64:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
define amdgpu_kernel void @local_sextload_v3i1_to_v3i64(<3 x i64> addrspace(3)* %out, <3 x i1> addrspace(3)* %in) #0 {
|
||||
%load = load <3 x i1>, <3 x i1> addrspace(3)* %in
|
||||
%ext = sext <3 x i1> %load to <3 x i64>
|
||||
@ -289,6 +363,8 @@ define amdgpu_kernel void @local_sextload_v3i1_to_v3i64(<3 x i64> addrspace(3)*
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_zextload_v4i1_to_v4i64:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
define amdgpu_kernel void @local_zextload_v4i1_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i1> addrspace(3)* %in) #0 {
|
||||
%load = load <4 x i1>, <4 x i1> addrspace(3)* %in
|
||||
%ext = zext <4 x i1> %load to <4 x i64>
|
||||
@ -297,6 +373,8 @@ define amdgpu_kernel void @local_zextload_v4i1_to_v4i64(<4 x i64> addrspace(3)*
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_sextload_v4i1_to_v4i64:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
define amdgpu_kernel void @local_sextload_v4i1_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i1> addrspace(3)* %in) #0 {
|
||||
%load = load <4 x i1>, <4 x i1> addrspace(3)* %in
|
||||
%ext = sext <4 x i1> %load to <4 x i64>
|
||||
@ -305,6 +383,8 @@ define amdgpu_kernel void @local_sextload_v4i1_to_v4i64(<4 x i64> addrspace(3)*
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_zextload_v8i1_to_v8i64:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
define amdgpu_kernel void @local_zextload_v8i1_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i1> addrspace(3)* %in) #0 {
|
||||
%load = load <8 x i1>, <8 x i1> addrspace(3)* %in
|
||||
%ext = zext <8 x i1> %load to <8 x i64>
|
||||
@ -313,6 +393,8 @@ define amdgpu_kernel void @local_zextload_v8i1_to_v8i64(<8 x i64> addrspace(3)*
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_sextload_v8i1_to_v8i64:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
define amdgpu_kernel void @local_sextload_v8i1_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i1> addrspace(3)* %in) #0 {
|
||||
%load = load <8 x i1>, <8 x i1> addrspace(3)* %in
|
||||
%ext = sext <8 x i1> %load to <8 x i64>
|
||||
@ -321,6 +403,8 @@ define amdgpu_kernel void @local_sextload_v8i1_to_v8i64(<8 x i64> addrspace(3)*
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_zextload_v16i1_to_v16i64:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
define amdgpu_kernel void @local_zextload_v16i1_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i1> addrspace(3)* %in) #0 {
|
||||
%load = load <16 x i1>, <16 x i1> addrspace(3)* %in
|
||||
%ext = zext <16 x i1> %load to <16 x i64>
|
||||
@ -329,6 +413,8 @@ define amdgpu_kernel void @local_zextload_v16i1_to_v16i64(<16 x i64> addrspace(3
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_sextload_v16i1_to_v16i64:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
define amdgpu_kernel void @local_sextload_v16i1_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i1> addrspace(3)* %in) #0 {
|
||||
%load = load <16 x i1>, <16 x i1> addrspace(3)* %in
|
||||
%ext = sext <16 x i1> %load to <16 x i64>
|
||||
@ -337,6 +423,8 @@ define amdgpu_kernel void @local_sextload_v16i1_to_v16i64(<16 x i64> addrspace(3
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_zextload_v32i1_to_v32i64:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
define amdgpu_kernel void @local_zextload_v32i1_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i1> addrspace(3)* %in) #0 {
|
||||
%load = load <32 x i1>, <32 x i1> addrspace(3)* %in
|
||||
%ext = zext <32 x i1> %load to <32 x i64>
|
||||
@ -345,6 +433,8 @@ define amdgpu_kernel void @local_zextload_v32i1_to_v32i64(<32 x i64> addrspace(3
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_sextload_v32i1_to_v32i64:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
define amdgpu_kernel void @local_sextload_v32i1_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i1> addrspace(3)* %in) #0 {
|
||||
%load = load <32 x i1>, <32 x i1> addrspace(3)* %in
|
||||
%ext = sext <32 x i1> %load to <32 x i64>
|
||||
@ -353,6 +443,8 @@ define amdgpu_kernel void @local_sextload_v32i1_to_v32i64(<32 x i64> addrspace(3
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_zextload_v64i1_to_v64i64:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
define amdgpu_kernel void @local_zextload_v64i1_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i1> addrspace(3)* %in) #0 {
|
||||
%load = load <64 x i1>, <64 x i1> addrspace(3)* %in
|
||||
%ext = zext <64 x i1> %load to <64 x i64>
|
||||
@ -361,6 +453,8 @@ define amdgpu_kernel void @local_zextload_v64i1_to_v64i64(<64 x i64> addrspace(3
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_sextload_v64i1_to_v64i64:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
define amdgpu_kernel void @local_sextload_v64i1_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i1> addrspace(3)* %in) #0 {
|
||||
%load = load <64 x i1>, <64 x i1> addrspace(3)* %in
|
||||
%ext = sext <64 x i1> %load to <64 x i64>
|
||||
|
@ -1,8 +1,12 @@
|
||||
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,FUNC %s
|
||||
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,FUNC %s
|
||||
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,SICIVI,FUNC %s
|
||||
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,GFX89,FUNC %s
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,GFX89,FUNC %s
|
||||
; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
|
||||
|
||||
; FUNC-LABEL: {{^}}local_load_i16:
|
||||
; GFX9-NOT: m0
|
||||
; SICIVI: s_mov_b32 m0
|
||||
|
||||
; GCN: ds_read_u16 v{{[0-9]+}}
|
||||
|
||||
; EG: MOV {{[* ]*}}[[FROM:T[0-9]+\.[XYZW]]], KC0[2].Z
|
||||
@ -18,6 +22,9 @@ entry:
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_load_v2i16:
|
||||
; GFX9-NOT: m0
|
||||
; SICIVI: s_mov_b32 m0
|
||||
|
||||
; GCN: ds_read_b32
|
||||
|
||||
; EG: MOV {{[* ]*}}[[FROM:T[0-9]+\.[XYZW]]], KC0[2].Z
|
||||
@ -33,6 +40,9 @@ entry:
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_load_v3i16:
|
||||
; GFX9-NOT: m0
|
||||
; SICIVI: s_mov_b32 m0
|
||||
|
||||
; GCN: ds_read_b64
|
||||
; GCN-DAG: ds_write_b32
|
||||
; GCN-DAG: ds_write_b16
|
||||
@ -47,6 +57,9 @@ entry:
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_load_v4i16:
|
||||
; GFX9-NOT: m0
|
||||
; SICIVI: s_mov_b32 m0
|
||||
|
||||
; GCN: ds_read_b64
|
||||
|
||||
; EG: LDS_READ_RET
|
||||
@ -59,6 +72,9 @@ entry:
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_load_v8i16:
|
||||
; GFX9-NOT: m0
|
||||
; SICIVI: s_mov_b32 m0
|
||||
|
||||
; GCN: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
|
||||
|
||||
; EG: LDS_READ_RET
|
||||
@ -73,6 +89,9 @@ entry:
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_load_v16i16:
|
||||
; GFX9-NOT: m0
|
||||
; SICIVI: s_mov_b32 m0
|
||||
|
||||
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:3{{$}}
|
||||
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:1 offset1:2{{$}}
|
||||
|
||||
@ -94,6 +113,9 @@ entry:
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_zextload_i16_to_i32:
|
||||
; GFX9-NOT: m0
|
||||
; SICIVI: s_mov_b32 m0
|
||||
|
||||
; GCN: ds_read_u16
|
||||
; GCN: ds_write_b32
|
||||
|
||||
@ -111,7 +133,10 @@ define amdgpu_kernel void @local_zextload_i16_to_i32(i32 addrspace(3)* %out, i16
|
||||
|
||||
; FUNC-LABEL: {{^}}local_sextload_i16_to_i32:
|
||||
; GCN-NOT: s_wqm_b64
|
||||
; GCN: s_mov_b32 m0
|
||||
|
||||
; GFX9-NOT: m0
|
||||
; SICIVI: s_mov_b32 m0
|
||||
|
||||
; GCN: ds_read_i16
|
||||
|
||||
; EG: MOV {{[* ]*}}[[FROM:T[0-9]+\.[XYZW]]], KC0[2].Z
|
||||
@ -129,6 +154,9 @@ define amdgpu_kernel void @local_sextload_i16_to_i32(i32 addrspace(3)* %out, i16
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_zextload_v1i16_to_v1i32:
|
||||
; GFX9-NOT: m0
|
||||
; SICIVI: s_mov_b32 m0
|
||||
|
||||
; GCN: ds_read_u16
|
||||
|
||||
; EG: MOV {{[* ]*}}[[FROM:T[0-9]+\.[XYZW]]], KC0[2].Z
|
||||
@ -144,6 +172,9 @@ define amdgpu_kernel void @local_zextload_v1i16_to_v1i32(<1 x i32> addrspace(3)*
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_sextload_v1i16_to_v1i32:
|
||||
; GFX9-NOT: m0
|
||||
; SICIVI: s_mov_b32 m0
|
||||
|
||||
; GCN: ds_read_i16
|
||||
|
||||
; EG: MOV {{[* ]*}}[[FROM:T[0-9]+\.[XYZW]]], KC0[2].Z
|
||||
@ -162,7 +193,9 @@ define amdgpu_kernel void @local_sextload_v1i16_to_v1i32(<1 x i32> addrspace(3)*
|
||||
|
||||
; FUNC-LABEL: {{^}}local_zextload_v2i16_to_v2i32:
|
||||
; GCN-NOT: s_wqm_b64
|
||||
; GCN: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
; SICIVI: s_mov_b32 m0
|
||||
|
||||
; GCN: ds_read_b32
|
||||
|
||||
; EG: LDS_READ_RET
|
||||
@ -175,7 +208,9 @@ define amdgpu_kernel void @local_zextload_v2i16_to_v2i32(<2 x i32> addrspace(3)*
|
||||
|
||||
; FUNC-LABEL: {{^}}local_sextload_v2i16_to_v2i32:
|
||||
; GCN-NOT: s_wqm_b64
|
||||
; GCN: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
; SICIVI: s_mov_b32 m0
|
||||
|
||||
; GCN: ds_read_b32
|
||||
|
||||
; EG: LDS_READ_RET
|
||||
@ -189,6 +224,9 @@ define amdgpu_kernel void @local_sextload_v2i16_to_v2i32(<2 x i32> addrspace(3)*
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_local_zextload_v3i16_to_v3i32:
|
||||
; GFX9-NOT: m0
|
||||
; SICIVI: s_mov_b32 m0
|
||||
|
||||
; GCN: ds_read_b64
|
||||
; GCN-DAG: ds_write_b32
|
||||
; GCN-DAG: ds_write_b64
|
||||
@ -203,6 +241,9 @@ entry:
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_local_sextload_v3i16_to_v3i32:
|
||||
; GFX9-NOT: m0
|
||||
; SICIVI: s_mov_b32 m0
|
||||
|
||||
; GCN: ds_read_b64
|
||||
; GCN-DAG: ds_write_b32
|
||||
; GCN-DAG: ds_write_b64
|
||||
@ -221,7 +262,9 @@ entry:
|
||||
|
||||
; FUNC-LABEL: {{^}}local_local_zextload_v4i16_to_v4i32:
|
||||
; GCN-NOT: s_wqm_b64
|
||||
; GCN: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
; SICIVI: s_mov_b32 m0
|
||||
|
||||
; GCN: ds_read_b64
|
||||
|
||||
; EG: LDS_READ_RET
|
||||
@ -235,7 +278,9 @@ define amdgpu_kernel void @local_local_zextload_v4i16_to_v4i32(<4 x i32> addrspa
|
||||
|
||||
; FUNC-LABEL: {{^}}local_sextload_v4i16_to_v4i32:
|
||||
; GCN-NOT: s_wqm_b64
|
||||
; GCN: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
; SICIVI: s_mov_b32 m0
|
||||
|
||||
; GCN: ds_read_b64
|
||||
|
||||
; EG: LDS_READ_RET
|
||||
@ -252,6 +297,9 @@ define amdgpu_kernel void @local_sextload_v4i16_to_v4i32(<4 x i32> addrspace(3)*
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_zextload_v8i16_to_v8i32:
|
||||
; GFX9-NOT: m0
|
||||
; SICIVI: s_mov_b32 m0
|
||||
|
||||
; GCN: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
|
||||
|
||||
; EG: LDS_READ_RET
|
||||
@ -266,6 +314,9 @@ define amdgpu_kernel void @local_zextload_v8i16_to_v8i32(<8 x i32> addrspace(3)*
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_sextload_v8i16_to_v8i32:
|
||||
; GFX9-NOT: m0
|
||||
; SICIVI: s_mov_b32 m0
|
||||
|
||||
; GCN: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
|
||||
|
||||
; EG: LDS_READ_RET
|
||||
@ -288,6 +339,9 @@ define amdgpu_kernel void @local_sextload_v8i16_to_v8i32(<8 x i32> addrspace(3)*
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_zextload_v16i16_to_v16i32:
|
||||
; GFX9-NOT: m0
|
||||
; SICIVI: s_mov_b32 m0
|
||||
|
||||
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
|
||||
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}}
|
||||
|
||||
@ -312,6 +366,9 @@ define amdgpu_kernel void @local_zextload_v16i16_to_v16i32(<16 x i32> addrspace(
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_sextload_v16i16_to_v16i32:
|
||||
; GFX9-NOT: m0
|
||||
; SICIVI: s_mov_b32 m0
|
||||
|
||||
|
||||
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
|
||||
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}}
|
||||
@ -348,6 +405,9 @@ define amdgpu_kernel void @local_sextload_v16i16_to_v16i32(<16 x i32> addrspace(
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_zextload_v32i16_to_v32i32:
|
||||
; GFX9-NOT: m0
|
||||
; SICIVI: s_mov_b32 m0
|
||||
|
||||
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
|
||||
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3
|
||||
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:4 offset1:5
|
||||
@ -377,6 +437,9 @@ define amdgpu_kernel void @local_zextload_v32i16_to_v32i32(<32 x i32> addrspace(
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_sextload_v32i16_to_v32i32:
|
||||
; GFX9-NOT: m0
|
||||
; SICIVI: s_mov_b32 m0
|
||||
|
||||
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}}
|
||||
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:4 offset1:5
|
||||
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
|
||||
@ -414,6 +477,9 @@ define amdgpu_kernel void @local_sextload_v32i16_to_v32i32(<32 x i32> addrspace(
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_zextload_v64i16_to_v64i32:
|
||||
; GFX9-NOT: m0
|
||||
; SICIVI: s_mov_b32 m0
|
||||
|
||||
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:14 offset1:15
|
||||
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
|
||||
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3
|
||||
@ -479,6 +545,8 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(<64 x i32> addrspace(
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_sextload_v64i16_to_v64i32:
|
||||
; GFX9-NOT: m0
|
||||
; SICIVI: s_mov_b32 m0
|
||||
|
||||
; EG: LDS_READ_RET
|
||||
; EG: LDS_READ_RET
|
||||
@ -520,6 +588,9 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(<64 x i32> addrspace(
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_zextload_i16_to_i64:
|
||||
; GFX9-NOT: m0
|
||||
; SICIVI: s_mov_b32 m0
|
||||
|
||||
; GCN-DAG: ds_read_u16 v[[LO:[0-9]+]],
|
||||
; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
|
||||
|
||||
@ -538,13 +609,16 @@ define amdgpu_kernel void @local_zextload_i16_to_i64(i64 addrspace(3)* %out, i16
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_sextload_i16_to_i64:
|
||||
; GFX9-NOT: m0
|
||||
; SICIVI: s_mov_b32 m0
|
||||
|
||||
; FIXME: Need to optimize this sequence to avoid an extra shift.
|
||||
; t25: i32,ch = load<LD2[%in(addrspace=3)], anyext from i16> t12, t10, undef:i32
|
||||
; t28: i64 = any_extend t25
|
||||
; t30: i64 = sign_extend_inreg t28, ValueType:ch:i16
|
||||
; SI: ds_read_i16 v[[LO:[0-9]+]],
|
||||
; VI: ds_read_u16 v[[ULO:[0-9]+]]
|
||||
; VI: v_bfe_i32 v[[LO:[0-9]+]], v[[ULO]], 0, 16
|
||||
; GFX89: ds_read_u16 v[[ULO:[0-9]+]]
|
||||
; GFX89: v_bfe_i32 v[[LO:[0-9]+]], v[[ULO]], 0, 16
|
||||
; GCN-DAG: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
|
||||
|
||||
; GCN: ds_write_b64 v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]]
|
||||
@ -565,6 +639,9 @@ define amdgpu_kernel void @local_sextload_i16_to_i64(i64 addrspace(3)* %out, i16
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_zextload_v1i16_to_v1i64:
|
||||
; GFX9-NOT: m0
|
||||
; SICIVI: s_mov_b32 m0
|
||||
|
||||
|
||||
; EG: MOV {{[* ]*}}[[FROM:T[0-9]+\.[XYZW]]], KC0[2].Z
|
||||
; EG: LDS_USHORT_READ_RET {{.*}} [[FROM]]
|
||||
@ -579,6 +656,9 @@ define amdgpu_kernel void @local_zextload_v1i16_to_v1i64(<1 x i64> addrspace(3)*
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_sextload_v1i16_to_v1i64:
|
||||
; GFX9-NOT: m0
|
||||
; SICIVI: s_mov_b32 m0
|
||||
|
||||
|
||||
; EG: MOV {{[* ]*}}[[FROM:T[0-9]+\.[XYZW]]], KC0[2].Z
|
||||
; EG: LDS_USHORT_READ_RET {{.*}} [[FROM]]
|
||||
@ -596,6 +676,9 @@ define amdgpu_kernel void @local_sextload_v1i16_to_v1i64(<1 x i64> addrspace(3)*
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_zextload_v2i16_to_v2i64:
|
||||
; GFX9-NOT: m0
|
||||
; SICIVI: s_mov_b32 m0
|
||||
|
||||
|
||||
; EG: LDS_READ_RET
|
||||
define amdgpu_kernel void @local_zextload_v2i16_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i16> addrspace(3)* %in) #0 {
|
||||
@ -606,6 +689,9 @@ define amdgpu_kernel void @local_zextload_v2i16_to_v2i64(<2 x i64> addrspace(3)*
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_sextload_v2i16_to_v2i64:
|
||||
; GFX9-NOT: m0
|
||||
; SICIVI: s_mov_b32 m0
|
||||
|
||||
|
||||
; EG: LDS_READ_RET
|
||||
; EG-DAG: BFE_INT
|
||||
@ -618,6 +704,9 @@ define amdgpu_kernel void @local_sextload_v2i16_to_v2i64(<2 x i64> addrspace(3)*
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_zextload_v4i16_to_v4i64:
|
||||
; GFX9-NOT: m0
|
||||
; SICIVI: s_mov_b32 m0
|
||||
|
||||
|
||||
; EG: LDS_READ_RET
|
||||
; EG: LDS_READ_RET
|
||||
@ -629,6 +718,9 @@ define amdgpu_kernel void @local_zextload_v4i16_to_v4i64(<4 x i64> addrspace(3)*
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_sextload_v4i16_to_v4i64:
|
||||
; GFX9-NOT: m0
|
||||
; SICIVI: s_mov_b32 m0
|
||||
|
||||
|
||||
; EG: LDS_READ_RET
|
||||
; EG: LDS_READ_RET
|
||||
@ -644,6 +736,9 @@ define amdgpu_kernel void @local_sextload_v4i16_to_v4i64(<4 x i64> addrspace(3)*
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_zextload_v8i16_to_v8i64:
|
||||
; GFX9-NOT: m0
|
||||
; SICIVI: s_mov_b32 m0
|
||||
|
||||
|
||||
; EG: LDS_READ_RET
|
||||
; EG: LDS_READ_RET
|
||||
@ -657,6 +752,9 @@ define amdgpu_kernel void @local_zextload_v8i16_to_v8i64(<8 x i64> addrspace(3)*
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_sextload_v8i16_to_v8i64:
|
||||
; GFX9-NOT: m0
|
||||
; SICIVI: s_mov_b32 m0
|
||||
|
||||
|
||||
; EG: LDS_READ_RET
|
||||
; EG: LDS_READ_RET
|
||||
@ -678,6 +776,9 @@ define amdgpu_kernel void @local_sextload_v8i16_to_v8i64(<8 x i64> addrspace(3)*
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_zextload_v16i16_to_v16i64:
|
||||
; GFX9-NOT: m0
|
||||
; SICIVI: s_mov_b32 m0
|
||||
|
||||
|
||||
; EG: LDS_READ_RET
|
||||
; EG: LDS_READ_RET
|
||||
@ -695,6 +796,9 @@ define amdgpu_kernel void @local_zextload_v16i16_to_v16i64(<16 x i64> addrspace(
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_sextload_v16i16_to_v16i64:
|
||||
; GFX9-NOT: m0
|
||||
; SICIVI: s_mov_b32 m0
|
||||
|
||||
|
||||
; EG: LDS_READ_RET
|
||||
; EG: LDS_READ_RET
|
||||
@ -728,6 +832,9 @@ define amdgpu_kernel void @local_sextload_v16i16_to_v16i64(<16 x i64> addrspace(
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_zextload_v32i16_to_v32i64:
|
||||
; GFX9-NOT: m0
|
||||
; SICIVI: s_mov_b32 m0
|
||||
|
||||
|
||||
; EG: LDS_READ_RET
|
||||
; EG: LDS_READ_RET
|
||||
@ -753,6 +860,9 @@ define amdgpu_kernel void @local_zextload_v32i16_to_v32i64(<32 x i64> addrspace(
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_sextload_v32i16_to_v32i64:
|
||||
; GFX9-NOT: m0
|
||||
; SICIVI: s_mov_b32 m0
|
||||
|
||||
|
||||
; EG: LDS_READ_RET
|
||||
; EG: LDS_READ_RET
|
||||
|
@ -1,11 +1,12 @@
|
||||
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
|
||||
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
|
||||
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
|
||||
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,FUNC %s
|
||||
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
|
||||
|
||||
|
||||
; FUNC-LABEL: {{^}}local_load_i32:
|
||||
; GCN-NOT: s_wqm_b64
|
||||
; GCN: s_mov_b32 m0, -1
|
||||
; SICIVI: s_mov_b32 m0, -1
|
||||
; GFX9-NOT: m0
|
||||
; GCN: ds_read_b32
|
||||
|
||||
; EG: LDS_READ_RET
|
||||
@ -17,6 +18,9 @@ entry:
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_load_v2i32:
|
||||
; SICIVI: s_mov_b32 m0, -1
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_read_b64
|
||||
define amdgpu_kernel void @local_load_v2i32(<2 x i32> addrspace(3)* %out, <2 x i32> addrspace(3)* %in) #0 {
|
||||
entry:
|
||||
@ -26,6 +30,9 @@ entry:
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_load_v3i32:
|
||||
; SICIVI: s_mov_b32 m0, -1
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN-DAG: ds_read_b64
|
||||
; GCN-DAG: ds_read_b32
|
||||
define amdgpu_kernel void @local_load_v3i32(<3 x i32> addrspace(3)* %out, <3 x i32> addrspace(3)* %in) #0 {
|
||||
@ -36,6 +43,9 @@ entry:
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_load_v4i32:
|
||||
; SICIVI: s_mov_b32 m0, -1
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
|
||||
|
||||
define amdgpu_kernel void @local_load_v4i32(<4 x i32> addrspace(3)* %out, <4 x i32> addrspace(3)* %in) #0 {
|
||||
@ -46,6 +56,9 @@ entry:
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_load_v8i32:
|
||||
; SICIVI: s_mov_b32 m0, -1
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}}
|
||||
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
|
||||
define amdgpu_kernel void @local_load_v8i32(<8 x i32> addrspace(3)* %out, <8 x i32> addrspace(3)* %in) #0 {
|
||||
@ -56,6 +69,9 @@ entry:
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_load_v16i32:
|
||||
; SICIVI: s_mov_b32 m0, -1
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:6 offset1:7{{$}}
|
||||
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:4 offset1:5{{$}}
|
||||
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}}
|
||||
@ -72,6 +88,9 @@ entry:
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_zextload_i32_to_i64:
|
||||
; SICIVI: s_mov_b32 m0, -1
|
||||
; GFX9-NOT: m0
|
||||
|
||||
define amdgpu_kernel void @local_zextload_i32_to_i64(i64 addrspace(3)* %out, i32 addrspace(3)* %in) #0 {
|
||||
%ld = load i32, i32 addrspace(3)* %in
|
||||
%ext = zext i32 %ld to i64
|
||||
@ -80,6 +99,9 @@ define amdgpu_kernel void @local_zextload_i32_to_i64(i64 addrspace(3)* %out, i32
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_sextload_i32_to_i64:
|
||||
; SICIVI: s_mov_b32 m0, -1
|
||||
; GFX9-NOT: m0
|
||||
|
||||
define amdgpu_kernel void @local_sextload_i32_to_i64(i64 addrspace(3)* %out, i32 addrspace(3)* %in) #0 {
|
||||
%ld = load i32, i32 addrspace(3)* %in
|
||||
%ext = sext i32 %ld to i64
|
||||
@ -88,6 +110,9 @@ define amdgpu_kernel void @local_sextload_i32_to_i64(i64 addrspace(3)* %out, i32
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_zextload_v1i32_to_v1i64:
|
||||
; SICIVI: s_mov_b32 m0, -1
|
||||
; GFX9-NOT: m0
|
||||
|
||||
define amdgpu_kernel void @local_zextload_v1i32_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i32> addrspace(3)* %in) #0 {
|
||||
%ld = load <1 x i32>, <1 x i32> addrspace(3)* %in
|
||||
%ext = zext <1 x i32> %ld to <1 x i64>
|
||||
@ -96,6 +121,9 @@ define amdgpu_kernel void @local_zextload_v1i32_to_v1i64(<1 x i64> addrspace(3)*
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_sextload_v1i32_to_v1i64:
|
||||
; SICIVI: s_mov_b32 m0, -1
|
||||
; GFX9-NOT: m0
|
||||
|
||||
define amdgpu_kernel void @local_sextload_v1i32_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i32> addrspace(3)* %in) #0 {
|
||||
%ld = load <1 x i32>, <1 x i32> addrspace(3)* %in
|
||||
%ext = sext <1 x i32> %ld to <1 x i64>
|
||||
@ -104,6 +132,9 @@ define amdgpu_kernel void @local_sextload_v1i32_to_v1i64(<1 x i64> addrspace(3)*
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_zextload_v2i32_to_v2i64:
|
||||
; SICIVI: s_mov_b32 m0, -1
|
||||
; GFX9-NOT: m0
|
||||
|
||||
define amdgpu_kernel void @local_zextload_v2i32_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i32> addrspace(3)* %in) #0 {
|
||||
%ld = load <2 x i32>, <2 x i32> addrspace(3)* %in
|
||||
%ext = zext <2 x i32> %ld to <2 x i64>
|
||||
@ -112,6 +143,9 @@ define amdgpu_kernel void @local_zextload_v2i32_to_v2i64(<2 x i64> addrspace(3)*
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_sextload_v2i32_to_v2i64:
|
||||
; SICIVI: s_mov_b32 m0, -1
|
||||
; GFX9-NOT: m0
|
||||
|
||||
define amdgpu_kernel void @local_sextload_v2i32_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i32> addrspace(3)* %in) #0 {
|
||||
%ld = load <2 x i32>, <2 x i32> addrspace(3)* %in
|
||||
%ext = sext <2 x i32> %ld to <2 x i64>
|
||||
@ -120,6 +154,9 @@ define amdgpu_kernel void @local_sextload_v2i32_to_v2i64(<2 x i64> addrspace(3)*
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_zextload_v4i32_to_v4i64:
|
||||
; SICIVI: s_mov_b32 m0, -1
|
||||
; GFX9-NOT: m0
|
||||
|
||||
define amdgpu_kernel void @local_zextload_v4i32_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i32> addrspace(3)* %in) #0 {
|
||||
%ld = load <4 x i32>, <4 x i32> addrspace(3)* %in
|
||||
%ext = zext <4 x i32> %ld to <4 x i64>
|
||||
@ -128,6 +165,9 @@ define amdgpu_kernel void @local_zextload_v4i32_to_v4i64(<4 x i64> addrspace(3)*
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_sextload_v4i32_to_v4i64:
|
||||
; SICIVI: s_mov_b32 m0, -1
|
||||
; GFX9-NOT: m0
|
||||
|
||||
define amdgpu_kernel void @local_sextload_v4i32_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i32> addrspace(3)* %in) #0 {
|
||||
%ld = load <4 x i32>, <4 x i32> addrspace(3)* %in
|
||||
%ext = sext <4 x i32> %ld to <4 x i64>
|
||||
@ -136,6 +176,9 @@ define amdgpu_kernel void @local_sextload_v4i32_to_v4i64(<4 x i64> addrspace(3)*
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_zextload_v8i32_to_v8i64:
|
||||
; SICIVI: s_mov_b32 m0, -1
|
||||
; GFX9-NOT: m0
|
||||
|
||||
define amdgpu_kernel void @local_zextload_v8i32_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i32> addrspace(3)* %in) #0 {
|
||||
%ld = load <8 x i32>, <8 x i32> addrspace(3)* %in
|
||||
%ext = zext <8 x i32> %ld to <8 x i64>
|
||||
@ -144,6 +187,9 @@ define amdgpu_kernel void @local_zextload_v8i32_to_v8i64(<8 x i64> addrspace(3)*
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_sextload_v8i32_to_v8i64:
|
||||
; SICIVI: s_mov_b32 m0, -1
|
||||
; GFX9-NOT: m0
|
||||
|
||||
define amdgpu_kernel void @local_sextload_v8i32_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i32> addrspace(3)* %in) #0 {
|
||||
%ld = load <8 x i32>, <8 x i32> addrspace(3)* %in
|
||||
%ext = sext <8 x i32> %ld to <8 x i64>
|
||||
@ -152,6 +198,9 @@ define amdgpu_kernel void @local_sextload_v8i32_to_v8i64(<8 x i64> addrspace(3)*
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_sextload_v16i32_to_v16i64:
|
||||
; SICIVI: s_mov_b32 m0, -1
|
||||
; GFX9-NOT: m0
|
||||
|
||||
define amdgpu_kernel void @local_sextload_v16i32_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i32> addrspace(3)* %in) #0 {
|
||||
%ld = load <16 x i32>, <16 x i32> addrspace(3)* %in
|
||||
%ext = sext <16 x i32> %ld to <16 x i64>
|
||||
@ -160,6 +209,9 @@ define amdgpu_kernel void @local_sextload_v16i32_to_v16i64(<16 x i64> addrspace(
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_zextload_v16i32_to_v16i64
|
||||
; SICIVI: s_mov_b32 m0, -1
|
||||
; GFX9-NOT: m0
|
||||
|
||||
define amdgpu_kernel void @local_zextload_v16i32_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i32> addrspace(3)* %in) #0 {
|
||||
%ld = load <16 x i32>, <16 x i32> addrspace(3)* %in
|
||||
%ext = zext <16 x i32> %ld to <16 x i64>
|
||||
@ -168,6 +220,9 @@ define amdgpu_kernel void @local_zextload_v16i32_to_v16i64(<16 x i64> addrspace(
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_sextload_v32i32_to_v32i64:
|
||||
; SICIVI: s_mov_b32 m0, -1
|
||||
; GFX9-NOT: m0
|
||||
|
||||
define amdgpu_kernel void @local_sextload_v32i32_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i32> addrspace(3)* %in) #0 {
|
||||
%ld = load <32 x i32>, <32 x i32> addrspace(3)* %in
|
||||
%ext = sext <32 x i32> %ld to <32 x i64>
|
||||
@ -176,6 +231,9 @@ define amdgpu_kernel void @local_sextload_v32i32_to_v32i64(<32 x i64> addrspace(
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_zextload_v32i32_to_v32i64:
|
||||
; SICIVI: s_mov_b32 m0, -1
|
||||
; GFX9-NOT: m0
|
||||
|
||||
define amdgpu_kernel void @local_zextload_v32i32_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i32> addrspace(3)* %in) #0 {
|
||||
%ld = load <32 x i32>, <32 x i32> addrspace(3)* %in
|
||||
%ext = zext <32 x i32> %ld to <32 x i64>
|
||||
|
@ -1,9 +1,13 @@
|
||||
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
|
||||
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
|
||||
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
|
||||
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
|
||||
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
|
||||
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
|
||||
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,FUNC %s
|
||||
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefixes=EG,FUNC %s
|
||||
|
||||
; FUNC-LABEL: {{^}}local_load_i64:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_read_b64 [[VAL:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}{{$}}
|
||||
; GCN: ds_write_b64 v{{[0-9]+}}, [[VAL]]
|
||||
|
||||
@ -16,6 +20,9 @@ define amdgpu_kernel void @local_load_i64(i64 addrspace(3)* %out, i64 addrspace(
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_load_v2i64:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_read2_b64
|
||||
|
||||
; EG: LDS_READ_RET
|
||||
@ -30,6 +37,9 @@ entry:
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_load_v3i64:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN-DAG: ds_read2_b64
|
||||
; GCN-DAG: ds_read_b64
|
||||
|
||||
@ -47,6 +57,9 @@ entry:
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_load_v4i64:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_read2_b64
|
||||
; GCN: ds_read2_b64
|
||||
|
||||
@ -67,6 +80,9 @@ entry:
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_load_v8i64:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_read2_b64
|
||||
; GCN: ds_read2_b64
|
||||
; GCN: ds_read2_b64
|
||||
@ -96,6 +112,9 @@ entry:
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_load_v16i64:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_read2_b64
|
||||
; GCN: ds_read2_b64
|
||||
; GCN: ds_read2_b64
|
||||
|
@ -1,11 +1,13 @@
|
||||
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,FUNC %s
|
||||
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,FUNC %s
|
||||
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,SICIVI,FUNC %s
|
||||
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,SICIVI,FUNC %s
|
||||
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,FUNC %s
|
||||
; RUN: llc -march=r600 -mtriple=r600---amdgiz -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
|
||||
|
||||
|
||||
; FUNC-LABEL: {{^}}local_load_i8:
|
||||
; GCN-NOT: s_wqm_b64
|
||||
; GCN: s_mov_b32 m0
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
; GCN: ds_read_u8
|
||||
|
||||
; EG: LDS_UBYTE_READ_RET
|
||||
@ -18,7 +20,8 @@ entry:
|
||||
|
||||
; FUNC-LABEL: {{^}}local_load_v2i8:
|
||||
; GCN-NOT: s_wqm_b64
|
||||
; GCN: s_mov_b32 m0
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
; GCN: ds_read_u16
|
||||
|
||||
; EG: LDS_USHORT_READ_RET
|
||||
@ -30,6 +33,7 @@ entry:
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_load_v3i8:
|
||||
; GFX9-NOT: m0
|
||||
; GCN: ds_read_b32
|
||||
|
||||
; EG: DS_READ_RET
|
||||
@ -41,6 +45,7 @@ entry:
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_load_v4i8:
|
||||
; GFX9-NOT: m0
|
||||
; GCN: ds_read_b32
|
||||
|
||||
; EG: LDS_READ_RET
|
||||
@ -52,6 +57,7 @@ entry:
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_load_v8i8:
|
||||
; GFX9-NOT: m0
|
||||
; GCN: ds_read_b64
|
||||
|
||||
; EG: LDS_READ_RET
|
||||
@ -64,6 +70,7 @@ entry:
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_load_v16i8:
|
||||
; GFX9-NOT: m0
|
||||
; GCN: ds_read2_b64 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1{{$}}
|
||||
; GCN: ds_write2_b64 v{{[0-9]+}}, v{{\[}}[[LO]]:{{[0-9]+}}], v[{{[0-9]+}}:[[HI]]{{\]}} offset1:1{{$}}
|
||||
|
||||
@ -79,8 +86,9 @@ entry:
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_zextload_i8_to_i32:
|
||||
; GFX9-NOT: m0
|
||||
; GCN-NOT: s_wqm_b64
|
||||
; GCN: s_mov_b32 m0
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GCN: ds_read_u8
|
||||
|
||||
; EG: LDS_UBYTE_READ_RET
|
||||
@ -93,7 +101,8 @@ define amdgpu_kernel void @local_zextload_i8_to_i32(i32 addrspace(3)* %out, i8 a
|
||||
|
||||
; FUNC-LABEL: {{^}}local_sextload_i8_to_i32:
|
||||
; GCN-NOT: s_wqm_b64
|
||||
; GCN: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GCN: ds_read_i8
|
||||
|
||||
; EG: LDS_UBYTE_READ_RET
|
||||
@ -116,6 +125,7 @@ define amdgpu_kernel void @local_zextload_v1i8_to_v1i32(<1 x i32> addrspace(3)*
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_sextload_v1i8_to_v1i32:
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; EG: LDS_UBYTE_READ_RET
|
||||
; EG: BFE_INT
|
||||
@ -127,6 +137,7 @@ define amdgpu_kernel void @local_sextload_v1i8_to_v1i32(<1 x i32> addrspace(3)*
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_zextload_v2i8_to_v2i32:
|
||||
; GFX9-NOT: m0
|
||||
; GCN: ds_read_u16
|
||||
|
||||
; EG: LDS_USHORT_READ_RET
|
||||
@ -139,7 +150,8 @@ define amdgpu_kernel void @local_zextload_v2i8_to_v2i32(<2 x i32> addrspace(3)*
|
||||
|
||||
; FUNC-LABEL: {{^}}local_sextload_v2i8_to_v2i32:
|
||||
; GCN-NOT: s_wqm_b64
|
||||
; GCN: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GCN: ds_read_u16
|
||||
; FIXME: Need to optimize this sequence to avoid extra shift on VI.
|
||||
; t23: i16 = srl t39, Constant:i32<8>
|
||||
@ -164,6 +176,7 @@ define amdgpu_kernel void @local_sextload_v2i8_to_v2i32(<2 x i32> addrspace(3)*
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_zextload_v3i8_to_v3i32:
|
||||
; GFX9-NOT: m0
|
||||
; GCN: ds_read_b32
|
||||
|
||||
; SI-DAG: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, 8, 8
|
||||
@ -182,7 +195,8 @@ entry:
|
||||
|
||||
; FUNC-LABEL: {{^}}local_sextload_v3i8_to_v3i32:
|
||||
; GCN-NOT: s_wqm_b64
|
||||
; GCN: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GCN: ds_read_b32
|
||||
|
||||
; GCN-DAG: v_bfe_i32
|
||||
@ -207,7 +221,8 @@ entry:
|
||||
|
||||
; FUNC-LABEL: {{^}}local_zextload_v4i8_to_v4i32:
|
||||
; GCN-NOT: s_wqm_b64
|
||||
; GCN: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GCN: ds_read_b32
|
||||
|
||||
; EG: LDS_READ_RET
|
||||
@ -223,7 +238,8 @@ define amdgpu_kernel void @local_zextload_v4i8_to_v4i32(<4 x i32> addrspace(3)*
|
||||
|
||||
; FUNC-LABEL: {{^}}local_sextload_v4i8_to_v4i32:
|
||||
; GCN-NOT: s_wqm_b64
|
||||
; GCN: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GCN: ds_read_b32
|
||||
|
||||
; EG-DAG: LDS_READ_RET
|
||||
@ -239,6 +255,8 @@ define amdgpu_kernel void @local_sextload_v4i8_to_v4i32(<4 x i32> addrspace(3)*
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_zextload_v8i8_to_v8i32:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; EG-DAG: LDS_READ_RET
|
||||
; EG-DAG: LDS_READ_RET
|
||||
@ -256,6 +274,8 @@ define amdgpu_kernel void @local_zextload_v8i8_to_v8i32(<8 x i32> addrspace(3)*
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_sextload_v8i8_to_v8i32:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; EG-DAG: LDS_READ_RET
|
||||
; EG-DAG: LDS_READ_RET
|
||||
@ -275,6 +295,8 @@ define amdgpu_kernel void @local_sextload_v8i8_to_v8i32(<8 x i32> addrspace(3)*
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_zextload_v16i8_to_v16i32:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; EG-DAG: LDS_READ_RET
|
||||
; EG-DAG: LDS_READ_RET
|
||||
@ -300,6 +322,8 @@ define amdgpu_kernel void @local_zextload_v16i8_to_v16i32(<16 x i32> addrspace(3
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_sextload_v16i8_to_v16i32:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; EG-DAG: LDS_READ_RET
|
||||
; EG-DAG: LDS_READ_RET
|
||||
@ -329,6 +353,8 @@ define amdgpu_kernel void @local_sextload_v16i8_to_v16i32(<16 x i32> addrspace(3
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_zextload_v32i8_to_v32i32:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; EG-DAG: LDS_READ_RET
|
||||
; EG-DAG: LDS_READ_RET
|
||||
@ -346,6 +372,8 @@ define amdgpu_kernel void @local_zextload_v32i8_to_v32i32(<32 x i32> addrspace(3
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_sextload_v32i8_to_v32i32:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; EG-DAG: LDS_READ_RET
|
||||
; EG-DAG: LDS_READ_RET
|
||||
@ -363,6 +391,8 @@ define amdgpu_kernel void @local_sextload_v32i8_to_v32i32(<32 x i32> addrspace(3
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_zextload_v64i8_to_v64i32:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; EG-DAG: LDS_READ_RET
|
||||
; EG-DAG: LDS_READ_RET
|
||||
@ -388,6 +418,8 @@ define amdgpu_kernel void @local_zextload_v64i8_to_v64i32(<64 x i32> addrspace(3
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_sextload_v64i8_to_v64i32:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; EG-DAG: LDS_READ_RET
|
||||
; EG-DAG: LDS_READ_RET
|
||||
@ -413,6 +445,9 @@ define amdgpu_kernel void @local_sextload_v64i8_to_v64i32(<64 x i32> addrspace(3
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_zextload_i8_to_i64:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
|
||||
; GCN-DAG: ds_read_u8 v[[LO:[0-9]+]],
|
||||
; GCN: ds_write_b64 v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]]
|
||||
@ -428,6 +463,9 @@ define amdgpu_kernel void @local_zextload_i8_to_i64(i64 addrspace(3)* %out, i8 a
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_sextload_i8_to_i64:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_read_i8 v[[LO:[0-9]+]],
|
||||
; GCN: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
|
||||
|
||||
@ -445,6 +483,8 @@ define amdgpu_kernel void @local_sextload_i8_to_i64(i64 addrspace(3)* %out, i8 a
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_zextload_v1i8_to_v1i64:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; EG: LDS_UBYTE_READ_RET
|
||||
; EG: MOV {{.*}}, literal
|
||||
@ -458,6 +498,8 @@ define amdgpu_kernel void @local_zextload_v1i8_to_v1i64(<1 x i64> addrspace(3)*
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_sextload_v1i8_to_v1i64:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; EG: LDS_UBYTE_READ_RET
|
||||
; EG: ASHR
|
||||
@ -471,6 +513,8 @@ define amdgpu_kernel void @local_sextload_v1i8_to_v1i64(<1 x i64> addrspace(3)*
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_zextload_v2i8_to_v2i64:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; EG: LDS_USHORT_READ_RET
|
||||
define amdgpu_kernel void @local_zextload_v2i8_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 {
|
||||
@ -481,6 +525,8 @@ define amdgpu_kernel void @local_zextload_v2i8_to_v2i64(<2 x i64> addrspace(3)*
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_sextload_v2i8_to_v2i64:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; EG: LDS_USHORT_READ_RET
|
||||
; EG: BFE_INT
|
||||
@ -493,6 +539,8 @@ define amdgpu_kernel void @local_sextload_v2i8_to_v2i64(<2 x i64> addrspace(3)*
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_zextload_v4i8_to_v4i64:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; EG: LDS_READ_RET
|
||||
define amdgpu_kernel void @local_zextload_v4i8_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 {
|
||||
@ -503,6 +551,8 @@ define amdgpu_kernel void @local_zextload_v4i8_to_v4i64(<4 x i64> addrspace(3)*
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_sextload_v4i8_to_v4i64:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; EG: LDS_READ_RET
|
||||
define amdgpu_kernel void @local_sextload_v4i8_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 {
|
||||
@ -513,6 +563,8 @@ define amdgpu_kernel void @local_sextload_v4i8_to_v4i64(<4 x i64> addrspace(3)*
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_zextload_v8i8_to_v8i64:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; EG: LDS_READ_RET
|
||||
; EG: LDS_READ_RET
|
||||
@ -524,6 +576,8 @@ define amdgpu_kernel void @local_zextload_v8i8_to_v8i64(<8 x i64> addrspace(3)*
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_sextload_v8i8_to_v8i64:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; EG: LDS_READ_RET
|
||||
; EG: LDS_READ_RET
|
||||
@ -544,6 +598,8 @@ define amdgpu_kernel void @local_sextload_v8i8_to_v8i64(<8 x i64> addrspace(3)*
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_zextload_v16i8_to_v16i64:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; EG: LDS_READ_RET
|
||||
; EG: LDS_READ_RET
|
||||
@ -557,6 +613,8 @@ define amdgpu_kernel void @local_zextload_v16i8_to_v16i64(<16 x i64> addrspace(3
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_sextload_v16i8_to_v16i64:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; EG: LDS_READ_RET
|
||||
; EG: LDS_READ_RET
|
||||
@ -570,6 +628,8 @@ define amdgpu_kernel void @local_sextload_v16i8_to_v16i64(<16 x i64> addrspace(3
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_zextload_v32i8_to_v32i64:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; EG: LDS_READ_RET
|
||||
; EG: LDS_READ_RET
|
||||
@ -587,6 +647,8 @@ define amdgpu_kernel void @local_zextload_v32i8_to_v32i64(<32 x i64> addrspace(3
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_sextload_v32i8_to_v32i64:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; EG: LDS_READ_RET
|
||||
; EG: LDS_READ_RET
|
||||
@ -620,6 +682,8 @@ define amdgpu_kernel void @local_sextload_v32i8_to_v32i64(<32 x i64> addrspace(3
|
||||
; }
|
||||
|
||||
; FUNC-LABEL: {{^}}local_zextload_i8_to_i16:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
; GCN: ds_read_u8 v[[VAL:[0-9]+]],
|
||||
; GCN: ds_write_b16 v[[VAL:[0-9]+]]
|
||||
|
||||
@ -633,6 +697,8 @@ define amdgpu_kernel void @local_zextload_i8_to_i16(i16 addrspace(3)* %out, i8 a
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_sextload_i8_to_i16:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
; GCN: ds_read_i8 v[[VAL:[0-9]+]],
|
||||
; GCN: ds_write_b16 v{{[0-9]+}}, v[[VAL]]
|
||||
|
||||
@ -647,6 +713,8 @@ define amdgpu_kernel void @local_sextload_i8_to_i16(i16 addrspace(3)* %out, i8 a
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_zextload_v1i8_to_v1i16:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; EG: LDS_UBYTE_READ_RET
|
||||
; EG: LDS_SHORT_WRITE
|
||||
@ -658,6 +726,8 @@ define amdgpu_kernel void @local_zextload_v1i8_to_v1i16(<1 x i16> addrspace(3)*
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_sextload_v1i8_to_v1i16:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; EG: LDS_UBYTE_READ_RET
|
||||
; EG: BFE_INT
|
||||
@ -670,6 +740,8 @@ define amdgpu_kernel void @local_sextload_v1i8_to_v1i16(<1 x i16> addrspace(3)*
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_zextload_v2i8_to_v2i16:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; EG: LDS_USHORT_READ_RET
|
||||
; EG: LDS_WRITE
|
||||
@ -681,6 +753,8 @@ define amdgpu_kernel void @local_zextload_v2i8_to_v2i16(<2 x i16> addrspace(3)*
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_sextload_v2i8_to_v2i16:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; EG: LDS_USHORT_READ_RET
|
||||
; EG: BFE_INT
|
||||
@ -694,6 +768,8 @@ define amdgpu_kernel void @local_sextload_v2i8_to_v2i16(<2 x i16> addrspace(3)*
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_zextload_v4i8_to_v4i16:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; EG: LDS_READ_RET
|
||||
; EG: LDS_WRITE
|
||||
@ -706,6 +782,8 @@ define amdgpu_kernel void @local_zextload_v4i8_to_v4i16(<4 x i16> addrspace(3)*
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_sextload_v4i8_to_v4i16:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; EG: LDS_READ_RET
|
||||
; TODO: these do LSHR + BFE_INT, instead of just BFE_INT/ASHR
|
||||
@ -723,6 +801,8 @@ define amdgpu_kernel void @local_sextload_v4i8_to_v4i16(<4 x i16> addrspace(3)*
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_zextload_v8i8_to_v8i16:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; EG: LDS_READ_RET
|
||||
; EG: LDS_READ_RET
|
||||
@ -738,6 +818,8 @@ define amdgpu_kernel void @local_zextload_v8i8_to_v8i16(<8 x i16> addrspace(3)*
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_sextload_v8i8_to_v8i16:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; EG: LDS_READ_RET
|
||||
; EG: LDS_READ_RET
|
||||
@ -762,6 +844,8 @@ define amdgpu_kernel void @local_sextload_v8i8_to_v8i16(<8 x i16> addrspace(3)*
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_zextload_v16i8_to_v16i16:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; EG: LDS_READ_RET
|
||||
; EG: LDS_READ_RET
|
||||
@ -783,6 +867,8 @@ define amdgpu_kernel void @local_zextload_v16i8_to_v16i16(<16 x i16> addrspace(3
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_sextload_v16i8_to_v16i16:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; EG: LDS_READ_RET
|
||||
; EG: LDS_READ_RET
|
||||
@ -821,6 +907,8 @@ define amdgpu_kernel void @local_sextload_v16i8_to_v16i16(<16 x i16> addrspace(3
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_zextload_v32i8_to_v32i16:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; EG: LDS_READ_RET
|
||||
; EG: LDS_READ_RET
|
||||
@ -854,6 +942,8 @@ define amdgpu_kernel void @local_zextload_v32i8_to_v32i16(<32 x i16> addrspace(3
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_sextload_v32i8_to_v32i16:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; EG: LDS_READ_RET
|
||||
; EG: LDS_READ_RET
|
||||
|
@ -1,10 +1,14 @@
|
||||
; RUN: llc -march=amdgcn -verify-machineinstrs< %s | FileCheck --check-prefix=SI --check-prefix=BOTH %s
|
||||
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs< %s | FileCheck --check-prefix=CI --check-prefix=BOTH %s
|
||||
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs< %s | FileCheck --check-prefix=CI --check-prefix=BOTH %s
|
||||
; RUN: llc -march=amdgcn -verify-machineinstrs< %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,SICIVI %s
|
||||
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs< %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI,SICIVI %s
|
||||
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs< %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,SICIVI %s
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs< %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
|
||||
|
||||
; BOTH-LABEL: {{^}}local_i32_load
|
||||
; BOTH: ds_read_b32 [[REG:v[0-9]+]], v{{[0-9]+}} offset:28
|
||||
; BOTH: buffer_store_dword [[REG]],
|
||||
; GCN-LABEL: {{^}}local_i32_load
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_read_b32 [[REG:v[0-9]+]], v{{[0-9]+}} offset:28
|
||||
; GCN: buffer_store_dword [[REG]],
|
||||
define amdgpu_kernel void @local_i32_load(i32 addrspace(1)* %out, i32 addrspace(3)* %in) nounwind {
|
||||
%gep = getelementptr i32, i32 addrspace(3)* %in, i32 7
|
||||
%val = load i32, i32 addrspace(3)* %gep, align 4
|
||||
@ -12,19 +16,25 @@ define amdgpu_kernel void @local_i32_load(i32 addrspace(1)* %out, i32 addrspace(
|
||||
ret void
|
||||
}
|
||||
|
||||
; BOTH-LABEL: {{^}}local_i32_load_0_offset
|
||||
; BOTH: ds_read_b32 [[REG:v[0-9]+]], v{{[0-9]+}}
|
||||
; BOTH: buffer_store_dword [[REG]],
|
||||
; GCN-LABEL: {{^}}local_i32_load_0_offset
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_read_b32 [[REG:v[0-9]+]], v{{[0-9]+}}
|
||||
; GCN: buffer_store_dword [[REG]],
|
||||
define amdgpu_kernel void @local_i32_load_0_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %in) nounwind {
|
||||
%val = load i32, i32 addrspace(3)* %in, align 4
|
||||
store i32 %val, i32 addrspace(1)* %out, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; BOTH-LABEL: {{^}}local_i8_load_i16_max_offset:
|
||||
; BOTH-NOT: ADD
|
||||
; BOTH: ds_read_u8 [[REG:v[0-9]+]], {{v[0-9]+}} offset:65535
|
||||
; BOTH: buffer_store_byte [[REG]],
|
||||
; GCN-LABEL: {{^}}local_i8_load_i16_max_offset:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN-NOT: add
|
||||
; GCN: ds_read_u8 [[REG:v[0-9]+]], {{v[0-9]+}} offset:65535
|
||||
; GCN: buffer_store_byte [[REG]],
|
||||
define amdgpu_kernel void @local_i8_load_i16_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)* %in) nounwind {
|
||||
%gep = getelementptr i8, i8 addrspace(3)* %in, i32 65535
|
||||
%val = load i8, i8 addrspace(3)* %gep, align 4
|
||||
@ -32,14 +42,20 @@ define amdgpu_kernel void @local_i8_load_i16_max_offset(i8 addrspace(1)* %out, i
|
||||
ret void
|
||||
}
|
||||
|
||||
; BOTH-LABEL: {{^}}local_i8_load_over_i16_max_offset:
|
||||
; GCN-LABEL: {{^}}local_i8_load_over_i16_max_offset:
|
||||
; SICIVI-DAG: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; The LDS offset will be 65536 bytes, which is larger than the size of LDS on
|
||||
; SI, which is why it is being OR'd with the base pointer.
|
||||
; SI: s_or_b32 [[ADDR:s[0-9]+]], s{{[0-9]+}}, 0x10000
|
||||
; CI: s_add_i32 [[ADDR:s[0-9]+]], s{{[0-9]+}}, 0x10000
|
||||
; BOTH: v_mov_b32_e32 [[VREGADDR:v[0-9]+]], [[ADDR]]
|
||||
; BOTH: ds_read_u8 [[REG:v[0-9]+]], [[VREGADDR]]
|
||||
; BOTH: buffer_store_byte [[REG]],
|
||||
; SI-DAG: s_or_b32 [[ADDR:s[0-9]+]], s{{[0-9]+}}, 0x10000
|
||||
; CI-DAG: s_add_i32 [[ADDR:s[0-9]+]], s{{[0-9]+}}, 0x10000
|
||||
; VI-DAG: s_add_i32 [[ADDR:s[0-9]+]], s{{[0-9]+}}, 0x10000
|
||||
; GFX9-DAG: s_add_i32 [[ADDR:s[0-9]+]], s{{[0-9]+}}, 0x10000
|
||||
|
||||
; GCN-DAG: v_mov_b32_e32 [[VREGADDR:v[0-9]+]], [[ADDR]]
|
||||
; GCN: ds_read_u8 [[REG:v[0-9]+]], [[VREGADDR]]
|
||||
; GCN: buffer_store_byte [[REG]],
|
||||
define amdgpu_kernel void @local_i8_load_over_i16_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)* %in) nounwind {
|
||||
%gep = getelementptr i8, i8 addrspace(3)* %in, i32 65536
|
||||
%val = load i8, i8 addrspace(3)* %gep, align 4
|
||||
@ -47,10 +63,13 @@ define amdgpu_kernel void @local_i8_load_over_i16_max_offset(i8 addrspace(1)* %o
|
||||
ret void
|
||||
}
|
||||
|
||||
; BOTH-LABEL: {{^}}local_i64_load:
|
||||
; BOTH-NOT: ADD
|
||||
; BOTH: ds_read_b64 [[REG:v[[0-9]+:[0-9]+]]], v{{[0-9]+}} offset:56
|
||||
; BOTH: buffer_store_dwordx2 [[REG]],
|
||||
; GCN-LABEL: {{^}}local_i64_load:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN-NOT: add
|
||||
; GCN: ds_read_b64 [[REG:v[[0-9]+:[0-9]+]]], v{{[0-9]+}} offset:56
|
||||
; GCN: buffer_store_dwordx2 [[REG]],
|
||||
define amdgpu_kernel void @local_i64_load(i64 addrspace(1)* %out, i64 addrspace(3)* %in) nounwind {
|
||||
%gep = getelementptr i64, i64 addrspace(3)* %in, i32 7
|
||||
%val = load i64, i64 addrspace(3)* %gep, align 8
|
||||
@ -58,19 +77,25 @@ define amdgpu_kernel void @local_i64_load(i64 addrspace(1)* %out, i64 addrspace(
|
||||
ret void
|
||||
}
|
||||
|
||||
; BOTH-LABEL: {{^}}local_i64_load_0_offset
|
||||
; BOTH: ds_read_b64 [[REG:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}
|
||||
; BOTH: buffer_store_dwordx2 [[REG]],
|
||||
; GCN-LABEL: {{^}}local_i64_load_0_offset
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_read_b64 [[REG:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}
|
||||
; GCN: buffer_store_dwordx2 [[REG]],
|
||||
define amdgpu_kernel void @local_i64_load_0_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %in) nounwind {
|
||||
%val = load i64, i64 addrspace(3)* %in, align 8
|
||||
store i64 %val, i64 addrspace(1)* %out, align 8
|
||||
ret void
|
||||
}
|
||||
|
||||
; BOTH-LABEL: {{^}}local_f64_load:
|
||||
; BOTH-NOT: ADD
|
||||
; BOTH: ds_read_b64 [[REG:v[[0-9]+:[0-9]+]]], v{{[0-9]+}} offset:56
|
||||
; BOTH: buffer_store_dwordx2 [[REG]],
|
||||
; GCN-LABEL: {{^}}local_f64_load:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN-NOT: add
|
||||
; GCN: ds_read_b64 [[REG:v[[0-9]+:[0-9]+]]], v{{[0-9]+}} offset:56
|
||||
; GCN: buffer_store_dwordx2 [[REG]],
|
||||
define amdgpu_kernel void @local_f64_load(double addrspace(1)* %out, double addrspace(3)* %in) nounwind {
|
||||
%gep = getelementptr double, double addrspace(3)* %in, i32 7
|
||||
%val = load double, double addrspace(3)* %gep, align 8
|
||||
@ -78,83 +103,110 @@ define amdgpu_kernel void @local_f64_load(double addrspace(1)* %out, double addr
|
||||
ret void
|
||||
}
|
||||
|
||||
; BOTH-LABEL: {{^}}local_f64_load_0_offset
|
||||
; BOTH: ds_read_b64 [[REG:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}
|
||||
; BOTH: buffer_store_dwordx2 [[REG]],
|
||||
; GCN-LABEL: {{^}}local_f64_load_0_offset
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_read_b64 [[REG:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}
|
||||
; GCN: buffer_store_dwordx2 [[REG]],
|
||||
define amdgpu_kernel void @local_f64_load_0_offset(double addrspace(1)* %out, double addrspace(3)* %in) nounwind {
|
||||
%val = load double, double addrspace(3)* %in, align 8
|
||||
store double %val, double addrspace(1)* %out, align 8
|
||||
ret void
|
||||
}
|
||||
|
||||
; BOTH-LABEL: {{^}}local_i64_store:
|
||||
; BOTH-NOT: ADD
|
||||
; BOTH: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:56
|
||||
; GCN-LABEL: {{^}}local_i64_store:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN-NOT: add
|
||||
; GCN: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:56
|
||||
define amdgpu_kernel void @local_i64_store(i64 addrspace(3)* %out) nounwind {
|
||||
%gep = getelementptr i64, i64 addrspace(3)* %out, i32 7
|
||||
store i64 5678, i64 addrspace(3)* %gep, align 8
|
||||
ret void
|
||||
}
|
||||
|
||||
; BOTH-LABEL: {{^}}local_i64_store_0_offset:
|
||||
; BOTH-NOT: ADD
|
||||
; BOTH: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}
|
||||
; GCN-LABEL: {{^}}local_i64_store_0_offset:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN-NOT: add
|
||||
; GCN: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}
|
||||
define amdgpu_kernel void @local_i64_store_0_offset(i64 addrspace(3)* %out) nounwind {
|
||||
store i64 1234, i64 addrspace(3)* %out, align 8
|
||||
ret void
|
||||
}
|
||||
|
||||
; BOTH-LABEL: {{^}}local_f64_store:
|
||||
; BOTH-NOT: ADD
|
||||
; BOTH: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:56
|
||||
; GCN-LABEL: {{^}}local_f64_store:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN-NOT: add
|
||||
; GCN: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:56
|
||||
define amdgpu_kernel void @local_f64_store(double addrspace(3)* %out) nounwind {
|
||||
%gep = getelementptr double, double addrspace(3)* %out, i32 7
|
||||
store double 16.0, double addrspace(3)* %gep, align 8
|
||||
ret void
|
||||
}
|
||||
|
||||
; BOTH-LABEL: {{^}}local_f64_store_0_offset
|
||||
; BOTH: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}
|
||||
; GCN-LABEL: {{^}}local_f64_store_0_offset
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}
|
||||
define amdgpu_kernel void @local_f64_store_0_offset(double addrspace(3)* %out) nounwind {
|
||||
store double 20.0, double addrspace(3)* %out, align 8
|
||||
ret void
|
||||
}
|
||||
|
||||
; BOTH-LABEL: {{^}}local_v2i64_store:
|
||||
; BOTH-NOT: ADD
|
||||
; BOTH: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:14 offset1:15
|
||||
; BOTH: s_endpgm
|
||||
; GCN-LABEL: {{^}}local_v2i64_store:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN-NOT: add
|
||||
; GCN: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:14 offset1:15
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @local_v2i64_store(<2 x i64> addrspace(3)* %out) nounwind {
|
||||
%gep = getelementptr <2 x i64>, <2 x i64> addrspace(3)* %out, i32 7
|
||||
store <2 x i64> <i64 5678, i64 5678>, <2 x i64> addrspace(3)* %gep, align 16
|
||||
ret void
|
||||
}
|
||||
|
||||
; BOTH-LABEL: {{^}}local_v2i64_store_0_offset:
|
||||
; BOTH-NOT: ADD
|
||||
; BOTH: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset1:1
|
||||
; BOTH: s_endpgm
|
||||
; GCN-LABEL: {{^}}local_v2i64_store_0_offset:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN-NOT: add
|
||||
; GCN: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset1:1
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @local_v2i64_store_0_offset(<2 x i64> addrspace(3)* %out) nounwind {
|
||||
store <2 x i64> <i64 1234, i64 1234>, <2 x i64> addrspace(3)* %out, align 16
|
||||
ret void
|
||||
}
|
||||
|
||||
; BOTH-LABEL: {{^}}local_v4i64_store:
|
||||
; BOTH-NOT: ADD
|
||||
; BOTH-DAG: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:30 offset1:31
|
||||
; BOTH-DAG: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:28 offset1:29
|
||||
; BOTH: s_endpgm
|
||||
; GCN-LABEL: {{^}}local_v4i64_store:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN-NOT: add
|
||||
; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:30 offset1:31
|
||||
; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:28 offset1:29
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @local_v4i64_store(<4 x i64> addrspace(3)* %out) nounwind {
|
||||
%gep = getelementptr <4 x i64>, <4 x i64> addrspace(3)* %out, i32 7
|
||||
store <4 x i64> <i64 5678, i64 5678, i64 5678, i64 5678>, <4 x i64> addrspace(3)* %gep, align 16
|
||||
ret void
|
||||
}
|
||||
|
||||
; BOTH-LABEL: {{^}}local_v4i64_store_0_offset:
|
||||
; BOTH-NOT: ADD
|
||||
; BOTH-DAG: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:2 offset1:3
|
||||
; BOTH-DAG: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset1:1
|
||||
; BOTH: s_endpgm
|
||||
; GCN-LABEL: {{^}}local_v4i64_store_0_offset:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN-NOT: add
|
||||
; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:2 offset1:3
|
||||
; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset1:1
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @local_v4i64_store_0_offset(<4 x i64> addrspace(3)* %out) nounwind {
|
||||
store <4 x i64> <i64 1234, i64 1234, i64 1234, i64 1234>, <4 x i64> addrspace(3)* %out, align 16
|
||||
ret void
|
||||
|
@ -1,13 +1,18 @@
|
||||
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
|
||||
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CIVI -check-prefix=GCN -check-prefix=FUNC %s
|
||||
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=CIVI -check-prefix=GCN -check-prefix=FUNC %s
|
||||
; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
|
||||
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,SICIVI,FUNC %s
|
||||
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,FUNC %s
|
||||
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,SICIVI,FUNC %s
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,FUNC %s
|
||||
; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=EG,FUNC %s
|
||||
|
||||
; FUNC-LABEL: {{^}}lds_atomic_xchg_ret_i32:
|
||||
; EG: LDS_WRXCHG_RET *
|
||||
; GCN: s_load_dword [[SPTR:s[0-9]+]],
|
||||
; GCN: v_mov_b32_e32 [[DATA:v[0-9]+]], 4
|
||||
; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
|
||||
|
||||
; SICIVI-DAG: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN-DAG: s_load_dword [[SPTR:s[0-9]+]],
|
||||
; GCN-DAG: v_mov_b32_e32 [[DATA:v[0-9]+]], 4
|
||||
; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
|
||||
; GCN: ds_wrxchg_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[DATA]]
|
||||
; GCN: buffer_store_dword [[RESULT]],
|
||||
; GCN: s_endpgm
|
||||
@ -18,6 +23,9 @@ define amdgpu_kernel void @lds_atomic_xchg_ret_i32(i32 addrspace(1)* %out, i32 a
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}lds_atomic_xchg_ret_i32_offset:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; EG: LDS_WRXCHG_RET *
|
||||
; GCN: ds_wrxchg_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
|
||||
; GCN: s_endpgm
|
||||
@ -31,9 +39,13 @@ define amdgpu_kernel void @lds_atomic_xchg_ret_i32_offset(i32 addrspace(1)* %out
|
||||
; XXX - Is it really necessary to load 4 into VGPR?
|
||||
; FUNC-LABEL: {{^}}lds_atomic_add_ret_i32:
|
||||
; EG: LDS_ADD_RET *
|
||||
; GCN: s_load_dword [[SPTR:s[0-9]+]],
|
||||
; GCN: v_mov_b32_e32 [[DATA:v[0-9]+]], 4
|
||||
; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
|
||||
|
||||
; SICIVI-DAG: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN-DAG: s_load_dword [[SPTR:s[0-9]+]],
|
||||
; GCN-DAG: v_mov_b32_e32 [[DATA:v[0-9]+]], 4
|
||||
; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
|
||||
; GCN: ds_add_rtn_u32 [[RESULT:v[0-9]+]], [[VPTR]], [[DATA]]
|
||||
; GCN: buffer_store_dword [[RESULT]],
|
||||
; GCN: s_endpgm
|
||||
@ -44,6 +56,9 @@ define amdgpu_kernel void @lds_atomic_add_ret_i32(i32 addrspace(1)* %out, i32 ad
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}lds_atomic_add_ret_i32_offset:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; EG: LDS_ADD_RET *
|
||||
; GCN: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
|
||||
; GCN: s_endpgm
|
||||
@ -55,6 +70,9 @@ define amdgpu_kernel void @lds_atomic_add_ret_i32_offset(i32 addrspace(1)* %out,
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}lds_atomic_add_ret_i32_bad_si_offset:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; EG: LDS_ADD_RET *
|
||||
; SI: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; CIVI: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
|
||||
@ -70,7 +88,11 @@ define amdgpu_kernel void @lds_atomic_add_ret_i32_bad_si_offset(i32 addrspace(1)
|
||||
|
||||
; FUNC-LABEL: {{^}}lds_atomic_add1_ret_i32:
|
||||
; EG: LDS_ADD_RET *
|
||||
; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}}
|
||||
|
||||
; SICIVI-DAG: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}}
|
||||
; GCN: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[ONE]]
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @lds_atomic_add1_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
|
||||
@ -81,7 +103,11 @@ define amdgpu_kernel void @lds_atomic_add1_ret_i32(i32 addrspace(1)* %out, i32 a
|
||||
|
||||
; FUNC-LABEL: {{^}}lds_atomic_add1_ret_i32_offset:
|
||||
; EG: LDS_ADD_RET *
|
||||
; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}}
|
||||
|
||||
; SICIVI-DAG: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}}
|
||||
; GCN: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[ONE]] offset:16
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @lds_atomic_add1_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
|
||||
@ -92,6 +118,9 @@ define amdgpu_kernel void @lds_atomic_add1_ret_i32_offset(i32 addrspace(1)* %out
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}lds_atomic_add1_ret_i32_bad_si_offset:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; EG: LDS_ADD_RET *
|
||||
; SI: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; CIVI: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
|
||||
@ -107,6 +136,10 @@ define amdgpu_kernel void @lds_atomic_add1_ret_i32_bad_si_offset(i32 addrspace(1
|
||||
|
||||
; FUNC-LABEL: {{^}}lds_atomic_sub_ret_i32:
|
||||
; EG: LDS_SUB_RET *
|
||||
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_sub_rtn_u32
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @lds_atomic_sub_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
|
||||
@ -117,6 +150,10 @@ define amdgpu_kernel void @lds_atomic_sub_ret_i32(i32 addrspace(1)* %out, i32 ad
|
||||
|
||||
; FUNC-LABEL: {{^}}lds_atomic_sub_ret_i32_offset:
|
||||
; EG: LDS_SUB_RET *
|
||||
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @lds_atomic_sub_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
|
||||
@ -128,7 +165,11 @@ define amdgpu_kernel void @lds_atomic_sub_ret_i32_offset(i32 addrspace(1)* %out,
|
||||
|
||||
; FUNC-LABEL: {{^}}lds_atomic_sub1_ret_i32:
|
||||
; EG: LDS_SUB_RET *
|
||||
; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}}
|
||||
|
||||
; SICIVI-DAG: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}}
|
||||
; GCN: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[ONE]]
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @lds_atomic_sub1_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
|
||||
@ -139,7 +180,11 @@ define amdgpu_kernel void @lds_atomic_sub1_ret_i32(i32 addrspace(1)* %out, i32 a
|
||||
|
||||
; FUNC-LABEL: {{^}}lds_atomic_sub1_ret_i32_offset:
|
||||
; EG: LDS_SUB_RET *
|
||||
; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}}
|
||||
|
||||
; SICIVI-DAG: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}}
|
||||
; GCN: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[ONE]] offset:16
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @lds_atomic_sub1_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
|
||||
@ -151,6 +196,10 @@ define amdgpu_kernel void @lds_atomic_sub1_ret_i32_offset(i32 addrspace(1)* %out
|
||||
|
||||
; FUNC-LABEL: {{^}}lds_atomic_and_ret_i32:
|
||||
; EG: LDS_AND_RET *
|
||||
|
||||
; SICIVI-DAG: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_and_rtn_b32
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @lds_atomic_and_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
|
||||
@ -160,6 +209,9 @@ define amdgpu_kernel void @lds_atomic_and_ret_i32(i32 addrspace(1)* %out, i32 ad
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}lds_atomic_and_ret_i32_offset:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; EG: LDS_AND_RET *
|
||||
; GCN: ds_and_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
|
||||
; GCN: s_endpgm
|
||||
@ -171,6 +223,9 @@ define amdgpu_kernel void @lds_atomic_and_ret_i32_offset(i32 addrspace(1)* %out,
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}lds_atomic_or_ret_i32:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; EG: LDS_OR_RET *
|
||||
; GCN: ds_or_rtn_b32
|
||||
; GCN: s_endpgm
|
||||
@ -181,6 +236,9 @@ define amdgpu_kernel void @lds_atomic_or_ret_i32(i32 addrspace(1)* %out, i32 add
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}lds_atomic_or_ret_i32_offset:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; EG: LDS_OR_RET *
|
||||
; GCN: ds_or_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
|
||||
; GCN: s_endpgm
|
||||
@ -192,6 +250,9 @@ define amdgpu_kernel void @lds_atomic_or_ret_i32_offset(i32 addrspace(1)* %out,
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}lds_atomic_xor_ret_i32:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; EG: LDS_XOR_RET *
|
||||
; GCN: ds_xor_rtn_b32
|
||||
; GCN: s_endpgm
|
||||
@ -202,6 +263,9 @@ define amdgpu_kernel void @lds_atomic_xor_ret_i32(i32 addrspace(1)* %out, i32 ad
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}lds_atomic_xor_ret_i32_offset:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; EG: LDS_XOR_RET *
|
||||
; GCN: ds_xor_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
|
||||
; GCN: s_endpgm
|
||||
@ -221,6 +285,9 @@ define amdgpu_kernel void @lds_atomic_xor_ret_i32_offset(i32 addrspace(1)* %out,
|
||||
; }
|
||||
|
||||
; FUNC-LABEL: {{^}}lds_atomic_min_ret_i32:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; EG: LDS_MIN_INT_RET *
|
||||
; GCN: ds_min_rtn_i32
|
||||
; GCN: s_endpgm
|
||||
@ -231,6 +298,9 @@ define amdgpu_kernel void @lds_atomic_min_ret_i32(i32 addrspace(1)* %out, i32 ad
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}lds_atomic_min_ret_i32_offset:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; EG: LDS_MIN_INT_RET *
|
||||
; GCN: ds_min_rtn_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
|
||||
; GCN: s_endpgm
|
||||
@ -242,6 +312,9 @@ define amdgpu_kernel void @lds_atomic_min_ret_i32_offset(i32 addrspace(1)* %out,
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}lds_atomic_max_ret_i32:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; EG: LDS_MAX_INT_RET *
|
||||
; GCN: ds_max_rtn_i32
|
||||
; GCN: s_endpgm
|
||||
@ -252,6 +325,9 @@ define amdgpu_kernel void @lds_atomic_max_ret_i32(i32 addrspace(1)* %out, i32 ad
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}lds_atomic_max_ret_i32_offset:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; EG: LDS_MAX_INT_RET *
|
||||
; GCN: ds_max_rtn_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
|
||||
; GCN: s_endpgm
|
||||
@ -263,6 +339,9 @@ define amdgpu_kernel void @lds_atomic_max_ret_i32_offset(i32 addrspace(1)* %out,
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}lds_atomic_umin_ret_i32:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; EG: LDS_MIN_UINT_RET *
|
||||
; GCN: ds_min_rtn_u32
|
||||
; GCN: s_endpgm
|
||||
@ -273,6 +352,9 @@ define amdgpu_kernel void @lds_atomic_umin_ret_i32(i32 addrspace(1)* %out, i32 a
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}lds_atomic_umin_ret_i32_offset:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; EG: LDS_MIN_UINT_RET *
|
||||
; GCN: ds_min_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
|
||||
; GCN: s_endpgm
|
||||
@ -284,6 +366,9 @@ define amdgpu_kernel void @lds_atomic_umin_ret_i32_offset(i32 addrspace(1)* %out
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}lds_atomic_umax_ret_i32:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; EG: LDS_MAX_UINT_RET *
|
||||
; GCN: ds_max_rtn_u32
|
||||
; GCN: s_endpgm
|
||||
@ -294,6 +379,9 @@ define amdgpu_kernel void @lds_atomic_umax_ret_i32(i32 addrspace(1)* %out, i32 a
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}lds_atomic_umax_ret_i32_offset:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; EG: LDS_MAX_UINT_RET *
|
||||
; GCN: ds_max_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
|
||||
; GCN: s_endpgm
|
||||
@ -305,9 +393,12 @@ define amdgpu_kernel void @lds_atomic_umax_ret_i32_offset(i32 addrspace(1)* %out
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}lds_atomic_xchg_noret_i32:
|
||||
; GCN: s_load_dword [[SPTR:s[0-9]+]],
|
||||
; GCN: v_mov_b32_e32 [[DATA:v[0-9]+]], 4
|
||||
; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
|
||||
; SICIVI-DAG: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN-DAG: s_load_dword [[SPTR:s[0-9]+]],
|
||||
; GCN-DAG: v_mov_b32_e32 [[DATA:v[0-9]+]], 4
|
||||
; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
|
||||
; GCN: ds_wrxchg_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[DATA]]
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @lds_atomic_xchg_noret_i32(i32 addrspace(3)* %ptr) nounwind {
|
||||
@ -316,6 +407,9 @@ define amdgpu_kernel void @lds_atomic_xchg_noret_i32(i32 addrspace(3)* %ptr) nou
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}lds_atomic_xchg_noret_i32_offset:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_wrxchg_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @lds_atomic_xchg_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
|
||||
@ -325,9 +419,12 @@ define amdgpu_kernel void @lds_atomic_xchg_noret_i32_offset(i32 addrspace(3)* %p
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}lds_atomic_add_noret_i32:
|
||||
; GCN: s_load_dword [[SPTR:s[0-9]+]],
|
||||
; GCN: v_mov_b32_e32 [[DATA:v[0-9]+]], 4
|
||||
; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
|
||||
; SICIVI-DAG: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN-DAG: s_load_dword [[SPTR:s[0-9]+]],
|
||||
; GCN-DAG: v_mov_b32_e32 [[DATA:v[0-9]+]], 4
|
||||
; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
|
||||
; GCN: ds_add_u32 [[VPTR]], [[DATA]]
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @lds_atomic_add_noret_i32(i32 addrspace(3)* %ptr) nounwind {
|
||||
@ -336,6 +433,9 @@ define amdgpu_kernel void @lds_atomic_add_noret_i32(i32 addrspace(3)* %ptr) noun
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}lds_atomic_add_noret_i32_offset:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @lds_atomic_add_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
|
||||
@ -345,6 +445,9 @@ define amdgpu_kernel void @lds_atomic_add_noret_i32_offset(i32 addrspace(3)* %pt
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}lds_atomic_add_noret_i32_bad_si_offset
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; SI: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; CIVI: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
|
||||
; GCN: s_endpgm
|
||||
@ -357,7 +460,10 @@ define amdgpu_kernel void @lds_atomic_add_noret_i32_bad_si_offset(i32 addrspace(
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}lds_atomic_add1_noret_i32:
|
||||
; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}}
|
||||
; SICIVI-DAG: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}}
|
||||
; GCN: ds_add_u32 v{{[0-9]+}}, [[ONE]]
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @lds_atomic_add1_noret_i32(i32 addrspace(3)* %ptr) nounwind {
|
||||
@ -366,7 +472,10 @@ define amdgpu_kernel void @lds_atomic_add1_noret_i32(i32 addrspace(3)* %ptr) nou
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}lds_atomic_add1_noret_i32_offset:
|
||||
; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}}
|
||||
; SICIVI-DAG: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}}
|
||||
; GCN: ds_add_u32 v{{[0-9]+}}, [[ONE]] offset:16
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @lds_atomic_add1_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
|
||||
@ -376,6 +485,9 @@ define amdgpu_kernel void @lds_atomic_add1_noret_i32_offset(i32 addrspace(3)* %p
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}lds_atomic_add1_noret_i32_bad_si_offset:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; SI: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; CIVI: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
|
||||
; GCN: s_endpgm
|
||||
@ -388,6 +500,9 @@ define amdgpu_kernel void @lds_atomic_add1_noret_i32_bad_si_offset(i32 addrspace
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}lds_atomic_sub_noret_i32:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_sub_u32
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @lds_atomic_sub_noret_i32(i32 addrspace(3)* %ptr) nounwind {
|
||||
@ -396,6 +511,9 @@ define amdgpu_kernel void @lds_atomic_sub_noret_i32(i32 addrspace(3)* %ptr) noun
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}lds_atomic_sub_noret_i32_offset:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_sub_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @lds_atomic_sub_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
|
||||
@ -405,7 +523,10 @@ define amdgpu_kernel void @lds_atomic_sub_noret_i32_offset(i32 addrspace(3)* %pt
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}lds_atomic_sub1_noret_i32:
|
||||
; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}}
|
||||
; SICIVI-DAG: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}}
|
||||
; GCN: ds_sub_u32 v{{[0-9]+}}, [[ONE]]
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @lds_atomic_sub1_noret_i32(i32 addrspace(3)* %ptr) nounwind {
|
||||
@ -414,7 +535,10 @@ define amdgpu_kernel void @lds_atomic_sub1_noret_i32(i32 addrspace(3)* %ptr) nou
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}lds_atomic_sub1_noret_i32_offset:
|
||||
; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}}
|
||||
; SICIVI-DAG: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}}
|
||||
; GCN: ds_sub_u32 v{{[0-9]+}}, [[ONE]] offset:16
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @lds_atomic_sub1_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
|
||||
@ -424,6 +548,9 @@ define amdgpu_kernel void @lds_atomic_sub1_noret_i32_offset(i32 addrspace(3)* %p
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}lds_atomic_and_noret_i32:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_and_b32
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @lds_atomic_and_noret_i32(i32 addrspace(3)* %ptr) nounwind {
|
||||
@ -432,6 +559,9 @@ define amdgpu_kernel void @lds_atomic_and_noret_i32(i32 addrspace(3)* %ptr) noun
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}lds_atomic_and_noret_i32_offset:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_and_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @lds_atomic_and_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
|
||||
@ -441,6 +571,9 @@ define amdgpu_kernel void @lds_atomic_and_noret_i32_offset(i32 addrspace(3)* %pt
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}lds_atomic_or_noret_i32:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_or_b32
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @lds_atomic_or_noret_i32(i32 addrspace(3)* %ptr) nounwind {
|
||||
@ -449,6 +582,9 @@ define amdgpu_kernel void @lds_atomic_or_noret_i32(i32 addrspace(3)* %ptr) nounw
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}lds_atomic_or_noret_i32_offset:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_or_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @lds_atomic_or_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
|
||||
@ -458,6 +594,9 @@ define amdgpu_kernel void @lds_atomic_or_noret_i32_offset(i32 addrspace(3)* %ptr
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}lds_atomic_xor_noret_i32:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_xor_b32
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @lds_atomic_xor_noret_i32(i32 addrspace(3)* %ptr) nounwind {
|
||||
@ -466,6 +605,9 @@ define amdgpu_kernel void @lds_atomic_xor_noret_i32(i32 addrspace(3)* %ptr) noun
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}lds_atomic_xor_noret_i32_offset:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_xor_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @lds_atomic_xor_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
|
||||
@ -482,6 +624,9 @@ define amdgpu_kernel void @lds_atomic_xor_noret_i32_offset(i32 addrspace(3)* %pt
|
||||
; }
|
||||
|
||||
; FUNC-LABEL: {{^}}lds_atomic_min_noret_i32:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_min_i32
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @lds_atomic_min_noret_i32(i32 addrspace(3)* %ptr) nounwind {
|
||||
@ -490,6 +635,9 @@ define amdgpu_kernel void @lds_atomic_min_noret_i32(i32 addrspace(3)* %ptr) noun
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}lds_atomic_min_noret_i32_offset:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_min_i32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @lds_atomic_min_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
|
||||
@ -499,6 +647,9 @@ define amdgpu_kernel void @lds_atomic_min_noret_i32_offset(i32 addrspace(3)* %pt
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}lds_atomic_max_noret_i32:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_max_i32
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @lds_atomic_max_noret_i32(i32 addrspace(3)* %ptr) nounwind {
|
||||
@ -507,6 +658,9 @@ define amdgpu_kernel void @lds_atomic_max_noret_i32(i32 addrspace(3)* %ptr) noun
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}lds_atomic_max_noret_i32_offset:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_max_i32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @lds_atomic_max_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
|
||||
@ -516,6 +670,9 @@ define amdgpu_kernel void @lds_atomic_max_noret_i32_offset(i32 addrspace(3)* %pt
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}lds_atomic_umin_noret_i32:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_min_u32
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @lds_atomic_umin_noret_i32(i32 addrspace(3)* %ptr) nounwind {
|
||||
@ -524,6 +681,9 @@ define amdgpu_kernel void @lds_atomic_umin_noret_i32(i32 addrspace(3)* %ptr) nou
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}lds_atomic_umin_noret_i32_offset:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_min_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @lds_atomic_umin_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
|
||||
@ -533,6 +693,9 @@ define amdgpu_kernel void @lds_atomic_umin_noret_i32_offset(i32 addrspace(3)* %p
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}lds_atomic_umax_noret_i32:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_max_u32
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @lds_atomic_umax_noret_i32(i32 addrspace(3)* %ptr) nounwind {
|
||||
@ -541,6 +704,9 @@ define amdgpu_kernel void @lds_atomic_umax_noret_i32(i32 addrspace(3)* %ptr) nou
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}lds_atomic_umax_noret_i32_offset:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_max_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @lds_atomic_umax_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
|
||||
|
@ -1,7 +1,11 @@
|
||||
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=SI -check-prefix=GCN %s
|
||||
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=VI -check-prefix=GCN %s
|
||||
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,SI,SICIVI %s
|
||||
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,VI,SICIVI,GFX89 %s
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,GFX9,GFX89 %s
|
||||
|
||||
; GCN-LABEL: {{^}}lds_atomic_xchg_ret_i64:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_wrxchg_rtn_b64
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @lds_atomic_xchg_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
|
||||
@ -11,6 +15,9 @@ define amdgpu_kernel void @lds_atomic_xchg_ret_i64(i64 addrspace(1)* %out, i64 a
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}lds_atomic_xchg_ret_i64_offset:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_wrxchg_rtn_b64 {{.*}} offset:32
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @lds_atomic_xchg_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
|
||||
@ -21,6 +28,9 @@ define amdgpu_kernel void @lds_atomic_xchg_ret_i64_offset(i64 addrspace(1)* %out
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}lds_atomic_add_ret_i64:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_add_rtn_u64
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @lds_atomic_add_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
|
||||
@ -30,10 +40,13 @@ define amdgpu_kernel void @lds_atomic_add_ret_i64(i64 addrspace(1)* %out, i64 ad
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}lds_atomic_add_ret_i64_offset:
|
||||
; SI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
|
||||
; VI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
|
||||
; GCN: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], 9
|
||||
; GCN: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0
|
||||
; SICIVI-DAG: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; SI-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
|
||||
; GFX89-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
|
||||
; GCN-DAG: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], 9
|
||||
; GCN-DAG: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0
|
||||
; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
|
||||
; GCN: ds_add_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} offset:32
|
||||
; GCN: buffer_store_dwordx2 [[RESULT]],
|
||||
@ -46,9 +59,12 @@ define amdgpu_kernel void @lds_atomic_add_ret_i64_offset(i64 addrspace(1)* %out,
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}lds_atomic_add1_ret_i64:
|
||||
; GCN: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], 1{{$}}
|
||||
; GCN: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0{{$}}
|
||||
; GCN: ds_add_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}}
|
||||
; SICIVI-DAG: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN-DAG: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], 1{{$}}
|
||||
; GCN-DAG: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0{{$}}
|
||||
; GCN: ds_add_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], {{v[0-9]+}}, v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}}
|
||||
; GCN: buffer_store_dwordx2 [[RESULT]],
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @lds_atomic_add1_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
|
||||
@ -58,6 +74,9 @@ define amdgpu_kernel void @lds_atomic_add1_ret_i64(i64 addrspace(1)* %out, i64 a
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}lds_atomic_add1_ret_i64_offset:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_add_rtn_u64 {{.*}} offset:32
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @lds_atomic_add1_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
|
||||
@ -68,6 +87,9 @@ define amdgpu_kernel void @lds_atomic_add1_ret_i64_offset(i64 addrspace(1)* %out
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}lds_atomic_sub_ret_i64:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_sub_rtn_u64
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @lds_atomic_sub_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
|
||||
@ -77,6 +99,9 @@ define amdgpu_kernel void @lds_atomic_sub_ret_i64(i64 addrspace(1)* %out, i64 ad
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}lds_atomic_sub_ret_i64_offset:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_sub_rtn_u64 {{.*}} offset:32
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @lds_atomic_sub_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
|
||||
@ -87,9 +112,12 @@ define amdgpu_kernel void @lds_atomic_sub_ret_i64_offset(i64 addrspace(1)* %out,
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}lds_atomic_sub1_ret_i64:
|
||||
; GCN: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], 1{{$}}
|
||||
; GCN: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0{{$}}
|
||||
; GCN: ds_sub_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}}
|
||||
; SICIVI-DAG: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN-DAG: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], 1{{$}}
|
||||
; GCN-DAG: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0{{$}}
|
||||
; GCN: ds_sub_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], {{v[0-9]+}}, v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}}
|
||||
; GCN: buffer_store_dwordx2 [[RESULT]],
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @lds_atomic_sub1_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
|
||||
@ -99,6 +127,9 @@ define amdgpu_kernel void @lds_atomic_sub1_ret_i64(i64 addrspace(1)* %out, i64 a
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}lds_atomic_sub1_ret_i64_offset:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_sub_rtn_u64 {{.*}} offset:32
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @lds_atomic_sub1_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
|
||||
@ -109,6 +140,9 @@ define amdgpu_kernel void @lds_atomic_sub1_ret_i64_offset(i64 addrspace(1)* %out
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}lds_atomic_and_ret_i64:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_and_rtn_b64
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @lds_atomic_and_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
|
||||
@ -118,6 +152,9 @@ define amdgpu_kernel void @lds_atomic_and_ret_i64(i64 addrspace(1)* %out, i64 ad
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}lds_atomic_and_ret_i64_offset:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_and_rtn_b64 {{.*}} offset:32
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @lds_atomic_and_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
|
||||
@ -128,6 +165,9 @@ define amdgpu_kernel void @lds_atomic_and_ret_i64_offset(i64 addrspace(1)* %out,
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}lds_atomic_or_ret_i64:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_or_rtn_b64
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @lds_atomic_or_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
|
||||
@ -137,6 +177,9 @@ define amdgpu_kernel void @lds_atomic_or_ret_i64(i64 addrspace(1)* %out, i64 add
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}lds_atomic_or_ret_i64_offset:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_or_rtn_b64 {{.*}} offset:32
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @lds_atomic_or_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
|
||||
@ -147,6 +190,9 @@ define amdgpu_kernel void @lds_atomic_or_ret_i64_offset(i64 addrspace(1)* %out,
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}lds_atomic_xor_ret_i64:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_xor_rtn_b64
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @lds_atomic_xor_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
|
||||
@ -156,6 +202,9 @@ define amdgpu_kernel void @lds_atomic_xor_ret_i64(i64 addrspace(1)* %out, i64 ad
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}lds_atomic_xor_ret_i64_offset:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_xor_rtn_b64 {{.*}} offset:32
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @lds_atomic_xor_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
|
||||
@ -174,6 +223,9 @@ define amdgpu_kernel void @lds_atomic_xor_ret_i64_offset(i64 addrspace(1)* %out,
|
||||
; }
|
||||
|
||||
; GCN-LABEL: {{^}}lds_atomic_min_ret_i64:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_min_rtn_i64
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @lds_atomic_min_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
|
||||
@ -183,6 +235,9 @@ define amdgpu_kernel void @lds_atomic_min_ret_i64(i64 addrspace(1)* %out, i64 ad
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}lds_atomic_min_ret_i64_offset:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_min_rtn_i64 {{.*}} offset:32
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @lds_atomic_min_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
|
||||
@ -193,6 +248,9 @@ define amdgpu_kernel void @lds_atomic_min_ret_i64_offset(i64 addrspace(1)* %out,
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}lds_atomic_max_ret_i64:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_max_rtn_i64
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @lds_atomic_max_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
|
||||
@ -202,6 +260,9 @@ define amdgpu_kernel void @lds_atomic_max_ret_i64(i64 addrspace(1)* %out, i64 ad
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}lds_atomic_max_ret_i64_offset:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_max_rtn_i64 {{.*}} offset:32
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @lds_atomic_max_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
|
||||
@ -212,6 +273,9 @@ define amdgpu_kernel void @lds_atomic_max_ret_i64_offset(i64 addrspace(1)* %out,
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}lds_atomic_umin_ret_i64:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_min_rtn_u64
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @lds_atomic_umin_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
|
||||
@ -221,6 +285,9 @@ define amdgpu_kernel void @lds_atomic_umin_ret_i64(i64 addrspace(1)* %out, i64 a
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}lds_atomic_umin_ret_i64_offset:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_min_rtn_u64 {{.*}} offset:32
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @lds_atomic_umin_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
|
||||
@ -231,6 +298,9 @@ define amdgpu_kernel void @lds_atomic_umin_ret_i64_offset(i64 addrspace(1)* %out
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}lds_atomic_umax_ret_i64:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_max_rtn_u64
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @lds_atomic_umax_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
|
||||
@ -240,6 +310,9 @@ define amdgpu_kernel void @lds_atomic_umax_ret_i64(i64 addrspace(1)* %out, i64 a
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}lds_atomic_umax_ret_i64_offset:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_max_rtn_u64 {{.*}} offset:32
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @lds_atomic_umax_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
|
||||
@ -250,6 +323,9 @@ define amdgpu_kernel void @lds_atomic_umax_ret_i64_offset(i64 addrspace(1)* %out
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}lds_atomic_xchg_noret_i64:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_wrxchg_rtn_b64
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @lds_atomic_xchg_noret_i64(i64 addrspace(3)* %ptr) nounwind {
|
||||
@ -258,6 +334,9 @@ define amdgpu_kernel void @lds_atomic_xchg_noret_i64(i64 addrspace(3)* %ptr) nou
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}lds_atomic_xchg_noret_i64_offset:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_wrxchg_rtn_b64 {{.*}} offset:32
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @lds_atomic_xchg_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
|
||||
@ -267,6 +346,9 @@ define amdgpu_kernel void @lds_atomic_xchg_noret_i64_offset(i64 addrspace(3)* %p
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}lds_atomic_add_noret_i64:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_add_u64
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @lds_atomic_add_noret_i64(i64 addrspace(3)* %ptr) nounwind {
|
||||
@ -275,12 +357,15 @@ define amdgpu_kernel void @lds_atomic_add_noret_i64(i64 addrspace(3)* %ptr) noun
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}lds_atomic_add_noret_i64_offset:
|
||||
; SI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x9
|
||||
; VI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x24
|
||||
; GCN: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], 9
|
||||
; GCN: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0
|
||||
; SICIVI-DAG: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; SI-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x9
|
||||
; GFX89-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x24
|
||||
; GCN-DAG: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], 9
|
||||
; GCN-DAG: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0
|
||||
; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
|
||||
; GCN: ds_add_u64 [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} offset:32
|
||||
; GCN: ds_add_u64 {{v[0-9]+}}, v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} offset:32
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @lds_atomic_add_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
|
||||
%gep = getelementptr i64, i64 addrspace(3)* %ptr, i64 4
|
||||
@ -289,9 +374,12 @@ define amdgpu_kernel void @lds_atomic_add_noret_i64_offset(i64 addrspace(3)* %pt
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}lds_atomic_add1_noret_i64:
|
||||
; SICIVI-DAG: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN-DAG: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], 1{{$}}
|
||||
; GCN-DAG: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0{{$}}
|
||||
; GCN: ds_add_u64 [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}}
|
||||
; GCN: ds_add_u64 {{v[0-9]+}}, v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}}
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @lds_atomic_add1_noret_i64(i64 addrspace(3)* %ptr) nounwind {
|
||||
%result = atomicrmw add i64 addrspace(3)* %ptr, i64 1 seq_cst
|
||||
@ -299,6 +387,9 @@ define amdgpu_kernel void @lds_atomic_add1_noret_i64(i64 addrspace(3)* %ptr) nou
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}lds_atomic_add1_noret_i64_offset:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_add_u64 {{.*}} offset:32
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @lds_atomic_add1_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
|
||||
@ -308,6 +399,9 @@ define amdgpu_kernel void @lds_atomic_add1_noret_i64_offset(i64 addrspace(3)* %p
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}lds_atomic_sub_noret_i64:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_sub_u64
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @lds_atomic_sub_noret_i64(i64 addrspace(3)* %ptr) nounwind {
|
||||
@ -316,6 +410,9 @@ define amdgpu_kernel void @lds_atomic_sub_noret_i64(i64 addrspace(3)* %ptr) noun
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}lds_atomic_sub_noret_i64_offset:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_sub_u64 {{.*}} offset:32
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @lds_atomic_sub_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
|
||||
@ -325,9 +422,12 @@ define amdgpu_kernel void @lds_atomic_sub_noret_i64_offset(i64 addrspace(3)* %pt
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}lds_atomic_sub1_noret_i64:
|
||||
; GCN: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], 1{{$}}
|
||||
; GCN: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0{{$}}
|
||||
; GCN: ds_sub_u64 [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}}
|
||||
; SICIVI-DAG: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN-DAG: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], 1{{$}}
|
||||
; GCN-DAG: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0{{$}}
|
||||
; GCN: ds_sub_u64 {{v[0-9]+}}, v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}}
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @lds_atomic_sub1_noret_i64(i64 addrspace(3)* %ptr) nounwind {
|
||||
%result = atomicrmw sub i64 addrspace(3)* %ptr, i64 1 seq_cst
|
||||
@ -335,6 +435,9 @@ define amdgpu_kernel void @lds_atomic_sub1_noret_i64(i64 addrspace(3)* %ptr) nou
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}lds_atomic_sub1_noret_i64_offset:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_sub_u64 {{.*}} offset:32
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @lds_atomic_sub1_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
|
||||
@ -344,6 +447,9 @@ define amdgpu_kernel void @lds_atomic_sub1_noret_i64_offset(i64 addrspace(3)* %p
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}lds_atomic_and_noret_i64:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_and_b64
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @lds_atomic_and_noret_i64(i64 addrspace(3)* %ptr) nounwind {
|
||||
@ -352,6 +458,9 @@ define amdgpu_kernel void @lds_atomic_and_noret_i64(i64 addrspace(3)* %ptr) noun
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}lds_atomic_and_noret_i64_offset:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_and_b64 {{.*}} offset:32
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @lds_atomic_and_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
|
||||
@ -361,6 +470,9 @@ define amdgpu_kernel void @lds_atomic_and_noret_i64_offset(i64 addrspace(3)* %pt
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}lds_atomic_or_noret_i64:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_or_b64
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @lds_atomic_or_noret_i64(i64 addrspace(3)* %ptr) nounwind {
|
||||
@ -369,6 +481,9 @@ define amdgpu_kernel void @lds_atomic_or_noret_i64(i64 addrspace(3)* %ptr) nounw
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}lds_atomic_or_noret_i64_offset:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_or_b64 {{.*}} offset:32
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @lds_atomic_or_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
|
||||
@ -378,6 +493,9 @@ define amdgpu_kernel void @lds_atomic_or_noret_i64_offset(i64 addrspace(3)* %ptr
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}lds_atomic_xor_noret_i64:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_xor_b64
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @lds_atomic_xor_noret_i64(i64 addrspace(3)* %ptr) nounwind {
|
||||
@ -386,6 +504,9 @@ define amdgpu_kernel void @lds_atomic_xor_noret_i64(i64 addrspace(3)* %ptr) noun
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}lds_atomic_xor_noret_i64_offset:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_xor_b64 {{.*}} offset:32
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @lds_atomic_xor_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
|
||||
@ -402,6 +523,9 @@ define amdgpu_kernel void @lds_atomic_xor_noret_i64_offset(i64 addrspace(3)* %pt
|
||||
; }
|
||||
|
||||
; GCN-LABEL: {{^}}lds_atomic_min_noret_i64:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_min_i64
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @lds_atomic_min_noret_i64(i64 addrspace(3)* %ptr) nounwind {
|
||||
@ -410,6 +534,9 @@ define amdgpu_kernel void @lds_atomic_min_noret_i64(i64 addrspace(3)* %ptr) noun
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}lds_atomic_min_noret_i64_offset:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_min_i64 {{.*}} offset:32
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @lds_atomic_min_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
|
||||
@ -419,6 +546,9 @@ define amdgpu_kernel void @lds_atomic_min_noret_i64_offset(i64 addrspace(3)* %pt
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}lds_atomic_max_noret_i64:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_max_i64
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @lds_atomic_max_noret_i64(i64 addrspace(3)* %ptr) nounwind {
|
||||
@ -427,6 +557,9 @@ define amdgpu_kernel void @lds_atomic_max_noret_i64(i64 addrspace(3)* %ptr) noun
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}lds_atomic_max_noret_i64_offset:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_max_i64 {{.*}} offset:32
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @lds_atomic_max_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
|
||||
@ -436,6 +569,9 @@ define amdgpu_kernel void @lds_atomic_max_noret_i64_offset(i64 addrspace(3)* %pt
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}lds_atomic_umin_noret_i64:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_min_u64
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @lds_atomic_umin_noret_i64(i64 addrspace(3)* %ptr) nounwind {
|
||||
@ -444,6 +580,9 @@ define amdgpu_kernel void @lds_atomic_umin_noret_i64(i64 addrspace(3)* %ptr) nou
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}lds_atomic_umin_noret_i64_offset:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_min_u64 {{.*}} offset:32
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @lds_atomic_umin_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
|
||||
@ -453,6 +592,9 @@ define amdgpu_kernel void @lds_atomic_umin_noret_i64_offset(i64 addrspace(3)* %p
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}lds_atomic_umax_noret_i64:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_max_u64
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @lds_atomic_umax_noret_i64(i64 addrspace(3)* %ptr) nounwind {
|
||||
@ -461,6 +603,9 @@ define amdgpu_kernel void @lds_atomic_umax_noret_i64(i64 addrspace(3)* %ptr) nou
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}lds_atomic_umax_noret_i64_offset:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; GCN: ds_max_u64 {{.*}} offset:32
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @lds_atomic_umax_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
|
||||
|
@ -233,7 +233,7 @@ bb:
|
||||
; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
|
||||
; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
|
||||
; GCN: ds_read_u16 [[PACKED:v[0-9]+]]
|
||||
; GCN-NEXT: s_waitcnt
|
||||
; GCN: s_waitcnt
|
||||
; GCN: ds_read_u16_d16_hi [[PACKED]]
|
||||
|
||||
; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[PACKED]] neg_lo:[0,0,1] neg_hi:[0,0,1]{{$}}
|
||||
|
@ -1,9 +1,13 @@
|
||||
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
|
||||
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
|
||||
; RUN: llc -march=r600 -mtriple=r600---amdgiz -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
|
||||
; RUN: llc -march=r600 -mtriple=r600---amdgiz -mcpu=cayman < %s | FileCheck -check-prefix=CM -check-prefix=FUNC %s
|
||||
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
|
||||
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
|
||||
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,FUNC %s
|
||||
; RUN: llc -march=r600 -mtriple=r600---amdgiz -mcpu=redwood < %s | FileCheck -check-prefixes=EG,FUNC %s
|
||||
; RUN: llc -march=r600 -mtriple=r600---amdgiz -mcpu=cayman < %s | FileCheck -check-prefixes=CM,FUNC %s
|
||||
|
||||
; FUNC-LABEL: {{^}}store_local_i1:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; EG: LDS_BYTE_WRITE
|
||||
|
||||
; CM: LDS_BYTE_WRITE
|
||||
@ -16,6 +20,9 @@ entry:
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}store_local_i8:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; EG: LDS_BYTE_WRITE
|
||||
|
||||
; CM: LDS_BYTE_WRITE
|
||||
@ -27,6 +34,9 @@ define amdgpu_kernel void @store_local_i8(i8 addrspace(3)* %out, i8 %in) {
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}store_local_i16:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; EG: LDS_SHORT_WRITE
|
||||
|
||||
; CM: LDS_SHORT_WRITE
|
||||
@ -38,6 +48,9 @@ define amdgpu_kernel void @store_local_i16(i16 addrspace(3)* %out, i16 %in) {
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}store_local_v2i16:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; EG: LDS_WRITE
|
||||
|
||||
; CM: LDS_WRITE
|
||||
@ -50,6 +63,9 @@ entry:
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}store_local_v4i8:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; EG: LDS_WRITE
|
||||
|
||||
; CM: LDS_WRITE
|
||||
@ -62,6 +78,9 @@ entry:
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}store_local_v4i8_unaligned:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; EG: LDS_BYTE_WRITE
|
||||
; EG: LDS_BYTE_WRITE
|
||||
; EG: LDS_BYTE_WRITE
|
||||
@ -85,6 +104,9 @@ entry:
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}store_local_v4i8_halfaligned:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; EG: LDS_SHORT_WRITE
|
||||
; EG: LDS_SHORT_WRITE
|
||||
; EG-NOT: LDS_WRITE
|
||||
@ -102,6 +124,9 @@ entry:
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}store_local_v2i32:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; EG: LDS_WRITE
|
||||
; EG: LDS_WRITE
|
||||
; EG-NOT: LDS_WRITE
|
||||
@ -118,6 +143,9 @@ entry:
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}store_local_v4i32:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; EG: LDS_WRITE
|
||||
; EG: LDS_WRITE
|
||||
; EG: LDS_WRITE
|
||||
@ -136,6 +164,9 @@ entry:
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}store_local_v4i32_align4:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; EG: LDS_WRITE
|
||||
; EG: LDS_WRITE
|
||||
; EG: LDS_WRITE
|
||||
@ -155,6 +186,9 @@ entry:
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}store_local_i64_i8:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; EG: LDS_BYTE_WRITE
|
||||
; GCN: ds_write_b8
|
||||
define amdgpu_kernel void @store_local_i64_i8(i8 addrspace(3)* %out, i64 %in) {
|
||||
@ -165,6 +199,9 @@ entry:
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}store_local_i64_i16:
|
||||
; SICIVI: s_mov_b32 m0
|
||||
; GFX9-NOT: m0
|
||||
|
||||
; EG: LDS_SHORT_WRITE
|
||||
; GCN: ds_write_b16
|
||||
define amdgpu_kernel void @store_local_i64_i16(i16 addrspace(3)* %out, i64 %in) {
|
||||
|
@ -29,7 +29,7 @@ define amdgpu_kernel void @local_store_i48(i48 addrspace(3)* %ptr, i48 %arg) #0
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}local_store_i65:
|
||||
; GCN-DAG: ds_write_b8 v{{[0-9]+}}, v0 offset:8
|
||||
; GCN-DAG: ds_write_b8 v{{[0-9]+}}, v{{[0-9]+}} offset:8
|
||||
; GCN-DAG: ds_write_b64
|
||||
define amdgpu_kernel void @local_store_i65(i65 addrspace(3)* %ptr, i65 %arg) #0 {
|
||||
store i65 %arg, i65 addrspace(3)* %ptr, align 8
|
||||
|
Loading…
x
Reference in New Issue
Block a user