mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2025-02-13 08:54:59 +00:00
AMDGPU: Make v2i16/v2f16 legal on VI
This usually results in better code. Fixes using inline asm with short2, and also fixes having a different ABI for function parameters between VI and gfx9. Partially cleans up the mess used for lowering of the d16 operations. Making v4f16 legal will help clean this up more, but this requires additional work. llvm-svn: 332953
This commit is contained in:
parent
2df2ccde1c
commit
ee27f88e14
@ -799,6 +799,9 @@ def Has16BitInsts : Predicate<"Subtarget->has16BitInsts()">,
|
|||||||
def HasVOP3PInsts : Predicate<"Subtarget->hasVOP3PInsts()">,
|
def HasVOP3PInsts : Predicate<"Subtarget->hasVOP3PInsts()">,
|
||||||
AssemblerPredicate<"FeatureVOP3P">;
|
AssemblerPredicate<"FeatureVOP3P">;
|
||||||
|
|
||||||
|
def NotHasVOP3PInsts : Predicate<"!Subtarget->hasVOP3PInsts()">,
|
||||||
|
AssemblerPredicate<"!FeatureVOP3P">;
|
||||||
|
|
||||||
def HasSDWA : Predicate<"Subtarget->hasSDWA()">,
|
def HasSDWA : Predicate<"Subtarget->hasSDWA()">,
|
||||||
AssemblerPredicate<"FeatureSDWA,FeatureVolcanicIslands">;
|
AssemblerPredicate<"FeatureSDWA,FeatureVolcanicIslands">;
|
||||||
|
|
||||||
|
@ -3007,7 +3007,7 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
|
|||||||
SDValue X = LHS->getOperand(0);
|
SDValue X = LHS->getOperand(0);
|
||||||
|
|
||||||
if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 &&
|
if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 &&
|
||||||
isTypeLegal(MVT::v2i16)) {
|
isOperationLegal(ISD::BUILD_VECTOR, MVT::v2i16)) {
|
||||||
// Prefer build_vector as the canonical form if packed types are legal.
|
// Prefer build_vector as the canonical form if packed types are legal.
|
||||||
// (shl ([asz]ext i16:x), 16 -> build_vector 0, x
|
// (shl ([asz]ext i16:x), 16 -> build_vector 0, x
|
||||||
SDValue Vec = DAG.getBuildVector(MVT::v2i16, SL,
|
SDValue Vec = DAG.getBuildVector(MVT::v2i16, SL,
|
||||||
@ -3818,12 +3818,13 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
|
|||||||
// TODO: Generalize and move to DAGCombiner
|
// TODO: Generalize and move to DAGCombiner
|
||||||
SDValue Src = N->getOperand(0);
|
SDValue Src = N->getOperand(0);
|
||||||
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src)) {
|
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src)) {
|
||||||
assert(Src.getValueType() == MVT::i64);
|
if (Src.getValueType() == MVT::i64) {
|
||||||
SDLoc SL(N);
|
SDLoc SL(N);
|
||||||
uint64_t CVal = C->getZExtValue();
|
uint64_t CVal = C->getZExtValue();
|
||||||
return DAG.getNode(ISD::BUILD_VECTOR, SL, DestVT,
|
return DAG.getNode(ISD::BUILD_VECTOR, SL, DestVT,
|
||||||
DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
|
DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
|
||||||
DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
|
DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Src)) {
|
if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Src)) {
|
||||||
|
@ -1060,14 +1060,14 @@ defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, v2f32, "BUFFER_LOAD_FORMAT_X
|
|||||||
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, v4f32, "BUFFER_LOAD_FORMAT_XYZW">;
|
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, v4f32, "BUFFER_LOAD_FORMAT_XYZW">;
|
||||||
|
|
||||||
let SubtargetPredicate = HasUnpackedD16VMem in {
|
let SubtargetPredicate = HasUnpackedD16VMem in {
|
||||||
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, f16, "BUFFER_LOAD_FORMAT_D16_X_gfx80">;
|
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, f16, "BUFFER_LOAD_FORMAT_D16_X_gfx80">;
|
||||||
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v2i32, "BUFFER_LOAD_FORMAT_D16_XY_gfx80">;
|
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v2i32, "BUFFER_LOAD_FORMAT_D16_XY_gfx80">;
|
||||||
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v4i32, "BUFFER_LOAD_FORMAT_D16_XYZW_gfx80">;
|
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v4i32, "BUFFER_LOAD_FORMAT_D16_XYZW_gfx80">;
|
||||||
} // End HasUnpackedD16VMem.
|
} // End HasUnpackedD16VMem.
|
||||||
|
|
||||||
let SubtargetPredicate = HasPackedD16VMem in {
|
let SubtargetPredicate = HasPackedD16VMem in {
|
||||||
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, f16, "BUFFER_LOAD_FORMAT_D16_X">;
|
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, f16, "BUFFER_LOAD_FORMAT_D16_X">;
|
||||||
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, v2f16, "BUFFER_LOAD_FORMAT_D16_XY">;
|
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v2f16, "BUFFER_LOAD_FORMAT_D16_XY">;
|
||||||
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, i32, "BUFFER_LOAD_FORMAT_D16_XY">;
|
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, i32, "BUFFER_LOAD_FORMAT_D16_XY">;
|
||||||
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v2i32, "BUFFER_LOAD_FORMAT_D16_XYZW">;
|
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v2i32, "BUFFER_LOAD_FORMAT_D16_XYZW">;
|
||||||
} // End HasPackedD16VMem.
|
} // End HasPackedD16VMem.
|
||||||
@ -1547,14 +1547,14 @@ defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v2f32, "TBUFFER_LOAD_FORMAT_XY">;
|
|||||||
defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v4f32, "TBUFFER_LOAD_FORMAT_XYZW">;
|
defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v4f32, "TBUFFER_LOAD_FORMAT_XYZW">;
|
||||||
|
|
||||||
let SubtargetPredicate = HasUnpackedD16VMem in {
|
let SubtargetPredicate = HasUnpackedD16VMem in {
|
||||||
defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, f16, "TBUFFER_LOAD_FORMAT_D16_X_gfx80">;
|
defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, f16, "TBUFFER_LOAD_FORMAT_D16_X_gfx80">;
|
||||||
defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, v2i32, "TBUFFER_LOAD_FORMAT_D16_XY_gfx80">;
|
defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, v2i32, "TBUFFER_LOAD_FORMAT_D16_XY_gfx80">;
|
||||||
defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, v4i32, "TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80">;
|
defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, v4i32, "TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80">;
|
||||||
} // End HasUnpackedD16VMem.
|
} // End HasUnpackedD16VMem.
|
||||||
|
|
||||||
let SubtargetPredicate = HasPackedD16VMem in {
|
let SubtargetPredicate = HasPackedD16VMem in {
|
||||||
defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, f16, "TBUFFER_LOAD_FORMAT_D16_X">;
|
defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, f16, "TBUFFER_LOAD_FORMAT_D16_X">;
|
||||||
defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v2f16, "TBUFFER_LOAD_FORMAT_D16_XY">;
|
defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, v2f16, "TBUFFER_LOAD_FORMAT_D16_XY">;
|
||||||
defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, i32, "TBUFFER_LOAD_FORMAT_D16_XY">;
|
defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, i32, "TBUFFER_LOAD_FORMAT_D16_XY">;
|
||||||
defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, v2i32, "TBUFFER_LOAD_FORMAT_D16_XYZW">;
|
defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, v2i32, "TBUFFER_LOAD_FORMAT_D16_XYZW">;
|
||||||
} // End HasPackedD16VMem.
|
} // End HasPackedD16VMem.
|
||||||
|
@ -610,10 +610,7 @@ multiclass ImageDimD16Helper<AMDGPUImageDimIntrinsic I,
|
|||||||
|
|
||||||
let SubtargetPredicate = HasPackedD16VMem in {
|
let SubtargetPredicate = HasPackedD16VMem in {
|
||||||
def _packed_v1 : ImageDimPattern<I, "_V1", f16, "_D16">;
|
def _packed_v1 : ImageDimPattern<I, "_V1", f16, "_D16">;
|
||||||
// used on gfx810
|
def _packed_v2 : ImageDimPattern<I, "_V1", v2f16, "_D16">;
|
||||||
def _packed_v2 : ImageDimPattern<d16helper, "_V1", i32, "_D16">;
|
|
||||||
// used on gfx900
|
|
||||||
def _packed_v2_gfx9 : ImageDimPattern<I, "_V1", v2f16, "_D16">;
|
|
||||||
def _packed_v4 : ImageDimPattern<d16helper, "_V2", v2i32, "_D16">;
|
def _packed_v4 : ImageDimPattern<d16helper, "_V2", v2i32, "_D16">;
|
||||||
} // End HasPackedD16VMem.
|
} // End HasPackedD16VMem.
|
||||||
}
|
}
|
||||||
@ -717,7 +714,7 @@ multiclass ImageSampleAltPatterns<SDPatternOperator name, string opcode> {
|
|||||||
} // End HasUnpackedD16VMem.
|
} // End HasUnpackedD16VMem.
|
||||||
|
|
||||||
let SubtargetPredicate = HasPackedD16VMem in {
|
let SubtargetPredicate = HasPackedD16VMem in {
|
||||||
defm : ImageSampleDataPatterns<name, !cast<string>(opcode # _V1), i32, "_D16">;
|
defm : ImageSampleDataPatterns<name, !cast<string>(opcode # _V1), f16, "_D16">;
|
||||||
defm : ImageSampleDataPatterns<name, !cast<string>(opcode # _V2), v2i32, "_D16">;
|
defm : ImageSampleDataPatterns<name, !cast<string>(opcode # _V2), v2i32, "_D16">;
|
||||||
} // End HasPackedD16VMem.
|
} // End HasPackedD16VMem.
|
||||||
}
|
}
|
||||||
@ -780,7 +777,7 @@ multiclass ImageLoadAltPatterns<SDPatternOperator name, string opcode> {
|
|||||||
} // End HasUnPackedD16VMem.
|
} // End HasUnPackedD16VMem.
|
||||||
|
|
||||||
let SubtargetPredicate = HasPackedD16VMem in {
|
let SubtargetPredicate = HasPackedD16VMem in {
|
||||||
defm : ImageLoadDataPatterns<name, !cast<string>(opcode # _V1), i32, "_D16">;
|
defm : ImageLoadDataPatterns<name, !cast<string>(opcode # _V1), f16, "_D16">;
|
||||||
defm : ImageLoadDataPatterns<name, !cast<string>(opcode # _V2), v2i32, "_D16">;
|
defm : ImageLoadDataPatterns<name, !cast<string>(opcode # _V2), v2i32, "_D16">;
|
||||||
} // End HasPackedD16VMem.
|
} // End HasPackedD16VMem.
|
||||||
}
|
}
|
||||||
@ -865,8 +862,8 @@ defm : ImageLoadAltPatterns<SIImage_load, "IMAGE_LOAD">;
|
|||||||
defm : ImageLoadAltPatterns<SIImage_load_mip, "IMAGE_LOAD_MIP">;
|
defm : ImageLoadAltPatterns<SIImage_load_mip, "IMAGE_LOAD_MIP">;
|
||||||
|
|
||||||
// Image store.
|
// Image store.
|
||||||
defm : ImageStorePatterns<SIImage_store, "IMAGE_STORE">;
|
defm : ImageStorePatterns<int_amdgcn_image_store, "IMAGE_STORE">;
|
||||||
defm : ImageStorePatterns<SIImage_store_mip, "IMAGE_STORE_MIP">;
|
defm : ImageStorePatterns<int_amdgcn_image_store_mip, "IMAGE_STORE_MIP">;
|
||||||
defm : ImageStoreAltPatterns<SIImage_store, "IMAGE_STORE">;
|
defm : ImageStoreAltPatterns<SIImage_store, "IMAGE_STORE">;
|
||||||
defm : ImageStoreAltPatterns<SIImage_store_mip, "IMAGE_STORE_MIP">;
|
defm : ImageStoreAltPatterns<SIImage_store_mip, "IMAGE_STORE_MIP">;
|
||||||
|
|
||||||
|
@ -139,9 +139,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
|
|||||||
if (Subtarget->has16BitInsts()) {
|
if (Subtarget->has16BitInsts()) {
|
||||||
addRegisterClass(MVT::i16, &AMDGPU::SReg_32_XM0RegClass);
|
addRegisterClass(MVT::i16, &AMDGPU::SReg_32_XM0RegClass);
|
||||||
addRegisterClass(MVT::f16, &AMDGPU::SReg_32_XM0RegClass);
|
addRegisterClass(MVT::f16, &AMDGPU::SReg_32_XM0RegClass);
|
||||||
}
|
|
||||||
|
|
||||||
if (Subtarget->hasVOP3PInsts()) {
|
// Unless there are also VOP3P operations, no operations are really legal.
|
||||||
addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32_XM0RegClass);
|
addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32_XM0RegClass);
|
||||||
addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32_XM0RegClass);
|
addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32_XM0RegClass);
|
||||||
}
|
}
|
||||||
@ -174,7 +173,6 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
|
|||||||
|
|
||||||
setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
|
setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
|
||||||
setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
|
setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
|
||||||
setOperationAction(ISD::ConstantPool, MVT::v2i64, Expand);
|
|
||||||
|
|
||||||
setOperationAction(ISD::SELECT, MVT::i1, Promote);
|
setOperationAction(ISD::SELECT, MVT::i1, Promote);
|
||||||
setOperationAction(ISD::SELECT, MVT::i64, Custom);
|
setOperationAction(ISD::SELECT, MVT::i64, Custom);
|
||||||
@ -423,9 +421,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
|
|||||||
setOperationAction(ISD::FMA, MVT::f16, Legal);
|
setOperationAction(ISD::FMA, MVT::f16, Legal);
|
||||||
if (!Subtarget->hasFP16Denormals())
|
if (!Subtarget->hasFP16Denormals())
|
||||||
setOperationAction(ISD::FMAD, MVT::f16, Legal);
|
setOperationAction(ISD::FMAD, MVT::f16, Legal);
|
||||||
}
|
|
||||||
|
|
||||||
if (Subtarget->hasVOP3PInsts()) {
|
|
||||||
for (MVT VT : {MVT::v2i16, MVT::v2f16}) {
|
for (MVT VT : {MVT::v2i16, MVT::v2f16}) {
|
||||||
for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
|
for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
|
||||||
switch (Op) {
|
switch (Op) {
|
||||||
@ -472,11 +468,34 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
|
|||||||
AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
|
AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
|
||||||
setOperationAction(ISD::XOR, MVT::v2i16, Promote);
|
setOperationAction(ISD::XOR, MVT::v2i16, Promote);
|
||||||
AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);
|
AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);
|
||||||
setOperationAction(ISD::SELECT, MVT::v2i16, Promote);
|
|
||||||
AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
|
|
||||||
setOperationAction(ISD::SELECT, MVT::v2f16, Promote);
|
|
||||||
AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
|
|
||||||
|
|
||||||
|
setOperationAction(ISD::LOAD, MVT::v4i16, Promote);
|
||||||
|
AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32);
|
||||||
|
setOperationAction(ISD::LOAD, MVT::v4f16, Promote);
|
||||||
|
AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32);
|
||||||
|
|
||||||
|
setOperationAction(ISD::STORE, MVT::v4i16, Promote);
|
||||||
|
AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
|
||||||
|
setOperationAction(ISD::STORE, MVT::v4f16, Promote);
|
||||||
|
AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
|
||||||
|
|
||||||
|
setOperationAction(ISD::ANY_EXTEND, MVT::v2i32, Expand);
|
||||||
|
setOperationAction(ISD::ZERO_EXTEND, MVT::v2i32, Expand);
|
||||||
|
setOperationAction(ISD::SIGN_EXTEND, MVT::v2i32, Expand);
|
||||||
|
setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand);
|
||||||
|
|
||||||
|
if (!Subtarget->hasVOP3PInsts()) {
|
||||||
|
setOperationAction(ISD::BUILD_VECTOR, MVT::v2i16, Custom);
|
||||||
|
setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom);
|
||||||
|
}
|
||||||
|
|
||||||
|
setOperationAction(ISD::FNEG, MVT::v2f16, Legal);
|
||||||
|
// This isn't really legal, but this avoids the legalizer unrolling it (and
|
||||||
|
// allows matching fneg (fabs x) patterns)
|
||||||
|
setOperationAction(ISD::FABS, MVT::v2f16, Legal);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (Subtarget->hasVOP3PInsts()) {
|
||||||
setOperationAction(ISD::ADD, MVT::v2i16, Legal);
|
setOperationAction(ISD::ADD, MVT::v2i16, Legal);
|
||||||
setOperationAction(ISD::SUB, MVT::v2i16, Legal);
|
setOperationAction(ISD::SUB, MVT::v2i16, Legal);
|
||||||
setOperationAction(ISD::MUL, MVT::v2i16, Legal);
|
setOperationAction(ISD::MUL, MVT::v2i16, Legal);
|
||||||
@ -489,25 +508,23 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
|
|||||||
setOperationAction(ISD::UMAX, MVT::v2i16, Legal);
|
setOperationAction(ISD::UMAX, MVT::v2i16, Legal);
|
||||||
|
|
||||||
setOperationAction(ISD::FADD, MVT::v2f16, Legal);
|
setOperationAction(ISD::FADD, MVT::v2f16, Legal);
|
||||||
setOperationAction(ISD::FNEG, MVT::v2f16, Legal);
|
|
||||||
setOperationAction(ISD::FMUL, MVT::v2f16, Legal);
|
setOperationAction(ISD::FMUL, MVT::v2f16, Legal);
|
||||||
setOperationAction(ISD::FMA, MVT::v2f16, Legal);
|
setOperationAction(ISD::FMA, MVT::v2f16, Legal);
|
||||||
setOperationAction(ISD::FMINNUM, MVT::v2f16, Legal);
|
setOperationAction(ISD::FMINNUM, MVT::v2f16, Legal);
|
||||||
setOperationAction(ISD::FMAXNUM, MVT::v2f16, Legal);
|
setOperationAction(ISD::FMAXNUM, MVT::v2f16, Legal);
|
||||||
setOperationAction(ISD::FCANONICALIZE, MVT::v2f16, Legal);
|
setOperationAction(ISD::FCANONICALIZE, MVT::v2f16, Legal);
|
||||||
|
|
||||||
// This isn't really legal, but this avoids the legalizer unrolling it (and
|
|
||||||
// allows matching fneg (fabs x) patterns)
|
|
||||||
setOperationAction(ISD::FABS, MVT::v2f16, Legal);
|
|
||||||
|
|
||||||
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
|
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
|
||||||
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
|
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
|
||||||
|
}
|
||||||
|
|
||||||
setOperationAction(ISD::ANY_EXTEND, MVT::v2i32, Expand);
|
if (Subtarget->has16BitInsts()) {
|
||||||
setOperationAction(ISD::ZERO_EXTEND, MVT::v2i32, Expand);
|
setOperationAction(ISD::SELECT, MVT::v2i16, Promote);
|
||||||
setOperationAction(ISD::SIGN_EXTEND, MVT::v2i32, Expand);
|
AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
|
||||||
setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand);
|
setOperationAction(ISD::SELECT, MVT::v2f16, Promote);
|
||||||
|
AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
|
||||||
} else {
|
} else {
|
||||||
|
// Legalization hack.
|
||||||
setOperationAction(ISD::SELECT, MVT::v2i16, Custom);
|
setOperationAction(ISD::SELECT, MVT::v2i16, Custom);
|
||||||
setOperationAction(ISD::SELECT, MVT::v2f16, Custom);
|
setOperationAction(ISD::SELECT, MVT::v2f16, Custom);
|
||||||
}
|
}
|
||||||
@ -3514,205 +3531,72 @@ static unsigned getImageOpcode(unsigned IID) {
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
static SDValue adjustLoadValueType(SDValue Result, EVT LoadVT, SDLoc DL,
|
static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT,
|
||||||
SelectionDAG &DAG, bool Unpacked) {
|
const SDLoc &DL,
|
||||||
|
SelectionDAG &DAG, bool Unpacked) {
|
||||||
|
if (!LoadVT.isVector())
|
||||||
|
return Result;
|
||||||
|
|
||||||
if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
|
if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
|
||||||
// Truncate to v2i16/v4i16.
|
// Truncate to v2i16/v4i16.
|
||||||
EVT IntLoadVT = LoadVT.changeTypeToInteger();
|
EVT IntLoadVT = LoadVT.changeTypeToInteger();
|
||||||
SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, IntLoadVT, Result);
|
|
||||||
|
// Workaround legalizer not scalarizing truncate after vector op
|
||||||
|
// legalization by not creating intermediate vector trunc.
|
||||||
|
SmallVector<SDValue, 4> Elts;
|
||||||
|
DAG.ExtractVectorElements(Result, Elts);
|
||||||
|
for (SDValue &Elt : Elts)
|
||||||
|
Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
|
||||||
|
|
||||||
|
Result = DAG.getBuildVector(IntLoadVT, DL, Elts);
|
||||||
|
|
||||||
// Bitcast to original type (v2f16/v4f16).
|
// Bitcast to original type (v2f16/v4f16).
|
||||||
return DAG.getNode(ISD::BITCAST, DL, LoadVT, Trunc);
|
return DAG.getNode(ISD::BITCAST, DL, LoadVT, Result);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Cast back to the original packed type.
|
// Cast back to the original packed type.
|
||||||
return DAG.getNode(ISD::BITCAST, DL, LoadVT, Result);
|
return DAG.getNode(ISD::BITCAST, DL, LoadVT, Result);
|
||||||
}
|
}
|
||||||
|
|
||||||
// This is to lower INTRINSIC_W_CHAIN with illegal result types.
|
SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode,
|
||||||
SDValue SITargetLowering::lowerIntrinsicWChain_IllegalReturnType(SDValue Op,
|
MemSDNode *M,
|
||||||
SDValue &Chain, SelectionDAG &DAG) const {
|
SelectionDAG &DAG,
|
||||||
EVT LoadVT = Op.getValueType();
|
bool IsIntrinsic) const {
|
||||||
// TODO: handle v3f16.
|
SDLoc DL(M);
|
||||||
if (LoadVT != MVT::v2f16 && LoadVT != MVT::v4f16)
|
SmallVector<SDValue, 10> Ops;
|
||||||
return SDValue();
|
Ops.reserve(M->getNumOperands());
|
||||||
|
|
||||||
|
Ops.push_back(M->getOperand(0));
|
||||||
|
if (IsIntrinsic)
|
||||||
|
Ops.push_back(DAG.getConstant(Opcode, DL, MVT::i32));
|
||||||
|
|
||||||
|
// Skip 1, as it is the intrinsic ID.
|
||||||
|
for (unsigned I = 2, E = M->getNumOperands(); I != E; ++I)
|
||||||
|
Ops.push_back(M->getOperand(I));
|
||||||
|
|
||||||
bool Unpacked = Subtarget->hasUnpackedD16VMem();
|
bool Unpacked = Subtarget->hasUnpackedD16VMem();
|
||||||
EVT UnpackedLoadVT = (LoadVT == MVT::v2f16) ? MVT::v2i32 : MVT::v4i32;
|
EVT LoadVT = M->getValueType(0);
|
||||||
EVT EquivLoadVT = Unpacked ? UnpackedLoadVT :
|
|
||||||
getEquivalentMemType(*DAG.getContext(), LoadVT);
|
EVT UnpackedLoadVT = LoadVT.isVector() ?
|
||||||
|
EVT::getVectorVT(*DAG.getContext(), MVT::i32,
|
||||||
|
LoadVT.getVectorNumElements()) : LoadVT;
|
||||||
|
EVT EquivLoadVT = LoadVT;
|
||||||
|
if (LoadVT.isVector()) {
|
||||||
|
EquivLoadVT = Unpacked ? UnpackedLoadVT :
|
||||||
|
getEquivalentMemType(*DAG.getContext(), LoadVT);
|
||||||
|
}
|
||||||
|
|
||||||
// Change from v4f16/v2f16 to EquivLoadVT.
|
// Change from v4f16/v2f16 to EquivLoadVT.
|
||||||
SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
|
SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
|
||||||
|
|
||||||
SDValue Res;
|
SDValue Load
|
||||||
SDLoc DL(Op);
|
= DAG.getMemIntrinsicNode(IsIntrinsic ? ISD::INTRINSIC_W_CHAIN : Opcode, DL,
|
||||||
MemSDNode *M = cast<MemSDNode>(Op);
|
VTList, Ops, M->getMemoryVT(),
|
||||||
unsigned IID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
|
M->getMemOperand());
|
||||||
switch (IID) {
|
|
||||||
case Intrinsic::amdgcn_tbuffer_load: {
|
|
||||||
SDValue Ops[] = {
|
|
||||||
Op.getOperand(0), // Chain
|
|
||||||
Op.getOperand(2), // rsrc
|
|
||||||
Op.getOperand(3), // vindex
|
|
||||||
Op.getOperand(4), // voffset
|
|
||||||
Op.getOperand(5), // soffset
|
|
||||||
Op.getOperand(6), // offset
|
|
||||||
Op.getOperand(7), // dfmt
|
|
||||||
Op.getOperand(8), // nfmt
|
|
||||||
Op.getOperand(9), // glc
|
|
||||||
Op.getOperand(10) // slc
|
|
||||||
};
|
|
||||||
Res = DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, DL,
|
|
||||||
VTList, Ops, M->getMemoryVT(),
|
|
||||||
M->getMemOperand());
|
|
||||||
Chain = Res.getValue(1);
|
|
||||||
return adjustLoadValueType(Res, LoadVT, DL, DAG, Unpacked);
|
|
||||||
}
|
|
||||||
case Intrinsic::amdgcn_buffer_load_format: {
|
|
||||||
SDValue Ops[] = {
|
|
||||||
Op.getOperand(0), // Chain
|
|
||||||
Op.getOperand(2), // rsrc
|
|
||||||
Op.getOperand(3), // vindex
|
|
||||||
Op.getOperand(4), // offset
|
|
||||||
Op.getOperand(5), // glc
|
|
||||||
Op.getOperand(6) // slc
|
|
||||||
};
|
|
||||||
Res = DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD_FORMAT_D16,
|
|
||||||
DL, VTList, Ops, M->getMemoryVT(),
|
|
||||||
M->getMemOperand());
|
|
||||||
Chain = Res.getValue(1);
|
|
||||||
return adjustLoadValueType(Res, LoadVT, DL, DAG, Unpacked);
|
|
||||||
}
|
|
||||||
case Intrinsic::amdgcn_image_load:
|
|
||||||
case Intrinsic::amdgcn_image_load_mip: {
|
|
||||||
SDValue Ops[] = {
|
|
||||||
Op.getOperand(0), // Chain
|
|
||||||
Op.getOperand(2), // vaddr
|
|
||||||
Op.getOperand(3), // rsrc
|
|
||||||
Op.getOperand(4), // dmask
|
|
||||||
Op.getOperand(5), // glc
|
|
||||||
Op.getOperand(6), // slc
|
|
||||||
Op.getOperand(7), // lwe
|
|
||||||
Op.getOperand(8) // da
|
|
||||||
};
|
|
||||||
unsigned Opc = getImageOpcode(IID);
|
|
||||||
Res = DAG.getMemIntrinsicNode(Opc, DL, VTList, Ops, M->getMemoryVT(),
|
|
||||||
M->getMemOperand());
|
|
||||||
Chain = Res.getValue(1);
|
|
||||||
return adjustLoadValueType(Res, LoadVT, DL, DAG, Unpacked);
|
|
||||||
}
|
|
||||||
// Basic sample.
|
|
||||||
case Intrinsic::amdgcn_image_sample:
|
|
||||||
case Intrinsic::amdgcn_image_sample_cl:
|
|
||||||
case Intrinsic::amdgcn_image_sample_d:
|
|
||||||
case Intrinsic::amdgcn_image_sample_d_cl:
|
|
||||||
case Intrinsic::amdgcn_image_sample_l:
|
|
||||||
case Intrinsic::amdgcn_image_sample_b:
|
|
||||||
case Intrinsic::amdgcn_image_sample_b_cl:
|
|
||||||
case Intrinsic::amdgcn_image_sample_lz:
|
|
||||||
case Intrinsic::amdgcn_image_sample_cd:
|
|
||||||
case Intrinsic::amdgcn_image_sample_cd_cl:
|
|
||||||
|
|
||||||
// Sample with comparison.
|
SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked);
|
||||||
case Intrinsic::amdgcn_image_sample_c:
|
|
||||||
case Intrinsic::amdgcn_image_sample_c_cl:
|
|
||||||
case Intrinsic::amdgcn_image_sample_c_d:
|
|
||||||
case Intrinsic::amdgcn_image_sample_c_d_cl:
|
|
||||||
case Intrinsic::amdgcn_image_sample_c_l:
|
|
||||||
case Intrinsic::amdgcn_image_sample_c_b:
|
|
||||||
case Intrinsic::amdgcn_image_sample_c_b_cl:
|
|
||||||
case Intrinsic::amdgcn_image_sample_c_lz:
|
|
||||||
case Intrinsic::amdgcn_image_sample_c_cd:
|
|
||||||
case Intrinsic::amdgcn_image_sample_c_cd_cl:
|
|
||||||
|
|
||||||
// Sample with offsets.
|
return DAG.getMergeValues({ Adjusted, Load.getValue(1) }, DL);
|
||||||
case Intrinsic::amdgcn_image_sample_o:
|
|
||||||
case Intrinsic::amdgcn_image_sample_cl_o:
|
|
||||||
case Intrinsic::amdgcn_image_sample_d_o:
|
|
||||||
case Intrinsic::amdgcn_image_sample_d_cl_o:
|
|
||||||
case Intrinsic::amdgcn_image_sample_l_o:
|
|
||||||
case Intrinsic::amdgcn_image_sample_b_o:
|
|
||||||
case Intrinsic::amdgcn_image_sample_b_cl_o:
|
|
||||||
case Intrinsic::amdgcn_image_sample_lz_o:
|
|
||||||
case Intrinsic::amdgcn_image_sample_cd_o:
|
|
||||||
case Intrinsic::amdgcn_image_sample_cd_cl_o:
|
|
||||||
|
|
||||||
// Sample with comparison and offsets.
|
|
||||||
case Intrinsic::amdgcn_image_sample_c_o:
|
|
||||||
case Intrinsic::amdgcn_image_sample_c_cl_o:
|
|
||||||
case Intrinsic::amdgcn_image_sample_c_d_o:
|
|
||||||
case Intrinsic::amdgcn_image_sample_c_d_cl_o:
|
|
||||||
case Intrinsic::amdgcn_image_sample_c_l_o:
|
|
||||||
case Intrinsic::amdgcn_image_sample_c_b_o:
|
|
||||||
case Intrinsic::amdgcn_image_sample_c_b_cl_o:
|
|
||||||
case Intrinsic::amdgcn_image_sample_c_lz_o:
|
|
||||||
case Intrinsic::amdgcn_image_sample_c_cd_o:
|
|
||||||
case Intrinsic::amdgcn_image_sample_c_cd_cl_o:
|
|
||||||
|
|
||||||
// Basic gather4
|
|
||||||
case Intrinsic::amdgcn_image_gather4:
|
|
||||||
case Intrinsic::amdgcn_image_gather4_cl:
|
|
||||||
case Intrinsic::amdgcn_image_gather4_l:
|
|
||||||
case Intrinsic::amdgcn_image_gather4_b:
|
|
||||||
case Intrinsic::amdgcn_image_gather4_b_cl:
|
|
||||||
case Intrinsic::amdgcn_image_gather4_lz:
|
|
||||||
|
|
||||||
// Gather4 with comparison
|
|
||||||
case Intrinsic::amdgcn_image_gather4_c:
|
|
||||||
case Intrinsic::amdgcn_image_gather4_c_cl:
|
|
||||||
case Intrinsic::amdgcn_image_gather4_c_l:
|
|
||||||
case Intrinsic::amdgcn_image_gather4_c_b:
|
|
||||||
case Intrinsic::amdgcn_image_gather4_c_b_cl:
|
|
||||||
case Intrinsic::amdgcn_image_gather4_c_lz:
|
|
||||||
|
|
||||||
// Gather4 with offsets
|
|
||||||
case Intrinsic::amdgcn_image_gather4_o:
|
|
||||||
case Intrinsic::amdgcn_image_gather4_cl_o:
|
|
||||||
case Intrinsic::amdgcn_image_gather4_l_o:
|
|
||||||
case Intrinsic::amdgcn_image_gather4_b_o:
|
|
||||||
case Intrinsic::amdgcn_image_gather4_b_cl_o:
|
|
||||||
case Intrinsic::amdgcn_image_gather4_lz_o:
|
|
||||||
|
|
||||||
// Gather4 with comparison and offsets
|
|
||||||
case Intrinsic::amdgcn_image_gather4_c_o:
|
|
||||||
case Intrinsic::amdgcn_image_gather4_c_cl_o:
|
|
||||||
case Intrinsic::amdgcn_image_gather4_c_l_o:
|
|
||||||
case Intrinsic::amdgcn_image_gather4_c_b_o:
|
|
||||||
case Intrinsic::amdgcn_image_gather4_c_b_cl_o:
|
|
||||||
case Intrinsic::amdgcn_image_gather4_c_lz_o: {
|
|
||||||
SDValue Ops[] = {
|
|
||||||
Op.getOperand(0), // Chain
|
|
||||||
Op.getOperand(2), // vaddr
|
|
||||||
Op.getOperand(3), // rsrc
|
|
||||||
Op.getOperand(4), // sampler
|
|
||||||
Op.getOperand(5), // dmask
|
|
||||||
Op.getOperand(6), // unorm
|
|
||||||
Op.getOperand(7), // glc
|
|
||||||
Op.getOperand(8), // slc
|
|
||||||
Op.getOperand(9), // lwe
|
|
||||||
Op.getOperand(10) // da
|
|
||||||
};
|
|
||||||
unsigned Opc = getImageOpcode(IID);
|
|
||||||
Res = DAG.getMemIntrinsicNode(Opc, DL, VTList, Ops, M->getMemoryVT(),
|
|
||||||
M->getMemOperand());
|
|
||||||
Chain = Res.getValue(1);
|
|
||||||
return adjustLoadValueType(Res, LoadVT, DL, DAG, Unpacked);
|
|
||||||
}
|
|
||||||
default: {
|
|
||||||
const AMDGPU::D16ImageDimIntrinsic *D16ImageDimIntr =
|
|
||||||
AMDGPU::lookupD16ImageDimIntrinsicByIntr(IID);
|
|
||||||
if (D16ImageDimIntr) {
|
|
||||||
SmallVector<SDValue, 20> Ops;
|
|
||||||
for (auto Value : Op.getNode()->op_values())
|
|
||||||
Ops.push_back(Value);
|
|
||||||
Ops[1] = DAG.getConstant(D16ImageDimIntr->D16HelperIntr, DL, MVT::i32);
|
|
||||||
Res = DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, VTList, Ops,
|
|
||||||
M->getMemoryVT(), M->getMemOperand());
|
|
||||||
Chain = Res.getValue(1);
|
|
||||||
return adjustLoadValueType(Res, LoadVT, DL, DAG, Unpacked);
|
|
||||||
}
|
|
||||||
|
|
||||||
return SDValue();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void SITargetLowering::ReplaceNodeResults(SDNode *N,
|
void SITargetLowering::ReplaceNodeResults(SDNode *N,
|
||||||
@ -3767,13 +3651,12 @@ void SITargetLowering::ReplaceNodeResults(SDNode *N,
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case ISD::INTRINSIC_W_CHAIN: {
|
case ISD::INTRINSIC_W_CHAIN: {
|
||||||
SDValue Chain;
|
if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) {
|
||||||
if (SDValue Res = lowerIntrinsicWChain_IllegalReturnType(SDValue(N, 0),
|
|
||||||
Chain, DAG)) {
|
|
||||||
Results.push_back(Res);
|
Results.push_back(Res);
|
||||||
Results.push_back(Chain);
|
Results.push_back(Res.getValue(1));
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case ISD::SELECT: {
|
case ISD::SELECT: {
|
||||||
@ -4279,22 +4162,24 @@ SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
|
|||||||
SelectionDAG &DAG) const {
|
SelectionDAG &DAG) const {
|
||||||
SDLoc SL(Op);
|
SDLoc SL(Op);
|
||||||
EVT VT = Op.getValueType();
|
EVT VT = Op.getValueType();
|
||||||
assert(VT == MVT::v4i16 || VT == MVT::v4f16);
|
|
||||||
|
|
||||||
EVT HalfVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), 2);
|
assert(VT == MVT::v2f16 || VT == MVT::v2i16);
|
||||||
|
|
||||||
// Turn into pair of packed build_vectors.
|
SDValue Lo = Op.getOperand(0);
|
||||||
// TODO: Special case for constants that can be materialized with s_mov_b64.
|
SDValue Hi = Op.getOperand(1);
|
||||||
SDValue Lo = DAG.getBuildVector(HalfVT, SL,
|
|
||||||
{ Op.getOperand(0), Op.getOperand(1) });
|
|
||||||
SDValue Hi = DAG.getBuildVector(HalfVT, SL,
|
|
||||||
{ Op.getOperand(2), Op.getOperand(3) });
|
|
||||||
|
|
||||||
SDValue CastLo = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Lo);
|
Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
|
||||||
SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Hi);
|
Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi);
|
||||||
|
|
||||||
SDValue Blend = DAG.getBuildVector(MVT::v2i32, SL, { CastLo, CastHi });
|
Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);
|
||||||
return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
|
Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi);
|
||||||
|
|
||||||
|
SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi,
|
||||||
|
DAG.getConstant(16, SL, MVT::i32));
|
||||||
|
|
||||||
|
SDValue Or = DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi);
|
||||||
|
|
||||||
|
return DAG.getNode(ISD::BITCAST, SL, VT, Or);
|
||||||
}
|
}
|
||||||
|
|
||||||
bool
|
bool
|
||||||
@ -4829,13 +4714,23 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
|
|||||||
AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
|
AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
|
||||||
EVT VT = Op.getValueType();
|
EVT VT = Op.getValueType();
|
||||||
EVT IntVT = VT.changeTypeToInteger();
|
EVT IntVT = VT.changeTypeToInteger();
|
||||||
|
|
||||||
auto *M = cast<MemSDNode>(Op);
|
auto *M = cast<MemSDNode>(Op);
|
||||||
|
EVT LoadVT = Op.getValueType();
|
||||||
|
bool IsD16 = LoadVT.getScalarType() == MVT::f16;
|
||||||
|
if (IsD16)
|
||||||
|
return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG);
|
||||||
|
|
||||||
return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
|
return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
|
||||||
M->getMemOperand());
|
M->getMemOperand());
|
||||||
}
|
}
|
||||||
case Intrinsic::amdgcn_tbuffer_load: {
|
case Intrinsic::amdgcn_tbuffer_load: {
|
||||||
MemSDNode *M = cast<MemSDNode>(Op);
|
MemSDNode *M = cast<MemSDNode>(Op);
|
||||||
|
EVT LoadVT = Op.getValueType();
|
||||||
|
bool IsD16 = LoadVT.getScalarType() == MVT::f16;
|
||||||
|
if (IsD16) {
|
||||||
|
return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG);
|
||||||
|
}
|
||||||
|
|
||||||
SDValue Ops[] = {
|
SDValue Ops[] = {
|
||||||
Op.getOperand(0), // Chain
|
Op.getOperand(0), // Chain
|
||||||
Op.getOperand(2), // rsrc
|
Op.getOperand(2), // rsrc
|
||||||
@ -4849,10 +4744,9 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
|
|||||||
Op.getOperand(10) // slc
|
Op.getOperand(10) // slc
|
||||||
};
|
};
|
||||||
|
|
||||||
EVT VT = Op.getValueType();
|
|
||||||
|
|
||||||
return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
|
return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
|
||||||
Op->getVTList(), Ops, VT, M->getMemOperand());
|
Op->getVTList(), Ops, LoadVT,
|
||||||
|
M->getMemOperand());
|
||||||
}
|
}
|
||||||
case Intrinsic::amdgcn_buffer_atomic_swap:
|
case Intrinsic::amdgcn_buffer_atomic_swap:
|
||||||
case Intrinsic::amdgcn_buffer_atomic_add:
|
case Intrinsic::amdgcn_buffer_atomic_add:
|
||||||
@ -4933,6 +4827,18 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
|
|||||||
Op->getVTList(), Ops, VT, M->getMemOperand());
|
Op->getVTList(), Ops, VT, M->getMemOperand());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
case Intrinsic::amdgcn_image_load:
|
||||||
|
case Intrinsic::amdgcn_image_load_mip: {
|
||||||
|
EVT LoadVT = Op.getValueType();
|
||||||
|
if ((Subtarget->hasUnpackedD16VMem() && LoadVT == MVT::v2f16) ||
|
||||||
|
LoadVT == MVT::v4f16) {
|
||||||
|
MemSDNode *M = cast<MemSDNode>(Op);
|
||||||
|
return adjustLoadValueType(getImageOpcode(IntrID), M, DAG);
|
||||||
|
}
|
||||||
|
|
||||||
|
return SDValue();
|
||||||
|
}
|
||||||
|
|
||||||
// Basic sample.
|
// Basic sample.
|
||||||
case Intrinsic::amdgcn_image_sample:
|
case Intrinsic::amdgcn_image_sample:
|
||||||
case Intrinsic::amdgcn_image_sample_cl:
|
case Intrinsic::amdgcn_image_sample_cl:
|
||||||
@ -4979,7 +4885,39 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
|
|||||||
case Intrinsic::amdgcn_image_sample_c_b_cl_o:
|
case Intrinsic::amdgcn_image_sample_c_b_cl_o:
|
||||||
case Intrinsic::amdgcn_image_sample_c_lz_o:
|
case Intrinsic::amdgcn_image_sample_c_lz_o:
|
||||||
case Intrinsic::amdgcn_image_sample_c_cd_o:
|
case Intrinsic::amdgcn_image_sample_c_cd_o:
|
||||||
case Intrinsic::amdgcn_image_sample_c_cd_cl_o: {
|
case Intrinsic::amdgcn_image_sample_c_cd_cl_o:
|
||||||
|
|
||||||
|
// Basic gather4
|
||||||
|
case Intrinsic::amdgcn_image_gather4:
|
||||||
|
case Intrinsic::amdgcn_image_gather4_cl:
|
||||||
|
case Intrinsic::amdgcn_image_gather4_l:
|
||||||
|
case Intrinsic::amdgcn_image_gather4_b:
|
||||||
|
case Intrinsic::amdgcn_image_gather4_b_cl:
|
||||||
|
case Intrinsic::amdgcn_image_gather4_lz:
|
||||||
|
|
||||||
|
// Gather4 with comparison
|
||||||
|
case Intrinsic::amdgcn_image_gather4_c:
|
||||||
|
case Intrinsic::amdgcn_image_gather4_c_cl:
|
||||||
|
case Intrinsic::amdgcn_image_gather4_c_l:
|
||||||
|
case Intrinsic::amdgcn_image_gather4_c_b:
|
||||||
|
case Intrinsic::amdgcn_image_gather4_c_b_cl:
|
||||||
|
case Intrinsic::amdgcn_image_gather4_c_lz:
|
||||||
|
|
||||||
|
// Gather4 with offsets
|
||||||
|
case Intrinsic::amdgcn_image_gather4_o:
|
||||||
|
case Intrinsic::amdgcn_image_gather4_cl_o:
|
||||||
|
case Intrinsic::amdgcn_image_gather4_l_o:
|
||||||
|
case Intrinsic::amdgcn_image_gather4_b_o:
|
||||||
|
case Intrinsic::amdgcn_image_gather4_b_cl_o:
|
||||||
|
case Intrinsic::amdgcn_image_gather4_lz_o:
|
||||||
|
|
||||||
|
// Gather4 with comparison and offsets
|
||||||
|
case Intrinsic::amdgcn_image_gather4_c_o:
|
||||||
|
case Intrinsic::amdgcn_image_gather4_c_cl_o:
|
||||||
|
case Intrinsic::amdgcn_image_gather4_c_l_o:
|
||||||
|
case Intrinsic::amdgcn_image_gather4_c_b_o:
|
||||||
|
case Intrinsic::amdgcn_image_gather4_c_b_cl_o:
|
||||||
|
case Intrinsic::amdgcn_image_gather4_c_lz_o: {
|
||||||
// Replace dmask with everything disabled with undef.
|
// Replace dmask with everything disabled with undef.
|
||||||
const ConstantSDNode *DMask = dyn_cast<ConstantSDNode>(Op.getOperand(5));
|
const ConstantSDNode *DMask = dyn_cast<ConstantSDNode>(Op.getOperand(5));
|
||||||
if (!DMask || DMask->isNullValue()) {
|
if (!DMask || DMask->isNullValue()) {
|
||||||
@ -4987,9 +4925,32 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
|
|||||||
return DAG.getMergeValues({ Undef, Op.getOperand(0) }, SDLoc(Op));
|
return DAG.getMergeValues({ Undef, Op.getOperand(0) }, SDLoc(Op));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if ((Subtarget->hasUnpackedD16VMem() && Op.getValueType() == MVT::v2f16) ||
|
||||||
|
Op.getValueType() == MVT::v4f16) {
|
||||||
|
return adjustLoadValueType(getImageOpcode(IntrID), cast<MemSDNode>(Op),
|
||||||
|
DAG);
|
||||||
|
}
|
||||||
|
|
||||||
return SDValue();
|
return SDValue();
|
||||||
}
|
}
|
||||||
default:
|
default:
|
||||||
|
EVT LoadVT = Op.getValueType();
|
||||||
|
if (LoadVT.getScalarSizeInBits() != 16)
|
||||||
|
return SDValue();
|
||||||
|
|
||||||
|
const AMDGPU::D16ImageDimIntrinsic *D16ImageDimIntr =
|
||||||
|
AMDGPU::lookupD16ImageDimIntrinsicByIntr(IntrID);
|
||||||
|
if (D16ImageDimIntr) {
|
||||||
|
bool Unpacked = Subtarget->hasUnpackedD16VMem();
|
||||||
|
MemSDNode *M = cast<MemSDNode>(Op);
|
||||||
|
|
||||||
|
if (isTypeLegal(LoadVT) && (!Unpacked || LoadVT == MVT::f16))
|
||||||
|
return SDValue();
|
||||||
|
|
||||||
|
return adjustLoadValueType(D16ImageDimIntr->D16HelperIntr,
|
||||||
|
M, DAG, true);
|
||||||
|
}
|
||||||
|
|
||||||
return SDValue();
|
return SDValue();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -4997,26 +4958,32 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
|
|||||||
SDValue SITargetLowering::handleD16VData(SDValue VData,
|
SDValue SITargetLowering::handleD16VData(SDValue VData,
|
||||||
SelectionDAG &DAG) const {
|
SelectionDAG &DAG) const {
|
||||||
EVT StoreVT = VData.getValueType();
|
EVT StoreVT = VData.getValueType();
|
||||||
SDLoc DL(VData);
|
|
||||||
|
|
||||||
if (StoreVT.isVector()) {
|
|
||||||
assert ((StoreVT.getVectorNumElements() != 3) && "Handle v3f16");
|
|
||||||
if (!Subtarget->hasUnpackedD16VMem()) {
|
|
||||||
if (!isTypeLegal(StoreVT)) {
|
|
||||||
// If Target supports packed vmem, we just need to workaround
|
|
||||||
// the illegal type by casting to an equivalent one.
|
|
||||||
EVT EquivStoreVT = getEquivalentMemType(*DAG.getContext(), StoreVT);
|
|
||||||
return DAG.getNode(ISD::BITCAST, DL, EquivStoreVT, VData);
|
|
||||||
}
|
|
||||||
} else { // We need to unpack the packed data to store.
|
|
||||||
EVT IntStoreVT = StoreVT.changeTypeToInteger();
|
|
||||||
SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
|
|
||||||
EVT EquivStoreVT = (StoreVT == MVT::v2f16) ? MVT::v2i32 : MVT::v4i32;
|
|
||||||
return DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// No change for f16 and legal vector D16 types.
|
// No change for f16 and legal vector D16 types.
|
||||||
return VData;
|
if (!StoreVT.isVector())
|
||||||
|
return VData;
|
||||||
|
|
||||||
|
SDLoc DL(VData);
|
||||||
|
assert((StoreVT.getVectorNumElements() != 3) && "Handle v3f16");
|
||||||
|
|
||||||
|
if (Subtarget->hasUnpackedD16VMem()) {
|
||||||
|
// We need to unpack the packed data to store.
|
||||||
|
EVT IntStoreVT = StoreVT.changeTypeToInteger();
|
||||||
|
SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
|
||||||
|
|
||||||
|
EVT EquivStoreVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
|
||||||
|
StoreVT.getVectorNumElements());
|
||||||
|
SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData);
|
||||||
|
return DAG.UnrollVectorOp(ZExt.getNode());
|
||||||
|
}
|
||||||
|
|
||||||
|
if (isTypeLegal(StoreVT))
|
||||||
|
return VData;
|
||||||
|
|
||||||
|
// If target supports packed vmem, we just need to workaround
|
||||||
|
// the illegal type by casting to an equivalent one.
|
||||||
|
EVT EquivStoreVT = getEquivalentMemType(*DAG.getContext(), StoreVT);
|
||||||
|
return DAG.getNode(ISD::BITCAST, DL, EquivStoreVT, VData);
|
||||||
}
|
}
|
||||||
|
|
||||||
SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
|
SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
|
||||||
@ -5207,46 +5174,48 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
|
|||||||
return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
|
return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
|
||||||
M->getMemoryVT(), M->getMemOperand());
|
M->getMemoryVT(), M->getMemOperand());
|
||||||
}
|
}
|
||||||
|
|
||||||
case Intrinsic::amdgcn_image_store:
|
case Intrinsic::amdgcn_image_store:
|
||||||
case Intrinsic::amdgcn_image_store_mip: {
|
case Intrinsic::amdgcn_image_store_mip: {
|
||||||
SDValue VData = Op.getOperand(2);
|
SDValue VData = Op.getOperand(2);
|
||||||
bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
|
if ((Subtarget->hasUnpackedD16VMem() &&
|
||||||
if (IsD16)
|
VData.getValueType() == MVT::v2f16) ||
|
||||||
VData = handleD16VData(VData, DAG);
|
VData.getValueType() == MVT::v4f16) {
|
||||||
SDValue Ops[] = {
|
SDValue Chain = Op.getOperand(0);
|
||||||
Chain, // Chain
|
|
||||||
VData, // vdata
|
|
||||||
Op.getOperand(3), // vaddr
|
|
||||||
Op.getOperand(4), // rsrc
|
|
||||||
Op.getOperand(5), // dmask
|
|
||||||
Op.getOperand(6), // glc
|
|
||||||
Op.getOperand(7), // slc
|
|
||||||
Op.getOperand(8), // lwe
|
|
||||||
Op.getOperand(9) // da
|
|
||||||
};
|
|
||||||
unsigned Opc = (IntrinsicID==Intrinsic::amdgcn_image_store) ?
|
|
||||||
AMDGPUISD::IMAGE_STORE : AMDGPUISD::IMAGE_STORE_MIP;
|
|
||||||
MemSDNode *M = cast<MemSDNode>(Op);
|
|
||||||
return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
|
|
||||||
M->getMemoryVT(), M->getMemOperand());
|
|
||||||
}
|
|
||||||
|
|
||||||
|
VData = handleD16VData(VData, DAG);
|
||||||
|
SDValue Ops[] = {
|
||||||
|
Chain, // Chain
|
||||||
|
VData, // vdata
|
||||||
|
Op.getOperand(3), // vaddr
|
||||||
|
Op.getOperand(4), // rsrc
|
||||||
|
Op.getOperand(5), // dmask
|
||||||
|
Op.getOperand(6), // glc
|
||||||
|
Op.getOperand(7), // slc
|
||||||
|
Op.getOperand(8), // lwe
|
||||||
|
Op.getOperand(9) // da
|
||||||
|
};
|
||||||
|
unsigned Opc = (IntrinsicID == Intrinsic::amdgcn_image_store) ?
|
||||||
|
AMDGPUISD::IMAGE_STORE : AMDGPUISD::IMAGE_STORE_MIP;
|
||||||
|
MemSDNode *M = cast<MemSDNode>(Op);
|
||||||
|
return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
|
||||||
|
M->getMemoryVT(), M->getMemOperand());
|
||||||
|
}
|
||||||
|
|
||||||
|
return SDValue();
|
||||||
|
}
|
||||||
default: {
|
default: {
|
||||||
const AMDGPU::D16ImageDimIntrinsic *D16ImageDimIntr =
|
const AMDGPU::D16ImageDimIntrinsic *D16ImageDimIntr =
|
||||||
AMDGPU::lookupD16ImageDimIntrinsicByIntr(IntrinsicID);
|
AMDGPU::lookupD16ImageDimIntrinsicByIntr(IntrinsicID);
|
||||||
if (D16ImageDimIntr) {
|
if (D16ImageDimIntr) {
|
||||||
SDValue VData = Op.getOperand(2);
|
SDValue VData = Op.getOperand(2);
|
||||||
EVT StoreVT = VData.getValueType();
|
EVT StoreVT = VData.getValueType();
|
||||||
if ((StoreVT == MVT::v2f16 && !isTypeLegal(StoreVT)) ||
|
if (((StoreVT == MVT::v2f16 || StoreVT == MVT::v4f16) &&
|
||||||
StoreVT == MVT::v4f16) {
|
Subtarget->hasUnpackedD16VMem()) ||
|
||||||
VData = handleD16VData(VData, DAG);
|
!isTypeLegal(StoreVT)) {
|
||||||
|
SmallVector<SDValue, 12> Ops(Op.getNode()->op_values());
|
||||||
|
|
||||||
SmallVector<SDValue, 12> Ops;
|
|
||||||
for (auto Value : Op.getNode()->op_values())
|
|
||||||
Ops.push_back(Value);
|
|
||||||
Ops[1] = DAG.getConstant(D16ImageDimIntr->D16HelperIntr, DL, MVT::i32);
|
Ops[1] = DAG.getConstant(D16ImageDimIntr->D16HelperIntr, DL, MVT::i32);
|
||||||
Ops[2] = VData;
|
Ops[2] = handleD16VData(VData, DAG);
|
||||||
|
|
||||||
MemSDNode *M = cast<MemSDNode>(Op);
|
MemSDNode *M = cast<MemSDNode>(Op);
|
||||||
return DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID, DL, Op->getVTList(),
|
return DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID, DL, Op->getVTList(),
|
||||||
|
@ -60,8 +60,10 @@ class SITargetLowering final : public AMDGPUTargetLowering {
|
|||||||
SDValue LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const;
|
SDValue LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const;
|
||||||
SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
|
SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
|
||||||
|
|
||||||
SDValue lowerIntrinsicWChain_IllegalReturnType(SDValue Op, SDValue &Chain,
|
SDValue adjustLoadValueType(unsigned Opcode, MemSDNode *M,
|
||||||
SelectionDAG &DAG) const;
|
SelectionDAG &DAG,
|
||||||
|
bool IsIntrinsic = false) const;
|
||||||
|
|
||||||
SDValue handleD16VData(SDValue VData, SelectionDAG &DAG) const;
|
SDValue handleD16VData(SDValue VData, SelectionDAG &DAG) const;
|
||||||
|
|
||||||
/// Converts \p Op, which must be of floating point type, to the
|
/// Converts \p Op, which must be of floating point type, to the
|
||||||
|
@ -871,11 +871,13 @@ def : ClampPat<V_MAX_F32_e64, f32>;
|
|||||||
def : ClampPat<V_MAX_F64, f64>;
|
def : ClampPat<V_MAX_F64, f64>;
|
||||||
def : ClampPat<V_MAX_F16_e64, f16>;
|
def : ClampPat<V_MAX_F16_e64, f16>;
|
||||||
|
|
||||||
|
let SubtargetPredicate = HasVOP3PInsts in {
|
||||||
def : GCNPat <
|
def : GCNPat <
|
||||||
(v2f16 (AMDGPUclamp (VOP3PMods v2f16:$src0, i32:$src0_modifiers))),
|
(v2f16 (AMDGPUclamp (VOP3PMods v2f16:$src0, i32:$src0_modifiers))),
|
||||||
(V_PK_MAX_F16 $src0_modifiers, $src0,
|
(V_PK_MAX_F16 $src0_modifiers, $src0,
|
||||||
$src0_modifiers, $src0, DSTCLAMP.ENABLE)
|
$src0_modifiers, $src0, DSTCLAMP.ENABLE)
|
||||||
>;
|
>;
|
||||||
|
}
|
||||||
|
|
||||||
/********** ================================ **********/
|
/********** ================================ **********/
|
||||||
/********** Floating point absolute/negative **********/
|
/********** Floating point absolute/negative **********/
|
||||||
@ -1333,11 +1335,13 @@ def : GCNPat<
|
|||||||
(V_MAX_F16_e64 $src_mods, $src, $src_mods, $src, 0, 0)
|
(V_MAX_F16_e64 $src_mods, $src, $src_mods, $src, 0, 0)
|
||||||
>;
|
>;
|
||||||
|
|
||||||
|
let SubtargetPredicate = HasVOP3PInsts in {
|
||||||
def : GCNPat<
|
def : GCNPat<
|
||||||
(fcanonicalize (v2f16 (VOP3PMods v2f16:$src, i32:$src_mods))),
|
(fcanonicalize (v2f16 (VOP3PMods v2f16:$src, i32:$src_mods))),
|
||||||
(V_PK_MAX_F16 $src_mods, $src, $src_mods, $src, DSTCLAMP.NONE)
|
(V_PK_MAX_F16 $src_mods, $src, $src_mods, $src, DSTCLAMP.NONE)
|
||||||
>;
|
>;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
let OtherPredicates = [NoFP32Denormals] in {
|
let OtherPredicates = [NoFP32Denormals] in {
|
||||||
def : GCNPat<
|
def : GCNPat<
|
||||||
@ -1387,11 +1391,6 @@ class ExpPattern<SDPatternOperator node, ValueType vt, Instruction Inst> : GCNPa
|
|||||||
def : ExpPattern<AMDGPUexport, i32, EXP>;
|
def : ExpPattern<AMDGPUexport, i32, EXP>;
|
||||||
def : ExpPattern<AMDGPUexport_done, i32, EXP_DONE>;
|
def : ExpPattern<AMDGPUexport_done, i32, EXP_DONE>;
|
||||||
|
|
||||||
def : GCNPat <
|
|
||||||
(v2i16 (build_vector i16:$src0, i16:$src1)),
|
|
||||||
(v2i16 (S_PACK_LL_B32_B16 $src0, $src1))
|
|
||||||
>;
|
|
||||||
|
|
||||||
// COPY_TO_REGCLASS is workaround tablegen bug from multiple outputs
|
// COPY_TO_REGCLASS is workaround tablegen bug from multiple outputs
|
||||||
// from S_LSHL_B32's multiple outputs from implicit scc def.
|
// from S_LSHL_B32's multiple outputs from implicit scc def.
|
||||||
def : GCNPat <
|
def : GCNPat <
|
||||||
@ -1399,6 +1398,13 @@ def : GCNPat <
|
|||||||
(v2i16 (COPY_TO_REGCLASS (S_LSHL_B32 i16:$src1, (i16 16)), SReg_32_XM0))
|
(v2i16 (COPY_TO_REGCLASS (S_LSHL_B32 i16:$src1, (i16 16)), SReg_32_XM0))
|
||||||
>;
|
>;
|
||||||
|
|
||||||
|
|
||||||
|
let SubtargetPredicate = HasVOP3PInsts in {
|
||||||
|
def : GCNPat <
|
||||||
|
(v2i16 (build_vector i16:$src0, i16:$src1)),
|
||||||
|
(v2i16 (S_PACK_LL_B32_B16 $src0, $src1))
|
||||||
|
>;
|
||||||
|
|
||||||
// With multiple uses of the shift, this will duplicate the shift and
|
// With multiple uses of the shift, this will duplicate the shift and
|
||||||
// increase register pressure.
|
// increase register pressure.
|
||||||
def : GCNPat <
|
def : GCNPat <
|
||||||
@ -1406,6 +1412,7 @@ def : GCNPat <
|
|||||||
(v2i16 (S_PACK_LH_B32_B16 i16:$src0, i32:$src1))
|
(v2i16 (S_PACK_LH_B32_B16 i16:$src0, i32:$src1))
|
||||||
>;
|
>;
|
||||||
|
|
||||||
|
|
||||||
def : GCNPat <
|
def : GCNPat <
|
||||||
(v2i16 (build_vector (i16 (trunc (srl_oneuse i32:$src0, (i32 16)))),
|
(v2i16 (build_vector (i16 (trunc (srl_oneuse i32:$src0, (i32 16)))),
|
||||||
(i16 (trunc (srl_oneuse i32:$src1, (i32 16)))))),
|
(i16 (trunc (srl_oneuse i32:$src1, (i32 16)))))),
|
||||||
@ -1418,6 +1425,9 @@ def : GCNPat <
|
|||||||
(v2f16 (S_PACK_LL_B32_B16 $src0, $src1))
|
(v2f16 (S_PACK_LL_B32_B16 $src0, $src1))
|
||||||
>;
|
>;
|
||||||
|
|
||||||
|
} // End SubtargetPredicate = HasVOP3PInsts
|
||||||
|
|
||||||
|
|
||||||
// def : GCNPat <
|
// def : GCNPat <
|
||||||
// (v2f16 (scalar_to_vector f16:$src0)),
|
// (v2f16 (scalar_to_vector f16:$src0)),
|
||||||
// (COPY $src0)
|
// (COPY $src0)
|
||||||
|
@ -1,12 +1,14 @@
|
|||||||
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s
|
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GCN %s
|
||||||
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
|
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
|
||||||
|
|
||||||
; FIXME: Need to handle non-uniform case for function below (load without gep).
|
; FIXME: Need to handle non-uniform case for function below (load without gep).
|
||||||
; GCN-LABEL: {{^}}v_test_add_v2i16:
|
; GCN-LABEL: {{^}}v_test_add_v2i16:
|
||||||
; GFX9: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
; GFX9: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||||
|
|
||||||
; VI: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
; FIXME: or should be unnecessary
|
||||||
; VI: v_add_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
; VI: v_add_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||||
|
; VI: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||||
|
; VI: v_or_b32
|
||||||
define amdgpu_kernel void @v_test_add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
|
define amdgpu_kernel void @v_test_add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
|
||||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||||
%gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
|
%gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
|
||||||
@ -52,21 +54,26 @@ define amdgpu_kernel void @s_test_add_self_v2i16(<2 x i16> addrspace(1)* %out, <
|
|||||||
; GCN-LABEL: {{^}}s_test_add_v2i16_kernarg:
|
; GCN-LABEL: {{^}}s_test_add_v2i16_kernarg:
|
||||||
; GFX9: v_pk_add_u16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
|
; GFX9: v_pk_add_u16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
|
||||||
|
|
||||||
; VI: v_add_u32
|
; VI: s_add_i32
|
||||||
; VI: v_add_u32_sdwa
|
; VI: s_add_i32
|
||||||
|
; VI: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
|
||||||
|
; VI: s_and_b32
|
||||||
|
; VI: s_or_b32
|
||||||
define amdgpu_kernel void @s_test_add_v2i16_kernarg(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #1 {
|
define amdgpu_kernel void @s_test_add_v2i16_kernarg(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #1 {
|
||||||
%add = add <2 x i16> %a, %b
|
%add = add <2 x i16> %a, %b
|
||||||
store <2 x i16> %add, <2 x i16> addrspace(1)* %out
|
store <2 x i16> %add, <2 x i16> addrspace(1)* %out
|
||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
|
; FIXME: Eliminate or with sdwa
|
||||||
; GCN-LABEL: {{^}}v_test_add_v2i16_constant:
|
; GCN-LABEL: {{^}}v_test_add_v2i16_constant:
|
||||||
; GFX9: s_mov_b32 [[CONST:s[0-9]+]], 0x1c8007b{{$}}
|
; GFX9: s_mov_b32 [[CONST:s[0-9]+]], 0x1c8007b{{$}}
|
||||||
; GFX9: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, [[CONST]]
|
; GFX9: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, [[CONST]]
|
||||||
|
|
||||||
; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0x7b, v{{[0-9]+}}
|
|
||||||
; VI-DAG: v_mov_b32_e32 v[[SCONST:[0-9]+]], 0x1c8
|
; VI-DAG: v_mov_b32_e32 v[[SCONST:[0-9]+]], 0x1c8
|
||||||
; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v[[SCONST]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0x7b, v{{[0-9]+}}
|
||||||
|
; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v[[SCONST]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
||||||
|
; VI: v_or_b32_e32
|
||||||
define amdgpu_kernel void @v_test_add_v2i16_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
|
define amdgpu_kernel void @v_test_add_v2i16_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
|
||||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||||
%gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
|
%gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
|
||||||
@ -84,7 +91,7 @@ define amdgpu_kernel void @v_test_add_v2i16_constant(<2 x i16> addrspace(1)* %ou
|
|||||||
|
|
||||||
; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0xfffffcb3, v{{[0-9]+}}
|
; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0xfffffcb3, v{{[0-9]+}}
|
||||||
; VI-DAG: v_mov_b32_e32 v[[SCONST:[0-9]+]], 0xfffffc21
|
; VI-DAG: v_mov_b32_e32 v[[SCONST:[0-9]+]], 0xfffffc21
|
||||||
; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v[[SCONST]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v[[SCONST]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
||||||
define amdgpu_kernel void @v_test_add_v2i16_neg_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
|
define amdgpu_kernel void @v_test_add_v2i16_neg_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
|
||||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||||
%gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
|
%gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
|
||||||
@ -99,10 +106,9 @@ define amdgpu_kernel void @v_test_add_v2i16_neg_constant(<2 x i16> addrspace(1)*
|
|||||||
; GFX9: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, -1 op_sel_hi:[1,0]{{$}}
|
; GFX9: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, -1 op_sel_hi:[1,0]{{$}}
|
||||||
|
|
||||||
; VI: v_mov_b32_e32 v[[SCONST:[0-9]+]], -1
|
; VI: v_mov_b32_e32 v[[SCONST:[0-9]+]], -1
|
||||||
; VI: flat_load_ushort [[LOAD0:v[0-9]+]]
|
; VI: flat_load_dword [[LOAD:v[0-9]+]]
|
||||||
; VI: flat_load_ushort [[LOAD1:v[0-9]+]]
|
; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, [[LOAD]], v[[SCONST]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
||||||
; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, [[LOAD0]], v[[SCONST]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, -1, [[LOAD]]
|
||||||
; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, -1, [[LOAD1]]
|
|
||||||
; VI: v_or_b32_e32
|
; VI: v_or_b32_e32
|
||||||
define amdgpu_kernel void @v_test_add_v2i16_inline_neg1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
|
define amdgpu_kernel void @v_test_add_v2i16_inline_neg1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
|
||||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||||
@ -117,10 +123,11 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_neg1(<2 x i16> addrspace(1)*
|
|||||||
; GCN-LABEL: {{^}}v_test_add_v2i16_inline_lo_zero_hi:
|
; GCN-LABEL: {{^}}v_test_add_v2i16_inline_lo_zero_hi:
|
||||||
; GFX9: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, 32{{$}}
|
; GFX9: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, 32{{$}}
|
||||||
|
|
||||||
|
; VI: flat_load_dword
|
||||||
; VI-NOT: v_add_u16
|
; VI-NOT: v_add_u16
|
||||||
|
; VI: v_and_b32_e32 v{{[0-9]+}}, 0xffff0000,
|
||||||
; VI: v_add_u16_e32 v{{[0-9]+}}, 32, v{{[0-9]+}}
|
; VI: v_add_u16_e32 v{{[0-9]+}}, 32, v{{[0-9]+}}
|
||||||
; VI-NOT: v_add_u16
|
; VI-NOT: v_add_u16
|
||||||
; VI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
|
|
||||||
; VI: v_or_b32_e32
|
; VI: v_or_b32_e32
|
||||||
define amdgpu_kernel void @v_test_add_v2i16_inline_lo_zero_hi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
|
define amdgpu_kernel void @v_test_add_v2i16_inline_lo_zero_hi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
|
||||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||||
@ -139,9 +146,9 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_lo_zero_hi(<2 x i16> addrspac
|
|||||||
|
|
||||||
; VI-NOT: v_add_u16
|
; VI-NOT: v_add_u16
|
||||||
; VI: v_mov_b32_e32 v[[K:[0-9]+]], 0x3f80
|
; VI: v_mov_b32_e32 v[[K:[0-9]+]], 0x3f80
|
||||||
; VI: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v[[K]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
; VI: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v[[K]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
||||||
; VI-NOT: v_add_u16
|
; VI-NOT: v_add_u16
|
||||||
; VI: v_or_b32_e32
|
; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||||
define amdgpu_kernel void @v_test_add_v2i16_inline_fp_split(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
|
define amdgpu_kernel void @v_test_add_v2i16_inline_fp_split(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
|
||||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||||
%gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
|
%gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
|
||||||
@ -162,15 +169,13 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_fp_split(<2 x i16> addrspace(
|
|||||||
; GFX9-DAG: v_lshrrev_b32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]]
|
; GFX9-DAG: v_lshrrev_b32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]]
|
||||||
; GFX9: buffer_store_dwordx2 v{{\[}}[[ELT0]]:[[ELT1]]{{\]}}
|
; GFX9: buffer_store_dwordx2 v{{\[}}[[ELT0]]:[[ELT1]]{{\]}}
|
||||||
|
|
||||||
; VI: flat_load_ushort v[[A_LO:[0-9]+]]
|
; VI: flat_load_dword v[[A:[0-9]+]]
|
||||||
; VI: flat_load_ushort v[[A_HI:[0-9]+]]
|
; VI: flat_load_dword v[[B:[0-9]+]]
|
||||||
; VI: flat_load_ushort v[[B_LO:[0-9]+]]
|
|
||||||
; VI: flat_load_ushort v[[B_HI:[0-9]+]]
|
|
||||||
|
|
||||||
; VI: v_add_u16_e32 v[[ADD_HI:[0-9]+]], v[[A_HI]], v[[B_HI]]
|
|
||||||
; VI-NOT: and
|
; VI-NOT: and
|
||||||
; VI-NOT: shl
|
; VI-NOT: shl
|
||||||
; VI: v_add_u16_e32 v[[ADD_LO:[0-9]+]], v[[A_LO]], v[[B_LO]]
|
; VI: v_add_u16_e32 v[[ADD_LO:[0-9]+]], v[[A]], v[[B]]
|
||||||
|
; VI: v_add_u16_sdwa v[[ADD_HI:[0-9]+]], v[[A]], v[[B]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||||
; VI-NOT: and
|
; VI-NOT: and
|
||||||
; VI-NOT: shl
|
; VI-NOT: shl
|
||||||
; VI: buffer_store_dwordx2 v{{\[}}[[ADD_LO]]:[[ADD_HI]]{{\]}}
|
; VI: buffer_store_dwordx2 v{{\[}}[[ADD_LO]]:[[ADD_HI]]{{\]}}
|
||||||
@ -198,13 +203,11 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i32(<2 x i32> addrspace(1)
|
|||||||
; GFX9: buffer_store_dwordx4
|
; GFX9: buffer_store_dwordx4
|
||||||
|
|
||||||
; VI-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
|
; VI-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
|
||||||
; VI: flat_load_ushort v[[A_LO:[0-9]+]]
|
; VI: flat_load_dword v[[A:[0-9]+]]
|
||||||
; VI: flat_load_ushort v[[A_HI:[0-9]+]]
|
; VI: flat_load_dword v[[B:[0-9]+]]
|
||||||
; VI: flat_load_ushort v[[B_LO:[0-9]+]]
|
|
||||||
; VI: flat_load_ushort v[[B_HI:[0-9]+]]
|
|
||||||
|
|
||||||
; VI-DAG: v_add_u16_e32
|
; VI-DAG: v_add_u16_e32
|
||||||
; VI-DAG: v_add_u16_e32
|
; VI: v_add_u16_sdwa v[[ADD_HI:[0-9]+]], v[[A]], v[[B]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||||
|
|
||||||
; VI: buffer_store_dwordx4
|
; VI: buffer_store_dwordx4
|
||||||
define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
|
define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
|
||||||
@ -230,8 +233,9 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i64(<2 x i64> addrspace(1)
|
|||||||
; GFX9-DAG: v_ashrrev_i32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]]
|
; GFX9-DAG: v_ashrrev_i32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]]
|
||||||
; GFX9: buffer_store_dwordx2 v{{\[}}[[ELT0]]:[[ELT1]]{{\]}}
|
; GFX9: buffer_store_dwordx2 v{{\[}}[[ELT0]]:[[ELT1]]{{\]}}
|
||||||
|
|
||||||
|
; VI: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||||
; VI: v_add_u16_e32
|
; VI: v_add_u16_e32
|
||||||
; VI: v_add_u16_e32
|
|
||||||
; VI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
|
; VI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
|
||||||
; VI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
|
; VI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
|
||||||
; VI: buffer_store_dwordx2
|
; VI: buffer_store_dwordx2
|
||||||
|
@ -8,8 +8,17 @@
|
|||||||
; GFX9: v_mov_b32_e32 [[VLHS:v[0-9]+]], [[LHS]]
|
; GFX9: v_mov_b32_e32 [[VLHS:v[0-9]+]], [[LHS]]
|
||||||
; GFX9: v_pk_ashrrev_i16 [[RESULT:v[0-9]+]], [[RHS]], [[VLHS]]
|
; GFX9: v_pk_ashrrev_i16 [[RESULT:v[0-9]+]], [[RHS]], [[VLHS]]
|
||||||
|
|
||||||
; VI: v_ashrrev_i32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
; VI: s_load_dword [[LHS:s[0-9]+]]
|
||||||
; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
; VI: s_load_dword [[RHS:s[0-9]+]]
|
||||||
|
; VI: s_ashr_i32
|
||||||
|
; VI: s_ashr_i32
|
||||||
|
; VI: s_sext_i32_i16
|
||||||
|
; VI: s_sext_i32_i16
|
||||||
|
; VI: s_ashr_i32
|
||||||
|
; VI: s_ashr_i32
|
||||||
|
; VI: s_lshl_b32
|
||||||
|
; VI: s_and_b32
|
||||||
|
; VI: s_or_b32
|
||||||
|
|
||||||
; CI-DAG: v_ashrrev_i32_e32
|
; CI-DAG: v_ashrrev_i32_e32
|
||||||
; CI-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}}
|
; CI-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}}
|
||||||
|
@ -71,10 +71,15 @@ define amdgpu_kernel void @extract_vector_elt_v3i16(i16 addrspace(1)* %out, <3 x
|
|||||||
}
|
}
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}extract_vector_elt_v4i16:
|
; GCN-LABEL: {{^}}extract_vector_elt_v4i16:
|
||||||
; SICIVI: buffer_load_ushort
|
; SICI: buffer_load_ushort
|
||||||
; SICIVI: buffer_load_ushort
|
; SICI: buffer_load_ushort
|
||||||
; SICIVI: buffer_store_short
|
; SICI: buffer_store_short
|
||||||
; SICIVI: buffer_store_short
|
; SICI: buffer_store_short
|
||||||
|
|
||||||
|
; VI: s_load_dword s
|
||||||
|
; VI: s_load_dword s
|
||||||
|
; VI: buffer_store_short
|
||||||
|
; VI: buffer_store_short
|
||||||
|
|
||||||
; GFX9-DAG: s_load_dword [[LOAD0:s[0-9]+]], s[0:1], 0x2c
|
; GFX9-DAG: s_load_dword [[LOAD0:s[0-9]+]], s[0:1], 0x2c
|
||||||
; GFX9-DAG: s_load_dword [[LOAD1:s[0-9]+]], s[0:1], 0x30
|
; GFX9-DAG: s_load_dword [[LOAD1:s[0-9]+]], s[0:1], 0x30
|
||||||
@ -92,9 +97,16 @@ define amdgpu_kernel void @extract_vector_elt_v4i16(i16 addrspace(1)* %out, <4 x
|
|||||||
}
|
}
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v3i16:
|
; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v3i16:
|
||||||
; SICIVI: buffer_load_ushort
|
; SICI: buffer_load_ushort
|
||||||
; SICIVI: buffer_load_ushort
|
; SICI: buffer_load_ushort
|
||||||
; SICIVI: buffer_load_ushort
|
; SICI: buffer_load_ushort
|
||||||
|
|
||||||
|
; SICI: buffer_store_short
|
||||||
|
; SICI: buffer_store_short
|
||||||
|
; SICI: buffer_store_short
|
||||||
|
|
||||||
|
; SICI: buffer_load_ushort
|
||||||
|
; SICI: buffer_store_short
|
||||||
|
|
||||||
; GFX9-DAG: global_load_short_d16_hi v
|
; GFX9-DAG: global_load_short_d16_hi v
|
||||||
; GFX9-DAG: global_load_short_d16 v
|
; GFX9-DAG: global_load_short_d16 v
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=CI %s
|
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=CI %s
|
||||||
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
|
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 %s
|
||||||
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX9 %s
|
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 %s
|
||||||
|
|
||||||
; DAGCombiner will transform:
|
; DAGCombiner will transform:
|
||||||
; (fabs (f16 bitcast (i16 a))) => (f16 bitcast (and (i16 a), 0x7FFFFFFF))
|
; (fabs (f16 bitcast (i16 a))) => (f16 bitcast (and (i16 a), 0x7FFFFFFF))
|
||||||
@ -36,16 +36,8 @@ define amdgpu_kernel void @s_fabs_f16(half addrspace(1)* %out, half %in) {
|
|||||||
; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]]
|
; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]]
|
||||||
; CI: v_or_b32_e32
|
; CI: v_or_b32_e32
|
||||||
|
|
||||||
; VI: flat_load_ushort [[HI:v[0-9]+]]
|
; GFX89: s_load_dword [[VAL:s[0-9]+]]
|
||||||
; VI: flat_load_ushort [[LO:v[0-9]+]]
|
; GFX89: s_and_b32 s{{[0-9]+}}, [[VAL]], 0x7fff7fff
|
||||||
; VI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x7fff{{$}}
|
|
||||||
; VI-DAG: v_and_b32_e32 [[FABS_LO:v[0-9]+]], [[HI]], [[MASK]]
|
|
||||||
; VI-DAG: v_and_b32_sdwa [[FABS_HI:v[0-9]+]], [[LO]], [[MASK]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
|
||||||
; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, [[FABS_LO]], [[FABS_HI]]
|
|
||||||
; VI: flat_store_dword
|
|
||||||
|
|
||||||
; GFX9: s_load_dword [[VAL:s[0-9]+]]
|
|
||||||
; GFX9: s_and_b32 s{{[0-9]+}}, [[VAL]], 0x7fff7fff
|
|
||||||
define amdgpu_kernel void @s_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in) {
|
define amdgpu_kernel void @s_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in) {
|
||||||
%fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in)
|
%fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in)
|
||||||
store <2 x half> %fabs, <2 x half> addrspace(1)* %out
|
store <2 x half> %fabs, <2 x half> addrspace(1)* %out
|
||||||
@ -59,13 +51,12 @@ define amdgpu_kernel void @s_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half
|
|||||||
; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]]
|
; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]]
|
||||||
; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]]
|
; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]]
|
||||||
|
|
||||||
; VI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x7fff{{$}}
|
|
||||||
; VI-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[MASK]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
; GFX89: s_load_dword s
|
||||||
; VI-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[MASK]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
; GFX89: s_load_dword s
|
||||||
; VI-DAG: v_and_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[MASK]]
|
; GFX89: s_mov_b32 [[MASK:s[0-9]+]], 0x7fff7fff
|
||||||
; VI-DAG: v_and_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[MASK]]
|
; GFX89: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[MASK]]
|
||||||
; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
; GFX89: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[MASK]]
|
||||||
; VI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
|
||||||
|
|
||||||
; GCN: {{flat|global}}_store_dwordx2
|
; GCN: {{flat|global}}_store_dwordx2
|
||||||
define amdgpu_kernel void @s_fabs_v4f16(<4 x half> addrspace(1)* %out, <4 x half> %in) {
|
define amdgpu_kernel void @s_fabs_v4f16(<4 x half> addrspace(1)* %out, <4 x half> %in) {
|
||||||
@ -147,9 +138,9 @@ define amdgpu_kernel void @v_fabs_fold_v2f16(<2 x half> addrspace(1)* %out, <2 x
|
|||||||
; CI-DAG: v_mul_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
|
; CI-DAG: v_mul_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
|
||||||
; CI-DAG: v_add_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}
|
; CI-DAG: v_add_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}
|
||||||
|
|
||||||
; GFX89-DAG: v_mul_f16_e32 v{{[0-9]+}}, -4.0, [[VAL]]
|
; GFX89-DAG: v_mul_f16_e64 v{{[0-9]+}}, |[[VAL]]|, 4.0
|
||||||
; GFX89-DAG: v_mov_b32_e32 [[CONST2:v[0-9]+]], 0x4000
|
; GFX89-DAG: v_mov_b32_e32 [[CONST2:v[0-9]+]], 0x4000
|
||||||
; GFX89-DAG: v_sub_f16_sdwa v{{[0-9]+}}, [[CONST2]], [[VAL]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
; GFX89-DAG: v_add_f16_sdwa v{{[0-9]+}}, |[[VAL]]|, [[CONST2]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
||||||
define amdgpu_kernel void @v_extract_fabs_fold_v2f16(<2 x half> addrspace(1)* %in) #0 {
|
define amdgpu_kernel void @v_extract_fabs_fold_v2f16(<2 x half> addrspace(1)* %in) #0 {
|
||||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||||
%gep.in = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i32 %tid
|
%gep.in = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i32 %tid
|
||||||
@ -167,11 +158,12 @@ define amdgpu_kernel void @v_extract_fabs_fold_v2f16(<2 x half> addrspace(1)* %i
|
|||||||
|
|
||||||
; GCN-LABEL: {{^}}v_extract_fabs_no_fold_v2f16:
|
; GCN-LABEL: {{^}}v_extract_fabs_no_fold_v2f16:
|
||||||
; GCN: {{flat|global}}_load_dword [[VAL:v[0-9]+]]
|
; GCN: {{flat|global}}_load_dword [[VAL:v[0-9]+]]
|
||||||
|
|
||||||
; FIXME: Extra bfe on VI
|
|
||||||
; GFX9-NOT: v_bfe_u32
|
|
||||||
; VI: v_bfe_u32
|
|
||||||
; GCN: v_and_b32_e32 [[AND:v[0-9]+]], 0x7fff7fff, [[VAL]]
|
; GCN: v_and_b32_e32 [[AND:v[0-9]+]], 0x7fff7fff, [[VAL]]
|
||||||
|
|
||||||
|
|
||||||
|
; VI: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, 16, 15
|
||||||
|
; VI: flat_store_short
|
||||||
|
|
||||||
; GFX9: global_store_short_d16_hi v{{\[[0-9]+:[0-9]+\]}}, [[AND]], off
|
; GFX9: global_store_short_d16_hi v{{\[[0-9]+:[0-9]+\]}}, [[AND]], off
|
||||||
define amdgpu_kernel void @v_extract_fabs_no_fold_v2f16(<2 x half> addrspace(1)* %in) #0 {
|
define amdgpu_kernel void @v_extract_fabs_no_fold_v2f16(<2 x half> addrspace(1)* %in) #0 {
|
||||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||||
|
@ -222,12 +222,9 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f16(<2 x half> addrspace(1)
|
|||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
; FIXME: Fold modifier
|
|
||||||
; GCN-LABEL: {{^}}v_test_canonicalize_fabs_var_v2f16:
|
; GCN-LABEL: {{^}}v_test_canonicalize_fabs_var_v2f16:
|
||||||
; VI-DAG: v_bfe_u32
|
; VI: v_max_f16_sdwa [[REG0:v[0-9]+]], |v{{[0-9]+}}|, |v{{[0-9]+}}| dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||||
; VI-DAG: v_and_b32_e32 v{{[0-9]+}}, 0x7fff7fff, v{{[0-9]+}}
|
; VI: v_max_f16_e64 [[REG1:v[0-9]+]], |v{{[0-9]+}}|, |v{{[0-9]+}}|
|
||||||
; VI: v_max_f16_sdwa [[REG0:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
|
||||||
; VI: v_max_f16_e32 [[REG1:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
|
|
||||||
; VI-NOT: 0xffff
|
; VI-NOT: 0xffff
|
||||||
; VI: v_or_b32
|
; VI: v_or_b32
|
||||||
|
|
||||||
@ -245,9 +242,8 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_v2f16(<2 x half> addrspa
|
|||||||
}
|
}
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}v_test_canonicalize_fneg_fabs_var_v2f16:
|
; GCN-LABEL: {{^}}v_test_canonicalize_fneg_fabs_var_v2f16:
|
||||||
; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, 0x80008000, v{{[0-9]+}}
|
; VI-DAG: v_max_f16_sdwa [[REG0:v[0-9]+]], -|v{{[0-9]+}}|, -|v{{[0-9]+}}| dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||||
; VI-DAG: v_max_f16_sdwa [[REG0:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
; VI-DAG: v_max_f16_e64 [[REG1:v[0-9]+]], -|v{{[0-9]+}}|, -|v{{[0-9]+}}|
|
||||||
; VI-DAG: v_max_f16_e32 [[REG1:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
|
|
||||||
; VI: v_or_b32
|
; VI: v_or_b32
|
||||||
|
|
||||||
; GFX9: v_and_b32_e32 [[ABS:v[0-9]+]], 0x7fff7fff, v{{[0-9]+}}
|
; GFX9: v_and_b32_e32 [[ABS:v[0-9]+]], 0x7fff7fff, v{{[0-9]+}}
|
||||||
@ -265,9 +261,8 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_v2f16(<2 x half> ad
|
|||||||
}
|
}
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}v_test_canonicalize_fneg_var_v2f16:
|
; GCN-LABEL: {{^}}v_test_canonicalize_fneg_var_v2f16:
|
||||||
; VI: v_xor_b32_e32 [[FNEG:v[0-9]+]], 0x80008000, v{{[0-9]+}}
|
; VI-DAG: v_max_f16_sdwa [[REG1:v[0-9]+]], -v{{[0-9]+}}, -v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||||
; VI-DAG: v_max_f16_sdwa [[REG1:v[0-9]+]], [[FNEG]], [[FNEG]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
; VI-DAG: v_max_f16_e64 [[REG0:v[0-9]+]], -v{{[0-9]+}}, -v{{[0-9]+}}
|
||||||
; VI-DAG: v_max_f16_e32 [[REG0:v[0-9]+]], [[FNEG]], [[FNEG]]
|
|
||||||
; VI-NOT: 0xffff
|
; VI-NOT: 0xffff
|
||||||
|
|
||||||
; GFX9: v_pk_max_f16 [[REG:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} neg_lo:[1,1] neg_hi:[1,1]{{$}}
|
; GFX9: v_pk_max_f16 [[REG:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} neg_lo:[1,1] neg_hi:[1,1]{{$}}
|
||||||
|
@ -94,12 +94,13 @@ define amdgpu_kernel void @test_fmax3_olt_1_f16(half addrspace(1)* %out, half ad
|
|||||||
; SI-NEXT: v_max3_f32
|
; SI-NEXT: v_max3_f32
|
||||||
; SI-NEXT: v_max3_f32
|
; SI-NEXT: v_max3_f32
|
||||||
|
|
||||||
; VI: v_max_f16_e32
|
; VI: v_max_f16_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||||
; VI-NEXT: v_max_f16_e32
|
; VI: v_max_f16_e32 v0, v0, v1
|
||||||
; VI-NEXT: v_max_f16_e32
|
; VI: v_max_f16_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
||||||
; VI-NEXT: v_max_f16_e32
|
; VI: v_max_f16_e32 v0, v2, v0
|
||||||
; VI-NEXT: v_max_f16_e32
|
; VI: v_max_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
||||||
; VI-NEXT: v_max_f16_e32
|
; VI: v_max_f16_e32 v0, v0, v3
|
||||||
|
; VI: v_or_b32_e32 v0, v0, v1
|
||||||
|
|
||||||
; GFX9: v_pk_max_f16
|
; GFX9: v_pk_max_f16
|
||||||
; GFX9-NEXT: v_pk_max_f16
|
; GFX9-NEXT: v_pk_max_f16
|
||||||
|
@ -92,12 +92,13 @@ define amdgpu_kernel void @test_fmin3_olt_1_f16(half addrspace(1)* %out, half ad
|
|||||||
; SI-NEXT: v_min3_f32
|
; SI-NEXT: v_min3_f32
|
||||||
; SI-NEXT: v_min3_f32
|
; SI-NEXT: v_min3_f32
|
||||||
|
|
||||||
; VI: v_min_f16_e32
|
; VI: v_min_f16_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||||
; VI-NEXT: v_min_f16_e32
|
; VI: v_min_f16_e32 v0, v0, v1
|
||||||
; VI-NEXT: v_min_f16_e32
|
; VI: v_min_f16_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
||||||
; VI-NEXT: v_min_f16_e32
|
; VI: v_min_f16_e32 v0, v2, v0
|
||||||
; VI-NEXT: v_min_f16_e32
|
; VI: v_min_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
||||||
; VI-NEXT: v_min_f16_e32
|
; VI: v_min_f16_e32 v0, v0, v3
|
||||||
|
; VI: v_or_b32_e32 v0, v0, v1
|
||||||
|
|
||||||
; GFX9: v_pk_min_f16
|
; GFX9: v_pk_min_f16
|
||||||
; GFX9: v_pk_min_f16
|
; GFX9: v_pk_min_f16
|
||||||
|
@ -73,12 +73,9 @@ define amdgpu_kernel void @v_fneg_fabs_f16(half addrspace(1)* %out, half addrspa
|
|||||||
; CI: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 16, v{{[0-9]+}}
|
; CI: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 16, v{{[0-9]+}}
|
||||||
; CI: v_or_b32_e32 [[OR:v[0-9]+]], v{{[0-9]+}}, [[SHL]]
|
; CI: v_or_b32_e32 [[OR:v[0-9]+]], v{{[0-9]+}}, [[SHL]]
|
||||||
; CI: v_or_b32_e32 v{{[0-9]+}}, 0x80008000, [[OR]]
|
; CI: v_or_b32_e32 v{{[0-9]+}}, 0x80008000, [[OR]]
|
||||||
; VI: s_mov_b32 [[MASK:s[0-9]+]], 0x8000{{$}}
|
|
||||||
; VI: v_mov_b32_e32 [[VMASK:v[0-9]+]], [[MASK]]
|
|
||||||
; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[VMASK]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
|
||||||
; VI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
|
|
||||||
; CIVI: flat_store_dword
|
|
||||||
|
|
||||||
|
; FIXME: Random commute
|
||||||
|
; VI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80008000
|
||||||
; GFX9: s_or_b32 s{{[0-9]+}}, 0x80008000, s{{[0-9]+}}
|
; GFX9: s_or_b32 s{{[0-9]+}}, 0x80008000, s{{[0-9]+}}
|
||||||
define amdgpu_kernel void @s_fneg_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in) {
|
define amdgpu_kernel void @s_fneg_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in) {
|
||||||
%fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in)
|
%fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in)
|
||||||
@ -95,14 +92,13 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x
|
|||||||
; CI: v_or_b32_e32 [[OR1:v[0-9]+]], v{{[0-9]+}}, [[SHL1]]
|
; CI: v_or_b32_e32 [[OR1:v[0-9]+]], v{{[0-9]+}}, [[SHL1]]
|
||||||
; CI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]], [[OR0]]
|
; CI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]], [[OR0]]
|
||||||
; CI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]], [[OR1]]
|
; CI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]], [[OR1]]
|
||||||
; VI: s_mov_b32 [[MASK:s[0-9]+]], 0x8000{{$}}
|
|
||||||
; VI: v_mov_b32_e32 [[VMASK:v[0-9]+]], [[MASK]]
|
|
||||||
; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[VMASK]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
|
||||||
; VI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
|
|
||||||
; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[VMASK]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
|
||||||
; VI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
|
|
||||||
|
|
||||||
; GFX9: s_mov_b32 [[MASK:s[0-9]+]], 0x80008000
|
; FIXME: Random commute
|
||||||
|
; GFX89: s_mov_b32 [[MASK:s[0-9]+]], 0x80008000
|
||||||
|
|
||||||
|
; VI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[MASK]]
|
||||||
|
; VI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[MASK]]
|
||||||
|
|
||||||
; GFX9: s_or_b32 s{{[0-9]+}}, [[MASK]], s{{[0-9]+}}
|
; GFX9: s_or_b32 s{{[0-9]+}}, [[MASK]], s{{[0-9]+}}
|
||||||
; GFX9: s_or_b32 s{{[0-9]+}}, [[MASK]], s{{[0-9]+}}
|
; GFX9: s_or_b32 s{{[0-9]+}}, [[MASK]], s{{[0-9]+}}
|
||||||
|
|
||||||
@ -120,7 +116,7 @@ define amdgpu_kernel void @fneg_fabs_v4f16(<4 x half> addrspace(1)* %out, <4 x h
|
|||||||
; CI: v_mul_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
|
; CI: v_mul_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
|
||||||
; CI: v_mul_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
|
; CI: v_mul_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
|
||||||
|
|
||||||
; VI: v_mul_f16_e64 v{{[0-9]+}}, -|v{{[0-9]+}}|, 4.0
|
; VI: v_mul_f16_e64 v{{[0-9]+}}, -|s{{[0-9]+}}|, 4.0
|
||||||
; VI: v_mul_f16_sdwa v{{[0-9]+}}, -|v{{[0-9]+}}|, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
; VI: v_mul_f16_sdwa v{{[0-9]+}}, -|v{{[0-9]+}}|, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||||
|
|
||||||
; GFX9: s_and_b32 [[ABS:s[0-9]+]], s{{[0-9]+}}, 0x7fff7fff
|
; GFX9: s_and_b32 [[ABS:s[0-9]+]], s{{[0-9]+}}, 0x7fff7fff
|
||||||
|
@ -60,7 +60,8 @@ define amdgpu_kernel void @v_fneg_fold_f16(half addrspace(1)* %out, half addrspa
|
|||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
; FIXME: Terrible code with VI and even worse with SI/CI
|
; FIXME: Terrible code with SI/CI.
|
||||||
|
; FIXME: scalar for VI, vector for gfx9
|
||||||
; GCN-LABEL: {{^}}s_fneg_v2f16:
|
; GCN-LABEL: {{^}}s_fneg_v2f16:
|
||||||
; CI: s_mov_b32 [[MASK:s[0-9]+]], 0x8000{{$}}
|
; CI: s_mov_b32 [[MASK:s[0-9]+]], 0x8000{{$}}
|
||||||
; CI: v_xor_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}}
|
; CI: v_xor_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}}
|
||||||
@ -68,12 +69,9 @@ define amdgpu_kernel void @v_fneg_fold_f16(half addrspace(1)* %out, half addrspa
|
|||||||
; CI: v_xor_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}}
|
; CI: v_xor_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}}
|
||||||
; CI: v_or_b32_e32
|
; CI: v_or_b32_e32
|
||||||
|
|
||||||
; VI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x8000{{$}}
|
; VI: s_xor_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80008000
|
||||||
; VI-DAG: v_xor_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[MASK]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
|
||||||
; VI-DAG: v_xor_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[MASK]]
|
|
||||||
|
|
||||||
; GFX9: v_xor_b32_e32 v{{[0-9]+}}, 0x80008000, v{{[0-9]+}}
|
; GFX9: v_xor_b32_e32 v{{[0-9]+}}, 0x80008000, v{{[0-9]+}}
|
||||||
|
|
||||||
define amdgpu_kernel void @s_fneg_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in) #0 {
|
define amdgpu_kernel void @s_fneg_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in) #0 {
|
||||||
%fneg = fsub <2 x half> <half -0.0, half -0.0>, %in
|
%fneg = fsub <2 x half> <half -0.0, half -0.0>, %in
|
||||||
store <2 x half> %fneg, <2 x half> addrspace(1)* %out
|
store <2 x half> %fneg, <2 x half> addrspace(1)* %out
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
|
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
|
||||||
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
|
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
|
||||||
|
|
||||||
; half args should be promoted to float for SI and lower.
|
; half args should be promoted to float for SI and lower.
|
||||||
|
|
||||||
@ -13,13 +13,17 @@ define amdgpu_kernel void @load_f16_arg(half addrspace(1)* %out, half %arg) #0 {
|
|||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
|
; FIXME: Should always be the same
|
||||||
; GCN-LABEL: {{^}}load_v2f16_arg:
|
; GCN-LABEL: {{^}}load_v2f16_arg:
|
||||||
; GCN-DAG: buffer_load_ushort [[V0:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44
|
; SI-DAG: buffer_load_ushort [[V0:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44
|
||||||
; GCN-DAG: buffer_load_ushort [[V1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:46
|
; SI-DAG: buffer_load_ushort [[V1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:46
|
||||||
; GCN: v_lshlrev_b32_e32 [[HI:v[0-9]+]], 16, [[V1]]
|
; SI: v_lshlrev_b32_e32 [[HI:v[0-9]+]], 16, [[V1]]
|
||||||
; GCN: v_or_b32_e32 [[PACKED:v[0-9]+]], [[V0]], [[HI]]
|
; SI: v_or_b32_e32 [[PACKED:v[0-9]+]], [[V0]], [[HI]]
|
||||||
; GCN: buffer_store_dword [[PACKED]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
|
; SI: buffer_store_dword [[PACKED]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
|
||||||
; GCN: s_endpgm
|
|
||||||
|
; VI: s_load_dword [[ARG:s[0-9]+]]
|
||||||
|
; VI: v_mov_b32_e32 [[V_ARG:v[0-9]+]], [[ARG]]
|
||||||
|
; VI: buffer_store_dword [[V_ARG]]
|
||||||
define amdgpu_kernel void @load_v2f16_arg(<2 x half> addrspace(1)* %out, <2 x half> %arg) #0 {
|
define amdgpu_kernel void @load_v2f16_arg(<2 x half> addrspace(1)* %out, <2 x half> %arg) #0 {
|
||||||
store <2 x half> %arg, <2 x half> addrspace(1)* %out
|
store <2 x half> %arg, <2 x half> addrspace(1)* %out
|
||||||
ret void
|
ret void
|
||||||
@ -40,12 +44,18 @@ define amdgpu_kernel void @load_v3f16_arg(<3 x half> addrspace(1)* %out, <3 x ha
|
|||||||
}
|
}
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}load_v4f16_arg:
|
; GCN-LABEL: {{^}}load_v4f16_arg:
|
||||||
; GCN: buffer_load_ushort
|
; SI: buffer_load_ushort
|
||||||
; GCN: buffer_load_ushort
|
; SI: buffer_load_ushort
|
||||||
; GCN: buffer_load_ushort
|
; SI: buffer_load_ushort
|
||||||
; GCN: buffer_load_ushort
|
; SI: buffer_load_ushort
|
||||||
; GCN: buffer_store_dwordx2
|
; SI: buffer_store_dwordx2
|
||||||
; GCN: s_endpgm
|
|
||||||
|
; FIXME: Why not one load?
|
||||||
|
; VI-DAG: s_load_dword [[ARG0_LO:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
|
||||||
|
; VI-DAG: s_load_dword [[ARG0_HI:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
|
||||||
|
; VI-DAG: v_mov_b32_e32 v[[V_ARG0_LO:[0-9]+]], [[ARG0_LO]]
|
||||||
|
; VI-DAG: v_mov_b32_e32 v[[V_ARG0_HI:[0-9]+]], [[ARG0_HI]]
|
||||||
|
; VI: buffer_store_dwordx2 v{{\[}}[[V_ARG0_LO]]:[[V_ARG0_HI]]{{\]}}
|
||||||
define amdgpu_kernel void @load_v4f16_arg(<4 x half> addrspace(1)* %out, <4 x half> %arg) #0 {
|
define amdgpu_kernel void @load_v4f16_arg(<4 x half> addrspace(1)* %out, <4 x half> %arg) #0 {
|
||||||
store <4 x half> %arg, <4 x half> addrspace(1)* %out
|
store <4 x half> %arg, <4 x half> addrspace(1)* %out
|
||||||
ret void
|
ret void
|
||||||
@ -104,14 +114,20 @@ define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(<4 x float> addrspace(1)*
|
|||||||
}
|
}
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}extload_v8f16_to_v8f32_arg:
|
; GCN-LABEL: {{^}}extload_v8f16_to_v8f32_arg:
|
||||||
; GCN: buffer_load_ushort
|
; SI: buffer_load_ushort
|
||||||
; GCN: buffer_load_ushort
|
; SI: buffer_load_ushort
|
||||||
; GCN: buffer_load_ushort
|
; SI: buffer_load_ushort
|
||||||
; GCN: buffer_load_ushort
|
; SI: buffer_load_ushort
|
||||||
; GCN: buffer_load_ushort
|
; SI: buffer_load_ushort
|
||||||
; GCN: buffer_load_ushort
|
; SI: buffer_load_ushort
|
||||||
; GCN: buffer_load_ushort
|
; SI: buffer_load_ushort
|
||||||
; GCN: buffer_load_ushort
|
; SI: buffer_load_ushort
|
||||||
|
|
||||||
|
|
||||||
|
; VI: s_load_dword s
|
||||||
|
; VI: s_load_dword s
|
||||||
|
; VI: s_load_dword s
|
||||||
|
; VI: s_load_dword s
|
||||||
|
|
||||||
; GCN: v_cvt_f32_f16_e32
|
; GCN: v_cvt_f32_f16_e32
|
||||||
; GCN: v_cvt_f32_f16_e32
|
; GCN: v_cvt_f32_f16_e32
|
||||||
@ -145,8 +161,12 @@ define amdgpu_kernel void @extload_f16_to_f64_arg(double addrspace(1)* %out, hal
|
|||||||
}
|
}
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}extload_v2f16_to_v2f64_arg:
|
; GCN-LABEL: {{^}}extload_v2f16_to_v2f64_arg:
|
||||||
; GCN-DAG: buffer_load_ushort v
|
; SI-DAG: buffer_load_ushort v
|
||||||
; GCN-DAG: buffer_load_ushort v
|
; SI-DAG: buffer_load_ushort v
|
||||||
|
|
||||||
|
; VI-DAG: s_load_dword s
|
||||||
|
; VI: s_lshr_b32
|
||||||
|
|
||||||
; GCN-DAG: v_cvt_f32_f16_e32
|
; GCN-DAG: v_cvt_f32_f16_e32
|
||||||
; GCN-DAG: v_cvt_f32_f16_e32
|
; GCN-DAG: v_cvt_f32_f16_e32
|
||||||
; GCN-DAG: v_cvt_f64_f32_e32
|
; GCN-DAG: v_cvt_f64_f32_e32
|
||||||
@ -176,10 +196,14 @@ define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(<3 x double> addrspace(1)*
|
|||||||
}
|
}
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}extload_v4f16_to_v4f64_arg:
|
; GCN-LABEL: {{^}}extload_v4f16_to_v4f64_arg:
|
||||||
; GCN-DAG: buffer_load_ushort v
|
; SI: buffer_load_ushort v
|
||||||
; GCN-DAG: buffer_load_ushort v
|
; SI: buffer_load_ushort v
|
||||||
; GCN-DAG: buffer_load_ushort v
|
; SI: buffer_load_ushort v
|
||||||
; GCN-DAG: buffer_load_ushort v
|
; SI: buffer_load_ushort v
|
||||||
|
|
||||||
|
; VI: s_load_dword s
|
||||||
|
; VI: s_load_dword s
|
||||||
|
|
||||||
; GCN-DAG: v_cvt_f32_f16_e32
|
; GCN-DAG: v_cvt_f32_f16_e32
|
||||||
; GCN-DAG: v_cvt_f32_f16_e32
|
; GCN-DAG: v_cvt_f32_f16_e32
|
||||||
; GCN-DAG: v_cvt_f32_f16_e32
|
; GCN-DAG: v_cvt_f32_f16_e32
|
||||||
@ -196,15 +220,23 @@ define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(<4 x double> addrspace(1)*
|
|||||||
}
|
}
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}extload_v8f16_to_v8f64_arg:
|
; GCN-LABEL: {{^}}extload_v8f16_to_v8f64_arg:
|
||||||
; GCN-DAG: buffer_load_ushort v
|
; SI: buffer_load_ushort v
|
||||||
; GCN-DAG: buffer_load_ushort v
|
; SI: buffer_load_ushort v
|
||||||
; GCN-DAG: buffer_load_ushort v
|
; SI: buffer_load_ushort v
|
||||||
; GCN-DAG: buffer_load_ushort v
|
; SI: buffer_load_ushort v
|
||||||
|
|
||||||
|
; SI: buffer_load_ushort v
|
||||||
|
; SI: buffer_load_ushort v
|
||||||
|
; SI: buffer_load_ushort v
|
||||||
|
; SI: buffer_load_ushort v
|
||||||
|
|
||||||
|
|
||||||
|
; VI: s_load_dword s
|
||||||
|
; VI: s_load_dword s
|
||||||
|
; VI: s_load_dword s
|
||||||
|
; VI: s_load_dword s
|
||||||
|
|
||||||
|
|
||||||
; GCN-DAG: buffer_load_ushort v
|
|
||||||
; GCN-DAG: buffer_load_ushort v
|
|
||||||
; GCN-DAG: buffer_load_ushort v
|
|
||||||
; GCN-DAG: buffer_load_ushort v
|
|
||||||
|
|
||||||
; GCN-DAG: v_cvt_f32_f16_e32
|
; GCN-DAG: v_cvt_f32_f16_e32
|
||||||
; GCN-DAG: v_cvt_f32_f16_e32
|
; GCN-DAG: v_cvt_f32_f16_e32
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
|
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
|
||||||
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
|
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
|
||||||
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s
|
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s
|
||||||
; FIXME: Merge into imm.ll
|
; FIXME: Merge into imm.ll
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}store_inline_imm_neg_0.0_v2i16:
|
; GCN-LABEL: {{^}}store_inline_imm_neg_0.0_v2i16:
|
||||||
@ -120,11 +120,14 @@ define amdgpu_kernel void @store_literal_imm_v2f16(<2 x half> addrspace(1)* %out
|
|||||||
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 0{{$}}
|
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 0{{$}}
|
||||||
; GFX9: buffer_store_dword [[REG]]
|
; GFX9: buffer_store_dword [[REG]]
|
||||||
|
|
||||||
; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
|
; FIXME: Shouldn't need right shift and SDWA, also extra copy
|
||||||
; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
|
; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
|
||||||
; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 0, [[VAL0]]
|
|
||||||
; VI-DAG: v_mov_b32_e32 [[CONST0:v[0-9]+]], 0
|
; VI-DAG: v_mov_b32_e32 [[CONST0:v[0-9]+]], 0
|
||||||
; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONST0]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
|
||||||
|
; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]
|
||||||
|
|
||||||
|
; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST0]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||||
|
; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 0
|
||||||
; VI: v_or_b32
|
; VI: v_or_b32
|
||||||
; VI: buffer_store_dword
|
; VI: buffer_store_dword
|
||||||
define amdgpu_kernel void @add_inline_imm_0.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
|
define amdgpu_kernel void @add_inline_imm_0.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
|
||||||
@ -138,11 +141,14 @@ define amdgpu_kernel void @add_inline_imm_0.0_v2f16(<2 x half> addrspace(1)* %ou
|
|||||||
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 0.5 op_sel_hi:[1,0]{{$}}
|
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 0.5 op_sel_hi:[1,0]{{$}}
|
||||||
; GFX9: buffer_store_dword [[REG]]
|
; GFX9: buffer_store_dword [[REG]]
|
||||||
|
|
||||||
; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
|
; FIXME: Shouldn't need right shift and SDWA, also extra copy
|
||||||
; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
|
; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
|
||||||
; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 0.5, [[VAL0]]
|
|
||||||
; VI-DAG: v_mov_b32_e32 [[CONST05:v[0-9]+]], 0x3800
|
; VI-DAG: v_mov_b32_e32 [[CONST05:v[0-9]+]], 0x3800
|
||||||
; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONST05]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
|
||||||
|
; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]
|
||||||
|
|
||||||
|
; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST05]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||||
|
; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 0.5
|
||||||
; VI: v_or_b32
|
; VI: v_or_b32
|
||||||
; VI: buffer_store_dword
|
; VI: buffer_store_dword
|
||||||
define amdgpu_kernel void @add_inline_imm_0.5_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
|
define amdgpu_kernel void @add_inline_imm_0.5_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
|
||||||
@ -156,11 +162,14 @@ define amdgpu_kernel void @add_inline_imm_0.5_v2f16(<2 x half> addrspace(1)* %ou
|
|||||||
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], -0.5 op_sel_hi:[1,0]{{$}}
|
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], -0.5 op_sel_hi:[1,0]{{$}}
|
||||||
; GFX9: buffer_store_dword [[REG]]
|
; GFX9: buffer_store_dword [[REG]]
|
||||||
|
|
||||||
; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
|
; FIXME: Shouldn't need right shift and SDWA, also extra copy
|
||||||
; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
|
; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
|
||||||
; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -0.5, [[VAL0]]
|
|
||||||
; VI-DAG: v_mov_b32_e32 [[CONSTM05:v[0-9]+]], 0xb800
|
; VI-DAG: v_mov_b32_e32 [[CONSTM05:v[0-9]+]], 0xb800
|
||||||
; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONSTM05]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
|
||||||
|
; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]
|
||||||
|
|
||||||
|
; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONSTM05]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||||
|
; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], -0.5
|
||||||
; VI: v_or_b32
|
; VI: v_or_b32
|
||||||
; VI: buffer_store_dword
|
; VI: buffer_store_dword
|
||||||
define amdgpu_kernel void @add_inline_imm_neg_0.5_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
|
define amdgpu_kernel void @add_inline_imm_neg_0.5_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
|
||||||
@ -174,11 +183,14 @@ define amdgpu_kernel void @add_inline_imm_neg_0.5_v2f16(<2 x half> addrspace(1)*
|
|||||||
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 1.0 op_sel_hi:[1,0]{{$}}
|
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 1.0 op_sel_hi:[1,0]{{$}}
|
||||||
; GFX9: buffer_store_dword [[REG]]
|
; GFX9: buffer_store_dword [[REG]]
|
||||||
|
|
||||||
; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
|
; FIXME: Shouldn't need right shift and SDWA, also extra copy
|
||||||
; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
|
; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
|
||||||
; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 1.0, [[VAL0]]
|
|
||||||
; VI-DAG: v_mov_b32_e32 [[CONST1:v[0-9]+]], 0x3c00
|
; VI-DAG: v_mov_b32_e32 [[CONST1:v[0-9]+]], 0x3c00
|
||||||
; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONST1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
|
||||||
|
; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]
|
||||||
|
|
||||||
|
; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||||
|
; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 1.0
|
||||||
; VI: v_or_b32
|
; VI: v_or_b32
|
||||||
; VI: buffer_store_dword
|
; VI: buffer_store_dword
|
||||||
define amdgpu_kernel void @add_inline_imm_1.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
|
define amdgpu_kernel void @add_inline_imm_1.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
|
||||||
@ -192,11 +204,15 @@ define amdgpu_kernel void @add_inline_imm_1.0_v2f16(<2 x half> addrspace(1)* %ou
|
|||||||
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], -1.0 op_sel_hi:[1,0]{{$}}
|
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], -1.0 op_sel_hi:[1,0]{{$}}
|
||||||
; GFX9: buffer_store_dword [[REG]]
|
; GFX9: buffer_store_dword [[REG]]
|
||||||
|
|
||||||
; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
|
|
||||||
; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
|
; FIXME: Shouldn't need right shift and SDWA, also extra copy
|
||||||
; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -1.0, [[VAL0]]
|
; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
|
||||||
; VI-DAG: v_mov_b32_e32 [[CONSTM1:v[0-9]+]], 0xbc00
|
; VI-DAG: v_mov_b32_e32 [[CONST1:v[0-9]+]], 0xbc00
|
||||||
; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONSTM1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
|
||||||
|
; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]
|
||||||
|
|
||||||
|
; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||||
|
; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], -1.0
|
||||||
; VI: v_or_b32
|
; VI: v_or_b32
|
||||||
; VI: buffer_store_dword
|
; VI: buffer_store_dword
|
||||||
define amdgpu_kernel void @add_inline_imm_neg_1.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
|
define amdgpu_kernel void @add_inline_imm_neg_1.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
|
||||||
@ -210,11 +226,14 @@ define amdgpu_kernel void @add_inline_imm_neg_1.0_v2f16(<2 x half> addrspace(1)*
|
|||||||
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 2.0 op_sel_hi:[1,0]{{$}}
|
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 2.0 op_sel_hi:[1,0]{{$}}
|
||||||
; GFX9: buffer_store_dword [[REG]]
|
; GFX9: buffer_store_dword [[REG]]
|
||||||
|
|
||||||
; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
|
; FIXME: Shouldn't need right shift and SDWA, also extra copy
|
||||||
; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
|
; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
|
||||||
; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 2.0, [[VAL0]]
|
|
||||||
; VI-DAG: v_mov_b32_e32 [[CONST2:v[0-9]+]], 0x4000
|
; VI-DAG: v_mov_b32_e32 [[CONST2:v[0-9]+]], 0x4000
|
||||||
; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONST2]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
|
||||||
|
; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]
|
||||||
|
|
||||||
|
; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST2]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||||
|
; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 2.0
|
||||||
; VI: v_or_b32
|
; VI: v_or_b32
|
||||||
; VI: buffer_store_dword
|
; VI: buffer_store_dword
|
||||||
define amdgpu_kernel void @add_inline_imm_2.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
|
define amdgpu_kernel void @add_inline_imm_2.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
|
||||||
@ -228,11 +247,14 @@ define amdgpu_kernel void @add_inline_imm_2.0_v2f16(<2 x half> addrspace(1)* %ou
|
|||||||
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], -2.0 op_sel_hi:[1,0]{{$}}
|
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], -2.0 op_sel_hi:[1,0]{{$}}
|
||||||
; GFX9: buffer_store_dword [[REG]]
|
; GFX9: buffer_store_dword [[REG]]
|
||||||
|
|
||||||
; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
|
; FIXME: Shouldn't need right shift and SDWA, also extra copy
|
||||||
; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
|
; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
|
||||||
; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -2.0, [[VAL0]]
|
|
||||||
; VI-DAG: v_mov_b32_e32 [[CONSTM2:v[0-9]+]], 0xc000
|
; VI-DAG: v_mov_b32_e32 [[CONSTM2:v[0-9]+]], 0xc000
|
||||||
; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONSTM2]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
|
||||||
|
; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]
|
||||||
|
|
||||||
|
; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONSTM2]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||||
|
; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], -2.0
|
||||||
; VI: v_or_b32
|
; VI: v_or_b32
|
||||||
; VI: buffer_store_dword
|
; VI: buffer_store_dword
|
||||||
define amdgpu_kernel void @add_inline_imm_neg_2.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
|
define amdgpu_kernel void @add_inline_imm_neg_2.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
|
||||||
@ -246,11 +268,14 @@ define amdgpu_kernel void @add_inline_imm_neg_2.0_v2f16(<2 x half> addrspace(1)*
|
|||||||
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 4.0 op_sel_hi:[1,0]{{$}}
|
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 4.0 op_sel_hi:[1,0]{{$}}
|
||||||
; GFX9: buffer_store_dword [[REG]]
|
; GFX9: buffer_store_dword [[REG]]
|
||||||
|
|
||||||
; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
|
; FIXME: Shouldn't need right shift and SDWA, also extra copy
|
||||||
; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
|
; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
|
||||||
; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 4.0, [[VAL0]]
|
|
||||||
; VI-DAG: v_mov_b32_e32 [[CONST4:v[0-9]+]], 0x4400
|
; VI-DAG: v_mov_b32_e32 [[CONST4:v[0-9]+]], 0x4400
|
||||||
; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONST4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
|
||||||
|
; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]
|
||||||
|
|
||||||
|
; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||||
|
; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 4.0
|
||||||
; VI: v_or_b32
|
; VI: v_or_b32
|
||||||
; VI: buffer_store_dword
|
; VI: buffer_store_dword
|
||||||
define amdgpu_kernel void @add_inline_imm_4.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
|
define amdgpu_kernel void @add_inline_imm_4.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
|
||||||
@ -264,11 +289,14 @@ define amdgpu_kernel void @add_inline_imm_4.0_v2f16(<2 x half> addrspace(1)* %ou
|
|||||||
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], -4.0 op_sel_hi:[1,0]{{$}}
|
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], -4.0 op_sel_hi:[1,0]{{$}}
|
||||||
; GFX9: buffer_store_dword [[REG]]
|
; GFX9: buffer_store_dword [[REG]]
|
||||||
|
|
||||||
; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
|
; FIXME: Shouldn't need right shift and SDWA, also extra copy
|
||||||
; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
|
; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
|
||||||
; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -4.0, [[VAL0]]
|
|
||||||
; VI-DAG: v_mov_b32_e32 [[CONSTM4:v[0-9]+]], 0xc400
|
; VI-DAG: v_mov_b32_e32 [[CONSTM4:v[0-9]+]], 0xc400
|
||||||
; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONSTM4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
|
||||||
|
; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]
|
||||||
|
|
||||||
|
; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONSTM4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||||
|
; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], -4.0
|
||||||
; VI: v_or_b32
|
; VI: v_or_b32
|
||||||
; VI: buffer_store_dword
|
; VI: buffer_store_dword
|
||||||
define amdgpu_kernel void @add_inline_imm_neg_4.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
|
define amdgpu_kernel void @add_inline_imm_neg_4.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
|
||||||
@ -321,11 +349,14 @@ define amdgpu_kernel void @commute_add_literal_v2f16(<2 x half> addrspace(1)* %o
|
|||||||
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 1 op_sel_hi:[1,0]{{$}}
|
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 1 op_sel_hi:[1,0]{{$}}
|
||||||
; GFX9: buffer_store_dword [[REG]]
|
; GFX9: buffer_store_dword [[REG]]
|
||||||
|
|
||||||
; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
|
; FIXME: Shouldn't need right shift and SDWA, also extra copy
|
||||||
; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
|
; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
|
||||||
; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 1, [[VAL0]]
|
; VI-DAG: v_mov_b32_e32 [[CONST1:v[0-9]+]], 1{{$}}
|
||||||
; VI-DAG: v_mov_b32_e32 [[CONST1:v[0-9]+]], 1
|
; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
|
||||||
; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONST1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]
|
||||||
|
|
||||||
|
; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||||
|
; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 1{{$}}
|
||||||
; VI: v_or_b32
|
; VI: v_or_b32
|
||||||
; VI: buffer_store_dword
|
; VI: buffer_store_dword
|
||||||
define amdgpu_kernel void @add_inline_imm_1_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
|
define amdgpu_kernel void @add_inline_imm_1_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
|
||||||
@ -339,11 +370,15 @@ define amdgpu_kernel void @add_inline_imm_1_v2f16(<2 x half> addrspace(1)* %out,
|
|||||||
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 2 op_sel_hi:[1,0]{{$}}
|
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 2 op_sel_hi:[1,0]{{$}}
|
||||||
; GFX9: buffer_store_dword [[REG]]
|
; GFX9: buffer_store_dword [[REG]]
|
||||||
|
|
||||||
; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
|
|
||||||
; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
|
; FIXME: Shouldn't need right shift and SDWA, also extra copy
|
||||||
; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 2, [[VAL0]]
|
; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
|
||||||
; VI-DAG: v_mov_b32_e32 [[CONST2:v[0-9]+]], 2
|
; VI-DAG: v_mov_b32_e32 [[CONST2:v[0-9]+]], 2{{$}}
|
||||||
; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONST2]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
|
||||||
|
; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]
|
||||||
|
|
||||||
|
; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST2]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||||
|
; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 2{{$}}
|
||||||
; VI: v_or_b32
|
; VI: v_or_b32
|
||||||
; VI: buffer_store_dword
|
; VI: buffer_store_dword
|
||||||
define amdgpu_kernel void @add_inline_imm_2_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
|
define amdgpu_kernel void @add_inline_imm_2_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
|
||||||
@ -357,11 +392,15 @@ define amdgpu_kernel void @add_inline_imm_2_v2f16(<2 x half> addrspace(1)* %out,
|
|||||||
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 16 op_sel_hi:[1,0]{{$}}
|
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 16 op_sel_hi:[1,0]{{$}}
|
||||||
; GFX9: buffer_store_dword [[REG]]
|
; GFX9: buffer_store_dword [[REG]]
|
||||||
|
|
||||||
; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
|
|
||||||
; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
|
; FIXME: Shouldn't need right shift and SDWA, also extra copy
|
||||||
; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 16, [[VAL0]]
|
; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
|
||||||
; VI-DAG: v_mov_b32_e32 [[CONST16:v[0-9]+]], 16
|
; VI-DAG: v_mov_b32_e32 [[CONST16:v[0-9]+]], 16{{$}}
|
||||||
; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONST16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
|
||||||
|
; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]
|
||||||
|
|
||||||
|
; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||||
|
; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 16{{$}}
|
||||||
; VI: v_or_b32
|
; VI: v_or_b32
|
||||||
; VI: buffer_store_dword
|
; VI: buffer_store_dword
|
||||||
define amdgpu_kernel void @add_inline_imm_16_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
|
define amdgpu_kernel void @add_inline_imm_16_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
|
||||||
@ -375,10 +414,9 @@ define amdgpu_kernel void @add_inline_imm_16_v2f16(<2 x half> addrspace(1)* %out
|
|||||||
; GFX9: v_mov_b32_e32 [[REG:v[0-9]+]], [[VAL]]
|
; GFX9: v_mov_b32_e32 [[REG:v[0-9]+]], [[VAL]]
|
||||||
; GFX9: buffer_store_dword [[REG]]
|
; GFX9: buffer_store_dword [[REG]]
|
||||||
|
|
||||||
; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
|
; VI: s_load_dword [[VAL:s[0-9]+]]
|
||||||
; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
|
; VI: s_add_i32 [[ADD:s[0-9]+]], [[VAL]], -1{{$}}
|
||||||
; VI: v_or_b32_e32 [[REG:v[0-9]+]]
|
; VI: v_mov_b32_e32 [[REG:v[0-9]+]], [[ADD]]
|
||||||
; VI: v_add_u32_e32 [[REG]], vcc, -1, [[REG]]
|
|
||||||
; VI: buffer_store_dword [[REG]]
|
; VI: buffer_store_dword [[REG]]
|
||||||
define amdgpu_kernel void @add_inline_imm_neg_1_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
|
define amdgpu_kernel void @add_inline_imm_neg_1_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
|
||||||
%xbc = bitcast <2 x half> %x to i32
|
%xbc = bitcast <2 x half> %x to i32
|
||||||
@ -393,10 +431,9 @@ define amdgpu_kernel void @add_inline_imm_neg_1_v2f16(<2 x half> addrspace(1)* %
|
|||||||
; GFX9: v_mov_b32_e32 [[REG:v[0-9]+]], [[VAL]]
|
; GFX9: v_mov_b32_e32 [[REG:v[0-9]+]], [[VAL]]
|
||||||
; GFX9: buffer_store_dword [[REG]]
|
; GFX9: buffer_store_dword [[REG]]
|
||||||
|
|
||||||
; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
|
; VI: s_load_dword [[VAL:s[0-9]+]]
|
||||||
; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
|
; VI: s_add_i32 [[ADD:s[0-9]+]], [[VAL]], 0xfffefffe{{$}}
|
||||||
; VI: v_or_b32_e32 [[REG:v[0-9]+]]
|
; VI: v_mov_b32_e32 [[REG:v[0-9]+]], [[ADD]]
|
||||||
; VI: v_add_u32_e32 [[REG]], vcc, 0xfffefffe, [[REG]]
|
|
||||||
; VI: buffer_store_dword [[REG]]
|
; VI: buffer_store_dword [[REG]]
|
||||||
define amdgpu_kernel void @add_inline_imm_neg_2_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
|
define amdgpu_kernel void @add_inline_imm_neg_2_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
|
||||||
%xbc = bitcast <2 x half> %x to i32
|
%xbc = bitcast <2 x half> %x to i32
|
||||||
@ -411,10 +448,10 @@ define amdgpu_kernel void @add_inline_imm_neg_2_v2f16(<2 x half> addrspace(1)* %
|
|||||||
; GFX9: v_mov_b32_e32 [[REG:v[0-9]+]], [[VAL]]
|
; GFX9: v_mov_b32_e32 [[REG:v[0-9]+]], [[VAL]]
|
||||||
; GFX9: buffer_store_dword [[REG]]
|
; GFX9: buffer_store_dword [[REG]]
|
||||||
|
|
||||||
; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
|
|
||||||
; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
|
; VI: s_load_dword [[VAL:s[0-9]+]]
|
||||||
; VI: v_or_b32_e32 [[REG:v[0-9]+]]
|
; VI: s_add_i32 [[ADD:s[0-9]+]], [[VAL]], 0xfff0fff0{{$}}
|
||||||
; VI: v_add_u32_e32 [[REG]], vcc, 0xfff0fff0, [[REG]]
|
; VI: v_mov_b32_e32 [[REG:v[0-9]+]], [[ADD]]
|
||||||
; VI: buffer_store_dword [[REG]]
|
; VI: buffer_store_dword [[REG]]
|
||||||
define amdgpu_kernel void @add_inline_imm_neg_16_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
|
define amdgpu_kernel void @add_inline_imm_neg_16_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
|
||||||
%xbc = bitcast <2 x half> %x to i32
|
%xbc = bitcast <2 x half> %x to i32
|
||||||
@ -429,11 +466,14 @@ define amdgpu_kernel void @add_inline_imm_neg_16_v2f16(<2 x half> addrspace(1)*
|
|||||||
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 63
|
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 63
|
||||||
; GFX9: buffer_store_dword [[REG]]
|
; GFX9: buffer_store_dword [[REG]]
|
||||||
|
|
||||||
; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
|
; FIXME: Shouldn't need right shift and SDWA, also extra copy
|
||||||
; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
|
; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
|
||||||
; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 63, [[VAL0]]
|
|
||||||
; VI-DAG: v_mov_b32_e32 [[CONST63:v[0-9]+]], 63
|
; VI-DAG: v_mov_b32_e32 [[CONST63:v[0-9]+]], 63
|
||||||
; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONST63]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
|
||||||
|
; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]
|
||||||
|
|
||||||
|
; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST63]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||||
|
; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 63
|
||||||
; VI: v_or_b32
|
; VI: v_or_b32
|
||||||
; VI: buffer_store_dword
|
; VI: buffer_store_dword
|
||||||
define amdgpu_kernel void @add_inline_imm_63_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
|
define amdgpu_kernel void @add_inline_imm_63_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
|
||||||
@ -447,11 +487,14 @@ define amdgpu_kernel void @add_inline_imm_63_v2f16(<2 x half> addrspace(1)* %out
|
|||||||
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 64
|
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 64
|
||||||
; GFX9: buffer_store_dword [[REG]]
|
; GFX9: buffer_store_dword [[REG]]
|
||||||
|
|
||||||
; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
|
; FIXME: Shouldn't need right shift and SDWA, also extra copy
|
||||||
; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
|
; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
|
||||||
; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 64, [[VAL0]]
|
|
||||||
; VI-DAG: v_mov_b32_e32 [[CONST64:v[0-9]+]], 64
|
; VI-DAG: v_mov_b32_e32 [[CONST64:v[0-9]+]], 64
|
||||||
; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONST64]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
|
||||||
|
; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]
|
||||||
|
|
||||||
|
; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST64]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||||
|
; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 64
|
||||||
; VI: v_or_b32
|
; VI: v_or_b32
|
||||||
; VI: buffer_store_dword
|
; VI: buffer_store_dword
|
||||||
define amdgpu_kernel void @add_inline_imm_64_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
|
define amdgpu_kernel void @add_inline_imm_64_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
|
||||||
|
@ -42,16 +42,19 @@ define amdgpu_kernel void @s_input_output_f16() {
|
|||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
; GCN: error: couldn't allocate output register for constraint 's'
|
; CI: error: couldn't allocate output register for constraint 's'
|
||||||
; GCN: error: couldn't allocate input reg for constraint 's'
|
; CI: error: couldn't allocate input reg for constraint 's'
|
||||||
|
|
||||||
|
; VI-NOT: error
|
||||||
define amdgpu_kernel void @s_input_output_v2f16() {
|
define amdgpu_kernel void @s_input_output_v2f16() {
|
||||||
%v = tail call <2 x half> asm sideeffect "s_mov_b32 $0, -1", "=s"()
|
%v = tail call <2 x half> asm sideeffect "s_mov_b32 $0, -1", "=s"()
|
||||||
tail call void asm sideeffect "; use $0", "s"(<2 x half> %v)
|
tail call void asm sideeffect "; use $0", "s"(<2 x half> %v)
|
||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
; GCN: error: couldn't allocate output register for constraint 'v'
|
; CI: error: couldn't allocate output register for constraint 'v'
|
||||||
; GCN: error: couldn't allocate input reg for constraint 'v'
|
; CI: error: couldn't allocate input reg for constraint 'v'
|
||||||
|
; VI-NOT: error
|
||||||
define amdgpu_kernel void @v_input_output_v2f16() {
|
define amdgpu_kernel void @v_input_output_v2f16() {
|
||||||
%v = tail call <2 x half> asm sideeffect "v_mov_b32 $0, -1", "=v"()
|
%v = tail call <2 x half> asm sideeffect "v_mov_b32 $0, -1", "=v"()
|
||||||
tail call void asm sideeffect "; use $0", "v"(<2 x half> %v)
|
tail call void asm sideeffect "; use $0", "v"(<2 x half> %v)
|
||||||
@ -67,8 +70,12 @@ define amdgpu_kernel void @s_input_output_i16() {
|
|||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
; GCN: error: couldn't allocate output register for constraint 's'
|
; FIXME: Should work on all targets?
|
||||||
; GCN: error: couldn't allocate input reg for constraint 's'
|
|
||||||
|
; CI: error: couldn't allocate output register for constraint 's'
|
||||||
|
; CI: error: couldn't allocate input reg for constraint 's'
|
||||||
|
|
||||||
|
; VI-NOT: error
|
||||||
define amdgpu_kernel void @s_input_output_v2i16() {
|
define amdgpu_kernel void @s_input_output_v2i16() {
|
||||||
%v = tail call <2 x i16> asm sideeffect "s_mov_b32 $0, -1", "=s"()
|
%v = tail call <2 x i16> asm sideeffect "s_mov_b32 $0, -1", "=s"()
|
||||||
tail call void asm sideeffect "; use $0", "s"(<2 x i16> %v)
|
tail call void asm sideeffect "; use $0", "s"(<2 x i16> %v)
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
; RUN: llc -verify-machineinstrs -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tahiti -mattr=+max-private-element-size-16 < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=GCN-NO-TONGA %s
|
; RUN: llc -verify-machineinstrs -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tahiti -mattr=+max-private-element-size-16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,GCN-NO-TONGA %s
|
||||||
; RUN: llc -verify-machineinstrs -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -mattr=+max-private-element-size-16 < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=GCN-TONGA %s
|
; RUN: llc -verify-machineinstrs -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -mattr=+max-private-element-size-16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GCN-TONGA %s
|
||||||
|
|
||||||
; FIXME: Broken on evergreen
|
; FIXME: Broken on evergreen
|
||||||
; FIXME: For some reason the 8 and 16 vectors are being stored as
|
; FIXME: For some reason the 8 and 16 vectors are being stored as
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
; RUN: llc -verify-machineinstrs -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -enable-amdgpu-aa=0 -mattr=+flat-for-global,-fp64-fp16-denormals < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 %s
|
; RUN: llc -verify-machineinstrs -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -enable-amdgpu-aa=0 -mattr=+flat-for-global,-fp64-fp16-denormals < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX89 %s
|
||||||
; RUN: llc -verify-machineinstrs -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=fiji -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=CIVI -check-prefix=VI -check-prefix=GFX89 %s
|
; RUN: llc -verify-machineinstrs -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=fiji -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,VI,GFX89 %s
|
||||||
; RUN: llc -verify-machineinstrs -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=hawaii -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=CIVI -check-prefix=CI %s
|
; RUN: llc -verify-machineinstrs -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=hawaii -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,CI %s
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}s_insertelement_v2i16_0:
|
; GCN-LABEL: {{^}}s_insertelement_v2i16_0:
|
||||||
; GCN: s_load_dword [[VEC:s[0-9]+]]
|
; GCN: s_load_dword [[VEC:s[0-9]+]]
|
||||||
@ -39,11 +39,21 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reg(<2 x i16> addrspace(1)* %
|
|||||||
; GCN: s_load_dword [[ELT0:s[0-9]+]]
|
; GCN: s_load_dword [[ELT0:s[0-9]+]]
|
||||||
; GCN: s_load_dword [[VEC:s[0-9]+]]
|
; GCN: s_load_dword [[VEC:s[0-9]+]]
|
||||||
|
|
||||||
; CIVI-DAG: s_and_b32 [[ELT0]], [[ELT0]], 0xffff{{$}}
|
; CI-DAG: s_and_b32 [[ELT0]], [[ELT0]], 0xffff{{$}}
|
||||||
; CIVI: s_lshr_b32 [[SHR:s[0-9]+]], [[VEC]], 16
|
; CI: s_lshr_b32 [[SHR:s[0-9]+]], [[VEC]], 16
|
||||||
; CIVI: s_lshl_b32 [[ELT1:s[0-9]+]], [[SHR]], 16
|
; CI: s_lshl_b32 [[ELT1:s[0-9]+]], [[SHR]], 16
|
||||||
; CIVI-DAG: s_or_b32 s{{[0-9]+}}, [[ELT0]], [[ELT1]]
|
; CI-DAG: s_or_b32 s{{[0-9]+}}, [[ELT0]], [[ELT1]]
|
||||||
; CIVI-DAG: ; use [[SHR]]
|
; CI-DAG: ; use [[SHR]]
|
||||||
|
|
||||||
|
|
||||||
|
; FIXME: Should be able to avoid mask of upper bits
|
||||||
|
; VI-DAG: s_and_b32 [[ELT0]], [[ELT0]], 0xffff{{$}}
|
||||||
|
; VI-DAG: s_and_b32 [[VEC_HIMASK:s[0-9]+]], [[VEC]], 0xffff0000{{$}}
|
||||||
|
; VI: s_or_b32 [[OR:s[0-9]+]], [[ELT0]], [[VEC_HIMASK]]
|
||||||
|
; VI: s_lshr_b32 [[SHR:s[0-9]+]], [[VEC]], 16
|
||||||
|
|
||||||
|
; VI-DAG: ; use [[SHR]]
|
||||||
|
|
||||||
|
|
||||||
; GFX9: s_lshr_b32 [[ELT1:s[0-9]+]], [[VEC]], 16
|
; GFX9: s_lshr_b32 [[ELT1:s[0-9]+]], [[VEC]], 16
|
||||||
; GFX9-DAG: s_pack_ll_b32_b16 s{{[0-9]+}}, [[ELT0]], [[ELT1]]
|
; GFX9-DAG: s_pack_ll_b32_b16 s{{[0-9]+}}, [[ELT0]], [[ELT1]]
|
||||||
@ -103,10 +113,16 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(<2 x i16> a
|
|||||||
; GCN: s_load_dword [[ELT_ARG:s[0-9]+]], s[0:1]
|
; GCN: s_load_dword [[ELT_ARG:s[0-9]+]], s[0:1]
|
||||||
; GCN: s_load_dword [[VEC:s[0-9]+]],
|
; GCN: s_load_dword [[VEC:s[0-9]+]],
|
||||||
|
|
||||||
; CIVI-DAG: s_lshr_b32 [[ELT_HI:s[0-9]+]], [[ELT_ARG]], 16
|
; CI-DAG: s_lshr_b32 [[ELT_HI:s[0-9]+]], [[ELT_ARG]], 16
|
||||||
; CIVI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VEC]], 16
|
; CI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VEC]], 16
|
||||||
; CIVI-DAG: s_lshl_b32 [[VEC_HI:s[0-9]+]], [[SHR]], 16
|
; CI-DAG: s_lshl_b32 [[VEC_HI:s[0-9]+]], [[SHR]], 16
|
||||||
; CIVI: s_or_b32 s{{[0-9]+}}, [[ELT_HI]], [[VEC_HI]]
|
; CI: s_or_b32 s{{[0-9]+}}, [[ELT_HI]], [[VEC_HI]]
|
||||||
|
|
||||||
|
|
||||||
|
; VI-DAG: s_lshr_b32 [[ELT_HI:s[0-9]+]], [[ELT_ARG]], 16
|
||||||
|
; VI-DAG: s_lshr_b32 [[VEC_HI:s[0-9]+]], [[VEC]], 16
|
||||||
|
; VI: s_and_b32 [[MASK_HI:s[0-9]+]], [[VEC]], 0xffff0000
|
||||||
|
; VI: s_or_b32 s{{[0-9]+}}, [[ELT_HI]], [[MASK_HI]]
|
||||||
|
|
||||||
; GFX9-DAG: s_lshr_b32 [[ELT_HI:s[0-9]+]], [[ELT_ARG]], 16
|
; GFX9-DAG: s_lshr_b32 [[ELT_HI:s[0-9]+]], [[ELT_ARG]], 16
|
||||||
; GFX9-DAG: s_lshr_b32 [[VEC_HI:s[0-9]+]], [[VEC]], 16
|
; GFX9-DAG: s_lshr_b32 [[VEC_HI:s[0-9]+]], [[VEC]], 16
|
||||||
|
@ -1,8 +1,8 @@
|
|||||||
; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck %s --check-prefixes=SI,GCN,MESA-GCN,FUNC
|
; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck -enable-var-scope --check-prefixes=SI,GCN,MESA-GCN,FUNC %s
|
||||||
; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefixes=VI,GCN,MESA-VI,MESA-GCN,FUNC
|
; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck -enable-var-scope -check-prefixes=VI,GCN,MESA-VI,MESA-GCN,FUNC %s
|
||||||
; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs | FileCheck %s --check-prefixes=VI,GCN,HSA-VI,FUNC
|
; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs | FileCheck -enable-var-scope -check-prefixes=VI,GCN,HSA-VI,FUNC %s
|
||||||
; RUN: llc < %s -march=r600 -mcpu=redwood -verify-machineinstrs | FileCheck %s --check-prefix=EG --check-prefix=FUNC
|
; RUN: llc < %s -march=r600 -mcpu=redwood -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=EG --check-prefix=FUNC %s
|
||||||
; RUN: llc < %s -march=r600 -mcpu=cayman -verify-machineinstrs | FileCheck %s --check-prefix=EG --check-prefix=FUNC
|
; RUN: llc < %s -march=r600 -mcpu=cayman -verify-machineinstrs | FileCheck -enable-var-scope --check-prefix=EG --check-prefix=FUNC %s
|
||||||
|
|
||||||
; FUNC-LABEL: {{^}}i8_arg:
|
; FUNC-LABEL: {{^}}i8_arg:
|
||||||
; HSA-VI: kernarg_segment_alignment = 4
|
; HSA-VI: kernarg_segment_alignment = 4
|
||||||
@ -162,10 +162,11 @@ entry:
|
|||||||
; HSA-VI: kernarg_segment_alignment = 4
|
; HSA-VI: kernarg_segment_alignment = 4
|
||||||
; EG: VTX_READ_16
|
; EG: VTX_READ_16
|
||||||
; EG: VTX_READ_16
|
; EG: VTX_READ_16
|
||||||
; MESA-GCN: buffer_load_ushort
|
|
||||||
; MESA-GCN: buffer_load_ushort
|
; SI: buffer_load_ushort
|
||||||
; HSA-VI: flat_load_ushort
|
; SI: buffer_load_ushort
|
||||||
; HSA-VI: flat_load_ushort
|
|
||||||
|
; VI: s_load_dword s
|
||||||
define amdgpu_kernel void @v2i16_arg(<2 x i16> addrspace(1)* %out, <2 x i16> %in) {
|
define amdgpu_kernel void @v2i16_arg(<2 x i16> addrspace(1)* %out, <2 x i16> %in) {
|
||||||
entry:
|
entry:
|
||||||
store <2 x i16> %in, <2 x i16> addrspace(1)* %out
|
store <2 x i16> %in, <2 x i16> addrspace(1)* %out
|
||||||
@ -285,14 +286,14 @@ entry:
|
|||||||
; EG: VTX_READ_16
|
; EG: VTX_READ_16
|
||||||
; EG: VTX_READ_16
|
; EG: VTX_READ_16
|
||||||
; EG: VTX_READ_16
|
; EG: VTX_READ_16
|
||||||
; MESA-GCN: buffer_load_ushort
|
|
||||||
; MESA-GCN: buffer_load_ushort
|
; SI: buffer_load_ushort
|
||||||
; MESA-GCN: buffer_load_ushort
|
; SI: buffer_load_ushort
|
||||||
; MESA-GCN: buffer_load_ushort
|
; SI: buffer_load_ushort
|
||||||
; HSA-GCN: flat_load_ushort
|
; SI: buffer_load_ushort
|
||||||
; HSA-GCN: flat_load_ushort
|
|
||||||
; HSA-GCN: flat_load_ushort
|
; VI: s_load_dword s
|
||||||
; HSA-GCN: flat_load_ushort
|
; VI: s_load_dword s
|
||||||
define amdgpu_kernel void @v4i16_arg(<4 x i16> addrspace(1)* %out, <4 x i16> %in) {
|
define amdgpu_kernel void @v4i16_arg(<4 x i16> addrspace(1)* %out, <4 x i16> %in) {
|
||||||
entry:
|
entry:
|
||||||
store <4 x i16> %in, <4 x i16> addrspace(1)* %out
|
store <4 x i16> %in, <4 x i16> addrspace(1)* %out
|
||||||
@ -305,6 +306,7 @@ entry:
|
|||||||
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
|
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
|
||||||
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
|
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
|
||||||
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].X
|
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].X
|
||||||
|
|
||||||
; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xd
|
; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xd
|
||||||
; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x34
|
; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x34
|
||||||
; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10
|
; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10
|
||||||
@ -370,22 +372,20 @@ entry:
|
|||||||
; EG: VTX_READ_16
|
; EG: VTX_READ_16
|
||||||
; EG: VTX_READ_16
|
; EG: VTX_READ_16
|
||||||
; EG: VTX_READ_16
|
; EG: VTX_READ_16
|
||||||
; MESA-GCN: buffer_load_ushort
|
|
||||||
; MESA-GCN: buffer_load_ushort
|
; SI: buffer_load_ushort
|
||||||
; MESA-GCN: buffer_load_ushort
|
; SI: buffer_load_ushort
|
||||||
; MESA-GCN: buffer_load_ushort
|
; SI: buffer_load_ushort
|
||||||
; MESA-GCN: buffer_load_ushort
|
; SI: buffer_load_ushort
|
||||||
; MESA-GCN: buffer_load_ushort
|
; SI: buffer_load_ushort
|
||||||
; MESA-GCN: buffer_load_ushort
|
; SI: buffer_load_ushort
|
||||||
; MESA-GCN: buffer_load_ushort
|
; SI: buffer_load_ushort
|
||||||
; HSA-VI: flat_load_ushort
|
; SI: buffer_load_ushort
|
||||||
; HSA-VI: flat_load_ushort
|
|
||||||
; HSA-VI: flat_load_ushort
|
; VI: s_load_dword s
|
||||||
; HSA-VI: flat_load_ushort
|
; VI: s_load_dword s
|
||||||
; HSA-VI: flat_load_ushort
|
; VI: s_load_dword s
|
||||||
; HSA-VI: flat_load_ushort
|
; VI: s_load_dword s
|
||||||
; HSA-VI: flat_load_ushort
|
|
||||||
; HSA-VI: flat_load_ushort
|
|
||||||
define amdgpu_kernel void @v8i16_arg(<8 x i16> addrspace(1)* %out, <8 x i16> %in) {
|
define amdgpu_kernel void @v8i16_arg(<8 x i16> addrspace(1)* %out, <8 x i16> %in) {
|
||||||
entry:
|
entry:
|
||||||
store <8 x i16> %in, <8 x i16> addrspace(1)* %out
|
store <8 x i16> %in, <8 x i16> addrspace(1)* %out
|
||||||
@ -502,38 +502,32 @@ entry:
|
|||||||
; EG: VTX_READ_16
|
; EG: VTX_READ_16
|
||||||
; EG: VTX_READ_16
|
; EG: VTX_READ_16
|
||||||
; EG: VTX_READ_16
|
; EG: VTX_READ_16
|
||||||
; MESA-GCN: buffer_load_ushort
|
|
||||||
; MESA-GCN: buffer_load_ushort
|
; SI: buffer_load_ushort
|
||||||
; MESA-GCN: buffer_load_ushort
|
; SI: buffer_load_ushort
|
||||||
; MESA-GCN: buffer_load_ushort
|
; SI: buffer_load_ushort
|
||||||
; MESA-GCN: buffer_load_ushort
|
; SI: buffer_load_ushort
|
||||||
; MESA-GCN: buffer_load_ushort
|
; SI: buffer_load_ushort
|
||||||
; MESA-GCN: buffer_load_ushort
|
; SI: buffer_load_ushort
|
||||||
; MESA-GCN: buffer_load_ushort
|
; SI: buffer_load_ushort
|
||||||
; MESA-GCN: buffer_load_ushort
|
; SI: buffer_load_ushort
|
||||||
; MESA-GCN: buffer_load_ushort
|
; SI: buffer_load_ushort
|
||||||
; MESA-GCN: buffer_load_ushort
|
; SI: buffer_load_ushort
|
||||||
; MESA-GCN: buffer_load_ushort
|
; SI: buffer_load_ushort
|
||||||
; MESA-GCN: buffer_load_ushort
|
; SI: buffer_load_ushort
|
||||||
; MESA-GCN: buffer_load_ushort
|
; SI: buffer_load_ushort
|
||||||
; MESA-GCN: buffer_load_ushort
|
; SI: buffer_load_ushort
|
||||||
; MESA-GCN: buffer_load_ushort
|
; SI: buffer_load_ushort
|
||||||
; HSA-VI: flat_load_ushort
|
; SI: buffer_load_ushort
|
||||||
; HSA-VI: flat_load_ushort
|
|
||||||
; HSA-VI: flat_load_ushort
|
; VI: s_load_dword s
|
||||||
; HSA-VI: flat_load_ushort
|
; VI: s_load_dword s
|
||||||
; HSA-VI: flat_load_ushort
|
; VI: s_load_dword s
|
||||||
; HSA-VI: flat_load_ushort
|
; VI: s_load_dword s
|
||||||
; HSA-VI: flat_load_ushort
|
; VI: s_load_dword s
|
||||||
; HSA-VI: flat_load_ushort
|
; VI: s_load_dword s
|
||||||
; HSA-VI: flat_load_ushort
|
; VI: s_load_dword s
|
||||||
; HSA-VI: flat_load_ushort
|
; VI: s_load_dword s
|
||||||
; HSA-VI: flat_load_ushort
|
|
||||||
; HSA-VI: flat_load_ushort
|
|
||||||
; HSA-VI: flat_load_ushort
|
|
||||||
; HSA-VI: flat_load_ushort
|
|
||||||
; HSA-VI: flat_load_ushort
|
|
||||||
; HSA-VI: flat_load_ushort
|
|
||||||
define amdgpu_kernel void @v16i16_arg(<16 x i16> addrspace(1)* %out, <16 x i16> %in) {
|
define amdgpu_kernel void @v16i16_arg(<16 x i16> addrspace(1)* %out, <16 x i16> %in) {
|
||||||
entry:
|
entry:
|
||||||
store <16 x i16> %in, <16 x i16> addrspace(1)* %out
|
store <16 x i16> %in, <16 x i16> addrspace(1)* %out
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs -show-mc-encoding | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=UNPACKED %s
|
; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=UNPACKED %s
|
||||||
; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED -check-prefix=GFX81 %s
|
; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED -check-prefix=GFX81 %s
|
||||||
; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED -check-prefix=GFX9 %s
|
; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED -check-prefix=GFX9 %s
|
||||||
|
|
||||||
@ -13,9 +13,12 @@ main_body:
|
|||||||
|
|
||||||
; GCN-LABEL: {{^}}buffer_store_format_d16_xy:
|
; GCN-LABEL: {{^}}buffer_store_format_d16_xy:
|
||||||
|
|
||||||
; UNPACKED: flat_load_ushort v[[HI:[0-9]+]], v[{{[0-9]+:[0-9]+}}] ;
|
; UNPACKED: s_load_dword [[S_DATA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x34
|
||||||
; UNPACKED: flat_load_ushort v[[LO:[0-9]+]], v[{{[0-9]+:[0-9]+}}] ;
|
; UNPACKED-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[S_DATA]], 16
|
||||||
; UNPACKED: buffer_store_format_d16_xy v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen
|
; UNPACKED-DAG: s_and_b32 [[MASKED:s[0-9]+]], [[S_DATA]], 0xffff{{$}}
|
||||||
|
; UNPACKED-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], [[MASKED]]
|
||||||
|
; UNPACKED-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], [[SHR]]
|
||||||
|
; UNPACKED: buffer_store_format_d16_xy v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen
|
||||||
|
|
||||||
; PACKED: buffer_store_format_d16_xy v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen
|
; PACKED: buffer_store_format_d16_xy v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen
|
||||||
define amdgpu_kernel void @buffer_store_format_d16_xy(<4 x i32> %rsrc, <2 x half> %data, i32 %index) {
|
define amdgpu_kernel void @buffer_store_format_d16_xy(<4 x i32> %rsrc, <2 x half> %data, i32 %index) {
|
||||||
@ -26,17 +29,27 @@ main_body:
|
|||||||
|
|
||||||
; GCN-LABEL: {{^}}buffer_store_format_d16_xyzw:
|
; GCN-LABEL: {{^}}buffer_store_format_d16_xyzw:
|
||||||
|
|
||||||
; UNPACKED: flat_load_ushort v[[HI:[0-9]+]], v[{{[0-9]+:[0-9]+}}] ;
|
; UNPACKED-DAG: s_load_dword [[S_DATA_0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x34
|
||||||
; UNPACKED: flat_load_ushort v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] ;
|
; UNPACKED-DAG: s_load_dword [[S_DATA_1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x38
|
||||||
; UNPACKED: flat_load_ushort v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] ;
|
|
||||||
; UNPACKED: flat_load_ushort v[[LO:[0-9]+]], v[{{[0-9]+:[0-9]+}}] ;
|
; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}}
|
||||||
|
; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], [[S_DATA_0]], 16
|
||||||
|
; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], [[S_DATA_0]], [[K]]
|
||||||
|
; UNPACKED-DAG: s_lshr_b32 [[SHR1:s[0-9]+]], [[S_DATA_1]], 16
|
||||||
|
; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], [[S_DATA_1]], [[K]]
|
||||||
|
|
||||||
|
; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]]
|
||||||
|
; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR1]]
|
||||||
|
|
||||||
; UNPACKED: buffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen
|
; UNPACKED: buffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen
|
||||||
|
|
||||||
; GFX81: v_or_b32_e32 v[[HI:[0-9]+]]
|
|
||||||
; GFX81: v_or_b32_e32 v[[LO:[0-9]+]]
|
|
||||||
|
|
||||||
; GFX9: v_mov_b32_e32 v[[LO:[0-9]+]]
|
|
||||||
; GFX9: v_mov_b32_e32 v[[HI:[0-9]+]]
|
; PACKED-DAG: s_load_dword [[S_DATA_0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x34
|
||||||
|
; PACKED-DAG: s_load_dword [[S_DATA_1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x38
|
||||||
|
|
||||||
|
; PACKED: v_mov_b32_e32 v[[LO:[0-9]+]], [[S_DATA_0]]
|
||||||
|
; PACKED: v_mov_b32_e32 v[[HI:[0-9]+]], [[S_DATA_1]]
|
||||||
|
|
||||||
; PACKED: buffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen
|
; PACKED: buffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen
|
||||||
define amdgpu_kernel void @buffer_store_format_d16_xyzw(<4 x i32> %rsrc, <4 x half> %data, i32 %index) {
|
define amdgpu_kernel void @buffer_store_format_d16_xyzw(<4 x i32> %rsrc, <4 x half> %data, i32 %index) {
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=UNPACKED %s
|
; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=UNPACKED %s
|
||||||
; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=PACKED -check-prefix=GFX81 %s
|
; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED -check-prefix=GFX81 %s
|
||||||
; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=PACKED -check-prefix=GFX9 %s
|
; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED -check-prefix=GFX9 %s
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}image_load_f16
|
; GCN-LABEL: {{^}}image_load_f16
|
||||||
; GCN: image_load v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x1 unorm d16
|
; GCN: image_load v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x1 unorm d16
|
||||||
@ -58,11 +58,17 @@ main_body:
|
|||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}image_store_v2f16
|
; FIXME: Eliminate the `and` instruction used to extract the low bits
|
||||||
|
; GCN-LABEL: {{^}}image_store_v2f16:
|
||||||
|
; UNPACKED: s_load_dword [[DATA:s[0-9]+]]
|
||||||
|
; UNPACKED-DAG: s_lshr_b32 [[UNPACK_1:s[0-9]+]], [[DATA]], 16
|
||||||
|
; UNPACKED-DAG: s_and_b32 [[UNPACK_0:s[0-9]+]], [[DATA]], 0xffff
|
||||||
|
; UNPACKED-DAG: v_mov_b32_e32 v[[V_UNPACK_0:[0-9]+]], [[UNPACK_0]]
|
||||||
|
; UNPACKED-DAG: v_mov_b32_e32 v[[V_UNPACK_1:[0-9]+]], [[UNPACK_1]]
|
||||||
|
|
||||||
; UNPACKED: flat_load_ushort v[[HI:[0-9]+]], v[{{[0-9]+:[0-9]+}}]{{$}}
|
|
||||||
; UNPACKED: flat_load_ushort v[[LO:[0-9]+]], v[{{[0-9]+:[0-9]+}}]{{$}}
|
|
||||||
; UNPACKED: image_store v{{\[}}[[LO]]:[[HI]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x3 unorm d16
|
; UNPACKED: image_store v{{\[}}[[V_UNPACK_0]]:[[V_UNPACK_1]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x3 unorm d16
|
||||||
|
|
||||||
; PACKED: image_store v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x3 unorm d16
|
; PACKED: image_store v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x3 unorm d16
|
||||||
define amdgpu_kernel void @image_store_v2f16(<2 x half> %data, <4 x i32> %coords, <8 x i32> inreg %rsrc) {
|
define amdgpu_kernel void @image_store_v2f16(<2 x half> %data, <4 x i32> %coords, <8 x i32> inreg %rsrc) {
|
||||||
@ -72,20 +78,19 @@ main_body:
|
|||||||
}
|
}
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}image_store_v4f16
|
; GCN-LABEL: {{^}}image_store_v4f16
|
||||||
|
; UNPACKED: s_load_dword s
|
||||||
|
; UNPACKED: s_load_dword s
|
||||||
|
; UNPACKED: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
|
||||||
|
; UNPACKED: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
|
||||||
|
; UNPACKED: s_and_b32
|
||||||
|
; UNPACKED: s_and_b32
|
||||||
|
; UNPACKED: image_store v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf unorm d16
|
||||||
|
|
||||||
; UNPACKED: flat_load_ushort v[[HI:[0-9]+]], v[{{[0-9]+:[0-9]+}}]{{$}}
|
; PACKED: s_load_dword [[DATA0:s[0-9]+]]
|
||||||
; UNPACKED: flat_load_ushort v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]{{$}}
|
; PACKED: s_load_dword [[DATA1:s[0-9]+]]
|
||||||
; UNPACKED: flat_load_ushort v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]{{$}}
|
; PACKED: v_mov_b32_e32 v[[V_LO:[0-9]+]], [[DATA0]]
|
||||||
; UNPACKED: flat_load_ushort v[[LO:[0-9]+]], v[{{[0-9]+:[0-9]+}}]{{$}}
|
; PACKED: v_mov_b32_e32 v[[V_HI:[0-9]+]], [[DATA1]]
|
||||||
; UNPACKED: image_store v{{\[}}[[LO]]:[[HI]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf unorm d16
|
; PACKED: image_store v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf unorm d16
|
||||||
|
|
||||||
; GFX81: v_or_b32_e32 v[[HI:[0-9]+]]
|
|
||||||
; GFX81: v_or_b32_e32 v[[LO:[0-9]+]]
|
|
||||||
|
|
||||||
; GFX9: v_mov_b32_e32 v[[LO:[0-9]+]]
|
|
||||||
; GFX9: v_mov_b32_e32 v[[HI:[0-9]+]]
|
|
||||||
|
|
||||||
; PACKED: image_store v{{\[}}[[LO]]:[[HI]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf unorm d16
|
|
||||||
define amdgpu_kernel void @image_store_v4f16(<4 x half> %data, <4 x i32> %coords, <8 x i32> inreg %rsrc) {
|
define amdgpu_kernel void @image_store_v4f16(<4 x half> %data, <4 x i32> %coords, <8 x i32> inreg %rsrc) {
|
||||||
main_body:
|
main_body:
|
||||||
call void @llvm.amdgcn.image.store.v4f16.v4i32.v8i32(<4 x half> %data, <4 x i32> %coords, <8 x i32> %rsrc, i32 15, i1 false, i1 false, i1 false, i1 false)
|
call void @llvm.amdgcn.image.store.v4f16.v4i32.v8i32(<4 x half> %data, <4 x i32> %coords, <8 x i32> %rsrc, i32 15, i1 false, i1 false, i1 false, i1 false)
|
||||||
@ -93,20 +98,19 @@ main_body:
|
|||||||
}
|
}
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}image_store_mip_v4f16
|
; GCN-LABEL: {{^}}image_store_mip_v4f16
|
||||||
|
; UNPACKED: s_load_dword s
|
||||||
|
; UNPACKED: s_load_dword s
|
||||||
|
; UNPACKED: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
|
||||||
|
; UNPACKED: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
|
||||||
|
; UNPACKED: s_and_b32
|
||||||
|
; UNPACKED: s_and_b32
|
||||||
|
; UNPACKED: image_store_mip v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf unorm d16
|
||||||
|
|
||||||
; UNPACKED: flat_load_ushort v[[HI:[0-9]+]], v[{{[0-9]+:[0-9]+}}]{{$}}
|
; PACKED: s_load_dword [[DATA0:s[0-9]+]]
|
||||||
; UNPACKED: flat_load_ushort v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]{{$}}
|
; PACKED: s_load_dword [[DATA1:s[0-9]+]]
|
||||||
; UNPACKED: flat_load_ushort v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]{{$}}
|
; PACKED: v_mov_b32_e32 v[[V_LO:[0-9]+]], [[DATA0]]
|
||||||
; UNPACKED: flat_load_ushort v[[LO:[0-9]+]], v[{{[0-9]+:[0-9]+}}]{{$}}
|
; PACKED: v_mov_b32_e32 v[[V_HI:[0-9]+]], [[DATA1]]
|
||||||
; UNPACKED: image_store_mip v{{\[}}[[LO]]:[[HI]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf unorm d16
|
; PACKED: image_store_mip v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf unorm d16
|
||||||
|
|
||||||
; GFX81: v_or_b32_e32 v[[HI:[0-9]+]]
|
|
||||||
; GFX81: v_or_b32_e32 v[[LO:[0-9]+]]
|
|
||||||
|
|
||||||
; GFX9: v_mov_b32_e32 v[[LO:[0-9]+]]
|
|
||||||
; GFX9: v_mov_b32_e32 v[[HI:[0-9]+]]
|
|
||||||
|
|
||||||
; PACKED: image_store_mip v{{\[}}[[LO]]:[[HI]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf unorm d16
|
|
||||||
define amdgpu_kernel void @image_store_mip_v4f16(<4 x half> %data, <4 x i32> %coords, <8 x i32> inreg %rsrc) {
|
define amdgpu_kernel void @image_store_mip_v4f16(<4 x half> %data, <4 x i32> %coords, <8 x i32> inreg %rsrc) {
|
||||||
main_body:
|
main_body:
|
||||||
call void @llvm.amdgcn.image.store.mip.v4f16.v4i32.v8i32(<4 x half> %data, <4 x i32> %coords, <8 x i32> %rsrc, i32 15, i1 false, i1 false, i1 false, i1 false)
|
call void @llvm.amdgcn.image.store.mip.v4f16.v4i32.v8i32(<4 x half> %data, <4 x i32> %coords, <8 x i32> %rsrc, i32 15, i1 false, i1 false, i1 false, i1 false)
|
||||||
|
@ -1,4 +1,5 @@
|
|||||||
; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
|
; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
|
||||||
|
; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
|
||||||
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
|
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}load_1d:
|
; GCN-LABEL: {{^}}load_1d:
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs -show-mc-encoding | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=UNPACKED %s
|
; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=UNPACKED %s
|
||||||
; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED -check-prefix=GFX81 %s
|
; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED -check-prefix=GFX81 %s
|
||||||
; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED -check-prefix=GFX9 %s
|
; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED -check-prefix=GFX9 %s
|
||||||
|
|
||||||
@ -12,12 +12,13 @@ main_body:
|
|||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}tbuffer_store_d16_xy:
|
; GCN-LABEL: {{^}}tbuffer_store_d16_xy:
|
||||||
|
; GCN: s_load_dword [[S_DATA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x34
|
||||||
; UNPACKED: flat_load_ushort v[[HI:[0-9]+]], v[{{[0-9]+:[0-9]+}}] ;
|
; UNPACKED-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[S_DATA]], 16
|
||||||
; UNPACKED: flat_load_ushort v[[LO:[0-9]+]], v[{{[0-9]+:[0-9]+}}] ;
|
; UNPACKED-DAG: s_and_b32 [[MASKED:s[0-9]+]], [[S_DATA]], 0xffff{{$}}
|
||||||
; UNPACKED: tbuffer_store_format_d16_xy v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen
|
; UNPACKED-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], [[MASKED]]
|
||||||
|
; UNPACKED-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], [[SHR]]
|
||||||
|
; UNPACKED: tbuffer_store_format_d16_xy v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen
|
||||||
|
|
||||||
; PACKED: tbuffer_store_format_d16_xy v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen
|
; PACKED: tbuffer_store_format_d16_xy v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen
|
||||||
define amdgpu_kernel void @tbuffer_store_d16_xy(<4 x i32> %rsrc, <2 x half> %data, i32 %vindex) {
|
define amdgpu_kernel void @tbuffer_store_d16_xy(<4 x i32> %rsrc, <2 x half> %data, i32 %vindex) {
|
||||||
@ -26,21 +27,23 @@ main_body:
|
|||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}tbuffer_store_d16_xyzw:
|
; GCN-LABEL: {{^}}tbuffer_store_d16_xyzw:
|
||||||
|
; GCN-DAG: s_load_dword [[S_DATA_0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x34
|
||||||
|
; GCN-DAG: s_load_dword [[S_DATA_1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x38
|
||||||
|
|
||||||
; UNPACKED: flat_load_ushort v[[HI:[0-9]+]], v[{{[0-9]+:[0-9]+}}] ;
|
; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}}
|
||||||
; UNPACKED: flat_load_ushort v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] ;
|
; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], [[S_DATA_0]], 16
|
||||||
; UNPACKED: flat_load_ushort v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] ;
|
; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], [[S_DATA_0]], [[K]]
|
||||||
; UNPACKED: flat_load_ushort v[[LO:[0-9]+]], v[{{[0-9]+:[0-9]+}}] ;
|
; UNPACKED-DAG: s_lshr_b32 [[SHR1:s[0-9]+]], [[S_DATA_1]], 16
|
||||||
|
; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], [[S_DATA_1]], [[K]]
|
||||||
|
|
||||||
|
; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]]
|
||||||
|
; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR1]]
|
||||||
; UNPACKED: tbuffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen
|
; UNPACKED: tbuffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen
|
||||||
|
|
||||||
; GFX81: v_or_b32_e32 v[[HI:[0-9]+]]
|
|
||||||
; GFX81: v_or_b32_e32 v[[LO:[0-9]+]]
|
|
||||||
|
|
||||||
; GFX9: v_mov_b32_e32 v[[LO:[0-9]+]]
|
|
||||||
; GFX9: v_mov_b32_e32 v[[HI:[0-9]+]]
|
|
||||||
|
|
||||||
|
; PACKED: v_mov_b32_e32 v[[LO:[0-9]+]], [[S_DATA_0]]
|
||||||
|
; PACKED: v_mov_b32_e32 v[[HI:[0-9]+]], [[S_DATA_1]]
|
||||||
; PACKED: tbuffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen
|
; PACKED: tbuffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen
|
||||||
define amdgpu_kernel void @tbuffer_store_d16_xyzw(<4 x i32> %rsrc, <4 x half> %data, i32 %vindex) {
|
define amdgpu_kernel void @tbuffer_store_d16_xyzw(<4 x i32> %rsrc, <4 x half> %data, i32 %vindex) {
|
||||||
main_body:
|
main_body:
|
||||||
|
@ -145,8 +145,12 @@ define amdgpu_kernel void @fma_v2f16(
|
|||||||
}
|
}
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}fma_v2f16_imm_a:
|
; GCN-LABEL: {{^}}fma_v2f16_imm_a:
|
||||||
; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
|
; SI: buffer_load_dword v[[B_V2_F16:[0-9]+]]
|
||||||
; GCN: buffer_load_dword v[[C_V2_F16:[0-9]+]]
|
; SI: buffer_load_dword v[[C_V2_F16:[0-9]+]]
|
||||||
|
|
||||||
|
; VI: buffer_load_dword v[[C_V2_F16:[0-9]+]]
|
||||||
|
; VI: buffer_load_dword v[[B_V2_F16:[0-9]+]]
|
||||||
|
|
||||||
|
|
||||||
; SI: v_mov_b32_e32 v[[A_F32:[0-9]+]], 0x40400000{{$}}
|
; SI: v_mov_b32_e32 v[[A_F32:[0-9]+]], 0x40400000{{$}}
|
||||||
; VI: v_mov_b32_e32 v[[A_F16:[0-9]+]], 0x4200{{$}}
|
; VI: v_mov_b32_e32 v[[A_F16:[0-9]+]], 0x4200{{$}}
|
||||||
@ -185,8 +189,8 @@ define amdgpu_kernel void @fma_v2f16_imm_a(
|
|||||||
; SI: buffer_load_dword v[[A_V2_F16:[0-9]+]]
|
; SI: buffer_load_dword v[[A_V2_F16:[0-9]+]]
|
||||||
; SI: buffer_load_dword v[[C_V2_F16:[0-9]+]]
|
; SI: buffer_load_dword v[[C_V2_F16:[0-9]+]]
|
||||||
|
|
||||||
; VI: buffer_load_dword v[[C_V2_F16:[0-9]+]]
|
|
||||||
; VI: buffer_load_dword v[[A_V2_F16:[0-9]+]]
|
; VI: buffer_load_dword v[[A_V2_F16:[0-9]+]]
|
||||||
|
; VI: buffer_load_dword v[[C_V2_F16:[0-9]+]]
|
||||||
|
|
||||||
; SI: v_mov_b32_e32 v[[B_F32:[0-9]+]], 0x40400000{{$}}
|
; SI: v_mov_b32_e32 v[[B_F32:[0-9]+]], 0x40400000{{$}}
|
||||||
; VI: v_mov_b32_e32 v[[B_F16:[0-9]+]], 0x4200{{$}}
|
; VI: v_mov_b32_e32 v[[B_F16:[0-9]+]], 0x4200{{$}}
|
||||||
@ -228,8 +232,8 @@ define amdgpu_kernel void @fma_v2f16_imm_b(
|
|||||||
; SI: buffer_load_dword v[[A_V2_F16:[0-9]+]]
|
; SI: buffer_load_dword v[[A_V2_F16:[0-9]+]]
|
||||||
; SI: buffer_load_dword v[[B_V2_F16:[0-9]+]]
|
; SI: buffer_load_dword v[[B_V2_F16:[0-9]+]]
|
||||||
|
|
||||||
; VI: buffer_load_dword v[[B_V2_F16:[0-9]+]]
|
|
||||||
; VI: buffer_load_dword v[[A_V2_F16:[0-9]+]]
|
; VI: buffer_load_dword v[[A_V2_F16:[0-9]+]]
|
||||||
|
; VI: buffer_load_dword v[[B_V2_F16:[0-9]+]]
|
||||||
|
|
||||||
; SI: v_mov_b32_e32 v[[C_F32:[0-9]+]], 0x40400000{{$}}
|
; SI: v_mov_b32_e32 v[[C_F32:[0-9]+]], 0x40400000{{$}}
|
||||||
; VI: v_mov_b32_e32 v[[C_F16:[0-9]+]], 0x4200{{$}}
|
; VI: v_mov_b32_e32 v[[C_F16:[0-9]+]], 0x4200{{$}}
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
|
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
|
||||||
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=CIVI %s
|
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,CIVI %s
|
||||||
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=CIVI %s
|
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI,CIVI %s
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}s_lshr_v2i16:
|
; GCN-LABEL: {{^}}s_lshr_v2i16:
|
||||||
; GFX9: s_load_dword [[LHS:s[0-9]+]]
|
; GFX9: s_load_dword [[LHS:s[0-9]+]]
|
||||||
@ -8,11 +8,20 @@
|
|||||||
; GFX9: v_mov_b32_e32 [[VLHS:v[0-9]+]], [[LHS]]
|
; GFX9: v_mov_b32_e32 [[VLHS:v[0-9]+]], [[LHS]]
|
||||||
; GFX9: v_pk_lshrrev_b16 [[RESULT:v[0-9]+]], [[RHS]], [[VLHS]]
|
; GFX9: v_pk_lshrrev_b16 [[RESULT:v[0-9]+]], [[RHS]], [[VLHS]]
|
||||||
|
|
||||||
; VI-DAG: v_lshrrev_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
|
||||||
|
; VI: s_load_dword [[LHS:s[0-9]+]]
|
||||||
|
; VI: s_load_dword [[RHS:s[0-9]+]]
|
||||||
|
; VI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
|
||||||
|
; VI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
|
||||||
|
; VI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
|
||||||
|
; VI-DAG: v_bfe_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, 16
|
||||||
|
; VI-DAG: s_lshl_b32
|
||||||
|
; VI: v_or_b32_e32
|
||||||
|
|
||||||
; CI: v_lshrrev_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
; CI: v_lshrrev_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||||
; CI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
|
; CI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
|
||||||
; CIVI-DAG: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 16
|
; CI-DAG: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 16
|
||||||
; CIVI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
; CI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||||
define amdgpu_kernel void @s_lshr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 {
|
define amdgpu_kernel void @s_lshr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 {
|
||||||
%result = lshr <2 x i16> %lhs, %rhs
|
%result = lshr <2 x i16> %lhs, %rhs
|
||||||
store <2 x i16> %result, <2 x i16> addrspace(1)* %out
|
store <2 x i16> %result, <2 x i16> addrspace(1)* %out
|
||||||
|
@ -117,8 +117,10 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(<4 x i8> addrspace(1)* %out, <4
|
|||||||
; SI: v_min_i32
|
; SI: v_min_i32
|
||||||
; SI: v_min_i32
|
; SI: v_min_i32
|
||||||
|
|
||||||
; VI: v_min_i32
|
; VI: s_sext_i32_i16
|
||||||
; VI: v_min_i32
|
; VI: s_sext_i32_i16
|
||||||
|
; VI: s_min_i32
|
||||||
|
; VI: s_min_i32
|
||||||
|
|
||||||
; GFX9: v_pk_min_i16
|
; GFX9: v_pk_min_i16
|
||||||
|
|
||||||
@ -131,17 +133,16 @@ define amdgpu_kernel void @s_test_imin_sle_v2i16(<2 x i16> addrspace(1)* %out, <
|
|||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
; FIXME: VI use s_min_i32
|
|
||||||
; FUNC-LABEL: {{^}}s_test_imin_sle_v4i16:
|
; FUNC-LABEL: {{^}}s_test_imin_sle_v4i16:
|
||||||
; SI: v_min_i32
|
; SI: v_min_i32
|
||||||
; SI: v_min_i32
|
; SI: v_min_i32
|
||||||
; SI: v_min_i32
|
; SI: v_min_i32
|
||||||
; SI: v_min_i32
|
; SI: v_min_i32
|
||||||
|
|
||||||
; VI: v_min_i32
|
; VI: s_min_i32
|
||||||
; VI: v_min_i32
|
; VI: s_min_i32
|
||||||
; VI: v_min_i32
|
; VI: s_min_i32
|
||||||
; VI: v_min_i32
|
; VI: s_min_i32
|
||||||
|
|
||||||
; GFX9: v_pk_min_i16
|
; GFX9: v_pk_min_i16
|
||||||
; GFX9: v_pk_min_i16
|
; GFX9: v_pk_min_i16
|
||||||
@ -461,14 +462,14 @@ define amdgpu_kernel void @s_test_umin_ult_v8i32(<8 x i32> addrspace(1)* %out, <
|
|||||||
; SI: v_min_u32
|
; SI: v_min_u32
|
||||||
; SI: v_min_u32
|
; SI: v_min_u32
|
||||||
|
|
||||||
; VI: v_min_u32
|
; VI: s_min_u32
|
||||||
; VI: v_min_u32
|
; VI: s_min_u32
|
||||||
; VI: v_min_u32
|
; VI: s_min_u32
|
||||||
; VI: v_min_u32
|
; VI: s_min_u32
|
||||||
; VI: v_min_u32
|
; VI: s_min_u32
|
||||||
; VI: v_min_u32
|
; VI: s_min_u32
|
||||||
; VI: v_min_u32
|
; VI: s_min_u32
|
||||||
; VI: v_min_u32
|
; VI: s_min_u32
|
||||||
|
|
||||||
; EG: MIN_UINT
|
; EG: MIN_UINT
|
||||||
; EG: MIN_UINT
|
; EG: MIN_UINT
|
||||||
|
@ -5,7 +5,7 @@
|
|||||||
; GFX9: v_pk_add_f16 [[ADD:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
|
; GFX9: v_pk_add_f16 [[ADD:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
|
||||||
; GFX9-NEXT: v_add_f16_sdwa v{{[0-9]+}}, [[ADD]], [[ADD]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
; GFX9-NEXT: v_add_f16_sdwa v{{[0-9]+}}, [[ADD]], [[ADD]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
||||||
|
|
||||||
; VI: v_add_f16_e32
|
; VI: v_add_f16_sdwa
|
||||||
; VI-NEXT: v_add_f16_e32
|
; VI-NEXT: v_add_f16_e32
|
||||||
; VI-NEXT: v_add_f16_e32
|
; VI-NEXT: v_add_f16_e32
|
||||||
define half @reduction_half4(<4 x half> %vec4) {
|
define half @reduction_half4(<4 x half> %vec4) {
|
||||||
@ -22,7 +22,7 @@ entry:
|
|||||||
; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
|
; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
|
||||||
; GFX9-NEXT: v_add_u16_sdwa v{{[0-9]+}}, [[ADD]], [[ADD]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
; GFX9-NEXT: v_add_u16_sdwa v{{[0-9]+}}, [[ADD]], [[ADD]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
||||||
|
|
||||||
; VI: v_add_u16_e32
|
; VI: v_add_u16_sdwa
|
||||||
; VI-NEXT: v_add_u16_e32
|
; VI-NEXT: v_add_u16_e32
|
||||||
; VI-NEXT: v_add_u16_e32
|
; VI-NEXT: v_add_u16_e32
|
||||||
define i16 @reduction_v4i16(<4 x i16> %vec4) {
|
define i16 @reduction_v4i16(<4 x i16> %vec4) {
|
||||||
@ -41,8 +41,8 @@ entry:
|
|||||||
; GFX9-NEXT: v_pk_add_f16 [[ADD3:v[0-9]+]], [[ADD2]], [[ADD1]]{{$}}
|
; GFX9-NEXT: v_pk_add_f16 [[ADD3:v[0-9]+]], [[ADD2]], [[ADD1]]{{$}}
|
||||||
; GFX9-NEXT: v_add_f16_sdwa v{{[0-9]+}}, [[ADD3]], [[ADD3]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
; GFX9-NEXT: v_add_f16_sdwa v{{[0-9]+}}, [[ADD3]], [[ADD3]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
||||||
|
|
||||||
; VI: v_add_f16_e32
|
; VI: v_add_f16_sdwa
|
||||||
; VI-NEXT: v_add_f16_e32
|
; VI-NEXT: v_add_f16_sdwa
|
||||||
; VI-NEXT: v_add_f16_e32
|
; VI-NEXT: v_add_f16_e32
|
||||||
; VI-NEXT: v_add_f16_e32
|
; VI-NEXT: v_add_f16_e32
|
||||||
; VI-NEXT: v_add_f16_e32
|
; VI-NEXT: v_add_f16_e32
|
||||||
@ -67,8 +67,8 @@ entry:
|
|||||||
; GFX9-NEXT: v_pk_add_u16 [[ADD3]], [[ADD2]], [[ADD1]]{{$}}
|
; GFX9-NEXT: v_pk_add_u16 [[ADD3]], [[ADD2]], [[ADD1]]{{$}}
|
||||||
; GFX9-NEXT: v_add_u16_sdwa v{{[0-9]+}}, [[ADD3]], [[ADD3]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
; GFX9-NEXT: v_add_u16_sdwa v{{[0-9]+}}, [[ADD3]], [[ADD3]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
||||||
|
|
||||||
; VI: v_add_u16_e32
|
; VI: v_add_u16_sdwa
|
||||||
; VI-NEXT: v_add_u16_e32
|
; VI-NEXT: v_add_u16_sdwa
|
||||||
; VI-NEXT: v_add_u16_e32
|
; VI-NEXT: v_add_u16_e32
|
||||||
; VI-NEXT: v_add_u16_e32
|
; VI-NEXT: v_add_u16_e32
|
||||||
; VI-NEXT: v_add_u16_e32
|
; VI-NEXT: v_add_u16_e32
|
||||||
@ -97,10 +97,10 @@ entry:
|
|||||||
; GFX9-NEXT: v_pk_add_f16 [[ADD3]], [[ADD2]], [[ADD1]]{{$}}
|
; GFX9-NEXT: v_pk_add_f16 [[ADD3]], [[ADD2]], [[ADD1]]{{$}}
|
||||||
; GFX9-NEXT: v_add_f16_sdwa v{{[0-9]+}}, [[ADD3]], [[ADD3]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
; GFX9-NEXT: v_add_f16_sdwa v{{[0-9]+}}, [[ADD3]], [[ADD3]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
||||||
|
|
||||||
; VI: v_add_f16_e32
|
; VI: v_add_f16_sdwa
|
||||||
; VI-NEXT: v_add_f16_e32
|
; VI-NEXT: v_add_f16_sdwa
|
||||||
; VI-NEXT: v_add_f16_e32
|
; VI-NEXT: v_add_f16_sdwa
|
||||||
; VI-NEXT: v_add_f16_e32
|
; VI-NEXT: v_add_f16_sdwa
|
||||||
; VI-NEXT: v_add_f16_e32
|
; VI-NEXT: v_add_f16_e32
|
||||||
; VI-NEXT: v_add_f16_e32
|
; VI-NEXT: v_add_f16_e32
|
||||||
; VI-NEXT: v_add_f16_e32
|
; VI-NEXT: v_add_f16_e32
|
||||||
@ -131,7 +131,7 @@ entry:
|
|||||||
; GFX9: v_pk_min_u16 [[MIN:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
|
; GFX9: v_pk_min_u16 [[MIN:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
|
||||||
; GFX9-NEXT: v_min_u16_sdwa v{{[0-9]+}}, [[MIN]], [[MIN]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
; GFX9-NEXT: v_min_u16_sdwa v{{[0-9]+}}, [[MIN]], [[MIN]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
||||||
|
|
||||||
; VI: v_min_u16_e32
|
; VI: v_min_u16_sdwa
|
||||||
; VI-NEXT: v_min_u16_e32
|
; VI-NEXT: v_min_u16_e32
|
||||||
; VI-NEXT: v_min_u16_e32
|
; VI-NEXT: v_min_u16_e32
|
||||||
define i16 @reduction_min_v4i16(<4 x i16> %vec4) {
|
define i16 @reduction_min_v4i16(<4 x i16> %vec4) {
|
||||||
@ -152,8 +152,8 @@ entry:
|
|||||||
; GFX9-NEXT: v_pk_min_u16 [[MIN3:v[0-9]+]], [[MIN2]], [[MIN1]]{{$}}
|
; GFX9-NEXT: v_pk_min_u16 [[MIN3:v[0-9]+]], [[MIN2]], [[MIN1]]{{$}}
|
||||||
; GFX9-NEXT: v_min_u16_sdwa v{{[0-9]+}}, [[MIN3]], [[MIN3]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
; GFX9-NEXT: v_min_u16_sdwa v{{[0-9]+}}, [[MIN3]], [[MIN3]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
||||||
|
|
||||||
; VI: v_min_u16_e32
|
; VI: v_min_u16_sdwa
|
||||||
; VI-NEXT: v_min_u16_e32
|
; VI-NEXT: v_min_u16_sdwa
|
||||||
; VI-NEXT: v_min_u16_e32
|
; VI-NEXT: v_min_u16_e32
|
||||||
; VI-NEXT: v_min_u16_e32
|
; VI-NEXT: v_min_u16_e32
|
||||||
; VI-NEXT: v_min_u16_e32
|
; VI-NEXT: v_min_u16_e32
|
||||||
@ -224,10 +224,10 @@ entry:
|
|||||||
; GFX9-NEXT: v_pk_min_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
|
; GFX9-NEXT: v_pk_min_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
|
||||||
; GFX9-NEXT: v_min_i16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
; GFX9-NEXT: v_min_i16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
||||||
|
|
||||||
; VI: v_min_i16_e32
|
; VI: v_min_i16_sdwa
|
||||||
; VI-NEXT: v_min_i16_e32
|
; VI-NEXT: v_min_i16_sdwa
|
||||||
; VI-NEXT: v_min_i16_e32
|
; VI-NEXT: v_min_i16_sdwa
|
||||||
; VI-NEXT: v_min_i16_e32
|
; VI-NEXT: v_min_i16_sdwa
|
||||||
; VI-NEXT: v_min_i16_e32
|
; VI-NEXT: v_min_i16_e32
|
||||||
; VI-NEXT: v_min_i16_e32
|
; VI-NEXT: v_min_i16_e32
|
||||||
; VI-NEXT: v_min_i16_e32
|
; VI-NEXT: v_min_i16_e32
|
||||||
@ -339,7 +339,7 @@ entry:
|
|||||||
; GFX9: v_pk_max_u16 [[MAX:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
|
; GFX9: v_pk_max_u16 [[MAX:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
|
||||||
; GFX9-NEXT: v_max_u16_sdwa v{{[0-9]+}}, [[MAX]], [[MAX]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
; GFX9-NEXT: v_max_u16_sdwa v{{[0-9]+}}, [[MAX]], [[MAX]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
||||||
|
|
||||||
; VI: v_max_u16_e32
|
; VI: v_max_u16_sdwa
|
||||||
; VI-NEXT: v_max_u16_e32
|
; VI-NEXT: v_max_u16_e32
|
||||||
; VI-NEXT: v_max_u16_e32
|
; VI-NEXT: v_max_u16_e32
|
||||||
define i16 @reduction_umax_v4i16(<4 x i16> %vec4) {
|
define i16 @reduction_umax_v4i16(<4 x i16> %vec4) {
|
||||||
@ -358,7 +358,7 @@ entry:
|
|||||||
; GFX9: v_pk_max_i16 [[MAX:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
|
; GFX9: v_pk_max_i16 [[MAX:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
|
||||||
; GFX9-NEXT: v_max_i16_sdwa v{{[0-9]+}}, [[MAX]], [[MAX]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
; GFX9-NEXT: v_max_i16_sdwa v{{[0-9]+}}, [[MAX]], [[MAX]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
||||||
|
|
||||||
; VI: v_max_i16_e32
|
; VI: v_max_i16_sdwa
|
||||||
; VI-NEXT: v_max_i16_e32
|
; VI-NEXT: v_max_i16_e32
|
||||||
; VI-NEXT: v_max_i16_e32
|
; VI-NEXT: v_max_i16_e32
|
||||||
define i16 @reduction_smax_v4i16(<4 x i16> %vec4) #0 {
|
define i16 @reduction_smax_v4i16(<4 x i16> %vec4) #0 {
|
||||||
@ -377,7 +377,7 @@ entry:
|
|||||||
; GFX9: v_pk_max_f16 [[MAX:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
|
; GFX9: v_pk_max_f16 [[MAX:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
|
||||||
; GFX9-NEXT: v_max_f16_sdwa v{{[0-9]+}}, [[MAX]], [[MAX]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
; GFX9-NEXT: v_max_f16_sdwa v{{[0-9]+}}, [[MAX]], [[MAX]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
||||||
|
|
||||||
; VI: v_max_f16_e32
|
; VI: v_max_f16_sdwa
|
||||||
; VI-NEXT: v_max_f16_e32
|
; VI-NEXT: v_max_f16_e32
|
||||||
; VI-NEXT: v_max_f16_e32
|
; VI-NEXT: v_max_f16_e32
|
||||||
define half @reduction_fmax_v4half(<4 x half> %vec4) {
|
define half @reduction_fmax_v4half(<4 x half> %vec4) {
|
||||||
@ -396,7 +396,7 @@ entry:
|
|||||||
; GFX9: v_pk_min_f16 [[MIN:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
|
; GFX9: v_pk_min_f16 [[MIN:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
|
||||||
; GFX9-NEXT: v_min_f16_sdwa v{{[0-9]+}}, [[MIN]], [[MIN]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
; GFX9-NEXT: v_min_f16_sdwa v{{[0-9]+}}, [[MIN]], [[MIN]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
||||||
|
|
||||||
; VI: v_min_f16_e32
|
; VI: v_min_f16_sdwa
|
||||||
; VI-NEXT: v_min_f16_e32
|
; VI-NEXT: v_min_f16_e32
|
||||||
; VI-NEXT: v_min_f16_e32
|
; VI-NEXT: v_min_f16_e32
|
||||||
define half @reduction_fmin_v4half(<4 x half> %vec4) {
|
define half @reduction_fmin_v4half(<4 x half> %vec4) {
|
||||||
@ -409,4 +409,4 @@ entry:
|
|||||||
%rdx.minmax.select3 = select <4 x i1> %rdx.minmax.cmp2, <4 x half> %rdx.minmax.select, <4 x half> %rdx.shuf1
|
%rdx.minmax.select3 = select <4 x i1> %rdx.minmax.cmp2, <4 x half> %rdx.minmax.select, <4 x half> %rdx.shuf1
|
||||||
%res = extractelement <4 x half> %rdx.minmax.select3, i32 0
|
%res = extractelement <4 x half> %rdx.minmax.select3, i32 0
|
||||||
ret half %res
|
ret half %res
|
||||||
}
|
}
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole=0 -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=NOSDWA -check-prefix=GCN %s
|
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole=0 -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=NOSDWA,GCN %s
|
||||||
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=SDWA -check-prefix=GCN %s
|
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI,SDWA,GCN %s
|
||||||
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -amdgpu-sdwa-peephole -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=SDWA -check-prefix=GCN %s
|
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -amdgpu-sdwa-peephole -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,SDWA,GCN %s
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}add_shr_i32:
|
; GCN-LABEL: {{^}}add_shr_i32:
|
||||||
; NOSDWA: v_lshrrev_b32_e32 v[[DST:[0-9]+]], 16, v{{[0-9]+}}
|
; NOSDWA: v_lshrrev_b32_e32 v[[DST:[0-9]+]], 16, v{{[0-9]+}}
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
|
; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -march=amdgcn -mcpu=tahiti < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
|
||||||
; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -march=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
|
; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -march=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GFX89 %s
|
||||||
; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
|
; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX89 %s
|
||||||
|
|
||||||
; Test expansion of scalar selects on vectors.
|
; Test expansion of scalar selects on vectors.
|
||||||
; Evergreen not enabled since it seems to be having problems with doubles.
|
; Evergreen not enabled since it seems to be having problems with doubles.
|
||||||
@ -76,8 +76,14 @@ define amdgpu_kernel void @select_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a,
|
|||||||
}
|
}
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}select_v2i16:
|
; GCN-LABEL: {{^}}select_v2i16:
|
||||||
; GCN: v_cndmask_b32_e32
|
; GFX89: s_load_dword
|
||||||
; GCN-NOT: v_cndmask_b32
|
; GFX89: s_load_dword
|
||||||
|
; GFX89: s_load_dword
|
||||||
|
; GFX89: v_cndmask_b32
|
||||||
|
; GFX89-NOT: v_cndmask_b32
|
||||||
|
|
||||||
|
; SI: v_cndmask_b32_e32
|
||||||
|
; SI-NOT: v_cndmask_b32
|
||||||
define amdgpu_kernel void @select_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b, i32 %c) #0 {
|
define amdgpu_kernel void @select_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b, i32 %c) #0 {
|
||||||
%cmp = icmp eq i32 %c, 0
|
%cmp = icmp eq i32 %c, 0
|
||||||
%select = select i1 %cmp, <2 x i16> %a, <2 x i16> %b
|
%select = select i1 %cmp, <2 x i16> %a, <2 x i16> %b
|
||||||
@ -86,7 +92,9 @@ define amdgpu_kernel void @select_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16>
|
|||||||
}
|
}
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}v_select_v2i16:
|
; GCN-LABEL: {{^}}v_select_v2i16:
|
||||||
; GCN: v_cndmask_b32_e32
|
; GCN: buffer_load_dword v
|
||||||
|
; GCN: buffer_load_dword v
|
||||||
|
; GCN: v_cndmask_b32
|
||||||
; GCN-NOT: cndmask
|
; GCN-NOT: cndmask
|
||||||
define amdgpu_kernel void @v_select_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %a.ptr, <2 x i16> addrspace(1)* %b.ptr, i32 %c) #0 {
|
define amdgpu_kernel void @v_select_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %a.ptr, <2 x i16> addrspace(1)* %b.ptr, i32 %c) #0 {
|
||||||
%a = load <2 x i16>, <2 x i16> addrspace(1)* %a.ptr
|
%a = load <2 x i16>, <2 x i16> addrspace(1)* %a.ptr
|
||||||
@ -330,7 +338,7 @@ define amdgpu_kernel void @select_v8f64(<8 x double> addrspace(1)* %out, <8 x do
|
|||||||
}
|
}
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}v_select_v2f16:
|
; GCN-LABEL: {{^}}v_select_v2f16:
|
||||||
; GCN: v_cndmask_b32_e32
|
; GCN: v_cndmask_b32
|
||||||
; GCN-NOT: cndmask
|
; GCN-NOT: cndmask
|
||||||
define amdgpu_kernel void @v_select_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %a.ptr, <2 x half> addrspace(1)* %b.ptr, i32 %c) #0 {
|
define amdgpu_kernel void @v_select_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %a.ptr, <2 x half> addrspace(1)* %b.ptr, i32 %c) #0 {
|
||||||
%a = load <2 x half>, <2 x half> addrspace(1)* %a.ptr
|
%a = load <2 x half>, <2 x half> addrspace(1)* %a.ptr
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
|
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
|
||||||
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=CIVI %s
|
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,CIVI %s
|
||||||
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=CIVI %s
|
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI,CIVI %s
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}s_shl_v2i16:
|
; GCN-LABEL: {{^}}s_shl_v2i16:
|
||||||
; GFX9: s_load_dword [[LHS:s[0-9]+]]
|
; GFX9: s_load_dword [[LHS:s[0-9]+]]
|
||||||
@ -8,9 +8,14 @@
|
|||||||
; GFX9: v_mov_b32_e32 [[VLHS:v[0-9]+]], [[LHS]]
|
; GFX9: v_mov_b32_e32 [[VLHS:v[0-9]+]], [[LHS]]
|
||||||
; GFX9: v_pk_lshlrev_b16 [[RESULT:v[0-9]+]], [[RHS]], [[VLHS]]
|
; GFX9: v_pk_lshlrev_b16 [[RESULT:v[0-9]+]], [[RHS]], [[VLHS]]
|
||||||
|
|
||||||
; VI: v_lshlrev_b32_e32
|
; VI: s_load_dword s
|
||||||
; VI: v_lshlrev_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
; VI: s_load_dword s
|
||||||
; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
; VI: s_lshr_b32
|
||||||
|
; VI: s_lshr_b32
|
||||||
|
; VI: s_and_b32
|
||||||
|
; VI: s_and_b32
|
||||||
|
; SI: s_and_b32
|
||||||
|
; SI: s_or_b32
|
||||||
|
|
||||||
; CI-DAG: v_lshlrev_b32_e32
|
; CI-DAG: v_lshlrev_b32_e32
|
||||||
; CI-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}}
|
; CI-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}}
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI %s
|
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
|
||||||
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s
|
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}s_sext_i1_to_i32:
|
; GCN-LABEL: {{^}}s_sext_i1_to_i32:
|
||||||
; GCN: v_cndmask_b32_e64
|
; GCN: v_cndmask_b32_e64
|
||||||
@ -177,10 +177,15 @@ define amdgpu_kernel void @v_sext_v4i8_to_v4i32(i32 addrspace(1)* %out, i32 addr
|
|||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
; FIXME: s_bfe_i64
|
; FIXME: s_bfe_i64, same on SI and VI
|
||||||
; GCN-LABEL: {{^}}s_sext_v4i16_to_v4i32:
|
; GCN-LABEL: {{^}}s_sext_v4i16_to_v4i32:
|
||||||
; GCN-DAG: s_ashr_i64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 48
|
; SI-DAG: s_ashr_i64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 48
|
||||||
; GCN-DAG: s_ashr_i32 s{{[0-9]+}}, s{{[0-9]+}}, 16
|
; SI-DAG: s_ashr_i32 s{{[0-9]+}}, s{{[0-9]+}}, 16
|
||||||
|
|
||||||
|
; VI: s_ashr_i32 s{{[0-9]+}}, s{{[0-9]+}}, 16
|
||||||
|
; VI: s_ashr_i32 s{{[0-9]+}}, s{{[0-9]+}}, 16
|
||||||
|
|
||||||
|
|
||||||
; GCN-DAG: s_sext_i32_i16
|
; GCN-DAG: s_sext_i32_i16
|
||||||
; GCN-DAG: s_sext_i32_i16
|
; GCN-DAG: s_sext_i32_i16
|
||||||
; GCN: s_endpgm
|
; GCN: s_endpgm
|
||||||
@ -199,8 +204,6 @@ define amdgpu_kernel void @s_sext_v4i16_to_v4i32(i32 addrspace(1)* %out, i64 %a)
|
|||||||
}
|
}
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}v_sext_v4i16_to_v4i32:
|
; GCN-LABEL: {{^}}v_sext_v4i16_to_v4i32:
|
||||||
; SI-DAG: v_ashr_i64 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, 48
|
|
||||||
; VI-DAG: v_ashrrev_i64 v{{\[[0-9]+:[0-9]+\]}}, 48, v{{\[[0-9]+:[0-9]+\]}}
|
|
||||||
; GCN-DAG: v_ashrrev_i32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
|
; GCN-DAG: v_ashrrev_i32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
|
||||||
; GCN-DAG: v_ashrrev_i32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
|
; GCN-DAG: v_ashrrev_i32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
|
||||||
; GCN-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
|
; GCN-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s
|
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GCN %s
|
||||||
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=CIVI -check-prefix=GCN %s
|
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI,CIVI,GCN %s
|
||||||
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=CIVI -check-prefix=GCN %s
|
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CI,CIVI,GCN %s
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}s_abs_v2i16:
|
; GCN-LABEL: {{^}}s_abs_v2i16:
|
||||||
; GFX9: s_load_dword [[VAL:s[0-9]+]]
|
; GFX9: s_load_dword [[VAL:s[0-9]+]]
|
||||||
@ -8,13 +8,15 @@
|
|||||||
; GFX9: v_pk_max_i16 [[MAX:v[0-9]+]], [[VAL]], [[SUB]]
|
; GFX9: v_pk_max_i16 [[MAX:v[0-9]+]], [[VAL]], [[SUB]]
|
||||||
; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]], [[MAX]], 2
|
; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]], [[MAX]], 2
|
||||||
|
|
||||||
; VI: v_sub_u32_e32
|
; VI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
|
||||||
; VI-DAG: v_sub_u32_e32
|
; VI: s_sub_i32
|
||||||
; VI: v_max_i32_sdwa v{{[0-9]+}}, sext(v{{[0-9]+}}), sext(v{{[0-9]+}}) dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
|
; VI: s_sub_i32
|
||||||
; VI: v_max_i32_sdwa v{{[0-9]+}}, sext(v{{[0-9]+}}), sext(v{{[0-9]+}}) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
|
; VI: s_max_i32
|
||||||
; VI: v_add_u32_e32
|
; VI: s_max_i32
|
||||||
; VI: v_add_u32_e32
|
; SI: s_add_i32
|
||||||
; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
; SI: s_add_i32
|
||||||
|
; SI: s_and_b32
|
||||||
|
; SI: s_or_b32
|
||||||
|
|
||||||
; CI: v_sub_i32_e32
|
; CI: v_sub_i32_e32
|
||||||
; CI-DAG: v_sub_i32_e32
|
; CI-DAG: v_sub_i32_e32
|
||||||
|
@ -1,12 +1,15 @@
|
|||||||
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s
|
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX89,GCN %s
|
||||||
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s
|
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI,GFX89,GCN %s
|
||||||
|
|
||||||
; FIXME: Need to handle non-uniform case for function below (load without gep).
|
; FIXME: Need to handle non-uniform case for function below (load without gep).
|
||||||
; GCN-LABEL: {{^}}v_test_sub_v2i16:
|
; GCN-LABEL: {{^}}v_test_sub_v2i16:
|
||||||
|
; GFX89: {{flat|global}}_load_dword
|
||||||
|
; GFX89: {{flat|global}}_load_dword
|
||||||
|
|
||||||
; GFX9: v_pk_sub_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
; GFX9: v_pk_sub_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||||
|
|
||||||
; VI: v_sub_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
; VI-DAG: v_sub_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||||
; VI: v_sub_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
; VI-DAG: v_sub_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||||
define amdgpu_kernel void @v_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
|
define amdgpu_kernel void @v_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
|
||||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||||
%gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
|
%gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
|
||||||
@ -47,10 +50,15 @@ define amdgpu_kernel void @s_test_sub_self_v2i16(<2 x i16> addrspace(1)* %out, <
|
|||||||
|
|
||||||
; FIXME: VI should not scalarize arg access.
|
; FIXME: VI should not scalarize arg access.
|
||||||
; GCN-LABEL: {{^}}s_test_sub_v2i16_kernarg:
|
; GCN-LABEL: {{^}}s_test_sub_v2i16_kernarg:
|
||||||
|
; GCN: s_load_dword s
|
||||||
|
; GCN: s_load_dword s
|
||||||
|
|
||||||
; GFX9: v_pk_sub_i16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
|
; GFX9: v_pk_sub_i16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
|
||||||
|
|
||||||
; VI: v_subrev_u32_e32
|
; VI: s_sub_i32
|
||||||
; VI: v_subrev_u32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
; VI: s_sub_i32
|
||||||
|
; VI: s_lshl_b32
|
||||||
|
; VI: s_and_b32
|
||||||
define amdgpu_kernel void @s_test_sub_v2i16_kernarg(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #1 {
|
define amdgpu_kernel void @s_test_sub_v2i16_kernarg(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #1 {
|
||||||
%add = sub <2 x i16> %a, %b
|
%add = sub <2 x i16> %a, %b
|
||||||
store <2 x i16> %add, <2 x i16> addrspace(1)* %out
|
store <2 x i16> %add, <2 x i16> addrspace(1)* %out
|
||||||
@ -58,12 +66,15 @@ define amdgpu_kernel void @s_test_sub_v2i16_kernarg(<2 x i16> addrspace(1)* %out
|
|||||||
}
|
}
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}v_test_sub_v2i16_constant:
|
; GCN-LABEL: {{^}}v_test_sub_v2i16_constant:
|
||||||
; GFX9: s_mov_b32 [[CONST:s[0-9]+]], 0x1c8007b{{$}}
|
; GFX89-DAG: {{flat|global}}_load_dword
|
||||||
|
|
||||||
|
; GFX9-DAG: s_mov_b32 [[CONST:s[0-9]+]], 0x1c8007b{{$}}
|
||||||
; GFX9: v_pk_sub_i16 v{{[0-9]+}}, v{{[0-9]+}}, [[CONST]]
|
; GFX9: v_pk_sub_i16 v{{[0-9]+}}, v{{[0-9]+}}, [[CONST]]
|
||||||
|
|
||||||
; VI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0xfffffe38
|
; VI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0xfffffe38
|
||||||
; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[K]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
|
||||||
; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0xffffff85, v{{[0-9]+}}
|
; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0xffffff85, v{{[0-9]+}}
|
||||||
|
; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[K]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
||||||
|
; VI: v_or_b32
|
||||||
define amdgpu_kernel void @v_test_sub_v2i16_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
|
define amdgpu_kernel void @v_test_sub_v2i16_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
|
||||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||||
%gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
|
%gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
|
||||||
@ -95,11 +106,10 @@ define amdgpu_kernel void @v_test_sub_v2i16_neg_constant(<2 x i16> addrspace(1)*
|
|||||||
; GCN-LABEL: {{^}}v_test_sub_v2i16_inline_neg1:
|
; GCN-LABEL: {{^}}v_test_sub_v2i16_inline_neg1:
|
||||||
; GFX9: v_pk_sub_i16 v{{[0-9]+}}, v{{[0-9]+}}, -1 op_sel_hi:[1,0]{{$}}
|
; GFX9: v_pk_sub_i16 v{{[0-9]+}}, v{{[0-9]+}}, -1 op_sel_hi:[1,0]{{$}}
|
||||||
|
|
||||||
; VI: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
|
; VI-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
|
||||||
; VI: flat_load_ushort [[LOAD0:v[0-9]+]]
|
; VI-DAG: flat_load_dword [[LOAD:v[0-9]+]]
|
||||||
; VI: flat_load_ushort [[LOAD1:v[0-9]+]]
|
; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, [[LOAD]], [[ONE]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
||||||
; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, [[LOAD0]], [[ONE]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 1, [[LOAD]]
|
||||||
; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 1, [[LOAD1]]
|
|
||||||
; VI: v_or_b32_e32
|
; VI: v_or_b32_e32
|
||||||
define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
|
define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
|
||||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||||
@ -114,11 +124,10 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(<2 x i16> addrspace(1)*
|
|||||||
; GCN-LABEL: {{^}}v_test_sub_v2i16_inline_lo_zero_hi:
|
; GCN-LABEL: {{^}}v_test_sub_v2i16_inline_lo_zero_hi:
|
||||||
; GFX9: v_pk_sub_i16 v{{[0-9]+}}, v{{[0-9]+}}, 32{{$}}
|
; GFX9: v_pk_sub_i16 v{{[0-9]+}}, v{{[0-9]+}}, 32{{$}}
|
||||||
|
|
||||||
; VI-NOT: v_subrev_i16
|
; VI: flat_load_dword [[LOAD:v[0-9]+]]
|
||||||
; VI: v_add_u16_e32 v{{[0-9]+}}, 0xffffffe0, v{{[0-9]+}}
|
; VI-DAG: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff0000, [[LOAD]]
|
||||||
; VI-NOT: v_subrev_i16
|
; VI-DAG: v_add_u16_e32 [[ADD:v[0-9]+]], 0xffffffe0, [[LOAD]]
|
||||||
; VI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
|
; VI: v_or_b32_e32 v{{[0-9]+}}, [[ADD]], [[AND]]
|
||||||
; VI: v_or_b32_e32
|
|
||||||
define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
|
define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
|
||||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||||
%gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
|
%gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
|
||||||
@ -136,9 +145,10 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(<2 x i16> addrspac
|
|||||||
|
|
||||||
; VI-NOT: v_subrev_i16
|
; VI-NOT: v_subrev_i16
|
||||||
; VI: v_mov_b32_e32 [[K:v[0-9]+]], 0xffffc080
|
; VI: v_mov_b32_e32 [[K:v[0-9]+]], 0xffffc080
|
||||||
; VI: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[K]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
; VI: flat_load_dword
|
||||||
|
; VI: v_add_u16_sdwa [[ADD:v[0-9]+]], v{{[0-9]+}}, [[K]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
||||||
; VI-NOT: v_subrev_i16
|
; VI-NOT: v_subrev_i16
|
||||||
; VI: v_or_b32_e32
|
; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[ADD]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||||
define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
|
define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
|
||||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||||
%gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
|
%gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
|
||||||
@ -159,19 +169,12 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(<2 x i16> addrspace(
|
|||||||
; GFX9-DAG: v_lshrrev_b32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]]
|
; GFX9-DAG: v_lshrrev_b32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]]
|
||||||
; GFX9: buffer_store_dwordx2 v{{\[}}[[ELT0]]:[[ELT1]]{{\]}}
|
; GFX9: buffer_store_dwordx2 v{{\[}}[[ELT0]]:[[ELT1]]{{\]}}
|
||||||
|
|
||||||
; VI: flat_load_ushort v[[A_LO:[0-9]+]]
|
; VI: flat_load_dword v[[A:[0-9]+]]
|
||||||
; VI: flat_load_ushort v[[A_HI:[0-9]+]]
|
; VI: flat_load_dword v[[B:[0-9]+]]
|
||||||
|
|
||||||
; VI: flat_load_ushort v[[B_LO:[0-9]+]]
|
; VI: v_sub_u16_e32 v[[ADD_LO:[0-9]+]], v[[A]], v[[B]]
|
||||||
; VI: flat_load_ushort v[[B_HI:[0-9]+]]
|
; VI-NEXT: v_sub_u16_sdwa v[[ADD_HI:[0-9]+]], v[[A]], v[[B]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||||
|
; VI-NEXT: buffer_store_dwordx2 v{{\[}}[[ADD_LO]]:[[ADD_HI]]{{\]}}
|
||||||
; VI: v_sub_u16_e32 v[[ADD_HI:[0-9]+]], v[[A_HI]], v[[B_HI]]
|
|
||||||
; VI-NOT: and
|
|
||||||
; VI-NOT: shl
|
|
||||||
; VI: v_sub_u16_e32 v[[ADD_LO:[0-9]+]], v[[A_LO]], v[[B_LO]]
|
|
||||||
; VI-NOT: and
|
|
||||||
; VI-NOT: shl
|
|
||||||
; VI: buffer_store_dwordx2 v{{\[}}[[ADD_LO]]:[[ADD_HI]]{{\]}}
|
|
||||||
define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
|
define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
|
||||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||||
%gep.out = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %out, i32 %tid
|
%gep.out = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %out, i32 %tid
|
||||||
@ -196,14 +199,10 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i32(<2 x i32> addrspace(1)
|
|||||||
; GFX9-DAG: v_lshrrev_b32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]]
|
; GFX9-DAG: v_lshrrev_b32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]]
|
||||||
; GFX9: buffer_store_dwordx4
|
; GFX9: buffer_store_dwordx4
|
||||||
|
|
||||||
; VI: flat_load_ushort v[[A_LO:[0-9]+]]
|
; VI: flat_load_dword [[A:v[0-9]+]]
|
||||||
; VI: flat_load_ushort v[[A_HI:[0-9]+]]
|
; VI: flat_load_dword [[B:v[0-9]+]]
|
||||||
; VI: flat_load_ushort v[[B_LO:[0-9]+]]
|
; VI: v_sub_u16_e32 v[[ADD_LO:[0-9]+]], [[A]], [[B]]
|
||||||
; VI: flat_load_ushort v[[B_HI:[0-9]+]]
|
; VI: v_sub_u16_sdwa v[[ADD_HI:[0-9]+]], [[A]], [[B]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||||
|
|
||||||
; VI: v_sub_u16_e32
|
|
||||||
; VI: v_sub_u16_e32
|
|
||||||
|
|
||||||
; VI: buffer_store_dwordx4
|
; VI: buffer_store_dwordx4
|
||||||
define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
|
define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
|
||||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||||
@ -228,8 +227,11 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(<2 x i64> addrspace(1)
|
|||||||
; GFX9-DAG: v_ashrrev_i32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]]
|
; GFX9-DAG: v_ashrrev_i32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]]
|
||||||
; GFX9: buffer_store_dwordx2 v{{\[}}[[ELT0]]:[[ELT1]]{{\]}}
|
; GFX9: buffer_store_dwordx2 v{{\[}}[[ELT0]]:[[ELT1]]{{\]}}
|
||||||
|
|
||||||
; VI: v_sub_u16_e32
|
; VI: flat_load_dword
|
||||||
; VI: v_sub_u16_e32
|
; VI: flat_load_dword
|
||||||
|
; VI-DAG: v_sub_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||||
|
; VI-DAG: v_sub_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||||
|
|
||||||
; VI: buffer_store_dwordx2
|
; VI: buffer_store_dwordx2
|
||||||
define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
|
define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
|
||||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||||
|
@ -1,18 +1,15 @@
|
|||||||
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -slp-vectorizer -dce < %s | FileCheck -check-prefixes=GCN,GFX9 %s
|
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -slp-vectorizer -dce < %s | FileCheck -check-prefixes=GCN,GFX9,GFX89 %s
|
||||||
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -slp-vectorizer -dce < %s | FileCheck -check-prefixes=GCN,VI %s
|
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -slp-vectorizer -dce < %s | FileCheck -check-prefixes=GCN,VI,GFX89 %s
|
||||||
|
|
||||||
; FIXME: Should still like to vectorize the memory operations for VI
|
; FIXME: Should still like to vectorize the memory operations for VI
|
||||||
|
|
||||||
; Simple 3-pair chain with loads and stores
|
; Simple 3-pair chain with loads and stores
|
||||||
; GCN-LABEL: @test1_as_3_3_3_v2f16(
|
; GCN-LABEL: @test1_as_3_3_3_v2f16(
|
||||||
; GFX9: load <2 x half>, <2 x half> addrspace(3)*
|
; GFX89: load <2 x half>, <2 x half> addrspace(3)*
|
||||||
; GFX9: load <2 x half>, <2 x half> addrspace(3)*
|
; GFX89: load <2 x half>, <2 x half> addrspace(3)*
|
||||||
; GFX9: fmul <2 x half>
|
; GFX89: fmul <2 x half>
|
||||||
; GFX9: store <2 x half> %{{.*}}, <2 x half> addrspace(3)* %
|
; GFX89: store <2 x half> %{{.*}}, <2 x half> addrspace(3)* %
|
||||||
; GFX9: ret
|
; GFX89: ret
|
||||||
|
|
||||||
; VI: load half
|
|
||||||
; VI: load half
|
|
||||||
define amdgpu_kernel void @test1_as_3_3_3_v2f16(half addrspace(3)* %a, half addrspace(3)* %b, half addrspace(3)* %c) {
|
define amdgpu_kernel void @test1_as_3_3_3_v2f16(half addrspace(3)* %a, half addrspace(3)* %b, half addrspace(3)* %c) {
|
||||||
%i0 = load half, half addrspace(3)* %a, align 2
|
%i0 = load half, half addrspace(3)* %a, align 2
|
||||||
%i1 = load half, half addrspace(3)* %b, align 2
|
%i1 = load half, half addrspace(3)* %b, align 2
|
||||||
@ -29,14 +26,11 @@ define amdgpu_kernel void @test1_as_3_3_3_v2f16(half addrspace(3)* %a, half addr
|
|||||||
}
|
}
|
||||||
|
|
||||||
; GCN-LABEL: @test1_as_3_0_0(
|
; GCN-LABEL: @test1_as_3_0_0(
|
||||||
; GFX9: load <2 x half>, <2 x half> addrspace(3)*
|
; GFX89: load <2 x half>, <2 x half> addrspace(3)*
|
||||||
; GFX9: load <2 x half>, <2 x half>*
|
; GFX89: load <2 x half>, <2 x half>*
|
||||||
; GFX9: fmul <2 x half>
|
; GFX89: fmul <2 x half>
|
||||||
; GFX9: store <2 x half> %{{.*}}, <2 x half>* %
|
; GFX89: store <2 x half> %{{.*}}, <2 x half>* %
|
||||||
; GFX9: ret
|
; GFX89: ret
|
||||||
|
|
||||||
; VI: load half
|
|
||||||
; VI: load half
|
|
||||||
define amdgpu_kernel void @test1_as_3_0_0(half addrspace(3)* %a, half* %b, half* %c) {
|
define amdgpu_kernel void @test1_as_3_0_0(half addrspace(3)* %a, half* %b, half* %c) {
|
||||||
%i0 = load half, half addrspace(3)* %a, align 2
|
%i0 = load half, half addrspace(3)* %a, align 2
|
||||||
%i1 = load half, half* %b, align 2
|
%i1 = load half, half* %b, align 2
|
||||||
@ -53,14 +47,11 @@ define amdgpu_kernel void @test1_as_3_0_0(half addrspace(3)* %a, half* %b, half*
|
|||||||
}
|
}
|
||||||
|
|
||||||
; GCN-LABEL: @test1_as_0_0_3_v2f16(
|
; GCN-LABEL: @test1_as_0_0_3_v2f16(
|
||||||
; GFX9: load <2 x half>, <2 x half>*
|
; GFX89: load <2 x half>, <2 x half>*
|
||||||
; GFX9: load <2 x half>, <2 x half>*
|
; GFX89: load <2 x half>, <2 x half>*
|
||||||
; GFX9: fmul <2 x half>
|
; GFX89: fmul <2 x half>
|
||||||
; GFX9: store <2 x half> %{{.*}}, <2 x half> addrspace(3)* %
|
; GFX89: store <2 x half> %{{.*}}, <2 x half> addrspace(3)* %
|
||||||
; GFX9: ret
|
; GFX89: ret
|
||||||
|
|
||||||
; VI: load half
|
|
||||||
; VI: load half
|
|
||||||
define amdgpu_kernel void @test1_as_0_0_3_v2f16(half* %a, half* %b, half addrspace(3)* %c) {
|
define amdgpu_kernel void @test1_as_0_0_3_v2f16(half* %a, half* %b, half addrspace(3)* %c) {
|
||||||
%i0 = load half, half* %a, align 2
|
%i0 = load half, half* %a, align 2
|
||||||
%i1 = load half, half* %b, align 2
|
%i1 = load half, half* %b, align 2
|
||||||
|
Loading…
x
Reference in New Issue
Block a user