mirror of
https://github.com/RPCSX/llvm.git
synced 2025-04-13 05:20:27 +00:00
[AMDGPU] Untangle SDWA pass from SIShrinkInstructions
Remove the dependency of the SDWA pass on SIShrinkInstructions. The goal is to move SDWA even higher in the stack to avoid a second run of MachineLICM, MachineCSE and SIFoldOperands. Also added handling to preserve the original src modifiers. Differential Revision: https://reviews.llvm.org/D33860 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@304665 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent
ec193781b1
commit
110a2bc818
@ -734,7 +734,6 @@ void GCNPassConfig::addMachineSSAOptimization() {
|
||||
addPass(&SIFoldOperandsID);
|
||||
addPass(&DeadMachineInstructionElimID);
|
||||
addPass(&SILoadStoreOptimizerID);
|
||||
addPass(createSIShrinkInstructionsPass());
|
||||
if (EnableSDWAPeephole) {
|
||||
addPass(&SIPeepholeSDWAID);
|
||||
addPass(&MachineLICMID);
|
||||
@ -742,6 +741,7 @@ void GCNPassConfig::addMachineSSAOptimization() {
|
||||
addPass(&SIFoldOperandsID);
|
||||
addPass(&DeadMachineInstructionElimID);
|
||||
}
|
||||
addPass(createSIShrinkInstructionsPass());
|
||||
}
|
||||
|
||||
bool GCNPassConfig::addILPOpts() {
|
||||
|
@ -129,7 +129,8 @@ public:
|
||||
bool getNeg() const { return Neg; }
|
||||
bool getSext() const { return Sext; }
|
||||
|
||||
uint64_t getSrcMods() const;
|
||||
uint64_t getSrcMods(const SIInstrInfo *TII,
|
||||
const MachineOperand *SrcOp) const;
|
||||
};
|
||||
|
||||
class SDWADstOperand : public SDWAOperand {
|
||||
@ -240,13 +241,24 @@ static bool isSubregOf(const MachineOperand &SubReg,
|
||||
return SuperMask.all();
|
||||
}
|
||||
|
||||
uint64_t SDWASrcOperand::getSrcMods() const {
|
||||
uint64_t SDWASrcOperand::getSrcMods(const SIInstrInfo *TII,
|
||||
const MachineOperand *SrcOp) const {
|
||||
uint64_t Mods = 0;
|
||||
const auto *MI = SrcOp->getParent();
|
||||
if (TII->getNamedOperand(*MI, AMDGPU::OpName::src0) == SrcOp) {
|
||||
if (auto *Mod = TII->getNamedOperand(*MI, AMDGPU::OpName::src0_modifiers)) {
|
||||
Mods = Mod->getImm();
|
||||
}
|
||||
} else if (TII->getNamedOperand(*MI, AMDGPU::OpName::src1) == SrcOp) {
|
||||
if (auto *Mod = TII->getNamedOperand(*MI, AMDGPU::OpName::src1_modifiers)) {
|
||||
Mods = Mod->getImm();
|
||||
}
|
||||
}
|
||||
if (Abs || Neg) {
|
||||
assert(!Sext &&
|
||||
"Float and integer src modifiers can't be set simulteniously");
|
||||
Mods |= Abs ? SISrcMods::ABS : 0;
|
||||
Mods |= Neg ? SISrcMods::NEG : 0;
|
||||
Mods ^= Neg ? SISrcMods::NEG : 0;
|
||||
} else if (Sext) {
|
||||
Mods |= SISrcMods::SEXT;
|
||||
}
|
||||
@ -312,7 +324,7 @@ bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
|
||||
}
|
||||
copyRegOperand(*Src, *getTargetOperand());
|
||||
SrcSel->setImm(getSrcSel());
|
||||
SrcMods->setImm(getSrcMods());
|
||||
SrcMods->setImm(getSrcMods(TII, Src));
|
||||
getTargetOperand()->setIsKill(false);
|
||||
return true;
|
||||
}
|
||||
@ -409,7 +421,10 @@ void SIPeepholeSDWA::matchSDWAOperands(MachineFunction &MF) {
|
||||
switch (Opcode) {
|
||||
case AMDGPU::V_LSHRREV_B32_e32:
|
||||
case AMDGPU::V_ASHRREV_I32_e32:
|
||||
case AMDGPU::V_LSHLREV_B32_e32: {
|
||||
case AMDGPU::V_LSHLREV_B32_e32:
|
||||
case AMDGPU::V_LSHRREV_B32_e64:
|
||||
case AMDGPU::V_ASHRREV_I32_e64:
|
||||
case AMDGPU::V_LSHLREV_B32_e64: {
|
||||
// from: v_lshrrev_b32_e32 v1, 16/24, v0
|
||||
// to SDWA src:v0 src_sel:WORD_1/BYTE_3
|
||||
|
||||
@ -432,7 +447,8 @@ void SIPeepholeSDWA::matchSDWAOperands(MachineFunction &MF) {
|
||||
TRI->isPhysicalRegister(Dst->getReg()))
|
||||
break;
|
||||
|
||||
if (Opcode == AMDGPU::V_LSHLREV_B32_e32) {
|
||||
if (Opcode == AMDGPU::V_LSHLREV_B32_e32 ||
|
||||
Opcode == AMDGPU::V_LSHLREV_B32_e64) {
|
||||
auto SDWADst = make_unique<SDWADstOperand>(
|
||||
Dst, Src1, *Imm == 16 ? WORD_1 : BYTE_3, UNUSED_PAD);
|
||||
DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWADst << '\n');
|
||||
@ -441,7 +457,8 @@ void SIPeepholeSDWA::matchSDWAOperands(MachineFunction &MF) {
|
||||
} else {
|
||||
auto SDWASrc = make_unique<SDWASrcOperand>(
|
||||
Src1, Dst, *Imm == 16 ? WORD_1 : BYTE_3, false, false,
|
||||
Opcode == AMDGPU::V_LSHRREV_B32_e32 ? false : true);
|
||||
Opcode != AMDGPU::V_LSHRREV_B32_e32 &&
|
||||
Opcode != AMDGPU::V_LSHRREV_B32_e64);
|
||||
DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWASrc << '\n');
|
||||
SDWAOperands[&MI] = std::move(SDWASrc);
|
||||
++NumSDWAPatternsFound;
|
||||
@ -451,7 +468,10 @@ void SIPeepholeSDWA::matchSDWAOperands(MachineFunction &MF) {
|
||||
|
||||
case AMDGPU::V_LSHRREV_B16_e32:
|
||||
case AMDGPU::V_ASHRREV_I16_e32:
|
||||
case AMDGPU::V_LSHLREV_B16_e32: {
|
||||
case AMDGPU::V_LSHLREV_B16_e32:
|
||||
case AMDGPU::V_LSHRREV_B16_e64:
|
||||
case AMDGPU::V_ASHRREV_I16_e64:
|
||||
case AMDGPU::V_LSHLREV_B16_e64: {
|
||||
// from: v_lshrrev_b16_e32 v1, 8, v0
|
||||
// to SDWA src:v0 src_sel:BYTE_1
|
||||
|
||||
@ -472,7 +492,8 @@ void SIPeepholeSDWA::matchSDWAOperands(MachineFunction &MF) {
|
||||
TRI->isPhysicalRegister(Dst->getReg()))
|
||||
break;
|
||||
|
||||
if (Opcode == AMDGPU::V_LSHLREV_B16_e32) {
|
||||
if (Opcode == AMDGPU::V_LSHLREV_B16_e32 ||
|
||||
Opcode == AMDGPU::V_LSHLREV_B16_e64) {
|
||||
auto SDWADst =
|
||||
make_unique<SDWADstOperand>(Dst, Src1, BYTE_1, UNUSED_PAD);
|
||||
DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWADst << '\n');
|
||||
@ -481,7 +502,8 @@ void SIPeepholeSDWA::matchSDWAOperands(MachineFunction &MF) {
|
||||
} else {
|
||||
auto SDWASrc = make_unique<SDWASrcOperand>(
|
||||
Src1, Dst, BYTE_1, false, false,
|
||||
Opcode == AMDGPU::V_LSHRREV_B16_e32 ? false : true);
|
||||
Opcode != AMDGPU::V_LSHRREV_B16_e32 &&
|
||||
Opcode != AMDGPU::V_LSHRREV_B16_e64);
|
||||
DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWASrc << '\n');
|
||||
SDWAOperands[&MI] = std::move(SDWASrc);
|
||||
++NumSDWAPatternsFound;
|
||||
@ -549,20 +571,25 @@ void SIPeepholeSDWA::matchSDWAOperands(MachineFunction &MF) {
|
||||
++NumSDWAPatternsFound;
|
||||
break;
|
||||
}
|
||||
case AMDGPU::V_AND_B32_e32: {
|
||||
case AMDGPU::V_AND_B32_e32:
|
||||
case AMDGPU::V_AND_B32_e64: {
|
||||
// e.g.:
|
||||
// from: v_and_b32_e32 v1, 0x0000ffff/0x000000ff, v0
|
||||
// to SDWA src:v0 src_sel:WORD_0/BYTE_0
|
||||
|
||||
MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
|
||||
auto Imm = foldToImm(*Src0);
|
||||
if (!Imm)
|
||||
break;
|
||||
|
||||
if (*Imm != 0x0000ffff && *Imm != 0x000000ff)
|
||||
break;
|
||||
|
||||
MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
|
||||
auto ValSrc = Src1;
|
||||
auto Imm = foldToImm(*Src0);
|
||||
|
||||
if (!Imm) {
|
||||
Imm = foldToImm(*Src1);
|
||||
ValSrc = Src0;
|
||||
}
|
||||
|
||||
if (!Imm || (*Imm != 0x0000ffff && *Imm != 0x000000ff))
|
||||
break;
|
||||
|
||||
MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
|
||||
|
||||
if (TRI->isPhysicalRegister(Src1->getReg()) ||
|
||||
@ -570,7 +597,7 @@ void SIPeepholeSDWA::matchSDWAOperands(MachineFunction &MF) {
|
||||
break;
|
||||
|
||||
auto SDWASrc = make_unique<SDWASrcOperand>(
|
||||
Src1, Dst, *Imm == 0x0000ffff ? WORD_0 : BYTE_0);
|
||||
ValSrc, Dst, *Imm == 0x0000ffff ? WORD_0 : BYTE_0);
|
||||
DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWASrc << '\n');
|
||||
SDWAOperands[&MI] = std::move(SDWASrc);
|
||||
++NumSDWAPatternsFound;
|
||||
@ -583,28 +610,38 @@ void SIPeepholeSDWA::matchSDWAOperands(MachineFunction &MF) {
|
||||
|
||||
bool SIPeepholeSDWA::isConvertibleToSDWA(const MachineInstr &MI) const {
|
||||
// Check if this instruction has opcode that supports SDWA
|
||||
return AMDGPU::getSDWAOp(MI.getOpcode()) != -1;
|
||||
unsigned Opc = MI.getOpcode();
|
||||
if (AMDGPU::getSDWAOp(Opc) != -1)
|
||||
return true;
|
||||
int Opc32 = AMDGPU::getVOPe32(Opc);
|
||||
if (Opc32 != -1 && AMDGPU::getSDWAOp(Opc32) != -1)
|
||||
return !TII->hasModifiersSet(MI, AMDGPU::OpName::omod) &&
|
||||
!TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
|
||||
return false;
|
||||
}
|
||||
|
||||
bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
|
||||
const SDWAOperandsVector &SDWAOperands) {
|
||||
// Convert to sdwa
|
||||
int SDWAOpcode = AMDGPU::getSDWAOp(MI.getOpcode());
|
||||
if (SDWAOpcode == -1)
|
||||
SDWAOpcode = AMDGPU::getSDWAOp(AMDGPU::getVOPe32(MI.getOpcode()));
|
||||
assert(SDWAOpcode != -1);
|
||||
|
||||
// Copy dst, if it is present in original then should also be present in SDWA
|
||||
MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
|
||||
if (!Dst && !TII->isVOPC(MI))
|
||||
return false;
|
||||
|
||||
const MCInstrDesc &SDWADesc = TII->get(SDWAOpcode);
|
||||
|
||||
// Create SDWA version of instruction MI and initialize its operands
|
||||
MachineInstrBuilder SDWAInst =
|
||||
BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), SDWADesc);
|
||||
|
||||
// Copy dst, if it is present in original then should also be present in SDWA
|
||||
MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
|
||||
if (Dst) {
|
||||
assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst) != -1);
|
||||
SDWAInst.add(*Dst);
|
||||
} else {
|
||||
assert(TII->isVOPC(MI));
|
||||
}
|
||||
|
||||
// Copy src0, initialize src0_modifiers. All sdwa instructions has src0 and
|
||||
@ -614,7 +651,10 @@ bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
|
||||
Src0 &&
|
||||
AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0) != -1 &&
|
||||
AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0_modifiers) != -1);
|
||||
SDWAInst.addImm(0);
|
||||
if (auto *Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers))
|
||||
SDWAInst.addImm(Mod->getImm());
|
||||
else
|
||||
SDWAInst.addImm(0);
|
||||
SDWAInst.add(*Src0);
|
||||
|
||||
// Copy src1 if present, initialize src1_modifiers.
|
||||
@ -623,10 +663,11 @@ bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
|
||||
assert(
|
||||
AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1) != -1 &&
|
||||
AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1_modifiers) != -1);
|
||||
SDWAInst.addImm(0);
|
||||
if (auto *Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers))
|
||||
SDWAInst.addImm(Mod->getImm());
|
||||
else
|
||||
SDWAInst.addImm(0);
|
||||
SDWAInst.add(*Src1);
|
||||
} else {
|
||||
assert(TII->isVOP1(MI));
|
||||
}
|
||||
|
||||
if (SDWAOpcode == AMDGPU::V_MAC_F16_sdwa ||
|
||||
|
@ -66,7 +66,7 @@ define amdgpu_kernel void @s_test_add_v2i16_kernarg(<2 x i16> addrspace(1)* %out
|
||||
|
||||
; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0x7b, v{{[0-9]+}}
|
||||
; VI-DAG: v_mov_b32_e32 v[[SCONST:[0-9]+]], 0x1c8
|
||||
; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, v[[SCONST]], v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v[[SCONST]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
define amdgpu_kernel void @v_test_add_v2i16_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
|
||||
@ -84,7 +84,7 @@ define amdgpu_kernel void @v_test_add_v2i16_constant(<2 x i16> addrspace(1)* %ou
|
||||
|
||||
; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0xfffffcb3, v{{[0-9]+}}
|
||||
; VI-DAG: v_mov_b32_e32 v[[SCONST:[0-9]+]], 0xfffffc21
|
||||
; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, v[[SCONST]], v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v[[SCONST]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
define amdgpu_kernel void @v_test_add_v2i16_neg_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
|
||||
@ -101,7 +101,7 @@ define amdgpu_kernel void @v_test_add_v2i16_neg_constant(<2 x i16> addrspace(1)*
|
||||
; VI: v_mov_b32_e32 v[[SCONST:[0-9]+]], -1
|
||||
; VI: flat_load_ushort [[LOAD0:v[0-9]+]]
|
||||
; VI: flat_load_ushort [[LOAD1:v[0-9]+]]
|
||||
; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, v[[SCONST]], [[LOAD0]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, [[LOAD0]], v[[SCONST]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, -1, [[LOAD1]]
|
||||
; VI: v_or_b32_e32
|
||||
define amdgpu_kernel void @v_test_add_v2i16_inline_neg1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
|
||||
@ -140,7 +140,7 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_lo_zero_hi(<2 x i16> addrspac
|
||||
|
||||
; VI-NOT: v_add_u16
|
||||
; VI: v_mov_b32_e32 v[[K:[0-9]+]], 0x3f80
|
||||
; VI: v_add_u16_sdwa v{{[0-9]+}}, v[[K]], v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; VI: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v[[K]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; VI-NOT: v_add_u16
|
||||
; VI: v_or_b32_e32
|
||||
define amdgpu_kernel void @v_test_add_v2i16_inline_fp_split(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
|
||||
|
@ -9,7 +9,7 @@
|
||||
; GFX9: v_pk_ashrrev_i16 [[RESULT:v[0-9]+]], [[RHS]], [[VLHS]]
|
||||
|
||||
; VI: v_ashrrev_i32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
|
||||
; CI: v_ashrrev_i32_e32
|
||||
; CI: v_and_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}}
|
||||
|
@ -40,7 +40,7 @@ define amdgpu_kernel void @s_fabs_f16(half addrspace(1)* %out, half %in) {
|
||||
; VI: flat_load_ushort [[LO:v[0-9]+]]
|
||||
; VI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x7fff{{$}}
|
||||
; VI-DAG: v_and_b32_e32 [[FABS_LO:v[0-9]+]], [[MASK]], [[HI]]
|
||||
; VI-DAG: v_and_b32_sdwa [[FABS_HI:v[0-9]+]], [[MASK]], [[LO]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; VI-DAG: v_and_b32_sdwa [[FABS_HI:v[0-9]+]], [[LO]], [[MASK]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, [[FABS_HI]], [[FABS_LO]]
|
||||
; VI: flat_store_dword
|
||||
|
||||
@ -60,8 +60,8 @@ define amdgpu_kernel void @s_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half
|
||||
; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]]
|
||||
|
||||
; VI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x7fff{{$}}
|
||||
; VI-DAG: v_and_b32_sdwa v{{[0-9]+}}, [[MASK]], v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; VI-DAG: v_and_b32_sdwa v{{[0-9]+}}, [[MASK]], v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; VI-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[MASK]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; VI-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[MASK]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; VI-DAG: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}}
|
||||
; VI-DAG: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}}
|
||||
; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||
@ -128,7 +128,7 @@ define amdgpu_kernel void @fabs_free_v2f16(<2 x half> addrspace(1)* %out, i32 %i
|
||||
; CI: v_cvt_f16_f32
|
||||
|
||||
; VI: v_lshrrev_b32_e32 v{{[0-9]+}}, 16,
|
||||
; VI: v_mul_f16_e64 v{{[0-9]+}}, |v{{[0-9]+}}|, v{{[0-9]+}}
|
||||
; VI: v_mul_f16_sdwa v{{[0-9]+}}, |v{{[0-9]+}}|, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
||||
; VI: v_mul_f16_e64 v{{[0-9]+}}, |v{{[0-9]+}}|, v{{[0-9]+}}
|
||||
|
||||
; GFX9: v_and_b32_e32 [[FABS:v[0-9]+]], 0x7fff7fff, [[VAL]]
|
||||
|
@ -78,7 +78,7 @@ entry:
|
||||
; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
|
||||
|
||||
; VI-DAG: v_add_f16_e32 v[[R_F16_LO:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]]
|
||||
; VI-DAG: v_add_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; VI-DAG: v_add_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
|
||||
|
||||
; GCN: buffer_store_dword v[[R_V2_F16]]
|
||||
@ -108,7 +108,7 @@ entry:
|
||||
; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
|
||||
|
||||
; VI-DAG: v_mov_b32_e32 v[[CONST2:[0-9]+]], 0x4000
|
||||
; VI-DAG: v_add_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[CONST2]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
||||
; VI-DAG: v_add_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[B_V2_F16]], v[[CONST2]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
||||
; VI-DAG: v_add_f16_e32 v[[R_F16_0:[0-9]+]], 1.0, v[[B_V2_F16]]
|
||||
; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
|
||||
|
||||
@ -137,7 +137,7 @@ entry:
|
||||
; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
|
||||
|
||||
; VI-DAG: v_mov_b32_e32 v[[CONST1:[0-9]+]], 0x3c00
|
||||
; VI-DAG: v_add_f16_sdwa v[[R_F16_0:[0-9]+]], v[[CONST1]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
||||
; VI-DAG: v_add_f16_sdwa v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[CONST1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
||||
; VI-DAG: v_add_f16_e32 v[[R_F16_1:[0-9]+]], 2.0, v[[A_V2_F16]]
|
||||
; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]
|
||||
|
||||
|
@ -278,9 +278,9 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_v2f16(<2 x half> addrspa
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}s_test_canonicalize_var_v2f16:
|
||||
; VI: v_mul_f16_e64 [[REG0:v[0-9]+]], 1.0, {{s[0-9]+}}
|
||||
; VI-DAG: v_mul_f16_e64 [[REG1:v[0-9]+]], 1.0, {{s[0-9]+}}
|
||||
; VI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
|
||||
; VI: v_mov_b32_e32 [[ONE:v[0-9]+]], 0x3c00
|
||||
; VI: v_mul_f16_sdwa [[REG0:v[0-9]+]], [[ONE]], {{v[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; VI: v_mul_f16_e64 [[REG1:v[0-9]+]], 1.0, {{s[0-9]+}}
|
||||
; VI-NOT: v_and_b32
|
||||
|
||||
; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, {{s[0-9]+$}}
|
||||
|
@ -78,7 +78,7 @@ entry:
|
||||
; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
|
||||
|
||||
; VI-DAG: v_mul_f16_e32 v[[R_F16_LO:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]]
|
||||
; VI-DAG: v_mul_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; VI-DAG: v_mul_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
|
||||
|
||||
; GCN: buffer_store_dword v[[R_V2_F16]]
|
||||
@ -105,7 +105,7 @@ entry:
|
||||
; SI: v_mul_f32_e32 v[[R_F32_1:[0-9]+]], 4.0, v[[B_F32_1]]
|
||||
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
|
||||
; VI-DAG: v_mov_b32_e32 v[[CONST4:[0-9]+]], 0x4400
|
||||
; VI-DAG: v_mul_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[CONST4]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
||||
; VI-DAG: v_mul_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[B_V2_F16]], v[[CONST4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
||||
; VI-DAG: v_mul_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]]
|
||||
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
||||
; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
|
||||
@ -131,7 +131,7 @@ entry:
|
||||
; SI: v_mul_f32_e32 v[[R_F32_1:[0-9]+]], 0x40400000, v[[A_F32_1]]
|
||||
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
|
||||
; VI-DAG: v_mov_b32_e32 v[[CONST3:[0-9]+]], 0x4200
|
||||
; VI-DAG: v_mul_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[CONST3]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
||||
; VI-DAG: v_mul_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], v[[CONST3]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
||||
; VI-DAG: v_mul_f16_e32 v[[R_F16_0:[0-9]+]], 4.0, v[[A_V2_F16]]
|
||||
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
||||
; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
|
||||
|
@ -73,7 +73,7 @@ define amdgpu_kernel void @v_fneg_fabs_f16(half addrspace(1)* %out, half addrspa
|
||||
; CIVI: s_mov_b32 [[MASK:s[0-9]+]], 0x8000{{$}}
|
||||
; VI: v_mov_b32_e32 [[VMASK:v[0-9]+]], [[MASK]]
|
||||
; CI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
|
||||
; VI: v_or_b32_sdwa v{{[0-9]+}}, [[VMASK]], v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[VMASK]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; CIVI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
|
||||
; CIVI: flat_store_dword
|
||||
|
||||
@ -92,9 +92,9 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x
|
||||
; CI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
|
||||
; CI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
|
||||
; VI: v_mov_b32_e32 [[VMASK:v[0-9]+]], [[MASK]]
|
||||
; VI: v_or_b32_sdwa v{{[0-9]+}}, [[VMASK]], v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[VMASK]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; VI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
|
||||
; VI: v_or_b32_sdwa v{{[0-9]+}}, [[VMASK]], v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[VMASK]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; VI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
|
||||
|
||||
; GFX9: s_mov_b32 [[MASK:s[0-9]+]], 0x80008000
|
||||
@ -116,7 +116,7 @@ define amdgpu_kernel void @fneg_fabs_v4f16(<4 x half> addrspace(1)* %out, <4 x h
|
||||
; CI: v_mul_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
|
||||
|
||||
; VI: v_mul_f16_e64 v{{[0-9]+}}, -|v{{[0-9]+}}|, 4.0
|
||||
; VI: v_mul_f16_e64 v{{[0-9]+}}, -|v{{[0-9]+}}|, 4.0
|
||||
; VI: v_mul_f16_sdwa v{{[0-9]+}}, -|v{{[0-9]+}}|, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
|
||||
; GFX9: s_and_b32 [[ABS:s[0-9]+]], s{{[0-9]+}}, 0x7fff7fff
|
||||
; GFX9: v_pk_mul_f16 v{{[0-9]+}}, [[ABS]], 4.0 neg_lo:[1,0] neg_hi:[1,0]
|
||||
|
@ -117,7 +117,7 @@ define amdgpu_kernel void @fneg_free_v2f16(<2 x half> addrspace(1)* %out, i32 %i
|
||||
; CI: v_cvt_f16_f32
|
||||
|
||||
; VI: v_lshrrev_b32_e32 v{{[0-9]+}}, 16,
|
||||
; VI: v_mul_f16_e64 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; VI: v_mul_f16_sdwa v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
||||
; VI: v_mul_f16_e64 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}
|
||||
|
||||
; GFX9: v_pk_mul_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} neg_lo:[1,0] neg_hi:[1,0]{{$}}
|
||||
|
@ -66,7 +66,7 @@ entry:
|
||||
; VI: v_cvt_f32_f16_sdwa v[[A_F32_1:[0-9]+]], v[[A_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
|
||||
; VI: v_cvt_i32_f32_e32 v[[R_I16_0:[0-9]+]], v[[A_F32_0]]
|
||||
; VI: v_cvt_i32_f32_sdwa v[[R_I16_1:[0-9]+]], v[[A_F32_1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
|
||||
; VI: v_or_b32_sdwa v[[R_V2_I16:[0-9]+]], v[[R_I16_1]], v[[R_I16_0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; VI: v_or_b32_sdwa v[[R_V2_I16:[0-9]+]], v[[R_I16_0]], v[[R_I16_1]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
|
||||
; GCN: buffer_store_dword v[[R_V2_I16]]
|
||||
; GCN: s_endpgm
|
||||
|
@ -66,7 +66,7 @@ entry:
|
||||
; VI-DAG: v_cvt_f32_f16_sdwa v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
|
||||
; VI: v_cvt_i32_f32_e32 v[[R_I16_1:[0-9]+]], v[[A_F32_1]]
|
||||
; VI: v_cvt_i32_f32_sdwa v[[R_I16_0:[0-9]+]], v[[A_F32_0]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
|
||||
; VI: v_or_b32_sdwa v[[R_V2_I16:[0-9]+]], v[[R_I16_0]], v[[R_I16_1]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; VI: v_or_b32_sdwa v[[R_V2_I16:[0-9]+]], v[[R_I16_1]], v[[R_I16_0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
|
||||
; GCN: buffer_store_dword v[[R_V2_I16]]
|
||||
; GCN: s_endpgm
|
||||
|
@ -78,7 +78,7 @@ entry:
|
||||
; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
|
||||
|
||||
; VI-DAG: v_subrev_f16_e32 v[[R_F16_0:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]]
|
||||
; VI-DAG: v_subrev_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; VI-DAG: v_sub_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
|
||||
|
||||
; GFX9: v_pk_add_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] neg_lo:[0,1] neg_hi:[0,1]
|
||||
@ -146,7 +146,7 @@ entry:
|
||||
; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
|
||||
|
||||
; VI-DAG: v_mov_b32_e32 [[CONSTM1:v[0-9]+]], 0xbc00
|
||||
; VI-DAG: v_add_f16_sdwa v[[R_F16_HI:[0-9]+]], [[CONSTM1]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
||||
; VI-DAG: v_add_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], [[CONSTM1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
||||
; VI-DAG: v_add_f16_e32 v[[R_F16_0:[0-9]+]], -2.0, v[[A_V2_F16]]
|
||||
; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
|
||||
|
||||
|
@ -124,7 +124,7 @@ define amdgpu_kernel void @store_literal_imm_v2f16(<2 x half> addrspace(1)* %out
|
||||
; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
|
||||
; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 0, [[VAL0]]
|
||||
; VI-DAG: v_mov_b32_e32 [[CONST0:v[0-9]+]], 0
|
||||
; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[CONST0]], [[VAL1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONST0]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; VI: v_or_b32
|
||||
; VI: buffer_store_dword
|
||||
define amdgpu_kernel void @add_inline_imm_0.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
|
||||
@ -142,7 +142,7 @@ define amdgpu_kernel void @add_inline_imm_0.0_v2f16(<2 x half> addrspace(1)* %ou
|
||||
; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
|
||||
; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 0.5, [[VAL0]]
|
||||
; VI-DAG: v_mov_b32_e32 [[CONST05:v[0-9]+]], 0x3800
|
||||
; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[CONST05]], [[VAL1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONST05]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; VI: v_or_b32
|
||||
; VI: buffer_store_dword
|
||||
define amdgpu_kernel void @add_inline_imm_0.5_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
|
||||
@ -160,7 +160,7 @@ define amdgpu_kernel void @add_inline_imm_0.5_v2f16(<2 x half> addrspace(1)* %ou
|
||||
; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
|
||||
; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -0.5, [[VAL0]]
|
||||
; VI-DAG: v_mov_b32_e32 [[CONSTM05:v[0-9]+]], 0xb800
|
||||
; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[CONSTM05]], [[VAL1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONSTM05]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; VI: v_or_b32
|
||||
; VI: buffer_store_dword
|
||||
define amdgpu_kernel void @add_inline_imm_neg_0.5_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
|
||||
@ -178,7 +178,7 @@ define amdgpu_kernel void @add_inline_imm_neg_0.5_v2f16(<2 x half> addrspace(1)*
|
||||
; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
|
||||
; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 1.0, [[VAL0]]
|
||||
; VI-DAG: v_mov_b32_e32 [[CONST1:v[0-9]+]], 0x3c00
|
||||
; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[CONST1]], [[VAL1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONST1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; VI: v_or_b32
|
||||
; VI: buffer_store_dword
|
||||
define amdgpu_kernel void @add_inline_imm_1.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
|
||||
@ -196,7 +196,7 @@ define amdgpu_kernel void @add_inline_imm_1.0_v2f16(<2 x half> addrspace(1)* %ou
|
||||
; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
|
||||
; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -1.0, [[VAL0]]
|
||||
; VI-DAG: v_mov_b32_e32 [[CONSTM1:v[0-9]+]], 0xbc00
|
||||
; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[CONSTM1]], [[VAL1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONSTM1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; VI: v_or_b32
|
||||
; VI: buffer_store_dword
|
||||
define amdgpu_kernel void @add_inline_imm_neg_1.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
|
||||
@ -214,7 +214,7 @@ define amdgpu_kernel void @add_inline_imm_neg_1.0_v2f16(<2 x half> addrspace(1)*
|
||||
; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
|
||||
; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 2.0, [[VAL0]]
|
||||
; VI-DAG: v_mov_b32_e32 [[CONST2:v[0-9]+]], 0x4000
|
||||
; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[CONST2]], [[VAL1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONST2]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; VI: v_or_b32
|
||||
; VI: buffer_store_dword
|
||||
define amdgpu_kernel void @add_inline_imm_2.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
|
||||
@ -232,7 +232,7 @@ define amdgpu_kernel void @add_inline_imm_2.0_v2f16(<2 x half> addrspace(1)* %ou
|
||||
; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
|
||||
; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -2.0, [[VAL0]]
|
||||
; VI-DAG: v_mov_b32_e32 [[CONSTM2:v[0-9]+]], 0xc000
|
||||
; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[CONSTM2]], [[VAL1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONSTM2]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; VI: v_or_b32
|
||||
; VI: buffer_store_dword
|
||||
define amdgpu_kernel void @add_inline_imm_neg_2.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
|
||||
@ -250,7 +250,7 @@ define amdgpu_kernel void @add_inline_imm_neg_2.0_v2f16(<2 x half> addrspace(1)*
|
||||
; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
|
||||
; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 4.0, [[VAL0]]
|
||||
; VI-DAG: v_mov_b32_e32 [[CONST4:v[0-9]+]], 0x4400
|
||||
; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[CONST4]], [[VAL1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONST4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; VI: v_or_b32
|
||||
; VI: buffer_store_dword
|
||||
define amdgpu_kernel void @add_inline_imm_4.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
|
||||
@ -268,7 +268,7 @@ define amdgpu_kernel void @add_inline_imm_4.0_v2f16(<2 x half> addrspace(1)* %ou
|
||||
; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
|
||||
; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -4.0, [[VAL0]]
|
||||
; VI-DAG: v_mov_b32_e32 [[CONSTM4:v[0-9]+]], 0xc400
|
||||
; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[CONSTM4]], [[VAL1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONSTM4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; VI: v_or_b32
|
||||
; VI: buffer_store_dword
|
||||
define amdgpu_kernel void @add_inline_imm_neg_4.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
|
||||
@ -285,7 +285,7 @@ define amdgpu_kernel void @add_inline_imm_neg_4.0_v2f16(<2 x half> addrspace(1)*
|
||||
; VI: v_mov_b32_e32 [[CONST05:v[0-9]+]], 0x3800
|
||||
; VI: buffer_load_dword
|
||||
; VI-NOT: and
|
||||
; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[CONST05]], v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
||||
; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[CONST05]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
||||
; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 0.5, v{{[0-9]+}}
|
||||
; VI: v_or_b32
|
||||
; VI: buffer_store_dword
|
||||
@ -306,7 +306,7 @@ define amdgpu_kernel void @commute_add_inline_imm_0.5_v2f16(<2 x half> addrspace
|
||||
; VI-DAG: buffer_load_dword
|
||||
; VI-NOT: and
|
||||
; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, [[K]], v{{[0-9]+}}
|
||||
; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[K]], v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
||||
; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[K]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
||||
; VI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; VI: buffer_store_dword
|
||||
define amdgpu_kernel void @commute_add_literal_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
|
||||
@ -325,7 +325,7 @@ define amdgpu_kernel void @commute_add_literal_v2f16(<2 x half> addrspace(1)* %o
|
||||
; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
|
||||
; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 1, [[VAL0]]
|
||||
; VI-DAG: v_mov_b32_e32 [[CONST1:v[0-9]+]], 1
|
||||
; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[CONST1]], [[VAL1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONST1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; VI: v_or_b32
|
||||
; VI: buffer_store_dword
|
||||
define amdgpu_kernel void @add_inline_imm_1_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
|
||||
@ -343,7 +343,7 @@ define amdgpu_kernel void @add_inline_imm_1_v2f16(<2 x half> addrspace(1)* %out,
|
||||
; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
|
||||
; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 2, [[VAL0]]
|
||||
; VI-DAG: v_mov_b32_e32 [[CONST2:v[0-9]+]], 2
|
||||
; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[CONST2]], [[VAL1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONST2]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; VI: v_or_b32
|
||||
; VI: buffer_store_dword
|
||||
define amdgpu_kernel void @add_inline_imm_2_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
|
||||
@ -361,7 +361,7 @@ define amdgpu_kernel void @add_inline_imm_2_v2f16(<2 x half> addrspace(1)* %out,
|
||||
; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
|
||||
; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 16, [[VAL0]]
|
||||
; VI-DAG: v_mov_b32_e32 [[CONST16:v[0-9]+]], 16
|
||||
; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[CONST16]], [[VAL1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONST16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; VI: v_or_b32
|
||||
; VI: buffer_store_dword
|
||||
define amdgpu_kernel void @add_inline_imm_16_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
|
||||
@ -379,7 +379,7 @@ define amdgpu_kernel void @add_inline_imm_16_v2f16(<2 x half> addrspace(1)* %out
|
||||
; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
|
||||
; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -1, [[VAL0]]
|
||||
; VI-DAG: v_mov_b32_e32 [[CONSTM1:v[0-9]+]], 0xffff
|
||||
; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[CONSTM1]], [[VAL1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONSTM1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; VI: v_or_b32
|
||||
; VI: buffer_store_dword
|
||||
define amdgpu_kernel void @add_inline_imm_neg_1_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
|
||||
@ -397,7 +397,7 @@ define amdgpu_kernel void @add_inline_imm_neg_1_v2f16(<2 x half> addrspace(1)* %
|
||||
; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
|
||||
; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -2, [[VAL0]]
|
||||
; VI-DAG: v_mov_b32_e32 [[CONSTM2:v[0-9]+]], 0xfffe
|
||||
; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[CONSTM2]], [[VAL1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONSTM2]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; VI: v_or_b32
|
||||
; VI: buffer_store_dword
|
||||
define amdgpu_kernel void @add_inline_imm_neg_2_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
|
||||
@ -415,7 +415,7 @@ define amdgpu_kernel void @add_inline_imm_neg_2_v2f16(<2 x half> addrspace(1)* %
|
||||
; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
|
||||
; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -16, [[VAL0]]
|
||||
; VI-DAG: v_mov_b32_e32 [[CONSTM16:v[0-9]+]], 0xfff0
|
||||
; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[CONSTM16]], [[VAL1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONSTM16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; VI: v_or_b32
|
||||
; VI: buffer_store_dword
|
||||
define amdgpu_kernel void @add_inline_imm_neg_16_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
|
||||
@ -433,7 +433,7 @@ define amdgpu_kernel void @add_inline_imm_neg_16_v2f16(<2 x half> addrspace(1)*
|
||||
; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
|
||||
; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 63, [[VAL0]]
|
||||
; VI-DAG: v_mov_b32_e32 [[CONST63:v[0-9]+]], 63
|
||||
; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[CONST63]], [[VAL1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONST63]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; VI: v_or_b32
|
||||
; VI: buffer_store_dword
|
||||
define amdgpu_kernel void @add_inline_imm_63_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
|
||||
@ -451,7 +451,7 @@ define amdgpu_kernel void @add_inline_imm_63_v2f16(<2 x half> addrspace(1)* %out
|
||||
; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
|
||||
; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 64, [[VAL0]]
|
||||
; VI-DAG: v_mov_b32_e32 [[CONST64:v[0-9]+]], 64
|
||||
; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[CONST64]], [[VAL1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONST64]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; VI: v_or_b32
|
||||
; VI: buffer_store_dword
|
||||
define amdgpu_kernel void @add_inline_imm_64_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
|
||||
|
@ -261,7 +261,7 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_inlineimm(<2 x i16> addrspace
|
||||
; VI: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e70000
|
||||
; GCN-DAG: flat_load_dword [[VEC:v[0-9]+]]
|
||||
; CI: v_or_b32_e32 [[RES:v[0-9]+]], 0x3e70000, [[VEC]]
|
||||
; VI: v_or_b32_sdwa [[RES:v[0-9]+]], [[K]], [[VEC]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; VI: v_or_b32_sdwa [[RES:v[0-9]+]], [[VEC]], [[K]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
|
||||
; GFX9-DAG: s_movk_i32 [[K:s[0-9]+]], 0x3e7
|
||||
; GFX9-DAG: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VEC]]
|
||||
@ -285,7 +285,7 @@ define amdgpu_kernel void @v_insertelement_v2i16_1(<2 x i16> addrspace(1)* %out,
|
||||
; CI: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VEC]]
|
||||
; GFX9: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VEC]]
|
||||
; CI: v_or_b32_e32 [[RES:v[0-9]+]], 0xfff10000, [[ELT0]]
|
||||
; VI: v_or_b32_sdwa [[RES:v[0-9]+]], [[K]], [[VEC]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; VI: v_or_b32_sdwa [[RES:v[0-9]+]], [[VEC]], [[K]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX9: v_lshl_or_b32 [[RES:v[0-9]+]], -15, 16, [[ELT0]]
|
||||
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]]
|
||||
define amdgpu_kernel void @v_insertelement_v2i16_1_inlineimm(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
|
||||
@ -345,7 +345,7 @@ define amdgpu_kernel void @v_insertelement_v2f16_0_inlineimm(<2 x half> addrspac
|
||||
; VI: v_mov_b32_e32 [[K:v[0-9]+]], 0x45000000
|
||||
; GCN-DAG: flat_load_dword [[VEC:v[0-9]+]]
|
||||
; CI: v_or_b32_e32 [[RES:v[0-9]+]], 0x45000000, [[VEC]]
|
||||
; VI: v_or_b32_sdwa [[RES:v[0-9]+]], [[K]], [[VEC]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; VI: v_or_b32_sdwa [[RES:v[0-9]+]], [[VEC]], [[K]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
|
||||
; GFX9-DAG: s_movk_i32 [[K:s[0-9]+]], 0x4500
|
||||
; GFX9-DAG: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VEC]]
|
||||
@ -369,7 +369,7 @@ define amdgpu_kernel void @v_insertelement_v2f16_1(<2 x half> addrspace(1)* %out
|
||||
; CI: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VEC]]
|
||||
; GFX9: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VEC]]
|
||||
; CI: v_or_b32_e32 [[RES:v[0-9]+]], 0x230000, [[ELT0]]
|
||||
; VI: v_or_b32_sdwa [[RES:v[0-9]+]], [[K]], [[VEC]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; VI: v_or_b32_sdwa [[RES:v[0-9]+]], [[VEC]], [[K]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX9: v_lshl_or_b32 [[RES:v[0-9]+]], 35, 16, [[ELT0]]
|
||||
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]]
|
||||
define amdgpu_kernel void @v_insertelement_v2f16_1_inlineimm(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
|
||||
|
@ -118,7 +118,7 @@ define amdgpu_kernel void @fmuladd_f16_imm_b(
|
||||
; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
|
||||
|
||||
; VI-FLUSH: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
|
||||
; VI-FLUSH-DAG: v_mac_f16_sdwa v[[A_F16_1]], v[[C_V2_F16]], v[[B_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; VI-FLUSH-DAG: v_mac_f16_sdwa v[[A_F16_1]], v[[B_V2_F16]], v[[C_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; VI-FLUSH-DAG: v_mac_f16_e32 v[[A_V2_F16]], v[[C_V2_F16]], v[[B_V2_F16]]
|
||||
; VI-FLUSH-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[A_F16_1]]
|
||||
; VI-FLUSH-NOT: v_and_b32
|
||||
|
@ -82,7 +82,7 @@ entry:
|
||||
; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
|
||||
|
||||
; VI-DAG: v_max_f16_e32 v[[R_F16_0:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]]
|
||||
; VI-DAG: v_max_f16_sdwa v[[R_F16_1:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; VI-DAG: v_max_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; VI-NOT: and
|
||||
; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]
|
||||
|
||||
@ -110,7 +110,7 @@ entry:
|
||||
; SI: v_max_f32_e32 v[[R_F32_1:[0-9]+]], 4.0, v[[B_F32_1]]
|
||||
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
|
||||
; VI-DAG: v_mov_b32_e32 [[CONST4:v[0-9]+]], 0x4400
|
||||
; VI-DAG: v_max_f16_sdwa v[[R_F16_HI:[0-9]+]], [[CONST4]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
||||
; VI-DAG: v_max_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[B_V2_F16]], [[CONST4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
||||
; VI-DAG: v_max_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]]
|
||||
|
||||
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
||||
@ -138,7 +138,7 @@ entry:
|
||||
; SI: v_max_f32_e32 v[[R_F32_1:[0-9]+]], 0x40400000, v[[A_F32_1]]
|
||||
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
|
||||
; VI-DAG: v_mov_b32_e32 [[CONST3:v[0-9]+]], 0x4200
|
||||
; VI-DAG: v_max_f16_sdwa v[[R_F16_HI:[0-9]+]], [[CONST3]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
||||
; VI-DAG: v_max_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[B_V2_F16]], [[CONST3]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
||||
; VI-DAG: v_max_f16_e32 v[[R_F16_0:[0-9]+]], 4.0, v[[A_V2_F16]]
|
||||
|
||||
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
||||
|
@ -81,7 +81,7 @@ entry:
|
||||
; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
|
||||
|
||||
; VI-DAG: v_min_f16_e32 v[[R_F16_0:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]]
|
||||
; VI-DAG: v_min_f16_sdwa v[[R_F16_1:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; VI-DAG: v_min_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; VI-NOT: and
|
||||
; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]
|
||||
|
||||
@ -111,7 +111,7 @@ entry:
|
||||
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
|
||||
|
||||
; VI-DAG: v_mov_b32_e32 [[CONST4:v[0-9]+]], 0x4400
|
||||
; VI-DAG: v_min_f16_sdwa v[[R_F16_HI:[0-9]+]], [[CONST4]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
||||
; VI-DAG: v_min_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[B_V2_F16]], [[CONST4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
||||
; VI-DAG: v_min_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]]
|
||||
|
||||
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
||||
@ -139,7 +139,7 @@ entry:
|
||||
; SI: v_min_f32_e32 v[[R_F32_1:[0-9]+]], 0x40400000, v[[A_F32_1]]
|
||||
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
|
||||
; VI-DAG: v_mov_b32_e32 [[CONST3:v[0-9]+]], 0x4200
|
||||
; VI-DAG: v_min_f16_sdwa v[[R_F16_HI:[0-9]+]], [[CONST3]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
||||
; VI-DAG: v_min_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[B_V2_F16]], [[CONST3]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
||||
; VI-DAG: v_min_f16_e32 v[[R_F16_0:[0-9]+]], 4.0, v[[A_V2_F16]]
|
||||
|
||||
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
||||
|
@ -12,8 +12,10 @@
|
||||
; GCN-DAG: v_lshlrev_b32_e32 [[BYTES:v[0-9]+]], 2, v0
|
||||
; GCN-DAG: v_and_b32_e32 [[CLAMP_IDX:v[0-9]+]], 0x1fc, [[BYTES]]
|
||||
|
||||
; GCN-DAG: v_or_b32_e32 [[LO_OFF:v[0-9]+]], 0x200, [[CLAMP_IDX]]
|
||||
; GCN-DAG: v_or_b32_e32 [[HI_OFF:v[0-9]+]], 0x400, [[CLAMP_IDX]]
|
||||
; GCN-DAG: v_mov_b32_e32 [[C200:v[0-9]+]], 0x200
|
||||
; GCN-DAG: v_mov_b32_e32 [[C400:v[0-9]+]], 0x400
|
||||
; GCN-DAG: v_or_b32_e32 [[LO_OFF:v[0-9]+]], [[C200]], [[CLAMP_IDX]]
|
||||
; GCN-DAG: v_or_b32_e32 [[HI_OFF:v[0-9]+]], [[C400]], [[CLAMP_IDX]]
|
||||
|
||||
; GCN: buffer_load_dword {{v[0-9]+}}, [[LO_OFF]], {{s\[[0-9]+:[0-9]+\]}}, [[SWO]] offen
|
||||
; GCN: buffer_load_dword {{v[0-9]+}}, [[HI_OFF]], {{s\[[0-9]+:[0-9]+\]}}, [[SWO]] offen
|
||||
|
@ -74,7 +74,7 @@ entry:
|
||||
|
||||
; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL_LO:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
|
||||
; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL_HI:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; SDWA: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL_HI]], v[[DST_MUL_LO]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; SDWA: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL_LO]], v[[DST_MUL_HI]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
|
||||
define amdgpu_kernel void @mul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %ina, <2 x i16> addrspace(1)* %inb) {
|
||||
entry:
|
||||
@ -97,8 +97,8 @@ entry:
|
||||
; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL1:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL2:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
|
||||
; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL3:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; SDWA-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL3]], v[[DST_MUL2]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; SDWA-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL1]], v[[DST_MUL0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; SDWA-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL2]], v[[DST_MUL3]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; SDWA-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL0]], v[[DST_MUL1]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
|
||||
define amdgpu_kernel void @mul_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %ina, <4 x i16> addrspace(1)* %inb) {
|
||||
entry:
|
||||
@ -125,10 +125,10 @@ entry:
|
||||
; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL5:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL6:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
|
||||
; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL7:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; SDWA-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL7]], v[[DST_MUL6]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; SDWA-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL5]], v[[DST_MUL4]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; SDWA-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL3]], v[[DST_MUL2]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; SDWA-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL1]], v[[DST_MUL0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; SDWA-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL6]], v[[DST_MUL7]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; SDWA-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL4]], v[[DST_MUL5]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; SDWA-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL2]], v[[DST_MUL3]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; SDWA-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL0]], v[[DST_MUL1]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
|
||||
define amdgpu_kernel void @mul_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> addrspace(1)* %ina, <8 x i16> addrspace(1)* %inb) {
|
||||
entry:
|
||||
@ -347,8 +347,8 @@ entry:
|
||||
; NOSDWA-NOT: v_mul_u32_u24_sdwa
|
||||
; SDWA-DAG: v_mov_b32_e32 v[[M321:[0-9]+]], 0x141
|
||||
; SDWA-DAG: v_mov_b32_e32 v[[M123:[0-9]+]], 0x7b
|
||||
; SDWA-DAG: v_mul_u32_u24_sdwa v{{[0-9]+}}, v[[M123]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; SDWA-DAG: v_mul_u32_u24_sdwa v{{[0-9]+}}, v[[M321]], v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
||||
; SDWA-DAG: v_mul_u32_u24_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v[[M123]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; SDWA-DAG: v_mul_u32_u24_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v[[M321]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
||||
|
||||
define amdgpu_kernel void @immediate_mul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
|
||||
entry:
|
||||
@ -367,7 +367,7 @@ entry:
|
||||
; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; NOSDWA-NOT: v_mul_u32_u24_sdwa
|
||||
|
||||
; SDWA: v_mul_u32_u24_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
||||
; SDWA: v_mul_u32_u24_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
||||
|
||||
define amdgpu_kernel void @mulmul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %ina, <2 x i16> addrspace(1)* %inb) {
|
||||
entry:
|
||||
@ -408,9 +408,9 @@ store_label:
|
||||
; NOSDWA-NOT: v_and_b32_sdwa
|
||||
; NOSDWA-NOT: v_or_b32_sdwa
|
||||
|
||||
; SDWA-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
||||
; SDWA-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
||||
; SDWA-DAG: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
|
||||
; SDWA-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
||||
; SDWA-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
||||
; SDWA-DAG: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
|
||||
; SDWA: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
|
||||
|
@ -10,7 +10,7 @@
|
||||
|
||||
; VI: v_lshlrev_b32_e32
|
||||
; VI: v_lshlrev_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
|
||||
; CI: v_lshlrev_b32_e32
|
||||
; CI: v_and_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}}
|
||||
|
@ -10,11 +10,11 @@
|
||||
|
||||
; VI: v_sub_i32_e32
|
||||
; VI-DAG: v_sub_i32_e32
|
||||
; VI: v_max_i32_sdwa v{{[0-9]+}}, sext(v{{[0-9]+}}), v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; VI: v_max_i32_sdwa v{{[0-9]+}}, sext(v{{[0-9]+}}), v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; VI: v_max_i32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, sext(v{{[0-9]+}}) dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; VI: v_max_i32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, sext(v{{[0-9]+}}) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; VI: v_add_i32_e32
|
||||
; VI: v_add_i32_e32
|
||||
; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
|
||||
; CI: v_sub_i32_e32
|
||||
; CI-DAG: v_sub_i32_e32
|
||||
@ -47,7 +47,7 @@ define amdgpu_kernel void @s_abs_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %
|
||||
; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; VI: v_add_u16_e32 v{{[0-9]+}}, 2, v{{[0-9]+}}
|
||||
; VI: v_add_u16_sdwa v{{[0-9]+}}, [[TWO]], v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; VI: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[TWO]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; VI-NOT: v_and_b32
|
||||
; VI: v_or_b32_e32
|
||||
define amdgpu_kernel void @v_abs_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %src) #0 {
|
||||
|
@ -5,7 +5,7 @@
|
||||
; GCN-LABEL: {{^}}v_test_sub_v2i16:
|
||||
; GFX9: v_pk_sub_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||
|
||||
; VI: v_subrev_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; VI: v_sub_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; VI: v_subrev_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||
define amdgpu_kernel void @v_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
@ -62,7 +62,7 @@ define amdgpu_kernel void @s_test_sub_v2i16_kernarg(<2 x i16> addrspace(1)* %out
|
||||
; GFX9: v_pk_sub_i16 v{{[0-9]+}}, v{{[0-9]+}}, [[CONST]]
|
||||
|
||||
; VI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0xfffffe38
|
||||
; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, [[K]], v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[K]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0xffffff85, v{{[0-9]+}}
|
||||
define amdgpu_kernel void @v_test_sub_v2i16_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
@ -80,7 +80,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_constant(<2 x i16> addrspace(1)* %ou
|
||||
; GFX9: v_pk_sub_i16 v{{[0-9]+}}, v{{[0-9]+}}, [[CONST]]
|
||||
|
||||
; VI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3df
|
||||
; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, [[K]], v{{[0-9]+}}
|
||||
; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[K]]
|
||||
; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0x34d, v{{[0-9]+}}
|
||||
define amdgpu_kernel void @v_test_sub_v2i16_neg_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
@ -98,7 +98,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_neg_constant(<2 x i16> addrspace(1)*
|
||||
; VI: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
|
||||
; VI: flat_load_ushort [[LOAD0:v[0-9]+]]
|
||||
; VI: flat_load_ushort [[LOAD1:v[0-9]+]]
|
||||
; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, [[ONE]], [[LOAD0]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, [[LOAD0]], [[ONE]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 1, [[LOAD1]]
|
||||
; VI: v_or_b32_e32
|
||||
define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
|
||||
@ -137,7 +137,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(<2 x i16> addrspac
|
||||
|
||||
; VI-NOT: v_subrev_i16
|
||||
; VI: v_mov_b32_e32 [[K:v[0-9]+]], 0xffffc080
|
||||
; VI: v_add_u16_sdwa v{{[0-9]+}}, [[K]], v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; VI: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[K]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; VI-NOT: v_subrev_i16
|
||||
; VI: v_or_b32_e32
|
||||
define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
|
||||
@ -252,7 +252,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i32(<2 x i32> addrspace(1)
|
||||
; GFX9: v_pk_sub_i16
|
||||
; GFX9: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
|
||||
|
||||
; VI: v_subrev_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; VI: v_sub_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; VI: v_subrev_u16_e32
|
||||
|
||||
; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
|
||||
|
@ -304,14 +304,14 @@ entry:
|
||||
; GCN: {{buffer|flat}}_load_dword v[[C_V2_F16:[0-9]+]]
|
||||
|
||||
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
|
||||
; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
|
||||
; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
|
||||
; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
|
||||
; SI-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
|
||||
; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]]
|
||||
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
|
||||
; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
|
||||
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
|
||||
; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
|
||||
; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
|
||||
; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
|
||||
; SI: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
|
||||
; SI: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]]
|
||||
; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]]
|
||||
; SI-DAG: v_mac_f32_e32 v[[C_F32_0]], v[[B_F32_0]], v[[A_F32_0]]
|
||||
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_LO:[0-9]+]], v[[C_F32_0]]
|
||||
; SI-DAG: v_mac_f32_e32 v[[C_F32_1]], v[[B_F32_1]], v[[A_F32_1]]
|
||||
@ -320,12 +320,12 @@ entry:
|
||||
; VI-NOT: and
|
||||
; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
|
||||
|
||||
; VI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
|
||||
; VI-DAG: v_mac_f16_sdwa v[[A_F16_1]], v[[C_V2_F16]], v[[B_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; VI-DAG: v_mac_f16_e32 v[[A_V2_F16]], v[[C_V2_F16]], v[[B_V2_F16]]
|
||||
; VI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[A_F16_1]]
|
||||
; VI-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
|
||||
; VI-DAG: v_mac_f16_sdwa v[[C_F16_1]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; VI-DAG: v_mac_f16_e32 v[[C_V2_F16]], v[[B_V2_F16]], v[[A_V2_F16]]
|
||||
; VI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[C_F16_1]]
|
||||
; VI-NOT: and
|
||||
; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[A_V2_F16]]
|
||||
; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[C_V2_F16]]
|
||||
|
||||
; GCN: {{buffer|flat}}_store_dword v[[R_V2_F16]]
|
||||
; GCN: s_endpgm
|
||||
@ -336,7 +336,9 @@ define amdgpu_kernel void @mac_v2f16(
|
||||
<2 x half> addrspace(1)* %c) #0 {
|
||||
entry:
|
||||
%a.val = load <2 x half>, <2 x half> addrspace(1)* %a
|
||||
call void @llvm.amdgcn.s.barrier() #2
|
||||
%b.val = load <2 x half>, <2 x half> addrspace(1)* %b
|
||||
call void @llvm.amdgcn.s.barrier() #2
|
||||
%c.val = load <2 x half>, <2 x half> addrspace(1)* %c
|
||||
|
||||
%t.val = fmul <2 x half> %a.val, %b.val
|
||||
@ -485,7 +487,7 @@ entry:
|
||||
; VI-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
|
||||
; VI-DAG: v_sub_f16_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}}
|
||||
; VI-DAG: v_sub_f16_sdwa v[[NEG_A0:[0-9]+]], [[ZERO]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
||||
; VI-DAG: v_mac_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
||||
; VI-DAG: v_mac_f16_sdwa v{{[0-9]+}}, v[[NEG_A0]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
||||
; VI-DAG: v_mac_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A1]]
|
||||
|
||||
; GCN: s_endpgm
|
||||
@ -517,7 +519,7 @@ entry:
|
||||
; VI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
|
||||
; VI: v_sub_f16_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}}
|
||||
; VI: v_sub_f16_sdwa v[[NEG_A0:[0-9]+]], [[ZERO]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
||||
; VI-DAG: v_mac_f16_sdwa v{{[0-9]+}}, v[[NEG_A0]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
||||
; VI-DAG: v_mac_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
||||
; VI-DAG: v_mac_f16_e32 v{{[0-9]+}}, v[[NEG_A1]], v{{[0-9]+}}
|
||||
|
||||
; GCN: s_endpgm
|
||||
@ -670,5 +672,8 @@ entry:
|
||||
ret void
|
||||
}
|
||||
|
||||
declare void @llvm.amdgcn.s.barrier() #2
|
||||
|
||||
attributes #0 = { nounwind "no-signed-zeros-fp-math"="false" }
|
||||
attributes #1 = { nounwind "no-signed-zeros-fp-math"="true" }
|
||||
attributes #2 = { nounwind convergent }
|
||||
|
Loading…
x
Reference in New Issue
Block a user