[AMDGPU] Add GFX12 WMMA and SWMMAC instructions (#77795)

Co-authored-by: Petar Avramovic <Petar.Avramovic@amd.com>
Co-authored-by: Piotr Sobczak <piotr.sobczak@amd.com>
Mirko Brkušanin authored 2024-01-24 13:43:07 +01:00; committed by Tom Stellard
parent aa4cb0e313
commit ed48280f8e
65 changed files with 17708 additions and 111 deletions


@@ -436,5 +436,67 @@ TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_i32, "ii*1", "nc", "gfx12-insts,w
TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_v4i16, "V4sV4s*1", "nc", "gfx12-insts,wavefrontsize64")
TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_v4f16, "V4hV4h*1", "nc", "gfx12-insts,wavefrontsize64")
+//===----------------------------------------------------------------------===//
+// WMMA builtins.
+// Postfix w32 indicates the builtin requires wavefront size of 32.
+// Postfix w64 indicates the builtin requires wavefront size of 64.
+//
+// Some of these are very similar to their GFX11 counterparts, but they don't
+// require replication of the A,B matrices, so they use fewer vector elements.
+// Therefore, we add an "_gfx12" suffix to distinguish them from the existing
+// builtins.
+//===----------------------------------------------------------------------===//
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12, "V8fV8hV8hV8f", "nc", "gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12, "V8fV8sV8sV8f", "nc", "gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12, "V8hV8hV8hV8h", "nc", "gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32_gfx12, "V8sV8sV8sV8s", "nc", "gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12, "V8iIbV2iIbV2iV8iIb", "nc", "gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12, "V8iIbiIbiV8iIb", "nc", "gfx12-insts,wavefrontsize32")
+// These are gfx12-only, but for consistency with the other WMMA variants we're
+// keeping the "_gfx12" suffix.
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12, "V8fV2iV2iV8f", "nc", "gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12, "V8fV2iV2iV8f", "nc", "gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12, "V8fV2iV2iV8f", "nc", "gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12, "V8fV2iV2iV8f", "nc", "gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12, "V8iIbV2iIbV2iV8iIb", "nc", "gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12, "V4fV4hV4hV4f", "nc", "gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64_gfx12, "V4fV4sV4sV4f", "nc", "gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12, "V4hV4hV4hV4h", "nc", "gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64_gfx12, "V4sV4sV4sV4s", "nc", "gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64_gfx12, "V4iIbiIbiV4iIb", "nc", "gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12, "V4iIbiIbiV4iIb", "nc", "gfx12-insts,wavefrontsize64")
+// These are gfx12-only, but for consistency with the other WMMA variants we're
+// keeping the "_gfx12" suffix.
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12, "V4fiiV4f", "nc", "gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12, "V4fiiV4f", "nc", "gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12, "V4fiiV4f", "nc", "gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64_gfx12, "V4fiiV4f", "nc", "gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12, "V4iIbiIbiV4iIb", "nc", "gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_f16_w32, "V8fV8hV16hV8fs", "nc", "gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w32, "V8fV8sV16sV8fs", "nc", "gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_f16_16x16x32_f16_w32, "V8hV8hV16hV8hs", "nc", "gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w32, "V8sV8sV16sV8ss", "nc", "gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w32, "V8iIbV2iIbV4iV8isIb", "nc", "gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w32, "V8iIbiIbV2iV8isIb", "nc", "gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w32, "V8iIbV2iIbV4iV8isIb", "nc", "gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32, "V8fV2iV4iV8fs", "nc", "gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32, "V8fV2iV4iV8fs", "nc", "gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32, "V8fV2iV4iV8fs", "nc", "gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32, "V8fV2iV4iV8fs", "nc", "gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_f16_w64, "V4fV4hV8hV4fs", "nc", "gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w64, "V4fV4sV8sV4fs", "nc", "gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_f16_16x16x32_f16_w64, "V4hV4hV8hV4hs", "nc", "gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w64, "V4sV4sV8sV4ss", "nc", "gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w64, "V4iIbiIbV2iV4isIb", "nc", "gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w64, "V4iIbiIbiV4isIb", "nc", "gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w64, "V4iIbiIbV2iV4isIb", "nc", "gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64, "V4fiV2iV4fs", "nc", "gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64, "V4fiV2iV4fs", "nc", "gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64, "V4fiV2iV4fs", "nc", "gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64, "V4fiV2iV4fs", "nc", "gfx12-insts,wavefrontsize64")
#undef BUILTIN
#undef TARGET_BUILTIN
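For readers less used to Clang's builtin prototype encoding: in the strings above, V8f is a vector of eight floats, V8h eight halves, V8s eight shorts (the carrier type for bf16 fragments), V8i eight ints, and a leading I marks an argument that must be an integer constant expression. So "V8fV8hV8hV8f" describes a builtin returning <8 x float> from two <8 x half> sources and an <8 x float> accumulator. A minimal wave32 caller, written in the same OpenCL C style as the new tests further down (the function name is illustrative):

typedef float v8f __attribute__((ext_vector_type(8)));
typedef half v8h __attribute__((ext_vector_type(8)));

// D = A * B + C on a 16x16x16 tile; each of the 32 lanes carries 8 elements.
void example_wmma_f32_16x16x16_f16(global v8f* out, v8h a, v8h b, v8f c)
{
  *out = __builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12(a, b, c);
}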


@@ -18279,65 +18279,216 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32:
case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64:
case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32:
-case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64: {
+case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64:
+case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32_gfx12:
+case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64_gfx12:
+case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12:
+case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12:
+case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12:
+case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64_gfx12:
+case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12:
+case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12:
+case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12:
+case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12:
+case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12:
+case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64_gfx12:
+case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12:
+case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12:
+case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12:
+case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12:
+case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12:
+case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12:
+case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12:
+case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64_gfx12:
+case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12:
+case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12:
+case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w32:
+case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w64:
+case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w32:
+case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w64:
+case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w32:
+case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w64:
+case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w32:
+case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w64:
+case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w32:
+case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w64:
+case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w32:
+case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w64:
+case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w32:
+case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w64:
+case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32:
+case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64:
+case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32:
+case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64:
+case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32:
+case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64:
+case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32:
+case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64: {
// These operations perform a matrix multiplication and accumulation of
// the form:
// D = A * B + C
-// The return type always matches the type of matrix C.
-unsigned ArgForMatchingRetType;
+// We need to specify one type for matrices AB and one for matrices CD.
+// Sparse matrix operations can have different types for A and B as well as
+// an additional type for sparsity index.
+// Destination type should be put before types used for source operands.
+SmallVector<unsigned, 2> ArgsForMatchingMatrixTypes;
+// On GFX12, the intrinsics with 16-bit accumulator use a packed layout.
+// There is no need for the variable opsel argument, so always set it to
+// "false".
+bool AppendFalseForOpselArg = false;
unsigned BuiltinWMMAOp;
switch (BuiltinID) {
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32:
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64:
-ArgForMatchingRetType = 2;
+case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12:
+case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12:
+ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_f16;
break;
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32:
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64:
-ArgForMatchingRetType = 2;
+case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12:
+case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64_gfx12:
+ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_bf16;
break;
+case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12:
+case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12:
+AppendFalseForOpselArg = true;
+LLVM_FALLTHROUGH;
case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32:
case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64:
-ArgForMatchingRetType = 2;
+ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x16_f16;
break;
+case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32_gfx12:
+case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64_gfx12:
+AppendFalseForOpselArg = true;
+LLVM_FALLTHROUGH;
case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32:
case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64:
-ArgForMatchingRetType = 2;
+ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16;
break;
case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32:
case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64:
-ArgForMatchingRetType = 2;
+ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x16_f16_tied;
break;
case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32:
case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64:
-ArgForMatchingRetType = 2;
+ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16_tied;
break;
case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32:
case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64:
-ArgForMatchingRetType = 4;
+case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12:
+case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64_gfx12:
+ArgsForMatchingMatrixTypes = {4, 1}; // CD, AB
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x16_iu8;
break;
case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32:
case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64:
-ArgForMatchingRetType = 4;
+case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12:
+case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12:
+ArgsForMatchingMatrixTypes = {4, 1}; // CD, AB
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x16_iu4;
break;
+case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12:
+case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12:
+ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
+BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_fp8;
+break;
+case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12:
+case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12:
+ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
+BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_bf8;
+break;
+case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12:
+case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12:
+ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
+BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_fp8;
+break;
+case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12:
+case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64_gfx12:
+ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
+BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_bf8;
+break;
+case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12:
+case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12:
+ArgsForMatchingMatrixTypes = {4, 1}; // CD, AB
+BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x32_iu4;
+break;
+case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w32:
+case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w64:
+ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
+BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_f16;
+break;
+case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w32:
+case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w64:
+ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
+BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16;
+break;
+case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w32:
+case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w64:
+ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
+BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f16_16x16x32_f16;
+break;
+case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w32:
+case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w64:
+ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
+BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16;
+break;
+case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w32:
+case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w64:
+ArgsForMatchingMatrixTypes = {4, 1, 3, 5}; // CD, A, B, Index
+BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8;
+break;
+case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w32:
+case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w64:
+ArgsForMatchingMatrixTypes = {4, 1, 3, 5}; // CD, A, B, Index
+BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4;
+break;
+case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w32:
+case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w64:
+ArgsForMatchingMatrixTypes = {4, 1, 3, 5}; // CD, A, B, Index
+BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4;
+break;
+case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32:
+case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64:
+ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
+BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8;
+break;
+case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32:
+case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64:
+ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
+BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8;
+break;
+case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32:
+case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64:
+ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
+BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8;
+break;
+case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32:
+case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64:
+ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
+BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8;
+break;
}
SmallVector<Value *, 6> Args;
for (int i = 0, e = E->getNumArgs(); i != e; ++i)
Args.push_back(EmitScalarExpr(E->getArg(i)));
+if (AppendFalseForOpselArg)
+Args.push_back(Builder.getFalse());
-Function *F = CGM.getIntrinsic(BuiltinWMMAOp,
-{Args[ArgForMatchingRetType]->getType()});
+SmallVector<llvm::Type *, 6> ArgTypes;
+for (auto ArgIdx : ArgsForMatchingMatrixTypes)
+ArgTypes.push_back(Args[ArgIdx]->getType());
+Function *F = CGM.getIntrinsic(BuiltinWMMAOp, ArgTypes);
return Builder.CreateCall(F, Args);
}
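The heart of the change above is overload selection: previously the intrinsic was instantiated from a single type, Args[ArgForMatchingRetType]->getType(); now ArgsForMatchingMatrixTypes lists every call argument whose IR type becomes an overload parameter, destination type first. A sketch of what that means for one SWMMAC builtin (types and the resulting mangled name match the new tests below):

typedef float v8f __attribute__((ext_vector_type(8)));
typedef half v8h __attribute__((ext_vector_type(8)));
typedef half v16h __attribute__((ext_vector_type(16)));

void example_swmmac_overload(global v8f* out, v8h a, v16h b, v8f c, short idx)
{
  // ArgsForMatchingMatrixTypes = {2, 0, 1, 3} collects the types of
  // c (v8f32), a (v8f16), b (v16f16) and idx (i16), which selects
  //   @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.i16
  *out = __builtin_amdgcn_swmmac_f32_16x16x32_f16_w32(a, b, c, idx);
}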


@@ -0,0 +1,156 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
// REQUIRES: amdgpu-registered-target
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize32 -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200
typedef int v2i __attribute__((ext_vector_type(2)));
typedef float v8f __attribute__((ext_vector_type(8)));
typedef half v8h __attribute__((ext_vector_type(8)));
typedef short v8s __attribute__((ext_vector_type(8)));
typedef int v8i __attribute__((ext_vector_type(8)));
// Wave32
//
// amdgcn_wmma_f32_16x16x16_f16
//
// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_f16_w32(
// CHECK-GFX1200-NEXT: entry:
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16(<8 x half> [[A:%.*]], <8 x half> [[B:%.*]], <8 x float> [[C:%.*]])
// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4:![0-9]+]]
// CHECK-GFX1200-NEXT: ret void
//
void test_amdgcn_wmma_f32_16x16x16_f16_w32(global v8f* out, v8h a, v8h b, v8f c)
{
*out = __builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12(a, b, c);
}
//
// amdgcn_wmma_f32_16x16x16_bf16
//
// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_bf16_w32(
// CHECK-GFX1200-NEXT: entry:
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x float> [[C:%.*]])
// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
// CHECK-GFX1200-NEXT: ret void
//
void test_amdgcn_wmma_f32_16x16x16_bf16_w32(global v8f* out, v8s a, v8s b, v8f c)
{
*out = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12(a, b, c);
}
//
// amdgcn_wmma_f16_16x16x16_f16
//
// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f16_16x16x16_f16_w32(
// CHECK-GFX1200-NEXT: entry:
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> [[A:%.*]], <8 x half> [[B:%.*]], <8 x half> [[C:%.*]], i1 false)
// CHECK-GFX1200-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
// CHECK-GFX1200-NEXT: ret void
//
void test_amdgcn_wmma_f16_16x16x16_f16_w32(global v8h* out, v8h a, v8h b, v8h c)
{
*out = __builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12(a, b, c);
}
//
// amdgcn_wmma_bf16_16x16x16_bf16
//
// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_bf16_16x16x16_bf16_w32(
// CHECK-GFX1200-NEXT: entry:
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], i1 false)
// CHECK-GFX1200-NEXT: store <8 x i16> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
// CHECK-GFX1200-NEXT: ret void
//
void test_amdgcn_wmma_bf16_16x16x16_bf16_w32(global v8s* out, v8s a, v8s b, v8s c)
{
*out = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32_gfx12(a, b, c);
}
//
// amdgcn_wmma_i32_16x16x16_iu8
//
// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_i32_16x16x16_iu8_w32(
// CHECK-GFX1200-NEXT: entry:
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32(i1 true, <2 x i32> [[A:%.*]], i1 true, <2 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i1 false)
// CHECK-GFX1200-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
// CHECK-GFX1200-NEXT: ret void
//
void test_amdgcn_wmma_i32_16x16x16_iu8_w32(global v8i* out, v2i a, v2i b, v8i c)
{
*out = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12(true, a, true, b, c, false);
}
//
// amdgcn_wmma_i32_16x16x16_iu4
//
// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_i32_16x16x16_iu4_w32(
// CHECK-GFX1200-NEXT: entry:
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32(i1 true, i32 [[A:%.*]], i1 true, i32 [[B:%.*]], <8 x i32> [[C:%.*]], i1 false)
// CHECK-GFX1200-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
// CHECK-GFX1200-NEXT: ret void
//
void test_amdgcn_wmma_i32_16x16x16_iu4_w32(global v8i* out, int a, int b, v8i c)
{
*out = __builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12(true, a, true, b, c, false);
}
// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32(
// CHECK-GFX1200-NEXT: entry:
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <8 x float> [[C:%.*]])
// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
// CHECK-GFX1200-NEXT: ret void
//
void test_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32(global v8f* out, v2i a, v2i b, v8f c)
{
*out = __builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12(a, b, c);
}
// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32(
// CHECK-GFX1200-NEXT: entry:
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <8 x float> [[C:%.*]])
// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
// CHECK-GFX1200-NEXT: ret void
//
void test_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32(global v8f* out, v2i a, v2i b, v8f c)
{
*out = __builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12(a, b, c);
}
// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32(
// CHECK-GFX1200-NEXT: entry:
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <8 x float> [[C:%.*]])
// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
// CHECK-GFX1200-NEXT: ret void
//
void test_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32(global v8f* out, v2i a, v2i b, v8f c)
{
*out = __builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12(a, b, c);
}
// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32(
// CHECK-GFX1200-NEXT: entry:
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <8 x float> [[C:%.*]])
// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
// CHECK-GFX1200-NEXT: ret void
//
void test_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32(global v8f* out, v2i a, v2i b, v8f c)
{
*out = __builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12(a, b, c);
}
// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_i32_16x16x32_iu4_w32(
// CHECK-GFX1200-NEXT: entry:
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32(i1 true, <2 x i32> [[A:%.*]], i1 true, <2 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i1 false)
// CHECK-GFX1200-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
// CHECK-GFX1200-NEXT: ret void
//
void test_amdgcn_wmma_i32_16x16x32_iu4_w32(global v8i* out, v2i a, v2i b, v8i c)
{
*out = __builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12(true, a, true, b, c, false);
}
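One detail worth noting in the iu8/iu4 tests above: the bool arguments are declared as "Ib" in BuiltinsAMDGPU.def, so each must be an integer constant expression. Judging from the intrinsic operand order, the two leading flags choose a signed or unsigned reading of the A and B tiles and the trailing flag requests clamping of the i32 accumulator; that interpretation is inferred from the intrinsic definitions rather than spelled out in this patch. A sketch:

typedef int v2i __attribute__((ext_vector_type(2)));
typedef int v8i __attribute__((ext_vector_type(8)));

void example_iu8_flags(global v8i* out, v2i a, v2i b, v8i c)
{
  // Literal flags are required: a runtime bool in any of the three i1
  // positions is rejected at compile time because of the Ib encoding.
  *out = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12(true, a, true, b, c, false);
}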


@@ -0,0 +1,155 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
// REQUIRES: amdgpu-registered-target
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize64 -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200
typedef float v4f __attribute__((ext_vector_type(4)));
typedef half v4h __attribute__((ext_vector_type(4)));
typedef short v4s __attribute__((ext_vector_type(4)));
typedef int v4i __attribute__((ext_vector_type(4)));
// Wave64
//
// amdgcn_wmma_f32_16x16x16_f16
//
// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_f16_w64(
// CHECK-GFX1200-NEXT: entry:
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half> [[A:%.*]], <4 x half> [[B:%.*]], <4 x float> [[C:%.*]])
// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4:![0-9]+]]
// CHECK-GFX1200-NEXT: ret void
//
void test_amdgcn_wmma_f32_16x16x16_f16_w64(global v4f* out, v4h a, v4h b, v4f c)
{
*out = __builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12(a, b, c);
}
//
// amdgcn_wmma_f32_16x16x16_bf16
//
// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_bf16_w64(
// CHECK-GFX1200-NEXT: entry:
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x float> [[C:%.*]])
// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
// CHECK-GFX1200-NEXT: ret void
//
void test_amdgcn_wmma_f32_16x16x16_bf16_w64(global v4f* out, v4s a, v4s b, v4f c)
{
*out = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w64_gfx12(a, b, c);
}
//
// amdgcn_wmma_f16_16x16x16_f16
//
// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f16_16x16x16_f16_w64(
// CHECK-GFX1200-NEXT: entry:
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16(<4 x half> [[A:%.*]], <4 x half> [[B:%.*]], <4 x half> [[C:%.*]], i1 false)
// CHECK-GFX1200-NEXT: store <4 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 8, !tbaa [[TBAA4]]
// CHECK-GFX1200-NEXT: ret void
//
void test_amdgcn_wmma_f16_16x16x16_f16_w64(global v4h* out, v4h a, v4h b, v4h c)
{
*out = __builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12(a, b, c);
}
//
// amdgcn_wmma_bf16_16x16x16_bf16
//
// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_bf16_16x16x16_bf16_w64(
// CHECK-GFX1200-NEXT: entry:
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[C:%.*]], i1 false)
// CHECK-GFX1200-NEXT: store <4 x i16> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 8, !tbaa [[TBAA4]]
// CHECK-GFX1200-NEXT: ret void
//
void test_amdgcn_wmma_bf16_16x16x16_bf16_w64(global v4s* out, v4s a, v4s b, v4s c)
{
*out = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64_gfx12(a, b, c);
}
//
// amdgcn_wmma_i32_16x16x16_iu8
//
// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_i32_16x16x16_iu8_w64(
// CHECK-GFX1200-NEXT: entry:
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32(i1 true, i32 [[A:%.*]], i1 true, i32 [[B:%.*]], <4 x i32> [[C:%.*]], i1 false)
// CHECK-GFX1200-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
// CHECK-GFX1200-NEXT: ret void
//
void test_amdgcn_wmma_i32_16x16x16_iu8_w64(global v4i* out, int a, int b, v4i c)
{
*out = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w64_gfx12(true, a, true, b, c, false);
}
//
// amdgcn_wmma_i32_16x16x16_iu4
//
// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_i32_16x16x16_iu4_w64(
// CHECK-GFX1200-NEXT: entry:
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32(i1 true, i32 [[A:%.*]], i1 true, i32 [[B:%.*]], <4 x i32> [[C:%.*]], i1 false)
// CHECK-GFX1200-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
// CHECK-GFX1200-NEXT: ret void
//
void test_amdgcn_wmma_i32_16x16x16_iu4_w64(global v4i* out, int a, int b, v4i c)
{
*out = __builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12(true, a, true, b, c, false);
}
// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64(
// CHECK-GFX1200-NEXT: entry:
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32(i32 [[A:%.*]], i32 [[B:%.*]], <4 x float> [[C:%.*]])
// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
// CHECK-GFX1200-NEXT: ret void
//
void test_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64(global v4f* out, int a, int b, v4f c)
{
*out = __builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12(a, b, c);
}
// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64(
// CHECK-GFX1200-NEXT: entry:
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32(i32 [[A:%.*]], i32 [[B:%.*]], <4 x float> [[C:%.*]])
// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
// CHECK-GFX1200-NEXT: ret void
//
void test_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64(global v4f* out, int a, int b, v4f c)
{
*out = __builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12(a, b, c);
}
// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64(
// CHECK-GFX1200-NEXT: entry:
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32(i32 [[A:%.*]], i32 [[B:%.*]], <4 x float> [[C:%.*]])
// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
// CHECK-GFX1200-NEXT: ret void
//
void test_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64(global v4f* out, int a, int b, v4f c)
{
*out = __builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12(a, b, c);
}
// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64(
// CHECK-GFX1200-NEXT: entry:
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32(i32 [[A:%.*]], i32 [[B:%.*]], <4 x float> [[C:%.*]])
// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
// CHECK-GFX1200-NEXT: ret void
//
void test_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64(global v4f* out, int a, int b, v4f c)
{
*out = __builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64_gfx12(a, b, c);
}
// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_i32_16x16x32_iu4_w64(
// CHECK-GFX1200-NEXT: entry:
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32(i1 true, i32 [[A:%.*]], i1 true, i32 [[B:%.*]], <4 x i32> [[C:%.*]], i1 false)
// CHECK-GFX1200-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
// CHECK-GFX1200-NEXT: ret void
//
void test_amdgcn_wmma_i32_16x16x32_iu4_w64(global v4i* out, int a, int b, v4i c)
{
*out = __builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12(true, a, true, b, c, false);
}
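Read against the wave32 file above, these tests make the sizing rule visible: a 16x16 tile holds 256 elements spread across the wavefront, so the per-lane fragment halves when the wave width doubles. A back-of-the-envelope check (the exact lane-to-element mapping is defined by the hardware, not by this patch):

  elements per lane = (16 * 16) / wavefront size
    wave32: 256 / 32 = 8  -> v8f accumulators, v8h/v8s sources
    wave64: 256 / 64 = 4  -> v4f accumulators, v4h/v4s sources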


@@ -0,0 +1,135 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
// REQUIRES: amdgpu-registered-target
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize32 -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200
typedef int v2i __attribute__((ext_vector_type(2)));
typedef int v4i __attribute__((ext_vector_type(4)));
typedef float v8f __attribute__((ext_vector_type(8)));
typedef half v8h __attribute__((ext_vector_type(8)));
typedef short v8s __attribute__((ext_vector_type(8)));
typedef int v8i __attribute__((ext_vector_type(8)));
typedef half v16h __attribute__((ext_vector_type(16)));
typedef short v16s __attribute__((ext_vector_type(16)));
// Wave32
// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_f16_w32(
// CHECK-GFX1200-NEXT: entry:
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.i16(<8 x half> [[A:%.*]], <16 x half> [[B:%.*]], <8 x float> [[C:%.*]], i16 [[INDEX:%.*]])
// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4:![0-9]+]]
// CHECK-GFX1200-NEXT: ret void
//
void test_amdgcn_swmmac_f32_16x16x32_f16_w32(global v8f* out, v8h a, v16h b, v8f c, short index)
{
*out = __builtin_amdgcn_swmmac_f32_16x16x32_f16_w32(a, b, c, index);
}
// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_bf16_w32(
// CHECK-GFX1200-NEXT: entry:
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.i16(<8 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x float> [[C:%.*]], i16 [[INDEX:%.*]])
// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
// CHECK-GFX1200-NEXT: ret void
//
void test_amdgcn_swmmac_f32_16x16x32_bf16_w32(global v8f* out, v8s a, v16s b, v8f c, short index)
{
*out = __builtin_amdgcn_swmmac_f32_16x16x32_bf16_w32(a, b, c, index);
}
// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f16_16x16x32_f16_w32(
// CHECK-GFX1200-NEXT: entry:
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> [[A:%.*]], <16 x half> [[B:%.*]], <8 x half> [[C:%.*]], i16 [[INDEX:%.*]])
// CHECK-GFX1200-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
// CHECK-GFX1200-NEXT: ret void
//
void test_amdgcn_swmmac_f16_16x16x32_f16_w32(global v8h* out, v8h a, v16h b, v8h c, short index)
{
*out = __builtin_amdgcn_swmmac_f16_16x16x32_f16_w32(a, b, c, index);
}
// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_bf16_16x16x32_bf16_w32(
// CHECK-GFX1200-NEXT: entry:
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.i16(<8 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], i16 [[INDEX:%.*]])
// CHECK-GFX1200-NEXT: store <8 x i16> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
// CHECK-GFX1200-NEXT: ret void
//
void test_amdgcn_swmmac_bf16_16x16x32_bf16_w32(global v8s* out, v8s a, v16s b, v8s c, short index)
{
*out = __builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w32(a, b, c, index);
}
// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_i32_16x16x32_iu8_w32(
// CHECK-GFX1200-NEXT: entry:
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.i16(i1 true, <2 x i32> [[A:%.*]], i1 true, <4 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i16 [[INDEX:%.*]], i1 true)
// CHECK-GFX1200-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
// CHECK-GFX1200-NEXT: ret void
//
void test_amdgcn_swmmac_i32_16x16x32_iu8_w32(global v8i* out, v2i a, v4i b, v8i c, short index)
{
*out = __builtin_amdgcn_swmmac_i32_16x16x32_iu8_w32(true, a, true, b, c, index, true);
}
// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_i32_16x16x32_iu4_w32(
// CHECK-GFX1200-NEXT: entry:
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.i16(i1 true, i32 [[A:%.*]], i1 true, <2 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i16 [[INDEX:%.*]], i1 true)
// CHECK-GFX1200-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
// CHECK-GFX1200-NEXT: ret void
//
void test_amdgcn_swmmac_i32_16x16x32_iu4_w32(global v8i* out, int a, v2i b, v8i c, short index)
{
*out = __builtin_amdgcn_swmmac_i32_16x16x32_iu4_w32(true, a, true, b, c, index, true);
}
// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_i32_16x16x64_iu4_w32(
// CHECK-GFX1200-NEXT: entry:
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.i16(i1 true, <2 x i32> [[A:%.*]], i1 true, <4 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i16 [[INDEX:%.*]], i1 true)
// CHECK-GFX1200-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
// CHECK-GFX1200-NEXT: ret void
//
void test_amdgcn_swmmac_i32_16x16x64_iu4_w32(global v8i* out, v2i a, v4i b, v8i c, short index)
{
*out = __builtin_amdgcn_swmmac_i32_16x16x64_iu4_w32(true, a, true, b, c, index, true);
}
// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32(
// CHECK-GFX1200-NEXT: entry:
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.i16(<2 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <8 x float> [[C:%.*]], i16 [[INDEX:%.*]])
// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
// CHECK-GFX1200-NEXT: ret void
//
void test_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32(global v8f* out, v2i a, v4i b, v8f c, short index)
{
*out = __builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32(a, b, c, index);
}
// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32(
// CHECK-GFX1200-NEXT: entry:
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.i16(<2 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <8 x float> [[C:%.*]], i16 [[INDEX:%.*]])
// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
// CHECK-GFX1200-NEXT: ret void
//
void test_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32(global v8f* out, v2i a, v4i b, v8f c, short index)
{
*out = __builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32(a, b, c, index);
}
// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32(
// CHECK-GFX1200-NEXT: entry:
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.i16(<2 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <8 x float> [[C:%.*]], i16 [[INDEX:%.*]])
// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
// CHECK-GFX1200-NEXT: ret void
//
void test_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32(global v8f* out, v2i a, v4i b, v8f c, short index)
{
*out = __builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32(a, b, c, index);
}
// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32(
// CHECK-GFX1200-NEXT: entry:
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.i16(<2 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <8 x float> [[C:%.*]], i16 [[INDEX:%.*]])
// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
// CHECK-GFX1200-NEXT: ret void
//
void test_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32(global v8f* out, v2i a, v4i b, v8f c, short index)
{
*out = __builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32(a, b, c, index);
}
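The operand shapes above tell the sparsity story: in every SWMMAC variant the A fragment is half the width of the B fragment (v8h versus v16h for f16), and a 16-bit index rides along. That is consistent with structured sparsity where A stores only its nonzero elements and the index records which positions they occupy, which is how K doubles (16x16x32 here versus the dense 16x16x16) with no growth in A; this reading is inferred from the shapes, as the patch itself does not document it. Comparing a dense and a sparse call directly:

typedef half v8h __attribute__((ext_vector_type(8)));
typedef half v16h __attribute__((ext_vector_type(16)));
typedef float v8f __attribute__((ext_vector_type(8)));

void example_dense_vs_sparse(global v8f* out, v8h a, v8h b16, v16h b32, v8f c, short idx)
{
  // Dense: K = 16, A and B are both v8h per lane.
  out[0] = __builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12(a, b16, c);
  // Sparse: K = 32 with the same A payload; B doubles to v16h and idx
  // addresses the compressed A elements.
  out[1] = __builtin_amdgcn_swmmac_f32_16x16x32_f16_w32(a, b32, c, idx);
}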


@@ -0,0 +1,134 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
// REQUIRES: amdgpu-registered-target
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize64 -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200
typedef int v2i __attribute__((ext_vector_type(2)));
typedef int v4i __attribute__((ext_vector_type(4)));
typedef float v4f __attribute__((ext_vector_type(4)));
typedef half v4h __attribute__((ext_vector_type(4)));
typedef short v4s __attribute__((ext_vector_type(4)));
typedef half v8h __attribute__((ext_vector_type(8)));
typedef short v8s __attribute__((ext_vector_type(8)));
// Wave64
// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_f16_w64(
// CHECK-GFX1200-NEXT: entry:
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.i16(<4 x half> [[A:%.*]], <8 x half> [[B:%.*]], <4 x float> [[C:%.*]], i16 [[INDEX:%.*]])
// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4:![0-9]+]]
// CHECK-GFX1200-NEXT: ret void
//
void test_amdgcn_swmmac_f32_16x16x32_f16_w64(global v4f* out, v4h a, v8h b, v4f c, short index)
{
*out = __builtin_amdgcn_swmmac_f32_16x16x32_f16_w64(a, b, c, index);
}
// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_bf16_w64(
// CHECK-GFX1200-NEXT: entry:
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.i16(<4 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <4 x float> [[C:%.*]], i16 [[INDEX:%.*]])
// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
// CHECK-GFX1200-NEXT: ret void
//
void test_amdgcn_swmmac_f32_16x16x32_bf16_w64(global v4f* out, v4s a, v8s b, v4f c, short index)
{
*out = __builtin_amdgcn_swmmac_f32_16x16x32_bf16_w64(a, b, c, index);
}
// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f16_16x16x32_f16_w64(
// CHECK-GFX1200-NEXT: entry:
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.i16(<4 x half> [[A:%.*]], <8 x half> [[B:%.*]], <4 x half> [[C:%.*]], i16 [[INDEX:%.*]])
// CHECK-GFX1200-NEXT: store <4 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 8, !tbaa [[TBAA4]]
// CHECK-GFX1200-NEXT: ret void
//
void test_amdgcn_swmmac_f16_16x16x32_f16_w64(global v4h* out, v4h a, v8h b, v4h c, short index)
{
*out = __builtin_amdgcn_swmmac_f16_16x16x32_f16_w64(a, b, c, index);
}
// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_bf16_16x16x32_bf16_w64(
// CHECK-GFX1200-NEXT: entry:
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.i16(<4 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <4 x i16> [[C:%.*]], i16 [[INDEX:%.*]])
// CHECK-GFX1200-NEXT: store <4 x i16> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 8, !tbaa [[TBAA4]]
// CHECK-GFX1200-NEXT: ret void
//
void test_amdgcn_swmmac_bf16_16x16x32_bf16_w64(global v4s* out, v4s a, v8s b, v4s c, short index)
{
*out = __builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w64(a, b, c, index);
}
// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_i32_16x16x32_iu8_w64(
// CHECK-GFX1200-NEXT: entry:
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.i16(i1 true, i32 [[A:%.*]], i1 true, <2 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], i16 [[INDEX:%.*]], i1 true)
// CHECK-GFX1200-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
// CHECK-GFX1200-NEXT: ret void
//
void test_amdgcn_swmmac_i32_16x16x32_iu8_w64(global v4i* out, int a, v2i b, v4i c, short index)
{
*out = __builtin_amdgcn_swmmac_i32_16x16x32_iu8_w64(true, a, true, b, c, index, true);
}
// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_i32_16x16x32_iu4_w64(
// CHECK-GFX1200-NEXT: entry:
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.i16(i1 true, i32 [[A:%.*]], i1 true, i32 [[B:%.*]], <4 x i32> [[C:%.*]], i16 [[INDEX:%.*]], i1 true)
// CHECK-GFX1200-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
// CHECK-GFX1200-NEXT: ret void
//
void test_amdgcn_swmmac_i32_16x16x32_iu4_w64(global v4i* out, int a, int b, v4i c, short index)
{
*out = __builtin_amdgcn_swmmac_i32_16x16x32_iu4_w64(true, a, true, b, c, index, true);
}
// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_i32_16x16x64_iu4_w64(
// CHECK-GFX1200-NEXT: entry:
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.i16(i1 true, i32 [[A:%.*]], i1 true, <2 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], i16 [[INDEX:%.*]], i1 true)
// CHECK-GFX1200-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
// CHECK-GFX1200-NEXT: ret void
//
void test_amdgcn_swmmac_i32_16x16x64_iu4_w64(global v4i* out, int a, v2i b, v4i c, short index)
{
*out = __builtin_amdgcn_swmmac_i32_16x16x64_iu4_w64(true, a, true, b, c, index, true);
}
// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64(
// CHECK-GFX1200-NEXT: entry:
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.i16(i32 [[A:%.*]], <2 x i32> [[B:%.*]], <4 x float> [[C:%.*]], i16 [[INDEX:%.*]])
// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
// CHECK-GFX1200-NEXT: ret void
//
void test_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64(global v4f* out, int a, v2i b, v4f c, short index)
{
*out = __builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64(a, b, c, index);
}
// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64(
// CHECK-GFX1200-NEXT: entry:
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.i16(i32 [[A:%.*]], <2 x i32> [[B:%.*]], <4 x float> [[C:%.*]], i16 [[INDEX:%.*]])
// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
// CHECK-GFX1200-NEXT: ret void
//
void test_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64(global v4f* out, int a, v2i b, v4f c, short index)
{
*out = __builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64(a, b, c, index);
}
// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64(
// CHECK-GFX1200-NEXT: entry:
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.i16(i32 [[A:%.*]], <2 x i32> [[B:%.*]], <4 x float> [[C:%.*]], i16 [[INDEX:%.*]])
// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
// CHECK-GFX1200-NEXT: ret void
//
void test_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64(global v4f* out, int a, v2i b, v4f c, short index)
{
*out = __builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64(a, b, c, index);
}
// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64(
// CHECK-GFX1200-NEXT: entry:
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.i16(i32 [[A:%.*]], <2 x i32> [[B:%.*]], <4 x float> [[C:%.*]], i16 [[INDEX:%.*]])
// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
// CHECK-GFX1200-NEXT: ret void
//
void test_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64(global v4f* out, int a, v2i b, v4f c, short index)
{
*out = __builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64(a, b, c, index);
}
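The iu4 pair above also makes the per-lane arithmetic easy to verify: B is a K x 16 tile of 4-bit values, so at wave64 each lane carries K * 16 / 64 nibbles, matching the i32 and <2 x i32> operands in the CHECK lines:

  B bits per lane = (K * 16 / 64) * 4
    K = 32: 8 nibbles  = 32 bits -> int b
    K = 64: 16 nibbles = 64 bits -> v2i b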


@@ -16,7 +16,7 @@ typedef short v16s __attribute__((ext_vector_type(16)));
// Wave32
-void test_amdgcn_wmma_f32_16x16x16_bf16_w32(global v8f* out8f, v16s a16s, v16s b16s, v8f c8f,
+void test_amdgcn_wmma_f32_16x16x16_bf16_w32(global v8f* out8f, v16s a16s, v16s b16s, v8f c8f,
global v16h* out16h, v16h a16h, v16h b16h, v16h c16h,
global v16s* out16s, v2i a2i, v2i b2i, v16s c16s,
global v8i* out8i, v4i a4i, v4i b4i, v8i c8i)


@@ -2,7 +2,6 @@
// REQUIRES: amdgpu-registered-target
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1100 -DWMMA_GFX1100_TESTS -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1100
-typedef float v4f __attribute__((ext_vector_type(4)));
typedef float v8f __attribute__((ext_vector_type(8)));
typedef half v16h __attribute__((ext_vector_type(16)));
typedef int v2i __attribute__((ext_vector_type(2)));
@@ -20,7 +19,7 @@ typedef short v16s __attribute__((ext_vector_type(16)));
// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_f32_16x16x16_f16_w32(
// CHECK-GFX1100-NEXT: entry:
-// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <8 x float> [[C:%.*]])
+// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v16f16(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <8 x float> [[C:%.*]])
// CHECK-GFX1100-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4:![0-9]+]]
// CHECK-GFX1100-NEXT: ret void
//
@@ -35,7 +34,7 @@ void test_amdgcn_wmma_f32_16x16x16_f16_w32(global v8f* out, v16h a, v16h b, v8f
// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_f32_16x16x16_bf16_w32(
// CHECK-GFX1100-NEXT: entry:
-// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x float> [[C:%.*]])
+// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v16i16(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x float> [[C:%.*]])
// CHECK-GFX1100-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
// CHECK-GFX1100-NEXT: ret void
//
@@ -50,7 +49,7 @@ void test_amdgcn_wmma_f32_16x16x16_bf16_w32(global v8f* out, v16s a, v16s b, v8f
// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_f16_16x16x16_f16_w32(
// CHECK-GFX1100-NEXT: entry:
-// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <16 x half> [[C:%.*]], i1 true)
+// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <16 x half> [[C:%.*]], i1 true)
// CHECK-GFX1100-NEXT: store <16 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
// CHECK-GFX1100-NEXT: ret void
//
@@ -65,7 +64,7 @@ void test_amdgcn_wmma_f16_16x16x16_f16_w32(global v16h* out, v16h a, v16h b, v16
// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_bf16_16x16x16_bf16_w32(
// CHECK-GFX1100-NEXT: entry:
-// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v16i16(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i16> [[C:%.*]], i1 true)
+// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v16i16.v16i16(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i16> [[C:%.*]], i1 true)
// CHECK-GFX1100-NEXT: store <16 x i16> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
// CHECK-GFX1100-NEXT: ret void
//
@@ -80,7 +79,7 @@ void test_amdgcn_wmma_bf16_16x16x16_bf16_w32(global v16s* out, v16s a, v16s b, v
// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_f16_16x16x16_f16_tied_w32(
// CHECK-GFX1100-NEXT: entry:
-// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.tied.v16f16(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <16 x half> [[C:%.*]], i1 true)
+// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.tied.v16f16.v16f16(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <16 x half> [[C:%.*]], i1 true)
// CHECK-GFX1100-NEXT: store <16 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
// CHECK-GFX1100-NEXT: ret void
//
@@ -95,7 +94,7 @@ void test_amdgcn_wmma_f16_16x16x16_f16_tied_w32(global v16h* out, v16h a, v16h b
// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32(
// CHECK-GFX1100-NEXT: entry:
-// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.tied.v16i16(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i16> [[C:%.*]], i1 true)
+// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.tied.v16i16.v16i16(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i16> [[C:%.*]], i1 true)
// CHECK-GFX1100-NEXT: store <16 x i16> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
// CHECK-GFX1100-NEXT: ret void
//
@@ -110,7 +109,7 @@ void test_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32(global v16s* out, v16s a, v16s
// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_i32_16x16x16_iu8_w32(
// CHECK-GFX1100-NEXT: entry:
-// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32(i1 true, <4 x i32> [[A:%.*]], i1 true, <4 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i1 false)
+// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v4i32(i1 true, <4 x i32> [[A:%.*]], i1 true, <4 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i1 false)
// CHECK-GFX1100-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
// CHECK-GFX1100-NEXT: ret void
//
@@ -125,7 +124,7 @@ void test_amdgcn_wmma_i32_16x16x16_iu8_w32(global v8i* out, v4i a, v4i b, v8i c)
// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_i32_16x16x16_iu4_w32(
// CHECK-GFX1100-NEXT: entry:
-// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32(i1 true, <2 x i32> [[A:%.*]], i1 true, <2 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i1 false)
+// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.v2i32(i1 true, <2 x i32> [[A:%.*]], i1 true, <2 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i1 false)
// CHECK-GFX1100-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
// CHECK-GFX1100-NEXT: ret void
//
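For these pre-existing GFX11 tests the only change is the intrinsic name mangling: the WMMA intrinsics are now overloaded on the source fragment type as well as the destination type, so every mangled name gains a second type suffix while the IR operands stay the same. From the first test in this file:

// Before: overloaded on the accumulator type only
//   @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32
// After: the source type participates too, which is what lets GFX12 reuse
// the same intrinsics with smaller fragments
//   @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v16f16

The wave64 file below receives the same mechanical update.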


@@ -3,12 +3,10 @@
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1100 -DWMMA_GFX1100_TESTS -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1100
typedef float v4f __attribute__((ext_vector_type(4)));
-typedef float v8f __attribute__((ext_vector_type(8)));
typedef half v8h __attribute__((ext_vector_type(8)));
typedef half v16h __attribute__((ext_vector_type(16)));
typedef int v2i __attribute__((ext_vector_type(2)));
typedef int v4i __attribute__((ext_vector_type(4)));
-typedef int v8i __attribute__((ext_vector_type(8)));
typedef short v8s __attribute__((ext_vector_type(8)));
typedef short v16s __attribute__((ext_vector_type(16)));
@@ -22,7 +20,7 @@ typedef short v16s __attribute__((ext_vector_type(16)));
// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_f32_16x16x16_f16_w64(
// CHECK-GFX1100-NEXT: entry:
-// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <4 x float> [[C:%.*]])
+// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v16f16(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <4 x float> [[C:%.*]])
// CHECK-GFX1100-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4:![0-9]+]]
// CHECK-GFX1100-NEXT: ret void
//
@@ -37,7 +35,7 @@ void test_amdgcn_wmma_f32_16x16x16_f16_w64(global v4f* out, v16h a, v16h b, v4f
// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_f32_16x16x16_bf16_w64(
// CHECK-GFX1100-NEXT: entry:
-// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <4 x float> [[C:%.*]])
+// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v16i16(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <4 x float> [[C:%.*]])
// CHECK-GFX1100-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
// CHECK-GFX1100-NEXT: ret void
//
@@ -52,7 +50,7 @@ void test_amdgcn_wmma_f32_16x16x16_bf16_w64(global v4f* out, v16s a, v16s b, v4f
// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_f16_16x16x16_f16_w64(
// CHECK-GFX1100-NEXT: entry:
-// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <8 x half> [[C:%.*]], i1 true)
+// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v16f16(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <8 x half> [[C:%.*]], i1 true)
// CHECK-GFX1100-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
// CHECK-GFX1100-NEXT: ret void
//
@ -67,7 +65,7 @@ void test_amdgcn_wmma_f16_16x16x16_f16_w64(global v8h* out, v16h a, v16h b, v8h
// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_bf16_16x16x16_bf16_w64(
// CHECK-GFX1100-NEXT: entry:
// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], i1 true)
// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v16i16(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], i1 true)
// CHECK-GFX1100-NEXT: store <8 x i16> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
// CHECK-GFX1100-NEXT: ret void
//
@ -82,7 +80,7 @@ void test_amdgcn_wmma_bf16_16x16x16_bf16_w64(global v8s* out, v16s a, v16s b, v8
// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_f16_16x16x16_f16_tied_w64(
// CHECK-GFX1100-NEXT: entry:
// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.tied.v8f16(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <8 x half> [[C:%.*]], i1 true)
// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.tied.v8f16.v16f16(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <8 x half> [[C:%.*]], i1 true)
// CHECK-GFX1100-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
// CHECK-GFX1100-NEXT: ret void
//
@ -97,7 +95,7 @@ void test_amdgcn_wmma_f16_16x16x16_f16_tied_w64(global v8h* out, v16h a, v16h b,
// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64(
// CHECK-GFX1100-NEXT: entry:
// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.tied.v8i16(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], i1 true)
// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.tied.v8i16.v16i16(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], i1 true)
// CHECK-GFX1100-NEXT: store <8 x i16> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
// CHECK-GFX1100-NEXT: ret void
//
@ -112,7 +110,7 @@ void test_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64(global v8s* out, v16s a, v16s
// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_i32_16x16x16_iu8_w64(
// CHECK-GFX1100-NEXT: entry:
// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32(i1 true, <4 x i32> [[A:%.*]], i1 true, <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], i1 false)
// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 true, <4 x i32> [[A:%.*]], i1 true, <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], i1 false)
// CHECK-GFX1100-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
// CHECK-GFX1100-NEXT: ret void
//
@ -127,7 +125,7 @@ void test_amdgcn_wmma_i32_16x16x16_iu8_w64(global v4i* out, v4i a, v4i b, v4i c)
// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_i32_16x16x16_iu4_w64(
// CHECK-GFX1100-NEXT: entry:
// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32(i1 true, <2 x i32> [[A:%.*]], i1 true, <2 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], i1 false)
// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 true, <2 x i32> [[A:%.*]], i1 true, <2 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], i1 false)
// CHECK-GFX1100-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
// CHECK-GFX1100-NEXT: ret void
//

View File

@ -0,0 +1,107 @@
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize32 -S -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200
typedef int v2i __attribute__((ext_vector_type(2)));
typedef float v8f __attribute__((ext_vector_type(8)));
typedef half v8h __attribute__((ext_vector_type(8)));
typedef short v8s __attribute__((ext_vector_type(8)));
typedef int v8i __attribute__((ext_vector_type(8)));
// Wave32
// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f32_16x16x16_f16_w32:
// CHECK-GFX1200: v_wmma_f32_16x16x16_f16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
//
void test_amdgcn_wmma_f32_16x16x16_f16_w32(global v8f* out, v8h a, v8h b, v8f c)
{
*out = __builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12(a, b, c);
}
// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f32_16x16x16_bf16_w32:
// CHECK-GFX1200: v_wmma_f32_16x16x16_bf16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
//
void test_amdgcn_wmma_f32_16x16x16_bf16_w32(global v8f* out, v8s a, v8s b, v8f c)
{
*out = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12(a, b, c);
}
// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f16_16x16x16_f16_w32:
// CHECK-GFX1200: v_wmma_f16_16x16x16_f16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
//
void test_amdgcn_wmma_f16_16x16x16_f16_w32(global v8h* out, v8h a, v8h b, v8h c)
{
*out = __builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12(a, b, c);
}
// CHECK-GFX1200-LABEL: test_amdgcn_wmma_bf16_16x16x16_bf16_w32:
// CHECK-GFX1200: v_wmma_bf16_16x16x16_bf16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
//
void test_amdgcn_wmma_bf16_16x16x16_bf16_w32(global v8s* out, v8s a, v8s b, v8s c)
{
*out = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32_gfx12(a, b, c);
}
// CHECK-GFX1200-LABEL: test_amdgcn_wmma_i32_16x16x16_iu8_w32:
// CHECK-GFX1200: v_wmma_i32_16x16x16_iu8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] neg_lo:[1,1,0]
//
void test_amdgcn_wmma_i32_16x16x16_iu8_w32(global v8i* out, v2i a, v2i b, v8i c)
{
*out = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12(true, a, true, b, c, false);
}
// CHECK-GFX1200-LABEL: test_amdgcn_wmma_i32_16x16x16_iu4_w32:
// CHECK-GFX1200: v_wmma_i32_16x16x16_iu4 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] neg_lo:[1,1,0]
//
void test_amdgcn_wmma_i32_16x16x16_iu4_w32(global v8i* out, int a, int b, v8i c)
{
*out = __builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12(true, a, true, b, c, false);
}
// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32:
// CHECK-GFX1200: v_wmma_f32_16x16x16_fp8_fp8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
//
void test_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32(global v8f* out, v2i a, v2i b, v8f c)
{
*out = __builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12(a, b, c);
}
// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32:
// CHECK-GFX1200: v_wmma_f32_16x16x16_fp8_bf8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
//
void test_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32(global v8f* out, v2i a, v2i b, v8f c)
{
*out = __builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12(a, b, c);
}
// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32:
// CHECK-GFX1200: v_wmma_f32_16x16x16_bf8_fp8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
//
void test_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32(global v8f* out, v2i a, v2i b, v8f c)
{
*out = __builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12(a, b, c);
}
// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32:
// CHECK-GFX1200: v_wmma_f32_16x16x16_bf8_bf8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
//
void test_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32(global v8f* out, v2i a, v2i b, v8f c)
{
*out = __builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12(a, b, c);
}
// CHECK-GFX1200-LABEL: test_amdgcn_wmma_i32_16x16x32_iu4_w32:
// CHECK-GFX1200: v_wmma_i32_16x16x32_iu4 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] neg_lo:[1,1,0]
//
void test_amdgcn_wmma_i32_16x16x32_iu4_w32(global v8i* out, v2i a, v2i b, v8i c)
{
*out = __builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12(true, a, true, b, c, false);
}

View File

@ -0,0 +1,104 @@
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize64 -S -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200
typedef float v4f __attribute__((ext_vector_type(4)));
typedef half v4h __attribute__((ext_vector_type(4)));
typedef short v4s __attribute__((ext_vector_type(4)));
typedef int v4i __attribute__((ext_vector_type(4)));
// Wave64
// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f32_16x16x16_f16_w64:
// CHECK-GFX1200: v_wmma_f32_16x16x16_f16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
//
void test_amdgcn_wmma_f32_16x16x16_f16_w64(global v4f* out, v4h a, v4h b, v4f c)
{
*out = __builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12(a, b, c);
}
// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f32_16x16x16_bf16_w64:
// CHECK-GFX1200: v_wmma_f32_16x16x16_bf16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
//
void test_amdgcn_wmma_f32_16x16x16_bf16_w64(global v4f* out, v4s a, v4s b, v4f c)
{
*out = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w64_gfx12(a, b, c);
}
// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f16_16x16x16_f16_w64:
// CHECK-GFX1200: v_wmma_f16_16x16x16_f16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
//
void test_amdgcn_wmma_f16_16x16x16_f16_w64(global v4h* out, v4h a, v4h b, v4h c)
{
*out = __builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12(a, b, c);
}
// CHECK-GFX1200-LABEL: test_amdgcn_wmma_bf16_16x16x16_bf16_w64:
// CHECK-GFX1200: v_wmma_bf16_16x16x16_bf16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
//
void test_amdgcn_wmma_bf16_16x16x16_bf16_w64(global v4s* out, v4s a, v4s b, v4s c)
{
*out = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64_gfx12(a, b, c);
}
// CHECK-GFX1200-LABEL: test_amdgcn_wmma_i32_16x16x16_iu8_w64:
// CHECK-GFX1200: v_wmma_i32_16x16x16_iu8 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] neg_lo:[1,1,0]
//
void test_amdgcn_wmma_i32_16x16x16_iu8_w64(global v4i* out, int a, int b, v4i c)
{
*out = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w64_gfx12(true, a, true, b, c, false);
}
// CHECK-GFX1200-LABEL: test_amdgcn_wmma_i32_16x16x16_iu4_w64:
// CHECK-GFX1200: v_wmma_i32_16x16x16_iu4 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] neg_lo:[1,1,0]
//
void test_amdgcn_wmma_i32_16x16x16_iu4_w64(global v4i* out, int a, int b, v4i c)
{
*out = __builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12(true, a, true, b, c, false);
}
// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64:
// CHECK-GFX1200: v_wmma_f32_16x16x16_fp8_fp8 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
//
void test_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64(global v4f* out, int a, int b, v4f c)
{
*out = __builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12(a, b, c);
}
// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64:
// CHECK-GFX1200: v_wmma_f32_16x16x16_fp8_bf8 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
//
void test_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64(global v4f* out, int a, int b, v4f c)
{
*out = __builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12(a, b, c);
}
// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64:
// CHECK-GFX1200: v_wmma_f32_16x16x16_bf8_fp8 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
//
void test_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64(global v4f* out, int a, int b, v4f c)
{
*out = __builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12(a, b, c);
}
// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64:
// CHECK-GFX1200: v_wmma_f32_16x16x16_bf8_bf8 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
//
void test_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64(global v4f* out, int a, int b, v4f c)
{
*out = __builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64_gfx12(a, b, c);
}
// CHECK-GFX1200-LABEL: test_amdgcn_wmma_i32_16x16x32_iu4_w64:
// CHECK-GFX1200: v_wmma_i32_16x16x32_iu4 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] neg_lo:[1,1,0]
//
void test_amdgcn_wmma_i32_16x16x32_iu4_w64(global v4i* out, int a, int b, v4i c)
{
*out = __builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12(true, a, true, b, c, false);
}

View File

@ -0,0 +1,110 @@
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize32 -S -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200
typedef int v2i __attribute__((ext_vector_type(2)));
typedef int v4i __attribute__((ext_vector_type(4)));
typedef float v8f __attribute__((ext_vector_type(8)));
typedef half v8h __attribute__((ext_vector_type(8)));
typedef short v8s __attribute__((ext_vector_type(8)));
typedef int v8i __attribute__((ext_vector_type(8)));
typedef half v16h __attribute__((ext_vector_type(16)));
typedef short v16s __attribute__((ext_vector_type(16)));
// Wave32
// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f32_16x16x32_f16_w32:
// CHECK-GFX1200: v_swmmac_f32_16x16x32_f16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}
//
void test_amdgcn_swmmac_f32_16x16x32_f16_w32(global v8f* out, v8h a, v16h b, v8f c, short index)
{
*out = __builtin_amdgcn_swmmac_f32_16x16x32_f16_w32(a, b, c, index);
}
// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f32_16x16x32_bf16_w32:
// CHECK-GFX1200: v_swmmac_f32_16x16x32_bf16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}
//
void test_amdgcn_swmmac_f32_16x16x32_bf16_w32(global v8f* out, v8s a, v16s b, v8f c, short index)
{
*out = __builtin_amdgcn_swmmac_f32_16x16x32_bf16_w32(a, b, c, index);
}
// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f16_16x16x32_f16_w32:
// CHECK-GFX1200: v_swmmac_f16_16x16x32_f16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}
//
void test_amdgcn_swmmac_f16_16x16x32_f16_w32(global v8h* out, v8h a, v16h b, v8h c, short index)
{
*out = __builtin_amdgcn_swmmac_f16_16x16x32_f16_w32(a, b, c, index);
}
// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_bf16_16x16x32_bf16_w32:
// CHECK-GFX1200: v_swmmac_bf16_16x16x32_bf16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}
//
void test_amdgcn_swmmac_bf16_16x16x32_bf16_w32(global v8s* out, v8s a, v16s b, v8s c, short index)
{
*out = __builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w32(a, b, c, index);
}
// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_i32_16x16x32_iu8_w32:
// CHECK-GFX1200: v_swmmac_i32_16x16x32_iu8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} neg_lo:[1,1,0] clamp
//
void test_amdgcn_swmmac_i32_16x16x32_iu8_w32(global v8i* out, v2i a, v4i b, v8i c, short index)
{
*out = __builtin_amdgcn_swmmac_i32_16x16x32_iu8_w32(true, a, true, b, c, index, true);
}
// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_i32_16x16x32_iu4_w32:
// CHECK-GFX1200: v_swmmac_i32_16x16x32_iu4 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} neg_lo:[1,1,0] clamp
//
void test_amdgcn_swmmac_i32_16x16x32_iu4_w32(global v8i* out, int a, v2i b, v8i c, short index)
{
*out = __builtin_amdgcn_swmmac_i32_16x16x32_iu4_w32(true, a, true, b, c, index, true);
}
// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_i32_16x16x64_iu4_w32:
// CHECK-GFX1200: v_swmmac_i32_16x16x64_iu4 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} neg_lo:[1,1,0] clamp
//
void test_amdgcn_swmmac_i32_16x16x64_iu4_w32(global v8i* out, v2i a, v4i b, v8i c, short index)
{
*out = __builtin_amdgcn_swmmac_i32_16x16x64_iu4_w32(true, a, true, b, c, index, true);
}
// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32:
// CHECK-GFX1200: v_swmmac_f32_16x16x32_fp8_fp8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}
//
void test_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32(global v8f* out, v2i a, v4i b, v8f c, short index)
{
*out = __builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32(a, b, c, index);
}
// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32:
// CHECK-GFX1200: v_swmmac_f32_16x16x32_fp8_bf8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}
//
void test_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32(global v8f* out, v2i a, v4i b, v8f c, short index)
{
*out = __builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32(a, b, c, index);
}
// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32:
// CHECK-GFX1200: v_swmmac_f32_16x16x32_bf8_fp8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}
//
void test_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32(global v8f* out, v2i a, v4i b, v8f c, short index)
{
*out = __builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32(a, b, c, index);
}
// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32:
// CHECK-GFX1200: v_swmmac_f32_16x16x32_bf8_bf8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}
//
void test_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32(global v8f* out, v2i a, v4i b, v8f c, short index)
{
*out = __builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32(a, b, c, index);
}

View File

@ -0,0 +1,109 @@
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize64 -S -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200
typedef int v2i __attribute__((ext_vector_type(2)));
typedef int v4i __attribute__((ext_vector_type(4)));
typedef float v4f __attribute__((ext_vector_type(4)));
typedef half v4h __attribute__((ext_vector_type(4)));
typedef short v4s __attribute__((ext_vector_type(4)));
typedef half v8h __attribute__((ext_vector_type(8)));
typedef short v8s __attribute__((ext_vector_type(8)));
// Wave64
// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f32_16x16x32_f16_w64:
// CHECK-GFX1200: v_swmmac_f32_16x16x32_f16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}
//
void test_amdgcn_swmmac_f32_16x16x32_f16_w64(global v4f* out, v4h a, v8h b, v4f c, short index)
{
*out = __builtin_amdgcn_swmmac_f32_16x16x32_f16_w64(a, b, c, index);
}
// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f32_16x16x32_bf16_w64:
// CHECK-GFX1200: v_swmmac_f32_16x16x32_bf16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}
//
void test_amdgcn_swmmac_f32_16x16x32_bf16_w64(global v4f* out, v4s a, v8s b, v4f c, short index)
{
*out = __builtin_amdgcn_swmmac_f32_16x16x32_bf16_w64(a, b, c, index);
}
// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f16_16x16x32_f16_w64:
// CHECK-GFX1200: v_swmmac_f16_16x16x32_f16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}
//
void test_amdgcn_swmmac_f16_16x16x32_f16_w64(global v4h* out, v4h a, v8h b, v4h c, short index)
{
*out = __builtin_amdgcn_swmmac_f16_16x16x32_f16_w64(a, b, c, index);
}
// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_bf16_16x16x32_bf16_w64:
// CHECK-GFX1200: v_swmmac_bf16_16x16x32_bf16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}
//
void test_amdgcn_swmmac_bf16_16x16x32_bf16_w64(global v4s* out, v4s a, v8s b, v4s c, short index)
{
*out = __builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w64(a, b, c, index);
}
// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_i32_16x16x32_iu8_w64:
// CHECK-GFX1200: v_swmmac_i32_16x16x32_iu8 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} neg_lo:[1,1,0] clamp
//
void test_amdgcn_swmmac_i32_16x16x32_iu8_w64(global v4i* out, int a, v2i b, v4i c, short index)
{
*out = __builtin_amdgcn_swmmac_i32_16x16x32_iu8_w64(true, a, true, b, c, index, true);
}
// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_i32_16x16x32_iu4_w64:
// CHECK-GFX1200: v_swmmac_i32_16x16x32_iu4 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} neg_lo:[1,1,0] clamp
//
void test_amdgcn_swmmac_i32_16x16x32_iu4_w64(global v4i* out, int a, int b, v4i c, short index)
{
*out = __builtin_amdgcn_swmmac_i32_16x16x32_iu4_w64(true, a, true, b, c, index, true);
}
// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_i32_16x16x64_iu4_w64:
// CHECK-GFX1200: v_swmmac_i32_16x16x64_iu4 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} neg_lo:[1,1,0] clamp
//
void test_amdgcn_swmmac_i32_16x16x64_iu4_w64(global v4i* out, int a, v2i b, v4i c, short index)
{
*out = __builtin_amdgcn_swmmac_i32_16x16x64_iu4_w64(true, a, true, b, c, index, true);
}
// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64:
// CHECK-GFX1200: v_swmmac_f32_16x16x32_fp8_fp8 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}
//
void test_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64(global v4f* out, int a, v2i b, v4f c, short index)
{
*out = __builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64(a, b, c, index);
}
// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64:
// CHECK-GFX1200: v_swmmac_f32_16x16x32_fp8_bf8 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}
//
void test_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64(global v4f* out, int a, v2i b, v4f c, short index)
{
*out = __builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64(a, b, c, index);
}
// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64:
// CHECK-GFX1200: v_swmmac_f32_16x16x32_bf8_fp8 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}
//
void test_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64(global v4f* out, int a, v2i b, v4f c, short index)
{
*out = __builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64(a, b, c, index);
}
// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64:
// CHECK-GFX1200: v_swmmac_f32_16x16x32_bf8_bf8 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}
//
void test_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64(global v4f* out, int a, v2i b, v4f c, short index)
{
*out = __builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64(a, b, c, index);
}

View File

@ -2601,6 +2601,11 @@ def int_amdgcn_ds_bvh_stack_rtn :
[ImmArg<ArgIndex<3>>, IntrWillReturn, IntrNoCallback, IntrNoFree]
>;
def int_amdgcn_s_wait_event_export_ready :
ClangBuiltin<"__builtin_amdgcn_s_wait_event_export_ready">,
Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn]
>;
// WMMA (Wave Matrix Multiply-Accumulate) intrinsics
//
// These operations perform a matrix multiplication and accumulation of
@ -2608,10 +2613,10 @@ def int_amdgcn_ds_bvh_stack_rtn :
class AMDGPUWmmaIntrinsic<LLVMType AB, LLVMType CD> :
Intrinsic<
[CD], // %D
[
AB, // %A
AB, // %B
LLVMMatchType<1>, // %B
LLVMMatchType<0>, // %C
],
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]
@ -2619,49 +2624,50 @@ class AMDGPUWmmaIntrinsic<LLVMType AB, LLVMType CD> :
class AMDGPUWmmaIntrinsicOPSEL<LLVMType AB, LLVMType CD> :
Intrinsic<
[CD], // %D
[
AB, // %A
AB, // %B
LLVMMatchType<1>, // %B
LLVMMatchType<0>, // %C
llvm_i1_ty, // %high
llvm_i1_ty, // %high (op_sel) for GFX11, 0 for GFX12
],
[IntrNoMem, IntrConvergent, ImmArg<ArgIndex<3>>, IntrWillReturn, IntrNoCallback, IntrNoFree]
>;
class AMDGPUWmmaIntrinsicIU<LLVMType AB, LLVMType CD> :
Intrinsic<
[CD], // %D
[
llvm_i1_ty, // %A_sign
AB, // %A
llvm_i1_ty, // %B_sign
AB, // %B
LLVMMatchType<1>, // %B
LLVMMatchType<0>, // %C
llvm_i1_ty, // %clamp
],
[IntrNoMem, IntrConvergent, ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>, IntrWillReturn, IntrNoCallback, IntrNoFree]
>;
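For illustration, a wave32 sketch of how the sign and clamp immediates are used, in the style of the tests added by this commit; the function name is hypothetical, and reading %clamp as output saturation is an assumption consistent with the `clamp` assembler operand handled below:

typedef int v2i __attribute__((ext_vector_type(2)));
typedef int v8i __attribute__((ext_vector_type(8)));

void iu8_variants(global v8i* out, v2i a, v2i b, v8i c)
{
  // signed A, signed B, saturating accumulate
  out[0] = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12(true, a, true, b, c, true);
  // unsigned A, unsigned B, wrapping accumulate
  out[1] = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12(false, a, false, b, c, false);
}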
def int_amdgcn_wmma_f32_16x16x16_f16 : AMDGPUWmmaIntrinsic<llvm_v16f16_ty, llvm_anyfloat_ty>;
def int_amdgcn_wmma_f32_16x16x16_bf16 : AMDGPUWmmaIntrinsic<llvm_v16i16_ty, llvm_anyfloat_ty>;
// The regular, untied f16/bf16 wmma intrinsics only write to one half
// of the registers (set via the op_sel bit).
// The content of the other 16-bit of the registers is undefined.
def int_amdgcn_wmma_f16_16x16x16_f16 : AMDGPUWmmaIntrinsicOPSEL<llvm_v16f16_ty, llvm_anyfloat_ty>;
def int_amdgcn_wmma_bf16_16x16x16_bf16 : AMDGPUWmmaIntrinsicOPSEL<llvm_v16i16_ty, llvm_anyint_ty>;
// The tied versions of the f16/bf16 wmma intrinsics tie the destination matrix
// registers to the input accumulator registers.
// Essentially, the content of the other 16-bit is preserved from the input.
def int_amdgcn_wmma_f16_16x16x16_f16_tied : AMDGPUWmmaIntrinsicOPSEL<llvm_v16f16_ty, llvm_anyfloat_ty>;
def int_amdgcn_wmma_bf16_16x16x16_bf16_tied : AMDGPUWmmaIntrinsicOPSEL<llvm_v16i16_ty, llvm_anyint_ty>;
def int_amdgcn_wmma_i32_16x16x16_iu8 : AMDGPUWmmaIntrinsicIU<llvm_v4i32_ty, llvm_anyint_ty>;
def int_amdgcn_wmma_i32_16x16x16_iu4 : AMDGPUWmmaIntrinsicIU<llvm_v2i32_ty, llvm_anyint_ty>;
// WMMA GFX11Only
def int_amdgcn_s_wait_event_export_ready :
ClangBuiltin<"__builtin_amdgcn_s_wait_event_export_ready">,
Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn]
>;
// The OPSEL intrinsics read from and write to one half of the registers, selected by the op_sel bit.
// The tied versions of the f16/bf16 wmma intrinsics tie the destination matrix registers to the input accumulator registers.
// The content of the other 16-bit half is preserved from the input.
def int_amdgcn_wmma_f16_16x16x16_f16_tied : AMDGPUWmmaIntrinsicOPSEL<llvm_anyfloat_ty, llvm_anyfloat_ty>;
def int_amdgcn_wmma_bf16_16x16x16_bf16_tied : AMDGPUWmmaIntrinsicOPSEL<llvm_anyint_ty, llvm_anyint_ty>;
// WMMA GFX11Plus
def int_amdgcn_wmma_f32_16x16x16_f16 : AMDGPUWmmaIntrinsic<llvm_anyfloat_ty, llvm_anyfloat_ty>;
def int_amdgcn_wmma_f32_16x16x16_bf16 : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>;
def int_amdgcn_wmma_i32_16x16x16_iu8 : AMDGPUWmmaIntrinsicIU<llvm_anyint_ty, llvm_anyint_ty>;
def int_amdgcn_wmma_i32_16x16x16_iu4 : AMDGPUWmmaIntrinsicIU<llvm_anyint_ty, llvm_anyint_ty>;
// GFX11: The OPSEL intrinsics read from and write to one half of the registers, selected by the op_sel bit.
// The content of the other 16-bit half is undefined.
// GFX12: The op_sel bit must be 0.
def int_amdgcn_wmma_f16_16x16x16_f16 : AMDGPUWmmaIntrinsicOPSEL<llvm_anyfloat_ty, llvm_anyfloat_ty>;
def int_amdgcn_wmma_bf16_16x16x16_bf16 : AMDGPUWmmaIntrinsicOPSEL<llvm_anyint_ty, llvm_anyint_ty>;
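To make the op_sel semantics concrete, a minimal wave64 sketch mirroring the GFX11 tests earlier in this commit; the function name is illustrative:

typedef half v8h __attribute__((ext_vector_type(8)));
typedef half v16h __attribute__((ext_vector_type(16)));

void f16_untied_vs_tied(global v8h* out, v16h a, v16h b, v8h c)
{
  // Untied: only the op_sel-selected 16-bit half of each result register
  // is written; the other half is undefined afterwards.
  out[0] = __builtin_amdgcn_wmma_f16_16x16x16_f16_w64(a, b, c, true);
  // Tied: the destination is tied to the accumulator input, so the
  // unselected half of each register keeps the value it had in c.
  out[1] = __builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64(a, b, c, true);
}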
//===----------------------------------------------------------------------===//
// GFX12 Intrinsics
@ -2681,6 +2687,65 @@ def int_amdgcn_permlanex16_var : ClangBuiltin<"__builtin_amdgcn_permlanex16_var"
[IntrNoMem, IntrConvergent, IntrWillReturn,
ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree]>;
// WMMA (Wave Matrix Multiply-Accumulate) intrinsics
//
// These operations perform a matrix multiplication and accumulation of
// the form: D = A * B + C.
// A and B are <8 x fp8> or <8 x bf8>, but since fp8 and bf8 are not supported by LLVM, we use <2 x i32>.
def int_amdgcn_wmma_f32_16x16x16_fp8_fp8 : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>;
def int_amdgcn_wmma_f32_16x16x16_fp8_bf8 : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>;
def int_amdgcn_wmma_f32_16x16x16_bf8_fp8 : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>;
def int_amdgcn_wmma_f32_16x16x16_bf8_bf8 : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>;
// A and B are <16 x iu4>.
def int_amdgcn_wmma_i32_16x16x32_iu4 : AMDGPUWmmaIntrinsicIU<llvm_anyint_ty, llvm_anyint_ty>;
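A hedged sketch of how the packed fp8 operands can be produced at the source level; the byte-vector parameters and the union-based reinterpret are illustrative, not part of this commit:

typedef int v2i __attribute__((ext_vector_type(2)));
typedef float v8f __attribute__((ext_vector_type(8)));
typedef char v8c __attribute__((ext_vector_type(8)));

void fp8_wmma(global v8f* out, v8c a_bytes, v8c b_bytes, v8f c)
{
  // Each lane carries eight fp8 values as raw bytes; reinterpret them as
  // <2 x i32> to match the builtin's operand type.
  union { v8c bytes; v2i words; } ua = { a_bytes }, ub = { b_bytes };
  *out = __builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12(ua.words, ub.words, c);
}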
// SWMMAC (Wave Matrix Sparse Multiply-Accumulate) intrinsics
//
// These operations perform a sparse matrix multiplication and accumulation of
// the form: D = A * B + C.
// A is a sparse matrix, half the size of B, and is expanded to full size using the sparsity index.
class AMDGPUSWmmacIntrinsicIdx<LLVMType A, LLVMType B, LLVMType CD, LLVMType Index> :
Intrinsic<
[CD], // %D
[
A, // %A
B, // %B
LLVMMatchType<0>, // %C
Index // %Sparsity index for A
],
[IntrNoMem, IntrConvergent, IntrWillReturn]
>;
class AMDGPUSWmmacIntrinsicIUIdx<LLVMType A, LLVMType B, LLVMType CD, LLVMType Index> :
Intrinsic<
[CD], // %D
[
llvm_i1_ty, // %A_sign
A, // %A
llvm_i1_ty, // %B_sign
B, // %B
LLVMMatchType<0>, // %C
Index, // %Sparsity index for A
llvm_i1_ty, // %clamp
],
[IntrNoMem, IntrConvergent, IntrWillReturn, ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<6>>]
>;
def int_amdgcn_swmmac_f32_16x16x32_f16 : AMDGPUSWmmacIntrinsicIdx<llvm_anyfloat_ty, llvm_anyfloat_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
def int_amdgcn_swmmac_f32_16x16x32_bf16 : AMDGPUSWmmacIntrinsicIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
def int_amdgcn_swmmac_f16_16x16x32_f16 : AMDGPUSWmmacIntrinsicIdx<llvm_anyfloat_ty, llvm_anyfloat_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
def int_amdgcn_swmmac_bf16_16x16x32_bf16 : AMDGPUSWmmacIntrinsicIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty>;
def int_amdgcn_swmmac_i32_16x16x32_iu8 : AMDGPUSWmmacIntrinsicIUIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty>;
def int_amdgcn_swmmac_i32_16x16x32_iu4 : AMDGPUSWmmacIntrinsicIUIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty>;
def int_amdgcn_swmmac_i32_16x16x64_iu4 : AMDGPUSWmmacIntrinsicIUIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty>;
def int_amdgcn_swmmac_f32_16x16x32_fp8_fp8 : AMDGPUSWmmacIntrinsicIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
def int_amdgcn_swmmac_f32_16x16x32_fp8_bf8 : AMDGPUSWmmacIntrinsicIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
def int_amdgcn_swmmac_f32_16x16x32_bf8_fp8 : AMDGPUSWmmacIntrinsicIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
def int_amdgcn_swmmac_f32_16x16x32_bf8_bf8 : AMDGPUSWmmacIntrinsicIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
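A minimal wave32 usage sketch, following the SWMMAC tests added by this commit (the function name is illustrative):

typedef float v8f __attribute__((ext_vector_type(8)));
typedef half v8h __attribute__((ext_vector_type(8)));
typedef half v16h __attribute__((ext_vector_type(16)));

void sparse_mma(global v8f* out, v8h a_nonzero, v16h b, v8f c, short index)
{
  // a_nonzero holds only the stored half of the sparse A operand; index
  // tells the hardware where those values sit in the full 16x32 matrix.
  *out = __builtin_amdgcn_swmmac_f32_16x16x32_f16_w32(a_nonzero, b, c, index);
}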
def int_amdgcn_global_atomic_ordered_add_b64 : AMDGPUAtomicRtn<llvm_i64_ty, global_ptr_ty>;
def int_amdgcn_flat_atomic_fmin_num : AMDGPUAtomicRtn<llvm_anyfloat_ty>;

View File

@ -59,6 +59,30 @@ def gi_wmmaopselvop3pmods :
GIComplexOperandMatcher<s32, "selectWMMAOpSelVOP3PMods">,
GIComplexPatternEquiv<WMMAOpSelVOP3PMods>;
def gi_wmmavisrc :
GIComplexOperandMatcher<s32, "selectWMMAVISrc">,
GIComplexPatternEquiv<WMMAVISrc>;
def gi_wmmamods :
GIComplexOperandMatcher<s32, "selectWMMAModsF32NegAbs">,
GIComplexPatternEquiv<WMMAModsF32NegAbs>;
def gi_wmmamodsf16Neg :
GIComplexOperandMatcher<s32, "selectWMMAModsF16Neg">,
GIComplexPatternEquiv<WMMAModsF16Neg>;
def gi_wmmamodsf16NegAbs :
GIComplexOperandMatcher<s32, "selectWMMAModsF16NegAbs">,
GIComplexPatternEquiv<WMMAModsF16NegAbs>;
def gi_swmmacindex8 :
GIComplexOperandMatcher<s32, "selectSWMMACIndex8">,
GIComplexPatternEquiv<SWMMACIndex8>;
def gi_swmmacindex16 :
GIComplexOperandMatcher<s32, "selectSWMMACIndex16">,
GIComplexPatternEquiv<SWMMACIndex16>;
def gi_vop3opselmods :
GIComplexOperandMatcher<s32, "selectVOP3OpSelMods">,
GIComplexPatternEquiv<VOP3OpSelMods>;

View File

@ -3048,6 +3048,336 @@ bool AMDGPUDAGToDAGISel::SelectWMMAOpSelVOP3PMods(SDValue In,
return true;
}
static MachineSDNode *buildRegSequence32(SmallVectorImpl<SDValue> &Elts,
llvm::SelectionDAG *CurDAG,
const SDLoc &DL) {
unsigned DstRegClass;
EVT DstTy;
switch (Elts.size()) {
case 8:
DstRegClass = AMDGPU::VReg_256RegClassID;
DstTy = MVT::v8i32;
break;
case 4:
DstRegClass = AMDGPU::VReg_128RegClassID;
DstTy = MVT::v4i32;
break;
case 2:
DstRegClass = AMDGPU::VReg_64RegClassID;
DstTy = MVT::v2i32;
break;
default:
llvm_unreachable("unhandled Reg sequence size");
}
SmallVector<SDValue, 17> Ops;
Ops.push_back(CurDAG->getTargetConstant(DstRegClass, DL, MVT::i32));
for (unsigned i = 0; i < Elts.size(); ++i) {
Ops.push_back(Elts[i]);
Ops.push_back(CurDAG->getTargetConstant(
SIRegisterInfo::getSubRegFromChannel(i), DL, MVT::i32));
}
return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, DstTy, Ops);
}
static MachineSDNode *buildRegSequence16(SmallVectorImpl<SDValue> &Elts,
llvm::SelectionDAG *CurDAG,
const SDLoc &DL) {
SmallVector<SDValue, 8> PackedElts;
assert("unhandled Reg sequence size" &&
(Elts.size() == 8 || Elts.size() == 16));
// Pack 16-bit elements in pairs into a 32-bit register. If both elements
// were unpacked from the same 32-bit source, reuse it; otherwise pack them
// with v_perm.
for (unsigned i = 0; i < Elts.size(); i += 2) {
SDValue LoSrc = stripExtractLoElt(stripBitcast(Elts[i]));
SDValue HiSrc;
if (isExtractHiElt(Elts[i + 1], HiSrc) && LoSrc == HiSrc) {
PackedElts.push_back(HiSrc);
} else {
SDValue PackLoLo = CurDAG->getTargetConstant(0x05040100, DL, MVT::i32);
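// The 0x05040100 byte selector picks the low 16 bits of each source,
// placing Elts[i] in the low half and Elts[i + 1] in the high half.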
MachineSDNode *Packed =
CurDAG->getMachineNode(AMDGPU::V_PERM_B32_e64, DL, MVT::i32,
{Elts[i + 1], Elts[i], PackLoLo});
PackedElts.push_back(SDValue(Packed, 0));
}
}
return buildRegSequence32(PackedElts, CurDAG, DL);
}
static MachineSDNode *buildRegSequence(SmallVectorImpl<SDValue> &Elts,
llvm::SelectionDAG *CurDAG,
const SDLoc &DL, unsigned ElementSize) {
if (ElementSize == 16)
return buildRegSequence16(Elts, CurDAG, DL);
if (ElementSize == 32)
return buildRegSequence32(Elts, CurDAG, DL);
llvm_unreachable("Unhandled element size");
}
static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods,
SmallVectorImpl<SDValue> &Elts, SDValue &Src,
llvm::SelectionDAG *CurDAG, const SDLoc &DL,
unsigned ElementSize) {
if (ModOpcode == ISD::FNEG) {
Mods |= SISrcMods::NEG;
// Check if all elements also have abs modifier
SmallVector<SDValue, 8> NegAbsElts;
for (auto El : Elts) {
if (El.getOpcode() != ISD::FABS)
break;
NegAbsElts.push_back(El->getOperand(0));
}
if (Elts.size() != NegAbsElts.size()) {
// Neg
Src = SDValue(buildRegSequence(Elts, CurDAG, DL, ElementSize), 0);
} else {
// Neg and Abs
Mods |= SISrcMods::NEG_HI;
Src = SDValue(buildRegSequence(NegAbsElts, CurDAG, DL, ElementSize), 0);
}
} else {
assert(ModOpcode == ISD::FABS);
// Abs
Mods |= SISrcMods::NEG_HI;
Src = SDValue(buildRegSequence(Elts, CurDAG, DL, ElementSize), 0);
}
}
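At the source level, this folding means a uniformly negated matrix operand costs no extra instructions. A hedged wave32 sketch (whether the fold fires depends on how the operand vector reaches instruction selection; the function name is illustrative):

typedef float v8f __attribute__((ext_vector_type(8)));
typedef half v8h __attribute__((ext_vector_type(8)));

void negated_a(global v8f* out, v8h a, v8h b, v8f c)
{
  // The per-element fneg produced by -a can be stripped by the modifier
  // matchers and encoded as a neg source modifier on v_wmma instead.
  *out = __builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12(-a, b, c);
}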
// Check all f16 elements for modifiers while looking through b32 and v2b16
// build vector, stop if element does not satisfy ModifierCheck.
static void
checkWMMAElementsModifiersF16(BuildVectorSDNode *BV,
std::function<bool(SDValue)> ModifierCheck) {
for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
if (auto *F16Pair =
dyn_cast<BuildVectorSDNode>(stripBitcast(BV->getOperand(i)))) {
for (unsigned i = 0; i < F16Pair->getNumOperands(); ++i) {
SDValue ElF16 = stripBitcast(F16Pair->getOperand(i));
if (!ModifierCheck(ElF16))
break;
}
}
}
}
bool AMDGPUDAGToDAGISel::SelectWMMAModsF16Neg(SDValue In, SDValue &Src,
SDValue &SrcMods) const {
Src = In;
unsigned Mods = SISrcMods::OP_SEL_1;
// mods are on f16 elements
if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
SmallVector<SDValue, 8> EltsF16;
checkWMMAElementsModifiersF16(BV, [&](SDValue Element) -> bool {
if (Element.getOpcode() != ISD::FNEG)
return false;
EltsF16.push_back(Element.getOperand(0));
return true;
});
// All elements have neg modifier
if (BV->getNumOperands() * 2 == EltsF16.size()) {
Src = SDValue(buildRegSequence16(EltsF16, CurDAG, SDLoc(In)), 0);
Mods |= SISrcMods::NEG;
Mods |= SISrcMods::NEG_HI;
}
}
// mods are on v2f16 elements
if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
SmallVector<SDValue, 8> EltsV2F16;
for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
SDValue ElV2f16 = stripBitcast(BV->getOperand(i));
// Only fneg is folded here; abs is handled by the NegAbs variants.
if (ElV2f16.getOpcode() != ISD::FNEG)
break;
EltsV2F16.push_back(ElV2f16.getOperand(0));
}
// All pairs of elements have neg modifier
if (BV->getNumOperands() == EltsV2F16.size()) {
Src = SDValue(buildRegSequence32(EltsV2F16, CurDAG, SDLoc(In)), 0);
Mods |= SISrcMods::NEG;
Mods |= SISrcMods::NEG_HI;
}
}
SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
return true;
}
bool AMDGPUDAGToDAGISel::SelectWMMAModsF16NegAbs(SDValue In, SDValue &Src,
SDValue &SrcMods) const {
Src = In;
unsigned Mods = SISrcMods::OP_SEL_1;
unsigned ModOpcode;
// mods are on f16 elements
if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
SmallVector<SDValue, 8> EltsF16;
checkWMMAElementsModifiersF16(BV, [&](SDValue ElF16) -> bool {
// Based on first element decide which mod we match, neg or abs
if (EltsF16.empty())
ModOpcode = (ElF16.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
if (ElF16.getOpcode() != ModOpcode)
return false;
EltsF16.push_back(ElF16.getOperand(0));
return true;
});
// All elements have ModOpcode modifier
if (BV->getNumOperands() * 2 == EltsF16.size())
selectWMMAModsNegAbs(ModOpcode, Mods, EltsF16, Src, CurDAG, SDLoc(In),
16);
}
// mods are on v2f16 elements
if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
SmallVector<SDValue, 8> EltsV2F16;
for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
SDValue ElV2f16 = stripBitcast(BV->getOperand(i));
// Based on first element decide which mod we match, neg or abs
if (EltsV2F16.empty())
ModOpcode = (ElV2f16.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
if (ElV2f16->getOpcode() != ModOpcode)
break;
EltsV2F16.push_back(ElV2f16->getOperand(0));
}
// All elements have ModOpcode modifier
if (BV->getNumOperands() == EltsV2F16.size())
selectWMMAModsNegAbs(ModOpcode, Mods, EltsV2F16, Src, CurDAG, SDLoc(In),
32);
}
SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
return true;
}
bool AMDGPUDAGToDAGISel::SelectWMMAModsF32NegAbs(SDValue In, SDValue &Src,
SDValue &SrcMods) const {
Src = In;
unsigned Mods = SISrcMods::OP_SEL_1;
unsigned ModOpcode;
SmallVector<SDValue, 8> EltsF32;
if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
SDValue ElF32 = stripBitcast(BV->getOperand(i));
// Based on first element decide which mod we match, neg or abs
if (EltsF32.empty())
ModOpcode = (ElF32.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
if (ElF32.getOpcode() != ModOpcode)
break;
EltsF32.push_back(ElF32.getOperand(0));
}
// All elements had ModOpcode modifier
if (BV->getNumOperands() == EltsF32.size())
selectWMMAModsNegAbs(ModOpcode, Mods, EltsF32, Src, CurDAG, SDLoc(In),
32);
}
SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
return true;
}
bool AMDGPUDAGToDAGISel::SelectWMMAVISrc(SDValue In, SDValue &Src) const {
if (auto *BV = dyn_cast<BuildVectorSDNode>(In)) {
BitVector UndefElements;
if (SDValue Splat = BV->getSplatValue(&UndefElements))
if (isInlineImmediate(Splat.getNode())) {
if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Splat)) {
unsigned Imm = C->getAPIntValue().getSExtValue();
Src = CurDAG->getTargetConstant(Imm, SDLoc(In), MVT::i32);
return true;
}
if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Splat)) {
unsigned Imm = C->getValueAPF().bitcastToAPInt().getSExtValue();
Src = CurDAG->getTargetConstant(Imm, SDLoc(In), MVT::i32);
return true;
}
llvm_unreachable("unhandled Constant node");
}
}
// 16-bit splat
SDValue SplatSrc32 = stripBitcast(In);
if (auto *SplatSrc32BV = dyn_cast<BuildVectorSDNode>(SplatSrc32)) {
if (SDValue Splat32 = SplatSrc32BV->getSplatValue()) {
SDValue SplatSrc16 = stripBitcast(Splat32);
if (auto *SplatSrc16BV = dyn_cast<BuildVectorSDNode>(SplatSrc16)) {
if (SDValue Splat = SplatSrc16BV->getSplatValue()) {
// f16
if (isInlineImmediate(Splat.getNode())) {
const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Splat);
int64_t Imm = C->getValueAPF().bitcastToAPInt().getSExtValue();
Src = CurDAG->getTargetConstant(Imm, SDLoc(In), MVT::i16);
return true;
}
// bf16
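// A bf16 splat can be used as an inline immediate iff widening it into
// the high half of an f32 yields an inline-constant f32 bit pattern.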
if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Splat)) {
const SIInstrInfo *TII = Subtarget->getInstrInfo();
APInt BF16Value = C->getAPIntValue();
APInt F32Value = BF16Value.zext(32).shl(16);
if (TII->isInlineConstant(F32Value)) {
int64_t Imm = F32Value.getSExtValue();
Src = CurDAG->getTargetConstant(Imm, SDLoc(In), MVT::i32);
return true;
}
}
}
}
}
}
return false;
}
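A hedged sketch of the splat case this matcher enables: a constant accumulator that is an inline immediate (here 0.0) can be encoded directly in the instruction instead of occupying a register tuple (function name illustrative):

typedef float v8f __attribute__((ext_vector_type(8)));
typedef half v8h __attribute__((ext_vector_type(8)));

void zero_accumulator(global v8f* out, v8h a, v8h b)
{
  // (v8f)(0.0f) is a splat of an inline-immediate value.
  *out = __builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12(a, b, (v8f)(0.0f));
}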
bool AMDGPUDAGToDAGISel::SelectSWMMACIndex8(SDValue In, SDValue &Src,
SDValue &IndexKey) const {
unsigned Key = 0;
Src = In;
if (In.getOpcode() == ISD::SRL) {
const llvm::SDValue &ShiftSrc = In.getOperand(0);
ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(In.getOperand(1));
if (ShiftSrc.getValueType().getSizeInBits() == 32 && ShiftAmt &&
ShiftAmt->getZExtValue() % 8 == 0) {
Key = ShiftAmt->getZExtValue() / 8;
Src = ShiftSrc;
}
}
IndexKey = CurDAG->getTargetConstant(Key, SDLoc(In), MVT::i32);
return true;
}
bool AMDGPUDAGToDAGISel::SelectSWMMACIndex16(SDValue In, SDValue &Src,
SDValue &IndexKey) const {
unsigned Key = 0;
Src = In;
if (In.getOpcode() == ISD::SRL) {
const llvm::SDValue &ShiftSrc = In.getOperand(0);
ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(In.getOperand(1));
if (ShiftSrc.getValueType().getSizeInBits() == 32 && ShiftAmt &&
ShiftAmt->getZExtValue() == 16) {
Key = 1;
Src = ShiftSrc;
}
}
IndexKey = CurDAG->getTargetConstant(Key, SDLoc(In), MVT::i32);
return true;
}
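A hedged source-level sketch of the pattern these matchers target; it assumes the 16-bit shift survives to instruction selection, so the second call is encoded with index_key:1 instead of an explicit shift (names illustrative):

typedef float v8f __attribute__((ext_vector_type(8)));
typedef half v8h __attribute__((ext_vector_type(8)));
typedef half v16h __attribute__((ext_vector_type(16)));

void two_sparse_blocks(global v8f* out, v8h a0, v8h a1, v16h b, v8f c, int packed)
{
  // Two 16-bit sparsity indices share one 32-bit value.
  v8f acc = __builtin_amdgcn_swmmac_f32_16x16x32_f16_w32(a0, b, c, (short)packed);
  *out = __builtin_amdgcn_swmmac_f32_16x16x32_f16_w32(a1, b, acc, (short)(packed >> 16));
}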
bool AMDGPUDAGToDAGISel::SelectVOP3OpSel(SDValue In, SDValue &Src,
SDValue &SrcMods) const {
Src = In;

View File

@ -240,6 +240,16 @@ private:
bool SelectVOP3PModsNeg(SDValue In, SDValue &Src) const;
bool SelectWMMAOpSelVOP3PMods(SDValue In, SDValue &Src) const;
bool SelectWMMAModsF32NegAbs(SDValue In, SDValue &Src,
SDValue &SrcMods) const;
bool SelectWMMAModsF16Neg(SDValue In, SDValue &Src, SDValue &SrcMods) const;
bool SelectWMMAModsF16NegAbs(SDValue In, SDValue &Src,
SDValue &SrcMods) const;
bool SelectWMMAVISrc(SDValue In, SDValue &Src) const;
bool SelectSWMMACIndex8(SDValue In, SDValue &Src, SDValue &IndexKey) const;
bool SelectSWMMACIndex16(SDValue In, SDValue &Src, SDValue &IndexKey) const;
bool SelectVOP3OpSel(SDValue In, SDValue &Src, SDValue &SrcMods) const;
bool SelectVOP3OpSelMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;

View File

@ -3956,6 +3956,219 @@ AMDGPUInstructionSelector::selectWMMAOpSelVOP3PMods(
}};
}
static Register buildRegSequence(SmallVectorImpl<Register> &Elts,
MachineInstr *InsertPt,
MachineRegisterInfo &MRI) {
const TargetRegisterClass *DstRegClass;
switch (Elts.size()) {
case 8:
DstRegClass = &AMDGPU::VReg_256RegClass;
break;
case 4:
DstRegClass = &AMDGPU::VReg_128RegClass;
break;
case 2:
DstRegClass = &AMDGPU::VReg_64RegClass;
break;
default:
llvm_unreachable("unhandled Reg sequence size");
}
MachineIRBuilder B(*InsertPt);
auto MIB = B.buildInstr(AMDGPU::REG_SEQUENCE)
.addDef(MRI.createVirtualRegister(DstRegClass));
for (unsigned i = 0; i < Elts.size(); ++i) {
MIB.addReg(Elts[i]);
MIB.addImm(SIRegisterInfo::getSubRegFromChannel(i));
}
return MIB->getOperand(0).getReg();
}
static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods,
SmallVectorImpl<Register> &Elts, Register &Src,
MachineInstr *InsertPt,
MachineRegisterInfo &MRI) {
if (ModOpcode == TargetOpcode::G_FNEG) {
Mods |= SISrcMods::NEG;
// Check if all elements also have abs modifier
SmallVector<Register, 8> NegAbsElts;
for (auto El : Elts) {
Register FabsSrc;
if (!mi_match(El, MRI, m_GFabs(m_Reg(FabsSrc))))
break;
NegAbsElts.push_back(FabsSrc);
}
if (Elts.size() != NegAbsElts.size()) {
// Neg
Src = buildRegSequence(Elts, InsertPt, MRI);
} else {
// Neg and Abs
Mods |= SISrcMods::NEG_HI;
Src = buildRegSequence(NegAbsElts, InsertPt, MRI);
}
} else {
assert(ModOpcode == TargetOpcode::G_FABS);
// Abs
Mods |= SISrcMods::NEG_HI;
Src = buildRegSequence(Elts, InsertPt, MRI);
}
}
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectWMMAModsF32NegAbs(MachineOperand &Root) const {
Register Src = Root.getReg();
unsigned Mods = SISrcMods::OP_SEL_1;
unsigned ModOpcode;
SmallVector<Register, 8> EltsF32;
if (GBuildVector *BV = dyn_cast<GBuildVector>(MRI->getVRegDef(Src))) {
for (unsigned i = 0; i < BV->getNumSources(); ++i) {
MachineInstr *ElF32 = MRI->getVRegDef(BV->getSourceReg(i));
// Based on first element decide which mod we match, neg or abs
if (EltsF32.empty())
ModOpcode = (ElF32->getOpcode() == AMDGPU::G_FNEG) ? AMDGPU::G_FNEG
: AMDGPU::G_FABS;
if (ElF32->getOpcode() != ModOpcode)
break;
EltsF32.push_back(ElF32->getOperand(1).getReg());
}
// All elements had ModOpcode modifier
if (BV->getNumSources() == EltsF32.size()) {
selectWMMAModsNegAbs(ModOpcode, Mods, EltsF32, Src, Root.getParent(),
*MRI);
}
}
return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
[=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
}
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectWMMAModsF16Neg(MachineOperand &Root) const {
Register Src = Root.getReg();
unsigned Mods = SISrcMods::OP_SEL_1;
SmallVector<Register, 8> EltsV2F16;
if (GConcatVectors *CV = dyn_cast<GConcatVectors>(MRI->getVRegDef(Src))) {
for (unsigned i = 0; i < CV->getNumSources(); ++i) {
Register FNegSrc;
if (!mi_match(CV->getSourceReg(i), *MRI, m_GFNeg(m_Reg(FNegSrc))))
break;
EltsV2F16.push_back(FNegSrc);
}
// All elements have the fneg modifier
if (CV->getNumSources() == EltsV2F16.size()) {
Mods |= SISrcMods::NEG;
Mods |= SISrcMods::NEG_HI;
Src = buildRegSequence(EltsV2F16, Root.getParent(), *MRI);
}
}
return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
[=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
}
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectWMMAModsF16NegAbs(MachineOperand &Root) const {
Register Src = Root.getReg();
unsigned Mods = SISrcMods::OP_SEL_1;
unsigned ModOpcode;
SmallVector<Register, 8> EltsV2F16;
if (GConcatVectors *CV = dyn_cast<GConcatVectors>(MRI->getVRegDef(Src))) {
for (unsigned i = 0; i < CV->getNumSources(); ++i) {
MachineInstr *ElV2F16 = MRI->getVRegDef(CV->getSourceReg(i));
// Based on first element decide which mod we match, neg or abs
if (EltsV2F16.empty())
ModOpcode = (ElV2F16->getOpcode() == AMDGPU::G_FNEG) ? AMDGPU::G_FNEG
: AMDGPU::G_FABS;
if (ElV2F16->getOpcode() != ModOpcode)
break;
EltsV2F16.push_back(ElV2F16->getOperand(1).getReg());
}
// All elements had ModOpcode modifier
if (CV->getNumSources() == EltsV2F16.size()) {
MachineIRBuilder B(*Root.getParent());
selectWMMAModsNegAbs(ModOpcode, Mods, EltsV2F16, Src, Root.getParent(),
*MRI);
}
}
return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
[=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
}
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectWMMAVISrc(MachineOperand &Root) const {
std::optional<FPValueAndVReg> FPValReg;
if (mi_match(Root.getReg(), *MRI, m_GFCstOrSplat(FPValReg))) {
if (TII.isInlineConstant(FPValReg->Value.bitcastToAPInt())) {
return {{[=](MachineInstrBuilder &MIB) {
MIB.addImm(FPValReg->Value.bitcastToAPInt().getSExtValue());
}}};
}
// Non-inlineable splat floats should not fall through to the integer
// immediate checks.
return {};
}
APInt ICst;
if (mi_match(Root.getReg(), *MRI, m_ICstOrSplat(ICst))) {
if (TII.isInlineConstant(ICst)) {
return {
{[=](MachineInstrBuilder &MIB) { MIB.addImm(ICst.getSExtValue()); }}};
}
}
return {};
}
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectSWMMACIndex8(MachineOperand &Root) const {
Register Src =
getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
unsigned Key = 0;
Register ShiftSrc;
std::optional<ValueAndVReg> ShiftAmt;
if (mi_match(Src, *MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt))) &&
MRI->getType(ShiftSrc).getSizeInBits() == 32 &&
ShiftAmt->Value.getZExtValue() % 8 == 0) {
Key = ShiftAmt->Value.getZExtValue() / 8;
Src = ShiftSrc;
}
return {{
[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
[=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
}};
}
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectSWMMACIndex16(MachineOperand &Root) const {
Register Src =
getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
unsigned Key = 0;
Register ShiftSrc;
std::optional<ValueAndVReg> ShiftAmt;
if (mi_match(Src, *MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt))) &&
MRI->getType(ShiftSrc).getSizeInBits() == 32 &&
ShiftAmt->Value.getZExtValue() == 16) {
Src = ShiftSrc;
Key = 1;
}
return {{
[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
[=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
}};
}
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const {
Register Src;

View File

@ -199,6 +199,19 @@ private:
InstructionSelector::ComplexRendererFns
selectWMMAOpSelVOP3PMods(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
selectWMMAModsF32NegAbs(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
selectWMMAModsF16Neg(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
selectWMMAModsF16NegAbs(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
selectWMMAVISrc(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
selectSWMMACIndex8(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
selectSWMMACIndex16(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
selectVOP3OpSelMods(MachineOperand &Root) const;

View File

@ -7134,6 +7134,29 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
case Intrinsic::amdgcn_image_bvh_intersect_ray:
return legalizeBVHIntrinsic(MI, B);
case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
Register Index = MI.getOperand(5).getReg();
LLT S32 = LLT::scalar(32);
if (MRI.getType(Index) != S32)
MI.getOperand(5).setReg(B.buildAnyExt(S32, Index).getReg(0));
return true;
}
case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
Register Index = MI.getOperand(7).getReg();
LLT S32 = LLT::scalar(32);
if (MRI.getType(Index) != S32)
MI.getOperand(7).setReg(B.buildAnyExt(S32, Index).getReg(0));
return true;
}
case Intrinsic::amdgcn_fmed3: {
GISelChangeObserver &Observer = Helper.Observer;

View File

@ -4505,6 +4505,22 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_wmma_f32_16x16x16_f16:
case Intrinsic::amdgcn_wmma_i32_16x16x16_iu4:
case Intrinsic::amdgcn_wmma_i32_16x16x16_iu8:
case Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_fp8:
case Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_bf8:
case Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_fp8:
case Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_bf8:
case Intrinsic::amdgcn_wmma_i32_16x16x32_iu4:
case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4:
case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8:
return getDefaultMappingVOP(MI);
case Intrinsic::amdgcn_log:
case Intrinsic::amdgcn_exp2:

View File

@ -414,6 +414,22 @@ def : SourceOfDivergence<int_amdgcn_wmma_f16_16x16x16_f16>;
def : SourceOfDivergence<int_amdgcn_wmma_bf16_16x16x16_bf16>;
def : SourceOfDivergence<int_amdgcn_wmma_i32_16x16x16_iu8>;
def : SourceOfDivergence<int_amdgcn_wmma_i32_16x16x16_iu4>;
def : SourceOfDivergence<int_amdgcn_wmma_f32_16x16x16_fp8_fp8>;
def : SourceOfDivergence<int_amdgcn_wmma_f32_16x16x16_fp8_bf8>;
def : SourceOfDivergence<int_amdgcn_wmma_f32_16x16x16_bf8_fp8>;
def : SourceOfDivergence<int_amdgcn_wmma_f32_16x16x16_bf8_bf8>;
def : SourceOfDivergence<int_amdgcn_wmma_i32_16x16x32_iu4>;
def : SourceOfDivergence<int_amdgcn_swmmac_f32_16x16x32_f16>;
def : SourceOfDivergence<int_amdgcn_swmmac_f32_16x16x32_bf16>;
def : SourceOfDivergence<int_amdgcn_swmmac_f16_16x16x32_f16>;
def : SourceOfDivergence<int_amdgcn_swmmac_bf16_16x16x32_bf16>;
def : SourceOfDivergence<int_amdgcn_swmmac_i32_16x16x32_iu8>;
def : SourceOfDivergence<int_amdgcn_swmmac_i32_16x16x32_iu4>;
def : SourceOfDivergence<int_amdgcn_swmmac_i32_16x16x64_iu4>;
def : SourceOfDivergence<int_amdgcn_swmmac_f32_16x16x32_fp8_fp8>;
def : SourceOfDivergence<int_amdgcn_swmmac_f32_16x16x32_fp8_bf8>;
def : SourceOfDivergence<int_amdgcn_swmmac_f32_16x16x32_bf8_fp8>;
def : SourceOfDivergence<int_amdgcn_swmmac_f32_16x16x32_bf8_bf8>;
def : SourceOfDivergence<int_amdgcn_global_load_tr>;
// The dummy boolean output is divergent from the IR's perspective,

View File

@ -151,6 +151,8 @@ public:
ImmTyOpSelHi,
ImmTyNegLo,
ImmTyNegHi,
ImmTyIndexKey8bit,
ImmTyIndexKey16bit,
ImmTyDPP8,
ImmTyDppCtrl,
ImmTyDppRowMask,
@ -383,6 +385,8 @@ public:
bool isGDS() const { return isImmTy(ImmTyGDS); }
bool isLDS() const { return isImmTy(ImmTyLDS); }
bool isCPol() const { return isImmTy(ImmTyCPol); }
bool isIndexKey8bit() const { return isImmTy(ImmTyIndexKey8bit); }
bool isIndexKey16bit() const { return isImmTy(ImmTyIndexKey16bit); }
bool isTFE() const { return isImmTy(ImmTyTFE); }
bool isFORMAT() const { return isImmTy(ImmTyFORMAT) && isUInt<7>(getImm()); }
bool isDppBankMask() const { return isImmTy(ImmTyDppBankMask); }
@ -656,6 +660,14 @@ public:
return isVISrcF16() || isVISrcB32();
}
bool isVISrc_64F16() const {
return isRegOrInlineNoMods(AMDGPU::VReg_64RegClassID, MVT::f16);
}
bool isVISrc_64B32() const {
return isRegOrInlineNoMods(AMDGPU::VReg_64RegClassID, MVT::i32);
}
bool isVISrc_64B64() const {
return isRegOrInlineNoMods(AMDGPU::VReg_64RegClassID, MVT::i64);
}
@ -672,6 +684,14 @@ public:
return isRegOrInlineNoMods(AMDGPU::VReg_64RegClassID, MVT::i32);
}
bool isVISrc_256B32() const {
return isRegOrInlineNoMods(AMDGPU::VReg_256RegClassID, MVT::i32);
}
bool isVISrc_256F32() const {
return isRegOrInlineNoMods(AMDGPU::VReg_256RegClassID, MVT::f32);
}
bool isVISrc_256B64() const {
return isRegOrInlineNoMods(AMDGPU::VReg_256RegClassID, MVT::i64);
}
@ -1047,6 +1067,8 @@ public:
case ImmTyOffset1: OS << "Offset1"; break;
case ImmTySMEMOffsetMod: OS << "SMEMOffsetMod"; break;
case ImmTyCPol: OS << "CPol"; break;
case ImmTyIndexKey8bit: OS << "index_key"; break;
case ImmTyIndexKey16bit: OS << "index_key"; break;
case ImmTyTFE: OS << "TFE"; break;
case ImmTyD16: OS << "D16"; break;
case ImmTyFORMAT: OS << "FORMAT"; break;
@ -1604,6 +1626,11 @@ public:
ParseStatus parseRegWithFPInputMods(OperandVector &Operands);
ParseStatus parseRegWithIntInputMods(OperandVector &Operands);
ParseStatus parseVReg32OrOff(OperandVector &Operands);
ParseStatus tryParseIndexKey(OperandVector &Operands,
AMDGPUOperand::ImmTy ImmTy);
ParseStatus parseIndexKey8bit(OperandVector &Operands);
ParseStatus parseIndexKey16bit(OperandVector &Operands);
ParseStatus parseDfmtNfmt(int64_t &Format);
ParseStatus parseUfmt(int64_t &Format);
ParseStatus parseSymbolicSplitFormat(StringRef FormatStr, SMLoc Loc,
@ -1784,6 +1811,8 @@ public:
void cvtVOP3OpSel(MCInst &Inst, const OperandVector &Operands);
void cvtVOP3(MCInst &Inst, const OperandVector &Operands);
void cvtVOP3P(MCInst &Inst, const OperandVector &Operands);
void cvtSWMMAC(MCInst &Inst, const OperandVector &Operands);
void cvtVOPD(MCInst &Inst, const OperandVector &Operands);
void cvtVOP3OpSel(MCInst &Inst, const OperandVector &Operands,
OptionalImmIndexMap &OptionalIdx);
@ -4364,7 +4393,11 @@ bool AMDGPUAsmParser::validateNeg(const MCInst &Inst, int OpName) {
uint64_t TSFlags = MII.get(Opc).TSFlags;
// v_dot4 fp8/bf8 neg_lo/neg_hi not allowed on src0 and src1 (allowed on src2)
if (!(TSFlags & SIInstrFlags::IsDOT))
// v_wmma iu4/iu8 neg_lo not allowed on src2 (allowed on src0, src1)
// v_swmmac f16/bf16 neg_lo/neg_hi not allowed on src2 (allowed on src0, src1)
// other wmma/swmmac instructions don't have neg_lo/neg_hi operands.
if (!(TSFlags & SIInstrFlags::IsDOT) && !(TSFlags & SIInstrFlags::IsWMMA) &&
!(TSFlags & SIInstrFlags::IsSWMMAC))
return true;
int NegIdx = AMDGPU::getNamedOperandIdx(Opc, OpName);
@ -6465,6 +6498,33 @@ bool AMDGPUAsmParser::tryParseFmt(const char *Pref,
return true;
}
ParseStatus AMDGPUAsmParser::tryParseIndexKey(OperandVector &Operands,
AMDGPUOperand::ImmTy ImmTy) {
const char *Pref = "index_key";
int64_t ImmVal = 0;
SMLoc Loc = getLoc();
auto Res = parseIntWithPrefix(Pref, ImmVal);
if (!Res.isSuccess())
return Res;
if (ImmTy == AMDGPUOperand::ImmTyIndexKey16bit && (ImmVal < 0 || ImmVal > 1))
return Error(Loc, Twine("out of range ", StringRef(Pref)));
if (ImmTy == AMDGPUOperand::ImmTyIndexKey8bit && (ImmVal < 0 || ImmVal > 3))
return Error(Loc, Twine("out of range ", StringRef(Pref)));
Operands.push_back(AMDGPUOperand::CreateImm(this, ImmVal, Loc, ImmTy));
return ParseStatus::Success;
}
ParseStatus AMDGPUAsmParser::parseIndexKey8bit(OperandVector &Operands) {
return tryParseIndexKey(Operands, AMDGPUOperand::ImmTyIndexKey8bit);
}
ParseStatus AMDGPUAsmParser::parseIndexKey16bit(OperandVector &Operands) {
return tryParseIndexKey(Operands, AMDGPUOperand::ImmTyIndexKey16bit);
}
// dfmt and nfmt (in a tbuffer instruction) are parsed as one to allow their
// values to live in a joint format operand in the MCInst encoding.
ParseStatus AMDGPUAsmParser::parseDfmtNfmt(int64_t &Format) {
@ -8329,10 +8389,12 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands,
}
int NegLoIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::neg_lo);
if (NegLoIdx != -1) {
if (NegLoIdx != -1)
addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyNegLo);
int NegHiIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::neg_hi);
if (NegHiIdx != -1)
addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyNegHi);
}
const int Ops[] = { AMDGPU::OpName::src0,
AMDGPU::OpName::src1,
@ -8352,11 +8414,11 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands,
if (OpSelHiIdx != -1)
OpSelHi = Inst.getOperand(OpSelHiIdx).getImm();
if (NegLoIdx != -1) {
int NegHiIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::neg_hi);
if (NegLoIdx != -1)
NegLo = Inst.getOperand(NegLoIdx).getImm();
if (NegHiIdx != -1)
NegHi = Inst.getOperand(NegHiIdx).getImm();
}
for (int J = 0; J < 3; ++J) {
int OpIdx = AMDGPU::getNamedOperandIdx(Opc, Ops[J]);
@ -8392,6 +8454,43 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands) {
cvtVOP3P(Inst, Operands, OptIdx);
}
static void addSrcModifiersAndSrc(MCInst &Inst, const OperandVector &Operands,
unsigned i, unsigned Opc, unsigned OpName) {
if (AMDGPU::getNamedOperandIdx(Opc, OpName) != -1)
((AMDGPUOperand &)*Operands[i]).addRegOrImmWithFPInputModsOperands(Inst, 2);
else
((AMDGPUOperand &)*Operands[i]).addRegOperands(Inst, 1);
}
void AMDGPUAsmParser::cvtSWMMAC(MCInst &Inst, const OperandVector &Operands) {
unsigned Opc = Inst.getOpcode();
((AMDGPUOperand &)*Operands[1]).addRegOperands(Inst, 1);
addSrcModifiersAndSrc(Inst, Operands, 2, Opc, AMDGPU::OpName::src0_modifiers);
addSrcModifiersAndSrc(Inst, Operands, 3, Opc, AMDGPU::OpName::src1_modifiers);
((AMDGPUOperand &)*Operands[1]).addRegOperands(Inst, 1); // srcTiedDef
((AMDGPUOperand &)*Operands[4]).addRegOperands(Inst, 1); // src2
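// Illustrative sketch (register numbers are hypothetical): at this point the
// MCInst mirrors asm of the form
//   v_swmmac_f32_16x16x32_f16 v[24:31], v[0:3], v[4:11], v12
// where vdst also serves as matrix C (the tied srcTiedDef) and v12 holds the
// sparse-A index.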
OptionalImmIndexMap OptIdx;
for (unsigned i = 5; i < Operands.size(); ++i) {
AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]);
OptIdx[Op.getImmTy()] = i;
}
if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::index_key_8bit))
addOptionalImmOperand(Inst, Operands, OptIdx,
AMDGPUOperand::ImmTyIndexKey8bit);
if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::index_key_16bit))
addOptionalImmOperand(Inst, Operands, OptIdx,
AMDGPUOperand::ImmTyIndexKey16bit);
if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::clamp))
addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyClampSI);
cvtVOP3P(Inst, Operands, OptIdx);
}
//===----------------------------------------------------------------------===//
// VOPD
//===----------------------------------------------------------------------===//

View File

@ -260,8 +260,12 @@ DECODE_OPERAND_SRC_REG_OR_IMM_9(VS_32, OPW32, 32)
DECODE_OPERAND_SRC_REG_OR_IMM_9(VS_64, OPW64, 64)
DECODE_OPERAND_SRC_REG_OR_IMM_9(VS_64, OPW64, 32)
DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_64, OPW64, 64)
DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_64, OPW64, 32)
DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_64, OPW64, 16)
DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_128, OPW128, 32)
DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_128, OPW128, 16)
DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_256, OPW256, 64)
DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_256, OPW256, 32)
DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_512, OPW512, 32)
DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_1024, OPW1024, 32)
@ -704,6 +708,10 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
break;
Res = tryDecodeInst(DecoderTableWMMAGFX1164, MI, QW, Address, CS);
if (Res)
break;
Res = tryDecodeInst(DecoderTableWMMAGFX1264, MI, QW, Address, CS);
} while (false);
if (Res && AMDGPU::isMAC(MI.getOpcode())) {

View File

@ -1716,14 +1716,14 @@ bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) {
}
bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) {
if (!SIInstrInfo::isWMMA(*MI))
if (!SIInstrInfo::isWMMA(*MI) && !SIInstrInfo::isSWMMAC(*MI))
return false;
const SIInstrInfo *TII = ST.getInstrInfo();
const SIRegisterInfo *TRI = ST.getRegisterInfo();
auto IsHazardFn = [MI, TII, TRI](const MachineInstr &I) {
if (!SIInstrInfo::isWMMA(I))
auto IsHazardFn = [MI, TII, TRI, this](const MachineInstr &I) {
if (!SIInstrInfo::isWMMA(I) && !SIInstrInfo::isSWMMAC(I))
return false;
// Src0 or Src1 of the current wmma instruction overlaps with the dest of
@ -1753,6 +1753,7 @@ bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) {
const MachineOperand *Src2Mods =
TII->getNamedOperand(*MI, AMDGPU::OpName::src2_modifiers);
const bool NoSrc2Mods =
!Src2Mods ||
(Src2Mods->getImm() & (SISrcMods::NEG | SISrcMods::NEG_HI)) == 0;
// Exception: there is no hazard if the wmma instructions are of the same
// type and there is no input modifier on src2 of the current instruction.
@ -1760,6 +1761,18 @@ bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) {
TII->pseudoToMCOpcode(MI->getOpcode())));
}
// GFX12+ allows overlap of matrix C with PrevDstReg (hardware will stall)
// but Index can't overlap with PrevDstReg.
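// Illustrative example (not from this patch): if a previous swmmac wrote
// v[0:7] and the current swmmac reads its index from v3, the registers
// overlap and the hazard is reported; overlap with matrix C alone is fine.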
if (AMDGPU::isGFX12Plus(ST)) {
if (SIInstrInfo::isSWMMAC(*MI)) {
const Register CurIndex =
TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
if (TRI->regsOverlap(PrevDstReg, CurIndex))
return true;
}
return false;
}
return false;
};

View File

@ -1275,6 +1275,23 @@ void AMDGPUInstPrinter::printPackedModifier(const MCInst *MI,
(ModIdx != -1) ? MI->getOperand(ModIdx).getImm() : DefaultValue;
}
// Print all three values of neg/opsel for wmma instructions (prints 0 when
// there is no src_modifier operand, rather than omitting the value).
if (MII.get(MI->getOpcode()).TSFlags & SIInstrFlags::IsSWMMAC ||
MII.get(MI->getOpcode()).TSFlags & SIInstrFlags::IsWMMA) {
NumOps = 0;
int DefaultValue = Mod == SISrcMods::OP_SEL_1;
for (int OpName :
{AMDGPU::OpName::src0_modifiers, AMDGPU::OpName::src1_modifiers,
AMDGPU::OpName::src2_modifiers}) {
int Idx = AMDGPU::getNamedOperandIdx(Opc, OpName);
if (Idx != -1)
Ops[NumOps++] = MI->getOperand(Idx).getImm();
else
Ops[NumOps++] = DefaultValue;
}
}
const bool HasDstSel =
NumOps > 0 &&
Mod == SISrcMods::OP_SEL_0 &&
@ -1336,6 +1353,26 @@ void AMDGPUInstPrinter::printNegHi(const MCInst *MI, unsigned OpNo,
printPackedModifier(MI, " neg_hi:[", SISrcMods::NEG_HI, O);
}
void AMDGPUInstPrinter::printIndexKey8bit(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
auto Imm = MI->getOperand(OpNo).getImm() & 0x7;
if (Imm == 0)
return;
O << " index_key:" << Imm;
}
void AMDGPUInstPrinter::printIndexKey16bit(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
auto Imm = MI->getOperand(OpNo).getImm() & 0x7;
if (Imm == 0)
return;
O << " index_key:" << Imm;
}
void AMDGPUInstPrinter::printInterpSlot(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI,
raw_ostream &O) {

View File

@ -139,6 +139,10 @@ private:
const MCSubtargetInfo &STI, raw_ostream &O);
void printNegHi(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
void printIndexKey8bit(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
void printIndexKey16bit(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
void printInterpSlot(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
void printInterpAttr(const MCInst *MI, unsigned OpNo,

View File

@ -167,6 +167,9 @@ enum : uint64_t {
// ds_gws_* instructions.
GWS = UINT64_C(1) << 62,
// Is a SWMMAC instruction.
IsSWMMAC = UINT64_C(1) << 63,
};
// v_cmp_class_* etc. use a 10-bit mask for what operation is checked.

View File

@ -208,6 +208,7 @@ bool SIFoldOperands::canUseImmWithOpSel(FoldCandidate &Fold) const {
assert(Old.isReg() && Fold.isImm());
if (!(TSFlags & SIInstrFlags::IsPacked) || (TSFlags & SIInstrFlags::IsMAI) ||
(TSFlags & SIInstrFlags::IsWMMA) || (TSFlags & SIInstrFlags::IsSWMMAC) ||
(ST->hasDOTOpSelHazard() && (TSFlags & SIInstrFlags::IsDOT)))
return false;

View File

@ -8242,6 +8242,36 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SIInstrInfo::MO_ABS32_LO);
return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
}
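// The swmmac cases below legalize the sparse-A index operand: an i16 index
// (the common case, see the IR tests) is any-extended to i32, while an index
// that is already i32 is left untouched.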
case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
if (Op.getOperand(4).getValueType() == MVT::i32)
return SDValue();
SDLoc SL(Op);
auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i32);
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
Op.getOperand(3), IndexKeyi32);
}
case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
if (Op.getOperand(6).getValueType() == MVT::i32)
return SDValue();
SDLoc SL(Op);
auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, MVT::i32);
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
{Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
IndexKeyi32, Op.getOperand(7)});
}
default:
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))

View File

@ -161,6 +161,9 @@ class InstSI <dag outs, dag ins, string asm = "",
// ds_gws_* instructions.
field bit GWS = 0;
// This bit indicates that this is a SWMMAC instruction.
field bit IsSWMMAC = 0;
// These need to be kept in sync with the enum in SIInstrFlags.
let TSFlags{0} = SALU;
let TSFlags{1} = VALU;
@ -248,6 +251,8 @@ class InstSI <dag outs, dag ins, string asm = "",
let TSFlags{62} = GWS;
let TSFlags{63} = IsSWMMAC;
let SchedRW = [Write32Bit];
let AsmVariantName = AMDGPUAsmVariants.Default;

View File

@ -802,6 +802,14 @@ public:
return isMFMA(MI) || isWMMA(MI);
}
static bool isSWMMAC(const MachineInstr &MI) {
return MI.getDesc().TSFlags & SIInstrFlags::IsSWMMAC;
}
bool isSWMMAC(uint16_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::IsSWMMAC;
}
bool isDOT(uint16_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::IsDOT;
}

View File

@ -1088,6 +1088,9 @@ def op_sel_hi0 : ArrayOperand0<"op_sel_hi", "OpSelHi">;
def neg_lo0 : ArrayOperand0<"neg_lo", "NegLo">;
def neg_hi0 : ArrayOperand0<"neg_hi", "NegHi">;
def IndexKey16bit : CustomOperand<i32, 1>;
def IndexKey8bit : CustomOperand<i32, 1>;
def dpp8 : CustomOperand<i32, 0, "DPP8">;
def dpp_ctrl : CustomOperand<i32, 0, "DPPCtrl">;
@ -1344,6 +1347,13 @@ def VOP3PModsDOT : ComplexPattern<untyped, 2, "SelectVOP3PModsDOT">;
def VOP3PModsNeg : ComplexPattern<untyped, 1, "SelectVOP3PModsNeg">;
def WMMAOpSelVOP3PMods : ComplexPattern<untyped, 1, "SelectWMMAOpSelVOP3PMods">;
def WMMAModsF32NegAbs : ComplexPattern<untyped, 2, "SelectWMMAModsF32NegAbs">;
def WMMAModsF16Neg : ComplexPattern<untyped, 2, "SelectWMMAModsF16Neg">;
def WMMAModsF16NegAbs : ComplexPattern<untyped, 2, "SelectWMMAModsF16NegAbs">;
def WMMAVISrc : ComplexPattern<untyped, 1, "SelectWMMAVISrc">;
def SWMMACIndex8 : ComplexPattern<untyped, 2, "SelectSWMMACIndex8">;
def SWMMACIndex16 : ComplexPattern<untyped, 2, "SelectSWMMACIndex16">;
def VOP3OpSel : ComplexPattern<untyped, 2, "SelectVOP3OpSel">;
def VOP3OpSelMods : ComplexPattern<untyped, 2, "SelectVOP3OpSelMods">;
@ -2278,6 +2288,7 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> {
field bit IsDOT = 0;
field bit IsSingle = 0;
field bit IsWMMA = 0;
field bit IsSWMMAC = 0;
field bit HasDst = !ne(DstVT.Value, untyped.Value);
field bit HasDst32 = HasDst;

View File

@ -1341,9 +1341,14 @@ def VCSrc_v2f16 : RegOrV2F16 <"VS_32", "OPERAND_REG_INLINE_C">;
// VISrc_* Operands with a VGPR or an inline constant
//===----------------------------------------------------------------------===//
def VISrc_64_f16 : RegOrF16 <"VReg_64", "OPERAND_REG_INLINE_C">;
def VISrc_64_b32 : RegOrB32 <"VReg_64", "OPERAND_REG_INLINE_C">;
def VISrc_64_f64 : RegOrF64 <"VReg_64", "OPERAND_REG_INLINE_C">;
def VISrc_128_f16 : RegOrF16 <"VReg_128", "OPERAND_REG_INLINE_C">;
def VISrc_128_b32 : RegOrB32 <"VReg_128", "OPERAND_REG_INLINE_C">;
def VISrc_128_f32 : RegOrF32 <"VReg_128", "OPERAND_REG_INLINE_C">;
def VISrc_256_b32 : RegOrB32 <"VReg_256", "OPERAND_REG_INLINE_C">;
def VISrc_256_f32 : RegOrF32 <"VReg_256", "OPERAND_REG_INLINE_C">;
def VISrc_256_f64 : RegOrF64 <"VReg_256", "OPERAND_REG_INLINE_C">;
def VISrc_512_b32 : RegOrB32 <"VReg_512", "OPERAND_REG_INLINE_C">;
def VISrc_512_f32 : RegOrF32 <"VReg_512", "OPERAND_REG_INLINE_C">;

View File

@ -936,16 +936,19 @@ multiclass WMMAInst<string Suffix, string Instr, VOPProfile P, SDPatternOperator
!cast<Instruction>(NAME # _threeaddr # Suffix)>;
}
if !eq(Type, WMMAOpSel) then {
def : WMMAOpSelPat<!cast<Instruction>(NAME # _twoaddr # Suffix), node, P>;
} else if !eq(Type, WMMAUIClamp) then {
def : WMMAUIClampPat<!cast<Instruction>(NAME # _twoaddr # Suffix), node, P>;
} else {
def : WMMARegularPat<!cast<Instruction>(NAME # _twoaddr # Suffix), node, P>;
let SubtargetPredicate = isGFX11Only in {
if !eq(Type, WMMAOpSel) then {
def : WMMAOpSelPat<!cast<Instruction>(NAME # _twoaddr # Suffix), node, P>;
} else if !eq(Type, WMMAUIClamp) then {
def : WMMAUIClampPat<!cast<Instruction>(NAME # _twoaddr # Suffix), node, P>;
} else {
def : WMMARegularPat<!cast<Instruction>(NAME # _twoaddr # Suffix), node, P>;
}
}
}
let WaveSizePredicate = isWave32 in {
defm V_WMMA_F32_16X16X16_F16 : WMMAInst<"_w32", "v_wmma_f32_16x16x16_f16", VOP_V8F32_V16F16_V16F16_V8F32, int_amdgcn_wmma_f32_16x16x16_f16, VRegSrc_256, WMMARegular, 1>;
defm V_WMMA_F32_16X16X16_BF16 : WMMAInst<"_w32", "v_wmma_f32_16x16x16_bf16", VOP_V8F32_V16I16_V16I16_V8F32, int_amdgcn_wmma_f32_16x16x16_bf16, VRegSrc_256, WMMARegular, 1>;
@ -969,6 +972,398 @@ let WaveSizePredicate = isWave64 in {
}
class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType,
bit _IsIU, bit _IsFP8BF8>
: VOP3P_Profile<VOPProfile<ArgTy>> {
bit IsIU = _IsIU;
bit IsFP8BF8 = _IsFP8BF8;
bit IsF16BF16 = !not(!or(IsIU, IsFP8BF8));
int IndexType = _IndexType;
let IsPacked = 1;
let IsWMMA = !not(_IsSWMMAC);
let IsSWMMAC = _IsSWMMAC;
bit IsAB_F16 = !and(IsF16BF16, ArgTy[1].isFP);
bit IsAB_BF16 = !and(IsF16BF16, isIntType<ArgTy[1]>.ret);
bit IsC_F32 = !or(!eq(ArgTy[3], v8f32), !eq(ArgTy[3], v4f32));
bit IsC_BF16 = !or(!eq(ArgTy[3], v8i16), !eq(ArgTy[3], v4i16));
bit IsC_F16 = !or(!eq(ArgTy[3], v8f16), !eq(ArgTy[3], v4f16));
bit NegLo01 = !or(IsF16BF16, IsIU);
bit NegLo2 = !and(!or(IsF16BF16, IsFP8BF8), IsWMMA);
bit NegHi01 = IsF16BF16;
bit NegHi2 = !and(!or(IsF16BF16, IsFP8BF8), IsWMMA);
bit NegLoAny = !or(NegLo01, NegLo2);
bit NegHiAny = !or(NegHi01, NegHi2);
let DstRC = !cond(!eq(ArgTy[0], v8f32): VDst_256,
!eq(ArgTy[0], v8i32): VDst_256,
!eq(ArgTy[0], v8f16): VDst_128,
!eq(ArgTy[0], v8i16): VDst_128,
!eq(ArgTy[0], v4f32): VDst_128,
!eq(ArgTy[0], v4i32): VDst_128,
!eq(ArgTy[0], v4f16): VDst_64,
!eq(ArgTy[0], v4i16): VDst_64);
let Src0RC64 = !cond(!eq(ArgTy[1], v8f16): VRegSrc_128,
!eq(ArgTy[1], v4f16): VRegSrc_64,
!eq(ArgTy[1], v4i16): VRegSrc_64,
!eq(ArgTy[1], v8i16): VRegSrc_128,
!eq(ArgTy[1], v4i32): VRegSrc_128,
!eq(ArgTy[1], v2i32): VRegSrc_64,
!eq(ArgTy[1], i32) : VRegSrc_32);
let Src1RC64 = !cond(!eq(ArgTy[2], v16f16): VRegSrc_256,
!eq(ArgTy[2], v16i16): VRegSrc_256,
!eq(ArgTy[2], v8f16): VRegSrc_128,
!eq(ArgTy[2], v8i16): VRegSrc_128,
!eq(ArgTy[2], v4i32): VRegSrc_128,
!eq(ArgTy[1], v4i16): VRegSrc_64,
!eq(ArgTy[1], v4f16): VRegSrc_64,
!eq(ArgTy[2], v2i32): VRegSrc_64,
!eq(ArgTy[2], i32) : VRegSrc_32);
let Src2RC64 = !if(IsSWMMAC, DstRC,
!cond(!eq(ArgTy[3], v8f32): VISrc_256_f32,
!eq(ArgTy[3], v8i32): VISrc_256_b32,
!eq(ArgTy[3], v8f16): VISrc_128_f16,
!eq(ArgTy[3], v8i16): VISrc_128_f32, // bf16
!eq(ArgTy[3], v4f16): VISrc_64_f16,
!eq(ArgTy[3], v4i16): VISrc_64_b32,
!eq(ArgTy[3], v4i32): VISrc_128_b32,
!eq(ArgTy[3], v4f32): VISrc_128_f32));
// For f16 and bf16 matrices A and B, each element can be modified by
// fneg (neg_lo, neg_hi = 1). For iu4 and iu8 matrices A and B, neg_lo is
// overloaded to mean unsigned/signed: neg_lo = 0 (u4 and u8) unsigned (zext),
// neg_lo = 1 (i4 and i8) signed (sext). For f16, bf16 and f32 matrix C, each
// element can be modified by fneg (neg_lo = 1) or fabs (neg_hi = 1).
// Opcode | src0/src1 - matrix A/B | src2 - matrix C or Index
// ---------------------------------------------------------------------------
// wmma f32_f16 | both neg_lo,neg_hi = 1 | neg_lo = 1 neg C(f32)
// wmma f32_bf16 | neg A/B (f16 or bf16) | neg_hi = 1 abs C(f32)
// ---------------------------------------------------------------------------
// wmma f16_f16 | both neg_lo,neg_hi = 1 | neg_lo = 1 neg C(f16 or bf16)
// wmma bf16_bf16 | neg A/B (f16 or bf16) | neg_hi = 1 abs C(f16 or bf16)
// ---------------------------------------------------------------------------
// wmma i32_iu8/iu4 | neg_lo = 0 u4/u8(zext) | not allowed for
// | neg_lo = 1 i4/i8(sext) | i32 matrices
// ---------------------------------------------------------------------------
// wmma f32_fp8/bf8 | not allowed for | neg_lo = 1 neg C(f32)
// (4 instructions) | f8 and bf8 matrices | neg_hi = 1 abs C(f32)
// ---------------------------------------------------------------------------
// swmmac f32_f16 | both neg_lo,neg_hi = 1 | not allowed for sparse matrix
// swmmac f32_bf16 | neg A/B (f16 or bf16) | A Index - matrix C is in dst
// ---------------------------------------------------------------------------
// swmmac f16_f16 | both neg_lo,neg_hi = 1 | not allowed for sparse matrix
// swmmac bf16_bf16 | neg A/B (f16 or bf16) | A Index - matrix C is in dst
// ---------------------------------------------------------------------------
// swmmac i32_iu8/iu4 | neg_lo = 0 u4/u8(zext) | not allowed for sparse matrix
// | neg_lo = 1 i4/i8(sext) | A Index - matrix C is in dst
// ---------------------------------------------------------------------------
// swmmac f32_fp8/bf8 | not allowed for | not allowed for sparse matrix
// (4 instructions) | f8 and bf8 matrices | A Index - matrix C is in dst
// pseudo
// fp8bf8 wmmas don't use src0/src1 modifiers; iu wmmas use neg_lo; f16 and
// bf16 wmmas use neg_lo and neg_hi. iu wmmas (C is i32) don't use src2
// modifiers; the remaining wmmas (f16, bf16 and fp8bf8) use neg_lo and
// neg_hi for C (C is f32, f16 or bf16). swmmac uses index_key and doesn't
// use src2 modifiers.
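// For example, an fneg of matrix A in the IR lowers to (see the GFX12 tests)
//   v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] neg_hi:[1,0,0]
// while an fneg of the f32 matrix C sets only neg_lo:[0,0,1].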
dag Src0Mods = !if(IsFP8BF8, (ins), (ins PackedF16InputMods:$src0_modifiers));
dag Src1Mods = !if(IsFP8BF8, (ins), (ins PackedF16InputMods:$src1_modifiers));
dag Src2Mods = !if(IsIU, (ins), (ins PackedF16InputMods:$src2_modifiers));
dag IndexKey = !cond(!eq(IndexType, 0) : (ins),
!eq(IndexType, 8) : (ins IndexKey8bit:$index_key_8bit),
!eq(IndexType, 16): (ins IndexKey16bit:$index_key_16bit));
dag Clamp = !if(IsIU, (ins clampmod0:$clamp), (ins));
dag Neg = !cond(!and(NegLoAny, NegHiAny) : (ins neg_lo0:$neg_lo, neg_hi0:$neg_hi),
!and(NegLoAny, !not(NegHiAny)) : (ins neg_lo0:$neg_lo),
!and(!not(NegLoAny), !not(NegHiAny)) : (ins));
let InsVOP3P = !con(Src0Mods, (ins Src0RC64:$src0), Src1Mods, (ins Src1RC64:$src1),
!cond(IsWMMA : !con(Src2Mods, (ins Src2RC64:$src2)),
IsSWMMAC : !con((ins DstRC:$srcTiedDef), (ins VRegSrc_32:$src2), IndexKey)),
Clamp, Neg);
// asm
string IndexKeyAsm = !cond(!eq(IndexType, 0) : "",
!eq(IndexType, 8) : "$index_key_8bit",
!eq(IndexType, 16) : "$index_key_16bit");
string ClampAsm = !if(IsIU, "$clamp", "");
string NegAsm = !cond(!and(NegLoAny, NegHiAny) : "$neg_lo$neg_hi",
!and(NegLoAny, !not(NegHiAny)) : "$neg_lo",
!and(!not(NegLoAny), !not(NegHiAny)) : "");
let AsmVOP3P = "$vdst, $src0, $src1, $src2"#IndexKeyAsm#NegAsm#ClampAsm;
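// e.g. for I32_IU8_SWMMAC_w32 (16-bit index, neg_lo only, clamp) AsmVOP3P
// expands to "$vdst, $src0, $src1, $src2$index_key_16bit$neg_lo$clamp"
// (illustrative).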
// isel patterns
dag Src0InPat = !cond(IsAB_F16 : (ins (Src0VT (WMMAModsF16Neg Src0VT:$src0, i32:$src0_modifiers))),
IsAB_BF16 : (ins Src0VT:$src0),
IsIU : (ins (VOP3PModsNeg i32:$src0_modifiers), Src0VT:$src0),
IsFP8BF8 : (ins Src0VT:$src0));
dag Src0OutPat = !cond(IsAB_F16 : (ins i32:$src0_modifiers, Src0VT:$src0),
IsAB_BF16 : (ins (i32 8), Src0VT:$src0),
IsIU : (ins i32:$src0_modifiers, Src0VT:$src0),
IsFP8BF8 : (ins Src0VT:$src0));
dag Src1InPat = !cond(IsAB_F16 : (ins (Src1VT (WMMAModsF16Neg Src1VT:$src1, i32:$src1_modifiers))),
IsAB_BF16 : (ins Src1VT:$src1),
IsIU : (ins (VOP3PModsNeg i32:$src1_modifiers), Src1VT:$src1),
IsFP8BF8 : (ins Src1VT:$src1));
dag Src1OutPat = !cond(IsAB_F16 : (ins i32:$src1_modifiers, Src1VT:$src1),
IsAB_BF16 : (ins (i32 8), Src1VT:$src1),
IsIU : (ins i32:$src1_modifiers, Src1VT:$src1),
IsFP8BF8 : (ins Src1VT:$src1));
dag Src2InPatWmma = !cond(IsC_F32 : (ins (Src2VT (WMMAModsF32NegAbs Src2VT:$src2, i32:$src2_modifiers))),
IsC_F16 : (ins (Src2VT (WMMAModsF16NegAbs Src2VT:$src2, i32:$src2_modifiers))),
IsC_BF16 : (ins Src2VT:$src2),
IsIU : (ins Src2VT:$src2),
IsSWMMAC : (ins));
dag Src2OutPatWmma = !cond(IsC_F32 : (ins i32:$src2_modifiers, Src2VT:$src2),
IsC_F16 : (ins i32:$src2_modifiers, Src2VT:$src2),
IsC_BF16 : (ins (i32 8), Src2VT:$src2),
IsIU : (ins Src2VT:$src2),
IsSWMMAC : (ins));
dag ClampPat = !if(IsIU, (ins i1:$clamp), (ins));
dag IndexInPat = !cond(!eq(IndexType, 0) : (ins i32:$src2),
!eq(IndexType, 8) : (ins (i32 (SWMMACIndex8 i32:$src2, i32:$index_key_8bit))),
!eq(IndexType, 16): (ins (i32 (SWMMACIndex16 i32:$src2, i32:$index_key_16bit))));
dag IndexOutPat = !cond(!eq(IndexType, 0) : (ins i32:$src2),
!eq(IndexType, 8) : (ins i32:$src2, i32:$index_key_8bit),
!eq(IndexType, 16): (ins i32:$src2, i32:$index_key_16bit));
dag Src2InlineInPat = (ins (Src2VT (WMMAVISrc Src2VT:$src2)));
dag Src2InlineOutPat = !con(!if(IsIU, (ins), (ins (i32 8))), (ins Src2VT:$src2));
dag WmmaInPat = !con(Src0InPat, Src1InPat, Src2InPatWmma, ClampPat);
dag WmmaOutPat = !con(Src0OutPat, Src1OutPat, Src2OutPatWmma, ClampPat);
dag SwmmacInPat = !con(Src0InPat, Src1InPat, (ins Src2VT:$srcTiedDef), IndexInPat, ClampPat);
dag SwmmacOutPat = !con(Src0OutPat, Src1OutPat, (ins Src2VT:$srcTiedDef), IndexOutPat, ClampPat);
// A wmma pattern where src2 is an inline imm uses the _threeaddr pseudo;
// it can't use _twoaddr since that would violate the src2-tied-to-vdst
// constraint.
dag WmmaInlineInPat = !con(Src0InPat, Src1InPat, Src2InlineInPat, ClampPat);
dag WmmaInlineOutPat = !con(Src0OutPat, Src1OutPat, Src2InlineOutPat, ClampPat);
}
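// Illustrative example (register numbers are hypothetical): a wmma whose C
// operand folds to an inline constant selects the _threeaddr pseudo, e.g.
//   v_wmma_f32_16x16x16_f16 v[16:23], v[0:3], v[4:7], 1.0
// since an immediate src2 cannot be tied to vdst.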
multiclass WMMAInstGFX12<string Instr, VOP3PWMMA_Profile WMMAProfile, string PseudoInstrSuffix> {
let Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0 in {
let Constraints = "@earlyclobber $vdst,$vdst = $src2", isConvertibleToThreeAddress = 1 in
def _twoaddr : VOP3P_Pseudo<Instr, WMMAProfile>{
let PseudoInstr = Instr#PseudoInstrSuffix;
}
let Constraints = "@earlyclobber $vdst", SchedRW = [Write32Bit, Write32Bit] in
def _threeaddr : VOP3P_Pseudo<Instr, WMMAProfile>{
let PseudoInstr = Instr#PseudoInstrSuffix;
}
}
def : WMMAOpcodeMapping<!cast<Instruction>(NAME # _twoaddr),
!cast<Instruction>(NAME # _threeaddr)>;
}
multiclass SWMMACInstGFX12<string Instr, VOP3PWMMA_Profile WMMAProfile, string PseudoInstrSuffix> {
def _twoaddr : VOP3P_Pseudo<Instr, WMMAProfile>{
let Mnemonic = Instr;
let PseudoInstr = Instr#PseudoInstrSuffix;
let mayRaiseFPException = 0;
let ReadsModeReg = 0;
let AsmMatchConverter = "cvtSWMMAC";
let Constraints = "@earlyclobber $vdst,$vdst = $srcTiedDef";
}
}
// The first argument in the Profile is the list of types for matrices D, A,
// B and C (D = A * B + C) as used by llvm ir; the types are vectors of
// matrix elements.
// wave32:
// For 16x16 matrices, lanes 0 to 31 each hold 8 matrix elements; for 16x32
// they hold 16 elements, and for 16x64 they hold 32 elements.
// wave64:
// Each lane holds half as many elements as in wave32, with the exception of
// 16x16_iu4: lanes 0-31 hold 8xi4 and the remaining lanes are ignored.
// General idea of the element distribution differences:
// wave32: lane n has 8 matrix elements
// wave64: lane n has the first 4 elements, lane n+32 has the other 4
// Index size: for every 2 elements in a lane you need 4 bits of index.
// Non-standard types (iu8, iu4, fp8, bf8) are packed in vectors of i32s.
// The original type is given in the comment on the right and refers to A and B.
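// Worked example (sketch): F32_F16_SWMMAC_w32 holds 8 sparse A elements per
// lane, i.e. 8/2 * 4 = 16 index bits per lane; the i32 index operand carries
// two such 16-bit groups and index_key selects one of them.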
def F32_F16_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v8f16, v8f16, v8f32], 0, 0, 0, 0>;
def F32_BF16_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v8i16, v8i16, v8f32], 0, 0, 0, 0>;
def F16_F16_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v8f16, v8f16, v8f16], 0, 0, 0, 0>;
def BF16_BF16_WMMA_w32 : VOP3PWMMA_Profile<[v8i16, v8i16, v8i16, v8i16], 0, 0, 0, 0>;
def I32_IU8_WMMA_w32 : VOP3PWMMA_Profile<[v8i32, v2i32, v2i32, v8i32], 0, 0, 1, 0>; // 8xi8
def I32_IU4X16_WMMA_w32 : VOP3PWMMA_Profile<[v8i32, i32, i32, v8i32], 0, 0, 1, 0>; // 8xi4
def F32_FP8BF8_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v2i32, v2i32, v8f32], 0, 0, 0, 1>; // 8xf8
def I32_IU4X32_WMMA_w32 : VOP3PWMMA_Profile<[v8i32, v2i32, v2i32, v8i32], 0, 0, 1, 0>; // 16xi4
def F32_F16_WMMA_w64 : VOP3PWMMA_Profile<[v4f32, v4f16, v4f16, v4f32], 0, 0, 0, 0>;
def F32_BF16_WMMA_w64 : VOP3PWMMA_Profile<[v4f32, v4i16, v4i16, v4f32], 0, 0, 0, 0>;
def F16_F16_WMMA_w64 : VOP3PWMMA_Profile<[v4f16, v4f16, v4f16, v4f16], 0, 0, 0, 0>;
def BF16_BF16_WMMA_w64 : VOP3PWMMA_Profile<[v4i16, v4i16, v4i16, v4i16], 0, 0, 0, 0>;
def I32_IU8_WMMA_w64 : VOP3PWMMA_Profile<[v4i32, i32, i32, v4i32], 0, 0, 1, 0>; // 4xi8
def I32_IU4X16_WMMA_w64 : VOP3PWMMA_Profile<[v4i32, i32, i32, v4i32], 0, 0, 1, 0>; // 8xi4 *
def F32_FP8BF8_WMMA_w64 : VOP3PWMMA_Profile<[v4f32, i32, i32, v4f32], 0, 0, 0, 1>; // 4xf8
def I32_IU4X32_WMMA_w64 : VOP3PWMMA_Profile<[v4i32, i32, i32, v4i32], 0, 0, 1, 0>; // 8xi4
def F32_F16_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v8f16, v16f16, v8f32], 1, 16, 0, 0>;
def F32_BF16_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v8i16, v16i16, v8f32], 1, 16, 0, 0>;
def F16_F16_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f16, v8f16, v16f16, v8f16], 1, 16, 0, 0>;
def BF16_BF16_SWMMAC_w32 : VOP3PWMMA_Profile<[v8i16, v8i16, v16i16, v8i16], 1, 16, 0, 0>;
def I32_IU8_SWMMAC_w32 : VOP3PWMMA_Profile<[v8i32, v2i32, v4i32, v8i32], 1, 16, 1, 0>; // 8xi8, 16xi8
def I32_IU4X32_SWMMAC_w32 : VOP3PWMMA_Profile<[v8i32, i32, v2i32, v8i32], 1, 16, 1, 0>; // 8xi4, 16xi4
def I32_IU4X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8i32, v2i32, v4i32, v8i32], 1, 0, 1, 0>; // 16xi4, 32xi4 **
def F32_FP8BF8_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v2i32, v4i32, v8f32], 1, 16, 0, 1>; // 8xf8, 16xf8
def F32_F16_SWMMAC_w64 : VOP3PWMMA_Profile<[v4f32, v4f16, v8f16, v4f32], 1, 8, 0, 0>;
def F32_BF16_SWMMAC_w64 : VOP3PWMMA_Profile<[v4f32, v4i16, v8i16, v4f32], 1, 8, 0, 0>;
def F16_F16_SWMMAC_w64 : VOP3PWMMA_Profile<[v4f16, v4f16, v8f16, v4f16], 1, 8, 0, 0>;
def BF16_BF16_SWMMAC_w64 : VOP3PWMMA_Profile<[v4i16, v4i16, v8i16, v4i16], 1, 8, 0, 0>;
def I32_IU8_SWMMAC_w64 : VOP3PWMMA_Profile<[v4i32, i32, v2i32, v4i32], 1, 8, 1, 0>; // 4xi8, 8xi8
def I32_IU4X32_SWMMAC_w64 : VOP3PWMMA_Profile<[v4i32, i32, i32, v4i32], 1, 16, 1, 0>; // 8xi4, 8xi4 ***
def I32_IU4X64_SWMMAC_w64 : VOP3PWMMA_Profile<[v4i32, i32, v2i32, v4i32], 1, 16, 1, 0>; // 8xi4, 16xi4
def F32_FP8BF8_SWMMAC_w64 : VOP3PWMMA_Profile<[v4f32, i32, v2i32, v4f32], 1, 8, 0, 1>; // 4xf8, 8xf8
// *   IU4X16_WMMA_w64: lanes 0-31 hold 8xi4, remaining lanes are ignored
// **  IU4X64_SWMMAC_w32: index is i32, index_key is not used
// *** IU4X32_SWMMAC_w64: lanes 0-31 hold 8xi4 for matrix A, remaining lanes
//     are ignored; the index is i16; matrix B uses all lanes
let WaveSizePredicate = isWave32 in {
defm V_WMMA_F32_16X16X16_F16_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x16_f16", F32_F16_WMMA_w32, "_w32">;
defm V_WMMA_F32_16X16X16_BF16_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x16_bf16", F32_BF16_WMMA_w32, "_w32">;
defm V_WMMA_F16_16X16X16_F16_w32 : WMMAInstGFX12<"v_wmma_f16_16x16x16_f16", F16_F16_WMMA_w32, "_w32">;
defm V_WMMA_BF16_16X16X16_BF16_w32 : WMMAInstGFX12<"v_wmma_bf16_16x16x16_bf16", BF16_BF16_WMMA_w32, "_w32">;
defm V_WMMA_I32_16X16X16_IU8_w32 : WMMAInstGFX12<"v_wmma_i32_16x16x16_iu8", I32_IU8_WMMA_w32, "_w32">;
defm V_WMMA_I32_16X16X16_IU4_w32 : WMMAInstGFX12<"v_wmma_i32_16x16x16_iu4", I32_IU4X16_WMMA_w32, "_w32">;
defm V_WMMA_F32_16X16X16_FP8_FP8_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x16_fp8_fp8", F32_FP8BF8_WMMA_w32, "_w32">;
defm V_WMMA_F32_16X16X16_FP8_BF8_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x16_fp8_bf8", F32_FP8BF8_WMMA_w32, "_w32">;
defm V_WMMA_F32_16X16X16_BF8_FP8_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x16_bf8_fp8", F32_FP8BF8_WMMA_w32, "_w32">;
defm V_WMMA_F32_16X16X16_BF8_BF8_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x16_bf8_bf8", F32_FP8BF8_WMMA_w32, "_w32">;
defm V_WMMA_I32_16X16X32_IU4_w32 : WMMAInstGFX12<"v_wmma_i32_16x16x32_iu4", I32_IU4X32_WMMA_w32, "_w32">;
defm V_SWMMAC_F32_16X16X32_F16_w32 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_f16", F32_F16_SWMMAC_w32, "_w32">;
defm V_SWMMAC_F32_16X16X32_BF16_w32 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_bf16", F32_BF16_SWMMAC_w32, "_w32">;
defm V_SWMMAC_F16_16X16X32_F16_w32 : SWMMACInstGFX12<"v_swmmac_f16_16x16x32_f16", F16_F16_SWMMAC_w32, "_w32">;
defm V_SWMMAC_BF16_16X16X32_BF16_w32 : SWMMACInstGFX12<"v_swmmac_bf16_16x16x32_bf16", BF16_BF16_SWMMAC_w32, "_w32">;
defm V_SWMMAC_I32_16X16X32_IU8_w32 : SWMMACInstGFX12<"v_swmmac_i32_16x16x32_iu8", I32_IU8_SWMMAC_w32, "_w32">;
defm V_SWMMAC_I32_16X16X32_IU4_w32 : SWMMACInstGFX12<"v_swmmac_i32_16x16x32_iu4", I32_IU4X32_SWMMAC_w32, "_w32">;
defm V_SWMMAC_I32_16X16X64_IU4_w32 : SWMMACInstGFX12<"v_swmmac_i32_16x16x64_iu4", I32_IU4X64_SWMMAC_w32, "_w32">;
defm V_SWMMAC_F32_16X16X32_FP8_FP8_w32 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_fp8_fp8", F32_FP8BF8_SWMMAC_w32, "_w32">;
defm V_SWMMAC_F32_16X16X32_FP8_BF8_w32 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_fp8_bf8", F32_FP8BF8_SWMMAC_w32, "_w32">;
defm V_SWMMAC_F32_16X16X32_BF8_FP8_w32 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_bf8_fp8", F32_FP8BF8_SWMMAC_w32, "_w32">;
defm V_SWMMAC_F32_16X16X32_BF8_BF8_w32 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_bf8_bf8", F32_FP8BF8_SWMMAC_w32, "_w32">;
}
let WaveSizePredicate = isWave64 in {
defm V_WMMA_F32_16X16X16_F16_w64 : WMMAInstGFX12<"v_wmma_f32_16x16x16_f16", F32_F16_WMMA_w64, "_w64">;
defm V_WMMA_F32_16X16X16_BF16_w64 : WMMAInstGFX12<"v_wmma_f32_16x16x16_bf16", F32_BF16_WMMA_w64, "_w64">;
defm V_WMMA_F16_16X16X16_F16_w64 : WMMAInstGFX12<"v_wmma_f16_16x16x16_f16", F16_F16_WMMA_w64, "_w64">;
defm V_WMMA_BF16_16X16X16_BF16_w64 : WMMAInstGFX12<"v_wmma_bf16_16x16x16_bf16", BF16_BF16_WMMA_w64, "_w64">;
defm V_WMMA_I32_16X16X16_IU8_w64 : WMMAInstGFX12<"v_wmma_i32_16x16x16_iu8", I32_IU8_WMMA_w64, "_w64">;
defm V_WMMA_I32_16X16X16_IU4_w64 : WMMAInstGFX12<"v_wmma_i32_16x16x16_iu4", I32_IU4X16_WMMA_w64, "_w64">;
defm V_WMMA_F32_16X16X16_FP8_FP8_w64 : WMMAInstGFX12<"v_wmma_f32_16x16x16_fp8_fp8", F32_FP8BF8_WMMA_w64, "_w64">;
defm V_WMMA_F32_16X16X16_FP8_BF8_w64 : WMMAInstGFX12<"v_wmma_f32_16x16x16_fp8_bf8", F32_FP8BF8_WMMA_w64, "_w64">;
defm V_WMMA_F32_16X16X16_BF8_FP8_w64 : WMMAInstGFX12<"v_wmma_f32_16x16x16_bf8_fp8", F32_FP8BF8_WMMA_w64, "_w64">;
defm V_WMMA_F32_16X16X16_BF8_BF8_w64 : WMMAInstGFX12<"v_wmma_f32_16x16x16_bf8_bf8", F32_FP8BF8_WMMA_w64, "_w64">;
defm V_WMMA_I32_16X16X32_IU4_w64 : WMMAInstGFX12<"v_wmma_i32_16x16x32_iu4", I32_IU4X32_WMMA_w64, "_w64">;
defm V_SWMMAC_F32_16X16X32_F16_w64 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_f16", F32_F16_SWMMAC_w64, "_w64">;
defm V_SWMMAC_F32_16X16X32_BF16_w64 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_bf16", F32_BF16_SWMMAC_w64, "_w64">;
defm V_SWMMAC_F16_16X16X32_F16_w64 : SWMMACInstGFX12<"v_swmmac_f16_16x16x32_f16", F16_F16_SWMMAC_w64, "_w64">;
defm V_SWMMAC_BF16_16X16X32_BF16_w64 : SWMMACInstGFX12<"v_swmmac_bf16_16x16x32_bf16", BF16_BF16_SWMMAC_w64, "_w64">;
defm V_SWMMAC_I32_16X16X32_IU8_w64 : SWMMACInstGFX12<"v_swmmac_i32_16x16x32_iu8", I32_IU8_SWMMAC_w64, "_w64">;
defm V_SWMMAC_I32_16X16X32_IU4_w64 : SWMMACInstGFX12<"v_swmmac_i32_16x16x32_iu4", I32_IU4X32_SWMMAC_w64, "_w64">;
defm V_SWMMAC_I32_16X16X64_IU4_w64 : SWMMACInstGFX12<"v_swmmac_i32_16x16x64_iu4", I32_IU4X64_SWMMAC_w64, "_w64">;
defm V_SWMMAC_F32_16X16X32_FP8_FP8_w64 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_fp8_fp8", F32_FP8BF8_SWMMAC_w64, "_w64">;
defm V_SWMMAC_F32_16X16X32_FP8_BF8_w64 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_fp8_bf8", F32_FP8BF8_SWMMAC_w64, "_w64">;
defm V_SWMMAC_F32_16X16X32_BF8_FP8_w64 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_bf8_fp8", F32_FP8BF8_SWMMAC_w64, "_w64">;
defm V_SWMMAC_F32_16X16X32_BF8_BF8_w64 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_bf8_bf8", F32_FP8BF8_SWMMAC_w64, "_w64">;
}
// IsGFX11OpselIntrinsic: the f16_f16 and bf16_bf16 intrinsics have an imm
// operand that controls opsel. Used by gfx11; removed in gfx12 (the operand
// must be 0).
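// e.g. int_amdgcn_wmma_f16_16x16x16_f16 carries a trailing i1 immarg (opsel)
// on gfx11; the (ins 0) appended below pins that operand to 0 in the gfx12
// patterns.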
multiclass WMMAPat<string Inst, SDPatternOperator node, VOP3PWMMA_Profile P, bit IsGFX11OpselIntrinsic = 0> {
def : GCNPat <(P.DstVT !setdagop(!con(P.WmmaInPat, !if(IsGFX11OpselIntrinsic, (ins 0), (ins))), node)),
(P.DstVT !setdagop(P.WmmaOutPat, !cast<Instruction>(Inst#"_twoaddr")))>;
let AddedComplexity = 4 in
def : GCNPat <(P.DstVT !setdagop(!con(P.WmmaInlineInPat, !if(IsGFX11OpselIntrinsic, (ins 0), (ins))), node)),
(P.DstVT !setdagop(P.WmmaInlineOutPat, !cast<Instruction>(Inst#"_threeaddr")))>;
}
class SWMMACPat<Instruction Inst, SDPatternOperator node, VOP3PWMMA_Profile P> :
GCNPat <(P.DstVT !setdagop(P.SwmmacInPat, node)),
(P.DstVT !setdagop(P.SwmmacOutPat, Inst))>;
class SWMMACPat_w64<Instruction Inst, SDPatternOperator node, VOP3PWMMA_Profile P> :
GCNPat <(P.DstVT !setdagop(P.SwmmacInPat, node)),
(P.DstVT !setdagop(P.SwmmacOutPat, Inst))>{
let WaveSizePredicate = isWave64;
}
let WaveSizePredicate = isWave32, SubtargetPredicate = isGFX12Plus in {
defm : WMMAPat<"V_WMMA_F32_16X16X16_F16_w32", int_amdgcn_wmma_f32_16x16x16_f16, F32_F16_WMMA_w32>;
defm : WMMAPat<"V_WMMA_F32_16X16X16_BF16_w32", int_amdgcn_wmma_f32_16x16x16_bf16, F32_BF16_WMMA_w32>;
defm : WMMAPat<"V_WMMA_F16_16X16X16_F16_w32", int_amdgcn_wmma_f16_16x16x16_f16, F16_F16_WMMA_w32,1>;
defm : WMMAPat<"V_WMMA_BF16_16X16X16_BF16_w32", int_amdgcn_wmma_bf16_16x16x16_bf16, BF16_BF16_WMMA_w32,1>;
defm : WMMAPat<"V_WMMA_I32_16X16X16_IU8_w32", int_amdgcn_wmma_i32_16x16x16_iu8, I32_IU8_WMMA_w32>;
defm : WMMAPat<"V_WMMA_I32_16X16X16_IU4_w32", int_amdgcn_wmma_i32_16x16x16_iu4, I32_IU4X16_WMMA_w32>;
defm : WMMAPat<"V_WMMA_F32_16X16X16_FP8_FP8_w32", int_amdgcn_wmma_f32_16x16x16_fp8_fp8, F32_FP8BF8_WMMA_w32>;
defm : WMMAPat<"V_WMMA_F32_16X16X16_FP8_BF8_w32", int_amdgcn_wmma_f32_16x16x16_fp8_bf8, F32_FP8BF8_WMMA_w32>;
defm : WMMAPat<"V_WMMA_F32_16X16X16_BF8_FP8_w32", int_amdgcn_wmma_f32_16x16x16_bf8_fp8, F32_FP8BF8_WMMA_w32>;
defm : WMMAPat<"V_WMMA_F32_16X16X16_BF8_BF8_w32", int_amdgcn_wmma_f32_16x16x16_bf8_bf8, F32_FP8BF8_WMMA_w32>;
defm : WMMAPat<"V_WMMA_I32_16X16X32_IU4_w32", int_amdgcn_wmma_i32_16x16x32_iu4, I32_IU4X32_WMMA_w32>;
def : SWMMACPat<V_SWMMAC_F32_16X16X32_F16_w32_twoaddr, int_amdgcn_swmmac_f32_16x16x32_f16, F32_F16_SWMMAC_w32>;
def : SWMMACPat<V_SWMMAC_F32_16X16X32_BF16_w32_twoaddr, int_amdgcn_swmmac_f32_16x16x32_bf16, F32_BF16_SWMMAC_w32>;
def : SWMMACPat<V_SWMMAC_F16_16X16X32_F16_w32_twoaddr, int_amdgcn_swmmac_f16_16x16x32_f16, F16_F16_SWMMAC_w32>;
def : SWMMACPat<V_SWMMAC_BF16_16X16X32_BF16_w32_twoaddr, int_amdgcn_swmmac_bf16_16x16x32_bf16, BF16_BF16_SWMMAC_w32>;
def : SWMMACPat<V_SWMMAC_I32_16X16X32_IU8_w32_twoaddr, int_amdgcn_swmmac_i32_16x16x32_iu8, I32_IU8_SWMMAC_w32>;
def : SWMMACPat<V_SWMMAC_I32_16X16X32_IU4_w32_twoaddr, int_amdgcn_swmmac_i32_16x16x32_iu4, I32_IU4X32_SWMMAC_w32>;
def : GCNPat <(I32_IU4X64_SWMMAC_w32.DstVT !setdagop(I32_IU4X64_SWMMAC_w32.SwmmacInPat, int_amdgcn_swmmac_i32_16x16x64_iu4)),
(I32_IU4X64_SWMMAC_w32.DstVT !setdagop(I32_IU4X64_SWMMAC_w32.SwmmacOutPat, V_SWMMAC_I32_16X16X64_IU4_w32_twoaddr))>;
def : SWMMACPat<V_SWMMAC_F32_16X16X32_FP8_FP8_w32_twoaddr, int_amdgcn_swmmac_f32_16x16x32_fp8_fp8, F32_FP8BF8_SWMMAC_w32>;
def : SWMMACPat<V_SWMMAC_F32_16X16X32_FP8_BF8_w32_twoaddr, int_amdgcn_swmmac_f32_16x16x32_fp8_bf8, F32_FP8BF8_SWMMAC_w32>;
def : SWMMACPat<V_SWMMAC_F32_16X16X32_BF8_FP8_w32_twoaddr, int_amdgcn_swmmac_f32_16x16x32_bf8_fp8, F32_FP8BF8_SWMMAC_w32>;
def : SWMMACPat<V_SWMMAC_F32_16X16X32_BF8_BF8_w32_twoaddr, int_amdgcn_swmmac_f32_16x16x32_bf8_bf8, F32_FP8BF8_SWMMAC_w32>;
}
let WaveSizePredicate = isWave64, SubtargetPredicate = isGFX12Plus in {
defm : WMMAPat<"V_WMMA_F32_16X16X16_F16_w64", int_amdgcn_wmma_f32_16x16x16_f16, F32_F16_WMMA_w64>;
defm : WMMAPat<"V_WMMA_F32_16X16X16_BF16_w64", int_amdgcn_wmma_f32_16x16x16_bf16, F32_BF16_WMMA_w64>;
defm : WMMAPat<"V_WMMA_F16_16X16X16_F16_w64", int_amdgcn_wmma_f16_16x16x16_f16, F16_F16_WMMA_w64,1>;
defm : WMMAPat<"V_WMMA_BF16_16X16X16_BF16_w64", int_amdgcn_wmma_bf16_16x16x16_bf16, BF16_BF16_WMMA_w64,1>;
defm : WMMAPat<"V_WMMA_I32_16X16X16_IU8_w64", int_amdgcn_wmma_i32_16x16x16_iu8, I32_IU8_WMMA_w64>;
defm : WMMAPat<"V_WMMA_I32_16X16X16_IU4_w64", int_amdgcn_wmma_i32_16x16x16_iu4, I32_IU4X16_WMMA_w64>;
defm : WMMAPat<"V_WMMA_F32_16X16X16_FP8_FP8_w64", int_amdgcn_wmma_f32_16x16x16_fp8_fp8, F32_FP8BF8_WMMA_w64>;
defm : WMMAPat<"V_WMMA_F32_16X16X16_FP8_BF8_w64", int_amdgcn_wmma_f32_16x16x16_fp8_bf8, F32_FP8BF8_WMMA_w64>;
defm : WMMAPat<"V_WMMA_F32_16X16X16_BF8_FP8_w64", int_amdgcn_wmma_f32_16x16x16_bf8_fp8, F32_FP8BF8_WMMA_w64>;
defm : WMMAPat<"V_WMMA_F32_16X16X16_BF8_BF8_w64", int_amdgcn_wmma_f32_16x16x16_bf8_bf8, F32_FP8BF8_WMMA_w64>;
defm : WMMAPat<"V_WMMA_I32_16X16X32_IU4_w64", int_amdgcn_wmma_i32_16x16x32_iu4, I32_IU4X32_WMMA_w64>;
def : SWMMACPat<V_SWMMAC_F32_16X16X32_F16_w64_twoaddr, int_amdgcn_swmmac_f32_16x16x32_f16, F32_F16_SWMMAC_w64>;
def : SWMMACPat<V_SWMMAC_F32_16X16X32_BF16_w64_twoaddr, int_amdgcn_swmmac_f32_16x16x32_bf16, F32_BF16_SWMMAC_w64>;
def : SWMMACPat<V_SWMMAC_F16_16X16X32_F16_w64_twoaddr, int_amdgcn_swmmac_f16_16x16x32_f16, F16_F16_SWMMAC_w64>;
def : SWMMACPat<V_SWMMAC_BF16_16X16X32_BF16_w64_twoaddr, int_amdgcn_swmmac_bf16_16x16x32_bf16, BF16_BF16_SWMMAC_w64>;
def : SWMMACPat<V_SWMMAC_I32_16X16X32_IU8_w64_twoaddr, int_amdgcn_swmmac_i32_16x16x32_iu8, I32_IU8_SWMMAC_w64>;
def : SWMMACPat<V_SWMMAC_I32_16X16X32_IU4_w64_twoaddr, int_amdgcn_swmmac_i32_16x16x32_iu4, I32_IU4X32_SWMMAC_w64>;
def : SWMMACPat<V_SWMMAC_I32_16X16X64_IU4_w64_twoaddr, int_amdgcn_swmmac_i32_16x16x64_iu4, I32_IU4X64_SWMMAC_w64>;
def : SWMMACPat<V_SWMMAC_F32_16X16X32_FP8_FP8_w64_twoaddr, int_amdgcn_swmmac_f32_16x16x32_fp8_fp8, F32_FP8BF8_SWMMAC_w64>;
def : SWMMACPat<V_SWMMAC_F32_16X16X32_FP8_BF8_w64_twoaddr, int_amdgcn_swmmac_f32_16x16x32_fp8_bf8, F32_FP8BF8_SWMMAC_w64>;
def : SWMMACPat<V_SWMMAC_F32_16X16X32_BF8_FP8_w64_twoaddr, int_amdgcn_swmmac_f32_16x16x32_bf8_fp8, F32_FP8BF8_SWMMAC_w64>;
def : SWMMACPat<V_SWMMAC_F32_16X16X32_BF8_BF8_w64_twoaddr, int_amdgcn_swmmac_f32_16x16x32_bf8_bf8, F32_FP8BF8_SWMMAC_w64>;
}
//===----------------------------------------------------------------------===//
// Begin Real Encodings
//===----------------------------------------------------------------------===//
@ -1005,6 +1400,99 @@ multiclass VOP3P_Real_Base<GFXGen Gen, bits<7> op, string backing_ps_name = NAME
VOP3Pe_gfx11_gfx12<op, !cast<VOP3P_Pseudo>(backing_ps_name).Pfl>;
}
class VOP3PeWmma<bits<7> op, VOPProfile P, VOP3PWMMA_Profile WMMAP>
: VOP3Pe_gfx11_gfx12<op, P>{
// opsel
let Inst{11} = !cond(!eq(WMMAP.IndexType, 0) : 0,
!eq(WMMAP.IndexType, 8) : index_key_8bit{0},
!eq(WMMAP.IndexType, 16) : index_key_16bit{0});
let Inst{12} = !if(!eq(WMMAP.IndexType, 8), index_key_8bit{1}, 0);
let Inst{13} = 0;
// opsel_hi
let Inst{59} = 1;
let Inst{60} = 1;
let Inst{14} = 1;
// neg_lo
let Inst{61} = !if(WMMAP.NegLo01, src0_modifiers{0}, 0);
let Inst{62} = !if(WMMAP.NegLo01, src1_modifiers{0}, 0);
let Inst{63} = !if(WMMAP.NegLo2, src2_modifiers{0}, 0);
// neg_hi
let Inst{8} = !if(WMMAP.NegHi01, src0_modifiers{1}, 0);
let Inst{9} = !if(WMMAP.NegHi01, src1_modifiers{1}, 0);
let Inst{10} = !if(WMMAP.NegHi2, src2_modifiers{1}, 0);
// clamp
let Inst{15} = !if(WMMAP.IsIU, clamp{0}, 0);
}
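// Illustrative encoding note: "index_key:1" on a 16-bit-index swmmac sets
// Inst{11} (the opsel[0] bit); an 8-bit index key occupies Inst{11} and
// Inst{12}.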
multiclass VOP3P_WMMA_Real_Base<GFXGen Gen, bits<7> op, VOP3PWMMA_Profile WMMAP,
string backing_ps_name = NAME,
string asmName = !cast<VOP3P_Pseudo>(NAME).Mnemonic> {
def Gen.Suffix :
VOP3P_Real_Gen<!cast<VOP3P_Pseudo>(backing_ps_name), Gen, asmName>,
VOP3PeWmma<op, !cast<VOP3P_Pseudo>(backing_ps_name).Pfl, WMMAP>;
}
multiclass VOP3P_Real_WMMA_gfx12 <bits<7> op, VOP3PWMMA_Profile WMMAP> {
let WaveSizePredicate = isWave32, DecoderNamespace = "GFX12" in {
defm _twoaddr : VOP3P_WMMA_Real_Base <GFX12Gen, op, WMMAP>;
}
}
multiclass VOP3P_Real_WMMA_gfx12w64 <bits<7> op, VOP3PWMMA_Profile WMMAP> {
let WaveSizePredicate = isWave64, DecoderNamespace = "WMMAGFX12" in {
defm _twoaddr : VOP3P_WMMA_Real_Base <GFX12Gen, op, WMMAP>;
}
}
defm V_WMMA_F32_16X16X16_F16_w32 : VOP3P_Real_WMMA_gfx12 <0x040, F32_F16_WMMA_w32>;
defm V_WMMA_F32_16X16X16_BF16_w32 : VOP3P_Real_WMMA_gfx12 <0x041, F32_BF16_WMMA_w32>;
defm V_WMMA_F16_16X16X16_F16_w32 : VOP3P_Real_WMMA_gfx12 <0x042, F16_F16_WMMA_w32>;
defm V_WMMA_BF16_16X16X16_BF16_w32 : VOP3P_Real_WMMA_gfx12 <0x043, BF16_BF16_WMMA_w32>;
defm V_WMMA_I32_16X16X16_IU8_w32 : VOP3P_Real_WMMA_gfx12 <0x044, I32_IU8_WMMA_w32>;
defm V_WMMA_I32_16X16X16_IU4_w32 : VOP3P_Real_WMMA_gfx12 <0x045, I32_IU4X16_WMMA_w32>;
defm V_WMMA_F32_16X16X16_FP8_FP8_w32 : VOP3P_Real_WMMA_gfx12 <0x046, F32_FP8BF8_WMMA_w32>;
defm V_WMMA_F32_16X16X16_FP8_BF8_w32 : VOP3P_Real_WMMA_gfx12 <0x047, F32_FP8BF8_WMMA_w32>;
defm V_WMMA_F32_16X16X16_BF8_FP8_w32 : VOP3P_Real_WMMA_gfx12 <0x048, F32_FP8BF8_WMMA_w32>;
defm V_WMMA_F32_16X16X16_BF8_BF8_w32 : VOP3P_Real_WMMA_gfx12 <0x049, F32_FP8BF8_WMMA_w32>;
defm V_WMMA_I32_16X16X32_IU4_w32 : VOP3P_Real_WMMA_gfx12 <0x04a, I32_IU4X32_WMMA_w32>;
defm V_WMMA_F32_16X16X16_F16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x040, F32_F16_WMMA_w64>;
defm V_WMMA_F32_16X16X16_BF16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x041, F32_BF16_WMMA_w64>;
defm V_WMMA_F16_16X16X16_F16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x042, F16_F16_WMMA_w64>;
defm V_WMMA_BF16_16X16X16_BF16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x043, BF16_BF16_WMMA_w64>;
defm V_WMMA_I32_16X16X16_IU8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x044, I32_IU8_WMMA_w64>;
defm V_WMMA_I32_16X16X16_IU4_w64 : VOP3P_Real_WMMA_gfx12w64 <0x045, I32_IU4X16_WMMA_w64>;
defm V_WMMA_F32_16X16X16_FP8_FP8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x046, F32_FP8BF8_WMMA_w64>;
defm V_WMMA_F32_16X16X16_FP8_BF8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x047, F32_FP8BF8_WMMA_w64>;
defm V_WMMA_F32_16X16X16_BF8_FP8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x048, F32_FP8BF8_WMMA_w64>;
defm V_WMMA_F32_16X16X16_BF8_BF8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x049, F32_FP8BF8_WMMA_w64>;
defm V_WMMA_I32_16X16X32_IU4_w64 : VOP3P_Real_WMMA_gfx12w64 <0x04a, I32_IU4X32_WMMA_w64>;
defm V_SWMMAC_F32_16X16X32_F16_w32 : VOP3P_Real_WMMA_gfx12 <0x050, F32_F16_SWMMAC_w32>;
defm V_SWMMAC_F32_16X16X32_BF16_w32 : VOP3P_Real_WMMA_gfx12 <0x051, F32_BF16_SWMMAC_w32>;
defm V_SWMMAC_F16_16X16X32_F16_w32 : VOP3P_Real_WMMA_gfx12 <0x052, F16_F16_SWMMAC_w32>;
defm V_SWMMAC_BF16_16X16X32_BF16_w32 : VOP3P_Real_WMMA_gfx12 <0x053, BF16_BF16_SWMMAC_w32>;
defm V_SWMMAC_I32_16X16X32_IU8_w32 : VOP3P_Real_WMMA_gfx12 <0x054, I32_IU8_SWMMAC_w32>;
defm V_SWMMAC_I32_16X16X32_IU4_w32 : VOP3P_Real_WMMA_gfx12 <0x055, I32_IU4X32_SWMMAC_w32>;
defm V_SWMMAC_I32_16X16X64_IU4_w32 : VOP3P_Real_WMMA_gfx12 <0x056, I32_IU4X64_SWMMAC_w32>;
defm V_SWMMAC_F32_16X16X32_FP8_FP8_w32 : VOP3P_Real_WMMA_gfx12 <0x057, F32_FP8BF8_SWMMAC_w32>;
defm V_SWMMAC_F32_16X16X32_FP8_BF8_w32 : VOP3P_Real_WMMA_gfx12 <0x058, F32_FP8BF8_SWMMAC_w32>;
defm V_SWMMAC_F32_16X16X32_BF8_FP8_w32 : VOP3P_Real_WMMA_gfx12 <0x059, F32_FP8BF8_SWMMAC_w32>;
defm V_SWMMAC_F32_16X16X32_BF8_BF8_w32 : VOP3P_Real_WMMA_gfx12 <0x05a, F32_FP8BF8_SWMMAC_w32>;
defm V_SWMMAC_F32_16X16X32_F16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x050, F32_F16_SWMMAC_w64>;
defm V_SWMMAC_F32_16X16X32_BF16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x051, F32_BF16_SWMMAC_w64>;
defm V_SWMMAC_F16_16X16X32_F16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x052, F16_F16_SWMMAC_w64>;
defm V_SWMMAC_BF16_16X16X32_BF16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x053, BF16_BF16_SWMMAC_w64>;
defm V_SWMMAC_I32_16X16X32_IU8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x054, I32_IU8_SWMMAC_w64>;
defm V_SWMMAC_I32_16X16X32_IU4_w64 : VOP3P_Real_WMMA_gfx12w64 <0x055, I32_IU4X32_SWMMAC_w64>;
defm V_SWMMAC_I32_16X16X64_IU4_w64 : VOP3P_Real_WMMA_gfx12w64 <0x056, I32_IU4X64_SWMMAC_w64>;
defm V_SWMMAC_F32_16X16X32_FP8_FP8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x057, F32_FP8BF8_SWMMAC_w64>;
defm V_SWMMAC_F32_16X16X32_FP8_BF8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x058, F32_FP8BF8_SWMMAC_w64>;
defm V_SWMMAC_F32_16X16X32_BF8_FP8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x059, F32_FP8BF8_SWMMAC_w64>;
defm V_SWMMAC_F32_16X16X32_BF8_BF8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x05a, F32_FP8BF8_SWMMAC_w64>;
multiclass VOP3P_Real_with_name<GFXGen Gen, bits<7> op,
string backing_ps_name = NAME,
string asmName = !cast<VOP3P_Pseudo>(NAME).Mnemonic> {

View File

@ -124,6 +124,7 @@ class VOP3_Pseudo <string opName, VOPProfile P, list<dag> pattern = [],
let IsPacked = P.IsPacked;
let IsMAI = P.IsMAI;
let IsWMMA = P.IsWMMA;
let IsSWMMAC = P.IsSWMMAC;
let AsmOperands = !if(isVop3OpSel,
P.AsmVOP3OpSel,
@ -378,6 +379,8 @@ class VOP3Pe <bits<7> op, VOPProfile P> : Enc64 {
bits<4> src2_modifiers;
bits<9> src2;
bits<1> clamp;
bits<2> index_key_8bit;
bits<1> index_key_16bit;
let Inst{7-0} = vdst;
let Inst{8} = !if(P.HasSrc0Mods, src0_modifiers{1}, 0); // neg_hi src0

View File

@ -63,52 +63,140 @@ define amdgpu_kernel void @writelane(ptr addrspace(1) %out) #0 {
ret void
}
; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32(<16 x half> %A, <16 x half> %B, <8 x float> %C)
; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v16f16(<16 x half> %A, <16 x half> %B, <8 x float> %C)
define amdgpu_kernel void @wmma_f32_16x16x16_f16(<16 x half> %A, <16 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
%tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32(<16 x half> %A, <16 x half> %B, <8 x float> %C)
%tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v16f16(<16 x half> %A, <16 x half> %B, <8 x float> %C)
store <8 x float> %tmp0, ptr addrspace(1) %out, align 32
ret void
}
; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32(<16 x i16> %A, <16 x i16> %B, <8 x float> %C)
; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v16i16(<16 x i16> %A, <16 x i16> %B, <8 x float> %C)
define amdgpu_kernel void @wmma_f32_16x16x16_ibf16(<16 x i16> %A, <16 x i16> %B, <8 x float> %C, ptr addrspace(1) %out) {
%tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32(<16 x i16> %A, <16 x i16> %B, <8 x float> %C)
%tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v16i16(<16 x i16> %A, <16 x i16> %B, <8 x float> %C)
store <8 x float> %tmp0, ptr addrspace(1) %out, align 32
ret void
}
; CHECK: DIVERGENT: %tmp0 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16(<16 x half> %A, <16 x half> %B, <16 x half> %C, i1 false)
; CHECK: DIVERGENT: %tmp0 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> %A, <16 x half> %B, <16 x half> %C, i1 false)
define amdgpu_kernel void @wmma_f16_16x16x16_f16(<16 x half> %A, <16 x half> %B, <16 x half> %C, ptr addrspace(1) %out) {
bb:
%tmp0 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16(<16 x half> %A, <16 x half> %B, <16 x half> %C, i1 false)
%tmp0 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> %A, <16 x half> %B, <16 x half> %C, i1 false)
store <16 x half> %tmp0, ptr addrspace(1) %out, align 32
ret void
}
; CHECK: DIVERGENT: %tmp0 = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v16i16(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, i1 false)
; CHECK: DIVERGENT: %tmp0 = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v16i16.v16i16(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, i1 false)
define amdgpu_kernel void @wmma_f16_16x16x16_bf16(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, ptr addrspace(1) %out) {
bb:
%tmp0 = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v16i16(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, i1 false)
%tmp0 = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v16i16.v16i16(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, i1 false)
store <16 x i16> %tmp0, ptr addrspace(1) %out, align 32
ret void
}
; CHECK: DIVERGENT: %tmp0 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32(i1 false, <4 x i32> %A, i1 false, <4 x i32> %B, <8 x i32> %C, i1 false)
; CHECK: DIVERGENT: %tmp0 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v4i32(i1 false, <4 x i32> %A, i1 false, <4 x i32> %B, <8 x i32> %C, i1 false)
define amdgpu_kernel void @wmma_i32_16x16x16_ui8(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
bb:
%tmp0 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32(i1 false, <4 x i32> %A, i1 false, <4 x i32> %B, <8 x i32> %C, i1 false)
%tmp0 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v4i32(i1 false, <4 x i32> %A, i1 false, <4 x i32> %B, <8 x i32> %C, i1 false)
store <8 x i32> %tmp0, ptr addrspace(1) %out, align 32
ret void
}
; CHECK: DIVERGENT: %tmp0 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32(i1 false, <2 x i32> %A, i1 false, <2 x i32> %B, <8 x i32> %C, i1 false)
; CHECK: DIVERGENT: %tmp0 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.v2i32(i1 false, <2 x i32> %A, i1 false, <2 x i32> %B, <8 x i32> %C, i1 false)
define amdgpu_kernel void @wmma_i32_16x16x16_ui4(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
bb:
%tmp0 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32(i1 false, <2 x i32> %A, i1 false, <2 x i32> %B, <8 x i32> %C, i1 false)
%tmp0 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.v2i32(i1 false, <2 x i32> %A, i1 false, <2 x i32> %B, <8 x i32> %C, i1 false)
store <8 x i32> %tmp0, ptr addrspace(1) %out, align 32
ret void
}
; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.i16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index)
define amdgpu_kernel void @swmmac_f32_16x16x32_f16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
bb:
%tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index)
store <8 x float> %tmp0, ptr addrspace(1) %out, align 32
ret void
}
; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.i16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index)
define amdgpu_kernel void @swmmac_f32_16x16x32_bf16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
bb:
%tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index)
store <8 x float> %tmp0, ptr addrspace(1) %out, align 32
ret void
}
; CHECK: DIVERGENT: %tmp0 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index)
define amdgpu_kernel void @swmmac_f16_16x16x32_f16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) {
bb:
%tmp0 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index)
store <8 x half> %tmp0, ptr addrspace(1) %out, align 32
ret void
}
; CHECK: DIVERGENT: %tmp0 = call <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.i16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index)
define amdgpu_kernel void @swmmac_bf16_16x16x32_bf16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index, ptr addrspace(1) %out) {
bb:
%tmp0 = call <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index)
store <8 x i16> %tmp0, ptr addrspace(1) %out, align 32
ret void
}
; CHECK: DIVERGENT: %tmp0 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.i16(i1 false, <2 x i32> %A, i1 false, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 false)
define amdgpu_kernel void @swmmac_i32_16x16x32_iu8(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
bb:
%tmp0 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8(i1 false, <2 x i32> %A, i1 false, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 false)
store <8 x i32> %tmp0, ptr addrspace(1) %out, align 32
ret void
}
; CHECK: DIVERGENT: %tmp0 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.i16(i1 false, i32 %A, i1 false, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 false)
define amdgpu_kernel void @swmmac_i32_16x16x32_iu4(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
bb:
%tmp0 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4(i1 false, i32 %A, i1 false, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 false)
store <8 x i32> %tmp0, ptr addrspace(1) %out, align 32
ret void
}
; CHECK: DIVERGENT: %tmp0 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.i16(i1 false, <2 x i32> %A, i1 false, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 false)
define amdgpu_kernel void @swmmac_i32_16x16x64_iu4(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
bb:
%tmp0 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4(i1 false, <2 x i32> %A, i1 false, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 false)
store <8 x i32> %tmp0, ptr addrspace(1) %out, align 32
ret void
}
; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
define amdgpu_kernel void @swmmac_f32_16x16x32_fp8.fp8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
bb:
%tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
store <8 x float> %tmp0, ptr addrspace(1) %out, align 32
ret void
}
; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
define amdgpu_kernel void @swmmac_f32_16x16x32_fp8.bf8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
bb:
%tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
store <8 x float> %tmp0, ptr addrspace(1) %out, align 32
ret void
}
; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
define amdgpu_kernel void @swmmac_f32_16x16x32_bf8.fp8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
bb:
%tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
store <8 x float> %tmp0, ptr addrspace(1) %out, align 32
ret void
}
; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
define amdgpu_kernel void @swmmac_f32_16x16x32_bf8.bf8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
bb:
%tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
store <8 x float> %tmp0, ptr addrspace(1) %out, align 32
ret void
}
; CHECK: DIVERGENT: %tmp0 = call <2 x i32> @llvm.amdgcn.global.load.tr.v2i32(ptr addrspace(1) %gep)
define amdgpu_kernel void @global_load_tr_b64_v2i32(ptr addrspace(1) %addr, ptr addrspace(1) %out) {
bb:
@ -190,12 +278,23 @@ declare i32 @llvm.amdgcn.mov.dpp.i32(i32, i32, i32, i32, i1) #1
declare i32 @llvm.amdgcn.mov.dpp8.i32(i32, i32) #1
declare i32 @llvm.amdgcn.update.dpp.i32(i32, i32, i32, i32, i32, i1) #1
declare i32 @llvm.amdgcn.writelane(i32, i32, i32) #1
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32(<16 x half>, <16 x half> , <8 x float>) #1
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32(<16 x i16>, <16 x i16> , <8 x float>) #1
declare <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16(<16 x half>, <16 x half> , <16 x half>, i1 immarg) #1
declare <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v16i16(<16 x i16>, <16 x i16> , <16 x i16>, i1 immarg) #1
declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32(i1 immarg, <4 x i32>, i1 immarg, <4 x i32> , <8 x i32>, i1 immarg) #1
declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32> , <8 x i32>, i1 immarg) #1
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v16f16(<16 x half>, <16 x half> , <8 x float>) #1
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v16i16(<16 x i16>, <16 x i16> , <8 x float>) #1
declare <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half>, <16 x half> , <16 x half>, i1 immarg) #1
declare <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v16i16.v16i16(<16 x i16>, <16 x i16> , <16 x i16>, i1 immarg) #1
declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v4i32(i1 immarg, <4 x i32>, i1 immarg, <4 x i32> , <8 x i32>, i1 immarg) #1
declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.v2i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32> , <8 x i32>, i1 immarg) #1
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16(<8 x half>, <16 x half>, <8 x float>, i16)
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16(<8 x i16>, <16 x i16>, <8 x float>, i16)
declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16(<8 x half>, <16 x half>, <8 x half>, i16)
declare <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16(<8 x i16>, <16 x i16>, <8 x i16>, i16)
declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16, i1)
declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4(i1 immarg, i32, i1 immarg, <2 x i32>, <8 x i32>, i16, i1)
declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16, i1)
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8(<2 x i32>, <4 x i32>, <8 x float>, i16)
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8(<2 x i32>, <4 x i32>, <8 x float>, i16)
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8(<2 x i32>, <4 x i32>, <8 x float>, i16)
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8(<2 x i32>, <4 x i32>, <8 x float>, i16)
declare <2 x i32> @llvm.amdgcn.global.load.tr.v2i32(ptr addrspace(1))
declare <8 x i16> @llvm.amdgcn.global.load.tr.v8i16(ptr addrspace(1))
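; Editorial note, inferred from the CHECK lines above: this is a
; uniformity-analysis test, and every wmma, swmmac and global.load.tr result
; is expected to be reported as DIVERGENT.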


@ -0,0 +1,504 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
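; Editorial sketch of the intent, inferred from the CHECK lines below: these
; tests verify that fneg/fabs source modifiers fold into the neg_lo/neg_hi
; operand modifiers of the GFX12 WMMA/SWMMAC instructions rather than being
; lowered to separate VALU instructions:
;   fneg on f16 A or B -> neg_lo and neg_hi set for that operand
;   fneg on C          -> neg_lo:[0,0,1]
;   fabs on C          -> neg_hi:[0,0,1]
;   fneg(fabs(C))      -> neg_lo:[0,0,1] neg_hi:[0,0,1]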
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.A = fneg <8 x half> %A
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %fneg.A, <8 x half> %B, <8 x float> %C)
store <8 x float> %res, <8 x float> addrspace(1)* %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negB(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negB:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] neg_hi:[0,1,0]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.B = fneg <8 x half> %B
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %fneg.B, <8 x float> %C)
store <8 x float> %res, <8 x float> addrspace(1)* %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negC(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.C = fneg <8 x float> %C
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %fneg.C)
store <8 x float> %res, <8 x float> addrspace(1)* %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_absC(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_absC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %fabs.C)
store <8 x float> %res, <8 x float> addrspace(1)* %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_negC(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_negC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.C = fneg <8 x float> %C
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> %fneg.C)
store <8 x float> %res, <8 x float> addrspace(1)* %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_absC(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_absC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> %fabs.C)
store <8 x float> %res, <8 x float> addrspace(1)* %out
ret void
}
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negA(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negA:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.A = fneg <8 x half> %A
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %fneg.A, <8 x half> %B, <8 x half> %C, i1 0)
store <8 x half> %res, <8 x half> addrspace(1)* %out
ret void
}
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] neg_hi:[0,1,0]
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.B = fneg <8 x half> %B
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %fneg.B, <8 x half> %C, i1 0)
store <8 x half> %res, <8 x half> addrspace(1)* %out
ret void
}
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1]
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.C = fneg <8 x half> %C
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fneg.C, i1 0)
store <8 x half> %res, <8 x half> addrspace(1)* %out
ret void
}
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_absC(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_absC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,0,1]
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fabs.C = call <8 x half> @llvm.fabs.v8f16(<8 x half> %C)
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fabs.C, i1 0)
store <8 x half> %res, <8 x half> addrspace(1)* %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_negC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.C = fneg <8 x float> %C
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C)
store <8 x float> %res, <8 x float> addrspace(1)* %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_absC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C)
store <8 x float> %res, <8 x float> addrspace(1)* %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_negC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.C = fneg <8 x float> %C
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C)
store <8 x float> %res, <8 x float> addrspace(1)* %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_absC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C)
store <8 x float> %res, <8 x float> addrspace(1)* %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_negC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.C = fneg <8 x float> %C
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C)
store <8 x float> %res, <8 x float> addrspace(1)* %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_absC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C)
store <8 x float> %res, <8 x float> addrspace(1)* %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_negC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.C = fneg <8 x float> %C
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C)
store <8 x float> %res, <8 x float> addrspace(1)* %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_absC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C)
store <8 x float> %res, <8 x float> addrspace(1)* %out
ret void
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negA(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, <8 x float> addrspace(1)* %out) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negA:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[21:22], v[12:15], off
; GFX12-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.A = fneg <8 x half> %A
%res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f16.v16f16.v8f32.i16(<8 x half> %fneg.A, <16 x half> %B, <8 x float> %C, i16 %Index)
store <8 x float> %res, <8 x float> addrspace(1)* %out
ret void
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negB(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, <8 x float> addrspace(1)* %out) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negB:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,1,0] neg_hi:[0,1,0]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[21:22], v[12:15], off
; GFX12-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.B = fneg <16 x half> %B
%res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f16.v16f16.v8f32.i16(<8 x half> %A, <16 x half> %fneg.B, <8 x float> %C, i16 %Index)
store <8 x float> %res, <8 x float> addrspace(1)* %out
ret void
}
define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negA(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, <8 x half> addrspace(1)* %out) {
; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negA:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX12-NEXT: global_store_b128 v[17:18], v[12:15], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.A = fneg <8 x half> %A
%res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v16f16.v8f16.i16(<8 x half> %fneg.A, <16 x half> %B, <8 x half> %C, i16 %Index)
store <8 x half> %res, <8 x half> addrspace(1)* %out
ret void
}
define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negB(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, <8 x half> addrspace(1)* %out) {
; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negB:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,1,0] neg_hi:[0,1,0]
; GFX12-NEXT: global_store_b128 v[17:18], v[12:15], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.B = fneg <16 x half> %B
%res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v16f16.v8f16.i16(<8 x half> %A, <16 x half> %fneg.B, <8 x half> %C, i16 %Index)
store <8 x half> %res, <8 x half> addrspace(1)* %out
ret void
}
; both neg and abs patterns (wmma matrix C, f32 or f16)
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negabsC(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negabsC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] neg_hi:[0,0,1]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
%fneg.fabs.C = fneg <8 x float> %fabs.C
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %fneg.fabs.C)
store <8 x float> %res, <8 x float> addrspace(1)* %out
ret void
}
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negabsC(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negabsC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] neg_hi:[0,0,1]
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fabs.C = call <8 x half> @llvm.fabs.v8f16(<8 x half> %C)
%fneg.fabs.C = fneg <8 x half> %fabs.C
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fneg.fabs.C, i1 0)
store <8 x half> %res, <8 x half> addrspace(1)* %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_neg_partial_fabsA(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_neg_partial_fabsA:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_and_b32_e32 v11, 0x7fffffff, v11
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%el3 = extractelement <8 x float> %C, i32 3
%el3.fabs = call float @llvm.fabs.f32(float %el3)
%partial.fabs.C = insertelement <8 x float> %C, float %el3.fabs, i32 3
%fneg.partial.fabs.C = fneg <8 x float> %partial.fabs.C
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %fneg.partial.fabs.C)
store <8 x float> %res, <8 x float> addrspace(1)* %out
ret void
}
; A or B matrix modifier and constant in C
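; Note: in the two tests below the accumulator is an inlineable splat, so it
; is encoded as the literal 1.0 while the fneg on A or B still folds into
; neg_lo/neg_hi on the same instruction.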
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA_constantC(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA_constantC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], 1.0 neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.A = fneg <8 x half> %A
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %fneg.A, <8 x half> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
store <8 x float> %res, <8 x float> addrspace(1)* %out
ret void
}
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB_constantC(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB_constantC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], 1.0 neg_lo:[0,1,0] neg_hi:[0,1,0]
; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.B = fneg <8 x half> %B
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %fneg.B, <8 x half> <half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0>, i1 0)
store <8 x half> %res, <8 x half> addrspace(1)* %out
ret void
}
; pack f16 elements with v_perm_b32 since they don't come from the same b32
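; Note: in this -global-isel run the packing is done with v_and_b32 and
; v_lshl_or_b32; the shufflevector keeps only the even-numbered halves, so the
; two halves of each repacked dword originate from different dwords of the
; loaded <16 x half> value.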
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC_pack(<8 x half> %A, <8 x half> %B, ptr %Caddr, <8 x half> addrspace(1)* %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC_pack:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: flat_load_b128 v[12:15], v[8:9]
; GFX12-NEXT: flat_load_b128 v[16:19], v[8:9] offset:16
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x101
; GFX12-NEXT: v_and_b32_e32 v8, 0xffff, v12
; GFX12-NEXT: v_and_b32_e32 v9, 0xffff, v14
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: v_and_b32_e32 v14, 0xffff, v16
; GFX12-NEXT: v_and_b32_e32 v16, 0xffff, v18
; GFX12-NEXT: v_lshl_or_b32 v12, v13, 16, v8
; GFX12-NEXT: v_lshl_or_b32 v13, v15, 16, v9
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX12-NEXT: v_lshl_or_b32 v14, v17, 16, v14
; GFX12-NEXT: v_lshl_or_b32 v15, v19, 16, v16
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[12:15], v[0:3], v[4:7], v[12:15] neg_lo:[0,0,1]
; GFX12-NEXT: global_store_b128 v[10:11], v[12:15], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%C = load <16 x half>, ptr %Caddr
%C_shuffle = shufflevector <16 x half> %C, <16 x half> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
%fneg.C_shuffle = fneg <8 x half> %C_shuffle
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fneg.C_shuffle , i1 0)
store <8 x half> %res, <8 x half> addrspace(1)* %out
ret void
}
declare <8 x half> @llvm.fabs.v8f16(<8 x half>)
declare <8 x float> @llvm.fabs.v8f32(<8 x float>)
declare float @llvm.fabs.f32(float)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half>, <8 x half>, <8 x float>)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8i16.v8f32(<8 x i16>, <8 x i16>, <8 x float>)
declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half>, <8 x half>, <8 x half>, i1 immarg)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f16.v16f16.v8f32.i16(<8 x half>, <16 x half>, <8 x float>, i16)
declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v16f16.v8f16.i16(<8 x half>, <16 x half>, <8 x half>, i16)


@ -0,0 +1,519 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
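; Editorial note, inferred from the CHECK lines below: splat C constants that
; are inlineable on GFX12 (e.g. 1.0, 1) are encoded directly as the matrix C
; operand, while non-inlineable splats (e.g. 3.0, 128, 0x42004200) are first
; materialized into a VGPR tuple with s_mov_b32 and v_dual_mov_b32.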
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], 1.0
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm_non_inlineable(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm_non_inlineable:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: s_mov_b32 s0, 0x40400000
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_mov_b32 s7, s0
; GFX12-NEXT: s_mov_b32 s1, s0
; GFX12-NEXT: s_mov_b32 s2, s0
; GFX12-NEXT: s_mov_b32 s3, s0
; GFX12-NEXT: s_mov_b32 s4, s0
; GFX12-NEXT: s_mov_b32 s5, s0
; GFX12-NEXT: s_mov_b32 s6, s0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v17, s7 :: v_dual_mov_b32 v16, s6
; GFX12-NEXT: v_dual_mov_b32 v15, s5 :: v_dual_mov_b32 v14, s4
; GFX12-NEXT: v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2
; GFX12-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], v[10:17]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[10:17], v[0:3], v[4:7], 1.0
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm_non_inlineable(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm_non_inlineable:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: s_mov_b32 s0, 0x40400000
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_mov_b32 s7, s0
; GFX12-NEXT: s_mov_b32 s1, s0
; GFX12-NEXT: s_mov_b32 s2, s0
; GFX12-NEXT: s_mov_b32 s3, s0
; GFX12-NEXT: s_mov_b32 s4, s0
; GFX12-NEXT: s_mov_b32 s5, s0
; GFX12-NEXT: s_mov_b32 s6, s0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v17, s7 :: v_dual_mov_b32 v16, s6
; GFX12-NEXT: v_dual_mov_b32 v15, s5 :: v_dual_mov_b32 v14, s4
; GFX12-NEXT: v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2
; GFX12-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0
; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[10:17], v[0:3], v[4:7], v[10:17]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], 1.0
; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> <half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0>, i1 0)
store <8 x half> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm_non_inlineable(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm_non_inlineable:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: s_mov_b32 s0, 0x42004200
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_mov_b32 s3, s0
; GFX12-NEXT: s_mov_b32 s1, s0
; GFX12-NEXT: s_mov_b32 s2, s0
; GFX12-NEXT: v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2
; GFX12-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], v[10:13]
; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> <half 3.0, half 3.0, half 3.0, half 3.0, half 3.0, half 3.0, half 3.0, half 3.0>, i1 0)
store <8 x half> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: s_mov_b32 s0, 0x3f803f80
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_mov_b32 s3, s0
; GFX12-NEXT: s_mov_b32 s1, s0
; GFX12-NEXT: s_mov_b32 s2, s0
; GFX12-NEXT: v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2
; GFX12-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[10:13], v[0:3], v[4:7], v[10:13]
; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> <i16 16256, i16 16256, i16 16256, i16 16256, i16 16256, i16 16256, i16 16256, i16 16256>, i1 0)
store <8 x i16> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm_non_inlineable(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm_non_inlineable:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: s_mov_b32 s0, 0x3fc03fc0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_mov_b32 s3, s0
; GFX12-NEXT: s_mov_b32 s1, s0
; GFX12-NEXT: s_mov_b32 s2, s0
; GFX12-NEXT: v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2
; GFX12-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[10:13], v[0:3], v[4:7], v[10:13]
; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> <i16 16320, i16 16320, i16 16320, i16 16320, i16 16320, i16 16320, i16 16320, i16 16320>, i1 0)
store <8 x i16> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[6:13], v[0:1], v[2:3], 1
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm_non_inlineable:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: s_movk_i32 s0, 0x80
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_mov_b32 s7, s0
; GFX12-NEXT: s_mov_b32 s1, s0
; GFX12-NEXT: s_mov_b32 s2, s0
; GFX12-NEXT: s_mov_b32 s3, s0
; GFX12-NEXT: s_mov_b32 s4, s0
; GFX12-NEXT: s_mov_b32 s5, s0
; GFX12-NEXT: s_mov_b32 s6, s0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6
; GFX12-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
; GFX12-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2
; GFX12-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[6:13], v[0:1], v[2:3], v[6:13]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> <i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128>, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm(i32 %A, i32 %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v0, v1, 1
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
; GFX12-NEXT: global_store_b128 v[2:3], v[8:11], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm_non_inlineable(i32 %A, i32 %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm_non_inlineable:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: s_movk_i32 s0, 0x80
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_mov_b32 s7, s0
; GFX12-NEXT: s_mov_b32 s1, s0
; GFX12-NEXT: s_mov_b32 s2, s0
; GFX12-NEXT: s_mov_b32 s3, s0
; GFX12-NEXT: s_mov_b32 s4, s0
; GFX12-NEXT: s_mov_b32 s5, s0
; GFX12-NEXT: s_mov_b32 s6, s0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v11, s7 :: v_dual_mov_b32 v10, s6
; GFX12-NEXT: v_dual_mov_b32 v9, s5 :: v_dual_mov_b32 v8, s4
; GFX12-NEXT: v_dual_mov_b32 v7, s3 :: v_dual_mov_b32 v6, s2
; GFX12-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0
; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v0, v1, v[4:11]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
; GFX12-NEXT: global_store_b128 v[2:3], v[8:11], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> <i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128>, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[6:13], v[0:1], v[2:3], 1.0
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: s_mov_b32 s0, 0x40400000
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_mov_b32 s7, s0
; GFX12-NEXT: s_mov_b32 s1, s0
; GFX12-NEXT: s_mov_b32 s2, s0
; GFX12-NEXT: s_mov_b32 s3, s0
; GFX12-NEXT: s_mov_b32 s4, s0
; GFX12-NEXT: s_mov_b32 s5, s0
; GFX12-NEXT: s_mov_b32 s6, s0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6
; GFX12-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
; GFX12-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2
; GFX12-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[6:13], v[0:1], v[2:3], v[6:13]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[6:13], v[0:1], v[2:3], 1.0
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: s_mov_b32 s0, 0x40400000
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_mov_b32 s7, s0
; GFX12-NEXT: s_mov_b32 s1, s0
; GFX12-NEXT: s_mov_b32 s2, s0
; GFX12-NEXT: s_mov_b32 s3, s0
; GFX12-NEXT: s_mov_b32 s4, s0
; GFX12-NEXT: s_mov_b32 s5, s0
; GFX12-NEXT: s_mov_b32 s6, s0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6
; GFX12-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
; GFX12-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2
; GFX12-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[6:13], v[0:1], v[2:3], v[6:13]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[6:13], v[0:1], v[2:3], 1.0
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: s_mov_b32 s0, 0x40400000
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_mov_b32 s7, s0
; GFX12-NEXT: s_mov_b32 s1, s0
; GFX12-NEXT: s_mov_b32 s2, s0
; GFX12-NEXT: s_mov_b32 s3, s0
; GFX12-NEXT: s_mov_b32 s4, s0
; GFX12-NEXT: s_mov_b32 s5, s0
; GFX12-NEXT: s_mov_b32 s6, s0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6
; GFX12-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
; GFX12-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2
; GFX12-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[6:13], v[0:1], v[2:3], v[6:13]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[6:13], v[0:1], v[2:3], 1.0
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: s_mov_b32 s0, 0x40400000
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_mov_b32 s7, s0
; GFX12-NEXT: s_mov_b32 s1, s0
; GFX12-NEXT: s_mov_b32 s2, s0
; GFX12-NEXT: s_mov_b32 s3, s0
; GFX12-NEXT: s_mov_b32 s4, s0
; GFX12-NEXT: s_mov_b32 s5, s0
; GFX12-NEXT: s_mov_b32 s6, s0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6
; GFX12-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
; GFX12-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2
; GFX12-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[6:13], v[0:1], v[2:3], v[6:13]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[6:13], v[0:1], v[2:3], 1
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm_non_inlineable:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: s_movk_i32 s0, 0x80
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_mov_b32 s7, s0
; GFX12-NEXT: s_mov_b32 s1, s0
; GFX12-NEXT: s_mov_b32 s2, s0
; GFX12-NEXT: s_mov_b32 s3, s0
; GFX12-NEXT: s_mov_b32 s4, s0
; GFX12-NEXT: s_mov_b32 s5, s0
; GFX12-NEXT: s_mov_b32 s6, s0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6
; GFX12-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
; GFX12-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2
; GFX12-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[6:13], v[0:1], v[2:3], v[6:13]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> <i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128>, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out
ret void
}
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half>, <8 x half>, <8 x float>)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16>, <8 x i16>, <8 x float>)
declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half>, <8 x half>, <8 x half>, i1 immarg)
declare <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16>, <8 x i16>, <8 x i16>, i1 immarg)
declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg)
declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 immarg, i32, i1 immarg, i32, <8 x i32>, i1 immarg)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg)
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half>, <16 x half>, <8 x float>, i16)
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16>, <16 x i16>, <8 x float>, i16)
declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half>, <16 x half>, <8 x half>, i16)
declare <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16>, <16 x i16>, <8 x i16>, i16)
declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16 %Index, i1 immarg)
declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <8 x i32>, i16 %Index, i1 immarg)
declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i32 %Index, i1 immarg)
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)


@ -0,0 +1,309 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
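; Editorial note, inferred from the CHECK lines below: for the integer iu8/iu4
; variants, the leading i1 immarg flags for src0 and src1 (exercised by the
; zext tests) are printed as neg_lo:[1,0,0] and neg_lo:[0,1,0] respectively,
; and the trailing i1 immarg enables the clamp modifier.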
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src0(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src0:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src1(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src1:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_clamp:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] clamp
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1)
store <8 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src0(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src0:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[1,0,0]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off
; GFX12-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 1, i32 %A, i1 0, i32 %B, <8 x i32> %C, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src1(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src1:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[0,1,0]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off
; GFX12-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 1, i32 %B, <8 x i32> %C, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_clamp(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_clamp:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] clamp
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off
; GFX12-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> %C, i1 1)
store <8 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src0(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src0:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src1(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src1:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_clamp:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] clamp
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1)
store <8 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src0(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src0:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 1, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src1(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src1:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_clamp(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_clamp:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 clamp
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 1)
store <8 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src0(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src0:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[1,0,0]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[12:13], v[3:6], off
; GFX12-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 1, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src1(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src1:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[0,1,0]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[12:13], v[3:6], off
; GFX12-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 1, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_clamp(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_clamp:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 clamp
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[12:13], v[3:6], off
; GFX12-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 1)
store <8 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src0(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src0:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 1, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src1(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src1:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 0, <2 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_clamp(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_clamp:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 clamp
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 1)
store <8 x i32> %res, ptr addrspace(1) %out
ret void
}
declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg)
declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 immarg, i32, i1 immarg, i32, <8 x i32>, i1 immarg)
declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg)
declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16 %Index, i1 immarg)
declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <8 x i32>, i16 %Index, i1 immarg)
declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i32 %Index, i1 immarg)

@ -0,0 +1,321 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
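; These tests load a packed <2 x i16> vector of indices and use both elements:
; the low element feeds the first v_swmmac directly, and the extract of the
; high element should fold into the index_key:1 operand of the second
; v_swmmac instead of being materialized with a separate shift.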
define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_index_key(<8 x half> %A, <16 x half> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v20, v[20:21], off
; GFX12-NEXT: v_dual_mov_b32 v33, v19 :: v_dual_mov_b32 v32, v18
; GFX12-NEXT: v_dual_mov_b32 v31, v17 :: v_dual_mov_b32 v30, v16
; GFX12-NEXT: v_dual_mov_b32 v29, v15 :: v_dual_mov_b32 v28, v14
; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[26:33], v[0:3], v[4:11], v20
; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 index_key:1
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[22:23], v[26:29], off
; GFX12-NEXT: global_store_b128 v[22:23], v[30:33], off offset:16
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[24:25], v[12:15], off
; GFX12-NEXT: global_store_b128 v[24:25], v[16:19], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
%Index0 = extractelement <2 x i16> %IndexVec, i32 0
%res0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index0)
store <8 x float> %res0, ptr addrspace(1) %out0
%Index1 = extractelement <2 x i16> %IndexVec, i32 1
%res1 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index1)
store <8 x float> %res1, ptr addrspace(1) %out1
ret void
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16_index_key(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v20, v[20:21], off
; GFX12-NEXT: v_dual_mov_b32 v33, v19 :: v_dual_mov_b32 v32, v18
; GFX12-NEXT: v_dual_mov_b32 v31, v17 :: v_dual_mov_b32 v30, v16
; GFX12-NEXT: v_dual_mov_b32 v29, v15 :: v_dual_mov_b32 v28, v14
; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[26:33], v[0:3], v[4:11], v20
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 index_key:1
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[22:23], v[26:29], off
; GFX12-NEXT: global_store_b128 v[22:23], v[30:33], off offset:16
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[24:25], v[12:15], off
; GFX12-NEXT: global_store_b128 v[24:25], v[16:19], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
%Index0 = extractelement <2 x i16> %IndexVec, i32 0
%res0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index0)
store <8 x float> %res0, ptr addrspace(1) %out0
%Index1 = extractelement <2 x i16> %IndexVec, i32 1
%res1 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index1)
store <8 x float> %res1, ptr addrspace(1) %out1
ret void
}
define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_index_key(<8 x half> %A, <16 x half> %B, <8 x half> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v16, v[16:17], off
; GFX12-NEXT: v_dual_mov_b32 v25, v15 :: v_dual_mov_b32 v24, v14
; GFX12-NEXT: v_dual_mov_b32 v23, v13 :: v_dual_mov_b32 v22, v12
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[22:25], v[0:3], v[4:11], v16
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 index_key:1
; GFX12-NEXT: global_store_b128 v[18:19], v[22:25], off
; GFX12-NEXT: global_store_b128 v[20:21], v[12:15], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
%Index0 = extractelement <2 x i16> %IndexVec, i32 0
%res0 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index0)
store <8 x half> %res0, ptr addrspace(1) %out0
%Index1 = extractelement <2 x i16> %IndexVec, i32 1
%res1 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index1)
store <8 x half> %res1, ptr addrspace(1) %out1
ret void
}
define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16_index_key(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v16, v[16:17], off
; GFX12-NEXT: v_dual_mov_b32 v25, v15 :: v_dual_mov_b32 v24, v14
; GFX12-NEXT: v_dual_mov_b32 v23, v13 :: v_dual_mov_b32 v22, v12
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[22:25], v[0:3], v[4:11], v16
; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 index_key:1
; GFX12-NEXT: global_store_b128 v[18:19], v[22:25], off
; GFX12-NEXT: global_store_b128 v[20:21], v[12:15], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
%Index0 = extractelement <2 x i16> %IndexVec, i32 0
%res0 = call <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index0)
store <8 x i16> %res0, ptr addrspace(1) %out0
%Index1 = extractelement <2 x i16> %IndexVec, i32 1
%res1 = call <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index1)
store <8 x i16> %res1, ptr addrspace(1) %out1
ret void
}
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v14, v[14:15], off
; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12
; GFX12-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10
; GFX12-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8
; GFX12-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[20:27], v[0:1], v[2:5], v14
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 index_key:1
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[16:17], v[20:23], off
; GFX12-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off
; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
%Index0 = extractelement <2 x i16> %IndexVec, i32 0
%res0 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index0, i1 0)
store <8 x i32> %res0, ptr addrspace(1) %out0
%Index1 = extractelement <2 x i16> %IndexVec, i32 1
%res1 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index1, i1 0)
store <8 x i32> %res1, ptr addrspace(1) %out1
ret void
}
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_index_key(i32 %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v11, v[11:12], off
; GFX12-NEXT: v_dual_mov_b32 v24, v10 :: v_dual_mov_b32 v23, v9
; GFX12-NEXT: v_dual_mov_b32 v22, v8 :: v_dual_mov_b32 v21, v7
; GFX12-NEXT: v_dual_mov_b32 v20, v6 :: v_dual_mov_b32 v19, v5
; GFX12-NEXT: v_dual_mov_b32 v18, v4 :: v_dual_mov_b32 v17, v3
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[17:24], v0, v[1:2], v11
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 index_key:1
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[13:14], v[17:20], off
; GFX12-NEXT: global_store_b128 v[13:14], v[21:24], off offset:16
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off
; GFX12-NEXT: global_store_b128 v[15:16], v[7:10], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
%Index0 = extractelement <2 x i16> %IndexVec, i32 0
%res0 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index0, i1 0)
store <8 x i32> %res0, ptr addrspace(1) %out0
%Index1 = extractelement <2 x i16> %IndexVec, i32 1
%res1 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index1, i1 0)
store <8 x i32> %res1, ptr addrspace(1) %out1
ret void
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v14, v[14:15], off
; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12
; GFX12-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10
; GFX12-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8
; GFX12-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[20:27], v[0:1], v[2:5], v14
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[16:17], v[20:23], off
; GFX12-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off
; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
%Index0 = extractelement <2 x i16> %IndexVec, i32 0
%res0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index0)
store <8 x float> %res0, ptr addrspace(1) %out0
%Index1 = extractelement <2 x i16> %IndexVec, i32 1
%res1 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index1)
store <8 x float> %res1, ptr addrspace(1) %out1
ret void
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v14, v[14:15], off
; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12
; GFX12-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10
; GFX12-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8
; GFX12-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[20:27], v[0:1], v[2:5], v14
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[16:17], v[20:23], off
; GFX12-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off
; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
%Index0 = extractelement <2 x i16> %IndexVec, i32 0
%res0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index0)
store <8 x float> %res0, ptr addrspace(1) %out0
%Index1 = extractelement <2 x i16> %IndexVec, i32 1
%res1 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index1)
store <8 x float> %res1, ptr addrspace(1) %out1
ret void
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v14, v[14:15], off
; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12
; GFX12-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10
; GFX12-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8
; GFX12-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[20:27], v[0:1], v[2:5], v14
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[16:17], v[20:23], off
; GFX12-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off
; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
%Index0 = extractelement <2 x i16> %IndexVec, i32 0
%res0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index0)
store <8 x float> %res0, ptr addrspace(1) %out0
%Index1 = extractelement <2 x i16> %IndexVec, i32 1
%res1 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index1)
store <8 x float> %res1, ptr addrspace(1) %out1
ret void
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v14, v[14:15], off
; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12
; GFX12-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10
; GFX12-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8
; GFX12-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[20:27], v[0:1], v[2:5], v14
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[16:17], v[20:23], off
; GFX12-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off
; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
%Index0 = extractelement <2 x i16> %IndexVec, i32 0
%res0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index0)
store <8 x float> %res0, ptr addrspace(1) %out0
%Index1 = extractelement <2 x i16> %IndexVec, i32 1
%res1 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index1)
store <8 x float> %res1, ptr addrspace(1) %out1
ret void
}
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half>, <16 x half>, <8 x float>, i16)
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16>, <16 x i16>, <8 x float>, i16)
declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half>, <16 x half>, <8 x half>, i16)
declare <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16>, <16 x i16>, <8 x i16>, i16)
declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16 %Index, i1 immarg)
declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <8 x i32>, i16 %Index, i1 immarg)
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)

@ -0,0 +1,370 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
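; Basic lowering tests for each GFX12 WMMA and SWMMAC intrinsic at the default
; wavefront size of 32, covering the f16/bf16, iu8/iu4, and fp8/bf8 variants.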
define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %C)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf16:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> %C)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f16_16x16x16_f16(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11]
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %C, i1 0)
store <8 x half> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11]
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, i1 0)
store <8 x i16> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x16_iu8:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x16_iu4:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off
; GFX12-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> %C, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x32_iu4:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_f16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_f16:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[21:22], v[12:15], off
; GFX12-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[21:22], v[12:15], off
; GFX12-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_swmmac_f16_16x16x32_f16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f16_16x16x32_f16:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16
; GFX12-NEXT: global_store_b128 v[17:18], v[12:15], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index)
store <8 x half> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16
; GFX12-NEXT: global_store_b128 v[17:18], v[12:15], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index)
store <8 x i16> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[12:13], v[3:6], off
; GFX12-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half>, <8 x half>, <8 x float>)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16>, <8 x i16>, <8 x float>)
declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half>, <8 x half>, <8 x half>, i1 immarg)
declare <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16>, <8 x i16>, <8 x i16>, i1 immarg)
declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg)
declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 immarg, i32, i1 immarg, i32, <8 x i32>, i1 immarg)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg)
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half>, <16 x half>, <8 x float>, i16)
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16>, <16 x i16>, <8 x float>, i16)
declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half>, <16 x half>, <8 x half>, i16)
declare <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16>, <16 x i16>, <8 x i16>, i16)
declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16 %Index, i1 immarg)
declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <8 x i32>, i16 %Index, i1 immarg)
declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i32 %Index, i1 immarg)
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)

@ -0,0 +1,459 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
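; Source-modifier tests at wavefront size 64: fneg of the packed f16 A or B
; operands should select both neg_lo and neg_hi for that operand, fneg of the
; accumulator should select neg_lo:[0,0,1], and fabs of the accumulator should
; select neg_hi:[0,0,1].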
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.A = fneg <4 x half> %A
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %fneg.A, <4 x half> %B, <4 x float> %C)
store <4 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negB(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negB:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] neg_hi:[0,1,0]
; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.B = fneg <4 x half> %B
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %fneg.B, <4 x float> %C)
store <4 x float> %res, <4 x float> addrspace(1)* %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negC(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1]
; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.C = fneg <4 x float> %C
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %fneg.C)
store <4 x float> %res, <4 x float> addrspace(1)* %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_absC(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_absC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1]
; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %fabs.C)
store <4 x float> %res, <4 x float> addrspace(1)* %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_negC(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_negC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1]
; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.C = fneg <4 x float> %C
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> %fneg.C)
store <4 x float> %res, <4 x float> addrspace(1)* %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_absC(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_absC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1]
; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> %fabs.C)
store <4 x float> %res, <4 x float> addrspace(1)* %out
ret void
}
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negA(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negA:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.A = fneg <4 x half> %A
%res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %fneg.A, <4 x half> %B, <4 x half> %C, i1 0)
store <4 x half> %res, <4 x half> addrspace(1)* %out
ret void
}
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0] neg_hi:[0,1,0]
; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.B = fneg <4 x half> %B
%res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %fneg.B, <4 x half> %C, i1 0)
store <4 x half> %res, <4 x half> addrspace(1)* %out
ret void
}
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1]
; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.C = fneg <4 x half> %C
%res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fneg.C, i1 0)
store <4 x half> %res, <4 x half> addrspace(1)* %out
ret void
}
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_absC(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_absC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1]
; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fabs.C = call <4 x half> @llvm.fabs.v4f16(<4 x half> %C)
%res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fabs.C, i1 0)
store <4 x half> %res, <4 x half> addrspace(1)* %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_negC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_negC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.C = fneg <4 x float> %C
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fneg.C)
store <4 x float> %res, <4 x float> addrspace(1)* %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_absC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_absC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fabs.C)
store <4 x float> %res, <4 x float> addrspace(1)* %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_negC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_negC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.C = fneg <4 x float> %C
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fneg.C)
store <4 x float> %res, <4 x float> addrspace(1)* %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_absC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_absC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fabs.C)
store <4 x float> %res, <4 x float> addrspace(1)* %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_negC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_negC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.C = fneg <4 x float> %C
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fneg.C)
store <4 x float> %res, <4 x float> addrspace(1)* %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_absC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_absC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fabs.C)
store <4 x float> %res, <4 x float> addrspace(1)* %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_negC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_negC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.C = fneg <4 x float> %C
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fneg.C)
store <4 x float> %res, <4 x float> addrspace(1)* %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_absC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_absC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fabs.C)
store <4 x float> %res, <4 x float> addrspace(1)* %out
ret void
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negA(<4 x half> %A, <8 x half> %B, <4 x float> %C, i16 %Index, <4 x float> addrspace(1)* %out) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negA:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX12-NEXT: global_store_b128 v[11:12], v[6:9], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.A = fneg <4 x half> %A
%res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f16.v8f16.v4f32.i16(<4 x half> %fneg.A, <8 x half> %B, <4 x float> %C, i16 %Index)
store <4 x float> %res, <4 x float> addrspace(1)* %out
ret void
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negB(<4 x half> %A, <8 x half> %B, <4 x float> %C, i16 %Index, <4 x float> addrspace(1)* %out) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negB:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0] neg_hi:[0,1,0]
; GFX12-NEXT: global_store_b128 v[11:12], v[6:9], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.B = fneg <8 x half> %B
%res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f16.v8f16.v4f32.i16(<4 x half> %A, <8 x half> %fneg.B, <4 x float> %C, i16 %Index)
store <4 x float> %res, <4 x float> addrspace(1)* %out
ret void
}
define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negA(<4 x half> %A, <8 x half> %B, <4 x half> %C, i16 %Index, <4 x half> addrspace(1)* %out) {
; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negA:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX12-NEXT: global_store_b64 v[9:10], v[6:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.A = fneg <4 x half> %A
%res = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v8f16.v4f16.i16(<4 x half> %fneg.A, <8 x half> %B, <4 x half> %C, i16 %Index)
store <4 x half> %res, <4 x half> addrspace(1)* %out
ret void
}
define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negB(<4 x half> %A, <8 x half> %B, <4 x half> %C, i16 %Index, <4 x half> addrspace(1)* %out) {
; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negB:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0] neg_hi:[0,1,0]
; GFX12-NEXT: global_store_b64 v[9:10], v[6:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.B = fneg <8 x half> %B
%res = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v8f16.v4f16.i16(<4 x half> %A, <8 x half> %fneg.B, <4 x half> %C, i16 %Index)
store <4 x half> %res, <4 x half> addrspace(1)* %out
ret void
}
; both neg and abs patterns (wmma matrix C, f32 or f16)
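; As the checks above show, a standalone fneg on C selects neg_lo:[0,0,1] and a
; standalone fabs selects neg_hi:[0,0,1]; fneg(fabs(C)) below folds into both
; modifiers at once, so no separate VALU instructions are needed.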
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negabsC(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negabsC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] neg_hi:[0,0,1]
; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
%fneg.fabs.C = fneg <4 x float> %fabs.C
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %fneg.fabs.C)
store <4 x float> %res, <4 x float> addrspace(1)* %out
ret void
}
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negabsC(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negabsC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] neg_hi:[0,0,1]
; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fabs.C = call <4 x half> @llvm.fabs.v4f16(<4 x half> %C)
%fneg.fabs.C = fneg <4 x half> %fabs.C
%res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fneg.fabs.C, i1 0)
store <4 x half> %res, <4 x half> addrspace(1)* %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_neg_partial_fabsA(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_neg_partial_fabsA:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_and_b32_e32 v7, 0x7fffffff, v7
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1]
; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%el3 = extractelement <4 x float> %C, i32 3
%el3.fabs = call float @llvm.fabs.f32(float %el3)
%partial.fabs.C = insertelement <4 x float> %C, float %el3.fabs, i32 3
%fneg.partial.fabs.C = fneg <4 x float> %partial.fabs.C
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %fneg.partial.fabs.C)
store <4 x float> %res, <4 x float> addrspace(1)* %out
ret void
}
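; Note: in the partial-fabs test above only the fneg folds into neg_lo; a fabs
; of a single element cannot be expressed with the per-matrix neg_hi bit, so a
; scalar v_and_b32 with 0x7fffffff remains ahead of the wmma.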
; A or B matrix modifier and constant in C
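; The splat constant 1.0 in C is emitted as an inline literal operand of the
; wmma while the fneg on A (or B) still folds into neg_lo/neg_hi, as the checks
; below show.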
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA_constantC(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA_constantC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], 1.0 neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.A = fneg <4 x half> %A
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %fneg.A, <4 x half> %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
store <4 x float> %res, <4 x float> addrspace(1)* %out
ret void
}
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB_constantC(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB_constantC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], 1.0 neg_lo:[0,1,0] neg_hi:[0,1,0]
; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.B = fneg <4 x half> %B
%res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %fneg.B, <4 x half> <half 1.0, half 1.0, half 1.0, half 1.0>, i1 0)
store <4 x half> %res, <4 x half> addrspace(1)* %out
ret void
}
; pack f16 elements of C since they don't come from the same b32
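; C is loaded as <8 x half> and the shuffle keeps lanes 0,2,4,6, so each result
; half lives in a different dword; the v_and_b32/v_lshl_or_b32 pairs below
; repack them into the two b32 registers the wmma consumes, and the fneg still
; folds into neg_lo:[0,0,1].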
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC_pack(<4 x half> %A, <4 x half> %B, ptr %Caddr, <4 x half> addrspace(1)* %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC_pack:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: flat_load_b128 v[8:11], v[4:5]
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v8
; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v10
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_lshl_or_b32 v4, v9, 16, v4
; GFX12-NEXT: v_lshl_or_b32 v5, v11, 16, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1]
; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%C = load <8 x half>, ptr %Caddr
%C_shuffle = shufflevector <8 x half> %C, <8 x half> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
%fneg.C_shuffle = fneg <4 x half> %C_shuffle
%res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fneg.C_shuffle, i1 0)
store <4 x half> %res, <4 x half> addrspace(1)* %out
ret void
}
declare <4 x half> @llvm.fabs.v4f16(<4 x half>)
declare <4 x float> @llvm.fabs.v4f32(<4 x float>)
declare float @llvm.fabs.f32(float)
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half>, <4 x half>, <4 x float>)
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4i16.v4f32(<4 x i16>, <4 x i16>, <4 x float>)
declare <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half>, <4 x half>, <4 x half>, i1 immarg)
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.i32.v4f32(i32, i32, <4 x float>)
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.i32.v4f32(i32, i32, <4 x float>)
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.i32.v4f32(i32, i32, <4 x float>)
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.i32.v4f32(i32, i32, <4 x float>)
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f16.v8f16.v4f32.i16(<4 x half>, <8 x half>, <4 x float>, i16)
declare <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v8f16.v4f16.i16(<4 x half>, <8 x half>, <4 x half>, i16)

@@ -0,0 +1,430 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], 1.0
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
store <4 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm_non_inlineable(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm_non_inlineable:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: s_mov_b32 s0, 0x40400000
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_mov_b32 s3, s0
; GFX12-NEXT: s_mov_b32 s1, s0
; GFX12-NEXT: s_mov_b32 s2, s0
; GFX12-NEXT: v_mov_b32_e32 v9, s3
; GFX12-NEXT: v_mov_b32_e32 v8, s2
; GFX12-NEXT: v_mov_b32_e32 v7, s1
; GFX12-NEXT: v_mov_b32_e32 v6, s0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], v[6:9]
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
store <4 x float> %res, ptr addrspace(1) %out
ret void
}
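; 1.0 has an inline-constant encoding, so the imm test above uses it directly
; as the wmma's C operand; 3.0 (0x40400000) does not, so the non-inlineable
; variant must first materialize the accumulator in v[6:9] via s_mov/v_mov.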
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[6:9], v[0:1], v[2:3], 1.0
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
store <4 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm_non_inlineable(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm_non_inlineable:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: s_mov_b32 s0, 0x40400000
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_mov_b32 s3, s0
; GFX12-NEXT: s_mov_b32 s1, s0
; GFX12-NEXT: s_mov_b32 s2, s0
; GFX12-NEXT: v_mov_b32_e32 v9, s3
; GFX12-NEXT: v_mov_b32_e32 v8, s2
; GFX12-NEXT: v_mov_b32_e32 v7, s1
; GFX12-NEXT: v_mov_b32_e32 v6, s0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[6:9], v[0:1], v[2:3], v[6:9]
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
store <4 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], 1.0
; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half> %A, <4 x half> %B, <4 x half> <half 1.0, half 1.0, half 1.0, half 1.0>, i1 0)
store <4 x half> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm_non_inlineable(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm_non_inlineable:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: s_mov_b32 s0, 0x42004200
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_mov_b32 s1, s0
; GFX12-NEXT: v_mov_b32_e32 v7, s1
; GFX12-NEXT: v_mov_b32_e32 v6, s0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], v[6:7]
; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half> %A, <4 x half> %B, <4 x half> <half 3.0, half 3.0, half 3.0, half 3.0>, i1 0)
store <4 x half> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: s_mov_b32 s0, 0x3f803f80
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_mov_b32 s1, s0
; GFX12-NEXT: v_mov_b32_e32 v7, s1
; GFX12-NEXT: v_mov_b32_e32 v6, s0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[6:7], v[0:1], v[2:3], v[6:7]
; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> <i16 16256, i16 16256, i16 16256, i16 16256>, i1 0)
store <4 x i16> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm_non_inlineable(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm_non_inlineable:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: s_mov_b32 s0, 0x3fc03fc0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_mov_b32 s1, s0
; GFX12-NEXT: v_mov_b32_e32 v7, s1
; GFX12-NEXT: v_mov_b32_e32 v6, s0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[6:7], v[0:1], v[2:3], v[6:7]
; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> <i16 16320, i16 16320, i16 16320, i16 16320>, i1 0)
store <4 x i16> %res, ptr addrspace(1) %out
ret void
}
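; Unlike f16, the bf16 accumulator immediates are never inlined here: both 1.0
; (i16 16256 = 0x3F80, packed 0x3F803F80) and the non-inlineable 0x3FC0 splat
; are materialized through registers, presumably because the packed bf16 bit
; patterns have no inline-constant encoding.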
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm(i32 %A, i32 %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:7], v0, v1, 1
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 1, i32 1, i32 1, i32 1>, i1 0)
store <4 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm_non_inlineable(i32 %A, i32 %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm_non_inlineable:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: s_movk_i32 s0, 0x80
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_mov_b32 s3, s0
; GFX12-NEXT: s_mov_b32 s1, s0
; GFX12-NEXT: s_mov_b32 s2, s0
; GFX12-NEXT: v_mov_b32_e32 v7, s3
; GFX12-NEXT: v_mov_b32_e32 v6, s2
; GFX12-NEXT: v_mov_b32_e32 v5, s1
; GFX12-NEXT: v_mov_b32_e32 v4, s0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:7], v0, v1, v[4:7]
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 128, i32 128, i32 128, i32 128>, i1 0)
store <4 x i32> %res, ptr addrspace(1) %out
ret void
}
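; For the integer variants, 1 is an inline constant but 128 (0x80) falls
; outside the inlineable integer range (-16..64), so the accumulator is built
; up in v[4:7] first.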
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm(i32 %A, i32 %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v0, v1, 1
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 1, i32 1, i32 1, i32 1>, i1 0)
store <4 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm_non_inlineable(i32 %A, i32 %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm_non_inlineable:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: s_movk_i32 s0, 0x80
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_mov_b32 s3, s0
; GFX12-NEXT: s_mov_b32 s1, s0
; GFX12-NEXT: s_mov_b32 s2, s0
; GFX12-NEXT: v_mov_b32_e32 v7, s3
; GFX12-NEXT: v_mov_b32_e32 v6, s2
; GFX12-NEXT: v_mov_b32_e32 v5, s1
; GFX12-NEXT: v_mov_b32_e32 v4, s0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v0, v1, v[4:7]
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 128, i32 128, i32 128, i32 128>, i1 0)
store <4 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:7], v0, v1, 1.0
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
store <4 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: s_mov_b32 s0, 0x40400000
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_mov_b32 s3, s0
; GFX12-NEXT: s_mov_b32 s1, s0
; GFX12-NEXT: s_mov_b32 s2, s0
; GFX12-NEXT: v_mov_b32_e32 v7, s3
; GFX12-NEXT: v_mov_b32_e32 v6, s2
; GFX12-NEXT: v_mov_b32_e32 v5, s1
; GFX12-NEXT: v_mov_b32_e32 v4, s0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:7], v0, v1, v[4:7]
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
store <4 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:7], v0, v1, 1.0
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
store <4 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: s_mov_b32 s0, 0x40400000
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_mov_b32 s3, s0
; GFX12-NEXT: s_mov_b32 s1, s0
; GFX12-NEXT: s_mov_b32 s2, s0
; GFX12-NEXT: v_mov_b32_e32 v7, s3
; GFX12-NEXT: v_mov_b32_e32 v6, s2
; GFX12-NEXT: v_mov_b32_e32 v5, s1
; GFX12-NEXT: v_mov_b32_e32 v4, s0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:7], v0, v1, v[4:7]
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
store <4 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:7], v0, v1, 1.0
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
store <4 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: s_mov_b32 s0, 0x40400000
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_mov_b32 s3, s0
; GFX12-NEXT: s_mov_b32 s1, s0
; GFX12-NEXT: s_mov_b32 s2, s0
; GFX12-NEXT: v_mov_b32_e32 v7, s3
; GFX12-NEXT: v_mov_b32_e32 v6, s2
; GFX12-NEXT: v_mov_b32_e32 v5, s1
; GFX12-NEXT: v_mov_b32_e32 v4, s0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:7], v0, v1, v[4:7]
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
store <4 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:7], v0, v1, 1.0
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
store <4 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: s_mov_b32 s0, 0x40400000
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_mov_b32 s3, s0
; GFX12-NEXT: s_mov_b32 s1, s0
; GFX12-NEXT: s_mov_b32 s2, s0
; GFX12-NEXT: v_mov_b32_e32 v7, s3
; GFX12-NEXT: v_mov_b32_e32 v6, s2
; GFX12-NEXT: v_mov_b32_e32 v5, s1
; GFX12-NEXT: v_mov_b32_e32 v4, s0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:7], v0, v1, v[4:7]
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
store <4 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:7], v0, v1, 1
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 1, i32 1, i32 1, i32 1>, i1 0)
store <4 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm_non_inlineable(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm_non_inlineable:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: s_movk_i32 s0, 0x80
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_mov_b32 s3, s0
; GFX12-NEXT: s_mov_b32 s1, s0
; GFX12-NEXT: s_mov_b32 s2, s0
; GFX12-NEXT: v_mov_b32_e32 v7, s3
; GFX12-NEXT: v_mov_b32_e32 v6, s2
; GFX12-NEXT: v_mov_b32_e32 v5, s1
; GFX12-NEXT: v_mov_b32_e32 v4, s0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:7], v0, v1, v[4:7]
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 128, i32 128, i32 128, i32 128>, i1 0)
store <4 x i32> %res, ptr addrspace(1) %out
ret void
}
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half>, <4 x half>, <4 x float>)
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16>, <4 x i16>, <4 x float>)
declare <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half>, <4 x half>, <4 x half>, i1 immarg)
declare <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16>, <4 x i16>, <4 x i16>, i1 immarg)
declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg)
declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg)
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>)
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>)
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>)
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>)
declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg)

@@ -0,0 +1,274 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src0(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src0:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0]
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
store <4 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src1(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src1:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0]
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i1 0)
store <4 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_clamp(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_clamp:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] clamp
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 1)
store <4 x i32> %res, ptr addrspace(1) %out
ret void
}
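; For the integer wmma, the per-source zero-extend immarg (i1 1 in the tests
; above) is encoded in the corresponding neg_lo bit (neg_lo:[1,0,0] for src0,
; neg_lo:[0,1,0] for src1), and the trailing i1 immarg selects the clamp
; modifier, as the checks show.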
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src0(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src0:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0]
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
store <4 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src1(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src1:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0]
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i1 0)
store <4 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_clamp(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_clamp:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] clamp
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 1)
store <4 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src0(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src0:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0]
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
store <4 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src1(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src1:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0]
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i1 0)
store <4 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_clamp(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_clamp:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] clamp
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 1)
store <4 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src0(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src0:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0]
; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 1, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 0)
store <4 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src1(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src1:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0]
; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 1, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 0)
store <4 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_clamp(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_clamp:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 clamp
; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 1)
store <4 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src0(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src0:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[1,0,0]
; GFX12-NEXT: global_store_b128 v[7:8], v[2:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index, i1 0)
store <4 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src1(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src1:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[0,1,0]
; GFX12-NEXT: global_store_b128 v[7:8], v[2:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i16 %Index, i1 0)
store <4 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_clamp(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_clamp:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 clamp
; GFX12-NEXT: global_store_b128 v[7:8], v[2:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index, i1 1)
store <4 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src0(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src0:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0]
; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 1, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 0)
store <4 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src1(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src1:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0]
; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 1, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 0)
store <4 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_clamp(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_clamp:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 clamp
; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 1)
store <4 x i32> %res, ptr addrspace(1) %out
ret void
}
declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg)
declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg)
declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg)
declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i8 %Index, i1 immarg)
declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i16 %Index, i1 immarg)
declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i16 %Index, i1 immarg)

@@ -0,0 +1,472 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_index_key(<4 x half> %A, <8 x half> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v10, v[10:11], off
; GFX12-NEXT: v_mov_b32_e32 v23, v9
; GFX12-NEXT: v_mov_b32_e32 v22, v8
; GFX12-NEXT: v_mov_b32_e32 v21, v7
; GFX12-NEXT: v_mov_b32_e32 v20, v6
; GFX12-NEXT: v_mov_b32_e32 v27, v9
; GFX12-NEXT: v_mov_b32_e32 v26, v8
; GFX12-NEXT: v_mov_b32_e32 v25, v7
; GFX12-NEXT: v_mov_b32_e32 v24, v6
; GFX12-NEXT: v_mov_b32_e32 v31, v9
; GFX12-NEXT: v_mov_b32_e32 v30, v8
; GFX12-NEXT: v_mov_b32_e32 v29, v7
; GFX12-NEXT: v_mov_b32_e32 v28, v6
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[20:23], v[0:1], v[2:5], v10
; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[24:27], v[0:1], v[2:5], v10 index_key:1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[28:31], v[0:1], v[2:5], v10 index_key:2
; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:3
; GFX12-NEXT: global_store_b128 v[12:13], v[20:23], off
; GFX12-NEXT: global_store_b128 v[14:15], v[24:27], off
; GFX12-NEXT: global_store_b128 v[16:17], v[28:31], off
; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
%Index0 = extractelement <4 x i8> %IndexVec, i32 0
%res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index0)
store <4 x float> %res0, ptr addrspace(1) %out0
%Index1 = extractelement <4 x i8> %IndexVec, i32 1
%res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index1)
store <4 x float> %res1, ptr addrspace(1) %out1
%Index2 = extractelement <4 x i8> %IndexVec, i32 2
%res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index2)
store <4 x float> %res2, ptr addrspace(1) %out2
%Index3 = extractelement <4 x i8> %IndexVec, i32 3
%res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index3)
store <4 x float> %res3, ptr addrspace(1) %out3
ret void
}
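; A single global_load_b32 supplies all four 8-bit indices; extracting byte N
; of the index vector selects index_key:N on the same index register, so the
; four swmmac ops above share one load instead of reloading per index.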
define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16_index_key(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v10, v[10:11], off
; GFX12-NEXT: v_mov_b32_e32 v23, v9
; GFX12-NEXT: v_mov_b32_e32 v22, v8
; GFX12-NEXT: v_mov_b32_e32 v21, v7
; GFX12-NEXT: v_mov_b32_e32 v20, v6
; GFX12-NEXT: v_mov_b32_e32 v27, v9
; GFX12-NEXT: v_mov_b32_e32 v26, v8
; GFX12-NEXT: v_mov_b32_e32 v25, v7
; GFX12-NEXT: v_mov_b32_e32 v24, v6
; GFX12-NEXT: v_mov_b32_e32 v31, v9
; GFX12-NEXT: v_mov_b32_e32 v30, v8
; GFX12-NEXT: v_mov_b32_e32 v29, v7
; GFX12-NEXT: v_mov_b32_e32 v28, v6
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[20:23], v[0:1], v[2:5], v10
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[24:27], v[0:1], v[2:5], v10 index_key:1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[28:31], v[0:1], v[2:5], v10 index_key:2
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:3
; GFX12-NEXT: global_store_b128 v[12:13], v[20:23], off
; GFX12-NEXT: global_store_b128 v[14:15], v[24:27], off
; GFX12-NEXT: global_store_b128 v[16:17], v[28:31], off
; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
%Index0 = extractelement <4 x i8> %IndexVec, i32 0
%res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index0)
store <4 x float> %res0, ptr addrspace(1) %out0
%Index1 = extractelement <4 x i8> %IndexVec, i32 1
%res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index1)
store <4 x float> %res1, ptr addrspace(1) %out1
%Index2 = extractelement <4 x i8> %IndexVec, i32 2
%res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index2)
store <4 x float> %res2, ptr addrspace(1) %out2
%Index3 = extractelement <4 x i8> %IndexVec, i32 3
%res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index3)
store <4 x float> %res3, ptr addrspace(1) %out3
ret void
}
define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_index_key(<4 x half> %A, <8 x half> %B, <4 x half> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v22, v[8:9], off
; GFX12-NEXT: v_mov_b32_e32 v9, v7
; GFX12-NEXT: v_mov_b32_e32 v8, v6
; GFX12-NEXT: v_mov_b32_e32 v19, v7
; GFX12-NEXT: v_mov_b32_e32 v18, v6
; GFX12-NEXT: v_mov_b32_e32 v21, v7
; GFX12-NEXT: v_mov_b32_e32 v20, v6
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[8:9], v[0:1], v[2:5], v22
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[18:19], v[0:1], v[2:5], v22 index_key:1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[20:21], v[0:1], v[2:5], v22 index_key:2
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v22 index_key:3
; GFX12-NEXT: global_store_b64 v[10:11], v[8:9], off
; GFX12-NEXT: global_store_b64 v[12:13], v[18:19], off
; GFX12-NEXT: global_store_b64 v[14:15], v[20:21], off
; GFX12-NEXT: global_store_b64 v[16:17], v[6:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
%Index0 = extractelement <4 x i8> %IndexVec, i32 0
%res0 = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index0)
store <4 x half> %res0, ptr addrspace(1) %out0
%Index1 = extractelement <4 x i8> %IndexVec, i32 1
%res1 = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index1)
store <4 x half> %res1, ptr addrspace(1) %out1
%Index2 = extractelement <4 x i8> %IndexVec, i32 2
%res2 = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index2)
store <4 x half> %res2, ptr addrspace(1) %out2
%Index3 = extractelement <4 x i8> %IndexVec, i32 3
%res3 = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index3)
store <4 x half> %res3, ptr addrspace(1) %out3
ret void
}
define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16_index_key(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v22, v[8:9], off
; GFX12-NEXT: v_mov_b32_e32 v9, v7
; GFX12-NEXT: v_mov_b32_e32 v8, v6
; GFX12-NEXT: v_mov_b32_e32 v19, v7
; GFX12-NEXT: v_mov_b32_e32 v18, v6
; GFX12-NEXT: v_mov_b32_e32 v21, v7
; GFX12-NEXT: v_mov_b32_e32 v20, v6
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[8:9], v[0:1], v[2:5], v22
; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[18:19], v[0:1], v[2:5], v22 index_key:1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[20:21], v[0:1], v[2:5], v22 index_key:2
; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v22 index_key:3
; GFX12-NEXT: global_store_b64 v[10:11], v[8:9], off
; GFX12-NEXT: global_store_b64 v[12:13], v[18:19], off
; GFX12-NEXT: global_store_b64 v[14:15], v[20:21], off
; GFX12-NEXT: global_store_b64 v[16:17], v[6:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
%Index0 = extractelement <4 x i8> %IndexVec, i32 0
%res0 = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index0)
store <4 x i16> %res0, ptr addrspace(1) %out0
%Index1 = extractelement <4 x i8> %IndexVec, i32 1
%res1 = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index1)
store <4 x i16> %res1, ptr addrspace(1) %out1
%Index2 = extractelement <4 x i8> %IndexVec, i32 2
%res2 = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index2)
store <4 x i16> %res2, ptr addrspace(1) %out2
%Index3 = extractelement <4 x i8> %IndexVec, i32 3
%res3 = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index3)
store <4 x i16> %res3, ptr addrspace(1) %out3
ret void
}
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_index_key(i32 %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v7, v[7:8], off
; GFX12-NEXT: v_mov_b32_e32 v20, v6
; GFX12-NEXT: v_mov_b32_e32 v19, v5
; GFX12-NEXT: v_mov_b32_e32 v18, v4
; GFX12-NEXT: v_mov_b32_e32 v17, v3
; GFX12-NEXT: v_mov_b32_e32 v24, v6
; GFX12-NEXT: v_mov_b32_e32 v23, v5
; GFX12-NEXT: v_mov_b32_e32 v22, v4
; GFX12-NEXT: v_mov_b32_e32 v21, v3
; GFX12-NEXT: v_mov_b32_e32 v28, v6
; GFX12-NEXT: v_mov_b32_e32 v27, v5
; GFX12-NEXT: v_mov_b32_e32 v26, v4
; GFX12-NEXT: v_mov_b32_e32 v25, v3
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[17:20], v0, v[1:2], v7
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[21:24], v0, v[1:2], v7 index_key:1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[25:28], v0, v[1:2], v7 index_key:2
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:3
; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off
; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off
; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off
; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
%Index0 = extractelement <4 x i8> %IndexVec, i32 0
%res0 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index0, i1 0)
store <4 x i32> %res0, ptr addrspace(1) %out0
%Index1 = extractelement <4 x i8> %IndexVec, i32 1
%res1 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index1, i1 0)
store <4 x i32> %res1, ptr addrspace(1) %out1
%Index2 = extractelement <4 x i8> %IndexVec, i32 2
%res2 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index2, i1 0)
store <4 x i32> %res2, ptr addrspace(1) %out2
%Index3 = extractelement <4 x i8> %IndexVec, i32 3
%res3 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index3, i1 0)
store <4 x i32> %res3, ptr addrspace(1) %out3
ret void
}
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_index_key(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v6, v[6:7], off
; GFX12-NEXT: v_mov_b32_e32 v15, v5
; GFX12-NEXT: v_mov_b32_e32 v14, v4
; GFX12-NEXT: v_mov_b32_e32 v13, v3
; GFX12-NEXT: v_mov_b32_e32 v12, v2
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[12:15], v0, v1, v6
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 index_key:1
; GFX12-NEXT: global_store_b128 v[8:9], v[12:15], off
; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
%Index0 = extractelement <2 x i16> %IndexVec, i32 0
%res0 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index0, i1 0)
store <4 x i32> %res0, ptr addrspace(1) %out0
%Index1 = extractelement <2 x i16> %IndexVec, i32 1
%res1 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index1, i1 0)
store <4 x i32> %res1, ptr addrspace(1) %out1
ret void
}
define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_index_key(i32 %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v7, v[7:8], off
; GFX12-NEXT: v_mov_b32_e32 v16, v6
; GFX12-NEXT: v_mov_b32_e32 v15, v5
; GFX12-NEXT: v_mov_b32_e32 v14, v4
; GFX12-NEXT: v_mov_b32_e32 v13, v3
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[13:16], v0, v[1:2], v7
; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 index_key:1
; GFX12-NEXT: global_store_b128 v[9:10], v[13:16], off
; GFX12-NEXT: global_store_b128 v[11:12], v[3:6], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
%Index0 = extractelement <2 x i16> %IndexVec, i32 0
%res0 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index0, i1 0)
store <4 x i32> %res0, ptr addrspace(1) %out0
%Index1 = extractelement <2 x i16> %IndexVec, i32 1
%res1 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index1, i1 0)
store <4 x i32> %res1, ptr addrspace(1) %out1
ret void
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v7, v[7:8], off
; GFX12-NEXT: v_mov_b32_e32 v20, v6
; GFX12-NEXT: v_mov_b32_e32 v19, v5
; GFX12-NEXT: v_mov_b32_e32 v18, v4
; GFX12-NEXT: v_mov_b32_e32 v17, v3
; GFX12-NEXT: v_mov_b32_e32 v24, v6
; GFX12-NEXT: v_mov_b32_e32 v23, v5
; GFX12-NEXT: v_mov_b32_e32 v22, v4
; GFX12-NEXT: v_mov_b32_e32 v21, v3
; GFX12-NEXT: v_mov_b32_e32 v28, v6
; GFX12-NEXT: v_mov_b32_e32 v27, v5
; GFX12-NEXT: v_mov_b32_e32 v26, v4
; GFX12-NEXT: v_mov_b32_e32 v25, v3
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[17:20], v0, v[1:2], v7
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[21:24], v0, v[1:2], v7 index_key:1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[25:28], v0, v[1:2], v7 index_key:2
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:3
; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off
; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off
; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off
; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
%Index0 = extractelement <4 x i8> %IndexVec, i32 0
%res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index0)
store <4 x float> %res0, ptr addrspace(1) %out0
%Index1 = extractelement <4 x i8> %IndexVec, i32 1
%res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index1)
store <4 x float> %res1, ptr addrspace(1) %out1
%Index2 = extractelement <4 x i8> %IndexVec, i32 2
%res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index2)
store <4 x float> %res2, ptr addrspace(1) %out2
%Index3 = extractelement <4 x i8> %IndexVec, i32 3
%res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index3)
store <4 x float> %res3, ptr addrspace(1) %out3
ret void
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v7, v[7:8], off
; GFX12-NEXT: v_mov_b32_e32 v20, v6
; GFX12-NEXT: v_mov_b32_e32 v19, v5
; GFX12-NEXT: v_mov_b32_e32 v18, v4
; GFX12-NEXT: v_mov_b32_e32 v17, v3
; GFX12-NEXT: v_mov_b32_e32 v24, v6
; GFX12-NEXT: v_mov_b32_e32 v23, v5
; GFX12-NEXT: v_mov_b32_e32 v22, v4
; GFX12-NEXT: v_mov_b32_e32 v21, v3
; GFX12-NEXT: v_mov_b32_e32 v28, v6
; GFX12-NEXT: v_mov_b32_e32 v27, v5
; GFX12-NEXT: v_mov_b32_e32 v26, v4
; GFX12-NEXT: v_mov_b32_e32 v25, v3
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[17:20], v0, v[1:2], v7
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[21:24], v0, v[1:2], v7 index_key:1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[25:28], v0, v[1:2], v7 index_key:2
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:3
; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off
; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off
; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off
; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
%Index0 = extractelement <4 x i8> %IndexVec, i32 0
%res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index0)
store <4 x float> %res0, ptr addrspace(1) %out0
%Index1 = extractelement <4 x i8> %IndexVec, i32 1
%res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index1)
store <4 x float> %res1, ptr addrspace(1) %out1
%Index2 = extractelement <4 x i8> %IndexVec, i32 2
%res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index2)
store <4 x float> %res2, ptr addrspace(1) %out2
%Index3 = extractelement <4 x i8> %IndexVec, i32 3
%res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index3)
store <4 x float> %res3, ptr addrspace(1) %out3
ret void
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v7, v[7:8], off
; GFX12-NEXT: v_mov_b32_e32 v20, v6
; GFX12-NEXT: v_mov_b32_e32 v19, v5
; GFX12-NEXT: v_mov_b32_e32 v18, v4
; GFX12-NEXT: v_mov_b32_e32 v17, v3
; GFX12-NEXT: v_mov_b32_e32 v24, v6
; GFX12-NEXT: v_mov_b32_e32 v23, v5
; GFX12-NEXT: v_mov_b32_e32 v22, v4
; GFX12-NEXT: v_mov_b32_e32 v21, v3
; GFX12-NEXT: v_mov_b32_e32 v28, v6
; GFX12-NEXT: v_mov_b32_e32 v27, v5
; GFX12-NEXT: v_mov_b32_e32 v26, v4
; GFX12-NEXT: v_mov_b32_e32 v25, v3
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[17:20], v0, v[1:2], v7
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[21:24], v0, v[1:2], v7 index_key:1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[25:28], v0, v[1:2], v7 index_key:2
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:3
; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off
; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off
; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off
; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
%Index0 = extractelement <4 x i8> %IndexVec, i32 0
%res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index0)
store <4 x float> %res0, ptr addrspace(1) %out0
%Index1 = extractelement <4 x i8> %IndexVec, i32 1
%res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index1)
store <4 x float> %res1, ptr addrspace(1) %out1
%Index2 = extractelement <4 x i8> %IndexVec, i32 2
%res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index2)
store <4 x float> %res2, ptr addrspace(1) %out2
%Index3 = extractelement <4 x i8> %IndexVec, i32 3
%res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index3)
store <4 x float> %res3, ptr addrspace(1) %out3
ret void
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v7, v[7:8], off
; GFX12-NEXT: v_mov_b32_e32 v20, v6
; GFX12-NEXT: v_mov_b32_e32 v19, v5
; GFX12-NEXT: v_mov_b32_e32 v18, v4
; GFX12-NEXT: v_mov_b32_e32 v17, v3
; GFX12-NEXT: v_mov_b32_e32 v24, v6
; GFX12-NEXT: v_mov_b32_e32 v23, v5
; GFX12-NEXT: v_mov_b32_e32 v22, v4
; GFX12-NEXT: v_mov_b32_e32 v21, v3
; GFX12-NEXT: v_mov_b32_e32 v28, v6
; GFX12-NEXT: v_mov_b32_e32 v27, v5
; GFX12-NEXT: v_mov_b32_e32 v26, v4
; GFX12-NEXT: v_mov_b32_e32 v25, v3
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[17:20], v0, v[1:2], v7
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[21:24], v0, v[1:2], v7 index_key:1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[25:28], v0, v[1:2], v7 index_key:2
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:3
; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off
; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off
; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off
; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
%Index0 = extractelement <4 x i8> %IndexVec, i32 0
%res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index0)
store <4 x float> %res0, ptr addrspace(1) %out0
%Index1 = extractelement <4 x i8> %IndexVec, i32 1
%res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index1)
store <4 x float> %res1, ptr addrspace(1) %out1
%Index2 = extractelement <4 x i8> %IndexVec, i32 2
%res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index2)
store <4 x float> %res2, ptr addrspace(1) %out2
%Index3 = extractelement <4 x i8> %IndexVec, i32 3
%res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index3)
store <4 x float> %res3, ptr addrspace(1) %out3
ret void
}
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half>, <8 x half>, <4 x float>, i8)
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16>, <8 x i16>, <4 x float>, i8)
declare <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half>, <8 x half>, <4 x half>, i8)
declare <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16>, <8 x i16>, <4 x i16>, i8)
declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i8 %Index, i1 immarg)
declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i16 %Index, i1 immarg)
declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i16 %Index, i1 immarg)
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8)
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8)
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8)
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8)
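; Note on the index_key tests above: the index vector is loaded as a single
; dword, and every extracted element is matched to that same register with a
; different index_key modifier (index_key:0..3 appears to select one of the
; four bytes for 8-bit indices, index_key:0..1 one of the two halves for
; 16-bit indices). One loaded index register therefore feeds several
; v_swmmac instructions instead of being reloaded or shifted per call.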


@@ -0,0 +1,333 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7]
; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %C)
store <4 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf16:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7]
; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> %C)
store <4 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f16_16x16x16_f16(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5]
; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half> %A, <4 x half> %B, <4 x half> %C, i1 0)
store <4 x half> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5]
; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, i1 0)
store <4 x i16> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x16_iu8:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5]
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
store <4 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x16_iu4:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5]
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
store <4 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5]
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> %C)
store <4 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5]
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> %C)
store <4 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5]
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> %C)
store <4 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5]
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> %C)
store <4 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x32_iu4:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5]
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
store <4 x i32> %res, ptr addrspace(1) %out
ret void
}
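; The swmmac tests below exercise the sparse variants. Compared with the wmma
; intrinsics above, the A operand keeps its size while B doubles and K doubles
; with it (16x16x16 becomes 16x16x32, and 16x16x32_iu4 becomes 16x16x64), and
; a trailing i8 or i16 Index operand is added. Our reading, from the SWMMAC
; naming and the operand shapes pinned here, is that A carries the values of a
; structured-sparse matrix and Index carries the selection metadata; the tests
; themselves only fix the shapes and register classes.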
define amdgpu_ps void @test_swmmac_f32_16x16x32_f16(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_f16:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10
; GFX12-NEXT: global_store_b128 v[11:12], v[6:9], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index)
store <4 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10
; GFX12-NEXT: global_store_b128 v[11:12], v[6:9], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index)
store <4 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_swmmac_f16_16x16x32_f16(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f16_16x16x32_f16:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8
; GFX12-NEXT: global_store_b64 v[9:10], v[6:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index)
store <4 x half> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8
; GFX12-NEXT: global_store_b64 v[9:10], v[6:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index)
store <4 x i16> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7
; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 0)
store <4 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6
; GFX12-NEXT: global_store_b128 v[7:8], v[2:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index, i1 0)
store <4 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7
; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 0)
store <4 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7
; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index)
store <4 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7
; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index)
store <4 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7
; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index)
store <4 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7
; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index)
store <4 x float> %res, ptr addrspace(1) %out
ret void
}
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half>, <4 x half>, <4 x float>)
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16>, <4 x i16>, <4 x float>)
declare <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half>, <4 x half>, <4 x half>, i1 immarg)
declare <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16>, <4 x i16>, <4 x i16>, i1 immarg)
declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg)
declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg)
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>)
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>)
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>)
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>)
declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg)
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half>, <8 x half>, <4 x float>, i8)
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16>, <8 x i16>, <4 x float>, i8)
declare <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half>, <8 x half>, <4 x half>, i8)
declare <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16>, <8 x i16>, <4 x i16>, i8)
declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i8 %Index, i1 immarg)
declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i16 %Index, i1 immarg)
declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i16 %Index, i1 immarg)
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8)
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8)
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8)
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8)
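; The i1 immarg operands on the iu8/iu4 wmma and swmmac intrinsics select
; signed or unsigned interpretation of A and B and output clamping; the tests
; above only exercise the all-unsigned, unclamped case. The function below is
; an illustrative sketch of the signed, clamped form (the function name is
; ours and it carries no CHECK lines; the operand meaning follows the GFX11
; iu8 intrinsics and is an assumption here):
define amdgpu_ps void @sketch_wmma_i32_16x16x16_iu8_signed_clamp(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
bb:
  ; i1 1 on A and B requests signed inputs; the final i1 1 requests clamping.
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 1, i32 %A, i1 1, i32 %B, <4 x i32> %C, i1 1)
  store <4 x i32> %res, ptr addrspace(1) %out
  ret void
}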


@@ -0,0 +1,499 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.A = fneg <8 x half> %A
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %fneg.A, <8 x half> %B, <8 x float> %C)
store <8 x float> %res, <8 x float> addrspace(1)* %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negB(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negB:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] neg_hi:[0,1,0]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.B = fneg <8 x half> %B
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %fneg.B, <8 x float> %C)
store <8 x float> %res, <8 x float> addrspace(1)* %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negC(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.C = fneg <8 x float> %C
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %fneg.C)
store <8 x float> %res, <8 x float> addrspace(1)* %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_absC(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_absC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %fabs.C)
store <8 x float> %res, <8 x float> addrspace(1)* %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_negC(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_negC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.C = fneg <8 x float> %C
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> %fneg.C)
store <8 x float> %res, <8 x float> addrspace(1)* %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_absC(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_absC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> %fabs.C)
store <8 x float> %res, <8 x float> addrspace(1)* %out
ret void
}
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negA(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negA:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.A = fneg <8 x half> %A
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %fneg.A, <8 x half> %B, <8 x half> %C, i1 0)
store <8 x half> %res, <8 x half> addrspace(1)* %out
ret void
}
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] neg_hi:[0,1,0]
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.B = fneg <8 x half> %B
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %fneg.B, <8 x half> %C, i1 0)
store <8 x half> %res, <8 x half> addrspace(1)* %out
ret void
}
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1]
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.C = fneg <8 x half> %C
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fneg.C, i1 0)
store <8 x half> %res, <8 x half> addrspace(1)* %out
ret void
}
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_absC(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_absC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,0,1]
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fabs.C = call <8 x half> @llvm.fabs.v8f16(<8 x half> %C)
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fabs.C, i1 0)
store <8 x half> %res, <8 x half> addrspace(1)* %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_negC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.C = fneg <8 x float> %C
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C)
store <8 x float> %res, <8 x float> addrspace(1)* %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_absC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C)
store <8 x float> %res, <8 x float> addrspace(1)* %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_negC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.C = fneg <8 x float> %C
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C)
store <8 x float> %res, <8 x float> addrspace(1)* %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_absC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C)
store <8 x float> %res, <8 x float> addrspace(1)* %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_negC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.C = fneg <8 x float> %C
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C)
store <8 x float> %res, <8 x float> addrspace(1)* %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_absC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C)
store <8 x float> %res, <8 x float> addrspace(1)* %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_negC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.C = fneg <8 x float> %C
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C)
store <8 x float> %res, <8 x float> addrspace(1)* %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_absC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C)
store <8 x float> %res, <8 x float> addrspace(1)* %out
ret void
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negA(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, <8 x float> addrspace(1)* %out) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negA:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16
; GFX12-NEXT: global_store_b128 v[21:22], v[12:15], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.A = fneg <8 x half> %A
%res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f16.v16f16.v8f32.i16(<8 x half> %fneg.A, <16 x half> %B, <8 x float> %C, i16 %Index)
store <8 x float> %res, <8 x float> addrspace(1)* %out
ret void
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negB(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, <8 x float> addrspace(1)* %out) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negB:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,1,0] neg_hi:[0,1,0]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16
; GFX12-NEXT: global_store_b128 v[21:22], v[12:15], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.B = fneg <16 x half> %B
%res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f16.v16f16.v8f32.i16(<8 x half> %A, <16 x half> %fneg.B, <8 x float> %C, i16 %Index)
store <8 x float> %res, <8 x float> addrspace(1)* %out
ret void
}
define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negA(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, <8 x half> addrspace(1)* %out) {
; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negA:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX12-NEXT: global_store_b128 v[17:18], v[12:15], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.A = fneg <8 x half> %A
%res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v16f16.v8f16.i16(<8 x half> %fneg.A, <16 x half> %B, <8 x half> %C, i16 %Index)
store <8 x half> %res, <8 x half> addrspace(1)* %out
ret void
}
define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negB(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, <8 x half> addrspace(1)* %out) {
; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negB:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,1,0] neg_hi:[0,1,0]
; GFX12-NEXT: global_store_b128 v[17:18], v[12:15], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.B = fneg <16 x half> %B
%res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v16f16.v8f16.i16(<8 x half> %A, <16 x half> %fneg.B, <8 x half> %C, i16 %Index)
store <8 x half> %res, <8 x half> addrspace(1)* %out
ret void
}
; both neg and abs patterns (wmma matrix C, f32 or f16)
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negabsC(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negabsC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] neg_hi:[0,0,1]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
%fneg.fabs.C = fneg <8 x float> %fabs.C
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %fneg.fabs.C)
store <8 x float> %res, <8 x float> addrspace(1)* %out
ret void
}
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negabsC(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negabsC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] neg_hi:[0,0,1]
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fabs.C = call <8 x half> @llvm.fabs.v8f16(<8 x half> %C)
%fneg.fabs.C = fneg <8 x half> %fabs.C
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fneg.fabs.C, i1 0)
store <8 x half> %res, <8 x half> addrspace(1)* %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_neg_partial_fabsA(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_neg_partial_fabsA:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_and_b32_e32 v11, 0x7fffffff, v11
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%el3 = extractelement <8 x float> %C, i32 3
%el3.fabs = call float @llvm.fabs.f32(float %el3)
%partial.fabs.C = insertelement <8 x float> %C, float %el3.fabs, i32 3
%fneg.partial.fabs.C = fneg <8 x float> %partial.fabs.C
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %fneg.partial.fabs.C)
store <8 x float> %res, <8 x float> addrspace(1)* %out
ret void
}
; A or B matrix modifier and constant in C
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA_constantC(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA_constantC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], 1.0 neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16
; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.A = fneg <8 x half> %A
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %fneg.A, <8 x half> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
store <8 x float> %res, <8 x float> addrspace(1)* %out
ret void
}
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB_constantC(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB_constantC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], 1.0 neg_lo:[0,1,0] neg_hi:[0,1,0]
; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.B = fneg <8 x half> %B
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %fneg.B, <8 x half> <half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0>, i1 0)
store <8 x half> %res, <8 x half> addrspace(1)* %out
ret void
}
; pack f16 elements with v_perm_b32 since they don't come from the same b32
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC_pack(<8 x half> %A, <8 x half> %B, ptr %Caddr, <8 x half> addrspace(1)* %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC_pack:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: flat_load_b128 v[12:15], v[8:9] offset:16
; GFX12-NEXT: flat_load_b128 v[16:19], v[8:9]
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x101
; GFX12-NEXT: v_perm_b32 v15, v15, v14, 0x5040100
; GFX12-NEXT: v_perm_b32 v14, v13, v12, 0x5040100
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: v_perm_b32 v13, v19, v18, 0x5040100
; GFX12-NEXT: v_perm_b32 v12, v17, v16, 0x5040100
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[12:15], v[0:3], v[4:7], v[12:15] neg_lo:[0,0,1]
; GFX12-NEXT: global_store_b128 v[10:11], v[12:15], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%C = load <16 x half>, ptr %Caddr
%C_shuffle = shufflevector <16 x half> %C, <16 x half> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
%fneg.C_shuffle = fneg <8 x half> %C_shuffle
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fneg.C_shuffle, i1 0)
store <8 x half> %res, <8 x half> addrspace(1)* %out
ret void
}
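; Note on the pack above: each flat_load_b128 dword holds two f16 elements and
; the shuffle keeps only the even-indexed ones, so consecutive result elements
; come from different dwords. The 0x5040100 selector to v_perm_b32 appears to
; pick byte pairs 0:1 and 4:5 of the concatenated sources, i.e. the low 16-bit
; half of each source dword, re-packing two even elements into one register
; before the wmma consumes them.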
declare <8 x half> @llvm.fabs.v8f16(<8 x half>)
declare <8 x float> @llvm.fabs.v8f32(<8 x float>)
declare float @llvm.fabs.f32(float)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half>, <8 x half>, <8 x float>)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8i16.v8f32(<8 x i16>, <8 x i16>, <8 x float>)
declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half>, <8 x half>, <8 x half>, i1 immarg)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f16.v16f16.v8f32.i16(<8 x half>, <16 x half>, <8 x float>, i16)
declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v16f16.v8f16.i16(<8 x half>, <16 x half>, <8 x half>, i16)
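; Summary of the modifier folding checked above: fneg on a packed f16 A or B
; operand folds to matching neg_lo and neg_hi bits (both halves of every pair
; are negated), fneg on C folds to neg_lo:[0,0,1], fabs on C to neg_hi:[0,0,1],
; and fneg(fabs(C)) sets both. A partial fabs of a single C element cannot fold
; and is materialized with v_and_b32 before the wmma. The function below is an
; illustrative sketch, not a test from this commit (the name is ours and it has
; no CHECK lines); our expectation is that combining fneg on A with fabs on C
; would select neg_lo:[1,0,0] neg_hi:[1,0,1].
define amdgpu_ps void @sketch_wmma_f32_16x16x16_f16_negA_absC(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
bb:
  ; Negate A (folds into the A-source modifiers) and take |C| (folds into
  ; neg_hi on the accumulator).
  %fneg.A = fneg <8 x half> %A
  %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %fneg.A, <8 x half> %B, <8 x float> %fabs.C)
  store <8 x float> %res, <8 x float> addrspace(1)* %out
  ret void
}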


@@ -0,0 +1,431 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], 1.0
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16
; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm_non_inlineable(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm_non_inlineable:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_mov_b32_e32 v10, 0x40400000
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v11, v10 :: v_dual_mov_b32 v12, v10
; GFX12-NEXT: v_dual_mov_b32 v13, v10 :: v_dual_mov_b32 v14, v10
; GFX12-NEXT: v_dual_mov_b32 v15, v10 :: v_dual_mov_b32 v16, v10
; GFX12-NEXT: v_mov_b32_e32 v17, v10
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], v[10:17]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16
; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[10:17], v[0:3], v[4:7], 1.0
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16
; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm_non_inlineable(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm_non_inlineable:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_mov_b32_e32 v10, 0x40400000
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v11, v10 :: v_dual_mov_b32 v12, v10
; GFX12-NEXT: v_dual_mov_b32 v13, v10 :: v_dual_mov_b32 v14, v10
; GFX12-NEXT: v_dual_mov_b32 v15, v10 :: v_dual_mov_b32 v16, v10
; GFX12-NEXT: v_mov_b32_e32 v17, v10
; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[10:17], v[0:3], v[4:7], v[10:17]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16
; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], 1.0
; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> <half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0>, i1 0)
store <8 x half> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm_non_inlineable(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm_non_inlineable:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_mov_b32_e32 v10, 0x42004200
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v11, v10 :: v_dual_mov_b32 v12, v10
; GFX12-NEXT: v_mov_b32_e32 v13, v10
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], v[10:13]
; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> <half 3.0, half 3.0, half 3.0, half 3.0, half 3.0, half 3.0, half 3.0, half 3.0>, i1 0)
store <8 x half> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[10:13], v[0:3], v[4:7], 1.0
; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> <i16 16256, i16 16256, i16 16256, i16 16256, i16 16256, i16 16256, i16 16256, i16 16256>, i1 0)
store <8 x i16> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm_non_inlineable(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm_non_inlineable:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_mov_b32_e32 v10, 0x3fc03fc0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v11, v10 :: v_dual_mov_b32 v12, v10
; GFX12-NEXT: v_mov_b32_e32 v13, v10
; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[10:13], v[0:3], v[4:7], v[10:13]
; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> <i16 16320, i16 16320, i16 16320, i16 16320, i16 16320, i16 16320, i16 16320, i16 16320>, i1 0)
store <8 x i16> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[6:13], v[0:1], v[2:3], 1
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm_non_inlineable:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_mov_b32_e32 v6, 0x80
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6
; GFX12-NEXT: v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v10, v6
; GFX12-NEXT: v_dual_mov_b32 v11, v6 :: v_dual_mov_b32 v12, v6
; GFX12-NEXT: v_mov_b32_e32 v13, v6
; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[6:13], v[0:1], v[2:3], v[6:13]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> <i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128>, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm(i32 %A, i32 %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v0, v1, 1
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[2:3], v[8:11], off offset:16
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm_non_inlineable(i32 %A, i32 %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm_non_inlineable:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_mov_b32_e32 v4, 0x80
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v4
; GFX12-NEXT: v_dual_mov_b32 v7, v4 :: v_dual_mov_b32 v8, v4
; GFX12-NEXT: v_dual_mov_b32 v9, v4 :: v_dual_mov_b32 v10, v4
; GFX12-NEXT: v_mov_b32_e32 v11, v4
; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v0, v1, v[4:11]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[2:3], v[8:11], off offset:16
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> <i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128>, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[6:13], v[0:1], v[2:3], 1.0
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_mov_b32_e32 v6, 0x40400000
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6
; GFX12-NEXT: v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v10, v6
; GFX12-NEXT: v_dual_mov_b32 v11, v6 :: v_dual_mov_b32 v12, v6
; GFX12-NEXT: v_mov_b32_e32 v13, v6
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[6:13], v[0:1], v[2:3], v[6:13]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[6:13], v[0:1], v[2:3], 1.0
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_mov_b32_e32 v6, 0x40400000
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6
; GFX12-NEXT: v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v10, v6
; GFX12-NEXT: v_dual_mov_b32 v11, v6 :: v_dual_mov_b32 v12, v6
; GFX12-NEXT: v_mov_b32_e32 v13, v6
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[6:13], v[0:1], v[2:3], v[6:13]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[6:13], v[0:1], v[2:3], 1.0
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_mov_b32_e32 v6, 0x40400000
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6
; GFX12-NEXT: v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v10, v6
; GFX12-NEXT: v_dual_mov_b32 v11, v6 :: v_dual_mov_b32 v12, v6
; GFX12-NEXT: v_mov_b32_e32 v13, v6
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[6:13], v[0:1], v[2:3], v[6:13]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[6:13], v[0:1], v[2:3], 1.0
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_mov_b32_e32 v6, 0x40400000
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6
; GFX12-NEXT: v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v10, v6
; GFX12-NEXT: v_dual_mov_b32 v11, v6 :: v_dual_mov_b32 v12, v6
; GFX12-NEXT: v_mov_b32_e32 v13, v6
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[6:13], v[0:1], v[2:3], v[6:13]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[6:13], v[0:1], v[2:3], 1
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm_non_inlineable:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_mov_b32_e32 v6, 0x80
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6
; GFX12-NEXT: v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v10, v6
; GFX12-NEXT: v_dual_mov_b32 v11, v6 :: v_dual_mov_b32 v12, v6
; GFX12-NEXT: v_mov_b32_e32 v13, v6
; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[6:13], v[0:1], v[2:3], v[6:13]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> <i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128>, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out
ret void
}
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half>, <8 x half>, <8 x float>)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16>, <8 x i16>, <8 x float>)
declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half>, <8 x half>, <8 x half>, i1 immarg)
declare <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16>, <8 x i16>, <8 x i16>, i1 immarg)
declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg)
declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 immarg, i32, i1 immarg, i32, <8 x i32>, i1 immarg)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg)
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half>, <16 x half>, <8 x float>, i16)
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16>, <16 x i16>, <8 x float>, i16)
declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half>, <16 x half>, <8 x half>, i16)
declare <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16>, <16 x i16>, <8 x i16>, i16)
declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16 %Index, i1 immarg)
declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <8 x i32>, i16 %Index, i1 immarg)
declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i32 %Index, i1 immarg)
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)

@@ -0,0 +1,309 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
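; These tests check that the sign/zero-extension immediates of the iu8/iu4
; intrinsics are encoded as neg_lo:[src0,src1,0] modifiers and that the final
; i1 immediate selects the clamp modifier.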
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src0(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src0:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src1(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src1:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_clamp:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] clamp
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1)
store <8 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src0(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src0:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[1,0,0]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16
; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 1, i32 %A, i1 0, i32 %B, <8 x i32> %C, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src1(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src1:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[0,1,0]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16
; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 1, i32 %B, <8 x i32> %C, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_clamp(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_clamp:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] clamp
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16
; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> %C, i1 1)
store <8 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src0(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src0:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src1(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src1:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_clamp:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] clamp
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1)
store <8 x i32> %res, ptr addrspace(1) %out
ret void
}
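; The swmmac forms below take the same modifiers, but additionally consume an
; index operand in a VGPR and accumulate over twice the depth of the
; corresponding wmma forms.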
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src0(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src0:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 1, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src1(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src1:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_clamp(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_clamp:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 clamp
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 1)
store <8 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src0(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src0:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[1,0,0]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16
; GFX12-NEXT: global_store_b128 v[12:13], v[3:6], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 1, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src1(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src1:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[0,1,0]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16
; GFX12-NEXT: global_store_b128 v[12:13], v[3:6], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 1, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_clamp(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_clamp:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 clamp
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16
; GFX12-NEXT: global_store_b128 v[12:13], v[3:6], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 1)
store <8 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src0(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src0:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 1, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src1(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src1:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 0, <2 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_clamp(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_clamp:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 clamp
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 1)
store <8 x i32> %res, ptr addrspace(1) %out
ret void
}
declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg)
declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 immarg, i32, i1 immarg, i32, <8 x i32>, i1 immarg)
declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg)
declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16 %Index, i1 immarg)
declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <8 x i32>, i16 %Index, i1 immarg)
declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i32 %Index, i1 immarg)

@@ -0,0 +1,321 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
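; index_key tests: a single 32-bit load provides two packed 16-bit indices.
; The second swmmac of each pair reuses the same index VGPR with index_key:1
; to select the high half, rather than emitting a second load or a shift.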
define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_index_key(<8 x half> %A, <16 x half> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v20, v[20:21], off
; GFX12-NEXT: v_dual_mov_b32 v33, v19 :: v_dual_mov_b32 v32, v18
; GFX12-NEXT: v_dual_mov_b32 v31, v17 :: v_dual_mov_b32 v30, v16
; GFX12-NEXT: v_dual_mov_b32 v29, v15 :: v_dual_mov_b32 v28, v14
; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[26:33], v[0:3], v[4:11], v20
; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 index_key:1
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[22:23], v[30:33], off offset:16
; GFX12-NEXT: global_store_b128 v[22:23], v[26:29], off
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[24:25], v[16:19], off offset:16
; GFX12-NEXT: global_store_b128 v[24:25], v[12:15], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
%Index0 = extractelement <2 x i16> %IndexVec, i32 0
%res0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index0)
store <8 x float> %res0, ptr addrspace(1) %out0
%Index1 = extractelement <2 x i16> %IndexVec, i32 1
%res1 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index1)
store <8 x float> %res1, ptr addrspace(1) %out1
ret void
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16_index_key(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v20, v[20:21], off
; GFX12-NEXT: v_dual_mov_b32 v33, v19 :: v_dual_mov_b32 v32, v18
; GFX12-NEXT: v_dual_mov_b32 v31, v17 :: v_dual_mov_b32 v30, v16
; GFX12-NEXT: v_dual_mov_b32 v29, v15 :: v_dual_mov_b32 v28, v14
; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[26:33], v[0:3], v[4:11], v20
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 index_key:1
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[22:23], v[30:33], off offset:16
; GFX12-NEXT: global_store_b128 v[22:23], v[26:29], off
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[24:25], v[16:19], off offset:16
; GFX12-NEXT: global_store_b128 v[24:25], v[12:15], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
%Index0 = extractelement <2 x i16> %IndexVec, i32 0
%res0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index0)
store <8 x float> %res0, ptr addrspace(1) %out0
%Index1 = extractelement <2 x i16> %IndexVec, i32 1
%res1 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index1)
store <8 x float> %res1, ptr addrspace(1) %out1
ret void
}
define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_index_key(<8 x half> %A, <16 x half> %B, <8 x half> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v16, v[16:17], off
; GFX12-NEXT: v_dual_mov_b32 v25, v15 :: v_dual_mov_b32 v24, v14
; GFX12-NEXT: v_dual_mov_b32 v23, v13 :: v_dual_mov_b32 v22, v12
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[22:25], v[0:3], v[4:11], v16
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 index_key:1
; GFX12-NEXT: global_store_b128 v[18:19], v[22:25], off
; GFX12-NEXT: global_store_b128 v[20:21], v[12:15], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
%Index0 = extractelement <2 x i16> %IndexVec, i32 0
%res0 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index0)
store <8 x half> %res0, ptr addrspace(1) %out0
%Index1 = extractelement <2 x i16> %IndexVec, i32 1
%res1 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index1)
store <8 x half> %res1, ptr addrspace(1) %out1
ret void
}
define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16_index_key(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v16, v[16:17], off
; GFX12-NEXT: v_dual_mov_b32 v25, v15 :: v_dual_mov_b32 v24, v14
; GFX12-NEXT: v_dual_mov_b32 v23, v13 :: v_dual_mov_b32 v22, v12
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[22:25], v[0:3], v[4:11], v16
; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 index_key:1
; GFX12-NEXT: global_store_b128 v[18:19], v[22:25], off
; GFX12-NEXT: global_store_b128 v[20:21], v[12:15], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
%Index0 = extractelement <2 x i16> %IndexVec, i32 0
%res0 = call <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index0)
store <8 x i16> %res0, ptr addrspace(1) %out0
%Index1 = extractelement <2 x i16> %IndexVec, i32 1
%res1 = call <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index1)
store <8 x i16> %res1, ptr addrspace(1) %out1
ret void
}
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v14, v[14:15], off
; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12
; GFX12-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10
; GFX12-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8
; GFX12-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[20:27], v[0:1], v[2:5], v14
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 index_key:1
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16
; GFX12-NEXT: global_store_b128 v[16:17], v[20:23], off
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16
; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
%Index0 = extractelement <2 x i16> %IndexVec, i32 0
%res0 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index0, i1 0)
store <8 x i32> %res0, ptr addrspace(1) %out0
%Index1 = extractelement <2 x i16> %IndexVec, i32 1
%res1 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index1, i1 0)
store <8 x i32> %res1, ptr addrspace(1) %out1
ret void
}
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_index_key(i32 %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v11, v[11:12], off
; GFX12-NEXT: v_dual_mov_b32 v24, v10 :: v_dual_mov_b32 v23, v9
; GFX12-NEXT: v_dual_mov_b32 v22, v8 :: v_dual_mov_b32 v21, v7
; GFX12-NEXT: v_dual_mov_b32 v20, v6 :: v_dual_mov_b32 v19, v5
; GFX12-NEXT: v_dual_mov_b32 v18, v4 :: v_dual_mov_b32 v17, v3
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[17:24], v0, v[1:2], v11
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 index_key:1
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[13:14], v[21:24], off offset:16
; GFX12-NEXT: global_store_b128 v[13:14], v[17:20], off
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[15:16], v[7:10], off offset:16
; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
%Index0 = extractelement <2 x i16> %IndexVec, i32 0
%res0 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index0, i1 0)
store <8 x i32> %res0, ptr addrspace(1) %out0
%Index1 = extractelement <2 x i16> %IndexVec, i32 1
%res1 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index1, i1 0)
store <8 x i32> %res1, ptr addrspace(1) %out1
ret void
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v14, v[14:15], off
; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12
; GFX12-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10
; GFX12-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8
; GFX12-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[20:27], v[0:1], v[2:5], v14
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16
; GFX12-NEXT: global_store_b128 v[16:17], v[20:23], off
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16
; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
%Index0 = extractelement <2 x i16> %IndexVec, i32 0
%res0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index0)
store <8 x float> %res0, ptr addrspace(1) %out0
%Index1 = extractelement <2 x i16> %IndexVec, i32 1
%res1 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index1)
store <8 x float> %res1, ptr addrspace(1) %out1
ret void
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v14, v[14:15], off
; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12
; GFX12-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10
; GFX12-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8
; GFX12-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[20:27], v[0:1], v[2:5], v14
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16
; GFX12-NEXT: global_store_b128 v[16:17], v[20:23], off
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16
; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
%Index0 = extractelement <2 x i16> %IndexVec, i32 0
%res0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index0)
store <8 x float> %res0, ptr addrspace(1) %out0
%Index1 = extractelement <2 x i16> %IndexVec, i32 1
%res1 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index1)
store <8 x float> %res1, ptr addrspace(1) %out1
ret void
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v14, v[14:15], off
; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12
; GFX12-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10
; GFX12-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8
; GFX12-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[20:27], v[0:1], v[2:5], v14
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16
; GFX12-NEXT: global_store_b128 v[16:17], v[20:23], off
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16
; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
%Index0 = extractelement <2 x i16> %IndexVec, i32 0
%res0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index0)
store <8 x float> %res0, ptr addrspace(1) %out0
%Index1 = extractelement <2 x i16> %IndexVec, i32 1
%res1 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index1)
store <8 x float> %res1, ptr addrspace(1) %out1
ret void
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v14, v[14:15], off
; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12
; GFX12-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10
; GFX12-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8
; GFX12-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[20:27], v[0:1], v[2:5], v14
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16
; GFX12-NEXT: global_store_b128 v[16:17], v[20:23], off
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16
; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
%Index0 = extractelement <2 x i16> %IndexVec, i32 0
%res0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index0)
store <8 x float> %res0, ptr addrspace(1) %out0
%Index1 = extractelement <2 x i16> %IndexVec, i32 1
%res1 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index1)
store <8 x float> %res1, ptr addrspace(1) %out1
ret void
}
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half>, <16 x half>, <8 x float>, i16)
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16>, <16 x i16>, <8 x float>, i16)
declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half>, <16 x half>, <8 x half>, i16)
declare <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16>, <16 x i16>, <8 x i16>, i16)
declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16 %Index, i1 immarg)
declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <8 x i32>, i16 %Index, i1 immarg)
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)

@@ -0,0 +1,370 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
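; Baseline tests: each GFX12 WMMA/SWMMAC variant with all matrix operands in
; VGPRs and no modifiers.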
define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %C)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf16:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> %C)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f16_16x16x16_f16(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11]
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %C, i1 0)
store <8 x half> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11]
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, i1 0)
store <8 x i16> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x16_iu8:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x16_iu4:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16
; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> %C, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x32_iu4:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out
ret void
}
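; The SWMMAC (sparse WMMA) variants below differ from the WMMA tests above in
; two ways visible in the IR: the B operand is twice as wide as A, and the
; intrinsics take an extra index operand (the last VGPR source, e.g. v20 in
; the first test) which carries the sparsity selection for the sparse input
; matrix.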
define amdgpu_ps void @test_swmmac_f32_16x16x32_f16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_f16:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16
; GFX12-NEXT: global_store_b128 v[21:22], v[12:15], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16
; GFX12-NEXT: global_store_b128 v[21:22], v[12:15], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_swmmac_f16_16x16x32_f16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f16_16x16x32_f16:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16
; GFX12-NEXT: global_store_b128 v[17:18], v[12:15], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index)
store <8 x half> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16
; GFX12-NEXT: global_store_b128 v[17:18], v[12:15], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index)
store <8 x i16> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16
; GFX12-NEXT: global_store_b128 v[12:13], v[3:6], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half>, <8 x half>, <8 x float>)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16>, <8 x i16>, <8 x float>)
declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half>, <8 x half>, <8 x half>, i1 immarg)
declare <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16>, <8 x i16>, <8 x i16>, i1 immarg)
declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg)
declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 immarg, i32, i1 immarg, i32, <8 x i32>, i1 immarg)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg)
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half>, <16 x half>, <8 x float>, i16)
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16>, <16 x i16>, <8 x float>, i16)
declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half>, <16 x half>, <8 x half>, i16)
declare <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16>, <16 x i16>, <8 x i16>, i16)
declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16 %Index, i1 immarg)
declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <8 x i32>, i16 %Index, i1 immarg)
declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i32 %Index, i1 immarg)
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)


@@ -0,0 +1,456 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
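; The tests in this file check that fneg/fabs of the WMMA/SWMMAC source
; operands are absorbed into the neg_lo/neg_hi instruction modifiers rather
; than being emitted as separate VALU instructions. The partial-fabs test
; below shows the fallback: when only one element of C is modified, a
; v_and_b32 is still emitted before the WMMA.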
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.A = fneg <4 x half> %A
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %fneg.A, <4 x half> %B, <4 x float> %C)
store <4 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negB(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negB:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] neg_hi:[0,1,0]
; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.B = fneg <4 x half> %B
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %fneg.B, <4 x float> %C)
store <4 x float> %res, <4 x float> addrspace(1)* %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negC(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1]
; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.C = fneg <4 x float> %C
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %fneg.C)
store <4 x float> %res, <4 x float> addrspace(1)* %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_absC(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_absC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1]
; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %fabs.C)
store <4 x float> %res, <4 x float> addrspace(1)* %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_negC(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_negC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1]
; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.C = fneg <4 x float> %C
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> %fneg.C)
store <4 x float> %res, <4 x float> addrspace(1)* %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_absC(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_absC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1]
; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> %fabs.C)
store <4 x float> %res, <4 x float> addrspace(1)* %out
ret void
}
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negA(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negA:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.A = fneg <4 x half> %A
%res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %fneg.A, <4 x half> %B, <4 x half> %C, i1 0)
store <4 x half> %res, <4 x half> addrspace(1)* %out
ret void
}
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0] neg_hi:[0,1,0]
; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.B = fneg <4 x half> %B
%res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %fneg.B, <4 x half> %C, i1 0)
store <4 x half> %res, <4 x half> addrspace(1)* %out
ret void
}
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1]
; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.C = fneg <4 x half> %C
%res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fneg.C, i1 0)
store <4 x half> %res, <4 x half> addrspace(1)* %out
ret void
}
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_absC(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_absC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1]
; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fabs.C = call <4 x half> @llvm.fabs.v4f16(<4 x half> %C)
%res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fabs.C, i1 0)
store <4 x half> %res, <4 x half> addrspace(1)* %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_negC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_negC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.C = fneg <4 x float> %C
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fneg.C)
store <4 x float> %res, <4 x float> addrspace(1)* %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_absC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_absC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fabs.C)
store <4 x float> %res, <4 x float> addrspace(1)* %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_negC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_negC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.C = fneg <4 x float> %C
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fneg.C)
store <4 x float> %res, <4 x float> addrspace(1)* %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_absC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_absC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fabs.C)
store <4 x float> %res, <4 x float> addrspace(1)* %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_negC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_negC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.C = fneg <4 x float> %C
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fneg.C)
store <4 x float> %res, <4 x float> addrspace(1)* %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_absC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_absC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fabs.C)
store <4 x float> %res, <4 x float> addrspace(1)* %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_negC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_negC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.C = fneg <4 x float> %C
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fneg.C)
store <4 x float> %res, <4 x float> addrspace(1)* %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_absC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_absC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fabs.C)
store <4 x float> %res, <4 x float> addrspace(1)* %out
ret void
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negA(<4 x half> %A, <8 x half> %B, <4 x float> %C, i16 %Index, <4 x float> addrspace(1)* %out) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negA:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX12-NEXT: global_store_b128 v[11:12], v[6:9], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.A = fneg <4 x half> %A
%res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f16.v8f16.v4f32.i16(<4 x half> %fneg.A, <8 x half> %B, <4 x float> %C, i16 %Index)
store <4 x float> %res, <4 x float> addrspace(1)* %out
ret void
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negB(<4 x half> %A, <8 x half> %B, <4 x float> %C, i16 %Index, <4 x float> addrspace(1)* %out) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negB:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0] neg_hi:[0,1,0]
; GFX12-NEXT: global_store_b128 v[11:12], v[6:9], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.B = fneg <8 x half> %B
%res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f16.v8f16.v4f32.i16(<4 x half> %A, <8 x half> %fneg.B, <4 x float> %C, i16 %Index)
store <4 x float> %res, <4 x float> addrspace(1)* %out
ret void
}
define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negA(<4 x half> %A, <8 x half> %B, <4 x half> %C, i16 %Index, <4 x half> addrspace(1)* %out) {
; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negA:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX12-NEXT: global_store_b64 v[9:10], v[6:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.A = fneg <4 x half> %A
%res = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v8f16.v4f16.i16(<4 x half> %fneg.A, <8 x half> %B, <4 x half> %C, i16 %Index)
store <4 x half> %res, <4 x half> addrspace(1)* %out
ret void
}
define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negB(<4 x half> %A, <8 x half> %B, <4 x half> %C, i16 %Index, <4 x half> addrspace(1)* %out) {
; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negB:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0] neg_hi:[0,1,0]
; GFX12-NEXT: global_store_b64 v[9:10], v[6:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.B = fneg <8 x half> %B
%res = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v8f16.v4f16.i16(<4 x half> %A, <8 x half> %fneg.B, <4 x half> %C, i16 %Index)
store <4 x half> %res, <4 x half> addrspace(1)* %out
ret void
}
; Both neg and abs patterns on the WMMA C matrix (f32 or f16).
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negabsC(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negabsC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] neg_hi:[0,0,1]
; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
%fneg.fabs.C = fneg <4 x float> %fabs.C
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %fneg.fabs.C)
store <4 x float> %res, <4 x float> addrspace(1)* %out
ret void
}
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negabsC(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negabsC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] neg_hi:[0,0,1]
; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fabs.C = call <4 x half> @llvm.fabs.v4f16(<4 x half> %C)
%fneg.fabs.C = fneg <4 x half> %fabs.C
%res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fneg.fabs.C, i1 0)
store <4 x half> %res, <4 x half> addrspace(1)* %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_neg_partial_fabsA(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_neg_partial_fabsA:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_and_b32_e32 v7, 0x7fffffff, v7
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1]
; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%el3 = extractelement <4 x float> %C, i32 3
%el3.fabs = call float @llvm.fabs.f32(float %el3)
%partial.fabs.C = insertelement <4 x float> %C, float %el3.fabs, i32 3
%fneg.partial.fabs.C = fneg <4 x float> %partial.fabs.C
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %fneg.partial.fabs.C)
store <4 x float> %res, <4 x float> addrspace(1)* %out
ret void
}
; A or B matrix modifier combined with a constant C matrix.
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA_constantC(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA_constantC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], 1.0 neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.A = fneg <4 x half> %A
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %fneg.A, <4 x half> %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
store <4 x float> %res, <4 x float> addrspace(1)* %out
ret void
}
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB_constantC(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB_constantC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], 1.0 neg_lo:[0,1,0] neg_hi:[0,1,0]
; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.B = fneg <4 x half> %B
%res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %fneg.B, <4 x half> <half 1.0, half 1.0, half 1.0, half 1.0>, i1 0)
store <4 x half> %res, <4 x half> addrspace(1)* %out
ret void
}
; Pack f16 elements with v_perm_b32 since they don't come from the same b32.
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC_pack(<4 x half> %A, <4 x half> %B, ptr %Caddr, <4 x half> addrspace(1)* %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC_pack:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: flat_load_b128 v[8:11], v[4:5]
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: v_perm_b32 v5, v11, v10, 0x5040100
; GFX12-NEXT: v_perm_b32 v4, v9, v8, 0x5040100
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1]
; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%C = load <8 x half>, ptr %Caddr
%C_shuffle = shufflevector <8 x half> %C, <8 x half> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
%fneg.C_shuffle = fneg <4 x half> %C_shuffle
%res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fneg.C_shuffle, i1 0)
store <4 x half> %res, <4 x half> addrspace(1)* %out
ret void
}
declare <4 x half> @llvm.fabs.v4f16(<4 x half>)
declare <4 x float> @llvm.fabs.v4f32(<4 x float>)
declare float @llvm.fabs.f32(float)
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half>, <4 x half>, <4 x float>)
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4i16.v4f32(<4 x i16>, <4 x i16>, <4 x float>)
declare <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half>, <4 x half>, <4 x half>, i1 immarg)
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.i32.v4f32(i32, i32, <4 x float>)
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.i32.v4f32(i32, i32, <4 x float>)
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.i32.v4f32(i32, i32, <4 x float>)
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.i32.v4f32(i32, i32, <4 x float>)
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f16.v8f16.v4f32.i16(<4 x half>, <8 x half>, <4 x float>, i16)
declare <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v8f16.v4f16.i16(<4 x half>, <8 x half>, <4 x half>, i16)


@@ -0,0 +1,373 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
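; The tests in this file check the handling of constant C (accumulator)
; operands: splat values that are valid inline constants (1.0, 1) are folded
; directly into the WMMA instruction, while non-inlineable splats (e.g. 3.0,
; 128) must first be materialized into a VGPR tuple with v_mov_b32.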
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], 1.0
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
store <4 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm_non_inlineable(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm_non_inlineable:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_mov_b32_e32 v6, 0x40400000
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_mov_b32_e32 v7, v6
; GFX12-NEXT: v_mov_b32_e32 v8, v6
; GFX12-NEXT: v_mov_b32_e32 v9, v6
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], v[6:9]
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
store <4 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[6:9], v[0:1], v[2:3], 1.0
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
store <4 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm_non_inlineable(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm_non_inlineable:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_mov_b32_e32 v6, 0x40400000
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_mov_b32_e32 v7, v6
; GFX12-NEXT: v_mov_b32_e32 v8, v6
; GFX12-NEXT: v_mov_b32_e32 v9, v6
; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[6:9], v[0:1], v[2:3], v[6:9]
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
store <4 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], 1.0
; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half> %A, <4 x half> %B, <4 x half> <half 1.0, half 1.0, half 1.0, half 1.0>, i1 0)
store <4 x half> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm_non_inlineable(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm_non_inlineable:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_mov_b32_e32 v6, 0x42004200
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_mov_b32_e32 v7, v6
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], v[6:7]
; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half> %A, <4 x half> %B, <4 x half> <half 3.0, half 3.0, half 3.0, half 3.0>, i1 0)
store <4 x half> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[6:7], v[0:1], v[2:3], 1.0
; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> <i16 16256, i16 16256, i16 16256, i16 16256>, i1 0)
store <4 x i16> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm_non_inlineable(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm_non_inlineable:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_mov_b32_e32 v6, 0x3fc03fc0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_mov_b32_e32 v7, v6
; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[6:7], v[0:1], v[2:3], v[6:7]
; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> <i16 16320, i16 16320, i16 16320, i16 16320>, i1 0)
store <4 x i16> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm(i32 %A, i32 %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:7], v0, v1, 1
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 1, i32 1, i32 1, i32 1>, i1 0)
store <4 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm_non_inlineable(i32 %A, i32 %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm_non_inlineable:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_mov_b32_e32 v4, 0x80
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_mov_b32_e32 v5, v4
; GFX12-NEXT: v_mov_b32_e32 v6, v4
; GFX12-NEXT: v_mov_b32_e32 v7, v4
; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:7], v0, v1, v[4:7]
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 128, i32 128, i32 128, i32 128>, i1 0)
store <4 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm(i32 %A, i32 %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v0, v1, 1
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 1, i32 1, i32 1, i32 1>, i1 0)
store <4 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm_non_inlineable(i32 %A, i32 %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm_non_inlineable:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_mov_b32_e32 v4, 0x80
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_mov_b32_e32 v5, v4
; GFX12-NEXT: v_mov_b32_e32 v6, v4
; GFX12-NEXT: v_mov_b32_e32 v7, v4
; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v0, v1, v[4:7]
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 128, i32 128, i32 128, i32 128>, i1 0)
store <4 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:7], v0, v1, 1.0
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
store <4 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_mov_b32_e32 v4, 0x40400000
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_mov_b32_e32 v5, v4
; GFX12-NEXT: v_mov_b32_e32 v6, v4
; GFX12-NEXT: v_mov_b32_e32 v7, v4
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:7], v0, v1, v[4:7]
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
store <4 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:7], v0, v1, 1.0
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
store <4 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_mov_b32_e32 v4, 0x40400000
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_mov_b32_e32 v5, v4
; GFX12-NEXT: v_mov_b32_e32 v6, v4
; GFX12-NEXT: v_mov_b32_e32 v7, v4
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:7], v0, v1, v[4:7]
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
store <4 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:7], v0, v1, 1.0
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
store <4 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_mov_b32_e32 v4, 0x40400000
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_mov_b32_e32 v5, v4
; GFX12-NEXT: v_mov_b32_e32 v6, v4
; GFX12-NEXT: v_mov_b32_e32 v7, v4
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:7], v0, v1, v[4:7]
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
store <4 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:7], v0, v1, 1.0
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
store <4 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_mov_b32_e32 v4, 0x40400000
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_mov_b32_e32 v5, v4
; GFX12-NEXT: v_mov_b32_e32 v6, v4
; GFX12-NEXT: v_mov_b32_e32 v7, v4
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:7], v0, v1, v[4:7]
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
store <4 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:7], v0, v1, 1
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 1, i32 1, i32 1, i32 1>, i1 0)
store <4 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm_non_inlineable(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm_non_inlineable:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_mov_b32_e32 v4, 0x80
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_mov_b32_e32 v5, v4
; GFX12-NEXT: v_mov_b32_e32 v6, v4
; GFX12-NEXT: v_mov_b32_e32 v7, v4
; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:7], v0, v1, v[4:7]
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 128, i32 128, i32 128, i32 128>, i1 0)
store <4 x i32> %res, ptr addrspace(1) %out
ret void
}
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half>, <4 x half>, <4 x float>)
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16>, <4 x i16>, <4 x float>)
declare <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half>, <4 x half>, <4 x half>, i1 immarg)
declare <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16>, <4 x i16>, <4 x i16>, i1 immarg)
declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg)
declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg)
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>)
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>)
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>)
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>)
declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg)


@@ -0,0 +1,274 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
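; The tests in this file check that the i1 immarg flags of the integer
; iu8/iu4 intrinsics are encoded on the instruction: the per-source
; signedness flags (named zext in the tests) map to neg_lo bits, and the
; trailing flag selects the clamp modifier.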
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src0(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src0:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0]
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
store <4 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src1(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src1:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0]
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i1 0)
store <4 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_clamp(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_clamp:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] clamp
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 1)
store <4 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src0(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src0:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0]
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
store <4 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src1(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src1:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0]
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i1 0)
store <4 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_clamp(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_clamp:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] clamp
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 1)
store <4 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src0(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src0:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0]
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
store <4 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src1(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src1:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0]
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i1 0)
store <4 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_clamp(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_clamp:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] clamp
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 1)
store <4 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src0(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src0:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0]
; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 1, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 0)
store <4 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src1(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src1:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0]
; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 1, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 0)
store <4 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_clamp(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_clamp:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 clamp
; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 1)
store <4 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src0(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src0:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[1,0,0]
; GFX12-NEXT: global_store_b128 v[7:8], v[2:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index, i1 0)
store <4 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src1(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src1:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[0,1,0]
; GFX12-NEXT: global_store_b128 v[7:8], v[2:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i16 %Index, i1 0)
store <4 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_clamp(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_clamp:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 clamp
; GFX12-NEXT: global_store_b128 v[7:8], v[2:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index, i1 1)
store <4 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src0(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src0:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0]
; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 1, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 0)
store <4 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src1(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src1:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0]
; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 1, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 0)
store <4 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_clamp(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_clamp:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 clamp
; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 1)
store <4 x i32> %res, ptr addrspace(1) %out
ret void
}
declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg)
declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg)
declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg)
declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i8 %Index, i1 immarg)
declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i16 %Index, i1 immarg)
declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i16 %Index, i1 immarg)


@ -0,0 +1,472 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
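; Each test below loads a packed vector of lane indices, extracts element N,
; and checks that the extract is folded into the index_key:N operand of the
; SWMMAC instruction, so no separate shift of the index register is emitted.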
define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_index_key(<4 x half> %A, <8 x half> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v10, v[10:11], off
; GFX12-NEXT: v_mov_b32_e32 v23, v9
; GFX12-NEXT: v_mov_b32_e32 v22, v8
; GFX12-NEXT: v_mov_b32_e32 v21, v7
; GFX12-NEXT: v_mov_b32_e32 v20, v6
; GFX12-NEXT: v_mov_b32_e32 v27, v9
; GFX12-NEXT: v_mov_b32_e32 v26, v8
; GFX12-NEXT: v_mov_b32_e32 v25, v7
; GFX12-NEXT: v_mov_b32_e32 v24, v6
; GFX12-NEXT: v_mov_b32_e32 v31, v9
; GFX12-NEXT: v_mov_b32_e32 v30, v8
; GFX12-NEXT: v_mov_b32_e32 v29, v7
; GFX12-NEXT: v_mov_b32_e32 v28, v6
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[20:23], v[0:1], v[2:5], v10
; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[24:27], v[0:1], v[2:5], v10 index_key:1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[28:31], v[0:1], v[2:5], v10 index_key:2
; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:3
; GFX12-NEXT: global_store_b128 v[12:13], v[20:23], off
; GFX12-NEXT: global_store_b128 v[14:15], v[24:27], off
; GFX12-NEXT: global_store_b128 v[16:17], v[28:31], off
; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
%Index0 = extractelement <4 x i8> %IndexVec, i32 0
%res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index0)
store <4 x float> %res0, ptr addrspace(1) %out0
%Index1 = extractelement <4 x i8> %IndexVec, i32 1
%res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index1)
store <4 x float> %res1, ptr addrspace(1) %out1
%Index2 = extractelement <4 x i8> %IndexVec, i32 2
%res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index2)
store <4 x float> %res2, ptr addrspace(1) %out2
%Index3 = extractelement <4 x i8> %IndexVec, i32 3
%res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index3)
store <4 x float> %res3, ptr addrspace(1) %out3
ret void
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16_index_key(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v10, v[10:11], off
; GFX12-NEXT: v_mov_b32_e32 v23, v9
; GFX12-NEXT: v_mov_b32_e32 v22, v8
; GFX12-NEXT: v_mov_b32_e32 v21, v7
; GFX12-NEXT: v_mov_b32_e32 v20, v6
; GFX12-NEXT: v_mov_b32_e32 v27, v9
; GFX12-NEXT: v_mov_b32_e32 v26, v8
; GFX12-NEXT: v_mov_b32_e32 v25, v7
; GFX12-NEXT: v_mov_b32_e32 v24, v6
; GFX12-NEXT: v_mov_b32_e32 v31, v9
; GFX12-NEXT: v_mov_b32_e32 v30, v8
; GFX12-NEXT: v_mov_b32_e32 v29, v7
; GFX12-NEXT: v_mov_b32_e32 v28, v6
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[20:23], v[0:1], v[2:5], v10
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[24:27], v[0:1], v[2:5], v10 index_key:1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[28:31], v[0:1], v[2:5], v10 index_key:2
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:3
; GFX12-NEXT: global_store_b128 v[12:13], v[20:23], off
; GFX12-NEXT: global_store_b128 v[14:15], v[24:27], off
; GFX12-NEXT: global_store_b128 v[16:17], v[28:31], off
; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
%Index0 = extractelement <4 x i8> %IndexVec, i32 0
%res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index0)
store <4 x float> %res0, ptr addrspace(1) %out0
%Index1 = extractelement <4 x i8> %IndexVec, i32 1
%res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index1)
store <4 x float> %res1, ptr addrspace(1) %out1
%Index2 = extractelement <4 x i8> %IndexVec, i32 2
%res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index2)
store <4 x float> %res2, ptr addrspace(1) %out2
%Index3 = extractelement <4 x i8> %IndexVec, i32 3
%res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index3)
store <4 x float> %res3, ptr addrspace(1) %out3
ret void
}
define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_index_key(<4 x half> %A, <8 x half> %B, <4 x half> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v22, v[8:9], off
; GFX12-NEXT: v_mov_b32_e32 v9, v7
; GFX12-NEXT: v_mov_b32_e32 v8, v6
; GFX12-NEXT: v_mov_b32_e32 v19, v7
; GFX12-NEXT: v_mov_b32_e32 v18, v6
; GFX12-NEXT: v_mov_b32_e32 v21, v7
; GFX12-NEXT: v_mov_b32_e32 v20, v6
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[8:9], v[0:1], v[2:5], v22
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[18:19], v[0:1], v[2:5], v22 index_key:1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[20:21], v[0:1], v[2:5], v22 index_key:2
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v22 index_key:3
; GFX12-NEXT: global_store_b64 v[10:11], v[8:9], off
; GFX12-NEXT: global_store_b64 v[12:13], v[18:19], off
; GFX12-NEXT: global_store_b64 v[14:15], v[20:21], off
; GFX12-NEXT: global_store_b64 v[16:17], v[6:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
%Index0 = extractelement <4 x i8> %IndexVec, i32 0
%res0 = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index0)
store <4 x half> %res0, ptr addrspace(1) %out0
%Index1 = extractelement <4 x i8> %IndexVec, i32 1
%res1 = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index1)
store <4 x half> %res1, ptr addrspace(1) %out1
%Index2 = extractelement <4 x i8> %IndexVec, i32 2
%res2 = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index2)
store <4 x half> %res2, ptr addrspace(1) %out2
%Index3 = extractelement <4 x i8> %IndexVec, i32 3
%res3 = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index3)
store <4 x half> %res3, ptr addrspace(1) %out3
ret void
}
define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16_index_key(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v22, v[8:9], off
; GFX12-NEXT: v_mov_b32_e32 v9, v7
; GFX12-NEXT: v_mov_b32_e32 v8, v6
; GFX12-NEXT: v_mov_b32_e32 v19, v7
; GFX12-NEXT: v_mov_b32_e32 v18, v6
; GFX12-NEXT: v_mov_b32_e32 v21, v7
; GFX12-NEXT: v_mov_b32_e32 v20, v6
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[8:9], v[0:1], v[2:5], v22
; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[18:19], v[0:1], v[2:5], v22 index_key:1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[20:21], v[0:1], v[2:5], v22 index_key:2
; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v22 index_key:3
; GFX12-NEXT: global_store_b64 v[10:11], v[8:9], off
; GFX12-NEXT: global_store_b64 v[12:13], v[18:19], off
; GFX12-NEXT: global_store_b64 v[14:15], v[20:21], off
; GFX12-NEXT: global_store_b64 v[16:17], v[6:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
%Index0 = extractelement <4 x i8> %IndexVec, i32 0
%res0 = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index0)
store <4 x i16> %res0, ptr addrspace(1) %out0
%Index1 = extractelement <4 x i8> %IndexVec, i32 1
%res1 = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index1)
store <4 x i16> %res1, ptr addrspace(1) %out1
%Index2 = extractelement <4 x i8> %IndexVec, i32 2
%res2 = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index2)
store <4 x i16> %res2, ptr addrspace(1) %out2
%Index3 = extractelement <4 x i8> %IndexVec, i32 3
%res3 = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index3)
store <4 x i16> %res3, ptr addrspace(1) %out3
ret void
}
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_index_key(i32 %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v7, v[7:8], off
; GFX12-NEXT: v_mov_b32_e32 v20, v6
; GFX12-NEXT: v_mov_b32_e32 v19, v5
; GFX12-NEXT: v_mov_b32_e32 v18, v4
; GFX12-NEXT: v_mov_b32_e32 v17, v3
; GFX12-NEXT: v_mov_b32_e32 v24, v6
; GFX12-NEXT: v_mov_b32_e32 v23, v5
; GFX12-NEXT: v_mov_b32_e32 v22, v4
; GFX12-NEXT: v_mov_b32_e32 v21, v3
; GFX12-NEXT: v_mov_b32_e32 v28, v6
; GFX12-NEXT: v_mov_b32_e32 v27, v5
; GFX12-NEXT: v_mov_b32_e32 v26, v4
; GFX12-NEXT: v_mov_b32_e32 v25, v3
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[17:20], v0, v[1:2], v7
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[21:24], v0, v[1:2], v7 index_key:1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[25:28], v0, v[1:2], v7 index_key:2
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:3
; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off
; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off
; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off
; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
%Index0 = extractelement <4 x i8> %IndexVec, i32 0
%res0 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index0, i1 0)
store <4 x i32> %res0, ptr addrspace(1) %out0
%Index1 = extractelement <4 x i8> %IndexVec, i32 1
%res1 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index1, i1 0)
store <4 x i32> %res1, ptr addrspace(1) %out1
%Index2 = extractelement <4 x i8> %IndexVec, i32 2
%res2 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index2, i1 0)
store <4 x i32> %res2, ptr addrspace(1) %out2
%Index3 = extractelement <4 x i8> %IndexVec, i32 3
%res3 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index3, i1 0)
store <4 x i32> %res3, ptr addrspace(1) %out3
ret void
}
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_index_key(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v6, v[6:7], off
; GFX12-NEXT: v_mov_b32_e32 v15, v5
; GFX12-NEXT: v_mov_b32_e32 v14, v4
; GFX12-NEXT: v_mov_b32_e32 v13, v3
; GFX12-NEXT: v_mov_b32_e32 v12, v2
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[12:15], v0, v1, v6
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 index_key:1
; GFX12-NEXT: global_store_b128 v[8:9], v[12:15], off
; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
%Index0 = extractelement <2 x i16> %IndexVec, i32 0
%res0 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index0, i1 0)
store <4 x i32> %res0, ptr addrspace(1) %out0
%Index1 = extractelement <2 x i16> %IndexVec, i32 1
%res1 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index1, i1 0)
store <4 x i32> %res1, ptr addrspace(1) %out1
ret void
}
define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_index_key(i32 %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v7, v[7:8], off
; GFX12-NEXT: v_mov_b32_e32 v16, v6
; GFX12-NEXT: v_mov_b32_e32 v15, v5
; GFX12-NEXT: v_mov_b32_e32 v14, v4
; GFX12-NEXT: v_mov_b32_e32 v13, v3
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[13:16], v0, v[1:2], v7
; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 index_key:1
; GFX12-NEXT: global_store_b128 v[9:10], v[13:16], off
; GFX12-NEXT: global_store_b128 v[11:12], v[3:6], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
%Index0 = extractelement <2 x i16> %IndexVec, i32 0
%res0 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index0, i1 0)
store <4 x i32> %res0, ptr addrspace(1) %out0
%Index1 = extractelement <2 x i16> %IndexVec, i32 1
%res1 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index1, i1 0)
store <4 x i32> %res1, ptr addrspace(1) %out1
ret void
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v7, v[7:8], off
; GFX12-NEXT: v_mov_b32_e32 v20, v6
; GFX12-NEXT: v_mov_b32_e32 v19, v5
; GFX12-NEXT: v_mov_b32_e32 v18, v4
; GFX12-NEXT: v_mov_b32_e32 v17, v3
; GFX12-NEXT: v_mov_b32_e32 v24, v6
; GFX12-NEXT: v_mov_b32_e32 v23, v5
; GFX12-NEXT: v_mov_b32_e32 v22, v4
; GFX12-NEXT: v_mov_b32_e32 v21, v3
; GFX12-NEXT: v_mov_b32_e32 v28, v6
; GFX12-NEXT: v_mov_b32_e32 v27, v5
; GFX12-NEXT: v_mov_b32_e32 v26, v4
; GFX12-NEXT: v_mov_b32_e32 v25, v3
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[17:20], v0, v[1:2], v7
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[21:24], v0, v[1:2], v7 index_key:1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[25:28], v0, v[1:2], v7 index_key:2
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:3
; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off
; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off
; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off
; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
%Index0 = extractelement <4 x i8> %IndexVec, i32 0
%res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index0)
store <4 x float> %res0, ptr addrspace(1) %out0
%Index1 = extractelement <4 x i8> %IndexVec, i32 1
%res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index1)
store <4 x float> %res1, ptr addrspace(1) %out1
%Index2 = extractelement <4 x i8> %IndexVec, i32 2
%res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index2)
store <4 x float> %res2, ptr addrspace(1) %out2
%Index3 = extractelement <4 x i8> %IndexVec, i32 3
%res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index3)
store <4 x float> %res3, ptr addrspace(1) %out3
ret void
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v7, v[7:8], off
; GFX12-NEXT: v_mov_b32_e32 v20, v6
; GFX12-NEXT: v_mov_b32_e32 v19, v5
; GFX12-NEXT: v_mov_b32_e32 v18, v4
; GFX12-NEXT: v_mov_b32_e32 v17, v3
; GFX12-NEXT: v_mov_b32_e32 v24, v6
; GFX12-NEXT: v_mov_b32_e32 v23, v5
; GFX12-NEXT: v_mov_b32_e32 v22, v4
; GFX12-NEXT: v_mov_b32_e32 v21, v3
; GFX12-NEXT: v_mov_b32_e32 v28, v6
; GFX12-NEXT: v_mov_b32_e32 v27, v5
; GFX12-NEXT: v_mov_b32_e32 v26, v4
; GFX12-NEXT: v_mov_b32_e32 v25, v3
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[17:20], v0, v[1:2], v7
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[21:24], v0, v[1:2], v7 index_key:1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[25:28], v0, v[1:2], v7 index_key:2
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:3
; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off
; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off
; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off
; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
%Index0 = extractelement <4 x i8> %IndexVec, i32 0
%res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index0)
store <4 x float> %res0, ptr addrspace(1) %out0
%Index1 = extractelement <4 x i8> %IndexVec, i32 1
%res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index1)
store <4 x float> %res1, ptr addrspace(1) %out1
%Index2 = extractelement <4 x i8> %IndexVec, i32 2
%res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index2)
store <4 x float> %res2, ptr addrspace(1) %out2
%Index3 = extractelement <4 x i8> %IndexVec, i32 3
%res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index3)
store <4 x float> %res3, ptr addrspace(1) %out3
ret void
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v7, v[7:8], off
; GFX12-NEXT: v_mov_b32_e32 v20, v6
; GFX12-NEXT: v_mov_b32_e32 v19, v5
; GFX12-NEXT: v_mov_b32_e32 v18, v4
; GFX12-NEXT: v_mov_b32_e32 v17, v3
; GFX12-NEXT: v_mov_b32_e32 v24, v6
; GFX12-NEXT: v_mov_b32_e32 v23, v5
; GFX12-NEXT: v_mov_b32_e32 v22, v4
; GFX12-NEXT: v_mov_b32_e32 v21, v3
; GFX12-NEXT: v_mov_b32_e32 v28, v6
; GFX12-NEXT: v_mov_b32_e32 v27, v5
; GFX12-NEXT: v_mov_b32_e32 v26, v4
; GFX12-NEXT: v_mov_b32_e32 v25, v3
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[17:20], v0, v[1:2], v7
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[21:24], v0, v[1:2], v7 index_key:1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[25:28], v0, v[1:2], v7 index_key:2
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:3
; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off
; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off
; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off
; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
%Index0 = extractelement <4 x i8> %IndexVec, i32 0
%res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index0)
store <4 x float> %res0, ptr addrspace(1) %out0
%Index1 = extractelement <4 x i8> %IndexVec, i32 1
%res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index1)
store <4 x float> %res1, ptr addrspace(1) %out1
%Index2 = extractelement <4 x i8> %IndexVec, i32 2
%res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index2)
store <4 x float> %res2, ptr addrspace(1) %out2
%Index3 = extractelement <4 x i8> %IndexVec, i32 3
%res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index3)
store <4 x float> %res3, ptr addrspace(1) %out3
ret void
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v7, v[7:8], off
; GFX12-NEXT: v_mov_b32_e32 v20, v6
; GFX12-NEXT: v_mov_b32_e32 v19, v5
; GFX12-NEXT: v_mov_b32_e32 v18, v4
; GFX12-NEXT: v_mov_b32_e32 v17, v3
; GFX12-NEXT: v_mov_b32_e32 v24, v6
; GFX12-NEXT: v_mov_b32_e32 v23, v5
; GFX12-NEXT: v_mov_b32_e32 v22, v4
; GFX12-NEXT: v_mov_b32_e32 v21, v3
; GFX12-NEXT: v_mov_b32_e32 v28, v6
; GFX12-NEXT: v_mov_b32_e32 v27, v5
; GFX12-NEXT: v_mov_b32_e32 v26, v4
; GFX12-NEXT: v_mov_b32_e32 v25, v3
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[17:20], v0, v[1:2], v7
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[21:24], v0, v[1:2], v7 index_key:1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[25:28], v0, v[1:2], v7 index_key:2
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:3
; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off
; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off
; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off
; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
%Index0 = extractelement <4 x i8> %IndexVec, i32 0
%res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index0)
store <4 x float> %res0, ptr addrspace(1) %out0
%Index1 = extractelement <4 x i8> %IndexVec, i32 1
%res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index1)
store <4 x float> %res1, ptr addrspace(1) %out1
%Index2 = extractelement <4 x i8> %IndexVec, i32 2
%res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index2)
store <4 x float> %res2, ptr addrspace(1) %out2
%Index3 = extractelement <4 x i8> %IndexVec, i32 3
%res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index3)
store <4 x float> %res3, ptr addrspace(1) %out3
ret void
}
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half>, <8 x half>, <4 x float>, i8)
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16>, <8 x i16>, <4 x float>, i8)
declare <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half>, <8 x half>, <4 x half>, i8)
declare <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16>, <8 x i16>, <4 x i16>, i8)
declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i8 %Index, i1 immarg)
declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i16 %Index, i1 immarg)
declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i16 %Index, i1 immarg)
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8)
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8)
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8)
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8)


@ -0,0 +1,333 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
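; Baseline selection tests for the wave64 WMMA and SWMMAC intrinsics: no
; source modifiers, no clamp, and the SWMMAC index taken straight from a VGPR
; with the default index_key.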
define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7]
; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %C)
store <4 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf16:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7]
; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> %C)
store <4 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f16_16x16x16_f16(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5]
; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half> %A, <4 x half> %B, <4 x half> %C, i1 0)
store <4 x half> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5]
; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, i1 0)
store <4 x i16> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x16_iu8:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5]
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
store <4 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x16_iu4:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5]
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
store <4 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5]
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> %C)
store <4 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5]
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> %C)
store <4 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5]
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> %C)
store <4 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5]
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> %C)
store <4 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x32_iu4:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5]
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
store <4 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_f16(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_f16:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10
; GFX12-NEXT: global_store_b128 v[11:12], v[6:9], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index)
store <4 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10
; GFX12-NEXT: global_store_b128 v[11:12], v[6:9], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index)
store <4 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_swmmac_f16_16x16x32_f16(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f16_16x16x32_f16:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8
; GFX12-NEXT: global_store_b64 v[9:10], v[6:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index)
store <4 x half> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8
; GFX12-NEXT: global_store_b64 v[9:10], v[6:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index)
store <4 x i16> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7
; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 0)
store <4 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6
; GFX12-NEXT: global_store_b128 v[7:8], v[2:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index, i1 0)
store <4 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7
; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 0)
store <4 x i32> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7
; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index)
store <4 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7
; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index)
store <4 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7
; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index)
store <4 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7
; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index)
store <4 x float> %res, ptr addrspace(1) %out
ret void
}
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half>, <4 x half>, <4 x float>)
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16>, <4 x i16>, <4 x float>)
declare <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half>, <4 x half>, <4 x half>, i1 immarg)
declare <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16>, <4 x i16>, <4 x i16>, i1 immarg)
declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg)
declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg)
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>)
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>)
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>)
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>)
declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg)
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half>, <8 x half>, <4 x float>, i8)
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16>, <8 x i16>, <4 x float>, i8)
declare <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half>, <8 x half>, <4 x half>, i8)
declare <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16>, <8 x i16>, <4 x i16>, i8)
declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i8 %Index, i1 immarg)
declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i16 %Index, i1 immarg)
declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i16 %Index, i1 immarg)
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8)
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8)
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8)
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8)


@ -0,0 +1,354 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=GFX12 %s
# D0 overlaps A1, B1, C1 or Index1. Overlap starts at vgpr0.
# $D0 = wmma0 $A0, $B0, $C0 or $D0 = swmmac0 $A0, $B0, $C0, $Index0
# $D1 = wmma1 $A1, $B1, $C1 or $D1 = swmmac1 $A1, $B1, $C1, $Index1
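# In each pair below the first instruction writes D0 starting at vgpr0 and the
# second reads one of its inputs from those same registers, so the expected
# output has the post-RA hazard recognizer separating them with a V_NOP.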
---
name: test_wmma_f32_16x16x16_f16_D0_overlaps_A1
body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45
; GFX12-LABEL: name: test_wmma_f32_16x16x16_f16_D0_overlaps_A1
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
; GFX12-NEXT: V_NOP_e32 implicit $exec
; GFX12-NEXT: early-clobber renamable $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35, 8, killed $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
early-clobber renamable $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35, 8, killed $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, implicit $exec
...
---
name: test_wmma_f32_16x16x16_bf16_D0_overlaps_B1
body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45
; GFX12-LABEL: name: test_wmma_f32_16x16x16_bf16_D0_overlaps_B1
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
; GFX12-NEXT: V_NOP_e32 implicit $exec
; GFX12-NEXT: early-clobber renamable $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43 = V_WMMA_F32_16X16X16_BF16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
early-clobber renamable $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43 = V_WMMA_F32_16X16X16_BF16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, implicit $exec
...
---
name: test_wmma_f16_16x16x16_f16_D0_overlaps_C1
body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
; GFX12-LABEL: name: test_wmma_f16_16x16x16_f16_D0_overlaps_C1
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
; GFX12-NEXT: V_NOP_e32 implicit $exec
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F16_16X16X16_F16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F16_16X16X16_F16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
...
---
name: test_wmma_bf16_16x16x16_bf16_D0_overlaps_A1
body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
; GFX12-LABEL: name: test_wmma_bf16_16x16x16_bf16_D0_overlaps_A1
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
; GFX12-NEXT: V_NOP_e32 implicit $exec
; GFX12-NEXT: early-clobber renamable $vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_BF16_16X16X16_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35, 8, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
early-clobber renamable $vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_BF16_16X16X16_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35, 8, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec
...
---
name: test_wmma_i32_16x16x16_iu8_D0_overlaps_B1
body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
; GFX12-LABEL: name: test_wmma_i32_16x16x16_iu8_D0_overlaps_B1
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
; GFX12-NEXT: V_NOP_e32 implicit $exec
; GFX12-NEXT: early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_I32_16X16X16_IU8_w32_twoaddr 8, killed $vgpr28_vgpr29, 8, killed $vgpr0_vgpr1, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_I32_16X16X16_IU8_w32_twoaddr 8, killed $vgpr28_vgpr29, 8, killed $vgpr0_vgpr1, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec
...
---
name: test_wmma_i32_16x16x16_iu4_D0_overlaps_C1
body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39
; GFX12-LABEL: name: test_wmma_i32_16x16x16_iu4_D0_overlaps_C1
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
; GFX12-NEXT: V_NOP_e32 implicit $exec
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_I32_16X16X16_IU4_w32_twoaddr 8, killed $vgpr28, 8, killed $vgpr29, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_I32_16X16X16_IU4_w32_twoaddr 8, killed $vgpr28, 8, killed $vgpr29, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
...
---
name: test_wmma_f32_16x16x16_fp8_fp8_D0_overlaps_A1
body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
; GFX12-LABEL: name: test_wmma_f32_16x16x16_fp8_fp8_D0_overlaps_A1
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
; GFX12-NEXT: V_NOP_e32 implicit $exec
; GFX12-NEXT: early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X16_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1, killed $vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X16_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1, killed $vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec
...
---
name: test_wmma_f32_16x16x16_bf8_fp8_D0_overlaps_B1
body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
; GFX12-LABEL: name: test_wmma_f32_16x16x16_bf8_fp8_D0_overlaps_B1
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
; GFX12-NEXT: V_NOP_e32 implicit $exec
; GFX12-NEXT: early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X16_BF8_FP8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr0_vgpr1, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X16_BF8_FP8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr0_vgpr1, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec
...
---
name: test_wmma_f32_16x16x16_fp8_bf8_D0_overlaps_C1
body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
; GFX12-LABEL: name: test_wmma_f32_16x16x16_fp8_bf8_D0_overlaps_C1
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
; GFX12-NEXT: V_NOP_e32 implicit $exec
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_FP8_BF8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr30_vgpr31, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_FP8_BF8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr30_vgpr31, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
...
---
name: test_wmma_f32_16x16x16_bf8_bf8_D0_overlaps_A1
body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
; GFX12-LABEL: name: test_wmma_f32_16x16x16_bf8_bf8_D0_overlaps_A1
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
; GFX12-NEXT: V_NOP_e32 implicit $exec
; GFX12-NEXT: early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X16_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1, killed $vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X16_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1, killed $vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec
...
---
name: test_wmma_i32_16x16x32_iu4_D0_overlaps_B1
body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
; GFX12-LABEL: name: test_wmma_i32_16x16x32_iu4_D0_overlaps_B1
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
; GFX12-NEXT: V_NOP_e32 implicit $exec
; GFX12-NEXT: early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_I32_16X16X32_IU4_w32_twoaddr 8, killed $vgpr28_vgpr29, 8, killed $vgpr0_vgpr1, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_I32_16X16X32_IU4_w32_twoaddr 8, killed $vgpr28_vgpr29, 8, killed $vgpr0_vgpr1, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec
...
---
name: test_swmmac_f32_16x16x32_f16_D0_overlaps_C1
body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr48, $vgpr49, $vgpr50
; GFX12-LABEL: name: test_swmmac_f32_16x16x32_f16_D0_overlaps_C1
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr48, $vgpr49, $vgpr50
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_SWMMAC_F32_16X16X32_F16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr44, 0, 0, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_SWMMAC_F32_16X16X32_F16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr44, 0, 0, 0, implicit $exec
...
---
name: test_swmmac_f32_16x16x32_bf16_D0_overlaps_Index1
body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr48, $vgpr49, $vgpr50
; GFX12-LABEL: name: test_swmmac_f32_16x16x32_bf16_D0_overlaps_Index1
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr48, $vgpr49, $vgpr50
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
; GFX12-NEXT: V_NOP_e32 implicit $exec
; GFX12-NEXT: early-clobber renamable $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47 = V_SWMMAC_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr0, 0, 0, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
early-clobber renamable $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47 = V_SWMMAC_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr0, 0, 0, 0, implicit $exec
...
---
name: test_swmmac_f16_16x16x32_f16_D0_overlaps_A1
body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46
; GFX12-LABEL: name: test_swmmac_f16_16x16x32_f16_D0_overlaps_A1
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
; GFX12-NEXT: V_NOP_e32 implicit $exec
; GFX12-NEXT: early-clobber renamable $vgpr40_vgpr41_vgpr42_vgpr43 = V_SWMMAC_F16_16X16X32_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43, killed $vgpr44, 0, 0, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
early-clobber renamable $vgpr40_vgpr41_vgpr42_vgpr43 = V_SWMMAC_F16_16X16X32_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43, killed $vgpr44, 0, 0, 0, implicit $exec
...
---
name: test_swmmac_bf16_16x16x32_bf16_D0_overlaps_B1
body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46
; GFX12-LABEL: name: test_swmmac_bf16_16x16x32_bf16_D0_overlaps_B1
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
; GFX12-NEXT: V_NOP_e32 implicit $exec
; GFX12-NEXT: early-clobber renamable $vgpr40_vgpr41_vgpr42_vgpr43 = V_SWMMAC_BF16_16X16X32_BF16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr40_vgpr41_vgpr42_vgpr43, killed $vgpr44, 0, 0, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
early-clobber renamable $vgpr40_vgpr41_vgpr42_vgpr43 = V_SWMMAC_BF16_16X16X32_BF16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr40_vgpr41_vgpr42_vgpr43, killed $vgpr44, 0, 0, 0, implicit $exec
...
---
name: test_swmmac_i32_16x16x32_iu8_D0_overlaps_C1
body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44
; GFX12-LABEL: name: test_swmmac_i32_16x16x32_iu8_D0_overlaps_C1
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_SWMMAC_I32_16X16X32_IU8_w32_twoaddr 8, killed $vgpr28_vgpr29, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr42, 0, 0, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_SWMMAC_I32_16X16X32_IU8_w32_twoaddr 8, killed $vgpr28_vgpr29, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr42, 0, 0, 0, implicit $exec
...
---
name: test_swmmac_i32_16x16x32_iu4_D0_overlaps_Index1
body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
; GFX12-LABEL: name: test_swmmac_i32_16x16x32_iu4_D0_overlaps_Index1
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
; GFX12-NEXT: V_NOP_e32 implicit $exec
; GFX12-NEXT: early-clobber renamable $vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38 = V_SWMMAC_I32_16X16X32_IU4_w32_twoaddr 8, killed $vgpr28, 8, killed $vgpr28_vgpr29, killed $vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38, killed $vgpr0, 0, 0, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
early-clobber renamable $vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38 = V_SWMMAC_I32_16X16X32_IU4_w32_twoaddr 8, killed $vgpr28, 8, killed $vgpr28_vgpr29, killed $vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38, killed $vgpr0, 0, 0, 0, implicit $exec
...
---
name: test_swmmac_i32_16x16x64_iu4_D0_overlaps_A1
body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44
; GFX12-LABEL: name: test_swmmac_i32_16x16x64_iu4_D0_overlaps_A1
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
; GFX12-NEXT: V_NOP_e32 implicit $exec
; GFX12-NEXT: early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_I32_16X16X64_IU4_w32_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr42, 0, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_I32_16X16X64_IU4_w32_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr42, 0, 0, implicit $exec
...
---
name: test_swmmac_f32_16x16x32_fp8_fp8_D0_overlaps_B1
body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44
; GFX12-LABEL: name: test_swmmac_f32_16x16x32_fp8_fp8_D0_overlaps_B1
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
; GFX12-NEXT: V_NOP_e32 implicit $exec
; GFX12-NEXT: early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_F32_16X16X32_FP8_FP8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr42, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_F32_16X16X32_FP8_FP8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr42, 0, implicit $exec
...
---
name: test_swmmac_f32_16x16x32_fp8_bf8_D0_overlaps_C1
body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44
; GFX12-LABEL: name: test_swmmac_f32_16x16x32_fp8_bf8_D0_overlaps_C1
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_SWMMAC_F32_16X16X32_FP8_BF8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr42, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_SWMMAC_F32_16X16X32_FP8_BF8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr42, 0, implicit $exec
...
---
name: test_swmmac_f32_16x16x32_bf8_fp8_D0_overlaps_Index1
body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44
; GFX12-LABEL: name: test_swmmac_f32_16x16x32_bf8_fp8_D0_overlaps_Index1
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
; GFX12-NEXT: V_NOP_e32 implicit $exec
; GFX12-NEXT: early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_F32_16X16X32_BF8_FP8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr0, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_F32_16X16X32_BF8_FP8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr0, 0, implicit $exec
...
---
name: test_swmmac_f32_16x16x32_bf8_bf8_D0_overlaps_A1
body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44
; GFX12-LABEL: name: test_swmmac_f32_16x16x32_bf8_bf8_D0_overlaps_A1
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
; GFX12-NEXT: V_NOP_e32 implicit $exec
; GFX12-NEXT: early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_F32_16X16X32_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr42, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_F32_16X16X32_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr42, 0, implicit $exec
...

@ -0,0 +1,355 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=-wavefrontsize32,+wavefrontsize64 -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=GFX12 %s
# D0 overlaps A1, B1, C1 or Index1. Overlap starts at vgpr0.
# $D0 = wmma0 $A0, $B0, $C0 or $D0 = swmmac0 $A0, $B0, $C0, $Index0
# $D1 = wmma1 $A1, $B1, $C1 or $D1 = swmmac1 $A1, $B1, $C1, $Index1
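# Expected mitigation, summarized from the autogenerated GFX12 assertions in
# this file (an inference from the checks below, not an independent statement
# of the hardware hazard rules): when D0 overlaps A1, B1, or C1 of a following
# WMMA, or A1, B1, or Index1 of a following SWMMAC, the post-RA-hazard-rec
# pass is expected to insert one V_NOP_e32 between the two instructions; when
# D0 overlaps only the tied C1/dst operand of a following SWMMAC (e.g.
# test_swmmac_f32_16x16x32_f16_D0_overlaps_C1), no V_NOP_e32 is expected.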
---
name: test_wmma_f32_16x16x16_f16_D0_overlaps_A1
body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
; GFX12-LABEL: name: test_wmma_f32_16x16x16_f16_D0_overlaps_A1
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
; GFX12-NEXT: V_NOP_e32 implicit $exec
; GFX12-NEXT: early-clobber renamable $vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr18_vgpr19, 8, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
early-clobber renamable $vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr18_vgpr19, 8, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
...
---
name: test_wmma_f32_16x16x16_bf16_D0_overlaps_B1
body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
; GFX12-LABEL: name: test_wmma_f32_16x16x16_bf16_D0_overlaps_B1
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
; GFX12-NEXT: V_NOP_e32 implicit $exec
; GFX12-NEXT: early-clobber renamable $vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_BF16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr0_vgpr1, 8, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
early-clobber renamable $vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_BF16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr0_vgpr1, 8, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
...
---
name: test_wmma_f16_16x16x16_f16_D0_overlaps_C1
body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
; GFX12-LABEL: name: test_wmma_f16_16x16x16_f16_D0_overlaps_C1
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
; GFX12-NEXT: V_NOP_e32 implicit $exec
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1 = V_WMMA_F16_16X16X16_F16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr18_vgpr19, 8, killed $vgpr0_vgpr1, 0, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1 = V_WMMA_F16_16X16X16_F16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr18_vgpr19, 8, killed $vgpr0_vgpr1, 0, 0, implicit $exec
...
---
name: test_wmma_bf16_16x16x16_bf16_D0_overlaps_A1
body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
; GFX12-LABEL: name: test_wmma_bf16_16x16x16_bf16_D0_overlaps_A1
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
; GFX12-NEXT: V_NOP_e32 implicit $exec
; GFX12-NEXT: early-clobber renamable $vgpr20_vgpr21 = V_WMMA_BF16_16X16X16_BF16_w64_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr18_vgpr19, 8, killed $vgpr20_vgpr21, 0, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
early-clobber renamable $vgpr20_vgpr21 = V_WMMA_BF16_16X16X16_BF16_w64_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr18_vgpr19, 8, killed $vgpr20_vgpr21, 0, 0, implicit $exec
...
---
name: test_wmma_i32_16x16x16_iu8_D0_overlaps_B1
body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
; GFX12-LABEL: name: test_wmma_i32_16x16x16_iu8_D0_overlaps_B1
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
; GFX12-NEXT: V_NOP_e32 implicit $exec
; GFX12-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_I32_16X16X16_IU8_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr0, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_I32_16X16X16_IU8_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr0, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec
...
---
name: test_wmma_i32_16x16x16_iu4_D0_overlaps_C1
body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
; GFX12-LABEL: name: test_wmma_i32_16x16x16_iu4_D0_overlaps_C1
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
; GFX12-NEXT: V_NOP_e32 implicit $exec
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_I32_16X16X16_IU4_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr17, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_I32_16X16X16_IU4_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr17, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
...
---
name: test_wmma_f32_16x16x16_fp8_fp8_D0_overlaps_A1
body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
; GFX12-LABEL: name: test_wmma_f32_16x16x16_fp8_fp8_D0_overlaps_A1
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
; GFX12-NEXT: V_NOP_e32 implicit $exec
; GFX12-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_F32_16X16X16_FP8_FP8_w64_twoaddr killed $vgpr0, killed $vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_F32_16X16X16_FP8_FP8_w64_twoaddr killed $vgpr0, killed $vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec
...
---
name: test_wmma_f32_16x16x16_bf8_fp8_D0_overlaps_B1
body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
; GFX12-LABEL: name: test_wmma_f32_16x16x16_bf8_fp8_D0_overlaps_B1
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
; GFX12-NEXT: V_NOP_e32 implicit $exec
; GFX12-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_F32_16X16X16_BF8_FP8_w64_twoaddr killed $vgpr16, killed $vgpr0, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_F32_16X16X16_BF8_FP8_w64_twoaddr killed $vgpr16, killed $vgpr0, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec
...
---
name: test_wmma_f32_16x16x16_fp8_bf8_D0_overlaps_C1
body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
; GFX12-LABEL: name: test_wmma_f32_16x16x16_fp8_bf8_D0_overlaps_C1
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
; GFX12-NEXT: V_NOP_e32 implicit $exec
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_FP8_BF8_w64_twoaddr killed $vgpr16, killed $vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_FP8_BF8_w64_twoaddr killed $vgpr16, killed $vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
...
---
name: test_wmma_f32_16x16x16_bf8_bf8_D0_overlaps_A1
body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
; GFX12-LABEL: name: test_wmma_f32_16x16x16_bf8_bf8_D0_overlaps_A1
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
; GFX12-NEXT: V_NOP_e32 implicit $exec
; GFX12-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_F32_16X16X16_BF8_BF8_w64_twoaddr killed $vgpr0, killed $vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_F32_16X16X16_BF8_BF8_w64_twoaddr killed $vgpr0, killed $vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec
...
---
name: test_wmma_i32_16x16x32_iu4_D0_overlaps_B1
body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
; GFX12-LABEL: name: test_wmma_i32_16x16x32_iu4_D0_overlaps_B1
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
; GFX12-NEXT: V_NOP_e32 implicit $exec
; GFX12-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_I32_16X16X32_IU4_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr0, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_I32_16X16X32_IU4_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr0, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec
...
---
name: test_swmmac_f32_16x16x32_f16_D0_overlaps_C1
body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28
; GFX12-LABEL: name: test_swmmac_f32_16x16x32_f16_D0_overlaps_C1
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_SWMMAC_F32_16X16X32_F16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr26, 0, 0, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_SWMMAC_F32_16X16X32_F16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr26, 0, 0, 0, implicit $exec
...
---
name: test_swmmac_f32_16x16x32_bf16_D0_overlaps_Index1
body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28
; GFX12-LABEL: name: test_swmmac_f32_16x16x32_bf16_D0_overlaps_Index1
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
; GFX12-NEXT: V_NOP_e32 implicit $exec
; GFX12-NEXT: early-clobber renamable $vgpr22_vgpr23_vgpr24_vgpr25 = V_SWMMAC_F32_16X16X32_BF16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, killed $vgpr22_vgpr23_vgpr24_vgpr25, killed $vgpr0, 0, 0, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
early-clobber renamable $vgpr22_vgpr23_vgpr24_vgpr25 = V_SWMMAC_F32_16X16X32_BF16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, killed $vgpr22_vgpr23_vgpr24_vgpr25, killed $vgpr0, 0, 0, 0, implicit $exec
...
---
name: test_swmmac_f16_16x16x32_f16_D0_overlaps_A1
body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26
; GFX12-LABEL: name: test_swmmac_f16_16x16x32_f16_D0_overlaps_A1
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
; GFX12-NEXT: V_NOP_e32 implicit $exec
; GFX12-NEXT: early-clobber renamable $vgpr22_vgpr23 = V_SWMMAC_F16_16X16X32_F16_w64_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, killed $vgpr22_vgpr23, killed $vgpr24, 0, 0, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
early-clobber renamable $vgpr22_vgpr23 = V_SWMMAC_F16_16X16X32_F16_w64_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, killed $vgpr22_vgpr23, killed $vgpr24, 0, 0, 0, implicit $exec
...
---
name: test_swmmac_bf16_16x16x32_bf16_D0_overlaps_B1
body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26
; GFX12-LABEL: name: test_swmmac_bf16_16x16x32_bf16_D0_overlaps_B1
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
; GFX12-NEXT: V_NOP_e32 implicit $exec
; GFX12-NEXT: early-clobber renamable $vgpr22_vgpr23 = V_SWMMAC_BF16_16X16X32_BF16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr22_vgpr23, killed $vgpr24, 0, 0, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
early-clobber renamable $vgpr22_vgpr23 = V_SWMMAC_BF16_16X16X32_BF16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr22_vgpr23, killed $vgpr24, 0, 0, 0, implicit $exec
...
---
name: test_swmmac_i32_16x16x32_iu8_D0_overlaps_C1
body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
; GFX12-LABEL: name: test_swmmac_i32_16x16x32_iu8_D0_overlaps_C1
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_SWMMAC_I32_16X16X32_IU8_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr17_vgpr18, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr23, 0, 0, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_SWMMAC_I32_16X16X32_IU8_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr17_vgpr18, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr23, 0, 0, 0, implicit $exec
...
---
name: test_swmmac_i32_16x16x32_iu4_D0_overlaps_Index1
body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24
; GFX12-LABEL: name: test_swmmac_i32_16x16x32_iu4_D0_overlaps_Index1
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
; GFX12-NEXT: V_NOP_e32 implicit $exec
; GFX12-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_SWMMAC_I32_16X16X32_IU4_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr16, killed $vgpr18_vgpr19_vgpr20_vgpr21, killed $vgpr0, 0, 0, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_SWMMAC_I32_16X16X32_IU4_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr16, killed $vgpr18_vgpr19_vgpr20_vgpr21, killed $vgpr0, 0, 0, 0, implicit $exec
...
---
name: test_swmmac_i32_16x16x64_iu4_D0_overlaps_A1
body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
; GFX12-LABEL: name: test_swmmac_i32_16x16x64_iu4_D0_overlaps_A1
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
; GFX12-NEXT: V_NOP_e32 implicit $exec
; GFX12-NEXT: early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_I32_16X16X64_IU4_w64_twoaddr 8, killed $vgpr0, 8, killed $vgpr17_vgpr18, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr23, 0, 0, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_I32_16X16X64_IU4_w64_twoaddr 8, killed $vgpr0, 8, killed $vgpr17_vgpr18, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr23, 0, 0, 0, implicit $exec
...
---
name: test_swmmac_f32_16x16x32_fp8_fp8_D0_overlaps_B1
body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
; GFX12-LABEL: name: test_swmmac_f32_16x16x32_fp8_fp8_D0_overlaps_B1
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
; GFX12-NEXT: V_NOP_e32 implicit $exec
; GFX12-NEXT: early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_F32_16X16X32_FP8_FP8_w64_twoaddr killed $vgpr16, killed $vgpr0_vgpr1, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr23, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_F32_16X16X32_FP8_FP8_w64_twoaddr killed $vgpr16, killed $vgpr0_vgpr1, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr23, 0, implicit $exec
...
---
name: test_swmmac_f32_16x16x32_fp8_bf8_D0_overlaps_C1
body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
; GFX12-LABEL: name: test_swmmac_f32_16x16x32_fp8_bf8_D0_overlaps_C1
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_SWMMAC_F32_16X16X32_FP8_BF8_w64_twoaddr killed $vgpr16, killed $vgpr17_vgpr18, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr23, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_SWMMAC_F32_16X16X32_FP8_BF8_w64_twoaddr killed $vgpr16, killed $vgpr17_vgpr18, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr23, 0, implicit $exec
...
---
name: test_swmmac_f32_16x16x32_bf8_fp8_D0_overlaps_Index1
body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
; GFX12-LABEL: name: test_swmmac_f32_16x16x32_bf8_fp8_D0_overlaps_Index1
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
; GFX12-NEXT: V_NOP_e32 implicit $exec
; GFX12-NEXT: early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_F32_16X16X32_BF8_FP8_w64_twoaddr killed $vgpr16, killed $vgpr17_vgpr18, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr0, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_F32_16X16X32_BF8_FP8_w64_twoaddr killed $vgpr16, killed $vgpr17_vgpr18, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr0, 0, implicit $exec
...
---
name: test_swmmac_f32_16x16x32_bf8_bf8_D0_overlaps_A1
body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
; GFX12-LABEL: name: test_swmmac_f32_16x16x32_bf8_bf8_D0_overlaps_A1
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
; GFX12-NEXT: V_NOP_e32 implicit $exec
; GFX12-NEXT: early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_F32_16X16X32_BF8_BF8_w64_twoaddr killed $vgpr0, killed $vgpr17_vgpr18, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr23, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_F32_16X16X32_BF8_BF8_w64_twoaddr killed $vgpr0, killed $vgpr17_vgpr18, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr23, 0, implicit $exec
...
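
A note on the expectations above, summarizing what these cases check rather than adding new ones: for back-to-back wave64 WMMA/SWMMAC pairs, the GFX12 hazard recognizer must insert a V_NOP_e32 whenever the first instruction's destination D0 overlaps the second instruction's A, B, or index operand (and, in the plain-WMMA case above, its accumulator C); only the SWMMAC D0-overlaps-C1 cases are emitted back-to-back. The hazardous shape in miniature, reusing the register assignments of the last test above (an illustration, not an additional test):

  early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
  V_NOP_e32 implicit $exec                       ; required padding: $vgpr0 below reads part of D0
  early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_F32_16X16X32_BF8_BF8_w64_twoaddr killed $vgpr0, killed $vgpr17_vgpr18, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr23, 0, implicit $exec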

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -253,22 +253,23 @@ def ROCDL_mfma_f32_32x32x16_fp8_fp8 : ROCDL_Mfma_IntrOp<"mfma.f32.32x32x16.fp8.f
//===---------------------------------------------------------------------===//
// WMMA intrinsics
-class ROCDL_Wmma_IntrOp<string mnemonic, list<Trait> traits = []> :
+class ROCDL_Wmma_IntrOp<string mnemonic, list<int> overloadedOperands,
+                        list<Trait> traits = []> :
LLVM_IntrOpBase<ROCDL_Dialect, mnemonic,
"amdgcn_" # !subst(".","_", mnemonic),
-[0], [], traits, 1>,
+[0], overloadedOperands, traits, 1>,
Arguments<(ins Variadic<LLVM_Type>:$args)> {
let assemblyFormat =
"$args attr-dict `:` functional-type($args, $res)";
}
// Available on RDNA3
-def ROCDL_wmma_f32_16x16x16_f16 : ROCDL_Wmma_IntrOp<"wmma.f32.16x16x16.f16">;
-def ROCDL_wmma_f32_16x16x16_bf16 : ROCDL_Wmma_IntrOp<"wmma.f32.16x16x16.bf16">;
-def ROCDL_wmma_f16_16x16x16_f16 : ROCDL_Wmma_IntrOp<"wmma.f16.16x16x16.f16">;
-def ROCDL_wmma_bf16_16x16x16_bf16 : ROCDL_Wmma_IntrOp<"wmma.bf16.16x16x16.bf16">;
-def ROCDL_wmma_i32_16x16x16_iu8 : ROCDL_Wmma_IntrOp<"wmma.i32.16x16x16.iu8">;
-def ROCDL_wmma_i32_16x16x16_iu4 : ROCDL_Wmma_IntrOp<"wmma.i32.16x16x16.iu4">;
+def ROCDL_wmma_f32_16x16x16_f16 : ROCDL_Wmma_IntrOp<"wmma.f32.16x16x16.f16", [0]>;
+def ROCDL_wmma_f32_16x16x16_bf16 : ROCDL_Wmma_IntrOp<"wmma.f32.16x16x16.bf16", [0]>;
+def ROCDL_wmma_f16_16x16x16_f16 : ROCDL_Wmma_IntrOp<"wmma.f16.16x16x16.f16", [0]>;
+def ROCDL_wmma_bf16_16x16x16_bf16 : ROCDL_Wmma_IntrOp<"wmma.bf16.16x16x16.bf16", [0]>;
+def ROCDL_wmma_i32_16x16x16_iu8 : ROCDL_Wmma_IntrOp<"wmma.i32.16x16x16.iu8", [1]>;
+def ROCDL_wmma_i32_16x16x16_iu4 : ROCDL_Wmma_IntrOp<"wmma.i32.16x16x16.iu4", [1]>;
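// A sketch of what the overloadedOperands index buys (see the mangled names
// in the updated rocdl.mlir checks below): the listed operand's type is
// appended to the intrinsic name after the overloaded result type. Index 0
// selects the A matrix of the f16/bf16 variants; the iu8/iu4 variants pass
// index 1 because their operand 0 is the i1 signedness flag. For example,
// with wave32 types:
//   wmma.f32.16x16x16.f16 -> llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v16f16
//   wmma.i32.16x16x16.iu8 -> llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v4i32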
//===---------------------------------------------------------------------===//
// Operations on raw buffer resources (stride of 0, bounds checks either off or in

View File

@ -248,53 +248,53 @@ llvm.func @rocdl.wmma(%arg0 : vector<8xf32>, %arg1 : vector<16 x f16>, %arg2 : v
// ---- Wave32 -----
// f16 -> f32
-// CHECK: call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <8 x float> %{{.*}})
+// CHECK: call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v16f16(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <8 x float> %{{.*}})
%r0 = rocdl.wmma.f32.16x16x16.f16 %arg1, %arg1, %arg0 : (vector<16xf16>, vector<16xf16>, vector<8xf32>) -> vector<8xf32>
// bf16 -> f32
-// CHECK: call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <8 x float> %{{.*}})
+// CHECK: call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v16i16(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <8 x float> %{{.*}})
%r1 = rocdl.wmma.f32.16x16x16.bf16 %arg2, %arg2, %arg0 : (vector<16xi16>, vector<16xi16>, vector<8xf32>) -> vector<8xf32>
// f16 -> f16 (OPSEL = {0,1})
-// CHECK: call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}}, i1 {{.*}})
+// CHECK: call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}}, i1 {{.*}})
%r2 = rocdl.wmma.f16.16x16x16.f16 %arg1, %arg1, %arg1, %zero : (vector<16xf16>, vector<16xf16>, vector<16xf16>, i1) -> vector<16xf16>
// bf16 -> bf16 (OPSEL = {0,1})
-// CHECK: call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v16i16(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}, i1 {{.*}})
+// CHECK: call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v16i16.v16i16(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}, i1 {{.*}})
%r4 = rocdl.wmma.bf16.16x16x16.bf16 %arg2, %arg2, %arg2, %zero : (vector<16xi16>, vector<16xi16>, vector<16xi16>, i1) -> vector<16xi16>
// int8 -> int32 (signA = {0,1}, signB = {0,1}, clamp = {0,1})
-// CHECK: call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32(i1 {{.*}}, <4 x i32> %{{.*}}, i1 {{.*}}, <4 x i32> %{{.*}}, <8 x i32> %{{.*}}, i1 {{.*}})
+// CHECK: call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v4i32(i1 {{.*}}, <4 x i32> %{{.*}}, i1 {{.*}}, <4 x i32> %{{.*}}, <8 x i32> %{{.*}}, i1 {{.*}})
%r5 = rocdl.wmma.i32.16x16x16.iu8 %zero, %arg5, %zero, %arg5, %arg3, %zero : (i1, vector<4xi32>, i1, vector<4xi32>, vector<8xi32>, i1) -> vector<8xi32>
// int4 -> int32 (signA = {0,1}, signB = {0,1}, clamp = {0,1})
-// CHECK: call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32(i1 {{.*}}, <2 x i32> %{{.*}}, i1 {{.*}}, <2 x i32> %{{.*}}, <8 x i32> %{{.*}}, i1 {{.*}})
+// CHECK: call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.v2i32(i1 {{.*}}, <2 x i32> %{{.*}}, i1 {{.*}}, <2 x i32> %{{.*}}, <8 x i32> %{{.*}}, i1 {{.*}})
%r6 = rocdl.wmma.i32.16x16x16.iu4 %zero, %arg4, %zero, %arg4, %arg3, %zero : (i1, vector<2xi32>, i1, vector<2xi32>, vector<8xi32>, i1) -> vector<8xi32>
// ---- Wave64 -----
// f16 -> f32
-// CHECK: call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <4 x float> %{{.*}})
+// CHECK: call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v16f16(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <4 x float> %{{.*}})
%r7 = rocdl.wmma.f32.16x16x16.f16 %arg1, %arg1, %arg6 : (vector<16xf16>, vector<16xf16>, vector<4xf32>) -> vector<4xf32>
// bf16 -> f32
-// CHECK: call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <4 x float> %{{.*}})
+// CHECK: call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v16i16(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <4 x float> %{{.*}})
%r8 = rocdl.wmma.f32.16x16x16.bf16 %arg2, %arg2, %arg6 : (vector<16xi16>, vector<16xi16>, vector<4xf32>) -> vector<4xf32>
// f16 -> f16 (OPSEL = {0,1})
-// CHECK: call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <8 x half> %{{.*}}, i1 {{.*}})
+// CHECK: call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v16f16(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <8 x half> %{{.*}}, i1 {{.*}})
%r9 = rocdl.wmma.f16.16x16x16.f16 %arg1, %arg1, %arg7, %zero : (vector<16xf16>, vector<16xf16>, vector<8xf16>, i1) -> vector<8xf16>
// bf16 -> bf16 (OPSEL = {0,1})
-// CHECK: call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <8 x i16> %{{.*}}, i1 {{.*}})
+// CHECK: call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v16i16(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <8 x i16> %{{.*}}, i1 {{.*}})
%r11 = rocdl.wmma.bf16.16x16x16.bf16 %arg2, %arg2, %arg8, %zero : (vector<16xi16>, vector<16xi16>, vector<8xi16>, i1) -> vector<8xi16>
// int8 -> int32 (signA = {0,1}, signB = {0,1}, clamp = {0,1})
-// CHECK: call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32(i1 {{.*}}, <4 x i32> %{{.*}}, i1 {{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 {{.*}})
+// CHECK: call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 {{.*}}, <4 x i32> %{{.*}}, i1 {{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 {{.*}})
%r12 = rocdl.wmma.i32.16x16x16.iu8 %zero, %arg5, %zero, %arg5, %arg5, %zero : (i1, vector<4xi32>, i1, vector<4xi32>, vector<4xi32>, i1) -> vector<4xi32>
// int4 -> int32 (signA = {0,1}, signB = {0,1}, clamp = {0,1})
-// CHECK: call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32(i1 {{.*}}, <2 x i32> %{{.*}}, i1 {{.*}}, <2 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 {{.*}})
+// CHECK: call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 {{.*}}, <2 x i32> %{{.*}}, i1 {{.*}}, <2 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 {{.*}})
%r13 = rocdl.wmma.i32.16x16x16.iu4 %zero, %arg4, %zero, %arg4, %arg5, %zero : (i1, vector<2xi32>, i1, vector<2xi32>, vector<4xi32>, i1) -> vector<4xi32>
llvm.return %r0 : vector<8xf32>