mirror of
https://github.com/capstone-engine/llvm-capstone.git
synced 2024-11-22 21:30:10 +00:00
[AMDGPU] Add GFX12 WMMA and SWMMAC instructions (#77795)
Co-authored-by: Petar Avramovic <Petar.Avramovic@amd.com>
Co-authored-by: Piotr Sobczak <piotr.sobczak@amd.com>
This commit is contained in:
parent
aa4cb0e313
commit
ed48280f8e
@ -436,5 +436,67 @@ TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_i32, "ii*1", "nc", "gfx12-insts,w
|
||||
TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_v4i16, "V4sV4s*1", "nc", "gfx12-insts,wavefrontsize64")
|
||||
TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_v4f16, "V4hV4h*1", "nc", "gfx12-insts,wavefrontsize64")
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// WMMA builtins.
|
||||
// Postfix w32 indicates the builtin requires wavefront size of 32.
|
||||
// Postfix w64 indicates the builtin requires wavefront size of 64.
|
||||
//
|
||||
// Some of these are very similar to their GFX11 counterparts, but they don't
|
||||
// require replication of the A,B matrices, so they use fewer vector elements.
|
||||
// Therefore, we add an "_gfx12" suffix to distinguish them from the existing
|
||||
// builtins.
|
||||
//===----------------------------------------------------------------------===//
|
||||
TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12, "V8fV8hV8hV8f", "nc", "gfx12-insts,wavefrontsize32")
|
||||
TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12, "V8fV8sV8sV8f", "nc", "gfx12-insts,wavefrontsize32")
|
||||
TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12, "V8hV8hV8hV8h", "nc", "gfx12-insts,wavefrontsize32")
|
||||
TARGET_BUILTIN(__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32_gfx12, "V8sV8sV8sV8s", "nc", "gfx12-insts,wavefrontsize32")
|
||||
TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12, "V8iIbV2iIbV2iV8iIb", "nc", "gfx12-insts,wavefrontsize32")
|
||||
TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12, "V8iIbiIbiV8iIb", "nc", "gfx12-insts,wavefrontsize32")
|
||||
// These are gfx12-only, but for consistency with the other WMMA variants we're
|
||||
// keeping the "_gfx12" suffix.
|
||||
TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12, "V8fV2iV2iV8f", "nc", "gfx12-insts,wavefrontsize32")
|
||||
TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12, "V8fV2iV2iV8f", "nc", "gfx12-insts,wavefrontsize32")
|
||||
TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12, "V8fV2iV2iV8f", "nc", "gfx12-insts,wavefrontsize32")
|
||||
TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12, "V8fV2iV2iV8f", "nc", "gfx12-insts,wavefrontsize32")
|
||||
TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12, "V8iIbV2iIbV2iV8iIb", "nc", "gfx12-insts,wavefrontsize32")
|
||||
|
||||
TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12, "V4fV4hV4hV4f", "nc", "gfx12-insts,wavefrontsize64")
|
||||
TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64_gfx12, "V4fV4sV4sV4f", "nc", "gfx12-insts,wavefrontsize64")
|
||||
TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12, "V4hV4hV4hV4h", "nc", "gfx12-insts,wavefrontsize64")
|
||||
TARGET_BUILTIN(__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64_gfx12, "V4sV4sV4sV4s", "nc", "gfx12-insts,wavefrontsize64")
|
||||
TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64_gfx12, "V4iIbiIbiV4iIb", "nc", "gfx12-insts,wavefrontsize64")
|
||||
TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12, "V4iIbiIbiV4iIb", "nc", "gfx12-insts,wavefrontsize64")
|
||||
// These are gfx12-only, but for consistency with the other WMMA variants we're
|
||||
// keeping the "_gfx12" suffix.
|
||||
TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12, "V4fiiV4f", "nc", "gfx12-insts,wavefrontsize64")
|
||||
TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12, "V4fiiV4f", "nc", "gfx12-insts,wavefrontsize64")
|
||||
TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12, "V4fiiV4f", "nc", "gfx12-insts,wavefrontsize64")
|
||||
TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64_gfx12, "V4fiiV4f", "nc", "gfx12-insts,wavefrontsize64")
|
||||
TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12, "V4iIbiIbiV4iIb", "nc", "gfx12-insts,wavefrontsize64")
|
||||
|
||||
TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_f16_w32, "V8fV8hV16hV8fs", "nc", "gfx12-insts,wavefrontsize32")
|
||||
TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w32, "V8fV8sV16sV8fs", "nc", "gfx12-insts,wavefrontsize32")
|
||||
TARGET_BUILTIN(__builtin_amdgcn_swmmac_f16_16x16x32_f16_w32, "V8hV8hV16hV8hs", "nc", "gfx12-insts,wavefrontsize32")
|
||||
TARGET_BUILTIN(__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w32, "V8sV8sV16sV8ss", "nc", "gfx12-insts,wavefrontsize32")
|
||||
TARGET_BUILTIN(__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w32, "V8iIbV2iIbV4iV8isIb", "nc", "gfx12-insts,wavefrontsize32")
|
||||
TARGET_BUILTIN(__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w32, "V8iIbiIbV2iV8isIb", "nc", "gfx12-insts,wavefrontsize32")
|
||||
TARGET_BUILTIN(__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w32, "V8iIbV2iIbV4iV8isIb", "nc", "gfx12-insts,wavefrontsize32")
|
||||
TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32, "V8fV2iV4iV8fs", "nc", "gfx12-insts,wavefrontsize32")
|
||||
TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32, "V8fV2iV4iV8fs", "nc", "gfx12-insts,wavefrontsize32")
|
||||
TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32, "V8fV2iV4iV8fs", "nc", "gfx12-insts,wavefrontsize32")
|
||||
TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32, "V8fV2iV4iV8fs", "nc", "gfx12-insts,wavefrontsize32")
|
||||
|
||||
TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_f16_w64, "V4fV4hV8hV4fs", "nc", "gfx12-insts,wavefrontsize64")
|
||||
TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w64, "V4fV4sV8sV4fs", "nc", "gfx12-insts,wavefrontsize64")
|
||||
TARGET_BUILTIN(__builtin_amdgcn_swmmac_f16_16x16x32_f16_w64, "V4hV4hV8hV4hs", "nc", "gfx12-insts,wavefrontsize64")
|
||||
TARGET_BUILTIN(__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w64, "V4sV4sV8sV4ss", "nc", "gfx12-insts,wavefrontsize64")
|
||||
TARGET_BUILTIN(__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w64, "V4iIbiIbV2iV4isIb", "nc", "gfx12-insts,wavefrontsize64")
|
||||
TARGET_BUILTIN(__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w64, "V4iIbiIbiV4isIb", "nc", "gfx12-insts,wavefrontsize64")
|
||||
TARGET_BUILTIN(__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w64, "V4iIbiIbV2iV4isIb", "nc", "gfx12-insts,wavefrontsize64")
|
||||
TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64, "V4fiV2iV4fs", "nc", "gfx12-insts,wavefrontsize64")
|
||||
TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64, "V4fiV2iV4fs", "nc", "gfx12-insts,wavefrontsize64")
|
||||
TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64, "V4fiV2iV4fs", "nc", "gfx12-insts,wavefrontsize64")
|
||||
TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64, "V4fiV2iV4fs", "nc", "gfx12-insts,wavefrontsize64")
|
||||
|
||||
#undef BUILTIN
|
||||
#undef TARGET_BUILTIN
|
||||
|
@ -18279,65 +18279,216 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
|
||||
case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32:
|
||||
case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64:
|
||||
case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32:
|
||||
case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64: {
|
||||
case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64:
|
||||
case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32_gfx12:
|
||||
case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64_gfx12:
|
||||
case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12:
|
||||
case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12:
|
||||
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12:
|
||||
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64_gfx12:
|
||||
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12:
|
||||
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12:
|
||||
case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12:
|
||||
case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12:
|
||||
case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12:
|
||||
case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64_gfx12:
|
||||
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12:
|
||||
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12:
|
||||
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12:
|
||||
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12:
|
||||
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12:
|
||||
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12:
|
||||
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12:
|
||||
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64_gfx12:
|
||||
case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12:
|
||||
case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12:
|
||||
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w32:
|
||||
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w64:
|
||||
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w32:
|
||||
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w64:
|
||||
case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w32:
|
||||
case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w64:
|
||||
case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w32:
|
||||
case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w64:
|
||||
case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w32:
|
||||
case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w64:
|
||||
case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w32:
|
||||
case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w64:
|
||||
case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w32:
|
||||
case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w64:
|
||||
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32:
|
||||
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64:
|
||||
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32:
|
||||
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64:
|
||||
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32:
|
||||
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64:
|
||||
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32:
|
||||
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64: {
|
||||
|
||||
// These operations perform a matrix multiplication and accumulation of
|
||||
// the form:
|
||||
// D = A * B + C
|
||||
// The return type always matches the type of matrix C.
|
||||
unsigned ArgForMatchingRetType;
|
||||
// We need to specify one type for matrices AB and one for matrices CD.
|
||||
// Sparse matrix operations can have different types for A and B as well as
|
||||
// an additional type for sparsity index.
|
||||
// Destination type should be put before types used for source operands.
|
||||
SmallVector<unsigned, 2> ArgsForMatchingMatrixTypes;
|
||||
// On GFX12, the intrinsics with 16-bit accumulator use a packed layout.
|
||||
// There is no need for the variable opsel argument, so always set it to
|
||||
// "false".
|
||||
bool AppendFalseForOpselArg = false;
|
||||
unsigned BuiltinWMMAOp;
|
||||
|
||||
switch (BuiltinID) {
|
||||
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32:
|
||||
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64:
|
||||
ArgForMatchingRetType = 2;
|
||||
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12:
|
||||
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12:
|
||||
ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
|
||||
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_f16;
|
||||
break;
|
||||
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32:
|
||||
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64:
|
||||
ArgForMatchingRetType = 2;
|
||||
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12:
|
||||
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64_gfx12:
|
||||
ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
|
||||
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_bf16;
|
||||
break;
|
||||
case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12:
|
||||
case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12:
|
||||
AppendFalseForOpselArg = true;
|
||||
LLVM_FALLTHROUGH;
|
||||
case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32:
|
||||
case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64:
|
||||
ArgForMatchingRetType = 2;
|
||||
ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
|
||||
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x16_f16;
|
||||
break;
|
||||
case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32_gfx12:
|
||||
case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64_gfx12:
|
||||
AppendFalseForOpselArg = true;
|
||||
LLVM_FALLTHROUGH;
|
||||
case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32:
|
||||
case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64:
|
||||
ArgForMatchingRetType = 2;
|
||||
ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
|
||||
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16;
|
||||
break;
|
||||
case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32:
|
||||
case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64:
|
||||
ArgForMatchingRetType = 2;
|
||||
ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
|
||||
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x16_f16_tied;
|
||||
break;
|
||||
case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32:
|
||||
case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64:
|
||||
ArgForMatchingRetType = 2;
|
||||
ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
|
||||
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16_tied;
|
||||
break;
|
||||
case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32:
|
||||
case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64:
|
||||
ArgForMatchingRetType = 4;
|
||||
case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12:
|
||||
case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64_gfx12:
|
||||
ArgsForMatchingMatrixTypes = {4, 1}; // CD, AB
|
||||
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x16_iu8;
|
||||
break;
|
||||
case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32:
|
||||
case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64:
|
||||
ArgForMatchingRetType = 4;
|
||||
case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12:
|
||||
case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12:
|
||||
ArgsForMatchingMatrixTypes = {4, 1}; // CD, AB
|
||||
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x16_iu4;
|
||||
break;
|
||||
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12:
|
||||
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12:
|
||||
ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
|
||||
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_fp8;
|
||||
break;
|
||||
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12:
|
||||
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12:
|
||||
ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
|
||||
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_bf8;
|
||||
break;
|
||||
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12:
|
||||
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12:
|
||||
ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
|
||||
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_fp8;
|
||||
break;
|
||||
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12:
|
||||
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64_gfx12:
|
||||
ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
|
||||
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_bf8;
|
||||
break;
|
||||
case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12:
|
||||
case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12:
|
||||
ArgsForMatchingMatrixTypes = {4, 1}; // CD, AB
|
||||
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x32_iu4;
|
||||
break;
|
||||
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w32:
|
||||
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w64:
|
||||
ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
|
||||
BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_f16;
|
||||
break;
|
||||
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w32:
|
||||
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w64:
|
||||
ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
|
||||
BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16;
|
||||
break;
|
||||
case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w32:
|
||||
case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w64:
|
||||
ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
|
||||
BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f16_16x16x32_f16;
|
||||
break;
|
||||
case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w32:
|
||||
case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w64:
|
||||
ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
|
||||
BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16;
|
||||
break;
|
||||
case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w32:
|
||||
case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w64:
|
||||
ArgsForMatchingMatrixTypes = {4, 1, 3, 5}; // CD, A, B, Index
|
||||
BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8;
|
||||
break;
|
||||
case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w32:
|
||||
case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w64:
|
||||
ArgsForMatchingMatrixTypes = {4, 1, 3, 5}; // CD, A, B, Index
|
||||
BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4;
|
||||
break;
|
||||
case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w32:
|
||||
case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w64:
|
||||
ArgsForMatchingMatrixTypes = {4, 1, 3, 5}; // CD, A, B, Index
|
||||
BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4;
|
||||
break;
|
||||
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32:
|
||||
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64:
|
||||
ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
|
||||
BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8;
|
||||
break;
|
||||
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32:
|
||||
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64:
|
||||
ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
|
||||
BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8;
|
||||
break;
|
||||
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32:
|
||||
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64:
|
||||
ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
|
||||
BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8;
|
||||
break;
|
||||
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32:
|
||||
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64:
|
||||
ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
|
||||
BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8;
|
||||
break;
|
||||
}
|
||||
|
||||
SmallVector<Value *, 6> Args;
|
||||
for (int i = 0, e = E->getNumArgs(); i != e; ++i)
|
||||
Args.push_back(EmitScalarExpr(E->getArg(i)));
|
||||
if (AppendFalseForOpselArg)
|
||||
Args.push_back(Builder.getFalse());
|
||||
|
||||
Function *F = CGM.getIntrinsic(BuiltinWMMAOp,
|
||||
{Args[ArgForMatchingRetType]->getType()});
|
||||
SmallVector<llvm::Type *, 6> ArgTypes;
|
||||
for (auto ArgIdx : ArgsForMatchingMatrixTypes)
|
||||
ArgTypes.push_back(Args[ArgIdx]->getType());
|
||||
|
||||
Function *F = CGM.getIntrinsic(BuiltinWMMAOp, ArgTypes);
|
||||
return Builder.CreateCall(F, Args);
|
||||
}
|
||||
|
||||
|
156
clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w32.cl
Normal file
156
clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w32.cl
Normal file
@ -0,0 +1,156 @@
|
||||
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
|
||||
// REQUIRES: amdgpu-registered-target
|
||||
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize32 -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200
|
||||
|
||||
typedef int v2i __attribute__((ext_vector_type(2)));
|
||||
typedef float v8f __attribute__((ext_vector_type(8)));
|
||||
typedef half v8h __attribute__((ext_vector_type(8)));
|
||||
typedef short v8s __attribute__((ext_vector_type(8)));
|
||||
typedef int v8i __attribute__((ext_vector_type(8)));
|
||||
|
||||
// Wave32
|
||||
|
||||
//
|
||||
// amdgcn_wmma_f32_16x16x16_f16
|
||||
//
|
||||
|
||||
// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_f16_w32(
|
||||
// CHECK-GFX1200-NEXT: entry:
|
||||
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16(<8 x half> [[A:%.*]], <8 x half> [[B:%.*]], <8 x float> [[C:%.*]])
|
||||
// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4:![0-9]+]]
|
||||
// CHECK-GFX1200-NEXT: ret void
|
||||
//
|
||||
void test_amdgcn_wmma_f32_16x16x16_f16_w32(global v8f* out, v8h a, v8h b, v8f c)
|
||||
{
|
||||
*out = __builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12(a, b, c);
|
||||
}
|
||||
|
||||
//
|
||||
// amdgcn_wmma_f32_16x16x16_bf16
|
||||
//
|
||||
|
||||
// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_bf16_w32(
|
||||
// CHECK-GFX1200-NEXT: entry:
|
||||
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x float> [[C:%.*]])
|
||||
// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
|
||||
// CHECK-GFX1200-NEXT: ret void
|
||||
//
|
||||
void test_amdgcn_wmma_f32_16x16x16_bf16_w32(global v8f* out, v8s a, v8s b, v8f c)
|
||||
{
|
||||
*out = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12(a, b, c);
|
||||
}
|
||||
|
||||
//
|
||||
// amdgcn_wmma_f16_16x16x16_f16
|
||||
//
|
||||
|
||||
// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f16_16x16x16_f16_w32(
|
||||
// CHECK-GFX1200-NEXT: entry:
|
||||
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> [[A:%.*]], <8 x half> [[B:%.*]], <8 x half> [[C:%.*]], i1 false)
|
||||
// CHECK-GFX1200-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
|
||||
// CHECK-GFX1200-NEXT: ret void
|
||||
//
|
||||
void test_amdgcn_wmma_f16_16x16x16_f16_w32(global v8h* out, v8h a, v8h b, v8h c)
|
||||
{
|
||||
*out = __builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12(a, b, c);
|
||||
}
|
||||
|
||||
//
|
||||
// amdgcn_wmma_bf16_16x16x16_bf16
|
||||
//
|
||||
|
||||
// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_bf16_16x16x16_bf16_w32(
|
||||
// CHECK-GFX1200-NEXT: entry:
|
||||
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], i1 false)
|
||||
// CHECK-GFX1200-NEXT: store <8 x i16> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
|
||||
// CHECK-GFX1200-NEXT: ret void
|
||||
//
|
||||
void test_amdgcn_wmma_bf16_16x16x16_bf16_w32(global v8s* out, v8s a, v8s b, v8s c)
|
||||
{
|
||||
*out = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32_gfx12(a, b, c);
|
||||
}
|
||||
|
||||
//
|
||||
// amdgcn_wmma_i32_16x16x16_iu8
|
||||
//
|
||||
|
||||
// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_i32_16x16x16_iu8_w32(
|
||||
// CHECK-GFX1200-NEXT: entry:
|
||||
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32(i1 true, <2 x i32> [[A:%.*]], i1 true, <2 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i1 false)
|
||||
// CHECK-GFX1200-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
|
||||
// CHECK-GFX1200-NEXT: ret void
|
||||
//
|
||||
void test_amdgcn_wmma_i32_16x16x16_iu8_w32(global v8i* out, v2i a, v2i b, v8i c)
|
||||
{
|
||||
*out = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12(true, a, true, b, c, false);
|
||||
}
|
||||
|
||||
//
|
||||
// amdgcn_wmma_i32_16x16x16_iu4
|
||||
//
|
||||
|
||||
// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_i32_16x16x16_iu4_w32(
|
||||
// CHECK-GFX1200-NEXT: entry:
|
||||
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32(i1 true, i32 [[A:%.*]], i1 true, i32 [[B:%.*]], <8 x i32> [[C:%.*]], i1 false)
|
||||
// CHECK-GFX1200-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
|
||||
// CHECK-GFX1200-NEXT: ret void
|
||||
//
|
||||
void test_amdgcn_wmma_i32_16x16x16_iu4_w32(global v8i* out, int a, int b, v8i c)
|
||||
{
|
||||
*out = __builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12(true, a, true, b, c, false);
|
||||
}
|
||||
|
||||
// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32(
|
||||
// CHECK-GFX1200-NEXT: entry:
|
||||
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <8 x float> [[C:%.*]])
|
||||
// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
|
||||
// CHECK-GFX1200-NEXT: ret void
|
||||
//
|
||||
void test_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32(global v8f* out, v2i a, v2i b, v8f c)
|
||||
{
|
||||
*out = __builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12(a, b, c);
|
||||
}
|
||||
|
||||
// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32(
|
||||
// CHECK-GFX1200-NEXT: entry:
|
||||
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <8 x float> [[C:%.*]])
|
||||
// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
|
||||
// CHECK-GFX1200-NEXT: ret void
|
||||
//
|
||||
void test_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32(global v8f* out, v2i a, v2i b, v8f c)
|
||||
{
|
||||
*out = __builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12(a, b, c);
|
||||
}
|
||||
|
||||
// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32(
|
||||
// CHECK-GFX1200-NEXT: entry:
|
||||
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <8 x float> [[C:%.*]])
|
||||
// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
|
||||
// CHECK-GFX1200-NEXT: ret void
|
||||
//
|
||||
void test_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32(global v8f* out, v2i a, v2i b, v8f c)
|
||||
{
|
||||
*out = __builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12(a, b, c);
|
||||
}
|
||||
|
||||
// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32(
|
||||
// CHECK-GFX1200-NEXT: entry:
|
||||
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <8 x float> [[C:%.*]])
|
||||
// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
|
||||
// CHECK-GFX1200-NEXT: ret void
|
||||
//
|
||||
void test_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32(global v8f* out, v2i a, v2i b, v8f c)
|
||||
{
|
||||
*out = __builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12(a, b, c);
|
||||
}
|
||||
|
||||
// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_i32_16x16x32_iu4_w32(
|
||||
// CHECK-GFX1200-NEXT: entry:
|
||||
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32(i1 true, <2 x i32> [[A:%.*]], i1 true, <2 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i1 false)
|
||||
// CHECK-GFX1200-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
|
||||
// CHECK-GFX1200-NEXT: ret void
|
||||
//
|
||||
void test_amdgcn_wmma_i32_16x16x32_iu4_w32(global v8i* out, v2i a, v2i b, v8i c)
|
||||
{
|
||||
*out = __builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12(true, a, true, b, c, false);
|
||||
}
|
155
clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w64.cl
Normal file
155
clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w64.cl
Normal file
@ -0,0 +1,155 @@
|
||||
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
|
||||
// REQUIRES: amdgpu-registered-target
|
||||
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize64 -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200
|
||||
|
||||
typedef float v4f __attribute__((ext_vector_type(4)));
|
||||
typedef half v4h __attribute__((ext_vector_type(4)));
|
||||
typedef short v4s __attribute__((ext_vector_type(4)));
|
||||
typedef int v4i __attribute__((ext_vector_type(4)));
|
||||
|
||||
// Wave64
|
||||
|
||||
//
|
||||
// amdgcn_wmma_f32_16x16x16_f16
|
||||
//
|
||||
|
||||
// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_f16_w64(
|
||||
// CHECK-GFX1200-NEXT: entry:
|
||||
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half> [[A:%.*]], <4 x half> [[B:%.*]], <4 x float> [[C:%.*]])
|
||||
// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4:![0-9]+]]
|
||||
// CHECK-GFX1200-NEXT: ret void
|
||||
//
|
||||
void test_amdgcn_wmma_f32_16x16x16_f16_w64(global v4f* out, v4h a, v4h b, v4f c)
|
||||
{
|
||||
*out = __builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12(a, b, c);
|
||||
}
|
||||
|
||||
//
|
||||
// amdgcn_wmma_f32_16x16x16_bf16
|
||||
//
|
||||
|
||||
// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_bf16_w64(
|
||||
// CHECK-GFX1200-NEXT: entry:
|
||||
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x float> [[C:%.*]])
|
||||
// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
|
||||
// CHECK-GFX1200-NEXT: ret void
|
||||
//
|
||||
void test_amdgcn_wmma_f32_16x16x16_bf16_w64(global v4f* out, v4s a, v4s b, v4f c)
|
||||
{
|
||||
*out = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w64_gfx12(a, b, c);
|
||||
}
|
||||
|
||||
//
|
||||
// amdgcn_wmma_f16_16x16x16_f16
|
||||
//
|
||||
|
||||
// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f16_16x16x16_f16_w64(
|
||||
// CHECK-GFX1200-NEXT: entry:
|
||||
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16(<4 x half> [[A:%.*]], <4 x half> [[B:%.*]], <4 x half> [[C:%.*]], i1 false)
|
||||
// CHECK-GFX1200-NEXT: store <4 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 8, !tbaa [[TBAA4]]
|
||||
// CHECK-GFX1200-NEXT: ret void
|
||||
//
|
||||
void test_amdgcn_wmma_f16_16x16x16_f16_w64(global v4h* out, v4h a, v4h b, v4h c)
|
||||
{
|
||||
*out = __builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12(a, b, c);
|
||||
}
|
||||
|
||||
//
|
||||
// amdgcn_wmma_bf16_16x16x16_bf16
|
||||
//
|
||||
|
||||
// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_bf16_16x16x16_bf16_w64(
|
||||
// CHECK-GFX1200-NEXT: entry:
|
||||
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[C:%.*]], i1 false)
|
||||
// CHECK-GFX1200-NEXT: store <4 x i16> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 8, !tbaa [[TBAA4]]
|
||||
// CHECK-GFX1200-NEXT: ret void
|
||||
//
|
||||
void test_amdgcn_wmma_bf16_16x16x16_bf16_w64(global v4s* out, v4s a, v4s b, v4s c)
{
  // Forward a, b and c to the wave64 GFX12 bf16 -> bf16 WMMA builtin and write
  // the returned vector through out.
  const v4s result =
      __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64_gfx12(a, b, c);
  out[0] = result;
}
|
||||
|
||||
//
|
||||
// amdgcn_wmma_i32_16x16x16_iu8
|
||||
//
|
||||
|
||||
// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_i32_16x16x16_iu8_w64(
|
||||
// CHECK-GFX1200-NEXT: entry:
|
||||
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32(i1 true, i32 [[A:%.*]], i1 true, i32 [[B:%.*]], <4 x i32> [[C:%.*]], i1 false)
|
||||
// CHECK-GFX1200-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
|
||||
// CHECK-GFX1200-NEXT: ret void
|
||||
//
|
||||
void test_amdgcn_wmma_i32_16x16x16_iu8_w64(global v4i* out, int a, int b, v4i c)
{
  // Call the wave64 GFX12 iu8 WMMA builtin with a, b and c; the three boolean
  // arguments are passed as the literals true, true and false. The returned
  // vector is written through out.
  const v4i result = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w64_gfx12(
      true, a, true, b, c, false);
  out[0] = result;
}
|
||||
|
||||
//
|
||||
// amdgcn_wmma_i32_16x16x16_iu4
|
||||
//
|
||||
|
||||
// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_i32_16x16x16_iu4_w64(
|
||||
// CHECK-GFX1200-NEXT: entry:
|
||||
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32(i1 true, i32 [[A:%.*]], i1 true, i32 [[B:%.*]], <4 x i32> [[C:%.*]], i1 false)
|
||||
// CHECK-GFX1200-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
|
||||
// CHECK-GFX1200-NEXT: ret void
|
||||
//
|
||||
void test_amdgcn_wmma_i32_16x16x16_iu4_w64(global v4i* out, int a, int b, v4i c)
{
  // Call the wave64 GFX12 iu4 WMMA builtin with a, b and c; the three boolean
  // arguments are passed as the literals true, true and false. The returned
  // vector is written through out.
  const v4i result = __builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12(
      true, a, true, b, c, false);
  out[0] = result;
}
|
||||
|
||||
// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32(
|
||||
// CHECK-GFX1200-NEXT: entry:
|
||||
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32(i32 [[A:%.*]], i32 [[B:%.*]], <4 x float> [[C:%.*]])
|
||||
// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
|
||||
// CHECK-GFX1200-NEXT: ret void
|
||||
//
|
||||
void test_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32(global v4f* out, int a, int b, v4f c)
{
  // NOTE(review): the function name ends in _w32, but the builtin called here
  // is the _w64_gfx12 variant. The name looks like a copy/paste leftover;
  // consider renaming it (together with its CHECK-LABEL) to _w64.
  //
  // Forward a, b and c to the fp8 x fp8 WMMA builtin and write the returned
  // vector through out.
  const v4f result =
      __builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12(a, b, c);
  out[0] = result;
}
|
||||
|
||||
// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32(
|
||||
// CHECK-GFX1200-NEXT: entry:
|
||||
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32(i32 [[A:%.*]], i32 [[B:%.*]], <4 x float> [[C:%.*]])
|
||||
// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
|
||||
// CHECK-GFX1200-NEXT: ret void
|
||||
//
|
||||
void test_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32(global v4f* out, int a, int b, v4f c)
{
  // NOTE(review): the function name ends in _w32, but the builtin called here
  // is the _w64_gfx12 variant. The name looks like a copy/paste leftover;
  // consider renaming it (together with its CHECK-LABEL) to _w64.
  //
  // Forward a, b and c to the fp8 x bf8 WMMA builtin and write the returned
  // vector through out.
  const v4f result =
      __builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12(a, b, c);
  out[0] = result;
}
|
||||
|
||||
// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32(
|
||||
// CHECK-GFX1200-NEXT: entry:
|
||||
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32(i32 [[A:%.*]], i32 [[B:%.*]], <4 x float> [[C:%.*]])
|
||||
// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
|
||||
// CHECK-GFX1200-NEXT: ret void
|
||||
//
|
||||
void test_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32(global v4f* out, int a, int b, v4f c)
{
  // NOTE(review): the function name ends in _w32, but the builtin called here
  // is the _w64_gfx12 variant. The name looks like a copy/paste leftover;
  // consider renaming it (together with its CHECK-LABEL) to _w64.
  //
  // Forward a, b and c to the bf8 x fp8 WMMA builtin and write the returned
  // vector through out.
  const v4f result =
      __builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12(a, b, c);
  out[0] = result;
}
|
||||
|
||||
// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32(
|
||||
// CHECK-GFX1200-NEXT: entry:
|
||||
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32(i32 [[A:%.*]], i32 [[B:%.*]], <4 x float> [[C:%.*]])
|
||||
// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
|
||||
// CHECK-GFX1200-NEXT: ret void
|
||||
//
|
||||
void test_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32(global v4f* out, int a, int b, v4f c)
{
  // NOTE(review): the function name ends in _w32, but the builtin called here
  // is the _w64_gfx12 variant. The name looks like a copy/paste leftover;
  // consider renaming it (together with its CHECK-LABEL) to _w64.
  //
  // Forward a, b and c to the bf8 x bf8 WMMA builtin and write the returned
  // vector through out.
  const v4f result =
      __builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64_gfx12(a, b, c);
  out[0] = result;
}
|
||||
|
||||
// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_i32_16x16x32_iu4_w32(
|
||||
// CHECK-GFX1200-NEXT: entry:
|
||||
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32(i1 true, i32 [[A:%.*]], i1 true, i32 [[B:%.*]], <4 x i32> [[C:%.*]], i1 false)
|
||||
// CHECK-GFX1200-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
|
||||
// CHECK-GFX1200-NEXT: ret void
|
||||
//
|
||||
void test_amdgcn_wmma_i32_16x16x32_iu4_w32(global v4i* out, int a, int b, v4i c)
{
  // NOTE(review): the function name ends in _w32, but the builtin called here
  // is the _w64_gfx12 variant. The name looks like a copy/paste leftover;
  // consider renaming it (together with its CHECK-LABEL) to _w64.
  //
  // Call the 16x16x32 iu4 WMMA builtin with a, b and c; the three boolean
  // arguments are passed as the literals true, true and false. The returned
  // vector is written through out.
  const v4i result = __builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12(
      true, a, true, b, c, false);
  out[0] = result;
}
|
135
clang/test/CodeGenOpenCL/builtins-amdgcn-swmmac-w32.cl
Normal file
135
clang/test/CodeGenOpenCL/builtins-amdgcn-swmmac-w32.cl
Normal file
@ -0,0 +1,135 @@
|
||||
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
|
||||
// REQUIRES: amdgpu-registered-target
|
||||
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize32 -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200
|
||||
|
||||
// Vector typedefs (Clang ext_vector_type) used as the parameter and pointee
// types of the wave32 SWMMAC test functions in this file.
typedef int v2i __attribute__((ext_vector_type(2)));
|
||||
typedef int v4i __attribute__((ext_vector_type(4)));
|
||||
typedef float v8f __attribute__((ext_vector_type(8)));
|
||||
typedef half v8h __attribute__((ext_vector_type(8)));
|
||||
typedef short v8s __attribute__((ext_vector_type(8)));
|
||||
typedef int v8i __attribute__((ext_vector_type(8)));
|
||||
typedef half v16h __attribute__((ext_vector_type(16)));
|
||||
typedef short v16s __attribute__((ext_vector_type(16)));
|
||||
|
||||
// Wave32
|
||||
|
||||
// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_f16_w32(
|
||||
// CHECK-GFX1200-NEXT: entry:
|
||||
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.i16(<8 x half> [[A:%.*]], <16 x half> [[B:%.*]], <8 x float> [[C:%.*]], i16 [[INDEX:%.*]])
|
||||
// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4:![0-9]+]]
|
||||
// CHECK-GFX1200-NEXT: ret void
|
||||
//
|
||||
void test_amdgcn_swmmac_f32_16x16x32_f16_w32(global v8f* out, v8h a, v16h b, v8f c, short index)
{
  // Forward a, b, c and index to the wave32 f16 -> f32 SWMMAC builtin and
  // write the returned vector through out.
  const v8f result =
      __builtin_amdgcn_swmmac_f32_16x16x32_f16_w32(a, b, c, index);
  out[0] = result;
}
|
||||
|
||||
// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_bf16_w32(
|
||||
// CHECK-GFX1200-NEXT: entry:
|
||||
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.i16(<8 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x float> [[C:%.*]], i16 [[INDEX:%.*]])
|
||||
// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
|
||||
// CHECK-GFX1200-NEXT: ret void
|
||||
//
|
||||
void test_amdgcn_swmmac_f32_16x16x32_bf16_w32(global v8f* out, v8s a, v16s b, v8f c, short index)
{
  // Forward a, b, c and index to the wave32 bf16 -> f32 SWMMAC builtin and
  // write the returned vector through out.
  const v8f result =
      __builtin_amdgcn_swmmac_f32_16x16x32_bf16_w32(a, b, c, index);
  out[0] = result;
}
|
||||
|
||||
// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f16_16x16x32_f16_w32(
|
||||
// CHECK-GFX1200-NEXT: entry:
|
||||
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> [[A:%.*]], <16 x half> [[B:%.*]], <8 x half> [[C:%.*]], i16 [[INDEX:%.*]])
|
||||
// CHECK-GFX1200-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
|
||||
// CHECK-GFX1200-NEXT: ret void
|
||||
//
|
||||
void test_amdgcn_swmmac_f16_16x16x32_f16_w32(global v8h* out, v8h a, v16h b, v8h c, short index)
{
  // Forward a, b, c and index to the wave32 f16 -> f16 SWMMAC builtin and
  // write the returned vector through out.
  const v8h result =
      __builtin_amdgcn_swmmac_f16_16x16x32_f16_w32(a, b, c, index);
  out[0] = result;
}
|
||||
|
||||
// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_bf16_16x16x32_bf16_w32(
|
||||
// CHECK-GFX1200-NEXT: entry:
|
||||
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.i16(<8 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], i16 [[INDEX:%.*]])
|
||||
// CHECK-GFX1200-NEXT: store <8 x i16> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
|
||||
// CHECK-GFX1200-NEXT: ret void
|
||||
//
|
||||
void test_amdgcn_swmmac_bf16_16x16x32_bf16_w32(global v8s* out, v8s a, v16s b, v8s c, short index)
{
  // Forward a, b, c and index to the wave32 bf16 -> bf16 SWMMAC builtin and
  // write the returned vector through out.
  const v8s result =
      __builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w32(a, b, c, index);
  out[0] = result;
}
|
||||
|
||||
// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_i32_16x16x32_iu8_w32(
|
||||
// CHECK-GFX1200-NEXT: entry:
|
||||
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.i16(i1 true, <2 x i32> [[A:%.*]], i1 true, <4 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i16 [[INDEX:%.*]], i1 true)
|
||||
// CHECK-GFX1200-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
|
||||
// CHECK-GFX1200-NEXT: ret void
|
||||
//
|
||||
void test_amdgcn_swmmac_i32_16x16x32_iu8_w32(global v8i* out, v2i a, v4i b, v8i c, short index)
{
  // Call the wave32 iu8 SWMMAC builtin with a, b, c and index; all three
  // boolean arguments are passed as the literal true. The returned vector is
  // written through out.
  const v8i result = __builtin_amdgcn_swmmac_i32_16x16x32_iu8_w32(
      true, a, true, b, c, index, true);
  out[0] = result;
}
|
||||
|
||||
// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_i32_16x16x32_iu4_w32(
|
||||
// CHECK-GFX1200-NEXT: entry:
|
||||
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.i16(i1 true, i32 [[A:%.*]], i1 true, <2 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i16 [[INDEX:%.*]], i1 true)
|
||||
// CHECK-GFX1200-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
|
||||
// CHECK-GFX1200-NEXT: ret void
|
||||
//
|
||||
void test_amdgcn_swmmac_i32_16x16x32_iu4_w32(global v8i* out, int a, v2i b, v8i c, short index)
{
  // Call the wave32 16x16x32 iu4 SWMMAC builtin with a, b, c and index; all
  // three boolean arguments are passed as the literal true. The returned
  // vector is written through out.
  const v8i result = __builtin_amdgcn_swmmac_i32_16x16x32_iu4_w32(
      true, a, true, b, c, index, true);
  out[0] = result;
}
|
||||
|
||||
// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_i32_16x16x64_iu4_w32(
|
||||
// CHECK-GFX1200-NEXT: entry:
|
||||
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.i16(i1 true, <2 x i32> [[A:%.*]], i1 true, <4 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i16 [[INDEX:%.*]], i1 true)
|
||||
// CHECK-GFX1200-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
|
||||
// CHECK-GFX1200-NEXT: ret void
|
||||
//
|
||||
void test_amdgcn_swmmac_i32_16x16x64_iu4_w32(global v8i* out, v2i a, v4i b, v8i c, short index)
{
  // Call the wave32 16x16x64 iu4 SWMMAC builtin with a, b, c and index; all
  // three boolean arguments are passed as the literal true. The returned
  // vector is written through out.
  const v8i result = __builtin_amdgcn_swmmac_i32_16x16x64_iu4_w32(
      true, a, true, b, c, index, true);
  out[0] = result;
}
|
||||
|
||||
// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32(
|
||||
// CHECK-GFX1200-NEXT: entry:
|
||||
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.i16(<2 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <8 x float> [[C:%.*]], i16 [[INDEX:%.*]])
|
||||
// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
|
||||
// CHECK-GFX1200-NEXT: ret void
|
||||
//
|
||||
void test_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32(global v8f* out, v2i a, v4i b, v8f c, short index)
{
  // Forward a, b, c and index to the wave32 fp8 x fp8 SWMMAC builtin and
  // write the returned vector through out.
  const v8f result =
      __builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32(a, b, c, index);
  out[0] = result;
}
|
||||
|
||||
// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32(
|
||||
// CHECK-GFX1200-NEXT: entry:
|
||||
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.i16(<2 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <8 x float> [[C:%.*]], i16 [[INDEX:%.*]])
|
||||
// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
|
||||
// CHECK-GFX1200-NEXT: ret void
|
||||
//
|
||||
void test_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32(global v8f* out, v2i a, v4i b, v8f c, short index)
{
  // Forward a, b, c and index to the wave32 fp8 x bf8 SWMMAC builtin and
  // write the returned vector through out.
  const v8f result =
      __builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32(a, b, c, index);
  out[0] = result;
}
|
||||
|
||||
// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32(
|
||||
// CHECK-GFX1200-NEXT: entry:
|
||||
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.i16(<2 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <8 x float> [[C:%.*]], i16 [[INDEX:%.*]])
|
||||
// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
|
||||
// CHECK-GFX1200-NEXT: ret void
|
||||
//
|
||||
void test_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32(global v8f* out, v2i a, v4i b, v8f c, short index)
{
  // Forward a, b, c and index to the wave32 bf8 x fp8 SWMMAC builtin and
  // write the returned vector through out.
  const v8f result =
      __builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32(a, b, c, index);
  out[0] = result;
}
|
||||
|
||||
// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32(
|
||||
// CHECK-GFX1200-NEXT: entry:
|
||||
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.i16(<2 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <8 x float> [[C:%.*]], i16 [[INDEX:%.*]])
|
||||
// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
|
||||
// CHECK-GFX1200-NEXT: ret void
|
||||
//
|
||||
void test_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32(global v8f* out, v2i a, v4i b, v8f c, short index)
{
  // Forward a, b, c and index to the wave32 bf8 x bf8 SWMMAC builtin and
  // write the returned vector through out.
  const v8f result =
      __builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32(a, b, c, index);
  out[0] = result;
}
|
134
clang/test/CodeGenOpenCL/builtins-amdgcn-swmmac-w64.cl
Normal file
134
clang/test/CodeGenOpenCL/builtins-amdgcn-swmmac-w64.cl
Normal file
@ -0,0 +1,134 @@
|
||||
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
|
||||
// REQUIRES: amdgpu-registered-target
|
||||
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize64 -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200
|
||||
|
||||
// Vector typedefs (Clang ext_vector_type) used as the parameter and pointee
// types of the wave64 SWMMAC test functions in this file.
typedef int v2i __attribute__((ext_vector_type(2)));
|
||||
typedef int v4i __attribute__((ext_vector_type(4)));
|
||||
typedef float v4f __attribute__((ext_vector_type(4)));
|
||||
typedef half v4h __attribute__((ext_vector_type(4)));
|
||||
typedef short v4s __attribute__((ext_vector_type(4)));
|
||||
typedef half v8h __attribute__((ext_vector_type(8)));
|
||||
typedef short v8s __attribute__((ext_vector_type(8)));
|
||||
|
||||
// Wave64
|
||||
|
||||
// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_f16_w64(
|
||||
// CHECK-GFX1200-NEXT: entry:
|
||||
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.i16(<4 x half> [[A:%.*]], <8 x half> [[B:%.*]], <4 x float> [[C:%.*]], i16 [[INDEX:%.*]])
|
||||
// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4:![0-9]+]]
|
||||
// CHECK-GFX1200-NEXT: ret void
|
||||
//
|
||||
void test_amdgcn_swmmac_f32_16x16x32_f16_w64(global v4f* out, v4h a, v8h b, v4f c, short index)
{
  // Forward a, b, c and index to the wave64 f16 -> f32 SWMMAC builtin and
  // write the returned vector through out.
  const v4f result =
      __builtin_amdgcn_swmmac_f32_16x16x32_f16_w64(a, b, c, index);
  out[0] = result;
}
|
||||
|
||||
// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_bf16_w64(
|
||||
// CHECK-GFX1200-NEXT: entry:
|
||||
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.i16(<4 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <4 x float> [[C:%.*]], i16 [[INDEX:%.*]])
|
||||
// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
|
||||
// CHECK-GFX1200-NEXT: ret void
|
||||
//
|
||||
void test_amdgcn_swmmac_f32_16x16x32_bf16_w64(global v4f* out, v4s a, v8s b, v4f c, short index)
{
  // Forward a, b, c and index to the wave64 bf16 -> f32 SWMMAC builtin and
  // write the returned vector through out.
  const v4f result =
      __builtin_amdgcn_swmmac_f32_16x16x32_bf16_w64(a, b, c, index);
  out[0] = result;
}
|
||||
|
||||
// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f16_16x16x32_f16_w64(
|
||||
// CHECK-GFX1200-NEXT: entry:
|
||||
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.i16(<4 x half> [[A:%.*]], <8 x half> [[B:%.*]], <4 x half> [[C:%.*]], i16 [[INDEX:%.*]])
|
||||
// CHECK-GFX1200-NEXT: store <4 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 8, !tbaa [[TBAA4]]
|
||||
// CHECK-GFX1200-NEXT: ret void
|
||||
//
|
||||
void test_amdgcn_swmmac_f16_16x16x32_f16_w64(global v4h* out, v4h a, v8h b, v4h c, short index)
{
  // Forward a, b, c and index to the wave64 f16 -> f16 SWMMAC builtin and
  // write the returned vector through out.
  const v4h result =
      __builtin_amdgcn_swmmac_f16_16x16x32_f16_w64(a, b, c, index);
  out[0] = result;
}
|
||||
|
||||
// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_bf16_16x16x32_bf16_w64(
|
||||
// CHECK-GFX1200-NEXT: entry:
|
||||
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.i16(<4 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <4 x i16> [[C:%.*]], i16 [[INDEX:%.*]])
|
||||
// CHECK-GFX1200-NEXT: store <4 x i16> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 8, !tbaa [[TBAA4]]
|
||||
// CHECK-GFX1200-NEXT: ret void
|
||||
//
|
||||
void test_amdgcn_swmmac_bf16_16x16x32_bf16_w64(global v4s* out, v4s a, v8s b, v4s c, short index)
{
  // Forward a, b, c and index to the wave64 bf16 -> bf16 SWMMAC builtin and
  // write the returned vector through out.
  const v4s result =
      __builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w64(a, b, c, index);
  out[0] = result;
}
|
||||
|
||||
// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_i32_16x16x32_iu8_w64(
|
||||
// CHECK-GFX1200-NEXT: entry:
|
||||
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.i16(i1 true, i32 [[A:%.*]], i1 true, <2 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], i16 [[INDEX:%.*]], i1 true)
|
||||
// CHECK-GFX1200-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
|
||||
// CHECK-GFX1200-NEXT: ret void
|
||||
//
|
||||
void test_amdgcn_swmmac_i32_16x16x32_iu8_w64(global v4i* out, int a, v2i b, v4i c, short index)
{
  // Call the wave64 iu8 SWMMAC builtin with a, b, c and index; all three
  // boolean arguments are passed as the literal true. The returned vector is
  // written through out.
  const v4i result = __builtin_amdgcn_swmmac_i32_16x16x32_iu8_w64(
      true, a, true, b, c, index, true);
  out[0] = result;
}
|
||||
|
||||
// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_i32_16x16x32_iu4_w64(
|
||||
// CHECK-GFX1200-NEXT: entry:
|
||||
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.i16(i1 true, i32 [[A:%.*]], i1 true, i32 [[B:%.*]], <4 x i32> [[C:%.*]], i16 [[INDEX:%.*]], i1 true)
|
||||
// CHECK-GFX1200-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
|
||||
// CHECK-GFX1200-NEXT: ret void
|
||||
//
|
||||
void test_amdgcn_swmmac_i32_16x16x32_iu4_w64(global v4i* out, int a, int b, v4i c, short index)
{
  // Call the wave64 16x16x32 iu4 SWMMAC builtin with a, b, c and index; all
  // three boolean arguments are passed as the literal true. The returned
  // vector is written through out.
  const v4i result = __builtin_amdgcn_swmmac_i32_16x16x32_iu4_w64(
      true, a, true, b, c, index, true);
  out[0] = result;
}
|
||||
|
||||
// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_i32_16x16x64_iu4_w64(
|
||||
// CHECK-GFX1200-NEXT: entry:
|
||||
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.i16(i1 true, i32 [[A:%.*]], i1 true, <2 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], i16 [[INDEX:%.*]], i1 true)
|
||||
// CHECK-GFX1200-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
|
||||
// CHECK-GFX1200-NEXT: ret void
|
||||
//
|
||||
void test_amdgcn_swmmac_i32_16x16x64_iu4_w64(global v4i* out, int a, v2i b, v4i c, short index)
{
  // Call the wave64 16x16x64 iu4 SWMMAC builtin with a, b, c and index; all
  // three boolean arguments are passed as the literal true. The returned
  // vector is written through out.
  const v4i result = __builtin_amdgcn_swmmac_i32_16x16x64_iu4_w64(
      true, a, true, b, c, index, true);
  out[0] = result;
}
|
||||
|
||||
// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64(
|
||||
// CHECK-GFX1200-NEXT: entry:
|
||||
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.i16(i32 [[A:%.*]], <2 x i32> [[B:%.*]], <4 x float> [[C:%.*]], i16 [[INDEX:%.*]])
|
||||
// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
|
||||
// CHECK-GFX1200-NEXT: ret void
|
||||
//
|
||||
void test_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64(global v4f* out, int a, v2i b, v4f c, short index)
{
  // Forward a, b, c and index to the wave64 fp8 x fp8 SWMMAC builtin and
  // write the returned vector through out.
  const v4f result =
      __builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64(a, b, c, index);
  out[0] = result;
}
|
||||
|
||||
// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64(
|
||||
// CHECK-GFX1200-NEXT: entry:
|
||||
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.i16(i32 [[A:%.*]], <2 x i32> [[B:%.*]], <4 x float> [[C:%.*]], i16 [[INDEX:%.*]])
|
||||
// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
|
||||
// CHECK-GFX1200-NEXT: ret void
|
||||
//
|
||||
void test_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64(global v4f* out, int a, v2i b, v4f c, short index)
{
  // Forward a, b, c and index to the wave64 fp8 x bf8 SWMMAC builtin and
  // write the returned vector through out.
  const v4f result =
      __builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64(a, b, c, index);
  out[0] = result;
}
|
||||
|
||||
// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64(
|
||||
// CHECK-GFX1200-NEXT: entry:
|
||||
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.i16(i32 [[A:%.*]], <2 x i32> [[B:%.*]], <4 x float> [[C:%.*]], i16 [[INDEX:%.*]])
|
||||
// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
|
||||
// CHECK-GFX1200-NEXT: ret void
|
||||
//
|
||||
void test_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64(global v4f* out, int a, v2i b, v4f c, short index)
{
  // Forward a, b, c and index to the wave64 bf8 x fp8 SWMMAC builtin and
  // write the returned vector through out.
  const v4f result =
      __builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64(a, b, c, index);
  out[0] = result;
}
|
||||
|
||||
// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64(
|
||||
// CHECK-GFX1200-NEXT: entry:
|
||||
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.i16(i32 [[A:%.*]], <2 x i32> [[B:%.*]], <4 x float> [[C:%.*]], i16 [[INDEX:%.*]])
|
||||
// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
|
||||
// CHECK-GFX1200-NEXT: ret void
|
||||
//
|
||||
void test_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64(global v4f* out, int a, v2i b, v4f c, short index)
{
  // Forward a, b, c and index to the wave64 bf8 x bf8 SWMMAC builtin and
  // write the returned vector through out.
  const v4f result =
      __builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64(a, b, c, index);
  out[0] = result;
}
|
@ -16,7 +16,7 @@ typedef short v16s __attribute__((ext_vector_type(16)));
|
||||
|
||||
// Wave32
|
||||
|
||||
void test_amdgcn_wmma_f32_16x16x16_bf16_w32(global v8f* out8f, v16s a16s, v16s b16s, v8f c8f,
|
||||
void test_amdgcn_wmma_f32_16x16x16_bf16_w32(global v8f* out8f, v16s a16s, v16s b16s, v8f c8f,
|
||||
global v16h* out16h, v16h a16h, v16h b16h, v16h c16h,
|
||||
global v16s* out16s, v2i a2i, v2i b2i, v16s c16s,
|
||||
global v8i* out8i, v4i a4i, v4i b4i, v8i c8i)
|
||||
|
@ -2,7 +2,6 @@
|
||||
// REQUIRES: amdgpu-registered-target
|
||||
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1100 -DWMMA_GFX1100_TESTS -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1100
|
||||
|
||||
typedef float v4f __attribute__((ext_vector_type(4)));
|
||||
typedef float v8f __attribute__((ext_vector_type(8)));
|
||||
typedef half v16h __attribute__((ext_vector_type(16)));
|
||||
typedef int v2i __attribute__((ext_vector_type(2)));
|
||||
@ -20,7 +19,7 @@ typedef short v16s __attribute__((ext_vector_type(16)));
|
||||
|
||||
// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_f32_16x16x16_f16_w32(
|
||||
// CHECK-GFX1100-NEXT: entry:
|
||||
// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <8 x float> [[C:%.*]])
|
||||
// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v16f16(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <8 x float> [[C:%.*]])
|
||||
// CHECK-GFX1100-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4:![0-9]+]]
|
||||
// CHECK-GFX1100-NEXT: ret void
|
||||
//
|
||||
@ -35,7 +34,7 @@ void test_amdgcn_wmma_f32_16x16x16_f16_w32(global v8f* out, v16h a, v16h b, v8f
|
||||
|
||||
// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_f32_16x16x16_bf16_w32(
|
||||
// CHECK-GFX1100-NEXT: entry:
|
||||
// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x float> [[C:%.*]])
|
||||
// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v16i16(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x float> [[C:%.*]])
|
||||
// CHECK-GFX1100-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
|
||||
// CHECK-GFX1100-NEXT: ret void
|
||||
//
|
||||
@ -50,7 +49,7 @@ void test_amdgcn_wmma_f32_16x16x16_bf16_w32(global v8f* out, v16s a, v16s b, v8f
|
||||
|
||||
// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_f16_16x16x16_f16_w32(
|
||||
// CHECK-GFX1100-NEXT: entry:
|
||||
// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <16 x half> [[C:%.*]], i1 true)
|
||||
// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <16 x half> [[C:%.*]], i1 true)
|
||||
// CHECK-GFX1100-NEXT: store <16 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
|
||||
// CHECK-GFX1100-NEXT: ret void
|
||||
//
|
||||
@ -65,7 +64,7 @@ void test_amdgcn_wmma_f16_16x16x16_f16_w32(global v16h* out, v16h a, v16h b, v16
|
||||
|
||||
// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_bf16_16x16x16_bf16_w32(
|
||||
// CHECK-GFX1100-NEXT: entry:
|
||||
// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v16i16(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i16> [[C:%.*]], i1 true)
|
||||
// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v16i16.v16i16(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i16> [[C:%.*]], i1 true)
|
||||
// CHECK-GFX1100-NEXT: store <16 x i16> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
|
||||
// CHECK-GFX1100-NEXT: ret void
|
||||
//
|
||||
@ -80,7 +79,7 @@ void test_amdgcn_wmma_bf16_16x16x16_bf16_w32(global v16s* out, v16s a, v16s b, v
|
||||
|
||||
// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_f16_16x16x16_f16_tied_w32(
|
||||
// CHECK-GFX1100-NEXT: entry:
|
||||
// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.tied.v16f16(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <16 x half> [[C:%.*]], i1 true)
|
||||
// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.tied.v16f16.v16f16(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <16 x half> [[C:%.*]], i1 true)
|
||||
// CHECK-GFX1100-NEXT: store <16 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
|
||||
// CHECK-GFX1100-NEXT: ret void
|
||||
//
|
||||
@ -95,7 +94,7 @@ void test_amdgcn_wmma_f16_16x16x16_f16_tied_w32(global v16h* out, v16h a, v16h b
|
||||
|
||||
// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32(
|
||||
// CHECK-GFX1100-NEXT: entry:
|
||||
// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.tied.v16i16(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i16> [[C:%.*]], i1 true)
|
||||
// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.tied.v16i16.v16i16(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i16> [[C:%.*]], i1 true)
|
||||
// CHECK-GFX1100-NEXT: store <16 x i16> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
|
||||
// CHECK-GFX1100-NEXT: ret void
|
||||
//
|
||||
@ -110,7 +109,7 @@ void test_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32(global v16s* out, v16s a, v16s
|
||||
|
||||
// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_i32_16x16x16_iu8_w32(
|
||||
// CHECK-GFX1100-NEXT: entry:
|
||||
// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32(i1 true, <4 x i32> [[A:%.*]], i1 true, <4 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i1 false)
|
||||
// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v4i32(i1 true, <4 x i32> [[A:%.*]], i1 true, <4 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i1 false)
|
||||
// CHECK-GFX1100-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
|
||||
// CHECK-GFX1100-NEXT: ret void
|
||||
//
|
||||
@ -125,7 +124,7 @@ void test_amdgcn_wmma_i32_16x16x16_iu8_w32(global v8i* out, v4i a, v4i b, v8i c)
|
||||
|
||||
// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_i32_16x16x16_iu4_w32(
|
||||
// CHECK-GFX1100-NEXT: entry:
|
||||
// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32(i1 true, <2 x i32> [[A:%.*]], i1 true, <2 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i1 false)
|
||||
// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.v2i32(i1 true, <2 x i32> [[A:%.*]], i1 true, <2 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i1 false)
|
||||
// CHECK-GFX1100-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
|
||||
// CHECK-GFX1100-NEXT: ret void
|
||||
//
|
||||
|
@ -3,12 +3,10 @@
|
||||
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1100 -DWMMA_GFX1100_TESTS -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1100
|
||||
|
||||
typedef float v4f __attribute__((ext_vector_type(4)));
|
||||
typedef float v8f __attribute__((ext_vector_type(8)));
|
||||
typedef half v8h __attribute__((ext_vector_type(8)));
|
||||
typedef half v16h __attribute__((ext_vector_type(16)));
|
||||
typedef int v2i __attribute__((ext_vector_type(2)));
|
||||
typedef int v4i __attribute__((ext_vector_type(4)));
|
||||
typedef int v8i __attribute__((ext_vector_type(8)));
|
||||
typedef short v8s __attribute__((ext_vector_type(8)));
|
||||
typedef short v16s __attribute__((ext_vector_type(16)));
|
||||
|
||||
@ -22,7 +20,7 @@ typedef short v16s __attribute__((ext_vector_type(16)));
|
||||
|
||||
// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_f32_16x16x16_f16_w64(
|
||||
// CHECK-GFX1100-NEXT: entry:
|
||||
// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <4 x float> [[C:%.*]])
|
||||
// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v16f16(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <4 x float> [[C:%.*]])
|
||||
// CHECK-GFX1100-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4:![0-9]+]]
|
||||
// CHECK-GFX1100-NEXT: ret void
|
||||
//
|
||||
@ -37,7 +35,7 @@ void test_amdgcn_wmma_f32_16x16x16_f16_w64(global v4f* out, v16h a, v16h b, v4f
|
||||
|
||||
// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_f32_16x16x16_bf16_w64(
|
||||
// CHECK-GFX1100-NEXT: entry:
|
||||
// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <4 x float> [[C:%.*]])
|
||||
// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v16i16(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <4 x float> [[C:%.*]])
|
||||
// CHECK-GFX1100-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
|
||||
// CHECK-GFX1100-NEXT: ret void
|
||||
//
|
||||
@ -52,7 +50,7 @@ void test_amdgcn_wmma_f32_16x16x16_bf16_w64(global v4f* out, v16s a, v16s b, v4f
|
||||
|
||||
// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_f16_16x16x16_f16_w64(
|
||||
// CHECK-GFX1100-NEXT: entry:
|
||||
// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <8 x half> [[C:%.*]], i1 true)
|
||||
// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v16f16(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <8 x half> [[C:%.*]], i1 true)
|
||||
// CHECK-GFX1100-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
|
||||
// CHECK-GFX1100-NEXT: ret void
|
||||
//
|
||||
@ -67,7 +65,7 @@ void test_amdgcn_wmma_f16_16x16x16_f16_w64(global v8h* out, v16h a, v16h b, v8h
|
||||
|
||||
// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_bf16_16x16x16_bf16_w64(
|
||||
// CHECK-GFX1100-NEXT: entry:
|
||||
// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], i1 true)
|
||||
// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v16i16(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], i1 true)
|
||||
// CHECK-GFX1100-NEXT: store <8 x i16> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
|
||||
// CHECK-GFX1100-NEXT: ret void
|
||||
//
|
||||
@ -82,7 +80,7 @@ void test_amdgcn_wmma_bf16_16x16x16_bf16_w64(global v8s* out, v16s a, v16s b, v8
|
||||
|
||||
// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_f16_16x16x16_f16_tied_w64(
|
||||
// CHECK-GFX1100-NEXT: entry:
|
||||
// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.tied.v8f16(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <8 x half> [[C:%.*]], i1 true)
|
||||
// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.tied.v8f16.v16f16(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <8 x half> [[C:%.*]], i1 true)
|
||||
// CHECK-GFX1100-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
|
||||
// CHECK-GFX1100-NEXT: ret void
|
||||
//
|
||||
@ -97,7 +95,7 @@ void test_amdgcn_wmma_f16_16x16x16_f16_tied_w64(global v8h* out, v16h a, v16h b,
|
||||
|
||||
// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64(
|
||||
// CHECK-GFX1100-NEXT: entry:
|
||||
// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.tied.v8i16(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], i1 true)
|
||||
// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.tied.v8i16.v16i16(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], i1 true)
|
||||
// CHECK-GFX1100-NEXT: store <8 x i16> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
|
||||
// CHECK-GFX1100-NEXT: ret void
|
||||
//
|
||||
@ -112,7 +110,7 @@ void test_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64(global v8s* out, v16s a, v16s
|
||||
|
||||
// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_i32_16x16x16_iu8_w64(
|
||||
// CHECK-GFX1100-NEXT: entry:
|
||||
// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32(i1 true, <4 x i32> [[A:%.*]], i1 true, <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], i1 false)
|
||||
// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 true, <4 x i32> [[A:%.*]], i1 true, <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], i1 false)
|
||||
// CHECK-GFX1100-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
|
||||
// CHECK-GFX1100-NEXT: ret void
|
||||
//
|
||||
@ -127,7 +125,7 @@ void test_amdgcn_wmma_i32_16x16x16_iu8_w64(global v4i* out, v4i a, v4i b, v4i c)
|
||||
|
||||
// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_i32_16x16x16_iu4_w64(
|
||||
// CHECK-GFX1100-NEXT: entry:
|
||||
// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32(i1 true, <2 x i32> [[A:%.*]], i1 true, <2 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], i1 false)
|
||||
// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 true, <2 x i32> [[A:%.*]], i1 true, <2 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], i1 false)
|
||||
// CHECK-GFX1100-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
|
||||
// CHECK-GFX1100-NEXT: ret void
|
||||
//
|
||||
|
107
cross-project-tests/amdgpu/builtins-amdgcn-gfx12-wmma-w32.cl
Normal file
107
cross-project-tests/amdgpu/builtins-amdgcn-gfx12-wmma-w32.cl
Normal file
@ -0,0 +1,107 @@
|
||||
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize32 -S -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200
|
||||
|
||||
typedef int v2i __attribute__((ext_vector_type(2)));
|
||||
typedef float v8f __attribute__((ext_vector_type(8)));
|
||||
typedef half v8h __attribute__((ext_vector_type(8)));
|
||||
typedef short v8s __attribute__((ext_vector_type(8)));
|
||||
typedef int v8i __attribute__((ext_vector_type(8)));
|
||||
|
||||
// Wave32
|
||||
|
||||
|
||||
// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f32_16x16x16_f16_w32:
|
||||
// CHECK-GFX1200: v_wmma_f32_16x16x16_f16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
|
||||
//
|
||||
void test_amdgcn_wmma_f32_16x16x16_f16_w32(global v8f* out, v8h a, v8h b, v8f c)
|
||||
{
|
||||
*out = __builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12(a, b, c);
|
||||
}
|
||||
|
||||
|
||||
// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f32_16x16x16_bf16_w32:
|
||||
// CHECK-GFX1200: v_wmma_f32_16x16x16_bf16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
|
||||
//
|
||||
void test_amdgcn_wmma_f32_16x16x16_bf16_w32(global v8f* out, v8s a, v8s b, v8f c)
|
||||
{
|
||||
*out = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12(a, b, c);
|
||||
}
|
||||
|
||||
|
||||
// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f16_16x16x16_f16_w32:
|
||||
// CHECK-GFX1200: v_wmma_f16_16x16x16_f16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
|
||||
//
|
||||
void test_amdgcn_wmma_f16_16x16x16_f16_w32(global v8h* out, v8h a, v8h b, v8h c)
|
||||
{
|
||||
*out = __builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12(a, b, c);
|
||||
}
|
||||
|
||||
|
||||
// CHECK-GFX1200-LABEL: test_amdgcn_wmma_bf16_16x16x16_bf16_w32:
|
||||
// CHECK-GFX1200: v_wmma_bf16_16x16x16_bf16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
|
||||
void test_amdgcn_wmma_bf16_16x16x16_bf16_w32(global v8s* out, v8s a, v8s b, v8s c)
|
||||
{
|
||||
*out = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32_gfx12(a, b, c);
|
||||
}
|
||||
|
||||
|
||||
// CHECK-GFX1200-LABEL: test_amdgcn_wmma_i32_16x16x16_iu8_w32:
|
||||
// CHECK-GFX1200: v_wmma_i32_16x16x16_iu8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] neg_lo:[1,1,0]
|
||||
//
|
||||
void test_amdgcn_wmma_i32_16x16x16_iu8_w32(global v8i* out, v2i a, v2i b, v8i c)
|
||||
{
|
||||
*out = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12(true, a, true, b, c, false);
|
||||
}
|
||||
|
||||
|
||||
// CHECK-GFX1200-LABEL: test_amdgcn_wmma_i32_16x16x16_iu4_w32:
|
||||
// CHECK-GFX1200: v_wmma_i32_16x16x16_iu4 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] neg_lo:[1,1,0]
|
||||
//
|
||||
void test_amdgcn_wmma_i32_16x16x16_iu4_w32(global v8i* out, int a, int b, v8i c)
|
||||
{
|
||||
*out = __builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12(true, a, true, b, c, false);
|
||||
}
|
||||
|
||||
|
||||
// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32:
|
||||
// CHECK-GFX1200: v_wmma_f32_16x16x16_fp8_fp8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
|
||||
//
|
||||
void test_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32(global v8f* out, v2i a, v2i b, v8f c)
|
||||
{
|
||||
*out = __builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12(a, b, c);
|
||||
}
|
||||
|
||||
|
||||
// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32:
|
||||
// CHECK-GFX1200: v_wmma_f32_16x16x16_fp8_bf8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
|
||||
//
|
||||
void test_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32(global v8f* out, v2i a, v2i b, v8f c)
|
||||
{
|
||||
*out = __builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12(a, b, c);
|
||||
}
|
||||
|
||||
|
||||
// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32:
|
||||
// CHECK-GFX1200: v_wmma_f32_16x16x16_bf8_fp8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
|
||||
//
|
||||
void test_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32(global v8f* out, v2i a, v2i b, v8f c)
|
||||
{
|
||||
*out = __builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12(a, b, c);
|
||||
}
|
||||
|
||||
|
||||
// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32:
|
||||
// CHECK-GFX1200: v_wmma_f32_16x16x16_bf8_bf8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
|
||||
//
|
||||
void test_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32(global v8f* out, v2i a, v2i b, v8f c)
|
||||
{
|
||||
*out = __builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12(a, b, c);
|
||||
}
|
||||
|
||||
|
||||
// CHECK-GFX1200-LABEL: test_amdgcn_wmma_i32_16x16x32_iu4_w32:
|
||||
// CHECK-GFX1200: v_wmma_i32_16x16x32_iu4 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] neg_lo:[1,1,0]
|
||||
//
|
||||
void test_amdgcn_wmma_i32_16x16x32_iu4_w32(global v8i* out, v2i a, v2i b, v8i c)
|
||||
{
|
||||
*out = __builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12(true, a, true, b, c, false);
|
||||
}
|
104
cross-project-tests/amdgpu/builtins-amdgcn-gfx12-wmma-w64.cl
Normal file
104
cross-project-tests/amdgpu/builtins-amdgcn-gfx12-wmma-w64.cl
Normal file
@ -0,0 +1,104 @@
|
||||
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize64 -S -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200
|
||||
|
||||
typedef float v4f __attribute__((ext_vector_type(4)));
|
||||
typedef half v4h __attribute__((ext_vector_type(4)));
|
||||
typedef short v4s __attribute__((ext_vector_type(4)));
|
||||
typedef int v4i __attribute__((ext_vector_type(4)));
|
||||
|
||||
// Wave64
|
||||
|
||||
|
||||
// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f32_16x16x16_f16_w64:
|
||||
// CHECK-GFX1200: v_wmma_f32_16x16x16_f16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
|
||||
//
|
||||
void test_amdgcn_wmma_f32_16x16x16_f16_w64(global v4f* out, v4h a, v4h b, v4f c)
|
||||
{
|
||||
*out = __builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12(a, b, c);
|
||||
}
|
||||
|
||||
|
||||
// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f32_16x16x16_bf16_w64:
|
||||
// CHECK-GFX1200: v_wmma_f32_16x16x16_bf16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
|
||||
//
|
||||
void test_amdgcn_wmma_f32_16x16x16_bf16_w64(global v4f* out, v4s a, v4s b, v4f c)
|
||||
{
|
||||
*out = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w64_gfx12(a, b, c);
|
||||
}
|
||||
|
||||
|
||||
// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f16_16x16x16_f16_w64:
|
||||
// CHECK-GFX1200: v_wmma_f16_16x16x16_f16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
|
||||
//
|
||||
void test_amdgcn_wmma_f16_16x16x16_f16_w64(global v4h* out, v4h a, v4h b, v4h c)
|
||||
{
|
||||
*out = __builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12(a, b, c);
|
||||
}
|
||||
|
||||
|
||||
// CHECK-GFX1200-LABEL: test_amdgcn_wmma_bf16_16x16x16_bf16_w64:
|
||||
// CHECK-GFX1200: v_wmma_bf16_16x16x16_bf16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
|
||||
//
|
||||
void test_amdgcn_wmma_bf16_16x16x16_bf16_w64(global v4s* out, v4s a, v4s b, v4s c)
|
||||
{
|
||||
*out = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64_gfx12(a, b, c);
|
||||
}
|
||||
|
||||
|
||||
// CHECK-GFX1200-LABEL: test_amdgcn_wmma_i32_16x16x16_iu8_w64:
|
||||
// CHECK-GFX1200: v_wmma_i32_16x16x16_iu8 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] neg_lo:[1,1,0]
|
||||
//
|
||||
void test_amdgcn_wmma_i32_16x16x16_iu8_w64(global v4i* out, int a, int b, v4i c)
|
||||
{
|
||||
*out = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w64_gfx12(true, a, true, b, c, false);
|
||||
}
|
||||
|
||||
|
||||
// CHECK-GFX1200-LABEL: test_amdgcn_wmma_i32_16x16x16_iu4_w64:
|
||||
// CHECK-GFX1200: v_wmma_i32_16x16x16_iu4 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] neg_lo:[1,1,0]
|
||||
//
|
||||
void test_amdgcn_wmma_i32_16x16x16_iu4_w64(global v4i* out, int a, int b, v4i c)
|
||||
{
|
||||
*out = __builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12(true, a, true, b, c, false);
|
||||
}
|
||||
|
||||
|
||||
// Wave64 form of the fp8 x fp8 WMMA builtin: A and B are each passed as a
// single int, the accumulator and result are <4 x float>. The kernel is named
// with the _w64 suffix to match the builtin it calls and the other kernels in
// this wave64 test file.
// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64:
// CHECK-GFX1200: v_wmma_f32_16x16x16_fp8_fp8 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
//
void test_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64(global v4f* out, int a, int b, v4f c)
{
  *out = __builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12(a, b, c);
}
|
||||
|
||||
|
||||
// Wave64 form of the fp8 x bf8 WMMA builtin: A and B are each passed as a
// single int, the accumulator and result are <4 x float>. The kernel is named
// with the _w64 suffix to match the builtin it calls and the other kernels in
// this wave64 test file.
// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64:
// CHECK-GFX1200: v_wmma_f32_16x16x16_fp8_bf8 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
//
void test_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64(global v4f* out, int a, int b, v4f c)
{
  *out = __builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12(a, b, c);
}
|
||||
|
||||
// Wave64 form of the bf8 x fp8 WMMA builtin: A and B are each passed as a
// single int, the accumulator and result are <4 x float>. The kernel is named
// with the _w64 suffix to match the builtin it calls and the other kernels in
// this wave64 test file.
// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64:
// CHECK-GFX1200: v_wmma_f32_16x16x16_bf8_fp8 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
//
void test_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64(global v4f* out, int a, int b, v4f c)
{
  *out = __builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12(a, b, c);
}
|
||||
|
||||
// Wave64 form of the bf8 x bf8 WMMA builtin: A and B are each passed as a
// single int, the accumulator and result are <4 x float>. The kernel is named
// with the _w64 suffix to match the builtin it calls and the other kernels in
// this wave64 test file.
// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64:
// CHECK-GFX1200: v_wmma_f32_16x16x16_bf8_bf8 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
//
void test_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64(global v4f* out, int a, int b, v4f c)
{
  *out = __builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64_gfx12(a, b, c);
}
|
||||
|
||||
// Wave64 form of the 16x16x32 iu4 WMMA builtin: A and B are each passed as a
// single int, the accumulator and result are <4 x int>. The first and third
// arguments (both true) are the immediate flags paired with A and B, and the
// trailing false is the final immediate flag; the expected encoding below
// therefore carries neg_lo:[1,1,0] and no clamp modifier. The kernel is named
// with the _w64 suffix to match the builtin it calls and the other kernels in
// this wave64 test file.
// CHECK-GFX1200-LABEL: test_amdgcn_wmma_i32_16x16x32_iu4_w64:
// CHECK-GFX1200: v_wmma_i32_16x16x32_iu4 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] neg_lo:[1,1,0]
//
void test_amdgcn_wmma_i32_16x16x32_iu4_w64(global v4i* out, int a, int b, v4i c)
{
  *out = __builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12(true, a, true, b, c, false);
}
|
110
cross-project-tests/amdgpu/builtins-amdgcn-swmmac-w32.cl
Normal file
110
cross-project-tests/amdgpu/builtins-amdgcn-swmmac-w32.cl
Normal file
@ -0,0 +1,110 @@
|
||||
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize32 -S -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200
|
||||
|
||||
typedef int v2i __attribute__((ext_vector_type(2)));
|
||||
typedef int v4i __attribute__((ext_vector_type(4)));
|
||||
typedef float v8f __attribute__((ext_vector_type(8)));
|
||||
typedef half v8h __attribute__((ext_vector_type(8)));
|
||||
typedef short v8s __attribute__((ext_vector_type(8)));
|
||||
typedef int v8i __attribute__((ext_vector_type(8)));
|
||||
typedef half v16h __attribute__((ext_vector_type(16)));
|
||||
typedef short v16s __attribute__((ext_vector_type(16)));
|
||||
|
||||
// Wave32
|
||||
|
||||
|
||||
// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f32_16x16x32_f16_w32:
|
||||
// CHECK-GFX1200: v_swmmac_f32_16x16x32_f16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}
|
||||
//
|
||||
void test_amdgcn_swmmac_f32_16x16x32_f16_w32(global v8f* out, v8h a, v16h b, v8f c, short index)
|
||||
{
|
||||
*out = __builtin_amdgcn_swmmac_f32_16x16x32_f16_w32(a, b, c, index);
|
||||
}
|
||||
|
||||
|
||||
// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f32_16x16x32_bf16_w32:
|
||||
// CHECK-GFX1200: v_swmmac_f32_16x16x32_bf16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}
|
||||
//
|
||||
void test_amdgcn_swmmac_f32_16x16x32_bf16_w32(global v8f* out, v8s a, v16s b, v8f c, short index)
|
||||
{
|
||||
*out = __builtin_amdgcn_swmmac_f32_16x16x32_bf16_w32(a, b, c, index);
|
||||
}
|
||||
|
||||
|
||||
// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f16_16x16x32_f16_w32:
|
||||
// CHECK-GFX1200: v_swmmac_f16_16x16x32_f16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}
|
||||
//
|
||||
void test_amdgcn_swmmac_f16_16x16x32_f16_w32(global v8h* out, v8h a, v16h b, v8h c, short index)
|
||||
{
|
||||
*out = __builtin_amdgcn_swmmac_f16_16x16x32_f16_w32(a, b, c, index);
|
||||
}
|
||||
|
||||
|
||||
// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_bf16_16x16x32_bf16_w32:
|
||||
// CHECK-GFX1200: v_swmmac_bf16_16x16x32_bf16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}
|
||||
//
|
||||
void test_amdgcn_swmmac_bf16_16x16x32_bf16_w32(global v8s* out, v8s a, v16s b, v8s c, short index)
|
||||
{
|
||||
*out = __builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w32(a, b, c, index);
|
||||
}
|
||||
|
||||
|
||||
// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_i32_16x16x32_iu8_w32:
|
||||
// CHECK-GFX1200: v_swmmac_i32_16x16x32_iu8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} neg_lo:[1,1,0] clamp
|
||||
//
|
||||
void test_amdgcn_swmmac_i32_16x16x32_iu8_w32(global v8i* out, v2i a, v4i b, v8i c, short index)
|
||||
{
|
||||
*out = __builtin_amdgcn_swmmac_i32_16x16x32_iu8_w32(true, a, true, b, c, index, true);
|
||||
}
|
||||
|
||||
|
||||
// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_i32_16x16x32_iu4_w32:
|
||||
// CHECK-GFX1200: v_swmmac_i32_16x16x32_iu4 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} neg_lo:[1,1,0] clamp
|
||||
//
|
||||
void test_amdgcn_swmmac_i32_16x16x32_iu4_w32(global v8i* out, int a, v2i b, v8i c, short index)
|
||||
{
|
||||
*out = __builtin_amdgcn_swmmac_i32_16x16x32_iu4_w32(true, a, true, b, c, index, true);
|
||||
}
|
||||
|
||||
|
||||
// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_i32_16x16x64_iu4_w32:
|
||||
// CHECK-GFX1200: v_swmmac_i32_16x16x64_iu4 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} neg_lo:[1,1,0] clamp
|
||||
//
|
||||
void test_amdgcn_swmmac_i32_16x16x64_iu4_w32(global v8i* out, v2i a, v4i b, v8i c, short index)
|
||||
{
|
||||
*out = __builtin_amdgcn_swmmac_i32_16x16x64_iu4_w32(true, a, true, b, c, index, true);
|
||||
}
|
||||
|
||||
|
||||
// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32:
|
||||
// CHECK-GFX1200: v_swmmac_f32_16x16x32_fp8_fp8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}
|
||||
//
|
||||
void test_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32(global v8f* out, v2i a, v4i b, v8f c, short index)
|
||||
{
|
||||
*out = __builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32(a, b, c, index);
|
||||
}
|
||||
|
||||
|
||||
// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32:
|
||||
// CHECK-GFX1200: v_swmmac_f32_16x16x32_fp8_bf8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}
|
||||
//
|
||||
void test_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32(global v8f* out, v2i a, v4i b, v8f c, short index)
|
||||
{
|
||||
*out = __builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32(a, b, c, index);
|
||||
}
|
||||
|
||||
|
||||
// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32:
|
||||
// CHECK-GFX1200: v_swmmac_f32_16x16x32_bf8_fp8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}
|
||||
//
|
||||
void test_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32(global v8f* out, v2i a, v4i b, v8f c, short index)
|
||||
{
|
||||
*out = __builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32(a, b, c, index);
|
||||
}
|
||||
|
||||
// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32:
|
||||
// CHECK-GFX1200: v_swmmac_f32_16x16x32_bf8_bf8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}
|
||||
//
|
||||
void test_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32(global v8f* out, v2i a, v4i b, v8f c, short index)
|
||||
{
|
||||
*out = __builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32(a, b, c, index);
|
||||
}
|
109
cross-project-tests/amdgpu/builtins-amdgcn-swmmac-w64.cl
Normal file
109
cross-project-tests/amdgpu/builtins-amdgcn-swmmac-w64.cl
Normal file
@ -0,0 +1,109 @@
|
||||
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize64 -S -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200
|
||||
|
||||
typedef int v2i __attribute__((ext_vector_type(2)));
|
||||
typedef int v4i __attribute__((ext_vector_type(4)));
|
||||
typedef float v4f __attribute__((ext_vector_type(4)));
|
||||
typedef half v4h __attribute__((ext_vector_type(4)));
|
||||
typedef short v4s __attribute__((ext_vector_type(4)));
|
||||
typedef half v8h __attribute__((ext_vector_type(8)));
|
||||
typedef short v8s __attribute__((ext_vector_type(8)));
|
||||
|
||||
// Wave64
|
||||
|
||||
|
||||
// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f32_16x16x32_f16_w64:
|
||||
// CHECK-GFX1200: v_swmmac_f32_16x16x32_f16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}
|
||||
//
|
||||
void test_amdgcn_swmmac_f32_16x16x32_f16_w64(global v4f* out, v4h a, v8h b, v4f c, short index)
|
||||
{
|
||||
*out = __builtin_amdgcn_swmmac_f32_16x16x32_f16_w64(a, b, c, index);
|
||||
}
|
||||
|
||||
|
||||
// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f32_16x16x32_bf16_w64:
|
||||
// CHECK-GFX1200: v_swmmac_f32_16x16x32_bf16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}
|
||||
//
|
||||
void test_amdgcn_swmmac_f32_16x16x32_bf16_w64(global v4f* out, v4s a, v8s b, v4f c, short index)
|
||||
{
|
||||
*out = __builtin_amdgcn_swmmac_f32_16x16x32_bf16_w64(a, b, c, index);
|
||||
}
|
||||
|
||||
|
||||
// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f16_16x16x32_f16_w64:
|
||||
// CHECK-GFX1200: v_swmmac_f16_16x16x32_f16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}
|
||||
//
|
||||
void test_amdgcn_swmmac_f16_16x16x32_f16_w64(global v4h* out, v4h a, v8h b, v4h c, short index)
|
||||
{
|
||||
*out = __builtin_amdgcn_swmmac_f16_16x16x32_f16_w64(a, b, c, index);
|
||||
}
|
||||
|
||||
|
||||
// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_bf16_16x16x32_bf16_w64:
|
||||
// CHECK-GFX1200: v_swmmac_bf16_16x16x32_bf16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}
|
||||
//
|
||||
void test_amdgcn_swmmac_bf16_16x16x32_bf16_w64(global v4s* out, v4s a, v8s b, v4s c, short index)
|
||||
{
|
||||
*out = __builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w64(a, b, c, index);
|
||||
}
|
||||
|
||||
|
||||
// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_i32_16x16x32_iu8_w64:
|
||||
// CHECK-GFX1200: v_swmmac_i32_16x16x32_iu8 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} neg_lo:[1,1,0] clamp
|
||||
//
|
||||
void test_amdgcn_swmmac_i32_16x16x32_iu8_w64(global v4i* out, int a, v2i b, v4i c, short index)
|
||||
{
|
||||
*out = __builtin_amdgcn_swmmac_i32_16x16x32_iu8_w64(true, a, true, b, c, index, true);
|
||||
}
|
||||
|
||||
|
||||
// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_i32_16x16x32_iu4_w64:
|
||||
// CHECK-GFX1200: v_swmmac_i32_16x16x32_iu4 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} neg_lo:[1,1,0] clamp
|
||||
//
|
||||
void test_amdgcn_swmmac_i32_16x16x32_iu4_w64(global v4i* out, int a, int b, v4i c, short index)
|
||||
{
|
||||
*out = __builtin_amdgcn_swmmac_i32_16x16x32_iu4_w64(true, a, true, b, c, index, true);
|
||||
}
|
||||
|
||||
|
||||
// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_i32_16x16x64_iu4_w64:
|
||||
// CHECK-GFX1200: v_swmmac_i32_16x16x64_iu4 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} neg_lo:[1,1,0] clamp
|
||||
//
|
||||
void test_amdgcn_swmmac_i32_16x16x64_iu4_w64(global v4i* out, int a, v2i b, v4i c, short index)
|
||||
{
|
||||
*out = __builtin_amdgcn_swmmac_i32_16x16x64_iu4_w64(true, a, true, b, c, index, true);
|
||||
}
|
||||
|
||||
|
||||
// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64:
|
||||
// CHECK-GFX1200: v_swmmac_f32_16x16x32_fp8_fp8 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}
|
||||
//
|
||||
void test_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64(global v4f* out, int a, v2i b, v4f c, short index)
|
||||
{
|
||||
*out = __builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64(a, b, c, index);
|
||||
}
|
||||
|
||||
|
||||
// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64:
|
||||
// CHECK-GFX1200: v_swmmac_f32_16x16x32_fp8_bf8 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}
|
||||
//
|
||||
void test_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64(global v4f* out, int a, v2i b, v4f c, short index)
|
||||
{
|
||||
*out = __builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64(a, b, c, index);
|
||||
}
|
||||
|
||||
|
||||
// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64:
|
||||
// CHECK-GFX1200: v_swmmac_f32_16x16x32_bf8_fp8 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}
|
||||
//
|
||||
void test_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64(global v4f* out, int a, v2i b, v4f c, short index)
|
||||
{
|
||||
*out = __builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64(a, b, c, index);
|
||||
}
|
||||
|
||||
// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64:
|
||||
// CHECK-GFX1200: v_swmmac_f32_16x16x32_bf8_bf8 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}
|
||||
//
|
||||
void test_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64(global v4f* out, int a, v2i b, v4f c, short index)
|
||||
{
|
||||
*out = __builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64(a, b, c, index);
|
||||
}
|
@ -2601,6 +2601,11 @@ def int_amdgcn_ds_bvh_stack_rtn :
|
||||
[ImmArg<ArgIndex<3>>, IntrWillReturn, IntrNoCallback, IntrNoFree]
|
||||
>;
|
||||
|
||||
def int_amdgcn_s_wait_event_export_ready :
|
||||
ClangBuiltin<"__builtin_amdgcn_s_wait_event_export_ready">,
|
||||
Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn]
|
||||
>;
|
||||
|
||||
// WMMA (Wave Matrix Multiply-Accumulate) intrinsics
|
||||
//
|
||||
// These operations perform a matrix multiplication and accumulation of
|
||||
@ -2608,10 +2613,10 @@ def int_amdgcn_ds_bvh_stack_rtn :
|
||||
|
||||
class AMDGPUWmmaIntrinsic<LLVMType AB, LLVMType CD> :
|
||||
Intrinsic<
|
||||
[CD], // %D
|
||||
[CD], // %D
|
||||
[
|
||||
AB, // %A
|
||||
AB, // %B
|
||||
LLVMMatchType<1>, // %B
|
||||
LLVMMatchType<0>, // %C
|
||||
],
|
||||
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]
|
||||
@ -2619,49 +2624,50 @@ class AMDGPUWmmaIntrinsic<LLVMType AB, LLVMType CD> :
|
||||
|
||||
class AMDGPUWmmaIntrinsicOPSEL<LLVMType AB, LLVMType CD> :
|
||||
Intrinsic<
|
||||
[CD], // %D
|
||||
[CD], // %D
|
||||
[
|
||||
AB, // %A
|
||||
AB, // %B
|
||||
LLVMMatchType<1>, // %B
|
||||
LLVMMatchType<0>, // %C
|
||||
llvm_i1_ty, // %high
|
||||
llvm_i1_ty, // %high (op_sel) for GFX11, 0 for GFX12
|
||||
],
|
||||
[IntrNoMem, IntrConvergent, ImmArg<ArgIndex<3>>, IntrWillReturn, IntrNoCallback, IntrNoFree]
|
||||
>;
|
||||
|
||||
class AMDGPUWmmaIntrinsicIU<LLVMType AB, LLVMType CD> :
|
||||
Intrinsic<
|
||||
[CD], // %D
|
||||
[CD], // %D
|
||||
[
|
||||
llvm_i1_ty, // %A_sign
|
||||
AB, // %A
|
||||
llvm_i1_ty, // %B_sign
|
||||
AB, // %B
|
||||
LLVMMatchType<1>, // %B
|
||||
LLVMMatchType<0>, // %C
|
||||
llvm_i1_ty, // %clamp
|
||||
],
|
||||
[IntrNoMem, IntrConvergent, ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>, IntrWillReturn, IntrNoCallback, IntrNoFree]
|
||||
>;
|
||||
|
||||
def int_amdgcn_wmma_f32_16x16x16_f16 : AMDGPUWmmaIntrinsic<llvm_v16f16_ty, llvm_anyfloat_ty>;
|
||||
def int_amdgcn_wmma_f32_16x16x16_bf16 : AMDGPUWmmaIntrinsic<llvm_v16i16_ty, llvm_anyfloat_ty>;
|
||||
// The regular, untied f16/bf16 wmma intrinsics only write to one half
|
||||
// of the registers (set via the op_sel bit).
|
||||
// The content of the other 16-bit of the registers is undefined.
|
||||
def int_amdgcn_wmma_f16_16x16x16_f16 : AMDGPUWmmaIntrinsicOPSEL<llvm_v16f16_ty, llvm_anyfloat_ty>;
|
||||
def int_amdgcn_wmma_bf16_16x16x16_bf16 : AMDGPUWmmaIntrinsicOPSEL<llvm_v16i16_ty, llvm_anyint_ty>;
|
||||
// The tied versions of the f16/bf16 wmma intrinsics tie the destination matrix
|
||||
// registers to the input accumulator registers.
|
||||
// Essentially, the content of the other 16-bit is preserved from the input.
|
||||
def int_amdgcn_wmma_f16_16x16x16_f16_tied : AMDGPUWmmaIntrinsicOPSEL<llvm_v16f16_ty, llvm_anyfloat_ty>;
|
||||
def int_amdgcn_wmma_bf16_16x16x16_bf16_tied : AMDGPUWmmaIntrinsicOPSEL<llvm_v16i16_ty, llvm_anyint_ty>;
|
||||
def int_amdgcn_wmma_i32_16x16x16_iu8 : AMDGPUWmmaIntrinsicIU<llvm_v4i32_ty, llvm_anyint_ty>;
|
||||
def int_amdgcn_wmma_i32_16x16x16_iu4 : AMDGPUWmmaIntrinsicIU<llvm_v2i32_ty, llvm_anyint_ty>;
|
||||
// WMMA GFX11Only
|
||||
|
||||
def int_amdgcn_s_wait_event_export_ready :
|
||||
ClangBuiltin<"__builtin_amdgcn_s_wait_event_export_ready">,
|
||||
Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn]
|
||||
>;
|
||||
// The OPSEL intrinsics read from and write to one half of the registers, selected by the op_sel bit.
|
||||
// The tied versions of the f16/bf16 wmma intrinsics tie the destination matrix registers to the input accumulator registers.
|
||||
// The content of the other 16-bit half is preserved from the input.
|
||||
def int_amdgcn_wmma_f16_16x16x16_f16_tied : AMDGPUWmmaIntrinsicOPSEL<llvm_anyfloat_ty, llvm_anyfloat_ty>;
|
||||
def int_amdgcn_wmma_bf16_16x16x16_bf16_tied : AMDGPUWmmaIntrinsicOPSEL<llvm_anyint_ty, llvm_anyint_ty>;
|
||||
|
||||
// WMMA GFX11Plus
|
||||
|
||||
def int_amdgcn_wmma_f32_16x16x16_f16 : AMDGPUWmmaIntrinsic<llvm_anyfloat_ty, llvm_anyfloat_ty>;
|
||||
def int_amdgcn_wmma_f32_16x16x16_bf16 : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>;
|
||||
def int_amdgcn_wmma_i32_16x16x16_iu8 : AMDGPUWmmaIntrinsicIU<llvm_anyint_ty, llvm_anyint_ty>;
|
||||
def int_amdgcn_wmma_i32_16x16x16_iu4 : AMDGPUWmmaIntrinsicIU<llvm_anyint_ty, llvm_anyint_ty>;
|
||||
|
||||
// GFX11: The OPSEL intrinsics read from and write to one half of the registers, selected by the op_sel bit.
|
||||
// The content of the other 16-bit half is undefined.
|
||||
// GFX12: The op_sel bit must be 0.
|
||||
def int_amdgcn_wmma_f16_16x16x16_f16 : AMDGPUWmmaIntrinsicOPSEL<llvm_anyfloat_ty, llvm_anyfloat_ty>;
|
||||
def int_amdgcn_wmma_bf16_16x16x16_bf16 : AMDGPUWmmaIntrinsicOPSEL<llvm_anyint_ty, llvm_anyint_ty>;
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// GFX12 Intrinsics
|
||||
@ -2681,6 +2687,65 @@ def int_amdgcn_permlanex16_var : ClangBuiltin<"__builtin_amdgcn_permlanex16_var"
|
||||
[IntrNoMem, IntrConvergent, IntrWillReturn,
|
||||
ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree]>;
|
||||
|
||||
|
||||
// WMMA (Wave Matrix Multiply-Accumulate) intrinsics
|
||||
//
|
||||
// These operations perform a matrix multiplication and accumulation of
|
||||
// the form: D = A * B + C .
|
||||
|
||||
// A and B hold packed fp8 or bf8 elements. Since fp8 and bf8 are not supported by LLVM, they are passed as integers: <2 x i32> (8 elements) in wave32 and a single i32 in wave64.
|
||||
def int_amdgcn_wmma_f32_16x16x16_fp8_fp8 : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>;
|
||||
def int_amdgcn_wmma_f32_16x16x16_fp8_bf8 : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>;
|
||||
def int_amdgcn_wmma_f32_16x16x16_bf8_fp8 : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>;
|
||||
def int_amdgcn_wmma_f32_16x16x16_bf8_bf8 : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>;
|
||||
// A and B are <16 x iu4>.
|
||||
def int_amdgcn_wmma_i32_16x16x32_iu4 : AMDGPUWmmaIntrinsicIU<llvm_anyint_ty, llvm_anyint_ty>;
|
||||
|
||||
// SWMMAC (Sparse Wave Matrix Multiply-Accumulate) intrinsics
|
||||
//
|
||||
// These operations perform a sparse matrix multiplication and accumulation of
|
||||
// the form: D = A * B + C.
|
||||
// A is a sparse matrix, half the size of B, and is expanded using the sparsity index.
|
||||
|
||||
// Intrinsic signature shared by the SWMMAC variants that carry no sign or
// clamp operands. The single result %D has type CD; the operands are %A, %B,
// the accumulator %C (constrained to the result type through
// LLVMMatchType<0>) and the sparsity index for A. The intrinsic is declared
// IntrNoMem, IntrConvergent and IntrWillReturn.
class AMDGPUSWmmacIntrinsicIdx<LLVMType A, LLVMType B, LLVMType CD, LLVMType Index> :
|
||||
Intrinsic<
|
||||
[CD], // %D
|
||||
[
|
||||
A, // %A
|
||||
B, // %B
|
||||
LLVMMatchType<0>, // %C
|
||||
Index // %Sparsity index for A
|
||||
],
|
||||
[IntrNoMem, IntrConvergent, IntrWillReturn]
|
||||
>;
|
||||
|
||||
// Intrinsic signature for the SWMMAC variants that additionally take i1
// %A_sign, %B_sign and %clamp operands. Those three operands (argument
// indices 0, 2 and 6) are marked ImmArg, so they must be immediates. The
// result %D has type CD and the accumulator %C is constrained to the same
// type through LLVMMatchType<0>.
class AMDGPUSWmmacIntrinsicIUIdx<LLVMType A, LLVMType B, LLVMType CD, LLVMType Index> :
|
||||
Intrinsic<
|
||||
[CD], // %D
|
||||
[
|
||||
llvm_i1_ty, // %A_sign
|
||||
A, // %A
|
||||
llvm_i1_ty, // %B_sign
|
||||
B, // %B
|
||||
LLVMMatchType<0>, // %C
|
||||
Index, // %Sparsity index for A
|
||||
llvm_i1_ty, // %clamp
|
||||
],
|
||||
[IntrNoMem, IntrConvergent, IntrWillReturn, ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<6>>]
|
||||
>;
|
||||
|
||||
def int_amdgcn_swmmac_f32_16x16x32_f16 : AMDGPUSWmmacIntrinsicIdx<llvm_anyfloat_ty, llvm_anyfloat_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
|
||||
def int_amdgcn_swmmac_f32_16x16x32_bf16 : AMDGPUSWmmacIntrinsicIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
|
||||
def int_amdgcn_swmmac_f16_16x16x32_f16 : AMDGPUSWmmacIntrinsicIdx<llvm_anyfloat_ty, llvm_anyfloat_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
|
||||
def int_amdgcn_swmmac_bf16_16x16x32_bf16 : AMDGPUSWmmacIntrinsicIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty>;
|
||||
def int_amdgcn_swmmac_i32_16x16x32_iu8 : AMDGPUSWmmacIntrinsicIUIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty>;
|
||||
def int_amdgcn_swmmac_i32_16x16x32_iu4 : AMDGPUSWmmacIntrinsicIUIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty>;
|
||||
def int_amdgcn_swmmac_i32_16x16x64_iu4 : AMDGPUSWmmacIntrinsicIUIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty>;
|
||||
def int_amdgcn_swmmac_f32_16x16x32_fp8_fp8 : AMDGPUSWmmacIntrinsicIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
|
||||
def int_amdgcn_swmmac_f32_16x16x32_fp8_bf8 : AMDGPUSWmmacIntrinsicIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
|
||||
def int_amdgcn_swmmac_f32_16x16x32_bf8_fp8 : AMDGPUSWmmacIntrinsicIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
|
||||
def int_amdgcn_swmmac_f32_16x16x32_bf8_bf8 : AMDGPUSWmmacIntrinsicIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
|
||||
|
||||
def int_amdgcn_global_atomic_ordered_add_b64 : AMDGPUAtomicRtn<llvm_i64_ty, global_ptr_ty>;
|
||||
|
||||
def int_amdgcn_flat_atomic_fmin_num : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
|
||||
|
@ -59,6 +59,30 @@ def gi_wmmaopselvop3pmods :
|
||||
GIComplexOperandMatcher<s32, "selectWMMAOpSelVOP3PMods">,
|
||||
GIComplexPatternEquiv<WMMAOpSelVOP3PMods>;
|
||||
|
||||
// GlobalISel counterparts of the WMMA / SWMMAC complex patterns. Each record
// names the selector method to call (the string argument of
// GIComplexOperandMatcher) and ties it to an existing ComplexPattern through
// GIComplexPatternEquiv.
def gi_wmmavisrc :
|
||||
GIComplexOperandMatcher<s32, "selectWMMAVISrc">,
|
||||
GIComplexPatternEquiv<WMMAVISrc>;
|
||||
|
||||
def gi_wmmamods :
|
||||
GIComplexOperandMatcher<s32, "selectWMMAModsF32NegAbs">,
|
||||
GIComplexPatternEquiv<WMMAModsF32NegAbs>;
|
||||
|
||||
def gi_wmmamodsf16Neg :
|
||||
GIComplexOperandMatcher<s32, "selectWMMAModsF16Neg">,
|
||||
GIComplexPatternEquiv<WMMAModsF16Neg>;
|
||||
|
||||
def gi_wmmamodsf16NegAbs :
|
||||
GIComplexOperandMatcher<s32, "selectWMMAModsF16NegAbs">,
|
||||
GIComplexPatternEquiv<WMMAModsF16NegAbs>;
|
||||
|
||||
def gi_swmmacindex8 :
|
||||
GIComplexOperandMatcher<s32, "selectSWMMACIndex8">,
|
||||
GIComplexPatternEquiv<SWMMACIndex8>;
|
||||
|
||||
def gi_swmmacindex16 :
|
||||
GIComplexOperandMatcher<s32, "selectSWMMACIndex16">,
|
||||
GIComplexPatternEquiv<SWMMACIndex16>;
|
||||
|
||||
def gi_vop3opselmods :
|
||||
GIComplexOperandMatcher<s32, "selectVOP3OpSelMods">,
|
||||
GIComplexPatternEquiv<VOP3OpSelMods>;
|
||||
|
@ -3048,6 +3048,336 @@ bool AMDGPUDAGToDAGISel::SelectWMMAOpSelVOP3PMods(SDValue In,
|
||||
return true;
|
||||
}
|
||||
|
||||
/// Build a REG_SEQUENCE machine node that places each entry of \p Elts in
/// consecutive sub-register channels (channel i receives Elts[i]).
///
/// Only 2, 4 or 8 elements are supported; they select the VReg_64, VReg_128
/// and VReg_256 register classes with result types v2i32, v4i32 and v8i32.
/// Any other element count is unreachable.
static MachineSDNode *buildRegSequence32(SmallVectorImpl<SDValue> &Elts,
                                         llvm::SelectionDAG *CurDAG,
                                         const SDLoc &DL) {
  const unsigned NumElts = Elts.size();

  unsigned RegClassID;
  EVT ResultVT;
  if (NumElts == 2) {
    RegClassID = AMDGPU::VReg_64RegClassID;
    ResultVT = MVT::v2i32;
  } else if (NumElts == 4) {
    RegClassID = AMDGPU::VReg_128RegClassID;
    ResultVT = MVT::v4i32;
  } else if (NumElts == 8) {
    RegClassID = AMDGPU::VReg_256RegClassID;
    ResultVT = MVT::v8i32;
  } else {
    llvm_unreachable("unhandled Reg sequence size");
  }

  // Operand layout: register class id, then (value, sub-register index)
  // pairs, one pair per element.
  SmallVector<SDValue, 17> SeqOps;
  SeqOps.push_back(CurDAG->getTargetConstant(RegClassID, DL, MVT::i32));
  for (unsigned Channel = 0; Channel != NumElts; ++Channel) {
    SeqOps.push_back(Elts[Channel]);
    SDValue SubRegIdx = CurDAG->getTargetConstant(
        SIRegisterInfo::getSubRegFromChannel(Channel), DL, MVT::i32);
    SeqOps.push_back(SubRegIdx);
  }

  return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, ResultVT,
                                SeqOps);
}
|
||||
|
||||
/// Pack 8 or 16 16-bit elements pairwise into 32-bit values and build a
/// register sequence from the packed values via buildRegSequence32.
///
/// When both halves of a pair were extracted from the same 32-bit source,
/// that source is reused directly. Otherwise the pair is combined with
/// V_PERM_B32 using the selector 0x05040100.
static MachineSDNode *buildRegSequence16(SmallVectorImpl<SDValue> &Elts,
                                         llvm::SelectionDAG *CurDAG,
                                         const SDLoc &DL) {
  const unsigned NumElts = Elts.size();
  assert((NumElts == 8 || NumElts == 16) && "unhandled Reg sequence size");

  SmallVector<SDValue, 8> Packed32;
  for (unsigned Lo = 0; Lo < NumElts; Lo += 2) {
    const unsigned Hi = Lo + 1;

    SDValue LoOrigin = stripExtractLoElt(stripBitcast(Elts[Lo]));
    SDValue HiOrigin;
    if (isExtractHiElt(Elts[Hi], HiOrigin) && LoOrigin == HiOrigin) {
      // Low and high halves come from one 32-bit value: use it as is.
      Packed32.push_back(HiOrigin);
      continue;
    }

    // Otherwise pack the two elements with v_perm.
    SDValue PermSelector = CurDAG->getTargetConstant(0x05040100, DL, MVT::i32);
    MachineSDNode *Perm =
        CurDAG->getMachineNode(AMDGPU::V_PERM_B32_e64, DL, MVT::i32,
                               {Elts[Hi], Elts[Lo], PermSelector});
    Packed32.push_back(SDValue(Perm, 0));
  }

  return buildRegSequence32(Packed32, CurDAG, DL);
}
|
||||
|
||||
/// Dispatch to the 16-bit or 32-bit register sequence builder according to
/// \p ElementSize (in bits). Any other element size is unreachable.
static MachineSDNode *buildRegSequence(SmallVectorImpl<SDValue> &Elts,
                                       llvm::SelectionDAG *CurDAG,
                                       const SDLoc &DL, unsigned ElementSize) {
  switch (ElementSize) {
  case 16:
    return buildRegSequence16(Elts, CurDAG, DL);
  case 32:
    return buildRegSequence32(Elts, CurDAG, DL);
  default:
    break;
  }

  llvm_unreachable("Unhandled element size");
}
|
||||
|
||||
static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods,
|
||||
SmallVectorImpl<SDValue> &Elts, SDValue &Src,
|
||||
llvm::SelectionDAG *CurDAG, const SDLoc &DL,
|
||||
unsigned ElementSize) {
|
||||
if (ModOpcode == ISD::FNEG) {
|
||||
Mods |= SISrcMods::NEG;
|
||||
// Check if all elements also have abs modifier
|
||||
SmallVector<SDValue, 8> NegAbsElts;
|
||||
for (auto El : Elts) {
|
||||
if (El.getOpcode() != ISD::FABS)
|
||||
break;
|
||||
NegAbsElts.push_back(El->getOperand(0));
|
||||
}
|
||||
if (Elts.size() != NegAbsElts.size()) {
|
||||
// Neg
|
||||
Src = SDValue(buildRegSequence(Elts, CurDAG, DL, ElementSize), 0);
|
||||
} else {
|
||||
// Neg and Abs
|
||||
Mods |= SISrcMods::NEG_HI;
|
||||
Src = SDValue(buildRegSequence(NegAbsElts, CurDAG, DL, ElementSize), 0);
|
||||
}
|
||||
} else {
|
||||
assert(ModOpcode == ISD::FABS);
|
||||
// Abs
|
||||
Mods |= SISrcMods::NEG_HI;
|
||||
Src = SDValue(buildRegSequence(Elts, CurDAG, DL, ElementSize), 0);
|
||||
}
|
||||
}
|
||||
|
||||
// Check all f16 elements for modifiers while looking through b32 and v2b16
|
||||
// build vector, stop if element does not satisfy ModifierCheck.
|
||||
static void
|
||||
checkWMMAElementsModifiersF16(BuildVectorSDNode *BV,
|
||||
std::function<bool(SDValue)> ModifierCheck) {
|
||||
for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
|
||||
if (auto *F16Pair =
|
||||
dyn_cast<BuildVectorSDNode>(stripBitcast(BV->getOperand(i)))) {
|
||||
for (unsigned i = 0; i < F16Pair->getNumOperands(); ++i) {
|
||||
SDValue ElF16 = stripBitcast(F16Pair->getOperand(i));
|
||||
if (!ModifierCheck(ElF16))
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Match an f16 WMMA operand whose elements are all negated and fold the
/// negation into the source modifiers.
///
/// Two shapes are recognised on a build vector (looking through bitcasts):
/// fneg applied to every individual f16 element, or fneg applied to every
/// v2f16 operand. On a match \p Src is rebuilt from the un-negated values and
/// both NEG and NEG_HI are set. Otherwise \p Src is \p In and only OP_SEL_1
/// is set. Always returns true.
bool AMDGPUDAGToDAGISel::SelectWMMAModsF16Neg(SDValue In, SDValue &Src,
                                              SDValue &SrcMods) const {
  const SDLoc DL(In);
  unsigned Mods = SISrcMods::OP_SEL_1;
  Src = In;

  if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
    const unsigned NumOps = BV->getNumOperands();

    // Shape 1: the modifier sits on each f16 element.
    SmallVector<SDValue, 8> NegatedF16;
    checkWMMAElementsModifiersF16(BV, [&NegatedF16](SDValue Element) -> bool {
      if (Element.getOpcode() == ISD::FNEG) {
        NegatedF16.push_back(Element.getOperand(0));
        return true;
      }
      return false;
    });

    // Every f16 element (two per operand) carried an fneg.
    if (NegatedF16.size() == NumOps * 2) {
      Src = SDValue(buildRegSequence16(NegatedF16, CurDAG, DL), 0);
      Mods |= SISrcMods::NEG | SISrcMods::NEG_HI;
    }

    // Shape 2: the modifier sits on each v2f16 operand.
    SmallVector<SDValue, 8> NegatedV2F16;
    for (unsigned Idx = 0; Idx != NumOps; ++Idx) {
      SDValue Operand = stripBitcast(BV->getOperand(Idx));
      if (Operand.getOpcode() != ISD::FNEG)
        break;
      NegatedV2F16.push_back(Operand.getOperand(0));
    }

    // Every pair of elements carried an fneg.
    if (NegatedV2F16.size() == NumOps) {
      Src = SDValue(buildRegSequence32(NegatedV2F16, CurDAG, DL), 0);
      Mods |= SISrcMods::NEG | SISrcMods::NEG_HI;
    }
  }

  SrcMods = CurDAG->getTargetConstant(Mods, DL, MVT::i32);
  return true;
}
|
||||
|
||||
/// Match an f16 WMMA operand whose elements all carry the same fneg or fabs
/// modifier and fold it into the source modifiers.
///
/// The modifier to look for is chosen from the first element inspected: fneg
/// if that element is an fneg, fabs otherwise. Both the per-f16-element and
/// the per-v2f16-operand shapes are tried. When all elements agree,
/// selectWMMAModsNegAbs updates \p Src and the modifier bits. Always returns
/// true.
bool AMDGPUDAGToDAGISel::SelectWMMAModsF16NegAbs(SDValue In, SDValue &Src,
                                                 SDValue &SrcMods) const {
  const SDLoc DL(In);
  unsigned Mods = SISrcMods::OP_SEL_1;
  unsigned ModOpcode;
  Src = In;

  // fneg if V is an fneg, otherwise assume fabs.
  auto ChooseModifier = [](SDValue V) -> unsigned {
    return V.getOpcode() == ISD::FNEG ? ISD::FNEG : ISD::FABS;
  };

  if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
    const unsigned NumOps = BV->getNumOperands();

    // Shape 1: the modifier sits on each f16 element.
    SmallVector<SDValue, 8> F16Sources;
    checkWMMAElementsModifiersF16(BV, [&](SDValue ElF16) -> bool {
      if (F16Sources.empty())
        ModOpcode = ChooseModifier(ElF16);
      if (ElF16.getOpcode() == ModOpcode) {
        F16Sources.push_back(ElF16.getOperand(0));
        return true;
      }
      return false;
    });

    // Every f16 element (two per operand) carried ModOpcode.
    if (F16Sources.size() == NumOps * 2)
      selectWMMAModsNegAbs(ModOpcode, Mods, F16Sources, Src, CurDAG, DL, 16);

    // Shape 2: the modifier sits on each v2f16 operand.
    SmallVector<SDValue, 8> V2F16Sources;
    for (unsigned Idx = 0; Idx != NumOps; ++Idx) {
      SDValue Operand = stripBitcast(BV->getOperand(Idx));
      if (V2F16Sources.empty())
        ModOpcode = ChooseModifier(Operand);
      if (Operand.getOpcode() != ModOpcode)
        break;
      V2F16Sources.push_back(Operand.getOperand(0));
    }

    // Every v2f16 operand carried ModOpcode.
    if (V2F16Sources.size() == NumOps)
      selectWMMAModsNegAbs(ModOpcode, Mods, V2F16Sources, Src, CurDAG, DL, 32);
  }

  SrcMods = CurDAG->getTargetConstant(Mods, DL, MVT::i32);
  return true;
}
|
||||
|
||||
/// Match an f32 WMMA operand built from elements that all carry the same
/// fneg or fabs modifier and fold it into the source modifiers.
///
/// The modifier to look for is chosen from the first build-vector element:
/// fneg if it is an fneg, fabs otherwise. When every element agrees,
/// selectWMMAModsNegAbs updates \p Src and the modifier bits. Otherwise
/// \p Src is \p In with only OP_SEL_1 set. Always returns true.
bool AMDGPUDAGToDAGISel::SelectWMMAModsF32NegAbs(SDValue In, SDValue &Src,
                                                 SDValue &SrcMods) const {
  const SDLoc DL(In);
  unsigned Mods = SISrcMods::OP_SEL_1;
  unsigned ModOpcode;
  SmallVector<SDValue, 8> F32Sources;
  Src = In;

  if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
    const unsigned NumOps = BV->getNumOperands();

    for (unsigned Idx = 0; Idx != NumOps; ++Idx) {
      SDValue Operand = stripBitcast(BV->getOperand(Idx));
      const unsigned Opc = Operand.getOpcode();
      // The first element decides whether we look for neg or abs.
      if (F32Sources.empty())
        ModOpcode = (Opc == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
      if (Opc != ModOpcode)
        break;
      F32Sources.push_back(Operand.getOperand(0));
    }

    // Every element carried ModOpcode.
    if (F32Sources.size() == NumOps)
      selectWMMAModsNegAbs(ModOpcode, Mods, F32Sources, Src, CurDAG, DL, 32);
  }

  SrcMods = CurDAG->getTargetConstant(Mods, DL, MVT::i32);
  return true;
}
|
||||
|
||||
/// Match a WMMA operand that is a splat of an inline immediate and return the
/// immediate in \p Src as a target constant.
///
/// Handled shapes:
///  * a build vector splatting an integer or floating-point constant that is
///    an inline immediate (emitted as an i32 target constant);
///  * a 16-bit splat expressed as a splat of 32-bit values that are
///    themselves (bitcasts of) splatted 16-bit build vectors. A
///    floating-point constant that is an inline immediate is emitted as an
///    i16 target constant. An integer constant is zero-extended to 32 bits,
///    shifted left by 16, and accepted if the result is an inline constant
///    (emitted as an i32 target constant).
///
/// \returns true and sets \p Src on a match; false otherwise.
bool AMDGPUDAGToDAGISel::SelectWMMAVISrc(SDValue In, SDValue &Src) const {
  if (auto *BV = dyn_cast<BuildVectorSDNode>(In)) {
    BitVector UndefElements;
    if (SDValue Splat = BV->getSplatValue(&UndefElements))
      if (isInlineImmediate(Splat.getNode())) {
        if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Splat)) {
          unsigned Imm = C->getAPIntValue().getSExtValue();
          Src = CurDAG->getTargetConstant(Imm, SDLoc(In), MVT::i32);
          return true;
        }
        if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Splat)) {
          unsigned Imm = C->getValueAPF().bitcastToAPInt().getSExtValue();
          Src = CurDAG->getTargetConstant(Imm, SDLoc(In), MVT::i32);
          return true;
        }
        llvm_unreachable("unhandled Constant node");
      }
  }

  // 16 bit splat
  SDValue SplatSrc32 = stripBitcast(In);
  if (auto *SplatSrc32BV = dyn_cast<BuildVectorSDNode>(SplatSrc32)) {
    if (SDValue Splat32 = SplatSrc32BV->getSplatValue()) {
      SDValue SplatSrc16 = stripBitcast(Splat32);
      if (auto *SplatSrc16BV = dyn_cast<BuildVectorSDNode>(SplatSrc16)) {
        if (SDValue Splat = SplatSrc16BV->getSplatValue()) {

          // f16
          // Check the node kind before using it as a floating-point
          // constant. The inline-immediate test is also applied to integer
          // constants above, so testing it first and then dereferencing an
          // unchecked dyn_cast result would crash on an integer splat.
          // Integer splats now fall through to the bf16 handling below.
          if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Splat)) {
            if (isInlineImmediate(Splat.getNode())) {
              int64_t Imm = C->getValueAPF().bitcastToAPInt().getSExtValue();
              Src = CurDAG->getTargetConstant(Imm, SDLoc(In), MVT::i16);
              return true;
            }
          }

          // bf16
          if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Splat)) {
            const SIInstrInfo *TII = Subtarget->getInstrInfo();
            APInt BF16Value = C->getAPIntValue();
            // Place the 16 bits in the upper half of a 32-bit value before
            // the inline-constant check.
            APInt F32Value = BF16Value.zext(32).shl(16);
            if (TII->isInlineConstant(F32Value)) {
              int64_t Imm = F32Value.getSExtValue();
              Src = CurDAG->getTargetConstant(Imm, SDLoc(In), MVT::i32);
              return true;
            }
          }
        }
      }
    }
  }

  return false;
}
|
||||
|
||||
/// Select the SWMMAC index operand together with its 8-bit index key.
///
/// If \p In is a logical right shift of a 32-bit value by a constant that is
/// a multiple of 8, the shift is folded: \p Src becomes the shifted value and
/// the key is the shift amount divided by 8. Otherwise \p Src is \p In and
/// the key is 0. Always returns true.
bool AMDGPUDAGToDAGISel::SelectSWMMACIndex8(SDValue In, SDValue &Src,
                                            SDValue &IndexKey) const {
  Src = In;
  unsigned Key = 0;

  ConstantSDNode *ShiftAmt =
      In.getOpcode() == ISD::SRL ? dyn_cast<ConstantSDNode>(In.getOperand(1))
                                 : nullptr;
  if (ShiftAmt) {
    SDValue Shifted = In.getOperand(0);
    const uint64_t Amount = ShiftAmt->getZExtValue();
    if (Shifted.getValueType().getSizeInBits() == 32 && Amount % 8 == 0) {
      Key = Amount / 8;
      Src = Shifted;
    }
  }

  IndexKey = CurDAG->getTargetConstant(Key, SDLoc(In), MVT::i32);
  return true;
}
|
||||
|
||||
/// Select the SWMMAC index operand together with its 16-bit index key.
///
/// If \p In is a logical right shift of a 32-bit value by the constant 16,
/// the shift is folded: \p Src becomes the shifted value and the key is 1.
/// Otherwise \p Src is \p In and the key is 0. Always returns true.
bool AMDGPUDAGToDAGISel::SelectSWMMACIndex16(SDValue In, SDValue &Src,
                                             SDValue &IndexKey) const {
  Src = In;
  unsigned Key = 0;

  ConstantSDNode *ShiftAmt =
      In.getOpcode() == ISD::SRL ? dyn_cast<ConstantSDNode>(In.getOperand(1))
                                 : nullptr;
  if (ShiftAmt) {
    SDValue Shifted = In.getOperand(0);
    if (Shifted.getValueType().getSizeInBits() == 32 &&
        ShiftAmt->getZExtValue() == 16) {
      Key = 1;
      Src = Shifted;
    }
  }

  IndexKey = CurDAG->getTargetConstant(Key, SDLoc(In), MVT::i32);
  return true;
}
|
||||
|
||||
bool AMDGPUDAGToDAGISel::SelectVOP3OpSel(SDValue In, SDValue &Src,
|
||||
SDValue &SrcMods) const {
|
||||
Src = In;
|
||||
|
@ -240,6 +240,16 @@ private:
|
||||
bool SelectVOP3PModsNeg(SDValue In, SDValue &Src) const;
|
||||
bool SelectWMMAOpSelVOP3PMods(SDValue In, SDValue &Src) const;
|
||||
|
||||
bool SelectWMMAModsF32NegAbs(SDValue In, SDValue &Src,
|
||||
SDValue &SrcMods) const;
|
||||
bool SelectWMMAModsF16Neg(SDValue In, SDValue &Src, SDValue &SrcMods) const;
|
||||
bool SelectWMMAModsF16NegAbs(SDValue In, SDValue &Src,
|
||||
SDValue &SrcMods) const;
|
||||
bool SelectWMMAVISrc(SDValue In, SDValue &Src) const;
|
||||
|
||||
bool SelectSWMMACIndex8(SDValue In, SDValue &Src, SDValue &IndexKey) const;
|
||||
bool SelectSWMMACIndex16(SDValue In, SDValue &Src, SDValue &IndexKey) const;
|
||||
|
||||
bool SelectVOP3OpSel(SDValue In, SDValue &Src, SDValue &SrcMods) const;
|
||||
|
||||
bool SelectVOP3OpSelMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
|
||||
|
@ -3956,6 +3956,219 @@ AMDGPUInstructionSelector::selectWMMAOpSelVOP3PMods(
|
||||
}};
|
||||
}
|
||||
|
||||
/// Emit a REG_SEQUENCE before \p InsertPt that places each register of
/// \p Elts in consecutive sub-register channels and return the newly created
/// virtual register.
///
/// Only 2, 4 or 8 elements are supported (VReg_64, VReg_128 and VReg_256).
/// Any other element count is unreachable.
static Register buildRegSequence(SmallVectorImpl<Register> &Elts,
                                 MachineInstr *InsertPt,
                                 MachineRegisterInfo &MRI) {
  const unsigned NumElts = Elts.size();

  const TargetRegisterClass *SeqRC;
  if (NumElts == 2)
    SeqRC = &AMDGPU::VReg_64RegClass;
  else if (NumElts == 4)
    SeqRC = &AMDGPU::VReg_128RegClass;
  else if (NumElts == 8)
    SeqRC = &AMDGPU::VReg_256RegClass;
  else
    llvm_unreachable("unhandled Reg sequence size");

  MachineIRBuilder Builder(*InsertPt);
  auto Seq = Builder.buildInstr(AMDGPU::REG_SEQUENCE)
                 .addDef(MRI.createVirtualRegister(SeqRC));
  // One (register, sub-register index) pair per element.
  for (unsigned Channel = 0; Channel != NumElts; ++Channel) {
    Seq.addReg(Elts[Channel]);
    Seq.addImm(SIRegisterInfo::getSubRegFromChannel(Channel));
  }

  return Seq->getOperand(0).getReg();
}
|
||||
|
||||
static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods,
|
||||
SmallVectorImpl<Register> &Elts, Register &Src,
|
||||
MachineInstr *InsertPt,
|
||||
MachineRegisterInfo &MRI) {
|
||||
if (ModOpcode == TargetOpcode::G_FNEG) {
|
||||
Mods |= SISrcMods::NEG;
|
||||
// Check if all elements also have abs modifier
|
||||
SmallVector<Register, 8> NegAbsElts;
|
||||
for (auto El : Elts) {
|
||||
Register FabsSrc;
|
||||
if (!mi_match(El, MRI, m_GFabs(m_Reg(FabsSrc))))
|
||||
break;
|
||||
NegAbsElts.push_back(FabsSrc);
|
||||
}
|
||||
if (Elts.size() != NegAbsElts.size()) {
|
||||
// Neg
|
||||
Src = buildRegSequence(Elts, InsertPt, MRI);
|
||||
} else {
|
||||
// Neg and Abs
|
||||
Mods |= SISrcMods::NEG_HI;
|
||||
Src = buildRegSequence(NegAbsElts, InsertPt, MRI);
|
||||
}
|
||||
} else {
|
||||
assert(ModOpcode == TargetOpcode::G_FABS);
|
||||
// Abs
|
||||
Mods |= SISrcMods::NEG_HI;
|
||||
Src = buildRegSequence(Elts, InsertPt, MRI);
|
||||
}
|
||||
}
|
||||
|
||||
/// Match an f32 WMMA operand defined by a G_BUILD_VECTOR whose sources all
/// carry the same G_FNEG or G_FABS and fold that modifier into the source
/// modifiers.
///
/// The modifier to look for is chosen from the first source: G_FNEG if it is
/// defined by a G_FNEG, G_FABS otherwise. The renderers add the (possibly
/// rebuilt) source register followed by the modifier immediate.
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectWMMAModsF32NegAbs(MachineOperand &Root) const {
  unsigned Mods = SISrcMods::OP_SEL_1;
  unsigned ModOpcode;
  SmallVector<Register, 8> F32Sources;
  Register Src = Root.getReg();

  if (GBuildVector *BV = dyn_cast<GBuildVector>(MRI->getVRegDef(Src))) {
    const unsigned NumSources = BV->getNumSources();

    for (unsigned Idx = 0; Idx != NumSources; ++Idx) {
      MachineInstr *Def = MRI->getVRegDef(BV->getSourceReg(Idx));
      const unsigned Opc = Def->getOpcode();
      // The first element decides whether we look for neg or abs.
      if (F32Sources.empty())
        ModOpcode = (Opc == AMDGPU::G_FNEG) ? AMDGPU::G_FNEG : AMDGPU::G_FABS;
      if (Opc != ModOpcode)
        break;
      F32Sources.push_back(Def->getOperand(1).getReg());
    }

    // Every source carried ModOpcode.
    if (F32Sources.size() == NumSources)
      selectWMMAModsNegAbs(ModOpcode, Mods, F32Sources, Src, Root.getParent(),
                           *MRI);
  }

  return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
           [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
}
|
||||
|
||||
/// Match an f16 WMMA operand defined by a G_CONCAT_VECTORS whose sources are
/// all results of G_FNEG and fold the negation into the source modifiers.
///
/// On a match the operand is rebuilt from the un-negated sources and both NEG
/// and NEG_HI are set; otherwise the original register is used with only
/// OP_SEL_1. The renderers add the source register followed by the modifier
/// immediate.
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectWMMAModsF16Neg(MachineOperand &Root) const {
  unsigned Mods = SISrcMods::OP_SEL_1;
  SmallVector<Register, 8> NegSources;
  Register Src = Root.getReg();

  if (GConcatVectors *CV = dyn_cast<GConcatVectors>(MRI->getVRegDef(Src))) {
    const unsigned NumSources = CV->getNumSources();

    for (unsigned Idx = 0; Idx != NumSources; ++Idx) {
      Register NegSrc;
      if (!mi_match(CV->getSourceReg(Idx), *MRI, m_GFNeg(m_Reg(NegSrc))))
        break;
      NegSources.push_back(NegSrc);
    }

    // Every source was negated.
    if (NegSources.size() == NumSources) {
      Mods |= SISrcMods::NEG | SISrcMods::NEG_HI;
      Src = buildRegSequence(NegSources, Root.getParent(), *MRI);
    }
  }

  return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
           [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
}
|
||||
|
||||
/// Match an f16 WMMA operand defined by a G_CONCAT_VECTORS whose sources all
/// carry the same G_FNEG or G_FABS and fold that modifier into the source
/// modifiers.
///
/// The modifier to look for is chosen from the first source: G_FNEG if it is
/// defined by a G_FNEG, G_FABS otherwise. The renderers add the (possibly
/// rebuilt) source register followed by the modifier immediate.
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectWMMAModsF16NegAbs(MachineOperand &Root) const {
  Register Src = Root.getReg();
  unsigned Mods = SISrcMods::OP_SEL_1;
  unsigned ModOpcode;
  SmallVector<Register, 8> EltsV2F16;

  if (GConcatVectors *CV = dyn_cast<GConcatVectors>(MRI->getVRegDef(Src))) {
    for (unsigned i = 0; i < CV->getNumSources(); ++i) {
      MachineInstr *ElV2F16 = MRI->getVRegDef(CV->getSourceReg(i));
      // Based on first element decide which mod we match, neg or abs
      if (EltsV2F16.empty())
        ModOpcode = (ElV2F16->getOpcode() == AMDGPU::G_FNEG) ? AMDGPU::G_FNEG
                                                             : AMDGPU::G_FABS;
      if (ElV2F16->getOpcode() != ModOpcode)
        break;
      EltsV2F16.push_back(ElV2F16->getOperand(1).getReg());
    }

    // All elements had ModOpcode modifier. The helper constructs its own
    // builder at the insertion point, so no local MachineIRBuilder is needed
    // here (the original declared one and never used it).
    if (CV->getNumSources() == EltsV2F16.size()) {
      selectWMMAModsNegAbs(ModOpcode, Mods, EltsV2F16, Src, Root.getParent(),
                           *MRI);
    }
  }

  return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
           [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
}
|
||||
|
||||
/// Match a WMMA operand that is a constant or splat of an inline immediate
/// and render it as an immediate operand.
///
/// Floating-point constants/splats are tried first: an inline constant is
/// rendered from its bit pattern, and a non-inline one fails the match
/// without trying the integer path. Integer constants/splats are rendered
/// when they are inline constants. Anything else fails the match.
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectWMMAVISrc(MachineOperand &Root) const {
  const Register RootReg = Root.getReg();

  std::optional<FPValueAndVReg> FPValReg;
  if (mi_match(RootReg, *MRI, m_GFCstOrSplat(FPValReg))) {
    // Non-inlineable splat floats should not fall-through for integer
    // immediate checks.
    if (!TII.isInlineConstant(FPValReg->Value.bitcastToAPInt()))
      return {};

    return {{[=](MachineInstrBuilder &MIB) {
      MIB.addImm(FPValReg->Value.bitcastToAPInt().getSExtValue());
    }}};
  }

  APInt ICst;
  if (!mi_match(RootReg, *MRI, m_ICstOrSplat(ICst)))
    return {};
  if (!TII.isInlineConstant(ICst))
    return {};

  return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(ICst.getSExtValue()); }}};
}
|
||||
|
||||
/// Select the SWMMAC index operand together with its 8-bit index key.
///
/// Starting from the definition of the root register (ignoring copies), a
/// logical right shift of a 32-bit value by a constant multiple of 8 is
/// folded: the shifted value becomes the source and the key is the shift
/// amount divided by 8. Otherwise the key is 0. The renderers add the source
/// register followed by the index key immediate.
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectSWMMACIndex8(MachineOperand &Root) const {
  MachineInstr *RootDef = getDefIgnoringCopies(Root.getReg(), *MRI);
  Register Src = RootDef->getOperand(0).getReg();
  unsigned Key = 0;

  Register ShiftSrc;
  std::optional<ValueAndVReg> ShiftAmt;
  if (mi_match(Src, *MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt)))) {
    if (MRI->getType(ShiftSrc).getSizeInBits() == 32) {
      const uint64_t Amount = ShiftAmt->Value.getZExtValue();
      if (Amount % 8 == 0) {
        Key = Amount / 8;
        Src = ShiftSrc;
      }
    }
  }

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
  }};
}
|
||||
|
||||
/// Select the SWMMAC index operand together with its 16-bit index key.
///
/// Starting from the definition of the root register (ignoring copies), a
/// logical right shift of a 32-bit value by the constant 16 is folded: the
/// shifted value becomes the source and the key is 1. Otherwise the key is 0.
/// The renderers add the source register followed by the index key
/// immediate.
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectSWMMACIndex16(MachineOperand &Root) const {
  MachineInstr *RootDef = getDefIgnoringCopies(Root.getReg(), *MRI);
  Register Src = RootDef->getOperand(0).getReg();
  unsigned Key = 0;

  Register ShiftSrc;
  std::optional<ValueAndVReg> ShiftAmt;
  if (mi_match(Src, *MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt)))) {
    if (MRI->getType(ShiftSrc).getSizeInBits() == 32 &&
        ShiftAmt->Value.getZExtValue() == 16) {
      Key = 1;
      Src = ShiftSrc;
    }
  }

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
  }};
}
|
||||
|
||||
InstructionSelector::ComplexRendererFns
|
||||
AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const {
|
||||
Register Src;
|
||||
|
@ -199,6 +199,19 @@ private:
|
||||
InstructionSelector::ComplexRendererFns
|
||||
selectWMMAOpSelVOP3PMods(MachineOperand &Root) const;
|
||||
|
||||
InstructionSelector::ComplexRendererFns
|
||||
selectWMMAModsF32NegAbs(MachineOperand &Root) const;
|
||||
InstructionSelector::ComplexRendererFns
|
||||
selectWMMAModsF16Neg(MachineOperand &Root) const;
|
||||
InstructionSelector::ComplexRendererFns
|
||||
selectWMMAModsF16NegAbs(MachineOperand &Root) const;
|
||||
InstructionSelector::ComplexRendererFns
|
||||
selectWMMAVISrc(MachineOperand &Root) const;
|
||||
InstructionSelector::ComplexRendererFns
|
||||
selectSWMMACIndex8(MachineOperand &Root) const;
|
||||
InstructionSelector::ComplexRendererFns
|
||||
selectSWMMACIndex16(MachineOperand &Root) const;
|
||||
|
||||
InstructionSelector::ComplexRendererFns
|
||||
selectVOP3OpSelMods(MachineOperand &Root) const;
|
||||
|
||||
|
@ -7134,6 +7134,29 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
|
||||
return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
|
||||
case Intrinsic::amdgcn_image_bvh_intersect_ray:
|
||||
return legalizeBVHIntrinsic(MI, B);
|
||||
case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
|
||||
case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
|
||||
case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
|
||||
case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
|
||||
case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
|
||||
case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
|
||||
case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
|
||||
case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
|
||||
Register Index = MI.getOperand(5).getReg();
|
||||
LLT S32 = LLT::scalar(32);
|
||||
if (MRI.getType(Index) != S32)
|
||||
MI.getOperand(5).setReg(B.buildAnyExt(S32, Index).getReg(0));
|
||||
return true;
|
||||
}
|
||||
case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
|
||||
case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
|
||||
case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
|
||||
Register Index = MI.getOperand(7).getReg();
|
||||
LLT S32 = LLT::scalar(32);
|
||||
if (MRI.getType(Index) != S32)
|
||||
MI.getOperand(7).setReg(B.buildAnyExt(S32, Index).getReg(0));
|
||||
return true;
|
||||
}
|
||||
case Intrinsic::amdgcn_fmed3: {
|
||||
GISelChangeObserver &Observer = Helper.Observer;
|
||||
|
||||
|
@ -4505,6 +4505,22 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
|
||||
case Intrinsic::amdgcn_wmma_f32_16x16x16_f16:
|
||||
case Intrinsic::amdgcn_wmma_i32_16x16x16_iu4:
|
||||
case Intrinsic::amdgcn_wmma_i32_16x16x16_iu8:
|
||||
case Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_fp8:
|
||||
case Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_bf8:
|
||||
case Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_fp8:
|
||||
case Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_bf8:
|
||||
case Intrinsic::amdgcn_wmma_i32_16x16x32_iu4:
|
||||
case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
|
||||
case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
|
||||
case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
|
||||
case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
|
||||
case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
|
||||
case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
|
||||
case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4:
|
||||
case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
|
||||
case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
|
||||
case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
|
||||
case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8:
|
||||
return getDefaultMappingVOP(MI);
|
||||
case Intrinsic::amdgcn_log:
|
||||
case Intrinsic::amdgcn_exp2:
|
||||
|
@ -414,6 +414,22 @@ def : SourceOfDivergence<int_amdgcn_wmma_f16_16x16x16_f16>;
|
||||
def : SourceOfDivergence<int_amdgcn_wmma_bf16_16x16x16_bf16>;
|
||||
def : SourceOfDivergence<int_amdgcn_wmma_i32_16x16x16_iu8>;
|
||||
def : SourceOfDivergence<int_amdgcn_wmma_i32_16x16x16_iu4>;
|
||||
def : SourceOfDivergence<int_amdgcn_wmma_f32_16x16x16_fp8_fp8>;
|
||||
def : SourceOfDivergence<int_amdgcn_wmma_f32_16x16x16_fp8_bf8>;
|
||||
def : SourceOfDivergence<int_amdgcn_wmma_f32_16x16x16_bf8_fp8>;
|
||||
def : SourceOfDivergence<int_amdgcn_wmma_f32_16x16x16_bf8_bf8>;
|
||||
def : SourceOfDivergence<int_amdgcn_wmma_i32_16x16x32_iu4>;
|
||||
def : SourceOfDivergence<int_amdgcn_swmmac_f32_16x16x32_f16>;
|
||||
def : SourceOfDivergence<int_amdgcn_swmmac_f32_16x16x32_bf16>;
|
||||
def : SourceOfDivergence<int_amdgcn_swmmac_f16_16x16x32_f16>;
|
||||
def : SourceOfDivergence<int_amdgcn_swmmac_bf16_16x16x32_bf16>;
|
||||
def : SourceOfDivergence<int_amdgcn_swmmac_i32_16x16x32_iu8>;
|
||||
def : SourceOfDivergence<int_amdgcn_swmmac_i32_16x16x32_iu4>;
|
||||
def : SourceOfDivergence<int_amdgcn_swmmac_i32_16x16x64_iu4>;
|
||||
def : SourceOfDivergence<int_amdgcn_swmmac_f32_16x16x32_fp8_fp8>;
|
||||
def : SourceOfDivergence<int_amdgcn_swmmac_f32_16x16x32_fp8_bf8>;
|
||||
def : SourceOfDivergence<int_amdgcn_swmmac_f32_16x16x32_bf8_fp8>;
|
||||
def : SourceOfDivergence<int_amdgcn_swmmac_f32_16x16x32_bf8_bf8>;
|
||||
def : SourceOfDivergence<int_amdgcn_global_load_tr>;
|
||||
|
||||
// The dummy boolean output is divergent from the IR's perspective,
|
||||
|
@ -151,6 +151,8 @@ public:
|
||||
ImmTyOpSelHi,
|
||||
ImmTyNegLo,
|
||||
ImmTyNegHi,
|
||||
ImmTyIndexKey8bit,
|
||||
ImmTyIndexKey16bit,
|
||||
ImmTyDPP8,
|
||||
ImmTyDppCtrl,
|
||||
ImmTyDppRowMask,
|
||||
@ -383,6 +385,8 @@ public:
|
||||
bool isGDS() const { return isImmTy(ImmTyGDS); }
|
||||
bool isLDS() const { return isImmTy(ImmTyLDS); }
|
||||
bool isCPol() const { return isImmTy(ImmTyCPol); }
|
||||
// True if this operand is an immediate tagged ImmTyIndexKey8bit.
bool isIndexKey8bit() const { return isImmTy(ImmTyIndexKey8bit); }
|
||||
// True if this operand is an immediate tagged ImmTyIndexKey16bit.
bool isIndexKey16bit() const { return isImmTy(ImmTyIndexKey16bit); }
|
||||
bool isTFE() const { return isImmTy(ImmTyTFE); }
|
||||
bool isFORMAT() const { return isImmTy(ImmTyFORMAT) && isUInt<7>(getImm()); }
|
||||
bool isDppBankMask() const { return isImmTy(ImmTyDppBankMask); }
|
||||
@ -656,6 +660,14 @@ public:
|
||||
return isVISrcF16() || isVISrcB32();
|
||||
}
|
||||
|
||||
// True if isRegOrInlineNoMods accepts this operand for the VReg_64 register
// class with f16 as the value type.
bool isVISrc_64F16() const {
|
||||
return isRegOrInlineNoMods(AMDGPU::VReg_64RegClassID, MVT::f16);
|
||||
}
|
||||
|
||||
bool isVISrc_64B32() const {
|
||||
return isRegOrInlineNoMods(AMDGPU::VReg_64RegClassID, MVT::i32);
|
||||
}
|
||||
|
||||
bool isVISrc_64B64() const {
|
||||
return isRegOrInlineNoMods(AMDGPU::VReg_64RegClassID, MVT::i64);
|
||||
}
|
||||
@ -672,6 +684,14 @@ public:
|
||||
return isRegOrInlineNoMods(AMDGPU::VReg_64RegClassID, MVT::i32);
|
||||
}
|
||||
|
||||
bool isVISrc_256B32() const {
|
||||
return isRegOrInlineNoMods(AMDGPU::VReg_256RegClassID, MVT::i32);
|
||||
}
|
||||
|
||||
bool isVISrc_256F32() const {
|
||||
return isRegOrInlineNoMods(AMDGPU::VReg_256RegClassID, MVT::f32);
|
||||
}
|
||||
|
||||
bool isVISrc_256B64() const {
|
||||
return isRegOrInlineNoMods(AMDGPU::VReg_256RegClassID, MVT::i64);
|
||||
}
|
||||
@ -1047,6 +1067,8 @@ public:
|
||||
case ImmTyOffset1: OS << "Offset1"; break;
|
||||
case ImmTySMEMOffsetMod: OS << "SMEMOffsetMod"; break;
|
||||
case ImmTyCPol: OS << "CPol"; break;
|
||||
case ImmTyIndexKey8bit: OS << "index_key"; break;
|
||||
case ImmTyIndexKey16bit: OS << "index_key"; break;
|
||||
case ImmTyTFE: OS << "TFE"; break;
|
||||
case ImmTyD16: OS << "D16"; break;
|
||||
case ImmTyFORMAT: OS << "FORMAT"; break;
|
||||
@ -1604,6 +1626,11 @@ public:
|
||||
ParseStatus parseRegWithFPInputMods(OperandVector &Operands);
|
||||
ParseStatus parseRegWithIntInputMods(OperandVector &Operands);
|
||||
ParseStatus parseVReg32OrOff(OperandVector &Operands);
|
||||
ParseStatus tryParseIndexKey(OperandVector &Operands,
|
||||
AMDGPUOperand::ImmTy ImmTy);
|
||||
ParseStatus parseIndexKey8bit(OperandVector &Operands);
|
||||
ParseStatus parseIndexKey16bit(OperandVector &Operands);
|
||||
|
||||
ParseStatus parseDfmtNfmt(int64_t &Format);
|
||||
ParseStatus parseUfmt(int64_t &Format);
|
||||
ParseStatus parseSymbolicSplitFormat(StringRef FormatStr, SMLoc Loc,
|
||||
@ -1784,6 +1811,8 @@ public:
|
||||
void cvtVOP3OpSel(MCInst &Inst, const OperandVector &Operands);
|
||||
void cvtVOP3(MCInst &Inst, const OperandVector &Operands);
|
||||
void cvtVOP3P(MCInst &Inst, const OperandVector &Operands);
|
||||
void cvtSWMMAC(MCInst &Inst, const OperandVector &Operands);
|
||||
|
||||
void cvtVOPD(MCInst &Inst, const OperandVector &Operands);
|
||||
void cvtVOP3OpSel(MCInst &Inst, const OperandVector &Operands,
|
||||
OptionalImmIndexMap &OptionalIdx);
|
||||
@ -4364,7 +4393,11 @@ bool AMDGPUAsmParser::validateNeg(const MCInst &Inst, int OpName) {
|
||||
uint64_t TSFlags = MII.get(Opc).TSFlags;
|
||||
|
||||
// v_dot4 fp8/bf8 neg_lo/neg_hi not allowed on src0 and src1 (allowed on src2)
|
||||
if (!(TSFlags & SIInstrFlags::IsDOT))
|
||||
// v_wmma iu4/iu8 neg_lo not allowed on src2 (allowed on src0, src1)
|
||||
// v_swmmac f16/bf16 neg_lo/neg_hi not allowed on src2 (allowed on src0, src1)
|
||||
// other wmma/swmmac instructions don't have neg_lo/neg_hi operand.
|
||||
if (!(TSFlags & SIInstrFlags::IsDOT) && !(TSFlags & SIInstrFlags::IsWMMA) &&
|
||||
!(TSFlags & SIInstrFlags::IsSWMMAC))
|
||||
return true;
|
||||
|
||||
int NegIdx = AMDGPU::getNamedOperandIdx(Opc, OpName);
|
||||
@ -6465,6 +6498,33 @@ bool AMDGPUAsmParser::tryParseFmt(const char *Pref,
|
||||
return true;
|
||||
}
|
||||
|
||||
// Parses an "index_key:<int>" operand and, on success, appends it to Operands
// as an immediate of kind ImmTy.
//
// Accepted values are 0..1 for ImmTyIndexKey16bit and 0..3 for
// ImmTyIndexKey8bit; any other ImmTy is not range-checked. A value outside the
// accepted range is reported at the location where the operand started.
// Returns the status of parseIntWithPrefix unchanged when that call does not
// succeed.
ParseStatus AMDGPUAsmParser::tryParseIndexKey(OperandVector &Operands,
                                              AMDGPUOperand::ImmTy ImmTy) {
  const char *Prefix = "index_key";
  int64_t Key = 0;
  SMLoc StartLoc = getLoc();

  auto Status = parseIntWithPrefix(Prefix, Key);
  if (!Status.isSuccess())
    return Status;

  bool OutOfRange = false;
  switch (ImmTy) {
  case AMDGPUOperand::ImmTyIndexKey16bit:
    OutOfRange = Key < 0 || Key > 1;
    break;
  case AMDGPUOperand::ImmTyIndexKey8bit:
    OutOfRange = Key < 0 || Key > 3;
    break;
  default:
    break;
  }

  if (OutOfRange)
    return Error(StartLoc, Twine("out of range ", StringRef(Prefix)));

  Operands.push_back(AMDGPUOperand::CreateImm(this, Key, StartLoc, ImmTy));
  return ParseStatus::Success;
}
|
||||
|
||||
// Parses an index_key operand as an ImmTyIndexKey8bit immediate.
ParseStatus AMDGPUAsmParser::parseIndexKey8bit(OperandVector &Operands) {
  const AMDGPUOperand::ImmTy KeyTy = AMDGPUOperand::ImmTyIndexKey8bit;
  return tryParseIndexKey(Operands, KeyTy);
}
|
||||
|
||||
// Parses an index_key operand as an ImmTyIndexKey16bit immediate.
ParseStatus AMDGPUAsmParser::parseIndexKey16bit(OperandVector &Operands) {
  const AMDGPUOperand::ImmTy KeyTy = AMDGPUOperand::ImmTyIndexKey16bit;
  return tryParseIndexKey(Operands, KeyTy);
}
|
||||
|
||||
// dfmt and nfmt (in a tbuffer instruction) are parsed as one to allow their
|
||||
// values to live in a joint format operand in the MCInst encoding.
|
||||
ParseStatus AMDGPUAsmParser::parseDfmtNfmt(int64_t &Format) {
|
||||
@ -8329,10 +8389,12 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands,
|
||||
}
|
||||
|
||||
int NegLoIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::neg_lo);
|
||||
if (NegLoIdx != -1) {
|
||||
if (NegLoIdx != -1)
|
||||
addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyNegLo);
|
||||
|
||||
int NegHiIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::neg_hi);
|
||||
if (NegHiIdx != -1)
|
||||
addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyNegHi);
|
||||
}
|
||||
|
||||
const int Ops[] = { AMDGPU::OpName::src0,
|
||||
AMDGPU::OpName::src1,
|
||||
@ -8352,11 +8414,11 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands,
|
||||
if (OpSelHiIdx != -1)
|
||||
OpSelHi = Inst.getOperand(OpSelHiIdx).getImm();
|
||||
|
||||
if (NegLoIdx != -1) {
|
||||
int NegHiIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::neg_hi);
|
||||
if (NegLoIdx != -1)
|
||||
NegLo = Inst.getOperand(NegLoIdx).getImm();
|
||||
|
||||
if (NegHiIdx != -1)
|
||||
NegHi = Inst.getOperand(NegHiIdx).getImm();
|
||||
}
|
||||
|
||||
for (int J = 0; J < 3; ++J) {
|
||||
int OpIdx = AMDGPU::getNamedOperandIdx(Opc, Ops[J]);
|
||||
@ -8392,6 +8454,43 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands) {
|
||||
cvtVOP3P(Inst, Operands, OptIdx);
|
||||
}
|
||||
|
||||
// Appends parsed operand number i to Inst.
//
// When opcode Opc has the named operand OpName (callers pass a
// srcN_modifiers name), the operand is added as a modifiers + source pair;
// otherwise only the register is added.
static void addSrcModifiersAndSrc(MCInst &Inst, const OperandVector &Operands,
                                  unsigned i, unsigned Opc, unsigned OpName) {
  auto &Src = static_cast<AMDGPUOperand &>(*Operands[i]);
  const bool HasModifiers = AMDGPU::getNamedOperandIdx(Opc, OpName) != -1;

  if (!HasModifiers) {
    Src.addRegOperands(Inst, 1);
    return;
  }

  Src.addRegOrImmWithFPInputModsOperands(Inst, 2);
}
|
||||
|
||||
// Converts the parsed operands of an SWMMAC instruction into MCInst operands.
//
// The order in which operands are appended is significant and must not be
// changed: parsed operand 1, src0 (with modifiers if the opcode has them),
// src1 (likewise), parsed operand 1 again as srcTiedDef, src2, then the
// optional immediates (index key, clamp), and finally the common VOP3P
// handling.
void AMDGPUAsmParser::cvtSWMMAC(MCInst &Inst, const OperandVector &Operands) {
  const unsigned Opc = Inst.getOpcode();
  auto ParsedOp = [&Operands](unsigned Idx) -> AMDGPUOperand & {
    return static_cast<AMDGPUOperand &>(*Operands[Idx]);
  };

  // Parsed operand 1 is emitted twice: here and again below as srcTiedDef.
  ParsedOp(1).addRegOperands(Inst, 1);
  addSrcModifiersAndSrc(Inst, Operands, 2, Opc, AMDGPU::OpName::src0_modifiers);
  addSrcModifiersAndSrc(Inst, Operands, 3, Opc, AMDGPU::OpName::src1_modifiers);
  ParsedOp(1).addRegOperands(Inst, 1); // srcTiedDef
  ParsedOp(4).addRegOperands(Inst, 1); // src2

  // Everything from parsed operand 5 onwards is recorded by immediate kind.
  OptionalImmIndexMap OptIdx;
  for (unsigned Idx = 5, End = Operands.size(); Idx < End; ++Idx)
    OptIdx[ParsedOp(Idx).getImmTy()] = Idx;

  if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::index_key_8bit)) {
    addOptionalImmOperand(Inst, Operands, OptIdx,
                          AMDGPUOperand::ImmTyIndexKey8bit);
  }

  if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::index_key_16bit)) {
    addOptionalImmOperand(Inst, Operands, OptIdx,
                          AMDGPUOperand::ImmTyIndexKey16bit);
  }

  if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::clamp)) {
    addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyClampSI);
  }

  cvtVOP3P(Inst, Operands, OptIdx);
}
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// VOPD
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
@ -260,8 +260,12 @@ DECODE_OPERAND_SRC_REG_OR_IMM_9(VS_32, OPW32, 32)
|
||||
DECODE_OPERAND_SRC_REG_OR_IMM_9(VS_64, OPW64, 64)
|
||||
DECODE_OPERAND_SRC_REG_OR_IMM_9(VS_64, OPW64, 32)
|
||||
DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_64, OPW64, 64)
|
||||
DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_64, OPW64, 32)
|
||||
DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_64, OPW64, 16)
|
||||
DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_128, OPW128, 32)
|
||||
DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_128, OPW128, 16)
|
||||
DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_256, OPW256, 64)
|
||||
DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_256, OPW256, 32)
|
||||
DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_512, OPW512, 32)
|
||||
DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_1024, OPW1024, 32)
|
||||
|
||||
@ -704,6 +708,10 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
|
||||
break;
|
||||
|
||||
Res = tryDecodeInst(DecoderTableWMMAGFX1164, MI, QW, Address, CS);
|
||||
if (Res)
|
||||
break;
|
||||
|
||||
Res = tryDecodeInst(DecoderTableWMMAGFX1264, MI, QW, Address, CS);
|
||||
} while (false);
|
||||
|
||||
if (Res && AMDGPU::isMAC(MI.getOpcode())) {
|
||||
|
@ -1716,14 +1716,14 @@ bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) {
|
||||
}
|
||||
|
||||
bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) {
|
||||
if (!SIInstrInfo::isWMMA(*MI))
|
||||
if (!SIInstrInfo::isWMMA(*MI) && !SIInstrInfo::isSWMMAC(*MI))
|
||||
return false;
|
||||
|
||||
const SIInstrInfo *TII = ST.getInstrInfo();
|
||||
const SIRegisterInfo *TRI = ST.getRegisterInfo();
|
||||
|
||||
auto IsHazardFn = [MI, TII, TRI](const MachineInstr &I) {
|
||||
if (!SIInstrInfo::isWMMA(I))
|
||||
auto IsHazardFn = [MI, TII, TRI, this](const MachineInstr &I) {
|
||||
if (!SIInstrInfo::isWMMA(I) && !SIInstrInfo::isSWMMAC(I))
|
||||
return false;
|
||||
|
||||
// Src0 or Src1 of the current wmma instruction overlaps with the dest of
|
||||
@ -1753,6 +1753,7 @@ bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) {
|
||||
const MachineOperand *Src2Mods =
|
||||
TII->getNamedOperand(*MI, AMDGPU::OpName::src2_modifiers);
|
||||
const bool NoSrc2Mods =
|
||||
!Src2Mods ||
|
||||
(Src2Mods->getImm() & (SISrcMods::NEG | SISrcMods::NEG_HI)) == 0;
|
||||
// Exception: there is no hazard if the wmma instructions are of the same
|
||||
// type and there is no input modifier on src2 of the current instruction.
|
||||
@ -1760,6 +1761,18 @@ bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) {
|
||||
TII->pseudoToMCOpcode(MI->getOpcode())));
|
||||
}
|
||||
|
||||
// GFX12+ allows overlap of matrix C with PrevDstReg (hardware will stall)
|
||||
// but Index can't overlap with PrevDstReg.
|
||||
if (AMDGPU::isGFX12Plus(ST)) {
|
||||
if (SIInstrInfo::isSWMMAC(*MI)) {
|
||||
const Register CurIndex =
|
||||
TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
|
||||
if (TRI->regsOverlap(PrevDstReg, CurIndex))
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
return false;
|
||||
};
|
||||
|
||||
|
@ -1275,6 +1275,23 @@ void AMDGPUInstPrinter::printPackedModifier(const MCInst *MI,
|
||||
(ModIdx != -1) ? MI->getOperand(ModIdx).getImm() : DefaultValue;
|
||||
}
|
||||
|
||||
// Print three values of neg/opsel for wmma instructions (prints 0 when there
|
||||
// is no src_modifier operand instead of not printing anything).
|
||||
if (MII.get(MI->getOpcode()).TSFlags & SIInstrFlags::IsSWMMAC ||
|
||||
MII.get(MI->getOpcode()).TSFlags & SIInstrFlags::IsWMMA) {
|
||||
NumOps = 0;
|
||||
int DefaultValue = Mod == SISrcMods::OP_SEL_1;
|
||||
for (int OpName :
|
||||
{AMDGPU::OpName::src0_modifiers, AMDGPU::OpName::src1_modifiers,
|
||||
AMDGPU::OpName::src2_modifiers}) {
|
||||
int Idx = AMDGPU::getNamedOperandIdx(Opc, OpName);
|
||||
if (Idx != -1)
|
||||
Ops[NumOps++] = MI->getOperand(Idx).getImm();
|
||||
else
|
||||
Ops[NumOps++] = DefaultValue;
|
||||
}
|
||||
}
|
||||
|
||||
const bool HasDstSel =
|
||||
NumOps > 0 &&
|
||||
Mod == SISrcMods::OP_SEL_0 &&
|
||||
@ -1336,6 +1353,26 @@ void AMDGPUInstPrinter::printNegHi(const MCInst *MI, unsigned OpNo,
|
||||
printPackedModifier(MI, " neg_hi:[", SISrcMods::NEG_HI, O);
|
||||
}
|
||||
|
||||
void AMDGPUInstPrinter::printIndexKey8bit(const MCInst *MI, unsigned OpNo,
|
||||
const MCSubtargetInfo &STI,
|
||||
raw_ostream &O) {
|
||||
auto Imm = MI->getOperand(OpNo).getImm() & 0x7;
|
||||
if (Imm == 0)
|
||||
return;
|
||||
|
||||
O << " index_key:" << Imm;
|
||||
}
|
||||
|
||||
void AMDGPUInstPrinter::printIndexKey16bit(const MCInst *MI, unsigned OpNo,
|
||||
const MCSubtargetInfo &STI,
|
||||
raw_ostream &O) {
|
||||
auto Imm = MI->getOperand(OpNo).getImm() & 0x7;
|
||||
if (Imm == 0)
|
||||
return;
|
||||
|
||||
O << " index_key:" << Imm;
|
||||
}
|
||||
|
||||
void AMDGPUInstPrinter::printInterpSlot(const MCInst *MI, unsigned OpNum,
|
||||
const MCSubtargetInfo &STI,
|
||||
raw_ostream &O) {
|
||||
|
@ -139,6 +139,10 @@ private:
|
||||
const MCSubtargetInfo &STI, raw_ostream &O);
|
||||
void printNegHi(const MCInst *MI, unsigned OpNo,
|
||||
const MCSubtargetInfo &STI, raw_ostream &O);
|
||||
void printIndexKey8bit(const MCInst *MI, unsigned OpNo,
|
||||
const MCSubtargetInfo &STI, raw_ostream &O);
|
||||
void printIndexKey16bit(const MCInst *MI, unsigned OpNo,
|
||||
const MCSubtargetInfo &STI, raw_ostream &O);
|
||||
void printInterpSlot(const MCInst *MI, unsigned OpNo,
|
||||
const MCSubtargetInfo &STI, raw_ostream &O);
|
||||
void printInterpAttr(const MCInst *MI, unsigned OpNo,
|
||||
|
@ -167,6 +167,9 @@ enum : uint64_t {
|
||||
|
||||
// ds_gws_* instructions.
|
||||
GWS = UINT64_C(1) << 62,
|
||||
|
||||
// Is a SWMMAC instruction.
|
||||
IsSWMMAC = UINT64_C(1) << 63,
|
||||
};
|
||||
|
||||
// v_cmp_class_* etc. use a 10-bit mask for what operation is checked.
|
||||
|
@ -208,6 +208,7 @@ bool SIFoldOperands::canUseImmWithOpSel(FoldCandidate &Fold) const {
|
||||
assert(Old.isReg() && Fold.isImm());
|
||||
|
||||
if (!(TSFlags & SIInstrFlags::IsPacked) || (TSFlags & SIInstrFlags::IsMAI) ||
|
||||
(TSFlags & SIInstrFlags::IsWMMA) || (TSFlags & SIInstrFlags::IsSWMMAC) ||
|
||||
(ST->hasDOTOpSelHazard() && (TSFlags & SIInstrFlags::IsDOT)))
|
||||
return false;
|
||||
|
||||
|
@ -8242,6 +8242,36 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
|
||||
SIInstrInfo::MO_ABS32_LO);
|
||||
return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
|
||||
}
|
||||
case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
|
||||
case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
|
||||
case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
|
||||
case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
|
||||
case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
|
||||
case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
|
||||
case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
|
||||
case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
|
||||
if (Op.getOperand(4).getValueType() == MVT::i32)
|
||||
return SDValue();
|
||||
|
||||
SDLoc SL(Op);
|
||||
auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i32);
|
||||
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
|
||||
Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
|
||||
Op.getOperand(3), IndexKeyi32);
|
||||
}
|
||||
case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
|
||||
case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
|
||||
case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
|
||||
if (Op.getOperand(6).getValueType() == MVT::i32)
|
||||
return SDValue();
|
||||
|
||||
SDLoc SL(Op);
|
||||
auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, MVT::i32);
|
||||
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
|
||||
{Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
|
||||
Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
|
||||
IndexKeyi32, Op.getOperand(7)});
|
||||
}
|
||||
default:
|
||||
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
|
||||
AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
|
||||
|
@ -161,6 +161,9 @@ class InstSI <dag outs, dag ins, string asm = "",
|
||||
// ds_gws_* instructions.
|
||||
field bit GWS = 0;
|
||||
|
||||
// This bit indicates that this is one of the SWMMAC instructions.
|
||||
field bit IsSWMMAC = 0;
|
||||
|
||||
// These need to be kept in sync with the enum in SIInstrFlags.
|
||||
let TSFlags{0} = SALU;
|
||||
let TSFlags{1} = VALU;
|
||||
@ -248,6 +251,8 @@ class InstSI <dag outs, dag ins, string asm = "",
|
||||
|
||||
let TSFlags{62} = GWS;
|
||||
|
||||
let TSFlags{63} = IsSWMMAC;
|
||||
|
||||
let SchedRW = [Write32Bit];
|
||||
|
||||
let AsmVariantName = AMDGPUAsmVariants.Default;
|
||||
|
@ -802,6 +802,14 @@ public:
|
||||
return isMFMA(MI) || isWMMA(MI);
|
||||
}
|
||||
|
||||
static bool isSWMMAC(const MachineInstr &MI) {
|
||||
return MI.getDesc().TSFlags & SIInstrFlags::IsSWMMAC;
|
||||
}
|
||||
|
||||
// Returns true when the descriptor for Opcode has the
// SIInstrFlags::IsSWMMAC bit set.
bool isSWMMAC(uint16_t Opcode) const {
  const uint64_t Flags = get(Opcode).TSFlags;
  return (Flags & SIInstrFlags::IsSWMMAC) != 0;
}
|
||||
|
||||
// Returns true when the descriptor for Opcode has the SIInstrFlags::IsDOT
// bit set.
bool isDOT(uint16_t Opcode) const {
  const uint64_t Flags = get(Opcode).TSFlags;
  return (Flags & SIInstrFlags::IsDOT) != 0;
}
|
||||
|
@ -1088,6 +1088,9 @@ def op_sel_hi0 : ArrayOperand0<"op_sel_hi", "OpSelHi">;
|
||||
def neg_lo0 : ArrayOperand0<"neg_lo", "NegLo">;
|
||||
def neg_hi0 : ArrayOperand0<"neg_hi", "NegHi">;
|
||||
|
||||
def IndexKey16bit : CustomOperand<i32, 1>;
|
||||
def IndexKey8bit : CustomOperand<i32, 1>;
|
||||
|
||||
def dpp8 : CustomOperand<i32, 0, "DPP8">;
|
||||
def dpp_ctrl : CustomOperand<i32, 0, "DPPCtrl">;
|
||||
|
||||
@ -1344,6 +1347,13 @@ def VOP3PModsDOT : ComplexPattern<untyped, 2, "SelectVOP3PModsDOT">;
|
||||
def VOP3PModsNeg : ComplexPattern<untyped, 1, "SelectVOP3PModsNeg">;
|
||||
def WMMAOpSelVOP3PMods : ComplexPattern<untyped, 1, "SelectWMMAOpSelVOP3PMods">;
|
||||
|
||||
def WMMAModsF32NegAbs : ComplexPattern<untyped, 2, "SelectWMMAModsF32NegAbs">;
|
||||
def WMMAModsF16Neg : ComplexPattern<untyped, 2, "SelectWMMAModsF16Neg">;
|
||||
def WMMAModsF16NegAbs : ComplexPattern<untyped, 2, "SelectWMMAModsF16NegAbs">;
|
||||
def WMMAVISrc : ComplexPattern<untyped, 1, "SelectWMMAVISrc">;
|
||||
def SWMMACIndex8 : ComplexPattern<untyped, 2, "SelectSWMMACIndex8">;
|
||||
def SWMMACIndex16 : ComplexPattern<untyped, 2, "SelectSWMMACIndex16">;
|
||||
|
||||
def VOP3OpSel : ComplexPattern<untyped, 2, "SelectVOP3OpSel">;
|
||||
|
||||
def VOP3OpSelMods : ComplexPattern<untyped, 2, "SelectVOP3OpSelMods">;
|
||||
@ -2278,6 +2288,7 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> {
|
||||
field bit IsDOT = 0;
|
||||
field bit IsSingle = 0;
|
||||
field bit IsWMMA = 0;
|
||||
field bit IsSWMMAC = 0;
|
||||
|
||||
field bit HasDst = !ne(DstVT.Value, untyped.Value);
|
||||
field bit HasDst32 = HasDst;
|
||||
|
@ -1341,9 +1341,14 @@ def VCSrc_v2f16 : RegOrV2F16 <"VS_32", "OPERAND_REG_INLINE_C">;
|
||||
// VISrc_* Operands with a VGPR or an inline constant
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
def VISrc_64_f16 : RegOrF16 <"VReg_64", "OPERAND_REG_INLINE_C">;
|
||||
def VISrc_64_b32 : RegOrB32 <"VReg_64", "OPERAND_REG_INLINE_C">;
|
||||
def VISrc_64_f64 : RegOrF64 <"VReg_64", "OPERAND_REG_INLINE_C">;
|
||||
def VISrc_128_f16 : RegOrF16 <"VReg_128", "OPERAND_REG_INLINE_C">;
|
||||
def VISrc_128_b32 : RegOrB32 <"VReg_128", "OPERAND_REG_INLINE_C">;
|
||||
def VISrc_128_f32 : RegOrF32 <"VReg_128", "OPERAND_REG_INLINE_C">;
|
||||
def VISrc_256_b32 : RegOrB32 <"VReg_256", "OPERAND_REG_INLINE_C">;
|
||||
def VISrc_256_f32 : RegOrF32 <"VReg_256", "OPERAND_REG_INLINE_C">;
|
||||
def VISrc_256_f64 : RegOrF64 <"VReg_256", "OPERAND_REG_INLINE_C">;
|
||||
def VISrc_512_b32 : RegOrB32 <"VReg_512", "OPERAND_REG_INLINE_C">;
|
||||
def VISrc_512_f32 : RegOrF32 <"VReg_512", "OPERAND_REG_INLINE_C">;
|
||||
|
@ -936,16 +936,19 @@ multiclass WMMAInst<string Suffix, string Instr, VOPProfile P, SDPatternOperator
|
||||
!cast<Instruction>(NAME # _threeaddr # Suffix)>;
|
||||
}
|
||||
|
||||
if !eq(Type, WMMAOpSel) then {
|
||||
def : WMMAOpSelPat<!cast<Instruction>(NAME # _twoaddr # Suffix), node, P>;
|
||||
} else if !eq(Type, WMMAUIClamp) then {
|
||||
def : WMMAUIClampPat<!cast<Instruction>(NAME # _twoaddr # Suffix), node, P>;
|
||||
} else {
|
||||
def : WMMARegularPat<!cast<Instruction>(NAME # _twoaddr # Suffix), node, P>;
|
||||
let SubtargetPredicate = isGFX11Only in {
|
||||
if !eq(Type, WMMAOpSel) then {
|
||||
def : WMMAOpSelPat<!cast<Instruction>(NAME # _twoaddr # Suffix), node, P>;
|
||||
} else if !eq(Type, WMMAUIClamp) then {
|
||||
def : WMMAUIClampPat<!cast<Instruction>(NAME # _twoaddr # Suffix), node, P>;
|
||||
} else {
|
||||
def : WMMARegularPat<!cast<Instruction>(NAME # _twoaddr # Suffix), node, P>;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
let WaveSizePredicate = isWave32 in {
|
||||
defm V_WMMA_F32_16X16X16_F16 : WMMAInst<"_w32", "v_wmma_f32_16x16x16_f16", VOP_V8F32_V16F16_V16F16_V8F32, int_amdgcn_wmma_f32_16x16x16_f16, VRegSrc_256, WMMARegular, 1>;
|
||||
defm V_WMMA_F32_16X16X16_BF16 : WMMAInst<"_w32", "v_wmma_f32_16x16x16_bf16", VOP_V8F32_V16I16_V16I16_V8F32, int_amdgcn_wmma_f32_16x16x16_bf16, VRegSrc_256, WMMARegular, 1>;
|
||||
@ -969,6 +972,398 @@ let WaveSizePredicate = isWave64 in {
|
||||
|
||||
}
|
||||
|
||||
class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType,
|
||||
bit _IsIU, bit _IsFP8BF8>
|
||||
: VOP3P_Profile<VOPProfile<ArgTy>> {
|
||||
bit IsIU = _IsIU;
|
||||
bit IsFP8BF8 = _IsFP8BF8;
|
||||
bit IsF16BF16 = !not(!or(IsIU, IsFP8BF8));
|
||||
|
||||
int IndexType = _IndexType;
|
||||
|
||||
let IsPacked = 1;
|
||||
let IsWMMA = !not(_IsSWMMAC);
|
||||
let IsSWMMAC = _IsSWMMAC;
|
||||
|
||||
bit IsAB_F16 = !and(IsF16BF16, ArgTy[1].isFP);
|
||||
bit IsAB_BF16 = !and(IsF16BF16, isIntType<ArgTy[1]>.ret);
|
||||
bit IsC_F32 = !or(!eq(ArgTy[3], v8f32), !eq(ArgTy[3], v4f32));
|
||||
bit IsC_BF16 = !or(!eq(ArgTy[3], v8i16), !eq(ArgTy[3], v4i16));
|
||||
bit IsC_F16 = !or(!eq(ArgTy[3], v8f16), !eq(ArgTy[3], v4f16));
|
||||
|
||||
bit NegLo01 = !or(IsF16BF16, IsIU);
|
||||
bit NegLo2 = !and(!or(IsF16BF16, IsFP8BF8), IsWMMA);
|
||||
bit NegHi01 = IsF16BF16;
|
||||
bit NegHi2 = !and(!or(IsF16BF16, IsFP8BF8), IsWMMA);
|
||||
bit NegLoAny = !or(NegLo01, NegLo2);
|
||||
bit NegHiAny = !or(NegHi01, NegHi2);
|
||||
|
||||
let DstRC = !cond(!eq(ArgTy[0], v8f32): VDst_256,
|
||||
!eq(ArgTy[0], v8i32): VDst_256,
|
||||
!eq(ArgTy[0], v8f16): VDst_128,
|
||||
!eq(ArgTy[0], v8i16): VDst_128,
|
||||
!eq(ArgTy[0], v4f32): VDst_128,
|
||||
!eq(ArgTy[0], v4i32): VDst_128,
|
||||
!eq(ArgTy[0], v4f16): VDst_64,
|
||||
!eq(ArgTy[0], v4i16): VDst_64);
|
||||
let Src0RC64 = !cond(!eq(ArgTy[1], v8f16): VRegSrc_128,
|
||||
!eq(ArgTy[1], v4f16): VRegSrc_64,
|
||||
!eq(ArgTy[1], v4i16): VRegSrc_64,
|
||||
!eq(ArgTy[1], v8i16): VRegSrc_128,
|
||||
!eq(ArgTy[1], v4i32): VRegSrc_128,
|
||||
!eq(ArgTy[1], v2i32): VRegSrc_64,
|
||||
!eq(ArgTy[1], i32) : VRegSrc_32);
|
||||
// Register class of src1, selected from the src1 (matrix B) type ArgTy[2].
// Every arm tests ArgTy[2]; the v4i16/v4f16 arms previously tested ArgTy[1]
// (the src0 type), which only gave the right answer because src0 and src1
// share a type in the profiles that reach those arms.
let Src1RC64 = !cond(!eq(ArgTy[2], v16f16): VRegSrc_256,
                     !eq(ArgTy[2], v16i16): VRegSrc_256,
                     !eq(ArgTy[2], v8f16):  VRegSrc_128,
                     !eq(ArgTy[2], v8i16):  VRegSrc_128,
                     !eq(ArgTy[2], v4i32):  VRegSrc_128,
                     !eq(ArgTy[2], v4i16):  VRegSrc_64,
                     !eq(ArgTy[2], v4f16):  VRegSrc_64,
                     !eq(ArgTy[2], v2i32):  VRegSrc_64,
                     !eq(ArgTy[2], i32) :   VRegSrc_32);
|
||||
let Src2RC64 = !if(IsSWMMAC, DstRC,
|
||||
!cond(!eq(ArgTy[3], v8f32): VISrc_256_f32,
|
||||
!eq(ArgTy[3], v8i32): VISrc_256_b32,
|
||||
!eq(ArgTy[3], v8f16): VISrc_128_f16,
|
||||
!eq(ArgTy[3], v8i16): VISrc_128_f32, // bf16
|
||||
!eq(ArgTy[3], v4f16): VISrc_64_f16,
|
||||
!eq(ArgTy[3], v4i16): VISrc_64_b32,
|
||||
!eq(ArgTy[3], v4i32): VISrc_128_b32,
|
||||
!eq(ArgTy[3], v4f32): VISrc_128_f32));
|
||||
|
||||
// For f16 and bf16 matrices A and B, each element can be modified by
|
||||
// fneg(neg_lo,neg_hi = 1). For iu4 and iu8 matrices A and B neg_lo is
|
||||
// overloaded to mean unsigned/signed: neg_lo = 0 (u4 and u8) unsigned(zext)
|
||||
// neg_lo = 1 (i4 and i8) signed(sext). For f16, bf16 and f32 matrix C each
|
||||
// element can be modified by fneg(neg_lo = 1) or fabs(neg_hi = 1).
|
||||
|
||||
// Opcode | src0/src1 - matrix A/B | src2 - matrix C or Index
|
||||
// ---------------------------------------------------------------------------
|
||||
// wmma f32_f16 | both neg_lo,neg_hi = 1 | neg_lo = 1 neg C(f32)
|
||||
// wmma f32_bf16 | neg A/B (f16 or bf16) | neg_hi = 1 abs C(f32)
|
||||
// ---------------------------------------------------------------------------
|
||||
// wmma f16_f16 | both neg_lo,neg_hi = 1 | neg_lo = 1 neg C(f16 or bf16)
|
||||
// wmma bf16_bf16 | neg A/B (f16 or bf16) | neg_hi = 1 abs C(f16 or bf16)
|
||||
// ---------------------------------------------------------------------------
|
||||
// wmma i32_iu8/iu4 | neg_lo = 0 u4/u8(zext) | not allowed for
|
||||
// | neg_lo = 1 i4/i8(sext) | i32 matrices
|
||||
// ---------------------------------------------------------------------------
|
||||
// wmma f32_fp8/bf8 | not allowed for | neg_lo = 1 neg C(f32)
|
||||
// (4 instructions) | f8 and bf8 matrices | neg_hi = 1 abs C(f32)
|
||||
// ---------------------------------------------------------------------------
|
||||
// swmmac f32_f16 | both neg_lo,neg_hi = 1 | not allowed for sparse matrix
|
||||
// swmmac f32_bf16 | neg A/B (f16 or bf16) | A Index - matrix C is in dst
|
||||
// ---------------------------------------------------------------------------
|
||||
// swmmac f16_f16 | both neg_lo,neg_hi = 1 | not allowed for sparse matrix
|
||||
// swmmac bf16_bf16 | neg A/B (f16 or bf16) | A Index - matrix C is in dst
|
||||
// ---------------------------------------------------------------------------
|
||||
// swmmac i32_iu8/iu4 | neg_lo = 0 u4/u8(zext) | not allowed for sparse matrix
|
||||
// | neg_lo = 1 i4/i8(sext) | A Index - matrix C is in dst
|
||||
// ---------------------------------------------------------------------------
|
||||
// swmmac f32_fp8/bf8 | not allowed for | not allowed for sparse matrix
|
||||
// (4 instructions) | f8 and bf8 matrices | A Index - matrix C is in dst
|
||||
|
||||
// pseudo
|
||||
|
||||
// fp8bf8 wmmas don't use src (0 and 1) modifiers, iu use neg_lo, f16 and bf16
|
||||
// use neg_lo and neg_hi. iu wmmas (C is i32) don't use src 2 modifiers,
|
||||
// remaining wmmas(f16, bf16 and f8bf8) use neg_lo and neg_hi for C (C is f32
|
||||
// f16 or bf16). swmmac use index_key and don't use src 2 modifiers.
|
||||
|
||||
dag Src0Mods = !if(IsFP8BF8, (ins), (ins PackedF16InputMods:$src0_modifiers));
|
||||
dag Src1Mods = !if(IsFP8BF8, (ins), (ins PackedF16InputMods:$src1_modifiers));
|
||||
dag Src2Mods = !if(IsIU, (ins), (ins PackedF16InputMods:$src2_modifiers));
|
||||
dag IndexKey = !cond(!eq(IndexType, 0) : (ins),
|
||||
!eq(IndexType, 8) : (ins IndexKey8bit:$index_key_8bit),
|
||||
!eq(IndexType, 16): (ins IndexKey16bit:$index_key_16bit));
|
||||
dag Clamp = !if(IsIU, (ins clampmod0:$clamp), (ins));
|
||||
dag Neg = !cond(!and(NegLoAny, NegHiAny) : (ins neg_lo0:$neg_lo, neg_hi0:$neg_hi),
|
||||
!and(NegLoAny, !not(NegHiAny)) : (ins neg_lo0:$neg_lo),
|
||||
!and(!not(NegLoAny), !not(NegHiAny)) : (ins));
|
||||
|
||||
let InsVOP3P = !con(Src0Mods, (ins Src0RC64:$src0), Src1Mods, (ins Src1RC64:$src1),
|
||||
!cond(IsWMMA : !con(Src2Mods, (ins Src2RC64:$src2)),
|
||||
IsSWMMAC : !con((ins DstRC:$srcTiedDef), (ins VRegSrc_32:$src2), IndexKey)),
|
||||
Clamp, Neg);
|
||||
|
||||
// asm
|
||||
|
||||
string IndexKeyAsm = !cond(!eq(IndexType, 0) : "",
|
||||
!eq(IndexType, 8) : "$index_key_8bit",
|
||||
!eq(IndexType, 16) : "$index_key_16bit");
|
||||
string ClampAsm = !if(IsIU, "$clamp", "");
|
||||
string NegAsm = !cond(!and(NegLoAny, NegHiAny) : "$neg_lo$neg_hi",
|
||||
!and(NegLoAny, !not(NegHiAny)) : "$neg_lo",
|
||||
!and(!not(NegLoAny), !not(NegHiAny)) : "");
|
||||
|
||||
let AsmVOP3P = "$vdst, $src0, $src1, $src2"#IndexKeyAsm#NegAsm#ClampAsm;
|
||||
|
||||
// isel patterns
|
||||
|
||||
dag Src0InPat = !cond(IsAB_F16 : (ins (Src0VT (WMMAModsF16Neg Src0VT:$src0, i32:$src0_modifiers))),
|
||||
IsAB_BF16 : (ins Src0VT:$src0),
|
||||
IsIU : (ins (VOP3PModsNeg i32:$src0_modifiers), Src0VT:$src0),
|
||||
IsFP8BF8 : (ins Src0VT:$src0));
|
||||
dag Src0OutPat = !cond(IsAB_F16 : (ins i32:$src0_modifiers, Src0VT:$src0),
|
||||
IsAB_BF16 : (ins (i32 8), Src0VT:$src0),
|
||||
IsIU : (ins i32:$src0_modifiers, Src0VT:$src0),
|
||||
IsFP8BF8 : (ins Src0VT:$src0));
|
||||
dag Src1InPat = !cond(IsAB_F16 : (ins (Src1VT (WMMAModsF16Neg Src1VT:$src1, i32:$src1_modifiers))),
|
||||
IsAB_BF16 : (ins Src1VT:$src1),
|
||||
IsIU : (ins (VOP3PModsNeg i32:$src1_modifiers), Src1VT:$src1),
|
||||
IsFP8BF8 : (ins Src1VT:$src1));
|
||||
dag Src1OutPat = !cond(IsAB_F16 : (ins i32:$src1_modifiers, Src1VT:$src1),
|
||||
IsAB_BF16 : (ins (i32 8), Src1VT:$src1),
|
||||
IsIU : (ins i32:$src1_modifiers, Src1VT:$src1),
|
||||
IsFP8BF8 : (ins Src1VT:$src1));
|
||||
dag Src2InPatWmma = !cond(IsC_F32 : (ins (Src2VT (WMMAModsF32NegAbs Src2VT:$src2, i32:$src2_modifiers))),
|
||||
IsC_F16 : (ins (Src2VT (WMMAModsF16NegAbs Src2VT:$src2, i32:$src2_modifiers))),
|
||||
IsC_BF16 : (ins Src2VT:$src2),
|
||||
IsIU : (ins Src2VT:$src2),
|
||||
IsSWMMAC : (ins));
|
||||
dag Src2OutPatWmma = !cond(IsC_F32 : (ins i32:$src2_modifiers, Src2VT:$src2),
|
||||
IsC_F16 : (ins i32:$src2_modifiers, Src2VT:$src2),
|
||||
IsC_BF16 : (ins (i32 8), Src2VT:$src2),
|
||||
IsIU : (ins Src2VT:$src2),
|
||||
IsSWMMAC : (ins));
|
||||
dag ClampPat = !if(IsIU, (ins i1:$clamp), (ins));
|
||||
dag IndexInPat = !cond(!eq(IndexType, 0) : (ins i32:$src2),
|
||||
!eq(IndexType, 8) : (ins (i32 (SWMMACIndex8 i32:$src2, i32:$index_key_8bit))),
|
||||
!eq(IndexType, 16): (ins (i32 (SWMMACIndex16 i32:$src2, i32:$index_key_16bit))));
|
||||
dag IndexOutPat = !cond(!eq(IndexType, 0) : (ins i32:$src2),
|
||||
!eq(IndexType, 8) : (ins i32:$src2, i32:$index_key_8bit),
|
||||
!eq(IndexType, 16): (ins i32:$src2, i32:$index_key_16bit));
|
||||
dag Src2InlineInPat = (ins (Src2VT (WMMAVISrc Src2VT:$src2)));
|
||||
dag Src2InlineOutPat = !con(!if(IsIU, (ins), (ins (i32 8))), (ins Src2VT:$src2));
|
||||
|
||||
|
||||
dag WmmaInPat = !con(Src0InPat, Src1InPat, Src2InPatWmma, ClampPat);
|
||||
dag WmmaOutPat = !con(Src0OutPat, Src1OutPat, Src2OutPatWmma, ClampPat);
|
||||
|
||||
dag SwmmacInPat = !con(Src0InPat, Src1InPat, (ins Src2VT:$srcTiedDef), IndexInPat, ClampPat);
|
||||
dag SwmmacOutPat = !con(Src0OutPat, Src1OutPat, (ins Src2VT:$srcTiedDef), IndexOutPat, ClampPat);
|
||||
|
||||
// wmma pattern where src2 is inline imm uses _threeaddr pseudo,
|
||||
// can't use _twoaddr since it would violate src2 tied to vdst constraint.
|
||||
dag WmmaInlineInPat = !con(Src0InPat, Src1InPat, Src2InlineInPat, ClampPat);
|
||||
dag WmmaInlineOutPat = !con(Src0OutPat, Src1OutPat, Src2InlineOutPat, ClampPat);
|
||||
}
|
||||
|
||||
// Defines the pseudo pair for one GFX12 WMMA instruction:
//   _twoaddr   - $vdst is tied to $src2 and the record is marked convertible
//                to three-address form.
//   _threeaddr - no tie between $vdst and $src2.
// Both share the mnemonic Instr and the PseudoInstr name
// Instr#PseudoInstrSuffix, and are linked by a WMMAOpcodeMapping record.
multiclass WMMAInstGFX12<string Instr, VOP3PWMMA_Profile WMMAProfile, string PseudoInstrSuffix> {
  def _twoaddr : VOP3P_Pseudo<Instr, WMMAProfile> {
    let Mnemonic = Instr;
    let mayRaiseFPException = 0;
    let ReadsModeReg = 0;
    let Constraints = "@earlyclobber $vdst,$vdst = $src2";
    let isConvertibleToThreeAddress = 1;
    let PseudoInstr = Instr#PseudoInstrSuffix;
  }

  def _threeaddr : VOP3P_Pseudo<Instr, WMMAProfile> {
    let Mnemonic = Instr;
    let mayRaiseFPException = 0;
    let ReadsModeReg = 0;
    let Constraints = "@earlyclobber $vdst";
    let SchedRW = [Write32Bit, Write32Bit];
    let PseudoInstr = Instr#PseudoInstrSuffix;
  }

  def : WMMAOpcodeMapping<!cast<Instruction>(NAME # _twoaddr),
                          !cast<Instruction>(NAME # _threeaddr)>;
}
|
||||
|
||||
// Defines the single _twoaddr pseudo for one GFX12 SWMMAC instruction.
// $vdst is tied to the $srcTiedDef input, and assembly matching goes through
// the cvtSWMMAC converter.
multiclass SWMMACInstGFX12<string Instr, VOP3PWMMA_Profile WMMAProfile, string PseudoInstrSuffix> {
  let Mnemonic = Instr,
      mayRaiseFPException = 0,
      ReadsModeReg = 0,
      AsmMatchConverter = "cvtSWMMAC",
      Constraints = "@earlyclobber $vdst,$vdst = $srcTiedDef" in
  def _twoaddr : VOP3P_Pseudo<Instr, WMMAProfile> {
    let PseudoInstr = Instr#PseudoInstrSuffix;
  }
}
|
||||
|
||||
// First argument in Profile is types for matrices D, A, B and C (D = A * B + C)
|
||||
// as used by LLVM IR; the types are vectors of matrix elements.
|
||||
// wave32:
|
||||
// For 16x16 matrices, lanes 0 to 31 will have 8 matrix elts,
|
||||
// for 16x32 matrices 16 elts, and for 16x64 matrices 32 elts.
|
||||
// wave64:
|
||||
// lanes will have half the size of elements in lanes compared to wave32 with
|
||||
// exception of 16x16_iu4: lanes0-31 will have 8xi4, remaining lanes are ignored
|
||||
|
||||
// general idea on element distribution differences:
|
||||
// wave32: lane n has 8 matrix elements
|
||||
// wave64: lane n has first 4, lane n+32 has other 4 elements
|
||||
|
||||
// Index size: for every 2 elements in a lane, 4 bits are needed in the index.
|
||||
|
||||
// Non-standard types (iu8, iu4, fp8, bf8) will be packed in vectors of i32s.
|
||||
// Original type for them is in comment on the right and refers to A and B.
|
||||
|
||||
// WMMA profiles for wave32. The type list is [D, A, B, C].
// NOTE(review): the four trailing arguments appear to be
// (is-SWMMAC, index type, is-IU, is-FP8/BF8) judging by which profiles set
// them - confirm against the VOP3PWMMA_Profile declaration.
def F32_F16_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v8f16, v8f16, v8f32], 0, 0, 0, 0>;
|
||||
def F32_BF16_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v8i16, v8i16, v8f32], 0, 0, 0, 0>;
|
||||
def F16_F16_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v8f16, v8f16, v8f16], 0, 0, 0, 0>;
|
||||
def BF16_BF16_WMMA_w32 : VOP3PWMMA_Profile<[v8i16, v8i16, v8i16, v8i16], 0, 0, 0, 0>;
|
||||
def I32_IU8_WMMA_w32 : VOP3PWMMA_Profile<[v8i32, v2i32, v2i32, v8i32], 0, 0, 1, 0>; // 8xi8
|
||||
def I32_IU4X16_WMMA_w32 : VOP3PWMMA_Profile<[v8i32, i32, i32, v8i32], 0, 0, 1, 0>; // 8xi4
|
||||
def F32_FP8BF8_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v2i32, v2i32, v8f32], 0, 0, 0, 1>; // 8xf8
|
||||
def I32_IU4X32_WMMA_w32 : VOP3PWMMA_Profile<[v8i32, v2i32, v2i32, v8i32], 0, 0, 1, 0>; // 16xi4
|
||||
|
||||
// WMMA profiles for wave64. The type list is [D, A, B, C]; the vector types
// here have half as many elements as the wave32 profiles of the same name,
// except where the trailing comment marks an exception (*).
def F32_F16_WMMA_w64 : VOP3PWMMA_Profile<[v4f32, v4f16, v4f16, v4f32], 0, 0, 0, 0>;
|
||||
def F32_BF16_WMMA_w64 : VOP3PWMMA_Profile<[v4f32, v4i16, v4i16, v4f32], 0, 0, 0, 0>;
|
||||
def F16_F16_WMMA_w64 : VOP3PWMMA_Profile<[v4f16, v4f16, v4f16, v4f16], 0, 0, 0, 0>;
|
||||
def BF16_BF16_WMMA_w64 : VOP3PWMMA_Profile<[v4i16, v4i16, v4i16, v4i16], 0, 0, 0, 0>;
|
||||
def I32_IU8_WMMA_w64 : VOP3PWMMA_Profile<[v4i32, i32, i32, v4i32], 0, 0, 1, 0>; // 4xi8
|
||||
def I32_IU4X16_WMMA_w64 : VOP3PWMMA_Profile<[v4i32, i32, i32, v4i32], 0, 0, 1, 0>; // 8xi4 *
|
||||
def F32_FP8BF8_WMMA_w64 : VOP3PWMMA_Profile<[v4f32, i32, i32, v4f32], 0, 0, 0, 1>; // 4xf8
|
||||
def I32_IU4X32_WMMA_w64 : VOP3PWMMA_Profile<[v4i32, i32, i32, v4i32], 0, 0, 1, 0>; // 8xi4
|
||||
|
||||
// SWMMAC profiles for wave32. The type list is [D, A, B, C]; B is twice as
// wide as A in every entry. The second argument is 1 for all of them and the
// third is 16 everywhere except I32_IU4X64_SWMMAC_w32, which passes 0 (**).
// NOTE(review): the third argument is presumably the index_key width that
// VOP3PeWmma reads as IndexType - confirm against VOP3PWMMA_Profile.
def F32_F16_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v8f16, v16f16, v8f32], 1, 16, 0, 0>;
|
||||
def F32_BF16_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v8i16, v16i16, v8f32], 1, 16, 0, 0>;
|
||||
def F16_F16_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f16, v8f16, v16f16, v8f16], 1, 16, 0, 0>;
|
||||
def BF16_BF16_SWMMAC_w32 : VOP3PWMMA_Profile<[v8i16, v8i16, v16i16, v8i16], 1, 16, 0, 0>;
|
||||
def I32_IU8_SWMMAC_w32 : VOP3PWMMA_Profile<[v8i32, v2i32, v4i32, v8i32], 1, 16, 1, 0>; // 8xi8, 16xi8
|
||||
def I32_IU4X32_SWMMAC_w32 : VOP3PWMMA_Profile<[v8i32, i32, v2i32, v8i32], 1, 16, 1, 0>; // 8xi4, 16xi4
|
||||
def I32_IU4X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8i32, v2i32, v4i32, v8i32], 1, 0, 1, 0>; // 16xi4, 32xi4 **
|
||||
def F32_FP8BF8_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v2i32, v4i32, v8f32], 1, 16, 0, 1>; // 8xf8, 16xf8
|
||||
|
||||
// SWMMAC profiles for wave64. The type list is [D, A, B, C]. The third
// argument is 8 for most entries and 16 for the two IU4 entries.
// NOTE(review): the third argument is presumably the index_key width that
// VOP3PeWmma reads as IndexType - confirm against VOP3PWMMA_Profile.
def F32_F16_SWMMAC_w64 : VOP3PWMMA_Profile<[v4f32, v4f16, v8f16, v4f32], 1, 8, 0, 0>;
|
||||
def F32_BF16_SWMMAC_w64 : VOP3PWMMA_Profile<[v4f32, v4i16, v8i16, v4f32], 1, 8, 0, 0>;
|
||||
def F16_F16_SWMMAC_w64 : VOP3PWMMA_Profile<[v4f16, v4f16, v8f16, v4f16], 1, 8, 0, 0>;
|
||||
def BF16_BF16_SWMMAC_w64 : VOP3PWMMA_Profile<[v4i16, v4i16, v8i16, v4i16], 1, 8, 0, 0>;
|
||||
def I32_IU8_SWMMAC_w64 : VOP3PWMMA_Profile<[v4i32, i32, v2i32, v4i32], 1, 8, 1, 0>; // 4xi8, 8xi8
|
||||
def I32_IU4X32_SWMMAC_w64 : VOP3PWMMA_Profile<[v4i32, i32, i32, v4i32], 1, 16, 1, 0>; // 8xi4, 8xi4 ***
|
||||
def I32_IU4X64_SWMMAC_w64 : VOP3PWMMA_Profile<[v4i32, i32, v2i32, v4i32], 1, 16, 1, 0>; // 8xi4, 16xi4
|
||||
def F32_FP8BF8_SWMMAC_w64 : VOP3PWMMA_Profile<[v4f32, i32, v2i32, v4f32], 1, 8, 0, 1>; // 4xf8, 8xf8
|
||||
|
||||
// * IU4X16_WMMA_w64 lanes 0-31 will have 8xi4, remaining lanes are ignored
|
||||
// ** IU4X64_SWMMAC_w32 index is i32, index_key is not used
|
||||
// *** IU4X32_SWMMAC_w64 lanes 0-31 will have 8xi4, remaining lanes are ignored
|
||||
// for matrix A, index is i16; Matrix B uses all lanes
|
||||
|
||||
// Wave32 pseudo instructions. Each defm pairs a mnemonic with its _w32
// profile and passes "_w32" as the PseudoInstr suffix.
let WaveSizePredicate = isWave32 in {
|
||||
// WMMA: each entry yields a _twoaddr and a _threeaddr pseudo.
defm V_WMMA_F32_16X16X16_F16_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x16_f16", F32_F16_WMMA_w32, "_w32">;
|
||||
defm V_WMMA_F32_16X16X16_BF16_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x16_bf16", F32_BF16_WMMA_w32, "_w32">;
|
||||
defm V_WMMA_F16_16X16X16_F16_w32 : WMMAInstGFX12<"v_wmma_f16_16x16x16_f16", F16_F16_WMMA_w32, "_w32">;
|
||||
defm V_WMMA_BF16_16X16X16_BF16_w32 : WMMAInstGFX12<"v_wmma_bf16_16x16x16_bf16", BF16_BF16_WMMA_w32, "_w32">;
|
||||
defm V_WMMA_I32_16X16X16_IU8_w32 : WMMAInstGFX12<"v_wmma_i32_16x16x16_iu8", I32_IU8_WMMA_w32, "_w32">;
|
||||
defm V_WMMA_I32_16X16X16_IU4_w32 : WMMAInstGFX12<"v_wmma_i32_16x16x16_iu4", I32_IU4X16_WMMA_w32, "_w32">;
|
||||
defm V_WMMA_F32_16X16X16_FP8_FP8_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x16_fp8_fp8", F32_FP8BF8_WMMA_w32, "_w32">;
|
||||
defm V_WMMA_F32_16X16X16_FP8_BF8_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x16_fp8_bf8", F32_FP8BF8_WMMA_w32, "_w32">;
|
||||
defm V_WMMA_F32_16X16X16_BF8_FP8_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x16_bf8_fp8", F32_FP8BF8_WMMA_w32, "_w32">;
|
||||
defm V_WMMA_F32_16X16X16_BF8_BF8_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x16_bf8_bf8", F32_FP8BF8_WMMA_w32, "_w32">;
|
||||
defm V_WMMA_I32_16X16X32_IU4_w32 : WMMAInstGFX12<"v_wmma_i32_16x16x32_iu4", I32_IU4X32_WMMA_w32, "_w32">;
|
||||
|
||||
// SWMMAC: each entry yields only a _twoaddr pseudo.
defm V_SWMMAC_F32_16X16X32_F16_w32 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_f16", F32_F16_SWMMAC_w32, "_w32">;
|
||||
defm V_SWMMAC_F32_16X16X32_BF16_w32 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_bf16", F32_BF16_SWMMAC_w32, "_w32">;
|
||||
defm V_SWMMAC_F16_16X16X32_F16_w32 : SWMMACInstGFX12<"v_swmmac_f16_16x16x32_f16", F16_F16_SWMMAC_w32, "_w32">;
|
||||
defm V_SWMMAC_BF16_16X16X32_BF16_w32 : SWMMACInstGFX12<"v_swmmac_bf16_16x16x32_bf16", BF16_BF16_SWMMAC_w32, "_w32">;
|
||||
defm V_SWMMAC_I32_16X16X32_IU8_w32 : SWMMACInstGFX12<"v_swmmac_i32_16x16x32_iu8", I32_IU8_SWMMAC_w32, "_w32">;
|
||||
defm V_SWMMAC_I32_16X16X32_IU4_w32 : SWMMACInstGFX12<"v_swmmac_i32_16x16x32_iu4", I32_IU4X32_SWMMAC_w32, "_w32">;
|
||||
defm V_SWMMAC_I32_16X16X64_IU4_w32 : SWMMACInstGFX12<"v_swmmac_i32_16x16x64_iu4", I32_IU4X64_SWMMAC_w32, "_w32">;
|
||||
defm V_SWMMAC_F32_16X16X32_FP8_FP8_w32 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_fp8_fp8", F32_FP8BF8_SWMMAC_w32, "_w32">;
|
||||
defm V_SWMMAC_F32_16X16X32_FP8_BF8_w32 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_fp8_bf8", F32_FP8BF8_SWMMAC_w32, "_w32">;
|
||||
defm V_SWMMAC_F32_16X16X32_BF8_FP8_w32 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_bf8_fp8", F32_FP8BF8_SWMMAC_w32, "_w32">;
|
||||
defm V_SWMMAC_F32_16X16X32_BF8_BF8_w32 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_bf8_bf8", F32_FP8BF8_SWMMAC_w32, "_w32">;
|
||||
}
|
||||
|
||||
// Wave64 pseudo instructions. Same mnemonics as the wave32 set, but each
// defm uses the _w64 profile and passes "_w64" as the PseudoInstr suffix.
let WaveSizePredicate = isWave64 in {
|
||||
// WMMA: each entry yields a _twoaddr and a _threeaddr pseudo.
defm V_WMMA_F32_16X16X16_F16_w64 : WMMAInstGFX12<"v_wmma_f32_16x16x16_f16", F32_F16_WMMA_w64, "_w64">;
|
||||
defm V_WMMA_F32_16X16X16_BF16_w64 : WMMAInstGFX12<"v_wmma_f32_16x16x16_bf16", F32_BF16_WMMA_w64, "_w64">;
|
||||
defm V_WMMA_F16_16X16X16_F16_w64 : WMMAInstGFX12<"v_wmma_f16_16x16x16_f16", F16_F16_WMMA_w64, "_w64">;
|
||||
defm V_WMMA_BF16_16X16X16_BF16_w64 : WMMAInstGFX12<"v_wmma_bf16_16x16x16_bf16", BF16_BF16_WMMA_w64, "_w64">;
|
||||
defm V_WMMA_I32_16X16X16_IU8_w64 : WMMAInstGFX12<"v_wmma_i32_16x16x16_iu8", I32_IU8_WMMA_w64, "_w64">;
|
||||
defm V_WMMA_I32_16X16X16_IU4_w64 : WMMAInstGFX12<"v_wmma_i32_16x16x16_iu4", I32_IU4X16_WMMA_w64, "_w64">;
|
||||
defm V_WMMA_F32_16X16X16_FP8_FP8_w64 : WMMAInstGFX12<"v_wmma_f32_16x16x16_fp8_fp8", F32_FP8BF8_WMMA_w64, "_w64">;
|
||||
defm V_WMMA_F32_16X16X16_FP8_BF8_w64 : WMMAInstGFX12<"v_wmma_f32_16x16x16_fp8_bf8", F32_FP8BF8_WMMA_w64, "_w64">;
|
||||
defm V_WMMA_F32_16X16X16_BF8_FP8_w64 : WMMAInstGFX12<"v_wmma_f32_16x16x16_bf8_fp8", F32_FP8BF8_WMMA_w64, "_w64">;
|
||||
defm V_WMMA_F32_16X16X16_BF8_BF8_w64 : WMMAInstGFX12<"v_wmma_f32_16x16x16_bf8_bf8", F32_FP8BF8_WMMA_w64, "_w64">;
|
||||
defm V_WMMA_I32_16X16X32_IU4_w64 : WMMAInstGFX12<"v_wmma_i32_16x16x32_iu4", I32_IU4X32_WMMA_w64, "_w64">;
|
||||
|
||||
// SWMMAC: each entry yields only a _twoaddr pseudo.
defm V_SWMMAC_F32_16X16X32_F16_w64 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_f16", F32_F16_SWMMAC_w64, "_w64">;
|
||||
defm V_SWMMAC_F32_16X16X32_BF16_w64 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_bf16", F32_BF16_SWMMAC_w64, "_w64">;
|
||||
defm V_SWMMAC_F16_16X16X32_F16_w64 : SWMMACInstGFX12<"v_swmmac_f16_16x16x32_f16", F16_F16_SWMMAC_w64, "_w64">;
|
||||
defm V_SWMMAC_BF16_16X16X32_BF16_w64 : SWMMACInstGFX12<"v_swmmac_bf16_16x16x32_bf16", BF16_BF16_SWMMAC_w64, "_w64">;
|
||||
defm V_SWMMAC_I32_16X16X32_IU8_w64 : SWMMACInstGFX12<"v_swmmac_i32_16x16x32_iu8", I32_IU8_SWMMAC_w64, "_w64">;
|
||||
defm V_SWMMAC_I32_16X16X32_IU4_w64 : SWMMACInstGFX12<"v_swmmac_i32_16x16x32_iu4", I32_IU4X32_SWMMAC_w64, "_w64">;
|
||||
defm V_SWMMAC_I32_16X16X64_IU4_w64 : SWMMACInstGFX12<"v_swmmac_i32_16x16x64_iu4", I32_IU4X64_SWMMAC_w64, "_w64">;
|
||||
defm V_SWMMAC_F32_16X16X32_FP8_FP8_w64 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_fp8_fp8", F32_FP8BF8_SWMMAC_w64, "_w64">;
|
||||
defm V_SWMMAC_F32_16X16X32_FP8_BF8_w64 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_fp8_bf8", F32_FP8BF8_SWMMAC_w64, "_w64">;
|
||||
defm V_SWMMAC_F32_16X16X32_BF8_FP8_w64 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_bf8_fp8", F32_FP8BF8_SWMMAC_w64, "_w64">;
|
||||
defm V_SWMMAC_F32_16X16X32_BF8_BF8_w64 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_bf8_bf8", F32_FP8BF8_SWMMAC_w64, "_w64">;
|
||||
}
|
||||
|
||||
// IsGFX11OpselIntrinsic: f16_f16 and bf16_bf16 Intrinsics have imm operand that
|
||||
// controls opsel. Used by gfx11, removed in gfx12 (operand must be 0).
|
||||
// Emits the two selection patterns for one WMMA intrinsic.
//   Inst - name prefix of the pseudo; "_twoaddr" / "_threeaddr" is appended.
//   node - the intrinsic (or other pattern operator) being matched.
//   P    - profile supplying DstVT and the input/output pattern dags.
//   IsGFX11OpselIntrinsic - when set, a literal 0 operand is appended to the
//          matched input dag.
multiclass WMMAPat<string Inst, SDPatternOperator node, VOP3PWMMA_Profile P, bit IsGFX11OpselIntrinsic = 0> {
|
||||
// General case: P.WmmaInPat -> the _twoaddr pseudo.
def : GCNPat <(P.DstVT !setdagop(!con(P.WmmaInPat, !if(IsGFX11OpselIntrinsic, (ins 0), (ins))), node)),
|
||||
(P.DstVT !setdagop(P.WmmaOutPat, !cast<Instruction>(Inst#"_twoaddr")))>;
|
||||
// Inline variant: P.WmmaInlineInPat -> the _threeaddr pseudo; the added
// complexity lets it take priority over the general pattern.
let AddedComplexity = 4 in
|
||||
def : GCNPat <(P.DstVT !setdagop(!con(P.WmmaInlineInPat, !if(IsGFX11OpselIntrinsic, (ins 0), (ins))), node)),
|
||||
(P.DstVT !setdagop(P.WmmaInlineOutPat, !cast<Instruction>(Inst#"_threeaddr")))>;
|
||||
}
|
||||
|
||||
// Selection pattern for one SWMMAC intrinsic: matches P.SwmmacInPat with its
// operator replaced by `node` and emits P.SwmmacOutPat with its operator
// replaced by `Inst`, both typed as P.DstVT.
class SWMMACPat<Instruction Inst, SDPatternOperator node, VOP3PWMMA_Profile P> :
|
||||
GCNPat <(P.DstVT !setdagop(P.SwmmacInPat, node)),
|
||||
(P.DstVT !setdagop(P.SwmmacOutPat, Inst))>;
|
||||
|
||||
// Same pattern as SWMMACPat, but with WaveSizePredicate fixed to isWave64 in
// the class body.
// NOTE(review): no use of this class is visible in this part of the file; the
// wave64 patterns below use SWMMACPat inside a `let WaveSizePredicate` scope.
class SWMMACPat_w64<Instruction Inst, SDPatternOperator node, VOP3PWMMA_Profile P> :
|
||||
GCNPat <(P.DstVT !setdagop(P.SwmmacInPat, node)),
|
||||
(P.DstVT !setdagop(P.SwmmacOutPat, Inst))>{
|
||||
let WaveSizePredicate = isWave64;
|
||||
}
|
||||
|
||||
// Selection patterns for wave32 on GFX12+: each intrinsic is matched to its
// _w32 pseudo using the corresponding _w32 profile.
let WaveSizePredicate = isWave32, SubtargetPredicate = isGFX12Plus in {
|
||||
// The trailing ",1" on the f16_f16 and bf16_bf16 entries sets
// IsGFX11OpselIntrinsic.
defm : WMMAPat<"V_WMMA_F32_16X16X16_F16_w32", int_amdgcn_wmma_f32_16x16x16_f16, F32_F16_WMMA_w32>;
|
||||
defm : WMMAPat<"V_WMMA_F32_16X16X16_BF16_w32", int_amdgcn_wmma_f32_16x16x16_bf16, F32_BF16_WMMA_w32>;
|
||||
defm : WMMAPat<"V_WMMA_F16_16X16X16_F16_w32", int_amdgcn_wmma_f16_16x16x16_f16, F16_F16_WMMA_w32,1>;
|
||||
defm : WMMAPat<"V_WMMA_BF16_16X16X16_BF16_w32", int_amdgcn_wmma_bf16_16x16x16_bf16, BF16_BF16_WMMA_w32,1>;
|
||||
defm : WMMAPat<"V_WMMA_I32_16X16X16_IU8_w32", int_amdgcn_wmma_i32_16x16x16_iu8, I32_IU8_WMMA_w32>;
|
||||
defm : WMMAPat<"V_WMMA_I32_16X16X16_IU4_w32", int_amdgcn_wmma_i32_16x16x16_iu4, I32_IU4X16_WMMA_w32>;
|
||||
defm : WMMAPat<"V_WMMA_F32_16X16X16_FP8_FP8_w32", int_amdgcn_wmma_f32_16x16x16_fp8_fp8, F32_FP8BF8_WMMA_w32>;
|
||||
defm : WMMAPat<"V_WMMA_F32_16X16X16_FP8_BF8_w32", int_amdgcn_wmma_f32_16x16x16_fp8_bf8, F32_FP8BF8_WMMA_w32>;
|
||||
defm : WMMAPat<"V_WMMA_F32_16X16X16_BF8_FP8_w32", int_amdgcn_wmma_f32_16x16x16_bf8_fp8, F32_FP8BF8_WMMA_w32>;
|
||||
defm : WMMAPat<"V_WMMA_F32_16X16X16_BF8_BF8_w32", int_amdgcn_wmma_f32_16x16x16_bf8_bf8, F32_FP8BF8_WMMA_w32>;
|
||||
defm : WMMAPat<"V_WMMA_I32_16X16X32_IU4_w32", int_amdgcn_wmma_i32_16x16x32_iu4, I32_IU4X32_WMMA_w32>;
|
||||
|
||||
def : SWMMACPat<V_SWMMAC_F32_16X16X32_F16_w32_twoaddr, int_amdgcn_swmmac_f32_16x16x32_f16, F32_F16_SWMMAC_w32>;
|
||||
def : SWMMACPat<V_SWMMAC_F32_16X16X32_BF16_w32_twoaddr, int_amdgcn_swmmac_f32_16x16x32_bf16, F32_BF16_SWMMAC_w32>;
|
||||
def : SWMMACPat<V_SWMMAC_F16_16X16X32_F16_w32_twoaddr, int_amdgcn_swmmac_f16_16x16x32_f16, F16_F16_SWMMAC_w32>;
|
||||
def : SWMMACPat<V_SWMMAC_BF16_16X16X32_BF16_w32_twoaddr, int_amdgcn_swmmac_bf16_16x16x32_bf16, BF16_BF16_SWMMAC_w32>;
|
||||
def : SWMMACPat<V_SWMMAC_I32_16X16X32_IU8_w32_twoaddr, int_amdgcn_swmmac_i32_16x16x32_iu8, I32_IU8_SWMMAC_w32>;
|
||||
def : SWMMACPat<V_SWMMAC_I32_16X16X32_IU4_w32_twoaddr, int_amdgcn_swmmac_i32_16x16x32_iu4, I32_IU4X32_SWMMAC_w32>;
|
||||
// Written out by hand rather than via SWMMACPat; the dags have the same
// shape as the SWMMACPat expansion for this instruction/intrinsic/profile.
def : GCNPat <(I32_IU4X64_SWMMAC_w32.DstVT !setdagop(I32_IU4X64_SWMMAC_w32.SwmmacInPat, int_amdgcn_swmmac_i32_16x16x64_iu4)),
|
||||
(I32_IU4X64_SWMMAC_w32.DstVT !setdagop(I32_IU4X64_SWMMAC_w32.SwmmacOutPat, V_SWMMAC_I32_16X16X64_IU4_w32_twoaddr))>;
|
||||
def : SWMMACPat<V_SWMMAC_F32_16X16X32_FP8_FP8_w32_twoaddr, int_amdgcn_swmmac_f32_16x16x32_fp8_fp8, F32_FP8BF8_SWMMAC_w32>;
|
||||
def : SWMMACPat<V_SWMMAC_F32_16X16X32_FP8_BF8_w32_twoaddr, int_amdgcn_swmmac_f32_16x16x32_fp8_bf8, F32_FP8BF8_SWMMAC_w32>;
|
||||
def : SWMMACPat<V_SWMMAC_F32_16X16X32_BF8_FP8_w32_twoaddr, int_amdgcn_swmmac_f32_16x16x32_bf8_fp8, F32_FP8BF8_SWMMAC_w32>;
|
||||
def : SWMMACPat<V_SWMMAC_F32_16X16X32_BF8_BF8_w32_twoaddr, int_amdgcn_swmmac_f32_16x16x32_bf8_bf8, F32_FP8BF8_SWMMAC_w32>;
|
||||
}
|
||||
|
||||
// Selection patterns for wave64 on GFX12+: the same intrinsics as the wave32
// set, matched to the _w64 pseudos with the _w64 profiles.
let WaveSizePredicate = isWave64, SubtargetPredicate = isGFX12Plus in {
|
||||
// The trailing ",1" on the f16_f16 and bf16_bf16 entries sets
// IsGFX11OpselIntrinsic.
defm : WMMAPat<"V_WMMA_F32_16X16X16_F16_w64", int_amdgcn_wmma_f32_16x16x16_f16, F32_F16_WMMA_w64>;
|
||||
defm : WMMAPat<"V_WMMA_F32_16X16X16_BF16_w64", int_amdgcn_wmma_f32_16x16x16_bf16, F32_BF16_WMMA_w64>;
|
||||
defm : WMMAPat<"V_WMMA_F16_16X16X16_F16_w64", int_amdgcn_wmma_f16_16x16x16_f16, F16_F16_WMMA_w64,1>;
|
||||
defm : WMMAPat<"V_WMMA_BF16_16X16X16_BF16_w64", int_amdgcn_wmma_bf16_16x16x16_bf16, BF16_BF16_WMMA_w64,1>;
|
||||
defm : WMMAPat<"V_WMMA_I32_16X16X16_IU8_w64", int_amdgcn_wmma_i32_16x16x16_iu8, I32_IU8_WMMA_w64>;
|
||||
defm : WMMAPat<"V_WMMA_I32_16X16X16_IU4_w64", int_amdgcn_wmma_i32_16x16x16_iu4, I32_IU4X16_WMMA_w64>;
|
||||
defm : WMMAPat<"V_WMMA_F32_16X16X16_FP8_FP8_w64", int_amdgcn_wmma_f32_16x16x16_fp8_fp8, F32_FP8BF8_WMMA_w64>;
|
||||
defm : WMMAPat<"V_WMMA_F32_16X16X16_FP8_BF8_w64", int_amdgcn_wmma_f32_16x16x16_fp8_bf8, F32_FP8BF8_WMMA_w64>;
|
||||
defm : WMMAPat<"V_WMMA_F32_16X16X16_BF8_FP8_w64", int_amdgcn_wmma_f32_16x16x16_bf8_fp8, F32_FP8BF8_WMMA_w64>;
|
||||
defm : WMMAPat<"V_WMMA_F32_16X16X16_BF8_BF8_w64", int_amdgcn_wmma_f32_16x16x16_bf8_bf8, F32_FP8BF8_WMMA_w64>;
|
||||
defm : WMMAPat<"V_WMMA_I32_16X16X32_IU4_w64", int_amdgcn_wmma_i32_16x16x32_iu4, I32_IU4X32_WMMA_w64>;
|
||||
|
||||
def : SWMMACPat<V_SWMMAC_F32_16X16X32_F16_w64_twoaddr, int_amdgcn_swmmac_f32_16x16x32_f16, F32_F16_SWMMAC_w64>;
|
||||
def : SWMMACPat<V_SWMMAC_F32_16X16X32_BF16_w64_twoaddr, int_amdgcn_swmmac_f32_16x16x32_bf16, F32_BF16_SWMMAC_w64>;
|
||||
def : SWMMACPat<V_SWMMAC_F16_16X16X32_F16_w64_twoaddr, int_amdgcn_swmmac_f16_16x16x32_f16, F16_F16_SWMMAC_w64>;
|
||||
def : SWMMACPat<V_SWMMAC_BF16_16X16X32_BF16_w64_twoaddr, int_amdgcn_swmmac_bf16_16x16x32_bf16, BF16_BF16_SWMMAC_w64>;
|
||||
def : SWMMACPat<V_SWMMAC_I32_16X16X32_IU8_w64_twoaddr, int_amdgcn_swmmac_i32_16x16x32_iu8, I32_IU8_SWMMAC_w64>;
|
||||
def : SWMMACPat<V_SWMMAC_I32_16X16X32_IU4_w64_twoaddr, int_amdgcn_swmmac_i32_16x16x32_iu4, I32_IU4X32_SWMMAC_w64>;
|
||||
def : SWMMACPat<V_SWMMAC_I32_16X16X64_IU4_w64_twoaddr, int_amdgcn_swmmac_i32_16x16x64_iu4, I32_IU4X64_SWMMAC_w64>;
|
||||
def : SWMMACPat<V_SWMMAC_F32_16X16X32_FP8_FP8_w64_twoaddr, int_amdgcn_swmmac_f32_16x16x32_fp8_fp8, F32_FP8BF8_SWMMAC_w64>;
|
||||
def : SWMMACPat<V_SWMMAC_F32_16X16X32_FP8_BF8_w64_twoaddr, int_amdgcn_swmmac_f32_16x16x32_fp8_bf8, F32_FP8BF8_SWMMAC_w64>;
|
||||
def : SWMMACPat<V_SWMMAC_F32_16X16X32_BF8_FP8_w64_twoaddr, int_amdgcn_swmmac_f32_16x16x32_bf8_fp8, F32_FP8BF8_SWMMAC_w64>;
|
||||
def : SWMMACPat<V_SWMMAC_F32_16X16X32_BF8_BF8_w64_twoaddr, int_amdgcn_swmmac_f32_16x16x32_bf8_bf8, F32_FP8BF8_SWMMAC_w64>;
|
||||
}
|
||||
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// Begin Real Encodings
|
||||
//===----------------------------------------------------------------------===//
|
||||
@ -1005,6 +1400,99 @@ multiclass VOP3P_Real_Base<GFXGen Gen, bits<7> op, string backing_ps_name = NAME
|
||||
VOP3Pe_gfx11_gfx12<op, !cast<VOP3P_Pseudo>(backing_ps_name).Pfl>;
|
||||
}
|
||||
|
||||
// Encoding overrides for WMMA/SWMMAC real instructions, layered on top of
// VOP3Pe_gfx11_gfx12<op, P>. WMMAP supplies the per-instruction switches read
// below: IndexType, NegLo01, NegLo2, NegHi01, NegHi2 and IsIU.
class VOP3PeWmma<bits<7> op, VOPProfile P, VOP3PWMMA_Profile WMMAP>
|
||||
: VOP3Pe_gfx11_gfx12<op, P>{
|
||||
// opsel
|
||||
// Bits 11-12 carry the index key: nothing for IndexType 0, both bits of
// index_key_8bit for IndexType 8, the single index_key_16bit bit (in bit 11)
// for IndexType 16. Bit 13 is always 0.
let Inst{11} = !cond(!eq(WMMAP.IndexType, 0) : 0,
|
||||
!eq(WMMAP.IndexType, 8) : index_key_8bit{0},
|
||||
!eq(WMMAP.IndexType, 16) : index_key_16bit{0});
|
||||
let Inst{12} = !if(!eq(WMMAP.IndexType, 8), index_key_8bit{1}, 0);
|
||||
let Inst{13} = 0;
|
||||
// opsel_hi
|
||||
// All three bits are hard-wired to 1.
let Inst{59} = 1;
|
||||
let Inst{60} = 1;
|
||||
let Inst{14} = 1;
|
||||
// neg_lo
|
||||
// Bit 0 of each source's modifiers, encoded only when the profile enables it;
// src0 and src1 share the NegLo01 switch.
let Inst{61} = !if(WMMAP.NegLo01, src0_modifiers{0}, 0);
|
||||
let Inst{62} = !if(WMMAP.NegLo01, src1_modifiers{0}, 0);
|
||||
let Inst{63} = !if(WMMAP.NegLo2, src2_modifiers{0}, 0);
|
||||
// neg_hi
|
||||
// Bit 1 of each source's modifiers, gated the same way by NegHi01 / NegHi2.
let Inst{8} = !if(WMMAP.NegHi01, src0_modifiers{1}, 0);
|
||||
let Inst{9} = !if(WMMAP.NegHi01, src1_modifiers{1}, 0);
|
||||
let Inst{10} = !if(WMMAP.NegHi2, src2_modifiers{1}, 0);
|
||||
// clamp
|
||||
// Encoded only for profiles with IsIU set; otherwise the bit is 0.
let Inst{15} = !if(WMMAP.IsIU, clamp{0}, 0);
|
||||
}
|
||||
|
||||
// Emits one real instruction, named by Gen.Suffix, that combines
// VOP3P_Real_Gen for the backing pseudo with the VOP3PeWmma encoding.
//   Gen             - target generation descriptor (provides Suffix).
//   op              - 7-bit opcode.
//   WMMAP           - WMMA profile forwarded to VOP3PeWmma.
//   backing_ps_name - name of the pseudo to wrap; defaults to NAME.
//   asmName         - assembly name; defaults to the Mnemonic of the pseudo
//                     called NAME.
multiclass VOP3P_WMMA_Real_Base<GFXGen Gen, bits<7> op, VOP3PWMMA_Profile WMMAP,
|
||||
string backing_ps_name = NAME,
|
||||
string asmName = !cast<VOP3P_Pseudo>(NAME).Mnemonic> {
|
||||
def Gen.Suffix :
|
||||
VOP3P_Real_Gen<!cast<VOP3P_Pseudo>(backing_ps_name), Gen, asmName>,
|
||||
VOP3PeWmma<op, !cast<VOP3P_Pseudo>(backing_ps_name).Pfl, WMMAP>;
|
||||
}
|
||||
|
||||
// Wave32 real encoding for one WMMA/SWMMAC instruction: instantiates
// VOP3P_WMMA_Real_Base under the _twoaddr name for GFX12Gen, with the wave32
// predicate and the "GFX12" decoder namespace.
multiclass VOP3P_Real_WMMA_gfx12 <bits<7> op, VOP3PWMMA_Profile WMMAP> {
|
||||
let WaveSizePredicate = isWave32, DecoderNamespace = "GFX12" in {
|
||||
defm _twoaddr : VOP3P_WMMA_Real_Base <GFX12Gen, op, WMMAP>;
|
||||
}
|
||||
}
|
||||
|
||||
// Wave64 counterpart of VOP3P_Real_WMMA_gfx12: same instantiation, but with
// the wave64 predicate and a separate "WMMAGFX12" decoder namespace.
multiclass VOP3P_Real_WMMA_gfx12w64 <bits<7> op, VOP3PWMMA_Profile WMMAP> {
|
||||
let WaveSizePredicate = isWave64, DecoderNamespace = "WMMAGFX12" in {
|
||||
defm _twoaddr : VOP3P_WMMA_Real_Base <GFX12Gen, op, WMMAP>;
|
||||
}
|
||||
}
|
||||
|
||||
// Real encodings, WMMA wave32: opcodes 0x040 through 0x04a in list order.
defm V_WMMA_F32_16X16X16_F16_w32 : VOP3P_Real_WMMA_gfx12 <0x040, F32_F16_WMMA_w32>;
|
||||
defm V_WMMA_F32_16X16X16_BF16_w32 : VOP3P_Real_WMMA_gfx12 <0x041, F32_BF16_WMMA_w32>;
|
||||
defm V_WMMA_F16_16X16X16_F16_w32 : VOP3P_Real_WMMA_gfx12 <0x042, F16_F16_WMMA_w32>;
|
||||
defm V_WMMA_BF16_16X16X16_BF16_w32 : VOP3P_Real_WMMA_gfx12 <0x043, BF16_BF16_WMMA_w32>;
|
||||
defm V_WMMA_I32_16X16X16_IU8_w32 : VOP3P_Real_WMMA_gfx12 <0x044, I32_IU8_WMMA_w32>;
|
||||
defm V_WMMA_I32_16X16X16_IU4_w32 : VOP3P_Real_WMMA_gfx12 <0x045, I32_IU4X16_WMMA_w32>;
|
||||
defm V_WMMA_F32_16X16X16_FP8_FP8_w32 : VOP3P_Real_WMMA_gfx12 <0x046, F32_FP8BF8_WMMA_w32>;
|
||||
defm V_WMMA_F32_16X16X16_FP8_BF8_w32 : VOP3P_Real_WMMA_gfx12 <0x047, F32_FP8BF8_WMMA_w32>;
|
||||
defm V_WMMA_F32_16X16X16_BF8_FP8_w32 : VOP3P_Real_WMMA_gfx12 <0x048, F32_FP8BF8_WMMA_w32>;
|
||||
defm V_WMMA_F32_16X16X16_BF8_BF8_w32 : VOP3P_Real_WMMA_gfx12 <0x049, F32_FP8BF8_WMMA_w32>;
|
||||
defm V_WMMA_I32_16X16X32_IU4_w32 : VOP3P_Real_WMMA_gfx12 <0x04a, I32_IU4X32_WMMA_w32>;
|
||||
|
||||
// Real encodings, WMMA wave64: the same opcodes (0x040 through 0x04a) as the
// wave32 list, using the _w64 profiles and the wave64 multiclass.
defm V_WMMA_F32_16X16X16_F16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x040, F32_F16_WMMA_w64>;
|
||||
defm V_WMMA_F32_16X16X16_BF16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x041, F32_BF16_WMMA_w64>;
|
||||
defm V_WMMA_F16_16X16X16_F16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x042, F16_F16_WMMA_w64>;
|
||||
defm V_WMMA_BF16_16X16X16_BF16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x043, BF16_BF16_WMMA_w64>;
|
||||
defm V_WMMA_I32_16X16X16_IU8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x044, I32_IU8_WMMA_w64>;
|
||||
defm V_WMMA_I32_16X16X16_IU4_w64 : VOP3P_Real_WMMA_gfx12w64 <0x045, I32_IU4X16_WMMA_w64>;
|
||||
defm V_WMMA_F32_16X16X16_FP8_FP8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x046, F32_FP8BF8_WMMA_w64>;
|
||||
defm V_WMMA_F32_16X16X16_FP8_BF8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x047, F32_FP8BF8_WMMA_w64>;
|
||||
defm V_WMMA_F32_16X16X16_BF8_FP8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x048, F32_FP8BF8_WMMA_w64>;
|
||||
defm V_WMMA_F32_16X16X16_BF8_BF8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x049, F32_FP8BF8_WMMA_w64>;
|
||||
defm V_WMMA_I32_16X16X32_IU4_w64 : VOP3P_Real_WMMA_gfx12w64 <0x04a, I32_IU4X32_WMMA_w64>;
|
||||
|
||||
|
||||
// Real encodings, SWMMAC wave32: opcodes 0x050 through 0x05a in list order.
defm V_SWMMAC_F32_16X16X32_F16_w32 : VOP3P_Real_WMMA_gfx12 <0x050, F32_F16_SWMMAC_w32>;
|
||||
defm V_SWMMAC_F32_16X16X32_BF16_w32 : VOP3P_Real_WMMA_gfx12 <0x051, F32_BF16_SWMMAC_w32>;
|
||||
defm V_SWMMAC_F16_16X16X32_F16_w32 : VOP3P_Real_WMMA_gfx12 <0x052, F16_F16_SWMMAC_w32>;
|
||||
defm V_SWMMAC_BF16_16X16X32_BF16_w32 : VOP3P_Real_WMMA_gfx12 <0x053, BF16_BF16_SWMMAC_w32>;
|
||||
defm V_SWMMAC_I32_16X16X32_IU8_w32 : VOP3P_Real_WMMA_gfx12 <0x054, I32_IU8_SWMMAC_w32>;
|
||||
defm V_SWMMAC_I32_16X16X32_IU4_w32 : VOP3P_Real_WMMA_gfx12 <0x055, I32_IU4X32_SWMMAC_w32>;
|
||||
defm V_SWMMAC_I32_16X16X64_IU4_w32 : VOP3P_Real_WMMA_gfx12 <0x056, I32_IU4X64_SWMMAC_w32>;
|
||||
defm V_SWMMAC_F32_16X16X32_FP8_FP8_w32 : VOP3P_Real_WMMA_gfx12 <0x057, F32_FP8BF8_SWMMAC_w32>;
|
||||
defm V_SWMMAC_F32_16X16X32_FP8_BF8_w32 : VOP3P_Real_WMMA_gfx12 <0x058, F32_FP8BF8_SWMMAC_w32>;
|
||||
defm V_SWMMAC_F32_16X16X32_BF8_FP8_w32 : VOP3P_Real_WMMA_gfx12 <0x059, F32_FP8BF8_SWMMAC_w32>;
|
||||
defm V_SWMMAC_F32_16X16X32_BF8_BF8_w32 : VOP3P_Real_WMMA_gfx12 <0x05a, F32_FP8BF8_SWMMAC_w32>;
|
||||
|
||||
// Real encodings, SWMMAC wave64: the same opcodes (0x050 through 0x05a) as
// the wave32 list, using the _w64 profiles and the wave64 multiclass.
defm V_SWMMAC_F32_16X16X32_F16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x050, F32_F16_SWMMAC_w64>;
|
||||
defm V_SWMMAC_F32_16X16X32_BF16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x051, F32_BF16_SWMMAC_w64>;
|
||||
defm V_SWMMAC_F16_16X16X32_F16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x052, F16_F16_SWMMAC_w64>;
|
||||
defm V_SWMMAC_BF16_16X16X32_BF16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x053, BF16_BF16_SWMMAC_w64>;
|
||||
defm V_SWMMAC_I32_16X16X32_IU8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x054, I32_IU8_SWMMAC_w64>;
|
||||
defm V_SWMMAC_I32_16X16X32_IU4_w64 : VOP3P_Real_WMMA_gfx12w64 <0x055, I32_IU4X32_SWMMAC_w64>;
|
||||
defm V_SWMMAC_I32_16X16X64_IU4_w64 : VOP3P_Real_WMMA_gfx12w64 <0x056, I32_IU4X64_SWMMAC_w64>;
|
||||
defm V_SWMMAC_F32_16X16X32_FP8_FP8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x057, F32_FP8BF8_SWMMAC_w64>;
|
||||
defm V_SWMMAC_F32_16X16X32_FP8_BF8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x058, F32_FP8BF8_SWMMAC_w64>;
|
||||
defm V_SWMMAC_F32_16X16X32_BF8_FP8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x059, F32_FP8BF8_SWMMAC_w64>;
|
||||
defm V_SWMMAC_F32_16X16X32_BF8_BF8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x05a, F32_FP8BF8_SWMMAC_w64>;
|
||||
|
||||
multiclass VOP3P_Real_with_name<GFXGen Gen, bits<7> op,
|
||||
string backing_ps_name = NAME,
|
||||
string asmName = !cast<VOP3P_Pseudo>(NAME).Mnemonic> {
|
||||
|
@ -124,6 +124,7 @@ class VOP3_Pseudo <string opName, VOPProfile P, list<dag> pattern = [],
|
||||
let IsPacked = P.IsPacked;
|
||||
let IsMAI = P.IsMAI;
|
||||
let IsWMMA = P.IsWMMA;
|
||||
let IsSWMMAC = P.IsSWMMAC;
|
||||
|
||||
let AsmOperands = !if(isVop3OpSel,
|
||||
P.AsmVOP3OpSel,
|
||||
@ -378,6 +379,8 @@ class VOP3Pe <bits<7> op, VOPProfile P> : Enc64 {
|
||||
bits<4> src2_modifiers;
|
||||
bits<9> src2;
|
||||
bits<1> clamp;
|
||||
bits<2> index_key_8bit;
|
||||
bits<1> index_key_16bit;
|
||||
|
||||
let Inst{7-0} = vdst;
|
||||
let Inst{8} = !if(P.HasSrc0Mods, src0_modifiers{1}, 0); // neg_hi src0
|
||||
|
@ -63,52 +63,140 @@ define amdgpu_kernel void @writelane(ptr addrspace(1) %out) #0 {
|
||||
ret void
|
||||
}
|
||||
|
||||
; Divergence check for the wmma.f32.16x16x16.f16 call; the result is stored
; to %out.
; NOTE(review): two CHECK lines and two %tmp0 calls appear here (suffix .v8f32
; and .v8f32.v16f16); this looks like both sides of a diff hunk - confirm
; which pair is current.
; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32(<16 x half> %A, <16 x half> %B, <8 x float> %C)
|
||||
; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v16f16(<16 x half> %A, <16 x half> %B, <8 x float> %C)
|
||||
define amdgpu_kernel void @wmma_f32_16x16x16_f16(<16 x half> %A, <16 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
%tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32(<16 x half> %A, <16 x half> %B, <8 x float> %C)
|
||||
%tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v16f16(<16 x half> %A, <16 x half> %B, <8 x float> %C)
|
||||
store <8 x float> %tmp0, ptr addrspace(1) %out, align 32
|
||||
ret void
|
||||
}
|
||||
|
||||
; Divergence check for the wmma.f32.16x16x16.bf16 call; the result is stored
; to %out.
; NOTE(review): two CHECK lines and two %tmp0 calls appear here (suffix .v8f32
; and .v8f32.v16i16); this looks like both sides of a diff hunk - confirm
; which pair is current.
; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32(<16 x i16> %A, <16 x i16> %B, <8 x float> %C)
|
||||
; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v16i16(<16 x i16> %A, <16 x i16> %B, <8 x float> %C)
|
||||
define amdgpu_kernel void @wmma_f32_16x16x16_ibf16(<16 x i16> %A, <16 x i16> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
%tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32(<16 x i16> %A, <16 x i16> %B, <8 x float> %C)
|
||||
%tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v16i16(<16 x i16> %A, <16 x i16> %B, <8 x float> %C)
|
||||
store <8 x float> %tmp0, ptr addrspace(1) %out, align 32
|
||||
ret void
|
||||
}
|
||||
|
||||
; Divergence check for the wmma.f16.16x16x16.f16 call (last operand is the
; constant i1 false); the result is stored to %out.
; NOTE(review): two CHECK lines and two %tmp0 calls appear here (suffix
; .v16f16 and .v16f16.v16f16); this looks like both sides of a diff hunk -
; confirm which pair is current.
; CHECK: DIVERGENT: %tmp0 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16(<16 x half> %A, <16 x half> %B, <16 x half> %C, i1 false)
|
||||
; CHECK: DIVERGENT: %tmp0 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> %A, <16 x half> %B, <16 x half> %C, i1 false)
|
||||
define amdgpu_kernel void @wmma_f16_16x16x16_f16(<16 x half> %A, <16 x half> %B, <16 x half> %C, ptr addrspace(1) %out) {
|
||||
bb:
|
||||
%tmp0 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16(<16 x half> %A, <16 x half> %B, <16 x half> %C, i1 false)
|
||||
%tmp0 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> %A, <16 x half> %B, <16 x half> %C, i1 false)
|
||||
store <16 x half> %tmp0, ptr addrspace(1) %out, align 32
|
||||
ret void
|
||||
}
|
||||
|
||||
; Divergence check for the wmma.bf16.16x16x16.bf16 call (last operand is the
; constant i1 false); the result is stored to %out.
; NOTE(review): two CHECK lines and two %tmp0 calls appear here (suffix
; .v16i16 and .v16i16.v16i16); this looks like both sides of a diff hunk -
; confirm which pair is current.
; CHECK: DIVERGENT: %tmp0 = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v16i16(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, i1 false)
|
||||
; CHECK: DIVERGENT: %tmp0 = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v16i16.v16i16(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, i1 false)
|
||||
define amdgpu_kernel void @wmma_f16_16x16x16_bf16(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, ptr addrspace(1) %out) {
|
||||
bb:
|
||||
%tmp0 = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v16i16(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, i1 false)
|
||||
%tmp0 = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v16i16.v16i16(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, i1 false)
|
||||
store <16 x i16> %tmp0, ptr addrspace(1) %out, align 32
|
||||
ret void
|
||||
}
|
||||
|
||||
; Divergence check for the wmma.i32.16x16x16.iu8 call (all three i1 operands
; are the constant false); the result is stored to %out.
; NOTE(review): two CHECK lines and two %tmp0 calls appear here (suffix .v8i32
; and .v8i32.v4i32); this looks like both sides of a diff hunk - confirm
; which pair is current.
; CHECK: DIVERGENT: %tmp0 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32(i1 false, <4 x i32> %A, i1 false, <4 x i32> %B, <8 x i32> %C, i1 false)
|
||||
; CHECK: DIVERGENT: %tmp0 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v4i32(i1 false, <4 x i32> %A, i1 false, <4 x i32> %B, <8 x i32> %C, i1 false)
|
||||
define amdgpu_kernel void @wmma_i32_16x16x16_ui8(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
|
||||
bb:
|
||||
%tmp0 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32(i1 false, <4 x i32> %A, i1 false, <4 x i32> %B, <8 x i32> %C, i1 false)
|
||||
%tmp0 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v4i32(i1 false, <4 x i32> %A, i1 false, <4 x i32> %B, <8 x i32> %C, i1 false)
|
||||
store <8 x i32> %tmp0, ptr addrspace(1) %out, align 32
|
||||
ret void
|
||||
}
|
||||
|
||||
; Divergence check for the wmma.i32.16x16x16.iu4 call (all three i1 operands
; are the constant false); the result is stored to %out.
; NOTE(review): two CHECK lines and two %tmp0 calls appear here (suffix .v8i32
; and .v8i32.v2i32); this looks like both sides of a diff hunk - confirm
; which pair is current.
; CHECK: DIVERGENT: %tmp0 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32(i1 false, <2 x i32> %A, i1 false, <2 x i32> %B, <8 x i32> %C, i1 false)
|
||||
; CHECK: DIVERGENT: %tmp0 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.v2i32(i1 false, <2 x i32> %A, i1 false, <2 x i32> %B, <8 x i32> %C, i1 false)
|
||||
define amdgpu_kernel void @wmma_i32_16x16x16_ui4(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
|
||||
bb:
|
||||
%tmp0 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32(i1 false, <2 x i32> %A, i1 false, <2 x i32> %B, <8 x i32> %C, i1 false)
|
||||
%tmp0 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.v2i32(i1 false, <2 x i32> %A, i1 false, <2 x i32> %B, <8 x i32> %C, i1 false)
|
||||
store <8 x i32> %tmp0, ptr addrspace(1) %out, align 32
|
||||
ret void
|
||||
}
|
||||
|
||||
; Divergence check: the swmmac.f32.16x16x32.f16 call (A, B, C plus an i16
; index) is expected to be reported DIVERGENT. The result is stored to %out.
; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.i16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index)
|
||||
define amdgpu_kernel void @swmmac_f32_16x16x32_f16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
bb:
|
||||
%tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index)
|
||||
store <8 x float> %tmp0, ptr addrspace(1) %out, align 32
|
||||
ret void
|
||||
}
|
||||
|
||||
; Divergence check: the swmmac.f32.16x16x32.bf16 call (A, B, C plus an i16
; index) is expected to be reported DIVERGENT. The result is stored to %out.
; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.i16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index)
|
||||
define amdgpu_kernel void @swmmac_f32_16x16x32_bf16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
bb:
|
||||
%tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index)
|
||||
store <8 x float> %tmp0, ptr addrspace(1) %out, align 32
|
||||
ret void
|
||||
}
|
||||
|
||||
; Divergence check: the swmmac.f16.16x16x32.f16 call (A, B, C plus an i16
; index) is expected to be reported DIVERGENT. The result is stored to %out.
; CHECK: DIVERGENT: %tmp0 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index)
|
||||
define amdgpu_kernel void @swmmac_f16_16x16x32_f16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
bb:
|
||||
%tmp0 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index)
|
||||
store <8 x half> %tmp0, ptr addrspace(1) %out, align 32
|
||||
ret void
|
||||
}
|
||||
|
||||
; Divergence check: the swmmac.bf16.16x16x32.bf16 call (A, B, C plus an i16
; index) is expected to be reported DIVERGENT. The result is stored to %out.
; CHECK: DIVERGENT: %tmp0 = call <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.i16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index)
|
||||
define amdgpu_kernel void @swmmac_bf16_16x16x32_bf16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
bb:
|
||||
%tmp0 = call <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index)
|
||||
store <8 x i16> %tmp0, ptr addrspace(1) %out, align 32
|
||||
ret void
|
||||
}
|
||||
|
||||
; Divergence check: the swmmac.i32.16x16x32.iu8 call (all i1 operands are the
; constant false) is expected to be reported DIVERGENT. The result is stored
; to %out.
; CHECK: DIVERGENT: %tmp0 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.i16(i1 false, <2 x i32> %A, i1 false, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 false)
|
||||
define amdgpu_kernel void @swmmac_i32_16x16x32_iu8(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
bb:
|
||||
%tmp0 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8(i1 false, <2 x i32> %A, i1 false, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 false)
|
||||
store <8 x i32> %tmp0, ptr addrspace(1) %out, align 32
|
||||
ret void
|
||||
}
|
||||
|
||||
; Divergence check: the swmmac.i32.16x16x32.iu4 call (scalar i32 A, all i1
; operands constant false) is expected to be reported DIVERGENT. The result
; is stored to %out.
; CHECK: DIVERGENT: %tmp0 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.i16(i1 false, i32 %A, i1 false, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 false)
|
||||
define amdgpu_kernel void @swmmac_i32_16x16x32_iu4(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
bb:
|
||||
%tmp0 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4(i1 false, i32 %A, i1 false, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 false)
|
||||
store <8 x i32> %tmp0, ptr addrspace(1) %out, align 32
|
||||
ret void
|
||||
}
|
||||
|
||||
; Divergence check: the swmmac.i32.16x16x64.iu4 call (all i1 operands are the
; constant false) is expected to be reported DIVERGENT. The result is stored
; to %out.
; CHECK: DIVERGENT: %tmp0 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.i16(i1 false, <2 x i32> %A, i1 false, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 false)
|
||||
define amdgpu_kernel void @swmmac_i32_16x16x64_iu4(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
bb:
|
||||
%tmp0 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4(i1 false, <2 x i32> %A, i1 false, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 false)
|
||||
store <8 x i32> %tmp0, ptr addrspace(1) %out, align 32
|
||||
ret void
|
||||
}
|
||||
|
||||
; Divergence check: the swmmac.f32.16x16x32.fp8.fp8 call (A, B, C plus an i16
; index) is expected to be reported DIVERGENT. The result is stored to %out.
; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
|
||||
define amdgpu_kernel void @swmmac_f32_16x16x32_fp8.fp8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
bb:
|
||||
%tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
|
||||
store <8 x float> %tmp0, ptr addrspace(1) %out, align 32
|
||||
ret void
|
||||
}
|
||||
|
||||
; Divergence check: the swmmac.f32.16x16x32.fp8.bf8 call (A, B, C plus an i16
; index) is expected to be reported DIVERGENT. The result is stored to %out.
; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
|
||||
define amdgpu_kernel void @swmmac_f32_16x16x32_fp8.bf8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
bb:
|
||||
%tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
|
||||
store <8 x float> %tmp0, ptr addrspace(1) %out, align 32
|
||||
ret void
|
||||
}
|
||||
|
||||
; Divergence check: the swmmac.f32.16x16x32.bf8.fp8 call (A, B, C plus an i16
; index) is expected to be reported DIVERGENT. The result is stored to %out.
; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
|
||||
define amdgpu_kernel void @swmmac_f32_16x16x32_bf8.fp8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
bb:
|
||||
%tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
|
||||
store <8 x float> %tmp0, ptr addrspace(1) %out, align 32
|
||||
ret void
|
||||
}
|
||||
|
||||
; Divergence check: the swmmac.f32.16x16x32.bf8.bf8 call (A, B, C plus an i16
; index) is expected to be reported DIVERGENT. The result is stored to %out.
; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
|
||||
define amdgpu_kernel void @swmmac_f32_16x16x32_bf8.bf8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
bb:
|
||||
%tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
|
||||
store <8 x float> %tmp0, ptr addrspace(1) %out, align 32
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK: DIVERGENT: %tmp0 = call <2 x i32> @llvm.amdgcn.global.load.tr.v2i32(ptr addrspace(1) %gep)
|
||||
define amdgpu_kernel void @global_load_tr_b64_v2i32(ptr addrspace(1) %addr, ptr addrspace(1) %out) {
|
||||
bb:
|
||||
@ -190,12 +278,23 @@ declare i32 @llvm.amdgcn.mov.dpp.i32(i32, i32, i32, i32, i1) #1
|
||||
declare i32 @llvm.amdgcn.mov.dpp8.i32(i32, i32) #1
|
||||
declare i32 @llvm.amdgcn.update.dpp.i32(i32, i32, i32, i32, i32, i1) #1
|
||||
declare i32 @llvm.amdgcn.writelane(i32, i32, i32) #1
|
||||
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32(<16 x half>, <16 x half> , <8 x float>) #1
|
||||
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32(<16 x i16>, <16 x i16> , <8 x float>) #1
|
||||
declare <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16(<16 x half>, <16 x half> , <16 x half>, i1 immarg) #1
|
||||
declare <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v16i16(<16 x i16>, <16 x i16> , <16 x i16>, i1 immarg) #1
|
||||
declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32(i1 immarg, <4 x i32>, i1 immarg, <4 x i32> , <8 x i32>, i1 immarg) #1
|
||||
declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32> , <8 x i32>, i1 immarg) #1
|
||||
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v16f16(<16 x half>, <16 x half> , <8 x float>) #1
|
||||
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v16i16(<16 x i16>, <16 x i16> , <8 x float>) #1
|
||||
declare <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half>, <16 x half> , <16 x half>, i1 immarg) #1
|
||||
declare <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v16i16.v16i16(<16 x i16>, <16 x i16> , <16 x i16>, i1 immarg) #1
|
||||
declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v4i32(i1 immarg, <4 x i32>, i1 immarg, <4 x i32> , <8 x i32>, i1 immarg) #1
|
||||
declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.v2i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32> , <8 x i32>, i1 immarg) #1
|
||||
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16(<8 x half>, <16 x half>, <8 x float>, i16)
|
||||
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16(<8 x i16>, <16 x i16>, <8 x float>, i16)
|
||||
declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16(<8 x half>, <16 x half>, <8 x half>, i16)
|
||||
declare <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16(<8 x i16>, <16 x i16>, <8 x i16>, i16)
|
||||
declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16, i1)
|
||||
declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4(i1 immarg, i32, i1 immarg, <2 x i32>, <8 x i32>, i16, i1)
|
||||
declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16, i1)
|
||||
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8(<2 x i32>, <4 x i32>, <8 x float>, i16)
|
||||
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8(<2 x i32>, <4 x i32>, <8 x float>, i16)
|
||||
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8(<2 x i32>, <4 x i32>, <8 x float>, i16)
|
||||
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8(<2 x i32>, <4 x i32>, <8 x float>, i16)
|
||||
|
||||
declare <2 x i32> @llvm.amdgcn.global.load.tr.v2i32(ptr addrspace(1))
|
||||
declare <8 x i16> @llvm.amdgcn.global.load.tr.v8i16(ptr addrspace(1))
|
||||
|
@ -0,0 +1,504 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
|
||||
|
||||
; fneg on matrix A is expected to fold into the WMMA operand modifiers
; (neg_lo:[1,0,0] neg_hi:[1,0,0]) with no separate negate instruction.
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %fneg.A = fneg <8 x half> %A
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %fneg.A, <8 x half> %B, <8 x float> %C)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; fneg on matrix B is expected to fold into the WMMA operand modifiers
; (neg_lo:[0,1,0] neg_hi:[0,1,0]) with no separate negate instruction.
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negB(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negB:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] neg_hi:[0,1,0]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %fneg.B = fneg <8 x half> %B
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %fneg.B, <8 x float> %C)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; fneg on the f32 matrix C is expected to fold into neg_lo:[0,0,1] only
; (no neg_hi bit) with no separate negate instruction.
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negC(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %fneg.C = fneg <8 x float> %C
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %fneg.C)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; fabs on the f32 matrix C is expected to fold into neg_hi:[0,0,1] only
; (no neg_lo bit) with no separate instruction for the fabs.
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_absC(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_absC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %fabs.C)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; bf16 inputs (passed as <8 x i16>): fneg on the f32 matrix C is expected to
; fold into neg_lo:[0,0,1] on the WMMA instruction.
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_negC(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_negC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %fneg.C = fneg <8 x float> %C
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> %fneg.C)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; bf16 inputs (passed as <8 x i16>): fabs on the f32 matrix C is expected to
; fold into neg_hi:[0,0,1] on the WMMA instruction.
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_absC(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_absC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> %fabs.C)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; f16 result variant: fneg on matrix A is expected to fold into
; neg_lo:[1,0,0] neg_hi:[1,0,0] on the WMMA instruction.
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negA(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negA:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %fneg.A = fneg <8 x half> %A
  %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %fneg.A, <8 x half> %B, <8 x half> %C, i1 0)
  store <8 x half> %res, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; f16 result variant: fneg on matrix B is expected to fold into
; neg_lo:[0,1,0] neg_hi:[0,1,0] on the WMMA instruction.
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] neg_hi:[0,1,0]
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %fneg.B = fneg <8 x half> %B
  %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %fneg.B, <8 x half> %C, i1 0)
  store <8 x half> %res, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; f16 result variant: fneg on the f16 matrix C is expected to fold into
; neg_lo:[0,0,1] on the WMMA instruction.
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1]
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %fneg.C = fneg <8 x half> %C
  %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fneg.C, i1 0)
  store <8 x half> %res, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; f16 result variant: fabs on the f16 matrix C is expected to fold into
; neg_hi:[0,0,1] on the WMMA instruction.
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_absC(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_absC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,0,1]
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %fabs.C = call <8 x half> @llvm.fabs.v8f16(<8 x half> %C)
  %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fabs.C, i1 0)
  store <8 x half> %res, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; fp8 x fp8 inputs (packed in <2 x i32>): fneg on the f32 matrix C is expected
; to fold into neg_lo:[0,0,1] on the WMMA instruction.
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_negC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %fneg.C = fneg <8 x float> %C
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; fp8 x fp8 inputs (packed in <2 x i32>): fabs on the f32 matrix C is expected
; to fold into neg_hi:[0,0,1] on the WMMA instruction.
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_absC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; bf8 x fp8 inputs (packed in <2 x i32>): fneg on the f32 matrix C is expected
; to fold into neg_lo:[0,0,1] on the WMMA instruction.
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_negC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %fneg.C = fneg <8 x float> %C
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; bf8 x fp8 inputs (packed in <2 x i32>): fabs on the f32 matrix C is expected
; to fold into neg_hi:[0,0,1] on the WMMA instruction.
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_absC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; fp8 x bf8 inputs (packed in <2 x i32>): fneg on the f32 matrix C is expected
; to fold into neg_lo:[0,0,1] on the WMMA instruction.
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_negC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %fneg.C = fneg <8 x float> %C
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; fp8 x bf8 inputs (packed in <2 x i32>): fabs on the f32 matrix C is expected
; to fold into neg_hi:[0,0,1] on the WMMA instruction.
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_absC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; bf8 x bf8 inputs (packed in <2 x i32>): fneg on the f32 matrix C is expected
; to fold into neg_lo:[0,0,1] on the WMMA instruction.
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_negC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %fneg.C = fneg <8 x float> %C
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; bf8 x bf8 inputs (packed in <2 x i32>): fabs on the f32 matrix C is expected
; to fold into neg_hi:[0,0,1] on the WMMA instruction.
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_absC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; SWMMAC with f32 result: fneg on matrix A (<8 x half>) is expected to fold
; into neg_lo:[1,0,0] neg_hi:[1,0,0]; %Index is passed in a single VGPR (v20).
define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negA(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negA:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[21:22], v[12:15], off
; GFX12-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %fneg.A = fneg <8 x half> %A
  %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f16.v16f16.v8f32.i16(<8 x half> %fneg.A, <16 x half> %B, <8 x float> %C, i16 %Index)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; SWMMAC with f32 result: fneg on matrix B (<16 x half>) is expected to fold
; into neg_lo:[0,1,0] neg_hi:[0,1,0] on the v_swmmac instruction.
define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negB(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negB:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,1,0] neg_hi:[0,1,0]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[21:22], v[12:15], off
; GFX12-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %fneg.B = fneg <16 x half> %B
  %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f16.v16f16.v8f32.i16(<8 x half> %A, <16 x half> %fneg.B, <8 x float> %C, i16 %Index)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; SWMMAC with f16 result: fneg on matrix A (<8 x half>) is expected to fold
; into neg_lo:[1,0,0] neg_hi:[1,0,0] on the v_swmmac instruction.
define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negA(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negA:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX12-NEXT: global_store_b128 v[17:18], v[12:15], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %fneg.A = fneg <8 x half> %A
  %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v16f16.v8f16.i16(<8 x half> %fneg.A, <16 x half> %B, <8 x half> %C, i16 %Index)
  store <8 x half> %res, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; SWMMAC with f16 result: fneg on matrix B (<16 x half>) is expected to fold
; into neg_lo:[0,1,0] neg_hi:[0,1,0] on the v_swmmac instruction.
define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negB(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negB:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,1,0] neg_hi:[0,1,0]
; GFX12-NEXT: global_store_b128 v[17:18], v[12:15], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %fneg.B = fneg <16 x half> %B
  %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v16f16.v8f16.i16(<8 x half> %A, <16 x half> %fneg.B, <8 x half> %C, i16 %Index)
  store <8 x half> %res, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; Both neg and abs patterns applied to matrix C (WMMA with f32 or f16 C).
|
||||
|
||||
; fneg(fabs(C)) on the f32 matrix C is expected to fold into both modifier
; bits for C: neg_lo:[0,0,1] neg_hi:[0,0,1].
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negabsC(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negabsC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] neg_hi:[0,0,1]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
  %fneg.fabs.C = fneg <8 x float> %fabs.C
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %fneg.fabs.C)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; fneg(fabs(C)) on the f16 matrix C is expected to fold into both modifier
; bits for C: neg_lo:[0,0,1] neg_hi:[0,0,1].
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negabsC(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negabsC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] neg_hi:[0,0,1]
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %fabs.C = call <8 x half> @llvm.fabs.v8f16(<8 x half> %C)
  %fneg.fabs.C = fneg <8 x half> %fabs.C
  %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fneg.fabs.C, i1 0)
  store <8 x half> %res, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; fabs is applied to element 3 of C only, so it cannot become a whole-operand
; modifier: it is expected as a separate v_and_b32 on v11, while the
; whole-vector fneg still folds into neg_lo:[0,0,1].
; NOTE(review): the name says "fabsA", but the operand being modified is C.
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_neg_partial_fabsA(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_neg_partial_fabsA:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_and_b32_e32 v11, 0x7fffffff, v11
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %el3 = extractelement <8 x float> %C, i32 3
  %el3.fabs = call float @llvm.fabs.f32(float %el3)
  %partial.fabs.C = insertelement <8 x float> %C, float %el3.fabs, i32 3
  %fneg.partial.fabs.C = fneg <8 x float> %partial.fabs.C
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %fneg.partial.fabs.C)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; A or B matrix modifier and constant in C
|
||||
|
||||
; fneg on A combined with a splat-1.0 constant for C: the constant is expected
; as the inline operand 1.0 and the A modifiers as neg_lo:[1,0,0] neg_hi:[1,0,0].
; The %C argument is unused; the call passes the constant vector instead.
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA_constantC(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA_constantC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], 1.0 neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %fneg.A = fneg <8 x half> %A
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %fneg.A, <8 x half> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; fneg on B combined with a splat-1.0 half constant for C: the constant is
; expected as the inline operand 1.0 and the B modifiers as
; neg_lo:[0,1,0] neg_hi:[0,1,0]. The %C argument is unused.
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB_constantC(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB_constantC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], 1.0 neg_lo:[0,1,0] neg_hi:[0,1,0]
; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %fneg.B = fneg <8 x half> %B
  %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %fneg.B, <8 x half> <half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0>, i1 0)
  store <8 x half> %res, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; Pack f16 elements that do not come from the same b32 (the expected output below repacks them with v_and_b32 + v_lshl_or_b32, not v_perm_b32).
|
||||
|
||||
; C is built from the even-indexed elements of a loaded <16 x half>, so the
; selected halves have to be repacked into dwords (v_and_b32 + v_lshl_or_b32)
; before the WMMA; the fneg on the shuffled C still folds into neg_lo:[0,0,1].
; Both pointer arguments use the opaque `ptr` form.
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC_pack(<8 x half> %A, <8 x half> %B, ptr %Caddr, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC_pack:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: flat_load_b128 v[12:15], v[8:9]
; GFX12-NEXT: flat_load_b128 v[16:19], v[8:9] offset:16
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x101
; GFX12-NEXT: v_and_b32_e32 v8, 0xffff, v12
; GFX12-NEXT: v_and_b32_e32 v9, 0xffff, v14
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: v_and_b32_e32 v14, 0xffff, v16
; GFX12-NEXT: v_and_b32_e32 v16, 0xffff, v18
; GFX12-NEXT: v_lshl_or_b32 v12, v13, 16, v8
; GFX12-NEXT: v_lshl_or_b32 v13, v15, 16, v9
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX12-NEXT: v_lshl_or_b32 v14, v17, 16, v14
; GFX12-NEXT: v_lshl_or_b32 v15, v19, 16, v16
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[12:15], v[0:3], v[4:7], v[12:15] neg_lo:[0,0,1]
; GFX12-NEXT: global_store_b128 v[10:11], v[12:15], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %C = load <16 x half>, ptr %Caddr
  %C_shuffle = shufflevector <16 x half> %C, <16 x half> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %fneg.C_shuffle = fneg <8 x half> %C_shuffle
  %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fneg.C_shuffle , i1 0)
  store <8 x half> %res, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
declare <8 x half> @llvm.fabs.v8f16(<8 x half>)
|
||||
declare <8 x float> @llvm.fabs.v8f32(<8 x float>)
|
||||
declare float @llvm.fabs.f32(float)
|
||||
|
||||
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half>, <8 x half>, <8 x float>)
|
||||
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8i16.v8f32(<8 x i16>, <8 x i16>, <8 x float>)
|
||||
declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half>, <8 x half>, <8 x half>, i1 immarg)
|
||||
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
|
||||
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
|
||||
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
|
||||
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
|
||||
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f16.v16f16.v8f32.i16(<8 x half>, <16 x half>, <8 x float>, i16)
|
||||
declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v16f16.v8f16.i16(<8 x half>, <16 x half>, <8 x half>, i16)
|
519
llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-imm.ll
Normal file
519
llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-imm.ll
Normal file
@ -0,0 +1,519 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
|
||||
|
||||
; Calls the f32 16x16x16 f16 WMMA intrinsic with a constant splat of float 1.0
; as its third operand. The expected assembly places that operand as the
; literal 1.0 directly on the v_wmma instruction and then writes the
; <8 x float> result to %out with two 128-bit stores.
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], 1.0
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Calls the f32 16x16x16 f16 WMMA intrinsic with a constant splat of float 3.0
; as its third operand. The expected assembly does not put the constant on the
; v_wmma instruction: it loads 0x40400000 (the bit pattern of float 3.0) into
; s0, copies it to s1-s7, moves s0-s7 into v[10:17], and passes v[10:17] as
; the third operand, which is also the destination register range.
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm_non_inlineable(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm_non_inlineable:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: s_mov_b32 s0, 0x40400000
|
||||
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX12-NEXT: s_mov_b32 s7, s0
|
||||
; GFX12-NEXT: s_mov_b32 s1, s0
|
||||
; GFX12-NEXT: s_mov_b32 s2, s0
|
||||
; GFX12-NEXT: s_mov_b32 s3, s0
|
||||
; GFX12-NEXT: s_mov_b32 s4, s0
|
||||
; GFX12-NEXT: s_mov_b32 s5, s0
|
||||
; GFX12-NEXT: s_mov_b32 s6, s0
|
||||
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_dual_mov_b32 v17, s7 :: v_dual_mov_b32 v16, s6
|
||||
; GFX12-NEXT: v_dual_mov_b32 v15, s5 :: v_dual_mov_b32 v14, s4
|
||||
; GFX12-NEXT: v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2
|
||||
; GFX12-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], v[10:17]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Calls the f32 16x16x16 bf16 WMMA intrinsic (A and B are <8 x i16>) with a
; constant splat of float 1.0 as its third operand. The expected assembly
; places the literal 1.0 directly on the v_wmma instruction and stores the
; <8 x float> result to %out with two 128-bit stores.
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[10:17], v[0:3], v[4:7], 1.0
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Calls the f32 16x16x16 bf16 WMMA intrinsic (A and B are <8 x i16>) with a
; constant splat of float 3.0 as its third operand. The expected assembly
; loads 0x40400000 (the bit pattern of float 3.0) into s0, copies it to s1-s7,
; moves s0-s7 into v[10:17], and passes v[10:17] as the third operand of the
; v_wmma instruction instead of a literal.
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm_non_inlineable(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm_non_inlineable:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: s_mov_b32 s0, 0x40400000
|
||||
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX12-NEXT: s_mov_b32 s7, s0
|
||||
; GFX12-NEXT: s_mov_b32 s1, s0
|
||||
; GFX12-NEXT: s_mov_b32 s2, s0
|
||||
; GFX12-NEXT: s_mov_b32 s3, s0
|
||||
; GFX12-NEXT: s_mov_b32 s4, s0
|
||||
; GFX12-NEXT: s_mov_b32 s5, s0
|
||||
; GFX12-NEXT: s_mov_b32 s6, s0
|
||||
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_dual_mov_b32 v17, s7 :: v_dual_mov_b32 v16, s6
|
||||
; GFX12-NEXT: v_dual_mov_b32 v15, s5 :: v_dual_mov_b32 v14, s4
|
||||
; GFX12-NEXT: v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2
|
||||
; GFX12-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[10:17], v[0:3], v[4:7], v[10:17]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Calls the f16 16x16x16 f16 WMMA intrinsic with a constant splat of half 1.0
; as its third operand and 0 for the trailing i1 immediate. The expected
; assembly places the literal 1.0 directly on the v_wmma instruction and
; writes the <8 x half> result to %out with a single 128-bit store.
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], 1.0
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> <half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0>, i1 0)
|
||||
store <8 x half> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Calls the f16 16x16x16 f16 WMMA intrinsic with a constant splat of half 3.0
; as its third operand and 0 for the trailing i1 immediate. The expected
; assembly loads 0x42004200 (two packed copies of 0x4200, the bit pattern of
; half 3.0) into s0, copies it to s1-s3, moves s0-s3 into v[10:13], and passes
; v[10:13] as the third operand of the v_wmma instruction.
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm_non_inlineable(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm_non_inlineable:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: s_mov_b32 s0, 0x42004200
|
||||
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
|
||||
; GFX12-NEXT: s_mov_b32 s3, s0
|
||||
; GFX12-NEXT: s_mov_b32 s1, s0
|
||||
; GFX12-NEXT: s_mov_b32 s2, s0
|
||||
; GFX12-NEXT: v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2
|
||||
; GFX12-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], v[10:13]
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> <half 3.0, half 3.0, half 3.0, half 3.0, half 3.0, half 3.0, half 3.0, half 3.0>, i1 0)
|
||||
store <8 x half> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Calls the bf16 16x16x16 bf16 WMMA intrinsic with a constant splat of
; i16 16256 (0x3f80) as its third operand and 0 for the trailing i1 immediate.
; Unlike the other "_imm" cases in this file, the expected assembly does not
; place a literal on the v_wmma instruction: it loads 0x3f803f80 into s0,
; copies it to s1-s3, moves s0-s3 into v[10:13], and passes v[10:13] as the
; third operand.
; NOTE(review): 0x3f80 appears to be the bf16 encoding of 1.0 - confirm that
; this is the intended "inlineable" value for this test.
define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: s_mov_b32 s0, 0x3f803f80
|
||||
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
|
||||
; GFX12-NEXT: s_mov_b32 s3, s0
|
||||
; GFX12-NEXT: s_mov_b32 s1, s0
|
||||
; GFX12-NEXT: s_mov_b32 s2, s0
|
||||
; GFX12-NEXT: v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2
|
||||
; GFX12-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[10:13], v[0:3], v[4:7], v[10:13]
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> <i16 16256, i16 16256, i16 16256, i16 16256, i16 16256, i16 16256, i16 16256, i16 16256>, i1 0)
|
||||
store <8 x i16> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Calls the bf16 16x16x16 bf16 WMMA intrinsic with a constant splat of
; i16 16320 (0x3fc0) as its third operand and 0 for the trailing i1 immediate.
; The expected assembly loads 0x3fc03fc0 into s0, copies it to s1-s3, moves
; s0-s3 into v[10:13], and passes v[10:13] as the third operand of the v_wmma
; instruction.
define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm_non_inlineable(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm_non_inlineable:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: s_mov_b32 s0, 0x3fc03fc0
|
||||
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
|
||||
; GFX12-NEXT: s_mov_b32 s3, s0
|
||||
; GFX12-NEXT: s_mov_b32 s1, s0
|
||||
; GFX12-NEXT: s_mov_b32 s2, s0
|
||||
; GFX12-NEXT: v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2
|
||||
; GFX12-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[10:13], v[0:3], v[4:7], v[10:13]
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> <i16 16320, i16 16320, i16 16320, i16 16320, i16 16320, i16 16320, i16 16320, i16 16320>, i1 0)
|
||||
store <8 x i16> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Calls the i32 16x16x16 iu8 WMMA intrinsic with all three i1 immediates set
; to 0 and a constant splat of i32 1 as the <8 x i32> operand. The expected
; assembly places the literal 1 directly on the v_wmma instruction and stores
; the <8 x i32> result to %out with two 128-bit stores.
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[6:13], v[0:1], v[2:3], 1
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
|
||||
; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, i1 0)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Calls the i32 16x16x16 iu8 WMMA intrinsic with all three i1 immediates set
; to 0 and a constant splat of i32 128 as the <8 x i32> operand. The expected
; assembly loads 0x80 into s0 with s_movk_i32, copies it to s1-s7, moves s0-s7
; into v[6:13], and passes v[6:13] as the last register operand of the v_wmma
; instruction instead of a literal.
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm_non_inlineable:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: s_movk_i32 s0, 0x80
|
||||
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX12-NEXT: s_mov_b32 s7, s0
|
||||
; GFX12-NEXT: s_mov_b32 s1, s0
|
||||
; GFX12-NEXT: s_mov_b32 s2, s0
|
||||
; GFX12-NEXT: s_mov_b32 s3, s0
|
||||
; GFX12-NEXT: s_mov_b32 s4, s0
|
||||
; GFX12-NEXT: s_mov_b32 s5, s0
|
||||
; GFX12-NEXT: s_mov_b32 s6, s0
|
||||
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6
|
||||
; GFX12-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
|
||||
; GFX12-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2
|
||||
; GFX12-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[6:13], v[0:1], v[2:3], v[6:13]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
|
||||
; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> <i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128>, i1 0)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Calls the i32 16x16x16 iu4 WMMA intrinsic, whose A and B inputs are scalar
; i32 values, with all three i1 immediates set to 0 and a constant splat of
; i32 1 as the <8 x i32> operand. The expected assembly places the literal 1
; directly on the v_wmma instruction and stores the <8 x i32> result to %out
; with two 128-bit stores.
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm(i32 %A, i32 %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v0, v1, 1
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
|
||||
; GFX12-NEXT: global_store_b128 v[2:3], v[8:11], off offset:16
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, i1 0)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Calls the i32 16x16x16 iu4 WMMA intrinsic, whose A and B inputs are scalar
; i32 values, with all three i1 immediates set to 0 and a constant splat of
; i32 128 as the <8 x i32> operand. The expected assembly loads 0x80 into s0
; with s_movk_i32, copies it to s1-s7, moves s0-s7 into v[4:11], and passes
; v[4:11] as the last register operand of the v_wmma instruction.
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm_non_inlineable(i32 %A, i32 %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm_non_inlineable:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: s_movk_i32 s0, 0x80
|
||||
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX12-NEXT: s_mov_b32 s7, s0
|
||||
; GFX12-NEXT: s_mov_b32 s1, s0
|
||||
; GFX12-NEXT: s_mov_b32 s2, s0
|
||||
; GFX12-NEXT: s_mov_b32 s3, s0
|
||||
; GFX12-NEXT: s_mov_b32 s4, s0
|
||||
; GFX12-NEXT: s_mov_b32 s5, s0
|
||||
; GFX12-NEXT: s_mov_b32 s6, s0
|
||||
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_dual_mov_b32 v11, s7 :: v_dual_mov_b32 v10, s6
|
||||
; GFX12-NEXT: v_dual_mov_b32 v9, s5 :: v_dual_mov_b32 v8, s4
|
||||
; GFX12-NEXT: v_dual_mov_b32 v7, s3 :: v_dual_mov_b32 v6, s2
|
||||
; GFX12-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v0, v1, v[4:11]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
|
||||
; GFX12-NEXT: global_store_b128 v[2:3], v[8:11], off offset:16
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> <i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128>, i1 0)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Calls the f32 16x16x16 fp8/fp8 WMMA intrinsic (A and B are <2 x i32>) with a
; constant splat of float 1.0 as its third operand. The expected assembly
; places the literal 1.0 directly on the v_wmma instruction and stores the
; <8 x float> result to %out with two 128-bit stores.
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[6:13], v[0:1], v[2:3], 1.0
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
|
||||
; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Calls the f32 16x16x16 fp8/fp8 WMMA intrinsic (A and B are <2 x i32>) with a
; constant splat of float 3.0 as its third operand. The expected assembly
; loads 0x40400000 (the bit pattern of float 3.0) into s0, copies it to s1-s7,
; moves s0-s7 into v[6:13], and passes v[6:13] as the third operand of the
; v_wmma instruction instead of a literal.
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: s_mov_b32 s0, 0x40400000
|
||||
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX12-NEXT: s_mov_b32 s7, s0
|
||||
; GFX12-NEXT: s_mov_b32 s1, s0
|
||||
; GFX12-NEXT: s_mov_b32 s2, s0
|
||||
; GFX12-NEXT: s_mov_b32 s3, s0
|
||||
; GFX12-NEXT: s_mov_b32 s4, s0
|
||||
; GFX12-NEXT: s_mov_b32 s5, s0
|
||||
; GFX12-NEXT: s_mov_b32 s6, s0
|
||||
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6
|
||||
; GFX12-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
|
||||
; GFX12-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2
|
||||
; GFX12-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[6:13], v[0:1], v[2:3], v[6:13]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
|
||||
; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Calls the f32 16x16x16 bf8/fp8 WMMA intrinsic (A and B are <2 x i32>) with a
; constant splat of float 1.0 as its third operand. The expected assembly
; places the literal 1.0 directly on the v_wmma instruction and stores the
; <8 x float> result to %out with two 128-bit stores.
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[6:13], v[0:1], v[2:3], 1.0
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
|
||||
; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Calls the f32 16x16x16 bf8/fp8 WMMA intrinsic (A and B are <2 x i32>) with a
; constant splat of float 3.0 as its third operand. The expected assembly
; loads 0x40400000 (the bit pattern of float 3.0) into s0, copies it to s1-s7,
; moves s0-s7 into v[6:13], and passes v[6:13] as the third operand of the
; v_wmma instruction instead of a literal.
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: s_mov_b32 s0, 0x40400000
|
||||
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX12-NEXT: s_mov_b32 s7, s0
|
||||
; GFX12-NEXT: s_mov_b32 s1, s0
|
||||
; GFX12-NEXT: s_mov_b32 s2, s0
|
||||
; GFX12-NEXT: s_mov_b32 s3, s0
|
||||
; GFX12-NEXT: s_mov_b32 s4, s0
|
||||
; GFX12-NEXT: s_mov_b32 s5, s0
|
||||
; GFX12-NEXT: s_mov_b32 s6, s0
|
||||
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6
|
||||
; GFX12-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
|
||||
; GFX12-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2
|
||||
; GFX12-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[6:13], v[0:1], v[2:3], v[6:13]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
|
||||
; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Calls the f32 16x16x16 fp8/bf8 WMMA intrinsic (A and B are <2 x i32>) with a
; constant splat of float 1.0 as its third operand. The expected assembly
; places the literal 1.0 directly on the v_wmma instruction and stores the
; <8 x float> result to %out with two 128-bit stores.
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[6:13], v[0:1], v[2:3], 1.0
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
|
||||
; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Calls the f32 16x16x16 fp8/bf8 WMMA intrinsic (A and B are <2 x i32>) with a
; constant splat of float 3.0 as its third operand. The expected assembly
; loads 0x40400000 (the bit pattern of float 3.0) into s0, copies it to s1-s7,
; moves s0-s7 into v[6:13], and passes v[6:13] as the third operand of the
; v_wmma instruction instead of a literal.
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: s_mov_b32 s0, 0x40400000
|
||||
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX12-NEXT: s_mov_b32 s7, s0
|
||||
; GFX12-NEXT: s_mov_b32 s1, s0
|
||||
; GFX12-NEXT: s_mov_b32 s2, s0
|
||||
; GFX12-NEXT: s_mov_b32 s3, s0
|
||||
; GFX12-NEXT: s_mov_b32 s4, s0
|
||||
; GFX12-NEXT: s_mov_b32 s5, s0
|
||||
; GFX12-NEXT: s_mov_b32 s6, s0
|
||||
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6
|
||||
; GFX12-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
|
||||
; GFX12-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2
|
||||
; GFX12-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[6:13], v[0:1], v[2:3], v[6:13]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
|
||||
; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Calls the f32 16x16x16 bf8/bf8 WMMA intrinsic (A and B are <2 x i32>) with a
; constant splat of float 1.0 as its third operand. The expected assembly
; places the literal 1.0 directly on the v_wmma instruction and stores the
; <8 x float> result to %out with two 128-bit stores.
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[6:13], v[0:1], v[2:3], 1.0
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
|
||||
; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Calls the f32 16x16x16 bf8/bf8 WMMA intrinsic (A and B are <2 x i32>) with a
; constant splat of float 3.0 as its third operand. The expected assembly
; loads 0x40400000 (the bit pattern of float 3.0) into s0, copies it to s1-s7,
; moves s0-s7 into v[6:13], and passes v[6:13] as the third operand of the
; v_wmma instruction instead of a literal.
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: s_mov_b32 s0, 0x40400000
|
||||
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX12-NEXT: s_mov_b32 s7, s0
|
||||
; GFX12-NEXT: s_mov_b32 s1, s0
|
||||
; GFX12-NEXT: s_mov_b32 s2, s0
|
||||
; GFX12-NEXT: s_mov_b32 s3, s0
|
||||
; GFX12-NEXT: s_mov_b32 s4, s0
|
||||
; GFX12-NEXT: s_mov_b32 s5, s0
|
||||
; GFX12-NEXT: s_mov_b32 s6, s0
|
||||
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6
|
||||
; GFX12-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
|
||||
; GFX12-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2
|
||||
; GFX12-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[6:13], v[0:1], v[2:3], v[6:13]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
|
||||
; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Calls the i32 16x16x32 iu4 WMMA intrinsic (A and B are <2 x i32>) with all
; three i1 immediates set to 0 and a constant splat of i32 1 as the <8 x i32>
; operand. The expected assembly places the literal 1 directly on the v_wmma
; instruction and stores the <8 x i32> result to %out with two 128-bit stores.
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[6:13], v[0:1], v[2:3], 1
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
|
||||
; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, i1 0)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Calls the i32 16x16x32 iu4 WMMA intrinsic (A and B are <2 x i32>) with all
; three i1 immediates set to 0 and a constant splat of i32 128 as the
; <8 x i32> operand. The expected assembly loads 0x80 into s0 with s_movk_i32,
; copies it to s1-s7, moves s0-s7 into v[6:13], and passes v[6:13] as the last
; register operand of the v_wmma instruction instead of a literal.
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm_non_inlineable:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: s_movk_i32 s0, 0x80
|
||||
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX12-NEXT: s_mov_b32 s7, s0
|
||||
; GFX12-NEXT: s_mov_b32 s1, s0
|
||||
; GFX12-NEXT: s_mov_b32 s2, s0
|
||||
; GFX12-NEXT: s_mov_b32 s3, s0
|
||||
; GFX12-NEXT: s_mov_b32 s4, s0
|
||||
; GFX12-NEXT: s_mov_b32 s5, s0
|
||||
; GFX12-NEXT: s_mov_b32 s6, s0
|
||||
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6
|
||||
; GFX12-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
|
||||
; GFX12-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2
|
||||
; GFX12-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[6:13], v[0:1], v[2:3], v[6:13]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
|
||||
; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> <i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128>, i1 0)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half>, <8 x half>, <8 x float>)
|
||||
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16>, <8 x i16>, <8 x float>)
|
||||
declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half>, <8 x half>, <8 x half>, i1 immarg)
|
||||
declare <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16>, <8 x i16>, <8 x i16>, i1 immarg)
|
||||
declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg)
|
||||
declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 immarg, i32, i1 immarg, i32, <8 x i32>, i1 immarg)
|
||||
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
|
||||
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
|
||||
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
|
||||
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
|
||||
declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg)
|
||||
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half>, <16 x half>, <8 x float>, i16)
|
||||
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16>, <16 x i16>, <8 x float>, i16)
|
||||
declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half>, <16 x half>, <8 x half>, i16)
|
||||
declare <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16>, <16 x i16>, <8 x i16>, i16)
|
||||
declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16 %Index, i1 immarg)
|
||||
declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <8 x i32>, i16 %Index, i1 immarg)
|
||||
declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i32 %Index, i1 immarg)
|
||||
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
|
||||
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
|
||||
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
|
||||
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
|
@ -0,0 +1,309 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
|
||||
|
||||
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src0(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src0:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src1(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src1:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_clamp:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] clamp
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
|
||||
|
||||
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src0(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src0:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[1,0,0]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off
|
||||
; GFX12-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 1, i32 %A, i1 0, i32 %B, <8 x i32> %C, i1 0)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src1(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src1:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[0,1,0]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off
|
||||
; GFX12-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 1, i32 %B, <8 x i32> %C, i1 0)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_clamp(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_clamp:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] clamp
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off
|
||||
; GFX12-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> %C, i1 1)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
|
||||
|
||||
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src0(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src0:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src1(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src1:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_clamp:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] clamp
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src0(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src0:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 1, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src1(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src1:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_clamp(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_clamp:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 clamp
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 1)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
|
||||
|
||||
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src0(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src0:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[1,0,0]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[3:6], off
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 1, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src1(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src1:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[0,1,0]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[3:6], off
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 1, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_clamp(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_clamp:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 clamp
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[3:6], off
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 1)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
|
||||
|
||||
define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src0(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src0:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 1, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 0)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src1(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src1:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 0, <2 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 0)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_clamp(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_clamp:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 clamp
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 1)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg)
|
||||
declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 immarg, i32, i1 immarg, i32, <8 x i32>, i1 immarg)
|
||||
declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg)
|
||||
declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16 %Index, i1 immarg)
|
||||
declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <8 x i32>, i16 %Index, i1 immarg)
|
||||
declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i32 %Index, i1 immarg)
|
@ -0,0 +1,321 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
|
||||
|
||||
define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_index_key(<8 x half> %A, <16 x half> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
|
||||
; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_index_key:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: global_load_b32 v20, v[20:21], off
|
||||
; GFX12-NEXT: v_dual_mov_b32 v33, v19 :: v_dual_mov_b32 v32, v18
|
||||
; GFX12-NEXT: v_dual_mov_b32 v31, v17 :: v_dual_mov_b32 v30, v16
|
||||
; GFX12-NEXT: v_dual_mov_b32 v29, v15 :: v_dual_mov_b32 v28, v14
|
||||
; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12
|
||||
; GFX12-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[26:33], v[0:3], v[4:11], v20
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 index_key:1
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[22:23], v[26:29], off
|
||||
; GFX12-NEXT: global_store_b128 v[22:23], v[30:33], off offset:16
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[24:25], v[12:15], off
|
||||
; GFX12-NEXT: global_store_b128 v[24:25], v[16:19], off offset:16
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
|
||||
%Index0 = extractelement <2 x i16> %IndexVec, i32 0
|
||||
%res0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index0)
|
||||
store <8 x float> %res0, ptr addrspace(1) %out0
|
||||
%Index1 = extractelement <2 x i16> %IndexVec, i32 1
|
||||
%res1 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index1)
|
||||
store <8 x float> %res1, ptr addrspace(1) %out1
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16_index_key(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
|
||||
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16_index_key:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: global_load_b32 v20, v[20:21], off
|
||||
; GFX12-NEXT: v_dual_mov_b32 v33, v19 :: v_dual_mov_b32 v32, v18
|
||||
; GFX12-NEXT: v_dual_mov_b32 v31, v17 :: v_dual_mov_b32 v30, v16
|
||||
; GFX12-NEXT: v_dual_mov_b32 v29, v15 :: v_dual_mov_b32 v28, v14
|
||||
; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12
|
||||
; GFX12-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[26:33], v[0:3], v[4:11], v20
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 index_key:1
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[22:23], v[26:29], off
|
||||
; GFX12-NEXT: global_store_b128 v[22:23], v[30:33], off offset:16
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[24:25], v[12:15], off
|
||||
; GFX12-NEXT: global_store_b128 v[24:25], v[16:19], off offset:16
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
|
||||
%Index0 = extractelement <2 x i16> %IndexVec, i32 0
|
||||
%res0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index0)
|
||||
store <8 x float> %res0, ptr addrspace(1) %out0
|
||||
%Index1 = extractelement <2 x i16> %IndexVec, i32 1
|
||||
%res1 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index1)
|
||||
store <8 x float> %res1, ptr addrspace(1) %out1
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_index_key(<8 x half> %A, <16 x half> %B, <8 x half> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
|
||||
; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_index_key:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: global_load_b32 v16, v[16:17], off
|
||||
; GFX12-NEXT: v_dual_mov_b32 v25, v15 :: v_dual_mov_b32 v24, v14
|
||||
; GFX12-NEXT: v_dual_mov_b32 v23, v13 :: v_dual_mov_b32 v22, v12
|
||||
; GFX12-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[22:25], v[0:3], v[4:11], v16
|
||||
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 index_key:1
|
||||
; GFX12-NEXT: global_store_b128 v[18:19], v[22:25], off
|
||||
; GFX12-NEXT: global_store_b128 v[20:21], v[12:15], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
|
||||
%Index0 = extractelement <2 x i16> %IndexVec, i32 0
|
||||
%res0 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index0)
|
||||
store <8 x half> %res0, ptr addrspace(1) %out0
|
||||
%Index1 = extractelement <2 x i16> %IndexVec, i32 1
|
||||
%res1 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index1)
|
||||
store <8 x half> %res1, ptr addrspace(1) %out1
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16_index_key(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
|
||||
; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16_index_key:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: global_load_b32 v16, v[16:17], off
|
||||
; GFX12-NEXT: v_dual_mov_b32 v25, v15 :: v_dual_mov_b32 v24, v14
|
||||
; GFX12-NEXT: v_dual_mov_b32 v23, v13 :: v_dual_mov_b32 v22, v12
|
||||
; GFX12-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[22:25], v[0:3], v[4:11], v16
|
||||
; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 index_key:1
|
||||
; GFX12-NEXT: global_store_b128 v[18:19], v[22:25], off
|
||||
; GFX12-NEXT: global_store_b128 v[20:21], v[12:15], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
|
||||
%Index0 = extractelement <2 x i16> %IndexVec, i32 0
|
||||
%res0 = call <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index0)
|
||||
store <8 x i16> %res0, ptr addrspace(1) %out0
|
||||
%Index1 = extractelement <2 x i16> %IndexVec, i32 1
|
||||
%res1 = call <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index1)
|
||||
store <8 x i16> %res1, ptr addrspace(1) %out1
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_index_key:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: global_load_b32 v14, v[14:15], off
|
||||
; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12
|
||||
; GFX12-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10
|
||||
; GFX12-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8
|
||||
; GFX12-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6
|
||||
; GFX12-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[20:27], v[0:1], v[2:5], v14
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 index_key:1
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[16:17], v[20:23], off
|
||||
; GFX12-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off
|
||||
; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
|
||||
%Index0 = extractelement <2 x i16> %IndexVec, i32 0
|
||||
%res0 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index0, i1 0)
|
||||
store <8 x i32> %res0, ptr addrspace(1) %out0
|
||||
%Index1 = extractelement <2 x i16> %IndexVec, i32 1
|
||||
%res1 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index1, i1 0)
|
||||
store <8 x i32> %res1, ptr addrspace(1) %out1
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_index_key(i32 %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_index_key:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: global_load_b32 v11, v[11:12], off
|
||||
; GFX12-NEXT: v_dual_mov_b32 v24, v10 :: v_dual_mov_b32 v23, v9
|
||||
; GFX12-NEXT: v_dual_mov_b32 v22, v8 :: v_dual_mov_b32 v21, v7
|
||||
; GFX12-NEXT: v_dual_mov_b32 v20, v6 :: v_dual_mov_b32 v19, v5
|
||||
; GFX12-NEXT: v_dual_mov_b32 v18, v4 :: v_dual_mov_b32 v17, v3
|
||||
; GFX12-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[17:24], v0, v[1:2], v11
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 index_key:1
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[13:14], v[17:20], off
|
||||
; GFX12-NEXT: global_store_b128 v[13:14], v[21:24], off offset:16
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[7:10], off offset:16
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
|
||||
%Index0 = extractelement <2 x i16> %IndexVec, i32 0
|
||||
%res0 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index0, i1 0)
|
||||
store <8 x i32> %res0, ptr addrspace(1) %out0
|
||||
%Index1 = extractelement <2 x i16> %IndexVec, i32 1
|
||||
%res1 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index1, i1 0)
|
||||
store <8 x i32> %res1, ptr addrspace(1) %out1
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
|
||||
; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8_index_key:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: global_load_b32 v14, v[14:15], off
|
||||
; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12
|
||||
; GFX12-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10
|
||||
; GFX12-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8
|
||||
; GFX12-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6
|
||||
; GFX12-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[20:27], v[0:1], v[2:5], v14
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[16:17], v[20:23], off
|
||||
; GFX12-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off
|
||||
; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
|
||||
%Index0 = extractelement <2 x i16> %IndexVec, i32 0
|
||||
%res0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index0)
|
||||
store <8 x float> %res0, ptr addrspace(1) %out0
|
||||
%Index1 = extractelement <2 x i16> %IndexVec, i32 1
|
||||
%res1 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index1)
|
||||
store <8 x float> %res1, ptr addrspace(1) %out1
|
||||
ret void
|
||||
}
|
||||
|
||||
; SWMMAC fp8 x bf8 (f32 result): one <2 x i16> index vector is loaded and its elements 0 and 1 feed two calls on the same %A/%B/%C. The assertions expect a single global_load_b32 into v14, a copy of %C into v[20:27] for the first call, and the second call reading the same v14 with index_key:1.
define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
|
||||
; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8_index_key:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: global_load_b32 v14, v[14:15], off
|
||||
; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12
|
||||
; GFX12-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10
|
||||
; GFX12-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8
|
||||
; GFX12-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6
|
||||
; GFX12-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[20:27], v[0:1], v[2:5], v14
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[16:17], v[20:23], off
|
||||
; GFX12-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off
|
||||
; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
|
||||
%Index0 = extractelement <2 x i16> %IndexVec, i32 0
|
||||
%res0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index0)
|
||||
store <8 x float> %res0, ptr addrspace(1) %out0
|
||||
%Index1 = extractelement <2 x i16> %IndexVec, i32 1
|
||||
%res1 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index1)
|
||||
store <8 x float> %res1, ptr addrspace(1) %out1
|
||||
ret void
|
||||
}
|
||||
|
||||
; SWMMAC bf8 x fp8 (f32 result): one <2 x i16> index vector is loaded and its elements 0 and 1 feed two calls on the same %A/%B/%C. The assertions expect a single global_load_b32 into v14, a copy of %C into v[20:27] for the first call, and the second call reading the same v14 with index_key:1.
define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
|
||||
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8_index_key:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: global_load_b32 v14, v[14:15], off
|
||||
; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12
|
||||
; GFX12-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10
|
||||
; GFX12-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8
|
||||
; GFX12-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6
|
||||
; GFX12-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[20:27], v[0:1], v[2:5], v14
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[16:17], v[20:23], off
|
||||
; GFX12-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off
|
||||
; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
|
||||
%Index0 = extractelement <2 x i16> %IndexVec, i32 0
|
||||
%res0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index0)
|
||||
store <8 x float> %res0, ptr addrspace(1) %out0
|
||||
%Index1 = extractelement <2 x i16> %IndexVec, i32 1
|
||||
%res1 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index1)
|
||||
store <8 x float> %res1, ptr addrspace(1) %out1
|
||||
ret void
|
||||
}
|
||||
|
||||
; SWMMAC bf8 x bf8 (f32 result): one <2 x i16> index vector is loaded and its elements 0 and 1 feed two calls on the same %A/%B/%C. The assertions expect a single global_load_b32 into v14, a copy of %C into v[20:27] for the first call, and the second call reading the same v14 with index_key:1.
define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
|
||||
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8_index_key:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: global_load_b32 v14, v[14:15], off
|
||||
; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12
|
||||
; GFX12-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10
|
||||
; GFX12-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8
|
||||
; GFX12-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6
|
||||
; GFX12-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[20:27], v[0:1], v[2:5], v14
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[16:17], v[20:23], off
|
||||
; GFX12-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off
|
||||
; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
|
||||
%Index0 = extractelement <2 x i16> %IndexVec, i32 0
|
||||
%res0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index0)
|
||||
store <8 x float> %res0, ptr addrspace(1) %out0
|
||||
%Index1 = extractelement <2 x i16> %IndexVec, i32 1
|
||||
%res1 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index1)
|
||||
store <8 x float> %res1, ptr addrspace(1) %out1
|
||||
ret void
|
||||
}
|
||||
|
||||
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half>, <16 x half>, <8 x float>, i16)
|
||||
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16>, <16 x i16>, <8 x float>, i16)
|
||||
declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half>, <16 x half>, <8 x half>, i16)
|
||||
declare <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16>, <16 x i16>, <8 x i16>, i16)
|
||||
declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16 %Index, i1 immarg)
|
||||
declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <8 x i32>, i16 %Index, i1 immarg)
|
||||
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
|
||||
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
|
||||
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
|
||||
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
|
370
llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32.ll
Normal file
370
llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32.ll
Normal file
@ -0,0 +1,370 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
|
||||
|
||||
; Plain call to the f32 = f16 x f16 WMMA intrinsic. Expects one v_wmma_f32_16x16x16_f16 that uses %C's registers (v[8:15]) as both accumulator input and destination, then the <8 x float> result written as two b128 stores.
define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_f16:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
|
||||
; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %C)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Plain call to the f32 = bf16 x bf16 WMMA intrinsic (bf16 operands passed as <8 x i16>). Expects one v_wmma_f32_16x16x16_bf16 that uses %C's registers (v[8:15]) as both accumulator input and destination, then two b128 stores of the result.
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_bf16:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
|
||||
; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> %C)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; f16 = f16 x f16 WMMA with the trailing i1 immarg set to 0. Expects one v_wmma_f16_16x16x16_f16 with no modifiers printed, accumulating in %C's registers (v[8:11]); the <8 x half> result fits a single b128 store.
define amdgpu_ps void @test_wmma_f16_16x16x16_f16(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f16_16x16x16_f16:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11]
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %C, i1 0)
|
||||
store <8 x half> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; bf16 = bf16 x bf16 WMMA (operands and result carried as <8 x i16>) with the trailing i1 immarg set to 0. Expects one v_wmma_bf16_16x16x16_bf16 with no modifiers printed, accumulating in %C's registers (v[8:11]), then a single b128 store.
define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11]
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, i1 0)
|
||||
store <8 x i16> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; i32 = iu8 x iu8 WMMA with all three i1 immargs set to 0. Expects one v_wmma_i32_16x16x16_iu8 with no modifiers printed, accumulating in %C's registers (v[4:11]), then two b128 stores of the <8 x i32> result.
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x16_iu8:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; i32 = iu4 x iu4 WMMA (16x16x16) where %A and %B are each a single i32, with all three i1 immargs set to 0. Expects one v_wmma_i32_16x16x16_iu4 taking v0 and v1 as scalar-width operands and accumulating in %C's registers (v[2:9]).
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x16_iu4:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off
|
||||
; GFX12-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> %C, i1 0)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Plain call to the f32 = fp8 x fp8 WMMA intrinsic (A/B passed as <2 x i32>). Expects one v_wmma_f32_16x16x16_fp8_fp8 accumulating in %C's registers (v[4:11]), then two b128 stores of the result.
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Plain call to the f32 = bf8 x fp8 WMMA intrinsic (A/B passed as <2 x i32>). Expects one v_wmma_f32_16x16x16_bf8_fp8 accumulating in %C's registers (v[4:11]), then two b128 stores of the result.
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Plain call to the f32 = fp8 x bf8 WMMA intrinsic (A/B passed as <2 x i32>). Expects one v_wmma_f32_16x16x16_fp8_bf8 accumulating in %C's registers (v[4:11]), then two b128 stores of the result.
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Plain call to the f32 = bf8 x bf8 WMMA intrinsic (A/B passed as <2 x i32>). Expects one v_wmma_f32_16x16x16_bf8_bf8 accumulating in %C's registers (v[4:11]), then two b128 stores of the result.
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; i32 = iu4 x iu4 WMMA in the 16x16x32 shape, where %A and %B are <2 x i32>, with all three i1 immargs set to 0. Expects one v_wmma_i32_16x16x32_iu4 with no modifiers printed, accumulating in %C's registers (v[4:11]).
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x32_iu4:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
|
||||
; SWMMAC f32 = f16 x f16 with the index supplied directly as an i16 argument. Expects one v_swmmac_f32_16x16x32_f16 writing %C's registers (v[12:19]) with the index in a single VGPR (v20) as the last operand and no index_key modifier.
define amdgpu_ps void @test_swmmac_f32_16x16x32_f16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_f32_16x16x32_f16:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[21:22], v[12:15], off
|
||||
; GFX12-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; SWMMAC f32 = bf16 x bf16 (bf16 operands passed as i16 vectors) with the index supplied directly as an i16 argument. Expects one v_swmmac_f32_16x16x32_bf16 writing %C's registers (v[12:19]) with the index in v20 and no index_key modifier.
define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[21:22], v[12:15], off
|
||||
; GFX12-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; SWMMAC f16 = f16 x f16 with the index supplied directly as an i16 argument. Expects one v_swmmac_f16_16x16x32_f16 writing %C's registers (v[12:15]) with the index in v16; the <8 x half> result fits a single b128 store.
define amdgpu_ps void @test_swmmac_f16_16x16x32_f16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_f16_16x16x32_f16:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16
|
||||
; GFX12-NEXT: global_store_b128 v[17:18], v[12:15], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index)
|
||||
store <8 x half> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; SWMMAC bf16 = bf16 x bf16 (operands and result carried as i16 vectors) with the index supplied directly as an i16 argument. Expects one v_swmmac_bf16_16x16x32_bf16 writing %C's registers (v[12:15]) with the index in v16, then a single b128 store.
define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16
|
||||
; GFX12-NEXT: global_store_b128 v[17:18], v[12:15], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index)
|
||||
store <8 x i16> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; SWMMAC i32 = iu8 x iu8 with all three i1 immargs set to 0 and the index supplied as an i16 argument. Expects one v_swmmac_i32_16x16x32_iu8 with no modifiers printed, writing %C's registers (v[6:13]) with the index in v14.
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; SWMMAC i32 = iu4 x iu4 (16x16x32) where %A is a single i32 and %B is <2 x i32>, with all three i1 immargs set to 0 and an i16 index argument. Expects one v_swmmac_i32_16x16x32_iu4 taking v0 and v[1:2], writing %C's registers (v[3:10]) with the index in v11.
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[3:6], off
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; SWMMAC i32 = iu4 x iu4 in the 16x16x64 shape; unlike the other SWMMAC tests visible here, the index argument is i32 rather than i16. All three i1 immargs are 0. Expects one v_swmmac_i32_16x16x64_iu4 writing %C's registers (v[6:13]) with the index in v14.
define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 0)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; SWMMAC f32 = fp8 x fp8 with the index supplied directly as an i16 argument. Expects one v_swmmac_f32_16x16x32_fp8_fp8 writing %C's registers (v[6:13]) with the index in v14 and no index_key modifier.
define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; SWMMAC f32 = fp8 x bf8 with the index supplied directly as an i16 argument. Expects one v_swmmac_f32_16x16x32_fp8_bf8 writing %C's registers (v[6:13]) with the index in v14 and no index_key modifier.
define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; SWMMAC f32 = bf8 x fp8 with the index supplied directly as an i16 argument. Expects one v_swmmac_f32_16x16x32_bf8_fp8 writing %C's registers (v[6:13]) with the index in v14 and no index_key modifier.
define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; SWMMAC f32 = bf8 x bf8 with the index supplied directly as an i16 argument. Expects one v_swmmac_f32_16x16x32_bf8_bf8 writing %C's registers (v[6:13]) with the index in v14 and no index_key modifier.
define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half>, <8 x half>, <8 x float>)
|
||||
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16>, <8 x i16>, <8 x float>)
|
||||
declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half>, <8 x half>, <8 x half>, i1 immarg)
|
||||
declare <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16>, <8 x i16>, <8 x i16>, i1 immarg)
|
||||
declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg)
|
||||
declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 immarg, i32, i1 immarg, i32, <8 x i32>, i1 immarg)
|
||||
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
|
||||
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
|
||||
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
|
||||
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
|
||||
declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg)
|
||||
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half>, <16 x half>, <8 x float>, i16)
|
||||
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16>, <16 x i16>, <8 x float>, i16)
|
||||
declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half>, <16 x half>, <8 x half>, i16)
|
||||
declare <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16>, <16 x i16>, <8 x i16>, i16)
|
||||
declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16 %Index, i1 immarg)
|
||||
declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <8 x i32>, i16 %Index, i1 immarg)
|
||||
declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i32 %Index, i1 immarg)
|
||||
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
|
||||
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
|
||||
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
|
||||
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
|
@ -0,0 +1,459 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
|
||||
|
||||
; fneg on the A operand is expected to fold into the WMMA as neg_lo/neg_hi on
; source 0 (see the CHECK lines). Uses opaque pointer syntax throughout.
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %fneg.A = fneg <4 x half> %A
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %fneg.A, <4 x half> %B, <4 x float> %C)
  store <4 x float> %res, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; fneg on the B operand is expected to fold into the WMMA as neg_lo/neg_hi on
; source 1 (see the CHECK lines). Uses opaque pointer syntax throughout.
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negB(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negB:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] neg_hi:[0,1,0]
; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %fneg.B = fneg <4 x half> %B
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %fneg.B, <4 x float> %C)
  store <4 x float> %res, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; fneg on the f32 accumulator C is expected to fold into neg_lo on source 2
; (see the CHECK lines). Uses opaque pointer syntax throughout.
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negC(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1]
; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %fneg.C = fneg <4 x float> %C
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %fneg.C)
  store <4 x float> %res, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; fabs on the f32 accumulator C is expected to fold into neg_hi on source 2
; (see the CHECK lines). Uses opaque pointer syntax throughout.
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_absC(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_absC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1]
; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %fabs.C)
  store <4 x float> %res, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; bf16 inputs (passed as <4 x i16>): fneg on the f32 accumulator C is expected
; to fold into neg_lo on source 2. Uses opaque pointer syntax throughout.
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_negC(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_negC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1]
; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %fneg.C = fneg <4 x float> %C
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> %fneg.C)
  store <4 x float> %res, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; bf16 inputs (passed as <4 x i16>): fabs on the f32 accumulator C is expected
; to fold into neg_hi on source 2. Uses opaque pointer syntax throughout.
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_absC(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_absC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1]
; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> %fabs.C)
  store <4 x float> %res, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; f16 result variant: fneg on A is expected to fold into neg_lo/neg_hi on
; source 0 (see the CHECK lines). Uses opaque pointer syntax throughout.
; NOTE(review): the intrinsic suffix says v8f16 but the operands are <4 x half>;
; confirm the intended name mangling against the declaration.
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negA(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negA:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %fneg.A = fneg <4 x half> %A
  %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %fneg.A, <4 x half> %B, <4 x half> %C, i1 0)
  store <4 x half> %res, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; f16 result variant: fneg on B is expected to fold into neg_lo/neg_hi on
; source 1 (see the CHECK lines). Uses opaque pointer syntax throughout.
; NOTE(review): the intrinsic suffix says v8f16 but the operands are <4 x half>;
; confirm the intended name mangling against the declaration.
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0] neg_hi:[0,1,0]
; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %fneg.B = fneg <4 x half> %B
  %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %fneg.B, <4 x half> %C, i1 0)
  store <4 x half> %res, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; f16 result variant: fneg on the f16 accumulator C is expected to fold into
; neg_lo on source 2 (see the CHECK lines). Uses opaque pointer syntax.
; NOTE(review): the intrinsic suffix says v8f16 but the operands are <4 x half>;
; confirm the intended name mangling against the declaration.
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1]
; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %fneg.C = fneg <4 x half> %C
  %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fneg.C, i1 0)
  store <4 x half> %res, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; f16 result variant: fabs on the f16 accumulator C is expected to fold into
; neg_hi on source 2 (see the CHECK lines). Uses opaque pointer syntax.
; NOTE(review): the intrinsic suffix says v8f16 but the operands are <4 x half>;
; confirm the intended name mangling against the declaration.
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_absC(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_absC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1]
; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %fabs.C = call <4 x half> @llvm.fabs.v4f16(<4 x half> %C)
  %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fabs.C, i1 0)
  store <4 x half> %res, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; fp8 x fp8 inputs packed in i32: fneg on the f32 accumulator C is expected to
; fold into neg_lo on source 2. Uses opaque pointer syntax throughout.
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_negC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_negC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %fneg.C = fneg <4 x float> %C
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fneg.C)
  store <4 x float> %res, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; fp8 x fp8 inputs packed in i32: fabs on the f32 accumulator C is expected to
; fold into neg_hi on source 2. Uses opaque pointer syntax throughout.
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_absC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_absC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fabs.C)
  store <4 x float> %res, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; bf8 x fp8 inputs packed in i32: fneg on the f32 accumulator C is expected to
; fold into neg_lo on source 2. Uses opaque pointer syntax throughout.
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_negC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_negC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %fneg.C = fneg <4 x float> %C
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fneg.C)
  store <4 x float> %res, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; bf8 x fp8 inputs packed in i32: fabs on the f32 accumulator C is expected to
; fold into neg_hi on source 2. Uses opaque pointer syntax throughout.
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_absC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_absC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fabs.C)
  store <4 x float> %res, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; fp8 x bf8 inputs packed in i32: fneg on the f32 accumulator C is expected to
; fold into neg_lo on source 2. Uses opaque pointer syntax throughout.
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_negC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_negC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %fneg.C = fneg <4 x float> %C
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fneg.C)
  store <4 x float> %res, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; fp8 x bf8 inputs packed in i32: fabs on the f32 accumulator C is expected to
; fold into neg_hi on source 2. Uses opaque pointer syntax throughout.
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_absC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_absC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fabs.C)
  store <4 x float> %res, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; bf8 x bf8 inputs packed in i32: fneg on the f32 accumulator C is expected to
; fold into neg_lo on source 2. Uses opaque pointer syntax throughout.
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_negC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_negC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %fneg.C = fneg <4 x float> %C
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fneg.C)
  store <4 x float> %res, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; bf8 x bf8 inputs packed in i32: fabs on the f32 accumulator C is expected to
; fold into neg_hi on source 2. Uses opaque pointer syntax throughout.
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_absC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_absC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fabs.C)
  store <4 x float> %res, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; SWMMAC with f32 accumulator: fneg on A is expected to fold into
; neg_lo/neg_hi on source 0 (see the CHECK lines). Uses opaque pointer syntax.
define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negA(<4 x half> %A, <8 x half> %B, <4 x float> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negA:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX12-NEXT: global_store_b128 v[11:12], v[6:9], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %fneg.A = fneg <4 x half> %A
  %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f16.v8f16.v4f32.i16(<4 x half> %fneg.A, <8 x half> %B, <4 x float> %C, i16 %Index)
  store <4 x float> %res, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; SWMMAC with f32 accumulator: fneg on B is expected to fold into
; neg_lo/neg_hi on source 1 (see the CHECK lines). Uses opaque pointer syntax.
define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negB(<4 x half> %A, <8 x half> %B, <4 x float> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negB:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0] neg_hi:[0,1,0]
; GFX12-NEXT: global_store_b128 v[11:12], v[6:9], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %fneg.B = fneg <8 x half> %B
  %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f16.v8f16.v4f32.i16(<4 x half> %A, <8 x half> %fneg.B, <4 x float> %C, i16 %Index)
  store <4 x float> %res, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; SWMMAC with f16 accumulator: fneg on A is expected to fold into
; neg_lo/neg_hi on source 0 (see the CHECK lines). Uses opaque pointer syntax.
define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negA(<4 x half> %A, <8 x half> %B, <4 x half> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negA:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX12-NEXT: global_store_b64 v[9:10], v[6:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %fneg.A = fneg <4 x half> %A
  %res = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v8f16.v4f16.i16(<4 x half> %fneg.A, <8 x half> %B, <4 x half> %C, i16 %Index)
  store <4 x half> %res, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; SWMMAC with f16 accumulator: fneg on B is expected to fold into
; neg_lo/neg_hi on source 1 (see the CHECK lines). Uses opaque pointer syntax.
define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negB(<4 x half> %A, <8 x half> %B, <4 x half> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negB:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0] neg_hi:[0,1,0]
; GFX12-NEXT: global_store_b64 v[9:10], v[6:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %fneg.B = fneg <8 x half> %B
  %res = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v8f16.v4f16.i16(<4 x half> %A, <8 x half> %fneg.B, <4 x half> %C, i16 %Index)
  store <4 x half> %res, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; Combined neg and abs patterns (WMMA matrix C of type f32 or f16)
|
||||
|
||||
; fneg(fabs(C)) on the f32 accumulator is expected to fold into both neg_lo and
; neg_hi on source 2 (see the CHECK lines). Uses opaque pointer syntax.
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negabsC(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negabsC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] neg_hi:[0,0,1]
; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
  %fneg.fabs.C = fneg <4 x float> %fabs.C
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %fneg.fabs.C)
  store <4 x float> %res, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; fneg(fabs(C)) on the f16 accumulator is expected to fold into both neg_lo and
; neg_hi on source 2 (see the CHECK lines). Uses opaque pointer syntax.
; NOTE(review): the intrinsic suffix says v8f16 but the operands are <4 x half>;
; confirm the intended name mangling against the declaration.
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negabsC(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negabsC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] neg_hi:[0,0,1]
; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %fabs.C = call <4 x half> @llvm.fabs.v4f16(<4 x half> %C)
  %fneg.fabs.C = fneg <4 x half> %fabs.C
  %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fneg.fabs.C, i1 0)
  store <4 x half> %res, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; fabs is applied only to element 3 of C before the whole-vector fneg. Per the
; CHECK lines the partial fabs stays a separate v_and_b32 and only the fneg is
; folded into neg_lo on source 2. Uses opaque pointer syntax throughout.
; NOTE(review): the function name says "fabsA" but the modified operand is C;
; the name is kept because the CHECK-LABEL depends on it.
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_neg_partial_fabsA(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_neg_partial_fabsA:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_and_b32_e32 v7, 0x7fffffff, v7
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1]
; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %el3 = extractelement <4 x float> %C, i32 3
  %el3.fabs = call float @llvm.fabs.f32(float %el3)
  %partial.fabs.C = insertelement <4 x float> %C, float %el3.fabs, i32 3
  %fneg.partial.fabs.C = fneg <4 x float> %partial.fabs.C
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %fneg.partial.fabs.C)
  store <4 x float> %res, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; A or B matrix modifier and constant in C
|
||||
|
||||
; fneg on A combined with a splat-1.0 C: the CHECK lines expect C as the inline
; constant 1.0 and the fneg as neg_lo/neg_hi on source 0. %C is unused.
; Uses opaque pointer syntax throughout.
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA_constantC(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA_constantC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], 1.0 neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %fneg.A = fneg <4 x half> %A
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %fneg.A, <4 x half> %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
  store <4 x float> %res, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; fneg on B combined with a splat-1.0 f16 C: the CHECK lines expect C as the
; inline constant 1.0 and the fneg as neg_lo/neg_hi on source 1. %C is unused.
; Uses opaque pointer syntax throughout.
; NOTE(review): the intrinsic suffix says v8f16 but the operands are <4 x half>;
; confirm the intended name mangling against the declaration.
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB_constantC(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB_constantC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], 1.0 neg_lo:[0,1,0] neg_hi:[0,1,0]
; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %fneg.B = fneg <4 x half> %B
  %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %fneg.B, <4 x half> <half 1.0, half 1.0, half 1.0, half 1.0>, i1 0)
  store <4 x half> %res, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; f16 elements of C must be repacked since they don't come from the same b32 (GlobalISel emits v_and_b32 + v_lshl_or_b32 here rather than v_perm_b32)
|
||||
|
||||
; C is built from the even elements of a loaded <8 x half>, then negated. Per
; the CHECK lines the halves are repacked with v_and_b32 + v_lshl_or_b32 and the
; fneg is folded into neg_lo on source 2.
; Uses opaque pointer syntax, and poison instead of undef for the unused
; shufflevector operand (the mask only selects from the first vector).
; NOTE(review): the intrinsic suffix says v8f16 but the operands are <4 x half>;
; confirm the intended name mangling against the declaration.
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC_pack(<4 x half> %A, <4 x half> %B, ptr %Caddr, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC_pack:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: flat_load_b128 v[8:11], v[4:5]
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v8
; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v10
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_lshl_or_b32 v4, v9, 16, v4
; GFX12-NEXT: v_lshl_or_b32 v5, v11, 16, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1]
; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %C = load <8 x half>, ptr %Caddr
  %C_shuffle = shufflevector <8 x half> %C, <8 x half> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %fneg.C_shuffle = fneg <4 x half> %C_shuffle
  %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fneg.C_shuffle, i1 0)
  store <4 x half> %res, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; Generic floating-point intrinsics used to build the abs patterns.
declare <4 x half> @llvm.fabs.v4f16(<4 x half>)
declare <4 x float> @llvm.fabs.v4f32(<4 x float>)
declare float @llvm.fabs.f32(float)

; WMMA intrinsics with f16 / bf16 inputs.
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half>, <4 x half>, <4 x float>)
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4i16.v4f32(<4 x i16>, <4 x i16>, <4 x float>)
declare <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half>, <4 x half>, <4 x half>, i1 immarg)

; WMMA intrinsics with fp8 / bf8 inputs packed in i32.
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.i32.v4f32(i32, i32, <4 x float>)
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.i32.v4f32(i32, i32, <4 x float>)
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.i32.v4f32(i32, i32, <4 x float>)
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.i32.v4f32(i32, i32, <4 x float>)

; SWMMAC intrinsics; the trailing i16 operand is the index.
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f16.v8f16.v4f32.i16(<4 x half>, <8 x half>, <4 x float>, i16)
declare <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v8f16.v4f16.i16(<4 x half>, <8 x half>, <4 x half>, i16)
|
430
llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-imm.ll
Normal file
430
llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-imm.ll
Normal file
@ -0,0 +1,430 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
|
||||
|
||||
; A splat of 1.0 for C is expected to be encoded directly as the inline
; constant operand of the WMMA (see the CHECK lines).
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], 1.0
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %wmma = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
  store <4 x float> %wmma, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; A splat of 3.0 for C is not used as an inline constant: the CHECK lines
; expect it to be materialized via s_mov_b32 / v_mov_b32 before the WMMA.
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm_non_inlineable(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm_non_inlineable:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: s_mov_b32 s0, 0x40400000
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_mov_b32 s3, s0
; GFX12-NEXT: s_mov_b32 s1, s0
; GFX12-NEXT: s_mov_b32 s2, s0
; GFX12-NEXT: v_mov_b32_e32 v9, s3
; GFX12-NEXT: v_mov_b32_e32 v8, s2
; GFX12-NEXT: v_mov_b32_e32 v7, s1
; GFX12-NEXT: v_mov_b32_e32 v6, s0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], v[6:9]
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %wmma = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
  store <4 x float> %wmma, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; bf16 inputs with a splat-1.0 f32 C: the CHECK lines expect C to be encoded
; as the inline constant 1.0.
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[6:9], v[0:1], v[2:3], 1.0
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %wmma = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
  store <4 x float> %wmma, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; bf16 inputs with a splat-3.0 f32 C: the CHECK lines expect the constant to be
; materialized via s_mov_b32 / v_mov_b32 before the WMMA.
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm_non_inlineable(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm_non_inlineable:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: s_mov_b32 s0, 0x40400000
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_mov_b32 s3, s0
; GFX12-NEXT: s_mov_b32 s1, s0
; GFX12-NEXT: s_mov_b32 s2, s0
; GFX12-NEXT: v_mov_b32_e32 v9, s3
; GFX12-NEXT: v_mov_b32_e32 v8, s2
; GFX12-NEXT: v_mov_b32_e32 v7, s1
; GFX12-NEXT: v_mov_b32_e32 v6, s0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[6:9], v[0:1], v[2:3], v[6:9]
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %wmma = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
  store <4 x float> %wmma, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; f16 accumulator with a splat-1.0 C: the CHECK lines expect C to be encoded
; as the inline constant 1.0.
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], 1.0
; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %wmma = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half> %A, <4 x half> %B, <4 x half> <half 1.0, half 1.0, half 1.0, half 1.0>, i1 0)
  store <4 x half> %wmma, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; f16 accumulator with a splat-3.0 C: the CHECK lines expect the packed
; constant 0x42004200 to be materialized in registers before the WMMA.
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm_non_inlineable(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm_non_inlineable:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: s_mov_b32 s0, 0x42004200
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_mov_b32 s1, s0
; GFX12-NEXT: v_mov_b32_e32 v7, s1
; GFX12-NEXT: v_mov_b32_e32 v6, s0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], v[6:7]
; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %wmma = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half> %A, <4 x half> %B, <4 x half> <half 3.0, half 3.0, half 3.0, half 3.0>, i1 0)
  store <4 x half> %wmma, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; bf16 accumulator passed as <4 x i16>: the i16 splat 16256 (0x3f80) is not
; used as an inline operand; the CHECK lines expect 0x3f803f80 to be
; materialized in registers before the WMMA.
define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: s_mov_b32 s0, 0x3f803f80
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_mov_b32 s1, s0
; GFX12-NEXT: v_mov_b32_e32 v7, s1
; GFX12-NEXT: v_mov_b32_e32 v6, s0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[6:7], v[0:1], v[2:3], v[6:7]
; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %wmma = call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> <i16 16256, i16 16256, i16 16256, i16 16256>, i1 0)
  store <4 x i16> %wmma, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; bf16 accumulator passed as <4 x i16>: the i16 splat 16320 (0x3fc0) is
; expected to be materialized as 0x3fc03fc0 in registers before the WMMA
; (see the CHECK lines).
define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm_non_inlineable(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm_non_inlineable:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: s_mov_b32 s0, 0x3fc03fc0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_mov_b32 s1, s0
; GFX12-NEXT: v_mov_b32_e32 v7, s1
; GFX12-NEXT: v_mov_b32_e32 v6, s0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[6:7], v[0:1], v[2:3], v[6:7]
; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %wmma = call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> <i16 16320, i16 16320, i16 16320, i16 16320>, i1 0)
  store <4 x i16> %wmma, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; Constant-accumulator case for the iu8 WMMA: C is the splat <4 x i32> of 1 and
; all three i1 immediates are 0. The expected assembly encodes C directly as
; the literal operand `1` on the WMMA, with no register setup beforehand.
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm(i32 %A, i32 %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:7], v0, v1, 1
|
||||
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 1, i32 1, i32 1, i32 1>, i1 0)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; iu8 WMMA with C as the splat <4 x i32> of 128. Unlike the splat-of-1 variant,
; the expected assembly does not place the constant on the WMMA itself: it
; loads 0x80 into s0, replicates it to s1-s3, copies the four values into
; v[4:7], and passes v[4:7] as C (also the destination).
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm_non_inlineable(i32 %A, i32 %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm_non_inlineable:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: s_movk_i32 s0, 0x80
|
||||
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX12-NEXT: s_mov_b32 s3, s0
|
||||
; GFX12-NEXT: s_mov_b32 s1, s0
|
||||
; GFX12-NEXT: s_mov_b32 s2, s0
|
||||
; GFX12-NEXT: v_mov_b32_e32 v7, s3
|
||||
; GFX12-NEXT: v_mov_b32_e32 v6, s2
|
||||
; GFX12-NEXT: v_mov_b32_e32 v5, s1
|
||||
; GFX12-NEXT: v_mov_b32_e32 v4, s0
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:7], v0, v1, v[4:7]
|
||||
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 128, i32 128, i32 128, i32 128>, i1 0)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Constant-accumulator case for the 16x16x16 iu4 WMMA: C is the splat
; <4 x i32> of 1 and all i1 immediates are 0. The expected assembly carries C
; as the literal operand `1` on the WMMA instruction.
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm(i32 %A, i32 %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v0, v1, 1
|
||||
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 1, i32 1, i32 1, i32 1>, i1 0)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; 16x16x16 iu4 WMMA with C as the splat <4 x i32> of 128. The expected assembly
; loads 0x80 into s0, replicates it to s1-s3, copies the values into v[4:7],
; and passes v[4:7] as the C operand and destination instead of putting a
; literal on the WMMA.
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm_non_inlineable(i32 %A, i32 %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm_non_inlineable:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: s_movk_i32 s0, 0x80
|
||||
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX12-NEXT: s_mov_b32 s3, s0
|
||||
; GFX12-NEXT: s_mov_b32 s1, s0
|
||||
; GFX12-NEXT: s_mov_b32 s2, s0
|
||||
; GFX12-NEXT: v_mov_b32_e32 v7, s3
|
||||
; GFX12-NEXT: v_mov_b32_e32 v6, s2
|
||||
; GFX12-NEXT: v_mov_b32_e32 v5, s1
|
||||
; GFX12-NEXT: v_mov_b32_e32 v4, s0
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v0, v1, v[4:7]
|
||||
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 128, i32 128, i32 128, i32 128>, i1 0)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Constant-accumulator case for the fp8 x fp8 WMMA: the call passes the splat
; <4 x float> of 1.0 as C, and the expected assembly carries it as the literal
; operand `1.0` on the WMMA. The %C parameter is declared but not referenced by
; the body.
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:7], v0, v1, 1.0
|
||||
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
|
||||
store <4 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; fp8 x fp8 WMMA with C as the splat <4 x float> of 3.0. The expected assembly
; loads the bit pattern 0x40400000 into s0, replicates it to s1-s3, copies the
; values into v[4:7], and passes v[4:7] as C and destination. The %C parameter
; is declared but not referenced by the body.
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: s_mov_b32 s0, 0x40400000
|
||||
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX12-NEXT: s_mov_b32 s3, s0
|
||||
; GFX12-NEXT: s_mov_b32 s1, s0
|
||||
; GFX12-NEXT: s_mov_b32 s2, s0
|
||||
; GFX12-NEXT: v_mov_b32_e32 v7, s3
|
||||
; GFX12-NEXT: v_mov_b32_e32 v6, s2
|
||||
; GFX12-NEXT: v_mov_b32_e32 v5, s1
|
||||
; GFX12-NEXT: v_mov_b32_e32 v4, s0
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:7], v0, v1, v[4:7]
|
||||
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
|
||||
store <4 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Constant-accumulator case for the bf8 x fp8 WMMA: the call passes the splat
; <4 x float> of 1.0 as C, and the expected assembly carries it as the literal
; operand `1.0` on the WMMA. The %C parameter is declared but not referenced by
; the body.
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:7], v0, v1, 1.0
|
||||
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
|
||||
store <4 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; bf8 x fp8 WMMA with C as the splat <4 x float> of 3.0. The expected assembly
; loads the bit pattern 0x40400000 into s0, replicates it to s1-s3, copies the
; values into v[4:7], and passes v[4:7] as C and destination. The %C parameter
; is declared but not referenced by the body.
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: s_mov_b32 s0, 0x40400000
|
||||
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX12-NEXT: s_mov_b32 s3, s0
|
||||
; GFX12-NEXT: s_mov_b32 s1, s0
|
||||
; GFX12-NEXT: s_mov_b32 s2, s0
|
||||
; GFX12-NEXT: v_mov_b32_e32 v7, s3
|
||||
; GFX12-NEXT: v_mov_b32_e32 v6, s2
|
||||
; GFX12-NEXT: v_mov_b32_e32 v5, s1
|
||||
; GFX12-NEXT: v_mov_b32_e32 v4, s0
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:7], v0, v1, v[4:7]
|
||||
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
|
||||
store <4 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Constant-accumulator case for the fp8 x bf8 WMMA: the call passes the splat
; <4 x float> of 1.0 as C, and the expected assembly carries it as the literal
; operand `1.0` on the WMMA. The %C parameter is declared but not referenced by
; the body.
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:7], v0, v1, 1.0
|
||||
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
|
||||
store <4 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; fp8 x bf8 WMMA with C as the splat <4 x float> of 3.0. The expected assembly
; loads the bit pattern 0x40400000 into s0, replicates it to s1-s3, copies the
; values into v[4:7], and passes v[4:7] as C and destination. The %C parameter
; is declared but not referenced by the body.
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: s_mov_b32 s0, 0x40400000
|
||||
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX12-NEXT: s_mov_b32 s3, s0
|
||||
; GFX12-NEXT: s_mov_b32 s1, s0
|
||||
; GFX12-NEXT: s_mov_b32 s2, s0
|
||||
; GFX12-NEXT: v_mov_b32_e32 v7, s3
|
||||
; GFX12-NEXT: v_mov_b32_e32 v6, s2
|
||||
; GFX12-NEXT: v_mov_b32_e32 v5, s1
|
||||
; GFX12-NEXT: v_mov_b32_e32 v4, s0
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:7], v0, v1, v[4:7]
|
||||
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
|
||||
store <4 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Constant-accumulator case for the bf8 x bf8 WMMA: the call passes the splat
; <4 x float> of 1.0 as C, and the expected assembly carries it as the literal
; operand `1.0` on the WMMA. The %C parameter is declared but not referenced by
; the body.
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:7], v0, v1, 1.0
|
||||
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
|
||||
store <4 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; bf8 x bf8 WMMA with C as the splat <4 x float> of 3.0. The expected assembly
; loads the bit pattern 0x40400000 into s0, replicates it to s1-s3, copies the
; values into v[4:7], and passes v[4:7] as C and destination. The %C parameter
; is declared but not referenced by the body.
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: s_mov_b32 s0, 0x40400000
|
||||
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX12-NEXT: s_mov_b32 s3, s0
|
||||
; GFX12-NEXT: s_mov_b32 s1, s0
|
||||
; GFX12-NEXT: s_mov_b32 s2, s0
|
||||
; GFX12-NEXT: v_mov_b32_e32 v7, s3
|
||||
; GFX12-NEXT: v_mov_b32_e32 v6, s2
|
||||
; GFX12-NEXT: v_mov_b32_e32 v5, s1
|
||||
; GFX12-NEXT: v_mov_b32_e32 v4, s0
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:7], v0, v1, v[4:7]
|
||||
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
|
||||
store <4 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Constant-accumulator case for the 16x16x32 iu4 WMMA: the call passes the
; splat <4 x i32> of 1 as C with all i1 immediates 0, and the expected assembly
; carries C as the literal operand `1`. The %C parameter is declared but not
; referenced by the body.
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:7], v0, v1, 1
|
||||
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 1, i32 1, i32 1, i32 1>, i1 0)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; 16x16x32 iu4 WMMA with C as the splat <4 x i32> of 128. The expected assembly
; loads 0x80 into s0, replicates it to s1-s3, copies the values into v[4:7],
; and passes v[4:7] as C and destination. The %C parameter is declared but not
; referenced by the body.
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm_non_inlineable(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm_non_inlineable:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: s_movk_i32 s0, 0x80
|
||||
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX12-NEXT: s_mov_b32 s3, s0
|
||||
; GFX12-NEXT: s_mov_b32 s1, s0
|
||||
; GFX12-NEXT: s_mov_b32 s2, s0
|
||||
; GFX12-NEXT: v_mov_b32_e32 v7, s3
|
||||
; GFX12-NEXT: v_mov_b32_e32 v6, s2
|
||||
; GFX12-NEXT: v_mov_b32_e32 v5, s1
|
||||
; GFX12-NEXT: v_mov_b32_e32 v4, s0
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:7], v0, v1, v[4:7]
|
||||
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 128, i32 128, i32 128, i32 128>, i1 0)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Declarations of the WMMA intrinsics. The type suffix on each name spells out
; the result type followed by the types of the A, B and C operands; the i1
; operands are not part of the suffix and are marked immarg, so call sites must
; pass them as constants.
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half>, <4 x half>, <4 x float>)
|
||||
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16>, <4 x i16>, <4 x float>)
|
||||
declare <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half>, <4 x half>, <4 x half>, i1 immarg)
|
||||
declare <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16>, <4 x i16>, <4 x i16>, i1 immarg)
|
||||
declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg)
|
||||
declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg)
|
||||
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>)
|
||||
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>)
|
||||
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>)
|
||||
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>)
|
||||
declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg)
|
@ -0,0 +1,274 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
|
||||
|
||||
; iu8 WMMA with only the first i1 immediate (the one preceding %A) set to 1.
; The expected assembly shows this as the neg_lo:[1,0,0] modifier; C (v[2:5])
; is accumulated in place and stored to %out.
; NOTE(review): the name says "zext" while the flag being set is 1 - confirm
; the intended meaning of the flag against the intrinsic definition.
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src0(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src0:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0]
|
||||
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; iu8 WMMA with only the second i1 immediate (the one preceding %B) set to 1.
; The expected assembly shows this as the neg_lo:[0,1,0] modifier; C (v[2:5])
; is accumulated in place and stored to %out.
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src1(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src1:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0]
|
||||
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i1 0)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; iu8 WMMA with only the trailing i1 immediate set to 1. The expected assembly
; shows this as the `clamp` modifier on the instruction, with no neg_lo.
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_clamp(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_clamp:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] clamp
|
||||
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 1)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
|
||||
|
||||
; 16x16x16 iu4 WMMA with only the first i1 immediate (preceding %A) set to 1.
; The expected assembly shows this as neg_lo:[1,0,0]; C (v[2:5]) is accumulated
; in place and stored to %out.
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src0(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src0:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0]
|
||||
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; 16x16x16 iu4 WMMA with only the second i1 immediate (preceding %B) set to 1.
; The expected assembly shows this as neg_lo:[0,1,0]; C (v[2:5]) is accumulated
; in place and stored to %out.
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src1(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src1:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0]
|
||||
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i1 0)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; 16x16x16 iu4 WMMA with only the trailing i1 immediate set to 1. The expected
; assembly shows this as the `clamp` modifier, with no neg_lo.
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_clamp(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_clamp:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] clamp
|
||||
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 1)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
|
||||
|
||||
; 16x16x32 iu4 WMMA with only the first i1 immediate (preceding %A) set to 1.
; The expected assembly shows this as neg_lo:[1,0,0]; C (v[2:5]) is accumulated
; in place and stored to %out.
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src0(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src0:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0]
|
||||
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; 16x16x32 iu4 WMMA with only the second i1 immediate (preceding %B) set to 1.
; The expected assembly shows this as neg_lo:[0,1,0]; C (v[2:5]) is accumulated
; in place and stored to %out.
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src1(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src1:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0]
|
||||
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i1 0)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; 16x16x32 iu4 WMMA with only the trailing i1 immediate set to 1. The expected
; assembly shows this as the `clamp` modifier, with no neg_lo.
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_clamp(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_clamp:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] clamp
|
||||
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 1)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
; iu8 SWMMAC (A is i32, B is <2 x i32>, index is i8) with only the first i1
; immediate (preceding %A) set to 1. The expected assembly shows neg_lo:[1,0,0],
; takes the index in v7, and writes the result over the C registers v[3:6].
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src0(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src0:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0]
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 1, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 0)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; iu8 SWMMAC (A is i32, B is <2 x i32>, index is i8) with only the second i1
; immediate (preceding %B) set to 1. The expected assembly shows
; neg_lo:[0,1,0], takes the index in v7, and writes the result over v[3:6].
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src1(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src1:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0]
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 1, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 0)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; iu8 SWMMAC (A is i32, B is <2 x i32>, index is i8) with only the trailing i1
; immediate set to 1. The expected assembly shows the `clamp` modifier, takes
; the index in v7, and writes the result over the C registers v[3:6].
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_clamp(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_clamp:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 clamp
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 1)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
|
||||
|
||||
; 16x16x32 iu4 SWMMAC (A and B are i32, index is i16) with only the first i1
; immediate (preceding %A) set to 1. The expected assembly shows
; neg_lo:[1,0,0], takes the index in v6, and writes the result over v[2:5].
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src0(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src0:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[1,0,0]
|
||||
; GFX12-NEXT: global_store_b128 v[7:8], v[2:5], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index, i1 0)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; 16x16x32 iu4 SWMMAC (A and B are i32, index is i16) with only the second i1
; immediate (preceding %B) set to 1. The expected assembly shows
; neg_lo:[0,1,0], takes the index in v6, and writes the result over v[2:5].
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src1(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src1:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[0,1,0]
|
||||
; GFX12-NEXT: global_store_b128 v[7:8], v[2:5], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i16 %Index, i1 0)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; 16x16x32 iu4 SWMMAC (A and B are i32, index is i16) with only the trailing i1
; immediate set to 1. The expected assembly shows the `clamp` modifier, takes
; the index in v6, and writes the result over the C registers v[2:5].
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_clamp(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_clamp:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 clamp
|
||||
; GFX12-NEXT: global_store_b128 v[7:8], v[2:5], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index, i1 1)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
|
||||
|
||||
; 16x16x64 iu4 SWMMAC (A is i32, B is <2 x i32>, index is i16) with only the
; first i1 immediate (preceding %A) set to 1. The expected assembly shows
; neg_lo:[1,0,0], takes the index in v7, and writes the result over v[3:6].
define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src0(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src0:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0]
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 1, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 0)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; 16x16x64 iu4 SWMMAC (A is i32, B is <2 x i32>, index is i16) with only the
; second i1 immediate (preceding %B) set to 1. The expected assembly shows
; neg_lo:[0,1,0], takes the index in v7, and writes the result over v[3:6].
define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src1(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src1:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0]
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 1, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 0)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; 16x16x64 iu4 SWMMAC (A is i32, B is <2 x i32>, index is i16) with only the
; trailing i1 immediate set to 1. The expected assembly shows the `clamp`
; modifier, takes the index in v7, and writes the result over v[3:6].
define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_clamp(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_clamp:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 clamp
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 1)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Declarations of the integer WMMA and SWMMAC intrinsics called by the
; neg_lo/clamp tests. The i1 operands are immarg, so call sites must pass them
; as constants. The SWMMAC variants take an extra index operand (i8 or i16,
; named %Index here) after C, and its type appears as the last suffix of the
; intrinsic name.
declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg)
|
||||
declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg)
|
||||
declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg)
|
||||
declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i8 %Index, i1 immarg)
|
||||
declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i16 %Index, i1 immarg)
|
||||
declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i16 %Index, i1 immarg)
|
@ -0,0 +1,472 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
|
||||
|
||||
; Loads a <4 x i8> index vector and passes each of its four elements as the index
; operand of a separate swmmac.f32.16x16x32.f16 call, storing each result to its
; own output pointer. The expected code loads the vector as a single dword and
; selects elements 1-3 with index_key:1..3 (element 0 needs no modifier).
define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_index_key(<4 x half> %A, <8 x half> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
|
||||
; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_index_key:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: global_load_b32 v10, v[10:11], off
|
||||
; GFX12-NEXT: v_mov_b32_e32 v23, v9
|
||||
; GFX12-NEXT: v_mov_b32_e32 v22, v8
|
||||
; GFX12-NEXT: v_mov_b32_e32 v21, v7
|
||||
; GFX12-NEXT: v_mov_b32_e32 v20, v6
|
||||
; GFX12-NEXT: v_mov_b32_e32 v27, v9
|
||||
; GFX12-NEXT: v_mov_b32_e32 v26, v8
|
||||
; GFX12-NEXT: v_mov_b32_e32 v25, v7
|
||||
; GFX12-NEXT: v_mov_b32_e32 v24, v6
|
||||
; GFX12-NEXT: v_mov_b32_e32 v31, v9
|
||||
; GFX12-NEXT: v_mov_b32_e32 v30, v8
|
||||
; GFX12-NEXT: v_mov_b32_e32 v29, v7
|
||||
; GFX12-NEXT: v_mov_b32_e32 v28, v6
|
||||
; GFX12-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[20:23], v[0:1], v[2:5], v10
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[24:27], v[0:1], v[2:5], v10 index_key:1
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[28:31], v[0:1], v[2:5], v10 index_key:2
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:3
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[20:23], off
|
||||
; GFX12-NEXT: global_store_b128 v[14:15], v[24:27], off
|
||||
; GFX12-NEXT: global_store_b128 v[16:17], v[28:31], off
|
||||
; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
|
||||
%Index0 = extractelement <4 x i8> %IndexVec, i32 0
|
||||
%res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index0)
|
||||
store <4 x float> %res0, ptr addrspace(1) %out0
|
||||
%Index1 = extractelement <4 x i8> %IndexVec, i32 1
|
||||
%res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index1)
|
||||
store <4 x float> %res1, ptr addrspace(1) %out1
|
||||
%Index2 = extractelement <4 x i8> %IndexVec, i32 2
|
||||
%res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index2)
|
||||
store <4 x float> %res2, ptr addrspace(1) %out2
|
||||
%Index3 = extractelement <4 x i8> %IndexVec, i32 3
|
||||
%res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index3)
|
||||
store <4 x float> %res3, ptr addrspace(1) %out3
|
||||
ret void
|
||||
}
|
||||
|
||||
; Loads a <4 x i8> index vector and passes each of its four elements as the index
; operand of a separate swmmac.f32.16x16x32.bf16 call (A and B are i16 vectors),
; storing each result to its own output pointer. The expected code loads the
; vector as a single dword and selects elements 1-3 with index_key:1..3.
define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16_index_key(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
|
||||
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16_index_key:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: global_load_b32 v10, v[10:11], off
|
||||
; GFX12-NEXT: v_mov_b32_e32 v23, v9
|
||||
; GFX12-NEXT: v_mov_b32_e32 v22, v8
|
||||
; GFX12-NEXT: v_mov_b32_e32 v21, v7
|
||||
; GFX12-NEXT: v_mov_b32_e32 v20, v6
|
||||
; GFX12-NEXT: v_mov_b32_e32 v27, v9
|
||||
; GFX12-NEXT: v_mov_b32_e32 v26, v8
|
||||
; GFX12-NEXT: v_mov_b32_e32 v25, v7
|
||||
; GFX12-NEXT: v_mov_b32_e32 v24, v6
|
||||
; GFX12-NEXT: v_mov_b32_e32 v31, v9
|
||||
; GFX12-NEXT: v_mov_b32_e32 v30, v8
|
||||
; GFX12-NEXT: v_mov_b32_e32 v29, v7
|
||||
; GFX12-NEXT: v_mov_b32_e32 v28, v6
|
||||
; GFX12-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[20:23], v[0:1], v[2:5], v10
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[24:27], v[0:1], v[2:5], v10 index_key:1
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[28:31], v[0:1], v[2:5], v10 index_key:2
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:3
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[20:23], off
|
||||
; GFX12-NEXT: global_store_b128 v[14:15], v[24:27], off
|
||||
; GFX12-NEXT: global_store_b128 v[16:17], v[28:31], off
|
||||
; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
|
||||
%Index0 = extractelement <4 x i8> %IndexVec, i32 0
|
||||
%res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index0)
|
||||
store <4 x float> %res0, ptr addrspace(1) %out0
|
||||
%Index1 = extractelement <4 x i8> %IndexVec, i32 1
|
||||
%res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index1)
|
||||
store <4 x float> %res1, ptr addrspace(1) %out1
|
||||
%Index2 = extractelement <4 x i8> %IndexVec, i32 2
|
||||
%res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index2)
|
||||
store <4 x float> %res2, ptr addrspace(1) %out2
|
||||
%Index3 = extractelement <4 x i8> %IndexVec, i32 3
|
||||
%res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index3)
|
||||
store <4 x float> %res3, ptr addrspace(1) %out3
|
||||
ret void
|
||||
}
|
||||
|
||||
; Loads a <4 x i8> index vector and passes each of its four elements as the index
; operand of a separate swmmac.f16.16x16x32.f16 call; each <4 x half> result is
; stored to its own output pointer (64-bit stores in the expected code). The
; expected code loads the vector as a single dword and selects elements 1-3 with
; index_key:1..3.
define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_index_key(<4 x half> %A, <8 x half> %B, <4 x half> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
|
||||
; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_index_key:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: global_load_b32 v22, v[8:9], off
|
||||
; GFX12-NEXT: v_mov_b32_e32 v9, v7
|
||||
; GFX12-NEXT: v_mov_b32_e32 v8, v6
|
||||
; GFX12-NEXT: v_mov_b32_e32 v19, v7
|
||||
; GFX12-NEXT: v_mov_b32_e32 v18, v6
|
||||
; GFX12-NEXT: v_mov_b32_e32 v21, v7
|
||||
; GFX12-NEXT: v_mov_b32_e32 v20, v6
|
||||
; GFX12-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[8:9], v[0:1], v[2:5], v22
|
||||
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[18:19], v[0:1], v[2:5], v22 index_key:1
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
|
||||
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[20:21], v[0:1], v[2:5], v22 index_key:2
|
||||
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v22 index_key:3
|
||||
; GFX12-NEXT: global_store_b64 v[10:11], v[8:9], off
|
||||
; GFX12-NEXT: global_store_b64 v[12:13], v[18:19], off
|
||||
; GFX12-NEXT: global_store_b64 v[14:15], v[20:21], off
|
||||
; GFX12-NEXT: global_store_b64 v[16:17], v[6:7], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
|
||||
%Index0 = extractelement <4 x i8> %IndexVec, i32 0
|
||||
%res0 = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index0)
|
||||
store <4 x half> %res0, ptr addrspace(1) %out0
|
||||
%Index1 = extractelement <4 x i8> %IndexVec, i32 1
|
||||
%res1 = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index1)
|
||||
store <4 x half> %res1, ptr addrspace(1) %out1
|
||||
%Index2 = extractelement <4 x i8> %IndexVec, i32 2
|
||||
%res2 = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index2)
|
||||
store <4 x half> %res2, ptr addrspace(1) %out2
|
||||
%Index3 = extractelement <4 x i8> %IndexVec, i32 3
|
||||
%res3 = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index3)
|
||||
store <4 x half> %res3, ptr addrspace(1) %out3
|
||||
ret void
|
||||
}
|
||||
|
||||
; Loads a <4 x i8> index vector and passes each of its four elements as the index
; operand of a separate swmmac.bf16.16x16x32.bf16 call; each <4 x i16> result is
; stored to its own output pointer (64-bit stores in the expected code). The
; expected code loads the vector as a single dword and selects elements 1-3 with
; index_key:1..3.
define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16_index_key(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
|
||||
; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16_index_key:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: global_load_b32 v22, v[8:9], off
|
||||
; GFX12-NEXT: v_mov_b32_e32 v9, v7
|
||||
; GFX12-NEXT: v_mov_b32_e32 v8, v6
|
||||
; GFX12-NEXT: v_mov_b32_e32 v19, v7
|
||||
; GFX12-NEXT: v_mov_b32_e32 v18, v6
|
||||
; GFX12-NEXT: v_mov_b32_e32 v21, v7
|
||||
; GFX12-NEXT: v_mov_b32_e32 v20, v6
|
||||
; GFX12-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[8:9], v[0:1], v[2:5], v22
|
||||
; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[18:19], v[0:1], v[2:5], v22 index_key:1
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
|
||||
; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[20:21], v[0:1], v[2:5], v22 index_key:2
|
||||
; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v22 index_key:3
|
||||
; GFX12-NEXT: global_store_b64 v[10:11], v[8:9], off
|
||||
; GFX12-NEXT: global_store_b64 v[12:13], v[18:19], off
|
||||
; GFX12-NEXT: global_store_b64 v[14:15], v[20:21], off
|
||||
; GFX12-NEXT: global_store_b64 v[16:17], v[6:7], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
|
||||
%Index0 = extractelement <4 x i8> %IndexVec, i32 0
|
||||
%res0 = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index0)
|
||||
store <4 x i16> %res0, ptr addrspace(1) %out0
|
||||
%Index1 = extractelement <4 x i8> %IndexVec, i32 1
|
||||
%res1 = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index1)
|
||||
store <4 x i16> %res1, ptr addrspace(1) %out1
|
||||
%Index2 = extractelement <4 x i8> %IndexVec, i32 2
|
||||
%res2 = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index2)
|
||||
store <4 x i16> %res2, ptr addrspace(1) %out2
|
||||
%Index3 = extractelement <4 x i8> %IndexVec, i32 3
|
||||
%res3 = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index3)
|
||||
store <4 x i16> %res3, ptr addrspace(1) %out3
|
||||
ret void
|
||||
}
|
||||
|
||||
; Loads a <4 x i8> index vector and passes each of its four elements as the index
; operand of a separate swmmac.i32.16x16x32.iu8 call (all i1 immediates are 0),
; storing each result to its own output pointer. The expected code loads the
; vector as a single dword and selects elements 1-3 with index_key:1..3.
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_index_key(i32 %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_index_key:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: global_load_b32 v7, v[7:8], off
|
||||
; GFX12-NEXT: v_mov_b32_e32 v20, v6
|
||||
; GFX12-NEXT: v_mov_b32_e32 v19, v5
|
||||
; GFX12-NEXT: v_mov_b32_e32 v18, v4
|
||||
; GFX12-NEXT: v_mov_b32_e32 v17, v3
|
||||
; GFX12-NEXT: v_mov_b32_e32 v24, v6
|
||||
; GFX12-NEXT: v_mov_b32_e32 v23, v5
|
||||
; GFX12-NEXT: v_mov_b32_e32 v22, v4
|
||||
; GFX12-NEXT: v_mov_b32_e32 v21, v3
|
||||
; GFX12-NEXT: v_mov_b32_e32 v28, v6
|
||||
; GFX12-NEXT: v_mov_b32_e32 v27, v5
|
||||
; GFX12-NEXT: v_mov_b32_e32 v26, v4
|
||||
; GFX12-NEXT: v_mov_b32_e32 v25, v3
|
||||
; GFX12-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[17:20], v0, v[1:2], v7
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[21:24], v0, v[1:2], v7 index_key:1
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[25:28], v0, v[1:2], v7 index_key:2
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:3
|
||||
; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off
|
||||
; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off
|
||||
; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
|
||||
%Index0 = extractelement <4 x i8> %IndexVec, i32 0
|
||||
%res0 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index0, i1 0)
|
||||
store <4 x i32> %res0, ptr addrspace(1) %out0
|
||||
%Index1 = extractelement <4 x i8> %IndexVec, i32 1
|
||||
%res1 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index1, i1 0)
|
||||
store <4 x i32> %res1, ptr addrspace(1) %out1
|
||||
%Index2 = extractelement <4 x i8> %IndexVec, i32 2
|
||||
%res2 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index2, i1 0)
|
||||
store <4 x i32> %res2, ptr addrspace(1) %out2
|
||||
%Index3 = extractelement <4 x i8> %IndexVec, i32 3
|
||||
%res3 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index3, i1 0)
|
||||
store <4 x i32> %res3, ptr addrspace(1) %out3
|
||||
ret void
|
||||
}
|
||||
|
||||
; Loads a <2 x i16> index vector and passes each of its two elements as the index
; operand of a separate swmmac.i32.16x16x32.iu4 call (all i1 immediates are 0),
; storing each result to its own output pointer. The expected code loads the
; vector as a single dword and selects element 1 with index_key:1.
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_index_key(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_index_key:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: global_load_b32 v6, v[6:7], off
|
||||
; GFX12-NEXT: v_mov_b32_e32 v15, v5
|
||||
; GFX12-NEXT: v_mov_b32_e32 v14, v4
|
||||
; GFX12-NEXT: v_mov_b32_e32 v13, v3
|
||||
; GFX12-NEXT: v_mov_b32_e32 v12, v2
|
||||
; GFX12-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[12:15], v0, v1, v6
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 index_key:1
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[12:15], off
|
||||
; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
|
||||
%Index0 = extractelement <2 x i16> %IndexVec, i32 0
|
||||
%res0 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index0, i1 0)
|
||||
store <4 x i32> %res0, ptr addrspace(1) %out0
|
||||
%Index1 = extractelement <2 x i16> %IndexVec, i32 1
|
||||
%res1 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index1, i1 0)
|
||||
store <4 x i32> %res1, ptr addrspace(1) %out1
|
||||
ret void
|
||||
}
|
||||
|
||||
; Loads a <2 x i16> index vector and passes each of its two elements as the index
; operand of a separate swmmac.i32.16x16x64.iu4 call (all i1 immediates are 0),
; storing each result to its own output pointer. The expected code loads the
; vector as a single dword and selects element 1 with index_key:1.
define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_index_key(i32 %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_index_key:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: global_load_b32 v7, v[7:8], off
|
||||
; GFX12-NEXT: v_mov_b32_e32 v16, v6
|
||||
; GFX12-NEXT: v_mov_b32_e32 v15, v5
|
||||
; GFX12-NEXT: v_mov_b32_e32 v14, v4
|
||||
; GFX12-NEXT: v_mov_b32_e32 v13, v3
|
||||
; GFX12-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[13:16], v0, v[1:2], v7
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 index_key:1
|
||||
; GFX12-NEXT: global_store_b128 v[9:10], v[13:16], off
|
||||
; GFX12-NEXT: global_store_b128 v[11:12], v[3:6], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
|
||||
%Index0 = extractelement <2 x i16> %IndexVec, i32 0
|
||||
%res0 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index0, i1 0)
|
||||
store <4 x i32> %res0, ptr addrspace(1) %out0
|
||||
%Index1 = extractelement <2 x i16> %IndexVec, i32 1
|
||||
%res1 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index1, i1 0)
|
||||
store <4 x i32> %res1, ptr addrspace(1) %out1
|
||||
ret void
|
||||
}
|
||||
|
||||
; Loads a <4 x i8> index vector and passes each of its four elements as the index
; operand of a separate swmmac.f32.16x16x32.fp8.fp8 call, storing each result to
; its own output pointer. The expected code loads the vector as a single dword
; and selects elements 1-3 with index_key:1..3.
define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
|
||||
; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8_index_key:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: global_load_b32 v7, v[7:8], off
|
||||
; GFX12-NEXT: v_mov_b32_e32 v20, v6
|
||||
; GFX12-NEXT: v_mov_b32_e32 v19, v5
|
||||
; GFX12-NEXT: v_mov_b32_e32 v18, v4
|
||||
; GFX12-NEXT: v_mov_b32_e32 v17, v3
|
||||
; GFX12-NEXT: v_mov_b32_e32 v24, v6
|
||||
; GFX12-NEXT: v_mov_b32_e32 v23, v5
|
||||
; GFX12-NEXT: v_mov_b32_e32 v22, v4
|
||||
; GFX12-NEXT: v_mov_b32_e32 v21, v3
|
||||
; GFX12-NEXT: v_mov_b32_e32 v28, v6
|
||||
; GFX12-NEXT: v_mov_b32_e32 v27, v5
|
||||
; GFX12-NEXT: v_mov_b32_e32 v26, v4
|
||||
; GFX12-NEXT: v_mov_b32_e32 v25, v3
|
||||
; GFX12-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[17:20], v0, v[1:2], v7
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[21:24], v0, v[1:2], v7 index_key:1
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[25:28], v0, v[1:2], v7 index_key:2
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:3
|
||||
; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off
|
||||
; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off
|
||||
; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
|
||||
%Index0 = extractelement <4 x i8> %IndexVec, i32 0
|
||||
%res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index0)
|
||||
store <4 x float> %res0, ptr addrspace(1) %out0
|
||||
%Index1 = extractelement <4 x i8> %IndexVec, i32 1
|
||||
%res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index1)
|
||||
store <4 x float> %res1, ptr addrspace(1) %out1
|
||||
%Index2 = extractelement <4 x i8> %IndexVec, i32 2
|
||||
%res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index2)
|
||||
store <4 x float> %res2, ptr addrspace(1) %out2
|
||||
%Index3 = extractelement <4 x i8> %IndexVec, i32 3
|
||||
%res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index3)
|
||||
store <4 x float> %res3, ptr addrspace(1) %out3
|
||||
ret void
|
||||
}
|
||||
|
||||
; Loads a <4 x i8> index vector and passes each of its four elements as the index
; operand of a separate swmmac.f32.16x16x32.fp8.bf8 call, storing each result to
; its own output pointer. The expected code loads the vector as a single dword
; and selects elements 1-3 with index_key:1..3.
define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
|
||||
; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8_index_key:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: global_load_b32 v7, v[7:8], off
|
||||
; GFX12-NEXT: v_mov_b32_e32 v20, v6
|
||||
; GFX12-NEXT: v_mov_b32_e32 v19, v5
|
||||
; GFX12-NEXT: v_mov_b32_e32 v18, v4
|
||||
; GFX12-NEXT: v_mov_b32_e32 v17, v3
|
||||
; GFX12-NEXT: v_mov_b32_e32 v24, v6
|
||||
; GFX12-NEXT: v_mov_b32_e32 v23, v5
|
||||
; GFX12-NEXT: v_mov_b32_e32 v22, v4
|
||||
; GFX12-NEXT: v_mov_b32_e32 v21, v3
|
||||
; GFX12-NEXT: v_mov_b32_e32 v28, v6
|
||||
; GFX12-NEXT: v_mov_b32_e32 v27, v5
|
||||
; GFX12-NEXT: v_mov_b32_e32 v26, v4
|
||||
; GFX12-NEXT: v_mov_b32_e32 v25, v3
|
||||
; GFX12-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[17:20], v0, v[1:2], v7
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[21:24], v0, v[1:2], v7 index_key:1
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[25:28], v0, v[1:2], v7 index_key:2
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:3
|
||||
; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off
|
||||
; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off
|
||||
; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
|
||||
%Index0 = extractelement <4 x i8> %IndexVec, i32 0
|
||||
%res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index0)
|
||||
store <4 x float> %res0, ptr addrspace(1) %out0
|
||||
%Index1 = extractelement <4 x i8> %IndexVec, i32 1
|
||||
%res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index1)
|
||||
store <4 x float> %res1, ptr addrspace(1) %out1
|
||||
%Index2 = extractelement <4 x i8> %IndexVec, i32 2
|
||||
%res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index2)
|
||||
store <4 x float> %res2, ptr addrspace(1) %out2
|
||||
%Index3 = extractelement <4 x i8> %IndexVec, i32 3
|
||||
%res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index3)
|
||||
store <4 x float> %res3, ptr addrspace(1) %out3
|
||||
ret void
|
||||
}
|
||||
|
||||
; Loads a <4 x i8> index vector and passes each of its four elements as the index
; operand of a separate swmmac.f32.16x16x32.bf8.fp8 call, storing each result to
; its own output pointer. The expected code loads the vector as a single dword
; and selects elements 1-3 with index_key:1..3.
define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
|
||||
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8_index_key:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: global_load_b32 v7, v[7:8], off
|
||||
; GFX12-NEXT: v_mov_b32_e32 v20, v6
|
||||
; GFX12-NEXT: v_mov_b32_e32 v19, v5
|
||||
; GFX12-NEXT: v_mov_b32_e32 v18, v4
|
||||
; GFX12-NEXT: v_mov_b32_e32 v17, v3
|
||||
; GFX12-NEXT: v_mov_b32_e32 v24, v6
|
||||
; GFX12-NEXT: v_mov_b32_e32 v23, v5
|
||||
; GFX12-NEXT: v_mov_b32_e32 v22, v4
|
||||
; GFX12-NEXT: v_mov_b32_e32 v21, v3
|
||||
; GFX12-NEXT: v_mov_b32_e32 v28, v6
|
||||
; GFX12-NEXT: v_mov_b32_e32 v27, v5
|
||||
; GFX12-NEXT: v_mov_b32_e32 v26, v4
|
||||
; GFX12-NEXT: v_mov_b32_e32 v25, v3
|
||||
; GFX12-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[17:20], v0, v[1:2], v7
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[21:24], v0, v[1:2], v7 index_key:1
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[25:28], v0, v[1:2], v7 index_key:2
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:3
|
||||
; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off
|
||||
; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off
|
||||
; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
|
||||
%Index0 = extractelement <4 x i8> %IndexVec, i32 0
|
||||
%res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index0)
|
||||
store <4 x float> %res0, ptr addrspace(1) %out0
|
||||
%Index1 = extractelement <4 x i8> %IndexVec, i32 1
|
||||
%res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index1)
|
||||
store <4 x float> %res1, ptr addrspace(1) %out1
|
||||
%Index2 = extractelement <4 x i8> %IndexVec, i32 2
|
||||
%res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index2)
|
||||
store <4 x float> %res2, ptr addrspace(1) %out2
|
||||
%Index3 = extractelement <4 x i8> %IndexVec, i32 3
|
||||
%res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index3)
|
||||
store <4 x float> %res3, ptr addrspace(1) %out3
|
||||
ret void
|
||||
}
|
||||
|
||||
; Loads a <4 x i8> index vector and passes each of its four elements as the index
; operand of a separate swmmac.f32.16x16x32.bf8.bf8 call, storing each result to
; its own output pointer. The expected code loads the vector as a single dword
; and selects elements 1-3 with index_key:1..3.
define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
|
||||
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8_index_key:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: global_load_b32 v7, v[7:8], off
|
||||
; GFX12-NEXT: v_mov_b32_e32 v20, v6
|
||||
; GFX12-NEXT: v_mov_b32_e32 v19, v5
|
||||
; GFX12-NEXT: v_mov_b32_e32 v18, v4
|
||||
; GFX12-NEXT: v_mov_b32_e32 v17, v3
|
||||
; GFX12-NEXT: v_mov_b32_e32 v24, v6
|
||||
; GFX12-NEXT: v_mov_b32_e32 v23, v5
|
||||
; GFX12-NEXT: v_mov_b32_e32 v22, v4
|
||||
; GFX12-NEXT: v_mov_b32_e32 v21, v3
|
||||
; GFX12-NEXT: v_mov_b32_e32 v28, v6
|
||||
; GFX12-NEXT: v_mov_b32_e32 v27, v5
|
||||
; GFX12-NEXT: v_mov_b32_e32 v26, v4
|
||||
; GFX12-NEXT: v_mov_b32_e32 v25, v3
|
||||
; GFX12-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[17:20], v0, v[1:2], v7
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[21:24], v0, v[1:2], v7 index_key:1
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[25:28], v0, v[1:2], v7 index_key:2
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:3
|
||||
; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off
|
||||
; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off
|
||||
; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
|
||||
%Index0 = extractelement <4 x i8> %IndexVec, i32 0
|
||||
%res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index0)
|
||||
store <4 x float> %res0, ptr addrspace(1) %out0
|
||||
%Index1 = extractelement <4 x i8> %IndexVec, i32 1
|
||||
%res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index1)
|
||||
store <4 x float> %res1, ptr addrspace(1) %out1
|
||||
%Index2 = extractelement <4 x i8> %IndexVec, i32 2
|
||||
%res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index2)
|
||||
store <4 x float> %res2, ptr addrspace(1) %out2
|
||||
%Index3 = extractelement <4 x i8> %IndexVec, i32 3
|
||||
%res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index3)
|
||||
store <4 x float> %res3, ptr addrspace(1) %out3
|
||||
ret void
|
||||
}
|
||||
|
||||
; Declarations of the eleven SWMMAC intrinsics called by the index_key tests in
; this file. The trailing i8/i16 operand is the index; in the integer (iu8/iu4)
; variants the i1 operands are marked immarg, so call sites must pass constants.
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half>, <8 x half>, <4 x float>, i8)
|
||||
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16>, <8 x i16>, <4 x float>, i8)
|
||||
declare <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half>, <8 x half>, <4 x half>, i8)
|
||||
declare <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16>, <8 x i16>, <4 x i16>, i8)
|
||||
declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i8 %Index, i1 immarg)
|
||||
declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i16 %Index, i1 immarg)
|
||||
declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i16 %Index, i1 immarg)
|
||||
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8)
|
||||
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8)
|
||||
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8)
|
||||
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8)
|
333
llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64.ll
Normal file
333
llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64.ll
Normal file
@ -0,0 +1,333 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
|
||||
|
||||
; wmma.f32.16x16x16.f16 on <4 x half> A/B and <4 x float> C: expects a single
; v_wmma_f32_16x16x16_f16 whose result is written to %out with one b128 store.
define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7]
; GFX12-NEXT:    global_store_b128 v[8:9], v[4:7], off
; GFX12-NEXT:    s_nop 0
; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT:    s_endpgm
bb:
  %result = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %C)
  store <4 x float> %result, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; wmma.f32.16x16x16.bf16 on <4 x i16> A/B and <4 x float> C: expects a single
; v_wmma_f32_16x16x16_bf16 whose result is written to %out with one b128 store.
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf16:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7]
; GFX12-NEXT:    global_store_b128 v[8:9], v[4:7], off
; GFX12-NEXT:    s_nop 0
; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT:    s_endpgm
bb:
  %result = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> %C)
  store <4 x float> %result, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; wmma.f16.16x16x16.f16 on <4 x half> A/B/C with the trailing i1 immediate set
; to 0: expects a single v_wmma_f16_16x16x16_f16 and one b64 store to %out.
define amdgpu_ps void @test_wmma_f16_16x16x16_f16(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5]
; GFX12-NEXT:    global_store_b64 v[6:7], v[4:5], off
; GFX12-NEXT:    s_nop 0
; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT:    s_endpgm
bb:
  %result = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half> %A, <4 x half> %B, <4 x half> %C, i1 0)
  store <4 x half> %result, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; wmma.bf16.16x16x16.bf16 on <4 x i16> A/B/C with the trailing i1 immediate set
; to 0: expects a single v_wmma_bf16_16x16x16_bf16 and one b64 store to %out.
define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5]
; GFX12-NEXT:    global_store_b64 v[6:7], v[4:5], off
; GFX12-NEXT:    s_nop 0
; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT:    s_endpgm
bb:
  %result = call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, i1 0)
  store <4 x i16> %result, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; wmma.i32.16x16x16.iu8 on i32 A/B and <4 x i32> C with all three i1 immediates
; set to 0: expects a v_wmma_i32_16x16x16_iu8 that carries no modifiers.
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x16_iu8:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5]
; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT:    s_nop 0
; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT:    s_endpgm
bb:
  %result = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
  store <4 x i32> %result, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; wmma.i32.16x16x16.iu4 on i32 A/B and <4 x i32> C with all three i1 immediates
; set to 0: expects a v_wmma_i32_16x16x16_iu4 that carries no modifiers.
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x16_iu4:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5]
; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT:    s_nop 0
; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT:    s_endpgm
bb:
  %result = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
  store <4 x i32> %result, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; wmma.f32.16x16x16.fp8.fp8 on i32 A/B and <4 x float> C: expects a single
; v_wmma_f32_16x16x16_fp8_fp8 followed by one b128 store to %out.
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5]
; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT:    s_nop 0
; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT:    s_endpgm
bb:
  %result = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> %C)
  store <4 x float> %result, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; wmma.f32.16x16x16.bf8.fp8 on i32 A/B and <4 x float> C: expects a single
; v_wmma_f32_16x16x16_bf8_fp8 followed by one b128 store to %out.
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5]
; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT:    s_nop 0
; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT:    s_endpgm
bb:
  %result = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> %C)
  store <4 x float> %result, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; wmma.f32.16x16x16.fp8.bf8 on i32 A/B and <4 x float> C: expects a single
; v_wmma_f32_16x16x16_fp8_bf8 followed by one b128 store to %out.
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5]
; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT:    s_nop 0
; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT:    s_endpgm
bb:
  %result = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> %C)
  store <4 x float> %result, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; wmma.f32.16x16x16.bf8.bf8 on i32 A/B and <4 x float> C: expects a single
; v_wmma_f32_16x16x16_bf8_bf8 followed by one b128 store to %out.
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5]
; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT:    s_nop 0
; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT:    s_endpgm
bb:
  %result = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> %C)
  store <4 x float> %result, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; wmma.i32.16x16x32.iu4 on i32 A/B and <4 x i32> C with all three i1 immediates
; set to 0: expects a v_wmma_i32_16x16x32_iu4 that carries no modifiers.
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x32_iu4:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5]
; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT:    s_nop 0
; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT:    s_endpgm
bb:
  %result = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
  store <4 x i32> %result, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; swmmac.f32.16x16x32.f16 on <4 x half> A, <8 x half> B, <4 x float> C and an
; i8 %Index: expects one v_swmmac_f32_16x16x32_f16 taking the index in v10.
define amdgpu_ps void @test_swmmac_f32_16x16x32_f16(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_f16:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10
; GFX12-NEXT:    global_store_b128 v[11:12], v[6:9], off
; GFX12-NEXT:    s_nop 0
; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT:    s_endpgm
bb:
  %result = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index)
  store <4 x float> %result, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; swmmac.f32.16x16x32.bf16 on <4 x i16> A, <8 x i16> B, <4 x float> C and an
; i8 %Index: expects one v_swmmac_f32_16x16x32_bf16 taking the index in v10.
define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10
; GFX12-NEXT:    global_store_b128 v[11:12], v[6:9], off
; GFX12-NEXT:    s_nop 0
; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT:    s_endpgm
bb:
  %result = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index)
  store <4 x float> %result, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; swmmac.f16.16x16x32.f16 on <4 x half> A, <8 x half> B, <4 x half> C and an
; i8 %Index: expects one v_swmmac_f16_16x16x32_f16 taking the index in v8.
define amdgpu_ps void @test_swmmac_f16_16x16x32_f16(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f16_16x16x32_f16:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8
; GFX12-NEXT:    global_store_b64 v[9:10], v[6:7], off
; GFX12-NEXT:    s_nop 0
; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT:    s_endpgm
bb:
  %result = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index)
  store <4 x half> %result, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; swmmac.bf16.16x16x32.bf16 on <4 x i16> A, <8 x i16> B, <4 x i16> C and an
; i8 %Index: expects one v_swmmac_bf16_16x16x32_bf16 taking the index in v8.
define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8
; GFX12-NEXT:    global_store_b64 v[9:10], v[6:7], off
; GFX12-NEXT:    s_nop 0
; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT:    s_endpgm
bb:
  %result = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index)
  store <4 x i16> %result, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; swmmac.i32.16x16x32.iu8 on i32 A, <2 x i32> B, <4 x i32> C and an i8 %Index,
; all i1 immediates 0: expects an unmodified v_swmmac_i32_16x16x32_iu8.
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7
; GFX12-NEXT:    global_store_b128 v[8:9], v[3:6], off
; GFX12-NEXT:    s_nop 0
; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT:    s_endpgm
bb:
  %result = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 0)
  store <4 x i32> %result, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; swmmac.i32.16x16x32.iu4 on i32 A/B, <4 x i32> C and an i16 %Index, all i1
; immediates 0: expects an unmodified v_swmmac_i32_16x16x32_iu4.
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6
; GFX12-NEXT:    global_store_b128 v[7:8], v[2:5], off
; GFX12-NEXT:    s_nop 0
; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT:    s_endpgm
bb:
  %result = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index, i1 0)
  store <4 x i32> %result, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; swmmac.i32.16x16x64.iu4 on i32 A, <2 x i32> B, <4 x i32> C and an i16 %Index,
; all i1 immediates 0: expects an unmodified v_swmmac_i32_16x16x64_iu4.
define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7
; GFX12-NEXT:    global_store_b128 v[8:9], v[3:6], off
; GFX12-NEXT:    s_nop 0
; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT:    s_endpgm
bb:
  %result = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 0)
  store <4 x i32> %result, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; swmmac.f32.16x16x32.fp8.fp8 on i32 A, <2 x i32> B, <4 x float> C and an i8
; %Index: expects one v_swmmac_f32_16x16x32_fp8_fp8 taking the index in v7.
define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7
; GFX12-NEXT:    global_store_b128 v[8:9], v[3:6], off
; GFX12-NEXT:    s_nop 0
; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT:    s_endpgm
bb:
  %result = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index)
  store <4 x float> %result, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; swmmac.f32.16x16x32.fp8.bf8 on i32 A, <2 x i32> B, <4 x float> C and an i8
; %Index: expects one v_swmmac_f32_16x16x32_fp8_bf8 taking the index in v7.
define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7
; GFX12-NEXT:    global_store_b128 v[8:9], v[3:6], off
; GFX12-NEXT:    s_nop 0
; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT:    s_endpgm
bb:
  %result = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index)
  store <4 x float> %result, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; swmmac.f32.16x16x32.bf8.fp8 on i32 A, <2 x i32> B, <4 x float> C and an i8
; %Index: expects one v_swmmac_f32_16x16x32_bf8_fp8 taking the index in v7.
define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7
; GFX12-NEXT:    global_store_b128 v[8:9], v[3:6], off
; GFX12-NEXT:    s_nop 0
; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT:    s_endpgm
bb:
  %result = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index)
  store <4 x float> %result, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; swmmac.f32.16x16x32.bf8.bf8 on i32 A, <2 x i32> B, <4 x float> C and an i8
; %Index: expects one v_swmmac_f32_16x16x32_bf8_bf8 taking the index in v7.
define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7
; GFX12-NEXT:    global_store_b128 v[8:9], v[3:6], off
; GFX12-NEXT:    s_nop 0
; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT:    s_endpgm
bb:
  %result = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index)
  store <4 x float> %result, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; WMMA intrinsic declarations.
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half>, <4 x half>, <4 x float>)
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16>, <4 x i16>, <4 x float>)
declare <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half>, <4 x half>, <4 x half>, i1 immarg)
declare <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16>, <4 x i16>, <4 x i16>, i1 immarg)
declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg)
declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg)
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>)
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>)
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>)
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>)
declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg)

; SWMMAC intrinsic declarations. Parameter names are omitted because they have
; no effect on a declaration.
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half>, <8 x half>, <4 x float>, i8)
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16>, <8 x i16>, <4 x float>, i8)
declare <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half>, <8 x half>, <4 x half>, i8)
declare <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16>, <8 x i16>, <4 x i16>, i8)
declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i8, i1 immarg)
declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i16, i1 immarg)
declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i16, i1 immarg)
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8)
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8)
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8)
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8)
|
@ -0,0 +1,499 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
|
||||
|
||||
; fneg of the A operand: the expected output has no separate negate
; instruction; the WMMA carries neg_lo:[1,0,0] neg_hi:[1,0,0] instead.
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
; GFX12-NEXT:    global_store_b128 v[16:17], v[8:11], off
; GFX12-NEXT:    s_nop 0
; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT:    s_endpgm
bb:
  %fneg.A = fneg <8 x half> %A
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %fneg.A, <8 x half> %B, <8 x float> %C)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; fneg of the B operand: the expected output has no separate negate
; instruction; the WMMA carries neg_lo:[0,1,0] neg_hi:[0,1,0] instead.
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negB(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negB:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] neg_hi:[0,1,0]
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
; GFX12-NEXT:    global_store_b128 v[16:17], v[8:11], off
; GFX12-NEXT:    s_nop 0
; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT:    s_endpgm
bb:
  %fneg.B = fneg <8 x half> %B
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %fneg.B, <8 x float> %C)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; fneg of the C operand: the expected output has no separate negate
; instruction; the WMMA carries neg_lo:[0,0,1] instead.
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negC(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negC:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1]
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
; GFX12-NEXT:    global_store_b128 v[16:17], v[8:11], off
; GFX12-NEXT:    s_nop 0
; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT:    s_endpgm
bb:
  %fneg.C = fneg <8 x float> %C
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %fneg.C)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; llvm.fabs of the C operand: the expected output has no separate abs
; instruction; the WMMA carries neg_hi:[0,0,1] instead.
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_absC(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_absC:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1]
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
; GFX12-NEXT:    global_store_b128 v[16:17], v[8:11], off
; GFX12-NEXT:    s_nop 0
; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT:    s_endpgm
bb:
  %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %fabs.C)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; fneg of the C operand of the bf16 WMMA: the expected output has no separate
; negate instruction; the WMMA carries neg_lo:[0,0,1] instead.
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_negC(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_negC:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1]
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
; GFX12-NEXT:    global_store_b128 v[16:17], v[8:11], off
; GFX12-NEXT:    s_nop 0
; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT:    s_endpgm
bb:
  %fneg.C = fneg <8 x float> %C
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> %fneg.C)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; llvm.fabs of the C operand of the bf16 WMMA: the expected output has no
; separate abs instruction; the WMMA carries neg_hi:[0,0,1] instead.
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_absC(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_absC:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1]
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
; GFX12-NEXT:    global_store_b128 v[16:17], v[8:11], off
; GFX12-NEXT:    s_nop 0
; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT:    s_endpgm
bb:
  %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> %fabs.C)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; fneg of the A operand of the f16-result WMMA: the expected output has no
; separate negate; the WMMA carries neg_lo:[1,0,0] neg_hi:[1,0,0] instead.
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negA(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negA:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off
; GFX12-NEXT:    s_nop 0
; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT:    s_endpgm
bb:
  %fneg.A = fneg <8 x half> %A
  %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %fneg.A, <8 x half> %B, <8 x half> %C, i1 0)
  store <8 x half> %res, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; fneg of the B operand of the f16-result WMMA: the expected output has no
; separate negate; the WMMA carries neg_lo:[0,1,0] neg_hi:[0,1,0] instead.
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] neg_hi:[0,1,0]
; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off
; GFX12-NEXT:    s_nop 0
; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT:    s_endpgm
bb:
  %fneg.B = fneg <8 x half> %B
  %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %fneg.B, <8 x half> %C, i1 0)
  store <8 x half> %res, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; fneg of the C operand of the f16-result WMMA: the expected output has no
; separate negate instruction; the WMMA carries neg_lo:[0,0,1] instead.
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1]
; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off
; GFX12-NEXT:    s_nop 0
; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT:    s_endpgm
bb:
  %fneg.C = fneg <8 x half> %C
  %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fneg.C, i1 0)
  store <8 x half> %res, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; llvm.fabs of the C operand of the f16-result WMMA: the expected output has no
; separate abs instruction; the WMMA carries neg_hi:[0,0,1] instead.
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_absC(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_absC:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,0,1]
; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off
; GFX12-NEXT:    s_nop 0
; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT:    s_endpgm
bb:
  %fabs.C = call <8 x half> @llvm.fabs.v8f16(<8 x half> %C)
  %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fabs.C, i1 0)
  store <8 x half> %res, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; fneg of the C operand of the fp8/fp8 WMMA: the expected output has no
; separate negate instruction; the WMMA carries neg_lo:[0,0,1] instead.
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_negC:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT:    s_nop 0
; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT:    s_endpgm
bb:
  %fneg.C = fneg <8 x float> %C
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; llvm.fabs of the C operand of the fp8/fp8 WMMA: the expected output has no
; separate abs instruction; the WMMA carries neg_hi:[0,0,1] instead.
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_absC:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT:    s_nop 0
; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT:    s_endpgm
bb:
  %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; fneg of the C operand of the bf8/fp8 WMMA: the expected output has no
; separate negate instruction; the WMMA carries neg_lo:[0,0,1] instead.
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_negC:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT:    s_nop 0
; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT:    s_endpgm
bb:
  %fneg.C = fneg <8 x float> %C
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; llvm.fabs of the C operand of the bf8/fp8 WMMA: the expected output has no
; separate abs instruction; the WMMA carries neg_hi:[0,0,1] instead.
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_absC:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT:    s_nop 0
; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT:    s_endpgm
bb:
  %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; fneg of the C operand of the fp8/bf8 WMMA: the expected output has no
; separate negate instruction; the WMMA carries neg_lo:[0,0,1] instead.
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_negC:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT:    s_nop 0
; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT:    s_endpgm
bb:
  %fneg.C = fneg <8 x float> %C
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; llvm.fabs of the C operand of the fp8/bf8 WMMA: the expected output has no
; separate abs instruction; the WMMA carries neg_hi:[0,0,1] instead.
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_absC:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT:    s_nop 0
; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT:    s_endpgm
bb:
  %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; fneg of the whole C accumulator feeds wmma.f32.16x16x16.bf8.bf8; the checks
; expect it on the WMMA itself as neg_lo:[0,0,1].
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_negC:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%fneg.C = fneg <8 x float> %C
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C)
|
||||
store <8 x float> %res, <8 x float> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; fabs of the whole C accumulator feeds wmma.f32.16x16x16.bf8.bf8; the checks
; expect it on the WMMA itself as neg_hi:[0,0,1].
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_absC:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C)
|
||||
store <8 x float> %res, <8 x float> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; fneg of the A operand feeds swmmac.f32.16x16x32.f16; the checks expect it on
; the SWMMAC itself as neg_lo:[1,0,0] neg_hi:[1,0,0].
define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negA(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, <8 x float> addrspace(1)* %out) {
|
||||
; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negA:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[1,0,0] neg_hi:[1,0,0]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[21:22], v[12:15], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%fneg.A = fneg <8 x half> %A
|
||||
%res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f16.v16f16.v8f32.i16(<8 x half> %fneg.A, <16 x half> %B, <8 x float> %C, i16 %Index)
|
||||
store <8 x float> %res, <8 x float> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; fneg of the B operand feeds swmmac.f32.16x16x32.f16; the checks expect it on
; the SWMMAC itself as neg_lo:[0,1,0] neg_hi:[0,1,0].
define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negB(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, <8 x float> addrspace(1)* %out) {
|
||||
; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negB:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,1,0] neg_hi:[0,1,0]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[21:22], v[12:15], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%fneg.B = fneg <16 x half> %B
|
||||
%res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f16.v16f16.v8f32.i16(<8 x half> %A, <16 x half> %fneg.B, <8 x float> %C, i16 %Index)
|
||||
store <8 x float> %res, <8 x float> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; fneg of the A operand feeds swmmac.f16.16x16x32.f16 (f16 result); the checks
; expect it on the SWMMAC itself as neg_lo:[1,0,0] neg_hi:[1,0,0].
define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negA(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, <8 x half> addrspace(1)* %out) {
|
||||
; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negA:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[1,0,0] neg_hi:[1,0,0]
|
||||
; GFX12-NEXT: global_store_b128 v[17:18], v[12:15], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%fneg.A = fneg <8 x half> %A
|
||||
%res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v16f16.v8f16.i16(<8 x half> %fneg.A, <16 x half> %B, <8 x half> %C, i16 %Index)
|
||||
store <8 x half> %res, <8 x half> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; fneg of the B operand feeds swmmac.f16.16x16x32.f16 (f16 result); the checks
; expect it on the SWMMAC itself as neg_lo:[0,1,0] neg_hi:[0,1,0].
define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negB(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, <8 x half> addrspace(1)* %out) {
|
||||
; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negB:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,1,0] neg_hi:[0,1,0]
|
||||
; GFX12-NEXT: global_store_b128 v[17:18], v[12:15], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%fneg.B = fneg <16 x half> %B
|
||||
%res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v16f16.v8f16.i16(<8 x half> %A, <16 x half> %fneg.B, <8 x half> %C, i16 %Index)
|
||||
store <8 x half> %res, <8 x half> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; both neg and abs patterns (wmma matrix C f32 or f16)
|
||||
|
||||
; fneg(fabs(C)) on the f32 accumulator of wmma.f32.16x16x16.f16; the checks
; expect both modifiers on the WMMA: neg_lo:[0,0,1] neg_hi:[0,0,1].
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negabsC(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negabsC:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] neg_hi:[0,0,1]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
|
||||
%fneg.fabs.C = fneg <8 x float> %fabs.C
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %fneg.fabs.C)
|
||||
store <8 x float> %res, <8 x float> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; fneg(fabs(C)) on the f16 accumulator of wmma.f16.16x16x16.f16; the checks
; expect both modifiers on the WMMA: neg_lo:[0,0,1] neg_hi:[0,0,1].
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negabsC(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) {
|
||||
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negabsC:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] neg_hi:[0,0,1]
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%fabs.C = call <8 x half> @llvm.fabs.v8f16(<8 x half> %C)
|
||||
%fneg.fabs.C = fneg <8 x half> %fabs.C
|
||||
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fneg.fabs.C, i1 0)
|
||||
store <8 x half> %res, <8 x half> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; fabs is applied to element 3 of C only, then the whole vector is negated.
; The checks expect the partial fabs to stay a separate v_and_b32 on v11 while
; only the fneg lands on the WMMA as neg_lo:[0,0,1].
; NOTE(review): despite the "fabsA" suffix, the operand modified here is C.
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_neg_partial_fabsA(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_neg_partial_fabsA:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_and_b32_e32 v11, 0x7fffffff, v11
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%el3 = extractelement <8 x float> %C, i32 3
|
||||
%el3.fabs = call float @llvm.fabs.f32(float %el3)
|
||||
%partial.fabs.C = insertelement <8 x float> %C, float %el3.fabs, i32 3
|
||||
%fneg.partial.fabs.C = fneg <8 x float> %partial.fabs.C
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %fneg.partial.fabs.C)
|
||||
store <8 x float> %res, <8 x float> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; A or B matrix modifier and constant in C
|
||||
|
||||
; fneg on A combined with a splat-1.0 accumulator (the %C argument is unused).
; The checks expect C as the inline operand 1.0 and the A modifier as
; neg_lo:[1,0,0] neg_hi:[1,0,0].
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA_constantC(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA_constantC:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], 1.0 neg_lo:[1,0,0] neg_hi:[1,0,0]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%fneg.A = fneg <8 x half> %A
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %fneg.A, <8 x half> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
|
||||
store <8 x float> %res, <8 x float> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; fneg on B combined with a splat half-1.0 accumulator (the %C argument is
; unused). The checks expect C as the inline operand 1.0 and the B modifier as
; neg_lo:[0,1,0] neg_hi:[0,1,0].
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB_constantC(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) {
|
||||
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB_constantC:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], 1.0 neg_lo:[0,1,0] neg_hi:[0,1,0]
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%fneg.B = fneg <8 x half> %B
|
||||
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %fneg.B, <8 x half> <half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0>, i1 0)
|
||||
store <8 x half> %res, <8 x half> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; pack f16 elements with v_perm_b32 since they don't come from the same b32
|
||||
|
||||
; C is built from the even-indexed elements (0,2,...,14) of a <16 x half>
; loaded from %Caddr and then negated. The checks expect two flat_load_b128,
; four v_perm_b32 (selector 0x5040100) that pack the halves, and the fneg on
; the WMMA as neg_lo:[0,0,1].
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC_pack(<8 x half> %A, <8 x half> %B, ptr %Caddr, <8 x half> addrspace(1)* %out) {
|
||||
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC_pack:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: flat_load_b128 v[12:15], v[8:9] offset:16
|
||||
; GFX12-NEXT: flat_load_b128 v[16:19], v[8:9]
|
||||
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x101
|
||||
; GFX12-NEXT: v_perm_b32 v15, v15, v14, 0x5040100
|
||||
; GFX12-NEXT: v_perm_b32 v14, v13, v12, 0x5040100
|
||||
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
|
||||
; GFX12-NEXT: v_perm_b32 v13, v19, v18, 0x5040100
|
||||
; GFX12-NEXT: v_perm_b32 v12, v17, v16, 0x5040100
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[12:15], v[0:3], v[4:7], v[12:15] neg_lo:[0,0,1]
|
||||
; GFX12-NEXT: global_store_b128 v[10:11], v[12:15], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%C = load <16 x half>, ptr %Caddr
|
||||
%C_shuffle = shufflevector <16 x half> %C, <16 x half> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
|
||||
%fneg.C_shuffle = fneg <8 x half> %C_shuffle
|
||||
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fneg.C_shuffle , i1 0)
|
||||
store <8 x half> %res, <8 x half> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Intrinsic declarations: the fabs helpers first, then the WMMA and SWMMAC
; intrinsics with their overloaded type suffixes.
declare <8 x half> @llvm.fabs.v8f16(<8 x half>)
|
||||
declare <8 x float> @llvm.fabs.v8f32(<8 x float>)
|
||||
declare float @llvm.fabs.f32(float)
|
||||
|
||||
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half>, <8 x half>, <8 x float>)
|
||||
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8i16.v8f32(<8 x i16>, <8 x i16>, <8 x float>)
|
||||
declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half>, <8 x half>, <8 x half>, i1 immarg)
|
||||
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
|
||||
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
|
||||
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
|
||||
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
|
||||
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f16.v16f16.v8f32.i16(<8 x half>, <16 x half>, <8 x float>, i16)
|
||||
declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v16f16.v8f16.i16(<8 x half>, <16 x half>, <8 x half>, i16)
|
431
llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-imm.ll
Normal file
431
llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-imm.ll
Normal file
@ -0,0 +1,431 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
|
||||
|
||||
; Splat float 1.0 accumulator: the checks expect it as the inline operand 1.0
; on the WMMA, with no register materialization.
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], 1.0
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Splat float 3.0 accumulator (0x40400000): the checks expect it to be
; materialized into v[10:17] with v_mov/v_dual_mov and passed as a register
; operand rather than an inline constant.
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm_non_inlineable(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm_non_inlineable:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_mov_b32_e32 v10, 0x40400000
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_dual_mov_b32 v11, v10 :: v_dual_mov_b32 v12, v10
|
||||
; GFX12-NEXT: v_dual_mov_b32 v13, v10 :: v_dual_mov_b32 v14, v10
|
||||
; GFX12-NEXT: v_dual_mov_b32 v15, v10 :: v_dual_mov_b32 v16, v10
|
||||
; GFX12-NEXT: v_mov_b32_e32 v17, v10
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], v[10:17]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; bf16 inputs with a splat float 1.0 accumulator: the checks expect the inline
; operand 1.0 on the WMMA.
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[10:17], v[0:3], v[4:7], 1.0
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; bf16 inputs with a splat float 3.0 accumulator (0x40400000): the checks
; expect it to be materialized into v[10:17] and passed as a register operand.
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm_non_inlineable(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm_non_inlineable:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_mov_b32_e32 v10, 0x40400000
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_dual_mov_b32 v11, v10 :: v_dual_mov_b32 v12, v10
|
||||
; GFX12-NEXT: v_dual_mov_b32 v13, v10 :: v_dual_mov_b32 v14, v10
|
||||
; GFX12-NEXT: v_dual_mov_b32 v15, v10 :: v_dual_mov_b32 v16, v10
|
||||
; GFX12-NEXT: v_mov_b32_e32 v17, v10
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[10:17], v[0:3], v[4:7], v[10:17]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Splat half 1.0 accumulator for the f16-result WMMA: the checks expect the
; inline operand 1.0.
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], 1.0
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> <half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0>, i1 0)
|
||||
store <8 x half> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Splat half 3.0 accumulator: the checks expect the packed pattern 0x42004200
; (two halves of 0x4200 per dword) to be materialized into v[10:13] and passed
; as a register operand.
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm_non_inlineable(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm_non_inlineable:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_mov_b32_e32 v10, 0x42004200
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_dual_mov_b32 v11, v10 :: v_dual_mov_b32 v12, v10
|
||||
; GFX12-NEXT: v_mov_b32_e32 v13, v10
|
||||
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], v[10:13]
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> <half 3.0, half 3.0, half 3.0, half 3.0, half 3.0, half 3.0, half 3.0, half 3.0>, i1 0)
|
||||
store <8 x half> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Splat i16 16256 (0x3F80) accumulator for the bf16-result WMMA: the checks
; expect it to be printed as the inline operand 1.0.
define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[10:13], v[0:3], v[4:7], 1.0
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> <i16 16256, i16 16256, i16 16256, i16 16256, i16 16256, i16 16256, i16 16256, i16 16256>, i1 0)
|
||||
store <8 x i16> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Splat i16 16320 (0x3FC0) accumulator: the checks expect the packed pattern
; 0x3fc03fc0 to be materialized into v[10:13] and passed as a register operand.
define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm_non_inlineable(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm_non_inlineable:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_mov_b32_e32 v10, 0x3fc03fc0
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_dual_mov_b32 v11, v10 :: v_dual_mov_b32 v12, v10
|
||||
; GFX12-NEXT: v_mov_b32_e32 v13, v10
|
||||
; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[10:13], v[0:3], v[4:7], v[10:13]
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> <i16 16320, i16 16320, i16 16320, i16 16320, i16 16320, i16 16320, i16 16320, i16 16320>, i1 0)
|
||||
store <8 x i16> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Splat i32 1 accumulator for the iu8 WMMA (all i1 flag arguments are 0): the
; checks expect the inline integer operand 1.
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[6:13], v[0:1], v[2:3], 1
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, i1 0)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Splat i32 128 (0x80) accumulator for the iu8 WMMA: the checks expect it to
; be materialized into v[6:13] and passed as a register operand.
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm_non_inlineable:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_mov_b32_e32 v6, 0x80
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6
|
||||
; GFX12-NEXT: v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v10, v6
|
||||
; GFX12-NEXT: v_dual_mov_b32 v11, v6 :: v_dual_mov_b32 v12, v6
|
||||
; GFX12-NEXT: v_mov_b32_e32 v13, v6
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[6:13], v[0:1], v[2:3], v[6:13]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> <i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128>, i1 0)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Splat i32 1 accumulator for the iu4 WMMA (A and B are single i32 values):
; the checks expect the inline integer operand 1.
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm(i32 %A, i32 %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v0, v1, 1
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[2:3], v[8:11], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, i1 0)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Splat i32 128 (0x80) accumulator for the iu4 WMMA: the checks expect it to
; be materialized into v[4:11] and passed as a register operand.
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm_non_inlineable(i32 %A, i32 %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm_non_inlineable:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_mov_b32_e32 v4, 0x80
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v4
|
||||
; GFX12-NEXT: v_dual_mov_b32 v7, v4 :: v_dual_mov_b32 v8, v4
|
||||
; GFX12-NEXT: v_dual_mov_b32 v9, v4 :: v_dual_mov_b32 v10, v4
|
||||
; GFX12-NEXT: v_mov_b32_e32 v11, v4
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v0, v1, v[4:11]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[2:3], v[8:11], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> <i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128>, i1 0)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Splat float 1.0 accumulator for the fp8*fp8 WMMA: the checks expect the
; inline operand 1.0.
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[6:13], v[0:1], v[2:3], 1.0
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Splat float 3.0 accumulator (0x40400000) for the fp8*fp8 WMMA: the checks
; expect it to be materialized into v[6:13] and passed as a register operand.
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_mov_b32_e32 v6, 0x40400000
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6
|
||||
; GFX12-NEXT: v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v10, v6
|
||||
; GFX12-NEXT: v_dual_mov_b32 v11, v6 :: v_dual_mov_b32 v12, v6
|
||||
; GFX12-NEXT: v_mov_b32_e32 v13, v6
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[6:13], v[0:1], v[2:3], v[6:13]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Splat float 1.0 accumulator for the bf8*fp8 WMMA: the checks expect the
; inline operand 1.0.
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[6:13], v[0:1], v[2:3], 1.0
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Splat float 3.0 accumulator (0x40400000) for the bf8*fp8 WMMA: the checks
; expect it to be materialized into v[6:13] and passed as a register operand.
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_mov_b32_e32 v6, 0x40400000
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6
|
||||
; GFX12-NEXT: v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v10, v6
|
||||
; GFX12-NEXT: v_dual_mov_b32 v11, v6 :: v_dual_mov_b32 v12, v6
|
||||
; GFX12-NEXT: v_mov_b32_e32 v13, v6
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[6:13], v[0:1], v[2:3], v[6:13]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Splat float 1.0 accumulator for the fp8*bf8 WMMA: the checks expect the
; inline operand 1.0.
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[6:13], v[0:1], v[2:3], 1.0
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Splat float 3.0 accumulator (0x40400000) for the fp8*bf8 WMMA: the checks
; expect it to be materialized into v[6:13] and passed as a register operand.
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_mov_b32_e32 v6, 0x40400000
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6
|
||||
; GFX12-NEXT: v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v10, v6
|
||||
; GFX12-NEXT: v_dual_mov_b32 v11, v6 :: v_dual_mov_b32 v12, v6
|
||||
; GFX12-NEXT: v_mov_b32_e32 v13, v6
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[6:13], v[0:1], v[2:3], v[6:13]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[6:13], v[0:1], v[2:3], 1.0
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_mov_b32_e32 v6, 0x40400000
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6
|
||||
; GFX12-NEXT: v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v10, v6
|
||||
; GFX12-NEXT: v_dual_mov_b32 v11, v6 :: v_dual_mov_b32 v12, v6
|
||||
; GFX12-NEXT: v_mov_b32_e32 v13, v6
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[6:13], v[0:1], v[2:3], v[6:13]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[6:13], v[0:1], v[2:3], 1
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, i1 0)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm_non_inlineable:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_mov_b32_e32 v6, 0x80
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6
|
||||
; GFX12-NEXT: v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v10, v6
|
||||
; GFX12-NEXT: v_dual_mov_b32 v11, v6 :: v_dual_mov_b32 v12, v6
|
||||
; GFX12-NEXT: v_mov_b32_e32 v13, v6
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[6:13], v[0:1], v[2:3], v[6:13]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> <i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128>, i1 0)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half>, <8 x half>, <8 x float>)
|
||||
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16>, <8 x i16>, <8 x float>)
|
||||
declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half>, <8 x half>, <8 x half>, i1 immarg)
|
||||
declare <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16>, <8 x i16>, <8 x i16>, i1 immarg)
|
||||
declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg)
|
||||
declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 immarg, i32, i1 immarg, i32, <8 x i32>, i1 immarg)
|
||||
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
|
||||
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
|
||||
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
|
||||
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
|
||||
declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg)
|
||||
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half>, <16 x half>, <8 x float>, i16)
|
||||
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16>, <16 x i16>, <8 x float>, i16)
|
||||
declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half>, <16 x half>, <8 x half>, i16)
|
||||
declare <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16>, <16 x i16>, <8 x i16>, i16)
|
||||
declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16 %Index, i1 immarg)
|
||||
declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <8 x i32>, i16 %Index, i1 immarg)
|
||||
declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i32 %Index, i1 immarg)
|
||||
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
|
||||
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
|
||||
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
|
||||
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
|
309
llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-iu-modifiers.ll
Normal file
309
llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-iu-modifiers.ll
Normal file
@ -0,0 +1,309 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
|
||||
|
||||
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src0(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src0:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src1(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src1:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_clamp:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] clamp
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
|
||||
|
||||
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src0(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src0:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[1,0,0]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 1, i32 %A, i1 0, i32 %B, <8 x i32> %C, i1 0)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src1(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src1:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[0,1,0]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 1, i32 %B, <8 x i32> %C, i1 0)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_clamp(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_clamp:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] clamp
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> %C, i1 1)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
|
||||
|
||||
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src0(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src0:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src1(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src1:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_clamp:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] clamp
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src0(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src0:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 1, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src1(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src1:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_clamp(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_clamp:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 clamp
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 1)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
|
||||
|
||||
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src0(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src0:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[1,0,0]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[3:6], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 1, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src1(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src1:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[0,1,0]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[3:6], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 1, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_clamp(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_clamp:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 clamp
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[3:6], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 1)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
|
||||
|
||||
define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src0(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src0:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 1, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 0)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src1(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src1:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 0, <2 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 0)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_clamp(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_clamp:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 clamp
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 1)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg)
|
||||
declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 immarg, i32, i1 immarg, i32, <8 x i32>, i1 immarg)
|
||||
declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg)
|
||||
declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16 %Index, i1 immarg)
|
||||
declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <8 x i32>, i16 %Index, i1 immarg)
|
||||
declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i32 %Index, i1 immarg)
|
321
llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-swmmac-index_key.ll
Normal file
321
llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-swmmac-index_key.ll
Normal file
@ -0,0 +1,321 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
|
||||
|
||||
define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_index_key(<8 x half> %A, <16 x half> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
|
||||
; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_index_key:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: global_load_b32 v20, v[20:21], off
|
||||
; GFX12-NEXT: v_dual_mov_b32 v33, v19 :: v_dual_mov_b32 v32, v18
|
||||
; GFX12-NEXT: v_dual_mov_b32 v31, v17 :: v_dual_mov_b32 v30, v16
|
||||
; GFX12-NEXT: v_dual_mov_b32 v29, v15 :: v_dual_mov_b32 v28, v14
|
||||
; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12
|
||||
; GFX12-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[26:33], v[0:3], v[4:11], v20
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 index_key:1
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[22:23], v[30:33], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[22:23], v[26:29], off
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[24:25], v[16:19], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[24:25], v[12:15], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
|
||||
%Index0 = extractelement <2 x i16> %IndexVec, i32 0
|
||||
%res0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index0)
|
||||
store <8 x float> %res0, ptr addrspace(1) %out0
|
||||
%Index1 = extractelement <2 x i16> %IndexVec, i32 1
|
||||
%res1 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index1)
|
||||
store <8 x float> %res1, ptr addrspace(1) %out1
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16_index_key(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
|
||||
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16_index_key:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: global_load_b32 v20, v[20:21], off
|
||||
; GFX12-NEXT: v_dual_mov_b32 v33, v19 :: v_dual_mov_b32 v32, v18
|
||||
; GFX12-NEXT: v_dual_mov_b32 v31, v17 :: v_dual_mov_b32 v30, v16
|
||||
; GFX12-NEXT: v_dual_mov_b32 v29, v15 :: v_dual_mov_b32 v28, v14
|
||||
; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12
|
||||
; GFX12-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[26:33], v[0:3], v[4:11], v20
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 index_key:1
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[22:23], v[30:33], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[22:23], v[26:29], off
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[24:25], v[16:19], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[24:25], v[12:15], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
|
||||
%Index0 = extractelement <2 x i16> %IndexVec, i32 0
|
||||
%res0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index0)
|
||||
store <8 x float> %res0, ptr addrspace(1) %out0
|
||||
%Index1 = extractelement <2 x i16> %IndexVec, i32 1
|
||||
%res1 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index1)
|
||||
store <8 x float> %res1, ptr addrspace(1) %out1
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_index_key(<8 x half> %A, <16 x half> %B, <8 x half> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
|
||||
; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_index_key:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: global_load_b32 v16, v[16:17], off
|
||||
; GFX12-NEXT: v_dual_mov_b32 v25, v15 :: v_dual_mov_b32 v24, v14
|
||||
; GFX12-NEXT: v_dual_mov_b32 v23, v13 :: v_dual_mov_b32 v22, v12
|
||||
; GFX12-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[22:25], v[0:3], v[4:11], v16
|
||||
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 index_key:1
|
||||
; GFX12-NEXT: global_store_b128 v[18:19], v[22:25], off
|
||||
; GFX12-NEXT: global_store_b128 v[20:21], v[12:15], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
|
||||
%Index0 = extractelement <2 x i16> %IndexVec, i32 0
|
||||
%res0 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index0)
|
||||
store <8 x half> %res0, ptr addrspace(1) %out0
|
||||
%Index1 = extractelement <2 x i16> %IndexVec, i32 1
|
||||
%res1 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index1)
|
||||
store <8 x half> %res1, ptr addrspace(1) %out1
|
||||
ret void
|
||||
}
|
||||
|
||||
; index_key selection for v_swmmac_bf16_16x16x32_bf16.
; Both i16 indices are extracted from one <2 x i16> load, so the expected code
; loads a single dword and passes the same VGPR (v16) to both instructions: the
; call that uses element 1 carries index_key:1, the call that uses element 0
; carries no modifier. The v_dual_mov_b32 copies give the first call its own
; copy of %C (presumably because the destination doubles as the accumulator;
; confirm against the instruction definition).
define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16_index_key(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
|
||||
; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16_index_key:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: global_load_b32 v16, v[16:17], off
|
||||
; GFX12-NEXT: v_dual_mov_b32 v25, v15 :: v_dual_mov_b32 v24, v14
|
||||
; GFX12-NEXT: v_dual_mov_b32 v23, v13 :: v_dual_mov_b32 v22, v12
|
||||
; GFX12-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[22:25], v[0:3], v[4:11], v16
|
||||
; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 index_key:1
|
||||
; GFX12-NEXT: global_store_b128 v[18:19], v[22:25], off
|
||||
; GFX12-NEXT: global_store_b128 v[20:21], v[12:15], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
|
||||
%Index0 = extractelement <2 x i16> %IndexVec, i32 0
|
||||
%res0 = call <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index0)
|
||||
store <8 x i16> %res0, ptr addrspace(1) %out0
|
||||
%Index1 = extractelement <2 x i16> %IndexVec, i32 1
|
||||
%res1 = call <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index1)
|
||||
store <8 x i16> %res1, ptr addrspace(1) %out1
|
||||
ret void
|
||||
}
|
||||
|
||||
; index_key selection for v_swmmac_i32_16x16x32_iu8.
; The two i16 indices are elements 0 and 1 of one loaded <2 x i16>; the expected
; code loads them as one dword into v14 and reuses that register for both
; instructions, adding index_key:1 only for element 1. All three i1 immediate
; operands of the intrinsic are 0 here, and no extra modifiers are expected.
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_index_key:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: global_load_b32 v14, v[14:15], off
|
||||
; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12
|
||||
; GFX12-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10
|
||||
; GFX12-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8
|
||||
; GFX12-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6
|
||||
; GFX12-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[20:27], v[0:1], v[2:5], v14
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 index_key:1
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[16:17], v[20:23], off
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
|
||||
%Index0 = extractelement <2 x i16> %IndexVec, i32 0
|
||||
%res0 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index0, i1 0)
|
||||
store <8 x i32> %res0, ptr addrspace(1) %out0
|
||||
%Index1 = extractelement <2 x i16> %IndexVec, i32 1
|
||||
%res1 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index1, i1 0)
|
||||
store <8 x i32> %res1, ptr addrspace(1) %out1
|
||||
ret void
|
||||
}
|
||||
|
||||
; index_key selection for v_swmmac_i32_16x16x32_iu4 (A is a scalar i32, B is
; <2 x i32>). Elements 0 and 1 of one loaded <2 x i16> are used as the index;
; the expected code keeps the loaded dword in v11 for both instructions and adds
; index_key:1 only for element 1.
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_index_key(i32 %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_index_key:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: global_load_b32 v11, v[11:12], off
|
||||
; GFX12-NEXT: v_dual_mov_b32 v24, v10 :: v_dual_mov_b32 v23, v9
|
||||
; GFX12-NEXT: v_dual_mov_b32 v22, v8 :: v_dual_mov_b32 v21, v7
|
||||
; GFX12-NEXT: v_dual_mov_b32 v20, v6 :: v_dual_mov_b32 v19, v5
|
||||
; GFX12-NEXT: v_dual_mov_b32 v18, v4 :: v_dual_mov_b32 v17, v3
|
||||
; GFX12-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[17:24], v0, v[1:2], v11
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 index_key:1
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[13:14], v[21:24], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[13:14], v[17:20], off
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[7:10], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
|
||||
%Index0 = extractelement <2 x i16> %IndexVec, i32 0
|
||||
%res0 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index0, i1 0)
|
||||
store <8 x i32> %res0, ptr addrspace(1) %out0
|
||||
%Index1 = extractelement <2 x i16> %IndexVec, i32 1
|
||||
%res1 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index1, i1 0)
|
||||
store <8 x i32> %res1, ptr addrspace(1) %out1
|
||||
ret void
|
||||
}
|
||||
|
||||
; index_key selection for v_swmmac_f32_16x16x32_fp8_fp8.
; Elements 0 and 1 of one loaded <2 x i16> are used as the index; the expected
; code keeps the loaded dword in v14 for both instructions and adds index_key:1
; only for element 1.
define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
|
||||
; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8_index_key:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: global_load_b32 v14, v[14:15], off
|
||||
; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12
|
||||
; GFX12-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10
|
||||
; GFX12-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8
|
||||
; GFX12-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6
|
||||
; GFX12-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[20:27], v[0:1], v[2:5], v14
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[16:17], v[20:23], off
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
|
||||
%Index0 = extractelement <2 x i16> %IndexVec, i32 0
|
||||
%res0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index0)
|
||||
store <8 x float> %res0, ptr addrspace(1) %out0
|
||||
%Index1 = extractelement <2 x i16> %IndexVec, i32 1
|
||||
%res1 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index1)
|
||||
store <8 x float> %res1, ptr addrspace(1) %out1
|
||||
ret void
|
||||
}
|
||||
|
||||
; index_key selection for v_swmmac_f32_16x16x32_fp8_bf8.
; Elements 0 and 1 of one loaded <2 x i16> are used as the index; the expected
; code keeps the loaded dword in v14 for both instructions and adds index_key:1
; only for element 1.
define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
|
||||
; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8_index_key:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: global_load_b32 v14, v[14:15], off
|
||||
; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12
|
||||
; GFX12-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10
|
||||
; GFX12-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8
|
||||
; GFX12-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6
|
||||
; GFX12-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[20:27], v[0:1], v[2:5], v14
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[16:17], v[20:23], off
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
|
||||
%Index0 = extractelement <2 x i16> %IndexVec, i32 0
|
||||
%res0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index0)
|
||||
store <8 x float> %res0, ptr addrspace(1) %out0
|
||||
%Index1 = extractelement <2 x i16> %IndexVec, i32 1
|
||||
%res1 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index1)
|
||||
store <8 x float> %res1, ptr addrspace(1) %out1
|
||||
ret void
|
||||
}
|
||||
|
||||
; index_key selection for v_swmmac_f32_16x16x32_bf8_fp8.
; Elements 0 and 1 of one loaded <2 x i16> are used as the index; the expected
; code keeps the loaded dword in v14 for both instructions and adds index_key:1
; only for element 1.
define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
|
||||
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8_index_key:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: global_load_b32 v14, v[14:15], off
|
||||
; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12
|
||||
; GFX12-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10
|
||||
; GFX12-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8
|
||||
; GFX12-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6
|
||||
; GFX12-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[20:27], v[0:1], v[2:5], v14
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[16:17], v[20:23], off
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
|
||||
%Index0 = extractelement <2 x i16> %IndexVec, i32 0
|
||||
%res0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index0)
|
||||
store <8 x float> %res0, ptr addrspace(1) %out0
|
||||
%Index1 = extractelement <2 x i16> %IndexVec, i32 1
|
||||
%res1 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index1)
|
||||
store <8 x float> %res1, ptr addrspace(1) %out1
|
||||
ret void
|
||||
}
|
||||
|
||||
; index_key selection for v_swmmac_f32_16x16x32_bf8_bf8.
; Elements 0 and 1 of one loaded <2 x i16> are used as the index; the expected
; code keeps the loaded dword in v14 for both instructions and adds index_key:1
; only for element 1.
define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
|
||||
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8_index_key:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: global_load_b32 v14, v[14:15], off
|
||||
; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12
|
||||
; GFX12-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10
|
||||
; GFX12-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8
|
||||
; GFX12-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6
|
||||
; GFX12-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[20:27], v[0:1], v[2:5], v14
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[16:17], v[20:23], off
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
|
||||
%Index0 = extractelement <2 x i16> %IndexVec, i32 0
|
||||
%res0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index0)
|
||||
store <8 x float> %res0, ptr addrspace(1) %out0
|
||||
%Index1 = extractelement <2 x i16> %IndexVec, i32 1
|
||||
%res1 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index1)
|
||||
store <8 x float> %res1, ptr addrspace(1) %out1
|
||||
ret void
|
||||
}
|
||||
|
||||
; Declarations of the SWMMAC intrinsics called by the index_key tests. The
; trailing i16 parameter is the index operand; on the iu8/iu4 variants the i1
; parameters are marked immarg, so every call site passes them as constants.
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half>, <16 x half>, <8 x float>, i16)
|
||||
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16>, <16 x i16>, <8 x float>, i16)
|
||||
declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half>, <16 x half>, <8 x half>, i16)
|
||||
declare <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16>, <16 x i16>, <8 x i16>, i16)
|
||||
declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16 %Index, i1 immarg)
|
||||
declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <8 x i32>, i16 %Index, i1 immarg)
|
||||
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
|
||||
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
|
||||
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
|
||||
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
|
370
llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32.ll
Normal file
370
llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32.ll
Normal file
@ -0,0 +1,370 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
; The triple is given in full with -mtriple because this file has no target
; triple of its own; -march would only replace the architecture component of
; the host's default triple, so the effective target would vary between hosts.
|
||||
|
||||
; Baseline selection of the f32 += f16 x f16 WMMA intrinsic with <8 x half>
; A/B operands: expects a single v_wmma_f32_16x16x16_f16 whose destination is
; the register range holding %C, followed by two 128-bit stores of the result.
define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_f16:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %C)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Baseline selection of the f32-accumulating bf16 WMMA intrinsic (A/B passed as
; <8 x i16>): expects a single v_wmma_f32_16x16x16_bf16 writing over the %C
; registers, followed by two 128-bit stores of the result.
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_bf16:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> %C)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Baseline selection of the f16-accumulating f16 WMMA intrinsic, called with
; its trailing i1 immediate set to 0: expects a single v_wmma_f16_16x16x16_f16
; with no modifiers and one 128-bit store of the <8 x half> result.
define amdgpu_ps void @test_wmma_f16_16x16x16_f16(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f16_16x16x16_f16:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11]
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %C, i1 0)
|
||||
store <8 x half> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Baseline selection of the bf16-accumulating bf16 WMMA intrinsic (operands
; passed as <8 x i16>, trailing i1 immediate 0): expects a single
; v_wmma_bf16_16x16x16_bf16 with no modifiers and one 128-bit store.
define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11]
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, i1 0)
|
||||
store <8 x i16> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Baseline selection of the i32-accumulating iu8 WMMA intrinsic with all three
; i1 immediates set to 0: expects a single v_wmma_i32_16x16x16_iu8 with no
; modifiers, followed by two 128-bit stores of the <8 x i32> result.
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x16_iu8:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Baseline selection of the i32-accumulating iu4 WMMA intrinsic, where A and B
; are each a single i32 (one VGPR apiece in the expected output) and all i1
; immediates are 0: expects a single v_wmma_i32_16x16x16_iu4 with no modifiers.
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x16_iu4:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> %C, i1 0)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Baseline selection of the f32-accumulating fp8 x fp8 WMMA intrinsic (A/B
; passed as <2 x i32>): expects a single v_wmma_f32_16x16x16_fp8_fp8 writing
; over the %C registers, followed by two 128-bit stores of the result.
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Baseline selection of the f32-accumulating bf8 x fp8 WMMA intrinsic (A/B
; passed as <2 x i32>): expects a single v_wmma_f32_16x16x16_bf8_fp8 writing
; over the %C registers, followed by two 128-bit stores of the result.
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Baseline selection of the f32-accumulating fp8 x bf8 WMMA intrinsic (A/B
; passed as <2 x i32>): expects a single v_wmma_f32_16x16x16_fp8_bf8 writing
; over the %C registers, followed by two 128-bit stores of the result.
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Baseline selection of the f32-accumulating bf8 x bf8 WMMA intrinsic (A/B
; passed as <2 x i32>): expects a single v_wmma_f32_16x16x16_bf8_bf8 writing
; over the %C registers, followed by two 128-bit stores of the result.
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Baseline selection of the 16x16x32 iu4 WMMA intrinsic (A/B are <2 x i32>
; here, unlike the scalar i32 operands of the 16x16x16 iu4 form) with all i1
; immediates 0: expects a single v_wmma_i32_16x16x32_iu4 with no modifiers.
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x32_iu4:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
|
||||
; Baseline selection of the f32-accumulating f16 SWMMAC intrinsic: the i16
; %Index argument is expected in a single VGPR (v20) as the last operand of one
; v_swmmac_f32_16x16x32_f16, with no index_key modifier; the destination is the
; register range holding %C.
define amdgpu_ps void @test_swmmac_f32_16x16x32_f16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_f32_16x16x32_f16:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[21:22], v[12:15], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Baseline selection of the f32-accumulating bf16 SWMMAC intrinsic (A/B passed
; as i16 vectors): expects one v_swmmac_f32_16x16x32_bf16 taking %Index in a
; single VGPR (v20) with no index_key modifier.
define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[21:22], v[12:15], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Baseline selection of the f16-accumulating f16 SWMMAC intrinsic: expects one
; v_swmmac_f16_16x16x32_f16 taking %Index in a single VGPR (v16) with no
; index_key modifier, then one 128-bit store of the <8 x half> result.
define amdgpu_ps void @test_swmmac_f16_16x16x32_f16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_f16_16x16x32_f16:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16
|
||||
; GFX12-NEXT: global_store_b128 v[17:18], v[12:15], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index)
|
||||
store <8 x half> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Baseline selection of the bf16-accumulating bf16 SWMMAC intrinsic (all
; matrices passed as i16 vectors): expects one v_swmmac_bf16_16x16x32_bf16
; taking %Index in a single VGPR (v16) with no index_key modifier.
define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16
|
||||
; GFX12-NEXT: global_store_b128 v[17:18], v[12:15], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index)
|
||||
store <8 x i16> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Baseline selection of the iu8 SWMMAC intrinsic with all three i1 immediates
; set to 0: expects one v_swmmac_i32_16x16x32_iu8 taking %Index in a single
; VGPR (v14), with no modifiers.
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Baseline selection of the 16x16x32 iu4 SWMMAC intrinsic (A is a scalar i32,
; B is <2 x i32>, all i1 immediates 0): expects one v_swmmac_i32_16x16x32_iu4
; taking %Index in a single VGPR (v11), with no modifiers.
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[3:6], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Baseline selection of the 16x16x64 iu4 SWMMAC intrinsic. Unlike the other
; SWMMAC tests in this file, %Index is an i32 rather than an i16; it is still
; expected in a single VGPR (v14) on one v_swmmac_i32_16x16x64_iu4 with no
; modifiers.
define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 0)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Baseline selection of the fp8 x fp8 SWMMAC intrinsic: expects one
; v_swmmac_f32_16x16x32_fp8_fp8 taking %Index in a single VGPR (v14) with no
; index_key modifier, followed by two 128-bit stores of the result.
define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Baseline selection of the fp8 x bf8 SWMMAC intrinsic: expects one
; v_swmmac_f32_16x16x32_fp8_bf8 taking %Index in a single VGPR (v14) with no
; index_key modifier, followed by two 128-bit stores of the result.
define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Baseline selection of the bf8 x fp8 SWMMAC intrinsic: expects one
; v_swmmac_f32_16x16x32_bf8_fp8 taking %Index in a single VGPR (v14) with no
; index_key modifier, followed by two 128-bit stores of the result.
define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Baseline selection of the bf8 x bf8 SWMMAC intrinsic: expects one
; v_swmmac_f32_16x16x32_bf8_bf8 taking %Index in a single VGPR (v14) with no
; index_key modifier, followed by two 128-bit stores of the result.
define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Declarations of every WMMA and SWMMAC intrinsic called by the tests in this
; file. Parameters marked immarg are passed as constants at every call site.
; The SWMMAC declarations end with the index operand, which is i16 for all
; variants except 16x16x64.iu4, where it is i32.
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half>, <8 x half>, <8 x float>)
|
||||
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16>, <8 x i16>, <8 x float>)
|
||||
declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half>, <8 x half>, <8 x half>, i1 immarg)
|
||||
declare <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16>, <8 x i16>, <8 x i16>, i1 immarg)
|
||||
declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg)
|
||||
declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 immarg, i32, i1 immarg, i32, <8 x i32>, i1 immarg)
|
||||
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
|
||||
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
|
||||
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
|
||||
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
|
||||
declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg)
|
||||
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half>, <16 x half>, <8 x float>, i16)
|
||||
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16>, <16 x i16>, <8 x float>, i16)
|
||||
declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half>, <16 x half>, <8 x half>, i16)
|
||||
declare <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16>, <16 x i16>, <8 x i16>, i16)
|
||||
declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16 %Index, i1 immarg)
|
||||
declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <8 x i32>, i16 %Index, i1 immarg)
|
||||
declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i32 %Index, i1 immarg)
|
||||
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
|
||||
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
|
||||
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
|
||||
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
|
@ -0,0 +1,456 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
|
||||
|
||||
; fneg on matrix A of the f32 <- f16 WMMA. The checks expect the negation to be
; folded into the instruction as neg_lo:[1,0,0] neg_hi:[1,0,0], with the WMMA
; being the first instruction of the block (no separate negate).
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %fneg.A = fneg <4 x half> %A
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %fneg.A, <4 x half> %B, <4 x float> %C)
  store <4 x float> %res, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; fneg on matrix B of the f32 <- f16 WMMA. The checks expect it folded as
; neg_lo:[0,1,0] neg_hi:[0,1,0] on the WMMA itself.
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negB(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negB:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] neg_hi:[0,1,0]
; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %fneg.B = fneg <4 x half> %B
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %fneg.B, <4 x float> %C)
  store <4 x float> %res, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; fneg on the f32 accumulator C. The checks expect only neg_lo:[0,0,1] on the
; WMMA (no neg_hi bit for C in this case).
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negC(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1]
; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %fneg.C = fneg <4 x float> %C
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %fneg.C)
  store <4 x float> %res, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; fabs on the f32 accumulator C. The checks expect it to be selected as
; neg_hi:[0,0,1] on the WMMA, with no separate fabs instruction.
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_absC(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_absC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1]
; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %fabs.C)
  store <4 x float> %res, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; fneg on the f32 accumulator C of the bf16 WMMA (A/B passed as <4 x i16>).
; The checks expect neg_lo:[0,0,1] on the WMMA.
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_negC(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_negC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1]
; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %fneg.C = fneg <4 x float> %C
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> %fneg.C)
  store <4 x float> %res, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; fabs on the f32 accumulator C of the bf16 WMMA. The checks expect
; neg_hi:[0,0,1] on the WMMA.
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_absC(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_absC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1]
; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> %fabs.C)
  store <4 x float> %res, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; fneg on matrix A of the f16 <- f16 WMMA (trailing i1 immarg is 0). The checks
; expect neg_lo:[1,0,0] neg_hi:[1,0,0] on the WMMA and a 64-bit result store.
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negA(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negA:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %fneg.A = fneg <4 x half> %A
  %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %fneg.A, <4 x half> %B, <4 x half> %C, i1 0)
  store <4 x half> %res, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; fneg on matrix B of the f16 <- f16 WMMA. The checks expect
; neg_lo:[0,1,0] neg_hi:[0,1,0] on the WMMA.
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0] neg_hi:[0,1,0]
; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %fneg.B = fneg <4 x half> %B
  %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %fneg.B, <4 x half> %C, i1 0)
  store <4 x half> %res, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; fneg on the f16 accumulator C. The checks expect neg_lo:[0,0,1] on the WMMA.
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1]
; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %fneg.C = fneg <4 x half> %C
  %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fneg.C, i1 0)
  store <4 x half> %res, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; fabs on the f16 accumulator C. The checks expect neg_hi:[0,0,1] on the WMMA.
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_absC(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_absC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1]
; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %fabs.C = call <4 x half> @llvm.fabs.v4f16(<4 x half> %C)
  %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fabs.C, i1 0)
  store <4 x half> %res, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; fneg on the f32 accumulator C of the fp8 x fp8 WMMA (A/B are single i32
; operands). The checks expect neg_lo:[0,0,1] on the WMMA.
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_negC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_negC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %fneg.C = fneg <4 x float> %C
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fneg.C)
  store <4 x float> %res, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; fabs on the f32 accumulator C of the fp8 x fp8 WMMA. The checks expect
; neg_hi:[0,0,1] on the WMMA.
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_absC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_absC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fabs.C)
  store <4 x float> %res, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; fneg on the f32 accumulator C of the bf8 x fp8 WMMA. The checks expect
; neg_lo:[0,0,1] on the WMMA.
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_negC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_negC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %fneg.C = fneg <4 x float> %C
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fneg.C)
  store <4 x float> %res, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; fabs on the f32 accumulator C of the bf8 x fp8 WMMA. The checks expect
; neg_hi:[0,0,1] on the WMMA.
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_absC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_absC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fabs.C)
  store <4 x float> %res, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; fneg on the f32 accumulator C of the fp8 x bf8 WMMA. The checks expect
; neg_lo:[0,0,1] on the WMMA.
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_negC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_negC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %fneg.C = fneg <4 x float> %C
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fneg.C)
  store <4 x float> %res, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; fabs on the f32 accumulator C of the fp8 x bf8 WMMA. The checks expect
; neg_hi:[0,0,1] on the WMMA.
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_absC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_absC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fabs.C)
  store <4 x float> %res, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; fneg on the f32 accumulator C of the bf8 x bf8 WMMA. The checks expect
; neg_lo:[0,0,1] on the WMMA.
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_negC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_negC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %fneg.C = fneg <4 x float> %C
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fneg.C)
  store <4 x float> %res, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; fabs on the f32 accumulator C of the bf8 x bf8 WMMA. The checks expect
; neg_hi:[0,0,1] on the WMMA.
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_absC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_absC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fabs.C)
  store <4 x float> %res, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; fneg on the A operand of the f32 <- f16 SWMMAC (A is <4 x half>, B is
; <8 x half>, plus an i16 index). The checks expect
; neg_lo:[1,0,0] neg_hi:[1,0,0] on the SWMMAC.
define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negA(<4 x half> %A, <8 x half> %B, <4 x float> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negA:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX12-NEXT: global_store_b128 v[11:12], v[6:9], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %fneg.A = fneg <4 x half> %A
  %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f16.v8f16.v4f32.i16(<4 x half> %fneg.A, <8 x half> %B, <4 x float> %C, i16 %Index)
  store <4 x float> %res, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; fneg on the <8 x half> B operand of the f32 <- f16 SWMMAC. The checks expect
; neg_lo:[0,1,0] neg_hi:[0,1,0] on the SWMMAC.
define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negB(<4 x half> %A, <8 x half> %B, <4 x float> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negB:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0] neg_hi:[0,1,0]
; GFX12-NEXT: global_store_b128 v[11:12], v[6:9], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %fneg.B = fneg <8 x half> %B
  %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f16.v8f16.v4f32.i16(<4 x half> %A, <8 x half> %fneg.B, <4 x float> %C, i16 %Index)
  store <4 x float> %res, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; fneg on the A operand of the f16 <- f16 SWMMAC. The checks expect
; neg_lo:[1,0,0] neg_hi:[1,0,0] on the SWMMAC and a 64-bit result store.
define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negA(<4 x half> %A, <8 x half> %B, <4 x half> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negA:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX12-NEXT: global_store_b64 v[9:10], v[6:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %fneg.A = fneg <4 x half> %A
  %res = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v8f16.v4f16.i16(<4 x half> %fneg.A, <8 x half> %B, <4 x half> %C, i16 %Index)
  store <4 x half> %res, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; fneg on the <8 x half> B operand of the f16 <- f16 SWMMAC. The checks expect
; neg_lo:[0,1,0] neg_hi:[0,1,0] on the SWMMAC.
define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negB(<4 x half> %A, <8 x half> %B, <4 x half> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negB:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0] neg_hi:[0,1,0]
; GFX12-NEXT: global_store_b64 v[9:10], v[6:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %fneg.B = fneg <8 x half> %B
  %res = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v8f16.v4f16.i16(<4 x half> %A, <8 x half> %fneg.B, <4 x half> %C, i16 %Index)
  store <4 x half> %res, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; Both neg and abs patterns applied together (WMMA matrix C, f32 or f16)
|
||||
|
||||
; fneg(fabs(C)) on the f32 accumulator. The checks expect both modifiers on the
; WMMA at once: neg_lo:[0,0,1] neg_hi:[0,0,1].
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negabsC(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negabsC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] neg_hi:[0,0,1]
; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
  %fneg.fabs.C = fneg <4 x float> %fabs.C
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %fneg.fabs.C)
  store <4 x float> %res, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; fneg(fabs(C)) on the f16 accumulator. The checks expect
; neg_lo:[0,0,1] neg_hi:[0,0,1] on the WMMA.
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negabsC(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negabsC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] neg_hi:[0,0,1]
; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %fabs.C = call <4 x half> @llvm.fabs.v4f16(<4 x half> %C)
  %fneg.fabs.C = fneg <4 x half> %fabs.C
  %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fneg.fabs.C, i1 0)
  store <4 x half> %res, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; Only element 3 of C goes through fabs before the whole vector is negated.
; The checks expect the partial fabs to stay an explicit v_and_b32 with
; 0x7fffffff on v7, while the fneg is still folded as neg_lo:[0,0,1].
; NOTE(review): despite the "fabsA" suffix, the operand modified here is C.
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_neg_partial_fabsA(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_neg_partial_fabsA:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_and_b32_e32 v7, 0x7fffffff, v7
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1]
; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %el3 = extractelement <4 x float> %C, i32 3
  %el3.fabs = call float @llvm.fabs.f32(float %el3)
  %partial.fabs.C = insertelement <4 x float> %C, float %el3.fabs, i32 3
  %fneg.partial.fabs.C = fneg <4 x float> %partial.fabs.C
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %fneg.partial.fabs.C)
  store <4 x float> %res, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; A or B matrix modifier and constant in C
|
||||
|
||||
; fneg on A combined with an all-1.0 constant C. The checks expect the inline
; constant 1.0 as the C operand together with neg_lo:[1,0,0] neg_hi:[1,0,0].
; The %C argument is unused.
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA_constantC(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA_constantC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], 1.0 neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %fneg.A = fneg <4 x half> %A
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %fneg.A, <4 x half> %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
  store <4 x float> %res, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; fneg on B combined with an all-1.0 half constant C. The checks expect the
; inline constant 1.0 as the C operand together with
; neg_lo:[0,1,0] neg_hi:[0,1,0]. The %C argument is unused.
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB_constantC(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB_constantC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], 1.0 neg_lo:[0,1,0] neg_hi:[0,1,0]
; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %fneg.B = fneg <4 x half> %B
  %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %fneg.B, <4 x half> <half 1.0, half 1.0, half 1.0, half 1.0>, i1 0)
  store <4 x half> %res, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; pack f16 elements with v_perm_b32 since they don't come from same b32
|
||||
|
||||
; C is built from the even-indexed elements (0, 2, 4, 6) of an <8 x half>
; loaded from %Caddr and then negated. The checks expect two v_perm_b32
; instructions to repack those halves before the WMMA, with the fneg still
; folded as neg_lo:[0,0,1].
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC_pack(<4 x half> %A, <4 x half> %B, ptr %Caddr, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC_pack:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: flat_load_b128 v[8:11], v[4:5]
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: v_perm_b32 v5, v11, v10, 0x5040100
; GFX12-NEXT: v_perm_b32 v4, v9, v8, 0x5040100
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1]
; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %C = load <8 x half>, ptr %Caddr
  %C_shuffle = shufflevector <8 x half> %C, <8 x half> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %fneg.C_shuffle = fneg <4 x half> %C_shuffle
  %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fneg.C_shuffle, i1 0)
  store <4 x half> %res, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; Generic fabs intrinsics used to build the abs / neg-abs operand patterns.
declare <4 x half> @llvm.fabs.v4f16(<4 x half>)
declare <4 x float> @llvm.fabs.v4f32(<4 x float>)
declare float @llvm.fabs.f32(float)

; WMMA intrinsics with the operand widths used by the tests in this file.
; NOTE(review): the ".v8f16.v8f16" suffix on the f16 variant does not match its
; <4 x half> operands; presumably the IR reader remangles it - confirm before
; renaming, since every call site uses this exact spelling.
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half>, <4 x half>, <4 x float>)
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4i16.v4f32(<4 x i16>, <4 x i16>, <4 x float>)
declare <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half>, <4 x half>, <4 x half>, i1 immarg)
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.i32.v4f32(i32, i32, <4 x float>)
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.i32.v4f32(i32, i32, <4 x float>)
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.i32.v4f32(i32, i32, <4 x float>)
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.i32.v4f32(i32, i32, <4 x float>)

; SWMMAC intrinsics (A, B, C plus an i16 index operand).
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f16.v8f16.v4f32.i16(<4 x half>, <8 x half>, <4 x float>, i16)
declare <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v8f16.v4f16.i16(<4 x half>, <8 x half>, <4 x half>, i16)
|
373
llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-imm.ll
Normal file
373
llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-imm.ll
Normal file
@ -0,0 +1,373 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
|
||||
|
||||
; All-1.0 f32 splat as C: the checks expect it encoded directly as the inline
; constant 1.0 on the WMMA, with no register materialisation.
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], 1.0
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %result = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
  store <4 x float> %result, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; All-3.0 f32 splat as C: the checks expect 0x40400000 to be moved into v6 and
; copied to v7..v9, and the WMMA to take v[6:9] as a register operand.
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm_non_inlineable(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm_non_inlineable:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_mov_b32_e32 v6, 0x40400000
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_mov_b32_e32 v7, v6
; GFX12-NEXT: v_mov_b32_e32 v8, v6
; GFX12-NEXT: v_mov_b32_e32 v9, v6
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], v[6:9]
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %result = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
  store <4 x float> %result, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; All-1.0 f32 splat as C of the bf16 WMMA: the checks expect the inline
; constant 1.0 on the instruction.
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[6:9], v[0:1], v[2:3], 1.0
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %result = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
  store <4 x float> %result, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; All-3.0 f32 splat as C of the bf16 WMMA: the checks expect 0x40400000 to be
; materialised into v[6:9] with v_mov_b32 and passed as a register operand.
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm_non_inlineable(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm_non_inlineable:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_mov_b32_e32 v6, 0x40400000
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_mov_b32_e32 v7, v6
; GFX12-NEXT: v_mov_b32_e32 v8, v6
; GFX12-NEXT: v_mov_b32_e32 v9, v6
; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[6:9], v[0:1], v[2:3], v[6:9]
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %result = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
  store <4 x float> %result, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; All-1.0 half splat as C of the f16 WMMA: the checks expect the inline
; constant 1.0 on the instruction and a 64-bit result store.
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], 1.0
; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %result = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half> %A, <4 x half> %B, <4 x half> <half 1.0, half 1.0, half 1.0, half 1.0>, i1 0)
  store <4 x half> %result, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; All-3.0 half splat as C of the f16 WMMA: the checks expect the packed value
; 0x42004200 to be moved into v6, copied to v7, and v[6:7] used as C.
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm_non_inlineable(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm_non_inlineable:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_mov_b32_e32 v6, 0x42004200
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_mov_b32_e32 v7, v6
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], v[6:7]
; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %result = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half> %A, <4 x half> %B, <4 x half> <half 3.0, half 3.0, half 3.0, half 3.0>, i1 0)
  store <4 x half> %result, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; C is an i16 splat of 16256 (0x3F80): the checks expect it to be printed as
; the inline constant 1.0 on the bf16 WMMA.
define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[6:7], v[0:1], v[2:3], 1.0
; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %result = call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> <i16 16256, i16 16256, i16 16256, i16 16256>, i1 0)
  store <4 x i16> %result, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; C is an i16 splat of 16320 (0x3FC0): the checks expect the packed value
; 0x3fc03fc0 to be moved into v6, copied to v7, and v[6:7] used as C.
define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm_non_inlineable(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm_non_inlineable:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_mov_b32_e32 v6, 0x3fc03fc0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_mov_b32_e32 v7, v6
; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[6:7], v[0:1], v[2:3], v[6:7]
; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %result = call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> <i16 16320, i16 16320, i16 16320, i16 16320>, i1 0)
  store <4 x i16> %result, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; All-ones i32 splat as C of the iu8 WMMA (all three i1 immargs are 0): the
; checks expect the inline constant 1 on the instruction.
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm(i32 %A, i32 %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:7], v0, v1, 1
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %result = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 1, i32 1, i32 1, i32 1>, i1 0)
  store <4 x i32> %result, ptr addrspace(1) %out
  ret void
}
|
||||
|
||||
; The accumulator operand is the constant splat <4 x i32> of 128. The checks
; expect 0x80 to be moved into v4, copied to v5-v7, and the WMMA to read its
; accumulator from v[4:7] rather than from an inline operand.
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm_non_inlineable(i32 %A, i32 %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm_non_inlineable:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_mov_b32_e32 v4, 0x80
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_mov_b32_e32 v5, v4
|
||||
; GFX12-NEXT: v_mov_b32_e32 v6, v4
|
||||
; GFX12-NEXT: v_mov_b32_e32 v7, v4
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:7], v0, v1, v[4:7]
|
||||
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 128, i32 128, i32 128, i32 128>, i1 0)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; iu4 variant of the inline-constant test: the accumulator operand is the
; constant splat <4 x i32> of 1, expected as the inline integer operand 1.
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm(i32 %A, i32 %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v0, v1, 1
|
||||
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 1, i32 1, i32 1, i32 1>, i1 0)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; iu4 variant of the materialized-constant test: the accumulator operand is the
; constant splat <4 x i32> of 128. The checks expect 0x80 to be moved into v4,
; copied to v5-v7, and the WMMA to read its accumulator from v[4:7].
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm_non_inlineable(i32 %A, i32 %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm_non_inlineable:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_mov_b32_e32 v4, 0x80
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_mov_b32_e32 v5, v4
|
||||
; GFX12-NEXT: v_mov_b32_e32 v6, v4
|
||||
; GFX12-NEXT: v_mov_b32_e32 v7, v4
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v0, v1, v[4:7]
|
||||
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 128, i32 128, i32 128, i32 128>, i1 0)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; The accumulator operand is the constant splat <4 x float> of 1.0, expected as
; the inline operand 1.0. The %C argument is declared but never referenced in
; the body; the constant is passed to the intrinsic instead.
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:7], v0, v1, 1.0
|
||||
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
|
||||
store <4 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; The accumulator operand is the constant splat <4 x float> of 3.0. The checks
; expect 0x40400000 (the f32 bit pattern of 3.0) to be moved into v4, copied to
; v5-v7, and the WMMA to read its accumulator from v[4:7]. The %C argument is
; declared but never referenced in the body.
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_mov_b32_e32 v4, 0x40400000
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_mov_b32_e32 v5, v4
|
||||
; GFX12-NEXT: v_mov_b32_e32 v6, v4
|
||||
; GFX12-NEXT: v_mov_b32_e32 v7, v4
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:7], v0, v1, v[4:7]
|
||||
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
|
||||
store <4 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; bf8.fp8 variant of the inline-constant test: the accumulator operand is the
; constant splat <4 x float> of 1.0, expected as the inline operand 1.0. The
; %C argument is declared but never referenced in the body.
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:7], v0, v1, 1.0
|
||||
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
|
||||
store <4 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; bf8.fp8 variant of the materialized-constant test: the accumulator operand is
; the constant splat <4 x float> of 3.0. The checks expect 0x40400000 to be
; moved into v4, copied to v5-v7, and the WMMA to read its accumulator from
; v[4:7]. The %C argument is declared but never referenced in the body.
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_mov_b32_e32 v4, 0x40400000
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_mov_b32_e32 v5, v4
|
||||
; GFX12-NEXT: v_mov_b32_e32 v6, v4
|
||||
; GFX12-NEXT: v_mov_b32_e32 v7, v4
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:7], v0, v1, v[4:7]
|
||||
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
|
||||
store <4 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; fp8.bf8 variant of the inline-constant test: the accumulator operand is the
; constant splat <4 x float> of 1.0, expected as the inline operand 1.0. The
; %C argument is declared but never referenced in the body.
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:7], v0, v1, 1.0
|
||||
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
|
||||
store <4 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; fp8.bf8 variant of the materialized-constant test: the accumulator operand is
; the constant splat <4 x float> of 3.0. The checks expect 0x40400000 to be
; moved into v4, copied to v5-v7, and the WMMA to read its accumulator from
; v[4:7]. The %C argument is declared but never referenced in the body.
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_mov_b32_e32 v4, 0x40400000
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_mov_b32_e32 v5, v4
|
||||
; GFX12-NEXT: v_mov_b32_e32 v6, v4
|
||||
; GFX12-NEXT: v_mov_b32_e32 v7, v4
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:7], v0, v1, v[4:7]
|
||||
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
|
||||
store <4 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; bf8.bf8 variant of the inline-constant test: the accumulator operand is the
; constant splat <4 x float> of 1.0, expected as the inline operand 1.0. The
; %C argument is declared but never referenced in the body.
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:7], v0, v1, 1.0
|
||||
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
|
||||
store <4 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; bf8.bf8 variant of the materialized-constant test: the accumulator operand is
; the constant splat <4 x float> of 3.0. The checks expect 0x40400000 to be
; moved into v4, copied to v5-v7, and the WMMA to read its accumulator from
; v[4:7]. The %C argument is declared but never referenced in the body.
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_mov_b32_e32 v4, 0x40400000
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_mov_b32_e32 v5, v4
|
||||
; GFX12-NEXT: v_mov_b32_e32 v6, v4
|
||||
; GFX12-NEXT: v_mov_b32_e32 v7, v4
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:7], v0, v1, v[4:7]
|
||||
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
|
||||
store <4 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; 16x16x32 iu4 variant of the inline-constant test: the accumulator operand is
; the constant splat <4 x i32> of 1, expected as the inline integer operand 1.
; The %C argument is declared but never referenced in the body.
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:7], v0, v1, 1
|
||||
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 1, i32 1, i32 1, i32 1>, i1 0)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; 16x16x32 iu4 variant of the materialized-constant test: the accumulator
; operand is the constant splat <4 x i32> of 128. The checks expect 0x80 to be
; moved into v4, copied to v5-v7, and the WMMA to read its accumulator from
; v[4:7]. The %C argument is declared but never referenced in the body.
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm_non_inlineable(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm_non_inlineable:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_mov_b32_e32 v4, 0x80
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_mov_b32_e32 v5, v4
|
||||
; GFX12-NEXT: v_mov_b32_e32 v6, v4
|
||||
; GFX12-NEXT: v_mov_b32_e32 v7, v4
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:7], v0, v1, v[4:7]
|
||||
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 128, i32 128, i32 128, i32 128>, i1 0)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Declarations of the WMMA intrinsics, one per overloaded type signature.
; The i1 parameters carry the immarg attribute, so call sites must pass them
; as immediate constants. The f32/fp8/bf8 forms take no i1 flags.
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half>, <4 x half>, <4 x float>)
|
||||
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16>, <4 x i16>, <4 x float>)
|
||||
declare <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half>, <4 x half>, <4 x half>, i1 immarg)
|
||||
declare <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16>, <4 x i16>, <4 x i16>, i1 immarg)
|
||||
declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg)
|
||||
declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg)
|
||||
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>)
|
||||
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>)
|
||||
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>)
|
||||
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>)
|
||||
declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg)
|
274
llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-iu-modifiers.ll
Normal file
274
llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-iu-modifiers.ll
Normal file
@ -0,0 +1,274 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
|
||||
|
||||
; Sets only the first i1 immediate (the flag preceding %A) to 1. The checks
; expect this to appear on the instruction as neg_lo:[1,0,0].
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src0(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src0:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0]
|
||||
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Sets only the second i1 immediate (the flag preceding %B) to 1. The checks
; expect this to appear on the instruction as neg_lo:[0,1,0].
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src1(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src1:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0]
|
||||
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i1 0)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Sets only the trailing i1 immediate to 1. The checks expect the instruction
; to carry the clamp modifier and no neg_lo modifier.
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_clamp(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_clamp:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] clamp
|
||||
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 1)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
|
||||
|
||||
; iu4 variant: sets only the first i1 immediate (the flag preceding %A) to 1.
; The checks expect this to appear on the instruction as neg_lo:[1,0,0].
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src0(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src0:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0]
|
||||
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; iu4 variant: sets only the second i1 immediate (the flag preceding %B) to 1.
; The checks expect this to appear on the instruction as neg_lo:[0,1,0].
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src1(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src1:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0]
|
||||
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i1 0)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; iu4 variant: sets only the trailing i1 immediate to 1. The checks expect the
; instruction to carry the clamp modifier and no neg_lo modifier.
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_clamp(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_clamp:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] clamp
|
||||
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 1)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
|
||||
|
||||
; 16x16x32 iu4 variant: sets only the first i1 immediate (the flag preceding
; %A) to 1. The checks expect this to appear as neg_lo:[1,0,0].
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src0(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src0:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0]
|
||||
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; 16x16x32 iu4 variant: sets only the second i1 immediate (the flag preceding
; %B) to 1. The checks expect this to appear as neg_lo:[0,1,0].
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src1(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src1:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0]
|
||||
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i1 0)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; 16x16x32 iu4 variant: sets only the trailing i1 immediate to 1. The checks
; expect the instruction to carry the clamp modifier and no neg_lo modifier.
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_clamp(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_clamp:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] clamp
|
||||
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 1)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
; SWMMAC iu8 with an i8 %Index operand: sets only the first i1 immediate (the
; flag preceding %A) to 1. The checks expect neg_lo:[1,0,0] on the instruction,
; with the index read from v7.
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src0(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src0:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0]
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 1, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 0)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; SWMMAC iu8 with an i8 %Index operand: sets only the second i1 immediate (the
; flag preceding %B) to 1. The checks expect neg_lo:[0,1,0] on the instruction.
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src1(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src1:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0]
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 1, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 0)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; SWMMAC iu8 with an i8 %Index operand: sets only the trailing i1 immediate to
; 1. The checks expect the clamp modifier and no neg_lo modifier.
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_clamp(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_clamp:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 clamp
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 1)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
|
||||
|
||||
; SWMMAC 16x16x32 iu4 with an i16 %Index operand: sets only the first i1
; immediate (the flag preceding %A) to 1. The checks expect neg_lo:[1,0,0] on
; the instruction, with the index read from v6.
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src0(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src0:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[1,0,0]
|
||||
; GFX12-NEXT: global_store_b128 v[7:8], v[2:5], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index, i1 0)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; SWMMAC 16x16x32 iu4 with an i16 %Index operand: sets only the second i1
; immediate (the flag preceding %B) to 1. The checks expect neg_lo:[0,1,0].
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src1(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src1:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[0,1,0]
|
||||
; GFX12-NEXT: global_store_b128 v[7:8], v[2:5], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i16 %Index, i1 0)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; SWMMAC 16x16x32 iu4 with an i16 %Index operand: sets only the trailing i1
; immediate to 1. The checks expect the clamp modifier and no neg_lo modifier.
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_clamp(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_clamp:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 clamp
|
||||
; GFX12-NEXT: global_store_b128 v[7:8], v[2:5], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index, i1 1)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
|
||||
|
||||
; SWMMAC 16x16x64 iu4 (B is <2 x i32>, %Index is i16): sets only the first i1
; immediate (the flag preceding %A) to 1. The checks expect neg_lo:[1,0,0] on
; the instruction, with the index read from v7.
define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src0(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src0:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0]
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 1, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 0)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; SWMMAC 16x16x64 iu4 (B is <2 x i32>, %Index is i16): sets only the second i1
; immediate (the flag preceding %B) to 1. The checks expect neg_lo:[0,1,0].
define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src1(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src1:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0]
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 1, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 0)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; SWMMAC 16x16x64 iu4 (B is <2 x i32>, %Index is i16): sets only the trailing
; i1 immediate to 1. The checks expect the clamp modifier and no neg_lo
; modifier.
define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_clamp(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_clamp:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 clamp
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 1)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Declarations of the integer WMMA and SWMMAC intrinsics called above. Every
; i1 parameter carries immarg, so call sites must pass it as an immediate
; constant. The SWMMAC forms take an extra index operand (i8 or i16, named
; %Index here) between the accumulator and the trailing i1.
declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg)
|
||||
declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg)
|
||||
declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg)
|
||||
declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i8 %Index, i1 immarg)
|
||||
declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i16 %Index, i1 immarg)
|
||||
declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i16 %Index, i1 immarg)
|
472
llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-swmmac-index_key.ll
Normal file
472
llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-swmmac-index_key.ll
Normal file
@ -0,0 +1,472 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
|
||||
|
||||
; Index-key selection for v_swmmac_f32_16x16x32_f16. A <4 x i8> vector is loaded
; from %IndexVecPtr and each of its four elements is passed in turn as the i8
; index operand; the four results are stored to %out0..%out3. The expected code
; issues one global_load_b32 and feeds that same register to all four
; instructions, with index_key:1..3 on the calls using elements 1..3 and no
; modifier printed for element 0.
define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_index_key(<4 x half> %A, <8 x half> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
|
||||
; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_index_key:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: global_load_b32 v10, v[10:11], off
|
||||
; GFX12-NEXT: v_mov_b32_e32 v23, v9
|
||||
; GFX12-NEXT: v_mov_b32_e32 v22, v8
|
||||
; GFX12-NEXT: v_mov_b32_e32 v21, v7
|
||||
; GFX12-NEXT: v_mov_b32_e32 v20, v6
|
||||
; GFX12-NEXT: v_mov_b32_e32 v27, v9
|
||||
; GFX12-NEXT: v_mov_b32_e32 v26, v8
|
||||
; GFX12-NEXT: v_mov_b32_e32 v25, v7
|
||||
; GFX12-NEXT: v_mov_b32_e32 v24, v6
|
||||
; GFX12-NEXT: v_mov_b32_e32 v31, v9
|
||||
; GFX12-NEXT: v_mov_b32_e32 v30, v8
|
||||
; GFX12-NEXT: v_mov_b32_e32 v29, v7
|
||||
; GFX12-NEXT: v_mov_b32_e32 v28, v6
|
||||
; GFX12-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[20:23], v[0:1], v[2:5], v10
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[24:27], v[0:1], v[2:5], v10 index_key:1
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[28:31], v[0:1], v[2:5], v10 index_key:2
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:3
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[20:23], off
|
||||
; GFX12-NEXT: global_store_b128 v[14:15], v[24:27], off
|
||||
; GFX12-NEXT: global_store_b128 v[16:17], v[28:31], off
|
||||
; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
|
||||
%Index0 = extractelement <4 x i8> %IndexVec, i32 0
|
||||
%res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index0)
|
||||
store <4 x float> %res0, ptr addrspace(1) %out0
|
||||
%Index1 = extractelement <4 x i8> %IndexVec, i32 1
|
||||
%res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index1)
|
||||
store <4 x float> %res1, ptr addrspace(1) %out1
|
||||
%Index2 = extractelement <4 x i8> %IndexVec, i32 2
|
||||
%res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index2)
|
||||
store <4 x float> %res2, ptr addrspace(1) %out2
|
||||
%Index3 = extractelement <4 x i8> %IndexVec, i32 3
|
||||
%res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index3)
|
||||
store <4 x float> %res3, ptr addrspace(1) %out3
|
||||
ret void
|
||||
}
|
||||
|
||||
; Index-key selection for v_swmmac_f32_16x16x32_bf16 (A and B are passed as i16
; vectors). Each of the four i8 elements of the vector loaded from %IndexVecPtr
; is used as the index operand of one call, and the results are stored to
; %out0..%out3. The expected code issues one global_load_b32 shared by all four
; instructions, with index_key:1..3 for elements 1..3 and no modifier printed
; for element 0.
define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16_index_key(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
|
||||
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16_index_key:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: global_load_b32 v10, v[10:11], off
|
||||
; GFX12-NEXT: v_mov_b32_e32 v23, v9
|
||||
; GFX12-NEXT: v_mov_b32_e32 v22, v8
|
||||
; GFX12-NEXT: v_mov_b32_e32 v21, v7
|
||||
; GFX12-NEXT: v_mov_b32_e32 v20, v6
|
||||
; GFX12-NEXT: v_mov_b32_e32 v27, v9
|
||||
; GFX12-NEXT: v_mov_b32_e32 v26, v8
|
||||
; GFX12-NEXT: v_mov_b32_e32 v25, v7
|
||||
; GFX12-NEXT: v_mov_b32_e32 v24, v6
|
||||
; GFX12-NEXT: v_mov_b32_e32 v31, v9
|
||||
; GFX12-NEXT: v_mov_b32_e32 v30, v8
|
||||
; GFX12-NEXT: v_mov_b32_e32 v29, v7
|
||||
; GFX12-NEXT: v_mov_b32_e32 v28, v6
|
||||
; GFX12-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[20:23], v[0:1], v[2:5], v10
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[24:27], v[0:1], v[2:5], v10 index_key:1
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[28:31], v[0:1], v[2:5], v10 index_key:2
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:3
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[20:23], off
|
||||
; GFX12-NEXT: global_store_b128 v[14:15], v[24:27], off
|
||||
; GFX12-NEXT: global_store_b128 v[16:17], v[28:31], off
|
||||
; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
|
||||
%Index0 = extractelement <4 x i8> %IndexVec, i32 0
|
||||
%res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index0)
|
||||
store <4 x float> %res0, ptr addrspace(1) %out0
|
||||
%Index1 = extractelement <4 x i8> %IndexVec, i32 1
|
||||
%res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index1)
|
||||
store <4 x float> %res1, ptr addrspace(1) %out1
|
||||
%Index2 = extractelement <4 x i8> %IndexVec, i32 2
|
||||
%res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index2)
|
||||
store <4 x float> %res2, ptr addrspace(1) %out2
|
||||
%Index3 = extractelement <4 x i8> %IndexVec, i32 3
|
||||
%res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index3)
|
||||
store <4 x float> %res3, ptr addrspace(1) %out3
|
||||
ret void
|
||||
}
|
||||
|
||||
; Index-key selection for v_swmmac_f16_16x16x32_f16, whose accumulator and
; result are <4 x half> (stored with global_store_b64). Each of the four i8
; elements of the vector loaded from %IndexVecPtr drives one call; results go
; to %out0..%out3. The expected code issues one global_load_b32 shared by all
; four instructions, with index_key:1..3 for elements 1..3 and no modifier
; printed for element 0.
define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_index_key(<4 x half> %A, <8 x half> %B, <4 x half> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
|
||||
; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_index_key:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: global_load_b32 v22, v[8:9], off
|
||||
; GFX12-NEXT: v_mov_b32_e32 v9, v7
|
||||
; GFX12-NEXT: v_mov_b32_e32 v8, v6
|
||||
; GFX12-NEXT: v_mov_b32_e32 v19, v7
|
||||
; GFX12-NEXT: v_mov_b32_e32 v18, v6
|
||||
; GFX12-NEXT: v_mov_b32_e32 v21, v7
|
||||
; GFX12-NEXT: v_mov_b32_e32 v20, v6
|
||||
; GFX12-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[8:9], v[0:1], v[2:5], v22
|
||||
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[18:19], v[0:1], v[2:5], v22 index_key:1
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
|
||||
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[20:21], v[0:1], v[2:5], v22 index_key:2
|
||||
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v22 index_key:3
|
||||
; GFX12-NEXT: global_store_b64 v[10:11], v[8:9], off
|
||||
; GFX12-NEXT: global_store_b64 v[12:13], v[18:19], off
|
||||
; GFX12-NEXT: global_store_b64 v[14:15], v[20:21], off
|
||||
; GFX12-NEXT: global_store_b64 v[16:17], v[6:7], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
|
||||
%Index0 = extractelement <4 x i8> %IndexVec, i32 0
|
||||
%res0 = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index0)
|
||||
store <4 x half> %res0, ptr addrspace(1) %out0
|
||||
%Index1 = extractelement <4 x i8> %IndexVec, i32 1
|
||||
%res1 = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index1)
|
||||
store <4 x half> %res1, ptr addrspace(1) %out1
|
||||
%Index2 = extractelement <4 x i8> %IndexVec, i32 2
|
||||
%res2 = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index2)
|
||||
store <4 x half> %res2, ptr addrspace(1) %out2
|
||||
%Index3 = extractelement <4 x i8> %IndexVec, i32 3
|
||||
%res3 = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index3)
|
||||
store <4 x half> %res3, ptr addrspace(1) %out3
|
||||
ret void
|
||||
}
|
||||
|
||||
; Index-key selection for v_swmmac_bf16_16x16x32_bf16, with A, B, C and the
; result all passed as i16 vectors (result stored with global_store_b64). Each
; of the four i8 elements of the vector loaded from %IndexVecPtr drives one
; call; results go to %out0..%out3. The expected code issues one
; global_load_b32 shared by all four instructions, with index_key:1..3 for
; elements 1..3 and no modifier printed for element 0.
define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16_index_key(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
|
||||
; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16_index_key:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: global_load_b32 v22, v[8:9], off
|
||||
; GFX12-NEXT: v_mov_b32_e32 v9, v7
|
||||
; GFX12-NEXT: v_mov_b32_e32 v8, v6
|
||||
; GFX12-NEXT: v_mov_b32_e32 v19, v7
|
||||
; GFX12-NEXT: v_mov_b32_e32 v18, v6
|
||||
; GFX12-NEXT: v_mov_b32_e32 v21, v7
|
||||
; GFX12-NEXT: v_mov_b32_e32 v20, v6
|
||||
; GFX12-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[8:9], v[0:1], v[2:5], v22
|
||||
; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[18:19], v[0:1], v[2:5], v22 index_key:1
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
|
||||
; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[20:21], v[0:1], v[2:5], v22 index_key:2
|
||||
; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v22 index_key:3
|
||||
; GFX12-NEXT: global_store_b64 v[10:11], v[8:9], off
|
||||
; GFX12-NEXT: global_store_b64 v[12:13], v[18:19], off
|
||||
; GFX12-NEXT: global_store_b64 v[14:15], v[20:21], off
|
||||
; GFX12-NEXT: global_store_b64 v[16:17], v[6:7], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
|
||||
%Index0 = extractelement <4 x i8> %IndexVec, i32 0
|
||||
%res0 = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index0)
|
||||
store <4 x i16> %res0, ptr addrspace(1) %out0
|
||||
%Index1 = extractelement <4 x i8> %IndexVec, i32 1
|
||||
%res1 = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index1)
|
||||
store <4 x i16> %res1, ptr addrspace(1) %out1
|
||||
%Index2 = extractelement <4 x i8> %IndexVec, i32 2
|
||||
%res2 = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index2)
|
||||
store <4 x i16> %res2, ptr addrspace(1) %out2
|
||||
%Index3 = extractelement <4 x i8> %IndexVec, i32 3
|
||||
%res3 = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index3)
|
||||
store <4 x i16> %res3, ptr addrspace(1) %out3
|
||||
ret void
|
||||
}
|
||||
|
||||
; Index-key selection for v_swmmac_i32_16x16x32_iu8. All three i1 immediates
; are 0 in every call, so only the index operand varies: each of the four i8
; elements of the vector loaded from %IndexVecPtr drives one call, and the
; results go to %out0..%out3. The expected code issues one global_load_b32
; shared by all four instructions, with index_key:1..3 for elements 1..3 and
; no modifier printed for element 0.
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_index_key(i32 %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_index_key:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: global_load_b32 v7, v[7:8], off
|
||||
; GFX12-NEXT: v_mov_b32_e32 v20, v6
|
||||
; GFX12-NEXT: v_mov_b32_e32 v19, v5
|
||||
; GFX12-NEXT: v_mov_b32_e32 v18, v4
|
||||
; GFX12-NEXT: v_mov_b32_e32 v17, v3
|
||||
; GFX12-NEXT: v_mov_b32_e32 v24, v6
|
||||
; GFX12-NEXT: v_mov_b32_e32 v23, v5
|
||||
; GFX12-NEXT: v_mov_b32_e32 v22, v4
|
||||
; GFX12-NEXT: v_mov_b32_e32 v21, v3
|
||||
; GFX12-NEXT: v_mov_b32_e32 v28, v6
|
||||
; GFX12-NEXT: v_mov_b32_e32 v27, v5
|
||||
; GFX12-NEXT: v_mov_b32_e32 v26, v4
|
||||
; GFX12-NEXT: v_mov_b32_e32 v25, v3
|
||||
; GFX12-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[17:20], v0, v[1:2], v7
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[21:24], v0, v[1:2], v7 index_key:1
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[25:28], v0, v[1:2], v7 index_key:2
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:3
|
||||
; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off
|
||||
; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off
|
||||
; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
|
||||
%Index0 = extractelement <4 x i8> %IndexVec, i32 0
|
||||
%res0 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index0, i1 0)
|
||||
store <4 x i32> %res0, ptr addrspace(1) %out0
|
||||
%Index1 = extractelement <4 x i8> %IndexVec, i32 1
|
||||
%res1 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index1, i1 0)
|
||||
store <4 x i32> %res1, ptr addrspace(1) %out1
|
||||
%Index2 = extractelement <4 x i8> %IndexVec, i32 2
|
||||
%res2 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index2, i1 0)
|
||||
store <4 x i32> %res2, ptr addrspace(1) %out2
|
||||
%Index3 = extractelement <4 x i8> %IndexVec, i32 3
|
||||
%res3 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index3, i1 0)
|
||||
store <4 x i32> %res3, ptr addrspace(1) %out3
|
||||
ret void
|
||||
}
|
||||
|
||||
; Index-key selection for v_swmmac_i32_16x16x32_iu4, whose index operand is
; i16. A <2 x i16> vector is loaded from %IndexVecPtr and each of its two
; elements drives one call (all i1 immediates are 0); results go to %out0 and
; %out1. The expected code issues one global_load_b32 shared by both
; instructions, with index_key:1 for element 1 and no modifier printed for
; element 0.
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_index_key(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_index_key:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: global_load_b32 v6, v[6:7], off
|
||||
; GFX12-NEXT: v_mov_b32_e32 v15, v5
|
||||
; GFX12-NEXT: v_mov_b32_e32 v14, v4
|
||||
; GFX12-NEXT: v_mov_b32_e32 v13, v3
|
||||
; GFX12-NEXT: v_mov_b32_e32 v12, v2
|
||||
; GFX12-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[12:15], v0, v1, v6
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 index_key:1
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[12:15], off
|
||||
; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
|
||||
%Index0 = extractelement <2 x i16> %IndexVec, i32 0
|
||||
%res0 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index0, i1 0)
|
||||
store <4 x i32> %res0, ptr addrspace(1) %out0
|
||||
%Index1 = extractelement <2 x i16> %IndexVec, i32 1
|
||||
%res1 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index1, i1 0)
|
||||
store <4 x i32> %res1, ptr addrspace(1) %out1
|
||||
ret void
|
||||
}
|
||||
|
||||
; Index-key selection for v_swmmac_i32_16x16x64_iu4, whose index operand is
; i16 and whose B operand is <2 x i32>. Each of the two i16 elements of the
; vector loaded from %IndexVecPtr drives one call (all i1 immediates are 0);
; results go to %out0 and %out1. The expected code issues one global_load_b32
; shared by both instructions, with index_key:1 for element 1 and no modifier
; printed for element 0.
define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_index_key(i32 %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_index_key:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: global_load_b32 v7, v[7:8], off
|
||||
; GFX12-NEXT: v_mov_b32_e32 v16, v6
|
||||
; GFX12-NEXT: v_mov_b32_e32 v15, v5
|
||||
; GFX12-NEXT: v_mov_b32_e32 v14, v4
|
||||
; GFX12-NEXT: v_mov_b32_e32 v13, v3
|
||||
; GFX12-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[13:16], v0, v[1:2], v7
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 index_key:1
|
||||
; GFX12-NEXT: global_store_b128 v[9:10], v[13:16], off
|
||||
; GFX12-NEXT: global_store_b128 v[11:12], v[3:6], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
|
||||
%Index0 = extractelement <2 x i16> %IndexVec, i32 0
|
||||
%res0 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index0, i1 0)
|
||||
store <4 x i32> %res0, ptr addrspace(1) %out0
|
||||
%Index1 = extractelement <2 x i16> %IndexVec, i32 1
|
||||
%res1 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index1, i1 0)
|
||||
store <4 x i32> %res1, ptr addrspace(1) %out1
|
||||
ret void
|
||||
}
|
||||
|
||||
; Index-key selection for v_swmmac_f32_16x16x32_fp8_fp8 (A is i32, B is
; <2 x i32>, accumulator is <4 x float>). Each of the four i8 elements of the
; vector loaded from %IndexVecPtr drives one call; results go to %out0..%out3.
; The expected code issues one global_load_b32 shared by all four
; instructions, with index_key:1..3 for elements 1..3 and no modifier printed
; for element 0.
define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
|
||||
; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8_index_key:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: global_load_b32 v7, v[7:8], off
|
||||
; GFX12-NEXT: v_mov_b32_e32 v20, v6
|
||||
; GFX12-NEXT: v_mov_b32_e32 v19, v5
|
||||
; GFX12-NEXT: v_mov_b32_e32 v18, v4
|
||||
; GFX12-NEXT: v_mov_b32_e32 v17, v3
|
||||
; GFX12-NEXT: v_mov_b32_e32 v24, v6
|
||||
; GFX12-NEXT: v_mov_b32_e32 v23, v5
|
||||
; GFX12-NEXT: v_mov_b32_e32 v22, v4
|
||||
; GFX12-NEXT: v_mov_b32_e32 v21, v3
|
||||
; GFX12-NEXT: v_mov_b32_e32 v28, v6
|
||||
; GFX12-NEXT: v_mov_b32_e32 v27, v5
|
||||
; GFX12-NEXT: v_mov_b32_e32 v26, v4
|
||||
; GFX12-NEXT: v_mov_b32_e32 v25, v3
|
||||
; GFX12-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[17:20], v0, v[1:2], v7
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[21:24], v0, v[1:2], v7 index_key:1
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[25:28], v0, v[1:2], v7 index_key:2
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:3
|
||||
; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off
|
||||
; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off
|
||||
; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
|
||||
%Index0 = extractelement <4 x i8> %IndexVec, i32 0
|
||||
%res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index0)
|
||||
store <4 x float> %res0, ptr addrspace(1) %out0
|
||||
%Index1 = extractelement <4 x i8> %IndexVec, i32 1
|
||||
%res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index1)
|
||||
store <4 x float> %res1, ptr addrspace(1) %out1
|
||||
%Index2 = extractelement <4 x i8> %IndexVec, i32 2
|
||||
%res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index2)
|
||||
store <4 x float> %res2, ptr addrspace(1) %out2
|
||||
%Index3 = extractelement <4 x i8> %IndexVec, i32 3
|
||||
%res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index3)
|
||||
store <4 x float> %res3, ptr addrspace(1) %out3
|
||||
ret void
|
||||
}
|
||||
|
||||
; Index-key selection for v_swmmac_f32_16x16x32_fp8_bf8 (A is i32, B is
; <2 x i32>, accumulator is <4 x float>). Each of the four i8 elements of the
; vector loaded from %IndexVecPtr drives one call; results go to %out0..%out3.
; The expected code issues one global_load_b32 shared by all four
; instructions, with index_key:1..3 for elements 1..3 and no modifier printed
; for element 0.
define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
|
||||
; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8_index_key:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: global_load_b32 v7, v[7:8], off
|
||||
; GFX12-NEXT: v_mov_b32_e32 v20, v6
|
||||
; GFX12-NEXT: v_mov_b32_e32 v19, v5
|
||||
; GFX12-NEXT: v_mov_b32_e32 v18, v4
|
||||
; GFX12-NEXT: v_mov_b32_e32 v17, v3
|
||||
; GFX12-NEXT: v_mov_b32_e32 v24, v6
|
||||
; GFX12-NEXT: v_mov_b32_e32 v23, v5
|
||||
; GFX12-NEXT: v_mov_b32_e32 v22, v4
|
||||
; GFX12-NEXT: v_mov_b32_e32 v21, v3
|
||||
; GFX12-NEXT: v_mov_b32_e32 v28, v6
|
||||
; GFX12-NEXT: v_mov_b32_e32 v27, v5
|
||||
; GFX12-NEXT: v_mov_b32_e32 v26, v4
|
||||
; GFX12-NEXT: v_mov_b32_e32 v25, v3
|
||||
; GFX12-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[17:20], v0, v[1:2], v7
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[21:24], v0, v[1:2], v7 index_key:1
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[25:28], v0, v[1:2], v7 index_key:2
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:3
|
||||
; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off
|
||||
; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off
|
||||
; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
|
||||
%Index0 = extractelement <4 x i8> %IndexVec, i32 0
|
||||
%res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index0)
|
||||
store <4 x float> %res0, ptr addrspace(1) %out0
|
||||
%Index1 = extractelement <4 x i8> %IndexVec, i32 1
|
||||
%res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index1)
|
||||
store <4 x float> %res1, ptr addrspace(1) %out1
|
||||
%Index2 = extractelement <4 x i8> %IndexVec, i32 2
|
||||
%res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index2)
|
||||
store <4 x float> %res2, ptr addrspace(1) %out2
|
||||
%Index3 = extractelement <4 x i8> %IndexVec, i32 3
|
||||
%res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index3)
|
||||
store <4 x float> %res3, ptr addrspace(1) %out3
|
||||
ret void
|
||||
}
|
||||
|
||||
; Index-key selection for v_swmmac_f32_16x16x32_bf8_fp8 (A is i32, B is
; <2 x i32>, accumulator is <4 x float>). Each of the four i8 elements of the
; vector loaded from %IndexVecPtr drives one call; results go to %out0..%out3.
; The expected code issues one global_load_b32 shared by all four
; instructions, with index_key:1..3 for elements 1..3 and no modifier printed
; for element 0.
define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
|
||||
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8_index_key:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: global_load_b32 v7, v[7:8], off
|
||||
; GFX12-NEXT: v_mov_b32_e32 v20, v6
|
||||
; GFX12-NEXT: v_mov_b32_e32 v19, v5
|
||||
; GFX12-NEXT: v_mov_b32_e32 v18, v4
|
||||
; GFX12-NEXT: v_mov_b32_e32 v17, v3
|
||||
; GFX12-NEXT: v_mov_b32_e32 v24, v6
|
||||
; GFX12-NEXT: v_mov_b32_e32 v23, v5
|
||||
; GFX12-NEXT: v_mov_b32_e32 v22, v4
|
||||
; GFX12-NEXT: v_mov_b32_e32 v21, v3
|
||||
; GFX12-NEXT: v_mov_b32_e32 v28, v6
|
||||
; GFX12-NEXT: v_mov_b32_e32 v27, v5
|
||||
; GFX12-NEXT: v_mov_b32_e32 v26, v4
|
||||
; GFX12-NEXT: v_mov_b32_e32 v25, v3
|
||||
; GFX12-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[17:20], v0, v[1:2], v7
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[21:24], v0, v[1:2], v7 index_key:1
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[25:28], v0, v[1:2], v7 index_key:2
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:3
|
||||
; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off
|
||||
; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off
|
||||
; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
|
||||
%Index0 = extractelement <4 x i8> %IndexVec, i32 0
|
||||
%res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index0)
|
||||
store <4 x float> %res0, ptr addrspace(1) %out0
|
||||
%Index1 = extractelement <4 x i8> %IndexVec, i32 1
|
||||
%res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index1)
|
||||
store <4 x float> %res1, ptr addrspace(1) %out1
|
||||
%Index2 = extractelement <4 x i8> %IndexVec, i32 2
|
||||
%res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index2)
|
||||
store <4 x float> %res2, ptr addrspace(1) %out2
|
||||
%Index3 = extractelement <4 x i8> %IndexVec, i32 3
|
||||
%res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index3)
|
||||
store <4 x float> %res3, ptr addrspace(1) %out3
|
||||
ret void
|
||||
}
|
||||
|
||||
; Index-key selection for v_swmmac_f32_16x16x32_bf8_bf8 (A is i32, B is
; <2 x i32>, accumulator is <4 x float>). Each of the four i8 elements of the
; vector loaded from %IndexVecPtr drives one call; results go to %out0..%out3.
; The expected code issues one global_load_b32 shared by all four
; instructions, with index_key:1..3 for elements 1..3 and no modifier printed
; for element 0.
define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
|
||||
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8_index_key:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: global_load_b32 v7, v[7:8], off
|
||||
; GFX12-NEXT: v_mov_b32_e32 v20, v6
|
||||
; GFX12-NEXT: v_mov_b32_e32 v19, v5
|
||||
; GFX12-NEXT: v_mov_b32_e32 v18, v4
|
||||
; GFX12-NEXT: v_mov_b32_e32 v17, v3
|
||||
; GFX12-NEXT: v_mov_b32_e32 v24, v6
|
||||
; GFX12-NEXT: v_mov_b32_e32 v23, v5
|
||||
; GFX12-NEXT: v_mov_b32_e32 v22, v4
|
||||
; GFX12-NEXT: v_mov_b32_e32 v21, v3
|
||||
; GFX12-NEXT: v_mov_b32_e32 v28, v6
|
||||
; GFX12-NEXT: v_mov_b32_e32 v27, v5
|
||||
; GFX12-NEXT: v_mov_b32_e32 v26, v4
|
||||
; GFX12-NEXT: v_mov_b32_e32 v25, v3
|
||||
; GFX12-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[17:20], v0, v[1:2], v7
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[21:24], v0, v[1:2], v7 index_key:1
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[25:28], v0, v[1:2], v7 index_key:2
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:3
|
||||
; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off
|
||||
; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off
|
||||
; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
|
||||
%Index0 = extractelement <4 x i8> %IndexVec, i32 0
|
||||
%res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index0)
|
||||
store <4 x float> %res0, ptr addrspace(1) %out0
|
||||
%Index1 = extractelement <4 x i8> %IndexVec, i32 1
|
||||
%res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index1)
|
||||
store <4 x float> %res1, ptr addrspace(1) %out1
|
||||
%Index2 = extractelement <4 x i8> %IndexVec, i32 2
|
||||
%res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index2)
|
||||
store <4 x float> %res2, ptr addrspace(1) %out2
|
||||
%Index3 = extractelement <4 x i8> %IndexVec, i32 3
|
||||
%res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index3)
|
||||
store <4 x float> %res3, ptr addrspace(1) %out3
|
||||
ret void
|
||||
}
|
||||
|
||||
; Declarations of the SWMMAC intrinsics called by the index_key tests in this
; file. The floating-point and fp8/bf8 variants take (A, B, C, index) with an
; i8 index; the integer iu8/iu4 variants wrap A and B with i1 immarg operands,
; add a trailing i1 immarg, and use an i8 (iu8) or i16 (iu4) index operand.
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half>, <8 x half>, <4 x float>, i8)
|
||||
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16>, <8 x i16>, <4 x float>, i8)
|
||||
declare <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half>, <8 x half>, <4 x half>, i8)
|
||||
declare <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16>, <8 x i16>, <4 x i16>, i8)
|
||||
declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i8 %Index, i1 immarg)
|
||||
declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i16 %Index, i1 immarg)
|
||||
declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i16 %Index, i1 immarg)
|
||||
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8)
|
||||
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8)
|
||||
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8)
|
||||
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8)
|
333
llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64.ll
Normal file
333
llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64.ll
Normal file
@ -0,0 +1,333 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
|
||||
|
||||
; Calls the wmma.f32.16x16x16.f16 intrinsic with <4 x half> %A and %B and
; <4 x float> %C, then stores the <4 x float> result to %out.
; The autogenerated checks expect a single v_wmma_f32_16x16x16_f16 whose
; destination and third source are both v[4:7], followed by a 128-bit store.
define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_f16:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7]
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x float>@llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %C)
|
||||
store <4 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Calls the wmma.f32.16x16x16.bf16 intrinsic with <4 x i16> %A and %B and
; <4 x float> %C, then stores the <4 x float> result to %out.
; The autogenerated checks expect a single v_wmma_f32_16x16x16_bf16 whose
; destination and third source are both v[4:7], followed by a 128-bit store.
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_bf16:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7]
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> %C)
|
||||
store <4 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Calls the wmma.f16.16x16x16.f16 intrinsic with <4 x half> %A, %B and %C and
; a trailing i1 immediate of 0, then stores the <4 x half> result to %out.
; The autogenerated checks expect a single v_wmma_f16_16x16x16_f16 whose
; destination and third source are both v[4:5], followed by a 64-bit store.
define amdgpu_ps void @test_wmma_f16_16x16x16_f16(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f16_16x16x16_f16:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5]
|
||||
; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half> %A, <4 x half> %B, <4 x half> %C, i1 0)
|
||||
store <4 x half> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Calls the wmma.bf16.16x16x16.bf16 intrinsic with <4 x i16> %A, %B and %C and
; a trailing i1 immediate of 0, then stores the <4 x i16> result to %out.
; The autogenerated checks expect a single v_wmma_bf16_16x16x16_bf16 whose
; destination and third source are both v[4:5], followed by a 64-bit store.
define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5]
|
||||
; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, i1 0)
|
||||
store <4 x i16> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Calls the wmma.i32.16x16x16.iu8 intrinsic with i32 %A and %B, <4 x i32> %C
; and all three i1 immediate operands set to 0, then stores the <4 x i32>
; result to %out.
; The autogenerated checks expect a single v_wmma_i32_16x16x16_iu8 without
; any modifiers, whose destination and third source are both v[2:5].
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x16_iu8:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5]
|
||||
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Calls the wmma.i32.16x16x16.iu4 intrinsic with i32 %A and %B, <4 x i32> %C
; and all three i1 immediate operands set to 0, then stores the <4 x i32>
; result to %out.
; The autogenerated checks expect a single v_wmma_i32_16x16x16_iu4 without
; any modifiers, whose destination and third source are both v[2:5].
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x16_iu4:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5]
|
||||
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Calls the wmma.f32.16x16x16.fp8.fp8 intrinsic with i32 %A and %B and
; <4 x float> %C, then stores the <4 x float> result to %out.
; The autogenerated checks expect a single v_wmma_f32_16x16x16_fp8_fp8 whose
; destination and third source are both v[2:5], followed by a 128-bit store.
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5]
|
||||
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> %C)
|
||||
store <4 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Calls the wmma.f32.16x16x16.bf8.fp8 intrinsic with i32 %A and %B and
; <4 x float> %C, then stores the <4 x float> result to %out.
; The autogenerated checks expect a single v_wmma_f32_16x16x16_bf8_fp8 whose
; destination and third source are both v[2:5], followed by a 128-bit store.
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5]
|
||||
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> %C)
|
||||
store <4 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Calls the wmma.f32.16x16x16.fp8.bf8 intrinsic with i32 %A and %B and
; <4 x float> %C, then stores the <4 x float> result to %out.
; The autogenerated checks expect a single v_wmma_f32_16x16x16_fp8_bf8 whose
; destination and third source are both v[2:5], followed by a 128-bit store.
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5]
|
||||
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> %C)
|
||||
store <4 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Calls the wmma.f32.16x16x16.bf8.bf8 intrinsic with i32 %A and %B and
; <4 x float> %C, then stores the <4 x float> result to %out.
; The autogenerated checks expect a single v_wmma_f32_16x16x16_bf8_bf8 whose
; destination and third source are both v[2:5], followed by a 128-bit store.
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5]
|
||||
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> %C)
|
||||
store <4 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Calls the wmma.i32.16x16x32.iu4 intrinsic with i32 %A and %B, <4 x i32> %C
; and all three i1 immediate operands set to 0, then stores the <4 x i32>
; result to %out.
; The autogenerated checks expect a single v_wmma_i32_16x16x32_iu4 without
; any modifiers, whose destination and third source are both v[2:5].
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x32_iu4:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5]
|
||||
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Calls the swmmac.f32.16x16x32.f16 intrinsic with <4 x half> %A,
; <8 x half> %B, <4 x float> %C and an i8 %Index, then stores the
; <4 x float> result to %out.
; The autogenerated checks expect a single v_swmmac_f32_16x16x32_f16 with
; destination v[6:9] and sources v[0:1], v[2:5], v10, then a 128-bit store.
define amdgpu_ps void @test_swmmac_f32_16x16x32_f16(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_f32_16x16x32_f16:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10
|
||||
; GFX12-NEXT: global_store_b128 v[11:12], v[6:9], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index)
|
||||
store <4 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Calls the swmmac.f32.16x16x32.bf16 intrinsic with <4 x i16> %A,
; <8 x i16> %B, <4 x float> %C and an i8 %Index, then stores the
; <4 x float> result to %out.
; The autogenerated checks expect a single v_swmmac_f32_16x16x32_bf16 with
; destination v[6:9] and sources v[0:1], v[2:5], v10, then a 128-bit store.
define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10
|
||||
; GFX12-NEXT: global_store_b128 v[11:12], v[6:9], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index)
|
||||
store <4 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Calls the swmmac.f16.16x16x32.f16 intrinsic with <4 x half> %A,
; <8 x half> %B, <4 x half> %C and an i8 %Index, then stores the
; <4 x half> result to %out.
; The autogenerated checks expect a single v_swmmac_f16_16x16x32_f16 with
; destination v[6:7] and sources v[0:1], v[2:5], v8, then a 64-bit store.
define amdgpu_ps void @test_swmmac_f16_16x16x32_f16(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_f16_16x16x32_f16:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8
|
||||
; GFX12-NEXT: global_store_b64 v[9:10], v[6:7], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index)
|
||||
store <4 x half> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Calls the swmmac.bf16.16x16x32.bf16 intrinsic with <4 x i16> %A,
; <8 x i16> %B, <4 x i16> %C and an i8 %Index, then stores the
; <4 x i16> result to %out.
; The autogenerated checks expect a single v_swmmac_bf16_16x16x32_bf16 with
; destination v[6:7] and sources v[0:1], v[2:5], v8, then a 64-bit store.
define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8
|
||||
; GFX12-NEXT: global_store_b64 v[9:10], v[6:7], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index)
|
||||
store <4 x i16> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Calls the swmmac.i32.16x16x32.iu8 intrinsic with i32 %A, <2 x i32> %B,
; <4 x i32> %C, an i8 %Index and all three i1 immediate operands set to 0,
; then stores the <4 x i32> result to %out.
; The autogenerated checks expect a single v_swmmac_i32_16x16x32_iu8 with
; destination v[3:6] and sources v0, v[1:2], v7, then a 128-bit store.
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 0)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Calls the swmmac.i32.16x16x32.iu4 intrinsic with i32 %A and %B,
; <4 x i32> %C, an i16 %Index and all three i1 immediate operands set to 0,
; then stores the <4 x i32> result to %out.
; The autogenerated checks expect a single v_swmmac_i32_16x16x32_iu4 with
; destination v[2:5] and sources v0, v1, v6, then a 128-bit store.
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6
|
||||
; GFX12-NEXT: global_store_b128 v[7:8], v[2:5], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index, i1 0)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Calls the swmmac.i32.16x16x64.iu4 intrinsic with i32 %A, <2 x i32> %B,
; <4 x i32> %C, an i16 %Index and all three i1 immediate operands set to 0,
; then stores the <4 x i32> result to %out.
; The autogenerated checks expect a single v_swmmac_i32_16x16x64_iu4 with
; destination v[3:6] and sources v0, v[1:2], v7, then a 128-bit store.
define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 0)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Calls the swmmac.f32.16x16x32.fp8.fp8 intrinsic with i32 %A, <2 x i32> %B,
; <4 x float> %C and an i8 %Index, then stores the <4 x float> result to %out.
; The autogenerated checks expect a single v_swmmac_f32_16x16x32_fp8_fp8 with
; destination v[3:6] and sources v0, v[1:2], v7, then a 128-bit store.
define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index)
|
||||
store <4 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Calls the swmmac.f32.16x16x32.fp8.bf8 intrinsic with i32 %A, <2 x i32> %B,
; <4 x float> %C and an i8 %Index, then stores the <4 x float> result to %out.
; The autogenerated checks expect a single v_swmmac_f32_16x16x32_fp8_bf8 with
; destination v[3:6] and sources v0, v[1:2], v7, then a 128-bit store.
define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index)
|
||||
store <4 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Calls the swmmac.f32.16x16x32.bf8.fp8 intrinsic with i32 %A, <2 x i32> %B,
; <4 x float> %C and an i8 %Index, then stores the <4 x float> result to %out.
; The autogenerated checks expect a single v_swmmac_f32_16x16x32_bf8_fp8 with
; destination v[3:6] and sources v0, v[1:2], v7, then a 128-bit store.
define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index)
|
||||
store <4 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Calls the swmmac.f32.16x16x32.bf8.bf8 intrinsic with i32 %A, <2 x i32> %B,
; <4 x float> %C and an i8 %Index, then stores the <4 x float> result to %out.
; The autogenerated checks expect a single v_swmmac_f32_16x16x32_bf8_bf8 with
; destination v[3:6] and sources v0, v[1:2], v7, then a 128-bit store.
define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
|
||||
; GFX12-NEXT: s_nop 0
|
||||
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX12-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index)
|
||||
store <4 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Declarations of the llvm.amdgcn.wmma.* and llvm.amdgcn.swmmac.* intrinsics
; called by the test functions in this file. The mangled suffix of each name
; spells out the result type and the overloaded operand types. Operands
; marked immarg are i1 values; every call in this file passes them as 0.
; The swmmac variants take an extra i8 or i16 index operand after the third
; data operand.
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half>, <4 x half>, <4 x float>)
|
||||
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16>, <4 x i16>, <4 x float>)
|
||||
declare <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half>, <4 x half>, <4 x half>, i1 immarg)
|
||||
declare <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16>, <4 x i16>, <4 x i16>, i1 immarg)
|
||||
declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg)
|
||||
declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg)
|
||||
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>)
|
||||
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>)
|
||||
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>)
|
||||
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>)
|
||||
declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg)
|
||||
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half>, <8 x half>, <4 x float>, i8)
|
||||
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16>, <8 x i16>, <4 x float>, i8)
|
||||
declare <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half>, <8 x half>, <4 x half>, i8)
|
||||
declare <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16>, <8 x i16>, <4 x i16>, i8)
|
||||
declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i8 %Index, i1 immarg)
|
||||
declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i16 %Index, i1 immarg)
|
||||
declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i16 %Index, i1 immarg)
|
||||
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8)
|
||||
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8)
|
||||
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8)
|
||||
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8)
|
354
llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx12-w32.mir
Normal file
354
llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx12-w32.mir
Normal file
@ -0,0 +1,354 @@
|
||||
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
|
||||
# RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=GFX12 %s
|
||||
|
||||
# D0 overlaps A1, B1, C1 or Index1. Overlap starts at vgpr0.
|
||||
# $D0 = wmma0 $A0, $B0, $C0 or $D0 = swmmac0 $A0, $B0, $C0, $Index0
|
||||
# $D1 = wmma1 $A1, $B1, $C1 or $D1 = swmmac1 $A1, $B1, $C1, $Index1
|
||||
|
||||
# The first WMMA writes D0 = $vgpr0..$vgpr7; the second WMMA reads
# $vgpr0_vgpr1_vgpr2_vgpr3 as its A operand (first source register).
# The GFX12 check lines expect a V_NOP_e32 between the two instructions.
---
|
||||
name: test_wmma_f32_16x16x16_f16_D0_overlaps_A1
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45
|
||||
|
||||
; GFX12-LABEL: name: test_wmma_f32_16x16x16_f16_D0_overlaps_A1
|
||||
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
; GFX12-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35, 8, killed $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35, 8, killed $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, implicit $exec
|
||||
...
|
||||
|
||||
# The first WMMA writes D0 = $vgpr0..$vgpr7; the second (BF16) WMMA reads
# $vgpr0_vgpr1_vgpr2_vgpr3 as its B operand (second source register).
# The GFX12 check lines expect a V_NOP_e32 between the two instructions.
---
|
||||
name: test_wmma_f32_16x16x16_bf16_D0_overlaps_B1
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45
|
||||
|
||||
; GFX12-LABEL: name: test_wmma_f32_16x16x16_bf16_D0_overlaps_B1
|
||||
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
; GFX12-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43 = V_WMMA_F32_16X16X16_BF16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43 = V_WMMA_F32_16X16X16_BF16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, implicit $exec
|
||||
...
|
||||
|
||||
# The first WMMA writes D0 = $vgpr0..$vgpr7; the second (F16 result) WMMA
# uses $vgpr0_vgpr1_vgpr2_vgpr3 as its C operand and as its destination.
# The GFX12 check lines expect a V_NOP_e32 between the two instructions.
---
|
||||
name: test_wmma_f16_16x16x16_f16_D0_overlaps_C1
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
|
||||
|
||||
; GFX12-LABEL: name: test_wmma_f16_16x16x16_f16_D0_overlaps_C1
|
||||
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
; GFX12-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F16_16X16X16_F16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F16_16X16X16_F16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
...
|
||||
|
||||
# The first WMMA writes D0 = $vgpr0..$vgpr7; the second (BF16 result) WMMA
# reads $vgpr0_vgpr1_vgpr2_vgpr3 as its A operand (first source register).
# The GFX12 check lines expect a V_NOP_e32 between the two instructions.
---
|
||||
name: test_wmma_bf16_16x16x16_bf16_D0_overlaps_A1
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
|
||||
|
||||
; GFX12-LABEL: name: test_wmma_bf16_16x16x16_bf16_D0_overlaps_A1
|
||||
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
; GFX12-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_BF16_16X16X16_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35, 8, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_BF16_16X16X16_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35, 8, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec
|
||||
...
|
||||
# The first WMMA writes D0 = $vgpr0..$vgpr7; the second (IU8) WMMA reads
# $vgpr0_vgpr1 as its B operand (second source register).
# The GFX12 check lines expect a V_NOP_e32 between the two instructions.
---
|
||||
name: test_wmma_i32_16x16x16_iu8_D0_overlaps_B1
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
|
||||
|
||||
; GFX12-LABEL: name: test_wmma_i32_16x16x16_iu8_D0_overlaps_B1
|
||||
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
; GFX12-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_I32_16X16X16_IU8_w32_twoaddr 8, killed $vgpr28_vgpr29, 8, killed $vgpr0_vgpr1, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_I32_16X16X16_IU8_w32_twoaddr 8, killed $vgpr28_vgpr29, 8, killed $vgpr0_vgpr1, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec
|
||||
...
|
||||
|
||||
# The first WMMA writes D0 = $vgpr0..$vgpr7; the second (IU4) WMMA uses the
# same $vgpr0..$vgpr7 tuple as its C operand and as its destination.
# The GFX12 check lines expect a V_NOP_e32 between the two instructions.
---
|
||||
name: test_wmma_i32_16x16x16_iu4_D0_overlaps_C1
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39
|
||||
|
||||
; GFX12-LABEL: name: test_wmma_i32_16x16x16_iu4_D0_overlaps_C1
|
||||
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
; GFX12-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_I32_16X16X16_IU4_w32_twoaddr 8, killed $vgpr28, 8, killed $vgpr29, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_I32_16X16X16_IU4_w32_twoaddr 8, killed $vgpr28, 8, killed $vgpr29, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
...
|
||||
|
||||
# The first WMMA writes D0 = $vgpr0..$vgpr7; the second (FP8/FP8) WMMA reads
# $vgpr0_vgpr1 as its A operand (first source register).
# The GFX12 check lines expect a V_NOP_e32 between the two instructions.
---
|
||||
name: test_wmma_f32_16x16x16_fp8_fp8_D0_overlaps_A1
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
|
||||
|
||||
; GFX12-LABEL: name: test_wmma_f32_16x16x16_fp8_fp8_D0_overlaps_A1
|
||||
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
; GFX12-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X16_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1, killed $vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X16_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1, killed $vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec
|
||||
...
|
||||
|
||||
# The first WMMA writes D0 = $vgpr0..$vgpr7; the second (BF8/FP8) WMMA reads
# $vgpr0_vgpr1 as its B operand (second source register).
# The GFX12 check lines expect a V_NOP_e32 between the two instructions.
---
|
||||
name: test_wmma_f32_16x16x16_bf8_fp8_D0_overlaps_B1
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
|
||||
|
||||
; GFX12-LABEL: name: test_wmma_f32_16x16x16_bf8_fp8_D0_overlaps_B1
|
||||
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
; GFX12-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X16_BF8_FP8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr0_vgpr1, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X16_BF8_FP8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr0_vgpr1, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec
|
||||
...
|
||||
|
||||
# The first WMMA writes D0 = $vgpr0..$vgpr7; the second (FP8/BF8) WMMA uses
# the same $vgpr0..$vgpr7 tuple as its C operand and as its destination.
# The GFX12 check lines expect a V_NOP_e32 between the two instructions.
---
|
||||
name: test_wmma_f32_16x16x16_fp8_bf8_D0_overlaps_C1
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
|
||||
|
||||
; GFX12-LABEL: name: test_wmma_f32_16x16x16_fp8_bf8_D0_overlaps_C1
|
||||
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
; GFX12-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_FP8_BF8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr30_vgpr31, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_FP8_BF8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr30_vgpr31, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
...
|
||||
|
||||
# The first WMMA writes D0 = $vgpr0..$vgpr7; the second (BF8/BF8) WMMA reads
# $vgpr0_vgpr1 as its A operand (first source register).
# The GFX12 check lines expect a V_NOP_e32 between the two instructions.
---
|
||||
name: test_wmma_f32_16x16x16_bf8_bf8_D0_overlaps_A1
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
|
||||
|
||||
; GFX12-LABEL: name: test_wmma_f32_16x16x16_bf8_bf8_D0_overlaps_A1
|
||||
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
; GFX12-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X16_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1, killed $vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X16_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1, killed $vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec
|
||||
...
|
||||
|
||||
# The first WMMA writes D0 = $vgpr0..$vgpr7; the second (16x16x32 IU4) WMMA
# reads $vgpr0_vgpr1 as its B operand (second source register).
# The GFX12 check lines expect a V_NOP_e32 between the two instructions.
---
|
||||
name: test_wmma_i32_16x16x32_iu4_D0_overlaps_B1
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
|
||||
|
||||
; GFX12-LABEL: name: test_wmma_i32_16x16x32_iu4_D0_overlaps_B1
|
||||
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
; GFX12-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_I32_16X16X32_IU4_w32_twoaddr 8, killed $vgpr28_vgpr29, 8, killed $vgpr0_vgpr1, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_I32_16X16X32_IU4_w32_twoaddr 8, killed $vgpr28_vgpr29, 8, killed $vgpr0_vgpr1, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec
|
||||
...
|
||||
|
||||
# The first WMMA writes D0 = $vgpr0..$vgpr7; the following SWMMAC uses the
# same $vgpr0..$vgpr7 tuple as its C operand and as its destination.
# Here the GFX12 check lines expect the two instructions back to back,
# with no V_NOP_e32 between them.
---
|
||||
name: test_swmmac_f32_16x16x32_f16_D0_overlaps_C1
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr48, $vgpr49, $vgpr50
|
||||
|
||||
; GFX12-LABEL: name: test_swmmac_f32_16x16x32_f16_D0_overlaps_C1
|
||||
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr48, $vgpr49, $vgpr50
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_SWMMAC_F32_16X16X32_F16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr44, 0, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_SWMMAC_F32_16X16X32_F16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr44, 0, 0, 0, implicit $exec
|
||||
...
|
||||
|
||||
# The first WMMA writes D0 = $vgpr0..$vgpr7; the following SWMMAC reads
# $vgpr0 as its Index operand (the register after the C tuple).
# The GFX12 check lines expect a V_NOP_e32 between the two instructions.
---
|
||||
name: test_swmmac_f32_16x16x32_bf16_D0_overlaps_Index1
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr48, $vgpr49, $vgpr50
|
||||
|
||||
; GFX12-LABEL: name: test_swmmac_f32_16x16x32_bf16_D0_overlaps_Index1
|
||||
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr48, $vgpr49, $vgpr50
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
; GFX12-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47 = V_SWMMAC_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr0, 0, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47 = V_SWMMAC_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr0, 0, 0, 0, implicit $exec
|
||||
...
|
||||
|
||||
# The first WMMA writes D0 = $vgpr0..$vgpr7; the following SWMMAC reads
# $vgpr0_vgpr1_vgpr2_vgpr3 as its A operand (first source register).
# The GFX12 check lines expect a V_NOP_e32 between the two instructions.
---
|
||||
name: test_swmmac_f16_16x16x32_f16_D0_overlaps_A1
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46
|
||||
|
||||
; GFX12-LABEL: name: test_swmmac_f16_16x16x32_f16_D0_overlaps_A1
|
||||
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
; GFX12-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr40_vgpr41_vgpr42_vgpr43 = V_SWMMAC_F16_16X16X32_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43, killed $vgpr44, 0, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr40_vgpr41_vgpr42_vgpr43 = V_SWMMAC_F16_16X16X32_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43, killed $vgpr44, 0, 0, 0, implicit $exec
|
||||
...
|
||||
|
||||
# The first WMMA writes D0 = $vgpr0..$vgpr7; the following SWMMAC reads the
# same $vgpr0..$vgpr7 tuple as its B operand (second source register).
# The GFX12 check lines expect a V_NOP_e32 between the two instructions.
---
|
||||
name: test_swmmac_bf16_16x16x32_bf16_D0_overlaps_B1
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46
|
||||
|
||||
; GFX12-LABEL: name: test_swmmac_bf16_16x16x32_bf16_D0_overlaps_B1
|
||||
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
; GFX12-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr40_vgpr41_vgpr42_vgpr43 = V_SWMMAC_BF16_16X16X32_BF16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr40_vgpr41_vgpr42_vgpr43, killed $vgpr44, 0, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr40_vgpr41_vgpr42_vgpr43 = V_SWMMAC_BF16_16X16X32_BF16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr40_vgpr41_vgpr42_vgpr43, killed $vgpr44, 0, 0, 0, implicit $exec
|
||||
...
|
||||
|
||||
---
|
||||
name: test_swmmac_i32_16x16x32_iu8_D0_overlaps_C1
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44
|
||||
|
||||
; GFX12-LABEL: name: test_swmmac_i32_16x16x32_iu8_D0_overlaps_C1
|
||||
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_SWMMAC_I32_16X16X32_IU8_w32_twoaddr 8, killed $vgpr28_vgpr29, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr42, 0, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_SWMMAC_I32_16X16X32_IU8_w32_twoaddr 8, killed $vgpr28_vgpr29, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr42, 0, 0, 0, implicit $exec
|
||||
...
|
||||
|
||||
---
|
||||
name: test_swmmac_i32_16x16x32_iu4_D0_overlaps_Index1
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
|
||||
|
||||
; GFX12-LABEL: name: test_swmmac_i32_16x16x32_iu4_D0_overlaps_Index1
|
||||
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
; GFX12-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38 = V_SWMMAC_I32_16X16X32_IU4_w32_twoaddr 8, killed $vgpr28, 8, killed $vgpr28_vgpr29, killed $vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38, killed $vgpr0, 0, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38 = V_SWMMAC_I32_16X16X32_IU4_w32_twoaddr 8, killed $vgpr28, 8, killed $vgpr28_vgpr29, killed $vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38, killed $vgpr0, 0, 0, 0, implicit $exec
|
||||
...
|
||||
|
||||
---
|
||||
name: test_swmmac_i32_16x16x64_iu4_D0_overlaps_A1
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44
|
||||
|
||||
; GFX12-LABEL: name: test_swmmac_i32_16x16x64_iu4_D0_overlaps_A1
|
||||
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
; GFX12-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_I32_16X16X64_IU4_w32_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr42, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_I32_16X16X64_IU4_w32_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr42, 0, 0, implicit $exec
|
||||
...
|
||||
|
||||
---
|
||||
name: test_swmmac_f32_16x16x32_fp8_fp8_D0_overlaps_B1
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44
|
||||
|
||||
; GFX12-LABEL: name: test_swmmac_f32_16x16x32_fp8_fp8_D0_overlaps_B1
|
||||
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
; GFX12-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_F32_16X16X32_FP8_FP8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr42, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_F32_16X16X32_FP8_FP8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr42, 0, implicit $exec
|
||||
...
|
||||
|
||||
---
|
||||
name: test_swmmac_f32_16x16x32_fp8_bf8_D0_overlaps_C1
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44
|
||||
|
||||
; GFX12-LABEL: name: test_swmmac_f32_16x16x32_fp8_bf8_D0_overlaps_C1
|
||||
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_SWMMAC_F32_16X16X32_FP8_BF8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr42, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_SWMMAC_F32_16X16X32_FP8_BF8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr42, 0, implicit $exec
|
||||
...
|
||||
|
||||
---
|
||||
name: test_swmmac_f32_16x16x32_bf8_fp8_D0_overlaps_Index1
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44
|
||||
|
||||
; GFX12-LABEL: name: test_swmmac_f32_16x16x32_bf8_fp8_D0_overlaps_Index1
|
||||
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
; GFX12-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_F32_16X16X32_BF8_FP8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_F32_16X16X32_BF8_FP8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr0, 0, implicit $exec
|
||||
...
|
||||
|
||||
---
|
||||
name: test_swmmac_f32_16x16x32_bf8_bf8_D0_overlaps_A1
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44
|
||||
|
||||
; GFX12-LABEL: name: test_swmmac_f32_16x16x32_bf8_bf8_D0_overlaps_A1
|
||||
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
; GFX12-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_F32_16X16X32_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr42, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_F32_16X16X32_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr42, 0, implicit $exec
|
||||
...
|
355
llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx12-w64.mir
Normal file
355
llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx12-w64.mir
Normal file
@ -0,0 +1,355 @@
|
||||
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
|
||||
# RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=-wavefrontsize32,+wavefrontsize64 -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=GFX12 %s
|
||||
|
||||
# D0 overlaps A1, B1, C1 or Index1. Overlap starts at vgpr0.
|
||||
# $D0 = wmma0 $A0, $B0, $C0 or $D0 = swmmac0 $A0, $B0, $C0, $Index0
|
||||
# $D1 = wmma1 $A1, $B1, $C1 or $D1 = swmmac1 $A1, $B1, $C1, $Index1
|
||||
|
||||
---
|
||||
name: test_wmma_f32_16x16x16_f16_D0_overlaps_A1
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
|
||||
|
||||
; GFX12-LABEL: name: test_wmma_f32_16x16x16_f16_D0_overlaps_A1
|
||||
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
; GFX12-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr18_vgpr19, 8, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr18_vgpr19, 8, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
|
||||
...
|
||||
|
||||
---
|
||||
name: test_wmma_f32_16x16x16_bf16_D0_overlaps_B1
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
|
||||
|
||||
; GFX12-LABEL: name: test_wmma_f32_16x16x16_bf16_D0_overlaps_B1
|
||||
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
; GFX12-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_BF16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr0_vgpr1, 8, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_BF16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr0_vgpr1, 8, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
|
||||
...
|
||||
|
||||
---
|
||||
name: test_wmma_f16_16x16x16_f16_D0_overlaps_C1
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
|
||||
|
||||
; GFX12-LABEL: name: test_wmma_f16_16x16x16_f16_D0_overlaps_C1
|
||||
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
; GFX12-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1 = V_WMMA_F16_16X16X16_F16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr18_vgpr19, 8, killed $vgpr0_vgpr1, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1 = V_WMMA_F16_16X16X16_F16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr18_vgpr19, 8, killed $vgpr0_vgpr1, 0, 0, implicit $exec
|
||||
...
|
||||
|
||||
---
|
||||
name: test_wmma_bf16_16x16x16_bf16_D0_overlaps_A1
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
|
||||
|
||||
; GFX12-LABEL: name: test_wmma_bf16_16x16x16_bf16_D0_overlaps_A1
|
||||
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
; GFX12-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr20_vgpr21 = V_WMMA_BF16_16X16X16_BF16_w64_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr18_vgpr19, 8, killed $vgpr20_vgpr21, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr20_vgpr21 = V_WMMA_BF16_16X16X16_BF16_w64_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr18_vgpr19, 8, killed $vgpr20_vgpr21, 0, 0, implicit $exec
|
||||
...
|
||||
|
||||
---
|
||||
name: test_wmma_i32_16x16x16_iu8_D0_overlaps_B1
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
|
||||
|
||||
; GFX12-LABEL: name: test_wmma_i32_16x16x16_iu8_D0_overlaps_B1
|
||||
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
; GFX12-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_I32_16X16X16_IU8_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr0, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_I32_16X16X16_IU8_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr0, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec
|
||||
...
|
||||
|
||||
---
|
||||
name: test_wmma_i32_16x16x16_iu4_D0_overlaps_C1
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
|
||||
|
||||
; GFX12-LABEL: name: test_wmma_i32_16x16x16_iu4_D0_overlaps_C1
|
||||
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
; GFX12-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_I32_16X16X16_IU4_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr17, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_I32_16X16X16_IU4_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr17, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
...
|
||||
|
||||
---
|
||||
name: test_wmma_f32_16x16x16_fp8_fp8_D0_overlaps_A1
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
|
||||
|
||||
; GFX12-LABEL: name: test_wmma_f32_16x16x16_fp8_fp8_D0_overlaps_A1
|
||||
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
; GFX12-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_F32_16X16X16_FP8_FP8_w64_twoaddr killed $vgpr0, killed $vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_F32_16X16X16_FP8_FP8_w64_twoaddr killed $vgpr0, killed $vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec
|
||||
...
|
||||
|
||||
---
|
||||
name: test_wmma_f32_16x16x16_bf8_fp8_D0_overlaps_B1
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
|
||||
|
||||
; GFX12-LABEL: name: test_wmma_f32_16x16x16_bf8_fp8_D0_overlaps_B1
|
||||
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
; GFX12-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_F32_16X16X16_BF8_FP8_w64_twoaddr killed $vgpr16, killed $vgpr0, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_F32_16X16X16_BF8_FP8_w64_twoaddr killed $vgpr16, killed $vgpr0, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec
|
||||
...
|
||||
|
||||
---
|
||||
name: test_wmma_f32_16x16x16_fp8_bf8_D0_overlaps_C1
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
|
||||
|
||||
; GFX12-LABEL: name: test_wmma_f32_16x16x16_fp8_bf8_D0_overlaps_C1
|
||||
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
; GFX12-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_FP8_BF8_w64_twoaddr killed $vgpr16, killed $vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_FP8_BF8_w64_twoaddr killed $vgpr16, killed $vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
...
|
||||
|
||||
---
|
||||
name: test_wmma_f32_16x16x16_bf8_bf8_D0_overlaps_A1
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
|
||||
|
||||
; GFX12-LABEL: name: test_wmma_f32_16x16x16_bf8_bf8_D0_overlaps_A1
|
||||
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
; GFX12-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_F32_16X16X16_BF8_BF8_w64_twoaddr killed $vgpr0, killed $vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_F32_16X16X16_BF8_BF8_w64_twoaddr killed $vgpr0, killed $vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec
|
||||
...
|
||||
|
||||
---
|
||||
name: test_wmma_i32_16x16x32_iu4_D0_overlaps_B1
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
|
||||
|
||||
; GFX12-LABEL: name: test_wmma_i32_16x16x32_iu4_D0_overlaps_B1
|
||||
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
; GFX12-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_I32_16X16X32_IU4_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr0, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_I32_16X16X32_IU4_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr0, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec
|
||||
...
|
||||
|
||||
---
|
||||
name: test_swmmac_f32_16x16x32_f16_D0_overlaps_C1
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28
|
||||
|
||||
; GFX12-LABEL: name: test_swmmac_f32_16x16x32_f16_D0_overlaps_C1
|
||||
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_SWMMAC_F32_16X16X32_F16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr26, 0, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_SWMMAC_F32_16X16X32_F16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr26, 0, 0, 0, implicit $exec
|
||||
...
|
||||
|
||||
---
|
||||
name: test_swmmac_f32_16x16x32_bf16_D0_overlaps_Index1
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28
|
||||
|
||||
; GFX12-LABEL: name: test_swmmac_f32_16x16x32_bf16_D0_overlaps_Index1
|
||||
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
; GFX12-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr22_vgpr23_vgpr24_vgpr25 = V_SWMMAC_F32_16X16X32_BF16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, killed $vgpr22_vgpr23_vgpr24_vgpr25, killed $vgpr0, 0, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr22_vgpr23_vgpr24_vgpr25 = V_SWMMAC_F32_16X16X32_BF16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, killed $vgpr22_vgpr23_vgpr24_vgpr25, killed $vgpr0, 0, 0, 0, implicit $exec
|
||||
...
|
||||
|
||||
---
|
||||
name: test_swmmac_f16_16x16x32_f16_D0_overlaps_A1
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26
|
||||
|
||||
; GFX12-LABEL: name: test_swmmac_f16_16x16x32_f16_D0_overlaps_A1
|
||||
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
; GFX12-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr22_vgpr23 = V_SWMMAC_F16_16X16X32_F16_w64_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, killed $vgpr22_vgpr23, killed $vgpr24, 0, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr22_vgpr23 = V_SWMMAC_F16_16X16X32_F16_w64_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, killed $vgpr22_vgpr23, killed $vgpr24, 0, 0, 0, implicit $exec
|
||||
...
|
||||
|
||||
---
|
||||
name: test_swmmac_bf16_16x16x32_bf16_D0_overlaps_B1
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26
|
||||
|
||||
; GFX12-LABEL: name: test_swmmac_bf16_16x16x32_bf16_D0_overlaps_B1
|
||||
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
; GFX12-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr22_vgpr23 = V_SWMMAC_BF16_16X16X32_BF16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr22_vgpr23, killed $vgpr24, 0, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr22_vgpr23 = V_SWMMAC_BF16_16X16X32_BF16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr22_vgpr23, killed $vgpr24, 0, 0, 0, implicit $exec
|
||||
...
|
||||
|
||||
---
|
||||
name: test_swmmac_i32_16x16x32_iu8_D0_overlaps_C1
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
|
||||
|
||||
; GFX12-LABEL: name: test_swmmac_i32_16x16x32_iu8_D0_overlaps_C1
|
||||
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_SWMMAC_I32_16X16X32_IU8_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr17_vgpr18, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr23, 0, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_SWMMAC_I32_16X16X32_IU8_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr17_vgpr18, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr23, 0, 0, 0, implicit $exec
|
||||
...
|
||||
|
||||
---
|
||||
name: test_swmmac_i32_16x16x32_iu4_D0_overlaps_Index1
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24
|
||||
|
||||
; GFX12-LABEL: name: test_swmmac_i32_16x16x32_iu4_D0_overlaps_Index1
|
||||
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
; GFX12-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_SWMMAC_I32_16X16X32_IU4_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr16, killed $vgpr18_vgpr19_vgpr20_vgpr21, killed $vgpr0, 0, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_SWMMAC_I32_16X16X32_IU4_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr16, killed $vgpr18_vgpr19_vgpr20_vgpr21, killed $vgpr0, 0, 0, 0, implicit $exec
|
||||
...
|
||||
|
||||
---
|
||||
name: test_swmmac_i32_16x16x64_iu4_D0_overlaps_A1
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
|
||||
|
||||
; GFX12-LABEL: name: test_swmmac_i32_16x16x64_iu4_D0_overlaps_A1
|
||||
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
; GFX12-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_I32_16X16X64_IU4_w64_twoaddr 8, killed $vgpr0, 8, killed $vgpr17_vgpr18, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr23, 0, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_I32_16X16X64_IU4_w64_twoaddr 8, killed $vgpr0, 8, killed $vgpr17_vgpr18, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr23, 0, 0, 0, implicit $exec
|
||||
...
|
||||
|
||||
---
|
||||
name: test_swmmac_f32_16x16x32_fp8_fp8_D0_overlaps_B1
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
|
||||
|
||||
; GFX12-LABEL: name: test_swmmac_f32_16x16x32_fp8_fp8_D0_overlaps_B1
|
||||
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
; GFX12-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_F32_16X16X32_FP8_FP8_w64_twoaddr killed $vgpr16, killed $vgpr0_vgpr1, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr23, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_F32_16X16X32_FP8_FP8_w64_twoaddr killed $vgpr16, killed $vgpr0_vgpr1, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr23, 0, implicit $exec
|
||||
...
|
||||
|
||||
---
|
||||
name: test_swmmac_f32_16x16x32_fp8_bf8_D0_overlaps_C1
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
|
||||
|
||||
; GFX12-LABEL: name: test_swmmac_f32_16x16x32_fp8_bf8_D0_overlaps_C1
|
||||
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_SWMMAC_F32_16X16X32_FP8_BF8_w64_twoaddr killed $vgpr16, killed $vgpr17_vgpr18, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr23, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_SWMMAC_F32_16X16X32_FP8_BF8_w64_twoaddr killed $vgpr16, killed $vgpr17_vgpr18, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr23, 0, implicit $exec
|
||||
...
|
||||
|
||||
---
|
||||
name: test_swmmac_f32_16x16x32_bf8_fp8_D0_overlaps_Index1
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
|
||||
|
||||
; GFX12-LABEL: name: test_swmmac_f32_16x16x32_bf8_fp8_D0_overlaps_Index1
|
||||
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
; GFX12-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_F32_16X16X32_BF8_FP8_w64_twoaddr killed $vgpr16, killed $vgpr17_vgpr18, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_F32_16X16X32_BF8_FP8_w64_twoaddr killed $vgpr16, killed $vgpr17_vgpr18, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr0, 0, implicit $exec
|
||||
...
|
||||
|
||||
---
|
||||
name: test_swmmac_f32_16x16x32_bf8_bf8_D0_overlaps_A1
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
|
||||
|
||||
; GFX12-LABEL: name: test_swmmac_f32_16x16x32_bf8_bf8_D0_overlaps_A1
|
||||
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
; GFX12-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_F32_16X16X32_BF8_BF8_w64_twoaddr killed $vgpr0, killed $vgpr17_vgpr18, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr23, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_F32_16X16X32_BF8_BF8_w64_twoaddr killed $vgpr0, killed $vgpr17_vgpr18, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr23, 0, implicit $exec
|
||||
...
|
1529
llvm/test/MC/AMDGPU/gfx12_asm_wmma_w32.s
Normal file
1529
llvm/test/MC/AMDGPU/gfx12_asm_wmma_w32.s
Normal file
File diff suppressed because it is too large
Load Diff
1529
llvm/test/MC/AMDGPU/gfx12_asm_wmma_w64.s
Normal file
1529
llvm/test/MC/AMDGPU/gfx12_asm_wmma_w64.s
Normal file
File diff suppressed because it is too large
Load Diff
1628
llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_wmma_w32.txt
Normal file
1628
llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_wmma_w32.txt
Normal file
File diff suppressed because it is too large
Load Diff
1628
llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_wmma_w64.txt
Normal file
1628
llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_wmma_w64.txt
Normal file
File diff suppressed because it is too large
Load Diff
@ -253,22 +253,23 @@ def ROCDL_mfma_f32_32x32x16_fp8_fp8 : ROCDL_Mfma_IntrOp<"mfma.f32.32x32x16.fp8.f
|
||||
|
||||
//===---------------------------------------------------------------------===//
|
||||
// WMMA intrinsics
|
||||
class ROCDL_Wmma_IntrOp<string mnemonic, list<Trait> traits = []> :
|
||||
class ROCDL_Wmma_IntrOp<string mnemonic, list<int> overloadedOperands,
|
||||
list<Trait> traits = []> :
|
||||
LLVM_IntrOpBase<ROCDL_Dialect, mnemonic,
|
||||
"amdgcn_" # !subst(".","_", mnemonic),
|
||||
[0], [], traits, 1>,
|
||||
[0], overloadedOperands, traits, 1>,
|
||||
Arguments<(ins Variadic<LLVM_Type>:$args)> {
|
||||
let assemblyFormat =
|
||||
"$args attr-dict `:` functional-type($args, $res)";
|
||||
}
|
||||
|
||||
// Available on RDNA3
|
||||
def ROCDL_wmma_f32_16x16x16_f16 : ROCDL_Wmma_IntrOp<"wmma.f32.16x16x16.f16">;
|
||||
def ROCDL_wmma_f32_16x16x16_bf16 : ROCDL_Wmma_IntrOp<"wmma.f32.16x16x16.bf16">;
|
||||
def ROCDL_wmma_f16_16x16x16_f16 : ROCDL_Wmma_IntrOp<"wmma.f16.16x16x16.f16">;
|
||||
def ROCDL_wmma_bf16_16x16x16_bf16 : ROCDL_Wmma_IntrOp<"wmma.bf16.16x16x16.bf16">;
|
||||
def ROCDL_wmma_i32_16x16x16_iu8 : ROCDL_Wmma_IntrOp<"wmma.i32.16x16x16.iu8">;
|
||||
def ROCDL_wmma_i32_16x16x16_iu4 : ROCDL_Wmma_IntrOp<"wmma.i32.16x16x16.iu4">;
|
||||
def ROCDL_wmma_f32_16x16x16_f16 : ROCDL_Wmma_IntrOp<"wmma.f32.16x16x16.f16", [0]>;
|
||||
def ROCDL_wmma_f32_16x16x16_bf16 : ROCDL_Wmma_IntrOp<"wmma.f32.16x16x16.bf16", [0]>;
|
||||
def ROCDL_wmma_f16_16x16x16_f16 : ROCDL_Wmma_IntrOp<"wmma.f16.16x16x16.f16", [0]>;
|
||||
def ROCDL_wmma_bf16_16x16x16_bf16 : ROCDL_Wmma_IntrOp<"wmma.bf16.16x16x16.bf16", [0]>;
|
||||
def ROCDL_wmma_i32_16x16x16_iu8 : ROCDL_Wmma_IntrOp<"wmma.i32.16x16x16.iu8", [1]>;
|
||||
def ROCDL_wmma_i32_16x16x16_iu4 : ROCDL_Wmma_IntrOp<"wmma.i32.16x16x16.iu4", [1]>;
|
||||
|
||||
//===---------------------------------------------------------------------===//
|
||||
// Operations on raw buffer resources (stride of 0, bounds checks either off or in
|
||||
|
@ -248,53 +248,53 @@ llvm.func @rocdl.wmma(%arg0 : vector<8xf32>, %arg1 : vector<16 x f16>, %arg2 : v
|
||||
// ---- Wave32 -----
|
||||
|
||||
// f16 -> f32
|
||||
// CHECK: call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <8 x float> %{{.*}})
|
||||
// CHECK: call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v16f16(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <8 x float> %{{.*}})
|
||||
%r0 = rocdl.wmma.f32.16x16x16.f16 %arg1, %arg1, %arg0 : (vector<16xf16>, vector<16xf16>, vector<8xf32>) -> vector<8xf32>
|
||||
|
||||
// bf16 -> f32
|
||||
// CHECK: call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <8 x float> %{{.*}})
|
||||
// CHECK: call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v16i16(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <8 x float> %{{.*}})
|
||||
%r1 = rocdl.wmma.f32.16x16x16.bf16 %arg2, %arg2, %arg0 : (vector<16xi16>, vector<16xi16>, vector<8xf32>) -> vector<8xf32>
|
||||
|
||||
// f16 -> f16 (OPSEL = {0,1})
|
||||
// CHECK: call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}}, i1 {{.*}})
|
||||
// CHECK: call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}}, i1 {{.*}})
|
||||
%r2 = rocdl.wmma.f16.16x16x16.f16 %arg1, %arg1, %arg1, %zero : (vector<16xf16>, vector<16xf16>, vector<16xf16>, i1) -> vector<16xf16>
|
||||
|
||||
// bf16 -> bf16 (OPSEL = {0,1})
|
||||
// CHECK: call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v16i16(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}, i1 {{.*}})
|
||||
// CHECK: call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v16i16.v16i16(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}, i1 {{.*}})
|
||||
%r4 = rocdl.wmma.bf16.16x16x16.bf16 %arg2, %arg2, %arg2, %zero : (vector<16xi16>, vector<16xi16>, vector<16xi16>, i1) -> vector<16xi16>
|
||||
|
||||
// int8 -> int32 (signA = {0,1}, signB = {0,1}, clamp = {0,1})
|
||||
// CHECK: call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32(i1 {{.*}}, <4 x i32> %{{.*}}, i1 {{.*}}, <4 x i32> %{{.*}}, <8 x i32> %{{.*}}, i1 {{.*}})
|
||||
// CHECK: call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v4i32(i1 {{.*}}, <4 x i32> %{{.*}}, i1 {{.*}}, <4 x i32> %{{.*}}, <8 x i32> %{{.*}}, i1 {{.*}})
|
||||
%r5 = rocdl.wmma.i32.16x16x16.iu8 %zero, %arg5, %zero, %arg5, %arg3, %zero : (i1, vector<4xi32>, i1, vector<4xi32>, vector<8xi32>, i1) -> vector<8xi32>
|
||||
|
||||
// int4 -> int32 (signA = {0,1}, signB = {0,1}, clamp = {0,1})
|
||||
// CHECK: call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32(i1 {{.*}}, <2 x i32> %{{.*}}, i1 {{.*}}, <2 x i32> %{{.*}}, <8 x i32> %{{.*}}, i1 {{.*}})
|
||||
// CHECK: call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.v2i32(i1 {{.*}}, <2 x i32> %{{.*}}, i1 {{.*}}, <2 x i32> %{{.*}}, <8 x i32> %{{.*}}, i1 {{.*}})
|
||||
%r6 = rocdl.wmma.i32.16x16x16.iu4 %zero, %arg4, %zero, %arg4, %arg3, %zero : (i1, vector<2xi32>, i1, vector<2xi32>, vector<8xi32>, i1) -> vector<8xi32>
|
||||
|
||||
// ---- Wave64 -----
|
||||
|
||||
// f16 -> f32
|
||||
// CHECK: call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <4 x float> %{{.*}})
|
||||
// CHECK: call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v16f16(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <4 x float> %{{.*}})
|
||||
%r7 = rocdl.wmma.f32.16x16x16.f16 %arg1, %arg1, %arg6 : (vector<16xf16>, vector<16xf16>, vector<4xf32>) -> vector<4xf32>
|
||||
|
||||
// bf16 -> f32
|
||||
// CHECK: call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <4 x float> %{{.*}})
|
||||
// CHECK: call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v16i16(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <4 x float> %{{.*}})
|
||||
%r8 = rocdl.wmma.f32.16x16x16.bf16 %arg2, %arg2, %arg6 : (vector<16xi16>, vector<16xi16>, vector<4xf32>) -> vector<4xf32>
|
||||
|
||||
// f16 -> f16 (OPSEL = {0,1})
|
||||
// CHECK: call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <8 x half> %{{.*}}, i1 {{.*}})
|
||||
// CHECK: call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v16f16(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <8 x half> %{{.*}}, i1 {{.*}})
|
||||
%r9 = rocdl.wmma.f16.16x16x16.f16 %arg1, %arg1, %arg7, %zero : (vector<16xf16>, vector<16xf16>, vector<8xf16>, i1) -> vector<8xf16>
|
||||
|
||||
// bf16 -> bf16 (OPSEL = {0,1})
|
||||
// CHECK: call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <8 x i16> %{{.*}}, i1 {{.*}})
|
||||
// CHECK: call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v16i16(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <8 x i16> %{{.*}}, i1 {{.*}})
|
||||
%r11 = rocdl.wmma.bf16.16x16x16.bf16 %arg2, %arg2, %arg8, %zero : (vector<16xi16>, vector<16xi16>, vector<8xi16>, i1) -> vector<8xi16>
|
||||
|
||||
// int8 -> int32 (signA = {0,1}, signB = {0,1}, clamp = {0,1})
|
||||
// CHECK: call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32(i1 {{.*}}, <4 x i32> %{{.*}}, i1 {{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 {{.*}})
|
||||
// CHECK: call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 {{.*}}, <4 x i32> %{{.*}}, i1 {{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 {{.*}})
|
||||
%r12 = rocdl.wmma.i32.16x16x16.iu8 %zero, %arg5, %zero, %arg5, %arg5, %zero : (i1, vector<4xi32>, i1, vector<4xi32>, vector<4xi32>, i1) -> vector<4xi32>
|
||||
|
||||
// int4 -> int32 (signA = {0,1}, signB = {0,1}, clamp = {0,1})
|
||||
// CHECK: call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32(i1 {{.*}}, <2 x i32> %{{.*}}, i1 {{.*}}, <2 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 {{.*}})
|
||||
// CHECK: call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 {{.*}}, <2 x i32> %{{.*}}, i1 {{.*}}, <2 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 {{.*}})
|
||||
%r13 = rocdl.wmma.i32.16x16x16.iu4 %zero, %arg4, %zero, %arg4, %arg5, %zero : (i1, vector<2xi32>, i1, vector<2xi32>, vector<4xi32>, i1) -> vector<4xi32>
|
||||
|
||||
llvm.return %r0 : vector<8xf32>
|
||||
|
Loading…
Reference in New Issue
Block a user