From ed48280f8e9dd0fc2a21c3ee24c8db3fe12702f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mirko=20Brku=C5=A1anin?= Date: Wed, 24 Jan 2024 13:43:07 +0100 Subject: [PATCH] [AMDGPU] Add GFX12 WMMA and SWMMAC instructions (#77795) Co-authored-by: Petar Avramovic Co-authored-by: Piotr Sobczak --- clang/include/clang/Basic/BuiltinsAMDGPU.def | 62 + clang/lib/CodeGen/CGBuiltin.cpp | 177 +- .../builtins-amdgcn-gfx12-wmma-w32.cl | 156 ++ .../builtins-amdgcn-gfx12-wmma-w64.cl | 155 ++ .../builtins-amdgcn-swmmac-w32.cl | 135 ++ .../builtins-amdgcn-swmmac-w64.cl | 134 ++ .../builtins-amdgcn-wmma-w32-gfx10-err.cl | 2 +- .../CodeGenOpenCL/builtins-amdgcn-wmma-w32.cl | 17 +- .../CodeGenOpenCL/builtins-amdgcn-wmma-w64.cl | 18 +- .../amdgpu/builtins-amdgcn-gfx12-wmma-w32.cl | 107 ++ .../amdgpu/builtins-amdgcn-gfx12-wmma-w64.cl | 104 ++ .../amdgpu/builtins-amdgcn-swmmac-w32.cl | 110 ++ .../amdgpu/builtins-amdgcn-swmmac-w64.cl | 109 ++ llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 115 +- llvm/lib/Target/AMDGPU/AMDGPUGISel.td | 24 + llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 330 ++++ llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h | 10 + .../AMDGPU/AMDGPUInstructionSelector.cpp | 213 +++ .../Target/AMDGPU/AMDGPUInstructionSelector.h | 13 + .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 23 + .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 16 + .../Target/AMDGPU/AMDGPUSearchableTables.td | 16 + .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 111 +- .../Disassembler/AMDGPUDisassembler.cpp | 8 + .../lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 19 +- .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp | 37 + .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h | 4 + llvm/lib/Target/AMDGPU/SIDefines.h | 3 + llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 1 + llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 30 + llvm/lib/Target/AMDGPU/SIInstrFormats.td | 5 + llvm/lib/Target/AMDGPU/SIInstrInfo.h | 8 + llvm/lib/Target/AMDGPU/SIInstrInfo.td | 11 + llvm/lib/Target/AMDGPU/SIRegisterInfo.td | 5 + llvm/lib/Target/AMDGPU/VOP3PInstructions.td | 500 ++++- llvm/lib/Target/AMDGPU/VOPInstructions.td | 3 + .../UniformityAnalysis/AMDGPU/intrinsics.ll | 135 +- ...wmma-gfx12-w32-f16-f32-matrix-modifiers.ll | 504 +++++ .../AMDGPU/GlobalISel/wmma-gfx12-w32-imm.ll | 519 ++++++ .../GlobalISel/wmma-gfx12-w32-iu-modifiers.ll | 309 ++++ .../wmma-gfx12-w32-swmmac-index_key.ll | 321 ++++ .../AMDGPU/GlobalISel/wmma-gfx12-w32.ll | 370 ++++ ...wmma-gfx12-w64-f16-f32-matrix-modifiers.ll | 459 +++++ .../AMDGPU/GlobalISel/wmma-gfx12-w64-imm.ll | 430 +++++ .../GlobalISel/wmma-gfx12-w64-iu-modifiers.ll | 274 +++ .../wmma-gfx12-w64-swmmac-index_key.ll | 472 +++++ .../AMDGPU/GlobalISel/wmma-gfx12-w64.ll | 333 ++++ ...wmma-gfx12-w32-f16-f32-matrix-modifiers.ll | 499 +++++ .../test/CodeGen/AMDGPU/wmma-gfx12-w32-imm.ll | 431 +++++ .../AMDGPU/wmma-gfx12-w32-iu-modifiers.ll | 309 ++++ .../AMDGPU/wmma-gfx12-w32-swmmac-index_key.ll | 321 ++++ llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32.ll | 370 ++++ ...wmma-gfx12-w64-f16-f32-matrix-modifiers.ll | 456 +++++ .../test/CodeGen/AMDGPU/wmma-gfx12-w64-imm.ll | 373 ++++ .../AMDGPU/wmma-gfx12-w64-iu-modifiers.ll | 274 +++ .../AMDGPU/wmma-gfx12-w64-swmmac-index_key.ll | 472 +++++ llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64.ll | 333 ++++ .../CodeGen/AMDGPU/wmma-hazards-gfx12-w32.mir | 354 ++++ .../CodeGen/AMDGPU/wmma-hazards-gfx12-w64.mir | 355 ++++ llvm/test/MC/AMDGPU/gfx12_asm_wmma_w32.s | 1529 ++++++++++++++++ llvm/test/MC/AMDGPU/gfx12_asm_wmma_w64.s | 1529 ++++++++++++++++ .../AMDGPU/gfx12_dasm_wmma_w32.txt | 1628 +++++++++++++++++ .../AMDGPU/gfx12_dasm_wmma_w64.txt | 1628 +++++++++++++++++ mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td | 17 +- mlir/test/Target/LLVMIR/rocdl.mlir | 24 +- 65 files changed, 17708 insertions(+), 111 deletions(-) create mode 100644 clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w32.cl create mode 100644 clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w64.cl create mode 100644 clang/test/CodeGenOpenCL/builtins-amdgcn-swmmac-w32.cl create mode 100644 clang/test/CodeGenOpenCL/builtins-amdgcn-swmmac-w64.cl create mode 100644 cross-project-tests/amdgpu/builtins-amdgcn-gfx12-wmma-w32.cl create mode 100644 cross-project-tests/amdgpu/builtins-amdgcn-gfx12-wmma-w64.cl create mode 100644 cross-project-tests/amdgpu/builtins-amdgcn-swmmac-w32.cl create mode 100644 cross-project-tests/amdgpu/builtins-amdgcn-swmmac-w64.cl create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-imm.ll create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-iu-modifiers.ll create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-swmmac-index_key.ll create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32.ll create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-imm.ll create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-iu-modifiers.ll create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-swmmac-index_key.ll create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64.ll create mode 100644 llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll create mode 100644 llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-imm.ll create mode 100644 llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-iu-modifiers.ll create mode 100644 llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-swmmac-index_key.ll create mode 100644 llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32.ll create mode 100644 llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll create mode 100644 llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-imm.ll create mode 100644 llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-iu-modifiers.ll create mode 100644 llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-swmmac-index_key.ll create mode 100644 llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64.ll create mode 100644 llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx12-w32.mir create mode 100644 llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx12-w64.mir create mode 100644 llvm/test/MC/AMDGPU/gfx12_asm_wmma_w32.s create mode 100644 llvm/test/MC/AMDGPU/gfx12_asm_wmma_w64.s create mode 100644 llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_wmma_w32.txt create mode 100644 llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_wmma_w64.txt diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index d208342d9c51..74dfd1d214e8 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -436,5 +436,67 @@ TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_i32, "ii*1", "nc", "gfx12-insts,w TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_v4i16, "V4sV4s*1", "nc", "gfx12-insts,wavefrontsize64") TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_v4f16, "V4hV4h*1", "nc", "gfx12-insts,wavefrontsize64") +//===----------------------------------------------------------------------===// +// WMMA builtins. +// Postfix w32 indicates the builtin requires wavefront size of 32. +// Postfix w64 indicates the builtin requires wavefront size of 64. +// +// Some of these are very similar to their GFX11 counterparts, but they don't +// require replication of the A,B matrices, so they use fewer vector elements. +// Therefore, we add an "_gfx12" suffix to distinguish them from the existing +// builtins. +//===----------------------------------------------------------------------===// +TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12, "V8fV8hV8hV8f", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12, "V8fV8sV8sV8f", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12, "V8hV8hV8hV8h", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32_gfx12, "V8sV8sV8sV8s", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12, "V8iIbV2iIbV2iV8iIb", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12, "V8iIbiIbiV8iIb", "nc", "gfx12-insts,wavefrontsize32") +// These are gfx12-only, but for consistency with the other WMMA variants we're +// keeping the "_gfx12" suffix. +TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12, "V8fV2iV2iV8f", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12, "V8fV2iV2iV8f", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12, "V8fV2iV2iV8f", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12, "V8fV2iV2iV8f", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12, "V8iIbV2iIbV2iV8iIb", "nc", "gfx12-insts,wavefrontsize32") + +TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12, "V4fV4hV4hV4f", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64_gfx12, "V4fV4sV4sV4f", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12, "V4hV4hV4hV4h", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64_gfx12, "V4sV4sV4sV4s", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64_gfx12, "V4iIbiIbiV4iIb", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12, "V4iIbiIbiV4iIb", "nc", "gfx12-insts,wavefrontsize64") +// These are gfx12-only, but for consistency with the other WMMA variants we're +// keeping the "_gfx12" suffix. +TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12, "V4fiiV4f", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12, "V4fiiV4f", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12, "V4fiiV4f", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64_gfx12, "V4fiiV4f", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12, "V4iIbiIbiV4iIb", "nc", "gfx12-insts,wavefrontsize64") + +TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_f16_w32, "V8fV8hV16hV8fs", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w32, "V8fV8sV16sV8fs", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_f16_16x16x32_f16_w32, "V8hV8hV16hV8hs", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w32, "V8sV8sV16sV8ss", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w32, "V8iIbV2iIbV4iV8isIb", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w32, "V8iIbiIbV2iV8isIb", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w32, "V8iIbV2iIbV4iV8isIb", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32, "V8fV2iV4iV8fs", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32, "V8fV2iV4iV8fs", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32, "V8fV2iV4iV8fs", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32, "V8fV2iV4iV8fs", "nc", "gfx12-insts,wavefrontsize32") + +TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_f16_w64, "V4fV4hV8hV4fs", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w64, "V4fV4sV8sV4fs", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_f16_16x16x32_f16_w64, "V4hV4hV8hV4hs", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w64, "V4sV4sV8sV4ss", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w64, "V4iIbiIbV2iV4isIb", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w64, "V4iIbiIbiV4isIb", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w64, "V4iIbiIbV2iV4isIb", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64, "V4fiV2iV4fs", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64, "V4fiV2iV4fs", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64, "V4fiV2iV4fs", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64, "V4fiV2iV4fs", "nc", "gfx12-insts,wavefrontsize64") + #undef BUILTIN #undef TARGET_BUILTIN diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 7ef764b8e1ac..a4f26a6f0eb1 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -18279,65 +18279,216 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32: case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64: case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32: - case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64: { + case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64: + case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w64: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w64: + case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w64: + case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w64: + case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w64: + case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w64: + case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w64: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64: { // These operations perform a matrix multiplication and accumulation of // the form: // D = A * B + C - // The return type always matches the type of matrix C. - unsigned ArgForMatchingRetType; + // We need to specify one type for matrices AB and one for matrices CD. + // Sparse matrix operations can have different types for A and B as well as + // an additional type for sparsity index. + // Destination type should be put before types used for source operands. + SmallVector ArgsForMatchingMatrixTypes; + // On GFX12, the intrinsics with 16-bit accumulator use a packed layout. + // There is no need for the variable opsel argument, so always set it to + // "false". + bool AppendFalseForOpselArg = false; unsigned BuiltinWMMAOp; switch (BuiltinID) { case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32: case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64: - ArgForMatchingRetType = 2; + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12: + ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_f16; break; case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32: case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64: - ArgForMatchingRetType = 2; + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64_gfx12: + ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_bf16; break; + case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12: + AppendFalseForOpselArg = true; + LLVM_FALLTHROUGH; case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32: case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64: - ArgForMatchingRetType = 2; + ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x16_f16; break; + case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64_gfx12: + AppendFalseForOpselArg = true; + LLVM_FALLTHROUGH; case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32: case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64: - ArgForMatchingRetType = 2; + ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB BuiltinWMMAOp = Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16; break; case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32: case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64: - ArgForMatchingRetType = 2; + ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x16_f16_tied; break; case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32: case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64: - ArgForMatchingRetType = 2; + ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB BuiltinWMMAOp = Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16_tied; break; case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32: case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64: - ArgForMatchingRetType = 4; + case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64_gfx12: + ArgsForMatchingMatrixTypes = {4, 1}; // CD, AB BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x16_iu8; break; case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32: case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64: - ArgForMatchingRetType = 4; + case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12: + ArgsForMatchingMatrixTypes = {4, 1}; // CD, AB BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x16_iu4; break; + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12: + ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB + BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_fp8; + break; + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12: + ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB + BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_bf8; + break; + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12: + ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB + BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_fp8; + break; + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64_gfx12: + ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB + BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_bf8; + break; + case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12: + ArgsForMatchingMatrixTypes = {4, 1}; // CD, AB + BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x32_iu4; + break; + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w64: + ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index + BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_f16; + break; + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w64: + ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index + BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16; + break; + case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w64: + ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index + BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f16_16x16x32_f16; + break; + case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w64: + ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index + BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16; + break; + case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w64: + ArgsForMatchingMatrixTypes = {4, 1, 3, 5}; // CD, A, B, Index + BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8; + break; + case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w64: + ArgsForMatchingMatrixTypes = {4, 1, 3, 5}; // CD, A, B, Index + BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4; + break; + case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w64: + ArgsForMatchingMatrixTypes = {4, 1, 3, 5}; // CD, A, B, Index + BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4; + break; + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64: + ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index + BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8; + break; + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64: + ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index + BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8; + break; + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64: + ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index + BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8; + break; + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64: + ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index + BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8; + break; } SmallVector Args; for (int i = 0, e = E->getNumArgs(); i != e; ++i) Args.push_back(EmitScalarExpr(E->getArg(i))); + if (AppendFalseForOpselArg) + Args.push_back(Builder.getFalse()); - Function *F = CGM.getIntrinsic(BuiltinWMMAOp, - {Args[ArgForMatchingRetType]->getType()}); + SmallVector ArgTypes; + for (auto ArgIdx : ArgsForMatchingMatrixTypes) + ArgTypes.push_back(Args[ArgIdx]->getType()); + Function *F = CGM.getIntrinsic(BuiltinWMMAOp, ArgTypes); return Builder.CreateCall(F, Args); } diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w32.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w32.cl new file mode 100644 index 000000000000..11747af7ea74 --- /dev/null +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w32.cl @@ -0,0 +1,156 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: amdgpu-registered-target +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize32 -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200 + +typedef int v2i __attribute__((ext_vector_type(2))); +typedef float v8f __attribute__((ext_vector_type(8))); +typedef half v8h __attribute__((ext_vector_type(8))); +typedef short v8s __attribute__((ext_vector_type(8))); +typedef int v8i __attribute__((ext_vector_type(8))); + +// Wave32 + +// +// amdgcn_wmma_f32_16x16x16_f16 +// + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_f16_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16(<8 x half> [[A:%.*]], <8 x half> [[B:%.*]], <8 x float> [[C:%.*]]) +// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4:![0-9]+]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_f32_16x16x16_f16_w32(global v8f* out, v8h a, v8h b, v8f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12(a, b, c); +} + +// +// amdgcn_wmma_f32_16x16x16_bf16 +// + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_bf16_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x float> [[C:%.*]]) +// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_f32_16x16x16_bf16_w32(global v8f* out, v8s a, v8s b, v8f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12(a, b, c); +} + +// +// amdgcn_wmma_f16_16x16x16_f16 +// + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f16_16x16x16_f16_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> [[A:%.*]], <8 x half> [[B:%.*]], <8 x half> [[C:%.*]], i1 false) +// CHECK-GFX1200-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_f16_16x16x16_f16_w32(global v8h* out, v8h a, v8h b, v8h c) +{ + *out = __builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12(a, b, c); +} + +// +// amdgcn_wmma_bf16_16x16x16_bf16 +// + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_bf16_16x16x16_bf16_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], i1 false) +// CHECK-GFX1200-NEXT: store <8 x i16> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_bf16_16x16x16_bf16_w32(global v8s* out, v8s a, v8s b, v8s c) +{ + *out = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32_gfx12(a, b, c); +} + +// +// amdgcn_wmma_i32_16x16x16_iu8 +// + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_i32_16x16x16_iu8_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32(i1 true, <2 x i32> [[A:%.*]], i1 true, <2 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i1 false) +// CHECK-GFX1200-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_i32_16x16x16_iu8_w32(global v8i* out, v2i a, v2i b, v8i c) +{ + *out = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12(true, a, true, b, c, false); +} + +// +// amdgcn_wmma_i32_16x16x16_iu4 +// + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_i32_16x16x16_iu4_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32(i1 true, i32 [[A:%.*]], i1 true, i32 [[B:%.*]], <8 x i32> [[C:%.*]], i1 false) +// CHECK-GFX1200-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_i32_16x16x16_iu4_w32(global v8i* out, int a, int b, v8i c) +{ + *out = __builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12(true, a, true, b, c, false); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <8 x float> [[C:%.*]]) +// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32(global v8f* out, v2i a, v2i b, v8f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12(a, b, c); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <8 x float> [[C:%.*]]) +// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32(global v8f* out, v2i a, v2i b, v8f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12(a, b, c); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <8 x float> [[C:%.*]]) +// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32(global v8f* out, v2i a, v2i b, v8f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12(a, b, c); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <8 x float> [[C:%.*]]) +// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32(global v8f* out, v2i a, v2i b, v8f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12(a, b, c); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_i32_16x16x32_iu4_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32(i1 true, <2 x i32> [[A:%.*]], i1 true, <2 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i1 false) +// CHECK-GFX1200-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_i32_16x16x32_iu4_w32(global v8i* out, v2i a, v2i b, v8i c) +{ + *out = __builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12(true, a, true, b, c, false); +} diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w64.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w64.cl new file mode 100644 index 000000000000..ef32648743ca --- /dev/null +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w64.cl @@ -0,0 +1,155 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: amdgpu-registered-target +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize64 -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200 + +typedef float v4f __attribute__((ext_vector_type(4))); +typedef half v4h __attribute__((ext_vector_type(4))); +typedef short v4s __attribute__((ext_vector_type(4))); +typedef int v4i __attribute__((ext_vector_type(4))); + +// Wave64 + +// +// amdgcn_wmma_f32_16x16x16_f16 +// + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_f16_w64( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half> [[A:%.*]], <4 x half> [[B:%.*]], <4 x float> [[C:%.*]]) +// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4:![0-9]+]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_f32_16x16x16_f16_w64(global v4f* out, v4h a, v4h b, v4f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12(a, b, c); +} + +// +// amdgcn_wmma_f32_16x16x16_bf16 +// + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_bf16_w64( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x float> [[C:%.*]]) +// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_f32_16x16x16_bf16_w64(global v4f* out, v4s a, v4s b, v4f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w64_gfx12(a, b, c); +} + +// +// amdgcn_wmma_f16_16x16x16_f16 +// + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f16_16x16x16_f16_w64( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16(<4 x half> [[A:%.*]], <4 x half> [[B:%.*]], <4 x half> [[C:%.*]], i1 false) +// CHECK-GFX1200-NEXT: store <4 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 8, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_f16_16x16x16_f16_w64(global v4h* out, v4h a, v4h b, v4h c) +{ + *out = __builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12(a, b, c); +} + +// +// amdgcn_wmma_bf16_16x16x16_bf16 +// + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_bf16_16x16x16_bf16_w64( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[C:%.*]], i1 false) +// CHECK-GFX1200-NEXT: store <4 x i16> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 8, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_bf16_16x16x16_bf16_w64(global v4s* out, v4s a, v4s b, v4s c) +{ + *out = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64_gfx12(a, b, c); +} + +// +// amdgcn_wmma_i32_16x16x16_iu8 +// + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_i32_16x16x16_iu8_w64( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32(i1 true, i32 [[A:%.*]], i1 true, i32 [[B:%.*]], <4 x i32> [[C:%.*]], i1 false) +// CHECK-GFX1200-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_i32_16x16x16_iu8_w64(global v4i* out, int a, int b, v4i c) +{ + *out = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w64_gfx12(true, a, true, b, c, false); +} + +// +// amdgcn_wmma_i32_16x16x16_iu4 +// + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_i32_16x16x16_iu4_w64( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32(i1 true, i32 [[A:%.*]], i1 true, i32 [[B:%.*]], <4 x i32> [[C:%.*]], i1 false) +// CHECK-GFX1200-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_i32_16x16x16_iu4_w64(global v4i* out, int a, int b, v4i c) +{ + *out = __builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12(true, a, true, b, c, false); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32(i32 [[A:%.*]], i32 [[B:%.*]], <4 x float> [[C:%.*]]) +// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32(global v4f* out, int a, int b, v4f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12(a, b, c); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32(i32 [[A:%.*]], i32 [[B:%.*]], <4 x float> [[C:%.*]]) +// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32(global v4f* out, int a, int b, v4f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12(a, b, c); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32(i32 [[A:%.*]], i32 [[B:%.*]], <4 x float> [[C:%.*]]) +// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32(global v4f* out, int a, int b, v4f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12(a, b, c); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32(i32 [[A:%.*]], i32 [[B:%.*]], <4 x float> [[C:%.*]]) +// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32(global v4f* out, int a, int b, v4f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64_gfx12(a, b, c); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_i32_16x16x32_iu4_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32(i1 true, i32 [[A:%.*]], i1 true, i32 [[B:%.*]], <4 x i32> [[C:%.*]], i1 false) +// CHECK-GFX1200-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_i32_16x16x32_iu4_w32(global v4i* out, int a, int b, v4i c) +{ + *out = __builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12(true, a, true, b, c, false); +} diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-swmmac-w32.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-swmmac-w32.cl new file mode 100644 index 000000000000..b303c2f25ddd --- /dev/null +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-swmmac-w32.cl @@ -0,0 +1,135 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: amdgpu-registered-target +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize32 -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200 + +typedef int v2i __attribute__((ext_vector_type(2))); +typedef int v4i __attribute__((ext_vector_type(4))); +typedef float v8f __attribute__((ext_vector_type(8))); +typedef half v8h __attribute__((ext_vector_type(8))); +typedef short v8s __attribute__((ext_vector_type(8))); +typedef int v8i __attribute__((ext_vector_type(8))); +typedef half v16h __attribute__((ext_vector_type(16))); +typedef short v16s __attribute__((ext_vector_type(16))); + +// Wave32 + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_f16_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.i16(<8 x half> [[A:%.*]], <16 x half> [[B:%.*]], <8 x float> [[C:%.*]], i16 [[INDEX:%.*]]) +// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4:![0-9]+]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_f32_16x16x32_f16_w32(global v8f* out, v8h a, v16h b, v8f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_f16_w32(a, b, c, index); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_bf16_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.i16(<8 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x float> [[C:%.*]], i16 [[INDEX:%.*]]) +// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_f32_16x16x32_bf16_w32(global v8f* out, v8s a, v16s b, v8f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_bf16_w32(a, b, c, index); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f16_16x16x32_f16_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> [[A:%.*]], <16 x half> [[B:%.*]], <8 x half> [[C:%.*]], i16 [[INDEX:%.*]]) +// CHECK-GFX1200-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_f16_16x16x32_f16_w32(global v8h* out, v8h a, v16h b, v8h c, short index) +{ + *out = __builtin_amdgcn_swmmac_f16_16x16x32_f16_w32(a, b, c, index); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_bf16_16x16x32_bf16_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.i16(<8 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], i16 [[INDEX:%.*]]) +// CHECK-GFX1200-NEXT: store <8 x i16> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_bf16_16x16x32_bf16_w32(global v8s* out, v8s a, v16s b, v8s c, short index) +{ + *out = __builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w32(a, b, c, index); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_i32_16x16x32_iu8_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.i16(i1 true, <2 x i32> [[A:%.*]], i1 true, <4 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i16 [[INDEX:%.*]], i1 true) +// CHECK-GFX1200-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_i32_16x16x32_iu8_w32(global v8i* out, v2i a, v4i b, v8i c, short index) +{ + *out = __builtin_amdgcn_swmmac_i32_16x16x32_iu8_w32(true, a, true, b, c, index, true); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_i32_16x16x32_iu4_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.i16(i1 true, i32 [[A:%.*]], i1 true, <2 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i16 [[INDEX:%.*]], i1 true) +// CHECK-GFX1200-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_i32_16x16x32_iu4_w32(global v8i* out, int a, v2i b, v8i c, short index) +{ + *out = __builtin_amdgcn_swmmac_i32_16x16x32_iu4_w32(true, a, true, b, c, index, true); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_i32_16x16x64_iu4_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.i16(i1 true, <2 x i32> [[A:%.*]], i1 true, <4 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i16 [[INDEX:%.*]], i1 true) +// CHECK-GFX1200-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_i32_16x16x64_iu4_w32(global v8i* out, v2i a, v4i b, v8i c, short index) +{ + *out = __builtin_amdgcn_swmmac_i32_16x16x64_iu4_w32(true, a, true, b, c, index, true); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.i16(<2 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <8 x float> [[C:%.*]], i16 [[INDEX:%.*]]) +// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32(global v8f* out, v2i a, v4i b, v8f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32(a, b, c, index); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.i16(<2 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <8 x float> [[C:%.*]], i16 [[INDEX:%.*]]) +// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32(global v8f* out, v2i a, v4i b, v8f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32(a, b, c, index); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.i16(<2 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <8 x float> [[C:%.*]], i16 [[INDEX:%.*]]) +// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32(global v8f* out, v2i a, v4i b, v8f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32(a, b, c, index); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.i16(<2 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <8 x float> [[C:%.*]], i16 [[INDEX:%.*]]) +// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32(global v8f* out, v2i a, v4i b, v8f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32(a, b, c, index); +} diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-swmmac-w64.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-swmmac-w64.cl new file mode 100644 index 000000000000..855fa7351e15 --- /dev/null +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-swmmac-w64.cl @@ -0,0 +1,134 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: amdgpu-registered-target +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize64 -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200 + +typedef int v2i __attribute__((ext_vector_type(2))); +typedef int v4i __attribute__((ext_vector_type(4))); +typedef float v4f __attribute__((ext_vector_type(4))); +typedef half v4h __attribute__((ext_vector_type(4))); +typedef short v4s __attribute__((ext_vector_type(4))); +typedef half v8h __attribute__((ext_vector_type(8))); +typedef short v8s __attribute__((ext_vector_type(8))); + +// Wave64 + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_f16_w64( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.i16(<4 x half> [[A:%.*]], <8 x half> [[B:%.*]], <4 x float> [[C:%.*]], i16 [[INDEX:%.*]]) +// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4:![0-9]+]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_f32_16x16x32_f16_w64(global v4f* out, v4h a, v8h b, v4f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_f16_w64(a, b, c, index); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_bf16_w64( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.i16(<4 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <4 x float> [[C:%.*]], i16 [[INDEX:%.*]]) +// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_f32_16x16x32_bf16_w64(global v4f* out, v4s a, v8s b, v4f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_bf16_w64(a, b, c, index); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f16_16x16x32_f16_w64( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.i16(<4 x half> [[A:%.*]], <8 x half> [[B:%.*]], <4 x half> [[C:%.*]], i16 [[INDEX:%.*]]) +// CHECK-GFX1200-NEXT: store <4 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 8, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_f16_16x16x32_f16_w64(global v4h* out, v4h a, v8h b, v4h c, short index) +{ + *out = __builtin_amdgcn_swmmac_f16_16x16x32_f16_w64(a, b, c, index); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_bf16_16x16x32_bf16_w64( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.i16(<4 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <4 x i16> [[C:%.*]], i16 [[INDEX:%.*]]) +// CHECK-GFX1200-NEXT: store <4 x i16> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 8, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_bf16_16x16x32_bf16_w64(global v4s* out, v4s a, v8s b, v4s c, short index) +{ + *out = __builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w64(a, b, c, index); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_i32_16x16x32_iu8_w64( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.i16(i1 true, i32 [[A:%.*]], i1 true, <2 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], i16 [[INDEX:%.*]], i1 true) +// CHECK-GFX1200-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_i32_16x16x32_iu8_w64(global v4i* out, int a, v2i b, v4i c, short index) +{ + *out = __builtin_amdgcn_swmmac_i32_16x16x32_iu8_w64(true, a, true, b, c, index, true); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_i32_16x16x32_iu4_w64( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.i16(i1 true, i32 [[A:%.*]], i1 true, i32 [[B:%.*]], <4 x i32> [[C:%.*]], i16 [[INDEX:%.*]], i1 true) +// CHECK-GFX1200-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_i32_16x16x32_iu4_w64(global v4i* out, int a, int b, v4i c, short index) +{ + *out = __builtin_amdgcn_swmmac_i32_16x16x32_iu4_w64(true, a, true, b, c, index, true); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_i32_16x16x64_iu4_w64( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.i16(i1 true, i32 [[A:%.*]], i1 true, <2 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], i16 [[INDEX:%.*]], i1 true) +// CHECK-GFX1200-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_i32_16x16x64_iu4_w64(global v4i* out, int a, v2i b, v4i c, short index) +{ + *out = __builtin_amdgcn_swmmac_i32_16x16x64_iu4_w64(true, a, true, b, c, index, true); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.i16(i32 [[A:%.*]], <2 x i32> [[B:%.*]], <4 x float> [[C:%.*]], i16 [[INDEX:%.*]]) +// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64(global v4f* out, int a, v2i b, v4f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64(a, b, c, index); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.i16(i32 [[A:%.*]], <2 x i32> [[B:%.*]], <4 x float> [[C:%.*]], i16 [[INDEX:%.*]]) +// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64(global v4f* out, int a, v2i b, v4f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64(a, b, c, index); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.i16(i32 [[A:%.*]], <2 x i32> [[B:%.*]], <4 x float> [[C:%.*]], i16 [[INDEX:%.*]]) +// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64(global v4f* out, int a, v2i b, v4f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64(a, b, c, index); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.i16(i32 [[A:%.*]], <2 x i32> [[B:%.*]], <4 x float> [[C:%.*]], i16 [[INDEX:%.*]]) +// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64(global v4f* out, int a, v2i b, v4f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64(a, b, c, index); +} diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w32-gfx10-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w32-gfx10-err.cl index 41a78ae268be..49cb797df423 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w32-gfx10-err.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w32-gfx10-err.cl @@ -16,7 +16,7 @@ typedef short v16s __attribute__((ext_vector_type(16))); // Wave32 -void test_amdgcn_wmma_f32_16x16x16_bf16_w32(global v8f* out8f, v16s a16s, v16s b16s, v8f c8f, +void test_amdgcn_wmma_f32_16x16x16_bf16_w32(global v8f* out8f, v16s a16s, v16s b16s, v8f c8f, global v16h* out16h, v16h a16h, v16h b16h, v16h c16h, global v16s* out16s, v2i a2i, v2i b2i, v16s c16s, global v8i* out8i, v4i a4i, v4i b4i, v8i c8i) diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w32.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w32.cl index 4f13c75e5e81..3c6aaf5e3828 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w32.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w32.cl @@ -2,7 +2,6 @@ // REQUIRES: amdgpu-registered-target // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1100 -DWMMA_GFX1100_TESTS -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1100 -typedef float v4f __attribute__((ext_vector_type(4))); typedef float v8f __attribute__((ext_vector_type(8))); typedef half v16h __attribute__((ext_vector_type(16))); typedef int v2i __attribute__((ext_vector_type(2))); @@ -20,7 +19,7 @@ typedef short v16s __attribute__((ext_vector_type(16))); // CHECK-GFX1100-LABEL: @test_amdgcn_wmma_f32_16x16x16_f16_w32( // CHECK-GFX1100-NEXT: entry: -// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <8 x float> [[C:%.*]]) +// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v16f16(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <8 x float> [[C:%.*]]) // CHECK-GFX1100-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4:![0-9]+]] // CHECK-GFX1100-NEXT: ret void // @@ -35,7 +34,7 @@ void test_amdgcn_wmma_f32_16x16x16_f16_w32(global v8f* out, v16h a, v16h b, v8f // CHECK-GFX1100-LABEL: @test_amdgcn_wmma_f32_16x16x16_bf16_w32( // CHECK-GFX1100-NEXT: entry: -// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x float> [[C:%.*]]) +// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v16i16(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x float> [[C:%.*]]) // CHECK-GFX1100-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] // CHECK-GFX1100-NEXT: ret void // @@ -50,7 +49,7 @@ void test_amdgcn_wmma_f32_16x16x16_bf16_w32(global v8f* out, v16s a, v16s b, v8f // CHECK-GFX1100-LABEL: @test_amdgcn_wmma_f16_16x16x16_f16_w32( // CHECK-GFX1100-NEXT: entry: -// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <16 x half> [[C:%.*]], i1 true) +// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <16 x half> [[C:%.*]], i1 true) // CHECK-GFX1100-NEXT: store <16 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] // CHECK-GFX1100-NEXT: ret void // @@ -65,7 +64,7 @@ void test_amdgcn_wmma_f16_16x16x16_f16_w32(global v16h* out, v16h a, v16h b, v16 // CHECK-GFX1100-LABEL: @test_amdgcn_wmma_bf16_16x16x16_bf16_w32( // CHECK-GFX1100-NEXT: entry: -// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v16i16(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i16> [[C:%.*]], i1 true) +// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v16i16.v16i16(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i16> [[C:%.*]], i1 true) // CHECK-GFX1100-NEXT: store <16 x i16> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] // CHECK-GFX1100-NEXT: ret void // @@ -80,7 +79,7 @@ void test_amdgcn_wmma_bf16_16x16x16_bf16_w32(global v16s* out, v16s a, v16s b, v // CHECK-GFX1100-LABEL: @test_amdgcn_wmma_f16_16x16x16_f16_tied_w32( // CHECK-GFX1100-NEXT: entry: -// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.tied.v16f16(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <16 x half> [[C:%.*]], i1 true) +// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.tied.v16f16.v16f16(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <16 x half> [[C:%.*]], i1 true) // CHECK-GFX1100-NEXT: store <16 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] // CHECK-GFX1100-NEXT: ret void // @@ -95,7 +94,7 @@ void test_amdgcn_wmma_f16_16x16x16_f16_tied_w32(global v16h* out, v16h a, v16h b // CHECK-GFX1100-LABEL: @test_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32( // CHECK-GFX1100-NEXT: entry: -// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.tied.v16i16(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i16> [[C:%.*]], i1 true) +// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.tied.v16i16.v16i16(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i16> [[C:%.*]], i1 true) // CHECK-GFX1100-NEXT: store <16 x i16> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] // CHECK-GFX1100-NEXT: ret void // @@ -110,7 +109,7 @@ void test_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32(global v16s* out, v16s a, v16s // CHECK-GFX1100-LABEL: @test_amdgcn_wmma_i32_16x16x16_iu8_w32( // CHECK-GFX1100-NEXT: entry: -// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32(i1 true, <4 x i32> [[A:%.*]], i1 true, <4 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i1 false) +// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v4i32(i1 true, <4 x i32> [[A:%.*]], i1 true, <4 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i1 false) // CHECK-GFX1100-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] // CHECK-GFX1100-NEXT: ret void // @@ -125,7 +124,7 @@ void test_amdgcn_wmma_i32_16x16x16_iu8_w32(global v8i* out, v4i a, v4i b, v8i c) // CHECK-GFX1100-LABEL: @test_amdgcn_wmma_i32_16x16x16_iu4_w32( // CHECK-GFX1100-NEXT: entry: -// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32(i1 true, <2 x i32> [[A:%.*]], i1 true, <2 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i1 false) +// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.v2i32(i1 true, <2 x i32> [[A:%.*]], i1 true, <2 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i1 false) // CHECK-GFX1100-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] // CHECK-GFX1100-NEXT: ret void // diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w64.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w64.cl index 4797675f50d4..1490f14fd17b 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w64.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w64.cl @@ -3,12 +3,10 @@ // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1100 -DWMMA_GFX1100_TESTS -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1100 typedef float v4f __attribute__((ext_vector_type(4))); -typedef float v8f __attribute__((ext_vector_type(8))); typedef half v8h __attribute__((ext_vector_type(8))); typedef half v16h __attribute__((ext_vector_type(16))); typedef int v2i __attribute__((ext_vector_type(2))); typedef int v4i __attribute__((ext_vector_type(4))); -typedef int v8i __attribute__((ext_vector_type(8))); typedef short v8s __attribute__((ext_vector_type(8))); typedef short v16s __attribute__((ext_vector_type(16))); @@ -22,7 +20,7 @@ typedef short v16s __attribute__((ext_vector_type(16))); // CHECK-GFX1100-LABEL: @test_amdgcn_wmma_f32_16x16x16_f16_w64( // CHECK-GFX1100-NEXT: entry: -// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <4 x float> [[C:%.*]]) +// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v16f16(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <4 x float> [[C:%.*]]) // CHECK-GFX1100-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4:![0-9]+]] // CHECK-GFX1100-NEXT: ret void // @@ -37,7 +35,7 @@ void test_amdgcn_wmma_f32_16x16x16_f16_w64(global v4f* out, v16h a, v16h b, v4f // CHECK-GFX1100-LABEL: @test_amdgcn_wmma_f32_16x16x16_bf16_w64( // CHECK-GFX1100-NEXT: entry: -// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <4 x float> [[C:%.*]]) +// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v16i16(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <4 x float> [[C:%.*]]) // CHECK-GFX1100-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] // CHECK-GFX1100-NEXT: ret void // @@ -52,7 +50,7 @@ void test_amdgcn_wmma_f32_16x16x16_bf16_w64(global v4f* out, v16s a, v16s b, v4f // CHECK-GFX1100-LABEL: @test_amdgcn_wmma_f16_16x16x16_f16_w64( // CHECK-GFX1100-NEXT: entry: -// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <8 x half> [[C:%.*]], i1 true) +// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v16f16(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <8 x half> [[C:%.*]], i1 true) // CHECK-GFX1100-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] // CHECK-GFX1100-NEXT: ret void // @@ -67,7 +65,7 @@ void test_amdgcn_wmma_f16_16x16x16_f16_w64(global v8h* out, v16h a, v16h b, v8h // CHECK-GFX1100-LABEL: @test_amdgcn_wmma_bf16_16x16x16_bf16_w64( // CHECK-GFX1100-NEXT: entry: -// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], i1 true) +// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v16i16(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], i1 true) // CHECK-GFX1100-NEXT: store <8 x i16> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] // CHECK-GFX1100-NEXT: ret void // @@ -82,7 +80,7 @@ void test_amdgcn_wmma_bf16_16x16x16_bf16_w64(global v8s* out, v16s a, v16s b, v8 // CHECK-GFX1100-LABEL: @test_amdgcn_wmma_f16_16x16x16_f16_tied_w64( // CHECK-GFX1100-NEXT: entry: -// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.tied.v8f16(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <8 x half> [[C:%.*]], i1 true) +// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.tied.v8f16.v16f16(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <8 x half> [[C:%.*]], i1 true) // CHECK-GFX1100-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] // CHECK-GFX1100-NEXT: ret void // @@ -97,7 +95,7 @@ void test_amdgcn_wmma_f16_16x16x16_f16_tied_w64(global v8h* out, v16h a, v16h b, // CHECK-GFX1100-LABEL: @test_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64( // CHECK-GFX1100-NEXT: entry: -// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.tied.v8i16(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], i1 true) +// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.tied.v8i16.v16i16(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], i1 true) // CHECK-GFX1100-NEXT: store <8 x i16> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] // CHECK-GFX1100-NEXT: ret void // @@ -112,7 +110,7 @@ void test_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64(global v8s* out, v16s a, v16s // CHECK-GFX1100-LABEL: @test_amdgcn_wmma_i32_16x16x16_iu8_w64( // CHECK-GFX1100-NEXT: entry: -// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32(i1 true, <4 x i32> [[A:%.*]], i1 true, <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], i1 false) +// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 true, <4 x i32> [[A:%.*]], i1 true, <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], i1 false) // CHECK-GFX1100-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] // CHECK-GFX1100-NEXT: ret void // @@ -127,7 +125,7 @@ void test_amdgcn_wmma_i32_16x16x16_iu8_w64(global v4i* out, v4i a, v4i b, v4i c) // CHECK-GFX1100-LABEL: @test_amdgcn_wmma_i32_16x16x16_iu4_w64( // CHECK-GFX1100-NEXT: entry: -// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32(i1 true, <2 x i32> [[A:%.*]], i1 true, <2 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], i1 false) +// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 true, <2 x i32> [[A:%.*]], i1 true, <2 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], i1 false) // CHECK-GFX1100-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] // CHECK-GFX1100-NEXT: ret void // diff --git a/cross-project-tests/amdgpu/builtins-amdgcn-gfx12-wmma-w32.cl b/cross-project-tests/amdgpu/builtins-amdgcn-gfx12-wmma-w32.cl new file mode 100644 index 000000000000..867273126f8f --- /dev/null +++ b/cross-project-tests/amdgpu/builtins-amdgcn-gfx12-wmma-w32.cl @@ -0,0 +1,107 @@ +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize32 -S -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200 + +typedef int v2i __attribute__((ext_vector_type(2))); +typedef float v8f __attribute__((ext_vector_type(8))); +typedef half v8h __attribute__((ext_vector_type(8))); +typedef short v8s __attribute__((ext_vector_type(8))); +typedef int v8i __attribute__((ext_vector_type(8))); + +// Wave32 + + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f32_16x16x16_f16_w32: +// CHECK-GFX1200: v_wmma_f32_16x16x16_f16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +// +void test_amdgcn_wmma_f32_16x16x16_f16_w32(global v8f* out, v8h a, v8h b, v8f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12(a, b, c); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f32_16x16x16_bf16_w32: +// CHECK-GFX1200: v_wmma_f32_16x16x16_bf16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +// +void test_amdgcn_wmma_f32_16x16x16_bf16_w32(global v8f* out, v8s a, v8s b, v8f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12(a, b, c); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f16_16x16x16_f16_w32: +// CHECK-GFX1200: v_wmma_f16_16x16x16_f16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +// +void test_amdgcn_wmma_f16_16x16x16_f16_w32(global v8h* out, v8h a, v8h b, v8h c) +{ + *out = __builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12(a, b, c); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_bf16_16x16x16_bf16_w32: +// CHECK-GFX1200: v_wmma_bf16_16x16x16_bf16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +void test_amdgcn_wmma_bf16_16x16x16_bf16_w32(global v8s* out, v8s a, v8s b, v8s c) +{ + *out = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32_gfx12(a, b, c); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_i32_16x16x16_iu8_w32: +// CHECK-GFX1200: v_wmma_i32_16x16x16_iu8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] neg_lo:[1,1,0] +// +void test_amdgcn_wmma_i32_16x16x16_iu8_w32(global v8i* out, v2i a, v2i b, v8i c) +{ + *out = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12(true, a, true, b, c, false); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_i32_16x16x16_iu4_w32: +// CHECK-GFX1200: v_wmma_i32_16x16x16_iu4 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] neg_lo:[1,1,0] +// +void test_amdgcn_wmma_i32_16x16x16_iu4_w32(global v8i* out, int a, int b, v8i c) +{ + *out = __builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12(true, a, true, b, c, false); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32: +// CHECK-GFX1200: v_wmma_f32_16x16x16_fp8_fp8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +// +void test_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32(global v8f* out, v2i a, v2i b, v8f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12(a, b, c); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32: +// CHECK-GFX1200: v_wmma_f32_16x16x16_fp8_bf8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +// +void test_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32(global v8f* out, v2i a, v2i b, v8f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12(a, b, c); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32: +// CHECK-GFX1200: v_wmma_f32_16x16x16_bf8_fp8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +// +void test_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32(global v8f* out, v2i a, v2i b, v8f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12(a, b, c); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32: +// CHECK-GFX1200: v_wmma_f32_16x16x16_bf8_bf8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +// +void test_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32(global v8f* out, v2i a, v2i b, v8f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12(a, b, c); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_i32_16x16x32_iu4_w32: +// CHECK-GFX1200: v_wmma_i32_16x16x32_iu4 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] neg_lo:[1,1,0] +// +void test_amdgcn_wmma_i32_16x16x32_iu4_w32(global v8i* out, v2i a, v2i b, v8i c) +{ + *out = __builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12(true, a, true, b, c, false); +} diff --git a/cross-project-tests/amdgpu/builtins-amdgcn-gfx12-wmma-w64.cl b/cross-project-tests/amdgpu/builtins-amdgcn-gfx12-wmma-w64.cl new file mode 100644 index 000000000000..ad01450c35a7 --- /dev/null +++ b/cross-project-tests/amdgpu/builtins-amdgcn-gfx12-wmma-w64.cl @@ -0,0 +1,104 @@ +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize64 -S -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200 + +typedef float v4f __attribute__((ext_vector_type(4))); +typedef half v4h __attribute__((ext_vector_type(4))); +typedef short v4s __attribute__((ext_vector_type(4))); +typedef int v4i __attribute__((ext_vector_type(4))); + +// Wave64 + + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f32_16x16x16_f16_w64: +// CHECK-GFX1200: v_wmma_f32_16x16x16_f16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +// +void test_amdgcn_wmma_f32_16x16x16_f16_w64(global v4f* out, v4h a, v4h b, v4f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12(a, b, c); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f32_16x16x16_bf16_w64: +// CHECK-GFX1200: v_wmma_f32_16x16x16_bf16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +// +void test_amdgcn_wmma_f32_16x16x16_bf16_w64(global v4f* out, v4s a, v4s b, v4f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w64_gfx12(a, b, c); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f16_16x16x16_f16_w64: +// CHECK-GFX1200: v_wmma_f16_16x16x16_f16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +// +void test_amdgcn_wmma_f16_16x16x16_f16_w64(global v4h* out, v4h a, v4h b, v4h c) +{ + *out = __builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12(a, b, c); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_bf16_16x16x16_bf16_w64: +// CHECK-GFX1200: v_wmma_bf16_16x16x16_bf16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +// +void test_amdgcn_wmma_bf16_16x16x16_bf16_w64(global v4s* out, v4s a, v4s b, v4s c) +{ + *out = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64_gfx12(a, b, c); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_i32_16x16x16_iu8_w64: +// CHECK-GFX1200: v_wmma_i32_16x16x16_iu8 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] neg_lo:[1,1,0] +// +void test_amdgcn_wmma_i32_16x16x16_iu8_w64(global v4i* out, int a, int b, v4i c) +{ + *out = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w64_gfx12(true, a, true, b, c, false); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_i32_16x16x16_iu4_w64: +// CHECK-GFX1200: v_wmma_i32_16x16x16_iu4 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] neg_lo:[1,1,0] +// +void test_amdgcn_wmma_i32_16x16x16_iu4_w64(global v4i* out, int a, int b, v4i c) +{ + *out = __builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12(true, a, true, b, c, false); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32: +// CHECK-GFX1200: v_wmma_f32_16x16x16_fp8_fp8 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] +// +void test_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32(global v4f* out, int a, int b, v4f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12(a, b, c); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32: +// CHECK-GFX1200: v_wmma_f32_16x16x16_fp8_bf8 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] +// +void test_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32(global v4f* out, int a, int b, v4f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12(a, b, c); +} + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32: +// CHECK-GFX1200: v_wmma_f32_16x16x16_bf8_fp8 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] +// +void test_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32(global v4f* out, int a, int b, v4f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12(a, b, c); +} + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32: +// CHECK-GFX1200: v_wmma_f32_16x16x16_bf8_bf8 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] +// +void test_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32(global v4f* out, int a, int b, v4f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64_gfx12(a, b, c); +} + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_i32_16x16x32_iu4_w32: +// CHECK-GFX1200: v_wmma_i32_16x16x32_iu4 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] neg_lo:[1,1,0] +// +void test_amdgcn_wmma_i32_16x16x32_iu4_w32(global v4i* out, int a, int b, v4i c) +{ + *out = __builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12(true, a, true, b, c, false); +} diff --git a/cross-project-tests/amdgpu/builtins-amdgcn-swmmac-w32.cl b/cross-project-tests/amdgpu/builtins-amdgcn-swmmac-w32.cl new file mode 100644 index 000000000000..317d9a1102cc --- /dev/null +++ b/cross-project-tests/amdgpu/builtins-amdgcn-swmmac-w32.cl @@ -0,0 +1,110 @@ +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize32 -S -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200 + +typedef int v2i __attribute__((ext_vector_type(2))); +typedef int v4i __attribute__((ext_vector_type(4))); +typedef float v8f __attribute__((ext_vector_type(8))); +typedef half v8h __attribute__((ext_vector_type(8))); +typedef short v8s __attribute__((ext_vector_type(8))); +typedef int v8i __attribute__((ext_vector_type(8))); +typedef half v16h __attribute__((ext_vector_type(16))); +typedef short v16s __attribute__((ext_vector_type(16))); + +// Wave32 + + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f32_16x16x32_f16_w32: +// CHECK-GFX1200: v_swmmac_f32_16x16x32_f16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} +// +void test_amdgcn_swmmac_f32_16x16x32_f16_w32(global v8f* out, v8h a, v16h b, v8f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_f16_w32(a, b, c, index); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f32_16x16x32_bf16_w32: +// CHECK-GFX1200: v_swmmac_f32_16x16x32_bf16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} +// +void test_amdgcn_swmmac_f32_16x16x32_bf16_w32(global v8f* out, v8s a, v16s b, v8f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_bf16_w32(a, b, c, index); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f16_16x16x32_f16_w32: +// CHECK-GFX1200: v_swmmac_f16_16x16x32_f16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} +// +void test_amdgcn_swmmac_f16_16x16x32_f16_w32(global v8h* out, v8h a, v16h b, v8h c, short index) +{ + *out = __builtin_amdgcn_swmmac_f16_16x16x32_f16_w32(a, b, c, index); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_bf16_16x16x32_bf16_w32: +// CHECK-GFX1200: v_swmmac_bf16_16x16x32_bf16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} +// +void test_amdgcn_swmmac_bf16_16x16x32_bf16_w32(global v8s* out, v8s a, v16s b, v8s c, short index) +{ + *out = __builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w32(a, b, c, index); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_i32_16x16x32_iu8_w32: +// CHECK-GFX1200: v_swmmac_i32_16x16x32_iu8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} neg_lo:[1,1,0] clamp +// +void test_amdgcn_swmmac_i32_16x16x32_iu8_w32(global v8i* out, v2i a, v4i b, v8i c, short index) +{ + *out = __builtin_amdgcn_swmmac_i32_16x16x32_iu8_w32(true, a, true, b, c, index, true); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_i32_16x16x32_iu4_w32: +// CHECK-GFX1200: v_swmmac_i32_16x16x32_iu4 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} neg_lo:[1,1,0] clamp +// +void test_amdgcn_swmmac_i32_16x16x32_iu4_w32(global v8i* out, int a, v2i b, v8i c, short index) +{ + *out = __builtin_amdgcn_swmmac_i32_16x16x32_iu4_w32(true, a, true, b, c, index, true); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_i32_16x16x64_iu4_w32: +// CHECK-GFX1200: v_swmmac_i32_16x16x64_iu4 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} neg_lo:[1,1,0] clamp +// +void test_amdgcn_swmmac_i32_16x16x64_iu4_w32(global v8i* out, v2i a, v4i b, v8i c, short index) +{ + *out = __builtin_amdgcn_swmmac_i32_16x16x64_iu4_w32(true, a, true, b, c, index, true); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32: +// CHECK-GFX1200: v_swmmac_f32_16x16x32_fp8_fp8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} +// +void test_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32(global v8f* out, v2i a, v4i b, v8f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32(a, b, c, index); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32: +// CHECK-GFX1200: v_swmmac_f32_16x16x32_fp8_bf8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} +// +void test_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32(global v8f* out, v2i a, v4i b, v8f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32(a, b, c, index); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32: +// CHECK-GFX1200: v_swmmac_f32_16x16x32_bf8_fp8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} +// +void test_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32(global v8f* out, v2i a, v4i b, v8f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32(a, b, c, index); +} + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32: +// CHECK-GFX1200: v_swmmac_f32_16x16x32_bf8_bf8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} +// +void test_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32(global v8f* out, v2i a, v4i b, v8f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32(a, b, c, index); +} diff --git a/cross-project-tests/amdgpu/builtins-amdgcn-swmmac-w64.cl b/cross-project-tests/amdgpu/builtins-amdgcn-swmmac-w64.cl new file mode 100644 index 000000000000..eb81234f53a6 --- /dev/null +++ b/cross-project-tests/amdgpu/builtins-amdgcn-swmmac-w64.cl @@ -0,0 +1,109 @@ +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize64 -S -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200 + +typedef int v2i __attribute__((ext_vector_type(2))); +typedef int v4i __attribute__((ext_vector_type(4))); +typedef float v4f __attribute__((ext_vector_type(4))); +typedef half v4h __attribute__((ext_vector_type(4))); +typedef short v4s __attribute__((ext_vector_type(4))); +typedef half v8h __attribute__((ext_vector_type(8))); +typedef short v8s __attribute__((ext_vector_type(8))); + +// Wave64 + + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f32_16x16x32_f16_w64: +// CHECK-GFX1200: v_swmmac_f32_16x16x32_f16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} +// +void test_amdgcn_swmmac_f32_16x16x32_f16_w64(global v4f* out, v4h a, v8h b, v4f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_f16_w64(a, b, c, index); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f32_16x16x32_bf16_w64: +// CHECK-GFX1200: v_swmmac_f32_16x16x32_bf16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} +// +void test_amdgcn_swmmac_f32_16x16x32_bf16_w64(global v4f* out, v4s a, v8s b, v4f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_bf16_w64(a, b, c, index); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f16_16x16x32_f16_w64: +// CHECK-GFX1200: v_swmmac_f16_16x16x32_f16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} +// +void test_amdgcn_swmmac_f16_16x16x32_f16_w64(global v4h* out, v4h a, v8h b, v4h c, short index) +{ + *out = __builtin_amdgcn_swmmac_f16_16x16x32_f16_w64(a, b, c, index); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_bf16_16x16x32_bf16_w64: +// CHECK-GFX1200: v_swmmac_bf16_16x16x32_bf16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} +// +void test_amdgcn_swmmac_bf16_16x16x32_bf16_w64(global v4s* out, v4s a, v8s b, v4s c, short index) +{ + *out = __builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w64(a, b, c, index); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_i32_16x16x32_iu8_w64: +// CHECK-GFX1200: v_swmmac_i32_16x16x32_iu8 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} neg_lo:[1,1,0] clamp +// +void test_amdgcn_swmmac_i32_16x16x32_iu8_w64(global v4i* out, int a, v2i b, v4i c, short index) +{ + *out = __builtin_amdgcn_swmmac_i32_16x16x32_iu8_w64(true, a, true, b, c, index, true); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_i32_16x16x32_iu4_w64: +// CHECK-GFX1200: v_swmmac_i32_16x16x32_iu4 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} neg_lo:[1,1,0] clamp +// +void test_amdgcn_swmmac_i32_16x16x32_iu4_w64(global v4i* out, int a, int b, v4i c, short index) +{ + *out = __builtin_amdgcn_swmmac_i32_16x16x32_iu4_w64(true, a, true, b, c, index, true); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_i32_16x16x64_iu4_w64: +// CHECK-GFX1200: v_swmmac_i32_16x16x64_iu4 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} neg_lo:[1,1,0] clamp +// +void test_amdgcn_swmmac_i32_16x16x64_iu4_w64(global v4i* out, int a, v2i b, v4i c, short index) +{ + *out = __builtin_amdgcn_swmmac_i32_16x16x64_iu4_w64(true, a, true, b, c, index, true); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64: +// CHECK-GFX1200: v_swmmac_f32_16x16x32_fp8_fp8 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} +// +void test_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64(global v4f* out, int a, v2i b, v4f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64(a, b, c, index); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64: +// CHECK-GFX1200: v_swmmac_f32_16x16x32_fp8_bf8 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} +// +void test_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64(global v4f* out, int a, v2i b, v4f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64(a, b, c, index); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64: +// CHECK-GFX1200: v_swmmac_f32_16x16x32_bf8_fp8 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} +// +void test_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64(global v4f* out, int a, v2i b, v4f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64(a, b, c, index); +} + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64: +// CHECK-GFX1200: v_swmmac_f32_16x16x32_bf8_bf8 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} +// +void test_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64(global v4f* out, int a, v2i b, v4f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64(a, b, c, index); +} diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index e6db9da5526a..9eb1ac8e27be 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -2601,6 +2601,11 @@ def int_amdgcn_ds_bvh_stack_rtn : [ImmArg>, IntrWillReturn, IntrNoCallback, IntrNoFree] >; +def int_amdgcn_s_wait_event_export_ready : + ClangBuiltin<"__builtin_amdgcn_s_wait_event_export_ready">, + Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn] +>; + // WMMA (Wave Matrix Multiply-Accumulate) intrinsics // // These operations perform a matrix multiplication and accumulation of @@ -2608,10 +2613,10 @@ def int_amdgcn_ds_bvh_stack_rtn : class AMDGPUWmmaIntrinsic : Intrinsic< - [CD], // %D + [CD], // %D [ AB, // %A - AB, // %B + LLVMMatchType<1>, // %B LLVMMatchType<0>, // %C ], [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree] @@ -2619,49 +2624,50 @@ class AMDGPUWmmaIntrinsic : class AMDGPUWmmaIntrinsicOPSEL : Intrinsic< - [CD], // %D + [CD], // %D [ AB, // %A - AB, // %B + LLVMMatchType<1>, // %B LLVMMatchType<0>, // %C - llvm_i1_ty, // %high + llvm_i1_ty, // %high (op_sel) for GFX11, 0 for GFX12 ], [IntrNoMem, IntrConvergent, ImmArg>, IntrWillReturn, IntrNoCallback, IntrNoFree] >; class AMDGPUWmmaIntrinsicIU : Intrinsic< - [CD], // %D + [CD], // %D [ llvm_i1_ty, // %A_sign AB, // %A llvm_i1_ty, // %B_sign - AB, // %B + LLVMMatchType<1>, // %B LLVMMatchType<0>, // %C llvm_i1_ty, // %clamp ], [IntrNoMem, IntrConvergent, ImmArg>, ImmArg>, ImmArg>, IntrWillReturn, IntrNoCallback, IntrNoFree] >; -def int_amdgcn_wmma_f32_16x16x16_f16 : AMDGPUWmmaIntrinsic; -def int_amdgcn_wmma_f32_16x16x16_bf16 : AMDGPUWmmaIntrinsic; -// The regular, untied f16/bf16 wmma intrinsics only write to one half -// of the registers (set via the op_sel bit). -// The content of the other 16-bit of the registers is undefined. -def int_amdgcn_wmma_f16_16x16x16_f16 : AMDGPUWmmaIntrinsicOPSEL; -def int_amdgcn_wmma_bf16_16x16x16_bf16 : AMDGPUWmmaIntrinsicOPSEL; -// The tied versions of the f16/bf16 wmma intrinsics tie the destination matrix -// registers to the input accumulator registers. -// Essentially, the content of the other 16-bit is preserved from the input. -def int_amdgcn_wmma_f16_16x16x16_f16_tied : AMDGPUWmmaIntrinsicOPSEL; -def int_amdgcn_wmma_bf16_16x16x16_bf16_tied : AMDGPUWmmaIntrinsicOPSEL; -def int_amdgcn_wmma_i32_16x16x16_iu8 : AMDGPUWmmaIntrinsicIU; -def int_amdgcn_wmma_i32_16x16x16_iu4 : AMDGPUWmmaIntrinsicIU; +// WMMA GFX11Only -def int_amdgcn_s_wait_event_export_ready : - ClangBuiltin<"__builtin_amdgcn_s_wait_event_export_ready">, - Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn] ->; +// The OPSEL intrinsics read from and write to one half of the registers, selected by the op_sel bit. +// The tied versions of the f16/bf16 wmma intrinsics tie the destination matrix registers to the input accumulator registers. +// The content of the other 16-bit half is preserved from the input. +def int_amdgcn_wmma_f16_16x16x16_f16_tied : AMDGPUWmmaIntrinsicOPSEL; +def int_amdgcn_wmma_bf16_16x16x16_bf16_tied : AMDGPUWmmaIntrinsicOPSEL; + +// WMMA GFX11Plus + +def int_amdgcn_wmma_f32_16x16x16_f16 : AMDGPUWmmaIntrinsic; +def int_amdgcn_wmma_f32_16x16x16_bf16 : AMDGPUWmmaIntrinsic; +def int_amdgcn_wmma_i32_16x16x16_iu8 : AMDGPUWmmaIntrinsicIU; +def int_amdgcn_wmma_i32_16x16x16_iu4 : AMDGPUWmmaIntrinsicIU; + +// GFX11: The OPSEL intrinsics read from and write to one half of the registers, selected by the op_sel bit. +// The content of the other 16-bit half is undefined. +// GFX12: The op_sel bit must be 0. +def int_amdgcn_wmma_f16_16x16x16_f16 : AMDGPUWmmaIntrinsicOPSEL; +def int_amdgcn_wmma_bf16_16x16x16_bf16 : AMDGPUWmmaIntrinsicOPSEL; //===----------------------------------------------------------------------===// // GFX12 Intrinsics @@ -2681,6 +2687,65 @@ def int_amdgcn_permlanex16_var : ClangBuiltin<"__builtin_amdgcn_permlanex16_var" [IntrNoMem, IntrConvergent, IntrWillReturn, ImmArg>, ImmArg>, IntrNoCallback, IntrNoFree]>; + +// WMMA (Wave Matrix Multiply-Accumulate) intrinsics +// +// These operations perform a matrix multiplication and accumulation of +// the form: D = A * B + C . + +// A and B are <8 x fp8> or <8 x bf8>, but since fp8 and bf8 are not supported by llvm we use <2 x i32>. +def int_amdgcn_wmma_f32_16x16x16_fp8_fp8 : AMDGPUWmmaIntrinsic; +def int_amdgcn_wmma_f32_16x16x16_fp8_bf8 : AMDGPUWmmaIntrinsic; +def int_amdgcn_wmma_f32_16x16x16_bf8_fp8 : AMDGPUWmmaIntrinsic; +def int_amdgcn_wmma_f32_16x16x16_bf8_bf8 : AMDGPUWmmaIntrinsic; +// A and B are <16 x iu4>. +def int_amdgcn_wmma_i32_16x16x32_iu4 : AMDGPUWmmaIntrinsicIU; + +// SWMMAC (Wave Matrix(sparse) Multiply-Accumulate) intrinsics +// +// These operations perform a sparse matrix multiplication and accumulation of +// the form: D = A * B + C. +// A is sparse matrix, half the size of B, and is expanded using sparsity index. + +class AMDGPUSWmmacIntrinsicIdx : + Intrinsic< + [CD], // %D + [ + A, // %A + B, // %B + LLVMMatchType<0>, // %C + Index // %Sparsity index for A + ], + [IntrNoMem, IntrConvergent, IntrWillReturn] +>; + +class AMDGPUSWmmacIntrinsicIUIdx : + Intrinsic< + [CD], // %D + [ + llvm_i1_ty, // %A_sign + A, // %A + llvm_i1_ty, // %B_sign + B, // %B + LLVMMatchType<0>, // %C + Index, // %Sparsity index for A + llvm_i1_ty, // %clamp + ], + [IntrNoMem, IntrConvergent, IntrWillReturn, ImmArg>, ImmArg>, ImmArg>] +>; + +def int_amdgcn_swmmac_f32_16x16x32_f16 : AMDGPUSWmmacIntrinsicIdx; +def int_amdgcn_swmmac_f32_16x16x32_bf16 : AMDGPUSWmmacIntrinsicIdx; +def int_amdgcn_swmmac_f16_16x16x32_f16 : AMDGPUSWmmacIntrinsicIdx; +def int_amdgcn_swmmac_bf16_16x16x32_bf16 : AMDGPUSWmmacIntrinsicIdx; +def int_amdgcn_swmmac_i32_16x16x32_iu8 : AMDGPUSWmmacIntrinsicIUIdx; +def int_amdgcn_swmmac_i32_16x16x32_iu4 : AMDGPUSWmmacIntrinsicIUIdx; +def int_amdgcn_swmmac_i32_16x16x64_iu4 : AMDGPUSWmmacIntrinsicIUIdx; +def int_amdgcn_swmmac_f32_16x16x32_fp8_fp8 : AMDGPUSWmmacIntrinsicIdx; +def int_amdgcn_swmmac_f32_16x16x32_fp8_bf8 : AMDGPUSWmmacIntrinsicIdx; +def int_amdgcn_swmmac_f32_16x16x32_bf8_fp8 : AMDGPUSWmmacIntrinsicIdx; +def int_amdgcn_swmmac_f32_16x16x32_bf8_bf8 : AMDGPUSWmmacIntrinsicIdx; + def int_amdgcn_global_atomic_ordered_add_b64 : AMDGPUAtomicRtn; def int_amdgcn_flat_atomic_fmin_num : AMDGPUAtomicRtn; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td index a19b03b92923..152f495a452b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td @@ -59,6 +59,30 @@ def gi_wmmaopselvop3pmods : GIComplexOperandMatcher, GIComplexPatternEquiv; +def gi_wmmavisrc : + GIComplexOperandMatcher, + GIComplexPatternEquiv; + +def gi_wmmamods : + GIComplexOperandMatcher, + GIComplexPatternEquiv; + +def gi_wmmamodsf16Neg : + GIComplexOperandMatcher, + GIComplexPatternEquiv; + +def gi_wmmamodsf16NegAbs : + GIComplexOperandMatcher, + GIComplexPatternEquiv; + +def gi_swmmacindex8 : + GIComplexOperandMatcher, + GIComplexPatternEquiv; + +def gi_swmmacindex16 : + GIComplexOperandMatcher, + GIComplexPatternEquiv; + def gi_vop3opselmods : GIComplexOperandMatcher, GIComplexPatternEquiv; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 4c35649cec6c..4f7bf3f7d35e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -3048,6 +3048,336 @@ bool AMDGPUDAGToDAGISel::SelectWMMAOpSelVOP3PMods(SDValue In, return true; } +static MachineSDNode *buildRegSequence32(SmallVectorImpl &Elts, + llvm::SelectionDAG *CurDAG, + const SDLoc &DL) { + unsigned DstRegClass; + EVT DstTy; + switch (Elts.size()) { + case 8: + DstRegClass = AMDGPU::VReg_256RegClassID; + DstTy = MVT::v8i32; + break; + case 4: + DstRegClass = AMDGPU::VReg_128RegClassID; + DstTy = MVT::v4i32; + break; + case 2: + DstRegClass = AMDGPU::VReg_64RegClassID; + DstTy = MVT::v2i32; + break; + default: + llvm_unreachable("unhandled Reg sequence size"); + } + + SmallVector Ops; + Ops.push_back(CurDAG->getTargetConstant(DstRegClass, DL, MVT::i32)); + for (unsigned i = 0; i < Elts.size(); ++i) { + Ops.push_back(Elts[i]); + Ops.push_back(CurDAG->getTargetConstant( + SIRegisterInfo::getSubRegFromChannel(i), DL, MVT::i32)); + } + return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, DstTy, Ops); +} + +static MachineSDNode *buildRegSequence16(SmallVectorImpl &Elts, + llvm::SelectionDAG *CurDAG, + const SDLoc &DL) { + SmallVector PackedElts; + assert("unhandled Reg sequence size" && + (Elts.size() == 8 || Elts.size() == 16)); + + // Pack 16-bit elements in pairs into 32-bit register. If both elements are + // unpacked from 32-bit source use it, otherwise pack them using v_perm. + for (unsigned i = 0; i < Elts.size(); i += 2) { + SDValue LoSrc = stripExtractLoElt(stripBitcast(Elts[i])); + SDValue HiSrc; + if (isExtractHiElt(Elts[i + 1], HiSrc) && LoSrc == HiSrc) { + PackedElts.push_back(HiSrc); + } else { + SDValue PackLoLo = CurDAG->getTargetConstant(0x05040100, DL, MVT::i32); + MachineSDNode *Packed = + CurDAG->getMachineNode(AMDGPU::V_PERM_B32_e64, DL, MVT::i32, + {Elts[i + 1], Elts[i], PackLoLo}); + PackedElts.push_back(SDValue(Packed, 0)); + } + } + + return buildRegSequence32(PackedElts, CurDAG, DL); +} + +static MachineSDNode *buildRegSequence(SmallVectorImpl &Elts, + llvm::SelectionDAG *CurDAG, + const SDLoc &DL, unsigned ElementSize) { + if (ElementSize == 16) + return buildRegSequence16(Elts, CurDAG, DL); + if (ElementSize == 32) + return buildRegSequence32(Elts, CurDAG, DL); + llvm_unreachable("Unhandled element size"); +} + +static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods, + SmallVectorImpl &Elts, SDValue &Src, + llvm::SelectionDAG *CurDAG, const SDLoc &DL, + unsigned ElementSize) { + if (ModOpcode == ISD::FNEG) { + Mods |= SISrcMods::NEG; + // Check if all elements also have abs modifier + SmallVector NegAbsElts; + for (auto El : Elts) { + if (El.getOpcode() != ISD::FABS) + break; + NegAbsElts.push_back(El->getOperand(0)); + } + if (Elts.size() != NegAbsElts.size()) { + // Neg + Src = SDValue(buildRegSequence(Elts, CurDAG, DL, ElementSize), 0); + } else { + // Neg and Abs + Mods |= SISrcMods::NEG_HI; + Src = SDValue(buildRegSequence(NegAbsElts, CurDAG, DL, ElementSize), 0); + } + } else { + assert(ModOpcode == ISD::FABS); + // Abs + Mods |= SISrcMods::NEG_HI; + Src = SDValue(buildRegSequence(Elts, CurDAG, DL, ElementSize), 0); + } +} + +// Check all f16 elements for modifiers while looking through b32 and v2b16 +// build vector, stop if element does not satisfy ModifierCheck. +static void +checkWMMAElementsModifiersF16(BuildVectorSDNode *BV, + std::function ModifierCheck) { + for (unsigned i = 0; i < BV->getNumOperands(); ++i) { + if (auto *F16Pair = + dyn_cast(stripBitcast(BV->getOperand(i)))) { + for (unsigned i = 0; i < F16Pair->getNumOperands(); ++i) { + SDValue ElF16 = stripBitcast(F16Pair->getOperand(i)); + if (!ModifierCheck(ElF16)) + break; + } + } + } +} + +bool AMDGPUDAGToDAGISel::SelectWMMAModsF16Neg(SDValue In, SDValue &Src, + SDValue &SrcMods) const { + Src = In; + unsigned Mods = SISrcMods::OP_SEL_1; + + // mods are on f16 elements + if (auto *BV = dyn_cast(stripBitcast(In))) { + SmallVector EltsF16; + + checkWMMAElementsModifiersF16(BV, [&](SDValue Element) -> bool { + if (Element.getOpcode() != ISD::FNEG) + return false; + EltsF16.push_back(Element.getOperand(0)); + return true; + }); + + // All elements have neg modifier + if (BV->getNumOperands() * 2 == EltsF16.size()) { + Src = SDValue(buildRegSequence16(EltsF16, CurDAG, SDLoc(In)), 0); + Mods |= SISrcMods::NEG; + Mods |= SISrcMods::NEG_HI; + } + } + + // mods are on v2f16 elements + if (auto *BV = dyn_cast(stripBitcast(In))) { + SmallVector EltsV2F16; + for (unsigned i = 0; i < BV->getNumOperands(); ++i) { + SDValue ElV2f16 = stripBitcast(BV->getOperand(i)); + // Based on first element decide which mod we match, neg or abs + if (ElV2f16.getOpcode() != ISD::FNEG) + break; + EltsV2F16.push_back(ElV2f16.getOperand(0)); + } + + // All pairs of elements have neg modifier + if (BV->getNumOperands() == EltsV2F16.size()) { + Src = SDValue(buildRegSequence32(EltsV2F16, CurDAG, SDLoc(In)), 0); + Mods |= SISrcMods::NEG; + Mods |= SISrcMods::NEG_HI; + } + } + + SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); + return true; +} + +bool AMDGPUDAGToDAGISel::SelectWMMAModsF16NegAbs(SDValue In, SDValue &Src, + SDValue &SrcMods) const { + Src = In; + unsigned Mods = SISrcMods::OP_SEL_1; + unsigned ModOpcode; + + // mods are on f16 elements + if (auto *BV = dyn_cast(stripBitcast(In))) { + SmallVector EltsF16; + checkWMMAElementsModifiersF16(BV, [&](SDValue ElF16) -> bool { + // Based on first element decide which mod we match, neg or abs + if (EltsF16.empty()) + ModOpcode = (ElF16.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS; + if (ElF16.getOpcode() != ModOpcode) + return false; + EltsF16.push_back(ElF16.getOperand(0)); + return true; + }); + + // All elements have ModOpcode modifier + if (BV->getNumOperands() * 2 == EltsF16.size()) + selectWMMAModsNegAbs(ModOpcode, Mods, EltsF16, Src, CurDAG, SDLoc(In), + 16); + } + + // mods are on v2f16 elements + if (auto *BV = dyn_cast(stripBitcast(In))) { + SmallVector EltsV2F16; + + for (unsigned i = 0; i < BV->getNumOperands(); ++i) { + SDValue ElV2f16 = stripBitcast(BV->getOperand(i)); + // Based on first element decide which mod we match, neg or abs + if (EltsV2F16.empty()) + ModOpcode = (ElV2f16.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS; + if (ElV2f16->getOpcode() != ModOpcode) + break; + EltsV2F16.push_back(ElV2f16->getOperand(0)); + } + + // All elements have ModOpcode modifier + if (BV->getNumOperands() == EltsV2F16.size()) + selectWMMAModsNegAbs(ModOpcode, Mods, EltsV2F16, Src, CurDAG, SDLoc(In), + 32); + } + + SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); + return true; +} + +bool AMDGPUDAGToDAGISel::SelectWMMAModsF32NegAbs(SDValue In, SDValue &Src, + SDValue &SrcMods) const { + Src = In; + unsigned Mods = SISrcMods::OP_SEL_1; + unsigned ModOpcode; + SmallVector EltsF32; + + if (auto *BV = dyn_cast(stripBitcast(In))) { + for (unsigned i = 0; i < BV->getNumOperands(); ++i) { + SDValue ElF32 = stripBitcast(BV->getOperand(i)); + // Based on first element decide which mod we match, neg or abs + if (EltsF32.empty()) + ModOpcode = (ElF32.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS; + if (ElF32.getOpcode() != ModOpcode) + break; + EltsF32.push_back(ElF32.getOperand(0)); + } + + // All elements had ModOpcode modifier + if (BV->getNumOperands() == EltsF32.size()) + selectWMMAModsNegAbs(ModOpcode, Mods, EltsF32, Src, CurDAG, SDLoc(In), + 32); + } + + SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); + return true; +} + +bool AMDGPUDAGToDAGISel::SelectWMMAVISrc(SDValue In, SDValue &Src) const { + if (auto *BV = dyn_cast(In)) { + BitVector UndefElements; + if (SDValue Splat = BV->getSplatValue(&UndefElements)) + if (isInlineImmediate(Splat.getNode())) { + if (const ConstantSDNode *C = dyn_cast(Splat)) { + unsigned Imm = C->getAPIntValue().getSExtValue(); + Src = CurDAG->getTargetConstant(Imm, SDLoc(In), MVT::i32); + return true; + } + if (const ConstantFPSDNode *C = dyn_cast(Splat)) { + unsigned Imm = C->getValueAPF().bitcastToAPInt().getSExtValue(); + Src = CurDAG->getTargetConstant(Imm, SDLoc(In), MVT::i32); + return true; + } + llvm_unreachable("unhandled Constant node"); + } + } + + // 16 bit splat + SDValue SplatSrc32 = stripBitcast(In); + if (auto *SplatSrc32BV = dyn_cast(SplatSrc32)) { + if (SDValue Splat32 = SplatSrc32BV->getSplatValue()) { + SDValue SplatSrc16 = stripBitcast(Splat32); + if (auto *SplatSrc16BV = dyn_cast(SplatSrc16)) { + if (SDValue Splat = SplatSrc16BV->getSplatValue()) { + + // f16 + if (isInlineImmediate(Splat.getNode())) { + const ConstantFPSDNode *C = dyn_cast(Splat); + int64_t Imm = C->getValueAPF().bitcastToAPInt().getSExtValue(); + Src = CurDAG->getTargetConstant(Imm, SDLoc(In), MVT::i16); + return true; + } + + // bf16 + if (const ConstantSDNode *C = dyn_cast(Splat)) { + const SIInstrInfo *TII = Subtarget->getInstrInfo(); + APInt BF16Value = C->getAPIntValue(); + APInt F32Value = BF16Value.zext(32).shl(16); + if (TII->isInlineConstant(F32Value)) { + int64_t Imm = F32Value.getSExtValue(); + Src = CurDAG->getTargetConstant(Imm, SDLoc(In), MVT::i32); + return true; + } + } + } + } + } + } + + return false; +} + +bool AMDGPUDAGToDAGISel::SelectSWMMACIndex8(SDValue In, SDValue &Src, + SDValue &IndexKey) const { + unsigned Key = 0; + Src = In; + + if (In.getOpcode() == ISD::SRL) { + const llvm::SDValue &ShiftSrc = In.getOperand(0); + ConstantSDNode *ShiftAmt = dyn_cast(In.getOperand(1)); + if (ShiftSrc.getValueType().getSizeInBits() == 32 && ShiftAmt && + ShiftAmt->getZExtValue() % 8 == 0) { + Key = ShiftAmt->getZExtValue() / 8; + Src = ShiftSrc; + } + } + + IndexKey = CurDAG->getTargetConstant(Key, SDLoc(In), MVT::i32); + return true; +} + +bool AMDGPUDAGToDAGISel::SelectSWMMACIndex16(SDValue In, SDValue &Src, + SDValue &IndexKey) const { + unsigned Key = 0; + Src = In; + + if (In.getOpcode() == ISD::SRL) { + const llvm::SDValue &ShiftSrc = In.getOperand(0); + ConstantSDNode *ShiftAmt = dyn_cast(In.getOperand(1)); + if (ShiftSrc.getValueType().getSizeInBits() == 32 && ShiftAmt && + ShiftAmt->getZExtValue() == 16) { + Key = 1; + Src = ShiftSrc; + } + } + + IndexKey = CurDAG->getTargetConstant(Key, SDLoc(In), MVT::i32); + return true; +} + bool AMDGPUDAGToDAGISel::SelectVOP3OpSel(SDValue In, SDValue &Src, SDValue &SrcMods) const { Src = In; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h index 8645490f0b16..3b42d88df0c2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h @@ -240,6 +240,16 @@ private: bool SelectVOP3PModsNeg(SDValue In, SDValue &Src) const; bool SelectWMMAOpSelVOP3PMods(SDValue In, SDValue &Src) const; + bool SelectWMMAModsF32NegAbs(SDValue In, SDValue &Src, + SDValue &SrcMods) const; + bool SelectWMMAModsF16Neg(SDValue In, SDValue &Src, SDValue &SrcMods) const; + bool SelectWMMAModsF16NegAbs(SDValue In, SDValue &Src, + SDValue &SrcMods) const; + bool SelectWMMAVISrc(SDValue In, SDValue &Src) const; + + bool SelectSWMMACIndex8(SDValue In, SDValue &Src, SDValue &IndexKey) const; + bool SelectSWMMACIndex16(SDValue In, SDValue &Src, SDValue &IndexKey) const; + bool SelectVOP3OpSel(SDValue In, SDValue &Src, SDValue &SrcMods) const; bool SelectVOP3OpSelMods(SDValue In, SDValue &Src, SDValue &SrcMods) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index fdee74d58d26..f255d098b631 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -3956,6 +3956,219 @@ AMDGPUInstructionSelector::selectWMMAOpSelVOP3PMods( }}; } +static Register buildRegSequence(SmallVectorImpl &Elts, + MachineInstr *InsertPt, + MachineRegisterInfo &MRI) { + const TargetRegisterClass *DstRegClass; + switch (Elts.size()) { + case 8: + DstRegClass = &AMDGPU::VReg_256RegClass; + break; + case 4: + DstRegClass = &AMDGPU::VReg_128RegClass; + break; + case 2: + DstRegClass = &AMDGPU::VReg_64RegClass; + break; + default: + llvm_unreachable("unhandled Reg sequence size"); + } + + MachineIRBuilder B(*InsertPt); + auto MIB = B.buildInstr(AMDGPU::REG_SEQUENCE) + .addDef(MRI.createVirtualRegister(DstRegClass)); + for (unsigned i = 0; i < Elts.size(); ++i) { + MIB.addReg(Elts[i]); + MIB.addImm(SIRegisterInfo::getSubRegFromChannel(i)); + } + return MIB->getOperand(0).getReg(); +} + +static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods, + SmallVectorImpl &Elts, Register &Src, + MachineInstr *InsertPt, + MachineRegisterInfo &MRI) { + if (ModOpcode == TargetOpcode::G_FNEG) { + Mods |= SISrcMods::NEG; + // Check if all elements also have abs modifier + SmallVector NegAbsElts; + for (auto El : Elts) { + Register FabsSrc; + if (!mi_match(El, MRI, m_GFabs(m_Reg(FabsSrc)))) + break; + NegAbsElts.push_back(FabsSrc); + } + if (Elts.size() != NegAbsElts.size()) { + // Neg + Src = buildRegSequence(Elts, InsertPt, MRI); + } else { + // Neg and Abs + Mods |= SISrcMods::NEG_HI; + Src = buildRegSequence(NegAbsElts, InsertPt, MRI); + } + } else { + assert(ModOpcode == TargetOpcode::G_FABS); + // Abs + Mods |= SISrcMods::NEG_HI; + Src = buildRegSequence(Elts, InsertPt, MRI); + } +} + +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectWMMAModsF32NegAbs(MachineOperand &Root) const { + Register Src = Root.getReg(); + unsigned Mods = SISrcMods::OP_SEL_1; + unsigned ModOpcode; + SmallVector EltsF32; + + if (GBuildVector *BV = dyn_cast(MRI->getVRegDef(Src))) { + for (unsigned i = 0; i < BV->getNumSources(); ++i) { + MachineInstr *ElF32 = MRI->getVRegDef(BV->getSourceReg(i)); + // Based on first element decide which mod we match, neg or abs + if (EltsF32.empty()) + ModOpcode = (ElF32->getOpcode() == AMDGPU::G_FNEG) ? AMDGPU::G_FNEG + : AMDGPU::G_FABS; + if (ElF32->getOpcode() != ModOpcode) + break; + EltsF32.push_back(ElF32->getOperand(1).getReg()); + } + + // All elements had ModOpcode modifier + if (BV->getNumSources() == EltsF32.size()) { + selectWMMAModsNegAbs(ModOpcode, Mods, EltsF32, Src, Root.getParent(), + *MRI); + } + } + + return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}}; +} + +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectWMMAModsF16Neg(MachineOperand &Root) const { + Register Src = Root.getReg(); + unsigned Mods = SISrcMods::OP_SEL_1; + SmallVector EltsV2F16; + + if (GConcatVectors *CV = dyn_cast(MRI->getVRegDef(Src))) { + for (unsigned i = 0; i < CV->getNumSources(); ++i) { + Register FNegSrc; + if (!mi_match(CV->getSourceReg(i), *MRI, m_GFNeg(m_Reg(FNegSrc)))) + break; + EltsV2F16.push_back(FNegSrc); + } + + // All elements had ModOpcode modifier + if (CV->getNumSources() == EltsV2F16.size()) { + Mods |= SISrcMods::NEG; + Mods |= SISrcMods::NEG_HI; + Src = buildRegSequence(EltsV2F16, Root.getParent(), *MRI); + } + } + + return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}}; +} + +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectWMMAModsF16NegAbs(MachineOperand &Root) const { + Register Src = Root.getReg(); + unsigned Mods = SISrcMods::OP_SEL_1; + unsigned ModOpcode; + SmallVector EltsV2F16; + + if (GConcatVectors *CV = dyn_cast(MRI->getVRegDef(Src))) { + for (unsigned i = 0; i < CV->getNumSources(); ++i) { + MachineInstr *ElV2F16 = MRI->getVRegDef(CV->getSourceReg(i)); + // Based on first element decide which mod we match, neg or abs + if (EltsV2F16.empty()) + ModOpcode = (ElV2F16->getOpcode() == AMDGPU::G_FNEG) ? AMDGPU::G_FNEG + : AMDGPU::G_FABS; + if (ElV2F16->getOpcode() != ModOpcode) + break; + EltsV2F16.push_back(ElV2F16->getOperand(1).getReg()); + } + + // All elements had ModOpcode modifier + if (CV->getNumSources() == EltsV2F16.size()) { + MachineIRBuilder B(*Root.getParent()); + selectWMMAModsNegAbs(ModOpcode, Mods, EltsV2F16, Src, Root.getParent(), + *MRI); + } + } + + return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}}; +} + +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectWMMAVISrc(MachineOperand &Root) const { + std::optional FPValReg; + if (mi_match(Root.getReg(), *MRI, m_GFCstOrSplat(FPValReg))) { + if (TII.isInlineConstant(FPValReg->Value.bitcastToAPInt())) { + return {{[=](MachineInstrBuilder &MIB) { + MIB.addImm(FPValReg->Value.bitcastToAPInt().getSExtValue()); + }}}; + } + // Non-inlineable splat floats should not fall-through for integer immediate + // checks. + return {}; + } + + APInt ICst; + if (mi_match(Root.getReg(), *MRI, m_ICstOrSplat(ICst))) { + if (TII.isInlineConstant(ICst)) { + return { + {[=](MachineInstrBuilder &MIB) { MIB.addImm(ICst.getSExtValue()); }}}; + } + } + + return {}; +} + +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectSWMMACIndex8(MachineOperand &Root) const { + Register Src = + getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg(); + unsigned Key = 0; + + Register ShiftSrc; + std::optional ShiftAmt; + if (mi_match(Src, *MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt))) && + MRI->getType(ShiftSrc).getSizeInBits() == 32 && + ShiftAmt->Value.getZExtValue() % 8 == 0) { + Key = ShiftAmt->Value.getZExtValue() / 8; + Src = ShiftSrc; + } + + return {{ + [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key + }}; +} + +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectSWMMACIndex16(MachineOperand &Root) const { + + Register Src = + getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg(); + unsigned Key = 0; + + Register ShiftSrc; + std::optional ShiftAmt; + if (mi_match(Src, *MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt))) && + MRI->getType(ShiftSrc).getSizeInBits() == 32 && + ShiftAmt->Value.getZExtValue() == 16) { + Src = ShiftSrc; + Key = 1; + } + + return {{ + [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key + }}; +} + InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const { Register Src; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index 12ea46c2895b..ef7630f137ac 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -199,6 +199,19 @@ private: InstructionSelector::ComplexRendererFns selectWMMAOpSelVOP3PMods(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns + selectWMMAModsF32NegAbs(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns + selectWMMAModsF16Neg(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns + selectWMMAModsF16NegAbs(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns + selectWMMAVISrc(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns + selectSWMMACIndex8(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns + selectSWMMACIndex16(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns selectVOP3OpSelMods(MachineOperand &Root) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 8e74d4c0e945..32921bb248ca 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -7134,6 +7134,29 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID); case Intrinsic::amdgcn_image_bvh_intersect_ray: return legalizeBVHIntrinsic(MI, B); + case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16: + case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16: + case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16: + case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16: + case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8: + case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8: + case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8: + case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: { + Register Index = MI.getOperand(5).getReg(); + LLT S32 = LLT::scalar(32); + if (MRI.getType(Index) != S32) + MI.getOperand(5).setReg(B.buildAnyExt(S32, Index).getReg(0)); + return true; + } + case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4: + case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8: + case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: { + Register Index = MI.getOperand(7).getReg(); + LLT S32 = LLT::scalar(32); + if (MRI.getType(Index) != S32) + MI.getOperand(7).setReg(B.buildAnyExt(S32, Index).getReg(0)); + return true; + } case Intrinsic::amdgcn_fmed3: { GISelChangeObserver &Observer = Helper.Observer; diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index bdd4e891f158..09fac963d222 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -4505,6 +4505,22 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_wmma_f32_16x16x16_f16: case Intrinsic::amdgcn_wmma_i32_16x16x16_iu4: case Intrinsic::amdgcn_wmma_i32_16x16x16_iu8: + case Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_fp8: + case Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_bf8: + case Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_fp8: + case Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_bf8: + case Intrinsic::amdgcn_wmma_i32_16x16x32_iu4: + case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16: + case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16: + case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16: + case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16: + case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8: + case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4: + case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: + case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8: + case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8: + case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8: + case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: return getDefaultMappingVOP(MI); case Intrinsic::amdgcn_log: case Intrinsic::amdgcn_exp2: diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td index 67263f23b983..bb1c6b733729 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td @@ -414,6 +414,22 @@ def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; def : SourceOfDivergence; // The dummy boolean output is divergent from the IR's perspective, diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 489cf85693ed..35d641086737 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -151,6 +151,8 @@ public: ImmTyOpSelHi, ImmTyNegLo, ImmTyNegHi, + ImmTyIndexKey8bit, + ImmTyIndexKey16bit, ImmTyDPP8, ImmTyDppCtrl, ImmTyDppRowMask, @@ -383,6 +385,8 @@ public: bool isGDS() const { return isImmTy(ImmTyGDS); } bool isLDS() const { return isImmTy(ImmTyLDS); } bool isCPol() const { return isImmTy(ImmTyCPol); } + bool isIndexKey8bit() const { return isImmTy(ImmTyIndexKey8bit); } + bool isIndexKey16bit() const { return isImmTy(ImmTyIndexKey16bit); } bool isTFE() const { return isImmTy(ImmTyTFE); } bool isFORMAT() const { return isImmTy(ImmTyFORMAT) && isUInt<7>(getImm()); } bool isDppBankMask() const { return isImmTy(ImmTyDppBankMask); } @@ -656,6 +660,14 @@ public: return isVISrcF16() || isVISrcB32(); } + bool isVISrc_64F16() const { + return isRegOrInlineNoMods(AMDGPU::VReg_64RegClassID, MVT::f16); + } + + bool isVISrc_64B32() const { + return isRegOrInlineNoMods(AMDGPU::VReg_64RegClassID, MVT::i32); + } + bool isVISrc_64B64() const { return isRegOrInlineNoMods(AMDGPU::VReg_64RegClassID, MVT::i64); } @@ -672,6 +684,14 @@ public: return isRegOrInlineNoMods(AMDGPU::VReg_64RegClassID, MVT::i32); } + bool isVISrc_256B32() const { + return isRegOrInlineNoMods(AMDGPU::VReg_256RegClassID, MVT::i32); + } + + bool isVISrc_256F32() const { + return isRegOrInlineNoMods(AMDGPU::VReg_256RegClassID, MVT::f32); + } + bool isVISrc_256B64() const { return isRegOrInlineNoMods(AMDGPU::VReg_256RegClassID, MVT::i64); } @@ -1047,6 +1067,8 @@ public: case ImmTyOffset1: OS << "Offset1"; break; case ImmTySMEMOffsetMod: OS << "SMEMOffsetMod"; break; case ImmTyCPol: OS << "CPol"; break; + case ImmTyIndexKey8bit: OS << "index_key"; break; + case ImmTyIndexKey16bit: OS << "index_key"; break; case ImmTyTFE: OS << "TFE"; break; case ImmTyD16: OS << "D16"; break; case ImmTyFORMAT: OS << "FORMAT"; break; @@ -1604,6 +1626,11 @@ public: ParseStatus parseRegWithFPInputMods(OperandVector &Operands); ParseStatus parseRegWithIntInputMods(OperandVector &Operands); ParseStatus parseVReg32OrOff(OperandVector &Operands); + ParseStatus tryParseIndexKey(OperandVector &Operands, + AMDGPUOperand::ImmTy ImmTy); + ParseStatus parseIndexKey8bit(OperandVector &Operands); + ParseStatus parseIndexKey16bit(OperandVector &Operands); + ParseStatus parseDfmtNfmt(int64_t &Format); ParseStatus parseUfmt(int64_t &Format); ParseStatus parseSymbolicSplitFormat(StringRef FormatStr, SMLoc Loc, @@ -1784,6 +1811,8 @@ public: void cvtVOP3OpSel(MCInst &Inst, const OperandVector &Operands); void cvtVOP3(MCInst &Inst, const OperandVector &Operands); void cvtVOP3P(MCInst &Inst, const OperandVector &Operands); + void cvtSWMMAC(MCInst &Inst, const OperandVector &Operands); + void cvtVOPD(MCInst &Inst, const OperandVector &Operands); void cvtVOP3OpSel(MCInst &Inst, const OperandVector &Operands, OptionalImmIndexMap &OptionalIdx); @@ -4364,7 +4393,11 @@ bool AMDGPUAsmParser::validateNeg(const MCInst &Inst, int OpName) { uint64_t TSFlags = MII.get(Opc).TSFlags; // v_dot4 fp8/bf8 neg_lo/neg_hi not allowed on src0 and src1 (allowed on src2) - if (!(TSFlags & SIInstrFlags::IsDOT)) + // v_wmma iu4/iu8 neg_lo not allowed on src2 (allowed on src0, src1) + // v_swmmac f16/bf16 neg_lo/neg_hi not allowed on src2 (allowed on src0, src1) + // other wmma/swmmac instructions don't have neg_lo/neg_hi operand. + if (!(TSFlags & SIInstrFlags::IsDOT) && !(TSFlags & SIInstrFlags::IsWMMA) && + !(TSFlags & SIInstrFlags::IsSWMMAC)) return true; int NegIdx = AMDGPU::getNamedOperandIdx(Opc, OpName); @@ -6465,6 +6498,33 @@ bool AMDGPUAsmParser::tryParseFmt(const char *Pref, return true; } +ParseStatus AMDGPUAsmParser::tryParseIndexKey(OperandVector &Operands, + AMDGPUOperand::ImmTy ImmTy) { + const char *Pref = "index_key"; + int64_t ImmVal = 0; + SMLoc Loc = getLoc(); + auto Res = parseIntWithPrefix(Pref, ImmVal); + if (!Res.isSuccess()) + return Res; + + if (ImmTy == AMDGPUOperand::ImmTyIndexKey16bit && (ImmVal < 0 || ImmVal > 1)) + return Error(Loc, Twine("out of range ", StringRef(Pref))); + + if (ImmTy == AMDGPUOperand::ImmTyIndexKey8bit && (ImmVal < 0 || ImmVal > 3)) + return Error(Loc, Twine("out of range ", StringRef(Pref))); + + Operands.push_back(AMDGPUOperand::CreateImm(this, ImmVal, Loc, ImmTy)); + return ParseStatus::Success; +} + +ParseStatus AMDGPUAsmParser::parseIndexKey8bit(OperandVector &Operands) { + return tryParseIndexKey(Operands, AMDGPUOperand::ImmTyIndexKey8bit); +} + +ParseStatus AMDGPUAsmParser::parseIndexKey16bit(OperandVector &Operands) { + return tryParseIndexKey(Operands, AMDGPUOperand::ImmTyIndexKey16bit); +} + // dfmt and nfmt (in a tbuffer instruction) are parsed as one to allow their // values to live in a joint format operand in the MCInst encoding. ParseStatus AMDGPUAsmParser::parseDfmtNfmt(int64_t &Format) { @@ -8329,10 +8389,12 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands, } int NegLoIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::neg_lo); - if (NegLoIdx != -1) { + if (NegLoIdx != -1) addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyNegLo); + + int NegHiIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::neg_hi); + if (NegHiIdx != -1) addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyNegHi); - } const int Ops[] = { AMDGPU::OpName::src0, AMDGPU::OpName::src1, @@ -8352,11 +8414,11 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands, if (OpSelHiIdx != -1) OpSelHi = Inst.getOperand(OpSelHiIdx).getImm(); - if (NegLoIdx != -1) { - int NegHiIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::neg_hi); + if (NegLoIdx != -1) NegLo = Inst.getOperand(NegLoIdx).getImm(); + + if (NegHiIdx != -1) NegHi = Inst.getOperand(NegHiIdx).getImm(); - } for (int J = 0; J < 3; ++J) { int OpIdx = AMDGPU::getNamedOperandIdx(Opc, Ops[J]); @@ -8392,6 +8454,43 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands) { cvtVOP3P(Inst, Operands, OptIdx); } +static void addSrcModifiersAndSrc(MCInst &Inst, const OperandVector &Operands, + unsigned i, unsigned Opc, unsigned OpName) { + if (AMDGPU::getNamedOperandIdx(Opc, OpName) != -1) + ((AMDGPUOperand &)*Operands[i]).addRegOrImmWithFPInputModsOperands(Inst, 2); + else + ((AMDGPUOperand &)*Operands[i]).addRegOperands(Inst, 1); +} + +void AMDGPUAsmParser::cvtSWMMAC(MCInst &Inst, const OperandVector &Operands) { + unsigned Opc = Inst.getOpcode(); + + ((AMDGPUOperand &)*Operands[1]).addRegOperands(Inst, 1); + addSrcModifiersAndSrc(Inst, Operands, 2, Opc, AMDGPU::OpName::src0_modifiers); + addSrcModifiersAndSrc(Inst, Operands, 3, Opc, AMDGPU::OpName::src1_modifiers); + ((AMDGPUOperand &)*Operands[1]).addRegOperands(Inst, 1); // srcTiedDef + ((AMDGPUOperand &)*Operands[4]).addRegOperands(Inst, 1); // src2 + + OptionalImmIndexMap OptIdx; + for (unsigned i = 5; i < Operands.size(); ++i) { + AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]); + OptIdx[Op.getImmTy()] = i; + } + + if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::index_key_8bit)) + addOptionalImmOperand(Inst, Operands, OptIdx, + AMDGPUOperand::ImmTyIndexKey8bit); + + if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::index_key_16bit)) + addOptionalImmOperand(Inst, Operands, OptIdx, + AMDGPUOperand::ImmTyIndexKey16bit); + + if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::clamp)) + addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyClampSI); + + cvtVOP3P(Inst, Operands, OptIdx); +} + //===----------------------------------------------------------------------===// // VOPD //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index 86096b0d80b4..e2f9bc9278b0 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -260,8 +260,12 @@ DECODE_OPERAND_SRC_REG_OR_IMM_9(VS_32, OPW32, 32) DECODE_OPERAND_SRC_REG_OR_IMM_9(VS_64, OPW64, 64) DECODE_OPERAND_SRC_REG_OR_IMM_9(VS_64, OPW64, 32) DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_64, OPW64, 64) +DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_64, OPW64, 32) +DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_64, OPW64, 16) DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_128, OPW128, 32) +DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_128, OPW128, 16) DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_256, OPW256, 64) +DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_256, OPW256, 32) DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_512, OPW512, 32) DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_1024, OPW1024, 32) @@ -704,6 +708,10 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, break; Res = tryDecodeInst(DecoderTableWMMAGFX1164, MI, QW, Address, CS); + if (Res) + break; + + Res = tryDecodeInst(DecoderTableWMMAGFX1264, MI, QW, Address, CS); } while (false); if (Res && AMDGPU::isMAC(MI.getOpcode())) { diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index b6e4e65ff5b0..08bef7ad3002 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -1716,14 +1716,14 @@ bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) { } bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) { - if (!SIInstrInfo::isWMMA(*MI)) + if (!SIInstrInfo::isWMMA(*MI) && !SIInstrInfo::isSWMMAC(*MI)) return false; const SIInstrInfo *TII = ST.getInstrInfo(); const SIRegisterInfo *TRI = ST.getRegisterInfo(); - auto IsHazardFn = [MI, TII, TRI](const MachineInstr &I) { - if (!SIInstrInfo::isWMMA(I)) + auto IsHazardFn = [MI, TII, TRI, this](const MachineInstr &I) { + if (!SIInstrInfo::isWMMA(I) && !SIInstrInfo::isSWMMAC(I)) return false; // Src0 or Src1 of the current wmma instruction overlaps with the dest of @@ -1753,6 +1753,7 @@ bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) { const MachineOperand *Src2Mods = TII->getNamedOperand(*MI, AMDGPU::OpName::src2_modifiers); const bool NoSrc2Mods = + !Src2Mods || (Src2Mods->getImm() & (SISrcMods::NEG | SISrcMods::NEG_HI)) == 0; // Exception: there is no hazard if the wmma instructions are of the same // type and there is no input modifier on src2 of the current instruction. @@ -1760,6 +1761,18 @@ bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) { TII->pseudoToMCOpcode(MI->getOpcode()))); } + // GFX12+ allows overlap of matrix C with PrevDstReg (hardware will stall) + // but Index can't overlap with PrevDstReg. + if (AMDGPU::isGFX12Plus(ST)) { + if (SIInstrInfo::isSWMMAC(*MI)) { + const Register CurIndex = + TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg(); + if (TRI->regsOverlap(PrevDstReg, CurIndex)) + return true; + } + return false; + } + return false; }; diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp index e73e53aa270f..007785dc4913 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp @@ -1275,6 +1275,23 @@ void AMDGPUInstPrinter::printPackedModifier(const MCInst *MI, (ModIdx != -1) ? MI->getOperand(ModIdx).getImm() : DefaultValue; } + // Print three values of neg/opsel for wmma instructions (prints 0 when there + // is no src_modifier operand instead of not printing anything). + if (MII.get(MI->getOpcode()).TSFlags & SIInstrFlags::IsSWMMAC || + MII.get(MI->getOpcode()).TSFlags & SIInstrFlags::IsWMMA) { + NumOps = 0; + int DefaultValue = Mod == SISrcMods::OP_SEL_1; + for (int OpName : + {AMDGPU::OpName::src0_modifiers, AMDGPU::OpName::src1_modifiers, + AMDGPU::OpName::src2_modifiers}) { + int Idx = AMDGPU::getNamedOperandIdx(Opc, OpName); + if (Idx != -1) + Ops[NumOps++] = MI->getOperand(Idx).getImm(); + else + Ops[NumOps++] = DefaultValue; + } + } + const bool HasDstSel = NumOps > 0 && Mod == SISrcMods::OP_SEL_0 && @@ -1336,6 +1353,26 @@ void AMDGPUInstPrinter::printNegHi(const MCInst *MI, unsigned OpNo, printPackedModifier(MI, " neg_hi:[", SISrcMods::NEG_HI, O); } +void AMDGPUInstPrinter::printIndexKey8bit(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + auto Imm = MI->getOperand(OpNo).getImm() & 0x7; + if (Imm == 0) + return; + + O << " index_key:" << Imm; +} + +void AMDGPUInstPrinter::printIndexKey16bit(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + auto Imm = MI->getOperand(OpNo).getImm() & 0x7; + if (Imm == 0) + return; + + O << " index_key:" << Imm; +} + void AMDGPUInstPrinter::printInterpSlot(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O) { diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h index e3958f88277d..e91ff86b219a 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h @@ -139,6 +139,10 @@ private: const MCSubtargetInfo &STI, raw_ostream &O); void printNegHi(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); + void printIndexKey8bit(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); + void printIndexKey16bit(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); void printInterpSlot(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); void printInterpAttr(const MCInst *MI, unsigned OpNo, diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h index 8ab66d4fd5b8..19596d53b453 100644 --- a/llvm/lib/Target/AMDGPU/SIDefines.h +++ b/llvm/lib/Target/AMDGPU/SIDefines.h @@ -167,6 +167,9 @@ enum : uint64_t { // ds_gws_* instructions. GWS = UINT64_C(1) << 62, + + // Is a SWMMAC instruction. + IsSWMMAC = UINT64_C(1) << 63, }; // v_cmp_class_* etc. use a 10-bit mask for what operation is checked. diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index 2862a7787e75..a812cdc61500 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -208,6 +208,7 @@ bool SIFoldOperands::canUseImmWithOpSel(FoldCandidate &Fold) const { assert(Old.isReg() && Fold.isImm()); if (!(TSFlags & SIInstrFlags::IsPacked) || (TSFlags & SIInstrFlags::IsMAI) || + (TSFlags & SIInstrFlags::IsWMMA) || (TSFlags & SIInstrFlags::IsSWMMAC) || (ST->hasDOTOpSelHazard() && (TSFlags & SIInstrFlags::IsDOT))) return false; diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index cf947dccafac..d35b76c8ad54 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -8242,6 +8242,36 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SIInstrInfo::MO_ABS32_LO); return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0}; } + case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16: + case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16: + case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16: + case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16: + case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8: + case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8: + case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8: + case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: { + if (Op.getOperand(4).getValueType() == MVT::i32) + return SDValue(); + + SDLoc SL(Op); + auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i32); + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(), + Op.getOperand(0), Op.getOperand(1), Op.getOperand(2), + Op.getOperand(3), IndexKeyi32); + } + case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4: + case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8: + case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: { + if (Op.getOperand(6).getValueType() == MVT::i32) + return SDValue(); + + SDLoc SL(Op); + auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, MVT::i32); + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(), + {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2), + Op.getOperand(3), Op.getOperand(4), Op.getOperand(5), + IndexKeyi32, Op.getOperand(7)}); + } default: if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = AMDGPU::getImageDimIntrinsicInfo(IntrinsicID)) diff --git a/llvm/lib/Target/AMDGPU/SIInstrFormats.td b/llvm/lib/Target/AMDGPU/SIInstrFormats.td index 1b66d163714f..ab536f8f49d5 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrFormats.td +++ b/llvm/lib/Target/AMDGPU/SIInstrFormats.td @@ -161,6 +161,9 @@ class InstSI ; def neg_lo0 : ArrayOperand0<"neg_lo", "NegLo">; def neg_hi0 : ArrayOperand0<"neg_hi", "NegHi">; +def IndexKey16bit : CustomOperand; +def IndexKey8bit : CustomOperand; + def dpp8 : CustomOperand; def dpp_ctrl : CustomOperand; @@ -1344,6 +1347,13 @@ def VOP3PModsDOT : ComplexPattern; def VOP3PModsNeg : ComplexPattern; def WMMAOpSelVOP3PMods : ComplexPattern; +def WMMAModsF32NegAbs : ComplexPattern; +def WMMAModsF16Neg : ComplexPattern; +def WMMAModsF16NegAbs : ComplexPattern; +def WMMAVISrc : ComplexPattern; +def SWMMACIndex8 : ComplexPattern; +def SWMMACIndex16 : ComplexPattern; + def VOP3OpSel : ComplexPattern; def VOP3OpSelMods : ComplexPattern; @@ -2278,6 +2288,7 @@ class VOPProfile _ArgVT, bit _EnableClamp = 0> { field bit IsDOT = 0; field bit IsSingle = 0; field bit IsWMMA = 0; + field bit IsSWMMAC = 0; field bit HasDst = !ne(DstVT.Value, untyped.Value); field bit HasDst32 = HasDst; diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index f42af89cf5e6..b3265b73fa7e 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -1341,9 +1341,14 @@ def VCSrc_v2f16 : RegOrV2F16 <"VS_32", "OPERAND_REG_INLINE_C">; // VISrc_* Operands with a VGPR or an inline constant //===----------------------------------------------------------------------===// +def VISrc_64_f16 : RegOrF16 <"VReg_64", "OPERAND_REG_INLINE_C">; +def VISrc_64_b32 : RegOrB32 <"VReg_64", "OPERAND_REG_INLINE_C">; def VISrc_64_f64 : RegOrF64 <"VReg_64", "OPERAND_REG_INLINE_C">; +def VISrc_128_f16 : RegOrF16 <"VReg_128", "OPERAND_REG_INLINE_C">; def VISrc_128_b32 : RegOrB32 <"VReg_128", "OPERAND_REG_INLINE_C">; def VISrc_128_f32 : RegOrF32 <"VReg_128", "OPERAND_REG_INLINE_C">; +def VISrc_256_b32 : RegOrB32 <"VReg_256", "OPERAND_REG_INLINE_C">; +def VISrc_256_f32 : RegOrF32 <"VReg_256", "OPERAND_REG_INLINE_C">; def VISrc_256_f64 : RegOrF64 <"VReg_256", "OPERAND_REG_INLINE_C">; def VISrc_512_b32 : RegOrB32 <"VReg_512", "OPERAND_REG_INLINE_C">; def VISrc_512_f32 : RegOrF32 <"VReg_512", "OPERAND_REG_INLINE_C">; diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index 0c7a08cd4bc9..107b95a9ca8e 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -936,16 +936,19 @@ multiclass WMMAInst(NAME # _threeaddr # Suffix)>; } - if !eq(Type, WMMAOpSel) then { - def : WMMAOpSelPat(NAME # _twoaddr # Suffix), node, P>; - } else if !eq(Type, WMMAUIClamp) then { - def : WMMAUIClampPat(NAME # _twoaddr # Suffix), node, P>; - } else { - def : WMMARegularPat(NAME # _twoaddr # Suffix), node, P>; + let SubtargetPredicate = isGFX11Only in { + if !eq(Type, WMMAOpSel) then { + def : WMMAOpSelPat(NAME # _twoaddr # Suffix), node, P>; + } else if !eq(Type, WMMAUIClamp) then { + def : WMMAUIClampPat(NAME # _twoaddr # Suffix), node, P>; + } else { + def : WMMARegularPat(NAME # _twoaddr # Suffix), node, P>; + } } } + let WaveSizePredicate = isWave32 in { defm V_WMMA_F32_16X16X16_F16 : WMMAInst<"_w32", "v_wmma_f32_16x16x16_f16", VOP_V8F32_V16F16_V16F16_V8F32, int_amdgcn_wmma_f32_16x16x16_f16, VRegSrc_256, WMMARegular, 1>; defm V_WMMA_F32_16X16X16_BF16 : WMMAInst<"_w32", "v_wmma_f32_16x16x16_bf16", VOP_V8F32_V16I16_V16I16_V8F32, int_amdgcn_wmma_f32_16x16x16_bf16, VRegSrc_256, WMMARegular, 1>; @@ -969,6 +972,398 @@ let WaveSizePredicate = isWave64 in { } +class VOP3PWMMA_Profile ArgTy, bit _IsSWMMAC, int _IndexType, + bit _IsIU, bit _IsFP8BF8> + : VOP3P_Profile> { + bit IsIU = _IsIU; + bit IsFP8BF8 = _IsFP8BF8; + bit IsF16BF16 = !not(!or(IsIU, IsFP8BF8)); + + int IndexType = _IndexType; + + let IsPacked = 1; + let IsWMMA = !not(_IsSWMMAC); + let IsSWMMAC = _IsSWMMAC; + + bit IsAB_F16 = !and(IsF16BF16, ArgTy[1].isFP); + bit IsAB_BF16 = !and(IsF16BF16, isIntType.ret); + bit IsC_F32 = !or(!eq(ArgTy[3], v8f32), !eq(ArgTy[3], v4f32)); + bit IsC_BF16 = !or(!eq(ArgTy[3], v8i16), !eq(ArgTy[3], v4i16)); + bit IsC_F16 = !or(!eq(ArgTy[3], v8f16), !eq(ArgTy[3], v4f16)); + + bit NegLo01 = !or(IsF16BF16, IsIU); + bit NegLo2 = !and(!or(IsF16BF16, IsFP8BF8), IsWMMA); + bit NegHi01 = IsF16BF16; + bit NegHi2 = !and(!or(IsF16BF16, IsFP8BF8), IsWMMA); + bit NegLoAny = !or(NegLo01, NegLo2); + bit NegHiAny = !or(NegHi01, NegHi2); + + let DstRC = !cond(!eq(ArgTy[0], v8f32): VDst_256, + !eq(ArgTy[0], v8i32): VDst_256, + !eq(ArgTy[0], v8f16): VDst_128, + !eq(ArgTy[0], v8i16): VDst_128, + !eq(ArgTy[0], v4f32): VDst_128, + !eq(ArgTy[0], v4i32): VDst_128, + !eq(ArgTy[0], v4f16): VDst_64, + !eq(ArgTy[0], v4i16): VDst_64); + let Src0RC64 = !cond(!eq(ArgTy[1], v8f16): VRegSrc_128, + !eq(ArgTy[1], v4f16): VRegSrc_64, + !eq(ArgTy[1], v4i16): VRegSrc_64, + !eq(ArgTy[1], v8i16): VRegSrc_128, + !eq(ArgTy[1], v4i32): VRegSrc_128, + !eq(ArgTy[1], v2i32): VRegSrc_64, + !eq(ArgTy[1], i32) : VRegSrc_32); + let Src1RC64 = !cond(!eq(ArgTy[2], v16f16): VRegSrc_256, + !eq(ArgTy[2], v16i16): VRegSrc_256, + !eq(ArgTy[2], v8f16): VRegSrc_128, + !eq(ArgTy[2], v8i16): VRegSrc_128, + !eq(ArgTy[2], v4i32): VRegSrc_128, + !eq(ArgTy[1], v4i16): VRegSrc_64, + !eq(ArgTy[1], v4f16): VRegSrc_64, + !eq(ArgTy[2], v2i32): VRegSrc_64, + !eq(ArgTy[2], i32) : VRegSrc_32); + let Src2RC64 = !if(IsSWMMAC, DstRC, + !cond(!eq(ArgTy[3], v8f32): VISrc_256_f32, + !eq(ArgTy[3], v8i32): VISrc_256_b32, + !eq(ArgTy[3], v8f16): VISrc_128_f16, + !eq(ArgTy[3], v8i16): VISrc_128_f32, // bf16 + !eq(ArgTy[3], v4f16): VISrc_64_f16, + !eq(ArgTy[3], v4i16): VISrc_64_b32, + !eq(ArgTy[3], v4i32): VISrc_128_b32, + !eq(ArgTy[3], v4f32): VISrc_128_f32)); + + // For f16 and bf16 matrices A and B, each element can be modified by + // fneg(neg_lo,neg_hi = 1). For iu4 and iu8 matrices A and B neg_lo is + // overloaded to mean unsigned/signed: neg_lo = 0 (u4 and u8) unsigned(zext) + // neg_lo = 1 (i4 and i8) signed(sext). For f16, bf16 and f32 matrix C each + // element can be modified by fneg(neg_lo = 1) or fabs(neg_hi = 1). + + // Opcode | src0/src1 - matrix A/B | src2 - matrix C or Index + // --------------------------------------------------------------------------- + // wmma f32_f16 | both neg_lo,neg_hi = 1 | neg_lo = 1 neg C(f32) + // wmma f32_bf16 | neg A/B (f16 or bf16) | neg_hi = 1 abs C(f32) + // --------------------------------------------------------------------------- + // wmma f16_f16 | both neg_lo,neg_hi = 1 | neg_lo = 1 neg C(f16 or bf16) + // wmma bf16_bf16 | neg A/B (f16 or bf16) | neg_hi = 1 abs C(f16 or bf16) + // --------------------------------------------------------------------------- + // wmma i32_iu8/iu4 | neg_lo = 0 u4/u8(zext) | not allowed for + // | neg_lo = 1 i4/i8(sext) | i32 matrices + // --------------------------------------------------------------------------- + // wmma f32_fp8/bf8 | not allowed for | neg_lo = 1 neg C(f32) + // (4 instructions) | f8 and bf8 matrices | neg_hi = 1 abs C(f32) + // --------------------------------------------------------------------------- + // swmmac f32_f16 | both neg_lo,neg_hi = 1 | not allowed for sparse matrix + // swmmac f32_bf16 | neg A/B (f16 or bf16) | A Index - matrix C is in dst + // --------------------------------------------------------------------------- + // swmmac f16_f16 | both neg_lo,neg_hi = 1 | not allowed for sparse matrix + // swmmac bf16_bf16 | neg A/B (f16 or bf16) | A Index - matrix C is in dst + // --------------------------------------------------------------------------- + // swmmac i32_iu8/iu4 | neg_lo = 0 u4/u8(zext) | not allowed for sparse matrix + // | neg_lo = 1 i4/i8(sext) | A Index - matrix C is in dst + // --------------------------------------------------------------------------- + // swmmac f32_fp8/bf8 | not allowed for | not allowed for sparse matrix + // (4 instructions) | f8 and bf8 matrices | A Index - matrix C is in dst + + // pseudo + + // fp8bf8 wmmas don't use src (0 and 1) modifiers, iu use neg_lo, f16 and bf16 + // use neg_lo and neg_hi. iu wmmas (C is i32) don't use src 2 modifiers, + // remaining wmmas(f16, bf16 and f8bf8) use neg_lo and neg_hi for C (C is f32 + // f16 or bf16). swmmac use index_key and don't use src 2 modifiers. + + dag Src0Mods = !if(IsFP8BF8, (ins), (ins PackedF16InputMods:$src0_modifiers)); + dag Src1Mods = !if(IsFP8BF8, (ins), (ins PackedF16InputMods:$src1_modifiers)); + dag Src2Mods = !if(IsIU, (ins), (ins PackedF16InputMods:$src2_modifiers)); + dag IndexKey = !cond(!eq(IndexType, 0) : (ins), + !eq(IndexType, 8) : (ins IndexKey8bit:$index_key_8bit), + !eq(IndexType, 16): (ins IndexKey16bit:$index_key_16bit)); + dag Clamp = !if(IsIU, (ins clampmod0:$clamp), (ins)); + dag Neg = !cond(!and(NegLoAny, NegHiAny) : (ins neg_lo0:$neg_lo, neg_hi0:$neg_hi), + !and(NegLoAny, !not(NegHiAny)) : (ins neg_lo0:$neg_lo), + !and(!not(NegLoAny), !not(NegHiAny)) : (ins)); + + let InsVOP3P = !con(Src0Mods, (ins Src0RC64:$src0), Src1Mods, (ins Src1RC64:$src1), + !cond(IsWMMA : !con(Src2Mods, (ins Src2RC64:$src2)), + IsSWMMAC : !con((ins DstRC:$srcTiedDef), (ins VRegSrc_32:$src2), IndexKey)), + Clamp, Neg); + + // asm + + string IndexKeyAsm = !cond(!eq(IndexType, 0) : "", + !eq(IndexType, 8) : "$index_key_8bit", + !eq(IndexType, 16) : "$index_key_16bit"); + string ClampAsm = !if(IsIU, "$clamp", ""); + string NegAsm = !cond(!and(NegLoAny, NegHiAny) : "$neg_lo$neg_hi", + !and(NegLoAny, !not(NegHiAny)) : "$neg_lo", + !and(!not(NegLoAny), !not(NegHiAny)) : ""); + + let AsmVOP3P = "$vdst, $src0, $src1, $src2"#IndexKeyAsm#NegAsm#ClampAsm; + + // isel patterns + + dag Src0InPat = !cond(IsAB_F16 : (ins (Src0VT (WMMAModsF16Neg Src0VT:$src0, i32:$src0_modifiers))), + IsAB_BF16 : (ins Src0VT:$src0), + IsIU : (ins (VOP3PModsNeg i32:$src0_modifiers), Src0VT:$src0), + IsFP8BF8 : (ins Src0VT:$src0)); + dag Src0OutPat = !cond(IsAB_F16 : (ins i32:$src0_modifiers, Src0VT:$src0), + IsAB_BF16 : (ins (i32 8), Src0VT:$src0), + IsIU : (ins i32:$src0_modifiers, Src0VT:$src0), + IsFP8BF8 : (ins Src0VT:$src0)); + dag Src1InPat = !cond(IsAB_F16 : (ins (Src1VT (WMMAModsF16Neg Src1VT:$src1, i32:$src1_modifiers))), + IsAB_BF16 : (ins Src1VT:$src1), + IsIU : (ins (VOP3PModsNeg i32:$src1_modifiers), Src1VT:$src1), + IsFP8BF8 : (ins Src1VT:$src1)); + dag Src1OutPat = !cond(IsAB_F16 : (ins i32:$src1_modifiers, Src1VT:$src1), + IsAB_BF16 : (ins (i32 8), Src1VT:$src1), + IsIU : (ins i32:$src1_modifiers, Src1VT:$src1), + IsFP8BF8 : (ins Src1VT:$src1)); + dag Src2InPatWmma = !cond(IsC_F32 : (ins (Src2VT (WMMAModsF32NegAbs Src2VT:$src2, i32:$src2_modifiers))), + IsC_F16 : (ins (Src2VT (WMMAModsF16NegAbs Src2VT:$src2, i32:$src2_modifiers))), + IsC_BF16 : (ins Src2VT:$src2), + IsIU : (ins Src2VT:$src2), + IsSWMMAC : (ins)); + dag Src2OutPatWmma = !cond(IsC_F32 : (ins i32:$src2_modifiers, Src2VT:$src2), + IsC_F16 : (ins i32:$src2_modifiers, Src2VT:$src2), + IsC_BF16 : (ins (i32 8), Src2VT:$src2), + IsIU : (ins Src2VT:$src2), + IsSWMMAC : (ins)); + dag ClampPat = !if(IsIU, (ins i1:$clamp), (ins)); + dag IndexInPat = !cond(!eq(IndexType, 0) : (ins i32:$src2), + !eq(IndexType, 8) : (ins (i32 (SWMMACIndex8 i32:$src2, i32:$index_key_8bit))), + !eq(IndexType, 16): (ins (i32 (SWMMACIndex16 i32:$src2, i32:$index_key_16bit)))); + dag IndexOutPat = !cond(!eq(IndexType, 0) : (ins i32:$src2), + !eq(IndexType, 8) : (ins i32:$src2, i32:$index_key_8bit), + !eq(IndexType, 16): (ins i32:$src2, i32:$index_key_16bit)); + dag Src2InlineInPat = (ins (Src2VT (WMMAVISrc Src2VT:$src2))); + dag Src2InlineOutPat = !con(!if(IsIU, (ins), (ins (i32 8))), (ins Src2VT:$src2)); + + + dag WmmaInPat = !con(Src0InPat, Src1InPat, Src2InPatWmma, ClampPat); + dag WmmaOutPat = !con(Src0OutPat, Src1OutPat, Src2OutPatWmma, ClampPat); + + dag SwmmacInPat = !con(Src0InPat, Src1InPat, (ins Src2VT:$srcTiedDef), IndexInPat, ClampPat); + dag SwmmacOutPat = !con(Src0OutPat, Src1OutPat, (ins Src2VT:$srcTiedDef), IndexOutPat, ClampPat); + + // wmma pattern where src2 is inline imm uses _threeaddr pseudo, + // can't use _twoaddr since it would violate src2 tied to vdst constraint. + dag WmmaInlineInPat = !con(Src0InPat, Src1InPat, Src2InlineInPat, ClampPat); + dag WmmaInlineOutPat = !con(Src0OutPat, Src1OutPat, Src2InlineOutPat, ClampPat); +} + +multiclass WMMAInstGFX12 { + let Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0 in { + let Constraints = "@earlyclobber $vdst,$vdst = $src2", isConvertibleToThreeAddress = 1 in + def _twoaddr : VOP3P_Pseudo{ + let PseudoInstr = Instr#PseudoInstrSuffix; + } + + let Constraints = "@earlyclobber $vdst", SchedRW = [Write32Bit, Write32Bit] in + def _threeaddr : VOP3P_Pseudo{ + let PseudoInstr = Instr#PseudoInstrSuffix; + } + + } + def : WMMAOpcodeMapping(NAME # _twoaddr), + !cast(NAME # _threeaddr)>; +} + +multiclass SWMMACInstGFX12 { + def _twoaddr : VOP3P_Pseudo{ + let Mnemonic = Instr; + let PseudoInstr = Instr#PseudoInstrSuffix; + let mayRaiseFPException = 0; + let ReadsModeReg = 0; + let AsmMatchConverter = "cvtSWMMAC"; + + let Constraints = "@earlyclobber $vdst,$vdst = $srcTiedDef"; + } +} + +// First argument in Profile is types for matrices D, A, B and C (D = A * B + C) +// as used by llvm ir, types are vectors(with matrix elements) +// wave32: +// For 16x16 matrices, lanes 0 to 31 will have 8 matrix elts, +// for 16 x 32 16 elts and for 16 x 64 lanes have 32 elts. +// wave64: +// lanes will have half the size of elements in lanes compared to wave32 with +// exception of 16x16_iu4: lanes0-31 will have 8xi4, remaining lanes are ignored + +// general idea on element distribution differences: +// wave32: lane n has 8 matrix elements +// wave64: lane n has first 4, lane n+32 has other 4 elements + +// index size, for each 2 elements in lane you need 4bits in index + +// Non-standard types (iu8, iu4, fp8, bf8) will be packed in vectors of i32s. +// Original type for them is in comment on the right and refers to A and B. + +def F32_F16_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v8f16, v8f16, v8f32], 0, 0, 0, 0>; +def F32_BF16_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v8i16, v8i16, v8f32], 0, 0, 0, 0>; +def F16_F16_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v8f16, v8f16, v8f16], 0, 0, 0, 0>; +def BF16_BF16_WMMA_w32 : VOP3PWMMA_Profile<[v8i16, v8i16, v8i16, v8i16], 0, 0, 0, 0>; +def I32_IU8_WMMA_w32 : VOP3PWMMA_Profile<[v8i32, v2i32, v2i32, v8i32], 0, 0, 1, 0>; // 8xi8 +def I32_IU4X16_WMMA_w32 : VOP3PWMMA_Profile<[v8i32, i32, i32, v8i32], 0, 0, 1, 0>; // 8xi4 +def F32_FP8BF8_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v2i32, v2i32, v8f32], 0, 0, 0, 1>; // 8xf8 +def I32_IU4X32_WMMA_w32 : VOP3PWMMA_Profile<[v8i32, v2i32, v2i32, v8i32], 0, 0, 1, 0>; // 16xi4 + +def F32_F16_WMMA_w64 : VOP3PWMMA_Profile<[v4f32, v4f16, v4f16, v4f32], 0, 0, 0, 0>; +def F32_BF16_WMMA_w64 : VOP3PWMMA_Profile<[v4f32, v4i16, v4i16, v4f32], 0, 0, 0, 0>; +def F16_F16_WMMA_w64 : VOP3PWMMA_Profile<[v4f16, v4f16, v4f16, v4f16], 0, 0, 0, 0>; +def BF16_BF16_WMMA_w64 : VOP3PWMMA_Profile<[v4i16, v4i16, v4i16, v4i16], 0, 0, 0, 0>; +def I32_IU8_WMMA_w64 : VOP3PWMMA_Profile<[v4i32, i32, i32, v4i32], 0, 0, 1, 0>; // 4xi8 +def I32_IU4X16_WMMA_w64 : VOP3PWMMA_Profile<[v4i32, i32, i32, v4i32], 0, 0, 1, 0>; // 8xi4 * +def F32_FP8BF8_WMMA_w64 : VOP3PWMMA_Profile<[v4f32, i32, i32, v4f32], 0, 0, 0, 1>; // 4xf8 +def I32_IU4X32_WMMA_w64 : VOP3PWMMA_Profile<[v4i32, i32, i32, v4i32], 0, 0, 1, 0>; // 8xi4 + +def F32_F16_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v8f16, v16f16, v8f32], 1, 16, 0, 0>; +def F32_BF16_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v8i16, v16i16, v8f32], 1, 16, 0, 0>; +def F16_F16_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f16, v8f16, v16f16, v8f16], 1, 16, 0, 0>; +def BF16_BF16_SWMMAC_w32 : VOP3PWMMA_Profile<[v8i16, v8i16, v16i16, v8i16], 1, 16, 0, 0>; +def I32_IU8_SWMMAC_w32 : VOP3PWMMA_Profile<[v8i32, v2i32, v4i32, v8i32], 1, 16, 1, 0>; // 8xi8, 16xi8 +def I32_IU4X32_SWMMAC_w32 : VOP3PWMMA_Profile<[v8i32, i32, v2i32, v8i32], 1, 16, 1, 0>; // 8xi4, 16xi4 +def I32_IU4X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8i32, v2i32, v4i32, v8i32], 1, 0, 1, 0>; // 16xi4, 32xi4 ** +def F32_FP8BF8_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v2i32, v4i32, v8f32], 1, 16, 0, 1>; // 8xf8, 16xf8 + +def F32_F16_SWMMAC_w64 : VOP3PWMMA_Profile<[v4f32, v4f16, v8f16, v4f32], 1, 8, 0, 0>; +def F32_BF16_SWMMAC_w64 : VOP3PWMMA_Profile<[v4f32, v4i16, v8i16, v4f32], 1, 8, 0, 0>; +def F16_F16_SWMMAC_w64 : VOP3PWMMA_Profile<[v4f16, v4f16, v8f16, v4f16], 1, 8, 0, 0>; +def BF16_BF16_SWMMAC_w64 : VOP3PWMMA_Profile<[v4i16, v4i16, v8i16, v4i16], 1, 8, 0, 0>; +def I32_IU8_SWMMAC_w64 : VOP3PWMMA_Profile<[v4i32, i32, v2i32, v4i32], 1, 8, 1, 0>; // 4xi8, 8xi8 +def I32_IU4X32_SWMMAC_w64 : VOP3PWMMA_Profile<[v4i32, i32, i32, v4i32], 1, 16, 1, 0>; // 8xi4, 8xi4 *** +def I32_IU4X64_SWMMAC_w64 : VOP3PWMMA_Profile<[v4i32, i32, v2i32, v4i32], 1, 16, 1, 0>; // 8xi4, 16xi4 +def F32_FP8BF8_SWMMAC_w64 : VOP3PWMMA_Profile<[v4f32, i32, v2i32, v4f32], 1, 8, 0, 1>; // 4xf8, 8xf8 + +// * IU4X16_WMMA_w64 lanes 0-31 will have 8xi4, remaining lanes are ignored +// ** IU4X64_SWMMAC_w32 index is i32, index_key is not used +// *** IU4X32_SWMMAC_w64 lanes 0-31 will have 8xi4 remaining lanes are ignored +// for matrix A, index is i16; Matrix B uses all lanes + +let WaveSizePredicate = isWave32 in { +defm V_WMMA_F32_16X16X16_F16_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x16_f16", F32_F16_WMMA_w32, "_w32">; +defm V_WMMA_F32_16X16X16_BF16_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x16_bf16", F32_BF16_WMMA_w32, "_w32">; +defm V_WMMA_F16_16X16X16_F16_w32 : WMMAInstGFX12<"v_wmma_f16_16x16x16_f16", F16_F16_WMMA_w32, "_w32">; +defm V_WMMA_BF16_16X16X16_BF16_w32 : WMMAInstGFX12<"v_wmma_bf16_16x16x16_bf16", BF16_BF16_WMMA_w32, "_w32">; +defm V_WMMA_I32_16X16X16_IU8_w32 : WMMAInstGFX12<"v_wmma_i32_16x16x16_iu8", I32_IU8_WMMA_w32, "_w32">; +defm V_WMMA_I32_16X16X16_IU4_w32 : WMMAInstGFX12<"v_wmma_i32_16x16x16_iu4", I32_IU4X16_WMMA_w32, "_w32">; +defm V_WMMA_F32_16X16X16_FP8_FP8_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x16_fp8_fp8", F32_FP8BF8_WMMA_w32, "_w32">; +defm V_WMMA_F32_16X16X16_FP8_BF8_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x16_fp8_bf8", F32_FP8BF8_WMMA_w32, "_w32">; +defm V_WMMA_F32_16X16X16_BF8_FP8_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x16_bf8_fp8", F32_FP8BF8_WMMA_w32, "_w32">; +defm V_WMMA_F32_16X16X16_BF8_BF8_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x16_bf8_bf8", F32_FP8BF8_WMMA_w32, "_w32">; +defm V_WMMA_I32_16X16X32_IU4_w32 : WMMAInstGFX12<"v_wmma_i32_16x16x32_iu4", I32_IU4X32_WMMA_w32, "_w32">; + +defm V_SWMMAC_F32_16X16X32_F16_w32 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_f16", F32_F16_SWMMAC_w32, "_w32">; +defm V_SWMMAC_F32_16X16X32_BF16_w32 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_bf16", F32_BF16_SWMMAC_w32, "_w32">; +defm V_SWMMAC_F16_16X16X32_F16_w32 : SWMMACInstGFX12<"v_swmmac_f16_16x16x32_f16", F16_F16_SWMMAC_w32, "_w32">; +defm V_SWMMAC_BF16_16X16X32_BF16_w32 : SWMMACInstGFX12<"v_swmmac_bf16_16x16x32_bf16", BF16_BF16_SWMMAC_w32, "_w32">; +defm V_SWMMAC_I32_16X16X32_IU8_w32 : SWMMACInstGFX12<"v_swmmac_i32_16x16x32_iu8", I32_IU8_SWMMAC_w32, "_w32">; +defm V_SWMMAC_I32_16X16X32_IU4_w32 : SWMMACInstGFX12<"v_swmmac_i32_16x16x32_iu4", I32_IU4X32_SWMMAC_w32, "_w32">; +defm V_SWMMAC_I32_16X16X64_IU4_w32 : SWMMACInstGFX12<"v_swmmac_i32_16x16x64_iu4", I32_IU4X64_SWMMAC_w32, "_w32">; +defm V_SWMMAC_F32_16X16X32_FP8_FP8_w32 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_fp8_fp8", F32_FP8BF8_SWMMAC_w32, "_w32">; +defm V_SWMMAC_F32_16X16X32_FP8_BF8_w32 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_fp8_bf8", F32_FP8BF8_SWMMAC_w32, "_w32">; +defm V_SWMMAC_F32_16X16X32_BF8_FP8_w32 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_bf8_fp8", F32_FP8BF8_SWMMAC_w32, "_w32">; +defm V_SWMMAC_F32_16X16X32_BF8_BF8_w32 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_bf8_bf8", F32_FP8BF8_SWMMAC_w32, "_w32">; +} + +let WaveSizePredicate = isWave64 in { +defm V_WMMA_F32_16X16X16_F16_w64 : WMMAInstGFX12<"v_wmma_f32_16x16x16_f16", F32_F16_WMMA_w64, "_w64">; +defm V_WMMA_F32_16X16X16_BF16_w64 : WMMAInstGFX12<"v_wmma_f32_16x16x16_bf16", F32_BF16_WMMA_w64, "_w64">; +defm V_WMMA_F16_16X16X16_F16_w64 : WMMAInstGFX12<"v_wmma_f16_16x16x16_f16", F16_F16_WMMA_w64, "_w64">; +defm V_WMMA_BF16_16X16X16_BF16_w64 : WMMAInstGFX12<"v_wmma_bf16_16x16x16_bf16", BF16_BF16_WMMA_w64, "_w64">; +defm V_WMMA_I32_16X16X16_IU8_w64 : WMMAInstGFX12<"v_wmma_i32_16x16x16_iu8", I32_IU8_WMMA_w64, "_w64">; +defm V_WMMA_I32_16X16X16_IU4_w64 : WMMAInstGFX12<"v_wmma_i32_16x16x16_iu4", I32_IU4X16_WMMA_w64, "_w64">; +defm V_WMMA_F32_16X16X16_FP8_FP8_w64 : WMMAInstGFX12<"v_wmma_f32_16x16x16_fp8_fp8", F32_FP8BF8_WMMA_w64, "_w64">; +defm V_WMMA_F32_16X16X16_FP8_BF8_w64 : WMMAInstGFX12<"v_wmma_f32_16x16x16_fp8_bf8", F32_FP8BF8_WMMA_w64, "_w64">; +defm V_WMMA_F32_16X16X16_BF8_FP8_w64 : WMMAInstGFX12<"v_wmma_f32_16x16x16_bf8_fp8", F32_FP8BF8_WMMA_w64, "_w64">; +defm V_WMMA_F32_16X16X16_BF8_BF8_w64 : WMMAInstGFX12<"v_wmma_f32_16x16x16_bf8_bf8", F32_FP8BF8_WMMA_w64, "_w64">; +defm V_WMMA_I32_16X16X32_IU4_w64 : WMMAInstGFX12<"v_wmma_i32_16x16x32_iu4", I32_IU4X32_WMMA_w64, "_w64">; + +defm V_SWMMAC_F32_16X16X32_F16_w64 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_f16", F32_F16_SWMMAC_w64, "_w64">; +defm V_SWMMAC_F32_16X16X32_BF16_w64 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_bf16", F32_BF16_SWMMAC_w64, "_w64">; +defm V_SWMMAC_F16_16X16X32_F16_w64 : SWMMACInstGFX12<"v_swmmac_f16_16x16x32_f16", F16_F16_SWMMAC_w64, "_w64">; +defm V_SWMMAC_BF16_16X16X32_BF16_w64 : SWMMACInstGFX12<"v_swmmac_bf16_16x16x32_bf16", BF16_BF16_SWMMAC_w64, "_w64">; +defm V_SWMMAC_I32_16X16X32_IU8_w64 : SWMMACInstGFX12<"v_swmmac_i32_16x16x32_iu8", I32_IU8_SWMMAC_w64, "_w64">; +defm V_SWMMAC_I32_16X16X32_IU4_w64 : SWMMACInstGFX12<"v_swmmac_i32_16x16x32_iu4", I32_IU4X32_SWMMAC_w64, "_w64">; +defm V_SWMMAC_I32_16X16X64_IU4_w64 : SWMMACInstGFX12<"v_swmmac_i32_16x16x64_iu4", I32_IU4X64_SWMMAC_w64, "_w64">; +defm V_SWMMAC_F32_16X16X32_FP8_FP8_w64 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_fp8_fp8", F32_FP8BF8_SWMMAC_w64, "_w64">; +defm V_SWMMAC_F32_16X16X32_FP8_BF8_w64 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_fp8_bf8", F32_FP8BF8_SWMMAC_w64, "_w64">; +defm V_SWMMAC_F32_16X16X32_BF8_FP8_w64 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_bf8_fp8", F32_FP8BF8_SWMMAC_w64, "_w64">; +defm V_SWMMAC_F32_16X16X32_BF8_BF8_w64 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_bf8_bf8", F32_FP8BF8_SWMMAC_w64, "_w64">; +} + +// IsGFX11OpselIntrinsic: f16_f16 and bf16_bf16 Intrinsics have imm operand that +// controls opsel. Used by gfx11, removed in gfx12 (operand must be 0). +multiclass WMMAPat { + def : GCNPat <(P.DstVT !setdagop(!con(P.WmmaInPat, !if(IsGFX11OpselIntrinsic, (ins 0), (ins))), node)), + (P.DstVT !setdagop(P.WmmaOutPat, !cast(Inst#"_twoaddr")))>; + let AddedComplexity = 4 in + def : GCNPat <(P.DstVT !setdagop(!con(P.WmmaInlineInPat, !if(IsGFX11OpselIntrinsic, (ins 0), (ins))), node)), + (P.DstVT !setdagop(P.WmmaInlineOutPat, !cast(Inst#"_threeaddr")))>; +} + +class SWMMACPat : + GCNPat <(P.DstVT !setdagop(P.SwmmacInPat, node)), + (P.DstVT !setdagop(P.SwmmacOutPat, Inst))>; + +class SWMMACPat_w64 : + GCNPat <(P.DstVT !setdagop(P.SwmmacInPat, node)), + (P.DstVT !setdagop(P.SwmmacOutPat, Inst))>{ + let WaveSizePredicate = isWave64; + } + +let WaveSizePredicate = isWave32, SubtargetPredicate = isGFX12Plus in { + defm : WMMAPat<"V_WMMA_F32_16X16X16_F16_w32", int_amdgcn_wmma_f32_16x16x16_f16, F32_F16_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F32_16X16X16_BF16_w32", int_amdgcn_wmma_f32_16x16x16_bf16, F32_BF16_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F16_16X16X16_F16_w32", int_amdgcn_wmma_f16_16x16x16_f16, F16_F16_WMMA_w32,1>; + defm : WMMAPat<"V_WMMA_BF16_16X16X16_BF16_w32", int_amdgcn_wmma_bf16_16x16x16_bf16, BF16_BF16_WMMA_w32,1>; + defm : WMMAPat<"V_WMMA_I32_16X16X16_IU8_w32", int_amdgcn_wmma_i32_16x16x16_iu8, I32_IU8_WMMA_w32>; + defm : WMMAPat<"V_WMMA_I32_16X16X16_IU4_w32", int_amdgcn_wmma_i32_16x16x16_iu4, I32_IU4X16_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F32_16X16X16_FP8_FP8_w32", int_amdgcn_wmma_f32_16x16x16_fp8_fp8, F32_FP8BF8_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F32_16X16X16_FP8_BF8_w32", int_amdgcn_wmma_f32_16x16x16_fp8_bf8, F32_FP8BF8_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F32_16X16X16_BF8_FP8_w32", int_amdgcn_wmma_f32_16x16x16_bf8_fp8, F32_FP8BF8_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F32_16X16X16_BF8_BF8_w32", int_amdgcn_wmma_f32_16x16x16_bf8_bf8, F32_FP8BF8_WMMA_w32>; + defm : WMMAPat<"V_WMMA_I32_16X16X32_IU4_w32", int_amdgcn_wmma_i32_16x16x32_iu4, I32_IU4X32_WMMA_w32>; + + def : SWMMACPat; + def : SWMMACPat; + def : SWMMACPat; + def : SWMMACPat; + def : SWMMACPat; + def : SWMMACPat; + def : GCNPat <(I32_IU4X64_SWMMAC_w32.DstVT !setdagop(I32_IU4X64_SWMMAC_w32.SwmmacInPat, int_amdgcn_swmmac_i32_16x16x64_iu4)), + (I32_IU4X64_SWMMAC_w32.DstVT !setdagop(I32_IU4X64_SWMMAC_w32.SwmmacOutPat, V_SWMMAC_I32_16X16X64_IU4_w32_twoaddr))>; + def : SWMMACPat; + def : SWMMACPat; + def : SWMMACPat; + def : SWMMACPat; +} + +let WaveSizePredicate = isWave64, SubtargetPredicate = isGFX12Plus in { + defm : WMMAPat<"V_WMMA_F32_16X16X16_F16_w64", int_amdgcn_wmma_f32_16x16x16_f16, F32_F16_WMMA_w64>; + defm : WMMAPat<"V_WMMA_F32_16X16X16_BF16_w64", int_amdgcn_wmma_f32_16x16x16_bf16, F32_BF16_WMMA_w64>; + defm : WMMAPat<"V_WMMA_F16_16X16X16_F16_w64", int_amdgcn_wmma_f16_16x16x16_f16, F16_F16_WMMA_w64,1>; + defm : WMMAPat<"V_WMMA_BF16_16X16X16_BF16_w64", int_amdgcn_wmma_bf16_16x16x16_bf16, BF16_BF16_WMMA_w64,1>; + defm : WMMAPat<"V_WMMA_I32_16X16X16_IU8_w64", int_amdgcn_wmma_i32_16x16x16_iu8, I32_IU8_WMMA_w64>; + defm : WMMAPat<"V_WMMA_I32_16X16X16_IU4_w64", int_amdgcn_wmma_i32_16x16x16_iu4, I32_IU4X16_WMMA_w64>; + defm : WMMAPat<"V_WMMA_F32_16X16X16_FP8_FP8_w64", int_amdgcn_wmma_f32_16x16x16_fp8_fp8, F32_FP8BF8_WMMA_w64>; + defm : WMMAPat<"V_WMMA_F32_16X16X16_FP8_BF8_w64", int_amdgcn_wmma_f32_16x16x16_fp8_bf8, F32_FP8BF8_WMMA_w64>; + defm : WMMAPat<"V_WMMA_F32_16X16X16_BF8_FP8_w64", int_amdgcn_wmma_f32_16x16x16_bf8_fp8, F32_FP8BF8_WMMA_w64>; + defm : WMMAPat<"V_WMMA_F32_16X16X16_BF8_BF8_w64", int_amdgcn_wmma_f32_16x16x16_bf8_bf8, F32_FP8BF8_WMMA_w64>; + defm : WMMAPat<"V_WMMA_I32_16X16X32_IU4_w64", int_amdgcn_wmma_i32_16x16x32_iu4, I32_IU4X32_WMMA_w64>; + + def : SWMMACPat; + def : SWMMACPat; + def : SWMMACPat; + def : SWMMACPat; + def : SWMMACPat; + def : SWMMACPat; + def : SWMMACPat; + def : SWMMACPat; + def : SWMMACPat; + def : SWMMACPat; + def : SWMMACPat; +} + + //===----------------------------------------------------------------------===// // Begin Real Encodings //===----------------------------------------------------------------------===// @@ -1005,6 +1400,99 @@ multiclass VOP3P_Real_Base op, string backing_ps_name = NAME VOP3Pe_gfx11_gfx12(backing_ps_name).Pfl>; } +class VOP3PeWmma op, VOPProfile P, VOP3PWMMA_Profile WMMAP> + : VOP3Pe_gfx11_gfx12{ + // opsel + let Inst{11} = !cond(!eq(WMMAP.IndexType, 0) : 0, + !eq(WMMAP.IndexType, 8) : index_key_8bit{0}, + !eq(WMMAP.IndexType, 16) : index_key_16bit{0}); + let Inst{12} = !if(!eq(WMMAP.IndexType, 8), index_key_8bit{1}, 0); + let Inst{13} = 0; + // opsel_hi + let Inst{59} = 1; + let Inst{60} = 1; + let Inst{14} = 1; + // neg_lo + let Inst{61} = !if(WMMAP.NegLo01, src0_modifiers{0}, 0); + let Inst{62} = !if(WMMAP.NegLo01, src1_modifiers{0}, 0); + let Inst{63} = !if(WMMAP.NegLo2, src2_modifiers{0}, 0); + // neg_hi + let Inst{8} = !if(WMMAP.NegHi01, src0_modifiers{1}, 0); + let Inst{9} = !if(WMMAP.NegHi01, src1_modifiers{1}, 0); + let Inst{10} = !if(WMMAP.NegHi2, src2_modifiers{1}, 0); + // clamp + let Inst{15} = !if(WMMAP.IsIU, clamp{0}, 0); +} + +multiclass VOP3P_WMMA_Real_Base op, VOP3PWMMA_Profile WMMAP, + string backing_ps_name = NAME, + string asmName = !cast(NAME).Mnemonic> { + def Gen.Suffix : + VOP3P_Real_Gen(backing_ps_name), Gen, asmName>, + VOP3PeWmma(backing_ps_name).Pfl, WMMAP>; +} + +multiclass VOP3P_Real_WMMA_gfx12 op, VOP3PWMMA_Profile WMMAP> { + let WaveSizePredicate = isWave32, DecoderNamespace = "GFX12" in { + defm _twoaddr : VOP3P_WMMA_Real_Base ; + } +} + +multiclass VOP3P_Real_WMMA_gfx12w64 op, VOP3PWMMA_Profile WMMAP> { + let WaveSizePredicate = isWave64, DecoderNamespace = "WMMAGFX12" in { + defm _twoaddr : VOP3P_WMMA_Real_Base ; + } +} + +defm V_WMMA_F32_16X16X16_F16_w32 : VOP3P_Real_WMMA_gfx12 <0x040, F32_F16_WMMA_w32>; +defm V_WMMA_F32_16X16X16_BF16_w32 : VOP3P_Real_WMMA_gfx12 <0x041, F32_BF16_WMMA_w32>; +defm V_WMMA_F16_16X16X16_F16_w32 : VOP3P_Real_WMMA_gfx12 <0x042, F16_F16_WMMA_w32>; +defm V_WMMA_BF16_16X16X16_BF16_w32 : VOP3P_Real_WMMA_gfx12 <0x043, BF16_BF16_WMMA_w32>; +defm V_WMMA_I32_16X16X16_IU8_w32 : VOP3P_Real_WMMA_gfx12 <0x044, I32_IU8_WMMA_w32>; +defm V_WMMA_I32_16X16X16_IU4_w32 : VOP3P_Real_WMMA_gfx12 <0x045, I32_IU4X16_WMMA_w32>; +defm V_WMMA_F32_16X16X16_FP8_FP8_w32 : VOP3P_Real_WMMA_gfx12 <0x046, F32_FP8BF8_WMMA_w32>; +defm V_WMMA_F32_16X16X16_FP8_BF8_w32 : VOP3P_Real_WMMA_gfx12 <0x047, F32_FP8BF8_WMMA_w32>; +defm V_WMMA_F32_16X16X16_BF8_FP8_w32 : VOP3P_Real_WMMA_gfx12 <0x048, F32_FP8BF8_WMMA_w32>; +defm V_WMMA_F32_16X16X16_BF8_BF8_w32 : VOP3P_Real_WMMA_gfx12 <0x049, F32_FP8BF8_WMMA_w32>; +defm V_WMMA_I32_16X16X32_IU4_w32 : VOP3P_Real_WMMA_gfx12 <0x04a, I32_IU4X32_WMMA_w32>; + +defm V_WMMA_F32_16X16X16_F16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x040, F32_F16_WMMA_w64>; +defm V_WMMA_F32_16X16X16_BF16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x041, F32_BF16_WMMA_w64>; +defm V_WMMA_F16_16X16X16_F16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x042, F16_F16_WMMA_w64>; +defm V_WMMA_BF16_16X16X16_BF16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x043, BF16_BF16_WMMA_w64>; +defm V_WMMA_I32_16X16X16_IU8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x044, I32_IU8_WMMA_w64>; +defm V_WMMA_I32_16X16X16_IU4_w64 : VOP3P_Real_WMMA_gfx12w64 <0x045, I32_IU4X16_WMMA_w64>; +defm V_WMMA_F32_16X16X16_FP8_FP8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x046, F32_FP8BF8_WMMA_w64>; +defm V_WMMA_F32_16X16X16_FP8_BF8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x047, F32_FP8BF8_WMMA_w64>; +defm V_WMMA_F32_16X16X16_BF8_FP8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x048, F32_FP8BF8_WMMA_w64>; +defm V_WMMA_F32_16X16X16_BF8_BF8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x049, F32_FP8BF8_WMMA_w64>; +defm V_WMMA_I32_16X16X32_IU4_w64 : VOP3P_Real_WMMA_gfx12w64 <0x04a, I32_IU4X32_WMMA_w64>; + + +defm V_SWMMAC_F32_16X16X32_F16_w32 : VOP3P_Real_WMMA_gfx12 <0x050, F32_F16_SWMMAC_w32>; +defm V_SWMMAC_F32_16X16X32_BF16_w32 : VOP3P_Real_WMMA_gfx12 <0x051, F32_BF16_SWMMAC_w32>; +defm V_SWMMAC_F16_16X16X32_F16_w32 : VOP3P_Real_WMMA_gfx12 <0x052, F16_F16_SWMMAC_w32>; +defm V_SWMMAC_BF16_16X16X32_BF16_w32 : VOP3P_Real_WMMA_gfx12 <0x053, BF16_BF16_SWMMAC_w32>; +defm V_SWMMAC_I32_16X16X32_IU8_w32 : VOP3P_Real_WMMA_gfx12 <0x054, I32_IU8_SWMMAC_w32>; +defm V_SWMMAC_I32_16X16X32_IU4_w32 : VOP3P_Real_WMMA_gfx12 <0x055, I32_IU4X32_SWMMAC_w32>; +defm V_SWMMAC_I32_16X16X64_IU4_w32 : VOP3P_Real_WMMA_gfx12 <0x056, I32_IU4X64_SWMMAC_w32>; +defm V_SWMMAC_F32_16X16X32_FP8_FP8_w32 : VOP3P_Real_WMMA_gfx12 <0x057, F32_FP8BF8_SWMMAC_w32>; +defm V_SWMMAC_F32_16X16X32_FP8_BF8_w32 : VOP3P_Real_WMMA_gfx12 <0x058, F32_FP8BF8_SWMMAC_w32>; +defm V_SWMMAC_F32_16X16X32_BF8_FP8_w32 : VOP3P_Real_WMMA_gfx12 <0x059, F32_FP8BF8_SWMMAC_w32>; +defm V_SWMMAC_F32_16X16X32_BF8_BF8_w32 : VOP3P_Real_WMMA_gfx12 <0x05a, F32_FP8BF8_SWMMAC_w32>; + +defm V_SWMMAC_F32_16X16X32_F16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x050, F32_F16_SWMMAC_w64>; +defm V_SWMMAC_F32_16X16X32_BF16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x051, F32_BF16_SWMMAC_w64>; +defm V_SWMMAC_F16_16X16X32_F16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x052, F16_F16_SWMMAC_w64>; +defm V_SWMMAC_BF16_16X16X32_BF16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x053, BF16_BF16_SWMMAC_w64>; +defm V_SWMMAC_I32_16X16X32_IU8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x054, I32_IU8_SWMMAC_w64>; +defm V_SWMMAC_I32_16X16X32_IU4_w64 : VOP3P_Real_WMMA_gfx12w64 <0x055, I32_IU4X32_SWMMAC_w64>; +defm V_SWMMAC_I32_16X16X64_IU4_w64 : VOP3P_Real_WMMA_gfx12w64 <0x056, I32_IU4X64_SWMMAC_w64>; +defm V_SWMMAC_F32_16X16X32_FP8_FP8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x057, F32_FP8BF8_SWMMAC_w64>; +defm V_SWMMAC_F32_16X16X32_FP8_BF8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x058, F32_FP8BF8_SWMMAC_w64>; +defm V_SWMMAC_F32_16X16X32_BF8_FP8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x059, F32_FP8BF8_SWMMAC_w64>; +defm V_SWMMAC_F32_16X16X32_BF8_BF8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x05a, F32_FP8BF8_SWMMAC_w64>; + multiclass VOP3P_Real_with_name op, string backing_ps_name = NAME, string asmName = !cast(NAME).Mnemonic> { diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td index df505c3365cb..fae07852ffc4 100644 --- a/llvm/lib/Target/AMDGPU/VOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td @@ -124,6 +124,7 @@ class VOP3_Pseudo pattern = [], let IsPacked = P.IsPacked; let IsMAI = P.IsMAI; let IsWMMA = P.IsWMMA; + let IsSWMMAC = P.IsSWMMAC; let AsmOperands = !if(isVop3OpSel, P.AsmVOP3OpSel, @@ -378,6 +379,8 @@ class VOP3Pe op, VOPProfile P> : Enc64 { bits<4> src2_modifiers; bits<9> src2; bits<1> clamp; + bits<2> index_key_8bit; + bits<1> index_key_16bit; let Inst{7-0} = vdst; let Inst{8} = !if(P.HasSrc0Mods, src0_modifiers{1}, 0); // neg_hi src0 diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll index a08ca86c8a61..4b6f6159ff23 100644 --- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll +++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll @@ -63,52 +63,140 @@ define amdgpu_kernel void @writelane(ptr addrspace(1) %out) #0 { ret void } -; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32(<16 x half> %A, <16 x half> %B, <8 x float> %C) +; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v16f16(<16 x half> %A, <16 x half> %B, <8 x float> %C) define amdgpu_kernel void @wmma_f32_16x16x16_f16(<16 x half> %A, <16 x half> %B, <8 x float> %C, ptr addrspace(1) %out) { - %tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32(<16 x half> %A, <16 x half> %B, <8 x float> %C) + %tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v16f16(<16 x half> %A, <16 x half> %B, <8 x float> %C) store <8 x float> %tmp0, ptr addrspace(1) %out, align 32 ret void } -; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32(<16 x i16> %A, <16 x i16> %B, <8 x float> %C) +; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v16i16(<16 x i16> %A, <16 x i16> %B, <8 x float> %C) define amdgpu_kernel void @wmma_f32_16x16x16_ibf16(<16 x i16> %A, <16 x i16> %B, <8 x float> %C, ptr addrspace(1) %out) { - %tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32(<16 x i16> %A, <16 x i16> %B, <8 x float> %C) + %tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v16i16(<16 x i16> %A, <16 x i16> %B, <8 x float> %C) store <8 x float> %tmp0, ptr addrspace(1) %out, align 32 ret void } -; CHECK: DIVERGENT: %tmp0 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16(<16 x half> %A, <16 x half> %B, <16 x half> %C, i1 false) +; CHECK: DIVERGENT: %tmp0 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> %A, <16 x half> %B, <16 x half> %C, i1 false) define amdgpu_kernel void @wmma_f16_16x16x16_f16(<16 x half> %A, <16 x half> %B, <16 x half> %C, ptr addrspace(1) %out) { bb: - %tmp0 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16(<16 x half> %A, <16 x half> %B, <16 x half> %C, i1 false) + %tmp0 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> %A, <16 x half> %B, <16 x half> %C, i1 false) store <16 x half> %tmp0, ptr addrspace(1) %out, align 32 ret void } -; CHECK: DIVERGENT: %tmp0 = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v16i16(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, i1 false) +; CHECK: DIVERGENT: %tmp0 = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v16i16.v16i16(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, i1 false) define amdgpu_kernel void @wmma_f16_16x16x16_bf16(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, ptr addrspace(1) %out) { bb: - %tmp0 = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v16i16(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, i1 false) + %tmp0 = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v16i16.v16i16(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, i1 false) store <16 x i16> %tmp0, ptr addrspace(1) %out, align 32 ret void } -; CHECK: DIVERGENT: %tmp0 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32(i1 false, <4 x i32> %A, i1 false, <4 x i32> %B, <8 x i32> %C, i1 false) +; CHECK: DIVERGENT: %tmp0 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v4i32(i1 false, <4 x i32> %A, i1 false, <4 x i32> %B, <8 x i32> %C, i1 false) define amdgpu_kernel void @wmma_i32_16x16x16_ui8(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { bb: - %tmp0 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32(i1 false, <4 x i32> %A, i1 false, <4 x i32> %B, <8 x i32> %C, i1 false) + %tmp0 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v4i32(i1 false, <4 x i32> %A, i1 false, <4 x i32> %B, <8 x i32> %C, i1 false) store <8 x i32> %tmp0, ptr addrspace(1) %out, align 32 ret void } -; CHECK: DIVERGENT: %tmp0 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32(i1 false, <2 x i32> %A, i1 false, <2 x i32> %B, <8 x i32> %C, i1 false) +; CHECK: DIVERGENT: %tmp0 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.v2i32(i1 false, <2 x i32> %A, i1 false, <2 x i32> %B, <8 x i32> %C, i1 false) define amdgpu_kernel void @wmma_i32_16x16x16_ui4(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { bb: - %tmp0 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32(i1 false, <2 x i32> %A, i1 false, <2 x i32> %B, <8 x i32> %C, i1 false) + %tmp0 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.v2i32(i1 false, <2 x i32> %A, i1 false, <2 x i32> %B, <8 x i32> %C, i1 false) store <8 x i32> %tmp0, ptr addrspace(1) %out, align 32 ret void } +; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.i16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index) +define amdgpu_kernel void @swmmac_f32_16x16x32_f16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { +bb: + %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index) + store <8 x float> %tmp0, ptr addrspace(1) %out, align 32 + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.i16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index) +define amdgpu_kernel void @swmmac_f32_16x16x32_bf16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { +bb: + %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index) + store <8 x float> %tmp0, ptr addrspace(1) %out, align 32 + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index) +define amdgpu_kernel void @swmmac_f16_16x16x32_f16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) { +bb: + %tmp0 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index) + store <8 x half> %tmp0, ptr addrspace(1) %out, align 32 + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.i16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index) +define amdgpu_kernel void @swmmac_bf16_16x16x32_bf16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index, ptr addrspace(1) %out) { +bb: + %tmp0 = call <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index) + store <8 x i16> %tmp0, ptr addrspace(1) %out, align 32 + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.i16(i1 false, <2 x i32> %A, i1 false, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 false) +define amdgpu_kernel void @swmmac_i32_16x16x32_iu8(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +bb: + %tmp0 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8(i1 false, <2 x i32> %A, i1 false, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 false) + store <8 x i32> %tmp0, ptr addrspace(1) %out, align 32 + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.i16(i1 false, i32 %A, i1 false, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 false) +define amdgpu_kernel void @swmmac_i32_16x16x32_iu4(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +bb: + %tmp0 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4(i1 false, i32 %A, i1 false, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 false) + store <8 x i32> %tmp0, ptr addrspace(1) %out, align 32 + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.i16(i1 false, <2 x i32> %A, i1 false, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 false) +define amdgpu_kernel void @swmmac_i32_16x16x64_iu4(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +bb: + %tmp0 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4(i1 false, <2 x i32> %A, i1 false, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 false) + store <8 x i32> %tmp0, ptr addrspace(1) %out, align 32 + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index) +define amdgpu_kernel void @swmmac_f32_16x16x32_fp8.fp8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { +bb: + %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index) + store <8 x float> %tmp0, ptr addrspace(1) %out, align 32 + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index) +define amdgpu_kernel void @swmmac_f32_16x16x32_fp8.bf8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { +bb: + %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index) + store <8 x float> %tmp0, ptr addrspace(1) %out, align 32 + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index) +define amdgpu_kernel void @swmmac_f32_16x16x32_bf8.fp8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { +bb: + %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index) + store <8 x float> %tmp0, ptr addrspace(1) %out, align 32 + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index) +define amdgpu_kernel void @swmmac_f32_16x16x32_bf8.bf8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { +bb: + %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index) + store <8 x float> %tmp0, ptr addrspace(1) %out, align 32 + ret void +} + ; CHECK: DIVERGENT: %tmp0 = call <2 x i32> @llvm.amdgcn.global.load.tr.v2i32(ptr addrspace(1) %gep) define amdgpu_kernel void @global_load_tr_b64_v2i32(ptr addrspace(1) %addr, ptr addrspace(1) %out) { bb: @@ -190,12 +278,23 @@ declare i32 @llvm.amdgcn.mov.dpp.i32(i32, i32, i32, i32, i1) #1 declare i32 @llvm.amdgcn.mov.dpp8.i32(i32, i32) #1 declare i32 @llvm.amdgcn.update.dpp.i32(i32, i32, i32, i32, i32, i1) #1 declare i32 @llvm.amdgcn.writelane(i32, i32, i32) #1 -declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32(<16 x half>, <16 x half> , <8 x float>) #1 -declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32(<16 x i16>, <16 x i16> , <8 x float>) #1 -declare <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16(<16 x half>, <16 x half> , <16 x half>, i1 immarg) #1 -declare <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v16i16(<16 x i16>, <16 x i16> , <16 x i16>, i1 immarg) #1 -declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32(i1 immarg, <4 x i32>, i1 immarg, <4 x i32> , <8 x i32>, i1 immarg) #1 -declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32> , <8 x i32>, i1 immarg) #1 +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v16f16(<16 x half>, <16 x half> , <8 x float>) #1 +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v16i16(<16 x i16>, <16 x i16> , <8 x float>) #1 +declare <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half>, <16 x half> , <16 x half>, i1 immarg) #1 +declare <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v16i16.v16i16(<16 x i16>, <16 x i16> , <16 x i16>, i1 immarg) #1 +declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v4i32(i1 immarg, <4 x i32>, i1 immarg, <4 x i32> , <8 x i32>, i1 immarg) #1 +declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.v2i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32> , <8 x i32>, i1 immarg) #1 +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16(<8 x half>, <16 x half>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16(<8 x i16>, <16 x i16>, <8 x float>, i16) +declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16(<8 x half>, <16 x half>, <8 x half>, i16) +declare <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16(<8 x i16>, <16 x i16>, <8 x i16>, i16) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16, i1) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4(i1 immarg, i32, i1 immarg, <2 x i32>, <8 x i32>, i16, i1) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16, i1) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8(<2 x i32>, <4 x i32>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8(<2 x i32>, <4 x i32>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8(<2 x i32>, <4 x i32>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8(<2 x i32>, <4 x i32>, <8 x float>, i16) declare <2 x i32> @llvm.amdgcn.global.load.tr.v2i32(ptr addrspace(1)) declare <8 x i16> @llvm.amdgcn.global.load.tr.v8i16(ptr addrspace(1)) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll new file mode 100644 index 000000000000..ea377531df2a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll @@ -0,0 +1,504 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12 + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off +; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.A = fneg <8 x half> %A + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %fneg.A, <8 x half> %B, <8 x float> %C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negB(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negB: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off +; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.B = fneg <8 x half> %B + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %fneg.B, <8 x float> %C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negC(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off +; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <8 x float> %C + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %fneg.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_absC(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off +; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C) + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %fabs.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_negC(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off +; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <8 x float> %C + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> %fneg.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_absC(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off +; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C) + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> %fabs.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negA(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negA: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.A = fneg <8 x half> %A + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %fneg.A, <8 x half> %B, <8 x half> %C, i1 0) + store <8 x half> %res, <8 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.B = fneg <8 x half> %B + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %fneg.B, <8 x half> %C, i1 0) + store <8 x half> %res, <8 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <8 x half> %C + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fneg.C, i1 0) + store <8 x half> %res, <8 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_absC(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <8 x half> @llvm.fabs.v8f16(<8 x half> %C) + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fabs.C, i1 0) + store <8 x half> %res, <8 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <8 x float> %C + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C) + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <8 x float> %C + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C) + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <8 x float> %C + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C) + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <8 x float> %C + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C) + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negA(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negA: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[21:22], v[12:15], off +; GFX12-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.A = fneg <8 x half> %A + %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f16.v16f16.v8f32.i16(<8 x half> %fneg.A, <16 x half> %B, <8 x float> %C, i16 %Index) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negB(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negB: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[21:22], v[12:15], off +; GFX12-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.B = fneg <16 x half> %B + %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f16.v16f16.v8f32.i16(<8 x half> %A, <16 x half> %fneg.B, <8 x float> %C, i16 %Index) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negA(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, <8 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negA: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX12-NEXT: global_store_b128 v[17:18], v[12:15], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.A = fneg <8 x half> %A + %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v16f16.v8f16.i16(<8 x half> %fneg.A, <16 x half> %B, <8 x half> %C, i16 %Index) + store <8 x half> %res, <8 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negB(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, <8 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negB: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX12-NEXT: global_store_b128 v[17:18], v[12:15], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.B = fneg <16 x half> %B + %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v16f16.v8f16.i16(<8 x half> %A, <16 x half> %fneg.B, <8 x half> %C, i16 %Index) + store <8 x half> %res, <8 x half> addrspace(1)* %out + ret void +} + +; both neg and abs patterns (wmma matrix C f32 or f16 ) + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negabsC(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negabsC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off +; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C) + %fneg.fabs.C = fneg <8 x float> %fabs.C + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %fneg.fabs.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negabsC(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negabsC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <8 x half> @llvm.fabs.v8f16(<8 x half> %C) + %fneg.fabs.C = fneg <8 x half> %fabs.C + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fneg.fabs.C, i1 0) + store <8 x half> %res, <8 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_neg_partial_fabsA(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_neg_partial_fabsA: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_and_b32_e32 v11, 0x7fffffff, v11 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off +; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %el3 = extractelement <8 x float> %C, i32 3 + %el3.fabs = call float @llvm.fabs.f32(float %el3) + %partial.fabs.C = insertelement <8 x float> %C, float %el3.fabs, i32 3 + %fneg.partial.fabs.C = fneg <8 x float> %partial.fabs.C + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %fneg.partial.fabs.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +; A or B matrix modifier and constant in C + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA_constantC(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA_constantC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], 1.0 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off +; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.A = fneg <8 x half> %A + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %fneg.A, <8 x half> %B, <8 x float> ) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB_constantC(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB_constantC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], 1.0 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.B = fneg <8 x half> %B + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %fneg.B, <8 x half> , i1 0) + store <8 x half> %res, <8 x half> addrspace(1)* %out + ret void +} + +; pack f16 elements with v_perm_b32 since they don't come from same b32 + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC_pack(<8 x half> %A, <8 x half> %B, ptr %Caddr, <8 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC_pack: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: flat_load_b128 v[12:15], v[8:9] +; GFX12-NEXT: flat_load_b128 v[16:19], v[8:9] offset:16 +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x101 +; GFX12-NEXT: v_and_b32_e32 v8, 0xffff, v12 +; GFX12-NEXT: v_and_b32_e32 v9, 0xffff, v14 +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: v_and_b32_e32 v14, 0xffff, v16 +; GFX12-NEXT: v_and_b32_e32 v16, 0xffff, v18 +; GFX12-NEXT: v_lshl_or_b32 v12, v13, 16, v8 +; GFX12-NEXT: v_lshl_or_b32 v13, v15, 16, v9 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_lshl_or_b32 v14, v17, 16, v14 +; GFX12-NEXT: v_lshl_or_b32 v15, v19, 16, v16 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[12:15], v[0:3], v[4:7], v[12:15] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b128 v[10:11], v[12:15], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %C = load <16 x half>, ptr %Caddr + %C_shuffle = shufflevector <16 x half> %C, <16 x half> undef, <8 x i32> + %fneg.C_shuffle = fneg <8 x half> %C_shuffle + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fneg.C_shuffle , i1 0) + store <8 x half> %res, <8 x half> addrspace(1)* %out + ret void +} + +declare <8 x half> @llvm.fabs.v8f16(<8 x half>) +declare <8 x float> @llvm.fabs.v8f32(<8 x float>) +declare float @llvm.fabs.f32(float) + +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half>, <8 x half>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8i16.v8f32(<8 x i16>, <8 x i16>, <8 x float>) +declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half>, <8 x half>, <8 x half>, i1 immarg) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f16.v16f16.v8f32.i16(<8 x half>, <16 x half>, <8 x float>, i16) +declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v16f16.v8f16.i16(<8 x half>, <16 x half>, <8 x half>, i16) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-imm.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-imm.ll new file mode 100644 index 000000000000..6251dfdc392e --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-imm.ll @@ -0,0 +1,519 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12 + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], 1.0 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off +; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> ) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm_non_inlineable(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_mov_b32 s0, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s7, s0 +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: s_mov_b32 s2, s0 +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_mov_b32 s4, s0 +; GFX12-NEXT: s_mov_b32 s5, s0 +; GFX12-NEXT: s_mov_b32 s6, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v17, s7 :: v_dual_mov_b32 v16, s6 +; GFX12-NEXT: v_dual_mov_b32 v15, s5 :: v_dual_mov_b32 v14, s4 +; GFX12-NEXT: v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2 +; GFX12-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0 +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], v[10:17] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off +; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> ) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[10:17], v[0:3], v[4:7], 1.0 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off +; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> ) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm_non_inlineable(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_mov_b32 s0, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s7, s0 +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: s_mov_b32 s2, s0 +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_mov_b32 s4, s0 +; GFX12-NEXT: s_mov_b32 s5, s0 +; GFX12-NEXT: s_mov_b32 s6, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v17, s7 :: v_dual_mov_b32 v16, s6 +; GFX12-NEXT: v_dual_mov_b32 v15, s5 :: v_dual_mov_b32 v14, s4 +; GFX12-NEXT: v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2 +; GFX12-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0 +; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[10:17], v[0:3], v[4:7], v[10:17] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off +; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> ) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], 1.0 +; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> , i1 0) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm_non_inlineable(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_mov_b32 s0, 0x42004200 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: s_mov_b32 s2, s0 +; GFX12-NEXT: v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2 +; GFX12-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], v[10:13] +; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> , i1 0) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_mov_b32 s0, 0x3f803f80 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: s_mov_b32 s2, s0 +; GFX12-NEXT: v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2 +; GFX12-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[10:13], v[0:3], v[4:7], v[10:13] +; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> , i1 0) + store <8 x i16> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm_non_inlineable(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_mov_b32 s0, 0x3fc03fc0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: s_mov_b32 s2, s0 +; GFX12-NEXT: v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2 +; GFX12-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[10:13], v[0:3], v[4:7], v[10:13] +; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> , i1 0) + store <8 x i16> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[6:13], v[0:1], v[2:3], 1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> , i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_movk_i32 s0, 0x80 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s7, s0 +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: s_mov_b32 s2, s0 +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_mov_b32 s4, s0 +; GFX12-NEXT: s_mov_b32 s5, s0 +; GFX12-NEXT: s_mov_b32 s6, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6 +; GFX12-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4 +; GFX12-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2 +; GFX12-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0 +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[6:13], v[0:1], v[2:3], v[6:13] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> , i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm(i32 %A, i32 %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v0, v1, 1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: global_store_b128 v[2:3], v[8:11], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> , i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm_non_inlineable(i32 %A, i32 %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_movk_i32 s0, 0x80 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s7, s0 +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: s_mov_b32 s2, s0 +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_mov_b32 s4, s0 +; GFX12-NEXT: s_mov_b32 s5, s0 +; GFX12-NEXT: s_mov_b32 s6, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v11, s7 :: v_dual_mov_b32 v10, s6 +; GFX12-NEXT: v_dual_mov_b32 v9, s5 :: v_dual_mov_b32 v8, s4 +; GFX12-NEXT: v_dual_mov_b32 v7, s3 :: v_dual_mov_b32 v6, s2 +; GFX12-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0 +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v0, v1, v[4:11] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: global_store_b128 v[2:3], v[8:11], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> , i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[6:13], v[0:1], v[2:3], 1.0 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> ) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_mov_b32 s0, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s7, s0 +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: s_mov_b32 s2, s0 +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_mov_b32 s4, s0 +; GFX12-NEXT: s_mov_b32 s5, s0 +; GFX12-NEXT: s_mov_b32 s6, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6 +; GFX12-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4 +; GFX12-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2 +; GFX12-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0 +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[6:13], v[0:1], v[2:3], v[6:13] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> ) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[6:13], v[0:1], v[2:3], 1.0 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> ) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_mov_b32 s0, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s7, s0 +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: s_mov_b32 s2, s0 +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_mov_b32 s4, s0 +; GFX12-NEXT: s_mov_b32 s5, s0 +; GFX12-NEXT: s_mov_b32 s6, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6 +; GFX12-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4 +; GFX12-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2 +; GFX12-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0 +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[6:13], v[0:1], v[2:3], v[6:13] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> ) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[6:13], v[0:1], v[2:3], 1.0 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> ) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_mov_b32 s0, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s7, s0 +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: s_mov_b32 s2, s0 +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_mov_b32 s4, s0 +; GFX12-NEXT: s_mov_b32 s5, s0 +; GFX12-NEXT: s_mov_b32 s6, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6 +; GFX12-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4 +; GFX12-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2 +; GFX12-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0 +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[6:13], v[0:1], v[2:3], v[6:13] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> ) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[6:13], v[0:1], v[2:3], 1.0 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> ) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_mov_b32 s0, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s7, s0 +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: s_mov_b32 s2, s0 +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_mov_b32 s4, s0 +; GFX12-NEXT: s_mov_b32 s5, s0 +; GFX12-NEXT: s_mov_b32 s6, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6 +; GFX12-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4 +; GFX12-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2 +; GFX12-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0 +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[6:13], v[0:1], v[2:3], v[6:13] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> ) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[6:13], v[0:1], v[2:3], 1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> , i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_movk_i32 s0, 0x80 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s7, s0 +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: s_mov_b32 s2, s0 +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_mov_b32 s4, s0 +; GFX12-NEXT: s_mov_b32 s5, s0 +; GFX12-NEXT: s_mov_b32 s6, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6 +; GFX12-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4 +; GFX12-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2 +; GFX12-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0 +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[6:13], v[0:1], v[2:3], v[6:13] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> , i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half>, <8 x half>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16>, <8 x i16>, <8 x float>) +declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half>, <8 x half>, <8 x half>, i1 immarg) +declare <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16>, <8 x i16>, <8 x i16>, i1 immarg) +declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg) +declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 immarg, i32, i1 immarg, i32, <8 x i32>, i1 immarg) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half>, <16 x half>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16>, <16 x i16>, <8 x float>, i16) +declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half>, <16 x half>, <8 x half>, i16) +declare <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16>, <16 x i16>, <8 x i16>, i16) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16 %Index, i1 immarg) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <8 x i32>, i16 %Index, i1 immarg) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i32 %Index, i1 immarg) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-iu-modifiers.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-iu-modifiers.ll new file mode 100644 index 000000000000..fe6d16bd8b5e --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-iu-modifiers.ll @@ -0,0 +1,309 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12 + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src0(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src1(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] clamp +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + + + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src0(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[1,0,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off +; GFX12-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 1, i32 %A, i1 0, i32 %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src1(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[0,1,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off +; GFX12-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 1, i32 %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_clamp(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] clamp +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off +; GFX12-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> %C, i1 1) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + + + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src0(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src1(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] clamp +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + + + + + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src0(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 1, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src1(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_clamp(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 clamp +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 1) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + + + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src0(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[1,0,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[3:6], off +; GFX12-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 1, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src1(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[0,1,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[3:6], off +; GFX12-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 1, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_clamp(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 clamp +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[3:6], off +; GFX12-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 1) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + + + +define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src0(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 1, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src1(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 0, <2 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_clamp(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 clamp +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 1) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg) +declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 immarg, i32, i1 immarg, i32, <8 x i32>, i1 immarg) +declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16 %Index, i1 immarg) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <8 x i32>, i16 %Index, i1 immarg) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i32 %Index, i1 immarg) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-swmmac-index_key.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-swmmac-index_key.ll new file mode 100644 index 000000000000..c80d7a6d9a83 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-swmmac-index_key.ll @@ -0,0 +1,321 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12 + +define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_index_key(<8 x half> %A, <16 x half> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v20, v[20:21], off +; GFX12-NEXT: v_dual_mov_b32 v33, v19 :: v_dual_mov_b32 v32, v18 +; GFX12-NEXT: v_dual_mov_b32 v31, v17 :: v_dual_mov_b32 v30, v16 +; GFX12-NEXT: v_dual_mov_b32 v29, v15 :: v_dual_mov_b32 v28, v14 +; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[26:33], v[0:3], v[4:11], v20 +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 index_key:1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[22:23], v[26:29], off +; GFX12-NEXT: global_store_b128 v[22:23], v[30:33], off offset:16 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[24:25], v[12:15], off +; GFX12-NEXT: global_store_b128 v[24:25], v[16:19], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index0) + store <8 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index1) + store <8 x float> %res1, ptr addrspace(1) %out1 + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16_index_key(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v20, v[20:21], off +; GFX12-NEXT: v_dual_mov_b32 v33, v19 :: v_dual_mov_b32 v32, v18 +; GFX12-NEXT: v_dual_mov_b32 v31, v17 :: v_dual_mov_b32 v30, v16 +; GFX12-NEXT: v_dual_mov_b32 v29, v15 :: v_dual_mov_b32 v28, v14 +; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[26:33], v[0:3], v[4:11], v20 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 index_key:1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[22:23], v[26:29], off +; GFX12-NEXT: global_store_b128 v[22:23], v[30:33], off offset:16 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[24:25], v[12:15], off +; GFX12-NEXT: global_store_b128 v[24:25], v[16:19], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index0) + store <8 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index1) + store <8 x float> %res1, ptr addrspace(1) %out1 + ret void +} + +define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_index_key(<8 x half> %A, <16 x half> %B, <8 x half> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v16, v[16:17], off +; GFX12-NEXT: v_dual_mov_b32 v25, v15 :: v_dual_mov_b32 v24, v14 +; GFX12-NEXT: v_dual_mov_b32 v23, v13 :: v_dual_mov_b32 v22, v12 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[22:25], v[0:3], v[4:11], v16 +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 index_key:1 +; GFX12-NEXT: global_store_b128 v[18:19], v[22:25], off +; GFX12-NEXT: global_store_b128 v[20:21], v[12:15], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index0) + store <8 x half> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index1) + store <8 x half> %res1, ptr addrspace(1) %out1 + ret void +} + +define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16_index_key(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v16, v[16:17], off +; GFX12-NEXT: v_dual_mov_b32 v25, v15 :: v_dual_mov_b32 v24, v14 +; GFX12-NEXT: v_dual_mov_b32 v23, v13 :: v_dual_mov_b32 v22, v12 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[22:25], v[0:3], v[4:11], v16 +; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 index_key:1 +; GFX12-NEXT: global_store_b128 v[18:19], v[22:25], off +; GFX12-NEXT: global_store_b128 v[20:21], v[12:15], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index0) + store <8 x i16> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index1) + store <8 x i16> %res1, ptr addrspace(1) %out1 + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v14, v[14:15], off +; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12 +; GFX12-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10 +; GFX12-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8 +; GFX12-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[20:27], v[0:1], v[2:5], v14 +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 index_key:1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[20:23], off +; GFX12-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off +; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index0, i1 0) + store <8 x i32> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index1, i1 0) + store <8 x i32> %res1, ptr addrspace(1) %out1 + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_index_key(i32 %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v11, v[11:12], off +; GFX12-NEXT: v_dual_mov_b32 v24, v10 :: v_dual_mov_b32 v23, v9 +; GFX12-NEXT: v_dual_mov_b32 v22, v8 :: v_dual_mov_b32 v21, v7 +; GFX12-NEXT: v_dual_mov_b32 v20, v6 :: v_dual_mov_b32 v19, v5 +; GFX12-NEXT: v_dual_mov_b32 v18, v4 :: v_dual_mov_b32 v17, v3 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[17:24], v0, v[1:2], v11 +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 index_key:1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[13:14], v[17:20], off +; GFX12-NEXT: global_store_b128 v[13:14], v[21:24], off offset:16 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off +; GFX12-NEXT: global_store_b128 v[15:16], v[7:10], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index0, i1 0) + store <8 x i32> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index1, i1 0) + store <8 x i32> %res1, ptr addrspace(1) %out1 + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v14, v[14:15], off +; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12 +; GFX12-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10 +; GFX12-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8 +; GFX12-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[20:27], v[0:1], v[2:5], v14 +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[20:23], off +; GFX12-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off +; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index0) + store <8 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index1) + store <8 x float> %res1, ptr addrspace(1) %out1 + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v14, v[14:15], off +; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12 +; GFX12-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10 +; GFX12-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8 +; GFX12-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[20:27], v[0:1], v[2:5], v14 +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[20:23], off +; GFX12-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off +; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index0) + store <8 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index1) + store <8 x float> %res1, ptr addrspace(1) %out1 + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v14, v[14:15], off +; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12 +; GFX12-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10 +; GFX12-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8 +; GFX12-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[20:27], v[0:1], v[2:5], v14 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[20:23], off +; GFX12-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off +; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index0) + store <8 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index1) + store <8 x float> %res1, ptr addrspace(1) %out1 + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v14, v[14:15], off +; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12 +; GFX12-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10 +; GFX12-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8 +; GFX12-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[20:27], v[0:1], v[2:5], v14 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[20:23], off +; GFX12-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off +; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index0) + store <8 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index1) + store <8 x float> %res1, ptr addrspace(1) %out1 + ret void +} + +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half>, <16 x half>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16>, <16 x i16>, <8 x float>, i16) +declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half>, <16 x half>, <8 x half>, i16) +declare <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16>, <16 x i16>, <8 x i16>, i16) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16 %Index, i1 immarg) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <8 x i32>, i16 %Index, i1 immarg) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32.ll new file mode 100644 index 000000000000..c4edc5b72b2f --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32.ll @@ -0,0 +1,370 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12 + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off +; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off +; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %C, i1 0) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, i1 0) + store <8 x i16> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off +; GFX12-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + + +define amdgpu_ps void @test_swmmac_f32_16x16x32_f16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_f16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[21:22], v[12:15], off +; GFX12-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[21:22], v[12:15], off +; GFX12-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f16_16x16x32_f16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f16_16x16x32_f16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 +; GFX12-NEXT: global_store_b128 v[17:18], v[12:15], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 +; GFX12-NEXT: global_store_b128 v[17:18], v[12:15], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index) + store <8 x i16> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[3:6], off +; GFX12-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half>, <8 x half>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16>, <8 x i16>, <8 x float>) +declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half>, <8 x half>, <8 x half>, i1 immarg) +declare <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16>, <8 x i16>, <8 x i16>, i1 immarg) +declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg) +declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 immarg, i32, i1 immarg, i32, <8 x i32>, i1 immarg) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half>, <16 x half>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16>, <16 x i16>, <8 x float>, i16) +declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half>, <16 x half>, <8 x half>, i16) +declare <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16>, <16 x i16>, <8 x i16>, i16) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16 %Index, i1 immarg) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <8 x i32>, i16 %Index, i1 immarg) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i32 %Index, i1 immarg) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll new file mode 100644 index 000000000000..eafbfb6d1eeb --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll @@ -0,0 +1,459 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12 + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.A = fneg <4 x half> %A + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %fneg.A, <4 x half> %B, <4 x float> %C) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negB(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negB: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.B = fneg <4 x half> %B + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %fneg.B, <4 x float> %C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negC(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <4 x float> %C + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %fneg.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_absC(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C) + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %fabs.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_negC(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <4 x float> %C + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> %fneg.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_absC(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C) + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> %fabs.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negA(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negA: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.A = fneg <4 x half> %A + %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %fneg.A, <4 x half> %B, <4 x half> %C, i1 0) + store <4 x half> %res, <4 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.B = fneg <4 x half> %B + %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %fneg.B, <4 x half> %C, i1 0) + store <4 x half> %res, <4 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <4 x half> %C + %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fneg.C, i1 0) + store <4 x half> %res, <4 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_absC(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <4 x half> @llvm.fabs.v4f16(<4 x half> %C) + %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fabs.C, i1 0) + store <4 x half> %res, <4 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_negC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <4 x float> %C + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fneg.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_absC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C) + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fabs.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_negC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <4 x float> %C + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fneg.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_absC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C) + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fabs.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_negC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <4 x float> %C + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fneg.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_absC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C) + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fabs.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_negC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <4 x float> %C + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fneg.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_absC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C) + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fabs.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negA(<4 x half> %A, <8 x half> %B, <4 x float> %C, i16 %Index, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negA: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX12-NEXT: global_store_b128 v[11:12], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.A = fneg <4 x half> %A + %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f16.v8f16.v4f32.i16(<4 x half> %fneg.A, <8 x half> %B, <4 x float> %C, i16 %Index) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negB(<4 x half> %A, <8 x half> %B, <4 x float> %C, i16 %Index, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negB: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX12-NEXT: global_store_b128 v[11:12], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.B = fneg <8 x half> %B + %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f16.v8f16.v4f32.i16(<4 x half> %A, <8 x half> %fneg.B, <4 x float> %C, i16 %Index) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negA(<4 x half> %A, <8 x half> %B, <4 x half> %C, i16 %Index, <4 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negA: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX12-NEXT: global_store_b64 v[9:10], v[6:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.A = fneg <4 x half> %A + %res = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v8f16.v4f16.i16(<4 x half> %fneg.A, <8 x half> %B, <4 x half> %C, i16 %Index) + store <4 x half> %res, <4 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negB(<4 x half> %A, <8 x half> %B, <4 x half> %C, i16 %Index, <4 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negB: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX12-NEXT: global_store_b64 v[9:10], v[6:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.B = fneg <8 x half> %B + %res = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v8f16.v4f16.i16(<4 x half> %A, <8 x half> %fneg.B, <4 x half> %C, i16 %Index) + store <4 x half> %res, <4 x half> addrspace(1)* %out + ret void +} + +; both neg and abs patterns (wmma matrix C f32 or f16 ) + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negabsC(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negabsC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C) + %fneg.fabs.C = fneg <4 x float> %fabs.C + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %fneg.fabs.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negabsC(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negabsC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <4 x half> @llvm.fabs.v4f16(<4 x half> %C) + %fneg.fabs.C = fneg <4 x half> %fabs.C + %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fneg.fabs.C, i1 0) + store <4 x half> %res, <4 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_neg_partial_fabsA(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_neg_partial_fabsA: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_and_b32_e32 v7, 0x7fffffff, v7 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %el3 = extractelement <4 x float> %C, i32 3 + %el3.fabs = call float @llvm.fabs.f32(float %el3) + %partial.fabs.C = insertelement <4 x float> %C, float %el3.fabs, i32 3 + %fneg.partial.fabs.C = fneg <4 x float> %partial.fabs.C + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %fneg.partial.fabs.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +; A or B matrix modifier and constant in C + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA_constantC(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA_constantC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], 1.0 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.A = fneg <4 x half> %A + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %fneg.A, <4 x half> %B, <4 x float> ) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB_constantC(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB_constantC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], 1.0 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.B = fneg <4 x half> %B + %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %fneg.B, <4 x half> , i1 0) + store <4 x half> %res, <4 x half> addrspace(1)* %out + ret void +} + +; pack f16 elements with v_perm_b32 since they don't come from same b32 + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC_pack(<4 x half> %A, <4 x half> %B, ptr %Caddr, <4 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC_pack: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: flat_load_b128 v[8:11], v[4:5] +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v8 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v10 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_lshl_or_b32 v4, v9, 16, v4 +; GFX12-NEXT: v_lshl_or_b32 v5, v11, 16, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %C = load <8 x half>, ptr %Caddr + %C_shuffle = shufflevector <8 x half> %C, <8 x half> undef, <4 x i32> + %fneg.C_shuffle = fneg <4 x half> %C_shuffle + %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fneg.C_shuffle , i1 0) + store <4 x half> %res, <4 x half> addrspace(1)* %out + ret void +} + +declare <4 x half> @llvm.fabs.v4f16(<4 x half>) +declare <4 x float> @llvm.fabs.v4f32(<4 x float>) +declare float @llvm.fabs.f32(float) + +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half>, <4 x half>, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4i16.v4f32(<4 x i16>, <4 x i16>, <4 x float>) +declare <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half>, <4 x half>, <4 x half>, i1 immarg) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.i32.v4f32(i32, i32, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.i32.v4f32(i32, i32, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.i32.v4f32(i32, i32, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.i32.v4f32(i32, i32, <4 x float>) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f16.v8f16.v4f32.i16(<4 x half>, <8 x half>, <4 x float>, i16) +declare <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v8f16.v4f16.i16(<4 x half>, <8 x half>, <4 x half>, i16) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-imm.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-imm.ll new file mode 100644 index 000000000000..c4d70fd5f063 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-imm.ll @@ -0,0 +1,430 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12 + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], 1.0 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float>@llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> ) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm_non_inlineable(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_mov_b32 s0, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: s_mov_b32 s2, s0 +; GFX12-NEXT: v_mov_b32_e32 v9, s3 +; GFX12-NEXT: v_mov_b32_e32 v8, s2 +; GFX12-NEXT: v_mov_b32_e32 v7, s1 +; GFX12-NEXT: v_mov_b32_e32 v6, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], v[6:9] +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float>@llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> ) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[6:9], v[0:1], v[2:3], 1.0 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> ) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm_non_inlineable(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_mov_b32 s0, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: s_mov_b32 s2, s0 +; GFX12-NEXT: v_mov_b32_e32 v9, s3 +; GFX12-NEXT: v_mov_b32_e32 v8, s2 +; GFX12-NEXT: v_mov_b32_e32 v7, s1 +; GFX12-NEXT: v_mov_b32_e32 v6, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[6:9], v[0:1], v[2:3], v[6:9] +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> ) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], 1.0 +; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half> %A, <4 x half> %B, <4 x half> , i1 0) + store <4 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm_non_inlineable(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_mov_b32 s0, 0x42004200 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: v_mov_b32_e32 v7, s1 +; GFX12-NEXT: v_mov_b32_e32 v6, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], v[6:7] +; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half> %A, <4 x half> %B, <4 x half> , i1 0) + store <4 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_mov_b32 s0, 0x3f803f80 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: v_mov_b32_e32 v7, s1 +; GFX12-NEXT: v_mov_b32_e32 v6, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[6:7], v[0:1], v[2:3], v[6:7] +; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> , i1 0) + store <4 x i16> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm_non_inlineable(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_mov_b32 s0, 0x3fc03fc0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: v_mov_b32_e32 v7, s1 +; GFX12-NEXT: v_mov_b32_e32 v6, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[6:7], v[0:1], v[2:3], v[6:7] +; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> , i1 0) + store <4 x i16> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm(i32 %A, i32 %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:7], v0, v1, 1 +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> , i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm_non_inlineable(i32 %A, i32 %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_movk_i32 s0, 0x80 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: s_mov_b32 s2, s0 +; GFX12-NEXT: v_mov_b32_e32 v7, s3 +; GFX12-NEXT: v_mov_b32_e32 v6, s2 +; GFX12-NEXT: v_mov_b32_e32 v5, s1 +; GFX12-NEXT: v_mov_b32_e32 v4, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:7], v0, v1, v[4:7] +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> , i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm(i32 %A, i32 %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v0, v1, 1 +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> , i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm_non_inlineable(i32 %A, i32 %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_movk_i32 s0, 0x80 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: s_mov_b32 s2, s0 +; GFX12-NEXT: v_mov_b32_e32 v7, s3 +; GFX12-NEXT: v_mov_b32_e32 v6, s2 +; GFX12-NEXT: v_mov_b32_e32 v5, s1 +; GFX12-NEXT: v_mov_b32_e32 v4, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v0, v1, v[4:7] +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> , i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:7], v0, v1, 1.0 +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> ) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_mov_b32 s0, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: s_mov_b32 s2, s0 +; GFX12-NEXT: v_mov_b32_e32 v7, s3 +; GFX12-NEXT: v_mov_b32_e32 v6, s2 +; GFX12-NEXT: v_mov_b32_e32 v5, s1 +; GFX12-NEXT: v_mov_b32_e32 v4, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:7], v0, v1, v[4:7] +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> ) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:7], v0, v1, 1.0 +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> ) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_mov_b32 s0, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: s_mov_b32 s2, s0 +; GFX12-NEXT: v_mov_b32_e32 v7, s3 +; GFX12-NEXT: v_mov_b32_e32 v6, s2 +; GFX12-NEXT: v_mov_b32_e32 v5, s1 +; GFX12-NEXT: v_mov_b32_e32 v4, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:7], v0, v1, v[4:7] +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> ) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:7], v0, v1, 1.0 +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> ) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_mov_b32 s0, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: s_mov_b32 s2, s0 +; GFX12-NEXT: v_mov_b32_e32 v7, s3 +; GFX12-NEXT: v_mov_b32_e32 v6, s2 +; GFX12-NEXT: v_mov_b32_e32 v5, s1 +; GFX12-NEXT: v_mov_b32_e32 v4, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:7], v0, v1, v[4:7] +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> ) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:7], v0, v1, 1.0 +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> ) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_mov_b32 s0, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: s_mov_b32 s2, s0 +; GFX12-NEXT: v_mov_b32_e32 v7, s3 +; GFX12-NEXT: v_mov_b32_e32 v6, s2 +; GFX12-NEXT: v_mov_b32_e32 v5, s1 +; GFX12-NEXT: v_mov_b32_e32 v4, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:7], v0, v1, v[4:7] +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> ) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:7], v0, v1, 1 +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> , i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm_non_inlineable(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_movk_i32 s0, 0x80 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: s_mov_b32 s2, s0 +; GFX12-NEXT: v_mov_b32_e32 v7, s3 +; GFX12-NEXT: v_mov_b32_e32 v6, s2 +; GFX12-NEXT: v_mov_b32_e32 v5, s1 +; GFX12-NEXT: v_mov_b32_e32 v4, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:7], v0, v1, v[4:7] +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> , i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half>, <4 x half>, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16>, <4 x i16>, <4 x float>) +declare <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half>, <4 x half>, <4 x half>, i1 immarg) +declare <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16>, <4 x i16>, <4 x i16>, i1 immarg) +declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg) +declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>) +declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-iu-modifiers.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-iu-modifiers.ll new file mode 100644 index 000000000000..7e1d09805df3 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-iu-modifiers.ll @@ -0,0 +1,274 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12 + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src0(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src1(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_clamp(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] clamp +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 1) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + + + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src0(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src1(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_clamp(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] clamp +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 1) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + + + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src0(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src1(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_clamp(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] clamp +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 1) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + + + + + + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src0(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0] +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 1, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src1(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0] +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 1, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_clamp(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 clamp +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 1) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + + + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src0(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[1,0,0] +; GFX12-NEXT: global_store_b128 v[7:8], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src1(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[0,1,0] +; GFX12-NEXT: global_store_b128 v[7:8], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i16 %Index, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_clamp(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 clamp +; GFX12-NEXT: global_store_b128 v[7:8], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index, i1 1) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + + + +define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src0(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0] +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 1, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src1(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0] +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 1, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_clamp(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 clamp +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 1) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg) +declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg) +declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg) +declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i8 %Index, i1 immarg) +declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i16 %Index, i1 immarg) +declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i16 %Index, i1 immarg) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-swmmac-index_key.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-swmmac-index_key.ll new file mode 100644 index 000000000000..b6f1828dce25 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-swmmac-index_key.ll @@ -0,0 +1,472 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12 + +define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_index_key(<4 x half> %A, <8 x half> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v10, v[10:11], off +; GFX12-NEXT: v_mov_b32_e32 v23, v9 +; GFX12-NEXT: v_mov_b32_e32 v22, v8 +; GFX12-NEXT: v_mov_b32_e32 v21, v7 +; GFX12-NEXT: v_mov_b32_e32 v20, v6 +; GFX12-NEXT: v_mov_b32_e32 v27, v9 +; GFX12-NEXT: v_mov_b32_e32 v26, v8 +; GFX12-NEXT: v_mov_b32_e32 v25, v7 +; GFX12-NEXT: v_mov_b32_e32 v24, v6 +; GFX12-NEXT: v_mov_b32_e32 v31, v9 +; GFX12-NEXT: v_mov_b32_e32 v30, v8 +; GFX12-NEXT: v_mov_b32_e32 v29, v7 +; GFX12-NEXT: v_mov_b32_e32 v28, v6 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[20:23], v[0:1], v[2:5], v10 +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[24:27], v[0:1], v[2:5], v10 index_key:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[28:31], v[0:1], v[2:5], v10 index_key:2 +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:3 +; GFX12-NEXT: global_store_b128 v[12:13], v[20:23], off +; GFX12-NEXT: global_store_b128 v[14:15], v[24:27], off +; GFX12-NEXT: global_store_b128 v[16:17], v[28:31], off +; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <4 x i8> %IndexVec, i32 0 + %res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index0) + store <4 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <4 x i8> %IndexVec, i32 1 + %res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index1) + store <4 x float> %res1, ptr addrspace(1) %out1 + %Index2 = extractelement <4 x i8> %IndexVec, i32 2 + %res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index2) + store <4 x float> %res2, ptr addrspace(1) %out2 + %Index3 = extractelement <4 x i8> %IndexVec, i32 3 + %res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index3) + store <4 x float> %res3, ptr addrspace(1) %out3 + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16_index_key(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v10, v[10:11], off +; GFX12-NEXT: v_mov_b32_e32 v23, v9 +; GFX12-NEXT: v_mov_b32_e32 v22, v8 +; GFX12-NEXT: v_mov_b32_e32 v21, v7 +; GFX12-NEXT: v_mov_b32_e32 v20, v6 +; GFX12-NEXT: v_mov_b32_e32 v27, v9 +; GFX12-NEXT: v_mov_b32_e32 v26, v8 +; GFX12-NEXT: v_mov_b32_e32 v25, v7 +; GFX12-NEXT: v_mov_b32_e32 v24, v6 +; GFX12-NEXT: v_mov_b32_e32 v31, v9 +; GFX12-NEXT: v_mov_b32_e32 v30, v8 +; GFX12-NEXT: v_mov_b32_e32 v29, v7 +; GFX12-NEXT: v_mov_b32_e32 v28, v6 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[20:23], v[0:1], v[2:5], v10 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[24:27], v[0:1], v[2:5], v10 index_key:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[28:31], v[0:1], v[2:5], v10 index_key:2 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:3 +; GFX12-NEXT: global_store_b128 v[12:13], v[20:23], off +; GFX12-NEXT: global_store_b128 v[14:15], v[24:27], off +; GFX12-NEXT: global_store_b128 v[16:17], v[28:31], off +; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <4 x i8> %IndexVec, i32 0 + %res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index0) + store <4 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <4 x i8> %IndexVec, i32 1 + %res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index1) + store <4 x float> %res1, ptr addrspace(1) %out1 + %Index2 = extractelement <4 x i8> %IndexVec, i32 2 + %res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index2) + store <4 x float> %res2, ptr addrspace(1) %out2 + %Index3 = extractelement <4 x i8> %IndexVec, i32 3 + %res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index3) + store <4 x float> %res3, ptr addrspace(1) %out3 + ret void +} + +define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_index_key(<4 x half> %A, <8 x half> %B, <4 x half> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) { +; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v22, v[8:9], off +; GFX12-NEXT: v_mov_b32_e32 v9, v7 +; GFX12-NEXT: v_mov_b32_e32 v8, v6 +; GFX12-NEXT: v_mov_b32_e32 v19, v7 +; GFX12-NEXT: v_mov_b32_e32 v18, v6 +; GFX12-NEXT: v_mov_b32_e32 v21, v7 +; GFX12-NEXT: v_mov_b32_e32 v20, v6 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[8:9], v[0:1], v[2:5], v22 +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[18:19], v[0:1], v[2:5], v22 index_key:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[20:21], v[0:1], v[2:5], v22 index_key:2 +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v22 index_key:3 +; GFX12-NEXT: global_store_b64 v[10:11], v[8:9], off +; GFX12-NEXT: global_store_b64 v[12:13], v[18:19], off +; GFX12-NEXT: global_store_b64 v[14:15], v[20:21], off +; GFX12-NEXT: global_store_b64 v[16:17], v[6:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <4 x i8> %IndexVec, i32 0 + %res0 = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index0) + store <4 x half> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <4 x i8> %IndexVec, i32 1 + %res1 = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index1) + store <4 x half> %res1, ptr addrspace(1) %out1 + %Index2 = extractelement <4 x i8> %IndexVec, i32 2 + %res2 = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index2) + store <4 x half> %res2, ptr addrspace(1) %out2 + %Index3 = extractelement <4 x i8> %IndexVec, i32 3 + %res3 = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index3) + store <4 x half> %res3, ptr addrspace(1) %out3 + ret void +} + +define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16_index_key(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) { +; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v22, v[8:9], off +; GFX12-NEXT: v_mov_b32_e32 v9, v7 +; GFX12-NEXT: v_mov_b32_e32 v8, v6 +; GFX12-NEXT: v_mov_b32_e32 v19, v7 +; GFX12-NEXT: v_mov_b32_e32 v18, v6 +; GFX12-NEXT: v_mov_b32_e32 v21, v7 +; GFX12-NEXT: v_mov_b32_e32 v20, v6 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[8:9], v[0:1], v[2:5], v22 +; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[18:19], v[0:1], v[2:5], v22 index_key:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[20:21], v[0:1], v[2:5], v22 index_key:2 +; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v22 index_key:3 +; GFX12-NEXT: global_store_b64 v[10:11], v[8:9], off +; GFX12-NEXT: global_store_b64 v[12:13], v[18:19], off +; GFX12-NEXT: global_store_b64 v[14:15], v[20:21], off +; GFX12-NEXT: global_store_b64 v[16:17], v[6:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <4 x i8> %IndexVec, i32 0 + %res0 = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index0) + store <4 x i16> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <4 x i8> %IndexVec, i32 1 + %res1 = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index1) + store <4 x i16> %res1, ptr addrspace(1) %out1 + %Index2 = extractelement <4 x i8> %IndexVec, i32 2 + %res2 = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index2) + store <4 x i16> %res2, ptr addrspace(1) %out2 + %Index3 = extractelement <4 x i8> %IndexVec, i32 3 + %res3 = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index3) + store <4 x i16> %res3, ptr addrspace(1) %out3 + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_index_key(i32 %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v7, v[7:8], off +; GFX12-NEXT: v_mov_b32_e32 v20, v6 +; GFX12-NEXT: v_mov_b32_e32 v19, v5 +; GFX12-NEXT: v_mov_b32_e32 v18, v4 +; GFX12-NEXT: v_mov_b32_e32 v17, v3 +; GFX12-NEXT: v_mov_b32_e32 v24, v6 +; GFX12-NEXT: v_mov_b32_e32 v23, v5 +; GFX12-NEXT: v_mov_b32_e32 v22, v4 +; GFX12-NEXT: v_mov_b32_e32 v21, v3 +; GFX12-NEXT: v_mov_b32_e32 v28, v6 +; GFX12-NEXT: v_mov_b32_e32 v27, v5 +; GFX12-NEXT: v_mov_b32_e32 v26, v4 +; GFX12-NEXT: v_mov_b32_e32 v25, v3 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[17:20], v0, v[1:2], v7 +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[21:24], v0, v[1:2], v7 index_key:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[25:28], v0, v[1:2], v7 index_key:2 +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:3 +; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off +; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off +; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off +; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <4 x i8> %IndexVec, i32 0 + %res0 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index0, i1 0) + store <4 x i32> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <4 x i8> %IndexVec, i32 1 + %res1 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index1, i1 0) + store <4 x i32> %res1, ptr addrspace(1) %out1 + %Index2 = extractelement <4 x i8> %IndexVec, i32 2 + %res2 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index2, i1 0) + store <4 x i32> %res2, ptr addrspace(1) %out2 + %Index3 = extractelement <4 x i8> %IndexVec, i32 3 + %res3 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index3, i1 0) + store <4 x i32> %res3, ptr addrspace(1) %out3 + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_index_key(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v6, v[6:7], off +; GFX12-NEXT: v_mov_b32_e32 v15, v5 +; GFX12-NEXT: v_mov_b32_e32 v14, v4 +; GFX12-NEXT: v_mov_b32_e32 v13, v3 +; GFX12-NEXT: v_mov_b32_e32 v12, v2 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[12:15], v0, v1, v6 +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 index_key:1 +; GFX12-NEXT: global_store_b128 v[8:9], v[12:15], off +; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index0, i1 0) + store <4 x i32> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index1, i1 0) + store <4 x i32> %res1, ptr addrspace(1) %out1 + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_index_key(i32 %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v7, v[7:8], off +; GFX12-NEXT: v_mov_b32_e32 v16, v6 +; GFX12-NEXT: v_mov_b32_e32 v15, v5 +; GFX12-NEXT: v_mov_b32_e32 v14, v4 +; GFX12-NEXT: v_mov_b32_e32 v13, v3 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[13:16], v0, v[1:2], v7 +; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 index_key:1 +; GFX12-NEXT: global_store_b128 v[9:10], v[13:16], off +; GFX12-NEXT: global_store_b128 v[11:12], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index0, i1 0) + store <4 x i32> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index1, i1 0) + store <4 x i32> %res1, ptr addrspace(1) %out1 + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v7, v[7:8], off +; GFX12-NEXT: v_mov_b32_e32 v20, v6 +; GFX12-NEXT: v_mov_b32_e32 v19, v5 +; GFX12-NEXT: v_mov_b32_e32 v18, v4 +; GFX12-NEXT: v_mov_b32_e32 v17, v3 +; GFX12-NEXT: v_mov_b32_e32 v24, v6 +; GFX12-NEXT: v_mov_b32_e32 v23, v5 +; GFX12-NEXT: v_mov_b32_e32 v22, v4 +; GFX12-NEXT: v_mov_b32_e32 v21, v3 +; GFX12-NEXT: v_mov_b32_e32 v28, v6 +; GFX12-NEXT: v_mov_b32_e32 v27, v5 +; GFX12-NEXT: v_mov_b32_e32 v26, v4 +; GFX12-NEXT: v_mov_b32_e32 v25, v3 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[17:20], v0, v[1:2], v7 +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[21:24], v0, v[1:2], v7 index_key:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[25:28], v0, v[1:2], v7 index_key:2 +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:3 +; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off +; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off +; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off +; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <4 x i8> %IndexVec, i32 0 + %res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index0) + store <4 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <4 x i8> %IndexVec, i32 1 + %res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index1) + store <4 x float> %res1, ptr addrspace(1) %out1 + %Index2 = extractelement <4 x i8> %IndexVec, i32 2 + %res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index2) + store <4 x float> %res2, ptr addrspace(1) %out2 + %Index3 = extractelement <4 x i8> %IndexVec, i32 3 + %res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index3) + store <4 x float> %res3, ptr addrspace(1) %out3 + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v7, v[7:8], off +; GFX12-NEXT: v_mov_b32_e32 v20, v6 +; GFX12-NEXT: v_mov_b32_e32 v19, v5 +; GFX12-NEXT: v_mov_b32_e32 v18, v4 +; GFX12-NEXT: v_mov_b32_e32 v17, v3 +; GFX12-NEXT: v_mov_b32_e32 v24, v6 +; GFX12-NEXT: v_mov_b32_e32 v23, v5 +; GFX12-NEXT: v_mov_b32_e32 v22, v4 +; GFX12-NEXT: v_mov_b32_e32 v21, v3 +; GFX12-NEXT: v_mov_b32_e32 v28, v6 +; GFX12-NEXT: v_mov_b32_e32 v27, v5 +; GFX12-NEXT: v_mov_b32_e32 v26, v4 +; GFX12-NEXT: v_mov_b32_e32 v25, v3 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[17:20], v0, v[1:2], v7 +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[21:24], v0, v[1:2], v7 index_key:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[25:28], v0, v[1:2], v7 index_key:2 +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:3 +; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off +; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off +; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off +; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <4 x i8> %IndexVec, i32 0 + %res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index0) + store <4 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <4 x i8> %IndexVec, i32 1 + %res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index1) + store <4 x float> %res1, ptr addrspace(1) %out1 + %Index2 = extractelement <4 x i8> %IndexVec, i32 2 + %res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index2) + store <4 x float> %res2, ptr addrspace(1) %out2 + %Index3 = extractelement <4 x i8> %IndexVec, i32 3 + %res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index3) + store <4 x float> %res3, ptr addrspace(1) %out3 + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v7, v[7:8], off +; GFX12-NEXT: v_mov_b32_e32 v20, v6 +; GFX12-NEXT: v_mov_b32_e32 v19, v5 +; GFX12-NEXT: v_mov_b32_e32 v18, v4 +; GFX12-NEXT: v_mov_b32_e32 v17, v3 +; GFX12-NEXT: v_mov_b32_e32 v24, v6 +; GFX12-NEXT: v_mov_b32_e32 v23, v5 +; GFX12-NEXT: v_mov_b32_e32 v22, v4 +; GFX12-NEXT: v_mov_b32_e32 v21, v3 +; GFX12-NEXT: v_mov_b32_e32 v28, v6 +; GFX12-NEXT: v_mov_b32_e32 v27, v5 +; GFX12-NEXT: v_mov_b32_e32 v26, v4 +; GFX12-NEXT: v_mov_b32_e32 v25, v3 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[17:20], v0, v[1:2], v7 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[21:24], v0, v[1:2], v7 index_key:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[25:28], v0, v[1:2], v7 index_key:2 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:3 +; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off +; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off +; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off +; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <4 x i8> %IndexVec, i32 0 + %res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index0) + store <4 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <4 x i8> %IndexVec, i32 1 + %res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index1) + store <4 x float> %res1, ptr addrspace(1) %out1 + %Index2 = extractelement <4 x i8> %IndexVec, i32 2 + %res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index2) + store <4 x float> %res2, ptr addrspace(1) %out2 + %Index3 = extractelement <4 x i8> %IndexVec, i32 3 + %res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index3) + store <4 x float> %res3, ptr addrspace(1) %out3 + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v7, v[7:8], off +; GFX12-NEXT: v_mov_b32_e32 v20, v6 +; GFX12-NEXT: v_mov_b32_e32 v19, v5 +; GFX12-NEXT: v_mov_b32_e32 v18, v4 +; GFX12-NEXT: v_mov_b32_e32 v17, v3 +; GFX12-NEXT: v_mov_b32_e32 v24, v6 +; GFX12-NEXT: v_mov_b32_e32 v23, v5 +; GFX12-NEXT: v_mov_b32_e32 v22, v4 +; GFX12-NEXT: v_mov_b32_e32 v21, v3 +; GFX12-NEXT: v_mov_b32_e32 v28, v6 +; GFX12-NEXT: v_mov_b32_e32 v27, v5 +; GFX12-NEXT: v_mov_b32_e32 v26, v4 +; GFX12-NEXT: v_mov_b32_e32 v25, v3 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[17:20], v0, v[1:2], v7 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[21:24], v0, v[1:2], v7 index_key:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[25:28], v0, v[1:2], v7 index_key:2 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:3 +; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off +; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off +; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off +; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <4 x i8> %IndexVec, i32 0 + %res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index0) + store <4 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <4 x i8> %IndexVec, i32 1 + %res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index1) + store <4 x float> %res1, ptr addrspace(1) %out1 + %Index2 = extractelement <4 x i8> %IndexVec, i32 2 + %res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index2) + store <4 x float> %res2, ptr addrspace(1) %out2 + %Index3 = extractelement <4 x i8> %IndexVec, i32 3 + %res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index3) + store <4 x float> %res3, ptr addrspace(1) %out3 + ret void +} + +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half>, <8 x half>, <4 x float>, i8) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16>, <8 x i16>, <4 x float>, i8) +declare <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half>, <8 x half>, <4 x half>, i8) +declare <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16>, <8 x i16>, <4 x i16>, i8) +declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i8 %Index, i1 immarg) +declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i16 %Index, i1 immarg) +declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i16 %Index, i1 immarg) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64.ll new file mode 100644 index 000000000000..0d1871a18d40 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64.ll @@ -0,0 +1,333 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12 + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] +; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float>@llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %C) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] +; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> %C) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] +; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half> %A, <4 x half> %B, <4 x half> %C, i1 0) + store <4 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] +; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, i1 0) + store <4 x i16> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> %C) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> %C) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> %C) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> %C) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_f16(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_f16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 +; GFX12-NEXT: global_store_b128 v[11:12], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 +; GFX12-NEXT: global_store_b128 v[11:12], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f16_16x16x32_f16(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f16_16x16x32_f16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 +; GFX12-NEXT: global_store_b64 v[9:10], v[6:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index) + store <4 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 +; GFX12-NEXT: global_store_b64 v[9:10], v[6:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index) + store <4 x i16> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 +; GFX12-NEXT: global_store_b128 v[7:8], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half>, <4 x half>, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16>, <4 x i16>, <4 x float>) +declare <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half>, <4 x half>, <4 x half>, i1 immarg) +declare <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16>, <4 x i16>, <4 x i16>, i1 immarg) +declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg) +declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>) +declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half>, <8 x half>, <4 x float>, i8) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16>, <8 x i16>, <4 x float>, i8) +declare <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half>, <8 x half>, <4 x half>, i8) +declare <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16>, <8 x i16>, <4 x i16>, i8) +declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i8 %Index, i1 immarg) +declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i16 %Index, i1 immarg) +declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i16 %Index, i1 immarg) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8) diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll new file mode 100644 index 000000000000..fb2af36839b5 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll @@ -0,0 +1,499 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12 + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.A = fneg <8 x half> %A + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %fneg.A, <8 x half> %B, <8 x float> %C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negB(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negB: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.B = fneg <8 x half> %B + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %fneg.B, <8 x float> %C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negC(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <8 x float> %C + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %fneg.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_absC(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C) + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %fabs.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_negC(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <8 x float> %C + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> %fneg.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_absC(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C) + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> %fabs.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negA(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negA: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.A = fneg <8 x half> %A + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %fneg.A, <8 x half> %B, <8 x half> %C, i1 0) + store <8 x half> %res, <8 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.B = fneg <8 x half> %B + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %fneg.B, <8 x half> %C, i1 0) + store <8 x half> %res, <8 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <8 x half> %C + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fneg.C, i1 0) + store <8 x half> %res, <8 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_absC(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <8 x half> @llvm.fabs.v8f16(<8 x half> %C) + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fabs.C, i1 0) + store <8 x half> %res, <8 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <8 x float> %C + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C) + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <8 x float> %C + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C) + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <8 x float> %C + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C) + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <8 x float> %C + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C) + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negA(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negA: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16 +; GFX12-NEXT: global_store_b128 v[21:22], v[12:15], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.A = fneg <8 x half> %A + %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f16.v16f16.v8f32.i16(<8 x half> %fneg.A, <16 x half> %B, <8 x float> %C, i16 %Index) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negB(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negB: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16 +; GFX12-NEXT: global_store_b128 v[21:22], v[12:15], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.B = fneg <16 x half> %B + %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f16.v16f16.v8f32.i16(<8 x half> %A, <16 x half> %fneg.B, <8 x float> %C, i16 %Index) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negA(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, <8 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negA: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX12-NEXT: global_store_b128 v[17:18], v[12:15], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.A = fneg <8 x half> %A + %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v16f16.v8f16.i16(<8 x half> %fneg.A, <16 x half> %B, <8 x half> %C, i16 %Index) + store <8 x half> %res, <8 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negB(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, <8 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negB: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX12-NEXT: global_store_b128 v[17:18], v[12:15], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.B = fneg <16 x half> %B + %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v16f16.v8f16.i16(<8 x half> %A, <16 x half> %fneg.B, <8 x half> %C, i16 %Index) + store <8 x half> %res, <8 x half> addrspace(1)* %out + ret void +} + +; both neg and abs patterns (wmma matrix C f32 or f16 ) + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negabsC(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negabsC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C) + %fneg.fabs.C = fneg <8 x float> %fabs.C + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %fneg.fabs.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negabsC(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negabsC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <8 x half> @llvm.fabs.v8f16(<8 x half> %C) + %fneg.fabs.C = fneg <8 x half> %fabs.C + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fneg.fabs.C, i1 0) + store <8 x half> %res, <8 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_neg_partial_fabsA(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_neg_partial_fabsA: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_and_b32_e32 v11, 0x7fffffff, v11 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %el3 = extractelement <8 x float> %C, i32 3 + %el3.fabs = call float @llvm.fabs.f32(float %el3) + %partial.fabs.C = insertelement <8 x float> %C, float %el3.fabs, i32 3 + %fneg.partial.fabs.C = fneg <8 x float> %partial.fabs.C + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %fneg.partial.fabs.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +; A or B matrix modifier and constant in C + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA_constantC(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA_constantC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], 1.0 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16 +; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.A = fneg <8 x half> %A + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %fneg.A, <8 x half> %B, <8 x float> ) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB_constantC(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB_constantC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], 1.0 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.B = fneg <8 x half> %B + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %fneg.B, <8 x half> , i1 0) + store <8 x half> %res, <8 x half> addrspace(1)* %out + ret void +} + +; pack f16 elements with v_perm_b32 since they don't come from same b32 + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC_pack(<8 x half> %A, <8 x half> %B, ptr %Caddr, <8 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC_pack: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: flat_load_b128 v[12:15], v[8:9] offset:16 +; GFX12-NEXT: flat_load_b128 v[16:19], v[8:9] +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x101 +; GFX12-NEXT: v_perm_b32 v15, v15, v14, 0x5040100 +; GFX12-NEXT: v_perm_b32 v14, v13, v12, 0x5040100 +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: v_perm_b32 v13, v19, v18, 0x5040100 +; GFX12-NEXT: v_perm_b32 v12, v17, v16, 0x5040100 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[12:15], v[0:3], v[4:7], v[12:15] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b128 v[10:11], v[12:15], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %C = load <16 x half>, ptr %Caddr + %C_shuffle = shufflevector <16 x half> %C, <16 x half> undef, <8 x i32> + %fneg.C_shuffle = fneg <8 x half> %C_shuffle + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fneg.C_shuffle , i1 0) + store <8 x half> %res, <8 x half> addrspace(1)* %out + ret void +} + +declare <8 x half> @llvm.fabs.v8f16(<8 x half>) +declare <8 x float> @llvm.fabs.v8f32(<8 x float>) +declare float @llvm.fabs.f32(float) + +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half>, <8 x half>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8i16.v8f32(<8 x i16>, <8 x i16>, <8 x float>) +declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half>, <8 x half>, <8 x half>, i1 immarg) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f16.v16f16.v8f32.i16(<8 x half>, <16 x half>, <8 x float>, i16) +declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v16f16.v8f16.i16(<8 x half>, <16 x half>, <8 x half>, i16) diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-imm.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-imm.ll new file mode 100644 index 000000000000..51f93b57f38e --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-imm.ll @@ -0,0 +1,431 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12 + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], 1.0 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16 +; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> ) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm_non_inlineable(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v10, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v11, v10 :: v_dual_mov_b32 v12, v10 +; GFX12-NEXT: v_dual_mov_b32 v13, v10 :: v_dual_mov_b32 v14, v10 +; GFX12-NEXT: v_dual_mov_b32 v15, v10 :: v_dual_mov_b32 v16, v10 +; GFX12-NEXT: v_mov_b32_e32 v17, v10 +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], v[10:17] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16 +; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> ) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[10:17], v[0:3], v[4:7], 1.0 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16 +; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> ) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm_non_inlineable(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v10, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v11, v10 :: v_dual_mov_b32 v12, v10 +; GFX12-NEXT: v_dual_mov_b32 v13, v10 :: v_dual_mov_b32 v14, v10 +; GFX12-NEXT: v_dual_mov_b32 v15, v10 :: v_dual_mov_b32 v16, v10 +; GFX12-NEXT: v_mov_b32_e32 v17, v10 +; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[10:17], v[0:3], v[4:7], v[10:17] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16 +; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> ) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], 1.0 +; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> , i1 0) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm_non_inlineable(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v10, 0x42004200 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v11, v10 :: v_dual_mov_b32 v12, v10 +; GFX12-NEXT: v_mov_b32_e32 v13, v10 +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], v[10:13] +; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> , i1 0) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[10:13], v[0:3], v[4:7], 1.0 +; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> , i1 0) + store <8 x i16> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm_non_inlineable(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v10, 0x3fc03fc0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v11, v10 :: v_dual_mov_b32 v12, v10 +; GFX12-NEXT: v_mov_b32_e32 v13, v10 +; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[10:13], v[0:3], v[4:7], v[10:13] +; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> , i1 0) + store <8 x i16> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[6:13], v[0:1], v[2:3], 1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> , i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v6, 0x80 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6 +; GFX12-NEXT: v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v10, v6 +; GFX12-NEXT: v_dual_mov_b32 v11, v6 :: v_dual_mov_b32 v12, v6 +; GFX12-NEXT: v_mov_b32_e32 v13, v6 +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[6:13], v[0:1], v[2:3], v[6:13] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> , i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm(i32 %A, i32 %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v0, v1, 1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[2:3], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> , i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm_non_inlineable(i32 %A, i32 %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v4, 0x80 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: v_dual_mov_b32 v7, v4 :: v_dual_mov_b32 v8, v4 +; GFX12-NEXT: v_dual_mov_b32 v9, v4 :: v_dual_mov_b32 v10, v4 +; GFX12-NEXT: v_mov_b32_e32 v11, v4 +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v0, v1, v[4:11] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[2:3], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> , i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[6:13], v[0:1], v[2:3], 1.0 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> ) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v6, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6 +; GFX12-NEXT: v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v10, v6 +; GFX12-NEXT: v_dual_mov_b32 v11, v6 :: v_dual_mov_b32 v12, v6 +; GFX12-NEXT: v_mov_b32_e32 v13, v6 +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[6:13], v[0:1], v[2:3], v[6:13] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> ) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[6:13], v[0:1], v[2:3], 1.0 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> ) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v6, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6 +; GFX12-NEXT: v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v10, v6 +; GFX12-NEXT: v_dual_mov_b32 v11, v6 :: v_dual_mov_b32 v12, v6 +; GFX12-NEXT: v_mov_b32_e32 v13, v6 +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[6:13], v[0:1], v[2:3], v[6:13] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> ) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[6:13], v[0:1], v[2:3], 1.0 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> ) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v6, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6 +; GFX12-NEXT: v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v10, v6 +; GFX12-NEXT: v_dual_mov_b32 v11, v6 :: v_dual_mov_b32 v12, v6 +; GFX12-NEXT: v_mov_b32_e32 v13, v6 +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[6:13], v[0:1], v[2:3], v[6:13] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> ) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[6:13], v[0:1], v[2:3], 1.0 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> ) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v6, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6 +; GFX12-NEXT: v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v10, v6 +; GFX12-NEXT: v_dual_mov_b32 v11, v6 :: v_dual_mov_b32 v12, v6 +; GFX12-NEXT: v_mov_b32_e32 v13, v6 +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[6:13], v[0:1], v[2:3], v[6:13] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> ) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[6:13], v[0:1], v[2:3], 1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> , i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v6, 0x80 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6 +; GFX12-NEXT: v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v10, v6 +; GFX12-NEXT: v_dual_mov_b32 v11, v6 :: v_dual_mov_b32 v12, v6 +; GFX12-NEXT: v_mov_b32_e32 v13, v6 +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[6:13], v[0:1], v[2:3], v[6:13] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> , i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half>, <8 x half>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16>, <8 x i16>, <8 x float>) +declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half>, <8 x half>, <8 x half>, i1 immarg) +declare <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16>, <8 x i16>, <8 x i16>, i1 immarg) +declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg) +declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 immarg, i32, i1 immarg, i32, <8 x i32>, i1 immarg) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half>, <16 x half>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16>, <16 x i16>, <8 x float>, i16) +declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half>, <16 x half>, <8 x half>, i16) +declare <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16>, <16 x i16>, <8 x i16>, i16) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16 %Index, i1 immarg) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <8 x i32>, i16 %Index, i1 immarg) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i32 %Index, i1 immarg) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-iu-modifiers.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-iu-modifiers.ll new file mode 100644 index 000000000000..48297932aa59 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-iu-modifiers.ll @@ -0,0 +1,309 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12 + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src0(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src1(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] clamp +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + + + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src0(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[1,0,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16 +; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 1, i32 %A, i1 0, i32 %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src1(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[0,1,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16 +; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 1, i32 %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_clamp(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] clamp +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16 +; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> %C, i1 1) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + + + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src0(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src1(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] clamp +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + + + + + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src0(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 1, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src1(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_clamp(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 clamp +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 1) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + + + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src0(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[1,0,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 1, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src1(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[0,1,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 1, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_clamp(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 clamp +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 1) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + + + +define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src0(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 1, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src1(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 0, <2 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_clamp(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 clamp +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 1) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg) +declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 immarg, i32, i1 immarg, i32, <8 x i32>, i1 immarg) +declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16 %Index, i1 immarg) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <8 x i32>, i16 %Index, i1 immarg) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i32 %Index, i1 immarg) diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-swmmac-index_key.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-swmmac-index_key.ll new file mode 100644 index 000000000000..43538768f00f --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-swmmac-index_key.ll @@ -0,0 +1,321 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12 + +define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_index_key(<8 x half> %A, <16 x half> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v20, v[20:21], off +; GFX12-NEXT: v_dual_mov_b32 v33, v19 :: v_dual_mov_b32 v32, v18 +; GFX12-NEXT: v_dual_mov_b32 v31, v17 :: v_dual_mov_b32 v30, v16 +; GFX12-NEXT: v_dual_mov_b32 v29, v15 :: v_dual_mov_b32 v28, v14 +; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[26:33], v[0:3], v[4:11], v20 +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 index_key:1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[22:23], v[30:33], off offset:16 +; GFX12-NEXT: global_store_b128 v[22:23], v[26:29], off +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[24:25], v[16:19], off offset:16 +; GFX12-NEXT: global_store_b128 v[24:25], v[12:15], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index0) + store <8 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index1) + store <8 x float> %res1, ptr addrspace(1) %out1 + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16_index_key(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v20, v[20:21], off +; GFX12-NEXT: v_dual_mov_b32 v33, v19 :: v_dual_mov_b32 v32, v18 +; GFX12-NEXT: v_dual_mov_b32 v31, v17 :: v_dual_mov_b32 v30, v16 +; GFX12-NEXT: v_dual_mov_b32 v29, v15 :: v_dual_mov_b32 v28, v14 +; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[26:33], v[0:3], v[4:11], v20 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 index_key:1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[22:23], v[30:33], off offset:16 +; GFX12-NEXT: global_store_b128 v[22:23], v[26:29], off +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[24:25], v[16:19], off offset:16 +; GFX12-NEXT: global_store_b128 v[24:25], v[12:15], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index0) + store <8 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index1) + store <8 x float> %res1, ptr addrspace(1) %out1 + ret void +} + +define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_index_key(<8 x half> %A, <16 x half> %B, <8 x half> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v16, v[16:17], off +; GFX12-NEXT: v_dual_mov_b32 v25, v15 :: v_dual_mov_b32 v24, v14 +; GFX12-NEXT: v_dual_mov_b32 v23, v13 :: v_dual_mov_b32 v22, v12 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[22:25], v[0:3], v[4:11], v16 +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 index_key:1 +; GFX12-NEXT: global_store_b128 v[18:19], v[22:25], off +; GFX12-NEXT: global_store_b128 v[20:21], v[12:15], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index0) + store <8 x half> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index1) + store <8 x half> %res1, ptr addrspace(1) %out1 + ret void +} + +define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16_index_key(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v16, v[16:17], off +; GFX12-NEXT: v_dual_mov_b32 v25, v15 :: v_dual_mov_b32 v24, v14 +; GFX12-NEXT: v_dual_mov_b32 v23, v13 :: v_dual_mov_b32 v22, v12 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[22:25], v[0:3], v[4:11], v16 +; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 index_key:1 +; GFX12-NEXT: global_store_b128 v[18:19], v[22:25], off +; GFX12-NEXT: global_store_b128 v[20:21], v[12:15], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index0) + store <8 x i16> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index1) + store <8 x i16> %res1, ptr addrspace(1) %out1 + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v14, v[14:15], off +; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12 +; GFX12-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10 +; GFX12-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8 +; GFX12-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[20:27], v[0:1], v[2:5], v14 +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 index_key:1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16 +; GFX12-NEXT: global_store_b128 v[16:17], v[20:23], off +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index0, i1 0) + store <8 x i32> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index1, i1 0) + store <8 x i32> %res1, ptr addrspace(1) %out1 + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_index_key(i32 %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v11, v[11:12], off +; GFX12-NEXT: v_dual_mov_b32 v24, v10 :: v_dual_mov_b32 v23, v9 +; GFX12-NEXT: v_dual_mov_b32 v22, v8 :: v_dual_mov_b32 v21, v7 +; GFX12-NEXT: v_dual_mov_b32 v20, v6 :: v_dual_mov_b32 v19, v5 +; GFX12-NEXT: v_dual_mov_b32 v18, v4 :: v_dual_mov_b32 v17, v3 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[17:24], v0, v[1:2], v11 +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 index_key:1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[13:14], v[21:24], off offset:16 +; GFX12-NEXT: global_store_b128 v[13:14], v[17:20], off +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[7:10], off offset:16 +; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index0, i1 0) + store <8 x i32> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index1, i1 0) + store <8 x i32> %res1, ptr addrspace(1) %out1 + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v14, v[14:15], off +; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12 +; GFX12-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10 +; GFX12-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8 +; GFX12-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[20:27], v[0:1], v[2:5], v14 +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16 +; GFX12-NEXT: global_store_b128 v[16:17], v[20:23], off +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index0) + store <8 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index1) + store <8 x float> %res1, ptr addrspace(1) %out1 + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v14, v[14:15], off +; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12 +; GFX12-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10 +; GFX12-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8 +; GFX12-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[20:27], v[0:1], v[2:5], v14 +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16 +; GFX12-NEXT: global_store_b128 v[16:17], v[20:23], off +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index0) + store <8 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index1) + store <8 x float> %res1, ptr addrspace(1) %out1 + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v14, v[14:15], off +; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12 +; GFX12-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10 +; GFX12-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8 +; GFX12-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[20:27], v[0:1], v[2:5], v14 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16 +; GFX12-NEXT: global_store_b128 v[16:17], v[20:23], off +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index0) + store <8 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index1) + store <8 x float> %res1, ptr addrspace(1) %out1 + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v14, v[14:15], off +; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12 +; GFX12-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10 +; GFX12-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8 +; GFX12-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[20:27], v[0:1], v[2:5], v14 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16 +; GFX12-NEXT: global_store_b128 v[16:17], v[20:23], off +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index0) + store <8 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index1) + store <8 x float> %res1, ptr addrspace(1) %out1 + ret void +} + +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half>, <16 x half>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16>, <16 x i16>, <8 x float>, i16) +declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half>, <16 x half>, <8 x half>, i16) +declare <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16>, <16 x i16>, <8 x i16>, i16) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16 %Index, i1 immarg) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <8 x i32>, i16 %Index, i1 immarg) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32.ll new file mode 100644 index 000000000000..2db3b07de54d --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32.ll @@ -0,0 +1,370 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12 + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %C, i1 0) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, i1 0) + store <8 x i16> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16 +; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + + +define amdgpu_ps void @test_swmmac_f32_16x16x32_f16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_f16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16 +; GFX12-NEXT: global_store_b128 v[21:22], v[12:15], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16 +; GFX12-NEXT: global_store_b128 v[21:22], v[12:15], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f16_16x16x32_f16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f16_16x16x32_f16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 +; GFX12-NEXT: global_store_b128 v[17:18], v[12:15], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 +; GFX12-NEXT: global_store_b128 v[17:18], v[12:15], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index) + store <8 x i16> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half>, <8 x half>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16>, <8 x i16>, <8 x float>) +declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half>, <8 x half>, <8 x half>, i1 immarg) +declare <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16>, <8 x i16>, <8 x i16>, i1 immarg) +declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg) +declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 immarg, i32, i1 immarg, i32, <8 x i32>, i1 immarg) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half>, <16 x half>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16>, <16 x i16>, <8 x float>, i16) +declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half>, <16 x half>, <8 x half>, i16) +declare <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16>, <16 x i16>, <8 x i16>, i16) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16 %Index, i1 immarg) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <8 x i32>, i16 %Index, i1 immarg) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i32 %Index, i1 immarg) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll new file mode 100644 index 000000000000..1f2ffaf76271 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll @@ -0,0 +1,456 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12 + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.A = fneg <4 x half> %A + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %fneg.A, <4 x half> %B, <4 x float> %C) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negB(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negB: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.B = fneg <4 x half> %B + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %fneg.B, <4 x float> %C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negC(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <4 x float> %C + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %fneg.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_absC(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C) + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %fabs.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_negC(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <4 x float> %C + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> %fneg.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_absC(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C) + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> %fabs.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negA(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negA: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.A = fneg <4 x half> %A + %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %fneg.A, <4 x half> %B, <4 x half> %C, i1 0) + store <4 x half> %res, <4 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.B = fneg <4 x half> %B + %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %fneg.B, <4 x half> %C, i1 0) + store <4 x half> %res, <4 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <4 x half> %C + %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fneg.C, i1 0) + store <4 x half> %res, <4 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_absC(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <4 x half> @llvm.fabs.v4f16(<4 x half> %C) + %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fabs.C, i1 0) + store <4 x half> %res, <4 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_negC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <4 x float> %C + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fneg.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_absC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C) + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fabs.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_negC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <4 x float> %C + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fneg.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_absC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C) + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fabs.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_negC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <4 x float> %C + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fneg.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_absC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C) + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fabs.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_negC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <4 x float> %C + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fneg.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_absC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C) + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fabs.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negA(<4 x half> %A, <8 x half> %B, <4 x float> %C, i16 %Index, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negA: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX12-NEXT: global_store_b128 v[11:12], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.A = fneg <4 x half> %A + %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f16.v8f16.v4f32.i16(<4 x half> %fneg.A, <8 x half> %B, <4 x float> %C, i16 %Index) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negB(<4 x half> %A, <8 x half> %B, <4 x float> %C, i16 %Index, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negB: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX12-NEXT: global_store_b128 v[11:12], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.B = fneg <8 x half> %B + %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f16.v8f16.v4f32.i16(<4 x half> %A, <8 x half> %fneg.B, <4 x float> %C, i16 %Index) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negA(<4 x half> %A, <8 x half> %B, <4 x half> %C, i16 %Index, <4 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negA: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX12-NEXT: global_store_b64 v[9:10], v[6:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.A = fneg <4 x half> %A + %res = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v8f16.v4f16.i16(<4 x half> %fneg.A, <8 x half> %B, <4 x half> %C, i16 %Index) + store <4 x half> %res, <4 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negB(<4 x half> %A, <8 x half> %B, <4 x half> %C, i16 %Index, <4 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negB: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX12-NEXT: global_store_b64 v[9:10], v[6:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.B = fneg <8 x half> %B + %res = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v8f16.v4f16.i16(<4 x half> %A, <8 x half> %fneg.B, <4 x half> %C, i16 %Index) + store <4 x half> %res, <4 x half> addrspace(1)* %out + ret void +} + +; both neg and abs patterns (wmma matrix C f32 or f16 ) + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negabsC(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negabsC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C) + %fneg.fabs.C = fneg <4 x float> %fabs.C + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %fneg.fabs.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negabsC(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negabsC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <4 x half> @llvm.fabs.v4f16(<4 x half> %C) + %fneg.fabs.C = fneg <4 x half> %fabs.C + %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fneg.fabs.C, i1 0) + store <4 x half> %res, <4 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_neg_partial_fabsA(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_neg_partial_fabsA: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_and_b32_e32 v7, 0x7fffffff, v7 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %el3 = extractelement <4 x float> %C, i32 3 + %el3.fabs = call float @llvm.fabs.f32(float %el3) + %partial.fabs.C = insertelement <4 x float> %C, float %el3.fabs, i32 3 + %fneg.partial.fabs.C = fneg <4 x float> %partial.fabs.C + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %fneg.partial.fabs.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +; A or B matrix modifier and constant in C + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA_constantC(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA_constantC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], 1.0 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.A = fneg <4 x half> %A + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %fneg.A, <4 x half> %B, <4 x float> ) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB_constantC(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB_constantC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], 1.0 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.B = fneg <4 x half> %B + %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %fneg.B, <4 x half> , i1 0) + store <4 x half> %res, <4 x half> addrspace(1)* %out + ret void +} + +; pack f16 elements with v_perm_b32 since they don't come from same b32 + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC_pack(<4 x half> %A, <4 x half> %B, ptr %Caddr, <4 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC_pack: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: flat_load_b128 v[8:11], v[4:5] +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: v_perm_b32 v5, v11, v10, 0x5040100 +; GFX12-NEXT: v_perm_b32 v4, v9, v8, 0x5040100 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %C = load <8 x half>, ptr %Caddr + %C_shuffle = shufflevector <8 x half> %C, <8 x half> undef, <4 x i32> + %fneg.C_shuffle = fneg <4 x half> %C_shuffle + %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fneg.C_shuffle , i1 0) + store <4 x half> %res, <4 x half> addrspace(1)* %out + ret void +} + +declare <4 x half> @llvm.fabs.v4f16(<4 x half>) +declare <4 x float> @llvm.fabs.v4f32(<4 x float>) +declare float @llvm.fabs.f32(float) + +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half>, <4 x half>, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4i16.v4f32(<4 x i16>, <4 x i16>, <4 x float>) +declare <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half>, <4 x half>, <4 x half>, i1 immarg) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.i32.v4f32(i32, i32, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.i32.v4f32(i32, i32, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.i32.v4f32(i32, i32, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.i32.v4f32(i32, i32, <4 x float>) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f16.v8f16.v4f32.i16(<4 x half>, <8 x half>, <4 x float>, i16) +declare <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v8f16.v4f16.i16(<4 x half>, <8 x half>, <4 x half>, i16) diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-imm.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-imm.ll new file mode 100644 index 000000000000..fa0a7c98cea3 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-imm.ll @@ -0,0 +1,373 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12 + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], 1.0 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float>@llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> ) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm_non_inlineable(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v6, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v7, v6 +; GFX12-NEXT: v_mov_b32_e32 v8, v6 +; GFX12-NEXT: v_mov_b32_e32 v9, v6 +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], v[6:9] +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float>@llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> ) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[6:9], v[0:1], v[2:3], 1.0 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> ) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm_non_inlineable(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v6, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v7, v6 +; GFX12-NEXT: v_mov_b32_e32 v8, v6 +; GFX12-NEXT: v_mov_b32_e32 v9, v6 +; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[6:9], v[0:1], v[2:3], v[6:9] +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> ) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], 1.0 +; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half> %A, <4 x half> %B, <4 x half> , i1 0) + store <4 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm_non_inlineable(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v6, 0x42004200 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v7, v6 +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], v[6:7] +; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half> %A, <4 x half> %B, <4 x half> , i1 0) + store <4 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[6:7], v[0:1], v[2:3], 1.0 +; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> , i1 0) + store <4 x i16> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm_non_inlineable(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v6, 0x3fc03fc0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v7, v6 +; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[6:7], v[0:1], v[2:3], v[6:7] +; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> , i1 0) + store <4 x i16> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm(i32 %A, i32 %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:7], v0, v1, 1 +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> , i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm_non_inlineable(i32 %A, i32 %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v4, 0x80 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v5, v4 +; GFX12-NEXT: v_mov_b32_e32 v6, v4 +; GFX12-NEXT: v_mov_b32_e32 v7, v4 +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:7], v0, v1, v[4:7] +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> , i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm(i32 %A, i32 %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v0, v1, 1 +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> , i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm_non_inlineable(i32 %A, i32 %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v4, 0x80 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v5, v4 +; GFX12-NEXT: v_mov_b32_e32 v6, v4 +; GFX12-NEXT: v_mov_b32_e32 v7, v4 +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v0, v1, v[4:7] +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> , i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:7], v0, v1, 1.0 +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> ) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v4, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v5, v4 +; GFX12-NEXT: v_mov_b32_e32 v6, v4 +; GFX12-NEXT: v_mov_b32_e32 v7, v4 +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:7], v0, v1, v[4:7] +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> ) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:7], v0, v1, 1.0 +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> ) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v4, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v5, v4 +; GFX12-NEXT: v_mov_b32_e32 v6, v4 +; GFX12-NEXT: v_mov_b32_e32 v7, v4 +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:7], v0, v1, v[4:7] +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> ) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:7], v0, v1, 1.0 +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> ) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v4, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v5, v4 +; GFX12-NEXT: v_mov_b32_e32 v6, v4 +; GFX12-NEXT: v_mov_b32_e32 v7, v4 +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:7], v0, v1, v[4:7] +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> ) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:7], v0, v1, 1.0 +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> ) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v4, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v5, v4 +; GFX12-NEXT: v_mov_b32_e32 v6, v4 +; GFX12-NEXT: v_mov_b32_e32 v7, v4 +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:7], v0, v1, v[4:7] +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> ) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:7], v0, v1, 1 +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> , i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm_non_inlineable(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v4, 0x80 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v5, v4 +; GFX12-NEXT: v_mov_b32_e32 v6, v4 +; GFX12-NEXT: v_mov_b32_e32 v7, v4 +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:7], v0, v1, v[4:7] +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> , i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half>, <4 x half>, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16>, <4 x i16>, <4 x float>) +declare <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half>, <4 x half>, <4 x half>, i1 immarg) +declare <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16>, <4 x i16>, <4 x i16>, i1 immarg) +declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg) +declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>) +declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg) diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-iu-modifiers.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-iu-modifiers.ll new file mode 100644 index 000000000000..803453e45ced --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-iu-modifiers.ll @@ -0,0 +1,274 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12 + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src0(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src1(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_clamp(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] clamp +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 1) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + + + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src0(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src1(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_clamp(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] clamp +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 1) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + + + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src0(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src1(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_clamp(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] clamp +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 1) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + + + + + + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src0(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0] +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 1, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src1(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0] +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 1, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_clamp(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 clamp +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 1) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + + + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src0(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[1,0,0] +; GFX12-NEXT: global_store_b128 v[7:8], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src1(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[0,1,0] +; GFX12-NEXT: global_store_b128 v[7:8], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i16 %Index, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_clamp(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 clamp +; GFX12-NEXT: global_store_b128 v[7:8], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index, i1 1) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + + + +define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src0(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0] +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 1, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src1(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0] +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 1, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_clamp(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 clamp +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 1) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg) +declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg) +declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg) +declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i8 %Index, i1 immarg) +declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i16 %Index, i1 immarg) +declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i16 %Index, i1 immarg) diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-swmmac-index_key.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-swmmac-index_key.ll new file mode 100644 index 000000000000..0c5b19a8416f --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-swmmac-index_key.ll @@ -0,0 +1,472 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12 + +define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_index_key(<4 x half> %A, <8 x half> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v10, v[10:11], off +; GFX12-NEXT: v_mov_b32_e32 v23, v9 +; GFX12-NEXT: v_mov_b32_e32 v22, v8 +; GFX12-NEXT: v_mov_b32_e32 v21, v7 +; GFX12-NEXT: v_mov_b32_e32 v20, v6 +; GFX12-NEXT: v_mov_b32_e32 v27, v9 +; GFX12-NEXT: v_mov_b32_e32 v26, v8 +; GFX12-NEXT: v_mov_b32_e32 v25, v7 +; GFX12-NEXT: v_mov_b32_e32 v24, v6 +; GFX12-NEXT: v_mov_b32_e32 v31, v9 +; GFX12-NEXT: v_mov_b32_e32 v30, v8 +; GFX12-NEXT: v_mov_b32_e32 v29, v7 +; GFX12-NEXT: v_mov_b32_e32 v28, v6 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[20:23], v[0:1], v[2:5], v10 +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[24:27], v[0:1], v[2:5], v10 index_key:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[28:31], v[0:1], v[2:5], v10 index_key:2 +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:3 +; GFX12-NEXT: global_store_b128 v[12:13], v[20:23], off +; GFX12-NEXT: global_store_b128 v[14:15], v[24:27], off +; GFX12-NEXT: global_store_b128 v[16:17], v[28:31], off +; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <4 x i8> %IndexVec, i32 0 + %res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index0) + store <4 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <4 x i8> %IndexVec, i32 1 + %res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index1) + store <4 x float> %res1, ptr addrspace(1) %out1 + %Index2 = extractelement <4 x i8> %IndexVec, i32 2 + %res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index2) + store <4 x float> %res2, ptr addrspace(1) %out2 + %Index3 = extractelement <4 x i8> %IndexVec, i32 3 + %res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index3) + store <4 x float> %res3, ptr addrspace(1) %out3 + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16_index_key(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v10, v[10:11], off +; GFX12-NEXT: v_mov_b32_e32 v23, v9 +; GFX12-NEXT: v_mov_b32_e32 v22, v8 +; GFX12-NEXT: v_mov_b32_e32 v21, v7 +; GFX12-NEXT: v_mov_b32_e32 v20, v6 +; GFX12-NEXT: v_mov_b32_e32 v27, v9 +; GFX12-NEXT: v_mov_b32_e32 v26, v8 +; GFX12-NEXT: v_mov_b32_e32 v25, v7 +; GFX12-NEXT: v_mov_b32_e32 v24, v6 +; GFX12-NEXT: v_mov_b32_e32 v31, v9 +; GFX12-NEXT: v_mov_b32_e32 v30, v8 +; GFX12-NEXT: v_mov_b32_e32 v29, v7 +; GFX12-NEXT: v_mov_b32_e32 v28, v6 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[20:23], v[0:1], v[2:5], v10 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[24:27], v[0:1], v[2:5], v10 index_key:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[28:31], v[0:1], v[2:5], v10 index_key:2 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:3 +; GFX12-NEXT: global_store_b128 v[12:13], v[20:23], off +; GFX12-NEXT: global_store_b128 v[14:15], v[24:27], off +; GFX12-NEXT: global_store_b128 v[16:17], v[28:31], off +; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <4 x i8> %IndexVec, i32 0 + %res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index0) + store <4 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <4 x i8> %IndexVec, i32 1 + %res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index1) + store <4 x float> %res1, ptr addrspace(1) %out1 + %Index2 = extractelement <4 x i8> %IndexVec, i32 2 + %res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index2) + store <4 x float> %res2, ptr addrspace(1) %out2 + %Index3 = extractelement <4 x i8> %IndexVec, i32 3 + %res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index3) + store <4 x float> %res3, ptr addrspace(1) %out3 + ret void +} + +define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_index_key(<4 x half> %A, <8 x half> %B, <4 x half> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) { +; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v22, v[8:9], off +; GFX12-NEXT: v_mov_b32_e32 v9, v7 +; GFX12-NEXT: v_mov_b32_e32 v8, v6 +; GFX12-NEXT: v_mov_b32_e32 v19, v7 +; GFX12-NEXT: v_mov_b32_e32 v18, v6 +; GFX12-NEXT: v_mov_b32_e32 v21, v7 +; GFX12-NEXT: v_mov_b32_e32 v20, v6 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[8:9], v[0:1], v[2:5], v22 +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[18:19], v[0:1], v[2:5], v22 index_key:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[20:21], v[0:1], v[2:5], v22 index_key:2 +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v22 index_key:3 +; GFX12-NEXT: global_store_b64 v[10:11], v[8:9], off +; GFX12-NEXT: global_store_b64 v[12:13], v[18:19], off +; GFX12-NEXT: global_store_b64 v[14:15], v[20:21], off +; GFX12-NEXT: global_store_b64 v[16:17], v[6:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <4 x i8> %IndexVec, i32 0 + %res0 = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index0) + store <4 x half> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <4 x i8> %IndexVec, i32 1 + %res1 = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index1) + store <4 x half> %res1, ptr addrspace(1) %out1 + %Index2 = extractelement <4 x i8> %IndexVec, i32 2 + %res2 = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index2) + store <4 x half> %res2, ptr addrspace(1) %out2 + %Index3 = extractelement <4 x i8> %IndexVec, i32 3 + %res3 = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index3) + store <4 x half> %res3, ptr addrspace(1) %out3 + ret void +} + +define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16_index_key(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) { +; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v22, v[8:9], off +; GFX12-NEXT: v_mov_b32_e32 v9, v7 +; GFX12-NEXT: v_mov_b32_e32 v8, v6 +; GFX12-NEXT: v_mov_b32_e32 v19, v7 +; GFX12-NEXT: v_mov_b32_e32 v18, v6 +; GFX12-NEXT: v_mov_b32_e32 v21, v7 +; GFX12-NEXT: v_mov_b32_e32 v20, v6 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[8:9], v[0:1], v[2:5], v22 +; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[18:19], v[0:1], v[2:5], v22 index_key:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[20:21], v[0:1], v[2:5], v22 index_key:2 +; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v22 index_key:3 +; GFX12-NEXT: global_store_b64 v[10:11], v[8:9], off +; GFX12-NEXT: global_store_b64 v[12:13], v[18:19], off +; GFX12-NEXT: global_store_b64 v[14:15], v[20:21], off +; GFX12-NEXT: global_store_b64 v[16:17], v[6:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <4 x i8> %IndexVec, i32 0 + %res0 = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index0) + store <4 x i16> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <4 x i8> %IndexVec, i32 1 + %res1 = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index1) + store <4 x i16> %res1, ptr addrspace(1) %out1 + %Index2 = extractelement <4 x i8> %IndexVec, i32 2 + %res2 = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index2) + store <4 x i16> %res2, ptr addrspace(1) %out2 + %Index3 = extractelement <4 x i8> %IndexVec, i32 3 + %res3 = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index3) + store <4 x i16> %res3, ptr addrspace(1) %out3 + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_index_key(i32 %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v7, v[7:8], off +; GFX12-NEXT: v_mov_b32_e32 v20, v6 +; GFX12-NEXT: v_mov_b32_e32 v19, v5 +; GFX12-NEXT: v_mov_b32_e32 v18, v4 +; GFX12-NEXT: v_mov_b32_e32 v17, v3 +; GFX12-NEXT: v_mov_b32_e32 v24, v6 +; GFX12-NEXT: v_mov_b32_e32 v23, v5 +; GFX12-NEXT: v_mov_b32_e32 v22, v4 +; GFX12-NEXT: v_mov_b32_e32 v21, v3 +; GFX12-NEXT: v_mov_b32_e32 v28, v6 +; GFX12-NEXT: v_mov_b32_e32 v27, v5 +; GFX12-NEXT: v_mov_b32_e32 v26, v4 +; GFX12-NEXT: v_mov_b32_e32 v25, v3 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[17:20], v0, v[1:2], v7 +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[21:24], v0, v[1:2], v7 index_key:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[25:28], v0, v[1:2], v7 index_key:2 +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:3 +; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off +; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off +; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off +; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <4 x i8> %IndexVec, i32 0 + %res0 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index0, i1 0) + store <4 x i32> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <4 x i8> %IndexVec, i32 1 + %res1 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index1, i1 0) + store <4 x i32> %res1, ptr addrspace(1) %out1 + %Index2 = extractelement <4 x i8> %IndexVec, i32 2 + %res2 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index2, i1 0) + store <4 x i32> %res2, ptr addrspace(1) %out2 + %Index3 = extractelement <4 x i8> %IndexVec, i32 3 + %res3 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index3, i1 0) + store <4 x i32> %res3, ptr addrspace(1) %out3 + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_index_key(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v6, v[6:7], off +; GFX12-NEXT: v_mov_b32_e32 v15, v5 +; GFX12-NEXT: v_mov_b32_e32 v14, v4 +; GFX12-NEXT: v_mov_b32_e32 v13, v3 +; GFX12-NEXT: v_mov_b32_e32 v12, v2 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[12:15], v0, v1, v6 +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 index_key:1 +; GFX12-NEXT: global_store_b128 v[8:9], v[12:15], off +; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index0, i1 0) + store <4 x i32> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index1, i1 0) + store <4 x i32> %res1, ptr addrspace(1) %out1 + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_index_key(i32 %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v7, v[7:8], off +; GFX12-NEXT: v_mov_b32_e32 v16, v6 +; GFX12-NEXT: v_mov_b32_e32 v15, v5 +; GFX12-NEXT: v_mov_b32_e32 v14, v4 +; GFX12-NEXT: v_mov_b32_e32 v13, v3 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[13:16], v0, v[1:2], v7 +; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 index_key:1 +; GFX12-NEXT: global_store_b128 v[9:10], v[13:16], off +; GFX12-NEXT: global_store_b128 v[11:12], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index0, i1 0) + store <4 x i32> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index1, i1 0) + store <4 x i32> %res1, ptr addrspace(1) %out1 + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v7, v[7:8], off +; GFX12-NEXT: v_mov_b32_e32 v20, v6 +; GFX12-NEXT: v_mov_b32_e32 v19, v5 +; GFX12-NEXT: v_mov_b32_e32 v18, v4 +; GFX12-NEXT: v_mov_b32_e32 v17, v3 +; GFX12-NEXT: v_mov_b32_e32 v24, v6 +; GFX12-NEXT: v_mov_b32_e32 v23, v5 +; GFX12-NEXT: v_mov_b32_e32 v22, v4 +; GFX12-NEXT: v_mov_b32_e32 v21, v3 +; GFX12-NEXT: v_mov_b32_e32 v28, v6 +; GFX12-NEXT: v_mov_b32_e32 v27, v5 +; GFX12-NEXT: v_mov_b32_e32 v26, v4 +; GFX12-NEXT: v_mov_b32_e32 v25, v3 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[17:20], v0, v[1:2], v7 +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[21:24], v0, v[1:2], v7 index_key:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[25:28], v0, v[1:2], v7 index_key:2 +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:3 +; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off +; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off +; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off +; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <4 x i8> %IndexVec, i32 0 + %res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index0) + store <4 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <4 x i8> %IndexVec, i32 1 + %res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index1) + store <4 x float> %res1, ptr addrspace(1) %out1 + %Index2 = extractelement <4 x i8> %IndexVec, i32 2 + %res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index2) + store <4 x float> %res2, ptr addrspace(1) %out2 + %Index3 = extractelement <4 x i8> %IndexVec, i32 3 + %res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index3) + store <4 x float> %res3, ptr addrspace(1) %out3 + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v7, v[7:8], off +; GFX12-NEXT: v_mov_b32_e32 v20, v6 +; GFX12-NEXT: v_mov_b32_e32 v19, v5 +; GFX12-NEXT: v_mov_b32_e32 v18, v4 +; GFX12-NEXT: v_mov_b32_e32 v17, v3 +; GFX12-NEXT: v_mov_b32_e32 v24, v6 +; GFX12-NEXT: v_mov_b32_e32 v23, v5 +; GFX12-NEXT: v_mov_b32_e32 v22, v4 +; GFX12-NEXT: v_mov_b32_e32 v21, v3 +; GFX12-NEXT: v_mov_b32_e32 v28, v6 +; GFX12-NEXT: v_mov_b32_e32 v27, v5 +; GFX12-NEXT: v_mov_b32_e32 v26, v4 +; GFX12-NEXT: v_mov_b32_e32 v25, v3 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[17:20], v0, v[1:2], v7 +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[21:24], v0, v[1:2], v7 index_key:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[25:28], v0, v[1:2], v7 index_key:2 +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:3 +; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off +; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off +; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off +; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <4 x i8> %IndexVec, i32 0 + %res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index0) + store <4 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <4 x i8> %IndexVec, i32 1 + %res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index1) + store <4 x float> %res1, ptr addrspace(1) %out1 + %Index2 = extractelement <4 x i8> %IndexVec, i32 2 + %res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index2) + store <4 x float> %res2, ptr addrspace(1) %out2 + %Index3 = extractelement <4 x i8> %IndexVec, i32 3 + %res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index3) + store <4 x float> %res3, ptr addrspace(1) %out3 + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v7, v[7:8], off +; GFX12-NEXT: v_mov_b32_e32 v20, v6 +; GFX12-NEXT: v_mov_b32_e32 v19, v5 +; GFX12-NEXT: v_mov_b32_e32 v18, v4 +; GFX12-NEXT: v_mov_b32_e32 v17, v3 +; GFX12-NEXT: v_mov_b32_e32 v24, v6 +; GFX12-NEXT: v_mov_b32_e32 v23, v5 +; GFX12-NEXT: v_mov_b32_e32 v22, v4 +; GFX12-NEXT: v_mov_b32_e32 v21, v3 +; GFX12-NEXT: v_mov_b32_e32 v28, v6 +; GFX12-NEXT: v_mov_b32_e32 v27, v5 +; GFX12-NEXT: v_mov_b32_e32 v26, v4 +; GFX12-NEXT: v_mov_b32_e32 v25, v3 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[17:20], v0, v[1:2], v7 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[21:24], v0, v[1:2], v7 index_key:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[25:28], v0, v[1:2], v7 index_key:2 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:3 +; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off +; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off +; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off +; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <4 x i8> %IndexVec, i32 0 + %res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index0) + store <4 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <4 x i8> %IndexVec, i32 1 + %res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index1) + store <4 x float> %res1, ptr addrspace(1) %out1 + %Index2 = extractelement <4 x i8> %IndexVec, i32 2 + %res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index2) + store <4 x float> %res2, ptr addrspace(1) %out2 + %Index3 = extractelement <4 x i8> %IndexVec, i32 3 + %res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index3) + store <4 x float> %res3, ptr addrspace(1) %out3 + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v7, v[7:8], off +; GFX12-NEXT: v_mov_b32_e32 v20, v6 +; GFX12-NEXT: v_mov_b32_e32 v19, v5 +; GFX12-NEXT: v_mov_b32_e32 v18, v4 +; GFX12-NEXT: v_mov_b32_e32 v17, v3 +; GFX12-NEXT: v_mov_b32_e32 v24, v6 +; GFX12-NEXT: v_mov_b32_e32 v23, v5 +; GFX12-NEXT: v_mov_b32_e32 v22, v4 +; GFX12-NEXT: v_mov_b32_e32 v21, v3 +; GFX12-NEXT: v_mov_b32_e32 v28, v6 +; GFX12-NEXT: v_mov_b32_e32 v27, v5 +; GFX12-NEXT: v_mov_b32_e32 v26, v4 +; GFX12-NEXT: v_mov_b32_e32 v25, v3 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[17:20], v0, v[1:2], v7 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[21:24], v0, v[1:2], v7 index_key:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[25:28], v0, v[1:2], v7 index_key:2 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:3 +; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off +; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off +; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off +; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <4 x i8> %IndexVec, i32 0 + %res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index0) + store <4 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <4 x i8> %IndexVec, i32 1 + %res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index1) + store <4 x float> %res1, ptr addrspace(1) %out1 + %Index2 = extractelement <4 x i8> %IndexVec, i32 2 + %res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index2) + store <4 x float> %res2, ptr addrspace(1) %out2 + %Index3 = extractelement <4 x i8> %IndexVec, i32 3 + %res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index3) + store <4 x float> %res3, ptr addrspace(1) %out3 + ret void +} + +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half>, <8 x half>, <4 x float>, i8) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16>, <8 x i16>, <4 x float>, i8) +declare <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half>, <8 x half>, <4 x half>, i8) +declare <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16>, <8 x i16>, <4 x i16>, i8) +declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i8 %Index, i1 immarg) +declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i16 %Index, i1 immarg) +declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i16 %Index, i1 immarg) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8) diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64.ll new file mode 100644 index 000000000000..b25f27851333 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64.ll @@ -0,0 +1,333 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12 + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] +; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float>@llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %C) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] +; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> %C) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] +; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half> %A, <4 x half> %B, <4 x half> %C, i1 0) + store <4 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] +; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, i1 0) + store <4 x i16> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> %C) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> %C) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> %C) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> %C) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_f16(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_f16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 +; GFX12-NEXT: global_store_b128 v[11:12], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 +; GFX12-NEXT: global_store_b128 v[11:12], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f16_16x16x32_f16(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f16_16x16x32_f16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 +; GFX12-NEXT: global_store_b64 v[9:10], v[6:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index) + store <4 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 +; GFX12-NEXT: global_store_b64 v[9:10], v[6:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index) + store <4 x i16> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 +; GFX12-NEXT: global_store_b128 v[7:8], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half>, <4 x half>, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16>, <4 x i16>, <4 x float>) +declare <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half>, <4 x half>, <4 x half>, i1 immarg) +declare <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16>, <4 x i16>, <4 x i16>, i1 immarg) +declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg) +declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>) +declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half>, <8 x half>, <4 x float>, i8) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16>, <8 x i16>, <4 x float>, i8) +declare <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half>, <8 x half>, <4 x half>, i8) +declare <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16>, <8 x i16>, <4 x i16>, i8) +declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i8 %Index, i1 immarg) +declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i16 %Index, i1 immarg) +declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i16 %Index, i1 immarg) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8) diff --git a/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx12-w32.mir b/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx12-w32.mir new file mode 100644 index 000000000000..33c32d7290da --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx12-w32.mir @@ -0,0 +1,354 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=GFX12 %s + +# D0 overlaps A1, B1, C1 or Index1. Overlap starts at vgpr0. +# $D0 = wmma0 $A0, $B0, $C0 or $D0 = swmmac0 $A0, $B0, $C0, $Index0 +# $D1 = wmma1 $A1, $B1, $C1 or $D1 = swmmac1 $A1, $B1, $C1, $Index1 + +--- +name: test_wmma_f32_16x16x16_f16_D0_overlaps_A1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45 + + ; GFX12-LABEL: name: test_wmma_f32_16x16x16_f16_D0_overlaps_A1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35, 8, killed $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35, 8, killed $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, implicit $exec +... + +--- +name: test_wmma_f32_16x16x16_bf16_D0_overlaps_B1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45 + + ; GFX12-LABEL: name: test_wmma_f32_16x16x16_bf16_D0_overlaps_B1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43 = V_WMMA_F32_16X16X16_BF16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43 = V_WMMA_F32_16X16X16_BF16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, implicit $exec +... + +--- +name: test_wmma_f16_16x16x16_f16_D0_overlaps_C1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 + + ; GFX12-LABEL: name: test_wmma_f16_16x16x16_f16_D0_overlaps_C1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F16_16X16X16_F16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F16_16X16X16_F16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec +... + +--- +name: test_wmma_bf16_16x16x16_bf16_D0_overlaps_A1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 + + ; GFX12-LABEL: name: test_wmma_bf16_16x16x16_bf16_D0_overlaps_A1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_BF16_16X16X16_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35, 8, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_BF16_16X16X16_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35, 8, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec +... +--- +name: test_wmma_i32_16x16x16_iu8_D0_overlaps_B1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 + + ; GFX12-LABEL: name: test_wmma_i32_16x16x16_iu8_D0_overlaps_B1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_I32_16X16X16_IU8_w32_twoaddr 8, killed $vgpr28_vgpr29, 8, killed $vgpr0_vgpr1, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_I32_16X16X16_IU8_w32_twoaddr 8, killed $vgpr28_vgpr29, 8, killed $vgpr0_vgpr1, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec +... + +--- +name: test_wmma_i32_16x16x16_iu4_D0_overlaps_C1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39 + + ; GFX12-LABEL: name: test_wmma_i32_16x16x16_iu4_D0_overlaps_C1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_I32_16X16X16_IU4_w32_twoaddr 8, killed $vgpr28, 8, killed $vgpr29, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_I32_16X16X16_IU4_w32_twoaddr 8, killed $vgpr28, 8, killed $vgpr29, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec +... + +--- +name: test_wmma_f32_16x16x16_fp8_fp8_D0_overlaps_A1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 + + ; GFX12-LABEL: name: test_wmma_f32_16x16x16_fp8_fp8_D0_overlaps_A1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X16_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1, killed $vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X16_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1, killed $vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec +... + +--- +name: test_wmma_f32_16x16x16_bf8_fp8_D0_overlaps_B1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 + + ; GFX12-LABEL: name: test_wmma_f32_16x16x16_bf8_fp8_D0_overlaps_B1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X16_BF8_FP8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr0_vgpr1, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X16_BF8_FP8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr0_vgpr1, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec +... + +--- +name: test_wmma_f32_16x16x16_fp8_bf8_D0_overlaps_C1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 + + ; GFX12-LABEL: name: test_wmma_f32_16x16x16_fp8_bf8_D0_overlaps_C1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_FP8_BF8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr30_vgpr31, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_FP8_BF8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr30_vgpr31, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec +... + +--- +name: test_wmma_f32_16x16x16_bf8_bf8_D0_overlaps_A1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 + + ; GFX12-LABEL: name: test_wmma_f32_16x16x16_bf8_bf8_D0_overlaps_A1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X16_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1, killed $vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X16_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1, killed $vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec +... + +--- +name: test_wmma_i32_16x16x32_iu4_D0_overlaps_B1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 + + ; GFX12-LABEL: name: test_wmma_i32_16x16x32_iu4_D0_overlaps_B1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_I32_16X16X32_IU4_w32_twoaddr 8, killed $vgpr28_vgpr29, 8, killed $vgpr0_vgpr1, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_I32_16X16X32_IU4_w32_twoaddr 8, killed $vgpr28_vgpr29, 8, killed $vgpr0_vgpr1, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec +... + +--- +name: test_swmmac_f32_16x16x32_f16_D0_overlaps_C1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr48, $vgpr49, $vgpr50 + + ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_f16_D0_overlaps_C1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr48, $vgpr49, $vgpr50 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_SWMMAC_F32_16X16X32_F16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr44, 0, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_SWMMAC_F32_16X16X32_F16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr44, 0, 0, 0, implicit $exec +... + +--- +name: test_swmmac_f32_16x16x32_bf16_D0_overlaps_Index1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr48, $vgpr49, $vgpr50 + + ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_bf16_D0_overlaps_Index1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr48, $vgpr49, $vgpr50 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47 = V_SWMMAC_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr0, 0, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47 = V_SWMMAC_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr0, 0, 0, 0, implicit $exec +... + +--- +name: test_swmmac_f16_16x16x32_f16_D0_overlaps_A1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46 + + ; GFX12-LABEL: name: test_swmmac_f16_16x16x32_f16_D0_overlaps_A1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr40_vgpr41_vgpr42_vgpr43 = V_SWMMAC_F16_16X16X32_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43, killed $vgpr44, 0, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr40_vgpr41_vgpr42_vgpr43 = V_SWMMAC_F16_16X16X32_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43, killed $vgpr44, 0, 0, 0, implicit $exec +... + +--- +name: test_swmmac_bf16_16x16x32_bf16_D0_overlaps_B1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46 + + ; GFX12-LABEL: name: test_swmmac_bf16_16x16x32_bf16_D0_overlaps_B1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr40_vgpr41_vgpr42_vgpr43 = V_SWMMAC_BF16_16X16X32_BF16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr40_vgpr41_vgpr42_vgpr43, killed $vgpr44, 0, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr40_vgpr41_vgpr42_vgpr43 = V_SWMMAC_BF16_16X16X32_BF16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr40_vgpr41_vgpr42_vgpr43, killed $vgpr44, 0, 0, 0, implicit $exec +... + +--- +name: test_swmmac_i32_16x16x32_iu8_D0_overlaps_C1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44 + + ; GFX12-LABEL: name: test_swmmac_i32_16x16x32_iu8_D0_overlaps_C1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_SWMMAC_I32_16X16X32_IU8_w32_twoaddr 8, killed $vgpr28_vgpr29, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr42, 0, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_SWMMAC_I32_16X16X32_IU8_w32_twoaddr 8, killed $vgpr28_vgpr29, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr42, 0, 0, 0, implicit $exec +... + +--- +name: test_swmmac_i32_16x16x32_iu4_D0_overlaps_Index1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 + + ; GFX12-LABEL: name: test_swmmac_i32_16x16x32_iu4_D0_overlaps_Index1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38 = V_SWMMAC_I32_16X16X32_IU4_w32_twoaddr 8, killed $vgpr28, 8, killed $vgpr28_vgpr29, killed $vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38, killed $vgpr0, 0, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38 = V_SWMMAC_I32_16X16X32_IU4_w32_twoaddr 8, killed $vgpr28, 8, killed $vgpr28_vgpr29, killed $vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38, killed $vgpr0, 0, 0, 0, implicit $exec +... + +--- +name: test_swmmac_i32_16x16x64_iu4_D0_overlaps_A1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44 + + ; GFX12-LABEL: name: test_swmmac_i32_16x16x64_iu4_D0_overlaps_A1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_I32_16X16X64_IU4_w32_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr42, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_I32_16X16X64_IU4_w32_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr42, 0, 0, implicit $exec +... + +--- +name: test_swmmac_f32_16x16x32_fp8_fp8_D0_overlaps_B1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44 + + ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_fp8_fp8_D0_overlaps_B1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_F32_16X16X32_FP8_FP8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr42, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_F32_16X16X32_FP8_FP8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr42, 0, implicit $exec +... + +--- +name: test_swmmac_f32_16x16x32_fp8_bf8_D0_overlaps_C1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44 + + ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_fp8_bf8_D0_overlaps_C1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_SWMMAC_F32_16X16X32_FP8_BF8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr42, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_SWMMAC_F32_16X16X32_FP8_BF8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr42, 0, implicit $exec +... + +--- +name: test_swmmac_f32_16x16x32_bf8_fp8_D0_overlaps_Index1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44 + + ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_bf8_fp8_D0_overlaps_Index1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_F32_16X16X32_BF8_FP8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_F32_16X16X32_BF8_FP8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr0, 0, implicit $exec +... + +--- +name: test_swmmac_f32_16x16x32_bf8_bf8_D0_overlaps_A1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44 + + ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_bf8_bf8_D0_overlaps_A1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_F32_16X16X32_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr42, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_F32_16X16X32_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr42, 0, implicit $exec +... diff --git a/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx12-w64.mir b/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx12-w64.mir new file mode 100644 index 000000000000..ab89feb861b5 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx12-w64.mir @@ -0,0 +1,355 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=-wavefrontsize32,+wavefrontsize64 -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=GFX12 %s + +# D0 overlaps A1, B1, C1 or Index1. Overlap starts at vgpr0. +# $D0 = wmma0 $A0, $B0, $C0 or $D0 = swmmac0 $A0, $B0, $C0, $Index0 +# $D1 = wmma1 $A1, $B1, $C1 or $D1 = swmmac1 $A1, $B1, $C1, $Index1 + +--- +name: test_wmma_f32_16x16x16_f16_D0_overlaps_A1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 + + ; GFX12-LABEL: name: test_wmma_f32_16x16x16_f16_D0_overlaps_A1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr18_vgpr19, 8, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr18_vgpr19, 8, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec +... + +--- +name: test_wmma_f32_16x16x16_bf16_D0_overlaps_B1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 + + ; GFX12-LABEL: name: test_wmma_f32_16x16x16_bf16_D0_overlaps_B1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_BF16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr0_vgpr1, 8, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_BF16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr0_vgpr1, 8, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec +... + +--- +name: test_wmma_f16_16x16x16_f16_D0_overlaps_C1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + + ; GFX12-LABEL: name: test_wmma_f16_16x16x16_f16_D0_overlaps_C1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1 = V_WMMA_F16_16X16X16_F16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr18_vgpr19, 8, killed $vgpr0_vgpr1, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1 = V_WMMA_F16_16X16X16_F16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr18_vgpr19, 8, killed $vgpr0_vgpr1, 0, 0, implicit $exec +... + +--- +name: test_wmma_bf16_16x16x16_bf16_D0_overlaps_A1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + + ; GFX12-LABEL: name: test_wmma_bf16_16x16x16_bf16_D0_overlaps_A1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr20_vgpr21 = V_WMMA_BF16_16X16X16_BF16_w64_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr18_vgpr19, 8, killed $vgpr20_vgpr21, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr20_vgpr21 = V_WMMA_BF16_16X16X16_BF16_w64_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr18_vgpr19, 8, killed $vgpr20_vgpr21, 0, 0, implicit $exec +... + +--- +name: test_wmma_i32_16x16x16_iu8_D0_overlaps_B1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + + ; GFX12-LABEL: name: test_wmma_i32_16x16x16_iu8_D0_overlaps_B1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_I32_16X16X16_IU8_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr0, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_I32_16X16X16_IU8_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr0, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec +... + +--- +name: test_wmma_i32_16x16x16_iu4_D0_overlaps_C1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + + ; GFX12-LABEL: name: test_wmma_i32_16x16x16_iu4_D0_overlaps_C1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_I32_16X16X16_IU4_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr17, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_I32_16X16X16_IU4_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr17, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec +... + +--- +name: test_wmma_f32_16x16x16_fp8_fp8_D0_overlaps_A1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + + ; GFX12-LABEL: name: test_wmma_f32_16x16x16_fp8_fp8_D0_overlaps_A1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_F32_16X16X16_FP8_FP8_w64_twoaddr killed $vgpr0, killed $vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_F32_16X16X16_FP8_FP8_w64_twoaddr killed $vgpr0, killed $vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec +... + +--- +name: test_wmma_f32_16x16x16_bf8_fp8_D0_overlaps_B1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + + ; GFX12-LABEL: name: test_wmma_f32_16x16x16_bf8_fp8_D0_overlaps_B1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_F32_16X16X16_BF8_FP8_w64_twoaddr killed $vgpr16, killed $vgpr0, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_F32_16X16X16_BF8_FP8_w64_twoaddr killed $vgpr16, killed $vgpr0, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec +... + +--- +name: test_wmma_f32_16x16x16_fp8_bf8_D0_overlaps_C1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + + ; GFX12-LABEL: name: test_wmma_f32_16x16x16_fp8_bf8_D0_overlaps_C1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_FP8_BF8_w64_twoaddr killed $vgpr16, killed $vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_FP8_BF8_w64_twoaddr killed $vgpr16, killed $vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec +... + +--- +name: test_wmma_f32_16x16x16_bf8_bf8_D0_overlaps_A1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + + ; GFX12-LABEL: name: test_wmma_f32_16x16x16_bf8_bf8_D0_overlaps_A1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_F32_16X16X16_BF8_BF8_w64_twoaddr killed $vgpr0, killed $vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_F32_16X16X16_BF8_BF8_w64_twoaddr killed $vgpr0, killed $vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec +... + +--- +name: test_wmma_i32_16x16x32_iu4_D0_overlaps_B1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + + ; GFX12-LABEL: name: test_wmma_i32_16x16x32_iu4_D0_overlaps_B1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_I32_16X16X32_IU4_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr0, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_I32_16X16X32_IU4_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr0, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec +... + +--- +name: test_swmmac_f32_16x16x32_f16_D0_overlaps_C1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28 + + ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_f16_D0_overlaps_C1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_SWMMAC_F32_16X16X32_F16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr26, 0, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_SWMMAC_F32_16X16X32_F16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr26, 0, 0, 0, implicit $exec +... + +--- +name: test_swmmac_f32_16x16x32_bf16_D0_overlaps_Index1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28 + + ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_bf16_D0_overlaps_Index1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr22_vgpr23_vgpr24_vgpr25 = V_SWMMAC_F32_16X16X32_BF16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, killed $vgpr22_vgpr23_vgpr24_vgpr25, killed $vgpr0, 0, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr22_vgpr23_vgpr24_vgpr25 = V_SWMMAC_F32_16X16X32_BF16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, killed $vgpr22_vgpr23_vgpr24_vgpr25, killed $vgpr0, 0, 0, 0, implicit $exec +... + +--- +name: test_swmmac_f16_16x16x32_f16_D0_overlaps_A1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26 + + ; GFX12-LABEL: name: test_swmmac_f16_16x16x32_f16_D0_overlaps_A1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr22_vgpr23 = V_SWMMAC_F16_16X16X32_F16_w64_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, killed $vgpr22_vgpr23, killed $vgpr24, 0, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr22_vgpr23 = V_SWMMAC_F16_16X16X32_F16_w64_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, killed $vgpr22_vgpr23, killed $vgpr24, 0, 0, 0, implicit $exec +... + +--- +name: test_swmmac_bf16_16x16x32_bf16_D0_overlaps_B1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26 + + ; GFX12-LABEL: name: test_swmmac_bf16_16x16x32_bf16_D0_overlaps_B1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr22_vgpr23 = V_SWMMAC_BF16_16X16X32_BF16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr22_vgpr23, killed $vgpr24, 0, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr22_vgpr23 = V_SWMMAC_BF16_16X16X32_BF16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr22_vgpr23, killed $vgpr24, 0, 0, 0, implicit $exec +... + +--- +name: test_swmmac_i32_16x16x32_iu8_D0_overlaps_C1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 + + ; GFX12-LABEL: name: test_swmmac_i32_16x16x32_iu8_D0_overlaps_C1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_SWMMAC_I32_16X16X32_IU8_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr17_vgpr18, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr23, 0, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_SWMMAC_I32_16X16X32_IU8_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr17_vgpr18, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr23, 0, 0, 0, implicit $exec +... + +--- +name: test_swmmac_i32_16x16x32_iu4_D0_overlaps_Index1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24 + + ; GFX12-LABEL: name: test_swmmac_i32_16x16x32_iu4_D0_overlaps_Index1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_SWMMAC_I32_16X16X32_IU4_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr16, killed $vgpr18_vgpr19_vgpr20_vgpr21, killed $vgpr0, 0, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_SWMMAC_I32_16X16X32_IU4_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr16, killed $vgpr18_vgpr19_vgpr20_vgpr21, killed $vgpr0, 0, 0, 0, implicit $exec +... + +--- +name: test_swmmac_i32_16x16x64_iu4_D0_overlaps_A1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 + + ; GFX12-LABEL: name: test_swmmac_i32_16x16x64_iu4_D0_overlaps_A1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_I32_16X16X64_IU4_w64_twoaddr 8, killed $vgpr0, 8, killed $vgpr17_vgpr18, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr23, 0, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_I32_16X16X64_IU4_w64_twoaddr 8, killed $vgpr0, 8, killed $vgpr17_vgpr18, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr23, 0, 0, 0, implicit $exec +... + +--- +name: test_swmmac_f32_16x16x32_fp8_fp8_D0_overlaps_B1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 + + ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_fp8_fp8_D0_overlaps_B1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_F32_16X16X32_FP8_FP8_w64_twoaddr killed $vgpr16, killed $vgpr0_vgpr1, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr23, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_F32_16X16X32_FP8_FP8_w64_twoaddr killed $vgpr16, killed $vgpr0_vgpr1, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr23, 0, implicit $exec +... + +--- +name: test_swmmac_f32_16x16x32_fp8_bf8_D0_overlaps_C1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 + + ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_fp8_bf8_D0_overlaps_C1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_SWMMAC_F32_16X16X32_FP8_BF8_w64_twoaddr killed $vgpr16, killed $vgpr17_vgpr18, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr23, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_SWMMAC_F32_16X16X32_FP8_BF8_w64_twoaddr killed $vgpr16, killed $vgpr17_vgpr18, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr23, 0, implicit $exec +... + +--- +name: test_swmmac_f32_16x16x32_bf8_fp8_D0_overlaps_Index1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 + + ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_bf8_fp8_D0_overlaps_Index1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_F32_16X16X32_BF8_FP8_w64_twoaddr killed $vgpr16, killed $vgpr17_vgpr18, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_F32_16X16X32_BF8_FP8_w64_twoaddr killed $vgpr16, killed $vgpr17_vgpr18, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr0, 0, implicit $exec +... + +--- +name: test_swmmac_f32_16x16x32_bf8_bf8_D0_overlaps_A1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 + + ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_bf8_bf8_D0_overlaps_A1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_F32_16X16X32_BF8_BF8_w64_twoaddr killed $vgpr0, killed $vgpr17_vgpr18, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr23, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_F32_16X16X32_BF8_BF8_w64_twoaddr killed $vgpr0, killed $vgpr17_vgpr18, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr23, 0, implicit $exec +... diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_wmma_w32.s b/llvm/test/MC/AMDGPU/gfx12_asm_wmma_w32.s new file mode 100644 index 000000000000..e1cd0cab6634 --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx12_asm_wmma_w32.s @@ -0,0 +1,1529 @@ +// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1200 -show-encoding %s | FileCheck --check-prefix=GFX12 %s +// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1200 %s 2>&1 | FileCheck --check-prefix=GFX12-ERR --implicit-check-not=error: %s + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] +// GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x1c] + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] +// GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x3c] + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] +// GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x5c] + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x9c] + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[1,0,0] +// GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x40,0xcc,0x00,0x09,0x22,0x1c] + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,1,0] +// GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x40,0xcc,0x00,0x09,0x22,0x1c] + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1] ; encoding: [0x08,0x44,0x40,0xcc,0x00,0x09,0x22,0x1c] + +v_wmma_f32_16x16x16_f16 v[8:15], s[0:3], v[4:7], v[8:15] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], s[4:7], v[8:15] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], s[8:15] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_f16 v[8:15], 1.0, v[4:7], v[8:15] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], 1.0, v[8:15] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], 1.0 +// GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], 1.0 ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0xca,0x1b] + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], 1 +// GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], 1 ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x06,0x1a] + + + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] +// GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x1c] + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] +// GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x3c] + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] +// GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x5c] + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x9c] + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[1,0,0] +// GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x41,0xcc,0x00,0x09,0x22,0x1c] + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,1,0] +// GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x41,0xcc,0x00,0x09,0x22,0x1c] + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1] ; encoding: [0x08,0x44,0x41,0xcc,0x00,0x09,0x22,0x1c] + +v_wmma_f32_16x16x16_bf16 v[8:15], s[0:3], v[4:7], v[8:15] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], s[4:7], v[8:15] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], s[8:15] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf16 v[8:15], 1.0, v[4:7], v[8:15] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], 1.0, v[8:15] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], 1.0 +// GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], 1.0 ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0xca,0x1b] + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], 1 +// GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], 1 ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x06,0x1a] + + + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] +// GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x1c] + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] +// GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x3c] + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] +// GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x5c] + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] +// GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x9c] + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[1,0,0] +// GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x42,0xcc,0x00,0x09,0x22,0x1c] + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,1,0] +// GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x42,0xcc,0x00,0x09,0x22,0x1c] + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,0,1] +// GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,0,1] ; encoding: [0x08,0x44,0x42,0xcc,0x00,0x09,0x22,0x1c] + +v_wmma_f16_16x16x16_f16 v[8:11], s[0:3], v[4:7], v[8:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], s[4:7], v[8:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], s[8:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f16_16x16x16_f16 v[8:11], 1.0, v[4:7], v[8:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], 1.0, v[8:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], 1.0 +// GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], 1.0 ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0xca,0x1b] + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], 1 +// GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], 1 ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x06,0x1a] + + + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] +// GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x1c] + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] +// GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x3c] + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] +// GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x5c] + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] +// GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x9c] + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[1,0,0] +// GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x43,0xcc,0x00,0x09,0x22,0x1c] + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,1,0] +// GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x43,0xcc,0x00,0x09,0x22,0x1c] + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,0,1] +// GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,0,1] ; encoding: [0x08,0x44,0x43,0xcc,0x00,0x09,0x22,0x1c] + +v_wmma_bf16_16x16x16_bf16 v[8:11], s[0:3], v[4:7], v[8:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], s[4:7], v[8:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], s[8:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_bf16_16x16x16_bf16 v[8:11], 1.0, v[4:7], v[8:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], 1.0, v[8:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], 1.0 +// GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], 1.0 ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0xca,0x1b] + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], 1 +// GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], 1 ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x06,0x1a] + + + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] +// GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] clamp +// GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] clamp ; encoding: [0x04,0xc0,0x44,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] +// GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x3c] + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] +// GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x5c] + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[4:11], s[0:1], v[2:3], v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], s[2:3], v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], s[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu8 v[4:11], 1, v[2:3], v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], 1, v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], 1 +// GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0x06,0x1a] + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], 1.0 +// GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0xca,0x1b] + + + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] +// GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x1c] + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] clamp +// GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] clamp ; encoding: [0x02,0xc0,0x45,0xcc,0x00,0x03,0x0a,0x1c] + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[1,0,0] +// GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[1,0,0] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x3c] + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[0,1,0] +// GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[0,1,0] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x5c] + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:9], s0, v1, v[2:9] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, s1, v[2:9] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, s[0:7] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu4 v[2:9], 1, v1, v[2:9] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, 1, v[2:9] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, 1 +// GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, 1 ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x06,0x1a] + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, 1.0 +// GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, 1.0 ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0xca,0x1b] + + + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] +// GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x9c] + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x46,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], s[0:1], v[2:3], v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], s[2:3], v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], s[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], 1.0, v[2:3], v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], 1.0, v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], 1.0 +// GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x05,0xca,0x1b] + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], 1 +// GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x05,0x06,0x1a] + + + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] +// GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x9c] + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x48,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], s[0:1], v[2:3], v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], s[2:3], v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], s[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], 1.0, v[2:3], v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], 1.0, v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], 1.0 +// GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x05,0xca,0x1b] + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], 1 +// GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x05,0x06,0x1a] + + + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] +// GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x9c] + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x47,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], s[0:1], v[2:3], v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], s[2:3], v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], s[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], 1.0, v[2:3], v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], 1.0, v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], 1.0 +// GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x05,0xca,0x1b] + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], 1 +// GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x05,0x06,0x1a] + + + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] +// GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x9c] + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x49,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], s[0:1], v[2:3], v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], s[2:3], v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], s[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], 1.0, v[2:3], v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], 1.0, v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], 1.0 +// GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x05,0xca,0x1b] + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], 1 +// GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x05,0x06,0x1a] + + + +v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] +// GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] clamp +// GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] clamp ; encoding: [0x04,0xc0,0x4a,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] +// GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x3c] + +v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] +// GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x5c] + +v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x32_iu4 v[4:11], s[0:1], v[2:3], v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], s[2:3], v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], s[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x32_iu4 v[4:11], 1, v[2:3], v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], 1, v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], 1 +// GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0x06,0x1a] + +v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], 1.0 +// GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0xca,0x1b] + + + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 +// GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x1c] + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 index_key:1 +// GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 index_key:1 ; encoding: [0x0c,0x48,0x50,0xcc,0x00,0x09,0x52,0x1c] + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 index_key:2 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 index_key:3 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[1,0,0] +// GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[1,0,0] ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x3c] + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,1,0] +// GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,1,0] ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x5c] + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[1,0,0] +// GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[1,0,0] ; encoding: [0x0c,0x41,0x50,0xcc,0x00,0x09,0x52,0x1c] + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[0,1,0] +// GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[0,1,0] ; encoding: [0x0c,0x42,0x50,0xcc,0x00,0x09,0x52,0x1c] + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_swmmac_f32_16x16x32_f16 v[12:19], s[0:3], v[4:11], v20 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], s[4:11], v20 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], s20 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_f16 v[12:19], 1.0, v[4:11], v20 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], 1.0, v20 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], 1.0 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], 1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 +// GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x1c] + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 index_key:1 +// GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 index_key:1 ; encoding: [0x0c,0x48,0x51,0xcc,0x00,0x09,0x52,0x1c] + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 index_key:2 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 index_key:3 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[1,0,0] +// GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[1,0,0] ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x3c] + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,1,0] +// GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,1,0] ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x5c] + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[1,0,0] +// GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[1,0,0] ; encoding: [0x0c,0x41,0x51,0xcc,0x00,0x09,0x52,0x1c] + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[0,1,0] +// GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[0,1,0] ; encoding: [0x0c,0x42,0x51,0xcc,0x00,0x09,0x52,0x1c] + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_swmmac_f32_16x16x32_bf16 v[12:19], s[0:3], v[4:11], v20 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], s[4:11], v20 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], s20 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf16 v[12:19], 1.0, v[4:11], v20 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], 1.0, v20 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], 1.0 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], 1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 +// GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x1c] + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 index_key:1 +// GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 index_key:1 ; encoding: [0x0c,0x48,0x52,0xcc,0x00,0x09,0x42,0x1c] + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 index_key:2 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 index_key:3 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[1,0,0] +// GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[1,0,0] ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x3c] + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,1,0] +// GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,1,0] ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x5c] + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[1,0,0] +// GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[1,0,0] ; encoding: [0x0c,0x41,0x52,0xcc,0x00,0x09,0x42,0x1c] + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[0,1,0] +// GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[0,1,0] ; encoding: [0x0c,0x42,0x52,0xcc,0x00,0x09,0x42,0x1c] + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_swmmac_f16_16x16x32_f16 v[12:15], s[0:3], v[4:11], v16 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], s[4:11], v16 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], s16 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f16_16x16x32_f16 v[12:15], 1.0, v[4:11], v16 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], 1.0, v16 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], 1.0 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], 1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 +// GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x1c] + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 index_key:1 +// GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 index_key:1 ; encoding: [0x0c,0x48,0x53,0xcc,0x00,0x09,0x42,0x1c] + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 index_key:2 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 index_key:3 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[1,0,0] +// GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[1,0,0] ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x3c] + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,1,0] +// GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,1,0] ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x5c] + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[1,0,0] +// GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[1,0,0] ; encoding: [0x0c,0x41,0x53,0xcc,0x00,0x09,0x42,0x1c] + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[0,1,0] +// GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[0,1,0] ; encoding: [0x0c,0x42,0x53,0xcc,0x00,0x09,0x42,0x1c] + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_swmmac_bf16_16x16x32_bf16 v[12:15], s[0:3], v[4:11], v16 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], s[4:11], v16 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], s16 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_bf16_16x16x32_bf16 v[12:15], 1.0, v[4:11], v16 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], 1.0, v16 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], 1.0 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], 1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 +// GFX12: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x1c] + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 clamp +// GFX12: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 clamp ; encoding: [0x06,0xc0,0x54,0xcc,0x00,0x05,0x3a,0x1c] + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 index_key:1 +// GFX12: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x54,0xcc,0x00,0x05,0x3a,0x1c] + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 index_key:2 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 index_key:3 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0] +// GFX12: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x3c] + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0] +// GFX12: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x5c] + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu8 v[6:13], s[0:1], v[2:5], v14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], s[0:3], v14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], s14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu8 v[6:13], 1, v[2:5], v14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], 1, v14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], 1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], 1.0 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 +// GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x1c] + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 clamp +// GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 clamp ; encoding: [0x03,0xc0,0x55,0xcc,0x00,0x03,0x2e,0x1c] + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 index_key:1 +// GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 index_key:1 ; encoding: [0x03,0x48,0x55,0xcc,0x00,0x03,0x2e,0x1c] + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 index_key:2 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 index_key:3 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[1,0,0] +// GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[1,0,0] ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x3c] + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[0,1,0] +// GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[0,1,0] ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x5c] + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu4 v[3:10], s0, v[1:2], v11 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, s[0:1], v11 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], s11 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu4 v[3:10], 1, v[1:2], v11 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, 1, v11 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], 1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], 1.0 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 +// GFX12: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x1c] + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 clamp +// GFX12: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 clamp ; encoding: [0x06,0xc0,0x56,0xcc,0x00,0x05,0x3a,0x1c] + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 index_key:1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 index_key:2 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 index_key:3 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0] +// GFX12: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x3c] + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0] +// GFX12: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x5c] + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[6:13], s[0:1], v[2:5], v14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], s[0:3], v14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], s14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x64_iu4 v[6:13], 1, v[2:5], v14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], 1, v14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], 1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], 1.0 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 +// GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x57,0xcc,0x00,0x05,0x3a,0x1c] + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1 +// GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x57,0xcc,0x00,0x05,0x3a,0x1c] + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:2 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:3 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], s[0:1], v[2:5], v14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], s[0:3], v14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], s14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], 1.0, v[2:5], v14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], 1.0, v14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], 1.0 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], 1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 +// GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x58,0xcc,0x00,0x05,0x3a,0x1c] + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1 +// GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x58,0xcc,0x00,0x05,0x3a,0x1c] + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:2 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:3 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], s[0:1], v[2:5], v14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], s[0:3], v14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], s14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], 1.0, v[2:5], v14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], 1.0, v14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], 1.0 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], 1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 +// GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x59,0xcc,0x00,0x05,0x3a,0x1c] + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1 +// GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x59,0xcc,0x00,0x05,0x3a,0x1c] + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:2 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:3 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], s[0:1], v[2:5], v14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], s[0:3], v14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], s14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], 1.0, v[2:5], v14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], 1.0, v14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], 1.0 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], 1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 +// GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x5a,0xcc,0x00,0x05,0x3a,0x1c] + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1 +// GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x5a,0xcc,0x00,0x05,0x3a,0x1c] + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:2 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:3 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], s[0:1], v[2:5], v14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], s[0:3], v14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], s14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], 1.0, v[2:5], v14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], 1.0, v14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], 1.0 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], 1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_wmma_w64.s b/llvm/test/MC/AMDGPU/gfx12_asm_wmma_w64.s new file mode 100644 index 000000000000..8bd9e5039b72 --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx12_asm_wmma_w64.s @@ -0,0 +1,1529 @@ +// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=GFX12 %s +// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=GFX12-ERR --implicit-check-not=error: %s + +v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] +// GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] +// GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x3c] + +v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] +// GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x5c] + +v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x9c] + +v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[1,0,0] +// GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[1,0,0] ; encoding: [0x04,0x41,0x40,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,1,0] +// GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,1,0] ; encoding: [0x04,0x42,0x40,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x40,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f32_16x16x16_f16 v[4:7], s[0:1], v[2:3], v[4:7] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], s[2:3], v[4:7] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], s[4:7] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_f16 v[4:7], 1.0, v[2:3], v[4:7] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], 1.0, v[4:7] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], 1.0 +// GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0xca,0x1b] + +v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], 1 +// GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0x06,0x1a] + + + +v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] +// GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] +// GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x3c] + +v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] +// GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x5c] + +v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x9c] + +v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[1,0,0] +// GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[1,0,0] ; encoding: [0x04,0x41,0x41,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,1,0] +// GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,1,0] ; encoding: [0x04,0x42,0x41,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x41,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f32_16x16x16_bf16 v[4:7], s[0:1], v[2:3], v[4:7] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], s[2:3], v[4:7] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], s[4:7] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf16 v[4:7], 1.0, v[2:3], v[4:7] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], 1.0, v[4:7] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], 1.0 +// GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0xca,0x1b] + +v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], 1 +// GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0x06,0x1a] + + + +v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] +// GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0] +// GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x3c] + +v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0] +// GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x5c] + +v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] +// GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x9c] + +v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[1,0,0] +// GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[1,0,0] ; encoding: [0x04,0x41,0x42,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,1,0] +// GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,1,0] ; encoding: [0x04,0x42,0x42,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1] +// GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x42,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f16_16x16x16_f16 v[4:5], s[0:1], v[2:3], v[4:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], s[2:3], v[4:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], s[4:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f16_16x16x16_f16 v[4:5], 1.0, v[2:3], v[4:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], 1.0, v[4:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], 1.0 +// GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0xca,0x1b] + +v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], 1 +// GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0x06,0x1a] + + + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] +// GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0] +// GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x3c] + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0] +// GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x5c] + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] +// GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x9c] + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[1,0,0] +// GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[1,0,0] ; encoding: [0x04,0x41,0x43,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,1,0] +// GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,1,0] ; encoding: [0x04,0x42,0x43,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1] +// GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x43,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_bf16_16x16x16_bf16 v[4:5], s[0:1], v[2:3], v[4:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], s[2:3], v[4:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], s[4:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_bf16_16x16x16_bf16 v[4:5], 1.0, v[2:3], v[4:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], 1.0, v[4:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], 1.0 +// GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0xca,0x1b] + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], 1 +// GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0x06,0x1a] + + + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] +// GFX12: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x1c] + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] clamp +// GFX12: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] clamp ; encoding: [0x02,0xc0,0x44,0xcc,0x00,0x03,0x0a,0x1c] + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] +// GFX12: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x3c] + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] +// GFX12: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x5c] + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[2:5], s0, v1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, s1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, s[0:3] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu8 v[2:5], 1, v1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, 1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, 1 +// GFX12: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0x06,0x1a] + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, 1.0 +// GFX12: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0xca,0x1b] + + + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] +// GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x1c] + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] clamp +// GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] clamp ; encoding: [0x02,0xc0,0x45,0xcc,0x00,0x03,0x0a,0x1c] + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] +// GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x3c] + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] +// GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x5c] + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:5], s0, v1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, s1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, s[0:3] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu4 v[2:5], 1, v1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, 1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, 1 +// GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x06,0x1a] + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, 1.0 +// GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0xca,0x1b] + + + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] +// GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x03,0x0a,0x1c] + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x03,0x0a,0x9c] + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] ; encoding: [0x02,0x44,0x46,0xcc,0x00,0x03,0x0a,0x1c] + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], s0, v1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, s1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, s[0:3] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], 1.0, v1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, 1.0, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, 1.0 +// GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x03,0xca,0x1b] + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, 1 +// GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x03,0x06,0x1a] + + + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] +// GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x03,0x0a,0x1c] + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x03,0x0a,0x9c] + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] ; encoding: [0x02,0x44,0x47,0xcc,0x00,0x03,0x0a,0x1c] + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], s0, v1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, s1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, s[0:3] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], 1.0, v1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, 1.0, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, 1.0 +// GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x03,0xca,0x1b] + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, 1 +// GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x03,0x06,0x1a] + + + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] +// GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x03,0x0a,0x1c] + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x03,0x0a,0x9c] + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] ; encoding: [0x02,0x44,0x48,0xcc,0x00,0x03,0x0a,0x1c] + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], s0, v1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, s1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, s[0:3] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], 1.0, v1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, 1.0, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, 1.0 +// GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x03,0xca,0x1b] + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, 1 +// GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x03,0x06,0x1a] + + + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] +// GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x03,0x0a,0x1c] + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x03,0x0a,0x9c] + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] ; encoding: [0x02,0x44,0x49,0xcc,0x00,0x03,0x0a,0x1c] + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], s0, v1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, s1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, s[0:3] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], 1.0, v1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, 1.0, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, 1.0 +// GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x03,0xca,0x1b] + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, 1 +// GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x03,0x06,0x1a] + + + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] +// GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x1c] + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] clamp +// GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] clamp ; encoding: [0x02,0xc0,0x4a,0xcc,0x00,0x03,0x0a,0x1c] + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] +// GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x3c] + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] +// GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x5c] + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x32_iu4 v[2:5], s0, v1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, s1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, s[0:3] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x32_iu4 v[2:5], 1.0, v1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, 1.0, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, 1.0 +// GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0xca,0x1b] + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, 1 +// GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0x06,0x1a] + + + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 +// GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x1c] + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:1 +// GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:1 ; encoding: [0x06,0x48,0x50,0xcc,0x00,0x05,0x2a,0x1c] + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:2 +// GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:2 ; encoding: [0x06,0x50,0x50,0xcc,0x00,0x05,0x2a,0x1c] + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:3 +// GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:3 ; encoding: [0x06,0x58,0x50,0xcc,0x00,0x05,0x2a,0x1c] + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[1,0,0] +// GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x3c] + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0] +// GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x5c] + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[1,0,0] +// GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[1,0,0] ; encoding: [0x06,0x41,0x50,0xcc,0x00,0x05,0x2a,0x1c] + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[0,1,0] +// GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[0,1,0] ; encoding: [0x06,0x42,0x50,0xcc,0x00,0x05,0x2a,0x1c] + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_swmmac_f32_16x16x32_f16 v[6:9], s[0:1], v[2:5], v10 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], s[0:3], v10 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], s10 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_f16 v[6:9], 1.0, v[2:5], v10 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], 1.0, v10 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], 1.0 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], 1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 +// GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x1c] + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:1 +// GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:1 ; encoding: [0x06,0x48,0x51,0xcc,0x00,0x05,0x2a,0x1c] + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:2 +// GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:2 ; encoding: [0x06,0x50,0x51,0xcc,0x00,0x05,0x2a,0x1c] + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:3 +// GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:3 ; encoding: [0x06,0x58,0x51,0xcc,0x00,0x05,0x2a,0x1c] + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[1,0,0] +// GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x3c] + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0] +// GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x5c] + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[1,0,0] +// GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[1,0,0] ; encoding: [0x06,0x41,0x51,0xcc,0x00,0x05,0x2a,0x1c] + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[0,1,0] +// GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[0,1,0] ; encoding: [0x06,0x42,0x51,0xcc,0x00,0x05,0x2a,0x1c] + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_swmmac_f32_16x16x32_bf16 v[6:9], s[0:1], v[2:5], v10 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], s[0:3], v10 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], s10 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf16 v[6:9], 1.0, v[2:5], v10 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], 1.0, v10 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], 1.0 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], 1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 +// GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x1c] + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 index_key:1 +// GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 index_key:1 ; encoding: [0x06,0x48,0x52,0xcc,0x00,0x05,0x22,0x1c] + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 index_key:2 +// GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 index_key:2 ; encoding: [0x06,0x50,0x52,0xcc,0x00,0x05,0x22,0x1c] + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 index_key:3 +// GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 index_key:3 ; encoding: [0x06,0x58,0x52,0xcc,0x00,0x05,0x22,0x1c] + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0] +// GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x3c] + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0] +// GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x5c] + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[1,0,0] +// GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[1,0,0] ; encoding: [0x06,0x41,0x52,0xcc,0x00,0x05,0x22,0x1c] + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[0,1,0] +// GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[0,1,0] ; encoding: [0x06,0x42,0x52,0xcc,0x00,0x05,0x22,0x1c] + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_swmmac_f16_16x16x32_f16 v[6:7], s[0:1], v[2:5], v8 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], s[0:3], v8 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], s8 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f16_16x16x32_f16 v[6:7], 1.0, v[2:5], v8 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], 1.0, v8 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], 1.0 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], 1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 +// GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x1c] + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 index_key:1 +// GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 index_key:1 ; encoding: [0x06,0x48,0x53,0xcc,0x00,0x05,0x22,0x1c] + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 index_key:2 +// GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 index_key:2 ; encoding: [0x06,0x50,0x53,0xcc,0x00,0x05,0x22,0x1c] + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 index_key:3 +// GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 index_key:3 ; encoding: [0x06,0x58,0x53,0xcc,0x00,0x05,0x22,0x1c] + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0] +// GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x3c] + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0] +// GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x5c] + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[1,0,0] +// GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[1,0,0] ; encoding: [0x06,0x41,0x53,0xcc,0x00,0x05,0x22,0x1c] + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[0,1,0] +// GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[0,1,0] ; encoding: [0x06,0x42,0x53,0xcc,0x00,0x05,0x22,0x1c] + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_swmmac_bf16_16x16x32_bf16 v[6:7], s[0:1], v[2:5], v8 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], s[0:3], v8 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], s8 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_bf16_16x16x32_bf16 v[6:7], 1.0, v[2:5], v8 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], 1.0, v8 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], 1.0 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], 1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 +// GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 clamp +// GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 clamp ; encoding: [0x03,0xc0,0x54,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:1 +// GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x54,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:2 +// GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x54,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:3 +// GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:3 ; encoding: [0x03,0x58,0x54,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0] +// GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0] ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x3c] + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0] +// GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0] ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x5c] + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu8 v[3:6], s0, v[1:2], v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, s[0:1], v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], s7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu8 v[3:6], 1, v[1:2], v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, 1, v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], 1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], 1.0 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 +// GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x1c] + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 clamp +// GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 clamp ; encoding: [0x02,0xc0,0x55,0xcc,0x00,0x03,0x1a,0x1c] + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 index_key:1 +// GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 index_key:1 ; encoding: [0x02,0x48,0x55,0xcc,0x00,0x03,0x1a,0x1c] + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 index_key:2 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 index_key:3 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[1,0,0] +// GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[1,0,0] ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x3c] + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[0,1,0] +// GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[0,1,0] ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x5c] + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu4 v[2:5], s0, v1, v6 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, s1, v6 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, s6 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu4 v[2:5], 1, v1, v6 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, 1, v6 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, 1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, 1.0 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 +// GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 clamp +// GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 clamp ; encoding: [0x03,0xc0,0x56,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 index_key:1 +// GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x56,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 index_key:2 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 index_key:3 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0] +// GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0] ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x3c] + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0] +// GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0] ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x5c] + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[3:6], s0, v[1:2], v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, s[0:1], v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], s7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x64_iu4 v[3:6], 1, v[1:2], v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, 1, v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], 1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], 1.0 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 +// GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x57,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:1 +// GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x57,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:2 +// GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x57,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:3 +// GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:3 ; encoding: [0x03,0x58,0x57,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], s0, v[1:2], v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, s[0:1], v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], s7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], 1.0, v[1:2], v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, 1.0, v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], 1.0 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], 1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 +// GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x58,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:1 +// GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x58,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:2 +// GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x58,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:3 +// GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:3 ; encoding: [0x03,0x58,0x58,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], s0, v[1:2], v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, s[0:1], v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], s7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], 1.0, v[1:2], v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, 1.0, v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], 1.0 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], 1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 +// GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x59,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:1 +// GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x59,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:2 +// GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x59,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:3 +// GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:3 ; encoding: [0x03,0x58,0x59,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], s0, v[1:2], v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, s[0:1], v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], s7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], 1.0, v[1:2], v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, 1.0, v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], 1.0 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], 1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 +// GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x5a,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:1 +// GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x5a,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:2 +// GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x5a,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:3 +// GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:3 ; encoding: [0x03,0x58,0x5a,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], s0, v[1:2], v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, s[0:1], v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], s7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], 1.0, v[1:2], v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, 1.0, v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], 1.0 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], 1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_wmma_w32.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_wmma_w32.txt new file mode 100644 index 000000000000..5079d2f08965 --- /dev/null +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_wmma_w32.txt @@ -0,0 +1,1628 @@ +# RUN: not llvm-mc -disassemble -arch=amdgcn -mcpu=gfx1200 -show-encoding %s | FileCheck --check-prefix=GFX12 %s +# RUN: not llvm-mc -disassemble -arch=amdgcn -mcpu=gfx1200 -show-encoding %s 2>&1 | FileCheck --implicit-check-not=warning: --check-prefix=GFX12-ERR %s + +[0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x1c] +# GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x1c] + +[0x08,0xc0,0x40,0xcc,0x00,0x09,0x22,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x48,0x40,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x50,0x40,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x60,0x40,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x00,0x40,0xcc,0x00,0x09,0x22,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x48,0x40,0xcc,0x00,0x09,0x22,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x50,0x40,0xcc,0x00,0x09,0x22,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x58,0x40,0xcc,0x00,0x09,0x22,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x3c] # neg_lo:[1,0,0] +# GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x3c] + +[0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x5c] # neg_lo:[0,1,0] +# GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x5c] + +[0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x9c] # neg_lo:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x9c] + +[0x08,0x41,0x40,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[1,0,0] +# GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x40,0xcc,0x00,0x09,0x22,0x1c] + +[0x08,0x42,0x40,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[0,1,0] +# GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x40,0xcc,0x00,0x09,0x22,0x1c] + +[0x08,0x44,0x40,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1] ; encoding: [0x08,0x44,0x40,0xcc,0x00,0x09,0x22,0x1c] + +[0x08,0x40,0x40,0xcc,0x00,0x08,0x22,0x1c] # sgpr src0 +# GFX12: v_wmma_f32_16x16x16_f16 v[8:15], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v[4:7], v[8:15] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x08,0x22,0x1c] + +[0x08,0x40,0x40,0xcc,0x00,0x09,0x20,0x1c] # sgpr src1 +# GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], s[4:7]/*Invalid register, operand has 'VReg_128' register class*/, v[8:15] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x20,0x1c] + +[0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x18] # sgpr src2 +# GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], s[8:15]/*Invalid register, operand has 'VReg_256' register class*/ ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x18] + +[0x08,0x40,0x40,0xcc,0xf2,0x08,0x22,0x1c] # 1.0 src0 +# GFX12: v_wmma_f32_16x16x16_f16 v[8:15], 1.0/*Invalid immediate*/, v[4:7], v[8:15] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x08,0x22,0x1c] + +[0x08,0x40,0x40,0xcc,0x00,0xe5,0x21,0x1c] # 1.0 src1 +# GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], 1.0/*Invalid immediate*/, v[8:15] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x01,0x20,0x1c] + +[0x08,0x40,0x40,0xcc,0x00,0x09,0xca,0x1b] # 1.0 src2 +# GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], 1.0 ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0xca,0x1b] + +[0x08,0x40,0x40,0xcc,0x00,0x09,0x06,0x1a] # 1 src2 +# GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], 1 ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x06,0x1a] + + + +[0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x1c] +# GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x1c] + +[0x08,0xc0,0x41,0xcc,0x00,0x09,0x22,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x48,0x41,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x50,0x41,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x60,0x41,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x00,0x41,0xcc,0x00,0x09,0x22,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x41,0xcc,0x00,0x05,0x12,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x41,0xcc,0x00,0x05,0x12,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x58,0x41,0xcc,0x00,0x05,0x12,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x3c] # neg_lo:[1,0,0] +# GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x3c] + +[0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x5c] # neg_lo:[0,1,0] +# GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x5c] + +[0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x9c] # neg_lo:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x9c] + +[0x08,0x41,0x41,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[1,0,0] +# GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x41,0xcc,0x00,0x09,0x22,0x1c] + +[0x08,0x42,0x41,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[0,1,0] +# GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x41,0xcc,0x00,0x09,0x22,0x1c] + +[0x08,0x44,0x41,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1] ; encoding: [0x08,0x44,0x41,0xcc,0x00,0x09,0x22,0x1c] + +[0x08,0x40,0x41,0xcc,0x00,0x08,0x22,0x1c] # sgpr src0 +# GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v[4:7], v[8:15] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x08,0x22,0x1c] + +[0x08,0x40,0x41,0xcc,0x00,0x09,0x20,0x1c] # sgpr src1 +# GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], s[4:7]/*Invalid register, operand has 'VReg_128' register class*/, v[8:15] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x20,0x1c] + +[0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x18] # sgpr src2 +# GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], s[8:15]/*Invalid register, operand has 'VReg_256' register class*/ ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x18] + +[0x08,0x40,0x41,0xcc,0xf2,0x08,0x22,0x1c] # 1.0 src0 +# GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], 1.0/*Invalid immediate*/, v[4:7], v[8:15] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x08,0x22,0x1c] + +[0x08,0x40,0x41,0xcc,0x00,0xe5,0x21,0x1c] # 1.0 src1 +# GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], 1.0/*Invalid immediate*/, v[8:15] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x01,0x20,0x1c] + +[0x08,0x40,0x41,0xcc,0x00,0x09,0xca,0x1b] # 1.0 src2 +# GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], 1.0 ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0xca,0x1b] + +[0x08,0x40,0x41,0xcc,0x00,0x09,0x06,0x1a] # 1 src2 +# GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], 1 ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x06,0x1a] + + + +[0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x1c] +# GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x1c] + +[0x08,0xc0,0x42,0xcc,0x00,0x09,0x22,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x48,0x42,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x50,0x42,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x60,0x42,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x00,0x42,0xcc,0x00,0x09,0x22,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x42,0xcc,0x00,0x05,0x12,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x42,0xcc,0x00,0x05,0x12,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x58,0x42,0xcc,0x00,0x05,0x12,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x3c] # neg_lo:[1,0,0] +# GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x3c] + +[0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x5c] # neg_lo:[0,1,0] +# GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x5c] + +[0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x9c] # neg_lo:[0,0,1] +# GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x9c] + +[0x08,0x41,0x42,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[1,0,0] +# GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x42,0xcc,0x00,0x09,0x22,0x1c] + +[0x08,0x42,0x42,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[0,1,0] +# GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x42,0xcc,0x00,0x09,0x22,0x1c] + +[0x08,0x44,0x42,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[0,0,1] +# GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,0,1] ; encoding: [0x08,0x44,0x42,0xcc,0x00,0x09,0x22,0x1c] + +[0x08,0x40,0x42,0xcc,0x00,0x08,0x22,0x1c] # sgpr src0 +# GFX12: v_wmma_f16_16x16x16_f16 v[8:11], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v[4:7], v[8:11] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x08,0x22,0x1c] + +[0x08,0x40,0x42,0xcc,0x00,0x09,0x20,0x1c] # sgpr src1 +# GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], s[4:7]/*Invalid register, operand has 'VReg_128' register class*/, v[8:11] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x20,0x1c] + +[0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x18] # sgpr src2 +# GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], s[8:11]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x18] + +[0x08,0x40,0x42,0xcc,0xf2,0x08,0x22,0x1c] # 1.0 src0 +# GFX12: v_wmma_f16_16x16x16_f16 v[8:11], 1.0/*Invalid immediate*/, v[4:7], v[8:11] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x08,0x22,0x1c] + +[0x08,0x40,0x42,0xcc,0x00,0xe5,0x21,0x1c] # 1.0 src1 +# GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], 1.0/*Invalid immediate*/, v[8:11] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x01,0x20,0x1c] + +[0x08,0x40,0x42,0xcc,0x00,0x09,0xca,0x1b] # 1.0 src2 +# GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], 1.0 ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0xca,0x1b] + +[0x08,0x40,0x42,0xcc,0x00,0x09,0x06,0x1a] # 1 src2 +# GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], 1 ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x06,0x1a] + + + +[0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x1c] +# GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x1c] + +[0x08,0xc0,0x43,0xcc,0x00,0x09,0x22,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x48,0x43,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x50,0x43,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x60,0x43,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x00,0x43,0xcc,0x00,0x09,0x22,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x43,0xcc,0x00,0x05,0x12,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x43,0xcc,0x00,0x05,0x12,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x58,0x43,0xcc,0x00,0x05,0x12,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x3c] # neg_lo:[1,0,0] +# GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x3c] + +[0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x5c] # neg_lo:[0,1,0] +# GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x5c] + +[0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x9c] # neg_lo:[0,0,1] +# GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x9c] + +[0x08,0x41,0x43,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[1,0,0] +# GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x43,0xcc,0x00,0x09,0x22,0x1c] + +[0x08,0x42,0x43,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[0,1,0] +# GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x43,0xcc,0x00,0x09,0x22,0x1c] + +[0x08,0x44,0x43,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[0,0,1] +# GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,0,1] ; encoding: [0x08,0x44,0x43,0xcc,0x00,0x09,0x22,0x1c] + +[0x08,0x40,0x43,0xcc,0x00,0x08,0x22,0x1c] # sgpr src0 +# GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v[4:7], v[8:11] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x08,0x22,0x1c] + +[0x08,0x40,0x43,0xcc,0x00,0x09,0x20,0x1c] # sgpr src1 +# GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], s[4:7]/*Invalid register, operand has 'VReg_128' register class*/, v[8:11] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x20,0x1c] + +[0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x18] # sgpr src2 +# GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], s[8:11]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x18] + +[0x08,0x40,0x43,0xcc,0xf2,0x08,0x22,0x1c] # 1.0 src0 +# GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], 1.0/*Invalid immediate*/, v[4:7], v[8:11] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x08,0x22,0x1c] + +[0x08,0x40,0x43,0xcc,0x00,0xe5,0x21,0x1c] # 1.0 src1 +# GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], 1.0/*Invalid immediate*/, v[8:11] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x01,0x20,0x1c] + +[0x08,0x40,0x43,0xcc,0x00,0x09,0xca,0x1b] # 1.0 src2 +# GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], 1.0 ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0xca,0x1b] + +[0x08,0x40,0x43,0xcc,0x00,0x09,0x06,0x1a] # 1 src2 +# GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], 1 ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x06,0x1a] + + + +[0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x1c] +# GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0xc0,0x44,0xcc,0x00,0x05,0x12,0x1c] # clamp +# GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] clamp ; encoding: [0x04,0xc0,0x44,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0x48,0x44,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x44,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x60,0x44,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x00,0x44,0xcc,0x00,0x05,0x12,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x48,0x44,0xcc,0x00,0x03,0x0a,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x50,0x44,0xcc,0x00,0x03,0x0a,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x58,0x44,0xcc,0x00,0x03,0x0a,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x3c] # neg_lo:[1,0,0] +# GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x3c] + +[0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x5c] # neg_lo:[0,1,0] +# GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x5c] + +[0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x41,0x44,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x42,0x44,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x44,0x44,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x44,0xcc,0x00,0x04,0x12,0x1c] # sgpr src0 +# GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x04,0x12,0x1c] + +[0x04,0x40,0x44,0xcc,0x00,0x05,0x10,0x1c] # sgpr src1 +# GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], s[2:3]/*Invalid register, operand has 'VReg_64' register class*/, v[4:11] ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0x10,0x1c] + +[0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x18] # sgpr src2 +# GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], s[4:11]/*Invalid register, operand has 'VReg_256' register class*/ ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x18] + +[0x04,0x40,0x44,0xcc,0x81,0x04,0x12,0x1c] # 1 src0 +# GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], 1/*Invalid immediate*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x44,0xcc,0x01,0x04,0x12,0x1c] + +[0x04,0x40,0x44,0xcc,0x00,0x03,0x11,0x1c] # 1 src1 +# GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], 1/*Invalid immediate*/, v[4:11] ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x03,0x10,0x1c] + +[0x04,0x40,0x44,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0x06,0x1a] + +[0x04,0x40,0x44,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0xca,0x1b] + + + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x1c] +# GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x1c] + +[0x02,0xc0,0x45,0xcc,0x00,0x03,0x0a,0x1c] # clamp +# GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] clamp ; encoding: [0x02,0xc0,0x45,0xcc,0x00,0x03,0x0a,0x1c] + +[0x02,0x48,0x45,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x50,0x45,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x60,0x45,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x00,0x45,0xcc,0x00,0x03,0x0a,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x48,0x45,0xcc,0x00,0x03,0x0a,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x50,0x45,0xcc,0x00,0x03,0x0a,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x58,0x45,0xcc,0x00,0x03,0x0a,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x3c] # neg_lo:[1,0,0] +# GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[1,0,0] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x3c] + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x5c] # neg_lo:[0,1,0] +# GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[0,1,0] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x5c] + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x41,0x45,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x42,0x45,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x44,0x45,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x45,0xcc,0x00,0x02,0x0a,0x1c] # sgpr src0 +# GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], s0/*Invalid register, operand has 'VGPR_32' register class*/, v1, v[2:9] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x02,0x0a,0x1c] + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x08,0x1c] # sgpr src1 +# GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], v0, s1/*Invalid register, operand has 'VGPR_32' register class*/, v[2:9] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x08,0x1c] + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x18] # sgpr src2 +# GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, s[0:7]/*Invalid register, operand has 'VReg_256' register class*/ ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x02,0x18] + +[0x02,0x40,0x45,0xcc,0x81,0x02,0x0a,0x1c] # 1 src0 +# GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], 1/*Invalid immediate*/, v1, v[2:9] ; encoding: [0x02,0x40,0x45,0xcc,0x01,0x02,0x0a,0x1c] + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x09,0x1c] # 1 src1 +# GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], v0, 1/*Invalid immediate*/, v[2:9] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x08,0x1c] + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x06,0x1a] # 1 src2 +# GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, 1 ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x06,0x1a] + +[0x02,0x40,0x45,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2 +# GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, 1.0 ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0xca,0x1b] + + + +[0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x1c] +# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0xc0,0x46,0xcc,0x00,0x05,0x12,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x46,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x46,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x60,0x46,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x00,0x46,0xcc,0x00,0x05,0x12,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x46,0xcc,0x00,0x05,0x12,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x46,0xcc,0x00,0x05,0x12,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x58,0x46,0xcc,0x00,0x05,0x12,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x3c] # neg_lo:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x5c] # neg_lo:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x9c] # neg_lo:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x9c] + +[0x04,0x41,0x46,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x42,0x46,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x44,0x46,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x46,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0x40,0x46,0xcc,0x00,0x04,0x12,0x1c] # sgpr src0 +# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x04,0x12,0x1c] + +[0x04,0x40,0x46,0xcc,0x00,0x05,0x10,0x1c] # sgpr src1 +# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], s[2:3]/*Invalid register, operand has 'VReg_64' register class*/, v[4:11] ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x05,0x10,0x1c] + +[0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x18] # sgpr src2 +# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], s[4:11]/*Invalid register, operand has 'VReg_256' register class*/ ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x18] + +[0x04,0x40,0x46,0xcc,0xf2,0x04,0x12,0x1c] # 1.0 src0 +# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], 1.0/*Invalid immediate*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x04,0x12,0x1c] + +[0x04,0x40,0x46,0xcc,0x00,0xe5,0x11,0x1c] # 1.0 src1 +# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], 1.0/*Invalid immediate*/, v[4:11] ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x01,0x10,0x1c] + +[0x04,0x40,0x46,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x05,0xca,0x1b] + +[0x04,0x40,0x46,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x05,0x06,0x1a] + + + +[0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x1c] +# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0xc0,0x48,0xcc,0x00,0x05,0x12,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x48,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x48,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x60,0x48,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x00,0x48,0xcc,0x00,0x05,0x12,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x48,0xcc,0x00,0x05,0x12,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x48,0xcc,0x00,0x05,0x12,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x58,0x48,0xcc,0x00,0x05,0x12,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x3c] # neg_lo:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x5c] # neg_lo:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x9c] # neg_lo:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x9c] + +[0x04,0x41,0x48,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x42,0x48,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x44,0x48,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x48,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0x40,0x48,0xcc,0x00,0x04,0x12,0x1c] # sgpr src0 +# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x04,0x12,0x1c] + +[0x04,0x40,0x48,0xcc,0x00,0x05,0x10,0x1c] # sgpr src1 +# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], s[2:3]/*Invalid register, operand has 'VReg_64' register class*/, v[4:11] ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x05,0x10,0x1c] + +[0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x18] # sgpr src2 +# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], s[4:11]/*Invalid register, operand has 'VReg_256' register class*/ ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x18] + +[0x04,0x40,0x48,0xcc,0xf2,0x04,0x12,0x1c] # 1.0 src0 +# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], 1.0/*Invalid immediate*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x04,0x12,0x1c] + +[0x04,0x40,0x48,0xcc,0x00,0xe5,0x11,0x1c] # 1.0 src1 +# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], 1.0/*Invalid immediate*/, v[4:11] ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x01,0x10,0x1c] + +[0x04,0x40,0x48,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x05,0xca,0x1b] + +[0x04,0x40,0x48,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x05,0x06,0x1a] + + + +[0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x1c] +# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0xc0,0x47,0xcc,0x00,0x05,0x12,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x47,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x47,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x60,0x47,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x00,0x47,0xcc,0x00,0x05,0x12,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x47,0xcc,0x00,0x05,0x12,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x47,0xcc,0x00,0x05,0x12,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x58,0x47,0xcc,0x00,0x05,0x12,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x3c] # neg_lo:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x5c] # neg_lo:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x9c] # neg_lo:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x9c] + +[0x04,0x41,0x47,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x42,0x47,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x44,0x47,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x47,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0x40,0x47,0xcc,0x00,0x04,0x12,0x1c] # sgpr src0 +# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x04,0x12,0x1c] + +[0x04,0x40,0x47,0xcc,0x00,0x05,0x10,0x1c] # sgpr src1 +# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], s[2:3]/*Invalid register, operand has 'VReg_64' register class*/, v[4:11] ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x05,0x10,0x1c] + +[0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x18] # sgpr src2 +# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], s[4:11]/*Invalid register, operand has 'VReg_256' register class*/ ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x18] + +[0x04,0x40,0x47,0xcc,0xf2,0x04,0x12,0x1c] # 1.0 src0 +# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], 1.0/*Invalid immediate*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x04,0x12,0x1c] + +[0x04,0x40,0x47,0xcc,0x00,0xe5,0x11,0x1c] # 1.0 src1 +# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], 1.0/*Invalid immediate*/, v[4:11] ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x01,0x10,0x1c] + +[0x04,0x40,0x47,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x05,0xca,0x1b] + +[0x04,0x40,0x47,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x05,0x06,0x1a] + + + +[0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x1c] +# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0xc0,0x49,0xcc,0x00,0x05,0x12,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x49,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x49,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x60,0x49,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x00,0x49,0xcc,0x00,0x05,0x12,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x49,0xcc,0x00,0x05,0x12,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x49,0xcc,0x00,0x05,0x12,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x58,0x49,0xcc,0x00,0x05,0x12,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x3c] # neg_lo:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x5c] # neg_lo:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x9c] # neg_lo:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x9c] + +[0x04,0x41,0x49,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x42,0x49,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x44,0x49,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x49,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0x40,0x49,0xcc,0x00,0x04,0x12,0x1c] # sgpr src0 +# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x04,0x12,0x1c] + +[0x04,0x40,0x49,0xcc,0x00,0x05,0x10,0x1c] # sgpr src1 +# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], s[2:3]/*Invalid register, operand has 'VReg_64' register class*/, v[4:11] ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x05,0x10,0x1c] + +[0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x18] # sgpr src2 +# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], s[4:11]/*Invalid register, operand has 'VReg_256' register class*/ ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x18] + +[0x04,0x40,0x49,0xcc,0xf2,0x04,0x12,0x1c] # 1.0 src0 +# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], 1.0/*Invalid immediate*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x04,0x12,0x1c] + +[0x04,0x40,0x49,0xcc,0x00,0xe5,0x11,0x1c] # 1.0 src1 +# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], 1.0/*Invalid immediate*/, v[4:11] ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x01,0x10,0x1c] + +[0x04,0x40,0x49,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x05,0xca,0x1b] + +[0x04,0x40,0x49,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x05,0x06,0x1a] + + + +[0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x1c] +# GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0xc0,0x4a,0xcc,0x00,0x05,0x12,0x1c] # clamp +# GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] clamp ; encoding: [0x04,0xc0,0x4a,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0x48,0x4a,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x4a,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x60,0x4a,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x00,0x4a,0xcc,0x00,0x05,0x12,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x4a,0xcc,0x00,0x05,0x12,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x4a,0xcc,0x00,0x05,0x12,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x58,0x4a,0xcc,0x00,0x05,0x12,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x3c] # neg_lo:[1,0,0] +# GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x3c] + +[0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x5c] # neg_lo:[0,1,0] +# GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x5c] + +[0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x41,0x4a,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x42,0x4a,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x44,0x4a,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x4a,0xcc,0x00,0x04,0x12,0x1c] # sgpr src0 +# GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x04,0x12,0x1c] + +[0x04,0x40,0x4a,0xcc,0x00,0x05,0x10,0x1c] # sgpr src1 +# GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], s[2:3]/*Invalid register, operand has 'VReg_64' register class*/, v[4:11] ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0x10,0x1c] + +[0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x18] # sgpr src2 +# GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], s[4:11]/*Invalid register, operand has 'VReg_256' register class*/ ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x18] + +[0x04,0x40,0x4a,0xcc,0x81,0x04,0x12,0x1c] # 1 src0 +# GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], 1/*Invalid immediate*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x4a,0xcc,0x01,0x04,0x12,0x1c] + +[0x04,0x40,0x4a,0xcc,0x00,0x03,0x11,0x1c] # 1 src1 +# GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], 1/*Invalid immediate*/, v[4:11] ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x03,0x10,0x1c] + +[0x04,0x40,0x4a,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0x06,0x1a] + +[0x04,0x40,0x4a,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0xca,0x1b] + + + +[0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x1c] +# GFX12:v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x1c] + +[0x0c,0xc0,0x50,0xcc,0x00,0x09,0x52,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x48,0x50,0xcc,0x00,0x09,0x52,0x1c] # op_sel:[1,0,0] +# GFX12:v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 index_key:1 ; encoding: [0x0c,0x48,0x50,0xcc,0x00,0x09,0x52,0x1c] + +[0x0c,0x50,0x50,0xcc,0x00,0x09,0x52,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x60,0x50,0xcc,0x00,0x09,0x52,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x00,0x50,0xcc,0x00,0x09,0x52,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x48,0x50,0xcc,0x00,0x09,0x52,0x1c] # index_key:1 +# GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 index_key:1 ; encoding: [0x0c,0x48,0x50,0xcc,0x00,0x09,0x52,0x1c] + +[0x0c,0x50,0x50,0xcc,0x00,0x09,0x52,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x58,0x50,0xcc,0x00,0x09,0x52,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x3c] # neg_lo:[1,0,0] +# GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[1,0,0] ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x3c] + +[0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x5c] # neg_lo:[0,1,0] +# GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,1,0] ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x5c] + +[0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x41,0x50,0xcc,0x00,0x09,0x52,0x1c] # neg_hi:[1,0,0] +# GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[1,0,0] ; encoding: [0x0c,0x41,0x50,0xcc,0x00,0x09,0x52,0x1c] + +[0x0c,0x42,0x50,0xcc,0x00,0x09,0x52,0x1c] # neg_hi:[0,1,0] +# GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[0,1,0] ; encoding: [0x0c,0x42,0x50,0xcc,0x00,0x09,0x52,0x1c] + +[0x0c,0x44,0x50,0xcc,0x00,0x09,0x52,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x40,0x50,0xcc,0x00,0x08,0x52,0x1c] # sgpr src0 +# GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v[4:11], v20 ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x08,0x52,0x1c] + +[0x0c,0x40,0x50,0xcc,0x00,0x09,0x50,0x1c] # sgpr src1 +# GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], s[4:11]/*Invalid register, operand has 'VReg_256' register class*/, v20 ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x09,0x50,0x1c] + +[0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x18] # sgpr src2 +# GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], s20/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x18] + +[0x0c,0x40,0x50,0xcc,0xf2,0x08,0x52,0x1c] # 1.0 src0 +# GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], 1.0/*Invalid immediate*/, v[4:11], v20 ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x08,0x52,0x1c] + +[0x0c,0x40,0x50,0xcc,0x00,0xe5,0x51,0x1c] # 1.0 src1 +# GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], 1.0/*Invalid immediate*/, v20 ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x01,0x50,0x1c] + +[0x0c,0x40,0x50,0xcc,0x00,0x09,0xca,0x1b] # 1.0 src2 +# GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], 1.0/*Invalid immediate*/ ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x09,0x02,0x18] + +[0x0c,0x40,0x50,0xcc,0x00,0x09,0x06,0x1a] # 1 src2 +# GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], 1/*Invalid immediate*/ ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x09,0x06,0x18] + + + +[0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x1c] +# GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x1c] + +[0x0c,0xc0,0x51,0xcc,0x00,0x09,0x52,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x48,0x51,0xcc,0x00,0x09,0x52,0x1c] # op_sel:[1,0,0] +# GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 index_key:1 ; encoding: [0x0c,0x48,0x51,0xcc,0x00,0x09,0x52,0x1c] + +[0x0c,0x50,0x51,0xcc,0x00,0x09,0x52,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x60,0x51,0xcc,0x00,0x09,0x52,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x00,0x51,0xcc,0x00,0x09,0x52,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x48,0x51,0xcc,0x00,0x09,0x52,0x1c] # index_key:1 +# GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 index_key:1 ; encoding: [0x0c,0x48,0x51,0xcc,0x00,0x09,0x52,0x1c] + +[0x0c,0x50,0x51,0xcc,0x00,0x09,0x52,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x58,0x51,0xcc,0x00,0x09,0x52,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x3c] # neg_lo:[1,0,0] +# GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[1,0,0] ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x3c] + +[0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x5c] # neg_lo:[0,1,0] +# GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,1,0] ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x5c] + +[0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x41,0x51,0xcc,0x00,0x09,0x52,0x1c] # neg_hi:[1,0,0] +# GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[1,0,0] ; encoding: [0x0c,0x41,0x51,0xcc,0x00,0x09,0x52,0x1c] + +[0x0c,0x42,0x51,0xcc,0x00,0x09,0x52,0x1c] # neg_hi:[0,1,0] +# GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[0,1,0] ; encoding: [0x0c,0x42,0x51,0xcc,0x00,0x09,0x52,0x1c] + +[0x0c,0x44,0x51,0xcc,0x00,0x09,0x52,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x40,0x51,0xcc,0x00,0x08,0x52,0x1c] # sgpr src0 +# GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v[4:11], v20 ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x08,0x52,0x1c] + +[0x0c,0x40,0x51,0xcc,0x00,0x09,0x50,0x1c] # sgpr src1 +# GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], s[4:11]/*Invalid register, operand has 'VReg_256' register class*/, v20 ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x09,0x50,0x1c] + +[0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x18] # sgpr src2 +# GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], s20/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x18] + +[0x0c,0x40,0x51,0xcc,0xf2,0x08,0x52,0x1c] # 1.0 src0 +# GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], 1.0/*Invalid immediate*/, v[4:11], v20 ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x08,0x52,0x1c] + +[0x0c,0x40,0x51,0xcc,0x00,0xe5,0x51,0x1c] # 1.0 src1 +# GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], 1.0/*Invalid immediate*/, v20 ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x01,0x50,0x1c] + +[0x0c,0x40,0x51,0xcc,0x00,0x09,0xca,0x1b] # 1.0 src2 +# GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], 1.0/*Invalid immediate*/ ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x09,0x02,0x18] + +[0x0c,0x40,0x51,0xcc,0x00,0x09,0x06,0x1a] # 1 src2 +# GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], 1/*Invalid immediate*/ ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x09,0x06,0x18] + + + +[0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x1c] +# GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x1c] + +[0x0c,0xc0,0x52,0xcc,0x00,0x09,0x42,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x48,0x52,0xcc,0x00,0x09,0x42,0x1c] # op_sel:[1,0,0] +# GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 index_key:1 ; encoding: [0x0c,0x48,0x52,0xcc,0x00,0x09,0x42,0x1c] + +[0x0c,0x50,0x52,0xcc,0x00,0x09,0x42,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x60,0x52,0xcc,0x00,0x09,0x42,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x00,0x52,0xcc,0x00,0x09,0x42,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x48,0x52,0xcc,0x00,0x09,0x42,0x1c] # index_key:1 +# GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 index_key:1 ; encoding: [0x0c,0x48,0x52,0xcc,0x00,0x09,0x42,0x1c] + +[0x0c,0x50,0x52,0xcc,0x00,0x09,0x42,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x58,0x52,0xcc,0x00,0x09,0x42,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x3c] # neg_lo:[1,0,0] +# GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[1,0,0] ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x3c] + +[0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x5c] # neg_lo:[0,1,0] +# GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,1,0] ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x5c] + +[0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x41,0x52,0xcc,0x00,0x09,0x42,0x1c] # neg_hi:[1,0,0] +# GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[1,0,0] ; encoding: [0x0c,0x41,0x52,0xcc,0x00,0x09,0x42,0x1c] + +[0x0c,0x42,0x52,0xcc,0x00,0x09,0x42,0x1c] # neg_hi:[0,1,0] +# GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[0,1,0] ; encoding: [0x0c,0x42,0x52,0xcc,0x00,0x09,0x42,0x1c] + +[0x0c,0x44,0x52,0xcc,0x00,0x09,0x42,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x40,0x52,0xcc,0x00,0x08,0x42,0x1c] # sgpr src0 +# GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v[4:11], v16 ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x08,0x42,0x1c] + +[0x0c,0x40,0x52,0xcc,0x00,0x09,0x40,0x1c] # sgpr src1 +# GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], s[4:11]/*Invalid register, operand has 'VReg_256' register class*/, v16 ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x09,0x40,0x1c] + +[0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x18] # sgpr src2 +# GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], s16/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x18] + +[0x0c,0x40,0x52,0xcc,0xf2,0x08,0x42,0x1c] # 1.0 src0 +# GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], 1.0/*Invalid immediate*/, v[4:11], v16 ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x08,0x42,0x1c] + +[0x0c,0x40,0x52,0xcc,0x00,0xe5,0x41,0x1c] # 1.0 src1 +# GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], 1.0/*Invalid immediate*/, v16 ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x01,0x40,0x1c] + +[0x0c,0x40,0x52,0xcc,0x00,0x09,0xca,0x1b] # 1.0 src2 +# GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], 1.0/*Invalid immediate*/ ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x09,0x02,0x18] + +[0x0c,0x40,0x52,0xcc,0x00,0x09,0x06,0x1a] # 1 src2 +# GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], 1/*Invalid immediate*/ ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x09,0x06,0x18] + + + +[0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x1c] +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x1c] + +[0x0c,0xc0,0x53,0xcc,0x00,0x09,0x42,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x48,0x53,0xcc,0x00,0x09,0x42,0x1c] # op_sel:[1,0,0] +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 index_key:1 ; encoding: [0x0c,0x48,0x53,0xcc,0x00,0x09,0x42,0x1c] + +[0x0c,0x50,0x53,0xcc,0x00,0x09,0x42,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x60,0x53,0xcc,0x00,0x09,0x42,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x00,0x53,0xcc,0x00,0x09,0x42,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x48,0x53,0xcc,0x00,0x09,0x42,0x1c] # index_key:1 +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 index_key:1 ; encoding: [0x0c,0x48,0x53,0xcc,0x00,0x09,0x42,0x1c] + +[0x0c,0x50,0x53,0xcc,0x00,0x09,0x42,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x58,0x53,0xcc,0x00,0x09,0x42,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x3c] # neg_lo:[1,0,0] +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[1,0,0] ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x3c] + +[0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x5c] # neg_lo:[0,1,0] +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,1,0] ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x5c] + +[0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x41,0x53,0xcc,0x00,0x09,0x42,0x1c] # neg_hi:[1,0,0] +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[1,0,0] ; encoding: [0x0c,0x41,0x53,0xcc,0x00,0x09,0x42,0x1c] + +[0x0c,0x42,0x53,0xcc,0x00,0x09,0x42,0x1c] # neg_hi:[0,1,0] +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[0,1,0] ; encoding: [0x0c,0x42,0x53,0xcc,0x00,0x09,0x42,0x1c] + +[0x0c,0x44,0x53,0xcc,0x00,0x09,0x42,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x40,0x53,0xcc,0x00,0x08,0x42,0x1c] # sgpr src0 +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v[4:11], v16 ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x08,0x42,0x1c] + +[0x0c,0x40,0x53,0xcc,0x00,0x09,0x40,0x1c] # sgpr src1 +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], s[4:11]/*Invalid register, operand has 'VReg_256' register class*/, v16 ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x09,0x40,0x1c] + +[0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x18] # sgpr src2 +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], s16/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x18] + +[0x0c,0x40,0x53,0xcc,0xf2,0x08,0x42,0x1c] # 1.0 src0 +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], 1.0/*Invalid immediate*/, v[4:11], v16 ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x08,0x42,0x1c] + +[0x0c,0x40,0x53,0xcc,0x00,0xe5,0x41,0x1c] # 1.0 src1 +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], 1.0/*Invalid immediate*/, v16 ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x01,0x40,0x1c] + +[0x0c,0x40,0x53,0xcc,0x00,0x09,0xca,0x1b] # 1.0 src2 +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], 1.0/*Invalid immediate*/ ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x09,0x02,0x18] + +[0x0c,0x40,0x53,0xcc,0x00,0x09,0x06,0x1a] # 1 src2 +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], 1/*Invalid immediate*/ ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x09,0x06,0x18] + + + +[0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x1c] +# GFX12: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x1c] + +[0x06,0xc0,0x54,0xcc,0x00,0x05,0x3a,0x1c] # clamp +# GFX12: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 clamp ; encoding: [0x06,0xc0,0x54,0xcc,0x00,0x05,0x3a,0x1c] + +[0x06,0x48,0x54,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[1,0,0] +# GFX12: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x54,0xcc,0x00,0x05,0x3a,0x1c] + +[0x06,0x50,0x54,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x60,0x54,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x00,0x54,0xcc,0x00,0x05,0x3a,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x48,0x54,0xcc,0x00,0x05,0x3a,0x1c] # index_key:1 +# GFX12: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x54,0xcc,0x00,0x05,0x3a,0x1c] + +[0x06,0x50,0x54,0xcc,0x00,0x05,0x3a,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x58,0x54,0xcc,0x00,0x05,0x3a,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x3c] # neg_lo:[1,0,0] +# GFX12: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x3c] + +[0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x5c] # neg_lo:[0,1,0] +# GFX12: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x5c] + +[0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x41,0x54,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x42,0x54,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x44,0x54,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x54,0xcc,0x00,0x04,0x3a,0x1c] # sgpr src0 +# GFX12:v_swmmac_i32_16x16x32_iu8 v[6:13], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:5], v14 ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x04,0x3a,0x1c] + +[0x06,0x40,0x54,0xcc,0x00,0x05,0x38,0x1c] # sgpr src1 +# GFX12:v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v14 ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x01,0x38,0x1c] + +[0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x18] # sgpr src2 +# GFX12:v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], s14/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x18] + +[0x06,0x40,0x54,0xcc,0x81,0x04,0x3a,0x1c] # 1 src0 +# GFX12:v_swmmac_i32_16x16x32_iu8 v[6:13], 1/*Invalid immediate*/, v[2:5], v14 ; encoding: [0x06,0x40,0x54,0xcc,0x01,0x04,0x3a,0x1c] + +[0x06,0x40,0x54,0xcc,0x00,0x03,0x39,0x1c] # 1 src1 +# GFX12:v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], 1/*Invalid immediate*/, v14 ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x03,0x38,0x1c] + +[0x06,0x40,0x54,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX12:v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], 1/*Invalid immediate*/ ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x05,0x06,0x18] + +[0x06,0x40,0x54,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX12:v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], 1.0/*Invalid immediate*/ ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x05,0x02,0x18] + + + +[0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x1c] +# GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x1c] + +[0x03,0xc0,0x55,0xcc,0x00,0x03,0x2e,0x1c] # clamp +# GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 clamp ; encoding: [0x03,0xc0,0x55,0xcc,0x00,0x03,0x2e,0x1c] + +[0x03,0x48,0x55,0xcc,0x00,0x03,0x2e,0x1c] # op_sel:[1,0,0] +# GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 index_key:1 ; encoding: [0x03,0x48,0x55,0xcc,0x00,0x03,0x2e,0x1c] + +[0x03,0x50,0x55,0xcc,0x00,0x03,0x2e,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x60,0x55,0xcc,0x00,0x03,0x2e,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x00,0x55,0xcc,0x00,0x03,0x2e,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x48,0x55,0xcc,0x00,0x03,0x2e,0x1c] # index_key:1 +# GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 index_key:1 ; encoding: [0x03,0x48,0x55,0xcc,0x00,0x03,0x2e,0x1c] + +[0x03,0x50,0x55,0xcc,0x00,0x03,0x2e,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x58,0x55,0xcc,0x00,0x03,0x2e,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x3c] # neg_lo:[1,0,0] +# GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[1,0,0] ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x3c] + +[0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x5c] # neg_lo:[0,1,0] +# GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[0,1,0] ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x5c] + +[0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x41,0x55,0xcc,0x00,0x03,0x2e,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x42,0x55,0xcc,0x00,0x03,0x2e,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x44,0x55,0xcc,0x00,0x03,0x2e,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x55,0xcc,0x00,0x02,0x2e,0x1c] # sgpr src0 +# GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], s0/*Invalid register, operand has 'VGPR_32' register class*/, v[1:2], v11 ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x02,0x2e,0x1c] + +[0x03,0x40,0x55,0xcc,0x00,0x03,0x2c,0x1c] # sgpr src1 +# GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v11 ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x01,0x2c,0x1c] + +[0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x18] # sgpr src2 +# GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], s11/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x18] + +[0x03,0x40,0x55,0xcc,0x81,0x02,0x2e,0x1c] # 1 src0 +# GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], 1/*Invalid immediate*/, v[1:2], v11 ; encoding: [0x03,0x40,0x55,0xcc,0x01,0x02,0x2e,0x1c] + +[0x03,0x40,0x55,0xcc,0x00,0x03,0x2d,0x1c] # 1 src1 +# GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, 1/*Invalid immediate*/, v11 ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x03,0x2c,0x1c] + +[0x03,0x40,0x55,0xcc,0x00,0x03,0x06,0x1a] # 1 src2 +# GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], 1/*Invalid immediate*/ ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x03,0x06,0x18] + +[0x03,0x40,0x55,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2 +# GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], 1.0/*Invalid immediate*/ ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x03,0x02,0x18] + + + +[0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x1c] +# GFX12: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x1c] + +[0x06,0xc0,0x56,0xcc,0x00,0x05,0x3a,0x1c] # clamp +# GFX12: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 clamp ; encoding: [0x06,0xc0,0x56,0xcc,0x00,0x05,0x3a,0x1c] + +[0x06,0x48,0x56,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x50,0x56,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x60,0x56,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x00,0x56,0xcc,0x00,0x05,0x3a,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x48,0x56,0xcc,0x00,0x05,0x3a,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x50,0x56,0xcc,0x00,0x05,0x3a,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x58,0x56,0xcc,0x00,0x05,0x3a,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x3c] # neg_lo:[1,0,0] +# GFX12: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x3c] + +[0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x5c] # neg_lo:[0,1,0] +# GFX12: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x5c] + +[0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x41,0x56,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x42,0x56,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x44,0x56,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x56,0xcc,0x00,0x04,0x3a,0x1c] # sgpr src0 +# GFX12: v_swmmac_i32_16x16x64_iu4 v[6:13], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:5], v14 ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x04,0x3a,0x1c] + +[0x06,0x40,0x56,0xcc,0x00,0x05,0x38,0x1c] # sgpr src1 +# GFX12: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v14 ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x01,0x38,0x1c] + +[0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x18] # sgpr src2 +# GFX12: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], s14/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x18] + +[0x06,0x40,0x56,0xcc,0x81,0x04,0x3a,0x1c] # 1 src0 +# GFX12: v_swmmac_i32_16x16x64_iu4 v[6:13], 1/*Invalid immediate*/, v[2:5], v14 ; encoding: [0x06,0x40,0x56,0xcc,0x01,0x04,0x3a,0x1c] + +[0x06,0x40,0x56,0xcc,0x00,0x03,0x39,0x1c] # 1 src1 +# GFX12: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], 1/*Invalid immediate*/, v14 ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x03,0x38,0x1c] + +[0x06,0x40,0x56,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX12: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], 1/*Invalid immediate*/ ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x05,0x06,0x18] + +[0x06,0x40,0x56,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX12: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], 1.0/*Invalid immediate*/ ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x05,0x02,0x18] + + + +[0x06,0x40,0x57,0xcc,0x00,0x05,0x3a,0x1c] +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x57,0xcc,0x00,0x05,0x3a,0x1c] + +[0x06,0xc0,0x57,0xcc,0x00,0x05,0x3a,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x48,0x57,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[1,0,0] +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x57,0xcc,0x00,0x05,0x3a,0x1c] + +[0x06,0x50,0x57,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x60,0x57,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x57,0xcc,0x00,0x05,0x3a,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x57,0xcc,0x00,0x05,0x3a,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x00,0x57,0xcc,0x00,0x05,0x3a,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x48,0x57,0xcc,0x00,0x05,0x3a,0x1c] # index_key:1 +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x57,0xcc,0x00,0x05,0x3a,0x1c] + +[0x06,0x50,0x57,0xcc,0x00,0x05,0x3a,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x58,0x57,0xcc,0x00,0x05,0x3a,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x57,0xcc,0x00,0x05,0x3a,0x3c] # neg_lo:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x57,0xcc,0x00,0x05,0x3a,0x5c] # neg_lo:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x57,0xcc,0x00,0x05,0x3a,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x41,0x57,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x42,0x57,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x44,0x57,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x57,0xcc,0x00,0x04,0x3a,0x1c] # sgpr src0 +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:5], v14 ; encoding: [0x06,0x40,0x57,0xcc,0x00,0x04,0x3a,0x1c] + +[0x06,0x40,0x57,0xcc,0x00,0x05,0x38,0x1c] # sgpr src1 +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v14 ; encoding: [0x06,0x40,0x57,0xcc,0x00,0x01,0x38,0x1c] + +[0x06,0x40,0x57,0xcc,0x00,0x05,0x3a,0x18] # sgpr src2 +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], s14/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x06,0x40,0x57,0xcc,0x00,0x05,0x3a,0x18] + +[0x06,0x40,0x57,0xcc,0xf2,0x04,0x3a,0x1c] # 1.0 src0 +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], 1.0/*Invalid immediate*/, v[2:5], v14 ; encoding: [0x06,0x40,0x57,0xcc,0x00,0x04,0x3a,0x1c] + +[0x06,0x40,0x57,0xcc,0x00,0xe5,0x39,0x1c] # 1.0 src1 +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], 1.0/*Invalid immediate*/, v14 ; encoding: [0x06,0x40,0x57,0xcc,0x00,0x01,0x38,0x1c] + +[0x06,0x40,0x57,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], 1.0/*Invalid immediate*/ ; encoding: [0x06,0x40,0x57,0xcc,0x00,0x05,0x02,0x18] + +[0x06,0x40,0x57,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], 1/*Invalid immediate*/ ; encoding: [0x06,0x40,0x57,0xcc,0x00,0x05,0x06,0x18] + + + +[0x06,0x40,0x58,0xcc,0x00,0x05,0x3a,0x1c] +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x58,0xcc,0x00,0x05,0x3a,0x1c] + +[0x06,0xc0,0x58,0xcc,0x00,0x05,0x3a,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x48,0x58,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[1,0,0] +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x58,0xcc,0x00,0x05,0x3a,0x1c] + +[0x06,0x50,0x58,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x60,0x58,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x58,0xcc,0x00,0x05,0x3a,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x58,0xcc,0x00,0x05,0x3a,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x00,0x58,0xcc,0x00,0x05,0x3a,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x48,0x58,0xcc,0x00,0x05,0x3a,0x1c] # index_key:1 +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x58,0xcc,0x00,0x05,0x3a,0x1c] + +[0x06,0x50,0x58,0xcc,0x00,0x05,0x3a,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x58,0x58,0xcc,0x00,0x05,0x3a,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x58,0xcc,0x00,0x05,0x3a,0x3c] # neg_lo:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x58,0xcc,0x00,0x05,0x3a,0x5c] # neg_lo:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x58,0xcc,0x00,0x05,0x3a,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x41,0x58,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x42,0x58,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x44,0x58,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x58,0xcc,0x00,0x04,0x3a,0x1c] # sgpr src0 +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:5], v14 ; encoding: [0x06,0x40,0x58,0xcc,0x00,0x04,0x3a,0x1c] + +[0x06,0x40,0x58,0xcc,0x00,0x05,0x38,0x1c] # sgpr src1 +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v14 ; encoding: [0x06,0x40,0x58,0xcc,0x00,0x01,0x38,0x1c] + +[0x06,0x40,0x58,0xcc,0x00,0x05,0x3a,0x18] # sgpr src2 +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], s14/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x06,0x40,0x58,0xcc,0x00,0x05,0x3a,0x18] + +[0x06,0x40,0x58,0xcc,0xf2,0x04,0x3a,0x1c] # 1.0 src0 +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], 1.0/*Invalid immediate*/, v[2:5], v14 ; encoding: [0x06,0x40,0x58,0xcc,0x00,0x04,0x3a,0x1c] + +[0x06,0x40,0x58,0xcc,0x00,0xe5,0x39,0x1c] # 1.0 src1 +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], 1.0/*Invalid immediate*/, v14 ; encoding: [0x06,0x40,0x58,0xcc,0x00,0x01,0x38,0x1c] + +[0x06,0x40,0x58,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], 1.0/*Invalid immediate*/ ; encoding: [0x06,0x40,0x58,0xcc,0x00,0x05,0x02,0x18] + +[0x06,0x40,0x58,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], 1/*Invalid immediate*/ ; encoding: [0x06,0x40,0x58,0xcc,0x00,0x05,0x06,0x18] + + + +[0x06,0x40,0x59,0xcc,0x00,0x05,0x3a,0x1c] +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x59,0xcc,0x00,0x05,0x3a,0x1c] + +[0x06,0xc0,0x59,0xcc,0x00,0x05,0x3a,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x48,0x59,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[1,0,0] +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x59,0xcc,0x00,0x05,0x3a,0x1c] + +[0x06,0x50,0x59,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x60,0x59,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x59,0xcc,0x00,0x05,0x3a,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x59,0xcc,0x00,0x05,0x3a,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x00,0x59,0xcc,0x00,0x05,0x3a,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x48,0x59,0xcc,0x00,0x05,0x3a,0x1c] # index_key:1 +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x59,0xcc,0x00,0x05,0x3a,0x1c] + +[0x06,0x50,0x59,0xcc,0x00,0x05,0x3a,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x58,0x59,0xcc,0x00,0x05,0x3a,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x59,0xcc,0x00,0x05,0x3a,0x3c] # neg_lo:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x59,0xcc,0x00,0x05,0x3a,0x5c] # neg_lo:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x59,0xcc,0x00,0x05,0x3a,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x41,0x59,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x42,0x59,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x44,0x59,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x59,0xcc,0x00,0x04,0x3a,0x1c] # sgpr src0 +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:5], v14 ; encoding: [0x06,0x40,0x59,0xcc,0x00,0x04,0x3a,0x1c] + +[0x06,0x40,0x59,0xcc,0x00,0x05,0x38,0x1c] # sgpr src1 +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v14 ; encoding: [0x06,0x40,0x59,0xcc,0x00,0x01,0x38,0x1c] + +[0x06,0x40,0x59,0xcc,0x00,0x05,0x3a,0x18] # sgpr src2 +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], s14/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x06,0x40,0x59,0xcc,0x00,0x05,0x3a,0x18] + +[0x06,0x40,0x59,0xcc,0xf2,0x04,0x3a,0x1c] # 1.0 src0 +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], 1.0/*Invalid immediate*/, v[2:5], v14 ; encoding: [0x06,0x40,0x59,0xcc,0x00,0x04,0x3a,0x1c] + +[0x06,0x40,0x59,0xcc,0x00,0xe5,0x39,0x1c] # 1.0 src1 +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], 1.0/*Invalid immediate*/, v14 ; encoding: [0x06,0x40,0x59,0xcc,0x00,0x01,0x38,0x1c] + +[0x06,0x40,0x59,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], 1.0/*Invalid immediate*/ ; encoding: [0x06,0x40,0x59,0xcc,0x00,0x05,0x02,0x18] + +[0x06,0x40,0x59,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], 1/*Invalid immediate*/ ; encoding: [0x06,0x40,0x59,0xcc,0x00,0x05,0x06,0x18] + + + +[0x06,0x40,0x5a,0xcc,0x00,0x05,0x3a,0x1c] +# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x5a,0xcc,0x00,0x05,0x3a,0x1c] + +[0x06,0xc0,0x5a,0xcc,0x00,0x05,0x3a,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x48,0x5a,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[1,0,0] +# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x5a,0xcc,0x00,0x05,0x3a,0x1c] + +[0x06,0x50,0x5a,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x60,0x5a,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x5a,0xcc,0x00,0x05,0x3a,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x5a,0xcc,0x00,0x05,0x3a,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x00,0x5a,0xcc,0x00,0x05,0x3a,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x48,0x5a,0xcc,0x00,0x05,0x3a,0x1c] # index_key:1 +# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x5a,0xcc,0x00,0x05,0x3a,0x1c] + +[0x06,0x50,0x5a,0xcc,0x00,0x05,0x3a,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x58,0x5a,0xcc,0x00,0x05,0x3a,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x5a,0xcc,0x00,0x05,0x3a,0x3c] # neg_lo:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x5a,0xcc,0x00,0x05,0x3a,0x5c] # neg_lo:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x5a,0xcc,0x00,0x05,0x3a,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x41,0x5a,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x42,0x5a,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x44,0x5a,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x5a,0xcc,0x00,0x04,0x3a,0x1c] # sgpr src0 +# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:5], v14 ; encoding: [0x06,0x40,0x5a,0xcc,0x00,0x04,0x3a,0x1c] + +[0x06,0x40,0x5a,0xcc,0x00,0x05,0x38,0x1c] # sgpr src1 +# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v14 ; encoding: [0x06,0x40,0x5a,0xcc,0x00,0x01,0x38,0x1c] + +[0x06,0x40,0x5a,0xcc,0x00,0x05,0x3a,0x18] # sgpr src2 +# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], s14/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x06,0x40,0x5a,0xcc,0x00,0x05,0x3a,0x18] + +[0x06,0x40,0x5a,0xcc,0xf2,0x04,0x3a,0x1c] # 1.0 src0 +# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], 1.0/*Invalid immediate*/, v[2:5], v14 ; encoding: [0x06,0x40,0x5a,0xcc,0x00,0x04,0x3a,0x1c] + +[0x06,0x40,0x5a,0xcc,0x00,0xe5,0x39,0x1c] # 1.0 src1 +# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], 1.0/*Invalid immediate*/, v14 ; encoding: [0x06,0x40,0x5a,0xcc,0x00,0x01,0x38,0x1c] + +[0x06,0x40,0x5a,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], 1.0/*Invalid immediate*/ ; encoding: [0x06,0x40,0x5a,0xcc,0x00,0x05,0x02,0x18] + +[0x06,0x40,0x5a,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], 1/*Invalid immediate*/ ; encoding: [0x06,0x40,0x5a,0xcc,0x00,0x05,0x06,0x18] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_wmma_w64.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_wmma_w64.txt new file mode 100644 index 000000000000..61700faa8e60 --- /dev/null +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_wmma_w64.txt @@ -0,0 +1,1628 @@ +# RUN: not llvm-mc -disassemble -arch=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=GFX12 %s +# RUN: not llvm-mc -disassemble -arch=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s 2>&1 | FileCheck --implicit-check-not=warning: --check-prefix=GFX12-ERR %s + +[0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x1c] +# GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0xc0,0x40,0xcc,0x00,0x05,0x12,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x40,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x40,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x60,0x40,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x00,0x40,0xcc,0x00,0x05,0x12,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x40,0xcc,0x00,0x05,0x12,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x40,0xcc,0x00,0x05,0x12,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x58,0x40,0xcc,0x00,0x05,0x12,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x3c] # neg_lo:[1,0,0] +# GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x3c] + +[0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x5c] # neg_lo:[0,1,0] +# GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x5c] + +[0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x9c] # neg_lo:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x9c] + +[0x04,0x41,0x40,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[1,0,0] +# GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[1,0,0] ; encoding: [0x04,0x41,0x40,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0x42,0x40,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,1,0] +# GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,1,0] ; encoding: [0x04,0x42,0x40,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0x44,0x40,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x40,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0x40,0x40,0xcc,0x00,0x04,0x12,0x1c] # sgpr_0 src0 +# GFX12: v_wmma_f32_16x16x16_f16 v[4:7], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:3], v[4:7] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x04,0x12,0x1c] + +[0x04,0x40,0x40,0xcc,0x00,0x01,0x10,0x1c] # sgpr_0 src1 +# GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[4:7] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x01,0x10,0x1c] + +[0x04,0x40,0x40,0xcc,0x00,0x05,0x02,0x18] # sgpr_0 src2 +# GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0x02,0x18] + +[0x04,0x40,0x40,0xcc,0xf2,0x04,0x12,0x1c] # 1.0 src0 +# GFX12: v_wmma_f32_16x16x16_f16 v[4:7], 1.0/*Invalid immediate*/, v[2:3], v[4:7] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x04,0x12,0x1c] + +[0x04,0x40,0x40,0xcc,0x00,0xe5,0x11,0x1c] # 1.0 src1 +# GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], 1.0/*Invalid immediate*/, v[4:7] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x01,0x10,0x1c] + +[0x04,0x40,0x40,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0xca,0x1b] + +[0x04,0x40,0x40,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0x06,0x1a] + + + +[0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x1c] +# GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0xc0,0x41,0xcc,0x00,0x05,0x12,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x41,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x41,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x60,0x41,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x00,0x41,0xcc,0x00,0x05,0x12,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x41,0xcc,0x00,0x05,0x12,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x41,0xcc,0x00,0x05,0x12,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x58,0x41,0xcc,0x00,0x05,0x12,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x3c] # neg_lo:[1,0,0] +# GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x3c] + +[0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x5c] # neg_lo:[0,1,0] +# GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x5c] + +[0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x9c] # neg_lo:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x9c] + +[0x04,0x41,0x41,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[1,0,0] +# GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[1,0,0] ; encoding: [0x04,0x41,0x41,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0x42,0x41,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,1,0] +# GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,1,0] ; encoding: [0x04,0x42,0x41,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0x44,0x41,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x41,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0x40,0x41,0xcc,0x00,0x04,0x12,0x1c] # sgpr_0 src0 +# GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:3], v[4:7] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x04,0x12,0x1c] + +[0x04,0x40,0x41,0xcc,0x00,0x01,0x10,0x1c] # sgpr_0 src1 +# GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[4:7] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x01,0x10,0x1c] + +[0x04,0x40,0x41,0xcc,0x00,0x05,0x02,0x18] # sgpr_0 src2 +# GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0x02,0x18] + +[0x04,0x40,0x41,0xcc,0xf2,0x04,0x12,0x1c] # 1.0 src0 +# GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], 1.0/*Invalid immediate*/, v[2:3], v[4:7] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x04,0x12,0x1c] + +[0x04,0x40,0x41,0xcc,0x00,0xe5,0x11,0x1c] # 1.0 src1 +# GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], 1.0/*Invalid immediate*/, v[4:7] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x01,0x10,0x1c] + +[0x04,0x40,0x41,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0xca,0x1b] + +[0x04,0x40,0x41,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0x06,0x1a] + + + +[0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x1c] +# GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0xc0,0x42,0xcc,0x00,0x05,0x12,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x42,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x42,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x60,0x42,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x00,0x42,0xcc,0x00,0x05,0x12,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x42,0xcc,0x00,0x05,0x12,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x42,0xcc,0x00,0x05,0x12,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x58,0x42,0xcc,0x00,0x05,0x12,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x3c] # neg_lo:[1,0,0] +# GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x3c] + +[0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x5c] # neg_lo:[0,1,0] +# GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x5c] + +[0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x9c] # neg_lo:[0,0,1] +# GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x9c] + +[0x04,0x41,0x42,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[1,0,0] +# GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[1,0,0] ; encoding: [0x04,0x41,0x42,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0x42,0x42,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,1,0] +# GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,1,0] ; encoding: [0x04,0x42,0x42,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0x44,0x42,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,0,1] +# GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x42,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0x40,0x42,0xcc,0x00,0x04,0x12,0x1c] # sgpr_0 src0 +# GFX12: v_wmma_f16_16x16x16_f16 v[4:5], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:3], v[4:5] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x04,0x12,0x1c] + +[0x04,0x40,0x42,0xcc,0x00,0x01,0x10,0x1c] # sgpr_0 src1 +# GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[4:5] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x01,0x10,0x1c] + +[0x04,0x40,0x42,0xcc,0x00,0x05,0x02,0x18] # sgpr_0 src2 +# GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/ ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0x02,0x18] + +[0x04,0x40,0x42,0xcc,0xf2,0x04,0x12,0x1c] # 1.0 src0 +# GFX12: v_wmma_f16_16x16x16_f16 v[4:5], 1.0/*Invalid immediate*/, v[2:3], v[4:5] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x04,0x12,0x1c] + +[0x04,0x40,0x42,0xcc,0x00,0xe5,0x11,0x1c] # 1.0 src1 +# GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], 1.0/*Invalid immediate*/, v[4:5] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x01,0x10,0x1c] + +[0x04,0x40,0x42,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0xca,0x1b] + +[0x04,0x40,0x42,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0x06,0x1a] + + + +[0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x1c] +# GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0xc0,0x43,0xcc,0x00,0x05,0x12,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x43,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x43,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x60,0x43,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x00,0x43,0xcc,0x00,0x05,0x12,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x43,0xcc,0x00,0x05,0x12,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x43,0xcc,0x00,0x05,0x12,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x58,0x43,0xcc,0x00,0x05,0x12,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x3c] # neg_lo:[1,0,0] +# GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x3c] + +[0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x5c] # neg_lo:[0,1,0] +# GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x5c] + +[0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x9c] # neg_lo:[0,0,1] +# GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x9c] + +[0x04,0x41,0x43,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[1,0,0] +# GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[1,0,0] ; encoding: [0x04,0x41,0x43,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0x42,0x43,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,1,0] +# GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,1,0] ; encoding: [0x04,0x42,0x43,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0x44,0x43,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,0,1] +# GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x43,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0x40,0x43,0xcc,0x00,0x04,0x12,0x1c] # sgpr_0 src0 +# GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:3], v[4:5] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x04,0x12,0x1c] + +[0x04,0x40,0x43,0xcc,0x00,0x01,0x10,0x1c] # sgpr_0 src1 +# GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[4:5] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x01,0x10,0x1c] + +[0x04,0x40,0x43,0xcc,0x00,0x05,0x02,0x18] # sgpr_0 src2 +# GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/ ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0x02,0x18] + +[0x04,0x40,0x43,0xcc,0xf2,0x04,0x12,0x1c] # 1.0 src0 +# GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], 1.0/*Invalid immediate*/, v[2:3], v[4:5] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x04,0x12,0x1c] + +[0x04,0x40,0x43,0xcc,0x00,0xe5,0x11,0x1c] # 1.0 src1 +# GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], 1.0/*Invalid immediate*/, v[4:5] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x01,0x10,0x1c] + +[0x04,0x40,0x43,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0xca,0x1b] + +[0x04,0x40,0x43,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0x06,0x1a] + + + +[0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x1c] +# GFX12: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x1c] + +[0x02,0xc0,0x44,0xcc,0x00,0x03,0x0a,0x1c] # clamp +# GFX12: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] clamp ; encoding: [0x02,0xc0,0x44,0xcc,0x00,0x03,0x0a,0x1c] + +[0x02,0x48,0x44,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x50,0x44,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x60,0x44,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x00,0x44,0xcc,0x00,0x03,0x0a,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x48,0x44,0xcc,0x00,0x03,0x0a,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x50,0x44,0xcc,0x00,0x03,0x0a,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x58,0x44,0xcc,0x00,0x03,0x0a,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x3c] # neg_lo:[1,0,0] +# GFX12: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x3c] + +[0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x5c] # neg_lo:[0,1,0] +# GFX12: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x5c] + +[0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x41,0x44,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x42,0x44,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x44,0x44,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x44,0xcc,0x00,0x02,0x0a,0x1c] # sgpr_0 src0 +# GFX12: v_wmma_i32_16x16x16_iu8 v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/, v1, v[2:5] ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x02,0x0a,0x1c] + +[0x02,0x40,0x44,0xcc,0x00,0x01,0x08,0x1c] # sgpr_0 src1 +# GFX12: v_wmma_i32_16x16x16_iu8 v[2:5], v0, s0/*Invalid register, operand has 'VGPR_32' register class*/, v[2:5] ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x01,0x08,0x1c] + +[0x02,0x40,0x44,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2 +# GFX12: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, s[0:3]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0x02,0x18] + +[0x02,0x40,0x44,0xcc,0x81,0x02,0x0a,0x1c] # 1 src0 +# GFX12: v_wmma_i32_16x16x16_iu8 v[2:5], 1/*Invalid immediate*/, v1, v[2:5] ; encoding: [0x02,0x40,0x44,0xcc,0x01,0x02,0x0a,0x1c] + +[0x02,0x40,0x44,0xcc,0x00,0x03,0x09,0x1c] # 1 src1 +# GFX12: v_wmma_i32_16x16x16_iu8 v[2:5], v0, 1/*Invalid immediate*/, v[2:5] ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0x08,0x1c] + +[0x02,0x40,0x44,0xcc,0x00,0x03,0x06,0x1a] # 1 src2 +# GFX12: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0x06,0x1a] + +[0x02,0x40,0x44,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2 +# GFX12: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0xca,0x1b] + + + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x1c] +# GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x1c] + +[0x02,0xc0,0x45,0xcc,0x00,0x03,0x0a,0x1c] # clamp +# GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] clamp ; encoding: [0x02,0xc0,0x45,0xcc,0x00,0x03,0x0a,0x1c] + +[0x02,0x48,0x45,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x50,0x45,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x60,0x45,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x00,0x45,0xcc,0x00,0x03,0x0a,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x48,0x45,0xcc,0x00,0x03,0x0a,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x50,0x45,0xcc,0x00,0x03,0x0a,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x58,0x45,0xcc,0x00,0x03,0x0a,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x3c] # neg_lo:[1,0,0] +# GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x3c] + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x5c] # neg_lo:[0,1,0] +# GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x5c] + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x41,0x45,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x42,0x45,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x44,0x45,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x45,0xcc,0x00,0x02,0x0a,0x1c] # sgpr_0 src0 +# GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/, v1, v[2:5] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x02,0x0a,0x1c] + +[0x02,0x40,0x45,0xcc,0x00,0x01,0x08,0x1c] # sgpr_0 src1 +# GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], v0, s0/*Invalid register, operand has 'VGPR_32' register class*/, v[2:5] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x01,0x08,0x1c] + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2 +# GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, s[0:3]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x02,0x18] + +[0x02,0x40,0x45,0xcc,0x81,0x02,0x0a,0x1c] # 1 src0 +# GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], 1/*Invalid immediate*/, v1, v[2:5] ; encoding: [0x02,0x40,0x45,0xcc,0x01,0x02,0x0a,0x1c] + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x09,0x1c] # 1 src1 +# GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], v0, 1/*Invalid immediate*/, v[2:5] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x08,0x1c] + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x06,0x1a] # 1 src2 +# GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x06,0x1a] + +[0x02,0x40,0x45,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2 +# GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0xca,0x1b] + + + +[0x02,0x40,0x46,0xcc,0x00,0x03,0x0a,0x1c] +# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x03,0x0a,0x1c] + +[0x02,0xc0,0x46,0xcc,0x00,0x03,0x0a,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x48,0x46,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x50,0x46,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x60,0x46,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x46,0xcc,0x00,0x03,0x0a,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x46,0xcc,0x00,0x03,0x0a,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x00,0x46,0xcc,0x00,0x03,0x0a,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x48,0x46,0xcc,0x00,0x03,0x0a,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x50,0x46,0xcc,0x00,0x03,0x0a,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x58,0x46,0xcc,0x00,0x03,0x0a,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x46,0xcc,0x00,0x03,0x0a,0x3c] # neg_lo:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x46,0xcc,0x00,0x03,0x0a,0x5c] # neg_lo:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x46,0xcc,0x00,0x03,0x0a,0x9c] # neg_lo:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x03,0x0a,0x9c] + +[0x02,0x41,0x46,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x42,0x46,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x44,0x46,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] ; encoding: [0x02,0x44,0x46,0xcc,0x00,0x03,0x0a,0x1c] + +[0x02,0x40,0x46,0xcc,0x00,0x02,0x0a,0x1c] # sgpr_0 src0 +# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/, v1, v[2:5] ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x02,0x0a,0x1c] + +[0x02,0x40,0x46,0xcc,0x00,0x01,0x08,0x1c] # sgpr_0 src1 +# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, s0/*Invalid register, operand has 'VGPR_32' register class*/, v[2:5] ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x01,0x08,0x1c] + +[0x02,0x40,0x46,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2 +# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, s[0:3]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x03,0x02,0x18] + +[0x02,0x40,0x46,0xcc,0xf2,0x02,0x0a,0x1c] # 1.0 src0 +# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], 1.0/*Invalid immediate*/, v1, v[2:5] ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x02,0x0a,0x1c] + +[0x02,0x40,0x46,0xcc,0x00,0xe5,0x09,0x1c] # 1.0 src1 +# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, 1.0/*Invalid immediate*/, v[2:5] ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x01,0x08,0x1c] + +[0x02,0x40,0x46,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2 +# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x03,0xca,0x1b] + +[0x02,0x40,0x46,0xcc,0x00,0x03,0x06,0x1a] # 1 src2 +# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x03,0x06,0x1a] + + + +[0x02,0x40,0x47,0xcc,0x00,0x03,0x0a,0x1c] +# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x03,0x0a,0x1c] + +[0x02,0xc0,0x47,0xcc,0x00,0x03,0x0a,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x48,0x47,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x50,0x47,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x60,0x47,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x47,0xcc,0x00,0x03,0x0a,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x47,0xcc,0x00,0x03,0x0a,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x00,0x47,0xcc,0x00,0x03,0x0a,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x48,0x47,0xcc,0x00,0x03,0x0a,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x50,0x47,0xcc,0x00,0x03,0x0a,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x58,0x47,0xcc,0x00,0x03,0x0a,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x47,0xcc,0x00,0x03,0x0a,0x3c] # neg_lo:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x47,0xcc,0x00,0x03,0x0a,0x5c] # neg_lo:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x47,0xcc,0x00,0x03,0x0a,0x9c] # neg_lo:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x03,0x0a,0x9c] + +[0x02,0x41,0x47,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x42,0x47,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x44,0x47,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] ; encoding: [0x02,0x44,0x47,0xcc,0x00,0x03,0x0a,0x1c] + +[0x02,0x40,0x47,0xcc,0x00,0x02,0x0a,0x1c] # sgpr_0 src0 +# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/, v1, v[2:5] ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x02,0x0a,0x1c] + +[0x02,0x40,0x47,0xcc,0x00,0x01,0x08,0x1c] # sgpr_0 src1 +# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, s0/*Invalid register, operand has 'VGPR_32' register class*/, v[2:5] ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x01,0x08,0x1c] + +[0x02,0x40,0x47,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2 +# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, s[0:3]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x03,0x02,0x18] + +[0x02,0x40,0x47,0xcc,0xf2,0x02,0x0a,0x1c] # 1.0 src0 +# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], 1.0/*Invalid immediate*/, v1, v[2:5] ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x02,0x0a,0x1c] + +[0x02,0x40,0x47,0xcc,0x00,0xe5,0x09,0x1c] # 1.0 src1 +# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, 1.0/*Invalid immediate*/, v[2:5] ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x01,0x08,0x1c] + +[0x02,0x40,0x47,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2 +# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x03,0xca,0x1b] + +[0x02,0x40,0x47,0xcc,0x00,0x03,0x06,0x1a] # 1 src2 +# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x03,0x06,0x1a] + + + +[0x02,0x40,0x48,0xcc,0x00,0x03,0x0a,0x1c] +# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x03,0x0a,0x1c] + +[0x02,0xc0,0x48,0xcc,0x00,0x03,0x0a,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x48,0x48,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x50,0x48,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x60,0x48,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x48,0xcc,0x00,0x03,0x0a,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x48,0xcc,0x00,0x03,0x0a,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x00,0x48,0xcc,0x00,0x03,0x0a,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x48,0x48,0xcc,0x00,0x03,0x0a,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x50,0x48,0xcc,0x00,0x03,0x0a,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x58,0x48,0xcc,0x00,0x03,0x0a,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x48,0xcc,0x00,0x03,0x0a,0x3c] # neg_lo:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x48,0xcc,0x00,0x03,0x0a,0x5c] # neg_lo:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x48,0xcc,0x00,0x03,0x0a,0x9c] # neg_lo:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x03,0x0a,0x9c] + +[0x02,0x41,0x48,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x42,0x48,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x44,0x48,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] ; encoding: [0x02,0x44,0x48,0xcc,0x00,0x03,0x0a,0x1c] + +[0x02,0x40,0x48,0xcc,0x00,0x02,0x0a,0x1c] # sgpr_0 src0 +# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/, v1, v[2:5] ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x02,0x0a,0x1c] + +[0x02,0x40,0x48,0xcc,0x00,0x01,0x08,0x1c] # sgpr_0 src1 +# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, s0/*Invalid register, operand has 'VGPR_32' register class*/, v[2:5] ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x01,0x08,0x1c] + +[0x02,0x40,0x48,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2 +# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, s[0:3]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x03,0x02,0x18] + +[0x02,0x40,0x48,0xcc,0xf2,0x02,0x0a,0x1c] # 1.0 src0 +# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], 1.0/*Invalid immediate*/, v1, v[2:5] ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x02,0x0a,0x1c] + +[0x02,0x40,0x48,0xcc,0x00,0xe5,0x09,0x1c] # 1.0 src1 +# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, 1.0/*Invalid immediate*/, v[2:5] ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x01,0x08,0x1c] + +[0x02,0x40,0x48,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2 +# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x03,0xca,0x1b] + +[0x02,0x40,0x48,0xcc,0x00,0x03,0x06,0x1a] # 1 src2 +# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x03,0x06,0x1a] + + + +[0x02,0x40,0x49,0xcc,0x00,0x03,0x0a,0x1c] +# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x03,0x0a,0x1c] + +[0x02,0xc0,0x49,0xcc,0x00,0x03,0x0a,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x48,0x49,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x50,0x49,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x60,0x49,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x49,0xcc,0x00,0x03,0x0a,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x49,0xcc,0x00,0x03,0x0a,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x00,0x49,0xcc,0x00,0x03,0x0a,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x48,0x49,0xcc,0x00,0x03,0x0a,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x50,0x49,0xcc,0x00,0x03,0x0a,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x58,0x49,0xcc,0x00,0x03,0x0a,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x49,0xcc,0x00,0x03,0x0a,0x3c] # neg_lo:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x49,0xcc,0x00,0x03,0x0a,0x5c] # neg_lo:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x49,0xcc,0x00,0x03,0x0a,0x9c] # neg_lo:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x03,0x0a,0x9c] + +[0x02,0x41,0x49,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x42,0x49,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x44,0x49,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] ; encoding: [0x02,0x44,0x49,0xcc,0x00,0x03,0x0a,0x1c] + +[0x02,0x40,0x49,0xcc,0x00,0x02,0x0a,0x1c] # sgpr_0 src0 +# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/, v1, v[2:5] ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x02,0x0a,0x1c] + +[0x02,0x40,0x49,0xcc,0x00,0x01,0x08,0x1c] # sgpr_0 src1 +# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, s0/*Invalid register, operand has 'VGPR_32' register class*/, v[2:5] ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x01,0x08,0x1c] + +[0x02,0x40,0x49,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2 +# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, s[0:3]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x03,0x02,0x18] + +[0x02,0x40,0x49,0xcc,0xf2,0x02,0x0a,0x1c] # 1.0 src0 +# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], 1.0/*Invalid immediate*/, v1, v[2:5] ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x02,0x0a,0x1c] + +[0x02,0x40,0x49,0xcc,0x00,0xe5,0x09,0x1c] # 1.0 src1 +# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, 1.0/*Invalid immediate*/, v[2:5] ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x01,0x08,0x1c] + +[0x02,0x40,0x49,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2 +# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x03,0xca,0x1b] + +[0x02,0x40,0x49,0xcc,0x00,0x03,0x06,0x1a] # 1 src2 +# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x03,0x06,0x1a] + + + +[0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x1c] +# GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x1c] + +[0x02,0xc0,0x4a,0xcc,0x00,0x03,0x0a,0x1c] # clamp +# GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] clamp ; encoding: [0x02,0xc0,0x4a,0xcc,0x00,0x03,0x0a,0x1c] + +[0x02,0x48,0x4a,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x50,0x4a,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x60,0x4a,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x00,0x4a,0xcc,0x00,0x03,0x0a,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x48,0x4a,0xcc,0x00,0x03,0x0a,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x50,0x4a,0xcc,0x00,0x03,0x0a,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x58,0x4a,0xcc,0x00,0x03,0x0a,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x3c] # neg_lo:[1,0,0] +# GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x3c] + +[0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x5c] # neg_lo:[0,1,0] +# GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x5c] + +[0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x41,0x4a,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x42,0x4a,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x44,0x4a,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x4a,0xcc,0x00,0x02,0x0a,0x1c] # sgpr_0 src0 +# GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/, v1, v[2:5] ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x02,0x0a,0x1c] + +[0x02,0x40,0x4a,0xcc,0x00,0x01,0x08,0x1c] # sgpr_0 src1 +# GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], v0, s0/*Invalid register, operand has 'VGPR_32' register class*/, v[2:5] ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x01,0x08,0x1c] + +[0x02,0x40,0x4a,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2 +# GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, s[0:3]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0x02,0x18] + +[0x02,0x40,0x4a,0xcc,0x81,0x02,0x0a,0x1c] # 1 src0 +# GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], 1/*Invalid immediate*/, v1, v[2:5] ; encoding: [0x02,0x40,0x4a,0xcc,0x01,0x02,0x0a,0x1c] + +[0x02,0x40,0x4a,0xcc,0x00,0x03,0x09,0x1c] # 1 src1 +# GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], v0, 1/*Invalid immediate*/, v[2:5] ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0x08,0x1c] + +[0x02,0x40,0x4a,0xcc,0x00,0x03,0x06,0x1a] # 1 src2 +# GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0x06,0x1a] + +[0x02,0x40,0x4a,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2 +# GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0xca,0x1b] + + + +[0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x1c] +# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x1c] + +[0x06,0xc0,0x50,0xcc,0x00,0x05,0x2a,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x48,0x50,0xcc,0x00,0x05,0x2a,0x1c] # op_sel:[1,0,0] +# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:1 ; encoding: [0x06,0x48,0x50,0xcc,0x00,0x05,0x2a,0x1c] + +[0x06,0x50,0x50,0xcc,0x00,0x05,0x2a,0x1c] # op_sel:[0,1,0] +# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:2 ; encoding: [0x06,0x50,0x50,0xcc,0x00,0x05,0x2a,0x1c] + +[0x06,0x60,0x50,0xcc,0x00,0x05,0x2a,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x00,0x50,0xcc,0x00,0x05,0x2a,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x48,0x50,0xcc,0x00,0x05,0x2a,0x1c] # index_key:1 +# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:1 ; encoding: [0x06,0x48,0x50,0xcc,0x00,0x05,0x2a,0x1c] + +[0x06,0x50,0x50,0xcc,0x00,0x05,0x2a,0x1c] # index_key:2 +# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:2 ; encoding: [0x06,0x50,0x50,0xcc,0x00,0x05,0x2a,0x1c] + +[0x06,0x58,0x50,0xcc,0x00,0x05,0x2a,0x1c] # index_key:3 +# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:3 ; encoding: [0x06,0x58,0x50,0xcc,0x00,0x05,0x2a,0x1c] + +[0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x3c] # neg_lo:[1,0,0] +# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x3c] + +[0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x5c] # neg_lo:[0,1,0] +# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x5c] + +[0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x41,0x50,0xcc,0x00,0x05,0x2a,0x1c] # neg_hi:[1,0,0] +# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[1,0,0] ; encoding: [0x06,0x41,0x50,0xcc,0x00,0x05,0x2a,0x1c] + +[0x06,0x42,0x50,0xcc,0x00,0x05,0x2a,0x1c] # neg_hi:[0,1,0] +# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[0,1,0] ; encoding: [0x06,0x42,0x50,0xcc,0x00,0x05,0x2a,0x1c] + +[0x06,0x44,0x50,0xcc,0x00,0x05,0x2a,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x50,0xcc,0x00,0x04,0x2a,0x1c] # sgpr_0 src0 +# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:5], v10 ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x04,0x2a,0x1c] + +[0x06,0x40,0x50,0xcc,0x00,0x01,0x28,0x1c] # sgpr_0 src1 +# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v10 ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x01,0x28,0x1c] + +[0x06,0x40,0x50,0xcc,0x00,0x05,0x02,0x18] # sgpr_0 src2 +# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x05,0x02,0x18] + +[0x06,0x40,0x50,0xcc,0xf2,0x04,0x2a,0x1c] # 1.0 src0 +# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], 1.0/*Invalid immediate*/, v[2:5], v10 ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x04,0x2a,0x1c] + +[0x06,0x40,0x50,0xcc,0x00,0xe5,0x29,0x1c] # 1.0 src1 +# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], 1.0/*Invalid immediate*/, v10 ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x01,0x28,0x1c] + +[0x06,0x40,0x50,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], 1.0/*Invalid immediate*/ ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x05,0x02,0x18] + +[0x06,0x40,0x50,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], 1/*Invalid immediate*/ ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x05,0x06,0x18] + + + +[0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x1c] +# GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x1c] + +[0x06,0xc0,0x51,0xcc,0x00,0x05,0x2a,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x48,0x51,0xcc,0x00,0x05,0x2a,0x1c] # op_sel:[1,0,0] +# GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:1 ; encoding: [0x06,0x48,0x51,0xcc,0x00,0x05,0x2a,0x1c] + +[0x06,0x50,0x51,0xcc,0x00,0x05,0x2a,0x1c] # op_sel:[0,1,0] +# GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:2 ; encoding: [0x06,0x50,0x51,0xcc,0x00,0x05,0x2a,0x1c] + +[0x06,0x60,0x51,0xcc,0x00,0x05,0x2a,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x00,0x51,0xcc,0x00,0x05,0x2a,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x48,0x51,0xcc,0x00,0x05,0x2a,0x1c] # index_key:1 +# GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:1 ; encoding: [0x06,0x48,0x51,0xcc,0x00,0x05,0x2a,0x1c] + +[0x06,0x50,0x51,0xcc,0x00,0x05,0x2a,0x1c] # index_key:2 +# GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:2 ; encoding: [0x06,0x50,0x51,0xcc,0x00,0x05,0x2a,0x1c] + +[0x06,0x58,0x51,0xcc,0x00,0x05,0x2a,0x1c] # index_key:3 +# GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:3 ; encoding: [0x06,0x58,0x51,0xcc,0x00,0x05,0x2a,0x1c] + +[0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x3c] # neg_lo:[1,0,0] +# GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x3c] + +[0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x5c] # neg_lo:[0,1,0] +# GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x5c] + +[0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x41,0x51,0xcc,0x00,0x05,0x2a,0x1c] # neg_hi:[1,0,0] +# GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[1,0,0] ; encoding: [0x06,0x41,0x51,0xcc,0x00,0x05,0x2a,0x1c] + +[0x06,0x42,0x51,0xcc,0x00,0x05,0x2a,0x1c] # neg_hi:[0,1,0] +# GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[0,1,0] ; encoding: [0x06,0x42,0x51,0xcc,0x00,0x05,0x2a,0x1c] + +[0x06,0x44,0x51,0xcc,0x00,0x05,0x2a,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x51,0xcc,0x00,0x04,0x2a,0x1c] # sgpr_0 src0 +# GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:5], v10 ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x04,0x2a,0x1c] + +[0x06,0x40,0x51,0xcc,0x00,0x01,0x28,0x1c] # sgpr_0 src1 +# GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v10 ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x01,0x28,0x1c] + +[0x06,0x40,0x51,0xcc,0x00,0x05,0x02,0x18] # sgpr_0 src2 +# GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x05,0x02,0x18] + +[0x06,0x40,0x51,0xcc,0xf2,0x04,0x2a,0x1c] # 1.0 src0 +# GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], 1.0/*Invalid immediate*/, v[2:5], v10 ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x04,0x2a,0x1c] + +[0x06,0x40,0x51,0xcc,0x00,0xe5,0x29,0x1c] # 1.0 src1 +# GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], 1.0/*Invalid immediate*/, v10 ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x01,0x28,0x1c] + +[0x06,0x40,0x51,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], 1.0/*Invalid immediate*/ ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x05,0x02,0x18] + +[0x06,0x40,0x51,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], 1/*Invalid immediate*/ ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x05,0x06,0x18] + + + +[0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x1c] +# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x1c] + +[0x06,0xc0,0x52,0xcc,0x00,0x05,0x22,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x48,0x52,0xcc,0x00,0x05,0x22,0x1c] # op_sel:[1,0,0] +# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 index_key:1 ; encoding: [0x06,0x48,0x52,0xcc,0x00,0x05,0x22,0x1c] + +[0x06,0x50,0x52,0xcc,0x00,0x05,0x22,0x1c] # op_sel:[0,1,0] +# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 index_key:2 ; encoding: [0x06,0x50,0x52,0xcc,0x00,0x05,0x22,0x1c] + +[0x06,0x60,0x52,0xcc,0x00,0x05,0x22,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x00,0x52,0xcc,0x00,0x05,0x22,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x48,0x52,0xcc,0x00,0x05,0x22,0x1c] # index_key:1 +# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 index_key:1 ; encoding: [0x06,0x48,0x52,0xcc,0x00,0x05,0x22,0x1c] + +[0x06,0x50,0x52,0xcc,0x00,0x05,0x22,0x1c] # index_key:2 +# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 index_key:2 ; encoding: [0x06,0x50,0x52,0xcc,0x00,0x05,0x22,0x1c] + +[0x06,0x58,0x52,0xcc,0x00,0x05,0x22,0x1c] # index_key:3 +# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 index_key:3 ; encoding: [0x06,0x58,0x52,0xcc,0x00,0x05,0x22,0x1c] + +[0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x3c] # neg_lo:[1,0,0] +# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x3c] + +[0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x5c] # neg_lo:[0,1,0] +# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x5c] + +[0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x41,0x52,0xcc,0x00,0x05,0x22,0x1c] # neg_hi:[1,0,0] +# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[1,0,0] ; encoding: [0x06,0x41,0x52,0xcc,0x00,0x05,0x22,0x1c] + +[0x06,0x42,0x52,0xcc,0x00,0x05,0x22,0x1c] # neg_hi:[0,1,0] +# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[0,1,0] ; encoding: [0x06,0x42,0x52,0xcc,0x00,0x05,0x22,0x1c] + +[0x06,0x44,0x52,0xcc,0x00,0x05,0x22,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x52,0xcc,0x00,0x04,0x22,0x1c] # sgpr_0 src0 +# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:5], v8 ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x04,0x22,0x1c] + +[0x06,0x40,0x52,0xcc,0x00,0x01,0x20,0x1c] # sgpr_0 src1 +# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v8 ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x01,0x20,0x1c] + +[0x06,0x40,0x52,0xcc,0x00,0x05,0x02,0x18] # sgpr_0 src2 +# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x05,0x02,0x18] + +[0x06,0x40,0x52,0xcc,0xf2,0x04,0x22,0x1c] # 1.0 src0 +# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], 1.0/*Invalid immediate*/, v[2:5], v8 ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x04,0x22,0x1c] + +[0x06,0x40,0x52,0xcc,0x00,0xe5,0x21,0x1c] # 1.0 src1 +# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], 1.0/*Invalid immediate*/, v8 ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x01,0x20,0x1c] + +[0x06,0x40,0x52,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], 1.0/*Invalid immediate*/ ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x05,0x02,0x18] + +[0x06,0x40,0x52,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], 1/*Invalid immediate*/ ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x05,0x06,0x18] + + + +[0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x1c] +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x1c] + +[0x06,0xc0,0x53,0xcc,0x00,0x05,0x22,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x48,0x53,0xcc,0x00,0x05,0x22,0x1c] # op_sel:[1,0,0] +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 index_key:1 ; encoding: [0x06,0x48,0x53,0xcc,0x00,0x05,0x22,0x1c] + +[0x06,0x50,0x53,0xcc,0x00,0x05,0x22,0x1c] # op_sel:[0,1,0] +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 index_key:2 ; encoding: [0x06,0x50,0x53,0xcc,0x00,0x05,0x22,0x1c] + +[0x06,0x60,0x53,0xcc,0x00,0x05,0x22,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x00,0x53,0xcc,0x00,0x05,0x22,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x48,0x53,0xcc,0x00,0x05,0x22,0x1c] # index_key:1 +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 index_key:1 ; encoding: [0x06,0x48,0x53,0xcc,0x00,0x05,0x22,0x1c] + +[0x06,0x50,0x53,0xcc,0x00,0x05,0x22,0x1c] # index_key:2 +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 index_key:2 ; encoding: [0x06,0x50,0x53,0xcc,0x00,0x05,0x22,0x1c] + +[0x06,0x58,0x53,0xcc,0x00,0x05,0x22,0x1c] # index_key:3 +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 index_key:3 ; encoding: [0x06,0x58,0x53,0xcc,0x00,0x05,0x22,0x1c] + +[0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x3c] # neg_lo:[1,0,0] +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x3c] + +[0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x5c] # neg_lo:[0,1,0] +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x5c] + +[0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x41,0x53,0xcc,0x00,0x05,0x22,0x1c] # neg_hi:[1,0,0] +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[1,0,0] ; encoding: [0x06,0x41,0x53,0xcc,0x00,0x05,0x22,0x1c] + +[0x06,0x42,0x53,0xcc,0x00,0x05,0x22,0x1c] # neg_hi:[0,1,0] +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[0,1,0] ; encoding: [0x06,0x42,0x53,0xcc,0x00,0x05,0x22,0x1c] + +[0x06,0x44,0x53,0xcc,0x00,0x05,0x22,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x53,0xcc,0x00,0x04,0x22,0x1c] # sgpr_0 src0 +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:5], v8 ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x04,0x22,0x1c] + +[0x06,0x40,0x53,0xcc,0x00,0x01,0x20,0x1c] # sgpr_0 src1 +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v8 ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x01,0x20,0x1c] + +[0x06,0x40,0x53,0xcc,0x00,0x05,0x02,0x18] # sgpr_0 src2 +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x05,0x02,0x18] + +[0x06,0x40,0x53,0xcc,0xf2,0x04,0x22,0x1c] # 1.0 src0 +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], 1.0/*Invalid immediate*/, v[2:5], v8 ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x04,0x22,0x1c] + +[0x06,0x40,0x53,0xcc,0x00,0xe5,0x21,0x1c] # 1.0 src1 +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], 1.0/*Invalid immediate*/, v8 ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x01,0x20,0x1c] + +[0x06,0x40,0x53,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], 1.0/*Invalid immediate*/ ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x05,0x02,0x18] + +[0x06,0x40,0x53,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], 1/*Invalid immediate*/ ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x05,0x06,0x18] + + + +[0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x1c] +# GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0xc0,0x54,0xcc,0x00,0x03,0x1e,0x1c] # clamp +# GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 clamp ; encoding: [0x03,0xc0,0x54,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x48,0x54,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[1,0,0] +# GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x54,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x50,0x54,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,1,0] +# GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x54,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x60,0x54,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x00,0x54,0xcc,0x00,0x03,0x1e,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x48,0x54,0xcc,0x00,0x03,0x1e,0x1c] # index_key:1 +# GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x54,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x50,0x54,0xcc,0x00,0x03,0x1e,0x1c] # index_key:2 +# GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x54,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x58,0x54,0xcc,0x00,0x03,0x1e,0x1c] # index_key:3 +# GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:3 ; encoding: [0x03,0x58,0x54,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x3c] # neg_lo:[1,0,0] +# GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0] ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x3c] + +[0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x5c] # neg_lo:[0,1,0] +# GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0] ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x5c] + +[0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x41,0x54,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x42,0x54,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x44,0x54,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x54,0xcc,0x00,0x02,0x1e,0x1c] # sgpr_0 src0 +# GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], s0/*Invalid register, operand has 'VGPR_32' register class*/, v[1:2], v7 ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x02,0x1e,0x1c] + +[0x03,0x40,0x54,0xcc,0x00,0x01,0x1c,0x1c] # sgpr_0 src1 +# GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v7 ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x01,0x1c,0x1c] + +[0x03,0x40,0x54,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2 +# GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], s0/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x03,0x02,0x18] + +[0x03,0x40,0x54,0xcc,0x81,0x02,0x1e,0x1c] # 1 src0 +# GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], 1/*Invalid immediate*/, v[1:2], v7 ; encoding: [0x03,0x40,0x54,0xcc,0x01,0x02,0x1e,0x1c] + +[0x03,0x40,0x54,0xcc,0x00,0x03,0x1d,0x1c] # 1 src1 +# GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, 1/*Invalid immediate*/, v7 ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x03,0x1c,0x1c] + +[0x03,0x40,0x54,0xcc,0x00,0x03,0x06,0x1a] # 1 src2 +# GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], 1/*Invalid immediate*/ ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x03,0x06,0x18] + +[0x03,0x40,0x54,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2 +# GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], 1.0/*Invalid immediate*/ ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x03,0x02,0x18] + + + +[0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x1c] +# GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x1c] + +[0x02,0xc0,0x55,0xcc,0x00,0x03,0x1a,0x1c] # clamp +# GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 clamp ; encoding: [0x02,0xc0,0x55,0xcc,0x00,0x03,0x1a,0x1c] + +[0x02,0x48,0x55,0xcc,0x00,0x03,0x1a,0x1c] # op_sel:[1,0,0] +# GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 index_key:1 ; encoding: [0x02,0x48,0x55,0xcc,0x00,0x03,0x1a,0x1c] + +[0x02,0x50,0x55,0xcc,0x00,0x03,0x1a,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x60,0x55,0xcc,0x00,0x03,0x1a,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x00,0x55,0xcc,0x00,0x03,0x1a,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x48,0x55,0xcc,0x00,0x03,0x1a,0x1c] # index_key:1 +# GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 index_key:1 ; encoding: [0x02,0x48,0x55,0xcc,0x00,0x03,0x1a,0x1c] + +[0x02,0x50,0x55,0xcc,0x00,0x03,0x1a,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x58,0x55,0xcc,0x00,0x03,0x1a,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x3c] # neg_lo:[1,0,0] +# GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[1,0,0] ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x3c] + +[0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x5c] # neg_lo:[0,1,0] +# GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[0,1,0] ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x5c] + +[0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x41,0x55,0xcc,0x00,0x03,0x1a,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x42,0x55,0xcc,0x00,0x03,0x1a,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x44,0x55,0xcc,0x00,0x03,0x1a,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x55,0xcc,0x00,0x02,0x1a,0x1c] # sgpr_0 src0 +# GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/, v1, v6 ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x02,0x1a,0x1c] + +[0x02,0x40,0x55,0xcc,0x00,0x01,0x18,0x1c] # sgpr_0 src1 +# GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, s0/*Invalid register, operand has 'VGPR_32' register class*/, v6 ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x01,0x18,0x1c] + +[0x02,0x40,0x55,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2 +# GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, s0/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x03,0x02,0x18] + +[0x02,0x40,0x55,0xcc,0x81,0x02,0x1a,0x1c] # 1 src0 +# GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], 1/*Invalid immediate*/, v1, v6 ; encoding: [0x02,0x40,0x55,0xcc,0x01,0x02,0x1a,0x1c] + +[0x02,0x40,0x55,0xcc,0x00,0x03,0x19,0x1c] # 1 src1 +# GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, 1/*Invalid immediate*/, v6 ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x03,0x18,0x1c] + +[0x02,0x40,0x55,0xcc,0x00,0x03,0x06,0x1a] # 1 src2 +# GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, 1/*Invalid immediate*/ ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x03,0x06,0x18] + +[0x02,0x40,0x55,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2 +# GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, 1.0/*Invalid immediate*/ ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x03,0x02,0x18] + + + +[0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x1c] +# GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0xc0,0x56,0xcc,0x00,0x03,0x1e,0x1c] # clamp +# GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 clamp ; encoding: [0x03,0xc0,0x56,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x48,0x56,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[1,0,0] +# GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x56,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x50,0x56,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x60,0x56,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x00,0x56,0xcc,0x00,0x03,0x1e,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x48,0x56,0xcc,0x00,0x03,0x1e,0x1c] # index_key:1 +# GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x56,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x50,0x56,0xcc,0x00,0x03,0x1e,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x58,0x56,0xcc,0x00,0x03,0x1e,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x3c] # neg_lo:[1,0,0] +# GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0] ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x3c] + +[0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x5c] # neg_lo:[0,1,0] +# GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0] ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x5c] + +[0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x41,0x56,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x42,0x56,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x44,0x56,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x56,0xcc,0x00,0x02,0x1e,0x1c] # sgpr_0 src0 +# GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], s0/*Invalid register, operand has 'VGPR_32' register class*/, v[1:2], v7 ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x02,0x1e,0x1c] + +[0x03,0x40,0x56,0xcc,0x00,0x01,0x1c,0x1c] # sgpr_0 src1 +# GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v7 ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x01,0x1c,0x1c] + +[0x03,0x40,0x56,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2 +# GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], s0/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x03,0x02,0x18] + +[0x03,0x40,0x56,0xcc,0x81,0x02,0x1e,0x1c] # 1 src0 +# GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], 1/*Invalid immediate*/, v[1:2], v7 ; encoding: [0x03,0x40,0x56,0xcc,0x01,0x02,0x1e,0x1c] + +[0x03,0x40,0x56,0xcc,0x00,0x03,0x1d,0x1c] # 1 src1 +# GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, 1/*Invalid immediate*/, v7 ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x03,0x1c,0x1c] + +[0x03,0x40,0x56,0xcc,0x00,0x03,0x06,0x1a] # 1 src2 +# GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], 1/*Invalid immediate*/ ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x03,0x06,0x18] + +[0x03,0x40,0x56,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2 +# GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], 1.0/*Invalid immediate*/ ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x03,0x02,0x18] + + + +[0x03,0x40,0x57,0xcc,0x00,0x03,0x1e,0x1c] +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x57,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0xc0,0x57,0xcc,0x00,0x03,0x1e,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x48,0x57,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[1,0,0] +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x57,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x50,0x57,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,1,0] +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x57,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x60,0x57,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x57,0xcc,0x00,0x03,0x1e,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x57,0xcc,0x00,0x03,0x1e,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x00,0x57,0xcc,0x00,0x03,0x1e,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x48,0x57,0xcc,0x00,0x03,0x1e,0x1c] # index_key:1 +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x57,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x50,0x57,0xcc,0x00,0x03,0x1e,0x1c] # index_key:2 +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x57,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x58,0x57,0xcc,0x00,0x03,0x1e,0x1c] # index_key:3 +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:3 ; encoding: [0x03,0x58,0x57,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x40,0x57,0xcc,0x00,0x03,0x1e,0x3c] # neg_lo:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x57,0xcc,0x00,0x03,0x1e,0x5c] # neg_lo:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x57,0xcc,0x00,0x03,0x1e,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x41,0x57,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x42,0x57,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x44,0x57,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x57,0xcc,0x00,0x02,0x1e,0x1c] # sgpr_0 src0 +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], s0/*Invalid register, operand has 'VGPR_32' register class*/, v[1:2], v7 ; encoding: [0x03,0x40,0x57,0xcc,0x00,0x02,0x1e,0x1c] + +[0x03,0x40,0x57,0xcc,0x00,0x01,0x1c,0x1c] # sgpr_0 src1 +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v7 ; encoding: [0x03,0x40,0x57,0xcc,0x00,0x01,0x1c,0x1c] + +[0x03,0x40,0x57,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2 +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], s0/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x03,0x40,0x57,0xcc,0x00,0x03,0x02,0x18] + +[0x03,0x40,0x57,0xcc,0xf2,0x02,0x1e,0x1c] # 1.0 src0 +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], 1.0/*Invalid immediate*/, v[1:2], v7 ; encoding: [0x03,0x40,0x57,0xcc,0x00,0x02,0x1e,0x1c] + +[0x03,0x40,0x57,0xcc,0x00,0xe5,0x1d,0x1c] # 1.0 src1 +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, 1.0/*Invalid immediate*/, v7 ; encoding: [0x03,0x40,0x57,0xcc,0x00,0x01,0x1c,0x1c] + +[0x03,0x40,0x57,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2 +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], 1.0/*Invalid immediate*/ ; encoding: [0x03,0x40,0x57,0xcc,0x00,0x03,0x02,0x18] + +[0x03,0x40,0x57,0xcc,0x00,0x03,0x06,0x1a] # 1 src2 +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], 1/*Invalid immediate*/ ; encoding: [0x03,0x40,0x57,0xcc,0x00,0x03,0x06,0x18] + + + +[0x03,0x40,0x58,0xcc,0x00,0x03,0x1e,0x1c] +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x58,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0xc0,0x58,0xcc,0x00,0x03,0x1e,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x48,0x58,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[1,0,0] +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x58,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x50,0x58,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,1,0] +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x58,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x60,0x58,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x58,0xcc,0x00,0x03,0x1e,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x58,0xcc,0x00,0x03,0x1e,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x00,0x58,0xcc,0x00,0x03,0x1e,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x48,0x58,0xcc,0x00,0x03,0x1e,0x1c] # index_key:1 +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x58,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x50,0x58,0xcc,0x00,0x03,0x1e,0x1c] # index_key:2 +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x58,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x58,0x58,0xcc,0x00,0x03,0x1e,0x1c] # index_key:3 +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:3 ; encoding: [0x03,0x58,0x58,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x40,0x58,0xcc,0x00,0x03,0x1e,0x3c] # neg_lo:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x58,0xcc,0x00,0x03,0x1e,0x5c] # neg_lo:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x58,0xcc,0x00,0x03,0x1e,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x41,0x58,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x42,0x58,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x44,0x58,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x58,0xcc,0x00,0x02,0x1e,0x1c] # sgpr_0 src0 +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], s0/*Invalid register, operand has 'VGPR_32' register class*/, v[1:2], v7 ; encoding: [0x03,0x40,0x58,0xcc,0x00,0x02,0x1e,0x1c] + +[0x03,0x40,0x58,0xcc,0x00,0x01,0x1c,0x1c] # sgpr_0 src1 +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v7 ; encoding: [0x03,0x40,0x58,0xcc,0x00,0x01,0x1c,0x1c] + +[0x03,0x40,0x58,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2 +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], s0/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x03,0x40,0x58,0xcc,0x00,0x03,0x02,0x18] + +[0x03,0x40,0x58,0xcc,0xf2,0x02,0x1e,0x1c] # 1.0 src0 +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], 1.0/*Invalid immediate*/, v[1:2], v7 ; encoding: [0x03,0x40,0x58,0xcc,0x00,0x02,0x1e,0x1c] + +[0x03,0x40,0x58,0xcc,0x00,0xe5,0x1d,0x1c] # 1.0 src1 +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, 1.0/*Invalid immediate*/, v7 ; encoding: [0x03,0x40,0x58,0xcc,0x00,0x01,0x1c,0x1c] + +[0x03,0x40,0x58,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2 +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], 1.0/*Invalid immediate*/ ; encoding: [0x03,0x40,0x58,0xcc,0x00,0x03,0x02,0x18] + +[0x03,0x40,0x58,0xcc,0x00,0x03,0x06,0x1a] # 1 src2 +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], 1/*Invalid immediate*/ ; encoding: [0x03,0x40,0x58,0xcc,0x00,0x03,0x06,0x18] + + + +[0x03,0x40,0x59,0xcc,0x00,0x03,0x1e,0x1c] +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x59,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0xc0,0x59,0xcc,0x00,0x03,0x1e,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x48,0x59,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[1,0,0] +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x59,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x50,0x59,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,1,0] +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x59,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x60,0x59,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x59,0xcc,0x00,0x03,0x1e,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x59,0xcc,0x00,0x03,0x1e,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x00,0x59,0xcc,0x00,0x03,0x1e,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x48,0x59,0xcc,0x00,0x03,0x1e,0x1c] # index_key:1 +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x59,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x50,0x59,0xcc,0x00,0x03,0x1e,0x1c] # index_key:2 +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x59,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x58,0x59,0xcc,0x00,0x03,0x1e,0x1c] # index_key:3 +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:3 ; encoding: [0x03,0x58,0x59,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x40,0x59,0xcc,0x00,0x03,0x1e,0x3c] # neg_lo:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x59,0xcc,0x00,0x03,0x1e,0x5c] # neg_lo:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x59,0xcc,0x00,0x03,0x1e,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x41,0x59,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x42,0x59,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x44,0x59,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x59,0xcc,0x00,0x02,0x1e,0x1c] # sgpr_0 src0 +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], s0/*Invalid register, operand has 'VGPR_32' register class*/, v[1:2], v7 ; encoding: [0x03,0x40,0x59,0xcc,0x00,0x02,0x1e,0x1c] + +[0x03,0x40,0x59,0xcc,0x00,0x01,0x1c,0x1c] # sgpr_0 src1 +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v7 ; encoding: [0x03,0x40,0x59,0xcc,0x00,0x01,0x1c,0x1c] + +[0x03,0x40,0x59,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2 +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], s0/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x03,0x40,0x59,0xcc,0x00,0x03,0x02,0x18] + +[0x03,0x40,0x59,0xcc,0xf2,0x02,0x1e,0x1c] # 1.0 src0 +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], 1.0/*Invalid immediate*/, v[1:2], v7 ; encoding: [0x03,0x40,0x59,0xcc,0x00,0x02,0x1e,0x1c] + +[0x03,0x40,0x59,0xcc,0x00,0xe5,0x1d,0x1c] # 1.0 src1 +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, 1.0/*Invalid immediate*/, v7 ; encoding: [0x03,0x40,0x59,0xcc,0x00,0x01,0x1c,0x1c] + +[0x03,0x40,0x59,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2 +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], 1.0/*Invalid immediate*/ ; encoding: [0x03,0x40,0x59,0xcc,0x00,0x03,0x02,0x18] + +[0x03,0x40,0x59,0xcc,0x00,0x03,0x06,0x1a] # 1 src2 +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], 1/*Invalid immediate*/ ; encoding: [0x03,0x40,0x59,0xcc,0x00,0x03,0x06,0x18] + + + +[0x03,0x40,0x5a,0xcc,0x00,0x03,0x1e,0x1c] +# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x5a,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0xc0,0x5a,0xcc,0x00,0x03,0x1e,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x48,0x5a,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[1,0,0] +# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x5a,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x50,0x5a,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,1,0] +# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x5a,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x60,0x5a,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x5a,0xcc,0x00,0x03,0x1e,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x5a,0xcc,0x00,0x03,0x1e,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x00,0x5a,0xcc,0x00,0x03,0x1e,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x48,0x5a,0xcc,0x00,0x03,0x1e,0x1c] # index_key:1 +# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x5a,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x50,0x5a,0xcc,0x00,0x03,0x1e,0x1c] # index_key:2 +# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x5a,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x58,0x5a,0xcc,0x00,0x03,0x1e,0x1c] # index_key:3 +# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:3 ; encoding: [0x03,0x58,0x5a,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x40,0x5a,0xcc,0x00,0x03,0x1e,0x3c] # neg_lo:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x5a,0xcc,0x00,0x03,0x1e,0x5c] # neg_lo:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x5a,0xcc,0x00,0x03,0x1e,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x41,0x5a,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x42,0x5a,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x44,0x5a,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x5a,0xcc,0x00,0x02,0x1e,0x1c] # sgpr_0 src0 +# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], s0/*Invalid register, operand has 'VGPR_32' register class*/, v[1:2], v7 ; encoding: [0x03,0x40,0x5a,0xcc,0x00,0x02,0x1e,0x1c] + +[0x03,0x40,0x5a,0xcc,0x00,0x01,0x1c,0x1c] # sgpr_0 src1 +# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v7 ; encoding: [0x03,0x40,0x5a,0xcc,0x00,0x01,0x1c,0x1c] + +[0x03,0x40,0x5a,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2 +# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], s0/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x03,0x40,0x5a,0xcc,0x00,0x03,0x02,0x18] + +[0x03,0x40,0x5a,0xcc,0xf2,0x02,0x1e,0x1c] # 1.0 src0 +# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], 1.0/*Invalid immediate*/, v[1:2], v7 ; encoding: [0x03,0x40,0x5a,0xcc,0x00,0x02,0x1e,0x1c] + +[0x03,0x40,0x5a,0xcc,0x00,0xe5,0x1d,0x1c] # 1.0 src1 +# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, 1.0/*Invalid immediate*/, v7 ; encoding: [0x03,0x40,0x5a,0xcc,0x00,0x01,0x1c,0x1c] + +[0x03,0x40,0x5a,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2 +# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], 1.0/*Invalid immediate*/ ; encoding: [0x03,0x40,0x5a,0xcc,0x00,0x03,0x02,0x18] + +[0x03,0x40,0x5a,0xcc,0x00,0x03,0x06,0x1a] # 1 src2 +# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], 1/*Invalid immediate*/ ; encoding: [0x03,0x40,0x5a,0xcc,0x00,0x03,0x06,0x18] diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td index 516a984399ff..638e46a2f9c7 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td @@ -253,22 +253,23 @@ def ROCDL_mfma_f32_32x32x16_fp8_fp8 : ROCDL_Mfma_IntrOp<"mfma.f32.32x32x16.fp8.f //===---------------------------------------------------------------------===// // WMMA intrinsics -class ROCDL_Wmma_IntrOp traits = []> : +class ROCDL_Wmma_IntrOp overloadedOperands, + list traits = []> : LLVM_IntrOpBase, + [0], overloadedOperands, traits, 1>, Arguments<(ins Variadic:$args)> { let assemblyFormat = "$args attr-dict `:` functional-type($args, $res)"; } // Available on RDNA3 -def ROCDL_wmma_f32_16x16x16_f16 : ROCDL_Wmma_IntrOp<"wmma.f32.16x16x16.f16">; -def ROCDL_wmma_f32_16x16x16_bf16 : ROCDL_Wmma_IntrOp<"wmma.f32.16x16x16.bf16">; -def ROCDL_wmma_f16_16x16x16_f16 : ROCDL_Wmma_IntrOp<"wmma.f16.16x16x16.f16">; -def ROCDL_wmma_bf16_16x16x16_bf16 : ROCDL_Wmma_IntrOp<"wmma.bf16.16x16x16.bf16">; -def ROCDL_wmma_i32_16x16x16_iu8 : ROCDL_Wmma_IntrOp<"wmma.i32.16x16x16.iu8">; -def ROCDL_wmma_i32_16x16x16_iu4 : ROCDL_Wmma_IntrOp<"wmma.i32.16x16x16.iu4">; +def ROCDL_wmma_f32_16x16x16_f16 : ROCDL_Wmma_IntrOp<"wmma.f32.16x16x16.f16", [0]>; +def ROCDL_wmma_f32_16x16x16_bf16 : ROCDL_Wmma_IntrOp<"wmma.f32.16x16x16.bf16", [0]>; +def ROCDL_wmma_f16_16x16x16_f16 : ROCDL_Wmma_IntrOp<"wmma.f16.16x16x16.f16", [0]>; +def ROCDL_wmma_bf16_16x16x16_bf16 : ROCDL_Wmma_IntrOp<"wmma.bf16.16x16x16.bf16", [0]>; +def ROCDL_wmma_i32_16x16x16_iu8 : ROCDL_Wmma_IntrOp<"wmma.i32.16x16x16.iu8", [1]>; +def ROCDL_wmma_i32_16x16x16_iu4 : ROCDL_Wmma_IntrOp<"wmma.i32.16x16x16.iu4", [1]>; //===---------------------------------------------------------------------===// // Operations on raw buffer resources (stride of 0, bounds checks either off or in diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir index 3c9c70711ae2..26123300d748 100644 --- a/mlir/test/Target/LLVMIR/rocdl.mlir +++ b/mlir/test/Target/LLVMIR/rocdl.mlir @@ -248,53 +248,53 @@ llvm.func @rocdl.wmma(%arg0 : vector<8xf32>, %arg1 : vector<16 x f16>, %arg2 : v // ---- Wave32 ----- // f16 -> f32 - // CHECK: call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <8 x float> %{{.*}}) + // CHECK: call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v16f16(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <8 x float> %{{.*}}) %r0 = rocdl.wmma.f32.16x16x16.f16 %arg1, %arg1, %arg0 : (vector<16xf16>, vector<16xf16>, vector<8xf32>) -> vector<8xf32> // bf16 -> f32 - // CHECK: call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <8 x float> %{{.*}}) + // CHECK: call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v16i16(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <8 x float> %{{.*}}) %r1 = rocdl.wmma.f32.16x16x16.bf16 %arg2, %arg2, %arg0 : (vector<16xi16>, vector<16xi16>, vector<8xf32>) -> vector<8xf32> // f16 -> f16 (OPSEL = {0,1}) - // CHECK: call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}}, i1 {{.*}}) + // CHECK: call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}}, i1 {{.*}}) %r2 = rocdl.wmma.f16.16x16x16.f16 %arg1, %arg1, %arg1, %zero : (vector<16xf16>, vector<16xf16>, vector<16xf16>, i1) -> vector<16xf16> // bf16 -> bf16 (OPSEL = {0,1}) - // CHECK: call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v16i16(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}, i1 {{.*}}) + // CHECK: call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v16i16.v16i16(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}, i1 {{.*}}) %r4 = rocdl.wmma.bf16.16x16x16.bf16 %arg2, %arg2, %arg2, %zero : (vector<16xi16>, vector<16xi16>, vector<16xi16>, i1) -> vector<16xi16> // int8 -> int32 (signA = {0,1}, signB = {0,1}, clamp = {0,1}) - // CHECK: call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32(i1 {{.*}}, <4 x i32> %{{.*}}, i1 {{.*}}, <4 x i32> %{{.*}}, <8 x i32> %{{.*}}, i1 {{.*}}) + // CHECK: call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v4i32(i1 {{.*}}, <4 x i32> %{{.*}}, i1 {{.*}}, <4 x i32> %{{.*}}, <8 x i32> %{{.*}}, i1 {{.*}}) %r5 = rocdl.wmma.i32.16x16x16.iu8 %zero, %arg5, %zero, %arg5, %arg3, %zero : (i1, vector<4xi32>, i1, vector<4xi32>, vector<8xi32>, i1) -> vector<8xi32> // int4 -> int32 (signA = {0,1}, signB = {0,1}, clamp = {0,1}) - // CHECK: call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32(i1 {{.*}}, <2 x i32> %{{.*}}, i1 {{.*}}, <2 x i32> %{{.*}}, <8 x i32> %{{.*}}, i1 {{.*}}) + // CHECK: call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.v2i32(i1 {{.*}}, <2 x i32> %{{.*}}, i1 {{.*}}, <2 x i32> %{{.*}}, <8 x i32> %{{.*}}, i1 {{.*}}) %r6 = rocdl.wmma.i32.16x16x16.iu4 %zero, %arg4, %zero, %arg4, %arg3, %zero : (i1, vector<2xi32>, i1, vector<2xi32>, vector<8xi32>, i1) -> vector<8xi32> // ---- Wave64 ----- // f16 -> f32 - // CHECK: call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <4 x float> %{{.*}}) + // CHECK: call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v16f16(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <4 x float> %{{.*}}) %r7 = rocdl.wmma.f32.16x16x16.f16 %arg1, %arg1, %arg6 : (vector<16xf16>, vector<16xf16>, vector<4xf32>) -> vector<4xf32> // bf16 -> f32 - // CHECK: call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <4 x float> %{{.*}}) + // CHECK: call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v16i16(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <4 x float> %{{.*}}) %r8 = rocdl.wmma.f32.16x16x16.bf16 %arg2, %arg2, %arg6 : (vector<16xi16>, vector<16xi16>, vector<4xf32>) -> vector<4xf32> // f16 -> f16 (OPSEL = {0,1}) - // CHECK: call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <8 x half> %{{.*}}, i1 {{.*}}) + // CHECK: call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v16f16(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <8 x half> %{{.*}}, i1 {{.*}}) %r9 = rocdl.wmma.f16.16x16x16.f16 %arg1, %arg1, %arg7, %zero : (vector<16xf16>, vector<16xf16>, vector<8xf16>, i1) -> vector<8xf16> // bf16 -> bf16 (OPSEL = {0,1}) - // CHECK: call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <8 x i16> %{{.*}}, i1 {{.*}}) + // CHECK: call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v16i16(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <8 x i16> %{{.*}}, i1 {{.*}}) %r11 = rocdl.wmma.bf16.16x16x16.bf16 %arg2, %arg2, %arg8, %zero : (vector<16xi16>, vector<16xi16>, vector<8xi16>, i1) -> vector<8xi16> // int8 -> int32 (signA = {0,1}, signB = {0,1}, clamp = {0,1}) - // CHECK: call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32(i1 {{.*}}, <4 x i32> %{{.*}}, i1 {{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 {{.*}}) + // CHECK: call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 {{.*}}, <4 x i32> %{{.*}}, i1 {{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 {{.*}}) %r12 = rocdl.wmma.i32.16x16x16.iu8 %zero, %arg5, %zero, %arg5, %arg5, %zero : (i1, vector<4xi32>, i1, vector<4xi32>, vector<4xi32>, i1) -> vector<4xi32> // int4 -> int32 (signA = {0,1}, signB = {0,1}, clamp = {0,1}) - // CHECK: call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32(i1 {{.*}}, <2 x i32> %{{.*}}, i1 {{.*}}, <2 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 {{.*}}) + // CHECK: call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 {{.*}}, <2 x i32> %{{.*}}, i1 {{.*}}, <2 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 {{.*}}) %r13 = rocdl.wmma.i32.16x16x16.iu4 %zero, %arg4, %zero, %arg4, %arg5, %zero : (i1, vector<2xi32>, i1, vector<2xi32>, vector<4xi32>, i1) -> vector<4xi32> llvm.return %r0 : vector<8xf32>