mirror of
https://github.com/RPCS3/llvm.git
synced 2024-12-22 03:58:16 +00:00
Improved the operands commute transformation for X86-FMA3 instructions.
All 3 operands of FMA3 instructions are commutable now. Patch by Slava Klochkov Reviewers: Quentin Colombet(qcolombet), Ahmed Bougacha(ab). Differential Revision: http://reviews.llvm.org/D13269 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@252335 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent
11228e360e
commit
91950eea55
@ -15,13 +15,31 @@
|
||||
// FMA3 - Intel 3 operand Fused Multiply-Add instructions
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
let Constraints = "$src1 = $dst" in {
|
||||
// For all FMA opcodes declared in fma3p_rm and fma3s_rm multiclasses defined
|
||||
// below, both the register and memory variants are commutable.
|
||||
// For the register form the commutable operands are 1, 2 and 3.
|
||||
// For the memory variant the folded operand must be in 3. Thus,
|
||||
// in that case, only the operands 1 and 2 can be swapped.
|
||||
// Commuting some of the operands may require an opcode change.
|
||||
// FMA*213*:
|
||||
// operands 1 and 2 (memory & register forms): *213* --> *213*(no changes);
|
||||
// operands 1 and 3 (register forms only): *213* --> *231*;
|
||||
// operands 2 and 3 (register forms only): *213* --> *132*.
|
||||
// FMA*132*:
|
||||
// operands 1 and 2 (memory & register forms): *132* --> *231*;
|
||||
// operands 1 and 3 (register forms only): *132* --> *132*(no changes);
|
||||
// operands 2 and 3 (register forms only): *132* --> *213*.
|
||||
// FMA*231*:
|
||||
// operands 1 and 2 (memory & register forms): *231* --> *132*;
|
||||
// operands 1 and 3 (register forms only): *231* --> *213*;
|
||||
// operands 2 and 3 (register forms only): *231* --> *231*(no changes).
|
||||
|
||||
let Constraints = "$src1 = $dst", hasSideEffects = 0, isCommutable = 1 in
|
||||
multiclass fma3p_rm<bits<8> opc, string OpcodeStr,
|
||||
PatFrag MemFrag128, PatFrag MemFrag256,
|
||||
ValueType OpVT128, ValueType OpVT256,
|
||||
bit IsRVariantCommutable = 0, bit IsMVariantCommutable = 0,
|
||||
SDPatternOperator Op = null_frag> {
|
||||
let usesCustomInserter = 1, isCommutable = IsRVariantCommutable in
|
||||
let usesCustomInserter = 1 in
|
||||
def r : FMA3<opc, MRMSrcReg, (outs VR128:$dst),
|
||||
(ins VR128:$src1, VR128:$src2, VR128:$src3),
|
||||
!strconcat(OpcodeStr,
|
||||
@ -29,7 +47,7 @@ multiclass fma3p_rm<bits<8> opc, string OpcodeStr,
|
||||
[(set VR128:$dst, (OpVT128 (Op VR128:$src2,
|
||||
VR128:$src1, VR128:$src3)))]>;
|
||||
|
||||
let mayLoad = 1, isCommutable = IsMVariantCommutable in
|
||||
let mayLoad = 1 in
|
||||
def m : FMA3<opc, MRMSrcMem, (outs VR128:$dst),
|
||||
(ins VR128:$src1, VR128:$src2, f128mem:$src3),
|
||||
!strconcat(OpcodeStr,
|
||||
@ -37,7 +55,7 @@ multiclass fma3p_rm<bits<8> opc, string OpcodeStr,
|
||||
[(set VR128:$dst, (OpVT128 (Op VR128:$src2, VR128:$src1,
|
||||
(MemFrag128 addr:$src3))))]>;
|
||||
|
||||
let usesCustomInserter = 1, isCommutable = IsRVariantCommutable in
|
||||
let usesCustomInserter = 1 in
|
||||
def rY : FMA3<opc, MRMSrcReg, (outs VR256:$dst),
|
||||
(ins VR256:$src1, VR256:$src2, VR256:$src3),
|
||||
!strconcat(OpcodeStr,
|
||||
@ -45,7 +63,7 @@ multiclass fma3p_rm<bits<8> opc, string OpcodeStr,
|
||||
[(set VR256:$dst, (OpVT256 (Op VR256:$src2, VR256:$src1,
|
||||
VR256:$src3)))]>, VEX_L;
|
||||
|
||||
let mayLoad = 1, isCommutable = IsMVariantCommutable in
|
||||
let mayLoad = 1 in
|
||||
def mY : FMA3<opc, MRMSrcMem, (outs VR256:$dst),
|
||||
(ins VR256:$src1, VR256:$src2, f256mem:$src3),
|
||||
!strconcat(OpcodeStr,
|
||||
@ -54,34 +72,20 @@ multiclass fma3p_rm<bits<8> opc, string OpcodeStr,
|
||||
(OpVT256 (Op VR256:$src2, VR256:$src1,
|
||||
(MemFrag256 addr:$src3))))]>, VEX_L;
|
||||
}
|
||||
} // Constraints = "$src1 = $dst"
|
||||
|
||||
multiclass fma3p_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
|
||||
string OpcodeStr, string PackTy,
|
||||
PatFrag MemFrag128, PatFrag MemFrag256,
|
||||
SDNode Op, ValueType OpTy128, ValueType OpTy256> {
|
||||
// For 213, both the register and memory variant are commutable.
|
||||
// Indeed, the commutable operands are 1 and 2 and both live in registers
|
||||
// for both variants.
|
||||
defm r213 : fma3p_rm<opc213,
|
||||
!strconcat(OpcodeStr, "213", PackTy),
|
||||
MemFrag128, MemFrag256, OpTy128, OpTy256,
|
||||
/* IsRVariantCommutable */ 1,
|
||||
/* IsMVariantCommutable */ 1,
|
||||
Op>;
|
||||
let hasSideEffects = 0 in {
|
||||
MemFrag128, MemFrag256, OpTy128, OpTy256, Op>;
|
||||
defm r132 : fma3p_rm<opc132,
|
||||
!strconcat(OpcodeStr, "132", PackTy),
|
||||
MemFrag128, MemFrag256, OpTy128, OpTy256>;
|
||||
// For 231, only the register variant is commutable.
|
||||
// For the memory variant the folded operand must be in 3. Thus,
|
||||
// in that case, it cannot be swapped with 2.
|
||||
defm r231 : fma3p_rm<opc231,
|
||||
!strconcat(OpcodeStr, "231", PackTy),
|
||||
MemFrag128, MemFrag256, OpTy128, OpTy256,
|
||||
/* IsRVariantCommutable */ 1,
|
||||
/* IsMVariantCommutable */ 0>;
|
||||
} // hasSideEffects = 0
|
||||
MemFrag128, MemFrag256, OpTy128, OpTy256>;
|
||||
}
|
||||
|
||||
// Fused Multiply-Add
|
||||
@ -126,32 +130,27 @@ let ExeDomain = SSEPackedDouble in {
|
||||
v4f64>, VEX_W;
|
||||
}
|
||||
|
||||
// All source register operands of FMA instructions can be commuted.
|
||||
// In many cases such commute transformation requires an opcode adjustment,
|
||||
// for example, commuting the operands 1 and 2 in FMA*132 form would require
|
||||
// an opcode change to FMA*231:
|
||||
// All source register operands of FMA opcodes defined in fma3s_rm multiclass
|
||||
// can be commuted. In many cases such commute transformation requires an opcode
|
||||
// adjustment, for example, commuting the operands 1 and 2 in FMA*132 form
|
||||
// would require an opcode change to FMA*231:
|
||||
// FMA*132* reg1, reg2, reg3; // reg1 * reg3 + reg2;
|
||||
// -->
|
||||
// FMA*231* reg2, reg1, reg3; // reg1 * reg3 + reg2;
|
||||
// Currently, the commute transformation is supported for only few FMA forms.
|
||||
// That is the reason why \p IsRVariantCommutable and \p IsMVariantCommutable
|
||||
// parameters are used here.
|
||||
// The general commute operands optimization working for all forms is going
|
||||
// to be implemented soon. (Please, see http://reviews.llvm.org/D13269
|
||||
// for details).
|
||||
let Constraints = "$src1 = $dst", hasSideEffects = 0 in {
|
||||
// Please see more detailed comment at the very beginning of the section
|
||||
// defining FMA3 opcodes above.
|
||||
let Constraints = "$src1 = $dst", isCommutable = 1, hasSideEffects = 0 in
|
||||
multiclass fma3s_rm<bits<8> opc, string OpcodeStr,
|
||||
X86MemOperand x86memop, RegisterClass RC,
|
||||
bit IsRVariantCommutable = 0, bit IsMVariantCommutable = 0,
|
||||
SDPatternOperator OpNode = null_frag> {
|
||||
let usesCustomInserter = 1, isCommutable = IsRVariantCommutable in
|
||||
let usesCustomInserter = 1 in
|
||||
def r : FMA3<opc, MRMSrcReg, (outs RC:$dst),
|
||||
(ins RC:$src1, RC:$src2, RC:$src3),
|
||||
!strconcat(OpcodeStr,
|
||||
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
|
||||
[(set RC:$dst, (OpNode RC:$src2, RC:$src1, RC:$src3))]>;
|
||||
|
||||
let mayLoad = 1, isCommutable = IsMVariantCommutable in
|
||||
let mayLoad = 1 in
|
||||
def m : FMA3<opc, MRMSrcMem, (outs RC:$dst),
|
||||
(ins RC:$src1, RC:$src2, x86memop:$src3),
|
||||
!strconcat(OpcodeStr,
|
||||
@ -159,22 +158,22 @@ multiclass fma3s_rm<bits<8> opc, string OpcodeStr,
|
||||
[(set RC:$dst,
|
||||
(OpNode RC:$src2, RC:$src1, (load addr:$src3)))]>;
|
||||
}
|
||||
} // Constraints = "$src1 = $dst", hasSideEffects = 0
|
||||
|
||||
// These FMA*_Int instructions are defined specially for being used when
|
||||
// the scalar FMA intrinsics are lowered to machine instructions, and in that
|
||||
// sense they are similar to existing ADD*_Int, SUB*_Int, MUL*_Int, etc.
|
||||
// instructions.
|
||||
//
|
||||
// The FMA*_Int instructions are _TEMPORARILY_ defined as NOT commutable.
|
||||
// The upper bits of the result of scalar FMA intrinsics must be copied from
|
||||
// the upper bits of the 1st operand. So, commuting the 1st operand would
|
||||
// invalidate the upper bits of the intrinsic result.
|
||||
// The corresponding optimization which allows commuting 2nd and 3rd operands
|
||||
// of FMA*_Int instructions has been developed and is waiting for
|
||||
// code-review approval and checkin (Please see http://reviews.llvm.org/D13269).
|
||||
// FIXME: The FMA*_Int instructions are TEMPORARILY defined as NOT commutable.
|
||||
// Commuting the 2nd and 3rd source register operands of FMAs is quite trivial
|
||||
// and the corresponding optimization has been developed (please see
|
||||
// http://reviews.llvm.org/D13269 for details). The optimization though needs
|
||||
// some minor tuning to enable it for FMA*_Int opcodes.
|
||||
// Commuting the 1st operand of FMA*_Int requires some additional analysis,
|
||||
// the commute optimization is legal only if all users of FMA*_Int use only
|
||||
// the lowest element of the FMA*_Int instruction.
|
||||
let Constraints = "$src1 = $dst", isCommutable = 0, isCodeGenOnly =1,
|
||||
hasSideEffects = 0 in {
|
||||
hasSideEffects = 0 in
|
||||
multiclass fma3s_rm_int<bits<8> opc, string OpcodeStr,
|
||||
Operand memopr, RegisterClass RC> {
|
||||
def r_Int : FMA3<opc, MRMSrcReg, (outs RC:$dst),
|
||||
@ -190,8 +189,6 @@ multiclass fma3s_rm_int<bits<8> opc, string OpcodeStr,
|
||||
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
|
||||
[]>;
|
||||
}
|
||||
} // Constraints = "$src1 = $dst", isCommutable = 0, isCodeGenOnly =1,
|
||||
// hasSideEffects = 0
|
||||
|
||||
multiclass fma3s_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
|
||||
string OpStr, string PackTy,
|
||||
@ -199,13 +196,8 @@ multiclass fma3s_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
|
||||
X86MemOperand x86memop> {
|
||||
defm r132 : fma3s_rm<opc132, !strconcat(OpStr, "132", PackTy), x86memop, RC>;
|
||||
defm r213 : fma3s_rm<opc213, !strconcat(OpStr, "213", PackTy), x86memop, RC,
|
||||
/* IsRVariantCommutable */ 1,
|
||||
/* IsMVariantCommutable */ 1,
|
||||
OpNode>;
|
||||
defm r231 : fma3s_rm<opc231, !strconcat(OpStr, "231", PackTy), x86memop, RC,
|
||||
/* IsRVariantCommutable */ 1,
|
||||
/* IsMVariantCommutable */ 0,
|
||||
null_frag>;
|
||||
defm r231 : fma3s_rm<opc231, !strconcat(OpStr, "231", PackTy), x86memop, RC>;
|
||||
}
|
||||
|
||||
// The FMA 213 form is created for lowering of scalar FMA intrinsics
|
||||
|
@ -2971,6 +2971,121 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
|
||||
return NewMI;
|
||||
}
|
||||
|
||||
/// Returns true if the given instruction opcode is FMA3.
|
||||
/// Otherwise, returns false.
|
||||
static bool isFMA3(unsigned Opcode) {
|
||||
switch (Opcode) {
|
||||
case X86::VFMADDSDr132r: case X86::VFMADDSDr132m:
|
||||
case X86::VFMADDSSr132r: case X86::VFMADDSSr132m:
|
||||
case X86::VFMSUBSDr132r: case X86::VFMSUBSDr132m:
|
||||
case X86::VFMSUBSSr132r: case X86::VFMSUBSSr132m:
|
||||
case X86::VFNMADDSDr132r: case X86::VFNMADDSDr132m:
|
||||
case X86::VFNMADDSSr132r: case X86::VFNMADDSSr132m:
|
||||
case X86::VFNMSUBSDr132r: case X86::VFNMSUBSDr132m:
|
||||
case X86::VFNMSUBSSr132r: case X86::VFNMSUBSSr132m:
|
||||
|
||||
case X86::VFMADDSDr213r: case X86::VFMADDSDr213m:
|
||||
case X86::VFMADDSSr213r: case X86::VFMADDSSr213m:
|
||||
case X86::VFMSUBSDr213r: case X86::VFMSUBSDr213m:
|
||||
case X86::VFMSUBSSr213r: case X86::VFMSUBSSr213m:
|
||||
case X86::VFNMADDSDr213r: case X86::VFNMADDSDr213m:
|
||||
case X86::VFNMADDSSr213r: case X86::VFNMADDSSr213m:
|
||||
case X86::VFNMSUBSDr213r: case X86::VFNMSUBSDr213m:
|
||||
case X86::VFNMSUBSSr213r: case X86::VFNMSUBSSr213m:
|
||||
|
||||
case X86::VFMADDSDr231r: case X86::VFMADDSDr231m:
|
||||
case X86::VFMADDSSr231r: case X86::VFMADDSSr231m:
|
||||
case X86::VFMSUBSDr231r: case X86::VFMSUBSDr231m:
|
||||
case X86::VFMSUBSSr231r: case X86::VFMSUBSSr231m:
|
||||
case X86::VFNMADDSDr231r: case X86::VFNMADDSDr231m:
|
||||
case X86::VFNMADDSSr231r: case X86::VFNMADDSSr231m:
|
||||
case X86::VFNMSUBSDr231r: case X86::VFNMSUBSDr231m:
|
||||
case X86::VFNMSUBSSr231r: case X86::VFNMSUBSSr231m:
|
||||
|
||||
case X86::VFMADDSUBPDr132r: case X86::VFMADDSUBPDr132m:
|
||||
case X86::VFMADDSUBPSr132r: case X86::VFMADDSUBPSr132m:
|
||||
case X86::VFMSUBADDPDr132r: case X86::VFMSUBADDPDr132m:
|
||||
case X86::VFMSUBADDPSr132r: case X86::VFMSUBADDPSr132m:
|
||||
case X86::VFMADDSUBPDr132rY: case X86::VFMADDSUBPDr132mY:
|
||||
case X86::VFMADDSUBPSr132rY: case X86::VFMADDSUBPSr132mY:
|
||||
case X86::VFMSUBADDPDr132rY: case X86::VFMSUBADDPDr132mY:
|
||||
case X86::VFMSUBADDPSr132rY: case X86::VFMSUBADDPSr132mY:
|
||||
|
||||
case X86::VFMADDPDr132r: case X86::VFMADDPDr132m:
|
||||
case X86::VFMADDPSr132r: case X86::VFMADDPSr132m:
|
||||
case X86::VFMSUBPDr132r: case X86::VFMSUBPDr132m:
|
||||
case X86::VFMSUBPSr132r: case X86::VFMSUBPSr132m:
|
||||
case X86::VFNMADDPDr132r: case X86::VFNMADDPDr132m:
|
||||
case X86::VFNMADDPSr132r: case X86::VFNMADDPSr132m:
|
||||
case X86::VFNMSUBPDr132r: case X86::VFNMSUBPDr132m:
|
||||
case X86::VFNMSUBPSr132r: case X86::VFNMSUBPSr132m:
|
||||
case X86::VFMADDPDr132rY: case X86::VFMADDPDr132mY:
|
||||
case X86::VFMADDPSr132rY: case X86::VFMADDPSr132mY:
|
||||
case X86::VFMSUBPDr132rY: case X86::VFMSUBPDr132mY:
|
||||
case X86::VFMSUBPSr132rY: case X86::VFMSUBPSr132mY:
|
||||
case X86::VFNMADDPDr132rY: case X86::VFNMADDPDr132mY:
|
||||
case X86::VFNMADDPSr132rY: case X86::VFNMADDPSr132mY:
|
||||
case X86::VFNMSUBPDr132rY: case X86::VFNMSUBPDr132mY:
|
||||
case X86::VFNMSUBPSr132rY: case X86::VFNMSUBPSr132mY:
|
||||
|
||||
case X86::VFMADDSUBPDr213r: case X86::VFMADDSUBPDr213m:
|
||||
case X86::VFMADDSUBPSr213r: case X86::VFMADDSUBPSr213m:
|
||||
case X86::VFMSUBADDPDr213r: case X86::VFMSUBADDPDr213m:
|
||||
case X86::VFMSUBADDPSr213r: case X86::VFMSUBADDPSr213m:
|
||||
case X86::VFMADDSUBPDr213rY: case X86::VFMADDSUBPDr213mY:
|
||||
case X86::VFMADDSUBPSr213rY: case X86::VFMADDSUBPSr213mY:
|
||||
case X86::VFMSUBADDPDr213rY: case X86::VFMSUBADDPDr213mY:
|
||||
case X86::VFMSUBADDPSr213rY: case X86::VFMSUBADDPSr213mY:
|
||||
|
||||
case X86::VFMADDPDr213r: case X86::VFMADDPDr213m:
|
||||
case X86::VFMADDPSr213r: case X86::VFMADDPSr213m:
|
||||
case X86::VFMSUBPDr213r: case X86::VFMSUBPDr213m:
|
||||
case X86::VFMSUBPSr213r: case X86::VFMSUBPSr213m:
|
||||
case X86::VFNMADDPDr213r: case X86::VFNMADDPDr213m:
|
||||
case X86::VFNMADDPSr213r: case X86::VFNMADDPSr213m:
|
||||
case X86::VFNMSUBPDr213r: case X86::VFNMSUBPDr213m:
|
||||
case X86::VFNMSUBPSr213r: case X86::VFNMSUBPSr213m:
|
||||
case X86::VFMADDPDr213rY: case X86::VFMADDPDr213mY:
|
||||
case X86::VFMADDPSr213rY: case X86::VFMADDPSr213mY:
|
||||
case X86::VFMSUBPDr213rY: case X86::VFMSUBPDr213mY:
|
||||
case X86::VFMSUBPSr213rY: case X86::VFMSUBPSr213mY:
|
||||
case X86::VFNMADDPDr213rY: case X86::VFNMADDPDr213mY:
|
||||
case X86::VFNMADDPSr213rY: case X86::VFNMADDPSr213mY:
|
||||
case X86::VFNMSUBPDr213rY: case X86::VFNMSUBPDr213mY:
|
||||
case X86::VFNMSUBPSr213rY: case X86::VFNMSUBPSr213mY:
|
||||
|
||||
case X86::VFMADDSUBPDr231r: case X86::VFMADDSUBPDr231m:
|
||||
case X86::VFMADDSUBPSr231r: case X86::VFMADDSUBPSr231m:
|
||||
case X86::VFMSUBADDPDr231r: case X86::VFMSUBADDPDr231m:
|
||||
case X86::VFMSUBADDPSr231r: case X86::VFMSUBADDPSr231m:
|
||||
case X86::VFMADDSUBPDr231rY: case X86::VFMADDSUBPDr231mY:
|
||||
case X86::VFMADDSUBPSr231rY: case X86::VFMADDSUBPSr231mY:
|
||||
case X86::VFMSUBADDPDr231rY: case X86::VFMSUBADDPDr231mY:
|
||||
case X86::VFMSUBADDPSr231rY: case X86::VFMSUBADDPSr231mY:
|
||||
|
||||
case X86::VFMADDPDr231r: case X86::VFMADDPDr231m:
|
||||
case X86::VFMADDPSr231r: case X86::VFMADDPSr231m:
|
||||
case X86::VFMSUBPDr231r: case X86::VFMSUBPDr231m:
|
||||
case X86::VFMSUBPSr231r: case X86::VFMSUBPSr231m:
|
||||
case X86::VFNMADDPDr231r: case X86::VFNMADDPDr231m:
|
||||
case X86::VFNMADDPSr231r: case X86::VFNMADDPSr231m:
|
||||
case X86::VFNMSUBPDr231r: case X86::VFNMSUBPDr231m:
|
||||
case X86::VFNMSUBPSr231r: case X86::VFNMSUBPSr231m:
|
||||
case X86::VFMADDPDr231rY: case X86::VFMADDPDr231mY:
|
||||
case X86::VFMADDPSr231rY: case X86::VFMADDPSr231mY:
|
||||
case X86::VFMSUBPDr231rY: case X86::VFMSUBPDr231mY:
|
||||
case X86::VFMSUBPSr231rY: case X86::VFMSUBPSr231mY:
|
||||
case X86::VFNMADDPDr231rY: case X86::VFNMADDPDr231mY:
|
||||
case X86::VFNMADDPSr231rY: case X86::VFNMADDPSr231mY:
|
||||
case X86::VFNMSUBPDr231rY: case X86::VFNMSUBPDr231mY:
|
||||
case X86::VFNMSUBPSr231rY: case X86::VFNMSUBPSr231mY:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
llvm_unreachable("Opcode not handled by the switch");
|
||||
}
|
||||
|
||||
MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr *MI,
|
||||
bool NewMI,
|
||||
unsigned OpIdx1,
|
||||
@ -3181,10 +3296,219 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr *MI,
|
||||
// Fallthrough intended.
|
||||
}
|
||||
default:
|
||||
if (isFMA3(MI->getOpcode())) {
|
||||
unsigned Opc = getFMA3OpcodeToCommuteOperands(MI, OpIdx1, OpIdx2);
|
||||
if (Opc == 0)
|
||||
return nullptr;
|
||||
if (NewMI) {
|
||||
MachineFunction &MF = *MI->getParent()->getParent();
|
||||
MI = MF.CloneMachineInstr(MI);
|
||||
NewMI = false;
|
||||
}
|
||||
MI->setDesc(get(Opc));
|
||||
}
|
||||
return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
|
||||
}
|
||||
}
|
||||
|
||||
bool X86InstrInfo::findFMA3CommutedOpIndices(MachineInstr *MI,
|
||||
unsigned &SrcOpIdx1,
|
||||
unsigned &SrcOpIdx2) const {
|
||||
|
||||
unsigned RegOpsNum = isMem(MI, 3) ? 2 : 3;
|
||||
|
||||
// Only the first RegOpsNum operands are commutable.
|
||||
// Also, the value 'CommuteAnyOperandIndex' is valid here as it means
|
||||
// that the operand is not specified/fixed.
|
||||
if (SrcOpIdx1 != CommuteAnyOperandIndex &&
|
||||
(SrcOpIdx1 < 1 || SrcOpIdx1 > RegOpsNum))
|
||||
return false;
|
||||
if (SrcOpIdx2 != CommuteAnyOperandIndex &&
|
||||
(SrcOpIdx2 < 1 || SrcOpIdx2 > RegOpsNum))
|
||||
return false;
|
||||
|
||||
// Look for two different register operands assumed to be commutable
|
||||
// regardless of the FMA opcode. The FMA opcode is adjusted later.
|
||||
if (SrcOpIdx1 == CommuteAnyOperandIndex ||
|
||||
SrcOpIdx2 == CommuteAnyOperandIndex) {
|
||||
unsigned CommutableOpIdx1 = SrcOpIdx1;
|
||||
unsigned CommutableOpIdx2 = SrcOpIdx2;
|
||||
|
||||
// At least one of the operands to be commuted is not specified and
|
||||
// this method is free to choose appropriate commutable operands.
|
||||
if (SrcOpIdx1 == SrcOpIdx2)
|
||||
// Both of operands are not fixed. By default set one of commutable
|
||||
// operands to the last register operand of the instruction.
|
||||
CommutableOpIdx2 = RegOpsNum;
|
||||
else if (SrcOpIdx2 == CommuteAnyOperandIndex)
|
||||
// Only one of operands is not fixed.
|
||||
CommutableOpIdx2 = SrcOpIdx1;
|
||||
|
||||
// CommutableOpIdx2 is well defined now. Let's choose another commutable
|
||||
// operand and assign its index to CommutableOpIdx1.
|
||||
unsigned Op2Reg = MI->getOperand(CommutableOpIdx2).getReg();
|
||||
for (CommutableOpIdx1 = RegOpsNum; CommutableOpIdx1 > 0; CommutableOpIdx1--) {
|
||||
// The commuted operands must have different registers.
|
||||
// Otherwise, the commute transformation does not change anything and
|
||||
// is useless then.
|
||||
if (Op2Reg != MI->getOperand(CommutableOpIdx1).getReg())
|
||||
break;
|
||||
}
|
||||
|
||||
// No appropriate commutable operands were found.
|
||||
if (CommutableOpIdx1 == 0)
|
||||
return false;
|
||||
|
||||
// Assign the found pair of commutable indices to SrcOpIdx1 and SrcOpIdx2
|
||||
// to return those values.
|
||||
if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2,
|
||||
CommutableOpIdx1, CommutableOpIdx2))
|
||||
return false;
|
||||
}
|
||||
|
||||
// Check if we can adjust the opcode to preserve the semantics when
|
||||
// commuting the register operands.
|
||||
return getFMA3OpcodeToCommuteOperands(MI, SrcOpIdx1, SrcOpIdx2) != 0;
|
||||
}
|
||||
|
||||
unsigned X86InstrInfo::getFMA3OpcodeToCommuteOperands(MachineInstr *MI,
|
||||
unsigned SrcOpIdx1,
|
||||
unsigned SrcOpIdx2) const {
|
||||
unsigned Opc = MI->getOpcode();
|
||||
|
||||
// Define the array that holds FMA opcodes in groups
|
||||
// of 3 opcodes(132, 213, 231) in each group.
|
||||
static const unsigned OpcodeGroups[][3] = {
|
||||
{ X86::VFMADDSSr132r, X86::VFMADDSSr213r, X86::VFMADDSSr231r },
|
||||
{ X86::VFMADDSDr132r, X86::VFMADDSDr213r, X86::VFMADDSDr231r },
|
||||
{ X86::VFMADDPSr132r, X86::VFMADDPSr213r, X86::VFMADDPSr231r },
|
||||
{ X86::VFMADDPDr132r, X86::VFMADDPDr213r, X86::VFMADDPDr231r },
|
||||
{ X86::VFMADDPSr132rY, X86::VFMADDPSr213rY, X86::VFMADDPSr231rY },
|
||||
{ X86::VFMADDPDr132rY, X86::VFMADDPDr213rY, X86::VFMADDPDr231rY },
|
||||
{ X86::VFMADDSSr132m, X86::VFMADDSSr213m, X86::VFMADDSSr231m },
|
||||
{ X86::VFMADDSDr132m, X86::VFMADDSDr213m, X86::VFMADDSDr231m },
|
||||
{ X86::VFMADDPSr132m, X86::VFMADDPSr213m, X86::VFMADDPSr231m },
|
||||
{ X86::VFMADDPDr132m, X86::VFMADDPDr213m, X86::VFMADDPDr231m },
|
||||
{ X86::VFMADDPSr132mY, X86::VFMADDPSr213mY, X86::VFMADDPSr231mY },
|
||||
{ X86::VFMADDPDr132mY, X86::VFMADDPDr213mY, X86::VFMADDPDr231mY },
|
||||
|
||||
{ X86::VFMSUBSSr132r, X86::VFMSUBSSr213r, X86::VFMSUBSSr231r },
|
||||
{ X86::VFMSUBSDr132r, X86::VFMSUBSDr213r, X86::VFMSUBSDr231r },
|
||||
{ X86::VFMSUBPSr132r, X86::VFMSUBPSr213r, X86::VFMSUBPSr231r },
|
||||
{ X86::VFMSUBPDr132r, X86::VFMSUBPDr213r, X86::VFMSUBPDr231r },
|
||||
{ X86::VFMSUBPSr132rY, X86::VFMSUBPSr213rY, X86::VFMSUBPSr231rY },
|
||||
{ X86::VFMSUBPDr132rY, X86::VFMSUBPDr213rY, X86::VFMSUBPDr231rY },
|
||||
{ X86::VFMSUBSSr132m, X86::VFMSUBSSr213m, X86::VFMSUBSSr231m },
|
||||
{ X86::VFMSUBSDr132m, X86::VFMSUBSDr213m, X86::VFMSUBSDr231m },
|
||||
{ X86::VFMSUBPSr132m, X86::VFMSUBPSr213m, X86::VFMSUBPSr231m },
|
||||
{ X86::VFMSUBPDr132m, X86::VFMSUBPDr213m, X86::VFMSUBPDr231m },
|
||||
{ X86::VFMSUBPSr132mY, X86::VFMSUBPSr213mY, X86::VFMSUBPSr231mY },
|
||||
{ X86::VFMSUBPDr132mY, X86::VFMSUBPDr213mY, X86::VFMSUBPDr231mY },
|
||||
|
||||
{ X86::VFNMADDSSr132r, X86::VFNMADDSSr213r, X86::VFNMADDSSr231r },
|
||||
{ X86::VFNMADDSDr132r, X86::VFNMADDSDr213r, X86::VFNMADDSDr231r },
|
||||
{ X86::VFNMADDPSr132r, X86::VFNMADDPSr213r, X86::VFNMADDPSr231r },
|
||||
{ X86::VFNMADDPDr132r, X86::VFNMADDPDr213r, X86::VFNMADDPDr231r },
|
||||
{ X86::VFNMADDPSr132rY, X86::VFNMADDPSr213rY, X86::VFNMADDPSr231rY },
|
||||
{ X86::VFNMADDPDr132rY, X86::VFNMADDPDr213rY, X86::VFNMADDPDr231rY },
|
||||
{ X86::VFNMADDSSr132m, X86::VFNMADDSSr213m, X86::VFNMADDSSr231m },
|
||||
{ X86::VFNMADDSDr132m, X86::VFNMADDSDr213m, X86::VFNMADDSDr231m },
|
||||
{ X86::VFNMADDPSr132m, X86::VFNMADDPSr213m, X86::VFNMADDPSr231m },
|
||||
{ X86::VFNMADDPDr132m, X86::VFNMADDPDr213m, X86::VFNMADDPDr231m },
|
||||
{ X86::VFNMADDPSr132mY, X86::VFNMADDPSr213mY, X86::VFNMADDPSr231mY },
|
||||
{ X86::VFNMADDPDr132mY, X86::VFNMADDPDr213mY, X86::VFNMADDPDr231mY },
|
||||
|
||||
{ X86::VFNMSUBSSr132r, X86::VFNMSUBSSr213r, X86::VFNMSUBSSr231r },
|
||||
{ X86::VFNMSUBSDr132r, X86::VFNMSUBSDr213r, X86::VFNMSUBSDr231r },
|
||||
{ X86::VFNMSUBPSr132r, X86::VFNMSUBPSr213r, X86::VFNMSUBPSr231r },
|
||||
{ X86::VFNMSUBPDr132r, X86::VFNMSUBPDr213r, X86::VFNMSUBPDr231r },
|
||||
{ X86::VFNMSUBPSr132rY, X86::VFNMSUBPSr213rY, X86::VFNMSUBPSr231rY },
|
||||
{ X86::VFNMSUBPDr132rY, X86::VFNMSUBPDr213rY, X86::VFNMSUBPDr231rY },
|
||||
{ X86::VFNMSUBSSr132m, X86::VFNMSUBSSr213m, X86::VFNMSUBSSr231m },
|
||||
{ X86::VFNMSUBSDr132m, X86::VFNMSUBSDr213m, X86::VFNMSUBSDr231m },
|
||||
{ X86::VFNMSUBPSr132m, X86::VFNMSUBPSr213m, X86::VFNMSUBPSr231m },
|
||||
{ X86::VFNMSUBPDr132m, X86::VFNMSUBPDr213m, X86::VFNMSUBPDr231m },
|
||||
{ X86::VFNMSUBPSr132mY, X86::VFNMSUBPSr213mY, X86::VFNMSUBPSr231mY },
|
||||
{ X86::VFNMSUBPDr132mY, X86::VFNMSUBPDr213mY, X86::VFNMSUBPDr231mY },
|
||||
|
||||
{ X86::VFMADDSUBPSr132r, X86::VFMADDSUBPSr213r, X86::VFMADDSUBPSr231r },
|
||||
{ X86::VFMADDSUBPDr132r, X86::VFMADDSUBPDr213r, X86::VFMADDSUBPDr231r },
|
||||
{ X86::VFMADDSUBPSr132rY, X86::VFMADDSUBPSr213rY, X86::VFMADDSUBPSr231rY },
|
||||
{ X86::VFMADDSUBPDr132rY, X86::VFMADDSUBPDr213rY, X86::VFMADDSUBPDr231rY },
|
||||
{ X86::VFMADDSUBPSr132m, X86::VFMADDSUBPSr213m, X86::VFMADDSUBPSr231m },
|
||||
{ X86::VFMADDSUBPDr132m, X86::VFMADDSUBPDr213m, X86::VFMADDSUBPDr231m },
|
||||
{ X86::VFMADDSUBPSr132mY, X86::VFMADDSUBPSr213mY, X86::VFMADDSUBPSr231mY },
|
||||
{ X86::VFMADDSUBPDr132mY, X86::VFMADDSUBPDr213mY, X86::VFMADDSUBPDr231mY },
|
||||
|
||||
{ X86::VFMSUBADDPSr132r, X86::VFMSUBADDPSr213r, X86::VFMSUBADDPSr231r },
|
||||
{ X86::VFMSUBADDPDr132r, X86::VFMSUBADDPDr213r, X86::VFMSUBADDPDr231r },
|
||||
{ X86::VFMSUBADDPSr132rY, X86::VFMSUBADDPSr213rY, X86::VFMSUBADDPSr231rY },
|
||||
{ X86::VFMSUBADDPDr132rY, X86::VFMSUBADDPDr213rY, X86::VFMSUBADDPDr231rY },
|
||||
{ X86::VFMSUBADDPSr132m, X86::VFMSUBADDPSr213m, X86::VFMSUBADDPSr231m },
|
||||
{ X86::VFMSUBADDPDr132m, X86::VFMSUBADDPDr213m, X86::VFMSUBADDPDr231m },
|
||||
{ X86::VFMSUBADDPSr132mY, X86::VFMSUBADDPSr213mY, X86::VFMSUBADDPSr231mY },
|
||||
{ X86::VFMSUBADDPDr132mY, X86::VFMSUBADDPDr213mY, X86::VFMSUBADDPDr231mY }
|
||||
};
|
||||
const unsigned Form132Index = 0;
|
||||
const unsigned Form213Index = 1;
|
||||
const unsigned Form231Index = 2;
|
||||
const unsigned FormsNum = 3;
|
||||
|
||||
// Look for the input opcode in the OpcodeGroups table.
|
||||
unsigned OpcodeGroupsNum = sizeof(OpcodeGroups) / sizeof(OpcodeGroups[0]);
|
||||
unsigned GroupIndex = 0, FormIndex = FormsNum;
|
||||
for (; GroupIndex < OpcodeGroupsNum && FormIndex == FormsNum; GroupIndex++) {
|
||||
for (FormIndex = 0; FormIndex < FormsNum; FormIndex++) {
|
||||
if (OpcodeGroups[GroupIndex][FormIndex] == Opc)
|
||||
break;
|
||||
}
|
||||
}
|
||||
// Input opcode does not match with any of the opcodes from the table.
|
||||
if (FormIndex == FormsNum)
|
||||
return 0;
|
||||
// Do not forget to fix the GroupIndex after the loop.
|
||||
GroupIndex--;
|
||||
|
||||
// Put the lowest index to SrcOpIdx1 to simplify the checks below.
|
||||
if (SrcOpIdx1 > SrcOpIdx2)
|
||||
std::swap(SrcOpIdx1, SrcOpIdx2);
|
||||
|
||||
unsigned Case;
|
||||
if (SrcOpIdx1 == 1 && SrcOpIdx2 == 2)
|
||||
Case = 0;
|
||||
else if (SrcOpIdx1 == 1 && SrcOpIdx2 == 3)
|
||||
Case = 1;
|
||||
else if (SrcOpIdx1 == 2 && SrcOpIdx2 == 3)
|
||||
Case = 2;
|
||||
else
|
||||
return 0;
|
||||
|
||||
// Define the FMA forms mapping array that helps to map input FMA form
|
||||
// to output FMA form to preserve the operation semantics after
|
||||
// commuting the operands.
|
||||
static const unsigned FormMapping[][3] = {
|
||||
// 0: SrcOpIdx1 == 1 && SrcOpIdx2 == 2;
|
||||
// FMA132 A, C, b; ==> FMA231 C, A, b;
|
||||
// FMA213 B, A, c; ==> FMA213 A, B, c;
|
||||
// FMA231 C, A, b; ==> FMA132 A, C, b;
|
||||
{ Form231Index, Form213Index, Form132Index },
|
||||
// 1: SrcOpIdx1 == 1 && SrcOpIdx2 == 3;
|
||||
// FMA132 A, c, B; ==> FMA132 B, c, A;
|
||||
// FMA213 B, a, C; ==> FMA231 C, a, B;
|
||||
// FMA231 C, a, B; ==> FMA213 B, a, C;
|
||||
{ Form132Index, Form231Index, Form213Index },
|
||||
// 2: SrcOpIdx1 == 2 && SrcOpIdx2 == 3;
|
||||
// FMA132 a, C, B; ==> FMA213 a, B, C;
|
||||
// FMA213 b, A, C; ==> FMA132 b, C, A;
|
||||
// FMA231 c, A, B; ==> FMA231 c, B, A;
|
||||
{ Form213Index, Form132Index, Form231Index }
|
||||
};
|
||||
|
||||
// Everything is ready, just adjust the FMA opcode and return it.
|
||||
FormIndex = FormMapping[Case][FormIndex];
|
||||
return OpcodeGroups[GroupIndex][FormIndex];
|
||||
}
|
||||
|
||||
bool X86InstrInfo::findCommutedOpIndices(MachineInstr *MI,
|
||||
unsigned &SrcOpIdx1,
|
||||
unsigned &SrcOpIdx2) const {
|
||||
@ -3209,34 +3533,9 @@ bool X86InstrInfo::findCommutedOpIndices(MachineInstr *MI,
|
||||
}
|
||||
return false;
|
||||
}
|
||||
case X86::VFMADDPDr231r:
|
||||
case X86::VFMADDPSr231r:
|
||||
case X86::VFMADDSDr231r:
|
||||
case X86::VFMADDSSr231r:
|
||||
case X86::VFMSUBPDr231r:
|
||||
case X86::VFMSUBPSr231r:
|
||||
case X86::VFMSUBSDr231r:
|
||||
case X86::VFMSUBSSr231r:
|
||||
case X86::VFNMADDPDr231r:
|
||||
case X86::VFNMADDPSr231r:
|
||||
case X86::VFNMADDSDr231r:
|
||||
case X86::VFNMADDSSr231r:
|
||||
case X86::VFNMSUBPDr231r:
|
||||
case X86::VFNMSUBPSr231r:
|
||||
case X86::VFNMSUBSDr231r:
|
||||
case X86::VFNMSUBSSr231r:
|
||||
case X86::VFMADDPDr231rY:
|
||||
case X86::VFMADDPSr231rY:
|
||||
case X86::VFMSUBPDr231rY:
|
||||
case X86::VFMSUBPSr231rY:
|
||||
case X86::VFNMADDPDr231rY:
|
||||
case X86::VFNMADDPSr231rY:
|
||||
case X86::VFNMSUBPDr231rY:
|
||||
case X86::VFNMSUBPSr231rY:
|
||||
// The indices of the commutable operands are 2 and 3.
|
||||
// Assign them to the returned operand indices here.
|
||||
return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 2, 3);
|
||||
default:
|
||||
if (isFMA3(MI->getOpcode()))
|
||||
return findFMA3CommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
|
||||
return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
|
||||
}
|
||||
return false;
|
||||
|
@ -264,6 +264,46 @@ public:
|
||||
bool findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1,
|
||||
unsigned &SrcOpIdx2) const override;
|
||||
|
||||
/// Returns true if the routine could find two commutable operands
|
||||
/// in the given FMA instruction. Otherwise, returns false.
|
||||
///
|
||||
/// \p SrcOpIdx1 and \p SrcOpIdx2 are INPUT and OUTPUT arguments.
|
||||
/// The output indices of the commuted operands are returned in these
|
||||
/// arguments. Also, the input values of these arguments may be preset either
|
||||
/// to indices of operands that must be commuted or be equal to a special
|
||||
/// value 'CommuteAnyOperandIndex' which means that the corresponding
|
||||
/// operand index is not set and this method is free to pick any of
|
||||
/// available commutable operands.
|
||||
///
|
||||
/// For example, calling this method this way:
|
||||
/// unsigned Idx1 = 1, Idx2 = CommuteAnyOperandIndex;
|
||||
/// findFMA3CommutedOpIndices(MI, Idx1, Idx2);
|
||||
/// can be interpreted as a query asking if the operand #1 can be swapped
|
||||
/// with any other available operand (e.g. operand #2, operand #3, etc.).
|
||||
///
|
||||
/// The commuted instruction's opcode may differ from the one in the given MI.
|
||||
/// For example, commuting the operands #1 and #3 in the following FMA
|
||||
/// FMA213 #1, #2, #3
|
||||
/// results in an instruction with the adjusted opcode:
|
||||
/// FMA231 #3, #2, #1
|
||||
bool findFMA3CommutedOpIndices(MachineInstr *MI,
|
||||
unsigned &SrcOpIdx1,
|
||||
unsigned &SrcOpIdx2) const;
|
||||
|
||||
/// Returns an adjusted FMA opcode that must be used in FMA instruction that
|
||||
/// performs the same computations as the given MI but which has the operands
|
||||
/// \p SrcOpIdx1 and \p SrcOpIdx2 commuted.
|
||||
/// It may return 0 if it is unsafe to commute the operands.
|
||||
///
|
||||
/// The returned FMA opcode may differ from the opcode in the given \p MI.
|
||||
/// For example, commuting the operands #1 and #3 in the following FMA
|
||||
/// FMA213 #1, #2, #3
|
||||
/// results in an instruction with the adjusted opcode:
|
||||
/// FMA231 #3, #2, #1
|
||||
unsigned getFMA3OpcodeToCommuteOperands(MachineInstr *MI,
|
||||
unsigned SrcOpIdx1,
|
||||
unsigned SrcOpIdx2) const;
|
||||
|
||||
// Branch analysis.
|
||||
bool isUnpredicatedTerminator(const MachineInstr* MI) const override;
|
||||
bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
|
||||
|
506
test/CodeGen/X86/fma-commute-x86.ll
Normal file
506
test/CodeGen/X86/fma-commute-x86.ll
Normal file
@ -0,0 +1,506 @@
|
||||
; RUN: llc < %s -mtriple=x86_64-pc-win32 -mcpu=core-avx2 | FileCheck %s
|
||||
; RUN: llc < %s -mtriple=x86_64-pc-win32 -mattr=+fma,+fma4 | FileCheck %s
|
||||
; RUN: llc < %s -mcpu=bdver2 -mtriple=x86_64-pc-win32 -mattr=-fma4 | FileCheck %s
|
||||
|
||||
attributes #0 = { nounwind }
|
||||
|
||||
declare <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
|
||||
define <4 x float> @test_x86_fmadd_baa_ps(<4 x float> %a, <4 x float> %b) #0 {
|
||||
; CHECK-LABEL: test_x86_fmadd_baa_ps:
|
||||
; CHECK: # BB#0:
|
||||
; CHECK-NEXT: vmovaps (%rcx), %xmm0
|
||||
; CHECK-NEXT: vfmadd132ps (%rdx), %xmm0, %xmm0
|
||||
; CHECK-NEXT: retq
|
||||
%res = call <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float> %b, <4 x float> %a, <4 x float> %a) nounwind
|
||||
ret <4 x float> %res
|
||||
}
|
||||
|
||||
define <4 x float> @test_x86_fmadd_aba_ps(<4 x float> %a, <4 x float> %b) #0 {
|
||||
; CHECK-LABEL: test_x86_fmadd_aba_ps:
|
||||
; CHECK: # BB#0:
|
||||
; CHECK-NEXT: vmovaps (%rcx), %xmm0
|
||||
; CHECK-NEXT: vfmadd231ps (%rdx), %xmm0, %xmm0
|
||||
; CHECK-NEXT: retq
|
||||
%res = call <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float> %a, <4 x float> %b, <4 x float> %a) nounwind
|
||||
ret <4 x float> %res
|
||||
}
|
||||
|
||||
define <4 x float> @test_x86_fmadd_bba_ps(<4 x float> %a, <4 x float> %b) #0 {
|
||||
; CHECK-LABEL: test_x86_fmadd_bba_ps:
|
||||
; CHECK: # BB#0:
|
||||
; CHECK-NEXT: vmovaps (%rdx), %xmm0
|
||||
; CHECK-NEXT: vfmadd213ps (%rcx), %xmm0, %xmm0
|
||||
; CHECK-NEXT: retq
|
||||
%res = call <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float> %b, <4 x float> %b, <4 x float> %a) nounwind
|
||||
ret <4 x float> %res
|
||||
}
|
||||
|
||||
declare <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x float>) nounwind readnone
|
||||
define <8 x float> @test_x86_fmadd_baa_ps_y(<8 x float> %a, <8 x float> %b) #0 {
|
||||
; CHECK-LABEL: test_x86_fmadd_baa_ps_y:
|
||||
; CHECK: # BB#0:
|
||||
; CHECK-NEXT: vmovaps (%rcx), %ymm0
|
||||
; CHECK-NEXT: vfmadd132ps (%rdx), %ymm0, %ymm0
|
||||
; CHECK-NEXT: retq
|
||||
%res = call <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float> %b, <8 x float> %a, <8 x float> %a) nounwind
|
||||
ret <8 x float> %res
|
||||
}
|
||||
|
||||
define <8 x float> @test_x86_fmadd_aba_ps_y(<8 x float> %a, <8 x float> %b) #0 {
|
||||
; CHECK-LABEL: test_x86_fmadd_aba_ps_y:
|
||||
; CHECK: # BB#0:
|
||||
; CHECK-NEXT: vmovaps (%rcx), %ymm0
|
||||
; CHECK-NEXT: vfmadd231ps (%rdx), %ymm0, %ymm0
|
||||
; CHECK-NEXT: retq
|
||||
%res = call <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %a) nounwind
|
||||
ret <8 x float> %res
|
||||
}
|
||||
|
||||
define <8 x float> @test_x86_fmadd_bba_ps_y(<8 x float> %a, <8 x float> %b) #0 {
|
||||
; CHECK-LABEL: test_x86_fmadd_bba_ps_y:
|
||||
; CHECK: # BB#0:
|
||||
; CHECK-NEXT: vmovaps (%rdx), %ymm0
|
||||
; CHECK-NEXT: vfmadd213ps (%rcx), %ymm0, %ymm0
|
||||
; CHECK-NEXT: retq
|
||||
%res = call <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float> %b, <8 x float> %b, <8 x float> %a) nounwind
|
||||
ret <8 x float> %res
|
||||
}
|
||||
|
||||
declare <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
|
||||
define <2 x double> @test_x86_fmadd_baa_pd(<2 x double> %a, <2 x double> %b) #0 {
|
||||
; CHECK-LABEL: test_x86_fmadd_baa_pd:
|
||||
; CHECK: # BB#0:
|
||||
; CHECK-NEXT: vmovapd (%rcx), %xmm0
|
||||
; CHECK-NEXT: vfmadd132pd (%rdx), %xmm0, %xmm0
|
||||
; CHECK-NEXT: retq
|
||||
%res = call <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double> %b, <2 x double> %a, <2 x double> %a) nounwind
|
||||
ret <2 x double> %res
|
||||
}
|
||||
|
||||
define <2 x double> @test_x86_fmadd_aba_pd(<2 x double> %a, <2 x double> %b) #0 {
|
||||
; CHECK-LABEL: test_x86_fmadd_aba_pd:
|
||||
; CHECK: # BB#0:
|
||||
; CHECK-NEXT: vmovapd (%rcx), %xmm0
|
||||
; CHECK-NEXT: vfmadd231pd (%rdx), %xmm0, %xmm0
|
||||
; CHECK-NEXT: retq
|
||||
%res = call <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double> %a, <2 x double> %b, <2 x double> %a) nounwind
|
||||
ret <2 x double> %res
|
||||
}
|
||||
|
||||
define <2 x double> @test_x86_fmadd_bba_pd(<2 x double> %a, <2 x double> %b) #0 {
|
||||
; CHECK-LABEL: test_x86_fmadd_bba_pd:
|
||||
; CHECK: # BB#0:
|
||||
; CHECK-NEXT: vmovapd (%rdx), %xmm0
|
||||
; CHECK-NEXT: vfmadd213pd (%rcx), %xmm0, %xmm0
|
||||
; CHECK-NEXT: retq
|
||||
%res = call <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double> %b, <2 x double> %b, <2 x double> %a) nounwind
|
||||
ret <2 x double> %res
|
||||
}
|
||||
|
||||
declare <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double>, <4 x double>, <4 x double>) nounwind readnone
|
||||
define <4 x double> @test_x86_fmadd_baa_pd_y(<4 x double> %a, <4 x double> %b) #0 {
|
||||
; CHECK-LABEL: test_x86_fmadd_baa_pd_y:
|
||||
; CHECK: # BB#0:
|
||||
; CHECK-NEXT: vmovapd (%rcx), %ymm0
|
||||
; CHECK-NEXT: vfmadd132pd (%rdx), %ymm0, %ymm0
|
||||
; CHECK-NEXT: retq
|
||||
%res = call <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double> %b, <4 x double> %a, <4 x double> %a) nounwind
|
||||
ret <4 x double> %res
|
||||
}
|
||||
|
||||
define <4 x double> @test_x86_fmadd_aba_pd_y(<4 x double> %a, <4 x double> %b) #0 {
|
||||
; CHECK-LABEL: test_x86_fmadd_aba_pd_y:
|
||||
; CHECK: # BB#0:
|
||||
; CHECK-NEXT: vmovapd (%rcx), %ymm0
|
||||
; CHECK-NEXT: vfmadd231pd (%rdx), %ymm0, %ymm0
|
||||
; CHECK-NEXT: retq
|
||||
%res = call <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %a) nounwind
|
||||
ret <4 x double> %res
|
||||
}
|
||||
|
||||
define <4 x double> @test_x86_fmadd_bba_pd_y(<4 x double> %a, <4 x double> %b) #0 {
|
||||
; CHECK-LABEL: test_x86_fmadd_bba_pd_y:
|
||||
; CHECK: # BB#0:
|
||||
; CHECK-NEXT: vmovapd (%rdx), %ymm0
|
||||
; CHECK-NEXT: vfmadd213pd (%rcx), %ymm0, %ymm0
|
||||
; CHECK-NEXT: retq
|
||||
%res = call <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double> %b, <4 x double> %b, <4 x double> %a) nounwind
|
||||
ret <4 x double> %res
|
||||
}
|
||||
|
||||
|
||||
|
||||
declare <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
|
||||
define <4 x float> @test_x86_fnmadd_baa_ps(<4 x float> %a, <4 x float> %b) #0 {
|
||||
; CHECK-LABEL: test_x86_fnmadd_baa_ps:
|
||||
; CHECK: # BB#0:
|
||||
; CHECK-NEXT: vmovaps (%rcx), %xmm0
|
||||
; CHECK-NEXT: vfnmadd132ps (%rdx), %xmm0, %xmm0
|
||||
; CHECK-NEXT: retq
|
||||
%res = call <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float> %b, <4 x float> %a, <4 x float> %a) nounwind
|
||||
ret <4 x float> %res
|
||||
}
|
||||
|
||||
define <4 x float> @test_x86_fnmadd_aba_ps(<4 x float> %a, <4 x float> %b) #0 {
|
||||
; CHECK-LABEL: test_x86_fnmadd_aba_ps:
|
||||
; CHECK: # BB#0:
|
||||
; CHECK-NEXT: vmovaps (%rcx), %xmm0
|
||||
; CHECK-NEXT: vfnmadd231ps (%rdx), %xmm0, %xmm0
|
||||
; CHECK-NEXT: retq
|
||||
%res = call <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float> %a, <4 x float> %b, <4 x float> %a) nounwind
|
||||
ret <4 x float> %res
|
||||
}
|
||||
|
||||
define <4 x float> @test_x86_fnmadd_bba_ps(<4 x float> %a, <4 x float> %b) #0 {
|
||||
; CHECK-LABEL: test_x86_fnmadd_bba_ps:
|
||||
; CHECK: # BB#0:
|
||||
; CHECK-NEXT: vmovaps (%rdx), %xmm0
|
||||
; CHECK-NEXT: vfnmadd213ps (%rcx), %xmm0, %xmm0
|
||||
; CHECK-NEXT: retq
|
||||
%res = call <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float> %b, <4 x float> %b, <4 x float> %a) nounwind
|
||||
ret <4 x float> %res
|
||||
}
|
||||
|
||||
declare <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float>, <8 x float>, <8 x float>) nounwind readnone
|
||||
define <8 x float> @test_x86_fnmadd_baa_ps_y(<8 x float> %a, <8 x float> %b) #0 {
|
||||
; CHECK-LABEL: test_x86_fnmadd_baa_ps_y:
|
||||
; CHECK: # BB#0:
|
||||
; CHECK-NEXT: vmovaps (%rcx), %ymm0
|
||||
; CHECK-NEXT: vfnmadd132ps (%rdx), %ymm0, %ymm0
|
||||
; CHECK-NEXT: retq
|
||||
%res = call <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float> %b, <8 x float> %a, <8 x float> %a) nounwind
|
||||
ret <8 x float> %res
|
||||
}
|
||||
|
||||
define <8 x float> @test_x86_fnmadd_aba_ps_y(<8 x float> %a, <8 x float> %b) #0 {
|
||||
; CHECK-LABEL: test_x86_fnmadd_aba_ps_y:
|
||||
; CHECK: # BB#0:
|
||||
; CHECK-NEXT: vmovaps (%rcx), %ymm0
|
||||
; CHECK-NEXT: vfnmadd231ps (%rdx), %ymm0, %ymm0
|
||||
; CHECK-NEXT: retq
|
||||
%res = call <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %a) nounwind
|
||||
ret <8 x float> %res
|
||||
}
|
||||
|
||||
define <8 x float> @test_x86_fnmadd_bba_ps_y(<8 x float> %a, <8 x float> %b) #0 {
|
||||
; CHECK-LABEL: test_x86_fnmadd_bba_ps_y:
|
||||
; CHECK: # BB#0:
|
||||
; CHECK-NEXT: vmovaps (%rdx), %ymm0
|
||||
; CHECK-NEXT: vfnmadd213ps (%rcx), %ymm0, %ymm0
|
||||
; CHECK-NEXT: retq
|
||||
%res = call <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float> %b, <8 x float> %b, <8 x float> %a) nounwind
|
||||
ret <8 x float> %res
|
||||
}
|
||||
|
||||
declare <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
|
||||
define <2 x double> @test_x86_fnmadd_baa_pd(<2 x double> %a, <2 x double> %b) #0 {
|
||||
; CHECK-LABEL: test_x86_fnmadd_baa_pd:
|
||||
; CHECK: # BB#0:
|
||||
; CHECK-NEXT: vmovapd (%rcx), %xmm0
|
||||
; CHECK-NEXT: vfnmadd132pd (%rdx), %xmm0, %xmm0
|
||||
; CHECK-NEXT: retq
|
||||
%res = call <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double> %b, <2 x double> %a, <2 x double> %a) nounwind
|
||||
ret <2 x double> %res
|
||||
}
|
||||
|
||||
define <2 x double> @test_x86_fnmadd_aba_pd(<2 x double> %a, <2 x double> %b) #0 {
|
||||
; CHECK-LABEL: test_x86_fnmadd_aba_pd:
|
||||
; CHECK: # BB#0:
|
||||
; CHECK-NEXT: vmovapd (%rcx), %xmm0
|
||||
; CHECK-NEXT: vfnmadd231pd (%rdx), %xmm0, %xmm0
|
||||
; CHECK-NEXT: retq
|
||||
%res = call <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double> %a, <2 x double> %b, <2 x double> %a) nounwind
|
||||
ret <2 x double> %res
|
||||
}
|
||||
|
||||
define <2 x double> @test_x86_fnmadd_bba_pd(<2 x double> %a, <2 x double> %b) #0 {
|
||||
; CHECK-LABEL: test_x86_fnmadd_bba_pd:
|
||||
; CHECK: # BB#0:
|
||||
; CHECK-NEXT: vmovapd (%rdx), %xmm0
|
||||
; CHECK-NEXT: vfnmadd213pd (%rcx), %xmm0, %xmm0
|
||||
; CHECK-NEXT: retq
|
||||
%res = call <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double> %b, <2 x double> %b, <2 x double> %a) nounwind
|
||||
ret <2 x double> %res
|
||||
}
|
||||
|
||||
declare <4 x double> @llvm.x86.fma.vfnmadd.pd.256(<4 x double>, <4 x double>, <4 x double>) nounwind readnone
|
||||
define <4 x double> @test_x86_fnmadd_baa_pd_y(<4 x double> %a, <4 x double> %b) #0 {
|
||||
; CHECK-LABEL: test_x86_fnmadd_baa_pd_y:
|
||||
; CHECK: # BB#0:
|
||||
; CHECK-NEXT: vmovapd (%rcx), %ymm0
|
||||
; CHECK-NEXT: vfnmadd132pd (%rdx), %ymm0, %ymm0
|
||||
; CHECK-NEXT: retq
|
||||
%res = call <4 x double> @llvm.x86.fma.vfnmadd.pd.256(<4 x double> %b, <4 x double> %a, <4 x double> %a) nounwind
|
||||
ret <4 x double> %res
|
||||
}
|
||||
|
||||
define <4 x double> @test_x86_fnmadd_aba_pd_y(<4 x double> %a, <4 x double> %b) #0 {
|
||||
; CHECK-LABEL: test_x86_fnmadd_aba_pd_y:
|
||||
; CHECK: # BB#0:
|
||||
; CHECK-NEXT: vmovapd (%rcx), %ymm0
|
||||
; CHECK-NEXT: vfnmadd231pd (%rdx), %ymm0, %ymm0
|
||||
; CHECK-NEXT: retq
|
||||
%res = call <4 x double> @llvm.x86.fma.vfnmadd.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %a) nounwind
|
||||
ret <4 x double> %res
|
||||
}
|
||||
|
||||
define <4 x double> @test_x86_fnmadd_bba_pd_y(<4 x double> %a, <4 x double> %b) #0 {
|
||||
; CHECK-LABEL: test_x86_fnmadd_bba_pd_y:
|
||||
; CHECK: # BB#0:
|
||||
; CHECK-NEXT: vmovapd (%rdx), %ymm0
|
||||
; CHECK-NEXT: vfnmadd213pd (%rcx), %ymm0, %ymm0
|
||||
; CHECK-NEXT: retq
|
||||
%res = call <4 x double> @llvm.x86.fma.vfnmadd.pd.256(<4 x double> %b, <4 x double> %b, <4 x double> %a) nounwind
|
||||
ret <4 x double> %res
|
||||
}
|
||||
|
||||
|
||||
declare <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
|
||||
define <4 x float> @test_x86_fmsub_baa_ps(<4 x float> %a, <4 x float> %b) #0 {
|
||||
; CHECK-LABEL: test_x86_fmsub_baa_ps:
|
||||
; CHECK: # BB#0:
|
||||
; CHECK-NEXT: vmovaps (%rcx), %xmm0
|
||||
; CHECK-NEXT: vfmsub132ps (%rdx), %xmm0, %xmm0
|
||||
; CHECK-NEXT: retq
|
||||
%res = call <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float> %b, <4 x float> %a, <4 x float> %a) nounwind
|
||||
ret <4 x float> %res
|
||||
}
|
||||
|
||||
define <4 x float> @test_x86_fmsub_aba_ps(<4 x float> %a, <4 x float> %b) #0 {
|
||||
; CHECK-LABEL: test_x86_fmsub_aba_ps:
|
||||
; CHECK: # BB#0:
|
||||
; CHECK-NEXT: vmovaps (%rcx), %xmm0
|
||||
; CHECK-NEXT: vfmsub231ps (%rdx), %xmm0, %xmm0
|
||||
; CHECK-NEXT: retq
|
||||
%res = call <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float> %a, <4 x float> %b, <4 x float> %a) nounwind
|
||||
ret <4 x float> %res
|
||||
}
|
||||
|
||||
define <4 x float> @test_x86_fmsub_bba_ps(<4 x float> %a, <4 x float> %b) #0 {
|
||||
; CHECK-LABEL: test_x86_fmsub_bba_ps:
|
||||
; CHECK: # BB#0:
|
||||
; CHECK-NEXT: vmovaps (%rdx), %xmm0
|
||||
; CHECK-NEXT: vfmsub213ps (%rcx), %xmm0, %xmm0
|
||||
; CHECK-NEXT: retq
|
||||
%res = call <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float> %b, <4 x float> %b, <4 x float> %a) nounwind
|
||||
ret <4 x float> %res
|
||||
}
|
||||
|
||||
declare <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float>, <8 x float>, <8 x float>) nounwind readnone
|
||||
define <8 x float> @test_x86_fmsub_baa_ps_y(<8 x float> %a, <8 x float> %b) #0 {
|
||||
; CHECK-LABEL: test_x86_fmsub_baa_ps_y:
|
||||
; CHECK: # BB#0:
|
||||
; CHECK-NEXT: vmovaps (%rcx), %ymm0
|
||||
; CHECK-NEXT: vfmsub132ps (%rdx), %ymm0, %ymm0
|
||||
; CHECK-NEXT: retq
|
||||
%res = call <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float> %b, <8 x float> %a, <8 x float> %a) nounwind
|
||||
ret <8 x float> %res
|
||||
}
|
||||
|
||||
define <8 x float> @test_x86_fmsub_aba_ps_y(<8 x float> %a, <8 x float> %b) #0 {
|
||||
; CHECK-LABEL: test_x86_fmsub_aba_ps_y:
|
||||
; CHECK: # BB#0:
|
||||
; CHECK-NEXT: vmovaps (%rcx), %ymm0
|
||||
; CHECK-NEXT: vfmsub231ps (%rdx), %ymm0, %ymm0
|
||||
; CHECK-NEXT: retq
|
||||
%res = call <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %a) nounwind
|
||||
ret <8 x float> %res
|
||||
}
|
||||
|
||||
define <8 x float> @test_x86_fmsub_bba_ps_y(<8 x float> %a, <8 x float> %b) #0 {
|
||||
; CHECK-LABEL: test_x86_fmsub_bba_ps_y:
|
||||
; CHECK: # BB#0:
|
||||
; CHECK-NEXT: vmovaps (%rdx), %ymm0
|
||||
; CHECK-NEXT: vfmsub213ps (%rcx), %ymm0, %ymm0
|
||||
; CHECK-NEXT: retq
|
||||
%res = call <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float> %b, <8 x float> %b, <8 x float> %a) nounwind
|
||||
ret <8 x float> %res
|
||||
}
|
||||
|
||||
declare <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
|
||||
define <2 x double> @test_x86_fmsub_baa_pd(<2 x double> %a, <2 x double> %b) #0 {
|
||||
; CHECK-LABEL: test_x86_fmsub_baa_pd:
|
||||
; CHECK: # BB#0:
|
||||
; CHECK-NEXT: vmovapd (%rcx), %xmm0
|
||||
; CHECK-NEXT: vfmsub132pd (%rdx), %xmm0, %xmm0
|
||||
; CHECK-NEXT: retq
|
||||
%res = call <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double> %b, <2 x double> %a, <2 x double> %a) nounwind
|
||||
ret <2 x double> %res
|
||||
}
|
||||
|
||||
define <2 x double> @test_x86_fmsub_aba_pd(<2 x double> %a, <2 x double> %b) #0 {
|
||||
; CHECK-LABEL: test_x86_fmsub_aba_pd:
|
||||
; CHECK: # BB#0:
|
||||
; CHECK-NEXT: vmovapd (%rcx), %xmm0
|
||||
; CHECK-NEXT: vfmsub231pd (%rdx), %xmm0, %xmm0
|
||||
; CHECK-NEXT: retq
|
||||
%res = call <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double> %a, <2 x double> %b, <2 x double> %a) nounwind
|
||||
ret <2 x double> %res
|
||||
}
|
||||
|
||||
define <2 x double> @test_x86_fmsub_bba_pd(<2 x double> %a, <2 x double> %b) #0 {
|
||||
; CHECK-LABEL: test_x86_fmsub_bba_pd:
|
||||
; CHECK: # BB#0:
|
||||
; CHECK-NEXT: vmovapd (%rdx), %xmm0
|
||||
; CHECK-NEXT: vfmsub213pd (%rcx), %xmm0, %xmm0
|
||||
; CHECK-NEXT: retq
|
||||
%res = call <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double> %b, <2 x double> %b, <2 x double> %a) nounwind
|
||||
ret <2 x double> %res
|
||||
}
|
||||
|
||||
declare <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double>, <4 x double>, <4 x double>) nounwind readnone
|
||||
define <4 x double> @test_x86_fmsub_baa_pd_y(<4 x double> %a, <4 x double> %b) #0 {
|
||||
; CHECK-LABEL: test_x86_fmsub_baa_pd_y:
|
||||
; CHECK: # BB#0:
|
||||
; CHECK-NEXT: vmovapd (%rcx), %ymm0
|
||||
; CHECK-NEXT: vfmsub132pd (%rdx), %ymm0, %ymm0
|
||||
; CHECK-NEXT: retq
|
||||
%res = call <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double> %b, <4 x double> %a, <4 x double> %a) nounwind
|
||||
ret <4 x double> %res
|
||||
}
|
||||
|
||||
define <4 x double> @test_x86_fmsub_aba_pd_y(<4 x double> %a, <4 x double> %b) #0 {
|
||||
; CHECK-LABEL: test_x86_fmsub_aba_pd_y:
|
||||
; CHECK: # BB#0:
|
||||
; CHECK-NEXT: vmovapd (%rcx), %ymm0
|
||||
; CHECK-NEXT: vfmsub231pd (%rdx), %ymm0, %ymm0
|
||||
; CHECK-NEXT: retq
|
||||
%res = call <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %a) nounwind
|
||||
ret <4 x double> %res
|
||||
}
|
||||
|
||||
define <4 x double> @test_x86_fmsub_bba_pd_y(<4 x double> %a, <4 x double> %b) #0 {
|
||||
; CHECK-LABEL: test_x86_fmsub_bba_pd_y:
|
||||
; CHECK: # BB#0:
|
||||
; CHECK-NEXT: vmovapd (%rdx), %ymm0
|
||||
; CHECK-NEXT: vfmsub213pd (%rcx), %ymm0, %ymm0
|
||||
; CHECK-NEXT: retq
|
||||
%res = call <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double> %b, <4 x double> %b, <4 x double> %a) nounwind
|
||||
ret <4 x double> %res
|
||||
}
|
||||
|
||||
|
||||
declare <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
|
||||
define <4 x float> @test_x86_fnmsub_baa_ps(<4 x float> %a, <4 x float> %b) #0 {
|
||||
; CHECK-LABEL: test_x86_fnmsub_baa_ps:
|
||||
; CHECK: # BB#0:
|
||||
; CHECK-NEXT: vmovaps (%rcx), %xmm0
|
||||
; CHECK-NEXT: vfnmsub132ps (%rdx), %xmm0, %xmm0
|
||||
; CHECK-NEXT: retq
|
||||
%res = call <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float> %b, <4 x float> %a, <4 x float> %a) nounwind
|
||||
ret <4 x float> %res
|
||||
}
|
||||
|
||||
define <4 x float> @test_x86_fnmsub_aba_ps(<4 x float> %a, <4 x float> %b) #0 {
|
||||
; CHECK-LABEL: test_x86_fnmsub_aba_ps:
|
||||
; CHECK: # BB#0:
|
||||
; CHECK-NEXT: vmovaps (%rcx), %xmm0
|
||||
; CHECK-NEXT: vfnmsub231ps (%rdx), %xmm0, %xmm0
|
||||
; CHECK-NEXT: retq
|
||||
%res = call <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float> %a, <4 x float> %b, <4 x float> %a) nounwind
|
||||
ret <4 x float> %res
|
||||
}
|
||||
|
||||
define <4 x float> @test_x86_fnmsub_bba_ps(<4 x float> %a, <4 x float> %b) #0 {
|
||||
; CHECK-LABEL: test_x86_fnmsub_bba_ps:
|
||||
; CHECK: # BB#0:
|
||||
; CHECK-NEXT: vmovaps (%rdx), %xmm0
|
||||
; CHECK-NEXT: vfnmsub213ps (%rcx), %xmm0, %xmm0
|
||||
; CHECK-NEXT: retq
|
||||
%res = call <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float> %b, <4 x float> %b, <4 x float> %a) nounwind
|
||||
ret <4 x float> %res
|
||||
}
|
||||
|
||||
declare <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float>, <8 x float>, <8 x float>) nounwind readnone
|
||||
define <8 x float> @test_x86_fnmsub_baa_ps_y(<8 x float> %a, <8 x float> %b) #0 {
|
||||
; CHECK-LABEL: test_x86_fnmsub_baa_ps_y:
|
||||
; CHECK: # BB#0:
|
||||
; CHECK-NEXT: vmovaps (%rcx), %ymm0
|
||||
; CHECK-NEXT: vfnmsub132ps (%rdx), %ymm0, %ymm0
|
||||
; CHECK-NEXT: retq
|
||||
%res = call <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float> %b, <8 x float> %a, <8 x float> %a) nounwind
|
||||
ret <8 x float> %res
|
||||
}
|
||||
|
||||
define <8 x float> @test_x86_fnmsub_aba_ps_y(<8 x float> %a, <8 x float> %b) #0 {
|
||||
; CHECK-LABEL: test_x86_fnmsub_aba_ps_y:
|
||||
; CHECK: # BB#0:
|
||||
; CHECK-NEXT: vmovaps (%rcx), %ymm0
|
||||
; CHECK-NEXT: vfnmsub231ps (%rdx), %ymm0, %ymm0
|
||||
; CHECK-NEXT: retq
|
||||
%res = call <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %a) nounwind
|
||||
ret <8 x float> %res
|
||||
}
|
||||
|
||||
define <8 x float> @test_x86_fnmsub_bba_ps_y(<8 x float> %a, <8 x float> %b) #0 {
|
||||
; CHECK-LABEL: test_x86_fnmsub_bba_ps_y:
|
||||
; CHECK: # BB#0:
|
||||
; CHECK-NEXT: vmovaps (%rdx), %ymm0
|
||||
; CHECK-NEXT: vfnmsub213ps (%rcx), %ymm0, %ymm0
|
||||
; CHECK-NEXT: retq
|
||||
%res = call <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float> %b, <8 x float> %b, <8 x float> %a) nounwind
|
||||
ret <8 x float> %res
|
||||
}
|
||||
|
||||
declare <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
|
||||
define <2 x double> @test_x86_fnmsub_baa_pd(<2 x double> %a, <2 x double> %b) #0 {
|
||||
; CHECK-LABEL: test_x86_fnmsub_baa_pd:
|
||||
; CHECK: # BB#0:
|
||||
; CHECK-NEXT: vmovapd (%rcx), %xmm0
|
||||
; CHECK-NEXT: vfnmsub132pd (%rdx), %xmm0, %xmm0
|
||||
; CHECK-NEXT: retq
|
||||
%res = call <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double> %b, <2 x double> %a, <2 x double> %a) nounwind
|
||||
ret <2 x double> %res
|
||||
}
|
||||
|
||||
define <2 x double> @test_x86_fnmsub_aba_pd(<2 x double> %a, <2 x double> %b) #0 {
|
||||
; CHECK-LABEL: test_x86_fnmsub_aba_pd:
|
||||
; CHECK: # BB#0:
|
||||
; CHECK-NEXT: vmovapd (%rcx), %xmm0
|
||||
; CHECK-NEXT: vfnmsub231pd (%rdx), %xmm0, %xmm0
|
||||
; CHECK-NEXT: retq
|
||||
%res = call <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double> %a, <2 x double> %b, <2 x double> %a) nounwind
|
||||
ret <2 x double> %res
|
||||
}
|
||||
|
||||
define <2 x double> @test_x86_fnmsub_bba_pd(<2 x double> %a, <2 x double> %b) #0 {
|
||||
; CHECK-LABEL: test_x86_fnmsub_bba_pd:
|
||||
; CHECK: # BB#0:
|
||||
; CHECK-NEXT: vmovapd (%rdx), %xmm0
|
||||
; CHECK-NEXT: vfnmsub213pd (%rcx), %xmm0, %xmm0
|
||||
; CHECK-NEXT: retq
|
||||
%res = call <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double> %b, <2 x double> %b, <2 x double> %a) nounwind
|
||||
ret <2 x double> %res
|
||||
}
|
||||
|
||||
declare <4 x double> @llvm.x86.fma.vfnmsub.pd.256(<4 x double>, <4 x double>, <4 x double>) nounwind readnone
|
||||
define <4 x double> @test_x86_fnmsub_baa_pd_y(<4 x double> %a, <4 x double> %b) #0 {
|
||||
; CHECK-LABEL: test_x86_fnmsub_baa_pd_y:
|
||||
; CHECK: # BB#0:
|
||||
; CHECK-NEXT: vmovapd (%rcx), %ymm0
|
||||
; CHECK-NEXT: vfnmsub132pd (%rdx), %ymm0, %ymm0
|
||||
; CHECK-NEXT: retq
|
||||
%res = call <4 x double> @llvm.x86.fma.vfnmsub.pd.256(<4 x double> %b, <4 x double> %a, <4 x double> %a) nounwind
|
||||
ret <4 x double> %res
|
||||
}
|
||||
|
||||
define <4 x double> @test_x86_fnmsub_aba_pd_y(<4 x double> %a, <4 x double> %b) #0 {
|
||||
; CHECK-LABEL: test_x86_fnmsub_aba_pd_y:
|
||||
; CHECK: # BB#0:
|
||||
; CHECK-NEXT: vmovapd (%rcx), %ymm0
|
||||
; CHECK-NEXT: vfnmsub231pd (%rdx), %ymm0, %ymm0
|
||||
; CHECK-NEXT: retq
|
||||
%res = call <4 x double> @llvm.x86.fma.vfnmsub.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %a) nounwind
|
||||
ret <4 x double> %res
|
||||
}
|
||||
|
||||
define <4 x double> @test_x86_fnmsub_bba_pd_y(<4 x double> %a, <4 x double> %b) #0 {
|
||||
; CHECK-LABEL: test_x86_fnmsub_bba_pd_y:
|
||||
; CHECK: # BB#0:
|
||||
; CHECK-NEXT: vmovapd (%rdx), %ymm0
|
||||
; CHECK-NEXT: vfnmsub213pd (%rcx), %ymm0, %ymm0
|
||||
; CHECK-NEXT: retq
|
||||
%res = call <4 x double> @llvm.x86.fma.vfnmsub.pd.256(<4 x double> %b, <4 x double> %b, <4 x double> %a) nounwind
|
||||
ret <4 x double> %res
|
||||
}
|
||||
|
@ -235,11 +235,10 @@ define float @test_x86_fnmsub_ss(float %a0, float %a1, float %a2) {
|
||||
}
|
||||
|
||||
define <4 x float> @test_x86_fmadd_ps_load(<4 x float>* %a0, <4 x float> %a1, <4 x float> %a2) {
|
||||
; CHECK_FMA-LABEL: test_x86_fmadd_ps_load:
|
||||
; CHECK_FMA: # BB#0:
|
||||
; CHECK_FMA-NEXT: vmovaps (%rdi), %xmm2
|
||||
; CHECK_FMA-NEXT: vfmadd213ps %xmm1, %xmm2, %xmm0
|
||||
; CHECK_FMA-NEXT: retq
|
||||
; CHECK_FMA-LABEL: test_x86_fmadd_ps_load:
|
||||
; CHECK_FMA: # BB#0:
|
||||
; CHECK_FMA-NEXT: vfmadd132ps (%rdi), %xmm1, %xmm0
|
||||
; CHECK_FMA-NEXT: retq
|
||||
;
|
||||
; CHECK_FMA4-LABEL: test_x86_fmadd_ps_load:
|
||||
; CHECK_FMA4: # BB#0:
|
||||
@ -252,11 +251,10 @@ define <4 x float> @test_x86_fmadd_ps_load(<4 x float>* %a0, <4 x float> %a1, <4
|
||||
}
|
||||
|
||||
define <4 x float> @test_x86_fmsub_ps_load(<4 x float>* %a0, <4 x float> %a1, <4 x float> %a2) {
|
||||
; CHECK_FMA-LABEL: test_x86_fmsub_ps_load:
|
||||
; CHECK_FMA: # BB#0:
|
||||
; CHECK_FMA-NEXT: vmovaps (%rdi), %xmm2
|
||||
; CHECK_FMA-NEXT: vfmsub213ps %xmm1, %xmm2, %xmm0
|
||||
; CHECK_FMA-NEXT: retq
|
||||
; CHECK_FMA-LABEL: test_x86_fmsub_ps_load:
|
||||
; CHECK_FMA: # BB#0:
|
||||
; CHECK_FMA-NEXT: vfmsub132ps (%rdi), %xmm1, %xmm0
|
||||
; CHECK_FMA-NEXT: retq
|
||||
;
|
||||
; CHECK_FMA4-LABEL: test_x86_fmsub_ps_load:
|
||||
; CHECK_FMA4: # BB#0:
|
||||
@ -588,8 +586,7 @@ define <4 x float> @test_v4f32_fma_x_c1_fmul_x_c2(<4 x float> %x) #0 {
|
||||
define <4 x float> @test_v4f32_fma_fmul_x_c1_c2_y(<4 x float> %x, <4 x float> %y) #0 {
|
||||
; CHECK_FMA-LABEL: test_v4f32_fma_fmul_x_c1_c2_y:
|
||||
; CHECK_FMA: # BB#0:
|
||||
; CHECK_FMA-NEXT: vmovaps {{.*#+}} xmm2 = [4.000000e+00,6.000000e+00,6.000000e+00,4.000000e+00]
|
||||
; CHECK_FMA-NEXT: vfmadd213ps %xmm1, %xmm2, %xmm0
|
||||
; CHECK_FMA-NEXT: vfmadd132ps {{.*}}(%rip), %xmm1, %xmm0
|
||||
; CHECK_FMA-NEXT: retq
|
||||
;
|
||||
; CHECK_FMA4-LABEL: test_v4f32_fma_fmul_x_c1_c2_y:
|
||||
|
Loading…
Reference in New Issue
Block a user