mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-11-26 12:50:30 +00:00
[X86][XOP] Enable commutation for XOP instructions
Patch to allow XOP instructions (integer comparison and integer multiply-add) to be commuted. The comparison instructions sometimes require the compare mode to be flipped, but the remaining instructions can use the default commutation modes. This patch also sets the SSE domains of all the XOP instructions.
Differential Revision: http://reviews.llvm.org/D7646
llvm-svn: 229267
This commit is contained in:
parent
aadc93bf08
commit
df7da0ee6e
@ -2906,6 +2906,32 @@ X86InstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const {
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
// VPCOM*ri / VPCOMU*ri (B, W, D, Q element sizes): commuting these compares
// also requires mirroring the comparison-mode immediate held in operand 3,
// so the immediate is rewritten here before the generic commute is invoked.
case X86::VPCOMBri: case X86::VPCOMUBri:
|
||||
case X86::VPCOMDri: case X86::VPCOMUDri:
|
||||
case X86::VPCOMQri: case X86::VPCOMUQri:
|
||||
case X86::VPCOMWri: case X86::VPCOMUWri: {
|
||||
// Flip comparison mode immediate (if necessary).
|
||||
// Only the low three bits of operand 3 are kept; they select one of the
// eight modes handled by the switch below. The masked value is what gets
// written back later, so any higher bits of the original immediate are
// discarded.
unsigned Imm = MI->getOperand(3).getImm() & 0x7;
|
||||
switch (Imm) {
|
||||
case 0x00: Imm = 0x02; break; // LT -> GT
|
||||
case 0x01: Imm = 0x03; break; // LE -> GE
|
||||
case 0x02: Imm = 0x00; break; // GT -> LT
|
||||
case 0x03: Imm = 0x01; break; // GE -> LE
|
||||
// The remaining modes fall through to the shared break and leave Imm
// unchanged.
case 0x04: // EQ
|
||||
case 0x05: // NE
|
||||
case 0x06: // FALSE
|
||||
case 0x07: // TRUE
|
||||
default:
|
||||
break;
|
||||
}
|
||||
// If the caller asked for a new instruction, clone first so the immediate
// is changed on the copy rather than on the original instruction. NewMI is
// then cleared, so the call below receives the clone with NewMI == false
// (presumably so the base implementation does not clone a second time).
if (NewMI) {
|
||||
MachineFunction &MF = *MI->getParent()->getParent();
|
||||
MI = MF.CloneMachineInstr(MI);
|
||||
NewMI = false;
|
||||
}
|
||||
MI->getOperand(3).setImm(Imm);
|
||||
// Hand off to the generic TargetInstrInfo implementation for the rest of
// the commutation.
return TargetInstrInfo::commuteInstruction(MI, NewMI);
|
||||
}
|
||||
case X86::CMOVB16rr: case X86::CMOVB32rr: case X86::CMOVB64rr:
|
||||
case X86::CMOVAE16rr: case X86::CMOVAE32rr: case X86::CMOVAE64rr:
|
||||
case X86::CMOVE16rr: case X86::CMOVE32rr: case X86::CMOVE64rr:
|
||||
|
@ -20,21 +20,23 @@ multiclass xop2op<bits<8> opc, string OpcodeStr, Intrinsic Int, PatFrag memop> {
|
||||
[(set VR128:$dst, (Int (bitconvert (memop addr:$src))))]>, XOP;
|
||||
}
|
||||
|
||||
defm VPHSUBWD : xop2op<0xE2, "vphsubwd", int_x86_xop_vphsubwd, loadv2i64>;
|
||||
defm VPHSUBDQ : xop2op<0xE3, "vphsubdq", int_x86_xop_vphsubdq, loadv2i64>;
|
||||
defm VPHSUBBW : xop2op<0xE1, "vphsubbw", int_x86_xop_vphsubbw, loadv2i64>;
|
||||
defm VPHADDWQ : xop2op<0xC7, "vphaddwq", int_x86_xop_vphaddwq, loadv2i64>;
|
||||
defm VPHADDWD : xop2op<0xC6, "vphaddwd", int_x86_xop_vphaddwd, loadv2i64>;
|
||||
defm VPHADDUWQ : xop2op<0xD7, "vphadduwq", int_x86_xop_vphadduwq, loadv2i64>;
|
||||
defm VPHADDUWD : xop2op<0xD6, "vphadduwd", int_x86_xop_vphadduwd, loadv2i64>;
|
||||
defm VPHADDUDQ : xop2op<0xDB, "vphaddudq", int_x86_xop_vphaddudq, loadv2i64>;
|
||||
defm VPHADDUBW : xop2op<0xD1, "vphaddubw", int_x86_xop_vphaddubw, loadv2i64>;
|
||||
defm VPHADDUBQ : xop2op<0xD3, "vphaddubq", int_x86_xop_vphaddubq, loadv2i64>;
|
||||
defm VPHADDUBD : xop2op<0xD2, "vphaddubd", int_x86_xop_vphaddubd, loadv2i64>;
|
||||
defm VPHADDDQ : xop2op<0xCB, "vphadddq", int_x86_xop_vphadddq, loadv2i64>;
|
||||
defm VPHADDBW : xop2op<0xC1, "vphaddbw", int_x86_xop_vphaddbw, loadv2i64>;
|
||||
defm VPHADDBQ : xop2op<0xC3, "vphaddbq", int_x86_xop_vphaddbq, loadv2i64>;
|
||||
defm VPHADDBD : xop2op<0xC2, "vphaddbd", int_x86_xop_vphaddbd, loadv2i64>;
|
||||
let ExeDomain = SSEPackedInt in {
|
||||
defm VPHSUBWD : xop2op<0xE2, "vphsubwd", int_x86_xop_vphsubwd, loadv2i64>;
|
||||
defm VPHSUBDQ : xop2op<0xE3, "vphsubdq", int_x86_xop_vphsubdq, loadv2i64>;
|
||||
defm VPHSUBBW : xop2op<0xE1, "vphsubbw", int_x86_xop_vphsubbw, loadv2i64>;
|
||||
defm VPHADDWQ : xop2op<0xC7, "vphaddwq", int_x86_xop_vphaddwq, loadv2i64>;
|
||||
defm VPHADDWD : xop2op<0xC6, "vphaddwd", int_x86_xop_vphaddwd, loadv2i64>;
|
||||
defm VPHADDUWQ : xop2op<0xD7, "vphadduwq", int_x86_xop_vphadduwq, loadv2i64>;
|
||||
defm VPHADDUWD : xop2op<0xD6, "vphadduwd", int_x86_xop_vphadduwd, loadv2i64>;
|
||||
defm VPHADDUDQ : xop2op<0xDB, "vphaddudq", int_x86_xop_vphaddudq, loadv2i64>;
|
||||
defm VPHADDUBW : xop2op<0xD1, "vphaddubw", int_x86_xop_vphaddubw, loadv2i64>;
|
||||
defm VPHADDUBQ : xop2op<0xD3, "vphaddubq", int_x86_xop_vphaddubq, loadv2i64>;
|
||||
defm VPHADDUBD : xop2op<0xD2, "vphaddubd", int_x86_xop_vphaddubd, loadv2i64>;
|
||||
defm VPHADDDQ : xop2op<0xCB, "vphadddq", int_x86_xop_vphadddq, loadv2i64>;
|
||||
defm VPHADDBW : xop2op<0xC1, "vphaddbw", int_x86_xop_vphaddbw, loadv2i64>;
|
||||
defm VPHADDBQ : xop2op<0xC3, "vphaddbq", int_x86_xop_vphaddbq, loadv2i64>;
|
||||
defm VPHADDBD : xop2op<0xC2, "vphaddbd", int_x86_xop_vphaddbd, loadv2i64>;
|
||||
}
|
||||
|
||||
// Scalar load 2 addr operand instructions
|
||||
multiclass xop2opsld<bits<8> opc, string OpcodeStr, Intrinsic Int,
|
||||
@ -47,11 +49,6 @@ multiclass xop2opsld<bits<8> opc, string OpcodeStr, Intrinsic Int,
|
||||
[(set VR128:$dst, (Int (bitconvert mem_cpat:$src)))]>, XOP;
|
||||
}
|
||||
|
||||
defm VFRCZSS : xop2opsld<0x82, "vfrczss", int_x86_xop_vfrcz_ss,
|
||||
ssmem, sse_load_f32>;
|
||||
defm VFRCZSD : xop2opsld<0x83, "vfrczsd", int_x86_xop_vfrcz_sd,
|
||||
sdmem, sse_load_f64>;
|
||||
|
||||
multiclass xop2op128<bits<8> opc, string OpcodeStr, Intrinsic Int,
|
||||
PatFrag memop> {
|
||||
def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
|
||||
@ -62,9 +59,6 @@ multiclass xop2op128<bits<8> opc, string OpcodeStr, Intrinsic Int,
|
||||
[(set VR128:$dst, (Int (bitconvert (memop addr:$src))))]>, XOP;
|
||||
}
|
||||
|
||||
defm VFRCZPS : xop2op128<0x80, "vfrczps", int_x86_xop_vfrcz_ps, loadv4f32>;
|
||||
defm VFRCZPD : xop2op128<0x81, "vfrczpd", int_x86_xop_vfrcz_pd, loadv2f64>;
|
||||
|
||||
multiclass xop2op256<bits<8> opc, string OpcodeStr, Intrinsic Int,
|
||||
PatFrag memop> {
|
||||
def rrY : IXOP<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
|
||||
@ -75,8 +69,19 @@ multiclass xop2op256<bits<8> opc, string OpcodeStr, Intrinsic Int,
|
||||
[(set VR256:$dst, (Int (bitconvert (memop addr:$src))))]>, XOP, VEX_L;
|
||||
}
|
||||
|
||||
defm VFRCZPS : xop2op256<0x80, "vfrczps", int_x86_xop_vfrcz_ps_256, loadv8f32>;
|
||||
defm VFRCZPD : xop2op256<0x81, "vfrczpd", int_x86_xop_vfrcz_pd_256, loadv4f64>;
|
||||
let ExeDomain = SSEPackedSingle in {
|
||||
defm VFRCZSS : xop2opsld<0x82, "vfrczss", int_x86_xop_vfrcz_ss,
|
||||
ssmem, sse_load_f32>;
|
||||
defm VFRCZPS : xop2op128<0x80, "vfrczps", int_x86_xop_vfrcz_ps, loadv4f32>;
|
||||
defm VFRCZPS : xop2op256<0x80, "vfrczps", int_x86_xop_vfrcz_ps_256, loadv8f32>;
|
||||
}
|
||||
|
||||
let ExeDomain = SSEPackedDouble in {
|
||||
defm VFRCZSD : xop2opsld<0x83, "vfrczsd", int_x86_xop_vfrcz_sd,
|
||||
sdmem, sse_load_f64>;
|
||||
defm VFRCZPD : xop2op128<0x81, "vfrczpd", int_x86_xop_vfrcz_pd, loadv2f64>;
|
||||
defm VFRCZPD : xop2op256<0x81, "vfrczpd", int_x86_xop_vfrcz_pd_256, loadv4f64>;
|
||||
}
|
||||
|
||||
multiclass xop3op<bits<8> opc, string OpcodeStr, Intrinsic Int> {
|
||||
def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst),
|
||||
@ -97,18 +102,20 @@ multiclass xop3op<bits<8> opc, string OpcodeStr, Intrinsic Int> {
|
||||
XOP_4VOp3;
|
||||
}
|
||||
|
||||
defm VPSHLW : xop3op<0x95, "vpshlw", int_x86_xop_vpshlw>;
|
||||
defm VPSHLQ : xop3op<0x97, "vpshlq", int_x86_xop_vpshlq>;
|
||||
defm VPSHLD : xop3op<0x96, "vpshld", int_x86_xop_vpshld>;
|
||||
defm VPSHLB : xop3op<0x94, "vpshlb", int_x86_xop_vpshlb>;
|
||||
defm VPSHAW : xop3op<0x99, "vpshaw", int_x86_xop_vpshaw>;
|
||||
defm VPSHAQ : xop3op<0x9B, "vpshaq", int_x86_xop_vpshaq>;
|
||||
defm VPSHAD : xop3op<0x9A, "vpshad", int_x86_xop_vpshad>;
|
||||
defm VPSHAB : xop3op<0x98, "vpshab", int_x86_xop_vpshab>;
|
||||
defm VPROTW : xop3op<0x91, "vprotw", int_x86_xop_vprotw>;
|
||||
defm VPROTQ : xop3op<0x93, "vprotq", int_x86_xop_vprotq>;
|
||||
defm VPROTD : xop3op<0x92, "vprotd", int_x86_xop_vprotd>;
|
||||
defm VPROTB : xop3op<0x90, "vprotb", int_x86_xop_vprotb>;
|
||||
let ExeDomain = SSEPackedInt in {
|
||||
defm VPSHLW : xop3op<0x95, "vpshlw", int_x86_xop_vpshlw>;
|
||||
defm VPSHLQ : xop3op<0x97, "vpshlq", int_x86_xop_vpshlq>;
|
||||
defm VPSHLD : xop3op<0x96, "vpshld", int_x86_xop_vpshld>;
|
||||
defm VPSHLB : xop3op<0x94, "vpshlb", int_x86_xop_vpshlb>;
|
||||
defm VPSHAW : xop3op<0x99, "vpshaw", int_x86_xop_vpshaw>;
|
||||
defm VPSHAQ : xop3op<0x9B, "vpshaq", int_x86_xop_vpshaq>;
|
||||
defm VPSHAD : xop3op<0x9A, "vpshad", int_x86_xop_vpshad>;
|
||||
defm VPSHAB : xop3op<0x98, "vpshab", int_x86_xop_vpshab>;
|
||||
defm VPROTW : xop3op<0x91, "vprotw", int_x86_xop_vprotw>;
|
||||
defm VPROTQ : xop3op<0x93, "vprotq", int_x86_xop_vprotq>;
|
||||
defm VPROTD : xop3op<0x92, "vprotd", int_x86_xop_vprotd>;
|
||||
defm VPROTB : xop3op<0x90, "vprotb", int_x86_xop_vprotb>;
|
||||
}
|
||||
|
||||
multiclass xop3opimm<bits<8> opc, string OpcodeStr, Intrinsic Int> {
|
||||
def ri : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst),
|
||||
@ -122,13 +129,16 @@ multiclass xop3opimm<bits<8> opc, string OpcodeStr, Intrinsic Int> {
|
||||
(Int (bitconvert (loadv2i64 addr:$src1)), imm:$src2))]>, XOP;
|
||||
}
|
||||
|
||||
defm VPROTW : xop3opimm<0xC1, "vprotw", int_x86_xop_vprotwi>;
|
||||
defm VPROTQ : xop3opimm<0xC3, "vprotq", int_x86_xop_vprotqi>;
|
||||
defm VPROTD : xop3opimm<0xC2, "vprotd", int_x86_xop_vprotdi>;
|
||||
defm VPROTB : xop3opimm<0xC0, "vprotb", int_x86_xop_vprotbi>;
|
||||
let ExeDomain = SSEPackedInt in {
|
||||
defm VPROTW : xop3opimm<0xC1, "vprotw", int_x86_xop_vprotwi>;
|
||||
defm VPROTQ : xop3opimm<0xC3, "vprotq", int_x86_xop_vprotqi>;
|
||||
defm VPROTD : xop3opimm<0xC2, "vprotd", int_x86_xop_vprotdi>;
|
||||
defm VPROTB : xop3opimm<0xC0, "vprotb", int_x86_xop_vprotbi>;
|
||||
}
|
||||
|
||||
// Instruction where second source can be memory, but third must be register
|
||||
multiclass xop4opm2<bits<8> opc, string OpcodeStr, Intrinsic Int> {
|
||||
let isCommutable = 1 in
|
||||
def rr : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst),
|
||||
(ins VR128:$src1, VR128:$src2, VR128:$src3),
|
||||
!strconcat(OpcodeStr,
|
||||
@ -144,21 +154,24 @@ multiclass xop4opm2<bits<8> opc, string OpcodeStr, Intrinsic Int> {
|
||||
VR128:$src3))]>, XOP_4V, VEX_I8IMM;
|
||||
}
|
||||
|
||||
defm VPMADCSWD : xop4opm2<0xB6, "vpmadcswd", int_x86_xop_vpmadcswd>;
|
||||
defm VPMADCSSWD : xop4opm2<0xA6, "vpmadcsswd", int_x86_xop_vpmadcsswd>;
|
||||
defm VPMACSWW : xop4opm2<0x95, "vpmacsww", int_x86_xop_vpmacsww>;
|
||||
defm VPMACSWD : xop4opm2<0x96, "vpmacswd", int_x86_xop_vpmacswd>;
|
||||
defm VPMACSSWW : xop4opm2<0x85, "vpmacssww", int_x86_xop_vpmacssww>;
|
||||
defm VPMACSSWD : xop4opm2<0x86, "vpmacsswd", int_x86_xop_vpmacsswd>;
|
||||
defm VPMACSSDQL : xop4opm2<0x87, "vpmacssdql", int_x86_xop_vpmacssdql>;
|
||||
defm VPMACSSDQH : xop4opm2<0x8F, "vpmacssdqh", int_x86_xop_vpmacssdqh>;
|
||||
defm VPMACSSDD : xop4opm2<0x8E, "vpmacssdd", int_x86_xop_vpmacssdd>;
|
||||
defm VPMACSDQL : xop4opm2<0x97, "vpmacsdql", int_x86_xop_vpmacsdql>;
|
||||
defm VPMACSDQH : xop4opm2<0x9F, "vpmacsdqh", int_x86_xop_vpmacsdqh>;
|
||||
defm VPMACSDD : xop4opm2<0x9E, "vpmacsdd", int_x86_xop_vpmacsdd>;
|
||||
let ExeDomain = SSEPackedInt in {
|
||||
defm VPMADCSWD : xop4opm2<0xB6, "vpmadcswd", int_x86_xop_vpmadcswd>;
|
||||
defm VPMADCSSWD : xop4opm2<0xA6, "vpmadcsswd", int_x86_xop_vpmadcsswd>;
|
||||
defm VPMACSWW : xop4opm2<0x95, "vpmacsww", int_x86_xop_vpmacsww>;
|
||||
defm VPMACSWD : xop4opm2<0x96, "vpmacswd", int_x86_xop_vpmacswd>;
|
||||
defm VPMACSSWW : xop4opm2<0x85, "vpmacssww", int_x86_xop_vpmacssww>;
|
||||
defm VPMACSSWD : xop4opm2<0x86, "vpmacsswd", int_x86_xop_vpmacsswd>;
|
||||
defm VPMACSSDQL : xop4opm2<0x87, "vpmacssdql", int_x86_xop_vpmacssdql>;
|
||||
defm VPMACSSDQH : xop4opm2<0x8F, "vpmacssdqh", int_x86_xop_vpmacssdqh>;
|
||||
defm VPMACSSDD : xop4opm2<0x8E, "vpmacssdd", int_x86_xop_vpmacssdd>;
|
||||
defm VPMACSDQL : xop4opm2<0x97, "vpmacsdql", int_x86_xop_vpmacsdql>;
|
||||
defm VPMACSDQH : xop4opm2<0x9F, "vpmacsdqh", int_x86_xop_vpmacsdqh>;
|
||||
defm VPMACSDD : xop4opm2<0x9E, "vpmacsdd", int_x86_xop_vpmacsdd>;
|
||||
}
|
||||
|
||||
// Instruction where second source can be memory, third must be imm8
|
||||
multiclass xopvpcom<bits<8> opc, string Suffix, Intrinsic Int> {
|
||||
let isCommutable = 1 in
|
||||
def ri : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst),
|
||||
(ins VR128:$src1, VR128:$src2, XOPCC:$cc),
|
||||
!strconcat("vpcom${cc}", Suffix,
|
||||
@ -187,14 +200,16 @@ multiclass xopvpcom<bits<8> opc, string Suffix, Intrinsic Int> {
|
||||
}
|
||||
}
|
||||
|
||||
defm VPCOMB : xopvpcom<0xCC, "b", int_x86_xop_vpcomb>;
|
||||
defm VPCOMW : xopvpcom<0xCD, "w", int_x86_xop_vpcomw>;
|
||||
defm VPCOMD : xopvpcom<0xCE, "d", int_x86_xop_vpcomd>;
|
||||
defm VPCOMQ : xopvpcom<0xCF, "q", int_x86_xop_vpcomq>;
|
||||
defm VPCOMUB : xopvpcom<0xEC, "ub", int_x86_xop_vpcomub>;
|
||||
defm VPCOMUW : xopvpcom<0xED, "uw", int_x86_xop_vpcomuw>;
|
||||
defm VPCOMUD : xopvpcom<0xEE, "ud", int_x86_xop_vpcomud>;
|
||||
defm VPCOMUQ : xopvpcom<0xEF, "uq", int_x86_xop_vpcomuq>;
|
||||
let ExeDomain = SSEPackedInt in { // SSE integer instructions
|
||||
defm VPCOMB : xopvpcom<0xCC, "b", int_x86_xop_vpcomb>;
|
||||
defm VPCOMW : xopvpcom<0xCD, "w", int_x86_xop_vpcomw>;
|
||||
defm VPCOMD : xopvpcom<0xCE, "d", int_x86_xop_vpcomd>;
|
||||
defm VPCOMQ : xopvpcom<0xCF, "q", int_x86_xop_vpcomq>;
|
||||
defm VPCOMUB : xopvpcom<0xEC, "ub", int_x86_xop_vpcomub>;
|
||||
defm VPCOMUW : xopvpcom<0xED, "uw", int_x86_xop_vpcomuw>;
|
||||
defm VPCOMUD : xopvpcom<0xEE, "ud", int_x86_xop_vpcomud>;
|
||||
defm VPCOMUQ : xopvpcom<0xEF, "uq", int_x86_xop_vpcomuq>;
|
||||
}
|
||||
|
||||
// Instruction where either second or third source can be memory
|
||||
multiclass xop4op<bits<8> opc, string OpcodeStr, Intrinsic Int> {
|
||||
@ -222,8 +237,10 @@ multiclass xop4op<bits<8> opc, string OpcodeStr, Intrinsic Int> {
|
||||
XOP_4V, VEX_I8IMM;
|
||||
}
|
||||
|
||||
defm VPPERM : xop4op<0xA3, "vpperm", int_x86_xop_vpperm>;
|
||||
defm VPCMOV : xop4op<0xA2, "vpcmov", int_x86_xop_vpcmov>;
|
||||
let ExeDomain = SSEPackedInt in {
|
||||
defm VPPERM : xop4op<0xA3, "vpperm", int_x86_xop_vpperm>;
|
||||
defm VPCMOV : xop4op<0xA2, "vpcmov", int_x86_xop_vpcmov>;
|
||||
}
|
||||
|
||||
multiclass xop4op256<bits<8> opc, string OpcodeStr, Intrinsic Int> {
|
||||
def rrY : IXOPi8<opc, MRMSrcReg, (outs VR256:$dst),
|
||||
@ -250,7 +267,8 @@ multiclass xop4op256<bits<8> opc, string OpcodeStr, Intrinsic Int> {
|
||||
XOP_4V, VEX_I8IMM, VEX_L;
|
||||
}
|
||||
|
||||
defm VPCMOV : xop4op256<0xA2, "vpcmov", int_x86_xop_vpcmov_256>;
|
||||
let ExeDomain = SSEPackedInt in
|
||||
defm VPCMOV : xop4op256<0xA2, "vpcmov", int_x86_xop_vpcmov_256>;
|
||||
|
||||
multiclass xop5op<bits<8> opc, string OpcodeStr, Intrinsic Int128,
|
||||
Intrinsic Int256, PatFrag ld_128, PatFrag ld_256> {
|
||||
@ -295,8 +313,11 @@ multiclass xop5op<bits<8> opc, string OpcodeStr, Intrinsic Int128,
|
||||
VEX_L;
|
||||
}
|
||||
|
||||
defm VPERMIL2PD : xop5op<0x49, "vpermil2pd", int_x86_xop_vpermil2pd,
|
||||
int_x86_xop_vpermil2pd_256, loadv2f64, loadv4f64>;
|
||||
defm VPERMIL2PS : xop5op<0x48, "vpermil2ps", int_x86_xop_vpermil2ps,
|
||||
int_x86_xop_vpermil2ps_256, loadv4f32, loadv8f32>;
|
||||
let ExeDomain = SSEPackedDouble in
|
||||
defm VPERMIL2PD : xop5op<0x49, "vpermil2pd", int_x86_xop_vpermil2pd,
|
||||
int_x86_xop_vpermil2pd_256, loadv2f64, loadv4f64>;
|
||||
|
||||
let ExeDomain = SSEPackedSingle in
|
||||
defm VPERMIL2PS : xop5op<0x48, "vpermil2ps", int_x86_xop_vpermil2ps,
|
||||
int_x86_xop_vpermil2ps_256, loadv4f32, loadv8f32>;
|
||||
|
||||
|
184
test/CodeGen/X86/commute-xop.ll
Normal file
184
test/CodeGen/X86/commute-xop.ll
Normal file
@ -0,0 +1,184 @@
|
||||
; RUN: llc -O3 -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx,+xop < %s | FileCheck %s
|
||||
|
||||
; The loaded vector is the first vpcomb argument and the mode immediate is 0 (LT).
; The expected instruction takes (%rdi) as a memory operand and uses the GT mnemonic.
define <16 x i8> @commute_fold_vpcomb(<16 x i8>* %a0, <16 x i8> %a1) {
|
||||
;CHECK-LABEL: commute_fold_vpcomb
|
||||
;CHECK: vpcomgtb (%rdi), %xmm0, %xmm0
|
||||
%1 = load <16 x i8>* %a0
|
||||
%2 = call <16 x i8> @llvm.x86.xop.vpcomb(<16 x i8> %1, <16 x i8> %a1, i8 0) ; vpcomltb
|
||||
ret <16 x i8> %2
|
||||
}
|
||||
declare <16 x i8> @llvm.x86.xop.vpcomb(<16 x i8>, <16 x i8>, i8) nounwind readnone
|
||||
|
||||
; The loaded vector is the first vpcomd argument and the mode immediate is 1 (LE).
; The expected instruction takes (%rdi) as a memory operand and uses the GE mnemonic.
define <4 x i32> @commute_fold_vpcomd(<4 x i32>* %a0, <4 x i32> %a1) {
|
||||
;CHECK-LABEL: commute_fold_vpcomd
|
||||
;CHECK: vpcomged (%rdi), %xmm0, %xmm0
|
||||
%1 = load <4 x i32>* %a0
|
||||
%2 = call <4 x i32> @llvm.x86.xop.vpcomd(<4 x i32> %1, <4 x i32> %a1, i8 1) ; vpcomled
|
||||
ret <4 x i32> %2
|
||||
}
|
||||
declare <4 x i32> @llvm.x86.xop.vpcomd(<4 x i32>, <4 x i32>, i8) nounwind readnone
|
||||
|
||||
; The loaded vector is the first vpcomq argument and the mode immediate is 2 (GT).
; The expected instruction takes (%rdi) as a memory operand and uses the LT mnemonic.
define <2 x i64> @commute_fold_vpcomq(<2 x i64>* %a0, <2 x i64> %a1) {
|
||||
;CHECK-LABEL: commute_fold_vpcomq
|
||||
;CHECK: vpcomltq (%rdi), %xmm0, %xmm0
|
||||
%1 = load <2 x i64>* %a0
|
||||
%2 = call <2 x i64> @llvm.x86.xop.vpcomq(<2 x i64> %1, <2 x i64> %a1, i8 2) ; vpcomgtq
|
||||
ret <2 x i64> %2
|
||||
}
|
||||
declare <2 x i64> @llvm.x86.xop.vpcomq(<2 x i64>, <2 x i64>, i8) nounwind readnone
|
||||
|
||||
; The loaded vector is the first vpcomub argument and the mode immediate is 3 (GE).
; The expected instruction takes (%rdi) as a memory operand and uses the LE mnemonic.
define <16 x i8> @commute_fold_vpcomub(<16 x i8>* %a0, <16 x i8> %a1) {
|
||||
;CHECK-LABEL: commute_fold_vpcomub
|
||||
;CHECK: vpcomleub (%rdi), %xmm0, %xmm0
|
||||
%1 = load <16 x i8>* %a0
|
||||
%2 = call <16 x i8> @llvm.x86.xop.vpcomub(<16 x i8> %1, <16 x i8> %a1, i8 3) ; vpcomgeub
|
||||
ret <16 x i8> %2
|
||||
}
|
||||
declare <16 x i8> @llvm.x86.xop.vpcomub(<16 x i8>, <16 x i8>, i8) nounwind readnone
|
||||
|
||||
; The loaded vector is the first vpcomud argument and the mode immediate is 4 (EQ).
; The expected instruction takes (%rdi) as a memory operand and keeps the EQ mnemonic.
define <4 x i32> @commute_fold_vpcomud(<4 x i32>* %a0, <4 x i32> %a1) {
|
||||
;CHECK-LABEL: commute_fold_vpcomud
|
||||
;CHECK: vpcomequd (%rdi), %xmm0, %xmm0
|
||||
%1 = load <4 x i32>* %a0
|
||||
%2 = call <4 x i32> @llvm.x86.xop.vpcomud(<4 x i32> %1, <4 x i32> %a1, i8 4) ; vpcomequd
|
||||
ret <4 x i32> %2
|
||||
}
|
||||
declare <4 x i32> @llvm.x86.xop.vpcomud(<4 x i32>, <4 x i32>, i8) nounwind readnone
|
||||
|
||||
; The loaded vector is the first vpcomuq argument and the mode immediate is 5 (NE).
; The expected instruction takes (%rdi) as a memory operand and keeps the NE mnemonic.
define <2 x i64> @commute_fold_vpcomuq(<2 x i64>* %a0, <2 x i64> %a1) {
|
||||
;CHECK-LABEL: commute_fold_vpcomuq
|
||||
;CHECK: vpcomnequq (%rdi), %xmm0, %xmm0
|
||||
%1 = load <2 x i64>* %a0
|
||||
%2 = call <2 x i64> @llvm.x86.xop.vpcomuq(<2 x i64> %1, <2 x i64> %a1, i8 5) ; vpcomnequq
|
||||
ret <2 x i64> %2
|
||||
}
|
||||
declare <2 x i64> @llvm.x86.xop.vpcomuq(<2 x i64>, <2 x i64>, i8) nounwind readnone
|
||||
|
||||
; The loaded vector is the first vpcomuw argument and the mode immediate is 6 (FALSE).
; The expected instruction takes (%rdi) as a memory operand and keeps the FALSE mnemonic.
define <8 x i16> @commute_fold_vpcomuw(<8 x i16>* %a0, <8 x i16> %a1) {
|
||||
;CHECK-LABEL: commute_fold_vpcomuw
|
||||
;CHECK: vpcomfalseuw (%rdi), %xmm0, %xmm0
|
||||
%1 = load <8 x i16>* %a0
|
||||
%2 = call <8 x i16> @llvm.x86.xop.vpcomuw(<8 x i16> %1, <8 x i16> %a1, i8 6) ; vpcomfalseuw
|
||||
ret <8 x i16> %2
|
||||
}
|
||||
declare <8 x i16> @llvm.x86.xop.vpcomuw(<8 x i16>, <8 x i16>, i8) nounwind readnone
|
||||
|
||||
; The loaded vector is the first vpcomw argument and the mode immediate is 7 (TRUE).
; The expected instruction takes (%rdi) as a memory operand and keeps the TRUE mnemonic.
define <8 x i16> @commute_fold_vpcomw(<8 x i16>* %a0, <8 x i16> %a1) {
|
||||
;CHECK-LABEL: commute_fold_vpcomw
|
||||
;CHECK: vpcomtruew (%rdi), %xmm0, %xmm0
|
||||
%1 = load <8 x i16>* %a0
|
||||
%2 = call <8 x i16> @llvm.x86.xop.vpcomw(<8 x i16> %1, <8 x i16> %a1, i8 7) ; vpcomtruew
|
||||
ret <8 x i16> %2
|
||||
}
|
||||
declare <8 x i16> @llvm.x86.xop.vpcomw(<8 x i16>, <8 x i16>, i8) nounwind readnone
|
||||
|
||||
; The loaded vector is the first vpmacsdd argument; the expected instruction
; names (%rdi) as a memory operand.
define <4 x i32> @commute_fold_vpmacsdd(<4 x i32>* %a0, <4 x i32> %a1, <4 x i32> %a2) {
|
||||
;CHECK-LABEL: commute_fold_vpmacsdd
|
||||
;CHECK: vpmacsdd %xmm1, (%rdi), %xmm0, %xmm0
|
||||
%1 = load <4 x i32>* %a0
|
||||
%2 = call <4 x i32> @llvm.x86.xop.vpmacsdd(<4 x i32> %1, <4 x i32> %a1, <4 x i32> %a2)
|
||||
ret <4 x i32> %2
|
||||
}
|
||||
declare <4 x i32> @llvm.x86.xop.vpmacsdd(<4 x i32>, <4 x i32>, <4 x i32>) nounwind readnone
|
||||
|
||||
; The loaded vector is the first vpmacsdqh argument; the expected instruction
; names (%rdi) as a memory operand.
define <2 x i64> @commute_fold_vpmacsdqh(<4 x i32>* %a0, <4 x i32> %a1, <2 x i64> %a2) {
|
||||
;CHECK-LABEL: commute_fold_vpmacsdqh
|
||||
;CHECK: vpmacsdqh %xmm1, (%rdi), %xmm0, %xmm0
|
||||
%1 = load <4 x i32>* %a0
|
||||
%2 = call <2 x i64> @llvm.x86.xop.vpmacsdqh(<4 x i32> %1, <4 x i32> %a1, <2 x i64> %a2)
|
||||
ret <2 x i64> %2
|
||||
}
|
||||
declare <2 x i64> @llvm.x86.xop.vpmacsdqh(<4 x i32>, <4 x i32>, <2 x i64>) nounwind readnone
|
||||
|
||||
; The loaded vector is the first vpmacsdql argument; the expected instruction
; names (%rdi) as a memory operand.
define <2 x i64> @commute_fold_vpmacsdql(<4 x i32>* %a0, <4 x i32> %a1, <2 x i64> %a2) {
|
||||
;CHECK-LABEL: commute_fold_vpmacsdql
|
||||
;CHECK: vpmacsdql %xmm1, (%rdi), %xmm0, %xmm0
|
||||
%1 = load <4 x i32>* %a0
|
||||
%2 = call <2 x i64> @llvm.x86.xop.vpmacsdql(<4 x i32> %1, <4 x i32> %a1, <2 x i64> %a2)
|
||||
ret <2 x i64> %2
|
||||
}
|
||||
declare <2 x i64> @llvm.x86.xop.vpmacsdql(<4 x i32>, <4 x i32>, <2 x i64>) nounwind readnone
|
||||
|
||||
; The loaded vector is the first vpmacssdd argument; the expected instruction
; names (%rdi) as a memory operand.
define <4 x i32> @commute_fold_vpmacssdd(<4 x i32>* %a0, <4 x i32> %a1, <4 x i32> %a2) {
|
||||
;CHECK-LABEL: commute_fold_vpmacssdd
|
||||
;CHECK: vpmacssdd %xmm1, (%rdi), %xmm0, %xmm0
|
||||
%1 = load <4 x i32>* %a0
|
||||
%2 = call <4 x i32> @llvm.x86.xop.vpmacssdd(<4 x i32> %1, <4 x i32> %a1, <4 x i32> %a2)
|
||||
ret <4 x i32> %2
|
||||
}
|
||||
declare <4 x i32> @llvm.x86.xop.vpmacssdd(<4 x i32>, <4 x i32>, <4 x i32>) nounwind readnone
|
||||
|
||||
; The loaded vector is the first vpmacssdqh argument; the expected instruction
; names (%rdi) as a memory operand.
define <2 x i64> @commute_fold_vpmacssdqh(<4 x i32>* %a0, <4 x i32> %a1, <2 x i64> %a2) {
|
||||
;CHECK-LABEL: commute_fold_vpmacssdqh
|
||||
;CHECK: vpmacssdqh %xmm1, (%rdi), %xmm0, %xmm0
|
||||
%1 = load <4 x i32>* %a0
|
||||
%2 = call <2 x i64> @llvm.x86.xop.vpmacssdqh(<4 x i32> %1, <4 x i32> %a1, <2 x i64> %a2)
|
||||
ret <2 x i64> %2
|
||||
}
|
||||
declare <2 x i64> @llvm.x86.xop.vpmacssdqh(<4 x i32>, <4 x i32>, <2 x i64>) nounwind readnone
|
||||
|
||||
; The loaded vector is the first vpmacssdql argument; the expected instruction
; names (%rdi) as a memory operand.
define <2 x i64> @commute_fold_vpmacssdql(<4 x i32>* %a0, <4 x i32> %a1, <2 x i64> %a2) {
|
||||
;CHECK-LABEL: commute_fold_vpmacssdql
|
||||
;CHECK: vpmacssdql %xmm1, (%rdi), %xmm0, %xmm0
|
||||
%1 = load <4 x i32>* %a0
|
||||
%2 = call <2 x i64> @llvm.x86.xop.vpmacssdql(<4 x i32> %1, <4 x i32> %a1, <2 x i64> %a2)
|
||||
ret <2 x i64> %2
|
||||
}
|
||||
declare <2 x i64> @llvm.x86.xop.vpmacssdql(<4 x i32>, <4 x i32>, <2 x i64>) nounwind readnone
|
||||
|
||||
; The loaded vector is the first vpmacsswd argument; the expected instruction
; names (%rdi) as a memory operand.
define <4 x i32> @commute_fold_vpmacsswd(<8 x i16>* %a0, <8 x i16> %a1, <4 x i32> %a2) {
|
||||
;CHECK-LABEL: commute_fold_vpmacsswd
|
||||
;CHECK: vpmacsswd %xmm1, (%rdi), %xmm0, %xmm0
|
||||
%1 = load <8 x i16>* %a0
|
||||
%2 = call <4 x i32> @llvm.x86.xop.vpmacsswd(<8 x i16> %1, <8 x i16> %a1, <4 x i32> %a2)
|
||||
ret <4 x i32> %2
|
||||
}
|
||||
declare <4 x i32> @llvm.x86.xop.vpmacsswd(<8 x i16>, <8 x i16>, <4 x i32>) nounwind readnone
|
||||
|
||||
; The loaded vector is the first vpmacssww argument; the expected instruction
; names (%rdi) as a memory operand.
define <8 x i16> @commute_fold_vpmacssww(<8 x i16>* %a0, <8 x i16> %a1, <8 x i16> %a2) {
|
||||
;CHECK-LABEL: commute_fold_vpmacssww
|
||||
;CHECK: vpmacssww %xmm1, (%rdi), %xmm0, %xmm0
|
||||
%1 = load <8 x i16>* %a0
|
||||
%2 = call <8 x i16> @llvm.x86.xop.vpmacssww(<8 x i16> %1, <8 x i16> %a1, <8 x i16> %a2)
|
||||
ret <8 x i16> %2
|
||||
}
|
||||
declare <8 x i16> @llvm.x86.xop.vpmacssww(<8 x i16>, <8 x i16>, <8 x i16>) nounwind readnone
|
||||
|
||||
; The loaded vector is the first vpmacswd argument; the expected instruction
; names (%rdi) as a memory operand.
define <4 x i32> @commute_fold_vpmacswd(<8 x i16>* %a0, <8 x i16> %a1, <4 x i32> %a2) {
|
||||
;CHECK-LABEL: commute_fold_vpmacswd
|
||||
;CHECK: vpmacswd %xmm1, (%rdi), %xmm0, %xmm0
|
||||
%1 = load <8 x i16>* %a0
|
||||
%2 = call <4 x i32> @llvm.x86.xop.vpmacswd(<8 x i16> %1, <8 x i16> %a1, <4 x i32> %a2)
|
||||
ret <4 x i32> %2
|
||||
}
|
||||
declare <4 x i32> @llvm.x86.xop.vpmacswd(<8 x i16>, <8 x i16>, <4 x i32>) nounwind readnone
|
||||
|
||||
; The loaded vector is the first vpmacsww argument; the expected instruction
; names (%rdi) as a memory operand.
define <8 x i16> @commute_fold_vpmacsww(<8 x i16>* %a0, <8 x i16> %a1, <8 x i16> %a2) {
|
||||
;CHECK-LABEL: commute_fold_vpmacsww
|
||||
;CHECK: vpmacsww %xmm1, (%rdi), %xmm0, %xmm0
|
||||
%1 = load <8 x i16>* %a0
|
||||
%2 = call <8 x i16> @llvm.x86.xop.vpmacsww(<8 x i16> %1, <8 x i16> %a1, <8 x i16> %a2)
|
||||
ret <8 x i16> %2
|
||||
}
|
||||
declare <8 x i16> @llvm.x86.xop.vpmacsww(<8 x i16>, <8 x i16>, <8 x i16>) nounwind readnone
|
||||
|
||||
; The loaded vector is the first vpmadcsswd argument; the expected instruction
; names (%rdi) as a memory operand.
define <4 x i32> @commute_fold_vpmadcsswd(<8 x i16>* %a0, <8 x i16> %a1, <4 x i32> %a2) {
|
||||
;CHECK-LABEL: commute_fold_vpmadcsswd
|
||||
;CHECK: vpmadcsswd %xmm1, (%rdi), %xmm0, %xmm0
|
||||
%1 = load <8 x i16>* %a0
|
||||
%2 = call <4 x i32> @llvm.x86.xop.vpmadcsswd(<8 x i16> %1, <8 x i16> %a1, <4 x i32> %a2)
|
||||
ret <4 x i32> %2
|
||||
}
|
||||
declare <4 x i32> @llvm.x86.xop.vpmadcsswd(<8 x i16>, <8 x i16>, <4 x i32>) nounwind readnone
|
||||
|
||||
; The loaded vector is the first vpmadcswd argument; the expected instruction
; names (%rdi) as a memory operand.
define <4 x i32> @commute_fold_vpmadcswd(<8 x i16>* %a0, <8 x i16> %a1, <4 x i32> %a2) {
|
||||
;CHECK-LABEL: commute_fold_vpmadcswd
|
||||
;CHECK: vpmadcswd %xmm1, (%rdi), %xmm0, %xmm0
|
||||
%1 = load <8 x i16>* %a0
|
||||
%2 = call <4 x i32> @llvm.x86.xop.vpmadcswd(<8 x i16> %1, <8 x i16> %a1, <4 x i32> %a2)
|
||||
ret <4 x i32> %2
|
||||
}
|
||||
declare <4 x i32> @llvm.x86.xop.vpmadcswd(<8 x i16>, <8 x i16>, <4 x i32>) nounwind readnone
|
||||
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user