mirror of
https://github.com/RPCSX/llvm.git
synced 2025-01-12 23:40:54 +00:00
Add target specific node for PMULUDQ. Change patterns to use it and custom lower intrinsics to it. Use it instead of intrinsic to handle 64-bit vector multiplies.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@149807 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent
2237f84735
commit
5b209e84f4
@ -9426,6 +9426,10 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const
|
||||
}
|
||||
|
||||
// Arithmetic intrinsics.
|
||||
case Intrinsic::x86_sse2_pmulu_dq:
|
||||
case Intrinsic::x86_avx2_pmulu_dq:
|
||||
return DAG.getNode(X86ISD::PMULUDQ, dl, Op.getValueType(),
|
||||
Op.getOperand(1), Op.getOperand(2));
|
||||
case Intrinsic::x86_sse3_hadd_ps:
|
||||
case Intrinsic::x86_sse3_hadd_pd:
|
||||
case Intrinsic::x86_avx_hadd_ps_256:
|
||||
@ -10085,78 +10089,46 @@ SDValue X86TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
|
||||
if (VT.getSizeInBits() == 256 && !Subtarget->hasAVX2())
|
||||
return Lower256IntArith(Op, DAG);
|
||||
|
||||
assert((VT == MVT::v2i64 || VT == MVT::v4i64) &&
|
||||
"Only know how to lower V2I64/V4I64 multiply");
|
||||
|
||||
DebugLoc dl = Op.getDebugLoc();
|
||||
|
||||
// Ahi = psrlqi(a, 32);
|
||||
// Bhi = psrlqi(b, 32);
|
||||
//
|
||||
// AloBlo = pmuludq(a, b);
|
||||
// AloBhi = pmuludq(a, Bhi);
|
||||
// AhiBlo = pmuludq(Ahi, b);
|
||||
|
||||
// AloBhi = psllqi(AloBhi, 32);
|
||||
// AhiBlo = psllqi(AhiBlo, 32);
|
||||
// return AloBlo + AloBhi + AhiBlo;
|
||||
|
||||
SDValue A = Op.getOperand(0);
|
||||
SDValue B = Op.getOperand(1);
|
||||
|
||||
if (VT == MVT::v4i64) {
|
||||
assert(Subtarget->hasAVX2() && "Lowering v4i64 multiply requires AVX2");
|
||||
SDValue ShAmt = DAG.getConstant(32, MVT::i32);
|
||||
|
||||
// ulong2 Ahi = __builtin_ia32_psrlqi256( a, 32);
|
||||
// ulong2 Bhi = __builtin_ia32_psrlqi256( b, 32);
|
||||
// ulong2 AloBlo = __builtin_ia32_pmuludq256( a, b );
|
||||
// ulong2 AloBhi = __builtin_ia32_pmuludq256( a, Bhi );
|
||||
// ulong2 AhiBlo = __builtin_ia32_pmuludq256( Ahi, b );
|
||||
//
|
||||
// AloBhi = __builtin_ia32_psllqi256( AloBhi, 32 );
|
||||
// AhiBlo = __builtin_ia32_psllqi256( AhiBlo, 32 );
|
||||
// return AloBlo + AloBhi + AhiBlo;
|
||||
SDValue Ahi = DAG.getNode(X86ISD::VSRLI, dl, VT, A, ShAmt);
|
||||
SDValue Bhi = DAG.getNode(X86ISD::VSRLI, dl, VT, B, ShAmt);
|
||||
|
||||
SDValue Ahi = DAG.getNode(X86ISD::VSRLI, dl, VT, A,
|
||||
DAG.getConstant(32, MVT::i32));
|
||||
SDValue Bhi = DAG.getNode(X86ISD::VSRLI, dl, VT, B,
|
||||
DAG.getConstant(32, MVT::i32));
|
||||
SDValue AloBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
|
||||
DAG.getConstant(Intrinsic::x86_avx2_pmulu_dq, MVT::i32),
|
||||
A, B);
|
||||
SDValue AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
|
||||
DAG.getConstant(Intrinsic::x86_avx2_pmulu_dq, MVT::i32),
|
||||
A, Bhi);
|
||||
SDValue AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
|
||||
DAG.getConstant(Intrinsic::x86_avx2_pmulu_dq, MVT::i32),
|
||||
Ahi, B);
|
||||
AloBhi = DAG.getNode(X86ISD::VSHLI, dl, VT, AloBhi,
|
||||
DAG.getConstant(32, MVT::i32));
|
||||
AhiBlo = DAG.getNode(X86ISD::VSHLI, dl, VT, AhiBlo,
|
||||
DAG.getConstant(32, MVT::i32));
|
||||
SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi);
|
||||
Res = DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo);
|
||||
return Res;
|
||||
}
|
||||
// Bit cast to 32-bit vectors for PMULUDQ
|
||||
EVT MulVT = (VT == MVT::v2i64) ? MVT::v4i32 : MVT::v8i32;
|
||||
A = DAG.getNode(ISD::BITCAST, dl, MulVT, A);
|
||||
B = DAG.getNode(ISD::BITCAST, dl, MulVT, B);
|
||||
Ahi = DAG.getNode(ISD::BITCAST, dl, MulVT, Ahi);
|
||||
Bhi = DAG.getNode(ISD::BITCAST, dl, MulVT, Bhi);
|
||||
|
||||
assert(VT == MVT::v2i64 && "Only know how to lower V2I64 multiply");
|
||||
SDValue AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
|
||||
SDValue AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
|
||||
SDValue AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
|
||||
|
||||
// ulong2 Ahi = __builtin_ia32_psrlqi128( a, 32);
|
||||
// ulong2 Bhi = __builtin_ia32_psrlqi128( b, 32);
|
||||
// ulong2 AloBlo = __builtin_ia32_pmuludq128( a, b );
|
||||
// ulong2 AloBhi = __builtin_ia32_pmuludq128( a, Bhi );
|
||||
// ulong2 AhiBlo = __builtin_ia32_pmuludq128( Ahi, b );
|
||||
//
|
||||
// AloBhi = __builtin_ia32_psllqi128( AloBhi, 32 );
|
||||
// AhiBlo = __builtin_ia32_psllqi128( AhiBlo, 32 );
|
||||
// return AloBlo + AloBhi + AhiBlo;
|
||||
AloBhi = DAG.getNode(X86ISD::VSHLI, dl, VT, AloBhi, ShAmt);
|
||||
AhiBlo = DAG.getNode(X86ISD::VSHLI, dl, VT, AhiBlo, ShAmt);
|
||||
|
||||
SDValue Ahi = DAG.getNode(X86ISD::VSRLI, dl, VT, A,
|
||||
DAG.getConstant(32, MVT::i32));
|
||||
SDValue Bhi = DAG.getNode(X86ISD::VSRLI, dl, VT, B,
|
||||
DAG.getConstant(32, MVT::i32));
|
||||
SDValue AloBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
|
||||
DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32),
|
||||
A, B);
|
||||
SDValue AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
|
||||
DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32),
|
||||
A, Bhi);
|
||||
SDValue AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
|
||||
DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32),
|
||||
Ahi, B);
|
||||
AloBhi = DAG.getNode(X86ISD::VSHLI, dl, VT, AloBhi,
|
||||
DAG.getConstant(32, MVT::i32));
|
||||
AhiBlo = DAG.getNode(X86ISD::VSHLI, dl, VT, AhiBlo,
|
||||
DAG.getConstant(32, MVT::i32));
|
||||
SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi);
|
||||
Res = DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo);
|
||||
return Res;
|
||||
return DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo);
|
||||
}
|
||||
|
||||
SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
|
||||
@ -11092,6 +11064,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
|
||||
case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST";
|
||||
case X86ISD::VPERMILP: return "X86ISD::VPERMILP";
|
||||
case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128";
|
||||
case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ";
|
||||
case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
|
||||
case X86ISD::VAARG_64: return "X86ISD::VAARG_64";
|
||||
case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA";
|
||||
|
@ -219,7 +219,7 @@ namespace llvm {
|
||||
// VZEXT_MOVL - Vector move low and zero extend.
|
||||
VZEXT_MOVL,
|
||||
|
||||
// VZEXT_MOVL - Vector move low and sign extend.
|
||||
// VSEXT_MOVL - Vector move low and sign extend.
|
||||
VSEXT_MOVL,
|
||||
|
||||
// VSHL, VSRL - 128-bit vector logical left / right shift
|
||||
@ -283,6 +283,9 @@ namespace llvm {
|
||||
VPERM2X128,
|
||||
VBROADCAST,
|
||||
|
||||
// PMULUDQ - Vector multiply packed unsigned doubleword integers
|
||||
PMULUDQ,
|
||||
|
||||
// VASTART_SAVE_XMM_REGS - Save xmm argument registers to the stack,
|
||||
// according to %al. An operator is needed so that this can be expanded
|
||||
// with control flow.
|
||||
|
@ -109,6 +109,10 @@ def X86vpcomu : SDNode<"X86ISD::VPCOMU",
|
||||
SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
|
||||
SDTCisSameAs<0,2>, SDTCisVT<3, i8>]>>;
|
||||
|
||||
def X86pmuludq : SDNode<"X86ISD::PMULUDQ",
|
||||
SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
|
||||
SDTCisSameAs<1,2>]>>;
|
||||
|
||||
// Specific shuffle nodes - At some point ISD::VECTOR_SHUFFLE will always get
|
||||
// translated into one of the target nodes below during lowering.
|
||||
// Note: this is a work in progress...
|
||||
|
@ -3530,6 +3530,26 @@ multiclass PDI_binop_rmi<bits<8> opc, bits<8> opc2, Format ImmForm,
|
||||
[(set RC:$dst, (DstVT (OpNode2 RC:$src1, (i32 imm:$src2))))]>;
|
||||
}
|
||||
|
||||
/// PDI_binop_rm2 - Simple SSE2 binary operator with different src and dst types
|
||||
multiclass PDI_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode,
|
||||
ValueType DstVT, ValueType SrcVT, RegisterClass RC,
|
||||
PatFrag memop_frag, X86MemOperand x86memop,
|
||||
bit IsCommutable = 0, bit Is2Addr = 1> {
|
||||
let isCommutable = IsCommutable in
|
||||
def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
|
||||
(ins RC:$src1, RC:$src2),
|
||||
!if(Is2Addr,
|
||||
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
|
||||
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
|
||||
[(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), RC:$src2)))]>;
|
||||
def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
|
||||
(ins RC:$src1, x86memop:$src2),
|
||||
!if(Is2Addr,
|
||||
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
|
||||
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
|
||||
[(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1),
|
||||
(bitconvert (memop_frag addr:$src2)))))]>;
|
||||
}
|
||||
} // ExeDomain = SSEPackedInt
|
||||
|
||||
// 128-bit Integer Arithmetic
|
||||
@ -3553,6 +3573,8 @@ defm VPSUBD : PDI_binop_rm<0xFA, "vpsubd", sub, v4i32, VR128, memopv2i64,
|
||||
i128mem, 0, 0>, VEX_4V;
|
||||
defm VPSUBQ : PDI_binop_rm<0xFB, "vpsubq", sub, v2i64, VR128, memopv2i64,
|
||||
i128mem, 0, 0>, VEX_4V;
|
||||
defm VPMULUDQ : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v2i64, v4i32, VR128,
|
||||
memopv2i64, i128mem, 1, 0>, VEX_4V;
|
||||
|
||||
// Intrinsic forms
|
||||
defm VPSUBSB : PDI_binop_rm_int<0xE8, "vpsubsb" , int_x86_sse2_psubs_b,
|
||||
@ -3575,8 +3597,6 @@ defm VPMULHUW : PDI_binop_rm_int<0xE4, "vpmulhuw", int_x86_sse2_pmulhu_w,
|
||||
VR128, memopv2i64, i128mem, 1, 0>, VEX_4V;
|
||||
defm VPMULHW : PDI_binop_rm_int<0xE5, "vpmulhw" , int_x86_sse2_pmulh_w,
|
||||
VR128, memopv2i64, i128mem, 1, 0>, VEX_4V;
|
||||
defm VPMULUDQ : PDI_binop_rm_int<0xF4, "vpmuludq", int_x86_sse2_pmulu_dq,
|
||||
VR128, memopv2i64, i128mem, 1, 0>, VEX_4V;
|
||||
defm VPMADDWD : PDI_binop_rm_int<0xF5, "vpmaddwd", int_x86_sse2_pmadd_wd,
|
||||
VR128, memopv2i64, i128mem, 1, 0>, VEX_4V;
|
||||
defm VPAVGB : PDI_binop_rm_int<0xE0, "vpavgb", int_x86_sse2_pavg_b,
|
||||
@ -3614,6 +3634,8 @@ defm VPSUBDY : PDI_binop_rm<0xFA, "vpsubd", sub, v8i32, VR256, memopv4i64,
|
||||
i256mem, 0, 0>, VEX_4V;
|
||||
defm VPSUBQY : PDI_binop_rm<0xFB, "vpsubq", sub, v4i64, VR256, memopv4i64,
|
||||
i256mem, 0, 0>, VEX_4V;
|
||||
defm VPMULUDQY : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v4i64, v8i32,
|
||||
VR256, memopv4i64, i256mem, 1, 0>, VEX_4V;
|
||||
|
||||
// Intrinsic forms
|
||||
defm VPSUBSBY : PDI_binop_rm_int<0xE8, "vpsubsb" , int_x86_avx2_psubs_b,
|
||||
@ -3636,8 +3658,6 @@ defm VPMULHUWY : PDI_binop_rm_int<0xE4, "vpmulhuw", int_x86_avx2_pmulhu_w,
|
||||
VR256, memopv4i64, i256mem, 1, 0>, VEX_4V;
|
||||
defm VPMULHWY : PDI_binop_rm_int<0xE5, "vpmulhw" , int_x86_avx2_pmulh_w,
|
||||
VR256, memopv4i64, i256mem, 1, 0>, VEX_4V;
|
||||
defm VPMULUDQY : PDI_binop_rm_int<0xF4, "vpmuludq", int_x86_avx2_pmulu_dq,
|
||||
VR256, memopv4i64, i256mem, 1, 0>, VEX_4V;
|
||||
defm VPMADDWDY : PDI_binop_rm_int<0xF5, "vpmaddwd", int_x86_avx2_pmadd_wd,
|
||||
VR256, memopv4i64, i256mem, 1, 0>, VEX_4V;
|
||||
defm VPAVGBY : PDI_binop_rm_int<0xE0, "vpavgb", int_x86_avx2_pavg_b,
|
||||
@ -3675,6 +3695,8 @@ defm PSUBD : PDI_binop_rm<0xFA, "psubd", sub, v4i32, VR128, memopv2i64,
|
||||
i128mem>;
|
||||
defm PSUBQ : PDI_binop_rm<0xFB, "psubq", sub, v2i64, VR128, memopv2i64,
|
||||
i128mem>;
|
||||
defm PMULUDQ : PDI_binop_rm2<0xF4, "pmuludq", X86pmuludq, v2i64, v4i32, VR128,
|
||||
memopv2i64, i128mem, 1>;
|
||||
|
||||
// Intrinsic forms
|
||||
defm PSUBSB : PDI_binop_rm_int<0xE8, "psubsb" , int_x86_sse2_psubs_b,
|
||||
@ -3697,8 +3719,6 @@ defm PMULHUW : PDI_binop_rm_int<0xE4, "pmulhuw", int_x86_sse2_pmulhu_w,
|
||||
VR128, memopv2i64, i128mem, 1>;
|
||||
defm PMULHW : PDI_binop_rm_int<0xE5, "pmulhw" , int_x86_sse2_pmulh_w,
|
||||
VR128, memopv2i64, i128mem, 1>;
|
||||
defm PMULUDQ : PDI_binop_rm_int<0xF4, "pmuludq", int_x86_sse2_pmulu_dq,
|
||||
VR128, memopv2i64, i128mem, 1>;
|
||||
defm PMADDWD : PDI_binop_rm_int<0xF5, "pmaddwd", int_x86_sse2_pmadd_wd,
|
||||
VR128, memopv2i64, i128mem, 1>;
|
||||
defm PAVGB : PDI_binop_rm_int<0xE0, "pavgb", int_x86_sse2_pavg_b,
|
||||
|
Loading…
x
Reference in New Issue
Block a user