mirror of
https://github.com/RPCS3/llvm.git
synced 2025-02-03 09:14:30 +00:00
X86: Lower SMUL_LOHI of v4i32 to pmuldq when SSE4.1 is available.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@207318 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent
fb625eadf9
commit
9f2c21871c
@ -1062,6 +1062,7 @@ void X86TargetLowering::resetOperationActions() {
|
||||
|
||||
// FIXME: Do we need to handle scalar-to-vector here?
|
||||
setOperationAction(ISD::MUL, MVT::v4i32, Legal);
|
||||
setOperationAction(ISD::SMUL_LOHI, MVT::v4i32, Custom);
|
||||
|
||||
setOperationAction(ISD::VSELECT, MVT::v2f64, Legal);
|
||||
setOperationAction(ISD::VSELECT, MVT::v2i64, Legal);
|
||||
@ -1227,6 +1228,7 @@ void X86TargetLowering::resetOperationActions() {
|
||||
// Don't lower v32i8 because there is no 128-bit byte mul
|
||||
|
||||
setOperationAction(ISD::UMUL_LOHI, MVT::v8i32, Custom);
|
||||
setOperationAction(ISD::SMUL_LOHI, MVT::v8i32, Custom);
|
||||
setOperationAction(ISD::MULHU, MVT::v16i16, Legal);
|
||||
setOperationAction(ISD::MULHS, MVT::v16i16, Legal);
|
||||
|
||||
@ -11729,6 +11731,11 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
|
||||
return DAG.getNode(X86ISD::PMULUDQ, dl, Op.getValueType(),
|
||||
Op.getOperand(1), Op.getOperand(2));
|
||||
|
||||
case Intrinsic::x86_sse41_pmuldq:
|
||||
case Intrinsic::x86_avx2_pmul_dq:
|
||||
return DAG.getNode(X86ISD::PMULDQ, dl, Op.getValueType(),
|
||||
Op.getOperand(1), Op.getOperand(2));
|
||||
|
||||
case Intrinsic::x86_sse2_pmulhu_w:
|
||||
case Intrinsic::x86_avx2_pmulhu_w:
|
||||
return DAG.getNode(ISD::MULHU, dl, Op.getValueType(),
|
||||
@ -13168,8 +13175,8 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget,
|
||||
return DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo);
|
||||
}
|
||||
|
||||
static SDValue LowerUMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget,
|
||||
SelectionDAG &DAG) {
|
||||
static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget,
|
||||
SelectionDAG &DAG) {
|
||||
SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1);
|
||||
EVT VT = Op0.getValueType();
|
||||
SDLoc dl(Op);
|
||||
@ -13185,15 +13192,17 @@ static SDValue LowerUMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget,
|
||||
// Emit two multiplies, one for the lower 2 ints and one for the higher 2
|
||||
// ints.
|
||||
MVT MulVT = VT == MVT::v4i32 ? MVT::v2i64 : MVT::v4i64;
|
||||
unsigned Opcode =
|
||||
Op->getOpcode() == ISD::UMUL_LOHI ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
|
||||
SDValue Mul1 = DAG.getNode(ISD::BITCAST, dl, VT,
|
||||
DAG.getNode(X86ISD::PMULUDQ, dl, MulVT, Op0, Op1));
|
||||
DAG.getNode(Opcode, dl, MulVT, Op0, Op1));
|
||||
SDValue Mul2 = DAG.getNode(ISD::BITCAST, dl, VT,
|
||||
DAG.getNode(X86ISD::PMULUDQ, dl, MulVT, Hi0, Hi1));
|
||||
DAG.getNode(Opcode, dl, MulVT, Hi0, Hi1));
|
||||
|
||||
// Shuffle it back into the right order.
|
||||
const int HighMask[] = {1, 3, 5, 7, 9, 11, 13, 15};
|
||||
const int HighMask[] = {1, 5, 3, 7, 9, 13, 11, 15};
|
||||
SDValue Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
|
||||
const int LowMask[] = {0, 2, 4, 6, 8, 10, 12, 14};
|
||||
const int LowMask[] = {0, 4, 2, 6, 8, 12, 10, 14};
|
||||
SDValue Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
|
||||
|
||||
return DAG.getNode(ISD::MERGE_VALUES, dl, Op.getValueType(), Highs, Lows);
|
||||
@ -14188,7 +14197,8 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
|
||||
case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ_ZERO_UNDEF(Op, DAG);
|
||||
case ISD::CTTZ: return LowerCTTZ(Op, DAG);
|
||||
case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
|
||||
case ISD::UMUL_LOHI: return LowerUMUL_LOHI(Op, Subtarget, DAG);
|
||||
case ISD::UMUL_LOHI:
|
||||
case ISD::SMUL_LOHI: return LowerMUL_LOHI(Op, Subtarget, DAG);
|
||||
case ISD::SRA:
|
||||
case ISD::SRL:
|
||||
case ISD::SHL: return LowerShift(Op, Subtarget, DAG);
|
||||
|
@ -347,6 +347,8 @@ namespace llvm {
|
||||
|
||||
// PMULUDQ - Vector multiply packed unsigned doubleword integers
|
||||
PMULUDQ,
|
||||
// PMULDQ - Vector multiply packed signed doubleword integers
|
||||
PMULDQ,
|
||||
|
||||
// FMA nodes
|
||||
FMADD,
|
||||
|
@ -175,6 +175,9 @@ def X86select : SDNode<"X86ISD::SELECT" , SDTSelect>;
|
||||
def X86pmuludq : SDNode<"X86ISD::PMULUDQ",
|
||||
SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
|
||||
SDTCisSameAs<1,2>]>>;
|
||||
def X86pmuldq : SDNode<"X86ISD::PMULDQ",
|
||||
SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
|
||||
SDTCisSameAs<1,2>]>>;
|
||||
|
||||
// Specific shuffle nodes - At some point ISD::VECTOR_SHUFFLE will always get
|
||||
// translated into one of the target nodes below during lowering.
|
||||
|
@ -7003,6 +7003,31 @@ multiclass SS48I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
|
||||
Sched<[itins.Sched.Folded, ReadAfterLd]>;
|
||||
}
|
||||
|
||||
/// SS48I_binop_rm2 - Simple SSE41 binary operator with different src and dst
|
||||
/// types.
|
||||
multiclass SS48I_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode,
|
||||
ValueType DstVT, ValueType SrcVT, RegisterClass RC,
|
||||
PatFrag memop_frag, X86MemOperand x86memop,
|
||||
OpndItins itins,
|
||||
bit IsCommutable = 0, bit Is2Addr = 1> {
|
||||
let isCommutable = IsCommutable in
|
||||
def rr : SS48I<opc, MRMSrcReg, (outs RC:$dst),
|
||||
(ins RC:$src1, RC:$src2),
|
||||
!if(Is2Addr,
|
||||
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
|
||||
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
|
||||
[(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), RC:$src2)))]>,
|
||||
Sched<[itins.Sched]>;
|
||||
def rm : SS48I<opc, MRMSrcMem, (outs RC:$dst),
|
||||
(ins RC:$src1, x86memop:$src2),
|
||||
!if(Is2Addr,
|
||||
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
|
||||
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
|
||||
[(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1),
|
||||
(bitconvert (memop_frag addr:$src2)))))]>,
|
||||
Sched<[itins.Sched.Folded, ReadAfterLd]>;
|
||||
}
|
||||
|
||||
let Predicates = [HasAVX] in {
|
||||
let isCommutable = 0 in
|
||||
defm VPACKUSDW : SS41I_binop_rm_int<0x2B, "vpackusdw", int_x86_sse41_packusdw,
|
||||
@ -7031,8 +7056,9 @@ let Predicates = [HasAVX] in {
|
||||
defm VPMAXUW : SS48I_binop_rm<0x3E, "vpmaxuw", X86umax, v8i16, VR128,
|
||||
loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
|
||||
VEX_4V;
|
||||
defm VPMULDQ : SS41I_binop_rm_int<0x28, "vpmuldq", int_x86_sse41_pmuldq,
|
||||
0, DEFAULT_ITINS_VECIMULSCHED>, VEX_4V;
|
||||
defm VPMULDQ : SS48I_binop_rm2<0x28, "vpmuldq", X86pmuldq, v2i64, v4i32,
|
||||
VR128, loadv2i64, i128mem,
|
||||
SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V;
|
||||
}
|
||||
|
||||
let Predicates = [HasAVX2] in {
|
||||
@ -7064,9 +7090,9 @@ let Predicates = [HasAVX2] in {
|
||||
defm VPMAXUWY : SS48I_binop_rm<0x3E, "vpmaxuw", X86umax, v16i16, VR256,
|
||||
loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
|
||||
VEX_4V, VEX_L;
|
||||
defm VPMULDQ : SS41I_binop_rm_int_y<0x28, "vpmuldq",
|
||||
int_x86_avx2_pmul_dq, WriteVecIMul>,
|
||||
VEX_4V, VEX_L;
|
||||
defm VPMULDQY : SS48I_binop_rm2<0x28, "vpmuldq", X86pmuldq, v4i64, v8i32,
|
||||
VR256, loadv4i64, i256mem,
|
||||
SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V, VEX_L;
|
||||
}
|
||||
|
||||
let Constraints = "$src1 = $dst" in {
|
||||
@ -7089,8 +7115,9 @@ let Constraints = "$src1 = $dst" in {
|
||||
memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
|
||||
defm PMAXUW : SS48I_binop_rm<0x3E, "pmaxuw", X86umax, v8i16, VR128,
|
||||
memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
|
||||
defm PMULDQ : SS41I_binop_rm_int<0x28, "pmuldq", int_x86_sse41_pmuldq,
|
||||
1, SSE_INTMUL_ITINS_P>;
|
||||
defm PMULDQ : SS48I_binop_rm2<0x28, "pmuldq", X86pmuldq, v2i64, v4i32,
|
||||
VR128, memopv2i64, i128mem,
|
||||
SSE_INTMUL_ITINS_P, 1>;
|
||||
}
|
||||
|
||||
let Predicates = [HasAVX] in {
|
||||
|
@ -1,4 +1,4 @@
|
||||
; RUN: llc -march=x86-64 -mcpu=core2 < %s | FileCheck %s -check-prefix=SSE
|
||||
; RUN: llc -march=x86-64 -mcpu=corei7 < %s | FileCheck %s -check-prefix=SSE
|
||||
; RUN: llc -march=x86-64 -mcpu=core-avx2 < %s | FileCheck %s -check-prefix=AVX
|
||||
|
||||
define <4 x i32> @test1(<4 x i32> %a) {
|
||||
@ -103,4 +103,51 @@ define <16 x i16> @test6(<16 x i16> %a) {
|
||||
; AVX-NOT: vpmulhw
|
||||
}
|
||||
|
||||
; TODO: sdiv -> pmuldq
|
||||
define <16 x i8> @test7(<16 x i8> %a) {
|
||||
%div = sdiv <16 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7>
|
||||
ret <16 x i8> %div
|
||||
}
|
||||
|
||||
define <4 x i32> @test8(<4 x i32> %a) {
|
||||
%div = sdiv <4 x i32> %a, <i32 7, i32 7, i32 7, i32 7>
|
||||
ret <4 x i32> %div
|
||||
|
||||
; SSE-LABEL: test8:
|
||||
; SSE: pmuldq
|
||||
; SSE: pshufd $57
|
||||
; SSE: pmuldq
|
||||
; SSE: shufps $-35
|
||||
; SSE: pshufd $-40
|
||||
; SSE: padd
|
||||
; SSE: psrld $31
|
||||
; SSE: psrad $2
|
||||
; SSE: padd
|
||||
|
||||
; AVX-LABEL: test8:
|
||||
; AVX: vpmuldq
|
||||
; AVX: vpshufd $57
|
||||
; AVX: vpmuldq
|
||||
; AVX: vshufps $-35
|
||||
; AVX: vpshufd $-40
|
||||
; AVX: vpadd
|
||||
; AVX: vpsrld $31
|
||||
; AVX: vpsrad $2
|
||||
; AVX: vpadd
|
||||
}
|
||||
|
||||
define <8 x i32> @test9(<8 x i32> %a) {
|
||||
%div = sdiv <8 x i32> %a, <i32 7, i32 7, i32 7, i32 7,i32 7, i32 7, i32 7, i32 7>
|
||||
ret <8 x i32> %div
|
||||
|
||||
; AVX-LABEL: test9:
|
||||
; AVX: vpermd
|
||||
; AVX: vpmuldq
|
||||
; AVX: vshufps $-35
|
||||
; AVX: vpmuldq
|
||||
; AVX: vshufps $-35
|
||||
; AVX: vpshufd $-40
|
||||
; AVX: vpadd
|
||||
; AVX: vpsrld $31
|
||||
; AVX: vpsrad $2
|
||||
; AVX: vpadd
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user