mirror of
https://github.com/RPCSX/llvm.git
synced 2025-01-08 04:52:50 +00:00
Added more insertps optimizations
Summary: When inserting an element that's coming from a vector load or a broadcast of a vector (or scalar) load, combine the load into the insertps instruction. Added PerformINSERTPSCombine for the case where we need to fix the load (load of a vector + insertps with a non-zero CountS). Added patterns for the broadcasts. Also added tests for SSE4.1, AVX, and AVX2. Reviewers: delena, nadav, craig.topper Subscribers: llvm-commits Differential Revision: http://reviews.llvm.org/D3581 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@209156 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent
861e2ef7b0
commit
ca162faee2
@ -7412,6 +7412,23 @@ SDValue getMOVLP(SDValue &Op, SDLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
|
||||
getShuffleSHUFImmediate(SVOp), DAG);
|
||||
}
|
||||
|
||||
/// Build a scalar load of a single element out of an existing vector load.
///
/// \param Load  The vector load to narrow. It is not modified or replaced
///              here; only its chain, base pointer and memory operand are
///              read.
/// \param Index Zero-based element to load. The new address is the original
///              base pointer plus Index * (store size of the element type).
/// \param DAG   The DAG in which the new nodes are created.
/// \returns A new load of the vector's element type, hanging off the same
///          input chain as \p Load.
///
/// NOTE(review): the chain result of \p Load is not rewired to the new load.
/// Callers appear to rely on the vector value having a single use -- confirm
/// that no ordering-sensitive users of the old chain can remain.
static SDValue NarrowVectorLoadToElement(LoadSDNode *Load, unsigned Index,
                                         SelectionDAG &DAG) {
  SDLoc dl(Load);
  MVT VT = Load->getSimpleValueType(0);
  MVT EVT = VT.getVectorElementType();
  assert(Index < VT.getVectorNumElements() &&
         "Element index is outside of the loaded vector");

  SDValue Addr = Load->getBasePtr();
  MVT PtrVT = Addr.getSimpleValueType();

  // Byte distance from the start of the vector to the requested element.
  unsigned EltSize = EVT.getStoreSize();
  unsigned ByteOffset = Index * EltSize;

  SDValue NewAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Addr,
                                DAG.getConstant(ByteOffset, PtrVT));

  // Derive the memory operand from the original one using the same byte
  // offset as the address, so that the recorded location and alignment
  // describe the element actually being read rather than element 0.
  MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
      Load->getMemOperand(), ByteOffset, EltSize);

  return DAG.getLoad(EVT, dl, Load->getChain(), NewAddr, MMO);
}
|
||||
|
||||
// It is only safe to call this function if isINSERTPSMask is true for
|
||||
// this shufflevector mask.
|
||||
static SDValue getINSERTPS(ShuffleVectorSDNode *SVOp, SDLoc &dl,
|
||||
@ -7423,7 +7440,6 @@ static SDValue getINSERTPS(ShuffleVectorSDNode *SVOp, SDLoc &dl,
|
||||
// If we're transferring an i32 from memory to a specific element in a
|
||||
// register, we output a generic DAG that will match the PINSRD
|
||||
// instruction.
|
||||
// TODO: Optimize for AVX cases too (VINSERTPS)
|
||||
MVT VT = SVOp->getSimpleValueType(0);
|
||||
MVT EVT = VT.getVectorElementType();
|
||||
SDValue V1 = SVOp->getOperand(0);
|
||||
@ -7456,17 +7472,10 @@ static SDValue getINSERTPS(ShuffleVectorSDNode *SVOp, SDLoc &dl,
|
||||
// Trivial case, when From comes from a load and is only used by the
|
||||
// shuffle. Make it use insertps from the vector that we need from that
|
||||
// load.
|
||||
SDValue Addr = From.getOperand(1);
|
||||
SDValue NewAddr =
|
||||
DAG.getNode(ISD::ADD, dl, Addr.getSimpleValueType(), Addr,
|
||||
DAG.getConstant(DestIndex * EVT.getStoreSize(),
|
||||
Addr.getSimpleValueType()));
|
||||
|
||||
LoadSDNode *Load = cast<LoadSDNode>(From);
|
||||
SDValue NewLoad =
|
||||
DAG.getLoad(EVT, dl, Load->getChain(), NewAddr,
|
||||
DAG.getMachineFunction().getMachineMemOperand(
|
||||
Load->getMemOperand(), 0, EVT.getStoreSize()));
|
||||
NarrowVectorLoadToElement(cast<LoadSDNode>(From), DestIndex, DAG);
|
||||
if (!NewLoad.getNode())
|
||||
return SDValue();
|
||||
|
||||
if (EVT == MVT::f32) {
|
||||
// Create this as a scalar to vector to match the instruction pattern.
|
||||
@ -20281,6 +20290,33 @@ static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG,
|
||||
return SDValue();
|
||||
}
|
||||
|
||||
/// DAG combine for X86ISD::INSERTPS: when the second source operand is a
/// foldable vector load, replace it with a scalar load of the element that
/// the immediate's countS field selects, so the load can be folded into the
/// memory form of insertps.
///
/// \param N         The X86ISD::INSERTPS node (operands: destination vector,
///                  source vector, immediate).
/// \param DAG       The DAG being combined.
/// \param Subtarget Unused here; kept for signature parity with the other
///                  Perform*Combine helpers.
/// \returns The replacement INSERTPS node, or an empty SDValue when the
///          second operand is not a foldable load.
static SDValue PerformINSERTPSCombine(SDNode *N, SelectionDAG &DAG,
                                      const X86Subtarget *Subtarget) {
  SDLoc dl(N);
  MVT VT = N->getOperand(1)->getSimpleValueType(0);
  // Parenthesized so the message string applies to the whole condition.
  assert((VT == MVT::v4f32 || VT == MVT::v4i32) &&
         "X86insertps is only defined for v4x32");

  SDValue Ld = N->getOperand(1);
  // MayFoldLoad is defined elsewhere in this file; the cast below assumes it
  // only accepts values produced by a LoadSDNode.
  if (!MayFoldLoad(Ld))
    return SDValue();

  // Extract the countS bits (bits 7:6) from the immediate so we can get the
  // proper address when narrowing the vector load to a specific element.
  // When the second source op is a memory address, insertps doesn't use
  // countS and just gets an f32 from that address.
  unsigned CountS =
      (cast<ConstantSDNode>(N->getOperand(2))->getZExtValue() >> 6) & 0x3;
  SDValue ScalarLd =
      NarrowVectorLoadToElement(cast<LoadSDNode>(Ld), CountS, DAG);

  // Create this as a scalar to vector to match the instruction pattern.
  SDValue LoadScalarToVector =
      DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, ScalarLd);

  // countS bits are ignored when loading from memory on insertps, which
  // means we don't need to explicitly set them to 0.
  return DAG.getNode(X86ISD::INSERTPS, dl, VT, N->getOperand(0),
                     LoadScalarToVector, N->getOperand(2));
}
|
||||
|
||||
// Helper function of PerformSETCCCombine. It is to materialize "setb reg"
|
||||
// as "sbb reg,reg", since it can be extended without zext and produces
|
||||
// an all-ones bit which is more useful than 0/1 in some cases.
|
||||
@ -20584,6 +20620,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
|
||||
case ISD::FMA: return PerformFMACombine(N, DAG, Subtarget);
|
||||
case ISD::INTRINSIC_WO_CHAIN:
|
||||
return PerformINTRINSIC_WO_CHAINCombine(N, DAG, Subtarget);
|
||||
case X86ISD::INSERTPS:
|
||||
return PerformINSERTPSCombine(N, DAG, Subtarget);
|
||||
}
|
||||
|
||||
return SDValue();
|
||||
|
@ -6550,6 +6550,29 @@ let ExeDomain = SSEPackedSingle in {
|
||||
defm INSERTPS : SS41I_insertf32<0x21, "insertps", 1, SSE_INSERT_ITINS>;
|
||||
}
|
||||
|
||||
let Predicates = [UseSSE41] in {
  // When the inserted element is an X86PShufd with immediate 0 of a load,
  // select the memory form of INSERTPS and drop the shuffle.

  // Source is a scalar f32 load placed in a vector and then shuffled.
  def : Pat<(v4f32 (X86insertps
                      (v4f32 VR128:$src1),
                      (X86PShufd
                         (v4f32 (scalar_to_vector (loadf32 addr:$src2))),
                         (i8 0)),
                      imm:$src3)),
            (INSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;

  // Source is a full v4f32 load that is then shuffled.
  def : Pat<(v4f32 (X86insertps
                      (v4f32 VR128:$src1),
                      (X86PShufd (loadv4f32 addr:$src2), (i8 0)),
                      imm:$src3)),
            (INSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
}
|
||||
|
||||
let Predicates = [UseAVX] in {
  // When the inserted element is an X86VBroadcast of a load, select the
  // memory form of VINSERTPS and drop the broadcast.

  // Broadcast of a scalar f32 load.
  def : Pat<(v4f32 (X86insertps
                      (v4f32 VR128:$src1),
                      (X86VBroadcast (loadf32 addr:$src2)),
                      imm:$src3)),
            (VINSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;

  // Broadcast of a full v4f32 load.
  def : Pat<(v4f32 (X86insertps
                      (v4f32 VR128:$src1),
                      (X86VBroadcast (loadv4f32 addr:$src2)),
                      imm:$src3)),
            (VINSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
}
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// SSE4.1 - Round Instructions
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
@ -1,4 +1,5 @@
|
||||
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx | FileCheck %s
|
||||
; RUN: llc < %s -mtriple=i686-apple-darwin -mcpu=corei7-avx | FileCheck %s -check-prefix=X32 --check-prefix=CHECK
|
||||
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx | FileCheck %s -check-prefix=X64 --check-prefix=CHECK
|
||||
|
||||
define <4 x i32> @blendvb_fallback_v4i32(<4 x i1> %mask, <4 x i32> %x, <4 x i32> %y) {
|
||||
; CHECK-LABEL: @blendvb_fallback_v4i32
|
||||
@ -23,3 +24,113 @@ define <8 x float> @blendvb_fallback_v8f32(<8 x i1> %mask, <8 x float> %x, <8 x
|
||||
%ret = select <8 x i1> %mask, <8 x float> %x, <8 x float> %y
|
||||
ret <8 x float> %ret
|
||||
}
|
||||
|
||||
declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i32) nounwind readnone
|
||||
|
||||
define <4 x float> @insertps_from_vector_load(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
|
||||
; CHECK-LABEL: insertps_from_vector_load:
|
||||
; On X32, account for the argument's move to registers
|
||||
; X32: movl 4(%esp), %eax
|
||||
; CHECK-NOT: mov
|
||||
; CHECK: insertps $48
|
||||
; CHECK-NEXT: ret
|
||||
%1 = load <4 x float>* %pb, align 16
|
||||
%2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 48)
|
||||
ret <4 x float> %2
|
||||
}
|
||||
|
||||
;; Use a non-zero CountS for insertps
|
||||
define <4 x float> @insertps_from_vector_load_offset(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
|
||||
; CHECK-LABEL: insertps_from_vector_load_offset:
|
||||
; On X32, account for the argument's move to registers
|
||||
; X32: movl 4(%esp), %eax
|
||||
; CHECK-NOT: mov
|
||||
;; Try to match a bit more of the instr, since we need the load's offset.
|
||||
; CHECK: insertps $96, 4(%{{...}}), %
|
||||
; CHECK-NEXT: ret
|
||||
%1 = load <4 x float>* %pb, align 16
|
||||
%2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 96)
|
||||
ret <4 x float> %2
|
||||
}
|
||||
|
||||
define <4 x float> @insertps_from_vector_load_offset_2(<4 x float> %a, <4 x float>* nocapture readonly %pb, i64 %index) {
|
||||
; CHECK-LABEL: insertps_from_vector_load_offset_2:
|
||||
; On X32, account for the argument's move to registers
|
||||
; X32: movl 4(%esp), %eax
|
||||
; X32: movl 8(%esp), %ecx
|
||||
; CHECK-NOT: mov
|
||||
;; Try to match a bit more of the instr, since we need the load's offset.
|
||||
; CHECK: vinsertps $192, 12(%{{...}},%{{...}}), %
|
||||
; CHECK-NEXT: ret
|
||||
%1 = getelementptr inbounds <4 x float>* %pb, i64 %index
|
||||
%2 = load <4 x float>* %1, align 16
|
||||
%3 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %2, i32 192)
|
||||
ret <4 x float> %3
|
||||
}
|
||||
|
||||
define <4 x float> @insertps_from_broadcast_loadf32(<4 x float> %a, float* nocapture readonly %fb, i64 %index) {
|
||||
; CHECK-LABEL: insertps_from_broadcast_loadf32:
|
||||
; On X32, account for the arguments' move to registers
|
||||
; X32: movl 8(%esp), %eax
|
||||
; X32: movl 4(%esp), %ecx
|
||||
; CHECK-NOT: mov
|
||||
; CHECK: insertps $48
|
||||
; CHECK-NEXT: ret
|
||||
%1 = getelementptr inbounds float* %fb, i64 %index
|
||||
%2 = load float* %1, align 4
|
||||
%3 = insertelement <4 x float> undef, float %2, i32 0
|
||||
%4 = insertelement <4 x float> %3, float %2, i32 1
|
||||
%5 = insertelement <4 x float> %4, float %2, i32 2
|
||||
%6 = insertelement <4 x float> %5, float %2, i32 3
|
||||
%7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
|
||||
ret <4 x float> %7
|
||||
}
|
||||
|
||||
define <4 x float> @insertps_from_broadcast_loadv4f32(<4 x float> %a, <4 x float>* nocapture readonly %b) {
|
||||
; CHECK-LABEL: insertps_from_broadcast_loadv4f32:
|
||||
; On X32, account for the arguments' move to registers
|
||||
; X32: movl 4(%esp), %{{...}}
|
||||
; CHECK-NOT: mov
|
||||
; CHECK: insertps $48
|
||||
; CHECK-NEXT: ret
|
||||
%1 = load <4 x float>* %b, align 4
|
||||
%2 = extractelement <4 x float> %1, i32 0
|
||||
%3 = insertelement <4 x float> undef, float %2, i32 0
|
||||
%4 = insertelement <4 x float> %3, float %2, i32 1
|
||||
%5 = insertelement <4 x float> %4, float %2, i32 2
|
||||
%6 = insertelement <4 x float> %5, float %2, i32 3
|
||||
%7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
|
||||
ret <4 x float> %7
|
||||
}
|
||||
|
||||
;; FIXME: We're emitting an extraneous pshufd/vbroadcast.
|
||||
define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, float* nocapture readonly %fb, i64 %index) {
|
||||
; CHECK-LABEL: insertps_from_broadcast_multiple_use:
|
||||
; On X32, account for the arguments' move to registers
|
||||
; X32: movl 8(%esp), %eax
|
||||
; X32: movl 4(%esp), %ecx
|
||||
; CHECK: vbroadcastss
|
||||
; CHECK-NOT: mov
|
||||
; CHECK: insertps $48
|
||||
; CHECK: insertps $48
|
||||
; CHECK: insertps $48
|
||||
; CHECK: insertps $48
|
||||
; CHECK: vaddps
|
||||
; CHECK: vaddps
|
||||
; CHECK: vaddps
|
||||
; CHECK-NEXT: ret
|
||||
%1 = getelementptr inbounds float* %fb, i64 %index
|
||||
%2 = load float* %1, align 4
|
||||
%3 = insertelement <4 x float> undef, float %2, i32 0
|
||||
%4 = insertelement <4 x float> %3, float %2, i32 1
|
||||
%5 = insertelement <4 x float> %4, float %2, i32 2
|
||||
%6 = insertelement <4 x float> %5, float %2, i32 3
|
||||
%7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
|
||||
%8 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %b, <4 x float> %6, i32 48)
|
||||
%9 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %c, <4 x float> %6, i32 48)
|
||||
%10 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %d, <4 x float> %6, i32 48)
|
||||
%11 = fadd <4 x float> %7, %8
|
||||
%12 = fadd <4 x float> %9, %10
|
||||
%13 = fadd <4 x float> %11, %12
|
||||
ret <4 x float> %13
|
||||
}
|
||||
|
@ -5,7 +5,7 @@
|
||||
; loads from m32.
|
||||
define void @sample_test(<4 x float>* %source, <2 x float>* %dest) nounwind {
|
||||
; CHECK: sample_test
|
||||
; CHECK: movaps
|
||||
; CHECK-NOT: movaps
|
||||
; CHECK: insertps
|
||||
entry:
|
||||
%source.addr = alloca <4 x float>*, align 8
|
||||
|
@ -584,3 +584,111 @@ define <8 x i16> @blendvb_fallback(<8 x i1> %mask, <8 x i16> %x, <8 x i16> %y) {
|
||||
%ret = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> %y
|
||||
ret <8 x i16> %ret
|
||||
}
|
||||
|
||||
define <4 x float> @insertps_from_vector_load(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
|
||||
; CHECK-LABEL: insertps_from_vector_load:
|
||||
; On X32, account for the argument's move to registers
|
||||
; X32: movl 4(%esp), %eax
|
||||
; CHECK-NOT: mov
|
||||
; CHECK: insertps $48
|
||||
; CHECK-NEXT: ret
|
||||
%1 = load <4 x float>* %pb, align 16
|
||||
%2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 48)
|
||||
ret <4 x float> %2
|
||||
}
|
||||
|
||||
;; Use a non-zero CountS for insertps
|
||||
define <4 x float> @insertps_from_vector_load_offset(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
|
||||
; CHECK-LABEL: insertps_from_vector_load_offset:
|
||||
; On X32, account for the argument's move to registers
|
||||
; X32: movl 4(%esp), %eax
|
||||
; CHECK-NOT: mov
|
||||
;; Try to match a bit more of the instr, since we need the load's offset.
|
||||
; CHECK: insertps $96, 4(%{{...}}), %
|
||||
; CHECK-NEXT: ret
|
||||
%1 = load <4 x float>* %pb, align 16
|
||||
%2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 96)
|
||||
ret <4 x float> %2
|
||||
}
|
||||
|
||||
define <4 x float> @insertps_from_vector_load_offset_2(<4 x float> %a, <4 x float>* nocapture readonly %pb, i64 %index) {
|
||||
; CHECK-LABEL: insertps_from_vector_load_offset_2:
|
||||
; On X32, account for the argument's move to registers
|
||||
; X32: movl 4(%esp), %eax
|
||||
; X32: movl 8(%esp), %ecx
|
||||
; CHECK-NOT: mov
|
||||
;; Try to match a bit more of the instr, since we need the load's offset.
|
||||
; CHECK: insertps $192, 12(%{{...}},%{{...}}), %
|
||||
; CHECK-NEXT: ret
|
||||
%1 = getelementptr inbounds <4 x float>* %pb, i64 %index
|
||||
%2 = load <4 x float>* %1, align 16
|
||||
%3 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %2, i32 192)
|
||||
ret <4 x float> %3
|
||||
}
|
||||
|
||||
define <4 x float> @insertps_from_broadcast_loadf32(<4 x float> %a, float* nocapture readonly %fb, i64 %index) {
|
||||
; CHECK-LABEL: insertps_from_broadcast_loadf32:
|
||||
; On X32, account for the arguments' move to registers
|
||||
; X32: movl 8(%esp), %eax
|
||||
; X32: movl 4(%esp), %ecx
|
||||
; CHECK-NOT: mov
|
||||
; CHECK: insertps $48
|
||||
; CHECK-NEXT: ret
|
||||
%1 = getelementptr inbounds float* %fb, i64 %index
|
||||
%2 = load float* %1, align 4
|
||||
%3 = insertelement <4 x float> undef, float %2, i32 0
|
||||
%4 = insertelement <4 x float> %3, float %2, i32 1
|
||||
%5 = insertelement <4 x float> %4, float %2, i32 2
|
||||
%6 = insertelement <4 x float> %5, float %2, i32 3
|
||||
%7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
|
||||
ret <4 x float> %7
|
||||
}
|
||||
|
||||
define <4 x float> @insertps_from_broadcast_loadv4f32(<4 x float> %a, <4 x float>* nocapture readonly %b) {
|
||||
; CHECK-LABEL: insertps_from_broadcast_loadv4f32:
|
||||
; On X32, account for the arguments' move to registers
|
||||
; X32: movl 4(%esp), %{{...}}
|
||||
; CHECK-NOT: mov
|
||||
; CHECK: insertps $48
|
||||
; CHECK-NEXT: ret
|
||||
%1 = load <4 x float>* %b, align 4
|
||||
%2 = extractelement <4 x float> %1, i32 0
|
||||
%3 = insertelement <4 x float> undef, float %2, i32 0
|
||||
%4 = insertelement <4 x float> %3, float %2, i32 1
|
||||
%5 = insertelement <4 x float> %4, float %2, i32 2
|
||||
%6 = insertelement <4 x float> %5, float %2, i32 3
|
||||
%7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
|
||||
ret <4 x float> %7
|
||||
}
|
||||
|
||||
;; FIXME: We're emitting an extraneous pshufd/vbroadcast.
|
||||
define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, float* nocapture readonly %fb, i64 %index) {
|
||||
; CHECK-LABEL: insertps_from_broadcast_multiple_use:
|
||||
; On X32, account for the arguments' move to registers
|
||||
; X32: movl 8(%esp), %eax
|
||||
; X32: movl 4(%esp), %ecx
|
||||
; CHECK: movss
|
||||
; CHECK-NOT: mov
|
||||
; CHECK: insertps $48
|
||||
; CHECK: insertps $48
|
||||
; CHECK: insertps $48
|
||||
; CHECK: insertps $48
|
||||
; CHECK: addps
|
||||
; CHECK: addps
|
||||
; CHECK: addps
|
||||
; CHECK-NEXT: ret
|
||||
%1 = getelementptr inbounds float* %fb, i64 %index
|
||||
%2 = load float* %1, align 4
|
||||
%3 = insertelement <4 x float> undef, float %2, i32 0
|
||||
%4 = insertelement <4 x float> %3, float %2, i32 1
|
||||
%5 = insertelement <4 x float> %4, float %2, i32 2
|
||||
%6 = insertelement <4 x float> %5, float %2, i32 3
|
||||
%7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
|
||||
%8 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %b, <4 x float> %6, i32 48)
|
||||
%9 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %c, <4 x float> %6, i32 48)
|
||||
%10 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %d, <4 x float> %6, i32 48)
|
||||
%11 = fadd <4 x float> %7, %8
|
||||
%12 = fadd <4 x float> %9, %10
|
||||
%13 = fadd <4 x float> %11, %12
|
||||
ret <4 x float> %13
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user