mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2025-01-10 05:41:40 +00:00
[X86] - Catch extra combine opportunities for redundant imuls.
When we fold "mul ((add x, c1), c1)" -> "add ((mul x, c2), c1*c2)", we bail if (add x, c1) has multiple users which would result in an extra add instruction. In such cases, this patch adds a check to see if we can eliminate a multiply instruction in exchange for the extra add. I also added the capability of doing the existing optimization with non-splatted vectors (splatted also works). Differential Revision: http://reviews.llvm.org/D13740 llvm-svn: 251028
This commit is contained in:
parent
b8e5332e87
commit
4fd5110a5a
@ -403,6 +403,14 @@ namespace {
|
||||
unsigned SequenceNum;
|
||||
};
|
||||
|
||||
/// This is a helper function for visitMUL to check the profitability
|
||||
/// of folding (mul (add x, c1), c2) -> (add (mul x, c2), c1*c2).
|
||||
/// MulNode is the original multiply, AddNode is (add x, c1),
|
||||
/// and ConstNode is c2.
|
||||
bool isMulAddWithConstProfitable(SDNode *MulNode,
|
||||
SDValue &AddNode,
|
||||
SDValue &ConstNode);
|
||||
|
||||
/// This is a helper function for MergeStoresOfConstantsOrVecElts. Returns a
|
||||
/// constant build_vector of the stored constant values in Stores.
|
||||
SDValue getMergedConstantVectorStore(SelectionDAG &DAG,
|
||||
@ -2139,14 +2147,15 @@ SDValue DAGCombiner::visitMUL(SDNode *N) {
|
||||
}
|
||||
|
||||
// fold (mul (add x, c1), c2) -> (add (mul x, c2), c1*c2)
|
||||
if (N1IsConst && N0.getOpcode() == ISD::ADD && N0.getNode()->hasOneUse() &&
|
||||
(isConstantSplatVector(N0.getOperand(1).getNode(), Val) ||
|
||||
isa<ConstantSDNode>(N0.getOperand(1))))
|
||||
return DAG.getNode(ISD::ADD, SDLoc(N), VT,
|
||||
DAG.getNode(ISD::MUL, SDLoc(N0), VT,
|
||||
N0.getOperand(0), N1),
|
||||
DAG.getNode(ISD::MUL, SDLoc(N1), VT,
|
||||
N0.getOperand(1), N1));
|
||||
if (isConstantIntBuildVectorOrConstantInt(N1) &&
|
||||
N0.getOpcode() == ISD::ADD &&
|
||||
isConstantIntBuildVectorOrConstantInt(N0.getOperand(1)) &&
|
||||
isMulAddWithConstProfitable(N, N0, N1))
|
||||
return DAG.getNode(ISD::ADD, SDLoc(N), VT,
|
||||
DAG.getNode(ISD::MUL, SDLoc(N0), VT,
|
||||
N0.getOperand(0), N1),
|
||||
DAG.getNode(ISD::MUL, SDLoc(N1), VT,
|
||||
N0.getOperand(1), N1));
|
||||
|
||||
// reassociate mul
|
||||
if (SDValue RMUL = ReassociateOps(ISD::MUL, SDLoc(N), N0, N1))
|
||||
@ -10839,6 +10848,81 @@ struct BaseIndexOffset {
|
||||
};
|
||||
} // namespace
|
||||
|
||||
// This is a helper function for visitMUL to check the profitability
|
||||
// of folding (mul (add x, c1), c2) -> (add (mul x, c2), c1*c2).
|
||||
// MulNode is the original multiply, AddNode is (add x, c1),
|
||||
// and ConstNode is c2.
|
||||
//
|
||||
// If the (add x, c1) has multiple uses, we could increase
|
||||
// the number of adds if we make this transformation.
|
||||
// It would only be worth doing this if we can remove a
|
||||
// multiply in the process. Check for that here.
|
||||
// To illustrate:
|
||||
// (A + c1) * c3
|
||||
// (A + c2) * c3
|
||||
// We're checking for cases where we have common "c3 * A" expressions.
|
||||
bool DAGCombiner::isMulAddWithConstProfitable(SDNode *MulNode,
|
||||
SDValue &AddNode,
|
||||
SDValue &ConstNode) {
|
||||
APInt Val;
|
||||
|
||||
// If the add only has one use, this would be OK to do.
|
||||
if (AddNode.getNode()->hasOneUse())
|
||||
return true;
|
||||
|
||||
// Walk all the users of the constant with which we're multiplying.
|
||||
for (SDNode *Use : ConstNode->uses()) {
|
||||
|
||||
if (Use == MulNode) // This use is the one we're on right now. Skip it.
|
||||
continue;
|
||||
|
||||
if (Use->getOpcode() == ISD::MUL) { // We have another multiply use.
|
||||
SDNode *OtherOp;
|
||||
SDNode *MulVar = AddNode.getOperand(0).getNode();
|
||||
|
||||
// OtherOp is what we're multiplying against the constant.
|
||||
if (Use->getOperand(0) == ConstNode)
|
||||
OtherOp = Use->getOperand(1).getNode();
|
||||
else
|
||||
OtherOp = Use->getOperand(0).getNode();
|
||||
|
||||
// Check to see if multiply is with the same operand of our "add".
|
||||
//
|
||||
// ConstNode = CONST
|
||||
// Use = ConstNode * A <-- visiting Use. OtherOp is A.
|
||||
// ...
|
||||
// AddNode = (A + c1) <-- MulVar is A.
|
||||
// = AddNode * ConstNode <-- current visiting instruction.
|
||||
//
|
||||
// If we make this transformation, we will have a common
|
||||
// multiply (ConstNode * A) that we can save.
|
||||
if (OtherOp == MulVar)
|
||||
return true;
|
||||
|
||||
// Now check to see if a future expansion will give us a common
|
||||
// multiply.
|
||||
//
|
||||
// ConstNode = CONST
|
||||
// AddNode = (A + c1)
|
||||
// ... = AddNode * ConstNode <-- current visiting instruction.
|
||||
// ...
|
||||
// OtherOp = (A + c2)
|
||||
// Use = OtherOp * ConstNode <-- visiting Use.
|
||||
//
|
||||
// If we make this transformation, we will have a common
|
||||
// multiply (CONST * A) after we also do the same transformation
|
||||
// to the "t2" instruction.
|
||||
if (OtherOp->getOpcode() == ISD::ADD &&
|
||||
isConstantIntBuildVectorOrConstantInt(OtherOp->getOperand(1)) &&
|
||||
OtherOp->getOperand(0).getNode() == MulVar)
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
// Didn't find a case where this would be profitable.
|
||||
return false;
|
||||
}
|
||||
|
||||
SDValue DAGCombiner::getMergedConstantVectorStore(SelectionDAG &DAG,
|
||||
SDLoc SL,
|
||||
ArrayRef<MemOpLink> Stores,
|
||||
|
163
test/CodeGen/X86/combine-multiplies.ll
Normal file
163
test/CodeGen/X86/combine-multiplies.ll
Normal file
@ -0,0 +1,163 @@
|
||||
; RUN: llc < %s -mattr=sse2 -mtriple=i386-unknown-linux-gnu | FileCheck %s
|
||||
|
||||
; Source file looks something like this:
|
||||
;
|
||||
; typedef int AAA[100][100];
|
||||
;
|
||||
; void testCombineMultiplies(AAA a,int lll)
|
||||
; {
|
||||
; int LOC = lll + 5;
|
||||
;
|
||||
; a[LOC][LOC] = 11;
|
||||
;
|
||||
; a[LOC][20] = 22;
|
||||
; a[LOC+20][20] = 33;
|
||||
; }
|
||||
;
|
||||
; We want to make sure we don't generate 2 multiply instructions,
|
||||
; one for a[LOC][] and one for a[LOC+20]. visitMUL in DAGCombiner.cpp
|
||||
; should combine the instructions in such a way to avoid the extra
|
||||
; multiply.
|
||||
;
|
||||
; Output looks roughly like this:
|
||||
;
|
||||
; movl 8(%esp), %eax
|
||||
; movl 12(%esp), %ecx
|
||||
; imull $400, %ecx, %edx # imm = 0x190
|
||||
; leal (%edx,%eax), %esi
|
||||
; movl $11, 2020(%esi,%ecx,4)
|
||||
; movl $22, 2080(%edx,%eax)
|
||||
; movl $33, 10080(%edx,%eax)
|
||||
;
|
||||
; CHECK-LABEL: testCombineMultiplies
|
||||
; CHECK: imull $400, [[ARG1:%[a-z]+]], [[MUL:%[a-z]+]] # imm = 0x190
|
||||
; CHECK-NEXT: leal ([[MUL]],[[ARG2:%[a-z]+]]), [[LEA:%[a-z]+]]
|
||||
; CHECK-NEXT: movl $11, {{[0-9]+}}([[LEA]],[[ARG1]],4)
|
||||
; CHECK-NEXT: movl $22, {{[0-9]+}}([[MUL]],[[ARG2]])
|
||||
; CHECK-NEXT: movl $33, {{[0-9]+}}([[MUL]],[[ARG2]])
|
||||
; CHECK: retl
|
||||
;
|
||||
|
||||
; Function Attrs: nounwind
|
||||
define void @testCombineMultiplies([100 x i32]* nocapture %a, i32 %lll) {
|
||||
entry:
|
||||
%add = add nsw i32 %lll, 5
|
||||
%arrayidx1 = getelementptr inbounds [100 x i32], [100 x i32]* %a, i32 %add, i32 %add
|
||||
store i32 11, i32* %arrayidx1, align 4
|
||||
%arrayidx3 = getelementptr inbounds [100 x i32], [100 x i32]* %a, i32 %add, i32 20
|
||||
store i32 22, i32* %arrayidx3, align 4
|
||||
%add4 = add nsw i32 %lll, 25
|
||||
%arrayidx6 = getelementptr inbounds [100 x i32], [100 x i32]* %a, i32 %add4, i32 20
|
||||
store i32 33, i32* %arrayidx6, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
|
||||
; Test for the same optimization on vector multiplies.
|
||||
;
|
||||
; Source looks something like this:
|
||||
;
|
||||
; typedef int v4int __attribute__((__vector_size__(16)));
|
||||
;
|
||||
; v4int x;
|
||||
; v4int v2, v3;
|
||||
; void testCombineMultiplies_splat(v4int v1) {
|
||||
; v2 = (v1 + (v4int){ 11, 11, 11, 11 }) * (v4int) {22, 22, 22, 22};
|
||||
; v3 = (v1 + (v4int){ 33, 33, 33, 33 }) * (v4int) {22, 22, 22, 22};
|
||||
; x = (v1 + (v4int){ 11, 11, 11, 11 });
|
||||
; }
|
||||
;
|
||||
; Output looks something like this:
|
||||
;
|
||||
; testCombineMultiplies_splat: # @testCombineMultiplies_splat
|
||||
; # BB#0: # %entry
|
||||
; movdqa .LCPI1_0, %xmm1 # xmm1 = [11,11,11,11]
|
||||
; paddd %xmm0, %xmm1
|
||||
; movdqa .LCPI1_1, %xmm2 # xmm2 = [22,22,22,22]
|
||||
; pshufd $245, %xmm0, %xmm3 # xmm3 = xmm0[1,1,3,3]
|
||||
; pmuludq %xmm2, %xmm0
|
||||
; pshufd $232, %xmm0, %xmm0 # xmm0 = xmm0[0,2,2,3]
|
||||
; pmuludq %xmm2, %xmm3
|
||||
; pshufd $232, %xmm3, %xmm2 # xmm2 = xmm3[0,2,2,3]
|
||||
; punpckldq %xmm2, %xmm0 # xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
|
||||
; movdqa .LCPI1_2, %xmm2 # xmm2 = [242,242,242,242]
|
||||
; paddd %xmm0, %xmm2
|
||||
; paddd .LCPI1_3, %xmm0
|
||||
; movdqa %xmm2, v2
|
||||
; movdqa %xmm0, v3
|
||||
; movdqa %xmm1, x
|
||||
; retl
|
||||
;
|
||||
; Again, we want to make sure we don't generate two different multiplies.
|
||||
; We should have a single multiply for "v1 * {22, 22, 22, 22}" (made up of two
|
||||
; pmuludq instructions), followed by two adds. Without this optimization, we'd
|
||||
; do 2 adds, followed by 2 multiplies (i.e. 4 pmuludq instructions).
|
||||
;
|
||||
; CHECK-LABEL: testCombineMultiplies_splat
|
||||
; CHECK: movdqa .LCPI1_0, [[C11:%xmm[0-9]]]
|
||||
; CHECK-NEXT: paddd %xmm0, [[C11]]
|
||||
; CHECK-NEXT: movdqa .LCPI1_1, [[C22:%xmm[0-9]]]
|
||||
; CHECK-NEXT: pshufd $245, %xmm0, [[T1:%xmm[0-9]]]
|
||||
; CHECK-NEXT: pmuludq [[C22]], [[T2:%xmm[0-9]]]
|
||||
; CHECK-NEXT: pshufd $232, [[T2]], [[T3:%xmm[0-9]]]
|
||||
; CHECK-NEXT: pmuludq [[C22]], [[T4:%xmm[0-9]]]
|
||||
; CHECK-NEXT: pshufd $232, [[T4]], [[T5:%xmm[0-9]]]
|
||||
; CHECK-NEXT: punpckldq [[T5]], [[T6:%xmm[0-9]]]
|
||||
; CHECK-NEXT: movdqa .LCPI1_2, [[C242:%xmm[0-9]]]
|
||||
; CHECK-NEXT: paddd [[T6]], [[C242]]
|
||||
; CHECK-NEXT: paddd .LCPI1_3, [[C726:%xmm[0-9]]]
|
||||
; CHECK-NEXT: movdqa [[C242]], v2
|
||||
; CHECK-NEXT: [[C726]], v3
|
||||
; CHECK-NEXT: [[C11]], x
|
||||
; CHECK-NEXT: retl
|
||||
|
||||
@v2 = common global <4 x i32> zeroinitializer, align 16
|
||||
@v3 = common global <4 x i32> zeroinitializer, align 16
|
||||
@x = common global <4 x i32> zeroinitializer, align 16
|
||||
|
||||
; Function Attrs: nounwind
|
||||
define void @testCombineMultiplies_splat(<4 x i32> %v1) {
|
||||
entry:
|
||||
%add1 = add <4 x i32> %v1, <i32 11, i32 11, i32 11, i32 11>
|
||||
%mul1 = mul <4 x i32> %add1, <i32 22, i32 22, i32 22, i32 22>
|
||||
%add2 = add <4 x i32> %v1, <i32 33, i32 33, i32 33, i32 33>
|
||||
%mul2 = mul <4 x i32> %add2, <i32 22, i32 22, i32 22, i32 22>
|
||||
store <4 x i32> %mul1, <4 x i32>* @v2, align 16
|
||||
store <4 x i32> %mul2, <4 x i32>* @v3, align 16
|
||||
store <4 x i32> %add1, <4 x i32>* @x, align 16
|
||||
ret void
|
||||
}
|
||||
|
||||
; Finally, check the non-splatted vector case. This is very similar
|
||||
; to the previous test case, except for the vector values.
|
||||
;
|
||||
; CHECK-LABEL: testCombineMultiplies_non_splat
|
||||
; CHECK: movdqa .LCPI2_0, [[C11:%xmm[0-9]]]
|
||||
; CHECK-NEXT: paddd %xmm0, [[C11]]
|
||||
; CHECK-NEXT: movdqa .LCPI2_1, [[C22:%xmm[0-9]]]
|
||||
; CHECK-NEXT: pshufd $245, %xmm0, [[T1:%xmm[0-9]]]
|
||||
; CHECK-NEXT: pmuludq [[C22]], [[T2:%xmm[0-9]]]
|
||||
; CHECK-NEXT: pshufd $232, [[T2]], [[T3:%xmm[0-9]]]
|
||||
; CHECK-NEXT: pshufd $245, [[C22]], [[T7:%xmm[0-9]]]
|
||||
; CHECK-NEXT: pmuludq [[T1]], [[T7]]
|
||||
; CHECK-NEXT: pshufd $232, [[T7]], [[T5:%xmm[0-9]]]
|
||||
; CHECK-NEXT: punpckldq [[T5]], [[T6:%xmm[0-9]]]
|
||||
; CHECK-NEXT: movdqa .LCPI2_2, [[C242:%xmm[0-9]]]
|
||||
; CHECK-NEXT: paddd [[T6]], [[C242]]
|
||||
; CHECK-NEXT: paddd .LCPI2_3, [[C726:%xmm[0-9]]]
|
||||
; CHECK-NEXT: movdqa [[C242]], v2
|
||||
; CHECK-NEXT: [[C726]], v3
|
||||
; CHECK-NEXT: [[C11]], x
|
||||
; CHECK-NEXT: retl
|
||||
; Function Attrs: nounwind
|
||||
define void @testCombineMultiplies_non_splat(<4 x i32> %v1) {
|
||||
entry:
|
||||
%add1 = add <4 x i32> %v1, <i32 11, i32 22, i32 33, i32 44>
|
||||
%mul1 = mul <4 x i32> %add1, <i32 22, i32 33, i32 44, i32 55>
|
||||
%add2 = add <4 x i32> %v1, <i32 33, i32 44, i32 55, i32 66>
|
||||
%mul2 = mul <4 x i32> %add2, <i32 22, i32 33, i32 44, i32 55>
|
||||
store <4 x i32> %mul1, <4 x i32>* @v2, align 16
|
||||
store <4 x i32> %mul2, <4 x i32>* @v3, align 16
|
||||
store <4 x i32> %add1, <4 x i32>* @x, align 16
|
||||
ret void
|
||||
}
|
Loading…
Reference in New Issue
Block a user