Favor pshufd over shufps when shuffling elements that all come from a single vector; pshufd is faster than shufps.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@49244 91177308-0d34-0410-b5e6-96231b3b80d8
parent a4091d34f3
commit 0c0f83ff5d
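As an illustration of the change, here is a minimal LLVM IR sketch adapted from the vec_shuffle-16.ll test added below (the function name is illustrative). Every result element comes from the single source %b, so with -mattr=+sse2 the shuffle is expected to be selected as the integer-domain pshufd rather than shufps:

; Single-source <4 x float> splat: all lanes come from %b, so an SSE2 target
; should prefer pshufd over shufps for this shuffle.
define <4 x float> @single_source_splat(<4 x float> %a, <4 x float> %b) nounwind {
        %tmp1 = shufflevector <4 x float> %b, <4 x float> undef, <4 x i32> zeroinitializer
        ret <4 x float> %tmp1
}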
@@ -2782,23 +2782,28 @@ static SDOperand getSwapEltZeroMask(unsigned NumElems, unsigned DestElt,
   return DAG.getNode(ISD::BUILD_VECTOR, MaskVT, &MaskVec[0], MaskVec.size());
 }
 
-/// PromoteSplat - Promote a splat of v8i16 or v16i8 to v4i32.
-///
-static SDOperand PromoteSplat(SDOperand Op, SelectionDAG &DAG) {
+/// PromoteSplat - Promote a splat of v4f32, v8i16 or v16i8 to v4i32.
+static SDOperand PromoteSplat(SDOperand Op, SelectionDAG &DAG, bool HasSSE2) {
+  MVT::ValueType PVT = HasSSE2 ? MVT::v4i32 : MVT::v4f32;
+  MVT::ValueType VT = Op.getValueType();
+  if (PVT == VT)
+    return Op;
   SDOperand V1 = Op.getOperand(0);
   SDOperand Mask = Op.getOperand(2);
-  MVT::ValueType VT = Op.getValueType();
   unsigned NumElems = Mask.getNumOperands();
-  Mask = getUnpacklMask(NumElems, DAG);
-  while (NumElems != 4) {
-    V1 = DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V1, Mask);
-    NumElems >>= 1;
+  // Special handling of v4f32 -> v4i32.
+  if (VT != MVT::v4f32) {
+    Mask = getUnpacklMask(NumElems, DAG);
+    while (NumElems > 4) {
+      V1 = DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V1, Mask);
+      NumElems >>= 1;
+    }
+    Mask = getZeroVector(MVT::v4i32, DAG);
   }
-  V1 = DAG.getNode(ISD::BIT_CONVERT, MVT::v4i32, V1);
 
-  Mask = getZeroVector(MVT::v4i32, DAG);
-  SDOperand Shuffle = DAG.getNode(ISD::VECTOR_SHUFFLE, MVT::v4i32, V1,
-                                  DAG.getNode(ISD::UNDEF, MVT::v4i32), Mask);
+  V1 = DAG.getNode(ISD::BIT_CONVERT, PVT, V1);
+  SDOperand Shuffle = DAG.getNode(ISD::VECTOR_SHUFFLE, PVT, V1,
+                                  DAG.getNode(ISD::UNDEF, PVT), Mask);
   return DAG.getNode(ISD::BIT_CONVERT, VT, Shuffle);
 }
 
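For reference, a minimal sketch (function name illustrative) of an input that exercises the PromoteSplat path above: a v8i16 splat is unpacked down to four elements, bitcast to v4i32 (v4f32 when SSE2 is unavailable), splatted there with an all-zero shuffle mask, and bitcast back to the original type.

; An <8 x i16> splat; PromoteSplat performs the splat in the v4i32 domain
; (v4f32 without SSE2) and bitcasts the result back to <8 x i16>.
define <8 x i16> @splat_v8i16(<8 x i16> %x) nounwind {
        %s = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> zeroinitializer
        ret <8 x i16> %s
}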
@@ -3426,6 +3431,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDOperand Op, SelectionDAG &DAG) {
   SDOperand PermMask = Op.getOperand(2);
   MVT::ValueType VT = Op.getValueType();
   unsigned NumElems = PermMask.getNumOperands();
+  bool isMMX = MVT::getSizeInBits(VT) == 64;
   bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
   bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
   bool V1IsSplat = false;
@@ -3443,9 +3449,9 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDOperand Op, SelectionDAG &DAG) {
     return V2;
 
   if (isSplatMask(PermMask.Val)) {
-    if (NumElems <= 4) return Op;
-    // Promote it to a v4i32 splat.
-    return PromoteSplat(Op, DAG);
+    if (isMMX || NumElems < 4) return Op;
+    // Promote it to a v4{if}32 splat.
+    return PromoteSplat(Op, DAG, Subtarget->hasSSE2());
   }
 
   // If the shuffle can be profitably rewritten as a narrower shuffle, then
@@ -3556,35 +3562,39 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDOperand Op, SelectionDAG &DAG) {
       return Op;
   }
 
-  // If VT is integer, try PSHUF* first, then SHUFP*.
-  if (MVT::isInteger(VT)) {
-    // MMX doesn't have PSHUFD; it does have PSHUFW. While it's theoretically
-    // possible to shuffle a v2i32 using PSHUFW, that's not yet implemented.
-    if (((MVT::getSizeInBits(VT) != 64 || NumElems == 4) &&
-         X86::isPSHUFDMask(PermMask.Val)) ||
-        X86::isPSHUFHWMask(PermMask.Val) ||
-        X86::isPSHUFLWMask(PermMask.Val)) {
-      if (V2.getOpcode() != ISD::UNDEF)
-        return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1,
-                           DAG.getNode(ISD::UNDEF, V1.getValueType()),PermMask);
+  // Try PSHUF* first, then SHUFP*.
+  // MMX doesn't have PSHUFD but it does have PSHUFW. While it's theoretically
+  // possible to shuffle a v2i32 using PSHUFW, that's not yet implemented.
+  if (isMMX && NumElems == 4 && X86::isPSHUFDMask(PermMask.Val)) {
+    if (V2.getOpcode() != ISD::UNDEF)
+      return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1,
+                         DAG.getNode(ISD::UNDEF, VT), PermMask);
     return Op;
   }
+
+  if (!isMMX) {
+    if (Subtarget->hasSSE2() &&
+        (X86::isPSHUFDMask(PermMask.Val) ||
+         X86::isPSHUFHWMask(PermMask.Val) ||
+         X86::isPSHUFLWMask(PermMask.Val))) {
+      MVT::ValueType RVT = VT;
+      if (VT == MVT::v4f32) {
+        RVT = MVT::v4i32;
+        Op = DAG.getNode(ISD::VECTOR_SHUFFLE, RVT,
+                         DAG.getNode(ISD::BIT_CONVERT, RVT, V1),
+                         DAG.getNode(ISD::UNDEF, RVT), PermMask);
+      } else if (V2.getOpcode() != ISD::UNDEF)
+        Op = DAG.getNode(ISD::VECTOR_SHUFFLE, RVT, V1,
+                         DAG.getNode(ISD::UNDEF, RVT), PermMask);
+      if (RVT != VT)
+        Op = DAG.getNode(ISD::BIT_CONVERT, VT, Op);
+      return Op;
+    }
 
-    if (X86::isSHUFPMask(PermMask.Val) &&
-        MVT::getSizeInBits(VT) != 64)    // Don't do this for MMX.
+    // Binary or unary shufps.
+    if (X86::isSHUFPMask(PermMask.Val) ||
+        (V2.getOpcode() == ISD::UNDEF && X86::isPSHUFDMask(PermMask.Val)))
       return Op;
-  } else {
-    // Floating point cases in the other order.
-    if (X86::isSHUFPMask(PermMask.Val))
-      return Op;
-    if (X86::isPSHUFDMask(PermMask.Val) ||
-        X86::isPSHUFHWMask(PermMask.Val) ||
-        X86::isPSHUFLWMask(PermMask.Val)) {
-      if (V2.getOpcode() != ISD::UNDEF)
-        return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1,
-                           DAG.getNode(ISD::UNDEF, V1.getValueType()),PermMask);
-      return Op;
-    }
   }
 
   // Handle v8i16 specifically since SSE can do byte extraction and insertion.
@@ -3595,7 +3605,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDOperand Op, SelectionDAG &DAG) {
   }
 
   // Handle all 4 wide cases with a number of shuffles.
-  if (NumElems == 4 && MVT::getSizeInBits(VT) != 64) {
+  if (NumElems == 4 && !isMMX) {
     // Don't do this for MMX.
     MVT::ValueType MaskVT = PermMask.getValueType();
     MVT::ValueType MaskEVT = MVT::getVectorElementType(MaskVT);
@@ -2803,13 +2803,7 @@ def : Pat<(vector_shuffle (v2i64 VR128:$src), (undef), UNPCKH_shuffle_mask:$sm),
           (PUNPCKHQDQrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>;
 }
-
-// Splat v4f32
-def : Pat<(vector_shuffle (v4f32 VR128:$src), (undef), SSE_splat_mask:$sm),
-          (SHUFPSrri VR128:$src, VR128:$src, SSE_splat_mask:$sm)>,
-      Requires<[HasSSE1]>;
-
 // Special unary SHUFPSrri case.
 // FIXME: when we want non two-address code, then we should use PSHUFD?
 def : Pat<(v4f32 (vector_shuffle VR128:$src1, (undef),
            SHUFP_unary_shuffle_mask:$sm)),
           (SHUFPSrri VR128:$src1, VR128:$src1, SHUFP_unary_shuffle_mask:$sm)>,
@@ -2820,7 +2814,7 @@ def : Pat<(v2f64 (vector_shuffle VR128:$src1, (undef),
           (SHUFPDrri VR128:$src1, VR128:$src1, SHUFP_unary_shuffle_mask:$sm)>,
       Requires<[HasSSE2]>;
 // Unary v4f32 shuffle with PSHUF* in order to fold a load.
-def : Pat<(vector_shuffle (memopv4f32 addr:$src1), (undef),
+def : Pat<(vector_shuffle (bc_v4i32 (memopv4f32 addr:$src1)), (undef),
           SHUFP_unary_shuffle_mask:$sm),
           (PSHUFDmi addr:$src1, SHUFP_unary_shuffle_mask:$sm)>,
       Requires<[HasSSE2]>;
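A hypothetical example (names are illustrative) of the load-folding case the pattern above targets, assuming the shuffle mask qualifies as a SHUFP_unary_shuffle_mask: a unary shuffle of a loaded <4 x float>, which the bc_v4i32 pattern lets the SSE2 selector emit as pshufd with a memory operand, folding the load.

; Unary shuffle of a loaded v4f32; with SSE2 the load can be folded into a
; pshufd (PSHUFDmi) through the bc_v4i32 pattern above.
define <4 x float> @unary_shuffle_of_load(<4 x float>* %p) nounwind {
        %v = load <4 x float>* %p
        %s = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> < i32 3, i32 2, i32 1, i32 0 >
        ret <4 x float> %s
}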
@@ -1,6 +1,6 @@
 ; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | grep 170
 
-define i16 @f(<4 x float>* %tmp116117.i1061.i) {
+define i16 @f(<4 x float>* %tmp116117.i1061.i) nounwind {
 entry:
         alloca [4 x <4 x float>]                ; <[4 x <4 x float>]*>:0 [#uses=167]
         alloca [4 x <4 x float>]                ; <[4 x <4 x float>]*>:1 [#uses=170]
@@ -1,6 +1,6 @@
-; RUN: llvm-as < %s | llc -march=x86-64 -mattr=+sse2,-sse41 | grep {shufps \$3, %xmm0, %xmm0}
+; RUN: llvm-as < %s | llc -march=x86-64 -mattr=+sse2,-sse41 | grep {pshufd \$3, %xmm0, %xmm0}
 
-define float @foo(<8 x float> %a) {
+define float @foo(<8 x float> %a) nounwind {
         %c = extractelement <8 x float> %a, i32 3
         ret float %c
 }
@@ -1,15 +1,14 @@
 ; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 -o %t -f
-; RUN: grep shufps %t | count 1
-; RUN: grep pshufd %t | count 1
+; RUN: grep pshufd %t | count 2
 
-define <4 x float> @test(float %a) {
+define <4 x float> @test(float %a) nounwind {
         %tmp = insertelement <4 x float> zeroinitializer, float %a, i32 1               ; <<4 x float>> [#uses=1]
         %tmp5 = insertelement <4 x float> %tmp, float 0.000000e+00, i32 2               ; <<4 x float>> [#uses=1]
         %tmp6 = insertelement <4 x float> %tmp5, float 0.000000e+00, i32 3              ; <<4 x float>> [#uses=1]
         ret <4 x float> %tmp6
 }
 
-define <2 x i64> @test2(i32 %a) {
+define <2 x i64> @test2(i32 %a) nounwind {
         %tmp7 = insertelement <4 x i32> zeroinitializer, i32 %a, i32 2          ; <<4 x i32>> [#uses=1]
         %tmp9 = insertelement <4 x i32> %tmp7, i32 0, i32 3             ; <<4 x i32>> [#uses=1]
         %tmp10 = bitcast <4 x i32> %tmp9 to <2 x i64>           ; <<2 x i64>> [#uses=1]
@@ -2,7 +2,7 @@
 ; RUN: grep unpcklps %t | count 1
 ; RUN: grep shufps %t | count 1
 
-define <4 x float> @test(float %a, float %b, float %c) {
+define <4 x float> @test(float %a, float %b, float %c) nounwind {
         %tmp = insertelement <4 x float> zeroinitializer, float %a, i32 1               ; <<4 x float>> [#uses=1]
         %tmp8 = insertelement <4 x float> %tmp, float %b, i32 2         ; <<4 x float>> [#uses=1]
         %tmp10 = insertelement <4 x float> %tmp8, float %c, i32 3               ; <<4 x float>> [#uses=1]
@@ -5,17 +5,17 @@
 ; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | grep pshuflw | count 1
 ; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | grep pshufhw | count 1
 
-define <8 x i16> @t1(<8 x i16> %A, <8 x i16> %B) {
+define <8 x i16> @t1(<8 x i16> %A, <8 x i16> %B) nounwind {
         %tmp = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 8, i32 9, i32 0, i32 1, i32 10, i32 11, i32 2, i32 3 >
         ret <8 x i16> %tmp
 }
 
-define <8 x i16> @t2(<8 x i16> %A, <8 x i16> %B) {
+define <8 x i16> @t2(<8 x i16> %A, <8 x i16> %B) nounwind {
         %tmp = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7 >
         ret <8 x i16> %tmp
 }
 
-define <8 x i16> @t3(<8 x i16> %A, <8 x i16> %B) {
+define <8 x i16> @t3(<8 x i16> %A, <8 x i16> %B) nounwind {
         %tmp = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 0, i32 0, i32 3, i32 2, i32 4, i32 6, i32 4, i32 7 >
         ret <8 x i16> %tmp
 }
test/CodeGen/X86/vec_shuffle-16.ll (new file, 25 lines)
@@ -0,0 +1,25 @@
+; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse | grep shufps | count 4
+; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse | grep mov | count 2
+; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | grep pshufd | count 4
+; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | not grep shufps
+; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | not grep mov
+
+define <4 x float> @t1(<4 x float> %a, <4 x float> %b) nounwind {
+        %tmp1 = shufflevector <4 x float> %b, <4 x float> undef, <4 x i32> zeroinitializer
+        ret <4 x float> %tmp1
+}
+
+define <4 x float> @t2(<4 x float> %A, <4 x float> %B) nounwind {
+        %tmp = shufflevector <4 x float> %A, <4 x float> %B, <4 x i32> < i32 3, i32 3, i32 3, i32 3 >
+        ret <4 x float> %tmp
+}
+
+define <4 x float> @t3(<4 x float> %A, <4 x float> %B) nounwind {
+        %tmp = shufflevector <4 x float> %A, <4 x float> %B, <4 x i32> < i32 4, i32 4, i32 4, i32 4 >
+        ret <4 x float> %tmp
+}
+
+define <4 x float> @t4(<4 x float> %A, <4 x float> %B) nounwind {
+        %tmp = shufflevector <4 x float> %A, <4 x float> %B, <4 x i32> < i32 1, i32 3, i32 2, i32 0 >
+        ret <4 x float> %tmp
+}
@@ -2,7 +2,7 @@
 ; RUN: grep movhlps %t | count 1
 ; RUN: grep shufps %t | count 1
 
-define void @test() {
+define void @test() nounwind {
         %tmp1 = load <4 x float>* null          ; <<4 x float>> [#uses=2]
         %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> < float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00 >, <4 x i32> < i32 0, i32 1, i32 6, i32 7 >              ; <<4 x float>> [#uses=1]
         %tmp3 = shufflevector <4 x float> %tmp1, <4 x float> zeroinitializer, <4 x i32> < i32 2, i32 3, i32 6, i32 7 >          ; <<4 x float>> [#uses=1]
@@ -3,7 +3,7 @@
 ; RUN: grep movupd %t | count 1
 ; RUN: grep pshufhw %t | count 1
 
-define void @test_v4sf(<4 x float>* %P, float %X, float %Y) {
+define void @test_v4sf(<4 x float>* %P, float %X, float %Y) nounwind {
         %tmp = insertelement <4 x float> zeroinitializer, float %X, i32 0               ; <<4 x float>> [#uses=1]
         %tmp2 = insertelement <4 x float> %tmp, float %X, i32 1         ; <<4 x float>> [#uses=1]
         %tmp4 = insertelement <4 x float> %tmp2, float %Y, i32 2                ; <<4 x float>> [#uses=1]
@@ -12,14 +12,14 @@ define void @test_v4sf(<4 x float>* %P, float %X, float %Y) {
         ret void
 }
 
-define void @test_v2sd(<2 x double>* %P, double %X, double %Y) {
+define void @test_v2sd(<2 x double>* %P, double %X, double %Y) nounwind {
         %tmp = insertelement <2 x double> zeroinitializer, double %X, i32 0             ; <<2 x double>> [#uses=1]
         %tmp2 = insertelement <2 x double> %tmp, double %Y, i32 1               ; <<2 x double>> [#uses=1]
         store <2 x double> %tmp2, <2 x double>* %P
         ret void
 }
 
-define void @test_v8i16(<2 x i64>* %res, <2 x i64>* %A) {
+define void @test_v8i16(<2 x i64>* %res, <2 x i64>* %A) nounwind {
         %tmp = load <2 x i64>* %A               ; <<2 x i64>> [#uses=1]
         %tmp.upgrd.1 = bitcast <2 x i64> %tmp to <8 x i16>              ; <<8 x i16>> [#uses=8]
         %tmp.upgrd.2 = extractelement <8 x i16> %tmp.upgrd.1, i32 0             ; <i16> [#uses=1]
@@ -1,6 +1,6 @@
 ; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | grep pshufd | count 1
 
-define void @test(<2 x i64>* %P, i8 %x) {
+define void @test(<2 x i64>* %P, i8 %x) nounwind {
         %tmp = insertelement <16 x i8> zeroinitializer, i8 %x, i32 0            ; <<16 x i8>> [#uses=1]
         %tmp36 = insertelement <16 x i8> %tmp, i8 %x, i32 1             ; <<16 x i8>> [#uses=1]
         %tmp38 = insertelement <16 x i8> %tmp36, i8 %x, i32 2           ; <<16 x i8>> [#uses=1]
@@ -1,7 +1,7 @@
-; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | grep shufps
+; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | grep pshufd
 ; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse3 | grep movddup
 
-define void @test_v4sf(<4 x float>* %P, <4 x float>* %Q, float %X) {
+define void @test_v4sf(<4 x float>* %P, <4 x float>* %Q, float %X) nounwind {
         %tmp = insertelement <4 x float> zeroinitializer, float %X, i32 0               ; <<4 x float>> [#uses=1]
         %tmp2 = insertelement <4 x float> %tmp, float %X, i32 1         ; <<4 x float>> [#uses=1]
         %tmp4 = insertelement <4 x float> %tmp2, float %X, i32 2                ; <<4 x float>> [#uses=1]
@@ -12,7 +12,7 @@ define void @test_v4sf(<4 x float>* %P, <4 x float>* %Q, float %X) {
         ret void
 }
 
-define void @test_v2sd(<2 x double>* %P, <2 x double>* %Q, double %X) {
+define void @test_v2sd(<2 x double>* %P, <2 x double>* %Q, double %X) nounwind {
         %tmp = insertelement <2 x double> zeroinitializer, double %X, i32 0             ; <<2 x double>> [#uses=1]
         %tmp2 = insertelement <2 x double> %tmp, double %X, i32 1               ; <<2 x double>> [#uses=1]
         %tmp4 = load <2 x double>* %Q           ; <<2 x double>> [#uses=1]