mirror of
https://github.com/RPCSX/llvm.git
synced 2024-12-12 06:06:19 +00:00
[x86] Teach the new vector shuffle lowering about the zero masking
abilities of INSERTPS which are really powerful and come up in very important contexts such as forming diagonal matrices, etc. With this I ended up being able to remove the somewhat weird helper I added for INSERTPS because we can collapse the entire state to a no-op mask. Added a bunch of tests for inserting into a zero-ish vector. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@217117 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent
e35ac41a3a
commit
fa2dfaedf2
@ -7182,21 +7182,6 @@ static bool isSingleInputShuffleMask(ArrayRef<int> Mask) {
|
||||
return true;
|
||||
}
|
||||
|
||||
/// \brief Check wether all of one set of inputs to a shuffle mask are in place.
|
||||
///
|
||||
/// Mask entries pointing at the other input or undef will be skipped.
|
||||
static bool isShuffleMaskInputInPlace(ArrayRef<int> Mask, bool LoInput = true) {
|
||||
int Size = Mask.size();
|
||||
for (int i = 0; i < Size; ++i) {
|
||||
int M = Mask[i];
|
||||
if (M == -1 || (LoInput && M >= 4) || (!LoInput && M < 4))
|
||||
continue;
|
||||
if (M - (LoInput ? 0 : Size) != i)
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Hide this symbol with an anonymous namespace instead of 'static' so that MSVC
|
||||
// 2013 will allow us to use it as a non-type template parameter.
|
||||
namespace {
|
||||
@ -7385,13 +7370,48 @@ static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
|
||||
// INSERTPS when the V1 elements are already in the correct locations
|
||||
// because otherwise we can just always use two SHUFPS instructions which
|
||||
// are much smaller to encode than a SHUFPS and an INSERTPS.
|
||||
if (Subtarget->hasSSE41() &&
|
||||
isShuffleMaskInputInPlace(Mask, /*LoInput*/ true)) {
|
||||
// Insert the V2 element into the desired position.
|
||||
SDValue InsertPSMask =
|
||||
DAG.getIntPtrConstant(Mask[V2Index] << 6 | V2Index << 4);
|
||||
return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
|
||||
InsertPSMask);
|
||||
if (Subtarget->hasSSE41()) {
|
||||
// When using INSERTPS we can zero any lane of the destination. Collect
|
||||
// the zero inputs into a mask and drop them from the lanes of V1 which
|
||||
// actually need to be present as inputs to the INSERTPS.
|
||||
unsigned ZMask = 0;
|
||||
if (ISD::isBuildVectorAllZeros(V1.getNode())) {
|
||||
ZMask = 0xF ^ (1 << V2Index);
|
||||
} else if (V1.getOpcode() == ISD::BUILD_VECTOR) {
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
int M = Mask[i];
|
||||
if (M >= 4)
|
||||
continue;
|
||||
if (M > -1) {
|
||||
SDValue Input = V1.getOperand(M);
|
||||
if (Input.getOpcode() != ISD::UNDEF &&
|
||||
!X86::isZeroNode(Input)) {
|
||||
// A non-zero input!
|
||||
ZMask = 0;
|
||||
break;
|
||||
}
|
||||
}
|
||||
ZMask |= 1 << i;
|
||||
}
|
||||
}
|
||||
|
||||
// Synthesize a shuffle mask for the non-zero and non-v2 inputs.
|
||||
int InsertShuffleMask[4] = {-1, -1, -1, -1};
|
||||
for (int i = 0; i < 4; ++i)
|
||||
if (i != V2Index && (ZMask & (1 << i)) == 0)
|
||||
InsertShuffleMask[i] = Mask[i];
|
||||
|
||||
if (isNoopShuffleMask(InsertShuffleMask)) {
|
||||
// Replace V1 with undef if nothing from V1 survives the INSERTPS.
|
||||
if ((ZMask | 1 << V2Index) == 0xF)
|
||||
V1 = DAG.getUNDEF(MVT::v4f32);
|
||||
|
||||
// Insert the V2 element into the desired position.
|
||||
SDValue InsertPSMask =
|
||||
DAG.getIntPtrConstant(Mask[V2Index] << 6 | V2Index << 4 | ZMask);
|
||||
return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
|
||||
InsertPSMask);
|
||||
}
|
||||
}
|
||||
|
||||
// Compute the index adjacent to V2Index and in the same half by toggling
|
||||
|
@ -207,3 +207,113 @@ define <4 x i32> @shuffle_v4i32_4015(<4 x i32> %a, <4 x i32> %b) {
|
||||
%shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 0, i32 1, i32 5>
|
||||
ret <4 x i32> %shuffle
|
||||
}
|
||||
|
||||
; Lane 0 takes element 0 of %a (mask index 4); lanes 1-3 select from the
; zeroinitializer operand. The SSE4.1 and AVX checks expect one (v)insertps
; that zeroes the other lanes; the SSE2 checks expect xorps plus two shufps.
define <4 x float> @shuffle_v4f32_4zzz(<4 x float> %a) {
|
||||
; SSE2-LABEL: @shuffle_v4f32_4zzz
|
||||
; SSE2: xorps %[[X:xmm[0-9]+]], %[[X]]
|
||||
; SSE2-NEXT: shufps {{.*}} # xmm0 = xmm0[0,0],[[X]][1,0]
|
||||
; SSE2-NEXT: shufps {{.*}} # xmm0 = xmm0[0,2],[[X]][2,3]
|
||||
; SSE2-NEXT: retq
|
||||
;
|
||||
; SSE41-LABEL: @shuffle_v4f32_4zzz
|
||||
; SSE41: insertps {{.*}} # xmm0 = xmm0[0],zero,zero,zero
|
||||
; SSE41-NEXT: retq
|
||||
;
|
||||
; AVX1-LABEL: @shuffle_v4f32_4zzz
|
||||
; AVX1: vinsertps {{.*}} # xmm0 = xmm0[0],zero,zero,zero
|
||||
; AVX1-NEXT: retq
|
||||
%shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
|
||||
ret <4 x float> %shuffle
|
||||
}
|
||||
|
||||
; Lane 1 takes element 0 of %a (mask index 4). Lanes 0, 2 and 3 read elements
; 2, 3 and 0 of the zeroinitializer operand, so the zero lanes are not in their
; original positions. The SSE4.1 and AVX checks still expect one (v)insertps.
define <4 x float> @shuffle_v4f32_z4zz(<4 x float> %a) {
|
||||
; SSE2-LABEL: @shuffle_v4f32_z4zz
|
||||
; SSE2: xorps %[[X:xmm[0-9]+]], %[[X]]
|
||||
; SSE2-NEXT: shufps {{.*}} # xmm0 = xmm0[0,0],[[X]][2,0]
|
||||
; SSE2-NEXT: shufps {{.*}} # xmm0 = xmm0[2,0],[[X]][3,0]
|
||||
; SSE2-NEXT: retq
|
||||
;
|
||||
; SSE41-LABEL: @shuffle_v4f32_z4zz
|
||||
; SSE41: insertps {{.*}} # xmm0 = zero,xmm0[0],zero,zero
|
||||
; SSE41-NEXT: retq
|
||||
;
|
||||
; AVX1-LABEL: @shuffle_v4f32_z4zz
|
||||
; AVX1: vinsertps {{.*}} # xmm0 = zero,xmm0[0],zero,zero
|
||||
; AVX1-NEXT: retq
|
||||
%shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 2, i32 4, i32 3, i32 0>
|
||||
ret <4 x float> %shuffle
|
||||
}
|
||||
|
||||
; Lane 2 takes element 0 of %a (mask index 4); lanes 0, 1 and 3 all read
; element 0 of the zeroinitializer operand. The SSE4.1 and AVX checks expect
; one (v)insertps; the SSE2 checks expect xorps, two shufps and a movaps.
define <4 x float> @shuffle_v4f32_zz4z(<4 x float> %a) {
|
||||
; SSE2-LABEL: @shuffle_v4f32_zz4z
|
||||
; SSE2: xorps %[[X:xmm[0-9]+]], %[[X]]
|
||||
; SSE2-NEXT: shufps {{.*}} # xmm0 = xmm0[0,0],[[X]][0,0]
|
||||
; SSE2-NEXT: shufps {{.*}} # [[X]] = [[X]][0,0],xmm0[0,2]
|
||||
; SSE2-NEXT: movaps %[[X]], %xmm0
|
||||
; SSE2-NEXT: retq
|
||||
;
|
||||
; SSE41-LABEL: @shuffle_v4f32_zz4z
|
||||
; SSE41: insertps {{.*}} # xmm0 = zero,zero,xmm0[0],zero
|
||||
; SSE41-NEXT: retq
|
||||
;
|
||||
; AVX1-LABEL: @shuffle_v4f32_zz4z
|
||||
; AVX1: vinsertps {{.*}} # xmm0 = zero,zero,xmm0[0],zero
|
||||
; AVX1-NEXT: retq
|
||||
%shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 0, i32 4, i32 0>
|
||||
ret <4 x float> %shuffle
|
||||
}
|
||||
|
||||
; Lane 3 takes element 0 of %a (mask index 4), lane 0 reads element 0 of the
; zeroinitializer operand, and lanes 1-2 are undef in the mask. The SSE4.1 and
; AVX checks expect one (v)insertps that shows zero in the undef lanes as well.
define <4 x float> @shuffle_v4f32_zuu4(<4 x float> %a) {
|
||||
; SSE2-LABEL: @shuffle_v4f32_zuu4
|
||||
; SSE2: xorps %[[X:xmm[0-9]+]], %[[X]]
|
||||
; SSE2-NEXT: shufps {{.*}} # [[X]] = [[X]][0,1],xmm0[2,0]
|
||||
; SSE2-NEXT: movaps %[[X]], %xmm0
|
||||
; SSE2-NEXT: retq
|
||||
;
|
||||
; SSE41-LABEL: @shuffle_v4f32_zuu4
|
||||
; SSE41: insertps {{.*}} # xmm0 = zero,zero,zero,xmm0[0]
|
||||
; SSE41-NEXT: retq
|
||||
;
|
||||
; AVX1-LABEL: @shuffle_v4f32_zuu4
|
||||
; AVX1: vinsertps {{.*}} # xmm0 = zero,zero,zero,xmm0[0]
|
||||
; AVX1-NEXT: retq
|
||||
%shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 undef, i32 undef, i32 4>
|
||||
ret <4 x float> %shuffle
|
||||
}
|
||||
|
||||
; Lane 3 takes element 3 of %a (mask index 7); lanes 0-2 read elements 0-2 of
; the zeroinitializer operand. The SSE4.1 and AVX checks expect one (v)insertps
; that sources xmm0[3]; the SSE2 checks expect xorps, two shufps and a movaps.
define <4 x float> @shuffle_v4f32_zzz7(<4 x float> %a) {
|
||||
; SSE2-LABEL: @shuffle_v4f32_zzz7
|
||||
; SSE2: xorps %[[X:xmm[0-9]+]], %[[X]]
|
||||
; SSE2-NEXT: shufps {{.*}} # xmm0 = xmm0[3,0],[[X]][2,0]
|
||||
; SSE2-NEXT: shufps {{.*}} # [[X]] = [[X]][0,1],xmm0[2,0]
|
||||
; SSE2-NEXT: movaps %[[X]], %xmm0
|
||||
; SSE2-NEXT: retq
|
||||
;
|
||||
; SSE41-LABEL: @shuffle_v4f32_zzz7
|
||||
; SSE41: insertps {{.*}} # xmm0 = zero,zero,zero,xmm0[3]
|
||||
; SSE41-NEXT: retq
|
||||
;
|
||||
; AVX1-LABEL: @shuffle_v4f32_zzz7
|
||||
; AVX1: vinsertps {{.*}} # xmm0 = zero,zero,zero,xmm0[3]
|
||||
; AVX1-NEXT: retq
|
||||
%shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
|
||||
ret <4 x float> %shuffle
|
||||
}
|
||||
|
||||
; Lane 1 takes element 2 of %a (mask index 6); lanes 0, 2 and 3 read the
; matching elements of the zeroinitializer operand. The SSE4.1 and AVX checks
; expect one (v)insertps that sources xmm0[2]; SSE2 expects xorps and two shufps.
define <4 x float> @shuffle_v4f32_z6zz(<4 x float> %a) {
|
||||
; SSE2-LABEL: @shuffle_v4f32_z6zz
|
||||
; SSE2: xorps %[[X:xmm[0-9]+]], %[[X]]
|
||||
; SSE2-NEXT: shufps {{.*}} # xmm0 = xmm0[2,0],[[X]][0,0]
|
||||
; SSE2-NEXT: shufps {{.*}} # xmm0 = xmm0[2,0],[[X]][2,3]
|
||||
; SSE2-NEXT: retq
|
||||
;
|
||||
; SSE41-LABEL: @shuffle_v4f32_z6zz
|
||||
; SSE41: insertps {{.*}} # xmm0 = zero,xmm0[2],zero,zero
|
||||
; SSE41-NEXT: retq
|
||||
;
|
||||
; AVX1-LABEL: @shuffle_v4f32_z6zz
|
||||
; AVX1: vinsertps {{.*}} # xmm0 = zero,xmm0[2],zero,zero
|
||||
; AVX1-NEXT: retq
|
||||
%shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 6, i32 2, i32 3>
|
||||
ret <4 x float> %shuffle
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user