mirror of
https://github.com/RPCS3/llvm.git
synced 2025-02-11 05:17:36 +00:00
[InstCombine] SSE/AVX vector shifts demanded shift amount bits
Most SSE/AVX (non-constant) vector shift instructions only use the lower 64 bits of the 128-bit shift amount vector operand; this patch calls SimplifyDemandedVectorElts to optimize for this. I had to refactor some of my recent InstCombiner work on the vector shifts to avoid quite a bit of duplicate code, which means that SimplifyX86immshift now (re)decodes the type of shift. Differential Revision: http://reviews.llvm.org/D11938 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@244872 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent
d4177b2705
commit
335fc61873
@ -198,8 +198,52 @@ Instruction *InstCombiner::SimplifyMemSet(MemSetInst *MI) {
|
||||
}
|
||||
|
||||
static Value *SimplifyX86immshift(const IntrinsicInst &II,
|
||||
InstCombiner::BuilderTy &Builder,
|
||||
bool LogicalShift, bool ShiftLeft) {
|
||||
InstCombiner::BuilderTy &Builder) {
|
||||
bool LogicalShift = false;
|
||||
bool ShiftLeft = false;
|
||||
|
||||
switch (II.getIntrinsicID()) {
|
||||
default:
|
||||
return nullptr;
|
||||
case Intrinsic::x86_sse2_psra_d:
|
||||
case Intrinsic::x86_sse2_psra_w:
|
||||
case Intrinsic::x86_sse2_psrai_d:
|
||||
case Intrinsic::x86_sse2_psrai_w:
|
||||
case Intrinsic::x86_avx2_psra_d:
|
||||
case Intrinsic::x86_avx2_psra_w:
|
||||
case Intrinsic::x86_avx2_psrai_d:
|
||||
case Intrinsic::x86_avx2_psrai_w:
|
||||
LogicalShift = false; ShiftLeft = false;
|
||||
break;
|
||||
case Intrinsic::x86_sse2_psrl_d:
|
||||
case Intrinsic::x86_sse2_psrl_q:
|
||||
case Intrinsic::x86_sse2_psrl_w:
|
||||
case Intrinsic::x86_sse2_psrli_d:
|
||||
case Intrinsic::x86_sse2_psrli_q:
|
||||
case Intrinsic::x86_sse2_psrli_w:
|
||||
case Intrinsic::x86_avx2_psrl_d:
|
||||
case Intrinsic::x86_avx2_psrl_q:
|
||||
case Intrinsic::x86_avx2_psrl_w:
|
||||
case Intrinsic::x86_avx2_psrli_d:
|
||||
case Intrinsic::x86_avx2_psrli_q:
|
||||
case Intrinsic::x86_avx2_psrli_w:
|
||||
LogicalShift = true; ShiftLeft = false;
|
||||
break;
|
||||
case Intrinsic::x86_sse2_psll_d:
|
||||
case Intrinsic::x86_sse2_psll_q:
|
||||
case Intrinsic::x86_sse2_psll_w:
|
||||
case Intrinsic::x86_sse2_pslli_d:
|
||||
case Intrinsic::x86_sse2_pslli_q:
|
||||
case Intrinsic::x86_sse2_pslli_w:
|
||||
case Intrinsic::x86_avx2_psll_d:
|
||||
case Intrinsic::x86_avx2_psll_q:
|
||||
case Intrinsic::x86_avx2_psll_w:
|
||||
case Intrinsic::x86_avx2_pslli_d:
|
||||
case Intrinsic::x86_avx2_pslli_q:
|
||||
case Intrinsic::x86_avx2_pslli_w:
|
||||
LogicalShift = true; ShiftLeft = true;
|
||||
break;
|
||||
}
|
||||
assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");
|
||||
|
||||
// Simplify if count is constant.
|
||||
@ -788,52 +832,65 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
|
||||
}
|
||||
|
||||
// Constant fold ashr( <A x Bi>, Ci ).
|
||||
case Intrinsic::x86_sse2_psra_d:
|
||||
case Intrinsic::x86_sse2_psra_w:
|
||||
// Constant fold lshr( <A x Bi>, Ci ).
|
||||
// Constant fold shl( <A x Bi>, Ci ).
|
||||
case Intrinsic::x86_sse2_psrai_d:
|
||||
case Intrinsic::x86_sse2_psrai_w:
|
||||
case Intrinsic::x86_avx2_psra_d:
|
||||
case Intrinsic::x86_avx2_psra_w:
|
||||
case Intrinsic::x86_avx2_psrai_d:
|
||||
case Intrinsic::x86_avx2_psrai_w:
|
||||
if (Value *V = SimplifyX86immshift(*II, *Builder, false, false))
|
||||
return ReplaceInstUsesWith(*II, V);
|
||||
break;
|
||||
|
||||
// Constant fold lshr( <A x Bi>, Ci ).
|
||||
case Intrinsic::x86_sse2_psrl_d:
|
||||
case Intrinsic::x86_sse2_psrl_q:
|
||||
case Intrinsic::x86_sse2_psrl_w:
|
||||
case Intrinsic::x86_sse2_psrli_d:
|
||||
case Intrinsic::x86_sse2_psrli_q:
|
||||
case Intrinsic::x86_sse2_psrli_w:
|
||||
case Intrinsic::x86_avx2_psrl_d:
|
||||
case Intrinsic::x86_avx2_psrl_q:
|
||||
case Intrinsic::x86_avx2_psrl_w:
|
||||
case Intrinsic::x86_avx2_psrli_d:
|
||||
case Intrinsic::x86_avx2_psrli_q:
|
||||
case Intrinsic::x86_avx2_psrli_w:
|
||||
if (Value *V = SimplifyX86immshift(*II, *Builder, true, false))
|
||||
return ReplaceInstUsesWith(*II, V);
|
||||
break;
|
||||
|
||||
// Constant fold shl( <A x Bi>, Ci ).
|
||||
case Intrinsic::x86_sse2_psll_d:
|
||||
case Intrinsic::x86_sse2_psll_q:
|
||||
case Intrinsic::x86_sse2_psll_w:
|
||||
case Intrinsic::x86_sse2_pslli_d:
|
||||
case Intrinsic::x86_sse2_pslli_q:
|
||||
case Intrinsic::x86_sse2_pslli_w:
|
||||
case Intrinsic::x86_avx2_psll_d:
|
||||
case Intrinsic::x86_avx2_psll_q:
|
||||
case Intrinsic::x86_avx2_psll_w:
|
||||
case Intrinsic::x86_avx2_pslli_d:
|
||||
case Intrinsic::x86_avx2_pslli_q:
|
||||
case Intrinsic::x86_avx2_pslli_w:
|
||||
if (Value *V = SimplifyX86immshift(*II, *Builder, true, true))
|
||||
if (Value *V = SimplifyX86immshift(*II, *Builder))
|
||||
return ReplaceInstUsesWith(*II, V);
|
||||
break;
|
||||
|
||||
case Intrinsic::x86_sse2_psra_d:
|
||||
case Intrinsic::x86_sse2_psra_w:
|
||||
case Intrinsic::x86_avx2_psra_d:
|
||||
case Intrinsic::x86_avx2_psra_w:
|
||||
case Intrinsic::x86_sse2_psrl_d:
|
||||
case Intrinsic::x86_sse2_psrl_q:
|
||||
case Intrinsic::x86_sse2_psrl_w:
|
||||
case Intrinsic::x86_avx2_psrl_d:
|
||||
case Intrinsic::x86_avx2_psrl_q:
|
||||
case Intrinsic::x86_avx2_psrl_w:
|
||||
case Intrinsic::x86_sse2_psll_d:
|
||||
case Intrinsic::x86_sse2_psll_q:
|
||||
case Intrinsic::x86_sse2_psll_w:
|
||||
case Intrinsic::x86_avx2_psll_d:
|
||||
case Intrinsic::x86_avx2_psll_q:
|
||||
case Intrinsic::x86_avx2_psll_w: {
|
||||
if (Value *V = SimplifyX86immshift(*II, *Builder))
|
||||
return ReplaceInstUsesWith(*II, V);
|
||||
|
||||
// SSE2/AVX2 uses only the first 64-bits of the 128-bit vector
|
||||
// operand to compute the shift amount.
|
||||
auto ShiftAmt = II->getArgOperand(1);
|
||||
auto ShiftType = cast<VectorType>(ShiftAmt->getType());
|
||||
assert(ShiftType->getPrimitiveSizeInBits() == 128 &&
|
||||
"Unexpected packed shift size");
|
||||
unsigned VWidth = ShiftType->getNumElements();
|
||||
|
||||
APInt DemandedElts = APInt::getLowBitsSet(VWidth, VWidth / 2);
|
||||
APInt UndefElts(VWidth, 0);
|
||||
if (Value *V =
|
||||
SimplifyDemandedVectorElts(ShiftAmt, DemandedElts, UndefElts)) {
|
||||
II->setArgOperand(1, V);
|
||||
return II;
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
case Intrinsic::x86_sse41_pmovsxbd:
|
||||
case Intrinsic::x86_sse41_pmovsxbq:
|
||||
case Intrinsic::x86_sse41_pmovsxbw:
|
||||
|
@ -825,6 +825,154 @@ define <4 x i64> @avx2_psll_q_64(<4 x i64> %v) nounwind readnone uwtable {
|
||||
ret <4 x i64> %1
|
||||
}
|
||||
|
||||
;
|
||||
; Vector Demanded Bits
|
||||
;
|
||||
|
||||
define <8 x i16> @sse2_psra_w_var(<8 x i16> %v, <8 x i16> %a) nounwind readnone uwtable {
|
||||
; CHECK-LABEL: @sse2_psra_w_var
|
||||
; CHECK-NEXT: %1 = tail call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %v, <8 x i16> %a)
|
||||
; CHECK-NEXT: ret <8 x i16> %1
|
||||
%1 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
|
||||
%2 = tail call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %v, <8 x i16> %1)
|
||||
ret <8 x i16> %2
|
||||
}
|
||||
|
||||
define <4 x i32> @sse2_psra_d_var(<4 x i32> %v, <4 x i32> %a) nounwind readnone uwtable {
|
||||
; CHECK-LABEL: @sse2_psra_d_var
|
||||
; CHECK-NEXT: %1 = tail call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %v, <4 x i32> %a)
|
||||
; CHECK-NEXT: ret <4 x i32> %1
|
||||
%1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
|
||||
%2 = tail call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %v, <4 x i32> %1)
|
||||
ret <4 x i32> %2
|
||||
}
|
||||
|
||||
define <16 x i16> @avx2_psra_w_var(<16 x i16> %v, <8 x i16> %a) nounwind readnone uwtable {
|
||||
; CHECK-LABEL: @avx2_psra_w_var
|
||||
; CHECK-NEXT: %1 = tail call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %v, <8 x i16> %a)
|
||||
; CHECK-NEXT: ret <16 x i16> %1
|
||||
%1 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
|
||||
%2 = tail call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %v, <8 x i16> %1)
|
||||
ret <16 x i16> %2
|
||||
}
|
||||
|
||||
define <8 x i32> @avx2_psra_d_var(<8 x i32> %v, <4 x i32> %a) nounwind readnone uwtable {
|
||||
; CHECK-LABEL: @avx2_psra_d_var
|
||||
; CHECK-NEXT: %1 = tail call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %v, <4 x i32> %a)
|
||||
; CHECK-NEXT: ret <8 x i32> %1
|
||||
%1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
|
||||
%2 = tail call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %v, <4 x i32> %1)
|
||||
ret <8 x i32> %2
|
||||
}
|
||||
|
||||
define <8 x i16> @sse2_psrl_w_var(<8 x i16> %v, <8 x i16> %a) nounwind readnone uwtable {
|
||||
; CHECK-LABEL: @sse2_psrl_w_var
|
||||
; CHECK-NEXT: %1 = tail call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> %v, <8 x i16> %a)
|
||||
; CHECK-NEXT: ret <8 x i16> %1
|
||||
%1 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
|
||||
%2 = tail call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> %v, <8 x i16> %1)
|
||||
ret <8 x i16> %2
|
||||
}
|
||||
|
||||
define <4 x i32> @sse2_psrl_d_var(<4 x i32> %v, <4 x i32> %a) nounwind readnone uwtable {
|
||||
; CHECK-LABEL: @sse2_psrl_d_var
|
||||
; CHECK-NEXT: %1 = tail call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %v, <4 x i32> %a)
|
||||
; CHECK-NEXT: ret <4 x i32> %1
|
||||
%1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
|
||||
%2 = tail call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %v, <4 x i32> %1)
|
||||
ret <4 x i32> %2
|
||||
}
|
||||
|
||||
define <2 x i64> @sse2_psrl_q_var(<2 x i64> %v, <2 x i64> %a) nounwind readnone uwtable {
|
||||
; CHECK-LABEL: @sse2_psrl_q_var
|
||||
; CHECK-NEXT: %1 = tail call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %v, <2 x i64> %a)
|
||||
; CHECK-NEXT: ret <2 x i64> %1
|
||||
%1 = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
|
||||
%2 = tail call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %v, <2 x i64> %1)
|
||||
ret <2 x i64> %2
|
||||
}
|
||||
|
||||
define <16 x i16> @avx2_psrl_w_var(<16 x i16> %v, <8 x i16> %a) nounwind readnone uwtable {
|
||||
; CHECK-LABEL: @avx2_psrl_w_var
|
||||
; CHECK-NEXT: %1 = tail call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %v, <8 x i16> %a)
|
||||
; CHECK-NEXT: ret <16 x i16> %1
|
||||
%1 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
|
||||
%2 = tail call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %v, <8 x i16> %1)
|
||||
ret <16 x i16> %2
|
||||
}
|
||||
|
||||
define <8 x i32> @avx2_psrl_d_var(<8 x i32> %v, <4 x i32> %a) nounwind readnone uwtable {
|
||||
; CHECK-LABEL: @avx2_psrl_d_var
|
||||
; CHECK-NEXT: %1 = tail call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %v, <4 x i32> %a)
|
||||
; CHECK-NEXT: ret <8 x i32> %1
|
||||
%1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
|
||||
%2 = tail call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %v, <4 x i32> %1)
|
||||
ret <8 x i32> %2
|
||||
}
|
||||
|
||||
define <4 x i64> @avx2_psrl_q_var(<4 x i64> %v, <2 x i64> %a) nounwind readnone uwtable {
|
||||
; CHECK-LABEL: @avx2_psrl_q_var
|
||||
; CHECK-NEXT: %1 = tail call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %v, <2 x i64> %a)
|
||||
; CHECK-NEXT: ret <4 x i64> %1
|
||||
%1 = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
|
||||
%2 = tail call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %v, <2 x i64> %1)
|
||||
ret <4 x i64> %2
|
||||
}
|
||||
|
||||
define <8 x i16> @sse2_psll_w_var(<8 x i16> %v, <8 x i16> %a) nounwind readnone uwtable {
|
||||
; CHECK-LABEL: @sse2_psll_w_var
|
||||
; CHECK-NEXT: %1 = tail call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> %v, <8 x i16> %a)
|
||||
; CHECK-NEXT: ret <8 x i16> %1
|
||||
%1 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
|
||||
%2 = tail call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> %v, <8 x i16> %1)
|
||||
ret <8 x i16> %2
|
||||
}
|
||||
|
||||
define <4 x i32> @sse2_psll_d_var(<4 x i32> %v, <4 x i32> %a) nounwind readnone uwtable {
|
||||
; CHECK-LABEL: @sse2_psll_d_var
|
||||
; CHECK-NEXT: %1 = tail call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %v, <4 x i32> %a)
|
||||
; CHECK-NEXT: ret <4 x i32> %1
|
||||
%1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
|
||||
%2 = tail call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %v, <4 x i32> %1)
|
||||
ret <4 x i32> %2
|
||||
}
|
||||
|
||||
define <2 x i64> @sse2_psll_q_var(<2 x i64> %v, <2 x i64> %a) nounwind readnone uwtable {
|
||||
; CHECK-LABEL: @sse2_psll_q_var
|
||||
; CHECK-NEXT: %1 = tail call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %v, <2 x i64> %a)
|
||||
; CHECK-NEXT: ret <2 x i64> %1
|
||||
%1 = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
|
||||
%2 = tail call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %v, <2 x i64> %1)
|
||||
ret <2 x i64> %2
|
||||
}
|
||||
|
||||
define <16 x i16> @avx2_psll_w_var(<16 x i16> %v, <8 x i16> %a) nounwind readnone uwtable {
|
||||
; CHECK-LABEL: @avx2_psll_w_var
|
||||
; CHECK-NEXT: %1 = tail call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> %v, <8 x i16> %a)
|
||||
; CHECK-NEXT: ret <16 x i16> %1
|
||||
%1 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
|
||||
%2 = tail call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> %v, <8 x i16> %1)
|
||||
ret <16 x i16> %2
|
||||
}
|
||||
|
||||
define <8 x i32> @avx2_psll_d_var(<8 x i32> %v, <4 x i32> %a) nounwind readnone uwtable {
|
||||
; CHECK-LABEL: @avx2_psll_d_var
|
||||
; CHECK-NEXT: %1 = tail call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %v, <4 x i32> %a)
|
||||
; CHECK-NEXT: ret <8 x i32> %1
|
||||
%1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
|
||||
%2 = tail call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %v, <4 x i32> %1)
|
||||
ret <8 x i32> %2
|
||||
}
|
||||
|
||||
define <4 x i64> @avx2_psll_q_var(<4 x i64> %v, <2 x i64> %a) nounwind readnone uwtable {
|
||||
; CHECK-LABEL: @avx2_psll_q_var
|
||||
; CHECK-NEXT: %1 = tail call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %v, <2 x i64> %a)
|
||||
; CHECK-NEXT: ret <4 x i64> %1
|
||||
%1 = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
|
||||
%2 = tail call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %v, <2 x i64> %1)
|
||||
ret <4 x i64> %2
|
||||
}
|
||||
|
||||
;
|
||||
; Constant Folding
|
||||
;
|
||||
|
Loading…
x
Reference in New Issue
Block a user