mirror of
https://github.com/RPCSX/llvm.git
synced 2025-01-24 13:15:36 +00:00
[X86][SSE] Avoid scalarization of v2i64 vector shifts
Currently v2i64 vector shifts (with non-equal shift amounts) are scalarized, costing 4 x extract, 2 x x86-shifts and 2 x insert instructions - and it gets even more awkward on 32-bit targets. This patch separately shifts the vector by both shift amounts and then shuffles the partial results back together, costing 2 x shuffles and 2 x sse-shifts instructions (+ 2 movs on pre-AVX hardware). Note - this patch only improves the SHL / LSHR logical shifts, as only these are supported in SSE hardware. Differential Revision: http://reviews.llvm.org/D8416 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@232660 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent
db4d401364
commit
0ee70a1554
@ -5906,7 +5906,7 @@ static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
|
||||
return DAG.getNode(ISD::OR, dl, ResVT, V1, V2);
|
||||
}
|
||||
|
||||
static SDValue LowerCONCAT_VECTORS(SDValue Op,
|
||||
static SDValue LowerCONCAT_VECTORS(SDValue Op,
|
||||
const X86Subtarget *Subtarget,
|
||||
SelectionDAG &DAG) {
|
||||
MVT VT = Op.getSimpleValueType();
|
||||
@ -13255,11 +13255,11 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
|
||||
// If we have AVX, we can use a variable vector select (VBLENDV) instead
|
||||
// of 3 logic instructions for size savings and potentially speed.
|
||||
// Unfortunately, there is no scalar form of VBLENDV.
|
||||
|
||||
|
||||
// If either operand is a constant, don't try this. We can expect to
|
||||
// optimize away at least one of the logic instructions later in that
|
||||
// case, so that sequence would be faster than a variable blend.
|
||||
|
||||
|
||||
// BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
|
||||
// uses XMM0 as the selection register. That may need just as many
|
||||
// instructions as the AND/ANDN/OR sequence due to register moves, so
|
||||
@ -13267,10 +13267,10 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
|
||||
|
||||
if (Subtarget->hasAVX() &&
|
||||
!isa<ConstantFPSDNode>(Op1) && !isa<ConstantFPSDNode>(Op2)) {
|
||||
|
||||
|
||||
// Convert to vectors, do a VSELECT, and convert back to scalar.
|
||||
// All of the conversions should be optimized away.
|
||||
|
||||
|
||||
EVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
|
||||
SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
|
||||
SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
|
||||
@ -13278,9 +13278,9 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
|
||||
|
||||
EVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
|
||||
VCmp = DAG.getNode(ISD::BITCAST, DL, VCmpVT, VCmp);
|
||||
|
||||
|
||||
SDValue VSel = DAG.getNode(ISD::VSELECT, DL, VecVT, VCmp, VOp1, VOp2);
|
||||
|
||||
|
||||
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
|
||||
VSel, DAG.getIntPtrConstant(0));
|
||||
}
|
||||
@ -16189,6 +16189,17 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
|
||||
return Op;
|
||||
}
|
||||
|
||||
// 2i64 vector logical shifts can efficiently avoid scalarization - do the
|
||||
// shifts per-lane and then shuffle the partial results back together.
|
||||
if (VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) {
|
||||
// Splat the shift amounts so the scalar shifts above will catch it.
|
||||
SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
|
||||
SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
|
||||
SDValue R0 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt0);
|
||||
SDValue R1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt1);
|
||||
return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
|
||||
}
|
||||
|
||||
// If possible, lower this packed shift into a vector multiply instead of
|
||||
// expanding it into a sequence of scalar shifts.
|
||||
// Do this only if the vector shift count is a constant build_vector.
|
||||
@ -21960,7 +21971,7 @@ static SDValue VectorZextCombine(SDNode *N, SelectionDAG &DAG,
|
||||
// an and with a mask.
|
||||
// We'd like to try to combine that into a shuffle with zero
|
||||
// plus a bitcast, removing the and.
|
||||
if (N0.getOpcode() != ISD::BITCAST ||
|
||||
if (N0.getOpcode() != ISD::BITCAST ||
|
||||
N0.getOperand(0).getOpcode() != ISD::VECTOR_SHUFFLE)
|
||||
return SDValue();
|
||||
|
||||
@ -21990,7 +22001,7 @@ static SDValue VectorZextCombine(SDNode *N, SelectionDAG &DAG,
|
||||
|
||||
unsigned ResSize = N1.getValueType().getScalarSizeInBits();
|
||||
// Make sure the splat matches the mask we expect
|
||||
if (SplatBitSize > ResSize ||
|
||||
if (SplatBitSize > ResSize ||
|
||||
(SplatValue + 1).exactLogBase2() != (int)SrcSize)
|
||||
return SDValue();
|
||||
|
||||
@ -22948,7 +22959,7 @@ static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) {
|
||||
if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
|
||||
if (C->getValueAPF().isPosZero())
|
||||
return N->getOperand(1);
|
||||
|
||||
|
||||
return SDValue();
|
||||
}
|
||||
|
||||
@ -23222,7 +23233,7 @@ static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG,
|
||||
return DAG.getConstant(1, VT);
|
||||
if (CC == ISD::SETEQ || CC == ISD::SETGE)
|
||||
return DAG.getNOT(DL, LHS.getOperand(0), VT);
|
||||
|
||||
|
||||
assert((CC == ISD::SETNE || CC == ISD::SETLT) &&
|
||||
"Unexpected condition code!");
|
||||
return LHS.getOperand(0);
|
||||
@ -23264,7 +23275,7 @@ static SDValue PerformINSERTPSCombine(SDNode *N, SelectionDAG &DAG,
|
||||
// countS and just gets an f32 from that address.
|
||||
unsigned DestIndex =
|
||||
cast<ConstantSDNode>(N->getOperand(2))->getZExtValue() >> 6;
|
||||
|
||||
|
||||
Ld = NarrowVectorLoadToElement(cast<LoadSDNode>(Ld), DestIndex, DAG);
|
||||
|
||||
// Create this as a scalar to vector to match the instruction pattern.
|
||||
@ -23288,7 +23299,7 @@ static SDValue PerformBLENDICombine(SDNode *N, SelectionDAG &DAG) {
|
||||
// pattern-matching possibilities related to scalar math ops in SSE/AVX.
|
||||
// x86InstrInfo knows how to commute this back after instruction selection
|
||||
// if it would help register allocation.
|
||||
|
||||
|
||||
// TODO: If optimizing for size or a processor that doesn't suffer from
|
||||
// partial register update stalls, this should be transformed into a MOVSD
|
||||
// instruction because a MOVSD is 1-2 bytes smaller than a BLENDPD.
|
||||
|
@ -13,11 +13,16 @@ entry:
|
||||
ret void
|
||||
}
|
||||
|
||||
; shift1b can't use a packed shift
|
||||
; shift1b can't use a packed shift but can shift lanes separately and shuffle back together
|
||||
define void @shift1b(<2 x i64> %val, <2 x i64>* %dst, <2 x i64> %sh) nounwind {
|
||||
entry:
|
||||
; CHECK-LABEL: shift1b:
|
||||
; CHECK: shll
|
||||
; CHECK: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
|
||||
; CHECK-NEXT: movdqa %xmm0, %xmm3
|
||||
; CHECK-NEXT: psllq %xmm2, %xmm3
|
||||
; CHECK-NEXT: movq {{.*#+}} xmm1 = xmm1[0],zero
|
||||
; CHECK-NEXT: psllq %xmm1, %xmm0
|
||||
; CHECK-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1]
|
||||
%shamt = shufflevector <2 x i64> %sh, <2 x i64> undef, <2 x i32> <i32 0, i32 1>
|
||||
%shl = shl <2 x i64> %val, %shamt
|
||||
store <2 x i64> %shl, <2 x i64>* %dst
|
||||
|
@ -118,10 +118,16 @@ entry:
|
||||
|
||||
define <2 x i64> @shr2_nosplat(<2 x i64> %A) nounwind {
|
||||
entry:
|
||||
; CHECK: shr2_nosplat
|
||||
; CHECK-NOT: psrlq
|
||||
; CHECK-NOT: psrlq
|
||||
; CHECK: ret
|
||||
; CHECK-LABEL: shr2_nosplat
|
||||
; CHECK: movdqa (%rcx), %xmm1
|
||||
; CHECK-NEXT: movdqa %xmm1, %xmm2
|
||||
; CHECK-NEXT: psrlq $8, %xmm2
|
||||
; CHECK-NEXT: movdqa %xmm1, %xmm0
|
||||
; CHECK-NEXT: psrlq $1, %xmm0
|
||||
; CHECK-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
|
||||
; CHECK-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
|
||||
; CHECK-NEXT: xorpd %xmm1, %xmm0
|
||||
; CHECK-NEXT: ret
|
||||
%B = lshr <2 x i64> %A, < i64 8, i64 1>
|
||||
%C = lshr <2 x i64> %A, < i64 1, i64 0>
|
||||
%K = xor <2 x i64> %B, %C
|
||||
|
Loading…
x
Reference in New Issue
Block a user