[DAGCombiner] Bug 31275- Extract a shift from a constant mul or udiv if a rotate can be formed
Summary:
Attempt to extract a shrl from a udiv or a shl from a mul if this allows a rotate to be formed. This targets cases where the input to a rotate pattern was a mul or udiv by a constant and InstCombine merged one of the shifts with the op.

Patch by: sameconrad (Sam Conrad)

Reviewers: RKSimon, craig.topper, spatel, lebedev.ri, javed.absar

Reviewed By: lebedev.ri

Subscribers: efriedma, kparzysz, llvm-commits

Differential Revision: https://reviews.llvm.org/D47681

llvm-svn: 338270
parent 9a0b5ee7a4
commit 29cbdcb1e3
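To illustrate the pattern being targeted, here is a minimal IR sketch distilled from the roll_extract_mul test updated below (the function name is invented for this example). rotl(x*9, 7) is ((x*9) << 7) | ((x*9) >> 25), and InstCombine folds the left shift into the multiply because 9 << 7 == 1152, hiding one half of the rotate:

define i32 @rol7_of_mul9(i32 %x) {
  ; ((x*9) << 7) was already folded to (x * 1152) by InstCombine,
  ; so no shl remains on this side for the rotate matcher to see.
  %lhs_mul   = mul i32 %x, 9
  %rhs_mul   = mul i32 %x, 1152
  %lhs_shift = lshr i32 %lhs_mul, 25
  %out       = or i32 %lhs_shift, %rhs_mul
  ret i32 %out
}

The new combine recognizes 1152 == 9 << 7 and rewrites (mul %x, 1152) back into (shl (mul %x, 9), 7); the or of (shl t, 7) and (lshr t, 25) then matches a rotate left by 7 of t = mul %x, 9.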
lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -483,9 +483,6 @@ namespace {
    /// returns false.
    bool findBetterNeighborChains(StoreSDNode *St);

    /// Match "(X shl/srl V1) & V2" where V2 may not be present.
    bool MatchRotateHalf(SDValue Op, SDValue &Shift, SDValue &Mask);

    /// Holds a pointer to an LSBaseSDNode as well as information on where it
    /// is located in a sequence of memory operations connected by a chain.
    struct MemOpLink {
@@ -5148,25 +5145,140 @@ SDValue DAGCombiner::visitOR(SDNode *N) {
  return SDValue();
}

/// Match "(X shl/srl V1) & V2" where V2 may not be present.
bool DAGCombiner::MatchRotateHalf(SDValue Op, SDValue &Shift, SDValue &Mask) {
  if (Op.getOpcode() == ISD::AND) {
    if (DAG.isConstantIntBuildVectorOrConstantInt(Op.getOperand(1))) {
      Mask = Op.getOperand(1);
      Op = Op.getOperand(0);
    } else {
      return false;
    }
static SDValue stripConstantMask(SelectionDAG &DAG, SDValue Op, SDValue &Mask) {
  if (Op.getOpcode() == ISD::AND &&
      DAG.isConstantIntBuildVectorOrConstantInt(Op.getOperand(1))) {
    Mask = Op.getOperand(1);
    return Op.getOperand(0);
  }
  return Op;
}

/// Match "(X shl/srl V1) & V2" where V2 may not be present.
static bool matchRotateHalf(SelectionDAG &DAG, SDValue Op, SDValue &Shift,
                            SDValue &Mask) {
  Op = stripConstantMask(DAG, Op, Mask);
  if (Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SHL) {
    Shift = Op;
    return true;
  }

  return false;
}

/// Helper function for visitOR to extract the needed side of a rotate idiom
/// from a shl/srl/mul/udiv. This is meant to handle cases where
/// InstCombine merged some outside op with one of the shifts from
/// the rotate pattern.
/// \returns An empty \c SDValue if the needed shift couldn't be extracted.
/// Otherwise, returns an expansion of \p ExtractFrom based on the following
/// patterns:
///
///   (or (mul v c0) (shrl (mul v c1) c2)):
///     expands (mul v c0) -> (shl (mul v c1) c3)
///
///   (or (udiv v c0) (shl (udiv v c1) c2)):
///     expands (udiv v c0) -> (shrl (udiv v c1) c3)
///
///   (or (shl v c0) (shrl (shl v c1) c2)):
///     expands (shl v c0) -> (shl (shl v c1) c3)
///
///   (or (shrl v c0) (shl (shrl v c1) c2)):
///     expands (shrl v c0) -> (shrl (shrl v c1) c3)
///
/// Such that in all cases, c3+c2==bitwidth(op v c1).
static SDValue extractShiftForRotate(SelectionDAG &DAG, SDValue OppShift,
                                     SDValue ExtractFrom, SDValue &Mask,
                                     const SDLoc &DL) {
  assert(OppShift && ExtractFrom && "Empty SDValue");
  assert(
      (OppShift.getOpcode() == ISD::SHL || OppShift.getOpcode() == ISD::SRL) &&
      "Existing shift must be valid as a rotate half");

  ExtractFrom = stripConstantMask(DAG, ExtractFrom, Mask);
  // Preconditions:
  //    (or (op0 v c0) (shiftl/r (op0 v c1) c2))
  //
  // Find opcode of the needed shift to be extracted from (op0 v c0).
  unsigned Opcode = ISD::DELETED_NODE;
  bool IsMulOrDiv = false;
  // Set Opcode and IsMulOrDiv if the extract opcode matches the needed shift
  // opcode or its arithmetic (mul or udiv) variant.
  auto SelectOpcode = [&](unsigned NeededShift, unsigned MulOrDivVariant) {
    IsMulOrDiv = ExtractFrom.getOpcode() == MulOrDivVariant;
    if (!IsMulOrDiv && ExtractFrom.getOpcode() != NeededShift)
      return false;
    Opcode = NeededShift;
    return true;
  };
  // op0 must be either the needed shift opcode or the mul/udiv equivalent
  // that the needed shift can be extracted from.
  if ((OppShift.getOpcode() != ISD::SRL || !SelectOpcode(ISD::SHL, ISD::MUL)) &&
      (OppShift.getOpcode() != ISD::SHL || !SelectOpcode(ISD::SRL, ISD::UDIV)))
    return SDValue();

  // op0 must be the same opcode on both sides, have the same LHS argument,
  // and produce the same value type.
  SDValue OppShiftLHS = OppShift.getOperand(0);
  EVT ShiftedVT = OppShiftLHS.getValueType();
  if (OppShiftLHS.getOpcode() != ExtractFrom.getOpcode() ||
      OppShiftLHS.getOperand(0) != ExtractFrom.getOperand(0) ||
      ShiftedVT != ExtractFrom.getValueType())
    return SDValue();

  // Amount of the existing shift.
  ConstantSDNode *OppShiftCst = isConstOrConstSplat(OppShift.getOperand(1));
  // Constant mul/udiv/shift amount from the RHS of the shift's LHS op.
  ConstantSDNode *OppLHSCst = isConstOrConstSplat(OppShiftLHS.getOperand(1));
  // Constant mul/udiv/shift amount from the RHS of the ExtractFrom op.
  ConstantSDNode *ExtractFromCst =
      isConstOrConstSplat(ExtractFrom.getOperand(1));
  // TODO: We should be able to handle non-uniform constant vectors for these values
  // Check that we have constant values.
  if (!OppShiftCst || !OppShiftCst->getAPIntValue() ||
      !OppLHSCst || !OppLHSCst->getAPIntValue() ||
      !ExtractFromCst || !ExtractFromCst->getAPIntValue())
    return SDValue();

  // Compute the shift amount we need to extract to complete the rotate.
  const unsigned VTWidth = ShiftedVT.getScalarSizeInBits();
  APInt NeededShiftAmt = VTWidth - OppShiftCst->getAPIntValue();
  if (NeededShiftAmt.isNegative())
    return SDValue();
  // Normalize the bitwidth of the two mul/udiv/shift constant operands.
  APInt ExtractFromAmt = ExtractFromCst->getAPIntValue();
  APInt OppLHSAmt = OppLHSCst->getAPIntValue();
  zeroExtendToMatch(ExtractFromAmt, OppLHSAmt);

  // Now try extract the needed shift from the ExtractFrom op and see if the
  // result matches up with the existing shift's LHS op.
  if (IsMulOrDiv) {
    // Op to extract from is a mul or udiv by a constant.
    // Check:
    //     c2 / (1 << (bitwidth(op0 v c0) - c1)) == c0
    //     c2 % (1 << (bitwidth(op0 v c0) - c1)) == 0
    const APInt ExtractDiv = APInt::getOneBitSet(ExtractFromAmt.getBitWidth(),
                                                 NeededShiftAmt.getZExtValue());
    APInt ResultAmt;
    APInt Rem;
    APInt::udivrem(ExtractFromAmt, ExtractDiv, ResultAmt, Rem);
    if (Rem != 0 || ResultAmt != OppLHSAmt)
      return SDValue();
  } else {
    // Op to extract from is a shift by a constant.
    // Check:
    //     c2 - (bitwidth(op0 v c0) - c1) == c0
    if (OppLHSAmt != ExtractFromAmt - NeededShiftAmt.zextOrTrunc(
                                          ExtractFromAmt.getBitWidth()))
      return SDValue();
  }

  // Return the expanded shift op that should allow a rotate to be formed.
  EVT ShiftVT = OppShift.getOperand(1).getValueType();
  EVT ResVT = ExtractFrom.getValueType();
  SDValue NewShiftNode = DAG.getConstant(NeededShiftAmt, DL, ShiftVT);
  return DAG.getNode(Opcode, DL, ResVT, OppShiftLHS, NewShiftNode);
}

// Return true if we can prove that, whenever Neg and Pos are both in the
// range [0, EltSize), Neg == (Pos == 0 ? 0 : EltSize - Pos). This means that
// for two opposing shifts shift1 and shift2 and a value X with OpBits bits:
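As a concrete check of the mul case, here is the arithmetic worked by hand with the constants from the vroll_extract_mul test further below, i.e. (or (mul v 640) (srl (mul v 10) 26)) on i32 lanes; the substitution is mine, using the function's local variable names:

\[
\begin{aligned}
\text{NeededShiftAmt} &= 32 - 26 = 6, \\
\text{ExtractDiv} &= 2^{6} = 64, \\
640 \,/\, 64 &= 10 = \text{OppLHSAmt}, \qquad 640 \bmod 64 = 0,
\end{aligned}
\]

so extractShiftForRotate rewrites (mul v 640) into (shl (mul v 10) 6), and 6 + 26 == 32 satisfies the documented requirement c3 + c2 == bitwidth(op v c1).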
@@ -5333,14 +5445,41 @@ SDNode *DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL) {
  // Match "(X shl/srl V1) & V2" where V2 may not be present.
  SDValue LHSShift;   // The shift.
  SDValue LHSMask;    // AND value if any.
  if (!MatchRotateHalf(LHS, LHSShift, LHSMask))
    return nullptr; // Not part of a rotate.
  matchRotateHalf(DAG, LHS, LHSShift, LHSMask);

  SDValue RHSShift;   // The shift.
  SDValue RHSMask;    // AND value if any.
  if (!MatchRotateHalf(RHS, RHSShift, RHSMask))
    return nullptr; // Not part of a rotate.
  matchRotateHalf(DAG, RHS, RHSShift, RHSMask);

  // If neither side matched a rotate half, bail
  if (!LHSShift && !RHSShift)
    return nullptr;

  // InstCombine may have combined a constant shl, srl, mul, or udiv with one
  // side of the rotate, so try to handle that here. In all cases we need to
  // pass the matched shift from the opposite side to compute the opcode and
  // needed shift amount to extract. We still want to do this if both sides
  // matched a rotate half because one half may be a potential overshift that
  // can be broken down (ie if InstCombine merged two shl or srl ops into a
  // single one).

  // Have LHS side of the rotate, try to extract the needed shift from the RHS.
  if (LHSShift)
    if (SDValue NewRHSShift =
            extractShiftForRotate(DAG, LHSShift, RHS, RHSMask, DL))
      RHSShift = NewRHSShift;
  // Have RHS side of the rotate, try to extract the needed shift from the LHS.
  if (RHSShift)
    if (SDValue NewLHSShift =
            extractShiftForRotate(DAG, RHSShift, LHS, LHSMask, DL))
      LHSShift = NewLHSShift;

  // If a side is still missing, nothing else we can do.
  if (!RHSShift || !LHSShift)
    return nullptr;

  // At this point we've matched or extracted a shift op on each side.

  if (LHSShift.getOperand(0) != RHSShift.getOperand(0))
    return nullptr; // Not shifting the same value.
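The "both sides matched a rotate half" case described in the comment above is exercised by the first AArch64 test below (ror_extract_shl), whose IR builds, in essence, (or (lshr (shl x 3) 57) (shl x 10)): the lshr side matches a rotate half directly, while (shl x 10) hides the second half. Working the shift case of extractShiftForRotate by hand (my substitution):

\[
\text{NeededShiftAmt} = 64 - 57 = 7, \qquad
\text{ExtractFromAmt} - \text{NeededShiftAmt} = 10 - 7 = 3 = \text{OppLHSAmt},
\]

so (shl x 10) is expanded to (shl (shl x 3) 7), and the or of (lshr t 57) and (shl t 7) with t = (shl x 3) folds to a rotate of t, visible as the lsl+ror pair in the updated CHECK lines.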
test/CodeGen/AArch64/rotate-extract.ll
@@ -11,9 +11,8 @@
define i64 @ror_extract_shl(i64 %i) nounwind {
; CHECK-LABEL: ror_extract_shl:
; CHECK:       // %bb.0:
; CHECK-NEXT:    lsl x8, x0, #10
; CHECK-NEXT:    bfxil x8, x0, #54, #7
; CHECK-NEXT:    mov x0, x8
; CHECK-NEXT:    lsl x8, x0, #3
; CHECK-NEXT:    ror x0, x8, #57
; CHECK-NEXT:    ret
  %lhs_mul = shl i64 %i, 3
  %rhs_mul = shl i64 %i, 10
@@ -25,8 +24,8 @@ define i64 @ror_extract_shl(i64 %i) nounwind {
define i32 @ror_extract_shrl(i32 %i) nounwind {
; CHECK-LABEL: ror_extract_shrl:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ror w8, w0, #7
; CHECK-NEXT:    and w0, w8, #0xf1ffffff
; CHECK-NEXT:    lsr w8, w0, #3
; CHECK-NEXT:    ror w0, w8, #4
; CHECK-NEXT:    ret
  %lhs_div = lshr i32 %i, 7
  %rhs_div = lshr i32 %i, 3
@@ -54,8 +53,8 @@ define i64 @ror_extract_udiv(i64 %i) nounwind {
; CHECK-NEXT:    mov x8, #-6148914691236517206
; CHECK-NEXT:    movk x8, #43691
; CHECK-NEXT:    umulh x8, x0, x8
; CHECK-NEXT:    ror x8, x8, #5
; CHECK-NEXT:    and x0, x8, #0xf7ffffffffffffff
; CHECK-NEXT:    lsr x8, x8, #1
; CHECK-NEXT:    ror x0, x8, #4
; CHECK-NEXT:    ret
  %lhs_div = udiv i64 %i, 3
  %rhs_div = udiv i64 %i, 48
@@ -67,11 +66,9 @@ define i64 @ror_extract_udiv(i64 %i) nounwind {
define i64 @ror_extract_mul_with_mask(i64 %i) nounwind {
; CHECK-LABEL: ror_extract_mul_with_mask:
; CHECK:       // %bb.0:
; CHECK-NEXT:    add w8, w0, w0, lsl #3
; CHECK-NEXT:    lsl w8, w8, #7
; CHECK-NEXT:    add x9, x0, x0, lsl #3
; CHECK-NEXT:    and x0, x8, #0x80
; CHECK-NEXT:    bfxil x0, x9, #57, #7
; CHECK-NEXT:    add x8, x0, x0, lsl #3
; CHECK-NEXT:    ror x8, x8, #57
; CHECK-NEXT:    and x0, x8, #0xff
; CHECK-NEXT:    ret
  %lhs_mul = mul i64 %i, 1152
  %rhs_mul = mul i64 %i, 9
test/CodeGen/X86/rotate-extract-vector.ll
@@ -12,10 +12,10 @@
define <4 x i32> @vroll_v4i32_extract_shl(<4 x i32> %i) {
; CHECK-LABEL: vroll_v4i32_extract_shl:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpslld $3, %xmm0, %xmm1
; CHECK-NEXT:    vpslld $10, %xmm0, %xmm0
; CHECK-NEXT:    vpsrld $25, %xmm1, %xmm1
; CHECK-NEXT:    vpor %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    vpslld $3, %xmm0, %xmm0
; CHECK-NEXT:    vprold $7, %zmm0, %zmm0
; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %lhs_mul = shl <4 x i32> %i, <i32 3, i32 3, i32 3, i32 3>
  %rhs_mul = shl <4 x i32> %i, <i32 10, i32 10, i32 10, i32 10>
@@ -25,20 +25,12 @@ define <4 x i32> @vroll_v4i32_extract_shl(<4 x i32> %i) {
}

define <4 x i64> @vrolq_v4i64_extract_shrl(<4 x i64> %i) nounwind {
; X86-LABEL: vrolq_v4i64_extract_shrl:
; X86:       # %bb.0:
; X86-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; X86-NEXT:    vprolq $24, %zmm0, %zmm0
; X86-NEXT:    vpand {{\.LCPI.*}}, %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: vrolq_v4i64_extract_shrl:
; X64:       # %bb.0:
; X64-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; X64-NEXT:    vprolq $24, %zmm0, %zmm0
; X64-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [18446744073189457919,18446744073189457919,18446744073189457919,18446744073189457919]
; X64-NEXT:    vpand %ymm1, %ymm0, %ymm0
; X64-NEXT:    retq
; CHECK-LABEL: vrolq_v4i64_extract_shrl:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrlq $5, %ymm0, %ymm0
; CHECK-NEXT:    vprolq $29, %zmm0, %zmm0
; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %lhs_div = lshr <4 x i64> %i, <i64 40, i64 40, i64 40, i64 40>
  %rhs_div = lshr <4 x i64> %i, <i64 5, i64 5, i64 5, i64 5>
  %rhs_shift = shl <4 x i64> %rhs_div, <i64 29, i64 29, i64 29, i64 29>
@@ -49,12 +41,10 @@ define <4 x i64> @vrolq_v4i64_extract_shrl(<4 x i64> %i) nounwind {
define <8 x i32> @vroll_extract_mul(<8 x i32> %i) nounwind {
; CHECK-LABEL: vroll_extract_mul:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [640,640,640,640,640,640,640,640]
; CHECK-NEXT:    vpmulld %ymm1, %ymm0, %ymm1
; CHECK-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [10,10,10,10,10,10,10,10]
; CHECK-NEXT:    vpmulld %ymm2, %ymm0, %ymm0
; CHECK-NEXT:    vpsrld $26, %ymm0, %ymm0
; CHECK-NEXT:    vpor %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [10,10,10,10,10,10,10,10]
; CHECK-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    vprold $6, %zmm0, %zmm0
; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %lhs_mul = mul <8 x i32> %i, <i32 640, i32 640, i32 640, i32 640, i32 640, i32 640, i32 640, i32 640>
  %rhs_mul = mul <8 x i32> %i, <i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10>
@@ -66,7 +56,7 @@ define <8 x i32> @vroll_extract_mul(<8 x i32> %i) nounwind {
define <2 x i64> @vrolq_extract_udiv(<2 x i64> %i) nounwind {
; X86-LABEL: vrolq_extract_udiv:
; X86:       # %bb.0:
; X86-NEXT:    subl $60, %esp
; X86-NEXT:    subl $44, %esp
; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    vextractps $1, %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    vmovss %xmm0, (%esp)
@@ -85,53 +75,27 @@ define <2 x i64> @vrolq_extract_udiv(<2 x i64> %i) nounwind {
; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    vpinsrd $2, %eax, %xmm0, %xmm0
; X86-NEXT:    vpinsrd $3, %edx, %xmm0, %xmm0
; X86-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    vextractps $1, %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $384, {{[0-9]+}}(%esp) # imm = 0x180
; X86-NEXT:    calll __udivdi3
; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    vextractps $3, %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    vextractps $2, %xmm0, (%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $384, {{[0-9]+}}(%esp) # imm = 0x180
; X86-NEXT:    vmovd %eax, %xmm0
; X86-NEXT:    vpinsrd $1, %edx, %xmm0, %xmm0
; X86-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    calll __udivdi3
; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    vpinsrd $2, %eax, %xmm0, %xmm0
; X86-NEXT:    vpinsrd $3, %edx, %xmm0, %xmm0
; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
; X86-NEXT:    vpsllq $57, %xmm1, %xmm1
; X86-NEXT:    vpor %xmm0, %xmm1, %xmm0
; X86-NEXT:    addl $60, %esp
; X86-NEXT:    vprolq $57, %zmm0, %zmm0
; X86-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; X86-NEXT:    addl $44, %esp
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: vrolq_extract_udiv:
; X64:       # %bb.0:
; X64-NEXT:    vpextrq $1, %xmm0, %rax
; X64-NEXT:    movabsq $-6148914691236517205, %rsi # imm = 0xAAAAAAAAAAAAAAAB
; X64-NEXT:    mulq %rsi
; X64-NEXT:    movq %rdx, %rcx
; X64-NEXT:    movq %rdx, %rax
; X64-NEXT:    shrq %rax
; X64-NEXT:    vmovq %rax, %xmm1
; X64-NEXT:    movabsq $-6148914691236517205, %rcx # imm = 0xAAAAAAAAAAAAAAAB
; X64-NEXT:    mulq %rcx
; X64-NEXT:    shrq %rdx
; X64-NEXT:    vmovq %rdx, %xmm1
; X64-NEXT:    vmovq %xmm0, %rax
; X64-NEXT:    mulq %rsi
; X64-NEXT:    movq %rdx, %rax
; X64-NEXT:    shrq %rax
; X64-NEXT:    vmovq %rax, %xmm0
; X64-NEXT:    mulq %rcx
; X64-NEXT:    shrq %rdx
; X64-NEXT:    vmovq %rdx, %xmm0
; X64-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X64-NEXT:    shrq $8, %rcx
; X64-NEXT:    vmovq %rcx, %xmm1
; X64-NEXT:    shrq $8, %rdx
; X64-NEXT:    vmovq %rdx, %xmm2
; X64-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; X64-NEXT:    vpsllq $57, %xmm0, %xmm0
; X64-NEXT:    vpor %xmm1, %xmm0, %xmm0
; X64-NEXT:    vprolq $57, %zmm0, %zmm0
; X64-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %lhs_div = udiv <2 x i64> %i, <i64 3, i64 3>
  %rhs_div = udiv <2 x i64> %i, <i64 384, i64 384>
@@ -141,17 +105,23 @@ define <2 x i64> @vrolq_extract_udiv(<2 x i64> %i) nounwind {
}

define <4 x i32> @vrolw_extract_mul_with_mask(<4 x i32> %i) nounwind {
; CHECK-LABEL: vrolw_extract_mul_with_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [1152,1152,1152,1152]
; CHECK-NEXT:    vpmulld %xmm1, %xmm0, %xmm1
; CHECK-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [9,9,9,9]
; CHECK-NEXT:    vpmulld %xmm2, %xmm0, %xmm0
; CHECK-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [160,160,160,160]
; CHECK-NEXT:    vpand %xmm2, %xmm1, %xmm1
; CHECK-NEXT:    vpsrld $25, %xmm0, %xmm0
; CHECK-NEXT:    vpor %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
; X86-LABEL: vrolw_extract_mul_with_mask:
; X86:       # %bb.0:
; X86-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [9,9,9,9]
; X86-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; X86-NEXT:    vprold $7, %zmm0, %zmm0
; X86-NEXT:    vpand {{\.LCPI.*}}, %xmm0, %xmm0
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: vrolw_extract_mul_with_mask:
; X64:       # %bb.0:
; X64-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [9,9,9,9]
; X64-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; X64-NEXT:    vprold $7, %zmm0, %zmm0
; X64-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %lhs_mul = mul <4 x i32> %i, <i32 1152, i32 1152, i32 1152, i32 1152>
  %rhs_mul = mul <4 x i32> %i, <i32 9, i32 9, i32 9, i32 9>
  %lhs_and = and <4 x i32> %lhs_mul, <i32 160, i32 160, i32 160, i32 160>
test/CodeGen/X86/rotate-extract.ll
@@ -24,9 +24,7 @@ define i64 @rolq_extract_shl(i64 %i) nounwind {
; X64-LABEL: rolq_extract_shl:
; X64:       # %bb.0:
; X64-NEXT:    leaq (,%rdi,8), %rax
; X64-NEXT:    shlq $10, %rdi
; X64-NEXT:    shrq $57, %rax
; X64-NEXT:    orq %rdi, %rax
; X64-NEXT:    rolq $7, %rax
; X64-NEXT:    retq
  %lhs_mul = shl i64 %i, 3
  %rhs_mul = shl i64 %i, 10
@@ -39,16 +37,17 @@ define i16 @rolw_extract_shrl(i16 %i) nounwind {
; X86-LABEL: rolw_extract_shrl:
; X86:       # %bb.0:
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    rolw $9, %ax
; X86-NEXT:    andl $61951, %eax # imm = 0xF1FF
; X86-NEXT:    shrl $3, %eax
; X86-NEXT:    rolw $12, %ax
; X86-NEXT:    # kill: def $ax killed $ax killed $eax
; X86-NEXT:    retl
;
; X64-LABEL: rolw_extract_shrl:
; X64:       # %bb.0:
; X64-NEXT:    rolw $9, %di
; X64-NEXT:    andl $61951, %edi # imm = 0xF1FF
; X64-NEXT:    movl %edi, %eax
; X64-NEXT:    movzwl %di, %eax
; X64-NEXT:    shrl $3, %eax
; X64-NEXT:    rolw $12, %ax
; X64-NEXT:    # kill: def $ax killed $ax killed $eax
; X64-NEXT:    retq
  %lhs_div = lshr i16 %i, 7
  %rhs_div = lshr i16 %i, 3
@@ -60,22 +59,16 @@ define i16 @rolw_extract_shrl(i16 %i) nounwind {
define i32 @roll_extract_mul(i32 %i) nounwind {
; X86-LABEL: roll_extract_mul:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    leal (%ecx,%ecx,8), %eax
; X86-NEXT:    shll $7, %ecx
; X86-NEXT:    leal (%ecx,%ecx,8), %ecx
; X86-NEXT:    shrl $25, %eax
; X86-NEXT:    orl %ecx, %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    leal (%eax,%eax,8), %eax
; X86-NEXT:    roll $7, %eax
; X86-NEXT:    retl
;
; X64-LABEL: roll_extract_mul:
; X64:       # %bb.0:
; X64-NEXT:    # kill: def $edi killed $edi def $rdi
; X64-NEXT:    leal (%rdi,%rdi,8), %eax
; X64-NEXT:    shll $7, %edi
; X64-NEXT:    leal (%rdi,%rdi,8), %ecx
; X64-NEXT:    shrl $25, %eax
; X64-NEXT:    orl %ecx, %eax
; X64-NEXT:    roll $7, %eax
; X64-NEXT:    retq
  %lhs_mul = mul i32 %i, 9
  %rhs_mul = mul i32 %i, 1152
@@ -89,11 +82,8 @@ define i8 @rolb_extract_udiv(i8 %i) nounwind {
; X86:       # %bb.0:
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    imull $171, %eax, %eax
; X86-NEXT:    movb %ah, %cl
; X86-NEXT:    shlb $3, %cl
; X86-NEXT:    andb $-16, %cl
; X86-NEXT:    shrl $13, %eax
; X86-NEXT:    orb %cl, %al
; X86-NEXT:    shrl $9, %eax
; X86-NEXT:    rolb $4, %al
; X86-NEXT:    # kill: def $al killed $al killed $eax
; X86-NEXT:    retl
;
@@ -101,12 +91,8 @@ define i8 @rolb_extract_udiv(i8 %i) nounwind {
; X64:       # %bb.0:
; X64-NEXT:    movzbl %dil, %eax
; X64-NEXT:    imull $171, %eax, %eax
; X64-NEXT:    movl %eax, %ecx
; X64-NEXT:    shrl $8, %ecx
; X64-NEXT:    shlb $3, %cl
; X64-NEXT:    andb $-16, %cl
; X64-NEXT:    shrl $13, %eax
; X64-NEXT:    orb %cl, %al
; X64-NEXT:    shrl $9, %eax
; X64-NEXT:    rolb $4, %al
; X64-NEXT:    # kill: def $al killed $al killed $eax
; X64-NEXT:    retq
  %lhs_div = udiv i8 %i, 3
@@ -139,12 +125,8 @@ define i64 @rolq_extract_mul_with_mask(i64 %i) nounwind {
; X64-LABEL: rolq_extract_mul_with_mask:
; X64:       # %bb.0:
; X64-NEXT:    leaq (%rdi,%rdi,8), %rax
; X64-NEXT:    # kill: def $edi killed $edi killed $rdi def $rdi
; X64-NEXT:    shll $7, %edi
; X64-NEXT:    leal (%rdi,%rdi,8), %ecx
; X64-NEXT:    movzbl %cl, %ecx
; X64-NEXT:    shrq $57, %rax
; X64-NEXT:    orq %rcx, %rax
; X64-NEXT:    rolq $7, %rax
; X64-NEXT:    movzbl %al, %eax
; X64-NEXT:    retq
  %lhs_mul = mul i64 %i, 1152
  %rhs_mul = mul i64 %i, 9