mirror of
https://github.com/RPCS3/llvm.git
synced 2025-01-26 04:34:41 +00:00
[X86] Take advantage of the lzcnt instruction on btver2 architectures when ORing comparisons to zero.
This change adds transformations such as: zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0)))) To: srl(or(ctlz(x), ctlz(y)), log2(bitsize(x))) This optimisation is beneficial on the Jaguar architecture only, where lzcnt has a good reciprocal throughput. Other architectures such as Intel's Haswell/Broadwell or AMD's Bulldozer/PileDriver do not benefit from it. For this reason the change also adds a "HasFastLZCNT" feature which gets enabled for Jaguar. Differential Revision: https://reviews.llvm.org/D23446 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@284248 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent
7aaf99b572
commit
4594395329
@ -262,6 +262,12 @@ def FeatureFastScalarFSQRT
|
||||
def FeatureFastVectorFSQRT
|
||||
: SubtargetFeature<"fast-vector-fsqrt", "HasFastVectorFSQRT",
|
||||
"true", "Vector SQRT is fast (disable Newton-Raphson)">;
|
||||
// If lzcnt has equivalent latency/throughput to most simple integer ops, it can
|
||||
// be used to replace test/set sequences. Sets the HasFastLZCNT subtarget field.
|
||||
def FeatureFastLZCNT
|
||||
: SubtargetFeature<
|
||||
"fast-lzcnt", "HasFastLZCNT", "true",
|
||||
"LZCNT instructions are as fast as most simple integer ops">;
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// X86 processors supported.
|
||||
@ -646,6 +652,7 @@ def : ProcessorModel<"btver2", BtVer2Model, [
|
||||
FeatureF16C,
|
||||
FeatureMOVBE,
|
||||
FeatureLZCNT,
|
||||
FeatureFastLZCNT,
|
||||
FeaturePOPCNT,
|
||||
FeatureXSAVE,
|
||||
FeatureXSAVEOPT,
|
||||
|
@ -4178,6 +4178,10 @@ bool X86TargetLowering::isCheapToSpeculateCtlz() const {
|
||||
return Subtarget.hasLZCNT();
|
||||
}
|
||||
|
||||
// Reports whether the subtarget has the fast-LZCNT property, by forwarding
// Subtarget.hasFastLZCNT().
bool X86TargetLowering::isCtlzFast() const {
|
||||
return Subtarget.hasFastLZCNT();
|
||||
}
|
||||
|
||||
bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
|
||||
if (!Subtarget.hasBMI())
|
||||
return false;
|
||||
@ -29090,6 +29094,113 @@ static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
|
||||
return DAG.getBitcast(VT, Mask);
|
||||
}
|
||||
|
||||
// Helper function for combineOrCmpEqZeroToCtlzSrl
|
||||
// Transforms:
|
||||
// seteq(cmp x, 0)
|
||||
// into:
|
||||
// srl(ctlz x), log2(bitsize(x))
|
||||
// Input pattern is checked by caller.
|
||||
SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, EVT ExtTy, SelectionDAG &DAG) {
|
||||
SDValue Cmp = Op.getOperand(1);
|
||||
EVT VT = Cmp.getOperand(0).getValueType();
|
||||
unsigned Log2b = Log2_32(VT.getSizeInBits());
|
||||
SDLoc dl(Op);
|
||||
SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));
|
||||
// The result of the shift is true or false, and on X86, the 32-bit
|
||||
// encoding of shr and lzcnt is more desirable.
|
||||
SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
|
||||
SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
|
||||
DAG.getConstant(Log2b, dl, VT));
|
||||
return DAG.getZExtOrTrunc(Scc, dl, ExtTy);
|
||||
}
|
||||
|
||||
// Try to transform:
|
||||
// zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
|
||||
// into:
|
||||
// srl(or(ctlz(x), ctlz(y)), log2(bitsize(x))
|
||||
// Will also attempt to match more generic cases, eg:
|
||||
// zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
|
||||
// Only applies if the target supports the FastLZCNT feature.
|
||||
static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
|
||||
TargetLowering::DAGCombinerInfo &DCI,
|
||||
const X86Subtarget &Subtarget) {
|
||||
if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast())
|
||||
return SDValue();
|
||||
|
||||
auto isORCandidate = [](SDValue N) {
|
||||
return (N->getOpcode() == ISD::OR && N->hasOneUse());
|
||||
};
|
||||
|
||||
// Check the zero extend is extending to 32-bit or more. The code generated by
|
||||
// srl(ctlz) for 16-bit or less variants of the pattern would require extra
|
||||
// instructions to clear the upper bits.
|
||||
if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) ||
|
||||
!isORCandidate(N->getOperand(0)))
|
||||
return SDValue();
|
||||
|
||||
// Check the node matches: setcc(eq, cmp 0)
|
||||
auto isSetCCCandidate = [](SDValue N) {
|
||||
return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
|
||||
X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
|
||||
N->getOperand(1).getOpcode() == X86ISD::CMP &&
|
||||
N->getOperand(1).getConstantOperandVal(1) == 0 &&
|
||||
N->getOperand(1).getValueType().bitsGE(MVT::i32);
|
||||
};
|
||||
|
||||
SDNode *OR = N->getOperand(0).getNode();
|
||||
SDValue LHS = OR->getOperand(0);
|
||||
SDValue RHS = OR->getOperand(1);
|
||||
|
||||
// Save nodes matching or(or, setcc(eq, cmp 0)).
|
||||
SmallVector<SDNode *, 2> ORNodes;
|
||||
while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) ||
|
||||
(isORCandidate(RHS) && isSetCCCandidate(LHS)))) {
|
||||
ORNodes.push_back(OR);
|
||||
OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
|
||||
LHS = OR->getOperand(0);
|
||||
RHS = OR->getOperand(1);
|
||||
}
|
||||
|
||||
// The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
|
||||
if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) ||
|
||||
!isORCandidate(SDValue(OR, 0)))
|
||||
return SDValue();
|
||||
|
||||
// We have a or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern, try to lower it
|
||||
// to
|
||||
// or(srl(ctlz),srl(ctlz)).
|
||||
// The dag combiner can then fold it into:
|
||||
// srl(or(ctlz, ctlz)).
|
||||
EVT VT = OR->getValueType(0);
|
||||
SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, VT, DAG);
|
||||
SDValue Ret, NewRHS;
|
||||
if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG)))
|
||||
Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, NewLHS, NewRHS);
|
||||
|
||||
if (!Ret)
|
||||
return SDValue();
|
||||
|
||||
// Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.
|
||||
while (ORNodes.size() > 0) {
|
||||
OR = ORNodes.pop_back_val();
|
||||
LHS = OR->getOperand(0);
|
||||
RHS = OR->getOperand(1);
|
||||
// Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).
|
||||
if (RHS->getOpcode() == ISD::OR)
|
||||
std::swap(LHS, RHS);
|
||||
EVT VT = OR->getValueType(0);
|
||||
SDValue NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG);
|
||||
if (!NewRHS)
|
||||
return SDValue();
|
||||
Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, Ret, NewRHS);
|
||||
}
|
||||
|
||||
if (Ret)
|
||||
Ret = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);
|
||||
|
||||
return Ret;
|
||||
}
|
||||
|
||||
static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
|
||||
TargetLowering::DAGCombinerInfo &DCI,
|
||||
const X86Subtarget &Subtarget) {
|
||||
@ -31121,6 +31232,9 @@ static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
|
||||
if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
|
||||
return NewAdd;
|
||||
|
||||
if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
|
||||
return R;
|
||||
|
||||
return SDValue();
|
||||
}
|
||||
|
||||
|
@ -771,6 +771,8 @@ namespace llvm {
|
||||
|
||||
bool isCheapToSpeculateCtlz() const override;
|
||||
|
||||
bool isCtlzFast() const override;
|
||||
|
||||
bool hasBitPreservingFPLogic(EVT VT) const override {
|
||||
return VT == MVT::f32 || VT == MVT::f64 || VT.isVector();
|
||||
}
|
||||
|
@ -890,6 +890,7 @@ def CallImmAddr : Predicate<"Subtarget->isLegalToCallImmediateAddr()">;
|
||||
def FavorMemIndirectCall : Predicate<"!Subtarget->callRegIndirect()">;
|
||||
def NotSlowIncDec : Predicate<"!Subtarget->slowIncDec()">;
|
||||
def HasFastMem32 : Predicate<"!Subtarget->isUnalignedMem32Slow()">;
|
||||
def HasFastLZCNT : Predicate<"Subtarget->hasFastLZCNT()">;
|
||||
def HasMFence : Predicate<"Subtarget->hasMFence()">;
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
@ -284,6 +284,7 @@ void X86Subtarget::initializeEnvironment() {
|
||||
HasFastPartialYMMWrite = false;
|
||||
HasFastScalarFSQRT = false;
|
||||
HasFastVectorFSQRT = false;
|
||||
HasFastLZCNT = false;
|
||||
HasSlowDivide32 = false;
|
||||
HasSlowDivide64 = false;
|
||||
PadShortFunctions = false;
|
||||
|
@ -215,6 +215,9 @@ protected:
|
||||
/// 64-bit divisions and should be used when possible.
|
||||
bool HasSlowDivide64;
|
||||
|
||||
/// True if LZCNT instruction is fast.
|
||||
bool HasFastLZCNT;
|
||||
|
||||
/// True if the short functions should be padded to prevent
|
||||
/// a stall when returning too early.
|
||||
bool PadShortFunctions;
|
||||
@ -444,6 +447,7 @@ public:
|
||||
bool hasFastPartialYMMWrite() const { return HasFastPartialYMMWrite; }
|
||||
bool hasFastScalarFSQRT() const { return HasFastScalarFSQRT; }
|
||||
bool hasFastVectorFSQRT() const { return HasFastVectorFSQRT; }
|
||||
bool hasFastLZCNT() const { return HasFastLZCNT; }
|
||||
bool hasSlowDivide32() const { return HasSlowDivide32; }
|
||||
bool hasSlowDivide64() const { return HasSlowDivide64; }
|
||||
bool padShortFunctions() const { return PadShortFunctions; }
|
||||
|
341
test/CodeGen/X86/lzcnt-zext-cmp.ll
Normal file
341
test/CodeGen/X86/lzcnt-zext-cmp.ll
Normal file
@ -0,0 +1,341 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; Test patterns which generate lzcnt instructions.
|
||||
; Eg: zext(or(setcc(cmp), setcc(cmp))) -> shr(or(lzcnt, lzcnt))
|
||||
; RUN: llc < %s -mtriple=x86_64-pc-linux -mcpu=btver2 | FileCheck %s
|
||||
; RUN: llc < %s -mtriple=x86_64-pc-linux -mcpu=btver2 -mattr=-fast-lzcnt | FileCheck --check-prefix=NOFASTLZCNT %s
|
||||
|
||||
; Test one 32-bit input, output is 32-bit, no transformations expected.
|
||||
define i32 @test_zext_cmp0(i32 %a) {
|
||||
; CHECK-LABEL: test_zext_cmp0:
|
||||
; CHECK: # BB#0: # %entry
|
||||
; CHECK-NEXT: xorl %eax, %eax
|
||||
; CHECK-NEXT: testl %edi, %edi
|
||||
; CHECK-NEXT: sete %al
|
||||
; CHECK-NEXT: retq
|
||||
;
|
||||
; NOFASTLZCNT-LABEL: test_zext_cmp0:
|
||||
; NOFASTLZCNT: # BB#0: # %entry
|
||||
; NOFASTLZCNT-NEXT: xorl %eax, %eax
|
||||
; NOFASTLZCNT-NEXT: testl %edi, %edi
|
||||
; NOFASTLZCNT-NEXT: sete %al
|
||||
; NOFASTLZCNT-NEXT: retq
|
||||
entry:
|
||||
%cmp = icmp eq i32 %a, 0
|
||||
%conv = zext i1 %cmp to i32
|
||||
ret i32 %conv
|
||||
}
|
||||
|
||||
; Test two 32-bit inputs, output is 32-bit.
|
||||
define i32 @test_zext_cmp1(i32 %a, i32 %b) {
|
||||
; CHECK-LABEL: test_zext_cmp1:
|
||||
; CHECK: # BB#0:
|
||||
; CHECK-NEXT: lzcntl %edi, %ecx
|
||||
; CHECK-NEXT: lzcntl %esi, %eax
|
||||
; CHECK-NEXT: orl %ecx, %eax
|
||||
; CHECK-NEXT: shrl $5, %eax
|
||||
; CHECK-NEXT: retq
|
||||
;
|
||||
; NOFASTLZCNT-LABEL: test_zext_cmp1:
|
||||
; NOFASTLZCNT: # BB#0:
|
||||
; NOFASTLZCNT-NEXT: testl %edi, %edi
|
||||
; NOFASTLZCNT-NEXT: sete %al
|
||||
; NOFASTLZCNT-NEXT: testl %esi, %esi
|
||||
; NOFASTLZCNT-NEXT: sete %cl
|
||||
; NOFASTLZCNT-NEXT: orb %al, %cl
|
||||
; NOFASTLZCNT-NEXT: movzbl %cl, %eax
|
||||
; NOFASTLZCNT-NEXT: retq
|
||||
%cmp = icmp eq i32 %a, 0
|
||||
%cmp1 = icmp eq i32 %b, 0
|
||||
%or = or i1 %cmp, %cmp1
|
||||
%lor.ext = zext i1 %or to i32
|
||||
ret i32 %lor.ext
|
||||
}
|
||||
|
||||
; Test two 64-bit inputs, output is 64-bit.
|
||||
define i64 @test_zext_cmp2(i64 %a, i64 %b) {
|
||||
; CHECK-LABEL: test_zext_cmp2:
|
||||
; CHECK: # BB#0:
|
||||
; CHECK-NEXT: lzcntq %rdi, %rcx
|
||||
; CHECK-NEXT: lzcntq %rsi, %rax
|
||||
; CHECK-NEXT: orl %ecx, %eax
|
||||
; CHECK-NEXT: shrl $6, %eax
|
||||
; CHECK-NEXT: retq
|
||||
;
|
||||
; NOFASTLZCNT-LABEL: test_zext_cmp2:
|
||||
; NOFASTLZCNT: # BB#0:
|
||||
; NOFASTLZCNT-NEXT: testq %rdi, %rdi
|
||||
; NOFASTLZCNT-NEXT: sete %al
|
||||
; NOFASTLZCNT-NEXT: testq %rsi, %rsi
|
||||
; NOFASTLZCNT-NEXT: sete %cl
|
||||
; NOFASTLZCNT-NEXT: orb %al, %cl
|
||||
; NOFASTLZCNT-NEXT: movzbl %cl, %eax
|
||||
; NOFASTLZCNT-NEXT: retq
|
||||
%cmp = icmp eq i64 %a, 0
|
||||
%cmp1 = icmp eq i64 %b, 0
|
||||
%or = or i1 %cmp, %cmp1
|
||||
%lor.ext = zext i1 %or to i64
|
||||
ret i64 %lor.ext
|
||||
}
|
||||
|
||||
; Test two 16-bit inputs, output is 16-bit.
|
||||
; The transform is disabled for the 16-bit case, as we still have to clear the
|
||||
; upper 16-bits, adding one more instruction.
|
||||
define i16 @test_zext_cmp3(i16 %a, i16 %b) {
|
||||
; CHECK-LABEL: test_zext_cmp3:
|
||||
; CHECK: # BB#0:
|
||||
; CHECK-NEXT: testw %di, %di
|
||||
; CHECK-NEXT: sete %al
|
||||
; CHECK-NEXT: testw %si, %si
|
||||
; CHECK-NEXT: sete %cl
|
||||
; CHECK-NEXT: orb %al, %cl
|
||||
; CHECK-NEXT: movzbl %cl, %eax
|
||||
; CHECK-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
|
||||
; CHECK-NEXT: retq
|
||||
;
|
||||
; NOFASTLZCNT-LABEL: test_zext_cmp3:
|
||||
; NOFASTLZCNT: # BB#0:
|
||||
; NOFASTLZCNT-NEXT: testw %di, %di
|
||||
; NOFASTLZCNT-NEXT: sete %al
|
||||
; NOFASTLZCNT-NEXT: testw %si, %si
|
||||
; NOFASTLZCNT-NEXT: sete %cl
|
||||
; NOFASTLZCNT-NEXT: orb %al, %cl
|
||||
; NOFASTLZCNT-NEXT: movzbl %cl, %eax
|
||||
; NOFASTLZCNT-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
|
||||
; NOFASTLZCNT-NEXT: retq
|
||||
%cmp = icmp eq i16 %a, 0
|
||||
%cmp1 = icmp eq i16 %b, 0
|
||||
%or = or i1 %cmp, %cmp1
|
||||
%lor.ext = zext i1 %or to i16
|
||||
ret i16 %lor.ext
|
||||
}
|
||||
|
||||
; Test two 32-bit inputs, output is 64-bit.
|
||||
define i64 @test_zext_cmp4(i32 %a, i32 %b) {
|
||||
; CHECK-LABEL: test_zext_cmp4:
|
||||
; CHECK: # BB#0: # %entry
|
||||
; CHECK-NEXT: lzcntl %edi, %ecx
|
||||
; CHECK-NEXT: lzcntl %esi, %eax
|
||||
; CHECK-NEXT: orl %ecx, %eax
|
||||
; CHECK-NEXT: shrl $5, %eax
|
||||
; CHECK-NEXT: retq
|
||||
;
|
||||
; NOFASTLZCNT-LABEL: test_zext_cmp4:
|
||||
; NOFASTLZCNT: # BB#0: # %entry
|
||||
; NOFASTLZCNT-NEXT: testl %edi, %edi
|
||||
; NOFASTLZCNT-NEXT: sete %al
|
||||
; NOFASTLZCNT-NEXT: testl %esi, %esi
|
||||
; NOFASTLZCNT-NEXT: sete %cl
|
||||
; NOFASTLZCNT-NEXT: orb %al, %cl
|
||||
; NOFASTLZCNT-NEXT: movzbl %cl, %eax
|
||||
; NOFASTLZCNT-NEXT: retq
|
||||
entry:
|
||||
%cmp = icmp eq i32 %a, 0
|
||||
%cmp1 = icmp eq i32 %b, 0
|
||||
%0 = or i1 %cmp, %cmp1
|
||||
%conv = zext i1 %0 to i64
|
||||
ret i64 %conv
|
||||
}
|
||||
|
||||
; Test two 64-bit inputs, output is 32-bit.
|
||||
define i32 @test_zext_cmp5(i64 %a, i64 %b) {
|
||||
; CHECK-LABEL: test_zext_cmp5:
|
||||
; CHECK: # BB#0: # %entry
|
||||
; CHECK-NEXT: lzcntq %rdi, %rcx
|
||||
; CHECK-NEXT: lzcntq %rsi, %rax
|
||||
; CHECK-NEXT: orl %ecx, %eax
|
||||
; CHECK-NEXT: shrl $6, %eax
|
||||
; CHECK-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
|
||||
; CHECK-NEXT: retq
|
||||
;
|
||||
; NOFASTLZCNT-LABEL: test_zext_cmp5:
|
||||
; NOFASTLZCNT: # BB#0: # %entry
|
||||
; NOFASTLZCNT-NEXT: testq %rdi, %rdi
|
||||
; NOFASTLZCNT-NEXT: sete %al
|
||||
; NOFASTLZCNT-NEXT: testq %rsi, %rsi
|
||||
; NOFASTLZCNT-NEXT: sete %cl
|
||||
; NOFASTLZCNT-NEXT: orb %al, %cl
|
||||
; NOFASTLZCNT-NEXT: movzbl %cl, %eax
|
||||
; NOFASTLZCNT-NEXT: retq
|
||||
entry:
|
||||
%cmp = icmp eq i64 %a, 0
|
||||
%cmp1 = icmp eq i64 %b, 0
|
||||
%0 = or i1 %cmp, %cmp1
|
||||
%lor.ext = zext i1 %0 to i32
|
||||
ret i32 %lor.ext
|
||||
}
|
||||
|
||||
; Test three 32-bit inputs, output is 32-bit.
|
||||
define i32 @test_zext_cmp6(i32 %a, i32 %b, i32 %c) {
|
||||
; CHECK-LABEL: test_zext_cmp6:
|
||||
; CHECK: # BB#0: # %entry
|
||||
; CHECK-NEXT: lzcntl %edi, %eax
|
||||
; CHECK-NEXT: lzcntl %esi, %ecx
|
||||
; CHECK-NEXT: orl %eax, %ecx
|
||||
; CHECK-NEXT: lzcntl %edx, %eax
|
||||
; CHECK-NEXT: orl %ecx, %eax
|
||||
; CHECK-NEXT: shrl $5, %eax
|
||||
; CHECK-NEXT: retq
|
||||
;
|
||||
; NOFASTLZCNT-LABEL: test_zext_cmp6:
|
||||
; NOFASTLZCNT: # BB#0: # %entry
|
||||
; NOFASTLZCNT-NEXT: testl %edi, %edi
|
||||
; NOFASTLZCNT-NEXT: sete %al
|
||||
; NOFASTLZCNT-NEXT: testl %esi, %esi
|
||||
; NOFASTLZCNT-NEXT: sete %cl
|
||||
; NOFASTLZCNT-NEXT: orb %al, %cl
|
||||
; NOFASTLZCNT-NEXT: testl %edx, %edx
|
||||
; NOFASTLZCNT-NEXT: sete %al
|
||||
; NOFASTLZCNT-NEXT: orb %cl, %al
|
||||
; NOFASTLZCNT-NEXT: movzbl %al, %eax
|
||||
; NOFASTLZCNT-NEXT: retq
|
||||
entry:
|
||||
%cmp = icmp eq i32 %a, 0
|
||||
%cmp1 = icmp eq i32 %b, 0
|
||||
%or.cond = or i1 %cmp, %cmp1
|
||||
%cmp2 = icmp eq i32 %c, 0
|
||||
%.cmp2 = or i1 %or.cond, %cmp2
|
||||
%lor.ext = zext i1 %.cmp2 to i32
|
||||
ret i32 %lor.ext
|
||||
}
|
||||
|
||||
; Test three 32-bit inputs, output is 32-bit, but compared to test_zext_cmp6 test,
|
||||
; %.cmp2 inputs' order is inverted.
|
||||
define i32 @test_zext_cmp7(i32 %a, i32 %b, i32 %c) {
|
||||
; CHECK-LABEL: test_zext_cmp7:
|
||||
; CHECK: # BB#0: # %entry
|
||||
; CHECK-NEXT: lzcntl %edi, %eax
|
||||
; CHECK-NEXT: lzcntl %esi, %ecx
|
||||
; CHECK-NEXT: orl %eax, %ecx
|
||||
; CHECK-NEXT: lzcntl %edx, %eax
|
||||
; CHECK-NEXT: orl %ecx, %eax
|
||||
; CHECK-NEXT: shrl $5, %eax
|
||||
; CHECK-NEXT: retq
|
||||
;
|
||||
; NOFASTLZCNT-LABEL: test_zext_cmp7:
|
||||
; NOFASTLZCNT: # BB#0: # %entry
|
||||
; NOFASTLZCNT-NEXT: testl %edi, %edi
|
||||
; NOFASTLZCNT-NEXT: sete %al
|
||||
; NOFASTLZCNT-NEXT: testl %esi, %esi
|
||||
; NOFASTLZCNT-NEXT: sete %cl
|
||||
; NOFASTLZCNT-NEXT: orb %al, %cl
|
||||
; NOFASTLZCNT-NEXT: testl %edx, %edx
|
||||
; NOFASTLZCNT-NEXT: sete %al
|
||||
; NOFASTLZCNT-NEXT: orb %cl, %al
|
||||
; NOFASTLZCNT-NEXT: movzbl %al, %eax
|
||||
; NOFASTLZCNT-NEXT: retq
|
||||
entry:
|
||||
%cmp = icmp eq i32 %a, 0
|
||||
%cmp1 = icmp eq i32 %b, 0
|
||||
%or.cond = or i1 %cmp, %cmp1
|
||||
%cmp2 = icmp eq i32 %c, 0
|
||||
%.cmp2 = or i1 %cmp2, %or.cond
|
||||
%lor.ext = zext i1 %.cmp2 to i32
|
||||
ret i32 %lor.ext
|
||||
}
|
||||
|
||||
; Test four 32-bit inputs, output is 32-bit.
|
||||
define i32 @test_zext_cmp8(i32 %a, i32 %b, i32 %c, i32 %d) {
|
||||
; CHECK-LABEL: test_zext_cmp8:
|
||||
; CHECK: # BB#0: # %entry
|
||||
; CHECK-NEXT: lzcntl %edi, %eax
|
||||
; CHECK-NEXT: lzcntl %esi, %esi
|
||||
; CHECK-NEXT: lzcntl %edx, %edx
|
||||
; CHECK-NEXT: orl %eax, %esi
|
||||
; CHECK-NEXT: lzcntl %ecx, %eax
|
||||
; CHECK-NEXT: orl %edx, %eax
|
||||
; CHECK-NEXT: orl %esi, %eax
|
||||
; CHECK-NEXT: shrl $5, %eax
|
||||
; CHECK-NEXT: retq
|
||||
;
|
||||
; NOFASTLZCNT-LABEL: test_zext_cmp8:
|
||||
; NOFASTLZCNT: # BB#0: # %entry
|
||||
; NOFASTLZCNT-NEXT: testl %edi, %edi
|
||||
; NOFASTLZCNT-NEXT: sete %dil
|
||||
; NOFASTLZCNT-NEXT: testl %esi, %esi
|
||||
; NOFASTLZCNT-NEXT: sete %al
|
||||
; NOFASTLZCNT-NEXT: orb %dil, %al
|
||||
; NOFASTLZCNT-NEXT: testl %edx, %edx
|
||||
; NOFASTLZCNT-NEXT: sete %dl
|
||||
; NOFASTLZCNT-NEXT: testl %ecx, %ecx
|
||||
; NOFASTLZCNT-NEXT: sete %cl
|
||||
; NOFASTLZCNT-NEXT: orb %dl, %cl
|
||||
; NOFASTLZCNT-NEXT: orb %al, %cl
|
||||
; NOFASTLZCNT-NEXT: movzbl %cl, %eax
|
||||
; NOFASTLZCNT-NEXT: retq
|
||||
entry:
|
||||
%cmp = icmp eq i32 %a, 0
|
||||
%cmp1 = icmp eq i32 %b, 0
|
||||
%or.cond = or i1 %cmp, %cmp1
|
||||
%cmp3 = icmp eq i32 %c, 0
|
||||
%or.cond5 = or i1 %or.cond, %cmp3
|
||||
%cmp4 = icmp eq i32 %d, 0
|
||||
%.cmp4 = or i1 %or.cond5, %cmp4
|
||||
%lor.ext = zext i1 %.cmp4 to i32
|
||||
ret i32 %lor.ext
|
||||
}
|
||||
|
||||
; Test one 32-bit input, one 64-bit input, output is 32-bit.
|
||||
define i32 @test_zext_cmp9(i32 %a, i64 %b) {
|
||||
; CHECK-LABEL: test_zext_cmp9:
|
||||
; CHECK: # BB#0: # %entry
|
||||
; CHECK-NEXT: lzcntq %rsi, %rax
|
||||
; CHECK-NEXT: lzcntl %edi, %ecx
|
||||
; CHECK-NEXT: shrl $5, %ecx
|
||||
; CHECK-NEXT: shrl $6, %eax
|
||||
; CHECK-NEXT: orl %ecx, %eax
|
||||
; CHECK-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
|
||||
; CHECK-NEXT: retq
|
||||
;
|
||||
; NOFASTLZCNT-LABEL: test_zext_cmp9:
|
||||
; NOFASTLZCNT: # BB#0: # %entry
|
||||
; NOFASTLZCNT-NEXT: testl %edi, %edi
|
||||
; NOFASTLZCNT-NEXT: sete %al
|
||||
; NOFASTLZCNT-NEXT: testq %rsi, %rsi
|
||||
; NOFASTLZCNT-NEXT: sete %cl
|
||||
; NOFASTLZCNT-NEXT: orb %al, %cl
|
||||
; NOFASTLZCNT-NEXT: movzbl %cl, %eax
|
||||
; NOFASTLZCNT-NEXT: retq
|
||||
entry:
|
||||
%cmp = icmp eq i32 %a, 0
|
||||
%cmp1 = icmp eq i64 %b, 0
|
||||
%0 = or i1 %cmp, %cmp1
|
||||
%lor.ext = zext i1 %0 to i32
|
||||
ret i32 %lor.ext
|
||||
}
|
||||
|
||||
; Test 2 128-bit inputs, output is 32-bit, no transformations expected.
|
||||
define i32 @test_zext_cmp10(i64 %a.coerce0, i64 %a.coerce1, i64 %b.coerce0, i64 %b.coerce1) {
|
||||
; CHECK-LABEL: test_zext_cmp10:
|
||||
; CHECK: # BB#0: # %entry
|
||||
; CHECK-NEXT: orq %rsi, %rdi
|
||||
; CHECK-NEXT: sete %al
|
||||
; CHECK-NEXT: orq %rcx, %rdx
|
||||
; CHECK-NEXT: sete %cl
|
||||
; CHECK-NEXT: orb %al, %cl
|
||||
; CHECK-NEXT: movzbl %cl, %eax
|
||||
; CHECK-NEXT: retq
|
||||
;
|
||||
; NOFASTLZCNT-LABEL: test_zext_cmp10:
|
||||
; NOFASTLZCNT: # BB#0: # %entry
|
||||
; NOFASTLZCNT-NEXT: orq %rsi, %rdi
|
||||
; NOFASTLZCNT-NEXT: sete %al
|
||||
; NOFASTLZCNT-NEXT: orq %rcx, %rdx
|
||||
; NOFASTLZCNT-NEXT: sete %cl
|
||||
; NOFASTLZCNT-NEXT: orb %al, %cl
|
||||
; NOFASTLZCNT-NEXT: movzbl %cl, %eax
|
||||
; NOFASTLZCNT-NEXT: retq
|
||||
entry:
|
||||
%a.sroa.2.0.insert.ext = zext i64 %a.coerce1 to i128
|
||||
%a.sroa.2.0.insert.shift = shl nuw i128 %a.sroa.2.0.insert.ext, 64
|
||||
%a.sroa.0.0.insert.ext = zext i64 %a.coerce0 to i128
|
||||
%a.sroa.0.0.insert.insert = or i128 %a.sroa.2.0.insert.shift, %a.sroa.0.0.insert.ext
|
||||
%b.sroa.2.0.insert.ext = zext i64 %b.coerce1 to i128
|
||||
%b.sroa.2.0.insert.shift = shl nuw i128 %b.sroa.2.0.insert.ext, 64
|
||||
%b.sroa.0.0.insert.ext = zext i64 %b.coerce0 to i128
|
||||
%b.sroa.0.0.insert.insert = or i128 %b.sroa.2.0.insert.shift, %b.sroa.0.0.insert.ext
|
||||
%cmp = icmp eq i128 %a.sroa.0.0.insert.insert, 0
|
||||
%cmp3 = icmp eq i128 %b.sroa.0.0.insert.insert, 0
|
||||
%0 = or i1 %cmp, %cmp3
|
||||
%lor.ext = zext i1 %0 to i32
|
||||
ret i32 %lor.ext
|
||||
}
|
Loading…
x
Reference in New Issue
Block a user