diff --git a/include/llvm/Target/TargetInstrInfo.h b/include/llvm/Target/TargetInstrInfo.h
index 73efc507fab..cfb9dd7de24 100644
--- a/include/llvm/Target/TargetInstrInfo.h
+++ b/include/llvm/Target/TargetInstrInfo.h
@@ -14,6 +14,7 @@
 #ifndef LLVM_TARGET_TARGETINSTRINFO_H
 #define LLVM_TARGET_TARGETINSTRINFO_H
 
+#include "llvm/ADT/SmallSet.h"
 #include "llvm/MC/MCInstrInfo.h"
 #include "llvm/CodeGen/DFAPacketizer.h"
 #include "llvm/CodeGen/MachineFunction.h"
@@ -693,6 +694,16 @@ public:
     return false;
   }
 
+  /// optimizeLoadInstr - Try to remove the load by folding it to a register
+  /// operand at the use. We fold the load instruction only if the def and
+  /// use are in the same BB.
+  virtual MachineInstr* optimizeLoadInstr(MachineInstr *MI,
+                                          const MachineRegisterInfo *MRI,
+                                          unsigned &FoldAsLoadDefReg,
+                                          MachineInstr *&DefMI) const {
+    return 0;
+  }
+
   /// FoldImmediate - 'Reg' is known to be defined by a move immediate
   /// instruction, try to fold the immediate into the use instruction.
   virtual bool FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI,
diff --git a/lib/CodeGen/PeepholeOptimizer.cpp b/lib/CodeGen/PeepholeOptimizer.cpp
index 91c33c4af41..d9474bf2400 100644
--- a/lib/CodeGen/PeepholeOptimizer.cpp
+++ b/lib/CodeGen/PeepholeOptimizer.cpp
@@ -78,6 +78,7 @@ STATISTIC(NumReuse, "Number of extension results reused");
 STATISTIC(NumBitcasts, "Number of bitcasts eliminated");
 STATISTIC(NumCmps, "Number of compares eliminated");
 STATISTIC(NumImmFold, "Number of move immediate folded");
+STATISTIC(NumLoadFold, "Number of loads folded");
 
 namespace {
   class PeepholeOptimizer : public MachineFunctionPass {
@@ -114,6 +115,7 @@
     bool foldImmediate(MachineInstr *MI, MachineBasicBlock *MBB,
                        SmallSet<unsigned, 4> &ImmDefRegs,
                        DenseMap<unsigned, MachineInstr*> &ImmDefMIs);
+    bool isLoadFoldable(MachineInstr *MI, unsigned &FoldAsLoadDefReg);
   };
 }
@@ -384,6 +386,29 @@ bool PeepholeOptimizer::optimizeCmpInstr(MachineInstr *MI,
   return false;
 }
 
+/// isLoadFoldable - Check whether MI is a candidate for folding into a later
+/// instruction. We only fold loads that define a virtual register and whose
+/// defined register has a single use.
+bool PeepholeOptimizer::isLoadFoldable(MachineInstr *MI,
+                                       unsigned &FoldAsLoadDefReg) {
+  if (MI->canFoldAsLoad()) {
+    const MCInstrDesc &MCID = MI->getDesc();
+    if (MCID.getNumDefs() == 1) {
+      unsigned Reg = MI->getOperand(0).getReg();
+      // To reduce compilation time, we check MRI->hasOneUse when inserting
+      // loads. It should be checked again when processing the uses of the
+      // load, since uses can be removed during the peephole pass.
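+      // Also require that the load fully defines a virtual register;
+      // sub-register defs and physical-register defs are not considered.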
+      if (!MI->getOperand(0).getSubReg() &&
+          TargetRegisterInfo::isVirtualRegister(Reg) &&
+          MRI->hasOneUse(Reg)) {
+        FoldAsLoadDefReg = Reg;
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
 bool PeepholeOptimizer::isMoveImmediate(MachineInstr *MI,
                                         SmallSet<unsigned, 4> &ImmDefRegs,
                                         DenseMap<unsigned, MachineInstr*> &ImmDefMIs) {
@@ -441,6 +466,7 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) {
   SmallPtrSet<MachineInstr*, 8> LocalMIs;
   SmallSet<unsigned, 4> ImmDefRegs;
   DenseMap<unsigned, MachineInstr*> ImmDefMIs;
+  unsigned FoldAsLoadDefReg;
 
   for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) {
     MachineBasicBlock *MBB = &*I;
@@ -448,6 +474,7 @@
     LocalMIs.clear();
     ImmDefRegs.clear();
     ImmDefMIs.clear();
+    FoldAsLoadDefReg = 0;
 
     bool First = true;
     MachineBasicBlock::iterator PMII;
@@ -456,12 +483,17 @@
       MachineInstr *MI = &*MII;
       LocalMIs.insert(MI);
 
+      // If the instruction falls into one of the following categories,
+      // discard the current load-folding candidate.
       if (MI->isLabel() || MI->isPHI() || MI->isImplicitDef() ||
           MI->isKill() || MI->isInlineAsm() || MI->isDebugValue() ||
           MI->hasUnmodeledSideEffects()) {
+        FoldAsLoadDefReg = 0;
         ++MII;
         continue;
       }
+      if (MI->mayStore() || MI->isCall())
+        FoldAsLoadDefReg = 0;
 
       if (MI->isBitcast()) {
         if (optimizeBitcastInstr(MI, MBB)) {
@@ -489,6 +521,31 @@
         Changed |= foldImmediate(MI, MBB, ImmDefRegs, ImmDefMIs);
       }
 
+      // Check whether MI is a load candidate for folding into a later
+      // instruction. If MI is not a candidate, check whether we can fold an
+      // earlier load into MI.
+      if (!isLoadFoldable(MI, FoldAsLoadDefReg) && FoldAsLoadDefReg) {
+        // We need to fold the load after optimizeCmpInstr, since
+        // optimizeCmpInstr can enable folding by converting SUB to CMP.
+        MachineInstr *DefMI = 0;
+        MachineInstr *FoldMI = TII->optimizeLoadInstr(MI, MRI,
+                                                      FoldAsLoadDefReg, DefMI);
+        if (FoldMI) {
+          // Update LocalMIs since we replaced MI with FoldMI and deleted DefMI.
+          LocalMIs.erase(MI);
+          LocalMIs.erase(DefMI);
+          LocalMIs.insert(FoldMI);
+          MI->eraseFromParent();
+          DefMI->eraseFromParent();
+          ++NumLoadFold;
+
+          // MI is replaced with FoldMI.
+          Changed = true;
+          PMII = FoldMI;
+          MII = llvm::next(PMII);
+          continue;
+        }
+      }
       First = false;
       PMII = MII;
       ++MII;
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index 3f16372497c..ec793f8eb63 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -3323,6 +3323,81 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2,
   return true;
 }
 
+/// optimizeLoadInstr - Try to remove the load by folding it to a register
+/// operand at the use. We fold the load only if it defines a virtual register
+/// with a single use in the same BB, and only if the instructions in between
+/// do not load, store, or have other side effects.
+MachineInstr* X86InstrInfo::
+optimizeLoadInstr(MachineInstr *MI, const MachineRegisterInfo *MRI,
+                  unsigned &FoldAsLoadDefReg,
+                  MachineInstr *&DefMI) const {
+  if (FoldAsLoadDefReg == 0)
+    return 0;
+  // To be conservative, clear the load candidate if we see another load.
+  if (MI->mayLoad()) {
+    FoldAsLoadDefReg = 0;
+    return 0;
+  }
+
+  // Check whether we can move DefMI here.
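+  // Folding sinks the load down to the use site, past every instruction
+  // between DefMI and MI, so the load itself must be safe to move: no side
+  // effects and no volatile or ordered memory access. Intervening stores and
+  // calls already cause the peephole pass to drop the candidate.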
+  DefMI = MRI->getVRegDef(FoldAsLoadDefReg);
+  assert(DefMI);
+  bool SawStore = false;
+  if (!DefMI->isSafeToMove(this, 0, SawStore))
+    return 0;
+
+  // Attempt the fold at most twice: once with MI as it is and, if MI is
+  // commutable, once more after commuting its operands.
+  unsigned IdxEnd = (MI->isCommutable()) ? 2 : 1;
+  for (unsigned Idx = 0; Idx < IdxEnd; Idx++) {
+    // Collect information about virtual register operands of MI.
+    unsigned SrcOperandId = 0;
+    bool FoundSrcOperand = false;
+    for (unsigned i = 0, e = MI->getDesc().getNumOperands(); i != e; ++i) {
+      MachineOperand &MO = MI->getOperand(i);
+      if (!MO.isReg())
+        continue;
+      unsigned Reg = MO.getReg();
+      if (Reg != FoldAsLoadDefReg)
+        continue;
+      // Do not fold if we have a subreg use, a def, or multiple uses.
+      if (MO.getSubReg() || MO.isDef() || FoundSrcOperand)
+        return 0;
+
+      SrcOperandId = i;
+      FoundSrcOperand = true;
+    }
+    if (!FoundSrcOperand) return 0;
+
+    // Check whether we can fold the def into SrcOperandId.
+    SmallVector<unsigned, 2> Ops;
+    Ops.push_back(SrcOperandId);
+    MachineInstr *FoldMI = foldMemoryOperand(MI, Ops, DefMI);
+    if (FoldMI) {
+      FoldAsLoadDefReg = 0;
+      return FoldMI;
+    }
+
+    if (Idx == 1) {
+      // MI was changed but it didn't help, commute it back!
+      commuteInstruction(MI, false);
+      return 0;
+    }
+
+    // Check whether we can commute MI and enable folding.
+    if (MI->isCommutable()) {
+      MachineInstr *NewMI = commuteInstruction(MI, false);
+      // Unable to commute.
+      if (!NewMI) return 0;
+      if (NewMI != MI) {
+        // New instruction. It doesn't need to be kept.
+        NewMI->eraseFromParent();
+        return 0;
+      }
+    }
+  }
+  return 0;
+}
+
 /// Expand2AddrUndef - Expand a single-def pseudo instruction to a two-addr
 /// instruction with two undef reads of the register being defined. This is
 /// used for mapping:
diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h
index ec9b2e619d9..9ed5210b22f 100644
--- a/lib/Target/X86/X86InstrInfo.h
+++ b/lib/Target/X86/X86InstrInfo.h
@@ -387,6 +387,14 @@ public:
                             unsigned SrcReg2, int CmpMask, int CmpValue,
                             const MachineRegisterInfo *MRI) const;
 
+  /// optimizeLoadInstr - Try to remove the load by folding it to a register
+  /// operand at the use. We fold the load instruction only if the def and
+  /// use are in the same BB.
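+  /// FoldAsLoadDefReg is the virtual register defined by the load we are
+  /// trying to fold; it is reset to zero once the load has been folded or
+  /// the candidate has to be dropped. DefMI is set to the instruction that
+  /// defines FoldAsLoadDefReg.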
+  virtual MachineInstr* optimizeLoadInstr(MachineInstr *MI,
+                                          const MachineRegisterInfo *MRI,
+                                          unsigned &FoldAsLoadDefReg,
+                                          MachineInstr *&DefMI) const;
+
 private:
   MachineInstr * convertToThreeAddressWithLEA(unsigned MIOpc,
                                               MachineFunction::iterator &MFI,
diff --git a/test/CodeGen/X86/2012-05-19-avx2-store.ll b/test/CodeGen/X86/2012-05-19-avx2-store.ll
index 61fef90139d..1c1e8e2f0a2 100644
--- a/test/CodeGen/X86/2012-05-19-avx2-store.ll
+++ b/test/CodeGen/X86/2012-05-19-avx2-store.ll
@@ -3,8 +3,7 @@
 define void @double_save(<4 x i32>* %Ap, <4 x i32>* %Bp, <8 x i32>* %P) nounwind ssp {
 entry:
   ; CHECK: vmovaps
-  ; CHECK: vmovaps
-  ; CHECK: vinsertf128
+  ; CHECK: vinsertf128 $1, ([[A0:%rdi|%rsi]]),
   ; CHECK: vmovups
   %A = load <4 x i32>* %Ap
   %B = load <4 x i32>* %Bp
diff --git a/test/CodeGen/X86/break-sse-dep.ll b/test/CodeGen/X86/break-sse-dep.ll
index 3e658671436..4d801891da5 100644
--- a/test/CodeGen/X86/break-sse-dep.ll
+++ b/test/CodeGen/X86/break-sse-dep.ll
@@ -34,8 +34,7 @@ entry:
 define double @squirt(double* %x) nounwind {
 entry:
 ; CHECK: squirt:
-; CHECK: movsd ([[A0]]), %xmm0
-; CHECK: sqrtsd %xmm0, %xmm0
+; CHECK: sqrtsd ([[A0]]), %xmm0
   %z = load double* %x
   %t = call double @llvm.sqrt.f64(double %z)
   ret double %t
diff --git a/test/CodeGen/X86/fold-load.ll b/test/CodeGen/X86/fold-load.ll
index e03cb7edb58..c961f7576f9 100644
--- a/test/CodeGen/X86/fold-load.ll
+++ b/test/CodeGen/X86/fold-load.ll
@@ -45,3 +45,29 @@ L:
 }
 
 
+; rdar://10554090
+; xor in exit block will be CSE'ed and load will be folded to xor in entry.
+define i1 @test3(i32* %P, i32* %Q) nounwind {
+; CHECK: test3:
+; CHECK: movl 8(%esp), %eax
+; CHECK: xorl (%eax),
+; CHECK: j
+; CHECK-NOT: xor
+entry:
+  %0 = load i32* %P, align 4
+  %1 = load i32* %Q, align 4
+  %2 = xor i32 %0, %1
+  %3 = and i32 %2, 65535
+  %4 = icmp eq i32 %3, 0
+  br i1 %4, label %exit, label %land.end
+
+exit:
+  %shr.i.i19 = xor i32 %1, %0
+  %5 = and i32 %shr.i.i19, 2147418112
+  %6 = icmp eq i32 %5, 0
+  br label %land.end
+
+land.end:
+  %7 = phi i1 [ %6, %exit ], [ false, %entry ]
+  ret i1 %7
+}
diff --git a/test/CodeGen/X86/fold-pcmpeqd-1.ll b/test/CodeGen/X86/fold-pcmpeqd-1.ll
index cc4198d7caf..a35dccddbab 100644
--- a/test/CodeGen/X86/fold-pcmpeqd-1.ll
+++ b/test/CodeGen/X86/fold-pcmpeqd-1.ll
@@ -1,11 +1,14 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 > %t
-; RUN: grep pcmpeqd %t | count 1
-; RUN: grep xor %t | count 1
-; RUN: not grep LCP %t
+; RUN: llc < %s -march=x86 -mattr=+sse2 | FileCheck %s
 
 define <2 x double> @foo() nounwind {
   ret <2 x double> bitcast (<2 x i64> <i64 -1, i64 -1> to <2 x double>)
+; CHECK: foo:
+; CHECK: pcmpeqd %xmm{{[0-9]+}}, %xmm{{[0-9]+}}
+; CHECK-NEXT: ret
 }
 define <2 x double> @bar() nounwind {
   ret <2 x double> bitcast (<2 x i64> <i64 0, i64 0> to <2 x double>)
+; CHECK: bar:
+; CHECK: xorps %xmm{{[0-9]+}}, %xmm{{[0-9]+}}
+; CHECK-NEXT: ret
 }
diff --git a/test/CodeGen/X86/sse-minmax.ll b/test/CodeGen/X86/sse-minmax.ll
index 4405f684512..5d3dbce1df9 100644
--- a/test/CodeGen/X86/sse-minmax.ll
+++ b/test/CodeGen/X86/sse-minmax.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -march=x86-64 -mcpu=nehalem -asm-verbose=false | FileCheck %s
-; RUN: llc < %s -march=x86-64 -mcpu=nehalem -asm-verbose=false -enable-unsafe-fp-math -enable-no-nans-fp-math | FileCheck -check-prefix=UNSAFE %s
-; RUN: llc < %s -march=x86-64 -mcpu=nehalem -asm-verbose=false -enable-no-nans-fp-math | FileCheck -check-prefix=FINITE %s
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=nehalem -asm-verbose=false | FileCheck %s
+; RUN: llc < %s -march=x86-64 
-mtriple=x86_64-apple-darwin -mcpu=nehalem -asm-verbose=false -enable-unsafe-fp-math -enable-no-nans-fp-math | FileCheck -check-prefix=UNSAFE %s +; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=nehalem -asm-verbose=false -enable-no-nans-fp-math | FileCheck -check-prefix=FINITE %s ; Some of these patterns can be matched as SSE min or max. Some of ; then can be matched provided that the operands are swapped. @@ -137,16 +137,13 @@ define double @ole_inverse(double %x, double %y) nounwind { } ; CHECK: ogt_x: -; CHECK-NEXT: xorp{{[sd]}} %xmm1, %xmm1 -; CHECK-NEXT: maxsd %xmm1, %xmm0 +; CHECK-NEXT: maxsd LCP{{.*}}(%rip), %xmm0 ; CHECK-NEXT: ret ; UNSAFE: ogt_x: -; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1 -; UNSAFE-NEXT: maxsd %xmm1, %xmm0 +; UNSAFE-NEXT: maxsd LCP{{.*}}(%rip), %xmm0 ; UNSAFE-NEXT: ret ; FINITE: ogt_x: -; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1 -; FINITE-NEXT: maxsd %xmm1, %xmm0 +; FINITE-NEXT: maxsd LCP{{.*}}(%rip), %xmm0 ; FINITE-NEXT: ret define double @ogt_x(double %x) nounwind { %c = fcmp ogt double %x, 0.000000e+00 @@ -155,16 +152,13 @@ define double @ogt_x(double %x) nounwind { } ; CHECK: olt_x: -; CHECK-NEXT: xorp{{[sd]}} %xmm1, %xmm1 -; CHECK-NEXT: minsd %xmm1, %xmm0 +; CHECK-NEXT: minsd LCP{{.*}}(%rip), %xmm0 ; CHECK-NEXT: ret ; UNSAFE: olt_x: -; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1 -; UNSAFE-NEXT: minsd %xmm1, %xmm0 +; UNSAFE-NEXT: minsd LCP{{.*}}(%rip), %xmm0 ; UNSAFE-NEXT: ret ; FINITE: olt_x: -; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1 -; FINITE-NEXT: minsd %xmm1, %xmm0 +; FINITE-NEXT: minsd LCP{{.*}}(%rip), %xmm0 ; FINITE-NEXT: ret define double @olt_x(double %x) nounwind { %c = fcmp olt double %x, 0.000000e+00 @@ -217,12 +211,10 @@ define double @olt_inverse_x(double %x) nounwind { ; CHECK: oge_x: ; CHECK: ucomisd %xmm1, %xmm0 ; UNSAFE: oge_x: -; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1 -; UNSAFE-NEXT: maxsd %xmm1, %xmm0 +; UNSAFE-NEXT: maxsd LCP{{.*}}(%rip), %xmm0 ; UNSAFE-NEXT: ret ; FINITE: oge_x: -; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1 -; FINITE-NEXT: maxsd %xmm1, %xmm0 +; FINITE-NEXT: maxsd LCP{{.*}}(%rip), %xmm0 ; FINITE-NEXT: ret define double @oge_x(double %x) nounwind { %c = fcmp oge double %x, 0.000000e+00 @@ -233,12 +225,10 @@ define double @oge_x(double %x) nounwind { ; CHECK: ole_x: ; CHECK: ucomisd %xmm0, %xmm1 ; UNSAFE: ole_x: -; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1 -; UNSAFE-NEXT: minsd %xmm1, %xmm0 +; UNSAFE-NEXT: minsd LCP{{.*}}(%rip), %xmm0 ; UNSAFE-NEXT: ret ; FINITE: ole_x: -; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1 -; FINITE-NEXT: minsd %xmm1, %xmm0 +; FINITE-NEXT: minsd LCP{{.*}}(%rip), %xmm0 ; FINITE-NEXT: ret define double @ole_x(double %x) nounwind { %c = fcmp ole double %x, 0.000000e+00 @@ -411,12 +401,10 @@ define double @ule_inverse(double %x, double %y) nounwind { ; CHECK: ugt_x: ; CHECK: ucomisd %xmm0, %xmm1 ; UNSAFE: ugt_x: -; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1 -; UNSAFE-NEXT: maxsd %xmm1, %xmm0 +; UNSAFE-NEXT: maxsd LCP{{.*}}(%rip), %xmm0 ; UNSAFE-NEXT: ret ; FINITE: ugt_x: -; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1 -; FINITE-NEXT: maxsd %xmm1, %xmm0 +; FINITE-NEXT: maxsd LCP{{.*}}(%rip), %xmm0 ; FINITE-NEXT: ret define double @ugt_x(double %x) nounwind { %c = fcmp ugt double %x, 0.000000e+00 @@ -427,12 +415,10 @@ define double @ugt_x(double %x) nounwind { ; CHECK: ult_x: ; CHECK: ucomisd %xmm1, %xmm0 ; UNSAFE: ult_x: -; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1 -; UNSAFE-NEXT: minsd %xmm1, %xmm0 +; UNSAFE-NEXT: minsd LCP{{.*}}(%rip), %xmm0 ; UNSAFE-NEXT: ret ; FINITE: ult_x: -; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1 
-; FINITE-NEXT: minsd %xmm1, %xmm0 +; FINITE-NEXT: minsd LCP{{.*}}(%rip), %xmm0 ; FINITE-NEXT: ret define double @ult_x(double %x) nounwind { %c = fcmp ult double %x, 0.000000e+00 @@ -482,12 +468,10 @@ define double @ult_inverse_x(double %x) nounwind { ; CHECK-NEXT: movap{{[sd]}} %xmm1, %xmm0 ; CHECK-NEXT: ret ; UNSAFE: uge_x: -; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1 -; UNSAFE-NEXT: maxsd %xmm1, %xmm0 +; UNSAFE-NEXT: maxsd LCP{{.*}}(%rip), %xmm0 ; UNSAFE-NEXT: ret ; FINITE: uge_x: -; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1 -; FINITE-NEXT: maxsd %xmm1, %xmm0 +; FINITE-NEXT: maxsd LCP{{.*}}(%rip), %xmm0 ; FINITE-NEXT: ret define double @uge_x(double %x) nounwind { %c = fcmp uge double %x, 0.000000e+00 @@ -501,12 +485,10 @@ define double @uge_x(double %x) nounwind { ; CHECK-NEXT: movap{{[sd]}} %xmm1, %xmm0 ; CHECK-NEXT: ret ; UNSAFE: ule_x: -; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1 -; UNSAFE-NEXT: minsd %xmm1, %xmm0 +; UNSAFE-NEXT: minsd LCP{{.*}}(%rip), %xmm0 ; UNSAFE-NEXT: ret ; FINITE: ule_x: -; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1 -; FINITE-NEXT: minsd %xmm1, %xmm0 +; FINITE-NEXT: minsd LCP{{.*}}(%rip), %xmm0 ; FINITE-NEXT: ret define double @ule_x(double %x) nounwind { %c = fcmp ule double %x, 0.000000e+00 @@ -515,8 +497,7 @@ define double @ule_x(double %x) nounwind { } ; CHECK: uge_inverse_x: -; CHECK-NEXT: xorp{{[sd]}} %xmm1, %xmm1 -; CHECK-NEXT: minsd %xmm1, %xmm0 +; CHECK-NEXT: minsd LCP{{.*}}(%rip), %xmm0 ; CHECK-NEXT: ret ; UNSAFE: uge_inverse_x: ; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1 @@ -535,8 +516,7 @@ define double @uge_inverse_x(double %x) nounwind { } ; CHECK: ule_inverse_x: -; CHECK-NEXT: xorp{{[sd]}} %xmm1, %xmm1 -; CHECK-NEXT: maxsd %xmm1, %xmm0 +; CHECK-NEXT: maxsd LCP{{.*}}(%rip), %xmm0 ; CHECK-NEXT: ret ; UNSAFE: ule_inverse_x: ; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1 diff --git a/test/CodeGen/X86/vec_compare.ll b/test/CodeGen/X86/vec_compare.ll index 39c9b770d5f..1e04f19ee89 100644 --- a/test/CodeGen/X86/vec_compare.ll +++ b/test/CodeGen/X86/vec_compare.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=x86 -mcpu=yonah | FileCheck %s +; RUN: llc < %s -march=x86 -mcpu=yonah -mtriple=i386-apple-darwin | FileCheck %s define <4 x i32> @test1(<4 x i32> %A, <4 x i32> %B) nounwind { @@ -14,8 +14,8 @@ define <4 x i32> @test1(<4 x i32> %A, <4 x i32> %B) nounwind { define <4 x i32> @test2(<4 x i32> %A, <4 x i32> %B) nounwind { ; CHECK: test2: ; CHECK: pcmp -; CHECK: pcmp -; CHECK: pxor +; CHECK: pxor LCP +; CHECK: movdqa ; CHECK: ret %C = icmp sge <4 x i32> %A, %B %D = sext <4 x i1> %C to <4 x i32>