llvm/lib/Target/X86/X86FastISel.cpp
Jeroen Ketema f5f9e9a3bf Arguments spilled on the stack before a function call may have
alignment requirements, for example in the case of vectors.
These requirements are exploited by the code generator by using
move instructions that have similar alignment requirements, e.g.,
movaps on x86.

Although the code generator properly aligns the arguments with
respect to the displacement of the stack pointer it computes,
the displacement itself may cause misalignment. For example if
we have

%3 = load <16 x float>, <16 x float>* %1, align 64
call void @bar(<16 x float> %3, i32 0)

the x86 back-end emits:

movaps  32(%ecx), %xmm2
movaps  (%ecx), %xmm0
movaps  16(%ecx), %xmm1
movaps  48(%ecx), %xmm3
subl    $20, %esp       <-- if %esp was 16-byte aligned before this instruction, it no longer will be afterwards 
movaps  %xmm3, (%esp)   <-- movaps requires 16-byte alignment, while %esp is not aligned as such.
movl    $0, 16(%esp)
calll   __bar

To solve this, we need to make sure that the computed value with which
the stack pointer is changed is a multiple af the maximal alignment seen
during its computation. With this change we get proper alignment:

subl    $32, %esp
movaps  %xmm3, (%esp)

Differential Revision: http://reviews.llvm.org/D12337


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@248786 91177308-0d34-0410-b5e6-96231b3b80d8
2015-09-29 10:12:57 +00:00

3554 lines
122 KiB
C++

//===-- X86FastISel.cpp - X86 FastISel implementation ---------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the X86-specific support for the FastISel class. Much
// of the target-specific code is generated by tablegen in the file
// X86GenFastISel.inc, which is #included here.
//
//===----------------------------------------------------------------------===//
#include "X86.h"
#include "X86CallingConv.h"
#include "X86InstrBuilder.h"
#include "X86InstrInfo.h"
#include "X86MachineFunctionInfo.h"
#include "X86RegisterInfo.h"
#include "X86Subtarget.h"
#include "X86TargetMachine.h"
#include "llvm/Analysis/BranchProbabilityInfo.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/FastISel.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Operator.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Target/TargetOptions.h"
using namespace llvm;
namespace {
class X86FastISel final : public FastISel {
/// Subtarget - Keep a pointer to the X86Subtarget around so that we can
/// make the right decision when generating code for different targets.
const X86Subtarget *Subtarget;
/// X86ScalarSSEf32, X86ScalarSSEf64 - Select between SSE or x87
/// floating point ops.
/// When SSE is available, use it for f32 operations.
/// When SSE2 is available, use it for f64 operations.
bool X86ScalarSSEf64;
bool X86ScalarSSEf32;
public:
explicit X86FastISel(FunctionLoweringInfo &funcInfo,
const TargetLibraryInfo *libInfo)
: FastISel(funcInfo, libInfo) {
Subtarget = &funcInfo.MF->getSubtarget<X86Subtarget>();
X86ScalarSSEf64 = Subtarget->hasSSE2();
X86ScalarSSEf32 = Subtarget->hasSSE1();
}
bool fastSelectInstruction(const Instruction *I) override;
/// \brief The specified machine instr operand is a vreg, and that
/// vreg is being provided by the specified load instruction. If possible,
/// try to fold the load as an operand to the instruction, returning true if
/// possible.
bool tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
const LoadInst *LI) override;
bool fastLowerArguments() override;
bool fastLowerCall(CallLoweringInfo &CLI) override;
bool fastLowerIntrinsicCall(const IntrinsicInst *II) override;
#include "X86GenFastISel.inc"
private:
bool X86FastEmitCompare(const Value *LHS, const Value *RHS, EVT VT, DebugLoc DL);
bool X86FastEmitLoad(EVT VT, X86AddressMode &AM, MachineMemOperand *MMO,
unsigned &ResultReg, unsigned Alignment = 1);
bool X86FastEmitStore(EVT VT, const Value *Val, X86AddressMode &AM,
MachineMemOperand *MMO = nullptr, bool Aligned = false);
bool X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill,
X86AddressMode &AM,
MachineMemOperand *MMO = nullptr, bool Aligned = false);
bool X86FastEmitExtend(ISD::NodeType Opc, EVT DstVT, unsigned Src, EVT SrcVT,
unsigned &ResultReg);
bool X86SelectAddress(const Value *V, X86AddressMode &AM);
bool X86SelectCallAddress(const Value *V, X86AddressMode &AM);
bool X86SelectLoad(const Instruction *I);
bool X86SelectStore(const Instruction *I);
bool X86SelectRet(const Instruction *I);
bool X86SelectCmp(const Instruction *I);
bool X86SelectZExt(const Instruction *I);
bool X86SelectBranch(const Instruction *I);
bool X86SelectShift(const Instruction *I);
bool X86SelectDivRem(const Instruction *I);
bool X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I);
bool X86FastEmitSSESelect(MVT RetVT, const Instruction *I);
bool X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I);
bool X86SelectSelect(const Instruction *I);
bool X86SelectTrunc(const Instruction *I);
bool X86SelectFPExtOrFPTrunc(const Instruction *I, unsigned Opc,
const TargetRegisterClass *RC);
bool X86SelectFPExt(const Instruction *I);
bool X86SelectFPTrunc(const Instruction *I);
bool X86SelectSIToFP(const Instruction *I);
const X86InstrInfo *getInstrInfo() const {
return Subtarget->getInstrInfo();
}
const X86TargetMachine *getTargetMachine() const {
return static_cast<const X86TargetMachine *>(&TM);
}
bool handleConstantAddresses(const Value *V, X86AddressMode &AM);
unsigned X86MaterializeInt(const ConstantInt *CI, MVT VT);
unsigned X86MaterializeFP(const ConstantFP *CFP, MVT VT);
unsigned X86MaterializeGV(const GlobalValue *GV, MVT VT);
unsigned fastMaterializeConstant(const Constant *C) override;
unsigned fastMaterializeAlloca(const AllocaInst *C) override;
unsigned fastMaterializeFloatZero(const ConstantFP *CF) override;
/// isScalarFPTypeInSSEReg - Return true if the specified scalar FP type is
/// computed in an SSE register, not on the X87 floating point stack.
bool isScalarFPTypeInSSEReg(EVT VT) const {
return (VT == MVT::f64 && X86ScalarSSEf64) || // f64 is when SSE2
(VT == MVT::f32 && X86ScalarSSEf32); // f32 is when SSE1
}
bool isTypeLegal(Type *Ty, MVT &VT, bool AllowI1 = false);
bool IsMemcpySmall(uint64_t Len);
bool TryEmitSmallMemcpy(X86AddressMode DestAM,
X86AddressMode SrcAM, uint64_t Len);
bool foldX86XALUIntrinsic(X86::CondCode &CC, const Instruction *I,
const Value *Cond);
const MachineInstrBuilder &addFullAddress(const MachineInstrBuilder &MIB,
X86AddressMode &AM);
};
} // end anonymous namespace.
static std::pair<X86::CondCode, bool>
getX86ConditionCode(CmpInst::Predicate Predicate) {
X86::CondCode CC = X86::COND_INVALID;
bool NeedSwap = false;
switch (Predicate) {
default: break;
// Floating-point Predicates
case CmpInst::FCMP_UEQ: CC = X86::COND_E; break;
case CmpInst::FCMP_OLT: NeedSwap = true; // fall-through
case CmpInst::FCMP_OGT: CC = X86::COND_A; break;
case CmpInst::FCMP_OLE: NeedSwap = true; // fall-through
case CmpInst::FCMP_OGE: CC = X86::COND_AE; break;
case CmpInst::FCMP_UGT: NeedSwap = true; // fall-through
case CmpInst::FCMP_ULT: CC = X86::COND_B; break;
case CmpInst::FCMP_UGE: NeedSwap = true; // fall-through
case CmpInst::FCMP_ULE: CC = X86::COND_BE; break;
case CmpInst::FCMP_ONE: CC = X86::COND_NE; break;
case CmpInst::FCMP_UNO: CC = X86::COND_P; break;
case CmpInst::FCMP_ORD: CC = X86::COND_NP; break;
case CmpInst::FCMP_OEQ: // fall-through
case CmpInst::FCMP_UNE: CC = X86::COND_INVALID; break;
// Integer Predicates
case CmpInst::ICMP_EQ: CC = X86::COND_E; break;
case CmpInst::ICMP_NE: CC = X86::COND_NE; break;
case CmpInst::ICMP_UGT: CC = X86::COND_A; break;
case CmpInst::ICMP_UGE: CC = X86::COND_AE; break;
case CmpInst::ICMP_ULT: CC = X86::COND_B; break;
case CmpInst::ICMP_ULE: CC = X86::COND_BE; break;
case CmpInst::ICMP_SGT: CC = X86::COND_G; break;
case CmpInst::ICMP_SGE: CC = X86::COND_GE; break;
case CmpInst::ICMP_SLT: CC = X86::COND_L; break;
case CmpInst::ICMP_SLE: CC = X86::COND_LE; break;
}
return std::make_pair(CC, NeedSwap);
}
static std::pair<unsigned, bool>
getX86SSEConditionCode(CmpInst::Predicate Predicate) {
unsigned CC;
bool NeedSwap = false;
// SSE Condition code mapping:
// 0 - EQ
// 1 - LT
// 2 - LE
// 3 - UNORD
// 4 - NEQ
// 5 - NLT
// 6 - NLE
// 7 - ORD
switch (Predicate) {
default: llvm_unreachable("Unexpected predicate");
case CmpInst::FCMP_OEQ: CC = 0; break;
case CmpInst::FCMP_OGT: NeedSwap = true; // fall-through
case CmpInst::FCMP_OLT: CC = 1; break;
case CmpInst::FCMP_OGE: NeedSwap = true; // fall-through
case CmpInst::FCMP_OLE: CC = 2; break;
case CmpInst::FCMP_UNO: CC = 3; break;
case CmpInst::FCMP_UNE: CC = 4; break;
case CmpInst::FCMP_ULE: NeedSwap = true; // fall-through
case CmpInst::FCMP_UGE: CC = 5; break;
case CmpInst::FCMP_ULT: NeedSwap = true; // fall-through
case CmpInst::FCMP_UGT: CC = 6; break;
case CmpInst::FCMP_ORD: CC = 7; break;
case CmpInst::FCMP_UEQ:
case CmpInst::FCMP_ONE: CC = 8; break;
}
return std::make_pair(CC, NeedSwap);
}
/// \brief Adds a complex addressing mode to the given machine instr builder.
/// Note, this will constrain the index register. If its not possible to
/// constrain the given index register, then a new one will be created. The
/// IndexReg field of the addressing mode will be updated to match in this case.
const MachineInstrBuilder &
X86FastISel::addFullAddress(const MachineInstrBuilder &MIB,
X86AddressMode &AM) {
// First constrain the index register. It needs to be a GR64_NOSP.
AM.IndexReg = constrainOperandRegClass(MIB->getDesc(), AM.IndexReg,
MIB->getNumOperands() +
X86::AddrIndexReg);
return ::addFullAddress(MIB, AM);
}
/// \brief Check if it is possible to fold the condition from the XALU intrinsic
/// into the user. The condition code will only be updated on success.
bool X86FastISel::foldX86XALUIntrinsic(X86::CondCode &CC, const Instruction *I,
const Value *Cond) {
if (!isa<ExtractValueInst>(Cond))
return false;
const auto *EV = cast<ExtractValueInst>(Cond);
if (!isa<IntrinsicInst>(EV->getAggregateOperand()))
return false;
const auto *II = cast<IntrinsicInst>(EV->getAggregateOperand());
MVT RetVT;
const Function *Callee = II->getCalledFunction();
Type *RetTy =
cast<StructType>(Callee->getReturnType())->getTypeAtIndex(0U);
if (!isTypeLegal(RetTy, RetVT))
return false;
if (RetVT != MVT::i32 && RetVT != MVT::i64)
return false;
X86::CondCode TmpCC;
switch (II->getIntrinsicID()) {
default: return false;
case Intrinsic::sadd_with_overflow:
case Intrinsic::ssub_with_overflow:
case Intrinsic::smul_with_overflow:
case Intrinsic::umul_with_overflow: TmpCC = X86::COND_O; break;
case Intrinsic::uadd_with_overflow:
case Intrinsic::usub_with_overflow: TmpCC = X86::COND_B; break;
}
// Check if both instructions are in the same basic block.
if (II->getParent() != I->getParent())
return false;
// Make sure nothing is in the way
BasicBlock::const_iterator Start = I;
BasicBlock::const_iterator End = II;
for (auto Itr = std::prev(Start); Itr != End; --Itr) {
// We only expect extractvalue instructions between the intrinsic and the
// instruction to be selected.
if (!isa<ExtractValueInst>(Itr))
return false;
// Check that the extractvalue operand comes from the intrinsic.
const auto *EVI = cast<ExtractValueInst>(Itr);
if (EVI->getAggregateOperand() != II)
return false;
}
CC = TmpCC;
return true;
}
bool X86FastISel::isTypeLegal(Type *Ty, MVT &VT, bool AllowI1) {
EVT evt = TLI.getValueType(DL, Ty, /*HandleUnknown=*/true);
if (evt == MVT::Other || !evt.isSimple())
// Unhandled type. Halt "fast" selection and bail.
return false;
VT = evt.getSimpleVT();
// For now, require SSE/SSE2 for performing floating-point operations,
// since x87 requires additional work.
if (VT == MVT::f64 && !X86ScalarSSEf64)
return false;
if (VT == MVT::f32 && !X86ScalarSSEf32)
return false;
// Similarly, no f80 support yet.
if (VT == MVT::f80)
return false;
// We only handle legal types. For example, on x86-32 the instruction
// selector contains all of the 64-bit instructions from x86-64,
// under the assumption that i64 won't be used if the target doesn't
// support it.
return (AllowI1 && VT == MVT::i1) || TLI.isTypeLegal(VT);
}
#include "X86GenCallingConv.inc"
/// X86FastEmitLoad - Emit a machine instruction to load a value of type VT.
/// The address is either pre-computed, i.e. Ptr, or a GlobalAddress, i.e. GV.
/// Return true and the result register by reference if it is possible.
bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM,
MachineMemOperand *MMO, unsigned &ResultReg,
unsigned Alignment) {
// Get opcode and regclass of the output for the given load instruction.
unsigned Opc = 0;
const TargetRegisterClass *RC = nullptr;
switch (VT.getSimpleVT().SimpleTy) {
default: return false;
case MVT::i1:
case MVT::i8:
Opc = X86::MOV8rm;
RC = &X86::GR8RegClass;
break;
case MVT::i16:
Opc = X86::MOV16rm;
RC = &X86::GR16RegClass;
break;
case MVT::i32:
Opc = X86::MOV32rm;
RC = &X86::GR32RegClass;
break;
case MVT::i64:
// Must be in x86-64 mode.
Opc = X86::MOV64rm;
RC = &X86::GR64RegClass;
break;
case MVT::f32:
if (X86ScalarSSEf32) {
Opc = Subtarget->hasAVX() ? X86::VMOVSSrm : X86::MOVSSrm;
RC = &X86::FR32RegClass;
} else {
Opc = X86::LD_Fp32m;
RC = &X86::RFP32RegClass;
}
break;
case MVT::f64:
if (X86ScalarSSEf64) {
Opc = Subtarget->hasAVX() ? X86::VMOVSDrm : X86::MOVSDrm;
RC = &X86::FR64RegClass;
} else {
Opc = X86::LD_Fp64m;
RC = &X86::RFP64RegClass;
}
break;
case MVT::f80:
// No f80 support yet.
return false;
case MVT::v4f32:
if (Alignment >= 16)
Opc = Subtarget->hasAVX() ? X86::VMOVAPSrm : X86::MOVAPSrm;
else
Opc = Subtarget->hasAVX() ? X86::VMOVUPSrm : X86::MOVUPSrm;
RC = &X86::VR128RegClass;
break;
case MVT::v2f64:
if (Alignment >= 16)
Opc = Subtarget->hasAVX() ? X86::VMOVAPDrm : X86::MOVAPDrm;
else
Opc = Subtarget->hasAVX() ? X86::VMOVUPDrm : X86::MOVUPDrm;
RC = &X86::VR128RegClass;
break;
case MVT::v4i32:
case MVT::v2i64:
case MVT::v8i16:
case MVT::v16i8:
if (Alignment >= 16)
Opc = Subtarget->hasAVX() ? X86::VMOVDQArm : X86::MOVDQArm;
else
Opc = Subtarget->hasAVX() ? X86::VMOVDQUrm : X86::MOVDQUrm;
RC = &X86::VR128RegClass;
break;
}
ResultReg = createResultReg(RC);
MachineInstrBuilder MIB =
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg);
addFullAddress(MIB, AM);
if (MMO)
MIB->addMemOperand(*FuncInfo.MF, MMO);
return true;
}
/// X86FastEmitStore - Emit a machine instruction to store a value Val of
/// type VT. The address is either pre-computed, consisted of a base ptr, Ptr
/// and a displacement offset, or a GlobalAddress,
/// i.e. V. Return true if it is possible.
bool X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill,
X86AddressMode &AM,
MachineMemOperand *MMO, bool Aligned) {
// Get opcode and regclass of the output for the given store instruction.
unsigned Opc = 0;
switch (VT.getSimpleVT().SimpleTy) {
case MVT::f80: // No f80 support yet.
default: return false;
case MVT::i1: {
// Mask out all but lowest bit.
unsigned AndResult = createResultReg(&X86::GR8RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(X86::AND8ri), AndResult)
.addReg(ValReg, getKillRegState(ValIsKill)).addImm(1);
ValReg = AndResult;
}
// FALLTHROUGH, handling i1 as i8.
case MVT::i8: Opc = X86::MOV8mr; break;
case MVT::i16: Opc = X86::MOV16mr; break;
case MVT::i32: Opc = X86::MOV32mr; break;
case MVT::i64: Opc = X86::MOV64mr; break; // Must be in x86-64 mode.
case MVT::f32:
Opc = X86ScalarSSEf32 ?
(Subtarget->hasAVX() ? X86::VMOVSSmr : X86::MOVSSmr) : X86::ST_Fp32m;
break;
case MVT::f64:
Opc = X86ScalarSSEf64 ?
(Subtarget->hasAVX() ? X86::VMOVSDmr : X86::MOVSDmr) : X86::ST_Fp64m;
break;
case MVT::v4f32:
if (Aligned)
Opc = Subtarget->hasAVX() ? X86::VMOVAPSmr : X86::MOVAPSmr;
else
Opc = Subtarget->hasAVX() ? X86::VMOVUPSmr : X86::MOVUPSmr;
break;
case MVT::v2f64:
if (Aligned)
Opc = Subtarget->hasAVX() ? X86::VMOVAPDmr : X86::MOVAPDmr;
else
Opc = Subtarget->hasAVX() ? X86::VMOVUPDmr : X86::MOVUPDmr;
break;
case MVT::v4i32:
case MVT::v2i64:
case MVT::v8i16:
case MVT::v16i8:
if (Aligned)
Opc = Subtarget->hasAVX() ? X86::VMOVDQAmr : X86::MOVDQAmr;
else
Opc = Subtarget->hasAVX() ? X86::VMOVDQUmr : X86::MOVDQUmr;
break;
}
MachineInstrBuilder MIB =
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc));
addFullAddress(MIB, AM).addReg(ValReg, getKillRegState(ValIsKill));
if (MMO)
MIB->addMemOperand(*FuncInfo.MF, MMO);
return true;
}
bool X86FastISel::X86FastEmitStore(EVT VT, const Value *Val,
X86AddressMode &AM,
MachineMemOperand *MMO, bool Aligned) {
// Handle 'null' like i32/i64 0.
if (isa<ConstantPointerNull>(Val))
Val = Constant::getNullValue(DL.getIntPtrType(Val->getContext()));
// If this is a store of a simple constant, fold the constant into the store.
if (const ConstantInt *CI = dyn_cast<ConstantInt>(Val)) {
unsigned Opc = 0;
bool Signed = true;
switch (VT.getSimpleVT().SimpleTy) {
default: break;
case MVT::i1: Signed = false; // FALLTHROUGH to handle as i8.
case MVT::i8: Opc = X86::MOV8mi; break;
case MVT::i16: Opc = X86::MOV16mi; break;
case MVT::i32: Opc = X86::MOV32mi; break;
case MVT::i64:
// Must be a 32-bit sign extended value.
if (isInt<32>(CI->getSExtValue()))
Opc = X86::MOV64mi32;
break;
}
if (Opc) {
MachineInstrBuilder MIB =
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc));
addFullAddress(MIB, AM).addImm(Signed ? (uint64_t) CI->getSExtValue()
: CI->getZExtValue());
if (MMO)
MIB->addMemOperand(*FuncInfo.MF, MMO);
return true;
}
}
unsigned ValReg = getRegForValue(Val);
if (ValReg == 0)
return false;
bool ValKill = hasTrivialKill(Val);
return X86FastEmitStore(VT, ValReg, ValKill, AM, MMO, Aligned);
}
/// X86FastEmitExtend - Emit a machine instruction to extend a value Src of
/// type SrcVT to type DstVT using the specified extension opcode Opc (e.g.
/// ISD::SIGN_EXTEND).
bool X86FastISel::X86FastEmitExtend(ISD::NodeType Opc, EVT DstVT,
unsigned Src, EVT SrcVT,
unsigned &ResultReg) {
unsigned RR = fastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(), Opc,
Src, /*TODO: Kill=*/false);
if (RR == 0)
return false;
ResultReg = RR;
return true;
}
bool X86FastISel::handleConstantAddresses(const Value *V, X86AddressMode &AM) {
// Handle constant address.
if (const GlobalValue *GV = dyn_cast<GlobalValue>(V)) {
// Can't handle alternate code models yet.
if (TM.getCodeModel() != CodeModel::Small)
return false;
// Can't handle TLS yet.
if (GV->isThreadLocal())
return false;
// RIP-relative addresses can't have additional register operands, so if
// we've already folded stuff into the addressing mode, just force the
// global value into its own register, which we can use as the basereg.
if (!Subtarget->isPICStyleRIPRel() ||
(AM.Base.Reg == 0 && AM.IndexReg == 0)) {
// Okay, we've committed to selecting this global. Set up the address.
AM.GV = GV;
// Allow the subtarget to classify the global.
unsigned char GVFlags = Subtarget->ClassifyGlobalReference(GV, TM);
// If this reference is relative to the pic base, set it now.
if (isGlobalRelativeToPICBase(GVFlags)) {
// FIXME: How do we know Base.Reg is free??
AM.Base.Reg = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF);
}
// Unless the ABI requires an extra load, return a direct reference to
// the global.
if (!isGlobalStubReference(GVFlags)) {
if (Subtarget->isPICStyleRIPRel()) {
// Use rip-relative addressing if we can. Above we verified that the
// base and index registers are unused.
assert(AM.Base.Reg == 0 && AM.IndexReg == 0);
AM.Base.Reg = X86::RIP;
}
AM.GVOpFlags = GVFlags;
return true;
}
// Ok, we need to do a load from a stub. If we've already loaded from
// this stub, reuse the loaded pointer, otherwise emit the load now.
DenseMap<const Value *, unsigned>::iterator I = LocalValueMap.find(V);
unsigned LoadReg;
if (I != LocalValueMap.end() && I->second != 0) {
LoadReg = I->second;
} else {
// Issue load from stub.
unsigned Opc = 0;
const TargetRegisterClass *RC = nullptr;
X86AddressMode StubAM;
StubAM.Base.Reg = AM.Base.Reg;
StubAM.GV = GV;
StubAM.GVOpFlags = GVFlags;
// Prepare for inserting code in the local-value area.
SavePoint SaveInsertPt = enterLocalValueArea();
if (TLI.getPointerTy(DL) == MVT::i64) {
Opc = X86::MOV64rm;
RC = &X86::GR64RegClass;
if (Subtarget->isPICStyleRIPRel())
StubAM.Base.Reg = X86::RIP;
} else {
Opc = X86::MOV32rm;
RC = &X86::GR32RegClass;
}
LoadReg = createResultReg(RC);
MachineInstrBuilder LoadMI =
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), LoadReg);
addFullAddress(LoadMI, StubAM);
// Ok, back to normal mode.
leaveLocalValueArea(SaveInsertPt);
// Prevent loading GV stub multiple times in same MBB.
LocalValueMap[V] = LoadReg;
}
// Now construct the final address. Note that the Disp, Scale,
// and Index values may already be set here.
AM.Base.Reg = LoadReg;
AM.GV = nullptr;
return true;
}
}
// If all else fails, try to materialize the value in a register.
if (!AM.GV || !Subtarget->isPICStyleRIPRel()) {
if (AM.Base.Reg == 0) {
AM.Base.Reg = getRegForValue(V);
return AM.Base.Reg != 0;
}
if (AM.IndexReg == 0) {
assert(AM.Scale == 1 && "Scale with no index!");
AM.IndexReg = getRegForValue(V);
return AM.IndexReg != 0;
}
}
return false;
}
/// X86SelectAddress - Attempt to fill in an address from the given value.
///
bool X86FastISel::X86SelectAddress(const Value *V, X86AddressMode &AM) {
SmallVector<const Value *, 32> GEPs;
redo_gep:
const User *U = nullptr;
unsigned Opcode = Instruction::UserOp1;
if (const Instruction *I = dyn_cast<Instruction>(V)) {
// Don't walk into other basic blocks; it's possible we haven't
// visited them yet, so the instructions may not yet be assigned
// virtual registers.
if (FuncInfo.StaticAllocaMap.count(static_cast<const AllocaInst *>(V)) ||
FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB) {
Opcode = I->getOpcode();
U = I;
}
} else if (const ConstantExpr *C = dyn_cast<ConstantExpr>(V)) {
Opcode = C->getOpcode();
U = C;
}
if (PointerType *Ty = dyn_cast<PointerType>(V->getType()))
if (Ty->getAddressSpace() > 255)
// Fast instruction selection doesn't support the special
// address spaces.
return false;
switch (Opcode) {
default: break;
case Instruction::BitCast:
// Look past bitcasts.
return X86SelectAddress(U->getOperand(0), AM);
case Instruction::IntToPtr:
// Look past no-op inttoptrs.
if (TLI.getValueType(DL, U->getOperand(0)->getType()) ==
TLI.getPointerTy(DL))
return X86SelectAddress(U->getOperand(0), AM);
break;
case Instruction::PtrToInt:
// Look past no-op ptrtoints.
if (TLI.getValueType(DL, U->getType()) == TLI.getPointerTy(DL))
return X86SelectAddress(U->getOperand(0), AM);
break;
case Instruction::Alloca: {
// Do static allocas.
const AllocaInst *A = cast<AllocaInst>(V);
DenseMap<const AllocaInst *, int>::iterator SI =
FuncInfo.StaticAllocaMap.find(A);
if (SI != FuncInfo.StaticAllocaMap.end()) {
AM.BaseType = X86AddressMode::FrameIndexBase;
AM.Base.FrameIndex = SI->second;
return true;
}
break;
}
case Instruction::Add: {
// Adds of constants are common and easy enough.
if (const ConstantInt *CI = dyn_cast<ConstantInt>(U->getOperand(1))) {
uint64_t Disp = (int32_t)AM.Disp + (uint64_t)CI->getSExtValue();
// They have to fit in the 32-bit signed displacement field though.
if (isInt<32>(Disp)) {
AM.Disp = (uint32_t)Disp;
return X86SelectAddress(U->getOperand(0), AM);
}
}
break;
}
case Instruction::GetElementPtr: {
X86AddressMode SavedAM = AM;
// Pattern-match simple GEPs.
uint64_t Disp = (int32_t)AM.Disp;
unsigned IndexReg = AM.IndexReg;
unsigned Scale = AM.Scale;
gep_type_iterator GTI = gep_type_begin(U);
// Iterate through the indices, folding what we can. Constants can be
// folded, and one dynamic index can be handled, if the scale is supported.
for (User::const_op_iterator i = U->op_begin() + 1, e = U->op_end();
i != e; ++i, ++GTI) {
const Value *Op = *i;
if (StructType *STy = dyn_cast<StructType>(*GTI)) {
const StructLayout *SL = DL.getStructLayout(STy);
Disp += SL->getElementOffset(cast<ConstantInt>(Op)->getZExtValue());
continue;
}
// A array/variable index is always of the form i*S where S is the
// constant scale size. See if we can push the scale into immediates.
uint64_t S = DL.getTypeAllocSize(GTI.getIndexedType());
for (;;) {
if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) {
// Constant-offset addressing.
Disp += CI->getSExtValue() * S;
break;
}
if (canFoldAddIntoGEP(U, Op)) {
// A compatible add with a constant operand. Fold the constant.
ConstantInt *CI =
cast<ConstantInt>(cast<AddOperator>(Op)->getOperand(1));
Disp += CI->getSExtValue() * S;
// Iterate on the other operand.
Op = cast<AddOperator>(Op)->getOperand(0);
continue;
}
if (IndexReg == 0 &&
(!AM.GV || !Subtarget->isPICStyleRIPRel()) &&
(S == 1 || S == 2 || S == 4 || S == 8)) {
// Scaled-index addressing.
Scale = S;
IndexReg = getRegForGEPIndex(Op).first;
if (IndexReg == 0)
return false;
break;
}
// Unsupported.
goto unsupported_gep;
}
}
// Check for displacement overflow.
if (!isInt<32>(Disp))
break;
AM.IndexReg = IndexReg;
AM.Scale = Scale;
AM.Disp = (uint32_t)Disp;
GEPs.push_back(V);
if (const GetElementPtrInst *GEP =
dyn_cast<GetElementPtrInst>(U->getOperand(0))) {
// Ok, the GEP indices were covered by constant-offset and scaled-index
// addressing. Update the address state and move on to examining the base.
V = GEP;
goto redo_gep;
} else if (X86SelectAddress(U->getOperand(0), AM)) {
return true;
}
// If we couldn't merge the gep value into this addr mode, revert back to
// our address and just match the value instead of completely failing.
AM = SavedAM;
for (SmallVectorImpl<const Value *>::reverse_iterator
I = GEPs.rbegin(), E = GEPs.rend(); I != E; ++I)
if (handleConstantAddresses(*I, AM))
return true;
return false;
unsupported_gep:
// Ok, the GEP indices weren't all covered.
break;
}
}
return handleConstantAddresses(V, AM);
}
/// X86SelectCallAddress - Attempt to fill in an address from the given value.
///
bool X86FastISel::X86SelectCallAddress(const Value *V, X86AddressMode &AM) {
const User *U = nullptr;
unsigned Opcode = Instruction::UserOp1;
const Instruction *I = dyn_cast<Instruction>(V);
// Record if the value is defined in the same basic block.
//
// This information is crucial to know whether or not folding an
// operand is valid.
// Indeed, FastISel generates or reuses a virtual register for all
// operands of all instructions it selects. Obviously, the definition and
// its uses must use the same virtual register otherwise the produced
// code is incorrect.
// Before instruction selection, FunctionLoweringInfo::set sets the virtual
// registers for values that are alive across basic blocks. This ensures
// that the values are consistently set between across basic block, even
// if different instruction selection mechanisms are used (e.g., a mix of
// SDISel and FastISel).
// For values local to a basic block, the instruction selection process
// generates these virtual registers with whatever method is appropriate
// for its needs. In particular, FastISel and SDISel do not share the way
// local virtual registers are set.
// Therefore, this is impossible (or at least unsafe) to share values
// between basic blocks unless they use the same instruction selection
// method, which is not guarantee for X86.
// Moreover, things like hasOneUse could not be used accurately, if we
// allow to reference values across basic blocks whereas they are not
// alive across basic blocks initially.
bool InMBB = true;
if (I) {
Opcode = I->getOpcode();
U = I;
InMBB = I->getParent() == FuncInfo.MBB->getBasicBlock();
} else if (const ConstantExpr *C = dyn_cast<ConstantExpr>(V)) {
Opcode = C->getOpcode();
U = C;
}
switch (Opcode) {
default: break;
case Instruction::BitCast:
// Look past bitcasts if its operand is in the same BB.
if (InMBB)
return X86SelectCallAddress(U->getOperand(0), AM);
break;
case Instruction::IntToPtr:
// Look past no-op inttoptrs if its operand is in the same BB.
if (InMBB &&
TLI.getValueType(DL, U->getOperand(0)->getType()) ==
TLI.getPointerTy(DL))
return X86SelectCallAddress(U->getOperand(0), AM);
break;
case Instruction::PtrToInt:
// Look past no-op ptrtoints if its operand is in the same BB.
if (InMBB && TLI.getValueType(DL, U->getType()) == TLI.getPointerTy(DL))
return X86SelectCallAddress(U->getOperand(0), AM);
break;
}
// Handle constant address.
if (const GlobalValue *GV = dyn_cast<GlobalValue>(V)) {
// Can't handle alternate code models yet.
if (TM.getCodeModel() != CodeModel::Small)
return false;
// RIP-relative addresses can't have additional register operands.
if (Subtarget->isPICStyleRIPRel() &&
(AM.Base.Reg != 0 || AM.IndexReg != 0))
return false;
// Can't handle DLL Import.
if (GV->hasDLLImportStorageClass())
return false;
// Can't handle TLS.
if (const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV))
if (GVar->isThreadLocal())
return false;
// Okay, we've committed to selecting this global. Set up the basic address.
AM.GV = GV;
// No ABI requires an extra load for anything other than DLLImport, which
// we rejected above. Return a direct reference to the global.
if (Subtarget->isPICStyleRIPRel()) {
// Use rip-relative addressing if we can. Above we verified that the
// base and index registers are unused.
assert(AM.Base.Reg == 0 && AM.IndexReg == 0);
AM.Base.Reg = X86::RIP;
} else if (Subtarget->isPICStyleStubPIC()) {
AM.GVOpFlags = X86II::MO_PIC_BASE_OFFSET;
} else if (Subtarget->isPICStyleGOT()) {
AM.GVOpFlags = X86II::MO_GOTOFF;
}
return true;
}
// If all else fails, try to materialize the value in a register.
if (!AM.GV || !Subtarget->isPICStyleRIPRel()) {
if (AM.Base.Reg == 0) {
AM.Base.Reg = getRegForValue(V);
return AM.Base.Reg != 0;
}
if (AM.IndexReg == 0) {
assert(AM.Scale == 1 && "Scale with no index!");
AM.IndexReg = getRegForValue(V);
return AM.IndexReg != 0;
}
}
return false;
}
/// X86SelectStore - Select and emit code to implement store instructions.
bool X86FastISel::X86SelectStore(const Instruction *I) {
// Atomic stores need special handling.
const StoreInst *S = cast<StoreInst>(I);
if (S->isAtomic())
return false;
const Value *Val = S->getValueOperand();
const Value *Ptr = S->getPointerOperand();
MVT VT;
if (!isTypeLegal(Val->getType(), VT, /*AllowI1=*/true))
return false;
unsigned Alignment = S->getAlignment();
unsigned ABIAlignment = DL.getABITypeAlignment(Val->getType());
if (Alignment == 0) // Ensure that codegen never sees alignment 0
Alignment = ABIAlignment;
bool Aligned = Alignment >= ABIAlignment;
X86AddressMode AM;
if (!X86SelectAddress(Ptr, AM))
return false;
return X86FastEmitStore(VT, Val, AM, createMachineMemOperandFor(I), Aligned);
}
/// X86SelectRet - Select and emit code to implement ret instructions.
bool X86FastISel::X86SelectRet(const Instruction *I) {
const ReturnInst *Ret = cast<ReturnInst>(I);
const Function &F = *I->getParent()->getParent();
const X86MachineFunctionInfo *X86MFInfo =
FuncInfo.MF->getInfo<X86MachineFunctionInfo>();
if (!FuncInfo.CanLowerReturn)
return false;
CallingConv::ID CC = F.getCallingConv();
if (CC != CallingConv::C &&
CC != CallingConv::Fast &&
CC != CallingConv::X86_FastCall &&
CC != CallingConv::X86_64_SysV)
return false;
if (Subtarget->isCallingConvWin64(CC))
return false;
// Don't handle popping bytes on return for now.
if (X86MFInfo->getBytesToPopOnReturn() != 0)
return false;
// fastcc with -tailcallopt is intended to provide a guaranteed
// tail call optimization. Fastisel doesn't know how to do that.
if (CC == CallingConv::Fast && TM.Options.GuaranteedTailCallOpt)
return false;
// Let SDISel handle vararg functions.
if (F.isVarArg())
return false;
// Build a list of return value registers.
SmallVector<unsigned, 4> RetRegs;
if (Ret->getNumOperands() > 0) {
SmallVector<ISD::OutputArg, 4> Outs;
GetReturnInfo(F.getReturnType(), F.getAttributes(), Outs, TLI, DL);
// Analyze operands of the call, assigning locations to each operand.
SmallVector<CCValAssign, 16> ValLocs;
CCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, ValLocs, I->getContext());
CCInfo.AnalyzeReturn(Outs, RetCC_X86);
const Value *RV = Ret->getOperand(0);
unsigned Reg = getRegForValue(RV);
if (Reg == 0)
return false;
// Only handle a single return value for now.
if (ValLocs.size() != 1)
return false;
CCValAssign &VA = ValLocs[0];
// Don't bother handling odd stuff for now.
if (VA.getLocInfo() != CCValAssign::Full)
return false;
// Only handle register returns for now.
if (!VA.isRegLoc())
return false;
// The calling-convention tables for x87 returns don't tell
// the whole story.
if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
return false;
unsigned SrcReg = Reg + VA.getValNo();
EVT SrcVT = TLI.getValueType(DL, RV->getType());
EVT DstVT = VA.getValVT();
// Special handling for extended integers.
if (SrcVT != DstVT) {
if (SrcVT != MVT::i1 && SrcVT != MVT::i8 && SrcVT != MVT::i16)
return false;
if (!Outs[0].Flags.isZExt() && !Outs[0].Flags.isSExt())
return false;
assert(DstVT == MVT::i32 && "X86 should always ext to i32");
if (SrcVT == MVT::i1) {
if (Outs[0].Flags.isSExt())
return false;
SrcReg = fastEmitZExtFromI1(MVT::i8, SrcReg, /*TODO: Kill=*/false);
SrcVT = MVT::i8;
}
unsigned Op = Outs[0].Flags.isZExt() ? ISD::ZERO_EXTEND :
ISD::SIGN_EXTEND;
SrcReg = fastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(), Op,
SrcReg, /*TODO: Kill=*/false);
}
// Make the copy.
unsigned DstReg = VA.getLocReg();
const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
// Avoid a cross-class copy. This is very unlikely.
if (!SrcRC->contains(DstReg))
return false;
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), DstReg).addReg(SrcReg);
// Add register to return instruction.
RetRegs.push_back(VA.getLocReg());
}
// The x86-64 ABI for returning structs by value requires that we copy
// the sret argument into %rax for the return. We saved the argument into
// a virtual register in the entry block, so now we copy the value out
// and into %rax. We also do the same with %eax for Win32.
if (F.hasStructRetAttr() &&
(Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC())) {
unsigned Reg = X86MFInfo->getSRetReturnReg();
assert(Reg &&
"SRetReturnReg should have been set in LowerFormalArguments()!");
unsigned RetReg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), RetReg).addReg(Reg);
RetRegs.push_back(RetReg);
}
// Now emit the RET.
MachineInstrBuilder MIB =
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(Subtarget->is64Bit() ? X86::RETQ : X86::RETL));
for (unsigned i = 0, e = RetRegs.size(); i != e; ++i)
MIB.addReg(RetRegs[i], RegState::Implicit);
return true;
}
/// X86SelectLoad - Select and emit code to implement load instructions.
///
bool X86FastISel::X86SelectLoad(const Instruction *I) {
const LoadInst *LI = cast<LoadInst>(I);
// Atomic loads need special handling.
if (LI->isAtomic())
return false;
MVT VT;
if (!isTypeLegal(LI->getType(), VT, /*AllowI1=*/true))
return false;
const Value *Ptr = LI->getPointerOperand();
X86AddressMode AM;
if (!X86SelectAddress(Ptr, AM))
return false;
unsigned Alignment = LI->getAlignment();
unsigned ABIAlignment = DL.getABITypeAlignment(LI->getType());
if (Alignment == 0) // Ensure that codegen never sees alignment 0
Alignment = ABIAlignment;
unsigned ResultReg = 0;
if (!X86FastEmitLoad(VT, AM, createMachineMemOperandFor(LI), ResultReg,
Alignment))
return false;
updateValueMap(I, ResultReg);
return true;
}
static unsigned X86ChooseCmpOpcode(EVT VT, const X86Subtarget *Subtarget) {
bool HasAVX = Subtarget->hasAVX();
bool X86ScalarSSEf32 = Subtarget->hasSSE1();
bool X86ScalarSSEf64 = Subtarget->hasSSE2();
switch (VT.getSimpleVT().SimpleTy) {
default: return 0;
case MVT::i8: return X86::CMP8rr;
case MVT::i16: return X86::CMP16rr;
case MVT::i32: return X86::CMP32rr;
case MVT::i64: return X86::CMP64rr;
case MVT::f32:
return X86ScalarSSEf32 ? (HasAVX ? X86::VUCOMISSrr : X86::UCOMISSrr) : 0;
case MVT::f64:
return X86ScalarSSEf64 ? (HasAVX ? X86::VUCOMISDrr : X86::UCOMISDrr) : 0;
}
}
/// If we have a comparison with RHS as the RHS of the comparison, return an
/// opcode that works for the compare (e.g. CMP32ri) otherwise return 0.
static unsigned X86ChooseCmpImmediateOpcode(EVT VT, const ConstantInt *RHSC) {
int64_t Val = RHSC->getSExtValue();
switch (VT.getSimpleVT().SimpleTy) {
// Otherwise, we can't fold the immediate into this comparison.
default:
return 0;
case MVT::i8:
return X86::CMP8ri;
case MVT::i16:
if (isInt<8>(Val))
return X86::CMP16ri8;
return X86::CMP16ri;
case MVT::i32:
if (isInt<8>(Val))
return X86::CMP32ri8;
return X86::CMP32ri;
case MVT::i64:
if (isInt<8>(Val))
return X86::CMP64ri8;
// 64-bit comparisons are only valid if the immediate fits in a 32-bit sext
// field.
if (isInt<32>(Val))
return X86::CMP64ri32;
return 0;
}
}
bool X86FastISel::X86FastEmitCompare(const Value *Op0, const Value *Op1,
EVT VT, DebugLoc CurDbgLoc) {
unsigned Op0Reg = getRegForValue(Op0);
if (Op0Reg == 0) return false;
// Handle 'null' like i32/i64 0.
if (isa<ConstantPointerNull>(Op1))
Op1 = Constant::getNullValue(DL.getIntPtrType(Op0->getContext()));
// We have two options: compare with register or immediate. If the RHS of
// the compare is an immediate that we can fold into this compare, use
// CMPri, otherwise use CMPrr.
if (const ConstantInt *Op1C = dyn_cast<ConstantInt>(Op1)) {
if (unsigned CompareImmOpc = X86ChooseCmpImmediateOpcode(VT, Op1C)) {
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, CurDbgLoc, TII.get(CompareImmOpc))
.addReg(Op0Reg)
.addImm(Op1C->getSExtValue());
return true;
}
}
unsigned CompareOpc = X86ChooseCmpOpcode(VT, Subtarget);
if (CompareOpc == 0) return false;
unsigned Op1Reg = getRegForValue(Op1);
if (Op1Reg == 0) return false;
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, CurDbgLoc, TII.get(CompareOpc))
.addReg(Op0Reg)
.addReg(Op1Reg);
return true;
}
bool X86FastISel::X86SelectCmp(const Instruction *I) {
const CmpInst *CI = cast<CmpInst>(I);
MVT VT;
if (!isTypeLegal(I->getOperand(0)->getType(), VT))
return false;
// Try to optimize or fold the cmp.
CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
unsigned ResultReg = 0;
switch (Predicate) {
default: break;
case CmpInst::FCMP_FALSE: {
ResultReg = createResultReg(&X86::GR32RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV32r0),
ResultReg);
ResultReg = fastEmitInst_extractsubreg(MVT::i8, ResultReg, /*Kill=*/true,
X86::sub_8bit);
if (!ResultReg)
return false;
break;
}
case CmpInst::FCMP_TRUE: {
ResultReg = createResultReg(&X86::GR8RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV8ri),
ResultReg).addImm(1);
break;
}
}
if (ResultReg) {
updateValueMap(I, ResultReg);
return true;
}
const Value *LHS = CI->getOperand(0);
const Value *RHS = CI->getOperand(1);
// The optimizer might have replaced fcmp oeq %x, %x with fcmp ord %x, 0.0.
// We don't have to materialize a zero constant for this case and can just use
// %x again on the RHS.
if (Predicate == CmpInst::FCMP_ORD || Predicate == CmpInst::FCMP_UNO) {
const auto *RHSC = dyn_cast<ConstantFP>(RHS);
if (RHSC && RHSC->isNullValue())
RHS = LHS;
}
// FCMP_OEQ and FCMP_UNE cannot be checked with a single instruction.
static unsigned SETFOpcTable[2][3] = {
{ X86::SETEr, X86::SETNPr, X86::AND8rr },
{ X86::SETNEr, X86::SETPr, X86::OR8rr }
};
unsigned *SETFOpc = nullptr;
switch (Predicate) {
default: break;
case CmpInst::FCMP_OEQ: SETFOpc = &SETFOpcTable[0][0]; break;
case CmpInst::FCMP_UNE: SETFOpc = &SETFOpcTable[1][0]; break;
}
ResultReg = createResultReg(&X86::GR8RegClass);
if (SETFOpc) {
if (!X86FastEmitCompare(LHS, RHS, VT, I->getDebugLoc()))
return false;
unsigned FlagReg1 = createResultReg(&X86::GR8RegClass);
unsigned FlagReg2 = createResultReg(&X86::GR8RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[0]),
FlagReg1);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[1]),
FlagReg2);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[2]),
ResultReg).addReg(FlagReg1).addReg(FlagReg2);
updateValueMap(I, ResultReg);
return true;
}
X86::CondCode CC;
bool SwapArgs;
std::tie(CC, SwapArgs) = getX86ConditionCode(Predicate);
assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code.");
unsigned Opc = X86::getSETFromCond(CC);
if (SwapArgs)
std::swap(LHS, RHS);
// Emit a compare of LHS/RHS.
if (!X86FastEmitCompare(LHS, RHS, VT, I->getDebugLoc()))
return false;
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg);
updateValueMap(I, ResultReg);
return true;
}
bool X86FastISel::X86SelectZExt(const Instruction *I) {
EVT DstVT = TLI.getValueType(DL, I->getType());
if (!TLI.isTypeLegal(DstVT))
return false;
unsigned ResultReg = getRegForValue(I->getOperand(0));
if (ResultReg == 0)
return false;
// Handle zero-extension from i1 to i8, which is common.
MVT SrcVT = TLI.getSimpleValueType(DL, I->getOperand(0)->getType());
if (SrcVT.SimpleTy == MVT::i1) {
// Set the high bits to zero.
ResultReg = fastEmitZExtFromI1(MVT::i8, ResultReg, /*TODO: Kill=*/false);
SrcVT = MVT::i8;
if (ResultReg == 0)
return false;
}
if (DstVT == MVT::i64) {
// Handle extension to 64-bits via sub-register shenanigans.
unsigned MovInst;
switch (SrcVT.SimpleTy) {
case MVT::i8: MovInst = X86::MOVZX32rr8; break;
case MVT::i16: MovInst = X86::MOVZX32rr16; break;
case MVT::i32: MovInst = X86::MOV32rr; break;
default: llvm_unreachable("Unexpected zext to i64 source type");
}
unsigned Result32 = createResultReg(&X86::GR32RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(MovInst), Result32)
.addReg(ResultReg);
ResultReg = createResultReg(&X86::GR64RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::SUBREG_TO_REG),
ResultReg)
.addImm(0).addReg(Result32).addImm(X86::sub_32bit);
} else if (DstVT != MVT::i8) {
ResultReg = fastEmit_r(MVT::i8, DstVT.getSimpleVT(), ISD::ZERO_EXTEND,
ResultReg, /*Kill=*/true);
if (ResultReg == 0)
return false;
}
updateValueMap(I, ResultReg);
return true;
}
bool X86FastISel::X86SelectBranch(const Instruction *I) {
// Unconditional branches are selected by tablegen-generated code.
// Handle a conditional branch.
const BranchInst *BI = cast<BranchInst>(I);
MachineBasicBlock *TrueMBB = FuncInfo.MBBMap[BI->getSuccessor(0)];
MachineBasicBlock *FalseMBB = FuncInfo.MBBMap[BI->getSuccessor(1)];
// Fold the common case of a conditional branch with a comparison
// in the same block (values defined on other blocks may not have
// initialized registers).
X86::CondCode CC;
if (const CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition())) {
if (CI->hasOneUse() && CI->getParent() == I->getParent()) {
EVT VT = TLI.getValueType(DL, CI->getOperand(0)->getType());
// Try to optimize or fold the cmp.
CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
switch (Predicate) {
default: break;
case CmpInst::FCMP_FALSE: fastEmitBranch(FalseMBB, DbgLoc); return true;
case CmpInst::FCMP_TRUE: fastEmitBranch(TrueMBB, DbgLoc); return true;
}
const Value *CmpLHS = CI->getOperand(0);
const Value *CmpRHS = CI->getOperand(1);
// The optimizer might have replaced fcmp oeq %x, %x with fcmp ord %x,
// 0.0.
// We don't have to materialize a zero constant for this case and can just
// use %x again on the RHS.
if (Predicate == CmpInst::FCMP_ORD || Predicate == CmpInst::FCMP_UNO) {
const auto *CmpRHSC = dyn_cast<ConstantFP>(CmpRHS);
if (CmpRHSC && CmpRHSC->isNullValue())
CmpRHS = CmpLHS;
}
// Try to take advantage of fallthrough opportunities.
if (FuncInfo.MBB->isLayoutSuccessor(TrueMBB)) {
std::swap(TrueMBB, FalseMBB);
Predicate = CmpInst::getInversePredicate(Predicate);
}
// FCMP_OEQ and FCMP_UNE cannot be expressed with a single flag/condition
// code check. Instead two branch instructions are required to check all
// the flags. First we change the predicate to a supported condition code,
// which will be the first branch. Later one we will emit the second
// branch.
bool NeedExtraBranch = false;
switch (Predicate) {
default: break;
case CmpInst::FCMP_OEQ:
std::swap(TrueMBB, FalseMBB); // fall-through
case CmpInst::FCMP_UNE:
NeedExtraBranch = true;
Predicate = CmpInst::FCMP_ONE;
break;
}
bool SwapArgs;
unsigned BranchOpc;
std::tie(CC, SwapArgs) = getX86ConditionCode(Predicate);
assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code.");
BranchOpc = X86::GetCondBranchFromCond(CC);
if (SwapArgs)
std::swap(CmpLHS, CmpRHS);
// Emit a compare of the LHS and RHS, setting the flags.
if (!X86FastEmitCompare(CmpLHS, CmpRHS, VT, CI->getDebugLoc()))
return false;
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(BranchOpc))
.addMBB(TrueMBB);
// X86 requires a second branch to handle UNE (and OEQ, which is mapped
// to UNE above).
if (NeedExtraBranch) {
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JP_1))
.addMBB(TrueMBB);
}
finishCondBranch(BI->getParent(), TrueMBB, FalseMBB);
return true;
}
} else if (TruncInst *TI = dyn_cast<TruncInst>(BI->getCondition())) {
// Handle things like "%cond = trunc i32 %X to i1 / br i1 %cond", which
// typically happen for _Bool and C++ bools.
MVT SourceVT;
if (TI->hasOneUse() && TI->getParent() == I->getParent() &&
isTypeLegal(TI->getOperand(0)->getType(), SourceVT)) {
unsigned TestOpc = 0;
switch (SourceVT.SimpleTy) {
default: break;
case MVT::i8: TestOpc = X86::TEST8ri; break;
case MVT::i16: TestOpc = X86::TEST16ri; break;
case MVT::i32: TestOpc = X86::TEST32ri; break;
case MVT::i64: TestOpc = X86::TEST64ri32; break;
}
if (TestOpc) {
unsigned OpReg = getRegForValue(TI->getOperand(0));
if (OpReg == 0) return false;
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TestOpc))
.addReg(OpReg).addImm(1);
unsigned JmpOpc = X86::JNE_1;
if (FuncInfo.MBB->isLayoutSuccessor(TrueMBB)) {
std::swap(TrueMBB, FalseMBB);
JmpOpc = X86::JE_1;
}
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(JmpOpc))
.addMBB(TrueMBB);
finishCondBranch(BI->getParent(), TrueMBB, FalseMBB);
return true;
}
}
} else if (foldX86XALUIntrinsic(CC, BI, BI->getCondition())) {
// Fake request the condition, otherwise the intrinsic might be completely
// optimized away.
unsigned TmpReg = getRegForValue(BI->getCondition());
if (TmpReg == 0)
return false;
unsigned BranchOpc = X86::GetCondBranchFromCond(CC);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(BranchOpc))
.addMBB(TrueMBB);
finishCondBranch(BI->getParent(), TrueMBB, FalseMBB);
return true;
}
// Otherwise do a clumsy setcc and re-test it.
// Note that i1 essentially gets ANY_EXTEND'ed to i8 where it isn't used
// in an explicit cast, so make sure to handle that correctly.
unsigned OpReg = getRegForValue(BI->getCondition());
if (OpReg == 0) return false;
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri))
.addReg(OpReg).addImm(1);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JNE_1))
.addMBB(TrueMBB);
finishCondBranch(BI->getParent(), TrueMBB, FalseMBB);
return true;
}
bool X86FastISel::X86SelectShift(const Instruction *I) {
unsigned CReg = 0, OpReg = 0;
const TargetRegisterClass *RC = nullptr;
if (I->getType()->isIntegerTy(8)) {
CReg = X86::CL;
RC = &X86::GR8RegClass;
switch (I->getOpcode()) {
case Instruction::LShr: OpReg = X86::SHR8rCL; break;
case Instruction::AShr: OpReg = X86::SAR8rCL; break;
case Instruction::Shl: OpReg = X86::SHL8rCL; break;
default: return false;
}
} else if (I->getType()->isIntegerTy(16)) {
CReg = X86::CX;
RC = &X86::GR16RegClass;
switch (I->getOpcode()) {
case Instruction::LShr: OpReg = X86::SHR16rCL; break;
case Instruction::AShr: OpReg = X86::SAR16rCL; break;
case Instruction::Shl: OpReg = X86::SHL16rCL; break;
default: return false;
}
} else if (I->getType()->isIntegerTy(32)) {
CReg = X86::ECX;
RC = &X86::GR32RegClass;
switch (I->getOpcode()) {
case Instruction::LShr: OpReg = X86::SHR32rCL; break;
case Instruction::AShr: OpReg = X86::SAR32rCL; break;
case Instruction::Shl: OpReg = X86::SHL32rCL; break;
default: return false;
}
} else if (I->getType()->isIntegerTy(64)) {
CReg = X86::RCX;
RC = &X86::GR64RegClass;
switch (I->getOpcode()) {
case Instruction::LShr: OpReg = X86::SHR64rCL; break;
case Instruction::AShr: OpReg = X86::SAR64rCL; break;
case Instruction::Shl: OpReg = X86::SHL64rCL; break;
default: return false;
}
} else {
return false;
}
MVT VT;
if (!isTypeLegal(I->getType(), VT))
return false;
unsigned Op0Reg = getRegForValue(I->getOperand(0));
if (Op0Reg == 0) return false;
unsigned Op1Reg = getRegForValue(I->getOperand(1));
if (Op1Reg == 0) return false;
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY),
CReg).addReg(Op1Reg);
// The shift instruction uses X86::CL. If we defined a super-register
// of X86::CL, emit a subreg KILL to precisely describe what we're doing here.
if (CReg != X86::CL)
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::KILL), X86::CL)
.addReg(CReg, RegState::Kill);
unsigned ResultReg = createResultReg(RC);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(OpReg), ResultReg)
.addReg(Op0Reg);
updateValueMap(I, ResultReg);
return true;
}
bool X86FastISel::X86SelectDivRem(const Instruction *I) {
const static unsigned NumTypes = 4; // i8, i16, i32, i64
const static unsigned NumOps = 4; // SDiv, SRem, UDiv, URem
const static bool S = true; // IsSigned
const static bool U = false; // !IsSigned
const static unsigned Copy = TargetOpcode::COPY;
// For the X86 DIV/IDIV instruction, in most cases the dividend
// (numerator) must be in a specific register pair highreg:lowreg,
// producing the quotient in lowreg and the remainder in highreg.
// For most data types, to set up the instruction, the dividend is
// copied into lowreg, and lowreg is sign-extended or zero-extended
// into highreg. The exception is i8, where the dividend is defined
// as a single register rather than a register pair, and we
// therefore directly sign-extend or zero-extend the dividend into
// lowreg, instead of copying, and ignore the highreg.
const static struct DivRemEntry {
// The following portion depends only on the data type.
const TargetRegisterClass *RC;
unsigned LowInReg; // low part of the register pair
unsigned HighInReg; // high part of the register pair
// The following portion depends on both the data type and the operation.
struct DivRemResult {
unsigned OpDivRem; // The specific DIV/IDIV opcode to use.
unsigned OpSignExtend; // Opcode for sign-extending lowreg into
// highreg, or copying a zero into highreg.
unsigned OpCopy; // Opcode for copying dividend into lowreg, or
// zero/sign-extending into lowreg for i8.
unsigned DivRemResultReg; // Register containing the desired result.
bool IsOpSigned; // Whether to use signed or unsigned form.
} ResultTable[NumOps];
} OpTable[NumTypes] = {
{ &X86::GR8RegClass, X86::AX, 0, {
{ X86::IDIV8r, 0, X86::MOVSX16rr8, X86::AL, S }, // SDiv
{ X86::IDIV8r, 0, X86::MOVSX16rr8, X86::AH, S }, // SRem
{ X86::DIV8r, 0, X86::MOVZX16rr8, X86::AL, U }, // UDiv
{ X86::DIV8r, 0, X86::MOVZX16rr8, X86::AH, U }, // URem
}
}, // i8
{ &X86::GR16RegClass, X86::AX, X86::DX, {
{ X86::IDIV16r, X86::CWD, Copy, X86::AX, S }, // SDiv
{ X86::IDIV16r, X86::CWD, Copy, X86::DX, S }, // SRem
{ X86::DIV16r, X86::MOV32r0, Copy, X86::AX, U }, // UDiv
{ X86::DIV16r, X86::MOV32r0, Copy, X86::DX, U }, // URem
}
}, // i16
{ &X86::GR32RegClass, X86::EAX, X86::EDX, {
{ X86::IDIV32r, X86::CDQ, Copy, X86::EAX, S }, // SDiv
{ X86::IDIV32r, X86::CDQ, Copy, X86::EDX, S }, // SRem
{ X86::DIV32r, X86::MOV32r0, Copy, X86::EAX, U }, // UDiv
{ X86::DIV32r, X86::MOV32r0, Copy, X86::EDX, U }, // URem
}
}, // i32
{ &X86::GR64RegClass, X86::RAX, X86::RDX, {
{ X86::IDIV64r, X86::CQO, Copy, X86::RAX, S }, // SDiv
{ X86::IDIV64r, X86::CQO, Copy, X86::RDX, S }, // SRem
{ X86::DIV64r, X86::MOV32r0, Copy, X86::RAX, U }, // UDiv
{ X86::DIV64r, X86::MOV32r0, Copy, X86::RDX, U }, // URem
}
}, // i64
};
MVT VT;
if (!isTypeLegal(I->getType(), VT))
return false;
unsigned TypeIndex, OpIndex;
switch (VT.SimpleTy) {
default: return false;
case MVT::i8: TypeIndex = 0; break;
case MVT::i16: TypeIndex = 1; break;
case MVT::i32: TypeIndex = 2; break;
case MVT::i64: TypeIndex = 3;
if (!Subtarget->is64Bit())
return false;
break;
}
switch (I->getOpcode()) {
default: llvm_unreachable("Unexpected div/rem opcode");
case Instruction::SDiv: OpIndex = 0; break;
case Instruction::SRem: OpIndex = 1; break;
case Instruction::UDiv: OpIndex = 2; break;
case Instruction::URem: OpIndex = 3; break;
}
const DivRemEntry &TypeEntry = OpTable[TypeIndex];
const DivRemEntry::DivRemResult &OpEntry = TypeEntry.ResultTable[OpIndex];
unsigned Op0Reg = getRegForValue(I->getOperand(0));
if (Op0Reg == 0)
return false;
unsigned Op1Reg = getRegForValue(I->getOperand(1));
if (Op1Reg == 0)
return false;
// Move op0 into low-order input register.
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(OpEntry.OpCopy), TypeEntry.LowInReg).addReg(Op0Reg);
// Zero-extend or sign-extend into high-order input register.
if (OpEntry.OpSignExtend) {
if (OpEntry.IsOpSigned)
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(OpEntry.OpSignExtend));
else {
unsigned Zero32 = createResultReg(&X86::GR32RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(X86::MOV32r0), Zero32);
// Copy the zero into the appropriate sub/super/identical physical
// register. Unfortunately the operations needed are not uniform enough
// to fit neatly into the table above.
if (VT.SimpleTy == MVT::i16) {
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(Copy), TypeEntry.HighInReg)
.addReg(Zero32, 0, X86::sub_16bit);
} else if (VT.SimpleTy == MVT::i32) {
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(Copy), TypeEntry.HighInReg)
.addReg(Zero32);
} else if (VT.SimpleTy == MVT::i64) {
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::SUBREG_TO_REG), TypeEntry.HighInReg)
.addImm(0).addReg(Zero32).addImm(X86::sub_32bit);
}
}
}
// Generate the DIV/IDIV instruction.
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(OpEntry.OpDivRem)).addReg(Op1Reg);
// For i8 remainder, we can't reference AH directly, as we'll end
// up with bogus copies like %R9B = COPY %AH. Reference AX
// instead to prevent AH references in a REX instruction.
//
// The current assumption of the fast register allocator is that isel
// won't generate explicit references to the GPR8_NOREX registers. If
// the allocator and/or the backend get enhanced to be more robust in
// that regard, this can be, and should be, removed.
unsigned ResultReg = 0;
if ((I->getOpcode() == Instruction::SRem ||
I->getOpcode() == Instruction::URem) &&
OpEntry.DivRemResultReg == X86::AH && Subtarget->is64Bit()) {
unsigned SourceSuperReg = createResultReg(&X86::GR16RegClass);
unsigned ResultSuperReg = createResultReg(&X86::GR16RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(Copy), SourceSuperReg).addReg(X86::AX);
// Shift AX right by 8 bits instead of using AH.
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::SHR16ri),
ResultSuperReg).addReg(SourceSuperReg).addImm(8);
// Now reference the 8-bit subreg of the result.
ResultReg = fastEmitInst_extractsubreg(MVT::i8, ResultSuperReg,
/*Kill=*/true, X86::sub_8bit);
}
// Copy the result out of the physreg if we haven't already.
if (!ResultReg) {
ResultReg = createResultReg(TypeEntry.RC);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Copy), ResultReg)
.addReg(OpEntry.DivRemResultReg);
}
updateValueMap(I, ResultReg);
return true;
}
/// \brief Emit a conditional move instruction (if the are supported) to lower
/// the select.
bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) {
// Check if the subtarget supports these instructions.
if (!Subtarget->hasCMov())
return false;
// FIXME: Add support for i8.
if (RetVT < MVT::i16 || RetVT > MVT::i64)
return false;
const Value *Cond = I->getOperand(0);
const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT);
bool NeedTest = true;
X86::CondCode CC = X86::COND_NE;
// Optimize conditions coming from a compare if both instructions are in the
// same basic block (values defined in other basic blocks may not have
// initialized registers).
const auto *CI = dyn_cast<CmpInst>(Cond);
if (CI && (CI->getParent() == I->getParent())) {
CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
// FCMP_OEQ and FCMP_UNE cannot be checked with a single instruction.
static unsigned SETFOpcTable[2][3] = {
{ X86::SETNPr, X86::SETEr , X86::TEST8rr },
{ X86::SETPr, X86::SETNEr, X86::OR8rr }
};
unsigned *SETFOpc = nullptr;
switch (Predicate) {
default: break;
case CmpInst::FCMP_OEQ:
SETFOpc = &SETFOpcTable[0][0];
Predicate = CmpInst::ICMP_NE;
break;
case CmpInst::FCMP_UNE:
SETFOpc = &SETFOpcTable[1][0];
Predicate = CmpInst::ICMP_NE;
break;
}
bool NeedSwap;
std::tie(CC, NeedSwap) = getX86ConditionCode(Predicate);
assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code.");
const Value *CmpLHS = CI->getOperand(0);
const Value *CmpRHS = CI->getOperand(1);
if (NeedSwap)
std::swap(CmpLHS, CmpRHS);
EVT CmpVT = TLI.getValueType(DL, CmpLHS->getType());
// Emit a compare of the LHS and RHS, setting the flags.
if (!X86FastEmitCompare(CmpLHS, CmpRHS, CmpVT, CI->getDebugLoc()))
return false;
if (SETFOpc) {
unsigned FlagReg1 = createResultReg(&X86::GR8RegClass);
unsigned FlagReg2 = createResultReg(&X86::GR8RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[0]),
FlagReg1);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[1]),
FlagReg2);
auto const &II = TII.get(SETFOpc[2]);
if (II.getNumDefs()) {
unsigned TmpReg = createResultReg(&X86::GR8RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, TmpReg)
.addReg(FlagReg2).addReg(FlagReg1);
} else {
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II)
.addReg(FlagReg2).addReg(FlagReg1);
}
}
NeedTest = false;
} else if (foldX86XALUIntrinsic(CC, I, Cond)) {
// Fake request the condition, otherwise the intrinsic might be completely
// optimized away.
unsigned TmpReg = getRegForValue(Cond);
if (TmpReg == 0)
return false;
NeedTest = false;
}
if (NeedTest) {
// Selects operate on i1, however, CondReg is 8 bits width and may contain
// garbage. Indeed, only the less significant bit is supposed to be
// accurate. If we read more than the lsb, we may see non-zero values
// whereas lsb is zero. Therefore, we have to truncate Op0Reg to i1 for
// the select. This is achieved by performing TEST against 1.
unsigned CondReg = getRegForValue(Cond);
if (CondReg == 0)
return false;
bool CondIsKill = hasTrivialKill(Cond);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri))
.addReg(CondReg, getKillRegState(CondIsKill)).addImm(1);
}
const Value *LHS = I->getOperand(1);
const Value *RHS = I->getOperand(2);
unsigned RHSReg = getRegForValue(RHS);
bool RHSIsKill = hasTrivialKill(RHS);
unsigned LHSReg = getRegForValue(LHS);
bool LHSIsKill = hasTrivialKill(LHS);
if (!LHSReg || !RHSReg)
return false;
unsigned Opc = X86::getCMovFromCond(CC, RC->getSize());
unsigned ResultReg = fastEmitInst_rr(Opc, RC, RHSReg, RHSIsKill,
LHSReg, LHSIsKill);
updateValueMap(I, ResultReg);
return true;
}
/// \brief Emit SSE or AVX instructions to lower the select.
///
/// Try to use SSE1/SSE2 instructions to simulate a select without branches.
/// This lowers fp selects into a CMP/AND/ANDN/OR sequence when the necessary
/// SSE instructions are available. If AVX is available, try to use a VBLENDV.
bool X86FastISel::X86FastEmitSSESelect(MVT RetVT, const Instruction *I) {
// Optimize conditions coming from a compare if both instructions are in the
// same basic block (values defined in other basic blocks may not have
// initialized registers).
const auto *CI = dyn_cast<FCmpInst>(I->getOperand(0));
if (!CI || (CI->getParent() != I->getParent()))
return false;
if (I->getType() != CI->getOperand(0)->getType() ||
!((Subtarget->hasSSE1() && RetVT == MVT::f32) ||
(Subtarget->hasSSE2() && RetVT == MVT::f64)))
return false;
const Value *CmpLHS = CI->getOperand(0);
const Value *CmpRHS = CI->getOperand(1);
CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
// The optimizer might have replaced fcmp oeq %x, %x with fcmp ord %x, 0.0.
// We don't have to materialize a zero constant for this case and can just use
// %x again on the RHS.
if (Predicate == CmpInst::FCMP_ORD || Predicate == CmpInst::FCMP_UNO) {
const auto *CmpRHSC = dyn_cast<ConstantFP>(CmpRHS);
if (CmpRHSC && CmpRHSC->isNullValue())
CmpRHS = CmpLHS;
}
unsigned CC;
bool NeedSwap;
std::tie(CC, NeedSwap) = getX86SSEConditionCode(Predicate);
if (CC > 7)
return false;
if (NeedSwap)
std::swap(CmpLHS, CmpRHS);
// Choose the SSE instruction sequence based on data type (float or double).
static unsigned OpcTable[2][4] = {
{ X86::CMPSSrr, X86::FsANDPSrr, X86::FsANDNPSrr, X86::FsORPSrr },
{ X86::CMPSDrr, X86::FsANDPDrr, X86::FsANDNPDrr, X86::FsORPDrr }
};
unsigned *Opc = nullptr;
switch (RetVT.SimpleTy) {
default: return false;
case MVT::f32: Opc = &OpcTable[0][0]; break;
case MVT::f64: Opc = &OpcTable[1][0]; break;
}
const Value *LHS = I->getOperand(1);
const Value *RHS = I->getOperand(2);
unsigned LHSReg = getRegForValue(LHS);
bool LHSIsKill = hasTrivialKill(LHS);
unsigned RHSReg = getRegForValue(RHS);
bool RHSIsKill = hasTrivialKill(RHS);
unsigned CmpLHSReg = getRegForValue(CmpLHS);
bool CmpLHSIsKill = hasTrivialKill(CmpLHS);
unsigned CmpRHSReg = getRegForValue(CmpRHS);
bool CmpRHSIsKill = hasTrivialKill(CmpRHS);
if (!LHSReg || !RHSReg || !CmpLHS || !CmpRHS)
return false;
const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT);
unsigned ResultReg;
if (Subtarget->hasAVX()) {
const TargetRegisterClass *FR32 = &X86::FR32RegClass;
const TargetRegisterClass *VR128 = &X86::VR128RegClass;
// If we have AVX, create 1 blendv instead of 3 logic instructions.
// Blendv was introduced with SSE 4.1, but the 2 register form implicitly
// uses XMM0 as the selection register. That may need just as many
// instructions as the AND/ANDN/OR sequence due to register moves, so
// don't bother.
unsigned CmpOpcode =
(RetVT.SimpleTy == MVT::f32) ? X86::VCMPSSrr : X86::VCMPSDrr;
unsigned BlendOpcode =
(RetVT.SimpleTy == MVT::f32) ? X86::VBLENDVPSrr : X86::VBLENDVPDrr;
unsigned CmpReg = fastEmitInst_rri(CmpOpcode, FR32, CmpLHSReg, CmpLHSIsKill,
CmpRHSReg, CmpRHSIsKill, CC);
unsigned VBlendReg = fastEmitInst_rrr(BlendOpcode, VR128, RHSReg, RHSIsKill,
LHSReg, LHSIsKill, CmpReg, true);
ResultReg = createResultReg(RC);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), ResultReg).addReg(VBlendReg);
} else {
unsigned CmpReg = fastEmitInst_rri(Opc[0], RC, CmpLHSReg, CmpLHSIsKill,
CmpRHSReg, CmpRHSIsKill, CC);
unsigned AndReg = fastEmitInst_rr(Opc[1], RC, CmpReg, /*IsKill=*/false,
LHSReg, LHSIsKill);
unsigned AndNReg = fastEmitInst_rr(Opc[2], RC, CmpReg, /*IsKill=*/true,
RHSReg, RHSIsKill);
ResultReg = fastEmitInst_rr(Opc[3], RC, AndNReg, /*IsKill=*/true,
AndReg, /*IsKill=*/true);
}
updateValueMap(I, ResultReg);
return true;
}
bool X86FastISel::X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I) {
// These are pseudo CMOV instructions and will be later expanded into control-
// flow.
unsigned Opc;
switch (RetVT.SimpleTy) {
default: return false;
case MVT::i8: Opc = X86::CMOV_GR8; break;
case MVT::i16: Opc = X86::CMOV_GR16; break;
case MVT::i32: Opc = X86::CMOV_GR32; break;
case MVT::f32: Opc = X86::CMOV_FR32; break;
case MVT::f64: Opc = X86::CMOV_FR64; break;
}
const Value *Cond = I->getOperand(0);
X86::CondCode CC = X86::COND_NE;
// Optimize conditions coming from a compare if both instructions are in the
// same basic block (values defined in other basic blocks may not have
// initialized registers).
const auto *CI = dyn_cast<CmpInst>(Cond);
if (CI && (CI->getParent() == I->getParent())) {
bool NeedSwap;
std::tie(CC, NeedSwap) = getX86ConditionCode(CI->getPredicate());
if (CC > X86::LAST_VALID_COND)
return false;
const Value *CmpLHS = CI->getOperand(0);
const Value *CmpRHS = CI->getOperand(1);
if (NeedSwap)
std::swap(CmpLHS, CmpRHS);
EVT CmpVT = TLI.getValueType(DL, CmpLHS->getType());
if (!X86FastEmitCompare(CmpLHS, CmpRHS, CmpVT, CI->getDebugLoc()))
return false;
} else {
unsigned CondReg = getRegForValue(Cond);
if (CondReg == 0)
return false;
bool CondIsKill = hasTrivialKill(Cond);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri))
.addReg(CondReg, getKillRegState(CondIsKill)).addImm(1);
}
const Value *LHS = I->getOperand(1);
const Value *RHS = I->getOperand(2);
unsigned LHSReg = getRegForValue(LHS);
bool LHSIsKill = hasTrivialKill(LHS);
unsigned RHSReg = getRegForValue(RHS);
bool RHSIsKill = hasTrivialKill(RHS);
if (!LHSReg || !RHSReg)
return false;
const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT);
unsigned ResultReg =
fastEmitInst_rri(Opc, RC, RHSReg, RHSIsKill, LHSReg, LHSIsKill, CC);
updateValueMap(I, ResultReg);
return true;
}
bool X86FastISel::X86SelectSelect(const Instruction *I) {
MVT RetVT;
if (!isTypeLegal(I->getType(), RetVT))
return false;
// Check if we can fold the select.
if (const auto *CI = dyn_cast<CmpInst>(I->getOperand(0))) {
CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
const Value *Opnd = nullptr;
switch (Predicate) {
default: break;
case CmpInst::FCMP_FALSE: Opnd = I->getOperand(2); break;
case CmpInst::FCMP_TRUE: Opnd = I->getOperand(1); break;
}
// No need for a select anymore - this is an unconditional move.
if (Opnd) {
unsigned OpReg = getRegForValue(Opnd);
if (OpReg == 0)
return false;
bool OpIsKill = hasTrivialKill(Opnd);
const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT);
unsigned ResultReg = createResultReg(RC);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), ResultReg)
.addReg(OpReg, getKillRegState(OpIsKill));
updateValueMap(I, ResultReg);
return true;
}
}
// First try to use real conditional move instructions.
if (X86FastEmitCMoveSelect(RetVT, I))
return true;
// Try to use a sequence of SSE instructions to simulate a conditional move.
if (X86FastEmitSSESelect(RetVT, I))
return true;
// Fall-back to pseudo conditional move instructions, which will be later
// converted to control-flow.
if (X86FastEmitPseudoSelect(RetVT, I))
return true;
return false;
}
bool X86FastISel::X86SelectSIToFP(const Instruction *I) {
// The target-independent selection algorithm in FastISel already knows how
// to select a SINT_TO_FP if the target is SSE but not AVX.
// Early exit if the subtarget doesn't have AVX.
if (!Subtarget->hasAVX())
return false;
if (!I->getOperand(0)->getType()->isIntegerTy(32))
return false;
// Select integer to float/double conversion.
unsigned OpReg = getRegForValue(I->getOperand(0));
if (OpReg == 0)
return false;
const TargetRegisterClass *RC = nullptr;
unsigned Opcode;
if (I->getType()->isDoubleTy()) {
// sitofp int -> double
Opcode = X86::VCVTSI2SDrr;
RC = &X86::FR64RegClass;
} else if (I->getType()->isFloatTy()) {
// sitofp int -> float
Opcode = X86::VCVTSI2SSrr;
RC = &X86::FR32RegClass;
} else
return false;
unsigned ImplicitDefReg = createResultReg(RC);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::IMPLICIT_DEF), ImplicitDefReg);
unsigned ResultReg =
fastEmitInst_rr(Opcode, RC, ImplicitDefReg, true, OpReg, false);
updateValueMap(I, ResultReg);
return true;
}
// Helper method used by X86SelectFPExt and X86SelectFPTrunc.
bool X86FastISel::X86SelectFPExtOrFPTrunc(const Instruction *I,
unsigned TargetOpc,
const TargetRegisterClass *RC) {
assert((I->getOpcode() == Instruction::FPExt ||
I->getOpcode() == Instruction::FPTrunc) &&
"Instruction must be an FPExt or FPTrunc!");
unsigned OpReg = getRegForValue(I->getOperand(0));
if (OpReg == 0)
return false;
unsigned ResultReg = createResultReg(RC);
MachineInstrBuilder MIB;
MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpc),
ResultReg);
if (Subtarget->hasAVX())
MIB.addReg(OpReg);
MIB.addReg(OpReg);
updateValueMap(I, ResultReg);
return true;
}
bool X86FastISel::X86SelectFPExt(const Instruction *I) {
if (X86ScalarSSEf64 && I->getType()->isDoubleTy() &&
I->getOperand(0)->getType()->isFloatTy()) {
// fpext from float to double.
unsigned Opc = Subtarget->hasAVX() ? X86::VCVTSS2SDrr : X86::CVTSS2SDrr;
return X86SelectFPExtOrFPTrunc(I, Opc, &X86::FR64RegClass);
}
return false;
}
bool X86FastISel::X86SelectFPTrunc(const Instruction *I) {
if (X86ScalarSSEf64 && I->getType()->isFloatTy() &&
I->getOperand(0)->getType()->isDoubleTy()) {
// fptrunc from double to float.
unsigned Opc = Subtarget->hasAVX() ? X86::VCVTSD2SSrr : X86::CVTSD2SSrr;
return X86SelectFPExtOrFPTrunc(I, Opc, &X86::FR32RegClass);
}
return false;
}
bool X86FastISel::X86SelectTrunc(const Instruction *I) {
EVT SrcVT = TLI.getValueType(DL, I->getOperand(0)->getType());
EVT DstVT = TLI.getValueType(DL, I->getType());
// This code only handles truncation to byte.
if (DstVT != MVT::i8 && DstVT != MVT::i1)
return false;
if (!TLI.isTypeLegal(SrcVT))
return false;
unsigned InputReg = getRegForValue(I->getOperand(0));
if (!InputReg)
// Unhandled operand. Halt "fast" selection and bail.
return false;
if (SrcVT == MVT::i8) {
// Truncate from i8 to i1; no code needed.
updateValueMap(I, InputReg);
return true;
}
bool KillInputReg = false;
if (!Subtarget->is64Bit()) {
// If we're on x86-32; we can't extract an i8 from a general register.
// First issue a copy to GR16_ABCD or GR32_ABCD.
const TargetRegisterClass *CopyRC =
(SrcVT == MVT::i16) ? &X86::GR16_ABCDRegClass : &X86::GR32_ABCDRegClass;
unsigned CopyReg = createResultReg(CopyRC);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), CopyReg).addReg(InputReg);
InputReg = CopyReg;
KillInputReg = true;
}
// Issue an extract_subreg.
unsigned ResultReg = fastEmitInst_extractsubreg(MVT::i8,
InputReg, KillInputReg,
X86::sub_8bit);
if (!ResultReg)
return false;
updateValueMap(I, ResultReg);
return true;
}
bool X86FastISel::IsMemcpySmall(uint64_t Len) {
return Len <= (Subtarget->is64Bit() ? 32 : 16);
}
bool X86FastISel::TryEmitSmallMemcpy(X86AddressMode DestAM,
X86AddressMode SrcAM, uint64_t Len) {
// Make sure we don't bloat code by inlining very large memcpy's.
if (!IsMemcpySmall(Len))
return false;
bool i64Legal = Subtarget->is64Bit();
// We don't care about alignment here since we just emit integer accesses.
while (Len) {
MVT VT;
if (Len >= 8 && i64Legal)
VT = MVT::i64;
else if (Len >= 4)
VT = MVT::i32;
else if (Len >= 2)
VT = MVT::i16;
else
VT = MVT::i8;
unsigned Reg;
bool RV = X86FastEmitLoad(VT, SrcAM, nullptr, Reg);
RV &= X86FastEmitStore(VT, Reg, /*Kill=*/true, DestAM);
assert(RV && "Failed to emit load or store??");
unsigned Size = VT.getSizeInBits()/8;
Len -= Size;
DestAM.Disp += Size;
SrcAM.Disp += Size;
}
return true;
}
bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
// FIXME: Handle more intrinsics.
switch (II->getIntrinsicID()) {
default: return false;
case Intrinsic::convert_from_fp16:
case Intrinsic::convert_to_fp16: {
if (Subtarget->useSoftFloat() || !Subtarget->hasF16C())
return false;
const Value *Op = II->getArgOperand(0);
unsigned InputReg = getRegForValue(Op);
if (InputReg == 0)
return false;
// F16C only allows converting from float to half and from half to float.
bool IsFloatToHalf = II->getIntrinsicID() == Intrinsic::convert_to_fp16;
if (IsFloatToHalf) {
if (!Op->getType()->isFloatTy())
return false;
} else {
if (!II->getType()->isFloatTy())
return false;
}
unsigned ResultReg = 0;
const TargetRegisterClass *RC = TLI.getRegClassFor(MVT::v8i16);
if (IsFloatToHalf) {
// 'InputReg' is implicitly promoted from register class FR32 to
// register class VR128 by method 'constrainOperandRegClass' which is
// directly called by 'fastEmitInst_ri'.
// Instruction VCVTPS2PHrr takes an extra immediate operand which is
// used to provide rounding control.
InputReg = fastEmitInst_ri(X86::VCVTPS2PHrr, RC, InputReg, false, 0);
// Move the lower 32-bits of ResultReg to another register of class GR32.
ResultReg = createResultReg(&X86::GR32RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(X86::VMOVPDI2DIrr), ResultReg)
.addReg(InputReg, RegState::Kill);
// The result value is in the lower 16-bits of ResultReg.
unsigned RegIdx = X86::sub_16bit;
ResultReg = fastEmitInst_extractsubreg(MVT::i16, ResultReg, true, RegIdx);
} else {
assert(Op->getType()->isIntegerTy(16) && "Expected a 16-bit integer!");
// Explicitly sign-extend the input to 32-bit.
InputReg = fastEmit_r(MVT::i16, MVT::i32, ISD::SIGN_EXTEND, InputReg,
/*Kill=*/false);
// The following SCALAR_TO_VECTOR will be expanded into a VMOVDI2PDIrr.
InputReg = fastEmit_r(MVT::i32, MVT::v4i32, ISD::SCALAR_TO_VECTOR,
InputReg, /*Kill=*/true);
InputReg = fastEmitInst_r(X86::VCVTPH2PSrr, RC, InputReg, /*Kill=*/true);
// The result value is in the lower 32-bits of ResultReg.
// Emit an explicit copy from register class VR128 to register class FR32.
ResultReg = createResultReg(&X86::FR32RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), ResultReg)
.addReg(InputReg, RegState::Kill);
}
updateValueMap(II, ResultReg);
return true;
}
case Intrinsic::frameaddress: {
MachineFunction *MF = FuncInfo.MF;
if (MF->getTarget().getMCAsmInfo()->usesWindowsCFI())
return false;
Type *RetTy = II->getCalledFunction()->getReturnType();
MVT VT;
if (!isTypeLegal(RetTy, VT))
return false;
unsigned Opc;
const TargetRegisterClass *RC = nullptr;
switch (VT.SimpleTy) {
default: llvm_unreachable("Invalid result type for frameaddress.");
case MVT::i32: Opc = X86::MOV32rm; RC = &X86::GR32RegClass; break;
case MVT::i64: Opc = X86::MOV64rm; RC = &X86::GR64RegClass; break;
}
// This needs to be set before we call getPtrSizedFrameRegister, otherwise
// we get the wrong frame register.
MachineFrameInfo *MFI = MF->getFrameInfo();
MFI->setFrameAddressIsTaken(true);
const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
unsigned FrameReg = RegInfo->getPtrSizedFrameRegister(*MF);
assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
(FrameReg == X86::EBP && VT == MVT::i32)) &&
"Invalid Frame Register!");
// Always make a copy of the frame register to to a vreg first, so that we
// never directly reference the frame register (the TwoAddressInstruction-
// Pass doesn't like that).
unsigned SrcReg = createResultReg(RC);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), SrcReg).addReg(FrameReg);
// Now recursively load from the frame address.
// movq (%rbp), %rax
// movq (%rax), %rax
// movq (%rax), %rax
// ...
unsigned DestReg;
unsigned Depth = cast<ConstantInt>(II->getOperand(0))->getZExtValue();
while (Depth--) {
DestReg = createResultReg(RC);
addDirectMem(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(Opc), DestReg), SrcReg);
SrcReg = DestReg;
}
updateValueMap(II, SrcReg);
return true;
}
case Intrinsic::memcpy: {
const MemCpyInst *MCI = cast<MemCpyInst>(II);
// Don't handle volatile or variable length memcpys.
if (MCI->isVolatile())
return false;
if (isa<ConstantInt>(MCI->getLength())) {
// Small memcpy's are common enough that we want to do them
// without a call if possible.
uint64_t Len = cast<ConstantInt>(MCI->getLength())->getZExtValue();
if (IsMemcpySmall(Len)) {
X86AddressMode DestAM, SrcAM;
if (!X86SelectAddress(MCI->getRawDest(), DestAM) ||
!X86SelectAddress(MCI->getRawSource(), SrcAM))
return false;
TryEmitSmallMemcpy(DestAM, SrcAM, Len);
return true;
}
}
unsigned SizeWidth = Subtarget->is64Bit() ? 64 : 32;
if (!MCI->getLength()->getType()->isIntegerTy(SizeWidth))
return false;
if (MCI->getSourceAddressSpace() > 255 || MCI->getDestAddressSpace() > 255)
return false;
return lowerCallTo(II, "memcpy", II->getNumArgOperands() - 2);
}
case Intrinsic::memset: {
const MemSetInst *MSI = cast<MemSetInst>(II);
if (MSI->isVolatile())
return false;
unsigned SizeWidth = Subtarget->is64Bit() ? 64 : 32;
if (!MSI->getLength()->getType()->isIntegerTy(SizeWidth))
return false;
if (MSI->getDestAddressSpace() > 255)
return false;
return lowerCallTo(II, "memset", II->getNumArgOperands() - 2);
}
case Intrinsic::stackprotector: {
// Emit code to store the stack guard onto the stack.
EVT PtrTy = TLI.getPointerTy(DL);
const Value *Op1 = II->getArgOperand(0); // The guard's value.
const AllocaInst *Slot = cast<AllocaInst>(II->getArgOperand(1));
MFI.setStackProtectorIndex(FuncInfo.StaticAllocaMap[Slot]);
// Grab the frame index.
X86AddressMode AM;
if (!X86SelectAddress(Slot, AM)) return false;
if (!X86FastEmitStore(PtrTy, Op1, AM)) return false;
return true;
}
case Intrinsic::dbg_declare: {
const DbgDeclareInst *DI = cast<DbgDeclareInst>(II);
X86AddressMode AM;
assert(DI->getAddress() && "Null address should be checked earlier!");
if (!X86SelectAddress(DI->getAddress(), AM))
return false;
const MCInstrDesc &II = TII.get(TargetOpcode::DBG_VALUE);
// FIXME may need to add RegState::Debug to any registers produced,
// although ESP/EBP should be the only ones at the moment.
assert(DI->getVariable()->isValidLocationForIntrinsic(DbgLoc) &&
"Expected inlined-at fields to agree");
addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II), AM)
.addImm(0)
.addMetadata(DI->getVariable())
.addMetadata(DI->getExpression());
return true;
}
case Intrinsic::trap: {
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TRAP));
return true;
}
case Intrinsic::sqrt: {
if (!Subtarget->hasSSE1())
return false;
Type *RetTy = II->getCalledFunction()->getReturnType();
MVT VT;
if (!isTypeLegal(RetTy, VT))
return false;
// Unfortunately we can't use fastEmit_r, because the AVX version of FSQRT
// is not generated by FastISel yet.
// FIXME: Update this code once tablegen can handle it.
static const unsigned SqrtOpc[2][2] = {
{X86::SQRTSSr, X86::VSQRTSSr},
{X86::SQRTSDr, X86::VSQRTSDr}
};
bool HasAVX = Subtarget->hasAVX();
unsigned Opc;
const TargetRegisterClass *RC;
switch (VT.SimpleTy) {
default: return false;
case MVT::f32: Opc = SqrtOpc[0][HasAVX]; RC = &X86::FR32RegClass; break;
case MVT::f64: Opc = SqrtOpc[1][HasAVX]; RC = &X86::FR64RegClass; break;
}
const Value *SrcVal = II->getArgOperand(0);
unsigned SrcReg = getRegForValue(SrcVal);
if (SrcReg == 0)
return false;
unsigned ImplicitDefReg = 0;
if (HasAVX) {
ImplicitDefReg = createResultReg(RC);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::IMPLICIT_DEF), ImplicitDefReg);
}
unsigned ResultReg = createResultReg(RC);
MachineInstrBuilder MIB;
MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc),
ResultReg);
if (ImplicitDefReg)
MIB.addReg(ImplicitDefReg);
MIB.addReg(SrcReg);
updateValueMap(II, ResultReg);
return true;
}
case Intrinsic::sadd_with_overflow:
case Intrinsic::uadd_with_overflow:
case Intrinsic::ssub_with_overflow:
case Intrinsic::usub_with_overflow:
case Intrinsic::smul_with_overflow:
case Intrinsic::umul_with_overflow: {
// This implements the basic lowering of the xalu with overflow intrinsics
// into add/sub/mul followed by either seto or setb.
const Function *Callee = II->getCalledFunction();
auto *Ty = cast<StructType>(Callee->getReturnType());
Type *RetTy = Ty->getTypeAtIndex(0U);
Type *CondTy = Ty->getTypeAtIndex(1);
MVT VT;
if (!isTypeLegal(RetTy, VT))
return false;
if (VT < MVT::i8 || VT > MVT::i64)
return false;
const Value *LHS = II->getArgOperand(0);
const Value *RHS = II->getArgOperand(1);
// Canonicalize immediate to the RHS.
if (isa<ConstantInt>(LHS) && !isa<ConstantInt>(RHS) &&
isCommutativeIntrinsic(II))
std::swap(LHS, RHS);
bool UseIncDec = false;
if (isa<ConstantInt>(RHS) && cast<ConstantInt>(RHS)->isOne())
UseIncDec = true;
unsigned BaseOpc, CondOpc;
switch (II->getIntrinsicID()) {
default: llvm_unreachable("Unexpected intrinsic!");
case Intrinsic::sadd_with_overflow:
BaseOpc = UseIncDec ? unsigned(X86ISD::INC) : unsigned(ISD::ADD);
CondOpc = X86::SETOr;
break;
case Intrinsic::uadd_with_overflow:
BaseOpc = ISD::ADD; CondOpc = X86::SETBr; break;
case Intrinsic::ssub_with_overflow:
BaseOpc = UseIncDec ? unsigned(X86ISD::DEC) : unsigned(ISD::SUB);
CondOpc = X86::SETOr;
break;
case Intrinsic::usub_with_overflow:
BaseOpc = ISD::SUB; CondOpc = X86::SETBr; break;
case Intrinsic::smul_with_overflow:
BaseOpc = X86ISD::SMUL; CondOpc = X86::SETOr; break;
case Intrinsic::umul_with_overflow:
BaseOpc = X86ISD::UMUL; CondOpc = X86::SETOr; break;
}
unsigned LHSReg = getRegForValue(LHS);
if (LHSReg == 0)
return false;
bool LHSIsKill = hasTrivialKill(LHS);
unsigned ResultReg = 0;
// Check if we have an immediate version.
if (const auto *CI = dyn_cast<ConstantInt>(RHS)) {
static const unsigned Opc[2][4] = {
{ X86::INC8r, X86::INC16r, X86::INC32r, X86::INC64r },
{ X86::DEC8r, X86::DEC16r, X86::DEC32r, X86::DEC64r }
};
if (BaseOpc == X86ISD::INC || BaseOpc == X86ISD::DEC) {
ResultReg = createResultReg(TLI.getRegClassFor(VT));
bool IsDec = BaseOpc == X86ISD::DEC;
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(Opc[IsDec][VT.SimpleTy-MVT::i8]), ResultReg)
.addReg(LHSReg, getKillRegState(LHSIsKill));
} else
ResultReg = fastEmit_ri(VT, VT, BaseOpc, LHSReg, LHSIsKill,
CI->getZExtValue());
}
unsigned RHSReg;
bool RHSIsKill;
if (!ResultReg) {
RHSReg = getRegForValue(RHS);
if (RHSReg == 0)
return false;
RHSIsKill = hasTrivialKill(RHS);
ResultReg = fastEmit_rr(VT, VT, BaseOpc, LHSReg, LHSIsKill, RHSReg,
RHSIsKill);
}
// FastISel doesn't have a pattern for all X86::MUL*r and X86::IMUL*r. Emit
// it manually.
if (BaseOpc == X86ISD::UMUL && !ResultReg) {
static const unsigned MULOpc[] =
{ X86::MUL8r, X86::MUL16r, X86::MUL32r, X86::MUL64r };
static const unsigned Reg[] = { X86::AL, X86::AX, X86::EAX, X86::RAX };
// First copy the first operand into RAX, which is an implicit input to
// the X86::MUL*r instruction.
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), Reg[VT.SimpleTy-MVT::i8])
.addReg(LHSReg, getKillRegState(LHSIsKill));
ResultReg = fastEmitInst_r(MULOpc[VT.SimpleTy-MVT::i8],
TLI.getRegClassFor(VT), RHSReg, RHSIsKill);
} else if (BaseOpc == X86ISD::SMUL && !ResultReg) {
static const unsigned MULOpc[] =
{ X86::IMUL8r, X86::IMUL16rr, X86::IMUL32rr, X86::IMUL64rr };
if (VT == MVT::i8) {
// Copy the first operand into AL, which is an implicit input to the
// X86::IMUL8r instruction.
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), X86::AL)
.addReg(LHSReg, getKillRegState(LHSIsKill));
ResultReg = fastEmitInst_r(MULOpc[0], TLI.getRegClassFor(VT), RHSReg,
RHSIsKill);
} else
ResultReg = fastEmitInst_rr(MULOpc[VT.SimpleTy-MVT::i8],
TLI.getRegClassFor(VT), LHSReg, LHSIsKill,
RHSReg, RHSIsKill);
}
if (!ResultReg)
return false;
unsigned ResultReg2 = FuncInfo.CreateRegs(CondTy);
assert((ResultReg+1) == ResultReg2 && "Nonconsecutive result registers.");
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CondOpc),
ResultReg2);
updateValueMap(II, ResultReg, 2);
return true;
}
case Intrinsic::x86_sse_cvttss2si:
case Intrinsic::x86_sse_cvttss2si64:
case Intrinsic::x86_sse2_cvttsd2si:
case Intrinsic::x86_sse2_cvttsd2si64: {
bool IsInputDouble;
switch (II->getIntrinsicID()) {
default: llvm_unreachable("Unexpected intrinsic.");
case Intrinsic::x86_sse_cvttss2si:
case Intrinsic::x86_sse_cvttss2si64:
if (!Subtarget->hasSSE1())
return false;
IsInputDouble = false;
break;
case Intrinsic::x86_sse2_cvttsd2si:
case Intrinsic::x86_sse2_cvttsd2si64:
if (!Subtarget->hasSSE2())
return false;
IsInputDouble = true;
break;
}
Type *RetTy = II->getCalledFunction()->getReturnType();
MVT VT;
if (!isTypeLegal(RetTy, VT))
return false;
static const unsigned CvtOpc[2][2][2] = {
{ { X86::CVTTSS2SIrr, X86::VCVTTSS2SIrr },
{ X86::CVTTSS2SI64rr, X86::VCVTTSS2SI64rr } },
{ { X86::CVTTSD2SIrr, X86::VCVTTSD2SIrr },
{ X86::CVTTSD2SI64rr, X86::VCVTTSD2SI64rr } }
};
bool HasAVX = Subtarget->hasAVX();
unsigned Opc;
switch (VT.SimpleTy) {
default: llvm_unreachable("Unexpected result type.");
case MVT::i32: Opc = CvtOpc[IsInputDouble][0][HasAVX]; break;
case MVT::i64: Opc = CvtOpc[IsInputDouble][1][HasAVX]; break;
}
// Check if we can fold insertelement instructions into the convert.
const Value *Op = II->getArgOperand(0);
while (auto *IE = dyn_cast<InsertElementInst>(Op)) {
const Value *Index = IE->getOperand(2);
if (!isa<ConstantInt>(Index))
break;
unsigned Idx = cast<ConstantInt>(Index)->getZExtValue();
if (Idx == 0) {
Op = IE->getOperand(1);
break;
}
Op = IE->getOperand(0);
}
unsigned Reg = getRegForValue(Op);
if (Reg == 0)
return false;
unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT));
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
.addReg(Reg);
updateValueMap(II, ResultReg);
return true;
}
}
}
bool X86FastISel::fastLowerArguments() {
if (!FuncInfo.CanLowerReturn)
return false;
const Function *F = FuncInfo.Fn;
if (F->isVarArg())
return false;
CallingConv::ID CC = F->getCallingConv();
if (CC != CallingConv::C)
return false;
if (Subtarget->isCallingConvWin64(CC))
return false;
if (!Subtarget->is64Bit())
return false;
// Only handle simple cases. i.e. Up to 6 i32/i64 scalar arguments.
unsigned GPRCnt = 0;
unsigned FPRCnt = 0;
unsigned Idx = 0;
for (auto const &Arg : F->args()) {
// The first argument is at index 1.
++Idx;
if (F->getAttributes().hasAttribute(Idx, Attribute::ByVal) ||
F->getAttributes().hasAttribute(Idx, Attribute::InReg) ||
F->getAttributes().hasAttribute(Idx, Attribute::StructRet) ||
F->getAttributes().hasAttribute(Idx, Attribute::Nest))
return false;
Type *ArgTy = Arg.getType();
if (ArgTy->isStructTy() || ArgTy->isArrayTy() || ArgTy->isVectorTy())
return false;
EVT ArgVT = TLI.getValueType(DL, ArgTy);
if (!ArgVT.isSimple()) return false;
switch (ArgVT.getSimpleVT().SimpleTy) {
default: return false;
case MVT::i32:
case MVT::i64:
++GPRCnt;
break;
case MVT::f32:
case MVT::f64:
if (!Subtarget->hasSSE1())
return false;
++FPRCnt;
break;
}
if (GPRCnt > 6)
return false;
if (FPRCnt > 8)
return false;
}
static const MCPhysReg GPR32ArgRegs[] = {
X86::EDI, X86::ESI, X86::EDX, X86::ECX, X86::R8D, X86::R9D
};
static const MCPhysReg GPR64ArgRegs[] = {
X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8 , X86::R9
};
static const MCPhysReg XMMArgRegs[] = {
X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
};
unsigned GPRIdx = 0;
unsigned FPRIdx = 0;
for (auto const &Arg : F->args()) {
MVT VT = TLI.getSimpleValueType(DL, Arg.getType());
const TargetRegisterClass *RC = TLI.getRegClassFor(VT);
unsigned SrcReg;
switch (VT.SimpleTy) {
default: llvm_unreachable("Unexpected value type.");
case MVT::i32: SrcReg = GPR32ArgRegs[GPRIdx++]; break;
case MVT::i64: SrcReg = GPR64ArgRegs[GPRIdx++]; break;
case MVT::f32: // fall-through
case MVT::f64: SrcReg = XMMArgRegs[FPRIdx++]; break;
}
unsigned DstReg = FuncInfo.MF->addLiveIn(SrcReg, RC);
// FIXME: Unfortunately it's necessary to emit a copy from the livein copy.
// Without this, EmitLiveInCopies may eliminate the livein if its only
// use is a bitcast (which isn't turned into an instruction).
unsigned ResultReg = createResultReg(RC);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), ResultReg)
.addReg(DstReg, getKillRegState(true));
updateValueMap(&Arg, ResultReg);
}
return true;
}
static unsigned computeBytesPoppedByCallee(const X86Subtarget *Subtarget,
CallingConv::ID CC,
ImmutableCallSite *CS) {
if (Subtarget->is64Bit())
return 0;
if (Subtarget->getTargetTriple().isOSMSVCRT())
return 0;
if (CC == CallingConv::Fast || CC == CallingConv::GHC ||
CC == CallingConv::HiPE)
return 0;
if (CS && !CS->paramHasAttr(1, Attribute::StructRet))
return 0;
if (CS && CS->paramHasAttr(1, Attribute::InReg))
return 0;
return 4;
}
bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
auto &OutVals = CLI.OutVals;
auto &OutFlags = CLI.OutFlags;
auto &OutRegs = CLI.OutRegs;
auto &Ins = CLI.Ins;
auto &InRegs = CLI.InRegs;
CallingConv::ID CC = CLI.CallConv;
bool &IsTailCall = CLI.IsTailCall;
bool IsVarArg = CLI.IsVarArg;
const Value *Callee = CLI.Callee;
MCSymbol *Symbol = CLI.Symbol;
bool Is64Bit = Subtarget->is64Bit();
bool IsWin64 = Subtarget->isCallingConvWin64(CC);
// Handle only C, fastcc, and webkit_js calling conventions for now.
switch (CC) {
default: return false;
case CallingConv::C:
case CallingConv::Fast:
case CallingConv::WebKit_JS:
case CallingConv::X86_FastCall:
case CallingConv::X86_64_Win64:
case CallingConv::X86_64_SysV:
break;
}
// Allow SelectionDAG isel to handle tail calls.
if (IsTailCall)
return false;
// fastcc with -tailcallopt is intended to provide a guaranteed
// tail call optimization. Fastisel doesn't know how to do that.
if (CC == CallingConv::Fast && TM.Options.GuaranteedTailCallOpt)
return false;
// Don't know how to handle Win64 varargs yet. Nothing special needed for
// x86-32. Special handling for x86-64 is implemented.
if (IsVarArg && IsWin64)
return false;
// Don't know about inalloca yet.
if (CLI.CS && CLI.CS->hasInAllocaArgument())
return false;
// Fast-isel doesn't know about callee-pop yet.
if (X86::isCalleePop(CC, Subtarget->is64Bit(), IsVarArg,
TM.Options.GuaranteedTailCallOpt))
return false;
SmallVector<MVT, 16> OutVTs;
SmallVector<unsigned, 16> ArgRegs;
// If this is a constant i1/i8/i16 argument, promote to i32 to avoid an extra
// instruction. This is safe because it is common to all FastISel supported
// calling conventions on x86.
for (int i = 0, e = OutVals.size(); i != e; ++i) {
Value *&Val = OutVals[i];
ISD::ArgFlagsTy Flags = OutFlags[i];
if (auto *CI = dyn_cast<ConstantInt>(Val)) {
if (CI->getBitWidth() < 32) {
if (Flags.isSExt())
Val = ConstantExpr::getSExt(CI, Type::getInt32Ty(CI->getContext()));
else
Val = ConstantExpr::getZExt(CI, Type::getInt32Ty(CI->getContext()));
}
}
// Passing bools around ends up doing a trunc to i1 and passing it.
// Codegen this as an argument + "and 1".
MVT VT;
auto *TI = dyn_cast<TruncInst>(Val);
unsigned ResultReg;
if (TI && TI->getType()->isIntegerTy(1) && CLI.CS &&
(TI->getParent() == CLI.CS->getInstruction()->getParent()) &&
TI->hasOneUse()) {
Value *PrevVal = TI->getOperand(0);
ResultReg = getRegForValue(PrevVal);
if (!ResultReg)
return false;
if (!isTypeLegal(PrevVal->getType(), VT))
return false;
ResultReg =
fastEmit_ri(VT, VT, ISD::AND, ResultReg, hasTrivialKill(PrevVal), 1);
} else {
if (!isTypeLegal(Val->getType(), VT))
return false;
ResultReg = getRegForValue(Val);
}
if (!ResultReg)
return false;
ArgRegs.push_back(ResultReg);
OutVTs.push_back(VT);
}
// Analyze operands of the call, assigning locations to each operand.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CC, IsVarArg, *FuncInfo.MF, ArgLocs, CLI.RetTy->getContext());
// Allocate shadow area for Win64
if (IsWin64)
CCInfo.AllocateStack(32, 8);
CCInfo.AnalyzeCallOperands(OutVTs, OutFlags, CC_X86);
// Get a count of how many bytes are to be pushed on the stack.
unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
// Issue CALLSEQ_START
unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackDown))
.addImm(NumBytes).addImm(0);
// Walk the register/memloc assignments, inserting copies/loads.
const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign const &VA = ArgLocs[i];
const Value *ArgVal = OutVals[VA.getValNo()];
MVT ArgVT = OutVTs[VA.getValNo()];
if (ArgVT == MVT::x86mmx)
return false;
unsigned ArgReg = ArgRegs[VA.getValNo()];
// Promote the value if needed.
switch (VA.getLocInfo()) {
case CCValAssign::Full: break;
case CCValAssign::SExt: {
assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() &&
"Unexpected extend");
bool Emitted = X86FastEmitExtend(ISD::SIGN_EXTEND, VA.getLocVT(), ArgReg,
ArgVT, ArgReg);
assert(Emitted && "Failed to emit a sext!"); (void)Emitted;
ArgVT = VA.getLocVT();
break;
}
case CCValAssign::ZExt: {
assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() &&
"Unexpected extend");
bool Emitted = X86FastEmitExtend(ISD::ZERO_EXTEND, VA.getLocVT(), ArgReg,
ArgVT, ArgReg);
assert(Emitted && "Failed to emit a zext!"); (void)Emitted;
ArgVT = VA.getLocVT();
break;
}
case CCValAssign::AExt: {
assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() &&
"Unexpected extend");
bool Emitted = X86FastEmitExtend(ISD::ANY_EXTEND, VA.getLocVT(), ArgReg,
ArgVT, ArgReg);
if (!Emitted)
Emitted = X86FastEmitExtend(ISD::ZERO_EXTEND, VA.getLocVT(), ArgReg,
ArgVT, ArgReg);
if (!Emitted)
Emitted = X86FastEmitExtend(ISD::SIGN_EXTEND, VA.getLocVT(), ArgReg,
ArgVT, ArgReg);
assert(Emitted && "Failed to emit a aext!"); (void)Emitted;
ArgVT = VA.getLocVT();
break;
}
case CCValAssign::BCvt: {
ArgReg = fastEmit_r(ArgVT, VA.getLocVT(), ISD::BITCAST, ArgReg,
/*TODO: Kill=*/false);
assert(ArgReg && "Failed to emit a bitcast!");
ArgVT = VA.getLocVT();
break;
}
case CCValAssign::VExt:
// VExt has not been implemented, so this should be impossible to reach
// for now. However, fallback to Selection DAG isel once implemented.
return false;
case CCValAssign::AExtUpper:
case CCValAssign::SExtUpper:
case CCValAssign::ZExtUpper:
case CCValAssign::FPExt:
llvm_unreachable("Unexpected loc info!");
case CCValAssign::Indirect:
// FIXME: Indirect doesn't need extending, but fast-isel doesn't fully
// support this.
return false;
}
if (VA.isRegLoc()) {
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), VA.getLocReg()).addReg(ArgReg);
OutRegs.push_back(VA.getLocReg());
} else {
assert(VA.isMemLoc());
// Don't emit stores for undef values.
if (isa<UndefValue>(ArgVal))
continue;
unsigned LocMemOffset = VA.getLocMemOffset();
X86AddressMode AM;
AM.Base.Reg = RegInfo->getStackRegister();
AM.Disp = LocMemOffset;
ISD::ArgFlagsTy Flags = OutFlags[VA.getValNo()];
unsigned Alignment = DL.getABITypeAlignment(ArgVal->getType());
MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand(
MachinePointerInfo::getStack(*FuncInfo.MF, LocMemOffset),
MachineMemOperand::MOStore, ArgVT.getStoreSize(), Alignment);
if (Flags.isByVal()) {
X86AddressMode SrcAM;
SrcAM.Base.Reg = ArgReg;
if (!TryEmitSmallMemcpy(AM, SrcAM, Flags.getByValSize()))
return false;
} else if (isa<ConstantInt>(ArgVal) || isa<ConstantPointerNull>(ArgVal)) {
// If this is a really simple value, emit this with the Value* version
// of X86FastEmitStore. If it isn't simple, we don't want to do this,
// as it can cause us to reevaluate the argument.
if (!X86FastEmitStore(ArgVT, ArgVal, AM, MMO))
return false;
} else {
bool ValIsKill = hasTrivialKill(ArgVal);
if (!X86FastEmitStore(ArgVT, ArgReg, ValIsKill, AM, MMO))
return false;
}
}
}
// ELF / PIC requires GOT in the EBX register before function calls via PLT
// GOT pointer.
if (Subtarget->isPICStyleGOT()) {
unsigned Base = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), X86::EBX).addReg(Base);
}
if (Is64Bit && IsVarArg && !IsWin64) {
// From AMD64 ABI document:
// For calls that may call functions that use varargs or stdargs
// (prototype-less calls or calls to functions containing ellipsis (...) in
// the declaration) %al is used as hidden argument to specify the number
// of SSE registers used. The contents of %al do not need to match exactly
// the number of registers, but must be an ubound on the number of SSE
// registers used and is in the range 0 - 8 inclusive.
// Count the number of XMM registers allocated.
static const MCPhysReg XMMArgRegs[] = {
X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
};
unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
assert((Subtarget->hasSSE1() || !NumXMMRegs)
&& "SSE registers cannot be used when SSE is disabled");
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV8ri),
X86::AL).addImm(NumXMMRegs);
}
// Materialize callee address in a register. FIXME: GV address can be
// handled with a CALLpcrel32 instead.
X86AddressMode CalleeAM;
if (!X86SelectCallAddress(Callee, CalleeAM))
return false;
unsigned CalleeOp = 0;
const GlobalValue *GV = nullptr;
if (CalleeAM.GV != nullptr) {
GV = CalleeAM.GV;
} else if (CalleeAM.Base.Reg != 0) {
CalleeOp = CalleeAM.Base.Reg;
} else
return false;
// Issue the call.
MachineInstrBuilder MIB;
if (CalleeOp) {
// Register-indirect call.
unsigned CallOpc = Is64Bit ? X86::CALL64r : X86::CALL32r;
MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CallOpc))
.addReg(CalleeOp);
} else {
// Direct call.
assert(GV && "Not a direct call");
unsigned CallOpc = Is64Bit ? X86::CALL64pcrel32 : X86::CALLpcrel32;
// See if we need any target-specific flags on the GV operand.
unsigned char OpFlags = 0;
// On ELF targets, in both X86-64 and X86-32 mode, direct calls to
// external symbols most go through the PLT in PIC mode. If the symbol
// has hidden or protected visibility, or if it is static or local, then
// we don't need to use the PLT - we can directly call it.
if (Subtarget->isTargetELF() &&
TM.getRelocationModel() == Reloc::PIC_ &&
GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
OpFlags = X86II::MO_PLT;
} else if (Subtarget->isPICStyleStubAny() &&
!GV->isStrongDefinitionForLinker() &&
(!Subtarget->getTargetTriple().isMacOSX() ||
Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
// PC-relative references to external symbols should go through $stub,
// unless we're building with the leopard linker or later, which
// automatically synthesizes these stubs.
OpFlags = X86II::MO_DARWIN_STUB;
}
MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CallOpc));
if (Symbol)
MIB.addSym(Symbol, OpFlags);
else
MIB.addGlobalAddress(GV, 0, OpFlags);
}
// Add a register mask operand representing the call-preserved registers.
// Proper defs for return values will be added by setPhysRegsDeadExcept().
MIB.addRegMask(TRI.getCallPreservedMask(*FuncInfo.MF, CC));
// Add an implicit use GOT pointer in EBX.
if (Subtarget->isPICStyleGOT())
MIB.addReg(X86::EBX, RegState::Implicit);
if (Is64Bit && IsVarArg && !IsWin64)
MIB.addReg(X86::AL, RegState::Implicit);
// Add implicit physical register uses to the call.
for (auto Reg : OutRegs)
MIB.addReg(Reg, RegState::Implicit);
// Issue CALLSEQ_END
unsigned NumBytesForCalleeToPop =
computeBytesPoppedByCallee(Subtarget, CC, CLI.CS);
unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackUp))
.addImm(NumBytes).addImm(NumBytesForCalleeToPop);
// Now handle call return values.
SmallVector<CCValAssign, 16> RVLocs;
CCState CCRetInfo(CC, IsVarArg, *FuncInfo.MF, RVLocs,
CLI.RetTy->getContext());
CCRetInfo.AnalyzeCallResult(Ins, RetCC_X86);
// Copy all of the result registers out of their specified physreg.
unsigned ResultReg = FuncInfo.CreateRegs(CLI.RetTy);
for (unsigned i = 0; i != RVLocs.size(); ++i) {
CCValAssign &VA = RVLocs[i];
EVT CopyVT = VA.getValVT();
unsigned CopyReg = ResultReg + i;
// If this is x86-64, and we disabled SSE, we can't return FP values
if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
report_fatal_error("SSE register return with SSE disabled");
}
// If we prefer to use the value in xmm registers, copy it out as f80 and
// use a truncate to move it from fp stack reg to xmm reg.
if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
isScalarFPTypeInSSEReg(VA.getValVT())) {
CopyVT = MVT::f80;
CopyReg = createResultReg(&X86::RFP80RegClass);
}
// Copy out the result.
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), CopyReg).addReg(VA.getLocReg());
InRegs.push_back(VA.getLocReg());
// Round the f80 to the right size, which also moves it to the appropriate
// xmm register. This is accomplished by storing the f80 value in memory
// and then loading it back.
if (CopyVT != VA.getValVT()) {
EVT ResVT = VA.getValVT();
unsigned Opc = ResVT == MVT::f32 ? X86::ST_Fp80m32 : X86::ST_Fp80m64;
unsigned MemSize = ResVT.getSizeInBits()/8;
int FI = MFI.CreateStackObject(MemSize, MemSize, false);
addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(Opc)), FI)
.addReg(CopyReg);
Opc = ResVT == MVT::f32 ? X86::MOVSSrm : X86::MOVSDrm;
addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(Opc), ResultReg + i), FI);
}
}
CLI.ResultReg = ResultReg;
CLI.NumResultRegs = RVLocs.size();
CLI.Call = MIB;
return true;
}
bool
X86FastISel::fastSelectInstruction(const Instruction *I) {
switch (I->getOpcode()) {
default: break;
case Instruction::Load:
return X86SelectLoad(I);
case Instruction::Store:
return X86SelectStore(I);
case Instruction::Ret:
return X86SelectRet(I);
case Instruction::ICmp:
case Instruction::FCmp:
return X86SelectCmp(I);
case Instruction::ZExt:
return X86SelectZExt(I);
case Instruction::Br:
return X86SelectBranch(I);
case Instruction::LShr:
case Instruction::AShr:
case Instruction::Shl:
return X86SelectShift(I);
case Instruction::SDiv:
case Instruction::UDiv:
case Instruction::SRem:
case Instruction::URem:
return X86SelectDivRem(I);
case Instruction::Select:
return X86SelectSelect(I);
case Instruction::Trunc:
return X86SelectTrunc(I);
case Instruction::FPExt:
return X86SelectFPExt(I);
case Instruction::FPTrunc:
return X86SelectFPTrunc(I);
case Instruction::SIToFP:
return X86SelectSIToFP(I);
case Instruction::IntToPtr: // Deliberate fall-through.
case Instruction::PtrToInt: {
EVT SrcVT = TLI.getValueType(DL, I->getOperand(0)->getType());
EVT DstVT = TLI.getValueType(DL, I->getType());
if (DstVT.bitsGT(SrcVT))
return X86SelectZExt(I);
if (DstVT.bitsLT(SrcVT))
return X86SelectTrunc(I);
unsigned Reg = getRegForValue(I->getOperand(0));
if (Reg == 0) return false;
updateValueMap(I, Reg);
return true;
}
}
return false;
}
unsigned X86FastISel::X86MaterializeInt(const ConstantInt *CI, MVT VT) {
if (VT > MVT::i64)
return 0;
uint64_t Imm = CI->getZExtValue();
if (Imm == 0) {
unsigned SrcReg = fastEmitInst_(X86::MOV32r0, &X86::GR32RegClass);
switch (VT.SimpleTy) {
default: llvm_unreachable("Unexpected value type");
case MVT::i1:
case MVT::i8:
return fastEmitInst_extractsubreg(MVT::i8, SrcReg, /*Kill=*/true,
X86::sub_8bit);
case MVT::i16:
return fastEmitInst_extractsubreg(MVT::i16, SrcReg, /*Kill=*/true,
X86::sub_16bit);
case MVT::i32:
return SrcReg;
case MVT::i64: {
unsigned ResultReg = createResultReg(&X86::GR64RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::SUBREG_TO_REG), ResultReg)
.addImm(0).addReg(SrcReg).addImm(X86::sub_32bit);
return ResultReg;
}
}
}
unsigned Opc = 0;
switch (VT.SimpleTy) {
default: llvm_unreachable("Unexpected value type");
case MVT::i1: VT = MVT::i8; // fall-through
case MVT::i8: Opc = X86::MOV8ri; break;
case MVT::i16: Opc = X86::MOV16ri; break;
case MVT::i32: Opc = X86::MOV32ri; break;
case MVT::i64: {
if (isUInt<32>(Imm))
Opc = X86::MOV32ri;
else if (isInt<32>(Imm))
Opc = X86::MOV64ri32;
else
Opc = X86::MOV64ri;
break;
}
}
if (VT == MVT::i64 && Opc == X86::MOV32ri) {
unsigned SrcReg = fastEmitInst_i(Opc, &X86::GR32RegClass, Imm);
unsigned ResultReg = createResultReg(&X86::GR64RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::SUBREG_TO_REG), ResultReg)
.addImm(0).addReg(SrcReg).addImm(X86::sub_32bit);
return ResultReg;
}
return fastEmitInst_i(Opc, TLI.getRegClassFor(VT), Imm);
}
unsigned X86FastISel::X86MaterializeFP(const ConstantFP *CFP, MVT VT) {
if (CFP->isNullValue())
return fastMaterializeFloatZero(CFP);
// Can't handle alternate code models yet.
CodeModel::Model CM = TM.getCodeModel();
if (CM != CodeModel::Small && CM != CodeModel::Large)
return 0;
// Get opcode and regclass of the output for the given load instruction.
unsigned Opc = 0;
const TargetRegisterClass *RC = nullptr;
switch (VT.SimpleTy) {
default: return 0;
case MVT::f32:
if (X86ScalarSSEf32) {
Opc = Subtarget->hasAVX() ? X86::VMOVSSrm : X86::MOVSSrm;
RC = &X86::FR32RegClass;
} else {
Opc = X86::LD_Fp32m;
RC = &X86::RFP32RegClass;
}
break;
case MVT::f64:
if (X86ScalarSSEf64) {
Opc = Subtarget->hasAVX() ? X86::VMOVSDrm : X86::MOVSDrm;
RC = &X86::FR64RegClass;
} else {
Opc = X86::LD_Fp64m;
RC = &X86::RFP64RegClass;
}
break;
case MVT::f80:
// No f80 support yet.
return 0;
}
// MachineConstantPool wants an explicit alignment.
unsigned Align = DL.getPrefTypeAlignment(CFP->getType());
if (Align == 0) {
// Alignment of vector types. FIXME!
Align = DL.getTypeAllocSize(CFP->getType());
}
// x86-32 PIC requires a PIC base register for constant pools.
unsigned PICBase = 0;
unsigned char OpFlag = 0;
if (Subtarget->isPICStyleStubPIC()) { // Not dynamic-no-pic
OpFlag = X86II::MO_PIC_BASE_OFFSET;
PICBase = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF);
} else if (Subtarget->isPICStyleGOT()) {
OpFlag = X86II::MO_GOTOFF;
PICBase = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF);
} else if (Subtarget->isPICStyleRIPRel() &&
TM.getCodeModel() == CodeModel::Small) {
PICBase = X86::RIP;
}
// Create the load from the constant pool.
unsigned CPI = MCP.getConstantPoolIndex(CFP, Align);
unsigned ResultReg = createResultReg(RC);
if (CM == CodeModel::Large) {
unsigned AddrReg = createResultReg(&X86::GR64RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV64ri),
AddrReg)
.addConstantPoolIndex(CPI, 0, OpFlag);
MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(Opc), ResultReg);
addDirectMem(MIB, AddrReg);
MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand(
MachinePointerInfo::getConstantPool(*FuncInfo.MF),
MachineMemOperand::MOLoad, DL.getPointerSize(), Align);
MIB->addMemOperand(*FuncInfo.MF, MMO);
return ResultReg;
}
addConstantPoolReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(Opc), ResultReg),
CPI, PICBase, OpFlag);
return ResultReg;
}
unsigned X86FastISel::X86MaterializeGV(const GlobalValue *GV, MVT VT) {
// Can't handle alternate code models yet.
if (TM.getCodeModel() != CodeModel::Small)
return 0;
// Materialize addresses with LEA/MOV instructions.
X86AddressMode AM;
if (X86SelectAddress(GV, AM)) {
// If the expression is just a basereg, then we're done, otherwise we need
// to emit an LEA.
if (AM.BaseType == X86AddressMode::RegBase &&
AM.IndexReg == 0 && AM.Disp == 0 && AM.GV == nullptr)
return AM.Base.Reg;
unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT));
if (TM.getRelocationModel() == Reloc::Static &&
TLI.getPointerTy(DL) == MVT::i64) {
// The displacement code could be more than 32 bits away so we need to use
// an instruction with a 64 bit immediate
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV64ri),
ResultReg)
.addGlobalAddress(GV);
} else {
unsigned Opc =
TLI.getPointerTy(DL) == MVT::i32
? (Subtarget->isTarget64BitILP32() ? X86::LEA64_32r : X86::LEA32r)
: X86::LEA64r;
addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(Opc), ResultReg), AM);
}
return ResultReg;
}
return 0;
}
unsigned X86FastISel::fastMaterializeConstant(const Constant *C) {
EVT CEVT = TLI.getValueType(DL, C->getType(), true);
// Only handle simple types.
if (!CEVT.isSimple())
return 0;
MVT VT = CEVT.getSimpleVT();
if (const auto *CI = dyn_cast<ConstantInt>(C))
return X86MaterializeInt(CI, VT);
else if (const ConstantFP *CFP = dyn_cast<ConstantFP>(C))
return X86MaterializeFP(CFP, VT);
else if (const GlobalValue *GV = dyn_cast<GlobalValue>(C))
return X86MaterializeGV(GV, VT);
return 0;
}
unsigned X86FastISel::fastMaterializeAlloca(const AllocaInst *C) {
// Fail on dynamic allocas. At this point, getRegForValue has already
// checked its CSE maps, so if we're here trying to handle a dynamic
// alloca, we're not going to succeed. X86SelectAddress has a
// check for dynamic allocas, because it's called directly from
// various places, but targetMaterializeAlloca also needs a check
// in order to avoid recursion between getRegForValue,
// X86SelectAddrss, and targetMaterializeAlloca.
if (!FuncInfo.StaticAllocaMap.count(C))
return 0;
assert(C->isStaticAlloca() && "dynamic alloca in the static alloca map?");
X86AddressMode AM;
if (!X86SelectAddress(C, AM))
return 0;
unsigned Opc =
TLI.getPointerTy(DL) == MVT::i32
? (Subtarget->isTarget64BitILP32() ? X86::LEA64_32r : X86::LEA32r)
: X86::LEA64r;
const TargetRegisterClass *RC = TLI.getRegClassFor(TLI.getPointerTy(DL));
unsigned ResultReg = createResultReg(RC);
addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(Opc), ResultReg), AM);
return ResultReg;
}
unsigned X86FastISel::fastMaterializeFloatZero(const ConstantFP *CF) {
MVT VT;
if (!isTypeLegal(CF->getType(), VT))
return 0;
// Get opcode and regclass for the given zero.
unsigned Opc = 0;
const TargetRegisterClass *RC = nullptr;
switch (VT.SimpleTy) {
default: return 0;
case MVT::f32:
if (X86ScalarSSEf32) {
Opc = X86::FsFLD0SS;
RC = &X86::FR32RegClass;
} else {
Opc = X86::LD_Fp032;
RC = &X86::RFP32RegClass;
}
break;
case MVT::f64:
if (X86ScalarSSEf64) {
Opc = X86::FsFLD0SD;
RC = &X86::FR64RegClass;
} else {
Opc = X86::LD_Fp064;
RC = &X86::RFP64RegClass;
}
break;
case MVT::f80:
// No f80 support yet.
return 0;
}
unsigned ResultReg = createResultReg(RC);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg);
return ResultReg;
}
bool X86FastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
const LoadInst *LI) {
const Value *Ptr = LI->getPointerOperand();
X86AddressMode AM;
if (!X86SelectAddress(Ptr, AM))
return false;
const X86InstrInfo &XII = (const X86InstrInfo &)TII;
unsigned Size = DL.getTypeAllocSize(LI->getType());
unsigned Alignment = LI->getAlignment();
if (Alignment == 0) // Ensure that codegen never sees alignment 0
Alignment = DL.getABITypeAlignment(LI->getType());
SmallVector<MachineOperand, 8> AddrOps;
AM.getFullAddress(AddrOps);
MachineInstr *Result = XII.foldMemoryOperandImpl(
*FuncInfo.MF, MI, OpNo, AddrOps, FuncInfo.InsertPt, Size, Alignment,
/*AllowCommute=*/true);
if (!Result)
return false;
// The index register could be in the wrong register class. Unfortunately,
// foldMemoryOperandImpl could have commuted the instruction so its not enough
// to just look at OpNo + the offset to the index reg. We actually need to
// scan the instruction to find the index reg and see if its the correct reg
// class.
unsigned OperandNo = 0;
for (MachineInstr::mop_iterator I = Result->operands_begin(),
E = Result->operands_end(); I != E; ++I, ++OperandNo) {
MachineOperand &MO = *I;
if (!MO.isReg() || MO.isDef() || MO.getReg() != AM.IndexReg)
continue;
// Found the index reg, now try to rewrite it.
unsigned IndexReg = constrainOperandRegClass(Result->getDesc(),
MO.getReg(), OperandNo);
if (IndexReg == MO.getReg())
continue;
MO.setReg(IndexReg);
}
Result->addMemOperand(*FuncInfo.MF, createMachineMemOperandFor(LI));
MI->eraseFromParent();
return true;
}
namespace llvm {
FastISel *X86::createFastISel(FunctionLoweringInfo &funcInfo,
const TargetLibraryInfo *libInfo) {
return new X86FastISel(funcInfo, libInfo);
}
}