Some enhancements for memcpy / memset inline expansion.
1. Teach it to use overlapping unaligned load / store pairs to copy / set the trailing bytes, e.g. on x86, two pairs of movups / movaps for 17 - 31 byte copies.
2. Use f64 for memcpy / memset on targets where i64 is not legal but f64 is, e.g. x86 and ARM.
3. When copying from a constant string, do *not* replace the load with a constant if it is not possible to materialize the integer immediate with a single instruction (this requires a new target hook: TLI.isIntImmLegal()).
4. Use unaligned loads / stores more aggressively if the target hook indicates they are "fast".
5. Update the ARM target hooks to use unaligned loads / stores, e.g. vld1.8 / vst1.8, and raise the thresholds to something reasonable (8 for memset, 4 pairs for memcpy).
This significantly improves Dhrystone, up to 50% on ARM iOS devices.
rdar://12760078
llvm-svn: 169791
Commit 86dd733bc8 (parent 182d1ce4b7)
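As a rough illustration of item 1 above (not part of the patch), a 17-31 byte copy can be covered by two 16-byte unaligned operations, with the second pair shifted back so it ends exactly at the last byte and overlaps the first pair instead of emitting an 8/4/2/1-byte tail. The C++ sketch below is hypothetical; the function name and the 17 <= Size <= 31 precondition are assumptions, and a compiler would typically lower each fixed 16-byte memcpy to a movups / movaps pair on x86.

#include <cstdint>
#include <cstring>

// Hypothetical sketch: copy Size bytes (17 <= Size <= 31) with two
// overlapping 16-byte unaligned operations instead of narrow tail stores.
void copy17to31(uint8_t *Dst, const uint8_t *Src, unsigned Size) {
  // Leading 16 bytes (one unaligned load / store pair).
  std::memcpy(Dst, Src, 16);
  // Trailing 16 bytes, shifted back so they end at Dst + Size; this
  // deliberately overlaps the first pair and covers the whole range.
  std::memcpy(Dst + Size - 16, Src + Size - 16, 16);
}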
@@ -371,6 +371,16 @@ public:
return false;
}

/// isIntImmLegal - Returns true if the target can instruction select the
/// specified integer immediate natively (that is, it's materialized with one
/// instruction). The current *assumption* in isel is all of integer
/// immediates are "legal" and only the memcpy / memset expansion code is
/// making use of this. The rest of isel doesn't have proper cost model for
/// immediate materialization.
virtual bool isIntImmLegal(const APInt &/*Imm*/, EVT /*VT*/) const {
return true;
}

/// isShuffleMaskLegal - Targets can use this to indicate that they only
/// support *some* VECTOR_SHUFFLE operations, those with specific masks.
/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
@@ -678,12 +688,14 @@ public:
}

/// This function returns true if the target allows unaligned memory accesses.
/// of the specified type. This is used, for example, in situations where an
/// array copy/move/set is converted to a sequence of store operations. It's
/// use helps to ensure that such replacements don't generate code that causes
/// an alignment error (trap) on the target machine.
/// of the specified type. If true, it also returns whether the unaligned
/// memory access is "fast" in the second argument by reference. This is used,
/// for example, in situations where an array copy/move/set is converted to a
/// sequence of store operations. It's use helps to ensure that such
/// replacements don't generate code that causes an alignment error (trap) on
/// the target machine.
/// @brief Determine if the target supports unaligned memory accesses.
virtual bool allowsUnalignedMemoryAccesses(EVT) const {
virtual bool allowsUnalignedMemoryAccesses(EVT, bool *Fast = 0) const {
return false;
}
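Before the SelectionDAG changes below, here is a hedged sketch (not from this patch) of how a caller is expected to consult the updated hook: pass no second argument when only legality matters, or pass a bool and read it only after the call returns true. The helper name pickWideType is a placeholder.

#include "llvm/Target/TargetLowering.h"
using namespace llvm;

// Hypothetical helper: decide whether an unaligned access of VT is both
// legal and reported as fast, and therefore worth preferring over a
// narrower, naturally aligned type.
static bool pickWideType(const TargetLowering &TLI, EVT VT) {
  bool Fast = false;
  if (TLI.allowsUnalignedMemoryAccesses(VT, &Fast) && Fast)
    return true;   // legal and fast: keep the wide type
  return false;    // fall back to a narrower type
}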
@@ -3373,7 +3373,7 @@ static SDValue getMemsetStringVal(EVT VT, DebugLoc dl, SelectionDAG &DAG,
unsigned NumVTBytes = VT.getSizeInBits() / 8;
unsigned NumBytes = std::min(NumVTBytes, unsigned(Str.size()));

uint64_t Val = 0;
APInt Val(NumBytes*8, 0);
if (TLI.isLittleEndian()) {
for (unsigned i = 0; i != NumBytes; ++i)
Val |= (uint64_t)(unsigned char)Str[i] << i*8;
@@ -3382,7 +3382,9 @@ static SDValue getMemsetStringVal(EVT VT, DebugLoc dl, SelectionDAG &DAG,
Val |= (uint64_t)(unsigned char)Str[i] << (NumVTBytes-i-1)*8;
}

return DAG.getConstant(Val, VT);
if (TLI.isIntImmLegal(Val, VT))
return DAG.getConstant(Val, VT);
return SDValue(0, 0);
}

/// getMemBasePlusOffset - Returns base and offset node for the
@@ -3422,6 +3424,7 @@ static bool FindOptimalMemOpLowering(std::vector<EVT> &MemOps,
unsigned DstAlign, unsigned SrcAlign,
bool IsZeroVal,
bool MemcpyStrSrc,
bool AllowOverlap,
SelectionDAG &DAG,
const TargetLowering &TLI) {
assert((SrcAlign == 0 || SrcAlign >= DstAlign) &&
@@ -3461,24 +3464,47 @@ static bool FindOptimalMemOpLowering(std::vector<EVT> &MemOps,

unsigned NumMemOps = 0;
while (Size != 0) {
if (++NumMemOps > Limit)
return false;

unsigned VTSize = VT.getSizeInBits() / 8;
while (VTSize > Size) {
// For now, only use non-vector load / store's for the left-over pieces.
EVT NewVT;
unsigned NewVTSize;
if (VT.isVector() || VT.isFloatingPoint()) {
VT = MVT::i64;
while (!TLI.isTypeLegal(VT))
VT = (MVT::SimpleValueType)(VT.getSimpleVT().SimpleTy - 1);
VTSize = VT.getSizeInBits() / 8;
NewVT = (VT.getSizeInBits() > 64) ? MVT::i64 : MVT::i32;
while (!TLI.isOperationLegalOrCustom(ISD::STORE, NewVT)) {
if (NewVT == MVT::i64 &&
TLI.isOperationLegalOrCustom(ISD::STORE, MVT::f64)) {
// i64 is usually not legal on 32-bit targets, but f64 may be.
NewVT = MVT::f64;
break;
}
NewVT = (MVT::SimpleValueType)(NewVT.getSimpleVT().SimpleTy - 1);
}
NewVTSize = NewVT.getSizeInBits() / 8;
} else {
// This can result in a type that is not legal on the target, e.g.
// 1 or 2 bytes on PPC.
VT = (MVT::SimpleValueType)(VT.getSimpleVT().SimpleTy - 1);
VTSize >>= 1;
NewVT = (MVT::SimpleValueType)(VT.getSimpleVT().SimpleTy - 1);
NewVTSize = VTSize >> 1;
}

// If the new VT cannot cover all of the remaining bits, then consider
// issuing a (or a pair of) unaligned and overlapping load / store.
// FIXME: Only does this for 64-bit or more since we don't have proper
// cost model for unaligned load / store.
bool Fast;
if (AllowOverlap && VTSize >= 8 && NewVTSize < Size &&
TLI.allowsUnalignedMemoryAccesses(VT, &Fast) && Fast)
VTSize = Size;
else {
VT = NewVT;
VTSize = NewVTSize;
}
}

if (++NumMemOps > Limit)
return false;
MemOps.push_back(VT);
Size -= VTSize;
}
@@ -3523,7 +3549,7 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, DebugLoc dl,
if (!FindOptimalMemOpLowering(MemOps, Limit, Size,
(DstAlignCanChange ? 0 : Align),
(isZeroStr ? 0 : SrcAlign),
true, CopyFromStr, DAG, TLI))
true, CopyFromStr, true, DAG, TLI))
return SDValue();

if (DstAlignCanChange) {
@@ -3545,6 +3571,14 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, DebugLoc dl,
unsigned VTSize = VT.getSizeInBits() / 8;
SDValue Value, Store;

if (VTSize > Size) {
// Issuing an unaligned load / store pair that overlaps with the previous
// pair. Adjust the offset accordingly.
assert(i == NumMemOps-1 && i != 0);
SrcOff -= VTSize - Size;
DstOff -= VTSize - Size;
}

if (CopyFromStr &&
(isZeroStr || (VT.isInteger() && !VT.isVector()))) {
// It's unlikely a store of a vector immediate can be done in a single
@@ -3553,11 +3587,14 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, DebugLoc dl,
// FIXME: Handle other cases where store of vector immediate is done in
// a single instruction.
Value = getMemsetStringVal(VT, dl, DAG, TLI, Str.substr(SrcOff));
Store = DAG.getStore(Chain, dl, Value,
getMemBasePlusOffset(Dst, DstOff, DAG),
DstPtrInfo.getWithOffset(DstOff), isVol,
false, Align);
} else {
if (Value.getNode())
Store = DAG.getStore(Chain, dl, Value,
getMemBasePlusOffset(Dst, DstOff, DAG),
DstPtrInfo.getWithOffset(DstOff), isVol,
false, Align);
}

if (!Store.getNode()) {
// The type might not be legal for the target. This should only happen
// if the type is smaller than a legal type, as on PPC, so the right
// thing to do is generate a LoadExt/StoreTrunc pair. These simplify
@@ -3577,6 +3614,7 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, DebugLoc dl,
OutChains.push_back(Store);
SrcOff += VTSize;
DstOff += VTSize;
Size -= VTSize;
}

return DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
@@ -3613,7 +3651,7 @@ static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, DebugLoc dl,

if (!FindOptimalMemOpLowering(MemOps, Limit, Size,
(DstAlignCanChange ? 0 : Align),
SrcAlign, true, false, DAG, TLI))
SrcAlign, true, false, false, DAG, TLI))
return SDValue();

if (DstAlignCanChange) {
@@ -3689,7 +3727,7 @@ static SDValue getMemsetStores(SelectionDAG &DAG, DebugLoc dl,
isa<ConstantSDNode>(Src) && cast<ConstantSDNode>(Src)->isNullValue();
if (!FindOptimalMemOpLowering(MemOps, TLI.getMaxStoresPerMemset(OptSize),
Size, (DstAlignCanChange ? 0 : Align), 0,
IsZeroVal, false, DAG, TLI))
IsZeroVal, false, true, DAG, TLI))
return SDValue();

if (DstAlignCanChange) {
@@ -3716,6 +3754,13 @@ static SDValue getMemsetStores(SelectionDAG &DAG, DebugLoc dl,

for (unsigned i = 0; i < NumMemOps; i++) {
EVT VT = MemOps[i];
unsigned VTSize = VT.getSizeInBits() / 8;
if (VTSize > Size) {
// Issuing an unaligned load / store pair that overlaps with the previous
// pair. Adjust the offset accordingly.
assert(i == NumMemOps-1 && i != 0);
DstOff -= VTSize - Size;
}

// If this store is smaller than the largest store see whether we can get
// the smaller value for free with a truncate.
@@ -3734,6 +3779,7 @@ static SDValue getMemsetStores(SelectionDAG &DAG, DebugLoc dl,
isVol, false, Align);
OutChains.push_back(Store);
DstOff += VT.getSizeInBits() / 8;
Size -= VTSize;
}

return DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
@@ -833,9 +833,12 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
setSchedulingPreference(Sched::Hybrid);

//// temporary - rewrite interface to use type
maxStoresPerMemcpy = maxStoresPerMemcpyOptSize = 1;
maxStoresPerMemset = 16;
maxStoresPerMemset = 8;
maxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
maxStoresPerMemcpy = 4; // For @llvm.memcpy -> sequence of stores
maxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 4 : 2;
maxStoresPerMemmove = 4; // For @llvm.memmove -> sequence of stores
maxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 4 : 2;

// On ARM arguments smaller than 4 bytes are extended, so all arguments
// are at least 4 bytes aligned.
@@ -9406,7 +9409,7 @@ bool ARMTargetLowering::isDesirableToTransformToIntegerOp(unsigned Opc,
return (VT == MVT::f32) && (Opc == ISD::LOAD || Opc == ISD::STORE);
}

bool ARMTargetLowering::allowsUnalignedMemoryAccesses(EVT VT) const {
bool ARMTargetLowering::allowsUnalignedMemoryAccesses(EVT VT, bool *Fast) const {
// The AllowsUnaliged flag models the SCTLR.A setting in ARM cpus
bool AllowsUnaligned = Subtarget->allowsUnalignedMem();

@@ -9415,15 +9418,27 @@ bool ARMTargetLowering::allowsUnalignedMemoryAccesses(EVT VT) const {
return false;
case MVT::i8:
case MVT::i16:
case MVT::i32:
case MVT::i32: {
// Unaligned access can use (for example) LRDB, LRDH, LDR
return AllowsUnaligned;
if (AllowsUnaligned) {
if (Fast)
*Fast = Subtarget->hasV7Ops();
return true;
}
return false;
}
case MVT::f64:
case MVT::v2f64:
case MVT::v2f64: {
// For any little-endian targets with neon, we can support unaligned ld/st
// of D and Q (e.g. {D0,D1}) registers by using vld1.i8/vst1.i8.
// A big-endian target may also explictly support unaligned accesses
return Subtarget->hasNEON() && (AllowsUnaligned || isLittleEndian());
if (Subtarget->hasNEON() && (AllowsUnaligned || isLittleEndian())) {
if (Fast)
*Fast = true;
return true;
}
return false;
}
}
}

@@ -9442,12 +9457,17 @@ EVT ARMTargetLowering::getOptimalMemOpType(uint64_t Size,

// See if we can use NEON instructions for this...
if (IsZeroVal &&
!F->getFnAttributes().hasAttribute(Attributes::NoImplicitFloat) &&
Subtarget->hasNEON()) {
if (memOpAlign(SrcAlign, DstAlign, 16) && Size >= 16) {
return MVT::v4i32;
} else if (memOpAlign(SrcAlign, DstAlign, 8) && Size >= 8) {
return MVT::v2i32;
Subtarget->hasNEON() &&
!F->getFnAttributes().hasAttribute(Attributes::NoImplicitFloat)) {
bool Fast;
if (Size >= 16 && (memOpAlign(SrcAlign, DstAlign, 16) ||
(allowsUnalignedMemoryAccesses(MVT::v2f64, &Fast) &&
Fast))) {
return MVT::v2f64;
} else if (Size >= 8 && (memOpAlign(SrcAlign, DstAlign, 8) ||
(allowsUnalignedMemoryAccesses(MVT::f64, &Fast) &&
Fast))) {
return MVT::f64;
}
}

@@ -10241,6 +10261,24 @@ bool ARMTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
return false;
}

bool ARMTargetLowering::isIntImmLegal(const APInt &Imm, EVT VT) const {
if (VT.getSizeInBits() > 32)
return false;

int32_t ImmVal = Imm.getSExtValue();
if (!Subtarget->isThumb()) {
return (ImmVal >= 0 && ImmVal < 65536) ||
(ARM_AM::getSOImmVal(ImmVal) != -1) ||
(ARM_AM::getSOImmVal(~ImmVal) != -1);
} else if (Subtarget->isThumb2()) {
return (ImmVal >= 0 && ImmVal < 65536) ||
(ARM_AM::getT2SOImmVal(ImmVal) != -1) ||
(ARM_AM::getT2SOImmVal(~ImmVal) != -1);
} else /*Thumb1*/ {
return (ImmVal >= 0 && ImmVal < 256);
}
}

/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
/// MemIntrinsicNodes. The associated MachineMemOperands record the alignment
/// specified in the intrinsic calls.
@@ -285,8 +285,9 @@ namespace llvm {
bool isDesirableToTransformToIntegerOp(unsigned Opc, EVT VT) const;

/// allowsUnalignedMemoryAccesses - Returns true if the target allows
/// unaligned memory accesses. of the specified type.
virtual bool allowsUnalignedMemoryAccesses(EVT VT) const;
/// unaligned memory accesses of the specified type. Returns whether it
/// is "fast" by reference in the second argument.
virtual bool allowsUnalignedMemoryAccesses(EVT VT, bool *Fast) const;

virtual EVT getOptimalMemOpType(uint64_t Size,
unsigned DstAlign, unsigned SrcAlign,
@@ -386,6 +387,8 @@ namespace llvm {
/// materialize the FP immediate as a load from a constant pool.
virtual bool isFPImmLegal(const APFloat &Imm, EVT VT) const;

virtual bool isIntImmLegal(const APInt &Imm, EVT VT) const;

virtual bool getTgtMemIntrinsic(IntrinsicInfo &Info,
const CallInst &I,
unsigned Intrinsic) const;
@@ -2315,13 +2315,15 @@ defm t2ORN : T2I_bin_irs<0b0011, "orn",
/// changed to modify CPSR.
multiclass T2I_un_irs<bits<4> opcod, string opc,
InstrItinClass iii, InstrItinClass iir, InstrItinClass iis,
PatFrag opnode, bit Cheap = 0, bit ReMat = 0> {
PatFrag opnode,
bit Cheap = 0, bit ReMat = 0, bit MoveImm = 0> {
// shifted imm
def i : T2sOneRegImm<(outs rGPR:$Rd), (ins t2_so_imm:$imm), iii,
opc, "\t$Rd, $imm",
[(set rGPR:$Rd, (opnode t2_so_imm:$imm))]> {
let isAsCheapAsAMove = Cheap;
let isReMaterializable = ReMat;
let isMoveImm = MoveImm;
let Inst{31-27} = 0b11110;
let Inst{25} = 0;
let Inst{24-21} = opcod;
@@ -2355,7 +2357,7 @@ multiclass T2I_un_irs<bits<4> opcod, string opc,
let AddedComplexity = 1 in
defm t2MVN : T2I_un_irs <0b0011, "mvn",
IIC_iMVNi, IIC_iMVNr, IIC_iMVNsi,
UnOpFrag<(not node:$Src)>, 1, 1>;
UnOpFrag<(not node:$Src)>, 1, 1, 1>;

let AddedComplexity = 1 in
def : T2Pat<(and rGPR:$src, t2_so_imm_not:$imm),
@@ -457,7 +457,8 @@ MipsTargetLowering(MipsTargetMachine &TM)
maxStoresPerMemcpy = 16;
}

bool MipsTargetLowering::allowsUnalignedMemoryAccesses(EVT VT) const {
bool
MipsTargetLowering::allowsUnalignedMemoryAccesses(EVT VT, bool *Fast) const {
MVT::SimpleValueType SVT = VT.getSimpleVT().SimpleTy;

if (Subtarget->inMips16Mode())
@@ -466,6 +467,8 @@ bool MipsTargetLowering::allowsUnalignedMemoryAccesses(EVT VT) const {
switch (SVT) {
case MVT::i64:
case MVT::i32:
if (Fast)
*Fast = true;
return true;
default:
return false;
@@ -149,7 +149,7 @@ namespace llvm {

virtual MVT getShiftAmountTy(EVT LHSTy) const { return MVT::i32; }

virtual bool allowsUnalignedMemoryAccesses (EVT VT) const;
virtual bool allowsUnalignedMemoryAccesses (EVT VT, bool *Fast) const;

virtual void LowerOperationWrapper(SDNode *N,
SmallVectorImpl<SDValue> &Results,
@@ -1412,6 +1412,13 @@ X86TargetLowering::getOptimalMemOpType(uint64_t Size,
return MVT::i32;
}

bool
X86TargetLowering::allowsUnalignedMemoryAccesses(EVT VT, bool *Fast) const {
if (Fast)
*Fast = Subtarget->isUnalignedMemAccessFast();
return true;
}

/// getJumpTableEncoding - Return the entry encoding for a jump table in the
/// current function. The returned value is a member of the
/// MachineJumpTableInfo::JTEntryKind enum.
@@ -507,10 +507,9 @@ namespace llvm {
MachineFunction &MF) const;

/// allowsUnalignedMemoryAccesses - Returns true if the target allows
/// unaligned memory accesses. of the specified type.
virtual bool allowsUnalignedMemoryAccesses(EVT VT) const {
return true;
}
/// unaligned memory accesses. of the specified type. Returns whether it
/// is "fast" by reference in the second argument.
virtual bool allowsUnalignedMemoryAccesses(EVT VT, bool *Fast) const;

/// LowerOperation - Provide custom lowering hooks for some operations.
///
@@ -1,13 +1,5 @@
; RUN: llc -march=arm -mcpu=cortex-a8 < %s | FileCheck %s

; Should trigger a NEON store.
; CHECK: vstr
define void @f_0_12(i8* nocapture %c) nounwind optsize {
entry:
call void @llvm.memset.p0i8.i64(i8* %c, i8 0, i64 12, i32 8, i1 false)
ret void
}

; Trigger multiple NEON stores.
; CHECK: vst1.64
; CHECK-NEXT: vst1.64
@@ -1,18 +1,115 @@
; RUN: llc < %s -mtriple=thumbv7-apple-darwin -disable-post-ra | FileCheck %s

; CHECK: ldrd
; CHECK: strd
; CHECK: ldrb
; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-a8 -pre-RA-sched=source -disable-post-ra | FileCheck %s

%struct.x = type { i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8 }

@src = external global %struct.x
@dst = external global %struct.x

define i32 @t() {
@.str1 = private unnamed_addr constant [31 x i8] c"DHRYSTONE PROGRAM, SOME STRING\00", align 1
@.str2 = private unnamed_addr constant [36 x i8] c"DHRYSTONE PROGRAM, SOME STRING BLAH\00", align 1
@.str3 = private unnamed_addr constant [24 x i8] c"DHRYSTONE PROGRAM, SOME\00", align 1
@.str4 = private unnamed_addr constant [18 x i8] c"DHRYSTONE PROGR \00", align 1
@.str5 = private unnamed_addr constant [7 x i8] c"DHRYST\00", align 1
@.str6 = private unnamed_addr constant [14 x i8] c"/tmp/rmXXXXXX\00", align 1
@spool.splbuf = internal global [512 x i8] zeroinitializer, align 16

define i32 @t0() {
entry:
; CHECK: t0:
; CHECK: vldr [[REG1:d[0-9]+]],
; CHECK: vstr [[REG1]],
call void @llvm.memcpy.p0i8.p0i8.i32(i8* getelementptr inbounds (%struct.x* @dst, i32 0, i32 0), i8* getelementptr inbounds (%struct.x* @src, i32 0, i32 0), i32 11, i32 8, i1 false)
ret i32 0
}

define void @t1(i8* nocapture %C) nounwind {
entry:
; CHECK: t1:
; CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1]
; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0]
; CHECK: adds r0, #15
; CHECK: adds r1, #15
; CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1]
; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0]
tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([31 x i8]* @.str1, i64 0, i64 0), i64 31, i32 1, i1 false)
ret void
}

define void @t2(i8* nocapture %C) nounwind {
entry:
; CHECK: t2:
; CHECK: ldr [[REG2:r[0-9]+]], [r1, #32]
; CHECK: str [[REG2]], [r0, #32]
; CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1]
; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0]
; CHECK: adds r0, #16
; CHECK: adds r1, #16
; CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1]
; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0]
tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([36 x i8]* @.str2, i64 0, i64 0), i64 36, i32 1, i1 false)
ret void
}

define void @t3(i8* nocapture %C) nounwind {
entry:
; CHECK: t3:
; CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1]
; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0]
; CHECK: adds r0, #16
; CHECK: adds r1, #16
; CHECK: vld1.8 {d{{[0-9]+}}}, [r1]
; CHECK: vst1.8 {d{{[0-9]+}}}, [r0]
tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([24 x i8]* @.str3, i64 0, i64 0), i64 24, i32 1, i1 false)
ret void
}

define void @t4(i8* nocapture %C) nounwind {
entry:
; CHECK: t4:
; CHECK: vld1.8 {[[REG3:d[0-9]+]], [[REG4:d[0-9]+]]}, [r1]
; CHECK: vst1.8 {[[REG3]], [[REG4]]}, [r0]
tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([18 x i8]* @.str4, i64 0, i64 0), i64 18, i32 1, i1 false)
ret void
}

define void @t5(i8* nocapture %C) nounwind {
entry:
; CHECK: t5:
; CHECK: movs [[REG5:r[0-9]+]], #0
; CHECK: strb [[REG5]], [r0, #6]
; CHECK: movw [[REG6:r[0-9]+]], #21587
; CHECK: strh [[REG6]], [r0, #4]
; CHECK: ldr [[REG7:r[0-9]+]],
; CHECK: str [[REG7]]
tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([7 x i8]* @.str5, i64 0, i64 0), i64 7, i32 1, i1 false)
ret void
}

define void @t6() nounwind {
entry:
; CHECK: t6:
; CHECK: vld1.8 {[[REG8:d[0-9]+]]}, [r0]
; CHECK: vstr [[REG8]], [r1]
; CHECK: adds r1, #6
; CHECK: adds r0, #6
; CHECK: vld1.8
; CHECK: vst1.16
call void @llvm.memcpy.p0i8.p0i8.i64(i8* getelementptr inbounds ([512 x i8]* @spool.splbuf, i64 0, i64 0), i8* getelementptr inbounds ([14 x i8]* @.str6, i64 0, i64 0), i64 14, i32 1, i1 false)
ret void
}

%struct.Foo = type { i32, i32, i32, i32 }

define void @t7(%struct.Foo* nocapture %a, %struct.Foo* nocapture %b) nounwind {
entry:
; CHECK: t7
; CHECK: vld1.32
; CHECK: vst1.32
%0 = bitcast %struct.Foo* %a to i8*
%1 = bitcast %struct.Foo* %b to i8*
tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* %0, i8* %1, i32 16, i32 4, i1 false)
ret void
}

declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32, i1) nounwind
declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind
test/CodeGen/ARM/memset-inline.ll (new file, 30 lines)
@@ -0,0 +1,30 @@
; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-a8 -pre-RA-sched=source -disable-post-ra | FileCheck %s

define void @t1(i8* nocapture %c) nounwind optsize {
entry:
; CHECK: t1:
; CHECK: movs r1, #0
; CHECK: str r1, [r0]
; CHECK: str r1, [r0, #4]
; CHECK: str r1, [r0, #8]
call void @llvm.memset.p0i8.i64(i8* %c, i8 0, i64 12, i32 8, i1 false)
ret void
}

define void @t2() nounwind ssp {
entry:
; CHECK: t2:
; CHECK: add.w r1, r0, #10
; CHECK: vmov.i32 {{q[0-9]+}}, #0x0
; CHECK: vst1.16 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1]
; CHECK: vst1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0]
%buf = alloca [26 x i8], align 1
%0 = getelementptr inbounds [26 x i8]* %buf, i32 0, i32 0
call void @llvm.memset.p0i8.i32(i8* %0, i8 0, i32 26, i32 1, i1 false)
call void @something(i8* %0) nounwind
ret void
}

declare void @something(i8*) nounwind
declare void @llvm.memset.p0i8.i32(i8* nocapture, i8, i32, i32, i1) nounwind
declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) nounwind
@@ -1,16 +0,0 @@
; RUN: llc < %s -march=arm -mcpu=cortex-a8 | FileCheck %s
; Check that memcpy gets lowered to ldm/stm, at least in this very smple case.

%struct.Foo = type { i32, i32, i32, i32 }

define void @_Z10CopyStructP3FooS0_(%struct.Foo* nocapture %a, %struct.Foo* nocapture %b) nounwind {
entry:
;CHECK: ldm
;CHECK: stm
%0 = bitcast %struct.Foo* %a to i8*
%1 = bitcast %struct.Foo* %b to i8*
tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* %0, i8* %1, i32 16, i32 4, i1 false)
ret void
}

declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32, i1) nounwind
@@ -6,15 +6,16 @@
define void @t(i32 %count) ssp nounwind {
entry:
; CHECK: t:
; CHECK: movq ___stack_chk_guard@GOTPCREL(%rip)
; CHECK: movups L_str(%rip), %xmm0
; CHECK: movups L_str+12(%rip), %xmm0
; CHECK: movups L_str(%rip), %xmm1
%tmp0 = alloca [60 x i8], align 1
%tmp1 = getelementptr inbounds [60 x i8]* %tmp0, i64 0, i64 0
br label %bb1

bb1:
; CHECK: LBB0_1:
; CHECK: movaps %xmm0, (%rsp)
; CHECK: movups %xmm0, 12(%rsp)
; CHECK: movaps %xmm1, (%rsp)
%tmp2 = phi i32 [ %tmp3, %bb1 ], [ 0, %entry ]
call void @llvm.memcpy.p0i8.p0i8.i64(i8* %tmp1, i8* getelementptr inbounds ([28 x i8]* @str, i64 0, i64 0), i64 28, i32 1, i1 false)
%tmp3 = add i32 %tmp2, 1
@@ -10,18 +10,18 @@
define void @t1(i32 %argc, i8** %argv) nounwind {
entry:
; SSE2: t1:
; SSE2: movsd _.str+16, %xmm0
; SSE2: movsd %xmm0, 16(%esp)
; SSE2: movaps _.str, %xmm0
; SSE2: movaps %xmm0
; SSE2: movb $0
; SSE2: movl $0
; SSE2: movl $0
; SSE2: movb $0, 24(%esp)

; SSE1: t1:
; SSE1: fldl _.str+16
; SSE1: fstpl 16(%esp)
; SSE1: movaps _.str, %xmm0
; SSE1: movaps %xmm0
; SSE1: movb $0
; SSE1: movl $0
; SSE1: movl $0
; SSE1: movb $0, 24(%esp)

; NOSSE: t1:
; NOSSE: movb $0