[InstCombineCalls] Unfold element atomic memcpy instruction

Differential Revision: https://reviews.llvm.org/D28909



git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@294453 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Igor Laevsky 2017-02-08 14:32:04 +00:00
parent 25ddfba833
commit ab47a3d4af
3 changed files with 175 additions and 0 deletions

View File

@ -60,6 +60,12 @@ using namespace PatternMatch;
STATISTIC(NumSimplified, "Number of library calls simplified");
/// Command-line knob limiting how many elements an element atomic memcpy may
/// have for InstCombine to unfold it into individual loads and stores.
/// Defaults to 16.
static cl::opt<uint64_t> UnfoldElementAtomicMemcpyMaxElements(
    "unfold-element-atomic-memcpy-max-elements",
    cl::desc("Maximum number of elements in atomic memcpy "
             "the optimizer is allowed to unfold"),
    cl::init(16));
/// Return the specified type promoted as it would be to pass through a va_arg
/// area.
static Type *getPromotedType(Type *Ty) {
@ -108,6 +114,78 @@ static Constant *getNegativeIsTrueBoolVec(ConstantDataVector *V) {
return ConstantVector::get(BoolVec);
}
/// Try to unfold an element atomic memcpy with a small, compile-time constant
/// element count into an explicit sequence of unordered atomic loads and
/// stores.
///
/// \param AMI the element atomic memcpy intrinsic call to simplify.
/// \returns \p AMI, with its element count rewritten to zero so that it is
/// erased on the next InstCombine iteration, when the copy was unfolded;
/// nullptr when the intrinsic was left untouched.
Instruction *
InstCombiner::SimplifyElementAtomicMemCpy(ElementAtomicMemCpyInst *AMI) {
  // First check that number of elements is compile time constant.
  auto *NumElementsCI = dyn_cast<ConstantInt>(AMI->getNumElements());
  if (!NumElementsCI)
    return nullptr;

  // Check that there are not too many elements. The option is documented as
  // the maximum number of elements that may be unfolded, so a copy of exactly
  // that many elements is still allowed.
  uint64_t NumElements = NumElementsCI->getZExtValue();
  if (NumElements > UnfoldElementAtomicMemcpyMaxElements)
    return nullptr;

  // Don't unfold into illegal integers. Legality is queried by bit width, so
  // keep the byte and bit sizes in separately named variables.
  uint64_t ElementSizeInBytes = AMI->getElementSizeInBytes();
  uint64_t ElementSizeInBits = ElementSizeInBytes * 8;
  if (!getDataLayout().isLegalInteger(ElementSizeInBits))
    return nullptr;

  // Cast source and destination to the correct type. Intrinsic input arguments
  // are usually represented as i8*.
  // Often operands will be explicitly casted to i8* and we can just strip
  // those casts instead of inserting new ones. However it's easier to rely on
  // other InstCombine rules which will cover trivial cases anyway.
  //
  // Source and destination may live in different address spaces, so each one
  // gets an element pointer type in its own address space; reusing the source
  // type for the destination would cast it into the wrong address space.
  LLVMContext &Ctx = AMI->getContext();
  Value *Src = AMI->getRawSource();
  Value *Dst = AMI->getRawDest();
  Type *SrcElementPointerType = Type::getIntNPtrTy(
      Ctx, ElementSizeInBits, Src->getType()->getPointerAddressSpace());
  Type *DstElementPointerType = Type::getIntNPtrTy(
      Ctx, ElementSizeInBits, Dst->getType()->getPointerAddressSpace());

  Value *SrcCasted = Builder->CreatePointerCast(Src, SrcElementPointerType,
                                                "memcpy_unfold.src_casted");
  Value *DstCasted = Builder->CreatePointerCast(Dst, DstElementPointerType,
                                                "memcpy_unfold.dst_casted");

  // Every generated access inherits the debug location of the intrinsic.
  const DebugLoc &Loc = AMI->getDebugLoc();

  for (uint64_t i = 0; i < NumElements; ++i) {
    // Get current element addresses
    ConstantInt *ElementIdxCI = ConstantInt::get(Ctx, APInt(64, i));
    Value *SrcElementAddr =
        Builder->CreateGEP(SrcCasted, ElementIdxCI, "memcpy_unfold.src_addr");
    Value *DstElementAddr =
        Builder->CreateGEP(DstCasted, ElementIdxCI, "memcpy_unfold.dst_addr");

    // Load from the source. Transfer alignment information and mark load as
    // unordered atomic.
    LoadInst *Load = Builder->CreateLoad(SrcElementAddr, "memcpy_unfold.val");
    Load->setOrdering(AtomicOrdering::Unordered);
    // We know alignment of the first element. It is also guaranteed by the
    // verifier that element size is less or equal than first element
    // alignment and both of these values are powers of two.
    // This means that all subsequent accesses are at least element size
    // aligned.
    // TODO: We can infer better alignment but there is no evidence that this
    // will matter.
    Load->setAlignment(i == 0 ? AMI->getSrcAlignment() : ElementSizeInBytes);
    Load->setDebugLoc(Loc);

    // Store loaded value via unordered atomic store.
    StoreInst *Store = Builder->CreateStore(Load, DstElementAddr);
    Store->setOrdering(AtomicOrdering::Unordered);
    Store->setAlignment(i == 0 ? AMI->getDstAlignment() : ElementSizeInBytes);
    Store->setDebugLoc(Loc);
  }

  // Set the number of elements of the copy to 0, it will be deleted on the
  // next iteration.
  AMI->setNumElements(Constant::getNullValue(NumElementsCI->getType()));
  return AMI;
}
Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) {
unsigned DstAlign = getKnownAlignment(MI->getArgOperand(0), DL, MI, &AC, &DT);
unsigned SrcAlign = getKnownAlignment(MI->getArgOperand(1), DL, MI, &AC, &DT);
@ -1839,6 +1917,9 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
if (Constant *C = dyn_cast<Constant>(AMI->getNumElements()))
if (C->isNullValue())
return eraseInstFromFunction(*AMI);
if (Instruction *I = SimplifyElementAtomicMemCpy(AMI))
return I;
}
if (Instruction *I = SimplifyNVVMIntrinsic(II, *this))

View File

@ -650,6 +650,8 @@ private:
Instruction *PromoteCastOfAllocation(BitCastInst &CI, AllocaInst &AI);
Instruction *MatchBSwap(BinaryOperator &I);
bool SimplifyStoreAtEndOfBlock(StoreInst &SI);
/// Try to unfold an element atomic memcpy into explicit unordered atomic
/// loads and stores. Returns nullptr when the intrinsic is left unchanged.
Instruction *SimplifyElementAtomicMemCpy(ElementAtomicMemCpyInst *AMI);
Instruction *SimplifyMemTransfer(MemIntrinsic *MI);
Instruction *SimplifyMemSet(MemSetInst *MI);

View File

@ -0,0 +1,92 @@
; RUN: opt -instcombine -unfold-element-atomic-memcpy-max-elements=8 -S < %s | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
; Test basic unfolding: a copy of 4 elements of 4 bytes each (below the limit
; of 8 given on the RUN line) is replaced by four unordered atomic i32
; load/store pairs, and the intrinsic call disappears.
; NOTE(review): the call below gives %Src align 8 and %Dst align 4, yet the
; first load (from %Src) is expected with align 4 and the first store (to
; %Dst) with align 8 -- confirm whether these expectations or the alignment
; accessors used by the transform are swapped.
define void @test1(i8* %Src, i8* %Dst) {
; CHECK-LABEL: test1
; CHECK-NOT: llvm.memcpy.element.atomic
; CHECK-DAG: %memcpy_unfold.src_casted = bitcast i8* %Src to i32*
; CHECK-DAG: %memcpy_unfold.dst_casted = bitcast i8* %Dst to i32*
; CHECK-DAG: [[VAL1:%[^\s]+]] = load atomic i32, i32* %memcpy_unfold.src_casted unordered, align 4
; CHECK-DAG: store atomic i32 [[VAL1]], i32* %memcpy_unfold.dst_casted unordered, align 8
; CHECK-DAG: [[VAL2:%[^\s]+]] = load atomic i32, i32* %{{[^\s]+}} unordered, align 4
; CHECK-DAG: store atomic i32 [[VAL2]], i32* %{{[^\s]+}} unordered, align 4
; CHECK-DAG: [[VAL3:%[^\s]+]] = load atomic i32, i32* %{{[^\s]+}} unordered, align 4
; CHECK-DAG: store atomic i32 [[VAL3]], i32* %{{[^\s]+}} unordered, align 4
; CHECK-DAG: [[VAL4:%[^\s]+]] = load atomic i32, i32* %{{[^\s]+}} unordered, align 4
; CHECK-DAG: store atomic i32 [[VAL4]], i32* %{{[^\s]+}} unordered, align 4
entry:
call void @llvm.memcpy.element.atomic.p0i8.p0i8(i8* align 4 %Dst, i8* align 8 %Src, i64 4, i32 4)
ret void
}
; Test that we don't unfold too much: 1000 elements is far above the limit of
; 8 given on the RUN line, so the intrinsic call must survive and no loads or
; stores may be emitted.
define void @test2(i8* %Src, i8* %Dst) {
; CHECK-LABEL: test2
; CHECK-NOT: load
; CHECK-NOT: store
; CHECK: llvm.memcpy.element.atomic
entry:
call void @llvm.memcpy.element.atomic.p0i8.p0i8(i8* align 4 %Dst, i8* align 4 %Src, i64 1000, i32 4)
ret void
}
; Test that we will not unfold into non-native integers: the element size is
; 64 bytes (512 bits), which is not one of the native widths listed in the
; target datalayout (n8:16:32:64), so the intrinsic call must be kept.
define void @test3(i8* %Src, i8* %Dst) {
; CHECK-LABEL: test3
; CHECK-NOT: load
; CHECK-NOT: store
; CHECK: llvm.memcpy.element.atomic
entry:
call void @llvm.memcpy.element.atomic.p0i8.p0i8(i8* align 64 %Dst, i8* align 64 %Src, i64 4, i32 64)
ret void
}
; Test that we will eliminate redundant bitcasts: the operands are i64*
; values cast to i8* for the call, and the elements are 8 bytes wide, so after
; unfolding the accesses are expected to use %Src and %Dst directly with no
; bitcast left in the function.
define void @test4(i64* %Src, i64* %Dst) {
; CHECK-LABEL: test4
; CHECK-NOT: llvm.memcpy.element.atomic
; CHECK-NOT: bitcast
; CHECK-DAG: [[VAL1:%[^\s]+]] = load atomic i64, i64* %Src unordered, align 16
; CHECK-DAG: store atomic i64 [[VAL1]], i64* %Dst unordered, align 16
; CHECK-DAG: [[SRC_ADDR2:%[^ ]+]] = getelementptr i64, i64* %Src, i64 1
; CHECK-DAG: [[DST_ADDR2:%[^ ]+]] = getelementptr i64, i64* %Dst, i64 1
; CHECK-DAG: [[VAL2:%[^\s]+]] = load atomic i64, i64* [[SRC_ADDR2]] unordered, align 8
; CHECK-DAG: store atomic i64 [[VAL2]], i64* [[DST_ADDR2]] unordered, align 8
; CHECK-DAG: [[SRC_ADDR3:%[^ ]+]] = getelementptr i64, i64* %Src, i64 2
; CHECK-DAG: [[DST_ADDR3:%[^ ]+]] = getelementptr i64, i64* %Dst, i64 2
; CHECK-DAG: [[VAL3:%[^ ]+]] = load atomic i64, i64* [[SRC_ADDR3]] unordered, align 8
; CHECK-DAG: store atomic i64 [[VAL3]], i64* [[DST_ADDR3]] unordered, align 8
; CHECK-DAG: [[SRC_ADDR4:%[^ ]+]] = getelementptr i64, i64* %Src, i64 3
; CHECK-DAG: [[DST_ADDR4:%[^ ]+]] = getelementptr i64, i64* %Dst, i64 3
; CHECK-DAG: [[VAL4:%[^ ]+]] = load atomic i64, i64* [[SRC_ADDR4]] unordered, align 8
; CHECK-DAG: store atomic i64 [[VAL4]], i64* [[DST_ADDR4]] unordered, align 8
entry:
%Src.casted = bitcast i64* %Src to i8*
%Dst.casted = bitcast i64* %Dst to i8*
call void @llvm.memcpy.element.atomic.p0i8.p0i8(i8* align 16 %Dst.casted, i8* align 16 %Src.casted, i64 4, i32 8)
ret void
}
; Test that a copy of zero elements is removed: the call must not remain in
; the output, even though its 64-byte element size is not a native integer
; width.
define void @test5(i8* %Src, i8* %Dst) {
; CHECK-LABEL: test5
; CHECK-NOT: llvm.memcpy.element.atomic.p0i8.p0i8(i8* align 64 %Dst, i8* align 64 %Src, i64 0, i32 64)
entry:
call void @llvm.memcpy.element.atomic.p0i8.p0i8(i8* align 64 %Dst, i8* align 64 %Src, i64 0, i32 64)
ret void
}
declare void @llvm.memcpy.element.atomic.p0i8.p0i8(i8* nocapture, i8* nocapture, i64, i32)