[InstCombineCalls] Unfold element atomic memcpy instruction
Differential Revision: https://reviews.llvm.org/D28909

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@294453 91177308-0d34-0410-b5e6-96231b3b80d8
parent 25ddfba833
commit ab47a3d4af
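This change teaches InstCombine to unfold calls to the element-wise atomic memcpy intrinsic with a small, compile-time-constant element count into a straight-line sequence of unordered atomic loads and stores. The intrinsic's operands are (dest, src, element count, element size), as in the tests added below. A minimal before/after sketch, modeled on those tests (value names are illustrative, not necessarily what the pass emits):

; before: copy 2 elements of 4 bytes each
call void @llvm.memcpy.element.atomic.p0i8.p0i8(i8* align 4 %Dst, i8* align 4 %Src, i64 2, i32 4)

; after (schematically)
%src = bitcast i8* %Src to i32*
%dst = bitcast i8* %Dst to i32*
%v0 = load atomic i32, i32* %src unordered, align 4
store atomic i32 %v0, i32* %dst unordered, align 4
%src1 = getelementptr i32, i32* %src, i64 1
%dst1 = getelementptr i32, i32* %dst, i64 1
%v1 = load atomic i32, i32* %src1 unordered, align 4
store atomic i32 %v1, i32* %dst1 unordered, align 4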
@@ -60,6 +60,12 @@ using namespace PatternMatch;

STATISTIC(NumSimplified, "Number of library calls simplified");

static cl::opt<uint64_t> UnfoldElementAtomicMemcpyMaxElements(
    "unfold-element-atomic-memcpy-max-elements",
    cl::init(16),
    cl::desc("Maximum number of elements in atomic memcpy the optimizer is "
             "allowed to unfold"));

/// Return the specified type promoted as it would be to pass through a va_arg
/// area.
static Type *getPromotedType(Type *Ty) {
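The new command-line option caps how much code the unfolding may create. Note that the guard in the function below compares with >=, so the default of 16 expands copies of at most 15 elements. The threshold can be overridden per invocation of opt, as the added test does with a value of 8; an illustrative RUN line (the value 4 is arbitrary, chosen for illustration only):

; RUN: opt -instcombine -unfold-element-atomic-memcpy-max-elements=4 -S < %s | FileCheck %s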
@@ -108,6 +114,78 @@ static Constant *getNegativeIsTrueBoolVec(ConstantDataVector *V) {
  return ConstantVector::get(BoolVec);
}

Instruction *
InstCombiner::SimplifyElementAtomicMemCpy(ElementAtomicMemCpyInst *AMI) {
  // Try to unfold this intrinsic into a sequence of explicit atomic loads
  // and stores.
  // First, check that the number of elements is a compile-time constant.
  auto *NumElementsCI = dyn_cast<ConstantInt>(AMI->getNumElements());
  if (!NumElementsCI)
    return nullptr;

  // Check that there are not too many elements.
  uint64_t NumElements = NumElementsCI->getZExtValue();
  if (NumElements >= UnfoldElementAtomicMemcpyMaxElements)
    return nullptr;

  // Don't unfold into illegal integers.
  uint64_t ElementSizeInBits = AMI->getElementSizeInBytes() * 8;
  if (!getDataLayout().isLegalInteger(ElementSizeInBits))
    return nullptr;

  // Cast source and destination to the correct type. Intrinsic input
  // arguments are usually represented as i8*. Operands will often have been
  // explicitly cast to i8*, and we could simply strip those casts instead of
  // inserting new ones; however, it is easier to rely on other InstCombine
  // rules, which cover the trivial cases anyway.
  Value *Src = AMI->getRawSource();
  Value *Dst = AMI->getRawDest();
  Type *ElementPointerType =
      Type::getIntNPtrTy(AMI->getContext(), ElementSizeInBits,
                         Src->getType()->getPointerAddressSpace());

  Value *SrcCasted = Builder->CreatePointerCast(Src, ElementPointerType,
                                                "memcpy_unfold.src_casted");
  Value *DstCasted = Builder->CreatePointerCast(Dst, ElementPointerType,
                                                "memcpy_unfold.dst_casted");

  for (uint64_t i = 0; i < NumElements; ++i) {
    // Get the current element's addresses.
    ConstantInt *ElementIdxCI =
        ConstantInt::get(AMI->getContext(), APInt(64, i));
    Value *SrcElementAddr =
        Builder->CreateGEP(SrcCasted, ElementIdxCI, "memcpy_unfold.src_addr");
    Value *DstElementAddr =
        Builder->CreateGEP(DstCasted, ElementIdxCI, "memcpy_unfold.dst_addr");

    // Load from the source. Transfer alignment information and mark the load
    // as unordered atomic.
    LoadInst *Load = Builder->CreateLoad(SrcElementAddr, "memcpy_unfold.val");
    Load->setOrdering(AtomicOrdering::Unordered);
    // We know the alignment of the first element. The verifier also
    // guarantees that the element size is less than or equal to the first
    // element's alignment and that both of these values are powers of two.
    // This means that all subsequent accesses are at least element-size
    // aligned.
    // TODO: We could infer better alignment, but there is no evidence that
    // this will matter.
    Load->setAlignment(i == 0 ? AMI->getSrcAlignment()
                              : AMI->getElementSizeInBytes());
    Load->setDebugLoc(AMI->getDebugLoc());

    // Store the loaded value via an unordered atomic store.
    StoreInst *Store = Builder->CreateStore(Load, DstElementAddr);
    Store->setOrdering(AtomicOrdering::Unordered);
    Store->setAlignment(i == 0 ? AMI->getDstAlignment()
                               : AMI->getElementSizeInBytes());
    Store->setDebugLoc(AMI->getDebugLoc());
  }

  // Set the number of elements of the copy to 0; the instruction will then
  // be deleted on the next iteration.
  AMI->setNumElements(Constant::getNullValue(NumElementsCI->getType()));
  return AMI;
}

Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) {
  unsigned DstAlign = getKnownAlignment(MI->getArgOperand(0), DL, MI, &AC, &DT);
  unsigned SrcAlign = getKnownAlignment(MI->getArgOperand(1), DL, MI, &AC, &DT);
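To make the alignment transfer above concrete: only the first element's access can use the alignment known from the intrinsic's arguments; every subsequent access falls back to the element size, which the verifier guarantees is a power of two no larger than that alignment. An illustrative fragment mirroring test4 below (value names made up):

%v1 = load atomic i64, i64* %Src unordered, align 16  ; i == 0: uses getSrcAlignment()
%s2 = getelementptr i64, i64* %Src, i64 1
%v2 = load atomic i64, i64* %s2 unordered, align 8    ; i > 0: element size only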
@@ -1839,6 +1917,9 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
    if (Constant *C = dyn_cast<Constant>(AMI->getNumElements()))
      if (C->isNullValue())
        return eraseInstFromFunction(*AMI);

    if (Instruction *I = SimplifyElementAtomicMemCpy(AMI))
      return I;
  }

  if (Instruction *I = SimplifyNVVMIntrinsic(II, *this))
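Note the two-step deletion here: SimplifyElementAtomicMemCpy rewrites the element count to zero and returns the now-dead intrinsic, and the isNullValue() check above then erases such calls on the next InstCombine iteration. This is also why a zero-length copy, as in test5 below, simply disappears:

; erased by the isNullValue() path above
call void @llvm.memcpy.element.atomic.p0i8.p0i8(i8* align 64 %Dst, i8* align 64 %Src, i64 0, i32 64)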
@@ -650,6 +650,8 @@ private:
  Instruction *PromoteCastOfAllocation(BitCastInst &CI, AllocaInst &AI);
  Instruction *MatchBSwap(BinaryOperator &I);
  bool SimplifyStoreAtEndOfBlock(StoreInst &SI);

  Instruction *SimplifyElementAtomicMemCpy(ElementAtomicMemCpyInst *AMI);
  Instruction *SimplifyMemTransfer(MemIntrinsic *MI);
  Instruction *SimplifyMemSet(MemSetInst *MI);
@@ -0,0 +1,92 @@
; RUN: opt -instcombine -unfold-element-atomic-memcpy-max-elements=8 -S < %s | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"

; Test basic unfolding.
define void @test1(i8* %Src, i8* %Dst) {
; CHECK-LABEL: test1
; CHECK-NOT: llvm.memcpy.element.atomic

; CHECK-DAG: %memcpy_unfold.src_casted = bitcast i8* %Src to i32*
; CHECK-DAG: %memcpy_unfold.dst_casted = bitcast i8* %Dst to i32*

; CHECK-DAG: [[VAL1:%[^\s]+]] = load atomic i32, i32* %memcpy_unfold.src_casted unordered, align 4
; CHECK-DAG: store atomic i32 [[VAL1]], i32* %memcpy_unfold.dst_casted unordered, align 8

; CHECK-DAG: [[VAL2:%[^\s]+]] = load atomic i32, i32* %{{[^\s]+}} unordered, align 4
; CHECK-DAG: store atomic i32 [[VAL2]], i32* %{{[^\s]+}} unordered, align 4

; CHECK-DAG: [[VAL3:%[^\s]+]] = load atomic i32, i32* %{{[^\s]+}} unordered, align 4
; CHECK-DAG: store atomic i32 [[VAL3]], i32* %{{[^\s]+}} unordered, align 4

; CHECK-DAG: [[VAL4:%[^\s]+]] = load atomic i32, i32* %{{[^\s]+}} unordered, align 4
; CHECK-DAG: store atomic i32 [[VAL4]], i32* %{{[^\s]+}} unordered, align 4
entry:
  call void @llvm.memcpy.element.atomic.p0i8.p0i8(i8* align 8 %Dst, i8* align 4 %Src, i64 4, i32 4)
  ret void
}

; Test that we don't unfold too much.
define void @test2(i8* %Src, i8* %Dst) {
; CHECK-LABEL: test2

; CHECK-NOT: load
; CHECK-NOT: store
; CHECK: llvm.memcpy.element.atomic
entry:
  call void @llvm.memcpy.element.atomic.p0i8.p0i8(i8* align 4 %Dst, i8* align 4 %Src, i64 1000, i32 4)
  ret void
}

; Test that we will not unfold into non-native integers.
define void @test3(i8* %Src, i8* %Dst) {
; CHECK-LABEL: test3

; CHECK-NOT: load
; CHECK-NOT: store
; CHECK: llvm.memcpy.element.atomic
entry:
  call void @llvm.memcpy.element.atomic.p0i8.p0i8(i8* align 64 %Dst, i8* align 64 %Src, i64 4, i32 64)
  ret void
}

; Test that we will eliminate redundant bitcasts.
define void @test4(i64* %Src, i64* %Dst) {
; CHECK-LABEL: test4
; CHECK-NOT: llvm.memcpy.element.atomic

; CHECK-NOT: bitcast

; CHECK-DAG: [[VAL1:%[^\s]+]] = load atomic i64, i64* %Src unordered, align 16
; CHECK-DAG: store atomic i64 [[VAL1]], i64* %Dst unordered, align 16

; CHECK-DAG: [[SRC_ADDR2:%[^ ]+]] = getelementptr i64, i64* %Src, i64 1
; CHECK-DAG: [[DST_ADDR2:%[^ ]+]] = getelementptr i64, i64* %Dst, i64 1
; CHECK-DAG: [[VAL2:%[^\s]+]] = load atomic i64, i64* [[SRC_ADDR2]] unordered, align 8
; CHECK-DAG: store atomic i64 [[VAL2]], i64* [[DST_ADDR2]] unordered, align 8

; CHECK-DAG: [[SRC_ADDR3:%[^ ]+]] = getelementptr i64, i64* %Src, i64 2
; CHECK-DAG: [[DST_ADDR3:%[^ ]+]] = getelementptr i64, i64* %Dst, i64 2
; CHECK-DAG: [[VAL3:%[^ ]+]] = load atomic i64, i64* [[SRC_ADDR3]] unordered, align 8
; CHECK-DAG: store atomic i64 [[VAL3]], i64* [[DST_ADDR3]] unordered, align 8

; CHECK-DAG: [[SRC_ADDR4:%[^ ]+]] = getelementptr i64, i64* %Src, i64 3
; CHECK-DAG: [[DST_ADDR4:%[^ ]+]] = getelementptr i64, i64* %Dst, i64 3
; CHECK-DAG: [[VAL4:%[^ ]+]] = load atomic i64, i64* [[SRC_ADDR4]] unordered, align 8
; CHECK-DAG: store atomic i64 [[VAL4]], i64* [[DST_ADDR4]] unordered, align 8
entry:
  %Src.casted = bitcast i64* %Src to i8*
  %Dst.casted = bitcast i64* %Dst to i8*
  call void @llvm.memcpy.element.atomic.p0i8.p0i8(i8* align 16 %Dst.casted, i8* align 16 %Src.casted, i64 4, i32 8)
  ret void
}

; Test that a zero-element copy is removed entirely.
define void @test5(i8* %Src, i8* %Dst) {
; CHECK-LABEL: test5

; CHECK-NOT: llvm.memcpy.element.atomic.p0i8.p0i8(i8* align 64 %Dst, i8* align 64 %Src, i64 0, i32 64)
entry:
  call void @llvm.memcpy.element.atomic.p0i8.p0i8(i8* align 64 %Dst, i8* align 64 %Src, i64 0, i32 64)
  ret void
}

declare void @llvm.memcpy.element.atomic.p0i8.p0i8(i8* nocapture, i8* nocapture, i64, i32)
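A closing note on test3: the datalayout in this test declares native integer widths of 8, 16, 32, and 64 bits (the n8:16:32:64 component), so a 64-byte element would require a 512-bit access and fails the isLegalInteger check. The hypothetical unfolded form, which is never emitted, would have needed accesses like:

; %wide = load atomic i512, i512* %src unordered, align 64  ; rejected: i512 is not a native integer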