From 1d56cda0234398be983e8703b830d46fff9f803c Mon Sep 17 00:00:00 2001
From: Jingyue Wu <jingyue@google.com>
Date: Wed, 16 Jul 2014 23:25:00 +0000
Subject: [PATCH] Partially revert r210444 due to performance regression

Summary:
Converting an outermost zext(a) to sext(a) causes worse code when the
computation of zext(a) could be reused. For example, after converting

  ... = array[zext(a)]
  ... = array[zext(a) + 1]

to

  ... = array[sext(a)]
  ... = array[zext(a) + 1],

the program computes sext(a), which is actually unnecessary. I added
one test in split-gep-and-gvn.ll to illustrate this scenario.

Also, with r211281 and r211084, we annotate computations involving CUDA
intrinsics such as threadIdx.x with more "nuw" tags. These annotations
help a lot with splitting GEPs, rendering the benefit of the reverted
optimization only marginal.

Test Plan: make check-all

Reviewers: eliben, meheff

Reviewed By: meheff

Subscribers: jholewinski, llvm-commits

Differential Revision: http://reviews.llvm.org/D4542

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@213209 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../Scalar/SeparateConstOffsetFromGEP.cpp | 58 +----------------
 .../NVPTX/split-gep-and-gvn.ll            | 63 +++++++++++++++--
 .../NVPTX/split-gep.ll                    | 25 --------
 3 files changed, 60 insertions(+), 86 deletions(-)

diff --git a/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
index 62f2026b8d9..6557ce4575d 100644
--- a/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
+++ b/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -272,23 +272,6 @@ class SeparateConstOffsetFromGEP : public FunctionPass {
   ///
   /// Verified in @i32_add in split-gep.ll
   bool canonicalizeArrayIndicesToPointerSize(GetElementPtrInst *GEP);
-  /// For each array index that is in the form of zext(a), convert it to sext(a)
-  /// if we can prove zext(a) <= max signed value of typeof(a). We prefer
-  /// sext(a) to zext(a), because in the special case where x + y >= 0 and
-  /// (x >= 0 or y >= 0), function CanTraceInto can split sext(x + y),
-  /// while no such case exists for zext(x + y).
-  ///
-  /// Note that
-  ///   zext(x + y) = zext(x) + zext(y)
-  /// is wrong, e.g.,
-  ///   zext i32(UINT_MAX + 1) to i64 !=
-  ///   (zext i32 UINT_MAX to i64) + (zext i32 1 to i64)
-  ///
-  /// Returns true if the module changes.
-  ///
-  /// Verified in @inbounds_zext_add in split-gep.ll and @sum_of_array3 in
-  /// split-gep-and-gvn.ll
-  bool convertInBoundsZExtToSExt(GetElementPtrInst *GEP);
 
   const DataLayout *DL;
 };
@@ -613,43 +596,6 @@ bool SeparateConstOffsetFromGEP::canonicalizeArrayIndicesToPointerSize(
   return Changed;
 }
 
-bool
-SeparateConstOffsetFromGEP::convertInBoundsZExtToSExt(GetElementPtrInst *GEP) {
-  if (!GEP->isInBounds())
-    return false;
-
-  // TODO: consider alloca
-  GlobalVariable *UnderlyingObject =
-      dyn_cast<GlobalVariable>(GEP->getPointerOperand());
-  if (UnderlyingObject == nullptr)
-    return false;
-
-  uint64_t ObjectSize =
-      DL->getTypeAllocSize(UnderlyingObject->getType()->getElementType());
-  gep_type_iterator GTI = gep_type_begin(*GEP);
-  bool Changed = false;
-  for (User::op_iterator I = GEP->op_begin() + 1, E = GEP->op_end(); I != E;
-       ++I, ++GTI) {
-    if (isa<SequentialType>(*GTI)) {
-      if (ZExtInst *Extended = dyn_cast<ZExtInst>(*I)) {
-        unsigned SrcBitWidth =
-            cast<IntegerType>(Extended->getSrcTy())->getBitWidth();
-        // For GEP operand zext(a), if a <= max signed value of typeof(a), then
-        // the sign bit of a is zero and sext(a) = zext(a). Because the GEP is
-        // in bounds, we know a <= ObjectSize, so the condition can be reduced
-        // to ObjectSize <= max signed value of typeof(a).
-        if (ObjectSize <=
-            APInt::getSignedMaxValue(SrcBitWidth).getZExtValue()) {
-          *I = new SExtInst(Extended->getOperand(0), Extended->getType(),
-                            Extended->getName(), GEP);
-          Changed = true;
-        }
-      }
-    }
-  }
-  return Changed;
-}
-
 int64_t
 SeparateConstOffsetFromGEP::accumulateByteOffset(GetElementPtrInst *GEP,
                                                  bool &NeedsExtraction) {
@@ -684,9 +630,7 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
   if (GEP->hasAllConstantIndices())
     return false;
 
-  bool Changed = false;
-  Changed |= canonicalizeArrayIndicesToPointerSize(GEP);
-  Changed |= convertInBoundsZExtToSExt(GEP);
+  bool Changed = canonicalizeArrayIndicesToPointerSize(GEP);
 
   bool NeedsExtraction;
   int64_t AccumulativeByteOffset = accumulateByteOffset(GEP, NeedsExtraction);
diff --git a/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep-and-gvn.ll b/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep-and-gvn.ll
index 6b3cb5ae9c2..d054a3b1b9f 100644
--- a/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep-and-gvn.ll
+++ b/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep-and-gvn.ll
@@ -99,8 +99,17 @@ define void @sum_of_array2(i32 %x, i32 %y, float* nocapture %output) {
 ; IR: getelementptr float addrspace(3)* [[BASE_PTR]], i64 32
 ; IR: getelementptr float addrspace(3)* [[BASE_PTR]], i64 33
 
-; Similar to @sum_of_array3, but extends array indices using zext instead of
-; sext. e.g., array[zext(x + 1)][zext(y + 1)].
+
+; This function loads
+;   array[zext(x)][zext(y)]
+;   array[zext(x)][zext(y +nuw 1)]
+;   array[zext(x +nuw 1)][zext(y)]
+;   array[zext(x +nuw 1)][zext(y +nuw 1)].
+;
+; This function is similar to @sum_of_array, but it
+; 1) extends array indices using zext instead of sext;
+; 2) annotates the addition with "nuw"; otherwise, zext(x + 1) => zext(x) + 1
+;    may be invalid.
 define void @sum_of_array3(i32 %x, i32 %y, float* nocapture %output) {
 .preheader:
   %0 = zext i32 %y to i64
@@ -109,13 +118,13 @@ define void @sum_of_array3(i32 %x, i32 %y, float* nocapture %output) {
   %3 = addrspacecast float addrspace(3)* %2 to float*
   %4 = load float* %3, align 4
   %5 = fadd float %4, 0.000000e+00
-  %6 = add i32 %y, 1
+  %6 = add nuw i32 %y, 1
   %7 = zext i32 %6 to i64
   %8 = getelementptr inbounds [32 x [32 x float]] addrspace(3)* @array, i64 0, i64 %1, i64 %7
   %9 = addrspacecast float addrspace(3)* %8 to float*
   %10 = load float* %9, align 4
   %11 = fadd float %5, %10
-  %12 = add i32 %x, 1
+  %12 = add nuw i32 %x, 1
   %13 = zext i32 %12 to i64
   %14 = getelementptr inbounds [32 x [32 x float]] addrspace(3)* @array, i64 0, i64 %13, i64 %0
   %15 = addrspacecast float addrspace(3)* %14 to float*
@@ -139,3 +148,49 @@ define void @sum_of_array3(i32 %x, i32 %y, float* nocapture %output) {
 ; IR: getelementptr float addrspace(3)* [[BASE_PTR]], i64 1
 ; IR: getelementptr float addrspace(3)* [[BASE_PTR]], i64 32
 ; IR: getelementptr float addrspace(3)* [[BASE_PTR]], i64 33
+
+
+; This function loads
+;   array[zext(x)][zext(y)]
+;   array[zext(x)][zext(y) + 1]
+;   array[zext(x) + 1][zext(y)]
+;   array[zext(x) + 1][zext(y) + 1].
+;
+; We expect the generated code to reuse the computation of
+; &array[zext(x)][zext(y)]. See the expected IR and PTX for details.
+define void @sum_of_array4(i32 %x, i32 %y, float* nocapture %output) { +.preheader: + %0 = zext i32 %y to i64 + %1 = zext i32 %x to i64 + %2 = getelementptr inbounds [32 x [32 x float]] addrspace(3)* @array, i64 0, i64 %1, i64 %0 + %3 = addrspacecast float addrspace(3)* %2 to float* + %4 = load float* %3, align 4 + %5 = fadd float %4, 0.000000e+00 + %6 = add i64 %0, 1 + %7 = getelementptr inbounds [32 x [32 x float]] addrspace(3)* @array, i64 0, i64 %1, i64 %6 + %8 = addrspacecast float addrspace(3)* %7 to float* + %9 = load float* %8, align 4 + %10 = fadd float %5, %9 + %11 = add i64 %1, 1 + %12 = getelementptr inbounds [32 x [32 x float]] addrspace(3)* @array, i64 0, i64 %11, i64 %0 + %13 = addrspacecast float addrspace(3)* %12 to float* + %14 = load float* %13, align 4 + %15 = fadd float %10, %14 + %16 = getelementptr inbounds [32 x [32 x float]] addrspace(3)* @array, i64 0, i64 %11, i64 %6 + %17 = addrspacecast float addrspace(3)* %16 to float* + %18 = load float* %17, align 4 + %19 = fadd float %15, %18 + store float %19, float* %output, align 4 + ret void +} +; PTX-LABEL: sum_of_array4( +; PTX: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG:%(rd|r)[0-9]+]]{{\]}} +; PTX: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG]]+4{{\]}} +; PTX: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG]]+128{{\]}} +; PTX: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG]]+132{{\]}} + +; IR-LABEL: @sum_of_array4( +; IR: [[BASE_PTR:%[a-zA-Z0-9]+]] = getelementptr inbounds [32 x [32 x float]] addrspace(3)* @array, i64 0, i64 %{{[a-zA-Z0-9]+}}, i64 %{{[a-zA-Z0-9]+}} +; IR: getelementptr float addrspace(3)* [[BASE_PTR]], i64 1 +; IR: getelementptr float addrspace(3)* [[BASE_PTR]], i64 32 +; IR: getelementptr float addrspace(3)* [[BASE_PTR]], i64 33 diff --git a/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep.ll b/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep.ll index ed40c7e9551..1784171454d 100644 --- a/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep.ll +++ b/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep.ll @@ -234,28 +234,3 @@ entry: ; CHECK-LABEL: @and( ; CHECK: getelementptr [32 x [32 x float]]* @float_2d_array ; CHECK-NOT: getelementptr - -; if zext(a + b) <= max signed value of typeof(a + b), then we can prove -; a + b >= 0 and zext(a + b) == sext(a + b). If we can prove further a or b is -; non-negative, we have zext(a + b) == sext(a) + sext(b). -define float* @inbounds_zext_add(i32 %i, i4 %j) { -entry: - %0 = add i32 %i, 1 - %1 = zext i32 %0 to i64 - ; Because zext(i + 1) is an index of an in bounds GEP based on - ; float_2d_array, zext(i + 1) <= sizeof(float_2d_array) = 4096. - ; Furthermore, since typeof(i + 1) is i32 and 4096 < 2^31, we are sure the - ; sign bit of i + 1 is 0. This implies zext(i + 1) = sext(i + 1). - %2 = add i4 %j, 2 - %3 = zext i4 %2 to i64 - ; In this case, typeof(j + 2) is i4, so zext(j + 2) <= 4096 does not imply - ; the sign bit of j + 2 is 0. - %p = getelementptr inbounds [32 x [32 x float]]* @float_2d_array, i64 0, i64 %1, i64 %3 - ret float* %p -} -; CHECK-LABEL: @inbounds_zext_add( -; CHECK-NOT: add -; CHECK: add i4 %j, 2 -; CHECK: sext -; CHECK: getelementptr [32 x [32 x float]]* @float_2d_array, i64 0, i64 %{{[a-zA-Z0-9]+}}, i64 %{{[a-zA-Z0-9]+}} -; CHECK: getelementptr float* %{{[a-zA-Z0-9]+}}, i64 32
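
Postscript (not part of the patch; the names @arr, @zext_reused, and
@sext_not_reused are invented for illustration): the standalone sketch below,
in the same pre-3.7 LLVM IR syntax as the tests above, condenses the
regression described in the Summary. When both indices stay as zext, the
extension is computed once and shared; once the reverted transform rewrites
only the outermost zext to sext, the kernel must materialize both sext(a) and
zext(a), since GVN cannot merge the two differently-extended values.

@arr = internal addrspace(3) global [64 x float] zeroinitializer, align 4

; Before the reverted transform: one zext feeds both addresses, so GVN
; (and the hardware) sees a single extension.
define float @zext_reused(i32 %a) {
  %i = zext i32 %a to i64
  %p0 = getelementptr inbounds [64 x float] addrspace(3)* @arr, i64 0, i64 %i
  %v0 = load float addrspace(3)* %p0, align 4
  %j = add i64 %i, 1
  %p1 = getelementptr inbounds [64 x float] addrspace(3)* @arr, i64 0, i64 %j
  %v1 = load float addrspace(3)* %p1, align 4
  %s = fadd float %v0, %v1
  ret float %s
}

; After the reverted transform rewrites only the first index: %sx is an
; extra instruction, because %zx is still required for the second address.
define float @sext_not_reused(i32 %a) {
  %sx = sext i32 %a to i64
  %p0 = getelementptr inbounds [64 x float] addrspace(3)* @arr, i64 0, i64 %sx
  %v0 = load float addrspace(3)* %p0, align 4
  %zx = zext i32 %a to i64
  %j = add i64 %zx, 1
  %p1 = getelementptr inbounds [64 x float] addrspace(3)* @arr, i64 0, i64 %j
  %v1 = load float addrspace(3)* %p1, align 4
  %s = fadd float %v0, %v1
  ret float %s
}

This is the shape @sum_of_array4 guards against: its PTX checks require all
four loads to address off a single shared base register.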