diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
index b617a6c4315f..f7785342b364 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -360,11 +360,6 @@ int PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
 
   int Cost = BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace);
 
-  // Aligned loads and stores are easy.
-  unsigned SrcBytes = LT.second.getStoreSize();
-  if (!SrcBytes || !Alignment || Alignment >= SrcBytes)
-    return Cost;
-
   bool IsAltivecType = ST->hasAltivec() &&
                        (LT.second == MVT::v16i8 || LT.second == MVT::v8i16 ||
                         LT.second == MVT::v4i32 || LT.second == MVT::v4f32);
@@ -373,6 +368,20 @@ int PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
   bool IsQPXType = ST->hasQPX() &&
                    (LT.second == MVT::v4f64 || LT.second == MVT::v4f32);
 
+  // VSX has 32b/64b load instructions, so legalization can load a 32b/64b
+  // vector into a VSR correctly and cheaply. BaseT::getMemoryOpCost and
+  // PPCTargetLowering can't compute that cost appropriately, so check the
+  // case explicitly. MemBits is the primitive size of Src in bits (not bytes).
+  unsigned MemBits = Src->getPrimitiveSizeInBits();
+  if (Opcode == Instruction::Load && ST->hasVSX() && IsAltivecType &&
+      (MemBits == 64 || (ST->hasP8Vector() && MemBits == 32)))
+    return 1;
+
+  // Aligned loads and stores are easy.
+  unsigned SrcBytes = LT.second.getStoreSize();
+  if (!SrcBytes || !Alignment || Alignment >= SrcBytes)
+    return Cost;
+
   // If we can use the permutation-based load sequence, then this is also
   // relatively cheap (not counting loop-invariant instructions): one load plus
   // one permute (the last load in a series has extra cost, but we're
diff --git a/llvm/test/Analysis/CostModel/PowerPC/vsr_load_32_64.ll b/llvm/test/Analysis/CostModel/PowerPC/vsr_load_32_64.ll
new file mode 100644
index 000000000000..4afeabca00ad
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/PowerPC/vsr_load_32_64.ll
@@ -0,0 +1,19 @@
+; RUN: opt < %s -cost-model -analyze -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr8 -mattr=+vsx | FileCheck %s
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+define i32 @loads(i32 %arg) {
+  ; CHECK: cost of 1 {{.*}} load
+  load <4 x i8>, <4 x i8>* undef, align 1
+
+  ; CHECK: cost of 1 {{.*}} load
+  load <8 x i8>, <8 x i8>* undef, align 1
+
+  ; CHECK: cost of 1 {{.*}} load
+  load <2 x i16>, <2 x i16>* undef, align 2
+
+  ; CHECK: cost of 1 {{.*}} load
+  load <4 x i16>, <4 x i16>* undef, align 2
+
+  ret i32 undef
+}
diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/pr30990.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/pr30990.ll
new file mode 100644
index 000000000000..d3cdabd26f50
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/PowerPC/pr30990.ll
@@ -0,0 +1,140 @@
+; RUN: opt < %s -loop-vectorize -mcpu=pwr8 -mattr=+vsx -force-vector-interleave=1 -vectorizer-maximize-bandwidth=0 -S | FileCheck %s
+
+target triple = "powerpc64-unknown-linux-gnu"
+
+define signext i32 @foo(i8* readonly %ptr, i32 signext %l) {
+entry:
+  %idx.ext = sext i32 %l to i64
+  %add.ptr = getelementptr inbounds i8, i8* %ptr, i64 %idx.ext
+  %cmp7 = icmp sgt i32 %l, 0
+  br i1 %cmp7, label %while.body.preheader, label %while.end
+
+while.body.preheader: ; preds = %entry
+  br label %while.body
+
+while.body: ; preds = %while.body.preheader, %while.body
+  %count.09 = phi i32 [ %add, %while.body ], [ 0, %while.body.preheader ]
+  %ptr.addr.08 = phi i8* [ %incdec.ptr, %while.body ], [ %ptr, %while.body.preheader ]
+  %0 = load i8, i8* %ptr.addr.08, align 1
+  %cmp1 = icmp slt i8 %0, -64
+  %cond = zext i1 %cmp1 to i32
+  %add = add nsw i32 %cond, %count.09
+  %incdec.ptr = getelementptr inbounds i8, i8* %ptr.addr.08, i64 1
+  %cmp = icmp ult i8* %incdec.ptr, %add.ptr
+  br i1 %cmp, label %while.body, label %while.end.loopexit
+
+while.end.loopexit: ; preds = %while.body
+  %add.lcssa = phi i32 [ %add, %while.body ]
+  br label %while.end
+
+while.end: ; preds = %while.end.loopexit, %entry
+  %count.0.lcssa = phi i32 [ 0, %entry ], [ %add.lcssa, %while.end.loopexit ]
+  ret i32 %count.0.lcssa
+
+; CHECK: load <4 x i8>
+; CHECK: icmp slt <4 x i8>
+}
+
+
+define signext i16 @foo2(i8* readonly %ptr, i32 signext %l) {
+entry:
+  %idx.ext = sext i32 %l to i64
+  %add.ptr = getelementptr inbounds i8, i8* %ptr, i64 %idx.ext
+  %cmp7 = icmp sgt i32 %l, 0
+  br i1 %cmp7, label %while.body.preheader, label %while.end
+
+while.body.preheader: ; preds = %entry
+  br label %while.body
+
+while.body: ; preds = %while.body.preheader, %while.body
+  %count.09 = phi i16 [ %add, %while.body ], [ 0, %while.body.preheader ]
+  %ptr.addr.08 = phi i8* [ %incdec.ptr, %while.body ], [ %ptr, %while.body.preheader ]
+  %0 = load i8, i8* %ptr.addr.08, align 1
+  %cmp1 = icmp slt i8 %0, -64
+  %cond = zext i1 %cmp1 to i16
+  %add = add nsw i16 %cond, %count.09
+  %incdec.ptr = getelementptr inbounds i8, i8* %ptr.addr.08, i64 1
+  %cmp = icmp ult i8* %incdec.ptr, %add.ptr
+  br i1 %cmp, label %while.body, label %while.end.loopexit
+
+while.end.loopexit: ; preds = %while.body
+  %add.lcssa = phi i16 [ %add, %while.body ]
+  br label %while.end
+
+while.end: ; preds = %while.end.loopexit, %entry
+  %count.0.lcssa = phi i16 [ 0, %entry ], [ %add.lcssa, %while.end.loopexit ]
+  ret i16 %count.0.lcssa
+
+; CHECK-LABEL: foo2
+; CHECK: load <8 x i8>
+; CHECK: icmp slt <8 x i8>
+}
+
+define signext i32 @foo3(i16* readonly %ptr, i32 signext %l) {
+entry:
+  %idx.ext = sext i32 %l to i64
+  %add.ptr = getelementptr inbounds i16, i16* %ptr, i64 %idx.ext
+  %cmp7 = icmp sgt i32 %l, 0
+  br i1 %cmp7, label %while.body.preheader, label %while.end
+
+while.body.preheader: ; preds = %entry
+  br label %while.body
+
+while.body: ; preds = %while.body.preheader, %while.body
+  %count.09 = phi i32 [ %add, %while.body ], [ 0, %while.body.preheader ]
+  %ptr.addr.16 = phi i16* [ %incdec.ptr, %while.body ], [ %ptr, %while.body.preheader ]
+  %0 = load i16, i16* %ptr.addr.16, align 1
+  %cmp1 = icmp slt i16 %0, -64
+  %cond = zext i1 %cmp1 to i32
+  %add = add nsw i32 %cond, %count.09
+  %incdec.ptr = getelementptr inbounds i16, i16* %ptr.addr.16, i64 1
+  %cmp = icmp ult i16* %incdec.ptr, %add.ptr
+  br i1 %cmp, label %while.body, label %while.end.loopexit
+
+while.end.loopexit: ; preds = %while.body
+  %add.lcssa = phi i32 [ %add, %while.body ]
+  br label %while.end
+
+while.end: ; preds = %while.end.loopexit, %entry
+  %count.0.lcssa = phi i32 [ 0, %entry ], [ %add.lcssa, %while.end.loopexit ]
+  ret i32 %count.0.lcssa
+
+; CHECK-LABEL: foo3
+; CHECK: load <4 x i16>
+; CHECK: icmp slt <4 x i16>
+}
+
+define i64 @foo4(i16* readonly %ptr, i32 signext %l) {
+entry:
+  %idx.ext = sext i32 %l to i64
+  %add.ptr = getelementptr inbounds i16, i16* %ptr, i64 %idx.ext
+  %cmp7 = icmp sgt i32 %l, 0
+  br i1 %cmp7, label %while.body.preheader, label %while.end
+
+while.body.preheader: ; preds = %entry
+  br label %while.body
+
+while.body: ; preds = %while.body.preheader, %while.body
+  %count.09 = phi i64 [ %add, %while.body ], [ 0, %while.body.preheader ]
+  %ptr.addr.16 = phi i16* [ %incdec.ptr, %while.body ], [ %ptr, %while.body.preheader ]
+  %0 = load i16, i16* %ptr.addr.16, align 1
+  %cmp1 = icmp slt i16 %0, -64
+  %cond = zext i1 %cmp1 to i64
+  %add = add nsw i64 %cond, %count.09
+  %incdec.ptr = getelementptr inbounds i16, i16* %ptr.addr.16, i64 1
+  %cmp = icmp ult i16* %incdec.ptr, %add.ptr
+  br i1 %cmp, label %while.body, label %while.end.loopexit
+
+while.end.loopexit: ; preds = %while.body
+  %add.lcssa = phi i64 [ %add, %while.body ]
+  br label %while.end
+
+while.end: ; preds = %while.end.loopexit, %entry
+  %count.0.lcssa = phi i64 [ 0, %entry ], [ %add.lcssa, %while.end.loopexit ]
+  ret i64 %count.0.lcssa
+
+; CHECK-LABEL: foo4
+; CHECK: load <2 x i16>
+; CHECK: icmp slt <2 x i16>
+}
+