diff --git a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
index 53b2dd65d0f..ed849b5bc85 100644
--- a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -216,7 +216,9 @@ unsigned PPCTTI::getVectorInstrCost(unsigned Opcode, Type *Val,
   // experimentally as a minimum needed to prevent unprofitable
   // vectorization for the paq8p benchmark. It may need to be
   // raised further if other unprofitable cases remain.
-  unsigned LHSPenalty = 12;
+  unsigned LHSPenalty = 2;
+  if (ISD == ISD::INSERT_VECTOR_ELT)
+    LHSPenalty += 7;
 
   // Vector element insert/extract with Altivec is very expensive,
   // because they require store and reload with the attendant
@@ -240,14 +242,32 @@ unsigned PPCTTI::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
   unsigned Cost = TargetTransformInfo::getMemoryOpCost(Opcode, Src, Alignment,
                                                        AddressSpace);
 
-  // FIXME: Update this for VSX loads/stores that support unaligned access.
+  // VSX loads/stores support unaligned access.
+  if (ST->hasVSX()) {
+    if (LT.second == MVT::v2f64 || LT.second == MVT::v2i64)
+      return Cost;
+  }
+
+  bool UnalignedAltivec =
+    Src->isVectorTy() &&
+    Src->getPrimitiveSizeInBits() >= LT.second.getSizeInBits() &&
+    LT.second.getSizeInBits() == 128 &&
+    Opcode == Instruction::Load;
 
   // PPC in general does not support unaligned loads and stores. They'll need
   // to be decomposed based on the alignment factor.
   unsigned SrcBytes = LT.second.getStoreSize();
-  if (SrcBytes && Alignment && Alignment < SrcBytes)
+  if (SrcBytes && Alignment && Alignment < SrcBytes && !UnalignedAltivec) {
     Cost += LT.first*(SrcBytes/Alignment-1);
 
+    // For a vector type, there is also scalarization overhead (only for
+    // stores, loads are expanded using the vector-load + permutation sequence,
+    // which is much less expensive).
+    if (Src->isVectorTy() && Opcode == Instruction::Store)
+      for (int i = 0, e = Src->getVectorNumElements(); i < e; ++i)
+        Cost += getVectorInstrCost(Instruction::ExtractElement, Src, i);
+  }
+
   return Cost;
 }
 
diff --git a/test/Analysis/CostModel/PowerPC/ext.ll b/test/Analysis/CostModel/PowerPC/ext.ll
index daaa8f5bac3..7d6a14e93cd 100644
--- a/test/Analysis/CostModel/PowerPC/ext.ll
+++ b/test/Analysis/CostModel/PowerPC/ext.ll
@@ -13,7 +13,7 @@ define void @exts() {
   ; CHECK: cost of 1 {{.*}} sext
   %v3 = sext <4 x i16> undef to <4 x i32>
 
-  ; CHECK: cost of 216 {{.*}} sext
+  ; CHECK: cost of 112 {{.*}} sext
   %v4 = sext <8 x i16> undef to <8 x i32>
 
   ret void
diff --git a/test/Analysis/CostModel/PowerPC/insert_extract.ll b/test/Analysis/CostModel/PowerPC/insert_extract.ll
index f51963d56fd..8dc003153a2 100644
--- a/test/Analysis/CostModel/PowerPC/insert_extract.ll
+++ b/test/Analysis/CostModel/PowerPC/insert_extract.ll
@@ -3,13 +3,13 @@ target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 target triple = "powerpc64-unknown-linux-gnu"
 
 define i32 @insert(i32 %arg) {
-  ; CHECK: cost of 13 {{.*}} insertelement
+  ; CHECK: cost of 10 {{.*}} insertelement
   %x = insertelement <4 x i32> undef, i32 %arg, i32 0
   ret i32 undef
 }
 
 define i32 @extract(<4 x i32> %arg) {
-  ; CHECK: cost of 13 {{.*}} extractelement
+  ; CHECK: cost of 3 {{.*}} extractelement
   %x = extractelement <4 x i32> %arg, i32 0
   ret i32 %x
 }
diff --git a/test/Analysis/CostModel/PowerPC/load_store.ll b/test/Analysis/CostModel/PowerPC/load_store.ll
index 8145a1dc715..40862780faf 100644
--- a/test/Analysis/CostModel/PowerPC/load_store.ll
+++ b/test/Analysis/CostModel/PowerPC/load_store.ll
@@ -31,9 +31,12 @@ define i32 @loads(i32 %arg) {
 
   ; FIXME: There actually are sub-vector Altivec loads, and so we could handle
   ; this with a small expense, but we don't currently.
-  ; CHECK: cost of 60 {{.*}} load
+  ; CHECK: cost of 48 {{.*}} load
   load <4 x i16>* undef, align 2
 
+  ; CHECK: cost of 1 {{.*}} load
+  load <4 x i32>* undef, align 4
+
   ret i32 undef
 }
 
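
For reference, the arithmetic behind the updated insert_extract.ll expectations. This is a standalone sketch, not part of the patch; it assumes the generic TargetTransformInfo per-element cost is 1, which the old checks imply (12 + 1 = 13 for both operations before this change):

#include <cassert>

int main() {
  const unsigned BaseCost   = 1; // generic TTI per-element access cost (assumed)
  const unsigned LHSPenalty = 2; // new shared Altivec store/reload penalty
  // insertelement additionally pays the +7 added for ISD::INSERT_VECTOR_ELT.
  assert(BaseCost + LHSPenalty + 7 == 10); // matches the new insertelement check
  assert(BaseCost + LHSPenalty == 3);      // matches the new extractelement check
  return 0;
}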
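Similarly, a minimal standalone sketch of how the new UnalignedAltivec predicate separates the two loads in the updated load_store.ll test. The function name and the hard-coded bit widths are illustrative only; the real code queries the IR type and the legalized MVT:

#include <cstdio>

// Mirrors the UnalignedAltivec condition from getMemoryOpCost above.
static bool unalignedAltivec(bool IsVectorTy, unsigned SrcBits,
                             unsigned LegalizedBits, bool IsLoad) {
  return IsVectorTy && SrcBits >= LegalizedBits &&
         LegalizedBits == 128 && IsLoad;
}

int main() {
  // <4 x i16>, align 2: a 64-bit source legalized to a 128-bit type, so
  // SrcBits < LegalizedBits and the misalignment penalty still applies
  // (cost of 48).
  printf("%d\n", unalignedAltivec(true, 64, 128, true));  // prints 0
  // <4 x i32>, align 4: a full 128-bit Altivec vector, handled by the
  // vector-load + permutation sequence noted in the code comment, so the
  // penalty is skipped (cost of 1).
  printf("%d\n", unalignedAltivec(true, 128, 128, true)); // prints 1
  return 0;
}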