Add support to scalar replacement for partial vector accesses of an alloca, e.g.

a union of a float, <2 x float>, and <4 x float>. This mostly comes up with the
use of vector intrinsics, especially in NEON when programmers know the layout of
the register file. This enables codegen to eliminate a lot of the subregister
traffic it would otherwise generate.

This commit only enables this for a small number of floating-point cases, but a
lot more integer cases. I assume this is okay for all ports, but I did not do
extensive testing of the quality of code involving i512 vectors and the like. If
there is a use case where this generates worse code than before, let me know and
we can scale it back.

This fixes <rdar://problem/9036264>.

llvm-svn: 127317
This commit is contained in:
Cameron Zwarich 2011-03-09 05:43:05 +00:00
parent e80c47f295
commit e0b4705b03
2 changed files with 186 additions and 12 deletions

View File

@ -295,12 +295,16 @@ AllocaInst *ConvertToScalarInfo::TryConvert(AllocaInst *AI) {
/// MergeInType - Add the 'In' type to the accumulated vector type (VectorTy)
/// so far at the offset specified by Offset (which is specified in bytes).
///
/// There are two cases we handle here:
/// There are three cases we handle here:
/// 1) A union of vector types of the same size and potentially its elements.
/// Here we turn element accesses into insert/extract element operations.
/// This promotes a <4 x float> with a store of float to the third element
/// into a <4 x float> that uses insert element.
/// 2) A fully general blob of memory, which we turn into some (potentially
/// 2) A union of vector types with power-of-2 size differences, e.g. a float,
/// <2 x float> and <4 x float>. Here we turn element accesses into insert
/// and extract element operations, and <2 x float> accesses into a cast to
/// <2 x double>, an extract, and a cast back to <2 x float>.
/// 3) A fully general blob of memory, which we turn into some (potentially
/// large) integer type with extract and insert operations where the loads
/// and stores would mutate the memory. We mark this by setting VectorTy
/// to VoidTy.
@ -346,18 +350,68 @@ bool ConvertToScalarInfo::MergeInVectorType(const VectorType *VInTy,
// Remember if we saw a vector type.
HadAVector = true;
if (VInTy->getBitWidth()/8 == AllocaSize && Offset == 0) {
// If we're storing/loading a vector of the right size, allow it as a
// vector. If this the first vector we see, remember the type so that
// we know the element size. If this is a subsequent access, ignore it
// even if it is a differing type but the same size. Worst case we can
// bitcast the resultant vectors.
if (VectorTy == 0)
VectorTy = VInTy;
// TODO: Support nonzero offsets?
if (Offset != 0)
return false;
// Only allow vectors that are a power-of-2 away from the size of the alloca.
if (!isPowerOf2_64(AllocaSize / (VInTy->getBitWidth() / 8)))
return false;
// If this the first vector we see, remember the type so that we know the
// element size.
if (!VectorTy) {
VectorTy = VInTy;
return true;
}
return false;
unsigned BitWidth = cast<VectorType>(VectorTy)->getBitWidth();
unsigned InBitWidth = VInTy->getBitWidth();
// Vectors of the same size can be converted using a simple bitcast.
if (InBitWidth == BitWidth && AllocaSize == (InBitWidth / 8))
return true;
const Type *ElementTy = cast<VectorType>(VectorTy)->getElementType();
const Type *InElementTy = cast<VectorType>(VectorTy)->getElementType();
// Do not allow mixed integer and floating-point accesses from vectors of
// different sizes.
if (ElementTy->isFloatingPointTy() != InElementTy->isFloatingPointTy())
return false;
if (ElementTy->isFloatingPointTy()) {
// Only allow floating-point vectors of different sizes if they have the
// same element type.
// TODO: This could be loosened a bit, but would anything benefit?
if (ElementTy != InElementTy)
return false;
// There are no arbitrary-precision floating-point types, which limits the
// number of legal vector types with larger element types that we can form
// to bitcast and extract a subvector.
// TODO: We could support some more cases with mixed fp128 and double here.
if (!(BitWidth == 64 || BitWidth == 128) ||
!(InBitWidth == 64 || InBitWidth == 128))
return false;
} else {
assert(ElementTy->isIntegerTy() && "Vector elements must be either integer "
"or floating-point.");
unsigned BitWidth = ElementTy->getPrimitiveSizeInBits();
unsigned InBitWidth = InElementTy->getPrimitiveSizeInBits();
// Do not allow integer types smaller than a byte or types whose widths are
// not a multiple of a byte.
if (BitWidth < 8 || InBitWidth < 8 ||
BitWidth % 8 != 0 || InBitWidth % 8 != 0)
return false;
}
// Pick the largest of the two vector types.
if (InBitWidth > BitWidth)
VectorTy = VInTy;
return true;
}
/// CanConvertToScalar - V is a pointer. If we can convert the pointee and all
@ -586,6 +640,26 @@ void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI,
}
}
/// getScaledElementType - Gets a scaled element type for a partial vector
/// access of an alloca. The input type must be an integer or float, and
/// the resulting type must be an integer, float or double.
static const Type *getScaledElementType(const Type *OldTy, unsigned Scale) {
assert((OldTy->isIntegerTy() || OldTy->isFloatTy()) && "Partial vector "
"accesses must be scaled from integer or float elements.");
LLVMContext &Context = OldTy->getContext();
unsigned Size = OldTy->getPrimitiveSizeInBits() * Scale;
if (OldTy->isIntegerTy())
return Type::getIntNTy(Context, Size);
if (Size == 32)
return Type::getFloatTy(Context);
if (Size == 64)
return Type::getDoubleTy(Context);
llvm_unreachable("Invalid type for a partial vector access of an alloca!");
}
/// ConvertScalar_ExtractValue - Extract a value of type ToType from an integer
/// or vector value FromVal, extracting the bits from the offset specified by
/// Offset. This returns the value, which is of type ToType.
@ -606,8 +680,27 @@ ConvertScalar_ExtractValue(Value *FromVal, const Type *ToType,
// If the result alloca is a vector type, this is either an element
// access or a bitcast to another vector type of the same size.
if (const VectorType *VTy = dyn_cast<VectorType>(FromVal->getType())) {
if (ToType->isVectorTy())
if (ToType->isVectorTy()) {
if (isPowerOf2_64(AllocaSize / TD.getTypeAllocSize(ToType))) {
assert(Offset == 0 && "Can't extract a value of a smaller vector type "
"from a nonzero offset.");
const Type *ToElementTy = cast<VectorType>(ToType)->getElementType();
unsigned Scale = AllocaSize / TD.getTypeAllocSize(ToType);
const Type *CastElementTy = getScaledElementType(ToElementTy, Scale);
unsigned NumCastVectorElements = VTy->getNumElements() / Scale;
LLVMContext &Context = FromVal->getContext();
const Type *CastTy = VectorType::get(CastElementTy,
NumCastVectorElements);
Value *Cast = Builder.CreateBitCast(FromVal, CastTy, "tmp");
Value *Extract = Builder.CreateExtractElement(Cast, ConstantInt::get(
Type::getInt32Ty(Context), 0), "tmp");
return Builder.CreateBitCast(Extract, ToType, "tmp");
}
return Builder.CreateBitCast(FromVal, ToType, "tmp");
}
// Otherwise it must be an element access.
unsigned Elt = 0;
@ -728,6 +821,28 @@ ConvertScalar_InsertValue(Value *SV, Value *Old,
if (ValSize == VecSize)
return Builder.CreateBitCast(SV, AllocaType, "tmp");
if (SV->getType()->isVectorTy() && isPowerOf2_64(VecSize / ValSize)) {
assert(Offset == 0 && "Can't insert a value of a smaller vector type at "
"a nonzero offset.");
const Type *ToElementTy =
cast<VectorType>(SV->getType())->getElementType();
unsigned Scale = VecSize / ValSize;
const Type *CastElementTy = getScaledElementType(ToElementTy, Scale);
unsigned NumCastVectorElements = VTy->getNumElements() / Scale;
LLVMContext &Context = SV->getContext();
const Type *OldCastTy = VectorType::get(CastElementTy,
NumCastVectorElements);
Value *OldCast = Builder.CreateBitCast(Old, OldCastTy, "tmp");
Value *SVCast = Builder.CreateBitCast(SV, CastElementTy, "tmp");
Value *Insert =
Builder.CreateInsertElement(OldCast, SVCast, ConstantInt::get(
Type::getInt32Ty(Context), 0), "tmp");
return Builder.CreateBitCast(Insert, AllocaType, "tmp");
}
uint64_t EltSize = TD.getTypeAllocSizeInBits(VTy->getElementType());
// Must be an element insertion.

View File

@ -98,3 +98,62 @@ define i64 @test6(<2 x float> %X) {
; CHECK: ret i64
}
define float @test7(<4 x float> %x) {
%a = alloca <4 x float>
store <4 x float> %x, <4 x float>* %a
%p = bitcast <4 x float>* %a to <2 x float>*
%b = load <2 x float>* %p
%q = getelementptr <4 x float>* %a, i32 0, i32 2
%c = load float* %q
ret float %c
; CHECK: @test7
; CHECK-NOT: alloca
; CHECK: bitcast <4 x float> %x to <2 x double>
; CHECK-NEXT: extractelement <2 x double>
; CHECK-NEXT: bitcast double %tmp4 to <2 x float>
; CHECK-NEXT: extractelement <4 x float>
}
define void @test8(<4 x float> %x, <2 x float> %y) {
%a = alloca <4 x float>
store <4 x float> %x, <4 x float>* %a
%p = bitcast <4 x float>* %a to <2 x float>*
store <2 x float> %y, <2 x float>* %p
ret void
; CHECK: @test8
; CHECK-NOT: alloca
; CHECK: bitcast <4 x float> %x to <2 x double>
; CHECK-NEXT: bitcast <2 x float> %y to double
; CHECK-NEXT: insertelement <2 x double>
; CHECK-NEXT: bitcast <2 x double> %tmp2 to <4 x float>
}
define i256 @test9(<4 x i256> %x) {
%a = alloca <4 x i256>
store <4 x i256> %x, <4 x i256>* %a
%p = bitcast <4 x i256>* %a to <2 x i256>*
%b = load <2 x i256>* %p
%q = getelementptr <4 x i256>* %a, i32 0, i32 2
%c = load i256* %q
ret i256 %c
; CHECK: @test9
; CHECK-NOT: alloca
; CHECK: bitcast <4 x i256> %x to <2 x i512>
; CHECK-NEXT: extractelement <2 x i512>
; CHECK-NEXT: bitcast i512 %tmp4 to <2 x i256>
; CHECK-NEXT: extractelement <4 x i256>
}
define void @test10(<4 x i256> %x, <2 x i256> %y) {
%a = alloca <4 x i256>
store <4 x i256> %x, <4 x i256>* %a
%p = bitcast <4 x i256>* %a to <2 x i256>*
store <2 x i256> %y, <2 x i256>* %p
ret void
; CHECK: @test10
; CHECK-NOT: alloca
; CHECK: bitcast <4 x i256> %x to <2 x i512>
; CHECK-NEXT: bitcast <2 x i256> %y to i512
; CHECK-NEXT: insertelement <2 x i512>
; CHECK-NEXT: bitcast <2 x i512> %tmp2 to <4 x i256>
}