LowerBitSets: Align referenced globals.

This change aligns globals to the next highest power of 2 bytes, up to a
maximum of 128. This makes it more likely that we will be able to compress
bit sets with a greater alignment. In many more cases, we can now take
advantage of a new optimization also introduced in this patch that removes
bit set checks if the bit set is all ones.

The 128 byte maximum was found to provide the best tradeoff between instruction
overhead and data overhead in a recent build of Chromium. It allows us to
remove ~2.4MB of instructions at the cost of ~250KB of data.

Differential Revision: http://reviews.llvm.org/D7873

llvm-svn: 230540
This commit is contained in:
Peter Collingbourne 2015-02-25 20:42:41 +00:00
parent e5d7d65572
commit e60f3e06fc
6 changed files with 101 additions and 73 deletions

View File

@ -48,6 +48,17 @@ struct BitSetInfo {
return Bits.size() == 1 && Bits[0] == 1;
}
// Returns true if every bit in positions [0, BitSize) of Bits is set, i.e.
// the bit set accepts every in-range offset. Callers use this to skip the
// bit set load entirely and rely on the range check alone.
bool isAllOnes() const {
  // Guard against an empty byte vector: Bits.size() - 1 would wrap around
  // (size_t underflow), turning the loop bound into a huge value and making
  // Bits[I] an out-of-bounds read. An empty set is trivially not all ones.
  if (Bits.empty())
    return false;
  // Every byte except the last must be fully set.
  for (unsigned I = 0; I != Bits.size() - 1; ++I)
    if (Bits[I] != 0xFF)
      return false;
  // Only the low (BitSize % 8) bits of the last byte are significant when
  // BitSize is not a multiple of 8; compare against the matching mask.
  if (BitSize % 8 == 0)
    return Bits[Bits.size() - 1] == 0xFF;
  return Bits[Bits.size() - 1] == (1 << (BitSize % 8)) - 1;
}
bool containsGlobalOffset(uint64_t Offset) const;
bool containsValue(const DataLayout *DL,

View File

@ -157,6 +157,7 @@ struct LowerBitSets : public ModulePass {
const DataLayout *DL;
IntegerType *Int1Ty;
IntegerType *Int8Ty;
IntegerType *Int32Ty;
Type *Int32PtrTy;
IntegerType *Int64Ty;
@ -173,7 +174,7 @@ struct LowerBitSets : public ModulePass {
const DenseMap<GlobalVariable *, uint64_t> &GlobalLayout);
Value *createBitSetTest(IRBuilder<> &B, const BitSetInfo &BSI,
GlobalVariable *BitSetGlobal, Value *BitOffset);
void
Value *
lowerBitSetCall(CallInst *CI, const BitSetInfo &BSI,
GlobalVariable *BitSetGlobal, GlobalVariable *CombinedGlobal,
const DenseMap<GlobalVariable *, uint64_t> &GlobalLayout);
@ -203,6 +204,7 @@ bool LowerBitSets::doInitialization(Module &M) {
report_fatal_error("Data layout required");
Int1Ty = Type::getInt1Ty(M.getContext());
Int8Ty = Type::getInt8Ty(M.getContext());
Int32Ty = Type::getInt32Ty(M.getContext());
Int32PtrTy = PointerType::getUnqual(Int32Ty);
Int64Ty = Type::getInt64Ty(M.getContext());
@ -293,19 +295,16 @@ Value *LowerBitSets::createBitSetTest(IRBuilder<> &B, const BitSetInfo &BSI,
}
}
/// Lower a llvm.bitset.test call to its implementation.
void LowerBitSets::lowerBitSetCall(
/// Lower a llvm.bitset.test call to its implementation. Returns the value to
/// replace the call with.
Value *LowerBitSets::lowerBitSetCall(
CallInst *CI, const BitSetInfo &BSI, GlobalVariable *BitSetGlobal,
GlobalVariable *CombinedGlobal,
const DenseMap<GlobalVariable *, uint64_t> &GlobalLayout) {
Value *Ptr = CI->getArgOperand(0);
if (BSI.containsValue(DL, GlobalLayout, Ptr)) {
CI->replaceAllUsesWith(
ConstantInt::getTrue(BitSetGlobal->getParent()->getContext()));
CI->eraseFromParent();
return;
}
if (BSI.containsValue(DL, GlobalLayout, Ptr))
return ConstantInt::getTrue(BitSetGlobal->getParent()->getContext());
Constant *GlobalAsInt = ConstantExpr::getPtrToInt(CombinedGlobal, IntPtrTy);
Constant *OffsetedGlobalAsInt = ConstantExpr::getAdd(
@ -317,12 +316,8 @@ void LowerBitSets::lowerBitSetCall(
Value *PtrAsInt = B.CreatePtrToInt(Ptr, IntPtrTy);
if (BSI.isSingleOffset()) {
Value *Eq = B.CreateICmpEQ(PtrAsInt, OffsetedGlobalAsInt);
CI->replaceAllUsesWith(Eq);
CI->eraseFromParent();
return;
}
if (BSI.isSingleOffset())
return B.CreateICmpEQ(PtrAsInt, OffsetedGlobalAsInt);
Value *PtrOffset = B.CreateSub(PtrAsInt, OffsetedGlobalAsInt);
@ -349,6 +344,10 @@ void LowerBitSets::lowerBitSetCall(
Constant *BitSizeConst = ConstantInt::get(IntPtrTy, BSI.BitSize);
Value *OffsetInRange = B.CreateICmpULT(BitOffset, BitSizeConst);
// If the bit set is all ones, testing against it is unnecessary.
if (BSI.isAllOnes())
return OffsetInRange;
TerminatorInst *Term = SplitBlockAndInsertIfThen(OffsetInRange, CI, false);
IRBuilder<> ThenB(Term);
@ -363,9 +362,7 @@ void LowerBitSets::lowerBitSetCall(
PHINode *P = B.CreatePHI(Int1Ty, 2);
P->addIncoming(ConstantInt::get(Int1Ty, 0), InitialBB);
P->addIncoming(Bit, ThenB.GetInsertBlock());
CI->replaceAllUsesWith(P);
CI->eraseFromParent();
return P;
}
/// Given a disjoint set of bitsets and globals, layout the globals, build the
@ -376,8 +373,24 @@ void LowerBitSets::buildBitSetsFromGlobals(
const std::vector<GlobalVariable *> &Globals) {
// Build a new global with the combined contents of the referenced globals.
std::vector<Constant *> GlobalInits;
for (GlobalVariable *G : Globals)
for (GlobalVariable *G : Globals) {
GlobalInits.push_back(G->getInitializer());
uint64_t InitSize = DL->getTypeAllocSize(G->getInitializer()->getType());
// Compute the amount of padding required to align the next element to the
// next power of 2.
uint64_t Padding = NextPowerOf2(InitSize - 1) - InitSize;
// Cap at 128 was found experimentally to have a good data/instruction
// overhead tradeoff.
if (Padding > 128)
Padding = RoundUpToAlignment(InitSize, 128) - InitSize;
GlobalInits.push_back(
ConstantAggregateZero::get(ArrayType::get(Int8Ty, Padding)));
}
if (!GlobalInits.empty())
GlobalInits.pop_back();
Constant *NewInit = ConstantStruct::getAnon(M.getContext(), GlobalInits);
auto CombinedGlobal =
new GlobalVariable(M, NewInit->getType(), /*isConstant=*/true,
@ -389,7 +402,8 @@ void LowerBitSets::buildBitSetsFromGlobals(
// Compute the offsets of the original globals within the new global.
DenseMap<GlobalVariable *, uint64_t> GlobalLayout;
for (unsigned I = 0; I != Globals.size(); ++I)
GlobalLayout[Globals[I]] = CombinedGlobalLayout->getElementOffset(I);
// Multiply by 2 to account for padding elements.
GlobalLayout[Globals[I]] = CombinedGlobalLayout->getElementOffset(I * 2);
// For each bitset in this disjoint set...
for (MDString *BS : BitSets) {
@ -406,7 +420,10 @@ void LowerBitSets::buildBitSetsFromGlobals(
// Lower each call to llvm.bitset.test for this bitset.
for (CallInst *CI : BitSetTestCallSites[BS]) {
++NumBitSetCallsLowered;
lowerBitSetCall(CI, BSI, BitSetGlobal, CombinedGlobal, GlobalLayout);
Value *Lowered =
lowerBitSetCall(CI, BSI, BitSetGlobal, CombinedGlobal, GlobalLayout);
CI->replaceAllUsesWith(Lowered);
CI->eraseFromParent();
}
}
@ -414,8 +431,9 @@ void LowerBitSets::buildBitSetsFromGlobals(
// global from which we built the combined global, and replace references
// to the original globals with references to the aliases.
for (unsigned I = 0; I != Globals.size(); ++I) {
// Multiply by 2 to account for padding elements.
Constant *CombinedGlobalIdxs[] = {ConstantInt::get(Int32Ty, 0),
ConstantInt::get(Int32Ty, I)};
ConstantInt::get(Int32Ty, I * 2)};
Constant *CombinedGlobalElemPtr =
ConstantExpr::getGetElementPtr(CombinedGlobal, CombinedGlobalIdxs);
GlobalAlias *GAlias = GlobalAlias::create(

View File

@ -6,7 +6,7 @@ target datalayout = "e-p:32:32"
; (see GlobalLayoutBuilder in include/llvm/Transforms/IPO/LowerBitSets.h).
; The chosen layout in this case is a, e, b, d, c.
; CHECK: private constant { i32, i32, i32, i32, i32 } { i32 1, i32 5, i32 2, i32 4, i32 3 }
; CHECK: private constant { i32, [0 x i8], i32, [0 x i8], i32, [0 x i8], i32, [0 x i8], i32 } { i32 1, [0 x i8] zeroinitializer, i32 5, [0 x i8] zeroinitializer, i32 2, [0 x i8] zeroinitializer, i32 4, [0 x i8] zeroinitializer, i32 3 }
@a = constant i32 1
@b = constant i32 2
@c = constant i32 3

View File

@ -3,14 +3,14 @@
target datalayout = "e-p:32:32"
; CHECK: [[G:@[^ ]*]] = private constant { i32, [63 x i32], i32, [2 x i32] } { i32 1, [63 x i32] zeroinitializer, i32 3, [2 x i32] [i32 4, i32 5] }
; CHECK: [[G:@[^ ]*]] = private constant { i32, [0 x i8], [63 x i32], [4 x i8], i32, [0 x i8], [2 x i32] } { i32 1, [0 x i8] zeroinitializer, [63 x i32] zeroinitializer, [4 x i8] zeroinitializer, i32 3, [0 x i8] zeroinitializer, [2 x i32] [i32 4, i32 5] }
@a = constant i32 1
@b = constant [63 x i32] zeroinitializer
@c = constant i32 3
@d = constant [2 x i32] [i32 4, i32 5]
; Offset 0, 4 byte alignment
; CHECK: @bitset1.bits = private constant [9 x i8] c"\03\00\00\00\00\00\00\00\04"
; CHECK: @bitset1.bits = private constant [9 x i8] c"\03\00\00\00\00\00\00\00\08"
!0 = !{!"bitset1", i32* @a, i32 0}
; CHECK-NODISCARD-DAG: !{!"bitset1", i32* @a, i32 0}
!1 = !{!"bitset1", [63 x i32]* @b, i32 0}
@ -18,15 +18,15 @@ target datalayout = "e-p:32:32"
!2 = !{!"bitset1", [2 x i32]* @d, i32 4}
; CHECK-NODISCARD-DAG: !{!"bitset1", [2 x i32]* @d, i32 4}
; Offset 4, 4 byte alignment
; CHECK: @bitset2.bits = private constant [8 x i8] c"\01\00\00\00\00\00\00\80"
; Offset 4, 256 byte alignment
; CHECK: @bitset2.bits = private constant [1 x i8] c"\03"
!3 = !{!"bitset2", [63 x i32]* @b, i32 0}
; CHECK-NODISCARD-DAG: !{!"bitset2", [63 x i32]* @b, i32 0}
!4 = !{!"bitset2", i32* @c, i32 0}
; CHECK-NODISCARD-DAG: !{!"bitset2", i32* @c, i32 0}
; Offset 0, 256 byte alignment
; CHECK: @bitset3.bits = private constant [1 x i8] c"\03"
; Offset 0, 4 byte alignment
; CHECK: @bitset3.bits = private constant [9 x i8] c"\01\00\00\00\00\00\00\00\02"
!5 = !{!"bitset3", i32* @a, i32 0}
; CHECK-NODISCARD-DAG: !{!"bitset3", i32* @a, i32 0}
!6 = !{!"bitset3", i32* @c, i32 0}
@ -38,10 +38,10 @@ target datalayout = "e-p:32:32"
!llvm.bitsets = !{ !0, !1, !2, !3, !4, !5, !6, !7 }
; CHECK: @a = alias getelementptr inbounds ({ i32, [63 x i32], i32, [2 x i32] }* [[G]], i32 0, i32 0)
; CHECK: @b = alias getelementptr inbounds ({ i32, [63 x i32], i32, [2 x i32] }* [[G]], i32 0, i32 1)
; CHECK: @c = alias getelementptr inbounds ({ i32, [63 x i32], i32, [2 x i32] }* [[G]], i32 0, i32 2)
; CHECK: @d = alias getelementptr inbounds ({ i32, [63 x i32], i32, [2 x i32] }* [[G]], i32 0, i32 3)
; CHECK: @a = alias getelementptr inbounds ({ i32, [0 x i8], [63 x i32], [4 x i8], i32, [0 x i8], [2 x i32] }* [[G]], i32 0, i32 0)
; CHECK: @b = alias getelementptr inbounds ({ i32, [0 x i8], [63 x i32], [4 x i8], i32, [0 x i8], [2 x i32] }* [[G]], i32 0, i32 2)
; CHECK: @c = alias getelementptr inbounds ({ i32, [0 x i8], [63 x i32], [4 x i8], i32, [0 x i8], [2 x i32] }* [[G]], i32 0, i32 4)
; CHECK: @d = alias getelementptr inbounds ({ i32, [0 x i8], [63 x i32], [4 x i8], i32, [0 x i8], [2 x i32] }* [[G]], i32 0, i32 6)
declare i1 @llvm.bitset.test(i8* %ptr, metadata %bitset) nounwind readnone
@ -52,11 +52,11 @@ define i1 @foo(i32* %p) {
; CHECK: [[R0:%[^ ]*]] = bitcast i32* [[A0]] to i8*
%pi8 = bitcast i32* %p to i8*
; CHECK: [[R1:%[^ ]*]] = ptrtoint i8* [[R0]] to i32
; CHECK: [[R2:%[^ ]*]] = sub i32 [[R1]], ptrtoint ({ i32, [63 x i32], i32, [2 x i32] }* [[G]] to i32)
; CHECK: [[R2:%[^ ]*]] = sub i32 [[R1]], ptrtoint ({ i32, [0 x i8], [63 x i32], [4 x i8], i32, [0 x i8], [2 x i32] }* [[G]] to i32)
; CHECK: [[R3:%[^ ]*]] = lshr i32 [[R2]], 2
; CHECK: [[R4:%[^ ]*]] = shl i32 [[R2]], 30
; CHECK: [[R5:%[^ ]*]] = or i32 [[R3]], [[R4]]
; CHECK: [[R6:%[^ ]*]] = icmp ult i32 [[R5]], 67
; CHECK: [[R6:%[^ ]*]] = icmp ult i32 [[R5]], 68
; CHECK: br i1 [[R6]]
; CHECK: [[R8:%[^ ]*]] = lshr i32 [[R5]], 5
@ -82,22 +82,14 @@ define i1 @bar(i32* %p) {
; CHECK: [[S0:%[^ ]*]] = bitcast i32* [[B0]] to i8*
%pi8 = bitcast i32* %p to i8*
; CHECK: [[S1:%[^ ]*]] = ptrtoint i8* [[S0]] to i32
; CHECK: [[S2:%[^ ]*]] = sub i32 [[S1]], add (i32 ptrtoint ({ i32, [63 x i32], i32, [2 x i32] }* [[G]] to i32), i32 4)
; CHECK: [[S3:%[^ ]*]] = lshr i32 [[S2]], 2
; CHECK: [[S4:%[^ ]*]] = shl i32 [[S2]], 30
; CHECK: [[S2:%[^ ]*]] = sub i32 [[S1]], add (i32 ptrtoint ({ i32, [0 x i8], [63 x i32], [4 x i8], i32, [0 x i8], [2 x i32] }* [[G]] to i32), i32 4)
; CHECK: [[S3:%[^ ]*]] = lshr i32 [[S2]], 8
; CHECK: [[S4:%[^ ]*]] = shl i32 [[S2]], 24
; CHECK: [[S5:%[^ ]*]] = or i32 [[S3]], [[S4]]
; CHECK: [[S6:%[^ ]*]] = icmp ult i32 [[S5]], 64
; CHECK: br i1 [[S6]]
; CHECK: [[S8:%[^ ]*]] = zext i32 [[S5]] to i64
; CHECK: [[S9:%[^ ]*]] = and i64 [[S8]], 63
; CHECK: [[S10:%[^ ]*]] = shl i64 1, [[S9]]
; CHECK: [[S11:%[^ ]*]] = and i64 -9223372036854775807, [[S10]]
; CHECK: [[S12:%[^ ]*]] = icmp ne i64 [[S11]], 0
; CHECK: [[S16:%[^ ]*]] = phi i1 [ false, {{%[^ ]*}} ], [ [[S12]], {{%[^ ]*}} ]
; CHECK: [[S6:%[^ ]*]] = icmp ult i32 [[S5]], 2
%x = call i1 @llvm.bitset.test(i8* %pi8, metadata !"bitset2")
; CHECK: ret i1 [[S16]]
; CHECK: ret i1 [[S6]]
ret i1 %x
}
@ -106,19 +98,22 @@ define i1 @baz(i32* %p) {
; CHECK: [[T0:%[^ ]*]] = bitcast i32* [[C0]] to i8*
%pi8 = bitcast i32* %p to i8*
; CHECK: [[T1:%[^ ]*]] = ptrtoint i8* [[T0]] to i32
; CHECK: [[T2:%[^ ]*]] = sub i32 [[T1]], ptrtoint ({ i32, [63 x i32], i32, [2 x i32] }* [[G]] to i32)
; CHECK: [[T3:%[^ ]*]] = lshr i32 [[T2]], 8
; CHECK: [[T4:%[^ ]*]] = shl i32 [[T2]], 24
; CHECK: [[T2:%[^ ]*]] = sub i32 [[T1]], ptrtoint ({ i32, [0 x i8], [63 x i32], [4 x i8], i32, [0 x i8], [2 x i32] }* [[G]] to i32)
; CHECK: [[T3:%[^ ]*]] = lshr i32 [[T2]], 2
; CHECK: [[T4:%[^ ]*]] = shl i32 [[T2]], 30
; CHECK: [[T5:%[^ ]*]] = or i32 [[T3]], [[T4]]
; CHECK: [[T6:%[^ ]*]] = icmp ult i32 [[T5]], 2
; CHECK: [[T6:%[^ ]*]] = icmp ult i32 [[T5]], 66
; CHECK: br i1 [[T6]]
; CHECK: [[T8:%[^ ]*]] = and i32 [[T5]], 31
; CHECK: [[T9:%[^ ]*]] = shl i32 1, [[T8]]
; CHECK: [[T10:%[^ ]*]] = and i32 3, [[T9]]
; CHECK: [[T11:%[^ ]*]] = icmp ne i32 [[T10]], 0
; CHECK: [[T8:%[^ ]*]] = lshr i32 [[T5]], 5
; CHECK: [[T9:%[^ ]*]] = getelementptr i32* bitcast ([9 x i8]* @bitset3.bits to i32*), i32 [[T8]]
; CHECK: [[T10:%[^ ]*]] = load i32* [[T9]]
; CHECK: [[T11:%[^ ]*]] = and i32 [[T5]], 31
; CHECK: [[T12:%[^ ]*]] = shl i32 1, [[T11]]
; CHECK: [[T13:%[^ ]*]] = and i32 [[T10]], [[T12]]
; CHECK: [[T14:%[^ ]*]] = icmp ne i32 [[T13]], 0
; CHECK: [[T16:%[^ ]*]] = phi i1 [ false, {{%[^ ]*}} ], [ [[T11]], {{%[^ ]*}} ]
; CHECK: [[T16:%[^ ]*]] = phi i1 [ false, {{%[^ ]*}} ], [ [[T14]], {{%[^ ]*}} ]
%x = call i1 @llvm.bitset.test(i8* %pi8, metadata !"bitset3")
; CHECK: ret i1 [[T16]]
ret i1 %x

View File

@ -2,7 +2,7 @@
target datalayout = "e-p:32:32"
; CHECK: [[G:@[^ ]*]] = private constant { i32, i32 }
; CHECK: [[G:@[^ ]*]] = private constant { i32, [0 x i8], i32 }
@a = constant i32 1
@b = constant i32 2
@ -18,7 +18,7 @@ declare i1 @llvm.bitset.test(i8* %ptr, metadata %bitset) nounwind readnone
; CHECK: @foo(i8* [[A0:%[^ ]*]])
define i1 @foo(i8* %p) {
; CHECK: [[R0:%[^ ]*]] = ptrtoint i8* [[A0]] to i32
; CHECK: [[R1:%[^ ]*]] = icmp eq i32 [[R0]], ptrtoint ({ i32, i32 }* [[G]] to i32)
; CHECK: [[R1:%[^ ]*]] = icmp eq i32 [[R0]], ptrtoint ({ i32, [0 x i8], i32 }* [[G]] to i32)
%x = call i1 @llvm.bitset.test(i8* %p, metadata !"bitset2")
; CHECK: ret i1 [[R1]]
ret i1 %x
@ -27,7 +27,7 @@ define i1 @foo(i8* %p) {
; CHECK: @bar(i8* [[B0:%[^ ]*]])
define i1 @bar(i8* %p) {
; CHECK: [[S0:%[^ ]*]] = ptrtoint i8* [[B0]] to i32
; CHECK: [[S1:%[^ ]*]] = icmp eq i32 [[S0]], add (i32 ptrtoint ({ i32, i32 }* [[G]] to i32), i32 4)
; CHECK: [[S1:%[^ ]*]] = icmp eq i32 [[S0]], add (i32 ptrtoint ({ i32, [0 x i8], i32 }* [[G]] to i32), i32 4)
%x = call i1 @llvm.bitset.test(i8* %p, metadata !"bitset3")
; CHECK: ret i1 [[S1]]
ret i1 %x

View File

@ -20,19 +20,22 @@ TEST(LowerBitSets, BitSetBuilder) {
uint64_t BitSize;
unsigned AlignLog2;
bool IsSingleOffset;
bool IsAllOnes;
} BSBTests[] = {
{{}, {0}, 0, 1, 0, false},
{{0}, {1}, 0, 1, 0, true},
{{4}, {1}, 4, 1, 0, true},
{{37}, {1}, 37, 1, 0, true},
{{0, 1}, {3}, 0, 2, 0, false},
{{0, 4}, {3}, 0, 2, 2, false},
{{0, uint64_t(1) << 33}, {3}, 0, 2, 33, false},
{{3, 7}, {3}, 3, 2, 2, false},
{{0, 1, 7}, {131}, 0, 8, 0, false},
{{0, 2, 14}, {131}, 0, 8, 1, false},
{{0, 1, 8}, {3, 1}, 0, 9, 0, false},
{{0, 2, 16}, {3, 1}, 0, 9, 1, false},
{{}, {0}, 0, 1, 0, false, false},
{{0}, {1}, 0, 1, 0, true, true},
{{4}, {1}, 4, 1, 0, true, true},
{{37}, {1}, 37, 1, 0, true, true},
{{0, 1}, {3}, 0, 2, 0, false, true},
{{0, 4}, {3}, 0, 2, 2, false, true},
{{0, uint64_t(1) << 33}, {3}, 0, 2, 33, false, true},
{{3, 7}, {3}, 3, 2, 2, false, true},
{{0, 1, 7}, {131}, 0, 8, 0, false, false},
{{0, 2, 14}, {131}, 0, 8, 1, false, false},
{{0, 1, 8}, {3, 1}, 0, 9, 0, false, false},
{{0, 2, 16}, {3, 1}, 0, 9, 1, false, false},
{{0, 1, 2, 3, 4, 5, 6, 7}, {255}, 0, 8, 0, false, true},
{{0, 1, 2, 3, 4, 5, 6, 7, 8}, {255, 1}, 0, 9, 0, false, true},
};
for (auto &&T : BSBTests) {
@ -47,6 +50,7 @@ TEST(LowerBitSets, BitSetBuilder) {
EXPECT_EQ(T.BitSize, BSI.BitSize);
EXPECT_EQ(T.AlignLog2, BSI.AlignLog2);
EXPECT_EQ(T.IsSingleOffset, BSI.isSingleOffset());
EXPECT_EQ(T.IsAllOnes, BSI.isAllOnes());
for (auto Offset : T.Offsets)
EXPECT_TRUE(BSI.containsGlobalOffset(Offset));