From 689da340edaa9f61dfae90e94ac69bfe189ee78f Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter@arm.com>
Date: Fri, 19 Jan 2024 11:06:21 +0000
Subject: [PATCH] [NFC][LV] Test precommit for interleaved linear args

---
 .../LoopVectorize/AArch64/masked-call.ll      | 594 +++++++++++++-----
 .../AArch64/uniform-args-call-variants.ll     | 173 ++++-
 .../AArch64/vector-call-linear-args.ll        | 208 +++++-
 3 files changed, 787 insertions(+), 188 deletions(-)

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
index d2ef5b2d14bc..1e79c3e1e8dc 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
@@ -2,6 +2,7 @@
 ; RUN: opt < %s -passes=loop-vectorize,instsimplify -force-vector-interleave=1 -S | FileCheck %s --check-prefixes=TFNONE
 ; RUN: opt < %s -passes=loop-vectorize,instsimplify,simplifycfg -force-vector-interleave=1 -prefer-predicate-over-epilogue=predicate-dont-vectorize -S | FileCheck %s --check-prefixes=TFCOMMON,TFALWAYS
 ; RUN: opt < %s -passes=loop-vectorize,instsimplify,simplifycfg -force-vector-interleave=1 -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -S | FileCheck %s --check-prefixes=TFCOMMON,TFFALLBACK
+; RUN: opt < %s -passes=loop-vectorize,instsimplify,simplifycfg -force-vector-interleave=2 -prefer-predicate-over-epilogue=predicate-dont-vectorize -S | FileCheck %s --check-prefixes=TFA_INTERLEAVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -19,17 +20,17 @@ define void @test_widen(ptr noalias %a, ptr readnone %b) #4 {
 ; TFNONE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2
 ; TFNONE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP3]]
 ; TFNONE-NEXT: [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]]
-; TFNONE-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; TFNONE-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2
+; TFNONE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; TFNONE-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2
 ; TFNONE-NEXT: br label [[VECTOR_BODY:%.*]]
 ; TFNONE: vector.body:
 ; TFNONE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; TFNONE-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[INDEX]]
-; TFNONE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP4]], align 8
-; TFNONE-NEXT: [[TMP5:%.*]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> [[WIDE_LOAD]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, zeroinitializer))
-; TFNONE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
-; TFNONE-NEXT: store <vscale x 2 x i64> [[TMP5]], ptr [[TMP6]], align 8
-; TFNONE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
+; TFNONE-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[INDEX]]
+; TFNONE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP6]], align 8
+; TFNONE-NEXT: [[TMP7:%.*]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> [[WIDE_LOAD]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, zeroinitializer))
+; TFNONE-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
+; TFNONE-NEXT: store <vscale x 2 x i64> [[TMP7]], ptr [[TMP8]], align 8
+; TFNONE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
 ; TFNONE-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; TFNONE-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; TFNONE: middle.block:
@@ -60,19 +61,19 @@ define void @test_widen(ptr noalias %a, ptr readnone %b) #4 {
 ; TFCOMMON-NEXT: [[N_RND_UP:%.*]] = add i64 1025, [[TMP4]]
 ; TFCOMMON-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; TFCOMMON-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; TFCOMMON-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; TFCOMMON-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2
+; TFCOMMON-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; TFCOMMON-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 2
 ; TFCOMMON-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 1025)
 ; TFCOMMON-NEXT: br label [[VECTOR_BODY:%.*]]
 ; TFCOMMON: vector.body:
 ; TFCOMMON-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; TFCOMMON-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
-; TFCOMMON-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[INDEX]]
-; TFCOMMON-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP5]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison)
-; TFCOMMON-NEXT: [[TMP6:%.*]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> [[WIDE_MASKED_LOAD]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
-; TFCOMMON-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
-; TFCOMMON-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP6]], ptr [[TMP7]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
-; TFCOMMON-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]]
+; TFCOMMON-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[INDEX]]
+; TFCOMMON-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP7]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison)
+; TFCOMMON-NEXT: [[TMP8:%.*]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> [[WIDE_MASKED_LOAD]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; TFCOMMON-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
+; TFCOMMON-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP8]], ptr [[TMP9]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; TFCOMMON-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
 ; TFCOMMON-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 1025)
 ; TFCOMMON-NEXT: [[TMP10:%.*]] = xor <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, zeroinitializer)
 ; TFCOMMON-NEXT: [[TMP11:%.*]] = extractelement <vscale x 2 x i1> [[TMP10]], i32 0
@@ -80,6 +81,54 @@ define void @test_widen(ptr noalias %a, ptr readnone %b) #4 {
 ; TFCOMMON: for.cond.cleanup:
 ; TFCOMMON-NEXT: ret void
 ;
+; TFA_INTERLEAVE-LABEL: @test_widen(
+; TFA_INTERLEAVE-NEXT: entry:
+; TFA_INTERLEAVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; TFA_INTERLEAVE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; TFA_INTERLEAVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; TFA_INTERLEAVE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; TFA_INTERLEAVE-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1
+; TFA_INTERLEAVE-NEXT: [[N_RND_UP:%.*]] = add i64 1025, [[TMP4]]
+; TFA_INTERLEAVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
+; TFA_INTERLEAVE-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; TFA_INTERLEAVE-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; TFA_INTERLEAVE-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4
+; TFA_INTERLEAVE-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; TFA_INTERLEAVE-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2
+; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 1025)
+; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP8]], i64 1025)
+; TFA_INTERLEAVE-NEXT: br label [[VECTOR_BODY:%.*]]
+; TFA_INTERLEAVE: vector.body:
+; TFA_INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT4:%.*]], [[VECTOR_BODY]] ]
+; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY1]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT5:%.*]], [[VECTOR_BODY]] ]
+; TFA_INTERLEAVE-NEXT: [[TMP9:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[INDEX]]
+; TFA_INTERLEAVE-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
+; TFA_INTERLEAVE-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 2
+; TFA_INTERLEAVE-NEXT: [[TMP12:%.*]] = getelementptr i64, ptr [[TMP9]], i64 [[TMP11]]
+; TFA_INTERLEAVE-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP9]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison)
+; TFA_INTERLEAVE-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP12]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]], <vscale x 2 x i64> poison)
+; TFA_INTERLEAVE-NEXT: [[TMP13:%.*]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> [[WIDE_MASKED_LOAD]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; TFA_INTERLEAVE-NEXT: [[TMP14:%.*]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> [[WIDE_MASKED_LOAD3]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]])
+; TFA_INTERLEAVE-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
+; TFA_INTERLEAVE-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
+; TFA_INTERLEAVE-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 2
+; TFA_INTERLEAVE-NEXT: [[TMP18:%.*]] = getelementptr inbounds i64, ptr [[TMP15]], i64 [[TMP17]]
+; TFA_INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP13]], ptr [[TMP15]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; TFA_INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP14]], ptr [[TMP18]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]])
+; TFA_INTERLEAVE-NEXT: [[INDEX_NEXT:%.*]] = add i64 [[INDEX]], [[TMP6]]
+; TFA_INTERLEAVE-NEXT: [[INDEX_NEXT4]] = add i64 [[INDEX]], [[TMP6]]
+; TFA_INTERLEAVE-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
+; TFA_INTERLEAVE-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 2
+; TFA_INTERLEAVE-NEXT: [[TMP21:%.*]] = add i64 [[INDEX_NEXT]], [[TMP20]]
+; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 1025)
+; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT5]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP21]], i64 1025)
+; TFA_INTERLEAVE-NEXT: [[TMP22:%.*]] = xor <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, zeroinitializer)
+; TFA_INTERLEAVE-NEXT: [[TMP23:%.*]] = extractelement <vscale x 2 x i1> [[TMP22]], i32 0
+; TFA_INTERLEAVE-NEXT: br i1 [[TMP23]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; TFA_INTERLEAVE: for.cond.cleanup:
+; TFA_INTERLEAVE-NEXT: ret void
+;
 entry:
   br label %for.body
 
@@ -111,20 +160,20 @@ define void @test_if_then(ptr noalias %a, ptr readnone %b) #4 {
 ; TFNONE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2
 ; TFNONE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP3]]
 ; TFNONE-NEXT: [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]]
-; TFNONE-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; TFNONE-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 2
+; TFNONE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; TFNONE-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2
 ; TFNONE-NEXT: br label [[VECTOR_BODY:%.*]]
 ; TFNONE: vector.body:
 ; TFNONE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; TFNONE-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
-; TFNONE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP4]], align 8
-; TFNONE-NEXT: [[TMP5:%.*]] = icmp ugt <vscale x 2 x i64> [[WIDE_LOAD]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 50, i64 0), <vscale x 2 x i64> poison, zeroinitializer)
-; TFNONE-NEXT: [[TMP6:%.*]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> [[WIDE_LOAD]], <vscale x 2 x i1> [[TMP5]])
-; TFNONE-NEXT: [[TMP7:%.*]] = xor <vscale x 2 x i1> [[TMP5]], shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, zeroinitializer)
-; TFNONE-NEXT: [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP7]], <vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> [[TMP6]]
-; TFNONE-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[INDEX]]
-; TFNONE-NEXT: store <vscale x 2 x i64> [[PREDPHI]], ptr [[TMP8]], align 8
-; TFNONE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
+; TFNONE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
+; TFNONE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP6]], align 8
+; TFNONE-NEXT: [[TMP7:%.*]] = icmp ugt <vscale x 2 x i64> [[WIDE_LOAD]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 50, i64 0), <vscale x 2 x i64> poison, zeroinitializer)
+; TFNONE-NEXT: [[TMP8:%.*]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> [[WIDE_LOAD]], <vscale x 2 x i1> [[TMP7]])
+; TFNONE-NEXT: [[TMP9:%.*]] = xor <vscale x 2 x i1> [[TMP7]], shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, zeroinitializer)
+; TFNONE-NEXT: [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP9]], <vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> [[TMP8]]
+; TFNONE-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[INDEX]]
+; TFNONE-NEXT: store <vscale x 2 x i64> [[PREDPHI]], ptr [[TMP10]], align 8
+; TFNONE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
 ; TFNONE-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; TFNONE-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; TFNONE: middle.block:
@@ -161,25 +210,25 @@ define void @test_if_then(ptr noalias %a, ptr readnone %b) #4 {
 ; TFCOMMON-NEXT: [[N_RND_UP:%.*]] = add i64 1025, [[TMP4]]
 ; TFCOMMON-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; TFCOMMON-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; TFCOMMON-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
-; TFCOMMON-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 2
+; TFCOMMON-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; TFCOMMON-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 2
 ; TFCOMMON-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 1025)
 ; TFCOMMON-NEXT: br label [[VECTOR_BODY:%.*]]
 ; TFCOMMON: vector.body:
 ; TFCOMMON-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; TFCOMMON-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
-; TFCOMMON-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
-; TFCOMMON-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP5]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison)
-; TFCOMMON-NEXT: [[TMP6:%.*]] = icmp ugt <vscale x 2 x i64> [[WIDE_MASKED_LOAD]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 50, i64 0), <vscale x 2 x i64> poison, zeroinitializer)
-; TFCOMMON-NEXT: [[TMP7:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i1> [[TMP6]], <vscale x 2 x i1> zeroinitializer
-; TFCOMMON-NEXT: [[TMP8:%.*]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> [[WIDE_MASKED_LOAD]], <vscale x 2 x i1> [[TMP7]])
-; TFCOMMON-NEXT: [[TMP9:%.*]] = xor <vscale x 2 x i1> [[TMP6]], shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, zeroinitializer)
-; TFCOMMON-NEXT: [[TMP10:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i1> [[TMP9]], <vscale x 2 x i1> zeroinitializer
-; TFCOMMON-NEXT: [[TMP12:%.*]] = or <vscale x 2 x i1> [[TMP7]], [[TMP10]]
-; TFCOMMON-NEXT: [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP10]], <vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> [[TMP8]]
-; TFCOMMON-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[INDEX]]
-; TFCOMMON-NEXT: call 
[[TMP11:%.*]] = mul i64 [[TMP10]], 2
+; TFA_INTERLEAVE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[TMP9]], i64 [[TMP11]]
+; TFA_INTERLEAVE-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP9]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison)
+; TFA_INTERLEAVE-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP12]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]], <vscale x 2 x i64> poison)
+; TFA_INTERLEAVE-NEXT: [[TMP13:%.*]] = icmp ugt <vscale x 2 x i64> [[WIDE_MASKED_LOAD]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 50, i64 0), <vscale x 2 x i64> poison, zeroinitializer)
+; TFA_INTERLEAVE-NEXT: [[TMP14:%.*]] = icmp ugt <vscale x 2 x i64> [[WIDE_MASKED_LOAD3]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 50, i64 0), <vscale x 2 x i64> poison, zeroinitializer)
+; TFA_INTERLEAVE-NEXT: [[TMP15:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i1> [[TMP13]], <vscale x 2 x i1> zeroinitializer
+; TFA_INTERLEAVE-NEXT: [[TMP16:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]], <vscale x 2 x i1> [[TMP14]], <vscale x 2 x i1> zeroinitializer
+; TFA_INTERLEAVE-NEXT: [[TMP17:%.*]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> [[WIDE_MASKED_LOAD]], <vscale x 2 x i1> [[TMP15]])
+; TFA_INTERLEAVE-NEXT: [[TMP18:%.*]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> [[WIDE_MASKED_LOAD3]], <vscale x 2 x i1> [[TMP16]])
+; TFA_INTERLEAVE-NEXT: [[TMP19:%.*]] = xor <vscale x 2 x i1> [[TMP13]], shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, zeroinitializer)
+; TFA_INTERLEAVE-NEXT: [[TMP20:%.*]] = xor <vscale x 2 x i1> [[TMP14]], shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, zeroinitializer)
+; TFA_INTERLEAVE-NEXT: [[TMP21:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i1> [[TMP19]], <vscale x 2 x i1> zeroinitializer
+; TFA_INTERLEAVE-NEXT: [[TMP22:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]], <vscale x 2 x i1> [[TMP20]], <vscale x 2 x i1> zeroinitializer
+; TFA_INTERLEAVE-NEXT: [[TMP23:%.*]] = or <vscale x 2 x i1> [[TMP15]], [[TMP21]]
+; TFA_INTERLEAVE-NEXT: [[TMP24:%.*]] = or <vscale x 2 x i1> [[TMP16]], [[TMP22]]
+; TFA_INTERLEAVE-NEXT: [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP21]], <vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> [[TMP17]]
+; TFA_INTERLEAVE-NEXT: [[PREDPHI4:%.*]] = select <vscale x 2 x i1> [[TMP22]], <vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> [[TMP18]]
+; TFA_INTERLEAVE-NEXT: [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[INDEX]]
+; TFA_INTERLEAVE-NEXT: [[TMP26:%.*]] = call i64 @llvm.vscale.i64()
+; TFA_INTERLEAVE-NEXT: [[TMP27:%.*]] = mul i64 [[TMP26]], 2
+; TFA_INTERLEAVE-NEXT: [[TMP28:%.*]] = getelementptr inbounds i64, ptr [[TMP25]], i64 [[TMP27]]
+; TFA_INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[PREDPHI]], ptr [[TMP25]], i32 8, <vscale x 2 x i1> [[TMP23]])
+; TFA_INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[PREDPHI4]], ptr [[TMP28]], i32 8, <vscale x 2 x i1> [[TMP24]])
+; TFA_INTERLEAVE-NEXT: [[INDEX_NEXT:%.*]] = add i64 [[INDEX]], [[TMP6]]
+; TFA_INTERLEAVE-NEXT: [[INDEX_NEXT5]] = add i64 [[INDEX]], [[TMP6]]
+; TFA_INTERLEAVE-NEXT: [[TMP29:%.*]] = call i64 @llvm.vscale.i64()
+; TFA_INTERLEAVE-NEXT: [[TMP30:%.*]] = mul i64 [[TMP29]], 2
+; TFA_INTERLEAVE-NEXT: [[TMP31:%.*]] = add i64 [[INDEX_NEXT]], [[TMP30]]
+; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 1025)
+; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT6]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP31]], i64 1025)
+; TFA_INTERLEAVE-NEXT: [[TMP32:%.*]] = xor <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, zeroinitializer)
+; TFA_INTERLEAVE-NEXT: [[TMP33:%.*]] = extractelement <vscale x 2 x i1> [[TMP32]], i32 0
+; TFA_INTERLEAVE-NEXT: br i1 [[TMP33]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; TFA_INTERLEAVE: for.cond.cleanup:
+; TFA_INTERLEAVE-NEXT: ret void
+;
 entry:
   br label %for.body
 
@@ -229,21 +338,21 @@ define void @test_widen_if_then_else(ptr noalias %a, ptr readnone %b) #4 {
 ; TFNONE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2
 ; TFNONE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP3]]
 ; TFNONE-NEXT: [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]]
-; TFNONE-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; TFNONE-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 2
+; TFNONE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; TFNONE-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2
 ; TFNONE-NEXT: br label [[VECTOR_BODY:%.*]]
 ; TFNONE: vector.body:
 ; TFNONE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; TFNONE-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
-; TFNONE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP4]], align 8
-; TFNONE-NEXT: [[TMP5:%.*]] = icmp ugt <vscale x 2 x i64> [[WIDE_LOAD]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 50, i64 0), <vscale x 2 x i64> poison, zeroinitializer)
-; TFNONE-NEXT: [[TMP6:%.*]] = xor <vscale x 2 x i1> [[TMP5]], shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, zeroinitializer)
-; TFNONE-NEXT: [[TMP7:%.*]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i1> [[TMP6]])
-; TFNONE-NEXT: [[TMP8:%.*]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> [[WIDE_LOAD]], <vscale x 2 x i1> [[TMP5]])
-; TFNONE-NEXT: [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP6]], <vscale x 2 x i64> [[TMP7]], <vscale x 2 x i64> [[TMP8]]
-; TFNONE-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[INDEX]]
-; TFNONE-NEXT: store <vscale x 2 x i64> [[PREDPHI]], ptr [[TMP9]], align 8
-; TFNONE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]]
+; TFNONE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
+; TFNONE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP6]], align 8
+; TFNONE-NEXT: [[TMP7:%.*]] = icmp ugt <vscale x 2 x i64> [[WIDE_LOAD]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 50, i64 0), <vscale x 2 x i64> poison, zeroinitializer)
+; TFNONE-NEXT: [[TMP8:%.*]] = xor <vscale x 2 x i1> [[TMP7]], shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, zeroinitializer)
+; TFNONE-NEXT: [[TMP9:%.*]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i1> [[TMP8]])
+; TFNONE-NEXT: [[TMP10:%.*]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> [[WIDE_LOAD]], <vscale x 2 x i1> [[TMP7]])
+; TFNONE-NEXT: [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP8]], <vscale x 2 x i64> [[TMP9]], <vscale x 2 x i64> [[TMP10]]
+; TFNONE-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[INDEX]]
+; TFNONE-NEXT: store <vscale x 2 x i64> [[PREDPHI]], ptr [[TMP11]], align 8
+; TFNONE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
 ; TFNONE-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; TFNONE-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; TFNONE: middle.block:
@@ -283,26 +392,26 @@ define void @test_widen_if_then_else(ptr noalias %a, ptr readnone %b) #4 {
 ; TFCOMMON-NEXT: [[N_RND_UP:%.*]] = add i64 1025, [[TMP4]]
 ; TFCOMMON-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; TFCOMMON-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; TFCOMMON-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
-; TFCOMMON-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 2
+; TFCOMMON-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; TFCOMMON-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 2
 ; TFCOMMON-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 1025)
 ; TFCOMMON-NEXT: br label [[VECTOR_BODY:%.*]]
 ; TFCOMMON: vector.body:
 ; TFCOMMON-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; TFCOMMON-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
-; TFCOMMON-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
-; TFCOMMON-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP5]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison)
-; TFCOMMON-NEXT: [[TMP6:%.*]] = icmp ugt <vscale x 2 x i64> [[WIDE_MASKED_LOAD]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 50, i64 0), <vscale x 2 x i64> poison, zeroinitializer)
-; TFCOMMON-NEXT: [[TMP7:%.*]] = xor <vscale x 2 x i1> [[TMP6]], shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, zeroinitializer)
-; TFCOMMON-NEXT: [[TMP8:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i1> [[TMP7]], <vscale x 2 x i1> zeroinitializer
-; TFCOMMON-NEXT: [[TMP9:%.*]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i1> [[TMP8]])
-; TFCOMMON-NEXT: [[TMP10:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i1> [[TMP6]], <vscale x 2 x i1> zeroinitializer
-; TFCOMMON-NEXT: [[TMP11:%.*]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> [[WIDE_MASKED_LOAD]], <vscale x 2 x i1> [[TMP10]])
-; TFCOMMON-NEXT: [[TMP13:%.*]] = or <vscale x 2 x i1> [[TMP8]], [[TMP10]]
-; TFCOMMON-NEXT: [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP8]], <vscale x 2 x i64> [[TMP9]], <vscale x 2 x i64> [[TMP11]]
-; TFCOMMON-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[INDEX]]
-; TFCOMMON-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[PREDPHI]], ptr [[TMP12]], i32 8, <vscale x 2 x i1> [[TMP13]])
-; TFCOMMON-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP15]]
+; TFCOMMON-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
+; TFCOMMON-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP7]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison)
+; TFCOMMON-NEXT: [[TMP8:%.*]] = icmp ugt <vscale x 2 x i64> [[WIDE_MASKED_LOAD]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 50, i64 0), <vscale x 2 x i64> poison, zeroinitializer)
+; TFCOMMON-NEXT: [[TMP9:%.*]] = xor <vscale x 2 x i1> [[TMP8]], shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, zeroinitializer)
+; TFCOMMON-NEXT: [[TMP10:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i1> [[TMP9]], <vscale x 2 x i1> zeroinitializer
+; TFCOMMON-NEXT: [[TMP11:%.*]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i1> [[TMP10]])
+; TFCOMMON-NEXT: [[TMP12:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i1> [[TMP8]], <vscale x 2 x i1> zeroinitializer
+; TFCOMMON-NEXT: [[TMP13:%.*]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> [[WIDE_MASKED_LOAD]], <vscale x 2 x i1> [[TMP12]])
+; TFCOMMON-NEXT: [[TMP14:%.*]] = or <vscale x 2 x i1> [[TMP10]], [[TMP12]]
+; TFCOMMON-NEXT: [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP10]], <vscale x 2 x i64> [[TMP11]], <vscale x 2 x i64> [[TMP13]]
+; TFCOMMON-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[INDEX]]
+; TFCOMMON-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[PREDPHI]], ptr [[TMP15]], i32 8, <vscale x 2 x i1> [[TMP14]])
+; TFCOMMON-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
 ; TFCOMMON-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 1025)
 ; TFCOMMON-NEXT: [[TMP16:%.*]] = xor <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, zeroinitializer)
 ; TFCOMMON-NEXT: [[TMP17:%.*]] = extractelement <vscale x 2 x i1> [[TMP16]], i32 0
@@ -310,6 +419,68 @@ define void @test_widen_if_then_else(ptr noalias %a, ptr readnone %b) #4 {
 ; TFCOMMON: for.cond.cleanup:
 ; TFCOMMON-NEXT: ret void
 ;
+; TFA_INTERLEAVE-LABEL: @test_widen_if_then_else(
+; TFA_INTERLEAVE-NEXT: entry:
+; TFA_INTERLEAVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; TFA_INTERLEAVE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; TFA_INTERLEAVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; TFA_INTERLEAVE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; TFA_INTERLEAVE-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1
+; TFA_INTERLEAVE-NEXT: [[N_RND_UP:%.*]] = add i64 1025, [[TMP4]]
+; TFA_INTERLEAVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
+; TFA_INTERLEAVE-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; TFA_INTERLEAVE-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; TFA_INTERLEAVE-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4
+; TFA_INTERLEAVE-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; TFA_INTERLEAVE-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2
+; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 1025)
+; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP8]], i64 1025)
+; TFA_INTERLEAVE-NEXT: br label [[VECTOR_BODY:%.*]]
+; TFA_INTERLEAVE: vector.body:
+; TFA_INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT5:%.*]], [[VECTOR_BODY]] ]
+; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY1]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT6:%.*]], [[VECTOR_BODY]] ]
+; TFA_INTERLEAVE-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
+; TFA_INTERLEAVE-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
+; TFA_INTERLEAVE-NEXT: 
TFFALLBACK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP4]], align 8
-; TFFALLBACK-NEXT: [[TMP5:%.*]] = call <vscale x 2 x i64> @foo_vector_nomask(<vscale x 2 x i64> [[WIDE_LOAD]])
-; TFFALLBACK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
-; TFFALLBACK-NEXT: store <vscale x 2 x i64> [[TMP5]], ptr [[TMP6]], align 8
-; TFFALLBACK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
+; TFFALLBACK-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[INDEX]]
+; TFFALLBACK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP6]], align 8
+; TFFALLBACK-NEXT: [[TMP7:%.*]] = call <vscale x 2 x i64> @foo_vector_nomask(<vscale x 2 x i64> [[WIDE_LOAD]])
+; TFFALLBACK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
+; TFFALLBACK-NEXT: store <vscale x 2 x i64> [[TMP7]], ptr [[TMP8]], align 8
+; TFFALLBACK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
 ; TFFALLBACK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; TFFALLBACK-NEXT: br i1 [[TMP9]], label [[SCALAR_PH]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; TFFALLBACK: scalar.ph:
@@ -442,6 +613,22 @@ define void @test_widen_nomask(ptr noalias %a, ptr readnone %b) #4 {
 ; TFFALLBACK: for.cond.cleanup:
 ; TFFALLBACK-NEXT: ret void
 ;
+; TFA_INTERLEAVE-LABEL: @test_widen_nomask(
+; TFA_INTERLEAVE-NEXT: entry:
+; TFA_INTERLEAVE-NEXT: br label [[FOR_BODY:%.*]]
+; TFA_INTERLEAVE: for.body:
+; TFA_INTERLEAVE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; TFA_INTERLEAVE-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[INDVARS_IV]]
+; TFA_INTERLEAVE-NEXT: [[LOAD:%.*]] = load i64, ptr [[GEP]], align 8
+; TFA_INTERLEAVE-NEXT: [[CALL:%.*]] = call i64 @foo(i64 [[LOAD]]) #[[ATTR5:[0-9]+]]
+; TFA_INTERLEAVE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDVARS_IV]]
+; TFA_INTERLEAVE-NEXT: store i64 [[CALL]], ptr [[ARRAYIDX]], align 8
+; TFA_INTERLEAVE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; TFA_INTERLEAVE-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1025
+; TFA_INTERLEAVE-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
+; TFA_INTERLEAVE: for.cond.cleanup:
+; TFA_INTERLEAVE-NEXT: ret void
+;
 entry:
   br label %for.body
 
@@ -475,17 +662,17 @@ define void @test_widen_optmask(ptr noalias %a, ptr readnone %b) #4 {
 ; TFNONE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2
 ; TFNONE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP3]]
 ; TFNONE-NEXT: [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]]
-; TFNONE-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; TFNONE-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2
+; TFNONE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; TFNONE-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2
 ; TFNONE-NEXT: br label [[VECTOR_BODY:%.*]]
 ; TFNONE: vector.body:
 ; TFNONE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; TFNONE-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[INDEX]]
-; TFNONE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP4]], align 8
-; TFNONE-NEXT: [[TMP5:%.*]] = call <vscale x 2 x i64> @foo_vector_nomask(<vscale x 2 x i64> [[WIDE_LOAD]])
-; TFNONE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
-; TFNONE-NEXT: store <vscale x 2 x i64> [[TMP5]], ptr [[TMP6]], align 8
-; TFNONE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
+; TFNONE-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[INDEX]]
+; TFNONE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP6]], align 8
+; TFNONE-NEXT: [[TMP7:%.*]] = call <vscale x 2 x i64> @foo_vector_nomask(<vscale x 2 x i64> [[WIDE_LOAD]])
+; TFNONE-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
+; TFNONE-NEXT: store <vscale x 2 x i64> [[TMP7]], ptr [[TMP8]], align 8
+; TFNONE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
 ; TFNONE-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; TFNONE-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; TFNONE: middle.block:
@@ -516,19 +703,19 @@ define void @test_widen_optmask(ptr noalias %a, ptr readnone %b) #4 {
 ; TFALWAYS-NEXT: [[N_RND_UP:%.*]] = add i64 1025, [[TMP4]]
 ; TFALWAYS-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; TFALWAYS-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; TFALWAYS-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; TFALWAYS-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2
+; TFALWAYS-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; TFALWAYS-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 2
 ; TFALWAYS-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 1025)
 ; TFALWAYS-NEXT: br label [[VECTOR_BODY:%.*]]
 ; TFALWAYS: vector.body:
 ; TFALWAYS-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; TFALWAYS-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
-; TFALWAYS-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[INDEX]]
-; TFALWAYS-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP5]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison)
-; TFALWAYS-NEXT: [[TMP6:%.*]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> [[WIDE_MASKED_LOAD]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
-; TFALWAYS-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
-; TFALWAYS-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP6]], ptr [[TMP7]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
-; TFALWAYS-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]]
+; TFALWAYS-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[INDEX]]
+; TFALWAYS-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP7]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison)
+; TFALWAYS-NEXT: [[TMP8:%.*]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> [[WIDE_MASKED_LOAD]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; TFALWAYS-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
+; TFALWAYS-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP8]], ptr [[TMP9]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; TFALWAYS-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
 ; TFALWAYS-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 1025)
 ; TFALWAYS-NEXT: [[TMP10:%.*]] = xor <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, zeroinitializer)
 ; TFALWAYS-NEXT: [[TMP11:%.*]] = extractelement <vscale x 2 x i1> [[TMP10]], i32 0
@@ -546,19 +733,19 @@ define void @test_widen_optmask(ptr noalias %a, ptr readnone %b) #4 {
 ; TFFALLBACK-NEXT: [[N_RND_UP:%.*]] = add i64 1025, [[TMP4]]
 ; TFFALLBACK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; TFFALLBACK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; TFFALLBACK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; TFFALLBACK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2
+; TFFALLBACK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; TFFALLBACK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 2
 ; TFFALLBACK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 1025)
 ; TFFALLBACK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; TFFALLBACK: vector.body:
 ; TFFALLBACK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; TFFALLBACK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
-; TFFALLBACK-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[INDEX]]
-; TFFALLBACK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP5]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison)
-; TFFALLBACK-NEXT: [[TMP6:%.*]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> [[WIDE_MASKED_LOAD]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
-; TFFALLBACK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
-; TFFALLBACK-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP6]], ptr [[TMP7]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
-; TFFALLBACK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]]
+; TFFALLBACK-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[INDEX]]
+; TFFALLBACK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP7]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison)
+; TFFALLBACK-NEXT: [[TMP8:%.*]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> [[WIDE_MASKED_LOAD]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; TFFALLBACK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
+; TFFALLBACK-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP8]], ptr [[TMP9]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; TFFALLBACK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
 ; TFFALLBACK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 1025)
 ; TFFALLBACK-NEXT: [[TMP10:%.*]] = xor <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, zeroinitializer)
 ; TFFALLBACK-NEXT: [[TMP11:%.*]] = extractelement <vscale x 2 x i1> [[TMP10]], i32 0
@@ -566,6 +753,54 @@ define void @test_widen_optmask(ptr noalias %a, ptr readnone %b) #4 {
 ; TFFALLBACK: for.cond.cleanup:
 ; TFFALLBACK-NEXT: ret void
 ;
+; TFA_INTERLEAVE-LABEL: @test_widen_optmask(
+; TFA_INTERLEAVE-NEXT: entry:
+; TFA_INTERLEAVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; TFA_INTERLEAVE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; TFA_INTERLEAVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; TFA_INTERLEAVE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; TFA_INTERLEAVE-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1
+; TFA_INTERLEAVE-NEXT: [[N_RND_UP:%.*]] = add i64 1025, [[TMP4]]
+; TFA_INTERLEAVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
+; TFA_INTERLEAVE-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; TFA_INTERLEAVE-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; TFA_INTERLEAVE-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4
+; TFA_INTERLEAVE-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; TFA_INTERLEAVE-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2
+; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 1025)
+; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP8]], i64 1025)
+; TFA_INTERLEAVE-NEXT: br label [[VECTOR_BODY:%.*]]
+; TFA_INTERLEAVE: vector.body:
+; TFA_INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT4:%.*]], [[VECTOR_BODY]] ]
+; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY1]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT5:%.*]], [[VECTOR_BODY]] ]
+; TFA_INTERLEAVE-NEXT: [[TMP9:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[INDEX]]
+; TFA_INTERLEAVE-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
+; TFA_INTERLEAVE-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 2
+; TFA_INTERLEAVE-NEXT: [[TMP12:%.*]] = getelementptr i64, ptr [[TMP9]], i64 [[TMP11]]
+; TFA_INTERLEAVE-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP9]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison)
+; TFA_INTERLEAVE-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP12]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]], <vscale x 2 x i64> poison)
+; TFA_INTERLEAVE-NEXT: [[TMP13:%.*]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> [[WIDE_MASKED_LOAD]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; TFA_INTERLEAVE-NEXT: [[TMP14:%.*]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> [[WIDE_MASKED_LOAD3]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]])
+; TFA_INTERLEAVE-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
+; TFA_INTERLEAVE-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
+; TFA_INTERLEAVE-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 2
+; TFA_INTERLEAVE-NEXT: [[TMP18:%.*]] = getelementptr inbounds i64, ptr [[TMP15]], i64 [[TMP17]]
+; TFA_INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP13]], ptr [[TMP15]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; TFA_INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP14]], ptr [[TMP18]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]])
+; TFA_INTERLEAVE-NEXT: [[INDEX_NEXT:%.*]] = add i64 [[INDEX]], [[TMP6]]
+; TFA_INTERLEAVE-NEXT: [[INDEX_NEXT4]] = add i64 [[INDEX]], [[TMP6]]
+; TFA_INTERLEAVE-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
+; TFA_INTERLEAVE-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 2
+; TFA_INTERLEAVE-NEXT: [[TMP21:%.*]] = add i64 [[INDEX_NEXT]], [[TMP20]]
+; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 1025)
+; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT5]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP21]], i64 1025)
+; TFA_INTERLEAVE-NEXT: [[TMP22:%.*]] = xor <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, zeroinitializer)
+; TFA_INTERLEAVE-NEXT: [[TMP23:%.*]] = extractelement <vscale x 2 x i1> [[TMP22]], i32 0
+; TFA_INTERLEAVE-NEXT: br i1 [[TMP23]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; TFA_INTERLEAVE: for.cond.cleanup:
+; TFA_INTERLEAVE-NEXT: ret void
+;
 entry:
   br label %for.body
 
@@ -599,30 +834,30 @@ define double @test_widen_fmuladd_and_call(ptr noalias %a, ptr readnone %b, doub
 ; TFNONE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2
 ; TFNONE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP3]]
 ; TFNONE-NEXT: [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]]
-; TFNONE-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; TFNONE-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 2
+; TFNONE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; TFNONE-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2
 ; TFNONE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x double> poison, double [[M:%.*]], i64 0
 ; TFNONE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x double> [[BROADCAST_SPLATINSERT]], <vscale x 2 x double> poison, zeroinitializer
 ; TFNONE-NEXT: br label [[VECTOR_BODY:%.*]]
 ; TFNONE: vector.body:
 ; TFNONE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; TFNONE-NEXT: [[VEC_PHI:%.*]] = phi double [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
-; TFNONE-NEXT: [[TMP4:%.*]] = getelementptr double, ptr [[B:%.*]], i64 [[INDEX]]
-; TFNONE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x double>, ptr [[TMP4]], align 8
-; TFNONE-NEXT: [[TMP5:%.*]] = fmul <vscale x 2 x double> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
-; TFNONE-NEXT: [[TMP6:%.*]] = fptoui <vscale x 2 x double> [[WIDE_LOAD]] to <vscale x 2 x i64>
-; TFNONE-NEXT: [[TMP7:%.*]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> [[TMP6]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, zeroinitializer))
-; TFNONE-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
-; TFNONE-NEXT: store <vscale x 2 x i64> [[TMP7]], ptr [[TMP8]], align 8
-; TFNONE-NEXT: [[TMP9]] = call double @llvm.vector.reduce.fadd.nxv2f64(double [[VEC_PHI]], <vscale x 2 x double> [[TMP5]])
-; TFNONE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]]
+; TFNONE-NEXT: [[VEC_PHI:%.*]] = phi double [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
+; TFNONE-NEXT: [[TMP6:%.*]] = getelementptr double, ptr [[B:%.*]], i64 [[INDEX]]
+; TFNONE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x double>, ptr [[TMP6]], align 8
+; TFNONE-NEXT: [[TMP7:%.*]] = fmul <vscale x 2 x double> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; TFNONE-NEXT: [[TMP8:%.*]] = fptoui <vscale x 2 x double> [[WIDE_LOAD]] to <vscale x 2 x i64>
+; TFNONE-NEXT: [[TMP9:%.*]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> [[TMP8]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, zeroinitializer))
+; TFNONE-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
+; TFNONE-NEXT: store <vscale x 2 x i64> [[TMP9]], ptr [[TMP10]], align 8
+; TFNONE-NEXT: [[TMP11]] = call double @llvm.vector.reduce.fadd.nxv2f64(double [[VEC_PHI]], <vscale x 2 x double> [[TMP7]])
+; TFNONE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
 ; TFNONE-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; TFNONE-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; TFNONE: middle.block:
 ; TFNONE-NEXT: br i1 false, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
 ; TFNONE: scalar.ph:
 ; TFNONE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; TFNONE-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ]
+; TFNONE-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
 ; TFNONE-NEXT: br label [[FOR_BODY:%.*]]
 ; TFNONE: for.body:
 ; TFNONE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
@@ -638,7 +873,7 @@ define double @test_widen_fmuladd_and_call(ptr noalias %a, ptr readnone %b, doub
 ; TFNONE-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1025
 ; TFNONE-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
 ; TFNONE: for.cond.cleanup:
-; TFNONE-NEXT: [[MULADD_LCSSA:%.*]] = phi double [ [[MULADD]], [[FOR_BODY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ]
+; TFNONE-NEXT: [[MULADD_LCSSA:%.*]] = phi double [ [[MULADD]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
 ; TFNONE-NEXT: ret double [[MULADD_LCSSA]]
 ;
 ; TFALWAYS-LABEL: @test_widen_fmuladd_and_call(
 ; TFALWAYS-NEXT: entry:
 ; TFALWAYS-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1
 ; TFALWAYS-NEXT: [[N_RND_UP:%.*]] = add i64 1025, [[TMP4]]
 ; TFALWAYS-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; TFALWAYS-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; TFALWAYS-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
-; TFALWAYS-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 2
+; TFALWAYS-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; TFALWAYS-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 2
 ; TFALWAYS-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 1025)
 ; TFALWAYS-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x double> poison, double [[M:%.*]], i64 0
 ; TFALWAYS-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x double> [[BROADCAST_SPLATINSERT]], <vscale x 2 x double> poison, zeroinitializer
@@ 
-660,23 +895,23 @@ define double @test_widen_fmuladd_and_call(ptr noalias %a, ptr readnone %b, doub
 ; TFALWAYS: vector.body:
 ; TFALWAYS-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; TFALWAYS-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
-; TFALWAYS-NEXT: [[VEC_PHI:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
-; TFALWAYS-NEXT: [[TMP5:%.*]] = getelementptr double, ptr [[B:%.*]], i64 [[INDEX]]
-; TFALWAYS-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr [[TMP5]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x double> poison)
-; TFALWAYS-NEXT: [[TMP6:%.*]] = fmul <vscale x 2 x double> [[WIDE_MASKED_LOAD]], [[BROADCAST_SPLAT]]
-; TFALWAYS-NEXT: [[TMP7:%.*]] = fptoui <vscale x 2 x double> [[WIDE_MASKED_LOAD]] to <vscale x 2 x i64>
-; TFALWAYS-NEXT: [[TMP8:%.*]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> [[TMP7]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
-; TFALWAYS-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
-; TFALWAYS-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP8]], ptr [[TMP9]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
-; TFALWAYS-NEXT: [[TMP10:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x double> [[TMP6]], <vscale x 2 x double> shufflevector (<vscale x 2 x double> insertelement (<vscale x 2 x double> poison, double -0.000000e+00, i64 0), <vscale x 2 x double> poison, zeroinitializer)
-; TFALWAYS-NEXT: [[TMP11]] = call double @llvm.vector.reduce.fadd.nxv2f64(double [[VEC_PHI]], <vscale x 2 x double> [[TMP10]])
-; TFALWAYS-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP13]]
+; TFALWAYS-NEXT: [[VEC_PHI:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
+; TFALWAYS-NEXT: [[TMP7:%.*]] = getelementptr double, ptr [[B:%.*]], i64 [[INDEX]]
+; TFALWAYS-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr [[TMP7]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x double> poison)
+; TFALWAYS-NEXT: [[TMP8:%.*]] = fmul <vscale x 2 x double> [[WIDE_MASKED_LOAD]], [[BROADCAST_SPLAT]]
+; TFALWAYS-NEXT: [[TMP9:%.*]] = fptoui <vscale x 2 x double> [[WIDE_MASKED_LOAD]] to <vscale x 2 x i64>
+; TFALWAYS-NEXT: [[TMP10:%.*]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> [[TMP9]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; TFALWAYS-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
+; TFALWAYS-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP10]], ptr [[TMP11]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; TFALWAYS-NEXT: [[TMP12:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x double> [[TMP8]], <vscale x 2 x double> shufflevector (<vscale x 2 x double> insertelement (<vscale x 2 x double> poison, double -0.000000e+00, i64 0), <vscale x 2 x double> poison, zeroinitializer)
+; TFALWAYS-NEXT: [[TMP13]] = call double @llvm.vector.reduce.fadd.nxv2f64(double [[VEC_PHI]], <vscale x 2 x double> [[TMP12]])
+; TFALWAYS-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
 ; TFALWAYS-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 1025)
 ; TFALWAYS-NEXT: [[TMP14:%.*]] = xor <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, zeroinitializer)
 ; TFALWAYS-NEXT: [[TMP15:%.*]] = extractelement <vscale x 2 x i1> [[TMP14]], i32 0
 ; TFALWAYS-NEXT: br i1 [[TMP15]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; TFALWAYS: for.cond.cleanup:
-; TFALWAYS-NEXT: ret double [[TMP11]]
+; TFALWAYS-NEXT: ret double [[TMP13]]
 ;
 ; TFFALLBACK-LABEL: @test_widen_fmuladd_and_call(
 ; TFFALLBACK-NEXT: entry:
@@ -688,8 +923,8 @@ define double @test_widen_fmuladd_and_call(ptr noalias %a, ptr readnone %b, doub
 ; TFFALLBACK-NEXT: [[N_RND_UP:%.*]] = add i64 1025, [[TMP4]]
 ; TFFALLBACK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; TFFALLBACK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; TFFALLBACK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
-; TFFALLBACK-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 2
+; TFFALLBACK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; TFFALLBACK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 2
 ; TFFALLBACK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 1025)
 ; TFFALLBACK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x double> poison, double [[M:%.*]], i64 0
 ; TFFALLBACK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x double> [[BROADCAST_SPLATINSERT]], <vscale x 2 x double> poison, zeroinitializer
@@ -697,23 +932,82 @@ define double @test_widen_fmuladd_and_call(ptr noalias %a, ptr readnone %b, doub
 ; TFFALLBACK: vector.body:
 ; TFFALLBACK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; TFFALLBACK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
-; TFFALLBACK-NEXT: [[VEC_PHI:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
-; TFFALLBACK-NEXT: [[TMP5:%.*]] = getelementptr double, ptr [[B:%.*]], i64 [[INDEX]]
-; TFFALLBACK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr [[TMP5]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x double> poison)
-; TFFALLBACK-NEXT: [[TMP6:%.*]] = fmul <vscale x 2 x double> [[WIDE_MASKED_LOAD]], [[BROADCAST_SPLAT]]
-; TFFALLBACK-NEXT: [[TMP7:%.*]] = fptoui <vscale x 2 x double> [[WIDE_MASKED_LOAD]] to <vscale x 2 x i64>
-; TFFALLBACK-NEXT: [[TMP8:%.*]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> [[TMP7]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
-; TFFALLBACK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
-; TFFALLBACK-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP8]], ptr [[TMP9]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
-; TFFALLBACK-NEXT: [[TMP10:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x double> [[TMP6]], <vscale x 2 x double> shufflevector (<vscale x 2 x double> insertelement (<vscale x 2 x double> poison, double -0.000000e+00, i64 0), <vscale x 2 x double> poison, zeroinitializer)
-; TFFALLBACK-NEXT: [[TMP11]] = call double @llvm.vector.reduce.fadd.nxv2f64(double [[VEC_PHI]], <vscale x 2 x double> [[TMP10]])
-; TFFALLBACK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP13]]
+; TFFALLBACK-NEXT: [[VEC_PHI:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
+; TFFALLBACK-NEXT: [[TMP7:%.*]] = getelementptr double, ptr [[B:%.*]], i64 [[INDEX]]
+; TFFALLBACK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr [[TMP7]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x double> poison)
+; TFFALLBACK-NEXT: [[TMP8:%.*]] = fmul <vscale x 2 x double> [[WIDE_MASKED_LOAD]], [[BROADCAST_SPLAT]]
+; TFFALLBACK-NEXT: [[TMP9:%.*]] = fptoui <vscale x 2 x double> [[WIDE_MASKED_LOAD]] to <vscale x 2 x i64>
+; TFFALLBACK-NEXT: [[TMP10:%.*]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> [[TMP9]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; TFFALLBACK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
+; TFFALLBACK-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP10]], ptr [[TMP11]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; TFFALLBACK-NEXT: [[TMP12:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x double> [[TMP8]], <vscale x 2 x double> shufflevector (<vscale x 2 x double> insertelement (<vscale x 2 x double> poison, double -0.000000e+00, i64 0), <vscale x 2 x double> poison, zeroinitializer)
+; TFFALLBACK-NEXT: [[TMP13]] = call double @llvm.vector.reduce.fadd.nxv2f64(double [[VEC_PHI]], <vscale x 2 x double> [[TMP12]])
+; TFFALLBACK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
 ; TFFALLBACK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 1025)
 ; TFFALLBACK-NEXT: [[TMP14:%.*]] = xor <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, zeroinitializer)
 ; TFFALLBACK-NEXT: [[TMP15:%.*]] = extractelement <vscale x 2 x i1> [[TMP14]], i32 0
 ; TFFALLBACK-NEXT: br i1 [[TMP15]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; TFFALLBACK: for.cond.cleanup:
-; TFFALLBACK-NEXT: ret double [[TMP11]]
+; TFFALLBACK-NEXT: ret double [[TMP13]]
+;
+; TFA_INTERLEAVE-LABEL: @test_widen_fmuladd_and_call(
+; TFA_INTERLEAVE-NEXT: entry:
+; TFA_INTERLEAVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; TFA_INTERLEAVE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; TFA_INTERLEAVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; TFA_INTERLEAVE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; TFA_INTERLEAVE-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1
+; TFA_INTERLEAVE-NEXT: [[N_RND_UP:%.*]] = add i64 1025, [[TMP4]]
+; TFA_INTERLEAVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
+; TFA_INTERLEAVE-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; TFA_INTERLEAVE-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; TFA_INTERLEAVE-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4
+; TFA_INTERLEAVE-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; TFA_INTERLEAVE-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2
+; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 1025)
+; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP8]], i64 1025)
+; TFA_INTERLEAVE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x double> poison, double [[M:%.*]], i64 0
+; TFA_INTERLEAVE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x double> [[BROADCAST_SPLATINSERT]], <vscale x 2 x double> poison, zeroinitializer
+; TFA_INTERLEAVE-NEXT: br label [[VECTOR_BODY:%.*]]
+; TFA_INTERLEAVE: vector.body:
+; TFA_INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT4:%.*]], [[VECTOR_BODY]] ]
+; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY1]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT5:%.*]], [[VECTOR_BODY]] ]
+; TFA_INTERLEAVE-NEXT: [[VEC_PHI:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[TMP26:%.*]], [[VECTOR_BODY]] ]
+; TFA_INTERLEAVE-NEXT: [[TMP9:%.*]] = getelementptr double, ptr [[B:%.*]], i64 [[INDEX]]
+; TFA_INTERLEAVE-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
+; TFA_INTERLEAVE-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 2
+; TFA_INTERLEAVE-NEXT: [[TMP12:%.*]] = getelementptr double, ptr [[TMP9]], i64 [[TMP11]]
+; TFA_INTERLEAVE-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr [[TMP9]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x double> poison)
+; TFA_INTERLEAVE-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr [[TMP12]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]], <vscale x 2 x double> poison)
+; TFA_INTERLEAVE-NEXT: [[TMP13:%.*]] = fmul <vscale x 2 x double> [[WIDE_MASKED_LOAD]], [[BROADCAST_SPLAT]]
+; TFA_INTERLEAVE-NEXT: [[TMP14:%.*]] = fmul <vscale x 2 x double> [[WIDE_MASKED_LOAD3]], [[BROADCAST_SPLAT]]
+; TFA_INTERLEAVE-NEXT: [[TMP15:%.*]] = fptoui <vscale x 2 x double> [[WIDE_MASKED_LOAD]] to <vscale x 2 x i64>
+; TFA_INTERLEAVE-NEXT: [[TMP16:%.*]] = fptoui <vscale x 2 x double> [[WIDE_MASKED_LOAD3]] to <vscale x 2 x i64>
+; TFA_INTERLEAVE-NEXT: [[TMP17:%.*]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> [[TMP15]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; TFA_INTERLEAVE-NEXT: [[TMP18:%.*]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> [[TMP16]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]])
+; TFA_INTERLEAVE-NEXT: [[TMP19:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
+; TFA_INTERLEAVE-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
+; TFA_INTERLEAVE-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 2
+; TFA_INTERLEAVE-NEXT: [[TMP22:%.*]] = getelementptr inbounds i64, ptr [[TMP19]], i64 [[TMP21]]
+; TFA_INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP17]], ptr [[TMP19]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; TFA_INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP18]], ptr [[TMP22]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]])
+; TFA_INTERLEAVE-NEXT: [[TMP23:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x double> [[TMP13]], <vscale x 2 x double> shufflevector (<vscale x 2 x double> insertelement (<vscale x 2 x double> poison, double -0.000000e+00, i64 0), <vscale x 2 x double> poison, zeroinitializer)
+; TFA_INTERLEAVE-NEXT: [[TMP24:%.*]] = call double @llvm.vector.reduce.fadd.nxv2f64(double [[VEC_PHI]], <vscale x 2 x double> [[TMP23]])
+; TFA_INTERLEAVE-NEXT: [[TMP25:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]], <vscale x 2 x double> [[TMP14]], <vscale x 2 x double> shufflevector (<vscale x 2 x double> insertelement (<vscale x 2 x double> poison, double -0.000000e+00, i64 0), <vscale x 2 x double> poison, zeroinitializer)
+; TFA_INTERLEAVE-NEXT: [[TMP26]] = call double @llvm.vector.reduce.fadd.nxv2f64(double [[TMP24]], <vscale x 2 x double> [[TMP25]])
+; TFA_INTERLEAVE-NEXT: [[INDEX_NEXT:%.*]] = add i64 [[INDEX]], [[TMP6]]
+; TFA_INTERLEAVE-NEXT: [[INDEX_NEXT4]] = add i64 [[INDEX]], [[TMP6]]
+; TFA_INTERLEAVE-NEXT: [[TMP27:%.*]] = call i64 @llvm.vscale.i64()
+; TFA_INTERLEAVE-NEXT: [[TMP28:%.*]] = mul i64 [[TMP27]], 2
+; TFA_INTERLEAVE-NEXT: [[TMP29:%.*]] = add i64 [[INDEX_NEXT]], [[TMP28]]
+; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 1025)
+; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT5]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP29]], i64 1025)
+; TFA_INTERLEAVE-NEXT: [[TMP30:%.*]] = xor <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, zeroinitializer)
+; TFA_INTERLEAVE-NEXT: [[TMP31:%.*]] = extractelement <vscale x 2 x i1> [[TMP30]], i32 0
+; TFA_INTERLEAVE-NEXT: br i1 [[TMP31]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; TFA_INTERLEAVE: for.cond.cleanup:
+; TFA_INTERLEAVE-NEXT: ret double [[TMP26]]
+;
 entry:
   br label %for.body
 
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/uniform-args-call-variants.ll b/llvm/test/Transforms/LoopVectorize/AArch64/uniform-args-call-variants.ll
index bda20ae18bc9..17edfe513dd0 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/uniform-args-call-variants.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/uniform-args-call-variants.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
 ; RUN: opt < %s -passes=loop-vectorize,simplifycfg,instcombine -force-vector-interleave=1 -prefer-predicate-over-epilogue=predicate-dont-vectorize -S | FileCheck %s
+; RUN: opt < %s -passes=loop-vectorize,simplifycfg,instcombine -force-vector-interleave=2 -prefer-predicate-over-epilogue=predicate-dont-vectorize -S | FileCheck %s --check-prefix=INTERLEAVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -9,28 +10,73 @@ define void @test_uniform(ptr noalias %dst, ptr readonly %src, i64 %uniform , i6
 ; CHECK-LABEL: define void @test_uniform
 ; CHECK-SAME: (ptr noalias [[DST:%.*]], ptr readonly [[SRC:%.*]], i64 [[UNIFORM:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 1
 ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 1
-; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[N]], i64 [[TMP1]])
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 1
+; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[N]], i64 [[TMP3]])
 ; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call 
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr double, ptr [[SRC]], i64 [[INDEX]]
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr [[TMP3]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x double> poison)
-; CHECK-NEXT: [[TMP4:%.*]] = call <vscale x 2 x double> @foo_uniform(<vscale x 2 x double> [[WIDE_MASKED_LOAD]], i64 [[UNIFORM]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[INDEX]]
-; CHECK-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP4]], ptr [[TMP5]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP7]]
-; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[TMP2]])
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr double, ptr [[SRC]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr [[TMP5]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x double> poison)
+; CHECK-NEXT: [[TMP6:%.*]] = call <vscale x 2 x double> @foo_uniform(<vscale x 2 x double> [[WIDE_MASKED_LOAD]], i64 [[UNIFORM]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[INDEX]]
+; CHECK-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP6]], ptr [[TMP7]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[TMP4]])
 ; CHECK-NEXT: [[TMP8:%.*]] = extractelement <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
 ; CHECK-NEXT: br i1 [[TMP8]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP:%.*]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: for.cond.cleanup:
 ; CHECK-NEXT: ret void
 ;
+; INTERLEAVE-LABEL: define void @test_uniform
+; INTERLEAVE-SAME: (ptr noalias [[DST:%.*]], ptr readonly [[SRC:%.*]], i64 [[UNIFORM:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; INTERLEAVE-NEXT: entry:
+; INTERLEAVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; INTERLEAVE-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 2
+; INTERLEAVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; INTERLEAVE-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 2
+; INTERLEAVE-NEXT: [[TMP4:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[N]], i64 [[TMP3]])
+; INTERLEAVE-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; INTERLEAVE-NEXT: [[TMP6:%.*]] = shl i64 [[TMP5]], 2
+; INTERLEAVE-NEXT: [[TMP7:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[N]], i64 [[TMP6]])
+; INTERLEAVE-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; INTERLEAVE-NEXT: [[TMP9:%.*]] = shl i64 [[TMP8]], 1
+; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[N]])
+; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP9]], i64 [[N]])
+; INTERLEAVE-NEXT: br label [[VECTOR_BODY:%.*]]
+; INTERLEAVE: vector.body:
+; INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY1]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT4:%.*]], [[VECTOR_BODY]] ]
+; INTERLEAVE-NEXT: [[TMP10:%.*]] = getelementptr double, ptr [[SRC]], i64 [[INDEX]]
+; INTERLEAVE-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
+; INTERLEAVE-NEXT: [[TMP12:%.*]] = shl i64 [[TMP11]], 1
+; INTERLEAVE-NEXT: [[TMP13:%.*]] = getelementptr double, ptr [[TMP10]], i64 [[TMP12]]
+; INTERLEAVE-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr [[TMP10]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x double> poison)
+; INTERLEAVE-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr [[TMP13]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]], <vscale x 2 x double> poison)
+; INTERLEAVE-NEXT: [[TMP14:%.*]] = call <vscale x 2 x double> @foo_uniform(<vscale x 2 x double> [[WIDE_MASKED_LOAD]], i64 [[UNIFORM]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; INTERLEAVE-NEXT: [[TMP15:%.*]] = call <vscale x 2 x double> @foo_uniform(<vscale x 2 x double> [[WIDE_MASKED_LOAD3]], i64 [[UNIFORM]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]])
+; INTERLEAVE-NEXT: [[TMP16:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[INDEX]]
+; INTERLEAVE-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
+; INTERLEAVE-NEXT: [[TMP18:%.*]] = shl i64 [[TMP17]], 1
+; INTERLEAVE-NEXT: [[TMP19:%.*]] = getelementptr inbounds double, ptr [[TMP16]], i64 [[TMP18]]
+; INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP14]], ptr [[TMP16]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP15]], ptr [[TMP19]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]])
+; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
+; INTERLEAVE-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
+; INTERLEAVE-NEXT: [[TMP21:%.*]] = shl i64 [[TMP20]], 1
+; INTERLEAVE-NEXT: [[TMP22:%.*]] = add i64 [[INDEX]], [[TMP21]]
+; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[TMP4]])
+; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT4]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP22]], i64 [[TMP7]])
+; INTERLEAVE-NEXT: [[TMP23:%.*]] = extractelement <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+; INTERLEAVE-NEXT: br i1 [[TMP23]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP:%.*]], !llvm.loop [[LOOP0:![0-9]+]]
+; INTERLEAVE: for.cond.cleanup:
+; INTERLEAVE-NEXT: ret void
+;
 entry:
 br label %for.body
@@ -53,28 +99,73 @@ define void @test_uniform_smaller_scalar(ptr noalias %dst, ptr readonly %src, i3
 ; CHECK-LABEL: define void @test_uniform_smaller_scalar
 ; CHECK-SAME: (ptr noalias [[DST:%.*]], ptr readonly [[SRC:%.*]], i32 [[UNIFORM:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 1
 ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 1
-; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[N]], i64 [[TMP1]])
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 1
+; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[N]], i64 [[TMP3]])
 ; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[N]])
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr double, ptr [[SRC]], i64 [[INDEX]]
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr [[TMP3]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x double> poison)
-; CHECK-NEXT: [[TMP4:%.*]] = call <vscale x 2 x double> @bar_uniform(<vscale x 2 x double> [[WIDE_MASKED_LOAD]], i32 [[UNIFORM]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[INDEX]]
-; CHECK-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP4]], ptr [[TMP5]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP7]]
-; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[TMP2]])
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr double, ptr [[SRC]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr [[TMP5]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x double> poison)
+; CHECK-NEXT: [[TMP6:%.*]] = call <vscale x 2 x double> @bar_uniform(<vscale x 2 x double> [[WIDE_MASKED_LOAD]], i32 [[UNIFORM]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[INDEX]]
+; CHECK-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP6]], ptr [[TMP7]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[TMP4]])
 ; CHECK-NEXT: [[TMP8:%.*]] = extractelement <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
 ; CHECK-NEXT: br i1 [[TMP8]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP:%.*]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK: for.cond.cleanup:
 ; CHECK-NEXT: ret void
 ;
+; INTERLEAVE-LABEL: define void @test_uniform_smaller_scalar
+; INTERLEAVE-SAME: (ptr noalias [[DST:%.*]], ptr readonly [[SRC:%.*]], i32 [[UNIFORM:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; INTERLEAVE-NEXT: entry:
+; INTERLEAVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; INTERLEAVE-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 2
+; INTERLEAVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; INTERLEAVE-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 2
+; INTERLEAVE-NEXT: [[TMP4:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[N]], i64 [[TMP3]])
+; INTERLEAVE-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; INTERLEAVE-NEXT: [[TMP6:%.*]] = shl i64 [[TMP5]], 2
+; INTERLEAVE-NEXT: [[TMP7:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[N]], i64 [[TMP6]])
+; INTERLEAVE-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; INTERLEAVE-NEXT: [[TMP9:%.*]] = shl i64 [[TMP8]], 1
+; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[N]])
+; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP9]], i64 [[N]])
+; INTERLEAVE-NEXT: br label [[VECTOR_BODY:%.*]]
+; INTERLEAVE: vector.body:
+; INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY1]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT4:%.*]], [[VECTOR_BODY]] ]
+; INTERLEAVE-NEXT: [[TMP10:%.*]] = getelementptr double, ptr [[SRC]], i64 [[INDEX]]
+; INTERLEAVE-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
+; INTERLEAVE-NEXT: [[TMP12:%.*]] = shl i64 [[TMP11]], 1
+; INTERLEAVE-NEXT: [[TMP13:%.*]] = getelementptr double, ptr [[TMP10]], i64 [[TMP12]]
+; INTERLEAVE-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr [[TMP10]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x double> poison)
+; INTERLEAVE-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr [[TMP13]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]], <vscale x 2 x double> poison)
+; INTERLEAVE-NEXT: [[TMP14:%.*]] = call <vscale x 2 x double> @bar_uniform(<vscale x 2 x double> [[WIDE_MASKED_LOAD]], i32 [[UNIFORM]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; INTERLEAVE-NEXT: [[TMP15:%.*]] = call <vscale x 2 x double> @bar_uniform(<vscale x 2 x double> [[WIDE_MASKED_LOAD3]], i32 [[UNIFORM]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]])
+; INTERLEAVE-NEXT: [[TMP16:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[INDEX]]
+; INTERLEAVE-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
+; INTERLEAVE-NEXT: [[TMP18:%.*]] = shl i64 [[TMP17]], 1
+; INTERLEAVE-NEXT: [[TMP19:%.*]] = getelementptr inbounds double, ptr [[TMP16]], i64 [[TMP18]]
+; INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP14]], ptr [[TMP16]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP15]], ptr [[TMP19]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]])
+; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
+; INTERLEAVE-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
+; INTERLEAVE-NEXT: [[TMP21:%.*]] = shl i64 [[TMP20]], 1
+; INTERLEAVE-NEXT: [[TMP22:%.*]] = add i64 [[INDEX]], [[TMP21]]
+; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[TMP4]])
+; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT4]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP22]], i64 [[TMP7]])
+; INTERLEAVE-NEXT: [[TMP23:%.*]] = extractelement <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+; INTERLEAVE-NEXT: br i1 [[TMP23]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP:%.*]], !llvm.loop [[LOOP3:![0-9]+]]
+; INTERLEAVE: for.cond.cleanup:
+; INTERLEAVE-NEXT: ret void
+;
 entry:
 br label %for.body
@@ -112,6 +203,48 @@ define void @test_uniform_not_invariant(ptr noalias %dst, ptr readonly %src, i64
 ; CHECK: for.cond.cleanup:
 ; CHECK-NEXT: ret void
 ;
+; INTERLEAVE-LABEL: define void @test_uniform_not_invariant
+; INTERLEAVE-SAME: (ptr noalias [[DST:%.*]], ptr readonly [[SRC:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; INTERLEAVE-NEXT: entry:
+; INTERLEAVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[N]], i64 2)
+; INTERLEAVE-NEXT: [[TMP1:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[N]], i64 2)
+; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <1 x i1> @llvm.get.active.lane.mask.v1i1.i64(i64 0, i64 [[N]])
+; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = call <1 x i1> @llvm.get.active.lane.mask.v1i1.i64(i64 1, i64 [[N]])
+; INTERLEAVE-NEXT: br label [[VECTOR_BODY:%.*]]
+; INTERLEAVE: vector.body:
+; INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE4:%.*]] ]
+; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <1 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[PRED_STORE_CONTINUE4]] ]
+; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi <1 x i1> [ [[ACTIVE_LANE_MASK_ENTRY1]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT5:%.*]], [[PRED_STORE_CONTINUE4]] ]
+; INTERLEAVE-NEXT: [[TMP2:%.*]] = extractelement <1 x i1> [[ACTIVE_LANE_MASK]], i64 0
+; INTERLEAVE-NEXT: br i1 [[TMP2]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; INTERLEAVE: pred.store.if:
+; INTERLEAVE-NEXT: [[TMP3:%.*]] = getelementptr double, ptr [[SRC]], i64 [[INDEX]]
+; INTERLEAVE-NEXT: [[TMP4:%.*]] = load double, ptr [[TMP3]], align 8
+; INTERLEAVE-NEXT: [[TMP5:%.*]] = call double @foo(double [[TMP4]], i64 [[INDEX]]) #[[ATTR5:[0-9]+]]
+; INTERLEAVE-NEXT: [[TMP6:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[INDEX]]
+; INTERLEAVE-NEXT: store double [[TMP5]], ptr [[TMP6]], align 8
+; INTERLEAVE-NEXT: br label [[PRED_STORE_CONTINUE]]
+; INTERLEAVE: pred.store.continue:
+; INTERLEAVE-NEXT: [[TMP7:%.*]] = extractelement <1 x i1> [[ACTIVE_LANE_MASK2]], i64 0
+; INTERLEAVE-NEXT: br i1 [[TMP7]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4]]
+; INTERLEAVE: pred.store.if3:
+; INTERLEAVE-NEXT: [[TMP8:%.*]] = or disjoint i64 [[INDEX]], 1
+; INTERLEAVE-NEXT: [[TMP9:%.*]] = getelementptr double, ptr [[SRC]], i64 [[TMP8]]
+; INTERLEAVE-NEXT: [[TMP10:%.*]] = load double, ptr [[TMP9]], align 8
+; INTERLEAVE-NEXT: [[TMP11:%.*]] = call double @foo(double [[TMP10]], i64 [[TMP8]]) #[[ATTR5]]
+; INTERLEAVE-NEXT: [[TMP12:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[TMP8]]
+; INTERLEAVE-NEXT: store double [[TMP11]], ptr [[TMP12]], align 8
+; INTERLEAVE-NEXT: br label [[PRED_STORE_CONTINUE4]]
+; INTERLEAVE: pred.store.continue4:
+; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2
+; INTERLEAVE-NEXT: [[TMP13:%.*]] = or disjoint i64 [[INDEX]], 1
+; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <1 x i1> @llvm.get.active.lane.mask.v1i1.i64(i64 [[INDEX]], i64 [[TMP0]])
+; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT5]] = call <1 x i1> @llvm.get.active.lane.mask.v1i1.i64(i64 [[TMP13]], i64 [[TMP1]])
+; INTERLEAVE-NEXT: [[TMP14:%.*]] = extractelement <1 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+; INTERLEAVE-NEXT: br i1 [[TMP14]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP:%.*]], !llvm.loop [[LOOP4:![0-9]+]]
+; INTERLEAVE: for.cond.cleanup:
+; INTERLEAVE-NEXT: ret void
+;
 entry:
 br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/vector-call-linear-args.ll b/llvm/test/Transforms/LoopVectorize/AArch64/vector-call-linear-args.ll
index 16506b3a5757..29440ca17424 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/vector-call-linear-args.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/vector-call-linear-args.ll
@@ -1,7 +1,10 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter "call.*(foo|bar|baz|quux|goo)" --version 2
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter "call.*(foo|bar|baz|quux|goo)|extractelement" --version 2
 ; RUN: opt < %s -passes=loop-vectorize -force-vector-interleave=1 -S | FileCheck %s --check-prefixes=NEON
+; RUN: opt < %s -passes=loop-vectorize -force-vector-interleave=2 -S | FileCheck %s --check-prefixes=NEON_INTERLEAVE
 ; RUN: opt < %s -mattr=+sve -passes=loop-vectorize -force-vector-interleave=1 -S | FileCheck %s --check-prefixes=SVE_OR_NEON
+; RUN: opt < %s -mattr=+sve -passes=loop-vectorize -force-vector-interleave=2 -S -prefer-predicate-over-epilogue=predicate-dont-vectorize | FileCheck %s --check-prefixes=SVE_OR_NEON_INTERLEAVE
 ; RUN: opt < %s -mattr=+sve -passes=loop-vectorize -force-vector-interleave=1 -S -prefer-predicate-over-epilogue=predicate-dont-vectorize | FileCheck %s --check-prefixes=SVE_TF
+; RUN: opt < %s -mattr=+sve -passes=loop-vectorize -force-vector-interleave=2 -S -prefer-predicate-over-epilogue=predicate-dont-vectorize | FileCheck %s --check-prefixes=SVE_TF_INTERLEAVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -10,19 +13,46 @@ define void @test_linear8(ptr noalias %a, ptr readnone %b, i64 %n) {
 ; NEON-LABEL: define void @test_linear8
 ; NEON-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], i64 [[N:%.*]]) {
-; NEON: [[TMP3:%.*]] = call <2 x i64> @vec_foo_linear8_nomask_neon(ptr [[TMP2:%.*]])
+; NEON: [[TMP2:%.*]] = extractelement <2 x ptr> [[TMP1:%.*]], i32 0
+; NEON: [[TMP3:%.*]] = call <2 x i64> @vec_foo_linear8_nomask_neon(ptr [[TMP2]])
 ; NEON: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR0:[0-9]+]]
 ;
+; NEON_INTERLEAVE-LABEL: define void @test_linear8
+; NEON_INTERLEAVE-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], i64 [[N:%.*]]) {
+; NEON_INTERLEAVE: [[TMP4:%.*]] = extractelement <2 x ptr> [[TMP2:%.*]], i32 0
+; NEON_INTERLEAVE: [[TMP5:%.*]] = call <2 x i64> @vec_foo_linear8_nomask_neon(ptr [[TMP4]])
+; NEON_INTERLEAVE: [[TMP6:%.*]] = call <2 x i64> @vec_foo_linear8_nomask_neon(ptr [[TMP4]])
+; NEON_INTERLEAVE: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR0:[0-9]+]]
+;
 ; SVE_OR_NEON-LABEL: define void @test_linear8
 ; SVE_OR_NEON-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
-; SVE_OR_NEON: [[TMP15:%.*]] = call <vscale x 2 x i64> @vec_foo_linear8_nomask_sve(ptr [[TMP14:%.*]])
+; SVE_OR_NEON: [[TMP14:%.*]] = extractelement <vscale x 2 x ptr> [[TMP13:%.*]], i32 0
+; SVE_OR_NEON: [[TMP15:%.*]] = call <vscale x 2 x i64> @vec_foo_linear8_nomask_sve(ptr [[TMP14]])
 ; SVE_OR_NEON: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR2:[0-9]+]]
 ;
+; SVE_OR_NEON_INTERLEAVE-LABEL: define void @test_linear8
+; SVE_OR_NEON_INTERLEAVE-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; SVE_OR_NEON_INTERLEAVE: [[TMP33:%.*]] = extractelement <vscale x 2 x ptr> [[TMP31:%.*]], i32 0
+; SVE_OR_NEON_INTERLEAVE: [[TMP34:%.*]] = call <vscale x 2 x i64> @vec_foo_linear8_mask_sve(ptr [[TMP33]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+; SVE_OR_NEON_INTERLEAVE: [[TMP35:%.*]] = call <vscale x 2 x i64> @vec_foo_linear8_mask_sve(ptr [[TMP33]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK2:%.*]])
+; SVE_OR_NEON_INTERLEAVE: [[TMP47:%.*]] = extractelement <vscale x 2 x i1> [[TMP45:%.*]], i32 0
+; SVE_OR_NEON_INTERLEAVE: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR4:[0-9]+]]
+;
 ; SVE_TF-LABEL: define void @test_linear8
 ; SVE_TF-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
-; SVE_TF: [[TMP21:%.*]] = call <vscale x 2 x i64> @vec_foo_linear8_mask_sve(ptr [[TMP20:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+; SVE_TF: [[TMP20:%.*]] = extractelement <vscale x 2 x ptr> [[TMP19:%.*]], i32 0
+; SVE_TF: [[TMP21:%.*]] = call <vscale x 2 x i64> @vec_foo_linear8_mask_sve(ptr [[TMP20]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+; SVE_TF: [[TMP25:%.*]] = extractelement <vscale x 2 x i1> [[TMP24:%.*]], i32 0
 ; SVE_TF: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR4:[0-9]+]]
 ;
+; SVE_TF_INTERLEAVE-LABEL: define void @test_linear8
+; SVE_TF_INTERLEAVE-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; SVE_TF_INTERLEAVE: [[TMP33:%.*]] = extractelement <vscale x 2 x ptr> [[TMP31:%.*]], i32 0
+; SVE_TF_INTERLEAVE: [[TMP34:%.*]] = call <vscale x 2 x i64> @vec_foo_linear8_mask_sve(ptr [[TMP33]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+; SVE_TF_INTERLEAVE: [[TMP35:%.*]] = call <vscale x 2 x i64> @vec_foo_linear8_mask_sve(ptr [[TMP33]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK2:%.*]])
+; SVE_TF_INTERLEAVE: [[TMP47:%.*]] = extractelement <vscale x 2 x i1> [[TMP45:%.*]], i32 0
+; SVE_TF_INTERLEAVE: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR4:[0-9]+]]
+;
 entry:
 br label %for.body
@@ -43,18 +73,35 @@ define void @test_vector_linear4(ptr noalias %a, ptr readnone %b, ptr readonly %c, i64 %n) {
 ; NEON-LABEL: define void @test_vector_linear4
 ; NEON-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], ptr readonly [[C:%.*]], i64 [[N:%.*]]) {
-; NEON: [[TMP5:%.*]] = call <4 x i32> @vec_baz_vector_linear4_nomask_neon(<4 x i32> [[WIDE_LOAD:%.*]], ptr [[TMP4:%.*]])
+; NEON: [[TMP4:%.*]] = extractelement <4 x ptr> [[TMP3:%.*]], i32 0
+; NEON: [[TMP5:%.*]] = call <4 x i32> @vec_baz_vector_linear4_nomask_neon(<4 x i32> [[WIDE_LOAD:%.*]], ptr [[TMP4]])
 ; NEON: [[DATA:%.*]] = call i32 @baz(i32 [[INPUT:%.*]], ptr [[GEPB:%.*]]) #[[ATTR1:[0-9]+]]
 ;
+; NEON_INTERLEAVE-LABEL: define void @test_vector_linear4
+; NEON_INTERLEAVE-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], ptr readonly [[C:%.*]], i64 [[N:%.*]]) {
+; NEON_INTERLEAVE: [[TMP8:%.*]] = extractelement <4 x ptr> [[TMP6:%.*]], i32 0
+; NEON_INTERLEAVE: [[TMP9:%.*]] = call <4 x i32> @vec_baz_vector_linear4_nomask_neon(<4 x i32> [[WIDE_LOAD:%.*]], ptr [[TMP8]])
+; NEON_INTERLEAVE: [[TMP10:%.*]] = call <4 x i32> @vec_baz_vector_linear4_nomask_neon(<4 x i32> [[WIDE_LOAD2:%.*]], ptr [[TMP8]])
+; NEON_INTERLEAVE: [[DATA:%.*]] = call i32 @baz(i32 [[INPUT:%.*]], ptr [[GEPB:%.*]]) #[[ATTR1:[0-9]+]]
+;
 ; SVE_OR_NEON-LABEL: define void @test_vector_linear4
 ; SVE_OR_NEON-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], ptr readonly [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
-; SVE_OR_NEON: [[TMP17:%.*]] = call <vscale x 4 x i32> @vec_baz_vector_linear4_nomask_sve(<vscale x 4 x i32> [[WIDE_LOAD:%.*]], ptr [[TMP16:%.*]])
+; SVE_OR_NEON: [[TMP16:%.*]] = extractelement <vscale x 4 x ptr> [[TMP15:%.*]], i32 0
+; SVE_OR_NEON: [[TMP17:%.*]] = call <vscale x 4 x i32> @vec_baz_vector_linear4_nomask_sve(<vscale x 4 x i32> [[WIDE_LOAD:%.*]], ptr [[TMP16]])
 ; SVE_OR_NEON: [[DATA:%.*]] = call i32 @baz(i32 [[INPUT:%.*]], ptr [[GEPB:%.*]]) #[[ATTR3:[0-9]+]]
 ;
+; SVE_OR_NEON_INTERLEAVE-LABEL: define void @test_vector_linear4
+; SVE_OR_NEON_INTERLEAVE-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], ptr readonly [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; SVE_OR_NEON_INTERLEAVE: [[DATA:%.*]] = call i32 @baz(i32 [[INPUT:%.*]], ptr [[GEPB:%.*]]) #[[ATTR5:[0-9]+]]
+;
 ; SVE_TF-LABEL: define void @test_vector_linear4
 ; SVE_TF-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], ptr readonly [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
 ; SVE_TF: [[DATA:%.*]] = call i32 @baz(i32 [[INPUT:%.*]], ptr [[GEPB:%.*]]) #[[ATTR5:[0-9]+]]
 ;
+; SVE_TF_INTERLEAVE-LABEL: define void @test_vector_linear4
+; SVE_TF_INTERLEAVE-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], ptr readonly [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; SVE_TF_INTERLEAVE: [[DATA:%.*]] = call i32 @baz(i32 [[INPUT:%.*]], ptr [[GEPB:%.*]]) #[[ATTR5:[0-9]+]]
+;
 entry:
 br label %for.body
@@ -79,14 +126,28 @@ define void @test_linear8_bad_stride(ptr noalias %a, ptr readnone %b, i64 %n) {
 ; NEON-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], i64 [[N:%.*]]) {
 ; NEON: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR2:[0-9]+]]
 ;
+; NEON_INTERLEAVE-LABEL: define void @test_linear8_bad_stride
+; NEON_INTERLEAVE-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], i64 [[N:%.*]]) {
+; NEON_INTERLEAVE: [[TMP4:%.*]] = call i64 @foo(ptr [[TMP2:%.*]]) #[[ATTR2:[0-9]+]]
+; NEON_INTERLEAVE: [[TMP5:%.*]] = call i64 @foo(ptr [[TMP3:%.*]]) #[[ATTR2]]
+; NEON_INTERLEAVE: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR2]]
+;
 ; SVE_OR_NEON-LABEL: define void @test_linear8_bad_stride
 ; SVE_OR_NEON-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
 ; SVE_OR_NEON: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR4:[0-9]+]]
 ;
+; SVE_OR_NEON_INTERLEAVE-LABEL: define void @test_linear8_bad_stride
+; SVE_OR_NEON_INTERLEAVE-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; SVE_OR_NEON_INTERLEAVE: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR6:[0-9]+]]
+;
 ; SVE_TF-LABEL: define void @test_linear8_bad_stride
 ; SVE_TF-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
 ; SVE_TF: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR6:[0-9]+]]
 ;
+; SVE_TF_INTERLEAVE-LABEL: define void @test_linear8_bad_stride
+; SVE_TF_INTERLEAVE-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; SVE_TF_INTERLEAVE: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR6:[0-9]+]]
+;
 entry:
 br label %for.body
@@ -107,18 +168,35 @@ define void @test_linear16_wide_stride(ptr noalias %a, ptr readnone %b, i64 %n) {
 ; NEON-LABEL: define void @test_linear16_wide_stride
 ; NEON-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], i64 [[N:%.*]]) {
-; NEON: [[TMP4:%.*]] = call <2 x i64> @vec_foo_linear16_nomask_neon(ptr [[TMP3:%.*]])
+; NEON: [[TMP3:%.*]] = extractelement <2 x ptr> [[TMP2:%.*]], i32 0
+; NEON: [[TMP4:%.*]] = call <2 x i64> @vec_foo_linear16_nomask_neon(ptr [[TMP3]])
 ; NEON: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR2]]
 ;
+; NEON_INTERLEAVE-LABEL: define void @test_linear16_wide_stride
+; NEON_INTERLEAVE-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], i64 [[N:%.*]]) {
+; NEON_INTERLEAVE: [[TMP6:%.*]] = extractelement <2 x ptr> [[TMP4:%.*]], i32 0
+; NEON_INTERLEAVE: [[TMP7:%.*]] = call <2 x i64> @vec_foo_linear16_nomask_neon(ptr [[TMP6]])
+; NEON_INTERLEAVE: [[TMP8:%.*]] = call <2 x i64> @vec_foo_linear16_nomask_neon(ptr [[TMP6]])
+; NEON_INTERLEAVE: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR2]]
+;
 ; SVE_OR_NEON-LABEL: define void @test_linear16_wide_stride
 ; SVE_OR_NEON-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
-; SVE_OR_NEON: [[TMP16:%.*]] = call <vscale x 2 x i64> @vec_foo_linear16_nomask_sve(ptr [[TMP15:%.*]])
+; SVE_OR_NEON: [[TMP15:%.*]] = extractelement <vscale x 2 x ptr> [[TMP14:%.*]], i32 0
+; SVE_OR_NEON: [[TMP16:%.*]] = call <vscale x 2 x i64> @vec_foo_linear16_nomask_sve(ptr [[TMP15]])
 ; SVE_OR_NEON: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR4]]
 ;
+; SVE_OR_NEON_INTERLEAVE-LABEL: define void @test_linear16_wide_stride
+; SVE_OR_NEON_INTERLEAVE-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; SVE_OR_NEON_INTERLEAVE: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR6]]
+;
 ; SVE_TF-LABEL: define void @test_linear16_wide_stride
 ; SVE_TF-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
 ; SVE_TF: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR6]]
 ;
+; SVE_TF_INTERLEAVE-LABEL: define void @test_linear16_wide_stride
+; SVE_TF_INTERLEAVE-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; SVE_TF_INTERLEAVE: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR6]]
+;
 entry:
 br label %for.body
@@ -140,19 +218,52 @@ define void @test_linear4_linear8(ptr noalias %a, ptr readnone %b, ptr readonly %c, i64 %n) {
 ; NEON-LABEL: define void @test_linear4_linear8
 ; NEON-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], ptr readonly [[C:%.*]], i64 [[N:%.*]]) {
-; NEON: [[TMP5:%.*]] = call <4 x i32> @vec_quux_linear4_linear8_nomask_neon(ptr [[TMP3:%.*]], ptr [[TMP4:%.*]])
+; NEON: [[TMP3:%.*]] = extractelement <4 x ptr> [[TMP1:%.*]], i32 0
+; NEON: [[TMP4:%.*]] = extractelement <4 x ptr> [[TMP2:%.*]], i32 0
+; NEON: [[TMP5:%.*]] = call <4 x i32> @vec_quux_linear4_linear8_nomask_neon(ptr [[TMP3]], ptr [[TMP4]])
 ; NEON: [[DATA:%.*]] = call i32 @quux(ptr [[GEPC:%.*]], ptr [[GEPB:%.*]]) #[[ATTR3:[0-9]+]]
 ;
+; NEON_INTERLEAVE-LABEL: define void @test_linear4_linear8
+; NEON_INTERLEAVE-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], ptr readonly [[C:%.*]], i64 [[N:%.*]]) {
+; NEON_INTERLEAVE: [[TMP6:%.*]] = extractelement <4 x ptr> [[TMP2:%.*]], i32 0
+; NEON_INTERLEAVE: [[TMP7:%.*]] = extractelement <4 x ptr> [[TMP4:%.*]], i32 0
+; NEON_INTERLEAVE: [[TMP8:%.*]] = call <4 x i32> @vec_quux_linear4_linear8_nomask_neon(ptr [[TMP6]], ptr [[TMP7]])
+; NEON_INTERLEAVE: [[TMP9:%.*]] = call <4 x i32> @vec_quux_linear4_linear8_nomask_neon(ptr [[TMP6]], ptr [[TMP7]])
+; NEON_INTERLEAVE: [[DATA:%.*]] = call i32 @quux(ptr [[GEPC:%.*]], ptr [[GEPB:%.*]]) #[[ATTR3:[0-9]+]]
+;
 ; SVE_OR_NEON-LABEL: define void @test_linear4_linear8
 ; SVE_OR_NEON-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], ptr readonly [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
-; SVE_OR_NEON: [[TMP17:%.*]] = call <vscale x 4 x i32> @vec_quux_linear4_linear8_mask_sve(ptr [[TMP15:%.*]], ptr [[TMP16:%.*]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
+; SVE_OR_NEON: [[TMP15:%.*]] = extractelement <vscale x 4 x ptr> [[TMP13:%.*]], i32 0
+; SVE_OR_NEON: [[TMP16:%.*]] = extractelement <vscale x 4 x ptr> [[TMP14:%.*]], i32 0
+; SVE_OR_NEON: [[TMP17:%.*]] = call <vscale x 4 x i32> @vec_quux_linear4_linear8_mask_sve(ptr [[TMP15]], ptr [[TMP16]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
 ; SVE_OR_NEON: [[DATA:%.*]] = call i32 @quux(ptr [[GEPC:%.*]], ptr [[GEPB:%.*]]) #[[ATTR5:[0-9]+]]
 ;
+; SVE_OR_NEON_INTERLEAVE-LABEL: define void @test_linear4_linear8
+; SVE_OR_NEON_INTERLEAVE-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], ptr readonly [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; SVE_OR_NEON_INTERLEAVE: [[TMP35:%.*]] = extractelement <vscale x 4 x ptr> [[TMP31:%.*]], i32 0
+; SVE_OR_NEON_INTERLEAVE: [[TMP36:%.*]] = extractelement <vscale x 4 x ptr> [[TMP33:%.*]], i32 0
+; SVE_OR_NEON_INTERLEAVE: [[TMP37:%.*]] = call <vscale x 4 x i32> @vec_quux_linear4_linear8_mask_sve(ptr [[TMP35]], ptr [[TMP36]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+; SVE_OR_NEON_INTERLEAVE: [[TMP38:%.*]] = call <vscale x 4 x i32> @vec_quux_linear4_linear8_mask_sve(ptr [[TMP35]], ptr [[TMP36]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK2:%.*]])
+; SVE_OR_NEON_INTERLEAVE: [[TMP50:%.*]] = extractelement <vscale x 4 x i1> [[TMP48:%.*]], i32 0
+; SVE_OR_NEON_INTERLEAVE: [[DATA:%.*]] = call i32 @quux(ptr [[GEPC:%.*]], ptr [[GEPB:%.*]]) #[[ATTR7:[0-9]+]]
+;
 ; SVE_TF-LABEL: define void @test_linear4_linear8
 ; SVE_TF-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], ptr readonly [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
-; SVE_TF: [[TMP23:%.*]] = call <vscale x 4 x i32> @vec_quux_linear4_linear8_mask_sve(ptr [[TMP21:%.*]], ptr [[TMP22:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+; SVE_TF: [[TMP21:%.*]] = extractelement <vscale x 4 x ptr> [[TMP19:%.*]], i32 0
+; SVE_TF: [[TMP22:%.*]] = extractelement <vscale x 4 x ptr> [[TMP20:%.*]], i32 0
+; SVE_TF: [[TMP23:%.*]] = call <vscale x 4 x i32> @vec_quux_linear4_linear8_mask_sve(ptr [[TMP21]], ptr [[TMP22]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+; SVE_TF: [[TMP27:%.*]] = extractelement <vscale x 4 x i1> [[TMP26:%.*]], i32 0
 ; SVE_TF: [[DATA:%.*]] = call i32 @quux(ptr [[GEPC:%.*]], ptr [[GEPB:%.*]]) #[[ATTR7:[0-9]+]]
 ;
+; SVE_TF_INTERLEAVE-LABEL: define void @test_linear4_linear8
+; SVE_TF_INTERLEAVE-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], ptr readonly [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; SVE_TF_INTERLEAVE: [[TMP35:%.*]] = extractelement <vscale x 4 x ptr> [[TMP31:%.*]], i32 0
+; SVE_TF_INTERLEAVE: [[TMP36:%.*]] = extractelement <vscale x 4 x ptr> [[TMP33:%.*]], i32 0
+; SVE_TF_INTERLEAVE: [[TMP37:%.*]] = call <vscale x 4 x i32> @vec_quux_linear4_linear8_mask_sve(ptr [[TMP35]], ptr [[TMP36]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+; SVE_TF_INTERLEAVE: [[TMP38:%.*]] = call <vscale x 4 x i32> @vec_quux_linear4_linear8_mask_sve(ptr [[TMP35]], ptr [[TMP36]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK2:%.*]])
+; SVE_TF_INTERLEAVE: [[TMP50:%.*]] = extractelement <vscale x 4 x i1> [[TMP48:%.*]], i32 0
+; SVE_TF_INTERLEAVE: [[DATA:%.*]] = call i32 @quux(ptr [[GEPC:%.*]], ptr [[GEPB:%.*]]) #[[ATTR7:[0-9]+]]
+;
 entry:
 br label %for.body
@@ -174,18 +285,35 @@ define void @test_linear3_non_ptr(ptr noalias %a, i64 %n) {
 ; NEON-LABEL: define void @test_linear3_non_ptr
 ; NEON-SAME: (ptr noalias [[A:%.*]], i64 [[N:%.*]]) {
-; NEON: [[TMP3:%.*]] = call <4 x i32> @vec_bar_linear3_nomask_neon(i32 [[TMP2:%.*]])
+; NEON: [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1:%.*]], i32 0
+; NEON: [[TMP3:%.*]] = call <4 x i32> @vec_bar_linear3_nomask_neon(i32 [[TMP2]])
 ; NEON: [[DATA:%.*]] = call i32 @bar(i32 [[TREBLED:%.*]]) #[[ATTR4:[0-9]+]]
 ;
+; NEON_INTERLEAVE-LABEL: define void @test_linear3_non_ptr
+; NEON_INTERLEAVE-SAME: (ptr noalias [[A:%.*]], i64 [[N:%.*]]) {
+; NEON_INTERLEAVE: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP2:%.*]], i32 0
+; NEON_INTERLEAVE: [[TMP5:%.*]] = call <4 x i32> @vec_bar_linear3_nomask_neon(i32 [[TMP4]])
+; NEON_INTERLEAVE: [[TMP6:%.*]] = call <4 x i32> @vec_bar_linear3_nomask_neon(i32 [[TMP4]])
+; NEON_INTERLEAVE: [[DATA:%.*]] = call i32 @bar(i32 [[TREBLED:%.*]]) #[[ATTR4:[0-9]+]]
+;
 ; SVE_OR_NEON-LABEL: define void @test_linear3_non_ptr
 ; SVE_OR_NEON-SAME: (ptr noalias [[A:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
-; SVE_OR_NEON: [[TMP15:%.*]] = call <vscale x 4 x i32> @vec_bar_linear3_nomask_sve(i32 [[TMP14:%.*]])
+; SVE_OR_NEON: [[TMP14:%.*]] = extractelement <vscale x 4 x i32> [[TMP13:%.*]], i32 0
+; SVE_OR_NEON: [[TMP15:%.*]] = call <vscale x 4 x i32> @vec_bar_linear3_nomask_sve(i32 [[TMP14]])
 ; SVE_OR_NEON: [[DATA:%.*]] = call i32 @bar(i32 [[TREBLED:%.*]]) #[[ATTR6:[0-9]+]]
 ;
+; SVE_OR_NEON_INTERLEAVE-LABEL: define void @test_linear3_non_ptr
+; SVE_OR_NEON_INTERLEAVE-SAME: (ptr noalias [[A:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; SVE_OR_NEON_INTERLEAVE: [[DATA:%.*]] = call i32 @bar(i32 [[TREBLED:%.*]]) #[[ATTR8:[0-9]+]]
+;
 ; SVE_TF-LABEL: define void @test_linear3_non_ptr
 ; SVE_TF-SAME: (ptr noalias [[A:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
 ; SVE_TF: [[DATA:%.*]] = call i32 @bar(i32 [[TREBLED:%.*]]) #[[ATTR8:[0-9]+]]
 ;
+; SVE_TF_INTERLEAVE-LABEL: define void @test_linear3_non_ptr
+; SVE_TF_INTERLEAVE-SAME: (ptr noalias [[A:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; SVE_TF_INTERLEAVE: [[DATA:%.*]] = call i32 @bar(i32 [[TREBLED:%.*]]) #[[ATTR8:[0-9]+]]
+;
 entry:
 br label %for.body
@@ -207,18 +335,35 @@ define void @test_linearn5_non_ptr_neg_stride(ptr noalias %a, i64 %n) {
 ; NEON-LABEL: define void @test_linearn5_non_ptr_neg_stride
 ; NEON-SAME: (ptr noalias [[A:%.*]], i64 [[N:%.*]]) {
-; NEON: [[TMP3:%.*]] = call <4 x i32> @vec_bar_linearn5_nomask_neon(i32 [[TMP2:%.*]])
+; NEON: [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1:%.*]], i32 0
+; NEON: [[TMP3:%.*]] = call <4 x i32> @vec_bar_linearn5_nomask_neon(i32 [[TMP2]])
 ; NEON: [[DATA:%.*]] = call i32 @bar(i32 [[NEGSTRIDE:%.*]]) #[[ATTR5:[0-9]+]]
 ;
+; NEON_INTERLEAVE-LABEL: define void @test_linearn5_non_ptr_neg_stride
+; NEON_INTERLEAVE-SAME: (ptr noalias [[A:%.*]], i64 [[N:%.*]]) {
+; NEON_INTERLEAVE: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP2:%.*]], i32 0
+; NEON_INTERLEAVE: [[TMP5:%.*]] = call <4 x i32> @vec_bar_linearn5_nomask_neon(i32 [[TMP4]])
+; NEON_INTERLEAVE: [[TMP6:%.*]] = call <4 x i32> @vec_bar_linearn5_nomask_neon(i32 [[TMP4]])
+; NEON_INTERLEAVE: [[DATA:%.*]] = call i32 @bar(i32 [[NEGSTRIDE:%.*]]) #[[ATTR5:[0-9]+]]
+;
 ; SVE_OR_NEON-LABEL: define void @test_linearn5_non_ptr_neg_stride
 ; SVE_OR_NEON-SAME: (ptr noalias [[A:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
-; SVE_OR_NEON: [[TMP15:%.*]] = call <vscale x 4 x i32> @vec_bar_linearn5_nomask_sve(i32 [[TMP14:%.*]])
+; SVE_OR_NEON: [[TMP14:%.*]] = extractelement <vscale x 4 x i32> [[TMP13:%.*]], i32 0
+; SVE_OR_NEON: [[TMP15:%.*]] = call <vscale x 4 x i32> @vec_bar_linearn5_nomask_sve(i32 [[TMP14]])
 ; SVE_OR_NEON: [[DATA:%.*]] = call i32 @bar(i32 [[NEGSTRIDE:%.*]]) #[[ATTR7:[0-9]+]]
 ;
+; SVE_OR_NEON_INTERLEAVE-LABEL: define void @test_linearn5_non_ptr_neg_stride
+; SVE_OR_NEON_INTERLEAVE-SAME: (ptr noalias [[A:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; SVE_OR_NEON_INTERLEAVE: [[DATA:%.*]] = call i32 @bar(i32 [[NEGSTRIDE:%.*]]) #[[ATTR9:[0-9]+]]
+;
 ; SVE_TF-LABEL: define void @test_linearn5_non_ptr_neg_stride
 ; SVE_TF-SAME: (ptr noalias [[A:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
 ; SVE_TF: [[DATA:%.*]] = call i32 @bar(i32 [[NEGSTRIDE:%.*]]) #[[ATTR9:[0-9]+]]
 ;
+; SVE_TF_INTERLEAVE-LABEL: define void @test_linearn5_non_ptr_neg_stride
+; SVE_TF_INTERLEAVE-SAME: (ptr noalias [[A:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; SVE_TF_INTERLEAVE: [[DATA:%.*]] = call i32 @bar(i32 [[NEGSTRIDE:%.*]]) #[[ATTR9:[0-9]+]]
+;
 entry:
 br label %for.body
@@ -240,19 +385,46 @@ define void @test_linear8_return_void(ptr noalias %in, ptr noalias %out, i64 %n)
 ; NEON-LABEL: define void @test_linear8_return_void
 ; NEON-SAME: (ptr noalias [[IN:%.*]], ptr noalias [[OUT:%.*]], i64 [[N:%.*]]) {
-; NEON: call void @vec_goo_linear8_nomask_neon(<2 x i64> [[WIDE_LOAD:%.*]], ptr [[TMP4:%.*]])
+; NEON: [[TMP4:%.*]] = extractelement <2 x ptr> [[TMP3:%.*]], i32 0
+; NEON: call void @vec_goo_linear8_nomask_neon(<2 x i64> [[WIDE_LOAD:%.*]], ptr [[TMP4]])
 ; NEON: call void @goo(i64 [[NUM:%.*]], ptr [[GEP_OUT:%.*]]) #[[ATTR6:[0-9]+]]
 ;
+; NEON_INTERLEAVE-LABEL: define void @test_linear8_return_void
+; NEON_INTERLEAVE-SAME: (ptr noalias [[IN:%.*]], ptr noalias [[OUT:%.*]], i64 [[N:%.*]]) {
+; NEON_INTERLEAVE: [[TMP8:%.*]] = extractelement <2 x ptr> [[TMP6:%.*]], i32 0
+; NEON_INTERLEAVE: call void @vec_goo_linear8_nomask_neon(<2 x i64> [[WIDE_LOAD:%.*]], ptr [[TMP8]])
+; NEON_INTERLEAVE: call void @vec_goo_linear8_nomask_neon(<2 x i64> [[WIDE_LOAD2:%.*]], ptr [[TMP8]])
+; NEON_INTERLEAVE: call void @goo(i64 [[NUM:%.*]], ptr [[GEP_OUT:%.*]]) #[[ATTR6:[0-9]+]]
+;
 ; SVE_OR_NEON-LABEL: define void @test_linear8_return_void
 ; SVE_OR_NEON-SAME: (ptr noalias [[IN:%.*]], ptr noalias [[OUT:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
-; SVE_OR_NEON: call void @vec_goo_linear8_nomask_sve(<vscale x 2 x i64> [[WIDE_LOAD:%.*]], ptr [[TMP16:%.*]])
+; SVE_OR_NEON: [[TMP16:%.*]] = extractelement <vscale x 2 x ptr> [[TMP15:%.*]], i32 0
+; SVE_OR_NEON: call void @vec_goo_linear8_nomask_sve(<vscale x 2 x i64> [[WIDE_LOAD:%.*]], ptr [[TMP16]])
 ; SVE_OR_NEON: call void @goo(i64 [[NUM:%.*]], ptr [[GEP_OUT:%.*]]) #[[ATTR8:[0-9]+]]
 ;
+; SVE_OR_NEON_INTERLEAVE-LABEL: define void @test_linear8_return_void
+; SVE_OR_NEON_INTERLEAVE-SAME: (ptr noalias [[IN:%.*]], ptr noalias [[OUT:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; SVE_OR_NEON_INTERLEAVE: [[TMP39:%.*]] = extractelement <vscale x 2 x ptr> [[TMP37:%.*]], i32 0
+; SVE_OR_NEON_INTERLEAVE: call void @vec_goo_linear8_mask_sve(<vscale x 2 x i64> [[WIDE_MASKED_LOAD:%.*]], ptr [[TMP39]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+; SVE_OR_NEON_INTERLEAVE: call void @vec_goo_linear8_mask_sve(<vscale x 2 x i64> [[WIDE_MASKED_LOAD4:%.*]], ptr [[TMP39]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK2:%.*]])
+; SVE_OR_NEON_INTERLEAVE: [[TMP45:%.*]] = extractelement <vscale x 2 x i1> [[TMP43:%.*]], i32 0
+; SVE_OR_NEON_INTERLEAVE: call void @goo(i64 [[NUM:%.*]], ptr [[GEP_OUT:%.*]]) #[[ATTR10:[0-9]+]]
+;
 ; SVE_TF-LABEL: define void @test_linear8_return_void
 ; SVE_TF-SAME: (ptr noalias [[IN:%.*]], ptr noalias [[OUT:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
-; SVE_TF: call void @vec_goo_linear8_mask_sve(<vscale x 2 x i64> [[WIDE_MASKED_LOAD:%.*]], ptr [[TMP22:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+; SVE_TF: [[TMP22:%.*]] = extractelement <vscale x 2 x ptr> [[TMP21:%.*]], i32 0
+; SVE_TF: call void @vec_goo_linear8_mask_sve(<vscale x 2 x i64> [[WIDE_MASKED_LOAD:%.*]], ptr [[TMP22]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+; SVE_TF: [[TMP24:%.*]] = extractelement <vscale x 2 x i1> [[TMP23:%.*]], i32 0
 ; SVE_TF: call void @goo(i64 [[NUM:%.*]], ptr [[GEP_OUT:%.*]]) #[[ATTR10:[0-9]+]]
 ;
+; SVE_TF_INTERLEAVE-LABEL: define void @test_linear8_return_void
+; SVE_TF_INTERLEAVE-SAME: (ptr noalias [[IN:%.*]], ptr noalias [[OUT:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; SVE_TF_INTERLEAVE: [[TMP39:%.*]] = extractelement <vscale x 2 x ptr> [[TMP37:%.*]], i32 0
+; SVE_TF_INTERLEAVE: call void @vec_goo_linear8_mask_sve(<vscale x 2 x i64> [[WIDE_MASKED_LOAD:%.*]], ptr [[TMP39]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+; SVE_TF_INTERLEAVE: call void @vec_goo_linear8_mask_sve(<vscale x 2 x i64> [[WIDE_MASKED_LOAD4:%.*]], ptr [[TMP39]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK2:%.*]])
+; SVE_TF_INTERLEAVE: [[TMP45:%.*]] = extractelement <vscale x 2 x i1> [[TMP43:%.*]], i32 0
+; SVE_TF_INTERLEAVE: call void @goo(i64 [[NUM:%.*]], ptr [[GEP_OUT:%.*]]) #[[ATTR10:[0-9]+]]
+;
 entry:
 br label %for.body