From d060c2582326ae5b5ebfc981a66df2005244e5d3 Mon Sep 17 00:00:00 2001
From: Nadav Rotem
Date: Tue, 24 Apr 2012 11:07:03 +0000
Subject: [PATCH] AVX: We lower VECTOR_SHUFFLE and BUILD_VECTOR nodes into
 vbroadcast instructions using the pattern (vbroadcast (i32load src)). In some
 cases, after we generate this pattern new users are added to the load node,
 which prevent the selection of the blend pattern. This commit provides
 fallback patterns which perform in-vector broadcast (using in-vector
 vbroadcast in AVX2 and pshufd on AVX1).

llvm-svn: 155437
---
Review notes (ignored by git-am; not part of the commit message):
 * The line structure of this patch was rebuilt from a whitespace-collapsed
   copy. Hunk line counts are unchanged, but exact indentation of context
   lines is a best effort -- apply with --ignore-whitespace if needed.
 * The f64 AVX1 fallbacks use VPSHUFD immediate 0x44 (dword selectors 0,1,0,1)
   instead of 0. Immediate 0 copies dword 0 into every lane, which would
   replicate only the low 32 bits of the double. The f32 fallbacks keep 0.
   The "index" blob ids below are those of the original commit.
 * The splat constant operands in @V113 and @crash were missing from the
   collapsed copy and have been restored; the @crash values mirror the scalar
   constants next to them. TODO: confirm the @V113 constant against upstream.

 lib/Target/X86/X86InstrSSE.td       | 43 ++++++++++++++++++++++++++-
 test/CodeGen/X86/avx2-vbroadcast.ll | 45 ++++++++++++++++++++++++++---
 2 files changed, 83 insertions(+), 5 deletions(-)

diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index 450d29a8574..8cd00a9aa0e 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -7723,6 +7723,20 @@ let Predicates = [HasAVX2] in {
           (VPBROADCASTQrm addr:$src)>;
   def : Pat<(v4i64 (X86VBroadcast (loadi64 addr:$src))),
           (VPBROADCASTQYrm addr:$src)>;
+
+  // Provide fallback in case the load node that is used in the patterns above
+  // is used by additional users, which prevents the pattern selection.
+  let AddedComplexity = 20 in {
+  def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
+            (VBROADCASTSSrr
+              (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FR32:$src, sub_ss))>;
+  def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
+            (VBROADCASTSSYrr
+              (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FR32:$src, sub_ss))>;
+  def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
+            (VBROADCASTSDrr
+              (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FR64:$src, sub_sd))>;
+  }
 }
 
 // AVX1 broadcast patterns
@@ -7735,11 +7749,38 @@ def : Pat<(v8f32 (X86VBroadcast (loadf32 addr:$src))),
           (VBROADCASTSSYrm addr:$src)>;
 def : Pat<(v4f64 (X86VBroadcast (loadf64 addr:$src))),
           (VBROADCASTSDrm addr:$src)>;
-
 def : Pat<(v4f32 (X86VBroadcast (loadf32 addr:$src))),
           (VBROADCASTSSrm addr:$src)>;
 def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))),
           (VBROADCASTSSrm addr:$src)>;
+
+  // Provide fallback in case the load node that is used in the patterns above
+  // is used by additional users, which prevents the pattern selection.
+  let AddedComplexity = 20 in {
+  // 128bit broadcasts:
+  def : Pat<(v2f64 (X86VBroadcast FR64:$src)),
+            (VPSHUFDri
+              (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FR64:$src, sub_sd), 0x44)>;
+  def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
+            (VPSHUFDri
+              (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FR32:$src, sub_ss), 0)>;
+  def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
+            (VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
+              (VPSHUFDri
+                (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FR32:$src, sub_ss), 0),
+                sub_xmm),
+              (VPSHUFDri
+                (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FR32:$src, sub_ss),
+                0), 1)>;
+  def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
+            (VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
+              (VPSHUFDri
+                (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FR64:$src, sub_sd), 0x44),
+                sub_xmm),
+              (VPSHUFDri
+                (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FR64:$src, sub_sd),
+                0x44), 1)>;
+  }
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/test/CodeGen/X86/avx2-vbroadcast.ll b/test/CodeGen/X86/avx2-vbroadcast.ll
index 1a78414761c..6eba694bd4a 100644
--- a/test/CodeGen/X86/avx2-vbroadcast.ll
+++ b/test/CodeGen/X86/avx2-vbroadcast.ll
@@ -160,6 +160,15 @@ entry:
   ret <8 x i32> %g
 }
 
+; CHECK: V113
+; CHECK: vbroadcastss
+; CHECK: ret
+define <8 x float> @V113(<8 x float> %in) nounwind uwtable readnone ssp {
+entry:
+  %g = fadd <8 x float> %in, <float 0xbf80000000000000, float 0xbf80000000000000, float 0xbf80000000000000, float 0xbf80000000000000, float 0xbf80000000000000, float 0xbf80000000000000, float 0xbf80000000000000, float 0xbf80000000000000>
+  ret <8 x float> %g
+}
+
 ; CHECK: _e2
 ; CHECK: vbroadcastss
 ; CHECK: ret
@@ -179,9 +188,37 @@ define <8 x i8> @_e4(i8* %ptr) nounwind uwtable readnone ssp {
   %vecinit1.i = insertelement <8 x i8> %vecinit0.i, i8 52, i32 1
   %vecinit2.i = insertelement <8 x i8> %vecinit1.i, i8 52, i32 2
   %vecinit3.i = insertelement <8 x i8> %vecinit2.i, i8 52, i32 3
-  %vecinit4.i = insertelement <8 x i8> %vecinit3.i, i8 52, i32 3
-  %vecinit5.i = insertelement <8 x i8> %vecinit4.i, i8 52, i32 3
-  %vecinit6.i = insertelement <8 x i8> %vecinit5.i, i8 52, i32 3
-  %vecinit7.i = insertelement <8 x i8> %vecinit6.i, i8 52, i32 3
+  %vecinit4.i = insertelement <8 x i8> %vecinit3.i, i8 52, i32 4
+  %vecinit5.i = insertelement <8 x i8> %vecinit4.i, i8 52, i32 5
+  %vecinit6.i = insertelement <8 x i8> %vecinit5.i, i8 52, i32 6
+  %vecinit7.i = insertelement <8 x i8> %vecinit6.i, i8 52, i32 7
   ret <8 x i8> %vecinit7.i
 }
+
+
+define void @crash() nounwind alwaysinline {
+WGLoopsEntry:
+  br i1 undef, label %ret, label %footer329VF
+
+footer329VF:
+  %A.0.inVF = fmul float undef, 6.553600e+04
+  %B.0.in407VF = fmul <8 x float> undef, <float 6.553600e+04, float 6.553600e+04, float 6.553600e+04, float 6.553600e+04, float 6.553600e+04, float 6.553600e+04, float 6.553600e+04, float 6.553600e+04>
+  %A.0VF = fptosi float %A.0.inVF to i32
+  %B.0408VF = fptosi <8 x float> %B.0.in407VF to <8 x i32>
+  %0 = and <8 x i32> %B.0408VF, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
+  %1 = and i32 %A.0VF, 65535
+  %temp1098VF = insertelement <8 x i32> undef, i32 %1, i32 0
+  %vector1099VF = shufflevector <8 x i32> %temp1098VF, <8 x i32> undef, <8 x i32> zeroinitializer
+  br i1 undef, label %preload1201VF, label %footer349VF
+
+preload1201VF:
+  br label %footer349VF
+
+footer349VF:
+  %2 = mul nsw <8 x i32> undef, %0
+  %3 = mul nsw <8 x i32> undef, %vector1099VF
+  br label %footer329VF
+
+ret:
+  ret void
+}