From 7cd893a9859dcba1045aef2ca66de95f0561344b Mon Sep 17 00:00:00 2001 From: Adam Nemet Date: Thu, 29 May 2014 23:35:36 +0000 Subject: [PATCH] [X86] Remove AVX1 vbroadcast intrinsics The corresponding CFE patch replaces these intrinsics with vector initializers in avxintrin.h. This patch removes the LLVM intrinsics from the backend. We now stop lowering at X86ISD::VBROADCAST custom node rather than lowering that further to the intrinsics. The patch only changes VBROADCASTS* and leaves VBROADCAST[FI]128 to continue to use intrinsics. As explained in the CFE patch, the reason is that we currently don't generate as good code for them without the intrinsics. CodeGen/X86/avx-vbroadcast.ll already provides coverage for this change. It checks that for a series of insertelements we generate the appropriate vbroadcast instruction. Also verified that there was no assembly change in the test-suite before and after this patch. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@209864 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/IR/IntrinsicsX86.td | 9 -------- lib/Target/X86/X86InstrSSE.td | 32 ++++++++++++++------------ test/CodeGen/X86/avx-intrinsics-x86.ll | 24 ------------------- 3 files changed, 17 insertions(+), 48 deletions(-) diff --git a/include/llvm/IR/IntrinsicsX86.td b/include/llvm/IR/IntrinsicsX86.td index 36d93fee8a0..88ee1cec2d5 100644 --- a/include/llvm/IR/IntrinsicsX86.td +++ b/include/llvm/IR/IntrinsicsX86.td @@ -1304,15 +1304,6 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". // Vector load with broadcast let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". - def int_x86_avx_vbroadcast_ss : - GCCBuiltin<"__builtin_ia32_vbroadcastss">, - Intrinsic<[llvm_v4f32_ty], [llvm_ptr_ty], [IntrReadArgMem]>; - def int_x86_avx_vbroadcast_sd_256 : - GCCBuiltin<"__builtin_ia32_vbroadcastsd256">, - Intrinsic<[llvm_v4f64_ty], [llvm_ptr_ty], [IntrReadArgMem]>; - def int_x86_avx_vbroadcast_ss_256 : - GCCBuiltin<"__builtin_ia32_vbroadcastss256">, - Intrinsic<[llvm_v8f32_ty], [llvm_ptr_ty], [IntrReadArgMem]>; def int_x86_avx_vbroadcastf128_pd_256 : GCCBuiltin<"__builtin_ia32_vbroadcastf128_pd256">, Intrinsic<[llvm_v4f64_ty], [llvm_ptr_ty], [IntrReadArgMem]>; diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index 1eb04851b72..043b2f32c6f 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -7969,6 +7969,16 @@ class avx_broadcast opc, string OpcodeStr, RegisterClass RC, !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set RC:$dst, (Int addr:$src))]>, Sched<[Sched]>, VEX; +class avx_broadcast_no_int opc, string OpcodeStr, RegisterClass RC, + X86MemOperand x86memop, ValueType VT, + PatFrag ld_frag, SchedWrite Sched> : + AVX8I, + Sched<[Sched]>, VEX { + let mayLoad = 1; +} + // AVX2 adds register forms class avx2_broadcast_reg opc, string OpcodeStr, RegisterClass RC, Intrinsic Int, SchedWrite Sched> : @@ -7977,16 +7987,15 @@ class avx2_broadcast_reg opc, string OpcodeStr, RegisterClass RC, [(set RC:$dst, (Int VR128:$src))]>, Sched<[Sched]>, VEX; let ExeDomain = SSEPackedSingle in { - def VBROADCASTSSrm : avx_broadcast<0x18, "vbroadcastss", VR128, f32mem, - int_x86_avx_vbroadcast_ss, WriteLoad>; - def VBROADCASTSSYrm : avx_broadcast<0x18, "vbroadcastss", VR256, f32mem, - int_x86_avx_vbroadcast_ss_256, - WriteFShuffleLd>, VEX_L; + def VBROADCASTSSrm : avx_broadcast_no_int<0x18, "vbroadcastss", VR128, + f32mem, v4f32, loadf32, WriteLoad>; + def VBROADCASTSSYrm : avx_broadcast_no_int<0x18, "vbroadcastss", VR256, + f32mem, v8f32, loadf32, + WriteFShuffleLd>, VEX_L; } let ExeDomain = SSEPackedDouble in -def VBROADCASTSDYrm : avx_broadcast<0x19, "vbroadcastsd", VR256, f64mem, - int_x86_avx_vbroadcast_sd_256, - WriteFShuffleLd>, VEX_L; +def VBROADCASTSDYrm : avx_broadcast_no_int<0x19, "vbroadcastsd", VR256, f64mem, + v4f64, loadf64, WriteFShuffleLd>, VEX_L; def VBROADCASTF128 : avx_broadcast<0x1A, "vbroadcastf128", VR256, f128mem, int_x86_avx_vbroadcastf128_pd_256, WriteFShuffleLd>, VEX_L; @@ -8543,13 +8552,6 @@ def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))), } let Predicates = [HasAVX] in { -def : Pat<(v8f32 (X86VBroadcast (loadf32 addr:$src))), - (VBROADCASTSSYrm addr:$src)>; -def : Pat<(v4f64 (X86VBroadcast (loadf64 addr:$src))), - (VBROADCASTSDYrm addr:$src)>; -def : Pat<(v4f32 (X86VBroadcast (loadf32 addr:$src))), - (VBROADCASTSSrm addr:$src)>; - // Provide fallback in case the load node that is used in the patterns above // is used by additional users, which prevents the pattern selection. let AddedComplexity = 20 in { diff --git a/test/CodeGen/X86/avx-intrinsics-x86.ll b/test/CodeGen/X86/avx-intrinsics-x86.ll index 0be83f648d1..ce31161dbbc 100644 --- a/test/CodeGen/X86/avx-intrinsics-x86.ll +++ b/test/CodeGen/X86/avx-intrinsics-x86.ll @@ -2219,14 +2219,6 @@ define void @test_x86_avx_storeu_ps_256(i8* %a0, <8 x float> %a1) { declare void @llvm.x86.avx.storeu.ps.256(i8*, <8 x float>) nounwind -define <4 x double> @test_x86_avx_vbroadcast_sd_256(i8* %a0) { - ; CHECK: vbroadcastsd - %res = call <4 x double> @llvm.x86.avx.vbroadcast.sd.256(i8* %a0) ; <<4 x double>> [#uses=1] - ret <4 x double> %res -} -declare <4 x double> @llvm.x86.avx.vbroadcast.sd.256(i8*) nounwind readonly - - define <4 x double> @test_x86_avx_vbroadcastf128_pd_256(i8* %a0) { ; CHECK: vbroadcastf128 %res = call <4 x double> @llvm.x86.avx.vbroadcastf128.pd.256(i8* %a0) ; <<4 x double>> [#uses=1] @@ -2243,22 +2235,6 @@ define <8 x float> @test_x86_avx_vbroadcastf128_ps_256(i8* %a0) { declare <8 x float> @llvm.x86.avx.vbroadcastf128.ps.256(i8*) nounwind readonly -define <4 x float> @test_x86_avx_vbroadcast_ss(i8* %a0) { - ; CHECK: vbroadcastss - %res = call <4 x float> @llvm.x86.avx.vbroadcast.ss(i8* %a0) ; <<4 x float>> [#uses=1] - ret <4 x float> %res -} -declare <4 x float> @llvm.x86.avx.vbroadcast.ss(i8*) nounwind readonly - - -define <8 x float> @test_x86_avx_vbroadcast_ss_256(i8* %a0) { - ; CHECK: vbroadcastss - %res = call <8 x float> @llvm.x86.avx.vbroadcast.ss.256(i8* %a0) ; <<8 x float>> [#uses=1] - ret <8 x float> %res -} -declare <8 x float> @llvm.x86.avx.vbroadcast.ss.256(i8*) nounwind readonly - - define <2 x double> @test_x86_avx_vextractf128_pd_256(<4 x double> %a0) { ; CHECK: vextractf128 %res = call <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double> %a0, i8 7) ; <<2 x double>> [#uses=1]