From 19ff9cf62b09cf67255af6606b37c69bbe0edbef Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Tue, 6 Jun 2017 14:18:39 +0000 Subject: [PATCH] [X86][AVX1] Split 256-bit vector non-temporal FastISel loads to keep it non-temporal (PR32744) Extension to D33728 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@304798 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86FastISel.cpp | 6 ++++ test/CodeGen/X86/fast-isel-nontemporal.ll | 36 +++++++++++++++++++---- 2 files changed, 36 insertions(+), 6 deletions(-) diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp index 3cfb924abd0..621505aaded 100644 --- a/lib/Target/X86/X86FastISel.cpp +++ b/lib/Target/X86/X86FastISel.cpp @@ -414,6 +414,8 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM, assert(HasAVX); if (IsNonTemporal && Alignment >= 32 && HasAVX2) Opc = HasVLX ? X86::VMOVNTDQAZ256rm : X86::VMOVNTDQAYrm; + else if (IsNonTemporal && Alignment >= 16) + return false; // Force split for X86::VMOVNTDQArm else if (Alignment >= 32) Opc = HasVLX ? X86::VMOVAPSZ256rm : X86::VMOVAPSYrm; else @@ -424,6 +426,8 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM, assert(HasAVX); if (IsNonTemporal && Alignment >= 32 && HasAVX2) Opc = X86::VMOVNTDQAYrm; + else if (IsNonTemporal && Alignment >= 16) + return false; // Force split for X86::VMOVNTDQArm else if (Alignment >= 32) Opc = HasVLX ? X86::VMOVAPDZ256rm : X86::VMOVAPDYrm; else @@ -437,6 +441,8 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM, assert(HasAVX); if (IsNonTemporal && Alignment >= 32 && HasAVX2) Opc = X86::VMOVNTDQAYrm; + else if (IsNonTemporal && Alignment >= 16) + return false; // Force split for X86::VMOVNTDQArm else if (Alignment >= 32) Opc = HasVLX ? X86::VMOVDQA64Z256rm : X86::VMOVDQAYrm; else diff --git a/test/CodeGen/X86/fast-isel-nontemporal.ll b/test/CodeGen/X86/fast-isel-nontemporal.ll index 306012aa3bf..33d001cdc21 100644 --- a/test/CodeGen/X86/fast-isel-nontemporal.ll +++ b/test/CodeGen/X86/fast-isel-nontemporal.ll @@ -545,7 +545,11 @@ define <8 x float> @test_load_nt8xfloat(<8 x float>* nocapture %ptr) { ; ; AVX1-LABEL: test_load_nt8xfloat: ; AVX1: # BB#0: # %entry -; AVX1-NEXT: vmovaps (%rdi), %ymm0 +; AVX1-NEXT: vmovntdqa (%rdi), %xmm0 +; AVX1-NEXT: # implicit-def: %YMM1 +; AVX1-NEXT: vmovaps %xmm0, %xmm1 +; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_load_nt8xfloat: @@ -583,7 +587,11 @@ define <4 x double> @test_load_nt4xdouble(<4 x double>* nocapture %ptr) { ; ; AVX1-LABEL: test_load_nt4xdouble: ; AVX1: # BB#0: # %entry -; AVX1-NEXT: vmovapd (%rdi), %ymm0 +; AVX1-NEXT: vmovntdqa (%rdi), %xmm0 +; AVX1-NEXT: # implicit-def: %YMM1 +; AVX1-NEXT: vmovaps %xmm0, %xmm1 +; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_load_nt4xdouble: @@ -621,7 +629,11 @@ define <32 x i8> @test_load_nt32xi8(<32 x i8>* nocapture %ptr) { ; ; AVX1-LABEL: test_load_nt32xi8: ; AVX1: # BB#0: # %entry -; AVX1-NEXT: vmovdqa (%rdi), %ymm0 +; AVX1-NEXT: vmovntdqa (%rdi), %xmm0 +; AVX1-NEXT: # implicit-def: %YMM1 +; AVX1-NEXT: vmovaps %xmm0, %xmm1 +; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_load_nt32xi8: @@ -659,7 +671,11 @@ define <16 x i16> @test_load_nt16xi16(<16 x i16>* nocapture %ptr) { ; ; AVX1-LABEL: test_load_nt16xi16: ; AVX1: # BB#0: # %entry -; AVX1-NEXT: vmovdqa (%rdi), %ymm0 +; AVX1-NEXT: vmovntdqa (%rdi), %xmm0 +; AVX1-NEXT: # implicit-def: %YMM1 +; AVX1-NEXT: vmovaps %xmm0, %xmm1 +; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_load_nt16xi16: @@ -697,7 +713,11 @@ define <8 x i32> @test_load_nt8xi32(<8 x i32>* nocapture %ptr) { ; ; AVX1-LABEL: test_load_nt8xi32: ; AVX1: # BB#0: # %entry -; AVX1-NEXT: vmovdqa (%rdi), %ymm0 +; AVX1-NEXT: vmovntdqa (%rdi), %xmm0 +; AVX1-NEXT: # implicit-def: %YMM1 +; AVX1-NEXT: vmovaps %xmm0, %xmm1 +; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_load_nt8xi32: @@ -735,7 +755,11 @@ define <4 x i64> @test_load_nt4xi64(<4 x i64>* nocapture %ptr) { ; ; AVX1-LABEL: test_load_nt4xi64: ; AVX1: # BB#0: # %entry -; AVX1-NEXT: vmovdqa (%rdi), %ymm0 +; AVX1-NEXT: vmovntdqa (%rdi), %xmm0 +; AVX1-NEXT: # implicit-def: %YMM1 +; AVX1-NEXT: vmovaps %xmm0, %xmm1 +; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_load_nt4xi64: