From dfec7f8eea24c17ebc1489351edbf6804cf858ca Mon Sep 17 00:00:00 2001 From: Sanjay Patel Date: Thu, 3 Sep 2015 15:03:19 +0000 Subject: [PATCH] check for fastness before merging in DAGCombiner::MergeConsecutiveStores() Use and check the 'IsFast' optional parameter to TLI.allowsMemoryAccess() any time we have a merged access candidate. Without this patch, we were generating unaligned 16-byte (SSE) memops for x86 targets where those accesses are slow. This change was mentioned in: http://reviews.llvm.org/D10662 and http://reviews.llvm.org/D10905 and will help solve PR21711. Differential Revision: http://reviews.llvm.org/D12573 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@246771 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 27 ++++--- lib/Target/AMDGPU/SIISelLowering.cpp | 5 +- test/CodeGen/X86/dag-merge-fast-accesses.ll | 81 +++++++++++++++++++++ 3 files changed, 101 insertions(+), 12 deletions(-) create mode 100644 test/CodeGen/X86/dag-merge-fast-accesses.ll diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 3c1e51116c2..755edee1c6c 100644 --- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -11045,9 +11045,10 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) { // Find a legal type for the constant store. unsigned SizeInBits = (i+1) * ElementSizeBytes * 8; EVT StoreTy = EVT::getIntegerVT(Context, SizeInBits); + bool IsFast; if (TLI.isTypeLegal(StoreTy) && TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS, - FirstStoreAlign)) { + FirstStoreAlign, &IsFast) && IsFast) { LastLegalType = i+1; // Or check whether a truncstore is legal. } else if (TLI.getTypeAction(Context, StoreTy) == @@ -11056,7 +11057,8 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) { TLI.getTypeToTransformTo(Context, StoredVal.getValueType()); if (TLI.isTruncStoreLegal(LegalizedStoredValueTy, StoreTy) && TLI.allowsMemoryAccess(Context, DL, LegalizedStoredValueTy, - FirstStoreAS, FirstStoreAlign)) { + FirstStoreAS, FirstStoreAlign, &IsFast) && + IsFast) { LastLegalType = i + 1; } } @@ -11071,7 +11073,7 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) { EVT Ty = EVT::getVectorVT(Context, MemVT, i+1); if (TLI.isTypeLegal(Ty) && TLI.allowsMemoryAccess(Context, DL, Ty, FirstStoreAS, - FirstStoreAlign)) + FirstStoreAlign, &IsFast) && IsFast) LastLegalVectorType = i + 1; } } @@ -11104,9 +11106,10 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) { // Find a legal type for the vector store. EVT Ty = EVT::getVectorVT(Context, MemVT, i+1); + bool IsFast; if (TLI.isTypeLegal(Ty) && TLI.allowsMemoryAccess(Context, DL, Ty, FirstStoreAS, - FirstStoreAlign)) + FirstStoreAlign, &IsFast) && IsFast) NumElem = i + 1; } @@ -11192,14 +11195,14 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) { if (CurrAddress - StartAddress != (ElementSizeBytes * i)) break; LastConsecutiveLoad = i; - // Find a legal type for the vector store. EVT StoreTy = EVT::getVectorVT(Context, MemVT, i+1); + bool IsFastSt, IsFastLd; if (TLI.isTypeLegal(StoreTy) && TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS, - FirstStoreAlign) && + FirstStoreAlign, &IsFastSt) && IsFastSt && TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstLoadAS, - FirstLoadAlign)) { + FirstLoadAlign, &IsFastLd) && IsFastLd) { LastLegalVectorType = i + 1; } @@ -11208,9 +11211,9 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) { StoreTy = EVT::getIntegerVT(Context, SizeInBits); if (TLI.isTypeLegal(StoreTy) && TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS, - FirstStoreAlign) && + FirstStoreAlign, &IsFastSt) && IsFastSt && TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstLoadAS, - FirstLoadAlign)) + FirstLoadAlign, &IsFastLd) && IsFastLd) LastLegalIntegerType = i + 1; // Or check whether a truncstore and extload is legal. else if (TLI.getTypeAction(Context, StoreTy) == @@ -11222,9 +11225,11 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) { TLI.isLoadExtLegal(ISD::SEXTLOAD, LegalizedStoredValueTy, StoreTy) && TLI.isLoadExtLegal(ISD::EXTLOAD, LegalizedStoredValueTy, StoreTy) && TLI.allowsMemoryAccess(Context, DL, LegalizedStoredValueTy, - FirstStoreAS, FirstStoreAlign) && + FirstStoreAS, FirstStoreAlign, &IsFastSt) && + IsFastSt && TLI.allowsMemoryAccess(Context, DL, LegalizedStoredValueTy, - FirstLoadAS, FirstLoadAlign)) + FirstLoadAS, FirstLoadAlign, &IsFastLd) && + IsFastLd) LastLegalIntegerType = i+1; } } diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp index b90ba886bdd..e7dcd8a3267 100644 --- a/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/lib/Target/AMDGPU/SIISelLowering.cpp @@ -409,7 +409,10 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT, // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte // aligned, 8 byte access in a single operation using ds_read2/write2_b32 // with adjacent offsets. - return Align % 4 == 0; + bool AlignedBy4 = (Align % 4 == 0); + if (IsFast) + *IsFast = AlignedBy4; + return AlignedBy4; } // Smaller than dword value must be aligned. diff --git a/test/CodeGen/X86/dag-merge-fast-accesses.ll b/test/CodeGen/X86/dag-merge-fast-accesses.ll new file mode 100644 index 00000000000..c5bb34013ad --- /dev/null +++ b/test/CodeGen/X86/dag-merge-fast-accesses.ll @@ -0,0 +1,81 @@ +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-slow-unaligned-mem-16 | FileCheck %s --check-prefix=FAST +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+slow-unaligned-mem-16 | FileCheck %s --check-prefix=SLOW + +; Verify that the DAGCombiner is creating unaligned 16-byte loads and stores +; if and only if those are fast. + +define void @merge_const_vec_store(i64* %ptr) { +; FAST-LABEL: merge_const_vec_store: +; FAST: # BB#0: +; FAST-NEXT: xorps %xmm0, %xmm0 +; FAST-NEXT: movups %xmm0, (%rdi) +; FAST-NEXT: retq +; +; SLOW-LABEL: merge_const_vec_store: +; SLOW: # BB#0: +; SLOW-NEXT: movq $0, (%rdi) +; SLOW-NEXT: movq $0, 8(%rdi) +; SLOW-NEXT: retq + + %idx0 = getelementptr i64, i64* %ptr, i64 0 + %idx1 = getelementptr i64, i64* %ptr, i64 1 + + store i64 0, i64* %idx0, align 8 + store i64 0, i64* %idx1, align 8 + ret void +} + + +define void @merge_vec_element_store(<4 x double> %v, double* %ptr) { +; FAST-LABEL: merge_vec_element_store: +; FAST: # BB#0: +; FAST-NEXT: movups %xmm0, (%rdi) +; FAST-NEXT: retq +; +; SLOW-LABEL: merge_vec_element_store: +; SLOW: # BB#0: +; SLOW-NEXT: movlpd %xmm0, (%rdi) +; SLOW-NEXT: movhpd %xmm0, 8(%rdi) +; SLOW-NEXT: retq + + %vecext0 = extractelement <4 x double> %v, i32 0 + %vecext1 = extractelement <4 x double> %v, i32 1 + + %idx0 = getelementptr double, double* %ptr, i64 0 + %idx1 = getelementptr double, double* %ptr, i64 1 + + store double %vecext0, double* %idx0, align 8 + store double %vecext1, double* %idx1, align 8 + ret void +} + + +define void @merge_vec_load_and_stores(i64 *%ptr) { +; FAST-LABEL: merge_vec_load_and_stores: +; FAST: # BB#0: +; FAST-NEXT: movups (%rdi), %xmm0 +; FAST-NEXT: movups %xmm0, 40(%rdi) +; FAST-NEXT: retq +; +; SLOW-LABEL: merge_vec_load_and_stores: +; SLOW: # BB#0: +; SLOW-NEXT: movq (%rdi), %rax +; SLOW-NEXT: movq 8(%rdi), %rcx +; SLOW-NEXT: movq %rax, 40(%rdi) +; SLOW-NEXT: movq %rcx, 48(%rdi) +; SLOW-NEXT: retq + + %idx0 = getelementptr i64, i64* %ptr, i64 0 + %idx1 = getelementptr i64, i64* %ptr, i64 1 + + %ld0 = load i64, i64* %idx0, align 4 + %ld1 = load i64, i64* %idx1, align 4 + + %idx4 = getelementptr i64, i64* %ptr, i64 5 + %idx5 = getelementptr i64, i64* %ptr, i64 6 + + store i64 %ld0, i64* %idx4, align 4 + store i64 %ld1, i64* %idx5, align 4 + ret void +} +