diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp
index 97771423567..aae275af7d3 100644
--- a/lib/Target/R600/AMDGPUISelLowering.cpp
+++ b/lib/Target/R600/AMDGPUISelLowering.cpp
@@ -360,6 +360,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
 
   setTargetDAGCombine(ISD::MUL);
   setTargetDAGCombine(ISD::SELECT_CC);
+  setTargetDAGCombine(ISD::STORE);
 
   setSchedulingPreference(Sched::RegPressure);
   setJumpIsExpensive(true);
@@ -1896,6 +1897,56 @@ static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0,
   return DAG.getConstant(Src0 >> Offset, MVT::i32);
 }
 
+static bool usesAllNormalStores(SDNode *LoadVal) {
+  for (SDNode::use_iterator I = LoadVal->use_begin(); !I.atEnd(); ++I) {
+    if (!ISD::isNormalStore(*I))
+      return false;
+  }
+
+  return true;
+}
+
+// If we have a copy of an illegal type, replace it with a load / store of an
+// equivalently sized legal type. This avoids intermediate bit pack / unpack
+// instructions emitted when handling extloads and truncstores. Ideally we could
+// recognize the pack / unpack pattern to eliminate it.
+SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
+                                                  DAGCombinerInfo &DCI) const {
+  if (!DCI.isBeforeLegalize())
+    return SDValue();
+
+  StoreSDNode *SN = cast<StoreSDNode>(N);
+  SDValue Value = SN->getValue();
+  EVT VT = Value.getValueType();
+
+  if (isTypeLegal(VT) || SN->isVolatile() || !ISD::isNormalLoad(Value.getNode()))
+    return SDValue();
+
+  LoadSDNode *LoadVal = cast<LoadSDNode>(Value);
+  if (LoadVal->isVolatile() || !usesAllNormalStores(LoadVal))
+    return SDValue();
+
+  EVT MemVT = LoadVal->getMemoryVT();
+
+  SDLoc SL(N);
+  SelectionDAG &DAG = DCI.DAG;
+  EVT LoadVT = getEquivalentMemType(*DAG.getContext(), MemVT);
+
+  SDValue NewLoad = DAG.getLoad(ISD::UNINDEXED, ISD::NON_EXTLOAD,
+                                LoadVT, SL,
+                                LoadVal->getChain(),
+                                LoadVal->getBasePtr(),
+                                LoadVal->getOffset(),
+                                LoadVT,
+                                LoadVal->getMemOperand());
+
+  SDValue CastLoad = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad.getValue(0));
+  DCI.CombineTo(LoadVal, CastLoad, NewLoad.getValue(1), false);
+
+  return DAG.getStore(SN->getChain(), SL, NewLoad,
+                      SN->getBasePtr(), SN->getMemOperand());
+}
+
 SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
                                                 DAGCombinerInfo &DCI) const {
   EVT VT = N->getValueType(0);
@@ -1928,7 +1979,7 @@ SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
 }
 
 SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
-                                            DAGCombinerInfo &DCI) const {
+                                                DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
   SDLoc DL(N);
 
@@ -2026,6 +2077,9 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
 
     break;
   }
+
+  case ISD::STORE:
+    return performStoreCombine(N, DCI);
   }
   return SDValue();
 }
diff --git a/lib/Target/R600/AMDGPUISelLowering.h b/lib/Target/R600/AMDGPUISelLowering.h
index 98a92ade115..4445f81bcc2 100644
--- a/lib/Target/R600/AMDGPUISelLowering.h
+++ b/lib/Target/R600/AMDGPUISelLowering.h
@@ -64,6 +64,7 @@ private:
                                      SelectionDAG &DAG) const;
   SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
 
+  SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const;
 
 protected:
diff --git a/test/CodeGen/R600/copy-illegal-type.ll b/test/CodeGen/R600/copy-illegal-type.ll
new file mode 100644
index 00000000000..f7c2321ae8f
--- /dev/null
+++ b/test/CodeGen/R600/copy-illegal-type.ll
@@ -0,0 +1,166 @@
+; RUN: llc -march=r600 -mcpu=tahiti < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+; FUNC-LABEL: @test_copy_v4i8
+; SI: BUFFER_LOAD_DWORD [[REG:v[0-9]+]]
+; SI: BUFFER_STORE_DWORD [[REG]]
+; SI: S_ENDPGM
+define void @test_copy_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
+  %val = load <4 x i8> addrspace(1)* %in, align 4
+  store <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @test_copy_v4i8_x2
+; SI: BUFFER_LOAD_DWORD [[REG:v[0-9]+]]
+; SI: BUFFER_STORE_DWORD [[REG]]
+; SI: BUFFER_STORE_DWORD [[REG]]
+; SI: S_ENDPGM
+define void @test_copy_v4i8_x2(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind {
+  %val = load <4 x i8> addrspace(1)* %in, align 4
+  store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
+  store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4
+  ret void
+}
+
+; FUNC-LABEL: @test_copy_v4i8_x3
+; SI: BUFFER_LOAD_DWORD [[REG:v[0-9]+]]
+; SI: BUFFER_STORE_DWORD [[REG]]
+; SI: BUFFER_STORE_DWORD [[REG]]
+; SI: BUFFER_STORE_DWORD [[REG]]
+; SI: S_ENDPGM
+define void @test_copy_v4i8_x3(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %in) nounwind {
+  %val = load <4 x i8> addrspace(1)* %in, align 4
+  store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
+  store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4
+  store <4 x i8> %val, <4 x i8> addrspace(1)* %out2, align 4
+  ret void
+}
+
+; FUNC-LABEL: @test_copy_v4i8_x4
+; SI: BUFFER_LOAD_DWORD [[REG:v[0-9]+]]
+; SI: BUFFER_STORE_DWORD [[REG]]
+; SI: BUFFER_STORE_DWORD [[REG]]
+; SI: BUFFER_STORE_DWORD [[REG]]
+; SI: BUFFER_STORE_DWORD [[REG]]
+; SI: S_ENDPGM
+define void @test_copy_v4i8_x4(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %out3, <4 x i8> addrspace(1)* %in) nounwind {
+  %val = load <4 x i8> addrspace(1)* %in, align 4
+  store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
+  store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4
+  store <4 x i8> %val, <4 x i8> addrspace(1)* %out2, align 4
+  store <4 x i8> %val, <4 x i8> addrspace(1)* %out3, align 4
+  ret void
+}
+
+; FUNC-LABEL: @test_copy_v4i8_extra_use
+; SI: BUFFER_LOAD_UBYTE
+; SI: BUFFER_LOAD_UBYTE
+; SI: BUFFER_LOAD_UBYTE
+; SI: BUFFER_LOAD_UBYTE
+; SI-DAG: V_ADD
+; SI-DAG: V_ADD
+; SI-DAG: V_ADD
+; SI-DAG: V_ADD
+; SI-DAG: BUFFER_STORE_BYTE
+; SI-DAG: BUFFER_STORE_BYTE
+; SI-DAG: BUFFER_STORE_BYTE
+; SI-DAG: BUFFER_STORE_BYTE
+; SI-DAG: BUFFER_STORE_BYTE
+; SI-DAG: BUFFER_STORE_BYTE
+; SI-DAG: BUFFER_STORE_BYTE
+; SI-DAG: BUFFER_STORE_BYTE
+
+; After scalarizing of v4i8 loads is fixed:
+; XSI: BUFFER_LOAD_DWORD
+; XSI: V_BFE
+; XSI: V_ADD
+; XSI: V_ADD
+; XSI: V_ADD
+; XSI: BUFFER_STORE_DWORD
+; XSI: BUFFER_STORE_DWORD
+
+; SI: S_ENDPGM
+define void @test_copy_v4i8_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind {
+  %val = load <4 x i8> addrspace(1)* %in, align 4
+  %add = add <4 x i8> %val, <i8 9, i8 9, i8 9, i8 9>
+  store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
+  store <4 x i8> %add, <4 x i8> addrspace(1)* %out1, align 4
+  ret void
+}
+
+; FUNC-LABEL: @test_copy_v4i8_x2_extra_use
+; SI: BUFFER_LOAD_UBYTE
+; SI: BUFFER_LOAD_UBYTE
+; SI: BUFFER_LOAD_UBYTE
+; SI: BUFFER_LOAD_UBYTE
+; SI-DAG: V_ADD
+; SI-DAG: V_ADD
+; SI-DAG: V_ADD
+; SI-DAG: V_ADD
+; SI-DAG: BUFFER_STORE_BYTE
+; SI-DAG: BUFFER_STORE_BYTE
+; SI-DAG: BUFFER_STORE_BYTE
+; SI-DAG: BUFFER_STORE_BYTE
+; SI-DAG: BUFFER_STORE_BYTE
+; SI-DAG: BUFFER_STORE_BYTE
+; SI-DAG: BUFFER_STORE_BYTE
+; SI-DAG: BUFFER_STORE_BYTE
+; SI-DAG: BUFFER_STORE_BYTE
+; SI-DAG: BUFFER_STORE_BYTE
+; SI-DAG: BUFFER_STORE_BYTE
+; SI-DAG: BUFFER_STORE_BYTE
+
+; XSI: BUFFER_LOAD_DWORD
+; XSI: BFE
+; XSI: BUFFER_STORE_DWORD
+; XSI: V_ADD
+; XSI: BUFFER_STORE_DWORD
+; XSI-NEXT: BUFFER_STORE_DWORD
+
+; SI: S_ENDPGM
+define void @test_copy_v4i8_x2_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %in) nounwind {
+  %val = load <4 x i8> addrspace(1)* %in, align 4
+  %add = add <4 x i8> %val, <i8 9, i8 9, i8 9, i8 9>
+  store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
+  store <4 x i8> %add, <4 x i8> addrspace(1)* %out1, align 4
+  store <4 x i8> %val, <4 x i8> addrspace(1)* %out2, align 4
+  ret void
+}
+
+; FUNC-LABEL: @test_copy_v3i8
+; SI-NOT: BFE
+; SI-NOT: BFI
+; SI: S_ENDPGM
+define void @test_copy_v3i8(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind {
+  %val = load <3 x i8> addrspace(1)* %in, align 4
+  store <3 x i8> %val, <3 x i8> addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @test_copy_v4i8_volatile_load
+; SI: BUFFER_LOAD_UBYTE
+; SI: BUFFER_LOAD_UBYTE
+; SI: BUFFER_LOAD_UBYTE
+; SI: BUFFER_LOAD_UBYTE
+; SI: S_ENDPGM
+define void @test_copy_v4i8_volatile_load(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
+  %val = load volatile <4 x i8> addrspace(1)* %in, align 4
+  store <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @test_copy_v4i8_volatile_store
+; SI: BUFFER_LOAD_UBYTE
+; SI: BUFFER_LOAD_UBYTE
+; SI: BUFFER_LOAD_UBYTE
+; SI: BUFFER_LOAD_UBYTE
+; SI: BUFFER_STORE_BYTE
+; SI: BUFFER_STORE_BYTE
+; SI: BUFFER_STORE_BYTE
+; SI: BUFFER_STORE_BYTE
+; SI: S_ENDPGM
+define void @test_copy_v4i8_volatile_store(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
+  %val = load <4 x i8> addrspace(1)* %in, align 4
+  store volatile <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4
+  ret void
+}
diff --git a/test/CodeGen/R600/indirect-private-64.ll b/test/CodeGen/R600/indirect-private-64.ll
index 1e23fd780ec..2f628458387 100644
--- a/test/CodeGen/R600/indirect-private-64.ll
+++ b/test/CodeGen/R600/indirect-private-64.ll
@@ -31,10 +31,14 @@ define void @private_access_f64_alloca(double addrspace(1)* noalias %out, double
 ; SI-ALLOCA: V_MOVRELS_B32_e32
 ; SI-ALLOCA: V_MOVRELS_B32_e32
 
-; SI-PROMOTE: DS_WRITE_B64
-; SI-PROMOTE: DS_WRITE_B64
-; SI-PROMOTE: DS_READ_B64
-; SI-PROMOTE: DS_READ_B64
+; SI-PROMOTE: DS_WRITE_B32
+; SI-PROMOTE: DS_WRITE_B32
+; SI-PROMOTE: DS_WRITE_B32
+; SI-PROMOTE: DS_WRITE_B32
+; SI-PROMOTE: DS_READ_B32
+; SI-PROMOTE: DS_READ_B32
+; SI-PROMOTE: DS_READ_B32
+; SI-PROMOTE: DS_READ_B32
 define void @private_access_v2f64_alloca(<2 x double> addrspace(1)* noalias %out, <2 x double> addrspace(1)* noalias %in, i32 %b) nounwind {
   %val = load <2 x double> addrspace(1)* %in, align 16
   %array = alloca <2 x double>, i32 16, align 16
@@ -77,10 +81,14 @@ define void @private_access_i64_alloca(i64 addrspace(1)* noalias %out, i64 addrs
 ; SI-ALLOCA: V_MOVRELS_B32_e32
 ; SI-ALLOCA: V_MOVRELS_B32_e32
 
-; SI-PROMOTE: DS_WRITE_B64
-; SI-PROMOTE: DS_WRITE_B64
-; SI-PROMOTE: DS_READ_B64
-; SI-PROMOTE: DS_READ_B64
+; SI-PROMOTE: DS_WRITE_B32
+; SI-PROMOTE: DS_WRITE_B32
+; SI-PROMOTE: DS_WRITE_B32
+; SI-PROMOTE: DS_WRITE_B32
+; SI-PROMOTE: DS_READ_B32
+; SI-PROMOTE: DS_READ_B32
+; SI-PROMOTE: DS_READ_B32
+; SI-PROMOTE: DS_READ_B32
 define void @private_access_v2i64_alloca(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in, i32 %b) nounwind {
   %val = load <2 x i64> addrspace(1)* %in, align 16
   %array = alloca <2 x i64>, i32 16, align 16
diff --git a/test/CodeGen/R600/load.ll b/test/CodeGen/R600/load.ll
index a57df5cc93e..8905fbd3aeb 100644
--- a/test/CodeGen/R600/load.ll
+++ b/test/CodeGen/R600/load.ll
@@ -254,8 +254,8 @@ entry:
 
 ; load a v2f32 value from the global address space
 ; FUNC-LABEL: @load_v2f32
+; R600-CHECK: MEM_RAT
 ; R600-CHECK: VTX_READ_64
-
 ; SI-CHECK: BUFFER_LOAD_DWORDX2
 define void @load_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in) {
 entry:
@@ -265,9 +265,7 @@ entry:
 }
 
 ; FUNC-LABEL: @load_i64
-; R600-CHECK: MEM_RAT
-; R600-CHECK: MEM_RAT
-
+; R600-CHECK: VTX_READ_64
 ; SI-CHECK: BUFFER_LOAD_DWORDX2
 define void @load_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
 entry: