From 7dd37ae57a00f1c664b9ae0e9451c1717cf5348d Mon Sep 17 00:00:00 2001
From: Tom Stellard
Date: Wed, 22 Jan 2014 19:24:14 +0000
Subject: [PATCH] R600/SI: Add support for i8 and i16 private loads/stores

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@199823 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/R600/AMDGPUISelLowering.cpp | 78 ++++++++++++++++++++++++++
 lib/Target/R600/AMDGPUISelLowering.h   |  1 +
 lib/Target/R600/R600ISelLowering.cpp   | 13 +++++
 lib/Target/R600/SIISelLowering.cpp     | 60 ++++++++++++++++----
 test/CodeGen/R600/extload.ll           | 14 +++--
 test/CodeGen/R600/private-memory.ll    | 59 ++++++++++++++++---
 6 files changed, 200 insertions(+), 25 deletions(-)

diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp
index c59be7ce243..a65dd65b6e6 100644
--- a/lib/Target/R600/AMDGPUISelLowering.cpp
+++ b/lib/Target/R600/AMDGPUISelLowering.cpp
@@ -589,18 +589,96 @@ SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
   return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, &Chains[0], NumElts);
 }
 
+SDValue AMDGPUTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  LoadSDNode *Load = cast<LoadSDNode>(Op);
+  ISD::LoadExtType ExtType = Load->getExtensionType();
+
+  if (Load->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS ||
+      ExtType == ISD::NON_EXTLOAD || Load->getMemoryVT().bitsGE(MVT::i32))
+    return SDValue();
+
+
+  EVT VT = Op.getValueType();
+  EVT MemVT = Load->getMemoryVT();
+  unsigned Mask = 0;
+  if (Load->getMemoryVT() == MVT::i8) {
+    Mask = 0xff;
+  } else if (Load->getMemoryVT() == MVT::i16) {
+    Mask = 0xffff;
+  }
+  SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Load->getBasePtr(),
+                            DAG.getConstant(2, MVT::i32));
+  SDValue Ret = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op.getValueType(),
+                            Load->getChain(), Ptr,
+                            DAG.getTargetConstant(0, MVT::i32),
+                            Op.getOperand(2));
+  SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32,
+                                Load->getBasePtr(),
+                                DAG.getConstant(0x3, MVT::i32));
+  SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
+                                 DAG.getConstant(3, MVT::i32));
+  Ret = DAG.getNode(ISD::SRL, DL, MVT::i32, Ret, ShiftAmt);
+  Ret = DAG.getNode(ISD::AND, DL, MVT::i32, Ret,
+                    DAG.getConstant(Mask, MVT::i32));
+  if (ExtType == ISD::SEXTLOAD) {
+    SDValue SExtShift = DAG.getConstant(
+        VT.getSizeInBits() - MemVT.getSizeInBits(), MVT::i32);
+    Ret = DAG.getNode(ISD::SHL, DL, MVT::i32, Ret, SExtShift);
+    Ret = DAG.getNode(ISD::SRA, DL, MVT::i32, Ret, SExtShift);
+  }
+
+  return Ret;
+}
+
 SDValue AMDGPUTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc DL(Op);
   SDValue Result = AMDGPUTargetLowering::MergeVectorStore(Op, DAG);
   if (Result.getNode()) {
     return Result;
   }
 
   StoreSDNode *Store = cast<StoreSDNode>(Op);
+  SDValue Chain = Store->getChain();
   if ((Store->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
        Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) &&
       Store->getValue().getValueType().isVector()) {
     return SplitVectorStore(Op, DAG);
   }
+
+  if (Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS &&
+      Store->getMemoryVT().bitsLT(MVT::i32)) {
+    unsigned Mask = 0;
+    if (Store->getMemoryVT() == MVT::i8) {
+      Mask = 0xff;
+    } else if (Store->getMemoryVT() == MVT::i16) {
+      Mask = 0xffff;
+    }
+    SDValue TruncPtr = DAG.getZExtOrTrunc(Store->getBasePtr(), DL, MVT::i32);
+    SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, TruncPtr,
+                              DAG.getConstant(2, MVT::i32));
+    SDValue Dst = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32,
+                              Chain, Ptr, DAG.getTargetConstant(0, MVT::i32));
+    SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, TruncPtr,
+                                  DAG.getConstant(0x3, MVT::i32));
+    SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
+                                   DAG.getConstant(3, MVT::i32));
+    SDValue SExtValue = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32,
+                                    Store->getValue());
+    SDValue MaskedValue = DAG.getNode(ISD::AND, DL, MVT::i32, SExtValue,
+                                      DAG.getConstant(Mask, MVT::i32));
+    SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, MVT::i32,
+                                       MaskedValue, ShiftAmt);
+    SDValue DstMask = DAG.getNode(ISD::SHL, DL, MVT::i32,
+                                  DAG.getConstant(Mask, MVT::i32), ShiftAmt);
+    DstMask = DAG.getNode(ISD::XOR, DL, MVT::i32, DstMask,
+                          DAG.getConstant(0xffffffff, MVT::i32));
+    Dst = DAG.getNode(ISD::AND, DL, MVT::i32, Dst, DstMask);
+
+    SDValue Value = DAG.getNode(ISD::OR, DL, MVT::i32, Dst, ShiftedValue);
+    return DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
+                       Chain, Value, Ptr, DAG.getTargetConstant(0, MVT::i32));
+  }
   return SDValue();
 }
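The two hooks above implement sub-dword private access as a read-modify-write on the 32-bit slots that REGISTER_LOAD/REGISTER_STORE address: the pointer splits into a dword index (ptr >> 2) and a byte offset (ptr & 3), the byte offset times eight gives the bit shift, loads mask the selected lane and sign-extend with a shl/sra pair, and stores merge the new bits under an inverted mask. A minimal standalone sketch of the arithmetic the DAG nodes build (plain C++; the function names and the frame[] array standing in for private memory are illustrative, not part of the patch):

    #include <cstdint>

    // Sketch of what LowerLOAD computes for an i8/i16 extload from private
    // memory at byte address 'ptr'; frame[] stands in for the per-thread
    // register file that REGISTER_LOAD indexes.
    uint32_t emulateExtLoad(const uint32_t *frame, uint32_t ptr,
                            unsigned memBits, bool isSigned) {
      uint32_t mask = (memBits == 8) ? 0xff : 0xffff;
      uint32_t word = frame[ptr >> 2];        // Ptr = SRL(ptr, 2); REGISTER_LOAD
      uint32_t shift = (ptr & 0x3) * 8;       // ByteIdx = AND(ptr, 3); ShiftAmt = SHL(ByteIdx, 3)
      uint32_t val = (word >> shift) & mask;  // SRL, then AND with Mask
      if (isSigned) {                         // SEXTLOAD: the shl/sra pair
        unsigned sextShift = 32 - memBits;
        val = (uint32_t)((int32_t)(val << sextShift) >> sextShift);
      }
      return val;
    }

    // Sketch of the truncating-store path: load the dword, clear the target
    // byte/halfword with ~(mask << shift), OR in the shifted value, and
    // write the dword back (REGISTER_STORE).
    void emulateTruncStore(uint32_t *frame, uint32_t ptr, uint32_t value,
                           unsigned memBits) {
      uint32_t mask = (memBits == 8) ? 0xff : 0xffff;
      uint32_t shift = (ptr & 0x3) * 8;
      uint32_t dst = frame[ptr >> 2];
      dst &= ~(mask << shift);                // DstMask = XOR(SHL(Mask, Shift), -1)
      dst |= (value & mask) << shift;         // OR with MaskedValue << Shift
      frame[ptr >> 2] = dst;
    }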
diff --git a/lib/Target/R600/AMDGPUISelLowering.h b/lib/Target/R600/AMDGPUISelLowering.h
index 2dfd3cf492a..fd6e3a59985 100644
--- a/lib/Target/R600/AMDGPUISelLowering.h
+++ b/lib/Target/R600/AMDGPUISelLowering.h
@@ -54,6 +54,7 @@ protected:
   /// \brief Split a vector load into multiple scalar loads.
   SDValue SplitVectorLoad(const SDValue &Op, SelectionDAG &DAG) const;
   SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
   bool isHWTrueValue(SDValue Op) const;
   bool isHWFalseValue(SDValue Op) const;
diff --git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp
index 8d71919704d..03feabe23e6 100644
--- a/lib/Target/R600/R600ISelLowering.cpp
+++ b/lib/Target/R600/R600ISelLowering.cpp
@@ -1113,6 +1113,10 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
     return SDValue();
   }
 
+  SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
+  if (Ret.getNode()) {
+    return Ret;
+  }
 
   // Lowering for indirect addressing
   const MachineFunction &MF = DAG.getMachineFunction();
@@ -1204,6 +1208,15 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
   SDValue Ptr = Op.getOperand(1);
   SDValue LoweredLoad;
 
+  SDValue Ret = AMDGPUTargetLowering::LowerLOAD(Op, DAG);
+  if (Ret.getNode()) {
+    SDValue Ops[2];
+    Ops[0] = Ret;
+    Ops[1] = Chain;
+    return DAG.getMergeValues(Ops, 2, DL);
+  }
+
+
   if (LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && VT.isVector()) {
     SDValue MergedValues[2] = {
       SplitVectorLoad(Op, DAG),
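The R600 override above (and the SI one below) follows the same contract: try the shared AMDGPUTargetLowering implementation first, and if it fired, re-bundle the scalar result with the original chain, because a LoadSDNode defines two results (value, chain) while the shared helper returns only the value. A sketch of that recurring wrapper (the helper name is hypothetical; the getMergeValues call matches the ones in the hunks above):

    #include "llvm/CodeGen/SelectionDAG.h"
    using namespace llvm;

    // Hypothetical helper: re-wrap a custom-lowered load's data value with
    // its chain so users of the original load node still see both results.
    static SDValue wrapLoadResult(SelectionDAG &DAG, SDLoc DL,
                                  SDValue Value, SDValue Chain) {
      SDValue Ops[2] = { Value, Chain };
      return DAG.getMergeValues(Ops, 2, DL);
    }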
diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp
index 4fb844439ab..9430689c61c 100644
--- a/lib/Target/R600/SIISelLowering.cpp
+++ b/lib/Target/R600/SIISelLowering.cpp
@@ -125,11 +125,17 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
 
   setLoadExtAction(ISD::SEXTLOAD, MVT::i32, Expand);
-  setLoadExtAction(ISD::EXTLOAD, MVT::i32, Expand);
+  setLoadExtAction(ISD::SEXTLOAD, MVT::i8, Custom);
+  setLoadExtAction(ISD::SEXTLOAD, MVT::i16, Custom);
   setLoadExtAction(ISD::SEXTLOAD, MVT::v8i16, Expand);
   setLoadExtAction(ISD::SEXTLOAD, MVT::v16i16, Expand);
 
+  setLoadExtAction(ISD::EXTLOAD, MVT::i8, Custom);
+  setLoadExtAction(ISD::EXTLOAD, MVT::i16, Custom);
+  setLoadExtAction(ISD::EXTLOAD, MVT::i32, Expand);
   setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
+  setTruncStoreAction(MVT::i32, MVT::i8, Custom);
+  setTruncStoreAction(MVT::i32, MVT::i16, Custom);
   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
   setTruncStoreAction(MVT::i64, MVT::i32, Expand);
   setTruncStoreAction(MVT::i128, MVT::i64, Expand);
@@ -700,21 +706,26 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
 SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
   SDLoc DL(Op);
   LoadSDNode *Load = cast<LoadSDNode>(Op);
+  SDValue Ret = AMDGPUTargetLowering::LowerLOAD(Op, DAG);
+  SDValue MergedValues[2];
+  MergedValues[1] = Load->getChain();
+  if (Ret.getNode()) {
+    MergedValues[0] = Ret;
+    return DAG.getMergeValues(MergedValues, 2, DL);
+  }
 
-  if (Load->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS)
+  if (Load->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
     return SDValue();
+  }
 
   SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Load->getBasePtr(),
                             DAG.getConstant(2, MVT::i32));
+  Ret = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op.getValueType(),
+                    Load->getChain(), Ptr,
+                    DAG.getTargetConstant(0, MVT::i32),
+                    Op.getOperand(2));
 
-  SDValue Ret = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op.getValueType(),
-                            Load->getChain(), Ptr,
-                            DAG.getTargetConstant(0, MVT::i32),
-                            Op.getOperand(2));
-  SDValue MergedValues[2] = {
-    Ret,
-    Load->getChain()
-  };
+  MergedValues[0] = Ret;
 
   return DAG.getMergeValues(MergedValues, 2, DL);
 }
@@ -796,7 +807,34 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
   SDValue Chain = Store->getChain();
   SmallVector<SDValue, 8> Values;
 
-  if (VT == MVT::i64) {
+  if (Store->isTruncatingStore()) {
+    unsigned Mask = 0;
+    if (Store->getMemoryVT() == MVT::i8) {
+      Mask = 0xff;
+    } else if (Store->getMemoryVT() == MVT::i16) {
+      Mask = 0xffff;
+    }
+    SDValue Dst = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32,
+                              Chain, Store->getBasePtr(),
+                              DAG.getConstant(0, MVT::i32));
+    SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, Store->getBasePtr(),
+                                  DAG.getConstant(0x3, MVT::i32));
+    SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
+                                   DAG.getConstant(3, MVT::i32));
+    SDValue MaskedValue = DAG.getNode(ISD::AND, DL, MVT::i32, Store->getValue(),
+                                      DAG.getConstant(Mask, MVT::i32));
+    SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, MVT::i32,
+                                       MaskedValue, ShiftAmt);
+    SDValue RotrAmt = DAG.getNode(ISD::SUB, DL, MVT::i32,
+                                  DAG.getConstant(32, MVT::i32), ShiftAmt);
+    SDValue DstMask = DAG.getNode(ISD::ROTR, DL, MVT::i32,
+                                  DAG.getConstant(Mask, MVT::i32),
+                                  RotrAmt);
+    Dst = DAG.getNode(ISD::AND, DL, MVT::i32, Dst, DstMask);
+    Dst = DAG.getNode(ISD::OR, DL, MVT::i32, Dst, ShiftedValue);
+
+    Values.push_back(Dst);
+  } else if (VT == MVT::i64) {
     for (unsigned i = 0; i < 2; ++i) {
       Values.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32,
                        Store->getValue(), DAG.getConstant(i, MVT::i32)));
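One difference from the base-class store path: SI builds the destination mask with a rotate, DstMask = rotr(Mask, 32 - ShiftAmt), instead of a shl plus xor. Note that rotr(Mask, 32 - s) equals the positive mask Mask << s (as a rotation), which keeps only the lane being overwritten; preserving the neighbouring bytes needs the complement, rotr(~Mask, 32 - s) = ~(Mask << s), which is what the shl/xor form in AMDGPUISelLowering.cpp computes. A standalone check of both identities (plain C++, nothing LLVM-specific):

    #include <cassert>
    #include <cstdint>

    static uint32_t rotr32(uint32_t x, unsigned r) {
      r &= 31;                      // avoid the undefined shift by 32
      return r ? (x >> r) | (x << (32 - r)) : x;
    }

    int main() {
      const uint32_t Mask = 0xff;   // i8 store
      const unsigned Shift = 8;     // byte index 1
      // rotr(Mask, 32 - s) reproduces the positive mask Mask << s ...
      assert(rotr32(Mask, 32 - Shift) == (Mask << Shift));
      // ... while the complemented mask is the one that clears only the
      // target byte and keeps the rest of the dword intact:
      assert(rotr32(~Mask, 32 - Shift) == ~(Mask << Shift));
      return 0;
    }

If the AND really is taken against the uncomplemented rotate, the store zeroes the other three bytes of the dword; the new tests would not catch that, since they fill a fresh alloca in ascending order.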
diff --git a/test/CodeGen/R600/extload.ll b/test/CodeGen/R600/extload.ll
index aa660b38838..f78cdc4fb02 100644
--- a/test/CodeGen/R600/extload.ll
+++ b/test/CodeGen/R600/extload.ll
@@ -2,7 +2,7 @@
 
 ; EG-LABEL: @anyext_load_i8:
 ; EG: AND_INT
-; EG-NEXT: 255
+; EG: 255
 define void @anyext_load_i8(i8 addrspace(1)* nocapture noalias %out, i8 addrspace(1)* nocapture noalias %src) nounwind {
   %cast = bitcast i8 addrspace(1)* %src to i32 addrspace(1)*
   %load = load i32 addrspace(1)* %cast, align 1
@@ -14,8 +14,9 @@ define void @anyext_load_i8(i8 addrspace(1)* nocapture noalias %out, i8 addrspac
 
 ; EG-LABEL: @anyext_load_i16:
 ; EG: AND_INT
-; EG: LSHL
-; EG: 65535
+; EG: AND_INT
+; EG-DAG: 65535
+; EG-DAG: -65536
 define void @anyext_load_i16(i16 addrspace(1)* nocapture noalias %out, i16 addrspace(1)* nocapture noalias %src) nounwind {
   %cast = bitcast i16 addrspace(1)* %src to i32 addrspace(1)*
   %load = load i32 addrspace(1)* %cast, align 1
@@ -27,7 +28,7 @@ define void @anyext_load_i16(i16 addrspace(1)* nocapture noalias %out, i16 addrs
 
 ; EG-LABEL: @anyext_load_lds_i8:
 ; EG: AND_INT
-; EG-NEXT: 255
+; EG: 255
 define void @anyext_load_lds_i8(i8 addrspace(3)* nocapture noalias %out, i8 addrspace(3)* nocapture noalias %src) nounwind {
   %cast = bitcast i8 addrspace(3)* %src to i32 addrspace(3)*
   %load = load i32 addrspace(3)* %cast, align 1
@@ -39,8 +40,9 @@ define void @anyext_load_lds_i8(i8 addrspace(3)* nocapture noalias %out, i8 addr
 
 ; EG-LABEL: @anyext_load_lds_i16:
 ; EG: AND_INT
-; EG: LSHL
-; EG: 65535
+; EG: AND_INT
+; EG-DAG: 65535
+; EG-DAG: -65536
 define void @anyext_load_lds_i16(i16 addrspace(3)* nocapture noalias %out, i16 addrspace(3)* nocapture noalias %src) nounwind {
   %cast = bitcast i16 addrspace(3)* %src to i32 addrspace(3)*
   %load = load i32 addrspace(3)* %cast, align 1
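The i16 cases now check for two AND_INT operations whose mask literals can land in either order, hence the switch to EG-DAG. The two constants are complementary halfword masks; written out (assuming a 32-bit unsigned int, which is what the checks rely on):

    #include <cstdint>

    // 65535 (0xffff) masks the value being merged in; -65536 is the signed
    // rendering of ~0xffff = 0xffff0000, the mask that clears the
    // destination halfword before the OR.
    static_assert(~0xffffu == 0xffff0000u, "destination halfword mask");
    static_assert(static_cast<int32_t>(0xffff0000u) == -65536,
                  "the literal as the assembly printer emits it");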
diff --git a/test/CodeGen/R600/private-memory.ll b/test/CodeGen/R600/private-memory.ll
index 848d164eebd..3fd67d75bb7 100644
--- a/test/CodeGen/R600/private-memory.ll
+++ b/test/CodeGen/R600/private-memory.ll
@@ -1,10 +1,11 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
-; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s --check-prefix=SI-CHECK
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK --check-prefix=FUNC
+; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s --check-prefix=SI-CHECK --check-prefix=FUNC
 
 ; This test checks that uses and defs of the AR register happen in the same
 ; instruction clause.
 
-; R600-CHECK-LABEL: @mova_same_clause
+; FUNC-LABEL: @mova_same_clause
+
 ; R600-CHECK: MOVA_INT
 ; R600-CHECK-NOT: ALU clause
 ; R600-CHECK: 0 + AR.x
@@ -12,7 +13,6 @@
 ; R600-CHECK-NOT: ALU clause
 ; R600-CHECK: 0 + AR.x
 
-; SI-CHECK-LABEL: @mova_same_clause
 ; SI-CHECK: V_READFIRSTLANE
 ; SI-CHECK: V_MOVRELD
 ; SI-CHECK: S_CBRANCH
@@ -46,9 +46,8 @@ entry:
 
 ; XXX: This generated code has unnecessary MOVs, we should be able to optimize
 ; this.
-; R600-CHECK-LABEL: @multiple_structs
+; FUNC-LABEL: @multiple_structs
 ; R600-CHECK-NOT: MOVA_INT
-; SI-CHECK-LABEL: @multiple_structs
 ; SI-CHECK-NOT: V_MOVREL
 
 %struct.point = type { i32, i32 }
@@ -77,9 +76,8 @@ entry:
 
 ; loads and stores should be lowered to copies, so there shouldn't be any
 ; MOVA instructions.
-; R600-CHECK-LABEL: @direct_loop
+; FUNC-LABEL: @direct_loop
 ; R600-CHECK-NOT: MOVA_INT
-; SI-CHECK-LABEL: @direct_loop
 ; SI-CHECK-NOT: V_MOVREL
 
 define void @direct_loop(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
@@ -113,3 +111,48 @@ for.end:
   store i32 %value, i32 addrspace(1)* %out
   ret void
 }
+
+; FUNC-LABEL: @short_array
+
+; R600-CHECK: MOV {{\** *}}T{{[0-9]\.[XYZW]}}, literal
+; R600-CHECK: 65536
+; R600-CHECK: MOVA_INT
+
+; SI-CHECK: V_MOV_B32_e32 v{{[0-9]}}, 65536
+; SI-CHECK: V_MOVRELS_B32_e32
+define void @short_array(i32 addrspace(1)* %out, i32 %index) {
+entry:
+  %0 = alloca [2 x i16]
+  %1 = getelementptr [2 x i16]* %0, i32 0, i32 0
+  %2 = getelementptr [2 x i16]* %0, i32 0, i32 1
+  store i16 0, i16* %1
+  store i16 1, i16* %2
+  %3 = getelementptr [2 x i16]* %0, i32 0, i32 %index
+  %4 = load i16* %3
+  %5 = sext i16 %4 to i32
+  store i32 %5, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: @char_array
+
+; R600-CHECK: OR_INT {{\** *}}T{{[0-9]\.[XYZW]}}, {{[PVT0-9]+\.[XYZW]}}, literal
+; R600-CHECK: 256
+; R600-CHECK: MOVA_INT
+
+; SI-CHECK: V_OR_B32_e32 v{{[0-9]}}, 256
+; SI-CHECK: V_MOVRELS_B32_e32
+define void @char_array(i32 addrspace(1)* %out, i32 %index) {
+entry:
+  %0 = alloca [2 x i8]
+  %1 = getelementptr [2 x i8]* %0, i32 0, i32 0
+  %2 = getelementptr [2 x i8]* %0, i32 0, i32 1
+  store i8 0, i8* %1
+  store i8 1, i8* %2
+  %3 = getelementptr [2 x i8]* %0, i32 0, i32 %index
+  %4 = load i8* %3
+  %5 = sext i8 %4 to i32
+  store i32 %5, i32 addrspace(1)* %out
+  ret void
+
+}
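For reference, the literals the new @short_array and @char_array checks expect fall straight out of the byte-offset arithmetic in the lowering above. A standalone sanity check (the frame variables are stand-ins for the 32-bit private-memory dword, not anything the backend defines):

    #include <cassert>
    #include <cstdint>

    int main() {
      // @char_array: i8 value 1 at byte offset 1 -> shift = 1 * 8, so the
      // merge ORs in 1 << 8 = 256 (the OR_INT / V_OR_B32_e32 literal).
      uint32_t frame = 0;                  // element 0 already holds 0
      frame |= (1u & 0xff) << (1 * 8);
      assert(frame == 256);

      // @short_array: i16 value 1 at byte offset 2 -> shift = 2 * 8, so the
      // stored pattern is 1 << 16 = 65536 (the MOV / V_MOV_B32_e32 literal).
      uint32_t frame2 = 0;
      frame2 |= (1u & 0xffff) << (2 * 8);
      assert(frame2 == 65536);
      return 0;
    }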