diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp index 87c0d11b546..86407f71567 100644 --- a/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -1484,6 +1484,61 @@ SDNode *ARMDAGToDAGISel::Select(SDValue Op) { N->getOperand(5), N->getOperand(6), Chain }; return CurDAG->getTargetNode(Opc, dl, MVT::Other, Ops, 8); } + + case Intrinsic::arm_neon_vst2lane: { + SDValue MemAddr, MemUpdate, MemOpc; + if (!SelectAddrMode6(Op, N->getOperand(2), MemAddr, MemUpdate, MemOpc)) + return NULL; + switch (N->getOperand(3).getValueType().getSimpleVT().SimpleTy) { + default: llvm_unreachable("unhandled vst2lane type"); + case MVT::v8i8: Opc = ARM::VST2LNd8; break; + case MVT::v4i16: Opc = ARM::VST2LNd16; break; + case MVT::v2f32: + case MVT::v2i32: Opc = ARM::VST2LNd32; break; + } + SDValue Chain = N->getOperand(0); + const SDValue Ops[] = { MemAddr, MemUpdate, MemOpc, + N->getOperand(3), N->getOperand(4), + N->getOperand(5), Chain }; + return CurDAG->getTargetNode(Opc, dl, MVT::Other, Ops, 7); + } + + case Intrinsic::arm_neon_vst3lane: { + SDValue MemAddr, MemUpdate, MemOpc; + if (!SelectAddrMode6(Op, N->getOperand(2), MemAddr, MemUpdate, MemOpc)) + return NULL; + switch (N->getOperand(3).getValueType().getSimpleVT().SimpleTy) { + default: llvm_unreachable("unhandled vst3lane type"); + case MVT::v8i8: Opc = ARM::VST3LNd8; break; + case MVT::v4i16: Opc = ARM::VST3LNd16; break; + case MVT::v2f32: + case MVT::v2i32: Opc = ARM::VST3LNd32; break; + } + SDValue Chain = N->getOperand(0); + const SDValue Ops[] = { MemAddr, MemUpdate, MemOpc, + N->getOperand(3), N->getOperand(4), + N->getOperand(5), N->getOperand(6), Chain }; + return CurDAG->getTargetNode(Opc, dl, MVT::Other, Ops, 8); + } + + case Intrinsic::arm_neon_vst4lane: { + SDValue MemAddr, MemUpdate, MemOpc; + if (!SelectAddrMode6(Op, N->getOperand(2), MemAddr, MemUpdate, MemOpc)) + return NULL; + switch (N->getOperand(3).getValueType().getSimpleVT().SimpleTy) { + default: llvm_unreachable("unhandled vst4lane type"); + case MVT::v8i8: Opc = ARM::VST4LNd8; break; + case MVT::v4i16: Opc = ARM::VST4LNd16; break; + case MVT::v2f32: + case MVT::v2i32: Opc = ARM::VST4LNd32; break; + } + SDValue Chain = N->getOperand(0); + const SDValue Ops[] = { MemAddr, MemUpdate, MemOpc, + N->getOperand(3), N->getOperand(4), + N->getOperand(5), N->getOperand(6), + N->getOperand(7), Chain }; + return CurDAG->getTargetNode(Opc, dl, MVT::Other, Ops, 9); + } } } } diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index 0e1606f950a..8d79e5b7318 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -1370,6 +1370,26 @@ static SDValue LowerNeonVLDLaneIntrinsic(SDValue Op, SelectionDAG &DAG, return DAG.UpdateNodeOperands(Op, &Ops[0], Ops.size()); } +static SDValue LowerNeonVSTLaneIntrinsic(SDValue Op, SelectionDAG &DAG, + unsigned NumVecs) { + SDNode *Node = Op.getNode(); + EVT VT = Node->getOperand(3).getValueType(); + + if (!VT.is64BitVector()) + return SDValue(); // unimplemented + + // Change the lane number operand to be a TargetConstant; otherwise it + // will be legalized into a register. + ConstantSDNode *Lane = dyn_cast(Node->getOperand(NumVecs+3)); + if (!Lane) { + assert(false && "vst lane number must be a constant"); + return SDValue(); + } + SmallVector Ops(Node->op_begin(), Node->op_end()); + Ops[NumVecs+3] = DAG.getTargetConstant(Lane->getZExtValue(), MVT::i32); + return DAG.UpdateNodeOperands(Op, &Ops[0], Ops.size()); +} + SDValue ARMTargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) { unsigned IntNo = cast(Op.getOperand(1))->getZExtValue(); @@ -1388,6 +1408,12 @@ ARMTargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) { return LowerNeonVSTIntrinsic(Op, DAG, 3); case Intrinsic::arm_neon_vst4: return LowerNeonVSTIntrinsic(Op, DAG, 4); + case Intrinsic::arm_neon_vst2lane: + return LowerNeonVSTLaneIntrinsic(Op, DAG, 2); + case Intrinsic::arm_neon_vst3lane: + return LowerNeonVSTLaneIntrinsic(Op, DAG, 3); + case Intrinsic::arm_neon_vst4lane: + return LowerNeonVSTLaneIntrinsic(Op, DAG, 4); default: return SDValue(); // Don't custom lower most intrinsics. } } diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td index 6fbcf6cbff3..0c2f65523b4 100644 --- a/lib/Target/ARM/ARMInstrNEON.td +++ b/lib/Target/ARM/ARMInstrNEON.td @@ -300,6 +300,40 @@ class VST4D def VST4d8 : VST4D<"vst4.8">; def VST4d16 : VST4D<"vst4.16">; def VST4d32 : VST4D<"vst4.32">; + +// VST2LN : Vector Store (single 2-element structure from one lane) +class VST2LND + : NLdSt<(outs), (ins addrmode6:$addr, DPR:$src1, DPR:$src2, nohash_imm:$lane), + NoItinerary, + !strconcat(OpcodeStr, "\t\\{$src1[$lane],$src2[$lane]\\}, $addr"), + "", []>; + +def VST2LNd8 : VST2LND<"vst2.8">; +def VST2LNd16 : VST2LND<"vst2.16">; +def VST2LNd32 : VST2LND<"vst2.32">; + +// VST3LN : Vector Store (single 3-element structure from one lane) +class VST3LND + : NLdSt<(outs), (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3, + nohash_imm:$lane), NoItinerary, + !strconcat(OpcodeStr, + "\t\\{$src1[$lane],$src2[$lane],$src3[$lane]\\}, $addr"), "", []>; + +def VST3LNd8 : VST3LND<"vst3.8">; +def VST3LNd16 : VST3LND<"vst3.16">; +def VST3LNd32 : VST3LND<"vst3.32">; + +// VST4LN : Vector Store (single 4-element structure from one lane) +class VST4LND + : NLdSt<(outs), (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3, + DPR:$src4, nohash_imm:$lane), NoItinerary, + !strconcat(OpcodeStr, + "\t\\{$src1[$lane],$src2[$lane],$src3[$lane],$src4[$lane]\\}, $addr"), + "", []>; + +def VST4LNd8 : VST4LND<"vst4.8">; +def VST4LNd16 : VST4LND<"vst4.16">; +def VST4LNd32 : VST4LND<"vst4.32">; } diff --git a/lib/Target/ARM/NEONPreAllocPass.cpp b/lib/Target/ARM/NEONPreAllocPass.cpp index 69380130428..985cc86848b 100644 --- a/lib/Target/ARM/NEONPreAllocPass.cpp +++ b/lib/Target/ARM/NEONPreAllocPass.cpp @@ -75,6 +75,9 @@ static bool isNEONMultiRegOp(int Opcode, unsigned &FirstOpnd, case ARM::VST2d8: case ARM::VST2d16: case ARM::VST2d32: + case ARM::VST2LNd8: + case ARM::VST2LNd16: + case ARM::VST2LNd32: FirstOpnd = 3; NumRegs = 2; return true; @@ -82,6 +85,9 @@ static bool isNEONMultiRegOp(int Opcode, unsigned &FirstOpnd, case ARM::VST3d8: case ARM::VST3d16: case ARM::VST3d32: + case ARM::VST3LNd8: + case ARM::VST3LNd16: + case ARM::VST3LNd32: FirstOpnd = 3; NumRegs = 3; return true; @@ -89,6 +95,9 @@ static bool isNEONMultiRegOp(int Opcode, unsigned &FirstOpnd, case ARM::VST4d8: case ARM::VST4d16: case ARM::VST4d32: + case ARM::VST4LNd8: + case ARM::VST4LNd16: + case ARM::VST4LNd32: FirstOpnd = 3; NumRegs = 4; return true; diff --git a/test/CodeGen/ARM/vstlane.ll b/test/CodeGen/ARM/vstlane.ll new file mode 100644 index 00000000000..03d5153fd21 --- /dev/null +++ b/test/CodeGen/ARM/vstlane.ll @@ -0,0 +1,113 @@ +; RUN: llvm-as < %s | llc -march=arm -mattr=+neon | FileCheck %s + +define void @vst2lanei8(i8* %A, <8 x i8>* %B) nounwind { +;CHECK: vst2lanei8: +;CHECK: vst2.8 + %tmp1 = load <8 x i8>* %B + call void @llvm.arm.neon.vst2lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1) + ret void +} + +define void @vst2lanei16(i16* %A, <4 x i16>* %B) nounwind { +;CHECK: vst2lanei16: +;CHECK: vst2.16 + %tmp1 = load <4 x i16>* %B + call void @llvm.arm.neon.vst2lane.v4i16(i16* %A, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1) + ret void +} + +define void @vst2lanei32(i32* %A, <2 x i32>* %B) nounwind { +;CHECK: vst2lanei32: +;CHECK: vst2.32 + %tmp1 = load <2 x i32>* %B + call void @llvm.arm.neon.vst2lane.v2i32(i32* %A, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1) + ret void +} + +define void @vst2lanef(float* %A, <2 x float>* %B) nounwind { +;CHECK: vst2lanef: +;CHECK: vst2.32 + %tmp1 = load <2 x float>* %B + call void @llvm.arm.neon.vst2lane.v2f32(float* %A, <2 x float> %tmp1, <2 x float> %tmp1, i32 1) + ret void +} + +declare void @llvm.arm.neon.vst2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32) nounwind +declare void @llvm.arm.neon.vst2lane.v4i16(i8*, <4 x i16>, <4 x i16>, i32) nounwind +declare void @llvm.arm.neon.vst2lane.v2i32(i8*, <2 x i32>, <2 x i32>, i32) nounwind +declare void @llvm.arm.neon.vst2lane.v2f32(i8*, <2 x float>, <2 x float>, i32) nounwind + +define void @vst3lanei8(i8* %A, <8 x i8>* %B) nounwind { +;CHECK: vst3lanei8: +;CHECK: vst3.8 + %tmp1 = load <8 x i8>* %B + call void @llvm.arm.neon.vst3lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1) + ret void +} + +define void @vst3lanei16(i16* %A, <4 x i16>* %B) nounwind { +;CHECK: vst3lanei16: +;CHECK: vst3.16 + %tmp1 = load <4 x i16>* %B + call void @llvm.arm.neon.vst3lane.v4i16(i16* %A, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1) + ret void +} + +define void @vst3lanei32(i32* %A, <2 x i32>* %B) nounwind { +;CHECK: vst3lanei32: +;CHECK: vst3.32 + %tmp1 = load <2 x i32>* %B + call void @llvm.arm.neon.vst3lane.v2i32(i32* %A, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1) + ret void +} + +define void @vst3lanef(float* %A, <2 x float>* %B) nounwind { +;CHECK: vst3lanef: +;CHECK: vst3.32 + %tmp1 = load <2 x float>* %B + call void @llvm.arm.neon.vst3lane.v2f32(float* %A, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, i32 1) + ret void +} + +declare void @llvm.arm.neon.vst3lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32) nounwind +declare void @llvm.arm.neon.vst3lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32) nounwind +declare void @llvm.arm.neon.vst3lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32) nounwind +declare void @llvm.arm.neon.vst3lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32) nounwind + + +define void @vst4lanei8(i8* %A, <8 x i8>* %B) nounwind { +;CHECK: vst4lanei8: +;CHECK: vst4.8 + %tmp1 = load <8 x i8>* %B + call void @llvm.arm.neon.vst4lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1) + ret void +} + +define void @vst4lanei16(i16* %A, <4 x i16>* %B) nounwind { +;CHECK: vst4lanei16: +;CHECK: vst4.16 + %tmp1 = load <4 x i16>* %B + call void @llvm.arm.neon.vst4lane.v4i16(i16* %A, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1) + ret void +} + +define void @vst4lanei32(i32* %A, <2 x i32>* %B) nounwind { +;CHECK: vst4lanei32: +;CHECK: vst4.32 + %tmp1 = load <2 x i32>* %B + call void @llvm.arm.neon.vst4lane.v2i32(i32* %A, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1) + ret void +} + +define void @vst4lanef(float* %A, <2 x float>* %B) nounwind { +;CHECK: vst4lanef: +;CHECK: vst4.32 + %tmp1 = load <2 x float>* %B + call void @llvm.arm.neon.vst4lane.v2f32(float* %A, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, i32 1) + ret void +} + +declare void @llvm.arm.neon.vst4lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32) nounwind +declare void @llvm.arm.neon.vst4lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32) nounwind +declare void @llvm.arm.neon.vst4lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32) nounwind +declare void @llvm.arm.neon.vst4lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32) nounwind