From bcc4c1d2d1b6877418de92835c537d79d44363a6 Mon Sep 17 00:00:00 2001 From: Arnold Schwaighofer Date: Thu, 9 Aug 2012 15:25:52 +0000 Subject: [PATCH] Patch to implement UMLAL/SMLAL instructions for the ARM architecture This patch corrects the definition of umlal/smlal instructions and adds support for matching them to the ARM dag combiner. Bug 12213 Patch by Yin Ma! git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@161581 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/ARM/ARMISelDAGToDAG.cpp | 32 ++++++ lib/Target/ARM/ARMISelLowering.cpp | 156 +++++++++++++++++++++++++++++ lib/Target/ARM/ARMISelLowering.h | 3 + lib/Target/ARM/ARMInstrInfo.td | 46 +++++++-- lib/Target/ARM/ARMInstrThumb2.td | 31 ++++-- test/CodeGen/ARM/longMAC.ll | 44 ++++++++ test/CodeGen/ARM/longMACt.ll | 44 ++++++++ 7 files changed, 339 insertions(+), 17 deletions(-) create mode 100644 test/CodeGen/ARM/longMAC.ll create mode 100644 test/CodeGen/ARM/longMACt.ll diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp index ee349a753fe..7e8937587be 100644 --- a/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -2747,6 +2747,38 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { dl, MVT::i32, MVT::i32, Ops, 5); } } + case ARMISD::UMLAL:{ + if (Subtarget->isThumb()) { + SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2), + N->getOperand(3), getAL(CurDAG), + CurDAG->getRegister(0, MVT::i32)}; + return CurDAG->getMachineNode(ARM::t2UMLAL, dl, MVT::i32, MVT::i32, Ops, 6); + }else{ + SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2), + N->getOperand(3), getAL(CurDAG), + CurDAG->getRegister(0, MVT::i32), + CurDAG->getRegister(0, MVT::i32) }; + return CurDAG->getMachineNode(Subtarget->hasV6Ops() ? + ARM::UMLAL : ARM::UMLALv5, + dl, MVT::i32, MVT::i32, Ops, 7); + } + } + case ARMISD::SMLAL:{ + if (Subtarget->isThumb()) { + SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2), + N->getOperand(3), getAL(CurDAG), + CurDAG->getRegister(0, MVT::i32)}; + return CurDAG->getMachineNode(ARM::t2SMLAL, dl, MVT::i32, MVT::i32, Ops, 6); + }else{ + SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2), + N->getOperand(3), getAL(CurDAG), + CurDAG->getRegister(0, MVT::i32), + CurDAG->getRegister(0, MVT::i32) }; + return CurDAG->getMachineNode(Subtarget->hasV6Ops() ? + ARM::SMLAL : ARM::SMLALv5, + dl, MVT::i32, MVT::i32, Ops, 7); + } + } case ISD::LOAD: { SDNode *ResNode = 0; if (Subtarget->isThumb() && Subtarget->hasThumb2()) diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index 7d8222927bb..bb2116f7743 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -571,6 +571,11 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) } } + // ARM and Thumb2 support UMLAL/SMLAL. + if (!Subtarget->isThumb1Only()) + setTargetDAGCombine(ISD::ADDC); + + computeRegisterProperties(); // ARM does not have f32 extending load. @@ -989,6 +994,8 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::VTBL2: return "ARMISD::VTBL2"; case ARMISD::VMULLs: return "ARMISD::VMULLs"; case ARMISD::VMULLu: return "ARMISD::VMULLu"; + case ARMISD::UMLAL: return "ARMISD::UMLAL"; + case ARMISD::SMLAL: return "ARMISD::SMLAL"; case ARMISD::BUILD_VECTOR: return "ARMISD::BUILD_VECTOR"; case ARMISD::FMAX: return "ARMISD::FMAX"; case ARMISD::FMIN: return "ARMISD::FMIN"; @@ -7127,6 +7134,154 @@ static SDValue AddCombineToVPADDL(SDNode *N, SDValue N0, SDValue N1, return DAG.getNode(ISD::TRUNCATE, N->getDebugLoc(), VT, tmp); } +static SDValue findMUL_LOHI(SDValue V) { + if (V->getOpcode() == ISD::UMUL_LOHI || + V->getOpcode() == ISD::SMUL_LOHI) + return V; + return SDValue(); +} + +static SDValue AddCombineTo64bitMLAL(SDNode *AddcNode, + TargetLowering::DAGCombinerInfo &DCI, + const ARMSubtarget *Subtarget) { + + if (Subtarget->isThumb1Only()) return SDValue(); + + // Only perform the checks after legalize when the pattern is available. + if (DCI.isBeforeLegalize()) return SDValue(); + + // Look for multiply add opportunities. + // The pattern is a ISD::UMUL_LOHI followed by two add nodes, where + // each add nodes consumes a value from ISD::UMUL_LOHI and there is + // a glue link from the first add to the second add. + // If we find this pattern, we can replace the U/SMUL_LOHI, ADDC, and ADDE by + // a S/UMLAL instruction. + // loAdd UMUL_LOHI + // \ / :lo \ :hi + // \ / \ [no multiline comment] + // ADDC | hiAdd + // \ :glue / / + // \ / / + // ADDE + // + assert(AddcNode->getOpcode() == ISD::ADDC && "Expect an ADDC"); + SDValue AddcOp0 = AddcNode->getOperand(0); + SDValue AddcOp1 = AddcNode->getOperand(1); + + // Check if the two operands are from the same mul_lohi node. + if (AddcOp0.getNode() == AddcOp1.getNode()) + return SDValue(); + + assert(AddcNode->getNumValues() == 2 && + AddcNode->getValueType(0) == MVT::i32 && + AddcNode->getValueType(1) == MVT::Glue && + "Expect ADDC with two result values: i32, glue"); + + // Check that the ADDC adds the low result of the S/UMUL_LOHI. + if (AddcOp0->getOpcode() != ISD::UMUL_LOHI && + AddcOp0->getOpcode() != ISD::SMUL_LOHI && + AddcOp1->getOpcode() != ISD::UMUL_LOHI && + AddcOp1->getOpcode() != ISD::SMUL_LOHI) + return SDValue(); + + // Look for the glued ADDE. + SDNode* AddeNode = AddcNode->getGluedUser(); + if (AddeNode == NULL) + return SDValue(); + + // Make sure it is really an ADDE. + if (AddeNode->getOpcode() != ISD::ADDE) + return SDValue(); + + assert(AddeNode->getNumOperands() == 3 && + AddeNode->getOperand(2).getValueType() == MVT::Glue && + "ADDE node has the wrong inputs"); + + // Check for the triangle shape. + SDValue AddeOp0 = AddeNode->getOperand(0); + SDValue AddeOp1 = AddeNode->getOperand(1); + + // Make sure that the ADDE operands are not coming from the same node. + if (AddeOp0.getNode() == AddeOp1.getNode()) + return SDValue(); + + // Find the MUL_LOHI node walking up ADDE's operands. + bool IsLeftOperandMUL = false; + SDValue MULOp = findMUL_LOHI(AddeOp0); + if (MULOp == SDValue()) + MULOp = findMUL_LOHI(AddeOp1); + else + IsLeftOperandMUL = true; + if (MULOp == SDValue()) + return SDValue(); + + // Figure out the right opcode. + unsigned Opc = MULOp->getOpcode(); + unsigned FinalOpc = (Opc == ISD::SMUL_LOHI) ? ARMISD::SMLAL : ARMISD::UMLAL; + + // Figure out the high and low input values to the MLAL node. + SDValue* HiMul = &MULOp; + SDValue* HiAdd = NULL; + SDValue* LoMul = NULL; + SDValue* LowAdd = NULL; + + if (IsLeftOperandMUL) + HiAdd = &AddeOp1; + else + HiAdd = &AddeOp0; + + + if (AddcOp0->getOpcode() == Opc) { + LoMul = &AddcOp0; + LowAdd = &AddcOp1; + } + if (AddcOp1->getOpcode() == Opc) { + LoMul = &AddcOp1; + LowAdd = &AddcOp0; + } + + if (LoMul == NULL) + return SDValue(); + + if (LoMul->getNode() != HiMul->getNode()) + return SDValue(); + + // Create the merged node. + SelectionDAG &DAG = DCI.DAG; + + // Build operand list. + SmallVector Ops; + Ops.push_back(LoMul->getOperand(0)); + Ops.push_back(LoMul->getOperand(1)); + Ops.push_back(*LowAdd); + Ops.push_back(*HiAdd); + + SDValue MLALNode = DAG.getNode(FinalOpc, AddcNode->getDebugLoc(), + DAG.getVTList(MVT::i32, MVT::i32), + &Ops[0], Ops.size()); + + // Replace the ADDs' nodes uses by the MLA node's values. + SDValue HiMLALResult(MLALNode.getNode(), 1); + DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), HiMLALResult); + + SDValue LoMLALResult(MLALNode.getNode(), 0); + DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), LoMLALResult); + + // Return original node to notify the driver to stop replacing. + SDValue resNode(AddcNode, 0); + return resNode; +} + +/// PerformADDCCombine - Target-specific dag combine transform from +/// ISD::ADDC, ISD::ADDE, and ISD::MUL_LOHI to MLAL. +static SDValue PerformADDCCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + const ARMSubtarget *Subtarget) { + + return AddCombineTo64bitMLAL(N, DCI, Subtarget); + +} + /// PerformADDCombineWithOperands - Try DAG combinations for an ADD with /// operands N0 and N1. This is a helper for PerformADDCombine that is /// called with the default operands, and if that fails, with commuted @@ -8738,6 +8893,7 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { switch (N->getOpcode()) { default: break; + case ISD::ADDC: return PerformADDCCombine(N, DCI, Subtarget); case ISD::ADD: return PerformADDCombine(N, DCI, Subtarget); case ISD::SUB: return PerformSUBCombine(N, DCI); case ISD::MUL: return PerformMULCombine(N, DCI, Subtarget); diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h index 213af20ebd1..8faadfd8317 100644 --- a/lib/Target/ARM/ARMISelLowering.h +++ b/lib/Target/ARM/ARMISelLowering.h @@ -176,6 +176,9 @@ namespace llvm { VMULLs, // ...signed VMULLu, // ...unsigned + UMLAL, // 64bit Unsigned Accumulate Multiply + SMLAL, // 64bit Signed Accumulate Multiply + // Operands of the standard BUILD_VECTOR node are not legalized, which // is fine if BUILD_VECTORs are always lowered to shuffles or other // operations, but for ARM some BUILD_VECTORs are legal as-is and their diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td index 6340a58f1a0..52be43b0205 100644 --- a/lib/Target/ARM/ARMInstrInfo.td +++ b/lib/Target/ARM/ARMInstrInfo.td @@ -83,6 +83,13 @@ def SDTBinaryArithWithFlagsInOut : SDTypeProfile<2, 3, SDTCisInt<0>, SDTCisVT<1, i32>, SDTCisVT<4, i32>]>; + +def SDT_ARM64bitmlal : SDTypeProfile<2,4, [ SDTCisVT<0, i32>, SDTCisVT<1, i32>, + SDTCisVT<2, i32>, SDTCisVT<3, i32>, + SDTCisVT<4, i32>, SDTCisVT<5, i32> ] >; +def ARMUmlal : SDNode<"ARMISD::UMLAL", SDT_ARM64bitmlal>; +def ARMSmlal : SDNode<"ARMISD::SMLAL", SDT_ARM64bitmlal>; + // Node definitions. def ARMWrapper : SDNode<"ARMISD::Wrapper", SDTIntUnaryOp>; def ARMWrapperDYN : SDNode<"ARMISD::WrapperDYN", SDTIntUnaryOp>; @@ -3396,6 +3403,18 @@ class AsMul1I64 opcod, dag oops, dag iops, InstrItinClass itin, let Inst{11-8} = Rm; let Inst{3-0} = Rn; } +class AsMla1I64 opcod, dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list pattern> + : AsMul1I { + bits<4> RdLo; + bits<4> RdHi; + bits<4> Rm; + bits<4> Rn; + let Inst{19-16} = RdHi; + let Inst{15-12} = RdLo; + let Inst{11-8} = Rm; + let Inst{3-0} = Rn; +} // FIXME: The v5 pseudos are only necessary for the additional Constraint // property. Remove them when it's possible to add those properties @@ -3478,14 +3497,14 @@ def UMULLv5 : ARMPseudoExpand<(outs GPR:$RdLo, GPR:$RdHi), } // Multiply + accumulate -def SMLAL : AsMul1I64<0b0000111, (outs GPR:$RdLo, GPR:$RdHi), - (ins GPR:$Rn, GPR:$Rm), IIC_iMAC64, +def SMLAL : AsMla1I64<0b0000111, (outs GPR:$RdLo, GPR:$RdHi), + (ins GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi), IIC_iMAC64, "smlal", "\t$RdLo, $RdHi, $Rn, $Rm", []>, - Requires<[IsARM, HasV6]>; -def UMLAL : AsMul1I64<0b0000101, (outs GPR:$RdLo, GPR:$RdHi), - (ins GPR:$Rn, GPR:$Rm), IIC_iMAC64, + RegConstraint<"$RLo = $RdLo, $RHi = $RdHi">, Requires<[IsARM, HasV6]>; +def UMLAL : AsMla1I64<0b0000101, (outs GPR:$RdLo, GPR:$RdHi), + (ins GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi), IIC_iMAC64, "umlal", "\t$RdLo, $RdHi, $Rn, $Rm", []>, - Requires<[IsARM, HasV6]>; + RegConstraint<"$RLo = $RdLo, $RHi = $RdHi">, Requires<[IsARM, HasV6]>; def UMAAL : AMul1I <0b0000010, (outs GPR:$RdLo, GPR:$RdHi), (ins GPR:$Rn, GPR:$Rm), IIC_iMAC64, @@ -3501,17 +3520,22 @@ def UMAAL : AMul1I <0b0000010, (outs GPR:$RdLo, GPR:$RdHi), let Inst{3-0} = Rn; } -let Constraints = "@earlyclobber $RdLo,@earlyclobber $RdHi" in { +let Constraints = "$RLo = $RdLo,$RHi = $RdHi" in { def SMLALv5 : ARMPseudoExpand<(outs GPR:$RdLo, GPR:$RdHi), - (ins GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s), + (ins GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi, pred:$p, cc_out:$s), 4, IIC_iMAC64, [], - (SMLAL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s)>, + (SMLAL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi, + pred:$p, cc_out:$s)>, Requires<[IsARM, NoV6]>; def UMLALv5 : ARMPseudoExpand<(outs GPR:$RdLo, GPR:$RdHi), - (ins GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s), + (ins GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi, pred:$p, cc_out:$s), 4, IIC_iMAC64, [], - (UMLAL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s)>, + (UMLAL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi, + pred:$p, cc_out:$s)>, Requires<[IsARM, NoV6]>; +} + +let Constraints = "@earlyclobber $RdLo,@earlyclobber $RdHi" in { def UMAALv5 : ARMPseudoExpand<(outs GPR:$RdLo, GPR:$RdHi), (ins GPR:$Rn, GPR:$Rm, pred:$p), 4, IIC_iMAC64, [], diff --git a/lib/Target/ARM/ARMInstrThumb2.td b/lib/Target/ARM/ARMInstrThumb2.td index 307006f413a..e600e8798a5 100644 --- a/lib/Target/ARM/ARMInstrThumb2.td +++ b/lib/Target/ARM/ARMInstrThumb2.td @@ -523,6 +523,23 @@ class T2MulLong opc22_20, bits<4> opc7_4, let Inst{7-4} = opc7_4; let Inst{3-0} = Rm; } +class T2MlaLong opc22_20, bits<4> opc7_4, + dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list pattern> + : T2I { + bits<4> RdLo; + bits<4> RdHi; + bits<4> Rn; + bits<4> Rm; + + let Inst{31-23} = 0b111110111; + let Inst{22-20} = opc22_20; + let Inst{19-16} = Rn; + let Inst{15-12} = RdLo; + let Inst{11-8} = RdHi; + let Inst{7-4} = opc7_4; + let Inst{3-0} = Rm; +} /// T2I_bin_irs - Defines a set of (op reg, {so_imm|r|so_reg}) patterns for a @@ -2410,15 +2427,17 @@ def t2UMULL : T2MulLong<0b010, 0b0000, } // isCommutable // Multiply + accumulate -def t2SMLAL : T2MulLong<0b100, 0b0000, - (outs rGPR:$RdLo, rGPR:$RdHi), +def t2SMLAL : T2MlaLong<0b100, 0b0000, + (outs rGPR:$RdLo, rGPR:$RdHi, rGPR:$RLo, rGPR:$RHi), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMAC64, - "smlal", "\t$RdLo, $RdHi, $Rn, $Rm", []>; + "smlal", "\t$RdLo, $RdHi, $Rn, $Rm", []>, + RegConstraint<"$RLo = $RdLo, $RHi = $RdHi">; -def t2UMLAL : T2MulLong<0b110, 0b0000, - (outs rGPR:$RdLo, rGPR:$RdHi), +def t2UMLAL : T2MlaLong<0b110, 0b0000, + (outs rGPR:$RdLo, rGPR:$RdHi, rGPR:$RLo, rGPR:$RHi), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMAC64, - "umlal", "\t$RdLo, $RdHi, $Rn, $Rm", []>; + "umlal", "\t$RdLo, $RdHi, $Rn, $Rm", []>, + RegConstraint<"$RLo = $RdLo, $RHi = $RdHi">; def t2UMAAL : T2MulLong<0b110, 0b0110, (outs rGPR:$RdLo, rGPR:$RdHi), diff --git a/test/CodeGen/ARM/longMAC.ll b/test/CodeGen/ARM/longMAC.ll new file mode 100644 index 00000000000..e4a00e9ac30 --- /dev/null +++ b/test/CodeGen/ARM/longMAC.ll @@ -0,0 +1,44 @@ +; RUN: llc < %s -march=arm | FileCheck %s +; Check generated signed and unsigned multiply accumulate long. + +define i64 @MACLongTest1(i32 %a, i32 %b, i64 %c) { +;CHECK: MACLongTest1: +;CHECK: umlal + %conv = zext i32 %a to i64 + %conv1 = zext i32 %b to i64 + %mul = mul i64 %conv1, %conv + %add = add i64 %mul, %c + ret i64 %add +} + +define i64 @MACLongTest2(i32 %a, i32 %b, i64 %c) { +;CHECK: MACLongTest2: +;CHECK: smlal + %conv = sext i32 %a to i64 + %conv1 = sext i32 %b to i64 + %mul = mul nsw i64 %conv1, %conv + %add = add nsw i64 %mul, %c + ret i64 %add +} + +define i64 @MACLongTest3(i32 %a, i32 %b, i32 %c) { +;CHECK: MACLongTest3: +;CHECK: umlal + %conv = zext i32 %b to i64 + %conv1 = zext i32 %a to i64 + %mul = mul i64 %conv, %conv1 + %conv2 = zext i32 %c to i64 + %add = add i64 %mul, %conv2 + ret i64 %add +} + +define i64 @MACLongTest4(i32 %a, i32 %b, i32 %c) { +;CHECK: MACLongTest4: +;CHECK: smlal + %conv = sext i32 %b to i64 + %conv1 = sext i32 %a to i64 + %mul = mul nsw i64 %conv, %conv1 + %conv2 = sext i32 %c to i64 + %add = add nsw i64 %mul, %conv2 + ret i64 %add +} diff --git a/test/CodeGen/ARM/longMACt.ll b/test/CodeGen/ARM/longMACt.ll new file mode 100644 index 00000000000..beefd6044cf --- /dev/null +++ b/test/CodeGen/ARM/longMACt.ll @@ -0,0 +1,44 @@ +; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s +; Check generated signed and unsigned multiply accumulate long. + +define i64 @MACLongTest1(i32 %a, i32 %b, i64 %c) { +;CHECK: MACLongTest1: +;CHECK: umlal + %conv = zext i32 %a to i64 + %conv1 = zext i32 %b to i64 + %mul = mul i64 %conv1, %conv + %add = add i64 %mul, %c + ret i64 %add +} + +define i64 @MACLongTest2(i32 %a, i32 %b, i64 %c) { +;CHECK: MACLongTest2: +;CHECK: smlal + %conv = sext i32 %a to i64 + %conv1 = sext i32 %b to i64 + %mul = mul nsw i64 %conv1, %conv + %add = add nsw i64 %mul, %c + ret i64 %add +} + +define i64 @MACLongTest3(i32 %a, i32 %b, i32 %c) { +;CHECK: MACLongTest3: +;CHECK: umlal + %conv = zext i32 %b to i64 + %conv1 = zext i32 %a to i64 + %mul = mul i64 %conv, %conv1 + %conv2 = zext i32 %c to i64 + %add = add i64 %mul, %conv2 + ret i64 %add +} + +define i64 @MACLongTest4(i32 %a, i32 %b, i32 %c) { +;CHECK: MACLongTest4: +;CHECK: smlal + %conv = sext i32 %b to i64 + %conv1 = sext i32 %a to i64 + %mul = mul nsw i64 %conv, %conv1 + %conv2 = sext i32 %c to i64 + %add = add nsw i64 %mul, %conv2 + ret i64 %add +}