mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2025-01-07 03:40:35 +00:00
[AArch64] Generate vector signed/unsigned mul and mla/mls long.
Phabricator Revision: http://reviews.llvm.org/D5589 Patch by Balaram Makam <bmakam@codeaurora.org>!! llvm-svn: 219276
This commit is contained in:
parent
3853b2ab38
commit
75e17097bb
@ -531,6 +531,11 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM)
|
||||
|
||||
// AArch64 doesn't have MUL.2d:
|
||||
setOperationAction(ISD::MUL, MVT::v2i64, Expand);
|
||||
// Custom handling for some quad-vector types to detect MULL.
|
||||
setOperationAction(ISD::MUL, MVT::v8i16, Custom);
|
||||
setOperationAction(ISD::MUL, MVT::v4i32, Custom);
|
||||
setOperationAction(ISD::MUL, MVT::v2i64, Custom);
|
||||
|
||||
setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Legal);
|
||||
setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
|
||||
// Likewise, narrowing and extending vector loads/stores aren't handled
|
||||
@ -853,6 +858,8 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
|
||||
case AArch64ISD::ST2LANEpost: return "AArch64ISD::ST2LANEpost";
|
||||
case AArch64ISD::ST3LANEpost: return "AArch64ISD::ST3LANEpost";
|
||||
case AArch64ISD::ST4LANEpost: return "AArch64ISD::ST4LANEpost";
|
||||
case AArch64ISD::SMULL: return "AArch64ISD::SMULL";
|
||||
case AArch64ISD::UMULL: return "AArch64ISD::UMULL";
|
||||
}
|
||||
}
|
||||
|
||||
@ -1668,6 +1675,197 @@ static SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) {
|
||||
0);
|
||||
}
|
||||
|
||||
/// Return the vector type \p OrigVT must be extended to so that the value
/// occupies 64 bits. A type that is already 64 bits or wider is returned
/// unchanged; narrower types must be one of the simple types handled below.
static EVT getExtensionTo64Bits(const EVT &OrigVT) {
  // Nothing to widen.
  if (OrigVT.getSizeInBits() >= 64)
    return OrigVT;

  assert(OrigVT.isSimple() && "Expecting a simple value type");

  // Keep the element count and grow the element width until 64 bits are used.
  switch (OrigVT.getSimpleVT().SimpleTy) {
  case MVT::v4i8:
    return MVT::v4i16;
  case MVT::v2i8:
  case MVT::v2i16:
    return MVT::v2i32;
  default:
    llvm_unreachable("Unexpected Vector Type");
  }
}
|
||||
|
||||
/// Prepare \p N, the source of an extension from \p OrigTy to the 128-bit
/// type \p ExtTy, for use as an operand of a widening multiply.
///
/// If \p OrigTy is already at least 64 bits wide, \p N is returned as is.
/// Otherwise a new \p ExtOpcode node is created that widens \p N to the type
/// chosen by getExtensionTo64Bits.
static SDValue addRequiredExtensionForVectorMULL(SDValue N, SelectionDAG &DAG,
                                                 const EVT &OrigTy,
                                                 const EVT &ExtTy,
                                                 unsigned ExtOpcode) {
  assert(ExtTy.is128BitVector() && "Unexpected extension size");

  const bool NeedsWidening = OrigTy.getSizeInBits() < 64;
  if (!NeedsWidening)
    return N;

  // The multiply operand has to be 64 bits, so re-extend the narrow source.
  return DAG.getNode(ExtOpcode, SDLoc(N), getExtensionTo64Bits(OrigTy), N);
}
|
||||
|
||||
/// Return true if \p N is a BUILD_VECTOR whose operands are all constants
/// that fit in half of the vector's element width, interpreted as signed
/// values when \p isSigned is set and as unsigned values otherwise.
static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG,
                                   bool isSigned) {
  if (N->getOpcode() != ISD::BUILD_VECTOR)
    return false;

  // Width every element has to fit in: half of the element size.
  const unsigned HalfSize =
      N->getValueType(0).getVectorElementType().getSizeInBits() / 2;

  for (unsigned Idx = 0, NumOps = N->getNumOperands(); Idx != NumOps; ++Idx) {
    ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N->getOperand(Idx).getNode());
    // Any non-constant operand disqualifies the vector.
    if (!Cst)
      return false;

    const bool Fits = isSigned ? isIntN(HalfSize, Cst->getSExtValue())
                               : isUIntN(HalfSize, Cst->getZExtValue());
    if (!Fits)
      return false;
  }

  return true;
}
|
||||
|
||||
/// Produce the narrow operand to feed into a widening multiply in place of
/// \p N.
///
/// For a SIGN_EXTEND/ZERO_EXTEND node this is the extension's source, widened
/// to 64 bits if necessary. Otherwise \p N must be a BUILD_VECTOR of
/// constants, which is rebuilt with elements of half the original width.
static SDValue skipExtensionForVectorMULL(SDNode *N, SelectionDAG &DAG) {
  const unsigned Opcode = N->getOpcode();
  if (Opcode == ISD::SIGN_EXTEND || Opcode == ISD::ZERO_EXTEND) {
    SDValue Src = N->getOperand(0);
    return addRequiredExtensionForVectorMULL(Src, DAG, Src->getValueType(0),
                                             N->getValueType(0), Opcode);
  }

  assert(Opcode == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
  EVT VT = N->getValueType(0);
  const unsigned NumElts = VT.getVectorNumElements();
  const unsigned HalfEltBits = VT.getVectorElementType().getSizeInBits() / 2;
  MVT NarrowVecVT =
      MVT::getVectorVT(MVT::getIntegerVT(HalfEltBits), NumElts);

  SmallVector<SDValue, 8> Ops;
  for (unsigned Idx = 0; Idx != NumElts; ++Idx) {
    const APInt &Val =
        cast<ConstantSDNode>(N->getOperand(Idx))->getAPIntValue();
    // Element types smaller than 32 bits are not legal, so the operands are
    // emitted as i32 constants. They are implicitly truncated to the vector's
    // element type, which is why sext vs. zext does not matter here.
    Ops.push_back(DAG.getConstant(Val.zextOrTrunc(32), MVT::i32));
  }
  return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), NarrowVecVT, Ops);
}
|
||||
|
||||
/// Return true if \p N is a SIGN_EXTEND node, or a constant BUILD_VECTOR
/// whose elements all fit in half the element width as signed values.
static bool isSignExtended(SDNode *N, SelectionDAG &DAG) {
  return N->getOpcode() == ISD::SIGN_EXTEND ||
         isExtendedBUILD_VECTOR(N, DAG, /*isSigned=*/true);
}
|
||||
|
||||
/// Return true if \p N is a ZERO_EXTEND node, or a constant BUILD_VECTOR
/// whose elements all fit in half the element width as unsigned values.
static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) {
  return N->getOpcode() == ISD::ZERO_EXTEND ||
         isExtendedBUILD_VECTOR(N, DAG, /*isSigned=*/false);
}
|
||||
|
||||
/// Return true if \p N is an ADD or SUB whose two operands each have a single
/// use and are both sign-extended (see isSignExtended).
static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
  const unsigned Opc = N->getOpcode();
  if (Opc != ISD::ADD && Opc != ISD::SUB)
    return false;

  SDNode *LHS = N->getOperand(0).getNode();
  SDNode *RHS = N->getOperand(1).getNode();
  if (!LHS->hasOneUse() || !RHS->hasOneUse())
    return false;

  return isSignExtended(LHS, DAG) && isSignExtended(RHS, DAG);
}
|
||||
|
||||
/// Return true if \p N is an ADD or SUB whose two operands each have a single
/// use and are both zero-extended (see isZeroExtended).
static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) {
  const unsigned Opc = N->getOpcode();
  if (Opc != ISD::ADD && Opc != ISD::SUB)
    return false;

  SDNode *LHS = N->getOperand(0).getNode();
  SDNode *RHS = N->getOperand(1).getNode();
  if (!LHS->hasOneUse() || !RHS->hasOneUse())
    return false;

  return isZeroExtended(LHS, DAG) && isZeroExtended(RHS, DAG);
}
|
||||
|
||||
/// Custom-lower a 128-bit integer vector ISD::MUL.
///
/// Recognized forms:
///  * ext(A) * ext(B), both operands sign-extended or both zero-extended
///    (constant BUILD_VECTORs with small enough elements count as extended):
///    emitted as a single AArch64ISD::SMULL / AArch64ISD::UMULL.
///  * (ext(A) +/- ext(B)) * ext(C), in either operand order: distributed into
///    S/UMULL(A, C) +/- S/UMULL(B, C).
///
/// If nothing matches, \p Op is returned unchanged, except for v2i64 where an
/// empty SDValue is returned so the multiply gets expanded.
static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
  // Multiplications are only custom-lowered for 128-bit vectors so that
  // SMULL/UMULL can be detected. Otherwise v2i64 multiplications are not
  // legal.
  EVT VT = Op.getValueType();
  assert(VT.is128BitVector() && VT.isInteger() &&
         "unexpected type for custom-lowering ISD::MUL");
  SDNode *N0 = Op.getOperand(0).getNode();
  SDNode *N1 = Op.getOperand(1).getNode();
  unsigned NewOpc = 0;
  bool isMLA = false;
  bool isN0SExt = isSignExtended(N0, DAG);
  bool isN1SExt = isSignExtended(N1, DAG);
  if (isN0SExt && isN1SExt)
    NewOpc = AArch64ISD::SMULL;
  else {
    bool isN0ZExt = isZeroExtended(N0, DAG);
    bool isN1ZExt = isZeroExtended(N1, DAG);
    if (isN0ZExt && isN1ZExt)
      NewOpc = AArch64ISD::UMULL;
    // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
    // into (s/zext A * s/zext C) + (s/zext B * s/zext C).
    else if (isN1SExt && isAddSubSExt(N0, DAG)) {
      NewOpc = AArch64ISD::SMULL;
      isMLA = true;
    } else if (isN1ZExt && isAddSubZExt(N0, DAG)) {
      NewOpc = AArch64ISD::UMULL;
      isMLA = true;
    }
    // The same pattern with the operands commuted:
    // (s/zext C) * (s/zext A + s/zext B). These checks must not be nested
    // under "N1 is extended": an ADD/SUB node is never itself extended, so
    // such a guard would make them unreachable. Swap the operands so that N0
    // is the add/sub and N1 the extended value, as the code below expects.
    else if (isN0SExt && isAddSubSExt(N1, DAG)) {
      std::swap(N0, N1);
      NewOpc = AArch64ISD::SMULL;
      isMLA = true;
    } else if (isN0ZExt && isAddSubZExt(N1, DAG)) {
      std::swap(N0, N1);
      NewOpc = AArch64ISD::UMULL;
      isMLA = true;
    }

    if (!NewOpc) {
      if (VT == MVT::v2i64)
        // Fall through to expand this. It is not legal.
        return SDValue();
      else
        // Other vector multiplications are legal.
        return Op;
    }
  }

  // Legalize to a S/UMULL instruction.
  SDLoc DL(Op);
  SDValue Op0;
  SDValue Op1 = skipExtensionForVectorMULL(N1, DAG);
  if (!isMLA) {
    Op0 = skipExtensionForVectorMULL(N0, DAG);
    assert(Op0.getValueType().is64BitVector() &&
           Op1.getValueType().is64BitVector() &&
           "unexpected types for extended operands to VMULL");
    return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
  }

  // Optimizing (ext A +/- ext B) * ext C to
  // (S/UMULL A, C) +/- (S/UMULL B, C) during isel lowering to take advantage
  // of no-stall back to back s/umul + s/umla. This is true for CPUs with
  // accumulate forwarding such as Cortex-A53/A57.
  SDValue N00 = skipExtensionForVectorMULL(N0->getOperand(0).getNode(), DAG);
  SDValue N01 = skipExtensionForVectorMULL(N0->getOperand(1).getNode(), DAG);
  EVT Op1VT = Op1.getValueType();
  // N0 is the ADD or SUB; its opcode is reused to combine the two products.
  return DAG.getNode(N0->getOpcode(), DL, VT,
                     DAG.getNode(NewOpc, DL, VT,
                                 DAG.getNode(ISD::BITCAST, DL, Op1VT, N00),
                                 Op1),
                     DAG.getNode(NewOpc, DL, VT,
                                 DAG.getNode(ISD::BITCAST, DL, Op1VT, N01),
                                 Op1));
}
|
||||
|
||||
SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
|
||||
SelectionDAG &DAG) const {
|
||||
@ -1768,6 +1966,8 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
|
||||
return LowerFP_TO_INT(Op, DAG);
|
||||
case ISD::FSINCOS:
|
||||
return LowerFSINCOS(Op, DAG);
|
||||
case ISD::MUL:
|
||||
return LowerMUL(Op, DAG);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -169,6 +169,9 @@ enum {
|
||||
/// mode without emitting such REV instructions.
|
||||
NVCAST,
|
||||
|
||||
SMULL,
|
||||
UMULL,
|
||||
|
||||
// NEON Load/Store with post-increment base updates
|
||||
LD2post = ISD::FIRST_TARGET_MEMORY_OPCODE,
|
||||
LD3post,
|
||||
|
@ -239,6 +239,11 @@ def AArch64WrapperLarge : SDNode<"AArch64ISD::WrapperLarge",
|
||||
|
||||
def AArch64NvCast : SDNode<"AArch64ISD::NVCAST", SDTUnaryOp>;
|
||||
|
||||
// Type profile shared by the widening-multiply nodes: one integer result and
// two integer operands that must have the same type as each other.
def SDT_AArch64mull : SDTypeProfile<1, 2,
                                    [SDTCisInt<0>, SDTCisInt<1>,
                                     SDTCisSameAs<1, 2>]>;

// Signed and unsigned widening vector multiply.
def AArch64smull : SDNode<"AArch64ISD::SMULL", SDT_AArch64mull>;
def AArch64umull : SDNode<"AArch64ISD::UMULL", SDT_AArch64mull>;
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
@ -3148,6 +3153,46 @@ defm USUBL : SIMDLongThreeVectorBHS<1, 0b0010, "usubl",
|
||||
defm USUBW : SIMDWideThreeVectorBHS< 1, 0b0011, "usubw",
|
||||
BinOpFrag<(sub node:$LHS, (zext node:$RHS))>>;
|
||||
|
||||
// Additional patterns for SMULL and UMULL
|
||||
// Select a two-operand widening multiply node `opnode` on 64-bit source
// vectors to the given instruction, one pattern per element size
// (8b -> 8h, 4h -> 4s, 2s -> 2d).
multiclass Neon_mul_widen_patterns<SDPatternOperator opnode,
                                   Instruction INST8B,
                                   Instruction INST4H,
                                   Instruction INST2S> {
  def : Pat<(v8i16 (opnode (v8i8 V64:$Rn), (v8i8 V64:$Rm))),
            (INST8B V64:$Rn, V64:$Rm)>;

  def : Pat<(v4i32 (opnode (v4i16 V64:$Rn), (v4i16 V64:$Rm))),
            (INST4H V64:$Rn, V64:$Rm)>;

  def : Pat<(v2i64 (opnode (v2i32 V64:$Rn), (v2i32 V64:$Rm))),
            (INST2S V64:$Rn, V64:$Rm)>;
}

// AArch64ISD::SMULL -> SMULL
defm : Neon_mul_widen_patterns<AArch64smull,
                               SMULLv8i8_v8i16,
                               SMULLv4i16_v4i32,
                               SMULLv2i32_v2i64>;

// AArch64ISD::UMULL -> UMULL
defm : Neon_mul_widen_patterns<AArch64umull,
                               UMULLv8i8_v8i16,
                               UMULLv4i16_v4i32,
                               UMULLv2i32_v2i64>;
|
||||
|
||||
// Additional patterns for SMLAL/SMLSL and UMLAL/UMLSL
|
||||
// Select a three-operand fragment `opnode` (128-bit accumulator $Rd combined
// with a widening multiply of the 64-bit vectors $Rn and $Rm) to the given
// accumulating instruction, one pattern per element size.
multiclass Neon_mulacc_widen_patterns<SDPatternOperator opnode,
                                      Instruction INST8B,
                                      Instruction INST4H,
                                      Instruction INST2S> {
  def : Pat<(v8i16 (opnode (v8i16 V128:$Rd),
                           (v8i8 V64:$Rn), (v8i8 V64:$Rm))),
            (INST8B V128:$Rd, V64:$Rn, V64:$Rm)>;

  def : Pat<(v4i32 (opnode (v4i32 V128:$Rd),
                           (v4i16 V64:$Rn), (v4i16 V64:$Rm))),
            (INST4H V128:$Rd, V64:$Rn, V64:$Rm)>;

  def : Pat<(v2i64 (opnode (v2i64 V128:$Rd),
                           (v2i32 V64:$Rn), (v2i32 V64:$Rm))),
            (INST2S V128:$Rd, V64:$Rn, V64:$Rm)>;
}

// acc + smull(a, b) -> SMLAL
defm : Neon_mulacc_widen_patterns<
           TriOpFrag<(add node:$LHS, (AArch64smull node:$MHS, node:$RHS))>,
           SMLALv8i8_v8i16, SMLALv4i16_v4i32, SMLALv2i32_v2i64>;

// acc + umull(a, b) -> UMLAL
defm : Neon_mulacc_widen_patterns<
           TriOpFrag<(add node:$LHS, (AArch64umull node:$MHS, node:$RHS))>,
           UMLALv8i8_v8i16, UMLALv4i16_v4i32, UMLALv2i32_v2i64>;

// acc - smull(a, b) -> SMLSL
defm : Neon_mulacc_widen_patterns<
           TriOpFrag<(sub node:$LHS, (AArch64smull node:$MHS, node:$RHS))>,
           SMLSLv8i8_v8i16, SMLSLv4i16_v4i32, SMLSLv2i32_v2i64>;

// acc - umull(a, b) -> UMLSL
defm : Neon_mulacc_widen_patterns<
           TriOpFrag<(sub node:$LHS, (AArch64umull node:$MHS, node:$RHS))>,
           UMLSLv8i8_v8i16, UMLSLv4i16_v4i32, UMLSLv2i32_v2i64>;
|
||||
|
||||
// Patterns for 64-bit pmull
|
||||
def : Pat<(int_aarch64_neon_pmull64 V64:$Rn, V64:$Rm),
|
||||
(PMULLv1i64 V64:$Rn, V64:$Rm)>;
|
||||
|
332
test/CodeGen/AArch64/aarch64-smull.ll
Normal file
332
test/CodeGen/AArch64/aarch64-smull.ll
Normal file
@ -0,0 +1,332 @@
|
||||
; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s -o -| FileCheck %s
|
||||
|
||||
; mul of two sign-extended <8 x i8> values.
define <8 x i16> @smull_v8i8_v8i16(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: smull_v8i8_v8i16:
; CHECK: smull {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
  %lhs = load <8 x i8>* %A
  %rhs = load <8 x i8>* %B
  %lhs.ext = sext <8 x i8> %lhs to <8 x i16>
  %rhs.ext = sext <8 x i8> %rhs to <8 x i16>
  %prod = mul <8 x i16> %lhs.ext, %rhs.ext
  ret <8 x i16> %prod
}
|
||||
|
||||
; mul of two sign-extended <4 x i16> values.
define <4 x i32> @smull_v4i16_v4i32(<4 x i16>* %A, <4 x i16>* %B) nounwind {
; CHECK-LABEL: smull_v4i16_v4i32:
; CHECK: smull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
  %lhs = load <4 x i16>* %A
  %rhs = load <4 x i16>* %B
  %lhs.ext = sext <4 x i16> %lhs to <4 x i32>
  %rhs.ext = sext <4 x i16> %rhs to <4 x i32>
  %prod = mul <4 x i32> %lhs.ext, %rhs.ext
  ret <4 x i32> %prod
}
|
||||
|
||||
; mul of two sign-extended <2 x i32> values.
define <2 x i64> @smull_v2i32_v2i64(<2 x i32>* %A, <2 x i32>* %B) nounwind {
; CHECK-LABEL: smull_v2i32_v2i64:
; CHECK: smull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
  %lhs = load <2 x i32>* %A
  %rhs = load <2 x i32>* %B
  %lhs.ext = sext <2 x i32> %lhs to <2 x i64>
  %rhs.ext = sext <2 x i32> %rhs to <2 x i64>
  %prod = mul <2 x i64> %lhs.ext, %rhs.ext
  ret <2 x i64> %prod
}
|
||||
|
||||
; mul of two zero-extended <8 x i8> values.
define <8 x i16> @umull_v8i8_v8i16(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: umull_v8i8_v8i16:
; CHECK: umull {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
  %lhs = load <8 x i8>* %A
  %rhs = load <8 x i8>* %B
  %lhs.ext = zext <8 x i8> %lhs to <8 x i16>
  %rhs.ext = zext <8 x i8> %rhs to <8 x i16>
  %prod = mul <8 x i16> %lhs.ext, %rhs.ext
  ret <8 x i16> %prod
}
|
||||
|
||||
; mul of two zero-extended <4 x i16> values.
define <4 x i32> @umull_v4i16_v4i32(<4 x i16>* %A, <4 x i16>* %B) nounwind {
; CHECK-LABEL: umull_v4i16_v4i32:
; CHECK: umull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
  %lhs = load <4 x i16>* %A
  %rhs = load <4 x i16>* %B
  %lhs.ext = zext <4 x i16> %lhs to <4 x i32>
  %rhs.ext = zext <4 x i16> %rhs to <4 x i32>
  %prod = mul <4 x i32> %lhs.ext, %rhs.ext
  ret <4 x i32> %prod
}
|
||||
|
||||
; mul of two zero-extended <2 x i32> values.
define <2 x i64> @umull_v2i32_v2i64(<2 x i32>* %A, <2 x i32>* %B) nounwind {
; CHECK-LABEL: umull_v2i32_v2i64:
; CHECK: umull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
  %lhs = load <2 x i32>* %A
  %rhs = load <2 x i32>* %B
  %lhs.ext = zext <2 x i32> %lhs to <2 x i64>
  %rhs.ext = zext <2 x i32> %rhs to <2 x i64>
  %prod = mul <2 x i64> %lhs.ext, %rhs.ext
  ret <2 x i64> %prod
}
|
||||
|
||||
; acc + sext(b) * sext(c) on <8 x i8> sources.
define <8 x i16> @smlal_v8i8_v8i16(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
; CHECK-LABEL: smlal_v8i8_v8i16:
; CHECK: smlal {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
  %acc = load <8 x i16>* %A
  %lhs = load <8 x i8>* %B
  %rhs = load <8 x i8>* %C
  %lhs.ext = sext <8 x i8> %lhs to <8 x i16>
  %rhs.ext = sext <8 x i8> %rhs to <8 x i16>
  %prod = mul <8 x i16> %lhs.ext, %rhs.ext
  %res = add <8 x i16> %acc, %prod
  ret <8 x i16> %res
}
|
||||
|
||||
; acc + sext(b) * sext(c) on <4 x i16> sources.
define <4 x i32> @smlal_v4i16_v4i32(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
; CHECK-LABEL: smlal_v4i16_v4i32:
; CHECK: smlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
  %acc = load <4 x i32>* %A
  %lhs = load <4 x i16>* %B
  %rhs = load <4 x i16>* %C
  %lhs.ext = sext <4 x i16> %lhs to <4 x i32>
  %rhs.ext = sext <4 x i16> %rhs to <4 x i32>
  %prod = mul <4 x i32> %lhs.ext, %rhs.ext
  %res = add <4 x i32> %acc, %prod
  ret <4 x i32> %res
}
|
||||
|
||||
; acc + sext(b) * sext(c) on <2 x i32> sources.
define <2 x i64> @smlal_v2i32_v2i64(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
; CHECK-LABEL: smlal_v2i32_v2i64:
; CHECK: smlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
  %acc = load <2 x i64>* %A
  %lhs = load <2 x i32>* %B
  %rhs = load <2 x i32>* %C
  %lhs.ext = sext <2 x i32> %lhs to <2 x i64>
  %rhs.ext = sext <2 x i32> %rhs to <2 x i64>
  %prod = mul <2 x i64> %lhs.ext, %rhs.ext
  %res = add <2 x i64> %acc, %prod
  ret <2 x i64> %res
}
|
||||
|
||||
; acc + zext(b) * zext(c) on <8 x i8> sources.
define <8 x i16> @umlal_v8i8_v8i16(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
; CHECK-LABEL: umlal_v8i8_v8i16:
; CHECK: umlal {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
  %acc = load <8 x i16>* %A
  %lhs = load <8 x i8>* %B
  %rhs = load <8 x i8>* %C
  %lhs.ext = zext <8 x i8> %lhs to <8 x i16>
  %rhs.ext = zext <8 x i8> %rhs to <8 x i16>
  %prod = mul <8 x i16> %lhs.ext, %rhs.ext
  %res = add <8 x i16> %acc, %prod
  ret <8 x i16> %res
}
|
||||
|
||||
; acc + zext(b) * zext(c) on <4 x i16> sources.
define <4 x i32> @umlal_v4i16_v4i32(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
; CHECK-LABEL: umlal_v4i16_v4i32:
; CHECK: umlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
  %acc = load <4 x i32>* %A
  %lhs = load <4 x i16>* %B
  %rhs = load <4 x i16>* %C
  %lhs.ext = zext <4 x i16> %lhs to <4 x i32>
  %rhs.ext = zext <4 x i16> %rhs to <4 x i32>
  %prod = mul <4 x i32> %lhs.ext, %rhs.ext
  %res = add <4 x i32> %acc, %prod
  ret <4 x i32> %res
}
|
||||
|
||||
; acc + zext(b) * zext(c) on <2 x i32> sources.
define <2 x i64> @umlal_v2i32_v2i64(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
; CHECK-LABEL: umlal_v2i32_v2i64:
; CHECK: umlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
  %acc = load <2 x i64>* %A
  %lhs = load <2 x i32>* %B
  %rhs = load <2 x i32>* %C
  %lhs.ext = zext <2 x i32> %lhs to <2 x i64>
  %rhs.ext = zext <2 x i32> %rhs to <2 x i64>
  %prod = mul <2 x i64> %lhs.ext, %rhs.ext
  %res = add <2 x i64> %acc, %prod
  ret <2 x i64> %res
}
|
||||
|
||||
; acc - sext(b) * sext(c) on <8 x i8> sources.
define <8 x i16> @smlsl_v8i8_v8i16(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
; CHECK-LABEL: smlsl_v8i8_v8i16:
; CHECK: smlsl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
  %acc = load <8 x i16>* %A
  %lhs = load <8 x i8>* %B
  %rhs = load <8 x i8>* %C
  %lhs.ext = sext <8 x i8> %lhs to <8 x i16>
  %rhs.ext = sext <8 x i8> %rhs to <8 x i16>
  %prod = mul <8 x i16> %lhs.ext, %rhs.ext
  %res = sub <8 x i16> %acc, %prod
  ret <8 x i16> %res
}
|
||||
|
||||
; acc - sext(b) * sext(c) on <4 x i16> sources.
define <4 x i32> @smlsl_v4i16_v4i32(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
; CHECK-LABEL: smlsl_v4i16_v4i32:
; CHECK: smlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
  %acc = load <4 x i32>* %A
  %lhs = load <4 x i16>* %B
  %rhs = load <4 x i16>* %C
  %lhs.ext = sext <4 x i16> %lhs to <4 x i32>
  %rhs.ext = sext <4 x i16> %rhs to <4 x i32>
  %prod = mul <4 x i32> %lhs.ext, %rhs.ext
  %res = sub <4 x i32> %acc, %prod
  ret <4 x i32> %res
}
|
||||
|
||||
; acc - sext(b) * sext(c) on <2 x i32> sources.
define <2 x i64> @smlsl_v2i32_v2i64(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
; CHECK-LABEL: smlsl_v2i32_v2i64:
; CHECK: smlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
  %acc = load <2 x i64>* %A
  %lhs = load <2 x i32>* %B
  %rhs = load <2 x i32>* %C
  %lhs.ext = sext <2 x i32> %lhs to <2 x i64>
  %rhs.ext = sext <2 x i32> %rhs to <2 x i64>
  %prod = mul <2 x i64> %lhs.ext, %rhs.ext
  %res = sub <2 x i64> %acc, %prod
  ret <2 x i64> %res
}
|
||||
|
||||
; acc - zext(b) * zext(c) on <8 x i8> sources.
define <8 x i16> @umlsl_v8i8_v8i16(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
; CHECK-LABEL: umlsl_v8i8_v8i16:
; CHECK: umlsl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
  %acc = load <8 x i16>* %A
  %lhs = load <8 x i8>* %B
  %rhs = load <8 x i8>* %C
  %lhs.ext = zext <8 x i8> %lhs to <8 x i16>
  %rhs.ext = zext <8 x i8> %rhs to <8 x i16>
  %prod = mul <8 x i16> %lhs.ext, %rhs.ext
  %res = sub <8 x i16> %acc, %prod
  ret <8 x i16> %res
}
|
||||
|
||||
; acc - zext(b) * zext(c) on <4 x i16> sources.
define <4 x i32> @umlsl_v4i16_v4i32(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
; CHECK-LABEL: umlsl_v4i16_v4i32:
; CHECK: umlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
  %acc = load <4 x i32>* %A
  %lhs = load <4 x i16>* %B
  %rhs = load <4 x i16>* %C
  %lhs.ext = zext <4 x i16> %lhs to <4 x i32>
  %rhs.ext = zext <4 x i16> %rhs to <4 x i32>
  %prod = mul <4 x i32> %lhs.ext, %rhs.ext
  %res = sub <4 x i32> %acc, %prod
  ret <4 x i32> %res
}
|
||||
|
||||
; acc - zext(b) * zext(c) on <2 x i32> sources.
define <2 x i64> @umlsl_v2i32_v2i64(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
; CHECK-LABEL: umlsl_v2i32_v2i64:
; CHECK: umlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
  %acc = load <2 x i64>* %A
  %lhs = load <2 x i32>* %B
  %rhs = load <2 x i32>* %C
  %lhs.ext = zext <2 x i32> %lhs to <2 x i64>
  %rhs.ext = zext <2 x i32> %rhs to <2 x i64>
  %prod = mul <2 x i64> %lhs.ext, %rhs.ext
  %res = sub <2 x i64> %acc, %prod
  ret <2 x i64> %res
}
|
||||
|
||||
; SMULL recognizing BUILD_VECTORs with sign/zero-extended elements.
|
||||
; sext(<8 x i8>) times a constant vector whose elements fit in a signed i8.
define <8 x i16> @smull_extvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
; CHECK-LABEL: smull_extvec_v8i8_v8i16:
; CHECK: smull {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
  %arg.ext = sext <8 x i8> %arg to <8 x i16>
  %prod = mul <8 x i16> %arg.ext, <i16 -12, i16 -12, i16 -12, i16 -12, i16 -12, i16 -12, i16 -12, i16 -12>
  ret <8 x i16> %prod
}
|
||||
|
||||
define <8 x i16> @smull_noextvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
; SMULL must not be used when the BUILD_VECTOR element values are too big
; (-999 does not fit in a signed i8).
; CHECK-LABEL: smull_noextvec_v8i8_v8i16:
; CHECK: movz
; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
  %arg.ext = sext <8 x i8> %arg to <8 x i16>
  %prod = mul <8 x i16> %arg.ext, <i16 -999, i16 -999, i16 -999, i16 -999, i16 -999, i16 -999, i16 -999, i16 -999>
  ret <8 x i16> %prod
}
|
||||
|
||||
; sext(<4 x i16>) times a constant vector whose elements fit in a signed i16.
define <4 x i32> @smull_extvec_v4i16_v4i32(<4 x i16> %arg) nounwind {
; CHECK-LABEL: smull_extvec_v4i16_v4i32:
; CHECK: smull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
  %arg.ext = sext <4 x i16> %arg to <4 x i32>
  %prod = mul <4 x i32> %arg.ext, <i32 -12, i32 -12, i32 -12, i32 -12>
  ret <4 x i32> %prod
}
|
||||
|
||||
; sext(<2 x i32>) times a constant vector whose elements fit in a signed i32.
define <2 x i64> @smull_extvec_v2i32_v2i64(<2 x i32> %arg) nounwind {
; Use CHECK-LABEL (with the trailing colon) like every other test in this
; file, so the smull check below is confined to this function's output.
; CHECK-LABEL: smull_extvec_v2i32_v2i64:
; CHECK: smull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
  %tmp3 = sext <2 x i32> %arg to <2 x i64>
  %tmp4 = mul <2 x i64> %tmp3, <i64 -1234, i64 -1234>
  ret <2 x i64> %tmp4
}
|
||||
|
||||
; zext(<8 x i8>) times a constant vector whose elements fit in an unsigned i8.
define <8 x i16> @umull_extvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
; CHECK-LABEL: umull_extvec_v8i8_v8i16:
; CHECK: umull {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
  %arg.ext = zext <8 x i8> %arg to <8 x i16>
  %prod = mul <8 x i16> %arg.ext, <i16 12, i16 12, i16 12, i16 12, i16 12, i16 12, i16 12, i16 12>
  ret <8 x i16> %prod
}
|
||||
|
||||
define <8 x i16> @umull_noextvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
; Do not use UMULL if the BUILD_VECTOR element values are too big
; (999 does not fit in an unsigned i8).
; CHECK-LABEL: umull_noextvec_v8i8_v8i16:
; CHECK: movz
; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
  %tmp3 = zext <8 x i8> %arg to <8 x i16>
  %tmp4 = mul <8 x i16> %tmp3, <i16 999, i16 999, i16 999, i16 999, i16 999, i16 999, i16 999, i16 999>
  ret <8 x i16> %tmp4
}
|
||||
|
||||
; zext(<4 x i16>) times a constant vector whose elements fit in an unsigned i16.
define <4 x i32> @umull_extvec_v4i16_v4i32(<4 x i16> %arg) nounwind {
; CHECK-LABEL: umull_extvec_v4i16_v4i32:
; CHECK: umull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
  %arg.ext = zext <4 x i16> %arg to <4 x i32>
  %prod = mul <4 x i32> %arg.ext, <i32 1234, i32 1234, i32 1234, i32 1234>
  ret <4 x i32> %prod
}
|
||||
|
||||
; zext(<2 x i32>) times a constant vector whose elements fit in an unsigned i32.
define <2 x i64> @umull_extvec_v2i32_v2i64(<2 x i32> %arg) nounwind {
; CHECK-LABEL: umull_extvec_v2i32_v2i64:
; CHECK: umull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
  %arg.ext = zext <2 x i32> %arg to <2 x i64>
  %prod = mul <2 x i64> %arg.ext, <i64 1234, i64 1234>
  ret <2 x i64> %prod
}
|
||||
|
||||
define i16 @smullWithInconsistentExtensions(<8 x i8> %vec) {
; When one operand is zero-extended and the other sign-extended, smull
; cannot be used. Here the sext is multiplied by the constant 255.
; CHECK-LABEL: smullWithInconsistentExtensions:
; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
  %vec.ext = sext <8 x i8> %vec to <8 x i16>
  %prod = mul <8 x i16> %vec.ext, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
  %elt0 = extractelement <8 x i16> %prod, i32 0
  ret i16 %elt0
}
|
||||
|
||||
; (zext hi + zext lo) * zext splat must be distributed into umull + umlal
; that share the splat operand.
define void @distribute(i16* %dst, i8* %src, i32 %mul) nounwind {
entry:
; CHECK-LABEL: distribute:
; CHECK: umull [[REG1:(v[0-9]+.8h)]], {{v[0-9]+}}.8b, [[REG2:(v[0-9]+.8b)]]
; CHECK: umlal [[REG1]], {{v[0-9]+}}.8b, [[REG2]]
  %mul.i8 = trunc i32 %mul to i8
  %mul.ins = insertelement <8 x i8> undef, i8 %mul.i8, i32 0
  %mul.splat = shufflevector <8 x i8> %mul.ins, <8 x i8> undef, <8 x i32> zeroinitializer
  %vec = tail call <16 x i8> @llvm.aarch64.neon.vld1.v16i8(i8* %src, i32 1)
  %vec.f64 = bitcast <16 x i8> %vec to <2 x double>
  %hi.f64 = extractelement <2 x double> %vec.f64, i32 1
  %hi = bitcast double %hi.f64 to <8 x i8>
  %hi.ext = zext <8 x i8> %hi to <8 x i16>
  %mul.ext = zext <8 x i8> %mul.splat to <8 x i16>
  %lo.f64 = extractelement <2 x double> %vec.f64, i32 0
  %lo = bitcast double %lo.f64 to <8 x i8>
  %lo.ext = zext <8 x i8> %lo to <8 x i16>
  %sum = add <8 x i16> %hi.ext, %lo.ext
  %prod = mul <8 x i16> %sum, %mul.ext
  %dst.i8 = bitcast i16* %dst to i8*
  tail call void @llvm.aarch64.neon.vst1.v8i16(i8* %dst.i8, <8 x i16> %prod, i32 2)
  ret void
}
|
||||
|
||||
declare <16 x i8> @llvm.aarch64.neon.vld1.v16i8(i8*, i32) nounwind readonly
|
||||
|
||||
declare void @llvm.aarch64.neon.vst1.v8i16(i8*, <8 x i16>, i32) nounwind
|
||||
|
Loading…
Reference in New Issue
Block a user