[AArch64] Support selecting STNP.

We could go through the load/store optimizer and match STNP where
we would have matched a nontemporal-annotated STP, but as an
opportunistic optimization that isn't reliable enough.
Instead, we can guarantee emitting STNP by matching it at ISel.
Since there are no single-input nontemporal stores, we have to
resort to some high-bits-extracting trickery to generate an STNP
from a plain store.
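For example, here is the scalar i64 case (it also appears in the test
added below): the value is stored as two 32-bit halves, with the high
half produced by a 32-bit right shift. Register numbers are only
illustrative.

  define void @test_stnp_i64(i64* %p, i64 %v) {
    store i64 %v, i64* %p, align 1, !nontemporal !0
    ret void
  }
  !0 = !{ i32 1 }

  ; selected roughly as:
  ;   lsr  x8, x1, #32     ; high 32 bits of the value
  ;   stnp w1, w8, [x0]    ; nontemporal store-pair of low/high halves
  ;   ret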

Also, we need to support another, LDP/STP-specific addressing mode:
base + scaled signed 7-bit immediate offset.
For now, only match the base; making the matcher smarter is left for
a separate change.
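Concretely, with base-only matching a constant offset is currently
materialized with a separate add, and the STNP itself uses offset #0
(this mirrors test_stnp_v2f64_offset below; registers illustrative):

  %tmp0 = getelementptr <2 x double>, <2 x double>* %p, i32 1
  store <2 x double> %v, <2 x double>* %tmp0, align 1, !nontemporal !0

  ; currently selected roughly as:
  ;   add  x8, x0, #16      ; base + offset computed up front
  ;   mov  d1, v0[1]        ; high 64 bits of the vector
  ;   stnp d0, d1, [x8]
  ; a smarter matcher could fold the offset into the STNP itself:
  ;   stnp d0, d1, [x0, #16]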

Part of PR24086.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@247231 91177308-0d34-0410-b5e6-96231b3b80d8
Ahmed Bougacha 2015-09-10 01:42:28 +00:00
parent 3a2cec85a7
commit d636e64cbc
4 changed files with 270 additions and 0 deletions

lib/Target/AArch64/AArch64ISelDAGToDAG.cpp

@@ -77,6 +77,21 @@ public:
bool SelectLogicalShiftedRegister(SDValue N, SDValue &Reg, SDValue &Shift) {
return SelectShiftedRegister(N, true, Reg, Shift);
}
bool SelectAddrModeIndexed7S8(SDValue N, SDValue &Base, SDValue &OffImm) {
return SelectAddrModeIndexed7S(N, 1, Base, OffImm);
}
bool SelectAddrModeIndexed7S16(SDValue N, SDValue &Base, SDValue &OffImm) {
return SelectAddrModeIndexed7S(N, 2, Base, OffImm);
}
bool SelectAddrModeIndexed7S32(SDValue N, SDValue &Base, SDValue &OffImm) {
return SelectAddrModeIndexed7S(N, 4, Base, OffImm);
}
bool SelectAddrModeIndexed7S64(SDValue N, SDValue &Base, SDValue &OffImm) {
return SelectAddrModeIndexed7S(N, 8, Base, OffImm);
}
bool SelectAddrModeIndexed7S128(SDValue N, SDValue &Base, SDValue &OffImm) {
return SelectAddrModeIndexed7S(N, 16, Base, OffImm);
}
bool SelectAddrModeIndexed8(SDValue N, SDValue &Base, SDValue &OffImm) {
return SelectAddrModeIndexed(N, 1, Base, OffImm);
}
@@ -164,6 +179,8 @@ public:
private:
bool SelectShiftedRegister(SDValue N, bool AllowROR, SDValue &Reg,
SDValue &Shift);
bool SelectAddrModeIndexed7S(SDValue N, unsigned Size, SDValue &Base,
SDValue &OffImm);
bool SelectAddrModeIndexed(SDValue N, unsigned Size, SDValue &Base,
SDValue &OffImm);
bool SelectAddrModeUnscaled(SDValue N, unsigned Size, SDValue &Base,
@@ -606,6 +623,22 @@ static bool isWorthFoldingADDlow(SDValue N) {
return true;
}
/// SelectAddrModeIndexed7S - Select a "register plus scaled signed 7-bit
/// immediate" address. The "Size" argument is the size in bytes of the memory
/// reference, which determines the scale.
bool AArch64DAGToDAGISel::SelectAddrModeIndexed7S(SDValue N, unsigned Size,
SDValue &Base,
SDValue &OffImm) {
SDLoc dl(N);
// Base only. The address will be materialized into a register before
// the memory is accessed.
// add x0, Xbase, #offset
// stp x1, x2, [x0]
Base = N;
OffImm = CurDAG->getTargetConstant(0, dl, MVT::i64);
return true;
}
/// SelectAddrModeIndexed - Select a "register plus scaled unsigned 12-bit
/// immediate" address. The "Size" argument is the size in bytes of the memory
/// reference, which determines the scale.

lib/Target/AArch64/AArch64InstrFormats.td

@@ -248,6 +248,12 @@ def simm7s16 : Operand<i32> {
let PrintMethod = "printImmScale<16>";
}
def am_indexed7s8 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S8", []>;
def am_indexed7s16 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S16", []>;
def am_indexed7s32 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S32", []>;
def am_indexed7s64 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S64", []>;
def am_indexed7s128 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S128", []>;
class AsmImmRange<int Low, int High> : AsmOperandClass {
let Name = "Imm" # Low # "_" # High;
let DiagnosticType = "InvalidImm" # Low # "_" # High;

lib/Target/AArch64/AArch64InstrInfo.td

@@ -5825,6 +5825,45 @@ def : Pat<(i64 (int_aarch64_neon_srshl (i64 FPR64:$Rn), (i64 FPR64:$Rm))),
def : Pat<(i64 (int_aarch64_neon_urshl (i64 FPR64:$Rn), (i64 FPR64:$Rm))),
(URSHLv1i64 FPR64:$Rn, FPR64:$Rm)>;
// Patterns for nontemporal/no-allocate stores.
// We have to resort to tricks to turn a single-input store into a store pair,
// because there is no single-input nontemporal store, only STNP.
let Predicates = [IsLE] in {
let AddedComplexity = 15 in {
class NTStore128Pat<ValueType VT> :
Pat<(nontemporalstore (VT FPR128:$Rt),
(am_indexed7s64 GPR64sp:$Rn, simm7s8:$offset)),
(STNPDi (EXTRACT_SUBREG FPR128:$Rt, dsub),
(CPYi64 FPR128:$Rt, (i64 1)),
GPR64sp:$Rn, simm7s8:$offset)>;
def : NTStore128Pat<v2i64>;
def : NTStore128Pat<v4i32>;
def : NTStore128Pat<v8i16>;
def : NTStore128Pat<v16i8>;
class NTStore64Pat<ValueType VT> :
Pat<(nontemporalstore (VT FPR64:$Rt),
(am_indexed7s32 GPR64sp:$Rn, simm7s4:$offset)),
(STNPSi (EXTRACT_SUBREG FPR64:$Rt, ssub),
(CPYi32 (SUBREG_TO_REG (i64 0), FPR64:$Rt, dsub), (i64 1)),
GPR64sp:$Rn, simm7s4:$offset)>;
// FIXME: Shouldn't v1f64 loads/stores be promoted to v1i64?
def : NTStore64Pat<v1f64>;
def : NTStore64Pat<v1i64>;
def : NTStore64Pat<v2i32>;
def : NTStore64Pat<v4i16>;
def : NTStore64Pat<v8i8>;
def : Pat<(nontemporalstore GPR64:$Rt,
(am_indexed7s32 GPR64sp:$Rn, simm7s4:$offset)),
(STNPWi (EXTRACT_SUBREG GPR64:$Rt, sub_32),
(EXTRACT_SUBREG (UBFMXri GPR64:$Rt, 32, 63), sub_32),
GPR64sp:$Rn, simm7s4:$offset)>;
} // AddedComplexity = 15
} // Predicates = [IsLE]
// Tail call return handling. These are all compiler pseudo-instructions,
// so no encoding information or anything like that.
let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [SP] in {

test/CodeGen/AArch64/nontemporal.ll

@@ -0,0 +1,192 @@
; RUN: llc < %s -mtriple aarch64-apple-darwin -asm-verbose=false | FileCheck %s
define void @test_stnp_v4i64(<4 x i64>* %p, <4 x i64> %v) #0 {
; CHECK-LABEL: test_stnp_v4i64:
; CHECK-NEXT: add x[[PTR:[0-9]+]], x0, #16
; CHECK-NEXT: mov d[[HI1:[0-9]+]], v1[1]
; CHECK-NEXT: mov d[[HI0:[0-9]+]], v0[1]
; CHECK-NEXT: stnp d1, d[[HI1]], [x[[PTR]]]
; CHECK-NEXT: stnp d0, d[[HI0]], [x0]
; CHECK-NEXT: ret
store <4 x i64> %v, <4 x i64>* %p, align 1, !nontemporal !0
ret void
}
define void @test_stnp_v4i32(<4 x i32>* %p, <4 x i32> %v) #0 {
; CHECK-LABEL: test_stnp_v4i32:
; CHECK-NEXT: mov d[[HI:[0-9]+]], v0[1]
; CHECK-NEXT: stnp d0, d[[HI]], [x0]
; CHECK-NEXT: ret
store <4 x i32> %v, <4 x i32>* %p, align 1, !nontemporal !0
ret void
}
define void @test_stnp_v8i16(<8 x i16>* %p, <8 x i16> %v) #0 {
; CHECK-LABEL: test_stnp_v8i16:
; CHECK-NEXT: mov d[[HI:[0-9]+]], v0[1]
; CHECK-NEXT: stnp d0, d[[HI]], [x0]
; CHECK-NEXT: ret
store <8 x i16> %v, <8 x i16>* %p, align 1, !nontemporal !0
ret void
}
define void @test_stnp_v16i8(<16 x i8>* %p, <16 x i8> %v) #0 {
; CHECK-LABEL: test_stnp_v16i8:
; CHECK-NEXT: mov d[[HI:[0-9]+]], v0[1]
; CHECK-NEXT: stnp d0, d[[HI]], [x0]
; CHECK-NEXT: ret
store <16 x i8> %v, <16 x i8>* %p, align 1, !nontemporal !0
ret void
}
define void @test_stnp_v2i32(<2 x i32>* %p, <2 x i32> %v) #0 {
; CHECK-LABEL: test_stnp_v2i32:
; CHECK-NEXT: mov s[[HI:[0-9]+]], v0[1]
; CHECK-NEXT: stnp s0, s[[HI]], [x0]
; CHECK-NEXT: ret
store <2 x i32> %v, <2 x i32>* %p, align 1, !nontemporal !0
ret void
}
define void @test_stnp_v4i16(<4 x i16>* %p, <4 x i16> %v) #0 {
; CHECK-LABEL: test_stnp_v4i16:
; CHECK-NEXT: mov s[[HI:[0-9]+]], v0[1]
; CHECK-NEXT: stnp s0, s[[HI]], [x0]
; CHECK-NEXT: ret
store <4 x i16> %v, <4 x i16>* %p, align 1, !nontemporal !0
ret void
}
define void @test_stnp_v8i8(<8 x i8>* %p, <8 x i8> %v) #0 {
; CHECK-LABEL: test_stnp_v8i8:
; CHECK-NEXT: mov s[[HI:[0-9]+]], v0[1]
; CHECK-NEXT: stnp s0, s[[HI]], [x0]
; CHECK-NEXT: ret
store <8 x i8> %v, <8 x i8>* %p, align 1, !nontemporal !0
ret void
}
define void @test_stnp_v2f64(<2 x double>* %p, <2 x double> %v) #0 {
; CHECK-LABEL: test_stnp_v2f64:
; CHECK-NEXT: mov d[[HI:[0-9]+]], v0[1]
; CHECK-NEXT: stnp d0, d[[HI]], [x0]
; CHECK-NEXT: ret
store <2 x double> %v, <2 x double>* %p, align 1, !nontemporal !0
ret void
}
define void @test_stnp_v4f32(<4 x float>* %p, <4 x float> %v) #0 {
; CHECK-LABEL: test_stnp_v4f32:
; CHECK-NEXT: mov d[[HI:[0-9]+]], v0[1]
; CHECK-NEXT: stnp d0, d[[HI]], [x0]
; CHECK-NEXT: ret
store <4 x float> %v, <4 x float>* %p, align 1, !nontemporal !0
ret void
}
define void @test_stnp_v2f32(<2 x float>* %p, <2 x float> %v) #0 {
; CHECK-LABEL: test_stnp_v2f32:
; CHECK-NEXT: mov s[[HI:[0-9]+]], v0[1]
; CHECK-NEXT: stnp s0, s[[HI]], [x0]
; CHECK-NEXT: ret
store <2 x float> %v, <2 x float>* %p, align 1, !nontemporal !0
ret void
}
define void @test_stnp_v1f64(<1 x double>* %p, <1 x double> %v) #0 {
; CHECK-LABEL: test_stnp_v1f64:
; CHECK-NEXT: mov s[[HI:[0-9]+]], v0[1]
; CHECK-NEXT: stnp s0, s[[HI]], [x0]
; CHECK-NEXT: ret
store <1 x double> %v, <1 x double>* %p, align 1, !nontemporal !0
ret void
}
define void @test_stnp_v1i64(<1 x i64>* %p, <1 x i64> %v) #0 {
; CHECK-LABEL: test_stnp_v1i64:
; CHECK-NEXT: mov s[[HI:[0-9]+]], v0[1]
; CHECK-NEXT: stnp s0, s[[HI]], [x0]
; CHECK-NEXT: ret
store <1 x i64> %v, <1 x i64>* %p, align 1, !nontemporal !0
ret void
}
define void @test_stnp_i64(i64* %p, i64 %v) #0 {
; CHECK-LABEL: test_stnp_i64:
; CHECK-NEXT: lsr x[[HI:[0-9]+]], x1, #32
; CHECK-NEXT: stnp w1, w[[HI]], [x0]
; CHECK-NEXT: ret
store i64 %v, i64* %p, align 1, !nontemporal !0
ret void
}
define void @test_stnp_v2f64_offset(<2 x double>* %p, <2 x double> %v) #0 {
; CHECK-LABEL: test_stnp_v2f64_offset:
; CHECK-NEXT: add x[[PTR:[0-9]+]], x0, #16
; CHECK-NEXT: mov d[[HI:[0-9]+]], v0[1]
; CHECK-NEXT: stnp d0, d[[HI]], [x[[PTR]]]
; CHECK-NEXT: ret
%tmp0 = getelementptr <2 x double>, <2 x double>* %p, i32 1
store <2 x double> %v, <2 x double>* %tmp0, align 1, !nontemporal !0
ret void
}
define void @test_stnp_v2f64_offset_neg(<2 x double>* %p, <2 x double> %v) #0 {
; CHECK-LABEL: test_stnp_v2f64_offset_neg:
; CHECK-NEXT: sub x[[PTR:[0-9]+]], x0, #16
; CHECK-NEXT: mov d[[HI:[0-9]+]], v0[1]
; CHECK-NEXT: stnp d0, d[[HI]], [x[[PTR]]]
; CHECK-NEXT: ret
%tmp0 = getelementptr <2 x double>, <2 x double>* %p, i32 -1
store <2 x double> %v, <2 x double>* %tmp0, align 1, !nontemporal !0
ret void
}
define void @test_stnp_v2f32_offset(<2 x float>* %p, <2 x float> %v) #0 {
; CHECK-LABEL: test_stnp_v2f32_offset:
; CHECK-NEXT: add x[[PTR:[0-9]+]], x0, #8
; CHECK-NEXT: mov s[[HI:[0-9]+]], v0[1]
; CHECK-NEXT: stnp s0, s[[HI]], [x[[PTR]]]
; CHECK-NEXT: ret
%tmp0 = getelementptr <2 x float>, <2 x float>* %p, i32 1
store <2 x float> %v, <2 x float>* %tmp0, align 1, !nontemporal !0
ret void
}
define void @test_stnp_v2f32_offset_neg(<2 x float>* %p, <2 x float> %v) #0 {
; CHECK-LABEL: test_stnp_v2f32_offset_neg:
; CHECK-NEXT: sub x[[PTR:[0-9]+]], x0, #8
; CHECK-NEXT: mov s[[HI:[0-9]+]], v0[1]
; CHECK-NEXT: stnp s0, s[[HI]], [x[[PTR]]]
; CHECK-NEXT: ret
%tmp0 = getelementptr <2 x float>, <2 x float>* %p, i32 -1
store <2 x float> %v, <2 x float>* %tmp0, align 1, !nontemporal !0
ret void
}
define void @test_stnp_i64_offset(i64* %p, i64 %v) #0 {
; CHECK-LABEL: test_stnp_i64_offset:
; CHECK-NEXT: add x[[PTR:[0-9]+]], x0, #8
; CHECK-NEXT: lsr x[[HI:[0-9]+]], x1, #32
; CHECK-NEXT: stnp w1, w[[HI]], [x[[PTR]]]
; CHECK-NEXT: ret
%tmp0 = getelementptr i64, i64* %p, i32 1
store i64 %v, i64* %tmp0, align 1, !nontemporal !0
ret void
}
define void @test_stnp_i64_offset_neg(i64* %p, i64 %v) #0 {
; CHECK-LABEL: test_stnp_i64_offset_neg:
; CHECK-NEXT: sub x[[PTR:[0-9]+]], x0, #8
; CHECK-NEXT: lsr x[[HI:[0-9]+]], x1, #32
; CHECK-NEXT: stnp w1, w[[HI]], [x[[PTR]]]
; CHECK-NEXT: ret
%tmp0 = getelementptr i64, i64* %p, i32 -1
store i64 %v, i64* %tmp0, align 1, !nontemporal !0
ret void
}
!0 = !{ i32 1 }
attributes #0 = { nounwind }