[AArch64] Support selecting STNP.
We could go through the load/store optimizer and match STNP where we would have matched a nontemporal-annotated STP, but, as an opportunistic optimization, that isn't reliable enough. Instead, we can guarantee emitting STNP by matching nontemporal stores at ISel.

Since there is no single-input nontemporal store instruction, only STNP, we have to resort to some high-bits-extracting trickery to generate an STNP from a plain store.

Also, we need to support another, LDP/STP-specific addressing mode: base + signed scaled 7-bit immediate offset. For now, only match the base; making it smarter is left to a separate change.

Part of PR24086.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@247231 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in: parent 3a2cec85a7, commit d636e64cbc
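As a quick illustration of the intended lowering (the function name store_nt and the concrete register numbers are illustrative only; the authoritative FileCheck patterns are in the new test/CodeGen/AArch64/nontemporal.ll added below), a store carrying !nontemporal metadata:

; LLVM IR input: an ordinary store tagged with !nontemporal metadata.
define void @store_nt(<4 x i32>* %p, <4 x i32> %v) {
  store <4 x i32> %v, <4 x i32>* %p, align 1, !nontemporal !0
  ret void
}
!0 = !{i32 1}

now selects an STNP by splitting the 128-bit value into its two 64-bit halves (compare test_stnp_v4i32 in the test below):

  mov  d1, v0[1]      ; copy the high 64 bits of q0
  stnp d0, d1, [x0]   ; non-temporal store of both halves
  ret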
lib/Target/AArch64/AArch64ISelDAGToDAG.cpp

@@ -77,6 +77,21 @@ public:
  bool SelectLogicalShiftedRegister(SDValue N, SDValue &Reg, SDValue &Shift) {
    return SelectShiftedRegister(N, true, Reg, Shift);
  }
  bool SelectAddrModeIndexed7S8(SDValue N, SDValue &Base, SDValue &OffImm) {
    return SelectAddrModeIndexed7S(N, 1, Base, OffImm);
  }
  bool SelectAddrModeIndexed7S16(SDValue N, SDValue &Base, SDValue &OffImm) {
    return SelectAddrModeIndexed7S(N, 2, Base, OffImm);
  }
  bool SelectAddrModeIndexed7S32(SDValue N, SDValue &Base, SDValue &OffImm) {
    return SelectAddrModeIndexed7S(N, 4, Base, OffImm);
  }
  bool SelectAddrModeIndexed7S64(SDValue N, SDValue &Base, SDValue &OffImm) {
    return SelectAddrModeIndexed7S(N, 8, Base, OffImm);
  }
  bool SelectAddrModeIndexed7S128(SDValue N, SDValue &Base, SDValue &OffImm) {
    return SelectAddrModeIndexed7S(N, 16, Base, OffImm);
  }
  bool SelectAddrModeIndexed8(SDValue N, SDValue &Base, SDValue &OffImm) {
    return SelectAddrModeIndexed(N, 1, Base, OffImm);
  }

@@ -164,6 +179,8 @@ public:
private:
  bool SelectShiftedRegister(SDValue N, bool AllowROR, SDValue &Reg,
                             SDValue &Shift);
  bool SelectAddrModeIndexed7S(SDValue N, unsigned Size, SDValue &Base,
                               SDValue &OffImm);
  bool SelectAddrModeIndexed(SDValue N, unsigned Size, SDValue &Base,
                             SDValue &OffImm);
  bool SelectAddrModeUnscaled(SDValue N, unsigned Size, SDValue &Base,

@@ -606,6 +623,22 @@ static bool isWorthFoldingADDlow(SDValue N) {
  return true;
}

/// SelectAddrModeIndexed7S - Select a "register plus scaled signed 7-bit
/// immediate" address. The "Size" argument is the size in bytes of the memory
/// reference, which determines the scale.
bool AArch64DAGToDAGISel::SelectAddrModeIndexed7S(SDValue N, unsigned Size,
                                                  SDValue &Base,
                                                  SDValue &OffImm) {
  SDLoc dl(N);
  // Base only. The address will be materialized into a register before
  // the memory is accessed.
  //    add x0, Xbase, #offset
  //    stp x1, x2, [x0]
  Base = N;
  OffImm = CurDAG->getTargetConstant(0, dl, MVT::i64);
  return true;
}

/// SelectAddrModeIndexed - Select a "register plus scaled unsigned 12-bit
/// immediate" address. The "Size" argument is the size in bytes of the memory
/// reference, which determines the scale.
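A note on the new addressing mode: the LDP/STP/STNP immediate is a signed 7-bit field scaled by the access size, so the reachable byte offsets are -64*Size to +63*Size in steps of Size (for example -512 to +504 in steps of 8 for the 8-byte-scaled form). This commit deliberately matches only the base; the sketch below is a hypothetical illustration of what the smarter offset matching mentioned in the commit message could look like, and is not part of this patch:

// Hypothetical sketch only (NOT part of this commit): also fold a suitable
// constant offset into the imm7 field instead of always returning offset 0.
bool AArch64DAGToDAGISel::SelectAddrModeIndexed7S(SDValue N, unsigned Size,
                                                  SDValue &Base,
                                                  SDValue &OffImm) {
  SDLoc dl(N);
  if (CurDAG->isBaseWithConstantOffset(N))
    if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
      int64_t RHSC = RHS->getSExtValue();
      // The offset must be a multiple of the access size and, once scaled,
      // fit in a signed 7-bit field: [-64 * Size, 63 * Size].
      if ((RHSC % (int64_t)Size) == 0 && RHSC >= -64 * (int64_t)Size &&
          RHSC <= 63 * (int64_t)Size) {
        Base = N.getOperand(0);
        OffImm = CurDAG->getTargetConstant(RHSC / (int64_t)Size, dl, MVT::i64);
        return true;
      }
    }
  // Fall back to base-only, exactly as the code in this commit does.
  Base = N;
  OffImm = CurDAG->getTargetConstant(0, dl, MVT::i64);
  return true;
}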
lib/Target/AArch64/AArch64InstrFormats.td

@@ -248,6 +248,12 @@ def simm7s16 : Operand<i32> {
  let PrintMethod = "printImmScale<16>";
}

def am_indexed7s8   : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S8", []>;
def am_indexed7s16  : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S16", []>;
def am_indexed7s32  : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S32", []>;
def am_indexed7s64  : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S64", []>;
def am_indexed7s128 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S128", []>;

class AsmImmRange<int Low, int High> : AsmOperandClass {
  let Name = "Imm" # Low # "_" # High;
  let DiagnosticType = "InvalidImm" # Low # "_" # High;
lib/Target/AArch64/AArch64InstrInfo.td

@@ -5825,6 +5825,45 @@ def : Pat<(i64 (int_aarch64_neon_srshl (i64 FPR64:$Rn), (i64 FPR64:$Rm))),
def : Pat<(i64 (int_aarch64_neon_urshl (i64 FPR64:$Rn), (i64 FPR64:$Rm))),
          (URSHLv1i64 FPR64:$Rn, FPR64:$Rm)>;

// Patterns for nontemporal/no-allocate stores.
// We have to resort to tricks to turn a single-input store into a store pair,
// because there is no single-input nontemporal store, only STNP.
let Predicates = [IsLE] in {
let AddedComplexity = 15 in {
class NTStore128Pat<ValueType VT> :
  Pat<(nontemporalstore (VT FPR128:$Rt),
        (am_indexed7s64 GPR64sp:$Rn, simm7s8:$offset)),
      (STNPDi (EXTRACT_SUBREG FPR128:$Rt, dsub),
              (CPYi64 FPR128:$Rt, (i64 1)),
              GPR64sp:$Rn, simm7s8:$offset)>;

def : NTStore128Pat<v2i64>;
def : NTStore128Pat<v4i32>;
def : NTStore128Pat<v8i16>;
def : NTStore128Pat<v16i8>;

class NTStore64Pat<ValueType VT> :
  Pat<(nontemporalstore (VT FPR64:$Rt),
        (am_indexed7s32 GPR64sp:$Rn, simm7s4:$offset)),
      (STNPSi (EXTRACT_SUBREG FPR64:$Rt, ssub),
              (CPYi32 (SUBREG_TO_REG (i64 0), FPR64:$Rt, dsub), (i64 1)),
              GPR64sp:$Rn, simm7s4:$offset)>;

// FIXME: Shouldn't v1f64 loads/stores be promoted to v1i64?
def : NTStore64Pat<v1f64>;
def : NTStore64Pat<v1i64>;
def : NTStore64Pat<v2i32>;
def : NTStore64Pat<v4i16>;
def : NTStore64Pat<v8i8>;

def : Pat<(nontemporalstore GPR64:$Rt,
            (am_indexed7s32 GPR64sp:$Rn, simm7s4:$offset)),
          (STNPWi (EXTRACT_SUBREG GPR64:$Rt, sub_32),
                  (EXTRACT_SUBREG (UBFMXri GPR64:$Rt, 32, 63), sub_32),
                  GPR64sp:$Rn, simm7s4:$offset)>;
} // AddedComplexity = 15
} // Predicates = [IsLE]

// Tail call return handling. These are all compiler pseudo-instructions,
// so no encoding information or anything like that.
let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [SP] in {
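To spell out the trickery: NTStore128Pat stores a 128-bit value as an STNP of the Q register's two 64-bit halves (EXTRACT_SUBREG dsub for the low half, CPYi64 of element 1 for the high half), NTStore64Pat does the same at S-register granularity, and the last pattern handles a plain 64-bit GPR store by shifting the high 32 bits down and emitting a W-register STNP. With illustrative register numbers (the new test below checks the same sequence via FileCheck patterns), the GPR case comes out as:

  lsr  x8, x1, #32     ; x8 = high 32 bits of the stored value
  stnp w1, w8, [x0]    ; low half at [x0], high half at [x0, #4]
  ret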
test/CodeGen/AArch64/nontemporal.ll (new file, 192 lines)

@@ -0,0 +1,192 @@
; RUN: llc < %s -mtriple aarch64-apple-darwin -asm-verbose=false | FileCheck %s

define void @test_stnp_v4i64(<4 x i64>* %p, <4 x i64> %v) #0 {
; CHECK-LABEL: test_stnp_v4i64:
; CHECK-NEXT:  add x[[PTR:[0-9]+]], x0, #16
; CHECK-NEXT:  mov d[[HI1:[0-9]+]], v1[1]
; CHECK-NEXT:  mov d[[HI0:[0-9]+]], v0[1]
; CHECK-NEXT:  stnp d1, d[[HI1]], [x[[PTR]]]
; CHECK-NEXT:  stnp d0, d[[HI0]], [x0]
; CHECK-NEXT:  ret
  store <4 x i64> %v, <4 x i64>* %p, align 1, !nontemporal !0
  ret void
}

define void @test_stnp_v4i32(<4 x i32>* %p, <4 x i32> %v) #0 {
; CHECK-LABEL: test_stnp_v4i32:
; CHECK-NEXT:  mov d[[HI:[0-9]+]], v0[1]
; CHECK-NEXT:  stnp d0, d[[HI]], [x0]
; CHECK-NEXT:  ret
  store <4 x i32> %v, <4 x i32>* %p, align 1, !nontemporal !0
  ret void
}

define void @test_stnp_v8i16(<8 x i16>* %p, <8 x i16> %v) #0 {
; CHECK-LABEL: test_stnp_v8i16:
; CHECK-NEXT:  mov d[[HI:[0-9]+]], v0[1]
; CHECK-NEXT:  stnp d0, d[[HI]], [x0]
; CHECK-NEXT:  ret
  store <8 x i16> %v, <8 x i16>* %p, align 1, !nontemporal !0
  ret void
}

define void @test_stnp_v16i8(<16 x i8>* %p, <16 x i8> %v) #0 {
; CHECK-LABEL: test_stnp_v16i8:
; CHECK-NEXT:  mov d[[HI:[0-9]+]], v0[1]
; CHECK-NEXT:  stnp d0, d[[HI]], [x0]
; CHECK-NEXT:  ret
  store <16 x i8> %v, <16 x i8>* %p, align 1, !nontemporal !0
  ret void
}

define void @test_stnp_v2i32(<2 x i32>* %p, <2 x i32> %v) #0 {
; CHECK-LABEL: test_stnp_v2i32:
; CHECK-NEXT:  mov s[[HI:[0-9]+]], v0[1]
; CHECK-NEXT:  stnp s0, s[[HI]], [x0]
; CHECK-NEXT:  ret
  store <2 x i32> %v, <2 x i32>* %p, align 1, !nontemporal !0
  ret void
}

define void @test_stnp_v4i16(<4 x i16>* %p, <4 x i16> %v) #0 {
; CHECK-LABEL: test_stnp_v4i16:
; CHECK-NEXT:  mov s[[HI:[0-9]+]], v0[1]
; CHECK-NEXT:  stnp s0, s[[HI]], [x0]
; CHECK-NEXT:  ret
  store <4 x i16> %v, <4 x i16>* %p, align 1, !nontemporal !0
  ret void
}

define void @test_stnp_v8i8(<8 x i8>* %p, <8 x i8> %v) #0 {
; CHECK-LABEL: test_stnp_v8i8:
; CHECK-NEXT:  mov s[[HI:[0-9]+]], v0[1]
; CHECK-NEXT:  stnp s0, s[[HI]], [x0]
; CHECK-NEXT:  ret
  store <8 x i8> %v, <8 x i8>* %p, align 1, !nontemporal !0
  ret void
}

define void @test_stnp_v2f64(<2 x double>* %p, <2 x double> %v) #0 {
; CHECK-LABEL: test_stnp_v2f64:
; CHECK-NEXT:  mov d[[HI:[0-9]+]], v0[1]
; CHECK-NEXT:  stnp d0, d[[HI]], [x0]
; CHECK-NEXT:  ret
  store <2 x double> %v, <2 x double>* %p, align 1, !nontemporal !0
  ret void
}

define void @test_stnp_v4f32(<4 x float>* %p, <4 x float> %v) #0 {
; CHECK-LABEL: test_stnp_v4f32:
; CHECK-NEXT:  mov d[[HI:[0-9]+]], v0[1]
; CHECK-NEXT:  stnp d0, d[[HI]], [x0]
; CHECK-NEXT:  ret
  store <4 x float> %v, <4 x float>* %p, align 1, !nontemporal !0
  ret void
}

define void @test_stnp_v2f32(<2 x float>* %p, <2 x float> %v) #0 {
; CHECK-LABEL: test_stnp_v2f32:
; CHECK-NEXT:  mov s[[HI:[0-9]+]], v0[1]
; CHECK-NEXT:  stnp s0, s[[HI]], [x0]
; CHECK-NEXT:  ret
  store <2 x float> %v, <2 x float>* %p, align 1, !nontemporal !0
  ret void
}

define void @test_stnp_v1f64(<1 x double>* %p, <1 x double> %v) #0 {
; CHECK-LABEL: test_stnp_v1f64:
; CHECK-NEXT:  mov s[[HI:[0-9]+]], v0[1]
; CHECK-NEXT:  stnp s0, s[[HI]], [x0]
; CHECK-NEXT:  ret
  store <1 x double> %v, <1 x double>* %p, align 1, !nontemporal !0
  ret void
}

define void @test_stnp_v1i64(<1 x i64>* %p, <1 x i64> %v) #0 {
; CHECK-LABEL: test_stnp_v1i64:
; CHECK-NEXT:  mov s[[HI:[0-9]+]], v0[1]
; CHECK-NEXT:  stnp s0, s[[HI]], [x0]
; CHECK-NEXT:  ret
  store <1 x i64> %v, <1 x i64>* %p, align 1, !nontemporal !0
  ret void
}

define void @test_stnp_i64(i64* %p, i64 %v) #0 {
; CHECK-LABEL: test_stnp_i64:
; CHECK-NEXT:  lsr x[[HI:[0-9]+]], x1, #32
; CHECK-NEXT:  stnp w1, w[[HI]], [x0]
; CHECK-NEXT:  ret
  store i64 %v, i64* %p, align 1, !nontemporal !0
  ret void
}


define void @test_stnp_v2f64_offset(<2 x double>* %p, <2 x double> %v) #0 {
; CHECK-LABEL: test_stnp_v2f64_offset:
; CHECK-NEXT:  add x[[PTR:[0-9]+]], x0, #16
; CHECK-NEXT:  mov d[[HI:[0-9]+]], v0[1]
; CHECK-NEXT:  stnp d0, d[[HI]], [x[[PTR]]]
; CHECK-NEXT:  ret
  %tmp0 = getelementptr <2 x double>, <2 x double>* %p, i32 1
  store <2 x double> %v, <2 x double>* %tmp0, align 1, !nontemporal !0
  ret void
}

define void @test_stnp_v2f64_offset_neg(<2 x double>* %p, <2 x double> %v) #0 {
; CHECK-LABEL: test_stnp_v2f64_offset_neg:
; CHECK-NEXT:  sub x[[PTR:[0-9]+]], x0, #16
; CHECK-NEXT:  mov d[[HI:[0-9]+]], v0[1]
; CHECK-NEXT:  stnp d0, d[[HI]], [x[[PTR]]]
; CHECK-NEXT:  ret
  %tmp0 = getelementptr <2 x double>, <2 x double>* %p, i32 -1
  store <2 x double> %v, <2 x double>* %tmp0, align 1, !nontemporal !0
  ret void
}

define void @test_stnp_v2f32_offset(<2 x float>* %p, <2 x float> %v) #0 {
; CHECK-LABEL: test_stnp_v2f32_offset:
; CHECK-NEXT:  add x[[PTR:[0-9]+]], x0, #8
; CHECK-NEXT:  mov s[[HI:[0-9]+]], v0[1]
; CHECK-NEXT:  stnp s0, s[[HI]], [x[[PTR]]]
; CHECK-NEXT:  ret
  %tmp0 = getelementptr <2 x float>, <2 x float>* %p, i32 1
  store <2 x float> %v, <2 x float>* %tmp0, align 1, !nontemporal !0
  ret void
}

define void @test_stnp_v2f32_offset_neg(<2 x float>* %p, <2 x float> %v) #0 {
; CHECK-LABEL: test_stnp_v2f32_offset_neg:
; CHECK-NEXT:  sub x[[PTR:[0-9]+]], x0, #8
; CHECK-NEXT:  mov s[[HI:[0-9]+]], v0[1]
; CHECK-NEXT:  stnp s0, s[[HI]], [x[[PTR]]]
; CHECK-NEXT:  ret
  %tmp0 = getelementptr <2 x float>, <2 x float>* %p, i32 -1
  store <2 x float> %v, <2 x float>* %tmp0, align 1, !nontemporal !0
  ret void
}

define void @test_stnp_i64_offset(i64* %p, i64 %v) #0 {
; CHECK-LABEL: test_stnp_i64_offset:
; CHECK-NEXT:  add x[[PTR:[0-9]+]], x0, #8
; CHECK-NEXT:  lsr x[[HI:[0-9]+]], x1, #32
; CHECK-NEXT:  stnp w1, w[[HI]], [x[[PTR]]]
; CHECK-NEXT:  ret
  %tmp0 = getelementptr i64, i64* %p, i32 1
  store i64 %v, i64* %tmp0, align 1, !nontemporal !0
  ret void
}

define void @test_stnp_i64_offset_neg(i64* %p, i64 %v) #0 {
; CHECK-LABEL: test_stnp_i64_offset_neg:
; CHECK-NEXT:  sub x[[PTR:[0-9]+]], x0, #8
; CHECK-NEXT:  lsr x[[HI:[0-9]+]], x1, #32
; CHECK-NEXT:  stnp w1, w[[HI]], [x[[PTR]]]
; CHECK-NEXT:  ret
  %tmp0 = getelementptr i64, i64* %p, i32 -1
  store i64 %v, i64* %tmp0, align 1, !nontemporal !0
  ret void
}

!0 = !{ i32 1 }

attributes #0 = { nounwind }