[AArch64] Support selecting STNP.

We could go through the load/store optimizer and match STNP where
we would have matched a nontemporal-annotated STP, but as an
opportunistic optimization that isn't reliable enough.
Instead, we can guarantee emitting STNP by matching it at ISel.
Since there are no single-input nontemporal stores, we have to
resort to some high-bits-extracting trickery to generate an STNP
from a plain store.
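For example, here is the scalar i64 case (it also appears in the test
added below): the value is stored as two 32-bit halves, with the high
half produced by a 32-bit right shift. Register numbers are only
illustrative.

  define void @test_stnp_i64(i64* %p, i64 %v) {
    store i64 %v, i64* %p, align 1, !nontemporal !0
    ret void
  }
  !0 = !{ i32 1 }

  ; selected roughly as:
  ;   lsr  x8, x1, #32     ; high 32 bits of the value
  ;   stnp w1, w8, [x0]    ; nontemporal store-pair of low/high halves
  ;   ret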

Also, we need to support another, LDP/STP-specific addressing mode:
base + scaled signed 7-bit immediate offset.
For now, only match the base; making the matcher smarter is left for
a separate change.
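Concretely, with base-only matching a constant offset is currently
materialized with a separate add, and the STNP itself uses offset #0
(this mirrors test_stnp_v2f64_offset below; registers illustrative):

  %tmp0 = getelementptr <2 x double>, <2 x double>* %p, i32 1
  store <2 x double> %v, <2 x double>* %tmp0, align 1, !nontemporal !0

  ; currently selected roughly as:
  ;   add  x8, x0, #16      ; base + offset computed up front
  ;   mov  d1, v0[1]        ; high 64 bits of the vector
  ;   stnp d0, d1, [x8]
  ; a smarter matcher could fold the offset into the STNP itself:
  ;   stnp d0, d1, [x0, #16]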

Part of PR24086.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@247231 91177308-0d34-0410-b5e6-96231b3b80d8
Ahmed Bougacha 2015-09-10 01:42:28 +00:00
parent 3a2cec85a7
commit d636e64cbc
4 changed files with 270 additions and 0 deletions

lib/Target/AArch64/AArch64ISelDAGToDAG.cpp

@@ -77,6 +77,21 @@ public:
bool SelectLogicalShiftedRegister(SDValue N, SDValue &Reg, SDValue &Shift) {
return SelectShiftedRegister(N, true, Reg, Shift);
}
bool SelectAddrModeIndexed7S8(SDValue N, SDValue &Base, SDValue &OffImm) {
return SelectAddrModeIndexed7S(N, 1, Base, OffImm);
}
bool SelectAddrModeIndexed7S16(SDValue N, SDValue &Base, SDValue &OffImm) {
return SelectAddrModeIndexed7S(N, 2, Base, OffImm);
}
bool SelectAddrModeIndexed7S32(SDValue N, SDValue &Base, SDValue &OffImm) {
return SelectAddrModeIndexed7S(N, 4, Base, OffImm);
}
bool SelectAddrModeIndexed7S64(SDValue N, SDValue &Base, SDValue &OffImm) {
return SelectAddrModeIndexed7S(N, 8, Base, OffImm);
}
bool SelectAddrModeIndexed7S128(SDValue N, SDValue &Base, SDValue &OffImm) {
return SelectAddrModeIndexed7S(N, 16, Base, OffImm);
}
bool SelectAddrModeIndexed8(SDValue N, SDValue &Base, SDValue &OffImm) {
return SelectAddrModeIndexed(N, 1, Base, OffImm);
}
@@ -164,6 +179,8 @@ public:
private:
bool SelectShiftedRegister(SDValue N, bool AllowROR, SDValue &Reg,
SDValue &Shift);
bool SelectAddrModeIndexed7S(SDValue N, unsigned Size, SDValue &Base,
SDValue &OffImm);
bool SelectAddrModeIndexed(SDValue N, unsigned Size, SDValue &Base,
SDValue &OffImm);
bool SelectAddrModeUnscaled(SDValue N, unsigned Size, SDValue &Base,
@@ -606,6 +623,22 @@ static bool isWorthFoldingADDlow(SDValue N) {
return true;
}
/// SelectAddrModeIndexed7S - Select a "register plus scaled signed 7-bit
/// immediate" address. The "Size" argument is the size in bytes of the memory
/// reference, which determines the scale.
bool AArch64DAGToDAGISel::SelectAddrModeIndexed7S(SDValue N, unsigned Size,
SDValue &Base,
SDValue &OffImm) {
SDLoc dl(N);
// Base only. The address will be materialized into a register before
// the memory is accessed.
// add x0, Xbase, #offset
// stp x1, x2, [x0]
Base = N;
OffImm = CurDAG->getTargetConstant(0, dl, MVT::i64);
return true;
}
/// SelectAddrModeIndexed - Select a "register plus scaled unsigned 12-bit
/// immediate" address. The "Size" argument is the size in bytes of the memory
/// reference, which determines the scale.

lib/Target/AArch64/AArch64InstrFormats.td

@@ -248,6 +248,12 @@ def simm7s16 : Operand<i32> {
let PrintMethod = "printImmScale<16>";
}
def am_indexed7s8 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S8", []>;
def am_indexed7s16 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S16", []>;
def am_indexed7s32 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S32", []>;
def am_indexed7s64 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S64", []>;
def am_indexed7s128 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S128", []>;
class AsmImmRange<int Low, int High> : AsmOperandClass {
let Name = "Imm" # Low # "_" # High;
let DiagnosticType = "InvalidImm" # Low # "_" # High;

lib/Target/AArch64/AArch64InstrInfo.td

@@ -5825,6 +5825,45 @@ def : Pat<(i64 (int_aarch64_neon_srshl (i64 FPR64:$Rn), (i64 FPR64:$Rm))),
def : Pat<(i64 (int_aarch64_neon_urshl (i64 FPR64:$Rn), (i64 FPR64:$Rm))),
(URSHLv1i64 FPR64:$Rn, FPR64:$Rm)>;
// Patterns for nontemporal/no-allocate stores.
// We have to resort to tricks to turn a single-input store into a store pair,
// because there is no single-input nontemporal store, only STNP.
let Predicates = [IsLE] in {
let AddedComplexity = 15 in {
class NTStore128Pat<ValueType VT> :
Pat<(nontemporalstore (VT FPR128:$Rt),
(am_indexed7s64 GPR64sp:$Rn, simm7s8:$offset)),
(STNPDi (EXTRACT_SUBREG FPR128:$Rt, dsub),
(CPYi64 FPR128:$Rt, (i64 1)),
GPR64sp:$Rn, simm7s8:$offset)>;
def : NTStore128Pat<v2i64>;
def : NTStore128Pat<v4i32>;
def : NTStore128Pat<v8i16>;
def : NTStore128Pat<v16i8>;
class NTStore64Pat<ValueType VT> :
Pat<(nontemporalstore (VT FPR64:$Rt),
(am_indexed7s32 GPR64sp:$Rn, simm7s4:$offset)),
(STNPSi (EXTRACT_SUBREG FPR64:$Rt, ssub),
(CPYi32 (SUBREG_TO_REG (i64 0), FPR64:$Rt, dsub), (i64 1)),
GPR64sp:$Rn, simm7s4:$offset)>;
// FIXME: Shouldn't v1f64 loads/stores be promoted to v1i64?
def : NTStore64Pat<v1f64>;
def : NTStore64Pat<v1i64>;
def : NTStore64Pat<v2i32>;
def : NTStore64Pat<v4i16>;
def : NTStore64Pat<v8i8>;
def : Pat<(nontemporalstore GPR64:$Rt,
(am_indexed7s32 GPR64sp:$Rn, simm7s4:$offset)),
(STNPWi (EXTRACT_SUBREG GPR64:$Rt, sub_32),
(EXTRACT_SUBREG (UBFMXri GPR64:$Rt, 32, 63), sub_32),
GPR64sp:$Rn, simm7s4:$offset)>;
} // AddedComplexity = 15
} // Predicates = [IsLE]
// Tail call return handling. These are all compiler pseudo-instructions,
// so no encoding information or anything like that.
let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [SP] in {

test/CodeGen/AArch64/nontemporal.ll

@@ -0,0 +1,192 @@
; RUN: llc < %s -mtriple aarch64-apple-darwin -asm-verbose=false | FileCheck %s
define void @test_stnp_v4i64(<4 x i64>* %p, <4 x i64> %v) #0 {
; CHECK-LABEL: test_stnp_v4i64:
; CHECK-NEXT: add x[[PTR:[0-9]+]], x0, #16
; CHECK-NEXT: mov d[[HI1:[0-9]+]], v1[1]
; CHECK-NEXT: mov d[[HI0:[0-9]+]], v0[1]
; CHECK-NEXT: stnp d1, d[[HI1]], [x[[PTR]]]
; CHECK-NEXT: stnp d0, d[[HI0]], [x0]
; CHECK-NEXT: ret
store <4 x i64> %v, <4 x i64>* %p, align 1, !nontemporal !0
ret void
}
define void @test_stnp_v4i32(<4 x i32>* %p, <4 x i32> %v) #0 {
; CHECK-LABEL: test_stnp_v4i32:
; CHECK-NEXT: mov d[[HI:[0-9]+]], v0[1]
; CHECK-NEXT: stnp d0, d[[HI]], [x0]
; CHECK-NEXT: ret
store <4 x i32> %v, <4 x i32>* %p, align 1, !nontemporal !0
ret void
}
define void @test_stnp_v8i16(<8 x i16>* %p, <8 x i16> %v) #0 {
; CHECK-LABEL: test_stnp_v8i16:
; CHECK-NEXT: mov d[[HI:[0-9]+]], v0[1]
; CHECK-NEXT: stnp d0, d[[HI]], [x0]
; CHECK-NEXT: ret
store <8 x i16> %v, <8 x i16>* %p, align 1, !nontemporal !0
ret void
}
define void @test_stnp_v16i8(<16 x i8>* %p, <16 x i8> %v) #0 {
; CHECK-LABEL: test_stnp_v16i8:
; CHECK-NEXT: mov d[[HI:[0-9]+]], v0[1]
; CHECK-NEXT: stnp d0, d[[HI]], [x0]
; CHECK-NEXT: ret
store <16 x i8> %v, <16 x i8>* %p, align 1, !nontemporal !0
ret void
}
define void @test_stnp_v2i32(<2 x i32>* %p, <2 x i32> %v) #0 {
; CHECK-LABEL: test_stnp_v2i32:
; CHECK-NEXT: mov s[[HI:[0-9]+]], v0[1]
; CHECK-NEXT: stnp s0, s[[HI]], [x0]
; CHECK-NEXT: ret
store <2 x i32> %v, <2 x i32>* %p, align 1, !nontemporal !0
ret void
}
define void @test_stnp_v4i16(<4 x i16>* %p, <4 x i16> %v) #0 {
; CHECK-LABEL: test_stnp_v4i16:
; CHECK-NEXT: mov s[[HI:[0-9]+]], v0[1]
; CHECK-NEXT: stnp s0, s[[HI]], [x0]
; CHECK-NEXT: ret
store <4 x i16> %v, <4 x i16>* %p, align 1, !nontemporal !0
ret void
}
define void @test_stnp_v8i8(<8 x i8>* %p, <8 x i8> %v) #0 {
; CHECK-LABEL: test_stnp_v8i8:
; CHECK-NEXT: mov s[[HI:[0-9]+]], v0[1]
; CHECK-NEXT: stnp s0, s[[HI]], [x0]
; CHECK-NEXT: ret
store <8 x i8> %v, <8 x i8>* %p, align 1, !nontemporal !0
ret void
}
define void @test_stnp_v2f64(<2 x double>* %p, <2 x double> %v) #0 {
; CHECK-LABEL: test_stnp_v2f64:
; CHECK-NEXT: mov d[[HI:[0-9]+]], v0[1]
; CHECK-NEXT: stnp d0, d[[HI]], [x0]
; CHECK-NEXT: ret
store <2 x double> %v, <2 x double>* %p, align 1, !nontemporal !0
ret void
}
define void @test_stnp_v4f32(<4 x float>* %p, <4 x float> %v) #0 {
; CHECK-LABEL: test_stnp_v4f32:
; CHECK-NEXT: mov d[[HI:[0-9]+]], v0[1]
; CHECK-NEXT: stnp d0, d[[HI]], [x0]
; CHECK-NEXT: ret
store <4 x float> %v, <4 x float>* %p, align 1, !nontemporal !0
ret void
}
define void @test_stnp_v2f32(<2 x float>* %p, <2 x float> %v) #0 {
; CHECK-LABEL: test_stnp_v2f32:
; CHECK-NEXT: mov s[[HI:[0-9]+]], v0[1]
; CHECK-NEXT: stnp s0, s[[HI]], [x0]
; CHECK-NEXT: ret
store <2 x float> %v, <2 x float>* %p, align 1, !nontemporal !0
ret void
}
define void @test_stnp_v1f64(<1 x double>* %p, <1 x double> %v) #0 {
; CHECK-LABEL: test_stnp_v1f64:
; CHECK-NEXT: mov s[[HI:[0-9]+]], v0[1]
; CHECK-NEXT: stnp s0, s[[HI]], [x0]
; CHECK-NEXT: ret
store <1 x double> %v, <1 x double>* %p, align 1, !nontemporal !0
ret void
}
define void @test_stnp_v1i64(<1 x i64>* %p, <1 x i64> %v) #0 {
; CHECK-LABEL: test_stnp_v1i64:
; CHECK-NEXT: mov s[[HI:[0-9]+]], v0[1]
; CHECK-NEXT: stnp s0, s[[HI]], [x0]
; CHECK-NEXT: ret
store <1 x i64> %v, <1 x i64>* %p, align 1, !nontemporal !0
ret void
}
define void @test_stnp_i64(i64* %p, i64 %v) #0 {
; CHECK-LABEL: test_stnp_i64:
; CHECK-NEXT: lsr x[[HI:[0-9]+]], x1, #32
; CHECK-NEXT: stnp w1, w[[HI]], [x0]
; CHECK-NEXT: ret
store i64 %v, i64* %p, align 1, !nontemporal !0
ret void
}
define void @test_stnp_v2f64_offset(<2 x double>* %p, <2 x double> %v) #0 {
; CHECK-LABEL: test_stnp_v2f64_offset:
; CHECK-NEXT: add x[[PTR:[0-9]+]], x0, #16
; CHECK-NEXT: mov d[[HI:[0-9]+]], v0[1]
; CHECK-NEXT: stnp d0, d[[HI]], [x[[PTR]]]
; CHECK-NEXT: ret
%tmp0 = getelementptr <2 x double>, <2 x double>* %p, i32 1
store <2 x double> %v, <2 x double>* %tmp0, align 1, !nontemporal !0
ret void
}
define void @test_stnp_v2f64_offset_neg(<2 x double>* %p, <2 x double> %v) #0 {
; CHECK-LABEL: test_stnp_v2f64_offset_neg:
; CHECK-NEXT: sub x[[PTR:[0-9]+]], x0, #16
; CHECK-NEXT: mov d[[HI:[0-9]+]], v0[1]
; CHECK-NEXT: stnp d0, d[[HI]], [x[[PTR]]]
; CHECK-NEXT: ret
%tmp0 = getelementptr <2 x double>, <2 x double>* %p, i32 -1
store <2 x double> %v, <2 x double>* %tmp0, align 1, !nontemporal !0
ret void
}
define void @test_stnp_v2f32_offset(<2 x float>* %p, <2 x float> %v) #0 {
; CHECK-LABEL: test_stnp_v2f32_offset:
; CHECK-NEXT: add x[[PTR:[0-9]+]], x0, #8
; CHECK-NEXT: mov s[[HI:[0-9]+]], v0[1]
; CHECK-NEXT: stnp s0, s[[HI]], [x[[PTR]]]
; CHECK-NEXT: ret
%tmp0 = getelementptr <2 x float>, <2 x float>* %p, i32 1
store <2 x float> %v, <2 x float>* %tmp0, align 1, !nontemporal !0
ret void
}
define void @test_stnp_v2f32_offset_neg(<2 x float>* %p, <2 x float> %v) #0 {
; CHECK-LABEL: test_stnp_v2f32_offset_neg:
; CHECK-NEXT: sub x[[PTR:[0-9]+]], x0, #8
; CHECK-NEXT: mov s[[HI:[0-9]+]], v0[1]
; CHECK-NEXT: stnp s0, s[[HI]], [x[[PTR]]]
; CHECK-NEXT: ret
%tmp0 = getelementptr <2 x float>, <2 x float>* %p, i32 -1
store <2 x float> %v, <2 x float>* %tmp0, align 1, !nontemporal !0
ret void
}
define void @test_stnp_i64_offset(i64* %p, i64 %v) #0 {
; CHECK-LABEL: test_stnp_i64_offset:
; CHECK-NEXT: add x[[PTR:[0-9]+]], x0, #8
; CHECK-NEXT: lsr x[[HI:[0-9]+]], x1, #32
; CHECK-NEXT: stnp w1, w[[HI]], [x[[PTR]]]
; CHECK-NEXT: ret
%tmp0 = getelementptr i64, i64* %p, i32 1
store i64 %v, i64* %tmp0, align 1, !nontemporal !0
ret void
}
define void @test_stnp_i64_offset_neg(i64* %p, i64 %v) #0 {
; CHECK-LABEL: test_stnp_i64_offset_neg:
; CHECK-NEXT: sub x[[PTR:[0-9]+]], x0, #8
; CHECK-NEXT: lsr x[[HI:[0-9]+]], x1, #32
; CHECK-NEXT: stnp w1, w[[HI]], [x[[PTR]]]
; CHECK-NEXT: ret
%tmp0 = getelementptr i64, i64* %p, i32 -1
store i64 %v, i64* %tmp0, align 1, !nontemporal !0
ret void
}
!0 = !{ i32 1 }
attributes #0 = { nounwind }