[AArch64] Split zero cycle feature more granularly

Split the `zcz` feature into specific ones for GP and FP registers, `zcz-gp` and `zcz-fp`, respectively, while retaining the original feature option to mean both. Differential revision: https://reviews.llvm.org/D52621 llvm-svn: 343354
2024-12-09 04:24:10 +00:00 · 2018-09-28 19:05:09 +00:00 · 2018-09-28 19:05:09 +00:00 · fdd7b1d490
commit fdd7b1d490
parent 882252baeb
6 changed files with 200 additions and 54 deletions
--- a/lib/Target/AArch64/AArch64.td
+++ b/lib/Target/AArch64/AArch64.td
@ -80,13 +80,17 @@ def FeatureSPE : SubtargetFeature<"spe", "HasSPE", "true",
 def FeatureSVE : SubtargetFeature<"sve", "HasSVE", "true",
  "Enable Scalable Vector Extension (SVE) instructions">;

-/// Cyclone has register move instructions which are "free".
 def FeatureZCRegMove : SubtargetFeature<"zcm", "HasZeroCycleRegMove", "true",
                                        "Has zero-cycle register moves">;
+def FeatureZCZeroingGP : SubtargetFeature<"zcz-gp", "HasZeroCycleZeroingGP", "true",
+                                        "Has zero-cycle zeroing instructions for generic registers">;
+
+def FeatureZCZeroingFP : SubtargetFeature<"zcz-fp", "HasZeroCycleZeroingFP", "true",
+                                        "Has zero-cycle zeroing instructions for FP registers">;

-/// Cyclone has instructions which zero registers for "free".
 def FeatureZCZeroing : SubtargetFeature<"zcz", "HasZeroCycleZeroing", "true",
-                                        "Has zero-cycle zeroing instructions">;
+                                        "Has zero-cycle zeroing instructions",
+                                        [FeatureZCZeroingGP, FeatureZCZeroingFP]>;

 /// ... but the floating-point version doesn't quite work in rare cases on older
 /// CPUs.
@ -404,7 +408,7 @@ def ProcExynosM1 : SubtargetFeature<"exynosm1", "ARMProcFamily", "ExynosM1",
                                     FeaturePostRAScheduler,
                                     FeatureSlowMisaligned128Store,
                                     FeatureUseRSqrt,
-                                     FeatureZCZeroing]>;
+                                     FeatureZCZeroingFP]>;

 def ProcExynosM2 : SubtargetFeature<"exynosm2", "ARMProcFamily", "ExynosM1",
                                    "Samsung Exynos-M2 processors",
@ -418,7 +422,7 @@ def ProcExynosM2 : SubtargetFeature<"exynosm2", "ARMProcFamily", "ExynosM1",
                                     FeaturePerfMon,
                                     FeaturePostRAScheduler,
                                     FeatureSlowMisaligned128Store,
-                                     FeatureZCZeroing]>;
+                                     FeatureZCZeroingFP]>;

 def ProcExynosM3 : SubtargetFeature<"exynosm3", "ARMProcFamily", "ExynosM3",
                                    "Samsung Exynos-M3 processors",
@ -435,7 +439,7 @@ def ProcExynosM3 : SubtargetFeature<"exynosm3", "ARMProcFamily", "ExynosM3",
                                     FeaturePerfMon,
                                     FeaturePostRAScheduler,
                                     FeaturePredictableSelectIsExpensive,
-                                     FeatureZCZeroing]>;
+                                     FeatureZCZeroingFP]>;

 def ProcKryo    : SubtargetFeature<"kryo", "ARMProcFamily", "Kryo",
                                   "Qualcomm Kryo processors", [
--- a/lib/Target/AArch64/AArch64AsmPrinter.cpp
+++ b/lib/Target/AArch64/AArch64AsmPrinter.cpp
@ -503,7 +503,7 @@ void AArch64AsmPrinter::LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM,

 void AArch64AsmPrinter::EmitFMov0(const MachineInstr &MI) {
  unsigned DestReg = MI.getOperand(0).getReg();
-  if (STI->hasZeroCycleZeroing() && !STI->hasZeroCycleZeroingFPWorkaround()) {
+  if (STI->hasZeroCycleZeroingFP() && !STI->hasZeroCycleZeroingFPWorkaround()) {
    // Convert H/S/D register to corresponding Q register
    if (AArch64::H0 <= DestReg && DestReg <= AArch64::H31)
      DestReg = AArch64::Q0 + (DestReg - AArch64::H0);
--- a/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/lib/Target/AArch64/AArch64InstrInfo.cpp
@ -729,9 +729,9 @@ bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const {
  case AArch64::FMOVH0:
  case AArch64::FMOVS0:
  case AArch64::FMOVD0:
-    return Subtarget.hasZeroCycleZeroing();
+    return Subtarget.hasZeroCycleZeroingFP();
  case TargetOpcode::COPY:
-    return (Subtarget.hasZeroCycleZeroing() &&
+    return (Subtarget.hasZeroCycleZeroingGP() &&
            (MI.getOperand(1).getReg() == AArch64::WZR ||
             MI.getOperand(1).getReg() == AArch64::XZR));
  }
@ -2481,7 +2481,7 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
            .addImm(0)
            .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
      }
-    } else if (SrcReg == AArch64::WZR && Subtarget.hasZeroCycleZeroing()) {
+    } else if (SrcReg == AArch64::WZR && Subtarget.hasZeroCycleZeroingGP()) {
      BuildMI(MBB, I, DL, get(AArch64::MOVZWi), DestReg)
          .addImm(0)
          .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
@ -2518,7 +2518,7 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
          .addReg(SrcReg, getKillRegState(KillSrc))
          .addImm(0)
          .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
-    } else if (SrcReg == AArch64::XZR && Subtarget.hasZeroCycleZeroing()) {
+    } else if (SrcReg == AArch64::XZR && Subtarget.hasZeroCycleZeroingGP()) {
      BuildMI(MBB, I, DL, get(AArch64::MOVZXi), DestReg)
          .addImm(0)
          .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
--- a/lib/Target/AArch64/AArch64Subtarget.h
+++ b/lib/Target/AArch64/AArch64Subtarget.h
@ -109,6 +109,8 @@ protected:

  // HasZeroCycleZeroing - Has zero-cycle zeroing instructions.
  bool HasZeroCycleZeroing = false;
+  bool HasZeroCycleZeroingGP = false;
+  bool HasZeroCycleZeroingFP = false;
  bool HasZeroCycleZeroingFPWorkaround = false;

  // StrictAlign - Disallow unaligned memory accesses.
@ -228,7 +230,9 @@ public:

  bool hasZeroCycleRegMove() const { return HasZeroCycleRegMove; }

-  bool hasZeroCycleZeroing() const { return HasZeroCycleZeroing; }
+  bool hasZeroCycleZeroingGP() const { return HasZeroCycleZeroingGP; }
+
+  bool hasZeroCycleZeroingFP() const { return HasZeroCycleZeroingFP; }

  bool hasZeroCycleZeroingFPWorkaround() const {
    return HasZeroCycleZeroingFPWorkaround;
--- a/test/CodeGen/AArch64/arm64-zero-cycle-regmov.ll
+++ b/test/CodeGen/AArch64/arm64-zero-cycle-regmov.ll
@ -1,16 +1,20 @@
-; RUN: llc -mtriple=arm64-apple-ios -mcpu=cyclone < %s | FileCheck %s
-; rdar://12254953
+; RUN: llc < %s -mtriple=arm64-apple-ios -mattr=-zcm   | FileCheck %s -check-prefixes=CHECK,NOT
+; RUN: llc < %s -mtriple=arm64-apple-ios -mattr=+zcm   | FileCheck %s -check-prefixes=CHECK,YES
+; RUN: llc < %s -mtriple=arm64-apple-ios -mcpu=cyclone | FileCheck %s -check-prefixes=CHECK,YES

+; rdar://12254953
 define i32 @t(i32 %a, i32 %b, i32 %c, i32 %d) nounwind ssp {
 entry:
 ; CHECK-LABEL: t:
-; CHECK: mov [[REG2:x[0-9]+]], x3
-; CHECK: mov [[REG1:x[0-9]+]], x2
-; CHECK: mov x0, x2
-; CHECK: mov x1, x3
+; NOT: mov [[REG2:w[0-9]+]], w3
+; NOT: mov [[REG1:w[0-9]+]], w2
+; YES: mov [[REG2:x[0-9]+]], x3
+; YES: mov [[REG1:x[0-9]+]], x2
 ; CHECK: bl _foo
-; CHECK: mov x0, [[REG1]]
-; CHECK: mov x1, [[REG2]]
+; NOT: mov w0, [[REG1]]
+; NOT: mov w1, [[REG2]]
+; YES: mov x0, [[REG1]]
+; YES: mov x1, [[REG2]]
  %call = call i32 @foo(i32 %c, i32 %d) nounwind
  %call1 = call i32 @foo(i32 %c, i32 %d) nounwind
  unreachable
--- a/test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll
+++ b/test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll
@ -1,9 +1,14 @@
-; RUN: llc -mtriple=arm64-apple-ios   -mcpu=cyclone   < %s | FileCheck %s -check-prefixes=ALL,CYCLONE
-; RUN: llc -mtriple=arm64-apple-ios   -mcpu=cyclone -mattr=+fullfp16 < %s | FileCheck %s -check-prefixes=CYCLONE-FULLFP16
-; RUN: llc -mtriple=aarch64-gnu-linux -mcpu=exynos-m1 < %s | FileCheck %s -check-prefixes=ALL,OTHERS
-; RUN: llc -mtriple=aarch64-gnu-linux -mcpu=exynos-m3 < %s | FileCheck %s -check-prefixes=ALL,OTHERS
-; RUN: llc -mtriple=aarch64-gnu-linux -mcpu=kryo      < %s | FileCheck %s -check-prefixes=ALL,OTHERS
-; RUN: llc -mtriple=aarch64-gnu-linux -mcpu=falkor    < %s | FileCheck %s -check-prefixes=ALL,OTHERS
+; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=-zcz                    | FileCheck %s -check-prefixes=ALL,NONEGP,NONEFP
+; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+zcz                    | FileCheck %s -check-prefixes=ALL,ZEROGP,ZEROFP
+; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+zcz -mattr=+fullfp16   | FileCheck %s -check-prefixes=ALL,ZEROGP,ZERO16
+; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+zcz-gp                 | FileCheck %s -check-prefixes=ALL,ZEROGP,NONEFP
+; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+zcz-fp                 | FileCheck %s -check-prefixes=ALL,NONEGP,ZEROFP
+; RUN: llc < %s -mtriple=arm64-apple-ios   -mcpu=cyclone                  | FileCheck %s -check-prefixes=ALL,ZEROGP,NONEFP
+; RUN: llc < %s -mtriple=arm64-apple-ios   -mcpu=cyclone -mattr=+fullfp16 | FileCheck %s -check-prefixes=ALL,ZEROGP,NONE16
+; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=exynos-m1                | FileCheck %s -check-prefixes=ALL,NONEGP,ZEROFP
+; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=exynos-m3                | FileCheck %s -check-prefixes=ALL,NONEGP,ZEROFP
+; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=kryo                     | FileCheck %s -check-prefixes=ALL,ZEROGP,ZEROFP
+; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=falkor                   | FileCheck %s -check-prefixes=ALL,ZEROGP,ZEROFP

 declare void @bar(half, float, double, <2 x double>)
 declare void @bari(i32, i32)
@ -14,17 +19,22 @@ define void @t1() nounwind ssp {
 entry:
 ; ALL-LABEL: t1:
 ; ALL-NOT: fmov
-; ALL:     ldr h0,{{.*}}
-; CYCLONE: fmov s1, wzr
-; CYCLONE: fmov d2, xzr
-; CYCLONE: movi.16b v3, #0
-; CYCLONE-FULLFP16: fmov h0, wzr
-; CYCLONE-FULLFP16: fmov s1, wzr
-; CYCLONE-FULLFP16: fmov d2, xzr
-; CYCLONE-FULLFP16: movi.16b v3, #0
-; OTHERS: movi v{{[0-3]+}}.2d, #0000000000000000
-; OTHERS: movi v{{[0-3]+}}.2d, #0000000000000000
-; OTHERS: movi v{{[0-3]+}}.2d, #0000000000000000
+; NONEFP: ldr h0,{{.*}}
+; NONEFP: fmov s1, wzr
+; NONEFP: fmov d2, xzr
+; NONEFP: movi{{(.16b)?}} v3{{(.2d)?}}, #0
+; NONE16: fmov h0, wzr
+; NONE16: fmov s1, wzr
+; NONE16: fmov d2, xzr
+; NONE16: movi{{(.16b)?}} v3{{(.2d)?}}, #0
+; ZEROFP: ldr h0,{{.*}}
+; ZEROFP: movi v{{[0-3]+}}.2d, #0
+; ZEROFP: movi v{{[0-3]+}}.2d, #0
+; ZEROFP: movi v{{[0-3]+}}.2d, #0
+; ZERO16: movi v{{[0-3]+}}.2d, #0
+; ZERO16: movi v{{[0-3]+}}.2d, #0
+; ZERO16: movi v{{[0-3]+}}.2d, #0
+; ZERO16: movi v{{[0-3]+}}.2d, #0
  tail call void @bar(half 0.000000e+00, float 0.000000e+00, double 0.000000e+00, <2 x double> <double 0.000000e+00, double 0.000000e+00>) nounwind
  ret void
 }
@ -32,9 +42,10 @@ entry:
 define void @t2() nounwind ssp {
 entry:
 ; ALL-LABEL: t2:
-; ALL-NOT: mov w0, wzr
-; ALL: mov w{{[0-3]+}}, #0
-; ALL: mov w{{[0-3]+}}, #0
+; NONEGP: mov w0, wzr
+; NONEGP: mov w1, wzr
+; ZEROGP: mov w0, #0
+; ZEROGP: mov w1, #0
  tail call void @bari(i32 0, i32 0) nounwind
  ret void
 }
@ -42,26 +53,26 @@ entry:
 define void @t3() nounwind ssp {
 entry:
 ; ALL-LABEL: t3:
-; ALL-NOT: mov x0, xzr
-; ALL: mov x{{[0-3]+}}, #0
-; ALL: mov x{{[0-3]+}}, #0
+; NONEGP: mov x0, xzr
+; NONEGP: mov x1, xzr
+; ZEROGP: mov x0, #0
+; ZEROGP: mov x1, #0
  tail call void @barl(i64 0, i64 0) nounwind
  ret void
 }

 define void @t4() nounwind ssp {
 ; ALL-LABEL: t4:
-; ALL-NOT: fmov
-; CYCLONE: fmov s{{[0-3]+}}, wzr
-; CYCLONE: fmov s{{[0-3]+}}, wzr
-; CYCLONE-FULLFP16: fmov s{{[0-3]+}}, wzr
-; CYCLONE-FULLFP16: fmov s{{[0-3]+}}, wzr
-; OTHERS: movi v{{[0-3]+}}.2d, #0000000000000000
-; OTHERS: movi v{{[0-3]+}}.2d, #0000000000000000
+; NONEFP: fmov s{{[0-3]+}}, wzr
+; NONEFP: fmov s{{[0-3]+}}, wzr
+; ZEROFP: movi v{{[0-3]+}}.2d, #0
+; ZEROFP: movi v{{[0-3]+}}.2d, #0
  tail call void @barf(float 0.000000e+00, float 0.000000e+00) nounwind
  ret void
 }

+declare double @sin(double)
+
 ; We used to produce spills+reloads for a Q register with zero cycle zeroing
 ; enabled.
 ; ALL-LABEL: foo:
@ -88,10 +99,133 @@ for.end:

 define <2 x i64> @t6() {
 ; ALL-LABEL: t6:
-; CYCLONE: movi.16b v0, #0
-; OTHERS: movi v0.2d, #0000000000000000
- ret <2 x i64> zeroinitializer
+; ALL: movi{{(.16b)?}} v0{{(.2d)?}}, #0
+  ret <2 x i64> zeroinitializer
 }

+define i1 @ti1() {
+entry:
+; ALL-LABEL: ti1:
+; NONEGP: mov w0, wzr
+; ZEROGP: mov w0, #0
+  ret i1 false
+}
+
+define i8 @ti8() {
+entry:
+; ALL-LABEL: ti8:
+; NONEGP: mov w0, wzr
+; ZEROGP: mov w0, #0
+  ret i8 0
+}
+
+define i16 @ti16() {
+entry:
+; ALL-LABEL: ti16:
+; NONEGP: mov w0, wzr
+; ZEROGP: mov w0, #0
+  ret i16 0
+}
+
+define i32 @ti32() {
+entry:
+; ALL-LABEL: ti32:
+; NONEGP: mov w0, wzr
+; ZEROGP: mov w0, #0
+  ret i32 0
+}
+
+define i64 @ti64() {
+entry:
+; ALL-LABEL: ti64:
+; NONEGP: mov x0, xzr
+; ZEROGP: mov x0, #0
+  ret i64 0
+}
+
+define float @tf32() {
+entry:
+; ALL-LABEL: tf32:
+; NONEFP: mov s0, wzr
+; ZEROFP: movi v0.2d, #0
+  ret float 0.0
+}
+
+define double @td64() {
+entry:
+; ALL-LABEL: td64:
+; NONEFP: mov d0, xzr
+; ZEROFP: movi v0.2d, #0
+  ret double 0.0
+}
+
+define <8 x i8> @tv8i8() {
+entry:
+; ALL-LABEL: tv8i8:
+; ALL: movi d0, #0
+  ret <8 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>
+}
+
+define <4 x i16> @tv4i16() {
+entry:
+; ALL-LABEL: tv4i16:
+; ALL: movi d0, #0
+  ret <4 x i16> <i16 0, i16 0, i16 0, i16 0>
+}
+
+define <2 x i32> @tv2i32() {
+entry:
+; ALL-LABEL: tv2i32:
+; ALL: movi d0, #0
+  ret <2 x i32> <i32 0, i32 0>
+}
+
+define <2 x float> @tv2f32() {
+entry:
+; ALL-LABEL: tv2f32:
+; ALL: movi d0, #0
+  ret <2 x float> <float 0.0, float 0.0>
+}
+
+define <16 x i8> @tv16i8() {
+entry:
+; ALL-LABEL: tv16i8:
+; ALL: movi{{(.16b)?}} v0{{(.2d)?}}, #0
+  ret <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>
+}
+
+define <8 x i16> @tv8i16() {
+entry:
+; ALL-LABEL: tv8i16:
+; ALL: movi{{(.16b)?}} v0{{(.2d)?}}, #0
+  ret <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
+}
+
+define <4 x i32> @tv4i32() {
+entry:
+; ALL-LABEL: tv4i32:
+; ALL: movi{{(.16b)?}} v0{{(.2d)?}}, #0
+  ret <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+}
+
+define <2 x i64> @tv2i64() {
+entry:
+; ALL-LABEL: tv2i64:
+; ALL: movi{{(.16b)?}} v0{{(.2d)?}}, #0
+  ret <2 x i64> <i64 0, i64 0>
+}
+
+define <4 x float> @tv4f32() {
+entry:
+; ALL-LABEL: tv4f32:
+; ALL: movi{{(.16b)?}} v0{{(.2d)?}}, #0
+  ret <4 x float> <float 0.0, float 0.0, float 0.0, float 0.0>
+}
+
+define <2 x double> @tv2d64() {
+entry:
+; ALL-LABEL: tv2d64:
+; ALL: movi{{(.16b)?}} v0{{(.2d)?}}, #0
+  ret <2 x double> <double 0.0, double 0.0>
+}

-declare double @sin(double)