[AArch64] Split zero cycle feature more granularly

Split the `zcz` feature into specific ones for GP and FP registers, `zcz-gp`
and `zcz-fp`, respectively, while retaining the original feature option to
mean both.

Differential revision: https://reviews.llvm.org/D52621

llvm-svn: 343354
This commit is contained in:
Evandro Menezes 2018-09-28 19:05:09 +00:00
parent 882252baeb
commit fdd7b1d490
6 changed files with 200 additions and 54 deletions

View File

@ -80,13 +80,17 @@ def FeatureSPE : SubtargetFeature<"spe", "HasSPE", "true",
def FeatureSVE : SubtargetFeature<"sve", "HasSVE", "true",
"Enable Scalable Vector Extension (SVE) instructions">;
/// Cyclone has register move instructions which are "free".
def FeatureZCRegMove : SubtargetFeature<"zcm", "HasZeroCycleRegMove", "true",
"Has zero-cycle register moves">;
def FeatureZCZeroingGP : SubtargetFeature<"zcz-gp", "HasZeroCycleZeroingGP", "true",
"Has zero-cycle zeroing instructions for generic registers">;
def FeatureZCZeroingFP : SubtargetFeature<"zcz-fp", "HasZeroCycleZeroingFP", "true",
"Has zero-cycle zeroing instructions for FP registers">;
/// Cyclone has instructions which zero registers for "free".
def FeatureZCZeroing : SubtargetFeature<"zcz", "HasZeroCycleZeroing", "true",
"Has zero-cycle zeroing instructions">;
"Has zero-cycle zeroing instructions",
[FeatureZCZeroingGP, FeatureZCZeroingFP]>;
/// ... but the floating-point version doesn't quite work in rare cases on older
/// CPUs.
@ -404,7 +408,7 @@ def ProcExynosM1 : SubtargetFeature<"exynosm1", "ARMProcFamily", "ExynosM1",
FeaturePostRAScheduler,
FeatureSlowMisaligned128Store,
FeatureUseRSqrt,
FeatureZCZeroing]>;
FeatureZCZeroingFP]>;
def ProcExynosM2 : SubtargetFeature<"exynosm2", "ARMProcFamily", "ExynosM1",
"Samsung Exynos-M2 processors",
@ -418,7 +422,7 @@ def ProcExynosM2 : SubtargetFeature<"exynosm2", "ARMProcFamily", "ExynosM1",
FeaturePerfMon,
FeaturePostRAScheduler,
FeatureSlowMisaligned128Store,
FeatureZCZeroing]>;
FeatureZCZeroingFP]>;
def ProcExynosM3 : SubtargetFeature<"exynosm3", "ARMProcFamily", "ExynosM3",
"Samsung Exynos-M3 processors",
@ -435,7 +439,7 @@ def ProcExynosM3 : SubtargetFeature<"exynosm3", "ARMProcFamily", "ExynosM3",
FeaturePerfMon,
FeaturePostRAScheduler,
FeaturePredictableSelectIsExpensive,
FeatureZCZeroing]>;
FeatureZCZeroingFP]>;
def ProcKryo : SubtargetFeature<"kryo", "ARMProcFamily", "Kryo",
"Qualcomm Kryo processors", [

View File

@ -503,7 +503,7 @@ void AArch64AsmPrinter::LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM,
void AArch64AsmPrinter::EmitFMov0(const MachineInstr &MI) {
unsigned DestReg = MI.getOperand(0).getReg();
if (STI->hasZeroCycleZeroing() && !STI->hasZeroCycleZeroingFPWorkaround()) {
if (STI->hasZeroCycleZeroingFP() && !STI->hasZeroCycleZeroingFPWorkaround()) {
// Convert H/S/D register to corresponding Q register
if (AArch64::H0 <= DestReg && DestReg <= AArch64::H31)
DestReg = AArch64::Q0 + (DestReg - AArch64::H0);

View File

@ -729,9 +729,9 @@ bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const {
case AArch64::FMOVH0:
case AArch64::FMOVS0:
case AArch64::FMOVD0:
return Subtarget.hasZeroCycleZeroing();
return Subtarget.hasZeroCycleZeroingFP();
case TargetOpcode::COPY:
return (Subtarget.hasZeroCycleZeroing() &&
return (Subtarget.hasZeroCycleZeroingGP() &&
(MI.getOperand(1).getReg() == AArch64::WZR ||
MI.getOperand(1).getReg() == AArch64::XZR));
}
@ -2481,7 +2481,7 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
.addImm(0)
.addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
}
} else if (SrcReg == AArch64::WZR && Subtarget.hasZeroCycleZeroing()) {
} else if (SrcReg == AArch64::WZR && Subtarget.hasZeroCycleZeroingGP()) {
BuildMI(MBB, I, DL, get(AArch64::MOVZWi), DestReg)
.addImm(0)
.addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
@ -2518,7 +2518,7 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
.addReg(SrcReg, getKillRegState(KillSrc))
.addImm(0)
.addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
} else if (SrcReg == AArch64::XZR && Subtarget.hasZeroCycleZeroing()) {
} else if (SrcReg == AArch64::XZR && Subtarget.hasZeroCycleZeroingGP()) {
BuildMI(MBB, I, DL, get(AArch64::MOVZXi), DestReg)
.addImm(0)
.addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));

View File

@ -109,6 +109,8 @@ protected:
// HasZeroCycleZeroing - Has zero-cycle zeroing instructions.
bool HasZeroCycleZeroing = false;
bool HasZeroCycleZeroingGP = false;
bool HasZeroCycleZeroingFP = false;
bool HasZeroCycleZeroingFPWorkaround = false;
// StrictAlign - Disallow unaligned memory accesses.
@ -228,7 +230,9 @@ public:
bool hasZeroCycleRegMove() const { return HasZeroCycleRegMove; }
bool hasZeroCycleZeroing() const { return HasZeroCycleZeroing; }
bool hasZeroCycleZeroingGP() const { return HasZeroCycleZeroingGP; }
bool hasZeroCycleZeroingFP() const { return HasZeroCycleZeroingFP; }
bool hasZeroCycleZeroingFPWorkaround() const {
return HasZeroCycleZeroingFPWorkaround;

View File

@ -1,16 +1,20 @@
; RUN: llc -mtriple=arm64-apple-ios -mcpu=cyclone < %s | FileCheck %s
; rdar://12254953
; RUN: llc < %s -mtriple=arm64-apple-ios -mattr=-zcm | FileCheck %s -check-prefixes=CHECK,NOT
; RUN: llc < %s -mtriple=arm64-apple-ios -mattr=+zcm | FileCheck %s -check-prefixes=CHECK,YES
; RUN: llc < %s -mtriple=arm64-apple-ios -mcpu=cyclone | FileCheck %s -check-prefixes=CHECK,YES
; rdar://12254953
define i32 @t(i32 %a, i32 %b, i32 %c, i32 %d) nounwind ssp {
entry:
; CHECK-LABEL: t:
; CHECK: mov [[REG2:x[0-9]+]], x3
; CHECK: mov [[REG1:x[0-9]+]], x2
; CHECK: mov x0, x2
; CHECK: mov x1, x3
; NOT: mov [[REG2:w[0-9]+]], w3
; NOT: mov [[REG1:w[0-9]+]], w2
; YES: mov [[REG2:x[0-9]+]], x3
; YES: mov [[REG1:x[0-9]+]], x2
; CHECK: bl _foo
; CHECK: mov x0, [[REG1]]
; CHECK: mov x1, [[REG2]]
; NOT: mov w0, [[REG1]]
; NOT: mov w1, [[REG2]]
; YES: mov x0, [[REG1]]
; YES: mov x1, [[REG2]]
%call = call i32 @foo(i32 %c, i32 %d) nounwind
%call1 = call i32 @foo(i32 %c, i32 %d) nounwind
unreachable

View File

@ -1,9 +1,14 @@
; RUN: llc -mtriple=arm64-apple-ios -mcpu=cyclone < %s | FileCheck %s -check-prefixes=ALL,CYCLONE
; RUN: llc -mtriple=arm64-apple-ios -mcpu=cyclone -mattr=+fullfp16 < %s | FileCheck %s -check-prefixes=CYCLONE-FULLFP16
; RUN: llc -mtriple=aarch64-gnu-linux -mcpu=exynos-m1 < %s | FileCheck %s -check-prefixes=ALL,OTHERS
; RUN: llc -mtriple=aarch64-gnu-linux -mcpu=exynos-m3 < %s | FileCheck %s -check-prefixes=ALL,OTHERS
; RUN: llc -mtriple=aarch64-gnu-linux -mcpu=kryo < %s | FileCheck %s -check-prefixes=ALL,OTHERS
; RUN: llc -mtriple=aarch64-gnu-linux -mcpu=falkor < %s | FileCheck %s -check-prefixes=ALL,OTHERS
; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=-zcz | FileCheck %s -check-prefixes=ALL,NONEGP,NONEFP
; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+zcz | FileCheck %s -check-prefixes=ALL,ZEROGP,ZEROFP
; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+zcz -mattr=+fullfp16 | FileCheck %s -check-prefixes=ALL,ZEROGP,ZERO16
; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+zcz-gp | FileCheck %s -check-prefixes=ALL,ZEROGP,NONEFP
; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+zcz-fp | FileCheck %s -check-prefixes=ALL,NONEGP,ZEROFP
; RUN: llc < %s -mtriple=arm64-apple-ios -mcpu=cyclone | FileCheck %s -check-prefixes=ALL,ZEROGP,NONEFP
; RUN: llc < %s -mtriple=arm64-apple-ios -mcpu=cyclone -mattr=+fullfp16 | FileCheck %s -check-prefixes=ALL,ZEROGP,NONE16
; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=exynos-m1 | FileCheck %s -check-prefixes=ALL,NONEGP,ZEROFP
; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=exynos-m3 | FileCheck %s -check-prefixes=ALL,NONEGP,ZEROFP
; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=kryo | FileCheck %s -check-prefixes=ALL,ZEROGP,ZEROFP
; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=falkor | FileCheck %s -check-prefixes=ALL,ZEROGP,ZEROFP
declare void @bar(half, float, double, <2 x double>)
declare void @bari(i32, i32)
@ -14,17 +19,22 @@ define void @t1() nounwind ssp {
entry:
; ALL-LABEL: t1:
; ALL-NOT: fmov
; ALL: ldr h0,{{.*}}
; CYCLONE: fmov s1, wzr
; CYCLONE: fmov d2, xzr
; CYCLONE: movi.16b v3, #0
; CYCLONE-FULLFP16: fmov h0, wzr
; CYCLONE-FULLFP16: fmov s1, wzr
; CYCLONE-FULLFP16: fmov d2, xzr
; CYCLONE-FULLFP16: movi.16b v3, #0
; OTHERS: movi v{{[0-3]+}}.2d, #0000000000000000
; OTHERS: movi v{{[0-3]+}}.2d, #0000000000000000
; OTHERS: movi v{{[0-3]+}}.2d, #0000000000000000
; NONEFP: ldr h0,{{.*}}
; NONEFP: fmov s1, wzr
; NONEFP: fmov d2, xzr
; NONEFP: movi{{(.16b)?}} v3{{(.2d)?}}, #0
; NONE16: fmov h0, wzr
; NONE16: fmov s1, wzr
; NONE16: fmov d2, xzr
; NONE16: movi{{(.16b)?}} v3{{(.2d)?}}, #0
; ZEROFP: ldr h0,{{.*}}
; ZEROFP: movi v{{[0-3]+}}.2d, #0
; ZEROFP: movi v{{[0-3]+}}.2d, #0
; ZEROFP: movi v{{[0-3]+}}.2d, #0
; ZERO16: movi v{{[0-3]+}}.2d, #0
; ZERO16: movi v{{[0-3]+}}.2d, #0
; ZERO16: movi v{{[0-3]+}}.2d, #0
; ZERO16: movi v{{[0-3]+}}.2d, #0
tail call void @bar(half 0.000000e+00, float 0.000000e+00, double 0.000000e+00, <2 x double> <double 0.000000e+00, double 0.000000e+00>) nounwind
ret void
}
@ -32,9 +42,10 @@ entry:
define void @t2() nounwind ssp {
entry:
; ALL-LABEL: t2:
; ALL-NOT: mov w0, wzr
; ALL: mov w{{[0-3]+}}, #0
; ALL: mov w{{[0-3]+}}, #0
; NONEGP: mov w0, wzr
; NONEGP: mov w1, wzr
; ZEROGP: mov w0, #0
; ZEROGP: mov w1, #0
tail call void @bari(i32 0, i32 0) nounwind
ret void
}
@ -42,26 +53,26 @@ entry:
define void @t3() nounwind ssp {
entry:
; ALL-LABEL: t3:
; ALL-NOT: mov x0, xzr
; ALL: mov x{{[0-3]+}}, #0
; ALL: mov x{{[0-3]+}}, #0
; NONEGP: mov x0, xzr
; NONEGP: mov x1, xzr
; ZEROGP: mov x0, #0
; ZEROGP: mov x1, #0
tail call void @barl(i64 0, i64 0) nounwind
ret void
}
define void @t4() nounwind ssp {
; ALL-LABEL: t4:
; ALL-NOT: fmov
; CYCLONE: fmov s{{[0-3]+}}, wzr
; CYCLONE: fmov s{{[0-3]+}}, wzr
; CYCLONE-FULLFP16: fmov s{{[0-3]+}}, wzr
; CYCLONE-FULLFP16: fmov s{{[0-3]+}}, wzr
; OTHERS: movi v{{[0-3]+}}.2d, #0000000000000000
; OTHERS: movi v{{[0-3]+}}.2d, #0000000000000000
; NONEFP: fmov s{{[0-3]+}}, wzr
; NONEFP: fmov s{{[0-3]+}}, wzr
; ZEROFP: movi v{{[0-3]+}}.2d, #0
; ZEROFP: movi v{{[0-3]+}}.2d, #0
tail call void @barf(float 0.000000e+00, float 0.000000e+00) nounwind
ret void
}
declare double @sin(double)
; We used to produce spills+reloads for a Q register with zero cycle zeroing
; enabled.
; ALL-LABEL: foo:
@ -88,10 +99,133 @@ for.end:
define <2 x i64> @t6() {
; ALL-LABEL: t6:
; CYCLONE: movi.16b v0, #0
; OTHERS: movi v0.2d, #0000000000000000
ret <2 x i64> zeroinitializer
; ALL: movi{{(.16b)?}} v0{{(.2d)?}}, #0
ret <2 x i64> zeroinitializer
}
; Returns the i1 constant false. Runs checked with the NONEGP prefix expect a
; copy from wzr; runs checked with the ZEROGP prefix expect an immediate move
; of #0 into w0.
define i1 @ti1() {
entry:
; ALL-LABEL: ti1:
; NONEGP: mov w0, wzr
; ZEROGP: mov w0, #0
ret i1 false
}
; Returns the i8 constant 0. The result is checked in w0: a copy from wzr for
; the NONEGP runs, an immediate move of #0 for the ZEROGP runs.
define i8 @ti8() {
entry:
; ALL-LABEL: ti8:
; NONEGP: mov w0, wzr
; ZEROGP: mov w0, #0
ret i8 0
}
; Returns the i16 constant 0. The result is checked in w0: a copy from wzr for
; the NONEGP runs, an immediate move of #0 for the ZEROGP runs.
define i16 @ti16() {
entry:
; ALL-LABEL: ti16:
; NONEGP: mov w0, wzr
; ZEROGP: mov w0, #0
ret i16 0
}
; Returns the i32 constant 0. The result is checked in w0: a copy from wzr for
; the NONEGP runs, an immediate move of #0 for the ZEROGP runs.
define i32 @ti32() {
entry:
; ALL-LABEL: ti32:
; NONEGP: mov w0, wzr
; ZEROGP: mov w0, #0
ret i32 0
}
; Returns the i64 constant 0. The result is checked in x0: a copy from xzr for
; the NONEGP runs, an immediate move of #0 for the ZEROGP runs.
define i64 @ti64() {
entry:
; ALL-LABEL: ti64:
; NONEGP: mov x0, xzr
; ZEROGP: mov x0, #0
ret i64 0
}
; Returns the float constant 0.0. Runs checked with the ZEROFP prefix expect
; the result register to be zeroed with a vector movi of #0 into v0.2d; runs
; checked with the NONEFP prefix expect a move from wzr into s0.
; NOTE(review): the NONEFP pattern below reads `mov s0, wzr`, whereas t4 in
; this file checks `fmov s..., wzr` for float zeros. Presumably this passes
; only through FileCheck substring matching against `fmov s0, wzr` - confirm,
; and consider spelling `fmov` explicitly.
define float @tf32() {
entry:
; ALL-LABEL: tf32:
; NONEFP: mov s0, wzr
; ZEROFP: movi v0.2d, #0
ret float 0.0
}
; Returns the double constant 0.0. Runs checked with the ZEROFP prefix expect
; the result register to be zeroed with a vector movi of #0 into v0.2d; runs
; checked with the NONEFP prefix expect a move from xzr into d0.
; NOTE(review): the NONEFP pattern below reads `mov d0, xzr`, whereas t1 in
; this file checks `fmov d2, xzr` for a double zero. Presumably this passes
; only through FileCheck substring matching against `fmov d0, xzr` - confirm,
; and consider spelling `fmov` explicitly.
define double @td64() {
entry:
; ALL-LABEL: td64:
; NONEFP: mov d0, xzr
; ZEROFP: movi v0.2d, #0
ret double 0.0
}
; Returns an all-zero <8 x i8> vector. The single pattern uses the prefix that
; is shared by every RUN line, so all configurations expect `movi d0, #0`.
define <8 x i8> @tv8i8() {
entry:
; ALL-LABEL: tv8i8:
; ALL: movi d0, #0
ret <8 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>
}
; Returns an all-zero <4 x i16> vector. The single pattern uses the prefix that
; is shared by every RUN line, so all configurations expect `movi d0, #0`.
define <4 x i16> @tv4i16() {
entry:
; ALL-LABEL: tv4i16:
; ALL: movi d0, #0
ret <4 x i16> <i16 0, i16 0, i16 0, i16 0>
}
; Returns an all-zero <2 x i32> vector. The single pattern uses the prefix that
; is shared by every RUN line, so all configurations expect `movi d0, #0`.
define <2 x i32> @tv2i32() {
entry:
; ALL-LABEL: tv2i32:
; ALL: movi d0, #0
ret <2 x i32> <i32 0, i32 0>
}
; Returns an all-zero <2 x float> vector. The single pattern uses the prefix
; that is shared by every RUN line, so all configurations expect
; `movi d0, #0`.
define <2 x float> @tv2f32() {
entry:
; ALL-LABEL: tv2f32:
; ALL: movi d0, #0
ret <2 x float> <float 0.0, float 0.0>
}
; Returns an all-zero <16 x i8> vector. Every RUN configuration expects a movi
; of #0 into v0; the optional `.16b` / `.2d` groups in the pattern accept
; either placement of the arrangement suffix (presumably Apple vs. generic
; assembly syntax, since the cyclone runs use an arm64-apple-ios triple).
define <16 x i8> @tv16i8() {
entry:
; ALL-LABEL: tv16i8:
; ALL: movi{{(.16b)?}} v0{{(.2d)?}}, #0
ret <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>
}
; Returns an all-zero <8 x i16> vector. Every RUN configuration expects a movi
; of #0 into v0; the optional `.16b` / `.2d` groups in the pattern accept
; either placement of the arrangement suffix (presumably Apple vs. generic
; assembly syntax, since the cyclone runs use an arm64-apple-ios triple).
define <8 x i16> @tv8i16() {
entry:
; ALL-LABEL: tv8i16:
; ALL: movi{{(.16b)?}} v0{{(.2d)?}}, #0
ret <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
}
; Returns an all-zero <4 x i32> vector. Every RUN configuration expects a movi
; of #0 into v0; the optional `.16b` / `.2d` groups in the pattern accept
; either placement of the arrangement suffix (presumably Apple vs. generic
; assembly syntax, since the cyclone runs use an arm64-apple-ios triple).
define <4 x i32> @tv4i32() {
entry:
; ALL-LABEL: tv4i32:
; ALL: movi{{(.16b)?}} v0{{(.2d)?}}, #0
ret <4 x i32> <i32 0, i32 0, i32 0, i32 0>
}
; Returns an all-zero <2 x i64> vector. Every RUN configuration expects a movi
; of #0 into v0; the optional `.16b` / `.2d` groups in the pattern accept
; either placement of the arrangement suffix (presumably Apple vs. generic
; assembly syntax, since the cyclone runs use an arm64-apple-ios triple).
define <2 x i64> @tv2i64() {
entry:
; ALL-LABEL: tv2i64:
; ALL: movi{{(.16b)?}} v0{{(.2d)?}}, #0
ret <2 x i64> <i64 0, i64 0>
}
; Returns an all-zero <4 x float> vector. Every RUN configuration expects a
; movi of #0 into v0; the optional `.16b` / `.2d` groups in the pattern accept
; either placement of the arrangement suffix (presumably Apple vs. generic
; assembly syntax, since the cyclone runs use an arm64-apple-ios triple).
define <4 x float> @tv4f32() {
entry:
; ALL-LABEL: tv4f32:
; ALL: movi{{(.16b)?}} v0{{(.2d)?}}, #0
ret <4 x float> <float 0.0, float 0.0, float 0.0, float 0.0>
}
; Returns an all-zero <2 x double> vector. Every RUN configuration expects a
; movi of #0 into v0; the optional `.16b` / `.2d` groups in the pattern accept
; either placement of the arrangement suffix (presumably Apple vs. generic
; assembly syntax, since the cyclone runs use an arm64-apple-ios triple).
define <2 x double> @tv2d64() {
entry:
; ALL-LABEL: tv2d64:
; ALL: movi{{(.16b)?}} v0{{(.2d)?}}, #0
ret <2 x double> <double 0.0, double 0.0>
}
declare double @sin(double)