[AArch64] Add support for Qualcomm Kryo CPU.

Machine model description by Dave Estes <cestes@codeaurora.org>. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@260686 91177308-0d34-0410-b5e6-96231b3b80d8
2024-11-24 12:19:53 +00:00 · 2016-02-12 15:51:51 +00:00 · 2016-02-12 15:51:51 +00:00 · 1f88b2d0b7
commit 1f88b2d0b7
parent 9234391598
11 changed files with 2509 additions and 5 deletions
--- a/lib/Target/AArch64/AArch64.td
+++ b/lib/Target/AArch64/AArch64.td
@ -91,6 +91,7 @@ include "AArch64SchedA53.td"
 include "AArch64SchedA57.td"
 include "AArch64SchedCyclone.td"
 include "AArch64SchedM1.td"
+include "AArch64SchedKryo.td"

 def ProcA35     : SubtargetFeature<"a35", "ARMProcFamily", "CortexA35",
                                   "Cortex-A35 ARM processors",
@ -133,6 +134,14 @@ def ProcExynosM1 : SubtargetFeature<"exynosm1", "ARMProcFamily", "ExynosM1",
                                    FeatureCRC,
                                    FeaturePerfMon]>;

+def ProcKryo    : SubtargetFeature<"kryo", "ARMProcFamily", "Kryo",
+                                   "Qualcomm Kryo processors",
+                                   [FeatureFPARMv8,
+                                   FeatureNEON,
+                                   FeatureCrypto,
+                                   FeatureCRC,
+                                   FeaturePerfMon]>;
+
 def : ProcessorModel<"generic", NoSchedModel, [FeatureFPARMv8,
                                              FeatureNEON,
                                              FeatureCRC,
@ -146,6 +155,7 @@ def : ProcessorModel<"cortex-a57", CortexA57Model, [ProcA57]>;
 def : ProcessorModel<"cortex-a72", CortexA57Model, [ProcA57]>;
 def : ProcessorModel<"cyclone", CycloneModel, [ProcCyclone]>;
 def : ProcessorModel<"exynos-m1", ExynosM1Model, [ProcExynosM1]>;
+def : ProcessorModel<"kryo", KryoModel, [ProcKryo]>;

 //===----------------------------------------------------------------------===//
 // Assembly parser
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@ -637,7 +637,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
  }

  // Prefer likely predicted branches to selects on out-of-order cores.
-  if (Subtarget->isCortexA57())
+  if (Subtarget->isCortexA57() || Subtarget->isKryo())
    PredictableSelectIsExpensive = true;
 }

--- a/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/lib/Target/AArch64/AArch64InstrInfo.cpp
@ -543,7 +543,8 @@ static bool canBeExpandedToORR(const MachineInstr *MI, unsigned BitSize) {
 // FIXME: this implementation should be micro-architecture dependent, so a
 // micro-architecture target hook should be introduced here in future.
 bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr *MI) const {
-  if (!Subtarget.isCortexA57() && !Subtarget.isCortexA53())
+  if (!Subtarget.isCortexA57() && !Subtarget.isCortexA53() &&
+      !Subtarget.isKryo())
    return MI->isAsCheapAsAMove();

  switch (MI->getOpcode()) {
--- a/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ b/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@ -1969,7 +1969,7 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
 }

 bool AArch64LoadStoreOpt::enableNarrowLdMerge(MachineFunction &Fn) {
-  bool ProfitableArch = Subtarget->isCortexA57();
+  bool ProfitableArch = Subtarget->isCortexA57() || Subtarget->isKryo();
  // FIXME: The benefit from converting narrow loads into a wider load could be
  // microarchitectural as it assumes that a single load with two bitfield
  // extracts is cheaper than two narrow loads. Currently, this conversion is
--- a/lib/Target/AArch64/AArch64SchedKryo.td
+++ b/lib/Target/AArch64/AArch64SchedKryo.td
@ -0,0 +1,130 @@
+//==- AArch64SchedKryo.td - Qualcomm Kryo Scheduling Defs ---*- tablegen -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for Qualcomm Kryo to support
+// instruction scheduling and other instruction cost heuristics.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// The issue width is set to five, matching the five issue queues for expanded
+// uops. Now, the latency spreadsheet has information based on fragmented uops,
+// but these do not actually take up an issue queue.
+
+def KryoModel : SchedMachineModel {
+  let IssueWidth        =   5; // 5-wide issue for expanded uops
+  let MicroOpBufferSize = 128; // Out-of-order with temporary unified issue buffer
+  let LoadLatency       =   4; // Optimistic load latency
+  let MispredictPenalty =  14; // Fetch + Decode/Rename/Dispatch + Branch
+
+  // Enable partial & runtime unrolling. The magic number is chosen based on
+  // experiments and benchmarking data.
+  let LoopMicroOpBufferSize = 16;
+}
+
+//===----------------------------------------------------------------------===//
+// Define each kind of processor resource and number available on Kryo.
+
+let SchedModel = KryoModel in {
+  def KryoUnitXA : ProcResource<1>;                   // Type X(A) micro-ops
+  def KryoUnitXB : ProcResource<1>;                   // Type X(B) micro-ops
+  def KryoUnitYA : ProcResource<1>;                   // Type Y(A) micro-ops
+  def KryoUnitYB : ProcResource<1>;                   // Type Y(B) micro-ops
+  def KryoUnitX : ProcResGroup<[KryoUnitXA,          // Type X micro-ops
+                                KryoUnitXB]>;
+  def KryoUnitY : ProcResGroup<[KryoUnitYA,          // Type Y micro-ops
+                                KryoUnitYB]>;
+  def KryoUnitXY : ProcResGroup<[KryoUnitXA,         // Type XY micro-ops
+                                 KryoUnitXB,
+                                 KryoUnitYA,
+                                 KryoUnitYB]>;
+  def KryoUnitLSA : ProcResource<1>;                  // Type LS(A) micro-ops
+  def KryoUnitLSB : ProcResource<1>;                  // Type LS(B) micro-ops
+  def KryoUnitLS : ProcResGroup<[KryoUnitLSA,        // Type LS micro-ops
+                                 KryoUnitLSB]>;
+}
+
+let SchedModel = KryoModel in {
+
+//===----------------------------------------------------------------------===//
+// Map the target-defined scheduler read/write resources and latency for
+// Kryo.
+
+def : WriteRes<WriteImm,   [KryoUnitXY]> { let Latency = 1; }
+def : WriteRes<WriteI,     [KryoUnitXY]> { let Latency = 1; }
+def : WriteRes<WriteISReg, [KryoUnitXY, KryoUnitXY]>
+      { let Latency = 2; let NumMicroOps = 2; }
+def : WriteRes<WriteIEReg, [KryoUnitXY, KryoUnitXY]>
+      { let Latency = 2; let NumMicroOps = 2; }
+def : WriteRes<WriteExtr,  [KryoUnitXY, KryoUnitX]>
+      { let Latency = 2; let NumMicroOps = 2; }
+def : WriteRes<WriteIS,    [KryoUnitXY]> { let Latency = 2; }
+def : WriteRes<WriteID32,  [KryoUnitXA, KryoUnitY]>
+      { let Latency = 8; let NumMicroOps = 1; } // Fragment -1
+def : WriteRes<WriteID64,  [KryoUnitXA, KryoUnitY]>
+      { let Latency = 8; let NumMicroOps = 1; } // Fragment -1
+def : WriteRes<WriteIM32,  [KryoUnitX]> { let Latency = 5; }
+def : WriteRes<WriteIM64,  [KryoUnitX]> { let Latency = 5; }
+def : WriteRes<WriteBr,    [KryoUnitXY]> { let Latency = 1; }
+def : WriteRes<WriteBrReg, [KryoUnitXY]> { let Latency = 1; }
+def : WriteRes<WriteLD,    [KryoUnitLS]> { let Latency = 4; }
+def : WriteRes<WriteST,    [KryoUnitLS]> { let Latency = 4; }
+def : WriteRes<WriteSTP,   [KryoUnitLS]> { let Latency = 4; }
+def : WriteRes<WriteAdr,   [KryoUnitXY]> { let Latency = 6; }
+def : WriteRes<WriteLDIdx, [KryoUnitLS]> { let Latency = 4; }
+def : WriteRes<WriteSTIdx, [KryoUnitLS]> { let Latency = 4; }
+def : WriteRes<WriteF,     [KryoUnitXY, KryoUnitXY]>
+      { let Latency = 3; let NumMicroOps = 2; }
+def : WriteRes<WriteFCmp,  [KryoUnitXY]> { let Latency = 2; }
+def : WriteRes<WriteFCvt,  [KryoUnitX]> { let Latency = 4; }
+def : WriteRes<WriteFCopy, [KryoUnitXY]> { let Latency = 6; }
+def : WriteRes<WriteFImm,  [KryoUnitXY]> { let Latency = 6; }
+def : WriteRes<WriteFMul,  [KryoUnitX, KryoUnitX]>
+      { let Latency = 6; let NumMicroOps = 2; }
+def : WriteRes<WriteFDiv,  [KryoUnitXA, KryoUnitY]>
+      { let Latency = 12; let NumMicroOps = 2; } // Fragment -1 / NoRSV +1
+def : WriteRes<WriteV,     [KryoUnitXY]> { let Latency = 6; }
+def : WriteRes<WriteVLD,   [KryoUnitLS]> { let Latency = 4; }
+def : WriteRes<WriteVST,   [KryoUnitLS]> { let Latency = 4; }
+
+def : WriteRes<WriteSys,     []> { let Latency = 1; }
+def : WriteRes<WriteBarrier, []> { let Latency = 1; }
+def : WriteRes<WriteHint,    []> { let Latency = 1; }
+
+def : WriteRes<WriteLDHi,    []> { let Latency = 4; }
+
+// No forwarding logic is modelled yet.
+def : ReadAdvance<ReadI,       0>;
+def : ReadAdvance<ReadISReg,   0>;
+def : ReadAdvance<ReadIEReg,   0>;
+def : ReadAdvance<ReadIM,      0>;
+def : ReadAdvance<ReadIMA,     0>;
+def : ReadAdvance<ReadID,      0>;
+def : ReadAdvance<ReadExtrHi,  0>;
+def : ReadAdvance<ReadAdrBase, 0>;
+def : ReadAdvance<ReadVLD,     0>;
+
+
+//===----------------------------------------------------------------------===//
+// Specialize the coarse model by associating instruction groups with the
+// subtarget-defined types. As the model is refined, this will override most
+// of the above SchedWriteRes and SchedAlias mappings.
+
+// Miscellaneous
+// -----------------------------------------------------------------------------
+
+def : InstRW<[WriteI], (instrs COPY)>;
+
+
+// Detailed Refinements
+// -----------------------------------------------------------------------------
+include "AArch64SchedKryoDetails.td"
+
+
+} // SchedModel = KryoModel
--- a/lib/Target/AArch64/AArch64SchedKryoDetails.td
+++ b/lib/Target/AArch64/AArch64SchedKryoDetails.td
--- a/lib/Target/AArch64/AArch64Subtarget.h
+++ b/lib/Target/AArch64/AArch64Subtarget.h
@ -39,7 +39,8 @@ protected:
    CortexA53,
    CortexA57,
    Cyclone,
-    ExynosM1
+    ExynosM1,
+    Kryo
  };

  /// ARMProcFamily - ARM processor family: Cortex-A53, Cortex-A57, and others.
@ -151,6 +152,7 @@ public:
  bool isCortexA57() const { return CPUString == "cortex-a57"; }
  bool isCortexA53() const { return CPUString == "cortex-a53"; }
  bool isExynosM1() const { return CPUString == "exynos-m1"; }
+  bool isKryo() const { return CPUString == "kryo"; }

  bool useAA() const override { return isCortexA53(); }

--- a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@ -472,7 +472,7 @@ int AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) {
 }

 unsigned AArch64TTIImpl::getMaxInterleaveFactor(unsigned VF) {
-  if (ST->isCortexA57())
+  if (ST->isCortexA57() || ST->isKryo())
    return 4;
  return 2;
 }
--- a/test/CodeGen/AArch64/arm64-narrow-ldst-merge.ll
+++ b/test/CodeGen/AArch64/arm64-narrow-ldst-merge.ll
@ -1,5 +1,6 @@
 ; RUN: llc < %s -mtriple aarch64--none-eabi -mcpu=cortex-a57 -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=LE
 ; RUN: llc < %s -mtriple aarch64_be--none-eabi -mcpu=cortex-a57 -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=BE
+; RUN: llc < %s -mtriple aarch64--none-eabi -mcpu=kryo -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=LE

 ; CHECK-LABEL: Ldrh_merge
 ; CHECK-NOT: ldrh
--- a/test/CodeGen/AArch64/cpus.ll
+++ b/test/CodeGen/AArch64/cpus.ll
@ -7,6 +7,7 @@
 ; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=cortex-a57 2>&1 | FileCheck %s
 ; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=cortex-a72 2>&1 | FileCheck %s
 ; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=exynos-m1 2>&1 | FileCheck %s
+; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=kryo 2>&1 | FileCheck %s
 ; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=invalidcpu 2>&1 | FileCheck %s --check-prefix=INVALID

 ; CHECK-NOT: {{.*}}  is not a recognized processor for this target
--- a/test/CodeGen/AArch64/remat.ll
+++ b/test/CodeGen/AArch64/remat.ll
@ -3,6 +3,7 @@
 ; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=cortex-a53 -o - %s | FileCheck %s
 ; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=cortex-a72 -o - %s | FileCheck %s
 ; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=exynos-m1 -o - %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=kryo -o - %s | FileCheck %s

 %X = type { i64, i64, i64 }
 declare void @f(%X*)