From 02e45aadbe1ba57402b81a71aeb3fd584ef034e4 Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Tue, 21 Feb 2017 06:39:13 +0000
Subject: [PATCH] [X86] Use SHLD with both inputs from the same register to
 implement rotate on Sandy Bridge and later Intel CPUs

Summary:
Sandy Bridge and later CPUs have better throughput when using SHLD to
implement a rotate than when using the normal rotate instructions.
Additionally, it saves one uop and avoids a partial flag update
dependency.

This patch implements this change on any Sandy Bridge or later
processor without BMI2 instructions. With BMI2 we will use RORX as we
currently do.

Reviewers: zvi

Reviewed By: zvi

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D30181

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@295697 91177308-0d34-0410-b5e6-96231b3b80d8
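
For reference, the kind of IR this affects is a rotate-left-by-immediate
written as two shifts and an or, in the same shape as the existing
rot32.ll tests (the function name below is only illustrative):

  ; 32-bit rotate left by 7. On a Sandy Bridge CPU (-mcpu=corei7-avx) this
  ; is now expected to lower to "shldl $7" with both inputs in the same
  ; register, instead of "roll $7"; with BMI2 it remains "rorxl $25".
  define i32 @rotl7(i32 %x) nounwind readnone {
  entry:
    %hi = shl i32 %x, 7
    %lo = lshr i32 %x, 25
    %r = or i32 %hi, %lo
    ret i32 %r
  }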
---
 lib/Target/X86/X86.td                 | 12 +++++++++++-
 lib/Target/X86/X86InstrInfo.td        |  1 +
 lib/Target/X86/X86InstrShiftRotate.td |  9 +++++++++
 lib/Target/X86/X86Subtarget.cpp       |  1 +
 lib/Target/X86/X86Subtarget.h         |  4 ++++
 test/CodeGen/X86/rot32.ll             |  9 +++++++++
 test/CodeGen/X86/rot64.ll             |  9 +++++++++
 7 files changed, 44 insertions(+), 1 deletion(-)

diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td
index 0253a9bcc5b..d1807043350 100644
--- a/lib/Target/X86/X86.td
+++ b/lib/Target/X86/X86.td
@@ -263,6 +263,15 @@ def FeatureFastLZCNT
     : SubtargetFeature<
           "fast-lzcnt", "HasFastLZCNT", "true",
           "LZCNT instructions are as fast as most simple integer ops">;
+
+// Sandy Bridge and newer processors can use SHLD with the same source on both
+// inputs to implement rotate to avoid the partial flag update of the normal
+// rotate instructions.
+def FeatureFastSHLDRotate
+    : SubtargetFeature<
+          "fast-shld-rotate", "HasFastSHLDRotate", "true",
+          "SHLD can be used as a faster rotate">;
+
 //===----------------------------------------------------------------------===//
 // X86 processors supported.
 //===----------------------------------------------------------------------===//
@@ -458,7 +467,8 @@ def SNBFeatures : ProcessorFeatures<[], [
   FeatureXSAVE,
   FeatureXSAVEOPT,
   FeatureLAHFSAHF,
-  FeatureFastScalarFSQRT
+  FeatureFastScalarFSQRT,
+  FeatureFastSHLDRotate
 ]>;
 
 class SandyBridgeProc<string Name> : ProcModel<Name, SandyBridgeModel,
diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td
--- a/lib/Target/X86/X86InstrInfo.td
+++ b/lib/Target/X86/X86InstrInfo.td
@@ ... @@
 def FavorMemIndirectCall : Predicate<"!Subtarget->callRegIndirect()">;
 def NotSlowIncDec : Predicate<"!Subtarget->slowIncDec()">;
 def HasFastMem32 : Predicate<"!Subtarget->isUnalignedMem32Slow()">;
 def HasFastLZCNT : Predicate<"Subtarget->hasFastLZCNT()">;
+def HasFastSHLDRotate : Predicate<"Subtarget->hasFastSHLDRotate()">;
 def HasMFence : Predicate<"Subtarget->hasMFence()">;
 
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/X86/X86InstrShiftRotate.td b/lib/Target/X86/X86InstrShiftRotate.td
index 8291ba0dc39..b21f0b923da 100644
--- a/lib/Target/X86/X86InstrShiftRotate.td
+++ b/lib/Target/X86/X86InstrShiftRotate.td
@@ -846,6 +846,15 @@ def SHRD64mri8 : RIi8<0xAC, MRMDestMem,
 } // Defs = [EFLAGS]
 
+// Sandy Bridge and newer Intel processors support faster rotates using
+// SHLD to avoid a partial flag update on the normal rotate instructions.
+let Predicates = [HasFastSHLDRotate], AddedComplexity = 5 in {
+  def : Pat<(rotl GR32:$src, (i8 imm:$shamt)),
+            (SHLD32rri8 GR32:$src, GR32:$src, imm:$shamt)>;
+  def : Pat<(rotl GR64:$src, (i8 imm:$shamt)),
+            (SHLD64rri8 GR64:$src, GR64:$src, imm:$shamt)>;
+}
+
 def ROT32L2R_imm8 : SDNodeXForm<imm, [{
   // Convert a ROTL shamt to a ROTR shamt on 32-bit integer.
   return getI8Imm(32 - N->getZExtValue(), SDLoc(N));
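
AddedComplexity = 5 gives the two patterns above priority over the default
rotate-by-immediate patterns whenever HasFastSHLDRotate is set. Note that
they match only rotates by an immediate; a rotate by a variable amount, as
in this illustrative sketch, is still expected to select the plain rotate
instruction even on fast-shld-rotate CPUs:

  ; Variable rotate amount: not covered by the SHLD patterns above, so
  ; this should still lower to "roll %cl, ..." rather than a SHLD.
  define i32 @rotl_var(i32 %x, i32 %n) nounwind readnone {
  entry:
    %hi = shl i32 %x, %n
    %amt = sub i32 32, %n
    %lo = lshr i32 %x, %amt
    %r = or i32 %hi, %lo
    ret i32 %r
  }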
diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp
index dfe22faef02..336db6647a3 100644
--- a/lib/Target/X86/X86Subtarget.cpp
+++ b/lib/Target/X86/X86Subtarget.cpp
@@ -302,6 +302,7 @@ void X86Subtarget::initializeEnvironment() {
   HasFastScalarFSQRT = false;
   HasFastVectorFSQRT = false;
   HasFastLZCNT = false;
+  HasFastSHLDRotate = false;
   HasSlowDivide32 = false;
   HasSlowDivide64 = false;
   PadShortFunctions = false;
diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h
index a82d92c2854..75b87a04e51 100644
--- a/lib/Target/X86/X86Subtarget.h
+++ b/lib/Target/X86/X86Subtarget.h
@@ -229,6 +229,9 @@ protected:
   /// True if LZCNT instruction is fast.
   bool HasFastLZCNT;
 
+  /// True if SHLD based rotate is fast.
+  bool HasFastSHLDRotate;
+
   /// True if the short functions should be padded to prevent
   /// a stall when returning too early.
   bool PadShortFunctions;
@@ -466,6 +469,7 @@ public:
   bool hasFastScalarFSQRT() const { return HasFastScalarFSQRT; }
   bool hasFastVectorFSQRT() const { return HasFastVectorFSQRT; }
   bool hasFastLZCNT() const { return HasFastLZCNT; }
+  bool hasFastSHLDRotate() const { return HasFastSHLDRotate; }
   bool hasSlowDivide32() const { return HasSlowDivide32; }
   bool hasSlowDivide64() const { return HasSlowDivide64; }
   bool padShortFunctions() const { return PadShortFunctions; }
diff --git a/test/CodeGen/X86/rot32.ll b/test/CodeGen/X86/rot32.ll
index 35f4784bc1d..79ecbe0514d 100644
--- a/test/CodeGen/X86/rot32.ll
+++ b/test/CodeGen/X86/rot32.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=x86 -mcpu=corei7 | FileCheck %s
+; RUN: llc < %s -march=x86 -mcpu=corei7-avx | FileCheck %s --check-prefix=SHLD
 ; RUN: llc < %s -march=x86 -mcpu=core-avx2 | FileCheck %s --check-prefix=BMI2
 
 define i32 @foo(i32 %x, i32 %y, i32 %z) nounwind readnone {
@@ -49,6 +50,8 @@ define i32 @xfoo(i32 %x, i32 %y, i32 %z) nounwind readnone {
 entry:
 ; CHECK-LABEL: xfoo:
 ; CHECK: roll $7
+; SHLD-LABEL: xfoo:
+; SHLD: shldl $7
 ; BMI2-LABEL: xfoo:
 ; BMI2: rorxl $25
 %0 = lshr i32 %x, 25
@@ -61,6 +64,8 @@ define i32 @xfoop(i32* %p) nounwind readnone {
 entry:
 ; CHECK-LABEL: xfoop:
 ; CHECK: roll $7
+; SHLD-LABEL: xfoop:
+; SHLD: shldl $7
 ; BMI2-LABEL: xfoop:
 ; BMI2: rorxl $25
 %x = load i32, i32* %p
@@ -84,6 +89,8 @@ define i32 @xun(i32 %x, i32 %y, i32 %z) nounwind readnone {
 entry:
 ; CHECK-LABEL: xun:
 ; CHECK: roll $25
+; SHLD-LABEL: xun:
+; SHLD: shldl $25
 ; BMI2-LABEL: xun:
 ; BMI2: rorxl $7
 %0 = lshr i32 %x, 7
@@ -96,6 +103,8 @@ define i32 @xunp(i32* %p) nounwind readnone {
 entry:
 ; CHECK-LABEL: xunp:
 ; CHECK: roll $25
+; SHLD-LABEL: xunp:
+; SHLD: shldl $25
 ; BMI2-LABEL: xunp:
 ; BMI2: rorxl $7
 %x = load i32, i32* %p
diff --git a/test/CodeGen/X86/rot64.ll b/test/CodeGen/X86/rot64.ll
index fc382ed7fc6..976acbb0167 100644
--- a/test/CodeGen/X86/rot64.ll
+++ b/test/CodeGen/X86/rot64.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=x86-64 -mcpu=corei7 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=corei7-avx | FileCheck %s --check-prefix=SHLD
 ; RUN: llc < %s -march=x86-64 -mcpu=core-avx2 | FileCheck %s --check-prefix=BMI2
 
 define i64 @foo(i64 %x, i64 %y, i64 %z) nounwind readnone {
@@ -49,6 +50,8 @@ define i64 @xfoo(i64 %x, i64 %y, i64 %z) nounwind readnone {
 entry:
 ; CHECK-LABEL: xfoo:
 ; CHECK: rolq $7
+; SHLD-LABEL: xfoo:
+; SHLD: shldq $7
 ; BMI2-LABEL: xfoo:
 ; BMI2: rorxq $57
 %0 = lshr i64 %x, 57
@@ -61,6 +64,8 @@ define i64 @xfoop(i64* %p) nounwind readnone {
 entry:
 ; CHECK-LABEL: xfoop:
 ; CHECK: rolq $7
+; SHLD-LABEL: xfoop:
+; SHLD: shldq $7
 ; BMI2-LABEL: xfoop:
 ; BMI2: rorxq $57
 %x = load i64, i64* %p
@@ -84,6 +89,8 @@ define i64 @xun(i64 %x, i64 %y, i64 %z) nounwind readnone {
 entry:
 ; CHECK-LABEL: xun:
 ; CHECK: rolq $57
+; SHLD-LABEL: xun:
+; SHLD: shldq $57
 ; BMI2-LABEL: xun:
 ; BMI2: rorxq $7
 %0 = lshr i64 %x, 7
@@ -96,6 +103,8 @@ define i64 @xunp(i64* %p) nounwind readnone {
 entry:
 ; CHECK-LABEL: xunp:
 ; CHECK: rolq $57
+; SHLD-LABEL: xunp:
+; SHLD: shldq $57
 ; BMI2-LABEL: xunp:
 ; BMI2: rorxq $7
 %x = load i64, i64* %p
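
The new RUN lines use -mcpu=corei7-avx to exercise a Sandy Bridge target
without BMI2 (with BMI2, core-avx2, RORX is still preferred). A minimal
standalone check in the same style as the tests above (hypothetical file,
not part of this patch):

  ; RUN: llc < %s -march=x86-64 -mcpu=corei7-avx | FileCheck %s
  define i64 @rotl7_64(i64 %x) nounwind readnone {
  entry:
  ; CHECK-LABEL: rotl7_64:
  ; CHECK: shldq $7
    %hi = shl i64 %x, 7
    %lo = lshr i64 %x, 57
    %r = or i64 %hi, %lo
    ret i64 %r
  }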