Amjad Aboud 3766448ba4 [X86] Improved X86::CMOV to Branch heuristic.
Resolved PR33954.
This patch contains two more constraints that aim to reduce the noise cases where we convert CMOV into branch for small gain, and end up spending more cycles due to overhead.

Differential Revision: https://reviews.llvm.org/D36081

llvm-svn: 310352
2017-08-08 12:17:56 +00:00

92 lines
3.4 KiB
LLVM

; RUN: llc -mtriple=x86_64-pc-linux -x86-cmov-converter=true -verify-machineinstrs < %s | FileCheck %s
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; This test checks that the x86-cmov-converter optimization does not transform
;; a CMOV instruction when the gain (in cycles) of converting it to a branch is
;; less than a fixed threshold (measured for "-x86-cmov-converter-threshold=4").
;;
;; Test was created using the following command line:
;; > clang -S -O2 -m64 -fno-vectorize -fno-unroll-loops -emit-llvm foo.c -o -
;; Where foo.c is:
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;int bar(int *a, int *b, int n) {
;; int sum = 0;
;; for (int i = 0; i < n; ++i) {
;; int x = a[i] * a[i+1] * a[i+2];
;; int y = b[i] * b[i+1];
;; sum += y > x ? x : 0;
;; }
;; return sum;
;;}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Running the above function under the following test driver shows that the
;; code with CMOV is 25% faster than the code with a branch.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;#define N 10000
;;int A[N];
;;int B[N];
;;
;;
;;
;;int main () {
;; for (int i=0; i< N; ++i) {
;; A[i] = i%4;
;; B[i] = i%5;
;; }
;; int sum = 0;
;; for (int i=0; i< N*10; ++i)
;; sum += bar(A, B, N);
;; return sum;
;;}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; CHECK-NOT: jg
; CHECK: cmovle
; IR for bar() from the C source quoted in the header comment. For each i in
; [0, n) it computes x = a[i]*a[i+1]*a[i+2] and y = b[i]*b[i+1], and adds x to
; the running sum when y > x (signed), or 0 otherwise. Returns the sum.
define i32 @bar(i32* nocapture readonly %a, i32* nocapture readonly %b, i32 %n) #0 {
entry:
; Skip the loop entirely when n <= 0.
%cmp30 = icmp sgt i32 %n, 0
br i1 %cmp30, label %for.body.preheader, label %for.cond.cleanup
for.body.preheader: ; preds = %entry
; Preload a[0], a[1] and b[0]; the loop carries these values in phis and only
; loads a[i+2] and b[i+1] on each iteration.
%.pre = load i32, i32* %a, align 4
%arrayidx2.phi.trans.insert = getelementptr inbounds i32, i32* %a, i64 1
%.pre34 = load i32, i32* %arrayidx2.phi.trans.insert, align 4
%.pre35 = load i32, i32* %b, align 4
; Trip count: n zero-extended to i64 to match the width of %indvars.iv.
%wide.trip.count = zext i32 %n to i64
br label %for.body
for.cond.cleanup: ; preds = %for.body, %entry
; Result is 0 when the loop was skipped, otherwise the final accumulated sum.
%sum.0.lcssa = phi i32 [ 0, %entry ], [ %add14, %for.body ]
ret i32 %sum.0.lcssa
for.body: ; preds = %for.body, %for.body.preheader
; Loop-carried values, with i = %indvars.iv:
;   %0 = b[i], %1 = a[i+1], %2 = a[i], %sum.032 = running sum.
%0 = phi i32 [ %.pre35, %for.body.preheader ], [ %5, %for.body ]
%1 = phi i32 [ %.pre34, %for.body.preheader ], [ %4, %for.body ]
%2 = phi i32 [ %.pre, %for.body.preheader ], [ %1, %for.body ]
%indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
%sum.032 = phi i32 [ 0, %for.body.preheader ], [ %add14, %for.body ]
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
; x = a[i] * a[i+1] * a[i+2]; only a[i+2] is loaded here.
%mul = mul nsw i32 %1, %2
%3 = add nuw nsw i64 %indvars.iv, 2
%arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %3
%4 = load i32, i32* %arrayidx5, align 4
%mul6 = mul nsw i32 %mul, %4
; y = b[i] * b[i+1]; only b[i+1] is loaded here.
%arrayidx11 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv.next
%5 = load i32, i32* %arrayidx11, align 4
%mul12 = mul nsw i32 %5, %0
; The select below is the CMOV candidate under test: the FileCheck lines that
; precede this function expect a cmovle and no jg in the generated code.
%cmp13 = icmp sgt i32 %mul12, %mul6
%cond = select i1 %cmp13, i32 %mul6, i32 0
%add14 = add nsw i32 %cond, %sum.032
; Leave the loop once i+1 reaches the trip count.
%exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
br i1 %exitcond, label %for.cond.cleanup, label %for.body
}
; Attribute group #0 (referenced by @bar): sets the target CPU to skylake.
attributes #0 = {"target-cpu"="skylake"}
; Module-level metadata: the module flags (wchar_size, PIC Level) and the
; identification string of the clang that produced this IR.
!llvm.module.flags = !{!0, !1}
!llvm.ident = !{!2}
!0 = !{i32 1, !"wchar_size", i32 2}
!1 = !{i32 7, !"PIC Level", i32 2}
!2 = !{!"clang version 5.0.0 (trunk)"}