mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2025-01-27 06:54:30 +00:00
3766448ba4
Resolved PR33954. This patch contains two more constraints that aim to reduce the noise cases where we convert CMOV into branch for small gain, and end up spending more cycles due to overhead. Differential Revision: https://reviews.llvm.org/D36081 llvm-svn: 310352
92 lines
3.4 KiB
LLVM
92 lines
3.4 KiB
LLVM
; RUN: llc -mtriple=x86_64-pc-linux -x86-cmov-converter=true -verify-machineinstrs < %s | FileCheck %s
|
|
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
;; This test checks that x86-cmov-converter optimization does not transform CMOV
|
|
;; instruction when the gain (in cycles) of converting to branch is less than
|
|
;; a fixed threshold (measured for "-x86-cmov-converter-threshold=4").
|
|
;;
|
|
;; Test was created using the following command line:
|
|
;; > clang -S -O2 -m64 -fno-vectorize -fno-unroll-loops -emit-llvm foo.c -o -
|
|
;; Where foo.c is:
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
;;int bar(int *a, int *b, int n) {
|
|
;; int sum = 0;
|
|
;; for (int i = 0; i < n; ++i) {
|
|
;; int x = a[i] * a[i+1] * a[i+2];
|
|
;; int y = b[i] * b[i+1];
|
|
;; sum += y > x ? x : 0;
|
|
;; }
|
|
;; return sum;
|
|
;;}
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
;; Adding a test to the above function shows code with CMOV is 25% faster than
|
|
;; the code with branch.
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
;;#define N 10000
|
|
;;int A[N];
|
|
;;int B[N];
|
|
;;
|
|
;;
|
|
;;
|
|
;;int main () {
|
|
;; for (int i=0; i< N; ++i) {
|
|
;; A[i] = i%4;
|
|
;; B[i] = i%5;
|
|
;; }
|
|
;; int sum = 0;
|
|
;; for (int i=0; i< N*10; ++i)
|
|
;; sum += bar(A, B, N);
|
|
;; return sum;
|
|
;;}
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
; CHECK-NOT: jg
|
|
; CHECK: cmovle
|
|
; @bar is the IR for the C function `bar` quoted in the header comment: for each
; i in [0, n) it computes x = a[i]*a[i+1]*a[i+2] and y = b[i]*b[i+1], and adds
; (y > x ? x : 0) to a running sum, which it returns.
; The `select` in for.body is the instruction under test: the CHECK lines of
; this file expect it to be emitted as cmovle and not as a jg branch.
define i32 @bar(i32* nocapture readonly %a, i32* nocapture readonly %b, i32 %n) #0 {
|
|
entry:
|
|
; Skip the loop entirely when n <= 0; for.cond.cleanup then returns 0.
%cmp30 = icmp sgt i32 %n, 0
|
|
br i1 %cmp30, label %for.body.preheader, label %for.cond.cleanup
|
|
|
|
for.body.preheader: ; preds = %entry
|
|
; Preload a[0], a[1] and b[0]; the loop carries these values in phis so that
; each iteration only has to load a[i+2] and b[i+1].
%.pre = load i32, i32* %a, align 4
|
|
%arrayidx2.phi.trans.insert = getelementptr inbounds i32, i32* %a, i64 1
|
|
%.pre34 = load i32, i32* %arrayidx2.phi.trans.insert, align 4
|
|
%.pre35 = load i32, i32* %b, align 4
|
|
; Trip count widened to i64 to match the 64-bit induction variable.
%wide.trip.count = zext i32 %n to i64
|
|
br label %for.body
|
|
|
|
for.cond.cleanup: ; preds = %for.body, %entry
|
|
; Result is 0 when the loop was skipped, otherwise the final accumulated sum.
%sum.0.lcssa = phi i32 [ 0, %entry ], [ %add14, %for.body ]
|
|
ret i32 %sum.0.lcssa
|
|
|
|
for.body: ; preds = %for.body, %for.body.preheader
|
|
; Loop-carried values for iteration i (i == %indvars.iv):
;   %0 = b[i], %1 = a[i+1], %2 = a[i], %sum.032 = running sum.
%0 = phi i32 [ %.pre35, %for.body.preheader ], [ %5, %for.body ]
|
|
%1 = phi i32 [ %.pre34, %for.body.preheader ], [ %4, %for.body ]
|
|
%2 = phi i32 [ %.pre, %for.body.preheader ], [ %1, %for.body ]
|
|
%indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
|
|
%sum.032 = phi i32 [ 0, %for.body.preheader ], [ %add14, %for.body ]
|
|
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
|
|
; x = a[i+1] * a[i] * a[i+2]; only a[i+2] is loaded in this iteration.
%mul = mul nsw i32 %1, %2
|
|
%3 = add nuw nsw i64 %indvars.iv, 2
|
|
%arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %3
|
|
%4 = load i32, i32* %arrayidx5, align 4
|
|
%mul6 = mul nsw i32 %mul, %4
|
|
; y = b[i+1] * b[i]; only b[i+1] is loaded in this iteration.
%arrayidx11 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv.next
|
|
%5 = load i32, i32* %arrayidx11, align 4
|
|
%mul12 = mul nsw i32 %5, %0
|
|
; Signed compare y > x, then the select under test: x if y > x, else 0.
%cmp13 = icmp sgt i32 %mul12, %mul6
|
|
%cond = select i1 %cmp13, i32 %mul6, i32 0
|
|
%add14 = add nsw i32 %cond, %sum.032
|
|
; Leave the loop once i+1 reaches the trip count.
%exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
|
|
br i1 %exitcond, label %for.cond.cleanup, label %for.body
|
|
}
|
|
|
|
; Attribute group #0, referenced by @bar: sets the target CPU to skylake.
attributes #0 = {"target-cpu"="skylake"}
|
|
|
|
; Module-level metadata: two module flags (!0, !1) and the ident string (!2).
!llvm.module.flags = !{!0, !1}
|
|
!llvm.ident = !{!2}
|
|
|
|
!0 = !{i32 1, !"wchar_size", i32 2}
|
|
!1 = !{i32 7, !"PIC Level", i32 2}
|
|
!2 = !{!"clang version 5.0.0 (trunk)"}
|