mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-12-04 01:11:44 +00:00
f4315577b8
This patch generalizes the lowering of shuffles as zero extensions to allow extensions that don't start from the first element. It now recognises extensions starting anywhere in the lower 128-bits or at the start of any higher 128-bit lane. The motivation was to reduce the number of high cost pshufb calls, but it also improves the SSE2 case as well. Differential Revision: http://reviews.llvm.org/D12561 llvm-svn: 248250
110 lines
3.8 KiB
LLVM
110 lines
3.8 KiB
LLVM
; RUN: llc -mtriple=x86_64-apple-macosx -mcpu=nocona -verify-machineinstrs < %s | FileCheck %s
|
|
|
|
; After tail duplication, two copies in an early exit BB can be cancelled out.
|
|
; rdar://10640363
|
|
define i32 @t1(i32 %a, i32 %b) nounwind {
|
|
entry:
|
|
; CHECK-LABEL: t1:
|
|
; CHECK: je [[LABEL:.*BB.*]]
|
|
%cmp1 = icmp eq i32 %b, 0
|
|
br i1 %cmp1, label %while.end, label %while.body
|
|
|
|
; CHECK: [[LABEL]]:
|
|
; CHECK-NOT: mov
|
|
; CHECK: ret
|
|
|
|
while.body: ; preds = %entry, %while.body
|
|
%a.addr.03 = phi i32 [ %b.addr.02, %while.body ], [ %a, %entry ]
|
|
%b.addr.02 = phi i32 [ %rem, %while.body ], [ %b, %entry ]
|
|
%rem = srem i32 %a.addr.03, %b.addr.02
|
|
%cmp = icmp eq i32 %rem, 0
|
|
br i1 %cmp, label %while.end, label %while.body
|
|
|
|
while.end: ; preds = %while.body, %entry
|
|
%a.addr.0.lcssa = phi i32 [ %a, %entry ], [ %b.addr.02, %while.body ]
|
|
ret i32 %a.addr.0.lcssa
|
|
}
|
|
|
|
; Two movdqa (from phi-elimination) in the entry BB cancels out.
|
|
; rdar://10428165
|
|
define <8 x i16> @t2(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
|
|
entry:
|
|
; CHECK-LABEL: t2:
|
|
; CHECK-NOT: movdqa
|
|
%tmp8 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 undef, i32 undef, i32 7, i32 2, i32 8, i32 undef, i32 undef , i32 undef >
|
|
ret <8 x i16> %tmp8
|
|
}
|
|
|
|
define i32 @t3(i64 %a, i64 %b) nounwind {
|
|
entry:
|
|
; CHECK-LABEL: t3:
|
|
; CHECK: je [[LABEL:.*BB.*]]
|
|
%cmp1 = icmp eq i64 %b, 0
|
|
br i1 %cmp1, label %while.end, label %while.body
|
|
|
|
; CHECK: [[LABEL]]:
|
|
; CHECK-NOT: mov
|
|
; CHECK: ret
|
|
|
|
while.body: ; preds = %entry, %while.body
|
|
%a.addr.03 = phi i64 [ %b.addr.02, %while.body ], [ %a, %entry ]
|
|
%b.addr.02 = phi i64 [ %rem, %while.body ], [ %b, %entry ]
|
|
%rem = srem i64 %a.addr.03, %b.addr.02
|
|
%cmp = icmp eq i64 %rem, 0
|
|
br i1 %cmp, label %while.end, label %while.body
|
|
|
|
while.end: ; preds = %while.body, %entry
|
|
%a.addr.0.lcssa = phi i64 [ %a, %entry ], [ %b.addr.02, %while.body ]
|
|
%t = trunc i64 %a.addr.0.lcssa to i32
|
|
ret i32 %t
|
|
}
|
|
|
|
; Check that copy propagation does not kill thing like:
|
|
; dst = copy src <-- do not kill that.
|
|
; ... = op1 dst<undef>
|
|
; ... = op2 dst <-- this is used here.
|
|
;
|
|
; CHECK-LABEL: foo:
|
|
; CHECK: psllw $7,
|
|
; CHECK: psllw $7, [[SRC1:%xmm[0-9]+]]
|
|
; CHECK-NEXT: pand {{.*}}(%rip), [[SRC1]]
|
|
; CHECK-NEXT: pcmpgtb [[SRC1]], [[SRC2:%xmm[0-9]+]]
|
|
; CHECK-NEXT: pand %xmm{{[0-9]+}}, [[SRC2]]
|
|
; CHECK-NEXT: movdqa [[SRC2]], [[CPY1:%xmm[0-9]+]]
|
|
; CHECK-NEXT: punpcklbw %xmm{{[0-9]+}}, [[CPY1]]
|
|
; Check that CPY1 is not redefined.
|
|
; CHECK-NOT: , [[CPY1]]
|
|
; CHECK: punpckhwd %xmm{{[0-9]+}}, [[CPY1]]
|
|
; CHECK-NEXT: pslld $31, [[CPY1]]
|
|
; CHECK-NEXT: psrad $31, [[CPY1]]
|
|
; CHECK: punpckhbw %xmm{{[0-9]+}}, [[CPY2:%xmm[0-9]+]]
|
|
; Check that CPY2 is not redefined.
|
|
; CHECK-NOT: , [[CPY2]]
|
|
; CHECK: punpckhwd %xmm{{[0-9]+}}, [[CPY2]]
|
|
; CHECK-NEXT: pslld $31, [[CPY2]]
|
|
; CHECK-NEXT: psrad $31, [[CPY2]]
|
|
define <16 x float> @foo(<16 x float> %x) {
|
|
bb:
|
|
%v3 = icmp slt <16 x i32> undef, zeroinitializer
|
|
%v14 = zext <16 x i1> %v3 to <16 x i32>
|
|
%v16 = fcmp olt <16 x float> %x, zeroinitializer
|
|
%v17 = sext <16 x i1> %v16 to <16 x i32>
|
|
%v18 = zext <16 x i1> %v16 to <16 x i32>
|
|
%v19 = xor <16 x i32> %v14, %v18
|
|
%v20 = or <16 x i32> %v17, undef
|
|
%v21 = fptosi <16 x float> %x to <16 x i32>
|
|
%v22 = sitofp <16 x i32> %v21 to <16 x float>
|
|
%v69 = fcmp ogt <16 x float> %v22, zeroinitializer
|
|
%v75 = and <16 x i1> %v69, %v3
|
|
%v77 = bitcast <16 x float> %v22 to <16 x i32>
|
|
%v79 = sext <16 x i1> %v75 to <16 x i32>
|
|
%v80 = and <16 x i32> undef, %v79
|
|
%v81 = xor <16 x i32> %v77, %v80
|
|
%v82 = and <16 x i32> undef, %v81
|
|
%v83 = xor <16 x i32> %v19, %v82
|
|
%v84 = and <16 x i32> %v83, %v20
|
|
%v85 = xor <16 x i32> %v19, %v84
|
|
%v86 = bitcast <16 x i32> %v85 to <16 x float>
|
|
ret <16 x float> %v86
|
|
}
|