mirror of
https://github.com/RPCS3/llvm.git
synced 2025-01-12 07:21:56 +00:00
98eac0a244
has settled without incident, removing the x86-specific and overly strict 'isVectorSplat' routine in favor of generic and more powerful splat detection. The primary motivation and result of this is that the x86 backend can now see through splats which contain undef elements. This is essential if we are using a widening form of legalization and I've updated a test case to also run in that mode as before this change the generated code for the test case was completely scalarized. This version of the patch much more carefully handles the undef lanes. - We aren't overly conservative about them in the shift lowering (where we will never use the splat itself). - One place where the splat would have been re-used by the existing code now explicitly constructs a new constant splat that will be safe. - The broadcast lowering is much more reasonable with undefs by doing a correct check of whether the splat is the only user of a loaded value, checking that the splat actually crosses multiple lanes before using a broadcast, and handling broadcasts of non-constant splats. As a consequence of the last bullet, the weird usage of vpshufd instead of vbroadcast is gone, and we actually can lower an AVX splat with vbroadcastss where before we emitted a really strange pattern of a vector load and a manual splat across the vector. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@212602 91177308-0d34-0410-b5e6-96231b3b80d8
79 lines
2.9 KiB
LLVM
79 lines
2.9 KiB
LLVM
; RUN: llc < %s -march=x86 -mattr=+sse4.2 | FileCheck %s
|
|
; RUN: llc < %s -march=x86 -mattr=+sse4.2 -x86-experimental-vector-widening-legalization | FileCheck %s --check-prefix=CHECK-WIDE
|
|
|
|
define void @update(i64* %dst_i, i64* %src_i, i32 %n) nounwind {
|
|
; CHECK-LABEL: update:
|
|
; CHECK-WIDE-LABEL: update:
|
|
entry:
|
|
%dst_i.addr = alloca i64* ; <i64**> [#uses=2]
|
|
%src_i.addr = alloca i64* ; <i64**> [#uses=2]
|
|
%n.addr = alloca i32 ; <i32*> [#uses=2]
|
|
%i = alloca i32, align 4 ; <i32*> [#uses=8]
|
|
%dst = alloca <8 x i8>*, align 4 ; <<8 x i8>**> [#uses=2]
|
|
%src = alloca <8 x i8>*, align 4 ; <<8 x i8>**> [#uses=2]
|
|
store i64* %dst_i, i64** %dst_i.addr
|
|
store i64* %src_i, i64** %src_i.addr
|
|
store i32 %n, i32* %n.addr
|
|
store i32 0, i32* %i
|
|
br label %forcond
|
|
|
|
forcond: ; preds = %forinc, %entry
|
|
%tmp = load i32* %i ; <i32> [#uses=1]
|
|
%tmp1 = load i32* %n.addr ; <i32> [#uses=1]
|
|
%cmp = icmp slt i32 %tmp, %tmp1 ; <i1> [#uses=1]
|
|
br i1 %cmp, label %forbody, label %afterfor
|
|
|
|
forbody: ; preds = %forcond
|
|
%tmp2 = load i32* %i ; <i32> [#uses=1]
|
|
%tmp3 = load i64** %dst_i.addr ; <i64*> [#uses=1]
|
|
%arrayidx = getelementptr i64* %tmp3, i32 %tmp2 ; <i64*> [#uses=1]
|
|
%conv = bitcast i64* %arrayidx to <8 x i8>* ; <<8 x i8>*> [#uses=1]
|
|
store <8 x i8>* %conv, <8 x i8>** %dst
|
|
%tmp4 = load i32* %i ; <i32> [#uses=1]
|
|
%tmp5 = load i64** %src_i.addr ; <i64*> [#uses=1]
|
|
%arrayidx6 = getelementptr i64* %tmp5, i32 %tmp4 ; <i64*> [#uses=1]
|
|
%conv7 = bitcast i64* %arrayidx6 to <8 x i8>* ; <<8 x i8>*> [#uses=1]
|
|
store <8 x i8>* %conv7, <8 x i8>** %src
|
|
%tmp8 = load i32* %i ; <i32> [#uses=1]
|
|
%tmp9 = load <8 x i8>** %dst ; <<8 x i8>*> [#uses=1]
|
|
%arrayidx10 = getelementptr <8 x i8>* %tmp9, i32 %tmp8 ; <<8 x i8>*> [#uses=1]
|
|
%tmp11 = load i32* %i ; <i32> [#uses=1]
|
|
%tmp12 = load <8 x i8>** %src ; <<8 x i8>*> [#uses=1]
|
|
%arrayidx13 = getelementptr <8 x i8>* %tmp12, i32 %tmp11 ; <<8 x i8>*> [#uses=1]
|
|
%tmp14 = load <8 x i8>* %arrayidx13 ; <<8 x i8>> [#uses=1]
|
|
%add = add <8 x i8> %tmp14, < i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1 > ; <<8 x i8>> [#uses=1]
|
|
%shr = ashr <8 x i8> %add, < i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2 > ; <<8 x i8>> [#uses=1]
|
|
store <8 x i8> %shr, <8 x i8>* %arrayidx10
|
|
br label %forinc
|
|
; CHECK: %forbody
|
|
; CHECK: pmovzxbw
|
|
; CHECK-NEXT: paddw
|
|
; CHECK-NEXT: psllw $8
|
|
; CHECK-NEXT: psraw $8
|
|
; CHECK-NEXT: psraw $2
|
|
; CHECK-NEXT: pshufb
|
|
; CHECK-NEXT: movlpd
|
|
;
|
|
; FIXME: We shouldn't require both a movd and an insert.
|
|
; CHECK-WIDE: %forbody
|
|
; CHECK-WIDE: movd
|
|
; CHECK-WIDE-NEXT: pinsrd
|
|
; CHECK-WIDE-NEXT: paddb
|
|
; CHECK-WIDE-NEXT: psrlw $2
|
|
; CHECK-WIDE-NEXT: pand
|
|
; CHECK-WIDE-NEXT: pxor
|
|
; CHECK-WIDE-NEXT: psubb
|
|
; CHECK-WIDE-NEXT: pextrd
|
|
; CHECK-WIDE-NEXT: movd
|
|
|
|
forinc: ; preds = %forbody
|
|
%tmp15 = load i32* %i ; <i32> [#uses=1]
|
|
%inc = add i32 %tmp15, 1 ; <i32> [#uses=1]
|
|
store i32 %inc, i32* %i
|
|
br label %forcond
|
|
|
|
afterfor: ; preds = %forcond
|
|
ret void
|
|
}
|
|
|