mirror of
https://github.com/RPCS3/llvm.git
synced 2025-01-26 20:57:15 +00:00
6539887847
We generate broadcast instructions on CPUs with AVX2 to load some constant splat vectors. This patch should preserve all existing behavior with regular optimization levels, but also use splats whenever possible when optimizing for *size* on any CPU with AVX or AVX2. The tradeoff is up to 5 extra instruction bytes for the broadcast instruction to save at least 8 bytes (up to 31 bytes) of constant pool data. Differential Revision: http://reviews.llvm.org/D5347 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@218263 91177308-0d34-0410-b5e6-96231b3b80d8
142 lines
4.6 KiB
LLVM
142 lines
4.6 KiB
LLVM
; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=avx < %s | FileCheck %s -check-prefix=CHECK --check-prefix=AVX
|
|
; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=avx2 < %s | FileCheck %s -check-prefix=CHECK --check-prefix=AVX2
|
|
|
|
; Check that constant loads of every 128-bit and 256-bit vector type use the
; splat ops available with AVX and AVX2 when optimizing for size.
|
|
|
|
; v2f64: AVX has no broadcast from double to a 128-bit vector because movddup
; has been around since SSE3, so vmovddup is what the checks expect here.
define <2 x double> @splat_v2f64(<2 x double> %x) #0 {
; CHECK-LABEL: splat_v2f64
; CHECK: vmovddup
; CHECK: vaddpd
; CHECK-NEXT: retq
  %sum = fadd <2 x double> %x, <double 1.000000e+00, double 1.000000e+00>
  ret <2 x double> %sum
}
|
|
|
|
; v4f64: the splat constant is expected to be materialized with vbroadcastsd,
; directly followed by the vaddpd and the return, under both RUN lines.
define <4 x double> @splat_v4f64(<4 x double> %x) #0 {
; CHECK-LABEL: splat_v4f64
; CHECK: vbroadcastsd
; CHECK-NEXT: vaddpd
; CHECK-NEXT: retq
  %sum = fadd <4 x double> %x, <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>
  ret <4 x double> %sum
}
|
|
|
|
; v4f32: the splat constant is expected to be materialized with vbroadcastss,
; directly followed by the vaddps and the return, under both RUN lines.
define <4 x float> @splat_v4f32(<4 x float> %x) #0 {
; CHECK-LABEL: splat_v4f32
; CHECK: vbroadcastss
; CHECK-NEXT: vaddps
; CHECK-NEXT: retq
  %sum = fadd <4 x float> %x, <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
  ret <4 x float> %sum
}
|
|
|
|
; v8f32: same expectation as the 4-element float case, only on an 8-element
; vector: vbroadcastss, then vaddps, then the return, under both RUN lines.
define <8 x float> @splat_v8f32(<8 x float> %x) #0 {
; CHECK-LABEL: splat_v8f32
; CHECK: vbroadcastss
; CHECK-NEXT: vaddps
; CHECK-NEXT: retq
  %sum = fadd <8 x float> %x, <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
  ret <8 x float> %sum
}
|
|
|
|
; v2i64: AVX has no integer splat, so the 64-bit value is splatted with
; vmovddup instead. vmovddup is also generated for AVX2 because it is one
; byte smaller than vpbroadcastq.
define <2 x i64> @splat_v2i64(<2 x i64> %x) #0 {
; CHECK-LABEL: splat_v2i64
; CHECK: vmovddup
; CHECK: vpaddq
; CHECK-NEXT: retq
  %sum = add <2 x i64> %x, <i64 1, i64 1>
  ret <2 x i64> %sum
}
|
|
|
|
; v4i64: AVX has no 256-bit integer ops, so the add is split into two 128-bit
; vectors (hence two vpaddq) and the 64-bit value is splatted with vmovddup.
; With AVX2 the checks expect vpbroadcastq followed by vpaddq.
define <4 x i64> @splat_v4i64(<4 x i64> %x) #0 {
; CHECK-LABEL: splat_v4i64
; AVX: vmovddup
; AVX: vpaddq
; AVX: vpaddq
; AVX2: vpbroadcastq
; AVX2: vpaddq
; CHECK: retq
  %sum = add <4 x i64> %x, <i64 1, i64 1, i64 1, i64 1>
  ret <4 x i64> %sum
}
|
|
|
|
; v4i32: AVX has no integer splat, so the 32-bit value is splatted with
; vbroadcastss; with AVX2 the checks expect vpbroadcastd. In both cases the
; vpaddd and the return must follow directly.
define <4 x i32> @splat_v4i32(<4 x i32> %x) #0 {
; CHECK-LABEL: splat_v4i32
; AVX: vbroadcastss
; AVX2: vpbroadcastd
; CHECK-NEXT: vpaddd
; CHECK-NEXT: retq
  %sum = add <4 x i32> %x, <i32 1, i32 1, i32 1, i32 1>
  ret <4 x i32> %sum
}
|
|
|
|
; v8i32: AVX has no integer splat, so the 32-bit value is splatted with
; vbroadcastss and two vpaddd are expected. With AVX2 the checks expect
; vpbroadcastd followed by vpaddd.
define <8 x i32> @splat_v8i32(<8 x i32> %x) #0 {
; CHECK-LABEL: splat_v8i32
; AVX: vbroadcastss
; AVX: vpaddd
; AVX: vpaddd
; AVX2: vpbroadcastd
; AVX2: vpaddd
; CHECK: retq
  %sum = add <8 x i32> %x, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ret <8 x i32> %sum
}
|
|
|
|
; v8i16: AVX has no integer splat and there is no broadcast workaround for
; 16-bit elements, so no broadcast may appear before the vpaddw (pshuflw etc.
; could be an option). With AVX2 the checks expect vpbroadcastw.
define <8 x i16> @splat_v8i16(<8 x i16> %x) #0 {
; CHECK-LABEL: splat_v8i16
; AVX-NOT: broadcast
; AVX2: vpbroadcastw
; CHECK: vpaddw
; CHECK-NEXT: retq
  %sum = add <8 x i16> %x, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  ret <8 x i16> %sum
}
|
|
|
|
; v16i16: AVX has no integer splat and there is no broadcast workaround for
; 16-bit elements, so no broadcast may appear before the two vpaddw (pshuflw
; etc. could be an option). With AVX2 the checks expect vpbroadcastw followed
; by vpaddw.
define <16 x i16> @splat_v16i16(<16 x i16> %x) #0 {
; CHECK-LABEL: splat_v16i16
; AVX-NOT: broadcast
; AVX: vpaddw
; AVX: vpaddw
; AVX2: vpbroadcastw
; AVX2: vpaddw
; CHECK: retq
  %sum = add <16 x i16> %x, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  ret <16 x i16> %sum
}
|
|
|
|
; v16i8: AVX has no integer splat and there is no broadcast workaround for
; 8-bit elements, so no broadcast may appear before the vpaddb (pshufb etc.
; could be an option). With AVX2 the checks expect vpbroadcastb.
define <16 x i8> @splat_v16i8(<16 x i8> %x) #0 {
; CHECK-LABEL: splat_v16i8
; AVX-NOT: broadcast
; AVX2: vpbroadcastb
; CHECK: vpaddb
; CHECK-NEXT: retq
  %sum = add <16 x i8> %x, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  ret <16 x i8> %sum
}
|
|
|
|
; v32i8: AVX has no integer splat and there is no broadcast workaround for
; 8-bit elements, so no broadcast may appear before the two vpaddb (pshufb
; etc. could be an option). With AVX2 the checks expect vpbroadcastb followed
; by vpaddb.
define <32 x i8> @splat_v32i8(<32 x i8> %x) #0 {
; CHECK-LABEL: splat_v32i8
; AVX-NOT: broadcast
; AVX: vpaddb
; AVX: vpaddb
; AVX2: vpbroadcastb
; AVX2: vpaddb
; CHECK: retq
  %sum = add <32 x i8> %x, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  ret <32 x i8> %sum
}
|
|
|
|
; Attribute group referenced as #0 by the test functions in this file: it
; marks them optsize, the size-optimization mode these checks are written for.
attributes #0 = { optsize }
|