mirror of
https://github.com/RPCSX/llvm.git
synced 2025-01-15 08:58:51 +00:00
6df35e7844
Patch to allow int8 vectors to be multiplied on the SSE unit instead of being scalarized. The patch sign extends the i8 lanes to i16, uses the SSE2 pmullw multiplication instruction, then packs the lower byte from each result. Differential Revision: http://reviews.llvm.org/D9115 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@235837 91177308-0d34-0410-b5e6-96231b3b80d8
212 lines
5.8 KiB
LLVM
212 lines
5.8 KiB
LLVM
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s
|
|
|
|
; Lane-wise add of two <4 x i64> vectors; the expected lowering is a single
; vpaddq operating on ymm registers. The label pins the match to this
; function's body so it cannot be satisfied by a vpaddq emitted elsewhere.
; CHECK-LABEL: test_vpaddq:
; CHECK: vpaddq %ymm
define <4 x i64> @test_vpaddq(<4 x i64> %i, <4 x i64> %j) nounwind readnone {
  %x = add <4 x i64> %i, %j
  ret <4 x i64> %x
}
|
|
|
|
; Lane-wise add of two <8 x i32> vectors; the expected lowering is a single
; vpaddd operating on ymm registers. The label pins the match to this
; function's body.
; CHECK-LABEL: test_vpaddd:
; CHECK: vpaddd %ymm
define <8 x i32> @test_vpaddd(<8 x i32> %i, <8 x i32> %j) nounwind readnone {
  %x = add <8 x i32> %i, %j
  ret <8 x i32> %x
}
|
|
|
|
; Lane-wise add of two <16 x i16> vectors; the expected lowering is a single
; vpaddw operating on ymm registers. The label pins the match to this
; function's body.
; CHECK-LABEL: test_vpaddw:
; CHECK: vpaddw %ymm
define <16 x i16> @test_vpaddw(<16 x i16> %i, <16 x i16> %j) nounwind readnone {
  %x = add <16 x i16> %i, %j
  ret <16 x i16> %x
}
|
|
|
|
; Lane-wise add of two <32 x i8> vectors; the expected lowering is a single
; vpaddb operating on ymm registers. The label pins the match to this
; function's body.
; CHECK-LABEL: test_vpaddb:
; CHECK: vpaddb %ymm
define <32 x i8> @test_vpaddb(<32 x i8> %i, <32 x i8> %j) nounwind readnone {
  %x = add <32 x i8> %i, %j
  ret <32 x i8> %x
}
|
|
|
|
; Lane-wise subtract of two <4 x i64> vectors; the expected lowering is a
; single vpsubq operating on ymm registers. The label pins the match to this
; function's body.
; CHECK-LABEL: test_vpsubq:
; CHECK: vpsubq %ymm
define <4 x i64> @test_vpsubq(<4 x i64> %i, <4 x i64> %j) nounwind readnone {
  %x = sub <4 x i64> %i, %j
  ret <4 x i64> %x
}
|
|
|
|
; Lane-wise subtract of two <8 x i32> vectors; the expected lowering is a
; single vpsubd operating on ymm registers. The label pins the match to this
; function's body.
; CHECK-LABEL: test_vpsubd:
; CHECK: vpsubd %ymm
define <8 x i32> @test_vpsubd(<8 x i32> %i, <8 x i32> %j) nounwind readnone {
  %x = sub <8 x i32> %i, %j
  ret <8 x i32> %x
}
|
|
|
|
; Lane-wise subtract of two <16 x i16> vectors; the expected lowering is a
; single vpsubw operating on ymm registers. The label pins the match to this
; function's body.
; CHECK-LABEL: test_vpsubw:
; CHECK: vpsubw %ymm
define <16 x i16> @test_vpsubw(<16 x i16> %i, <16 x i16> %j) nounwind readnone {
  %x = sub <16 x i16> %i, %j
  ret <16 x i16> %x
}
|
|
|
|
; Lane-wise subtract of two <32 x i8> vectors; the expected lowering is a
; single vpsubb operating on ymm registers. The label pins the match to this
; function's body.
; CHECK-LABEL: test_vpsubb:
; CHECK: vpsubb %ymm
define <32 x i8> @test_vpsubb(<32 x i8> %i, <32 x i8> %j) nounwind readnone {
  %x = sub <32 x i8> %i, %j
  ret <32 x i8> %x
}
|
|
|
|
; Lane-wise multiply of two <8 x i32> vectors; the expected lowering is a
; single vpmulld operating on ymm registers. The label pins the match to this
; function's body.
; CHECK-LABEL: test_vpmulld:
; CHECK: vpmulld %ymm
define <8 x i32> @test_vpmulld(<8 x i32> %i, <8 x i32> %j) nounwind readnone {
  %x = mul <8 x i32> %i, %j
  ret <8 x i32> %x
}
|
|
|
|
; Lane-wise multiply of two <16 x i16> vectors; the expected lowering is a
; single vpmullw operating on ymm registers. The label pins the match to this
; function's body.
; CHECK-LABEL: test_vpmullw:
; CHECK: vpmullw %ymm
define <16 x i16> @test_vpmullw(<16 x i16> %i, <16 x i16> %j) nounwind readnone {
  %x = mul <16 x i16> %i, %j
  ret <16 x i16> %x
}
|
|
|
|
; Multiply of two <16 x i8> vectors. The expected sequence sign-extends both
; operands to 16-bit lanes (vpmovsxbw), multiplies with vpmullw, then gathers
; the low byte of every 16-bit product with vpshufb and recombines the two
; halves with vpunpcklqdq.
;
; The symbol name contains '-', so the assembler output may print it quoted;
; the label pattern accepts an optional closing quote before the colon.
; CHECK-LABEL: mul-v16i8{{"?}}:
; CHECK: # BB#0:
; CHECK-NEXT: vpmovsxbw %xmm1, %ymm1
; CHECK-NEXT: vpmovsxbw %xmm0, %ymm0
; CHECK-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; CHECK-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; CHECK-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
define <16 x i8> @mul-v16i8(<16 x i8> %i, <16 x i8> %j) nounwind readnone {
  %x = mul <16 x i8> %i, %j
  ret <16 x i8> %x
}
|
|
|
|
; Multiply of two <32 x i8> vectors. The expected sequence handles the upper
; and lower 128-bit halves separately: each half is sign-extended to 16-bit
; lanes (vpmovsxbw), multiplied with vpmullw, and narrowed back to bytes with
; vpshufb + vpunpcklqdq; vinserti128 then joins the two results.
;
; The symbol name contains '-', so the assembler output may print it quoted;
; the label pattern accepts an optional closing quote before the colon.
; CHECK-LABEL: mul-v32i8{{"?}}:
; CHECK: # BB#0:
; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm2
; CHECK-NEXT: vpmovsxbw %xmm2, %ymm2
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm3
; CHECK-NEXT: vpmovsxbw %xmm3, %ymm3
; CHECK-NEXT: vpmullw %ymm2, %ymm3, %ymm2
; CHECK-NEXT: vextracti128 $1, %ymm2, %xmm3
; CHECK-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; CHECK-NEXT: vpshufb %xmm4, %xmm3, %xmm3
; CHECK-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; CHECK-NEXT: vpmovsxbw %xmm1, %ymm1
; CHECK-NEXT: vpmovsxbw %xmm0, %ymm0
; CHECK-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT: vpshufb %xmm4, %xmm1, %xmm1
; CHECK-NEXT: vpshufb %xmm4, %xmm0, %xmm0
; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; CHECK-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; CHECK-NEXT: retq
define <32 x i8> @mul-v32i8(<32 x i8> %i, <32 x i8> %j) nounwind readnone {
  %x = mul <32 x i8> %i, %j
  ret <32 x i8> %x
}
|
|
|
|
; Multiply of two <4 x i64> vectors. The expected sequence builds the 64-bit
; product from 32-bit partial products: vpmuludq for the low halves, then two
; rounds of (vpsrlq $32, vpmuludq, vpsllq $32, vpaddq) for the cross terms.
;
; The symbol name contains '-', so the assembler output may print it quoted;
; the label pattern accepts an optional closing quote before the colon.
; CHECK-LABEL: mul-v4i64{{"?}}:
; CHECK: vpmuludq %ymm
; CHECK-NEXT: vpsrlq $32, %ymm
; CHECK-NEXT: vpmuludq %ymm
; CHECK-NEXT: vpsllq $32, %ymm
; CHECK-NEXT: vpaddq %ymm
; CHECK-NEXT: vpsrlq $32, %ymm
; CHECK-NEXT: vpmuludq %ymm
; CHECK-NEXT: vpsllq $32, %ymm
; CHECK-NEXT: vpaddq %ymm
define <4 x i64> @mul-v4i64(<4 x i64> %i, <4 x i64> %j) nounwind readnone {
  %x = mul <4 x i64> %i, %j
  ret <4 x i64> %x
}
|
|
|
|
; Multiply of <8 x i32> by a splat of 2; the expected lowering replaces the
; multiply with an add (vpaddd). The trailing colon in the label keeps it from
; matching the longer names mul_const10 / mul_const11.
; CHECK-LABEL: mul_const1:
; CHECK: vpaddd
; CHECK: ret
define <8 x i32> @mul_const1(<8 x i32> %x) {
  %y = mul <8 x i32> %x, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  ret <8 x i32> %y
}
|
|
|
|
; Multiply of <4 x i64> by a splat of 4; the expected lowering is a left
; shift by 2 (vpsllq $2). The label pins the match to this function's body.
; CHECK-LABEL: mul_const2:
; CHECK: vpsllq $2
; CHECK: ret
define <4 x i64> @mul_const2(<4 x i64> %x) {
  %y = mul <4 x i64> %x, <i64 4, i64 4, i64 4, i64 4>
  ret <4 x i64> %y
}
|
|
|
|
; Multiply of <16 x i16> by a splat of 8; the expected lowering is a left
; shift by 3 (vpsllw $3). The label pins the match to this function's body.
; CHECK-LABEL: mul_const3:
; CHECK: vpsllw $3
; CHECK: ret
define <16 x i16> @mul_const3(<16 x i16> %x) {
  %y = mul <16 x i16> %x, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  ret <16 x i16> %y
}
|
|
|
|
; Multiply of <4 x i64> by a splat of -1; the expected lowering is a vpxor
; followed by a vpsubq (presumably zeroing a register and subtracting the
; input from it to negate). The label pins the match to this function's body.
; CHECK-LABEL: mul_const4:
; CHECK: vpxor
; CHECK: vpsubq
; CHECK: ret
define <4 x i64> @mul_const4(<4 x i64> %x) {
  %y = mul <4 x i64> %x, <i64 -1, i64 -1, i64 -1, i64 -1>
  ret <4 x i64> %y
}
|
|
|
|
; Multiply of <8 x i32> by an all-zero vector; the expected output is a
; vxorps immediately followed by the return, with no multiply. The label pins
; the match to this function's body.
; CHECK-LABEL: mul_const5:
; CHECK: vxorps
; CHECK-NEXT: ret
define <8 x i32> @mul_const5(<8 x i32> %x) {
  %y = mul <8 x i32> %x, <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
  ret <8 x i32> %y
}
|
|
|
|
; Multiply of <8 x i32> by a non-uniform constant (lanes 3 and 5 are 2, the
; rest are 0); a real vpmulld is expected here. The label pins the match to
; this function's body.
; CHECK-LABEL: mul_const6:
; CHECK: vpmulld
; CHECK: ret
define <8 x i32> @mul_const6(<8 x i32> %x) {
  %y = mul <8 x i32> %x, <i32 0, i32 0, i32 0, i32 2, i32 0, i32 2, i32 0, i32 0>
  ret <8 x i32> %y
}
|
|
|
|
; Multiply of <8 x i64> by a splat of 2; two vpaddq instructions are expected
; (presumably one per 256-bit half of the 512-bit value). The label pins the
; match to this function's body.
; CHECK-LABEL: mul_const7:
; CHECK: vpaddq
; CHECK: vpaddq
; CHECK: ret
define <8 x i64> @mul_const7(<8 x i64> %x) {
  %y = mul <8 x i64> %x, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
  ret <8 x i64> %y
}
|
|
|
|
; Multiply of a 128-bit <8 x i16> vector by a splat of 8; the expected
; lowering is a left shift by 3 (vpsllw $3). The label pins the match to this
; function's body.
; CHECK-LABEL: mul_const8:
; CHECK: vpsllw $3
; CHECK: ret
define <8 x i16> @mul_const8(<8 x i16> %x) {
  %y = mul <8 x i16> %x, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  ret <8 x i16> %y
}
|
|
|
|
; Multiply of <8 x i32> by a constant that is 2 in lane 0 and 0 in every
; other lane; a real vpmulld is expected. The label pins the match to this
; function's body.
; CHECK-LABEL: mul_const9:
; CHECK: vpmulld
; CHECK: ret
define <8 x i32> @mul_const9(<8 x i32> %x) {
  %y = mul <8 x i32> %x, <i32 2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
  ret <8 x i32> %y
}
|
|
|
|
; Multiply of <4 x i32> by a splat of 0x01010101; a real vpmulld is expected.
; The label pins the match to this function's body.
; CHECK-LABEL: mul_const10:
; CHECK: vpmulld
; CHECK: ret
define <4 x i32> @mul_const10(<4 x i32> %x) {
  ; %x * 0x01010101
  %m = mul <4 x i32> %x, <i32 16843009, i32 16843009, i32 16843009, i32 16843009>
  ret <4 x i32> %m
}
|
|
|
|
; Multiply of <4 x i32> by a splat of 0x80808080; a real vpmulld is expected.
; The label pins the match to this function's body.
; CHECK-LABEL: mul_const11:
; CHECK: vpmulld
; CHECK: ret
define <4 x i32> @mul_const11(<4 x i32> %x) {
  ; %x * 0x80808080
  %m = mul <4 x i32> %x, <i32 2155905152, i32 2155905152, i32 2155905152, i32 2155905152>
  ret <4 x i32> %m
}
|