llvm/test/CodeGen/X86/avx2-arith.ll
Simon Pilgrim 6df35e7844 [X86][SSE] Add v16i8/v32i8 multiplication support
Patch to allow int8 vectors to be multiplied on the SSE unit instead of being scalarized.

The patch sign extends the i8 lanes to i16, uses the SSE2 pmullw multiplication instruction, then packs the lower byte from each result.

Differential Revision: http://reviews.llvm.org/D9115

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@235837 91177308-0d34-0410-b5e6-96231b3b80d8
2015-04-27 07:55:46 +00:00

212 lines
5.8 KiB
LLVM

; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s
; CHECK: vpaddq %ymm
define <4 x i64> @test_vpaddq(<4 x i64> %i, <4 x i64> %j) nounwind readnone {
%x = add <4 x i64> %i, %j
ret <4 x i64> %x
}
; CHECK: vpaddd %ymm
define <8 x i32> @test_vpaddd(<8 x i32> %i, <8 x i32> %j) nounwind readnone {
%x = add <8 x i32> %i, %j
ret <8 x i32> %x
}
; CHECK: vpaddw %ymm
define <16 x i16> @test_vpaddw(<16 x i16> %i, <16 x i16> %j) nounwind readnone {
%x = add <16 x i16> %i, %j
ret <16 x i16> %x
}
; CHECK: vpaddb %ymm
define <32 x i8> @test_vpaddb(<32 x i8> %i, <32 x i8> %j) nounwind readnone {
%x = add <32 x i8> %i, %j
ret <32 x i8> %x
}
; CHECK: vpsubq %ymm
define <4 x i64> @test_vpsubq(<4 x i64> %i, <4 x i64> %j) nounwind readnone {
%x = sub <4 x i64> %i, %j
ret <4 x i64> %x
}
; CHECK: vpsubd %ymm
define <8 x i32> @test_vpsubd(<8 x i32> %i, <8 x i32> %j) nounwind readnone {
%x = sub <8 x i32> %i, %j
ret <8 x i32> %x
}
; CHECK: vpsubw %ymm
define <16 x i16> @test_vpsubw(<16 x i16> %i, <16 x i16> %j) nounwind readnone {
%x = sub <16 x i16> %i, %j
ret <16 x i16> %x
}
; CHECK: vpsubb %ymm
define <32 x i8> @test_vpsubb(<32 x i8> %i, <32 x i8> %j) nounwind readnone {
%x = sub <32 x i8> %i, %j
ret <32 x i8> %x
}
; CHECK: vpmulld %ymm
define <8 x i32> @test_vpmulld(<8 x i32> %i, <8 x i32> %j) nounwind readnone {
%x = mul <8 x i32> %i, %j
ret <8 x i32> %x
}
; CHECK: vpmullw %ymm
define <16 x i16> @test_vpmullw(<16 x i16> %i, <16 x i16> %j) nounwind readnone {
%x = mul <16 x i16> %i, %j
ret <16 x i16> %x
}
; CHECK: mul-v16i8
; CHECK: # BB#0:
; CHECK-NEXT: vpmovsxbw %xmm1, %ymm1
; CHECK-NEXT: vpmovsxbw %xmm0, %ymm0
; CHECK-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; CHECK-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; CHECK-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
define <16 x i8> @mul-v16i8(<16 x i8> %i, <16 x i8> %j) nounwind readnone {
%x = mul <16 x i8> %i, %j
ret <16 x i8> %x
}
; CHECK: mul-v32i8
; CHECK: # BB#0:
; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm2
; CHECK-NEXT: vpmovsxbw %xmm2, %ymm2
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm3
; CHECK-NEXT: vpmovsxbw %xmm3, %ymm3
; CHECK-NEXT: vpmullw %ymm2, %ymm3, %ymm2
; CHECK-NEXT: vextracti128 $1, %ymm2, %xmm3
; CHECK-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; CHECK-NEXT: vpshufb %xmm4, %xmm3, %xmm3
; CHECK-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; CHECK-NEXT: vpmovsxbw %xmm1, %ymm1
; CHECK-NEXT: vpmovsxbw %xmm0, %ymm0
; CHECK-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT: vpshufb %xmm4, %xmm1, %xmm1
; CHECK-NEXT: vpshufb %xmm4, %xmm0, %xmm0
; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; CHECK-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; CHECK-NEXT: retq
define <32 x i8> @mul-v32i8(<32 x i8> %i, <32 x i8> %j) nounwind readnone {
%x = mul <32 x i8> %i, %j
ret <32 x i8> %x
}
; CHECK: mul-v4i64
; CHECK: vpmuludq %ymm
; CHECK-NEXT: vpsrlq $32, %ymm
; CHECK-NEXT: vpmuludq %ymm
; CHECK-NEXT: vpsllq $32, %ymm
; CHECK-NEXT: vpaddq %ymm
; CHECK-NEXT: vpsrlq $32, %ymm
; CHECK-NEXT: vpmuludq %ymm
; CHECK-NEXT: vpsllq $32, %ymm
; CHECK-NEXT: vpaddq %ymm
define <4 x i64> @mul-v4i64(<4 x i64> %i, <4 x i64> %j) nounwind readnone {
%x = mul <4 x i64> %i, %j
ret <4 x i64> %x
}
; CHECK: mul_const1
; CHECK: vpaddd
; CHECK: ret
define <8 x i32> @mul_const1(<8 x i32> %x) {
%y = mul <8 x i32> %x, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
ret <8 x i32> %y
}
; CHECK: mul_const2
; CHECK: vpsllq $2
; CHECK: ret
define <4 x i64> @mul_const2(<4 x i64> %x) {
%y = mul <4 x i64> %x, <i64 4, i64 4, i64 4, i64 4>
ret <4 x i64> %y
}
; CHECK: mul_const3
; CHECK: vpsllw $3
; CHECK: ret
define <16 x i16> @mul_const3(<16 x i16> %x) {
%y = mul <16 x i16> %x, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
ret <16 x i16> %y
}
; CHECK: mul_const4
; CHECK: vpxor
; CHECK: vpsubq
; CHECK: ret
define <4 x i64> @mul_const4(<4 x i64> %x) {
%y = mul <4 x i64> %x, <i64 -1, i64 -1, i64 -1, i64 -1>
ret <4 x i64> %y
}
; CHECK: mul_const5
; CHECK: vxorps
; CHECK-NEXT: ret
define <8 x i32> @mul_const5(<8 x i32> %x) {
%y = mul <8 x i32> %x, <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <8 x i32> %y
}
; CHECK: mul_const6
; CHECK: vpmulld
; CHECK: ret
define <8 x i32> @mul_const6(<8 x i32> %x) {
%y = mul <8 x i32> %x, <i32 0, i32 0, i32 0, i32 2, i32 0, i32 2, i32 0, i32 0>
ret <8 x i32> %y
}
; CHECK: mul_const7
; CHECK: vpaddq
; CHECK: vpaddq
; CHECK: ret
define <8 x i64> @mul_const7(<8 x i64> %x) {
%y = mul <8 x i64> %x, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
ret <8 x i64> %y
}
; CHECK: mul_const8
; CHECK: vpsllw $3
; CHECK: ret
define <8 x i16> @mul_const8(<8 x i16> %x) {
%y = mul <8 x i16> %x, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
ret <8 x i16> %y
}
; CHECK: mul_const9
; CHECK: vpmulld
; CHECK: ret
define <8 x i32> @mul_const9(<8 x i32> %x) {
%y = mul <8 x i32> %x, <i32 2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <8 x i32> %y
}
; CHECK: mul_const10
; CHECK: vpmulld
; CHECK: ret
define <4 x i32> @mul_const10(<4 x i32> %x) {
; %x * 0x01010101
%m = mul <4 x i32> %x, <i32 16843009, i32 16843009, i32 16843009, i32 16843009>
ret <4 x i32> %m
}
; CHECK: mul_const11
; CHECK: vpmulld
; CHECK: ret
define <4 x i32> @mul_const11(<4 x i32> %x) {
; %x * 0x80808080
%m = mul <4 x i32> %x, <i32 2155905152, i32 2155905152, i32 2155905152, i32 2155905152>
ret <4 x i32> %m
}