mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-12-02 00:16:25 +00:00
[X86] Fix several issues related to X86's psadbw instruction.
This patch fixes the following issues: 1. Fix the return type of X86psadbw: it should not be the same type as the inputs. For vNi8 inputs the output should be vMi64, where M = N/8. 2. Fix the return type of int_x86_avx512_psad_bw_512 accordingly. 3. Fix the definition of PSADBW, VPSADBW, and VPSADBWY accordingly. 4. Adjust the return type when building a DAG node of X86ISD::PSADBW type. 5. Update related tests. Differential revision: http://reviews.llvm.org/D14897 llvm-svn: 254010
This commit is contained in:
parent
7a187fa24b
commit
c0bb26286b
@ -5448,7 +5448,7 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
|
||||
llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty],
|
||||
[IntrNoMem]>;
|
||||
def int_x86_avx512_psad_bw_512 : GCCBuiltin<"__builtin_ia32_psadbw512">,
|
||||
Intrinsic<[llvm_v64i8_ty], [llvm_v64i8_ty, llvm_v64i8_ty],
|
||||
Intrinsic<[llvm_v8i64_ty], [llvm_v64i8_ty, llvm_v64i8_ty],
|
||||
[IntrNoMem]>;
|
||||
}
|
||||
// FP logical ops
|
||||
|
@ -19316,7 +19316,8 @@ static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
|
||||
// chunks, thus directly computes the pop count for v2i64 and v4i64.
|
||||
if (EltVT == MVT::i64) {
|
||||
SDValue Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
|
||||
V = DAG.getNode(X86ISD::PSADBW, DL, ByteVecVT, V, Zeros);
|
||||
MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
|
||||
V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
|
||||
return DAG.getBitcast(VT, V);
|
||||
}
|
||||
|
||||
@ -19332,9 +19333,10 @@ static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
|
||||
|
||||
// Do the horizontal sums into two v2i64s.
|
||||
Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
|
||||
Low = DAG.getNode(X86ISD::PSADBW, DL, ByteVecVT,
|
||||
MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
|
||||
Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
|
||||
DAG.getBitcast(ByteVecVT, Low), Zeros);
|
||||
High = DAG.getNode(X86ISD::PSADBW, DL, ByteVecVT,
|
||||
High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
|
||||
DAG.getBitcast(ByteVecVT, High), Zeros);
|
||||
|
||||
// Merge them together.
|
||||
|
@ -7369,32 +7369,34 @@ defm VPSRLDQ : avx512_shift_packed_all<0x73, X86vshrdq, MRM3r, MRM3m, "vpsrldq",
|
||||
|
||||
|
||||
multiclass avx512_psadbw_packed<bits<8> opc, SDNode OpNode,
|
||||
string OpcodeStr, X86VectorVTInfo _src>{
|
||||
string OpcodeStr, X86VectorVTInfo _dst,
|
||||
X86VectorVTInfo _src>{
|
||||
def rr : AVX512BI<opc, MRMSrcReg,
|
||||
(outs _src.RC:$dst), (ins _src.RC:$src1, _src.RC:$src2),
|
||||
(outs _dst.RC:$dst), (ins _src.RC:$src1, _src.RC:$src2),
|
||||
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
|
||||
[(set _src.RC:$dst,(_src.VT
|
||||
(OpNode _src.RC:$src1, _src.RC:$src2)))]>;
|
||||
[(set _dst.RC:$dst,(_dst.VT
|
||||
(OpNode (_src.VT _src.RC:$src1),
|
||||
(_src.VT _src.RC:$src2))))]>;
|
||||
let mayLoad = 1 in
|
||||
def rm : AVX512BI<opc, MRMSrcMem,
|
||||
(outs _src.RC:$dst), (ins _src.RC:$src1, _src.MemOp:$src2),
|
||||
(outs _dst.RC:$dst), (ins _src.RC:$src1, _src.MemOp:$src2),
|
||||
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
|
||||
[(set _src.RC:$dst,(_src.VT
|
||||
(OpNode _src.RC:$src1,
|
||||
(_src.VT (bitconvert
|
||||
[(set _dst.RC:$dst,(_dst.VT
|
||||
(OpNode (_src.VT _src.RC:$src1),
|
||||
(_src.VT (bitconvert
|
||||
(_src.LdFrag addr:$src2))))))]>;
|
||||
}
|
||||
|
||||
multiclass avx512_psadbw_packed_all<bits<8> opc, SDNode OpNode,
|
||||
string OpcodeStr, Predicate prd> {
|
||||
let Predicates = [prd] in
|
||||
defm Z512 : avx512_psadbw_packed<opc, OpNode, OpcodeStr, v64i8_info>,
|
||||
EVEX_V512;
|
||||
defm Z512 : avx512_psadbw_packed<opc, OpNode, OpcodeStr, v8i64_info,
|
||||
v64i8_info>, EVEX_V512;
|
||||
let Predicates = [prd, HasVLX] in {
|
||||
defm Z256 : avx512_psadbw_packed<opc, OpNode, OpcodeStr, v32i8x_info>,
|
||||
EVEX_V256;
|
||||
defm Z128 : avx512_psadbw_packed<opc, OpNode, OpcodeStr, v16i8x_info>,
|
||||
EVEX_V128;
|
||||
defm Z256 : avx512_psadbw_packed<opc, OpNode, OpcodeStr, v4i64x_info,
|
||||
v32i8x_info>, EVEX_V256;
|
||||
defm Z128 : avx512_psadbw_packed<opc, OpNode, OpcodeStr, v2i64x_info,
|
||||
v16i8x_info>, EVEX_V128;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -79,8 +79,8 @@ def X86pshufb : SDNode<"X86ISD::PSHUFB",
|
||||
SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
|
||||
SDTCisSameAs<0,2>]>>;
|
||||
def X86psadbw : SDNode<"X86ISD::PSADBW",
|
||||
SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
|
||||
SDTCisSameAs<0,2>]>>;
|
||||
SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
|
||||
SDTCisSameAs<1,2>]>>;
|
||||
def X86dbpsadbw : SDNode<"X86ISD::DBPSADBW",
|
||||
SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisVec<1>,
|
||||
SDTCisSameAs<1,2>, SDTCisInt<3>]>>;
|
||||
|
@ -4066,22 +4066,18 @@ defm PADDUSW : PDI_binop_all_int<0xDD, "paddusw", int_x86_sse2_paddus_w,
|
||||
int_x86_avx2_paddus_w, SSE_INTALU_ITINS_P, 1>;
|
||||
defm PMADDWD : PDI_binop_all_int<0xF5, "pmaddwd", int_x86_sse2_pmadd_wd,
|
||||
int_x86_avx2_pmadd_wd, SSE_PMADD, 1>;
|
||||
defm PSADBW : PDI_binop_all_int<0xF6, "psadbw", int_x86_sse2_psad_bw,
|
||||
int_x86_avx2_psad_bw, SSE_PMADD, 1>;
|
||||
|
||||
let Predicates = [HasAVX2] in
|
||||
def : Pat<(v32i8 (X86psadbw (v32i8 VR256:$src1),
|
||||
(v32i8 VR256:$src2))),
|
||||
(VPSADBWYrr VR256:$src2, VR256:$src1)>;
|
||||
|
||||
let Predicates = [HasAVX] in
|
||||
def : Pat<(v16i8 (X86psadbw (v16i8 VR128:$src1),
|
||||
(v16i8 VR128:$src2))),
|
||||
(VPSADBWrr VR128:$src2, VR128:$src1)>;
|
||||
|
||||
def : Pat<(v16i8 (X86psadbw (v16i8 VR128:$src1),
|
||||
(v16i8 VR128:$src2))),
|
||||
(PSADBWrr VR128:$src2, VR128:$src1)>;
|
||||
defm VPSADBW : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v2i64, v16i8, VR128,
|
||||
loadv2i64, i128mem, SSE_INTMUL_ITINS_P, 1, 0>,
|
||||
VEX_4V;
|
||||
let Predicates = [HasAVX2] in
|
||||
defm VPSADBWY : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v4i64, v32i8, VR256,
|
||||
loadv4i64, i256mem, SSE_INTMUL_ITINS_P, 1, 0>,
|
||||
VEX_4V, VEX_L;
|
||||
let Constraints = "$src1 = $dst" in
|
||||
defm PSADBW : PDI_binop_rm2<0xF6, "psadbw", X86psadbw, v2i64, v16i8, VR128,
|
||||
memopv2i64, i128mem, SSE_INTALU_ITINS_P, 1>;
|
||||
|
||||
let Predicates = [HasAVX] in
|
||||
defm VPMULUDQ : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v2i64, v4i32, VR128,
|
||||
|
@ -284,6 +284,7 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
|
||||
X86_INTRINSIC_DATA(avx2_pmulh_w, INTR_TYPE_2OP, ISD::MULHS, 0),
|
||||
X86_INTRINSIC_DATA(avx2_pmulhu_w, INTR_TYPE_2OP, ISD::MULHU, 0),
|
||||
X86_INTRINSIC_DATA(avx2_pmulu_dq, INTR_TYPE_2OP, X86ISD::PMULUDQ, 0),
|
||||
X86_INTRINSIC_DATA(avx2_psad_bw, INTR_TYPE_2OP, X86ISD::PSADBW, 0),
|
||||
X86_INTRINSIC_DATA(avx2_pshuf_b, INTR_TYPE_2OP, X86ISD::PSHUFB, 0),
|
||||
X86_INTRINSIC_DATA(avx2_psign_b, INTR_TYPE_2OP, X86ISD::PSIGN, 0),
|
||||
X86_INTRINSIC_DATA(avx2_psign_d, INTR_TYPE_2OP, X86ISD::PSIGN, 0),
|
||||
@ -1710,6 +1711,7 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
|
||||
X86_INTRINSIC_DATA(sse2_pmulh_w, INTR_TYPE_2OP, ISD::MULHS, 0),
|
||||
X86_INTRINSIC_DATA(sse2_pmulhu_w, INTR_TYPE_2OP, ISD::MULHU, 0),
|
||||
X86_INTRINSIC_DATA(sse2_pmulu_dq, INTR_TYPE_2OP, X86ISD::PMULUDQ, 0),
|
||||
X86_INTRINSIC_DATA(sse2_psad_bw, INTR_TYPE_2OP, X86ISD::PSADBW, 0),
|
||||
X86_INTRINSIC_DATA(sse2_pshuf_d, INTR_TYPE_2OP, X86ISD::PSHUFD, 0),
|
||||
X86_INTRINSIC_DATA(sse2_pshufh_w, INTR_TYPE_2OP, X86ISD::PSHUFHW, 0),
|
||||
X86_INTRINSIC_DATA(sse2_pshufl_w, INTR_TYPE_2OP, X86ISD::PSHUFLW, 0),
|
||||
|
@ -1255,15 +1255,15 @@ define <8 x i64>@test_int_x86_avx512_mask_psrl_dq_512(<8 x i64> %x0) {
|
||||
%res2 = add <8 x i64> %res, %res1
|
||||
ret <8 x i64> %res2
|
||||
}
|
||||
declare <64 x i8> @llvm.x86.avx512.psad.bw.512(<64 x i8>, <64 x i8>)
|
||||
declare <8 x i64> @llvm.x86.avx512.psad.bw.512(<64 x i8>, <64 x i8>)
|
||||
|
||||
; CHECK-LABEL: @test_int_x86_avx512_mask_psadb_w_512
|
||||
; CHECK-NOT: call
|
||||
; CHECK: vpsadbw %zmm1
|
||||
; CHECK: vpsadbw %zmm2
|
||||
define <64 x i8>@test_int_x86_avx512_mask_psadb_w_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2){
|
||||
%res = call <64 x i8> @llvm.x86.avx512.psad.bw.512(<64 x i8> %x0, <64 x i8> %x1)
|
||||
%res1 = call <64 x i8> @llvm.x86.avx512.psad.bw.512(<64 x i8> %x0, <64 x i8> %x2)
|
||||
%res2 = add <64 x i8> %res, %res1
|
||||
ret <64 x i8> %res2
|
||||
define <8 x i64>@test_int_x86_avx512_mask_psadb_w_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2){
|
||||
%res = call <8 x i64> @llvm.x86.avx512.psad.bw.512(<64 x i8> %x0, <64 x i8> %x1)
|
||||
%res1 = call <8 x i64> @llvm.x86.avx512.psad.bw.512(<64 x i8> %x0, <64 x i8> %x2)
|
||||
%res2 = add <8 x i64> %res, %res1
|
||||
ret <8 x i64> %res2
|
||||
}
|
||||
|
@ -92,7 +92,7 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
|
||||
; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0
|
||||
; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0
|
||||
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
|
||||
; AVX-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
|
||||
; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
|
||||
; AVX-NEXT: retq
|
||||
%out = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %in)
|
||||
ret <2 x i64> %out
|
||||
@ -207,9 +207,9 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
|
||||
; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0
|
||||
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
|
||||
; AVX-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
|
||||
; AVX-NEXT: vpsadbw %xmm2, %xmm1, %xmm2
|
||||
; AVX-NEXT: vpsadbw %xmm1, %xmm2, %xmm2
|
||||
; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
|
||||
; AVX-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
|
||||
; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
|
||||
; AVX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
|
||||
; AVX-NEXT: retq
|
||||
%out = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %in)
|
||||
|
@ -15,14 +15,14 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
|
||||
; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1
|
||||
; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
|
||||
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
|
||||
; AVX1-NEXT: vpsadbw %xmm1, %xmm3, %xmm1
|
||||
; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1
|
||||
; AVX1-NEXT: vandps %xmm2, %xmm0, %xmm5
|
||||
; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5
|
||||
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
|
||||
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
|
||||
; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0
|
||||
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
|
||||
; AVX1-NEXT: vpsadbw %xmm0, %xmm3, %xmm0
|
||||
; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0
|
||||
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
|
||||
; AVX1-NEXT: retq
|
||||
;
|
||||
@ -37,7 +37,7 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
|
||||
; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0
|
||||
; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0
|
||||
; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
|
||||
; AVX2-NEXT: vpsadbw %ymm0, %ymm1, %ymm0
|
||||
; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
|
||||
; AVX2-NEXT: retq
|
||||
%out = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %in)
|
||||
ret <4 x i64> %out
|
||||
@ -57,9 +57,9 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
|
||||
; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
|
||||
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
|
||||
; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
|
||||
; AVX1-NEXT: vpsadbw %xmm5, %xmm3, %xmm5
|
||||
; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5
|
||||
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
|
||||
; AVX1-NEXT: vpsadbw %xmm1, %xmm3, %xmm1
|
||||
; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1
|
||||
; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1
|
||||
; AVX1-NEXT: vandps %xmm2, %xmm0, %xmm5
|
||||
; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5
|
||||
@ -68,9 +68,9 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
|
||||
; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0
|
||||
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
|
||||
; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
|
||||
; AVX1-NEXT: vpsadbw %xmm2, %xmm3, %xmm2
|
||||
; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2
|
||||
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
|
||||
; AVX1-NEXT: vpsadbw %xmm0, %xmm3, %xmm0
|
||||
; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0
|
||||
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
|
||||
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
|
||||
; AVX1-NEXT: retq
|
||||
@ -87,9 +87,9 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
|
||||
; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0
|
||||
; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
|
||||
; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
|
||||
; AVX2-NEXT: vpsadbw %ymm2, %ymm1, %ymm2
|
||||
; AVX2-NEXT: vpsadbw %ymm1, %ymm2, %ymm2
|
||||
; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
|
||||
; AVX2-NEXT: vpsadbw %ymm0, %ymm1, %ymm0
|
||||
; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
|
||||
; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
|
||||
; AVX2-NEXT: retq
|
||||
%out = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %in)
|
||||
|
@ -275,9 +275,9 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
|
||||
; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0
|
||||
; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0
|
||||
; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
|
||||
; AVX1-NEXT: vpsadbw %xmm2, %xmm1, %xmm2
|
||||
; AVX1-NEXT: vpsadbw %xmm1, %xmm2, %xmm2
|
||||
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
|
||||
; AVX1-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
|
||||
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
|
||||
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
|
||||
; AVX1-NEXT: retq
|
||||
;
|
||||
@ -297,9 +297,9 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
|
||||
; AVX2-NEXT: vpshufb %xmm0, %xmm4, %xmm0
|
||||
; AVX2-NEXT: vpaddb %xmm3, %xmm0, %xmm0
|
||||
; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
|
||||
; AVX2-NEXT: vpsadbw %xmm2, %xmm1, %xmm2
|
||||
; AVX2-NEXT: vpsadbw %xmm1, %xmm2, %xmm2
|
||||
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
|
||||
; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
|
||||
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
|
||||
; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
|
||||
; AVX2-NEXT: retq
|
||||
%out = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %in, i1 0)
|
||||
@ -430,9 +430,9 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
|
||||
; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0
|
||||
; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0
|
||||
; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
|
||||
; AVX1-NEXT: vpsadbw %xmm2, %xmm1, %xmm2
|
||||
; AVX1-NEXT: vpsadbw %xmm1, %xmm2, %xmm2
|
||||
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
|
||||
; AVX1-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
|
||||
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
|
||||
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
|
||||
; AVX1-NEXT: retq
|
||||
;
|
||||
@ -452,9 +452,9 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
|
||||
; AVX2-NEXT: vpshufb %xmm0, %xmm4, %xmm0
|
||||
; AVX2-NEXT: vpaddb %xmm3, %xmm0, %xmm0
|
||||
; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
|
||||
; AVX2-NEXT: vpsadbw %xmm2, %xmm1, %xmm2
|
||||
; AVX2-NEXT: vpsadbw %xmm1, %xmm2, %xmm2
|
||||
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
|
||||
; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
|
||||
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
|
||||
; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
|
||||
; AVX2-NEXT: retq
|
||||
%out = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %in, i1 -1)
|
||||
|
@ -22,7 +22,7 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
|
||||
; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
|
||||
; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1
|
||||
; AVX1-NEXT: vpaddb %xmm5, %xmm1, %xmm1
|
||||
; AVX1-NEXT: vpsadbw %xmm1, %xmm2, %xmm1
|
||||
; AVX1-NEXT: vpsadbw %xmm2, %xmm1, %xmm1
|
||||
; AVX1-NEXT: vpsubq %xmm3, %xmm0, %xmm0
|
||||
; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm3
|
||||
; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm3
|
||||
@ -30,7 +30,7 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
|
||||
; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
|
||||
; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0
|
||||
; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0
|
||||
; AVX1-NEXT: vpsadbw %xmm0, %xmm2, %xmm0
|
||||
; AVX1-NEXT: vpsadbw %xmm2, %xmm0, %xmm0
|
||||
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
|
||||
; AVX1-NEXT: retq
|
||||
;
|
||||
@ -49,7 +49,7 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
|
||||
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
|
||||
; AVX2-NEXT: vpshufb %ymm0, %ymm4, %ymm0
|
||||
; AVX2-NEXT: vpaddb %ymm3, %ymm0, %ymm0
|
||||
; AVX2-NEXT: vpsadbw %ymm0, %ymm1, %ymm0
|
||||
; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
|
||||
; AVX2-NEXT: retq
|
||||
%out = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> %in, i1 0)
|
||||
ret <4 x i64> %out
|
||||
@ -75,7 +75,7 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind {
|
||||
; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
|
||||
; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1
|
||||
; AVX1-NEXT: vpaddb %xmm5, %xmm1, %xmm1
|
||||
; AVX1-NEXT: vpsadbw %xmm1, %xmm2, %xmm1
|
||||
; AVX1-NEXT: vpsadbw %xmm2, %xmm1, %xmm1
|
||||
; AVX1-NEXT: vpsubq %xmm3, %xmm0, %xmm0
|
||||
; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm3
|
||||
; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm3
|
||||
@ -83,7 +83,7 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind {
|
||||
; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
|
||||
; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0
|
||||
; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0
|
||||
; AVX1-NEXT: vpsadbw %xmm0, %xmm2, %xmm0
|
||||
; AVX1-NEXT: vpsadbw %xmm2, %xmm0, %xmm0
|
||||
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
|
||||
; AVX1-NEXT: retq
|
||||
;
|
||||
@ -102,7 +102,7 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind {
|
||||
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
|
||||
; AVX2-NEXT: vpshufb %ymm0, %ymm4, %ymm0
|
||||
; AVX2-NEXT: vpaddb %ymm3, %ymm0, %ymm0
|
||||
; AVX2-NEXT: vpsadbw %ymm0, %ymm1, %ymm0
|
||||
; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
|
||||
; AVX2-NEXT: retq
|
||||
%out = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> %in, i1 -1)
|
||||
ret <4 x i64> %out
|
||||
@ -129,9 +129,9 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
|
||||
; AVX1-NEXT: vpshufb %xmm2, %xmm6, %xmm2
|
||||
; AVX1-NEXT: vpaddb %xmm5, %xmm2, %xmm2
|
||||
; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
|
||||
; AVX1-NEXT: vpsadbw %xmm5, %xmm1, %xmm5
|
||||
; AVX1-NEXT: vpsadbw %xmm1, %xmm5, %xmm5
|
||||
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
|
||||
; AVX1-NEXT: vpsadbw %xmm2, %xmm1, %xmm2
|
||||
; AVX1-NEXT: vpsadbw %xmm1, %xmm2, %xmm2
|
||||
; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2
|
||||
; AVX1-NEXT: vpsubd %xmm3, %xmm0, %xmm0
|
||||
; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm3
|
||||
@ -141,9 +141,9 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
|
||||
; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0
|
||||
; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0
|
||||
; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
|
||||
; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm3
|
||||
; AVX1-NEXT: vpsadbw %xmm1, %xmm3, %xmm3
|
||||
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
|
||||
; AVX1-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
|
||||
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
|
||||
; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
|
||||
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
|
||||
; AVX1-NEXT: retq
|
||||
@ -164,9 +164,9 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
|
||||
; AVX2-NEXT: vpshufb %ymm0, %ymm4, %ymm0
|
||||
; AVX2-NEXT: vpaddb %ymm3, %ymm0, %ymm0
|
||||
; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
|
||||
; AVX2-NEXT: vpsadbw %ymm2, %ymm1, %ymm2
|
||||
; AVX2-NEXT: vpsadbw %ymm1, %ymm2, %ymm2
|
||||
; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
|
||||
; AVX2-NEXT: vpsadbw %ymm0, %ymm1, %ymm0
|
||||
; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
|
||||
; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
|
||||
; AVX2-NEXT: retq
|
||||
%out = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> %in, i1 0)
|
||||
@ -194,9 +194,9 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind {
|
||||
; AVX1-NEXT: vpshufb %xmm2, %xmm6, %xmm2
|
||||
; AVX1-NEXT: vpaddb %xmm5, %xmm2, %xmm2
|
||||
; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
|
||||
; AVX1-NEXT: vpsadbw %xmm5, %xmm1, %xmm5
|
||||
; AVX1-NEXT: vpsadbw %xmm1, %xmm5, %xmm5
|
||||
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
|
||||
; AVX1-NEXT: vpsadbw %xmm2, %xmm1, %xmm2
|
||||
; AVX1-NEXT: vpsadbw %xmm1, %xmm2, %xmm2
|
||||
; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2
|
||||
; AVX1-NEXT: vpsubd %xmm3, %xmm0, %xmm0
|
||||
; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm3
|
||||
@ -206,9 +206,9 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind {
|
||||
; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0
|
||||
; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0
|
||||
; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
|
||||
; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm3
|
||||
; AVX1-NEXT: vpsadbw %xmm1, %xmm3, %xmm3
|
||||
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
|
||||
; AVX1-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
|
||||
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
|
||||
; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
|
||||
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
|
||||
; AVX1-NEXT: retq
|
||||
@ -229,9 +229,9 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind {
|
||||
; AVX2-NEXT: vpshufb %ymm0, %ymm4, %ymm0
|
||||
; AVX2-NEXT: vpaddb %ymm3, %ymm0, %ymm0
|
||||
; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
|
||||
; AVX2-NEXT: vpsadbw %ymm2, %ymm1, %ymm2
|
||||
; AVX2-NEXT: vpsadbw %ymm1, %ymm2, %ymm2
|
||||
; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
|
||||
; AVX2-NEXT: vpsadbw %ymm0, %ymm1, %ymm0
|
||||
; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
|
||||
; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
|
||||
; AVX2-NEXT: retq
|
||||
%out = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> %in, i1 -1)
|
||||
|
Loading…
Reference in New Issue
Block a user