[LegalizeVectorTypes][X86][ARM][AArch64][PowerPC] Don't use SplitVecOp_TruncateHelper for FP_TO_SINT/UINT.

SplitVecOp_TruncateHelper tries to promote the result type while splitting FP_TO_SINT/UINT. It then concatenates the result and introduces a truncate to the original result type. But it does this without inserting the AssertZExt/AssertSExt that the regular result type promotion would insert. Nor does it turn FP_TO_UINT into FP_TO_SINT the way normal result type promotion for these operations does. This is bad on X86 which doesn't support FP_TO_UINT until AVX512.

This patch disables the use of SplitVecOp_TruncateHelper for these operations and just lets normal promotion handle it. I've tweaked a couple things in X86ISelLowering to avoid a few obvious regressions there. I believe all the changes on X86 are improvements. The other targets look neutral.

Differential Revision: https://reviews.llvm.org/D54906

llvm-svn: 347593
This commit is contained in:
Craig Topper 2018-11-26 21:12:39 +00:00
parent 388284330d
commit 0792d88e71
10 changed files with 256 additions and 516 deletions

View File

@ -1694,13 +1694,6 @@ bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) {
case ISD::VSELECT:
Res = SplitVecOp_VSELECT(N, OpNo);
break;
case ISD::FP_TO_SINT:
case ISD::FP_TO_UINT:
if (N->getValueType(0).bitsLT(N->getOperand(0).getValueType()))
Res = SplitVecOp_TruncateHelper(N);
else
Res = SplitVecOp_UnaryOp(N);
break;
case ISD::SINT_TO_FP:
case ISD::UINT_TO_FP:
if (N->getValueType(0).bitsLT(N->getOperand(0).getValueType()))
@ -1708,6 +1701,8 @@ bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) {
else
Res = SplitVecOp_UnaryOp(N);
break;
case ISD::FP_TO_SINT:
case ISD::FP_TO_UINT:
case ISD::CTTZ:
case ISD::CTLZ:
case ISD::CTPOP:

View File

@ -909,6 +909,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FP_TO_UINT, MVT::v2i16, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::v4i16, Custom);
// By marking FP_TO_SINT v8i16 as Custom, we trick type legalization into
// promoting v8i8 FP_TO_UINT into FP_TO_SINT. When the v8i16 FP_TO_SINT is
// split again based on the input type, this will cause an AssertSExt i16 to
// be emitted instead of an AssertZExt. This will allow packssdw followed by
// packuswb to be used to truncate to v8i8. This is necessary since packusdw
// isn't available until sse4.1.
setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Custom);
setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
@ -26458,11 +26466,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
unsigned NewEltWidth = std::min(128 / VT.getVectorNumElements(), 32U);
MVT PromoteVT = MVT::getVectorVT(MVT::getIntegerVT(NewEltWidth),
VT.getVectorNumElements());
unsigned Opc = N->getOpcode();
if (PromoteVT == MVT::v2i32 || PromoteVT == MVT::v4i32)
Opc = ISD::FP_TO_SINT;
SDValue Res = DAG.getNode(Opc, dl, PromoteVT, Src);
SDValue Res = DAG.getNode(ISD::FP_TO_SINT, dl, PromoteVT, Src);
// Preserve what we know about the size of the original result. Except
// when the result is v2i32 since we can't widen the assert.

View File

@ -2,30 +2,30 @@
define <4 x i16> @fptosi_v4f64_to_v4i16(<4 x double>* %ptr) {
; CHECK: fptosi_v4f64_to_v4i16
; CHECK-LABEL: fptosi_v4f64_to_v4i16
; CHECK-DAG: fcvtzs v[[LHS:[0-9]+]].2d, v0.2d
; CHECK-DAG: fcvtzs v[[RHS:[0-9]+]].2d, v1.2d
; CHECK-DAG: xtn v[[MID:[0-9]+]].2s, v[[LHS]].2d
; CHECK-DAG: xtn2 v[[MID]].4s, v[[RHS]].2d
; CHECK: xtn v0.4h, v[[MID]].4s
; CHECK-DAG: xtn v[[XTN0:[0-9]+]].2s, v[[LHS]].2d
; CHECK-DAG: xtn v[[XTN1:[0-9]+]].2s, v[[RHS]].2d
; CHECK: uzp1 v0.4h, v[[XTN1]].4h, v[[XTN0]].4h
%tmp1 = load <4 x double>, <4 x double>* %ptr
%tmp2 = fptosi <4 x double> %tmp1 to <4 x i16>
ret <4 x i16> %tmp2
}
define <8 x i8> @fptosi_v4f64_to_v4i8(<8 x double>* %ptr) {
; CHECK: fptosi_v4f64_to_v4i8
; CHECK-LABEL: fptosi_v4f64_to_v4i8
; CHECK-DAG: fcvtzs v[[CONV0:[0-9]+]].2d, v0.2d
; CHECK-DAG: fcvtzs v[[CONV1:[0-9]+]].2d, v1.2d
; CHECK-DAG: fcvtzs v[[CONV2:[0-9]+]].2d, v2.2d
; CHECK-DAG: fcvtzs v[[CONV3:[0-9]+]].2d, v3.2d
; CHECK-DAG: xtn v[[NA2:[0-9]+]].2s, v[[CONV2]].2d
; CHECK-DAG: xtn2 v[[NA2]].4s, v[[CONV3]].2d
; CHECK-DAG: xtn v[[NA0:[0-9]+]].2s, v[[CONV0]].2d
; CHECK-DAG: xtn2 v[[NA0]].4s, v[[CONV1]].2d
; CHECK-DAG: xtn v[[TMP1:[0-9]+]].4h, v[[NA2]].4s
; CHECK-DAG: xtn2 v[[TMP1]].8h, v[[NA0]].4s
; CHECK: xtn v0.8b, v[[TMP1]].8h
; CHECK-DAG: xtn v[[XTN0:[0-9]+]].2s, v[[CONV0]].2d
; CHECK-DAG: xtn v[[XTN1:[0-9]+]].2s, v[[CONV1]].2d
; CHECK-DAG: xtn v[[XTN2:[0-9]+]].2s, v[[CONV2]].2d
; CHECK-DAG: xtn v[[XTN3:[0-9]+]].2s, v[[CONV3]].2d
; CHECK-DAG: uzp1 v[[UZP0:[0-9]+]].4h, v[[XTN1]].4h, v[[XTN0]].4h
; CHECK-DAG: uzp1 v[[UZP1:[0-9]+]].4h, v[[XTN3]].4h, v[[XTN2]].4h
; CHECK: uzp1 v0.8b, v[[UZP1:[0-9]+]].8b, v[[UZP0:[0-9]+]].8b
%tmp1 = load <8 x double>, <8 x double>* %ptr
%tmp2 = fptosi <8 x double> %tmp1 to <8 x i8>
ret <8 x i8> %tmp2
@ -54,12 +54,12 @@ define <4 x i16> @trunc_v4i64_to_v4i16(<4 x i64>* %ptr) {
}
define <4 x i16> @fptoui_v4f64_to_v4i16(<4 x double>* %ptr) {
; CHECK: fptoui_v4f64_to_v4i16
; CHECK-DAG: fcvtzu v[[LHS:[0-9]+]].2d, v0.2d
; CHECK-DAG: fcvtzu v[[RHS:[0-9]+]].2d, v1.2d
; CHECK-DAG: xtn v[[MID:[0-9]+]].2s, v[[LHS]].2d
; CHECK-DAG: xtn2 v[[MID]].4s, v[[RHS]].2d
; CHECK: xtn v0.4h, v[[MID]].4s
; CHECK-LABEL: fptoui_v4f64_to_v4i16
; CHECK-DAG: fcvtzs v[[LHS:[0-9]+]].2d, v0.2d
; CHECK-DAG: fcvtzs v[[RHS:[0-9]+]].2d, v1.2d
; CHECK-DAG: xtn v[[XTN0:[0-9]+]].2s, v[[LHS]].2d
; CHECK-DAG: xtn v[[XTN1:[0-9]+]].2s, v[[RHS]].2d
; CHECK: uzp1 v0.4h, v[[XTN1]].4h, v[[XTN0]].4h
%tmp1 = load <4 x double>, <4 x double>* %ptr
%tmp2 = fptoui <4 x double> %tmp1 to <4 x i16>
ret <4 x i16> %tmp2

View File

@ -2,14 +2,14 @@
define <8 x i8> @float_to_i8(<8 x float>* %in) {
; CHECK-LABEL: float_to_i8:
; CHECK: ldp q1, q0, [x0]
; CHECK-DAG: fadd v[[LSB:[0-9]+]].4s, v1.4s, v1.4s
; CHECK-DAG: fadd v[[MSB:[0-9]+]].4s, v0.4s, v0.4s
; CHECK-DAG: fcvtzu v[[LSB2:[0-9]+]].4s, v[[LSB]].4s
; CHECK-DAG: fcvtzu v[[MSB2:[0-9]+]].4s, v[[MSB]].4s
; CHECK: ldp q0, q1, [x0]
; CHECK-DAG: fadd v[[LSB:[0-9]+]].4s, v0.4s, v0.4s
; CHECK-DAG: fadd v[[MSB:[0-9]+]].4s, v1.4s, v1.4s
; CHECK-DAG: fcvtzs v[[LSB2:[0-9]+]].4s, v[[LSB]].4s
; CHECK-DAG: fcvtzs v[[MSB2:[0-9]+]].4s, v[[MSB]].4s
; CHECK-DAG: xtn v[[TMP:[0-9]+]].4h, v[[LSB]].4s
; CHECK-DAG: xtn2 v[[TMP]].8h, v[[MSB]].4s
; CHECK-DAG: xtn v0.8b, v[[TMP]].8h
; CHECK-DAG: xtn v[[TMP2:[0-9]+]].4h, v[[MSB]].4s
; CHECK-DAG: uzp1 v0.8b, v[[TMP]].8b, v[[TMP2]].8b
%l = load <8 x float>, <8 x float>* %in
%scale = fmul <8 x float> %l, <float 2.0, float 2.0, float 2.0, float 2.0, float 2.0, float 2.0, float 2.0, float 2.0>
%conv = fptoui <8 x float> %scale to <8 x i8>

View File

@ -293,14 +293,14 @@ define <4 x i16> @fix_double_to_i16(<4 x double> %in) {
; CHECK-NEXT: vld1.64 {d16, d17}, [r12]
; CHECK-NEXT: vmov d19, r2, r3
; CHECK-NEXT: vadd.f64 d18, d18, d18
; CHECK-NEXT: vcvt.u32.f64 s0, d18
; CHECK-NEXT: vcvt.s32.f64 s0, d18
; CHECK-NEXT: vmov r0, s0
; CHECK-NEXT: vadd.f64 d20, d16, d16
; CHECK-NEXT: vadd.f64 d19, d19, d19
; CHECK-NEXT: vadd.f64 d16, d17, d17
; CHECK-NEXT: vcvt.u32.f64 s2, d20
; CHECK-NEXT: vcvt.u32.f64 s4, d19
; CHECK-NEXT: vcvt.u32.f64 s6, d16
; CHECK-NEXT: vcvt.s32.f64 s2, d20
; CHECK-NEXT: vcvt.s32.f64 s4, d19
; CHECK-NEXT: vcvt.s32.f64 s6, d16
; CHECK-NEXT: vmov.32 d16[0], r0
; CHECK-NEXT: vmov r0, s2
; CHECK-NEXT: vmov.32 d17[0], r0
@ -308,7 +308,7 @@ define <4 x i16> @fix_double_to_i16(<4 x double> %in) {
; CHECK-NEXT: vmov.32 d16[1], r0
; CHECK-NEXT: vmov r0, s6
; CHECK-NEXT: vmov.32 d17[1], r0
; CHECK-NEXT: vmovn.i32 d16, q8
; CHECK-NEXT: vuzp.16 d16, d17
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: mov pc, lr

View File

@ -166,19 +166,19 @@ define <8 x i16> @test8elt(<8 x double>* nocapture readonly) local_unnamed_addr
; CHECK-P8-NEXT: lxvd2x vs2, r3, r4
; CHECK-P8-NEXT: li r4, 48
; CHECK-P8-NEXT: lxvd2x vs3, r3, r4
; CHECK-P8-NEXT: xscvdpuxws f4, f0
; CHECK-P8-NEXT: xscvdpsxws f4, f0
; CHECK-P8-NEXT: xxswapd vs0, vs0
; CHECK-P8-NEXT: xscvdpuxws f5, f1
; CHECK-P8-NEXT: xscvdpsxws f5, f1
; CHECK-P8-NEXT: xxswapd vs1, vs1
; CHECK-P8-NEXT: xscvdpuxws f6, f2
; CHECK-P8-NEXT: xscvdpsxws f6, f2
; CHECK-P8-NEXT: xxswapd vs2, vs2
; CHECK-P8-NEXT: xscvdpuxws f7, f3
; CHECK-P8-NEXT: xscvdpsxws f7, f3
; CHECK-P8-NEXT: xxswapd vs3, vs3
; CHECK-P8-NEXT: xscvdpuxws f0, f0
; CHECK-P8-NEXT: xscvdpuxws f1, f1
; CHECK-P8-NEXT: xscvdpsxws f0, f0
; CHECK-P8-NEXT: xscvdpsxws f1, f1
; CHECK-P8-NEXT: mfvsrwz r3, f4
; CHECK-P8-NEXT: xscvdpuxws f2, f2
; CHECK-P8-NEXT: xscvdpuxws f3, f3
; CHECK-P8-NEXT: xscvdpsxws f2, f2
; CHECK-P8-NEXT: xscvdpsxws f3, f3
; CHECK-P8-NEXT: mfvsrwz r4, f5
; CHECK-P8-NEXT: mtvsrd f4, r3
; CHECK-P8-NEXT: mfvsrwz r3, f6
@ -221,14 +221,14 @@ define <8 x i16> @test8elt(<8 x double>* nocapture readonly) local_unnamed_addr
; CHECK-P9-NEXT: xxswapd vs5, vs2
; CHECK-P9-NEXT: xxswapd vs6, vs1
; CHECK-P9-NEXT: xxswapd vs7, vs0
; CHECK-P9-NEXT: xscvdpuxws f3, f3
; CHECK-P9-NEXT: xscvdpuxws f2, f2
; CHECK-P9-NEXT: xscvdpuxws f1, f1
; CHECK-P9-NEXT: xscvdpuxws f0, f0
; CHECK-P9-NEXT: xscvdpuxws f4, f4
; CHECK-P9-NEXT: xscvdpuxws f5, f5
; CHECK-P9-NEXT: xscvdpuxws f6, f6
; CHECK-P9-NEXT: xscvdpuxws f7, f7
; CHECK-P9-NEXT: xscvdpsxws f3, f3
; CHECK-P9-NEXT: xscvdpsxws f2, f2
; CHECK-P9-NEXT: xscvdpsxws f1, f1
; CHECK-P9-NEXT: xscvdpsxws f0, f0
; CHECK-P9-NEXT: xscvdpsxws f4, f4
; CHECK-P9-NEXT: xscvdpsxws f5, f5
; CHECK-P9-NEXT: xscvdpsxws f6, f6
; CHECK-P9-NEXT: xscvdpsxws f7, f7
; CHECK-P9-NEXT: mfvsrwz r3, f3
; CHECK-P9-NEXT: mfvsrwz r5, f2
; CHECK-P9-NEXT: mfvsrwz r7, f1
@ -272,14 +272,14 @@ define <8 x i16> @test8elt(<8 x double>* nocapture readonly) local_unnamed_addr
; CHECK-BE-NEXT: xxswapd vs5, vs2
; CHECK-BE-NEXT: xxswapd vs6, vs1
; CHECK-BE-NEXT: xxswapd vs7, vs0
; CHECK-BE-NEXT: xscvdpuxws f3, f3
; CHECK-BE-NEXT: xscvdpuxws f2, f2
; CHECK-BE-NEXT: xscvdpuxws f1, f1
; CHECK-BE-NEXT: xscvdpuxws f0, f0
; CHECK-BE-NEXT: xscvdpuxws f4, f4
; CHECK-BE-NEXT: xscvdpuxws f5, f5
; CHECK-BE-NEXT: xscvdpuxws f6, f6
; CHECK-BE-NEXT: xscvdpuxws f7, f7
; CHECK-BE-NEXT: xscvdpsxws f3, f3
; CHECK-BE-NEXT: xscvdpsxws f2, f2
; CHECK-BE-NEXT: xscvdpsxws f1, f1
; CHECK-BE-NEXT: xscvdpsxws f0, f0
; CHECK-BE-NEXT: xscvdpsxws f4, f4
; CHECK-BE-NEXT: xscvdpsxws f5, f5
; CHECK-BE-NEXT: xscvdpsxws f6, f6
; CHECK-BE-NEXT: xscvdpsxws f7, f7
; CHECK-BE-NEXT: mfvsrwz r3, f3
; CHECK-BE-NEXT: mfvsrwz r5, f2
; CHECK-BE-NEXT: mfvsrwz r7, f1
@ -329,60 +329,60 @@ define void @test16elt(<16 x i16>* noalias nocapture sret %agg.result, <16 x dou
; CHECK-P8-NEXT: li r6, 48
; CHECK-P8-NEXT: lxvd2x vs3, r4, r6
; CHECK-P8-NEXT: li r6, 64
; CHECK-P8-NEXT: xscvdpuxws f4, f0
; CHECK-P8-NEXT: xscvdpsxws f4, f0
; CHECK-P8-NEXT: lxvd2x vs5, r4, r6
; CHECK-P8-NEXT: li r6, 80
; CHECK-P8-NEXT: xxswapd vs0, vs0
; CHECK-P8-NEXT: xscvdpuxws f6, f1
; CHECK-P8-NEXT: xscvdpsxws f6, f1
; CHECK-P8-NEXT: lxvd2x vs7, r4, r6
; CHECK-P8-NEXT: li r6, 96
; CHECK-P8-NEXT: xxswapd vs1, vs1
; CHECK-P8-NEXT: xscvdpuxws f8, f2
; CHECK-P8-NEXT: xscvdpsxws f8, f2
; CHECK-P8-NEXT: lxvd2x vs9, r4, r6
; CHECK-P8-NEXT: li r6, 112
; CHECK-P8-NEXT: xxswapd vs2, vs2
; CHECK-P8-NEXT: xscvdpuxws f10, f3
; CHECK-P8-NEXT: xscvdpsxws f10, f3
; CHECK-P8-NEXT: lxvd2x vs11, r4, r6
; CHECK-P8-NEXT: xxswapd vs3, vs3
; CHECK-P8-NEXT: xscvdpuxws f12, f5
; CHECK-P8-NEXT: xscvdpsxws f12, f5
; CHECK-P8-NEXT: xxswapd vs5, vs5
; CHECK-P8-NEXT: xscvdpuxws f13, f7
; CHECK-P8-NEXT: xscvdpsxws f13, f7
; CHECK-P8-NEXT: xxswapd vs7, vs7
; CHECK-P8-NEXT: xscvdpuxws v2, f9
; CHECK-P8-NEXT: xscvdpsxws v2, f9
; CHECK-P8-NEXT: xxswapd vs9, vs9
; CHECK-P8-NEXT: mfvsrwz r4, f4
; CHECK-P8-NEXT: xscvdpuxws v3, f11
; CHECK-P8-NEXT: xscvdpsxws v3, f11
; CHECK-P8-NEXT: xxswapd vs11, vs11
; CHECK-P8-NEXT: xscvdpuxws f0, f0
; CHECK-P8-NEXT: xscvdpsxws f0, f0
; CHECK-P8-NEXT: mfvsrwz r6, f6
; CHECK-P8-NEXT: mtvsrd f4, r4
; CHECK-P8-NEXT: mfvsrwz r4, f8
; CHECK-P8-NEXT: xscvdpuxws f1, f1
; CHECK-P8-NEXT: xscvdpsxws f1, f1
; CHECK-P8-NEXT: xxswapd v4, vs4
; CHECK-P8-NEXT: xscvdpuxws f2, f2
; CHECK-P8-NEXT: xscvdpsxws f2, f2
; CHECK-P8-NEXT: mtvsrd f6, r6
; CHECK-P8-NEXT: mfvsrwz r6, f10
; CHECK-P8-NEXT: mtvsrd f8, r4
; CHECK-P8-NEXT: xxswapd v5, vs6
; CHECK-P8-NEXT: mfvsrwz r4, f12
; CHECK-P8-NEXT: xscvdpuxws f5, f5
; CHECK-P8-NEXT: xscvdpsxws f5, f5
; CHECK-P8-NEXT: xxswapd v0, vs8
; CHECK-P8-NEXT: mtvsrd f10, r6
; CHECK-P8-NEXT: mfvsrwz r6, f13
; CHECK-P8-NEXT: mtvsrd f12, r4
; CHECK-P8-NEXT: xxswapd v1, vs10
; CHECK-P8-NEXT: mfvsrwz r4, v2
; CHECK-P8-NEXT: xscvdpuxws f3, f3
; CHECK-P8-NEXT: xscvdpsxws f3, f3
; CHECK-P8-NEXT: xxswapd v6, vs12
; CHECK-P8-NEXT: xscvdpuxws f9, f9
; CHECK-P8-NEXT: xscvdpsxws f9, f9
; CHECK-P8-NEXT: mtvsrd f13, r6
; CHECK-P8-NEXT: mfvsrwz r6, v3
; CHECK-P8-NEXT: mtvsrd v2, r4
; CHECK-P8-NEXT: xxswapd v7, vs13
; CHECK-P8-NEXT: mfvsrwz r4, f0
; CHECK-P8-NEXT: xscvdpuxws f7, f7
; CHECK-P8-NEXT: xscvdpsxws f7, f7
; CHECK-P8-NEXT: xxswapd v2, v2
; CHECK-P8-NEXT: xscvdpuxws f11, f11
; CHECK-P8-NEXT: xscvdpsxws f11, f11
; CHECK-P8-NEXT: mtvsrd v3, r6
; CHECK-P8-NEXT: mfvsrwz r6, f1
; CHECK-P8-NEXT: mtvsrd f0, r4
@ -450,22 +450,22 @@ define void @test16elt(<16 x i16>* noalias nocapture sret %agg.result, <16 x dou
; CHECK-P9-NEXT: xxswapd vs13, vs3
; CHECK-P9-NEXT: xxswapd v2, vs1
; CHECK-P9-NEXT: xxswapd v3, vs0
; CHECK-P9-NEXT: xscvdpuxws f6, f6
; CHECK-P9-NEXT: xscvdpuxws f5, f5
; CHECK-P9-NEXT: xscvdpuxws f4, f4
; CHECK-P9-NEXT: xscvdpuxws f2, f2
; CHECK-P9-NEXT: xscvdpuxws f7, f7
; CHECK-P9-NEXT: xscvdpuxws f3, f3
; CHECK-P9-NEXT: xscvdpuxws f1, f1
; CHECK-P9-NEXT: xscvdpuxws f0, f0
; CHECK-P9-NEXT: xscvdpuxws f8, f8
; CHECK-P9-NEXT: xscvdpuxws f9, f9
; CHECK-P9-NEXT: xscvdpuxws f10, f10
; CHECK-P9-NEXT: xscvdpuxws f11, f11
; CHECK-P9-NEXT: xscvdpuxws f12, f12
; CHECK-P9-NEXT: xscvdpuxws f13, f13
; CHECK-P9-NEXT: xscvdpuxws v2, v2
; CHECK-P9-NEXT: xscvdpuxws v3, v3
; CHECK-P9-NEXT: xscvdpsxws f6, f6
; CHECK-P9-NEXT: xscvdpsxws f5, f5
; CHECK-P9-NEXT: xscvdpsxws f4, f4
; CHECK-P9-NEXT: xscvdpsxws f2, f2
; CHECK-P9-NEXT: xscvdpsxws f7, f7
; CHECK-P9-NEXT: xscvdpsxws f3, f3
; CHECK-P9-NEXT: xscvdpsxws f1, f1
; CHECK-P9-NEXT: xscvdpsxws f0, f0
; CHECK-P9-NEXT: xscvdpsxws f8, f8
; CHECK-P9-NEXT: xscvdpsxws f9, f9
; CHECK-P9-NEXT: xscvdpsxws f10, f10
; CHECK-P9-NEXT: xscvdpsxws f11, f11
; CHECK-P9-NEXT: xscvdpsxws f12, f12
; CHECK-P9-NEXT: xscvdpsxws f13, f13
; CHECK-P9-NEXT: xscvdpsxws v2, v2
; CHECK-P9-NEXT: xscvdpsxws v3, v3
; CHECK-P9-NEXT: mfvsrwz r4, f6
; CHECK-P9-NEXT: mfvsrwz r5, f5
; CHECK-P9-NEXT: mfvsrwz r6, f4
@ -562,22 +562,22 @@ define void @test16elt(<16 x i16>* noalias nocapture sret %agg.result, <16 x dou
; CHECK-BE-NEXT: xxswapd vs13, vs3
; CHECK-BE-NEXT: xxswapd v2, vs1
; CHECK-BE-NEXT: xxswapd v3, vs0
; CHECK-BE-NEXT: xscvdpuxws f6, f6
; CHECK-BE-NEXT: xscvdpuxws f5, f5
; CHECK-BE-NEXT: xscvdpuxws f4, f4
; CHECK-BE-NEXT: xscvdpuxws f2, f2
; CHECK-BE-NEXT: xscvdpuxws f7, f7
; CHECK-BE-NEXT: xscvdpuxws f3, f3
; CHECK-BE-NEXT: xscvdpuxws f1, f1
; CHECK-BE-NEXT: xscvdpuxws f0, f0
; CHECK-BE-NEXT: xscvdpuxws f8, f8
; CHECK-BE-NEXT: xscvdpuxws f9, f9
; CHECK-BE-NEXT: xscvdpuxws f10, f10
; CHECK-BE-NEXT: xscvdpuxws f11, f11
; CHECK-BE-NEXT: xscvdpuxws f12, f12
; CHECK-BE-NEXT: xscvdpuxws f13, f13
; CHECK-BE-NEXT: xscvdpuxws v2, v2
; CHECK-BE-NEXT: xscvdpuxws v3, v3
; CHECK-BE-NEXT: xscvdpsxws f6, f6
; CHECK-BE-NEXT: xscvdpsxws f5, f5
; CHECK-BE-NEXT: xscvdpsxws f4, f4
; CHECK-BE-NEXT: xscvdpsxws f2, f2
; CHECK-BE-NEXT: xscvdpsxws f7, f7
; CHECK-BE-NEXT: xscvdpsxws f3, f3
; CHECK-BE-NEXT: xscvdpsxws f1, f1
; CHECK-BE-NEXT: xscvdpsxws f0, f0
; CHECK-BE-NEXT: xscvdpsxws f8, f8
; CHECK-BE-NEXT: xscvdpsxws f9, f9
; CHECK-BE-NEXT: xscvdpsxws f10, f10
; CHECK-BE-NEXT: xscvdpsxws f11, f11
; CHECK-BE-NEXT: xscvdpsxws f12, f12
; CHECK-BE-NEXT: xscvdpsxws f13, f13
; CHECK-BE-NEXT: xscvdpsxws v2, v2
; CHECK-BE-NEXT: xscvdpsxws v3, v3
; CHECK-BE-NEXT: mfvsrwz r4, f6
; CHECK-BE-NEXT: mfvsrwz r5, f5
; CHECK-BE-NEXT: mfvsrwz r6, f4

View File

@ -343,60 +343,60 @@ define <16 x i8> @test16elt(<16 x double>* nocapture readonly) local_unnamed_add
; CHECK-P8-NEXT: li r4, 48
; CHECK-P8-NEXT: lxvd2x vs3, r3, r4
; CHECK-P8-NEXT: li r4, 64
; CHECK-P8-NEXT: xscvdpuxws f4, f0
; CHECK-P8-NEXT: xscvdpsxws f4, f0
; CHECK-P8-NEXT: xxswapd vs0, vs0
; CHECK-P8-NEXT: lxvd2x vs5, r3, r4
; CHECK-P8-NEXT: li r4, 80
; CHECK-P8-NEXT: xscvdpuxws f6, f1
; CHECK-P8-NEXT: xscvdpsxws f6, f1
; CHECK-P8-NEXT: xxswapd vs1, vs1
; CHECK-P8-NEXT: lxvd2x vs7, r3, r4
; CHECK-P8-NEXT: li r4, 96
; CHECK-P8-NEXT: xscvdpuxws f8, f2
; CHECK-P8-NEXT: xscvdpsxws f8, f2
; CHECK-P8-NEXT: xxswapd vs2, vs2
; CHECK-P8-NEXT: lxvd2x vs9, r3, r4
; CHECK-P8-NEXT: li r4, 112
; CHECK-P8-NEXT: xscvdpuxws f10, f3
; CHECK-P8-NEXT: xscvdpsxws f10, f3
; CHECK-P8-NEXT: xxswapd vs3, vs3
; CHECK-P8-NEXT: lxvd2x vs11, r3, r4
; CHECK-P8-NEXT: xscvdpuxws f12, f5
; CHECK-P8-NEXT: xscvdpsxws f12, f5
; CHECK-P8-NEXT: xxswapd vs5, vs5
; CHECK-P8-NEXT: xscvdpuxws f13, f7
; CHECK-P8-NEXT: xscvdpsxws f13, f7
; CHECK-P8-NEXT: xxswapd vs7, vs7
; CHECK-P8-NEXT: xscvdpuxws v2, f9
; CHECK-P8-NEXT: xscvdpsxws v2, f9
; CHECK-P8-NEXT: xxswapd vs9, vs9
; CHECK-P8-NEXT: mfvsrwz r3, f4
; CHECK-P8-NEXT: xscvdpuxws v3, f11
; CHECK-P8-NEXT: xscvdpsxws v3, f11
; CHECK-P8-NEXT: xxswapd vs11, vs11
; CHECK-P8-NEXT: mfvsrwz r4, f6
; CHECK-P8-NEXT: xscvdpuxws f0, f0
; CHECK-P8-NEXT: xscvdpsxws f0, f0
; CHECK-P8-NEXT: mtvsrd f4, r3
; CHECK-P8-NEXT: mfvsrwz r3, f8
; CHECK-P8-NEXT: xscvdpuxws f1, f1
; CHECK-P8-NEXT: xscvdpsxws f1, f1
; CHECK-P8-NEXT: xxswapd v4, vs4
; CHECK-P8-NEXT: mtvsrd f6, r4
; CHECK-P8-NEXT: mfvsrwz r4, f10
; CHECK-P8-NEXT: xscvdpuxws f2, f2
; CHECK-P8-NEXT: xscvdpsxws f2, f2
; CHECK-P8-NEXT: xxswapd v5, vs6
; CHECK-P8-NEXT: mtvsrd f8, r3
; CHECK-P8-NEXT: mfvsrwz r3, f12
; CHECK-P8-NEXT: xscvdpuxws f3, f3
; CHECK-P8-NEXT: xscvdpsxws f3, f3
; CHECK-P8-NEXT: xxswapd v0, vs8
; CHECK-P8-NEXT: mtvsrd f10, r4
; CHECK-P8-NEXT: mfvsrwz r4, f13
; CHECK-P8-NEXT: xscvdpuxws f5, f5
; CHECK-P8-NEXT: xscvdpsxws f5, f5
; CHECK-P8-NEXT: xxswapd v1, vs10
; CHECK-P8-NEXT: mtvsrd f12, r3
; CHECK-P8-NEXT: mfvsrwz r3, v2
; CHECK-P8-NEXT: xscvdpuxws f7, f7
; CHECK-P8-NEXT: xscvdpsxws f7, f7
; CHECK-P8-NEXT: xxswapd v6, vs12
; CHECK-P8-NEXT: mtvsrd f13, r4
; CHECK-P8-NEXT: mfvsrwz r4, v3
; CHECK-P8-NEXT: mtvsrd v2, r3
; CHECK-P8-NEXT: xxswapd v7, vs13
; CHECK-P8-NEXT: mfvsrwz r3, f0
; CHECK-P8-NEXT: xscvdpuxws f9, f9
; CHECK-P8-NEXT: xscvdpsxws f9, f9
; CHECK-P8-NEXT: xxswapd v2, v2
; CHECK-P8-NEXT: xscvdpuxws f11, f11
; CHECK-P8-NEXT: xscvdpsxws f11, f11
; CHECK-P8-NEXT: mtvsrd v3, r4
; CHECK-P8-NEXT: mfvsrwz r4, f1
; CHECK-P8-NEXT: mtvsrd f0, r3
@ -462,22 +462,22 @@ define <16 x i8> @test16elt(<16 x double>* nocapture readonly) local_unnamed_add
; CHECK-P9-NEXT: xxswapd vs13, vs6
; CHECK-P9-NEXT: xxswapd v2, vs1
; CHECK-P9-NEXT: xxswapd v3, vs0
; CHECK-P9-NEXT: xscvdpuxws f5, f5
; CHECK-P9-NEXT: xscvdpuxws f4, f4
; CHECK-P9-NEXT: xscvdpuxws f3, f3
; CHECK-P9-NEXT: xscvdpuxws f2, f2
; CHECK-P9-NEXT: xscvdpuxws f7, f7
; CHECK-P9-NEXT: xscvdpuxws f6, f6
; CHECK-P9-NEXT: xscvdpuxws f1, f1
; CHECK-P9-NEXT: xscvdpuxws f0, f0
; CHECK-P9-NEXT: xscvdpuxws f8, f8
; CHECK-P9-NEXT: xscvdpuxws f9, f9
; CHECK-P9-NEXT: xscvdpuxws f10, f10
; CHECK-P9-NEXT: xscvdpuxws f11, f11
; CHECK-P9-NEXT: xscvdpuxws f12, f12
; CHECK-P9-NEXT: xscvdpuxws f13, f13
; CHECK-P9-NEXT: xscvdpuxws v2, v2
; CHECK-P9-NEXT: xscvdpuxws v3, v3
; CHECK-P9-NEXT: xscvdpsxws f5, f5
; CHECK-P9-NEXT: xscvdpsxws f4, f4
; CHECK-P9-NEXT: xscvdpsxws f3, f3
; CHECK-P9-NEXT: xscvdpsxws f2, f2
; CHECK-P9-NEXT: xscvdpsxws f7, f7
; CHECK-P9-NEXT: xscvdpsxws f6, f6
; CHECK-P9-NEXT: xscvdpsxws f1, f1
; CHECK-P9-NEXT: xscvdpsxws f0, f0
; CHECK-P9-NEXT: xscvdpsxws f8, f8
; CHECK-P9-NEXT: xscvdpsxws f9, f9
; CHECK-P9-NEXT: xscvdpsxws f10, f10
; CHECK-P9-NEXT: xscvdpsxws f11, f11
; CHECK-P9-NEXT: xscvdpsxws f12, f12
; CHECK-P9-NEXT: xscvdpsxws f13, f13
; CHECK-P9-NEXT: xscvdpsxws v2, v2
; CHECK-P9-NEXT: xscvdpsxws v3, v3
; CHECK-P9-NEXT: mfvsrwz r3, f5
; CHECK-P9-NEXT: mfvsrwz r4, f4
; CHECK-P9-NEXT: mfvsrwz r5, f3
@ -571,22 +571,22 @@ define <16 x i8> @test16elt(<16 x double>* nocapture readonly) local_unnamed_add
; CHECK-BE-NEXT: xxswapd vs13, vs6
; CHECK-BE-NEXT: xxswapd v2, vs1
; CHECK-BE-NEXT: xxswapd v3, vs0
; CHECK-BE-NEXT: xscvdpuxws f5, f5
; CHECK-BE-NEXT: xscvdpuxws f4, f4
; CHECK-BE-NEXT: xscvdpuxws f3, f3
; CHECK-BE-NEXT: xscvdpuxws f2, f2
; CHECK-BE-NEXT: xscvdpuxws f7, f7
; CHECK-BE-NEXT: xscvdpuxws f6, f6
; CHECK-BE-NEXT: xscvdpuxws f1, f1
; CHECK-BE-NEXT: xscvdpuxws f0, f0
; CHECK-BE-NEXT: xscvdpuxws f8, f8
; CHECK-BE-NEXT: xscvdpuxws f9, f9
; CHECK-BE-NEXT: xscvdpuxws f10, f10
; CHECK-BE-NEXT: xscvdpuxws f11, f11
; CHECK-BE-NEXT: xscvdpuxws f12, f12
; CHECK-BE-NEXT: xscvdpuxws f13, f13
; CHECK-BE-NEXT: xscvdpuxws v2, v2
; CHECK-BE-NEXT: xscvdpuxws v3, v3
; CHECK-BE-NEXT: xscvdpsxws f5, f5
; CHECK-BE-NEXT: xscvdpsxws f4, f4
; CHECK-BE-NEXT: xscvdpsxws f3, f3
; CHECK-BE-NEXT: xscvdpsxws f2, f2
; CHECK-BE-NEXT: xscvdpsxws f7, f7
; CHECK-BE-NEXT: xscvdpsxws f6, f6
; CHECK-BE-NEXT: xscvdpsxws f1, f1
; CHECK-BE-NEXT: xscvdpsxws f0, f0
; CHECK-BE-NEXT: xscvdpsxws f8, f8
; CHECK-BE-NEXT: xscvdpsxws f9, f9
; CHECK-BE-NEXT: xscvdpsxws f10, f10
; CHECK-BE-NEXT: xscvdpsxws f11, f11
; CHECK-BE-NEXT: xscvdpsxws f12, f12
; CHECK-BE-NEXT: xscvdpsxws f13, f13
; CHECK-BE-NEXT: xscvdpsxws v2, v2
; CHECK-BE-NEXT: xscvdpsxws v3, v3
; CHECK-BE-NEXT: mfvsrwz r3, f5
; CHECK-BE-NEXT: mfvsrwz r4, f4
; CHECK-BE-NEXT: mfvsrwz r5, f3

View File

@ -245,7 +245,7 @@ define <8 x i8> @cvt_v8f32_v8u8(<8 x float> %src) {
; CHECK-WIDE: ## %bb.0:
; CHECK-WIDE-NEXT: vcvttps2dq %ymm0, %ymm0
; CHECK-WIDE-NEXT: vextractf128 $1, %ymm0, %xmm1
; CHECK-WIDE-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; CHECK-WIDE-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; CHECK-WIDE-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
; CHECK-WIDE-NEXT: vzeroupper
; CHECK-WIDE-NEXT: retl

View File

@ -2444,40 +2444,22 @@ define <2 x i16> @fptoui_2f64_to_2i16(<2 x double> %a) {
define <8 x i16> @fptosi_8f64_to_8i16(<8 x double> %a) {
; SSE-LABEL: fptosi_8f64_to_8i16:
; SSE: # %bb.0:
; SSE-NEXT: cvttpd2dq %xmm3, %xmm3
; SSE-NEXT: cvttpd2dq %xmm2, %xmm2
; SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; SSE-NEXT: cvttpd2dq %xmm1, %xmm1
; SSE-NEXT: cvttpd2dq %xmm0, %xmm0
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,2,1,3,4,5,6,7]
; SSE-NEXT: cvttpd2dq %xmm3, %xmm0
; SSE-NEXT: cvttpd2dq %xmm2, %xmm2
; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,2,0]
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,7,5]
; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT: packssdw %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: fptosi_8f64_to_8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vcvttpd2dq %ymm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vcvttpd2dq %ymm0, %xmm0
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: fptosi_8f64_to_8i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vcvttpd2dq %ymm0, %xmm0
; AVX2-NEXT: vcvttpd2dq %ymm1, %xmm1
; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
; VEX-LABEL: fptosi_8f64_to_8i16:
; VEX: # %bb.0:
; VEX-NEXT: vcvttpd2dq %ymm1, %xmm1
; VEX-NEXT: vcvttpd2dq %ymm0, %xmm0
; VEX-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; VEX-NEXT: vzeroupper
; VEX-NEXT: retq
;
; AVX512F-LABEL: fptosi_8f64_to_8i16:
; AVX512F: # %bb.0:
@ -2515,89 +2497,28 @@ define <8 x i16> @fptosi_8f64_to_8i16(<8 x double> %a) {
define <8 x i16> @fptoui_8f64_to_8i16(<8 x double> %a) {
; SSE-LABEL: fptoui_8f64_to_8i16:
; SSE: # %bb.0:
; SSE-NEXT: cvttsd2si %xmm3, %rax
; SSE-NEXT: movd %eax, %xmm4
; SSE-NEXT: movhlps {{.*#+}} xmm3 = xmm3[1,1]
; SSE-NEXT: cvttsd2si %xmm3, %rax
; SSE-NEXT: movd %eax, %xmm3
; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; SSE-NEXT: cvttsd2si %xmm2, %rax
; SSE-NEXT: movd %eax, %xmm3
; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1]
; SSE-NEXT: cvttsd2si %xmm2, %rax
; SSE-NEXT: movd %eax, %xmm2
; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
; SSE-NEXT: cvttsd2si %xmm1, %rax
; SSE-NEXT: movd %eax, %xmm2
; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT: cvttsd2si %xmm1, %rax
; SSE-NEXT: movd %eax, %xmm1
; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSE-NEXT: cvttsd2si %xmm0, %rax
; SSE-NEXT: movd %eax, %xmm1
; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT: cvttsd2si %xmm0, %rax
; SSE-NEXT: movd %eax, %xmm0
; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: cvttpd2dq %xmm3, %xmm3
; SSE-NEXT: cvttpd2dq %xmm2, %xmm2
; SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE-NEXT: cvttpd2dq %xmm1, %xmm1
; SSE-NEXT: cvttpd2dq %xmm0, %xmm0
; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSE-NEXT: retq
;
; AVX1-LABEL: fptoui_8f64_to_8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovapd {{.*#+}} ymm2 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9]
; AVX1-NEXT: vcmpltpd %ymm2, %ymm1, %ymm3
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
; AVX1-NEXT: vpackssdw %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vsubpd %ymm2, %ymm1, %ymm4
; AVX1-NEXT: vcvttpd2dq %ymm4, %xmm4
; AVX1-NEXT: vmovapd {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648]
; AVX1-NEXT: vxorpd %xmm5, %xmm4, %xmm4
; AVX1-NEXT: vcvttpd2dq %ymm1, %xmm1
; AVX1-NEXT: vblendvps %xmm3, %xmm1, %xmm4, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vcmpltpd %ymm2, %ymm0, %ymm4
; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm6
; AVX1-NEXT: vpackssdw %xmm6, %xmm4, %xmm4
; AVX1-NEXT: vsubpd %ymm2, %ymm0, %ymm2
; AVX1-NEXT: vcvttpd2dq %ymm2, %xmm2
; AVX1-NEXT: vxorpd %xmm5, %xmm2, %xmm2
; AVX1-NEXT: vcvttpd2dq %ymm0, %xmm0
; AVX1-NEXT: vblendvps %xmm4, %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: fptoui_8f64_to_8i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9]
; AVX2-NEXT: vcmpltpd %ymm2, %ymm1, %ymm3
; AVX2-NEXT: vextractf128 $1, %ymm3, %xmm4
; AVX2-NEXT: vpackssdw %xmm4, %xmm3, %xmm3
; AVX2-NEXT: vsubpd %ymm2, %ymm1, %ymm4
; AVX2-NEXT: vcvttpd2dq %ymm4, %xmm4
; AVX2-NEXT: vbroadcastss {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648]
; AVX2-NEXT: vxorpd %xmm5, %xmm4, %xmm4
; AVX2-NEXT: vcvttpd2dq %ymm1, %xmm1
; AVX2-NEXT: vblendvps %xmm3, %xmm1, %xmm4, %xmm1
; AVX2-NEXT: vcmpltpd %ymm2, %ymm0, %ymm3
; AVX2-NEXT: vextractf128 $1, %ymm3, %xmm4
; AVX2-NEXT: vpackssdw %xmm4, %xmm3, %xmm3
; AVX2-NEXT: vsubpd %ymm2, %ymm0, %ymm2
; AVX2-NEXT: vcvttpd2dq %ymm2, %xmm2
; AVX2-NEXT: vxorpd %xmm5, %xmm2, %xmm2
; AVX2-NEXT: vcvttpd2dq %ymm0, %xmm0
; AVX2-NEXT: vblendvps %xmm3, %xmm0, %xmm2, %xmm0
; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
; VEX-LABEL: fptoui_8f64_to_8i16:
; VEX: # %bb.0:
; VEX-NEXT: vcvttpd2dq %ymm1, %xmm1
; VEX-NEXT: vcvttpd2dq %ymm0, %xmm0
; VEX-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; VEX-NEXT: vzeroupper
; VEX-NEXT: retq
;
; AVX512F-LABEL: fptoui_8f64_to_8i16:
; AVX512F: # %bb.0:
@ -2636,17 +2557,12 @@ define <16 x i8> @fptosi_16f32_to_16i8(<16 x float> %a) {
; SSE-LABEL: fptosi_16f32_to_16i8:
; SSE: # %bb.0:
; SSE-NEXT: cvttps2dq %xmm3, %xmm3
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255]
; SSE-NEXT: pand %xmm4, %xmm3
; SSE-NEXT: cvttps2dq %xmm2, %xmm2
; SSE-NEXT: pand %xmm4, %xmm2
; SSE-NEXT: packuswb %xmm3, %xmm2
; SSE-NEXT: packssdw %xmm3, %xmm2
; SSE-NEXT: cvttps2dq %xmm1, %xmm1
; SSE-NEXT: pand %xmm4, %xmm1
; SSE-NEXT: cvttps2dq %xmm0, %xmm0
; SSE-NEXT: pand %xmm4, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm2, %xmm0
; SSE-NEXT: packssdw %xmm1, %xmm0
; SSE-NEXT: packsswb %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: fptosi_16f32_to_16i8:
@ -2654,13 +2570,10 @@ define <16 x i8> @fptosi_16f32_to_16i8(<16 x float> %a) {
; AVX1-NEXT: vcvttps2dq %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vcvttps2dq %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpackssdw %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
@ -2669,13 +2582,10 @@ define <16 x i8> @fptosi_16f32_to_16i8(<16 x float> %a) {
; AVX2-NEXT: vcvttps2dq %ymm1, %ymm1
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vcvttps2dq %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
; AVX2-NEXT: vpackssdw %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
@ -2693,16 +2603,11 @@ define <16 x i8> @fptoui_16f32_to_16i8(<16 x float> %a) {
; SSE-LABEL: fptoui_16f32_to_16i8:
; SSE: # %bb.0:
; SSE-NEXT: cvttps2dq %xmm3, %xmm3
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255]
; SSE-NEXT: pand %xmm4, %xmm3
; SSE-NEXT: cvttps2dq %xmm2, %xmm2
; SSE-NEXT: pand %xmm4, %xmm2
; SSE-NEXT: packuswb %xmm3, %xmm2
; SSE-NEXT: packssdw %xmm3, %xmm2
; SSE-NEXT: cvttps2dq %xmm1, %xmm1
; SSE-NEXT: pand %xmm4, %xmm1
; SSE-NEXT: cvttps2dq %xmm0, %xmm0
; SSE-NEXT: pand %xmm4, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: packssdw %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm2, %xmm0
; SSE-NEXT: retq
;
@ -2710,13 +2615,10 @@ define <16 x i8> @fptoui_16f32_to_16i8(<16 x float> %a) {
; AVX1: # %bb.0:
; AVX1-NEXT: vcvttps2dq %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vcvttps2dq %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
@ -2725,13 +2627,10 @@ define <16 x i8> @fptoui_16f32_to_16i8(<16 x float> %a) {
; AVX2: # %bb.0:
; AVX2-NEXT: vcvttps2dq %ymm1, %ymm1
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vcvttps2dq %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
; AVX2-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq

View File

@ -2726,40 +2726,22 @@ define <2 x i16> @fptoui_2f64_to_2i16(<2 x double> %a) {
define <8 x i16> @fptosi_8f64_to_8i16(<8 x double> %a) {
; SSE-LABEL: fptosi_8f64_to_8i16:
; SSE: # %bb.0:
; SSE-NEXT: cvttpd2dq %xmm3, %xmm3
; SSE-NEXT: cvttpd2dq %xmm2, %xmm2
; SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; SSE-NEXT: cvttpd2dq %xmm1, %xmm1
; SSE-NEXT: cvttpd2dq %xmm0, %xmm0
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,2,1,3,4,5,6,7]
; SSE-NEXT: cvttpd2dq %xmm3, %xmm0
; SSE-NEXT: cvttpd2dq %xmm2, %xmm2
; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,2,0]
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,7,5]
; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT: packssdw %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: fptosi_8f64_to_8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vcvttpd2dq %ymm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vcvttpd2dq %ymm0, %xmm0
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: fptosi_8f64_to_8i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vcvttpd2dq %ymm0, %xmm0
; AVX2-NEXT: vcvttpd2dq %ymm1, %xmm1
; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
; VEX-LABEL: fptosi_8f64_to_8i16:
; VEX: # %bb.0:
; VEX-NEXT: vcvttpd2dq %ymm1, %xmm1
; VEX-NEXT: vcvttpd2dq %ymm0, %xmm0
; VEX-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; VEX-NEXT: vzeroupper
; VEX-NEXT: retq
;
; AVX512F-LABEL: fptosi_8f64_to_8i16:
; AVX512F: # %bb.0:
@ -2797,146 +2779,28 @@ define <8 x i16> @fptosi_8f64_to_8i16(<8 x double> %a) {
define <8 x i16> @fptoui_8f64_to_8i16(<8 x double> %a) {
; SSE-LABEL: fptoui_8f64_to_8i16:
; SSE: # %bb.0:
; SSE-NEXT: movsd {{.*#+}} xmm4 = mem[0],zero
; SSE-NEXT: movapd %xmm1, %xmm5
; SSE-NEXT: subsd %xmm4, %xmm5
; SSE-NEXT: cvttsd2si %xmm5, %rcx
; SSE-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
; SSE-NEXT: xorq %rax, %rcx
; SSE-NEXT: cvttsd2si %xmm1, %rdx
; SSE-NEXT: ucomisd %xmm4, %xmm1
; SSE-NEXT: cmovaeq %rcx, %rdx
; SSE-NEXT: movq %rdx, %xmm5
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT: movapd %xmm1, %xmm6
; SSE-NEXT: subsd %xmm4, %xmm6
; SSE-NEXT: cvttsd2si %xmm6, %rcx
; SSE-NEXT: xorq %rax, %rcx
; SSE-NEXT: cvttsd2si %xmm1, %rdx
; SSE-NEXT: ucomisd %xmm4, %xmm1
; SSE-NEXT: cmovaeq %rcx, %rdx
; SSE-NEXT: movq %rdx, %xmm1
; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm1[0]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm1[0,2,2,3,4,5,6,7]
; SSE-NEXT: movapd %xmm0, %xmm1
; SSE-NEXT: subsd %xmm4, %xmm1
; SSE-NEXT: cvttsd2si %xmm1, %rcx
; SSE-NEXT: xorq %rax, %rcx
; SSE-NEXT: cvttsd2si %xmm0, %rdx
; SSE-NEXT: ucomisd %xmm4, %xmm0
; SSE-NEXT: cmovaeq %rcx, %rdx
; SSE-NEXT: movq %rdx, %xmm1
; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT: movapd %xmm0, %xmm6
; SSE-NEXT: subsd %xmm4, %xmm6
; SSE-NEXT: cvttsd2si %xmm6, %rcx
; SSE-NEXT: xorq %rax, %rcx
; SSE-NEXT: cvttsd2si %xmm0, %rdx
; SSE-NEXT: ucomisd %xmm4, %xmm0
; SSE-NEXT: cmovaeq %rcx, %rdx
; SSE-NEXT: movq %rdx, %xmm0
; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1]
; SSE-NEXT: movapd %xmm3, %xmm0
; SSE-NEXT: subsd %xmm4, %xmm0
; SSE-NEXT: cvttsd2si %xmm0, %rcx
; SSE-NEXT: xorq %rax, %rcx
; SSE-NEXT: cvttsd2si %xmm3, %rdx
; SSE-NEXT: ucomisd %xmm4, %xmm3
; SSE-NEXT: cmovaeq %rcx, %rdx
; SSE-NEXT: movq %rdx, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
; SSE-NEXT: movapd %xmm3, %xmm5
; SSE-NEXT: subsd %xmm4, %xmm5
; SSE-NEXT: cvttsd2si %xmm5, %rcx
; SSE-NEXT: xorq %rax, %rcx
; SSE-NEXT: cvttsd2si %xmm3, %rdx
; SSE-NEXT: ucomisd %xmm4, %xmm3
; SSE-NEXT: cmovaeq %rcx, %rdx
; SSE-NEXT: movq %rdx, %xmm3
; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
; SSE-NEXT: cvttpd2dq %xmm3, %xmm3
; SSE-NEXT: cvttpd2dq %xmm2, %xmm2
; SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE-NEXT: cvttpd2dq %xmm1, %xmm1
; SSE-NEXT: cvttpd2dq %xmm0, %xmm0
; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[0,1,0,2,4,5,6,7]
; SSE-NEXT: movapd %xmm2, %xmm0
; SSE-NEXT: subsd %xmm4, %xmm0
; SSE-NEXT: cvttsd2si %xmm0, %rcx
; SSE-NEXT: xorq %rax, %rcx
; SSE-NEXT: cvttsd2si %xmm2, %rdx
; SSE-NEXT: ucomisd %xmm4, %xmm2
; SSE-NEXT: cmovaeq %rcx, %rdx
; SSE-NEXT: movq %rdx, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
; SSE-NEXT: movapd %xmm2, %xmm5
; SSE-NEXT: subsd %xmm4, %xmm5
; SSE-NEXT: cvttsd2si %xmm5, %rcx
; SSE-NEXT: xorq %rax, %rcx
; SSE-NEXT: cvttsd2si %xmm2, %rax
; SSE-NEXT: ucomisd %xmm4, %xmm2
; SSE-NEXT: cmovaeq %rcx, %rax
; SSE-NEXT: movq %rax, %xmm2
; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE-NEXT: retq
;
; AVX1-LABEL: fptoui_8f64_to_8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovapd {{.*#+}} ymm2 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9]
; AVX1-NEXT: vcmpltpd %ymm2, %ymm1, %ymm3
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
; AVX1-NEXT: vpackssdw %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vsubpd %ymm2, %ymm1, %ymm4
; AVX1-NEXT: vcvttpd2dq %ymm4, %xmm4
; AVX1-NEXT: vmovapd {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648]
; AVX1-NEXT: vxorpd %xmm5, %xmm4, %xmm4
; AVX1-NEXT: vcvttpd2dq %ymm1, %xmm1
; AVX1-NEXT: vblendvps %xmm3, %xmm1, %xmm4, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vcmpltpd %ymm2, %ymm0, %ymm4
; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm6
; AVX1-NEXT: vpackssdw %xmm6, %xmm4, %xmm4
; AVX1-NEXT: vsubpd %ymm2, %ymm0, %ymm2
; AVX1-NEXT: vcvttpd2dq %ymm2, %xmm2
; AVX1-NEXT: vxorpd %xmm5, %xmm2, %xmm2
; AVX1-NEXT: vcvttpd2dq %ymm0, %xmm0
; AVX1-NEXT: vblendvps %xmm4, %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: fptoui_8f64_to_8i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9]
; AVX2-NEXT: vcmpltpd %ymm2, %ymm1, %ymm3
; AVX2-NEXT: vextractf128 $1, %ymm3, %xmm4
; AVX2-NEXT: vpackssdw %xmm4, %xmm3, %xmm3
; AVX2-NEXT: vsubpd %ymm2, %ymm1, %ymm4
; AVX2-NEXT: vcvttpd2dq %ymm4, %xmm4
; AVX2-NEXT: vbroadcastss {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648]
; AVX2-NEXT: vxorpd %xmm5, %xmm4, %xmm4
; AVX2-NEXT: vcvttpd2dq %ymm1, %xmm1
; AVX2-NEXT: vblendvps %xmm3, %xmm1, %xmm4, %xmm1
; AVX2-NEXT: vcmpltpd %ymm2, %ymm0, %ymm3
; AVX2-NEXT: vextractf128 $1, %ymm3, %xmm4
; AVX2-NEXT: vpackssdw %xmm4, %xmm3, %xmm3
; AVX2-NEXT: vsubpd %ymm2, %ymm0, %ymm2
; AVX2-NEXT: vcvttpd2dq %ymm2, %xmm2
; AVX2-NEXT: vxorpd %xmm5, %xmm2, %xmm2
; AVX2-NEXT: vcvttpd2dq %ymm0, %xmm0
; AVX2-NEXT: vblendvps %xmm3, %xmm0, %xmm2, %xmm0
; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
; VEX-LABEL: fptoui_8f64_to_8i16:
; VEX: # %bb.0:
; VEX-NEXT: vcvttpd2dq %ymm1, %xmm1
; VEX-NEXT: vcvttpd2dq %ymm0, %xmm0
; VEX-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; VEX-NEXT: vzeroupper
; VEX-NEXT: retq
;
; AVX512F-LABEL: fptoui_8f64_to_8i16:
; AVX512F: # %bb.0:
@ -2975,17 +2839,12 @@ define <16 x i8> @fptosi_16f32_to_16i8(<16 x float> %a) {
; SSE-LABEL: fptosi_16f32_to_16i8:
; SSE: # %bb.0:
; SSE-NEXT: cvttps2dq %xmm3, %xmm3
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255]
; SSE-NEXT: pand %xmm4, %xmm3
; SSE-NEXT: cvttps2dq %xmm2, %xmm2
; SSE-NEXT: pand %xmm4, %xmm2
; SSE-NEXT: packuswb %xmm3, %xmm2
; SSE-NEXT: packssdw %xmm3, %xmm2
; SSE-NEXT: cvttps2dq %xmm1, %xmm1
; SSE-NEXT: pand %xmm4, %xmm1
; SSE-NEXT: cvttps2dq %xmm0, %xmm0
; SSE-NEXT: pand %xmm4, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm2, %xmm0
; SSE-NEXT: packssdw %xmm1, %xmm0
; SSE-NEXT: packsswb %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: fptosi_16f32_to_16i8:
@ -2993,13 +2852,10 @@ define <16 x i8> @fptosi_16f32_to_16i8(<16 x float> %a) {
; AVX1-NEXT: vcvttps2dq %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vcvttps2dq %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpackssdw %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
@ -3008,13 +2864,10 @@ define <16 x i8> @fptosi_16f32_to_16i8(<16 x float> %a) {
; AVX2-NEXT: vcvttps2dq %ymm1, %ymm1
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vcvttps2dq %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
; AVX2-NEXT: vpackssdw %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
@ -3032,16 +2885,11 @@ define <16 x i8> @fptoui_16f32_to_16i8(<16 x float> %a) {
; SSE-LABEL: fptoui_16f32_to_16i8:
; SSE: # %bb.0:
; SSE-NEXT: cvttps2dq %xmm3, %xmm3
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255]
; SSE-NEXT: pand %xmm4, %xmm3
; SSE-NEXT: cvttps2dq %xmm2, %xmm2
; SSE-NEXT: pand %xmm4, %xmm2
; SSE-NEXT: packuswb %xmm3, %xmm2
; SSE-NEXT: packssdw %xmm3, %xmm2
; SSE-NEXT: cvttps2dq %xmm1, %xmm1
; SSE-NEXT: pand %xmm4, %xmm1
; SSE-NEXT: cvttps2dq %xmm0, %xmm0
; SSE-NEXT: pand %xmm4, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: packssdw %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm2, %xmm0
; SSE-NEXT: retq
;
@ -3050,12 +2898,9 @@ define <16 x i8> @fptoui_16f32_to_16i8(<16 x float> %a) {
; AVX1-NEXT: vcvttps2dq %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vcvttps2dq %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
@ -3065,12 +2910,9 @@ define <16 x i8> @fptoui_16f32_to_16i8(<16 x float> %a) {
; AVX2-NEXT: vcvttps2dq %ymm1, %ymm1
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vcvttps2dq %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
; AVX2-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX2-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq