[X86] Use EVEX encoded VRNDSCALE instructions to implement the legacy round intrinsics.

The VRNDSCALE instructions implement a superset of the (V)ROUND instructions. The two are equivalent when the upper 4 bits of the immediate are 0.
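
For reference, a minimal scalar model of the VRNDSCALE semantics, assuming the documented immediate layout (bits [3:0] are the rounding control, bits [7:4] are the scale field M, and the result is 2^-M * round(2^M * src)). This is an illustrative sketch, not LLVM code:

    // Scalar model of VRNDSCALE; (V)ROUND is the M == 0 special case.
    #include <cmath>
    #include <cstdint>

    double rndscale(double Src, uint8_t Imm) {
      int M  = Imm >> 4;          // scale field; always 0 for the legacy intrinsics
      int RC = Imm & 0x3;         // imm[2] (use MXCSR) and imm[3] (suppress
                                  // precision exceptions) are ignored here
      double Scale = std::ldexp(1.0, M);      // 2^M
      double V = Src * Scale, R;
      switch (RC) {
      case 0:  R = std::nearbyint(V); break;  // nearest even (default FP env)
      case 1:  R = std::floor(V);     break;  // toward -inf
      case 2:  R = std::ceil(V);      break;  // toward +inf
      default: R = std::trunc(V);     break;  // toward zero
      }
      return R / Scale;                       // 2^-M * round(2^M * Src)
    }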

This patch lowers the legacy intrinsics to the VRNDSCALE/VRNDSCALES ISD nodes and masks the upper bits of the immediate to 0. This lets us take advantage of the larger EVEX register encoding space.
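
Nothing changes from the source side; with AVX-512VL a plain call to a legacy rounding intrinsic can now be selected as the EVEX-encoded form. A small illustration (the function name is made up):

    #include <immintrin.h>

    // With -msse4.1 this selects roundps (vroundps under AVX); with
    // -mavx512vl the compiler may now pick vrndscaleps instead, which
    // can also address xmm16-xmm31.
    __m128 round_up(__m128 X) {
      return _mm_round_ps(X, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
    }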

We may want to consider converting VRNDSCALE back to VROUND in the EVEX-to-VEX pass when the extended registers are not being used.

I noticed some missed load-folding opportunities for the VRNDSCALESS/SD instructions (visible in the SKX test changes below); I'll try to fix those in future patches.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@318008 91177308-0d34-0410-b5e6-96231b3b80d8
Craig Topper, 2017-11-13 02:03:00 +00:00
commit e6cdd20402 (parent fa371074cb)
9 changed files with 197 additions and 98 deletions

@@ -20336,6 +20336,26 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
DAG.getIntPtrConstant(0, dl));
return DAG.getBitcast(Op.getValueType(), Res);
}
case ROUNDP: {
assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode");
// Clear the upper bits of the rounding immediate so that the legacy
// intrinsic can't trigger the scaling behavior of VRNDSCALE.
SDValue RoundingMode = DAG.getNode(ISD::AND, dl, MVT::i32,
Op.getOperand(2),
DAG.getConstant(0xf, dl, MVT::i32));
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
Op.getOperand(1), RoundingMode);
}
case ROUNDS: {
assert(IntrData->Opc0 == X86ISD::VRNDSCALES && "Unexpected opcode");
// Clear the upper bits of the rounding immediate so that the legacy
// intrinsic can't trigger the scaling behavior of VRNDSCALE.
SDValue RoundingMode = DAG.getNode(ISD::AND, dl, MVT::i32,
Op.getOperand(3),
DAG.getConstant(0xf, dl, MVT::i32));
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
Op.getOperand(1), Op.getOperand(2), RoundingMode);
}
default:
break;
}

@@ -430,6 +430,8 @@ namespace llvm {
// Reduce - Perform Reduction Transformation on scalar\packed FP.
VREDUCE, VREDUCE_RND, VREDUCES, VREDUCES_RND,
// RndScale - Round FP Values To Include A Given Number Of Fraction Bits.
// Also used by the legacy (V)ROUND intrinsics where we mask out the
// scaling part of the immediate.
VRNDSCALE, VRNDSCALE_RND, VRNDSCALES, VRNDSCALES_RND,
// Tests Types Of a FP Values for packed types.
VFPCLASS,

@@ -5785,8 +5785,9 @@ let Predicates = [UseAVX] in {
multiclass sse41_fp_unop_p<bits<8> opcps, bits<8> opcpd, string OpcodeStr,
X86MemOperand x86memop, RegisterClass RC,
ValueType VT32, ValueType VT64,
PatFrag mem_frag32, PatFrag mem_frag64,
Intrinsic V4F32Int, Intrinsic V2F64Int> {
SDNode OpNode> {
let ExeDomain = SSEPackedSingle in {
// Intrinsic operation, reg.
// Vector intrinsic operation, reg
@@ -5794,7 +5795,7 @@ let ExeDomain = SSEPackedSingle in {
(outs RC:$dst), (ins RC:$src1, i32u8imm:$src2),
!strconcat(OpcodeStr,
"ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set RC:$dst, (V4F32Int RC:$src1, imm:$src2))],
[(set RC:$dst, (VT32 (OpNode RC:$src1, imm:$src2)))],
IIC_SSE_ROUNDPS_REG>, Sched<[WriteFAdd]>;
// Vector intrinsic operation, mem
@@ -5803,7 +5804,7 @@ let ExeDomain = SSEPackedSingle in {
!strconcat(OpcodeStr,
"ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set RC:$dst,
(V4F32Int (mem_frag32 addr:$src1),imm:$src2))],
(VT32 (OpNode (mem_frag32 addr:$src1),imm:$src2)))],
IIC_SSE_ROUNDPS_MEM>, Sched<[WriteFAddLd]>;
} // ExeDomain = SSEPackedSingle
@@ -5813,7 +5814,7 @@ let ExeDomain = SSEPackedDouble in {
(outs RC:$dst), (ins RC:$src1, i32u8imm:$src2),
!strconcat(OpcodeStr,
"pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set RC:$dst, (V2F64Int RC:$src1, imm:$src2))],
[(set RC:$dst, (VT64 (OpNode RC:$src1, imm:$src2)))],
IIC_SSE_ROUNDPS_REG>, Sched<[WriteFAdd]>;
// Vector intrinsic operation, mem
@@ -5822,7 +5823,7 @@ let ExeDomain = SSEPackedDouble in {
!strconcat(OpcodeStr,
"pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set RC:$dst,
(V2F64Int (mem_frag64 addr:$src1),imm:$src2))],
(VT64 (OpNode (mem_frag64 addr:$src1),imm:$src2)))],
IIC_SSE_ROUNDPS_REG>, Sched<[WriteFAddLd]>;
} // ExeDomain = SSEPackedDouble
}
@@ -5894,9 +5895,8 @@ let ExeDomain = SSEPackedDouble, hasSideEffects = 0 in {
}
multiclass sse41_fp_binop_s<bits<8> opcss, bits<8> opcsd,
string OpcodeStr,
Intrinsic F32Int,
Intrinsic F64Int, bit Is2Addr = 1> {
string OpcodeStr, ValueType VT32, ValueType VT64,
SDNode OpNode, bit Is2Addr = 1> {
let ExeDomain = SSEPackedSingle, isCodeGenOnly = 1 in {
def SSr_Int : SS4AIi8<opcss, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3),
@@ -5905,7 +5905,7 @@ let ExeDomain = SSEPackedSingle, isCodeGenOnly = 1 in {
"ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
!strconcat(OpcodeStr,
"ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
[(set VR128:$dst, (F32Int VR128:$src1, VR128:$src2, imm:$src3))]>,
[(set VR128:$dst, (VT32 (OpNode VR128:$src1, VR128:$src2, imm:$src3)))]>,
Sched<[WriteFAdd]>;
def SSm_Int : SS4AIi8<opcss, MRMSrcMem,
@@ -5916,7 +5916,7 @@ let ExeDomain = SSEPackedSingle, isCodeGenOnly = 1 in {
!strconcat(OpcodeStr,
"ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
[(set VR128:$dst,
(F32Int VR128:$src1, sse_load_f32:$src2, imm:$src3))]>,
(OpNode VR128:$src1, sse_load_f32:$src2, imm:$src3))]>,
Sched<[WriteFAddLd, ReadAfterLd]>;
} // ExeDomain = SSEPackedSingle, isCodeGenOnly = 1
@@ -5928,7 +5928,7 @@ let ExeDomain = SSEPackedDouble, isCodeGenOnly = 1 in {
"sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
!strconcat(OpcodeStr,
"sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
[(set VR128:$dst, (F64Int VR128:$src1, VR128:$src2, imm:$src3))]>,
[(set VR128:$dst, (VT64 (OpNode VR128:$src1, VR128:$src2, imm:$src3)))]>,
Sched<[WriteFAdd]>;
def SDm_Int : SS4AIi8<opcsd, MRMSrcMem,
@@ -5939,25 +5939,24 @@ let ExeDomain = SSEPackedDouble, isCodeGenOnly = 1 in {
!strconcat(OpcodeStr,
"sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
[(set VR128:$dst,
(F64Int VR128:$src1, sse_load_f64:$src2, imm:$src3))]>,
(OpNode VR128:$src1, sse_load_f64:$src2, imm:$src3))]>,
Sched<[WriteFAddLd, ReadAfterLd]>;
} // ExeDomain = SSEPackedDouble, isCodeGenOnly = 1
}
// FP round - roundss, roundps, roundsd, roundpd
let Predicates = [HasAVX] in {
let Predicates = [HasAVX, NoVLX] in {
// Intrinsic form
defm VROUND : sse41_fp_unop_p<0x08, 0x09, "vround", f128mem, VR128,
loadv4f32, loadv2f64,
int_x86_sse41_round_ps,
int_x86_sse41_round_pd>, VEX, VEX_WIG;
defm VROUNDY : sse41_fp_unop_p<0x08, 0x09, "vround", f256mem, VR256,
loadv8f32, loadv4f64,
int_x86_avx_round_ps_256,
int_x86_avx_round_pd_256>, VEX, VEX_L, VEX_WIG;
defm VROUND : sse41_fp_binop_s<0x0A, 0x0B, "vround",
int_x86_sse41_round_ss,
int_x86_sse41_round_sd, 0>, VEX_4V, VEX_LIG, VEX_WIG;
defm VROUND : sse41_fp_unop_p<0x08, 0x09, "vround", f128mem, VR128, v4f32,
v2f64, loadv4f32, loadv2f64, X86VRndScale>,
VEX, VEX_WIG;
defm VROUNDY : sse41_fp_unop_p<0x08, 0x09, "vround", f256mem, VR256, v8f32,
v4f64, loadv8f32, loadv4f64, X86VRndScale>,
VEX, VEX_L, VEX_WIG;
}
let Predicates = [HasAVX, NoAVX512] in {
defm VROUND : sse41_fp_binop_s<0x0A, 0x0B, "vround", v4f32, v2f64,
X86RndScales, 0>, VEX_4V, VEX_LIG, VEX_WIG;
defm VROUND : avx_fp_unop_rm<0x0A, 0x0B, "vround">, VEX_4V, VEX_LIG, VEX_WIG;
}
@@ -6030,15 +6029,13 @@ let Predicates = [HasAVX, NoVLX] in {
(VROUNDYPDr VR256:$src, (i32 0xB))>;
}
defm ROUND : sse41_fp_unop_p<0x08, 0x09, "round", f128mem, VR128,
memopv4f32, memopv2f64, int_x86_sse41_round_ps,
int_x86_sse41_round_pd>;
defm ROUND : sse41_fp_unop_p<0x08, 0x09, "round", f128mem, VR128, v4f32, v2f64,
memopv4f32, memopv2f64, X86VRndScale>;
defm ROUND : sse41_fp_unop_s<0x0A, 0x0B, "round">;
let Constraints = "$src1 = $dst" in
defm ROUND : sse41_fp_binop_s<0x0A, 0x0B, "round",
int_x86_sse41_round_ss, int_x86_sse41_round_sd>;
defm ROUND : sse41_fp_binop_s<0x0A, 0x0B, "round", v4f32, v2f64, X86RndScales>;
let Predicates = [UseSSE41] in {
def : Pat<(ffloor FR32:$src),

@@ -38,6 +38,7 @@ enum IntrinsicType : uint16_t {
EXPAND_FROM_MEM,
TERLOG_OP_MASK, TERLOG_OP_MASKZ, BROADCASTM, KUNPCK, FIXUPIMM, FIXUPIMM_MASKZ, FIXUPIMMS,
FIXUPIMMS_MASKZ, CONVERT_TO_MASK, GATHER_AVX2, MASK_BINOP,
ROUNDP, ROUNDS
};
struct IntrinsicData {
@@ -363,6 +364,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx_movmsk_pd_256, INTR_TYPE_1OP, X86ISD::MOVMSK, 0),
X86_INTRINSIC_DATA(avx_movmsk_ps_256, INTR_TYPE_1OP, X86ISD::MOVMSK, 0),
X86_INTRINSIC_DATA(avx_rcp_ps_256, INTR_TYPE_1OP, X86ISD::FRCP, 0),
X86_INTRINSIC_DATA(avx_round_pd_256, ROUNDP, X86ISD::VRNDSCALE, 0),
X86_INTRINSIC_DATA(avx_round_ps_256, ROUNDP, X86ISD::VRNDSCALE, 0),
X86_INTRINSIC_DATA(avx_rsqrt_ps_256, INTR_TYPE_1OP, X86ISD::FRSQRT, 0),
X86_INTRINSIC_DATA(avx_sqrt_pd_256, INTR_TYPE_1OP, ISD::FSQRT, 0),
X86_INTRINSIC_DATA(avx_sqrt_ps_256, INTR_TYPE_1OP, ISD::FSQRT, 0),
@@ -1577,6 +1580,10 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(sse41_insertps, INTR_TYPE_3OP, X86ISD::INSERTPS, 0),
X86_INTRINSIC_DATA(sse41_packusdw, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
X86_INTRINSIC_DATA(sse41_pmuldq, INTR_TYPE_2OP, X86ISD::PMULDQ, 0),
X86_INTRINSIC_DATA(sse41_round_pd, ROUNDP, X86ISD::VRNDSCALE, 0),
X86_INTRINSIC_DATA(sse41_round_ps, ROUNDP, X86ISD::VRNDSCALE, 0),
X86_INTRINSIC_DATA(sse41_round_sd, ROUNDS, X86ISD::VRNDSCALES, 0),
X86_INTRINSIC_DATA(sse41_round_ss, ROUNDS, X86ISD::VRNDSCALES, 0),
X86_INTRINSIC_DATA(sse4a_extrqi, INTR_TYPE_3OP, X86ISD::EXTRQI, 0),
X86_INTRINSIC_DATA(sse4a_insertqi, INTR_TYPE_4OP, X86ISD::INSERTQI, 0),
X86_INTRINSIC_DATA(ssse3_phadd_d_128, INTR_TYPE_2OP, X86ISD::HADD, 0),

@@ -592,10 +592,15 @@ declare <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float>) nounwind readnone
define <4 x double> @test_x86_avx_round_pd_256(<4 x double> %a0) {
; CHECK-LABEL: test_x86_avx_round_pd_256:
; CHECK: # BB#0:
; CHECK-NEXT: vroundpd $7, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x09,0xc0,0x07]
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
; AVX-LABEL: test_x86_avx_round_pd_256:
; AVX: # BB#0:
; AVX-NEXT: vroundpd $7, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x09,0xc0,0x07]
; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512VL-LABEL: test_x86_avx_round_pd_256:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vrndscalepd $7, %ymm0, %ymm0 # encoding: [0x62,0xf3,0xfd,0x28,0x09,0xc0,0x07]
; AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %a0, i32 7) ; <<4 x double>> [#uses=1]
ret <4 x double> %res
}
@@ -603,10 +608,15 @@ declare <4 x double> @llvm.x86.avx.round.pd.256(<4 x double>, i32) nounwind read
define <8 x float> @test_x86_avx_round_ps_256(<8 x float> %a0) {
; CHECK-LABEL: test_x86_avx_round_ps_256:
; CHECK: # BB#0:
; CHECK-NEXT: vroundps $7, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x08,0xc0,0x07]
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
; AVX-LABEL: test_x86_avx_round_ps_256:
; AVX: # BB#0:
; AVX-NEXT: vroundps $7, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x08,0xc0,0x07]
; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512VL-LABEL: test_x86_avx_round_ps_256:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vrndscaleps $7, %ymm0, %ymm0 # encoding: [0x62,0xf3,0x7d,0x28,0x08,0xc0,0x07]
; AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %a0, i32 7) ; <<8 x float>> [#uses=1]
ret <8 x float> %res
}

@@ -4046,8 +4046,8 @@ define <4 x double> @test_roundpd(<4 x double> %a0, <4 x double> *%a1) {
;
; SKX-LABEL: test_roundpd:
; SKX: # BB#0:
; SKX-NEXT: vroundpd $7, %ymm0, %ymm0 # sched: [8:0.67]
; SKX-NEXT: vroundpd $7, (%rdi), %ymm1 # sched: [15:0.67]
; SKX-NEXT: vrndscalepd $7, %ymm0, %ymm0 # sched: [8:0.67]
; SKX-NEXT: vrndscalepd $7, (%rdi), %ymm1 # sched: [15:0.67]
; SKX-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [4:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
;
@@ -4110,8 +4110,8 @@ define <8 x float> @test_roundps(<8 x float> %a0, <8 x float> *%a1) {
;
; SKX-LABEL: test_roundps:
; SKX: # BB#0:
; SKX-NEXT: vroundps $7, %ymm0, %ymm0 # sched: [8:0.67]
; SKX-NEXT: vroundps $7, (%rdi), %ymm1 # sched: [15:0.67]
; SKX-NEXT: vrndscaleps $7, %ymm0, %ymm0 # sched: [8:0.67]
; SKX-NEXT: vrndscaleps $7, (%rdi), %ymm1 # sched: [15:0.67]
; SKX-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [4:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
;

@@ -451,10 +451,15 @@ define <2 x double> @test_x86_sse41_round_pd(<2 x double> %a0) {
; SSE41-NEXT: roundpd $7, %xmm0, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x09,0xc0,0x07]
; SSE41-NEXT: retl ## encoding: [0xc3]
;
; VCHECK-LABEL: test_x86_sse41_round_pd:
; VCHECK: ## BB#0:
; VCHECK-NEXT: vroundpd $7, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x09,0xc0,0x07]
; VCHECK-NEXT: retl ## encoding: [0xc3]
; AVX2-LABEL: test_x86_sse41_round_pd:
; AVX2: ## BB#0:
; AVX2-NEXT: vroundpd $7, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x09,0xc0,0x07]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse41_round_pd:
; SKX: ## BB#0:
; SKX-NEXT: vrndscalepd $7, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0xfd,0x08,0x09,0xc0,0x07]
; SKX-NEXT: retl ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %a0, i32 7) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
@@ -467,10 +472,15 @@ define <4 x float> @test_x86_sse41_round_ps(<4 x float> %a0) {
; SSE41-NEXT: roundps $7, %xmm0, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x08,0xc0,0x07]
; SSE41-NEXT: retl ## encoding: [0xc3]
;
; VCHECK-LABEL: test_x86_sse41_round_ps:
; VCHECK: ## BB#0:
; VCHECK-NEXT: vroundps $7, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x08,0xc0,0x07]
; VCHECK-NEXT: retl ## encoding: [0xc3]
; AVX2-LABEL: test_x86_sse41_round_ps:
; AVX2: ## BB#0:
; AVX2-NEXT: vroundps $7, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x08,0xc0,0x07]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse41_round_ps:
; SKX: ## BB#0:
; SKX-NEXT: vrndscaleps $7, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x08,0xc0,0x07]
; SKX-NEXT: retl ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a0, i32 7) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -483,10 +493,15 @@ define <2 x double> @test_x86_sse41_round_sd(<2 x double> %a0, <2 x double> %a1)
; SSE41-NEXT: roundsd $7, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0b,0xc1,0x07]
; SSE41-NEXT: retl ## encoding: [0xc3]
;
; VCHECK-LABEL: test_x86_sse41_round_sd:
; VCHECK: ## BB#0:
; VCHECK-NEXT: vroundsd $7, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0b,0xc1,0x07]
; VCHECK-NEXT: retl ## encoding: [0xc3]
; AVX2-LABEL: test_x86_sse41_round_sd:
; AVX2: ## BB#0:
; AVX2-NEXT: vroundsd $7, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0b,0xc1,0x07]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse41_round_sd:
; SKX: ## BB#0:
; SKX-NEXT: vrndscalesd $7, %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0xfd,0x08,0x0b,0xc1,0x07]
; SKX-NEXT: retl ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %a0, <2 x double> %a1, i32 7) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
@@ -500,11 +515,18 @@ define <2 x double> @test_x86_sse41_round_sd_load(<2 x double> %a0, <2 x double>
; SSE41-NEXT: roundsd $7, (%eax), %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0b,0x00,0x07]
; SSE41-NEXT: retl ## encoding: [0xc3]
;
; VCHECK-LABEL: test_x86_sse41_round_sd_load:
; VCHECK: ## BB#0:
; VCHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; VCHECK-NEXT: vroundsd $7, (%eax), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0b,0x00,0x07]
; VCHECK-NEXT: retl ## encoding: [0xc3]
; AVX2-LABEL: test_x86_sse41_round_sd_load:
; AVX2: ## BB#0:
; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; AVX2-NEXT: vroundsd $7, (%eax), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0b,0x00,0x07]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse41_round_sd_load:
; SKX: ## BB#0:
; SKX-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; SKX-NEXT: vmovapd (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0x08]
; SKX-NEXT: vrndscalesd $7, %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0xfd,0x08,0x0b,0xc1,0x07]
; SKX-NEXT: retl ## encoding: [0xc3]
%a1b = load <2 x double>, <2 x double>* %a1
%res = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %a0, <2 x double> %a1b, i32 7) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
@@ -517,10 +539,15 @@ define <4 x float> @test_x86_sse41_round_ss(<4 x float> %a0, <4 x float> %a1) {
; SSE41-NEXT: roundss $7, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0a,0xc1,0x07]
; SSE41-NEXT: retl ## encoding: [0xc3]
;
; VCHECK-LABEL: test_x86_sse41_round_ss:
; VCHECK: ## BB#0:
; VCHECK-NEXT: vroundss $7, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0a,0xc1,0x07]
; VCHECK-NEXT: retl ## encoding: [0xc3]
; AVX2-LABEL: test_x86_sse41_round_ss:
; AVX2: ## BB#0:
; AVX2-NEXT: vroundss $7, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0a,0xc1,0x07]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse41_round_ss:
; SKX: ## BB#0:
; SKX-NEXT: vrndscaless $7, %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x0a,0xc1,0x07]
; SKX-NEXT: retl ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %a0, <4 x float> %a1, i32 7) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}

@@ -3006,8 +3006,8 @@ define <2 x double> @test_roundpd(<2 x double> %a0, <2 x double> *%a1) {
;
; SKX-LABEL: test_roundpd:
; SKX: # BB#0:
; SKX-NEXT: vroundpd $7, %xmm0, %xmm0 # sched: [8:0.67]
; SKX-NEXT: vroundpd $7, (%rdi), %xmm1 # sched: [14:0.67]
; SKX-NEXT: vrndscalepd $7, %xmm0, %xmm0 # sched: [8:0.67]
; SKX-NEXT: vrndscalepd $7, (%rdi), %xmm1 # sched: [14:0.67]
; SKX-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
;
@@ -3078,8 +3078,8 @@ define <4 x float> @test_roundps(<4 x float> %a0, <4 x float> *%a1) {
;
; SKX-LABEL: test_roundps:
; SKX: # BB#0:
; SKX-NEXT: vroundps $7, %xmm0, %xmm0 # sched: [8:0.67]
; SKX-NEXT: vroundps $7, (%rdi), %xmm1 # sched: [14:0.67]
; SKX-NEXT: vrndscaleps $7, %xmm0, %xmm0 # sched: [8:0.67]
; SKX-NEXT: vrndscaleps $7, (%rdi), %xmm1 # sched: [14:0.67]
; SKX-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
;
@@ -3151,8 +3151,9 @@ define <2 x double> @test_roundsd(<2 x double> %a0, <2 x double> %a1, <2 x doubl
;
; SKX-LABEL: test_roundsd:
; SKX: # BB#0:
; SKX-NEXT: vroundsd $7, %xmm1, %xmm0, %xmm1 # sched: [8:0.67]
; SKX-NEXT: vroundsd $7, (%rdi), %xmm0, %xmm0 # sched: [14:0.67]
; SKX-NEXT: vrndscalesd $7, %xmm1, %xmm0, %xmm1 # sched: [8:0.67]
; SKX-NEXT: vmovapd (%rdi), %xmm2 # sched: [6:0.50]
; SKX-NEXT: vrndscalesd $7, %xmm2, %xmm0, %xmm0 # sched: [8:0.67]
; SKX-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [4:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
;
@@ -3224,8 +3225,9 @@ define <4 x float> @test_roundss(<4 x float> %a0, <4 x float> %a1, <4 x float> *
;
; SKX-LABEL: test_roundss:
; SKX: # BB#0:
; SKX-NEXT: vroundss $7, %xmm1, %xmm0, %xmm1 # sched: [8:0.67]
; SKX-NEXT: vroundss $7, (%rdi), %xmm0, %xmm0 # sched: [14:0.67]
; SKX-NEXT: vrndscaless $7, %xmm1, %xmm0, %xmm1 # sched: [8:0.67]
; SKX-NEXT: vmovaps (%rdi), %xmm2 # sched: [6:0.50]
; SKX-NEXT: vrndscaless $7, %xmm2, %xmm0, %xmm0 # sched: [8:0.67]
; SKX-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [4:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
;

@@ -176,16 +176,27 @@ define <4 x float> @test3(<4 x float> %A, float *%b, i32 %C) nounwind {
; X64-NEXT: roundss $4, (%rdi), %xmm0
; X64-NEXT: retq
;
; X32_AVX-LABEL: test3:
; X32_AVX: ## BB#0:
; X32_AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32_AVX-NEXT: vroundss $4, (%eax), %xmm0, %xmm0
; X32_AVX-NEXT: retl
; X32_AVX1-LABEL: test3:
; X32_AVX1: ## BB#0:
; X32_AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32_AVX1-NEXT: vroundss $4, (%eax), %xmm0, %xmm0
; X32_AVX1-NEXT: retl
;
; X64_AVX-LABEL: test3:
; X64_AVX: ## BB#0:
; X64_AVX-NEXT: vroundss $4, (%rdi), %xmm0, %xmm0
; X64_AVX-NEXT: retq
; X64_AVX1-LABEL: test3:
; X64_AVX1: ## BB#0:
; X64_AVX1-NEXT: vroundss $4, (%rdi), %xmm0, %xmm0
; X64_AVX1-NEXT: retq
;
; X32_AVX512-LABEL: test3:
; X32_AVX512: ## BB#0:
; X32_AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32_AVX512-NEXT: vrndscaless $4, (%eax), %xmm0, %xmm0
; X32_AVX512-NEXT: retl
;
; X64_AVX512-LABEL: test3:
; X64_AVX512: ## BB#0:
; X64_AVX512-NEXT: vrndscaless $4, (%rdi), %xmm0, %xmm0
; X64_AVX512-NEXT: retq
%a = load float , float *%b
%B = insertelement <4 x float> undef, float %a, i32 0
%X = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %A, <4 x float> %B, i32 4)
@@ -214,26 +225,49 @@ define <4 x float> @test4(<4 x float> %A, float *%b, i32 %C) nounwind {
; X64-NEXT: addq $24, %rsp
; X64-NEXT: retq
;
; X32_AVX-LABEL: test4:
; X32_AVX: ## BB#0:
; X32_AVX-NEXT: subl $28, %esp
; X32_AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32_AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32_AVX-NEXT: vmovaps %xmm0, (%esp) ## 16-byte Spill
; X32_AVX-NEXT: calll _f
; X32_AVX-NEXT: vroundss $4, (%esp), %xmm0, %xmm0 ## 16-byte Folded Reload
; X32_AVX-NEXT: addl $28, %esp
; X32_AVX-NEXT: retl
; X32_AVX1-LABEL: test4:
; X32_AVX1: ## BB#0:
; X32_AVX1-NEXT: subl $28, %esp
; X32_AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32_AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32_AVX1-NEXT: vmovaps %xmm0, (%esp) ## 16-byte Spill
; X32_AVX1-NEXT: calll _f
; X32_AVX1-NEXT: vroundss $4, (%esp), %xmm0, %xmm0 ## 16-byte Folded Reload
; X32_AVX1-NEXT: addl $28, %esp
; X32_AVX1-NEXT: retl
;
; X64_AVX-LABEL: test4:
; X64_AVX: ## BB#0:
; X64_AVX-NEXT: subq $24, %rsp
; X64_AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64_AVX-NEXT: vmovaps %xmm0, (%rsp) ## 16-byte Spill
; X64_AVX-NEXT: callq _f
; X64_AVX-NEXT: vroundss $4, (%rsp), %xmm0, %xmm0 ## 16-byte Folded Reload
; X64_AVX-NEXT: addq $24, %rsp
; X64_AVX-NEXT: retq
; X64_AVX1-LABEL: test4:
; X64_AVX1: ## BB#0:
; X64_AVX1-NEXT: subq $24, %rsp
; X64_AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64_AVX1-NEXT: vmovaps %xmm0, (%rsp) ## 16-byte Spill
; X64_AVX1-NEXT: callq _f
; X64_AVX1-NEXT: vroundss $4, (%rsp), %xmm0, %xmm0 ## 16-byte Folded Reload
; X64_AVX1-NEXT: addq $24, %rsp
; X64_AVX1-NEXT: retq
;
; X32_AVX512-LABEL: test4:
; X32_AVX512: ## BB#0:
; X32_AVX512-NEXT: subl $28, %esp
; X32_AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32_AVX512-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32_AVX512-NEXT: vmovaps %xmm0, (%esp) ## 16-byte Spill
; X32_AVX512-NEXT: calll _f
; X32_AVX512-NEXT: vmovaps (%esp), %xmm1 ## 16-byte Reload
; X32_AVX512-NEXT: vrndscaless $4, %xmm1, %xmm0, %xmm0
; X32_AVX512-NEXT: addl $28, %esp
; X32_AVX512-NEXT: retl
;
; X64_AVX512-LABEL: test4:
; X64_AVX512: ## BB#0:
; X64_AVX512-NEXT: subq $24, %rsp
; X64_AVX512-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64_AVX512-NEXT: vmovaps %xmm0, (%rsp) ## 16-byte Spill
; X64_AVX512-NEXT: callq _f
; X64_AVX512-NEXT: vmovaps (%rsp), %xmm1 ## 16-byte Reload
; X64_AVX512-NEXT: vrndscaless $4, %xmm1, %xmm0, %xmm0
; X64_AVX512-NEXT: addq $24, %rsp
; X64_AVX512-NEXT: retq
%a = load float , float *%b
%B = insertelement <4 x float> undef, float %a, i32 0
%q = call <4 x float> @f()