[X86] Replace X86ISD::AVG with generic ISD::AVGCEILU

Pulled out of D106237, this replaces the X86ISD::AVG DAG node with the
generic ISD::AVGCEILU. It doesn't remove the detectAVGPattern method,
but the extra generic ISel matching does alter the existing tests.
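
For reference, both opcodes share the rounding-up ("ceiling") unsigned
average semantics of the x86 PAVGB/PAVGW instructions. A minimal scalar
sketch of that semantics (an illustration, not code from this patch):

    #include <cstdint>

    // Ceiling unsigned average: the sum is evaluated at full precision,
    // so a + b + 1 cannot wrap before the shift.
    uint8_t avgceilu8(uint8_t a, uint8_t b) {
      return static_cast<uint8_t>((uint16_t(a) + uint16_t(b) + 1) >> 1);
    }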

Differential Revision: https://reviews.llvm.org/D119073
Author: David Green
Date:   2022-02-11 18:57:18 +00:00
parent 19fdf85f58
commit f810b40c3b
8 changed files with 70 additions and 54 deletions

llvm/lib/Target/X86/X86ISelLowering.cpp

@@ -949,6 +949,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
     setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
     setOperationAction(ISD::MUL, MVT::v8i16, Legal);
+    setOperationAction(ISD::AVGCEILU, MVT::v16i8, Legal);
+    setOperationAction(ISD::AVGCEILU, MVT::v8i16, Legal);
     setOperationAction(ISD::SMULO, MVT::v16i8, Custom);
     setOperationAction(ISD::UMULO, MVT::v16i8, Custom);
@@ -1353,6 +1355,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
     setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
     setOperationAction(ISD::MULHS, MVT::v32i8, Custom);
+    if (HasInt256) {
+      setOperationAction(ISD::AVGCEILU, MVT::v16i16, Legal);
+      setOperationAction(ISD::AVGCEILU, MVT::v32i8, Legal);
+    }
     setOperationAction(ISD::SMULO, MVT::v32i8, Custom);
     setOperationAction(ISD::UMULO, MVT::v32i8, Custom);
@@ -1652,6 +1658,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::MULHU, MVT::v32i16, HasBWI ? Legal : Custom);
     setOperationAction(ISD::MULHS, MVT::v64i8, Custom);
     setOperationAction(ISD::MULHU, MVT::v64i8, Custom);
+    if (HasBWI) {
+      setOperationAction(ISD::AVGCEILU, MVT::v32i16, Legal);
+      setOperationAction(ISD::AVGCEILU, MVT::v64i8, Legal);
+    }
     setOperationAction(ISD::SMULO, MVT::v64i8, Custom);
     setOperationAction(ISD::UMULO, MVT::v64i8, Custom);
@@ -31807,9 +31817,8 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
     Results.push_back(Res);
     return;
   }
-  case X86ISD::VPMADDWD:
-  case X86ISD::AVG: {
-    // Legalize types for X86ISD::AVG/VPMADDWD by widening.
+  case X86ISD::VPMADDWD: {
+    // Legalize types for X86ISD::VPMADDWD by widening.
     assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
     EVT VT = N->getValueType(0);
@@ -33041,7 +33050,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(SCALEF_RND)
   NODE_NAME_CASE(SCALEFS)
   NODE_NAME_CASE(SCALEFS_RND)
-  NODE_NAME_CASE(AVG)
   NODE_NAME_CASE(MULHRS)
   NODE_NAME_CASE(SINT_TO_FP_RND)
   NODE_NAME_CASE(UINT_TO_FP_RND)
@@ -33222,7 +33230,6 @@ bool X86TargetLowering::isBinOp(unsigned Opcode) const {
 bool X86TargetLowering::isCommutativeBinOp(unsigned Opcode) const {
   switch (Opcode) {
   // TODO: Add more X86ISD opcodes once we have test coverage.
-  case X86ISD::AVG:
   case X86ISD::PCMPEQ:
   case X86ISD::PMULDQ:
   case X86ISD::PMULUDQ:
@@ -40632,7 +40639,6 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
   case X86ISD::UNPCKH:
   case X86ISD::BLENDI:
   // Integer ops.
-  case X86ISD::AVG:
   case X86ISD::PACKSS:
   case X86ISD::PACKUS:
   // Horizontal Ops.
@@ -47789,7 +47795,7 @@ static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
 /// This function detects the AVG pattern between vectors of unsigned i8/i16,
 /// which is c = (a + b + 1) / 2, and replace this operation with the efficient
-/// X86ISD::AVG instruction.
+/// ISD::AVGCEILU (AVG) instruction.
 static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
                                 const X86Subtarget &Subtarget,
                                 const SDLoc &DL) {
@@ -47852,7 +47858,7 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
   auto AVGBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
                        ArrayRef<SDValue> Ops) {
-    return DAG.getNode(X86ISD::AVG, DL, Ops[0].getValueType(), Ops);
+    return DAG.getNode(ISD::AVGCEILU, DL, Ops[0].getValueType(), Ops);
   };
   auto AVGSplitter = [&](std::array<SDValue, 2> Ops) {
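
detectAVGPattern continues to match the widened c = (a + b + 1) / 2 form
described above and now hands it to ISD::AVGCEILU via this builder. For
intuition, the same ceiling average can also be computed without widening
via a standard bit identity (an illustrative sketch, not the DAG this
method builds):

    #include <cstdint>

    // avgceil(a, b) == (a | b) - ((a ^ b) >> 1); this stays within the
    // lane width, unlike the naive (a + b + 1) / 2.
    uint8_t avgceilu8_nowiden(uint8_t a, uint8_t b) {
      return static_cast<uint8_t>((a | b) - ((a ^ b) >> 1));
    }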

llvm/lib/Target/X86/X86ISelLowering.h

@@ -249,9 +249,6 @@ namespace llvm {
     SCALEFS,
     SCALEFS_RND,
-    // Unsigned Integer average.
-    AVG,
     /// Integer horizontal add/sub.
     HADD,
     HSUB,

llvm/lib/Target/X86/X86InstrAVX512.td

@@ -5039,7 +5039,7 @@ defm VPMULHUW : avx512_binop_rm_vl_w<0xE4, "vpmulhuw", mulhu, SchedWriteVecIMul,
                                      HasBWI, 1>;
 defm VPMULHRSW : avx512_binop_rm_vl_w<0x0B, "vpmulhrsw", X86mulhrs,
                                       SchedWriteVecIMul, HasBWI, 1>, T8PD;
-defm VPAVG : avx512_binop_rm_vl_bw<0xE0, 0xE3, "vpavg", X86avg,
+defm VPAVG : avx512_binop_rm_vl_bw<0xE0, 0xE3, "vpavg", avgceilu,
                                    SchedWriteVecALU, HasBWI, 1>;
 defm VPMULDQ : avx512_binop_rm_vl_q<0x28, "vpmuldq", X86pmuldq,
                                     SchedWriteVecIMul, HasAVX512, 1>, T8PD;

llvm/lib/Target/X86/X86InstrFragmentsSIMD.td

@@ -287,7 +287,6 @@ def SDTX86CmpPTest : SDTypeProfile<1, 2, [SDTCisVT<0, i32>,
                                           SDTCisSameAs<2, 1>]>;
 def X86mulhrs : SDNode<"X86ISD::MULHRS", SDTIntBinOp, [SDNPCommutative]>;
-def X86avg : SDNode<"X86ISD::AVG" , SDTIntBinOp, [SDNPCommutative]>;
 def X86ptest : SDNode<"X86ISD::PTEST", SDTX86CmpPTest>;
 def X86testp : SDNode<"X86ISD::TESTP", SDTX86CmpPTest>;
 def X86kortest : SDNode<"X86ISD::KORTEST", SDTX86CmpPTest>;

llvm/lib/Target/X86/X86InstrSSE.td

@@ -3471,9 +3471,9 @@ defm PMAXUB : PDI_binop_all<0xDE, "pmaxub", umax, v16i8, v32i8,
                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
 defm PMAXSW : PDI_binop_all<0xEE, "pmaxsw", smax, v8i16, v16i16,
                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
-defm PAVGB : PDI_binop_all<0xE0, "pavgb", X86avg, v16i8, v32i8,
+defm PAVGB : PDI_binop_all<0xE0, "pavgb", avgceilu, v16i8, v32i8,
                            SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
-defm PAVGW : PDI_binop_all<0xE3, "pavgw", X86avg, v8i16, v16i16,
+defm PAVGW : PDI_binop_all<0xE3, "pavgw", avgceilu, v8i16, v16i16,
                            SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
 defm PMULUDQ : PDI_binop_all<0xF4, "pmuludq", X86pmuludq, v2i64, v4i64,
                              SchedWriteVecIMul, 1, NoVLX>;

llvm/lib/Target/X86/X86IntrinsicsInfo.h

@@ -371,8 +371,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
   X86_INTRINSIC_DATA(avx2_packsswb, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
   X86_INTRINSIC_DATA(avx2_packusdw, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
   X86_INTRINSIC_DATA(avx2_packuswb, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
-  X86_INTRINSIC_DATA(avx2_pavg_b, INTR_TYPE_2OP, X86ISD::AVG, 0),
-  X86_INTRINSIC_DATA(avx2_pavg_w, INTR_TYPE_2OP, X86ISD::AVG, 0),
+  X86_INTRINSIC_DATA(avx2_pavg_b, INTR_TYPE_2OP, ISD::AVGCEILU, 0),
+  X86_INTRINSIC_DATA(avx2_pavg_w, INTR_TYPE_2OP, ISD::AVGCEILU, 0),
   X86_INTRINSIC_DATA(avx2_pblendvb, BLENDV, X86ISD::BLENDV, 0),
   X86_INTRINSIC_DATA(avx2_permd, VPERM_2OP, X86ISD::VPERMV, 0),
   X86_INTRINSIC_DATA(avx2_permps, VPERM_2OP, X86ISD::VPERMV, 0),
@@ -818,8 +818,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
   X86_INTRINSIC_DATA(avx512_packsswb_512, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
   X86_INTRINSIC_DATA(avx512_packusdw_512, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
   X86_INTRINSIC_DATA(avx512_packuswb_512, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
-  X86_INTRINSIC_DATA(avx512_pavg_b_512, INTR_TYPE_2OP, X86ISD::AVG, 0),
-  X86_INTRINSIC_DATA(avx512_pavg_w_512, INTR_TYPE_2OP, X86ISD::AVG, 0),
+  X86_INTRINSIC_DATA(avx512_pavg_b_512, INTR_TYPE_2OP, ISD::AVGCEILU, 0),
+  X86_INTRINSIC_DATA(avx512_pavg_w_512, INTR_TYPE_2OP, ISD::AVGCEILU, 0),
   X86_INTRINSIC_DATA(avx512_permvar_df_256, VPERM_2OP, X86ISD::VPERMV, 0),
   X86_INTRINSIC_DATA(avx512_permvar_df_512, VPERM_2OP, X86ISD::VPERMV, 0),
   X86_INTRINSIC_DATA(avx512_permvar_di_256, VPERM_2OP, X86ISD::VPERMV, 0),
@@ -1281,8 +1281,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
   X86_INTRINSIC_DATA(sse2_packssdw_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
   X86_INTRINSIC_DATA(sse2_packsswb_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
   X86_INTRINSIC_DATA(sse2_packuswb_128, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
-  X86_INTRINSIC_DATA(sse2_pavg_b, INTR_TYPE_2OP, X86ISD::AVG, 0),
-  X86_INTRINSIC_DATA(sse2_pavg_w, INTR_TYPE_2OP, X86ISD::AVG, 0),
+  X86_INTRINSIC_DATA(sse2_pavg_b, INTR_TYPE_2OP, ISD::AVGCEILU, 0),
+  X86_INTRINSIC_DATA(sse2_pavg_w, INTR_TYPE_2OP, ISD::AVGCEILU, 0),
   X86_INTRINSIC_DATA(sse2_pmadd_wd, INTR_TYPE_2OP, X86ISD::VPMADDWD, 0),
   X86_INTRINSIC_DATA(sse2_pmovmskb_128, INTR_TYPE_1OP, X86ISD::MOVMSK, 0),
   X86_INTRINSIC_DATA(sse2_pmulh_w, INTR_TYPE_2OP, ISD::MULHS, 0),
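
The user-visible intrinsics are unaffected by this remapping: the pavg
intrinsics still select to single instructions, just routed through
ISD::AVGCEILU instead of the removed X86ISD::AVG. A small usage sketch
with the standard SSE2 intrinsic:

    #include <immintrin.h>

    // _mm_avg_epu8 computes the rounding-up unsigned byte average and
    // lowers to a single pavgb.
    __m128i avg_bytes(__m128i a, __m128i b) {
      return _mm_avg_epu8(a, b);
    }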

llvm/test/CodeGen/X86/avg.ll

@@ -64,17 +64,31 @@ define void @avg_v8i8(<8 x i8>* %a, <8 x i8>* %b) nounwind {
 define void @avg_v16i8(<16 x i8>* %a, <16 x i8>* %b) nounwind {
 ; SSE2-LABEL: avg_v16i8:
 ; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa (%rsi), %xmm0
-; SSE2-NEXT: pavgb (%rdi), %xmm0
+; SSE2-NEXT: movdqa (%rdi), %xmm0
+; SSE2-NEXT: pavgb (%rsi), %xmm0
 ; SSE2-NEXT: movdqu %xmm0, (%rax)
 ; SSE2-NEXT: retq
 ;
-; AVX-LABEL: avg_v16i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rsi), %xmm0
-; AVX-NEXT: vpavgb (%rdi), %xmm0, %xmm0
-; AVX-NEXT: vmovdqu %xmm0, (%rax)
-; AVX-NEXT: retq
+; AVX1-LABEL: avg_v16i8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vpavgb (%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vmovdqu %xmm0, (%rax)
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: avg_v16i8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovdqa (%rdi), %xmm0
+; AVX2-NEXT: vpavgb (%rsi), %xmm0, %xmm0
+; AVX2-NEXT: vmovdqu %xmm0, (%rax)
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: avg_v16i8:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovdqa (%rsi), %xmm0
+; AVX512-NEXT: vpavgb (%rdi), %xmm0, %xmm0
+; AVX512-NEXT: vmovdqu %xmm0, (%rax)
+; AVX512-NEXT: retq
 %1 = load <16 x i8>, <16 x i8>* %a
 %2 = load <16 x i8>, <16 x i8>* %b
 %3 = zext <16 x i8> %1 to <16 x i32>
@@ -162,16 +176,16 @@ define void @avg_v32i8(<32 x i8>* %a, <32 x i8>* %b) nounwind {
 ;
 ; AVX2-LABEL: avg_v32i8:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa (%rsi), %ymm0
-; AVX2-NEXT: vpavgb (%rdi), %ymm0, %ymm0
+; AVX2-NEXT: vmovdqa (%rdi), %ymm0
+; AVX2-NEXT: vpavgb (%rsi), %ymm0, %ymm0
 ; AVX2-NEXT: vmovdqu %ymm0, (%rax)
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
 ;
 ; AVX512-LABEL: avg_v32i8:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: vmovdqa (%rsi), %ymm0
-; AVX512-NEXT: vpavgb (%rdi), %ymm0, %ymm0
+; AVX512-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512-NEXT: vpavgb (%rsi), %ymm0, %ymm0
 ; AVX512-NEXT: vmovdqu %ymm0, (%rax)
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
@@ -313,8 +327,8 @@ define void @avg_v64i8(<64 x i8>* %a, <64 x i8>* %b) nounwind {
 ;
 ; AVX512BW-LABEL: avg_v64i8:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0
-; AVX512BW-NEXT: vpavgb (%rdi), %zmm0, %zmm0
+; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT: vpavgb (%rsi), %zmm0, %zmm0
 ; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rax)
 ; AVX512BW-NEXT: vzeroupper
 ; AVX512BW-NEXT: retq
@@ -361,15 +375,15 @@ define void @avg_v4i16(<4 x i16>* %a, <4 x i16>* %b) nounwind {
 define void @avg_v8i16(<8 x i16>* %a, <8 x i16>* %b) nounwind {
 ; SSE2-LABEL: avg_v8i16:
 ; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa (%rsi), %xmm0
-; SSE2-NEXT: pavgw (%rdi), %xmm0
+; SSE2-NEXT: movdqa (%rdi), %xmm0
+; SSE2-NEXT: pavgw (%rsi), %xmm0
 ; SSE2-NEXT: movdqu %xmm0, (%rax)
 ; SSE2-NEXT: retq
 ;
 ; AVX-LABEL: avg_v8i16:
 ; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rsi), %xmm0
-; AVX-NEXT: vpavgw (%rdi), %xmm0, %xmm0
+; AVX-NEXT: vmovdqa (%rdi), %xmm0
+; AVX-NEXT: vpavgw (%rsi), %xmm0, %xmm0
 ; AVX-NEXT: vmovdqu %xmm0, (%rax)
 ; AVX-NEXT: retq
 %1 = load <8 x i16>, <8 x i16>* %a
@@ -407,8 +421,8 @@ define void @avg_v16i16(<16 x i16>* %a, <16 x i16>* %b) nounwind {
 ;
 ; AVX2-LABEL: avg_v16i16:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa (%rsi), %ymm0
-; AVX2-NEXT: vpavgw (%rdi), %ymm0, %ymm0
+; AVX2-NEXT: vmovdqa (%rdi), %ymm0
+; AVX2-NEXT: vpavgw (%rsi), %ymm0, %ymm0
 ; AVX2-NEXT: vmovdqu %ymm0, (%rax)
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
@@ -489,8 +503,8 @@ define void @avg_v32i16(<32 x i16>* %a, <32 x i16>* %b) nounwind {
 ;
 ; AVX512BW-LABEL: avg_v32i16:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0
-; AVX512BW-NEXT: vpavgw (%rdi), %zmm0, %zmm0
+; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT: vpavgw (%rsi), %zmm0, %zmm0
 ; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rax)
 ; AVX512BW-NEXT: vzeroupper
 ; AVX512BW-NEXT: retq
@@ -561,15 +575,15 @@ define void @avg_v40i16(<40 x i16>* %a, <40 x i16>* %b) nounwind {
 ;
 ; AVX512F-LABEL: avg_v40i16:
 ; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa 64(%rsi), %xmm0
-; AVX512F-NEXT: vpavgw 64(%rdi), %xmm0, %xmm0
-; AVX512F-NEXT: vmovdqa (%rsi), %ymm1
-; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm2
-; AVX512F-NEXT: vpavgw (%rdi), %ymm1, %ymm1
-; AVX512F-NEXT: vpavgw 32(%rdi), %ymm2, %ymm2
-; AVX512F-NEXT: vmovdqu %ymm2, (%rax)
+; AVX512F-NEXT: vmovdqa (%rsi), %ymm0
+; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm1
+; AVX512F-NEXT: vpavgw (%rdi), %ymm0, %ymm0
+; AVX512F-NEXT: vpavgw 32(%rdi), %ymm1, %ymm1
+; AVX512F-NEXT: vmovdqa 64(%rsi), %xmm2
+; AVX512F-NEXT: vpavgw 64(%rdi), %xmm2, %xmm2
 ; AVX512F-NEXT: vmovdqu %ymm1, (%rax)
-; AVX512F-NEXT: vmovdqu %xmm0, (%rax)
+; AVX512F-NEXT: vmovdqu %ymm0, (%rax)
+; AVX512F-NEXT: vmovdqu %xmm2, (%rax)
 ; AVX512F-NEXT: vzeroupper
 ; AVX512F-NEXT: retq
 ;
@@ -2645,7 +2659,7 @@ define <8 x i16> @PR52131_pavg_chain(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) {
 ;
 ; AVX-LABEL: PR52131_pavg_chain:
 ; AVX: # %bb.0:
-; AVX-NEXT: vpavgw %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vpavgw %xmm1, %xmm0, %xmm0
 ; AVX-NEXT: vpavgw %xmm2, %xmm0, %xmm0
 ; AVX-NEXT: retq
 %i = zext <8 x i16> %a to <8 x i32>
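
The flipped vpavgw operand order here is benign: the ceiling average is
commutative (ISD::AVGCEILU carries the commutative flag just as the old
X86ISD::AVG node did), so either register order yields the same value. A
scalar check of that property (illustrative only):

    #include <cassert>
    #include <cstdint>

    uint16_t avgceilu16(uint16_t a, uint16_t b) {
      return static_cast<uint16_t>((uint32_t(a) + b + 1) >> 1);
    }

    int main() {
      assert(avgceilu16(3, 8) == avgceilu16(8, 3)); // both are 6
      return 0;
    }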

llvm/test/CodeGen/X86/min-legal-vector-width.ll

@@ -72,8 +72,8 @@ define dso_local void @avg_v64i8_256(<64 x i8>* %a, <64 x i8>* %b) "min-legal-vector-width"="256" {
 define dso_local void @avg_v64i8_512(<64 x i8>* %a, <64 x i8>* %b) "min-legal-vector-width"="512" {
 ; CHECK-LABEL: avg_v64i8_512:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa64 (%rsi), %zmm0
-; CHECK-NEXT: vpavgb (%rdi), %zmm0, %zmm0
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
+; CHECK-NEXT: vpavgb (%rsi), %zmm0, %zmm0
 ; CHECK-NEXT: vmovdqu64 %zmm0, (%rax)
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq