mirror of
https://github.com/RPCSX/llvm.git
synced 2025-01-24 05:09:34 +00:00
[x86, SSE] change patterns for CMPP to float types to allow matching with SSE1 (PR28044)
This patch is intended to solve: https://llvm.org/bugs/show_bug.cgi?id=28044
By changing the definition of X86ISD::CMPP to use float types, we allow it to be created and to pass legalization for an SSE1-only target where v4i32 is not legal.
The motivational trail for this change includes: https://llvm.org/bugs/show_bug.cgi?id=28001
and eventually makes this trigger: http://reviews.llvm.org/D21190
I.e., after this step, we should be free to have Clang generate FP compare IR instead of x86 intrinsics for SSE C packed compare intrinsics. (We can auto-upgrade and remove the LLVM sse.cmp intrinsics as a follow-up step.) Once we're generating vector IR instead of x86 intrinsics, a big pile of generic optimizations can trigger.
Differential Revision: http://reviews.llvm.org/D21235
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@272511 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent
b2cfb64e72
commit
9a476793c5
@ -15168,32 +15168,57 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
|
|||||||
assert(EltVT == MVT::f32 || EltVT == MVT::f64);
|
assert(EltVT == MVT::f32 || EltVT == MVT::f64);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
unsigned SSECC = translateX86FSETCC(SetCCOpcode, Op0, Op1);
|
unsigned Opc;
|
||||||
unsigned Opc = X86ISD::CMPP;
|
|
||||||
if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1) {
|
if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1) {
|
||||||
assert(VT.getVectorNumElements() <= 16);
|
assert(VT.getVectorNumElements() <= 16);
|
||||||
Opc = X86ISD::CMPM;
|
Opc = X86ISD::CMPM;
|
||||||
|
} else {
|
||||||
|
Opc = X86ISD::CMPP;
|
||||||
|
// The SSE/AVX packed FP comparison nodes are defined with a
|
||||||
|
// floating-point vector result that matches the operand type. This allows
|
||||||
|
// them to work with an SSE1 target (integer vector types are not legal).
|
||||||
|
VT = Op0.getSimpleValueType();
|
||||||
}
|
}
|
||||||
// In the two special cases we can't handle, emit two comparisons.
|
|
||||||
|
// In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
|
||||||
|
// emit two comparisons and a logic op to tie them together.
|
||||||
|
// TODO: This can be avoided if Intel (and only Intel as of 2016) AVX is
|
||||||
|
// available.
|
||||||
|
SDValue Cmp;
|
||||||
|
unsigned SSECC = translateX86FSETCC(SetCCOpcode, Op0, Op1);
|
||||||
if (SSECC == 8) {
|
if (SSECC == 8) {
|
||||||
|
// LLVM predicate is SETUEQ or SETONE.
|
||||||
unsigned CC0, CC1;
|
unsigned CC0, CC1;
|
||||||
unsigned CombineOpc;
|
unsigned CombineOpc;
|
||||||
if (SetCCOpcode == ISD::SETUEQ) {
|
if (SetCCOpcode == ISD::SETUEQ) {
|
||||||
CC0 = 3; CC1 = 0; CombineOpc = ISD::OR;
|
CC0 = 3; // UNORD
|
||||||
|
CC1 = 0; // EQ
|
||||||
|
CombineOpc = Opc == X86ISD::CMPP ? X86ISD::FOR : ISD::OR;
|
||||||
} else {
|
} else {
|
||||||
assert(SetCCOpcode == ISD::SETONE);
|
assert(SetCCOpcode == ISD::SETONE);
|
||||||
CC0 = 7; CC1 = 4; CombineOpc = ISD::AND;
|
CC0 = 7; // ORD
|
||||||
|
CC1 = 4; // NEQ
|
||||||
|
CombineOpc = Opc == X86ISD::CMPP ? X86ISD::FAND : ISD::AND;
|
||||||
}
|
}
|
||||||
|
|
||||||
SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1,
|
SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1,
|
||||||
DAG.getConstant(CC0, dl, MVT::i8));
|
DAG.getConstant(CC0, dl, MVT::i8));
|
||||||
SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1,
|
SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1,
|
||||||
DAG.getConstant(CC1, dl, MVT::i8));
|
DAG.getConstant(CC1, dl, MVT::i8));
|
||||||
return DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
|
Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
|
||||||
|
} else {
|
||||||
|
// Handle all other FP comparisons here.
|
||||||
|
Cmp = DAG.getNode(Opc, dl, VT, Op0, Op1,
|
||||||
|
DAG.getConstant(SSECC, dl, MVT::i8));
|
||||||
}
|
}
|
||||||
// Handle all other FP comparisons here.
|
|
||||||
return DAG.getNode(Opc, dl, VT, Op0, Op1,
|
// If this is SSE/AVX CMPP, bitcast the result back to integer to match the
|
||||||
DAG.getConstant(SSECC, dl, MVT::i8));
|
// result type of SETCC. The bitcast is expected to be optimized away
|
||||||
|
// during combining/isel.
|
||||||
|
if (Opc == X86ISD::CMPP)
|
||||||
|
Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
|
||||||
|
|
||||||
|
return Cmp;
|
||||||
}
|
}
|
||||||
|
|
||||||
MVT VTOp0 = Op0.getSimpleValueType();
|
MVT VTOp0 = Op0.getSimpleValueType();
|
||||||
@ -29647,6 +29672,11 @@ static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// For an SSE1-only target, lower to X86ISD::CMPP early to avoid scalarization
|
||||||
|
// via legalization because v4i32 is not a legal type.
|
||||||
|
if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32)
|
||||||
|
return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);
|
||||||
|
|
||||||
return SDValue();
|
return SDValue();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -35,7 +35,7 @@ def bc_mmx : PatFrag<(ops node:$in), (x86mmx (bitconvert node:$in))>;
|
|||||||
// SSE specific DAG Nodes.
|
// SSE specific DAG Nodes.
|
||||||
//===----------------------------------------------------------------------===//
|
//===----------------------------------------------------------------------===//
|
||||||
|
|
||||||
def SDTX86VFCMP : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<1, 2>,
|
def SDTX86VFCMP : SDTypeProfile<1, 3, [SDTCisFP<0>, SDTCisSameAs<1, 2>,
|
||||||
SDTCisFP<1>, SDTCisVT<3, i8>,
|
SDTCisFP<1>, SDTCisVT<3, i8>,
|
||||||
SDTCisVec<1>]>;
|
SDTCisVec<1>]>;
|
||||||
def SDTX86CmpTestSae : SDTypeProfile<1, 3, [SDTCisVT<0, i32>,
|
def SDTX86CmpTestSae : SDTypeProfile<1, 3, [SDTCisVT<0, i32>,
|
||||||
|
@ -2498,36 +2498,36 @@ let Constraints = "$src1 = $dst" in {
|
|||||||
}
|
}
|
||||||
|
|
||||||
let Predicates = [HasAVX] in {
|
let Predicates = [HasAVX] in {
|
||||||
def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), VR128:$src2, imm:$cc)),
|
def : Pat<(v4f32 (X86cmpp (v4f32 VR128:$src1), VR128:$src2, imm:$cc)),
|
||||||
(VCMPPSrri (v4f32 VR128:$src1), (v4f32 VR128:$src2), imm:$cc)>;
|
(VCMPPSrri (v4f32 VR128:$src1), (v4f32 VR128:$src2), imm:$cc)>;
|
||||||
def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), (loadv4f32 addr:$src2), imm:$cc)),
|
def : Pat<(v4f32 (X86cmpp (v4f32 VR128:$src1), (loadv4f32 addr:$src2), imm:$cc)),
|
||||||
(VCMPPSrmi (v4f32 VR128:$src1), addr:$src2, imm:$cc)>;
|
(VCMPPSrmi (v4f32 VR128:$src1), addr:$src2, imm:$cc)>;
|
||||||
def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), VR128:$src2, imm:$cc)),
|
def : Pat<(v2f64 (X86cmpp (v2f64 VR128:$src1), VR128:$src2, imm:$cc)),
|
||||||
(VCMPPDrri VR128:$src1, VR128:$src2, imm:$cc)>;
|
(VCMPPDrri VR128:$src1, VR128:$src2, imm:$cc)>;
|
||||||
def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), (loadv2f64 addr:$src2), imm:$cc)),
|
def : Pat<(v2f64 (X86cmpp (v2f64 VR128:$src1), (loadv2f64 addr:$src2), imm:$cc)),
|
||||||
(VCMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>;
|
(VCMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>;
|
||||||
|
|
||||||
def : Pat<(v8i32 (X86cmpp (v8f32 VR256:$src1), VR256:$src2, imm:$cc)),
|
def : Pat<(v8f32 (X86cmpp (v8f32 VR256:$src1), VR256:$src2, imm:$cc)),
|
||||||
(VCMPPSYrri (v8f32 VR256:$src1), (v8f32 VR256:$src2), imm:$cc)>;
|
(VCMPPSYrri (v8f32 VR256:$src1), (v8f32 VR256:$src2), imm:$cc)>;
|
||||||
def : Pat<(v8i32 (X86cmpp (v8f32 VR256:$src1), (loadv8f32 addr:$src2), imm:$cc)),
|
def : Pat<(v8f32 (X86cmpp (v8f32 VR256:$src1), (loadv8f32 addr:$src2), imm:$cc)),
|
||||||
(VCMPPSYrmi (v8f32 VR256:$src1), addr:$src2, imm:$cc)>;
|
(VCMPPSYrmi (v8f32 VR256:$src1), addr:$src2, imm:$cc)>;
|
||||||
def : Pat<(v4i64 (X86cmpp (v4f64 VR256:$src1), VR256:$src2, imm:$cc)),
|
def : Pat<(v4f64 (X86cmpp (v4f64 VR256:$src1), VR256:$src2, imm:$cc)),
|
||||||
(VCMPPDYrri VR256:$src1, VR256:$src2, imm:$cc)>;
|
(VCMPPDYrri VR256:$src1, VR256:$src2, imm:$cc)>;
|
||||||
def : Pat<(v4i64 (X86cmpp (v4f64 VR256:$src1), (loadv4f64 addr:$src2), imm:$cc)),
|
def : Pat<(v4f64 (X86cmpp (v4f64 VR256:$src1), (loadv4f64 addr:$src2), imm:$cc)),
|
||||||
(VCMPPDYrmi VR256:$src1, addr:$src2, imm:$cc)>;
|
(VCMPPDYrmi VR256:$src1, addr:$src2, imm:$cc)>;
|
||||||
}
|
}
|
||||||
|
|
||||||
let Predicates = [UseSSE1] in {
|
let Predicates = [UseSSE1] in {
|
||||||
def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), VR128:$src2, imm:$cc)),
|
def : Pat<(v4f32 (X86cmpp (v4f32 VR128:$src1), VR128:$src2, imm:$cc)),
|
||||||
(CMPPSrri (v4f32 VR128:$src1), (v4f32 VR128:$src2), imm:$cc)>;
|
(CMPPSrri (v4f32 VR128:$src1), (v4f32 VR128:$src2), imm:$cc)>;
|
||||||
def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), (memopv4f32 addr:$src2), imm:$cc)),
|
def : Pat<(v4f32 (X86cmpp (v4f32 VR128:$src1), (memopv4f32 addr:$src2), imm:$cc)),
|
||||||
(CMPPSrmi (v4f32 VR128:$src1), addr:$src2, imm:$cc)>;
|
(CMPPSrmi (v4f32 VR128:$src1), addr:$src2, imm:$cc)>;
|
||||||
}
|
}
|
||||||
|
|
||||||
let Predicates = [UseSSE2] in {
|
let Predicates = [UseSSE2] in {
|
||||||
def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), VR128:$src2, imm:$cc)),
|
def : Pat<(v2f64 (X86cmpp (v2f64 VR128:$src1), VR128:$src2, imm:$cc)),
|
||||||
(CMPPDrri VR128:$src1, VR128:$src2, imm:$cc)>;
|
(CMPPDrri VR128:$src1, VR128:$src2, imm:$cc)>;
|
||||||
def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), (memopv2f64 addr:$src2), imm:$cc)),
|
def : Pat<(v2f64 (X86cmpp (v2f64 VR128:$src1), (memopv2f64 addr:$src2), imm:$cc)),
|
||||||
(CMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>;
|
(CMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -53,55 +53,8 @@ entry:
|
|||||||
define <4 x float> @PR28044(<4 x float> %a0, <4 x float> %a1) nounwind {
|
define <4 x float> @PR28044(<4 x float> %a0, <4 x float> %a1) nounwind {
|
||||||
; CHECK-LABEL: PR28044:
|
; CHECK-LABEL: PR28044:
|
||||||
; CHECK: # BB#0:
|
; CHECK: # BB#0:
|
||||||
; CHECK: movaps %xmm1, %xmm2
|
; CHECK-NEXT: cmpeqps %xmm1, %xmm0
|
||||||
; CHECK-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1,2,3]
|
; CHECK-NEXT: ret
|
||||||
; CHECK-NEXT: movaps %xmm0, %xmm3
|
|
||||||
; CHECK-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1,2,3]
|
|
||||||
; CHECK-NEXT: ucomiss %xmm2, %xmm3
|
|
||||||
; CHECK-NEXT: setnp %al
|
|
||||||
; CHECK-NEXT: sete %cl
|
|
||||||
; CHECK-NEXT: andb %al, %cl
|
|
||||||
; CHECK-NEXT: movzbl %cl, %eax
|
|
||||||
; CHECK-NEXT: shll $31, %eax
|
|
||||||
; CHECK-NEXT: sarl $31, %eax
|
|
||||||
; CHECK-NEXT: movl %eax,
|
|
||||||
; CHECK-NEXT: movaps %xmm1, %xmm2
|
|
||||||
; CHECK-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,2,3]
|
|
||||||
; CHECK-NEXT: movaps %xmm0, %xmm3
|
|
||||||
; CHECK-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1,2,3]
|
|
||||||
; CHECK-NEXT: ucomiss %xmm2, %xmm3
|
|
||||||
; CHECK-NEXT: setnp %al
|
|
||||||
; CHECK-NEXT: sete %cl
|
|
||||||
; CHECK-NEXT: andb %al, %cl
|
|
||||||
; CHECK-NEXT: movzbl %cl, %eax
|
|
||||||
; CHECK-NEXT: shll $31, %eax
|
|
||||||
; CHECK-NEXT: sarl $31, %eax
|
|
||||||
; CHECK-NEXT: movl %eax,
|
|
||||||
; CHECK-NEXT: ucomiss %xmm1, %xmm0
|
|
||||||
; CHECK-NEXT: setnp %al
|
|
||||||
; CHECK-NEXT: sete %cl
|
|
||||||
; CHECK-NEXT: andb %al, %cl
|
|
||||||
; CHECK-NEXT: movzbl %cl, %eax
|
|
||||||
; CHECK-NEXT: shll $31, %eax
|
|
||||||
; CHECK-NEXT: sarl $31, %eax
|
|
||||||
; CHECK-NEXT: movl %eax,
|
|
||||||
; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,1,2,3]
|
|
||||||
; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,1,2,3]
|
|
||||||
; CHECK-NEXT: ucomiss %xmm1, %xmm0
|
|
||||||
; CHECK-NEXT: setnp %al
|
|
||||||
; CHECK-NEXT: sete %cl
|
|
||||||
; CHECK-NEXT: andb %al, %cl
|
|
||||||
; CHECK-NEXT: movzbl %cl, %eax
|
|
||||||
; CHECK-NEXT: shll $31, %eax
|
|
||||||
; CHECK-NEXT: sarl $31, %eax
|
|
||||||
; CHECK-NEXT: movl %eax,
|
|
||||||
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
|
|
||||||
; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
|
|
||||||
; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
|
|
||||||
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
|
|
||||||
; CHECK-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
|
|
||||||
; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
|
|
||||||
; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
|
|
||||||
;
|
;
|
||||||
%cmp = fcmp oeq <4 x float> %a0, %a1
|
%cmp = fcmp oeq <4 x float> %a0, %a1
|
||||||
%sext = sext <4 x i1> %cmp to <4 x i32>
|
%sext = sext <4 x i1> %cmp to <4 x i32>
|
||||||
|
Loading…
x
Reference in New Issue
Block a user