mirror of
https://github.com/RPCS3/llvm.git
synced 2024-12-13 23:18:58 +00:00
Optimized load + SIGN_EXTEND patterns in the X86 backend.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@170506 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent
bf5a2c6a39
commit
4b977312c7
@ -5235,6 +5235,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) {
|
||||
LN0->getAlignment());
|
||||
CombineTo(N, ExtLoad);
|
||||
CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1));
|
||||
AddToWorkList(ExtLoad.getNode());
|
||||
return SDValue(N, 0); // Return N so it doesn't get rechecked!
|
||||
}
|
||||
// fold (sext_inreg (zextload x)) -> (sextload x) iff load has one use
|
||||
|
@ -15929,10 +15929,13 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
|
||||
|
||||
// If this is a vector EXT Load then attempt to optimize it using a
|
||||
// shuffle. We need SSSE3 shuffles.
|
||||
// SEXT loads are supported starting with SSE4.1.
|
||||
// We generate X86ISD::VSEXT for them.
|
||||
// TODO: It is possible to support ZExt by zeroing the undef values
|
||||
// during the shuffle phase or after the shuffle.
|
||||
if (RegVT.isVector() && RegVT.isInteger() &&
|
||||
Ext == ISD::EXTLOAD && Subtarget->hasSSSE3()) {
|
||||
(Ext == ISD::EXTLOAD && Subtarget->hasSSSE3() ||
|
||||
Ext == ISD::SEXTLOAD && Subtarget->hasSSE41())){
|
||||
assert(MemVT != RegVT && "Cannot extend to the same type");
|
||||
assert(MemVT.isVector() && "Must load a vector from memory");
|
||||
|
||||
@ -15941,6 +15944,9 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
|
||||
unsigned MemSz = MemVT.getSizeInBits();
|
||||
assert(RegSz > MemSz && "Register size must be greater than the mem size");
|
||||
|
||||
if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget->hasInt256())
|
||||
return SDValue();
|
||||
|
||||
// All sizes must be a power of two.
|
||||
if (!isPowerOf2_32(RegSz * MemSz * NumElems))
|
||||
return SDValue();
|
||||
@ -15964,16 +15970,23 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
|
||||
// Calculate the number of scalar loads that we need to perform
|
||||
// in order to load our vector from memory.
|
||||
unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();
|
||||
if (Ext == ISD::SEXTLOAD && NumLoads > 1)
|
||||
return SDValue();
|
||||
|
||||
unsigned loadRegZize = RegSz;
|
||||
if (Ext == ISD::SEXTLOAD && RegSz == 256)
|
||||
loadRegZize /= 2;
|
||||
|
||||
// Represent our vector as a sequence of elements which are the
|
||||
// largest scalar that we can load.
|
||||
EVT LoadUnitVecVT = EVT::getVectorVT(*DAG.getContext(), SclrLoadTy,
|
||||
RegSz/SclrLoadTy.getSizeInBits());
|
||||
loadRegZize/SclrLoadTy.getSizeInBits());
|
||||
|
||||
// Represent the data using the same element type that is stored in
|
||||
// memory. In practice, we ''widen'' MemVT.
|
||||
EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
|
||||
RegSz/MemVT.getScalarType().getSizeInBits());
|
||||
EVT WideVecVT =
|
||||
EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
|
||||
loadRegZize/MemVT.getScalarType().getSizeInBits());
|
||||
|
||||
assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
|
||||
"Invalid vector type");
|
||||
@ -16014,6 +16027,10 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
|
||||
SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Res);
|
||||
unsigned SizeRatio = RegSz/MemSz;
|
||||
|
||||
if (Ext == ISD::SEXTLOAD) {
|
||||
SDValue Sext = DAG.getNode(X86ISD::VSEXT, dl, RegVT, SlicedVec);
|
||||
return DCI.CombineTo(N, Sext, TF, true);
|
||||
}
|
||||
// Redistribute the loaded elements into the different locations.
|
||||
SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
|
||||
for (unsigned i = 0; i != NumElems; ++i)
|
||||
|
@ -5842,6 +5842,31 @@ defm VPMOVZXBQ : SS41I_binop_rm_int4_y<0x32, "vpmovzxbq",
|
||||
defm PMOVSXBQ : SS41I_binop_rm_int2<0x22, "pmovsxbq", int_x86_sse41_pmovsxbq>;
|
||||
defm PMOVZXBQ : SS41I_binop_rm_int2<0x32, "pmovzxbq", int_x86_sse41_pmovzxbq>;
|
||||
|
||||
let Predicates = [HasAVX2] in {
|
||||
def : Pat<(v8i32 (X86vsmovl (v8i16 (bitconvert (v2i64 (load addr:$src)))))),
|
||||
(VPMOVSXWDYrm addr:$src)>;
|
||||
def : Pat<(v4i64 (X86vsmovl (v4i32 (bitconvert (v2i64 (load addr:$src)))))),
|
||||
(VPMOVSXDQYrm addr:$src)>;
|
||||
|
||||
def : Pat<(v8i32 (X86vsext (v16i8 (bitconvert (v2i64
|
||||
(scalar_to_vector (loadi64 addr:$src))))))),
|
||||
(VPMOVSXBDYrm addr:$src)>;
|
||||
def : Pat<(v8i32 (X86vsext (v16i8 (bitconvert (v2f64
|
||||
(scalar_to_vector (loadf64 addr:$src))))))),
|
||||
(VPMOVSXBDYrm addr:$src)>;
|
||||
|
||||
def : Pat<(v4i64 (X86vsext (v8i16 (bitconvert (v2i64
|
||||
(scalar_to_vector (loadi64 addr:$src))))))),
|
||||
(VPMOVSXWQYrm addr:$src)>;
|
||||
def : Pat<(v4i64 (X86vsext (v8i16 (bitconvert (v2f64
|
||||
(scalar_to_vector (loadf64 addr:$src))))))),
|
||||
(VPMOVSXWQYrm addr:$src)>;
|
||||
|
||||
def : Pat<(v4i64 (X86vsext (v16i8 (bitconvert (v4i32
|
||||
(scalar_to_vector (loadi32 addr:$src))))))),
|
||||
(VPMOVSXBQYrm addr:$src)>;
|
||||
}
|
||||
|
||||
let Predicates = [HasAVX] in {
|
||||
// Common patterns involving scalar load
|
||||
def : Pat<(int_x86_sse41_pmovsxbq
|
||||
@ -5866,6 +5891,34 @@ let Predicates = [UseSSE41] in {
|
||||
(bitconvert (v4i32 (X86vzmovl
|
||||
(v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
|
||||
(PMOVZXBQrm addr:$src)>;
|
||||
|
||||
def : Pat<(v4i32 (X86vsext (v8i16 (bitconvert (v2i64
|
||||
(scalar_to_vector (loadi64 addr:$src))))))),
|
||||
(PMOVSXWDrm addr:$src)>;
|
||||
def : Pat<(v4i32 (X86vsext (v8i16 (bitconvert (v2f64
|
||||
(scalar_to_vector (loadf64 addr:$src))))))),
|
||||
(PMOVSXWDrm addr:$src)>;
|
||||
def : Pat<(v4i32 (X86vsext (v16i8 (bitconvert (v4i32
|
||||
(scalar_to_vector (loadi32 addr:$src))))))),
|
||||
(PMOVSXBDrm addr:$src)>;
|
||||
def : Pat<(v2i64 (X86vsext (v8i16 (bitconvert (v4i32
|
||||
(scalar_to_vector (loadi32 addr:$src))))))),
|
||||
(PMOVSXWQrm addr:$src)>;
|
||||
def : Pat<(v2i64 (X86vsext (v16i8 (bitconvert (v4i32
|
||||
(scalar_to_vector (extloadi32i16 addr:$src))))))),
|
||||
(PMOVSXBQrm addr:$src)>;
|
||||
def : Pat<(v2i64 (X86vsext (v4i32 (bitconvert (v2i64
|
||||
(scalar_to_vector (loadi64 addr:$src))))))),
|
||||
(PMOVSXDQrm addr:$src)>;
|
||||
def : Pat<(v2i64 (X86vsext (v4i32 (bitconvert (v2f64
|
||||
(scalar_to_vector (loadf64 addr:$src))))))),
|
||||
(PMOVSXDQrm addr:$src)>;
|
||||
def : Pat<(v8i16 (X86vsext (v16i8 (bitconvert (v2i64
|
||||
(scalar_to_vector (loadi64 addr:$src))))))),
|
||||
(PMOVSXBWrm addr:$src)>;
|
||||
def : Pat<(v8i16 (X86vsext (v16i8 (bitconvert (v2f64
|
||||
(scalar_to_vector (loadf64 addr:$src))))))),
|
||||
(PMOVSXBWrm addr:$src)>;
|
||||
}
|
||||
|
||||
let Predicates = [HasAVX2] in {
|
||||
@ -5926,6 +5979,35 @@ let Predicates = [HasAVX] in {
|
||||
(VPMOVZXDQrm addr:$src)>;
|
||||
def : Pat<(v2i64 (X86vzext (v4i32 (bitconvert (v2i64 (X86vzload addr:$src)))))),
|
||||
(VPMOVZXDQrm addr:$src)>;
|
||||
|
||||
def : Pat<(v4i32 (X86vsext (v8i16 (bitconvert (v2i64
|
||||
(scalar_to_vector (loadi64 addr:$src))))))),
|
||||
(VPMOVSXWDrm addr:$src)>;
|
||||
def : Pat<(v2i64 (X86vsext (v4i32 (bitconvert (v2i64
|
||||
(scalar_to_vector (loadi64 addr:$src))))))),
|
||||
(VPMOVSXDQrm addr:$src)>;
|
||||
def : Pat<(v4i32 (X86vsext (v8i16 (bitconvert (v2f64
|
||||
(scalar_to_vector (loadf64 addr:$src))))))),
|
||||
(VPMOVSXWDrm addr:$src)>;
|
||||
def : Pat<(v2i64 (X86vsext (v4i32 (bitconvert (v2f64
|
||||
(scalar_to_vector (loadf64 addr:$src))))))),
|
||||
(VPMOVSXDQrm addr:$src)>;
|
||||
def : Pat<(v8i16 (X86vsext (v16i8 (bitconvert (v2i64
|
||||
(scalar_to_vector (loadi64 addr:$src))))))),
|
||||
(VPMOVSXBWrm addr:$src)>;
|
||||
def : Pat<(v8i16 (X86vsext (v16i8 (bitconvert (v2f64
|
||||
(scalar_to_vector (loadf64 addr:$src))))))),
|
||||
(VPMOVSXBWrm addr:$src)>;
|
||||
|
||||
def : Pat<(v4i32 (X86vsext (v16i8 (bitconvert (v4i32
|
||||
(scalar_to_vector (loadi32 addr:$src))))))),
|
||||
(VPMOVSXBDrm addr:$src)>;
|
||||
def : Pat<(v2i64 (X86vsext (v8i16 (bitconvert (v4i32
|
||||
(scalar_to_vector (loadi32 addr:$src))))))),
|
||||
(VPMOVSXWQrm addr:$src)>;
|
||||
def : Pat<(v2i64 (X86vsext (v16i8 (bitconvert (v4i32
|
||||
(scalar_to_vector (extloadi32i16 addr:$src))))))),
|
||||
(VPMOVSXBQrm addr:$src)>;
|
||||
}
|
||||
|
||||
let Predicates = [UseSSE41] in {
|
||||
|
@ -16,8 +16,8 @@ target triple = "x86_64-unknown-linux-gnu"
|
||||
; CHECK: main
|
||||
define i32 @main() nounwind uwtable {
|
||||
entry:
|
||||
; CHECK: movsbq j(%rip), %
|
||||
; CHECK: movsbq i(%rip), %
|
||||
; CHECK: pmovsxbq j(%rip), %
|
||||
; CHECK: pmovsxbq i(%rip), %
|
||||
%0 = load <2 x i8>* @i, align 8
|
||||
%1 = load <2 x i8>* @j, align 8
|
||||
%div = sdiv <2 x i8> %1, %0
|
||||
|
@ -1,4 +1,4 @@
|
||||
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
|
||||
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx | FileCheck %s
|
||||
|
||||
define <8 x i32> @sext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp {
|
||||
;CHECK: sext_8i16_to_8i32
|
||||
@ -15,3 +15,57 @@ define <4 x i64> @sext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp
|
||||
%B = sext <4 x i32> %A to <4 x i64>
|
||||
ret <4 x i64>%B
|
||||
}
|
||||
|
||||
; CHECK: load_sext_test1
|
||||
; CHECK: vpmovsxwd (%r{{[^,]*}}), %xmm{{.*}}
|
||||
; CHECK: ret
|
||||
define <4 x i32> @load_sext_test1(<4 x i16> *%ptr) {
|
||||
%X = load <4 x i16>* %ptr
|
||||
%Y = sext <4 x i16> %X to <4 x i32>
|
||||
ret <4 x i32>%Y
|
||||
}
|
||||
|
||||
; CHECK: load_sext_test2
|
||||
; CHECK: vpmovsxbd (%r{{[^,]*}}), %xmm{{.*}}
|
||||
; CHECK: ret
|
||||
define <4 x i32> @load_sext_test2(<4 x i8> *%ptr) {
|
||||
%X = load <4 x i8>* %ptr
|
||||
%Y = sext <4 x i8> %X to <4 x i32>
|
||||
ret <4 x i32>%Y
|
||||
}
|
||||
|
||||
; CHECK: load_sext_test3
|
||||
; CHECK: vpmovsxbq (%r{{[^,]*}}), %xmm{{.*}}
|
||||
; CHECK: ret
|
||||
define <2 x i64> @load_sext_test3(<2 x i8> *%ptr) {
|
||||
%X = load <2 x i8>* %ptr
|
||||
%Y = sext <2 x i8> %X to <2 x i64>
|
||||
ret <2 x i64>%Y
|
||||
}
|
||||
|
||||
; CHECK: load_sext_test4
|
||||
; CHECK: vpmovsxwq (%r{{[^,]*}}), %xmm{{.*}}
|
||||
; CHECK: ret
|
||||
define <2 x i64> @load_sext_test4(<2 x i16> *%ptr) {
|
||||
%X = load <2 x i16>* %ptr
|
||||
%Y = sext <2 x i16> %X to <2 x i64>
|
||||
ret <2 x i64>%Y
|
||||
}
|
||||
|
||||
; CHECK: load_sext_test5
|
||||
; CHECK: vpmovsxdq (%r{{[^,]*}}), %xmm{{.*}}
|
||||
; CHECK: ret
|
||||
define <2 x i64> @load_sext_test5(<2 x i32> *%ptr) {
|
||||
%X = load <2 x i32>* %ptr
|
||||
%Y = sext <2 x i32> %X to <2 x i64>
|
||||
ret <2 x i64>%Y
|
||||
}
|
||||
|
||||
; CHECK: load_sext_test6
|
||||
; CHECK: vpmovsxbw (%r{{[^,]*}}), %xmm{{.*}}
|
||||
; CHECK: ret
|
||||
define <8 x i16> @load_sext_test6(<8 x i8> *%ptr) {
|
||||
%X = load <8 x i8>* %ptr
|
||||
%Y = sext <8 x i8> %X to <8 x i16>
|
||||
ret <8 x i16>%Y
|
||||
}
|
||||
|
@ -63,6 +63,47 @@ define <8 x i32> @zext_8i8_8i32(<8 x i8> %A) nounwind {
|
||||
ret <8 x i32>%B
|
||||
}
|
||||
|
||||
; CHECK: load_sext_test1
|
||||
; CHECK: vpmovsxdq (%r{{[^,]*}}), %ymm{{.*}}
|
||||
; CHECK: ret
|
||||
define <4 x i64> @load_sext_test1(<4 x i32> *%ptr) {
|
||||
%X = load <4 x i32>* %ptr
|
||||
%Y = sext <4 x i32> %X to <4 x i64>
|
||||
ret <4 x i64>%Y
|
||||
}
|
||||
|
||||
; CHECK: load_sext_test2
|
||||
; CHECK: vpmovsxbq (%r{{[^,]*}}), %ymm{{.*}}
|
||||
; CHECK: ret
|
||||
define <4 x i64> @load_sext_test2(<4 x i8> *%ptr) {
|
||||
%X = load <4 x i8>* %ptr
|
||||
%Y = sext <4 x i8> %X to <4 x i64>
|
||||
ret <4 x i64>%Y
|
||||
}
|
||||
|
||||
; CHECK: load_sext_test3
|
||||
; CHECK: vpmovsxwq (%r{{[^,]*}}), %ymm{{.*}}
|
||||
; CHECK: ret
|
||||
define <4 x i64> @load_sext_test3(<4 x i16> *%ptr) {
|
||||
%X = load <4 x i16>* %ptr
|
||||
%Y = sext <4 x i16> %X to <4 x i64>
|
||||
ret <4 x i64>%Y
|
||||
}
|
||||
|
||||
; CHECK: load_sext_test4
|
||||
; CHECK: vpmovsxwd (%r{{[^,]*}}), %ymm{{.*}}
|
||||
; CHECK: ret
|
||||
define <8 x i32> @load_sext_test4(<8 x i16> *%ptr) {
|
||||
%X = load <8 x i16>* %ptr
|
||||
%Y = sext <8 x i16> %X to <8 x i32>
|
||||
ret <8 x i32>%Y
|
||||
}
|
||||
|
||||
; CHECK: load_sext_test5
|
||||
; CHECK: vpmovsxbd (%r{{[^,]*}}), %ymm{{.*}}
|
||||
; CHECK: ret
|
||||
define <8 x i32> @load_sext_test5(<8 x i8> *%ptr) {
|
||||
%X = load <8 x i8>* %ptr
|
||||
%Y = sext <8 x i8> %X to <8 x i32>
|
||||
ret <8 x i32>%Y
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user