[x86, AVX] replace masked load with full vector load when possible
Converting masked vector loads to regular vector loads for x86 AVX should always be a win. I raised the legality issue of reading the extra memory bytes on llvm-dev. I did not see any objections.

1. x86 already does this kind of optimization for multiple scalar loads -> vector load.
2. If other targets have the same flexibility, we could move this transform up to CGP or DAGCombiner.

Differential Revision: http://reviews.llvm.org/D18094

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@263446 91177308-0d34-0410-b5e6-96231b3b80d8
commit d26914da05 (parent eacb2ec057)
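For illustration, a minimal standalone reproducer in the spirit of the updated tests below (the function name is invented, and the intrinsic signature assumes the 3.8-era masked-load mangling without a pointer-type suffix, so treat this as a sketch rather than a copy of the test file). The constant mask enables the first and last lanes, so after this patch the AVX lowering reads the whole vector with a plain load and blend instead of vmaskmovps, matching the CHECK-line changes in @load_all and @mload_constmask_v4f32.

; Constant mask <1,0,1,1>: the first and last lanes are loaded, so the whole
; 16-byte vector may safely be read. With this patch, AVX emits a full vector
; load (vmovups / a memory-operand vblendps) instead of vmaskmovps.
define <4 x float> @constmask_first_and_last(<4 x float>* %addr, <4 x float> %dst) {
  %res = call <4 x float> @llvm.masked.load.v4f32(<4 x float>* %addr, i32 4,
                 <4 x i1> <i1 true, i1 false, i1 true, i1 true>, <4 x float> %dst)
  ret <4 x float> %res
}

declare <4 x float> @llvm.masked.load.v4f32(<4 x float>*, i32, <4 x i1>, <4 x float>)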
@@ -27386,22 +27386,40 @@ reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
   return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
 }
 
 /// Convert a masked load with a constant mask into a masked load and a select.
 /// This allows the select operation to use a faster kind of shuffle instruction
 /// (for example, vblendvps -> vblendps).
 static SDValue
 combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
                               TargetLowering::DAGCombinerInfo &DCI) {
-  if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()) ||
-      ML->getSrc0().getOpcode() == ISD::UNDEF)
+  if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
     return SDValue();
 
+  SDLoc DL(ML);
+  EVT VT = ML->getValueType(0);
+
+  // If we are loading the first and last elements of a vector, it is safe and
+  // always faster to load the whole vector. Replace the masked load with a
+  // vector load and select.
+  unsigned NumElts = VT.getVectorNumElements();
+  BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
+  bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
+  bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
+  if (LoadFirstElt && LoadLastElt) {
+    SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
+                                ML->getMemOperand());
+    SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd, ML->getSrc0());
+    return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
+  }
+
+  // Convert a masked load with a constant mask into a masked load and a select.
+  // This allows the select operation to use a faster kind of select instruction
+  // (for example, vblendvps -> vblendps).
+
+  // Don't try this if the pass-through operand is already undefined. That would
+  // cause an infinite loop because that's what we're about to create.
+  if (ML->getSrc0().isUndef())
+    return SDValue();
+
   // The new masked load has an undef pass-through operand. The select uses the
   // original pass-through operand.
-  SDLoc DL(ML);
-  EVT VT = ML->getValueType(0);
   SDValue NewML = DAG.getMaskedLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
                                     ML->getMask(), DAG.getUNDEF(VT),
                                     ML->getMemoryVT(), ML->getMemOperand(),
@@ -915,8 +915,7 @@ define <2 x float> @test18(<2 x i32> %trigger, <2 x float>* %addr) {
 define <4 x float> @load_all(<4 x i32> %trigger, <4 x float>* %addr) {
 ; AVX-LABEL: load_all:
 ; AVX: ## BB#0:
-; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vmaskmovps (%rdi), %xmm0, %xmm0
+; AVX-NEXT: vmovups (%rdi), %xmm0
 ; AVX-NEXT: retq
 ;
 ; AVX512F-LABEL: load_all:
@@ -942,9 +941,7 @@ define <4 x float> @load_all(<4 x i32> %trigger, <4 x float>* %addr) {
 define <4 x float> @mload_constmask_v4f32(<4 x float>* %addr, <4 x float> %dst) {
 ; AVX-LABEL: mload_constmask_v4f32:
 ; AVX: ## BB#0:
-; AVX-NEXT: vmovaps {{.*#+}} xmm1 = [4294967295,0,4294967295,4294967295]
-; AVX-NEXT: vmaskmovps (%rdi), %xmm1, %xmm1
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = mem[0],xmm0[1],mem[2,3]
 ; AVX-NEXT: retq
 ;
 ; AVX512F-LABEL: mload_constmask_v4f32:
@@ -1055,16 +1052,12 @@ define <4 x double> @mload_constmask_v4f64(<4 x double>* %addr, <4 x double> %dst) {
 define <8 x i32> @mload_constmask_v8i32(<8 x i32>* %addr, <8 x i32> %dst) {
 ; AVX1-LABEL: mload_constmask_v8i32:
 ; AVX1: ## BB#0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,0,0,0,0,4294967295]
-; AVX1-NEXT: vmaskmovps (%rdi), %ymm1, %ymm1
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7]
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1,2],ymm0[3,4,5,6],mem[7]
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: mload_constmask_v8i32:
 ; AVX2: ## BB#0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,0,0,0,0,4294967295]
-; AVX2-NEXT: vpmaskmovd (%rdi), %ymm1, %ymm1
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2],ymm0[3,4,5,6],mem[7]
 ; AVX2-NEXT: retq
 ;
 ; AVX512F-LABEL: mload_constmask_v8i32:
@@ -1087,16 +1080,12 @@ define <8 x i32> @mload_constmask_v8i32(<8 x i32>* %addr, <8 x i32> %dst) {
 define <4 x i64> @mload_constmask_v4i64(<4 x i64>* %addr, <4 x i64> %dst) {
 ; AVX1-LABEL: mload_constmask_v4i64:
 ; AVX1: ## BB#0:
-; AVX1-NEXT: vmovapd {{.*#+}} ymm1 = [18446744073709551615,0,0,18446744073709551615]
-; AVX1-NEXT: vmaskmovpd (%rdi), %ymm1, %ymm1
-; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3]
+; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = mem[0],ymm0[1,2],mem[3]
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: mload_constmask_v4i64:
 ; AVX2: ## BB#0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [18446744073709551615,0,0,18446744073709551615]
-; AVX2-NEXT: vpmaskmovq (%rdi), %ymm1, %ymm1
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1],ymm0[2,3,4,5],mem[6,7]
 ; AVX2-NEXT: retq
 ;
 ; AVX512F-LABEL: mload_constmask_v4i64:
@@ -1121,12 +1110,8 @@ define <4 x i64> @mload_constmask_v4i64(<4 x i64>* %addr, <4 x i64> %dst) {
 define <8 x double> @mload_constmask_v8f64(<8 x double>* %addr, <8 x double> %dst) {
 ; AVX-LABEL: mload_constmask_v8f64:
 ; AVX: ## BB#0:
-; AVX-NEXT: vmovapd {{.*#+}} ymm2 = [18446744073709551615,18446744073709551615,18446744073709551615,0]
-; AVX-NEXT: vmaskmovpd (%rdi), %ymm2, %ymm2
-; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3]
-; AVX-NEXT: vmovapd {{.*#+}} ymm2 = [0,0,0,18446744073709551615]
-; AVX-NEXT: vmaskmovpd 32(%rdi), %ymm2, %ymm2
-; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3]
+; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],mem[3]
+; AVX-NEXT: vblendpd {{.*#+}} ymm0 = mem[0,1,2],ymm0[3]
 ; AVX-NEXT: retq
 ;
 ; AVX512-LABEL: mload_constmask_v8f64: