From b6e168f770df425f5097073f899296d78e023789 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Mon, 16 Feb 2015 20:52:07 +0000 Subject: [PATCH] [X86] Remove the multiply by 8 that goes into the shift constant for X86ISD::VSHLDQ and X86ISD::VSRLDQ. This simplifies the pattern matching in isel and allows these nodes to become the patterns embedded in the instruction. llvm-svn: 229431 --- lib/IR/AutoUpgrade.cpp | 12 ++-- lib/Target/X86/X86ISelLowering.cpp | 11 ++-- lib/Target/X86/X86InstrSSE.td | 56 +++++++++---------- .../X86/avx2-intrinsics-x86-upgrade.ll | 15 +++++ test/CodeGen/X86/avx2-intrinsics-x86.ll | 16 ------ 5 files changed, 52 insertions(+), 58 deletions(-) diff --git a/lib/IR/AutoUpgrade.cpp b/lib/IR/AutoUpgrade.cpp index 3c5e469b34d..aefe69d2e60 100644 --- a/lib/IR/AutoUpgrade.cpp +++ b/lib/IR/AutoUpgrade.cpp @@ -538,10 +538,10 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { if (Shift < 16) { SmallVector Idxs; - for (unsigned l = 0; l < 32; l += 16) + for (unsigned l = 0; l != 32; l += 16) for (unsigned i = 0; i != 16; ++i) { - unsigned Idx = i + Shift; - if (Idx >= 16) Idx += 16; // end of lane, switch operand. + unsigned Idx = 32 + i - Shift; + if (Idx < 32) Idx -= 16; // end of lane, switch operand. Idxs.push_back(Builder.getInt32(Idx + l)); } @@ -561,10 +561,10 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { if (Shift < 16) { SmallVector Idxs; - for (unsigned l = 0; l < 32; l += 16) + for (unsigned l = 0; l != 32; l += 16) for (unsigned i = 0; i != 16; ++i) { - unsigned Idx = 32 + i - Shift; - if (Idx < 32) Idx -= 16; // end of lane, switch operand. + unsigned Idx = i + Shift; + if (Idx >= 16) Idx += 16; // end of lane, switch operand. Idxs.push_back(Builder.getInt32(Idx + l)); } diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 00095f04c67..6b3b73b60f2 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -5930,7 +5930,8 @@ static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ; SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp); MVT ScalarShiftTy = TLI.getScalarShiftAmountTy(SrcOp.getValueType()); - SDValue ShiftVal = DAG.getConstant(NumBits, ScalarShiftTy); + assert(NumBits % 8 == 0 && "Only support byte sized shifts"); + SDValue ShiftVal = DAG.getConstant(NumBits/8, ScalarShiftTy); return DAG.getNode(ISD::BITCAST, dl, VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal)); } @@ -7761,9 +7762,9 @@ static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1, Hi = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Hi); SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v2i64, Lo, - DAG.getConstant(8 * LoByteShift, MVT::i8)); + DAG.getConstant(LoByteShift, MVT::i8)); SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v2i64, Hi, - DAG.getConstant(8 * HiByteShift, MVT::i8)); + DAG.getConstant(HiByteShift, MVT::i8)); return DAG.getNode(ISD::BITCAST, DL, VT, DAG.getNode(ISD::OR, DL, MVT::v2i64, LoShift, HiShift)); } @@ -7907,7 +7908,7 @@ static SDValue lowerVectorShuffleAsByteShift(SDLoc DL, MVT VT, SDValue V1, SDValue V = MatchV1 ? V1 : V2; V = DAG.getNode(ISD::BITCAST, DL, ShiftVT, V); V = DAG.getNode(Op, DL, ShiftVT, V, - DAG.getConstant(ByteShift * 8, MVT::i8)); + DAG.getConstant(ByteShift, MVT::i8)); return DAG.getNode(ISD::BITCAST, DL, VT, V); }; @@ -8300,7 +8301,7 @@ static SDValue lowerVectorShuffleAsElementInsertion( V2 = DAG.getNode( X86ISD::VSHLDQ, DL, MVT::v2i64, V2, DAG.getConstant( - V2Index * EltVT.getSizeInBits(), + V2Index * EltVT.getSizeInBits()/8, DAG.getTargetLoweringInfo().getScalarShiftAmountTy(MVT::v2i64))); V2 = DAG.getNode(ISD::BITCAST, DL, VT, V2); } diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index 9799b61ca25..d565d4bac35 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -4174,16 +4174,20 @@ defm VPSRAD : PDI_binop_rmi<0xE2, 0x72, MRM4r, "vpsrad", X86vsra, X86vsrai, VR128, v4i32, v4i32, bc_v4i32, loadv2i64, SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; -let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift], hasSideEffects = 0 in { +let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift] in { // 128-bit logical shifts. def VPSLLDQri : PDIi8<0x73, MRM7r, - (outs VR128:$dst), (ins VR128:$src1, i32u8imm:$src2), + (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2), "vpslldq\t{$src2, $src1, $dst|$dst, $src1, $src2}", - []>, VEX_4V; + [(set VR128:$dst, + (v2i64 (X86vshldq VR128:$src1, (i8 imm:$src2))))]>, + VEX_4V; def VPSRLDQri : PDIi8<0x73, MRM3r, - (outs VR128:$dst), (ins VR128:$src1, i32u8imm:$src2), + (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2), "vpsrldq\t{$src2, $src1, $dst|$dst, $src1, $src2}", - []>, VEX_4V; + [(set VR128:$dst, + (v2i64 (X86vshrdq VR128:$src1, (i8 imm:$src2))))]>, + VEX_4V; // PSRADQri doesn't exist in SSE[1-3]. } } // Predicates = [HasAVX] @@ -4219,13 +4223,17 @@ defm VPSRADY : PDI_binop_rmi<0xE2, 0x72, MRM4r, "vpsrad", X86vsra, X86vsrai, let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift], hasSideEffects = 0 in { // 256-bit logical shifts. def VPSLLDQYri : PDIi8<0x73, MRM7r, - (outs VR256:$dst), (ins VR256:$src1, i32u8imm:$src2), + (outs VR256:$dst), (ins VR256:$src1, u8imm:$src2), "vpslldq\t{$src2, $src1, $dst|$dst, $src1, $src2}", - []>, VEX_4V, VEX_L; + [(set VR256:$dst, + (v4i64 (X86vshldq VR256:$src1, (i8 imm:$src2))))]>, + VEX_4V, VEX_L; def VPSRLDQYri : PDIi8<0x73, MRM3r, - (outs VR256:$dst), (ins VR256:$src1, i32u8imm:$src2), + (outs VR256:$dst), (ins VR256:$src1, u8imm:$src2), "vpsrldq\t{$src2, $src1, $dst|$dst, $src1, $src2}", - []>, VEX_4V, VEX_L; + [(set VR256:$dst, + (v4i64 (X86vshrdq VR256:$src1, (i8 imm:$src2))))]>, + VEX_4V, VEX_L; // PSRADQYri doesn't exist in SSE[1-3]. } } // Predicates = [HasAVX2] @@ -4261,13 +4269,17 @@ defm PSRAD : PDI_binop_rmi<0xE2, 0x72, MRM4r, "psrad", X86vsra, X86vsrai, let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift], hasSideEffects = 0 in { // 128-bit logical shifts. def PSLLDQri : PDIi8<0x73, MRM7r, - (outs VR128:$dst), (ins VR128:$src1, i32u8imm:$src2), + (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2), "pslldq\t{$src2, $dst|$dst, $src2}", - [], IIC_SSE_INTSHDQ_P_RI>; + [(set VR128:$dst, + (v2i64 (X86vshldq VR128:$src1, (i8 imm:$src2))))], + IIC_SSE_INTSHDQ_P_RI>; def PSRLDQri : PDIi8<0x73, MRM3r, - (outs VR128:$dst), (ins VR128:$src1, i32u8imm:$src2), + (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2), "psrldq\t{$src2, $dst|$dst, $src2}", - [], IIC_SSE_INTSHDQ_P_RI>; + [(set VR128:$dst, + (v2i64 (X86vshrdq VR128:$src1, (i8 imm:$src2))))], + IIC_SSE_INTSHDQ_P_RI>; // PSRADQri doesn't exist in SSE[1-3]. } } // Constraints = "$src1 = $dst" @@ -4279,12 +4291,6 @@ let Predicates = [HasAVX] in { (VPSRLDQri VR128:$src1, (BYTE_imm imm:$src2))>; def : Pat<(v2f64 (X86fsrl VR128:$src1, i32immSExt8:$src2)), (VPSRLDQri VR128:$src1, (BYTE_imm imm:$src2))>; - - // Shift up / down and insert zero's. - def : Pat<(v2i64 (X86vshldq VR128:$src, (i8 imm:$amt))), - (VPSLLDQri VR128:$src, (BYTE_imm imm:$amt))>; - def : Pat<(v2i64 (X86vshrdq VR128:$src, (i8 imm:$amt))), - (VPSRLDQri VR128:$src, (BYTE_imm imm:$amt))>; } let Predicates = [HasAVX2] in { @@ -4292,12 +4298,6 @@ let Predicates = [HasAVX2] in { (VPSLLDQYri VR256:$src1, (BYTE_imm imm:$src2))>; def : Pat<(int_x86_avx2_psrl_dq VR256:$src1, imm:$src2), (VPSRLDQYri VR256:$src1, (BYTE_imm imm:$src2))>; - - // Shift up / down and insert zero's. - def : Pat<(v4i64 (X86vshldq VR256:$src, (i8 imm:$amt))), - (VPSLLDQYri VR256:$src, (BYTE_imm imm:$amt))>; - def : Pat<(v4i64 (X86vshrdq VR256:$src, (i8 imm:$amt))), - (VPSRLDQYri VR256:$src, (BYTE_imm imm:$amt))>; } let Predicates = [UseSSE2] in { @@ -4307,12 +4307,6 @@ let Predicates = [UseSSE2] in { (PSRLDQri VR128:$src1, (BYTE_imm imm:$src2))>; def : Pat<(v2f64 (X86fsrl VR128:$src1, i32immSExt8:$src2)), (PSRLDQri VR128:$src1, (BYTE_imm imm:$src2))>; - - // Shift up / down and insert zero's. - def : Pat<(v2i64 (X86vshldq VR128:$src, (i8 imm:$amt))), - (PSLLDQri VR128:$src, (BYTE_imm imm:$amt))>; - def : Pat<(v2i64 (X86vshrdq VR128:$src, (i8 imm:$amt))), - (PSRLDQri VR128:$src, (BYTE_imm imm:$amt))>; } //===---------------------------------------------------------------------===// diff --git a/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll b/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll index ac2c73bb932..fa1e9ee66eb 100644 --- a/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll +++ b/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll @@ -31,3 +31,18 @@ define <16 x i16> @test_x86_avx2_mpsadbw(<32 x i8> %a0, <32 x i8> %a1) { } declare <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8>, <32 x i8>, i32) nounwind readnone + +define <4 x i64> @test_x86_avx2_psll_dq_bs(<4 x i64> %a0) { + ; CHECK: vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8],zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24] + %res = call <4 x i64> @llvm.x86.avx2.psll.dq.bs(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1] + ret <4 x i64> %res +} +declare <4 x i64> @llvm.x86.avx2.psll.dq.bs(<4 x i64>, i32) nounwind readnone + + +define <4 x i64> @test_x86_avx2_psrl_dq_bs(<4 x i64> %a0) { + ; CHECK: vpsrldq {{.*#+}} ymm0 = ymm0[7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,ymm0[23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero + %res = call <4 x i64> @llvm.x86.avx2.psrl.dq.bs(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1] + ret <4 x i64> %res +} +declare <4 x i64> @llvm.x86.avx2.psrl.dq.bs(<4 x i64>, i32) nounwind readnone diff --git a/test/CodeGen/X86/avx2-intrinsics-x86.ll b/test/CodeGen/X86/avx2-intrinsics-x86.ll index 79a3361bfe8..2dec98991cb 100644 --- a/test/CodeGen/X86/avx2-intrinsics-x86.ll +++ b/test/CodeGen/X86/avx2-intrinsics-x86.ll @@ -168,14 +168,6 @@ define <4 x i64> @test_x86_avx2_psll_dq(<4 x i64> %a0) { declare <4 x i64> @llvm.x86.avx2.psll.dq(<4 x i64>, i32) nounwind readnone -define <4 x i64> @test_x86_avx2_psll_dq_bs(<4 x i64> %a0) { - ; CHECK: vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8],zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24] - %res = call <4 x i64> @llvm.x86.avx2.psll.dq.bs(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1] - ret <4 x i64> %res -} -declare <4 x i64> @llvm.x86.avx2.psll.dq.bs(<4 x i64>, i32) nounwind readnone - - define <4 x i64> @test_x86_avx2_psll_q(<4 x i64> %a0, <2 x i64> %a1) { ; CHECK: vpsllq %res = call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %a0, <2 x i64> %a1) ; <<4 x i64>> [#uses=1] @@ -264,14 +256,6 @@ define <4 x i64> @test_x86_avx2_psrl_dq(<4 x i64> %a0) { declare <4 x i64> @llvm.x86.avx2.psrl.dq(<4 x i64>, i32) nounwind readnone -define <4 x i64> @test_x86_avx2_psrl_dq_bs(<4 x i64> %a0) { - ; CHECK: vpsrldq {{.*#+}} ymm0 = ymm0[7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,ymm0[23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero - %res = call <4 x i64> @llvm.x86.avx2.psrl.dq.bs(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1] - ret <4 x i64> %res -} -declare <4 x i64> @llvm.x86.avx2.psrl.dq.bs(<4 x i64>, i32) nounwind readnone - - define <4 x i64> @test_x86_avx2_psrl_q(<4 x i64> %a0, <2 x i64> %a1) { ; CHECK: vpsrlq %res = call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %a0, <2 x i64> %a1) ; <<4 x i64>> [#uses=1]