diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 968d5a995fa..5829de09d3a 100644 --- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -323,7 +323,8 @@ namespace { } bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, - const APInt &DemandedElts); + const APInt &DemandedElts, + bool AssumeSingleUse = false); bool SimplifyDemandedVectorElts(SDValue Op, const APInt &DemandedElts, bool AssumeSingleUse = false); @@ -1058,10 +1059,12 @@ CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &TLO) { /// Check the specified integer node value to see if it can be simplified or if /// things it uses can be simplified by bit propagation. If so, return true. bool DAGCombiner::SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, - const APInt &DemandedElts) { + const APInt &DemandedElts, + bool AssumeSingleUse) { TargetLowering::TargetLoweringOpt TLO(DAG, LegalTypes, LegalOperations); KnownBits Known; - if (!TLI.SimplifyDemandedBits(Op, DemandedBits, DemandedElts, Known, TLO)) + if (!TLI.SimplifyDemandedBits(Op, DemandedBits, DemandedElts, Known, TLO, 0, + AssumeSingleUse)) return false; // Revisit the node. @@ -17210,6 +17213,7 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) { // extract_vector_elt of out-of-bounds element -> UNDEF auto *IndexC = dyn_cast(Index); unsigned NumElts = VecVT.getVectorNumElements(); + unsigned VecEltBitWidth = VecVT.getScalarSizeInBits(); if (IndexC && IndexC->getAPIntValue().uge(NumElts)) return DAG.getUNDEF(ScalarVT); @@ -17251,7 +17255,6 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) { "Extract element and scalar to vector can't change element type " "from FP to integer."); unsigned XBitWidth = X.getValueSizeInBits(); - unsigned VecEltBitWidth = VecVT.getScalarSizeInBits(); BCTruncElt = IsLE ? 0 : XBitWidth / VecEltBitWidth - 1; // An extract element return value type can be wider than its vector @@ -17334,6 +17337,14 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) { AddToWorklist(N); return SDValue(N, 0); } + APInt DemandedBits = APInt::getAllOnesValue(VecEltBitWidth); + if (SimplifyDemandedBits(VecOp, DemandedBits, DemandedElts, true)) { + // We simplified the vector operand of this extract element. If this + // extract is not dead, visit it again so it is folded properly. + if (N->getOpcode() != ISD::DELETED_NODE) + AddToWorklist(N); + return SDValue(N, 0); + } } // Everything under here is trying to match an extract of a loaded value. diff --git a/test/CodeGen/AArch64/vecreduce-and-legalization.ll b/test/CodeGen/AArch64/vecreduce-and-legalization.ll index 810a0ba0612..43b6f5290d1 100644 --- a/test/CodeGen/AArch64/vecreduce-and-legalization.ll +++ b/test/CodeGen/AArch64/vecreduce-and-legalization.ll @@ -102,8 +102,6 @@ define i8 @test_v9i8(<9 x i8> %a) nounwind { ; CHECK-NEXT: mov v0.b[11], w8 ; CHECK-NEXT: mov v0.b[12], w8 ; CHECK-NEXT: mov v0.b[13], w8 -; CHECK-NEXT: mov v0.b[14], w8 -; CHECK-NEXT: mov v0.b[15], w8 ; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 ; CHECK-NEXT: and v1.8b, v0.8b, v1.8b ; CHECK-NEXT: umov w8, v1.b[1] @@ -113,7 +111,7 @@ define i8 @test_v9i8(<9 x i8> %a) nounwind { ; CHECK-NEXT: and w8, w8, w9 ; CHECK-NEXT: umov w9, v1.b[3] ; CHECK-NEXT: and w8, w8, w9 -; CHECK-NEXT: umov w9, v1.b[4] +; CHECK-NEXT: umov w9, v0.b[4] ; CHECK-NEXT: and w8, w8, w9 ; CHECK-NEXT: umov w9, v1.b[5] ; CHECK-NEXT: and w8, w8, w9 diff --git a/test/CodeGen/AMDGPU/extractelt-to-trunc.ll b/test/CodeGen/AMDGPU/extractelt-to-trunc.ll index ef6079d9b35..1165cd82a29 100644 --- a/test/CodeGen/AMDGPU/extractelt-to-trunc.ll +++ b/test/CodeGen/AMDGPU/extractelt-to-trunc.ll @@ -69,7 +69,7 @@ define amdgpu_kernel void @bitcast_int_to_fpvector_extract_0(float addrspace(1)* ; GCN-LABEL: bitcast_int_to_fpvector_extract_0: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0xd +; GCN-NEXT: s_load_dword s12, s[0:1], 0xd ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s10, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -77,7 +77,7 @@ define amdgpu_kernel void @bitcast_int_to_fpvector_extract_0(float addrspace(1)* ; GCN-NEXT: s_mov_b32 s11, s3 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b64 s[8:9], s[6:7] -; GCN-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[8:11], 0 addr64 +; GCN-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_mov_b32 s0, s4 ; GCN-NEXT: s_mov_b32 s1, s5 diff --git a/test/CodeGen/AMDGPU/idot4s.ll b/test/CodeGen/AMDGPU/idot4s.ll index 6f9ba28e4ac..43ca2a40039 100644 --- a/test/CodeGen/AMDGPU/idot4s.ll +++ b/test/CodeGen/AMDGPU/idot4s.ll @@ -927,35 +927,31 @@ define amdgpu_kernel void @idot4_acc16_vecMul(<4 x i8> addrspace(1)* %src1, ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX8-NEXT: s_mov_b32 s2, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ushort v2, v[0:1] ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_i32 s6, s3, 0x80000 -; GFX8-NEXT: s_lshr_b32 s4, s3, 16 -; GFX8-NEXT: v_ashrrev_i16_e64 v3, 8, s3 -; GFX8-NEXT: s_bfe_i32 s3, s4, 0x80000 -; GFX8-NEXT: s_lshr_b32 s1, s0, 16 -; GFX8-NEXT: s_bfe_i32 s5, s0, 0x80000 -; GFX8-NEXT: v_ashrrev_i16_e64 v4, 8, s0 -; GFX8-NEXT: s_bfe_i32 s0, s1, 0x80000 -; GFX8-NEXT: v_ashrrev_i16_e64 v6, 8, s1 -; GFX8-NEXT: s_and_b32 s1, s2, s6 -; GFX8-NEXT: v_ashrrev_i16_e64 v5, 8, s4 -; GFX8-NEXT: s_and_b32 s4, s2, s5 -; GFX8-NEXT: v_mov_b32_e32 v7, s1 -; GFX8-NEXT: s_and_b32 s3, s2, s3 -; GFX8-NEXT: s_and_b32 s0, s2, s0 +; GFX8-NEXT: s_sext_i32_i8 s1, s2 +; GFX8-NEXT: v_lshrrev_b16_e64 v3, 8, s2 +; GFX8-NEXT: v_mov_b32_e32 v4, s1 +; GFX8-NEXT: s_bfe_i32 s3, s2, 0x80010 +; GFX8-NEXT: v_lshrrev_b16_e64 v5, 8, s0 +; GFX8-NEXT: s_sext_i32_i8 s1, s0 +; GFX8-NEXT: v_bfe_i32 v5, v5, 0, 8 +; GFX8-NEXT: v_bfe_i32 v3, v3, 0, 8 +; GFX8-NEXT: s_bfe_i32 s4, s0, 0x80010 +; GFX8-NEXT: s_ashr_i32 s2, s2, 24 +; GFX8-NEXT: v_mov_b32_e32 v6, s3 +; GFX8-NEXT: s_ashr_i32 s0, s0, 24 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v2, s4, v7, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, v4, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, v6, v5, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s1, v4, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, v5, v3, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s4, v6, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s2 +; GFX8-NEXT: v_mad_i32_i24 v2, s0, v3, v2 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm ; diff --git a/test/CodeGen/AMDGPU/idot4u.ll b/test/CodeGen/AMDGPU/idot4u.ll index 33cbe55f32e..0ca56d31196 100644 --- a/test/CodeGen/AMDGPU/idot4u.ll +++ b/test/CodeGen/AMDGPU/idot4u.ll @@ -2043,34 +2043,33 @@ define amdgpu_kernel void @udot4_acc8_vecMul(<4 x i8> addrspace(1)* %src1, ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX8-NEXT: s_movk_i32 s8, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1] +; GFX8-NEXT: s_movk_i32 s0, 0xff +; GFX8-NEXT: v_mov_b32_e32 v3, s0 ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v3, s0 -; GFX8-NEXT: v_mov_b32_e32 v4, s1 -; GFX8-NEXT: s_and_b32 s7, s1, s8 ; GFX8-NEXT: s_lshr_b32 s2, s0, 24 -; GFX8-NEXT: s_lshr_b32 s3, s1, 24 -; GFX8-NEXT: s_bfe_u32 s6, s1, 0x80010 -; GFX8-NEXT: v_mul_u32_u24_sdwa v3, v3, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 -; GFX8-NEXT: s_and_b32 s5, s0, s8 -; GFX8-NEXT: v_mov_b32_e32 v4, s7 -; GFX8-NEXT: v_mul_u32_u24_e32 v4, s5, v4 -; GFX8-NEXT: s_bfe_u32 s4, s0, 0x80010 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 -; GFX8-NEXT: v_mov_b32_e32 v6, s3 +; GFX8-NEXT: s_lshr_b32 s4, s1, 24 +; GFX8-NEXT: s_lshr_b32 s3, s0, 16 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: s_mul_i32 s0, s0, s1 +; GFX8-NEXT: s_lshr_b32 s5, s1, 16 +; GFX8-NEXT: v_mul_u32_u24_sdwa v4, v4, v5 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: v_and_b32_e32 v3, s0, v3 +; GFX8-NEXT: v_mov_b32_e32 v6, s4 ; GFX8-NEXT: v_mov_b32_e32 v7, s2 -; GFX8-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_mul_u32_u24_e32 v5, s4, v5 +; GFX8-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX8-NEXT: v_mul_u32_u24_e32 v5, s3, v5 ; GFX8-NEXT: v_mul_u32_u24_sdwa v6, v7, v6 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX8-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v4, v3, v5 +; GFX8-NEXT: v_or_b32_sdwa v4, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v4, v3, v4 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v4 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 diff --git a/test/CodeGen/AMDGPU/idot8s.ll b/test/CodeGen/AMDGPU/idot8s.ll index e5f6023618e..13c243fcdd9 100644 --- a/test/CodeGen/AMDGPU/idot8s.ll +++ b/test/CodeGen/AMDGPU/idot8s.ll @@ -1637,68 +1637,60 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s7, s[6:7], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ushort v2, v[0:1] -; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b16_e64 v4, 12, s2 -; GFX8-NEXT: s_lshr_b32 s15, s2, 4 -; GFX8-NEXT: s_lshr_b32 s16, s2, 8 -; GFX8-NEXT: v_lshlrev_b16_e64 v12, 12, s16 -; GFX8-NEXT: v_lshlrev_b16_e64 v3, 12, s0 -; GFX8-NEXT: s_lshr_b32 s8, s0, 4 -; GFX8-NEXT: s_lshr_b32 s9, s0, 8 -; GFX8-NEXT: v_lshlrev_b16_e64 v5, 12, s9 -; GFX8-NEXT: v_lshlrev_b16_e64 v6, 12, s8 -; GFX8-NEXT: v_lshlrev_b16_e64 v13, 12, s15 -; GFX8-NEXT: v_ashrrev_i16_e32 v3, 12, v3 -; GFX8-NEXT: v_ashrrev_i16_e32 v4, 12, v4 -; GFX8-NEXT: s_lshr_b32 s7, s0, 12 -; GFX8-NEXT: s_lshr_b32 s14, s2, 12 -; GFX8-NEXT: v_ashrrev_i16_e32 v6, 12, v6 -; GFX8-NEXT: v_ashrrev_i16_e32 v5, 12, v5 -; GFX8-NEXT: v_ashrrev_i16_e32 v12, 12, v12 -; GFX8-NEXT: v_ashrrev_i16_e32 v13, 12, v13 -; GFX8-NEXT: v_lshlrev_b16_e64 v7, 12, s7 -; GFX8-NEXT: v_lshlrev_b16_e64 v14, 12, s14 -; GFX8-NEXT: s_lshr_b32 s6, s0, 16 -; GFX8-NEXT: s_lshr_b32 s13, s2, 16 -; GFX8-NEXT: v_mul_u32_u24_e32 v5, v5, v12 -; GFX8-NEXT: v_lshlrev_b16_e64 v8, 12, s6 -; GFX8-NEXT: v_lshlrev_b16_e64 v15, 12, s13 -; GFX8-NEXT: s_lshr_b32 s5, s0, 20 -; GFX8-NEXT: s_lshr_b32 s12, s2, 20 -; GFX8-NEXT: v_ashrrev_i16_e32 v7, 12, v7 -; GFX8-NEXT: v_ashrrev_i16_e32 v14, 12, v14 -; GFX8-NEXT: v_lshlrev_b16_e64 v9, 12, s5 -; GFX8-NEXT: v_lshlrev_b16_e64 v16, 12, s12 -; GFX8-NEXT: s_lshr_b32 s4, s0, 24 -; GFX8-NEXT: s_lshr_b32 s11, s2, 24 -; GFX8-NEXT: v_ashrrev_i16_e32 v8, 12, v8 -; GFX8-NEXT: v_ashrrev_i16_e32 v15, 12, v15 -; GFX8-NEXT: v_lshlrev_b16_e64 v10, 12, s4 -; GFX8-NEXT: v_lshlrev_b16_e64 v17, 12, s11 -; GFX8-NEXT: s_lshr_b32 s1, s0, 28 -; GFX8-NEXT: s_lshr_b32 s10, s2, 28 -; GFX8-NEXT: v_ashrrev_i16_e32 v9, 12, v9 -; GFX8-NEXT: v_ashrrev_i16_e32 v16, 12, v16 -; GFX8-NEXT: v_lshlrev_b16_e64 v11, 12, s1 -; GFX8-NEXT: v_lshlrev_b16_e64 v18, 12, s10 -; GFX8-NEXT: v_ashrrev_i16_e32 v10, 12, v10 -; GFX8-NEXT: v_ashrrev_i16_e32 v17, 12, v17 -; GFX8-NEXT: v_ashrrev_i16_e32 v11, 12, v11 -; GFX8-NEXT: v_ashrrev_i16_e32 v18, 12, v18 +; GFX8-NEXT: s_lshl_b32 s5, s7, 28 +; GFX8-NEXT: s_ashr_i64 s[4:5], s[4:5], 60 +; GFX8-NEXT: s_lshl_b32 s9, s7, 24 +; GFX8-NEXT: s_lshl_b32 s11, s7, 20 +; GFX8-NEXT: s_lshl_b32 s5, s1, 28 +; GFX8-NEXT: s_ashr_i64 s[14:15], s[4:5], 60 +; GFX8-NEXT: s_lshl_b32 s5, s1, 20 +; GFX8-NEXT: s_lshl_b32 s13, s1, 24 +; GFX8-NEXT: s_ashr_i64 s[8:9], s[8:9], 60 +; GFX8-NEXT: s_ashr_i64 s[10:11], s[10:11], 60 +; GFX8-NEXT: v_mov_b32_e32 v3, s4 +; GFX8-NEXT: s_ashr_i64 s[4:5], s[4:5], 60 +; GFX8-NEXT: v_mov_b32_e32 v4, s10 +; GFX8-NEXT: s_ashr_i64 s[12:13], s[12:13], 60 +; GFX8-NEXT: s_lshl_b32 s5, s7, 16 +; GFX8-NEXT: v_mov_b32_e32 v5, s8 +; GFX8-NEXT: s_lshl_b32 s9, s1, 16 +; GFX8-NEXT: s_lshl_b32 s11, s7, 12 +; GFX8-NEXT: s_ashr_i64 s[16:17], s[4:5], 60 +; GFX8-NEXT: s_ashr_i64 s[8:9], s[8:9], 60 +; GFX8-NEXT: v_mul_i32_i24_e32 v4, s4, v4 +; GFX8-NEXT: s_lshl_b32 s5, s1, 12 +; GFX8-NEXT: s_lshl_b32 s9, s7, 8 +; GFX8-NEXT: s_ashr_i64 s[10:11], s[10:11], 60 +; GFX8-NEXT: v_mov_b32_e32 v6, s16 +; GFX8-NEXT: s_ashr_i64 s[20:21], s[8:9], 60 +; GFX8-NEXT: s_lshl_b32 s13, s1, 8 +; GFX8-NEXT: s_ashr_i64 s[18:19], s[4:5], 60 +; GFX8-NEXT: s_lshl_b32 s5, s7, 4 +; GFX8-NEXT: v_mov_b32_e32 v7, s10 +; GFX8-NEXT: s_lshl_b32 s9, s1, 4 +; GFX8-NEXT: s_ashr_i64 s[24:25], s[4:5], 60 +; GFX8-NEXT: s_ashr_i64 s[22:23], s[12:13], 60 +; GFX8-NEXT: v_mov_b32_e32 v8, s20 +; GFX8-NEXT: s_ashr_i64 s[6:7], s[6:7], 60 +; GFX8-NEXT: s_ashr_i64 s[26:27], s[8:9], 60 +; GFX8-NEXT: v_mov_b32_e32 v9, s24 +; GFX8-NEXT: s_ashr_i64 s[0:1], s[0:1], 60 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v2, v3, v4, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, v6, v13, v2 -; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; GFX8-NEXT: v_mad_u32_u24 v2, v7, v14, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, v8, v15, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, v9, v16, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, v10, v17, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, v11, v18, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s14, v3, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s12, v5, v2 +; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; GFX8-NEXT: v_mad_i32_i24 v2, s8, v6, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s18, v7, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s22, v8, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s26, v9, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s6 +; GFX8-NEXT: v_mad_i32_i24 v2, s0, v3, v2 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -2023,87 +2015,83 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_mov_b32 s2, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1] ; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX8-NEXT: s_mov_b32 s0, 0xffff +; GFX8-NEXT: s_load_dword s5, s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_lshr_b32 s8, s1, 4 -; GFX8-NEXT: s_lshr_b32 s9, s1, 12 -; GFX8-NEXT: s_lshr_b32 s10, s1, 8 -; GFX8-NEXT: s_lshr_b32 s15, s2, 4 -; GFX8-NEXT: s_lshr_b32 s16, s2, 12 -; GFX8-NEXT: s_lshr_b32 s17, s2, 8 -; GFX8-NEXT: v_lshlrev_b16_e64 v3, 12, s1 -; GFX8-NEXT: v_lshlrev_b16_e64 v4, 12, s2 -; GFX8-NEXT: v_lshlrev_b16_e64 v5, 12, s10 -; GFX8-NEXT: v_lshlrev_b16_e64 v6, 12, s9 -; GFX8-NEXT: v_lshlrev_b16_e64 v7, 12, s8 -; GFX8-NEXT: v_lshlrev_b16_e64 v12, 12, s17 -; GFX8-NEXT: v_lshlrev_b16_e64 v13, 12, s16 -; GFX8-NEXT: v_lshlrev_b16_e64 v14, 12, s15 -; GFX8-NEXT: s_lshr_b32 s4, s1, 20 -; GFX8-NEXT: s_lshr_b32 s5, s1, 16 -; GFX8-NEXT: s_lshr_b32 s6, s1, 28 -; GFX8-NEXT: s_lshr_b32 s7, s1, 24 -; GFX8-NEXT: s_lshr_b32 s11, s2, 20 -; GFX8-NEXT: s_lshr_b32 s12, s2, 16 -; GFX8-NEXT: s_lshr_b32 s13, s2, 28 -; GFX8-NEXT: s_lshr_b32 s14, s2, 24 -; GFX8-NEXT: v_lshlrev_b16_e64 v8, 12, s7 -; GFX8-NEXT: v_lshlrev_b16_e64 v9, 12, s6 -; GFX8-NEXT: v_lshlrev_b16_e64 v10, 12, s5 -; GFX8-NEXT: v_lshlrev_b16_e64 v11, 12, s4 -; GFX8-NEXT: v_lshlrev_b16_e64 v15, 12, s14 -; GFX8-NEXT: v_lshlrev_b16_e64 v16, 12, s13 -; GFX8-NEXT: v_lshlrev_b16_e64 v17, 12, s12 -; GFX8-NEXT: v_lshlrev_b16_e64 v18, 12, s11 -; GFX8-NEXT: v_ashrrev_i16_e32 v3, 12, v3 -; GFX8-NEXT: v_ashrrev_i16_e32 v4, 12, v4 -; GFX8-NEXT: v_ashrrev_i16_e32 v5, 12, v5 -; GFX8-NEXT: v_ashrrev_i16_e32 v6, 12, v6 -; GFX8-NEXT: v_ashrrev_i16_e32 v7, 12, v7 -; GFX8-NEXT: v_ashrrev_i16_e32 v12, 12, v12 -; GFX8-NEXT: v_ashrrev_i16_e32 v13, 12, v13 -; GFX8-NEXT: v_ashrrev_i16_e32 v14, 12, v14 -; GFX8-NEXT: v_mul_u32_u24_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 -; GFX8-NEXT: v_mul_u32_u24_sdwa v4, v5, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 -; GFX8-NEXT: v_mul_u32_u24_sdwa v5, v6, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 -; GFX8-NEXT: v_mul_u32_u24_sdwa v6, v7, v14 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 -; GFX8-NEXT: v_ashrrev_i16_e32 v8, 12, v8 -; GFX8-NEXT: v_ashrrev_i16_e32 v15, 12, v15 -; GFX8-NEXT: v_ashrrev_i16_e32 v9, 12, v9 -; GFX8-NEXT: v_ashrrev_i16_e32 v10, 12, v10 -; GFX8-NEXT: v_ashrrev_i16_e32 v11, 12, v11 -; GFX8-NEXT: v_ashrrev_i16_e32 v16, 12, v16 -; GFX8-NEXT: v_ashrrev_i16_e32 v17, 12, v17 -; GFX8-NEXT: v_ashrrev_i16_e32 v18, 12, v18 -; GFX8-NEXT: v_or_b32_sdwa v3, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_mul_u32_u24_sdwa v7, v8, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 -; GFX8-NEXT: v_mul_u32_u24_sdwa v8, v9, v16 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 -; GFX8-NEXT: v_mul_u32_u24_sdwa v9, v10, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 -; GFX8-NEXT: v_mul_u32_u24_sdwa v10, v11, v18 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_e32 v3, s0, v3 -; GFX8-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_e32 v5, s0, v9 -; GFX8-NEXT: v_or_b32_e32 v4, v3, v4 -; GFX8-NEXT: v_or_b32_e32 v6, v5, v7 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v4 -; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v6 +; GFX8-NEXT: s_lshl_b32 s13, s1, 24 +; GFX8-NEXT: s_lshl_b32 s17, s1, 16 +; GFX8-NEXT: s_ashr_i64 s[22:23], s[4:5], 60 +; GFX8-NEXT: s_lshl_b32 s25, s5, 24 +; GFX8-NEXT: s_lshl_b32 s27, s5, 28 +; GFX8-NEXT: s_lshl_b32 s29, s5, 16 +; GFX8-NEXT: s_ashr_i64 s[10:11], s[0:1], 60 +; GFX8-NEXT: s_lshl_b32 s15, s1, 28 +; GFX8-NEXT: s_lshl_b32 s19, s5, 8 +; GFX8-NEXT: s_lshl_b32 s21, s5, 12 +; GFX8-NEXT: s_lshl_b32 s23, s5, 4 +; GFX8-NEXT: s_lshl_b32 s5, s5, 20 +; GFX8-NEXT: s_ashr_i64 s[12:13], s[12:13], 60 +; GFX8-NEXT: s_ashr_i64 s[16:17], s[16:17], 60 +; GFX8-NEXT: s_ashr_i64 s[24:25], s[24:25], 60 +; GFX8-NEXT: s_ashr_i64 s[26:27], s[26:27], 60 +; GFX8-NEXT: s_ashr_i64 s[28:29], s[28:29], 60 +; GFX8-NEXT: s_lshl_b32 s7, s1, 8 +; GFX8-NEXT: s_lshl_b32 s9, s1, 12 +; GFX8-NEXT: s_lshl_b32 s11, s1, 4 +; GFX8-NEXT: s_lshl_b32 s1, s1, 20 +; GFX8-NEXT: s_ashr_i64 s[4:5], s[4:5], 60 +; GFX8-NEXT: s_ashr_i64 s[14:15], s[14:15], 60 +; GFX8-NEXT: v_mov_b32_e32 v6, s28 +; GFX8-NEXT: v_mov_b32_e32 v7, s16 +; GFX8-NEXT: v_mov_b32_e32 v8, s26 +; GFX8-NEXT: v_mov_b32_e32 v9, s24 +; GFX8-NEXT: v_mov_b32_e32 v10, s12 +; GFX8-NEXT: v_mul_i32_i24_sdwa v6, v7, v6 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_mul_i32_i24_e32 v7, s14, v8 +; GFX8-NEXT: v_mul_i32_i24_sdwa v8, v10, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: s_ashr_i64 s[0:1], s[0:1], 60 +; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: v_mul_i32_i24_e32 v5, s0, v5 +; GFX8-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: s_ashr_i64 s[6:7], s[6:7], 60 +; GFX8-NEXT: s_ashr_i64 s[18:19], s[18:19], 60 +; GFX8-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_e32 v6, s2, v7 +; GFX8-NEXT: s_ashr_i64 s[20:21], s[20:21], 60 +; GFX8-NEXT: v_mov_b32_e32 v3, s22 +; GFX8-NEXT: v_mov_b32_e32 v4, s10 +; GFX8-NEXT: s_ashr_i64 s[32:33], s[22:23], 60 +; GFX8-NEXT: v_mul_i32_i24_sdwa v3, v4, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v5, v6, v5 +; GFX8-NEXT: s_ashr_i64 s[8:9], s[8:9], 60 +; GFX8-NEXT: v_mov_b32_e32 v4, s20 +; GFX8-NEXT: v_mov_b32_e32 v12, s18 +; GFX8-NEXT: v_mov_b32_e32 v13, s6 +; GFX8-NEXT: s_ashr_i64 s[30:31], s[10:11], 60 +; GFX8-NEXT: v_mov_b32_e32 v11, s32 +; GFX8-NEXT: v_mul_i32_i24_e32 v4, s8, v4 +; GFX8-NEXT: v_mul_i32_i24_sdwa v10, v13, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v5 +; GFX8-NEXT: v_or_b32_sdwa v4, v4, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_mul_i32_i24_e32 v9, s30, v11 +; GFX8-NEXT: v_or_b32_sdwa v3, v9, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_e32 v4, s2, v4 +; GFX8-NEXT: v_or_b32_e32 v3, v4, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v3 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0 -; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 +; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0 +; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v8, v2 -; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: s_endpgm ; diff --git a/test/CodeGen/AMDGPU/store-weird-sizes.ll b/test/CodeGen/AMDGPU/store-weird-sizes.ll index d03d09f2be5..d9a0ae7ffda 100644 --- a/test/CodeGen/AMDGPU/store-weird-sizes.ll +++ b/test/CodeGen/AMDGPU/store-weird-sizes.ll @@ -52,43 +52,49 @@ define amdgpu_kernel void @local_store_i55(i55 addrspace(3)* %ptr, i55 %arg) #0 ; ; FIJI-LABEL: local_store_i55: ; FIJI: ; %bb.0: -; FIJI-NEXT: s_add_u32 s0, s4, 14 -; FIJI-NEXT: s_addc_u32 s1, s5, 0 -; FIJI-NEXT: v_mov_b32_e32 v0, s0 -; FIJI-NEXT: v_mov_b32_e32 v1, s1 -; FIJI-NEXT: flat_load_ubyte v0, v[0:1] ; FIJI-NEXT: s_load_dword s0, s[4:5], 0x0 ; FIJI-NEXT: s_load_dword s1, s[4:5], 0x8 ; FIJI-NEXT: s_load_dword s2, s[4:5], 0xc ; FIJI-NEXT: s_mov_b32 m0, -1 ; FIJI-NEXT: s_waitcnt lgkmcnt(0) -; FIJI-NEXT: v_mov_b32_e32 v1, s0 +; FIJI-NEXT: v_mov_b32_e32 v2, s0 ; FIJI-NEXT: v_mov_b32_e32 v3, s1 -; FIJI-NEXT: v_mov_b32_e32 v2, s2 -; FIJI-NEXT: ds_write_b16 v1, v2 offset:4 -; FIJI-NEXT: s_waitcnt vmcnt(0) -; FIJI-NEXT: v_and_b32_e32 v0, 0x7f, v0 -; FIJI-NEXT: ds_write_b8 v1, v0 offset:6 -; FIJI-NEXT: ds_write_b32 v1, v3 +; FIJI-NEXT: s_and_b32 s3, s2, 0xffff +; FIJI-NEXT: s_add_u32 s0, s4, 14 +; FIJI-NEXT: s_addc_u32 s1, s5, 0 +; FIJI-NEXT: v_mov_b32_e32 v0, s0 +; FIJI-NEXT: v_mov_b32_e32 v1, s1 +; FIJI-NEXT: flat_load_ubyte v0, v[0:1] +; FIJI-NEXT: v_mov_b32_e32 v1, s2 +; FIJI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; FIJI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; FIJI-NEXT: v_or_b32_e32 v0, s3, v0 +; FIJI-NEXT: v_bfe_u32 v0, v0, 16, 7 +; FIJI-NEXT: ds_write_b16 v2, v1 offset:4 +; FIJI-NEXT: ds_write_b8 v2, v0 offset:6 +; FIJI-NEXT: ds_write_b32 v2, v3 ; FIJI-NEXT: s_endpgm ; ; GFX9-LABEL: local_store_i55: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: global_load_ubyte_d16_hi v0, v[0:1], off offset:14 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_load_ubyte_d16_hi v2, v[0:1], off offset:14 ; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s1, s[4:5], 0x8 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: ds_write_b16 v1, v2 offset:4 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: s_and_b32 s3, s2, 0xffff +; GFX9-NEXT: ds_write_b16 v0, v1 offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_bfe_u32 v0, v0, 16, 7 -; GFX9-NEXT: ds_write_b8 v1, v0 offset:6 -; GFX9-NEXT: ds_write_b32 v1, v3 +; GFX9-NEXT: v_or_b32_e32 v1, s3, v2 +; GFX9-NEXT: v_and_b32_e32 v1, 0x7fffff, v1 +; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:6 +; GFX9-NEXT: ds_write_b32 v0, v3 ; GFX9-NEXT: s_endpgm store i55 %arg, i55 addrspace(3)* %ptr, align 8 ret void diff --git a/test/CodeGen/X86/psadbw.ll b/test/CodeGen/X86/psadbw.ll index 753e88c3dbc..7115695d66a 100644 --- a/test/CodeGen/X86/psadbw.ll +++ b/test/CodeGen/X86/psadbw.ll @@ -21,8 +21,7 @@ define i64 @combine_psadbw_demandedelt(<16 x i8> %0, <16 x i8> %1) { ; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,3,2] ; X86-NEXT: psadbw %xmm0, %xmm1 ; X86-NEXT: movd %xmm1, %eax -; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; X86-NEXT: movd %xmm0, %edx +; X86-NEXT: xorl %edx, %edx ; X86-NEXT: retl ; ; X64-LABEL: combine_psadbw_demandedelt: diff --git a/test/CodeGen/X86/vector-reduce-mul.ll b/test/CodeGen/X86/vector-reduce-mul.ll index 927f51e9b1b..2da46c0f5d3 100644 --- a/test/CodeGen/X86/vector-reduce-mul.ll +++ b/test/CodeGen/X86/vector-reduce-mul.ll @@ -1576,10 +1576,8 @@ define i8 @test_v4i8(<4 x i8> %a0) { ; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: pmullw %xmm1, %xmm0 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: packuswb %xmm0, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $8, %xmm1 +; SSE2-NEXT: psrld $16, %xmm1 ; SSE2-NEXT: pmullw %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: # kill: def $al killed $al killed $eax @@ -1591,7 +1589,7 @@ define i8 @test_v4i8(<4 x i8> %a0) { ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE41-NEXT: pmullw %xmm0, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2],zero,xmm0[6],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero +; SSE41-NEXT: psrld $16, %xmm0 ; SSE41-NEXT: pmullw %xmm1, %xmm0 ; SSE41-NEXT: pextrb $0, %xmm0, %eax ; SSE41-NEXT: # kill: def $al killed $al killed $eax @@ -1602,7 +1600,7 @@ define i8 @test_v4i8(<4 x i8> %a0) { ; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2],zero,xmm0[6],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero +; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpextrb $0, %xmm0, %eax ; AVX-NEXT: # kill: def $al killed $al killed $eax @@ -1613,7 +1611,7 @@ define i8 @test_v4i8(<4 x i8> %a0) { ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2],zero,xmm0[6],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero +; AVX512BW-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax ; AVX512BW-NEXT: # kill: def $al killed $al killed $eax @@ -1625,7 +1623,7 @@ define i8 @test_v4i8(<4 x i8> %a0) { ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] ; AVX512VL-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2],zero,xmm0[6],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero +; AVX512VL-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX512VL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: vpextrb $0, %xmm0, %eax ; AVX512VL-NEXT: # kill: def $al killed $al killed $eax @@ -1636,7 +1634,7 @@ define i8 @test_v4i8(<4 x i8> %a0) { ; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2],zero,xmm0[6],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero +; AVX512DQ-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512DQ-NEXT: vpextrb $0, %xmm0, %eax ; AVX512DQ-NEXT: # kill: def $al killed $al killed $eax @@ -1654,10 +1652,8 @@ define i8 @test_v8i8(<8 x i8> %a0) { ; SSE2-NEXT: pmullw %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,2,3,0] ; SSE2-NEXT: pmullw %xmm1, %xmm0 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: packuswb %xmm0, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $8, %xmm1 +; SSE2-NEXT: psrld $16, %xmm1 ; SSE2-NEXT: pmullw %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: # kill: def $al killed $al killed $eax @@ -1673,7 +1669,7 @@ define i8 @test_v8i8(<8 x i8> %a0) { ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; SSE41-NEXT: pmullw %xmm0, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2],zero,xmm0[6],zero,xmm0[10],zero,xmm0[14],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero +; SSE41-NEXT: psrld $16, %xmm0 ; SSE41-NEXT: pmullw %xmm1, %xmm0 ; SSE41-NEXT: pextrb $0, %xmm0, %eax ; SSE41-NEXT: # kill: def $al killed $al killed $eax @@ -1686,7 +1682,7 @@ define i8 @test_v8i8(<8 x i8> %a0) { ; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2],zero,xmm0[6],zero,xmm0[10],zero,xmm0[14],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero +; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpextrb $0, %xmm0, %eax ; AVX-NEXT: # kill: def $al killed $al killed $eax @@ -1699,7 +1695,7 @@ define i8 @test_v8i8(<8 x i8> %a0) { ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2],zero,xmm0[6],zero,xmm0[10],zero,xmm0[14],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero +; AVX512BW-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax ; AVX512BW-NEXT: # kill: def $al killed $al killed $eax @@ -1713,7 +1709,7 @@ define i8 @test_v8i8(<8 x i8> %a0) { ; AVX512VL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: vpalignr {{.*#+}} xmm1 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3] ; AVX512VL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2],zero,xmm0[6],zero,xmm0[10],zero,xmm0[14],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero +; AVX512VL-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX512VL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: vpextrb $0, %xmm0, %eax ; AVX512VL-NEXT: # kill: def $al killed $al killed $eax @@ -1726,7 +1722,7 @@ define i8 @test_v8i8(<8 x i8> %a0) { ; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2],zero,xmm0[6],zero,xmm0[10],zero,xmm0[14],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero +; AVX512DQ-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512DQ-NEXT: vpextrb $0, %xmm0, %eax ; AVX512DQ-NEXT: # kill: def $al killed $al killed $eax @@ -1756,8 +1752,9 @@ define i8 @test_v16i8(<16 x i8> %a0) { ; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: pmullw %xmm2, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: packuswb %xmm3, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: packuswb %xmm3, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $8, %xmm1 ; SSE2-NEXT: pmullw %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax @@ -1799,10 +1796,10 @@ define i8 @test_v16i8(<16 x i8> %a0) { ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm1 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $0, %xmm0, %eax ; AVX1-NEXT: # kill: def $al killed $al killed $eax @@ -1947,8 +1944,9 @@ define i8 @test_v32i8(<32 x i8> %a0) { ; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: pmullw %xmm2, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: packuswb %xmm3, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: packuswb %xmm3, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $8, %xmm1 ; SSE2-NEXT: pmullw %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax @@ -2037,10 +2035,9 @@ define i8 @test_v32i8(<32 x i8> %a0) { ; AVX2-NEXT: vpsrld $16, %xmm1, %xmm1 ; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm1 -; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $0, %xmm0, %eax ; AVX2-NEXT: # kill: def $al killed $al killed $eax @@ -2126,10 +2123,9 @@ define i8 @test_v32i8(<32 x i8> %a0) { ; AVX512DQ-NEXT: vpsrld $16, %xmm1, %xmm1 ; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpand %xmm3, %xmm0, %xmm1 -; AVX512DQ-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 -; AVX512DQ-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX512DQ-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512DQ-NEXT: vpextrb $0, %xmm0, %eax ; AVX512DQ-NEXT: # kill: def $al killed $al killed $eax @@ -2159,10 +2155,9 @@ define i8 @test_v32i8(<32 x i8> %a0) { ; AVX512DQVL-NEXT: vpsrld $16, %xmm1, %xmm1 ; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512DQVL-NEXT: vpand %xmm3, %xmm0, %xmm1 -; AVX512DQVL-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 -; AVX512DQVL-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512DQVL-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512DQVL-NEXT: vpextrb $0, %xmm0, %eax ; AVX512DQVL-NEXT: # kill: def $al killed $al killed $eax @@ -2208,8 +2203,9 @@ define i8 @test_v64i8(<64 x i8> %a0) { ; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: pmullw %xmm2, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: packuswb %xmm3, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: packuswb %xmm3, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $8, %xmm1 ; SSE2-NEXT: pmullw %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax @@ -2323,10 +2319,9 @@ define i8 @test_v64i8(<64 x i8> %a0) { ; AVX2-NEXT: vpsrld $16, %xmm2, %xmm2 ; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero ; AVX2-NEXT: vpmullw %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpackuswb %xmm3, %xmm1, %xmm1 -; AVX2-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $0, %xmm0, %eax ; AVX2-NEXT: # kill: def $al killed $al killed $eax @@ -2365,10 +2360,9 @@ define i8 @test_v64i8(<64 x i8> %a0) { ; AVX512BW-NEXT: vpsrld $16, %xmm1, %xmm1 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpand %xmm3, %xmm0, %xmm1 -; AVX512BW-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 -; AVX512BW-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX512BW-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX512BW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax ; AVX512BW-NEXT: # kill: def $al killed $al killed $eax @@ -2407,10 +2401,9 @@ define i8 @test_v64i8(<64 x i8> %a0) { ; AVX512BWVL-NEXT: vpsrld $16, %xmm1, %xmm1 ; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpand %xmm3, %xmm0, %xmm1 -; AVX512BWVL-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512BWVL-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vpextrb $0, %xmm0, %eax ; AVX512BWVL-NEXT: # kill: def $al killed $al killed $eax @@ -2444,10 +2437,9 @@ define i8 @test_v64i8(<64 x i8> %a0) { ; AVX512DQ-NEXT: vpsrld $16, %xmm2, %xmm2 ; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero ; AVX512DQ-NEXT: vpmullw %xmm2, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpand %xmm1, %xmm0, %xmm1 -; AVX512DQ-NEXT: vpackuswb %xmm3, %xmm1, %xmm1 -; AVX512DQ-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX512DQ-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512DQ-NEXT: vpextrb $0, %xmm0, %eax ; AVX512DQ-NEXT: # kill: def $al killed $al killed $eax @@ -2486,10 +2478,9 @@ define i8 @test_v64i8(<64 x i8> %a0) { ; AVX512DQVL-NEXT: vpsrld $16, %xmm1, %xmm1 ; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512DQVL-NEXT: vpand %xmm3, %xmm0, %xmm1 -; AVX512DQVL-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 -; AVX512DQVL-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512DQVL-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512DQVL-NEXT: vpextrb $0, %xmm0, %eax ; AVX512DQVL-NEXT: # kill: def $al killed $al killed $eax @@ -2555,8 +2546,9 @@ define i8 @test_v128i8(<128 x i8> %a0) { ; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; SSE2-NEXT: pmullw %xmm2, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: packuswb %xmm3, %xmm0 +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: packuswb %xmm3, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: psrlw $8, %xmm0 ; SSE2-NEXT: pmullw %xmm1, %xmm0 ; SSE2-NEXT: movd %xmm0, %eax @@ -2729,10 +2721,9 @@ define i8 @test_v128i8(<128 x i8> %a0) { ; AVX2-NEXT: vpsrld $16, %xmm1, %xmm1 ; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm1 -; AVX2-NEXT: vpackuswb %xmm3, %xmm1, %xmm1 -; AVX2-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $0, %xmm0, %eax ; AVX2-NEXT: # kill: def $al killed $al killed $eax @@ -2779,10 +2770,9 @@ define i8 @test_v128i8(<128 x i8> %a0) { ; AVX512BW-NEXT: vpsrld $16, %xmm1, %xmm1 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpand %xmm3, %xmm0, %xmm1 -; AVX512BW-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 -; AVX512BW-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX512BW-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX512BW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax ; AVX512BW-NEXT: # kill: def $al killed $al killed $eax @@ -2829,10 +2819,9 @@ define i8 @test_v128i8(<128 x i8> %a0) { ; AVX512BWVL-NEXT: vpsrld $16, %xmm1, %xmm1 ; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpand %xmm3, %xmm0, %xmm1 -; AVX512BWVL-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512BWVL-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vpextrb $0, %xmm0, %eax ; AVX512BWVL-NEXT: # kill: def $al killed $al killed $eax @@ -2873,10 +2862,9 @@ define i8 @test_v128i8(<128 x i8> %a0) { ; AVX512DQ-NEXT: vpsrld $16, %xmm1, %xmm1 ; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpand %xmm2, %xmm0, %xmm1 -; AVX512DQ-NEXT: vpackuswb %xmm3, %xmm1, %xmm1 -; AVX512DQ-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX512DQ-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512DQ-NEXT: vpextrb $0, %xmm0, %eax ; AVX512DQ-NEXT: # kill: def $al killed $al killed $eax @@ -2922,10 +2910,9 @@ define i8 @test_v128i8(<128 x i8> %a0) { ; AVX512DQVL-NEXT: vpsrld $16, %xmm1, %xmm1 ; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512DQVL-NEXT: vpand %xmm2, %xmm0, %xmm1 -; AVX512DQVL-NEXT: vpackuswb %xmm3, %xmm1, %xmm1 -; AVX512DQVL-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512DQVL-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512DQVL-NEXT: vpextrb $0, %xmm0, %eax ; AVX512DQVL-NEXT: # kill: def $al killed $al killed $eax