[DAGCombine] visitEXTRACT_VECTOR_ELT - add SimplifyDemandedBits multi use support

Similar to what we already do with SimplifyDemandedVectorElts, call SimplifyDemandedBits across all the extracted elements of the source vector, treating it as a single use.

There's a minor regression in store-weird-sizes.ll, which will be addressed in an upcoming SimplifyDemandedBits patch.
This commit is contained in:
Simon Pilgrim 2020-02-20 15:49:22 +00:00
parent e1296518ec
commit ac3ec9a0c4
9 changed files with 262 additions and 278 deletions

View File

@ -323,7 +323,8 @@ namespace {
}
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits,
const APInt &DemandedElts);
const APInt &DemandedElts,
bool AssumeSingleUse = false);
bool SimplifyDemandedVectorElts(SDValue Op, const APInt &DemandedElts,
bool AssumeSingleUse = false);
@ -1058,10 +1059,12 @@ CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &TLO) {
/// Check the specified integer node value to see if it can be simplified or if
/// things it uses can be simplified by bit propagation. If so, return true.
bool DAGCombiner::SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits,
const APInt &DemandedElts) {
const APInt &DemandedElts,
bool AssumeSingleUse) {
TargetLowering::TargetLoweringOpt TLO(DAG, LegalTypes, LegalOperations);
KnownBits Known;
if (!TLI.SimplifyDemandedBits(Op, DemandedBits, DemandedElts, Known, TLO))
if (!TLI.SimplifyDemandedBits(Op, DemandedBits, DemandedElts, Known, TLO, 0,
AssumeSingleUse))
return false;
// Revisit the node.
@ -17210,6 +17213,7 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
// extract_vector_elt of out-of-bounds element -> UNDEF
auto *IndexC = dyn_cast<ConstantSDNode>(Index);
unsigned NumElts = VecVT.getVectorNumElements();
unsigned VecEltBitWidth = VecVT.getScalarSizeInBits();
if (IndexC && IndexC->getAPIntValue().uge(NumElts))
return DAG.getUNDEF(ScalarVT);
@ -17251,7 +17255,6 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
"Extract element and scalar to vector can't change element type "
"from FP to integer.");
unsigned XBitWidth = X.getValueSizeInBits();
unsigned VecEltBitWidth = VecVT.getScalarSizeInBits();
BCTruncElt = IsLE ? 0 : XBitWidth / VecEltBitWidth - 1;
// An extract element return value type can be wider than its vector
@ -17334,6 +17337,14 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
AddToWorklist(N);
return SDValue(N, 0);
}
APInt DemandedBits = APInt::getAllOnesValue(VecEltBitWidth);
if (SimplifyDemandedBits(VecOp, DemandedBits, DemandedElts, true)) {
// We simplified the vector operand of this extract element. If this
// extract is not dead, visit it again so it is folded properly.
if (N->getOpcode() != ISD::DELETED_NODE)
AddToWorklist(N);
return SDValue(N, 0);
}
}
// Everything under here is trying to match an extract of a loaded value.

View File

@ -102,8 +102,6 @@ define i8 @test_v9i8(<9 x i8> %a) nounwind {
; CHECK-NEXT: mov v0.b[11], w8
; CHECK-NEXT: mov v0.b[12], w8
; CHECK-NEXT: mov v0.b[13], w8
; CHECK-NEXT: mov v0.b[14], w8
; CHECK-NEXT: mov v0.b[15], w8
; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
; CHECK-NEXT: and v1.8b, v0.8b, v1.8b
; CHECK-NEXT: umov w8, v1.b[1]
@ -113,7 +111,7 @@ define i8 @test_v9i8(<9 x i8> %a) nounwind {
; CHECK-NEXT: and w8, w8, w9
; CHECK-NEXT: umov w9, v1.b[3]
; CHECK-NEXT: and w8, w8, w9
; CHECK-NEXT: umov w9, v1.b[4]
; CHECK-NEXT: umov w9, v0.b[4]
; CHECK-NEXT: and w8, w8, w9
; CHECK-NEXT: umov w9, v1.b[5]
; CHECK-NEXT: and w8, w8, w9

View File

@ -69,7 +69,7 @@ define amdgpu_kernel void @bitcast_int_to_fpvector_extract_0(float addrspace(1)*
; GCN-LABEL: bitcast_int_to_fpvector_extract_0:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0xd
; GCN-NEXT: s_load_dword s12, s[0:1], 0xd
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s10, 0
; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0
@ -77,7 +77,7 @@ define amdgpu_kernel void @bitcast_int_to_fpvector_extract_0(float addrspace(1)*
; GCN-NEXT: s_mov_b32 s11, s3
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b64 s[8:9], s[6:7]
; GCN-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[8:11], 0 addr64
; GCN-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_mov_b32 s0, s4
; GCN-NEXT: s_mov_b32 s1, s5

View File

@ -927,35 +927,31 @@ define amdgpu_kernel void @idot4_acc16_vecMul(<4 x i8> addrspace(1)* %src1,
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT: s_mov_b32 s2, 0xffff
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_ushort v2, v[0:1]
; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_bfe_i32 s6, s3, 0x80000
; GFX8-NEXT: s_lshr_b32 s4, s3, 16
; GFX8-NEXT: v_ashrrev_i16_e64 v3, 8, s3
; GFX8-NEXT: s_bfe_i32 s3, s4, 0x80000
; GFX8-NEXT: s_lshr_b32 s1, s0, 16
; GFX8-NEXT: s_bfe_i32 s5, s0, 0x80000
; GFX8-NEXT: v_ashrrev_i16_e64 v4, 8, s0
; GFX8-NEXT: s_bfe_i32 s0, s1, 0x80000
; GFX8-NEXT: v_ashrrev_i16_e64 v6, 8, s1
; GFX8-NEXT: s_and_b32 s1, s2, s6
; GFX8-NEXT: v_ashrrev_i16_e64 v5, 8, s4
; GFX8-NEXT: s_and_b32 s4, s2, s5
; GFX8-NEXT: v_mov_b32_e32 v7, s1
; GFX8-NEXT: s_and_b32 s3, s2, s3
; GFX8-NEXT: s_and_b32 s0, s2, s0
; GFX8-NEXT: s_sext_i32_i8 s1, s2
; GFX8-NEXT: v_lshrrev_b16_e64 v3, 8, s2
; GFX8-NEXT: v_mov_b32_e32 v4, s1
; GFX8-NEXT: s_bfe_i32 s3, s2, 0x80010
; GFX8-NEXT: v_lshrrev_b16_e64 v5, 8, s0
; GFX8-NEXT: s_sext_i32_i8 s1, s0
; GFX8-NEXT: v_bfe_i32 v5, v5, 0, 8
; GFX8-NEXT: v_bfe_i32 v3, v3, 0, 8
; GFX8-NEXT: s_bfe_i32 s4, s0, 0x80010
; GFX8-NEXT: s_ashr_i32 s2, s2, 24
; GFX8-NEXT: v_mov_b32_e32 v6, s3
; GFX8-NEXT: s_ashr_i32 s0, s0, 24
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mad_u32_u24 v2, s4, v7, v2
; GFX8-NEXT: v_mad_u32_u24 v2, v4, v3, v2
; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2
; GFX8-NEXT: v_mad_u32_u24 v2, v6, v5, v2
; GFX8-NEXT: v_mad_i32_i24 v2, s1, v4, v2
; GFX8-NEXT: v_mad_i32_i24 v2, v5, v3, v2
; GFX8-NEXT: v_mad_i32_i24 v2, s4, v6, v2
; GFX8-NEXT: v_mov_b32_e32 v3, s2
; GFX8-NEXT: v_mad_i32_i24 v2, s0, v3, v2
; GFX8-NEXT: flat_store_short v[0:1], v2
; GFX8-NEXT: s_endpgm
;

View File

@ -2043,34 +2043,33 @@ define amdgpu_kernel void @udot4_acc8_vecMul(<4 x i8> addrspace(1)* %src1,
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT: s_movk_i32 s8, 0xff
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_ubyte v2, v[0:1]
; GFX8-NEXT: s_movk_i32 s0, 0xff
; GFX8-NEXT: v_mov_b32_e32 v3, s0
; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v3, s0
; GFX8-NEXT: v_mov_b32_e32 v4, s1
; GFX8-NEXT: s_and_b32 s7, s1, s8
; GFX8-NEXT: s_lshr_b32 s2, s0, 24
; GFX8-NEXT: s_lshr_b32 s3, s1, 24
; GFX8-NEXT: s_bfe_u32 s6, s1, 0x80010
; GFX8-NEXT: v_mul_u32_u24_sdwa v3, v3, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
; GFX8-NEXT: s_and_b32 s5, s0, s8
; GFX8-NEXT: v_mov_b32_e32 v4, s7
; GFX8-NEXT: v_mul_u32_u24_e32 v4, s5, v4
; GFX8-NEXT: s_bfe_u32 s4, s0, 0x80010
; GFX8-NEXT: v_mov_b32_e32 v5, s6
; GFX8-NEXT: v_mov_b32_e32 v6, s3
; GFX8-NEXT: s_lshr_b32 s4, s1, 24
; GFX8-NEXT: s_lshr_b32 s3, s0, 16
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: s_mul_i32 s0, s0, s1
; GFX8-NEXT: s_lshr_b32 s5, s1, 16
; GFX8-NEXT: v_mul_u32_u24_sdwa v4, v4, v5 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
; GFX8-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NEXT: v_and_b32_e32 v3, s0, v3
; GFX8-NEXT: v_mov_b32_e32 v6, s4
; GFX8-NEXT: v_mov_b32_e32 v7, s2
; GFX8-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX8-NEXT: v_mul_u32_u24_e32 v5, s4, v5
; GFX8-NEXT: v_or_b32_e32 v3, v3, v4
; GFX8-NEXT: v_mul_u32_u24_e32 v5, s3, v5
; GFX8-NEXT: v_mul_u32_u24_sdwa v6, v7, v6 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX8-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX8-NEXT: v_or_b32_e32 v4, v3, v5
; GFX8-NEXT: v_or_b32_sdwa v4, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX8-NEXT: v_or_b32_e32 v4, v3, v4
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v4
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3

View File

@ -1637,68 +1637,60 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX8-NEXT: s_load_dword s7, s[6:7], 0x0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_ushort v2, v[0:1]
; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b16_e64 v4, 12, s2
; GFX8-NEXT: s_lshr_b32 s15, s2, 4
; GFX8-NEXT: s_lshr_b32 s16, s2, 8
; GFX8-NEXT: v_lshlrev_b16_e64 v12, 12, s16
; GFX8-NEXT: v_lshlrev_b16_e64 v3, 12, s0
; GFX8-NEXT: s_lshr_b32 s8, s0, 4
; GFX8-NEXT: s_lshr_b32 s9, s0, 8
; GFX8-NEXT: v_lshlrev_b16_e64 v5, 12, s9
; GFX8-NEXT: v_lshlrev_b16_e64 v6, 12, s8
; GFX8-NEXT: v_lshlrev_b16_e64 v13, 12, s15
; GFX8-NEXT: v_ashrrev_i16_e32 v3, 12, v3
; GFX8-NEXT: v_ashrrev_i16_e32 v4, 12, v4
; GFX8-NEXT: s_lshr_b32 s7, s0, 12
; GFX8-NEXT: s_lshr_b32 s14, s2, 12
; GFX8-NEXT: v_ashrrev_i16_e32 v6, 12, v6
; GFX8-NEXT: v_ashrrev_i16_e32 v5, 12, v5
; GFX8-NEXT: v_ashrrev_i16_e32 v12, 12, v12
; GFX8-NEXT: v_ashrrev_i16_e32 v13, 12, v13
; GFX8-NEXT: v_lshlrev_b16_e64 v7, 12, s7
; GFX8-NEXT: v_lshlrev_b16_e64 v14, 12, s14
; GFX8-NEXT: s_lshr_b32 s6, s0, 16
; GFX8-NEXT: s_lshr_b32 s13, s2, 16
; GFX8-NEXT: v_mul_u32_u24_e32 v5, v5, v12
; GFX8-NEXT: v_lshlrev_b16_e64 v8, 12, s6
; GFX8-NEXT: v_lshlrev_b16_e64 v15, 12, s13
; GFX8-NEXT: s_lshr_b32 s5, s0, 20
; GFX8-NEXT: s_lshr_b32 s12, s2, 20
; GFX8-NEXT: v_ashrrev_i16_e32 v7, 12, v7
; GFX8-NEXT: v_ashrrev_i16_e32 v14, 12, v14
; GFX8-NEXT: v_lshlrev_b16_e64 v9, 12, s5
; GFX8-NEXT: v_lshlrev_b16_e64 v16, 12, s12
; GFX8-NEXT: s_lshr_b32 s4, s0, 24
; GFX8-NEXT: s_lshr_b32 s11, s2, 24
; GFX8-NEXT: v_ashrrev_i16_e32 v8, 12, v8
; GFX8-NEXT: v_ashrrev_i16_e32 v15, 12, v15
; GFX8-NEXT: v_lshlrev_b16_e64 v10, 12, s4
; GFX8-NEXT: v_lshlrev_b16_e64 v17, 12, s11
; GFX8-NEXT: s_lshr_b32 s1, s0, 28
; GFX8-NEXT: s_lshr_b32 s10, s2, 28
; GFX8-NEXT: v_ashrrev_i16_e32 v9, 12, v9
; GFX8-NEXT: v_ashrrev_i16_e32 v16, 12, v16
; GFX8-NEXT: v_lshlrev_b16_e64 v11, 12, s1
; GFX8-NEXT: v_lshlrev_b16_e64 v18, 12, s10
; GFX8-NEXT: v_ashrrev_i16_e32 v10, 12, v10
; GFX8-NEXT: v_ashrrev_i16_e32 v17, 12, v17
; GFX8-NEXT: v_ashrrev_i16_e32 v11, 12, v11
; GFX8-NEXT: v_ashrrev_i16_e32 v18, 12, v18
; GFX8-NEXT: s_lshl_b32 s5, s7, 28
; GFX8-NEXT: s_ashr_i64 s[4:5], s[4:5], 60
; GFX8-NEXT: s_lshl_b32 s9, s7, 24
; GFX8-NEXT: s_lshl_b32 s11, s7, 20
; GFX8-NEXT: s_lshl_b32 s5, s1, 28
; GFX8-NEXT: s_ashr_i64 s[14:15], s[4:5], 60
; GFX8-NEXT: s_lshl_b32 s5, s1, 20
; GFX8-NEXT: s_lshl_b32 s13, s1, 24
; GFX8-NEXT: s_ashr_i64 s[8:9], s[8:9], 60
; GFX8-NEXT: s_ashr_i64 s[10:11], s[10:11], 60
; GFX8-NEXT: v_mov_b32_e32 v3, s4
; GFX8-NEXT: s_ashr_i64 s[4:5], s[4:5], 60
; GFX8-NEXT: v_mov_b32_e32 v4, s10
; GFX8-NEXT: s_ashr_i64 s[12:13], s[12:13], 60
; GFX8-NEXT: s_lshl_b32 s5, s7, 16
; GFX8-NEXT: v_mov_b32_e32 v5, s8
; GFX8-NEXT: s_lshl_b32 s9, s1, 16
; GFX8-NEXT: s_lshl_b32 s11, s7, 12
; GFX8-NEXT: s_ashr_i64 s[16:17], s[4:5], 60
; GFX8-NEXT: s_ashr_i64 s[8:9], s[8:9], 60
; GFX8-NEXT: v_mul_i32_i24_e32 v4, s4, v4
; GFX8-NEXT: s_lshl_b32 s5, s1, 12
; GFX8-NEXT: s_lshl_b32 s9, s7, 8
; GFX8-NEXT: s_ashr_i64 s[10:11], s[10:11], 60
; GFX8-NEXT: v_mov_b32_e32 v6, s16
; GFX8-NEXT: s_ashr_i64 s[20:21], s[8:9], 60
; GFX8-NEXT: s_lshl_b32 s13, s1, 8
; GFX8-NEXT: s_ashr_i64 s[18:19], s[4:5], 60
; GFX8-NEXT: s_lshl_b32 s5, s7, 4
; GFX8-NEXT: v_mov_b32_e32 v7, s10
; GFX8-NEXT: s_lshl_b32 s9, s1, 4
; GFX8-NEXT: s_ashr_i64 s[24:25], s[4:5], 60
; GFX8-NEXT: s_ashr_i64 s[22:23], s[12:13], 60
; GFX8-NEXT: v_mov_b32_e32 v8, s20
; GFX8-NEXT: s_ashr_i64 s[6:7], s[6:7], 60
; GFX8-NEXT: s_ashr_i64 s[26:27], s[8:9], 60
; GFX8-NEXT: v_mov_b32_e32 v9, s24
; GFX8-NEXT: s_ashr_i64 s[0:1], s[0:1], 60
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mad_u32_u24 v2, v3, v4, v2
; GFX8-NEXT: v_mad_u32_u24 v2, v6, v13, v2
; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX8-NEXT: v_mad_u32_u24 v2, v7, v14, v2
; GFX8-NEXT: v_mad_u32_u24 v2, v8, v15, v2
; GFX8-NEXT: v_mad_u32_u24 v2, v9, v16, v2
; GFX8-NEXT: v_mad_u32_u24 v2, v10, v17, v2
; GFX8-NEXT: v_mad_u32_u24 v2, v11, v18, v2
; GFX8-NEXT: v_mad_i32_i24 v2, s14, v3, v2
; GFX8-NEXT: v_mad_i32_i24 v2, s12, v5, v2
; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX8-NEXT: v_mad_i32_i24 v2, s8, v6, v2
; GFX8-NEXT: v_mad_i32_i24 v2, s18, v7, v2
; GFX8-NEXT: v_mad_i32_i24 v2, s22, v8, v2
; GFX8-NEXT: v_mad_i32_i24 v2, s26, v9, v2
; GFX8-NEXT: v_mov_b32_e32 v3, s6
; GFX8-NEXT: v_mad_i32_i24 v2, s0, v3, v2
; GFX8-NEXT: flat_store_short v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@ -2023,87 +2015,83 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT: s_mov_b32 s2, 0xffff
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_ubyte v2, v[0:1]
; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0
; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX8-NEXT: s_mov_b32 s0, 0xffff
; GFX8-NEXT: s_load_dword s5, s[6:7], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshr_b32 s8, s1, 4
; GFX8-NEXT: s_lshr_b32 s9, s1, 12
; GFX8-NEXT: s_lshr_b32 s10, s1, 8
; GFX8-NEXT: s_lshr_b32 s15, s2, 4
; GFX8-NEXT: s_lshr_b32 s16, s2, 12
; GFX8-NEXT: s_lshr_b32 s17, s2, 8
; GFX8-NEXT: v_lshlrev_b16_e64 v3, 12, s1
; GFX8-NEXT: v_lshlrev_b16_e64 v4, 12, s2
; GFX8-NEXT: v_lshlrev_b16_e64 v5, 12, s10
; GFX8-NEXT: v_lshlrev_b16_e64 v6, 12, s9
; GFX8-NEXT: v_lshlrev_b16_e64 v7, 12, s8
; GFX8-NEXT: v_lshlrev_b16_e64 v12, 12, s17
; GFX8-NEXT: v_lshlrev_b16_e64 v13, 12, s16
; GFX8-NEXT: v_lshlrev_b16_e64 v14, 12, s15
; GFX8-NEXT: s_lshr_b32 s4, s1, 20
; GFX8-NEXT: s_lshr_b32 s5, s1, 16
; GFX8-NEXT: s_lshr_b32 s6, s1, 28
; GFX8-NEXT: s_lshr_b32 s7, s1, 24
; GFX8-NEXT: s_lshr_b32 s11, s2, 20
; GFX8-NEXT: s_lshr_b32 s12, s2, 16
; GFX8-NEXT: s_lshr_b32 s13, s2, 28
; GFX8-NEXT: s_lshr_b32 s14, s2, 24
; GFX8-NEXT: v_lshlrev_b16_e64 v8, 12, s7
; GFX8-NEXT: v_lshlrev_b16_e64 v9, 12, s6
; GFX8-NEXT: v_lshlrev_b16_e64 v10, 12, s5
; GFX8-NEXT: v_lshlrev_b16_e64 v11, 12, s4
; GFX8-NEXT: v_lshlrev_b16_e64 v15, 12, s14
; GFX8-NEXT: v_lshlrev_b16_e64 v16, 12, s13
; GFX8-NEXT: v_lshlrev_b16_e64 v17, 12, s12
; GFX8-NEXT: v_lshlrev_b16_e64 v18, 12, s11
; GFX8-NEXT: v_ashrrev_i16_e32 v3, 12, v3
; GFX8-NEXT: v_ashrrev_i16_e32 v4, 12, v4
; GFX8-NEXT: v_ashrrev_i16_e32 v5, 12, v5
; GFX8-NEXT: v_ashrrev_i16_e32 v6, 12, v6
; GFX8-NEXT: v_ashrrev_i16_e32 v7, 12, v7
; GFX8-NEXT: v_ashrrev_i16_e32 v12, 12, v12
; GFX8-NEXT: v_ashrrev_i16_e32 v13, 12, v13
; GFX8-NEXT: v_ashrrev_i16_e32 v14, 12, v14
; GFX8-NEXT: v_mul_u32_u24_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX8-NEXT: v_mul_u32_u24_sdwa v4, v5, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX8-NEXT: v_mul_u32_u24_sdwa v5, v6, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX8-NEXT: v_mul_u32_u24_sdwa v6, v7, v14 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX8-NEXT: v_ashrrev_i16_e32 v8, 12, v8
; GFX8-NEXT: v_ashrrev_i16_e32 v15, 12, v15
; GFX8-NEXT: v_ashrrev_i16_e32 v9, 12, v9
; GFX8-NEXT: v_ashrrev_i16_e32 v10, 12, v10
; GFX8-NEXT: v_ashrrev_i16_e32 v11, 12, v11
; GFX8-NEXT: v_ashrrev_i16_e32 v16, 12, v16
; GFX8-NEXT: v_ashrrev_i16_e32 v17, 12, v17
; GFX8-NEXT: v_ashrrev_i16_e32 v18, 12, v18
; GFX8-NEXT: v_or_b32_sdwa v3, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX8-NEXT: v_mul_u32_u24_sdwa v7, v8, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX8-NEXT: v_mul_u32_u24_sdwa v8, v9, v16 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX8-NEXT: v_mul_u32_u24_sdwa v9, v10, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX8-NEXT: v_mul_u32_u24_sdwa v10, v11, v18 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX8-NEXT: v_and_b32_e32 v3, s0, v3
; GFX8-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX8-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX8-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX8-NEXT: v_and_b32_e32 v5, s0, v9
; GFX8-NEXT: v_or_b32_e32 v4, v3, v4
; GFX8-NEXT: v_or_b32_e32 v6, v5, v7
; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v4
; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v6
; GFX8-NEXT: s_lshl_b32 s13, s1, 24
; GFX8-NEXT: s_lshl_b32 s17, s1, 16
; GFX8-NEXT: s_ashr_i64 s[22:23], s[4:5], 60
; GFX8-NEXT: s_lshl_b32 s25, s5, 24
; GFX8-NEXT: s_lshl_b32 s27, s5, 28
; GFX8-NEXT: s_lshl_b32 s29, s5, 16
; GFX8-NEXT: s_ashr_i64 s[10:11], s[0:1], 60
; GFX8-NEXT: s_lshl_b32 s15, s1, 28
; GFX8-NEXT: s_lshl_b32 s19, s5, 8
; GFX8-NEXT: s_lshl_b32 s21, s5, 12
; GFX8-NEXT: s_lshl_b32 s23, s5, 4
; GFX8-NEXT: s_lshl_b32 s5, s5, 20
; GFX8-NEXT: s_ashr_i64 s[12:13], s[12:13], 60
; GFX8-NEXT: s_ashr_i64 s[16:17], s[16:17], 60
; GFX8-NEXT: s_ashr_i64 s[24:25], s[24:25], 60
; GFX8-NEXT: s_ashr_i64 s[26:27], s[26:27], 60
; GFX8-NEXT: s_ashr_i64 s[28:29], s[28:29], 60
; GFX8-NEXT: s_lshl_b32 s7, s1, 8
; GFX8-NEXT: s_lshl_b32 s9, s1, 12
; GFX8-NEXT: s_lshl_b32 s11, s1, 4
; GFX8-NEXT: s_lshl_b32 s1, s1, 20
; GFX8-NEXT: s_ashr_i64 s[4:5], s[4:5], 60
; GFX8-NEXT: s_ashr_i64 s[14:15], s[14:15], 60
; GFX8-NEXT: v_mov_b32_e32 v6, s28
; GFX8-NEXT: v_mov_b32_e32 v7, s16
; GFX8-NEXT: v_mov_b32_e32 v8, s26
; GFX8-NEXT: v_mov_b32_e32 v9, s24
; GFX8-NEXT: v_mov_b32_e32 v10, s12
; GFX8-NEXT: v_mul_i32_i24_sdwa v6, v7, v6 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-NEXT: v_mul_i32_i24_e32 v7, s14, v8
; GFX8-NEXT: v_mul_i32_i24_sdwa v8, v10, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-NEXT: s_ashr_i64 s[0:1], s[0:1], 60
; GFX8-NEXT: v_mov_b32_e32 v5, s4
; GFX8-NEXT: v_mul_i32_i24_e32 v5, s0, v5
; GFX8-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX8-NEXT: s_ashr_i64 s[6:7], s[6:7], 60
; GFX8-NEXT: s_ashr_i64 s[18:19], s[18:19], 60
; GFX8-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX8-NEXT: v_and_b32_e32 v6, s2, v7
; GFX8-NEXT: s_ashr_i64 s[20:21], s[20:21], 60
; GFX8-NEXT: v_mov_b32_e32 v3, s22
; GFX8-NEXT: v_mov_b32_e32 v4, s10
; GFX8-NEXT: s_ashr_i64 s[32:33], s[22:23], 60
; GFX8-NEXT: v_mul_i32_i24_sdwa v3, v4, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-NEXT: v_or_b32_e32 v5, v6, v5
; GFX8-NEXT: s_ashr_i64 s[8:9], s[8:9], 60
; GFX8-NEXT: v_mov_b32_e32 v4, s20
; GFX8-NEXT: v_mov_b32_e32 v12, s18
; GFX8-NEXT: v_mov_b32_e32 v13, s6
; GFX8-NEXT: s_ashr_i64 s[30:31], s[10:11], 60
; GFX8-NEXT: v_mov_b32_e32 v11, s32
; GFX8-NEXT: v_mul_i32_i24_e32 v4, s8, v4
; GFX8-NEXT: v_mul_i32_i24_sdwa v10, v13, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v5
; GFX8-NEXT: v_or_b32_sdwa v4, v4, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX8-NEXT: v_mul_i32_i24_e32 v9, s30, v11
; GFX8-NEXT: v_or_b32_sdwa v3, v9, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX8-NEXT: v_and_b32_e32 v4, s2, v4
; GFX8-NEXT: v_or_b32_e32 v3, v4, v3
; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v3
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v7, v2
; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0
; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5
; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0
; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v8, v2
; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
; GFX8-NEXT: flat_store_byte v[0:1], v2
; GFX8-NEXT: s_endpgm
;

View File

@ -52,43 +52,49 @@ define amdgpu_kernel void @local_store_i55(i55 addrspace(3)* %ptr, i55 %arg) #0
;
; FIJI-LABEL: local_store_i55:
; FIJI: ; %bb.0:
; FIJI-NEXT: s_add_u32 s0, s4, 14
; FIJI-NEXT: s_addc_u32 s1, s5, 0
; FIJI-NEXT: v_mov_b32_e32 v0, s0
; FIJI-NEXT: v_mov_b32_e32 v1, s1
; FIJI-NEXT: flat_load_ubyte v0, v[0:1]
; FIJI-NEXT: s_load_dword s0, s[4:5], 0x0
; FIJI-NEXT: s_load_dword s1, s[4:5], 0x8
; FIJI-NEXT: s_load_dword s2, s[4:5], 0xc
; FIJI-NEXT: s_mov_b32 m0, -1
; FIJI-NEXT: s_waitcnt lgkmcnt(0)
; FIJI-NEXT: v_mov_b32_e32 v1, s0
; FIJI-NEXT: v_mov_b32_e32 v2, s0
; FIJI-NEXT: v_mov_b32_e32 v3, s1
; FIJI-NEXT: v_mov_b32_e32 v2, s2
; FIJI-NEXT: ds_write_b16 v1, v2 offset:4
; FIJI-NEXT: s_waitcnt vmcnt(0)
; FIJI-NEXT: v_and_b32_e32 v0, 0x7f, v0
; FIJI-NEXT: ds_write_b8 v1, v0 offset:6
; FIJI-NEXT: ds_write_b32 v1, v3
; FIJI-NEXT: s_and_b32 s3, s2, 0xffff
; FIJI-NEXT: s_add_u32 s0, s4, 14
; FIJI-NEXT: s_addc_u32 s1, s5, 0
; FIJI-NEXT: v_mov_b32_e32 v0, s0
; FIJI-NEXT: v_mov_b32_e32 v1, s1
; FIJI-NEXT: flat_load_ubyte v0, v[0:1]
; FIJI-NEXT: v_mov_b32_e32 v1, s2
; FIJI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; FIJI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; FIJI-NEXT: v_or_b32_e32 v0, s3, v0
; FIJI-NEXT: v_bfe_u32 v0, v0, 16, 7
; FIJI-NEXT: ds_write_b16 v2, v1 offset:4
; FIJI-NEXT: ds_write_b8 v2, v0 offset:6
; FIJI-NEXT: ds_write_b32 v2, v3
; FIJI-NEXT: s_endpgm
;
; GFX9-LABEL: local_store_i55:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: global_load_ubyte_d16_hi v0, v[0:1], off offset:14
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: global_load_ubyte_d16_hi v2, v[0:1], off offset:14
; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX9-NEXT: s_load_dword s1, s[4:5], 0x8
; GFX9-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: ds_write_b16 v1, v2 offset:4
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: s_and_b32 s3, s2, 0xffff
; GFX9-NEXT: ds_write_b16 v0, v1 offset:4
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_bfe_u32 v0, v0, 16, 7
; GFX9-NEXT: ds_write_b8 v1, v0 offset:6
; GFX9-NEXT: ds_write_b32 v1, v3
; GFX9-NEXT: v_or_b32_e32 v1, s3, v2
; GFX9-NEXT: v_and_b32_e32 v1, 0x7fffff, v1
; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:6
; GFX9-NEXT: ds_write_b32 v0, v3
; GFX9-NEXT: s_endpgm
store i55 %arg, i55 addrspace(3)* %ptr, align 8
ret void

View File

@ -21,8 +21,7 @@ define i64 @combine_psadbw_demandedelt(<16 x i8> %0, <16 x i8> %1) {
; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,3,2]
; X86-NEXT: psadbw %xmm0, %xmm1
; X86-NEXT: movd %xmm1, %eax
; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X86-NEXT: movd %xmm0, %edx
; X86-NEXT: xorl %edx, %edx
; X86-NEXT: retl
;
; X64-LABEL: combine_psadbw_demandedelt:

View File

@ -1576,10 +1576,8 @@ define i8 @test_v4i8(<4 x i8> %a0) {
; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pmullw %xmm1, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $8, %xmm1
; SSE2-NEXT: psrld $16, %xmm1
; SSE2-NEXT: pmullw %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: # kill: def $al killed $al killed $eax
@ -1591,7 +1589,7 @@ define i8 @test_v4i8(<4 x i8> %a0) {
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE41-NEXT: pmullw %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2],zero,xmm0[6],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero
; SSE41-NEXT: psrld $16, %xmm0
; SSE41-NEXT: pmullw %xmm1, %xmm0
; SSE41-NEXT: pextrb $0, %xmm0, %eax
; SSE41-NEXT: # kill: def $al killed $al killed $eax
@ -1602,7 +1600,7 @@ define i8 @test_v4i8(<4 x i8> %a0) {
; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2],zero,xmm0[6],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero
; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpextrb $0, %xmm0, %eax
; AVX-NEXT: # kill: def $al killed $al killed $eax
@ -1613,7 +1611,7 @@ define i8 @test_v4i8(<4 x i8> %a0) {
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2],zero,xmm0[6],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero
; AVX512BW-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax
; AVX512BW-NEXT: # kill: def $al killed $al killed $eax
@ -1625,7 +1623,7 @@ define i8 @test_v4i8(<4 x i8> %a0) {
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
; AVX512VL-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512VL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2],zero,xmm0[6],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero
; AVX512VL-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX512VL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: vpextrb $0, %xmm0, %eax
; AVX512VL-NEXT: # kill: def $al killed $al killed $eax
@ -1636,7 +1634,7 @@ define i8 @test_v4i8(<4 x i8> %a0) {
; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2],zero,xmm0[6],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero
; AVX512DQ-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX512DQ-NEXT: vpextrb $0, %xmm0, %eax
; AVX512DQ-NEXT: # kill: def $al killed $al killed $eax
@ -1654,10 +1652,8 @@ define i8 @test_v8i8(<8 x i8> %a0) {
; SSE2-NEXT: pmullw %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,2,3,0]
; SSE2-NEXT: pmullw %xmm1, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $8, %xmm1
; SSE2-NEXT: psrld $16, %xmm1
; SSE2-NEXT: pmullw %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: # kill: def $al killed $al killed $eax
@ -1673,7 +1669,7 @@ define i8 @test_v8i8(<8 x i8> %a0) {
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; SSE41-NEXT: pmullw %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2],zero,xmm0[6],zero,xmm0[10],zero,xmm0[14],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero
; SSE41-NEXT: psrld $16, %xmm0
; SSE41-NEXT: pmullw %xmm1, %xmm0
; SSE41-NEXT: pextrb $0, %xmm0, %eax
; SSE41-NEXT: # kill: def $al killed $al killed $eax
@ -1686,7 +1682,7 @@ define i8 @test_v8i8(<8 x i8> %a0) {
; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2],zero,xmm0[6],zero,xmm0[10],zero,xmm0[14],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero
; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpextrb $0, %xmm0, %eax
; AVX-NEXT: # kill: def $al killed $al killed $eax
@ -1699,7 +1695,7 @@ define i8 @test_v8i8(<8 x i8> %a0) {
; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2],zero,xmm0[6],zero,xmm0[10],zero,xmm0[14],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero
; AVX512BW-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax
; AVX512BW-NEXT: # kill: def $al killed $al killed $eax
@ -1713,7 +1709,7 @@ define i8 @test_v8i8(<8 x i8> %a0) {
; AVX512VL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: vpalignr {{.*#+}} xmm1 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3]
; AVX512VL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2],zero,xmm0[6],zero,xmm0[10],zero,xmm0[14],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero
; AVX512VL-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX512VL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: vpextrb $0, %xmm0, %eax
; AVX512VL-NEXT: # kill: def $al killed $al killed $eax
@ -1726,7 +1722,7 @@ define i8 @test_v8i8(<8 x i8> %a0) {
; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2],zero,xmm0[6],zero,xmm0[10],zero,xmm0[14],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero
; AVX512DQ-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX512DQ-NEXT: vpextrb $0, %xmm0, %eax
; AVX512DQ-NEXT: # kill: def $al killed $al killed $eax
@ -1756,8 +1752,9 @@ define i8 @test_v16i8(<16 x i8> %a0) {
; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pmullw %xmm2, %xmm0
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: packuswb %xmm3, %xmm1
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: packuswb %xmm3, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $8, %xmm1
; SSE2-NEXT: pmullw %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
@ -1799,10 +1796,10 @@ define i8 @test_v16i8(<16 x i8> %a0) {
; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpextrb $0, %xmm0, %eax
; AVX1-NEXT: # kill: def $al killed $al killed $eax
@ -1947,8 +1944,9 @@ define i8 @test_v32i8(<32 x i8> %a0) {
; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pmullw %xmm2, %xmm0
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: packuswb %xmm3, %xmm1
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: packuswb %xmm3, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $8, %xmm1
; SSE2-NEXT: pmullw %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
@ -2037,10 +2035,9 @@ define i8 @test_v32i8(<32 x i8> %a0) {
; AVX2-NEXT: vpsrld $16, %xmm1, %xmm1
; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpextrb $0, %xmm0, %eax
; AVX2-NEXT: # kill: def $al killed $al killed $eax
@ -2126,10 +2123,9 @@ define i8 @test_v32i8(<32 x i8> %a0) {
; AVX512DQ-NEXT: vpsrld $16, %xmm1, %xmm1
; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX512DQ-NEXT: vpand %xmm3, %xmm0, %xmm1
; AVX512DQ-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX512DQ-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX512DQ-NEXT: vpand %xmm3, %xmm0, %xmm0
; AVX512DQ-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX512DQ-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX512DQ-NEXT: vpextrb $0, %xmm0, %eax
; AVX512DQ-NEXT: # kill: def $al killed $al killed $eax
@ -2159,10 +2155,9 @@ define i8 @test_v32i8(<32 x i8> %a0) {
; AVX512DQVL-NEXT: vpsrld $16, %xmm1, %xmm1
; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX512DQVL-NEXT: vpand %xmm3, %xmm0, %xmm1
; AVX512DQVL-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX512DQVL-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX512DQVL-NEXT: vpand %xmm3, %xmm0, %xmm0
; AVX512DQVL-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX512DQVL-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX512DQVL-NEXT: vpextrb $0, %xmm0, %eax
; AVX512DQVL-NEXT: # kill: def $al killed $al killed $eax
@ -2208,8 +2203,9 @@ define i8 @test_v64i8(<64 x i8> %a0) {
; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pmullw %xmm2, %xmm0
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: packuswb %xmm3, %xmm1
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: packuswb %xmm3, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $8, %xmm1
; SSE2-NEXT: pmullw %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
@ -2323,10 +2319,9 @@ define i8 @test_v64i8(<64 x i8> %a0) {
; AVX2-NEXT: vpsrld $16, %xmm2, %xmm2
; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; AVX2-NEXT: vpmullw %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
; AVX2-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpextrb $0, %xmm0, %eax
; AVX2-NEXT: # kill: def $al killed $al killed $eax
@ -2365,10 +2360,9 @@ define i8 @test_v64i8(<64 x i8> %a0) {
; AVX512BW-NEXT: vpsrld $16, %xmm1, %xmm1
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vpand %xmm3, %xmm0, %xmm1
; AVX512BW-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX512BW-NEXT: vpand %xmm3, %xmm0, %xmm0
; AVX512BW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax
; AVX512BW-NEXT: # kill: def $al killed $al killed $eax
@ -2407,10 +2401,9 @@ define i8 @test_v64i8(<64 x i8> %a0) {
; AVX512BWVL-NEXT: vpsrld $16, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpand %xmm3, %xmm0, %xmm1
; AVX512BWVL-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX512BWVL-NEXT: vpand %xmm3, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpextrb $0, %xmm0, %eax
; AVX512BWVL-NEXT: # kill: def $al killed $al killed $eax
@ -2444,10 +2437,9 @@ define i8 @test_v64i8(<64 x i8> %a0) {
; AVX512DQ-NEXT: vpsrld $16, %xmm2, %xmm2
; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; AVX512DQ-NEXT: vpmullw %xmm2, %xmm0, %xmm0
; AVX512DQ-NEXT: vpand %xmm1, %xmm0, %xmm1
; AVX512DQ-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
; AVX512DQ-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX512DQ-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512DQ-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
; AVX512DQ-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX512DQ-NEXT: vpextrb $0, %xmm0, %eax
; AVX512DQ-NEXT: # kill: def $al killed $al killed $eax
@ -2486,10 +2478,9 @@ define i8 @test_v64i8(<64 x i8> %a0) {
; AVX512DQVL-NEXT: vpsrld $16, %xmm1, %xmm1
; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX512DQVL-NEXT: vpand %xmm3, %xmm0, %xmm1
; AVX512DQVL-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX512DQVL-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX512DQVL-NEXT: vpand %xmm3, %xmm0, %xmm0
; AVX512DQVL-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX512DQVL-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX512DQVL-NEXT: vpextrb $0, %xmm0, %eax
; AVX512DQVL-NEXT: # kill: def $al killed $al killed $eax
@ -2555,8 +2546,9 @@ define i8 @test_v128i8(<128 x i8> %a0) {
; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT: pmullw %xmm2, %xmm1
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: packuswb %xmm3, %xmm0
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: packuswb %xmm3, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: psrlw $8, %xmm0
; SSE2-NEXT: pmullw %xmm1, %xmm0
; SSE2-NEXT: movd %xmm0, %eax
@ -2729,10 +2721,9 @@ define i8 @test_v128i8(<128 x i8> %a0) {
; AVX2-NEXT: vpsrld $16, %xmm1, %xmm1
; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
; AVX2-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpextrb $0, %xmm0, %eax
; AVX2-NEXT: # kill: def $al killed $al killed $eax
@ -2779,10 +2770,9 @@ define i8 @test_v128i8(<128 x i8> %a0) {
; AVX512BW-NEXT: vpsrld $16, %xmm1, %xmm1
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vpand %xmm3, %xmm0, %xmm1
; AVX512BW-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX512BW-NEXT: vpand %xmm3, %xmm0, %xmm0
; AVX512BW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax
; AVX512BW-NEXT: # kill: def $al killed $al killed $eax
@ -2829,10 +2819,9 @@ define i8 @test_v128i8(<128 x i8> %a0) {
; AVX512BWVL-NEXT: vpsrld $16, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpand %xmm3, %xmm0, %xmm1
; AVX512BWVL-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX512BWVL-NEXT: vpand %xmm3, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpextrb $0, %xmm0, %eax
; AVX512BWVL-NEXT: # kill: def $al killed $al killed $eax
@ -2873,10 +2862,9 @@ define i8 @test_v128i8(<128 x i8> %a0) {
; AVX512DQ-NEXT: vpsrld $16, %xmm1, %xmm1
; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX512DQ-NEXT: vpand %xmm2, %xmm0, %xmm1
; AVX512DQ-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
; AVX512DQ-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX512DQ-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX512DQ-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
; AVX512DQ-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX512DQ-NEXT: vpextrb $0, %xmm0, %eax
; AVX512DQ-NEXT: # kill: def $al killed $al killed $eax
@ -2922,10 +2910,9 @@ define i8 @test_v128i8(<128 x i8> %a0) {
; AVX512DQVL-NEXT: vpsrld $16, %xmm1, %xmm1
; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX512DQVL-NEXT: vpand %xmm2, %xmm0, %xmm1
; AVX512DQVL-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
; AVX512DQVL-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX512DQVL-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX512DQVL-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
; AVX512DQVL-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX512DQVL-NEXT: vpextrb $0, %xmm0, %eax
; AVX512DQVL-NEXT: # kill: def $al killed $al killed $eax