Mirror of https://github.com/RPCS3/llvm-mirror.git, synced 2024-12-02 16:36:40 +00:00.
[DAGCombine] visitEXTRACT_VECTOR_ELT - add SimplifyDemandedBits multi use support
Similar to what we already do with SimplifyDemandedVectorElts, call SimplifyDemandedBits across all the extracted elements of the source vector, treating the vector as if it had a single use. There is a minor regression in store-weird-sizes.ll, which will be addressed in an upcoming SimplifyDemandedBits patch. A standalone sketch of the demanded-bits idea appears below, before the diff.
This commit is contained in:
parent e1296518ec
commit ac3ec9a0c4
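The idea, in miniature: an extract_vector_elt only demands the bits of one lane of its source vector, so the combiner is free to simplify whatever feeds the remaining lanes. Below is a minimal standalone C++ sketch of that demanded-bits reasoning; it is toy code with an illustrative lane layout and helper name, not the LLVM API.

#include <cassert>
#include <cstdint>

// Toy model: a <4 x i16> "vector" packed into a uint64_t. Extracting lane N
// demands only bits [16*N, 16*N+15]; everything else is dead for this use.
static uint64_t demandedBitsForExtract(unsigned Lane, unsigned EltBits) {
  return ((1ULL << EltBits) - 1) << (Lane * EltBits);
}

int main() {
  uint64_t Vec = 0x1111222233334444ULL;          // lanes 3..0
  uint64_t Mask = demandedBitsForExtract(2, 16); // bits 32..47
  // A combiner may rewrite the producer of Vec however it likes,
  // provided these demanded bits keep their value.
  assert(((Vec & Mask) >> 32) == 0x2222);
  return 0;
}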
@@ -323,7 +323,8 @@ namespace {
     }

     bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits,
-                              const APInt &DemandedElts);
+                              const APInt &DemandedElts,
+                              bool AssumeSingleUse = false);
     bool SimplifyDemandedVectorElts(SDValue Op, const APInt &DemandedElts,
                                     bool AssumeSingleUse = false);

@@ -1058,10 +1059,12 @@ CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &TLO) {
 /// Check the specified integer node value to see if it can be simplified or if
 /// things it uses can be simplified by bit propagation. If so, return true.
 bool DAGCombiner::SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits,
-                                       const APInt &DemandedElts) {
+                                       const APInt &DemandedElts,
+                                       bool AssumeSingleUse) {
   TargetLowering::TargetLoweringOpt TLO(DAG, LegalTypes, LegalOperations);
   KnownBits Known;
-  if (!TLI.SimplifyDemandedBits(Op, DemandedBits, DemandedElts, Known, TLO))
+  if (!TLI.SimplifyDemandedBits(Op, DemandedBits, DemandedElts, Known, TLO, 0,
+                                AssumeSingleUse))
     return false;

   // Revisit the node.
@@ -17210,6 +17213,7 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
   // extract_vector_elt of out-of-bounds element -> UNDEF
   auto *IndexC = dyn_cast<ConstantSDNode>(Index);
   unsigned NumElts = VecVT.getVectorNumElements();
+  unsigned VecEltBitWidth = VecVT.getScalarSizeInBits();
   if (IndexC && IndexC->getAPIntValue().uge(NumElts))
     return DAG.getUNDEF(ScalarVT);

@@ -17251,7 +17255,6 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
            "Extract element and scalar to vector can't change element type "
            "from FP to integer.");
     unsigned XBitWidth = X.getValueSizeInBits();
-    unsigned VecEltBitWidth = VecVT.getScalarSizeInBits();
     BCTruncElt = IsLE ? 0 : XBitWidth / VecEltBitWidth - 1;

     // An extract element return value type can be wider than its vector
@@ -17334,6 +17337,14 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
         AddToWorklist(N);
       return SDValue(N, 0);
     }
+    APInt DemandedBits = APInt::getAllOnesValue(VecEltBitWidth);
+    if (SimplifyDemandedBits(VecOp, DemandedBits, DemandedElts, true)) {
+      // We simplified the vector operand of this extract element. If this
+      // extract is not dead, visit it again so it is folded properly.
+      if (N->getOpcode() != ISD::DELETED_NODE)
+        AddToWorklist(N);
+      return SDValue(N, 0);
+    }
   }

   // Everything under here is trying to match an extract of a loaded value.
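As a usage note on the hunk above, here is a hedged sketch of how such demanded masks are typically constructed before being narrowed to one lane; it assumes the APInt API of this LLVM era, and buildDemandedMasks is a hypothetical helper, not part of the patch.

#include "llvm/ADT/APInt.h"

// Hypothetical helper mirroring the pattern above: start from fully-demanded
// masks, which callers then narrow (e.g. to the one extracted lane).
static void buildDemandedMasks(unsigned NumElts, unsigned EltBits,
                               llvm::APInt &DemandedElts,
                               llvm::APInt &DemandedBits) {
  DemandedElts = llvm::APInt::getAllOnesValue(NumElts); // every lane demanded
  DemandedBits = llvm::APInt::getAllOnesValue(EltBits); // every bit of a lane
}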
@@ -102,8 +102,6 @@ define i8 @test_v9i8(<9 x i8> %a) nounwind {
; CHECK-NEXT: mov v0.b[11], w8
; CHECK-NEXT: mov v0.b[12], w8
; CHECK-NEXT: mov v0.b[13], w8
; CHECK-NEXT: mov v0.b[14], w8
; CHECK-NEXT: mov v0.b[15], w8
; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
; CHECK-NEXT: and v1.8b, v0.8b, v1.8b
; CHECK-NEXT: umov w8, v1.b[1]
@@ -113,7 +111,7 @@ define i8 @test_v9i8(<9 x i8> %a) nounwind {
 ; CHECK-NEXT: and w8, w8, w9
 ; CHECK-NEXT: umov w9, v1.b[3]
 ; CHECK-NEXT: and w8, w8, w9
-; CHECK-NEXT: umov w9, v1.b[4]
+; CHECK-NEXT: umov w9, v0.b[4]
 ; CHECK-NEXT: and w8, w8, w9
 ; CHECK-NEXT: umov w9, v1.b[5]
 ; CHECK-NEXT: and w8, w8, w9
@@ -69,7 +69,7 @@ define amdgpu_kernel void @bitcast_int_to_fpvector_extract_0(float addrspace(1)*
 ; GCN-LABEL: bitcast_int_to_fpvector_extract_0:
 ; GCN: ; %bb.0:
 ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GCN-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0xd
+; GCN-NEXT: s_load_dword s12, s[0:1], 0xd
 ; GCN-NEXT: s_mov_b32 s3, 0xf000
 ; GCN-NEXT: s_mov_b32 s10, 0
 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0
@@ -77,7 +77,7 @@ define amdgpu_kernel void @bitcast_int_to_fpvector_extract_0(float addrspace(1)*
 ; GCN-NEXT: s_mov_b32 s11, s3
 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
 ; GCN-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GCN-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[8:11], 0 addr64
+; GCN-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
 ; GCN-NEXT: s_mov_b32 s2, -1
 ; GCN-NEXT: s_mov_b32 s0, s4
 ; GCN-NEXT: s_mov_b32 s1, s5
@@ -927,35 +927,31 @@ define amdgpu_kernel void @idot4_acc16_vecMul(<4 x i8> addrspace(1)* %src1,
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT: s_mov_b32 s2, 0xffff
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_ushort v2, v[0:1]
; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_bfe_i32 s6, s3, 0x80000
; GFX8-NEXT: s_lshr_b32 s4, s3, 16
; GFX8-NEXT: v_ashrrev_i16_e64 v3, 8, s3
; GFX8-NEXT: s_bfe_i32 s3, s4, 0x80000
; GFX8-NEXT: s_lshr_b32 s1, s0, 16
; GFX8-NEXT: s_bfe_i32 s5, s0, 0x80000
; GFX8-NEXT: v_ashrrev_i16_e64 v4, 8, s0
; GFX8-NEXT: s_bfe_i32 s0, s1, 0x80000
; GFX8-NEXT: v_ashrrev_i16_e64 v6, 8, s1
; GFX8-NEXT: s_and_b32 s1, s2, s6
; GFX8-NEXT: v_ashrrev_i16_e64 v5, 8, s4
; GFX8-NEXT: s_and_b32 s4, s2, s5
; GFX8-NEXT: v_mov_b32_e32 v7, s1
; GFX8-NEXT: s_and_b32 s3, s2, s3
; GFX8-NEXT: s_and_b32 s0, s2, s0
; GFX8-NEXT: s_sext_i32_i8 s1, s2
; GFX8-NEXT: v_lshrrev_b16_e64 v3, 8, s2
; GFX8-NEXT: v_mov_b32_e32 v4, s1
; GFX8-NEXT: s_bfe_i32 s3, s2, 0x80010
; GFX8-NEXT: v_lshrrev_b16_e64 v5, 8, s0
; GFX8-NEXT: s_sext_i32_i8 s1, s0
; GFX8-NEXT: v_bfe_i32 v5, v5, 0, 8
; GFX8-NEXT: v_bfe_i32 v3, v3, 0, 8
; GFX8-NEXT: s_bfe_i32 s4, s0, 0x80010
; GFX8-NEXT: s_ashr_i32 s2, s2, 24
; GFX8-NEXT: v_mov_b32_e32 v6, s3
; GFX8-NEXT: s_ashr_i32 s0, s0, 24
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mad_u32_u24 v2, s4, v7, v2
; GFX8-NEXT: v_mad_u32_u24 v2, v4, v3, v2
; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2
; GFX8-NEXT: v_mad_u32_u24 v2, v6, v5, v2
; GFX8-NEXT: v_mad_i32_i24 v2, s1, v4, v2
; GFX8-NEXT: v_mad_i32_i24 v2, v5, v3, v2
; GFX8-NEXT: v_mad_i32_i24 v2, s4, v6, v2
; GFX8-NEXT: v_mov_b32_e32 v3, s2
; GFX8-NEXT: v_mad_i32_i24 v2, s0, v3, v2
; GFX8-NEXT: flat_store_short v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -2043,34 +2043,33 @@ define amdgpu_kernel void @udot4_acc8_vecMul(<4 x i8> addrspace(1)* %src1,
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT: s_movk_i32 s8, 0xff
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_ubyte v2, v[0:1]
; GFX8-NEXT: s_movk_i32 s0, 0xff
; GFX8-NEXT: v_mov_b32_e32 v3, s0
; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v3, s0
; GFX8-NEXT: v_mov_b32_e32 v4, s1
; GFX8-NEXT: s_and_b32 s7, s1, s8
; GFX8-NEXT: s_lshr_b32 s2, s0, 24
; GFX8-NEXT: s_lshr_b32 s3, s1, 24
; GFX8-NEXT: s_bfe_u32 s6, s1, 0x80010
; GFX8-NEXT: v_mul_u32_u24_sdwa v3, v3, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
; GFX8-NEXT: s_and_b32 s5, s0, s8
; GFX8-NEXT: v_mov_b32_e32 v4, s7
; GFX8-NEXT: v_mul_u32_u24_e32 v4, s5, v4
; GFX8-NEXT: s_bfe_u32 s4, s0, 0x80010
; GFX8-NEXT: v_mov_b32_e32 v5, s6
; GFX8-NEXT: v_mov_b32_e32 v6, s3
; GFX8-NEXT: s_lshr_b32 s4, s1, 24
; GFX8-NEXT: s_lshr_b32 s3, s0, 16
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: s_mul_i32 s0, s0, s1
; GFX8-NEXT: s_lshr_b32 s5, s1, 16
; GFX8-NEXT: v_mul_u32_u24_sdwa v4, v4, v5 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
; GFX8-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NEXT: v_and_b32_e32 v3, s0, v3
; GFX8-NEXT: v_mov_b32_e32 v6, s4
; GFX8-NEXT: v_mov_b32_e32 v7, s2
; GFX8-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX8-NEXT: v_mul_u32_u24_e32 v5, s4, v5
; GFX8-NEXT: v_or_b32_e32 v3, v3, v4
; GFX8-NEXT: v_mul_u32_u24_e32 v5, s3, v5
; GFX8-NEXT: v_mul_u32_u24_sdwa v6, v7, v6 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX8-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX8-NEXT: v_or_b32_e32 v4, v3, v5
; GFX8-NEXT: v_or_b32_sdwa v4, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX8-NEXT: v_or_b32_e32 v4, v3, v4
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v4
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3
@@ -1637,68 +1637,60 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX8-NEXT: s_load_dword s7, s[6:7], 0x0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_ushort v2, v[0:1]
; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b16_e64 v4, 12, s2
; GFX8-NEXT: s_lshr_b32 s15, s2, 4
; GFX8-NEXT: s_lshr_b32 s16, s2, 8
; GFX8-NEXT: v_lshlrev_b16_e64 v12, 12, s16
; GFX8-NEXT: v_lshlrev_b16_e64 v3, 12, s0
; GFX8-NEXT: s_lshr_b32 s8, s0, 4
; GFX8-NEXT: s_lshr_b32 s9, s0, 8
; GFX8-NEXT: v_lshlrev_b16_e64 v5, 12, s9
; GFX8-NEXT: v_lshlrev_b16_e64 v6, 12, s8
; GFX8-NEXT: v_lshlrev_b16_e64 v13, 12, s15
; GFX8-NEXT: v_ashrrev_i16_e32 v3, 12, v3
; GFX8-NEXT: v_ashrrev_i16_e32 v4, 12, v4
; GFX8-NEXT: s_lshr_b32 s7, s0, 12
; GFX8-NEXT: s_lshr_b32 s14, s2, 12
; GFX8-NEXT: v_ashrrev_i16_e32 v6, 12, v6
; GFX8-NEXT: v_ashrrev_i16_e32 v5, 12, v5
; GFX8-NEXT: v_ashrrev_i16_e32 v12, 12, v12
; GFX8-NEXT: v_ashrrev_i16_e32 v13, 12, v13
; GFX8-NEXT: v_lshlrev_b16_e64 v7, 12, s7
; GFX8-NEXT: v_lshlrev_b16_e64 v14, 12, s14
; GFX8-NEXT: s_lshr_b32 s6, s0, 16
; GFX8-NEXT: s_lshr_b32 s13, s2, 16
; GFX8-NEXT: v_mul_u32_u24_e32 v5, v5, v12
; GFX8-NEXT: v_lshlrev_b16_e64 v8, 12, s6
; GFX8-NEXT: v_lshlrev_b16_e64 v15, 12, s13
; GFX8-NEXT: s_lshr_b32 s5, s0, 20
; GFX8-NEXT: s_lshr_b32 s12, s2, 20
; GFX8-NEXT: v_ashrrev_i16_e32 v7, 12, v7
; GFX8-NEXT: v_ashrrev_i16_e32 v14, 12, v14
; GFX8-NEXT: v_lshlrev_b16_e64 v9, 12, s5
; GFX8-NEXT: v_lshlrev_b16_e64 v16, 12, s12
; GFX8-NEXT: s_lshr_b32 s4, s0, 24
; GFX8-NEXT: s_lshr_b32 s11, s2, 24
; GFX8-NEXT: v_ashrrev_i16_e32 v8, 12, v8
; GFX8-NEXT: v_ashrrev_i16_e32 v15, 12, v15
; GFX8-NEXT: v_lshlrev_b16_e64 v10, 12, s4
; GFX8-NEXT: v_lshlrev_b16_e64 v17, 12, s11
; GFX8-NEXT: s_lshr_b32 s1, s0, 28
; GFX8-NEXT: s_lshr_b32 s10, s2, 28
; GFX8-NEXT: v_ashrrev_i16_e32 v9, 12, v9
; GFX8-NEXT: v_ashrrev_i16_e32 v16, 12, v16
; GFX8-NEXT: v_lshlrev_b16_e64 v11, 12, s1
; GFX8-NEXT: v_lshlrev_b16_e64 v18, 12, s10
; GFX8-NEXT: v_ashrrev_i16_e32 v10, 12, v10
; GFX8-NEXT: v_ashrrev_i16_e32 v17, 12, v17
; GFX8-NEXT: v_ashrrev_i16_e32 v11, 12, v11
; GFX8-NEXT: v_ashrrev_i16_e32 v18, 12, v18
; GFX8-NEXT: s_lshl_b32 s5, s7, 28
; GFX8-NEXT: s_ashr_i64 s[4:5], s[4:5], 60
; GFX8-NEXT: s_lshl_b32 s9, s7, 24
; GFX8-NEXT: s_lshl_b32 s11, s7, 20
; GFX8-NEXT: s_lshl_b32 s5, s1, 28
; GFX8-NEXT: s_ashr_i64 s[14:15], s[4:5], 60
; GFX8-NEXT: s_lshl_b32 s5, s1, 20
; GFX8-NEXT: s_lshl_b32 s13, s1, 24
; GFX8-NEXT: s_ashr_i64 s[8:9], s[8:9], 60
; GFX8-NEXT: s_ashr_i64 s[10:11], s[10:11], 60
; GFX8-NEXT: v_mov_b32_e32 v3, s4
; GFX8-NEXT: s_ashr_i64 s[4:5], s[4:5], 60
; GFX8-NEXT: v_mov_b32_e32 v4, s10
; GFX8-NEXT: s_ashr_i64 s[12:13], s[12:13], 60
; GFX8-NEXT: s_lshl_b32 s5, s7, 16
; GFX8-NEXT: v_mov_b32_e32 v5, s8
; GFX8-NEXT: s_lshl_b32 s9, s1, 16
; GFX8-NEXT: s_lshl_b32 s11, s7, 12
; GFX8-NEXT: s_ashr_i64 s[16:17], s[4:5], 60
; GFX8-NEXT: s_ashr_i64 s[8:9], s[8:9], 60
; GFX8-NEXT: v_mul_i32_i24_e32 v4, s4, v4
; GFX8-NEXT: s_lshl_b32 s5, s1, 12
; GFX8-NEXT: s_lshl_b32 s9, s7, 8
; GFX8-NEXT: s_ashr_i64 s[10:11], s[10:11], 60
; GFX8-NEXT: v_mov_b32_e32 v6, s16
; GFX8-NEXT: s_ashr_i64 s[20:21], s[8:9], 60
; GFX8-NEXT: s_lshl_b32 s13, s1, 8
; GFX8-NEXT: s_ashr_i64 s[18:19], s[4:5], 60
; GFX8-NEXT: s_lshl_b32 s5, s7, 4
; GFX8-NEXT: v_mov_b32_e32 v7, s10
; GFX8-NEXT: s_lshl_b32 s9, s1, 4
; GFX8-NEXT: s_ashr_i64 s[24:25], s[4:5], 60
; GFX8-NEXT: s_ashr_i64 s[22:23], s[12:13], 60
; GFX8-NEXT: v_mov_b32_e32 v8, s20
; GFX8-NEXT: s_ashr_i64 s[6:7], s[6:7], 60
; GFX8-NEXT: s_ashr_i64 s[26:27], s[8:9], 60
; GFX8-NEXT: v_mov_b32_e32 v9, s24
; GFX8-NEXT: s_ashr_i64 s[0:1], s[0:1], 60
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mad_u32_u24 v2, v3, v4, v2
; GFX8-NEXT: v_mad_u32_u24 v2, v6, v13, v2
; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX8-NEXT: v_mad_u32_u24 v2, v7, v14, v2
; GFX8-NEXT: v_mad_u32_u24 v2, v8, v15, v2
; GFX8-NEXT: v_mad_u32_u24 v2, v9, v16, v2
; GFX8-NEXT: v_mad_u32_u24 v2, v10, v17, v2
; GFX8-NEXT: v_mad_u32_u24 v2, v11, v18, v2
; GFX8-NEXT: v_mad_i32_i24 v2, s14, v3, v2
; GFX8-NEXT: v_mad_i32_i24 v2, s12, v5, v2
; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX8-NEXT: v_mad_i32_i24 v2, s8, v6, v2
; GFX8-NEXT: v_mad_i32_i24 v2, s18, v7, v2
; GFX8-NEXT: v_mad_i32_i24 v2, s22, v8, v2
; GFX8-NEXT: v_mad_i32_i24 v2, s26, v9, v2
; GFX8-NEXT: v_mov_b32_e32 v3, s6
; GFX8-NEXT: v_mad_i32_i24 v2, s0, v3, v2
; GFX8-NEXT: flat_store_short v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -2023,87 +2015,83 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT: s_mov_b32 s2, 0xffff
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_ubyte v2, v[0:1]
; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0
; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX8-NEXT: s_mov_b32 s0, 0xffff
; GFX8-NEXT: s_load_dword s5, s[6:7], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshr_b32 s8, s1, 4
; GFX8-NEXT: s_lshr_b32 s9, s1, 12
; GFX8-NEXT: s_lshr_b32 s10, s1, 8
; GFX8-NEXT: s_lshr_b32 s15, s2, 4
; GFX8-NEXT: s_lshr_b32 s16, s2, 12
; GFX8-NEXT: s_lshr_b32 s17, s2, 8
; GFX8-NEXT: v_lshlrev_b16_e64 v3, 12, s1
; GFX8-NEXT: v_lshlrev_b16_e64 v4, 12, s2
; GFX8-NEXT: v_lshlrev_b16_e64 v5, 12, s10
; GFX8-NEXT: v_lshlrev_b16_e64 v6, 12, s9
; GFX8-NEXT: v_lshlrev_b16_e64 v7, 12, s8
; GFX8-NEXT: v_lshlrev_b16_e64 v12, 12, s17
; GFX8-NEXT: v_lshlrev_b16_e64 v13, 12, s16
; GFX8-NEXT: v_lshlrev_b16_e64 v14, 12, s15
; GFX8-NEXT: s_lshr_b32 s4, s1, 20
; GFX8-NEXT: s_lshr_b32 s5, s1, 16
; GFX8-NEXT: s_lshr_b32 s6, s1, 28
; GFX8-NEXT: s_lshr_b32 s7, s1, 24
; GFX8-NEXT: s_lshr_b32 s11, s2, 20
; GFX8-NEXT: s_lshr_b32 s12, s2, 16
; GFX8-NEXT: s_lshr_b32 s13, s2, 28
; GFX8-NEXT: s_lshr_b32 s14, s2, 24
; GFX8-NEXT: v_lshlrev_b16_e64 v8, 12, s7
; GFX8-NEXT: v_lshlrev_b16_e64 v9, 12, s6
; GFX8-NEXT: v_lshlrev_b16_e64 v10, 12, s5
; GFX8-NEXT: v_lshlrev_b16_e64 v11, 12, s4
; GFX8-NEXT: v_lshlrev_b16_e64 v15, 12, s14
; GFX8-NEXT: v_lshlrev_b16_e64 v16, 12, s13
; GFX8-NEXT: v_lshlrev_b16_e64 v17, 12, s12
; GFX8-NEXT: v_lshlrev_b16_e64 v18, 12, s11
; GFX8-NEXT: v_ashrrev_i16_e32 v3, 12, v3
; GFX8-NEXT: v_ashrrev_i16_e32 v4, 12, v4
; GFX8-NEXT: v_ashrrev_i16_e32 v5, 12, v5
; GFX8-NEXT: v_ashrrev_i16_e32 v6, 12, v6
; GFX8-NEXT: v_ashrrev_i16_e32 v7, 12, v7
; GFX8-NEXT: v_ashrrev_i16_e32 v12, 12, v12
; GFX8-NEXT: v_ashrrev_i16_e32 v13, 12, v13
; GFX8-NEXT: v_ashrrev_i16_e32 v14, 12, v14
; GFX8-NEXT: v_mul_u32_u24_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX8-NEXT: v_mul_u32_u24_sdwa v4, v5, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX8-NEXT: v_mul_u32_u24_sdwa v5, v6, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX8-NEXT: v_mul_u32_u24_sdwa v6, v7, v14 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX8-NEXT: v_ashrrev_i16_e32 v8, 12, v8
; GFX8-NEXT: v_ashrrev_i16_e32 v15, 12, v15
; GFX8-NEXT: v_ashrrev_i16_e32 v9, 12, v9
; GFX8-NEXT: v_ashrrev_i16_e32 v10, 12, v10
; GFX8-NEXT: v_ashrrev_i16_e32 v11, 12, v11
; GFX8-NEXT: v_ashrrev_i16_e32 v16, 12, v16
; GFX8-NEXT: v_ashrrev_i16_e32 v17, 12, v17
; GFX8-NEXT: v_ashrrev_i16_e32 v18, 12, v18
; GFX8-NEXT: v_or_b32_sdwa v3, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX8-NEXT: v_mul_u32_u24_sdwa v7, v8, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX8-NEXT: v_mul_u32_u24_sdwa v8, v9, v16 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX8-NEXT: v_mul_u32_u24_sdwa v9, v10, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX8-NEXT: v_mul_u32_u24_sdwa v10, v11, v18 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX8-NEXT: v_and_b32_e32 v3, s0, v3
; GFX8-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX8-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX8-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX8-NEXT: v_and_b32_e32 v5, s0, v9
; GFX8-NEXT: v_or_b32_e32 v4, v3, v4
; GFX8-NEXT: v_or_b32_e32 v6, v5, v7
; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v4
; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v6
; GFX8-NEXT: s_lshl_b32 s13, s1, 24
; GFX8-NEXT: s_lshl_b32 s17, s1, 16
; GFX8-NEXT: s_ashr_i64 s[22:23], s[4:5], 60
; GFX8-NEXT: s_lshl_b32 s25, s5, 24
; GFX8-NEXT: s_lshl_b32 s27, s5, 28
; GFX8-NEXT: s_lshl_b32 s29, s5, 16
; GFX8-NEXT: s_ashr_i64 s[10:11], s[0:1], 60
; GFX8-NEXT: s_lshl_b32 s15, s1, 28
; GFX8-NEXT: s_lshl_b32 s19, s5, 8
; GFX8-NEXT: s_lshl_b32 s21, s5, 12
; GFX8-NEXT: s_lshl_b32 s23, s5, 4
; GFX8-NEXT: s_lshl_b32 s5, s5, 20
; GFX8-NEXT: s_ashr_i64 s[12:13], s[12:13], 60
; GFX8-NEXT: s_ashr_i64 s[16:17], s[16:17], 60
; GFX8-NEXT: s_ashr_i64 s[24:25], s[24:25], 60
; GFX8-NEXT: s_ashr_i64 s[26:27], s[26:27], 60
; GFX8-NEXT: s_ashr_i64 s[28:29], s[28:29], 60
; GFX8-NEXT: s_lshl_b32 s7, s1, 8
; GFX8-NEXT: s_lshl_b32 s9, s1, 12
; GFX8-NEXT: s_lshl_b32 s11, s1, 4
; GFX8-NEXT: s_lshl_b32 s1, s1, 20
; GFX8-NEXT: s_ashr_i64 s[4:5], s[4:5], 60
; GFX8-NEXT: s_ashr_i64 s[14:15], s[14:15], 60
; GFX8-NEXT: v_mov_b32_e32 v6, s28
; GFX8-NEXT: v_mov_b32_e32 v7, s16
; GFX8-NEXT: v_mov_b32_e32 v8, s26
; GFX8-NEXT: v_mov_b32_e32 v9, s24
; GFX8-NEXT: v_mov_b32_e32 v10, s12
; GFX8-NEXT: v_mul_i32_i24_sdwa v6, v7, v6 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-NEXT: v_mul_i32_i24_e32 v7, s14, v8
; GFX8-NEXT: v_mul_i32_i24_sdwa v8, v10, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-NEXT: s_ashr_i64 s[0:1], s[0:1], 60
; GFX8-NEXT: v_mov_b32_e32 v5, s4
; GFX8-NEXT: v_mul_i32_i24_e32 v5, s0, v5
; GFX8-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX8-NEXT: s_ashr_i64 s[6:7], s[6:7], 60
; GFX8-NEXT: s_ashr_i64 s[18:19], s[18:19], 60
; GFX8-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX8-NEXT: v_and_b32_e32 v6, s2, v7
; GFX8-NEXT: s_ashr_i64 s[20:21], s[20:21], 60
; GFX8-NEXT: v_mov_b32_e32 v3, s22
; GFX8-NEXT: v_mov_b32_e32 v4, s10
; GFX8-NEXT: s_ashr_i64 s[32:33], s[22:23], 60
; GFX8-NEXT: v_mul_i32_i24_sdwa v3, v4, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-NEXT: v_or_b32_e32 v5, v6, v5
; GFX8-NEXT: s_ashr_i64 s[8:9], s[8:9], 60
; GFX8-NEXT: v_mov_b32_e32 v4, s20
; GFX8-NEXT: v_mov_b32_e32 v12, s18
; GFX8-NEXT: v_mov_b32_e32 v13, s6
; GFX8-NEXT: s_ashr_i64 s[30:31], s[10:11], 60
; GFX8-NEXT: v_mov_b32_e32 v11, s32
; GFX8-NEXT: v_mul_i32_i24_e32 v4, s8, v4
; GFX8-NEXT: v_mul_i32_i24_sdwa v10, v13, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v5
; GFX8-NEXT: v_or_b32_sdwa v4, v4, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX8-NEXT: v_mul_i32_i24_e32 v9, s30, v11
; GFX8-NEXT: v_or_b32_sdwa v3, v9, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX8-NEXT: v_and_b32_e32 v4, s2, v4
; GFX8-NEXT: v_or_b32_e32 v3, v4, v3
; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v3
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v7, v2
; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0
; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5
; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0
; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v8, v2
; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
; GFX8-NEXT: flat_store_byte v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -52,43 +52,49 @@ define amdgpu_kernel void @local_store_i55(i55 addrspace(3)* %ptr, i55 %arg) #0
;
; FIJI-LABEL: local_store_i55:
; FIJI: ; %bb.0:
; FIJI-NEXT: s_add_u32 s0, s4, 14
; FIJI-NEXT: s_addc_u32 s1, s5, 0
; FIJI-NEXT: v_mov_b32_e32 v0, s0
; FIJI-NEXT: v_mov_b32_e32 v1, s1
; FIJI-NEXT: flat_load_ubyte v0, v[0:1]
; FIJI-NEXT: s_load_dword s0, s[4:5], 0x0
; FIJI-NEXT: s_load_dword s1, s[4:5], 0x8
; FIJI-NEXT: s_load_dword s2, s[4:5], 0xc
; FIJI-NEXT: s_mov_b32 m0, -1
; FIJI-NEXT: s_waitcnt lgkmcnt(0)
; FIJI-NEXT: v_mov_b32_e32 v1, s0
; FIJI-NEXT: v_mov_b32_e32 v2, s0
; FIJI-NEXT: v_mov_b32_e32 v3, s1
; FIJI-NEXT: v_mov_b32_e32 v2, s2
; FIJI-NEXT: ds_write_b16 v1, v2 offset:4
; FIJI-NEXT: s_waitcnt vmcnt(0)
; FIJI-NEXT: v_and_b32_e32 v0, 0x7f, v0
; FIJI-NEXT: ds_write_b8 v1, v0 offset:6
; FIJI-NEXT: ds_write_b32 v1, v3
; FIJI-NEXT: s_and_b32 s3, s2, 0xffff
; FIJI-NEXT: s_add_u32 s0, s4, 14
; FIJI-NEXT: s_addc_u32 s1, s5, 0
; FIJI-NEXT: v_mov_b32_e32 v0, s0
; FIJI-NEXT: v_mov_b32_e32 v1, s1
; FIJI-NEXT: flat_load_ubyte v0, v[0:1]
; FIJI-NEXT: v_mov_b32_e32 v1, s2
; FIJI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; FIJI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; FIJI-NEXT: v_or_b32_e32 v0, s3, v0
; FIJI-NEXT: v_bfe_u32 v0, v0, 16, 7
; FIJI-NEXT: ds_write_b16 v2, v1 offset:4
; FIJI-NEXT: ds_write_b8 v2, v0 offset:6
; FIJI-NEXT: ds_write_b32 v2, v3
; FIJI-NEXT: s_endpgm
;
; GFX9-LABEL: local_store_i55:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: global_load_ubyte_d16_hi v0, v[0:1], off offset:14
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: global_load_ubyte_d16_hi v2, v[0:1], off offset:14
; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX9-NEXT: s_load_dword s1, s[4:5], 0x8
; GFX9-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: ds_write_b16 v1, v2 offset:4
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: s_and_b32 s3, s2, 0xffff
; GFX9-NEXT: ds_write_b16 v0, v1 offset:4
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_bfe_u32 v0, v0, 16, 7
; GFX9-NEXT: ds_write_b8 v1, v0 offset:6
; GFX9-NEXT: ds_write_b32 v1, v3
; GFX9-NEXT: v_or_b32_e32 v1, s3, v2
; GFX9-NEXT: v_and_b32_e32 v1, 0x7fffff, v1
; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:6
; GFX9-NEXT: ds_write_b32 v0, v3
; GFX9-NEXT: s_endpgm
store i55 %arg, i55 addrspace(3)* %ptr, align 8
ret void
@@ -21,8 +21,7 @@ define i64 @combine_psadbw_demandedelt(<16 x i8> %0, <16 x i8> %1) {
 ; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,3,2]
 ; X86-NEXT: psadbw %xmm0, %xmm1
 ; X86-NEXT: movd %xmm1, %eax
-; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X86-NEXT: movd %xmm0, %edx
+; X86-NEXT: xorl %edx, %edx
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: combine_psadbw_demandedelt:
@@ -1576,10 +1576,8 @@ define i8 @test_v4i8(<4 x i8> %a0) {
 ; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; SSE2-NEXT: pmullw %xmm1, %xmm0
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: packuswb %xmm0, %xmm0
 ; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrlw $8, %xmm1
+; SSE2-NEXT: psrld $16, %xmm1
 ; SSE2-NEXT: pmullw %xmm0, %xmm1
 ; SSE2-NEXT: movd %xmm1, %eax
 ; SSE2-NEXT: # kill: def $al killed $al killed $eax
@@ -1591,7 +1589,7 @@ define i8 @test_v4i8(<4 x i8> %a0) {
 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; SSE41-NEXT: pmullw %xmm0, %xmm1
 ; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2],zero,xmm0[6],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero
+; SSE41-NEXT: psrld $16, %xmm0
 ; SSE41-NEXT: pmullw %xmm1, %xmm0
 ; SSE41-NEXT: pextrb $0, %xmm0, %eax
 ; SSE41-NEXT: # kill: def $al killed $al killed $eax
@@ -1602,7 +1600,7 @@ define i8 @test_v4i8(<4 x i8> %a0) {
 ; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2],zero,xmm0[6],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero
+; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
 ; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0
 ; AVX-NEXT: vpextrb $0, %xmm0, %eax
 ; AVX-NEXT: # kill: def $al killed $al killed $eax
@@ -1613,7 +1611,7 @@ define i8 @test_v4i8(<4 x i8> %a0) {
 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2],zero,xmm0[6],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero
+; AVX512BW-NEXT: vpsrld $16, %xmm0, %xmm1
 ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
 ; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax
 ; AVX512BW-NEXT: # kill: def $al killed $al killed $eax
@@ -1625,7 +1623,7 @@ define i8 @test_v4i8(<4 x i8> %a0) {
 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
 ; AVX512VL-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX512VL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2],zero,xmm0[6],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero
+; AVX512VL-NEXT: vpsrld $16, %xmm0, %xmm1
 ; AVX512VL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
 ; AVX512VL-NEXT: vpextrb $0, %xmm0, %eax
 ; AVX512VL-NEXT: # kill: def $al killed $al killed $eax
@@ -1636,7 +1634,7 @@ define i8 @test_v4i8(<4 x i8> %a0) {
 ; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2],zero,xmm0[6],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero
+; AVX512DQ-NEXT: vpsrld $16, %xmm0, %xmm1
 ; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0
 ; AVX512DQ-NEXT: vpextrb $0, %xmm0, %eax
 ; AVX512DQ-NEXT: # kill: def $al killed $al killed $eax
@@ -1654,10 +1652,8 @@ define i8 @test_v8i8(<8 x i8> %a0) {
 ; SSE2-NEXT: pmullw %xmm0, %xmm1
 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,2,3,0]
 ; SSE2-NEXT: pmullw %xmm1, %xmm0
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: packuswb %xmm0, %xmm0
 ; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrlw $8, %xmm1
+; SSE2-NEXT: psrld $16, %xmm1
 ; SSE2-NEXT: pmullw %xmm0, %xmm1
 ; SSE2-NEXT: movd %xmm1, %eax
 ; SSE2-NEXT: # kill: def $al killed $al killed $eax
@@ -1673,7 +1669,7 @@ define i8 @test_v8i8(<8 x i8> %a0) {
 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
 ; SSE41-NEXT: pmullw %xmm0, %xmm1
 ; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2],zero,xmm0[6],zero,xmm0[10],zero,xmm0[14],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero
+; SSE41-NEXT: psrld $16, %xmm0
 ; SSE41-NEXT: pmullw %xmm1, %xmm0
 ; SSE41-NEXT: pextrb $0, %xmm0, %eax
 ; SSE41-NEXT: # kill: def $al killed $al killed $eax
@@ -1686,7 +1682,7 @@ define i8 @test_v8i8(<8 x i8> %a0) {
 ; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0
 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2],zero,xmm0[6],zero,xmm0[10],zero,xmm0[14],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero
+; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
 ; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0
 ; AVX-NEXT: vpextrb $0, %xmm0, %eax
 ; AVX-NEXT: # kill: def $al killed $al killed $eax
@@ -1699,7 +1695,7 @@ define i8 @test_v8i8(<8 x i8> %a0) {
 ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2],zero,xmm0[6],zero,xmm0[10],zero,xmm0[14],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero
+; AVX512BW-NEXT: vpsrld $16, %xmm0, %xmm1
 ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
 ; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax
 ; AVX512BW-NEXT: # kill: def $al killed $al killed $eax
@@ -1713,7 +1709,7 @@ define i8 @test_v8i8(<8 x i8> %a0) {
 ; AVX512VL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
 ; AVX512VL-NEXT: vpalignr {{.*#+}} xmm1 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3]
 ; AVX512VL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2],zero,xmm0[6],zero,xmm0[10],zero,xmm0[14],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero
+; AVX512VL-NEXT: vpsrld $16, %xmm0, %xmm1
 ; AVX512VL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
 ; AVX512VL-NEXT: vpextrb $0, %xmm0, %eax
 ; AVX512VL-NEXT: # kill: def $al killed $al killed $eax
@@ -1726,7 +1722,7 @@ define i8 @test_v8i8(<8 x i8> %a0) {
 ; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0
 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2],zero,xmm0[6],zero,xmm0[10],zero,xmm0[14],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero
+; AVX512DQ-NEXT: vpsrld $16, %xmm0, %xmm1
 ; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0
 ; AVX512DQ-NEXT: vpextrb $0, %xmm0, %eax
 ; AVX512DQ-NEXT: # kill: def $al killed $al killed $eax
@@ -1756,8 +1752,9 @@ define i8 @test_v16i8(<16 x i8> %a0) {
; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pmullw %xmm2, %xmm0
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: packuswb %xmm3, %xmm1
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: packuswb %xmm3, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $8, %xmm1
; SSE2-NEXT: pmullw %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
@@ -1799,10 +1796,10 @@ define i8 @test_v16i8(<16 x i8> %a0) {
 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm1
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
+; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpextrb $0, %xmm0, %eax
 ; AVX1-NEXT: # kill: def $al killed $al killed $eax
@@ -1947,8 +1944,9 @@ define i8 @test_v32i8(<32 x i8> %a0) {
; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pmullw %xmm2, %xmm0
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: packuswb %xmm3, %xmm1
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: packuswb %xmm3, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $8, %xmm1
; SSE2-NEXT: pmullw %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
@@ -2037,10 +2035,9 @@ define i8 @test_v32i8(<32 x i8> %a0) {
 ; AVX2-NEXT: vpsrld $16, %xmm1, %xmm1
 ; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
 ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm1
-; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpsrlw $8, %xmm1, %xmm1
-; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
 ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vpextrb $0, %xmm0, %eax
 ; AVX2-NEXT: # kill: def $al killed $al killed $eax
@@ -2126,10 +2123,9 @@ define i8 @test_v32i8(<32 x i8> %a0) {
 ; AVX512DQ-NEXT: vpsrld $16, %xmm1, %xmm1
 ; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
 ; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpand %xmm3, %xmm0, %xmm1
-; AVX512DQ-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
-; AVX512DQ-NEXT: vpsrlw $8, %xmm1, %xmm1
-; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512DQ-NEXT: vpand %xmm3, %xmm0, %xmm0
+; AVX512DQ-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX512DQ-NEXT: vpsrlw $8, %xmm0, %xmm1
 ; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0
 ; AVX512DQ-NEXT: vpextrb $0, %xmm0, %eax
 ; AVX512DQ-NEXT: # kill: def $al killed $al killed $eax
@@ -2159,10 +2155,9 @@ define i8 @test_v32i8(<32 x i8> %a0) {
 ; AVX512DQVL-NEXT: vpsrld $16, %xmm1, %xmm1
 ; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
 ; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512DQVL-NEXT: vpand %xmm3, %xmm0, %xmm1
-; AVX512DQVL-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
-; AVX512DQVL-NEXT: vpsrlw $8, %xmm1, %xmm1
-; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX512DQVL-NEXT: vpand %xmm3, %xmm0, %xmm0
+; AVX512DQVL-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX512DQVL-NEXT: vpsrlw $8, %xmm0, %xmm1
 ; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
 ; AVX512DQVL-NEXT: vpextrb $0, %xmm0, %eax
 ; AVX512DQVL-NEXT: # kill: def $al killed $al killed $eax
@@ -2208,8 +2203,9 @@ define i8 @test_v64i8(<64 x i8> %a0) {
; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pmullw %xmm2, %xmm0
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: packuswb %xmm3, %xmm1
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: packuswb %xmm3, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $8, %xmm1
; SSE2-NEXT: pmullw %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
@@ -2323,10 +2319,9 @@ define i8 @test_v64i8(<64 x i8> %a0) {
 ; AVX2-NEXT: vpsrld $16, %xmm2, %xmm2
 ; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
 ; AVX2-NEXT: vpmullw %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm1
-; AVX2-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
-; AVX2-NEXT: vpsrlw $8, %xmm1, %xmm1
-; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
 ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vpextrb $0, %xmm0, %eax
 ; AVX2-NEXT: # kill: def $al killed $al killed $eax
@@ -2365,10 +2360,9 @@ define i8 @test_v64i8(<64 x i8> %a0) {
 ; AVX512BW-NEXT: vpsrld $16, %xmm1, %xmm1
 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
 ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: vpand %xmm3, %xmm0, %xmm1
-; AVX512BW-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
-; AVX512BW-NEXT: vpsrlw $8, %xmm1, %xmm1
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512BW-NEXT: vpand %xmm3, %xmm0, %xmm0
+; AVX512BW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm1
 ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
 ; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax
 ; AVX512BW-NEXT: # kill: def $al killed $al killed $eax
@@ -2407,10 +2401,9 @@ define i8 @test_v64i8(<64 x i8> %a0) {
 ; AVX512BWVL-NEXT: vpsrld $16, %xmm1, %xmm1
 ; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
 ; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpand %xmm3, %xmm0, %xmm1
-; AVX512BWVL-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpsrlw $8, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX512BWVL-NEXT: vpand %xmm3, %xmm0, %xmm0
+; AVX512BWVL-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX512BWVL-NEXT: vpsrlw $8, %xmm0, %xmm1
 ; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
 ; AVX512BWVL-NEXT: vpextrb $0, %xmm0, %eax
 ; AVX512BWVL-NEXT: # kill: def $al killed $al killed $eax
@@ -2444,10 +2437,9 @@ define i8 @test_v64i8(<64 x i8> %a0) {
 ; AVX512DQ-NEXT: vpsrld $16, %xmm2, %xmm2
 ; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
 ; AVX512DQ-NEXT: vpmullw %xmm2, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpand %xmm1, %xmm0, %xmm1
-; AVX512DQ-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
-; AVX512DQ-NEXT: vpsrlw $8, %xmm1, %xmm1
-; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512DQ-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX512DQ-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
+; AVX512DQ-NEXT: vpsrlw $8, %xmm0, %xmm1
 ; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0
 ; AVX512DQ-NEXT: vpextrb $0, %xmm0, %eax
 ; AVX512DQ-NEXT: # kill: def $al killed $al killed $eax
@@ -2486,10 +2478,9 @@ define i8 @test_v64i8(<64 x i8> %a0) {
 ; AVX512DQVL-NEXT: vpsrld $16, %xmm1, %xmm1
 ; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
 ; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512DQVL-NEXT: vpand %xmm3, %xmm0, %xmm1
-; AVX512DQVL-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
-; AVX512DQVL-NEXT: vpsrlw $8, %xmm1, %xmm1
-; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX512DQVL-NEXT: vpand %xmm3, %xmm0, %xmm0
+; AVX512DQVL-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX512DQVL-NEXT: vpsrlw $8, %xmm0, %xmm1
 ; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
 ; AVX512DQVL-NEXT: vpextrb $0, %xmm0, %eax
 ; AVX512DQVL-NEXT: # kill: def $al killed $al killed $eax
@@ -2555,8 +2546,9 @@ define i8 @test_v128i8(<128 x i8> %a0) {
; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT: pmullw %xmm2, %xmm1
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: packuswb %xmm3, %xmm0
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: packuswb %xmm3, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: psrlw $8, %xmm0
; SSE2-NEXT: pmullw %xmm1, %xmm0
; SSE2-NEXT: movd %xmm0, %eax
@@ -2729,10 +2721,9 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ; AVX2-NEXT: vpsrld $16, %xmm1, %xmm1
 ; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
 ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm1
-; AVX2-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
-; AVX2-NEXT: vpsrlw $8, %xmm1, %xmm1
-; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
 ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vpextrb $0, %xmm0, %eax
 ; AVX2-NEXT: # kill: def $al killed $al killed $eax
@@ -2779,10 +2770,9 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ; AVX512BW-NEXT: vpsrld $16, %xmm1, %xmm1
 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
 ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: vpand %xmm3, %xmm0, %xmm1
-; AVX512BW-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
-; AVX512BW-NEXT: vpsrlw $8, %xmm1, %xmm1
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512BW-NEXT: vpand %xmm3, %xmm0, %xmm0
+; AVX512BW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm1
 ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
 ; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax
 ; AVX512BW-NEXT: # kill: def $al killed $al killed $eax
@@ -2829,10 +2819,9 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ; AVX512BWVL-NEXT: vpsrld $16, %xmm1, %xmm1
 ; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
 ; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpand %xmm3, %xmm0, %xmm1
-; AVX512BWVL-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpsrlw $8, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX512BWVL-NEXT: vpand %xmm3, %xmm0, %xmm0
+; AVX512BWVL-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX512BWVL-NEXT: vpsrlw $8, %xmm0, %xmm1
 ; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
 ; AVX512BWVL-NEXT: vpextrb $0, %xmm0, %eax
 ; AVX512BWVL-NEXT: # kill: def $al killed $al killed $eax
@@ -2873,10 +2862,9 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ; AVX512DQ-NEXT: vpsrld $16, %xmm1, %xmm1
 ; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
 ; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpand %xmm2, %xmm0, %xmm1
-; AVX512DQ-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
-; AVX512DQ-NEXT: vpsrlw $8, %xmm1, %xmm1
-; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512DQ-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX512DQ-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
+; AVX512DQ-NEXT: vpsrlw $8, %xmm0, %xmm1
 ; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0
 ; AVX512DQ-NEXT: vpextrb $0, %xmm0, %eax
 ; AVX512DQ-NEXT: # kill: def $al killed $al killed $eax
@@ -2922,10 +2910,9 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ; AVX512DQVL-NEXT: vpsrld $16, %xmm1, %xmm1
 ; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
 ; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512DQVL-NEXT: vpand %xmm2, %xmm0, %xmm1
-; AVX512DQVL-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
-; AVX512DQVL-NEXT: vpsrlw $8, %xmm1, %xmm1
-; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX512DQVL-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX512DQVL-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
+; AVX512DQVL-NEXT: vpsrlw $8, %xmm0, %xmm1
 ; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
 ; AVX512DQVL-NEXT: vpextrb $0, %xmm0, %eax
 ; AVX512DQVL-NEXT: # kill: def $al killed $al killed $eax