mirror of
https://git.eden-emu.dev/eden-emu/eden
synced 2026-02-04 02:51:18 +01:00
AVX2 and SSE impl for SSHLU32
This commit is contained in:
@@ -2878,25 +2878,19 @@ static void EmitVectorPairedMinMax16(BlockOfCode& code, EmitContext& ctx, IR::In
|
||||
template<typename Function>
|
||||
static void EmitVectorPairedMinMaxLower16(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn) {
|
||||
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
||||
|
||||
auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
|
||||
auto const y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
|
||||
auto const tmp = ctx.reg_alloc.ScratchXmm(code);
|
||||
|
||||
// swap idxs 1 and 2 so that both registers contain even then odd-indexed pairs of elements
|
||||
code.pshuflw(x, x, 0b11'01'10'00);
|
||||
code.pshuflw(y, y, 0b11'01'10'00);
|
||||
|
||||
// move pairs of even/odd-indexed elements into one register each
|
||||
|
||||
// tmp = x[0, 2], y[0, 2], 0s...
|
||||
code.movaps(tmp, y);
|
||||
code.insertps(tmp, x, 0b01001100);
|
||||
// x = x[1, 3], y[1, 3], 0s...
|
||||
code.insertps(x, y, 0b00011100);
|
||||
|
||||
(code.*fn)(x, tmp);
|
||||
|
||||
ctx.reg_alloc.DefineValue(code, inst, x);
|
||||
}
|
||||
|
||||
@@ -3105,12 +3099,11 @@ void EmitX64::EmitVectorPairedMaxLowerS8(EmitContext& ctx, IR::Inst* inst) {
|
||||
void EmitX64::EmitVectorPairedMaxLowerS16(EmitContext& ctx, IR::Inst* inst) {
|
||||
if (code.HasHostFeature(HostFeature::SSE41)) {
|
||||
EmitVectorPairedMinMaxLower16(code, ctx, inst, &Xbyak::CodeGenerator::pmaxsw);
|
||||
return;
|
||||
} else {
|
||||
EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<s16>& result, const VectorArray<s16>& a, const VectorArray<s16>& b) {
|
||||
LowerPairedMax(result, a, b);
|
||||
});
|
||||
}
|
||||
|
||||
EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<s16>& result, const VectorArray<s16>& a, const VectorArray<s16>& b) {
|
||||
LowerPairedMax(result, a, b);
|
||||
});
|
||||
}
|
||||
|
||||
void EmitX64::EmitVectorPairedMaxLowerS32(EmitContext& ctx, IR::Inst* inst) {
|
||||
@@ -4952,13 +4945,13 @@ static bool VectorSignedSaturatedShiftLeftUnsigned(VectorArray<T>& dst, const Ve
|
||||
auto const element = data[i];
|
||||
auto const shifted = U(element) << U(T(shift_amount));
|
||||
auto const shifted_test = shifted >> U(T(shift_amount));
|
||||
if (element <= 0)
|
||||
dst[i] = 0;
|
||||
auto result = 0;
|
||||
if (element > 0 && shifted_test != U(element))
|
||||
dst[i] = T((std::numeric_limits<U>::max)());
|
||||
result = T((std::numeric_limits<U>::max)());
|
||||
if (element > 0 && shifted_test == U(element))
|
||||
dst[i] = shifted;
|
||||
qc_flag = element < 0 || (element > 0 && shifted_test != U(element));
|
||||
result = shifted;
|
||||
qc_flag |= element < 0 || (element > 0 && shifted_test != U(element));
|
||||
dst[i] = result;
|
||||
}
|
||||
return qc_flag;
|
||||
}
|
||||
@@ -4972,34 +4965,96 @@ void EmitX64::EmitVectorSignedSaturatedShiftLeftUnsigned16(EmitContext& ctx, IR:
|
||||
}
|
||||
|
||||
void EmitX64::EmitVectorSignedSaturatedShiftLeftUnsigned32(EmitContext& ctx, IR::Inst* inst) {
|
||||
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
||||
auto const imm8 = args[1].GetImmediateU8();
|
||||
if (code.HasHostFeature(HostFeature::AVX2)) {
|
||||
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
||||
auto const imm8 = args[1].GetImmediateU8();
|
||||
auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]);
|
||||
auto const tmp1 = ctx.reg_alloc.ScratchXmm(code);
|
||||
auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
|
||||
auto const tmp3 = ctx.reg_alloc.ScratchXmm(code);
|
||||
auto const tmp4 = ctx.reg_alloc.ScratchXmm(code);
|
||||
auto const tmp_flag = ctx.reg_alloc.ScratchGpr(code);
|
||||
code.vpslld(tmp2, tmp0, imm8);
|
||||
code.vpxor(tmp3, tmp3, tmp3);
|
||||
code.vpsrld(tmp1, tmp2, imm8);
|
||||
code.vpcmpeqd(tmp3, tmp0, tmp3);
|
||||
code.vpcmpeqd(tmp1, tmp1, tmp0);
|
||||
code.vpor(tmp3, tmp3, tmp1);
|
||||
code.vpcmpeqd(tmp4, tmp4, tmp4);
|
||||
code.vpxor(tmp3, tmp3, tmp4);
|
||||
code.vpor(tmp3, tmp3, tmp0);
|
||||
code.vtestps(tmp3, tmp3);
|
||||
code.vpbroadcastd(tmp3, code.Const(dword, 1));
|
||||
code.vpcmpgtd(tmp0, tmp3, tmp0);
|
||||
code.vpor(tmp0, tmp0, tmp1);
|
||||
code.vblendvps(tmp0, tmp4, tmp2, tmp0);
|
||||
code.setne(tmp_flag.cvt8());
|
||||
auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]);
|
||||
if (imm8 == 0) {
|
||||
auto const tmp1 = ctx.reg_alloc.ScratchXmm(code);
|
||||
auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
|
||||
code.vpshufd(tmp1, tmp0, 85);
|
||||
code.vpshufd(tmp2, tmp0, 238);
|
||||
code.vpor(tmp1, tmp1, tmp2);
|
||||
code.vpshufd(tmp2, tmp0, 255);
|
||||
code.vpor(tmp2, tmp2, tmp0);
|
||||
code.vpor(tmp1, tmp1, tmp2);
|
||||
code.vmovd(tmp_flag.cvt32(), tmp1);
|
||||
code.shr(tmp_flag.cvt32(), 31);
|
||||
code.vpxor(tmp1, tmp1, tmp1);
|
||||
code.vpmaxsd(tmp0, tmp0, tmp1);
|
||||
} else {
|
||||
auto const tmp1 = ctx.reg_alloc.ScratchXmm(code);
|
||||
auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
|
||||
auto const tmp3 = ctx.reg_alloc.ScratchXmm(code);
|
||||
auto const tmp4 = ctx.reg_alloc.ScratchXmm(code);
|
||||
auto const cmp_value = u32(1ULL << 31) >> (imm8 - 1);
|
||||
code.vpshufd(tmp1, tmp0, 238);
|
||||
code.vpor(tmp1, tmp1, tmp0);
|
||||
code.vpshufd(tmp2, tmp1, 85);
|
||||
code.vpor(tmp1, tmp1, tmp2);
|
||||
code.vmovd(tmp_flag.cvt32(), tmp1);
|
||||
code.cmp(tmp_flag.cvt32(), cmp_value);
|
||||
code.vpslld(tmp1, tmp0, imm8);
|
||||
code.vpbroadcastd(tmp2, code.Const(dword, cmp_value - 2));
|
||||
code.vpbroadcastd(tmp3, code.Const(dword, cmp_value - 1));
|
||||
code.vpcmpgtd(tmp3, tmp0, tmp3);
|
||||
code.vpcmpeqd(tmp4, tmp4, tmp4);
|
||||
code.vpaddd(tmp0, tmp0, tmp4);
|
||||
code.vpminud(tmp2, tmp0, tmp2);
|
||||
code.vpcmpeqd(tmp0, tmp0, tmp2);
|
||||
code.vblendvps(tmp0, tmp3, tmp1, tmp0);
|
||||
code.setae(tmp_flag.cvt8());
|
||||
}
|
||||
code.or_(code.byte[code.ABI_JIT_PTR + code.GetJitStateInfo().offsetof_fpsr_qc], tmp_flag.cvt8());
|
||||
ctx.reg_alloc.DefineValue(code, inst, tmp0);
|
||||
} else {
|
||||
EmitTwoArgumentFallbackWithSaturationAndImmediate(code, ctx, inst, VectorSignedSaturatedShiftLeftUnsigned<s32>);
|
||||
auto const tmp_flag = ctx.reg_alloc.ScratchGpr(code);
|
||||
auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]);
|
||||
if (imm8 == 0) {
|
||||
auto const tmp1 = ctx.reg_alloc.ScratchXmm(code);
|
||||
auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
|
||||
code.pshufd(tmp1, tmp0, 85);
|
||||
code.pshufd(tmp2, tmp0, 238);
|
||||
code.por(tmp2, tmp1);
|
||||
code.pshufd(tmp1, tmp0, 255);
|
||||
code.por(tmp1, tmp0);
|
||||
code.por(tmp1, tmp2);
|
||||
code.movd(tmp_flag.cvt32(), tmp1);
|
||||
code.shr(tmp_flag.cvt32(), 31);
|
||||
code.pxor(tmp1, tmp1);
|
||||
code.movdqa(tmp2, tmp0);
|
||||
code.pcmpgtd(tmp2, tmp1);
|
||||
code.pand(tmp0, tmp2);
|
||||
} else {
|
||||
auto const tmp1 = ctx.reg_alloc.ScratchXmm(code);
|
||||
auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
|
||||
auto const tmp3 = ctx.reg_alloc.ScratchXmm(code);
|
||||
u64 const cmp_value = u64(1ULL << 31) >> (imm8 - 1);
|
||||
u64 const cmp_one = cmp_value - 1;
|
||||
u64 const cmp_add = (cmp_value - 2) + 0x80000000;
|
||||
code.pshufd(tmp1, tmp0, 238);
|
||||
code.por(tmp1, tmp0);
|
||||
code.pshufd(tmp2, tmp1, 85);
|
||||
code.por(tmp2, tmp1);
|
||||
code.movd(tmp_flag.cvt32(), tmp2);
|
||||
code.cmp(tmp_flag.cvt32(), cmp_value);
|
||||
code.movdqa(tmp1, tmp0);
|
||||
code.pslld(tmp1, imm8);
|
||||
code.movdqa(tmp2, tmp0);
|
||||
code.pcmpgtd(tmp2, code.Const(xword, cmp_one | (cmp_one << 32), cmp_one | (cmp_one << 32)));
|
||||
code.pcmpeqd(tmp3, tmp3);
|
||||
code.paddd(tmp0, tmp3);
|
||||
code.pxor(tmp0, code.Const(xword, 0x80000000'80000000, 0x80000000'80000000));
|
||||
code.pcmpgtd(tmp0, code.Const(xword, cmp_add | (cmp_add << 32), cmp_add | (cmp_add << 32)));
|
||||
code.pand(tmp2, tmp0);
|
||||
code.pandn(tmp0, tmp1);
|
||||
code.por(tmp0, tmp2);
|
||||
code.setae(tmp_flag.cvt8());
|
||||
}
|
||||
code.or_(code.byte[code.ABI_JIT_PTR + code.GetJitStateInfo().offsetof_fpsr_qc], tmp_flag.cvt8());
|
||||
ctx.reg_alloc.DefineValue(code, inst, tmp0);
|
||||
// EmitTwoArgumentFallbackWithSaturationAndImmediate(code, ctx, inst, VectorSignedSaturatedShiftLeftUnsigned<s32>);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -421,23 +421,31 @@ TEST_CASE("A64: SQSHLU", "[a64]") {
|
||||
A64::Jit jit{jit_user_config};
|
||||
|
||||
oaknut::VectorCodeGenerator code{env.code_mem, nullptr};
|
||||
code.SQSHLU(V0.B16(), V1.B16(), 1);
|
||||
code.SQSHLU(V3.H8(), V4.H8(), 2);
|
||||
code.SQSHLU(V6.S4(), V7.S4(), 28);
|
||||
code.SQSHLU(V9.D2(), V10.D2(), 4);
|
||||
code.SQSHLU(V8.B16(), V0.B16(), 1);
|
||||
code.SQSHLU(V9.H8(), V1.H8(), 2);
|
||||
code.SQSHLU(V10.S4(), V2.S4(), 28);
|
||||
code.SQSHLU(V11.D2(), V3.D2(), 4);
|
||||
code.SQSHLU(V12.S4(), V0.S4(), 1);
|
||||
code.SQSHLU(V13.S4(), V1.S4(), 3);
|
||||
code.SQSHLU(V14.S4(), V2.S4(), 0);
|
||||
code.SQSHLU(V15.S4(), V3.S4(), 0);
|
||||
|
||||
jit.SetVector(1, Vector{0xffffffff'18ba6a6a, 0x7fffffff'943b954f});
|
||||
jit.SetVector(4, Vector{0x0000000b'0000000f, 0xffffffff'ffffffff});
|
||||
jit.SetVector(7, Vector{0x00000001'000000ff, 0x00000010'0000007f});
|
||||
jit.SetVector(10, Vector{0xffffffffffffffff, 0x96dc5c140705cd04});
|
||||
jit.SetVector(0, Vector{0xffffffff'18ba6a6a, 0x7fffffff'943b954f});
|
||||
jit.SetVector(1, Vector{0x0000000b'0000000f, 0xffffffff'ffffffff});
|
||||
jit.SetVector(2, Vector{0x00000001'000000ff, 0x00000010'0000007f});
|
||||
jit.SetVector(3, Vector{0xffffffffffffffff, 0x96dc5c140705cd04});
|
||||
|
||||
env.ticks_left = env.code_mem.size();
|
||||
CheckedRun([&]() { jit.Run(); });
|
||||
|
||||
CHECK(jit.GetVector(0) == Vector{0x3000d4d4, 0xfe0000000076009e});
|
||||
CHECK(jit.GetVector(3) == Vector{0x2c0000003c, 0});
|
||||
CHECK(jit.GetVector(6) == Vector{0x10000000'ffffffff, 0xffffffff'ffffffff});
|
||||
CHECK(jit.GetVector(9) == Vector{0, 0});
|
||||
CHECK(jit.GetVector(8) == Vector{0x3000d4d4, 0xfe0000000076009e});
|
||||
CHECK(jit.GetVector(9) == Vector{0x2c0000003c, 0});
|
||||
CHECK(jit.GetVector(10) == Vector{0x10000000'ffffffff, 0xffffffff'ffffffff});
|
||||
CHECK(jit.GetVector(11) == Vector{0, 0});
|
||||
CHECK(jit.GetVector(12) == Vector{0x3174d4d4, 0xfffffffe00000000});
|
||||
CHECK(jit.GetVector(13) == Vector{0x5800000078, 0});
|
||||
CHECK(jit.GetVector(14) == Vector{0x1000000ff, 0x100000007f});
|
||||
CHECK(jit.GetVector(15) == Vector{0, 0x705cd04});
|
||||
}
|
||||
|
||||
TEST_CASE("A64: XTN", "[a64]") {
|
||||
|
||||
Reference in New Issue
Block a user