mirror of
https://github.com/FEX-Emu/FEX.git
synced 2025-03-05 13:08:23 +00:00
AVX128: Minor optimization to 256-bit vpshufb
This commit is contained in:
parent
76f3391ebc
commit
cffae9cb0f
@ -1469,7 +1469,8 @@ private:
|
||||
|
||||
Ref PSADBWOpImpl(size_t Size, Ref Src1, Ref Src2);
|
||||
|
||||
Ref PSHUFBOpImpl(uint8_t SrcSize, Ref Src1, Ref Src2);
|
||||
Ref GeneratePSHUFBMask(uint8_t SrcSize);
|
||||
Ref PSHUFBOpImpl(uint8_t SrcSize, Ref Src1, Ref Src2, Ref MaskVector);
|
||||
|
||||
Ref PSIGNImpl(OpcodeArgs, size_t ElementSize, Ref Src1, Ref Src2);
|
||||
|
||||
|
@ -2012,8 +2012,9 @@ void OpDispatchBuilder::AVX128_VHSUBP(OpcodeArgs) {
|
||||
}
|
||||
|
||||
void OpDispatchBuilder::AVX128_VPSHUFB(OpcodeArgs) {
|
||||
auto MaskVector = GeneratePSHUFBMask(OpSize::i128Bit);
|
||||
AVX128_VectorBinaryImpl(Op, GetDstSize(Op), OpSize::i8Bit,
|
||||
[this](size_t, Ref Src1, Ref Src2) { return PSHUFBOpImpl(OpSize::i128Bit, Src1, Src2); });
|
||||
[this, MaskVector](size_t, Ref Src1, Ref Src2) { return PSHUFBOpImpl(OpSize::i128Bit, Src1, Src2, MaskVector); });
|
||||
}
|
||||
|
||||
void OpDispatchBuilder::AVX128_VPSADBW(OpcodeArgs) {
|
||||
|
@ -989,13 +989,7 @@ template void OpDispatchBuilder::VPUNPCKHOp<2>(OpcodeArgs);
|
||||
template void OpDispatchBuilder::VPUNPCKHOp<4>(OpcodeArgs);
|
||||
template void OpDispatchBuilder::VPUNPCKHOp<8>(OpcodeArgs);
|
||||
|
||||
Ref OpDispatchBuilder::PSHUFBOpImpl(uint8_t SrcSize, Ref Src1, Ref Src2) {
|
||||
const auto Is256Bit = SrcSize == Core::CPUState::XMM_AVX_REG_SIZE;
|
||||
|
||||
// We perform the 256-bit version as two 128-bit operations due to
|
||||
// the lane splitting behavior, so cap the maximum size at 16.
|
||||
const auto SanitizedSrcSize = std::min(SrcSize, uint8_t {16});
|
||||
|
||||
Ref OpDispatchBuilder::GeneratePSHUFBMask(uint8_t SrcSize) {
|
||||
// PSHUFB doesn't 100% match VTBL behaviour
|
||||
// VTBL will set the element zero if the index is greater than or equal to
|
||||
// the number of elements in the array
|
||||
@ -1006,7 +1000,16 @@ Ref OpDispatchBuilder::PSHUFBOpImpl(uint8_t SrcSize, Ref Src1, Ref Src2) {
|
||||
// Bits [6:3] are reserved for 64-bit
|
||||
const uint8_t MaskImm = SrcSize == 8 ? 0b1000'0111 : 0b1000'1111;
|
||||
|
||||
Ref MaskVector = _VectorImm(SrcSize, 1, MaskImm);
|
||||
return _VectorImm(SrcSize, 1, MaskImm);
|
||||
}
|
||||
|
||||
Ref OpDispatchBuilder::PSHUFBOpImpl(uint8_t SrcSize, Ref Src1, Ref Src2, Ref MaskVector) {
|
||||
const auto Is256Bit = SrcSize == Core::CPUState::XMM_AVX_REG_SIZE;
|
||||
|
||||
// We perform the 256-bit version as two 128-bit operations due to
|
||||
// the lane splitting behavior, so cap the maximum size at 16.
|
||||
const auto SanitizedSrcSize = std::min(SrcSize, uint8_t {16});
|
||||
|
||||
Ref MaskedIndices = _VAnd(SrcSize, SrcSize, Src2, MaskVector);
|
||||
|
||||
Ref Low = _VTBL1(SanitizedSrcSize, Src1, MaskedIndices);
|
||||
@ -1024,7 +1027,7 @@ void OpDispatchBuilder::PSHUFBOp(OpcodeArgs) {
|
||||
Ref Src1 = LoadSource(FPRClass, Op, Op->Dest, Op->Flags);
|
||||
Ref Src2 = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
|
||||
|
||||
Ref Result = PSHUFBOpImpl(SrcSize, Src1, Src2);
|
||||
Ref Result = PSHUFBOpImpl(SrcSize, Src1, Src2, GeneratePSHUFBMask(SrcSize));
|
||||
StoreResult(FPRClass, Op, Result, -1);
|
||||
}
|
||||
|
||||
@ -1033,7 +1036,7 @@ void OpDispatchBuilder::VPSHUFBOp(OpcodeArgs) {
|
||||
Ref Src1 = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
|
||||
Ref Src2 = LoadSource(FPRClass, Op, Op->Src[1], Op->Flags);
|
||||
|
||||
Ref Result = PSHUFBOpImpl(SrcSize, Src1, Src2);
|
||||
Ref Result = PSHUFBOpImpl(SrcSize, Src1, Src2, GeneratePSHUFBMask(SrcSize));
|
||||
StoreResult(FPRClass, Op, Result, -1);
|
||||
}
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user