OpcodeDispatcher: Optimizes SSE movmskps

This improves the implementation from 17 instructions down to 5 or 6,
depending on whether the host supports SVE.

I would say this is now optimal.
Ryan Houdek 2023-08-26 02:25:08 -07:00
parent 8d110738ac
commit 514a8223d9
4 changed files with 28 additions and 6 deletions
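
For context, the new sequence works by shifting each 32-bit lane's sign bit down to bit 0, shifting lane i left by i using the per-lane shift vector {0, 1, 2, 3}, then adding across the vector so the four sign bits land in bits [3:0] of the result. A minimal scalar sketch of that data flow (plain C++ rather than FEX IR; MovMskPS is an illustrative name, not a FEX function):

#include <array>
#include <cstdint>

// Scalar model of the new lowering: per-lane logical shift right by 31,
// per-lane shift left by the lane index, then an add-across reduction.
// Illustrative only; the real code emits vector IR ops over all four lanes.
uint32_t MovMskPS(const std::array<uint32_t, 4>& Lanes) {
  uint32_t Mask = 0;
  for (uint32_t i = 0; i < 4; ++i) {
    const uint32_t SignBit = Lanes[i] >> 31; // VUShrI: sign bit to bit 0
    Mask += SignBit << i;                    // VUShl by {0,1,2,3}, then VAddV
  }
  return Mask;
}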

@@ -14,6 +14,7 @@ constexpr static uint64_t NamedVectorConstants[FEXCore::IR::NamedVectorConstant:
   {0x0000'0000'8000'0000, 0x0000'0000'8000'0000}, // NAMED_VECTOR_PADDSUBPS_INVERT_UPPER
   {0x8000'0000'0000'0000, 0x0000'0000'0000'0000}, // NAMED_VECTOR_PADDSUBPD_INVERT
   {0x8000'0000'0000'0000, 0x0000'0000'0000'0000}, // NAMED_VECTOR_PADDSUBPD_INVERT_UPPER
+  {0x0000'0001'0000'0000, 0x0000'0003'0000'0002}, // NAMED_VECTOR_MOVMSKPS_SHIFT
 };
 constexpr static auto PSHUFLW_LUT {
@@ -130,12 +131,9 @@ CPUBackend::CPUBackend(FEXCore::Core::InternalThreadState *ThreadState, size_t I
   auto &Common = ThreadState->CurrentFrame->Pointers.Common;
   // Initialize named vector constants.
-  Common.NamedVectorConstantPointers[FEXCore::IR::NamedVectorConstant::NAMED_VECTOR_INCREMENTAL_U16_INDEX] = reinterpret_cast<uint64_t>(NamedVectorConstants[0]);
-  Common.NamedVectorConstantPointers[FEXCore::IR::NamedVectorConstant::NAMED_VECTOR_INCREMENTAL_U16_INDEX_UPPER] = reinterpret_cast<uint64_t>(NamedVectorConstants[1]);
-  Common.NamedVectorConstantPointers[FEXCore::IR::NamedVectorConstant::NAMED_VECTOR_PADDSUBPS_INVERT] = reinterpret_cast<uint64_t>(NamedVectorConstants[2]);
-  Common.NamedVectorConstantPointers[FEXCore::IR::NamedVectorConstant::NAMED_VECTOR_PADDSUBPS_INVERT_UPPER] = reinterpret_cast<uint64_t>(NamedVectorConstants[3]);
-  Common.NamedVectorConstantPointers[FEXCore::IR::NamedVectorConstant::NAMED_VECTOR_PADDSUBPD_INVERT] = reinterpret_cast<uint64_t>(NamedVectorConstants[4]);
-  Common.NamedVectorConstantPointers[FEXCore::IR::NamedVectorConstant::NAMED_VECTOR_PADDSUBPD_INVERT_UPPER] = reinterpret_cast<uint64_t>(NamedVectorConstants[5]);
+  for (size_t i = 0; i < FEXCore::IR::NamedVectorConstant::NAMED_VECTOR_MAX; ++i) {
+    Common.NamedVectorConstantPointers[i] = reinterpret_cast<uint64_t>(NamedVectorConstants[i]);
+  }
   // Initialize Indexed named vector constants.
   Common.IndexedNamedVectorConstantPointers[FEXCore::IR::IndexNamedVectorConstant::INDEXED_NAMED_VECTOR_PSHUFLW] = reinterpret_cast<uint64_t>(PSHUFLW_LUT.data());

@@ -81,6 +81,16 @@ DEF_OP(LoadNamedVectorConstant) {
   const auto Dst = GetVReg(Node);
+  if (HostSupportsSVE128) {
+    switch (Op->Constant) {
+      case FEXCore::IR::NamedVectorConstant::NAMED_VECTOR_MOVMSKPS_SHIFT:
+        index(ARMEmitter::SubRegSize::i32Bit, Dst.Z(), 0, 1);
+        return;
+      default:
+        // Intentionally doing nothing.
+        break;
+    }
+  }
   // Load the pointer.
   ldr(TMP1, STATE_PTR(CpuStateFrame, Pointers.Common.NamedVectorConstantPointers[Op->Constant]));
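
On SVE hosts the shift vector never touches memory: SVE's INDEX instruction materializes the arithmetic sequence start + i * step directly into the destination register, so the index(..., 0, 1) call above produces {0, 1, 2, 3} in a single instruction. A rough scalar model of that immediate form at 32-bit element size (illustrative C++; SveIndex32 is a made-up name, and the fixed four-lane width assumes the 128-bit case handled here):

#include <array>
#include <cstdint>

// Model of SVE INDEX (immediate form): Element[i] = Start + i * Step.
// With Start=0 and Step=1 this yields {0, 1, 2, 3}, exactly the
// NAMED_VECTOR_MOVMSKPS_SHIFT constant, with no load required.
std::array<uint32_t, 4> SveIndex32(int32_t Start, int32_t Step) {
  std::array<uint32_t, 4> Result{};
  for (uint32_t i = 0; i < 4; ++i) {
    Result[i] = static_cast<uint32_t>(Start + static_cast<int32_t>(i) * Step);
  }
  return Result;
}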

@@ -787,6 +787,19 @@ void OpDispatchBuilder::MOVMSKOp(OpcodeArgs) {
     GPR = _Lshr(GPR, _Constant(62));
     StoreResult_WithOpSize(GPRClass, Op, Op->Dest, GPR, CTX->GetGPRSize(), -1);
   }
+  else if (Size == 16 && ElementSize == 4) {
+    // Shift all the sign bits to the bottom of their respective elements.
+    Src = _VUShrI(Size, 4, Src, 31);
+    // Load the 128-bit movmskps shift vector.
+    auto ConstantUSHL = LoadAndCacheNamedVectorConstant(Size, NAMED_VECTOR_MOVMSKPS_SHIFT);
+    // Shift the sign bits into their final positions.
+    Src = _VUShl(Size, 4, Src, ConstantUSHL, false);
+    // Add across the vector so the sign bits end up in bits [3:0].
+    Src = _VAddV(Size, 4, Src);
+    // Extract to a GPR.
+    OrderedNode *GPR = _VExtractToGPR(Size, 4, Src, 0);
+    StoreResult_WithOpSize(GPRClass, Op, Op->Dest, GPR, CTX->GetGPRSize(), -1);
+  }
   else {
     OrderedNode *CurrentVal = _Constant(0);
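
As a concrete trace of the new path: for input lanes {-1.0f, 2.0f, -3.0f, 4.0f}, the shift right by 31 yields {1, 0, 1, 0}, the per-lane left shift yields {1, 0, 4, 0}, and the add-across produces 0b0101, the expected movmskps result. A self-contained check of that arithmetic (plain C++, illustrative only):

#include <array>
#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  // Lanes 0 and 2 are negative, so mask bits 0 and 2 must be set.
  const std::array<float, 4> Input{-1.0f, 2.0f, -3.0f, 4.0f};

  uint32_t Mask = 0;
  for (uint32_t i = 0; i < 4; ++i) {
    uint32_t Bits;
    std::memcpy(&Bits, &Input[i], sizeof(Bits)); // raw lane bits
    Bits >>= 31;  // _VUShrI by 31 -> {1, 0, 1, 0}
    Bits <<= i;   // _VUShl by {0, 1, 2, 3} -> {1, 0, 4, 0}
    Mask += Bits; // _VAddV: 1 + 0 + 4 + 0
  }
  assert(Mask == 0b0101);
  return 0;
}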

@@ -523,6 +523,7 @@ enum NamedVectorConstant : uint8_t {
   NAMED_VECTOR_PADDSUBPS_INVERT_UPPER,
   NAMED_VECTOR_PADDSUBPD_INVERT,
   NAMED_VECTOR_PADDSUBPD_INVERT_UPPER,
+  NAMED_VECTOR_MOVMSKPS_SHIFT,
   NAMED_VECTOR_MAX,
 };