AVX128: Implement support for vmovddup

This instruction is a little weird.
When accessing memory, the 128-bit operating size of the instruction
only loads 64 bits.
Meanwhile, the 256-bit operating size of the instruction fetches a full
256 bits.

Theoretically the hardware could get away with two 64-bit loads or a
wacky 24-byte load, but it looks like, to simplify the hardware, they just
spec'd it so that the 256-bit version always loads the full range.
This commit is contained in:
Ryan Houdek 2024-06-17 20:12:54 -07:00
parent dbaf95a8f3
commit 96aafb4f07
No known key found for this signature in database
2 changed files with 37 additions and 1 deletions

View File

@ -1006,6 +1006,7 @@ public:
void AVX128_MOVQ(OpcodeArgs);
void AVX128_VMOVLP(OpcodeArgs);
void AVX128_VMOVHP(OpcodeArgs);
// VMOVDDUP: duplicates the low 64-bit element of each 128-bit source lane across that lane.
// The 128-bit memory form loads only 8 bytes; the 256-bit memory form loads the full 32 bytes.
void AVX128_VMOVDDUP(OpcodeArgs);
// End of AVX 128-bit implementation

View File

@ -36,7 +36,7 @@ void OpDispatchBuilder::InstallAVX128Handlers() {
{OPD(1, 0b00, 0x12), 1, &OpDispatchBuilder::AVX128_VMOVLP},
{OPD(1, 0b01, 0x12), 1, &OpDispatchBuilder::AVX128_VMOVLP},
// TODO: {OPD(1, 0b10, 0x12), 1, &OpDispatchBuilder::VMOVSLDUPOp},
// TODO: {OPD(1, 0b11, 0x12), 1, &OpDispatchBuilder::VMOVDDUPOp},
{OPD(1, 0b11, 0x12), 1, &OpDispatchBuilder::AVX128_VMOVDDUP},
{OPD(1, 0b00, 0x13), 1, &OpDispatchBuilder::AVX128_VMOVLP},
{OPD(1, 0b01, 0x13), 1, &OpDispatchBuilder::AVX128_VMOVLP},
@ -710,4 +710,39 @@ void OpDispatchBuilder::AVX128_VMOVHP(OpcodeArgs) {
}
}
// Handles VMOVDDUP for the AVX128 path: the low 64-bit element of each 128-bit
// source lane is replicated across the matching destination lane. For the
// 128-bit operating size the upper destination lane is written as zero.
void OpDispatchBuilder::AVX128_VMOVDDUP(OpcodeArgs) {
  const bool Is128Bit = GetSrcSize(Op) == Core::CPUState::XMM_SSE_REG_SIZE;
  const bool HasUpperLane = !Is128Bit;
  const auto& Source = Op->Src[0];

  RefPair Input {};
  if (Is128Bit && !Source.IsGPR()) {
    // Memory operand at 128-bit operating size: only 8 bytes are read, so
    // load just the 64-bit element that gets duplicated.
    Input.Low = LoadSource_WithOpSize(FPRClass, Op, Source, OpSize::i64Bit, Op->Flags);
  } else {
    // Register operands, and the 256-bit memory form (which reads the full
    // 32 bytes), use the regular AVX128 load. The upper half is only
    // requested when the operation is wider than 128 bits.
    Input = AVX128_LoadSource_WithOpSize(Op, Source, Op->Flags, HasUpperLane);
  }

  // Broadcast Input.Low[63:0] across the low 128 bits of the result.
  Ref LowLane = _VDupElement(OpSize::i128Bit, OpSize::i64Bit, Input.Low, 0);

  Ref HighLane {};
  if (HasUpperLane) {
    // Broadcast Input.High[63:0] across the high 128 bits of the result.
    HighLane = _VDupElement(OpSize::i128Bit, OpSize::i64Bit, Input.High, 0);
  } else {
    // 128-bit operation: the upper 128 bits of the destination are zeroed.
    HighLane = LoadZeroVector(OpSize::i128Bit);
  }

  AVX128_StoreResult_WithOpSize(Op, Op->Dest, RefPair {.Low = LowLane, .High = HighLane});
}
} // namespace FEXCore::IR