Mirror of https://github.com/FEX-Emu/FEX.git, synced 2024-11-23 06:30:01 +00:00
Merge pull request #4149 from Sonicadvance1/iropsize_convert_class
Some checks failed
Build + Test / build_plus_test ([self-hosted ARMv8.0]) (push) Has been cancelled
Build + Test / build_plus_test ([self-hosted ARMv8.2]) (push) Has been cancelled
Build + Test / build_plus_test ([self-hosted ARMv8.4]) (push) Has been cancelled
GLIBC fault test / glibc_fault_test ([self-hosted ARM64]) (push) Has been cancelled
Hostrunner tests / hostrunner_tests ([self-hosted x64]) (push) Has been cancelled
Instruction Count CI run / instcountci_tests ([self-hosted ARM64]) (push) Has been cancelled
Instruction Count CI run / instcountci_tests ([self-hosted x64]) (push) Has been cancelled
Mingw build / mingw_build ([self-hosted ARM64 mingw]) (push) Has been cancelled
Mingw build / mingw_build ([self-hosted ARM64EC mingw ARM64]) (push) Has been cancelled
Vixl Simulator run / vixl_simulator ([self-hosted ARMv8.4]) (push) Has been cancelled
Vixl Simulator run / vixl_simulator ([self-hosted x64]) (push) Has been cancelled
IR: Convert OpSize over to enum class
This commit is contained in:
commit 5ad7fdb2f3
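For context, the change replaces the raw byte-count IROp->Size with a scoped enum. The helpers used throughout the hunks below (OpSizeToSize, SizeToOpSize, OpSizeAsBits, NumElements, DivideOpSize, plus the size arithmetic such as DstSize / 2 and JcxGPRSize >> 1) suggest an interface roughly like the sketch that follows. This is an illustrative reconstruction inferred from the diff, not the actual FEXCore header; the enumerator values and the operator overloads are assumptions.

#include <cstdint>

namespace FEXCore::IR {
// Sketch only: operation sizes keyed by their byte width, so converting to and
// from the raw sizes the IR previously stored is a plain cast.
enum class OpSize : uint8_t {
  i8Bit = 1,
  i16Bit = 2,
  i32Bit = 4,
  i64Bit = 8,
  i128Bit = 16,
  i256Bit = 32,
  iInvalid = 0xFF,
};

// Raw byte count of an operation size.
constexpr uint16_t OpSizeToSize(OpSize Size) {
  return static_cast<uint16_t>(Size);
}

// Wrap a raw byte count back into the enum.
constexpr OpSize SizeToOpSize(uint16_t Size) {
  return static_cast<OpSize>(Size);
}

// Width in bits, used for shift amounts and bitfield widths in the backends.
constexpr uint16_t OpSizeAsBits(OpSize Size) {
  return OpSizeToSize(Size) * 8;
}

// How many ElementSize-wide lanes fit in a RegisterSize-wide vector.
constexpr uint16_t NumElements(OpSize RegisterSize, OpSize ElementSize) {
  return OpSizeToSize(RegisterSize) / OpSizeToSize(ElementSize);
}

// Halve or divide a size, e.g. for address-size overridden operations.
constexpr OpSize DivideOpSize(OpSize Size, uint8_t Divisor) {
  return SizeToOpSize(OpSizeToSize(Size) / Divisor);
}
constexpr OpSize operator/(OpSize Size, uint8_t Divisor) {
  return DivideOpSize(Size, Divisor);
}
constexpr OpSize operator>>(OpSize Size, int Shift) {
  return SizeToOpSize(OpSizeToSize(Size) >> Shift);
}
} // namespace FEXCore::IR

Because scoped enums keep their ordering, comparisons such as IROp->Size >= IR::OpSize::i32Bit in the hunks below need no extra operators; only the division and shift arithmetic does.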
@@ -632,12 +632,12 @@ def print_ir_allocator_helpers():

output_file.write("\tIR::OpSize GetOpSize(const OrderedNode *Op) const {\n")
output_file.write("\t\tauto HeaderOp = Op->Header.Value.GetNode(DualListData.DataBegin());\n")
- output_file.write("\t\treturn IR::SizeToOpSize(HeaderOp->Size);\n")
+ output_file.write("\t\treturn HeaderOp->Size;\n")
output_file.write("\t}\n\n")

output_file.write("\tIR::OpSize GetOpElementSize(const OrderedNode *Op) const {\n")
output_file.write("\t\tauto HeaderOp = Op->Header.Value.GetNode(DualListData.DataBegin());\n")
- output_file.write("\t\treturn IR::SizeToOpSize(HeaderOp->ElementSize);\n")
+ output_file.write("\t\treturn HeaderOp->ElementSize;\n")
output_file.write("\t}\n\n")

output_file.write("\tuint8_t GetOpElements(const OrderedNode *Op) const {\n")
@@ -79,7 +79,7 @@ void InterpreterOps::FillFallbackIndexPointers(uint64_t* Info) {
}

bool InterpreterOps::GetFallbackHandler(bool SupportsPreserveAllABI, const IR::IROp_Header* IROp, FallbackInfo* Info) {
- uint8_t OpSize = IROp->Size;
+ const auto OpSize = IROp->Size;
switch (IROp->Op) {
case IR::OP_F80CVTTO: {
auto Op = IROp->C<IR::IROp_F80CVTTo>();
@@ -99,11 +99,11 @@ bool InterpreterOps::GetFallbackHandler(bool SupportsPreserveAllABI, const IR::I
}
case IR::OP_F80CVT: {
switch (OpSize) {
- case 4: {
+ case IR::OpSize::i32Bit: {
*Info = {FABI_F32_I16_F80, (void*)&FEXCore::CPU::OpHandlers<IR::OP_F80CVT>::handle4, Core::OPINDEX_F80CVT_4, SupportsPreserveAllABI};
return true;
}
- case 8: {
+ case IR::OpSize::i64Bit: {
*Info = {FABI_F64_I16_F80, (void*)&FEXCore::CPU::OpHandlers<IR::OP_F80CVT>::handle8, Core::OPINDEX_F80CVT_8, SupportsPreserveAllABI};
return true;
}
@@ -115,7 +115,7 @@ bool InterpreterOps::GetFallbackHandler(bool SupportsPreserveAllABI, const IR::I
auto Op = IROp->C<IR::IROp_F80CVTInt>();

switch (OpSize) {
- case 2: {
+ case IR::OpSize::i16Bit: {
if (Op->Truncate) {
*Info = {FABI_I16_I16_F80, (void*)&FEXCore::CPU::OpHandlers<IR::OP_F80CVTINT>::handle2t, Core::OPINDEX_F80CVTINT_TRUNC2,
SupportsPreserveAllABI};
@@ -124,7 +124,7 @@ bool InterpreterOps::GetFallbackHandler(bool SupportsPreserveAllABI, const IR::I
}
return true;
}
- case 4: {
+ case IR::OpSize::i32Bit: {
if (Op->Truncate) {
*Info = {FABI_I32_I16_F80, (void*)&FEXCore::CPU::OpHandlers<IR::OP_F80CVTINT>::handle4t, Core::OPINDEX_F80CVTINT_TRUNC4,
SupportsPreserveAllABI};
@@ -133,7 +133,7 @@ bool InterpreterOps::GetFallbackHandler(bool SupportsPreserveAllABI, const IR::I
}
return true;
}
- case 8: {
+ case IR::OpSize::i64Bit: {
if (Op->Truncate) {
*Info = {FABI_I64_I16_F80, (void*)&FEXCore::CPU::OpHandlers<IR::OP_F80CVTINT>::handle8t, Core::OPINDEX_F80CVTINT_TRUNC8,
SupportsPreserveAllABI};
@@ -54,8 +54,8 @@ DEF_OP(EntrypointOffset) {
auto Constant = Entry + Op->Offset;
auto Dst = GetReg(Node);
uint64_t Mask = ~0ULL;
- uint8_t OpSize = IROp->Size;
- if (OpSize == 4) {
+ const auto OpSize = IROp->Size;
+ if (OpSize == IR::OpSize::i32Bit) {
Mask = 0xFFFF'FFFFULL;
}
@@ -92,10 +92,10 @@ DEF_OP(AddNZCV) {

uint64_t Const;
if (IsInlineConstant(Op->Src2, &Const)) {
- LOGMAN_THROW_AA_FMT(IROp->Size >= 4, "Constant not allowed here");
+ LOGMAN_THROW_AA_FMT(IROp->Size >= IR::OpSize::i32Bit, "Constant not allowed here");
cmn(EmitSize, Src1, Const);
- } else if (IROp->Size < 4) {
- unsigned Shift = 32 - (8 * IROp->Size);
+ } else if (IROp->Size < IR::OpSize::i32Bit) {
+ unsigned Shift = 32 - IR::OpSizeAsBits(IROp->Size);

lsl(ARMEmitter::Size::i32Bit, TMP1, Src1, Shift);
cmn(EmitSize, TMP1, GetReg(Op->Src2.ID()), ARMEmitter::ShiftType::LSL, Shift);
@@ -165,7 +165,7 @@ DEF_OP(TestNZ) {
// Shift the sign bit into place, clearing out the garbage in upper bits.
// Adding zero does an effective test, setting NZ according to the result and
// zeroing CV.
- if (IROp->Size < 4) {
+ if (IROp->Size < IR::OpSize::i32Bit) {
// Cheaper to and+cmn than to lsl+lsl+tst, so do the and ourselves if
// needed.
if (Op->Src1 != Op->Src2) {
@@ -179,7 +179,7 @@ DEF_OP(TestNZ) {
Src1 = TMP1;
}

- unsigned Shift = 32 - (IROp->Size * 8);
+ unsigned Shift = 32 - IR::OpSizeAsBits(IROp->Size);
cmn(EmitSize, ARMEmitter::Reg::zr, Src1, ARMEmitter::ShiftType::LSL, Shift);
} else {
if (IsInlineConstant(Op->Src2, &Const)) {
@@ -193,11 +193,11 @@ DEF_OP(TestNZ) {

DEF_OP(TestZ) {
auto Op = IROp->C<IR::IROp_TestZ>();
- LOGMAN_THROW_AA_FMT(IROp->Size < 4, "TestNZ used at higher sizes");
+ LOGMAN_THROW_AA_FMT(IROp->Size < IR::OpSize::i32Bit, "TestNZ used at higher sizes");
const auto EmitSize = ARMEmitter::Size::i32Bit;

uint64_t Const;
- uint64_t Mask = IROp->Size == 8 ? ~0ULL : ((1ull << (IROp->Size * 8)) - 1);
+ uint64_t Mask = IROp->Size == IR::OpSize::i64Bit ? ~0ULL : ((1ull << IR::OpSizeAsBits(IROp->Size)) - 1);
auto Src1 = GetReg(Op->Src1.ID());

if (IsInlineConstant(Op->Src2, &Const)) {
@@ -223,25 +223,25 @@ DEF_OP(SubShift) {

DEF_OP(SubNZCV) {
auto Op = IROp->C<IR::IROp_SubNZCV>();
- const uint8_t OpSize = IROp->Size;
+ const auto OpSize = IROp->Size;
const auto EmitSize = ConvertSize(IROp);

uint64_t Const;
if (IsInlineConstant(Op->Src2, &Const)) {
- LOGMAN_THROW_AA_FMT(OpSize >= 4, "Constant not allowed here");
+ LOGMAN_THROW_AA_FMT(OpSize >= IR::OpSize::i32Bit, "Constant not allowed here");
cmp(EmitSize, GetReg(Op->Src1.ID()), Const);
} else {
- unsigned Shift = OpSize < 4 ? (32 - (8 * OpSize)) : 0;
+ unsigned Shift = OpSize < IR::OpSize::i32Bit ? (32 - IR::OpSizeAsBits(OpSize)) : 0;
ARMEmitter::Register ShiftedSrc1 = GetZeroableReg(Op->Src1);

// Shift to fix flags for <32-bit ops.
// Any shift of zero is still zero so optimize out silly zero shifts.
- if (OpSize < 4 && ShiftedSrc1 != ARMEmitter::Reg::zr) {
+ if (OpSize < IR::OpSize::i32Bit && ShiftedSrc1 != ARMEmitter::Reg::zr) {
lsl(ARMEmitter::Size::i32Bit, TMP1, ShiftedSrc1, Shift);
ShiftedSrc1 = TMP1;
}

- if (OpSize < 4) {
+ if (OpSize < IR::OpSize::i32Bit) {
cmp(EmitSize, ShiftedSrc1, GetReg(Op->Src2.ID()), ARMEmitter::ShiftType::LSL, Shift);
} else {
cmp(EmitSize, ShiftedSrc1, GetReg(Op->Src2.ID()));
@@ -286,10 +286,10 @@ DEF_OP(SetSmallNZV) {
auto Op = IROp->C<IR::IROp_SetSmallNZV>();
LOGMAN_THROW_A_FMT(CTX->HostFeatures.SupportsFlagM, "Unsupported flagm op");

- const uint8_t OpSize = IROp->Size;
- LOGMAN_THROW_AA_FMT(OpSize == 1 || OpSize == 2, "Unsupported {} size: {}", __func__, OpSize);
+ const auto OpSize = IROp->Size;
+ LOGMAN_THROW_AA_FMT(OpSize == IR::OpSize::i8Bit || OpSize == IR::OpSize::i16Bit, "Unsupported {} size: {}", __func__, OpSize);

- if (OpSize == 1) {
+ if (OpSize == IR::OpSize::i8Bit) {
setf8(GetReg(Op->Src.ID()).W());
} else {
setf16(GetReg(Op->Src.ID()).W());
@@ -401,20 +401,20 @@ DEF_OP(Div) {

// Each source is OpSize in size
// So you can have up to a 128bit divide from x86-64
- const uint8_t OpSize = IROp->Size;
+ const auto OpSize = IROp->Size;
const auto EmitSize = ConvertSize(IROp);

const auto Dst = GetReg(Node);
auto Src1 = GetReg(Op->Src1.ID());
auto Src2 = GetReg(Op->Src2.ID());

- if (OpSize == 1) {
+ if (OpSize == IR::OpSize::i8Bit) {
sxtb(EmitSize, TMP1, Src1);
sxtb(EmitSize, TMP2, Src2);

Src1 = TMP1;
Src2 = TMP2;
- } else if (OpSize == 2) {
+ } else if (OpSize == IR::OpSize::i16Bit) {
sxth(EmitSize, TMP1, Src1);
sxth(EmitSize, TMP2, Src2);

@@ -430,20 +430,20 @@ DEF_OP(UDiv) {

// Each source is OpSize in size
// So you can have up to a 128bit divide from x86-64
- const uint8_t OpSize = IROp->Size;
+ const auto OpSize = IROp->Size;
const auto EmitSize = ConvertSize(IROp);

const auto Dst = GetReg(Node);
auto Src1 = GetReg(Op->Src1.ID());
auto Src2 = GetReg(Op->Src2.ID());

- if (OpSize == 1) {
+ if (OpSize == IR::OpSize::i8Bit) {
uxtb(EmitSize, TMP1, Src1);
uxtb(EmitSize, TMP2, Src2);

Src1 = TMP1;
Src2 = TMP2;
- } else if (OpSize == 2) {
+ } else if (OpSize == IR::OpSize::i16Bit) {
uxth(EmitSize, TMP1, Src1);
uxth(EmitSize, TMP2, Src2);

@@ -458,20 +458,20 @@ DEF_OP(Rem) {
auto Op = IROp->C<IR::IROp_Rem>();
// Each source is OpSize in size
// So you can have up to a 128bit divide from x86-64
- const uint8_t OpSize = IROp->Size;
+ const auto OpSize = IROp->Size;
const auto EmitSize = ConvertSize(IROp);

const auto Dst = GetReg(Node);
auto Src1 = GetReg(Op->Src1.ID());
auto Src2 = GetReg(Op->Src2.ID());

- if (OpSize == 1) {
+ if (OpSize == IR::OpSize::i8Bit) {
sxtb(EmitSize, TMP1, Src1);
sxtb(EmitSize, TMP2, Src2);

Src1 = TMP1;
Src2 = TMP2;
- } else if (OpSize == 2) {
+ } else if (OpSize == IR::OpSize::i16Bit) {
sxth(EmitSize, TMP1, Src1);
sxth(EmitSize, TMP2, Src2);

@@ -487,20 +487,20 @@ DEF_OP(URem) {
auto Op = IROp->C<IR::IROp_URem>();
// Each source is OpSize in size
// So you can have up to a 128bit divide from x86-64
- const uint8_t OpSize = IROp->Size;
+ const auto OpSize = IROp->Size;
const auto EmitSize = ConvertSize(IROp);

const auto Dst = GetReg(Node);
auto Src1 = GetReg(Op->Src1.ID());
auto Src2 = GetReg(Op->Src2.ID());

- if (OpSize == 1) {
+ if (OpSize == IR::OpSize::i8Bit) {
uxtb(EmitSize, TMP1, Src1);
uxtb(EmitSize, TMP2, Src2);

Src1 = TMP1;
Src2 = TMP2;
- } else if (OpSize == 2) {
+ } else if (OpSize == IR::OpSize::i16Bit) {
uxth(EmitSize, TMP1, Src1);
uxth(EmitSize, TMP2, Src2);

@@ -514,15 +514,15 @@ DEF_OP(URem) {

DEF_OP(MulH) {
auto Op = IROp->C<IR::IROp_MulH>();
- const uint8_t OpSize = IROp->Size;
+ const auto OpSize = IROp->Size;

- LOGMAN_THROW_AA_FMT(OpSize == 4 || OpSize == 8, "Unsupported {} size: {}", __func__, OpSize);
+ LOGMAN_THROW_AA_FMT(OpSize == IR::OpSize::i32Bit || OpSize == IR::OpSize::i64Bit, "Unsupported {} size: {}", __func__, OpSize);

const auto Dst = GetReg(Node);
const auto Src1 = GetReg(Op->Src1.ID());
const auto Src2 = GetReg(Op->Src2.ID());

- if (OpSize == 4) {
+ if (OpSize == IR::OpSize::i32Bit) {
sxtw(TMP1, Src1.W());
sxtw(TMP2, Src2.W());
mul(ARMEmitter::Size::i32Bit, Dst, TMP1, TMP2);
@@ -534,15 +534,15 @@ DEF_OP(MulH) {

DEF_OP(UMulH) {
auto Op = IROp->C<IR::IROp_UMulH>();
- const uint8_t OpSize = IROp->Size;
+ const auto OpSize = IROp->Size;

- LOGMAN_THROW_AA_FMT(OpSize == 4 || OpSize == 8, "Unsupported {} size: {}", __func__, OpSize);
+ LOGMAN_THROW_AA_FMT(OpSize == IR::OpSize::i32Bit || OpSize == IR::OpSize::i64Bit, "Unsupported {} size: {}", __func__, OpSize);

const auto Dst = GetReg(Node);
const auto Src1 = GetReg(Op->Src1.ID());
const auto Src2 = GetReg(Op->Src2.ID());

- if (OpSize == 4) {
+ if (OpSize == IR::OpSize::i32Bit) {
uxtw(ARMEmitter::Size::i64Bit, TMP1, Src1);
uxtw(ARMEmitter::Size::i64Bit, TMP2, Src2);
mul(ARMEmitter::Size::i64Bit, Dst, TMP1, TMP2);
@@ -593,7 +593,7 @@ DEF_OP(Ornror) {

DEF_OP(AndWithFlags) {
auto Op = IROp->C<IR::IROp_AndWithFlags>();
- const uint8_t OpSize = IROp->Size;
+ const auto OpSize = IROp->Size;
const auto EmitSize = ConvertSize(IROp);

uint64_t Const;
@@ -601,7 +601,7 @@ DEF_OP(AndWithFlags) {
auto Src1 = GetReg(Op->Src1.ID());

// See TestNZ
- if (OpSize < 4) {
+ if (OpSize < IR::OpSize::i32Bit) {
if (IsInlineConstant(Op->Src2, &Const)) {
and_(EmitSize, Dst, Src1, Const);
} else {
@@ -614,7 +614,7 @@ DEF_OP(AndWithFlags) {
}
}

- unsigned Shift = 32 - (OpSize * 8);
+ unsigned Shift = 32 - IR::OpSizeAsBits(OpSize);
cmn(EmitSize, ARMEmitter::Reg::zr, Dst, ARMEmitter::ShiftType::LSL, Shift);
} else {
if (IsInlineConstant(Op->Src2, &Const)) {
@@ -648,21 +648,21 @@ DEF_OP(Ashr) {

uint64_t Const;
if (IsInlineConstant(Op->Src2, &Const)) {
- if (OpSize >= 4) {
+ if (OpSize >= IR::OpSize::i32Bit) {
asr(EmitSize, Dst, Src1, (unsigned int)Const);
} else {
- sbfx(EmitSize, TMP1, Src1, 0, OpSize * 8);
+ sbfx(EmitSize, TMP1, Src1, 0, IR::OpSizeAsBits(OpSize));
asr(EmitSize, Dst, TMP1, (unsigned int)Const);
- ubfx(EmitSize, Dst, Dst, 0, OpSize * 8);
+ ubfx(EmitSize, Dst, Dst, 0, IR::OpSizeAsBits(OpSize));
}
} else {
const auto Src2 = GetReg(Op->Src2.ID());
- if (OpSize >= 4) {
+ if (OpSize >= IR::OpSize::i32Bit) {
asrv(EmitSize, Dst, Src1, Src2);
} else {
- sbfx(EmitSize, TMP1, Src1, 0, OpSize * 8);
+ sbfx(EmitSize, TMP1, Src1, 0, IR::OpSizeAsBits(OpSize));
asrv(EmitSize, Dst, TMP1, Src2);
- ubfx(EmitSize, Dst, Dst, 0, OpSize * 8);
+ ubfx(EmitSize, Dst, Dst, 0, IR::OpSizeAsBits(OpSize));
}
}
}
@ -897,7 +897,7 @@ DEF_OP(PDep) {
|
||||
DEF_OP(PExt) {
|
||||
auto Op = IROp->C<IR::IROp_PExt>();
|
||||
const auto OpSize = IROp->Size;
|
||||
const auto OpSizeBitsM1 = (OpSize * 8) - 1;
|
||||
const auto OpSizeBitsM1 = IR::OpSizeAsBits(OpSize) - 1;
|
||||
const auto EmitSize = ConvertSize48(IROp);
|
||||
|
||||
const auto Input = GetReg(Op->Input.ID());
|
||||
@ -952,8 +952,8 @@ DEF_OP(PExt) {
|
||||
|
||||
DEF_OP(LDiv) {
|
||||
auto Op = IROp->C<IR::IROp_LDiv>();
|
||||
const uint8_t OpSize = IROp->Size;
|
||||
const auto EmitSize = OpSize >= 4 ? ARMEmitter::Size::i64Bit : ARMEmitter::Size::i32Bit;
|
||||
const auto OpSize = IROp->Size;
|
||||
const auto EmitSize = OpSize >= IR::OpSize::i32Bit ? ARMEmitter::Size::i64Bit : ARMEmitter::Size::i32Bit;
|
||||
|
||||
const auto Dst = GetReg(Node);
|
||||
const auto Upper = GetReg(Op->Upper.ID());
|
||||
@ -963,14 +963,14 @@ DEF_OP(LDiv) {
|
||||
// Each source is OpSize in size
|
||||
// So you can have up to a 128bit divide from x86-64
|
||||
switch (OpSize) {
|
||||
case 2: {
|
||||
case IR::OpSize::i16Bit: {
|
||||
uxth(EmitSize, TMP1, Lower);
|
||||
bfi(EmitSize, TMP1, Upper, 16, 16);
|
||||
sxth(EmitSize, TMP2, Divisor);
|
||||
sdiv(EmitSize, Dst, TMP1, TMP2);
|
||||
break;
|
||||
}
|
||||
case 4: {
|
||||
case IR::OpSize::i32Bit: {
|
||||
// TODO: 32-bit operation should be guaranteed not to leave garbage in the upper bits.
|
||||
mov(EmitSize, TMP1, Lower);
|
||||
bfi(EmitSize, TMP1, Upper, 32, 32);
|
||||
@ -978,7 +978,7 @@ DEF_OP(LDiv) {
|
||||
sdiv(EmitSize, Dst, TMP1, TMP2);
|
||||
break;
|
||||
}
|
||||
case 8: {
|
||||
case IR::OpSize::i64Bit: {
|
||||
ARMEmitter::SingleUseForwardLabel Only64Bit {};
|
||||
ARMEmitter::SingleUseForwardLabel LongDIVRet {};
|
||||
|
||||
@ -1022,8 +1022,8 @@ DEF_OP(LDiv) {
|
||||
|
||||
DEF_OP(LUDiv) {
|
||||
auto Op = IROp->C<IR::IROp_LUDiv>();
|
||||
const uint8_t OpSize = IROp->Size;
|
||||
const auto EmitSize = OpSize >= 4 ? ARMEmitter::Size::i64Bit : ARMEmitter::Size::i32Bit;
|
||||
const auto OpSize = IROp->Size;
|
||||
const auto EmitSize = OpSize >= IR::OpSize::i32Bit ? ARMEmitter::Size::i64Bit : ARMEmitter::Size::i32Bit;
|
||||
|
||||
const auto Dst = GetReg(Node);
|
||||
const auto Upper = GetReg(Op->Upper.ID());
|
||||
@ -1033,20 +1033,20 @@ DEF_OP(LUDiv) {
|
||||
// Each source is OpSize in size
|
||||
// So you can have up to a 128bit divide from x86-64=
|
||||
switch (OpSize) {
|
||||
case 2: {
|
||||
case IR::OpSize::i16Bit: {
|
||||
uxth(EmitSize, TMP1, Lower);
|
||||
bfi(EmitSize, TMP1, Upper, 16, 16);
|
||||
udiv(EmitSize, Dst, TMP1, Divisor);
|
||||
break;
|
||||
}
|
||||
case 4: {
|
||||
case IR::OpSize::i32Bit: {
|
||||
// TODO: 32-bit operation should be guaranteed not to leave garbage in the upper bits.
|
||||
mov(EmitSize, TMP1, Lower);
|
||||
bfi(EmitSize, TMP1, Upper, 32, 32);
|
||||
udiv(EmitSize, Dst, TMP1, Divisor);
|
||||
break;
|
||||
}
|
||||
case 8: {
|
||||
case IR::OpSize::i64Bit: {
|
||||
ARMEmitter::SingleUseForwardLabel Only64Bit {};
|
||||
ARMEmitter::SingleUseForwardLabel LongDIVRet {};
|
||||
|
||||
@ -1086,8 +1086,8 @@ DEF_OP(LUDiv) {
|
||||
|
||||
DEF_OP(LRem) {
|
||||
auto Op = IROp->C<IR::IROp_LRem>();
|
||||
const uint8_t OpSize = IROp->Size;
|
||||
const auto EmitSize = OpSize >= 4 ? ARMEmitter::Size::i64Bit : ARMEmitter::Size::i32Bit;
|
||||
const auto OpSize = IROp->Size;
|
||||
const auto EmitSize = OpSize >= IR::OpSize::i32Bit ? ARMEmitter::Size::i64Bit : ARMEmitter::Size::i32Bit;
|
||||
|
||||
const auto Dst = GetReg(Node);
|
||||
const auto Upper = GetReg(Op->Upper.ID());
|
||||
@ -1097,7 +1097,7 @@ DEF_OP(LRem) {
|
||||
// Each source is OpSize in size
|
||||
// So you can have up to a 128bit divide from x86-64
|
||||
switch (OpSize) {
|
||||
case 2: {
|
||||
case IR::OpSize::i16Bit: {
|
||||
uxth(EmitSize, TMP1, Lower);
|
||||
bfi(EmitSize, TMP1, Upper, 16, 16);
|
||||
sxth(EmitSize, TMP2, Divisor);
|
||||
@ -1105,7 +1105,7 @@ DEF_OP(LRem) {
|
||||
msub(EmitSize, Dst, TMP3, TMP2, TMP1);
|
||||
break;
|
||||
}
|
||||
case 4: {
|
||||
case IR::OpSize::i32Bit: {
|
||||
// TODO: 32-bit operation should be guaranteed not to leave garbage in the upper bits.
|
||||
mov(EmitSize, TMP1, Lower);
|
||||
bfi(EmitSize, TMP1, Upper, 32, 32);
|
||||
@ -1114,7 +1114,7 @@ DEF_OP(LRem) {
|
||||
msub(EmitSize, Dst, TMP2, TMP3, TMP1);
|
||||
break;
|
||||
}
|
||||
case 8: {
|
||||
case IR::OpSize::i64Bit: {
|
||||
ARMEmitter::SingleUseForwardLabel Only64Bit {};
|
||||
ARMEmitter::SingleUseForwardLabel LongDIVRet {};
|
||||
|
||||
@ -1160,8 +1160,8 @@ DEF_OP(LRem) {
|
||||
|
||||
DEF_OP(LURem) {
|
||||
auto Op = IROp->C<IR::IROp_LURem>();
|
||||
const uint8_t OpSize = IROp->Size;
|
||||
const auto EmitSize = OpSize >= 4 ? ARMEmitter::Size::i64Bit : ARMEmitter::Size::i32Bit;
|
||||
const auto OpSize = IROp->Size;
|
||||
const auto EmitSize = OpSize >= IR::OpSize::i32Bit ? ARMEmitter::Size::i64Bit : ARMEmitter::Size::i32Bit;
|
||||
|
||||
const auto Dst = GetReg(Node);
|
||||
const auto Upper = GetReg(Op->Upper.ID());
|
||||
@ -1171,14 +1171,14 @@ DEF_OP(LURem) {
|
||||
// Each source is OpSize in size
|
||||
// So you can have up to a 128bit divide from x86-64
|
||||
switch (OpSize) {
|
||||
case 2: {
|
||||
case IR::OpSize::i16Bit: {
|
||||
uxth(EmitSize, TMP1, Lower);
|
||||
bfi(EmitSize, TMP1, Upper, 16, 16);
|
||||
udiv(EmitSize, TMP2, TMP1, Divisor);
|
||||
msub(EmitSize, Dst, TMP2, Divisor, TMP1);
|
||||
break;
|
||||
}
|
||||
case 4: {
|
||||
case IR::OpSize::i32Bit: {
|
||||
// TODO: 32-bit operation should be guaranteed not to leave garbage in the upper bits.
|
||||
mov(EmitSize, TMP1, Lower);
|
||||
bfi(EmitSize, TMP1, Upper, 32, 32);
|
||||
@ -1186,7 +1186,7 @@ DEF_OP(LURem) {
|
||||
msub(EmitSize, Dst, TMP2, Divisor, TMP1);
|
||||
break;
|
||||
}
|
||||
case 8: {
|
||||
case IR::OpSize::i64Bit: {
|
||||
ARMEmitter::SingleUseForwardLabel Only64Bit {};
|
||||
ARMEmitter::SingleUseForwardLabel LongDIVRet {};
|
||||
|
||||
@ -1238,30 +1238,30 @@ DEF_OP(Not) {
|
||||
|
||||
DEF_OP(Popcount) {
|
||||
auto Op = IROp->C<IR::IROp_Popcount>();
|
||||
const uint8_t OpSize = IROp->Size;
|
||||
const auto OpSize = IROp->Size;
|
||||
|
||||
const auto Dst = GetReg(Node);
|
||||
const auto Src = GetReg(Op->Src.ID());
|
||||
|
||||
switch (OpSize) {
|
||||
case 0x1:
|
||||
case IR::OpSize::i8Bit:
|
||||
fmov(ARMEmitter::Size::i32Bit, VTMP1.S(), Src);
|
||||
// only use lowest byte
|
||||
cnt(ARMEmitter::SubRegSize::i8Bit, VTMP1.D(), VTMP1.D());
|
||||
break;
|
||||
case 0x2:
|
||||
case IR::OpSize::i16Bit:
|
||||
fmov(ARMEmitter::Size::i32Bit, VTMP1.S(), Src);
|
||||
cnt(ARMEmitter::SubRegSize::i8Bit, VTMP1.D(), VTMP1.D());
|
||||
// only count two lowest bytes
|
||||
addp(ARMEmitter::SubRegSize::i8Bit, VTMP1.D(), VTMP1.D(), VTMP1.D());
|
||||
break;
|
||||
case 0x4:
|
||||
case IR::OpSize::i32Bit:
|
||||
fmov(ARMEmitter::Size::i32Bit, VTMP1.S(), Src);
|
||||
cnt(ARMEmitter::SubRegSize::i8Bit, VTMP1.D(), VTMP1.D());
|
||||
// fmov has zero extended, unused bytes are zero
|
||||
addv(ARMEmitter::SubRegSize::i8Bit, VTMP1.D(), VTMP1.D());
|
||||
break;
|
||||
case 0x8:
|
||||
case IR::OpSize::i64Bit:
|
||||
fmov(ARMEmitter::Size::i64Bit, VTMP1.D(), Src);
|
||||
cnt(ARMEmitter::SubRegSize::i8Bit, VTMP1.D(), VTMP1.D());
|
||||
// fmov has zero extended, unused bytes are zero
|
||||
@ -1288,17 +1288,18 @@ DEF_OP(FindLSB) {
|
||||
|
||||
DEF_OP(FindMSB) {
|
||||
auto Op = IROp->C<IR::IROp_FindMSB>();
|
||||
const uint8_t OpSize = IROp->Size;
|
||||
const auto OpSize = IROp->Size;
|
||||
|
||||
LOGMAN_THROW_AA_FMT(OpSize == 2 || OpSize == 4 || OpSize == 8, "Unsupported {} size: {}", __func__, OpSize);
|
||||
LOGMAN_THROW_AA_FMT(OpSize == IR::OpSize::i16Bit || OpSize == IR::OpSize::i32Bit || OpSize == IR::OpSize::i64Bit,
|
||||
"Unsupported {} size: {}", __func__, OpSize);
|
||||
const auto EmitSize = ConvertSize(IROp);
|
||||
|
||||
const auto Dst = GetReg(Node);
|
||||
const auto Src = GetReg(Op->Src.ID());
|
||||
|
||||
movz(ARMEmitter::Size::i64Bit, TMP1, OpSize * 8 - 1);
|
||||
movz(ARMEmitter::Size::i64Bit, TMP1, IR::OpSizeAsBits(OpSize) - 1);
|
||||
|
||||
if (OpSize == 2) {
|
||||
if (OpSize == IR::OpSize::i16Bit) {
|
||||
lsl(EmitSize, Dst, Src, 16);
|
||||
clz(EmitSize, Dst, Dst);
|
||||
} else {
|
||||
@ -1310,9 +1311,10 @@ DEF_OP(FindMSB) {
|
||||
|
||||
DEF_OP(FindTrailingZeroes) {
|
||||
auto Op = IROp->C<IR::IROp_FindTrailingZeroes>();
|
||||
const uint8_t OpSize = IROp->Size;
|
||||
const auto OpSize = IROp->Size;
|
||||
|
||||
LOGMAN_THROW_AA_FMT(OpSize == 2 || OpSize == 4 || OpSize == 8, "Unsupported {} size: {}", __func__, OpSize);
|
||||
LOGMAN_THROW_AA_FMT(OpSize == IR::OpSize::i16Bit || OpSize == IR::OpSize::i32Bit || OpSize == IR::OpSize::i64Bit,
|
||||
"Unsupported {} size: {}", __func__, OpSize);
|
||||
const auto EmitSize = ConvertSize(IROp);
|
||||
|
||||
const auto Dst = GetReg(Node);
|
||||
@ -1320,7 +1322,7 @@ DEF_OP(FindTrailingZeroes) {
|
||||
|
||||
rbit(EmitSize, Dst, Src);
|
||||
|
||||
if (OpSize == 2) {
|
||||
if (OpSize == IR::OpSize::i16Bit) {
|
||||
// This orr does two things. First, if the (masked) source is zero, it
|
||||
// reverses to zero in the top so it forces clz to return 16. Second, it
|
||||
// ensures garbage in the upper bits of the source don't affect clz, because
|
||||
@ -1334,15 +1336,16 @@ DEF_OP(FindTrailingZeroes) {
|
||||
|
||||
DEF_OP(CountLeadingZeroes) {
|
||||
auto Op = IROp->C<IR::IROp_CountLeadingZeroes>();
|
||||
const uint8_t OpSize = IROp->Size;
|
||||
const auto OpSize = IROp->Size;
|
||||
|
||||
LOGMAN_THROW_AA_FMT(OpSize == 2 || OpSize == 4 || OpSize == 8, "Unsupported {} size: {}", __func__, OpSize);
|
||||
LOGMAN_THROW_AA_FMT(OpSize == IR::OpSize::i16Bit || OpSize == IR::OpSize::i32Bit || OpSize == IR::OpSize::i64Bit,
|
||||
"Unsupported {} size: {}", __func__, OpSize);
|
||||
const auto EmitSize = ConvertSize(IROp);
|
||||
|
||||
const auto Dst = GetReg(Node);
|
||||
const auto Src = GetReg(Op->Src.ID());
|
||||
|
||||
if (OpSize == 2) {
|
||||
if (OpSize == IR::OpSize::i16Bit) {
|
||||
// Expressing as lsl+orr+clz clears away any garbage in the upper bits
|
||||
// (alternatively could do uxth+clz+sub.. equal cost in total).
|
||||
lsl(EmitSize, Dst, Src, 16);
|
||||
@ -1355,16 +1358,17 @@ DEF_OP(CountLeadingZeroes) {
|
||||
|
||||
DEF_OP(Rev) {
|
||||
auto Op = IROp->C<IR::IROp_Rev>();
|
||||
const uint8_t OpSize = IROp->Size;
|
||||
const auto OpSize = IROp->Size;
|
||||
|
||||
LOGMAN_THROW_AA_FMT(OpSize == 2 || OpSize == 4 || OpSize == 8, "Unsupported {} size: {}", __func__, OpSize);
|
||||
LOGMAN_THROW_AA_FMT(OpSize == IR::OpSize::i16Bit || OpSize == IR::OpSize::i32Bit || OpSize == IR::OpSize::i64Bit,
|
||||
"Unsupported {} size: {}", __func__, OpSize);
|
||||
const auto EmitSize = ConvertSize(IROp);
|
||||
|
||||
const auto Dst = GetReg(Node);
|
||||
const auto Src = GetReg(Op->Src.ID());
|
||||
|
||||
rev(EmitSize, Dst, Src);
|
||||
if (OpSize == 2) {
|
||||
if (OpSize == IR::OpSize::i16Bit) {
|
||||
lsr(EmitSize, Dst, Dst, 16);
|
||||
}
|
||||
}
|
||||
@ -1390,10 +1394,10 @@ DEF_OP(Bfi) {
|
||||
mov(EmitSize, TMP1, SrcDst);
|
||||
bfi(EmitSize, TMP1, Src, Op->lsb, Op->Width);
|
||||
|
||||
if (IROp->Size >= 4) {
|
||||
if (IROp->Size >= IR::OpSize::i32Bit) {
|
||||
mov(EmitSize, Dst, TMP1.R());
|
||||
} else {
|
||||
ubfx(EmitSize, Dst, TMP1, 0, IROp->Size * 8);
|
||||
ubfx(EmitSize, Dst, TMP1, 0, IR::OpSizeAsBits(IROp->Size));
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -1424,7 +1428,7 @@ DEF_OP(Bfxil) {
|
||||
|
||||
DEF_OP(Bfe) {
|
||||
auto Op = IROp->C<IR::IROp_Bfe>();
|
||||
LOGMAN_THROW_AA_FMT(IROp->Size <= 8, "OpSize is too large for BFE: {}", IROp->Size);
|
||||
LOGMAN_THROW_AA_FMT(IROp->Size <= IR::OpSize::i64Bit, "OpSize is too large for BFE: {}", IROp->Size);
|
||||
LOGMAN_THROW_AA_FMT(Op->Width != 0, "Invalid BFE width of 0");
|
||||
const auto EmitSize = ConvertSize(IROp);
|
||||
|
||||
@ -1434,7 +1438,7 @@ DEF_OP(Bfe) {
|
||||
if (Op->lsb == 0 && Op->Width == 32) {
|
||||
mov(ARMEmitter::Size::i32Bit, Dst, Src);
|
||||
} else if (Op->lsb == 0 && Op->Width == 64) {
|
||||
LOGMAN_THROW_AA_FMT(IROp->Size == 8, "Must be 64-bit wide register");
|
||||
LOGMAN_THROW_AA_FMT(IROp->Size == IR::OpSize::i64Bit, "Must be 64-bit wide register");
|
||||
mov(ARMEmitter::Size::i64Bit, Dst, Src);
|
||||
} else {
|
||||
ubfx(EmitSize, Dst, Src, Op->lsb, Op->Width);
|
||||
@ -1451,7 +1455,7 @@ DEF_OP(Sbfe) {
|
||||
|
||||
DEF_OP(Select) {
|
||||
auto Op = IROp->C<IR::IROp_Select>();
|
||||
const uint8_t OpSize = IROp->Size;
|
||||
const auto OpSize = IROp->Size;
|
||||
const auto EmitSize = ConvertSize(IROp);
|
||||
const auto CompareEmitSize = Op->CompareSize == IR::OpSize::i64Bit ? ARMEmitter::Size::i64Bit : ARMEmitter::Size::i32Bit;
|
||||
|
||||
@ -1479,7 +1483,7 @@ DEF_OP(Select) {
|
||||
bool is_const_true = IsInlineConstant(Op->TrueVal, &const_true);
|
||||
bool is_const_false = IsInlineConstant(Op->FalseVal, &const_false);
|
||||
|
||||
uint64_t all_ones = OpSize == 8 ? 0xffff'ffff'ffff'ffffull : 0xffff'ffffull;
|
||||
uint64_t all_ones = OpSize == IR::OpSize::i64Bit ? 0xffff'ffff'ffff'ffffull : 0xffff'ffffull;
|
||||
|
||||
ARMEmitter::Register Dst = GetReg(Node);
|
||||
|
||||
@ -1508,7 +1512,7 @@ DEF_OP(NZCVSelect) {
|
||||
bool is_const_true = IsInlineConstant(Op->TrueVal, &const_true);
|
||||
bool is_const_false = IsInlineConstant(Op->FalseVal, &const_false);
|
||||
|
||||
uint64_t all_ones = IROp->Size == 8 ? 0xffff'ffff'ffff'ffffull : 0xffff'ffffull;
|
||||
uint64_t all_ones = IROp->Size == IR::OpSize::i64Bit ? 0xffff'ffff'ffff'ffffull : 0xffff'ffffull;
|
||||
|
||||
ARMEmitter::Register Dst = GetReg(Node);
|
||||
|
||||
@ -1547,7 +1551,7 @@ DEF_OP(VExtractToGPR) {
|
||||
|
||||
constexpr auto AVXRegBitSize = Core::CPUState::XMM_AVX_REG_SIZE * 8;
|
||||
constexpr auto SSERegBitSize = Core::CPUState::XMM_SSE_REG_SIZE * 8;
|
||||
const auto ElementSizeBits = Op->Header.ElementSize * 8;
|
||||
const auto ElementSizeBits = IR::OpSizeAsBits(Op->Header.ElementSize);
|
||||
|
||||
const auto Offset = ElementSizeBits * Op->Index;
|
||||
const auto Is256Bit = Offset >= SSERegBitSize;
|
||||
@ -1558,10 +1562,10 @@ DEF_OP(VExtractToGPR) {
|
||||
|
||||
const auto PerformMove = [&](const ARMEmitter::VRegister reg, int index) {
|
||||
switch (OpSize) {
|
||||
case 1: umov<ARMEmitter::SubRegSize::i8Bit>(Dst, Vector, index); break;
|
||||
case 2: umov<ARMEmitter::SubRegSize::i16Bit>(Dst, Vector, index); break;
|
||||
case 4: umov<ARMEmitter::SubRegSize::i32Bit>(Dst, Vector, index); break;
|
||||
case 8: umov<ARMEmitter::SubRegSize::i64Bit>(Dst, Vector, index); break;
|
||||
case IR::OpSize::i8Bit: umov<ARMEmitter::SubRegSize::i8Bit>(Dst, Vector, index); break;
|
||||
case IR::OpSize::i16Bit: umov<ARMEmitter::SubRegSize::i16Bit>(Dst, Vector, index); break;
|
||||
case IR::OpSize::i32Bit: umov<ARMEmitter::SubRegSize::i32Bit>(Dst, Vector, index); break;
|
||||
case IR::OpSize::i64Bit: umov<ARMEmitter::SubRegSize::i64Bit>(Dst, Vector, index); break;
|
||||
default: LOGMAN_MSG_A_FMT("Unhandled ExtractElementSize: {}", OpSize); break;
|
||||
}
|
||||
};
|
||||
@ -1586,10 +1590,10 @@ DEF_OP(VExtractToGPR) {
|
||||
// upper half of the vector.
|
||||
const auto SanitizedIndex = [OpSize, Op] {
|
||||
switch (OpSize) {
|
||||
case 1: return Op->Index - 16;
|
||||
case 2: return Op->Index - 8;
|
||||
case 4: return Op->Index - 4;
|
||||
case 8: return Op->Index - 2;
|
||||
case IR::OpSize::i8Bit: return Op->Index - 16;
|
||||
case IR::OpSize::i16Bit: return Op->Index - 8;
|
||||
case IR::OpSize::i32Bit: return Op->Index - 4;
|
||||
case IR::OpSize::i64Bit: return Op->Index - 2;
|
||||
default: LOGMAN_MSG_A_FMT("Unhandled OpSize: {}", OpSize); return 0;
|
||||
}
|
||||
}();
|
||||
|
@ -15,18 +15,18 @@ DEF_OP(VInsGPR) {
|
||||
|
||||
const auto DestIdx = Op->DestIdx;
|
||||
const auto ElementSize = Op->Header.ElementSize;
|
||||
const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE;
|
||||
const auto Is256Bit = OpSize == IR::OpSize::i256Bit;
|
||||
LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__);
|
||||
|
||||
const auto SubEmitSize = ConvertSubRegSize8(IROp);
|
||||
const auto ElementsPer128Bit = 16 / ElementSize;
|
||||
const auto ElementsPer128Bit = IR::NumElements(IR::OpSize::i128Bit, ElementSize);
|
||||
|
||||
const auto Dst = GetVReg(Node);
|
||||
const auto DestVector = GetVReg(Op->DestVector.ID());
|
||||
const auto Src = GetReg(Op->Src.ID());
|
||||
|
||||
if (HostSupportsSVE256 && Is256Bit) {
|
||||
const auto ElementSizeBits = ElementSize * 8;
|
||||
const auto ElementSizeBits = IR::OpSizeAsBits(ElementSize);
|
||||
const auto Offset = ElementSizeBits * DestIdx;
|
||||
|
||||
const auto SSEBitSize = Core::CPUState::XMM_SSE_REG_SIZE * 8;
|
||||
@ -90,16 +90,16 @@ DEF_OP(VCastFromGPR) {
|
||||
auto Src = GetReg(Op->Src.ID());
|
||||
|
||||
switch (Op->Header.ElementSize) {
|
||||
case 1:
|
||||
case IR::OpSize::i8Bit:
|
||||
uxtb(ARMEmitter::Size::i32Bit, TMP1, Src);
|
||||
fmov(ARMEmitter::Size::i32Bit, Dst.S(), TMP1);
|
||||
break;
|
||||
case 2:
|
||||
case IR::OpSize::i16Bit:
|
||||
uxth(ARMEmitter::Size::i32Bit, TMP1, Src);
|
||||
fmov(ARMEmitter::Size::i32Bit, Dst.S(), TMP1);
|
||||
break;
|
||||
case 4: fmov(ARMEmitter::Size::i32Bit, Dst.S(), Src); break;
|
||||
case 8: fmov(ARMEmitter::Size::i64Bit, Dst.D(), Src); break;
|
||||
case IR::OpSize::i32Bit: fmov(ARMEmitter::Size::i32Bit, Dst.S(), Src); break;
|
||||
case IR::OpSize::i64Bit: fmov(ARMEmitter::Size::i64Bit, Dst.D(), Src); break;
|
||||
default: LOGMAN_MSG_A_FMT("Unknown castGPR element size: {}", Op->Header.ElementSize);
|
||||
}
|
||||
}
|
||||
@ -111,7 +111,7 @@ DEF_OP(VDupFromGPR) {
|
||||
const auto Dst = GetVReg(Node);
|
||||
const auto Src = GetReg(Op->Src.ID());
|
||||
|
||||
const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE;
|
||||
const auto Is256Bit = OpSize == IR::OpSize::i256Bit;
|
||||
LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__);
|
||||
|
||||
const auto SubEmitSize = ConvertSubRegSize8(IROp);
|
||||
@ -126,7 +126,7 @@ DEF_OP(VDupFromGPR) {
|
||||
DEF_OP(Float_FromGPR_S) {
|
||||
const auto Op = IROp->C<IR::IROp_Float_FromGPR_S>();
|
||||
|
||||
const uint16_t ElementSize = Op->Header.ElementSize;
|
||||
const uint16_t ElementSize = IR::OpSizeToSize(Op->Header.ElementSize);
|
||||
const uint16_t Conv = (ElementSize << 8) | IR::OpSizeToSize(Op->SrcElementSize);
|
||||
|
||||
auto Dst = GetVReg(Node);
|
||||
@ -165,7 +165,7 @@ DEF_OP(Float_FromGPR_S) {
|
||||
|
||||
DEF_OP(Float_FToF) {
|
||||
auto Op = IROp->C<IR::IROp_Float_FToF>();
|
||||
const uint16_t Conv = (Op->Header.ElementSize << 8) | IR::OpSizeToSize(Op->SrcElementSize);
|
||||
const uint16_t Conv = (IR::OpSizeToSize(Op->Header.ElementSize) << 8) | IR::OpSizeToSize(Op->SrcElementSize);
|
||||
|
||||
auto Dst = GetVReg(Node);
|
||||
auto Src = GetVReg(Op->Scalar.ID());
|
||||
@ -205,7 +205,7 @@ DEF_OP(Vector_SToF) {
|
||||
|
||||
const auto ElementSize = Op->Header.ElementSize;
|
||||
const auto SubEmitSize = ConvertSubRegSize248(IROp);
|
||||
const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE;
|
||||
const auto Is256Bit = OpSize == IR::OpSize::i256Bit;
|
||||
LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__);
|
||||
|
||||
const auto Dst = GetVReg(Node);
|
||||
@ -215,15 +215,15 @@ DEF_OP(Vector_SToF) {
|
||||
scvtf(Dst.Z(), SubEmitSize, Mask.Merging(), Vector.Z(), SubEmitSize);
|
||||
} else {
|
||||
if (OpSize == ElementSize) {
|
||||
if (ElementSize == 8) {
|
||||
if (ElementSize == IR::OpSize::i64Bit) {
|
||||
scvtf(ARMEmitter::ScalarRegSize::i64Bit, Dst.D(), Vector.D());
|
||||
} else if (ElementSize == 4) {
|
||||
} else if (ElementSize == IR::OpSize::i32Bit) {
|
||||
scvtf(ARMEmitter::ScalarRegSize::i32Bit, Dst.S(), Vector.S());
|
||||
} else {
|
||||
scvtf(ARMEmitter::ScalarRegSize::i16Bit, Dst.H(), Vector.H());
|
||||
}
|
||||
} else {
|
||||
if (OpSize == 8) {
|
||||
if (OpSize == IR::OpSize::i64Bit) {
|
||||
scvtf(SubEmitSize, Dst.D(), Vector.D());
|
||||
} else {
|
||||
scvtf(SubEmitSize, Dst.Q(), Vector.Q());
|
||||
@ -238,7 +238,7 @@ DEF_OP(Vector_FToZS) {
|
||||
|
||||
const auto ElementSize = Op->Header.ElementSize;
|
||||
const auto SubEmitSize = ConvertSubRegSize248(IROp);
|
||||
const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE;
|
||||
const auto Is256Bit = OpSize == IR::OpSize::i256Bit;
|
||||
LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__);
|
||||
|
||||
const auto Dst = GetVReg(Node);
|
||||
@ -248,15 +248,15 @@ DEF_OP(Vector_FToZS) {
|
||||
fcvtzs(Dst.Z(), SubEmitSize, Mask.Merging(), Vector.Z(), SubEmitSize);
|
||||
} else {
|
||||
if (OpSize == ElementSize) {
|
||||
if (ElementSize == 8) {
|
||||
if (ElementSize == IR::OpSize::i64Bit) {
|
||||
fcvtzs(ARMEmitter::ScalarRegSize::i64Bit, Dst.D(), Vector.D());
|
||||
} else if (ElementSize == 4) {
|
||||
} else if (ElementSize == IR::OpSize::i32Bit) {
|
||||
fcvtzs(ARMEmitter::ScalarRegSize::i32Bit, Dst.S(), Vector.S());
|
||||
} else {
|
||||
fcvtzs(ARMEmitter::ScalarRegSize::i16Bit, Dst.H(), Vector.H());
|
||||
}
|
||||
} else {
|
||||
if (OpSize == 8) {
|
||||
if (OpSize == IR::OpSize::i64Bit) {
|
||||
fcvtzs(SubEmitSize, Dst.D(), Vector.D());
|
||||
} else {
|
||||
fcvtzs(SubEmitSize, Dst.Q(), Vector.Q());
|
||||
@ -269,7 +269,7 @@ DEF_OP(Vector_FToS) {
|
||||
const auto Op = IROp->C<IR::IROp_Vector_FToS>();
|
||||
const auto OpSize = IROp->Size;
|
||||
|
||||
const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE;
|
||||
const auto Is256Bit = OpSize == IR::OpSize::i256Bit;
|
||||
LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__);
|
||||
|
||||
const auto SubEmitSize = ConvertSubRegSize248(IROp);
|
||||
@ -284,7 +284,7 @@ DEF_OP(Vector_FToS) {
|
||||
} else {
|
||||
const auto Dst = GetVReg(Node);
|
||||
const auto Vector = GetVReg(Op->Vector.ID());
|
||||
if (OpSize == 8) {
|
||||
if (OpSize == IR::OpSize::i64Bit) {
|
||||
frinti(SubEmitSize, Dst.D(), Vector.D());
|
||||
fcvtzs(SubEmitSize, Dst.D(), Dst.D());
|
||||
} else {
|
||||
@ -300,10 +300,10 @@ DEF_OP(Vector_FToF) {
|
||||
|
||||
const auto ElementSize = Op->Header.ElementSize;
|
||||
const auto SubEmitSize = ConvertSubRegSize248(IROp);
|
||||
const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE;
|
||||
const auto Is256Bit = OpSize == IR::OpSize::i256Bit;
|
||||
LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__);
|
||||
|
||||
const auto Conv = (ElementSize << 8) | IR::OpSizeToSize(Op->SrcElementSize);
|
||||
const auto Conv = (IR::OpSizeToSize(ElementSize) << 8) | IR::OpSizeToSize(Op->SrcElementSize);
|
||||
|
||||
const auto Dst = GetVReg(Node);
|
||||
const auto Vector = GetVReg(Op->Vector.ID());
|
||||
@ -403,7 +403,7 @@ DEF_OP(Vector_FToI) {
|
||||
|
||||
const auto ElementSize = Op->Header.ElementSize;
|
||||
const auto SubEmitSize = ConvertSubRegSize248(IROp);
|
||||
const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE;
|
||||
const auto Is256Bit = OpSize == IR::OpSize::i256Bit;
|
||||
LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__);
|
||||
|
||||
const auto Dst = GetVReg(Node);
|
||||
@ -427,15 +427,15 @@ DEF_OP(Vector_FToI) {
|
||||
// frinti having AdvSIMD, AdvSIMD scalar, and an SVE version),
|
||||
// we can't just use a lambda without some seriously ugly casting.
|
||||
// This is fairly self-contained otherwise.
|
||||
#define ROUNDING_FN(name) \
|
||||
if (ElementSize == 2) { \
|
||||
name(Dst.H(), Vector.H()); \
|
||||
} else if (ElementSize == 4) { \
|
||||
name(Dst.S(), Vector.S()); \
|
||||
} else if (ElementSize == 8) { \
|
||||
name(Dst.D(), Vector.D()); \
|
||||
} else { \
|
||||
FEX_UNREACHABLE; \
|
||||
#define ROUNDING_FN(name) \
|
||||
if (ElementSize == IR::OpSize::i16Bit) { \
|
||||
name(Dst.H(), Vector.H()); \
|
||||
} else if (ElementSize == IR::OpSize::i32Bit) { \
|
||||
name(Dst.S(), Vector.S()); \
|
||||
} else if (ElementSize == IR::OpSize::i64Bit) { \
|
||||
name(Dst.D(), Vector.D()); \
|
||||
} else { \
|
||||
FEX_UNREACHABLE; \
|
||||
}
|
||||
|
||||
switch (Op->Round) {
|
||||
@ -464,7 +464,7 @@ DEF_OP(Vector_F64ToI32) {
|
||||
const auto OpSize = IROp->Size;
|
||||
const auto Round = Op->Round;
|
||||
|
||||
const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE;
|
||||
const auto Is256Bit = OpSize == IR::OpSize::i256Bit;
|
||||
LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__);
|
||||
|
||||
const auto Dst = GetVReg(Node);
|
||||
|
@ -24,7 +24,7 @@ DEF_OP(VAESEnc) {
|
||||
const auto State = GetVReg(Op->State.ID());
|
||||
const auto ZeroReg = GetVReg(Op->ZeroReg.ID());
|
||||
|
||||
LOGMAN_THROW_AA_FMT(OpSize == Core::CPUState::XMM_SSE_REG_SIZE, "Currently only supports 128-bit operations.");
|
||||
LOGMAN_THROW_AA_FMT(OpSize == IR::OpSize::i128Bit, "Currently only supports 128-bit operations.");
|
||||
|
||||
if (Dst == State && Dst != Key) {
|
||||
// Optimal case in which Dst already contains the starting state.
|
||||
@ -49,7 +49,7 @@ DEF_OP(VAESEncLast) {
|
||||
const auto State = GetVReg(Op->State.ID());
|
||||
const auto ZeroReg = GetVReg(Op->ZeroReg.ID());
|
||||
|
||||
LOGMAN_THROW_AA_FMT(OpSize == Core::CPUState::XMM_SSE_REG_SIZE, "Currently only supports 128-bit operations.");
|
||||
LOGMAN_THROW_AA_FMT(OpSize == IR::OpSize::i128Bit, "Currently only supports 128-bit operations.");
|
||||
|
||||
if (Dst == State && Dst != Key) {
|
||||
// Optimal case in which Dst already contains the starting state.
|
||||
@ -72,7 +72,7 @@ DEF_OP(VAESDec) {
|
||||
const auto State = GetVReg(Op->State.ID());
|
||||
const auto ZeroReg = GetVReg(Op->ZeroReg.ID());
|
||||
|
||||
LOGMAN_THROW_AA_FMT(OpSize == Core::CPUState::XMM_SSE_REG_SIZE, "Currently only supports 128-bit operations.");
|
||||
LOGMAN_THROW_AA_FMT(OpSize == IR::OpSize::i128Bit, "Currently only supports 128-bit operations.");
|
||||
|
||||
if (Dst == State && Dst != Key) {
|
||||
// Optimal case in which Dst already contains the starting state.
|
||||
@ -97,7 +97,7 @@ DEF_OP(VAESDecLast) {
|
||||
const auto State = GetVReg(Op->State.ID());
|
||||
const auto ZeroReg = GetVReg(Op->ZeroReg.ID());
|
||||
|
||||
LOGMAN_THROW_AA_FMT(OpSize == Core::CPUState::XMM_SSE_REG_SIZE, "Currently only supports 128-bit operations.");
|
||||
LOGMAN_THROW_AA_FMT(OpSize == IR::OpSize::i128Bit, "Currently only supports 128-bit operations.");
|
||||
|
||||
if (Dst == State && Dst != Key) {
|
||||
// Optimal case in which Dst already contains the starting state.
|
||||
@ -193,7 +193,7 @@ DEF_OP(PCLMUL) {
|
||||
const auto Src1 = GetVReg(Op->Src1.ID());
|
||||
const auto Src2 = GetVReg(Op->Src2.ID());
|
||||
|
||||
LOGMAN_THROW_AA_FMT(OpSize == Core::CPUState::XMM_SSE_REG_SIZE, "Currently only supports 128-bit operations.");
|
||||
LOGMAN_THROW_AA_FMT(OpSize == IR::OpSize::i128Bit, "Currently only supports 128-bit operations.");
|
||||
|
||||
switch (Op->Selector) {
|
||||
case 0b00000000: pmull(ARMEmitter::SubRegSize::i128Bit, Dst.D(), Src1.D(), Src2.D()); break;
|
||||
|
@ -228,7 +228,7 @@ private:
|
||||
bool IsGPR(IR::NodeID Node) const;
|
||||
|
||||
[[nodiscard]]
|
||||
ARMEmitter::ExtendedMemOperand GenerateMemOperand(uint8_t AccessSize, ARMEmitter::Register Base, IR::OrderedNodeWrapper Offset,
|
||||
ARMEmitter::ExtendedMemOperand GenerateMemOperand(IR::OpSize AccessSize, ARMEmitter::Register Base, IR::OrderedNodeWrapper Offset,
|
||||
IR::MemOffsetType OffsetType, uint8_t OffsetScale);
|
||||
|
||||
// NOTE: Will use TMP1 as a way to encode immediates that happen to fall outside
|
||||
@ -237,7 +237,7 @@ private:
|
||||
// TMP1 is safe to use again once this memory operand is used with its
|
||||
// equivalent loads or stores that this was called for.
|
||||
[[nodiscard]]
|
||||
ARMEmitter::SVEMemOperand GenerateSVEMemOperand(uint8_t AccessSize, ARMEmitter::Register Base, IR::OrderedNodeWrapper Offset,
|
||||
ARMEmitter::SVEMemOperand GenerateSVEMemOperand(IR::OpSize AccessSize, ARMEmitter::Register Base, IR::OrderedNodeWrapper Offset,
|
||||
IR::MemOffsetType OffsetType, uint8_t OffsetScale);
|
||||
|
||||
[[nodiscard]]
|
||||
@ -318,15 +318,16 @@ private:
|
||||
|
||||
using ScalarFMAOpCaller =
|
||||
std::function<void(ARMEmitter::VRegister Dst, ARMEmitter::VRegister Src1, ARMEmitter::VRegister Src2, ARMEmitter::VRegister Src3)>;
|
||||
void VFScalarFMAOperation(uint8_t OpSize, uint8_t ElementSize, ScalarFMAOpCaller ScalarEmit, ARMEmitter::VRegister Dst,
|
||||
void VFScalarFMAOperation(IR::OpSize OpSize, IR::OpSize ElementSize, ScalarFMAOpCaller ScalarEmit, ARMEmitter::VRegister Dst,
|
||||
ARMEmitter::VRegister Upper, ARMEmitter::VRegister Vector1, ARMEmitter::VRegister Vector2,
|
||||
ARMEmitter::VRegister Addend);
|
||||
using ScalarBinaryOpCaller = std::function<void(ARMEmitter::VRegister Dst, ARMEmitter::VRegister Src1, ARMEmitter::VRegister Src2)>;
|
||||
void VFScalarOperation(uint8_t OpSize, uint8_t ElementSize, bool ZeroUpperBits, ScalarBinaryOpCaller ScalarEmit,
|
||||
void VFScalarOperation(IR::OpSize OpSize, IR::OpSize ElementSize, bool ZeroUpperBits, ScalarBinaryOpCaller ScalarEmit,
|
||||
ARMEmitter::VRegister Dst, ARMEmitter::VRegister Vector1, ARMEmitter::VRegister Vector2);
|
||||
using ScalarUnaryOpCaller = std::function<void(ARMEmitter::VRegister Dst, std::variant<ARMEmitter::VRegister, ARMEmitter::Register> SrcVar)>;
|
||||
void VFScalarUnaryOperation(uint8_t OpSize, uint8_t ElementSize, bool ZeroUpperBits, ScalarUnaryOpCaller ScalarEmit, ARMEmitter::VRegister Dst,
|
||||
ARMEmitter::VRegister Vector1, std::variant<ARMEmitter::VRegister, ARMEmitter::Register> Vector2);
|
||||
void VFScalarUnaryOperation(IR::OpSize OpSize, IR::OpSize ElementSize, bool ZeroUpperBits, ScalarUnaryOpCaller ScalarEmit,
|
||||
ARMEmitter::VRegister Dst, ARMEmitter::VRegister Vector1,
|
||||
std::variant<ARMEmitter::VRegister, ARMEmitter::Register> Vector2);
|
||||
|
||||
void Emulate128BitGather(IR::OpSize Size, IR::OpSize ElementSize, ARMEmitter::VRegister Dst, ARMEmitter::VRegister IncomingDst,
|
||||
std::optional<ARMEmitter::Register> BaseAddr, ARMEmitter::VRegister VectorIndexLow,
|
||||
|
Two file diffs suppressed because they are too large.
@ -168,7 +168,7 @@ void OpDispatchBuilder::RETOp(OpcodeArgs) {
|
||||
|
||||
if (Op->OP == 0xC2) {
|
||||
auto Offset = LoadSource(GPRClass, Op, Op->Src[0], Op->Flags);
|
||||
SP = _Add(IR::SizeToOpSize(GPRSize), SP, Offset);
|
||||
SP = _Add(GPRSize, SP, Offset);
|
||||
}
|
||||
|
||||
// Store the new stack pointer
|
||||
@ -297,7 +297,7 @@ void OpDispatchBuilder::ADCOp(OpcodeArgs, uint32_t SrcIndex) {
|
||||
HandledLock = true;
|
||||
|
||||
Ref DestMem = MakeSegmentAddress(Op, Op->Dest);
|
||||
Before = _AtomicFetchAdd(IR::SizeToOpSize(Size), ALUOp, DestMem);
|
||||
Before = _AtomicFetchAdd(Size, ALUOp, DestMem);
|
||||
} else {
|
||||
Before = LoadSource(GPRClass, Op, Op->Dest, Op->Flags, {.AllowUpperGarbage = true});
|
||||
}
|
||||
@ -334,7 +334,7 @@ void OpDispatchBuilder::SBBOp(OpcodeArgs, uint32_t SrcIndex) {
|
||||
|
||||
Ref DestMem = MakeSegmentAddress(Op, Op->Dest);
|
||||
auto SrcPlusCF = IncrementByCarry(OpSize, Src);
|
||||
Before = _AtomicFetchSub(IR::SizeToOpSize(Size), SrcPlusCF, DestMem);
|
||||
Before = _AtomicFetchSub(Size, SrcPlusCF, DestMem);
|
||||
} else {
|
||||
Before = LoadSource(GPRClass, Op, Op->Dest, Op->Flags, {.AllowUpperGarbage = true});
|
||||
}
|
||||
@ -494,7 +494,7 @@ void OpDispatchBuilder::POPAOp(OpcodeArgs) {
|
||||
StoreGPRRegister(X86State::REG_RBP, Pop(Size, SP), Size);
|
||||
|
||||
// Skip loading RSP because it'll be correct at the end
|
||||
SP = _RMWHandle(_Add(OpSize::i64Bit, SP, _InlineConstant(Size)));
|
||||
SP = _RMWHandle(_Add(OpSize::i64Bit, SP, _InlineConstant(IR::OpSizeToSize(Size))));
|
||||
|
||||
StoreGPRRegister(X86State::REG_RBX, Pop(Size, SP), Size);
|
||||
StoreGPRRegister(X86State::REG_RDX, Pop(Size, SP), Size);
|
||||
@ -567,7 +567,7 @@ void OpDispatchBuilder::CALLOp(OpcodeArgs) {
|
||||
uint64_t InstRIP = Op->PC + Op->InstSize;
|
||||
uint64_t TargetRIP = InstRIP + TargetOffset;
|
||||
|
||||
Ref NewRIP = _Add(IR::SizeToOpSize(GPRSize), ConstantPC, _Constant(TargetOffset));
|
||||
Ref NewRIP = _Add(GPRSize, ConstantPC, _Constant(TargetOffset));
|
||||
|
||||
// Push the return address.
|
||||
Push(GPRSize, ConstantPC);
|
||||
@ -715,7 +715,7 @@ void OpDispatchBuilder::CMOVOp(OpcodeArgs) {
|
||||
Src = LoadSource(GPRClass, Op, Op->Src[0], Op->Flags);
|
||||
}
|
||||
|
||||
auto SrcCond = SelectCC(Op->OP & 0xF, IR::SizeToOpSize(std::max<uint8_t>(OpSize::i32Bit, GetSrcSize(Op))), Src, Dest);
|
||||
auto SrcCond = SelectCC(Op->OP & 0xF, std::max(OpSize::i32Bit, OpSizeFromSrc(Op)), Src, Dest);
|
||||
|
||||
StoreResult(GPRClass, Op, SrcCond, OpSize::iInvalid);
|
||||
}
|
||||
@ -731,7 +731,7 @@ void OpDispatchBuilder::CondJUMPOp(OpcodeArgs) {
|
||||
uint64_t InstRIP = Op->PC + Op->InstSize;
|
||||
uint64_t Target = InstRIP + TargetOffset;
|
||||
|
||||
if (CTX->GetGPRSize() == OpSize::i32Bit) {
|
||||
if (CTX->GetGPROpSize() == OpSize::i32Bit) {
|
||||
// If the GPRSize is 4 then we need to be careful about PC wrapping
|
||||
if (TargetOffset < 0 && -TargetOffset > InstRIP) {
|
||||
// Invert the signed value if we are underflowing
|
||||
@ -802,7 +802,7 @@ void OpDispatchBuilder::CondJUMPRCXOp(OpcodeArgs) {
|
||||
|
||||
BlockSetRIP = true;
|
||||
auto JcxGPRSize = CTX->GetGPROpSize();
|
||||
JcxGPRSize = (Op->Flags & X86Tables::DecodeFlags::FLAG_ADDRESS_SIZE) ? (IR::DivideOpSize(JcxGPRSize, 2)) : JcxGPRSize;
|
||||
JcxGPRSize = (Op->Flags & X86Tables::DecodeFlags::FLAG_ADDRESS_SIZE) ? (JcxGPRSize >> 1) : JcxGPRSize;
|
||||
|
||||
uint64_t Target = Op->PC + Op->InstSize + Op->Src[0].Literal();
|
||||
|
||||
@ -937,7 +937,7 @@ void OpDispatchBuilder::JUMPOp(OpcodeArgs) {
|
||||
uint64_t InstRIP = Op->PC + Op->InstSize;
|
||||
uint64_t TargetRIP = InstRIP + TargetOffset;
|
||||
|
||||
if (CTX->GetGPRSize() == OpSize::i32Bit) {
|
||||
if (CTX->GetGPROpSize() == OpSize::i32Bit) {
|
||||
// If the GPRSize is 4 then we need to be careful about PC wrapping
|
||||
if (TargetOffset < 0 && -TargetOffset > InstRIP) {
|
||||
// Invert the signed value if we are underflowing
|
||||
@ -1000,18 +1000,18 @@ void OpDispatchBuilder::TESTOp(OpcodeArgs, uint32_t SrcIndex) {
|
||||
Ref Src = LoadSource(GPRClass, Op, Op->Src[SrcIndex], Op->Flags, {.AllowUpperGarbage = true});
|
||||
Ref Dest = LoadSource(GPRClass, Op, Op->Dest, Op->Flags, {.AllowUpperGarbage = true});
|
||||
|
||||
auto Size = GetDstSize(Op);
|
||||
const auto Size = OpSizeFromDst(Op);
|
||||
|
||||
uint64_t Const;
|
||||
bool AlwaysNonnegative = false;
|
||||
if (IsValueConstant(WrapNode(Src), &Const)) {
|
||||
// Optimize out masking constants
|
||||
if (Const == (Size == OpSize::i64Bit ? ~0ULL : ((1ull << Size * 8) - 1))) {
|
||||
if (Const == (Size == OpSize::i64Bit ? ~0ULL : ((1ull << IR::OpSizeAsBits(Size)) - 1))) {
|
||||
Src = Dest;
|
||||
}
|
||||
|
||||
// Optimize test with non-sign bits
|
||||
AlwaysNonnegative = (Const & (1ull << ((Size * 8) - 1))) == 0;
|
||||
AlwaysNonnegative = (Const & (1ull << (IR::OpSizeAsBits(Size) - 1))) == 0;
|
||||
}
|
||||
|
||||
if (Dest == Src) {
|
||||
@ -1024,7 +1024,7 @@ void OpDispatchBuilder::TESTOp(OpcodeArgs, uint32_t SrcIndex) {
|
||||
SetNZ_ZeroCV(OpSize::i32Bit, Res);
|
||||
} else {
|
||||
HandleNZ00Write();
|
||||
CalculatePF(_AndWithFlags(IR::SizeToOpSize(Size), Dest, Src));
|
||||
CalculatePF(_AndWithFlags(Size, Dest, Src));
|
||||
}
|
||||
|
||||
InvalidateAF();
|
||||
@ -1049,7 +1049,7 @@ void OpDispatchBuilder::MOVSXDOp(OpcodeArgs) {
|
||||
StoreResult_WithOpSize(GPRClass, Op, Op->Dest, Src, Size, OpSize::iInvalid);
|
||||
} else if (Sext) {
|
||||
// With REX.W then Sext
|
||||
Src = _Sbfe(OpSize::i64Bit, Size * 8, 0, Src);
|
||||
Src = _Sbfe(OpSize::i64Bit, IR::OpSizeAsBits(Size), 0, Src);
|
||||
StoreResult(GPRClass, Op, Src, OpSize::iInvalid);
|
||||
} else {
|
||||
// Without REX.W then Zext (store result implicitly zero extends)
|
||||
@ -1059,13 +1059,13 @@ void OpDispatchBuilder::MOVSXDOp(OpcodeArgs) {
|
||||
|
||||
void OpDispatchBuilder::MOVSXOp(OpcodeArgs) {
|
||||
// Load garbage in upper bits, since we're sign extending anyway
|
||||
uint8_t Size = GetSrcSize(Op);
|
||||
const auto Size = OpSizeFromSrc(Op);
|
||||
Ref Src = LoadSource(GPRClass, Op, Op->Src[0], Op->Flags, {.AllowUpperGarbage = true});

// Sign-extend to DstSize and zero-extend to the register size, using a fast
// path for 32-bit dests where the native 32-bit Sbfe zero extends the top.
uint8_t DstSize = GetDstSize(Op);
Src = _Sbfe(DstSize == OpSize::i64Bit ? OpSize::i64Bit : OpSize::i32Bit, Size * 8, 0, Src);
const auto DstSize = OpSizeFromDst(Op);
Src = _Sbfe(DstSize == OpSize::i64Bit ? OpSize::i64Bit : OpSize::i32Bit, IR::OpSizeAsBits(Size), 0, Src);
StoreResult(GPRClass, Op, Op->Dest, Src, OpSize::iInvalid);
}

@ -1134,10 +1134,10 @@ void OpDispatchBuilder::XCHGOp(OpcodeArgs) {

void OpDispatchBuilder::CDQOp(OpcodeArgs) {
const auto DstSize = OpSizeFromDst(Op);
const auto SrcSize = IR::SizeToOpSize(IR::OpSizeToSize(DstSize) >> 1);
const auto SrcSize = DstSize / 2;
Ref Src = LoadGPRRegister(X86State::REG_RAX, SrcSize, 0, true);

Src = _Sbfe(DstSize <= OpSize::i32Bit ? OpSize::i32Bit : OpSize::i64Bit, SrcSize * 8, 0, Src);
Src = _Sbfe(DstSize <= OpSize::i32Bit ? OpSize::i32Bit : OpSize::i64Bit, IR::OpSizeAsBits(SrcSize), 0, Src);

StoreResult_WithOpSize(GPRClass, Op, Op->Dest, Src, DstSize, OpSize::iInvalid);
}
@ -1374,7 +1374,7 @@ void OpDispatchBuilder::XGetBVOp(OpcodeArgs) {
}

void OpDispatchBuilder::SHLOp(OpcodeArgs) {
const auto Size = GetSrcSize(Op);
const auto Size = OpSizeFromSrc(Op);
auto Dest = LoadSource(GPRClass, Op, Op->Dest, Op->Flags, {.AllowUpperGarbage = true});
auto Src = LoadSource(GPRClass, Op, Op->Src[1], Op->Flags, {.AllowUpperGarbage = true});

@ -1398,7 +1398,7 @@ void OpDispatchBuilder::SHLImmediateOp(OpcodeArgs, bool SHL1Bit) {

void OpDispatchBuilder::SHROp(OpcodeArgs) {
const auto Size = OpSizeFromSrc(Op);
auto Dest = LoadSource(GPRClass, Op, Op->Dest, Op->Flags, {.AllowUpperGarbage = Size >= 4});
auto Dest = LoadSource(GPRClass, Op, Op->Dest, Op->Flags, {.AllowUpperGarbage = Size >= OpSize::i32Bit});
auto Src = LoadSource(GPRClass, Op, Op->Src[1], Op->Flags, {.AllowUpperGarbage = true});

auto ALUOp = _Lshr(std::max(OpSize::i32Bit, Size), Dest, Src);
@ -1557,29 +1557,29 @@ void OpDispatchBuilder::SHRDImmediateOp(OpcodeArgs) {
}

void OpDispatchBuilder::ASHROp(OpcodeArgs, bool Immediate, bool SHR1Bit) {
const auto Size = GetSrcSize(Op);
const auto Size = OpSizeFromSrc(Op);
const auto OpSize = std::max(OpSize::i32Bit, OpSizeFromDst(Op));

// If Size < 4, then we Sbfe the Dest so we can have garbage.
// Otherwise, if Size = Opsize, then both are 4 or 8 and match the a64
// semantics directly, so again we can have garbage. The only case where we
// need zero-extension here is when the sizes mismatch.
auto Dest = LoadSource(GPRClass, Op, Op->Dest, Op->Flags, {.AllowUpperGarbage = (OpSize == Size) || (Size < 4)});
auto Dest = LoadSource(GPRClass, Op, Op->Dest, Op->Flags, {.AllowUpperGarbage = (OpSize == Size) || (Size < OpSize::i32Bit)});

if (Size < OpSize::i32Bit) {
Dest = _Sbfe(OpSize::i64Bit, Size * 8, 0, Dest);
Dest = _Sbfe(OpSize::i64Bit, IR::OpSizeAsBits(Size), 0, Dest);
}

if (Immediate) {
uint64_t Shift = LoadConstantShift(Op, SHR1Bit);
Ref Result = _Ashr(IR::SizeToOpSize(OpSize), Dest, _Constant(Shift));
Ref Result = _Ashr(OpSize, Dest, _Constant(Shift));

CalculateFlags_SignShiftRightImmediate(OpSizeFromSrc(Op), Result, Dest, Shift);
CalculateDeferredFlags();
StoreResult(GPRClass, Op, Result, OpSize::iInvalid);
} else {
auto Src = LoadSource(GPRClass, Op, Op->Src[1], Op->Flags, {.AllowUpperGarbage = true});
Ref Result = _Ashr(IR::SizeToOpSize(OpSize), Dest, Src);
Ref Result = _Ashr(OpSize, Dest, Src);

HandleShift(Op, Result, Dest, ShiftType::ASR, Src);
}
@ -1660,12 +1660,12 @@ void OpDispatchBuilder::BEXTRBMIOp(OpcodeArgs) {
// Essentially (Src1 >> Start) & ((1 << Length) - 1)
// along with some edge-case handling and flag setting.

LOGMAN_THROW_A_FMT(Op->InstSize >= OpSize::i32Bit, "No masking needed");
LOGMAN_THROW_A_FMT(Op->InstSize >= 4, "No masking needed");
auto* Src1 = LoadSource(GPRClass, Op, Op->Src[0], Op->Flags, {.AllowUpperGarbage = true});
auto* Src2 = LoadSource(GPRClass, Op, Op->Src[1], Op->Flags, {.AllowUpperGarbage = true});

const auto Size = OpSizeFromSrc(Op);
const auto SrcSize = Size * 8;
const auto SrcSize = IR::OpSizeAsBits(Size);
const auto MaxSrcBit = SrcSize - 1;
auto MaxSrcBitOp = _Constant(Size, MaxSrcBit);

@ -1701,8 +1701,8 @@ void OpDispatchBuilder::BEXTRBMIOp(OpcodeArgs) {

void OpDispatchBuilder::BLSIBMIOp(OpcodeArgs) {
// Equivalent to performing: SRC & -SRC
LOGMAN_THROW_A_FMT(Op->InstSize >= OpSize::i32Bit, "No masking needed");
auto Size = OpSizeFromSrc(Op);
LOGMAN_THROW_A_FMT(Op->InstSize >= 4, "No masking needed");
const auto Size = OpSizeFromSrc(Op);

auto* Src = LoadSource(GPRClass, Op, Op->Src[0], Op->Flags, {.AllowUpperGarbage = true});
auto NegatedSrc = _Neg(Size, Src);
@ -1715,15 +1715,15 @@ void OpDispatchBuilder::BLSIBMIOp(OpcodeArgs) {
// inverted ZF.
//
// ZF/SF/OF set as usual.
SetNZ_ZeroCV(GetSrcSize(Op), Result);
SetNZ_ZeroCV(Size, Result);
InvalidatePF_AF();
SetCFInverted(GetRFLAG(X86State::RFLAG_ZF_RAW_LOC));
}

void OpDispatchBuilder::BLSMSKBMIOp(OpcodeArgs) {
// Equivalent to: (Src - 1) ^ Src
LOGMAN_THROW_A_FMT(Op->InstSize >= OpSize::i32Bit, "No masking needed");
auto Size = OpSizeFromSrc(Op);
LOGMAN_THROW_A_FMT(Op->InstSize >= 4, "No masking needed");
const auto Size = OpSizeFromSrc(Op);

auto* Src = LoadSource(GPRClass, Op, Op->Src[0], Op->Flags, {.AllowUpperGarbage = true});
auto Result = _Xor(Size, _Sub(Size, Src, _InlineConstant(1)), Src);
@ -1738,24 +1738,25 @@ void OpDispatchBuilder::BLSMSKBMIOp(OpcodeArgs) {

// The output of BLSMSK is always nonzero, so TST will clear Z (along with C
// and O) while setting S.
SetNZ_ZeroCV(GetSrcSize(Op), Result);
SetNZ_ZeroCV(Size, Result);
SetCFInverted(CFInv);
}

void OpDispatchBuilder::BLSRBMIOp(OpcodeArgs) {
// Equivalent to: (Src - 1) & Src
LOGMAN_THROW_A_FMT(Op->InstSize >= OpSize::i32Bit, "No masking needed");
auto* Src = LoadSource(GPRClass, Op, Op->Src[0], Op->Flags, {.AllowUpperGarbage = true});
auto Size = OpSizeFromSrc(Op);
LOGMAN_THROW_A_FMT(Op->InstSize >= 4, "No masking needed");
const auto Size = OpSizeFromSrc(Op);

auto* Src = LoadSource(GPRClass, Op, Op->Src[0], Op->Flags, {.AllowUpperGarbage = true});
auto Result = _And(Size, _Sub(Size, Src, _InlineConstant(1)), Src);

StoreResult(GPRClass, Op, Result, OpSize::iInvalid);

auto Zero = _Constant(0);
auto One = _Constant(1);
auto CFInv = _Select(IR::COND_NEQ, Src, Zero, One, Zero);

SetNZ_ZeroCV(GetSrcSize(Op), Result);
SetNZ_ZeroCV(Size, Result);
SetCFInverted(CFInv);
InvalidatePF_AF();
}
@ -1774,13 +1775,13 @@ void OpDispatchBuilder::BMI2Shift(OpcodeArgs) {
Ref Result;
if (Op->OP == 0x6F7) {
// SARX
Result = _Ashr(IR::SizeToOpSize(Size), Src, Shift);
Result = _Ashr(Size, Src, Shift);
} else if (Op->OP == 0x5F7) {
// SHLX
Result = _Lshl(IR::SizeToOpSize(Size), Src, Shift);
Result = _Lshl(Size, Src, Shift);
} else {
// SHRX
Result = _Lshr(IR::SizeToOpSize(Size), Src, Shift);
Result = _Lshr(Size, Src, Shift);
}

StoreResult(GPRClass, Op, Result, OpSize::iInvalid);
@ -1788,7 +1789,7 @@ void OpDispatchBuilder::BMI2Shift(OpcodeArgs) {

void OpDispatchBuilder::BZHI(OpcodeArgs) {
const auto Size = OpSizeFromSrc(Op);
const auto OperandSize = Size * 8;
const auto OperandSize = IR::OpSizeAsBits(Size);

// In 32-bit mode we only look at bottom 32-bit, no 8 or 16-bit BZHI so no
// need to zero-extend sources
@ -1853,13 +1854,12 @@ void OpDispatchBuilder::RORX(OpcodeArgs) {

void OpDispatchBuilder::MULX(OpcodeArgs) {
// RDX is the implied source operand in the instruction
const auto OperandSize = OpSizeFromSrc(Op);
const auto OpSize = IR::SizeToOpSize(OperandSize);
const auto OpSize = OpSizeFromSrc(Op);

// Src1 can be a memory operand, so ensure we constrain to the
// absolute width of the access in that scenario.
const auto GPRSize = CTX->GetGPROpSize();
const auto Src1Size = Op->Src[1].IsGPR() ? GPRSize : OperandSize;
const auto Src1Size = Op->Src[1].IsGPR() ? GPRSize : OpSize;

Ref Src1 = LoadSource_WithOpSize(GPRClass, Op, Op->Src[1], Src1Size, Op->Flags);
Ref Src2 = LoadGPRRegister(X86State::REG_RDX, GPRSize);
@ -1880,7 +1880,7 @@ void OpDispatchBuilder::MULX(OpcodeArgs) {
}

void OpDispatchBuilder::PDEP(OpcodeArgs) {
LOGMAN_THROW_A_FMT(Op->InstSize >= OpSize::i32Bit, "No masking needed");
LOGMAN_THROW_A_FMT(Op->InstSize >= 4, "No masking needed");
auto* Input = LoadSource(GPRClass, Op, Op->Src[0], Op->Flags, {.AllowUpperGarbage = true});
auto* Mask = LoadSource(GPRClass, Op, Op->Src[1], Op->Flags, {.AllowUpperGarbage = true});
auto Result = _PDep(OpSizeFromSrc(Op), Input, Mask);
@ -1889,7 +1889,7 @@ void OpDispatchBuilder::PDEP(OpcodeArgs) {
}

void OpDispatchBuilder::PEXT(OpcodeArgs) {
LOGMAN_THROW_A_FMT(Op->InstSize >= OpSize::i32Bit, "No masking needed");
LOGMAN_THROW_A_FMT(Op->InstSize >= 4, "No masking needed");
auto* Input = LoadSource(GPRClass, Op, Op->Src[0], Op->Flags, {.AllowUpperGarbage = true});
auto* Mask = LoadSource(GPRClass, Op, Op->Src[1], Op->Flags, {.AllowUpperGarbage = true});
auto Result = _PExt(OpSizeFromSrc(Op), Input, Mask);
@ -2093,7 +2093,7 @@ void OpDispatchBuilder::RCROp(OpcodeArgs) {

StoreResult(GPRClass, Op, Res, OpSize::iInvalid);
},
GetSrcSize(Op) == OpSize::i32Bit ? std::make_optional(&OpDispatchBuilder::ZeroShiftResult) : std::nullopt);
OpSizeFromSrc(Op) == OpSize::i32Bit ? std::make_optional(&OpDispatchBuilder::ZeroShiftResult) : std::nullopt);
}

void OpDispatchBuilder::RCRSmallerOp(OpcodeArgs) {
@ -2315,7 +2315,7 @@ void OpDispatchBuilder::RCLOp(OpcodeArgs) {

StoreResult(GPRClass, Op, Res, OpSize::iInvalid);
},
GetSrcSize(Op) == OpSize::i32Bit ? std::make_optional(&OpDispatchBuilder::ZeroShiftResult) : std::nullopt);
OpSizeFromSrc(Op) == OpSize::i32Bit ? std::make_optional(&OpDispatchBuilder::ZeroShiftResult) : std::nullopt);
}

void OpDispatchBuilder::RCLSmallerOp(OpcodeArgs) {
@ -2405,7 +2405,7 @@ void OpDispatchBuilder::BTOp(OpcodeArgs, uint32_t SrcIndex, BTAction Action) {

// Get the bit selection from the src. We need to mask for 8/16-bit, but
// rely on the implicit masking of Lshr for native sizes.
unsigned LshrSize = std::max<uint8_t>(OpSize::i32Bit, Size / 8);
unsigned LshrSize = std::max<uint8_t>(IR::OpSizeToSize(OpSize::i32Bit), Size / 8);
auto BitSelect = (Size == (LshrSize * 8)) ? Src : _And(OpSize::i64Bit, Src, _Constant(Mask));

// OF/SF/ZF/AF/PF undefined.
@ -2458,7 +2458,7 @@ void OpDispatchBuilder::BTOp(OpcodeArgs, uint32_t SrcIndex, BTAction Action) {
// Load the address to the memory location
Ref Dest = MakeSegmentAddress(Op, Op->Dest);
// Get the bit selection from the src
Ref BitSelect = _Bfe(IR::SizeToOpSize(std::max<uint8_t>(4u, GetOpSize(Src))), 3, 0, Src);
Ref BitSelect = _Bfe(std::max(OpSize::i32Bit, GetOpSize(Src)), 3, 0, Src);

// Address is provided as bits we want BYTE offsets
// Extract Signed offset
@ -2523,7 +2523,7 @@ void OpDispatchBuilder::BTOp(OpcodeArgs, uint32_t SrcIndex, BTAction Action) {
}

// Now shift in to the correct bit location
Value = _Lshr(IR::SizeToOpSize(std::max<uint8_t>(4u, GetOpSize(Value))), Value, BitSelect);
Value = _Lshr(std::max(OpSize::i32Bit, GetOpSize(Value)), Value, BitSelect);

// OF/SF/ZF/AF/PF undefined.
SetCFDirect(Value, ConstantShift, true);
@ -2536,21 +2536,22 @@ void OpDispatchBuilder::IMUL1SrcOp(OpcodeArgs) {
Ref Src2 = LoadSource(GPRClass, Op, Op->Src[0], Op->Flags, {.AllowUpperGarbage = true});

const auto Size = OpSizeFromSrc(Op);
const auto SizeBits = IR::OpSizeAsBits(Size);

Ref Dest {};
Ref ResultHigh {};
switch (Size) {
case OpSize::i8Bit:
case OpSize::i16Bit: {
Src1 = _Sbfe(OpSize::i64Bit, Size * 8, 0, Src1);
Src2 = _Sbfe(OpSize::i64Bit, Size * 8, 0, Src2);
Src1 = _Sbfe(OpSize::i64Bit, SizeBits, 0, Src1);
Src2 = _Sbfe(OpSize::i64Bit, SizeBits, 0, Src2);
Dest = _Mul(OpSize::i64Bit, Src1, Src2);
ResultHigh = _Sbfe(OpSize::i64Bit, Size * 8, Size * 8, Dest);
ResultHigh = _Sbfe(OpSize::i64Bit, SizeBits, SizeBits, Dest);
break;
}
case OpSize::i32Bit: {
ResultHigh = _SMull(Src1, Src2);
ResultHigh = _Sbfe(OpSize::i64Bit, Size * 8, Size * 8, ResultHigh);
ResultHigh = _Sbfe(OpSize::i64Bit, SizeBits, SizeBits, ResultHigh);
// Flipped order to save a move
Dest = _Mul(OpSize::i32Bit, Src1, Src2);
break;
@ -2573,6 +2574,7 @@ void OpDispatchBuilder::IMUL2SrcOp(OpcodeArgs) {
Ref Src2 = LoadSource(GPRClass, Op, Op->Src[1], Op->Flags, {.AllowUpperGarbage = true});

const auto Size = OpSizeFromSrc(Op);
const auto SizeBits = IR::OpSizeAsBits(Size);

Ref Dest {};
Ref ResultHigh {};
@ -2580,15 +2582,15 @@ void OpDispatchBuilder::IMUL2SrcOp(OpcodeArgs) {
switch (Size) {
case OpSize::i8Bit:
case OpSize::i16Bit: {
Src1 = _Sbfe(OpSize::i64Bit, Size * 8, 0, Src1);
Src2 = _Sbfe(OpSize::i64Bit, Size * 8, 0, Src2);
Src1 = _Sbfe(OpSize::i64Bit, SizeBits, 0, Src1);
Src2 = _Sbfe(OpSize::i64Bit, SizeBits, 0, Src2);
Dest = _Mul(OpSize::i64Bit, Src1, Src2);
ResultHigh = _Sbfe(OpSize::i64Bit, Size * 8, Size * 8, Dest);
ResultHigh = _Sbfe(OpSize::i64Bit, SizeBits, SizeBits, Dest);
break;
}
case OpSize::i32Bit: {
ResultHigh = _SMull(Src1, Src2);
ResultHigh = _Sbfe(OpSize::i64Bit, Size * 8, Size * 8, ResultHigh);
ResultHigh = _Sbfe(OpSize::i64Bit, SizeBits, SizeBits, ResultHigh);
// Flipped order to save a move
Dest = _Mul(OpSize::i32Bit, Src1, Src2);
break;
@ -2608,13 +2610,14 @@ void OpDispatchBuilder::IMUL2SrcOp(OpcodeArgs) {

void OpDispatchBuilder::IMULOp(OpcodeArgs) {
const auto Size = OpSizeFromSrc(Op);
const auto SizeBits = IR::OpSizeAsBits(Size);

Ref Src1 = LoadSource(GPRClass, Op, Op->Dest, Op->Flags, {.AllowUpperGarbage = true});
Ref Src2 = LoadGPRRegister(X86State::REG_RAX);

if (Size != OpSize::i64Bit) {
Src1 = _Sbfe(OpSize::i64Bit, Size * 8, 0, Src1);
Src2 = _Sbfe(OpSize::i64Bit, Size * 8, 0, Src2);
Src1 = _Sbfe(OpSize::i64Bit, SizeBits, 0, Src1);
Src2 = _Sbfe(OpSize::i64Bit, SizeBits, 0, Src2);
}

// 64-bit special cased to save a move
@ -2659,14 +2662,15 @@ void OpDispatchBuilder::IMULOp(OpcodeArgs) {

void OpDispatchBuilder::MULOp(OpcodeArgs) {
const auto Size = OpSizeFromSrc(Op);
const auto SizeBits = IR::OpSizeAsBits(Size);

Ref Src1 = LoadSource(GPRClass, Op, Op->Dest, Op->Flags, {.AllowUpperGarbage = true});
Ref Src2 = LoadGPRRegister(X86State::REG_RAX);
Ref Result;

if (Size != OpSize::i64Bit) {
Src1 = _Bfe(OpSize::i64Bit, Size * 8, 0, Src1);
Src2 = _Bfe(OpSize::i64Bit, Size * 8, 0, Src2);
Src1 = _Bfe(OpSize::i64Bit, SizeBits, 0, Src1);
Src2 = _Bfe(OpSize::i64Bit, SizeBits, 0, Src2);
Result = _UMul(OpSize::i64Bit, Src1, Src2);
}
Ref ResultHigh {};
@ -2709,17 +2713,19 @@ void OpDispatchBuilder::MULOp(OpcodeArgs) {

void OpDispatchBuilder::NOTOp(OpcodeArgs) {
const auto Size = OpSizeFromSrc(Op);
const auto SizeBits = IR::OpSizeAsBits(Size);

Ref MaskConst {};
if (Size == OpSize::i64Bit) {
MaskConst = _Constant(~0ULL);
} else {
MaskConst = _Constant((1ULL << (Size * 8)) - 1);
MaskConst = _Constant((1ULL << SizeBits) - 1);
}

if (DestIsLockedMem(Op)) {
HandledLock = true;
Ref DestMem = MakeSegmentAddress(Op, Op->Dest);
_AtomicXor(IR::SizeToOpSize(Size), MaskConst, DestMem);
_AtomicXor(Size, MaskConst, DestMem);
} else if (!Op->Dest.IsGPR()) {
// GPR version plays fast and loose with sizes, be safe for memory tho.
Ref Src = LoadSource(GPRClass, Op, Op->Dest, Op->Flags);
@ -2742,13 +2748,13 @@ void OpDispatchBuilder::NOTOp(OpcodeArgs) {

// For 8/16-bit, use 64-bit invert so we invert in place, while getting
// insert behaviour. For 32-bit, use 32-bit invert to zero the upper bits.
unsigned EffectiveSize = Size == OpSize::i32Bit ? OpSize::i32Bit : GPRSize;
const auto EffectiveSize = Size == OpSize::i32Bit ? OpSize::i32Bit : GPRSize;

// If we're inverting the whole thing, use Not instead of Xor to save a constant.
if (Size >= OpSize::i32Bit) {
Src = _Not(IR::SizeToOpSize(EffectiveSize), Src);
Src = _Not(EffectiveSize, Src);
} else {
Src = _Xor(IR::SizeToOpSize(EffectiveSize), Src, MaskConst);
Src = _Xor(EffectiveSize, Src, MaskConst);
}

// Always store 64-bit, the Not/Xor correctly handle the upper bits and this
@ -2816,7 +2822,7 @@ void OpDispatchBuilder::DAAOp(OpcodeArgs) {

// SF, ZF, PF set according to result. CF set per above. OF undefined.
StoreGPRRegister(X86State::REG_RAX, AL, OpSize::i8Bit);
SetNZ_ZeroCV(1, AL);
SetNZ_ZeroCV(OpSize::i8Bit, AL);
SetCFInverted(CFInv);
CalculatePF(AL);
SetAFAndFixup(AF);
@ -2842,7 +2848,7 @@ void OpDispatchBuilder::DASOp(OpcodeArgs) {

// SF, ZF, PF set according to result. CF set per above. OF undefined.
StoreGPRRegister(X86State::REG_RAX, AL, OpSize::i8Bit);
SetNZ_ZeroCV(1, AL);
SetNZ_ZeroCV(OpSize::i8Bit, AL);
SetCFDirect(NewCF);
CalculatePF(AL);
SetAFAndFixup(AF);
@ -2898,7 +2904,7 @@ void OpDispatchBuilder::AAMOp(OpcodeArgs) {
auto Res = _AddShift(OpSize::i64Bit, URemOp, UDivOp, ShiftType::LSL, 8);
StoreGPRRegister(X86State::REG_RAX, Res, OpSize::i16Bit);

SetNZ_ZeroCV(1, Res);
SetNZ_ZeroCV(OpSize::i8Bit, Res);
CalculatePF(Res);
InvalidateAF();
}
@ -2913,7 +2919,7 @@ void OpDispatchBuilder::AADOp(OpcodeArgs) {
auto Result = _And(OpSize::i64Bit, NewAL, _Constant(0xFF));
StoreGPRRegister(X86State::REG_RAX, Result, OpSize::i16Bit);

SetNZ_ZeroCV(1, Result);
SetNZ_ZeroCV(OpSize::i8Bit, Result);
CalculatePF(Result);
InvalidateAF();
}
@ -2978,14 +2984,14 @@ void OpDispatchBuilder::EnterOp(OpcodeArgs) {

if (Level > 0) {
for (uint8_t i = 1; i < Level; ++i) {
auto Offset = _Constant(i * GPRSize);
auto MemLoc = _Sub(IR::SizeToOpSize(GPRSize), OldBP, Offset);
auto Offset = _Constant(i * IR::OpSizeToSize(GPRSize));
auto MemLoc = _Sub(GPRSize, OldBP, Offset);
auto Mem = _LoadMem(GPRClass, GPRSize, MemLoc, GPRSize);
NewSP = PushValue(GPRSize, Mem);
}
NewSP = PushValue(GPRSize, temp_RBP);
}
NewSP = _Sub(IR::SizeToOpSize(GPRSize), NewSP, _Constant(AllocSpace));
NewSP = _Sub(GPRSize, NewSP, _Constant(AllocSpace));
StoreGPRRegister(X86State::REG_RSP, NewSP);
StoreGPRRegister(X86State::REG_RBP, temp_RBP);
}
@ -3186,7 +3192,7 @@ void OpDispatchBuilder::STOSOp(OpcodeArgs) {
|
||||
// Offset the pointer
|
||||
Ref TailDest = LoadGPRRegister(X86State::REG_RDI);
|
||||
StoreGPRRegister(X86State::REG_RDI, OffsetByDir(TailDest, Size));
|
||||
StoreGPRRegister(X86State::REG_RDI, OffsetByDir(TailDest, IR::OpSizeToSize(Size)));
|
||||
} else {
|
||||
// FEX doesn't support partial faulting REP instructions.
|
||||
// Converting this to a `MemSet` IR op optimizes this quite significantly in our codegen.
|
||||
@ -3255,7 +3261,7 @@ void OpDispatchBuilder::MOVSOp(OpcodeArgs) {
|
||||
// Store to memory where RDI points
|
||||
_StoreMemAutoTSO(GPRClass, Size, RDI, Src, Size);
|
||||
|
||||
auto PtrDir = LoadDir(Size);
|
||||
auto PtrDir = LoadDir(IR::OpSizeToSize(Size));
|
||||
RSI = _Add(OpSize::i64Bit, RSI, PtrDir);
|
||||
RDI = _Add(OpSize::i64Bit, RDI, PtrDir);
|
||||
|
||||
@ -3285,7 +3291,7 @@ void OpDispatchBuilder::CMPSOp(OpcodeArgs) {
|
||||
|
||||
CalculateFlags_SUB(OpSizeFromSrc(Op), Src2, Src1);
|
||||
|
||||
auto PtrDir = LoadDir(Size);
|
||||
auto PtrDir = LoadDir(IR::OpSizeToSize(Size));
|
||||
|
||||
// Offset the pointer
|
||||
Dest_RDI = _Add(OpSize::i64Bit, Dest_RDI, PtrDir);
|
||||
@ -3342,11 +3348,11 @@ void OpDispatchBuilder::CMPSOp(OpcodeArgs) {
|
||||
StoreGPRRegister(X86State::REG_RCX, TailCounter);
|
||||
|
||||
// Offset the pointer
|
||||
Dest_RDI = _Add(OpSize::i64Bit, Dest_RDI, _Constant(PtrDir * Size));
|
||||
Dest_RDI = _Add(OpSize::i64Bit, Dest_RDI, _Constant(PtrDir * IR::OpSizeToSize(Size)));
|
||||
StoreGPRRegister(X86State::REG_RDI, Dest_RDI);
|
||||
|
||||
// Offset second pointer
|
||||
Dest_RSI = _Add(OpSize::i64Bit, Dest_RSI, _Constant(PtrDir * Size));
|
||||
Dest_RSI = _Add(OpSize::i64Bit, Dest_RSI, _Constant(PtrDir * IR::OpSizeToSize(Size)));
|
||||
StoreGPRRegister(X86State::REG_RSI, Dest_RSI);
|
||||
|
||||
// If TailCounter != 0, compare sources.
|
||||
@ -3403,7 +3409,7 @@ void OpDispatchBuilder::LODSOp(OpcodeArgs) {
|
||||
|
||||
// Offset the pointer
|
||||
Ref TailDest_RSI = LoadGPRRegister(X86State::REG_RSI);
|
||||
StoreGPRRegister(X86State::REG_RSI, OffsetByDir(TailDest_RSI, Size));
|
||||
StoreGPRRegister(X86State::REG_RSI, OffsetByDir(TailDest_RSI, IR::OpSizeToSize(Size)));
|
||||
} else {
|
||||
// Calculate flags early. because end of block
|
||||
CalculateDeferredFlags();
|
||||
@ -3452,7 +3458,7 @@ void OpDispatchBuilder::LODSOp(OpcodeArgs) {
|
||||
StoreGPRRegister(X86State::REG_RCX, TailCounter);
|
||||
|
||||
// Offset the pointer
|
||||
TailDest_RSI = _Add(OpSize::i64Bit, TailDest_RSI, _Constant(PtrDir * Size));
|
||||
TailDest_RSI = _Add(OpSize::i64Bit, TailDest_RSI, _Constant(PtrDir * IR::OpSizeToSize(Size)));
|
||||
StoreGPRRegister(X86State::REG_RSI, TailDest_RSI);
|
||||
|
||||
// Jump back to the start, we have more work to do
|
||||
@ -3487,7 +3493,7 @@ void OpDispatchBuilder::SCASOp(OpcodeArgs) {
|
||||
|
||||
// Offset the pointer
|
||||
Ref TailDest_RDI = LoadGPRRegister(X86State::REG_RDI);
|
||||
StoreGPRRegister(X86State::REG_RDI, OffsetByDir(TailDest_RDI, Size));
|
||||
StoreGPRRegister(X86State::REG_RDI, OffsetByDir(TailDest_RDI, IR::OpSizeToSize(Size)));
|
||||
} else {
|
||||
// Calculate flags early. because end of block
|
||||
CalculateDeferredFlags();
|
||||
@ -3536,7 +3542,7 @@ void OpDispatchBuilder::SCASOp(OpcodeArgs) {
|
||||
StoreGPRRegister(X86State::REG_RCX, TailCounter);
|
||||
|
||||
// Offset the pointer
|
||||
TailDest_RDI = _Add(OpSize::i64Bit, TailDest_RDI, _Constant(Dir * Size));
|
||||
TailDest_RDI = _Add(OpSize::i64Bit, TailDest_RDI, _Constant(Dir * IR::OpSizeToSize(Size)));
|
||||
StoreGPRRegister(X86State::REG_RDI, TailDest_RDI);
|
||||
|
||||
CalculateDeferredFlags();
|
||||
@ -3598,7 +3604,7 @@ void OpDispatchBuilder::NEGOp(OpcodeArgs) {
|
||||
|
||||
if (DestIsLockedMem(Op)) {
|
||||
Ref DestMem = MakeSegmentAddress(Op, Op->Dest);
|
||||
Ref Dest = _AtomicFetchNeg(IR::SizeToOpSize(Size), DestMem);
|
||||
Ref Dest = _AtomicFetchNeg(Size, DestMem);
|
||||
CalculateFlags_SUB(Size, ZeroConst, Dest);
|
||||
} else {
|
||||
Ref Dest = LoadSource(GPRClass, Op, Op->Dest, Op->Flags, {.AllowUpperGarbage = true});
|
||||
@ -3622,7 +3628,7 @@ void OpDispatchBuilder::DIVOp(OpcodeArgs) {
|
||||
auto URemOp = _URem(OpSize::i16Bit, Src1, Divisor);
|
||||
|
||||
// AX[15:0] = concat<URem[7:0]:UDiv[7:0]>
|
||||
auto ResultAX = _Bfi(IR::SizeToOpSize(GPRSize), 8, 8, UDivOp, URemOp);
|
||||
auto ResultAX = _Bfi(GPRSize, 8, 8, UDivOp, URemOp);
|
||||
StoreGPRRegister(X86State::REG_RAX, ResultAX, OpSize::i16Bit);
|
||||
} else if (Size == OpSize::i16Bit) {
|
||||
Ref Src1 = LoadGPRRegister(X86State::REG_RAX, Size);
|
||||
@ -3636,8 +3642,8 @@ void OpDispatchBuilder::DIVOp(OpcodeArgs) {
|
||||
Ref Src1 = LoadGPRRegister(X86State::REG_RAX, Size);
|
||||
Ref Src2 = LoadGPRRegister(X86State::REG_RDX, Size);
|
||||
|
||||
Ref UDivOp = _Bfe(OpSize::i32Bit, Size * 8, 0, _LUDiv(OpSize::i32Bit, Src1, Src2, Divisor));
|
||||
Ref URemOp = _Bfe(OpSize::i32Bit, Size * 8, 0, _LURem(OpSize::i32Bit, Src1, Src2, Divisor));
|
||||
Ref UDivOp = _Bfe(OpSize::i32Bit, IR::OpSizeAsBits(Size), 0, _LUDiv(OpSize::i32Bit, Src1, Src2, Divisor));
|
||||
Ref URemOp = _Bfe(OpSize::i32Bit, IR::OpSizeAsBits(Size), 0, _LURem(OpSize::i32Bit, Src1, Src2, Divisor));
|
||||
|
||||
StoreGPRRegister(X86State::REG_RAX, UDivOp);
|
||||
StoreGPRRegister(X86State::REG_RDX, URemOp);
|
||||
@ -3674,7 +3680,7 @@ void OpDispatchBuilder::IDIVOp(OpcodeArgs) {
|
||||
auto URemOp = _Rem(OpSize::i64Bit, Src1, Divisor);
|
||||
|
||||
// AX[15:0] = concat<URem[7:0]:UDiv[7:0]>
|
||||
auto ResultAX = _Bfi(IR::SizeToOpSize(GPRSize), 8, 8, UDivOp, URemOp);
|
||||
auto ResultAX = _Bfi(GPRSize, 8, 8, UDivOp, URemOp);
|
||||
StoreGPRRegister(X86State::REG_RAX, ResultAX, OpSize::i16Bit);
|
||||
} else if (Size == OpSize::i16Bit) {
|
||||
Ref Src1 = LoadGPRRegister(X86State::REG_RAX, Size);
|
||||
@ -3688,8 +3694,8 @@ void OpDispatchBuilder::IDIVOp(OpcodeArgs) {
|
||||
Ref Src1 = LoadGPRRegister(X86State::REG_RAX, Size);
|
||||
Ref Src2 = LoadGPRRegister(X86State::REG_RDX, Size);
|
||||
|
||||
Ref UDivOp = _Bfe(OpSize::i32Bit, Size * 8, 0, _LDiv(OpSize::i32Bit, Src1, Src2, Divisor));
|
||||
Ref URemOp = _Bfe(OpSize::i32Bit, Size * 8, 0, _LRem(OpSize::i32Bit, Src1, Src2, Divisor));
|
||||
Ref UDivOp = _Bfe(OpSize::i32Bit, IR::OpSizeAsBits(Size), 0, _LDiv(OpSize::i32Bit, Src1, Src2, Divisor));
|
||||
Ref URemOp = _Bfe(OpSize::i32Bit, IR::OpSizeAsBits(Size), 0, _LRem(OpSize::i32Bit, Src1, Src2, Divisor));
|
||||
|
||||
StoreGPRRegister(X86State::REG_RAX, UDivOp);
|
||||
StoreGPRRegister(X86State::REG_RDX, URemOp);
|
||||
@ -3728,7 +3734,7 @@ void OpDispatchBuilder::BSFOp(OpcodeArgs) {
|
||||
// Although Intel does not guarantee that semantic, AMD does and Intel
|
||||
// hardware satisfies it. We provide the stronger AMD behaviour as
|
||||
// applications might rely on that in the wild.
|
||||
auto SelectOp = NZCVSelect(IR::SizeToOpSize(GPRSize), {COND_EQ}, Dest, Result);
|
||||
auto SelectOp = NZCVSelect(GPRSize, {COND_EQ}, Dest, Result);
|
||||
StoreResult_WithOpSize(GPRClass, Op, Op->Dest, SelectOp, DstSize, OpSize::iInvalid);
|
||||
}
|
||||
|
||||
@ -3746,7 +3752,7 @@ void OpDispatchBuilder::BSROp(OpcodeArgs) {
|
||||
SetZ_InvalidateNCV(OpSizeFromSrc(Op), Src);
|
||||
|
||||
// If Src was zero then the destination doesn't get modified
|
||||
auto SelectOp = NZCVSelect(IR::SizeToOpSize(GPRSize), {COND_EQ}, Dest, Result);
|
||||
auto SelectOp = NZCVSelect(GPRSize, {COND_EQ}, Dest, Result);
|
||||
StoreResult_WithOpSize(GPRClass, Op, Op->Dest, SelectOp, DstSize, OpSize::iInvalid);
|
||||
}
|
||||
|
||||
@ -3784,7 +3790,7 @@ void OpDispatchBuilder::CMPXCHGOp(OpcodeArgs) {
|
||||
|
||||
if (GPRSize == OpSize::i64Bit && Size == OpSize::i32Bit) {
|
||||
Src1 = LoadSource_WithOpSize(GPRClass, Op, Op->Dest, GPRSize, Op->Flags, {.AllowUpperGarbage = true});
|
||||
Src1Lower = _Bfe(IR::SizeToOpSize(GPRSize), Size * 8, 0, Src1);
|
||||
Src1Lower = _Bfe(GPRSize, IR::OpSizeAsBits(Size), 0, Src1);
|
||||
} else {
|
||||
Src1 = LoadSource_WithOpSize(GPRClass, Op, Op->Dest, Size, Op->Flags, {.AllowUpperGarbage = true});
|
||||
Src1Lower = Src1;
|
||||
@ -3797,7 +3803,7 @@ void OpDispatchBuilder::CMPXCHGOp(OpcodeArgs) {
|
||||
if (!Trivial) {
|
||||
if (GPRSize == OpSize::i64Bit && Size == OpSize::i32Bit) {
|
||||
// This allows us to only hit the ZEXT case on failure
|
||||
Ref RAXResult = NZCVSelect(IR::i64Bit, {COND_EQ}, Src3, Src1Lower);
|
||||
Ref RAXResult = NZCVSelect(OpSize::i64Bit, {COND_EQ}, Src3, Src1Lower);
|
||||
|
||||
// When the size is 4 we need to make sure not zext the GPR when the comparison fails
|
||||
StoreGPRRegister(X86State::REG_RAX, RAXResult);
|
||||
@ -3809,7 +3815,7 @@ void OpDispatchBuilder::CMPXCHGOp(OpcodeArgs) {
|
||||
// Op1 = RAX == Op1 ? Op2 : Op1
|
||||
// If they match then set the rm operand to the input
|
||||
// else don't set the rm operand
|
||||
Ref DestResult = Trivial ? Src2 : NZCVSelect(IR::i64Bit, CondClassType {COND_EQ}, Src2, Src1);
|
||||
Ref DestResult = Trivial ? Src2 : NZCVSelect(OpSize::i64Bit, CondClassType {COND_EQ}, Src2, Src1);
|
||||
|
||||
// Store in to GPR Dest
|
||||
if (GPRSize == OpSize::i64Bit && Size == OpSize::i32Bit) {
|
||||
@ -3837,7 +3843,7 @@ void OpDispatchBuilder::CMPXCHGOp(OpcodeArgs) {
|
||||
// if (DataSrc == Src3) { *Src1 == Src2; } Src2 = DataSrc
|
||||
// This will write to memory! Careful!
|
||||
// Third operand must be a calculated guest memory address
|
||||
Ref CASResult = _CAS(IR::SizeToOpSize(Size), Src3Lower, Src2, Src1);
|
||||
Ref CASResult = _CAS(Size, Src3Lower, Src2, Src1);
|
||||
Ref RAXResult = CASResult;
|
||||
|
||||
CalculateFlags_SUB(OpSizeFromSrc(Op), Src3Lower, CASResult);
|
||||
@ -3845,7 +3851,7 @@ void OpDispatchBuilder::CMPXCHGOp(OpcodeArgs) {
|
||||
|
||||
if (GPRSize == OpSize::i64Bit && Size == OpSize::i32Bit) {
|
||||
// This allows us to only hit the ZEXT case on failure
|
||||
RAXResult = _NZCVSelect(IR::i64Bit, {COND_EQ}, Src3, CASResult);
|
||||
RAXResult = _NZCVSelect(OpSize::i64Bit, {COND_EQ}, Src3, CASResult);
|
||||
Size = OpSize::i64Bit;
|
||||
}
|
||||
|
||||
@ -3885,10 +3891,10 @@ void OpDispatchBuilder::CMPXCHGPairOp(OpcodeArgs) {
|
||||
|
||||
Ref Result_Lower = _AllocateGPR(true);
|
||||
Ref Result_Upper = _AllocateGPRAfter(Result_Lower);
|
||||
_CASPair(IR::SizeToOpSize(Size), Expected_Lower, Expected_Upper, Desired_Lower, Desired_Upper, Src1, Result_Lower, Result_Upper);
|
||||
_CASPair(Size, Expected_Lower, Expected_Upper, Desired_Lower, Desired_Upper, Src1, Result_Lower, Result_Upper);
|
||||
|
||||
HandleNZCV_RMW();
|
||||
_CmpPairZ(IR::SizeToOpSize(Size), Result_Lower, Result_Upper, Expected_Lower, Expected_Upper);
|
||||
_CmpPairZ(Size, Result_Lower, Result_Upper, Expected_Lower, Expected_Upper);
|
||||
CalculateDeferredFlags();
|
||||
|
||||
auto UpdateIfNotZF = [this](auto Reg, auto Value) {
|
||||
@ -4020,7 +4026,7 @@ Ref OpDispatchBuilder::GetSegment(uint32_t Flags, uint32_t DefaultPrefix, bool O
|
||||
Ref OpDispatchBuilder::AppendSegmentOffset(Ref Value, uint32_t Flags, uint32_t DefaultPrefix, bool Override) {
|
||||
auto Segment = GetSegment(Flags, DefaultPrefix, Override);
|
||||
if (Segment) {
|
||||
Value = _Add(IR::SizeToOpSize(std::max<uint8_t>(OpSize::i32Bit, std::max(GetOpSize(Value), GetOpSize(Segment)))), Value, Segment);
|
||||
Value = _Add(std::max(OpSize::i32Bit, std::max(GetOpSize(Value), GetOpSize(Segment))), Value, Segment);
|
||||
}
|
||||
|
||||
return Value;
|
||||
@ -4144,7 +4150,7 @@ Ref OpDispatchBuilder::LoadEffectiveAddress(AddressMode A, bool AddSegmentBase,
|
||||
|
||||
if (A.Offset) {
|
||||
Ref Offset = _Constant(A.Offset);
|
||||
Tmp = Tmp ? _Add(IR::SizeToOpSize(GPRSize), Tmp, Offset) : Offset;
|
||||
Tmp = Tmp ? _Add(GPRSize, Tmp, Offset) : Offset;
|
||||
}
|
||||
|
||||
if (A.Index) {
|
||||
@ -4167,7 +4173,7 @@ Ref OpDispatchBuilder::LoadEffectiveAddress(AddressMode A, bool AddSegmentBase,
|
||||
//
|
||||
// If the AddrSize is not the GPRSize then we need to clear the upper bits.
|
||||
if ((A.AddrSize < GPRSize) && !AllowUpperGarbage && Tmp) {
|
||||
Tmp = _Bfe(GPRSize, A.AddrSize * 8, 0, Tmp);
|
||||
Tmp = _Bfe(GPRSize, IR::OpSizeAsBits(A.AddrSize), 0, Tmp);
|
||||
}
|
||||
|
||||
if (A.Segment && AddSegmentBase) {
|
||||
@ -4177,7 +4183,7 @@ Ref OpDispatchBuilder::LoadEffectiveAddress(AddressMode A, bool AddSegmentBase,
|
||||
return Tmp ?: _Constant(0);
|
||||
}
|
||||
|
||||
AddressMode OpDispatchBuilder::SelectAddressMode(AddressMode A, bool AtomicTSO, bool Vector, unsigned AccessSize) {
|
||||
AddressMode OpDispatchBuilder::SelectAddressMode(AddressMode A, bool AtomicTSO, bool Vector, IR::OpSize AccessSize) {
|
||||
const auto GPRSize = CTX->GetGPROpSize();
|
||||
|
||||
// In the future this also needs to account for LRCPC3.
|
||||
@ -4207,9 +4213,10 @@ AddressMode OpDispatchBuilder::SelectAddressMode(AddressMode A, bool AtomicTSO,
|
||||
}
|
||||
|
||||
// Try a (possibly scaled) register index.
|
||||
if (A.AddrSize == OpSize::i64Bit && A.Base && (A.Index || A.Segment) && !A.Offset && (A.IndexScale == 1 || A.IndexScale == AccessSize)) {
|
||||
if (A.AddrSize == OpSize::i64Bit && A.Base && (A.Index || A.Segment) && !A.Offset &&
|
||||
(A.IndexScale == 1 || A.IndexScale == IR::OpSizeToSize(AccessSize))) {
|
||||
if (A.Index && A.Segment) {
|
||||
A.Base = _Add(IR::SizeToOpSize(GPRSize), A.Base, A.Segment);
|
||||
A.Base = _Add(GPRSize, A.Base, A.Segment);
|
||||
} else if (A.Segment) {
|
||||
A.Index = A.Segment;
|
||||
A.IndexScale = 1;
|
||||
@ -4231,7 +4238,7 @@ AddressMode OpDispatchBuilder::DecodeAddress(const X86Tables::DecodedOp& Op, con
|
||||
|
||||
AddressMode A {};
|
||||
A.Segment = GetSegment(Op->Flags);
|
||||
A.AddrSize = (Op->Flags & X86Tables::DecodeFlags::FLAG_ADDRESS_SIZE) != 0 ? (IR::DivideOpSize(GPRSize, 2)) : GPRSize;
|
||||
A.AddrSize = (Op->Flags & X86Tables::DecodeFlags::FLAG_ADDRESS_SIZE) != 0 ? (GPRSize >> 1) : GPRSize;
|
||||
A.NonTSO = AccessType == MemoryAccessType::NONTSO || AccessType == MemoryAccessType::STREAM;
|
||||
|
||||
if (Operand.IsLiteral()) {
|
||||
@ -4312,7 +4319,7 @@ Ref OpDispatchBuilder::LoadSource_WithOpSize(RegisterClassType Class, const X86T
|
||||
// Now extract the subregister if it was a partial load /smaller/ than SSE size
|
||||
// TODO: Instead of doing the VMov implicitly on load, hunt down all use cases that require partial loads and do it after load.
|
||||
// We don't have information here to know if the operation needs zero upper bits or can contain data.
|
||||
if (!AllowUpperGarbage && OpSize < Core::CPUState::XMM_SSE_REG_SIZE) {
|
||||
if (!AllowUpperGarbage && OpSize < OpSize::i128Bit) {
|
||||
A.Base = _VMov(OpSize, A.Base);
|
||||
}
|
||||
} else {
|
||||
@ -4345,7 +4352,7 @@ Ref OpDispatchBuilder::LoadGPRRegister(uint32_t GPR, IR::OpSize Size, uint8_t Of
|
||||
if (AllowUpperGarbage) {
|
||||
Reg = _Lshr(OpSize, Reg, _Constant(Offset));
|
||||
} else {
|
||||
Reg = _Bfe(OpSize, Size * 8, Offset, Reg);
|
||||
Reg = _Bfe(OpSize, IR::OpSizeAsBits(Size), Offset, Reg);
|
||||
}
|
||||
}
|
||||
return Reg;
|
||||
@ -4360,7 +4367,7 @@ void OpDispatchBuilder::StoreGPRRegister(uint32_t GPR, const Ref Src, IR::OpSize
|
||||
Ref Reg = Src;
|
||||
if (Size != GPRSize || Offset != 0) {
|
||||
// Need to do an insert if not automatic size or zero offset.
|
||||
Reg = _Bfi(GPRSize, Size * 8, Offset, LoadGPRRegister(GPR), Src);
|
||||
Reg = _Bfi(GPRSize, IR::OpSizeAsBits(Size), Offset, LoadGPRRegister(GPR), Src);
|
||||
}
|
||||
|
||||
StoreRegister(GPR, false, Reg);
|
||||
@ -4408,7 +4415,7 @@ void OpDispatchBuilder::StoreResult_WithOpSize(FEXCore::IR::RegisterClassType Cl
|
||||
LOGMAN_THROW_A_FMT(Class != IR::GPRClass, "Partial writes from GPR not allowed. Instruction: {}", Op->TableInfo->Name);
|
||||
|
||||
// XMM-size is handled in implementations.
|
||||
if (VectorSize != Core::CPUState::XMM_AVX_REG_SIZE || OpSize != Core::CPUState::XMM_SSE_REG_SIZE) {
|
||||
if (VectorSize != OpSize::i256Bit || OpSize != OpSize::i128Bit) {
|
||||
auto SrcVector = LoadXMMRegister(gprIndex);
|
||||
Result = _VInsElement(VectorSize, OpSize, 0, 0, SrcVector, Src);
|
||||
}
|
||||
@ -4443,7 +4450,7 @@ void OpDispatchBuilder::StoreResult_WithOpSize(FEXCore::IR::RegisterClassType Cl
|
||||
|
||||
AddressMode A = DecodeAddress(Op, Operand, AccessType, false /* IsLoad */);
|
||||
|
||||
if (OpSize == 10) {
|
||||
if (OpSize == OpSize::f80Bit) {
|
||||
Ref MemStoreDst = LoadEffectiveAddress(A, true);
|
||||
|
||||
// For X87 extended doubles, split before storing
|
||||
@ -4547,7 +4554,7 @@ void OpDispatchBuilder::ALUOp(OpcodeArgs, FEXCore::IR::IROps ALUIROp, FEXCore::I
|
||||
(ALUIROp == IR::IROps::OP_XOR || ALUIROp == IR::IROps::OP_OR || ALUIROp == IR::IROps::OP_ANDWITHFLAGS)) {
|
||||
|
||||
RoundedSize = ResultSize = CTX->GetGPROpSize();
|
||||
LOGMAN_THROW_A_FMT(Const < (1ull << (Size * 8)), "does not clobber");
|
||||
LOGMAN_THROW_A_FMT(Const < (1ull << IR::OpSizeAsBits(Size)), "does not clobber");
|
||||
|
||||
// For AND, we can play the same trick but we instead need the upper bits of
|
||||
// the constant to be all-1s instead of all-0s to preserve. We also can't
|
||||
@ -4559,7 +4566,7 @@ void OpDispatchBuilder::ALUOp(OpcodeArgs, FEXCore::IR::IROps ALUIROp, FEXCore::I
|
||||
// adjusted constant here will inline into the arm64 and instruction, so if
|
||||
// flags are not needed, we save an instruction overall.
|
||||
if (ALUIROp == IR::IROps::OP_ANDWITHFLAGS) {
|
||||
Src = _Constant(Const | ~((1ull << (Size * 8)) - 1));
|
||||
Src = _Constant(Const | ~((1ull << IR::OpSizeAsBits(Size)) - 1));
|
||||
ALUIROp = IR::IROps::OP_AND;
|
||||
}
|
||||
}
|
||||
@ -4570,13 +4577,13 @@ void OpDispatchBuilder::ALUOp(OpcodeArgs, FEXCore::IR::IROps ALUIROp, FEXCore::I
|
||||
if (DestIsLockedMem(Op)) {
|
||||
HandledLock = true;
|
||||
Ref DestMem = MakeSegmentAddress(Op, Op->Dest);
|
||||
DeriveOp(FetchOp, AtomicFetchOp, _AtomicFetchAdd(IR::SizeToOpSize(Size), Src, DestMem));
|
||||
DeriveOp(FetchOp, AtomicFetchOp, _AtomicFetchAdd(Size, Src, DestMem));
|
||||
Dest = FetchOp;
|
||||
} else {
|
||||
Dest = LoadSource(GPRClass, Op, Op->Dest, Op->Flags, {.AllowUpperGarbage = true});
|
||||
}
|
||||
|
||||
const auto OpSize = IR::SizeToOpSize(RoundedSize);
|
||||
const auto OpSize = RoundedSize;
|
||||
DeriveOp(ALUOp, ALUIROp, _AndWithFlags(OpSize, Dest, Src));
|
||||
Result = ALUOp;
|
||||
|
||||
@ -4756,7 +4763,7 @@ void OpDispatchBuilder::MOVBEOp(OpcodeArgs) {
|
||||
// Rev of 16-bit value as 32-bit replaces the result in the upper 16-bits of the result.
|
||||
// bfxil the 16-bit result in to the GPR.
|
||||
Ref Dest = LoadSource_WithOpSize(GPRClass, Op, Op->Dest, GPRSize, Op->Flags);
|
||||
auto Result = _Bfxil(IR::SizeToOpSize(GPRSize), 16, 16, Dest, Src);
|
||||
auto Result = _Bfxil(GPRSize, 16, 16, Dest, Src);
|
||||
StoreResult_WithOpSize(GPRClass, Op, Op->Dest, Result, GPRSize, OpSize::iInvalid);
|
||||
} else {
|
||||
// 32-bit does regular zext
|
||||
|
@ -938,12 +938,12 @@ public:
|
||||
void AVX128_VectorALU(OpcodeArgs, IROps IROp, IR::OpSize ElementSize);
|
||||
void AVX128_VectorUnary(OpcodeArgs, IROps IROp, IR::OpSize ElementSize);
|
||||
void AVX128_VectorUnaryImpl(OpcodeArgs, IR::OpSize SrcSize, IR::OpSize ElementSize, std::function<Ref(IR::OpSize ElementSize, Ref Src)> Helper);
|
||||
void AVX128_VectorBinaryImpl(OpcodeArgs, size_t SrcSize, IR::OpSize ElementSize,
|
||||
void AVX128_VectorBinaryImpl(OpcodeArgs, IR::OpSize SrcSize, IR::OpSize ElementSize,
|
||||
std::function<Ref(IR::OpSize ElementSize, Ref Src1, Ref Src2)> Helper);
|
||||
void AVX128_VectorShiftWideImpl(OpcodeArgs, IR::OpSize ElementSize, IROps IROp);
|
||||
void AVX128_VectorShiftImmImpl(OpcodeArgs, IR::OpSize ElementSize, IROps IROp);
|
||||
void AVX128_VectorTrinaryImpl(OpcodeArgs, size_t SrcSize, size_t ElementSize, Ref Src3,
|
||||
std::function<Ref(size_t ElementSize, Ref Src1, Ref Src2, Ref Src3)> Helper);
|
||||
void AVX128_VectorTrinaryImpl(OpcodeArgs, IR::OpSize SrcSize, IR::OpSize ElementSize, Ref Src3,
|
||||
std::function<Ref(IR::OpSize ElementSize, Ref Src1, Ref Src2, Ref Src3)> Helper);
|
||||
|
||||
enum class ShiftDirection { RIGHT, LEFT };
|
||||
void AVX128_ShiftDoubleImm(OpcodeArgs, ShiftDirection Dir);
|
||||
@ -993,7 +993,7 @@ public:
|
||||
template<IR::OpSize ElementSize>
|
||||
void AVX128_PExtr(OpcodeArgs);
|
||||
void AVX128_ExtendVectorElements(OpcodeArgs, IR::OpSize ElementSize, IR::OpSize DstElementSize, bool Signed);
|
||||
template<size_t ElementSize>
|
||||
template<IR::OpSize ElementSize>
|
||||
void AVX128_MOVMSK(OpcodeArgs);
|
||||
void AVX128_MOVMSKB(OpcodeArgs);
|
||||
void AVX128_PINSRImpl(OpcodeArgs, IR::OpSize ElementSize, const X86Tables::DecodedOperand& Src1Op,
|
||||
@ -1065,7 +1065,7 @@ public:
|
||||
template<IR::OpSize ElementSize>
|
||||
void AVX128_VSHUF(OpcodeArgs);
|
||||
|
||||
template<size_t ElementSize>
|
||||
template<IR::OpSize ElementSize>
|
||||
void AVX128_VPERMILImm(OpcodeArgs);
|
||||
|
||||
template<IROps IROp, IR::OpSize ElementSize>
|
||||
@ -1137,7 +1137,7 @@ public:
|
||||
void StoreResult_WithAVXInsert(VectorOpType Type, FEXCore::IR::RegisterClassType Class, FEXCore::X86Tables::DecodedOp Op, Ref Value,
|
||||
IR::OpSize Align, MemoryAccessType AccessType = MemoryAccessType::DEFAULT) {
|
||||
if (Op->Dest.IsGPR() && Op->Dest.Data.GPR.GPR >= X86State::REG_XMM_0 && Op->Dest.Data.GPR.GPR <= X86State::REG_XMM_15 &&
|
||||
GetGuestVectorLength() == Core::CPUState::XMM_AVX_REG_SIZE && Type == VectorOpType::SSE) {
|
||||
GetGuestVectorLength() == OpSize::i256Bit && Type == VectorOpType::SSE) {
|
||||
const auto gpr = Op->Dest.Data.GPR.GPR;
|
||||
const auto gprIndex = gpr - X86State::REG_XMM_0;
|
||||
auto DestVector = LoadXMMRegister(gprIndex);
|
||||
@ -1150,7 +1150,7 @@ public:
|
||||
}
|
||||
|
||||
void StoreXMMRegister_WithAVXInsert(VectorOpType Type, uint32_t XMM, Ref Value) {
|
||||
if (GetGuestVectorLength() == Core::CPUState::XMM_AVX_REG_SIZE && Type == VectorOpType::SSE) {
|
||||
if (GetGuestVectorLength() == OpSize::i256Bit && Type == VectorOpType::SSE) {
|
||||
///< SSE vector stores need to insert in the low 128-bit lane of the 256-bit register.
|
||||
auto DestVector = LoadXMMRegister(XMM);
|
||||
Value = _VInsElement(GetGuestVectorLength(), OpSize::i128Bit, 0, 0, DestVector, Value);
|
||||
@ -1233,12 +1233,14 @@ public:
|
||||
// Use stp where possible to store multiple values at a time. This accelerates AVX.
|
||||
// TODO: this is all really confusing because of backwards iteration,
|
||||
// can we peel back that hack?
|
||||
if ((Bits & NextBit) && !Partial && Size >= 4 && CacheIndexToContextOffset(Index - 1) == Offset - Size && (Offset - Size) / Size < 64) {
|
||||
const auto SizeInt = IR::OpSizeToSize(Size);
|
||||
if ((Bits & NextBit) && !Partial && Size >= OpSize::i32Bit && CacheIndexToContextOffset(Index - 1) == Offset - SizeInt &&
|
||||
(Offset - SizeInt) / SizeInt < 64) {
|
||||
LOGMAN_THROW_A_FMT(CacheIndexClass(Index - 1) == Class, "construction");
|
||||
LOGMAN_THROW_A_FMT((Offset % Size) == 0, "construction");
|
||||
LOGMAN_THROW_A_FMT((Offset % SizeInt) == 0, "construction");
|
||||
Ref ValueNext = RegCache.Value[Index - 1];
|
||||
|
||||
_StoreContextPair(Size, Class, ValueNext, Value, Offset - Size);
|
||||
_StoreContextPair(Size, Class, ValueNext, Value, Offset - SizeInt);
|
||||
Bits &= ~NextBit;
|
||||
} else {
|
||||
_StoreContext(Size, Class, Value, Offset);
|
||||
@ -1380,7 +1382,7 @@ private:
|
||||
Ref InsertPSOpImpl(OpcodeArgs, const X86Tables::DecodedOperand& Src1, const X86Tables::DecodedOperand& Src2,
|
||||
const X86Tables::DecodedOperand& Imm);
|
||||
|
||||
Ref MPSADBWOpImpl(size_t SrcSize, Ref Src1, Ref Src2, uint8_t Select);
|
||||
Ref MPSADBWOpImpl(IR::OpSize SrcSize, Ref Src1, Ref Src2, uint8_t Select);
|
||||
|
||||
Ref PALIGNROpImpl(OpcodeArgs, const X86Tables::DecodedOperand& Src1, const X86Tables::DecodedOperand& Src2,
|
||||
const X86Tables::DecodedOperand& Imm, bool IsAVX);
|
||||
@ -1503,7 +1505,7 @@ private:
|
||||
Ref GetRelocatedPC(const FEXCore::X86Tables::DecodedOp& Op, int64_t Offset = 0);
|
||||
|
||||
Ref LoadEffectiveAddress(AddressMode A, bool AddSegmentBase, bool AllowUpperGarbage = false);
|
||||
AddressMode SelectAddressMode(AddressMode A, bool AtomicTSO, bool Vector, unsigned AccessSize);
|
||||
AddressMode SelectAddressMode(AddressMode A, bool AtomicTSO, bool Vector, IR::OpSize AccessSize);
|
||||
|
||||
bool IsOperandMem(const X86Tables::DecodedOperand& Operand, bool Load) {
|
||||
// Literals are immediates as sources but memory addresses as destinations.
|
||||
@ -1627,24 +1629,24 @@ private:
|
||||
NZCVDirty = true;
|
||||
}
|
||||
|
||||
void SetNZ_ZeroCV(unsigned SrcSize, Ref Res, bool SetPF = false) {
|
||||
void SetNZ_ZeroCV(IR::OpSize SrcSize, Ref Res, bool SetPF = false) {
|
||||
HandleNZ00Write();
|
||||
|
||||
// x - 0 = x. NZ set according to Res. C always set. V always unset. This
|
||||
// matches what we want since we want carry inverted.
|
||||
//
|
||||
// This is currently worse for 8/16-bit, but that should be optimized. TODO
|
||||
if (SrcSize >= 4) {
|
||||
if (SrcSize >= OpSize::i32Bit) {
|
||||
if (SetPF) {
|
||||
CalculatePF(_SubWithFlags(IR::SizeToOpSize(SrcSize), Res, _Constant(0)));
|
||||
CalculatePF(_SubWithFlags(SrcSize, Res, _Constant(0)));
|
||||
} else {
|
||||
_SubNZCV(IR::SizeToOpSize(SrcSize), Res, _Constant(0));
|
||||
_SubNZCV(SrcSize, Res, _Constant(0));
|
||||
}
|
||||
|
||||
PossiblySetNZCVBits |= 1u << IndexNZCV(FEXCore::X86State::RFLAG_CF_RAW_LOC);
|
||||
CFInverted = true;
|
||||
} else {
|
||||
_TestNZ(IR::SizeToOpSize(SrcSize), Res, Res);
|
||||
_TestNZ(SrcSize, Res, Res);
|
||||
CFInverted = false;
|
||||
|
||||
if (SetPF) {
|
||||
@ -1653,7 +1655,7 @@ private:
|
||||
}
|
||||
}
|
||||
|
||||
void SetNZP_ZeroCV(unsigned SrcSize, Ref Res) {
|
||||
void SetNZP_ZeroCV(IR::OpSize SrcSize, Ref Res) {
|
||||
SetNZ_ZeroCV(SrcSize, Res, true);
|
||||
}
|
||||
|
||||
@ -1705,8 +1707,8 @@ private:
|
||||
HandleNZCVWrite();
|
||||
CFInverted = true;
|
||||
|
||||
if (Size < 4) {
|
||||
_TestNZ(OpSize::i32Bit, Src, _InlineConstant((1u << (8 * Size)) - 1));
|
||||
if (Size < OpSize::i32Bit) {
|
||||
_TestNZ(OpSize::i32Bit, Src, _InlineConstant((1u << (IR::OpSizeAsBits(Size))) - 1));
|
||||
} else {
|
||||
_TestNZ(Size, Src, Src);
|
||||
}
|
||||
@ -1882,7 +1884,7 @@ private:
|
||||
LOGMAN_THROW_AA_FMT(Index < 64, "valid index");
|
||||
uint64_t Bit = (1ull << (uint64_t)Index);
|
||||
|
||||
if (Size == 16 && (RegCache.Partial & Bit)) {
|
||||
if (Size == OpSize::i128Bit && (RegCache.Partial & Bit)) {
|
||||
// We need to load the full register extend if we previously did a partial access.
|
||||
Ref Value = RegCache.Value[Index];
|
||||
Ref Full = _LoadContext(Size, RegClass, Offset);
|
||||
@ -1902,7 +1904,7 @@ private:
|
||||
RegCache.Value[Index] = _LoadContext(Size, RegClass, Offset);
|
||||
|
||||
// We may have done a partial load, this requires special handling.
|
||||
if (Size == 8) {
|
||||
if (Size == OpSize::i64Bit) {
|
||||
RegCache.Partial |= Bit;
|
||||
}
|
||||
} else if (Index == PFIndex) {
|
||||
@ -1938,12 +1940,13 @@ private:
|
||||
|
||||
// Try to load a pair into the cache
|
||||
uint64_t Bits = (3ull << (uint64_t)Index);
|
||||
if (((RegCache.Partial | RegCache.Cached) & Bits) == 0 && ((Offset / Size) < 64)) {
|
||||
const auto SizeInt = IR::OpSizeToSize(Size);
|
||||
if (((RegCache.Partial | RegCache.Cached) & Bits) == 0 && ((Offset / SizeInt) < 64)) {
|
||||
auto Values = LoadContextPair_Uncached(RegClass, Size, Offset);
|
||||
RegCache.Value[Index] = Values.Low;
|
||||
RegCache.Value[Index + 1] = Values.High;
|
||||
RegCache.Cached |= Bits;
|
||||
if (Size == 8) {
|
||||
if (Size == OpSize::i64Bit) {
|
||||
RegCache.Partial |= Bits;
|
||||
}
|
||||
return Values;
|
||||
@ -1952,7 +1955,7 @@ private:
|
||||
// Fallback on a pair of loads
|
||||
return {
|
||||
.Low = LoadRegCache(Offset, Index, RegClass, Size),
|
||||
.High = LoadRegCache(Offset + Size, Index + 1, RegClass, Size),
|
||||
.High = LoadRegCache(Offset + SizeInt, Index + 1, RegClass, Size),
|
||||
};
|
||||
}
|
||||
|
||||
@ -2427,10 +2430,11 @@ private:
|
||||
}
|
||||
|
||||
AddressMode SelectPairAddressMode(AddressMode A, IR::OpSize Size) {
|
||||
const auto SizeInt = IR::OpSizeToSize(Size);
|
||||
AddressMode Out {};
|
||||
|
||||
signed OffsetEl = A.Offset / Size;
|
||||
if ((A.Offset % Size) == 0 && OffsetEl >= -64 && OffsetEl < 64) {
|
||||
signed OffsetEl = A.Offset / SizeInt;
|
||||
if ((A.Offset % SizeInt) == 0 && OffsetEl >= -64 && OffsetEl < 64) {
|
||||
Out.Offset = A.Offset;
|
||||
A.Offset = 0;
|
||||
}
|
||||
@ -2477,6 +2481,7 @@ private:
|
||||
|
||||
void _StoreMemPairAutoTSO(FEXCore::IR::RegisterClassType Class, IR::OpSize Size, AddressMode A, Ref Value1, Ref Value2,
|
||||
IR::OpSize Align = IR::OpSize::i8Bit) {
|
||||
const auto SizeInt = IR::OpSizeToSize(Size);
|
||||
bool AtomicTSO = IsTSOEnabled(Class) && !A.NonTSO;
|
||||
|
||||
// Use stp if possible, otherwise fallback on two stores.
|
||||
@ -2485,7 +2490,7 @@ private:
|
||||
_StoreMemPair(Class, Size, Value1, Value2, A.Base, A.Offset);
|
||||
} else {
|
||||
_StoreMemAutoTSO(Class, Size, A, Value1, OpSize::i8Bit);
|
||||
A.Offset += Size;
|
||||
A.Offset += SizeInt;
|
||||
_StoreMemAutoTSO(Class, Size, A, Value2, OpSize::i8Bit);
|
||||
}
|
||||
}
|
||||
|
@ -74,8 +74,8 @@ void OpDispatchBuilder::InstallAVX128Handlers() {
|
||||
{OPD(1, 0b00, 0x2F), 1, &OpDispatchBuilder::AVX128_UCOMISx<OpSize::i32Bit>},
|
||||
{OPD(1, 0b01, 0x2F), 1, &OpDispatchBuilder::AVX128_UCOMISx<OpSize::i64Bit>},
|
||||
|
||||
{OPD(1, 0b00, 0x50), 1, &OpDispatchBuilder::AVX128_MOVMSK<4>},
|
||||
{OPD(1, 0b01, 0x50), 1, &OpDispatchBuilder::AVX128_MOVMSK<8>},
|
||||
{OPD(1, 0b00, 0x50), 1, &OpDispatchBuilder::AVX128_MOVMSK<OpSize::i32Bit>},
|
||||
{OPD(1, 0b01, 0x50), 1, &OpDispatchBuilder::AVX128_MOVMSK<OpSize::i64Bit>},
|
||||
|
||||
{OPD(1, 0b00, 0x51), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorUnary, IR::OP_VFSQRT, OpSize::i32Bit>},
|
||||
{OPD(1, 0b01, 0x51), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorUnary, IR::OP_VFSQRT, OpSize::i64Bit>},
|
||||
@ -158,7 +158,7 @@ void OpDispatchBuilder::InstallAVX128Handlers() {
|
||||
{OPD(1, 0b01, 0x6F), 1, &OpDispatchBuilder::AVX128_VMOVAPS},
|
||||
{OPD(1, 0b10, 0x6F), 1, &OpDispatchBuilder::AVX128_VMOVAPS},
|
||||
|
||||
{OPD(1, 0b01, 0x70), 1, &OpDispatchBuilder::AVX128_VPERMILImm<4>},
|
||||
{OPD(1, 0b01, 0x70), 1, &OpDispatchBuilder::AVX128_VPERMILImm<OpSize::i32Bit>},
|
||||
{OPD(1, 0b10, 0x70), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VPSHUFW, false>},
|
||||
{OPD(1, 0b11, 0x70), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VPSHUFW, true>},
|
||||
|
||||
@ -379,8 +379,8 @@ void OpDispatchBuilder::InstallAVX128Handlers() {
|
||||
{OPD(3, 0b01, 0x00), 1, &OpDispatchBuilder::AVX128_VPERMQ},
|
||||
{OPD(3, 0b01, 0x01), 1, &OpDispatchBuilder::AVX128_VPERMQ},
|
||||
{OPD(3, 0b01, 0x02), 1, &OpDispatchBuilder::AVX128_VBLEND<OpSize::i32Bit>},
|
||||
{OPD(3, 0b01, 0x04), 1, &OpDispatchBuilder::AVX128_VPERMILImm<4>},
|
||||
{OPD(3, 0b01, 0x05), 1, &OpDispatchBuilder::AVX128_VPERMILImm<8>},
|
||||
{OPD(3, 0b01, 0x04), 1, &OpDispatchBuilder::AVX128_VPERMILImm<OpSize::i32Bit>},
|
||||
{OPD(3, 0b01, 0x05), 1, &OpDispatchBuilder::AVX128_VPERMILImm<OpSize::i64Bit>},
|
||||
{OPD(3, 0b01, 0x06), 1, &OpDispatchBuilder::AVX128_VPERM2},
|
||||
{OPD(3, 0b01, 0x08), 1, &OpDispatchBuilder::AVX128_VectorRound<OpSize::i32Bit>},
|
||||
{OPD(3, 0b01, 0x09), 1, &OpDispatchBuilder::AVX128_VectorRound<OpSize::i64Bit>},
|
||||
@ -665,7 +665,7 @@ void OpDispatchBuilder::AVX128_VectorUnary(OpcodeArgs, IROps IROp, IR::OpSize El
|
||||
|
||||
void OpDispatchBuilder::AVX128_VectorUnaryImpl(OpcodeArgs, IR::OpSize SrcSize, IR::OpSize ElementSize,
|
||||
std::function<Ref(IR::OpSize ElementSize, Ref Src)> Helper) {
|
||||
const auto Is128Bit = SrcSize == Core::CPUState::XMM_SSE_REG_SIZE;
|
||||
const auto Is128Bit = SrcSize == OpSize::i128Bit;
|
||||
|
||||
auto Src = AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, !Is128Bit);
|
||||
RefPair Result {};
|
||||
@ -680,9 +680,9 @@ void OpDispatchBuilder::AVX128_VectorUnaryImpl(OpcodeArgs, IR::OpSize SrcSize, I
|
||||
AVX128_StoreResult_WithOpSize(Op, Op->Dest, Result);
|
||||
}
|
||||
|
||||
void OpDispatchBuilder::AVX128_VectorBinaryImpl(OpcodeArgs, size_t SrcSize, IR::OpSize ElementSize,
|
||||
void OpDispatchBuilder::AVX128_VectorBinaryImpl(OpcodeArgs, IR::OpSize SrcSize, IR::OpSize ElementSize,
|
||||
std::function<Ref(IR::OpSize ElementSize, Ref Src1, Ref Src2)> Helper) {
|
||||
const auto Is128Bit = SrcSize == Core::CPUState::XMM_SSE_REG_SIZE;
|
||||
const auto Is128Bit = SrcSize == OpSize::i128Bit;
|
||||
|
||||
auto Src1 = AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, !Is128Bit);
|
||||
auto Src2 = AVX128_LoadSource_WithOpSize(Op, Op->Src[1], Op->Flags, !Is128Bit);
|
||||
@ -698,9 +698,9 @@ void OpDispatchBuilder::AVX128_VectorBinaryImpl(OpcodeArgs, size_t SrcSize, IR::
|
||||
AVX128_StoreResult_WithOpSize(Op, Op->Dest, Result);
|
||||
}
|
||||
|
||||
void OpDispatchBuilder::AVX128_VectorTrinaryImpl(OpcodeArgs, size_t SrcSize, size_t ElementSize, Ref Src3,
|
||||
std::function<Ref(size_t ElementSize, Ref Src1, Ref Src2, Ref Src3)> Helper) {
|
||||
const auto Is128Bit = SrcSize == Core::CPUState::XMM_SSE_REG_SIZE;
|
||||
void OpDispatchBuilder::AVX128_VectorTrinaryImpl(OpcodeArgs, IR::OpSize SrcSize, IR::OpSize ElementSize, Ref Src3,
|
||||
std::function<Ref(IR::OpSize ElementSize, Ref Src1, Ref Src2, Ref Src3)> Helper) {
|
||||
const auto Is128Bit = SrcSize == OpSize::i128Bit;
|
||||
|
||||
auto Src1 = AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, !Is128Bit);
|
||||
auto Src2 = AVX128_LoadSource_WithOpSize(Op, Op->Src[1], Op->Flags, !Is128Bit);
|
||||
@ -984,13 +984,13 @@ void OpDispatchBuilder::AVX128_VBROADCAST(OpcodeArgs) {
|
||||
|
||||
template<IR::OpSize ElementSize>
|
||||
void OpDispatchBuilder::AVX128_VPUNPCKL(OpcodeArgs) {
|
||||
AVX128_VectorBinaryImpl(Op, GetSrcSize(Op), ElementSize,
|
||||
AVX128_VectorBinaryImpl(Op, OpSizeFromSrc(Op), ElementSize,
|
||||
[this](IR::OpSize _ElementSize, Ref Src1, Ref Src2) { return _VZip(OpSize::i128Bit, _ElementSize, Src1, Src2); });
|
||||
}
|
||||
|
||||
template<IR::OpSize ElementSize>
|
||||
void OpDispatchBuilder::AVX128_VPUNPCKH(OpcodeArgs) {
|
||||
AVX128_VectorBinaryImpl(Op, GetSrcSize(Op), ElementSize,
|
||||
AVX128_VectorBinaryImpl(Op, OpSizeFromSrc(Op), ElementSize,
|
||||
[this](IR::OpSize _ElementSize, Ref Src1, Ref Src2) { return _VZip2(OpSize::i128Bit, _ElementSize, Src1, Src2); });
|
||||
}
|
||||
|
||||
@ -1039,7 +1039,7 @@ void OpDispatchBuilder::AVX128_InsertCVTGPR_To_FPR(OpcodeArgs) {
|
||||
Result.Low = _VSToFVectorInsert(DstSize, DstElementSize, DstElementSize, Src1.Low, Src2.Low, false, false);
|
||||
}
|
||||
|
||||
[[maybe_unused]] const auto Is128Bit = DstSize == Core::CPUState::XMM_SSE_REG_SIZE;
|
||||
[[maybe_unused]] const auto Is128Bit = DstSize == OpSize::i128Bit;
|
||||
LOGMAN_THROW_A_FMT(Is128Bit, "Programming Error: This should never occur!");
|
||||
Result.High = LoadZeroVector(OpSize::i128Bit);
|
||||
|
||||
@ -1073,33 +1073,33 @@ void OpDispatchBuilder::AVX128_CVTFPR_To_GPR(OpcodeArgs) {
|
||||
}
|
||||
|
||||
void OpDispatchBuilder::AVX128_VANDN(OpcodeArgs) {
|
||||
AVX128_VectorBinaryImpl(Op, GetSrcSize(Op), OpSize::i128Bit,
|
||||
AVX128_VectorBinaryImpl(Op, OpSizeFromSrc(Op), OpSize::i128Bit,
|
||||
[this](IR::OpSize _ElementSize, Ref Src1, Ref Src2) { return _VAndn(OpSize::i128Bit, _ElementSize, Src2, Src1); });
|
||||
}
|
||||
|
||||
template<IR::OpSize ElementSize>
|
||||
void OpDispatchBuilder::AVX128_VPACKSS(OpcodeArgs) {
|
||||
AVX128_VectorBinaryImpl(Op, GetSrcSize(Op), ElementSize, [this](IR::OpSize _ElementSize, Ref Src1, Ref Src2) {
|
||||
AVX128_VectorBinaryImpl(Op, OpSizeFromSrc(Op), ElementSize, [this](IR::OpSize _ElementSize, Ref Src1, Ref Src2) {
|
||||
return _VSQXTNPair(OpSize::i128Bit, _ElementSize, Src1, Src2);
|
||||
});
|
||||
}
|
||||
|
||||
template<IR::OpSize ElementSize>
|
||||
void OpDispatchBuilder::AVX128_VPACKUS(OpcodeArgs) {
|
||||
AVX128_VectorBinaryImpl(Op, GetSrcSize(Op), ElementSize, [this](IR::OpSize _ElementSize, Ref Src1, Ref Src2) {
|
||||
AVX128_VectorBinaryImpl(Op, OpSizeFromSrc(Op), ElementSize, [this](IR::OpSize _ElementSize, Ref Src1, Ref Src2) {
|
||||
return _VSQXTUNPair(OpSize::i128Bit, _ElementSize, Src1, Src2);
|
||||
});
|
||||
}
|
||||
|
||||
Ref OpDispatchBuilder::AVX128_PSIGNImpl(IR::OpSize ElementSize, Ref Src1, Ref Src2) {
|
||||
Ref Control = _VSQSHL(OpSize::i128Bit, ElementSize, Src2, (ElementSize * 8) - 1);
|
||||
Control = _VSRSHR(OpSize::i128Bit, ElementSize, Control, (ElementSize * 8) - 1);
|
||||
Ref Control = _VSQSHL(OpSize::i128Bit, ElementSize, Src2, IR::OpSizeAsBits(ElementSize) - 1);
|
||||
Control = _VSRSHR(OpSize::i128Bit, ElementSize, Control, IR::OpSizeAsBits(ElementSize) - 1);
|
||||
return _VMul(OpSize::i128Bit, ElementSize, Src1, Control);
|
||||
}
|
||||
|
||||
template<IR::OpSize ElementSize>
|
||||
void OpDispatchBuilder::AVX128_VPSIGN(OpcodeArgs) {
|
||||
AVX128_VectorBinaryImpl(Op, GetSrcSize(Op), ElementSize,
|
||||
AVX128_VectorBinaryImpl(Op, OpSizeFromSrc(Op), ElementSize,
|
||||
[this](IR::OpSize _ElementSize, Ref Src1, Ref Src2) { return AVX128_PSIGNImpl(_ElementSize, Src1, Src2); });
|
||||
}
|
||||
|
||||
@ -1154,7 +1154,7 @@ void OpDispatchBuilder::AVX128_VFCMP(OpcodeArgs) {
|
||||
.CompType = CompType,
|
||||
};
|
||||
|
||||
AVX128_VectorBinaryImpl(Op, GetSrcSize(Op), ElementSize, [this, &Capture](IR::OpSize _ElementSize, Ref Src1, Ref Src2) {
|
||||
AVX128_VectorBinaryImpl(Op, OpSizeFromSrc(Op), ElementSize, [this, &Capture](IR::OpSize _ElementSize, Ref Src1, Ref Src2) {
|
||||
return VFCMPOpImpl(OpSize::i128Bit, _ElementSize, Src1, Src2, Capture.CompType);
|
||||
});
|
||||
}
|
||||
@ -1234,7 +1234,7 @@ void OpDispatchBuilder::AVX128_PExtr(OpcodeArgs) {
|
||||
}
|
||||
|
||||
// AVX version only operates on 128-bit.
|
||||
const uint8_t NumElements = std::min<uint8_t>(GetSrcSize(Op), OpSize::i128Bit) / OverridenElementSize;
|
||||
const uint8_t NumElements = IR::NumElements(std::min(OpSizeFromSrc(Op), OpSize::i128Bit), OverridenElementSize);
|
||||
Index &= NumElements - 1;
|
||||
|
||||
if (Op->Dest.IsGPR()) {
|
||||
@ -1251,14 +1251,14 @@ void OpDispatchBuilder::AVX128_PExtr(OpcodeArgs) {
|
||||
}
|
||||
|
||||
void OpDispatchBuilder::AVX128_ExtendVectorElements(OpcodeArgs, IR::OpSize ElementSize, IR::OpSize DstElementSize, bool Signed) {
|
||||
const auto DstSize = GetDstSize(Op);
|
||||
const auto DstSize = OpSizeFromDst(Op);
|
||||
|
||||
const auto GetSrc = [&] {
|
||||
if (Op->Src[0].IsGPR()) {
|
||||
return AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, false).Low;
|
||||
} else {
|
||||
// For memory operands the 256-bit variant loads twice the size specified in the table.
|
||||
const auto Is256Bit = DstSize == Core::CPUState::XMM_AVX_REG_SIZE;
|
||||
const auto Is256Bit = DstSize == OpSize::i256Bit;
|
||||
const auto SrcSize = OpSizeFromSrc(Op);
|
||||
const auto LoadSize = Is256Bit ? IR::SizeToOpSize(IR::OpSizeToSize(SrcSize) * 2) : SrcSize;
|
||||
|
||||
@ -1267,8 +1267,7 @@ void OpDispatchBuilder::AVX128_ExtendVectorElements(OpcodeArgs, IR::OpSize Eleme
};

auto Transform = [=, this](Ref Src) {
for (auto CurrentElementSize = ElementSize; CurrentElementSize != DstElementSize;
CurrentElementSize = IR::MultiplyOpSize(CurrentElementSize, 2)) {
for (auto CurrentElementSize = ElementSize; CurrentElementSize != DstElementSize; CurrentElementSize = CurrentElementSize << 1) {
if (Signed) {
Src = _VSXTL(OpSize::i128Bit, CurrentElementSize, Src);
} else {
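
The rewritten loop doubles CurrentElementSize with << 1 directly on the enum, which implies shift operators are defined for OpSize (the same goes for the ElementSize >> 1 spellings later in the patch). A hypothetical sketch of such overloads on the byte-width model from the earlier note; FEX's real definitions may differ:

#include <cstdint>

enum class OpSize : uint8_t { i8Bit = 1, i16Bit = 2, i32Bit = 4, i64Bit = 8, i128Bit = 16, i256Bit = 32 };

// Shifting the byte width doubles or halves the element size.
constexpr OpSize operator<<(OpSize Size, int Shift) {
  return static_cast<OpSize>(static_cast<uint8_t>(Size) << Shift);
}
constexpr OpSize operator>>(OpSize Size, int Shift) {
  return static_cast<OpSize>(static_cast<uint8_t>(Size) >> Shift);
}

static_assert((OpSize::i8Bit << 1) == OpSize::i16Bit);
static_assert((OpSize::i16Bit << 1) == OpSize::i32Bit);
static_assert((OpSize::i64Bit >> 1) == OpSize::i32Bit);
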
@ -1286,8 +1285,8 @@ void OpDispatchBuilder::AVX128_ExtendVectorElements(OpcodeArgs, IR::OpSize Eleme
|
||||
Result.Low = Transform(Src);
|
||||
} else {
|
||||
// 256-bit operation is a bit special. It splits the incoming source between lower and upper registers.
|
||||
size_t TotalElementCount = OpSize::i256Bit / DstElementSize;
|
||||
size_t TotalElementsToSplitSize = (TotalElementCount / 2) * ElementSize;
|
||||
size_t TotalElementCount = IR::NumElements(OpSize::i256Bit, DstElementSize);
|
||||
size_t TotalElementsToSplitSize = (TotalElementCount / 2) * IR::OpSizeToSize(ElementSize);
|
||||
|
||||
// Split the number of elements in half between lower and upper.
|
||||
Ref SrcHigh = _VDupElement(OpSize::i128Bit, IR::SizeToOpSize(TotalElementsToSplitSize), Src, 1);
|
||||
@ -1303,10 +1302,10 @@ void OpDispatchBuilder::AVX128_ExtendVectorElements(OpcodeArgs, IR::OpSize Eleme
|
||||
AVX128_StoreResult_WithOpSize(Op, Op->Dest, Result);
|
||||
}
|
||||
|
||||
template<size_t ElementSize>
|
||||
template<IR::OpSize ElementSize>
|
||||
void OpDispatchBuilder::AVX128_MOVMSK(OpcodeArgs) {
|
||||
const auto SrcSize = GetSrcSize(Op);
|
||||
const auto Is128Bit = SrcSize == Core::CPUState::XMM_SSE_REG_SIZE;
|
||||
const auto SrcSize = OpSizeFromSrc(Op);
|
||||
const auto Is128Bit = SrcSize == OpSize::i128Bit;
|
||||
|
||||
auto Src = AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, !Is128Bit);
|
||||
|
||||
@ -1385,7 +1384,7 @@ void OpDispatchBuilder::AVX128_MOVMSKB(OpcodeArgs) {
|
||||
|
||||
void OpDispatchBuilder::AVX128_PINSRImpl(OpcodeArgs, IR::OpSize ElementSize, const X86Tables::DecodedOperand& Src1Op,
|
||||
const X86Tables::DecodedOperand& Src2Op, const X86Tables::DecodedOperand& Imm) {
|
||||
const auto NumElements = OpSize::i128Bit / ElementSize;
|
||||
const auto NumElements = IR::NumElements(OpSize::i128Bit, ElementSize);
|
||||
const uint64_t Index = Imm.Literal() & (NumElements - 1);
|
||||
auto Src1 = AVX128_LoadSource_WithOpSize(Op, Src1Op, Op->Flags, false);
|
||||
|
||||
@ -1419,7 +1418,7 @@ void OpDispatchBuilder::AVX128_VPINSRDQ(OpcodeArgs) {
|
||||
}
|
||||
|
||||
void OpDispatchBuilder::AVX128_VariableShiftImpl(OpcodeArgs, IROps IROp) {
|
||||
AVX128_VectorBinaryImpl(Op, GetDstSize(Op), OpSizeFromSrc(Op), [this, IROp](IR::OpSize ElementSize, Ref Src1, Ref Src2) {
|
||||
AVX128_VectorBinaryImpl(Op, OpSizeFromDst(Op), OpSizeFromSrc(Op), [this, IROp](IR::OpSize ElementSize, Ref Src1, Ref Src2) {
|
||||
DeriveOp(Shift, IROp, _VUShr(OpSize::i128Bit, ElementSize, Src1, Src2, true));
|
||||
return Shift;
|
||||
});
|
||||
@ -1431,7 +1430,7 @@ void OpDispatchBuilder::AVX128_ShiftDoubleImm(OpcodeArgs, ShiftDirection Dir) {
|
||||
const bool Right = Dir == ShiftDirection::RIGHT;
|
||||
|
||||
const uint64_t Shift = Op->Src[1].Literal();
|
||||
const uint64_t ExtrShift = Right ? Shift : OpSize::i128Bit - Shift;
|
||||
const uint64_t ExtrShift = Right ? Shift : IR::OpSizeToSize(OpSize::i128Bit) - Shift;
|
||||
|
||||
auto Src = AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, !Is128Bit);
|
||||
|
||||
@ -1486,40 +1485,40 @@ void OpDispatchBuilder::AVX128_VINSERTPS(OpcodeArgs) {
|
||||
|
||||
template<IR::OpSize ElementSize>
|
||||
void OpDispatchBuilder::AVX128_VPHSUB(OpcodeArgs) {
|
||||
AVX128_VectorBinaryImpl(Op, GetDstSize(Op), ElementSize, [this](IR::OpSize _ElementSize, Ref Src1, Ref Src2) {
|
||||
AVX128_VectorBinaryImpl(Op, OpSizeFromDst(Op), ElementSize, [this](IR::OpSize _ElementSize, Ref Src1, Ref Src2) {
|
||||
return PHSUBOpImpl(OpSize::i128Bit, Src1, Src2, _ElementSize);
|
||||
});
|
||||
}
|
||||
|
||||
void OpDispatchBuilder::AVX128_VPHSUBSW(OpcodeArgs) {
|
||||
AVX128_VectorBinaryImpl(Op, GetDstSize(Op), OpSize::i16Bit,
|
||||
AVX128_VectorBinaryImpl(Op, OpSizeFromDst(Op), OpSize::i16Bit,
|
||||
[this](IR::OpSize _ElementSize, Ref Src1, Ref Src2) { return PHSUBSOpImpl(OpSize::i128Bit, Src1, Src2); });
|
||||
}
|
||||
|
||||
template<IR::OpSize ElementSize>
|
||||
void OpDispatchBuilder::AVX128_VADDSUBP(OpcodeArgs) {
|
||||
AVX128_VectorBinaryImpl(Op, GetDstSize(Op), ElementSize, [this](IR::OpSize _ElementSize, Ref Src1, Ref Src2) {
|
||||
AVX128_VectorBinaryImpl(Op, OpSizeFromDst(Op), ElementSize, [this](IR::OpSize _ElementSize, Ref Src1, Ref Src2) {
|
||||
return ADDSUBPOpImpl(OpSize::i128Bit, _ElementSize, Src1, Src2);
|
||||
});
|
||||
}
|
||||
|
||||
template<IR::OpSize ElementSize, bool Signed>
|
||||
void OpDispatchBuilder::AVX128_VPMULL(OpcodeArgs) {
|
||||
static_assert(ElementSize == sizeof(uint32_t), "Currently only handles 32-bit -> 64-bit");
|
||||
static_assert(ElementSize == OpSize::i32Bit, "Currently only handles 32-bit -> 64-bit");
|
||||
|
||||
AVX128_VectorBinaryImpl(Op, GetDstSize(Op), ElementSize, [this](IR::OpSize _ElementSize, Ref Src1, Ref Src2) -> Ref {
|
||||
AVX128_VectorBinaryImpl(Op, OpSizeFromDst(Op), ElementSize, [this](IR::OpSize _ElementSize, Ref Src1, Ref Src2) -> Ref {
|
||||
return PMULLOpImpl(OpSize::i128Bit, ElementSize, Signed, Src1, Src2);
|
||||
});
|
||||
}
|
||||
|
||||
void OpDispatchBuilder::AVX128_VPMULHRSW(OpcodeArgs) {
|
||||
AVX128_VectorBinaryImpl(Op, GetDstSize(Op), OpSize::i16Bit,
|
||||
AVX128_VectorBinaryImpl(Op, OpSizeFromDst(Op), OpSize::i16Bit,
|
||||
[this](IR::OpSize _ElementSize, Ref Src1, Ref Src2) -> Ref { return PMULHRSWOpImpl(OpSize::i128Bit, Src1, Src2); });
|
||||
}
|
||||
|
||||
template<bool Signed>
|
||||
void OpDispatchBuilder::AVX128_VPMULHW(OpcodeArgs) {
|
||||
AVX128_VectorBinaryImpl(Op, GetDstSize(Op), OpSize::i16Bit, [this](IR::OpSize _ElementSize, Ref Src1, Ref Src2) -> Ref {
|
||||
AVX128_VectorBinaryImpl(Op, OpSizeFromDst(Op), OpSize::i16Bit, [this](IR::OpSize _ElementSize, Ref Src1, Ref Src2) -> Ref {
|
||||
if (Signed) {
|
||||
return _VSMulH(OpSize::i128Bit, _ElementSize, Src1, Src2);
|
||||
} else {
|
||||
@ -1546,9 +1545,9 @@ void OpDispatchBuilder::AVX128_Vector_CVT_Float_To_Float(OpcodeArgs) {
|
||||
const auto SrcSize = OpSizeFromSrc(Op);
|
||||
const auto DstSize = OpSizeFromDst(Op);
|
||||
|
||||
const auto IsFloatSrc = SrcElementSize == 4;
|
||||
auto Is128BitSrc = SrcSize == Core::CPUState::XMM_SSE_REG_SIZE;
|
||||
auto Is128BitDst = DstSize == Core::CPUState::XMM_SSE_REG_SIZE;
|
||||
const auto IsFloatSrc = SrcElementSize == OpSize::i32Bit;
|
||||
auto Is128BitSrc = SrcSize == OpSize::i128Bit;
|
||||
auto Is128BitDst = DstSize == OpSize::i128Bit;
|
||||
|
||||
///< Decompose correctly.
|
||||
if (DstElementSize > SrcElementSize && !Is128BitDst) {
|
||||
@ -1630,7 +1629,7 @@ void OpDispatchBuilder::AVX128_Vector_CVT_Float_To_Int(OpcodeArgs) {
|
||||
auto Convert = [this](Ref Src) -> Ref {
|
||||
auto ElementSize = SrcElementSize;
|
||||
if (Narrow) {
|
||||
ElementSize = IR::DivideOpSize(ElementSize, 2);
|
||||
ElementSize = ElementSize >> 1;
|
||||
Src = _Vector_FToF(OpSize::i128Bit, ElementSize, Src, SrcElementSize);
|
||||
}
|
||||
|
||||
@ -1663,7 +1662,7 @@ void OpDispatchBuilder::AVX128_Vector_CVT_Float_To_Int(OpcodeArgs) {
|
||||
template<IR::OpSize SrcElementSize, bool Widen>
|
||||
void OpDispatchBuilder::AVX128_Vector_CVT_Int_To_Float(OpcodeArgs) {
|
||||
const auto Size = OpSizeFromDst(Op);
|
||||
const auto Is128Bit = Size == Core::CPUState::XMM_SSE_REG_SIZE;
|
||||
const auto Is128Bit = Size == OpSize::i128Bit;
|
||||
|
||||
RefPair Src = [&] {
|
||||
if (Widen && !Op->Src[0].IsGPR()) {
|
||||
@ -1682,7 +1681,7 @@ void OpDispatchBuilder::AVX128_Vector_CVT_Int_To_Float(OpcodeArgs) {
|
||||
if (Widen) {
|
||||
DeriveOp(Extended, Op, _VSXTL(OpSize::i128Bit, ElementSize, Src));
|
||||
Src = Extended;
|
||||
ElementSize = IR::MultiplyOpSize(ElementSize, 2);
|
||||
ElementSize = ElementSize << 1;
|
||||
}
|
||||
|
||||
return _Vector_SToF(OpSize::i128Bit, ElementSize, Src);
|
||||
@ -1732,23 +1731,23 @@ void OpDispatchBuilder::AVX128_VAESImc(OpcodeArgs) {
|
||||
}
|
||||
|
||||
void OpDispatchBuilder::AVX128_VAESEnc(OpcodeArgs) {
|
||||
AVX128_VectorTrinaryImpl(Op, GetDstSize(Op), OpSize::i128Bit, LoadZeroVector(OpSize::i128Bit),
|
||||
[this](size_t, Ref Src1, Ref Src2, Ref Src3) { return _VAESEnc(OpSize::i128Bit, Src1, Src2, Src3); });
|
||||
AVX128_VectorTrinaryImpl(Op, OpSizeFromDst(Op), OpSize::i128Bit, LoadZeroVector(OpSize::i128Bit),
|
||||
[this](IR::OpSize, Ref Src1, Ref Src2, Ref Src3) { return _VAESEnc(OpSize::i128Bit, Src1, Src2, Src3); });
|
||||
}
|
||||
|
||||
void OpDispatchBuilder::AVX128_VAESEncLast(OpcodeArgs) {
|
||||
AVX128_VectorTrinaryImpl(Op, GetDstSize(Op), OpSize::i128Bit, LoadZeroVector(OpSize::i128Bit),
|
||||
[this](size_t, Ref Src1, Ref Src2, Ref Src3) { return _VAESEncLast(OpSize::i128Bit, Src1, Src2, Src3); });
|
||||
AVX128_VectorTrinaryImpl(Op, OpSizeFromDst(Op), OpSize::i128Bit, LoadZeroVector(OpSize::i128Bit),
|
||||
[this](IR::OpSize, Ref Src1, Ref Src2, Ref Src3) { return _VAESEncLast(OpSize::i128Bit, Src1, Src2, Src3); });
|
||||
}
|
||||
|
||||
void OpDispatchBuilder::AVX128_VAESDec(OpcodeArgs) {
|
||||
AVX128_VectorTrinaryImpl(Op, GetDstSize(Op), OpSize::i128Bit, LoadZeroVector(OpSize::i128Bit),
|
||||
[this](size_t, Ref Src1, Ref Src2, Ref Src3) { return _VAESDec(OpSize::i128Bit, Src1, Src2, Src3); });
|
||||
AVX128_VectorTrinaryImpl(Op, OpSizeFromDst(Op), OpSize::i128Bit, LoadZeroVector(OpSize::i128Bit),
|
||||
[this](IR::OpSize, Ref Src1, Ref Src2, Ref Src3) { return _VAESDec(OpSize::i128Bit, Src1, Src2, Src3); });
|
||||
}
|
||||
|
||||
void OpDispatchBuilder::AVX128_VAESDecLast(OpcodeArgs) {
|
||||
AVX128_VectorTrinaryImpl(Op, GetDstSize(Op), OpSize::i128Bit, LoadZeroVector(OpSize::i128Bit),
|
||||
[this](size_t, Ref Src1, Ref Src2, Ref Src3) { return _VAESDecLast(OpSize::i128Bit, Src1, Src2, Src3); });
|
||||
AVX128_VectorTrinaryImpl(Op, OpSizeFromDst(Op), OpSize::i128Bit, LoadZeroVector(OpSize::i128Bit),
|
||||
[this](IR::OpSize, Ref Src1, Ref Src2, Ref Src3) { return _VAESDecLast(OpSize::i128Bit, Src1, Src2, Src3); });
|
||||
}
|
||||
|
||||
void OpDispatchBuilder::AVX128_VAESKeyGenAssist(OpcodeArgs) {
|
||||
@ -1838,7 +1837,7 @@ template<IR::OpSize ElementSize>
|
||||
void OpDispatchBuilder::AVX128_VDPP(OpcodeArgs) {
|
||||
const uint64_t Literal = Op->Src[2].Literal();
|
||||
|
||||
AVX128_VectorBinaryImpl(Op, GetSrcSize(Op), ElementSize, [this, Literal](IR::OpSize, Ref Src1, Ref Src2) {
|
||||
AVX128_VectorBinaryImpl(Op, OpSizeFromSrc(Op), ElementSize, [this, Literal](IR::OpSize, Ref Src1, Ref Src2) {
|
||||
return DPPOpImpl(OpSize::i128Bit, Src1, Src2, Literal, ElementSize);
|
||||
});
|
||||
}
|
||||
@ -1927,7 +1926,7 @@ void OpDispatchBuilder::AVX128_VSHUF(OpcodeArgs) {
|
||||
AVX128_StoreResult_WithOpSize(Op, Op->Dest, Result);
|
||||
}
|
||||
|
||||
template<size_t ElementSize>
|
||||
template<IR::OpSize ElementSize>
|
||||
void OpDispatchBuilder::AVX128_VPERMILImm(OpcodeArgs) {
|
||||
const auto SrcSize = GetSrcSize(Op);
|
||||
const auto Is128Bit = SrcSize == Core::CPUState::XMM_SSE_REG_SIZE;
|
||||
@ -1967,31 +1966,31 @@ void OpDispatchBuilder::AVX128_VPERMILImm(OpcodeArgs) {
|
||||
|
||||
template<IROps IROp, IR::OpSize ElementSize>
|
||||
void OpDispatchBuilder::AVX128_VHADDP(OpcodeArgs) {
|
||||
AVX128_VectorBinaryImpl(Op, GetSrcSize(Op), ElementSize, [this](IR::OpSize, Ref Src1, Ref Src2) {
|
||||
AVX128_VectorBinaryImpl(Op, OpSizeFromSrc(Op), ElementSize, [this](IR::OpSize, Ref Src1, Ref Src2) {
|
||||
DeriveOp(Res, IROp, _VFAddP(OpSize::i128Bit, ElementSize, Src1, Src2));
|
||||
return Res;
|
||||
});
|
||||
}
|
||||
|
||||
void OpDispatchBuilder::AVX128_VPHADDSW(OpcodeArgs) {
|
||||
AVX128_VectorBinaryImpl(Op, GetDstSize(Op), OpSize::i16Bit,
|
||||
AVX128_VectorBinaryImpl(Op, OpSizeFromDst(Op), OpSize::i16Bit,
|
||||
[this](IR::OpSize _ElementSize, Ref Src1, Ref Src2) { return PHADDSOpImpl(OpSize::i128Bit, Src1, Src2); });
|
||||
}
|
||||
|
||||
void OpDispatchBuilder::AVX128_VPMADDUBSW(OpcodeArgs) {
|
||||
AVX128_VectorBinaryImpl(Op, GetSrcSize(Op), OpSize::i128Bit,
|
||||
AVX128_VectorBinaryImpl(Op, OpSizeFromDst(Op), OpSize::i128Bit,
|
||||
[this](IR::OpSize _ElementSize, Ref Src1, Ref Src2) { return PMADDUBSWOpImpl(OpSize::i128Bit, Src1, Src2); });
|
||||
}
|
||||
|
||||
void OpDispatchBuilder::AVX128_VPMADDWD(OpcodeArgs) {
|
||||
AVX128_VectorBinaryImpl(Op, GetSrcSize(Op), OpSize::i128Bit,
|
||||
AVX128_VectorBinaryImpl(Op, OpSizeFromDst(Op), OpSize::i128Bit,
|
||||
[this](IR::OpSize _ElementSize, Ref Src1, Ref Src2) { return PMADDWDOpImpl(OpSize::i128Bit, Src1, Src2); });
|
||||
}
|
||||
|
||||
template<IR::OpSize ElementSize>
|
||||
void OpDispatchBuilder::AVX128_VBLEND(OpcodeArgs) {
|
||||
const auto SrcSize = OpSizeFromSrc(Op);
|
||||
const auto Is128Bit = SrcSize == Core::CPUState::XMM_SSE_REG_SIZE;
|
||||
const auto Is128Bit = SrcSize == OpSize::i128Bit;
|
||||
const uint64_t Selector = Op->Src[2].Literal();
|
||||
|
||||
///< High Selector shift depends on element size:
|
||||
@ -2017,19 +2016,19 @@ void OpDispatchBuilder::AVX128_VBLEND(OpcodeArgs) {
|
||||
|
||||
template<IR::OpSize ElementSize>
|
||||
void OpDispatchBuilder::AVX128_VHSUBP(OpcodeArgs) {
|
||||
AVX128_VectorBinaryImpl(Op, GetDstSize(Op), ElementSize,
|
||||
AVX128_VectorBinaryImpl(Op, OpSizeFromDst(Op), ElementSize,
|
||||
[this](IR::OpSize, Ref Src1, Ref Src2) { return HSUBPOpImpl(OpSize::i128Bit, ElementSize, Src1, Src2); });
|
||||
}
|
||||
|
||||
void OpDispatchBuilder::AVX128_VPSHUFB(OpcodeArgs) {
|
||||
auto MaskVector = GeneratePSHUFBMask(OpSize::i128Bit);
|
||||
AVX128_VectorBinaryImpl(Op, GetDstSize(Op), OpSize::i8Bit, [this, MaskVector](IR::OpSize, Ref Src1, Ref Src2) {
|
||||
AVX128_VectorBinaryImpl(Op, OpSizeFromDst(Op), OpSize::i8Bit, [this, MaskVector](IR::OpSize, Ref Src1, Ref Src2) {
|
||||
return PSHUFBOpImpl(OpSize::i128Bit, Src1, Src2, MaskVector);
|
||||
});
|
||||
}
|
||||
|
||||
void OpDispatchBuilder::AVX128_VPSADBW(OpcodeArgs) {
|
||||
AVX128_VectorBinaryImpl(Op, GetDstSize(Op), OpSize::i8Bit,
|
||||
AVX128_VectorBinaryImpl(Op, OpSizeFromDst(Op), OpSize::i8Bit,
|
||||
[this](IR::OpSize, Ref Src1, Ref Src2) { return PSADBWOpImpl(OpSize::i128Bit, Src1, Src2); });
|
||||
}
|
||||
|
||||
@ -2061,7 +2060,7 @@ void OpDispatchBuilder::AVX128_VPALIGNR(OpcodeArgs) {
|
||||
const auto SanitizedDstSize = std::min(Size, OpSize::i128Bit);
|
||||
|
||||
AVX128_VectorBinaryImpl(Op, Size, SanitizedDstSize, [this, Index](IR::OpSize SanitizedDstSize, Ref Src1, Ref Src2) -> Ref {
|
||||
if (Index >= (SanitizedDstSize * 2)) {
|
||||
if (Index >= (IR::OpSizeToSize(SanitizedDstSize) * 2)) {
|
||||
// If the immediate is greater than both vectors combined then it zeroes the vector
|
||||
return LoadZeroVector(OpSize::i128Bit);
|
||||
}
|
||||
@ -2076,7 +2075,7 @@ void OpDispatchBuilder::AVX128_VPALIGNR(OpcodeArgs) {
|
||||
|
||||
void OpDispatchBuilder::AVX128_VMASKMOVImpl(OpcodeArgs, IR::OpSize ElementSize, IR::OpSize DstSize, bool IsStore,
|
||||
const X86Tables::DecodedOperand& MaskOp, const X86Tables::DecodedOperand& DataOp) {
|
||||
const auto Is128Bit = DstSize == Core::CPUState::XMM_SSE_REG_SIZE;
|
||||
const auto Is128Bit = DstSize == OpSize::i128Bit;
|
||||
|
||||
auto Mask = AVX128_LoadSource_WithOpSize(Op, MaskOp, Op->Flags, !Is128Bit);
|
||||
|
||||
@ -2098,14 +2097,14 @@ void OpDispatchBuilder::AVX128_VMASKMOVImpl(OpcodeArgs, IR::OpSize ElementSize,
|
||||
auto Address = MakeAddress(DataOp);
|
||||
|
||||
RefPair Result {};
|
||||
Result.Low = _VLoadVectorMasked(OpSize::i128Bit, ElementSize, Mask.Low, Address, Invalid(), MEM_OFFSET_SXTX, OpSize::i8Bit);
|
||||
Result.Low = _VLoadVectorMasked(OpSize::i128Bit, ElementSize, Mask.Low, Address, Invalid(), MEM_OFFSET_SXTX, 1);
|
||||
|
||||
if (Is128Bit) {
|
||||
Result.High = LoadZeroVector(OpSize::i128Bit);
|
||||
} else {
|
||||
///< TODO: This can be cleaner if AVX128_LoadSource_WithOpSize could return both constructed addresses.
|
||||
auto AddressHigh = _Add(OpSize::i64Bit, Address, _Constant(16));
|
||||
Result.High = _VLoadVectorMasked(OpSize::i128Bit, ElementSize, Mask.High, AddressHigh, Invalid(), MEM_OFFSET_SXTX, OpSize::i8Bit);
|
||||
Result.High = _VLoadVectorMasked(OpSize::i128Bit, ElementSize, Mask.High, AddressHigh, Invalid(), MEM_OFFSET_SXTX, 1);
|
||||
}
|
||||
AVX128_StoreResult_WithOpSize(Op, Op->Dest, Result);
|
||||
}
|
||||
@ -2124,7 +2123,7 @@ void OpDispatchBuilder::AVX128_VMASKMOV(OpcodeArgs) {
|
||||
void OpDispatchBuilder::AVX128_MASKMOV(OpcodeArgs) {
|
||||
///< This instruction only supports 128-bit.
|
||||
const auto Size = OpSizeFromSrc(Op);
|
||||
const auto Is128Bit = Size == Core::CPUState::XMM_SSE_REG_SIZE;
|
||||
const auto Is128Bit = Size == OpSize::i128Bit;
|
||||
|
||||
auto MaskSrc = AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, !Is128Bit);
|
||||
|
||||
@ -2147,11 +2146,9 @@ void OpDispatchBuilder::AVX128_MASKMOV(OpcodeArgs) {
|
||||
template<IR::OpSize ElementSize>
|
||||
void OpDispatchBuilder::AVX128_VectorVariableBlend(OpcodeArgs) {
|
||||
const auto Size = OpSizeFromSrc(Op);
|
||||
const auto Is128Bit = Size == Core::CPUState::XMM_SSE_REG_SIZE;
|
||||
const auto Is128Bit = Size == OpSize::i128Bit;
|
||||
const auto Src3Selector = Op->Src[2].Literal();
|
||||
|
||||
constexpr auto ElementSizeBits = ElementSize * 8;
|
||||
|
||||
auto Src1 = AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, !Is128Bit);
|
||||
auto Src2 = AVX128_LoadSource_WithOpSize(Op, Op->Src[1], Op->Flags, !Is128Bit);
|
||||
|
||||
@ -2163,6 +2160,7 @@ void OpDispatchBuilder::AVX128_VectorVariableBlend(OpcodeArgs) {
|
||||
}
|
||||
|
||||
auto Convert = [this](Ref Src1, Ref Src2, Ref Mask) {
|
||||
const auto ElementSizeBits = IR::OpSizeAsBits(ElementSize);
|
||||
Ref Shifted = _VSShrI(OpSize::i128Bit, ElementSize, Mask, ElementSizeBits - 1);
|
||||
return _VBSL(OpSize::i128Bit, Shifted, Src2, Src1);
|
||||
};
|
||||
@ -2248,7 +2246,7 @@ void OpDispatchBuilder::AVX128_VTESTP(OpcodeArgs) {
|
||||
Ref ZeroConst = _Constant(0);
|
||||
Ref OneConst = _Constant(1);
|
||||
|
||||
const auto ElementSizeInBits = ElementSize * 8;
|
||||
const auto ElementSizeInBits = IR::OpSizeAsBits(ElementSize);
|
||||
|
||||
{
|
||||
// Calculate ZF first.
|
||||
@ -2292,7 +2290,7 @@ void OpDispatchBuilder::AVX128_VTESTP(OpcodeArgs) {
|
||||
}
|
||||
|
||||
// As in PTest, this sets Z appropriately while zeroing the rest of NZCV.
|
||||
SetNZ_ZeroCV(32, ZF);
|
||||
SetNZ_ZeroCV(OpSize::i32Bit, ZF);
|
||||
SetCFInverted(CFInv);
|
||||
ZeroPF_AF();
|
||||
}
|
||||
@ -2339,14 +2337,14 @@ void OpDispatchBuilder::AVX128_PTest(OpcodeArgs) {
|
||||
// Set ZF according to Test1. SF will be zeroed since we do a 32-bit test on
|
||||
// the results of a 16-bit value from the UMaxV, so the 32-bit sign bit is
|
||||
// cleared even if the 16-bit scalars were negative.
|
||||
SetNZ_ZeroCV(32, Test1);
|
||||
SetNZ_ZeroCV(OpSize::i32Bit, Test1);
|
||||
SetCFInverted(Test2);
|
||||
ZeroPF_AF();
|
||||
}
|
||||
|
||||
template<IR::OpSize ElementSize>
|
||||
void OpDispatchBuilder::AVX128_VPERMILReg(OpcodeArgs) {
|
||||
AVX128_VectorBinaryImpl(Op, GetSrcSize(Op), ElementSize, [this](size_t _ElementSize, Ref Src, Ref Indices) {
|
||||
AVX128_VectorBinaryImpl(Op, OpSizeFromSrc(Op), ElementSize, [this](IR::OpSize _ElementSize, Ref Src, Ref Indices) {
|
||||
return VPERMILRegOpImpl(OpSize::i128Bit, ElementSize, Src, Indices);
|
||||
});
|
||||
}
|
||||
@ -2376,7 +2374,7 @@ void OpDispatchBuilder::AVX128_VPERMD(OpcodeArgs) {
|
||||
void OpDispatchBuilder::AVX128_VPCLMULQDQ(OpcodeArgs) {
|
||||
const auto Selector = static_cast<uint8_t>(Op->Src[2].Literal());
|
||||
|
||||
AVX128_VectorBinaryImpl(Op, GetSrcSize(Op), OpSize::iInvalid, [this, Selector](size_t _, Ref Src1, Ref Src2) {
|
||||
AVX128_VectorBinaryImpl(Op, OpSizeFromSrc(Op), OpSize::iInvalid, [this, Selector](IR::OpSize, Ref Src1, Ref Src2) {
|
||||
return _PCLMUL(OpSize::i128Bit, Src1, Src2, Selector & 0b1'0001);
|
||||
});
|
||||
}
|
||||
@ -2548,7 +2546,7 @@ void OpDispatchBuilder::AVX128_VFMAddSubImpl(OpcodeArgs, bool AddSub, uint8_t Sr
|
||||
OpDispatchBuilder::RefPair OpDispatchBuilder::AVX128_VPGatherImpl(OpSize Size, OpSize ElementLoadSize, OpSize AddrElementSize, RefPair Dest,
|
||||
RefPair Mask, RefVSIB VSIB) {
|
||||
LOGMAN_THROW_A_FMT(AddrElementSize == OpSize::i32Bit || AddrElementSize == OpSize::i64Bit, "Unknown address element size");
|
||||
const auto Is128Bit = Size == Core::CPUState::XMM_SSE_REG_SIZE;
|
||||
const auto Is128Bit = Size == OpSize::i128Bit;
|
||||
|
||||
///< BaseAddr doesn't need to exist, calculate that here.
|
||||
Ref BaseAddr = VSIB.BaseAddr;
|
||||
@ -2686,17 +2684,17 @@ OpDispatchBuilder::RefPair OpDispatchBuilder::AVX128_VPGatherQPSImpl(Ref Dest, R
template<OpSize AddrElementSize>
void OpDispatchBuilder::AVX128_VPGATHER(OpcodeArgs) {

const auto Size = GetDstSize(Op);
const auto Is128Bit = Size == Core::CPUState::XMM_SSE_REG_SIZE;
const auto Size = OpSizeFromDst(Op);
const auto Is128Bit = Size == OpSize::i128Bit;

///< Element size is determined by W flag.
const OpSize ElementLoadSize = Op->Flags & X86Tables::DecodeFlags::FLAG_OPTION_AVX_W ? OpSize::i64Bit : OpSize::i32Bit;

// We only need the high address register if the number of data elements is more than what the low half can consume.
// But also the number of address elements is clamped by the destination size as well.
const size_t NumDataElements = Size / ElementLoadSize;
const size_t NumAddrElementBytes = std::min<size_t>(Size, (NumDataElements * AddrElementSize));
const bool NeedsHighAddrBytes = NumAddrElementBytes > OpSize::i128Bit;
const size_t NumDataElements = IR::NumElements(Size, ElementLoadSize);
const size_t NumAddrElementBytes = std::min<size_t>(IR::OpSizeToSize(Size), (NumDataElements * IR::OpSizeToSize(AddrElementSize)));
const bool NeedsHighAddrBytes = NumAddrElementBytes > IR::OpSizeToSize(OpSize::i128Bit);

auto Dest = AVX128_LoadSource_WithOpSize(Op, Op->Dest, Op->Flags, !Is128Bit);
auto VSIB = AVX128_LoadVSIB(Op, Op->Src[0], Op->Flags, NeedsHighAddrBytes);
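
The element-count arithmetic above decides whether the upper address register is needed. A worked example with plain byte counts, for a VPGATHERDQ-style case (256-bit destination, 64-bit data elements, 32-bit indices); the concrete numbers are illustrative, not taken from the patch:

#include <algorithm>
#include <cstddef>
#include <cstdio>

int main() {
  const std::size_t DstBytes = 32;      // OpSize::i256Bit destination
  const std::size_t DataElemBytes = 8;  // ElementLoadSize = i64Bit (AVX.W set)
  const std::size_t AddrElemBytes = 4;  // AddrElementSize = i32Bit

  const std::size_t NumDataElements = DstBytes / DataElemBytes;                                 // 4
  const std::size_t NumAddrElementBytes = std::min(DstBytes, NumDataElements * AddrElemBytes);  // 16
  const bool NeedsHighAddrBytes = NumAddrElementBytes > 16;                                     // false

  // Four dword indices fit entirely in the low 128-bit half, so only the low
  // address register is required for this form.
  std::printf("%zu data elements, %zu index bytes, high half needed: %d\n",
              NumDataElements, NumAddrElementBytes, NeedsHighAddrBytes);
  return 0;
}
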
@ -2740,7 +2738,7 @@ void OpDispatchBuilder::AVX128_VPGATHER(OpcodeArgs) {
|
||||
} else if (AddrElementSize == OpSize::i64Bit && ElementLoadSize == OpSize::i32Bit) {
|
||||
Result = AVX128_VPGatherQPSImpl(Dest.Low, Mask.Low, VSIB);
|
||||
} else {
|
||||
Result = AVX128_VPGatherImpl(SizeToOpSize(Size), ElementLoadSize, AddrElementSize, Dest, Mask, VSIB);
|
||||
Result = AVX128_VPGatherImpl(Size, ElementLoadSize, AddrElementSize, Dest, Mask, VSIB);
|
||||
}
|
||||
AVX128_StoreResult_WithOpSize(Op, Op->Dest, Result);
|
||||
|
||||
@ -2754,8 +2752,8 @@ void OpDispatchBuilder::AVX128_VPGATHER(OpcodeArgs) {
|
||||
void OpDispatchBuilder::AVX128_VCVTPH2PS(OpcodeArgs) {
|
||||
const auto DstSize = OpSizeFromDst(Op);
|
||||
const auto SrcSize = IR::SizeToOpSize(IR::OpSizeToSize(DstSize) / 2);
|
||||
const auto Is128BitSrc = SrcSize == Core::CPUState::XMM_SSE_REG_SIZE;
|
||||
const auto Is128BitDst = DstSize == Core::CPUState::XMM_SSE_REG_SIZE;
|
||||
const auto Is128BitSrc = SrcSize == OpSize::i128Bit;
|
||||
const auto Is128BitDst = DstSize == OpSize::i128Bit;
|
||||
|
||||
RefPair Src {};
|
||||
if (Op->Src[0].IsGPR()) {
|
||||
@ -2783,7 +2781,7 @@ void OpDispatchBuilder::AVX128_VCVTPH2PS(OpcodeArgs) {
|
||||
|
||||
void OpDispatchBuilder::AVX128_VCVTPS2PH(OpcodeArgs) {
|
||||
const auto SrcSize = OpSizeFromSrc(Op);
|
||||
const auto Is128BitSrc = SrcSize == Core::CPUState::XMM_SSE_REG_SIZE;
|
||||
const auto Is128BitSrc = SrcSize == OpSize::i128Bit;
|
||||
const auto StoreSize = Op->Dest.IsGPR() ? OpSize::i128Bit : IR::SizeToOpSize(IR::OpSizeToSize(SrcSize) / 2);
|
||||
|
||||
const auto Imm8 = Op->Src[1].Literal();
|
||||
@ -2814,7 +2812,7 @@ void OpDispatchBuilder::AVX128_VCVTPS2PH(OpcodeArgs) {
|
||||
|
||||
// We need to eliminate upper junk if we're storing into a register with
|
||||
// a 256-bit source (VCVTPS2PH's destination for registers is an XMM).
|
||||
if (Op->Src[0].IsGPR() && SrcSize == Core::CPUState::XMM_AVX_REG_SIZE) {
|
||||
if (Op->Src[0].IsGPR() && SrcSize == OpSize::i256Bit) {
|
||||
Result = AVX128_Zext(Result.Low);
|
||||
}
|
|
||||
|
||||
void OpDispatchBuilder::VAESEncOp(OpcodeArgs) {
|
||||
const auto DstSize = OpSizeFromDst(Op);
|
||||
[[maybe_unused]] const auto Is128Bit = DstSize == Core::CPUState::XMM_SSE_REG_SIZE;
|
||||
[[maybe_unused]] const auto Is128Bit = DstSize == OpSize::i128Bit;
|
||||
|
||||
// TODO: Handle 256-bit VAESENC.
|
||||
LOGMAN_THROW_A_FMT(Is128Bit, "256-bit VAESENC unimplemented");
|
||||
@ -343,7 +343,7 @@ void OpDispatchBuilder::AESEncLastOp(OpcodeArgs) {
|
||||
|
||||
void OpDispatchBuilder::VAESEncLastOp(OpcodeArgs) {
|
||||
const auto DstSize = OpSizeFromDst(Op);
|
||||
[[maybe_unused]] const auto Is128Bit = DstSize == Core::CPUState::XMM_SSE_REG_SIZE;
|
||||
[[maybe_unused]] const auto Is128Bit = DstSize == OpSize::i128Bit;
|
||||
|
||||
// TODO: Handle 256-bit VAESENCLAST.
|
||||
LOGMAN_THROW_A_FMT(Is128Bit, "256-bit VAESENCLAST unimplemented");
|
||||
@ -364,7 +364,7 @@ void OpDispatchBuilder::AESDecOp(OpcodeArgs) {
|
||||
|
||||
void OpDispatchBuilder::VAESDecOp(OpcodeArgs) {
|
||||
const auto DstSize = OpSizeFromDst(Op);
|
||||
[[maybe_unused]] const auto Is128Bit = DstSize == Core::CPUState::XMM_SSE_REG_SIZE;
|
||||
[[maybe_unused]] const auto Is128Bit = DstSize == OpSize::i128Bit;
|
||||
|
||||
// TODO: Handle 256-bit VAESDEC.
|
||||
LOGMAN_THROW_A_FMT(Is128Bit, "256-bit VAESDEC unimplemented");
|
||||
@ -385,7 +385,7 @@ void OpDispatchBuilder::AESDecLastOp(OpcodeArgs) {
|
||||
|
||||
void OpDispatchBuilder::VAESDecLastOp(OpcodeArgs) {
|
||||
const auto DstSize = OpSizeFromDst(Op);
|
||||
[[maybe_unused]] const auto Is128Bit = DstSize == Core::CPUState::XMM_SSE_REG_SIZE;
|
||||
[[maybe_unused]] const auto Is128Bit = DstSize == OpSize::i128Bit;
|
||||
|
||||
// TODO: Handle 256-bit VAESDECLAST.
|
||||
LOGMAN_THROW_A_FMT(Is128Bit, "256-bit VAESDECLAST unimplemented");
|
||||
|
@ -139,8 +139,8 @@ Ref OpDispatchBuilder::GetPackedRFLAG(uint32_t FlagsMask) {
}

void OpDispatchBuilder::CalculateOF(IR::OpSize SrcSize, Ref Res, Ref Src1, Ref Src2, bool Sub) {
auto OpSize = SrcSize == 8 ? OpSize::i64Bit : OpSize::i32Bit;
uint64_t SignBit = (SrcSize * 8) - 1;
const auto OpSize = SrcSize == OpSize::i64Bit ? OpSize::i64Bit : OpSize::i32Bit;
const uint64_t SignBit = IR::OpSizeAsBits(SrcSize) - 1;
Ref Anded = nullptr;

// For add, OF is set iff the sources have the same sign but the destination
@ -171,7 +171,7 @@ void OpDispatchBuilder::CalculateOF(IR::OpSize SrcSize, Ref Res, Ref Src1, Ref S
}
}

SetRFLAG<FEXCore::X86State::RFLAG_OF_RAW_LOC>(Anded, SrcSize * 8 - 1, true);
SetRFLAG<FEXCore::X86State::RFLAG_OF_RAW_LOC>(Anded, SignBit, true);
}

Ref OpDispatchBuilder::LoadPFRaw(bool Mask, bool Invert) {
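
The comment above is stating the usual signed-overflow rule: for an add, OF is set when both sources share a sign and the result's sign differs. One standard way to express that (not necessarily the exact expression FEX builds into Anded) is ~(Src1 ^ Src2) & (Res ^ Src1), whose top bit is the flag; for subtraction the sign of Src2 is effectively flipped first. A scalar check at 8-bit width:

#include <cstdint>
#include <cstdio>

// Returns the OF bit for an 8-bit ADD via the top bit of ~(a ^ b) & (res ^ a).
static bool AddOverflows8(int8_t a, int8_t b) {
  const uint8_t res = static_cast<uint8_t>(a) + static_cast<uint8_t>(b);
  const uint8_t anded = ~(a ^ b) & (res ^ a);
  return (anded >> 7) & 1;
}

int main() {
  std::printf("%d %d %d\n",
              AddOverflows8(100, 100),    // 1: positive + positive wrapped negative
              AddOverflows8(-100, -100),  // 1: negative + negative wrapped positive
              AddOverflows8(100, -100));  // 0: differing signs never overflow
  return 0;
}
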
@ -265,7 +265,7 @@ Ref OpDispatchBuilder::IncrementByCarry(OpSize OpSize, Ref Src) {
Ref OpDispatchBuilder::CalculateFlags_ADC(IR::OpSize SrcSize, Ref Src1, Ref Src2) {
auto Zero = _InlineConstant(0);
auto One = _InlineConstant(1);
auto OpSize = SrcSize == 8 ? OpSize::i64Bit : OpSize::i32Bit;
auto OpSize = SrcSize == OpSize::i64Bit ? OpSize::i64Bit : OpSize::i32Bit;
Ref Res;

CalculateAF(Src1, Src2);
@ -277,7 +277,7 @@ Ref OpDispatchBuilder::CalculateFlags_ADC(IR::OpSize SrcSize, Ref Src1, Ref Src2
CFInverted = false;
} else {
// Need to zero-extend for correct comparisons below
Src2 = _Bfe(OpSize, SrcSize * 8, 0, Src2);
Src2 = _Bfe(OpSize, IR::OpSizeAsBits(SrcSize), 0, Src2);

// Note that we do not extend Src2PlusCF, since we depend on proper
// 32-bit arithmetic to correctly handle the Src2 = 0xffff case.
@ -285,7 +285,7 @@ Ref OpDispatchBuilder::CalculateFlags_ADC(IR::OpSize SrcSize, Ref Src1, Ref Src2

// Need to zero-extend for the comparison.
Res = _Add(OpSize, Src1, Src2PlusCF);
Res = _Bfe(OpSize, SrcSize * 8, 0, Res);
Res = _Bfe(OpSize, IR::OpSizeAsBits(SrcSize), 0, Res);

// TODO: We can fold that second Bfe in (cmp uxth).
auto SelectCFInv = _Select(FEXCore::IR::COND_UGE, Res, Src2PlusCF, One, Zero);
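
The COND_UGE select above recovers the carry from a masked-width add: after zero-extending Src2 but not Src2 + CF, the carry out is exactly Res < Src2 + CF, and the Src2 = 0xffff case called out in the comment is why the +CF sum must be allowed to reach 0x10000 in the wider register. A 16-bit scalar check of that identity (illustrative, not FEX code):

#include <cstdint>
#include <cstdio>

int main() {
  const uint32_t Src1 = 0x1234, Src2 = 0xffff, CFin = 1;
  const uint32_t Src2PlusCF = Src2 + CFin;            // 0x10000, deliberately not masked to 16 bits
  const uint32_t Res = (Src1 + Src2PlusCF) & 0xffff;  // the 16-bit ADC result
  const bool CF = Res < Src2PlusCF;                   // equivalently !(Res >= Src2PlusCF), as in the select
  std::printf("Res=%#x CF=%d\n", Res, CF);            // Res=0x1234 CF=1
  return 0;
}
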
@ -302,7 +302,7 @@ Ref OpDispatchBuilder::CalculateFlags_ADC(IR::OpSize SrcSize, Ref Src1, Ref Src2
|
||||
Ref OpDispatchBuilder::CalculateFlags_SBB(IR::OpSize SrcSize, Ref Src1, Ref Src2) {
|
||||
auto Zero = _InlineConstant(0);
|
||||
auto One = _InlineConstant(1);
|
||||
auto OpSize = SrcSize == 8 ? OpSize::i64Bit : OpSize::i32Bit;
|
||||
auto OpSize = SrcSize == OpSize::i64Bit ? OpSize::i64Bit : OpSize::i32Bit;
|
||||
|
||||
CalculateAF(Src1, Src2);
|
||||
|
||||
@ -316,13 +316,13 @@ Ref OpDispatchBuilder::CalculateFlags_SBB(IR::OpSize SrcSize, Ref Src1, Ref Src2
|
||||
CFInverted = true;
|
||||
} else {
|
||||
// Zero extend for correct comparison behaviour with Src1 = 0xffff.
|
||||
Src1 = _Bfe(OpSize, SrcSize * 8, 0, Src1);
|
||||
Src2 = _Bfe(OpSize, SrcSize * 8, 0, Src2);
|
||||
Src1 = _Bfe(OpSize, IR::OpSizeAsBits(SrcSize), 0, Src1);
|
||||
Src2 = _Bfe(OpSize, IR::OpSizeAsBits(SrcSize), 0, Src2);
|
||||
|
||||
auto Src2PlusCF = IncrementByCarry(OpSize, Src2);
|
||||
|
||||
Res = _Sub(OpSize, Src1, Src2PlusCF);
|
||||
Res = _Bfe(OpSize, SrcSize * 8, 0, Res);
|
||||
Res = _Bfe(OpSize, IR::OpSizeAsBits(SrcSize), 0, Res);
|
||||
|
||||
auto SelectCFInv = _Select(FEXCore::IR::COND_UGE, Src1, Src2PlusCF, One, Zero);
|
||||
|
||||
@ -345,9 +345,9 @@ Ref OpDispatchBuilder::CalculateFlags_SUB(IR::OpSize SrcSize, Ref Src1, Ref Src2
|
||||
|
||||
Ref Res;
|
||||
if (SrcSize >= OpSize::i32Bit) {
|
||||
Res = _SubWithFlags(IR::SizeToOpSize(SrcSize), Src1, Src2);
|
||||
Res = _SubWithFlags(SrcSize, Src1, Src2);
|
||||
} else {
|
||||
_SubNZCV(IR::SizeToOpSize(SrcSize), Src1, Src2);
|
||||
_SubNZCV(SrcSize, Src1, Src2);
|
||||
Res = _Sub(OpSize::i32Bit, Src1, Src2);
|
||||
}
|
||||
|
||||
@ -375,9 +375,9 @@ Ref OpDispatchBuilder::CalculateFlags_ADD(IR::OpSize SrcSize, Ref Src1, Ref Src2
|
||||
|
||||
Ref Res;
|
||||
if (SrcSize >= OpSize::i32Bit) {
|
||||
Res = _AddWithFlags(IR::SizeToOpSize(SrcSize), Src1, Src2);
|
||||
Res = _AddWithFlags(SrcSize, Src1, Src2);
|
||||
} else {
|
||||
_AddNZCV(IR::SizeToOpSize(SrcSize), Src1, Src2);
|
||||
_AddNZCV(SrcSize, Src1, Src2);
|
||||
Res = _Add(OpSize::i32Bit, Src1, Src2);
|
||||
}
|
||||
|
||||
@ -400,7 +400,7 @@ void OpDispatchBuilder::CalculateFlags_MUL(IR::OpSize SrcSize, Ref Res, Ref High
|
||||
|
||||
// CF and OF are set if the result of the operation can't be fit in to the destination register
|
||||
// If the value can fit then the top bits will be zero
|
||||
auto SignBit = _Sbfe(OpSize::i64Bit, 1, SrcSize * 8 - 1, Res);
|
||||
auto SignBit = _Sbfe(OpSize::i64Bit, 1, IR::OpSizeAsBits(SrcSize) - 1, Res);
|
||||
_SubNZCV(OpSize::i64Bit, High, SignBit);
|
||||
|
||||
// If High = SignBit, then sets to nZCv. Else sets to nzcV. Since SF/ZF
|
||||
@ -415,7 +415,7 @@ void OpDispatchBuilder::CalculateFlags_UMUL(Ref High) {
|
||||
InvalidatePF_AF();
|
||||
|
||||
auto Zero = _InlineConstant(0);
|
||||
OpSize Size = IR::SizeToOpSize(GetOpSize(High));
|
||||
const auto Size = GetOpSize(High);
|
||||
|
||||
// CF and OF are set if the result of the operation can't be fit in to the destination register
|
||||
// The result register will be all zero if it can't fit due to how multiplication behaves
|
||||
@ -442,7 +442,7 @@ void OpDispatchBuilder::CalculateFlags_ShiftLeftImmediate(IR::OpSize SrcSize, Re
|
||||
return;
|
||||
}
|
||||
|
||||
auto OpSize = SrcSize == 8 ? OpSize::i64Bit : OpSize::i32Bit;
|
||||
auto OpSize = SrcSize == OpSize::i64Bit ? OpSize::i64Bit : OpSize::i32Bit;
|
||||
|
||||
SetNZ_ZeroCV(SrcSize, UnmaskedRes);
|
||||
|
||||
@ -451,7 +451,7 @@ void OpDispatchBuilder::CalculateFlags_ShiftLeftImmediate(IR::OpSize SrcSize, Re
|
||||
// Extract the last bit shifted in to CF. Shift is already masked, but for
|
||||
// 8/16-bit it might be >= SrcSizeBits, in which case CF is cleared. There's
|
||||
// nothing to do in that case since we already cleared CF above.
|
||||
auto SrcSizeBits = SrcSize * 8;
|
||||
const auto SrcSizeBits = IR::OpSizeAsBits(SrcSize);
|
||||
if (Shift < SrcSizeBits) {
|
||||
SetCFDirect(Src1, SrcSizeBits - Shift, true);
|
||||
}
|
||||
@ -464,7 +464,7 @@ void OpDispatchBuilder::CalculateFlags_ShiftLeftImmediate(IR::OpSize SrcSize, Re
|
||||
// In the case of left shift. OF is only set from the result of <Top Source Bit> XOR <Top Result Bit>
|
||||
if (Shift == 1) {
|
||||
auto Xor = _Xor(OpSize, UnmaskedRes, Src1);
|
||||
SetRFLAG<FEXCore::X86State::RFLAG_OF_RAW_LOC>(Xor, SrcSize * 8 - 1, true);
|
||||
SetRFLAG<FEXCore::X86State::RFLAG_OF_RAW_LOC>(Xor, IR::OpSizeAsBits(SrcSize) - 1, true);
|
||||
} else {
|
||||
// Undefined, we choose to zero as part of SetNZ_ZeroCV
|
||||
}
|
||||
@ -515,7 +515,7 @@ void OpDispatchBuilder::CalculateFlags_ShiftRightImmediate(IR::OpSize SrcSize, R
|
||||
// Only defined when Shift is 1 else undefined
|
||||
// Is set to the MSB of the original value
|
||||
if (Shift == 1) {
|
||||
SetRFLAG<FEXCore::X86State::RFLAG_OF_RAW_LOC>(Src1, SrcSize * 8 - 1, true);
|
||||
SetRFLAG<FEXCore::X86State::RFLAG_OF_RAW_LOC>(Src1, IR::OpSizeAsBits(SrcSize) - 1, true);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -526,7 +526,7 @@ void OpDispatchBuilder::CalculateFlags_ShiftRightDoubleImmediate(IR::OpSize SrcS
|
||||
return;
|
||||
}
|
||||
|
||||
const auto OpSize = SrcSize == 8 ? OpSize::i64Bit : OpSize::i32Bit;
|
||||
const auto OpSize = SrcSize == OpSize::i64Bit ? OpSize::i64Bit : OpSize::i32Bit;
|
||||
CalculateFlags_ShiftRightImmediateCommon(SrcSize, Res, Src1, Shift);
|
||||
|
||||
// OF
|
||||
@ -536,7 +536,7 @@ void OpDispatchBuilder::CalculateFlags_ShiftRightDoubleImmediate(IR::OpSize SrcS
|
||||
// XOR of Result and Src1
|
||||
if (Shift == 1) {
|
||||
auto val = _Xor(OpSize, Src1, Res);
|
||||
SetRFLAG<FEXCore::X86State::RFLAG_OF_RAW_LOC>(val, SrcSize * 8 - 1, true);
|
||||
SetRFLAG<FEXCore::X86State::RFLAG_OF_RAW_LOC>(val, IR::OpSizeAsBits(SrcSize) - 1, true);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -549,7 +549,7 @@ void OpDispatchBuilder::CalculateFlags_ZCNT(IR::OpSize SrcSize, Ref Result) {
// Now set CF if the Result = SrcSize * 8. Since SrcSize is a power-of-two and
// Result is <= SrcSize * 8, we equivalently check if the log2(SrcSize * 8)
// bit is set. No masking is needed because no higher bits could be set.
unsigned CarryBit = FEXCore::ilog2(SrcSize * 8u);
unsigned CarryBit = FEXCore::ilog2(IR::OpSizeAsBits(SrcSize));
SetCFDirect(Result, CarryBit);
}
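
The comment's argument is easy to sanity-check: the count never exceeds SrcSize * 8, and because that bound is a power of two, its single set bit (bit log2(SrcSize * 8)) appears in the result exactly when the result equals the bound. A small exhaustive check for 32-bit operands:

#include <cstdint>
#include <cstdio>

int main() {
  // For a 32-bit LZCNT/TZCNT the count is in [0, 32]; CF should be set only for 32.
  for (uint32_t Result = 0; Result <= 32; ++Result) {
    const bool ViaCompare = (Result == 32);
    const bool ViaBit = (Result >> 5) & 1;  // bit log2(32) = 5
    if (ViaCompare != ViaBit) {
      std::puts("mismatch");
      return 1;
    }
  }
  std::puts("bit 5 test matches the == 32 compare for every possible count");
  return 0;
}
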
@ -418,7 +418,7 @@ void OpDispatchBuilder::InsertMMX_To_XMM_Vector_CVT_Int_To_Float(OpcodeArgs) {
|
||||
// Always 32-bit.
|
||||
const auto ElementSize = OpSize::i32Bit;
|
||||
// Always signed
|
||||
Dest = _VSToFVectorInsert(IR::SizeToOpSize(DstSize), ElementSize, ElementSize, Dest, Src, true, false);
|
||||
Dest = _VSToFVectorInsert(DstSize, ElementSize, ElementSize, Dest, Src, true, false);
|
||||
|
||||
StoreResult_WithOpSize(FPRClass, Op, Op->Dest, Dest, DstSize, OpSize::iInvalid);
|
||||
}
|
||||
@ -482,7 +482,7 @@ Ref OpDispatchBuilder::InsertScalar_CVT_Float_To_FloatImpl(OpcodeArgs, IR::OpSiz
|
||||
Ref Src1 = LoadSource_WithOpSize(FPRClass, Op, Src1Op, DstSize, Op->Flags);
|
||||
Ref Src2 = LoadSource_WithOpSize(FPRClass, Op, Src2Op, SrcSize, Op->Flags, {.AllowUpperGarbage = true});
|
||||
|
||||
return _VFToFScalarInsert(IR::SizeToOpSize(DstSize), DstElementSize, SrcElementSize, Src1, Src2, ZeroUpperBits);
|
||||
return _VFToFScalarInsert(DstSize, DstElementSize, SrcElementSize, Src1, Src2, ZeroUpperBits);
|
||||
}
|
||||
|
||||
template<IR::OpSize DstElementSize, IR::OpSize SrcElementSize>
|
||||
@ -530,7 +530,7 @@ Ref OpDispatchBuilder::InsertScalarRoundImpl(OpcodeArgs, IR::OpSize DstSize, IR:
|
||||
Ref Src2 = LoadSource_WithOpSize(FPRClass, Op, Src2Op, SrcSize, Op->Flags, {.AllowUpperGarbage = true});
|
||||
|
||||
const auto SourceMode = TranslateRoundType(Mode);
|
||||
auto ALUOp = _VFToIScalarInsert(IR::SizeToOpSize(DstSize), ElementSize, Src1, Src2, SourceMode, ZeroUpperBits);
|
||||
auto ALUOp = _VFToIScalarInsert(DstSize, ElementSize, Src1, Src2, SourceMode, ZeroUpperBits);
|
||||
|
||||
return ALUOp;
|
||||
}
|
||||
@ -600,7 +600,7 @@ void OpDispatchBuilder::InsertScalarFCMPOp(OpcodeArgs) {
|
||||
Ref Src1 = LoadSource_WithOpSize(FPRClass, Op, Op->Dest, DstSize, Op->Flags);
|
||||
Ref Src2 = LoadSource_WithOpSize(FPRClass, Op, Op->Src[0], SrcSize, Op->Flags, {.AllowUpperGarbage = true});
|
||||
|
||||
Ref Result = InsertScalarFCMPOpImpl(IR::SizeToOpSize(DstSize), OpSizeFromDst(Op), ElementSize, Src1, Src2, CompType, false);
|
||||
Ref Result = InsertScalarFCMPOpImpl(DstSize, OpSizeFromDst(Op), ElementSize, Src1, Src2, CompType, false);
|
||||
StoreResult_WithOpSize(FPRClass, Op, Op->Dest, Result, DstSize, OpSize::iInvalid);
|
||||
}
|
||||
|
||||
@ -619,7 +619,7 @@ void OpDispatchBuilder::AVXInsertScalarFCMPOp(OpcodeArgs) {
|
||||
Ref Src1 = LoadSource_WithOpSize(FPRClass, Op, Op->Src[0], DstSize, Op->Flags);
|
||||
Ref Src2 = LoadSource_WithOpSize(FPRClass, Op, Op->Src[1], SrcSize, Op->Flags, {.AllowUpperGarbage = true});
|
||||
|
||||
Ref Result = InsertScalarFCMPOpImpl(IR::SizeToOpSize(DstSize), OpSizeFromDst(Op), ElementSize, Src1, Src2, CompType, true);
|
||||
Ref Result = InsertScalarFCMPOpImpl(DstSize, OpSizeFromDst(Op), ElementSize, Src1, Src2, CompType, true);
|
||||
StoreResult_WithOpSize(FPRClass, Op, Op->Dest, Result, DstSize, OpSize::iInvalid);
|
||||
}
|
||||
|
||||
@ -741,10 +741,10 @@ void OpDispatchBuilder::MOVMSKOp(OpcodeArgs, IR::OpSize ElementSize) {
|
||||
for (unsigned i = 0; i < NumElements; ++i) {
|
||||
// Extract the top bit of the element
|
||||
Ref Tmp = _VExtractToGPR(Size, ElementSize, Src, i);
|
||||
Tmp = _Bfe(IR::SizeToOpSize(ElementSize), 1, ElementSize * 8 - 1, Tmp);
|
||||
Tmp = _Bfe(ElementSize, 1, IR::OpSizeAsBits(ElementSize) - 1, Tmp);
|
||||
|
||||
// Shift it to the correct location
|
||||
Tmp = _Lshl(IR::SizeToOpSize(ElementSize), Tmp, _Constant(i));
|
||||
Tmp = _Lshl(ElementSize, Tmp, _Constant(i));
|
||||
|
||||
// Or it with the current value
|
||||
CurrentVal = _Or(OpSize::i64Bit, CurrentVal, Tmp);
|
||||
@ -755,7 +755,7 @@ void OpDispatchBuilder::MOVMSKOp(OpcodeArgs, IR::OpSize ElementSize) {
|
||||
|
||||
void OpDispatchBuilder::MOVMSKOpOne(OpcodeArgs) {
|
||||
const auto SrcSize = OpSizeFromSrc(Op);
|
||||
const auto Is256Bit = SrcSize == Core::CPUState::XMM_AVX_REG_SIZE;
|
||||
const auto Is256Bit = SrcSize == OpSize::i256Bit;
|
||||
const auto ExtractSize = Is256Bit ? OpSize::i32Bit : OpSize::i16Bit;
|
||||
|
||||
Ref Src = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
|
||||
@ -767,7 +767,7 @@ void OpDispatchBuilder::MOVMSKOpOne(OpcodeArgs) {
|
||||
// Since we also handle the MM MOVMSKB here too,
|
||||
// we need to clamp the lower bound.
|
||||
const auto VAdd1Size = std::max(SrcSize, OpSize::i128Bit);
|
||||
const auto VAdd2Size = std::max(IR::DivideOpSize(SrcSize, 2), OpSize::i64Bit);
|
||||
const auto VAdd2Size = std::max(SrcSize >> 1, OpSize::i64Bit);
|
||||
|
||||
auto VAdd1 = _VAddP(VAdd1Size, OpSize::i8Bit, VAnd, VAnd);
|
||||
auto VAdd2 = _VAddP(VAdd2Size, OpSize::i8Bit, VAdd1, VAdd1);
|
||||
@ -790,7 +790,7 @@ void OpDispatchBuilder::PUNPCKLOp(OpcodeArgs, IR::OpSize ElementSize) {
|
||||
|
||||
void OpDispatchBuilder::VPUNPCKLOp(OpcodeArgs, IR::OpSize ElementSize) {
|
||||
const auto SrcSize = OpSizeFromSrc(Op);
|
||||
const auto Is128Bit = SrcSize == Core::CPUState::XMM_SSE_REG_SIZE;
|
||||
const auto Is128Bit = SrcSize == OpSize::i128Bit;
|
||||
|
||||
Ref Src1 = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
|
||||
Ref Src2 = LoadSource(FPRClass, Op, Op->Src[1], Op->Flags);
|
||||
@ -819,8 +819,7 @@ void OpDispatchBuilder::PUNPCKHOp(OpcodeArgs, IR::OpSize ElementSize) {
|
||||
|
||||
void OpDispatchBuilder::VPUNPCKHOp(OpcodeArgs, IR::OpSize ElementSize) {
|
||||
const auto SrcSize = OpSizeFromSrc(Op);
|
||||
const auto Is128Bit = SrcSize == Core::CPUState::XMM_SSE_REG_SIZE;
|
||||
|
||||
const auto Is128Bit = SrcSize == OpSize::i128Bit;
|
||||
Ref Src1 = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
|
||||
Ref Src2 = LoadSource(FPRClass, Op, Op->Src[1], Op->Flags);
|
||||
|
||||
@ -852,7 +851,7 @@ Ref OpDispatchBuilder::GeneratePSHUFBMask(IR::OpSize SrcSize) {
|
||||
}
|
||||
|
||||
Ref OpDispatchBuilder::PSHUFBOpImpl(IR::OpSize SrcSize, Ref Src1, Ref Src2, Ref MaskVector) {
|
||||
const auto Is256Bit = SrcSize == Core::CPUState::XMM_AVX_REG_SIZE;
|
||||
const auto Is256Bit = SrcSize == OpSize::i256Bit;
|
||||
|
||||
// We perform the 256-bit version as two 128-bit operations due to
|
||||
// the lane splitting behavior, so cap the maximum size at 16.
|
||||
@ -1173,7 +1172,7 @@ void OpDispatchBuilder::PSHUFDOp(OpcodeArgs) {
|
||||
|
||||
void OpDispatchBuilder::VPSHUFWOp(OpcodeArgs, IR::OpSize ElementSize, bool Low) {
|
||||
const auto SrcSize = OpSizeFromSrc(Op);
|
||||
const auto Is256Bit = SrcSize == Core::CPUState::XMM_AVX_REG_SIZE;
|
||||
const auto Is256Bit = SrcSize == OpSize::i256Bit;
|
||||
auto Shuffle = Op->Src[1].Literal();
|
||||
|
||||
Ref Src = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
|
||||
@ -1195,7 +1194,7 @@ void OpDispatchBuilder::VPSHUFWOp(OpcodeArgs, IR::OpSize ElementSize, bool Low)
|
||||
if (Is256Bit) {
|
||||
for (size_t i = 0; i < 4; i++) {
|
||||
const auto Index = Shuffle & 0b11;
|
||||
const auto UpperLaneOffset = Core::CPUState::XMM_SSE_REG_SIZE / ElementSize;
|
||||
const auto UpperLaneOffset = IR::NumElements(OpSize::i128Bit, ElementSize);
|
||||
|
||||
const auto LowDstIndex = BaseElement + i;
|
||||
const auto LowSrcIndex = BaseElement + Index;
|
||||
@ -1224,10 +1223,10 @@ Ref OpDispatchBuilder::SHUFOpImpl(OpcodeArgs, IR::OpSize DstSize, IR::OpSize Ele
|
||||
// Since 256-bit variants and up don't lane cross, we can construct
|
||||
// everything in terms of the 128-variant, as each lane is essentially
|
||||
// its own 128-bit segment.
|
||||
const uint8_t NumElements = Core::CPUState::XMM_SSE_REG_SIZE / ElementSize;
|
||||
const uint8_t NumElements = IR::NumElements(OpSize::i128Bit, ElementSize);
|
||||
const uint8_t HalfNumElements = NumElements >> 1;
|
||||
|
||||
const bool Is256Bit = DstSize == Core::CPUState::XMM_AVX_REG_SIZE;
|
||||
const bool Is256Bit = DstSize == OpSize::i256Bit;
|
||||
|
||||
std::array<Ref, 4> Srcs {};
|
||||
for (size_t i = 0; i < HalfNumElements; ++i) {
|
||||
@ -1248,7 +1247,7 @@ Ref OpDispatchBuilder::SHUFOpImpl(OpcodeArgs, IR::OpSize DstSize, IR::OpSize Ele
|
||||
// AVX differs the behavior of VSHUFPD and VSHUFPS.
|
||||
// The same immediate bits are used for both lanes with VSHUFPS,
|
||||
// but VSHUFPD uses different immediate bits for each lane.
|
||||
const auto SrcIndex2 = ElementSize == 4 ? SrcIndex1 : ((Shuffle >> 2) & SelectionMask);
|
||||
const auto SrcIndex2 = ElementSize == OpSize::i32Bit ? SrcIndex1 : ((Shuffle >> 2) & SelectionMask);
|
||||
|
||||
Ref Insert = _VInsElement(DstSize, ElementSize, Element, SrcIndex1, Dest, Srcs[Element]);
|
||||
Dest = _VInsElement(DstSize, ElementSize, Element + NumElements, SrcIndex2 + NumElements, Insert, Srcs[Element]);
|
||||
@ -1442,7 +1441,7 @@ void OpDispatchBuilder::VANDNOp(OpcodeArgs) {
|
||||
template<IROps IROp, IR::OpSize ElementSize>
|
||||
void OpDispatchBuilder::VHADDPOp(OpcodeArgs) {
|
||||
const auto SrcSize = OpSizeFromSrc(Op);
|
||||
const auto Is256Bit = SrcSize == Core::CPUState::XMM_AVX_REG_SIZE;
|
||||
const auto Is256Bit = SrcSize == OpSize::i256Bit;
|
||||
|
||||
Ref Src1 = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
|
||||
Ref Src2 = LoadSource(FPRClass, Op, Op->Src[1], Op->Flags);
|
||||
@ -1485,7 +1484,7 @@ void OpDispatchBuilder::VBROADCASTOp(OpcodeArgs, IR::OpSize ElementSize) {
|
||||
Ref OpDispatchBuilder::PINSROpImpl(OpcodeArgs, IR::OpSize ElementSize, const X86Tables::DecodedOperand& Src1Op,
|
||||
const X86Tables::DecodedOperand& Src2Op, const X86Tables::DecodedOperand& Imm) {
|
||||
const auto Size = OpSizeFromDst(Op);
|
||||
const auto NumElements = Size / ElementSize;
|
||||
const auto NumElements = IR::NumElements(Size, ElementSize);
|
||||
const uint64_t Index = Imm.Literal() & (NumElements - 1);
|
||||
Ref Src1 = LoadSource_WithOpSize(FPRClass, Op, Src1Op, Size, Op->Flags);
|
||||
|
||||
@ -1608,7 +1607,7 @@ void OpDispatchBuilder::PExtrOp(OpcodeArgs, IR::OpSize ElementSize) {
|
||||
}
|
||||
|
||||
// AVX version only operates on 128-bit.
|
||||
const uint8_t NumElements = std::min<uint8_t>(GetSrcSize(Op), OpSize::i128Bit) / OverridenElementSize;
|
||||
const uint8_t NumElements = IR::NumElements(std::min(OpSizeFromSrc(Op), OpSize::i128Bit), OverridenElementSize);
|
||||
Index &= NumElements - 1;
|
||||
|
||||
if (Op->Dest.IsGPR()) {
|
||||
@ -1649,8 +1648,8 @@ void OpDispatchBuilder::VEXTRACT128Op(OpcodeArgs) {
|
||||
Ref OpDispatchBuilder::PSIGNImpl(OpcodeArgs, IR::OpSize ElementSize, Ref Src1, Ref Src2) {
|
||||
const auto Size = OpSizeFromSrc(Op);
|
||||
|
||||
Ref Control = _VSQSHL(Size, ElementSize, Src2, (ElementSize * 8) - 1);
|
||||
Control = _VSRSHR(Size, ElementSize, Control, (ElementSize * 8) - 1);
|
||||
Ref Control = _VSQSHL(Size, ElementSize, Src2, IR::OpSizeAsBits(ElementSize) - 1);
|
||||
Control = _VSRSHR(Size, ElementSize, Control, IR::OpSizeAsBits(ElementSize) - 1);
|
||||
return _VMul(Size, ElementSize, Src1, Control);
|
||||
}
|
||||
|
||||
@ -1725,7 +1724,7 @@ void OpDispatchBuilder::PSRLI(OpcodeArgs, IR::OpSize ElementSize) {
|
||||
|
||||
void OpDispatchBuilder::VPSRLIOp(OpcodeArgs, IR::OpSize ElementSize) {
|
||||
const auto Size = OpSizeFromSrc(Op);
|
||||
const auto Is128Bit = Size == Core::CPUState::XMM_SSE_REG_SIZE;
|
||||
const auto Is128Bit = Size == OpSize::i128Bit;
|
||||
const uint64_t ShiftConstant = Op->Src[1].Literal();
|
||||
|
||||
Ref Src = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
|
||||
@ -1848,7 +1847,7 @@ void OpDispatchBuilder::PSRLDQ(OpcodeArgs) {
|
||||
Ref Dest = LoadSource(FPRClass, Op, Op->Dest, Op->Flags);
|
||||
Ref Result = LoadZeroVector(Size);
|
||||
|
||||
if (Shift < Size) {
|
||||
if (Shift < IR::OpSizeToSize(Size)) {
|
||||
Result = _VExtr(Size, OpSize::i8Bit, Result, Dest, Shift);
|
||||
}
|
||||
StoreResult(FPRClass, Op, Result, OpSize::iInvalid);
|
||||
@ -1856,7 +1855,7 @@ void OpDispatchBuilder::PSRLDQ(OpcodeArgs) {
|
||||
|
||||
void OpDispatchBuilder::VPSRLDQOp(OpcodeArgs) {
|
||||
const auto DstSize = OpSizeFromDst(Op);
|
||||
const auto Is128Bit = DstSize == Core::CPUState::XMM_SSE_REG_SIZE;
|
||||
const auto Is128Bit = DstSize == OpSize::i128Bit;
|
||||
const uint64_t Shift = Op->Src[1].Literal();
|
||||
|
||||
Ref Src = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
|
||||
@ -1872,7 +1871,7 @@ void OpDispatchBuilder::VPSRLDQOp(OpcodeArgs) {
|
||||
Result = LoadZeroVector(DstSize);
|
||||
|
||||
if (Is128Bit) {
|
||||
if (Shift < DstSize) {
|
||||
if (Shift < IR::OpSizeToSize(DstSize)) {
|
||||
Result = _VExtr(DstSize, OpSize::i8Bit, Result, Src, Shift);
|
||||
}
|
||||
} else {
|
||||
@ -1899,8 +1898,8 @@ void OpDispatchBuilder::PSLLDQ(OpcodeArgs) {
|
||||
|
||||
Ref Dest = LoadSource(FPRClass, Op, Op->Dest, Op->Flags);
|
||||
Ref Result = LoadZeroVector(Size);
|
||||
if (Shift < Size) {
|
||||
Result = _VExtr(Size, OpSize::i8Bit, Dest, Result, Size - Shift);
|
||||
if (Shift < IR::OpSizeToSize(Size)) {
|
||||
Result = _VExtr(Size, OpSize::i8Bit, Dest, Result, IR::OpSizeToSize(Size) - Shift);
|
||||
}
|
||||
|
||||
StoreResult(FPRClass, Op, Result, OpSize::iInvalid);
|
||||
@ -1908,7 +1907,8 @@ void OpDispatchBuilder::PSLLDQ(OpcodeArgs) {
|
||||
|
||||
void OpDispatchBuilder::VPSLLDQOp(OpcodeArgs) {
|
||||
const auto DstSize = OpSizeFromDst(Op);
|
||||
const auto Is128Bit = DstSize == Core::CPUState::XMM_SSE_REG_SIZE;
|
||||
const auto DstSizeInt = IR::OpSizeToSize(DstSize);
|
||||
const auto Is128Bit = DstSize == OpSize::i128Bit;
|
||||
const uint64_t Shift = Op->Src[1].Literal();
|
||||
|
||||
Ref Src = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
|
||||
@ -1922,13 +1922,13 @@ void OpDispatchBuilder::VPSLLDQOp(OpcodeArgs) {
|
||||
} else {
|
||||
Result = LoadZeroVector(DstSize);
|
||||
if (Is128Bit) {
|
||||
if (Shift < DstSize) {
|
||||
Result = _VExtr(DstSize, OpSize::i8Bit, Src, Result, DstSize - Shift);
|
||||
if (Shift < DstSizeInt) {
|
||||
Result = _VExtr(DstSize, OpSize::i8Bit, Src, Result, DstSizeInt - Shift);
|
||||
}
|
||||
} else {
|
||||
if (Shift < Core::CPUState::XMM_SSE_REG_SIZE) {
|
||||
Ref ResultBottom = _VExtr(OpSize::i128Bit, OpSize::i8Bit, Src, Result, 16 - Shift);
|
||||
Ref ResultTop = _VExtr(DstSize, OpSize::i8Bit, Src, Result, DstSize - Shift);
|
||||
Ref ResultTop = _VExtr(DstSize, OpSize::i8Bit, Src, Result, DstSizeInt - Shift);
|
||||
|
||||
Result = _VInsElement(DstSize, OpSize::i128Bit, 1, 0, ResultBottom, ResultTop);
|
||||
}
|
||||
@ -1954,7 +1954,7 @@ void OpDispatchBuilder::PSRAIOp(OpcodeArgs, IR::OpSize ElementSize) {
|
||||
void OpDispatchBuilder::VPSRAIOp(OpcodeArgs, IR::OpSize ElementSize) {
|
||||
const uint64_t Shift = Op->Src[1].Literal();
|
||||
const auto Size = OpSizeFromDst(Op);
|
||||
const auto Is128Bit = Size == Core::CPUState::XMM_SSE_REG_SIZE;
|
||||
const auto Is128Bit = Size == OpSize::i128Bit;
|
||||
|
||||
Ref Src = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
|
||||
Ref Result = Src;
|
||||
@ -2008,7 +2008,7 @@ void OpDispatchBuilder::MOVDDUPOp(OpcodeArgs) {
|
||||
void OpDispatchBuilder::VMOVDDUPOp(OpcodeArgs) {
|
||||
const auto SrcSize = OpSizeFromSrc(Op);
|
||||
const auto IsSrcGPR = Op->Src[0].IsGPR();
|
||||
const auto Is256Bit = SrcSize == Core::CPUState::XMM_AVX_REG_SIZE;
|
||||
const auto Is256Bit = SrcSize == OpSize::i256Bit;
|
||||
const auto MemSize = Is256Bit ? OpSize::i256Bit : OpSize::i64Bit;
|
||||
|
||||
Ref Src = IsSrcGPR ? LoadSource_WithOpSize(FPRClass, Op, Op->Src[0], SrcSize, Op->Flags) :
|
||||
@ -2112,7 +2112,7 @@ Ref OpDispatchBuilder::Vector_CVT_Int_To_FloatImpl(OpcodeArgs, IR::OpSize SrcEle
|
||||
auto ElementSize = SrcElementSize;
|
||||
if (Widen) {
|
||||
Src = _VSXTL(Size, ElementSize, Src);
|
||||
ElementSize = IR::MultiplyOpSize(ElementSize, 2);
|
||||
ElementSize = ElementSize << 1;
|
||||
}
|
||||
|
||||
return _Vector_SToF(Size, ElementSize, Src);
|
||||
@ -2143,8 +2143,8 @@ Ref OpDispatchBuilder::Vector_CVT_Float_To_IntImpl(OpcodeArgs, IR::OpSize SrcEle
|
||||
Ref Src = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
|
||||
|
||||
if (Narrow) {
|
||||
Src = _Vector_FToF(DstSize, IR::DivideOpSize(SrcElementSize, 2), Src, SrcElementSize);
|
||||
ElementSize = IR::DivideOpSize(ElementSize, 2);
|
||||
Src = _Vector_FToF(DstSize, SrcElementSize >> 1, Src, SrcElementSize);
|
||||
ElementSize = ElementSize >> 1;
|
||||
}
|
||||
|
||||
if (HostRoundingMode) {
|
||||
@ -2236,17 +2236,17 @@ void OpDispatchBuilder::Vector_CVT_Float_To_Float(OpcodeArgs, IR::OpSize DstElem
|
||||
const auto SrcSize = OpSizeFromSrc(Op);
|
||||
|
||||
const auto IsFloatSrc = SrcElementSize == OpSize::i32Bit;
|
||||
const auto Is128Bit = SrcSize == Core::CPUState::XMM_SSE_REG_SIZE;
|
||||
const auto Is128Bit = SrcSize == OpSize::i128Bit;
|
||||
|
||||
const auto LoadSize = IsFloatSrc && !Op->Src[0].IsGPR() ? IR::SizeToOpSize(IR::OpSizeToSize(SrcSize) / 2) : SrcSize;
|
||||
const auto LoadSize = IsFloatSrc && !Op->Src[0].IsGPR() ? (SrcSize >> 1) : SrcSize;
|
||||
|
||||
Ref Src = LoadSource_WithOpSize(FPRClass, Op, Op->Src[0], LoadSize, Op->Flags);
|
||||
|
||||
Ref Result {};
|
||||
if (DstElementSize > SrcElementSize) {
|
||||
Result = _Vector_FToF(SrcSize, IR::MultiplyOpSize(SrcElementSize, 2), Src, SrcElementSize);
|
||||
Result = _Vector_FToF(SrcSize, SrcElementSize << 1, Src, SrcElementSize);
|
||||
} else {
|
||||
Result = _Vector_FToF(SrcSize, IR::DivideOpSize(SrcElementSize, 2), Src, SrcElementSize);
|
||||
Result = _Vector_FToF(SrcSize, SrcElementSize >> 1, Src, SrcElementSize);
|
||||
}
|
||||
|
||||
if (IsAVX) {
|
||||
@ -2269,7 +2269,7 @@ void OpDispatchBuilder::MMX_To_XMM_Vector_CVT_Int_To_Float(OpcodeArgs) {
|
||||
const auto DstSize = OpSizeFromDst(Op);
|
||||
|
||||
Src = _VSXTL(DstSize, ElementSize, Src);
|
||||
ElementSize = IR::MultiplyOpSize(ElementSize, 2);
|
||||
ElementSize = ElementSize << 1;
|
||||
|
||||
// Always signed
|
||||
Src = _Vector_SToF(DstSize, ElementSize, Src);
|
||||
@ -2294,8 +2294,8 @@ void OpDispatchBuilder::XMM_To_MMX_Vector_CVT_Float_To_Int(OpcodeArgs) {
|
||||
const auto Size = OpSizeFromDst(Op);
|
||||
|
||||
if (Narrow) {
|
||||
Src = _Vector_FToF(Size, IR::DivideOpSize(SrcElementSize, 2), Src, SrcElementSize);
|
||||
ElementSize = IR::DivideOpSize(ElementSize, 2);
|
||||
Src = _Vector_FToF(Size, SrcElementSize >> 1, Src, SrcElementSize);
|
||||
ElementSize = ElementSize >> 1;
|
||||
}
|
||||
|
||||
if constexpr (HostRoundingMode) {
|
||||
@ -2816,7 +2816,7 @@ Ref OpDispatchBuilder::PALIGNROpImpl(OpcodeArgs, const X86Tables::DecodedOperand
|
||||
const auto DstSize = OpSizeFromDst(Op);
|
||||
const auto SanitizedDstSize = std::min(DstSize, OpSize::i128Bit);
|
||||
|
||||
const auto Is256Bit = DstSize == Core::CPUState::XMM_AVX_REG_SIZE;
|
||||
const auto Is256Bit = DstSize == OpSize::i256Bit;
|
||||
const auto Index = Imm.Literal();
|
||||
|
||||
Ref Src2Node = LoadSource(FPRClass, Op, Src2, Op->Flags);
|
||||
@ -2830,7 +2830,7 @@ Ref OpDispatchBuilder::PALIGNROpImpl(OpcodeArgs, const X86Tables::DecodedOperand
|
||||
}
|
||||
Ref Src1Node = LoadSource(FPRClass, Op, Src1, Op->Flags);
|
||||
|
||||
if (Index >= (SanitizedDstSize * 2)) {
|
||||
if (Index >= (IR::OpSizeToSize(SanitizedDstSize) * 2)) {
|
||||
// If the immediate is greater than both vectors combined then it zeroes the vector
|
||||
return LoadZeroVector(DstSize);
|
||||
}
|
||||
@ -2891,7 +2891,7 @@ template void OpDispatchBuilder::PACKUSOp<OpSize::i32Bit>(OpcodeArgs);
|
||||
|
||||
void OpDispatchBuilder::VPACKUSOp(OpcodeArgs, IR::OpSize ElementSize) {
|
||||
const auto DstSize = OpSizeFromDst(Op);
|
||||
const auto Is256Bit = DstSize == Core::CPUState::XMM_AVX_REG_SIZE;
|
||||
const auto Is256Bit = DstSize == OpSize::i256Bit;
|
||||
|
||||
Ref Src1 = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
|
||||
Ref Src2 = LoadSource(FPRClass, Op, Op->Src[1], Op->Flags);
|
||||
@ -2919,7 +2919,7 @@ template void OpDispatchBuilder::PACKSSOp<OpSize::i32Bit>(OpcodeArgs);
|
||||
|
||||
void OpDispatchBuilder::VPACKSSOp(OpcodeArgs, IR::OpSize ElementSize) {
|
||||
const auto DstSize = OpSizeFromDst(Op);
|
||||
const auto Is256Bit = DstSize == Core::CPUState::XMM_AVX_REG_SIZE;
|
||||
const auto Is256Bit = DstSize == OpSize::i256Bit;
|
||||
|
||||
Ref Src1 = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
|
||||
Ref Src2 = LoadSource(FPRClass, Op, Op->Src[1], Op->Flags);
|
||||
@ -2954,7 +2954,7 @@ Ref OpDispatchBuilder::PMULLOpImpl(OpSize Size, IR::OpSize ElementSize, bool Sig
|
||||
|
||||
template<IR::OpSize ElementSize, bool Signed>
|
||||
void OpDispatchBuilder::PMULLOp(OpcodeArgs) {
|
||||
static_assert(ElementSize == sizeof(uint32_t), "Currently only handles 32-bit -> 64-bit");
|
||||
static_assert(ElementSize == OpSize::i32Bit, "Currently only handles 32-bit -> 64-bit");
|
||||
|
||||
Ref Src1 = LoadSource(FPRClass, Op, Op->Dest, Op->Flags);
|
||||
Ref Src2 = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
|
||||
@ -2968,7 +2968,7 @@ template void OpDispatchBuilder::PMULLOp<OpSize::i32Bit, true>(OpcodeArgs);
|
||||
|
||||
template<IR::OpSize ElementSize, bool Signed>
|
||||
void OpDispatchBuilder::VPMULLOp(OpcodeArgs) {
|
||||
static_assert(ElementSize == sizeof(uint32_t), "Currently only handles 32-bit -> 64-bit");
|
||||
static_assert(ElementSize == OpSize::i32Bit, "Currently only handles 32-bit -> 64-bit");
|
||||
|
||||
Ref Src1 = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
|
||||
Ref Src2 = LoadSource(FPRClass, Op, Op->Src[1], Op->Flags);
|
||||
@ -3124,15 +3124,15 @@ void OpDispatchBuilder::PMULHRWOp(OpcodeArgs) {

// Implementation is more efficient for 8byte registers
// Multiplies 4 16bit values in to 4 32bit values
Res = _VSMull(IR::MultiplyOpSize(Size, 2), OpSize::i16Bit, Dest, Src);
Res = _VSMull(Size << 1, OpSize::i16Bit, Dest, Src);

// Load 0x0000_8000 in to each 32-bit element.
Ref VConstant = _VectorImm(OpSize::i128Bit, OpSize::i32Bit, 0x80, 8);

Res = _VAdd(IR::MultiplyOpSize(Size, 2), OpSize::i32Bit, Res, VConstant);
Res = _VAdd(Size << 1, OpSize::i32Bit, Res, VConstant);

// Now shift and narrow to convert 32-bit values to 16bit, storing the top 16bits
Res = _VUShrNI(IR::MultiplyOpSize(Size, 2), OpSize::i32Bit, Res, 16);
Res = _VUShrNI(Size << 1, OpSize::i32Bit, Res, 16);

StoreResult(FPRClass, Op, Res, OpSize::iInvalid);
}
|
||||
@ -3177,7 +3177,7 @@ Ref OpDispatchBuilder::PMADDWDOpImpl(IR::OpSize Size, Ref Src1, Ref Src2) {
|
||||
|
||||
if (Size == OpSize::i64Bit) {
|
||||
// MMX implementation can be slightly more optimal
|
||||
Size = IR::DivideOpSize(Size, 2);
|
||||
Size = Size >> 1;
|
||||
auto MullResult = _VSMull(Size, OpSize::i16Bit, Src1, Src2);
|
||||
return _VAddP(Size, OpSize::i32Bit, MullResult, MullResult);
|
||||
}
|
||||
@ -3211,7 +3211,7 @@ void OpDispatchBuilder::VPMADDWDOp(OpcodeArgs) {
|
||||
|
||||
Ref OpDispatchBuilder::PMADDUBSWOpImpl(IR::OpSize Size, Ref Src1, Ref Src2) {
|
||||
if (Size == OpSize::i64Bit) {
|
||||
const auto MultSize = IR::MultiplyOpSize(Size, 2);
|
||||
const auto MultSize = Size << 1;
|
||||
// 64bit is more efficient
|
||||
|
||||
// Src1 is unsigned
|
||||
@ -3314,11 +3314,11 @@ Ref OpDispatchBuilder::PMULHRSWOpImpl(OpSize Size, Ref Src1, Ref Src2) {
|
||||
Ref Res {};
|
||||
if (Size == OpSize::i64Bit) {
|
||||
// Implementation is more efficient for 8byte registers
|
||||
Res = _VSMull(IR::MultiplyOpSize(Size, 2), OpSize::i16Bit, Src1, Src2);
|
||||
Res = _VSShrI(IR::MultiplyOpSize(Size, 2), OpSize::i32Bit, Res, 14);
|
||||
auto OneVector = _VectorImm(IR::MultiplyOpSize(Size, 2), OpSize::i32Bit, 1);
|
||||
Res = _VAdd(IR::MultiplyOpSize(Size, 2), OpSize::i32Bit, Res, OneVector);
|
||||
return _VUShrNI(IR::MultiplyOpSize(Size, 2), OpSize::i32Bit, Res, 1);
|
||||
Res = _VSMull(Size << 1, OpSize::i16Bit, Src1, Src2);
|
||||
Res = _VSShrI(Size << 1, OpSize::i32Bit, Res, 14);
|
||||
auto OneVector = _VectorImm(Size << 1, OpSize::i32Bit, 1);
|
||||
Res = _VAdd(Size << 1, OpSize::i32Bit, Res, OneVector);
|
||||
return _VUShrNI(Size << 1, OpSize::i32Bit, Res, 1);
|
||||
} else {
|
||||
// 128-bit and 256-bit are less efficient
|
||||
Ref ResultLow;
|
||||
@ -3375,7 +3375,7 @@ template void OpDispatchBuilder::HSUBP<OpSize::i64Bit>(OpcodeArgs);
|
||||
|
||||
void OpDispatchBuilder::VHSUBPOp(OpcodeArgs, IR::OpSize ElementSize) {
|
||||
const auto DstSize = OpSizeFromDst(Op);
|
||||
const auto Is256Bit = DstSize == Core::CPUState::XMM_AVX_REG_SIZE;
|
||||
const auto Is256Bit = DstSize == OpSize::i256Bit;
|
||||
|
||||
Ref Src1 = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
|
||||
Ref Src2 = LoadSource(FPRClass, Op, Op->Src[1], Op->Flags);
|
||||
@ -3409,7 +3409,7 @@ template void OpDispatchBuilder::PHSUB<OpSize::i32Bit>(OpcodeArgs);
|
||||
|
||||
void OpDispatchBuilder::VPHSUBOp(OpcodeArgs, IR::OpSize ElementSize) {
|
||||
const auto DstSize = OpSizeFromDst(Op);
|
||||
const auto Is256Bit = DstSize == Core::CPUState::XMM_AVX_REG_SIZE;
|
||||
const auto Is256Bit = DstSize == OpSize::i256Bit;
|
||||
|
||||
Ref Src1 = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
|
||||
Ref Src2 = LoadSource(FPRClass, Op, Op->Src[1], Op->Flags);
|
||||
@ -3441,7 +3441,7 @@ void OpDispatchBuilder::PHADDS(OpcodeArgs) {
|
||||
|
||||
void OpDispatchBuilder::VPHADDSWOp(OpcodeArgs) {
|
||||
const auto SrcSize = OpSizeFromSrc(Op);
|
||||
const auto Is256Bit = SrcSize == Core::CPUState::XMM_AVX_REG_SIZE;
|
||||
const auto Is256Bit = SrcSize == OpSize::i256Bit;
|
||||
|
||||
Ref Src1 = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
|
||||
Ref Src2 = LoadSource(FPRClass, Op, Op->Src[1], Op->Flags);
|
||||
@ -3476,7 +3476,7 @@ void OpDispatchBuilder::PHSUBS(OpcodeArgs) {
|
||||
|
||||
void OpDispatchBuilder::VPHSUBSWOp(OpcodeArgs) {
|
||||
const auto DstSize = OpSizeFromDst(Op);
|
||||
const auto Is256Bit = DstSize == Core::CPUState::XMM_AVX_REG_SIZE;
|
||||
const auto Is256Bit = DstSize == OpSize::i256Bit;
|
||||
|
||||
Ref Src1 = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
|
||||
Ref Src2 = LoadSource(FPRClass, Op, Op->Src[1], Op->Flags);
|
||||
@ -3497,13 +3497,13 @@ Ref OpDispatchBuilder::PSADBWOpImpl(IR::OpSize Size, Ref Src1, Ref Src2) {
|
||||
// but it actually operates in more than 8bit space
|
||||
// This can be seen with `abs(0 - 0xFF)` returning a different result depending
|
||||
// on bit length
|
||||
const auto Is128Bit = Size == Core::CPUState::XMM_SSE_REG_SIZE;
|
||||
const auto Is128Bit = Size == OpSize::i128Bit;
|
||||
|
||||
if (Size == OpSize::i64Bit) {
|
||||
auto AbsResult = _VUABDL(IR::MultiplyOpSize(Size, 2), OpSize::i8Bit, Src1, Src2);
|
||||
auto AbsResult = _VUABDL(Size << 1, OpSize::i8Bit, Src1, Src2);
|
||||
|
||||
// Now vector-wide add the results for each
|
||||
return _VAddV(IR::MultiplyOpSize(Size, 2), OpSize::i16Bit, AbsResult);
|
||||
return _VAddV(Size << 1, OpSize::i16Bit, AbsResult);
|
||||
}
|
||||
|
||||
auto AbsResult_Low = _VUABDL(Size, OpSize::i8Bit, Src1, Src2);
|
||||
@ -3558,7 +3558,7 @@ Ref OpDispatchBuilder::ExtendVectorElementsImpl(OpcodeArgs, IR::OpSize ElementSi
|
||||
return LoadSource_WithOpSize(FPRClass, Op, Op->Src[0], DstSize, Op->Flags);
|
||||
} else {
|
||||
// For memory operands the 256-bit variant loads twice the size specified in the table.
|
||||
const auto Is256Bit = DstSize == Core::CPUState::XMM_AVX_REG_SIZE;
|
||||
const auto Is256Bit = DstSize == OpSize::i256Bit;
|
||||
const auto SrcSize = OpSizeFromSrc(Op);
|
||||
const auto LoadSize = Is256Bit ? IR::SizeToOpSize(IR::OpSizeToSize(SrcSize) * 2) : SrcSize;
|
||||
|
||||
@ -3569,8 +3569,7 @@ Ref OpDispatchBuilder::ExtendVectorElementsImpl(OpcodeArgs, IR::OpSize ElementSi
|
||||
Ref Src = GetSrc();
|
||||
Ref Result {Src};
|
||||
|
||||
for (auto CurrentElementSize = ElementSize; CurrentElementSize != DstElementSize;
|
||||
CurrentElementSize = IR::MultiplyOpSize(CurrentElementSize, 2)) {
|
||||
for (auto CurrentElementSize = ElementSize; CurrentElementSize != DstElementSize; CurrentElementSize = CurrentElementSize << 1) {
|
||||
if (Signed) {
|
||||
Result = _VSXTL(DstSize, CurrentElementSize, Result);
|
||||
} else {
|
||||
@ -3901,7 +3900,7 @@ void OpDispatchBuilder::VectorVariableBlend(OpcodeArgs, IR::OpSize ElementSize)
|
||||
//
|
||||
// To emulate this on AArch64
|
||||
// Arithmetic shift right by the element size, then use BSL to select the registers
|
||||
Mask = _VSShrI(Size, ElementSize, Mask, (ElementSize * 8) - 1);
|
||||
Mask = _VSShrI(Size, ElementSize, Mask, IR::OpSizeAsBits(ElementSize) - 1);
|
||||
|
||||
auto Result = _VBSL(Size, Mask, Src, Dest);
|
||||
|
||||
@ -3910,7 +3909,7 @@ void OpDispatchBuilder::VectorVariableBlend(OpcodeArgs, IR::OpSize ElementSize)
|
||||
|
||||
void OpDispatchBuilder::AVXVectorVariableBlend(OpcodeArgs, IR::OpSize ElementSize) {
|
||||
const auto SrcSize = OpSizeFromSrc(Op);
|
||||
const auto ElementSizeBits = ElementSize * 8;
|
||||
const auto ElementSizeBits = IR::OpSizeAsBits(ElementSize);
|
||||
|
||||
Ref Src1 = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
|
||||
Ref Src2 = LoadSource(FPRClass, Op, Op->Src[1], Op->Flags);
|
||||
@ -3947,7 +3946,7 @@ void OpDispatchBuilder::PTestOpImpl(OpSize Size, Ref Dest, Ref Src) {
|
||||
// Set ZF according to Test1. SF will be zeroed since we do a 32-bit test on
|
||||
// the results of a 16-bit value from the UMaxV, so the 32-bit sign bit is
|
||||
// cleared even if the 16-bit scalars were negative.
|
||||
SetNZ_ZeroCV(32, Test1);
|
||||
SetNZ_ZeroCV(OpSize::i32Bit, Test1);
|
||||
SetCFInverted(Test2);
|
||||
ZeroPF_AF();
|
||||
}
|
||||
@ -3962,7 +3961,7 @@ void OpDispatchBuilder::PTestOp(OpcodeArgs) {
|
||||
void OpDispatchBuilder::VTESTOpImpl(OpSize SrcSize, IR::OpSize ElementSize, Ref Src1, Ref Src2) {
|
||||
InvalidateDeferredFlags();
|
||||
|
||||
const auto ElementSizeInBits = ElementSize * 8;
|
||||
const auto ElementSizeInBits = IR::OpSizeAsBits(ElementSize);
|
||||
const auto MaskConstant = uint64_t {1} << (ElementSizeInBits - 1);
|
||||
|
||||
Ref Mask = _VDupFromGPR(SrcSize, ElementSize, _Constant(MaskConstant));
|
||||
@ -3985,7 +3984,7 @@ void OpDispatchBuilder::VTESTOpImpl(OpSize SrcSize, IR::OpSize ElementSize, Ref
|
||||
Ref CFInv = _Select(IR::COND_NEQ, AndNotGPR, ZeroConst, OneConst, ZeroConst);
|
||||
|
||||
// As in PTest, this sets Z appropriately while zeroing the rest of NZCV.
|
||||
SetNZ_ZeroCV(32, AndGPR);
|
||||
SetNZ_ZeroCV(OpSize::i32Bit, AndGPR);
|
||||
SetCFInverted(CFInv);
|
||||
ZeroPF_AF();
|
||||
}
|
||||
@ -4083,7 +4082,7 @@ Ref OpDispatchBuilder::DPPOpImpl(IR::OpSize DstSize, Ref Src1, Ref Src2, uint8_t
|
||||
|
||||
// Now using the destination mask we choose where the result ends up
|
||||
// It can duplicate and zero results
|
||||
if (ElementSize == 8) {
|
||||
if (ElementSize == OpSize::i64Bit) {
|
||||
switch (DstMask) {
|
||||
case 0b01:
|
||||
// Dest[63:0] = Result
|
||||
@ -4105,7 +4104,7 @@ Ref OpDispatchBuilder::DPPOpImpl(IR::OpSize DstSize, Ref Src1, Ref Src2, uint8_t
|
||||
auto BadPath = [&]() {
|
||||
Ref Result = ZeroVec;
|
||||
|
||||
for (size_t i = 0; i < (DstSize / ElementSize); ++i) {
|
||||
for (size_t i = 0; i < IR::NumElements(DstSize, ElementSize); ++i) {
|
||||
const auto Bit = 1U << (i % 4);
|
||||
|
||||
if ((DstMask & Bit) != 0) {
|
||||
@ -4127,13 +4126,13 @@ Ref OpDispatchBuilder::DPPOpImpl(IR::OpSize DstSize, Ref Src1, Ref Src2, uint8_t
|
||||
// Dest[63:32] = Result
|
||||
// Dest[95:64] = Zero
|
||||
// Dest[127:96] = Zero
|
||||
return _VZip(IR::DivideOpSize(DstSize, 2), ElementSize, ZeroVec, Temp);
|
||||
return _VZip(DstSize >> 1, ElementSize, ZeroVec, Temp);
|
||||
case 0b0011:
|
||||
// Dest[31:0] = Result
|
||||
// Dest[63:32] = Result
|
||||
// Dest[95:64] = Zero
|
||||
// Dest[127:96] = Zero
|
||||
return _VDupElement(IR::DivideOpSize(DstSize, 2), ElementSize, Temp, 0);
|
||||
return _VDupElement(DstSize >> 1, ElementSize, Temp, 0);
|
||||
case 0b0100:
|
||||
// Dest[31:0] = Zero
|
||||
// Dest[63:32] = Zero
|
||||
@ -4251,7 +4250,7 @@ Ref OpDispatchBuilder::VDPPSOpImpl(OpcodeArgs, const X86Tables::DecodedOperand&
|
||||
Ref Temp = _VFMul(DstSize, ElementSize, Src1V, Src2V);
|
||||
|
||||
// Now we zero out elements based on src mask
|
||||
for (size_t i = 0; i < (DstSize / ElementSize); ++i) {
|
||||
for (size_t i = 0; i < IR::NumElements(DstSize, ElementSize); ++i) {
|
||||
const auto Bit = 1U << (i % 4);
|
||||
|
||||
if ((SrcMask & Bit) == 0) {
|
||||
@ -4272,7 +4271,7 @@ Ref OpDispatchBuilder::VDPPSOpImpl(OpcodeArgs, const X86Tables::DecodedOperand&
|
||||
// It can duplicate and zero results
|
||||
Ref Result = ZeroVec;
|
||||
|
||||
for (size_t i = 0; i < (DstSize / ElementSize); ++i) {
|
||||
for (size_t i = 0; i < IR::NumElements(DstSize, ElementSize); ++i) {
|
||||
const auto Bit = 1U << (i % 4);
|
||||
|
||||
if ((DstMask & Bit) != 0) {
|
||||
@ -4285,17 +4284,17 @@ Ref OpDispatchBuilder::VDPPSOpImpl(OpcodeArgs, const X86Tables::DecodedOperand&
|
||||
|
||||
template<IR::OpSize ElementSize>
|
||||
void OpDispatchBuilder::VDPPOp(OpcodeArgs) {
|
||||
const auto DstSize = GetDstSize(Op);
|
||||
const auto DstSize = OpSizeFromDst(Op);
|
||||
|
||||
Ref Result {};
|
||||
if (ElementSize == 4 && DstSize == Core::CPUState::XMM_AVX_REG_SIZE) {
|
||||
if (ElementSize == OpSize::i32Bit && DstSize == OpSize::i256Bit) {
|
||||
// 256-bit DPPS isn't handled by the 128-bit solution.
|
||||
Result = VDPPSOpImpl(Op, Op->Src[0], Op->Src[1], Op->Src[2]);
|
||||
} else {
|
||||
Ref Src1 = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
|
||||
Ref Src2 = LoadSource(FPRClass, Op, Op->Src[1], Op->Flags);
|
||||
|
||||
Result = DPPOpImpl(OpSizeFromDst(Op), Src1, Src2, Op->Src[2].Literal(), ElementSize);
|
||||
Result = DPPOpImpl(DstSize, Src1, Src2, Op->Src[2].Literal(), ElementSize);
|
||||
}
|
||||
|
||||
// We don't need to emit a _VMov to clear the upper lane, since DPPOpImpl uses a zero vector
|
||||
@ -4306,7 +4305,7 @@ void OpDispatchBuilder::VDPPOp(OpcodeArgs) {
|
||||
template void OpDispatchBuilder::VDPPOp<OpSize::i32Bit>(OpcodeArgs);
|
||||
template void OpDispatchBuilder::VDPPOp<OpSize::i64Bit>(OpcodeArgs);
|
||||
|
||||
Ref OpDispatchBuilder::MPSADBWOpImpl(size_t SrcSize, Ref Src1, Ref Src2, uint8_t Select) {
|
||||
Ref OpDispatchBuilder::MPSADBWOpImpl(IR::OpSize SrcSize, Ref Src1, Ref Src2, uint8_t Select) {
|
||||
const auto LaneHelper = [&, this](uint32_t Selector_Src1, uint32_t Selector_Src2, Ref Src1, Ref Src2) {
|
||||
// Src2 will grab a 32bit element and duplicate it across the 128bits
|
||||
Ref DupSrc = _VDupElement(OpSize::i128Bit, OpSize::i32Bit, Src2, Selector_Src2);
|
||||
@ -4373,7 +4372,7 @@ Ref OpDispatchBuilder::MPSADBWOpImpl(size_t SrcSize, Ref Src1, Ref Src2, uint8_t
|
||||
return _VAddP(OpSize::i128Bit, OpSize::i16Bit, TmpTranspose1, TmpTranspose2);
|
||||
};
|
||||
|
||||
const auto Is128Bit = SrcSize == Core::CPUState::XMM_SSE_REG_SIZE;
|
||||
const auto Is128Bit = SrcSize == OpSize::i128Bit;
|
||||
|
||||
// Src1 needs to be in byte offset
|
||||
const uint8_t Select_Src1_Low = ((Select & 0b100) >> 2) * 32 / 8;
|
||||
@ -4395,7 +4394,7 @@ Ref OpDispatchBuilder::MPSADBWOpImpl(size_t SrcSize, Ref Src1, Ref Src2, uint8_t
|
||||
|
||||
void OpDispatchBuilder::MPSADBWOp(OpcodeArgs) {
|
||||
const uint8_t Select = Op->Src[1].Literal();
|
||||
const uint8_t SrcSize = GetSrcSize(Op);
|
||||
const auto SrcSize = OpSizeFromSrc(Op);
|
||||
Ref Src1 = LoadSource(FPRClass, Op, Op->Dest, Op->Flags);
|
||||
Ref Src2 = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
|
||||
|
||||
@ -4405,7 +4404,7 @@ void OpDispatchBuilder::MPSADBWOp(OpcodeArgs) {
|
||||
|
||||
void OpDispatchBuilder::VMPSADBWOp(OpcodeArgs) {
|
||||
const uint8_t Select = Op->Src[2].Literal();
|
||||
const uint8_t SrcSize = GetSrcSize(Op);
|
||||
const auto SrcSize = OpSizeFromSrc(Op);
|
||||
Ref Src1 = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
|
||||
Ref Src2 = LoadSource(FPRClass, Op, Op->Src[1], Op->Flags);
|
||||
|
||||
@ -4463,7 +4462,7 @@ void OpDispatchBuilder::VCVTPS2PHOp(OpcodeArgs) {
|
||||
|
||||
// We need to eliminate upper junk if we're storing into a register with
|
||||
// a 256-bit source (VCVTPS2PH's destination for registers is an XMM).
|
||||
if (Op->Src[0].IsGPR() && SrcSize == Core::CPUState::XMM_AVX_REG_SIZE) {
|
||||
if (Op->Src[0].IsGPR() && SrcSize == OpSize::i256Bit) {
|
||||
Result = _VMov(OpSize::i128Bit, Result);
|
||||
}
|
||||
|
||||
@ -4617,7 +4616,7 @@ Ref OpDispatchBuilder::VBLENDOpImpl(IR::OpSize VecSize, IR::OpSize ElementSize,
|
||||
|
||||
void OpDispatchBuilder::VBLENDPDOp(OpcodeArgs) {
|
||||
const auto DstSize = OpSizeFromDst(Op);
|
||||
const auto Is256Bit = DstSize == Core::CPUState::XMM_AVX_REG_SIZE;
|
||||
const auto Is256Bit = DstSize == OpSize::i256Bit;
|
||||
const auto Selector = Op->Src[2].Literal();
|
||||
|
||||
Ref Src1 = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
|
||||
@ -4642,7 +4641,7 @@ void OpDispatchBuilder::VBLENDPDOp(OpcodeArgs) {
|
||||
|
||||
void OpDispatchBuilder::VPBLENDDOp(OpcodeArgs) {
|
||||
const auto DstSize = OpSizeFromDst(Op);
|
||||
const auto Is256Bit = DstSize == Core::CPUState::XMM_AVX_REG_SIZE;
|
||||
const auto Is256Bit = DstSize == OpSize::i256Bit;
|
||||
const auto Selector = Op->Src[2].Literal();
|
||||
|
||||
Ref Src1 = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
|
||||
@ -4686,7 +4685,7 @@ void OpDispatchBuilder::VPBLENDDOp(OpcodeArgs) {
|
||||
|
||||
void OpDispatchBuilder::VPBLENDWOp(OpcodeArgs) {
|
||||
const auto DstSize = OpSizeFromDst(Op);
|
||||
const auto Is128Bit = DstSize == Core::CPUState::XMM_SSE_REG_SIZE;
|
||||
const auto Is128Bit = DstSize == OpSize::i128Bit;
|
||||
const auto Selector = Op->Src[2].Literal();
|
||||
|
||||
Ref Src1 = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
|
||||
@ -4718,7 +4717,7 @@ void OpDispatchBuilder::VPBLENDWOp(OpcodeArgs) {
|
||||
|
||||
void OpDispatchBuilder::VZEROOp(OpcodeArgs) {
|
||||
const auto DstSize = OpSizeFromDst(Op);
|
||||
const auto IsVZEROALL = DstSize == Core::CPUState::XMM_AVX_REG_SIZE;
|
||||
const auto IsVZEROALL = DstSize == OpSize::i256Bit;
|
||||
const auto NumRegs = CTX->Config.Is64BitMode ? 16U : 8U;
|
||||
|
||||
if (IsVZEROALL) {
|
||||
@ -4743,7 +4742,7 @@ void OpDispatchBuilder::VZEROOp(OpcodeArgs) {
|
||||
|
||||
void OpDispatchBuilder::VPERMILImmOp(OpcodeArgs, IR::OpSize ElementSize) {
|
||||
const auto DstSize = OpSizeFromDst(Op);
|
||||
const auto Is256Bit = DstSize == Core::CPUState::XMM_AVX_REG_SIZE;
|
||||
const auto Is256Bit = DstSize == OpSize::i256Bit;
|
||||
const auto Selector = Op->Src[1].Literal() & 0xFF;
|
||||
|
||||
Ref Src = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
|
||||
@ -4780,7 +4779,7 @@ Ref OpDispatchBuilder::VPERMILRegOpImpl(OpSize DstSize, IR::OpSize ElementSize,
|
||||
// The only difference here is that we need to add 16 to the upper lane
|
||||
// before doing the final addition to build up the indices for TBL.
|
||||
|
||||
const auto Is256Bit = DstSize == Core::CPUState::XMM_AVX_REG_SIZE;
|
||||
const auto Is256Bit = DstSize == OpSize::i256Bit;
|
||||
auto IsPD = ElementSize == OpSize::i64Bit;
|
||||
|
||||
if (IsPD) {
|
||||
@ -4856,7 +4855,7 @@ void OpDispatchBuilder::PCMPXSTRXOpImpl(OpcodeArgs, bool IsExplicit, bool IsMask
|
||||
// While the control bit immediate for the instruction itself is only ever 8 bits
|
||||
// in size, we use it as a 16-bit value so that we can use the 8th bit to signify
|
||||
// whether or not RAX and RDX should be interpreted as a 64-bit value.
|
||||
const auto SrcSize = GetSrcSize(Op);
|
||||
const auto SrcSize = OpSizeFromSrc(Op);
|
||||
const auto Is64Bit = SrcSize == OpSize::i64Bit;
|
||||
const auto NewControl = uint16_t(Control | (uint16_t(Is64Bit) << 8));
|
||||
|
||||
@ -4935,7 +4934,7 @@ void OpDispatchBuilder::VPCMPISTRMOp(OpcodeArgs) {
|
||||
|
||||
void OpDispatchBuilder::VFMAImpl(OpcodeArgs, IROps IROp, bool Scalar, uint8_t Src1Idx, uint8_t Src2Idx, uint8_t AddendIdx) {
|
||||
const auto Size = OpSizeFromDst(Op);
|
||||
const auto Is256Bit = Size == Core::CPUState::XMM_AVX_REG_SIZE;
|
||||
const auto Is256Bit = Size == OpSize::i256Bit;
|
||||
|
||||
const OpSize ElementSize = Op->Flags & X86Tables::DecodeFlags::FLAG_OPTION_AVX_W ? OpSize::i64Bit : OpSize::i32Bit;
|
||||
|
||||
@ -4964,7 +4963,7 @@ void OpDispatchBuilder::VFMAImpl(OpcodeArgs, IROps IROp, bool Scalar, uint8_t Sr
|
||||
|
||||
void OpDispatchBuilder::VFMAddSubImpl(OpcodeArgs, bool AddSub, uint8_t Src1Idx, uint8_t Src2Idx, uint8_t AddendIdx) {
|
||||
const auto Size = OpSizeFromDst(Op);
|
||||
const auto Is256Bit = Size == Core::CPUState::XMM_AVX_REG_SIZE;
|
||||
const auto Is256Bit = Size == OpSize::i256Bit;
|
||||
|
||||
const OpSize ElementSize = Op->Flags & X86Tables::DecodeFlags::FLAG_OPTION_AVX_W ? OpSize::i64Bit : OpSize::i32Bit;
|
||||
|
||||
@ -5024,20 +5023,20 @@ void OpDispatchBuilder::VPGATHER(OpcodeArgs) {
|
||||
LOGMAN_THROW_A_FMT(AddrElementSize == OpSize::i32Bit || AddrElementSize == OpSize::i64Bit, "Unknown address element size");
|
||||
|
||||
const auto Size = OpSizeFromDst(Op);
|
||||
const auto Is128Bit = Size == Core::CPUState::XMM_SSE_REG_SIZE;
|
||||
const auto Is128Bit = Size == OpSize::i128Bit;
|
||||
|
||||
///< Element size is determined by W flag.
|
||||
const OpSize ElementLoadSize = Op->Flags & X86Tables::DecodeFlags::FLAG_OPTION_AVX_W ? OpSize::i64Bit : OpSize::i32Bit;
|
||||
|
||||
// We only need the high address register if the number of data elements is more than what the low half can consume.
|
||||
// But also the number of address elements is clamped by the destination size as well.
|
||||
const size_t NumDataElements = Size / ElementLoadSize;
|
||||
const size_t NumAddrElementBytes = std::min<size_t>(Size, (NumDataElements * AddrElementSize));
|
||||
const bool Needs128BitHighAddrBytes = NumAddrElementBytes > OpSize::i128Bit;
|
||||
const size_t NumDataElements = IR::NumElements(Size, ElementLoadSize);
|
||||
const size_t NumAddrElementBytes = std::min<size_t>(IR::OpSizeToSize(Size), (NumDataElements * IR::OpSizeToSize(AddrElementSize)));
|
||||
const bool Needs128BitHighAddrBytes = NumAddrElementBytes > IR::OpSizeToSize(OpSize::i128Bit);
|
||||
|
||||
auto VSIB = LoadVSIB(Op, Op->Src[0], Op->Flags);
|
||||
|
||||
const bool SupportsSVELoad = (VSIB.Scale == 1 || VSIB.Scale == AddrElementSize) && (AddrElementSize == ElementLoadSize);
|
||||
const bool SupportsSVELoad = (VSIB.Scale == 1 || VSIB.Scale == IR::OpSizeToSize(AddrElementSize)) && (AddrElementSize == ElementLoadSize);
|
||||
|
||||
Ref Dest = LoadSource(FPRClass, Op, Op->Dest, Op->Flags);
|
||||
Ref Mask = LoadSource(FPRClass, Op, Op->Src[1], Op->Flags);
|
||||
@ -5067,7 +5066,7 @@ void OpDispatchBuilder::VPGATHER(OpcodeArgs) {
|
||||
}
|
||||
}
|
||||
|
||||
auto Result128 = AVX128_VPGatherImpl(SizeToOpSize(Size), ElementLoadSize, AddrElementSize, Dest128, Mask128, VSIB128);
|
||||
auto Result128 = AVX128_VPGatherImpl(Size, ElementLoadSize, AddrElementSize, Dest128, Mask128, VSIB128);
|
||||
// The registers are current split, need to merge them.
|
||||
Result = _VInsElement(OpSize::i256Bit, OpSize::i128Bit, 1, 0, Result128.Low, Result128.High);
|
||||
} else {
|
||||
|
@ -103,7 +103,7 @@ void OpDispatchBuilder::FILD(OpcodeArgs) {
|
||||
|
||||
// Sign extend to 64bits
|
||||
if (ReadWidth != OpSize::i64Bit) {
|
||||
Data = _Sbfe(OpSize::i64Bit, ReadWidth * 8, 0, Data);
|
||||
Data = _Sbfe(OpSize::i64Bit, IR::OpSizeAsBits(ReadWidth), 0, Data);
|
||||
}
|
||||
|
||||
// We're about to clobber flags to grab the sign, so save NZCV.
|
||||
@ -351,33 +351,33 @@ void OpDispatchBuilder::X87FNSTENV(OpcodeArgs) {
|
||||
_StoreMem(GPRClass, Size, Mem, FCW, Size);
|
||||
}
|
||||
|
||||
{ _StoreMem(GPRClass, Size, ReconstructFSW_Helper(), Mem, _Constant(Size * 1), Size, MEM_OFFSET_SXTX, OpSize::i8Bit); }
|
||||
{ _StoreMem(GPRClass, Size, ReconstructFSW_Helper(), Mem, _Constant(IR::OpSizeToSize(Size) * 1), Size, MEM_OFFSET_SXTX, 1); }
|
||||
|
||||
auto ZeroConst = _Constant(0);
|
||||
|
||||
{
|
||||
// FTW
|
||||
_StoreMem(GPRClass, Size, GetX87FTW_Helper(), Mem, _Constant(Size * 2), Size, MEM_OFFSET_SXTX, OpSize::i8Bit);
|
||||
_StoreMem(GPRClass, Size, GetX87FTW_Helper(), Mem, _Constant(IR::OpSizeToSize(Size) * 2), Size, MEM_OFFSET_SXTX, 1);
|
||||
}
|
||||
|
||||
{
|
||||
// Instruction Offset
|
||||
_StoreMem(GPRClass, Size, ZeroConst, Mem, _Constant(Size * 3), Size, MEM_OFFSET_SXTX, OpSize::i8Bit);
|
||||
_StoreMem(GPRClass, Size, ZeroConst, Mem, _Constant(IR::OpSizeToSize(Size) * 3), Size, MEM_OFFSET_SXTX, 1);
|
||||
}
|
||||
|
||||
{
|
||||
// Instruction CS selector (+ Opcode)
|
||||
_StoreMem(GPRClass, Size, ZeroConst, Mem, _Constant(Size * 4), Size, MEM_OFFSET_SXTX, OpSize::i8Bit);
|
||||
_StoreMem(GPRClass, Size, ZeroConst, Mem, _Constant(IR::OpSizeToSize(Size) * 4), Size, MEM_OFFSET_SXTX, 1);
|
||||
}
|
||||
|
||||
{
|
||||
// Data pointer offset
|
||||
_StoreMem(GPRClass, Size, ZeroConst, Mem, _Constant(Size * 5), Size, MEM_OFFSET_SXTX, OpSize::i8Bit);
|
||||
_StoreMem(GPRClass, Size, ZeroConst, Mem, _Constant(IR::OpSizeToSize(Size) * 5), Size, MEM_OFFSET_SXTX, 1);
|
||||
}
|
||||
|
||||
{
|
||||
// Data pointer selector
|
||||
_StoreMem(GPRClass, Size, ZeroConst, Mem, _Constant(Size * 6), Size, MEM_OFFSET_SXTX, OpSize::i8Bit);
|
||||
_StoreMem(GPRClass, Size, ZeroConst, Mem, _Constant(IR::OpSizeToSize(Size) * 6), Size, MEM_OFFSET_SXTX, 1);
|
||||
}
|
||||
}
|
||||
|
||||
@ -407,13 +407,13 @@ void OpDispatchBuilder::X87LDENV(OpcodeArgs) {
|
||||
auto NewFCW = _LoadMem(GPRClass, OpSize::i16Bit, Mem, OpSize::i16Bit);
|
||||
_StoreContext(OpSize::i16Bit, GPRClass, NewFCW, offsetof(FEXCore::Core::CPUState, FCW));
|
||||
|
||||
Ref MemLocation = _Add(OpSize::i64Bit, Mem, _Constant(Size * 1));
|
||||
Ref MemLocation = _Add(OpSize::i64Bit, Mem, _Constant(IR::OpSizeToSize(Size) * 1));
|
||||
auto NewFSW = _LoadMem(GPRClass, Size, MemLocation, Size);
|
||||
ReconstructX87StateFromFSW_Helper(NewFSW);
|
||||
|
||||
{
|
||||
// FTW
|
||||
Ref MemLocation = _Add(OpSize::i64Bit, Mem, _Constant(Size * 2));
|
||||
Ref MemLocation = _Add(OpSize::i64Bit, Mem, _Constant(IR::OpSizeToSize(Size) * 2));
|
||||
SetX87FTW(_LoadMem(GPRClass, Size, MemLocation, Size));
|
||||
}
|
||||
}
|
||||
@ -447,58 +447,58 @@ void OpDispatchBuilder::X87FNSAVE(OpcodeArgs) {
|
||||
_StoreMem(GPRClass, Size, Mem, FCW, Size);
|
||||
}
|
||||
|
||||
{ _StoreMem(GPRClass, Size, ReconstructFSW_Helper(), Mem, _Constant(Size * 1), Size, MEM_OFFSET_SXTX, 1); }
|
||||
{ _StoreMem(GPRClass, Size, ReconstructFSW_Helper(), Mem, _Constant(IR::OpSizeToSize(Size) * 1), Size, MEM_OFFSET_SXTX, 1); }
|
||||
|
||||
auto ZeroConst = _Constant(0);
|
||||
|
||||
{
|
||||
// FTW
|
||||
_StoreMem(GPRClass, Size, GetX87FTW_Helper(), Mem, _Constant(Size * 2), Size, MEM_OFFSET_SXTX, 1);
|
||||
_StoreMem(GPRClass, Size, GetX87FTW_Helper(), Mem, _Constant(IR::OpSizeToSize(Size) * 2), Size, MEM_OFFSET_SXTX, 1);
|
||||
}
|
||||
|
||||
{
|
||||
// Instruction Offset
|
||||
_StoreMem(GPRClass, Size, ZeroConst, Mem, _Constant(Size * 3), Size, MEM_OFFSET_SXTX, 1);
|
||||
_StoreMem(GPRClass, Size, ZeroConst, Mem, _Constant(IR::OpSizeToSize(Size) * 3), Size, MEM_OFFSET_SXTX, 1);
|
||||
}
|
||||
|
||||
{
|
||||
// Instruction CS selector (+ Opcode)
|
||||
_StoreMem(GPRClass, Size, ZeroConst, Mem, _Constant(Size * 4), Size, MEM_OFFSET_SXTX, 1);
|
||||
_StoreMem(GPRClass, Size, ZeroConst, Mem, _Constant(IR::OpSizeToSize(Size) * 4), Size, MEM_OFFSET_SXTX, 1);
|
||||
}
|
||||
|
||||
{
|
||||
// Data pointer offset
|
||||
_StoreMem(GPRClass, Size, ZeroConst, Mem, _Constant(Size * 5), Size, MEM_OFFSET_SXTX, 1);
|
||||
_StoreMem(GPRClass, Size, ZeroConst, Mem, _Constant(IR::OpSizeToSize(Size) * 5), Size, MEM_OFFSET_SXTX, 1);
|
||||
}
|
||||
|
||||
{
|
||||
// Data pointer selector
|
||||
_StoreMem(GPRClass, Size, ZeroConst, Mem, _Constant(Size * 6), Size, MEM_OFFSET_SXTX, 1);
|
||||
_StoreMem(GPRClass, Size, ZeroConst, Mem, _Constant(IR::OpSizeToSize(Size) * 6), Size, MEM_OFFSET_SXTX, 1);
|
||||
}
|
||||
|
||||
auto OneConst = _Constant(1);
|
||||
auto SevenConst = _Constant(7);
|
||||
const auto LoadSize = ReducedPrecisionMode ? OpSize::i64Bit : OpSize::i128Bit;
|
||||
for (int i = 0; i < 7; ++i) {
|
||||
Ref data = _LoadContextIndexed(Top, LoadSize, MMBaseOffset(), OpSize::i128Bit, FPRClass);
|
||||
Ref data = _LoadContextIndexed(Top, LoadSize, MMBaseOffset(), IR::OpSizeToSize(OpSize::i128Bit), FPRClass);
|
||||
if (ReducedPrecisionMode) {
|
||||
data = _F80CVTTo(data, OpSize::i64Bit);
|
||||
}
|
||||
_StoreMem(FPRClass, OpSize::i128Bit, data, Mem, _Constant((Size * 7) + (10 * i)), OpSize::i8Bit, MEM_OFFSET_SXTX, 1);
|
||||
_StoreMem(FPRClass, OpSize::i128Bit, data, Mem, _Constant((IR::OpSizeToSize(Size) * 7) + (10 * i)), OpSize::i8Bit, MEM_OFFSET_SXTX, 1);
|
||||
Top = _And(OpSize::i32Bit, _Add(OpSize::i32Bit, Top, OneConst), SevenConst);
|
||||
}
|
||||
|
||||
// The final st(7) needs a bit of special handling here
|
||||
Ref data = _LoadContextIndexed(Top, LoadSize, MMBaseOffset(), OpSize::i128Bit, FPRClass);
|
||||
Ref data = _LoadContextIndexed(Top, LoadSize, MMBaseOffset(), IR::OpSizeToSize(OpSize::i128Bit), FPRClass);
|
||||
if (ReducedPrecisionMode) {
|
||||
data = _F80CVTTo(data, OpSize::i64Bit);
|
||||
}
|
||||
// ST7 broken in to two parts
|
||||
// Lower 64bits [63:0]
|
||||
// upper 16 bits [79:64]
|
||||
_StoreMem(FPRClass, OpSize::i64Bit, data, Mem, _Constant((Size * 7) + (7 * 10)), OpSize::i8Bit, MEM_OFFSET_SXTX, 1);
|
||||
_StoreMem(FPRClass, OpSize::i64Bit, data, Mem, _Constant((IR::OpSizeToSize(Size) * 7) + (7 * 10)), OpSize::i8Bit, MEM_OFFSET_SXTX, 1);
|
||||
auto topBytes = _VDupElement(OpSize::i128Bit, OpSize::i16Bit, data, 4);
|
||||
_StoreMem(FPRClass, OpSize::i16Bit, topBytes, Mem, _Constant((Size * 7) + (7 * 10) + 8), OpSize::i8Bit, MEM_OFFSET_SXTX, 1);
|
||||
_StoreMem(FPRClass, OpSize::i16Bit, topBytes, Mem, _Constant((IR::OpSizeToSize(Size) * 7) + (7 * 10) + 8), OpSize::i8Bit, MEM_OFFSET_SXTX, 1);
|
||||
|
||||
// reset to default
|
||||
FNINIT(Op);
|
||||
@ -522,11 +522,11 @@ void OpDispatchBuilder::X87FRSTOR(OpcodeArgs) {
|
||||
_SetRoundingMode(roundingMode, false, roundingMode);
|
||||
}
|
||||
|
||||
auto NewFSW = _LoadMem(GPRClass, Size, Mem, _Constant(Size * 1), Size, MEM_OFFSET_SXTX, 1);
|
||||
auto NewFSW = _LoadMem(GPRClass, Size, Mem, _Constant(IR::OpSizeToSize(Size) * 1), Size, MEM_OFFSET_SXTX, 1);
|
||||
Ref Top = ReconstructX87StateFromFSW_Helper(NewFSW);
|
||||
{
|
||||
// FTW
|
||||
SetX87FTW(_LoadMem(GPRClass, Size, Mem, _Constant(Size * 2), Size, MEM_OFFSET_SXTX, 1));
|
||||
SetX87FTW(_LoadMem(GPRClass, Size, Mem, _Constant(IR::OpSizeToSize(Size) * 2), Size, MEM_OFFSET_SXTX, 1));
|
||||
}
|
||||
|
||||
auto OneConst = _Constant(1);
|
||||
@ -538,14 +538,14 @@ void OpDispatchBuilder::X87FRSTOR(OpcodeArgs) {
|
||||
Mask = _VInsGPR(OpSize::i128Bit, OpSize::i64Bit, 1, Mask, high);
|
||||
const auto StoreSize = ReducedPrecisionMode ? OpSize::i64Bit : OpSize::i128Bit;
|
||||
for (int i = 0; i < 7; ++i) {
|
||||
Ref Reg = _LoadMem(FPRClass, OpSize::i128Bit, Mem, _Constant((Size * 7) + (10 * i)), OpSize::i8Bit, MEM_OFFSET_SXTX, 1);
|
||||
Ref Reg = _LoadMem(FPRClass, OpSize::i128Bit, Mem, _Constant((IR::OpSizeToSize(Size) * 7) + (10 * i)), OpSize::i8Bit, MEM_OFFSET_SXTX, 1);
|
||||
// Mask off the top bits
|
||||
Reg = _VAnd(OpSize::i128Bit, OpSize::i128Bit, Reg, Mask);
|
||||
if (ReducedPrecisionMode) {
|
||||
// Convert to double precision
|
||||
Reg = _F80CVT(OpSize::i64Bit, Reg);
|
||||
}
|
||||
_StoreContextIndexed(Reg, Top, StoreSize, MMBaseOffset(), OpSize::i128Bit, FPRClass);
|
||||
_StoreContextIndexed(Reg, Top, StoreSize, MMBaseOffset(), IR::OpSizeToSize(OpSize::i128Bit), FPRClass);
|
||||
|
||||
Top = _And(OpSize::i32Bit, _Add(OpSize::i32Bit, Top, OneConst), SevenConst);
|
||||
}
|
||||
@ -554,13 +554,14 @@ void OpDispatchBuilder::X87FRSTOR(OpcodeArgs) {
|
||||
// ST7 broken in to two parts
|
||||
// Lower 64bits [63:0]
|
||||
// upper 16 bits [79:64]
|
||||
Ref Reg = _LoadMem(FPRClass, OpSize::i64Bit, Mem, _Constant((Size * 7) + (10 * 7)), OpSize::i8Bit, MEM_OFFSET_SXTX, 1);
|
||||
Ref RegHigh = _LoadMem(FPRClass, OpSize::i16Bit, Mem, _Constant((Size * 7) + (10 * 7) + 8), OpSize::i8Bit, MEM_OFFSET_SXTX, 1);
|
||||
Ref Reg = _LoadMem(FPRClass, OpSize::i64Bit, Mem, _Constant((IR::OpSizeToSize(Size) * 7) + (10 * 7)), OpSize::i8Bit, MEM_OFFSET_SXTX, 1);
|
||||
Ref RegHigh =
|
||||
_LoadMem(FPRClass, OpSize::i16Bit, Mem, _Constant((IR::OpSizeToSize(Size) * 7) + (10 * 7) + 8), OpSize::i8Bit, MEM_OFFSET_SXTX, 1);
|
||||
Reg = _VInsElement(OpSize::i128Bit, OpSize::i16Bit, 4, 0, Reg, RegHigh);
|
||||
if (ReducedPrecisionMode) {
|
||||
Reg = _F80CVT(OpSize::i64Bit, Reg); // Convert to double precision
|
||||
}
|
||||
_StoreContextIndexed(Reg, Top, StoreSize, MMBaseOffset(), OpSize::i128Bit, FPRClass);
|
||||
_StoreContextIndexed(Reg, Top, StoreSize, MMBaseOffset(), IR::OpSizeToSize(OpSize::i128Bit), FPRClass);
|
||||
}
|
||||
|
||||
// Load / Store Control Word
|
||||
|
@ -36,12 +36,12 @@ void OpDispatchBuilder::X87LDENVF64(OpcodeArgs) {
|
||||
_SetRoundingMode(roundingMode, false, roundingMode);
|
||||
_StoreContext(OpSize::i16Bit, GPRClass, NewFCW, offsetof(FEXCore::Core::CPUState, FCW));
|
||||
|
||||
auto NewFSW = _LoadMem(GPRClass, Size, Mem, _Constant(Size * 1), Size, MEM_OFFSET_SXTX, 1);
|
||||
auto NewFSW = _LoadMem(GPRClass, Size, Mem, _Constant(IR::OpSizeToSize(Size)), Size, MEM_OFFSET_SXTX, 1);
|
||||
ReconstructX87StateFromFSW_Helper(NewFSW);
|
||||
|
||||
{
|
||||
// FTW
|
||||
SetX87FTW(_LoadMem(GPRClass, Size, Mem, _Constant(Size * 2), Size, MEM_OFFSET_SXTX, 1));
|
||||
SetX87FTW(_LoadMem(GPRClass, Size, Mem, _Constant(IR::OpSizeToSize(Size) * 2), Size, MEM_OFFSET_SXTX, 1));
|
||||
}
|
||||
}
|
||||
|
||||
@ -97,7 +97,7 @@ void OpDispatchBuilder::FILDF64(OpcodeArgs) {
|
||||
// Read from memory
|
||||
Ref Data = LoadSource_WithOpSize(GPRClass, Op, Op->Src[0], ReadWidth, Op->Flags);
|
||||
if (ReadWidth == OpSize::i16Bit) {
|
||||
Data = _Sbfe(OpSize::i64Bit, ReadWidth * 8, 0, Data);
|
||||
Data = _Sbfe(OpSize::i64Bit, IR::OpSizeAsBits(ReadWidth), 0, Data);
|
||||
}
|
||||
auto ConvertedData = _Float_FromGPR_S(OpSize::i64Bit, ReadWidth == OpSize::i32Bit ? OpSize::i32Bit : OpSize::i64Bit, Data);
|
||||
_PushStack(ConvertedData, Data, ReadWidth, false);
|
||||
@ -117,9 +117,9 @@ void OpDispatchBuilder::FISTF64(OpcodeArgs, bool Truncate) {
|
||||
|
||||
Ref data = _ReadStackValue(0);
|
||||
if (Truncate) {
|
||||
data = _Float_ToGPR_ZS(Size == 4 ? OpSize::i32Bit : OpSize::i64Bit, OpSize::i64Bit, data);
|
||||
data = _Float_ToGPR_ZS(Size == OpSize::i32Bit ? OpSize::i32Bit : OpSize::i64Bit, OpSize::i64Bit, data);
|
||||
} else {
|
||||
data = _Float_ToGPR_S(Size == 4 ? OpSize::i32Bit : OpSize::i64Bit, OpSize::i64Bit, data);
|
||||
data = _Float_ToGPR_S(Size == OpSize::i32Bit ? OpSize::i32Bit : OpSize::i64Bit, OpSize::i64Bit, data);
|
||||
}
|
||||
StoreResult_WithOpSize(GPRClass, Op, Op->Dest, data, Size, OpSize::i8Bit);
|
||||
|
||||
@ -339,7 +339,7 @@ void OpDispatchBuilder::FCOMIF64(OpcodeArgs, IR::OpSize Width, bool Integer, OpD
|
||||
if (Width == OpSize::i16Bit) {
|
||||
arg = _Sbfe(OpSize::i64Bit, 16, 0, arg);
|
||||
}
|
||||
b = _Float_FromGPR_S(OpSize::i64Bit, Width == 64 ? OpSize::i64Bit : OpSize::i32Bit, arg);
|
||||
b = _Float_FromGPR_S(OpSize::i64Bit, Width == OpSize::i64Bit ? OpSize::i64Bit : OpSize::i32Bit, arg);
|
||||
} else if (Width == OpSize::i32Bit) {
|
||||
arg = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
|
||||
b = _Float_FToF(OpSize::i64Bit, OpSize::i32Bit, arg);
|
||||
|
@ -548,7 +548,7 @@ protected:

// This must directly match bytes to the named opsize.
// Implicit sized IR operations does math to get between sizes.
enum OpSize : uint8_t {
enum class OpSize : uint8_t {
iUnsized = 0,
i8Bit = 1,
i16Bit = 2,
@ -615,14 +615,18 @@ static inline uint16_t OpSizeAsBits(IR::OpSize Size) {
return IR::OpSizeToSize(Size) * 8u;
}

static inline OpSize MultiplyOpSize(IR::OpSize Size, uint8_t Multiplier) {
template<typename T>
requires (std::is_integral_v<T>)
static inline OpSize operator<<(IR::OpSize Size, T Shift) {
LOGMAN_THROW_A_FMT(Size != IR::OpSize::iInvalid, "Invalid Size");
return IR::SizeToOpSize(IR::OpSizeToSize(Size) * Multiplier);
return IR::SizeToOpSize(IR::OpSizeToSize(Size) << Shift);
}

static inline OpSize DivideOpSize(IR::OpSize Size, uint8_t Divisor) {
template<typename T>
requires (std::is_integral_v<T>)
static inline OpSize operator>>(IR::OpSize Size, T Shift) {
LOGMAN_THROW_A_FMT(Size != IR::OpSize::iInvalid, "Invalid Size");
return IR::SizeToOpSize(IR::OpSizeToSize(Size) / Divisor);
return IR::SizeToOpSize(IR::OpSizeToSize(Size) >> Shift);
}

static inline OpSize operator/(IR::OpSize Size, IR::OpSize Divisor) {
@ -630,7 +634,9 @@ static inline OpSize operator/(IR::OpSize Size, IR::OpSize Divisor) {
return IR::SizeToOpSize(IR::OpSizeToSize(Size) / IR::OpSizeToSize(Divisor));
}

static inline OpSize operator/(IR::OpSize Size, uint8_t Divisor) {
template<typename T>
requires (std::is_integral_v<T>)
static inline OpSize operator/(IR::OpSize Size, T Divisor) {
LOGMAN_THROW_A_FMT(Size != IR::OpSize::iInvalid, "Invalid Size");
return IR::SizeToOpSize(IR::OpSizeToSize(Size) / Divisor);
}
||||
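A minimal usage sketch of the new OpSize shift operators (illustrative only, not part of the commit; it assumes the OpSizeToSize/SizeToOpSize helpers shown above round-trip the enum through its byte count):

void OpSizeShiftExample() {
  const auto Doubled = FEXCore::IR::OpSize::i32Bit << 1;  // 4 bytes * 2 -> OpSize::i64Bit
  const auto Halved = FEXCore::IR::OpSize::i128Bit >> 1;  // 16 bytes / 2 -> OpSize::i64Bit
  const auto Bits = FEXCore::IR::OpSizeAsBits(Doubled);   // 8 bytes -> 64 bits
  (void)Halved;
  (void)Bits;
}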
|
@ -736,7 +736,7 @@
|
||||
"HasSideEffects": true,
|
||||
"DestSize": "RegisterSize",
|
||||
"EmitValidation": [
|
||||
"Offset % RegisterSize == 0",
|
||||
"Offset % IR::OpSizeToSize(RegisterSize) == 0",
|
||||
"RegisterSize == FEXCore::IR::OpSize::i128Bit || RegisterSize == FEXCore::IR::OpSize::i256Bit"
|
||||
]
|
||||
},
|
||||
@ -748,7 +748,7 @@
|
||||
"HasSideEffects": true,
|
||||
"DestSize": "RegisterSize",
|
||||
"EmitValidation": [
|
||||
"Offset % RegisterSize == 0",
|
||||
"Offset % IR::OpSizeToSize(RegisterSize) == 0",
|
||||
"RegisterSize == FEXCore::IR::OpSize::i128Bit"
|
||||
]
|
||||
},
|
||||
@ -760,7 +760,7 @@
|
||||
"HasSideEffects": true,
|
||||
"DestSize": "RegisterSize",
|
||||
"EmitValidation": [
|
||||
"Offset % RegisterSize == 0",
|
||||
"Offset % IR::OpSizeToSize(RegisterSize) == 0",
|
||||
"RegisterSize == FEXCore::IR::OpSize::i128Bit || RegisterSize == FEXCore::IR::OpSize::i256Bit"
|
||||
]
|
||||
}
|
||||
@ -2017,7 +2017,7 @@
|
||||
"TiedSource": 0,
|
||||
"Desc": "Unsigned shifts right each element and then narrows to the next lower element size",
|
||||
"DestSize": "RegisterSize",
|
||||
"NumElements": "RegisterSize / (IR::DivideOpSize(ElementSize, 2))"
|
||||
"NumElements": "RegisterSize / (ElementSize >> 1)"
|
||||
},
|
||||
|
||||
"FPR = VUShrNI2 OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$VectorLower, FPR:$VectorUpper, u8:$BitShift": {
|
||||
@ -2026,73 +2026,73 @@
|
||||
"Inserts results in to the high elements of the first argument"
|
||||
],
|
||||
"DestSize": "RegisterSize",
|
||||
"NumElements": "RegisterSize / (IR::DivideOpSize(ElementSize, 2))"
|
||||
"NumElements": "RegisterSize / (ElementSize >> 1)"
|
||||
},
|
||||
"FPR = VSXTL OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector": {
|
||||
"Desc": "Sign extends elements from the source element size to the next size up",
|
||||
"DestSize": "RegisterSize",
|
||||
"NumElements": "RegisterSize / (IR::MultiplyOpSize(ElementSize, 2))"
|
||||
"NumElements": "RegisterSize / (ElementSize << 1)"
|
||||
},
|
||||
"FPR = VSXTL2 OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector": {
|
||||
"Desc": ["Sign extends elements from the source element size to the next size up",
|
||||
"Source elements come from the upper half of the register"
|
||||
],
|
||||
"DestSize": "RegisterSize",
|
||||
"NumElements": "RegisterSize / (IR::MultiplyOpSize(ElementSize, 2))"
|
||||
"NumElements": "RegisterSize / (ElementSize << 1)"
|
||||
},
|
||||
"FPR = VSSHLL OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector, u8:$BitShift{0}": {
|
||||
"Desc": "Sign extends elements from the source element size to the next size up",
|
||||
"DestSize": "RegisterSize",
|
||||
"NumElements": "RegisterSize / (IR::MultiplyOpSize(ElementSize, 2))"
|
||||
"NumElements": "RegisterSize / (ElementSize << 1)"
|
||||
},
|
||||
"FPR = VSSHLL2 OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector, u8:$BitShift{0}": {
|
||||
"Desc": ["Sign extends elements from the source element size to the next size up",
|
||||
"Source elements come from the upper half of the register"
|
||||
],
|
||||
"DestSize": "RegisterSize",
|
||||
"NumElements": "RegisterSize / (IR::MultiplyOpSize(ElementSize, 2))"
|
||||
"NumElements": "RegisterSize / (ElementSize << 1)"
|
||||
},
|
||||
"FPR = VUXTL OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector": {
|
||||
"Desc": "Zero extends elements from the source element size to the next size up",
|
||||
"DestSize": "RegisterSize",
|
||||
"NumElements": "RegisterSize / (IR::MultiplyOpSize(ElementSize, 2))"
|
||||
"NumElements": "RegisterSize / (ElementSize << 1)"
|
||||
},
|
||||
"FPR = VUXTL2 OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector": {
|
||||
"Desc": ["Zero extends elements from the source element size to the next size up",
|
||||
"Source elements come from the upper half of the register"
|
||||
],
|
||||
"DestSize": "RegisterSize",
|
||||
"NumElements": "RegisterSize / (IR::MultiplyOpSize(ElementSize, 2))"
|
||||
"NumElements": "RegisterSize / (ElementSize << 1)"
|
||||
},
|
||||
"FPR = VSQXTN OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector": {
|
||||
"TiedSource": 0,
|
||||
"DestSize": "RegisterSize",
|
||||
"NumElements": "RegisterSize / (IR::DivideOpSize(ElementSize, 2))"
|
||||
"NumElements": "RegisterSize / (ElementSize >> 1)"
|
||||
},
|
||||
"FPR = VSQXTN2 OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$VectorLower, FPR:$VectorUpper": {
|
||||
"TiedSource": 0,
|
||||
"DestSize": "RegisterSize",
|
||||
"NumElements": "RegisterSize / (IR::DivideOpSize(ElementSize, 2))"
|
||||
"NumElements": "RegisterSize / (ElementSize >> 1)"
|
||||
},
|
||||
"FPR = VSQXTNPair OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$VectorLower, FPR:$VectorUpper": {
|
||||
"Desc": ["Does both VSQXTN and VSQXTN2 in a combined operation."
|
||||
],
|
||||
"DestSize": "RegisterSize",
|
||||
"NumElements": "RegisterSize / (IR::DivideOpSize(ElementSize, 2))"
|
||||
"NumElements": "RegisterSize / (ElementSize >> 1)"
|
||||
},
|
||||
"FPR = VSQXTUN OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector": {
|
||||
"DestSize": "RegisterSize",
|
||||
"NumElements": "RegisterSize / (IR::DivideOpSize(ElementSize, 2))"
|
||||
"NumElements": "RegisterSize / (ElementSize >> 1)"
|
||||
},
|
||||
"FPR = VSQXTUN2 OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$VectorLower, FPR:$VectorUpper": {
|
||||
"DestSize": "RegisterSize",
|
||||
"NumElements": "RegisterSize / (IR::DivideOpSize(ElementSize, 2))"
|
||||
"NumElements": "RegisterSize / (ElementSize >> 1)"
|
||||
},
|
||||
"FPR = VSQXTUNPair OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$VectorLower, FPR:$VectorUpper": {
|
||||
"Desc": ["Does both VSQXTUN and VSQXTUN2 in a combined operation."
|
||||
],
|
||||
"DestSize": "RegisterSize",
|
||||
"NumElements": "RegisterSize / (IR::DivideOpSize(ElementSize, 2))"
|
||||
"NumElements": "RegisterSize / (ElementSize >> 1)"
|
||||
},
|
||||
"FPR = VSRSHR OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector, u8:$BitShift": {
|
||||
"Desc": ["Signed rounding shift right by immediate",
|
||||
@ -2271,24 +2271,24 @@
|
||||
},
|
||||
"FPR = VUMull OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector1, FPR:$Vector2": {
|
||||
"DestSize": "RegisterSize",
|
||||
"NumElements": "RegisterSize / (IR::MultiplyOpSize(ElementSize, 2))"
|
||||
"NumElements": "RegisterSize / (ElementSize << 1)"
|
||||
},
|
||||
"FPR = VSMull OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector1, FPR:$Vector2": {
|
||||
"Desc": [ "Does a signed integer multiply with extend.",
|
||||
"ElementSize is the source size"
|
||||
],
|
||||
"DestSize": "RegisterSize",
|
||||
"NumElements": "RegisterSize / (IR::MultiplyOpSize(ElementSize, 2))"
|
||||
"NumElements": "RegisterSize / (ElementSize << 1)"
|
||||
},
|
||||
"FPR = VUMull2 OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector1, FPR:$Vector2": {
|
||||
"Desc": "Multiplies the high elements with size extension",
|
||||
"DestSize": "RegisterSize",
|
||||
"NumElements": "RegisterSize / (IR::MultiplyOpSize(ElementSize, 2))"
|
||||
"NumElements": "RegisterSize / (ElementSize << 1)"
|
||||
},
|
||||
"FPR = VSMull2 OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector1, FPR:$Vector2": {
|
||||
"Desc": "Multiplies the high elements with size extension",
|
||||
"DestSize": "RegisterSize",
|
||||
"NumElements": "RegisterSize / (IR::MultiplyOpSize(ElementSize, 2))"
|
||||
"NumElements": "RegisterSize / (ElementSize << 1)"
|
||||
},
|
||||
"FPR = VUMulH OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector1, FPR:$Vector2": {
|
||||
"Desc": "Wide unsigned multiply returning the high results",
|
||||
@ -2305,14 +2305,14 @@
|
||||
"Desc": ["Unsigned Absolute Difference Long"
|
||||
],
|
||||
"DestSize": "RegisterSize",
|
||||
"NumElements": "RegisterSize / (IR::MultiplyOpSize(ElementSize, 2))"
|
||||
"NumElements": "RegisterSize / (ElementSize << 1)"
|
||||
},
|
||||
"FPR = VUABDL2 OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector1, FPR:$Vector2": {
|
||||
"Desc": ["Unsigned Absolute Difference Long",
|
||||
"Using the high elements of the source vectors"
|
||||
],
|
||||
"DestSize": "RegisterSize",
|
||||
"NumElements": "RegisterSize / (IR::MultiplyOpSize(ElementSize, 2))"
|
||||
"NumElements": "RegisterSize / (ElementSize << 1)"
|
||||
},
|
||||
"FPR = VUShl OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector, FPR:$ShiftVector, i1:$RangeCheck": {
|
||||
"TiedSource": 0,
|
||||
@ -2580,7 +2580,7 @@
|
||||
"Selecting from the high half of the register."
|
||||
],
|
||||
"DestSize": "RegisterSize",
|
||||
"NumElements": "RegisterSize / (IR::MultiplyOpSize(ElementSize, 2))",
|
||||
"NumElements": "RegisterSize / (ElementSize << 1)",
|
||||
"EmitValidation": [
|
||||
"RegisterSize != FEXCore::IR::OpSize::i256Bit && \"What does 256-bit mean in this context?\""
|
||||
]
|
||||
@ -2594,7 +2594,7 @@
|
||||
"F64->F32, F32->F16"
|
||||
],
|
||||
"DestSize": "RegisterSize",
|
||||
"NumElements": "RegisterSize / (IR::DivideOpSize(ElementSize, 2))",
|
||||
"NumElements": "RegisterSize / (ElementSize >> 1)",
|
||||
"EmitValidation": [
|
||||
"RegisterSize != FEXCore::IR::OpSize::i256Bit && \"What does 256-bit mean in this context?\""
|
||||
]
|
||||
|
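As a worked example of the element-count expressions above (illustrative only): with RegisterSize = OpSize::i128Bit (16 bytes) and ElementSize = OpSize::i16Bit (2 bytes), ElementSize << 1 is 4 bytes, so RegisterSize / (ElementSize << 1) gives 16 / 4 = 4 widened elements, matching the old IR::MultiplyOpSize(ElementSize, 2) form.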
@ -112,17 +112,17 @@ static void PrintArg(fextl::stringstream* out, const IRListView* IR, OrderedNode
|
||||
}
|
||||
|
||||
if (GetHasDest(IROp->Op)) {
|
||||
uint32_t ElementSize = IROp->ElementSize;
|
||||
uint32_t NumElements = IROp->Size;
|
||||
if (!IROp->ElementSize) {
|
||||
auto ElementSize = IROp->ElementSize;
|
||||
uint32_t NumElements = 0;
|
||||
if (IROp->ElementSize == OpSize::iUnsized) {
|
||||
ElementSize = IROp->Size;
|
||||
}
|
||||
|
||||
if (ElementSize) {
|
||||
NumElements /= ElementSize;
|
||||
if (ElementSize != OpSize::iUnsized) {
|
||||
NumElements = IR::NumElements(IROp->Size, ElementSize);
|
||||
}
|
||||
|
||||
*out << " i" << std::dec << (ElementSize * 8);
|
||||
*out << " i" << std::dec << IR::OpSizeAsBits(ElementSize);
|
||||
|
||||
if (NumElements > 1) {
|
||||
*out << "v" << std::dec << NumElements;
|
||||
@ -296,11 +296,11 @@ void Dump(fextl::stringstream* out, const IRListView* IR, IR::RegisterAllocation
|
||||
|
||||
auto ElementSize = IROp->ElementSize;
|
||||
uint8_t NumElements = 0;
|
||||
if (!IROp->ElementSize) {
|
||||
if (IROp->ElementSize != OpSize::iUnsized) {
|
||||
ElementSize = IROp->Size;
|
||||
}
|
||||
|
||||
if (ElementSize) {
|
||||
if (ElementSize != OpSize::iUnsized) {
|
||||
NumElements = IR::NumElements(IROp->Size, ElementSize);
|
||||
}
|
||||
|
||||
@ -324,7 +324,7 @@ void Dump(fextl::stringstream* out, const IRListView* IR, IR::RegisterAllocation
|
||||
}
|
||||
}
|
||||
|
||||
*out << " i" << std::dec << (ElementSize * 8);
|
||||
*out << " i" << std::dec << IR::OpSizeAsBits(ElementSize);
|
||||
|
||||
if (NumElements > 1) {
|
||||
*out << "v" << std::dec << NumElements;
|
||||
@ -334,16 +334,16 @@ void Dump(fextl::stringstream* out, const IRListView* IR, IR::RegisterAllocation
|
||||
} else {
|
||||
|
||||
auto ElementSize = IROp->ElementSize;
|
||||
if (!IROp->ElementSize) {
|
||||
if (IROp->ElementSize == OpSize::iUnsized) {
|
||||
ElementSize = IROp->Size;
|
||||
}
|
||||
uint32_t NumElements = 0;
|
||||
if (ElementSize) {
|
||||
if (ElementSize != OpSize::iUnsized) {
|
||||
NumElements = IR::NumElements(IROp->Size, ElementSize);
|
||||
}
|
||||
|
||||
*out << "(%" << std::dec << ID << ' ';
|
||||
*out << 'i' << std::dec << (ElementSize * 8);
|
||||
*out << 'i' << std::dec << IR::OpSizeAsBits(ElementSize);
|
||||
if (NumElements > 1) {
|
||||
*out << 'v' << std::dec << NumElements;
|
||||
}
|
||||
|
@ -71,19 +71,18 @@ public:
|
||||
return _Jump(InvalidNode);
|
||||
}
|
||||
IRPair<IROp_CondJump> _CondJump(Ref ssa0, CondClassType cond = {COND_NEQ}) {
|
||||
return _CondJump(ssa0, _Constant(0), InvalidNode, InvalidNode, cond, IR::SizeToOpSize(GetOpSize(ssa0)));
|
||||
return _CondJump(ssa0, _Constant(0), InvalidNode, InvalidNode, cond, GetOpSize(ssa0));
|
||||
}
|
||||
IRPair<IROp_CondJump> _CondJump(Ref ssa0, Ref ssa1, Ref ssa2, CondClassType cond = {COND_NEQ}) {
|
||||
return _CondJump(ssa0, _Constant(0), ssa1, ssa2, cond, IR::SizeToOpSize(GetOpSize(ssa0)));
|
||||
return _CondJump(ssa0, _Constant(0), ssa1, ssa2, cond, GetOpSize(ssa0));
|
||||
}
|
||||
// TODO: Work to remove this implicit sized Select implementation.
|
||||
IRPair<IROp_Select> _Select(uint8_t Cond, Ref ssa0, Ref ssa1, Ref ssa2, Ref ssa3, uint8_t CompareSize = 0) {
|
||||
if (CompareSize == 0) {
|
||||
CompareSize = std::max<uint8_t>(4, std::max<uint8_t>(GetOpSize(ssa0), GetOpSize(ssa1)));
|
||||
IRPair<IROp_Select> _Select(uint8_t Cond, Ref ssa0, Ref ssa1, Ref ssa2, Ref ssa3, IR::OpSize CompareSize = OpSize::iUnsized) {
|
||||
if (CompareSize == OpSize::iUnsized) {
|
||||
CompareSize = std::max(OpSize::i32Bit, std::max(GetOpSize(ssa0), GetOpSize(ssa1)));
|
||||
}
|
||||
|
||||
return _Select(IR::SizeToOpSize(std::max<uint8_t>(4, std::max<uint8_t>(GetOpSize(ssa2), GetOpSize(ssa3)))),
|
||||
IR::SizeToOpSize(CompareSize), CondClassType {Cond}, ssa0, ssa1, ssa2, ssa3);
|
||||
return _Select(std::max(OpSize::i32Bit, std::max(GetOpSize(ssa2), GetOpSize(ssa3))), CompareSize, CondClassType {Cond}, ssa0, ssa1, ssa2, ssa3);
|
||||
}
|
||||
IRPair<IROp_LoadMem> _LoadMem(FEXCore::IR::RegisterClassType Class, IR::OpSize Size, Ref ssa0, IR::OpSize Align = OpSize::i8Bit) {
|
||||
return _LoadMem(Class, Size, ssa0, Invalid(), Align, MEM_OFFSET_SXTX, 1);
|
||||
|
@ -29,7 +29,7 @@ $end_info$
|
||||
namespace FEXCore::IR {

uint64_t getMask(IROp_Header* Op) {
uint64_t NumBits = Op->Size * 8;
uint64_t NumBits = IR::OpSizeAsBits(Op->Size);
return (~0ULL) >> (64 - NumBits);
}
||||
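A quick worked example of the mask math above (illustrative only): for a 32-bit operation, OpSizeAsBits yields 32, so getMask returns (~0ULL) >> (64 - 32) == 0x0000'0000'FFFF'FFFF; for a 64-bit operation the shift amount is zero and the mask is all ones.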
|
||||
@ -91,7 +91,7 @@ private:
|
||||
// We don't allow 8/16-bit operations to have constants, since no
|
||||
// constant would be in bounds after the JIT's 24/16 shift.
|
||||
auto Filter = [&IROp](uint64_t X) {
|
||||
return ARMEmitter::IsImmAddSub(X) && IROp->Size >= 4;
|
||||
return ARMEmitter::IsImmAddSub(X) && IROp->Size >= OpSize::i32Bit;
|
||||
};
|
||||
|
||||
return InlineIf(IREmit, CurrentIR, CodeNode, IROp, Index, Filter);
|
||||
@ -112,7 +112,7 @@ private:
|
||||
IsSIMM9 &= (SupportsTSOImm9 || !TSO);
|
||||
|
||||
// Extended offsets for regular loadstore only.
|
||||
bool IsExtended = (Imm & (IROp->Size - 1)) == 0 && Imm / IROp->Size <= 4095;
|
||||
bool IsExtended = (Imm & (IR::OpSizeToSize(IROp->Size) - 1)) == 0 && Imm / IR::OpSizeToSize(IROp->Size) <= 4095;
|
||||
IsExtended &= !TSO;
|
||||
|
||||
if (IsSIMM9 || IsExtended) {
|
||||
@ -204,7 +204,7 @@ void ConstProp::ConstantPropagation(IREmitter* IREmit, const IRListView& Current
|
||||
/* IsImmAddSub assumes the constants are sign-extended, take care of that
|
||||
* here so we get the optimization for 32-bit adds too.
|
||||
*/
|
||||
if (Op->Header.Size == 4) {
|
||||
if (Op->Header.Size == OpSize::i32Bit) {
|
||||
Constant1 = (int64_t)(int32_t)Constant1;
|
||||
Constant2 = (int64_t)(int32_t)Constant2;
|
||||
}
|
||||
@ -290,12 +290,12 @@ void ConstProp::ConstantPropagation(IREmitter* IREmit, const IRListView& Current
|
||||
}
|
||||
|
||||
if (!Replaced) {
|
||||
InlineIf(IREmit, CurrentIR, CodeNode, IROp, 1, [&IROp](uint64_t X) { return IsImmLogical(X, IROp->Size * 8); });
|
||||
InlineIf(IREmit, CurrentIR, CodeNode, IROp, 1, [&IROp](uint64_t X) { return IsImmLogical(X, IR::OpSizeAsBits(IROp->Size)); });
|
||||
}
|
||||
break;
|
||||
}
|
||||
case OP_OR: {
|
||||
InlineIf(IREmit, CurrentIR, CodeNode, IROp, 1, [&IROp](uint64_t X) { return IsImmLogical(X, IROp->Size * 8); });
|
||||
InlineIf(IREmit, CurrentIR, CodeNode, IROp, 1, [&IROp](uint64_t X) { return IsImmLogical(X, IR::OpSizeAsBits(IROp->Size)); });
|
||||
break;
|
||||
}
|
||||
case OP_XOR: {
|
||||
@ -325,7 +325,7 @@ void ConstProp::ConstantPropagation(IREmitter* IREmit, const IRListView& Current
|
||||
}
|
||||
|
||||
if (!Replaced) {
|
||||
InlineIf(IREmit, CurrentIR, CodeNode, IROp, 1, [&IROp](uint64_t X) { return IsImmLogical(X, IROp->Size * 8); });
|
||||
InlineIf(IREmit, CurrentIR, CodeNode, IROp, 1, [&IROp](uint64_t X) { return IsImmLogical(X, IR::OpSizeAsBits(IROp->Size)); });
|
||||
}
|
||||
}
|
||||
break;
|
||||
@ -333,7 +333,7 @@ void ConstProp::ConstantPropagation(IREmitter* IREmit, const IRListView& Current
|
||||
case OP_ANDWITHFLAGS:
|
||||
case OP_ANDN:
|
||||
case OP_TESTNZ: {
|
||||
InlineIf(IREmit, CurrentIR, CodeNode, IROp, 1, [&IROp](uint64_t X) { return IsImmLogical(X, IROp->Size * 8); });
|
||||
InlineIf(IREmit, CurrentIR, CodeNode, IROp, 1, [&IROp](uint64_t X) { return IsImmLogical(X, IR::OpSizeAsBits(IROp->Size)); });
|
||||
break;
|
||||
}
|
||||
case OP_NEG: {
|
||||
@ -356,7 +356,7 @@ void ConstProp::ConstantPropagation(IREmitter* IREmit, const IRListView& Current
|
||||
|
||||
if (IREmit->IsValueConstant(IROp->Args[0], &Constant1) && IREmit->IsValueConstant(IROp->Args[1], &Constant2)) {
|
||||
// Shifts mask the shift amount by 63 or 31 depending on operating size;
|
||||
uint64_t ShiftMask = IROp->Size == 8 ? 63 : 31;
|
||||
uint64_t ShiftMask = IROp->Size == OpSize::i64Bit ? 63 : 31;
|
||||
uint64_t NewConstant = (Constant1 << (Constant2 & ShiftMask)) & getMask(IROp);
|
||||
IREmit->ReplaceWithConstant(CodeNode, NewConstant);
|
||||
} else if (IREmit->IsValueConstant(IROp->Args[1], &Constant2) && Constant2 == 0) {
|
||||
@ -384,7 +384,7 @@ void ConstProp::ConstantPropagation(IREmitter* IREmit, const IRListView& Current
|
||||
auto Op = IROp->C<IR::IROp_Bfe>();
|
||||
uint64_t Constant;
|
||||
|
||||
if (IROp->Size <= 8 && IREmit->IsValueConstant(Op->Src, &Constant)) {
|
||||
if (IROp->Size <= OpSize::i64Bit && IREmit->IsValueConstant(Op->Src, &Constant)) {
|
||||
uint64_t SourceMask = Op->Width == 64 ? ~0ULL : ((1ULL << Op->Width) - 1);
|
||||
SourceMask <<= Op->lsb;
|
||||
|
||||
@ -400,7 +400,7 @@ void ConstProp::ConstantPropagation(IREmitter* IREmit, const IRListView& Current
|
||||
if (IREmit->IsValueConstant(Op->Src, &Constant)) {
|
||||
// SBFE of a constant can be converted to a constant.
|
||||
uint64_t SourceMask = Op->Width == 64 ? ~0ULL : ((1ULL << Op->Width) - 1);
|
||||
uint64_t DestSizeInBits = IROp->Size * 8;
|
||||
uint64_t DestSizeInBits = IR::OpSizeAsBits(IROp->Size);
|
||||
uint64_t DestMask = DestSizeInBits == 64 ? ~0ULL : ((1ULL << DestSizeInBits) - 1);
|
||||
SourceMask <<= Op->lsb;
|
||||
|
||||
@ -424,11 +424,11 @@ void ConstProp::ConstantPropagation(IREmitter* IREmit, const IRListView& Current
|
||||
uint64_t NewConstant = SourceMask << Op->lsb;
|
||||
|
||||
if (ConstantSrc & 1) {
|
||||
auto orr = IREmit->_Or(IR::SizeToOpSize(IROp->Size), CurrentIR.GetNode(IROp->Args[0]), IREmit->_Constant(NewConstant));
|
||||
auto orr = IREmit->_Or(IROp->Size, CurrentIR.GetNode(IROp->Args[0]), IREmit->_Constant(NewConstant));
|
||||
IREmit->ReplaceAllUsesWith(CodeNode, orr);
|
||||
} else {
|
||||
// We are wanting to clear the bitfield.
|
||||
auto andn = IREmit->_Andn(IR::SizeToOpSize(IROp->Size), CurrentIR.GetNode(IROp->Args[0]), IREmit->_Constant(NewConstant));
|
||||
auto andn = IREmit->_Andn(IROp->Size, CurrentIR.GetNode(IROp->Args[0]), IREmit->_Constant(NewConstant));
|
||||
IREmit->ReplaceAllUsesWith(CodeNode, andn);
|
||||
}
|
||||
}
|
||||
@ -596,7 +596,7 @@ void ConstProp::ConstantPropagation(IREmitter* IREmit, const IRListView& Current
|
||||
case OP_SELECT: {
|
||||
InlineIf(IREmit, CurrentIR, CodeNode, IROp, 1, ARMEmitter::IsImmAddSub);
|
||||
|
||||
uint64_t AllOnes = IROp->Size == 8 ? 0xffff'ffff'ffff'ffffull : 0xffff'ffffull;
|
||||
uint64_t AllOnes = IROp->Size == OpSize::i64Bit ? 0xffff'ffff'ffff'ffffull : 0xffff'ffffull;
|
||||
|
||||
uint64_t Constant2 {};
|
||||
uint64_t Constant3 {};
|
||||
@ -614,7 +614,7 @@ void ConstProp::ConstantPropagation(IREmitter* IREmit, const IRListView& Current
|
||||
// We always allow source 1 to be zero, but source 0 can only be a
|
||||
// special 1/~0 constant if source 1 is 0.
|
||||
if (InlineIfZero(IREmit, CurrentIR, CodeNode, IROp, 1)) {
|
||||
uint64_t AllOnes = IROp->Size == 8 ? 0xffff'ffff'ffff'ffffull : 0xffff'ffffull;
|
||||
uint64_t AllOnes = IROp->Size == OpSize::i64Bit ? 0xffff'ffff'ffff'ffffull : 0xffff'ffffull;
|
||||
InlineIf(IREmit, CurrentIR, CodeNode, IROp, 0, [&AllOnes](uint64_t X) { return X == 1 || X == AllOnes; });
|
||||
}
|
||||
break;
|
||||
@ -632,7 +632,7 @@ void ConstProp::ConstantPropagation(IREmitter* IREmit, const IRListView& Current
|
||||
auto EO = NewRIP->C<IR::IROp_EntrypointOffset>();
|
||||
IREmit->SetWriteCursor(CurrentIR.GetNode(Op->NewRIP));
|
||||
|
||||
IREmit->ReplaceNodeArgument(CodeNode, 0, IREmit->_InlineEntrypointOffset(IR::SizeToOpSize(EO->Header.Size), EO->Offset));
|
||||
IREmit->ReplaceNodeArgument(CodeNode, 0, IREmit->_InlineEntrypointOffset(EO->Header.Size, EO->Offset));
|
||||
}
|
||||
}
|
||||
break;
|
||||
|

@ -79,12 +79,12 @@ void IRValidation::Run(IREmitter* IREmit) {

for (auto [CodeNode, IROp] : CurrentIR.GetCode(BlockNode)) {
const auto ID = CurrentIR.GetID(CodeNode);
const uint8_t OpSize = IROp->Size;
const auto OpSize = IROp->Size;

if (GetHasDest(IROp->Op)) {
HadError |= OpSize == 0;
HadError |= OpSize == IR::OpSize::iInvalid;
// Does the op have a destination of size 0?
if (OpSize == 0) {
if (OpSize == IR::OpSize::iInvalid) {
Errors << "%" << ID << ": Had destination but with no size" << std::endl;
}
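With OpSize as an enum class, "no size" is spelled IR::OpSize::iInvalid rather than the magic number 0, which keeps the validator's intent explicit and stops accidental integer comparisons from compiling. A simplified sketch of that kind of check; the stream plumbing and function shape are invented for illustration and are not the actual IRValidation code:

#include <cstdint>
#include <iostream>
#include <sstream>

enum class OpSize : uint8_t { iInvalid = 0, i32Bit = 4, i64Bit = 8 }; // illustrative values

// Simplified validation step: an op that defines a destination must carry a
// real size. Only the sentinel idea matches the hunk above.
bool ValidateDestSize(uint32_t ID, bool HasDest, OpSize Size, std::ostream& Errors) {
  if (HasDest && Size == OpSize::iInvalid) {
    Errors << "%" << ID << ": Had destination but with no size" << std::endl;
    return false;
  }
  return true;
}

int main() {
  std::ostringstream Errors;
  if (!ValidateDestSize(5, /*HasDest=*/true, OpSize::iInvalid, Errors)) {
    std::cout << Errors.str();
  }
  return 0;
}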

@ -521,7 +521,7 @@ void DeadFlagCalculationEliminination::FoldBranch(IREmitter* IREmit, IRListView&
// Pattern match a branch fed by a compare. We could also handle bit tests
// here, but tbz/tbnz has a limited offset range which we don't have a way to
// deal with yet. Let's hope that's not a big deal.
if (!(Op->Cond == COND_NEQ || Op->Cond == COND_EQ) || (Prev->Size < 4)) {
if (!(Op->Cond == COND_NEQ || Op->Cond == COND_EQ) || (Prev->Size < OpSize::i32Bit)) {
return;
}
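Comparisons such as Prev->Size < OpSize::i32Bit keep working after the conversion because C++ defines the relational operators between two values of the same scoped enumeration and compares their underlying values; with byte-count enumerators, "less than i32Bit" still reads as "narrower than 32 bits". A tiny sketch, again with assumed enumerator values:

#include <cstdint>

enum class OpSize : uint8_t { i8Bit = 1, i16Bit = 2, i32Bit = 4, i64Bit = 8 }; // illustrative values

// Built-in relational operators on a scoped enum compare underlying values,
// so size ordering survives the move away from raw integers.
static_assert(OpSize::i16Bit < OpSize::i32Bit);
static_assert(!(OpSize::i64Bit < OpSize::i32Bit));

int main() {
  return 0;
}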

@ -534,7 +534,7 @@ void DeadFlagCalculationEliminination::FoldBranch(IREmitter* IREmit, IRListView&
IREmit->ReplaceNodeArgument(CodeNode, 0, CurrentIR.GetNode(Prev->Args[0]));
IREmit->ReplaceNodeArgument(CodeNode, 1, CurrentIR.GetNode(Prev->Args[1]));
Op->FromNZCV = false;
Op->CompareSize = IR::SizeToOpSize(Prev->Size);
Op->CompareSize = Prev->Size;
} else {
return;
}

@ -612,7 +612,7 @@ bool DeadFlagCalculationEliminination::ProcessBlock(IREmitter* IREmit, IRListVie
// this flag is outside of the if, since the TestNZ might result from
// optimizing AndWithFlags, and we need to converge locally in a single
// iteration.
if (IROp->Op == OP_TESTNZ && IROp->Size < 4 && !(FlagsRead & (FLAG_N | FLAG_C))) {
if (IROp->Op == OP_TESTNZ && IROp->Size < OpSize::i32Bit && !(FlagsRead & (FLAG_N | FLAG_C))) {
IROp->Op = OP_TESTZ;
}

@ -582,7 +582,7 @@ void ConstrainedRAPass::Run(IREmitter* IREmit_) {

if (Reg.Class == FPRFixedClass) {
IROp_Header* Header = IR->GetOp<IROp_Header>(Old);
Copy = IREmit->_VMov(IR::SizeToOpSize(Header->Size), Map(Old));
Copy = IREmit->_VMov(Header->Size, Map(Old));
} else {
Copy = IREmit->_Copy(Map(Old));
}

@ -731,7 +731,7 @@ void X87StackOptimization::Run(IREmitter* Emit) {
} else {
auto* SourceNode = CurrentIR.GetNode(Op->X80Src);
auto* OriginalNode = CurrentIR.GetNode(Op->OriginalValue);
StackData.push(StackMemberInfo {SourceNode, OriginalNode, SizeToOpSize(Op->LoadSize), Op->Float});
StackData.push(StackMemberInfo {SourceNode, OriginalNode, Op->LoadSize, Op->Float});
}
break;
}

@ -793,7 +793,7 @@ void X87StackOptimization::Run(IREmitter* Emit) {
// or similar. As long as the source size and dest size are one and the same.
// This will avoid any conversions between source and stack element size and conversion back.
if (!SlowPath && Value->Source && Value->Source->first == Op->StoreSize && Value->InterpretAsFloat) {
IREmit->_StoreMem(Value->InterpretAsFloat ? FPRClass : GPRClass, IR::SizeToOpSize(Op->StoreSize), AddrNode, Value->Source->second);
IREmit->_StoreMem(Value->InterpretAsFloat ? FPRClass : GPRClass, Op->StoreSize, AddrNode, Value->Source->second);
} else {
if (ReducedPrecisionMode) {
switch (Op->StoreSize) {
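The fast path in this hunk stores the original source value directly when its recorded size matches the store size and it is already interpreted as a float, skipping the round-trip through the 80-bit stack representation. A hedged sketch of that decision on a simplified record; the types and fields here are stand-ins, not FEX's StackMemberInfo:

#include <cstdint>
#include <optional>
#include <utility>

enum class OpSize : uint8_t { i32Bit = 4, i64Bit = 8 }; // illustrative values

struct Value; // stands in for an IR node

struct StackEntry {
  // Original (pre-conversion) source and the size it was loaded with, if known.
  std::optional<std::pair<OpSize, Value*>> Source;
  bool InterpretAsFloat = false;
};

// Decide whether a store of StoreSize can bypass the converted stack value and
// write the original source directly, mirroring the fast path in the hunk above.
Value* PickStoreSource(const StackEntry& Entry, OpSize StoreSize, bool SlowPath, Value* ConvertedStackValue) {
  if (!SlowPath && Entry.Source && Entry.Source->first == StoreSize && Entry.InterpretAsFloat) {
    return Entry.Source->second; // no conversion round-trip needed
  }
  return ConvertedStackValue; // fall back to the converted stack element
}

int main() {
  StackEntry E {};
  E.InterpretAsFloat = true;
  E.Source.emplace(OpSize::i64Bit, nullptr);
  return PickStoreSource(E, OpSize::i64Bit, /*SlowPath=*/false, nullptr) == nullptr ? 0 : 1;
}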

@ -826,7 +826,7 @@ void X87StackOptimization::Run(IREmitter* Emit) {
auto DestAddr = IREmit->_Add(OpSize::i64Bit, AddrNode, GetConstant(8));
IREmit->_StoreMem(GPRClass, OpSize::i16Bit, DestAddr, Upper, OpSize::i64Bit);
} else {
IREmit->_StoreMem(FPRClass, IR::SizeToOpSize(Op->StoreSize), AddrNode, StackNode);
IREmit->_StoreMem(FPRClass, Op->StoreSize, AddrNode, StackNode);
}
}
}