Merge pull request #4149 from Sonicadvance1/iropsize_convert_class

IR: Convert OpSize over to enum class
Commit 5ad7fdb2f3 authored by LC on 2024-10-30 23:55:55 -04:00, committed by GitHub.
GPG Key ID: B5690EEEBB952194 (no known key found for this signature in database)
25 changed files with 1207 additions and 1177 deletions

View File

@ -632,12 +632,12 @@ def print_ir_allocator_helpers():
output_file.write("\tIR::OpSize GetOpSize(const OrderedNode *Op) const {\n")
output_file.write("\t\tauto HeaderOp = Op->Header.Value.GetNode(DualListData.DataBegin());\n")
output_file.write("\t\treturn IR::SizeToOpSize(HeaderOp->Size);\n")
output_file.write("\t\treturn HeaderOp->Size;\n")
output_file.write("\t}\n\n")
output_file.write("\tIR::OpSize GetOpElementSize(const OrderedNode *Op) const {\n")
output_file.write("\t\tauto HeaderOp = Op->Header.Value.GetNode(DualListData.DataBegin());\n")
output_file.write("\t\treturn IR::SizeToOpSize(HeaderOp->ElementSize);\n")
output_file.write("\t\treturn HeaderOp->ElementSize;\n")
output_file.write("\t}\n\n")
output_file.write("\tuint8_t GetOpElements(const OrderedNode *Op) const {\n")

View File

@ -79,7 +79,7 @@ void InterpreterOps::FillFallbackIndexPointers(uint64_t* Info) {
}
bool InterpreterOps::GetFallbackHandler(bool SupportsPreserveAllABI, const IR::IROp_Header* IROp, FallbackInfo* Info) {
uint8_t OpSize = IROp->Size;
const auto OpSize = IROp->Size;
switch (IROp->Op) {
case IR::OP_F80CVTTO: {
auto Op = IROp->C<IR::IROp_F80CVTTo>();
@ -99,11 +99,11 @@ bool InterpreterOps::GetFallbackHandler(bool SupportsPreserveAllABI, const IR::I
}
case IR::OP_F80CVT: {
switch (OpSize) {
case 4: {
case IR::OpSize::i32Bit: {
*Info = {FABI_F32_I16_F80, (void*)&FEXCore::CPU::OpHandlers<IR::OP_F80CVT>::handle4, Core::OPINDEX_F80CVT_4, SupportsPreserveAllABI};
return true;
}
case 8: {
case IR::OpSize::i64Bit: {
*Info = {FABI_F64_I16_F80, (void*)&FEXCore::CPU::OpHandlers<IR::OP_F80CVT>::handle8, Core::OPINDEX_F80CVT_8, SupportsPreserveAllABI};
return true;
}
@ -115,7 +115,7 @@ bool InterpreterOps::GetFallbackHandler(bool SupportsPreserveAllABI, const IR::I
auto Op = IROp->C<IR::IROp_F80CVTInt>();
switch (OpSize) {
case 2: {
case IR::OpSize::i16Bit: {
if (Op->Truncate) {
*Info = {FABI_I16_I16_F80, (void*)&FEXCore::CPU::OpHandlers<IR::OP_F80CVTINT>::handle2t, Core::OPINDEX_F80CVTINT_TRUNC2,
SupportsPreserveAllABI};
@ -124,7 +124,7 @@ bool InterpreterOps::GetFallbackHandler(bool SupportsPreserveAllABI, const IR::I
}
return true;
}
case 4: {
case IR::OpSize::i32Bit: {
if (Op->Truncate) {
*Info = {FABI_I32_I16_F80, (void*)&FEXCore::CPU::OpHandlers<IR::OP_F80CVTINT>::handle4t, Core::OPINDEX_F80CVTINT_TRUNC4,
SupportsPreserveAllABI};
@ -133,7 +133,7 @@ bool InterpreterOps::GetFallbackHandler(bool SupportsPreserveAllABI, const IR::I
}
return true;
}
case 8: {
case IR::OpSize::i64Bit: {
if (Op->Truncate) {
*Info = {FABI_I64_I16_F80, (void*)&FEXCore::CPU::OpHandlers<IR::OP_F80CVTINT>::handle8t, Core::OPINDEX_F80CVTINT_TRUNC8,
SupportsPreserveAllABI};

View File

@ -54,8 +54,8 @@ DEF_OP(EntrypointOffset) {
auto Constant = Entry + Op->Offset;
auto Dst = GetReg(Node);
uint64_t Mask = ~0ULL;
uint8_t OpSize = IROp->Size;
if (OpSize == 4) {
const auto OpSize = IROp->Size;
if (OpSize == IR::OpSize::i32Bit) {
Mask = 0xFFFF'FFFFULL;
}
@ -92,10 +92,10 @@ DEF_OP(AddNZCV) {
uint64_t Const;
if (IsInlineConstant(Op->Src2, &Const)) {
LOGMAN_THROW_AA_FMT(IROp->Size >= 4, "Constant not allowed here");
LOGMAN_THROW_AA_FMT(IROp->Size >= IR::OpSize::i32Bit, "Constant not allowed here");
cmn(EmitSize, Src1, Const);
} else if (IROp->Size < 4) {
unsigned Shift = 32 - (8 * IROp->Size);
} else if (IROp->Size < IR::OpSize::i32Bit) {
unsigned Shift = 32 - IR::OpSizeAsBits(IROp->Size);
lsl(ARMEmitter::Size::i32Bit, TMP1, Src1, Shift);
cmn(EmitSize, TMP1, GetReg(Op->Src2.ID()), ARMEmitter::ShiftType::LSL, Shift);
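One detail that keeps the new code readable: C++ defines the built-in relational operators for two operands of the same scoped enum, so comparisons such as IROp->Size >= IR::OpSize::i32Bit above compile without casts, while arithmetic like the old 8 * IROp->Size no longer does, which is exactly what IR::OpSizeAsBits is for. A tiny illustration, reusing the OpSize sketch from earlier (not FEXCore's real headers):

// Relational comparison on a scoped enum is built in; arithmetic is not.
static_assert(IR::OpSize::i16Bit < IR::OpSize::i32Bit);

constexpr unsigned ShiftForSubRegister(IR::OpSize Size) {
  // The sub-32-bit flag fixups above compute 32 - (operand width in bits).
  return 32u - IR::OpSizeAsBits(Size);
}
static_assert(ShiftForSubRegister(IR::OpSize::i8Bit) == 24);
static_assert(ShiftForSubRegister(IR::OpSize::i16Bit) == 16);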
@ -165,7 +165,7 @@ DEF_OP(TestNZ) {
// Shift the sign bit into place, clearing out the garbage in upper bits.
// Adding zero does an effective test, setting NZ according to the result and
// zeroing CV.
if (IROp->Size < 4) {
if (IROp->Size < IR::OpSize::i32Bit) {
// Cheaper to and+cmn than to lsl+lsl+tst, so do the and ourselves if
// needed.
if (Op->Src1 != Op->Src2) {
@ -179,7 +179,7 @@ DEF_OP(TestNZ) {
Src1 = TMP1;
}
unsigned Shift = 32 - (IROp->Size * 8);
unsigned Shift = 32 - IR::OpSizeAsBits(IROp->Size);
cmn(EmitSize, ARMEmitter::Reg::zr, Src1, ARMEmitter::ShiftType::LSL, Shift);
} else {
if (IsInlineConstant(Op->Src2, &Const)) {
@ -193,11 +193,11 @@ DEF_OP(TestNZ) {
DEF_OP(TestZ) {
auto Op = IROp->C<IR::IROp_TestZ>();
LOGMAN_THROW_AA_FMT(IROp->Size < 4, "TestNZ used at higher sizes");
LOGMAN_THROW_AA_FMT(IROp->Size < IR::OpSize::i32Bit, "TestNZ used at higher sizes");
const auto EmitSize = ARMEmitter::Size::i32Bit;
uint64_t Const;
uint64_t Mask = IROp->Size == 8 ? ~0ULL : ((1ull << (IROp->Size * 8)) - 1);
uint64_t Mask = IROp->Size == IR::OpSize::i64Bit ? ~0ULL : ((1ull << IR::OpSizeAsBits(IROp->Size)) - 1);
auto Src1 = GetReg(Op->Src1.ID());
if (IsInlineConstant(Op->Src2, &Const)) {
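The rewritten mask above is a pattern that recurs throughout this conversion: for sub-64-bit operations the mask is all ones in the low OpSizeAsBits(Size) bits, and the 64-bit case has to be special-cased because shifting a 64-bit value by 64 is undefined behaviour in C++. A quick check against the sketch helpers from earlier:

// Mask covering the low bits of an operation, mirroring the Mask expression above.
constexpr uint64_t SizeMask(IR::OpSize Size) {
  return Size == IR::OpSize::i64Bit ? ~0ULL : ((1ull << IR::OpSizeAsBits(Size)) - 1);
}
static_assert(SizeMask(IR::OpSize::i8Bit) == 0xFFull);
static_assert(SizeMask(IR::OpSize::i16Bit) == 0xFFFFull);
static_assert(SizeMask(IR::OpSize::i64Bit) == ~0ULL);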
@ -223,25 +223,25 @@ DEF_OP(SubShift) {
DEF_OP(SubNZCV) {
auto Op = IROp->C<IR::IROp_SubNZCV>();
const uint8_t OpSize = IROp->Size;
const auto OpSize = IROp->Size;
const auto EmitSize = ConvertSize(IROp);
uint64_t Const;
if (IsInlineConstant(Op->Src2, &Const)) {
LOGMAN_THROW_AA_FMT(OpSize >= 4, "Constant not allowed here");
LOGMAN_THROW_AA_FMT(OpSize >= IR::OpSize::i32Bit, "Constant not allowed here");
cmp(EmitSize, GetReg(Op->Src1.ID()), Const);
} else {
unsigned Shift = OpSize < 4 ? (32 - (8 * OpSize)) : 0;
unsigned Shift = OpSize < IR::OpSize::i32Bit ? (32 - IR::OpSizeAsBits(OpSize)) : 0;
ARMEmitter::Register ShiftedSrc1 = GetZeroableReg(Op->Src1);
// Shift to fix flags for <32-bit ops.
// Any shift of zero is still zero so optimize out silly zero shifts.
if (OpSize < 4 && ShiftedSrc1 != ARMEmitter::Reg::zr) {
if (OpSize < IR::OpSize::i32Bit && ShiftedSrc1 != ARMEmitter::Reg::zr) {
lsl(ARMEmitter::Size::i32Bit, TMP1, ShiftedSrc1, Shift);
ShiftedSrc1 = TMP1;
}
if (OpSize < 4) {
if (OpSize < IR::OpSize::i32Bit) {
cmp(EmitSize, ShiftedSrc1, GetReg(Op->Src2.ID()), ARMEmitter::ShiftType::LSL, Shift);
} else {
cmp(EmitSize, ShiftedSrc1, GetReg(Op->Src2.ID()));
@ -286,10 +286,10 @@ DEF_OP(SetSmallNZV) {
auto Op = IROp->C<IR::IROp_SetSmallNZV>();
LOGMAN_THROW_A_FMT(CTX->HostFeatures.SupportsFlagM, "Unsupported flagm op");
const uint8_t OpSize = IROp->Size;
LOGMAN_THROW_AA_FMT(OpSize == 1 || OpSize == 2, "Unsupported {} size: {}", __func__, OpSize);
const auto OpSize = IROp->Size;
LOGMAN_THROW_AA_FMT(OpSize == IR::OpSize::i8Bit || OpSize == IR::OpSize::i16Bit, "Unsupported {} size: {}", __func__, OpSize);
if (OpSize == 1) {
if (OpSize == IR::OpSize::i8Bit) {
setf8(GetReg(Op->Src.ID()).W());
} else {
setf16(GetReg(Op->Src.ID()).W());
@ -401,20 +401,20 @@ DEF_OP(Div) {
// Each source is OpSize in size
// So you can have up to a 128bit divide from x86-64
const uint8_t OpSize = IROp->Size;
const auto OpSize = IROp->Size;
const auto EmitSize = ConvertSize(IROp);
const auto Dst = GetReg(Node);
auto Src1 = GetReg(Op->Src1.ID());
auto Src2 = GetReg(Op->Src2.ID());
if (OpSize == 1) {
if (OpSize == IR::OpSize::i8Bit) {
sxtb(EmitSize, TMP1, Src1);
sxtb(EmitSize, TMP2, Src2);
Src1 = TMP1;
Src2 = TMP2;
} else if (OpSize == 2) {
} else if (OpSize == IR::OpSize::i16Bit) {
sxth(EmitSize, TMP1, Src1);
sxth(EmitSize, TMP2, Src2);
@ -430,20 +430,20 @@ DEF_OP(UDiv) {
// Each source is OpSize in size
// So you can have up to a 128bit divide from x86-64
const uint8_t OpSize = IROp->Size;
const auto OpSize = IROp->Size;
const auto EmitSize = ConvertSize(IROp);
const auto Dst = GetReg(Node);
auto Src1 = GetReg(Op->Src1.ID());
auto Src2 = GetReg(Op->Src2.ID());
if (OpSize == 1) {
if (OpSize == IR::OpSize::i8Bit) {
uxtb(EmitSize, TMP1, Src1);
uxtb(EmitSize, TMP2, Src2);
Src1 = TMP1;
Src2 = TMP2;
} else if (OpSize == 2) {
} else if (OpSize == IR::OpSize::i16Bit) {
uxth(EmitSize, TMP1, Src1);
uxth(EmitSize, TMP2, Src2);
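Background for the uxtb/uxth (and sxtb/sxth in the signed case) above: AArch64's udiv/sdiv only operate on 32- and 64-bit registers, so 8- and 16-bit divides are widened first. The intended result is roughly the following (a sketch of the computed semantics, not FEX code):

#include <cstdint>

// What an 8-bit UDiv computes once both sources are zero-extended.
constexpr uint32_t UDiv8(uint8_t a, uint8_t b) { return uint32_t{a} / uint32_t{b}; }
// The signed variant sign-extends instead, matching sxtb/sxth in Div/Rem.
constexpr int32_t Div8(int8_t a, int8_t b) { return int32_t{a} / int32_t{b}; }

static_assert(UDiv8(200, 3) == 66);
static_assert(Div8(-100, 3) == -33); // C++ and x86 both truncate toward zero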
@ -458,20 +458,20 @@ DEF_OP(Rem) {
auto Op = IROp->C<IR::IROp_Rem>();
// Each source is OpSize in size
// So you can have up to a 128bit divide from x86-64
const uint8_t OpSize = IROp->Size;
const auto OpSize = IROp->Size;
const auto EmitSize = ConvertSize(IROp);
const auto Dst = GetReg(Node);
auto Src1 = GetReg(Op->Src1.ID());
auto Src2 = GetReg(Op->Src2.ID());
if (OpSize == 1) {
if (OpSize == IR::OpSize::i8Bit) {
sxtb(EmitSize, TMP1, Src1);
sxtb(EmitSize, TMP2, Src2);
Src1 = TMP1;
Src2 = TMP2;
} else if (OpSize == 2) {
} else if (OpSize == IR::OpSize::i16Bit) {
sxth(EmitSize, TMP1, Src1);
sxth(EmitSize, TMP2, Src2);
@ -487,20 +487,20 @@ DEF_OP(URem) {
auto Op = IROp->C<IR::IROp_URem>();
// Each source is OpSize in size
// So you can have up to a 128bit divide from x86-64
const uint8_t OpSize = IROp->Size;
const auto OpSize = IROp->Size;
const auto EmitSize = ConvertSize(IROp);
const auto Dst = GetReg(Node);
auto Src1 = GetReg(Op->Src1.ID());
auto Src2 = GetReg(Op->Src2.ID());
if (OpSize == 1) {
if (OpSize == IR::OpSize::i8Bit) {
uxtb(EmitSize, TMP1, Src1);
uxtb(EmitSize, TMP2, Src2);
Src1 = TMP1;
Src2 = TMP2;
} else if (OpSize == 2) {
} else if (OpSize == IR::OpSize::i16Bit) {
uxth(EmitSize, TMP1, Src1);
uxth(EmitSize, TMP2, Src2);
@ -514,15 +514,15 @@ DEF_OP(URem) {
DEF_OP(MulH) {
auto Op = IROp->C<IR::IROp_MulH>();
const uint8_t OpSize = IROp->Size;
const auto OpSize = IROp->Size;
LOGMAN_THROW_AA_FMT(OpSize == 4 || OpSize == 8, "Unsupported {} size: {}", __func__, OpSize);
LOGMAN_THROW_AA_FMT(OpSize == IR::OpSize::i32Bit || OpSize == IR::OpSize::i64Bit, "Unsupported {} size: {}", __func__, OpSize);
const auto Dst = GetReg(Node);
const auto Src1 = GetReg(Op->Src1.ID());
const auto Src2 = GetReg(Op->Src2.ID());
if (OpSize == 4) {
if (OpSize == IR::OpSize::i32Bit) {
sxtw(TMP1, Src1.W());
sxtw(TMP2, Src2.W());
mul(ARMEmitter::Size::i32Bit, Dst, TMP1, TMP2);
@ -534,15 +534,15 @@ DEF_OP(MulH) {
DEF_OP(UMulH) {
auto Op = IROp->C<IR::IROp_UMulH>();
const uint8_t OpSize = IROp->Size;
const auto OpSize = IROp->Size;
LOGMAN_THROW_AA_FMT(OpSize == 4 || OpSize == 8, "Unsupported {} size: {}", __func__, OpSize);
LOGMAN_THROW_AA_FMT(OpSize == IR::OpSize::i32Bit || OpSize == IR::OpSize::i64Bit, "Unsupported {} size: {}", __func__, OpSize);
const auto Dst = GetReg(Node);
const auto Src1 = GetReg(Op->Src1.ID());
const auto Src2 = GetReg(Op->Src2.ID());
if (OpSize == 4) {
if (OpSize == IR::OpSize::i32Bit) {
uxtw(ARMEmitter::Size::i64Bit, TMP1, Src1);
uxtw(ARMEmitter::Size::i64Bit, TMP2, Src2);
mul(ARMEmitter::Size::i64Bit, Dst, TMP1, TMP2);
@ -593,7 +593,7 @@ DEF_OP(Ornror) {
DEF_OP(AndWithFlags) {
auto Op = IROp->C<IR::IROp_AndWithFlags>();
const uint8_t OpSize = IROp->Size;
const auto OpSize = IROp->Size;
const auto EmitSize = ConvertSize(IROp);
uint64_t Const;
@ -601,7 +601,7 @@ DEF_OP(AndWithFlags) {
auto Src1 = GetReg(Op->Src1.ID());
// See TestNZ
if (OpSize < 4) {
if (OpSize < IR::OpSize::i32Bit) {
if (IsInlineConstant(Op->Src2, &Const)) {
and_(EmitSize, Dst, Src1, Const);
} else {
@ -614,7 +614,7 @@ DEF_OP(AndWithFlags) {
}
}
unsigned Shift = 32 - (OpSize * 8);
unsigned Shift = 32 - IR::OpSizeAsBits(OpSize);
cmn(EmitSize, ARMEmitter::Reg::zr, Dst, ARMEmitter::ShiftType::LSL, Shift);
} else {
if (IsInlineConstant(Op->Src2, &Const)) {
@ -648,21 +648,21 @@ DEF_OP(Ashr) {
uint64_t Const;
if (IsInlineConstant(Op->Src2, &Const)) {
if (OpSize >= 4) {
if (OpSize >= IR::OpSize::i32Bit) {
asr(EmitSize, Dst, Src1, (unsigned int)Const);
} else {
sbfx(EmitSize, TMP1, Src1, 0, OpSize * 8);
sbfx(EmitSize, TMP1, Src1, 0, IR::OpSizeAsBits(OpSize));
asr(EmitSize, Dst, TMP1, (unsigned int)Const);
ubfx(EmitSize, Dst, Dst, 0, OpSize * 8);
ubfx(EmitSize, Dst, Dst, 0, IR::OpSizeAsBits(OpSize));
}
} else {
const auto Src2 = GetReg(Op->Src2.ID());
if (OpSize >= 4) {
if (OpSize >= IR::OpSize::i32Bit) {
asrv(EmitSize, Dst, Src1, Src2);
} else {
sbfx(EmitSize, TMP1, Src1, 0, OpSize * 8);
sbfx(EmitSize, TMP1, Src1, 0, IR::OpSizeAsBits(OpSize));
asrv(EmitSize, Dst, TMP1, Src2);
ubfx(EmitSize, Dst, Dst, 0, OpSize * 8);
ubfx(EmitSize, Dst, Dst, 0, IR::OpSizeAsBits(OpSize));
}
}
}
@ -897,7 +897,7 @@ DEF_OP(PDep) {
DEF_OP(PExt) {
auto Op = IROp->C<IR::IROp_PExt>();
const auto OpSize = IROp->Size;
const auto OpSizeBitsM1 = (OpSize * 8) - 1;
const auto OpSizeBitsM1 = IR::OpSizeAsBits(OpSize) - 1;
const auto EmitSize = ConvertSize48(IROp);
const auto Input = GetReg(Op->Input.ID());
@ -952,8 +952,8 @@ DEF_OP(PExt) {
DEF_OP(LDiv) {
auto Op = IROp->C<IR::IROp_LDiv>();
const uint8_t OpSize = IROp->Size;
const auto EmitSize = OpSize >= 4 ? ARMEmitter::Size::i64Bit : ARMEmitter::Size::i32Bit;
const auto OpSize = IROp->Size;
const auto EmitSize = OpSize >= IR::OpSize::i32Bit ? ARMEmitter::Size::i64Bit : ARMEmitter::Size::i32Bit;
const auto Dst = GetReg(Node);
const auto Upper = GetReg(Op->Upper.ID());
@ -963,14 +963,14 @@ DEF_OP(LDiv) {
// Each source is OpSize in size
// So you can have up to a 128bit divide from x86-64
switch (OpSize) {
case 2: {
case IR::OpSize::i16Bit: {
uxth(EmitSize, TMP1, Lower);
bfi(EmitSize, TMP1, Upper, 16, 16);
sxth(EmitSize, TMP2, Divisor);
sdiv(EmitSize, Dst, TMP1, TMP2);
break;
}
case 4: {
case IR::OpSize::i32Bit: {
// TODO: 32-bit operation should be guaranteed not to leave garbage in the upper bits.
mov(EmitSize, TMP1, Lower);
bfi(EmitSize, TMP1, Upper, 32, 32);
@ -978,7 +978,7 @@ DEF_OP(LDiv) {
sdiv(EmitSize, Dst, TMP1, TMP2);
break;
}
case 8: {
case IR::OpSize::i64Bit: {
ARMEmitter::SingleUseForwardLabel Only64Bit {};
ARMEmitter::SingleUseForwardLabel LongDIVRet {};
@ -1022,8 +1022,8 @@ DEF_OP(LDiv) {
DEF_OP(LUDiv) {
auto Op = IROp->C<IR::IROp_LUDiv>();
const uint8_t OpSize = IROp->Size;
const auto EmitSize = OpSize >= 4 ? ARMEmitter::Size::i64Bit : ARMEmitter::Size::i32Bit;
const auto OpSize = IROp->Size;
const auto EmitSize = OpSize >= IR::OpSize::i32Bit ? ARMEmitter::Size::i64Bit : ARMEmitter::Size::i32Bit;
const auto Dst = GetReg(Node);
const auto Upper = GetReg(Op->Upper.ID());
@ -1033,20 +1033,20 @@ DEF_OP(LUDiv) {
// Each source is OpSize in size
// So you can have up to a 128bit divide from x86-64=
switch (OpSize) {
case 2: {
case IR::OpSize::i16Bit: {
uxth(EmitSize, TMP1, Lower);
bfi(EmitSize, TMP1, Upper, 16, 16);
udiv(EmitSize, Dst, TMP1, Divisor);
break;
}
case 4: {
case IR::OpSize::i32Bit: {
// TODO: 32-bit operation should be guaranteed not to leave garbage in the upper bits.
mov(EmitSize, TMP1, Lower);
bfi(EmitSize, TMP1, Upper, 32, 32);
udiv(EmitSize, Dst, TMP1, Divisor);
break;
}
case 8: {
case IR::OpSize::i64Bit: {
ARMEmitter::SingleUseForwardLabel Only64Bit {};
ARMEmitter::SingleUseForwardLabel LongDIVRet {};
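For the 16- and 32-bit long divides above, the two incoming halves are packed back into a single wide dividend before the hardware divide: uxth/mov provides the low half and bfi inserts the upper half at the half-width offset. Roughly, for the 16-bit case (a sketch of the uxth + bfi semantics, not emitter code):

#include <cstdint>

constexpr uint32_t Pack16(uint16_t Upper, uint16_t Lower) {
  return (uint32_t{Upper} << 16) | Lower; // what uxth + bfi(TMP1, Upper, 16, 16) builds
}
constexpr uint32_t LUDiv16(uint16_t Upper, uint16_t Lower, uint16_t Divisor) {
  return Pack16(Upper, Lower) / Divisor;
}
static_assert(LUDiv16(0x0001, 0x0000, 0x0002) == 0x8000); // 0x10000 / 2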
@ -1086,8 +1086,8 @@ DEF_OP(LUDiv) {
DEF_OP(LRem) {
auto Op = IROp->C<IR::IROp_LRem>();
const uint8_t OpSize = IROp->Size;
const auto EmitSize = OpSize >= 4 ? ARMEmitter::Size::i64Bit : ARMEmitter::Size::i32Bit;
const auto OpSize = IROp->Size;
const auto EmitSize = OpSize >= IR::OpSize::i32Bit ? ARMEmitter::Size::i64Bit : ARMEmitter::Size::i32Bit;
const auto Dst = GetReg(Node);
const auto Upper = GetReg(Op->Upper.ID());
@ -1097,7 +1097,7 @@ DEF_OP(LRem) {
// Each source is OpSize in size
// So you can have up to a 128bit divide from x86-64
switch (OpSize) {
case 2: {
case IR::OpSize::i16Bit: {
uxth(EmitSize, TMP1, Lower);
bfi(EmitSize, TMP1, Upper, 16, 16);
sxth(EmitSize, TMP2, Divisor);
@ -1105,7 +1105,7 @@ DEF_OP(LRem) {
msub(EmitSize, Dst, TMP3, TMP2, TMP1);
break;
}
case 4: {
case IR::OpSize::i32Bit: {
// TODO: 32-bit operation should be guaranteed not to leave garbage in the upper bits.
mov(EmitSize, TMP1, Lower);
bfi(EmitSize, TMP1, Upper, 32, 32);
@ -1114,7 +1114,7 @@ DEF_OP(LRem) {
msub(EmitSize, Dst, TMP2, TMP3, TMP1);
break;
}
case 8: {
case IR::OpSize::i64Bit: {
ARMEmitter::SingleUseForwardLabel Only64Bit {};
ARMEmitter::SingleUseForwardLabel LongDIVRet {};
@ -1160,8 +1160,8 @@ DEF_OP(LRem) {
DEF_OP(LURem) {
auto Op = IROp->C<IR::IROp_LURem>();
const uint8_t OpSize = IROp->Size;
const auto EmitSize = OpSize >= 4 ? ARMEmitter::Size::i64Bit : ARMEmitter::Size::i32Bit;
const auto OpSize = IROp->Size;
const auto EmitSize = OpSize >= IR::OpSize::i32Bit ? ARMEmitter::Size::i64Bit : ARMEmitter::Size::i32Bit;
const auto Dst = GetReg(Node);
const auto Upper = GetReg(Op->Upper.ID());
@ -1171,14 +1171,14 @@ DEF_OP(LURem) {
// Each source is OpSize in size
// So you can have up to a 128bit divide from x86-64
switch (OpSize) {
case 2: {
case IR::OpSize::i16Bit: {
uxth(EmitSize, TMP1, Lower);
bfi(EmitSize, TMP1, Upper, 16, 16);
udiv(EmitSize, TMP2, TMP1, Divisor);
msub(EmitSize, Dst, TMP2, Divisor, TMP1);
break;
}
case 4: {
case IR::OpSize::i32Bit: {
// TODO: 32-bit operation should be guaranteed not to leave garbage in the upper bits.
mov(EmitSize, TMP1, Lower);
bfi(EmitSize, TMP1, Upper, 32, 32);
@ -1186,7 +1186,7 @@ DEF_OP(LURem) {
msub(EmitSize, Dst, TMP2, Divisor, TMP1);
break;
}
case 8: {
case IR::OpSize::i64Bit: {
ARMEmitter::SingleUseForwardLabel Only64Bit {};
ARMEmitter::SingleUseForwardLabel LongDIVRet {};
@ -1238,30 +1238,30 @@ DEF_OP(Not) {
DEF_OP(Popcount) {
auto Op = IROp->C<IR::IROp_Popcount>();
const uint8_t OpSize = IROp->Size;
const auto OpSize = IROp->Size;
const auto Dst = GetReg(Node);
const auto Src = GetReg(Op->Src.ID());
switch (OpSize) {
case 0x1:
case IR::OpSize::i8Bit:
fmov(ARMEmitter::Size::i32Bit, VTMP1.S(), Src);
// only use lowest byte
cnt(ARMEmitter::SubRegSize::i8Bit, VTMP1.D(), VTMP1.D());
break;
case 0x2:
case IR::OpSize::i16Bit:
fmov(ARMEmitter::Size::i32Bit, VTMP1.S(), Src);
cnt(ARMEmitter::SubRegSize::i8Bit, VTMP1.D(), VTMP1.D());
// only count two lowest bytes
addp(ARMEmitter::SubRegSize::i8Bit, VTMP1.D(), VTMP1.D(), VTMP1.D());
break;
case 0x4:
case IR::OpSize::i32Bit:
fmov(ARMEmitter::Size::i32Bit, VTMP1.S(), Src);
cnt(ARMEmitter::SubRegSize::i8Bit, VTMP1.D(), VTMP1.D());
// fmov has zero extended, unused bytes are zero
addv(ARMEmitter::SubRegSize::i8Bit, VTMP1.D(), VTMP1.D());
break;
case 0x8:
case IR::OpSize::i64Bit:
fmov(ARMEmitter::Size::i64Bit, VTMP1.D(), Src);
cnt(ARMEmitter::SubRegSize::i8Bit, VTMP1.D(), VTMP1.D());
// fmov has zero extended, unused bytes are zero
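Baseline AArch64 has no popcount instruction for general-purpose registers, which is why the value takes a detour through a vector register above: cnt counts set bits per byte, then addp/addv sum exactly as many byte counts as the operand size requires (hence the per-size cases). The result is an ordinary population count, e.g.:

#include <cstdint>

// Reference semantics for the cnt + addp/addv sequence (sketch, not FEX code).
constexpr unsigned Popcount64(uint64_t Value) {
  unsigned Count = 0;
  for (; Value != 0; Value &= Value - 1) { // clear the lowest set bit each step
    ++Count;
  }
  return Count;
}
static_assert(Popcount64(0xFFull) == 8);
static_assert(Popcount64(0x8000'0000'0000'0001ull) == 2);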
@ -1288,17 +1288,18 @@ DEF_OP(FindLSB) {
DEF_OP(FindMSB) {
auto Op = IROp->C<IR::IROp_FindMSB>();
const uint8_t OpSize = IROp->Size;
const auto OpSize = IROp->Size;
LOGMAN_THROW_AA_FMT(OpSize == 2 || OpSize == 4 || OpSize == 8, "Unsupported {} size: {}", __func__, OpSize);
LOGMAN_THROW_AA_FMT(OpSize == IR::OpSize::i16Bit || OpSize == IR::OpSize::i32Bit || OpSize == IR::OpSize::i64Bit,
"Unsupported {} size: {}", __func__, OpSize);
const auto EmitSize = ConvertSize(IROp);
const auto Dst = GetReg(Node);
const auto Src = GetReg(Op->Src.ID());
movz(ARMEmitter::Size::i64Bit, TMP1, OpSize * 8 - 1);
movz(ARMEmitter::Size::i64Bit, TMP1, IR::OpSizeAsBits(OpSize) - 1);
if (OpSize == 2) {
if (OpSize == IR::OpSize::i16Bit) {
lsl(EmitSize, Dst, Src, 16);
clz(EmitSize, Dst, Dst);
} else {
@ -1310,9 +1311,10 @@ DEF_OP(FindMSB) {
DEF_OP(FindTrailingZeroes) {
auto Op = IROp->C<IR::IROp_FindTrailingZeroes>();
const uint8_t OpSize = IROp->Size;
const auto OpSize = IROp->Size;
LOGMAN_THROW_AA_FMT(OpSize == 2 || OpSize == 4 || OpSize == 8, "Unsupported {} size: {}", __func__, OpSize);
LOGMAN_THROW_AA_FMT(OpSize == IR::OpSize::i16Bit || OpSize == IR::OpSize::i32Bit || OpSize == IR::OpSize::i64Bit,
"Unsupported {} size: {}", __func__, OpSize);
const auto EmitSize = ConvertSize(IROp);
const auto Dst = GetReg(Node);
@ -1320,7 +1322,7 @@ DEF_OP(FindTrailingZeroes) {
rbit(EmitSize, Dst, Src);
if (OpSize == 2) {
if (OpSize == IR::OpSize::i16Bit) {
// This orr does two things. First, if the (masked) source is zero, it
// reverses to zero in the top so it forces clz to return 16. Second, it
// ensures garbage in the upper bits of the source don't affect clz, because
@ -1334,15 +1336,16 @@ DEF_OP(FindTrailingZeroes) {
DEF_OP(CountLeadingZeroes) {
auto Op = IROp->C<IR::IROp_CountLeadingZeroes>();
const uint8_t OpSize = IROp->Size;
const auto OpSize = IROp->Size;
LOGMAN_THROW_AA_FMT(OpSize == 2 || OpSize == 4 || OpSize == 8, "Unsupported {} size: {}", __func__, OpSize);
LOGMAN_THROW_AA_FMT(OpSize == IR::OpSize::i16Bit || OpSize == IR::OpSize::i32Bit || OpSize == IR::OpSize::i64Bit,
"Unsupported {} size: {}", __func__, OpSize);
const auto EmitSize = ConvertSize(IROp);
const auto Dst = GetReg(Node);
const auto Src = GetReg(Op->Src.ID());
if (OpSize == 2) {
if (OpSize == IR::OpSize::i16Bit) {
// Expressing as lsl+orr+clz clears away any garbage in the upper bits
// (alternatively could do uxth+clz+sub.. equal cost in total).
lsl(EmitSize, Dst, Src, 16);
@ -1355,16 +1358,17 @@ DEF_OP(CountLeadingZeroes) {
DEF_OP(Rev) {
auto Op = IROp->C<IR::IROp_Rev>();
const uint8_t OpSize = IROp->Size;
const auto OpSize = IROp->Size;
LOGMAN_THROW_AA_FMT(OpSize == 2 || OpSize == 4 || OpSize == 8, "Unsupported {} size: {}", __func__, OpSize);
LOGMAN_THROW_AA_FMT(OpSize == IR::OpSize::i16Bit || OpSize == IR::OpSize::i32Bit || OpSize == IR::OpSize::i64Bit,
"Unsupported {} size: {}", __func__, OpSize);
const auto EmitSize = ConvertSize(IROp);
const auto Dst = GetReg(Node);
const auto Src = GetReg(Op->Src.ID());
rev(EmitSize, Dst, Src);
if (OpSize == 2) {
if (OpSize == IR::OpSize::i16Bit) {
lsr(EmitSize, Dst, Dst, 16);
}
}
@ -1390,10 +1394,10 @@ DEF_OP(Bfi) {
mov(EmitSize, TMP1, SrcDst);
bfi(EmitSize, TMP1, Src, Op->lsb, Op->Width);
if (IROp->Size >= 4) {
if (IROp->Size >= IR::OpSize::i32Bit) {
mov(EmitSize, Dst, TMP1.R());
} else {
ubfx(EmitSize, Dst, TMP1, 0, IROp->Size * 8);
ubfx(EmitSize, Dst, TMP1, 0, IR::OpSizeAsBits(IROp->Size));
}
}
}
@ -1424,7 +1428,7 @@ DEF_OP(Bfxil) {
DEF_OP(Bfe) {
auto Op = IROp->C<IR::IROp_Bfe>();
LOGMAN_THROW_AA_FMT(IROp->Size <= 8, "OpSize is too large for BFE: {}", IROp->Size);
LOGMAN_THROW_AA_FMT(IROp->Size <= IR::OpSize::i64Bit, "OpSize is too large for BFE: {}", IROp->Size);
LOGMAN_THROW_AA_FMT(Op->Width != 0, "Invalid BFE width of 0");
const auto EmitSize = ConvertSize(IROp);
@ -1434,7 +1438,7 @@ DEF_OP(Bfe) {
if (Op->lsb == 0 && Op->Width == 32) {
mov(ARMEmitter::Size::i32Bit, Dst, Src);
} else if (Op->lsb == 0 && Op->Width == 64) {
LOGMAN_THROW_AA_FMT(IROp->Size == 8, "Must be 64-bit wide register");
LOGMAN_THROW_AA_FMT(IROp->Size == IR::OpSize::i64Bit, "Must be 64-bit wide register");
mov(ARMEmitter::Size::i64Bit, Dst, Src);
} else {
ubfx(EmitSize, Dst, Src, Op->lsb, Op->Width);
@ -1451,7 +1455,7 @@ DEF_OP(Sbfe) {
DEF_OP(Select) {
auto Op = IROp->C<IR::IROp_Select>();
const uint8_t OpSize = IROp->Size;
const auto OpSize = IROp->Size;
const auto EmitSize = ConvertSize(IROp);
const auto CompareEmitSize = Op->CompareSize == IR::OpSize::i64Bit ? ARMEmitter::Size::i64Bit : ARMEmitter::Size::i32Bit;
@ -1479,7 +1483,7 @@ DEF_OP(Select) {
bool is_const_true = IsInlineConstant(Op->TrueVal, &const_true);
bool is_const_false = IsInlineConstant(Op->FalseVal, &const_false);
uint64_t all_ones = OpSize == 8 ? 0xffff'ffff'ffff'ffffull : 0xffff'ffffull;
uint64_t all_ones = OpSize == IR::OpSize::i64Bit ? 0xffff'ffff'ffff'ffffull : 0xffff'ffffull;
ARMEmitter::Register Dst = GetReg(Node);
@ -1508,7 +1512,7 @@ DEF_OP(NZCVSelect) {
bool is_const_true = IsInlineConstant(Op->TrueVal, &const_true);
bool is_const_false = IsInlineConstant(Op->FalseVal, &const_false);
uint64_t all_ones = IROp->Size == 8 ? 0xffff'ffff'ffff'ffffull : 0xffff'ffffull;
uint64_t all_ones = IROp->Size == IR::OpSize::i64Bit ? 0xffff'ffff'ffff'ffffull : 0xffff'ffffull;
ARMEmitter::Register Dst = GetReg(Node);
@ -1547,7 +1551,7 @@ DEF_OP(VExtractToGPR) {
constexpr auto AVXRegBitSize = Core::CPUState::XMM_AVX_REG_SIZE * 8;
constexpr auto SSERegBitSize = Core::CPUState::XMM_SSE_REG_SIZE * 8;
const auto ElementSizeBits = Op->Header.ElementSize * 8;
const auto ElementSizeBits = IR::OpSizeAsBits(Op->Header.ElementSize);
const auto Offset = ElementSizeBits * Op->Index;
const auto Is256Bit = Offset >= SSERegBitSize;
@ -1558,10 +1562,10 @@ DEF_OP(VExtractToGPR) {
const auto PerformMove = [&](const ARMEmitter::VRegister reg, int index) {
switch (OpSize) {
case 1: umov<ARMEmitter::SubRegSize::i8Bit>(Dst, Vector, index); break;
case 2: umov<ARMEmitter::SubRegSize::i16Bit>(Dst, Vector, index); break;
case 4: umov<ARMEmitter::SubRegSize::i32Bit>(Dst, Vector, index); break;
case 8: umov<ARMEmitter::SubRegSize::i64Bit>(Dst, Vector, index); break;
case IR::OpSize::i8Bit: umov<ARMEmitter::SubRegSize::i8Bit>(Dst, Vector, index); break;
case IR::OpSize::i16Bit: umov<ARMEmitter::SubRegSize::i16Bit>(Dst, Vector, index); break;
case IR::OpSize::i32Bit: umov<ARMEmitter::SubRegSize::i32Bit>(Dst, Vector, index); break;
case IR::OpSize::i64Bit: umov<ARMEmitter::SubRegSize::i64Bit>(Dst, Vector, index); break;
default: LOGMAN_MSG_A_FMT("Unhandled ExtractElementSize: {}", OpSize); break;
}
};
@ -1586,10 +1590,10 @@ DEF_OP(VExtractToGPR) {
// upper half of the vector.
const auto SanitizedIndex = [OpSize, Op] {
switch (OpSize) {
case 1: return Op->Index - 16;
case 2: return Op->Index - 8;
case 4: return Op->Index - 4;
case 8: return Op->Index - 2;
case IR::OpSize::i8Bit: return Op->Index - 16;
case IR::OpSize::i16Bit: return Op->Index - 8;
case IR::OpSize::i32Bit: return Op->Index - 4;
case IR::OpSize::i64Bit: return Op->Index - 2;
default: LOGMAN_MSG_A_FMT("Unhandled OpSize: {}", OpSize); return 0;
}
}();
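The sanitized-index switch above rebases the element index onto the upper 128-bit half of a 256-bit register: the 16/8/4/2 values are simply how many elements of each size fit in one 128-bit lane. Equivalently, using the OpSize sketch from earlier (illustrative names, not FEX code):

constexpr int SanitizeIndex(int Index, IR::OpSize ElementSize) {
  return Index - (16 / IR::OpSizeToSize(ElementSize)); // elements per 128-bit lane
}
static_assert(SanitizeIndex(20, IR::OpSize::i8Bit) == 4); // matches the Index - 16 case
static_assert(SanitizeIndex(3, IR::OpSize::i64Bit) == 1); // matches the Index - 2 case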

View File

@ -15,18 +15,18 @@ DEF_OP(VInsGPR) {
const auto DestIdx = Op->DestIdx;
const auto ElementSize = Op->Header.ElementSize;
const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE;
const auto Is256Bit = OpSize == IR::OpSize::i256Bit;
LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__);
const auto SubEmitSize = ConvertSubRegSize8(IROp);
const auto ElementsPer128Bit = 16 / ElementSize;
const auto ElementsPer128Bit = IR::NumElements(IR::OpSize::i128Bit, ElementSize);
const auto Dst = GetVReg(Node);
const auto DestVector = GetVReg(Op->DestVector.ID());
const auto Src = GetReg(Op->Src.ID());
if (HostSupportsSVE256 && Is256Bit) {
const auto ElementSizeBits = ElementSize * 8;
const auto ElementSizeBits = IR::OpSizeAsBits(ElementSize);
const auto Offset = ElementSizeBits * DestIdx;
const auto SSEBitSize = Core::CPUState::XMM_SSE_REG_SIZE * 8;
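IR::NumElements above replaces the hand-written 16 / ElementSize. Its definition is not part of this diff; presumably it is just the register size divided by the element size, expressed on the enum, something like:

// Assumed shape of the helper used above (not FEXCore's actual definition).
constexpr uint16_t NumElements(IR::OpSize RegisterSize, IR::OpSize ElementSize) {
  return IR::OpSizeToSize(RegisterSize) / IR::OpSizeToSize(ElementSize);
}
static_assert(NumElements(IR::OpSize::i128Bit, IR::OpSize::i16Bit) == 8);
static_assert(NumElements(IR::OpSize::i256Bit, IR::OpSize::i32Bit) == 8);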
@ -90,16 +90,16 @@ DEF_OP(VCastFromGPR) {
auto Src = GetReg(Op->Src.ID());
switch (Op->Header.ElementSize) {
case 1:
case IR::OpSize::i8Bit:
uxtb(ARMEmitter::Size::i32Bit, TMP1, Src);
fmov(ARMEmitter::Size::i32Bit, Dst.S(), TMP1);
break;
case 2:
case IR::OpSize::i16Bit:
uxth(ARMEmitter::Size::i32Bit, TMP1, Src);
fmov(ARMEmitter::Size::i32Bit, Dst.S(), TMP1);
break;
case 4: fmov(ARMEmitter::Size::i32Bit, Dst.S(), Src); break;
case 8: fmov(ARMEmitter::Size::i64Bit, Dst.D(), Src); break;
case IR::OpSize::i32Bit: fmov(ARMEmitter::Size::i32Bit, Dst.S(), Src); break;
case IR::OpSize::i64Bit: fmov(ARMEmitter::Size::i64Bit, Dst.D(), Src); break;
default: LOGMAN_MSG_A_FMT("Unknown castGPR element size: {}", Op->Header.ElementSize);
}
}
@ -111,7 +111,7 @@ DEF_OP(VDupFromGPR) {
const auto Dst = GetVReg(Node);
const auto Src = GetReg(Op->Src.ID());
const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE;
const auto Is256Bit = OpSize == IR::OpSize::i256Bit;
LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__);
const auto SubEmitSize = ConvertSubRegSize8(IROp);
@ -126,7 +126,7 @@ DEF_OP(VDupFromGPR) {
DEF_OP(Float_FromGPR_S) {
const auto Op = IROp->C<IR::IROp_Float_FromGPR_S>();
const uint16_t ElementSize = Op->Header.ElementSize;
const uint16_t ElementSize = IR::OpSizeToSize(Op->Header.ElementSize);
const uint16_t Conv = (ElementSize << 8) | IR::OpSizeToSize(Op->SrcElementSize);
auto Dst = GetVReg(Node);
@ -165,7 +165,7 @@ DEF_OP(Float_FromGPR_S) {
DEF_OP(Float_FToF) {
auto Op = IROp->C<IR::IROp_Float_FToF>();
const uint16_t Conv = (Op->Header.ElementSize << 8) | IR::OpSizeToSize(Op->SrcElementSize);
const uint16_t Conv = (IR::OpSizeToSize(Op->Header.ElementSize) << 8) | IR::OpSizeToSize(Op->SrcElementSize);
auto Dst = GetVReg(Node);
auto Src = GetVReg(Op->Scalar.ID());
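The Conv value above packs the destination and source element byte sizes into one integer, (dst << 8) | src, so a single switch can dispatch on the conversion pair; a double-to-float conversion packs to 0x0408 and float-to-double to 0x0804. Roughly, with the OpSize sketch from earlier:

#include <cstdint>

// Packed (destination, source) element sizes as used by the Float_FToF switch.
constexpr uint16_t PackConv(IR::OpSize Dst, IR::OpSize Src) {
  return (uint16_t(IR::OpSizeToSize(Dst)) << 8) | IR::OpSizeToSize(Src);
}
static_assert(PackConv(IR::OpSize::i32Bit, IR::OpSize::i64Bit) == 0x0408); // f64 -> f32
static_assert(PackConv(IR::OpSize::i64Bit, IR::OpSize::i32Bit) == 0x0804); // f32 -> f64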
@ -205,7 +205,7 @@ DEF_OP(Vector_SToF) {
const auto ElementSize = Op->Header.ElementSize;
const auto SubEmitSize = ConvertSubRegSize248(IROp);
const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE;
const auto Is256Bit = OpSize == IR::OpSize::i256Bit;
LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__);
const auto Dst = GetVReg(Node);
@ -215,15 +215,15 @@ DEF_OP(Vector_SToF) {
scvtf(Dst.Z(), SubEmitSize, Mask.Merging(), Vector.Z(), SubEmitSize);
} else {
if (OpSize == ElementSize) {
if (ElementSize == 8) {
if (ElementSize == IR::OpSize::i64Bit) {
scvtf(ARMEmitter::ScalarRegSize::i64Bit, Dst.D(), Vector.D());
} else if (ElementSize == 4) {
} else if (ElementSize == IR::OpSize::i32Bit) {
scvtf(ARMEmitter::ScalarRegSize::i32Bit, Dst.S(), Vector.S());
} else {
scvtf(ARMEmitter::ScalarRegSize::i16Bit, Dst.H(), Vector.H());
}
} else {
if (OpSize == 8) {
if (OpSize == IR::OpSize::i64Bit) {
scvtf(SubEmitSize, Dst.D(), Vector.D());
} else {
scvtf(SubEmitSize, Dst.Q(), Vector.Q());
@ -238,7 +238,7 @@ DEF_OP(Vector_FToZS) {
const auto ElementSize = Op->Header.ElementSize;
const auto SubEmitSize = ConvertSubRegSize248(IROp);
const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE;
const auto Is256Bit = OpSize == IR::OpSize::i256Bit;
LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__);
const auto Dst = GetVReg(Node);
@ -248,15 +248,15 @@ DEF_OP(Vector_FToZS) {
fcvtzs(Dst.Z(), SubEmitSize, Mask.Merging(), Vector.Z(), SubEmitSize);
} else {
if (OpSize == ElementSize) {
if (ElementSize == 8) {
if (ElementSize == IR::OpSize::i64Bit) {
fcvtzs(ARMEmitter::ScalarRegSize::i64Bit, Dst.D(), Vector.D());
} else if (ElementSize == 4) {
} else if (ElementSize == IR::OpSize::i32Bit) {
fcvtzs(ARMEmitter::ScalarRegSize::i32Bit, Dst.S(), Vector.S());
} else {
fcvtzs(ARMEmitter::ScalarRegSize::i16Bit, Dst.H(), Vector.H());
}
} else {
if (OpSize == 8) {
if (OpSize == IR::OpSize::i64Bit) {
fcvtzs(SubEmitSize, Dst.D(), Vector.D());
} else {
fcvtzs(SubEmitSize, Dst.Q(), Vector.Q());
@ -269,7 +269,7 @@ DEF_OP(Vector_FToS) {
const auto Op = IROp->C<IR::IROp_Vector_FToS>();
const auto OpSize = IROp->Size;
const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE;
const auto Is256Bit = OpSize == IR::OpSize::i256Bit;
LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__);
const auto SubEmitSize = ConvertSubRegSize248(IROp);
@ -284,7 +284,7 @@ DEF_OP(Vector_FToS) {
} else {
const auto Dst = GetVReg(Node);
const auto Vector = GetVReg(Op->Vector.ID());
if (OpSize == 8) {
if (OpSize == IR::OpSize::i64Bit) {
frinti(SubEmitSize, Dst.D(), Vector.D());
fcvtzs(SubEmitSize, Dst.D(), Dst.D());
} else {
@ -300,10 +300,10 @@ DEF_OP(Vector_FToF) {
const auto ElementSize = Op->Header.ElementSize;
const auto SubEmitSize = ConvertSubRegSize248(IROp);
const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE;
const auto Is256Bit = OpSize == IR::OpSize::i256Bit;
LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__);
const auto Conv = (ElementSize << 8) | IR::OpSizeToSize(Op->SrcElementSize);
const auto Conv = (IR::OpSizeToSize(ElementSize) << 8) | IR::OpSizeToSize(Op->SrcElementSize);
const auto Dst = GetVReg(Node);
const auto Vector = GetVReg(Op->Vector.ID());
@ -403,7 +403,7 @@ DEF_OP(Vector_FToI) {
const auto ElementSize = Op->Header.ElementSize;
const auto SubEmitSize = ConvertSubRegSize248(IROp);
const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE;
const auto Is256Bit = OpSize == IR::OpSize::i256Bit;
LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__);
const auto Dst = GetVReg(Node);
@ -427,15 +427,15 @@ DEF_OP(Vector_FToI) {
// frinti having AdvSIMD, AdvSIMD scalar, and an SVE version),
// we can't just use a lambda without some seriously ugly casting.
// This is fairly self-contained otherwise.
#define ROUNDING_FN(name) \
if (ElementSize == 2) { \
name(Dst.H(), Vector.H()); \
} else if (ElementSize == 4) { \
name(Dst.S(), Vector.S()); \
} else if (ElementSize == 8) { \
name(Dst.D(), Vector.D()); \
} else { \
FEX_UNREACHABLE; \
#define ROUNDING_FN(name) \
if (ElementSize == IR::OpSize::i16Bit) { \
name(Dst.H(), Vector.H()); \
} else if (ElementSize == IR::OpSize::i32Bit) { \
name(Dst.S(), Vector.S()); \
} else if (ElementSize == IR::OpSize::i64Bit) { \
name(Dst.D(), Vector.D()); \
} else { \
FEX_UNREACHABLE; \
}
switch (Op->Round) {
@ -464,7 +464,7 @@ DEF_OP(Vector_F64ToI32) {
const auto OpSize = IROp->Size;
const auto Round = Op->Round;
const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE;
const auto Is256Bit = OpSize == IR::OpSize::i256Bit;
LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__);
const auto Dst = GetVReg(Node);

View File

@ -24,7 +24,7 @@ DEF_OP(VAESEnc) {
const auto State = GetVReg(Op->State.ID());
const auto ZeroReg = GetVReg(Op->ZeroReg.ID());
LOGMAN_THROW_AA_FMT(OpSize == Core::CPUState::XMM_SSE_REG_SIZE, "Currently only supports 128-bit operations.");
LOGMAN_THROW_AA_FMT(OpSize == IR::OpSize::i128Bit, "Currently only supports 128-bit operations.");
if (Dst == State && Dst != Key) {
// Optimal case in which Dst already contains the starting state.
@ -49,7 +49,7 @@ DEF_OP(VAESEncLast) {
const auto State = GetVReg(Op->State.ID());
const auto ZeroReg = GetVReg(Op->ZeroReg.ID());
LOGMAN_THROW_AA_FMT(OpSize == Core::CPUState::XMM_SSE_REG_SIZE, "Currently only supports 128-bit operations.");
LOGMAN_THROW_AA_FMT(OpSize == IR::OpSize::i128Bit, "Currently only supports 128-bit operations.");
if (Dst == State && Dst != Key) {
// Optimal case in which Dst already contains the starting state.
@ -72,7 +72,7 @@ DEF_OP(VAESDec) {
const auto State = GetVReg(Op->State.ID());
const auto ZeroReg = GetVReg(Op->ZeroReg.ID());
LOGMAN_THROW_AA_FMT(OpSize == Core::CPUState::XMM_SSE_REG_SIZE, "Currently only supports 128-bit operations.");
LOGMAN_THROW_AA_FMT(OpSize == IR::OpSize::i128Bit, "Currently only supports 128-bit operations.");
if (Dst == State && Dst != Key) {
// Optimal case in which Dst already contains the starting state.
@ -97,7 +97,7 @@ DEF_OP(VAESDecLast) {
const auto State = GetVReg(Op->State.ID());
const auto ZeroReg = GetVReg(Op->ZeroReg.ID());
LOGMAN_THROW_AA_FMT(OpSize == Core::CPUState::XMM_SSE_REG_SIZE, "Currently only supports 128-bit operations.");
LOGMAN_THROW_AA_FMT(OpSize == IR::OpSize::i128Bit, "Currently only supports 128-bit operations.");
if (Dst == State && Dst != Key) {
// Optimal case in which Dst already contains the starting state.
@ -193,7 +193,7 @@ DEF_OP(PCLMUL) {
const auto Src1 = GetVReg(Op->Src1.ID());
const auto Src2 = GetVReg(Op->Src2.ID());
LOGMAN_THROW_AA_FMT(OpSize == Core::CPUState::XMM_SSE_REG_SIZE, "Currently only supports 128-bit operations.");
LOGMAN_THROW_AA_FMT(OpSize == IR::OpSize::i128Bit, "Currently only supports 128-bit operations.");
switch (Op->Selector) {
case 0b00000000: pmull(ARMEmitter::SubRegSize::i128Bit, Dst.D(), Src1.D(), Src2.D()); break;

View File

@ -228,7 +228,7 @@ private:
bool IsGPR(IR::NodeID Node) const;
[[nodiscard]]
ARMEmitter::ExtendedMemOperand GenerateMemOperand(uint8_t AccessSize, ARMEmitter::Register Base, IR::OrderedNodeWrapper Offset,
ARMEmitter::ExtendedMemOperand GenerateMemOperand(IR::OpSize AccessSize, ARMEmitter::Register Base, IR::OrderedNodeWrapper Offset,
IR::MemOffsetType OffsetType, uint8_t OffsetScale);
// NOTE: Will use TMP1 as a way to encode immediates that happen to fall outside
@ -237,7 +237,7 @@ private:
// TMP1 is safe to use again once this memory operand is used with its
// equivalent loads or stores that this was called for.
[[nodiscard]]
ARMEmitter::SVEMemOperand GenerateSVEMemOperand(uint8_t AccessSize, ARMEmitter::Register Base, IR::OrderedNodeWrapper Offset,
ARMEmitter::SVEMemOperand GenerateSVEMemOperand(IR::OpSize AccessSize, ARMEmitter::Register Base, IR::OrderedNodeWrapper Offset,
IR::MemOffsetType OffsetType, uint8_t OffsetScale);
[[nodiscard]]
@ -318,15 +318,16 @@ private:
using ScalarFMAOpCaller =
std::function<void(ARMEmitter::VRegister Dst, ARMEmitter::VRegister Src1, ARMEmitter::VRegister Src2, ARMEmitter::VRegister Src3)>;
void VFScalarFMAOperation(uint8_t OpSize, uint8_t ElementSize, ScalarFMAOpCaller ScalarEmit, ARMEmitter::VRegister Dst,
void VFScalarFMAOperation(IR::OpSize OpSize, IR::OpSize ElementSize, ScalarFMAOpCaller ScalarEmit, ARMEmitter::VRegister Dst,
ARMEmitter::VRegister Upper, ARMEmitter::VRegister Vector1, ARMEmitter::VRegister Vector2,
ARMEmitter::VRegister Addend);
using ScalarBinaryOpCaller = std::function<void(ARMEmitter::VRegister Dst, ARMEmitter::VRegister Src1, ARMEmitter::VRegister Src2)>;
void VFScalarOperation(uint8_t OpSize, uint8_t ElementSize, bool ZeroUpperBits, ScalarBinaryOpCaller ScalarEmit,
void VFScalarOperation(IR::OpSize OpSize, IR::OpSize ElementSize, bool ZeroUpperBits, ScalarBinaryOpCaller ScalarEmit,
ARMEmitter::VRegister Dst, ARMEmitter::VRegister Vector1, ARMEmitter::VRegister Vector2);
using ScalarUnaryOpCaller = std::function<void(ARMEmitter::VRegister Dst, std::variant<ARMEmitter::VRegister, ARMEmitter::Register> SrcVar)>;
void VFScalarUnaryOperation(uint8_t OpSize, uint8_t ElementSize, bool ZeroUpperBits, ScalarUnaryOpCaller ScalarEmit, ARMEmitter::VRegister Dst,
ARMEmitter::VRegister Vector1, std::variant<ARMEmitter::VRegister, ARMEmitter::Register> Vector2);
void VFScalarUnaryOperation(IR::OpSize OpSize, IR::OpSize ElementSize, bool ZeroUpperBits, ScalarUnaryOpCaller ScalarEmit,
ARMEmitter::VRegister Dst, ARMEmitter::VRegister Vector1,
std::variant<ARMEmitter::VRegister, ARMEmitter::Register> Vector2);
void Emulate128BitGather(IR::OpSize Size, IR::OpSize ElementSize, ARMEmitter::VRegister Dst, ARMEmitter::VRegister IncomingDst,
std::optional<ARMEmitter::Register> BaseAddr, ARMEmitter::VRegister VectorIndexLow,
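Retyping these helper parameters from uint8_t to IR::OpSize is where the enum class earns its keep: a scoped enum has no implicit conversion from integers, so a caller passing a raw byte (or bit) count stops compiling instead of silently mis-sizing a memory operand. A contrived illustration with hypothetical function names, only to show the compile-time effect, reusing the OpSize sketch from earlier:

#include <cstdint>

inline void GenerateOld(uint8_t /*AccessSize*/) {}     // old style: any integer converts
inline void GenerateNew(IR::OpSize /*AccessSize*/) {}  // new style: only OpSize is accepted

inline void Caller() {
  GenerateOld(32);                 // compiles even if 32 was meant as bits rather than bytes
  // GenerateNew(32);              // would not compile: no implicit int -> OpSize conversion
  GenerateNew(IR::OpSize::i32Bit); // intent is explicit at the call site
}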

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -168,7 +168,7 @@ void OpDispatchBuilder::RETOp(OpcodeArgs) {
if (Op->OP == 0xC2) {
auto Offset = LoadSource(GPRClass, Op, Op->Src[0], Op->Flags);
SP = _Add(IR::SizeToOpSize(GPRSize), SP, Offset);
SP = _Add(GPRSize, SP, Offset);
}
// Store the new stack pointer
@ -297,7 +297,7 @@ void OpDispatchBuilder::ADCOp(OpcodeArgs, uint32_t SrcIndex) {
HandledLock = true;
Ref DestMem = MakeSegmentAddress(Op, Op->Dest);
Before = _AtomicFetchAdd(IR::SizeToOpSize(Size), ALUOp, DestMem);
Before = _AtomicFetchAdd(Size, ALUOp, DestMem);
} else {
Before = LoadSource(GPRClass, Op, Op->Dest, Op->Flags, {.AllowUpperGarbage = true});
}
@ -334,7 +334,7 @@ void OpDispatchBuilder::SBBOp(OpcodeArgs, uint32_t SrcIndex) {
Ref DestMem = MakeSegmentAddress(Op, Op->Dest);
auto SrcPlusCF = IncrementByCarry(OpSize, Src);
Before = _AtomicFetchSub(IR::SizeToOpSize(Size), SrcPlusCF, DestMem);
Before = _AtomicFetchSub(Size, SrcPlusCF, DestMem);
} else {
Before = LoadSource(GPRClass, Op, Op->Dest, Op->Flags, {.AllowUpperGarbage = true});
}
@ -494,7 +494,7 @@ void OpDispatchBuilder::POPAOp(OpcodeArgs) {
StoreGPRRegister(X86State::REG_RBP, Pop(Size, SP), Size);
// Skip loading RSP because it'll be correct at the end
SP = _RMWHandle(_Add(OpSize::i64Bit, SP, _InlineConstant(Size)));
SP = _RMWHandle(_Add(OpSize::i64Bit, SP, _InlineConstant(IR::OpSizeToSize(Size))));
StoreGPRRegister(X86State::REG_RBX, Pop(Size, SP), Size);
StoreGPRRegister(X86State::REG_RDX, Pop(Size, SP), Size);
@ -567,7 +567,7 @@ void OpDispatchBuilder::CALLOp(OpcodeArgs) {
uint64_t InstRIP = Op->PC + Op->InstSize;
uint64_t TargetRIP = InstRIP + TargetOffset;
Ref NewRIP = _Add(IR::SizeToOpSize(GPRSize), ConstantPC, _Constant(TargetOffset));
Ref NewRIP = _Add(GPRSize, ConstantPC, _Constant(TargetOffset));
// Push the return address.
Push(GPRSize, ConstantPC);
@ -715,7 +715,7 @@ void OpDispatchBuilder::CMOVOp(OpcodeArgs) {
Src = LoadSource(GPRClass, Op, Op->Src[0], Op->Flags);
}
auto SrcCond = SelectCC(Op->OP & 0xF, IR::SizeToOpSize(std::max<uint8_t>(OpSize::i32Bit, GetSrcSize(Op))), Src, Dest);
auto SrcCond = SelectCC(Op->OP & 0xF, std::max(OpSize::i32Bit, OpSizeFromSrc(Op)), Src, Dest);
StoreResult(GPRClass, Op, SrcCond, OpSize::iInvalid);
}
@ -731,7 +731,7 @@ void OpDispatchBuilder::CondJUMPOp(OpcodeArgs) {
uint64_t InstRIP = Op->PC + Op->InstSize;
uint64_t Target = InstRIP + TargetOffset;
if (CTX->GetGPRSize() == OpSize::i32Bit) {
if (CTX->GetGPROpSize() == OpSize::i32Bit) {
// If the GPRSize is 4 then we need to be careful about PC wrapping
if (TargetOffset < 0 && -TargetOffset > InstRIP) {
// Invert the signed value if we are underflowing
@ -802,7 +802,7 @@ void OpDispatchBuilder::CondJUMPRCXOp(OpcodeArgs) {
BlockSetRIP = true;
auto JcxGPRSize = CTX->GetGPROpSize();
JcxGPRSize = (Op->Flags & X86Tables::DecodeFlags::FLAG_ADDRESS_SIZE) ? (IR::DivideOpSize(JcxGPRSize, 2)) : JcxGPRSize;
JcxGPRSize = (Op->Flags & X86Tables::DecodeFlags::FLAG_ADDRESS_SIZE) ? (JcxGPRSize >> 1) : JcxGPRSize;
uint64_t Target = Op->PC + Op->InstSize + Op->Src[0].Literal();
@ -937,7 +937,7 @@ void OpDispatchBuilder::JUMPOp(OpcodeArgs) {
uint64_t InstRIP = Op->PC + Op->InstSize;
uint64_t TargetRIP = InstRIP + TargetOffset;
if (CTX->GetGPRSize() == OpSize::i32Bit) {
if (CTX->GetGPROpSize() == OpSize::i32Bit) {
// If the GPRSize is 4 then we need to be careful about PC wrapping
if (TargetOffset < 0 && -TargetOffset > InstRIP) {
// Invert the signed value if we are underflowing
@ -1000,18 +1000,18 @@ void OpDispatchBuilder::TESTOp(OpcodeArgs, uint32_t SrcIndex) {
Ref Src = LoadSource(GPRClass, Op, Op->Src[SrcIndex], Op->Flags, {.AllowUpperGarbage = true});
Ref Dest = LoadSource(GPRClass, Op, Op->Dest, Op->Flags, {.AllowUpperGarbage = true});
auto Size = GetDstSize(Op);
const auto Size = OpSizeFromDst(Op);
uint64_t Const;
bool AlwaysNonnegative = false;
if (IsValueConstant(WrapNode(Src), &Const)) {
// Optimize out masking constants
if (Const == (Size == OpSize::i64Bit ? ~0ULL : ((1ull << Size * 8) - 1))) {
if (Const == (Size == OpSize::i64Bit ? ~0ULL : ((1ull << IR::OpSizeAsBits(Size)) - 1))) {
Src = Dest;
}
// Optimize test with non-sign bits
AlwaysNonnegative = (Const & (1ull << ((Size * 8) - 1))) == 0;
AlwaysNonnegative = (Const & (1ull << (IR::OpSizeAsBits(Size) - 1))) == 0;
}
if (Dest == Src) {
@ -1024,7 +1024,7 @@ void OpDispatchBuilder::TESTOp(OpcodeArgs, uint32_t SrcIndex) {
SetNZ_ZeroCV(OpSize::i32Bit, Res);
} else {
HandleNZ00Write();
CalculatePF(_AndWithFlags(IR::SizeToOpSize(Size), Dest, Src));
CalculatePF(_AndWithFlags(Size, Dest, Src));
}
InvalidateAF();
@ -1049,7 +1049,7 @@ void OpDispatchBuilder::MOVSXDOp(OpcodeArgs) {
StoreResult_WithOpSize(GPRClass, Op, Op->Dest, Src, Size, OpSize::iInvalid);
} else if (Sext) {
// With REX.W then Sext
Src = _Sbfe(OpSize::i64Bit, Size * 8, 0, Src);
Src = _Sbfe(OpSize::i64Bit, IR::OpSizeAsBits(Size), 0, Src);
StoreResult(GPRClass, Op, Src, OpSize::iInvalid);
} else {
// Without REX.W then Zext (store result implicitly zero extends)
@ -1059,13 +1059,13 @@ void OpDispatchBuilder::MOVSXDOp(OpcodeArgs) {
void OpDispatchBuilder::MOVSXOp(OpcodeArgs) {
// Load garbage in upper bits, since we're sign extending anyway
uint8_t Size = GetSrcSize(Op);
const auto Size = OpSizeFromSrc(Op);
Ref Src = LoadSource(GPRClass, Op, Op->Src[0], Op->Flags, {.AllowUpperGarbage = true});
// Sign-extend to DstSize and zero-extend to the register size, using a fast
// path for 32-bit dests where the native 32-bit Sbfe zero extends the top.
uint8_t DstSize = GetDstSize(Op);
Src = _Sbfe(DstSize == OpSize::i64Bit ? OpSize::i64Bit : OpSize::i32Bit, Size * 8, 0, Src);
const auto DstSize = OpSizeFromDst(Op);
Src = _Sbfe(DstSize == OpSize::i64Bit ? OpSize::i64Bit : OpSize::i32Bit, IR::OpSizeAsBits(Size), 0, Src);
StoreResult(GPRClass, Op, Op->Dest, Src, OpSize::iInvalid);
}
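MOVSX above leans on the Sbfe IR op: extract the low OpSizeAsBits(Size) bits of the source and sign-extend them to the destination width (lsb 0, width in bits). As a plain-C++ reference for what that computes (sketch; widths below 64 assumed):

#include <cstdint>

constexpr int64_t SignExtractLow(uint64_t Src, unsigned Width) {
  const int64_t Field = static_cast<int64_t>(Src & ((1ull << Width) - 1));
  const int64_t SignBit = int64_t{1} << (Width - 1);
  return (Field ^ SignBit) - SignBit; // classic sign-extension trick
}
static_assert(SignExtractLow(0x80, 8) == -128);
static_assert(SignExtractLow(0x7F, 8) == 127);
static_assert(SignExtractLow(0xFFFF, 16) == -1);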
@ -1134,10 +1134,10 @@ void OpDispatchBuilder::XCHGOp(OpcodeArgs) {
void OpDispatchBuilder::CDQOp(OpcodeArgs) {
const auto DstSize = OpSizeFromDst(Op);
const auto SrcSize = IR::SizeToOpSize(IR::OpSizeToSize(DstSize) >> 1);
const auto SrcSize = DstSize / 2;
Ref Src = LoadGPRRegister(X86State::REG_RAX, SrcSize, 0, true);
Src = _Sbfe(DstSize <= OpSize::i32Bit ? OpSize::i32Bit : OpSize::i64Bit, SrcSize * 8, 0, Src);
Src = _Sbfe(DstSize <= OpSize::i32Bit ? OpSize::i32Bit : OpSize::i64Bit, IR::OpSizeAsBits(SrcSize), 0, Src);
StoreResult_WithOpSize(GPRClass, Op, Op->Dest, Src, DstSize, OpSize::iInvalid);
}
@ -1374,7 +1374,7 @@ void OpDispatchBuilder::XGetBVOp(OpcodeArgs) {
}
void OpDispatchBuilder::SHLOp(OpcodeArgs) {
const auto Size = GetSrcSize(Op);
const auto Size = OpSizeFromSrc(Op);
auto Dest = LoadSource(GPRClass, Op, Op->Dest, Op->Flags, {.AllowUpperGarbage = true});
auto Src = LoadSource(GPRClass, Op, Op->Src[1], Op->Flags, {.AllowUpperGarbage = true});
@ -1398,7 +1398,7 @@ void OpDispatchBuilder::SHLImmediateOp(OpcodeArgs, bool SHL1Bit) {
void OpDispatchBuilder::SHROp(OpcodeArgs) {
const auto Size = OpSizeFromSrc(Op);
auto Dest = LoadSource(GPRClass, Op, Op->Dest, Op->Flags, {.AllowUpperGarbage = Size >= 4});
auto Dest = LoadSource(GPRClass, Op, Op->Dest, Op->Flags, {.AllowUpperGarbage = Size >= OpSize::i32Bit});
auto Src = LoadSource(GPRClass, Op, Op->Src[1], Op->Flags, {.AllowUpperGarbage = true});
auto ALUOp = _Lshr(std::max(OpSize::i32Bit, Size), Dest, Src);
@ -1557,29 +1557,29 @@ void OpDispatchBuilder::SHRDImmediateOp(OpcodeArgs) {
}
void OpDispatchBuilder::ASHROp(OpcodeArgs, bool Immediate, bool SHR1Bit) {
const auto Size = GetSrcSize(Op);
const auto Size = OpSizeFromSrc(Op);
const auto OpSize = std::max(OpSize::i32Bit, OpSizeFromDst(Op));
// If Size < 4, then we Sbfe the Dest so we can have garbage.
// Otherwise, if Size = Opsize, then both are 4 or 8 and match the a64
// semantics directly, so again we can have garbage. The only case where we
// need zero-extension here is when the sizes mismatch.
auto Dest = LoadSource(GPRClass, Op, Op->Dest, Op->Flags, {.AllowUpperGarbage = (OpSize == Size) || (Size < 4)});
auto Dest = LoadSource(GPRClass, Op, Op->Dest, Op->Flags, {.AllowUpperGarbage = (OpSize == Size) || (Size < OpSize::i32Bit)});
if (Size < OpSize::i32Bit) {
Dest = _Sbfe(OpSize::i64Bit, Size * 8, 0, Dest);
Dest = _Sbfe(OpSize::i64Bit, IR::OpSizeAsBits(Size), 0, Dest);
}
if (Immediate) {
uint64_t Shift = LoadConstantShift(Op, SHR1Bit);
Ref Result = _Ashr(IR::SizeToOpSize(OpSize), Dest, _Constant(Shift));
Ref Result = _Ashr(OpSize, Dest, _Constant(Shift));
CalculateFlags_SignShiftRightImmediate(OpSizeFromSrc(Op), Result, Dest, Shift);
CalculateDeferredFlags();
StoreResult(GPRClass, Op, Result, OpSize::iInvalid);
} else {
auto Src = LoadSource(GPRClass, Op, Op->Src[1], Op->Flags, {.AllowUpperGarbage = true});
Ref Result = _Ashr(IR::SizeToOpSize(OpSize), Dest, Src);
Ref Result = _Ashr(OpSize, Dest, Src);
HandleShift(Op, Result, Dest, ShiftType::ASR, Src);
}
@ -1660,12 +1660,12 @@ void OpDispatchBuilder::BEXTRBMIOp(OpcodeArgs) {
// Essentially (Src1 >> Start) & ((1 << Length) - 1)
// along with some edge-case handling and flag setting.
LOGMAN_THROW_A_FMT(Op->InstSize >= OpSize::i32Bit, "No masking needed");
LOGMAN_THROW_A_FMT(Op->InstSize >= 4, "No masking needed");
auto* Src1 = LoadSource(GPRClass, Op, Op->Src[0], Op->Flags, {.AllowUpperGarbage = true});
auto* Src2 = LoadSource(GPRClass, Op, Op->Src[1], Op->Flags, {.AllowUpperGarbage = true});
const auto Size = OpSizeFromSrc(Op);
const auto SrcSize = Size * 8;
const auto SrcSize = IR::OpSizeAsBits(Size);
const auto MaxSrcBit = SrcSize - 1;
auto MaxSrcBitOp = _Constant(Size, MaxSrcBit);
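The comment above already gives BEXTR's core: shift Src1 right by the start position, then keep the low Length bits. As a minimal reference (edge cases and flag updates omitted; Start below 64 assumed):

#include <cstdint>

constexpr uint64_t Bextr(uint64_t Src, unsigned Start, unsigned Length) {
  const uint64_t Shifted = Src >> Start;
  return Length >= 64 ? Shifted : (Shifted & ((1ull << Length) - 1));
}
static_assert(Bextr(0xABCD, 4, 8) == 0xBC);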
@ -1701,8 +1701,8 @@ void OpDispatchBuilder::BEXTRBMIOp(OpcodeArgs) {
void OpDispatchBuilder::BLSIBMIOp(OpcodeArgs) {
// Equivalent to performing: SRC & -SRC
LOGMAN_THROW_A_FMT(Op->InstSize >= OpSize::i32Bit, "No masking needed");
auto Size = OpSizeFromSrc(Op);
LOGMAN_THROW_A_FMT(Op->InstSize >= 4, "No masking needed");
const auto Size = OpSizeFromSrc(Op);
auto* Src = LoadSource(GPRClass, Op, Op->Src[0], Op->Flags, {.AllowUpperGarbage = true});
auto NegatedSrc = _Neg(Size, Src);
@ -1715,15 +1715,15 @@ void OpDispatchBuilder::BLSIBMIOp(OpcodeArgs) {
// inverted ZF.
//
// ZF/SF/OF set as usual.
SetNZ_ZeroCV(GetSrcSize(Op), Result);
SetNZ_ZeroCV(Size, Result);
InvalidatePF_AF();
SetCFInverted(GetRFLAG(X86State::RFLAG_ZF_RAW_LOC));
}
void OpDispatchBuilder::BLSMSKBMIOp(OpcodeArgs) {
// Equivalent to: (Src - 1) ^ Src
LOGMAN_THROW_A_FMT(Op->InstSize >= OpSize::i32Bit, "No masking needed");
auto Size = OpSizeFromSrc(Op);
LOGMAN_THROW_A_FMT(Op->InstSize >= 4, "No masking needed");
const auto Size = OpSizeFromSrc(Op);
auto* Src = LoadSource(GPRClass, Op, Op->Src[0], Op->Flags, {.AllowUpperGarbage = true});
auto Result = _Xor(Size, _Sub(Size, Src, _InlineConstant(1)), Src);
@ -1738,24 +1738,25 @@ void OpDispatchBuilder::BLSMSKBMIOp(OpcodeArgs) {
// The output of BLSMSK is always nonzero, so TST will clear Z (along with C
// and O) while setting S.
SetNZ_ZeroCV(GetSrcSize(Op), Result);
SetNZ_ZeroCV(Size, Result);
SetCFInverted(CFInv);
}
void OpDispatchBuilder::BLSRBMIOp(OpcodeArgs) {
// Equivalent to: (Src - 1) & Src
LOGMAN_THROW_A_FMT(Op->InstSize >= OpSize::i32Bit, "No masking needed");
auto* Src = LoadSource(GPRClass, Op, Op->Src[0], Op->Flags, {.AllowUpperGarbage = true});
auto Size = OpSizeFromSrc(Op);
LOGMAN_THROW_A_FMT(Op->InstSize >= 4, "No masking needed");
const auto Size = OpSizeFromSrc(Op);
auto* Src = LoadSource(GPRClass, Op, Op->Src[0], Op->Flags, {.AllowUpperGarbage = true});
auto Result = _And(Size, _Sub(Size, Src, _InlineConstant(1)), Src);
StoreResult(GPRClass, Op, Result, OpSize::iInvalid);
auto Zero = _Constant(0);
auto One = _Constant(1);
auto CFInv = _Select(IR::COND_NEQ, Src, Zero, One, Zero);
SetNZ_ZeroCV(GetSrcSize(Op), Result);
SetNZ_ZeroCV(Size, Result);
SetCFInverted(CFInv);
InvalidatePF_AF();
}
@ -1774,13 +1775,13 @@ void OpDispatchBuilder::BMI2Shift(OpcodeArgs) {
Ref Result;
if (Op->OP == 0x6F7) {
// SARX
Result = _Ashr(IR::SizeToOpSize(Size), Src, Shift);
Result = _Ashr(Size, Src, Shift);
} else if (Op->OP == 0x5F7) {
// SHLX
Result = _Lshl(IR::SizeToOpSize(Size), Src, Shift);
Result = _Lshl(Size, Src, Shift);
} else {
// SHRX
Result = _Lshr(IR::SizeToOpSize(Size), Src, Shift);
Result = _Lshr(Size, Src, Shift);
}
StoreResult(GPRClass, Op, Result, OpSize::iInvalid);
@ -1788,7 +1789,7 @@ void OpDispatchBuilder::BMI2Shift(OpcodeArgs) {
void OpDispatchBuilder::BZHI(OpcodeArgs) {
const auto Size = OpSizeFromSrc(Op);
const auto OperandSize = Size * 8;
const auto OperandSize = IR::OpSizeAsBits(Size);
// In 32-bit mode we only look at bottom 32-bit, no 8 or 16-bit BZHI so no
// need to zero-extend sources
@ -1853,13 +1854,12 @@ void OpDispatchBuilder::RORX(OpcodeArgs) {
void OpDispatchBuilder::MULX(OpcodeArgs) {
// RDX is the implied source operand in the instruction
const auto OperandSize = OpSizeFromSrc(Op);
const auto OpSize = IR::SizeToOpSize(OperandSize);
const auto OpSize = OpSizeFromSrc(Op);
// Src1 can be a memory operand, so ensure we constrain to the
// absolute width of the access in that scenario.
const auto GPRSize = CTX->GetGPROpSize();
const auto Src1Size = Op->Src[1].IsGPR() ? GPRSize : OperandSize;
const auto Src1Size = Op->Src[1].IsGPR() ? GPRSize : OpSize;
Ref Src1 = LoadSource_WithOpSize(GPRClass, Op, Op->Src[1], Src1Size, Op->Flags);
Ref Src2 = LoadGPRRegister(X86State::REG_RDX, GPRSize);
@ -1880,7 +1880,7 @@ void OpDispatchBuilder::MULX(OpcodeArgs) {
}
void OpDispatchBuilder::PDEP(OpcodeArgs) {
LOGMAN_THROW_A_FMT(Op->InstSize >= OpSize::i32Bit, "No masking needed");
LOGMAN_THROW_A_FMT(Op->InstSize >= 4, "No masking needed");
auto* Input = LoadSource(GPRClass, Op, Op->Src[0], Op->Flags, {.AllowUpperGarbage = true});
auto* Mask = LoadSource(GPRClass, Op, Op->Src[1], Op->Flags, {.AllowUpperGarbage = true});
auto Result = _PDep(OpSizeFromSrc(Op), Input, Mask);
@ -1889,7 +1889,7 @@ void OpDispatchBuilder::PDEP(OpcodeArgs) {
}
void OpDispatchBuilder::PEXT(OpcodeArgs) {
LOGMAN_THROW_A_FMT(Op->InstSize >= OpSize::i32Bit, "No masking needed");
LOGMAN_THROW_A_FMT(Op->InstSize >= 4, "No masking needed");
auto* Input = LoadSource(GPRClass, Op, Op->Src[0], Op->Flags, {.AllowUpperGarbage = true});
auto* Mask = LoadSource(GPRClass, Op, Op->Src[1], Op->Flags, {.AllowUpperGarbage = true});
auto Result = _PExt(OpSizeFromSrc(Op), Input, Mask);
@ -2093,7 +2093,7 @@ void OpDispatchBuilder::RCROp(OpcodeArgs) {
StoreResult(GPRClass, Op, Res, OpSize::iInvalid);
},
GetSrcSize(Op) == OpSize::i32Bit ? std::make_optional(&OpDispatchBuilder::ZeroShiftResult) : std::nullopt);
OpSizeFromSrc(Op) == OpSize::i32Bit ? std::make_optional(&OpDispatchBuilder::ZeroShiftResult) : std::nullopt);
}
void OpDispatchBuilder::RCRSmallerOp(OpcodeArgs) {
@ -2315,7 +2315,7 @@ void OpDispatchBuilder::RCLOp(OpcodeArgs) {
StoreResult(GPRClass, Op, Res, OpSize::iInvalid);
},
GetSrcSize(Op) == OpSize::i32Bit ? std::make_optional(&OpDispatchBuilder::ZeroShiftResult) : std::nullopt);
OpSizeFromSrc(Op) == OpSize::i32Bit ? std::make_optional(&OpDispatchBuilder::ZeroShiftResult) : std::nullopt);
}
void OpDispatchBuilder::RCLSmallerOp(OpcodeArgs) {
@ -2405,7 +2405,7 @@ void OpDispatchBuilder::BTOp(OpcodeArgs, uint32_t SrcIndex, BTAction Action) {
// Get the bit selection from the src. We need to mask for 8/16-bit, but
// rely on the implicit masking of Lshr for native sizes.
unsigned LshrSize = std::max<uint8_t>(OpSize::i32Bit, Size / 8);
unsigned LshrSize = std::max<uint8_t>(IR::OpSizeToSize(OpSize::i32Bit), Size / 8);
auto BitSelect = (Size == (LshrSize * 8)) ? Src : _And(OpSize::i64Bit, Src, _Constant(Mask));
// OF/SF/ZF/AF/PF undefined.
@ -2458,7 +2458,7 @@ void OpDispatchBuilder::BTOp(OpcodeArgs, uint32_t SrcIndex, BTAction Action) {
// Load the address to the memory location
Ref Dest = MakeSegmentAddress(Op, Op->Dest);
// Get the bit selection from the src
Ref BitSelect = _Bfe(IR::SizeToOpSize(std::max<uint8_t>(4u, GetOpSize(Src))), 3, 0, Src);
Ref BitSelect = _Bfe(std::max(OpSize::i32Bit, GetOpSize(Src)), 3, 0, Src);
// Address is provided as bits we want BYTE offsets
// Extract Signed offset
@ -2523,7 +2523,7 @@ void OpDispatchBuilder::BTOp(OpcodeArgs, uint32_t SrcIndex, BTAction Action) {
}
// Now shift in to the correct bit location
Value = _Lshr(IR::SizeToOpSize(std::max<uint8_t>(4u, GetOpSize(Value))), Value, BitSelect);
Value = _Lshr(std::max(OpSize::i32Bit, GetOpSize(Value)), Value, BitSelect);
// OF/SF/ZF/AF/PF undefined.
SetCFDirect(Value, ConstantShift, true);
@ -2536,21 +2536,22 @@ void OpDispatchBuilder::IMUL1SrcOp(OpcodeArgs) {
Ref Src2 = LoadSource(GPRClass, Op, Op->Src[0], Op->Flags, {.AllowUpperGarbage = true});
const auto Size = OpSizeFromSrc(Op);
const auto SizeBits = IR::OpSizeAsBits(Size);
Ref Dest {};
Ref ResultHigh {};
switch (Size) {
case OpSize::i8Bit:
case OpSize::i16Bit: {
Src1 = _Sbfe(OpSize::i64Bit, Size * 8, 0, Src1);
Src2 = _Sbfe(OpSize::i64Bit, Size * 8, 0, Src2);
Src1 = _Sbfe(OpSize::i64Bit, SizeBits, 0, Src1);
Src2 = _Sbfe(OpSize::i64Bit, SizeBits, 0, Src2);
Dest = _Mul(OpSize::i64Bit, Src1, Src2);
ResultHigh = _Sbfe(OpSize::i64Bit, Size * 8, Size * 8, Dest);
ResultHigh = _Sbfe(OpSize::i64Bit, SizeBits, SizeBits, Dest);
break;
}
case OpSize::i32Bit: {
ResultHigh = _SMull(Src1, Src2);
ResultHigh = _Sbfe(OpSize::i64Bit, Size * 8, Size * 8, ResultHigh);
ResultHigh = _Sbfe(OpSize::i64Bit, SizeBits, SizeBits, ResultHigh);
// Flipped order to save a move
Dest = _Mul(OpSize::i32Bit, Src1, Src2);
break;
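For the 8- and 16-bit IMUL paths above, both operands are sign-extended to 64-bit, multiplied there, and the second Sbfe then pulls out the upper SizeBits of the product as ResultHigh. A 16-bit example of the intended arithmetic (sketch, not FEX code):

#include <cstdint>

constexpr int32_t Product16(int16_t a, int16_t b) { return int32_t{a} * int32_t{b}; }
constexpr int16_t High16(int32_t Product) {
  return static_cast<int16_t>(static_cast<uint32_t>(Product) >> 16); // bits [31:16]
}
static_assert(High16(Product16(0x4000, 0x4000)) == 0x1000); // product 0x1000'0000
static_assert(High16(Product16(300, 300)) == 0x0001);       // product 90000 = 0x0001'5F90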
@ -2573,6 +2574,7 @@ void OpDispatchBuilder::IMUL2SrcOp(OpcodeArgs) {
Ref Src2 = LoadSource(GPRClass, Op, Op->Src[1], Op->Flags, {.AllowUpperGarbage = true});
const auto Size = OpSizeFromSrc(Op);
const auto SizeBits = IR::OpSizeAsBits(Size);
Ref Dest {};
Ref ResultHigh {};
@ -2580,15 +2582,15 @@ void OpDispatchBuilder::IMUL2SrcOp(OpcodeArgs) {
switch (Size) {
case OpSize::i8Bit:
case OpSize::i16Bit: {
Src1 = _Sbfe(OpSize::i64Bit, Size * 8, 0, Src1);
Src2 = _Sbfe(OpSize::i64Bit, Size * 8, 0, Src2);
Src1 = _Sbfe(OpSize::i64Bit, SizeBits, 0, Src1);
Src2 = _Sbfe(OpSize::i64Bit, SizeBits, 0, Src2);
Dest = _Mul(OpSize::i64Bit, Src1, Src2);
ResultHigh = _Sbfe(OpSize::i64Bit, Size * 8, Size * 8, Dest);
ResultHigh = _Sbfe(OpSize::i64Bit, SizeBits, SizeBits, Dest);
break;
}
case OpSize::i32Bit: {
ResultHigh = _SMull(Src1, Src2);
ResultHigh = _Sbfe(OpSize::i64Bit, Size * 8, Size * 8, ResultHigh);
ResultHigh = _Sbfe(OpSize::i64Bit, SizeBits, SizeBits, ResultHigh);
// Flipped order to save a move
Dest = _Mul(OpSize::i32Bit, Src1, Src2);
break;
@ -2608,13 +2610,14 @@ void OpDispatchBuilder::IMUL2SrcOp(OpcodeArgs) {
void OpDispatchBuilder::IMULOp(OpcodeArgs) {
const auto Size = OpSizeFromSrc(Op);
const auto SizeBits = IR::OpSizeAsBits(Size);
Ref Src1 = LoadSource(GPRClass, Op, Op->Dest, Op->Flags, {.AllowUpperGarbage = true});
Ref Src2 = LoadGPRRegister(X86State::REG_RAX);
if (Size != OpSize::i64Bit) {
Src1 = _Sbfe(OpSize::i64Bit, Size * 8, 0, Src1);
Src2 = _Sbfe(OpSize::i64Bit, Size * 8, 0, Src2);
Src1 = _Sbfe(OpSize::i64Bit, SizeBits, 0, Src1);
Src2 = _Sbfe(OpSize::i64Bit, SizeBits, 0, Src2);
}
// 64-bit special cased to save a move
@ -2659,14 +2662,15 @@ void OpDispatchBuilder::IMULOp(OpcodeArgs) {
void OpDispatchBuilder::MULOp(OpcodeArgs) {
const auto Size = OpSizeFromSrc(Op);
const auto SizeBits = IR::OpSizeAsBits(Size);
Ref Src1 = LoadSource(GPRClass, Op, Op->Dest, Op->Flags, {.AllowUpperGarbage = true});
Ref Src2 = LoadGPRRegister(X86State::REG_RAX);
Ref Result;
if (Size != OpSize::i64Bit) {
Src1 = _Bfe(OpSize::i64Bit, Size * 8, 0, Src1);
Src2 = _Bfe(OpSize::i64Bit, Size * 8, 0, Src2);
Src1 = _Bfe(OpSize::i64Bit, SizeBits, 0, Src1);
Src2 = _Bfe(OpSize::i64Bit, SizeBits, 0, Src2);
Result = _UMul(OpSize::i64Bit, Src1, Src2);
}
Ref ResultHigh {};
@ -2709,17 +2713,19 @@ void OpDispatchBuilder::MULOp(OpcodeArgs) {
void OpDispatchBuilder::NOTOp(OpcodeArgs) {
const auto Size = OpSizeFromSrc(Op);
const auto SizeBits = IR::OpSizeAsBits(Size);
Ref MaskConst {};
if (Size == OpSize::i64Bit) {
MaskConst = _Constant(~0ULL);
} else {
MaskConst = _Constant((1ULL << (Size * 8)) - 1);
MaskConst = _Constant((1ULL << SizeBits) - 1);
}
if (DestIsLockedMem(Op)) {
HandledLock = true;
Ref DestMem = MakeSegmentAddress(Op, Op->Dest);
_AtomicXor(IR::SizeToOpSize(Size), MaskConst, DestMem);
_AtomicXor(Size, MaskConst, DestMem);
} else if (!Op->Dest.IsGPR()) {
// GPR version plays fast and loose with sizes, be safe for memory tho.
Ref Src = LoadSource(GPRClass, Op, Op->Dest, Op->Flags);
@ -2742,13 +2748,13 @@ void OpDispatchBuilder::NOTOp(OpcodeArgs) {
// For 8/16-bit, use 64-bit invert so we invert in place, while getting
// insert behaviour. For 32-bit, use 32-bit invert to zero the upper bits.
unsigned EffectiveSize = Size == OpSize::i32Bit ? OpSize::i32Bit : GPRSize;
const auto EffectiveSize = Size == OpSize::i32Bit ? OpSize::i32Bit : GPRSize;
// If we're inverting the whole thing, use Not instead of Xor to save a constant.
if (Size >= OpSize::i32Bit) {
Src = _Not(IR::SizeToOpSize(EffectiveSize), Src);
Src = _Not(EffectiveSize, Src);
} else {
Src = _Xor(IR::SizeToOpSize(EffectiveSize), Src, MaskConst);
Src = _Xor(EffectiveSize, Src, MaskConst);
}
// Always store 64-bit, the Not/Xor correctly handle the upper bits and this
@ -2816,7 +2822,7 @@ void OpDispatchBuilder::DAAOp(OpcodeArgs) {
// SF, ZF, PF set according to result. CF set per above. OF undefined.
StoreGPRRegister(X86State::REG_RAX, AL, OpSize::i8Bit);
SetNZ_ZeroCV(1, AL);
SetNZ_ZeroCV(OpSize::i8Bit, AL);
SetCFInverted(CFInv);
CalculatePF(AL);
SetAFAndFixup(AF);
@ -2842,7 +2848,7 @@ void OpDispatchBuilder::DASOp(OpcodeArgs) {
// SF, ZF, PF set according to result. CF set per above. OF undefined.
StoreGPRRegister(X86State::REG_RAX, AL, OpSize::i8Bit);
SetNZ_ZeroCV(1, AL);
SetNZ_ZeroCV(OpSize::i8Bit, AL);
SetCFDirect(NewCF);
CalculatePF(AL);
SetAFAndFixup(AF);
@ -2898,7 +2904,7 @@ void OpDispatchBuilder::AAMOp(OpcodeArgs) {
auto Res = _AddShift(OpSize::i64Bit, URemOp, UDivOp, ShiftType::LSL, 8);
StoreGPRRegister(X86State::REG_RAX, Res, OpSize::i16Bit);
SetNZ_ZeroCV(1, Res);
SetNZ_ZeroCV(OpSize::i8Bit, Res);
CalculatePF(Res);
InvalidateAF();
}
@ -2913,7 +2919,7 @@ void OpDispatchBuilder::AADOp(OpcodeArgs) {
auto Result = _And(OpSize::i64Bit, NewAL, _Constant(0xFF));
StoreGPRRegister(X86State::REG_RAX, Result, OpSize::i16Bit);
SetNZ_ZeroCV(1, Result);
SetNZ_ZeroCV(OpSize::i8Bit, Result);
CalculatePF(Result);
InvalidateAF();
}
@ -2978,14 +2984,14 @@ void OpDispatchBuilder::EnterOp(OpcodeArgs) {
if (Level > 0) {
for (uint8_t i = 1; i < Level; ++i) {
auto Offset = _Constant(i * GPRSize);
auto MemLoc = _Sub(IR::SizeToOpSize(GPRSize), OldBP, Offset);
auto Offset = _Constant(i * IR::OpSizeToSize(GPRSize));
auto MemLoc = _Sub(GPRSize, OldBP, Offset);
auto Mem = _LoadMem(GPRClass, GPRSize, MemLoc, GPRSize);
NewSP = PushValue(GPRSize, Mem);
}
NewSP = PushValue(GPRSize, temp_RBP);
}
NewSP = _Sub(IR::SizeToOpSize(GPRSize), NewSP, _Constant(AllocSpace));
NewSP = _Sub(GPRSize, NewSP, _Constant(AllocSpace));
StoreGPRRegister(X86State::REG_RSP, NewSP);
StoreGPRRegister(X86State::REG_RBP, temp_RBP);
}
@ -3186,7 +3192,7 @@ void OpDispatchBuilder::STOSOp(OpcodeArgs) {
// Offset the pointer
Ref TailDest = LoadGPRRegister(X86State::REG_RDI);
StoreGPRRegister(X86State::REG_RDI, OffsetByDir(TailDest, Size));
StoreGPRRegister(X86State::REG_RDI, OffsetByDir(TailDest, IR::OpSizeToSize(Size)));
} else {
// FEX doesn't support partial faulting REP instructions.
// Converting this to a `MemSet` IR op optimizes this quite significantly in our codegen.
@ -3255,7 +3261,7 @@ void OpDispatchBuilder::MOVSOp(OpcodeArgs) {
// Store to memory where RDI points
_StoreMemAutoTSO(GPRClass, Size, RDI, Src, Size);
auto PtrDir = LoadDir(Size);
auto PtrDir = LoadDir(IR::OpSizeToSize(Size));
RSI = _Add(OpSize::i64Bit, RSI, PtrDir);
RDI = _Add(OpSize::i64Bit, RDI, PtrDir);
@ -3285,7 +3291,7 @@ void OpDispatchBuilder::CMPSOp(OpcodeArgs) {
CalculateFlags_SUB(OpSizeFromSrc(Op), Src2, Src1);
auto PtrDir = LoadDir(Size);
auto PtrDir = LoadDir(IR::OpSizeToSize(Size));
// Offset the pointer
Dest_RDI = _Add(OpSize::i64Bit, Dest_RDI, PtrDir);
@ -3342,11 +3348,11 @@ void OpDispatchBuilder::CMPSOp(OpcodeArgs) {
StoreGPRRegister(X86State::REG_RCX, TailCounter);
// Offset the pointer
Dest_RDI = _Add(OpSize::i64Bit, Dest_RDI, _Constant(PtrDir * Size));
Dest_RDI = _Add(OpSize::i64Bit, Dest_RDI, _Constant(PtrDir * IR::OpSizeToSize(Size)));
StoreGPRRegister(X86State::REG_RDI, Dest_RDI);
// Offset second pointer
Dest_RSI = _Add(OpSize::i64Bit, Dest_RSI, _Constant(PtrDir * Size));
Dest_RSI = _Add(OpSize::i64Bit, Dest_RSI, _Constant(PtrDir * IR::OpSizeToSize(Size)));
StoreGPRRegister(X86State::REG_RSI, Dest_RSI);
// If TailCounter != 0, compare sources.
@ -3403,7 +3409,7 @@ void OpDispatchBuilder::LODSOp(OpcodeArgs) {
// Offset the pointer
Ref TailDest_RSI = LoadGPRRegister(X86State::REG_RSI);
StoreGPRRegister(X86State::REG_RSI, OffsetByDir(TailDest_RSI, Size));
StoreGPRRegister(X86State::REG_RSI, OffsetByDir(TailDest_RSI, IR::OpSizeToSize(Size)));
} else {
// Calculate flags early because this is the end of the block
CalculateDeferredFlags();
@ -3452,7 +3458,7 @@ void OpDispatchBuilder::LODSOp(OpcodeArgs) {
StoreGPRRegister(X86State::REG_RCX, TailCounter);
// Offset the pointer
TailDest_RSI = _Add(OpSize::i64Bit, TailDest_RSI, _Constant(PtrDir * Size));
TailDest_RSI = _Add(OpSize::i64Bit, TailDest_RSI, _Constant(PtrDir * IR::OpSizeToSize(Size)));
StoreGPRRegister(X86State::REG_RSI, TailDest_RSI);
// Jump back to the start, we have more work to do
@ -3487,7 +3493,7 @@ void OpDispatchBuilder::SCASOp(OpcodeArgs) {
// Offset the pointer
Ref TailDest_RDI = LoadGPRRegister(X86State::REG_RDI);
StoreGPRRegister(X86State::REG_RDI, OffsetByDir(TailDest_RDI, Size));
StoreGPRRegister(X86State::REG_RDI, OffsetByDir(TailDest_RDI, IR::OpSizeToSize(Size)));
} else {
// Calculate flags early because this is the end of the block
CalculateDeferredFlags();
@ -3536,7 +3542,7 @@ void OpDispatchBuilder::SCASOp(OpcodeArgs) {
StoreGPRRegister(X86State::REG_RCX, TailCounter);
// Offset the pointer
TailDest_RDI = _Add(OpSize::i64Bit, TailDest_RDI, _Constant(Dir * Size));
TailDest_RDI = _Add(OpSize::i64Bit, TailDest_RDI, _Constant(Dir * IR::OpSizeToSize(Size)));
StoreGPRRegister(X86State::REG_RDI, TailDest_RDI);
CalculateDeferredFlags();
@ -3598,7 +3604,7 @@ void OpDispatchBuilder::NEGOp(OpcodeArgs) {
if (DestIsLockedMem(Op)) {
Ref DestMem = MakeSegmentAddress(Op, Op->Dest);
Ref Dest = _AtomicFetchNeg(IR::SizeToOpSize(Size), DestMem);
Ref Dest = _AtomicFetchNeg(Size, DestMem);
CalculateFlags_SUB(Size, ZeroConst, Dest);
} else {
Ref Dest = LoadSource(GPRClass, Op, Op->Dest, Op->Flags, {.AllowUpperGarbage = true});
@ -3622,7 +3628,7 @@ void OpDispatchBuilder::DIVOp(OpcodeArgs) {
auto URemOp = _URem(OpSize::i16Bit, Src1, Divisor);
// AX[15:0] = concat<URem[7:0]:UDiv[7:0]>
auto ResultAX = _Bfi(IR::SizeToOpSize(GPRSize), 8, 8, UDivOp, URemOp);
auto ResultAX = _Bfi(GPRSize, 8, 8, UDivOp, URemOp);
StoreGPRRegister(X86State::REG_RAX, ResultAX, OpSize::i16Bit);
} else if (Size == OpSize::i16Bit) {
Ref Src1 = LoadGPRRegister(X86State::REG_RAX, Size);
@ -3636,8 +3642,8 @@ void OpDispatchBuilder::DIVOp(OpcodeArgs) {
Ref Src1 = LoadGPRRegister(X86State::REG_RAX, Size);
Ref Src2 = LoadGPRRegister(X86State::REG_RDX, Size);
Ref UDivOp = _Bfe(OpSize::i32Bit, Size * 8, 0, _LUDiv(OpSize::i32Bit, Src1, Src2, Divisor));
Ref URemOp = _Bfe(OpSize::i32Bit, Size * 8, 0, _LURem(OpSize::i32Bit, Src1, Src2, Divisor));
Ref UDivOp = _Bfe(OpSize::i32Bit, IR::OpSizeAsBits(Size), 0, _LUDiv(OpSize::i32Bit, Src1, Src2, Divisor));
Ref URemOp = _Bfe(OpSize::i32Bit, IR::OpSizeAsBits(Size), 0, _LURem(OpSize::i32Bit, Src1, Src2, Divisor));
StoreGPRRegister(X86State::REG_RAX, UDivOp);
StoreGPRRegister(X86State::REG_RDX, URemOp);
@ -3674,7 +3680,7 @@ void OpDispatchBuilder::IDIVOp(OpcodeArgs) {
auto URemOp = _Rem(OpSize::i64Bit, Src1, Divisor);
// AX[15:0] = concat<URem[7:0]:UDiv[7:0]>
auto ResultAX = _Bfi(IR::SizeToOpSize(GPRSize), 8, 8, UDivOp, URemOp);
auto ResultAX = _Bfi(GPRSize, 8, 8, UDivOp, URemOp);
StoreGPRRegister(X86State::REG_RAX, ResultAX, OpSize::i16Bit);
} else if (Size == OpSize::i16Bit) {
Ref Src1 = LoadGPRRegister(X86State::REG_RAX, Size);
@ -3688,8 +3694,8 @@ void OpDispatchBuilder::IDIVOp(OpcodeArgs) {
Ref Src1 = LoadGPRRegister(X86State::REG_RAX, Size);
Ref Src2 = LoadGPRRegister(X86State::REG_RDX, Size);
Ref UDivOp = _Bfe(OpSize::i32Bit, Size * 8, 0, _LDiv(OpSize::i32Bit, Src1, Src2, Divisor));
Ref URemOp = _Bfe(OpSize::i32Bit, Size * 8, 0, _LRem(OpSize::i32Bit, Src1, Src2, Divisor));
Ref UDivOp = _Bfe(OpSize::i32Bit, IR::OpSizeAsBits(Size), 0, _LDiv(OpSize::i32Bit, Src1, Src2, Divisor));
Ref URemOp = _Bfe(OpSize::i32Bit, IR::OpSizeAsBits(Size), 0, _LRem(OpSize::i32Bit, Src1, Src2, Divisor));
StoreGPRRegister(X86State::REG_RAX, UDivOp);
StoreGPRRegister(X86State::REG_RDX, URemOp);
@ -3728,7 +3734,7 @@ void OpDispatchBuilder::BSFOp(OpcodeArgs) {
// Although Intel does not guarantee that semantic, AMD does and Intel
// hardware satisfies it. We provide the stronger AMD behaviour as
// applications might rely on that in the wild.
auto SelectOp = NZCVSelect(IR::SizeToOpSize(GPRSize), {COND_EQ}, Dest, Result);
auto SelectOp = NZCVSelect(GPRSize, {COND_EQ}, Dest, Result);
StoreResult_WithOpSize(GPRClass, Op, Op->Dest, SelectOp, DstSize, OpSize::iInvalid);
}
@ -3746,7 +3752,7 @@ void OpDispatchBuilder::BSROp(OpcodeArgs) {
SetZ_InvalidateNCV(OpSizeFromSrc(Op), Src);
// If Src was zero then the destination doesn't get modified
auto SelectOp = NZCVSelect(IR::SizeToOpSize(GPRSize), {COND_EQ}, Dest, Result);
auto SelectOp = NZCVSelect(GPRSize, {COND_EQ}, Dest, Result);
StoreResult_WithOpSize(GPRClass, Op, Op->Dest, SelectOp, DstSize, OpSize::iInvalid);
}
@ -3784,7 +3790,7 @@ void OpDispatchBuilder::CMPXCHGOp(OpcodeArgs) {
if (GPRSize == OpSize::i64Bit && Size == OpSize::i32Bit) {
Src1 = LoadSource_WithOpSize(GPRClass, Op, Op->Dest, GPRSize, Op->Flags, {.AllowUpperGarbage = true});
Src1Lower = _Bfe(IR::SizeToOpSize(GPRSize), Size * 8, 0, Src1);
Src1Lower = _Bfe(GPRSize, IR::OpSizeAsBits(Size), 0, Src1);
} else {
Src1 = LoadSource_WithOpSize(GPRClass, Op, Op->Dest, Size, Op->Flags, {.AllowUpperGarbage = true});
Src1Lower = Src1;
@ -3797,7 +3803,7 @@ void OpDispatchBuilder::CMPXCHGOp(OpcodeArgs) {
if (!Trivial) {
if (GPRSize == OpSize::i64Bit && Size == OpSize::i32Bit) {
// This allows us to only hit the ZEXT case on failure
Ref RAXResult = NZCVSelect(IR::i64Bit, {COND_EQ}, Src3, Src1Lower);
Ref RAXResult = NZCVSelect(OpSize::i64Bit, {COND_EQ}, Src3, Src1Lower);
// When the size is 4 we need to make sure not to zext the GPR when the comparison fails
StoreGPRRegister(X86State::REG_RAX, RAXResult);
@ -3809,7 +3815,7 @@ void OpDispatchBuilder::CMPXCHGOp(OpcodeArgs) {
// Op1 = RAX == Op1 ? Op2 : Op1
// If they match then set the rm operand to the input
// else don't set the rm operand
Ref DestResult = Trivial ? Src2 : NZCVSelect(IR::i64Bit, CondClassType {COND_EQ}, Src2, Src1);
Ref DestResult = Trivial ? Src2 : NZCVSelect(OpSize::i64Bit, CondClassType {COND_EQ}, Src2, Src1);
// Store in to GPR Dest
if (GPRSize == OpSize::i64Bit && Size == OpSize::i32Bit) {
@ -3837,7 +3843,7 @@ void OpDispatchBuilder::CMPXCHGOp(OpcodeArgs) {
// if (DataSrc == Src3) { *Src1 == Src2; } Src2 = DataSrc
// This will write to memory! Careful!
// Third operand must be a calculated guest memory address
Ref CASResult = _CAS(IR::SizeToOpSize(Size), Src3Lower, Src2, Src1);
Ref CASResult = _CAS(Size, Src3Lower, Src2, Src1);
Ref RAXResult = CASResult;
CalculateFlags_SUB(OpSizeFromSrc(Op), Src3Lower, CASResult);
@ -3845,7 +3851,7 @@ void OpDispatchBuilder::CMPXCHGOp(OpcodeArgs) {
if (GPRSize == OpSize::i64Bit && Size == OpSize::i32Bit) {
// This allows us to only hit the ZEXT case on failure
RAXResult = _NZCVSelect(IR::i64Bit, {COND_EQ}, Src3, CASResult);
RAXResult = _NZCVSelect(OpSize::i64Bit, {COND_EQ}, Src3, CASResult);
Size = OpSize::i64Bit;
}
@ -3885,10 +3891,10 @@ void OpDispatchBuilder::CMPXCHGPairOp(OpcodeArgs) {
Ref Result_Lower = _AllocateGPR(true);
Ref Result_Upper = _AllocateGPRAfter(Result_Lower);
_CASPair(IR::SizeToOpSize(Size), Expected_Lower, Expected_Upper, Desired_Lower, Desired_Upper, Src1, Result_Lower, Result_Upper);
_CASPair(Size, Expected_Lower, Expected_Upper, Desired_Lower, Desired_Upper, Src1, Result_Lower, Result_Upper);
HandleNZCV_RMW();
_CmpPairZ(IR::SizeToOpSize(Size), Result_Lower, Result_Upper, Expected_Lower, Expected_Upper);
_CmpPairZ(Size, Result_Lower, Result_Upper, Expected_Lower, Expected_Upper);
CalculateDeferredFlags();
auto UpdateIfNotZF = [this](auto Reg, auto Value) {
@ -4020,7 +4026,7 @@ Ref OpDispatchBuilder::GetSegment(uint32_t Flags, uint32_t DefaultPrefix, bool O
Ref OpDispatchBuilder::AppendSegmentOffset(Ref Value, uint32_t Flags, uint32_t DefaultPrefix, bool Override) {
auto Segment = GetSegment(Flags, DefaultPrefix, Override);
if (Segment) {
Value = _Add(IR::SizeToOpSize(std::max<uint8_t>(OpSize::i32Bit, std::max(GetOpSize(Value), GetOpSize(Segment)))), Value, Segment);
Value = _Add(std::max(OpSize::i32Bit, std::max(GetOpSize(Value), GetOpSize(Segment))), Value, Segment);
}
return Value;
@ -4144,7 +4150,7 @@ Ref OpDispatchBuilder::LoadEffectiveAddress(AddressMode A, bool AddSegmentBase,
if (A.Offset) {
Ref Offset = _Constant(A.Offset);
Tmp = Tmp ? _Add(IR::SizeToOpSize(GPRSize), Tmp, Offset) : Offset;
Tmp = Tmp ? _Add(GPRSize, Tmp, Offset) : Offset;
}
if (A.Index) {
@ -4167,7 +4173,7 @@ Ref OpDispatchBuilder::LoadEffectiveAddress(AddressMode A, bool AddSegmentBase,
//
// If the AddrSize is not the GPRSize then we need to clear the upper bits.
if ((A.AddrSize < GPRSize) && !AllowUpperGarbage && Tmp) {
Tmp = _Bfe(GPRSize, A.AddrSize * 8, 0, Tmp);
Tmp = _Bfe(GPRSize, IR::OpSizeAsBits(A.AddrSize), 0, Tmp);
}
if (A.Segment && AddSegmentBase) {
@ -4177,7 +4183,7 @@ Ref OpDispatchBuilder::LoadEffectiveAddress(AddressMode A, bool AddSegmentBase,
return Tmp ?: _Constant(0);
}
AddressMode OpDispatchBuilder::SelectAddressMode(AddressMode A, bool AtomicTSO, bool Vector, unsigned AccessSize) {
AddressMode OpDispatchBuilder::SelectAddressMode(AddressMode A, bool AtomicTSO, bool Vector, IR::OpSize AccessSize) {
const auto GPRSize = CTX->GetGPROpSize();
// In the future this also needs to account for LRCPC3.
@ -4207,9 +4213,10 @@ AddressMode OpDispatchBuilder::SelectAddressMode(AddressMode A, bool AtomicTSO,
}
// Try a (possibly scaled) register index.
if (A.AddrSize == OpSize::i64Bit && A.Base && (A.Index || A.Segment) && !A.Offset && (A.IndexScale == 1 || A.IndexScale == AccessSize)) {
if (A.AddrSize == OpSize::i64Bit && A.Base && (A.Index || A.Segment) && !A.Offset &&
(A.IndexScale == 1 || A.IndexScale == IR::OpSizeToSize(AccessSize))) {
if (A.Index && A.Segment) {
A.Base = _Add(IR::SizeToOpSize(GPRSize), A.Base, A.Segment);
A.Base = _Add(GPRSize, A.Base, A.Segment);
} else if (A.Segment) {
A.Index = A.Segment;
A.IndexScale = 1;
@ -4231,7 +4238,7 @@ AddressMode OpDispatchBuilder::DecodeAddress(const X86Tables::DecodedOp& Op, con
AddressMode A {};
A.Segment = GetSegment(Op->Flags);
A.AddrSize = (Op->Flags & X86Tables::DecodeFlags::FLAG_ADDRESS_SIZE) != 0 ? (IR::DivideOpSize(GPRSize, 2)) : GPRSize;
A.AddrSize = (Op->Flags & X86Tables::DecodeFlags::FLAG_ADDRESS_SIZE) != 0 ? (GPRSize >> 1) : GPRSize;
A.NonTSO = AccessType == MemoryAccessType::NONTSO || AccessType == MemoryAccessType::STREAM;
if (Operand.IsLiteral()) {
@ -4312,7 +4319,7 @@ Ref OpDispatchBuilder::LoadSource_WithOpSize(RegisterClassType Class, const X86T
// Now extract the subregister if it was a partial load /smaller/ than SSE size
// TODO: Instead of doing the VMov implicitly on load, hunt down all use cases that require partial loads and do it after load.
// We don't have information here to know if the operation needs zero upper bits or can contain data.
if (!AllowUpperGarbage && OpSize < Core::CPUState::XMM_SSE_REG_SIZE) {
if (!AllowUpperGarbage && OpSize < OpSize::i128Bit) {
A.Base = _VMov(OpSize, A.Base);
}
} else {
@ -4345,7 +4352,7 @@ Ref OpDispatchBuilder::LoadGPRRegister(uint32_t GPR, IR::OpSize Size, uint8_t Of
if (AllowUpperGarbage) {
Reg = _Lshr(OpSize, Reg, _Constant(Offset));
} else {
Reg = _Bfe(OpSize, Size * 8, Offset, Reg);
Reg = _Bfe(OpSize, IR::OpSizeAsBits(Size), Offset, Reg);
}
}
return Reg;
@ -4360,7 +4367,7 @@ void OpDispatchBuilder::StoreGPRRegister(uint32_t GPR, const Ref Src, IR::OpSize
Ref Reg = Src;
if (Size != GPRSize || Offset != 0) {
// Need to do an insert if not automatic size or zero offset.
Reg = _Bfi(GPRSize, Size * 8, Offset, LoadGPRRegister(GPR), Src);
Reg = _Bfi(GPRSize, IR::OpSizeAsBits(Size), Offset, LoadGPRRegister(GPR), Src);
}
StoreRegister(GPR, false, Reg);
@ -4408,7 +4415,7 @@ void OpDispatchBuilder::StoreResult_WithOpSize(FEXCore::IR::RegisterClassType Cl
LOGMAN_THROW_A_FMT(Class != IR::GPRClass, "Partial writes from GPR not allowed. Instruction: {}", Op->TableInfo->Name);
// XMM-size is handled in implementations.
if (VectorSize != Core::CPUState::XMM_AVX_REG_SIZE || OpSize != Core::CPUState::XMM_SSE_REG_SIZE) {
if (VectorSize != OpSize::i256Bit || OpSize != OpSize::i128Bit) {
auto SrcVector = LoadXMMRegister(gprIndex);
Result = _VInsElement(VectorSize, OpSize, 0, 0, SrcVector, Src);
}
@ -4443,7 +4450,7 @@ void OpDispatchBuilder::StoreResult_WithOpSize(FEXCore::IR::RegisterClassType Cl
AddressMode A = DecodeAddress(Op, Operand, AccessType, false /* IsLoad */);
if (OpSize == 10) {
if (OpSize == OpSize::f80Bit) {
Ref MemStoreDst = LoadEffectiveAddress(A, true);
// For X87 extended doubles, split before storing
@ -4547,7 +4554,7 @@ void OpDispatchBuilder::ALUOp(OpcodeArgs, FEXCore::IR::IROps ALUIROp, FEXCore::I
(ALUIROp == IR::IROps::OP_XOR || ALUIROp == IR::IROps::OP_OR || ALUIROp == IR::IROps::OP_ANDWITHFLAGS)) {
RoundedSize = ResultSize = CTX->GetGPROpSize();
LOGMAN_THROW_A_FMT(Const < (1ull << (Size * 8)), "does not clobber");
LOGMAN_THROW_A_FMT(Const < (1ull << IR::OpSizeAsBits(Size)), "does not clobber");
// For AND, we can play the same trick but we instead need the upper bits of
// the constant to be all-1s instead of all-0s to preserve. We also can't
@ -4559,7 +4566,7 @@ void OpDispatchBuilder::ALUOp(OpcodeArgs, FEXCore::IR::IROps ALUIROp, FEXCore::I
// adjusted constant here will inline into the arm64 and instruction, so if
// flags are not needed, we save an instruction overall.
if (ALUIROp == IR::IROps::OP_ANDWITHFLAGS) {
Src = _Constant(Const | ~((1ull << (Size * 8)) - 1));
Src = _Constant(Const | ~((1ull << IR::OpSizeAsBits(Size)) - 1));
ALUIROp = IR::IROps::OP_AND;
}
}
@ -4570,13 +4577,13 @@ void OpDispatchBuilder::ALUOp(OpcodeArgs, FEXCore::IR::IROps ALUIROp, FEXCore::I
if (DestIsLockedMem(Op)) {
HandledLock = true;
Ref DestMem = MakeSegmentAddress(Op, Op->Dest);
DeriveOp(FetchOp, AtomicFetchOp, _AtomicFetchAdd(IR::SizeToOpSize(Size), Src, DestMem));
DeriveOp(FetchOp, AtomicFetchOp, _AtomicFetchAdd(Size, Src, DestMem));
Dest = FetchOp;
} else {
Dest = LoadSource(GPRClass, Op, Op->Dest, Op->Flags, {.AllowUpperGarbage = true});
}
const auto OpSize = IR::SizeToOpSize(RoundedSize);
const auto OpSize = RoundedSize;
DeriveOp(ALUOp, ALUIROp, _AndWithFlags(OpSize, Dest, Src));
Result = ALUOp;
@ -4756,7 +4763,7 @@ void OpDispatchBuilder::MOVBEOp(OpcodeArgs) {
// Rev of 16-bit value as 32-bit replaces the result in the upper 16-bits of the result.
// bfxil the 16-bit result in to the GPR.
Ref Dest = LoadSource_WithOpSize(GPRClass, Op, Op->Dest, GPRSize, Op->Flags);
auto Result = _Bfxil(IR::SizeToOpSize(GPRSize), 16, 16, Dest, Src);
auto Result = _Bfxil(GPRSize, 16, 16, Dest, Src);
StoreResult_WithOpSize(GPRClass, Op, Op->Dest, Result, GPRSize, OpSize::iInvalid);
} else {
// 32-bit does regular zext
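// Illustrative sketch, not part of this patch: roughly what the OpSize enum
// class and the conversion helpers leaned on throughout the hunks above
// presumably look like. The real definitions live in FEXCore's IR headers;
// the names mirror the call sites (OpSizeToSize, SizeToOpSize, OpSizeAsBits,
// NumElements, operator<</>>), but the values and return types here are
// inferred from usage, not copied from the tree.
#include <cstdint>

namespace IR {
enum class OpSize : uint8_t {
  i8Bit = 1,
  i16Bit = 2,
  i32Bit = 4,
  i64Bit = 8,
  f80Bit = 10,
  i128Bit = 16,
  i256Bit = 32,
  iInvalid = 0xFF,
};

// enum -> byte count, replacing the old implicit integer conversion.
constexpr uint16_t OpSizeToSize(OpSize Size) {
  return static_cast<uint16_t>(Size);
}

// byte count -> enum, for the few sites that still compute sizes as integers.
constexpr OpSize SizeToOpSize(uint16_t Size) {
  return static_cast<OpSize>(Size);
}

// enum -> bit count, replacing the old `Size * 8` idiom.
constexpr uint16_t OpSizeAsBits(OpSize Size) {
  return OpSizeToSize(Size) * 8u;
}

// Number of ElementSize lanes that fit in a register of RegSize.
constexpr uint16_t NumElements(OpSize RegSize, OpSize ElementSize) {
  return OpSizeToSize(RegSize) / OpSizeToSize(ElementSize);
}

// Halving/doubling an element size, used by the narrowing/widening converts.
constexpr OpSize operator>>(OpSize Size, int Shift) {
  return SizeToOpSize(OpSizeToSize(Size) >> Shift);
}
constexpr OpSize operator<<(OpSize Size, int Shift) {
  return SizeToOpSize(OpSizeToSize(Size) << Shift);
}
} // namespace IR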


@ -938,12 +938,12 @@ public:
void AVX128_VectorALU(OpcodeArgs, IROps IROp, IR::OpSize ElementSize);
void AVX128_VectorUnary(OpcodeArgs, IROps IROp, IR::OpSize ElementSize);
void AVX128_VectorUnaryImpl(OpcodeArgs, IR::OpSize SrcSize, IR::OpSize ElementSize, std::function<Ref(IR::OpSize ElementSize, Ref Src)> Helper);
void AVX128_VectorBinaryImpl(OpcodeArgs, size_t SrcSize, IR::OpSize ElementSize,
void AVX128_VectorBinaryImpl(OpcodeArgs, IR::OpSize SrcSize, IR::OpSize ElementSize,
std::function<Ref(IR::OpSize ElementSize, Ref Src1, Ref Src2)> Helper);
void AVX128_VectorShiftWideImpl(OpcodeArgs, IR::OpSize ElementSize, IROps IROp);
void AVX128_VectorShiftImmImpl(OpcodeArgs, IR::OpSize ElementSize, IROps IROp);
void AVX128_VectorTrinaryImpl(OpcodeArgs, size_t SrcSize, size_t ElementSize, Ref Src3,
std::function<Ref(size_t ElementSize, Ref Src1, Ref Src2, Ref Src3)> Helper);
void AVX128_VectorTrinaryImpl(OpcodeArgs, IR::OpSize SrcSize, IR::OpSize ElementSize, Ref Src3,
std::function<Ref(IR::OpSize ElementSize, Ref Src1, Ref Src2, Ref Src3)> Helper);
enum class ShiftDirection { RIGHT, LEFT };
void AVX128_ShiftDoubleImm(OpcodeArgs, ShiftDirection Dir);
@ -993,7 +993,7 @@ public:
template<IR::OpSize ElementSize>
void AVX128_PExtr(OpcodeArgs);
void AVX128_ExtendVectorElements(OpcodeArgs, IR::OpSize ElementSize, IR::OpSize DstElementSize, bool Signed);
template<size_t ElementSize>
template<IR::OpSize ElementSize>
void AVX128_MOVMSK(OpcodeArgs);
void AVX128_MOVMSKB(OpcodeArgs);
void AVX128_PINSRImpl(OpcodeArgs, IR::OpSize ElementSize, const X86Tables::DecodedOperand& Src1Op,
@ -1065,7 +1065,7 @@ public:
template<IR::OpSize ElementSize>
void AVX128_VSHUF(OpcodeArgs);
template<size_t ElementSize>
template<IR::OpSize ElementSize>
void AVX128_VPERMILImm(OpcodeArgs);
template<IROps IROp, IR::OpSize ElementSize>
@ -1137,7 +1137,7 @@ public:
void StoreResult_WithAVXInsert(VectorOpType Type, FEXCore::IR::RegisterClassType Class, FEXCore::X86Tables::DecodedOp Op, Ref Value,
IR::OpSize Align, MemoryAccessType AccessType = MemoryAccessType::DEFAULT) {
if (Op->Dest.IsGPR() && Op->Dest.Data.GPR.GPR >= X86State::REG_XMM_0 && Op->Dest.Data.GPR.GPR <= X86State::REG_XMM_15 &&
GetGuestVectorLength() == Core::CPUState::XMM_AVX_REG_SIZE && Type == VectorOpType::SSE) {
GetGuestVectorLength() == OpSize::i256Bit && Type == VectorOpType::SSE) {
const auto gpr = Op->Dest.Data.GPR.GPR;
const auto gprIndex = gpr - X86State::REG_XMM_0;
auto DestVector = LoadXMMRegister(gprIndex);
@ -1150,7 +1150,7 @@ public:
}
void StoreXMMRegister_WithAVXInsert(VectorOpType Type, uint32_t XMM, Ref Value) {
if (GetGuestVectorLength() == Core::CPUState::XMM_AVX_REG_SIZE && Type == VectorOpType::SSE) {
if (GetGuestVectorLength() == OpSize::i256Bit && Type == VectorOpType::SSE) {
///< SSE vector stores need to insert in the low 128-bit lane of the 256-bit register.
auto DestVector = LoadXMMRegister(XMM);
Value = _VInsElement(GetGuestVectorLength(), OpSize::i128Bit, 0, 0, DestVector, Value);
@ -1233,12 +1233,14 @@ public:
// Use stp where possible to store multiple values at a time. This accelerates AVX.
// TODO: this is all really confusing because of backwards iteration,
// can we peel back that hack?
if ((Bits & NextBit) && !Partial && Size >= 4 && CacheIndexToContextOffset(Index - 1) == Offset - Size && (Offset - Size) / Size < 64) {
const auto SizeInt = IR::OpSizeToSize(Size);
if ((Bits & NextBit) && !Partial && Size >= OpSize::i32Bit && CacheIndexToContextOffset(Index - 1) == Offset - SizeInt &&
(Offset - SizeInt) / SizeInt < 64) {
LOGMAN_THROW_A_FMT(CacheIndexClass(Index - 1) == Class, "construction");
LOGMAN_THROW_A_FMT((Offset % Size) == 0, "construction");
LOGMAN_THROW_A_FMT((Offset % SizeInt) == 0, "construction");
Ref ValueNext = RegCache.Value[Index - 1];
_StoreContextPair(Size, Class, ValueNext, Value, Offset - Size);
_StoreContextPair(Size, Class, ValueNext, Value, Offset - SizeInt);
Bits &= ~NextBit;
} else {
_StoreContext(Size, Class, Value, Offset);
@ -1380,7 +1382,7 @@ private:
Ref InsertPSOpImpl(OpcodeArgs, const X86Tables::DecodedOperand& Src1, const X86Tables::DecodedOperand& Src2,
const X86Tables::DecodedOperand& Imm);
Ref MPSADBWOpImpl(size_t SrcSize, Ref Src1, Ref Src2, uint8_t Select);
Ref MPSADBWOpImpl(IR::OpSize SrcSize, Ref Src1, Ref Src2, uint8_t Select);
Ref PALIGNROpImpl(OpcodeArgs, const X86Tables::DecodedOperand& Src1, const X86Tables::DecodedOperand& Src2,
const X86Tables::DecodedOperand& Imm, bool IsAVX);
@ -1503,7 +1505,7 @@ private:
Ref GetRelocatedPC(const FEXCore::X86Tables::DecodedOp& Op, int64_t Offset = 0);
Ref LoadEffectiveAddress(AddressMode A, bool AddSegmentBase, bool AllowUpperGarbage = false);
AddressMode SelectAddressMode(AddressMode A, bool AtomicTSO, bool Vector, unsigned AccessSize);
AddressMode SelectAddressMode(AddressMode A, bool AtomicTSO, bool Vector, IR::OpSize AccessSize);
bool IsOperandMem(const X86Tables::DecodedOperand& Operand, bool Load) {
// Literals are immediates as sources but memory addresses as destinations.
@ -1627,24 +1629,24 @@ private:
NZCVDirty = true;
}
void SetNZ_ZeroCV(unsigned SrcSize, Ref Res, bool SetPF = false) {
void SetNZ_ZeroCV(IR::OpSize SrcSize, Ref Res, bool SetPF = false) {
HandleNZ00Write();
// x - 0 = x. NZ set according to Res. C always set. V always unset. This
// matches what we want since we want carry inverted.
//
// This is currently worse for 8/16-bit, but that should be optimized. TODO
if (SrcSize >= 4) {
if (SrcSize >= OpSize::i32Bit) {
if (SetPF) {
CalculatePF(_SubWithFlags(IR::SizeToOpSize(SrcSize), Res, _Constant(0)));
CalculatePF(_SubWithFlags(SrcSize, Res, _Constant(0)));
} else {
_SubNZCV(IR::SizeToOpSize(SrcSize), Res, _Constant(0));
_SubNZCV(SrcSize, Res, _Constant(0));
}
PossiblySetNZCVBits |= 1u << IndexNZCV(FEXCore::X86State::RFLAG_CF_RAW_LOC);
CFInverted = true;
} else {
_TestNZ(IR::SizeToOpSize(SrcSize), Res, Res);
_TestNZ(SrcSize, Res, Res);
CFInverted = false;
if (SetPF) {
@ -1653,7 +1655,7 @@ private:
}
}
void SetNZP_ZeroCV(unsigned SrcSize, Ref Res) {
void SetNZP_ZeroCV(IR::OpSize SrcSize, Ref Res) {
SetNZ_ZeroCV(SrcSize, Res, true);
}
@ -1705,8 +1707,8 @@ private:
HandleNZCVWrite();
CFInverted = true;
if (Size < 4) {
_TestNZ(OpSize::i32Bit, Src, _InlineConstant((1u << (8 * Size)) - 1));
if (Size < OpSize::i32Bit) {
_TestNZ(OpSize::i32Bit, Src, _InlineConstant((1u << (IR::OpSizeAsBits(Size))) - 1));
} else {
_TestNZ(Size, Src, Src);
}
@ -1882,7 +1884,7 @@ private:
LOGMAN_THROW_AA_FMT(Index < 64, "valid index");
uint64_t Bit = (1ull << (uint64_t)Index);
if (Size == 16 && (RegCache.Partial & Bit)) {
if (Size == OpSize::i128Bit && (RegCache.Partial & Bit)) {
// We need to load the full register extend if we previously did a partial access.
Ref Value = RegCache.Value[Index];
Ref Full = _LoadContext(Size, RegClass, Offset);
@ -1902,7 +1904,7 @@ private:
RegCache.Value[Index] = _LoadContext(Size, RegClass, Offset);
// We may have done a partial load, this requires special handling.
if (Size == 8) {
if (Size == OpSize::i64Bit) {
RegCache.Partial |= Bit;
}
} else if (Index == PFIndex) {
@ -1938,12 +1940,13 @@ private:
// Try to load a pair into the cache
uint64_t Bits = (3ull << (uint64_t)Index);
if (((RegCache.Partial | RegCache.Cached) & Bits) == 0 && ((Offset / Size) < 64)) {
const auto SizeInt = IR::OpSizeToSize(Size);
if (((RegCache.Partial | RegCache.Cached) & Bits) == 0 && ((Offset / SizeInt) < 64)) {
auto Values = LoadContextPair_Uncached(RegClass, Size, Offset);
RegCache.Value[Index] = Values.Low;
RegCache.Value[Index + 1] = Values.High;
RegCache.Cached |= Bits;
if (Size == 8) {
if (Size == OpSize::i64Bit) {
RegCache.Partial |= Bits;
}
return Values;
@ -1952,7 +1955,7 @@ private:
// Fallback on a pair of loads
return {
.Low = LoadRegCache(Offset, Index, RegClass, Size),
.High = LoadRegCache(Offset + Size, Index + 1, RegClass, Size),
.High = LoadRegCache(Offset + SizeInt, Index + 1, RegClass, Size),
};
}
@ -2427,10 +2430,11 @@ private:
}
AddressMode SelectPairAddressMode(AddressMode A, IR::OpSize Size) {
const auto SizeInt = IR::OpSizeToSize(Size);
AddressMode Out {};
signed OffsetEl = A.Offset / Size;
if ((A.Offset % Size) == 0 && OffsetEl >= -64 && OffsetEl < 64) {
signed OffsetEl = A.Offset / SizeInt;
if ((A.Offset % SizeInt) == 0 && OffsetEl >= -64 && OffsetEl < 64) {
Out.Offset = A.Offset;
A.Offset = 0;
}
@ -2477,6 +2481,7 @@ private:
void _StoreMemPairAutoTSO(FEXCore::IR::RegisterClassType Class, IR::OpSize Size, AddressMode A, Ref Value1, Ref Value2,
IR::OpSize Align = IR::OpSize::i8Bit) {
const auto SizeInt = IR::OpSizeToSize(Size);
bool AtomicTSO = IsTSOEnabled(Class) && !A.NonTSO;
// Use stp if possible, otherwise fallback on two stores.
@ -2485,7 +2490,7 @@ private:
_StoreMemPair(Class, Size, Value1, Value2, A.Base, A.Offset);
} else {
_StoreMemAutoTSO(Class, Size, A, Value1, OpSize::i8Bit);
A.Offset += Size;
A.Offset += SizeInt;
_StoreMemAutoTSO(Class, Size, A, Value2, OpSize::i8Bit);
}
}
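// Illustrative sketch, not part of this patch: a self-contained toy showing
// the kind of mistake the enum-class conversion turns into a compile error.
// `OpSize` and `OpSizeAsBits` here are stand-ins for the sketch above, not
// FEX's real definitions.
#include <cstdint>
#include <cstdio>

enum class OpSize : uint8_t { i8Bit = 1, i16Bit = 2, i32Bit = 4, i64Bit = 8 };

constexpr unsigned OpSizeAsBits(OpSize Size) {
  return static_cast<unsigned>(Size) * 8u;
}

int main() {
  OpSize Size = OpSize::i16Bit;
  // With the old integer typedef this compiled and silently mixed units:
  //   unsigned Bits = Size * 8;        // bytes? bits? the type didn't say
  // With a scoped enum there is no implicit arithmetic, so every call site in
  // the diff has to state its intent explicitly:
  unsigned Bits = OpSizeAsBits(Size);   // 16
  std::printf("%u bits\n", Bits);
  return 0;
}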


@ -74,8 +74,8 @@ void OpDispatchBuilder::InstallAVX128Handlers() {
{OPD(1, 0b00, 0x2F), 1, &OpDispatchBuilder::AVX128_UCOMISx<OpSize::i32Bit>},
{OPD(1, 0b01, 0x2F), 1, &OpDispatchBuilder::AVX128_UCOMISx<OpSize::i64Bit>},
{OPD(1, 0b00, 0x50), 1, &OpDispatchBuilder::AVX128_MOVMSK<4>},
{OPD(1, 0b01, 0x50), 1, &OpDispatchBuilder::AVX128_MOVMSK<8>},
{OPD(1, 0b00, 0x50), 1, &OpDispatchBuilder::AVX128_MOVMSK<OpSize::i32Bit>},
{OPD(1, 0b01, 0x50), 1, &OpDispatchBuilder::AVX128_MOVMSK<OpSize::i64Bit>},
{OPD(1, 0b00, 0x51), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorUnary, IR::OP_VFSQRT, OpSize::i32Bit>},
{OPD(1, 0b01, 0x51), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorUnary, IR::OP_VFSQRT, OpSize::i64Bit>},
@ -158,7 +158,7 @@ void OpDispatchBuilder::InstallAVX128Handlers() {
{OPD(1, 0b01, 0x6F), 1, &OpDispatchBuilder::AVX128_VMOVAPS},
{OPD(1, 0b10, 0x6F), 1, &OpDispatchBuilder::AVX128_VMOVAPS},
{OPD(1, 0b01, 0x70), 1, &OpDispatchBuilder::AVX128_VPERMILImm<4>},
{OPD(1, 0b01, 0x70), 1, &OpDispatchBuilder::AVX128_VPERMILImm<OpSize::i32Bit>},
{OPD(1, 0b10, 0x70), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VPSHUFW, false>},
{OPD(1, 0b11, 0x70), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VPSHUFW, true>},
@ -379,8 +379,8 @@ void OpDispatchBuilder::InstallAVX128Handlers() {
{OPD(3, 0b01, 0x00), 1, &OpDispatchBuilder::AVX128_VPERMQ},
{OPD(3, 0b01, 0x01), 1, &OpDispatchBuilder::AVX128_VPERMQ},
{OPD(3, 0b01, 0x02), 1, &OpDispatchBuilder::AVX128_VBLEND<OpSize::i32Bit>},
{OPD(3, 0b01, 0x04), 1, &OpDispatchBuilder::AVX128_VPERMILImm<4>},
{OPD(3, 0b01, 0x05), 1, &OpDispatchBuilder::AVX128_VPERMILImm<8>},
{OPD(3, 0b01, 0x04), 1, &OpDispatchBuilder::AVX128_VPERMILImm<OpSize::i32Bit>},
{OPD(3, 0b01, 0x05), 1, &OpDispatchBuilder::AVX128_VPERMILImm<OpSize::i64Bit>},
{OPD(3, 0b01, 0x06), 1, &OpDispatchBuilder::AVX128_VPERM2},
{OPD(3, 0b01, 0x08), 1, &OpDispatchBuilder::AVX128_VectorRound<OpSize::i32Bit>},
{OPD(3, 0b01, 0x09), 1, &OpDispatchBuilder::AVX128_VectorRound<OpSize::i64Bit>},
@ -665,7 +665,7 @@ void OpDispatchBuilder::AVX128_VectorUnary(OpcodeArgs, IROps IROp, IR::OpSize El
void OpDispatchBuilder::AVX128_VectorUnaryImpl(OpcodeArgs, IR::OpSize SrcSize, IR::OpSize ElementSize,
std::function<Ref(IR::OpSize ElementSize, Ref Src)> Helper) {
const auto Is128Bit = SrcSize == Core::CPUState::XMM_SSE_REG_SIZE;
const auto Is128Bit = SrcSize == OpSize::i128Bit;
auto Src = AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, !Is128Bit);
RefPair Result {};
@ -680,9 +680,9 @@ void OpDispatchBuilder::AVX128_VectorUnaryImpl(OpcodeArgs, IR::OpSize SrcSize, I
AVX128_StoreResult_WithOpSize(Op, Op->Dest, Result);
}
void OpDispatchBuilder::AVX128_VectorBinaryImpl(OpcodeArgs, size_t SrcSize, IR::OpSize ElementSize,
void OpDispatchBuilder::AVX128_VectorBinaryImpl(OpcodeArgs, IR::OpSize SrcSize, IR::OpSize ElementSize,
std::function<Ref(IR::OpSize ElementSize, Ref Src1, Ref Src2)> Helper) {
const auto Is128Bit = SrcSize == Core::CPUState::XMM_SSE_REG_SIZE;
const auto Is128Bit = SrcSize == OpSize::i128Bit;
auto Src1 = AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, !Is128Bit);
auto Src2 = AVX128_LoadSource_WithOpSize(Op, Op->Src[1], Op->Flags, !Is128Bit);
@ -698,9 +698,9 @@ void OpDispatchBuilder::AVX128_VectorBinaryImpl(OpcodeArgs, size_t SrcSize, IR::
AVX128_StoreResult_WithOpSize(Op, Op->Dest, Result);
}
void OpDispatchBuilder::AVX128_VectorTrinaryImpl(OpcodeArgs, size_t SrcSize, size_t ElementSize, Ref Src3,
std::function<Ref(size_t ElementSize, Ref Src1, Ref Src2, Ref Src3)> Helper) {
const auto Is128Bit = SrcSize == Core::CPUState::XMM_SSE_REG_SIZE;
void OpDispatchBuilder::AVX128_VectorTrinaryImpl(OpcodeArgs, IR::OpSize SrcSize, IR::OpSize ElementSize, Ref Src3,
std::function<Ref(IR::OpSize ElementSize, Ref Src1, Ref Src2, Ref Src3)> Helper) {
const auto Is128Bit = SrcSize == OpSize::i128Bit;
auto Src1 = AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, !Is128Bit);
auto Src2 = AVX128_LoadSource_WithOpSize(Op, Op->Src[1], Op->Flags, !Is128Bit);
@ -984,13 +984,13 @@ void OpDispatchBuilder::AVX128_VBROADCAST(OpcodeArgs) {
template<IR::OpSize ElementSize>
void OpDispatchBuilder::AVX128_VPUNPCKL(OpcodeArgs) {
AVX128_VectorBinaryImpl(Op, GetSrcSize(Op), ElementSize,
AVX128_VectorBinaryImpl(Op, OpSizeFromSrc(Op), ElementSize,
[this](IR::OpSize _ElementSize, Ref Src1, Ref Src2) { return _VZip(OpSize::i128Bit, _ElementSize, Src1, Src2); });
}
template<IR::OpSize ElementSize>
void OpDispatchBuilder::AVX128_VPUNPCKH(OpcodeArgs) {
AVX128_VectorBinaryImpl(Op, GetSrcSize(Op), ElementSize,
AVX128_VectorBinaryImpl(Op, OpSizeFromSrc(Op), ElementSize,
[this](IR::OpSize _ElementSize, Ref Src1, Ref Src2) { return _VZip2(OpSize::i128Bit, _ElementSize, Src1, Src2); });
}
@ -1039,7 +1039,7 @@ void OpDispatchBuilder::AVX128_InsertCVTGPR_To_FPR(OpcodeArgs) {
Result.Low = _VSToFVectorInsert(DstSize, DstElementSize, DstElementSize, Src1.Low, Src2.Low, false, false);
}
[[maybe_unused]] const auto Is128Bit = DstSize == Core::CPUState::XMM_SSE_REG_SIZE;
[[maybe_unused]] const auto Is128Bit = DstSize == OpSize::i128Bit;
LOGMAN_THROW_A_FMT(Is128Bit, "Programming Error: This should never occur!");
Result.High = LoadZeroVector(OpSize::i128Bit);
@ -1073,33 +1073,33 @@ void OpDispatchBuilder::AVX128_CVTFPR_To_GPR(OpcodeArgs) {
}
void OpDispatchBuilder::AVX128_VANDN(OpcodeArgs) {
AVX128_VectorBinaryImpl(Op, GetSrcSize(Op), OpSize::i128Bit,
AVX128_VectorBinaryImpl(Op, OpSizeFromSrc(Op), OpSize::i128Bit,
[this](IR::OpSize _ElementSize, Ref Src1, Ref Src2) { return _VAndn(OpSize::i128Bit, _ElementSize, Src2, Src1); });
}
template<IR::OpSize ElementSize>
void OpDispatchBuilder::AVX128_VPACKSS(OpcodeArgs) {
AVX128_VectorBinaryImpl(Op, GetSrcSize(Op), ElementSize, [this](IR::OpSize _ElementSize, Ref Src1, Ref Src2) {
AVX128_VectorBinaryImpl(Op, OpSizeFromSrc(Op), ElementSize, [this](IR::OpSize _ElementSize, Ref Src1, Ref Src2) {
return _VSQXTNPair(OpSize::i128Bit, _ElementSize, Src1, Src2);
});
}
template<IR::OpSize ElementSize>
void OpDispatchBuilder::AVX128_VPACKUS(OpcodeArgs) {
AVX128_VectorBinaryImpl(Op, GetSrcSize(Op), ElementSize, [this](IR::OpSize _ElementSize, Ref Src1, Ref Src2) {
AVX128_VectorBinaryImpl(Op, OpSizeFromSrc(Op), ElementSize, [this](IR::OpSize _ElementSize, Ref Src1, Ref Src2) {
return _VSQXTUNPair(OpSize::i128Bit, _ElementSize, Src1, Src2);
});
}
Ref OpDispatchBuilder::AVX128_PSIGNImpl(IR::OpSize ElementSize, Ref Src1, Ref Src2) {
Ref Control = _VSQSHL(OpSize::i128Bit, ElementSize, Src2, (ElementSize * 8) - 1);
Control = _VSRSHR(OpSize::i128Bit, ElementSize, Control, (ElementSize * 8) - 1);
Ref Control = _VSQSHL(OpSize::i128Bit, ElementSize, Src2, IR::OpSizeAsBits(ElementSize) - 1);
Control = _VSRSHR(OpSize::i128Bit, ElementSize, Control, IR::OpSizeAsBits(ElementSize) - 1);
return _VMul(OpSize::i128Bit, ElementSize, Src1, Control);
}
template<IR::OpSize ElementSize>
void OpDispatchBuilder::AVX128_VPSIGN(OpcodeArgs) {
AVX128_VectorBinaryImpl(Op, GetSrcSize(Op), ElementSize,
AVX128_VectorBinaryImpl(Op, OpSizeFromSrc(Op), ElementSize,
[this](IR::OpSize _ElementSize, Ref Src1, Ref Src2) { return AVX128_PSIGNImpl(_ElementSize, Src1, Src2); });
}
@ -1154,7 +1154,7 @@ void OpDispatchBuilder::AVX128_VFCMP(OpcodeArgs) {
.CompType = CompType,
};
AVX128_VectorBinaryImpl(Op, GetSrcSize(Op), ElementSize, [this, &Capture](IR::OpSize _ElementSize, Ref Src1, Ref Src2) {
AVX128_VectorBinaryImpl(Op, OpSizeFromSrc(Op), ElementSize, [this, &Capture](IR::OpSize _ElementSize, Ref Src1, Ref Src2) {
return VFCMPOpImpl(OpSize::i128Bit, _ElementSize, Src1, Src2, Capture.CompType);
});
}
@ -1234,7 +1234,7 @@ void OpDispatchBuilder::AVX128_PExtr(OpcodeArgs) {
}
// AVX version only operates on 128-bit.
const uint8_t NumElements = std::min<uint8_t>(GetSrcSize(Op), OpSize::i128Bit) / OverridenElementSize;
const uint8_t NumElements = IR::NumElements(std::min(OpSizeFromSrc(Op), OpSize::i128Bit), OverridenElementSize);
Index &= NumElements - 1;
if (Op->Dest.IsGPR()) {
@ -1251,14 +1251,14 @@ void OpDispatchBuilder::AVX128_PExtr(OpcodeArgs) {
}
void OpDispatchBuilder::AVX128_ExtendVectorElements(OpcodeArgs, IR::OpSize ElementSize, IR::OpSize DstElementSize, bool Signed) {
const auto DstSize = GetDstSize(Op);
const auto DstSize = OpSizeFromDst(Op);
const auto GetSrc = [&] {
if (Op->Src[0].IsGPR()) {
return AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, false).Low;
} else {
// For memory operands the 256-bit variant loads twice the size specified in the table.
const auto Is256Bit = DstSize == Core::CPUState::XMM_AVX_REG_SIZE;
const auto Is256Bit = DstSize == OpSize::i256Bit;
const auto SrcSize = OpSizeFromSrc(Op);
const auto LoadSize = Is256Bit ? IR::SizeToOpSize(IR::OpSizeToSize(SrcSize) * 2) : SrcSize;
@ -1267,8 +1267,7 @@ void OpDispatchBuilder::AVX128_ExtendVectorElements(OpcodeArgs, IR::OpSize Eleme
};
auto Transform = [=, this](Ref Src) {
for (auto CurrentElementSize = ElementSize; CurrentElementSize != DstElementSize;
CurrentElementSize = IR::MultiplyOpSize(CurrentElementSize, 2)) {
for (auto CurrentElementSize = ElementSize; CurrentElementSize != DstElementSize; CurrentElementSize = CurrentElementSize << 1) {
if (Signed) {
Src = _VSXTL(OpSize::i128Bit, CurrentElementSize, Src);
} else {
@ -1286,8 +1285,8 @@ void OpDispatchBuilder::AVX128_ExtendVectorElements(OpcodeArgs, IR::OpSize Eleme
Result.Low = Transform(Src);
} else {
// 256-bit operation is a bit special. It splits the incoming source between lower and upper registers.
size_t TotalElementCount = OpSize::i256Bit / DstElementSize;
size_t TotalElementsToSplitSize = (TotalElementCount / 2) * ElementSize;
size_t TotalElementCount = IR::NumElements(OpSize::i256Bit, DstElementSize);
size_t TotalElementsToSplitSize = (TotalElementCount / 2) * IR::OpSizeToSize(ElementSize);
// Split the number of elements in half between lower and upper.
Ref SrcHigh = _VDupElement(OpSize::i128Bit, IR::SizeToOpSize(TotalElementsToSplitSize), Src, 1);
@ -1303,10 +1302,10 @@ void OpDispatchBuilder::AVX128_ExtendVectorElements(OpcodeArgs, IR::OpSize Eleme
AVX128_StoreResult_WithOpSize(Op, Op->Dest, Result);
}
template<size_t ElementSize>
template<IR::OpSize ElementSize>
void OpDispatchBuilder::AVX128_MOVMSK(OpcodeArgs) {
const auto SrcSize = GetSrcSize(Op);
const auto Is128Bit = SrcSize == Core::CPUState::XMM_SSE_REG_SIZE;
const auto SrcSize = OpSizeFromSrc(Op);
const auto Is128Bit = SrcSize == OpSize::i128Bit;
auto Src = AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, !Is128Bit);
@ -1385,7 +1384,7 @@ void OpDispatchBuilder::AVX128_MOVMSKB(OpcodeArgs) {
void OpDispatchBuilder::AVX128_PINSRImpl(OpcodeArgs, IR::OpSize ElementSize, const X86Tables::DecodedOperand& Src1Op,
const X86Tables::DecodedOperand& Src2Op, const X86Tables::DecodedOperand& Imm) {
const auto NumElements = OpSize::i128Bit / ElementSize;
const auto NumElements = IR::NumElements(OpSize::i128Bit, ElementSize);
const uint64_t Index = Imm.Literal() & (NumElements - 1);
auto Src1 = AVX128_LoadSource_WithOpSize(Op, Src1Op, Op->Flags, false);
@ -1419,7 +1418,7 @@ void OpDispatchBuilder::AVX128_VPINSRDQ(OpcodeArgs) {
}
void OpDispatchBuilder::AVX128_VariableShiftImpl(OpcodeArgs, IROps IROp) {
AVX128_VectorBinaryImpl(Op, GetDstSize(Op), OpSizeFromSrc(Op), [this, IROp](IR::OpSize ElementSize, Ref Src1, Ref Src2) {
AVX128_VectorBinaryImpl(Op, OpSizeFromDst(Op), OpSizeFromSrc(Op), [this, IROp](IR::OpSize ElementSize, Ref Src1, Ref Src2) {
DeriveOp(Shift, IROp, _VUShr(OpSize::i128Bit, ElementSize, Src1, Src2, true));
return Shift;
});
@ -1431,7 +1430,7 @@ void OpDispatchBuilder::AVX128_ShiftDoubleImm(OpcodeArgs, ShiftDirection Dir) {
const bool Right = Dir == ShiftDirection::RIGHT;
const uint64_t Shift = Op->Src[1].Literal();
const uint64_t ExtrShift = Right ? Shift : OpSize::i128Bit - Shift;
const uint64_t ExtrShift = Right ? Shift : IR::OpSizeToSize(OpSize::i128Bit) - Shift;
auto Src = AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, !Is128Bit);
@ -1486,40 +1485,40 @@ void OpDispatchBuilder::AVX128_VINSERTPS(OpcodeArgs) {
template<IR::OpSize ElementSize>
void OpDispatchBuilder::AVX128_VPHSUB(OpcodeArgs) {
AVX128_VectorBinaryImpl(Op, GetDstSize(Op), ElementSize, [this](IR::OpSize _ElementSize, Ref Src1, Ref Src2) {
AVX128_VectorBinaryImpl(Op, OpSizeFromDst(Op), ElementSize, [this](IR::OpSize _ElementSize, Ref Src1, Ref Src2) {
return PHSUBOpImpl(OpSize::i128Bit, Src1, Src2, _ElementSize);
});
}
void OpDispatchBuilder::AVX128_VPHSUBSW(OpcodeArgs) {
AVX128_VectorBinaryImpl(Op, GetDstSize(Op), OpSize::i16Bit,
AVX128_VectorBinaryImpl(Op, OpSizeFromDst(Op), OpSize::i16Bit,
[this](IR::OpSize _ElementSize, Ref Src1, Ref Src2) { return PHSUBSOpImpl(OpSize::i128Bit, Src1, Src2); });
}
template<IR::OpSize ElementSize>
void OpDispatchBuilder::AVX128_VADDSUBP(OpcodeArgs) {
AVX128_VectorBinaryImpl(Op, GetDstSize(Op), ElementSize, [this](IR::OpSize _ElementSize, Ref Src1, Ref Src2) {
AVX128_VectorBinaryImpl(Op, OpSizeFromDst(Op), ElementSize, [this](IR::OpSize _ElementSize, Ref Src1, Ref Src2) {
return ADDSUBPOpImpl(OpSize::i128Bit, _ElementSize, Src1, Src2);
});
}
template<IR::OpSize ElementSize, bool Signed>
void OpDispatchBuilder::AVX128_VPMULL(OpcodeArgs) {
static_assert(ElementSize == sizeof(uint32_t), "Currently only handles 32-bit -> 64-bit");
static_assert(ElementSize == OpSize::i32Bit, "Currently only handles 32-bit -> 64-bit");
AVX128_VectorBinaryImpl(Op, GetDstSize(Op), ElementSize, [this](IR::OpSize _ElementSize, Ref Src1, Ref Src2) -> Ref {
AVX128_VectorBinaryImpl(Op, OpSizeFromDst(Op), ElementSize, [this](IR::OpSize _ElementSize, Ref Src1, Ref Src2) -> Ref {
return PMULLOpImpl(OpSize::i128Bit, ElementSize, Signed, Src1, Src2);
});
}
void OpDispatchBuilder::AVX128_VPMULHRSW(OpcodeArgs) {
AVX128_VectorBinaryImpl(Op, GetDstSize(Op), OpSize::i16Bit,
AVX128_VectorBinaryImpl(Op, OpSizeFromDst(Op), OpSize::i16Bit,
[this](IR::OpSize _ElementSize, Ref Src1, Ref Src2) -> Ref { return PMULHRSWOpImpl(OpSize::i128Bit, Src1, Src2); });
}
template<bool Signed>
void OpDispatchBuilder::AVX128_VPMULHW(OpcodeArgs) {
AVX128_VectorBinaryImpl(Op, GetDstSize(Op), OpSize::i16Bit, [this](IR::OpSize _ElementSize, Ref Src1, Ref Src2) -> Ref {
AVX128_VectorBinaryImpl(Op, OpSizeFromDst(Op), OpSize::i16Bit, [this](IR::OpSize _ElementSize, Ref Src1, Ref Src2) -> Ref {
if (Signed) {
return _VSMulH(OpSize::i128Bit, _ElementSize, Src1, Src2);
} else {
@ -1546,9 +1545,9 @@ void OpDispatchBuilder::AVX128_Vector_CVT_Float_To_Float(OpcodeArgs) {
const auto SrcSize = OpSizeFromSrc(Op);
const auto DstSize = OpSizeFromDst(Op);
const auto IsFloatSrc = SrcElementSize == 4;
auto Is128BitSrc = SrcSize == Core::CPUState::XMM_SSE_REG_SIZE;
auto Is128BitDst = DstSize == Core::CPUState::XMM_SSE_REG_SIZE;
const auto IsFloatSrc = SrcElementSize == OpSize::i32Bit;
auto Is128BitSrc = SrcSize == OpSize::i128Bit;
auto Is128BitDst = DstSize == OpSize::i128Bit;
///< Decompose correctly.
if (DstElementSize > SrcElementSize && !Is128BitDst) {
@ -1630,7 +1629,7 @@ void OpDispatchBuilder::AVX128_Vector_CVT_Float_To_Int(OpcodeArgs) {
auto Convert = [this](Ref Src) -> Ref {
auto ElementSize = SrcElementSize;
if (Narrow) {
ElementSize = IR::DivideOpSize(ElementSize, 2);
ElementSize = ElementSize >> 1;
Src = _Vector_FToF(OpSize::i128Bit, ElementSize, Src, SrcElementSize);
}
@ -1663,7 +1662,7 @@ void OpDispatchBuilder::AVX128_Vector_CVT_Float_To_Int(OpcodeArgs) {
template<IR::OpSize SrcElementSize, bool Widen>
void OpDispatchBuilder::AVX128_Vector_CVT_Int_To_Float(OpcodeArgs) {
const auto Size = OpSizeFromDst(Op);
const auto Is128Bit = Size == Core::CPUState::XMM_SSE_REG_SIZE;
const auto Is128Bit = Size == OpSize::i128Bit;
RefPair Src = [&] {
if (Widen && !Op->Src[0].IsGPR()) {
@ -1682,7 +1681,7 @@ void OpDispatchBuilder::AVX128_Vector_CVT_Int_To_Float(OpcodeArgs) {
if (Widen) {
DeriveOp(Extended, Op, _VSXTL(OpSize::i128Bit, ElementSize, Src));
Src = Extended;
ElementSize = IR::MultiplyOpSize(ElementSize, 2);
ElementSize = ElementSize << 1;
}
return _Vector_SToF(OpSize::i128Bit, ElementSize, Src);
@ -1732,23 +1731,23 @@ void OpDispatchBuilder::AVX128_VAESImc(OpcodeArgs) {
}
void OpDispatchBuilder::AVX128_VAESEnc(OpcodeArgs) {
AVX128_VectorTrinaryImpl(Op, GetDstSize(Op), OpSize::i128Bit, LoadZeroVector(OpSize::i128Bit),
[this](size_t, Ref Src1, Ref Src2, Ref Src3) { return _VAESEnc(OpSize::i128Bit, Src1, Src2, Src3); });
AVX128_VectorTrinaryImpl(Op, OpSizeFromDst(Op), OpSize::i128Bit, LoadZeroVector(OpSize::i128Bit),
[this](IR::OpSize, Ref Src1, Ref Src2, Ref Src3) { return _VAESEnc(OpSize::i128Bit, Src1, Src2, Src3); });
}
void OpDispatchBuilder::AVX128_VAESEncLast(OpcodeArgs) {
AVX128_VectorTrinaryImpl(Op, GetDstSize(Op), OpSize::i128Bit, LoadZeroVector(OpSize::i128Bit),
[this](size_t, Ref Src1, Ref Src2, Ref Src3) { return _VAESEncLast(OpSize::i128Bit, Src1, Src2, Src3); });
AVX128_VectorTrinaryImpl(Op, OpSizeFromDst(Op), OpSize::i128Bit, LoadZeroVector(OpSize::i128Bit),
[this](IR::OpSize, Ref Src1, Ref Src2, Ref Src3) { return _VAESEncLast(OpSize::i128Bit, Src1, Src2, Src3); });
}
void OpDispatchBuilder::AVX128_VAESDec(OpcodeArgs) {
AVX128_VectorTrinaryImpl(Op, GetDstSize(Op), OpSize::i128Bit, LoadZeroVector(OpSize::i128Bit),
[this](size_t, Ref Src1, Ref Src2, Ref Src3) { return _VAESDec(OpSize::i128Bit, Src1, Src2, Src3); });
AVX128_VectorTrinaryImpl(Op, OpSizeFromDst(Op), OpSize::i128Bit, LoadZeroVector(OpSize::i128Bit),
[this](IR::OpSize, Ref Src1, Ref Src2, Ref Src3) { return _VAESDec(OpSize::i128Bit, Src1, Src2, Src3); });
}
void OpDispatchBuilder::AVX128_VAESDecLast(OpcodeArgs) {
AVX128_VectorTrinaryImpl(Op, GetDstSize(Op), OpSize::i128Bit, LoadZeroVector(OpSize::i128Bit),
[this](size_t, Ref Src1, Ref Src2, Ref Src3) { return _VAESDecLast(OpSize::i128Bit, Src1, Src2, Src3); });
AVX128_VectorTrinaryImpl(Op, OpSizeFromDst(Op), OpSize::i128Bit, LoadZeroVector(OpSize::i128Bit),
[this](IR::OpSize, Ref Src1, Ref Src2, Ref Src3) { return _VAESDecLast(OpSize::i128Bit, Src1, Src2, Src3); });
}
void OpDispatchBuilder::AVX128_VAESKeyGenAssist(OpcodeArgs) {
@ -1838,7 +1837,7 @@ template<IR::OpSize ElementSize>
void OpDispatchBuilder::AVX128_VDPP(OpcodeArgs) {
const uint64_t Literal = Op->Src[2].Literal();
AVX128_VectorBinaryImpl(Op, GetSrcSize(Op), ElementSize, [this, Literal](IR::OpSize, Ref Src1, Ref Src2) {
AVX128_VectorBinaryImpl(Op, OpSizeFromSrc(Op), ElementSize, [this, Literal](IR::OpSize, Ref Src1, Ref Src2) {
return DPPOpImpl(OpSize::i128Bit, Src1, Src2, Literal, ElementSize);
});
}
@ -1927,7 +1926,7 @@ void OpDispatchBuilder::AVX128_VSHUF(OpcodeArgs) {
AVX128_StoreResult_WithOpSize(Op, Op->Dest, Result);
}
template<size_t ElementSize>
template<IR::OpSize ElementSize>
void OpDispatchBuilder::AVX128_VPERMILImm(OpcodeArgs) {
const auto SrcSize = GetSrcSize(Op);
const auto Is128Bit = SrcSize == Core::CPUState::XMM_SSE_REG_SIZE;
@ -1967,31 +1966,31 @@ void OpDispatchBuilder::AVX128_VPERMILImm(OpcodeArgs) {
template<IROps IROp, IR::OpSize ElementSize>
void OpDispatchBuilder::AVX128_VHADDP(OpcodeArgs) {
AVX128_VectorBinaryImpl(Op, GetSrcSize(Op), ElementSize, [this](IR::OpSize, Ref Src1, Ref Src2) {
AVX128_VectorBinaryImpl(Op, OpSizeFromSrc(Op), ElementSize, [this](IR::OpSize, Ref Src1, Ref Src2) {
DeriveOp(Res, IROp, _VFAddP(OpSize::i128Bit, ElementSize, Src1, Src2));
return Res;
});
}
void OpDispatchBuilder::AVX128_VPHADDSW(OpcodeArgs) {
AVX128_VectorBinaryImpl(Op, GetDstSize(Op), OpSize::i16Bit,
AVX128_VectorBinaryImpl(Op, OpSizeFromDst(Op), OpSize::i16Bit,
[this](IR::OpSize _ElementSize, Ref Src1, Ref Src2) { return PHADDSOpImpl(OpSize::i128Bit, Src1, Src2); });
}
void OpDispatchBuilder::AVX128_VPMADDUBSW(OpcodeArgs) {
AVX128_VectorBinaryImpl(Op, GetSrcSize(Op), OpSize::i128Bit,
AVX128_VectorBinaryImpl(Op, OpSizeFromDst(Op), OpSize::i128Bit,
[this](IR::OpSize _ElementSize, Ref Src1, Ref Src2) { return PMADDUBSWOpImpl(OpSize::i128Bit, Src1, Src2); });
}
void OpDispatchBuilder::AVX128_VPMADDWD(OpcodeArgs) {
AVX128_VectorBinaryImpl(Op, GetSrcSize(Op), OpSize::i128Bit,
AVX128_VectorBinaryImpl(Op, OpSizeFromDst(Op), OpSize::i128Bit,
[this](IR::OpSize _ElementSize, Ref Src1, Ref Src2) { return PMADDWDOpImpl(OpSize::i128Bit, Src1, Src2); });
}
template<IR::OpSize ElementSize>
void OpDispatchBuilder::AVX128_VBLEND(OpcodeArgs) {
const auto SrcSize = OpSizeFromSrc(Op);
const auto Is128Bit = SrcSize == Core::CPUState::XMM_SSE_REG_SIZE;
const auto Is128Bit = SrcSize == OpSize::i128Bit;
const uint64_t Selector = Op->Src[2].Literal();
///< High Selector shift depends on element size:
@ -2017,19 +2016,19 @@ void OpDispatchBuilder::AVX128_VBLEND(OpcodeArgs) {
template<IR::OpSize ElementSize>
void OpDispatchBuilder::AVX128_VHSUBP(OpcodeArgs) {
AVX128_VectorBinaryImpl(Op, GetDstSize(Op), ElementSize,
AVX128_VectorBinaryImpl(Op, OpSizeFromDst(Op), ElementSize,
[this](IR::OpSize, Ref Src1, Ref Src2) { return HSUBPOpImpl(OpSize::i128Bit, ElementSize, Src1, Src2); });
}
void OpDispatchBuilder::AVX128_VPSHUFB(OpcodeArgs) {
auto MaskVector = GeneratePSHUFBMask(OpSize::i128Bit);
AVX128_VectorBinaryImpl(Op, GetDstSize(Op), OpSize::i8Bit, [this, MaskVector](IR::OpSize, Ref Src1, Ref Src2) {
AVX128_VectorBinaryImpl(Op, OpSizeFromDst(Op), OpSize::i8Bit, [this, MaskVector](IR::OpSize, Ref Src1, Ref Src2) {
return PSHUFBOpImpl(OpSize::i128Bit, Src1, Src2, MaskVector);
});
}
void OpDispatchBuilder::AVX128_VPSADBW(OpcodeArgs) {
AVX128_VectorBinaryImpl(Op, GetDstSize(Op), OpSize::i8Bit,
AVX128_VectorBinaryImpl(Op, OpSizeFromDst(Op), OpSize::i8Bit,
[this](IR::OpSize, Ref Src1, Ref Src2) { return PSADBWOpImpl(OpSize::i128Bit, Src1, Src2); });
}
@ -2061,7 +2060,7 @@ void OpDispatchBuilder::AVX128_VPALIGNR(OpcodeArgs) {
const auto SanitizedDstSize = std::min(Size, OpSize::i128Bit);
AVX128_VectorBinaryImpl(Op, Size, SanitizedDstSize, [this, Index](IR::OpSize SanitizedDstSize, Ref Src1, Ref Src2) -> Ref {
if (Index >= (SanitizedDstSize * 2)) {
if (Index >= (IR::OpSizeToSize(SanitizedDstSize) * 2)) {
// If the immediate is greater than both vectors combined then it zeroes the vector
return LoadZeroVector(OpSize::i128Bit);
}
@ -2076,7 +2075,7 @@ void OpDispatchBuilder::AVX128_VPALIGNR(OpcodeArgs) {
void OpDispatchBuilder::AVX128_VMASKMOVImpl(OpcodeArgs, IR::OpSize ElementSize, IR::OpSize DstSize, bool IsStore,
const X86Tables::DecodedOperand& MaskOp, const X86Tables::DecodedOperand& DataOp) {
const auto Is128Bit = DstSize == Core::CPUState::XMM_SSE_REG_SIZE;
const auto Is128Bit = DstSize == OpSize::i128Bit;
auto Mask = AVX128_LoadSource_WithOpSize(Op, MaskOp, Op->Flags, !Is128Bit);
@ -2098,14 +2097,14 @@ void OpDispatchBuilder::AVX128_VMASKMOVImpl(OpcodeArgs, IR::OpSize ElementSize,
auto Address = MakeAddress(DataOp);
RefPair Result {};
Result.Low = _VLoadVectorMasked(OpSize::i128Bit, ElementSize, Mask.Low, Address, Invalid(), MEM_OFFSET_SXTX, OpSize::i8Bit);
Result.Low = _VLoadVectorMasked(OpSize::i128Bit, ElementSize, Mask.Low, Address, Invalid(), MEM_OFFSET_SXTX, 1);
if (Is128Bit) {
Result.High = LoadZeroVector(OpSize::i128Bit);
} else {
///< TODO: This can be cleaner if AVX128_LoadSource_WithOpSize could return both constructed addresses.
auto AddressHigh = _Add(OpSize::i64Bit, Address, _Constant(16));
Result.High = _VLoadVectorMasked(OpSize::i128Bit, ElementSize, Mask.High, AddressHigh, Invalid(), MEM_OFFSET_SXTX, OpSize::i8Bit);
Result.High = _VLoadVectorMasked(OpSize::i128Bit, ElementSize, Mask.High, AddressHigh, Invalid(), MEM_OFFSET_SXTX, 1);
}
AVX128_StoreResult_WithOpSize(Op, Op->Dest, Result);
}
@ -2124,7 +2123,7 @@ void OpDispatchBuilder::AVX128_VMASKMOV(OpcodeArgs) {
void OpDispatchBuilder::AVX128_MASKMOV(OpcodeArgs) {
///< This instruction only supports 128-bit.
const auto Size = OpSizeFromSrc(Op);
const auto Is128Bit = Size == Core::CPUState::XMM_SSE_REG_SIZE;
const auto Is128Bit = Size == OpSize::i128Bit;
auto MaskSrc = AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, !Is128Bit);
@ -2147,11 +2146,9 @@ void OpDispatchBuilder::AVX128_MASKMOV(OpcodeArgs) {
template<IR::OpSize ElementSize>
void OpDispatchBuilder::AVX128_VectorVariableBlend(OpcodeArgs) {
const auto Size = OpSizeFromSrc(Op);
const auto Is128Bit = Size == Core::CPUState::XMM_SSE_REG_SIZE;
const auto Is128Bit = Size == OpSize::i128Bit;
const auto Src3Selector = Op->Src[2].Literal();
constexpr auto ElementSizeBits = ElementSize * 8;
auto Src1 = AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, !Is128Bit);
auto Src2 = AVX128_LoadSource_WithOpSize(Op, Op->Src[1], Op->Flags, !Is128Bit);
@ -2163,6 +2160,7 @@ void OpDispatchBuilder::AVX128_VectorVariableBlend(OpcodeArgs) {
}
auto Convert = [this](Ref Src1, Ref Src2, Ref Mask) {
const auto ElementSizeBits = IR::OpSizeAsBits(ElementSize);
Ref Shifted = _VSShrI(OpSize::i128Bit, ElementSize, Mask, ElementSizeBits - 1);
return _VBSL(OpSize::i128Bit, Shifted, Src2, Src1);
};
@ -2248,7 +2246,7 @@ void OpDispatchBuilder::AVX128_VTESTP(OpcodeArgs) {
Ref ZeroConst = _Constant(0);
Ref OneConst = _Constant(1);
const auto ElementSizeInBits = ElementSize * 8;
const auto ElementSizeInBits = IR::OpSizeAsBits(ElementSize);
{
// Calculate ZF first.
@ -2292,7 +2290,7 @@ void OpDispatchBuilder::AVX128_VTESTP(OpcodeArgs) {
}
// As in PTest, this sets Z appropriately while zeroing the rest of NZCV.
SetNZ_ZeroCV(32, ZF);
SetNZ_ZeroCV(OpSize::i32Bit, ZF);
SetCFInverted(CFInv);
ZeroPF_AF();
}
@ -2339,14 +2337,14 @@ void OpDispatchBuilder::AVX128_PTest(OpcodeArgs) {
// Set ZF according to Test1. SF will be zeroed since we do a 32-bit test on
// the results of a 16-bit value from the UMaxV, so the 32-bit sign bit is
// cleared even if the 16-bit scalars were negative.
SetNZ_ZeroCV(32, Test1);
SetNZ_ZeroCV(OpSize::i32Bit, Test1);
SetCFInverted(Test2);
ZeroPF_AF();
}
template<IR::OpSize ElementSize>
void OpDispatchBuilder::AVX128_VPERMILReg(OpcodeArgs) {
AVX128_VectorBinaryImpl(Op, GetSrcSize(Op), ElementSize, [this](size_t _ElementSize, Ref Src, Ref Indices) {
AVX128_VectorBinaryImpl(Op, OpSizeFromSrc(Op), ElementSize, [this](IR::OpSize _ElementSize, Ref Src, Ref Indices) {
return VPERMILRegOpImpl(OpSize::i128Bit, ElementSize, Src, Indices);
});
}
@ -2376,7 +2374,7 @@ void OpDispatchBuilder::AVX128_VPERMD(OpcodeArgs) {
void OpDispatchBuilder::AVX128_VPCLMULQDQ(OpcodeArgs) {
const auto Selector = static_cast<uint8_t>(Op->Src[2].Literal());
AVX128_VectorBinaryImpl(Op, GetSrcSize(Op), OpSize::iInvalid, [this, Selector](size_t _, Ref Src1, Ref Src2) {
AVX128_VectorBinaryImpl(Op, OpSizeFromSrc(Op), OpSize::iInvalid, [this, Selector](IR::OpSize, Ref Src1, Ref Src2) {
return _PCLMUL(OpSize::i128Bit, Src1, Src2, Selector & 0b1'0001);
});
}
@ -2548,7 +2546,7 @@ void OpDispatchBuilder::AVX128_VFMAddSubImpl(OpcodeArgs, bool AddSub, uint8_t Sr
OpDispatchBuilder::RefPair OpDispatchBuilder::AVX128_VPGatherImpl(OpSize Size, OpSize ElementLoadSize, OpSize AddrElementSize, RefPair Dest,
RefPair Mask, RefVSIB VSIB) {
LOGMAN_THROW_A_FMT(AddrElementSize == OpSize::i32Bit || AddrElementSize == OpSize::i64Bit, "Unknown address element size");
const auto Is128Bit = Size == Core::CPUState::XMM_SSE_REG_SIZE;
const auto Is128Bit = Size == OpSize::i128Bit;
///< BaseAddr doesn't need to exist, calculate that here.
Ref BaseAddr = VSIB.BaseAddr;
@ -2686,17 +2684,17 @@ OpDispatchBuilder::RefPair OpDispatchBuilder::AVX128_VPGatherQPSImpl(Ref Dest, R
template<OpSize AddrElementSize>
void OpDispatchBuilder::AVX128_VPGATHER(OpcodeArgs) {
const auto Size = GetDstSize(Op);
const auto Is128Bit = Size == Core::CPUState::XMM_SSE_REG_SIZE;
const auto Size = OpSizeFromDst(Op);
const auto Is128Bit = Size == OpSize::i128Bit;
///< Element size is determined by W flag.
const OpSize ElementLoadSize = Op->Flags & X86Tables::DecodeFlags::FLAG_OPTION_AVX_W ? OpSize::i64Bit : OpSize::i32Bit;
// We only need the high address register if the number of data elements is more than what the low half can consume.
// But also the number of address elements is clamped by the destination size as well.
const size_t NumDataElements = Size / ElementLoadSize;
const size_t NumAddrElementBytes = std::min<size_t>(Size, (NumDataElements * AddrElementSize));
const bool NeedsHighAddrBytes = NumAddrElementBytes > OpSize::i128Bit;
const size_t NumDataElements = IR::NumElements(Size, ElementLoadSize);
const size_t NumAddrElementBytes = std::min<size_t>(IR::OpSizeToSize(Size), (NumDataElements * IR::OpSizeToSize(AddrElementSize)));
const bool NeedsHighAddrBytes = NumAddrElementBytes > IR::OpSizeToSize(OpSize::i128Bit);
auto Dest = AVX128_LoadSource_WithOpSize(Op, Op->Dest, Op->Flags, !Is128Bit);
auto VSIB = AVX128_LoadVSIB(Op, Op->Src[0], Op->Flags, NeedsHighAddrBytes);
@ -2740,7 +2738,7 @@ void OpDispatchBuilder::AVX128_VPGATHER(OpcodeArgs) {
} else if (AddrElementSize == OpSize::i64Bit && ElementLoadSize == OpSize::i32Bit) {
Result = AVX128_VPGatherQPSImpl(Dest.Low, Mask.Low, VSIB);
} else {
Result = AVX128_VPGatherImpl(SizeToOpSize(Size), ElementLoadSize, AddrElementSize, Dest, Mask, VSIB);
Result = AVX128_VPGatherImpl(Size, ElementLoadSize, AddrElementSize, Dest, Mask, VSIB);
}
AVX128_StoreResult_WithOpSize(Op, Op->Dest, Result);
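// Worked numbers for the gather setup above (illustrative sketch, not part of this diff;
// sizes written as plain byte counts, with 16 standing in for IR::OpSizeToSize(OpSize::i128Bit)).
#include <algorithm>
#include <cstddef>
constexpr bool NeedsHighAddrBytes(size_t DstBytes, size_t DataElemBytes, size_t AddrElemBytes) {
  const size_t NumDataElements = DstBytes / DataElemBytes;
  const size_t NumAddrElementBytes = std::min(DstBytes, NumDataElements * AddrElemBytes);
  return NumAddrElementBytes > 16;
}
// VPGATHERDD ymm: 8 dword elements need 8 dword indices (32 bytes) -> high index half required.
static_assert(NeedsHighAddrBytes(32, 4, 4));
// VPGATHERDQ ymm: 4 qword elements need only 4 dword indices (16 bytes) -> low half suffices.
static_assert(!NeedsHighAddrBytes(32, 8, 4));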
@ -2754,8 +2752,8 @@ void OpDispatchBuilder::AVX128_VPGATHER(OpcodeArgs) {
void OpDispatchBuilder::AVX128_VCVTPH2PS(OpcodeArgs) {
const auto DstSize = OpSizeFromDst(Op);
const auto SrcSize = IR::SizeToOpSize(IR::OpSizeToSize(DstSize) / 2);
const auto Is128BitSrc = SrcSize == Core::CPUState::XMM_SSE_REG_SIZE;
const auto Is128BitDst = DstSize == Core::CPUState::XMM_SSE_REG_SIZE;
const auto Is128BitSrc = SrcSize == OpSize::i128Bit;
const auto Is128BitDst = DstSize == OpSize::i128Bit;
RefPair Src {};
if (Op->Src[0].IsGPR()) {
@ -2783,7 +2781,7 @@ void OpDispatchBuilder::AVX128_VCVTPH2PS(OpcodeArgs) {
void OpDispatchBuilder::AVX128_VCVTPS2PH(OpcodeArgs) {
const auto SrcSize = OpSizeFromSrc(Op);
const auto Is128BitSrc = SrcSize == Core::CPUState::XMM_SSE_REG_SIZE;
const auto Is128BitSrc = SrcSize == OpSize::i128Bit;
const auto StoreSize = Op->Dest.IsGPR() ? OpSize::i128Bit : IR::SizeToOpSize(IR::OpSizeToSize(SrcSize) / 2);
const auto Imm8 = Op->Src[1].Literal();
@ -2814,7 +2812,7 @@ void OpDispatchBuilder::AVX128_VCVTPS2PH(OpcodeArgs) {
// We need to eliminate upper junk if we're storing into a register with
// a 256-bit source (VCVTPS2PH's destination for registers is an XMM).
if (Op->Src[0].IsGPR() && SrcSize == Core::CPUState::XMM_AVX_REG_SIZE) {
if (Op->Src[0].IsGPR() && SrcSize == OpSize::i256Bit) {
Result = AVX128_Zext(Result.Low);
}
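// Illustrative sketch, not part of this diff: the hunks above lean on a handful of
// IR::OpSize helpers (OpSizeToSize, OpSizeAsBits, NumElements, and shift operators for
// widening/narrowing). The definitions below are an assumed, minimal reconstruction for
// readers following the conversion; the real declarations live in FEXCore's IR headers
// and may differ.
#include <cstddef>
#include <cstdint>

namespace SketchIR {
enum class OpSize : uint8_t {
  iInvalid = 0,
  i8Bit = 1, // byte count as the underlying value (assumption)
  i16Bit = 2,
  i32Bit = 4,
  i64Bit = 8,
  i128Bit = 16, // matches Core::CPUState::XMM_SSE_REG_SIZE
  i256Bit = 32, // matches Core::CPUState::XMM_AVX_REG_SIZE
};

// Byte width of an operation size.
constexpr size_t OpSizeToSize(OpSize Size) {
  return static_cast<size_t>(Size);
}

// Bit width of an operation size.
constexpr size_t OpSizeAsBits(OpSize Size) {
  return OpSizeToSize(Size) * 8;
}

// Number of ElementSize elements that fit in a register of Size.
constexpr size_t NumElements(OpSize Size, OpSize ElementSize) {
  return OpSizeToSize(Size) / OpSizeToSize(ElementSize);
}

// `Size << 1` doubles a width and `Size >> 1` halves it, replacing MultiplyOpSize/DivideOpSize.
constexpr OpSize operator<<(OpSize Size, int Shift) {
  return static_cast<OpSize>(static_cast<uint8_t>(Size) << Shift);
}
constexpr OpSize operator>>(OpSize Size, int Shift) {
  return static_cast<OpSize>(static_cast<uint8_t>(Size) >> Shift);
}
} // namespace SketchIR

// With this representation the patterns used throughout the diff hold:
static_assert(SketchIR::NumElements(SketchIR::OpSize::i256Bit, SketchIR::OpSize::i32Bit) == 8);
static_assert(SketchIR::OpSizeAsBits(SketchIR::OpSize::i16Bit) == 16);
static_assert((SketchIR::OpSize::i32Bit << 1) == SketchIR::OpSize::i64Bit);
static_assert((SketchIR::OpSize::i128Bit >> 1) == SketchIR::OpSize::i64Bit);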


@ -322,7 +322,7 @@ void OpDispatchBuilder::AESEncOp(OpcodeArgs) {
void OpDispatchBuilder::VAESEncOp(OpcodeArgs) {
const auto DstSize = OpSizeFromDst(Op);
[[maybe_unused]] const auto Is128Bit = DstSize == Core::CPUState::XMM_SSE_REG_SIZE;
[[maybe_unused]] const auto Is128Bit = DstSize == OpSize::i128Bit;
// TODO: Handle 256-bit VAESENC.
LOGMAN_THROW_A_FMT(Is128Bit, "256-bit VAESENC unimplemented");
@ -343,7 +343,7 @@ void OpDispatchBuilder::AESEncLastOp(OpcodeArgs) {
void OpDispatchBuilder::VAESEncLastOp(OpcodeArgs) {
const auto DstSize = OpSizeFromDst(Op);
[[maybe_unused]] const auto Is128Bit = DstSize == Core::CPUState::XMM_SSE_REG_SIZE;
[[maybe_unused]] const auto Is128Bit = DstSize == OpSize::i128Bit;
// TODO: Handle 256-bit VAESENCLAST.
LOGMAN_THROW_A_FMT(Is128Bit, "256-bit VAESENCLAST unimplemented");
@ -364,7 +364,7 @@ void OpDispatchBuilder::AESDecOp(OpcodeArgs) {
void OpDispatchBuilder::VAESDecOp(OpcodeArgs) {
const auto DstSize = OpSizeFromDst(Op);
[[maybe_unused]] const auto Is128Bit = DstSize == Core::CPUState::XMM_SSE_REG_SIZE;
[[maybe_unused]] const auto Is128Bit = DstSize == OpSize::i128Bit;
// TODO: Handle 256-bit VAESDEC.
LOGMAN_THROW_A_FMT(Is128Bit, "256-bit VAESDEC unimplemented");
@ -385,7 +385,7 @@ void OpDispatchBuilder::AESDecLastOp(OpcodeArgs) {
void OpDispatchBuilder::VAESDecLastOp(OpcodeArgs) {
const auto DstSize = OpSizeFromDst(Op);
[[maybe_unused]] const auto Is128Bit = DstSize == Core::CPUState::XMM_SSE_REG_SIZE;
[[maybe_unused]] const auto Is128Bit = DstSize == OpSize::i128Bit;
// TODO: Handle 256-bit VAESDECLAST.
LOGMAN_THROW_A_FMT(Is128Bit, "256-bit VAESDECLAST unimplemented");


@ -139,8 +139,8 @@ Ref OpDispatchBuilder::GetPackedRFLAG(uint32_t FlagsMask) {
}
void OpDispatchBuilder::CalculateOF(IR::OpSize SrcSize, Ref Res, Ref Src1, Ref Src2, bool Sub) {
auto OpSize = SrcSize == 8 ? OpSize::i64Bit : OpSize::i32Bit;
uint64_t SignBit = (SrcSize * 8) - 1;
const auto OpSize = SrcSize == OpSize::i64Bit ? OpSize::i64Bit : OpSize::i32Bit;
const uint64_t SignBit = IR::OpSizeAsBits(SrcSize) - 1;
Ref Anded = nullptr;
// For add, OF is set iff the sources have the same sign but the destination
@ -171,7 +171,7 @@ void OpDispatchBuilder::CalculateOF(IR::OpSize SrcSize, Ref Res, Ref Src1, Ref S
}
}
SetRFLAG<FEXCore::X86State::RFLAG_OF_RAW_LOC>(Anded, SrcSize * 8 - 1, true);
SetRFLAG<FEXCore::X86State::RFLAG_OF_RAW_LOC>(Anded, SignBit, true);
}
Ref OpDispatchBuilder::LoadPFRaw(bool Mask, bool Invert) {
@ -265,7 +265,7 @@ Ref OpDispatchBuilder::IncrementByCarry(OpSize OpSize, Ref Src) {
Ref OpDispatchBuilder::CalculateFlags_ADC(IR::OpSize SrcSize, Ref Src1, Ref Src2) {
auto Zero = _InlineConstant(0);
auto One = _InlineConstant(1);
auto OpSize = SrcSize == 8 ? OpSize::i64Bit : OpSize::i32Bit;
auto OpSize = SrcSize == OpSize::i64Bit ? OpSize::i64Bit : OpSize::i32Bit;
Ref Res;
CalculateAF(Src1, Src2);
@ -277,7 +277,7 @@ Ref OpDispatchBuilder::CalculateFlags_ADC(IR::OpSize SrcSize, Ref Src1, Ref Src2
CFInverted = false;
} else {
// Need to zero-extend for correct comparisons below
Src2 = _Bfe(OpSize, SrcSize * 8, 0, Src2);
Src2 = _Bfe(OpSize, IR::OpSizeAsBits(SrcSize), 0, Src2);
// Note that we do not extend Src2PlusCF, since we depend on proper
// 32-bit arithmetic to correctly handle the Src2 = 0xffff case.
@ -285,7 +285,7 @@ Ref OpDispatchBuilder::CalculateFlags_ADC(IR::OpSize SrcSize, Ref Src1, Ref Src2
// Need to zero-extend for the comparison.
Res = _Add(OpSize, Src1, Src2PlusCF);
Res = _Bfe(OpSize, SrcSize * 8, 0, Res);
Res = _Bfe(OpSize, IR::OpSizeAsBits(SrcSize), 0, Res);
// TODO: We can fold that second Bfe in (cmp uxth).
auto SelectCFInv = _Select(FEXCore::IR::COND_UGE, Res, Src2PlusCF, One, Zero);
@ -302,7 +302,7 @@ Ref OpDispatchBuilder::CalculateFlags_ADC(IR::OpSize SrcSize, Ref Src1, Ref Src2
Ref OpDispatchBuilder::CalculateFlags_SBB(IR::OpSize SrcSize, Ref Src1, Ref Src2) {
auto Zero = _InlineConstant(0);
auto One = _InlineConstant(1);
auto OpSize = SrcSize == 8 ? OpSize::i64Bit : OpSize::i32Bit;
auto OpSize = SrcSize == OpSize::i64Bit ? OpSize::i64Bit : OpSize::i32Bit;
CalculateAF(Src1, Src2);
@ -316,13 +316,13 @@ Ref OpDispatchBuilder::CalculateFlags_SBB(IR::OpSize SrcSize, Ref Src1, Ref Src2
CFInverted = true;
} else {
// Zero extend for correct comparison behaviour with Src1 = 0xffff.
Src1 = _Bfe(OpSize, SrcSize * 8, 0, Src1);
Src2 = _Bfe(OpSize, SrcSize * 8, 0, Src2);
Src1 = _Bfe(OpSize, IR::OpSizeAsBits(SrcSize), 0, Src1);
Src2 = _Bfe(OpSize, IR::OpSizeAsBits(SrcSize), 0, Src2);
auto Src2PlusCF = IncrementByCarry(OpSize, Src2);
Res = _Sub(OpSize, Src1, Src2PlusCF);
Res = _Bfe(OpSize, SrcSize * 8, 0, Res);
Res = _Bfe(OpSize, IR::OpSizeAsBits(SrcSize), 0, Res);
auto SelectCFInv = _Select(FEXCore::IR::COND_UGE, Src1, Src2PlusCF, One, Zero);
@ -345,9 +345,9 @@ Ref OpDispatchBuilder::CalculateFlags_SUB(IR::OpSize SrcSize, Ref Src1, Ref Src2
Ref Res;
if (SrcSize >= OpSize::i32Bit) {
Res = _SubWithFlags(IR::SizeToOpSize(SrcSize), Src1, Src2);
Res = _SubWithFlags(SrcSize, Src1, Src2);
} else {
_SubNZCV(IR::SizeToOpSize(SrcSize), Src1, Src2);
_SubNZCV(SrcSize, Src1, Src2);
Res = _Sub(OpSize::i32Bit, Src1, Src2);
}
@ -375,9 +375,9 @@ Ref OpDispatchBuilder::CalculateFlags_ADD(IR::OpSize SrcSize, Ref Src1, Ref Src2
Ref Res;
if (SrcSize >= OpSize::i32Bit) {
Res = _AddWithFlags(IR::SizeToOpSize(SrcSize), Src1, Src2);
Res = _AddWithFlags(SrcSize, Src1, Src2);
} else {
_AddNZCV(IR::SizeToOpSize(SrcSize), Src1, Src2);
_AddNZCV(SrcSize, Src1, Src2);
Res = _Add(OpSize::i32Bit, Src1, Src2);
}
@ -400,7 +400,7 @@ void OpDispatchBuilder::CalculateFlags_MUL(IR::OpSize SrcSize, Ref Res, Ref High
// CF and OF are set if the result of the operation can't be fit in to the destination register
// If the value can fit then the top bits will be zero
auto SignBit = _Sbfe(OpSize::i64Bit, 1, SrcSize * 8 - 1, Res);
auto SignBit = _Sbfe(OpSize::i64Bit, 1, IR::OpSizeAsBits(SrcSize) - 1, Res);
_SubNZCV(OpSize::i64Bit, High, SignBit);
// If High = SignBit, then sets to nZCv. Else sets to nzcV. Since SF/ZF
@ -415,7 +415,7 @@ void OpDispatchBuilder::CalculateFlags_UMUL(Ref High) {
InvalidatePF_AF();
auto Zero = _InlineConstant(0);
OpSize Size = IR::SizeToOpSize(GetOpSize(High));
const auto Size = GetOpSize(High);
// CF and OF are set if the result of the operation can't be fit in to the destination register
// The result register will be all zero if it can't fit due to how multiplication behaves
@ -442,7 +442,7 @@ void OpDispatchBuilder::CalculateFlags_ShiftLeftImmediate(IR::OpSize SrcSize, Re
return;
}
auto OpSize = SrcSize == 8 ? OpSize::i64Bit : OpSize::i32Bit;
auto OpSize = SrcSize == OpSize::i64Bit ? OpSize::i64Bit : OpSize::i32Bit;
SetNZ_ZeroCV(SrcSize, UnmaskedRes);
@ -451,7 +451,7 @@ void OpDispatchBuilder::CalculateFlags_ShiftLeftImmediate(IR::OpSize SrcSize, Re
// Extract the last bit shifted in to CF. Shift is already masked, but for
// 8/16-bit it might be >= SrcSizeBits, in which case CF is cleared. There's
// nothing to do in that case since we already cleared CF above.
auto SrcSizeBits = SrcSize * 8;
const auto SrcSizeBits = IR::OpSizeAsBits(SrcSize);
if (Shift < SrcSizeBits) {
SetCFDirect(Src1, SrcSizeBits - Shift, true);
}
@ -464,7 +464,7 @@ void OpDispatchBuilder::CalculateFlags_ShiftLeftImmediate(IR::OpSize SrcSize, Re
// In the case of left shift. OF is only set from the result of <Top Source Bit> XOR <Top Result Bit>
if (Shift == 1) {
auto Xor = _Xor(OpSize, UnmaskedRes, Src1);
SetRFLAG<FEXCore::X86State::RFLAG_OF_RAW_LOC>(Xor, SrcSize * 8 - 1, true);
SetRFLAG<FEXCore::X86State::RFLAG_OF_RAW_LOC>(Xor, IR::OpSizeAsBits(SrcSize) - 1, true);
} else {
// Undefined, we choose to zero as part of SetNZ_ZeroCV
}
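// Worked example of the shift-by-1 OF rule above (sketch, plain integers, not part of
// this diff): OF is the XOR of the operand's and the unmasked result's top bits.
constexpr bool ShlByOneOF(unsigned Src, unsigned Bits) {
  const unsigned UnmaskedRes = Src << 1;
  return (((UnmaskedRes ^ Src) >> (Bits - 1)) & 1u) != 0;
}
static_assert(ShlByOneOF(0x40, 8));   // 0x40 << 1 = 0x80: sign bit flips, signed overflow
static_assert(ShlByOneOF(0x80, 8));   // 0x80 << 1 = 0x100: carries out, sign clears -> OF set
static_assert(!ShlByOneOF(0x20, 8));  // 0x20 << 1 = 0x40: no sign change, no overflow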
@ -515,7 +515,7 @@ void OpDispatchBuilder::CalculateFlags_ShiftRightImmediate(IR::OpSize SrcSize, R
// Only defined when Shift is 1 else undefined
// Is set to the MSB of the original value
if (Shift == 1) {
SetRFLAG<FEXCore::X86State::RFLAG_OF_RAW_LOC>(Src1, SrcSize * 8 - 1, true);
SetRFLAG<FEXCore::X86State::RFLAG_OF_RAW_LOC>(Src1, IR::OpSizeAsBits(SrcSize) - 1, true);
}
}
}
@ -526,7 +526,7 @@ void OpDispatchBuilder::CalculateFlags_ShiftRightDoubleImmediate(IR::OpSize SrcS
return;
}
const auto OpSize = SrcSize == 8 ? OpSize::i64Bit : OpSize::i32Bit;
const auto OpSize = SrcSize == OpSize::i64Bit ? OpSize::i64Bit : OpSize::i32Bit;
CalculateFlags_ShiftRightImmediateCommon(SrcSize, Res, Src1, Shift);
// OF
@ -536,7 +536,7 @@ void OpDispatchBuilder::CalculateFlags_ShiftRightDoubleImmediate(IR::OpSize SrcS
// XOR of Result and Src1
if (Shift == 1) {
auto val = _Xor(OpSize, Src1, Res);
SetRFLAG<FEXCore::X86State::RFLAG_OF_RAW_LOC>(val, SrcSize * 8 - 1, true);
SetRFLAG<FEXCore::X86State::RFLAG_OF_RAW_LOC>(val, IR::OpSizeAsBits(SrcSize) - 1, true);
}
}
}
@ -549,7 +549,7 @@ void OpDispatchBuilder::CalculateFlags_ZCNT(IR::OpSize SrcSize, Ref Result) {
// Now set CF if the Result = SrcSize * 8. Since SrcSize is a power-of-two and
// Result is <= SrcSize * 8, we equivalently check if the log2(SrcSize * 8)
// bit is set. No masking is needed because no higher bits could be set.
unsigned CarryBit = FEXCore::ilog2(SrcSize * 8u);
unsigned CarryBit = FEXCore::ilog2(IR::OpSizeAsBits(SrcSize));
SetCFDirect(Result, CarryBit);
}
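// Concrete bit positions derived from OpSizeAsBits in the flag code above (sketch, not
// part of this diff). For a 16-bit operation the OF/SF source bit is bit 15, and the
// LZCNT/TZCNT carry check tests bit log2(16) = 4, i.e. exactly the
// "Result == operand width" case.
#include <bit>
#include <cstddef>
constexpr size_t SignBitPos(size_t OpBytes) { return OpBytes * 8 - 1; }
constexpr size_t ZcntCarryBit(size_t OpBytes) { return std::countr_zero(OpBytes * 8); }
static_assert(SignBitPos(2) == 15 && ZcntCarryBit(2) == 4); // 16-bit
static_assert(SignBitPos(4) == 31 && ZcntCarryBit(4) == 5); // 32-bit
static_assert(SignBitPos(8) == 63 && ZcntCarryBit(8) == 6); // 64-bit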


@ -418,7 +418,7 @@ void OpDispatchBuilder::InsertMMX_To_XMM_Vector_CVT_Int_To_Float(OpcodeArgs) {
// Always 32-bit.
const auto ElementSize = OpSize::i32Bit;
// Always signed
Dest = _VSToFVectorInsert(IR::SizeToOpSize(DstSize), ElementSize, ElementSize, Dest, Src, true, false);
Dest = _VSToFVectorInsert(DstSize, ElementSize, ElementSize, Dest, Src, true, false);
StoreResult_WithOpSize(FPRClass, Op, Op->Dest, Dest, DstSize, OpSize::iInvalid);
}
@ -482,7 +482,7 @@ Ref OpDispatchBuilder::InsertScalar_CVT_Float_To_FloatImpl(OpcodeArgs, IR::OpSiz
Ref Src1 = LoadSource_WithOpSize(FPRClass, Op, Src1Op, DstSize, Op->Flags);
Ref Src2 = LoadSource_WithOpSize(FPRClass, Op, Src2Op, SrcSize, Op->Flags, {.AllowUpperGarbage = true});
return _VFToFScalarInsert(IR::SizeToOpSize(DstSize), DstElementSize, SrcElementSize, Src1, Src2, ZeroUpperBits);
return _VFToFScalarInsert(DstSize, DstElementSize, SrcElementSize, Src1, Src2, ZeroUpperBits);
}
template<IR::OpSize DstElementSize, IR::OpSize SrcElementSize>
@ -530,7 +530,7 @@ Ref OpDispatchBuilder::InsertScalarRoundImpl(OpcodeArgs, IR::OpSize DstSize, IR:
Ref Src2 = LoadSource_WithOpSize(FPRClass, Op, Src2Op, SrcSize, Op->Flags, {.AllowUpperGarbage = true});
const auto SourceMode = TranslateRoundType(Mode);
auto ALUOp = _VFToIScalarInsert(IR::SizeToOpSize(DstSize), ElementSize, Src1, Src2, SourceMode, ZeroUpperBits);
auto ALUOp = _VFToIScalarInsert(DstSize, ElementSize, Src1, Src2, SourceMode, ZeroUpperBits);
return ALUOp;
}
@ -600,7 +600,7 @@ void OpDispatchBuilder::InsertScalarFCMPOp(OpcodeArgs) {
Ref Src1 = LoadSource_WithOpSize(FPRClass, Op, Op->Dest, DstSize, Op->Flags);
Ref Src2 = LoadSource_WithOpSize(FPRClass, Op, Op->Src[0], SrcSize, Op->Flags, {.AllowUpperGarbage = true});
Ref Result = InsertScalarFCMPOpImpl(IR::SizeToOpSize(DstSize), OpSizeFromDst(Op), ElementSize, Src1, Src2, CompType, false);
Ref Result = InsertScalarFCMPOpImpl(DstSize, OpSizeFromDst(Op), ElementSize, Src1, Src2, CompType, false);
StoreResult_WithOpSize(FPRClass, Op, Op->Dest, Result, DstSize, OpSize::iInvalid);
}
@ -619,7 +619,7 @@ void OpDispatchBuilder::AVXInsertScalarFCMPOp(OpcodeArgs) {
Ref Src1 = LoadSource_WithOpSize(FPRClass, Op, Op->Src[0], DstSize, Op->Flags);
Ref Src2 = LoadSource_WithOpSize(FPRClass, Op, Op->Src[1], SrcSize, Op->Flags, {.AllowUpperGarbage = true});
Ref Result = InsertScalarFCMPOpImpl(IR::SizeToOpSize(DstSize), OpSizeFromDst(Op), ElementSize, Src1, Src2, CompType, true);
Ref Result = InsertScalarFCMPOpImpl(DstSize, OpSizeFromDst(Op), ElementSize, Src1, Src2, CompType, true);
StoreResult_WithOpSize(FPRClass, Op, Op->Dest, Result, DstSize, OpSize::iInvalid);
}
@ -741,10 +741,10 @@ void OpDispatchBuilder::MOVMSKOp(OpcodeArgs, IR::OpSize ElementSize) {
for (unsigned i = 0; i < NumElements; ++i) {
// Extract the top bit of the element
Ref Tmp = _VExtractToGPR(Size, ElementSize, Src, i);
Tmp = _Bfe(IR::SizeToOpSize(ElementSize), 1, ElementSize * 8 - 1, Tmp);
Tmp = _Bfe(ElementSize, 1, IR::OpSizeAsBits(ElementSize) - 1, Tmp);
// Shift it to the correct location
Tmp = _Lshl(IR::SizeToOpSize(ElementSize), Tmp, _Constant(i));
Tmp = _Lshl(ElementSize, Tmp, _Constant(i));
// Or it with the current value
CurrentVal = _Or(OpSize::i64Bit, CurrentVal, Tmp);
@ -755,7 +755,7 @@ void OpDispatchBuilder::MOVMSKOp(OpcodeArgs, IR::OpSize ElementSize) {
void OpDispatchBuilder::MOVMSKOpOne(OpcodeArgs) {
const auto SrcSize = OpSizeFromSrc(Op);
const auto Is256Bit = SrcSize == Core::CPUState::XMM_AVX_REG_SIZE;
const auto Is256Bit = SrcSize == OpSize::i256Bit;
const auto ExtractSize = Is256Bit ? OpSize::i32Bit : OpSize::i16Bit;
Ref Src = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
@ -767,7 +767,7 @@ void OpDispatchBuilder::MOVMSKOpOne(OpcodeArgs) {
// Since we also handle the MM MOVMSKB here too,
// we need to clamp the lower bound.
const auto VAdd1Size = std::max(SrcSize, OpSize::i128Bit);
const auto VAdd2Size = std::max(IR::DivideOpSize(SrcSize, 2), OpSize::i64Bit);
const auto VAdd2Size = std::max(SrcSize >> 1, OpSize::i64Bit);
auto VAdd1 = _VAddP(VAdd1Size, OpSize::i8Bit, VAnd, VAnd);
auto VAdd2 = _VAddP(VAdd2Size, OpSize::i8Bit, VAdd1, VAdd1);
@ -790,7 +790,7 @@ void OpDispatchBuilder::PUNPCKLOp(OpcodeArgs, IR::OpSize ElementSize) {
void OpDispatchBuilder::VPUNPCKLOp(OpcodeArgs, IR::OpSize ElementSize) {
const auto SrcSize = OpSizeFromSrc(Op);
const auto Is128Bit = SrcSize == Core::CPUState::XMM_SSE_REG_SIZE;
const auto Is128Bit = SrcSize == OpSize::i128Bit;
Ref Src1 = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
Ref Src2 = LoadSource(FPRClass, Op, Op->Src[1], Op->Flags);
@ -819,8 +819,7 @@ void OpDispatchBuilder::PUNPCKHOp(OpcodeArgs, IR::OpSize ElementSize) {
void OpDispatchBuilder::VPUNPCKHOp(OpcodeArgs, IR::OpSize ElementSize) {
const auto SrcSize = OpSizeFromSrc(Op);
const auto Is128Bit = SrcSize == Core::CPUState::XMM_SSE_REG_SIZE;
const auto Is128Bit = SrcSize == OpSize::i128Bit;
Ref Src1 = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
Ref Src2 = LoadSource(FPRClass, Op, Op->Src[1], Op->Flags);
@ -852,7 +851,7 @@ Ref OpDispatchBuilder::GeneratePSHUFBMask(IR::OpSize SrcSize) {
}
Ref OpDispatchBuilder::PSHUFBOpImpl(IR::OpSize SrcSize, Ref Src1, Ref Src2, Ref MaskVector) {
const auto Is256Bit = SrcSize == Core::CPUState::XMM_AVX_REG_SIZE;
const auto Is256Bit = SrcSize == OpSize::i256Bit;
// We perform the 256-bit version as two 128-bit operations due to
// the lane splitting behavior, so cap the maximum size at 16.
@ -1173,7 +1172,7 @@ void OpDispatchBuilder::PSHUFDOp(OpcodeArgs) {
void OpDispatchBuilder::VPSHUFWOp(OpcodeArgs, IR::OpSize ElementSize, bool Low) {
const auto SrcSize = OpSizeFromSrc(Op);
const auto Is256Bit = SrcSize == Core::CPUState::XMM_AVX_REG_SIZE;
const auto Is256Bit = SrcSize == OpSize::i256Bit;
auto Shuffle = Op->Src[1].Literal();
Ref Src = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
@ -1195,7 +1194,7 @@ void OpDispatchBuilder::VPSHUFWOp(OpcodeArgs, IR::OpSize ElementSize, bool Low)
if (Is256Bit) {
for (size_t i = 0; i < 4; i++) {
const auto Index = Shuffle & 0b11;
const auto UpperLaneOffset = Core::CPUState::XMM_SSE_REG_SIZE / ElementSize;
const auto UpperLaneOffset = IR::NumElements(OpSize::i128Bit, ElementSize);
const auto LowDstIndex = BaseElement + i;
const auto LowSrcIndex = BaseElement + Index;
@ -1224,10 +1223,10 @@ Ref OpDispatchBuilder::SHUFOpImpl(OpcodeArgs, IR::OpSize DstSize, IR::OpSize Ele
// Since 256-bit variants and up don't lane cross, we can construct
// everything in terms of the 128-variant, as each lane is essentially
// its own 128-bit segment.
const uint8_t NumElements = Core::CPUState::XMM_SSE_REG_SIZE / ElementSize;
const uint8_t NumElements = IR::NumElements(OpSize::i128Bit, ElementSize);
const uint8_t HalfNumElements = NumElements >> 1;
const bool Is256Bit = DstSize == Core::CPUState::XMM_AVX_REG_SIZE;
const bool Is256Bit = DstSize == OpSize::i256Bit;
std::array<Ref, 4> Srcs {};
for (size_t i = 0; i < HalfNumElements; ++i) {
@ -1248,7 +1247,7 @@ Ref OpDispatchBuilder::SHUFOpImpl(OpcodeArgs, IR::OpSize DstSize, IR::OpSize Ele
// AVX differs the behavior of VSHUFPD and VSHUFPS.
// The same immediate bits are used for both lanes with VSHUFPS,
// but VSHUFPD uses different immediate bits for each lane.
const auto SrcIndex2 = ElementSize == 4 ? SrcIndex1 : ((Shuffle >> 2) & SelectionMask);
const auto SrcIndex2 = ElementSize == OpSize::i32Bit ? SrcIndex1 : ((Shuffle >> 2) & SelectionMask);
Ref Insert = _VInsElement(DstSize, ElementSize, Element, SrcIndex1, Dest, Srcs[Element]);
Dest = _VInsElement(DstSize, ElementSize, Element + NumElements, SrcIndex2 + NumElements, Insert, Srcs[Element]);
@ -1442,7 +1441,7 @@ void OpDispatchBuilder::VANDNOp(OpcodeArgs) {
template<IROps IROp, IR::OpSize ElementSize>
void OpDispatchBuilder::VHADDPOp(OpcodeArgs) {
const auto SrcSize = OpSizeFromSrc(Op);
const auto Is256Bit = SrcSize == Core::CPUState::XMM_AVX_REG_SIZE;
const auto Is256Bit = SrcSize == OpSize::i256Bit;
Ref Src1 = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
Ref Src2 = LoadSource(FPRClass, Op, Op->Src[1], Op->Flags);
@ -1485,7 +1484,7 @@ void OpDispatchBuilder::VBROADCASTOp(OpcodeArgs, IR::OpSize ElementSize) {
Ref OpDispatchBuilder::PINSROpImpl(OpcodeArgs, IR::OpSize ElementSize, const X86Tables::DecodedOperand& Src1Op,
const X86Tables::DecodedOperand& Src2Op, const X86Tables::DecodedOperand& Imm) {
const auto Size = OpSizeFromDst(Op);
const auto NumElements = Size / ElementSize;
const auto NumElements = IR::NumElements(Size, ElementSize);
const uint64_t Index = Imm.Literal() & (NumElements - 1);
Ref Src1 = LoadSource_WithOpSize(FPRClass, Op, Src1Op, Size, Op->Flags);
@ -1608,7 +1607,7 @@ void OpDispatchBuilder::PExtrOp(OpcodeArgs, IR::OpSize ElementSize) {
}
// AVX version only operates on 128-bit.
const uint8_t NumElements = std::min<uint8_t>(GetSrcSize(Op), OpSize::i128Bit) / OverridenElementSize;
const uint8_t NumElements = IR::NumElements(std::min(OpSizeFromSrc(Op), OpSize::i128Bit), OverridenElementSize);
Index &= NumElements - 1;
if (Op->Dest.IsGPR()) {
@ -1649,8 +1648,8 @@ void OpDispatchBuilder::VEXTRACT128Op(OpcodeArgs) {
Ref OpDispatchBuilder::PSIGNImpl(OpcodeArgs, IR::OpSize ElementSize, Ref Src1, Ref Src2) {
const auto Size = OpSizeFromSrc(Op);
Ref Control = _VSQSHL(Size, ElementSize, Src2, (ElementSize * 8) - 1);
Control = _VSRSHR(Size, ElementSize, Control, (ElementSize * 8) - 1);
Ref Control = _VSQSHL(Size, ElementSize, Src2, IR::OpSizeAsBits(ElementSize) - 1);
Control = _VSRSHR(Size, ElementSize, Control, IR::OpSizeAsBits(ElementSize) - 1);
return _VMul(Size, ElementSize, Src1, Control);
}
@ -1725,7 +1724,7 @@ void OpDispatchBuilder::PSRLI(OpcodeArgs, IR::OpSize ElementSize) {
void OpDispatchBuilder::VPSRLIOp(OpcodeArgs, IR::OpSize ElementSize) {
const auto Size = OpSizeFromSrc(Op);
const auto Is128Bit = Size == Core::CPUState::XMM_SSE_REG_SIZE;
const auto Is128Bit = Size == OpSize::i128Bit;
const uint64_t ShiftConstant = Op->Src[1].Literal();
Ref Src = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
@ -1848,7 +1847,7 @@ void OpDispatchBuilder::PSRLDQ(OpcodeArgs) {
Ref Dest = LoadSource(FPRClass, Op, Op->Dest, Op->Flags);
Ref Result = LoadZeroVector(Size);
if (Shift < Size) {
if (Shift < IR::OpSizeToSize(Size)) {
Result = _VExtr(Size, OpSize::i8Bit, Result, Dest, Shift);
}
StoreResult(FPRClass, Op, Result, OpSize::iInvalid);
@ -1856,7 +1855,7 @@ void OpDispatchBuilder::PSRLDQ(OpcodeArgs) {
void OpDispatchBuilder::VPSRLDQOp(OpcodeArgs) {
const auto DstSize = OpSizeFromDst(Op);
const auto Is128Bit = DstSize == Core::CPUState::XMM_SSE_REG_SIZE;
const auto Is128Bit = DstSize == OpSize::i128Bit;
const uint64_t Shift = Op->Src[1].Literal();
Ref Src = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
@ -1872,7 +1871,7 @@ void OpDispatchBuilder::VPSRLDQOp(OpcodeArgs) {
Result = LoadZeroVector(DstSize);
if (Is128Bit) {
if (Shift < DstSize) {
if (Shift < IR::OpSizeToSize(DstSize)) {
Result = _VExtr(DstSize, OpSize::i8Bit, Result, Src, Shift);
}
} else {
@ -1899,8 +1898,8 @@ void OpDispatchBuilder::PSLLDQ(OpcodeArgs) {
Ref Dest = LoadSource(FPRClass, Op, Op->Dest, Op->Flags);
Ref Result = LoadZeroVector(Size);
if (Shift < Size) {
Result = _VExtr(Size, OpSize::i8Bit, Dest, Result, Size - Shift);
if (Shift < IR::OpSizeToSize(Size)) {
Result = _VExtr(Size, OpSize::i8Bit, Dest, Result, IR::OpSizeToSize(Size) - Shift);
}
StoreResult(FPRClass, Op, Result, OpSize::iInvalid);
@ -1908,7 +1907,8 @@ void OpDispatchBuilder::PSLLDQ(OpcodeArgs) {
void OpDispatchBuilder::VPSLLDQOp(OpcodeArgs) {
const auto DstSize = OpSizeFromDst(Op);
const auto Is128Bit = DstSize == Core::CPUState::XMM_SSE_REG_SIZE;
const auto DstSizeInt = IR::OpSizeToSize(DstSize);
const auto Is128Bit = DstSize == OpSize::i128Bit;
const uint64_t Shift = Op->Src[1].Literal();
Ref Src = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
@ -1922,13 +1922,13 @@ void OpDispatchBuilder::VPSLLDQOp(OpcodeArgs) {
} else {
Result = LoadZeroVector(DstSize);
if (Is128Bit) {
if (Shift < DstSize) {
Result = _VExtr(DstSize, OpSize::i8Bit, Src, Result, DstSize - Shift);
if (Shift < DstSizeInt) {
Result = _VExtr(DstSize, OpSize::i8Bit, Src, Result, DstSizeInt - Shift);
}
} else {
if (Shift < Core::CPUState::XMM_SSE_REG_SIZE) {
Ref ResultBottom = _VExtr(OpSize::i128Bit, OpSize::i8Bit, Src, Result, 16 - Shift);
Ref ResultTop = _VExtr(DstSize, OpSize::i8Bit, Src, Result, DstSize - Shift);
Ref ResultTop = _VExtr(DstSize, OpSize::i8Bit, Src, Result, DstSizeInt - Shift);
Result = _VInsElement(DstSize, OpSize::i128Bit, 1, 0, ResultBottom, ResultTop);
}
@ -1954,7 +1954,7 @@ void OpDispatchBuilder::PSRAIOp(OpcodeArgs, IR::OpSize ElementSize) {
void OpDispatchBuilder::VPSRAIOp(OpcodeArgs, IR::OpSize ElementSize) {
const uint64_t Shift = Op->Src[1].Literal();
const auto Size = OpSizeFromDst(Op);
const auto Is128Bit = Size == Core::CPUState::XMM_SSE_REG_SIZE;
const auto Is128Bit = Size == OpSize::i128Bit;
Ref Src = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
Ref Result = Src;
@ -2008,7 +2008,7 @@ void OpDispatchBuilder::MOVDDUPOp(OpcodeArgs) {
void OpDispatchBuilder::VMOVDDUPOp(OpcodeArgs) {
const auto SrcSize = OpSizeFromSrc(Op);
const auto IsSrcGPR = Op->Src[0].IsGPR();
const auto Is256Bit = SrcSize == Core::CPUState::XMM_AVX_REG_SIZE;
const auto Is256Bit = SrcSize == OpSize::i256Bit;
const auto MemSize = Is256Bit ? OpSize::i256Bit : OpSize::i64Bit;
Ref Src = IsSrcGPR ? LoadSource_WithOpSize(FPRClass, Op, Op->Src[0], SrcSize, Op->Flags) :
@ -2112,7 +2112,7 @@ Ref OpDispatchBuilder::Vector_CVT_Int_To_FloatImpl(OpcodeArgs, IR::OpSize SrcEle
auto ElementSize = SrcElementSize;
if (Widen) {
Src = _VSXTL(Size, ElementSize, Src);
ElementSize = IR::MultiplyOpSize(ElementSize, 2);
ElementSize = ElementSize << 1;
}
return _Vector_SToF(Size, ElementSize, Src);
@ -2143,8 +2143,8 @@ Ref OpDispatchBuilder::Vector_CVT_Float_To_IntImpl(OpcodeArgs, IR::OpSize SrcEle
Ref Src = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
if (Narrow) {
Src = _Vector_FToF(DstSize, IR::DivideOpSize(SrcElementSize, 2), Src, SrcElementSize);
ElementSize = IR::DivideOpSize(ElementSize, 2);
Src = _Vector_FToF(DstSize, SrcElementSize >> 1, Src, SrcElementSize);
ElementSize = ElementSize >> 1;
}
if (HostRoundingMode) {
@ -2236,17 +2236,17 @@ void OpDispatchBuilder::Vector_CVT_Float_To_Float(OpcodeArgs, IR::OpSize DstElem
const auto SrcSize = OpSizeFromSrc(Op);
const auto IsFloatSrc = SrcElementSize == OpSize::i32Bit;
const auto Is128Bit = SrcSize == Core::CPUState::XMM_SSE_REG_SIZE;
const auto Is128Bit = SrcSize == OpSize::i128Bit;
const auto LoadSize = IsFloatSrc && !Op->Src[0].IsGPR() ? IR::SizeToOpSize(IR::OpSizeToSize(SrcSize) / 2) : SrcSize;
const auto LoadSize = IsFloatSrc && !Op->Src[0].IsGPR() ? (SrcSize >> 1) : SrcSize;
Ref Src = LoadSource_WithOpSize(FPRClass, Op, Op->Src[0], LoadSize, Op->Flags);
Ref Result {};
if (DstElementSize > SrcElementSize) {
Result = _Vector_FToF(SrcSize, IR::MultiplyOpSize(SrcElementSize, 2), Src, SrcElementSize);
Result = _Vector_FToF(SrcSize, SrcElementSize << 1, Src, SrcElementSize);
} else {
Result = _Vector_FToF(SrcSize, IR::DivideOpSize(SrcElementSize, 2), Src, SrcElementSize);
Result = _Vector_FToF(SrcSize, SrcElementSize >> 1, Src, SrcElementSize);
}
if (IsAVX) {
@ -2269,7 +2269,7 @@ void OpDispatchBuilder::MMX_To_XMM_Vector_CVT_Int_To_Float(OpcodeArgs) {
const auto DstSize = OpSizeFromDst(Op);
Src = _VSXTL(DstSize, ElementSize, Src);
ElementSize = IR::MultiplyOpSize(ElementSize, 2);
ElementSize = ElementSize << 1;
// Always signed
Src = _Vector_SToF(DstSize, ElementSize, Src);
@ -2294,8 +2294,8 @@ void OpDispatchBuilder::XMM_To_MMX_Vector_CVT_Float_To_Int(OpcodeArgs) {
const auto Size = OpSizeFromDst(Op);
if (Narrow) {
Src = _Vector_FToF(Size, IR::DivideOpSize(SrcElementSize, 2), Src, SrcElementSize);
ElementSize = IR::DivideOpSize(ElementSize, 2);
Src = _Vector_FToF(Size, SrcElementSize >> 1, Src, SrcElementSize);
ElementSize = ElementSize >> 1;
}
if constexpr (HostRoundingMode) {
@ -2816,7 +2816,7 @@ Ref OpDispatchBuilder::PALIGNROpImpl(OpcodeArgs, const X86Tables::DecodedOperand
const auto DstSize = OpSizeFromDst(Op);
const auto SanitizedDstSize = std::min(DstSize, OpSize::i128Bit);
const auto Is256Bit = DstSize == Core::CPUState::XMM_AVX_REG_SIZE;
const auto Is256Bit = DstSize == OpSize::i256Bit;
const auto Index = Imm.Literal();
Ref Src2Node = LoadSource(FPRClass, Op, Src2, Op->Flags);
@ -2830,7 +2830,7 @@ Ref OpDispatchBuilder::PALIGNROpImpl(OpcodeArgs, const X86Tables::DecodedOperand
}
Ref Src1Node = LoadSource(FPRClass, Op, Src1, Op->Flags);
if (Index >= (SanitizedDstSize * 2)) {
if (Index >= (IR::OpSizeToSize(SanitizedDstSize) * 2)) {
// If the immediate is greater than both vectors combined then it zeroes the vector
return LoadZeroVector(DstSize);
}
@ -2891,7 +2891,7 @@ template void OpDispatchBuilder::PACKUSOp<OpSize::i32Bit>(OpcodeArgs);
void OpDispatchBuilder::VPACKUSOp(OpcodeArgs, IR::OpSize ElementSize) {
const auto DstSize = OpSizeFromDst(Op);
const auto Is256Bit = DstSize == Core::CPUState::XMM_AVX_REG_SIZE;
const auto Is256Bit = DstSize == OpSize::i256Bit;
Ref Src1 = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
Ref Src2 = LoadSource(FPRClass, Op, Op->Src[1], Op->Flags);
@ -2919,7 +2919,7 @@ template void OpDispatchBuilder::PACKSSOp<OpSize::i32Bit>(OpcodeArgs);
void OpDispatchBuilder::VPACKSSOp(OpcodeArgs, IR::OpSize ElementSize) {
const auto DstSize = OpSizeFromDst(Op);
const auto Is256Bit = DstSize == Core::CPUState::XMM_AVX_REG_SIZE;
const auto Is256Bit = DstSize == OpSize::i256Bit;
Ref Src1 = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
Ref Src2 = LoadSource(FPRClass, Op, Op->Src[1], Op->Flags);
@ -2954,7 +2954,7 @@ Ref OpDispatchBuilder::PMULLOpImpl(OpSize Size, IR::OpSize ElementSize, bool Sig
template<IR::OpSize ElementSize, bool Signed>
void OpDispatchBuilder::PMULLOp(OpcodeArgs) {
static_assert(ElementSize == sizeof(uint32_t), "Currently only handles 32-bit -> 64-bit");
static_assert(ElementSize == OpSize::i32Bit, "Currently only handles 32-bit -> 64-bit");
Ref Src1 = LoadSource(FPRClass, Op, Op->Dest, Op->Flags);
Ref Src2 = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
@ -2968,7 +2968,7 @@ template void OpDispatchBuilder::PMULLOp<OpSize::i32Bit, true>(OpcodeArgs);
template<IR::OpSize ElementSize, bool Signed>
void OpDispatchBuilder::VPMULLOp(OpcodeArgs) {
static_assert(ElementSize == sizeof(uint32_t), "Currently only handles 32-bit -> 64-bit");
static_assert(ElementSize == OpSize::i32Bit, "Currently only handles 32-bit -> 64-bit");
Ref Src1 = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
Ref Src2 = LoadSource(FPRClass, Op, Op->Src[1], Op->Flags);
@ -3124,15 +3124,15 @@ void OpDispatchBuilder::PMULHRWOp(OpcodeArgs) {
// Implementation is more efficient for 8byte registers
// Multiplies 4 16bit values in to 4 32bit values
Res = _VSMull(IR::MultiplyOpSize(Size, 2), OpSize::i16Bit, Dest, Src);
Res = _VSMull(Size << 1, OpSize::i16Bit, Dest, Src);
// Load 0x0000_8000 in to each 32-bit element.
Ref VConstant = _VectorImm(OpSize::i128Bit, OpSize::i32Bit, 0x80, 8);
Res = _VAdd(IR::MultiplyOpSize(Size, 2), OpSize::i32Bit, Res, VConstant);
Res = _VAdd(Size << 1, OpSize::i32Bit, Res, VConstant);
// Now shift and narrow to convert 32-bit values to 16bit, storing the top 16bits
Res = _VUShrNI(IR::MultiplyOpSize(Size, 2), OpSize::i32Bit, Res, 16);
Res = _VUShrNI(Size << 1, OpSize::i32Bit, Res, 16);
StoreResult(FPRClass, Op, Res, OpSize::iInvalid);
}
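// Scalar restatement of the widen/round/narrow sequence above (sketch, not part of this
// diff): PMULHRW keeps the high 16 bits of the signed 32-bit product after adding the
// 0x0000'8000 rounding constant, which is what the `Size << 1` widening multiply, the
// _VectorImm add, and the shift-right-narrow by 16 implement per lane.
#include <cstdint>
constexpr int16_t PMulHRW(int16_t a, int16_t b) {
  const int32_t Product = int32_t {a} * int32_t {b};
  return static_cast<int16_t>((Product + 0x8000) >> 16);
}
static_assert(PMulHRW(0x4000, 0x4000) == 0x1000);
static_assert(PMulHRW(0x7FFF, 0x7FFF) == 0x3FFF);
static_assert(PMulHRW(0x0001, 0x0001) == 0x0000);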
@ -3177,7 +3177,7 @@ Ref OpDispatchBuilder::PMADDWDOpImpl(IR::OpSize Size, Ref Src1, Ref Src2) {
if (Size == OpSize::i64Bit) {
// MMX implementation can be slightly more optimal
Size = IR::DivideOpSize(Size, 2);
Size = Size >> 1;
auto MullResult = _VSMull(Size, OpSize::i16Bit, Src1, Src2);
return _VAddP(Size, OpSize::i32Bit, MullResult, MullResult);
}
@ -3211,7 +3211,7 @@ void OpDispatchBuilder::VPMADDWDOp(OpcodeArgs) {
Ref OpDispatchBuilder::PMADDUBSWOpImpl(IR::OpSize Size, Ref Src1, Ref Src2) {
if (Size == OpSize::i64Bit) {
const auto MultSize = IR::MultiplyOpSize(Size, 2);
const auto MultSize = Size << 1;
// 64bit is more efficient
// Src1 is unsigned
@ -3314,11 +3314,11 @@ Ref OpDispatchBuilder::PMULHRSWOpImpl(OpSize Size, Ref Src1, Ref Src2) {
Ref Res {};
if (Size == OpSize::i64Bit) {
// Implementation is more efficient for 8byte registers
Res = _VSMull(IR::MultiplyOpSize(Size, 2), OpSize::i16Bit, Src1, Src2);
Res = _VSShrI(IR::MultiplyOpSize(Size, 2), OpSize::i32Bit, Res, 14);
auto OneVector = _VectorImm(IR::MultiplyOpSize(Size, 2), OpSize::i32Bit, 1);
Res = _VAdd(IR::MultiplyOpSize(Size, 2), OpSize::i32Bit, Res, OneVector);
return _VUShrNI(IR::MultiplyOpSize(Size, 2), OpSize::i32Bit, Res, 1);
Res = _VSMull(Size << 1, OpSize::i16Bit, Src1, Src2);
Res = _VSShrI(Size << 1, OpSize::i32Bit, Res, 14);
auto OneVector = _VectorImm(Size << 1, OpSize::i32Bit, 1);
Res = _VAdd(Size << 1, OpSize::i32Bit, Res, OneVector);
return _VUShrNI(Size << 1, OpSize::i32Bit, Res, 1);
} else {
// 128-bit and 256-bit are less efficient
Ref ResultLow;
@ -3375,7 +3375,7 @@ template void OpDispatchBuilder::HSUBP<OpSize::i64Bit>(OpcodeArgs);
void OpDispatchBuilder::VHSUBPOp(OpcodeArgs, IR::OpSize ElementSize) {
const auto DstSize = OpSizeFromDst(Op);
const auto Is256Bit = DstSize == Core::CPUState::XMM_AVX_REG_SIZE;
const auto Is256Bit = DstSize == OpSize::i256Bit;
Ref Src1 = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
Ref Src2 = LoadSource(FPRClass, Op, Op->Src[1], Op->Flags);
@ -3409,7 +3409,7 @@ template void OpDispatchBuilder::PHSUB<OpSize::i32Bit>(OpcodeArgs);
void OpDispatchBuilder::VPHSUBOp(OpcodeArgs, IR::OpSize ElementSize) {
const auto DstSize = OpSizeFromDst(Op);
const auto Is256Bit = DstSize == Core::CPUState::XMM_AVX_REG_SIZE;
const auto Is256Bit = DstSize == OpSize::i256Bit;
Ref Src1 = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
Ref Src2 = LoadSource(FPRClass, Op, Op->Src[1], Op->Flags);
@ -3441,7 +3441,7 @@ void OpDispatchBuilder::PHADDS(OpcodeArgs) {
void OpDispatchBuilder::VPHADDSWOp(OpcodeArgs) {
const auto SrcSize = OpSizeFromSrc(Op);
const auto Is256Bit = SrcSize == Core::CPUState::XMM_AVX_REG_SIZE;
const auto Is256Bit = SrcSize == OpSize::i256Bit;
Ref Src1 = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
Ref Src2 = LoadSource(FPRClass, Op, Op->Src[1], Op->Flags);
@ -3476,7 +3476,7 @@ void OpDispatchBuilder::PHSUBS(OpcodeArgs) {
void OpDispatchBuilder::VPHSUBSWOp(OpcodeArgs) {
const auto DstSize = OpSizeFromDst(Op);
const auto Is256Bit = DstSize == Core::CPUState::XMM_AVX_REG_SIZE;
const auto Is256Bit = DstSize == OpSize::i256Bit;
Ref Src1 = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
Ref Src2 = LoadSource(FPRClass, Op, Op->Src[1], Op->Flags);
@ -3497,13 +3497,13 @@ Ref OpDispatchBuilder::PSADBWOpImpl(IR::OpSize Size, Ref Src1, Ref Src2) {
// but it actually operates in more than 8bit space
// This can be seen with `abs(0 - 0xFF)` returning a different result depending
// on bit length
const auto Is128Bit = Size == Core::CPUState::XMM_SSE_REG_SIZE;
const auto Is128Bit = Size == OpSize::i128Bit;
if (Size == OpSize::i64Bit) {
auto AbsResult = _VUABDL(IR::MultiplyOpSize(Size, 2), OpSize::i8Bit, Src1, Src2);
auto AbsResult = _VUABDL(Size << 1, OpSize::i8Bit, Src1, Src2);
// Now vector-wide add the results for each
return _VAddV(IR::MultiplyOpSize(Size, 2), OpSize::i16Bit, AbsResult);
return _VAddV(Size << 1, OpSize::i16Bit, AbsResult);
}
auto AbsResult_Low = _VUABDL(Size, OpSize::i8Bit, Src1, Src2);
@ -3558,7 +3558,7 @@ Ref OpDispatchBuilder::ExtendVectorElementsImpl(OpcodeArgs, IR::OpSize ElementSi
return LoadSource_WithOpSize(FPRClass, Op, Op->Src[0], DstSize, Op->Flags);
} else {
// For memory operands the 256-bit variant loads twice the size specified in the table.
const auto Is256Bit = DstSize == Core::CPUState::XMM_AVX_REG_SIZE;
const auto Is256Bit = DstSize == OpSize::i256Bit;
const auto SrcSize = OpSizeFromSrc(Op);
const auto LoadSize = Is256Bit ? IR::SizeToOpSize(IR::OpSizeToSize(SrcSize) * 2) : SrcSize;
@ -3569,8 +3569,7 @@ Ref OpDispatchBuilder::ExtendVectorElementsImpl(OpcodeArgs, IR::OpSize ElementSi
Ref Src = GetSrc();
Ref Result {Src};
for (auto CurrentElementSize = ElementSize; CurrentElementSize != DstElementSize;
CurrentElementSize = IR::MultiplyOpSize(CurrentElementSize, 2)) {
for (auto CurrentElementSize = ElementSize; CurrentElementSize != DstElementSize; CurrentElementSize = CurrentElementSize << 1) {
if (Signed) {
Result = _VSXTL(DstSize, CurrentElementSize, Result);
} else {
@ -3901,7 +3900,7 @@ void OpDispatchBuilder::VectorVariableBlend(OpcodeArgs, IR::OpSize ElementSize)
//
// To emulate this on AArch64
// Arithmetic shift right by the element size, then use BSL to select the registers
Mask = _VSShrI(Size, ElementSize, Mask, (ElementSize * 8) - 1);
Mask = _VSShrI(Size, ElementSize, Mask, IR::OpSizeAsBits(ElementSize) - 1);
auto Result = _VBSL(Size, Mask, Src, Dest);
@ -3910,7 +3909,7 @@ void OpDispatchBuilder::VectorVariableBlend(OpcodeArgs, IR::OpSize ElementSize)
void OpDispatchBuilder::AVXVectorVariableBlend(OpcodeArgs, IR::OpSize ElementSize) {
const auto SrcSize = OpSizeFromSrc(Op);
const auto ElementSizeBits = ElementSize * 8;
const auto ElementSizeBits = IR::OpSizeAsBits(ElementSize);
Ref Src1 = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
Ref Src2 = LoadSource(FPRClass, Op, Op->Src[1], Op->Flags);
@ -3947,7 +3946,7 @@ void OpDispatchBuilder::PTestOpImpl(OpSize Size, Ref Dest, Ref Src) {
// Set ZF according to Test1. SF will be zeroed since we do a 32-bit test on
// the results of a 16-bit value from the UMaxV, so the 32-bit sign bit is
// cleared even if the 16-bit scalars were negative.
SetNZ_ZeroCV(32, Test1);
SetNZ_ZeroCV(OpSize::i32Bit, Test1);
SetCFInverted(Test2);
ZeroPF_AF();
}
@ -3962,7 +3961,7 @@ void OpDispatchBuilder::PTestOp(OpcodeArgs) {
void OpDispatchBuilder::VTESTOpImpl(OpSize SrcSize, IR::OpSize ElementSize, Ref Src1, Ref Src2) {
InvalidateDeferredFlags();
const auto ElementSizeInBits = ElementSize * 8;
const auto ElementSizeInBits = IR::OpSizeAsBits(ElementSize);
const auto MaskConstant = uint64_t {1} << (ElementSizeInBits - 1);
Ref Mask = _VDupFromGPR(SrcSize, ElementSize, _Constant(MaskConstant));
@ -3985,7 +3984,7 @@ void OpDispatchBuilder::VTESTOpImpl(OpSize SrcSize, IR::OpSize ElementSize, Ref
Ref CFInv = _Select(IR::COND_NEQ, AndNotGPR, ZeroConst, OneConst, ZeroConst);
// As in PTest, this sets Z appropriately while zeroing the rest of NZCV.
SetNZ_ZeroCV(32, AndGPR);
SetNZ_ZeroCV(OpSize::i32Bit, AndGPR);
SetCFInverted(CFInv);
ZeroPF_AF();
}
@ -4083,7 +4082,7 @@ Ref OpDispatchBuilder::DPPOpImpl(IR::OpSize DstSize, Ref Src1, Ref Src2, uint8_t
// Now using the destination mask we choose where the result ends up
// It can duplicate and zero results
if (ElementSize == 8) {
if (ElementSize == OpSize::i64Bit) {
switch (DstMask) {
case 0b01:
// Dest[63:0] = Result
@ -4105,7 +4104,7 @@ Ref OpDispatchBuilder::DPPOpImpl(IR::OpSize DstSize, Ref Src1, Ref Src2, uint8_t
auto BadPath = [&]() {
Ref Result = ZeroVec;
for (size_t i = 0; i < (DstSize / ElementSize); ++i) {
for (size_t i = 0; i < IR::NumElements(DstSize, ElementSize); ++i) {
const auto Bit = 1U << (i % 4);
if ((DstMask & Bit) != 0) {
@ -4127,13 +4126,13 @@ Ref OpDispatchBuilder::DPPOpImpl(IR::OpSize DstSize, Ref Src1, Ref Src2, uint8_t
// Dest[63:32] = Result
// Dest[95:64] = Zero
// Dest[127:96] = Zero
return _VZip(IR::DivideOpSize(DstSize, 2), ElementSize, ZeroVec, Temp);
return _VZip(DstSize >> 1, ElementSize, ZeroVec, Temp);
case 0b0011:
// Dest[31:0] = Result
// Dest[63:32] = Result
// Dest[95:64] = Zero
// Dest[127:96] = Zero
return _VDupElement(IR::DivideOpSize(DstSize, 2), ElementSize, Temp, 0);
return _VDupElement(DstSize >> 1, ElementSize, Temp, 0);
case 0b0100:
// Dest[31:0] = Zero
// Dest[63:32] = Zero
@ -4251,7 +4250,7 @@ Ref OpDispatchBuilder::VDPPSOpImpl(OpcodeArgs, const X86Tables::DecodedOperand&
Ref Temp = _VFMul(DstSize, ElementSize, Src1V, Src2V);
// Now we zero out elements based on src mask
for (size_t i = 0; i < (DstSize / ElementSize); ++i) {
for (size_t i = 0; i < IR::NumElements(DstSize, ElementSize); ++i) {
const auto Bit = 1U << (i % 4);
if ((SrcMask & Bit) == 0) {
@ -4272,7 +4271,7 @@ Ref OpDispatchBuilder::VDPPSOpImpl(OpcodeArgs, const X86Tables::DecodedOperand&
// It can duplicate and zero results
Ref Result = ZeroVec;
for (size_t i = 0; i < (DstSize / ElementSize); ++i) {
for (size_t i = 0; i < IR::NumElements(DstSize, ElementSize); ++i) {
const auto Bit = 1U << (i % 4);
if ((DstMask & Bit) != 0) {
@ -4285,17 +4284,17 @@ Ref OpDispatchBuilder::VDPPSOpImpl(OpcodeArgs, const X86Tables::DecodedOperand&
template<IR::OpSize ElementSize>
void OpDispatchBuilder::VDPPOp(OpcodeArgs) {
const auto DstSize = GetDstSize(Op);
const auto DstSize = OpSizeFromDst(Op);
Ref Result {};
if (ElementSize == 4 && DstSize == Core::CPUState::XMM_AVX_REG_SIZE) {
if (ElementSize == OpSize::i32Bit && DstSize == OpSize::i256Bit) {
// 256-bit DPPS isn't handled by the 128-bit solution.
Result = VDPPSOpImpl(Op, Op->Src[0], Op->Src[1], Op->Src[2]);
} else {
Ref Src1 = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
Ref Src2 = LoadSource(FPRClass, Op, Op->Src[1], Op->Flags);
Result = DPPOpImpl(OpSizeFromDst(Op), Src1, Src2, Op->Src[2].Literal(), ElementSize);
Result = DPPOpImpl(DstSize, Src1, Src2, Op->Src[2].Literal(), ElementSize);
}
// We don't need to emit a _VMov to clear the upper lane, since DPPOpImpl uses a zero vector
@ -4306,7 +4305,7 @@ void OpDispatchBuilder::VDPPOp(OpcodeArgs) {
template void OpDispatchBuilder::VDPPOp<OpSize::i32Bit>(OpcodeArgs);
template void OpDispatchBuilder::VDPPOp<OpSize::i64Bit>(OpcodeArgs);
Ref OpDispatchBuilder::MPSADBWOpImpl(size_t SrcSize, Ref Src1, Ref Src2, uint8_t Select) {
Ref OpDispatchBuilder::MPSADBWOpImpl(IR::OpSize SrcSize, Ref Src1, Ref Src2, uint8_t Select) {
const auto LaneHelper = [&, this](uint32_t Selector_Src1, uint32_t Selector_Src2, Ref Src1, Ref Src2) {
// Src2 will grab a 32bit element and duplicate it across the 128bits
Ref DupSrc = _VDupElement(OpSize::i128Bit, OpSize::i32Bit, Src2, Selector_Src2);
@ -4373,7 +4372,7 @@ Ref OpDispatchBuilder::MPSADBWOpImpl(size_t SrcSize, Ref Src1, Ref Src2, uint8_t
return _VAddP(OpSize::i128Bit, OpSize::i16Bit, TmpTranspose1, TmpTranspose2);
};
const auto Is128Bit = SrcSize == Core::CPUState::XMM_SSE_REG_SIZE;
const auto Is128Bit = SrcSize == OpSize::i128Bit;
// Src1 needs to be in byte offset
const uint8_t Select_Src1_Low = ((Select & 0b100) >> 2) * 32 / 8;
@ -4395,7 +4394,7 @@ Ref OpDispatchBuilder::MPSADBWOpImpl(size_t SrcSize, Ref Src1, Ref Src2, uint8_t
void OpDispatchBuilder::MPSADBWOp(OpcodeArgs) {
const uint8_t Select = Op->Src[1].Literal();
const uint8_t SrcSize = GetSrcSize(Op);
const auto SrcSize = OpSizeFromSrc(Op);
Ref Src1 = LoadSource(FPRClass, Op, Op->Dest, Op->Flags);
Ref Src2 = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
@ -4405,7 +4404,7 @@ void OpDispatchBuilder::MPSADBWOp(OpcodeArgs) {
void OpDispatchBuilder::VMPSADBWOp(OpcodeArgs) {
const uint8_t Select = Op->Src[2].Literal();
const uint8_t SrcSize = GetSrcSize(Op);
const auto SrcSize = OpSizeFromSrc(Op);
Ref Src1 = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
Ref Src2 = LoadSource(FPRClass, Op, Op->Src[1], Op->Flags);
@ -4463,7 +4462,7 @@ void OpDispatchBuilder::VCVTPS2PHOp(OpcodeArgs) {
// We need to eliminate upper junk if we're storing into a register with
// a 256-bit source (VCVTPS2PH's destination for registers is an XMM).
if (Op->Src[0].IsGPR() && SrcSize == Core::CPUState::XMM_AVX_REG_SIZE) {
if (Op->Src[0].IsGPR() && SrcSize == OpSize::i256Bit) {
Result = _VMov(OpSize::i128Bit, Result);
}
@ -4617,7 +4616,7 @@ Ref OpDispatchBuilder::VBLENDOpImpl(IR::OpSize VecSize, IR::OpSize ElementSize,
void OpDispatchBuilder::VBLENDPDOp(OpcodeArgs) {
const auto DstSize = OpSizeFromDst(Op);
const auto Is256Bit = DstSize == Core::CPUState::XMM_AVX_REG_SIZE;
const auto Is256Bit = DstSize == OpSize::i256Bit;
const auto Selector = Op->Src[2].Literal();
Ref Src1 = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
@ -4642,7 +4641,7 @@ void OpDispatchBuilder::VBLENDPDOp(OpcodeArgs) {
void OpDispatchBuilder::VPBLENDDOp(OpcodeArgs) {
const auto DstSize = OpSizeFromDst(Op);
const auto Is256Bit = DstSize == Core::CPUState::XMM_AVX_REG_SIZE;
const auto Is256Bit = DstSize == OpSize::i256Bit;
const auto Selector = Op->Src[2].Literal();
Ref Src1 = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
@ -4686,7 +4685,7 @@ void OpDispatchBuilder::VPBLENDDOp(OpcodeArgs) {
void OpDispatchBuilder::VPBLENDWOp(OpcodeArgs) {
const auto DstSize = OpSizeFromDst(Op);
const auto Is128Bit = DstSize == Core::CPUState::XMM_SSE_REG_SIZE;
const auto Is128Bit = DstSize == OpSize::i128Bit;
const auto Selector = Op->Src[2].Literal();
Ref Src1 = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
@ -4718,7 +4717,7 @@ void OpDispatchBuilder::VPBLENDWOp(OpcodeArgs) {
void OpDispatchBuilder::VZEROOp(OpcodeArgs) {
const auto DstSize = OpSizeFromDst(Op);
const auto IsVZEROALL = DstSize == Core::CPUState::XMM_AVX_REG_SIZE;
const auto IsVZEROALL = DstSize == OpSize::i256Bit;
const auto NumRegs = CTX->Config.Is64BitMode ? 16U : 8U;
if (IsVZEROALL) {
@ -4743,7 +4742,7 @@ void OpDispatchBuilder::VZEROOp(OpcodeArgs) {
void OpDispatchBuilder::VPERMILImmOp(OpcodeArgs, IR::OpSize ElementSize) {
const auto DstSize = OpSizeFromDst(Op);
const auto Is256Bit = DstSize == Core::CPUState::XMM_AVX_REG_SIZE;
const auto Is256Bit = DstSize == OpSize::i256Bit;
const auto Selector = Op->Src[1].Literal() & 0xFF;
Ref Src = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
@ -4780,7 +4779,7 @@ Ref OpDispatchBuilder::VPERMILRegOpImpl(OpSize DstSize, IR::OpSize ElementSize,
// The only difference here is that we need to add 16 to the upper lane
// before doing the final addition to build up the indices for TBL.
const auto Is256Bit = DstSize == Core::CPUState::XMM_AVX_REG_SIZE;
const auto Is256Bit = DstSize == OpSize::i256Bit;
auto IsPD = ElementSize == OpSize::i64Bit;
if (IsPD) {
@ -4856,7 +4855,7 @@ void OpDispatchBuilder::PCMPXSTRXOpImpl(OpcodeArgs, bool IsExplicit, bool IsMask
// While the control bit immediate for the instruction itself is only ever 8 bits
// in size, we use it as a 16-bit value so that we can use the 8th bit to signify
// whether or not RAX and RDX should be interpreted as a 64-bit value.
const auto SrcSize = GetSrcSize(Op);
const auto SrcSize = OpSizeFromSrc(Op);
const auto Is64Bit = SrcSize == OpSize::i64Bit;
const auto NewControl = uint16_t(Control | (uint16_t(Is64Bit) << 8));
@ -4935,7 +4934,7 @@ void OpDispatchBuilder::VPCMPISTRMOp(OpcodeArgs) {
void OpDispatchBuilder::VFMAImpl(OpcodeArgs, IROps IROp, bool Scalar, uint8_t Src1Idx, uint8_t Src2Idx, uint8_t AddendIdx) {
const auto Size = OpSizeFromDst(Op);
const auto Is256Bit = Size == Core::CPUState::XMM_AVX_REG_SIZE;
const auto Is256Bit = Size == OpSize::i256Bit;
const OpSize ElementSize = Op->Flags & X86Tables::DecodeFlags::FLAG_OPTION_AVX_W ? OpSize::i64Bit : OpSize::i32Bit;
@ -4964,7 +4963,7 @@ void OpDispatchBuilder::VFMAImpl(OpcodeArgs, IROps IROp, bool Scalar, uint8_t Sr
void OpDispatchBuilder::VFMAddSubImpl(OpcodeArgs, bool AddSub, uint8_t Src1Idx, uint8_t Src2Idx, uint8_t AddendIdx) {
const auto Size = OpSizeFromDst(Op);
const auto Is256Bit = Size == Core::CPUState::XMM_AVX_REG_SIZE;
const auto Is256Bit = Size == OpSize::i256Bit;
const OpSize ElementSize = Op->Flags & X86Tables::DecodeFlags::FLAG_OPTION_AVX_W ? OpSize::i64Bit : OpSize::i32Bit;
@ -5024,20 +5023,20 @@ void OpDispatchBuilder::VPGATHER(OpcodeArgs) {
LOGMAN_THROW_A_FMT(AddrElementSize == OpSize::i32Bit || AddrElementSize == OpSize::i64Bit, "Unknown address element size");
const auto Size = OpSizeFromDst(Op);
const auto Is128Bit = Size == Core::CPUState::XMM_SSE_REG_SIZE;
const auto Is128Bit = Size == OpSize::i128Bit;
///< Element size is determined by W flag.
const OpSize ElementLoadSize = Op->Flags & X86Tables::DecodeFlags::FLAG_OPTION_AVX_W ? OpSize::i64Bit : OpSize::i32Bit;
// We only need the high address register if the number of data elements is more than what the low half can consume.
// But also the number of address elements is clamped by the destination size as well.
const size_t NumDataElements = Size / ElementLoadSize;
const size_t NumAddrElementBytes = std::min<size_t>(Size, (NumDataElements * AddrElementSize));
const bool Needs128BitHighAddrBytes = NumAddrElementBytes > OpSize::i128Bit;
const size_t NumDataElements = IR::NumElements(Size, ElementLoadSize);
const size_t NumAddrElementBytes = std::min<size_t>(IR::OpSizeToSize(Size), (NumDataElements * IR::OpSizeToSize(AddrElementSize)));
const bool Needs128BitHighAddrBytes = NumAddrElementBytes > IR::OpSizeToSize(OpSize::i128Bit);
auto VSIB = LoadVSIB(Op, Op->Src[0], Op->Flags);
const bool SupportsSVELoad = (VSIB.Scale == 1 || VSIB.Scale == AddrElementSize) && (AddrElementSize == ElementLoadSize);
const bool SupportsSVELoad = (VSIB.Scale == 1 || VSIB.Scale == IR::OpSizeToSize(AddrElementSize)) && (AddrElementSize == ElementLoadSize);
Ref Dest = LoadSource(FPRClass, Op, Op->Dest, Op->Flags);
Ref Mask = LoadSource(FPRClass, Op, Op->Src[1], Op->Flags);
@ -5067,7 +5066,7 @@ void OpDispatchBuilder::VPGATHER(OpcodeArgs) {
}
}
auto Result128 = AVX128_VPGatherImpl(SizeToOpSize(Size), ElementLoadSize, AddrElementSize, Dest128, Mask128, VSIB128);
auto Result128 = AVX128_VPGatherImpl(Size, ElementLoadSize, AddrElementSize, Dest128, Mask128, VSIB128);
// The registers are current split, need to merge them.
Result = _VInsElement(OpSize::i256Bit, OpSize::i128Bit, 1, 0, Result128.Low, Result128.High);
} else {


@ -103,7 +103,7 @@ void OpDispatchBuilder::FILD(OpcodeArgs) {
// Sign extend to 64bits
if (ReadWidth != OpSize::i64Bit) {
Data = _Sbfe(OpSize::i64Bit, ReadWidth * 8, 0, Data);
Data = _Sbfe(OpSize::i64Bit, IR::OpSizeAsBits(ReadWidth), 0, Data);
}
// We're about to clobber flags to grab the sign, so save NZCV.
@ -351,33 +351,33 @@ void OpDispatchBuilder::X87FNSTENV(OpcodeArgs) {
_StoreMem(GPRClass, Size, Mem, FCW, Size);
}
{ _StoreMem(GPRClass, Size, ReconstructFSW_Helper(), Mem, _Constant(Size * 1), Size, MEM_OFFSET_SXTX, OpSize::i8Bit); }
{ _StoreMem(GPRClass, Size, ReconstructFSW_Helper(), Mem, _Constant(IR::OpSizeToSize(Size) * 1), Size, MEM_OFFSET_SXTX, 1); }
auto ZeroConst = _Constant(0);
{
// FTW
_StoreMem(GPRClass, Size, GetX87FTW_Helper(), Mem, _Constant(Size * 2), Size, MEM_OFFSET_SXTX, OpSize::i8Bit);
_StoreMem(GPRClass, Size, GetX87FTW_Helper(), Mem, _Constant(IR::OpSizeToSize(Size) * 2), Size, MEM_OFFSET_SXTX, 1);
}
{
// Instruction Offset
_StoreMem(GPRClass, Size, ZeroConst, Mem, _Constant(Size * 3), Size, MEM_OFFSET_SXTX, OpSize::i8Bit);
_StoreMem(GPRClass, Size, ZeroConst, Mem, _Constant(IR::OpSizeToSize(Size) * 3), Size, MEM_OFFSET_SXTX, 1);
}
{
// Instruction CS selector (+ Opcode)
_StoreMem(GPRClass, Size, ZeroConst, Mem, _Constant(Size * 4), Size, MEM_OFFSET_SXTX, OpSize::i8Bit);
_StoreMem(GPRClass, Size, ZeroConst, Mem, _Constant(IR::OpSizeToSize(Size) * 4), Size, MEM_OFFSET_SXTX, 1);
}
{
// Data pointer offset
_StoreMem(GPRClass, Size, ZeroConst, Mem, _Constant(Size * 5), Size, MEM_OFFSET_SXTX, OpSize::i8Bit);
_StoreMem(GPRClass, Size, ZeroConst, Mem, _Constant(IR::OpSizeToSize(Size) * 5), Size, MEM_OFFSET_SXTX, 1);
}
{
// Data pointer selector
_StoreMem(GPRClass, Size, ZeroConst, Mem, _Constant(Size * 6), Size, MEM_OFFSET_SXTX, OpSize::i8Bit);
_StoreMem(GPRClass, Size, ZeroConst, Mem, _Constant(IR::OpSizeToSize(Size) * 6), Size, MEM_OFFSET_SXTX, 1);
}
}
@ -407,13 +407,13 @@ void OpDispatchBuilder::X87LDENV(OpcodeArgs) {
auto NewFCW = _LoadMem(GPRClass, OpSize::i16Bit, Mem, OpSize::i16Bit);
_StoreContext(OpSize::i16Bit, GPRClass, NewFCW, offsetof(FEXCore::Core::CPUState, FCW));
Ref MemLocation = _Add(OpSize::i64Bit, Mem, _Constant(Size * 1));
Ref MemLocation = _Add(OpSize::i64Bit, Mem, _Constant(IR::OpSizeToSize(Size) * 1));
auto NewFSW = _LoadMem(GPRClass, Size, MemLocation, Size);
ReconstructX87StateFromFSW_Helper(NewFSW);
{
// FTW
Ref MemLocation = _Add(OpSize::i64Bit, Mem, _Constant(Size * 2));
Ref MemLocation = _Add(OpSize::i64Bit, Mem, _Constant(IR::OpSizeToSize(Size) * 2));
SetX87FTW(_LoadMem(GPRClass, Size, MemLocation, Size));
}
}
@ -447,58 +447,58 @@ void OpDispatchBuilder::X87FNSAVE(OpcodeArgs) {
_StoreMem(GPRClass, Size, Mem, FCW, Size);
}
{ _StoreMem(GPRClass, Size, ReconstructFSW_Helper(), Mem, _Constant(Size * 1), Size, MEM_OFFSET_SXTX, 1); }
{ _StoreMem(GPRClass, Size, ReconstructFSW_Helper(), Mem, _Constant(IR::OpSizeToSize(Size) * 1), Size, MEM_OFFSET_SXTX, 1); }
auto ZeroConst = _Constant(0);
{
// FTW
_StoreMem(GPRClass, Size, GetX87FTW_Helper(), Mem, _Constant(Size * 2), Size, MEM_OFFSET_SXTX, 1);
_StoreMem(GPRClass, Size, GetX87FTW_Helper(), Mem, _Constant(IR::OpSizeToSize(Size) * 2), Size, MEM_OFFSET_SXTX, 1);
}
{
// Instruction Offset
_StoreMem(GPRClass, Size, ZeroConst, Mem, _Constant(Size * 3), Size, MEM_OFFSET_SXTX, 1);
_StoreMem(GPRClass, Size, ZeroConst, Mem, _Constant(IR::OpSizeToSize(Size) * 3), Size, MEM_OFFSET_SXTX, 1);
}
{
// Instruction CS selector (+ Opcode)
_StoreMem(GPRClass, Size, ZeroConst, Mem, _Constant(Size * 4), Size, MEM_OFFSET_SXTX, 1);
_StoreMem(GPRClass, Size, ZeroConst, Mem, _Constant(IR::OpSizeToSize(Size) * 4), Size, MEM_OFFSET_SXTX, 1);
}
{
// Data pointer offset
_StoreMem(GPRClass, Size, ZeroConst, Mem, _Constant(Size * 5), Size, MEM_OFFSET_SXTX, 1);
_StoreMem(GPRClass, Size, ZeroConst, Mem, _Constant(IR::OpSizeToSize(Size) * 5), Size, MEM_OFFSET_SXTX, 1);
}
{
// Data pointer selector
_StoreMem(GPRClass, Size, ZeroConst, Mem, _Constant(Size * 6), Size, MEM_OFFSET_SXTX, 1);
_StoreMem(GPRClass, Size, ZeroConst, Mem, _Constant(IR::OpSizeToSize(Size) * 6), Size, MEM_OFFSET_SXTX, 1);
}
auto OneConst = _Constant(1);
auto SevenConst = _Constant(7);
const auto LoadSize = ReducedPrecisionMode ? OpSize::i64Bit : OpSize::i128Bit;
for (int i = 0; i < 7; ++i) {
Ref data = _LoadContextIndexed(Top, LoadSize, MMBaseOffset(), OpSize::i128Bit, FPRClass);
Ref data = _LoadContextIndexed(Top, LoadSize, MMBaseOffset(), IR::OpSizeToSize(OpSize::i128Bit), FPRClass);
if (ReducedPrecisionMode) {
data = _F80CVTTo(data, OpSize::i64Bit);
}
_StoreMem(FPRClass, OpSize::i128Bit, data, Mem, _Constant((Size * 7) + (10 * i)), OpSize::i8Bit, MEM_OFFSET_SXTX, 1);
_StoreMem(FPRClass, OpSize::i128Bit, data, Mem, _Constant((IR::OpSizeToSize(Size) * 7) + (10 * i)), OpSize::i8Bit, MEM_OFFSET_SXTX, 1);
Top = _And(OpSize::i32Bit, _Add(OpSize::i32Bit, Top, OneConst), SevenConst);
}
// The final st(7) needs a bit of special handling here
Ref data = _LoadContextIndexed(Top, LoadSize, MMBaseOffset(), OpSize::i128Bit, FPRClass);
Ref data = _LoadContextIndexed(Top, LoadSize, MMBaseOffset(), IR::OpSizeToSize(OpSize::i128Bit), FPRClass);
if (ReducedPrecisionMode) {
data = _F80CVTTo(data, OpSize::i64Bit);
}
// ST7 broken in to two parts
// Lower 64bits [63:0]
// upper 16 bits [79:64]
_StoreMem(FPRClass, OpSize::i64Bit, data, Mem, _Constant((Size * 7) + (7 * 10)), OpSize::i8Bit, MEM_OFFSET_SXTX, 1);
_StoreMem(FPRClass, OpSize::i64Bit, data, Mem, _Constant((IR::OpSizeToSize(Size) * 7) + (7 * 10)), OpSize::i8Bit, MEM_OFFSET_SXTX, 1);
auto topBytes = _VDupElement(OpSize::i128Bit, OpSize::i16Bit, data, 4);
_StoreMem(FPRClass, OpSize::i16Bit, topBytes, Mem, _Constant((Size * 7) + (7 * 10) + 8), OpSize::i8Bit, MEM_OFFSET_SXTX, 1);
_StoreMem(FPRClass, OpSize::i16Bit, topBytes, Mem, _Constant((IR::OpSizeToSize(Size) * 7) + (7 * 10) + 8), OpSize::i8Bit, MEM_OFFSET_SXTX, 1);
// reset to default
FNINIT(Op);
@ -522,11 +522,11 @@ void OpDispatchBuilder::X87FRSTOR(OpcodeArgs) {
_SetRoundingMode(roundingMode, false, roundingMode);
}
auto NewFSW = _LoadMem(GPRClass, Size, Mem, _Constant(Size * 1), Size, MEM_OFFSET_SXTX, 1);
auto NewFSW = _LoadMem(GPRClass, Size, Mem, _Constant(IR::OpSizeToSize(Size) * 1), Size, MEM_OFFSET_SXTX, 1);
Ref Top = ReconstructX87StateFromFSW_Helper(NewFSW);
{
// FTW
SetX87FTW(_LoadMem(GPRClass, Size, Mem, _Constant(Size * 2), Size, MEM_OFFSET_SXTX, 1));
SetX87FTW(_LoadMem(GPRClass, Size, Mem, _Constant(IR::OpSizeToSize(Size) * 2), Size, MEM_OFFSET_SXTX, 1));
}
auto OneConst = _Constant(1);
@ -538,14 +538,14 @@ void OpDispatchBuilder::X87FRSTOR(OpcodeArgs) {
Mask = _VInsGPR(OpSize::i128Bit, OpSize::i64Bit, 1, Mask, high);
const auto StoreSize = ReducedPrecisionMode ? OpSize::i64Bit : OpSize::i128Bit;
for (int i = 0; i < 7; ++i) {
Ref Reg = _LoadMem(FPRClass, OpSize::i128Bit, Mem, _Constant((Size * 7) + (10 * i)), OpSize::i8Bit, MEM_OFFSET_SXTX, 1);
Ref Reg = _LoadMem(FPRClass, OpSize::i128Bit, Mem, _Constant((IR::OpSizeToSize(Size) * 7) + (10 * i)), OpSize::i8Bit, MEM_OFFSET_SXTX, 1);
// Mask off the top bits
Reg = _VAnd(OpSize::i128Bit, OpSize::i128Bit, Reg, Mask);
if (ReducedPrecisionMode) {
// Convert to double precision
Reg = _F80CVT(OpSize::i64Bit, Reg);
}
_StoreContextIndexed(Reg, Top, StoreSize, MMBaseOffset(), OpSize::i128Bit, FPRClass);
_StoreContextIndexed(Reg, Top, StoreSize, MMBaseOffset(), IR::OpSizeToSize(OpSize::i128Bit), FPRClass);
Top = _And(OpSize::i32Bit, _Add(OpSize::i32Bit, Top, OneConst), SevenConst);
}
@ -554,13 +554,14 @@ void OpDispatchBuilder::X87FRSTOR(OpcodeArgs) {
// ST7 broken into two parts
// Lower 64bits [63:0]
// upper 16 bits [79:64]
Ref Reg = _LoadMem(FPRClass, OpSize::i64Bit, Mem, _Constant((Size * 7) + (10 * 7)), OpSize::i8Bit, MEM_OFFSET_SXTX, 1);
Ref RegHigh = _LoadMem(FPRClass, OpSize::i16Bit, Mem, _Constant((Size * 7) + (10 * 7) + 8), OpSize::i8Bit, MEM_OFFSET_SXTX, 1);
Ref Reg = _LoadMem(FPRClass, OpSize::i64Bit, Mem, _Constant((IR::OpSizeToSize(Size) * 7) + (10 * 7)), OpSize::i8Bit, MEM_OFFSET_SXTX, 1);
Ref RegHigh =
_LoadMem(FPRClass, OpSize::i16Bit, Mem, _Constant((IR::OpSizeToSize(Size) * 7) + (10 * 7) + 8), OpSize::i8Bit, MEM_OFFSET_SXTX, 1);
Reg = _VInsElement(OpSize::i128Bit, OpSize::i16Bit, 4, 0, Reg, RegHigh);
if (ReducedPrecisionMode) {
Reg = _F80CVT(OpSize::i64Bit, Reg); // Convert to double precision
}
_StoreContextIndexed(Reg, Top, StoreSize, MMBaseOffset(), OpSize::i128Bit, FPRClass);
_StoreContextIndexed(Reg, Top, StoreSize, MMBaseOffset(), IR::OpSizeToSize(OpSize::i128Bit), FPRClass);
}
// Load / Store Control Word
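For reference, the offsets being scaled through IR::OpSizeToSize above follow the FNSAVE/FRSTOR environment layout. A minimal layout sketch, assuming the 4-byte environment word used outside 16-bit operand mode; the names and constants below are illustrative, not FEX identifiers:

#include <cstddef>

// Layout sketch only; FCW sits at offset 0.
constexpr std::size_t Word = 4;              // IR::OpSizeToSize(Size) for a 32-bit image
constexpr std::size_t FSWOffset = Word * 1;  // status word
constexpr std::size_t FTWOffset = Word * 2;  // tag word
constexpr std::size_t FIPOffset = Word * 3;  // instruction offset
constexpr std::size_t FCSOffset = Word * 4;  // CS selector (+ opcode)
constexpr std::size_t FDPOffset = Word * 5;  // data pointer offset
constexpr std::size_t FDSOffset = Word * 6;  // data pointer selector
// The eight 80-bit st(i) values follow back to back, 10 bytes each.
constexpr std::size_t STOffset(unsigned i) { return Word * 7 + 10 * i; }
static_assert(STOffset(7) == 98, "st(7) starts 98 bytes into a 32-bit FNSAVE image");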

View File

@ -36,12 +36,12 @@ void OpDispatchBuilder::X87LDENVF64(OpcodeArgs) {
_SetRoundingMode(roundingMode, false, roundingMode);
_StoreContext(OpSize::i16Bit, GPRClass, NewFCW, offsetof(FEXCore::Core::CPUState, FCW));
auto NewFSW = _LoadMem(GPRClass, Size, Mem, _Constant(Size * 1), Size, MEM_OFFSET_SXTX, 1);
auto NewFSW = _LoadMem(GPRClass, Size, Mem, _Constant(IR::OpSizeToSize(Size)), Size, MEM_OFFSET_SXTX, 1);
ReconstructX87StateFromFSW_Helper(NewFSW);
{
// FTW
SetX87FTW(_LoadMem(GPRClass, Size, Mem, _Constant(Size * 2), Size, MEM_OFFSET_SXTX, 1));
SetX87FTW(_LoadMem(GPRClass, Size, Mem, _Constant(IR::OpSizeToSize(Size) * 2), Size, MEM_OFFSET_SXTX, 1));
}
}
@ -97,7 +97,7 @@ void OpDispatchBuilder::FILDF64(OpcodeArgs) {
// Read from memory
Ref Data = LoadSource_WithOpSize(GPRClass, Op, Op->Src[0], ReadWidth, Op->Flags);
if (ReadWidth == OpSize::i16Bit) {
Data = _Sbfe(OpSize::i64Bit, ReadWidth * 8, 0, Data);
Data = _Sbfe(OpSize::i64Bit, IR::OpSizeAsBits(ReadWidth), 0, Data);
}
auto ConvertedData = _Float_FromGPR_S(OpSize::i64Bit, ReadWidth == OpSize::i32Bit ? OpSize::i32Bit : OpSize::i64Bit, Data);
_PushStack(ConvertedData, Data, ReadWidth, false);
@ -117,9 +117,9 @@ void OpDispatchBuilder::FISTF64(OpcodeArgs, bool Truncate) {
Ref data = _ReadStackValue(0);
if (Truncate) {
data = _Float_ToGPR_ZS(Size == 4 ? OpSize::i32Bit : OpSize::i64Bit, OpSize::i64Bit, data);
data = _Float_ToGPR_ZS(Size == OpSize::i32Bit ? OpSize::i32Bit : OpSize::i64Bit, OpSize::i64Bit, data);
} else {
data = _Float_ToGPR_S(Size == 4 ? OpSize::i32Bit : OpSize::i64Bit, OpSize::i64Bit, data);
data = _Float_ToGPR_S(Size == OpSize::i32Bit ? OpSize::i32Bit : OpSize::i64Bit, OpSize::i64Bit, data);
}
StoreResult_WithOpSize(GPRClass, Op, Op->Dest, data, Size, OpSize::i8Bit);
@ -339,7 +339,7 @@ void OpDispatchBuilder::FCOMIF64(OpcodeArgs, IR::OpSize Width, bool Integer, OpD
if (Width == OpSize::i16Bit) {
arg = _Sbfe(OpSize::i64Bit, 16, 0, arg);
}
b = _Float_FromGPR_S(OpSize::i64Bit, Width == 64 ? OpSize::i64Bit : OpSize::i32Bit, arg);
b = _Float_FromGPR_S(OpSize::i64Bit, Width == OpSize::i64Bit ? OpSize::i64Bit : OpSize::i32Bit, arg);
} else if (Width == OpSize::i32Bit) {
arg = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
b = _Float_FToF(OpSize::i64Bit, OpSize::i32Bit, arg);

View File

@ -548,7 +548,7 @@ protected:
// This must directly match bytes to the named opsize.
// Implicitly sized IR operations do math to get between sizes.
enum OpSize : uint8_t {
enum class OpSize : uint8_t {
iUnsized = 0,
i8Bit = 1,
i16Bit = 2,
@ -615,14 +615,18 @@ static inline uint16_t OpSizeAsBits(IR::OpSize Size) {
return IR::OpSizeToSize(Size) * 8u;
}
static inline OpSize MultiplyOpSize(IR::OpSize Size, uint8_t Multiplier) {
template<typename T>
requires (std::is_integral_v<T>)
static inline OpSize operator<<(IR::OpSize Size, T Shift) {
LOGMAN_THROW_A_FMT(Size != IR::OpSize::iInvalid, "Invalid Size");
return IR::SizeToOpSize(IR::OpSizeToSize(Size) * Multiplier);
return IR::SizeToOpSize(IR::OpSizeToSize(Size) << Shift);
}
static inline OpSize DivideOpSize(IR::OpSize Size, uint8_t Divisor) {
template<typename T>
requires (std::is_integral_v<T>)
static inline OpSize operator>>(IR::OpSize Size, T Shift) {
LOGMAN_THROW_A_FMT(Size != IR::OpSize::iInvalid, "Invalid Size");
return IR::SizeToOpSize(IR::OpSizeToSize(Size) / Divisor);
return IR::SizeToOpSize(IR::OpSizeToSize(Size) >> Shift);
}
static inline OpSize operator/(IR::OpSize Size, IR::OpSize Divisor) {
@ -630,7 +634,9 @@ static inline OpSize operator/(IR::OpSize Size, IR::OpSize Divisor) {
return IR::SizeToOpSize(IR::OpSizeToSize(Size) / IR::OpSizeToSize(Divisor));
}
static inline OpSize operator/(IR::OpSize Size, uint8_t Divisor) {
template<typename T>
requires (std::is_integral_v<T>)
static inline OpSize operator/(IR::OpSize Size, T Divisor) {
LOGMAN_THROW_A_FMT(Size != IR::OpSize::iInvalid, "Invalid Size");
return IR::SizeToOpSize(IR::OpSizeToSize(Size) / Divisor);
}
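The shift operators above take over from the removed MultiplyOpSize/DivideOpSize helpers. A standalone sketch of the intended semantics, using a simplified stand-in for the enum rather than the real FEXCore headers:

#include <cstdint>
#include <type_traits>

// Simplified stand-in for FEXCore::IR::OpSize, for illustration only.
enum class OpSize : uint8_t { i8Bit = 1, i16Bit = 2, i32Bit = 4, i64Bit = 8, i128Bit = 16 };

constexpr uint16_t OpSizeToSize(OpSize Size) { return static_cast<uint16_t>(Size); }
constexpr OpSize SizeToOpSize(uint16_t Size) { return static_cast<OpSize>(Size); }

// Shifting an OpSize widens or narrows it by powers of two, mirroring the overloads in the diff.
template<typename T>
requires (std::is_integral_v<T>)
constexpr OpSize operator<<(OpSize Size, T Shift) { return SizeToOpSize(OpSizeToSize(Size) << Shift); }
template<typename T>
requires (std::is_integral_v<T>)
constexpr OpSize operator>>(OpSize Size, T Shift) { return SizeToOpSize(OpSizeToSize(Size) >> Shift); }

static_assert((OpSize::i16Bit << 1) == OpSize::i32Bit); // widen: next element size up
static_assert((OpSize::i16Bit >> 1) == OpSize::i8Bit);  // narrow: next element size down

Division by another OpSize or by an integral value behaves the same way, just with / in place of the shift.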

View File

@ -736,7 +736,7 @@
"HasSideEffects": true,
"DestSize": "RegisterSize",
"EmitValidation": [
"Offset % RegisterSize == 0",
"Offset % IR::OpSizeToSize(RegisterSize) == 0",
"RegisterSize == FEXCore::IR::OpSize::i128Bit || RegisterSize == FEXCore::IR::OpSize::i256Bit"
]
},
@ -748,7 +748,7 @@
"HasSideEffects": true,
"DestSize": "RegisterSize",
"EmitValidation": [
"Offset % RegisterSize == 0",
"Offset % IR::OpSizeToSize(RegisterSize) == 0",
"RegisterSize == FEXCore::IR::OpSize::i128Bit"
]
},
@ -760,7 +760,7 @@
"HasSideEffects": true,
"DestSize": "RegisterSize",
"EmitValidation": [
"Offset % RegisterSize == 0",
"Offset % IR::OpSizeToSize(RegisterSize) == 0",
"RegisterSize == FEXCore::IR::OpSize::i128Bit || RegisterSize == FEXCore::IR::OpSize::i256Bit"
]
}
@ -2017,7 +2017,7 @@
"TiedSource": 0,
"Desc": "Unsigned shifts right each element and then narrows to the next lower element size",
"DestSize": "RegisterSize",
"NumElements": "RegisterSize / (IR::DivideOpSize(ElementSize, 2))"
"NumElements": "RegisterSize / (ElementSize >> 1)"
},
"FPR = VUShrNI2 OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$VectorLower, FPR:$VectorUpper, u8:$BitShift": {
@ -2026,73 +2026,73 @@
"Inserts results in to the high elements of the first argument"
],
"DestSize": "RegisterSize",
"NumElements": "RegisterSize / (IR::DivideOpSize(ElementSize, 2))"
"NumElements": "RegisterSize / (ElementSize >> 1)"
},
"FPR = VSXTL OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector": {
"Desc": "Sign extends elements from the source element size to the next size up",
"DestSize": "RegisterSize",
"NumElements": "RegisterSize / (IR::MultiplyOpSize(ElementSize, 2))"
"NumElements": "RegisterSize / (ElementSize << 1)"
},
"FPR = VSXTL2 OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector": {
"Desc": ["Sign extends elements from the source element size to the next size up",
"Source elements come from the upper half of the register"
],
"DestSize": "RegisterSize",
"NumElements": "RegisterSize / (IR::MultiplyOpSize(ElementSize, 2))"
"NumElements": "RegisterSize / (ElementSize << 1)"
},
"FPR = VSSHLL OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector, u8:$BitShift{0}": {
"Desc": "Sign extends elements from the source element size to the next size up",
"DestSize": "RegisterSize",
"NumElements": "RegisterSize / (IR::MultiplyOpSize(ElementSize, 2))"
"NumElements": "RegisterSize / (ElementSize << 1)"
},
"FPR = VSSHLL2 OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector, u8:$BitShift{0}": {
"Desc": ["Sign extends elements from the source element size to the next size up",
"Source elements come from the upper half of the register"
],
"DestSize": "RegisterSize",
"NumElements": "RegisterSize / (IR::MultiplyOpSize(ElementSize, 2))"
"NumElements": "RegisterSize / (ElementSize << 1)"
},
"FPR = VUXTL OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector": {
"Desc": "Zero extends elements from the source element size to the next size up",
"DestSize": "RegisterSize",
"NumElements": "RegisterSize / (IR::MultiplyOpSize(ElementSize, 2))"
"NumElements": "RegisterSize / (ElementSize << 1)"
},
"FPR = VUXTL2 OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector": {
"Desc": ["Zero extends elements from the source element size to the next size up",
"Source elements come from the upper half of the register"
],
"DestSize": "RegisterSize",
"NumElements": "RegisterSize / (IR::MultiplyOpSize(ElementSize, 2))"
"NumElements": "RegisterSize / (ElementSize << 1)"
},
"FPR = VSQXTN OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector": {
"TiedSource": 0,
"DestSize": "RegisterSize",
"NumElements": "RegisterSize / (IR::DivideOpSize(ElementSize, 2))"
"NumElements": "RegisterSize / (ElementSize >> 1)"
},
"FPR = VSQXTN2 OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$VectorLower, FPR:$VectorUpper": {
"TiedSource": 0,
"DestSize": "RegisterSize",
"NumElements": "RegisterSize / (IR::DivideOpSize(ElementSize, 2))"
"NumElements": "RegisterSize / (ElementSize >> 1)"
},
"FPR = VSQXTNPair OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$VectorLower, FPR:$VectorUpper": {
"Desc": ["Does both VSQXTN and VSQXTN2 in a combined operation."
],
"DestSize": "RegisterSize",
"NumElements": "RegisterSize / (IR::DivideOpSize(ElementSize, 2))"
"NumElements": "RegisterSize / (ElementSize >> 1)"
},
"FPR = VSQXTUN OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector": {
"DestSize": "RegisterSize",
"NumElements": "RegisterSize / (IR::DivideOpSize(ElementSize, 2))"
"NumElements": "RegisterSize / (ElementSize >> 1)"
},
"FPR = VSQXTUN2 OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$VectorLower, FPR:$VectorUpper": {
"DestSize": "RegisterSize",
"NumElements": "RegisterSize / (IR::DivideOpSize(ElementSize, 2))"
"NumElements": "RegisterSize / (ElementSize >> 1)"
},
"FPR = VSQXTUNPair OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$VectorLower, FPR:$VectorUpper": {
"Desc": ["Does both VSQXTUN and VSQXTUN2 in a combined operation."
],
"DestSize": "RegisterSize",
"NumElements": "RegisterSize / (IR::DivideOpSize(ElementSize, 2))"
"NumElements": "RegisterSize / (ElementSize >> 1)"
},
"FPR = VSRSHR OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector, u8:$BitShift": {
"Desc": ["Signed rounding shift right by immediate",
@ -2271,24 +2271,24 @@
},
"FPR = VUMull OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector1, FPR:$Vector2": {
"DestSize": "RegisterSize",
"NumElements": "RegisterSize / (IR::MultiplyOpSize(ElementSize, 2))"
"NumElements": "RegisterSize / (ElementSize << 1)"
},
"FPR = VSMull OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector1, FPR:$Vector2": {
"Desc": [ "Does a signed integer multiply with extend.",
"ElementSize is the source size"
],
"DestSize": "RegisterSize",
"NumElements": "RegisterSize / (IR::MultiplyOpSize(ElementSize, 2))"
"NumElements": "RegisterSize / (ElementSize << 1)"
},
"FPR = VUMull2 OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector1, FPR:$Vector2": {
"Desc": "Multiplies the high elements with size extension",
"DestSize": "RegisterSize",
"NumElements": "RegisterSize / (IR::MultiplyOpSize(ElementSize, 2))"
"NumElements": "RegisterSize / (ElementSize << 1)"
},
"FPR = VSMull2 OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector1, FPR:$Vector2": {
"Desc": "Multiplies the high elements with size extension",
"DestSize": "RegisterSize",
"NumElements": "RegisterSize / (IR::MultiplyOpSize(ElementSize, 2))"
"NumElements": "RegisterSize / (ElementSize << 1)"
},
"FPR = VUMulH OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector1, FPR:$Vector2": {
"Desc": "Wide unsigned multiply returning the high results",
@ -2305,14 +2305,14 @@
"Desc": ["Unsigned Absolute Difference Long"
],
"DestSize": "RegisterSize",
"NumElements": "RegisterSize / (IR::MultiplyOpSize(ElementSize, 2))"
"NumElements": "RegisterSize / (ElementSize << 1)"
},
"FPR = VUABDL2 OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector1, FPR:$Vector2": {
"Desc": ["Unsigned Absolute Difference Long",
"Using the high elements of the source vectors"
],
"DestSize": "RegisterSize",
"NumElements": "RegisterSize / (IR::MultiplyOpSize(ElementSize, 2))"
"NumElements": "RegisterSize / (ElementSize << 1)"
},
"FPR = VUShl OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector, FPR:$ShiftVector, i1:$RangeCheck": {
"TiedSource": 0,
@ -2580,7 +2580,7 @@
"Selecting from the high half of the register."
],
"DestSize": "RegisterSize",
"NumElements": "RegisterSize / (IR::MultiplyOpSize(ElementSize, 2))",
"NumElements": "RegisterSize / (ElementSize << 1)",
"EmitValidation": [
"RegisterSize != FEXCore::IR::OpSize::i256Bit && \"What does 256-bit mean in this context?\""
]
@ -2594,7 +2594,7 @@
"F64->F32, F32->F16"
],
"DestSize": "RegisterSize",
"NumElements": "RegisterSize / (IR::DivideOpSize(ElementSize, 2))",
"NumElements": "RegisterSize / (ElementSize >> 1)",
"EmitValidation": [
"RegisterSize != FEXCore::IR::OpSize::i256Bit && \"What does 256-bit mean in this context?\""
]
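As a worked example of the rewritten expressions (hypothetical sizes): for a widening op such as VSXTL with RegisterSize = i128Bit and ElementSize = i16Bit the results are 32-bit wide, so RegisterSize / (ElementSize << 1) yields 16 / 4 = 4 elements; for a narrowing op such as VUShrNI with the same inputs the results are 8-bit wide, so RegisterSize / (ElementSize >> 1) yields 16 / 1 = 16 elements. Both match what the previous IR::MultiplyOpSize(ElementSize, 2) and IR::DivideOpSize(ElementSize, 2) spellings computed.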

View File

@ -112,17 +112,17 @@ static void PrintArg(fextl::stringstream* out, const IRListView* IR, OrderedNode
}
if (GetHasDest(IROp->Op)) {
uint32_t ElementSize = IROp->ElementSize;
uint32_t NumElements = IROp->Size;
if (!IROp->ElementSize) {
auto ElementSize = IROp->ElementSize;
uint32_t NumElements = 0;
if (IROp->ElementSize == OpSize::iUnsized) {
ElementSize = IROp->Size;
}
if (ElementSize) {
NumElements /= ElementSize;
if (ElementSize != OpSize::iUnsized) {
NumElements = IR::NumElements(IROp->Size, ElementSize);
}
*out << " i" << std::dec << (ElementSize * 8);
*out << " i" << std::dec << IR::OpSizeAsBits(ElementSize);
if (NumElements > 1) {
*out << "v" << std::dec << NumElements;
@ -296,11 +296,11 @@ void Dump(fextl::stringstream* out, const IRListView* IR, IR::RegisterAllocation
auto ElementSize = IROp->ElementSize;
uint8_t NumElements = 0;
if (!IROp->ElementSize) {
if (IROp->ElementSize == OpSize::iUnsized) {
ElementSize = IROp->Size;
}
if (ElementSize) {
if (ElementSize != OpSize::iUnsized) {
NumElements = IR::NumElements(IROp->Size, ElementSize);
}
@ -324,7 +324,7 @@ void Dump(fextl::stringstream* out, const IRListView* IR, IR::RegisterAllocation
}
}
*out << " i" << std::dec << (ElementSize * 8);
*out << " i" << std::dec << IR::OpSizeAsBits(ElementSize);
if (NumElements > 1) {
*out << "v" << std::dec << NumElements;
@ -334,16 +334,16 @@ void Dump(fextl::stringstream* out, const IRListView* IR, IR::RegisterAllocation
} else {
auto ElementSize = IROp->ElementSize;
if (!IROp->ElementSize) {
if (IROp->ElementSize == OpSize::iUnsized) {
ElementSize = IROp->Size;
}
uint32_t NumElements = 0;
if (ElementSize) {
if (ElementSize != OpSize::iUnsized) {
NumElements = IR::NumElements(IROp->Size, ElementSize);
}
*out << "(%" << std::dec << ID << ' ';
*out << 'i' << std::dec << (ElementSize * 8);
*out << 'i' << std::dec << IR::OpSizeAsBits(ElementSize);
if (NumElements > 1) {
*out << 'v' << std::dec << NumElements;
}
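The type suffix this builds can be sketched with a simplified, self-contained model of the dumper (byte counts stand in for OpSize values; this is not the real IRDumper code):

#include <cstdint>
#include <iostream>
#include <string>

// 0 plays the role of OpSize::iUnsized here.
std::string FormatType(uint32_t OpBytes, uint32_t ElementBytes) {
  if (ElementBytes == 0) {
    ElementBytes = OpBytes;          // unsized element: treat the whole op as one element
  }
  uint32_t NumElements = ElementBytes ? OpBytes / ElementBytes : 0;
  std::string Out = "i" + std::to_string(ElementBytes * 8);
  if (NumElements > 1) {
    Out += "v" + std::to_string(NumElements);
  }
  return Out;
}

int main() {
  std::cout << FormatType(16, 4) << '\n'; // "i32v4": 128-bit op with 32-bit elements
  std::cout << FormatType(8, 0) << '\n';  // "i64":   unsized 64-bit scalar
}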

View File

@ -71,19 +71,18 @@ public:
return _Jump(InvalidNode);
}
IRPair<IROp_CondJump> _CondJump(Ref ssa0, CondClassType cond = {COND_NEQ}) {
return _CondJump(ssa0, _Constant(0), InvalidNode, InvalidNode, cond, IR::SizeToOpSize(GetOpSize(ssa0)));
return _CondJump(ssa0, _Constant(0), InvalidNode, InvalidNode, cond, GetOpSize(ssa0));
}
IRPair<IROp_CondJump> _CondJump(Ref ssa0, Ref ssa1, Ref ssa2, CondClassType cond = {COND_NEQ}) {
return _CondJump(ssa0, _Constant(0), ssa1, ssa2, cond, IR::SizeToOpSize(GetOpSize(ssa0)));
return _CondJump(ssa0, _Constant(0), ssa1, ssa2, cond, GetOpSize(ssa0));
}
// TODO: Work to remove this implicit sized Select implementation.
IRPair<IROp_Select> _Select(uint8_t Cond, Ref ssa0, Ref ssa1, Ref ssa2, Ref ssa3, uint8_t CompareSize = 0) {
if (CompareSize == 0) {
CompareSize = std::max<uint8_t>(4, std::max<uint8_t>(GetOpSize(ssa0), GetOpSize(ssa1)));
IRPair<IROp_Select> _Select(uint8_t Cond, Ref ssa0, Ref ssa1, Ref ssa2, Ref ssa3, IR::OpSize CompareSize = OpSize::iUnsized) {
if (CompareSize == OpSize::iUnsized) {
CompareSize = std::max(OpSize::i32Bit, std::max(GetOpSize(ssa0), GetOpSize(ssa1)));
}
return _Select(IR::SizeToOpSize(std::max<uint8_t>(4, std::max<uint8_t>(GetOpSize(ssa2), GetOpSize(ssa3)))),
IR::SizeToOpSize(CompareSize), CondClassType {Cond}, ssa0, ssa1, ssa2, ssa3);
return _Select(std::max(OpSize::i32Bit, std::max(GetOpSize(ssa2), GetOpSize(ssa3))), CompareSize, CondClassType {Cond}, ssa0, ssa1, ssa2, ssa3);
}
IRPair<IROp_LoadMem> _LoadMem(FEXCore::IR::RegisterClassType Class, IR::OpSize Size, Ref ssa0, IR::OpSize Align = OpSize::i8Bit) {
return _LoadMem(Class, Size, ssa0, Invalid(), Align, MEM_OFFSET_SXTX, 1);
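With the new signature, OpSize::iUnsized plays the role the old CompareSize == 0 sentinel did: leaving the default in place compares at max(i32Bit, max(GetOpSize(ssa0), GetOpSize(ssa1))), so two 16-bit operands are still compared at 32-bit width while a 64-bit operand widens the compare to 64-bit, and the select's result size is likewise clamped to at least 32-bit from its two result operands.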

View File

@ -29,7 +29,7 @@ $end_info$
namespace FEXCore::IR {
uint64_t getMask(IROp_Header* Op) {
uint64_t NumBits = Op->Size * 8;
uint64_t NumBits = IR::OpSizeAsBits(Op->Size);
return (~0ULL) >> (64 - NumBits);
}
@ -91,7 +91,7 @@ private:
// We don't allow 8/16-bit operations to have constants, since no
// constant would be in bounds after the JIT's 24/16 shift.
auto Filter = [&IROp](uint64_t X) {
return ARMEmitter::IsImmAddSub(X) && IROp->Size >= 4;
return ARMEmitter::IsImmAddSub(X) && IROp->Size >= OpSize::i32Bit;
};
return InlineIf(IREmit, CurrentIR, CodeNode, IROp, Index, Filter);
@ -112,7 +112,7 @@ private:
IsSIMM9 &= (SupportsTSOImm9 || !TSO);
// Extended offsets for regular loadstore only.
bool IsExtended = (Imm & (IROp->Size - 1)) == 0 && Imm / IROp->Size <= 4095;
bool IsExtended = (Imm & (IR::OpSizeToSize(IROp->Size) - 1)) == 0 && Imm / IR::OpSizeToSize(IROp->Size) <= 4095;
IsExtended &= !TSO;
if (IsSIMM9 || IsExtended) {
@ -204,7 +204,7 @@ void ConstProp::ConstantPropagation(IREmitter* IREmit, const IRListView& Current
/* IsImmAddSub assumes the constants are sign-extended, take care of that
* here so we get the optimization for 32-bit adds too.
*/
if (Op->Header.Size == 4) {
if (Op->Header.Size == OpSize::i32Bit) {
Constant1 = (int64_t)(int32_t)Constant1;
Constant2 = (int64_t)(int32_t)Constant2;
}
@ -290,12 +290,12 @@ void ConstProp::ConstantPropagation(IREmitter* IREmit, const IRListView& Current
}
if (!Replaced) {
InlineIf(IREmit, CurrentIR, CodeNode, IROp, 1, [&IROp](uint64_t X) { return IsImmLogical(X, IROp->Size * 8); });
InlineIf(IREmit, CurrentIR, CodeNode, IROp, 1, [&IROp](uint64_t X) { return IsImmLogical(X, IR::OpSizeAsBits(IROp->Size)); });
}
break;
}
case OP_OR: {
InlineIf(IREmit, CurrentIR, CodeNode, IROp, 1, [&IROp](uint64_t X) { return IsImmLogical(X, IROp->Size * 8); });
InlineIf(IREmit, CurrentIR, CodeNode, IROp, 1, [&IROp](uint64_t X) { return IsImmLogical(X, IR::OpSizeAsBits(IROp->Size)); });
break;
}
case OP_XOR: {
@ -325,7 +325,7 @@ void ConstProp::ConstantPropagation(IREmitter* IREmit, const IRListView& Current
}
if (!Replaced) {
InlineIf(IREmit, CurrentIR, CodeNode, IROp, 1, [&IROp](uint64_t X) { return IsImmLogical(X, IROp->Size * 8); });
InlineIf(IREmit, CurrentIR, CodeNode, IROp, 1, [&IROp](uint64_t X) { return IsImmLogical(X, IR::OpSizeAsBits(IROp->Size)); });
}
}
break;
@ -333,7 +333,7 @@ void ConstProp::ConstantPropagation(IREmitter* IREmit, const IRListView& Current
case OP_ANDWITHFLAGS:
case OP_ANDN:
case OP_TESTNZ: {
InlineIf(IREmit, CurrentIR, CodeNode, IROp, 1, [&IROp](uint64_t X) { return IsImmLogical(X, IROp->Size * 8); });
InlineIf(IREmit, CurrentIR, CodeNode, IROp, 1, [&IROp](uint64_t X) { return IsImmLogical(X, IR::OpSizeAsBits(IROp->Size)); });
break;
}
case OP_NEG: {
@ -356,7 +356,7 @@ void ConstProp::ConstantPropagation(IREmitter* IREmit, const IRListView& Current
if (IREmit->IsValueConstant(IROp->Args[0], &Constant1) && IREmit->IsValueConstant(IROp->Args[1], &Constant2)) {
// Shifts mask the shift amount by 63 or 31 depending on operating size;
uint64_t ShiftMask = IROp->Size == 8 ? 63 : 31;
uint64_t ShiftMask = IROp->Size == OpSize::i64Bit ? 63 : 31;
uint64_t NewConstant = (Constant1 << (Constant2 & ShiftMask)) & getMask(IROp);
IREmit->ReplaceWithConstant(CodeNode, NewConstant);
} else if (IREmit->IsValueConstant(IROp->Args[1], &Constant2) && Constant2 == 0) {
@ -384,7 +384,7 @@ void ConstProp::ConstantPropagation(IREmitter* IREmit, const IRListView& Current
auto Op = IROp->C<IR::IROp_Bfe>();
uint64_t Constant;
if (IROp->Size <= 8 && IREmit->IsValueConstant(Op->Src, &Constant)) {
if (IROp->Size <= OpSize::i64Bit && IREmit->IsValueConstant(Op->Src, &Constant)) {
uint64_t SourceMask = Op->Width == 64 ? ~0ULL : ((1ULL << Op->Width) - 1);
SourceMask <<= Op->lsb;
@ -400,7 +400,7 @@ void ConstProp::ConstantPropagation(IREmitter* IREmit, const IRListView& Current
if (IREmit->IsValueConstant(Op->Src, &Constant)) {
// SBFE of a constant can be converted to a constant.
uint64_t SourceMask = Op->Width == 64 ? ~0ULL : ((1ULL << Op->Width) - 1);
uint64_t DestSizeInBits = IROp->Size * 8;
uint64_t DestSizeInBits = IR::OpSizeAsBits(IROp->Size);
uint64_t DestMask = DestSizeInBits == 64 ? ~0ULL : ((1ULL << DestSizeInBits) - 1);
SourceMask <<= Op->lsb;
@ -424,11 +424,11 @@ void ConstProp::ConstantPropagation(IREmitter* IREmit, const IRListView& Current
uint64_t NewConstant = SourceMask << Op->lsb;
if (ConstantSrc & 1) {
auto orr = IREmit->_Or(IR::SizeToOpSize(IROp->Size), CurrentIR.GetNode(IROp->Args[0]), IREmit->_Constant(NewConstant));
auto orr = IREmit->_Or(IROp->Size, CurrentIR.GetNode(IROp->Args[0]), IREmit->_Constant(NewConstant));
IREmit->ReplaceAllUsesWith(CodeNode, orr);
} else {
// We are wanting to clear the bitfield.
auto andn = IREmit->_Andn(IR::SizeToOpSize(IROp->Size), CurrentIR.GetNode(IROp->Args[0]), IREmit->_Constant(NewConstant));
auto andn = IREmit->_Andn(IROp->Size, CurrentIR.GetNode(IROp->Args[0]), IREmit->_Constant(NewConstant));
IREmit->ReplaceAllUsesWith(CodeNode, andn);
}
}
@ -596,7 +596,7 @@ void ConstProp::ConstantPropagation(IREmitter* IREmit, const IRListView& Current
case OP_SELECT: {
InlineIf(IREmit, CurrentIR, CodeNode, IROp, 1, ARMEmitter::IsImmAddSub);
uint64_t AllOnes = IROp->Size == 8 ? 0xffff'ffff'ffff'ffffull : 0xffff'ffffull;
uint64_t AllOnes = IROp->Size == OpSize::i64Bit ? 0xffff'ffff'ffff'ffffull : 0xffff'ffffull;
uint64_t Constant2 {};
uint64_t Constant3 {};
@ -614,7 +614,7 @@ void ConstProp::ConstantPropagation(IREmitter* IREmit, const IRListView& Current
// We always allow source 1 to be zero, but source 0 can only be a
// special 1/~0 constant if source 1 is 0.
if (InlineIfZero(IREmit, CurrentIR, CodeNode, IROp, 1)) {
uint64_t AllOnes = IROp->Size == 8 ? 0xffff'ffff'ffff'ffffull : 0xffff'ffffull;
uint64_t AllOnes = IROp->Size == OpSize::i64Bit ? 0xffff'ffff'ffff'ffffull : 0xffff'ffffull;
InlineIf(IREmit, CurrentIR, CodeNode, IROp, 0, [&AllOnes](uint64_t X) { return X == 1 || X == AllOnes; });
}
break;
@ -632,7 +632,7 @@ void ConstProp::ConstantPropagation(IREmitter* IREmit, const IRListView& Current
auto EO = NewRIP->C<IR::IROp_EntrypointOffset>();
IREmit->SetWriteCursor(CurrentIR.GetNode(Op->NewRIP));
IREmit->ReplaceNodeArgument(CodeNode, 0, IREmit->_InlineEntrypointOffset(IR::SizeToOpSize(EO->Header.Size), EO->Offset));
IREmit->ReplaceNodeArgument(CodeNode, 0, IREmit->_InlineEntrypointOffset(EO->Header.Size, EO->Offset));
}
}
break;
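A quick standalone check of the mask and shift-amount math used by this pass, with OpSizeAsBits replaced by a plain bit count (illustration only, not the pass's code):

#include <cassert>
#include <cstdint>

// NumBits corresponds to IR::OpSizeAsBits(Op->Size) in getMask above.
constexpr uint64_t GetMask(uint32_t NumBits) {
  return (~0ULL) >> (64 - NumBits);
}

int main() {
  assert(GetMask(32) == 0xffff'ffffULL);           // i32Bit op
  assert(GetMask(64) == 0xffff'ffff'ffff'ffffULL); // i64Bit op: shift by zero, full mask
  // The shift-amount masking mirrors the 63/31 selection for OP_LSHL:
  assert((1u << (33u & 31u)) == 2u);               // 32-bit shifts wrap the amount at 31
}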

View File

@ -79,12 +79,12 @@ void IRValidation::Run(IREmitter* IREmit) {
for (auto [CodeNode, IROp] : CurrentIR.GetCode(BlockNode)) {
const auto ID = CurrentIR.GetID(CodeNode);
const uint8_t OpSize = IROp->Size;
const auto OpSize = IROp->Size;
if (GetHasDest(IROp->Op)) {
HadError |= OpSize == 0;
HadError |= OpSize == IR::OpSize::iInvalid;
// Does the op have a destination of size 0?
if (OpSize == 0) {
if (OpSize == IR::OpSize::iInvalid) {
Errors << "%" << ID << ": Had destination but with no size" << std::endl;
}

View File

@ -521,7 +521,7 @@ void DeadFlagCalculationEliminination::FoldBranch(IREmitter* IREmit, IRListView&
// Pattern match a branch fed by a compare. We could also handle bit tests
// here, but tbz/tbnz has a limited offset range which we don't have a way to
// deal with yet. Let's hope that's not a big deal.
if (!(Op->Cond == COND_NEQ || Op->Cond == COND_EQ) || (Prev->Size < 4)) {
if (!(Op->Cond == COND_NEQ || Op->Cond == COND_EQ) || (Prev->Size < OpSize::i32Bit)) {
return;
}
@ -534,7 +534,7 @@ void DeadFlagCalculationEliminination::FoldBranch(IREmitter* IREmit, IRListView&
IREmit->ReplaceNodeArgument(CodeNode, 0, CurrentIR.GetNode(Prev->Args[0]));
IREmit->ReplaceNodeArgument(CodeNode, 1, CurrentIR.GetNode(Prev->Args[1]));
Op->FromNZCV = false;
Op->CompareSize = IR::SizeToOpSize(Prev->Size);
Op->CompareSize = Prev->Size;
} else {
return;
}
@ -612,7 +612,7 @@ bool DeadFlagCalculationEliminination::ProcessBlock(IREmitter* IREmit, IRListVie
// this flag is outside of the if, since the TestNZ might result from
// optimizing AndWithFlags, and we need to converge locally in a single
// iteration.
if (IROp->Op == OP_TESTNZ && IROp->Size < 4 && !(FlagsRead & (FLAG_N | FLAG_C))) {
if (IROp->Op == OP_TESTNZ && IROp->Size < OpSize::i32Bit && !(FlagsRead & (FLAG_N | FLAG_C))) {
IROp->Op = OP_TESTZ;
}
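In other words, an EQ/NEQ conditional branch whose flags come straight from a preceding NZCV compare of at least 32-bit width is rewritten to compare the original operands directly, and CompareSize is now carried over as an OpSize rather than converted from a raw byte count; smaller compares and other condition codes are left untouched.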

View File

@ -582,7 +582,7 @@ void ConstrainedRAPass::Run(IREmitter* IREmit_) {
if (Reg.Class == FPRFixedClass) {
IROp_Header* Header = IR->GetOp<IROp_Header>(Old);
Copy = IREmit->_VMov(IR::SizeToOpSize(Header->Size), Map(Old));
Copy = IREmit->_VMov(Header->Size, Map(Old));
} else {
Copy = IREmit->_Copy(Map(Old));
}

View File

@ -731,7 +731,7 @@ void X87StackOptimization::Run(IREmitter* Emit) {
} else {
auto* SourceNode = CurrentIR.GetNode(Op->X80Src);
auto* OriginalNode = CurrentIR.GetNode(Op->OriginalValue);
StackData.push(StackMemberInfo {SourceNode, OriginalNode, SizeToOpSize(Op->LoadSize), Op->Float});
StackData.push(StackMemberInfo {SourceNode, OriginalNode, Op->LoadSize, Op->Float});
}
break;
}
@ -793,7 +793,7 @@ void X87StackOptimization::Run(IREmitter* Emit) {
// or similar. As long as the source size and dest size are one and the same.
// This will avoid any conversions between source and stack element size and conversion back.
if (!SlowPath && Value->Source && Value->Source->first == Op->StoreSize && Value->InterpretAsFloat) {
IREmit->_StoreMem(Value->InterpretAsFloat ? FPRClass : GPRClass, IR::SizeToOpSize(Op->StoreSize), AddrNode, Value->Source->second);
IREmit->_StoreMem(Value->InterpretAsFloat ? FPRClass : GPRClass, Op->StoreSize, AddrNode, Value->Source->second);
} else {
if (ReducedPrecisionMode) {
switch (Op->StoreSize) {
@ -826,7 +826,7 @@ void X87StackOptimization::Run(IREmitter* Emit) {
auto DestAddr = IREmit->_Add(OpSize::i64Bit, AddrNode, GetConstant(8));
IREmit->_StoreMem(GPRClass, OpSize::i16Bit, DestAddr, Upper, OpSize::i64Bit);
} else {
IREmit->_StoreMem(FPRClass, IR::SizeToOpSize(Op->StoreSize), AddrNode, StackNode);
IREmit->_StoreMem(FPRClass, Op->StoreSize, AddrNode, StackNode);
}
}
}
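To summarize the two store paths: when the pushed value still has its original source, that source's size matches Op->StoreSize, and it was loaded as a float, the fast path forwards the source to a single _StoreMem of StoreSize with no format conversion; otherwise the value is read back off the stack, converted as needed, and for the 80-bit case the upper 16 bits are stored separately at offset 8 past the 64-bit low half.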