Merge pull request #2861 from Sonicadvance1/fix_vector_shift_by_zero

FEXCore: Fixes vector shifts by zero
This commit is contained in:
Mai 2023-08-09 05:52:10 -04:00 committed by GitHub
commit c77ed78f5a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 294 additions and 32 deletions

View File

@ -2474,11 +2474,25 @@ DEF_OP(VUShrI) {
if (HostSupportsSVE && Is256Bit) {
const auto Mask = PRED_TMP_32B.Merging();
// SVE LSR is destructive, so lets set up the destination.
movprfx(Dst.Z(), Vector.Z());
lsr(SubRegSize, Dst.Z(), Mask, Dst.Z(), BitShift);
if (BitShift == 0) {
if (Dst.Idx() != Vector.Idx()) {
mov(Dst.Z(), Vector.Z());
}
}
else {
// SVE LSR is destructive, so lets set up the destination.
movprfx(Dst.Z(), Vector.Z());
lsr(SubRegSize, Dst.Z(), Mask, Dst.Z(), BitShift);
}
} else {
ushr(SubRegSize, Dst.Q(), Vector.Q(), BitShift);
if (BitShift == 0) {
if (Dst.Idx() != Vector.Idx()) {
mov(Dst.Q(), Vector.Q());
}
}
else {
ushr(SubRegSize, Dst.Q(), Vector.Q(), BitShift);
}
}
}
}
@ -2504,11 +2518,25 @@ DEF_OP(VSShrI) {
if (HostSupportsSVE && Is256Bit) {
const auto Mask = PRED_TMP_32B.Merging();
// SVE ASR is destructive, so lets set up the destination.
movprfx(Dst.Z(), Vector.Z());
asr(SubRegSize, Dst.Z(), Mask, Dst.Z(), Shift);
if (Shift == 0) {
if (Dst.Idx() != Vector.Idx()) {
mov(Dst.Z(), Vector.Z());
}
}
else {
// SVE ASR is destructive, so lets set up the destination.
movprfx(Dst.Z(), Vector.Z());
asr(SubRegSize, Dst.Z(), Mask, Dst.Z(), Shift);
}
} else {
sshr(SubRegSize, Dst.Q(), Vector.Q(), Shift);
if (Shift == 0) {
if (Dst.Idx() != Vector.Idx()) {
mov(Dst.Q(), Vector.Q());
}
}
else {
sshr(SubRegSize, Dst.Q(), Vector.Q(), Shift);
}
}
}
@ -2537,12 +2565,25 @@ DEF_OP(VShlI) {
if (HostSupportsSVE && Is256Bit) {
const auto Mask = PRED_TMP_32B.Merging();
// SVE LSL is destructive, so lets set up the destination.
movprfx(Dst.Z(), Vector.Z());
lsl(SubRegSize, Dst.Z(), Mask, Dst.Z(), BitShift);
if (BitShift == 0) {
if (Dst.Idx() != Vector.Idx()) {
mov(Dst.Z(), Vector.Z());
}
}
else {
// SVE LSL is destructive, so lets set up the destination.
movprfx(Dst.Z(), Vector.Z());
lsl(SubRegSize, Dst.Z(), Mask, Dst.Z(), BitShift);
}
} else {
shl(SubRegSize, Dst.Q(), Vector.Q(), BitShift);
if (BitShift == 0) {
if (Dst.Idx() != Vector.Idx()) {
mov(Dst.Q(), Vector.Q());
}
}
else {
shl(SubRegSize, Dst.Q(), Vector.Q(), BitShift);
}
}
}
}

View File

@ -1591,7 +1591,10 @@ void OpDispatchBuilder::PSRLI(OpcodeArgs) {
auto Size = GetSrcSize(Op);
auto Shift = _VUShrI(Size, ElementSize, Dest, ShiftConstant);
auto Shift = Dest;
if (ShiftConstant != 0) {
Shift = _VUShrI(Size, ElementSize, Dest, ShiftConstant);
}
StoreResult(FPRClass, Op, Shift, -1);
}
@ -1611,7 +1614,11 @@ void OpDispatchBuilder::VPSRLIOp(OpcodeArgs) {
const uint64_t ShiftConstant = Op->Src[1].Data.Literal.Value;
OrderedNode *Src = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags, -1);
OrderedNode *Result = _VUShrI(Size, ElementSize, Src, ShiftConstant);
OrderedNode *Result = Src;
if (ShiftConstant != 0) {
Result = _VUShrI(Size, ElementSize, Src, ShiftConstant);
}
if (Is128Bit) {
Result = _VMov(16, Result);
@ -1628,6 +1635,10 @@ void OpDispatchBuilder::VPSRLIOp<8>(OpcodeArgs);
OrderedNode* OpDispatchBuilder::PSLLIImpl(OpcodeArgs, size_t ElementSize,
OrderedNode *Src, uint64_t Shift) {
if (Shift == 0) {
// If zero-shift then just return the source.
return Src;
}
const auto Size = GetSrcSize(Op);
return _VShlI(Size, ElementSize, Src, Shift);
}
@ -1639,7 +1650,10 @@ void OpDispatchBuilder::PSLLI(OpcodeArgs) {
LOGMAN_THROW_A_FMT(Op->Src[1].IsLiteral(), "Src1 needs to be literal here");
const uint64_t ShiftConstant = Op->Src[1].Data.Literal.Value;
OrderedNode *Result = PSLLIImpl(Op, ElementSize, Dest, ShiftConstant);
OrderedNode *Result = Dest;
if (ShiftConstant != 0) {
Result = PSLLIImpl(Op, ElementSize, Dest, ShiftConstant);
}
StoreResult(FPRClass, Op, Result, -1);
}
@ -1660,7 +1674,11 @@ void OpDispatchBuilder::VPSLLIOp(OpcodeArgs) {
LOGMAN_THROW_A_FMT(Op->Src[1].IsLiteral(), "Src1 needs to be literal here");
const uint64_t ShiftConstant = Op->Src[1].Data.Literal.Value;
OrderedNode *Result = PSLLIImpl(Op, ElementSize, Src, ShiftConstant);
OrderedNode *Result = Src;
if (ShiftConstant != 0) {
Result = PSLLIImpl(Op, ElementSize, Src, ShiftConstant);
}
if (Is128Bit) {
Result = _VMov(16, Result);
}
@ -1819,9 +1837,12 @@ void OpDispatchBuilder::PSLLDQ(OpcodeArgs) {
auto Size = GetDstSize(Op);
OrderedNode *Result = _VectorZero(Size);
if (Shift < Size) {
Result = _VExtr(Size, 1, Dest, Result, Size - Shift);
OrderedNode *Result = Dest;
if (Shift != 0) {
Result = _VectorZero(Size);
if (Shift < Size) {
Result = _VExtr(Size, 1, Dest, Result, Size - Shift);
}
}
StoreResult(FPRClass, Op, Result, -1);
}
@ -1835,17 +1856,20 @@ void OpDispatchBuilder::VPSLLDQOp(OpcodeArgs) {
OrderedNode *Src = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags, -1);
OrderedNode *Result = _VectorZero(DstSize);
if (Is128Bit) {
if (Shift < DstSize) {
Result = _VExtr(DstSize, 1, Src, Result, DstSize - Shift);
}
} else {
if (Shift < Core::CPUState::XMM_SSE_REG_SIZE) {
OrderedNode *ResultBottom = _VExtr(16, 1, Src, Result, 16 - Shift);
OrderedNode* ResultTop = _VExtr(DstSize, 1, Src, Result, DstSize - Shift);
OrderedNode *Result = Src;
if (Shift != 0) {
Result = _VectorZero(DstSize);
if (Is128Bit) {
if (Shift < DstSize) {
Result = _VExtr(DstSize, 1, Src, Result, DstSize - Shift);
}
} else {
if (Shift < Core::CPUState::XMM_SSE_REG_SIZE) {
OrderedNode *ResultBottom = _VExtr(16, 1, Src, Result, 16 - Shift);
OrderedNode* ResultTop = _VExtr(DstSize, 1, Src, Result, DstSize - Shift);
Result = _VInsElement(DstSize, 16, 1, 0, ResultBottom, ResultTop);
Result = _VInsElement(DstSize, 16, 1, 0, ResultBottom, ResultTop);
}
}
}
StoreResult(FPRClass, Op, Result, -1);
@ -1860,7 +1884,10 @@ void OpDispatchBuilder::PSRAIOp(OpcodeArgs) {
auto Size = GetDstSize(Op);
auto Result = _VSShrI(Size, ElementSize, Dest, Shift);
auto Result = Dest;
if (Shift != 0) {
Result = _VSShrI(Size, ElementSize, Dest, Shift);
}
StoreResult(FPRClass, Op, Result, -1);
}
@ -1878,7 +1905,10 @@ void OpDispatchBuilder::VPSRAIOp(OpcodeArgs) {
const auto Is128Bit = Size == Core::CPUState::XMM_SSE_REG_SIZE;
OrderedNode *Src = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags, -1);
OrderedNode *Result = _VSShrI(Size, ElementSize, Src, Shift);
OrderedNode *Result = Src;
if (Shift != 0) {
Result = _VSShrI(Size, ElementSize, Src, Shift);
}
if (Is128Bit) {
Result = _VMov(16, Result);

View File

@ -0,0 +1,75 @@
%ifdef CONFIG
{
"RegData": {
"MM0": "0x4054c664c2f837b5",
"MM1": "0x40516053e2d6238e",
"MM2": "0x4044836d86ec17ec",
"MM3": "0x402a1e1c58255b03",
"MM4": "0x401568e0c9d9d346",
"MM5": "0x4035fe425aee6320",
"MM6": "0x402359003eea209b",
"MM7": "0x40154b7d41743e96",
"XMM0": ["0x4054c664c2f837b5", "0x40516053e2d6238e"],
"XMM1": ["0x4044836d86ec17ec", "0x402a1e1c58255b03"],
"XMM2": ["0x401568e0c9d9d346", "0x4035fe425aee6320"],
"XMM3": ["0x402359003eea209b", "0x40154b7d41743e96"],
"XMM4": ["0x403d075a31a4bdba", "0x4050a018bd66277c"],
"XMM5": ["0x40334ec17ebaf102", "0x4056d7404ea4a8c1"],
"XMM6": ["0x404439b5c7cd898b", "0x40497b136a400fbb"],
"XMM7": ["0x4040528bc169c23b", "0x4037f9ca18bd6627"],
"XMM8": ["0x4056a929888f861a", "0x403839b866e43aa8"],
"XMM9": ["0x4058bc1f212d7732", "0x4056cde5c91d14e4"]
}
}
%endif
; Regression test: FEX had a bug where immediate-encoded vector shifts by zero
; would generate bad code on AArch64.
; Every shift below uses an immediate of 0, so each register must still hold
; exactly the value that was loaded into it (this is what RegData checks).

; rdx = base address of the constant table at the end of this file.
lea rdx, [rel .data]
; mm0-mm7 take the first eight qwords of the table.
movq mm0, [rdx + 8 * 0]
movq mm1, [rdx + 8 * 1]
movq mm2, [rdx + 8 * 2]
movq mm3, [rdx + 8 * 3]
movq mm4, [rdx + 8 * 4]
movq mm5, [rdx + 8 * 5]
movq mm6, [rdx + 8 * 6]
movq mm7, [rdx + 8 * 7]
; xmm0-xmm9 take the first ten 16-byte rows of the same table.
movapd xmm0, [rdx + 16 * 0]
movapd xmm1, [rdx + 16 * 1]
movapd xmm2, [rdx + 16 * 2]
movapd xmm3, [rdx + 16 * 3]
movapd xmm4, [rdx + 16 * 4]
movapd xmm5, [rdx + 16 * 5]
movapd xmm6, [rdx + 16 * 6]
movapd xmm7, [rdx + 16 * 7]
movapd xmm8, [rdx + 16 * 8]
movapd xmm9, [rdx + 16 * 9]
; Test MMX first
; One register per instruction: left shifts (w/d/q), arithmetic right shifts
; (w/d) and logical right shifts (w/d/q).
psllw mm0, 0
pslld mm1, 0
psllq mm2, 0
psraw mm3, 0
psrad mm4, 0
psrlw mm5, 0
psrld mm6, 0
psrlq mm7, 0
; Now test XMM
; NOTE(review): pslldq is exercised twice (xmm3 and xmm9) while psrldq is never
; exercised - possibly a copy/paste slip; confirm whether psrldq was intended.
psllw xmm0, 0
pslld xmm1, 0
psllq xmm2, 0
pslldq xmm3, 0
psraw xmm4, 0
psrad xmm5, 0
psrlw xmm6, 0
psrld xmm7, 0
psrlq xmm8, 0
pslldq xmm9, 0
hlt
; Keep the table 16-byte aligned: the movapd loads above need an aligned operand.
align 16
; 512 bytes of random data (64 double-precision constants stored as qwords).
; Only the first 160 bytes are read by this test.
.data:
dq 83.0999,69.50512,41.02678,13.05881,5.35242,21.9932,9.67383,5.32372,29.02872,66.50151,19.30764,91.3633,40.45086,50.96153,32.64489,23.97574,90.64316,24.22547,98.9394,91.21715,90.80143,99.48407,64.97245,74.39838,35.22761,25.35321,5.8732,90.19956,33.03133,52.02952,58.38554,10.17531,47.84703,84.04831,90.02965,65.81329,96.27991,6.64479,25.58971,95.00694,88.1929,37.16964,49.52602,10.27223,77.70605,20.21439,9.8056,41.29389,15.4071,57.54286,9.61117,55.54302,52.90745,4.88086,72.52882,3.0201,56.55091,71.22749,61.84736,88.74295,47.72641,24.17404,33.70564,96.71303

View File

@ -0,0 +1,58 @@
%ifdef CONFIG
{
"HostFeatures": ["AVX"],
"RegData": {
"XMM0": ["0x4047dcfb00bcbe62", "0x40382c8de2ac3223", "0x4040da5269595fee", "0x40582da24894c448"],
"XMM1": ["0x404c46843808850a", "0x4051ce8f32378ab1", "0x404eec764adff823", "0x40562f8c7e28240b"],
"XMM2": ["0x404a7427525460aa", "0x4013860029f16b12", "0x405221d82fd75e20", "0x4008292a30553261"],
"XMM3": ["0x402ed06f69446738", "0x404cc57c6fbd273d", "0x402338eb463497b7", "0x404bc581adea8976"],
"XMM4": ["0x40536d2fec56d5d0", "0x403436e2435696e6", "0x40239c779a6b50b1", "0x4044a59e30014f8b"],
"XMM5": ["0x40560c58793dd97f", "0x404295b6c3760bf6", "0x4048c3549f94855e", "0x40248b61bb05faec"],
"XMM6": ["0x405811ea0ba1f4b2", "0x401a9443d46b26c0", "0x403996f73c0c1fc9", "0x4057c071b4784231"],
"XMM7": ["0x4047ec6b7aa25d8d", "0x4055031782d38477", "0x405681e5c91d14e4", "0x4050740cf1800a7c"],
"XMM10": ["0x40560c58793dd97f", "0x404295b6c3760bf6", "0x4048c3549f94855e", "0x40248b61bb05faec"],
"XMM11": ["0x40536d2fec56d5d0", "0x403436e2435696e6", "0x40239c779a6b50b1", "0x4044a59e30014f8b"],
"XMM12": ["0x402ed06f69446738", "0x404cc57c6fbd273d", "0x402338eb463497b7", "0x404bc581adea8976"],
"XMM13": ["0x404a7427525460aa", "0x4013860029f16b12", "0x405221d82fd75e20", "0x4008292a30553261"],
"XMM14": ["0x404c46843808850a", "0x4051ce8f32378ab1", "0x404eec764adff823", "0x40562f8c7e28240b"],
"XMM15": ["0x4047dcfb00bcbe62", "0x40382c8de2ac3223", "0x4040da5269595fee", "0x40582da24894c448"]
}
}
%endif
; Regression test: FEX had a bug where immediate-encoded vector shifts by zero
; would generate bad code on AArch64.
; This file covers the VEX-encoded 256-bit forms. Every shift uses an immediate
; of 0, so each destination must end up as an exact copy of its source register.

; rdx = base address of the constant table at the end of this file.
lea rdx, [rel .data]
; ymm0-ymm15 take the sixteen 32-byte rows of the table (all 512 bytes).
vmovapd ymm0, [rdx + 32 * 0]
vmovapd ymm1, [rdx + 32 * 1]
vmovapd ymm2, [rdx + 32 * 2]
vmovapd ymm3, [rdx + 32 * 3]
vmovapd ymm4, [rdx + 32 * 4]
vmovapd ymm5, [rdx + 32 * 5]
vmovapd ymm6, [rdx + 32 * 6]
vmovapd ymm7, [rdx + 32 * 7]
vmovapd ymm8, [rdx + 32 * 8]
vmovapd ymm9, [rdx + 32 * 9]
vmovapd ymm10, [rdx + 32 * 10]
vmovapd ymm11, [rdx + 32 * 11]
vmovapd ymm12, [rdx + 32 * 12]
vmovapd ymm13, [rdx + 32 * 13]
vmovapd ymm14, [rdx + 32 * 14]
vmovapd ymm15, [rdx + 32 * 15]
; Destination and source are always different registers, so a zero shift has
; to behave as a full register copy: ymm0-ymm7 receive the rows that were
; loaded into ymm15 down to ymm8.
; NOTE(review): vpslldq is exercised twice (ymm3 and ymm9) while vpsrldq is
; never exercised - possibly a copy/paste slip; confirm whether vpsrldq was
; intended.
vpsllw ymm0, ymm15, 0
vpslld ymm1, ymm14, 0
vpsllq ymm2, ymm13, 0
vpslldq ymm3, ymm12, 0
vpsraw ymm4, ymm11, 0
vpsrad ymm5, ymm10, 0
vpsrlw ymm6, ymm9, 0
vpsrld ymm7, ymm8, 0
; The last two read ymm7/ymm6, which were already overwritten just above, so
; ymm8/ymm9 simply get their own original rows back. RegData does not list
; XMM8/XMM9.
vpsrlq ymm8, ymm7, 0
vpslldq ymm9, ymm6, 0
hlt
; The 256-bit vmovapd loads above require a 32-byte aligned memory operand, so
; align the table to 32 bytes instead of relying on the code size happening to
; pad out to a multiple of 32.
align 32
; 512 bytes of random data (64 double-precision constants stored as qwords).
.data:
dq 83.0999,69.50512,41.02678,13.05881,5.35242,21.9932,9.67383,5.32372,29.02872,66.50151,19.30764,91.3633,40.45086,50.96153,32.64489,23.97574,90.64316,24.22547,98.9394,91.21715,90.80143,99.48407,64.97245,74.39838,35.22761,25.35321,5.8732,90.19956,33.03133,52.02952,58.38554,10.17531,47.84703,84.04831,90.02965,65.81329,96.27991,6.64479,25.58971,95.00694,88.1929,37.16964,49.52602,10.27223,77.70605,20.21439,9.8056,41.29389,15.4071,57.54286,9.61117,55.54302,52.90745,4.88086,72.52882,3.0201,56.55091,71.22749,61.84736,88.74295,47.72641,24.17404,33.70564,96.71303

View File

@ -0,0 +1,58 @@
%ifdef CONFIG
{
"HostFeatures": ["AVX"],
"RegData": {
"XMM0": ["0x4047dcfb00bcbe62", "0x40382c8de2ac3223", "0", "0"],
"XMM1": ["0x404c46843808850a", "0x4051ce8f32378ab1", "0", "0"],
"XMM2": ["0x404a7427525460aa", "0x4013860029f16b12", "0", "0"],
"XMM3": ["0x402ed06f69446738", "0x404cc57c6fbd273d", "0", "0"],
"XMM4": ["0x40536d2fec56d5d0", "0x403436e2435696e6", "0", "0"],
"XMM5": ["0x40560c58793dd97f", "0x404295b6c3760bf6", "0", "0"],
"XMM6": ["0x405811ea0ba1f4b2", "0x401a9443d46b26c0", "0", "0"],
"XMM7": ["0x4047ec6b7aa25d8d", "0x4055031782d38477", "0", "0"],
"XMM10": ["0x40560c58793dd97f", "0x404295b6c3760bf6", "0x4048c3549f94855e", "0x40248b61bb05faec"],
"XMM11": ["0x40536d2fec56d5d0", "0x403436e2435696e6", "0x40239c779a6b50b1", "0x4044a59e30014f8b"],
"XMM12": ["0x402ed06f69446738", "0x404cc57c6fbd273d", "0x402338eb463497b7", "0x404bc581adea8976"],
"XMM13": ["0x404a7427525460aa", "0x4013860029f16b12", "0x405221d82fd75e20", "0x4008292a30553261"],
"XMM14": ["0x404c46843808850a", "0x4051ce8f32378ab1", "0x404eec764adff823", "0x40562f8c7e28240b"],
"XMM15": ["0x4047dcfb00bcbe62", "0x40382c8de2ac3223", "0x4040da5269595fee", "0x40582da24894c448"]
}
}
%endif
; Regression test: FEX had a bug where immediate-encoded vector shifts by zero
; would generate bad code on AArch64.
; This file covers the VEX-encoded 128-bit forms. Every shift uses an immediate
; of 0, so the low 128 bits of each destination must equal the low 128 bits of
; its source, and RegData expects the upper 128 bits of ymm0-ymm7 to be zero.

; rdx = base address of the constant table at the end of this file.
lea rdx, [rel .data]
; ymm0-ymm15 take the sixteen 32-byte rows of the table (all 512 bytes), so
; every destination starts with non-zero upper bits.
vmovapd ymm0, [rdx + 32 * 0]
vmovapd ymm1, [rdx + 32 * 1]
vmovapd ymm2, [rdx + 32 * 2]
vmovapd ymm3, [rdx + 32 * 3]
vmovapd ymm4, [rdx + 32 * 4]
vmovapd ymm5, [rdx + 32 * 5]
vmovapd ymm6, [rdx + 32 * 6]
vmovapd ymm7, [rdx + 32 * 7]
vmovapd ymm8, [rdx + 32 * 8]
vmovapd ymm9, [rdx + 32 * 9]
vmovapd ymm10, [rdx + 32 * 10]
vmovapd ymm11, [rdx + 32 * 11]
vmovapd ymm12, [rdx + 32 * 12]
vmovapd ymm13, [rdx + 32 * 13]
vmovapd ymm14, [rdx + 32 * 14]
vmovapd ymm15, [rdx + 32 * 15]
; Destination and source are always different registers, so a zero shift has
; to behave as a 128-bit register copy: xmm0-xmm7 receive the low halves of
; the rows that were loaded into ymm15 down to ymm8.
; NOTE(review): vpslldq is exercised twice (xmm3 and xmm9) while vpsrldq is
; never exercised - possibly a copy/paste slip; confirm whether vpsrldq was
; intended.
vpsllw xmm0, xmm15, 0
vpslld xmm1, xmm14, 0
vpsllq xmm2, xmm13, 0
vpslldq xmm3, xmm12, 0
vpsraw xmm4, xmm11, 0
vpsrad xmm5, xmm10, 0
vpsrlw xmm6, xmm9, 0
vpsrld xmm7, xmm8, 0
; The last two read xmm7/xmm6, which were already overwritten just above.
; RegData does not list XMM8/XMM9.
vpsrlq xmm8, xmm7, 0
vpslldq xmm9, xmm6, 0
hlt
; The 256-bit vmovapd loads above require a 32-byte aligned memory operand, so
; align the table to 32 bytes instead of relying on the code size happening to
; pad out to a multiple of 32.
align 32
; 512 bytes of random data (64 double-precision constants stored as qwords).
.data:
dq 83.0999,69.50512,41.02678,13.05881,5.35242,21.9932,9.67383,5.32372,29.02872,66.50151,19.30764,91.3633,40.45086,50.96153,32.64489,23.97574,90.64316,24.22547,98.9394,91.21715,90.80143,99.48407,64.97245,74.39838,35.22761,25.35321,5.8732,90.19956,33.03133,52.02952,58.38554,10.17531,47.84703,84.04831,90.02965,65.81329,96.27991,6.64479,25.58971,95.00694,88.1929,37.16964,49.52602,10.27223,77.70605,20.21439,9.8056,41.29389,15.4071,57.54286,9.61117,55.54302,52.90745,4.88086,72.52882,3.0201,56.55091,71.22749,61.84736,88.74295,47.72641,24.17404,33.70564,96.71303