mirror of
https://github.com/PCSX2/pcsx2-sourceforge.git
synced 2026-02-04 03:11:18 +01:00
Fixed some of the bugs I made; some changes by gabest are in here too :)
This commit is contained in:
10
pcsx2/Misc.h
10
pcsx2/Misc.h
@@ -208,7 +208,7 @@ __forceinline void memcpy_pcsx2(void* dest, const void* src, size_t n)
|
||||
//FreezeMMXRegs(1); // mmx not used
|
||||
FreezeXMMRegs(1);
|
||||
memcpy(dest, src, n);
|
||||
// have to be unfroze by parent call!
|
||||
// have to be unfrozen by parent call!
|
||||
}
|
||||
#else
|
||||
#define memcpy_pcsx2 memcpy
|
||||
@@ -221,7 +221,8 @@ __forceinline void memcpy_pcsx2(void* dest, const void* src, size_t n)
|
||||
#if defined(_WIN32) && !defined(__x86_64__)
|
||||
// faster memcpy
|
||||
void * memcpy_amd_(void *dest, const void *src, size_t n);
|
||||
#define memcpy_fast memcpy_amd_
|
||||
//#define memcpy_fast memcpy_amd_
|
||||
#define memcpy_fast memcpy
|
||||
#else
|
||||
// for now disable linux fast memcpy
|
||||
#define memcpy_fast memcpy_pcsx2
|
||||
@@ -269,8 +270,9 @@ extern __forceinline void pcsx2_aligned_free(void* pmem)
|
||||
|
||||
// cross-platform atomic operations
|
||||
#if defined (_WIN32)
|
||||
|
||||
/*
|
||||
#ifndef __x86_64__ // for some reason x64 doesn't like this
|
||||
|
||||
LONG __cdecl _InterlockedIncrement(LONG volatile *Addend);
|
||||
LONG __cdecl _InterlockedDecrement(LONG volatile *Addend);
|
||||
LONG __cdecl _InterlockedCompareExchange(LPLONG volatile Dest, LONG Exchange, LONG Comp);
|
||||
@@ -286,7 +288,7 @@ LONG __cdecl _InterlockedAnd(LPLONG volatile Addend, LONG Value);
|
||||
|
||||
#pragma intrinsic (_InterlockedExchangeAdd)
|
||||
#define InterlockedExchangeAdd _InterlockedExchangeAdd
|
||||
|
||||
*/
|
||||
#else
|
||||
|
||||
typedef void* PVOID;
|
||||
|
||||
@@ -71,6 +71,7 @@ BOOL CALLBACK CpuDlgProc(HWND hW, UINT uMsg, WPARAM wParam, LPARAM lParam)
|
||||
if(cpucaps.hasStreamingSIMDExtensions) strcat(features,",SSE");
|
||||
if(cpucaps.hasStreamingSIMD2Extensions) strcat(features,",SSE2");
|
||||
if(cpucaps.hasStreamingSIMD3Extensions) strcat(features,",SSE3");
|
||||
if(cpucaps.hasStreamingSIMD4Extensions) strcat(features,",SSE4.1");
|
||||
// if(cpucaps.has3DNOWInstructionExtensions) strcat(features,",3DNOW");
|
||||
// if(cpucaps.has3DNOWInstructionExtensionsExt)strcat(features,",3DNOW+");
|
||||
if(cpucaps.hasAMD64BitArchitecture) strcat(features,",x86-64");
|
||||
|
||||
@@ -1315,6 +1315,41 @@ extern "C" void cpudetectSSE3(void* pfnCallSSE3)
|
||||
#endif
|
||||
}
|
||||
|
||||
// Runtime SSE4.1 probe hook -- intentionally a no-op.
//
// pfnCallSSE4: pointer to a stub that executes an SSE4.1 instruction. It is
//              accepted for interface compatibility but never called.
//
// The previous body began with an unconditional `return;`, so everything after
// it was unreachable: forcing cpucaps.hasStreamingSIMD4Extensions to 1, the
// MSVC __try/__except call through pfnCallSSE4 (clearing the flag and
// re-installing the custom handler on a fault), and the non-MSVC
// PCSX2_FORCESSE4 branch. That dead code is removed here; behaviour is
// unchanged. cpucaps.hasStreamingSIMD4Extensions is left exactly as the caller
// found it (cpudetectInit fills it from the CPUID feature flags).
extern "C" void cpudetectSSE4(void* pfnCallSSE4)
{
	(void)pfnCallSSE4; // unused while the execute-and-catch probe is disabled
}
|
||||
|
||||
struct BASEBLOCKS
|
||||
{
|
||||
// 0 - ee, 1 - iop
|
||||
|
||||
@@ -396,6 +396,7 @@ BASEBLOCKEX** GetAllBaseBlocks(int* pnum, int cpu);
|
||||
|
||||
void SetMMXstate();
|
||||
void cpudetectSSE3(void* pfnCallSSE3);
|
||||
void cpudetectSSE4(void* pfnCallSSE4);
|
||||
|
||||
void _recMove128MtoM(u32 to, u32 from);
|
||||
|
||||
|
||||
@@ -985,7 +985,7 @@ void recRSQRT_S_xmm(int info)
|
||||
switch(info & (PROCESS_EE_S|PROCESS_EE_T) ) {
|
||||
case PROCESS_EE_S:
|
||||
if( EEREC_D == EEREC_S ) {
|
||||
///SysPrintf("RSQRT1\n");
|
||||
SysPrintf("RSQRT1\n");
|
||||
SSE_SQRTSS_M32_to_XMM(t0reg, (uptr)&fpuRegs.fpr[_Ft_]);
|
||||
SSE_DIVSS_XMM_to_XMM(EEREC_D, t0reg);
|
||||
}
|
||||
@@ -998,15 +998,19 @@ void recRSQRT_S_xmm(int info)
|
||||
|
||||
break;
|
||||
case PROCESS_EE_T:
|
||||
//SysPrintf("RSQRT3\n");
|
||||
SysPrintf("RSQRT3\n");
|
||||
if(EEREC_D == EEREC_T) {
|
||||
SSE_SQRTSS_XMM_to_XMM(t0reg, EEREC_T);
|
||||
SSE_MOVSS_M32_to_XMM(EEREC_D, (uptr)&fpuRegs.fpr[_Fs_]);
|
||||
}
|
||||
}else
|
||||
if(EEREC_D == EEREC_S) {
|
||||
SSE_SQRTSS_XMM_to_XMM(t0reg, EEREC_T);
|
||||
}
|
||||
else SSE_MOVSS_M32_to_XMM(EEREC_D, (uptr)&fpuRegs.fpr[_Fs_]);
|
||||
else {
|
||||
SysPrintf("RSQ3 Whoops\n");
|
||||
SSE_SQRTSS_XMM_to_XMM(t0reg, EEREC_T);
|
||||
SSE_MOVSS_M32_to_XMM(EEREC_D, (uptr)&fpuRegs.fpr[_Fs_]);
|
||||
}
|
||||
|
||||
SSE_DIVSS_XMM_to_XMM(EEREC_D, t0reg);
|
||||
break;
|
||||
@@ -1034,7 +1038,10 @@ void recRSQRT_S_xmm(int info)
|
||||
SSE_SQRTSS_XMM_to_XMM(t0reg, EEREC_T);
|
||||
}
|
||||
else {*/
|
||||
SSE_SQRTSS_M32_to_XMM(t0reg, (uptr)&fpuRegs.fpr[_Ft_]);
|
||||
//SSE_MOVSS_XMM_to_XMM(t0reg, EEREC_D);
|
||||
SSE_SQRTSS_M32_to_XMM(t0reg, (uptr)&fpuRegs.fpr[_Ft_]);
|
||||
|
||||
|
||||
//}
|
||||
|
||||
SysPrintf("RSQRT4\n");
|
||||
|
||||
@@ -133,20 +133,41 @@ static int SSEmovMask[ 16 ][ 4 ] =
|
||||
// Merge case 0000: emits no code, so dest is left untouched.
// NOTE(review): presumably the 4-bit label is the field write mask and 0000
// means "no fields taken from src" -- confirm against the caller's table.
void VU_MERGE0(int dest, int src)
{
}
|
||||
void VU_MERGE1(int dest, int src) { // 1000
|
||||
SSE_MOVHLPS_XMM_to_XMM(src, dest);
|
||||
SSE_SHUFPS_XMM_to_XMM(dest, src, 0xc4);
|
||||
if( cpucaps.hasStreamingSIMD4Extensions )
|
||||
{
|
||||
SSE4_INSERTPS_XMM_to_XMM(dest, src, _MM_MK_INSERTPS_NDX(3, 3, 0));
|
||||
}
|
||||
else
|
||||
{
|
||||
SSE_MOVHLPS_XMM_to_XMM(src, dest);
|
||||
SSE_SHUFPS_XMM_to_XMM(dest, src, 0xc4);
|
||||
}
|
||||
}
|
||||
void VU_MERGE2(int dest, int src) { // 0100
|
||||
SSE_MOVHLPS_XMM_to_XMM(src, dest);
|
||||
SSE_SHUFPS_XMM_to_XMM(dest, src, 0x64);
|
||||
if( cpucaps.hasStreamingSIMD4Extensions )
|
||||
{
|
||||
SSE4_INSERTPS_XMM_to_XMM(dest, src, _MM_MK_INSERTPS_NDX(2, 2, 0));
|
||||
}
|
||||
else
|
||||
{
|
||||
SSE_MOVHLPS_XMM_to_XMM(src, dest);
|
||||
SSE_SHUFPS_XMM_to_XMM(dest, src, 0x64);
|
||||
}
|
||||
}
|
||||
// Merge case 1100: a single SHUFPS dest, src with immediate 0xe4.
// With that immediate SHUFPS keeps lanes 0-1 of its first operand and takes
// lanes 2-3 from its second operand (assuming the emitter maps its arguments
// to the instruction operands in the same order -- TODO confirm).
void VU_MERGE3(int dest, int src)
{
	const int shufKeepLowTakeHigh = 0xe4;

	SSE_SHUFPS_XMM_to_XMM(dest, src, shufKeepLowTakeHigh);
}
|
||||
void VU_MERGE4(int dest, int src) { // 0010s
|
||||
SSE_MOVSS_XMM_to_XMM(src, dest);
|
||||
SSE_SHUFPS_XMM_to_XMM(src, dest, 0xe4);
|
||||
SSE_MOVAPS_XMM_to_XMM(dest, src);
|
||||
if( cpucaps.hasStreamingSIMD4Extensions )
|
||||
{
|
||||
SSE4_INSERTPS_XMM_to_XMM(dest, src, _MM_MK_INSERTPS_NDX(1, 1, 0));
|
||||
}
|
||||
else
|
||||
{
|
||||
SSE_MOVSS_XMM_to_XMM(src, dest);
|
||||
SSE_SHUFPS_XMM_to_XMM(src, dest, 0xe4);
|
||||
SSE_MOVAPS_XMM_to_XMM(dest, src);
|
||||
}
|
||||
}
|
||||
void VU_MERGE5(int dest, int src) { // 1010
|
||||
SSE_SHUFPS_XMM_to_XMM(dest, src, 0xd8);
|
||||
@@ -221,6 +242,7 @@ void _unpackVF_xyzw(int dstreg, int srcreg, int xyzw)
|
||||
}
|
||||
}
|
||||
else {
|
||||
/*
|
||||
switch (xyzw) {
|
||||
case 0:
|
||||
SSE3_MOVSLDUP_XMM_to_XMM(dstreg, srcreg);
|
||||
@@ -239,35 +261,60 @@ void _unpackVF_xyzw(int dstreg, int srcreg, int xyzw)
|
||||
SSE_MOVHLPS_XMM_to_XMM(dstreg, dstreg);
|
||||
break;
|
||||
}
|
||||
*/
|
||||
switch (xyzw) {
|
||||
case 0:
|
||||
SSE2_PSHUFD_XMM_to_XMM(dstreg, srcreg, 0x00);
|
||||
break;
|
||||
case 1:
|
||||
SSE2_PSHUFD_XMM_to_XMM(dstreg, srcreg, 0x55);
|
||||
break;
|
||||
case 2:
|
||||
SSE2_PSHUFD_XMM_to_XMM(dstreg, srcreg, 0xaa);
|
||||
break;
|
||||
case 3:
|
||||
SSE2_PSHUFD_XMM_to_XMM(dstreg, srcreg, 0xff);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void _unpackVFSS_xyzw(int dstreg, int srcreg, int xyzw)
|
||||
{
|
||||
switch (xyzw) {
|
||||
case 0:
|
||||
if( dstreg != srcreg ) SSE_MOVAPS_XMM_to_XMM(dstreg, srcreg);
|
||||
break;
|
||||
case 1:
|
||||
if( cpucaps.hasStreamingSIMD3Extensions ) SSE3_MOVSHDUP_XMM_to_XMM(dstreg, srcreg);
|
||||
else {
|
||||
if( cpucaps.hasStreamingSIMD4Extensions ) {
|
||||
switch (xyzw) {
|
||||
case 0: SSE4_INSERTPS_XMM_to_XMM(dstreg, srcreg, _MM_MK_INSERTPS_NDX(0, 0, 0)); break;
|
||||
case 1: SSE4_INSERTPS_XMM_to_XMM(dstreg, srcreg, _MM_MK_INSERTPS_NDX(1, 0, 0)); break;
|
||||
case 2: SSE4_INSERTPS_XMM_to_XMM(dstreg, srcreg, _MM_MK_INSERTPS_NDX(2, 0, 0)); break;
|
||||
case 3: SSE4_INSERTPS_XMM_to_XMM(dstreg, srcreg, _MM_MK_INSERTPS_NDX(3, 0, 0)); break;
|
||||
}
|
||||
}
|
||||
else {
|
||||
switch (xyzw) {
|
||||
case 0:
|
||||
if( dstreg != srcreg ) SSE_MOVAPS_XMM_to_XMM(dstreg, srcreg);
|
||||
SSE_SHUFPS_XMM_to_XMM(dstreg, dstreg, 0x55);
|
||||
}
|
||||
break;
|
||||
case 2:
|
||||
SSE_MOVHLPS_XMM_to_XMM(dstreg, srcreg);
|
||||
break;
|
||||
case 3:
|
||||
if( cpucaps.hasStreamingSIMD3Extensions && dstreg != srcreg ) {
|
||||
SSE3_MOVSHDUP_XMM_to_XMM(dstreg, srcreg);
|
||||
SSE_MOVHLPS_XMM_to_XMM(dstreg, dstreg);
|
||||
}
|
||||
else {
|
||||
if( dstreg != srcreg ) SSE_MOVAPS_XMM_to_XMM(dstreg, srcreg);
|
||||
SSE_SHUFPS_XMM_to_XMM(dstreg, dstreg, 0xff);
|
||||
}
|
||||
break;
|
||||
break;
|
||||
case 1:
|
||||
if( cpucaps.hasStreamingSIMD3Extensions ) SSE3_MOVSHDUP_XMM_to_XMM(dstreg, srcreg);
|
||||
else {
|
||||
if( dstreg != srcreg ) SSE_MOVAPS_XMM_to_XMM(dstreg, srcreg);
|
||||
SSE_SHUFPS_XMM_to_XMM(dstreg, dstreg, 0x55);
|
||||
}
|
||||
break;
|
||||
case 2:
|
||||
SSE_MOVHLPS_XMM_to_XMM(dstreg, srcreg);
|
||||
break;
|
||||
case 3:
|
||||
if( cpucaps.hasStreamingSIMD3Extensions && dstreg != srcreg ) {
|
||||
SSE3_MOVSHDUP_XMM_to_XMM(dstreg, srcreg);
|
||||
SSE_MOVHLPS_XMM_to_XMM(dstreg, dstreg);
|
||||
}
|
||||
else {
|
||||
if( dstreg != srcreg ) SSE_MOVAPS_XMM_to_XMM(dstreg, srcreg);
|
||||
SSE_SHUFPS_XMM_to_XMM(dstreg, dstreg, 0xff);
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -927,6 +974,7 @@ int _vuGetTempXMMreg(int info)
|
||||
|
||||
if( _hasFreeXMMreg() ) {
|
||||
t1reg = _allocTempXMMreg(XMMT_FPS, -1);
|
||||
/*
|
||||
if( t1reg == EEREC_TEMP && _hasFreeXMMreg() ) {
|
||||
int t = _allocTempXMMreg(XMMT_FPS, -1);
|
||||
_freeXMMreg(t1reg);
|
||||
@@ -937,6 +985,18 @@ int _vuGetTempXMMreg(int info)
|
||||
_freeXMMreg(t1reg);
|
||||
t1reg = -1;
|
||||
}
|
||||
*/
|
||||
if( t1reg == EEREC_TEMP ) {
|
||||
if( _hasFreeXMMreg() ) {
|
||||
int t = _allocTempXMMreg(XMMT_FPS, -1);
|
||||
_freeXMMreg(t1reg);
|
||||
t1reg = t;
|
||||
}
|
||||
else {
|
||||
_freeXMMreg(t1reg);
|
||||
t1reg = -1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return t1reg;
|
||||
@@ -3073,6 +3133,8 @@ void recVUMI_CLIP(VURegs *VU, int info)
|
||||
|
||||
void recVUMI_DIV(VURegs *VU, int info)
|
||||
{
|
||||
int t1reg;
|
||||
|
||||
if( _Fs_ == 0 ) {
|
||||
|
||||
if( _Ft_ == 0 ) {
|
||||
@@ -3101,10 +3163,31 @@ void recVUMI_DIV(VURegs *VU, int info)
|
||||
// don't use RCPSS (very bad precision)
|
||||
SSE_MOVSS_M32_to_XMM(EEREC_TEMP, (uptr)&VU->VF[0].UL[3]);
|
||||
|
||||
if( _Ftf_ == 0 || (xmmregs[EEREC_T].mode & MODE_WRITE) ) {
|
||||
if( _Ftf_ ) SSE_SHUFPS_XMM_to_XMM(EEREC_T, EEREC_T, (0xe4e4>>(2*_Ftf_))&0xff);
|
||||
SSE_DIVSS_XMM_to_XMM(EEREC_TEMP, EEREC_T);
|
||||
if( _Ftf_ ) SSE_SHUFPS_XMM_to_XMM(EEREC_T, EEREC_T, (0xe4e4>>(8-2*_Ftf_))&0xff);
|
||||
if( _Ftf_ != 0 || (xmmregs[EEREC_T].mode & MODE_WRITE) )
|
||||
{
|
||||
if( _Ftf_ )
|
||||
{
|
||||
t1reg = _vuGetTempXMMreg(info);
|
||||
|
||||
if( t1reg >= 0 )
|
||||
{
|
||||
_unpackVFSS_xyzw(t1reg, EEREC_T, _Ftf_);
|
||||
|
||||
SSE_DIVSS_XMM_to_XMM(EEREC_TEMP, t1reg);
|
||||
|
||||
_freeXMMreg(t1reg);
|
||||
}
|
||||
else
|
||||
{
|
||||
SSE_SHUFPS_XMM_to_XMM(EEREC_T, EEREC_T, (0xe4e4>>(2*_Ftf_))&0xff);
|
||||
SSE_DIVSS_XMM_to_XMM(EEREC_TEMP, EEREC_T);
|
||||
SSE_SHUFPS_XMM_to_XMM(EEREC_T, EEREC_T, (0xe4e4>>(8-2*_Ftf_))&0xff); // revert
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
SSE_DIVSS_XMM_to_XMM(EEREC_TEMP, EEREC_T);
|
||||
}
|
||||
}
|
||||
else {
|
||||
SSE_DIVSS_M32_to_XMM(EEREC_TEMP, (uptr)&VU->VF[_Ft_].UL[_Ftf_]);
|
||||
@@ -3136,14 +3219,33 @@ void recVUMI_DIV(VURegs *VU, int info)
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
if( _Fsf_ == 0 ) SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
|
||||
else _unpackVF_xyzw(EEREC_TEMP, EEREC_S, _Fsf_);
|
||||
|
||||
if( _Ftf_ ) SSE_SHUFPS_XMM_to_XMM(EEREC_T, EEREC_T, (0xe4e4>>(2*_Ftf_))&0xff);
|
||||
SSE_DIVSS_XMM_to_XMM(EEREC_TEMP, EEREC_T);
|
||||
if( _Ftf_ )
|
||||
{
|
||||
t1reg = _vuGetTempXMMreg(info);
|
||||
|
||||
// revert
|
||||
if( _Ftf_ ) SSE_SHUFPS_XMM_to_XMM(EEREC_T, EEREC_T, (0xe4e4>>(8-2*_Ftf_))&0xff);
|
||||
if( t1reg >= 0 )
|
||||
{
|
||||
_unpackVFSS_xyzw(t1reg, EEREC_T, _Ftf_);
|
||||
|
||||
SSE_DIVSS_XMM_to_XMM(EEREC_TEMP, t1reg);
|
||||
|
||||
_freeXMMreg(t1reg);
|
||||
}
|
||||
else
|
||||
{
|
||||
SSE_SHUFPS_XMM_to_XMM(EEREC_T, EEREC_T, (0xe4e4>>(2*_Ftf_))&0xff);
|
||||
SSE_DIVSS_XMM_to_XMM(EEREC_TEMP, EEREC_T);
|
||||
SSE_SHUFPS_XMM_to_XMM(EEREC_T, EEREC_T, (0xe4e4>>(8-2*_Ftf_))&0xff); // revert
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
SSE_DIVSS_XMM_to_XMM(EEREC_TEMP, EEREC_T);
|
||||
}
|
||||
}
|
||||
|
||||
//if( !CHECK_FORCEABS ) {
|
||||
@@ -3226,11 +3328,11 @@ void recVUMI_RSQRT(VURegs *VU, int info)
|
||||
if( _Fsf_ == 3 ) {
|
||||
if(_Ft_ != 0 ||_Ftf_ == 3 )
|
||||
{
|
||||
//SysPrintf("_Fs_ = 0.3 _Ft_ != 0 || _Ft_ = 0.3 \n");
|
||||
SysPrintf("_Fs_ = 0.3 _Ft_ != 0 || _Ft_ = 0.3 \n");
|
||||
SSE_SQRTSS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP); //Dont use RSQRT, terrible accuracy
|
||||
SSE_MOVSS_XMM_to_M32(VU_VI_ADDR(REG_Q, 0), EEREC_TEMP);
|
||||
SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
|
||||
_unpackVF_xyzw(EEREC_TEMP, EEREC_TEMP, _Fsf_);
|
||||
//SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
|
||||
_unpackVF_xyzw(EEREC_TEMP, EEREC_S, _Fsf_);
|
||||
SSE_DIVSS_M32_to_XMM(EEREC_TEMP, VU_VI_ADDR(REG_Q, 0));
|
||||
|
||||
|
||||
@@ -3283,7 +3385,7 @@ void recVUMI_RSQRT(VURegs *VU, int info)
|
||||
}
|
||||
|
||||
}
|
||||
//SysPrintf("Normal RSQRT\n");
|
||||
SysPrintf("Normal RSQRT\n");
|
||||
SSE_RSQRTSS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP);
|
||||
if( _Fsf_ ) SSE_SHUFPS_XMM_to_XMM(EEREC_S, EEREC_S, (0xe4e4>>(2*_Fsf_))&0xff);
|
||||
SSE_MULSS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
|
||||
@@ -4681,23 +4783,32 @@ void recVUMI_WAITP(VURegs *VU, int info)
|
||||
// in all EFU insts, EEREC_D is a temp reg
|
||||
void vuSqSumXYZ(int regd, int regs, int regtemp)
|
||||
{
|
||||
SSE_MOVAPS_XMM_to_XMM(regtemp, regs);
|
||||
SSE_MULPS_XMM_to_XMM(regtemp, regtemp);
|
||||
if( cpucaps.hasStreamingSIMD4Extensions )
|
||||
{
|
||||
SSE_MOVAPS_XMM_to_XMM(regd, regs);
|
||||
SSE4_DPPS_XMM_to_XMM(regd, regd, 0x71);
|
||||
}
|
||||
else
|
||||
{
|
||||
SSE_MOVAPS_XMM_to_XMM(regtemp, regs);
|
||||
SSE_MULPS_XMM_to_XMM(regtemp, regtemp);
|
||||
|
||||
if( cpucaps.hasStreamingSIMD3Extensions ) {
|
||||
SSE3_HADDPS_XMM_to_XMM(regd, regtemp);
|
||||
SSE_ADDPS_XMM_to_XMM(regd, regtemp); // regd.z = x+y+z
|
||||
SSE_MOVHLPS_XMM_to_XMM(regd, regd); // move to x
|
||||
if( cpucaps.hasStreamingSIMD3Extensions ) {
|
||||
SSE3_HADDPS_XMM_to_XMM(regd, regtemp);
|
||||
SSE_ADDPS_XMM_to_XMM(regd, regtemp); // regd.z = x+y+z
|
||||
SSE_MOVHLPS_XMM_to_XMM(regd, regd); // move to x
|
||||
}
|
||||
else {
|
||||
SSE_MOVSS_XMM_to_XMM(regd, regtemp);
|
||||
SSE_SHUFPS_XMM_to_XMM(regtemp, regtemp, 0xE1);
|
||||
SSE_ADDSS_XMM_to_XMM(regd, regtemp);
|
||||
SSE_SHUFPS_XMM_to_XMM(regtemp, regtemp, 0xD2);
|
||||
SSE_ADDSS_XMM_to_XMM(regd, regtemp);
|
||||
SSE_SHUFPS_XMM_to_XMM(regtemp, regtemp, 0xC6);
|
||||
}
|
||||
}
|
||||
else {
|
||||
SSE_MOVSS_XMM_to_XMM(regd, regtemp);
|
||||
SSE_SHUFPS_XMM_to_XMM(regtemp, regtemp, 0xE1);
|
||||
SSE_ADDSS_XMM_to_XMM(regd, regtemp);
|
||||
SSE_SHUFPS_XMM_to_XMM(regtemp, regtemp, 0xD2);
|
||||
SSE_ADDSS_XMM_to_XMM(regd, regtemp);
|
||||
SSE_SHUFPS_XMM_to_XMM(regtemp, regtemp, 0xC6);
|
||||
}
|
||||
//SysPrintf("SUMXYZ\n");
|
||||
|
||||
//SysPrintf("SUMXYZ\n");
|
||||
}
|
||||
|
||||
void recVUMI_ESADD( VURegs *VU, int info)
|
||||
@@ -4717,24 +4828,34 @@ void recVUMI_ESADD( VURegs *VU, int info)
|
||||
void recVUMI_ERSADD( VURegs *VU, int info )
|
||||
{
|
||||
assert( VU == &VU1 );
|
||||
// almost same as vuSqSumXYZ
|
||||
SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
|
||||
SSE_MULPS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP);
|
||||
|
||||
if( cpucaps.hasStreamingSIMD3Extensions ) {
|
||||
SSE3_HADDPS_XMM_to_XMM(EEREC_D, EEREC_TEMP);
|
||||
SSE_ADDPS_XMM_to_XMM(EEREC_D, EEREC_TEMP); // EEREC_D.z = x+y+z
|
||||
SSE_MOVSS_M32_to_XMM(EEREC_TEMP, (uptr)&VU->VF[0].UL[3]);
|
||||
SSE_MOVHLPS_XMM_to_XMM(EEREC_D, EEREC_D); // move to x
|
||||
// almost same as vuSqSumXYZ
|
||||
|
||||
if( cpucaps.hasStreamingSIMD4Extensions )
|
||||
{
|
||||
SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
|
||||
SSE4_DPPS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP, 0x71);
|
||||
}
|
||||
else {
|
||||
SSE_MOVHLPS_XMM_to_XMM(EEREC_D, EEREC_TEMP);
|
||||
SSE_ADDSS_XMM_to_XMM(EEREC_D, EEREC_TEMP);
|
||||
SSE_SHUFPS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP, 0x55);
|
||||
SSE_ADDSS_XMM_to_XMM(EEREC_D, EEREC_TEMP);
|
||||
SSE_MOVSS_M32_to_XMM(EEREC_TEMP, (uptr)&VU->VF[0].UL[3]);
|
||||
else
|
||||
{
|
||||
SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
|
||||
SSE_MULPS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP);
|
||||
|
||||
if( cpucaps.hasStreamingSIMD3Extensions ) {
|
||||
SSE3_HADDPS_XMM_to_XMM(EEREC_D, EEREC_TEMP);
|
||||
SSE_ADDPS_XMM_to_XMM(EEREC_D, EEREC_TEMP); // EEREC_D.z = x+y+z
|
||||
SSE_MOVHLPS_XMM_to_XMM(EEREC_D, EEREC_D); // move to x
|
||||
}
|
||||
else {
|
||||
SSE_MOVHLPS_XMM_to_XMM(EEREC_D, EEREC_TEMP);
|
||||
SSE_ADDSS_XMM_to_XMM(EEREC_D, EEREC_TEMP);
|
||||
SSE_SHUFPS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP, 0x55);
|
||||
SSE_ADDSS_XMM_to_XMM(EEREC_D, EEREC_TEMP);
|
||||
}
|
||||
}
|
||||
|
||||
SSE_MOVSS_M32_to_XMM(EEREC_TEMP, (uptr)&VU->VF[0].UL[3]);
|
||||
|
||||
// don't use RCPSS (very bad precision)
|
||||
SSE_DIVSS_XMM_to_XMM(EEREC_TEMP, EEREC_D);
|
||||
CheckForOverflowSS_(EEREC_TEMP, EEREC_D);
|
||||
@@ -4756,9 +4877,9 @@ void recVUMI_ELENG( VURegs *VU, int info )
|
||||
void recVUMI_ERLENG( VURegs *VU, int info )
|
||||
{
|
||||
assert( VU == &VU1 );
|
||||
vuSqSumXYZ(EEREC_D, EEREC_S, EEREC_TEMP);
|
||||
//SysPrintf("ERLENG\n");
|
||||
SSE_SQRTSS_XMM_to_XMM(EEREC_TEMP, EEREC_D);
|
||||
vuSqSumXYZ(EEREC_TEMP, EEREC_S, EEREC_TEMP); //Dont want to use EEREC_D incase it overwrites something
|
||||
SysPrintf("ERLENG\n");
|
||||
SSE_SQRTSS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP);
|
||||
SSE_MOVSS_XMM_to_M32(VU_VI_ADDR(REG_P, 0), EEREC_TEMP);
|
||||
SSE_MOVSS_M32_to_XMM(EEREC_TEMP, (uptr)&VU->VF[0].UL[3]);
|
||||
SSE_DIVSS_M32_to_XMM(EEREC_TEMP, VU_VI_ADDR(REG_P, 0));
|
||||
@@ -4946,7 +5067,9 @@ void recVUMI_ERSQRT( VURegs *VU, int info )
|
||||
//SSE_CMPNESS_XMM_to_XMM(EEREC_D, EEREC_TEMP);
|
||||
SysPrintf("ERSQRT\n");
|
||||
SSE_SQRTSS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP);
|
||||
SSE_DIVSS_M32_to_XMM(EEREC_TEMP, (uptr)&VU->VF[0].UL[3]);
|
||||
SSE_MOVSS_XMM_to_M32(VU_VI_ADDR(REG_P, 0), EEREC_TEMP);
|
||||
SSE_MOVSS_M32_to_XMM(EEREC_TEMP, (uptr)&VU->VF[0].UL[3]);
|
||||
SSE_DIVSS_M32_to_XMM(EEREC_TEMP, VU_VI_ADDR(REG_P, 0));
|
||||
//SSE_ANDPS_XMM_to_XMM(EEREC_TEMP, EEREC_D);
|
||||
}
|
||||
else {
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -234,6 +234,7 @@ void cpudetectInit( void )
|
||||
cpuinfo.x86PType = (regs[ 0 ] >> 12) & 0x3;
|
||||
x86_64_8BITBRANDID = regs[1] & 0xff;
|
||||
cpuinfo.x86Flags = regs[ 3 ];
|
||||
cpuinfo.x86Flags2 = regs[ 2 ];
|
||||
}
|
||||
}
|
||||
if ( iCpuId( 0x80000000, regs ) != -1 )
|
||||
@@ -302,6 +303,7 @@ void cpudetectInit( void )
|
||||
cpucaps.hasFastStreamingSIMDExtensionsSaveRestore = ( cpuinfo.x86Flags >> 24 ) & 1;
|
||||
cpucaps.hasStreamingSIMDExtensions = ( cpuinfo.x86Flags >> 25 ) & 1; //sse
|
||||
cpucaps.hasStreamingSIMD2Extensions = ( cpuinfo.x86Flags >> 26 ) & 1; //sse2
|
||||
cpucaps.hasStreamingSIMD4Extensions = ( cpuinfo.x86Flags2 >> 19 ) & 1; //sse4.1
|
||||
cpucaps.hasSelfSnoop = ( cpuinfo.x86Flags >> 27 ) & 1;
|
||||
cpucaps.hasHyperThreading = ( cpuinfo.x86Flags >> 28 ) & 1;
|
||||
cpucaps.hasThermalMonitor = ( cpuinfo.x86Flags >> 29 ) & 1;
|
||||
|
||||
@@ -437,6 +437,7 @@ void SSE_ORPS_XMM_to_XMM( x86SSERegType to, x86SSERegType from ) { SSERtoR( 0x5
|
||||
//**********************************************************************************/
|
||||
//XORPS : Bitwise Logical XOR of Single-Precision FP Values *
|
||||
//**********************************************************************************
|
||||
|
||||
void SSE_XORPS_M128_to_XMM( x86SSERegType to, uptr from ) { SSEMtoR( 0x570f, 0 ); }
|
||||
void SSE_XORPS_XMM_to_XMM( x86SSERegType to, x86SSERegType from ) { SSERtoR( 0x570f ); }
|
||||
|
||||
@@ -1149,6 +1150,45 @@ void SSE3_MOVSLDUP_M128_to_XMM(x86SSERegType to, uptr from) { write8(0xf3); SSEM
|
||||
void SSE3_MOVSHDUP_XMM_to_XMM(x86SSERegType to, x86SSERegType from) { write8(0xf3); SSERtoR(0x160f); }
|
||||
void SSE3_MOVSHDUP_M128_to_XMM(x86SSERegType to, uptr from) { write8(0xf3); SSEMtoR(0x160f, 0); }
|
||||
|
||||
// SSE4.1
|
||||
|
||||
// DPPS xmm(to), xmm(from), imm8 -- SSE4.1 packed single-precision dot product.
// Encoding: 66 [REX] 0F 3A 40 /r ib
//
// to    : destination XMM register (ModRM.reg); also the first source.
// from  : second source XMM register (ModRM.rm).
// imm8  : per the Intel SDM, bits 7:4 select which lanes are multiplied and
//         summed, bits 3:0 select which destination lanes receive the sum
//         (the remaining lanes are zeroed).
//
// RexRB is issued between the 0x66 prefix and the opcode, exactly as the
// sibling SSE4_INSERTPS_XMM_to_XMM emitter does, so both register-to-register
// SSE4 forms handle the extended register numbers the same way.
void SSE4_DPPS_XMM_to_XMM(x86SSERegType to, x86SSERegType from, u8 imm8)
{
	write8(0x66);
	RexRB(0, to, from);
	write24(0x403A0F);
	ModRM(3, to, from);
	write8(imm8);
}
|
||||
|
||||
// DPPS xmm(to), m128(from), imm8 -- SSE4.1 dot product with a memory operand.
// Encoding: 66 0F 3A 40 /r ib, with ModRM mod=0 and rm=DISP32 (absolute disp32).
//
// to    : destination XMM register (ModRM.reg).
// from  : address of the 128-bit memory operand.
// imm8  : DPPS lane-select / write mask (see the Intel SDM).
//
// One immediate byte follows the 32-bit displacement, so the length handed to
// MEMADDR is 4 + 1. The original passed 4 + 0 under a "TODO: x64?" note.
// NOTE(review): this assumes MEMADDR's second argument is the number of bytes
// from the start of the displacement to the end of the instruction (used for
// RIP-relative addressing on x86-64) and is ignored on 32-bit builds -- confirm
// against the MEMADDR definition.
// TODO: x64 -- no REX prefix is emitted here, so only xmm0-xmm7 are encodable.
void SSE4_DPPS_M128_to_XMM(x86SSERegType to, uptr from, u8 imm8)
{
	const int overb = 1; // the trailing imm8 byte

	write8(0x66);
	write24(0x403A0F);
	ModRM(0, to, DISP32);
	write32(MEMADDR(from, 4 + overb));
	write8(imm8);
}
|
||||
|
||||
// INSERTPS xmm(to), xmm(from), imm8 -- SSE4.1 single-lane insert.
// Encoding: 66 [REX] 0F 3A 21 /r ib
//
// to    : destination XMM register (ModRM.reg).
// from  : source XMM register (ModRM.rm).
// imm8  : per the Intel SDM, bits 7:6 pick the source lane, bits 5:4 the
//         destination lane, bits 3:0 a zero mask applied to the result.
void SSE4_INSERTPS_XMM_to_XMM(x86SSERegType to, x86SSERegType from, u8 imm8)
{
	// Three opcode bytes packed into one value; presumably written low byte
	// first, giving 0F 3A 21 in the instruction stream.
	const u32 opcodeInsertps = 0x213A0F;

	write8(0x66);               // mandatory prefix
	RexRB(0, to, from);
	write24(opcodeInsertps);
	ModRM(3, to, from);         // mod=3: register-direct
	write8(imm8);
}
|
||||
|
||||
// EXTRACTPS r32(to), xmm(from), imm8 -- SSE4.1 extract of one 32-bit lane.
// Encoding: 66 [REX] 0F 3A 17 /r ib
//
// to    : destination general-purpose register.
// from  : source XMM register.
// imm8  : bits 1:0 select the lane of `from` that is copied to `to`.
//
// Unlike the other reg-reg SSE forms in this file, EXTRACTPS puts the
// DESTINATION (the GPR) in ModRM.rm and the SOURCE (the XMM register) in
// ModRM.reg. For example `extractps eax, xmm1, 0` is 66 0F 3A 17 C8 00, where
// C8 = mod 3, reg 1 (xmm1), rm 0 (eax). The original passed (to, from) to
// ModRM and RexRB, which encoded the two operands swapped.
void SSE4_EXTRACTPS_XMM_to_R32(x86IntRegType to, x86SSERegType from, u8 imm8)
{
	write8(0x66);
	RexRB(0, from, to);    // reg field = XMM source, rm/base field = GPR destination
	write24(0x173A0F);
	ModRM(3, from, to);
	write8(imm8);
}
|
||||
|
||||
// SSE-X
|
||||
void SSEX_MOVDQA_M128_to_XMM( x86SSERegType to, uptr from )
|
||||
{
|
||||
|
||||
Reference in New Issue
Block a user