Fixed some of the bugs I made; some changes by gabest in here too :)

refractionpcsx2
2008-05-19 18:08:04 +00:00
parent 09fe8c869b
commit b8144ab706
10 changed files with 5409 additions and 5167 deletions


@@ -208,7 +208,7 @@ __forceinline void memcpy_pcsx2(void* dest, const void* src, size_t n)
//FreezeMMXRegs(1); // mmx not used
FreezeXMMRegs(1);
memcpy(dest, src, n);
// have to be unfroze by parent call!
// have to be unfrozen by parent call!
}
#else
#define memcpy_pcsx2 memcpy
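For context on the freeze call above: the recompiler keeps guest values live in XMM registers while generated code runs, so any C helper that may touch SSE state must save that state first. A minimal sketch of the intended pattern, assuming FreezeXMMRegs(0) is the matching unfreeze (per the "have to be unfrozen by parent call" comment):

	FreezeXMMRegs(1);      // save the XMM registers the recompiler has in use
	memcpy(dest, src, n);  // plain memcpy is now safe to run
	FreezeXMMRegs(0);      // restore -- done by the parent call, as the comment notes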
@@ -221,7 +221,8 @@ __forceinline void memcpy_pcsx2(void* dest, const void* src, size_t n)
#if defined(_WIN32) && !defined(__x86_64__)
// faster memcpy
void * memcpy_amd_(void *dest, const void *src, size_t n);
#define memcpy_fast memcpy_amd_
//#define memcpy_fast memcpy_amd_
#define memcpy_fast memcpy
#else
// for now disable linux fast memcpy
#define memcpy_fast memcpy_pcsx2
@@ -269,8 +270,9 @@ extern __forceinline void pcsx2_aligned_free(void* pmem)
// cross-platform atomic operations
#if defined (_WIN32)
/*
#ifndef __x86_64__ // for some reason x64 doesn't like this
LONG __cdecl _InterlockedIncrement(LONG volatile *Addend);
LONG __cdecl _InterlockedDecrement(LONG volatile *Addend);
LONG __cdecl _InterlockedCompareExchange(LPLONG volatile Dest, LONG Exchange, LONG Comp);
@@ -286,7 +288,7 @@ LONG __cdecl _InterlockedAnd(LPLONG volatile Addend, LONG Value);
#pragma intrinsic (_InterlockedExchangeAdd)
#define InterlockedExchangeAdd _InterlockedExchangeAdd
*/
#else
typedef void* PVOID;


@@ -71,6 +71,7 @@ BOOL CALLBACK CpuDlgProc(HWND hW, UINT uMsg, WPARAM wParam, LPARAM lParam)
if(cpucaps.hasStreamingSIMDExtensions) strcat(features,",SSE");
if(cpucaps.hasStreamingSIMD2Extensions) strcat(features,",SSE2");
if(cpucaps.hasStreamingSIMD3Extensions) strcat(features,",SSE3");
if(cpucaps.hasStreamingSIMD4Extensions) strcat(features,",SSE4.1");
// if(cpucaps.has3DNOWInstructionExtensions) strcat(features,",3DNOW");
// if(cpucaps.has3DNOWInstructionExtensionsExt)strcat(features,",3DNOW+");
if(cpucaps.hasAMD64BitArchitecture) strcat(features,",x86-64");


@@ -1315,6 +1315,41 @@ extern "C" void cpudetectSSE3(void* pfnCallSSE3)
#endif
}
extern "C" void cpudetectSSE4(void* pfnCallSSE4)
{
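// NOTE: the early return below disables this runtime probe; SSE4.1 support
// is instead read from the CPUID feature bit in cpudetectInit (see the
// cpuinfo.x86Flags2 change later in this commit).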
return;
cpucaps.hasStreamingSIMD4Extensions = 1;
#ifdef _MSC_VER
__try {
//__asm call pfnCallSSE4;
((TESTFNPTR)pfnCallSSE4)();
}
__except(EXCEPTION_EXECUTE_HANDLER) {
cpucaps.hasStreamingSIMD4Extensions = 0;
#ifdef PCSX2_VIRTUAL_MEM
// necessary since can potentially kill the custom handler
install_my_handler();
#endif
}
#else // linux
#ifdef PCSX2_FORCESSE4
cpucaps.hasStreamingSIMD4Extensions = 1;
#else
// exception handling doesn't work, so disable for x86 builds of linux
cpucaps.hasStreamingSIMD4Extensions = 0;
#endif
// try {
// __asm__("call *%0" : : "m"(pfnCallSSE4) );
// }
// catch(...) {
// SysPrintf("no SSE4.1 found\n");
// cpucaps.hasStreamingSIMD4Extensions = 0;
// }
#endif
}
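The probe pattern above detects a feature by executing one of its instructions and trapping the resulting illegal-instruction fault on older CPUs. A hedged sketch of what a pfnCallSSE4 stub could look like (hypothetical; the real stub is generated elsewhere):

	#include <smmintrin.h>

	static void sse41_probe(void)
	{
	    // INSERTPS is SSE4.1-only: on a pre-SSE4.1 CPU this raises #UD,
	    // which the __try/__except above turns into
	    // cpucaps.hasStreamingSIMD4Extensions = 0.
	    volatile __m128 v = _mm_insert_ps(_mm_setzero_ps(), _mm_setzero_ps(), 0);
	    (void)v;
	}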
struct BASEBLOCKS
{
// 0 - ee, 1 - iop


@@ -396,6 +396,7 @@ BASEBLOCKEX** GetAllBaseBlocks(int* pnum, int cpu);
void SetMMXstate();
void cpudetectSSE3(void* pfnCallSSE3);
void cpudetectSSE4(void* pfnCallSSE4);
void _recMove128MtoM(u32 to, u32 from);


@@ -985,7 +985,7 @@ void recRSQRT_S_xmm(int info)
switch(info & (PROCESS_EE_S|PROCESS_EE_T) ) {
case PROCESS_EE_S:
if( EEREC_D == EEREC_S ) {
///SysPrintf("RSQRT1\n");
SysPrintf("RSQRT1\n");
SSE_SQRTSS_M32_to_XMM(t0reg, (uptr)&fpuRegs.fpr[_Ft_]);
SSE_DIVSS_XMM_to_XMM(EEREC_D, t0reg);
}
@@ -998,15 +998,19 @@ void recRSQRT_S_xmm(int info)
break;
case PROCESS_EE_T:
//SysPrintf("RSQRT3\n");
SysPrintf("RSQRT3\n");
if(EEREC_D == EEREC_T) {
	SSE_SQRTSS_XMM_to_XMM(t0reg, EEREC_T);
	SSE_MOVSS_M32_to_XMM(EEREC_D, (uptr)&fpuRegs.fpr[_Fs_]);
}
else if(EEREC_D == EEREC_S) {
	SSE_SQRTSS_XMM_to_XMM(t0reg, EEREC_T);
}
else {
	SysPrintf("RSQ3 Whoops\n");
	SSE_SQRTSS_XMM_to_XMM(t0reg, EEREC_T);
	SSE_MOVSS_M32_to_XMM(EEREC_D, (uptr)&fpuRegs.fpr[_Fs_]);
}
SSE_DIVSS_XMM_to_XMM(EEREC_D, t0reg);
break;
@@ -1034,7 +1038,10 @@ void recRSQRT_S_xmm(int info)
SSE_SQRTSS_XMM_to_XMM(t0reg, EEREC_T);
}
else {*/
SSE_SQRTSS_M32_to_XMM(t0reg, (uptr)&fpuRegs.fpr[_Ft_]);
//SSE_MOVSS_XMM_to_XMM(t0reg, EEREC_D);
SSE_SQRTSS_M32_to_XMM(t0reg, (uptr)&fpuRegs.fpr[_Ft_]);
//}
SysPrintf("RSQRT4\n");


@@ -133,20 +133,41 @@ static int SSEmovMask[ 16 ][ 4 ] =
void VU_MERGE0(int dest, int src) { // 0000
}
void VU_MERGE1(int dest, int src) { // 1000
SSE_MOVHLPS_XMM_to_XMM(src, dest);
SSE_SHUFPS_XMM_to_XMM(dest, src, 0xc4);
if( cpucaps.hasStreamingSIMD4Extensions )
{
SSE4_INSERTPS_XMM_to_XMM(dest, src, _MM_MK_INSERTPS_NDX(3, 3, 0));
}
else
{
SSE_MOVHLPS_XMM_to_XMM(src, dest);
SSE_SHUFPS_XMM_to_XMM(dest, src, 0xc4);
}
}
void VU_MERGE2(int dest, int src) { // 0100
SSE_MOVHLPS_XMM_to_XMM(src, dest);
SSE_SHUFPS_XMM_to_XMM(dest, src, 0x64);
if( cpucaps.hasStreamingSIMD4Extensions )
{
SSE4_INSERTPS_XMM_to_XMM(dest, src, _MM_MK_INSERTPS_NDX(2, 2, 0));
}
else
{
SSE_MOVHLPS_XMM_to_XMM(src, dest);
SSE_SHUFPS_XMM_to_XMM(dest, src, 0x64);
}
}
void VU_MERGE3(int dest, int src) { // 1100
SSE_SHUFPS_XMM_to_XMM(dest, src, 0xe4);
}
void VU_MERGE4(int dest, int src) { // 0010s
SSE_MOVSS_XMM_to_XMM(src, dest);
SSE_SHUFPS_XMM_to_XMM(src, dest, 0xe4);
SSE_MOVAPS_XMM_to_XMM(dest, src);
if( cpucaps.hasStreamingSIMD4Extensions )
{
SSE4_INSERTPS_XMM_to_XMM(dest, src, _MM_MK_INSERTPS_NDX(1, 1, 0));
}
else
{
SSE_MOVSS_XMM_to_XMM(src, dest);
SSE_SHUFPS_XMM_to_XMM(src, dest, 0xe4);
SSE_MOVAPS_XMM_to_XMM(dest, src);
}
}
void VU_MERGE5(int dest, int src) { // 1010
SSE_SHUFPS_XMM_to_XMM(dest, src, 0xd8);
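_MM_MK_INSERTPS_NDX (from smmintrin.h) builds the INSERTPS immediate as (srcField << 6) | (dstField << 4) | zeroMask, so each SSE4.1 branch above replaces exactly one destination lane in a single instruction instead of the MOVHLPS/SHUFPS dance. A small illustration of the VU_MERGE1 case, assuming an SSE4.1-capable host:

	#include <smmintrin.h>

	static void merge1_demo(void)
	{
	    __m128 dest = _mm_setr_ps(0.0f, 1.0f, 2.0f, 3.0f);
	    __m128 src  = _mm_setr_ps(4.0f, 5.0f, 6.0f, 7.0f);
	    // _MM_MK_INSERTPS_NDX(3, 3, 0) == 0xF0: take src lane 3 (w), write it
	    // to dest lane 3, zero nothing -- the "1000" merge in one instruction.
	    __m128 r = _mm_insert_ps(dest, src, _MM_MK_INSERTPS_NDX(3, 3, 0));
	    (void)r;  // r is now {0, 1, 2, 7}
	}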
@@ -221,6 +242,7 @@ void _unpackVF_xyzw(int dstreg, int srcreg, int xyzw)
}
}
else {
/*
switch (xyzw) {
case 0:
SSE3_MOVSLDUP_XMM_to_XMM(dstreg, srcreg);
@@ -239,35 +261,60 @@ void _unpackVF_xyzw(int dstreg, int srcreg, int xyzw)
SSE_MOVHLPS_XMM_to_XMM(dstreg, dstreg);
break;
}
*/
switch (xyzw) {
case 0:
SSE2_PSHUFD_XMM_to_XMM(dstreg, srcreg, 0x00);
break;
case 1:
SSE2_PSHUFD_XMM_to_XMM(dstreg, srcreg, 0x55);
break;
case 2:
SSE2_PSHUFD_XMM_to_XMM(dstreg, srcreg, 0xaa);
break;
case 3:
SSE2_PSHUFD_XMM_to_XMM(dstreg, srcreg, 0xff);
break;
}
}
}
void _unpackVFSS_xyzw(int dstreg, int srcreg, int xyzw)
{
	if( cpucaps.hasStreamingSIMD4Extensions ) {
		switch (xyzw) {
			case 0: SSE4_INSERTPS_XMM_to_XMM(dstreg, srcreg, _MM_MK_INSERTPS_NDX(0, 0, 0)); break;
			case 1: SSE4_INSERTPS_XMM_to_XMM(dstreg, srcreg, _MM_MK_INSERTPS_NDX(1, 0, 0)); break;
			case 2: SSE4_INSERTPS_XMM_to_XMM(dstreg, srcreg, _MM_MK_INSERTPS_NDX(2, 0, 0)); break;
			case 3: SSE4_INSERTPS_XMM_to_XMM(dstreg, srcreg, _MM_MK_INSERTPS_NDX(3, 0, 0)); break;
		}
	}
	else {
		switch (xyzw) {
			case 0:
				if( dstreg != srcreg ) SSE_MOVAPS_XMM_to_XMM(dstreg, srcreg);
				break;
			case 1:
				if( cpucaps.hasStreamingSIMD3Extensions ) SSE3_MOVSHDUP_XMM_to_XMM(dstreg, srcreg);
				else {
					if( dstreg != srcreg ) SSE_MOVAPS_XMM_to_XMM(dstreg, srcreg);
					SSE_SHUFPS_XMM_to_XMM(dstreg, dstreg, 0x55);
				}
				break;
			case 2:
				SSE_MOVHLPS_XMM_to_XMM(dstreg, srcreg);
				break;
			case 3:
				if( cpucaps.hasStreamingSIMD3Extensions && dstreg != srcreg ) {
					SSE3_MOVSHDUP_XMM_to_XMM(dstreg, srcreg);
					SSE_MOVHLPS_XMM_to_XMM(dstreg, dstreg);
				}
				else {
					if( dstreg != srcreg ) SSE_MOVAPS_XMM_to_XMM(dstreg, srcreg);
					SSE_SHUFPS_XMM_to_XMM(dstreg, dstreg, 0xff);
				}
				break;
		}
	}
}
@@ -927,6 +974,7 @@ int _vuGetTempXMMreg(int info)
if( _hasFreeXMMreg() ) {
t1reg = _allocTempXMMreg(XMMT_FPS, -1);
/*
if( t1reg == EEREC_TEMP && _hasFreeXMMreg() ) {
int t = _allocTempXMMreg(XMMT_FPS, -1);
_freeXMMreg(t1reg);
@@ -937,6 +985,18 @@ int _vuGetTempXMMreg(int info)
_freeXMMreg(t1reg);
t1reg = -1;
}
*/
if( t1reg == EEREC_TEMP ) {
if( _hasFreeXMMreg() ) {
int t = _allocTempXMMreg(XMMT_FPS, -1);
_freeXMMreg(t1reg);
t1reg = t;
}
else {
_freeXMMreg(t1reg);
t1reg = -1;
}
}
}
return t1reg;
@@ -3073,6 +3133,8 @@ void recVUMI_CLIP(VURegs *VU, int info)
void recVUMI_DIV(VURegs *VU, int info)
{
int t1reg;
if( _Fs_ == 0 ) {
if( _Ft_ == 0 ) {
@@ -3101,10 +3163,31 @@ void recVUMI_DIV(VURegs *VU, int info)
// don't use RCPSS (very bad precision)
SSE_MOVSS_M32_to_XMM(EEREC_TEMP, (uptr)&VU->VF[0].UL[3]);
if( _Ftf_ != 0 || (xmmregs[EEREC_T].mode & MODE_WRITE) )
{
	if( _Ftf_ )
	{
		t1reg = _vuGetTempXMMreg(info);

		if( t1reg >= 0 )
		{
			_unpackVFSS_xyzw(t1reg, EEREC_T, _Ftf_);
			SSE_DIVSS_XMM_to_XMM(EEREC_TEMP, t1reg);
			_freeXMMreg(t1reg);
		}
		else
		{
			SSE_SHUFPS_XMM_to_XMM(EEREC_T, EEREC_T, (0xe4e4>>(2*_Ftf_))&0xff);
			SSE_DIVSS_XMM_to_XMM(EEREC_TEMP, EEREC_T);
			SSE_SHUFPS_XMM_to_XMM(EEREC_T, EEREC_T, (0xe4e4>>(8-2*_Ftf_))&0xff); // revert
		}
	}
	else
	{
		SSE_DIVSS_XMM_to_XMM(EEREC_TEMP, EEREC_T);
	}
}
else {
SSE_DIVSS_M32_to_XMM(EEREC_TEMP, (uptr)&VU->VF[_Ft_].UL[_Ftf_]);
@@ -3136,14 +3219,33 @@ void recVUMI_DIV(VURegs *VU, int info)
return;
}
if( _Fsf_ == 0 ) SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
else _unpackVF_xyzw(EEREC_TEMP, EEREC_S, _Fsf_);
if( _Ftf_ )
{
	t1reg = _vuGetTempXMMreg(info);

	if( t1reg >= 0 )
	{
		_unpackVFSS_xyzw(t1reg, EEREC_T, _Ftf_);
		SSE_DIVSS_XMM_to_XMM(EEREC_TEMP, t1reg);
		_freeXMMreg(t1reg);
	}
	else
	{
		SSE_SHUFPS_XMM_to_XMM(EEREC_T, EEREC_T, (0xe4e4>>(2*_Ftf_))&0xff);
		SSE_DIVSS_XMM_to_XMM(EEREC_TEMP, EEREC_T);
		SSE_SHUFPS_XMM_to_XMM(EEREC_T, EEREC_T, (0xe4e4>>(8-2*_Ftf_))&0xff); // revert
	}
}
else
{
	SSE_DIVSS_XMM_to_XMM(EEREC_TEMP, EEREC_T);
}
}
//if( !CHECK_FORCEABS ) {
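A note on the shuffle-and-revert fallback kept above for when no temp register is free: 0xe4 (binary 11 10 01 00) is the identity SHUFPS selector, and shifting the doubled pattern 0xe4e4 right by 2*_Ftf_ bits rotates the lane selectors so that lane _Ftf_ lands in x:

	// _Ftf_ = 1: (0xe4e4 >> 2) & 0xff == 0x39 (00 11 10 01)
	//   -> x = old y, y = old z, z = old w, w = old x
	// revert: (0xe4e4 >> (8 - 2*1)) & 0xff == 0x93 (10 01 00 11), the inverse
	//   rotation, which is why EEREC_T ends up unchanged afterwards.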
@@ -3226,11 +3328,11 @@ void recVUMI_RSQRT(VURegs *VU, int info)
if( _Fsf_ == 3 ) {
if(_Ft_ != 0 ||_Ftf_ == 3 )
{
//SysPrintf("_Fs_ = 0.3 _Ft_ != 0 || _Ft_ = 0.3 \n");
SysPrintf("_Fs_ = 0.3 _Ft_ != 0 || _Ft_ = 0.3 \n");
SSE_SQRTSS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP); //Don't use RSQRT, terrible accuracy
SSE_MOVSS_XMM_to_M32(VU_VI_ADDR(REG_Q, 0), EEREC_TEMP);
SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
_unpackVF_xyzw(EEREC_TEMP, EEREC_TEMP, _Fsf_);
//SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
_unpackVF_xyzw(EEREC_TEMP, EEREC_S, _Fsf_);
SSE_DIVSS_M32_to_XMM(EEREC_TEMP, VU_VI_ADDR(REG_Q, 0));
@@ -3283,7 +3385,7 @@ void recVUMI_RSQRT(VURegs *VU, int info)
}
}
//SysPrintf("Normal RSQRT\n");
SysPrintf("Normal RSQRT\n");
SSE_RSQRTSS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP);
if( _Fsf_ ) SSE_SHUFPS_XMM_to_XMM(EEREC_S, EEREC_S, (0xe4e4>>(2*_Fsf_))&0xff);
SSE_MULSS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
@@ -4681,23 +4783,32 @@ void recVUMI_WAITP(VURegs *VU, int info)
// in all EFU insts, EEREC_D is a temp reg
void vuSqSumXYZ(int regd, int regs, int regtemp)
{
	if( cpucaps.hasStreamingSIMD4Extensions )
	{
		SSE_MOVAPS_XMM_to_XMM(regd, regs);
		SSE4_DPPS_XMM_to_XMM(regd, regd, 0x71);
	}
	else
	{
		SSE_MOVAPS_XMM_to_XMM(regtemp, regs);
		SSE_MULPS_XMM_to_XMM(regtemp, regtemp);

		if( cpucaps.hasStreamingSIMD3Extensions ) {
			SSE3_HADDPS_XMM_to_XMM(regd, regtemp);
			SSE_ADDPS_XMM_to_XMM(regd, regtemp); // regd.z = x+y+z
			SSE_MOVHLPS_XMM_to_XMM(regd, regd); // move to x
		}
		else {
			SSE_MOVSS_XMM_to_XMM(regd, regtemp);
			SSE_SHUFPS_XMM_to_XMM(regtemp, regtemp, 0xE1);
			SSE_ADDSS_XMM_to_XMM(regd, regtemp);
			SSE_SHUFPS_XMM_to_XMM(regtemp, regtemp, 0xD2);
			SSE_ADDSS_XMM_to_XMM(regd, regtemp);
			SSE_SHUFPS_XMM_to_XMM(regtemp, regtemp, 0xC6);
		}
	}
	//SysPrintf("SUMXYZ\n");
}
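DPPS folds the whole multiply/horizontal-add sequence into one instruction: the high nibble of the immediate selects which lanes enter the dot product, the low nibble selects which result lanes receive the sum. A sketch of what imm8 = 0x71 computes, using the matching intrinsic on an SSE4.1 host:

	#include <smmintrin.h>

	static float sq_sum_xyz(__m128 v)
	{
	    // 0x71 = 0111 0001: multiply lanes x, y, z (not w); write x*x+y*y+z*z
	    // to lane x and zero the others -- the same result the fallback path
	    // builds with MULPS plus HADDPS or SHUFPS/ADDSS.
	    return _mm_cvtss_f32(_mm_dp_ps(v, v, 0x71));
	}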
void recVUMI_ESADD( VURegs *VU, int info)
@@ -4717,24 +4828,34 @@ void recVUMI_ESADD( VURegs *VU, int info)
void recVUMI_ERSADD( VURegs *VU, int info )
{
assert( VU == &VU1 );
	// almost same as vuSqSumXYZ
	if( cpucaps.hasStreamingSIMD4Extensions )
	{
		SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
		SSE4_DPPS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP, 0x71);
	}
	else
	{
		SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
		SSE_MULPS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP);

		if( cpucaps.hasStreamingSIMD3Extensions ) {
			SSE3_HADDPS_XMM_to_XMM(EEREC_D, EEREC_TEMP);
			SSE_ADDPS_XMM_to_XMM(EEREC_D, EEREC_TEMP); // EEREC_D.z = x+y+z
			SSE_MOVHLPS_XMM_to_XMM(EEREC_D, EEREC_D); // move to x
		}
		else {
			SSE_MOVHLPS_XMM_to_XMM(EEREC_D, EEREC_TEMP);
			SSE_ADDSS_XMM_to_XMM(EEREC_D, EEREC_TEMP);
			SSE_SHUFPS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP, 0x55);
			SSE_ADDSS_XMM_to_XMM(EEREC_D, EEREC_TEMP);
		}
	}
	SSE_MOVSS_M32_to_XMM(EEREC_TEMP, (uptr)&VU->VF[0].UL[3]);
// don't use RCPSS (very bad precision)
SSE_DIVSS_XMM_to_XMM(EEREC_TEMP, EEREC_D);
CheckForOverflowSS_(EEREC_TEMP, EEREC_D);
@@ -4756,9 +4877,9 @@ void recVUMI_ELENG( VURegs *VU, int info )
void recVUMI_ERLENG( VURegs *VU, int info )
{
assert( VU == &VU1 );
vuSqSumXYZ(EEREC_D, EEREC_S, EEREC_TEMP);
//SysPrintf("ERLENG\n");
SSE_SQRTSS_XMM_to_XMM(EEREC_TEMP, EEREC_D);
vuSqSumXYZ(EEREC_TEMP, EEREC_S, EEREC_TEMP); //Don't want to use EEREC_D in case it overwrites something
SysPrintf("ERLENG\n");
SSE_SQRTSS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP);
SSE_MOVSS_XMM_to_M32(VU_VI_ADDR(REG_P, 0), EEREC_TEMP);
SSE_MOVSS_M32_to_XMM(EEREC_TEMP, (uptr)&VU->VF[0].UL[3]);
SSE_DIVSS_M32_to_XMM(EEREC_TEMP, VU_VI_ADDR(REG_P, 0));
@@ -4946,7 +5067,9 @@ void recVUMI_ERSQRT( VURegs *VU, int info )
//SSE_CMPNESS_XMM_to_XMM(EEREC_D, EEREC_TEMP);
SysPrintf("ERSQRT\n");
SSE_SQRTSS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP);
SSE_DIVSS_M32_to_XMM(EEREC_TEMP, (uptr)&VU->VF[0].UL[3]);
SSE_MOVSS_XMM_to_M32(VU_VI_ADDR(REG_P, 0), EEREC_TEMP);
SSE_MOVSS_M32_to_XMM(EEREC_TEMP, (uptr)&VU->VF[0].UL[3]);
SSE_DIVSS_M32_to_XMM(EEREC_TEMP, VU_VI_ADDR(REG_P, 0));
//SSE_ANDPS_XMM_to_XMM(EEREC_TEMP, EEREC_D);
}
else {

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -234,6 +234,7 @@ void cpudetectInit( void )
cpuinfo.x86PType = (regs[ 0 ] >> 12) & 0x3;
x86_64_8BITBRANDID = regs[1] & 0xff;
cpuinfo.x86Flags = regs[ 3 ];
cpuinfo.x86Flags2 = regs[ 2 ];
}
}
if ( iCpuId( 0x80000000, regs ) != -1 )
@@ -302,6 +303,7 @@ void cpudetectInit( void )
cpucaps.hasFastStreamingSIMDExtensionsSaveRestore = ( cpuinfo.x86Flags >> 24 ) & 1;
cpucaps.hasStreamingSIMDExtensions = ( cpuinfo.x86Flags >> 25 ) & 1; //sse
cpucaps.hasStreamingSIMD2Extensions = ( cpuinfo.x86Flags >> 26 ) & 1; //sse2
cpucaps.hasStreamingSIMD4Extensions = ( cpuinfo.x86Flags2 >> 19 ) & 1; //sse4.1
cpucaps.hasSelfSnoop = ( cpuinfo.x86Flags >> 27 ) & 1;
cpucaps.hasHyperThreading = ( cpuinfo.x86Flags >> 28 ) & 1;
cpucaps.hasThermalMonitor = ( cpuinfo.x86Flags >> 29 ) & 1;
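The new x86Flags2 field stores ECX from CPUID leaf 1, and SSE4.1 is bit 19 of that register, which is exactly what the hasStreamingSIMD4Extensions line reads. The equivalent standalone check with the MSVC intrinsic (a sketch, not the project's iCpuId wrapper):

	#include <intrin.h>

	static int has_sse41(void)
	{
	    int regs[4];                 // EAX, EBX, ECX, EDX
	    __cpuid(regs, 1);            // leaf 1: processor info and feature bits
	    return (regs[2] >> 19) & 1;  // ECX bit 19 = SSE4.1
	}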


@@ -437,6 +437,7 @@ void SSE_ORPS_XMM_to_XMM( x86SSERegType to, x86SSERegType from ) { SSERtoR( 0x5
//**********************************************************************************/
//XORPS : Bitwise Logical XOR of Single-Precision FP Values *
//**********************************************************************************
void SSE_XORPS_M128_to_XMM( x86SSERegType to, uptr from ) { SSEMtoR( 0x570f, 0 ); }
void SSE_XORPS_XMM_to_XMM( x86SSERegType to, x86SSERegType from ) { SSERtoR( 0x570f ); }
@@ -1149,6 +1150,45 @@ void SSE3_MOVSLDUP_M128_to_XMM(x86SSERegType to, uptr from) { write8(0xf3); SSEM
void SSE3_MOVSHDUP_XMM_to_XMM(x86SSERegType to, x86SSERegType from) { write8(0xf3); SSERtoR(0x160f); }
void SSE3_MOVSHDUP_M128_to_XMM(x86SSERegType to, uptr from) { write8(0xf3); SSEMtoR(0x160f, 0); }
// SSE4.1
void SSE4_DPPS_XMM_to_XMM(x86SSERegType to, x86SSERegType from, u8 imm8)
{
write8(0x66);
write24(0x403A0F);
ModRM(3, to, from);
write8(imm8);
}
void SSE4_DPPS_M128_to_XMM(x86SSERegType to, uptr from, u8 imm8)
{
const int overb = 0; // TODO: x64?
write8(0x66);
write24(0x403A0F);
ModRM(0, to, DISP32);
write32(MEMADDR(from, 4 + overb));
write8(imm8);
}
void SSE4_INSERTPS_XMM_to_XMM(x86SSERegType to, x86SSERegType from, u8 imm8)
{
write8(0x66);
RexRB(0, to, from);
write24(0x213A0F);
ModRM(3, to, from);
write8(imm8);
}
void SSE4_EXTRACTPS_XMM_to_R32(x86IntRegType to, x86SSERegType from, u8 imm8)
{
write8(0x66);
RexRB(0, to, from);
write24(0x173A0F);
ModRM(3, to, from);
write8(imm8);
}
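These helpers hand-assemble the SSE4.1 instructions because the toolchains of the time don't know them. Like write16/write32 elsewhere in this emitter (compare SSEMtoR(0x570f, ...) producing 0F 57), write24 presumably emits the low byte first, so write24(0x403A0F) produces 0F 3A 40: the 0F 3A escape plus the DPPS opcode 0x40 (INSERTPS is 0x21, EXTRACTPS is 0x17), each behind the mandatory 66 prefix. Assuming ModRM(mod, reg, rm) packs (mod << 6) | (reg << 3) | rm as usual:

	// SSE4_DPPS_XMM_to_XMM(XMM1, XMM2, 0x71) should emit:
	//   66 0F 3A 40 CA 71        ; dpps xmm1, xmm2, 0x71
	//   (ModRM 0xCA = mod 11, reg 001 = xmm1, rm 010 = xmm2)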
// SSE-X
void SSEX_MOVDQA_M128_to_XMM( x86SSERegType to, uptr from )
{