ReorderingMTGS:

* Implemented GIFPath_CopyTag, which performs a "copy-in-place" while parsing tags (a big speedup over the old parse-then-copy strategy, especially with the SSE intrinsics I've included for kicks).
* Removed the old ringbuffer 'restart' mechanism and replaced it with a truly free-flowing wrapping mechanism (sketched below). This uses the ringbuffer more efficiently and removes quite a bit of overhead from the MTGS's PrepDataPacket call.
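
As a rough illustration of the free-flowing wrap (not the commit's exact code; see the MemCopy_WrappedDest helper in the diff below), any copy whose end runs past the ring's end is simply split in two, with plain memcpy standing in for the project's memcpy_aligned:

#include <string.h>

typedef unsigned int uint;

// Sketch: copy 'len' elements into ring[0..ringSize), wrapping at the end.
// 'writepos' advances modulo ringSize; T stands in for the u128 qwc type.
template< typename T >
void WrappedWrite( T* ring, uint& writepos, uint ringSize, const T* src, uint len )
{
	uint endpos = writepos + len;
	if( endpos >= ringSize )
	{
		uint first = ringSize - writepos;               // part that fits before the end
		memcpy( ring + writepos, src, first * sizeof(T) );
		writepos = endpos - ringSize;                   // wrapped remainder
		memcpy( ring, src + first, writepos * sizeof(T) );
	}
	else
	{
		memcpy( ring + writepos, src, len * sizeof(T) );
		writepos += len;
	}
}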

git-svn-id: http://pcsx2.googlecode.com/svn/branches/ReorderingMTGS@3458 96395faa-99c1-11dd-bbfe-3dabce05a288
Jake.Stine 2010-07-11 04:53:50 +00:00
parent 65f50f009f
commit a9084741bc
12 changed files with 304 additions and 301 deletions

View File

@ -564,6 +564,7 @@ typedef void (CALLBACK* _PS2EsetEmuVersion)(const char* emuId, u32 version); //
typedef s32 (CALLBACK* _GSopen)(void *pDsp, char *Title, int multithread);
typedef s32 (CALLBACK* _GSopen2)( void *pDsp, u32 flags );
typedef void (CALLBACK* _GSvsync)(int field);
typedef void (CALLBACK* _GSgifTransfer)(u32 *pMem, u32 size);
typedef void (CALLBACK* _GSgifTransfer1)(u32 *pMem, u32 addr);
typedef void (CALLBACK* _GSgifTransfer2)(u32 *pMem, u32 size);
typedef void (CALLBACK* _GSgifTransfer3)(u32 *pMem, u32 size);
@ -723,6 +724,7 @@ typedef void (CALLBACK* _FWirqCallback)(void (*callback)());
extern _GSopen GSopen;
extern _GSopen2 GSopen2;
extern _GSvsync GSvsync;
extern _GSgifTransfer GSgifTransfer;
extern _GSgifTransfer1 GSgifTransfer1;
extern _GSgifTransfer2 GSgifTransfer2;
extern _GSgifTransfer3 GSgifTransfer3;

View File

@ -36,7 +36,7 @@
// Only used in the Windows version of memzero.h. But it's in Misc.cpp for some reason.
void _memset16_unaligned( void* dest, u16 data, size_t size );
#define memcpy_fast memcpy_amd_ // Fast memcpy
#define memcpy_aligned memcpy_amd_ // Memcpy with 16-byte Aligned addresses
#define memcpy_const memcpy_amd_ // Memcpy with constant size
#define memcpy_constA memcpy_amd_ // Memcpy with constant size and 16-byte aligned
#define memcpy_fast memcpy_amd_ // Fast memcpy
#define memcpy_aligned(d,s,c) memcpy_amd_(d,s,c*16) // Memcpy with 16-byte Aligned addresses
#define memcpy_const memcpy_amd_ // Memcpy with constant size
#define memcpy_constA memcpy_amd_ // Memcpy with constant size and 16-byte aligned

View File

@ -195,10 +195,9 @@ void __fastcall WriteFIFO_page_6(u32 mem, const mem128_t *value)
nloop0_packet[1] = psHu32(GIF_FIFO + 4);
nloop0_packet[2] = psHu32(GIF_FIFO + 8);
nloop0_packet[3] = psHu32(GIF_FIFO + 12);
GetMTGS().PrepDataPacket(GIF_PATH_3, (u8*)nloop0_packet, 1);
GetMTGS().PrepDataPacket(GIF_PATH_3, 1);
u64* data = (u64*)GetMTGS().GetDataPacketPtr();
data[0] = value[0];
data[1] = value[1];
GIFPath_CopyTag( GIF_PATH_3, (u128*)nloop0_packet, 1 );
GetMTGS().SendDataPacket();
if(GSTransferStatus.PTH3 == STOPPED_MODE && gifRegs->stat.APATH == GIF_APATH3 )
{

View File

@ -229,7 +229,7 @@ enum GIF_PATH
GIF_PATH_3,
};
extern int GIFPath_ParseTag(GIF_PATH pathidx, const u8* pMem, u32 size);
extern int GIFPath_CopyTag(GIF_PATH pathidx, const u128* pMem, u32 size);
extern int GIFPath_ParseTagQuick(GIF_PATH pathidx, const u8* pMem, u32 size);
extern void GIFPath_Reset();
extern void GIFPath_Clear( GIF_PATH pathidx );
@ -282,7 +282,6 @@ public:
volatile s32 m_SignalRingPosition;
int m_QueuedFrameCount;
u32 m_RingWrapSpot;
Mutex m_lock_RingBufferBusy;
Semaphore m_sem_OnRingReset;
@ -301,6 +300,7 @@ public:
// These vars maintain instance data for sending Data Packets.
// Only one data packet can be constructed and uploaded at a time.
uint m_packet_startpos; // ringbuffer index of the packet's command tag (the start of the packet)
uint m_packet_size; // size of the packet (data only, ie. not including the 16 byte command!)
uint m_packet_ringpos; // index of the data location in the ringbuffer.
@ -317,14 +317,13 @@ public:
void WaitGS();
void ResetGS();
int PrepDataPacket( MTGS_RingCommand cmd, u32 size );
int PrepDataPacket( GIF_PATH pathidx, const u8* srcdata, u32 size );
void PrepDataPacket( MTGS_RingCommand cmd, u32 size );
void PrepDataPacket( GIF_PATH pathidx, u32 size );
void SendDataPacket();
void SendGameCRC( u32 crc );
void WaitForOpen();
void Freeze( int mode, MTGS_FreezeData& data );
void RestartRingbuffer( uint packsize=0 );
void SendSimplePacket( MTGS_RingCommand type, int data0, int data1, int data2 );
void SendPointerPacket( MTGS_RingCommand type, u32 data0, void* data1 );
@ -416,3 +415,31 @@ extern int g_nLeftGSFrames;
#endif
// Size of the ringbuffer as a power of 2 -- size is a multiple of simd128s.
// (actual size is 1<<m_RingBufferSizeFactor simd vectors [128-bit values])
// A value of 19 is an 8meg ring buffer. 18 would be 4 megs, and 20 would be 16 megs.
// Default was 2mb, but some games with lots of MTGS activity want 8mb to run fast (rama)
static const uint RingBufferSizeFactor = 19;
// size of the ringbuffer in simd128's.
static const uint RingBufferSize = 1<<RingBufferSizeFactor;
// Mask to apply to ring buffer indices to wrap the pointer from end to
// start (the wrapping is what makes it a ringbuffer, yo!)
static const uint RingBufferMask = RingBufferSize - 1;
struct MTGS_BufferedData
{
u128 m_Ring[RingBufferSize];
u8 Regs[Ps2MemSize::GSregs];
MTGS_BufferedData() {}
u128& operator[]( uint idx )
{
pxAssert( idx < RingBufferSize );
return m_Ring[idx];
}
};
extern __aligned(32) MTGS_BufferedData RingBuffer;
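
Since the ring size is a power of two, wrapping an index costs a single AND against RingBufferMask. A hypothetical helper, just to illustrate the idiom used throughout this commit:

// Hypothetical helper: advance a ring index by 'qwc' slots, wrapping via the mask.
inline uint RingAdvance( uint pos, uint qwc )
{
	return (pos + qwc) & RingBufferMask;
}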

View File

@ -59,16 +59,15 @@ void gsPath1Interrupt()
gifRegs->stat.P1Q = false;
while(Path1WritePos > 0)
{
u32 size = GetMTGS().PrepDataPacket(GIF_PATH_1, Path1Buffer + (Path1ReadPos * 16), (Path1WritePos - Path1ReadPos));
u8* pDest = GetMTGS().GetDataPacketPtr();
uint size = (Path1WritePos - Path1ReadPos);
GetMTGS().PrepDataPacket(GIF_PATH_1, size);
//DevCon.Warning("Flush Size = %x", size);
memcpy_aligned(pDest, Path1Buffer + (Path1ReadPos * 16), size * 16);
GetMTGS().SendDataPacket();
Path1ReadPos += size;
uint count = GIFPath_CopyTag(GIF_PATH_1, ((u128*)Path1Buffer) + Path1ReadPos, size);
GetMTGS().SendDataPacket();
pxAssume( count == size );
Path1ReadPos += count;
if(GSTransferStatus.PTH1 == STOPPED_MODE)
{
gifRegs->stat.OPH = false;
@ -150,11 +149,9 @@ __forceinline void gsInterrupt()
static u32 WRITERING_DMA(u32 *pMem, u32 qwc)
{
int size = GetMTGS().PrepDataPacket(GIF_PATH_3, (u8*)pMem, qwc);
u8* pgsmem = GetMTGS().GetDataPacketPtr();
memcpy_aligned(pgsmem, pMem, size<<4);
GetMTGS().PrepDataPacket(GIF_PATH_3, qwc);
//uint len1 = GIFPath_ParseTag(GIF_PATH_3, (u8*)pMem, qwc );
uint size = GIFPath_CopyTag(GIF_PATH_3, (u128*)pMem, qwc );
GetMTGS().SendDataPacket();
return size;
}
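
All transfer sites in this commit now follow the same three-step shape as WRITERING_DMA above: reserve ring space, parse-and-copy in one pass, publish. A minimal usage sketch built from the commit's own calls:

// Usage sketch, mirroring WRITERING_DMA: reserve / copy-parse / publish.
static uint SendToGS( const u128* src, uint qwc )
{
	GetMTGS().PrepDataPacket( GIF_PATH_3, qwc );            // reserve qwc slots (plus the command slot)
	uint copied = GIFPath_CopyTag( GIF_PATH_3, src, qwc );  // parse GIFtags while copying into the ring
	GetMTGS().SendDataPacket();                             // publish the packet to the MTGS thread
	return copied;
}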

View File

@ -29,7 +29,7 @@
using namespace Threading;
#if 0 // PCSX2_DEBUG
#if 0 //PCSX2_DEBUG
# define MTGS_LOG Console.WriteLn
#else
# define MTGS_LOG 0&&
@ -46,34 +46,7 @@ using namespace Threading;
// MTGS Threaded Class Implementation
// =====================================================================================================
// Size of the ringbuffer as a power of 2 -- size is a multiple of simd128s.
// (actual size is 1<<m_RingBufferSizeFactor simd vectors [128-bit values])
// A value of 19 is an 8meg ring buffer. 18 would be 4 megs, and 20 would be 16 megs.
// Default was 2mb, but some games with lots of MTGS activity want 8mb to run fast (rama)
static const uint RingBufferSizeFactor = 19;
// size of the ringbuffer in simd128's.
static const uint RingBufferSize = 1<<RingBufferSizeFactor;
// Mask to apply to ring buffer indices to wrap the pointer from end to
// start (the wrapping is what makes it a ringbuffer, yo!)
static const uint RingBufferMask = RingBufferSize - 1;
struct MTGS_BufferedData
{
u128 m_Ring[RingBufferSize];
u8 Regs[Ps2MemSize::GSregs];
MTGS_BufferedData() {}
u128& operator[]( uint idx )
{
pxAssert( idx < RingBufferSize );
return m_Ring[idx];
}
};
static __aligned(32) MTGS_BufferedData RingBuffer;
__aligned(32) MTGS_BufferedData RingBuffer;
extern bool renderswitch;
@ -106,7 +79,6 @@ void SysMtgsThread::OnStart()
m_QueuedFrameCount = 0;
m_SignalRingEnable = 0;
m_SignalRingPosition= 0;
m_RingWrapSpot = 0;
m_CopyDataTally = 0;
@ -125,12 +97,15 @@ void SysMtgsThread::OnResumeReady()
void SysMtgsThread::ResetGS()
{
pxAssertDev( !IsOpen() || (m_RingPos == m_WritePos), "Must close or terminate the GS thread prior to gsReset." );
// MTGS Reset process:
// * clear the ringbuffer.
// * Signal a reset.
// * clear the path and byRegs structs (used by GIFtagDummy)
m_RingPos = m_WritePos;
m_QueuedFrameCount = 0;
MTGS_LOG( "MTGS: Sending Reset..." );
SendSimplePacket( GS_RINGTYPE_RESET, 0, 0, 0 );
@ -155,7 +130,8 @@ void SysMtgsThread::PostVsyncEnd()
// 256-byte copy is only a few dozen cycles -- executed 60 times a second -- so probably
// not worth the effort or overhead of trying to selectively avoid it.
PrepDataPacket(GS_RINGTYPE_VSYNC, sizeof(RingCmdPacket_Vsync));
uint packsize = sizeof(RingCmdPacket_Vsync) / 16;
PrepDataPacket(GS_RINGTYPE_VSYNC, packsize);
RingCmdPacket_Vsync& local( *(RingCmdPacket_Vsync*)GetDataPacketPtr() );
memcpy_fast( local.regset1, PS2MEM_GS, sizeof(local.regset1) );
@ -163,6 +139,7 @@ void SysMtgsThread::PostVsyncEnd()
local.imr = GSIMR;
local.siglblid = GSSIGLBLID;
m_packet_ringpos += packsize;
SendDataPacket();
// Alter-frame flushing! Restarts the ringbuffer (wraps) on every other frame. This is a
@ -172,13 +149,29 @@ void SysMtgsThread::PostVsyncEnd()
// and they also allow us to reuse the front of the ringbuffer more often, which should improve
// L2 cache performance.
if( m_QueuedFrameCount > 0 )
RestartRingbuffer();
if( AtomicIncrement(m_QueuedFrameCount) == 0 ) return;
uint readpos = volatize(m_RingPos);
uint freeroom;
if (m_WritePos < readpos)
freeroom = readpos - m_WritePos;
else
{
m_QueuedFrameCount++;
SetEvent();
}
freeroom = RingBufferSize - (m_WritePos - readpos);
uint totalAccum = RingBufferSize - freeroom;
uint somedone = totalAccum / 4;
m_SignalRingPosition = totalAccum;
//Console.WriteLn( Color_Blue, "(MTGS Sync) EEcore Vsync Sleep!\t\twrapspot=0x%06x, ringpos=0x%06x, writepos=0x%06x, signalpos=0x%06x", m_RingWrapSpot, readpos, writepos, m_SignalRingPosition );
AtomicExchange( m_SignalRingEnable, 1 );
SetEvent();
m_sem_OnRingReset.WaitWithoutYield();
readpos = volatize(m_RingPos);
pxAssertDev( m_SignalRingPosition <= 0, "MTGS Thread Synchronization Error" );
}
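
The free-room test above is the standard reader/writer distance in a ring; the same computation recurs in PrepDataPacket and _PrepForSimplePacket further down. Factored out as a sketch:

// Sketch: free slots between the EE writer and the GS reader in a ring of 'size' slots.
static uint RingFreeRoom( uint writepos, uint readpos, uint size )
{
	return (writepos < readpos) ? (readpos - writepos)
	                            : (size - (writepos - readpos));
}
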
struct PacketTagType
@ -197,7 +190,7 @@ void SysMtgsThread::OpenPlugin()
{
if( m_PluginOpened ) return;
memcpy_aligned( RingBuffer.Regs, PS2MEM_GS, sizeof(PS2MEM_GS) );
memcpy_aligned( RingBuffer.Regs, PS2MEM_GS, sizeof(PS2MEM_GS)/16 );
GSsetBaseMem( RingBuffer.Regs );
GSirqCallback( dummyIrqCallback );
@ -330,38 +323,75 @@ void SysMtgsThread::ExecuteTaskInThread()
{
case GS_RINGTYPE_P1:
{
uint datapos = (m_RingPos+1) & RingBufferMask;
const int qsize = tag.data[0];
const u128* data = &RingBuffer[m_RingPos+1];
const u128* data = &RingBuffer[datapos];
MTGS_LOG( "(MTGS Packet Read) ringtype=P1, qwc=%u", qsize );
// make sure that tag>>16 is the MAX size readable
GSgifTransfer1((u32*)(data - 0x400 + qsize), 0x4000-qsize*16);
//GSgifTransfer1((u32*)data, qsize);
uint endpos = datapos + qsize;
if( endpos >= RingBufferSize )
{
uint firstcopylen = RingBufferSize - datapos;
GSgifTransfer( (u32*)data, firstcopylen );
datapos = endpos & RingBufferMask;
GSgifTransfer( (u32*)RingBuffer.m_Ring, datapos );
}
else
{
GSgifTransfer( (u32*)data, qsize );
}
ringposinc += qsize;
}
break;
case GS_RINGTYPE_P2:
{
uint datapos = (m_RingPos+1) & RingBufferMask;
const int qsize = tag.data[0];
const u128* data = &RingBuffer[m_RingPos+1];
const u128* data = &RingBuffer[datapos];
MTGS_LOG( "(MTGS Packet Read) ringtype=P2, qwc=%u", qsize );
GSgifTransfer2((u32*)data, qsize);
uint endpos = datapos + qsize;
if( endpos >= RingBufferSize )
{
uint firstcopylen = RingBufferSize - datapos;
GSgifTransfer2( (u32*)data, firstcopylen );
datapos = endpos & RingBufferMask;
GSgifTransfer2( (u32*)RingBuffer.m_Ring, datapos );
}
else
{
GSgifTransfer2( (u32*)data, qsize );
}
ringposinc += qsize;
}
break;
case GS_RINGTYPE_P3:
{
uint datapos = (m_RingPos+1) & RingBufferMask;
const int qsize = tag.data[0];
const u128* data = &RingBuffer[m_RingPos+1];
const u128* data = &RingBuffer[datapos];
MTGS_LOG( "(MTGS Packet Read) ringtype=P3, qwc=%u", qsize );
GSgifTransfer3((u32*)data, qsize);
uint endpos = datapos + qsize;
if( endpos >= RingBufferSize )
{
uint firstcopylen = RingBufferSize - datapos;
GSgifTransfer3( (u32*)data, firstcopylen );
datapos = endpos & RingBufferMask;
GSgifTransfer3( (u32*)RingBuffer.m_Ring, datapos );
}
else
{
GSgifTransfer3( (u32*)data, qsize );
}
ringposinc += qsize;
}
break;
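
The three PATH cases share the same wrapped-transfer shape; a condensed sketch of the pattern (a hypothetical refactor for illustration only, with the per-path plugin callback passed as a template parameter):

// Sketch (hypothetical): feed a ring-resident packet of 'qsize' qwcs to the GS
// plugin, splitting the call in two when it wraps past the ring's end.
template< typename GifFunc >
static void WrappedGifTransfer( uint datapos, uint qsize, GifFunc GSgifTransferN )
{
	uint endpos = datapos + qsize;
	if( endpos >= RingBufferSize )
	{
		uint firstcopylen = RingBufferSize - datapos;   // portion before the wrap
		GSgifTransferN( (u32*)&RingBuffer[datapos], firstcopylen );
		GSgifTransferN( (u32*)RingBuffer.m_Ring, endpos & RingBufferMask );
	}
	else
		GSgifTransferN( (u32*)&RingBuffer[datapos], qsize );
}
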
@ -380,7 +410,7 @@ void SysMtgsThread::ExecuteTaskInThread()
const int qsize = tag.data[0];
ringposinc += qsize;
MTGS_LOG( "(MTGS Packet Read) ringtype=Vsync, field=%u, skip=%s", tag.data[0], tag.data[1] ? "true" : "false" );
MTGS_LOG( "(MTGS Packet Read) ringtype=Vsync, field=%u, skip=%s", !!(((u32&)RingBuffer.Regs[0x1000]) & 0x2000) ? 0 : 1, tag.data[1] ? "true" : "false" );
// Mail in the important GS registers.
RingCmdPacket_Vsync& local((RingCmdPacket_Vsync&)RingBuffer[m_RingPos+1]);
@ -398,6 +428,7 @@ void SysMtgsThread::ExecuteTaskInThread()
if( (GSopen2 == NULL) && (PADupdate != NULL) )
PADupdate(0);
AtomicDecrement( m_QueuedFrameCount );
StateCheckInThread();
}
break;
@ -450,9 +481,14 @@ void SysMtgsThread::ExecuteTaskInThread()
}
}
uint newringpos = m_RingPos + ringposinc;
pxAssert( newringpos <= RingBufferSize );
m_RingPos = newringpos & RingBufferMask;
uint newringpos = (m_RingPos + ringposinc) & RingBufferMask;
if( EmuConfig.GS.SynchronousMTGS )
{
pxAssert( m_WritePos == newringpos );
}
m_RingPos = newringpos;
if( m_SignalRingEnable != 0 )
{
@ -546,7 +582,7 @@ void SysMtgsThread::SetEvent()
u8* SysMtgsThread::GetDataPacketPtr() const
{
return (u8*)&RingBuffer[m_packet_ringpos];
return (u8*)&RingBuffer[m_packet_ringpos & RingBufferMask];
}
// Closes the data packet send command, and initiates the gs thread (if needed).
@ -555,6 +591,7 @@ void SysMtgsThread::SendDataPacket()
// make sure a previous copy block has been started somewhere.
pxAssert( m_packet_size != 0 );
#if 0
uint temp = m_packet_ringpos + m_packet_size;
pxAssert( temp <= RingBufferSize );
temp &= RingBufferMask;
@ -578,8 +615,16 @@ void SysMtgsThread::SendDataPacket()
pxAssert( readpos != temp );
}
}
#endif
m_WritePos = temp;
uint actualSize = ((m_packet_ringpos - m_packet_startpos) & RingBufferMask)-1;
pxAssert( actualSize <= m_packet_size );
pxAssert( m_packet_ringpos < RingBufferSize );
PacketTagType& tag = (PacketTagType&)RingBuffer[m_packet_startpos];
tag.data[0] = actualSize;
m_WritePos = m_packet_ringpos;
if( EmuConfig.GS.SynchronousMTGS )
{
@ -596,7 +641,7 @@ void SysMtgsThread::SendDataPacket()
//m_PacketLocker.Release();
}
int SysMtgsThread::PrepDataPacket( MTGS_RingCommand cmd, u32 size )
void SysMtgsThread::PrepDataPacket( MTGS_RingCommand cmd, u32 size )
{
// Note on volatiles: m_WritePos is not modified by the GS thread, so there's no need
// to use volatile reads here. We do cache it though, since we know it never changes,
@ -613,119 +658,63 @@ int SysMtgsThread::PrepDataPacket( MTGS_RingCommand cmd, u32 size )
m_packet_size = size;
++size; // takes into account our RingCommand QWC.
if( writepos + size < RingBufferSize )
// generic gs wait/stall.
// if the writepos is past the readpos then we're safe.
// But if not then we need to make sure the readpos is outside the scope of
// the block about to be written (writepos + size)
uint readpos = volatize(m_RingPos);
uint endpos = writepos+size;
uint freeroom;
if (writepos < readpos)
freeroom = readpos - writepos;
else
freeroom = RingBufferSize - (writepos - readpos);
if (freeroom < size)
{
// generic gs wait/stall.
// if the writepos is past the readpos then we're safe.
// But if not then we need to make sure the readpos is outside the scope of
// the block about to be written (writepos + size)
// writepos will overlap readpos if we commit the data, so we need to wait until
// readpos is out past the end of the future write pos, or until it wraps around
// (in which case writepos will be >= readpos).
uint readpos = volatize(m_RingPos);
if( (writepos < readpos) && (writepos+size >= readpos) )
// Ideally though we want to wait longer, because if we just toss in this packet
// the next packet will likely stall up too. So let's set a condition for the MTGS
// thread to wake up the EE once there's a sizable chunk of the ringbuffer emptied.
uint somedone = (RingBufferSize - freeroom) / 4;
if( somedone < size+1 ) somedone = size + 1;
// FMV Optimization: FMVs typically send *very* little data to the GS, in some cases
// every other frame is nothing more than a page swap. Sleeping the EEcore is a
// waste of time, and we get better results using a spinwait.
if( somedone > 0x80 )
{
// writepos is behind the readpos and will overlap it if we commit the data,
// so we need to wait until readpos is out past the end of the future write pos,
// or until it wraps around (in which case writepos will be >= readpos).
pxAssertDev( m_SignalRingEnable == 0, "MTGS Thread Synchronization Error" );
m_SignalRingPosition = somedone;
// Ideally though we want to wait longer, because if we just toss in this packet
// the next packet will likely stall up too. So let's set a condition for the MTGS
// thread to wake up the EE once there's a sizable chunk of the ringbuffer emptied.
//Console.WriteLn( Color_Blue, "(EEcore Sleep) GenStall \tringpos=0x%06x, writepos=0x%06x, wrapspot=0x%06x, signalpos=0x%06x", readpos, writepos, m_RingWrapSpot, m_SignalRingPosition );
uint totalAccum = (m_RingWrapSpot - readpos) + writepos;
uint somedone = totalAccum / 4;
if( somedone < size+1 ) somedone = size + 1;
// FMV Optimization: FMVs typically send *very* little data to the GS, in some cases
// every other frame is nothing more than a page swap. Sleeping the EEcore is a
// waste of time, and we get better results using a spinwait.
if( somedone > 0x80 )
{
pxAssertDev( m_SignalRingEnable == 0, "MTGS Thread Synchronization Error" );
m_SignalRingPosition = somedone;
//Console.WriteLn( Color_Blue, "(EEcore Sleep) GenStall \tringpos=0x%06x, writepos=0x%06x, wrapspot=0x%06x, signalpos=0x%06x", readpos, writepos, m_RingWrapSpot, m_SignalRingPosition );
do {
AtomicExchange( m_SignalRingEnable, 1 );
SetEvent();
m_sem_OnRingReset.WaitWithoutYield();
readpos = volatize(m_RingPos);
//Console.WriteLn( Color_Blue, "(EEcore Awake) Report!\tringpos=0x%06x", readpos );
} while( (writepos < readpos) && (writepos+size >= readpos) );
pxAssertDev( m_SignalRingPosition <= 0, "MTGS Thread Synchronization Error" );
}
else
{
do {
AtomicExchange( m_SignalRingEnable, 1 );
SetEvent();
do {
SpinWait();
readpos = volatize(m_RingPos);
} while( (writepos < readpos) && (writepos+size >= readpos) );
}
m_sem_OnRingReset.WaitWithoutYield();
readpos = volatize(m_RingPos);
//Console.WriteLn( Color_Blue, "(EEcore Awake) Report!\tringpos=0x%06x", readpos );
} while( (writepos < readpos) && (writepos+size >= readpos) );
pxAssertDev( m_SignalRingPosition <= 0, "MTGS Thread Synchronization Error" );
}
else
{
SetEvent();
do {
SpinWait();
readpos = volatize(m_RingPos);
} while( (writepos < readpos) && (writepos+size >= readpos) );
}
}
else if( writepos + size > RingBufferSize )
{
pxAssert( writepos != 0 );
// If the incoming packet doesn't fit, then start over from the start of the ring
// buffer (it's a lot easier than trying to wrap the packet around the end of the
// buffer).
//Console.WriteLn( "MTGS > Ringbuffer Got Filled!");
RestartRingbuffer( size );
writepos = m_WritePos;
}
else // always true - if( writepos + size == MTGS_RINGBUFFEREND )
{
// Yay. Perfect fit. What are the odds?
// Copy is ready so long as readpos is less than writepos and *not* equal to the
// base of the ringbuffer (otherwise the buffer will stop when the writepos is
// wrapped around to zero later-on in SendDataPacket).
uint readpos = volatize(m_RingPos);
//Console.WriteLn( "MTGS > Perfect Fit!\tringpos=0x%06x, writepos=0x%06x", readpos, writepos );
if( readpos > writepos || readpos == 0 )
{
uint totalAccum = (readpos == 0) ? RingBufferSize : ((m_RingWrapSpot - readpos) + writepos);
uint somedone = totalAccum / 4;
if( somedone < size+1 ) somedone = size + 1;
// FMV Optimization: (see above) This condition of a perfect fit is so rare that optimizing
// for it is pointless -- but it was also mindlessly simple copy-paste. So there. :p
if( somedone > 0x80 )
{
m_SignalRingPosition = somedone;
//Console.WriteLn( Color_Blue, "(MTGS Sync) EEcore Perfect Sleep!\twrapspot=0x%06x, ringpos=0x%06x, writepos=0x%06x, signalpos=0x%06x", m_RingWrapSpot, readpos, writepos, m_SignalRingPosition );
do {
AtomicExchange( m_SignalRingEnable, 1 );
SetEvent();
m_sem_OnRingReset.WaitWithoutYield();
readpos = volatize(m_RingPos);
//Console.WriteLn( Color_Blue, "(MTGS Sync) EEcore Perfect Post-sleep Report!\tringpos=0x%06x", readpos );
} while( (writepos < readpos) || (readpos==0) );
pxAssertDev( m_SignalRingPosition <= 0, "MTGS Thread Synchronization Error" );
}
else
{
//Console.WriteLn( Color_Blue, "(MTGS Sync) EEcore Perfect Spin!" );
SetEvent();
do {
SpinWait();
readpos = volatize(m_RingPos);
} while( (writepos < readpos) || (readpos==0) );
}
}
m_QueuedFrameCount = 0;
m_RingWrapSpot = RingBufferSize;
}
#ifdef RINGBUF_DEBUG_STACK
m_lock_Stack.Lock();
@ -739,9 +728,8 @@ int SysMtgsThread::PrepDataPacket( MTGS_RingCommand cmd, u32 size )
PacketTagType& tag = (PacketTagType&)RingBuffer[m_WritePos];
tag.command = cmd;
tag.data[0] = m_packet_size;
m_packet_ringpos = m_WritePos + 1;
return m_packet_size;
m_packet_startpos = m_WritePos;
m_packet_ringpos = (m_WritePos + 1) & RingBufferMask;
}
// Returns the amount of giftag data processed (in simd128 values).
@ -749,13 +737,14 @@ int SysMtgsThread::PrepDataPacket( MTGS_RingCommand cmd, u32 size )
// around VU memory instead of having buffer overflow...
// Parameters:
// size - size of the packet data, in simd128's
int SysMtgsThread::PrepDataPacket( GIF_PATH pathidx, const u8* srcdata, u32 size )
void SysMtgsThread::PrepDataPacket( GIF_PATH pathidx, u32 size )
{
//m_PacketLocker.Acquire();
return PrepDataPacket( (MTGS_RingCommand)pathidx, GIFPath_ParseTag(pathidx, srcdata, size) );
PrepDataPacket( (MTGS_RingCommand)pathidx, size );
}
#if 0
void SysMtgsThread::RestartRingbuffer( uint packsize )
{
if( m_WritePos == 0 ) return;
@ -816,6 +805,7 @@ void SysMtgsThread::RestartRingbuffer( uint packsize )
if( EmuConfig.GS.SynchronousMTGS )
WaitGS();
}
#endif
__forceinline uint SysMtgsThread::_PrepForSimplePacket()
{
@ -830,10 +820,7 @@ __forceinline uint SysMtgsThread::_PrepForSimplePacket()
future_writepos &= RingBufferMask;
if( future_writepos == 0 )
{
m_QueuedFrameCount = 0;
m_RingWrapSpot = RingBufferSize;
}
uint readpos = volatize(m_RingPos);
if( future_writepos == readpos )
@ -841,7 +828,15 @@ __forceinline uint SysMtgsThread::_PrepForSimplePacket()
// The ringbuffer read pos is blocking the future write position, so stall out
// until the read position has moved.
uint totalAccum = (m_RingWrapSpot - readpos) + future_writepos;
uint freeroom;
if (future_writepos < readpos)
freeroom = readpos - future_writepos;
else
freeroom = RingBufferSize - (future_writepos - readpos);
uint totalAccum = RingBufferSize - freeroom;
uint somedone = totalAccum / 4;
if( somedone > 0x80 )

View File

@ -144,6 +144,7 @@ static s32 CALLBACK fallback_test() { return 0; }
_GSvsync GSvsync;
_GSopen GSopen;
_GSopen2 GSopen2;
_GSgifTransfer GSgifTransfer;
_GSgifTransfer1 GSgifTransfer1;
_GSgifTransfer2 GSgifTransfer2;
_GSgifTransfer3 GSgifTransfer3;
@ -309,7 +310,8 @@ static const LegacyApi_ReqMethod s_MethMessReq_GS[] =
{
{ "GSopen", (vMeth**)&GSopen, NULL },
{ "GSvsync", (vMeth**)&GSvsync, NULL },
{ "GSgifTransfer1", (vMeth**)&GSgifTransfer1, NULL },
{ "GSgifTransfer", (vMeth**)&GSgifTransfer, NULL },
//{ "GSgifTransfer1", (vMeth**)&GSgifTransfer1, NULL },
{ "GSgifTransfer2", (vMeth**)&GSgifTransfer2, NULL },
{ "GSgifTransfer3", (vMeth**)&GSgifTransfer3, NULL },
{ "GSreadFIFO2", (vMeth**)&GSreadFIFO2, NULL },

View File

@ -2057,21 +2057,8 @@ void _vuXGKICK(VURegs * VU)
u8* data = ((u8*)VU->Mem + ((VU->VI[_Is_].US[0]*16) & 0x3fff));
u32 size;
size = GetMTGS().PrepDataPacket( GIF_PATH_1, data, (0x4000-((VU->VI[_Is_].US[0]*16) & 0x3fff)) >> 4);
u8* pmem = GetMTGS().GetDataPacketPtr();
if((size << 4) > (u32)(0x4000-((VU->VI[_Is_].US[0]*16) & 0x3fff)))
{
//DevCon.Warning("addr + Size = 0x%x, transferring %x then doing %x", ((VU->VI[_Is_].US[0]*16) & 0x3fff) + (size << 4), (0x4000-((VU->VI[_Is_].US[0]*16) & 0x3fff)) >> 4, size - (0x4000-((VU->VI[_Is_].US[0]*16) & 0x3fff) >> 4));
memcpy_aligned(pmem, (u8*)VU->Mem+((VU->VI[_Is_].US[0]*16) & 0x3fff), 0x4000-((VU->VI[_Is_].US[0]*16) & 0x3fff));
size -= (0x4000-((VU->VI[_Is_].US[0]*16) & 0x3fff)) >> 4;
//DevCon.Warning("Size left %x", size);
pmem += 0x4000-((VU->VI[_Is_].US[0]*16) & 0x3fff);
memcpy_aligned(pmem, (u8*)VU->Mem, size<<4);
}
else {
memcpy_aligned(pmem, (u8*)VU->Mem+((VU->VI[_Is_].US[0]*16) & 0x3fff), size<<4);
}
GetMTGS().PrepDataPacket( GIF_PATH_1, 0x400 );
size = GIFPath_CopyTag( GIF_PATH_1, (u128*)data, (0x400-(VU->VI[_Is_].US[0] & 0x3ff)) );
GetMTGS().SendDataPacket();
}
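
For clarity on the unit change in this function: XGKICK offsets and sizes are now counted in qwc (128-bit units), so VU1's 16 KB of memory spans 0x400 qwc rather than 0x4000 bytes. A small sketch of the offset math:

// Sketch: one qwc = 16 bytes, so VU1's 16 KB memory = 0x4000 bytes = 0x400 qwc.
static uint QwcUntilVuMemWrap( uint startAddrBytes )
{
	uint startqwc = (startAddrBytes / 16) & 0x3ff;   // qwc offset into VU1 memory
	return 0x400 - startqwc;                         // qwc available before VU memory wraps
}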

View File

@ -213,8 +213,8 @@ template<int idx> _f int _vifCode_Direct(int pass, u8* data, bool isDirectHL) {
v.bSize = 0;
v.bPtr = 0;
}
const uint count = GetMTGS().PrepDataPacket(GIF_PATH_2, v.buffer, 1);
memcpy_fast(GetMTGS().GetDataPacketPtr(), v.buffer, count << 4);
GetMTGS().PrepDataPacket(GIF_PATH_2, 1);
GIFPath_CopyTag(GIF_PATH_2, (u128*)v.buffer, 1);
GetMTGS().SendDataPacket();
if(vif1.tag.size == 0)
@ -226,16 +226,17 @@ template<int idx> _f int _vifCode_Direct(int pass, u8* data, bool isDirectHL) {
}
else
{
const uint count = GetMTGS().PrepDataPacket(GIF_PATH_2, data, size >> 4);
memcpy_fast(GetMTGS().GetDataPacketPtr(), data, count << 4);
GetMTGS().PrepDataPacket(GIF_PATH_2, size/16);
uint count = GIFPath_CopyTag(GIF_PATH_2, (u128*)data, size/16) * 4;
GetMTGS().SendDataPacket();
vif1.tag.size -= count << 2;
vif1.tag.size -= count;
if(vif1.tag.size == 0)
{
vif1.cmd = 0;
}
vif1.vifstalled = true;
return count << 2;
return count;
}
}

View File

@ -97,7 +97,7 @@ struct GIFPath
u8 GetReg();
bool IsActive() const;
int ParseTag(GIF_PATH pathidx, const u8* pMem, u32 size);
int CopyTag(GIF_PATH pathidx, const u128* pMem, u32 size);
int ParseTagQuick(GIF_PATH pathidx, const u8* pMem, u32 size);
};
@ -287,7 +287,8 @@ __forceinline void GIFPath::PrepPackedRegs()
__forceinline void GIFPath::SetTag(const void* mem)
{
const_cast<GIFTAG&>(tag) = *((GIFTAG*)mem);
_mm_store_ps( (float*)&tag, _mm_loadu_ps((float*)mem) );
//const_cast<GIFTAG&>(tag) = *((GIFTAG*)mem);
nloop = tag.NLOOP;
curreg = 0;
@ -521,15 +522,50 @@ __forceinline int GIFPath::ParseTagQuick(GIF_PATH pathidx, const u8* pMem, u32 s
return size;
}
__forceinline int GIFPath::ParseTag(GIF_PATH pathidx, const u8* pMem, u32 size)
void MemCopy_WrappedDest( const u128* src, u128* destBase, uint& destStart, uint destSize, uint len )
{
uint endpos = destStart + len;
if( endpos >= destSize )
{
uint firstcopylen = destSize - destStart;
memcpy_aligned(&destBase[destStart], src, firstcopylen );
destStart = endpos - destSize;
memcpy_aligned(destBase, src+firstcopylen, destStart );
}
else
{
memcpy_aligned(&destBase[destStart], src, len );
destStart += len;
}
}
// [TODO] optimization: If later templated, we can have Paths 1 and 3 use aligned SSE movs,
// since only PATH2 can feed us unaligned source data.
#define copyTag() do { \
/*RingBuffer.m_Ring[ringpos] = *pMem128;*/ \
_mm_store_ps( (float*)&RingBuffer.m_Ring[ringpos], _mm_loadu_ps((float*)pMem128)); \
++pMem128; --size; \
ringpos = (ringpos+1)&RingBufferMask; \
} while(false)
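
The macro above pairs an unaligned SSE load with an aligned store: the ring slot is always 16-byte aligned, while PATH2 sources may be only 8-byte aligned, which _mm_loadu_ps tolerates. The same copy as a standalone sketch:

#include <xmmintrin.h>

// Copy one 128-bit qwc from a possibly-unaligned src to a 16-byte-aligned dst.
static inline void CopyQwc( void* dst, const void* src )
{
	_mm_store_ps( (float*)dst, _mm_loadu_ps((const float*)src) );
}
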
__forceinline int GIFPath::CopyTag(GIF_PATH pathidx, const u128* pMem128, u32 size)
{
uint& ringpos = GetMTGS().m_packet_ringpos;
const uint original_ringpos = ringpos;
u32 startSize = size; // Start Size
while (size > 0) {
if (!nloop) {
SetTag(pMem);
incTag(1);
// [TODO] Optimization: Use MMX intrinsics for SetTag and CopyTag, which both currently
// produce a series of mov eax,[src]; mov [dest],eax instructions to copy these
// individual qwcs. Warning: Path2 transfers are not always QWC-aligned, but they are
// always aligned on an 8 byte boundary; so it's probably best to use MMX here.
SetTag((u8*)pMem128);
copyTag();
if(nloop > 0)
{
@ -599,9 +635,9 @@ __forceinline int GIFPath::ParseTag(GIF_PATH pathidx, const u8* pMem, u32 size)
{
do {
if (GetReg() == 0xe) {
gsHandler(pMem);
gsHandler((u8*)pMem128);
}
incTag(1);
copyTag();
} while(StepReg() && size > 0 && SIGNAL_IMR_Pending == false);
}
else
@ -644,11 +680,14 @@ __forceinline int GIFPath::ParseTag(GIF_PATH pathidx, const u8* pMem, u32 size)
curreg = 0;
nloop = 0;
}
incTag(len);
MemCopy_WrappedDest( pMem128, RingBuffer.m_Ring, ringpos, RingBufferSize, len );
pMem128 += len;
size -= len;
}
break;
case GIF_FLG_REGLIST:
{
{
GIF_LOG("Reglist Mode EOP %x", tag.EOP);
// In reglist mode, the GIF packs 2 registers into each QWC. The nloop however
@ -687,8 +726,9 @@ __forceinline int GIFPath::ParseTag(GIF_PATH pathidx, const u8* pMem, u32 size)
nloop = 0;
}
incTag(len);
MemCopy_WrappedDest( pMem128, RingBuffer.m_Ring, ringpos, RingBufferSize, len );
pMem128 += len;
size -= len;
}
break;
case GIF_FLG_IMAGE:
@ -696,13 +736,15 @@ __forceinline int GIFPath::ParseTag(GIF_PATH pathidx, const u8* pMem, u32 size)
{
GIF_LOG("IMAGE Mode EOP %x", tag.EOP);
int len = aMin(size, nloop);
incTag(len);
MemCopy_WrappedDest( pMem128, RingBuffer.m_Ring, ringpos, RingBufferSize, len );
pMem128 += len;
size -= len;
nloop -= len;
}
break;
}
}
if(pathidx == GIF_PATH_1)
@ -713,11 +755,11 @@ __forceinline int GIFPath::ParseTag(GIF_PATH pathidx, const u8* pMem, u32 size)
{
size = 0x3ff - startSize;
startSize = 0x3ff;
pMem -= 0x4000;
pMem128 -= 0x400;
}
else
{
// Note: The BIOS does an XGKICK on the VU1 and lets yt DMA to the GS without an EOP
// Note: The BIOS does an XGKICK on the VU1 and lets it DMA to the GS without an EOP
// (seemingly to loop forever), only to write an EOP later on. No other game is known to
// do anything of the sort.
// So let's just cap the DMA at 16k, and force it to "look" like it's terminated for now.
@ -727,6 +769,11 @@ __forceinline int GIFPath::ParseTag(GIF_PATH pathidx, const u8* pMem, u32 size)
Console.Warning("GIFTAG error, size exceeded VU memory size %x", startSize);
nloop = 0;
// Don't send the packet to the GS -- it's incomplete and might cause the GS plugin
// to get confused and die. >_<
ringpos = original_ringpos;
}
}
}
@ -793,47 +840,18 @@ __forceinline int GIFPath::ParseTag(GIF_PATH pathidx, const u8* pMem, u32 size)
gif->qwc -= size;
}
}
return size;
}
// Processes a GIFtag & packet, and throws out some gsIRQs as needed.
// Used to keep interrupts in sync with the EE, while the GS itself
// runs potentially several frames behind.
// Parameters:
// size - max size of incoming data stream, in qwc (simd128)
__forceinline int GIFPath_ParseTag(GIF_PATH pathidx, const u8* pMem, u32 size)
__forceinline int GIFPath_CopyTag(GIF_PATH pathidx, const u128* pMem, u32 size)
{
#ifdef PCSX2_GSRING_SAMPLING_STATS
static uptr profStartPtr = 0;
static uptr profEndPtr = 0;
if (profStartPtr == 0) {
__asm
{
__beginfunc:
mov profStartPtr, offset __beginfunc;
mov profEndPtr, offset __endfunc;
}
ProfilerRegisterSource( "GSRingBufCopy", (void*)profStartPtr, profEndPtr - profStartPtr );
}
#endif
int retSize = s_gifPath[pathidx].ParseTag(pathidx, pMem, size);
#ifdef PCSX2_GSRING_SAMPLING_STATS
__asm
{
__endfunc:
nop;
}
#endif
return retSize;
return s_gifPath[pathidx].CopyTag(pathidx, pMem, size);
}
//Quick version for queueing PATH1 data
// Quick version for queueing PATH1 data.
// This version calculates the real length of the packet data only. It does not process
// IRQs or DMA status updates.
__forceinline int GIFPath_ParseTagQuick(GIF_PATH pathidx, const u8* pMem, u32 size)
{
int retSize = s_gifPath[pathidx].ParseTagQuick(pathidx, pMem, size);

View File

@ -1101,27 +1101,15 @@ void __fastcall mVU_XGKICK_(u32 addr) {
if(gifRegs->stat.APATH <= GIF_APATH1 || (gifRegs->stat.APATH == GIF_APATH3 && gifRegs->stat.IP3 == true) && SIGNAL_IMR_Pending == false)
{
if(Path1WritePos != 0)
{
//Flush any pending transfers so things don't go up in the wrong order
while(gifRegs->stat.P1Q == true) gsPath1Interrupt();
}
size = GetMTGS().PrepDataPacket(GIF_PATH_1, data, diff);
pDest = GetMTGS().GetDataPacketPtr();
if (size > diff) {
// fixme: one of these days the following *16's will get cleaned up when we introduce
// a special qwc/simd16 optimized version of memcpy_aligned. :)
//DevCon.Status("XGkick Wrap!");
memcpy_aligned(pDest, microVU1.regs->Mem + (addr*16), diff*16);
size -= diff;
pDest += diff*16;
memcpy_aligned(pDest, microVU1.regs->Mem, size*16);
}
else {
memcpy_aligned(pDest, microVU1.regs->Mem + (addr*16), size*16);
}
GetMTGS().PrepDataPacket(GIF_PATH_1, 0x400);
size = GIFPath_CopyTag(GIF_PATH_1, (u128*)data, diff);
GetMTGS().SendDataPacket();
if(GSTransferStatus.PTH1 == STOPPED_MODE)
{
gifRegs->stat.OPH = false;
@ -1141,14 +1129,14 @@ void __fastcall mVU_XGKICK_(u32 addr) {
// fixme: one of these days the following *16's will get cleaned up when we introduce
// a special qwc/simd16 optimized version of memcpy_aligned. :)
//DevCon.Status("XGkick Wrap!");
memcpy_aligned(pDest, microVU1.regs->Mem + (addr*16), diff*16);
memcpy_aligned(pDest, microVU1.regs->Mem + (addr*16), diff);
Path1WritePos += size;
size -= diff;
pDest += diff*16;
memcpy_aligned(pDest, microVU1.regs->Mem, size*16);
memcpy_aligned(pDest, microVU1.regs->Mem, size);
}
else {
memcpy_aligned(pDest, microVU1.regs->Mem + (addr*16), size*16);
memcpy_aligned(pDest, microVU1.regs->Mem + (addr*16), size);
Path1WritePos += size;
}
//if(!gifRegs->stat.P1Q) CPU_INT(28, 128);

View File

@ -1988,21 +1988,10 @@ void __fastcall VU1XGKICK_MTGSTransfer(u32 *pMem, u32 addr)
//Flush any pending transfers so things don't go up in the wrong order
while(gifRegs->stat.P1Q == true) gsPath1Interrupt();
}
size = GetMTGS().PrepDataPacket(GIF_PATH_1, data, diff);
pDest = GetMTGS().GetDataPacketPtr();
if (size > diff) {
// fixme: one of these days the following *16's will get cleaned up when we introduce
// a special qwc/simd16 optimized version of memcpy_aligned. :)
memcpy_aligned(pDest, VU1.Mem + addr, diff*16);
size -= diff;
pDest += diff*16;
memcpy_aligned(pDest, VU1.Mem, size*16);
}
else {
memcpy_aligned(pDest, VU1.Mem + addr, size*16);
}
GetMTGS().PrepDataPacket(GIF_PATH_1, 0x400);
size = GIFPath_CopyTag(GIF_PATH_1, (u128*)data, diff);
GetMTGS().SendDataPacket();
if(GSTransferStatus.PTH1 == STOPPED_MODE )
{
gifRegs->stat.OPH = false;
@ -2015,8 +2004,6 @@ void __fastcall VU1XGKICK_MTGSTransfer(u32 *pMem, u32 addr)
size = GIFPath_ParseTagQuick(GIF_PATH_1, data, diff);
pDest = &Path1Buffer[Path1WritePos*16];
pxAssumeMsg((Path1WritePos+size < sizeof(Path1Buffer)), "XGKick Buffer Overflow detected on Path1Buffer!");
//DevCon.Warning("Storing size %x PATH 1", size);
@ -2024,14 +2011,14 @@ void __fastcall VU1XGKICK_MTGSTransfer(u32 *pMem, u32 addr)
// fixme: one of these days the following *16's will get cleaned up when we introduce
// a special qwc/simd16 optimized version of memcpy_aligned. :)
//DevCon.Status("XGkick Wrap!");
memcpy_aligned(pDest, VU1.Mem + addr, diff*16);
memcpy_aligned(pDest, VU1.Mem + addr, diff);
Path1WritePos += size;
size -= diff;
pDest += diff*16;
memcpy_aligned(pDest, VU1.Mem, size*16);
memcpy_aligned(pDest, VU1.Mem, size);
}
else {
memcpy_aligned(pDest, VU1.Mem + addr, size*16);
memcpy_aligned(pDest, VU1.Mem + addr, size);
Path1WritePos += size;
}
//if(!gifRegs->stat.P1Q) CPU_INT(28, 128);