(3DS) video driver: performance improvements.

This commit is contained in:
aliaspider 2015-04-13 01:50:00 +01:00
parent 64e3e40bb6
commit e754c328b0
2 changed files with 133 additions and 91 deletions

View File

@ -202,7 +202,7 @@ static void* ctr_init(const video_info_t* video,
CTRGU_ATTRIBFMT(GPU_SHORT, 2) << 4,
sizeof(ctr_vertex_t));
GPUCMD_Finalize();
GPUCMD_FlushAndRun(NULL);
ctrGuFlushAndRun(true);
gspWaitForEvent(GSPEVENT_P3D, false);
if (input && input_data)
@ -214,7 +214,7 @@ static void* ctr_init(const video_info_t* video,
return ctr;
}
//#define gspWaitForEvent(...)
static bool ctr_frame(void* data, const void* frame,
unsigned width, unsigned height, unsigned pitch, const char* msg)
{
@ -249,17 +249,17 @@ static bool ctr_frame(void* data, const void* frame,
}
frames++;
currentTick = osGetTime();
currentTick = svcGetSystemTick();
uint32_t diff = currentTick - lastTick;
if(diff > 1000)
if(diff > CTR_CPU_TICKS_PER_SECOND)
{
fps = (float)frames * (1000.0 / diff);
fps = (float)frames * ((float) CTR_CPU_TICKS_PER_SECOND / (float) diff);
lastTick = currentTick;
frames = 0;
}
printf("fps: %8.4f frames: %i\r", fps, total_frames++);
fflush(stdout);
// fflush(stdout);
/* enable this to profile the core without video output */
#if 0
@ -267,60 +267,83 @@ static bool ctr_frame(void* data, const void* frame,
goto end;
#endif
svcWaitSynchronization(gspEvents[GSPEVENT_P3D], 20000000);
svcClearEvent(gspEvents[GSPEVENT_P3D]);
svcWaitSynchronization(gspEvents[GSPEVENT_PPF], 20000000);
svcClearEvent(gspEvents[GSPEVENT_PPF]);
gfxSwapBuffersGpu();
if (ctr->vsync)
gspWaitForEvent(GSPEVENT_VBlank0, true);
ctrGuSetMemoryFill(true, (u32*)CTR_GPU_FRAMEBUFFER, 0x00000000,
(u32*)(CTR_GPU_FRAMEBUFFER + CTR_TOP_FRAMEBUFFER_WIDTH * CTR_TOP_FRAMEBUFFER_HEIGHT * sizeof(uint32_t)),
0x201, (u32*)CTR_GPU_DEPTHBUFFER, 0x00000000,
(u32*)(CTR_GPU_DEPTHBUFFER + CTR_TOP_FRAMEBUFFER_WIDTH * CTR_TOP_FRAMEBUFFER_HEIGHT * sizeof(uint32_t)),
0x201);
GPUCMD_SetBufferOffset(0);
if (width > ctr->texture_width)
width = ctr->texture_width;
if (height > ctr->texture_height)
height = ctr->texture_height;
if(frame)
{
int i;
uint16_t* dst = (uint16_t*)ctr->texture_linear;
const uint8_t* src = frame;
if (width > ctr->texture_width)
width = ctr->texture_width;
if (height > ctr->texture_height)
height = ctr->texture_height;
for (i = 0; i < height; i++)
if(((((u32)(frame)) >= 0x14000000 && ((u32)(frame)) < 0x1c000000)) /* frame in linear memory */
&& !((u32)frame & 0x7F) /* 128-byte aligned */
&& !((pitch) & 0xF)) /* 16-byte aligned */
{
memcpy(dst, src, width * sizeof(uint16_t));
dst += ctr->texture_width;
src += pitch;
/* can copy the buffer directly with the GPU */
ctrGuCopyImage(false, frame, pitch / 2, height, CTRGU_RGB565, false,
ctr->texture_swizzled, ctr->texture_width, CTRGU_RGB565, true);
}
GSPGPU_FlushDataCache(NULL, ctr->texture_linear,
ctr->texture_width * ctr->texture_height * sizeof(uint16_t));
else
{
int i;
uint16_t* dst = (uint16_t*)ctr->texture_linear;
const uint8_t* src = frame;
for (i = 0; i < height; i++)
{
memcpy(dst, src, width * sizeof(uint16_t));
dst += ctr->texture_width;
src += pitch;
}
GSPGPU_FlushDataCache(NULL, ctr->texture_linear,
ctr->texture_width * ctr->texture_height * sizeof(uint16_t));
ctrGuCopyImage(ctr->texture_linear, ctr->texture_width, ctr->menu.texture_height, CTRGU_RGB565, false,
ctr->texture_swizzled, ctr->texture_width, CTRGU_RGB565, true);
gspWaitForEvent(GSPEVENT_PPF, false);
ctrGuSetTexture(GPU_TEXUNIT0, VIRT_TO_PHYS(ctr->texture_swizzled), ctr->texture_width, ctr->texture_height,
GPU_TEXTURE_MAG_FILTER(GPU_LINEAR) | GPU_TEXTURE_MIN_FILTER(GPU_LINEAR) |
GPU_TEXTURE_WRAP_S(GPU_CLAMP_TO_EDGE) | GPU_TEXTURE_WRAP_T(GPU_CLAMP_TO_EDGE),
GPU_RGB565);
ctr->frame_coords->u = width;
ctr->frame_coords->v = height;
GSPGPU_FlushDataCache(NULL, (u8*)ctr->frame_coords, sizeof(ctr_vertex_t));
ctrGuSetAttributeBuffersAddress(VIRT_TO_PHYS(ctr->frame_coords));
ctrGuSetVertexShaderFloatUniform(0, (float*)&ctr->scale_vector, 1);
GPU_DrawArray(GPU_UNKPRIM, 1);
ctrGuCopyImage(false, ctr->texture_linear, ctr->texture_width, ctr->menu.texture_height, CTRGU_RGB565, false,
ctr->texture_swizzled, ctr->texture_width, CTRGU_RGB565, true);
}
}
ctrGuSetTexture(GPU_TEXUNIT0, VIRT_TO_PHYS(ctr->texture_swizzled), ctr->texture_width, ctr->texture_height,
GPU_TEXTURE_MAG_FILTER(GPU_LINEAR) | GPU_TEXTURE_MIN_FILTER(GPU_LINEAR) |
GPU_TEXTURE_WRAP_S(GPU_CLAMP_TO_EDGE) | GPU_TEXTURE_WRAP_T(GPU_CLAMP_TO_EDGE),
GPU_RGB565);
ctr->frame_coords->u = width;
ctr->frame_coords->v = height;
GSPGPU_FlushDataCache(NULL, (u8*)ctr->frame_coords, sizeof(ctr_vertex_t));
ctrGuSetAttributeBuffersAddress(VIRT_TO_PHYS(ctr->frame_coords));
ctrGuSetVertexShaderFloatUniform(0, (float*)&ctr->scale_vector, 1);
GPU_DrawArray(GPU_UNKPRIM, 1);
if (ctr->menu_texture_enable)
{
GSPGPU_FlushDataCache(NULL, ctr->menu.texture_linear,
ctr->menu.texture_width * ctr->menu.texture_height * sizeof(uint16_t));
ctrGuCopyImage(ctr->menu.texture_linear, ctr->menu.texture_width, ctr->menu.texture_height, CTRGU_RGBA4444,false,
ctrGuCopyImage(false, ctr->menu.texture_linear, ctr->menu.texture_width, ctr->menu.texture_height, CTRGU_RGBA4444,false,
ctr->menu.texture_swizzled, ctr->menu.texture_width, CTRGU_RGBA4444, true);
gspWaitForEvent(GSPEVENT_PPF, false);
ctrGuSetTexture(GPU_TEXUNIT0, VIRT_TO_PHYS(ctr->menu.texture_swizzled), ctr->menu.texture_width, ctr->menu.texture_height,
GPU_TEXTURE_MAG_FILTER(GPU_LINEAR) | GPU_TEXTURE_MIN_FILTER(GPU_LINEAR) |
GPU_TEXTURE_WRAP_S(GPU_CLAMP_TO_EDGE) | GPU_TEXTURE_WRAP_T(GPU_CLAMP_TO_EDGE),
@ -334,27 +357,14 @@ static bool ctr_frame(void* data, const void* frame,
GPU_FinishDrawing();
GPUCMD_Finalize();
GPUCMD_FlushAndRun(NULL);
gspWaitForEvent(GSPEVENT_P3D, false);
ctrGuFlushAndRun(true);
ctrGuDisplayTransfer(CTR_GPU_FRAMEBUFFER, 240,400, CTRGU_RGBA8,
ctrGuDisplayTransfer(true, CTR_GPU_FRAMEBUFFER, 240,400, CTRGU_RGBA8,
gfxGetFramebuffer(GFX_TOP, GFX_LEFT, NULL, NULL), 240,400,CTRGU_RGB8, CTRGU_MULTISAMPLE_NONE);
gspWaitForEvent(GSPEVENT_PPF, false);
GX_SetMemoryFill(NULL, (u32*)CTR_GPU_FRAMEBUFFER, 0x00000000,
(u32*)(CTR_GPU_FRAMEBUFFER + CTR_TOP_FRAMEBUFFER_WIDTH * CTR_TOP_FRAMEBUFFER_HEIGHT * sizeof(uint32_t)),
0x201, (u32*)CTR_GPU_DEPTHBUFFER, 0x00000000,
(u32*)(CTR_GPU_DEPTHBUFFER + CTR_TOP_FRAMEBUFFER_WIDTH * CTR_TOP_FRAMEBUFFER_HEIGHT * sizeof(uint32_t)),
0x201);
gspWaitForEvent(GSPEVENT_PSC0, false);
gfxSwapBuffersGpu();
// if (ctr->vsync)
// gspWaitForEvent(GSPEVENT_VBlank0, true);
end:
// gspWaitForEvent(GSPEVENT_VBlank0, true);
RARCH_PERFORMANCE_STOP(ctrframe_f);
return true;
}

View File

@ -48,36 +48,13 @@
#define CTRGU_MULTISAMPLE_2x1 (1 << 24)
#define CTRGU_MULTISAMPLE_2x2 (2 << 24)
/* Storage for one GX command: eight 32-bit words.
 * ctrGuWriteDisplayTransferCommand() fills all eight words and
 * ctrGuSubmitGxCommand() passes the struct, cast to u32*, to
 * GSPGPU_SubmitGxCommand(). */
typedef struct
{
uint32_t buffer[8];
} gtrgu_gx_command_t;
/* Number of svcGetSystemTick() ticks that the FPS counter in ctr_frame()
 * treats as one second.
 * NOTE(review): presumably the CPU tick rate in Hz - confirm against the
 * platform headers. */
#define CTR_CPU_TICKS_PER_SECOND 268123480
/**
 * Fills every word of @command with a GX command whose ID is 0x03.
 *
 * @param command  command storage to overwrite.
 * @param src      source address, stored as a 32-bit value in word 1.
 * @param src_w    source width; packed with @src_h via CTRGU_SIZE() into word 3.
 * @param src_h    source height.
 * @param dst      destination address, stored as a 32-bit value in word 2.
 * @param dst_w    destination width; packed with @dst_h via CTRGU_SIZE() into word 4.
 * @param dst_h    destination height.
 * @param flags    stored unchanged in word 5.
 *
 * @return always 0.
 */
__attribute__((always_inline))
static INLINE int ctrGuWriteDisplayTransferCommand(gtrgu_gx_command_t* command,
      void* src, int src_w, int src_h,
      void* dst, int dst_w, int dst_h,
      uint32_t flags)
{
   uint32_t *words = command->buffer;

   words[0] = 0x03; /* CommandID */
   words[1] = (uint32_t)src;
   words[2] = (uint32_t)dst;
   words[3] = CTRGU_SIZE(src_w, src_h);
   words[4] = CTRGU_SIZE(dst_w, dst_h);
   words[5] = flags;

   /* The two trailing words are always cleared. */
   words[6] = 0x0;
   words[7] = 0x0;

   return 0;
}
/**
 * Submits @command through GSPGPU_SubmitGxCommand().
 *
 * @param gxbuf    buffer handed to GSPGPU_SubmitGxCommand() as its first
 *                 argument; when NULL, the global gxCmdBuf is used instead.
 * @param command  command words to submit.
 *
 * @return whatever GSPGPU_SubmitGxCommand() returns.
 */
__attribute__((always_inline))
static INLINE int ctrGuSubmitGxCommand(u32* gxbuf, gtrgu_gx_command_t* command)
{
   u32* target = gxbuf ? gxbuf : gxCmdBuf;

   return GSPGPU_SubmitGxCommand(target, (u32*)command, NULL);
}
/* Symbols defined outside this header. */

/* Event handles indexed by GSPEVENT_* values. */
extern Handle gspEvents[GSPEVENT_MAX];
/* GPU command buffer and its current offset; ctrGuFlushAndRun() submits
 * gpuCmdBufOffset*4 as the buffer size. */
extern u32* gpuCmdBuf;
extern u32 gpuCmdBufOffset;
/* Linear heap size and base; ctrGuFlushAndRun() passes them as the second
 * buffer of its first GX command. */
extern u32 __linear_heap_size;
extern u32* __linear_heap;
__attribute__((always_inline))
static INLINE void ctrGuSetTexture(GPU_TEXUNIT unit, u32* data,
@ -108,14 +85,68 @@ static INLINE void ctrGuSetTexture(GPU_TEXUNIT unit, u32* data,
}
}
/**
 * Builds and submits a GX command with ID 0x05 describing up to three
 * (address, size) buffer pairs.
 *
 * @param queued  when true, bit 24 (0x01000000) is set in the command ID word.
 * @param buf0a   buf0 address.
 * @param buf0s   buf0 size.
 * @param buf1a   buf1 address.
 * @param buf1s   buf1 size.
 * @param buf2a   buf2 address.
 * @param buf2s   buf2 size.
 *
 * @return whatever GSPGPU_SubmitGxCommand() returns.
 */
__attribute__((always_inline))
static INLINE Result ctrGuSetCommandList_First(bool queued, u32* buf0a, u32 buf0s, u32* buf1a, u32 buf1s, u32* buf2a, u32 buf2s)
{
   const u32 queue_flag = queued ? 0x01000000 : 0x0;
   u32 words[0x8] = {
      0x05 | queue_flag, /* CommandID */
      (u32)buf0a,        /* buf0 address */
      (u32)buf0s,        /* buf0 size */
      (u32)buf1a,        /* buf1 address */
      (u32)buf1s,        /* buf1 size */
      (u32)buf2a,        /* buf2 address */
      (u32)buf2s,        /* buf2 size */
      0x0
   };

   return GSPGPU_SubmitGxCommand(gxCmdBuf, words, NULL);
}
/**
 * Builds and submits a GX command with ID 0x01 for a single buffer.
 *
 * @param queued  when true, bit 24 (0x01000000) is set in the command ID word.
 * @param buf0a   buf0 address.
 * @param buf0s   buf0 size.
 * @param flags   bit 0 is stored in word 3 and bit 1 in word 7; other bits
 *                are ignored.
 *
 * @return whatever GSPGPU_SubmitGxCommand() returns.
 */
__attribute__((always_inline))
static INLINE Result ctrGuSetCommandList_Last(bool queued, u32* buf0a, u32 buf0s, u8 flags)
{
   const u32 queue_flag = queued ? 0x01000000 : 0x0;
   u32 words[0x8] = {
      0x01 | queue_flag, /* CommandID */
      (u32)buf0a,        /* buf0 address */
      (u32)buf0s,        /* buf0 size */
      flags & 1,         /* written to GSP module state */
      0x0,
      0x0,
      0x0,
      (flags >> 1) & 1   /* when non-zero, call svcFlushProcessDataCache() with the specified buffer */
   };

   return GSPGPU_SubmitGxCommand(gxCmdBuf, words, NULL);
}
__attribute__((always_inline))
static INLINE void ctrGuFlushAndRun(bool queued)
{
//take advantage of GX_SetCommandList_First to flush gsp heap
ctrGuSetCommandList_First(queued, gpuCmdBuf, gpuCmdBufOffset*4, __linear_heap, __linear_heap_size, NULL, 0);
ctrGuSetCommandList_Last(queued, gpuCmdBuf, gpuCmdBufOffset*4, 0x0);
}
/**
 * Builds and submits a GX command with ID 0x02 that describes two fill
 * ranges, each given as (start address, value, end address).
 *
 * @param queued  when true, bit 24 (0x01000000) is set in the command ID word.
 * @param buf0a   buf0 address.
 * @param buf0v   buf0 value.
 * @param buf0e   buf0 end address.
 * @param width0  stored in the low 16 bits of word 7.
 * @param buf1a   buf1 address.
 * @param buf1v   buf1 value.
 * @param buf1e   buf1 end address.
 * @param width1  stored in the high 16 bits of word 7.
 *
 * @return whatever GSPGPU_SubmitGxCommand() returns.
 */
__attribute__((always_inline))
static INLINE Result ctrGuSetMemoryFill(bool queued, u32* buf0a, u32 buf0v, u32* buf0e, u16 width0, u32* buf1a, u32 buf1v, u32* buf1e, u16 width1)
{
   u32 gxCommand[0x8];

   gxCommand[0] = 0x02 | (queued ? 0x01000000 : 0x0); /* CommandID */
   gxCommand[1] = (u32)buf0a; /* buf0 address */
   gxCommand[2] = buf0v;      /* buf0 value */
   gxCommand[3] = (u32)buf0e; /* buf0 end addr */
   gxCommand[4] = (u32)buf1a; /* buf1 address */
   gxCommand[5] = buf1v;      /* buf1 value */
   gxCommand[6] = (u32)buf1e; /* buf1 end addr */

   /* Widen to u32 before shifting: a u16 operand is promoted to signed int,
    * and shifting a value >= 0x8000 left by 16 would then be undefined. */
   gxCommand[7] = (u32)width0 | ((u32)width1 << 16);

   return GSPGPU_SubmitGxCommand(gxCmdBuf, gxCommand, NULL);
}
__attribute__((always_inline))
static INLINE Result ctrGuCopyImage
(void* src, int src_w, int src_h, int src_fmt, bool src_is_tiled,
void* dst, int dst_w, int dst_fmt, bool dst_is_tiled)
(bool queued,
const void* src, int src_w, int src_h, int src_fmt, bool src_is_tiled,
void* dst, int dst_w, int dst_fmt, bool dst_is_tiled)
{
u32 gxCommand[0x8];
gxCommand[0]=0x03; //CommandID
gxCommand[0]=0x03 | (queued? 0x01000000 : 0x0); //CommandID
gxCommand[1]=(u32)src;
gxCommand[2]=(u32)dst;
gxCommand[3]=dst_w&0xFF8;
@ -133,11 +164,12 @@ static INLINE Result ctrGuCopyImage
__attribute__((always_inline))
static INLINE Result ctrGuDisplayTransfer
(void* src, int src_w, int src_h, int src_fmt,
(bool queued,
void* src, int src_w, int src_h, int src_fmt,
void* dst, int dst_w, int dst_h, int dst_fmt, int multisample_lvl)
{
u32 gxCommand[0x8];
gxCommand[0]=0x03; //CommandID
gxCommand[0]=0x03 | (queued? 0x01000000 : 0x0); //CommandID
gxCommand[1]=(u32)src;
gxCommand[2]=(u32)dst;
gxCommand[3]=CTRGU_SIZE(dst_w, dst_h);