zfreeze: cache vertex positions

Suggested by degasus.
This commit is contained in:
Tillmann Karras 2015-06-01 19:58:27 +02:00
parent 9e2f4dd7da
commit 5ddd2cef6c
9 changed files with 80 additions and 17 deletions

View File

@ -1823,6 +1823,7 @@ void XEmitter::PCMPGTD(X64Reg dest, const OpArg& arg) {WriteSSEOp(0x66, 0x66, d
// Packed-integer SSE emitters. Each writes its opcode through WriteSSEOp (or
// WriteSSE41Op for PINSRD); the extract/insert forms then append subreg as a
// trailing byte.
void XEmitter::PEXTRW(X64Reg dest, const OpArg& arg, u8 subreg)
{
	WriteSSEOp(0x66, 0xC5, dest, arg);
	Write8(subreg);
}

void XEmitter::PINSRW(X64Reg dest, const OpArg& arg, u8 subreg)
{
	WriteSSEOp(0x66, 0xC4, dest, arg);
	Write8(subreg);
}

void XEmitter::PINSRD(X64Reg dest, const OpArg& arg, u8 subreg)
{
	WriteSSE41Op(0x66, 0x3A22, dest, arg);
	Write8(subreg);
}

void XEmitter::PMADDWD(X64Reg dest, const OpArg& arg)
{
	WriteSSEOp(0x66, 0xF5, dest, arg);
}

void XEmitter::PSADBW(X64Reg dest, const OpArg& arg)
{
	WriteSSEOp(0x66, 0xF6, dest, arg);
}

View File

@ -711,6 +711,7 @@ public:
// Packed-integer SSE instruction emitters. For the extract/insert forms,
// subreg is written as a trailing byte after the encoded operands.
void PEXTRW(X64Reg dest, const OpArg& arg, u8 subreg);
void PINSRW(X64Reg dest, const OpArg& arg, u8 subreg);
void PINSRD(X64Reg dest, const OpArg& arg, u8 subreg);
void PMADDWD(X64Reg dest, const OpArg& arg);
void PSADBW(X64Reg dest, const OpArg& arg);

View File

@ -14,6 +14,7 @@
#include "VideoCommon/VertexLoader_Normal.h"
#include "VideoCommon/VertexLoader_Position.h"
#include "VideoCommon/VertexLoader_TextCoord.h"
#include "VideoCommon/VertexLoaderManager.h"
#include "VideoCommon/VideoCommon.h"
#include "VideoCommon/VideoConfig.h"
@ -24,6 +25,8 @@ u8* g_vertex_manager_write_ptr;
// Reads one byte via DataReadU8(), keeps its low six bits as the position
// matrix index and writes that index to the output as a u32. While the
// loader's counter is below 3, the index is also recorded in
// VertexLoaderManager::position_matrix_index for zfreeze.
static void LOADERDECL PosMtx_ReadDirect_UByte(VertexLoader* loader)
{
	const u32 matrix_index = DataReadU8() & 0x3f;
	const int slot = loader->m_counter;
	if (slot < 3)
	{
		VertexLoaderManager::position_matrix_index[slot] = matrix_index;
	}
	DataWrite<u32>(matrix_index);
	PRIM_LOG("posmtx: %d, ", matrix_index);
}
@ -316,7 +319,7 @@ int VertexLoader::RunVertices(DataReader src, DataReader dst, int count)
m_numLoadedVertices += count;
m_skippedVertices = 0;
for (int s = 0; s < count; s++)
for (m_counter = count - 1; m_counter >= 0; m_counter--)
{
m_tcIndex = 0;
m_colIndex = 0;

View File

@ -49,6 +49,7 @@ public:
int m_texmtxread;
bool m_vertexSkip;
int m_skippedVertices;
// Per-vertex countdown driven by RunVertices: it runs from count - 1 down to
// 0, so the last vertex of a batch sees 0. Loader functions compare it
// against 3 to decide whether to record data for zfreeze.
int m_counter;
private:
// Pipeline.

View File

@ -26,6 +26,9 @@
namespace VertexLoaderManager
{
// zfreeze position cache: three vertices, four floats each.
float position_cache[3][4];
// Position matrix indices recorded alongside the cached positions.
u32 position_matrix_index[3];
// Native vertex formats keyed by their portable declaration.
typedef std::unordered_map<PortableVertexDeclaration, std::unique_ptr<NativeVertexFormat>> NativeVertexFormatMap;
static NativeVertexFormatMap s_native_vertex_map;
static NativeVertexFormat* s_current_vtx_fmt;

View File

@ -28,5 +28,10 @@ namespace VertexLoaderManager
// Resolved pointers to array bases. Used by vertex loaders.
extern u8 *cached_arraybases[12];
void UpdateVertexArrayPointers();
// Position cache for zfreeze (3 vertices, 4 floats each to allow SIMD overwrite).
// These arrays are in reverse order: index 0 holds the last vertex loaded.
extern float position_cache[3][4];
extern u32 position_matrix_index[3];
}

View File

@ -23,6 +23,11 @@ static const X64Reg base_reg = RBX;
static const u8* memory_base_ptr = (u8*)&g_main_cp_state.array_strides;
// Builds a base_reg-relative operand for ptr, additionally indexed by
// scale_reg * scale. The displacement is ptr's byte distance from
// memory_base_ptr, narrowed to s32.
static OpArg MPIC(const void* ptr, X64Reg scale_reg, int scale = SCALE_1)
{
	const u8* const target = static_cast<const u8*>(ptr);
	const s32 displacement = static_cast<s32>(target - memory_base_ptr);
	return MComplex(base_reg, scale_reg, scale, displacement);
}
static OpArg MPIC(const void* ptr)
{
return MDisp(base_reg, (s32)((u8*)ptr - memory_base_ptr));
@ -193,6 +198,31 @@ int VertexLoaderX64::ReadVertex(OpArg data, u64 attribute, int format, int count
MOV(32, dest, R(scratch3));
data.AddMemOffset(sizeof(float));
dest.AddMemOffset(sizeof(float));
// zfreeze
if (native_format == &m_native_vtx_decl.position)
{
if (cpu_info.bSSE4_1)
{
PINSRD(coords, R(scratch3), i);
}
else
{
PINSRW(coords, R(scratch3), 2 * i + 0);
SHR(32, R(scratch3), Imm8(16));
PINSRW(coords, R(scratch3), 2 * i + 1);
}
}
}
// zfreeze
if (native_format == &m_native_vtx_decl.position)
{
CMP(32, R(count_reg), Imm8(3));
FixupBranch dont_store = J_CC(CC_A);
LEA(32, scratch3, MScaled(count_reg, SCALE_4, -4));
MOVUPS(MPIC(VertexLoaderManager::position_cache, scratch3, SCALE_4), coords);
SetJumpTarget(dont_store);
}
return load_bytes;
}
@ -213,6 +243,16 @@ int VertexLoaderX64::ReadVertex(OpArg data, u64 attribute, int format, int count
case 3: MOVUPS(dest, coords); break;
}
// zfreeze
if (native_format == &m_native_vtx_decl.position)
{
CMP(32, R(count_reg), Imm8(3));
FixupBranch dont_store = J_CC(CC_A);
LEA(32, scratch3, MScaled(count_reg, SCALE_4, -4));
MOVUPS(MPIC(VertexLoaderManager::position_cache, scratch3, SCALE_4), coords);
SetJumpTarget(dont_store);
}
return load_bytes;
}
@ -388,6 +428,13 @@ void VertexLoaderX64::GenerateVertexLoader()
MOVZX(32, 8, scratch1, MDisp(src_reg, m_src_ofs));
AND(32, R(scratch1), Imm8(0x3F));
MOV(32, MDisp(dst_reg, m_dst_ofs), R(scratch1));
// zfreeze
CMP(32, R(count_reg), Imm8(3));
FixupBranch dont_store = J_CC(CC_A);
MOV(32, MPIC(VertexLoaderManager::position_matrix_index - 1, count_reg, SCALE_4), R(scratch1));
SetJumpTarget(dont_store);
m_native_components |= VB_HAS_POSMTXIDX;
m_native_vtx_decl.posmtx.components = 4;
m_native_vtx_decl.posmtx.enable = true;

View File

@ -32,7 +32,12 @@ void LOADERDECL Pos_ReadDirect(VertexLoader* loader)
DataReader src(g_video_buffer_read_ptr, nullptr);
for (int i = 0; i < N; ++i)
dst.Write(PosScale(src.Read<T>(), scale));
{
float value = PosScale(src.Read<T>(), scale);
if (loader->m_counter < 3)
VertexLoaderManager::position_cache[loader->m_counter][i] = value;
dst.Write(value);
}
g_vertex_manager_write_ptr = dst.GetPointer();
g_video_buffer_read_ptr = src.GetPointer();
@ -52,7 +57,12 @@ void LOADERDECL Pos_ReadIndex(VertexLoader* loader)
DataReader dst(g_vertex_manager_write_ptr, nullptr);
for (int i = 0; i < N; ++i)
dst.Write(PosScale(Common::FromBigEndian(data[i]), scale));
{
float value = PosScale(Common::FromBigEndian(data[i]), scale);
if (loader->m_counter < 3)
VertexLoaderManager::position_cache[loader->m_counter][i] = value;
dst.Write(value);
}
g_vertex_manager_write_ptr = dst.GetPointer();
LOG_VTX();

View File

@ -279,7 +279,6 @@ void VertexManager::DoState(PointerWrap& p)
void VertexManager::CalculateZSlope(NativeVertexFormat* format)
{
float vtx[9];
float out[12];
float viewOffset[2] = { xfmem.viewport.xOrig - bpmem.scissorOffset.x * 2,
xfmem.viewport.yOrig - bpmem.scissorOffset.y * 2};
@ -290,31 +289,24 @@ void VertexManager::CalculateZSlope(NativeVertexFormat* format)
// Global matrix ID.
u32 mtxIdx = g_main_cp_state.matrix_index_a.PosNormalMtxIdx;
const PortableVertexDeclaration vert_decl = format->GetVertexDeclaration();
size_t posOff = vert_decl.position.offset;
size_t mtxOff = vert_decl.posmtx.offset;
// Make sure the buffer contains at least 3 vertices.
if ((s_pCurBufferPointer - s_pBaseBufferPointer) < (vert_decl.stride * 3))
return;
// Lookup vertices of the last rendered triangle and software-transform them
// This allows us to determine the depth slope, which will be used if z--freeze
// This allows us to determine the depth slope, which will be used if z-freeze
// is enabled in the following flush.
for (unsigned int i = 0; i < 3; ++i)
{
u8* vtx_ptr = s_pCurBufferPointer - vert_decl.stride * (3 - i);
vtx[0 + i * 3] = ((float*)(vtx_ptr + posOff))[0];
vtx[1 + i * 3] = ((float*)(vtx_ptr + posOff))[1];
if (vert_decl.position.components == 3)
vtx[2 + i * 3] = ((float*)(vtx_ptr + posOff))[2];
else
vtx[2 + i * 3] = 0;
// If this vertex format has per-vertex position matrix IDs, look it up.
if (vert_decl.posmtx.enable)
mtxIdx = *((u32*)(vtx_ptr + mtxOff));
mtxIdx = VertexLoaderManager::position_matrix_index[2 - i];
VertexShaderManager::TransformToClipSpace(&vtx[i * 3], &out[i * 4], mtxIdx);
if (vert_decl.position.components == 2)
VertexLoaderManager::position_cache[2 - i][2] = 0;
VertexShaderManager::TransformToClipSpace(&VertexLoaderManager::position_cache[2 - i][0], &out[i * 4], mtxIdx);
// Transform to Screenspace
float inv_w = 1.0f / out[3 + i * 4];