Don't translate bone weights to floats unnecessarily. Minor optimization to ApplyShader, pushing it down the profile from 2% to 0.5% in Wipeout.

This commit is contained in:
Henrik Rydgard 2013-02-04 23:09:01 +01:00
parent 06d79195ad
commit bf51291527
6 changed files with 70 additions and 27 deletions

View File

@ -315,6 +315,15 @@ void LinkedShader::updateUniforms() {
dirtyUniforms = 0;
}
// Constructs the manager with no shader recorded as last used, all bits of
// globalDirty set, and no dirty bits pending for the next shader switch.
// Also allocates the scratch code buffer that the destructor releases.
ShaderManager::ShaderManager()
	: lastShader(NULL),
	  globalDirty(0xFFFFFFFF),
	  shaderSwitchDirty(0)
{
	// Size of the scratch buffer, in chars.
	const int codeBufferSize = 16384;
	codeBuffer_ = new char[codeBufferSize];
}
// Releases the code buffer that the constructor allocated with new[].
ShaderManager::~ShaderManager()
{
	delete[] codeBuffer_;
}
// Accumulates the given dirty bits into globalDirty. Nothing is pushed to any
// shader here; the accumulated mask is consumed later (see ApplyShader).
void ShaderManager::DirtyUniform(u32 what)
{
	globalDirty = globalDirty | what;
}
@ -353,10 +362,9 @@ void ShaderManager::DirtyShader()
LinkedShader *ShaderManager::ApplyShader(int prim)
{
if (globalDirty) {
// Deferred dirtying! Let's see if we can make this even more clever later.
for (LinkedShaderCache::iterator iter = linkedShaderCache.begin(); iter != linkedShaderCache.end(); ++iter) {
iter->second->dirtyUniforms |= globalDirty;
}
if (lastShader)
lastShader->dirtyUniforms |= globalDirty;
shaderSwitchDirty |= globalDirty;
globalDirty = 0;
}
@ -376,6 +384,12 @@ LinkedShader *ShaderManager::ApplyShader(int prim)
lastShader->stop();
}
// Deferred dirtying! Let's see if we can make this even more clever later.
for (LinkedShaderCache::iterator iter = linkedShaderCache.begin(); iter != linkedShaderCache.end(); ++iter) {
iter->second->dirtyUniforms |= shaderSwitchDirty;
}
shaderSwitchDirty = 0;
lastVSID = VSID;
lastFSID = FSID;

View File

@ -133,12 +133,8 @@ private:
class ShaderManager
{
public:
ShaderManager() : lastShader(NULL), globalDirty(0xFFFFFFFF) {
codeBuffer_ = new char[16384];
}
~ShaderManager() {
delete [] codeBuffer_;
}
ShaderManager();
~ShaderManager();
void ClearCache(bool deleteThem); // TODO: deleteThem currently not respected
LinkedShader *ApplyShader(int prim);
@ -160,6 +156,7 @@ private:
LinkedShader *lastShader;
u32 globalDirty;
u32 shaderSwitchDirty;
char *codeBuffer_;
typedef std::map<FragmentShaderID, Shader *> FSCache;

View File

@ -298,12 +298,12 @@ void Lighter::Light(float colorOut0[4], float colorOut1[4], const float colorIn[
}
// One entry of the table that maps a decoded vertex component format (DEC_*)
// to the GL attribute description used for it.
// Each field is declared exactly once, using the compact integer types; the
// wider GLuint/int/GLboolean declarations of the same names must not be kept
// alongside them, since a struct cannot declare a member twice.
struct GlTypeInfo {
	u16 type;        // GL component type, e.g. GL_FLOAT or GL_UNSIGNED_BYTE
	u8 count;        // number of components in the attribute
	u8 normalized;   // GL_TRUE or GL_FALSE
};
const GlTypeInfo GLComp[] = {
static const GlTypeInfo GLComp[] = {
{0}, // DEC_NONE,
{GL_FLOAT, 1, GL_FALSE}, // DEC_FLOAT_1,
{GL_FLOAT, 2, GL_FALSE}, // DEC_FLOAT_2,
@ -311,8 +311,10 @@ const GlTypeInfo GLComp[] = {
{GL_FLOAT, 4, GL_FALSE}, // DEC_FLOAT_4,
{GL_BYTE, 4, GL_TRUE}, // DEC_S8_3,
{GL_SHORT, 4, GL_TRUE},// DEC_S16_3,
{GL_UNSIGNED_BYTE, 4, GL_TRUE},// DEC_U8_4,
{GL_UNSIGNED_BYTE, 1, GL_TRUE},// DEC_U8_1,
{GL_UNSIGNED_BYTE, 2, GL_TRUE},// DEC_U8_2,
{GL_UNSIGNED_BYTE, 3, GL_TRUE},// DEC_U8_3,
{GL_UNSIGNED_BYTE, 4, GL_TRUE},// DEC_U8_4,
};
static inline void VertexAttribSetup(int attrib, int fmt, int stride, u8 *ptr) {
@ -838,8 +840,9 @@ void TransformDrawEngine::ClearTrackedVertexArrays() {
}
void TransformDrawEngine::DecimateTrackedVertexArrays() {
int threshold = gpuStats.numFrames - VAI_KILL_AGE;
for (auto iter = vai_.begin(); iter != vai_.end(); ) {
if (iter->second->lastFrame + VAI_KILL_AGE < gpuStats.numFrames) {
if (iter->second->lastFrame < threshold ) {
delete iter->second;
vai_.erase(iter++);
}

View File

@ -69,6 +69,9 @@ int DecFmtSize(u8 fmt) {
case DEC_FLOAT_4: return 16;
case DEC_S8_3: return 4;
case DEC_S16_3: return 8;
case DEC_U8_1: return 4;
case DEC_U8_2: return 4;
case DEC_U8_3: return 4;
case DEC_U8_4: return 4;
default:
return 0;
@ -107,10 +110,10 @@ DecVtxFormat GetTransformedVtxFormat(const DecVtxFormat &fmt) {
void VertexDecoder::Step_WeightsU8() const
{
float *wt = (float *)(decoded_ + decFmt.w0off);
u8 *wt = (u8 *)(decoded_ + decFmt.w0off);
const u8 *wdata = (const u8*)(ptr_);
for (int j = 0; j < nweights; j++)
wt[j] = (float)wdata[j] / 128.0f;
wt[j] = wdata[j];
}
void VertexDecoder::Step_WeightsU16() const
@ -118,14 +121,19 @@ void VertexDecoder::Step_WeightsU16() const
float *wt = (float *)(decoded_ + decFmt.w0off);
const u16 *wdata = (const u16*)(ptr_);
for (int j = 0; j < nweights; j++)
wt[j] = (float)wdata[j] / 32768.0f;
wt[j] = (float)wdata[j] / 65535.0f;
}
// Float weights should be uncommon, so we can live with an extra multiply here:
// each weight is halved on decode (the generated vertex shader scales the skinned
// position by 2.0), which avoids special checks in the vertex shader generator.
// (PSP uses 0.0-2.0 fixed point numbers for weights)
void VertexDecoder::Step_WeightsFloat() const
{
	float *wt = (float *)(decoded_ + decFmt.w0off);
	const float *wdata = (const float*)(ptr_);
	// No bulk copy first: the loop writes every one of the nweights elements.
	for (int i = 0; i < nweights; i++) {
		wt[i] = wdata[i] * 0.5f;
	}
}
void VertexDecoder::Step_TcU8() const
@ -544,14 +552,21 @@ void VertexDecoder::SetVertexType(u32 fmt) {
steps_[numSteps_++] = wtstep[weighttype];
int fmtBase = DEC_FLOAT_1;
int weightSize = 4;
if (weighttype == GE_VTYPE_WEIGHT_8BIT >> GE_VTYPE_WEIGHT_SHIFT) {
fmtBase = DEC_U8_1;
weightSize = 1;
}
if (nweights < 5) {
decFmt.w0off = decOff;
decFmt.w0fmt = DEC_FLOAT_1 + nweights - 1;
decFmt.w0fmt = fmtBase + nweights - 1;
} else {
decFmt.w0off = decOff;
decFmt.w0fmt = DEC_FLOAT_4;
decFmt.w1off = decOff + 4 * 4;
decFmt.w1fmt = DEC_FLOAT_1 + nweights - 5;
decFmt.w0fmt = fmtBase + 3;
decFmt.w1off = decOff + 4 * weightSize;
decFmt.w1fmt = fmtBase + nweights - 5;
}
decOff += nweights * 4;
}

View File

@ -34,6 +34,9 @@ enum {
DEC_FLOAT_4,
DEC_S8_3,
DEC_S16_3,
DEC_U8_1,
DEC_U8_2,
DEC_U8_3,
DEC_U8_4,
};
@ -243,7 +246,7 @@ public:
switch (decFmt_.c0fmt) {
case DEC_U8_4:
{
u8 *p = (u8 *)(data_ + decFmt_.c0off);
const u8 *p = (const u8 *)(data_ + decFmt_.c0off);
for (int i = 0; i < 4; i++)
color[i] = p[i] / 255.0f;
}
@ -260,7 +263,7 @@ public:
switch (decFmt_.c1fmt) {
case DEC_U8_4:
{
u8 *p = (u8 *)(data_ + decFmt_.c1off);
const u8 *p = (const u8 *)(data_ + decFmt_.c1off);
for (int i = 0; i < 3; i++)
color[i] = p[i] / 255.0f;
}
@ -274,15 +277,22 @@ public:
}
void ReadWeights(float weights[8]) {
const u8 *p = (const u8 *)(data_ + decFmt_.w0off);
switch (decFmt_.w0fmt) {
case DEC_FLOAT_1: memcpy(weights, data_ + decFmt_.w0off, 4); break;
case DEC_FLOAT_2: memcpy(weights, data_ + decFmt_.w0off, 8); break;
case DEC_FLOAT_3: memcpy(weights, data_ + decFmt_.w0off, 12); break;
case DEC_FLOAT_4: memcpy(weights, data_ + decFmt_.w0off, 16); break;
case DEC_U8_1: weights[0] = p[0] / 128.f; break;
case DEC_U8_2: for (int i = 0; i < 2; i++) weights[i] = p[i] / 128.f; break;
case DEC_U8_3: for (int i = 0; i < 3; i++) weights[i] = p[i] / 128.f; break;
case DEC_U8_4: for (int i = 0; i < 4; i++) weights[i] = p[i] / 128.f; break;
default:
ERROR_LOG(G3D, "Reader: Unsupported W0 Format");
break;
}
p = (const u8 *)(data_ + decFmt_.w1off);
switch (decFmt_.w1fmt) {
case 0:
// It's fine for there to be w0 weights but not w1.
@ -291,6 +301,10 @@ public:
case DEC_FLOAT_2: memcpy(weights + 4, data_ + decFmt_.w1off, 8); break;
case DEC_FLOAT_3: memcpy(weights + 4, data_ + decFmt_.w1off, 12); break;
case DEC_FLOAT_4: memcpy(weights + 4, data_ + decFmt_.w1off, 16); break;
case DEC_U8_1: weights[4] = p[0] / 128.f; break;
case DEC_U8_2: for (int i = 0; i < 2; i++) weights[i+4] = p[i] / 128.f; break;
case DEC_U8_3: for (int i = 0; i < 3; i++) weights[i+4] = p[i] / 128.f; break;
case DEC_U8_4: for (int i = 0; i < 4; i++) weights[i+4] = p[i] / 128.f; break;
default:
ERROR_LOG(G3D, "Reader: Unsupported W1 Format");
break;

View File

@ -275,7 +275,7 @@ void GenerateVertexShader(int prim, char *buffer) {
WRITE(p, " worldnormal += %s * (u_bone%i * vec4(a_normal, 0.0)).xyz;\n", weightAttr, i);
}
// Finally, multiply by world matrix (yes, we have to).
WRITE(p, " worldpos = (u_world * vec4(worldpos, 1.0)).xyz;\n");
WRITE(p, " worldpos = (u_world * vec4(worldpos * 2.0, 1.0)).xyz;\n");
if (hasNormal)
WRITE(p, " worldnormal = (u_world * vec4(worldnormal, 0.0)).xyz;\n");
}