mirror of
https://github.com/hrydgard/ppsspp.git
synced 2024-11-26 23:10:38 +00:00
Don't translate bone weights to floats unnecessarily. Minor optimization to ApplyShader, pushing it down the profile from 2% to 0.5% in Wipeout.
This commit is contained in:
parent
06d79195ad
commit
bf51291527
@ -315,6 +315,15 @@ void LinkedShader::updateUniforms() {
|
||||
dirtyUniforms = 0;
|
||||
}
|
||||
|
||||
// Constructs the manager with no last-used shader, every bit of globalDirty set,
// shaderSwitchDirty cleared, and a freshly allocated 16384-byte codeBuffer_.
ShaderManager::ShaderManager() : lastShader(NULL), globalDirty(0xFFFFFFFF), shaderSwitchDirty(0) {
|
||||
codeBuffer_ = new char[16384];
|
||||
}
|
||||
|
||||
// Frees the codeBuffer_ array that the constructor allocated with new[].
ShaderManager::~ShaderManager() {
|
||||
delete [] codeBuffer_;
|
||||
}
|
||||
|
||||
|
||||
// Accumulates the flag bits in `what` into globalDirty. Nothing else happens here;
// ApplyShader reads globalDirty and resets it to 0 the next time it runs.
void ShaderManager::DirtyUniform(u32 what) {
|
||||
globalDirty |= what;
|
||||
}
|
||||
@ -353,10 +362,9 @@ void ShaderManager::DirtyShader()
|
||||
LinkedShader *ShaderManager::ApplyShader(int prim)
|
||||
{
|
||||
if (globalDirty) {
|
||||
// Deferred dirtying! Let's see if we can make this even more clever later.
|
||||
for (LinkedShaderCache::iterator iter = linkedShaderCache.begin(); iter != linkedShaderCache.end(); ++iter) {
|
||||
iter->second->dirtyUniforms |= globalDirty;
|
||||
}
|
||||
if (lastShader)
|
||||
lastShader->dirtyUniforms |= globalDirty;
|
||||
shaderSwitchDirty |= globalDirty;
|
||||
globalDirty = 0;
|
||||
}
|
||||
|
||||
@ -376,6 +384,12 @@ LinkedShader *ShaderManager::ApplyShader(int prim)
|
||||
lastShader->stop();
|
||||
}
|
||||
|
||||
// Deferred dirtying! Let's see if we can make this even more clever later.
|
||||
for (LinkedShaderCache::iterator iter = linkedShaderCache.begin(); iter != linkedShaderCache.end(); ++iter) {
|
||||
iter->second->dirtyUniforms |= shaderSwitchDirty;
|
||||
}
|
||||
shaderSwitchDirty = 0;
|
||||
|
||||
lastVSID = VSID;
|
||||
lastFSID = FSID;
|
||||
|
||||
|
@ -133,12 +133,8 @@ private:
|
||||
class ShaderManager
|
||||
{
|
||||
public:
|
||||
ShaderManager() : lastShader(NULL), globalDirty(0xFFFFFFFF) {
|
||||
codeBuffer_ = new char[16384];
|
||||
}
|
||||
~ShaderManager() {
|
||||
delete [] codeBuffer_;
|
||||
}
|
||||
ShaderManager();
|
||||
~ShaderManager();
|
||||
|
||||
void ClearCache(bool deleteThem); // TODO: deleteThem currently not respected
|
||||
LinkedShader *ApplyShader(int prim);
|
||||
@ -160,6 +156,7 @@ private:
|
||||
|
||||
LinkedShader *lastShader;
|
||||
u32 globalDirty;
|
||||
u32 shaderSwitchDirty;
|
||||
char *codeBuffer_;
|
||||
|
||||
typedef std::map<FragmentShaderID, Shader *> FSCache;
|
||||
|
@ -298,12 +298,12 @@ void Lighter::Light(float colorOut0[4], float colorOut1[4], const float colorIn[
|
||||
}
|
||||
|
||||
// Element type of the GLComp table: a GL component type, a component count and
// a normalized flag.
// NOTE(review): this diff view lists both the old field declarations
// (GLuint / int / GLboolean) and their replacements (u16 / u8 / u8); presumably
// only the narrower set exists in the actual file -- confirm.
struct GlTypeInfo {
|
||||
GLuint type;
|
||||
int count;
|
||||
GLboolean normalized;
|
||||
u16 type;
|
||||
u8 count;
|
||||
u8 normalized;
|
||||
};
|
||||
|
||||
const GlTypeInfo GLComp[] = {
|
||||
static const GlTypeInfo GLComp[] = {
|
||||
{0}, // DEC_NONE,
|
||||
{GL_FLOAT, 1, GL_FALSE}, // DEC_FLOAT_1,
|
||||
{GL_FLOAT, 2, GL_FALSE}, // DEC_FLOAT_2,
|
||||
@ -311,8 +311,10 @@ const GlTypeInfo GLComp[] = {
|
||||
{GL_FLOAT, 4, GL_FALSE}, // DEC_FLOAT_4,
|
||||
{GL_BYTE, 4, GL_TRUE}, // DEC_S8_3,
|
||||
{GL_SHORT, 4, GL_TRUE},// DEC_S16_3,
|
||||
{GL_UNSIGNED_BYTE, 4, GL_TRUE},// DEC_U8_4,
|
||||
{GL_UNSIGNED_BYTE, 1, GL_TRUE},// DEC_U8_1,
|
||||
{GL_UNSIGNED_BYTE, 2, GL_TRUE},// DEC_U8_2,
|
||||
{GL_UNSIGNED_BYTE, 3, GL_TRUE},// DEC_U8_3,
|
||||
{GL_UNSIGNED_BYTE, 4, GL_TRUE},// DEC_U8_4,
|
||||
};
|
||||
|
||||
static inline void VertexAttribSetup(int attrib, int fmt, int stride, u8 *ptr) {
|
||||
@ -838,8 +840,9 @@ void TransformDrawEngine::ClearTrackedVertexArrays() {
|
||||
}
|
||||
|
||||
void TransformDrawEngine::DecimateTrackedVertexArrays() {
|
||||
int threshold = gpuStats.numFrames - VAI_KILL_AGE;
|
||||
for (auto iter = vai_.begin(); iter != vai_.end(); ) {
|
||||
if (iter->second->lastFrame + VAI_KILL_AGE < gpuStats.numFrames) {
|
||||
if (iter->second->lastFrame < threshold ) {
|
||||
delete iter->second;
|
||||
vai_.erase(iter++);
|
||||
}
|
||||
|
@ -69,6 +69,9 @@ int DecFmtSize(u8 fmt) {
|
||||
case DEC_FLOAT_4: return 16;
|
||||
case DEC_S8_3: return 4;
|
||||
case DEC_S16_3: return 8;
|
||||
case DEC_U8_1: return 4;
|
||||
case DEC_U8_2: return 4;
|
||||
case DEC_U8_3: return 4;
|
||||
case DEC_U8_4: return 4;
|
||||
default:
|
||||
return 0;
|
||||
@ -107,10 +110,10 @@ DecVtxFormat GetTransformedVtxFormat(const DecVtxFormat &fmt) {
|
||||
|
||||
// Writes nweights weight values, read as bytes from ptr_, into the decoded
// vertex at offset decFmt.w0off.
// NOTE(review): this diff view interleaves the old body (float output,
// value / 128.0f) with the new one (raw u8 copy); presumably only the u8
// lines are live -- confirm against the actual file.
void VertexDecoder::Step_WeightsU8() const
|
||||
{
|
||||
float *wt = (float *)(decoded_ + decFmt.w0off);
|
||||
u8 *wt = (u8 *)(decoded_ + decFmt.w0off);
|
||||
const u8 *wdata = (const u8*)(ptr_);
|
||||
for (int j = 0; j < nweights; j++)
|
||||
wt[j] = (float)wdata[j] / 128.0f;
|
||||
wt[j] = wdata[j];
|
||||
}
|
||||
|
||||
void VertexDecoder::Step_WeightsU16() const
|
||||
@ -118,14 +121,19 @@ void VertexDecoder::Step_WeightsU16() const
|
||||
float *wt = (float *)(decoded_ + decFmt.w0off);
|
||||
const u16 *wdata = (const u16*)(ptr_);
|
||||
for (int j = 0; j < nweights; j++)
|
||||
wt[j] = (float)wdata[j] / 32768.0f;
|
||||
wt[j] = (float)wdata[j] / 65535.0f;
|
||||
}
|
||||
|
||||
// Float weights should be uncommon, so we can live with having to halve them here (the vertex shader multiplies the skinned position by 2.0)
|
||||
// to avoid special checks in the vertex shader generator.
|
||||
// (PSP uses 0.0-2.0 fixed point numbers for weights)
|
||||
// Reads nweights floats from ptr_ and writes them to the decoded vertex at
// offset decFmt.w0off.
// NOTE(review): this diff view shows both the old plain memcpy and the new
// loop that stores each weight scaled by 0.5f; presumably only the loop
// remains in the actual file -- confirm.
void VertexDecoder::Step_WeightsFloat() const
|
||||
{
|
||||
float *wt = (float *)(decoded_ + decFmt.w0off);
|
||||
const float *wdata = (const float*)(ptr_);
|
||||
memcpy(wt, wdata, nweights * sizeof(float));
|
||||
for (int i = 0; i < nweights; i++) {
|
||||
// Store the source weight at half scale.
wt[i] = wdata[i] * 0.5f;
|
||||
}
|
||||
}
|
||||
|
||||
void VertexDecoder::Step_TcU8() const
|
||||
@ -544,14 +552,21 @@ void VertexDecoder::SetVertexType(u32 fmt) {
|
||||
|
||||
steps_[numSteps_++] = wtstep[weighttype];
|
||||
|
||||
int fmtBase = DEC_FLOAT_1;
|
||||
int weightSize = 4;
|
||||
if (weighttype == GE_VTYPE_WEIGHT_8BIT >> GE_VTYPE_WEIGHT_SHIFT) {
|
||||
fmtBase = DEC_U8_1;
|
||||
weightSize = 1;
|
||||
}
|
||||
|
||||
if (nweights < 5) {
|
||||
decFmt.w0off = decOff;
|
||||
decFmt.w0fmt = DEC_FLOAT_1 + nweights - 1;
|
||||
decFmt.w0fmt = fmtBase + nweights - 1;
|
||||
} else {
|
||||
decFmt.w0off = decOff;
|
||||
decFmt.w0fmt = DEC_FLOAT_4;
|
||||
decFmt.w1off = decOff + 4 * 4;
|
||||
decFmt.w1fmt = DEC_FLOAT_1 + nweights - 5;
|
||||
decFmt.w0fmt = fmtBase + 3;
|
||||
decFmt.w1off = decOff + 4 * weightSize;
|
||||
decFmt.w1fmt = fmtBase + nweights - 5;
|
||||
}
|
||||
decOff += nweights * 4;
|
||||
}
|
||||
|
@ -34,6 +34,9 @@ enum {
|
||||
DEC_FLOAT_4,
|
||||
DEC_S8_3,
|
||||
DEC_S16_3,
|
||||
DEC_U8_1,
|
||||
DEC_U8_2,
|
||||
DEC_U8_3,
|
||||
DEC_U8_4,
|
||||
};
|
||||
|
||||
@ -243,7 +246,7 @@ public:
|
||||
switch (decFmt_.c0fmt) {
|
||||
case DEC_U8_4:
|
||||
{
|
||||
u8 *p = (u8 *)(data_ + decFmt_.c0off);
|
||||
const u8 *p = (const u8 *)(data_ + decFmt_.c0off);
|
||||
for (int i = 0; i < 4; i++)
|
||||
color[i] = p[i] / 255.0f;
|
||||
}
|
||||
@ -260,7 +263,7 @@ public:
|
||||
switch (decFmt_.c1fmt) {
|
||||
case DEC_U8_4:
|
||||
{
|
||||
u8 *p = (u8 *)(data_ + decFmt_.c1off);
|
||||
const u8 *p = (const u8 *)(data_ + decFmt_.c1off);
|
||||
for (int i = 0; i < 3; i++)
|
||||
color[i] = p[i] / 255.0f;
|
||||
}
|
||||
@ -274,15 +277,22 @@ public:
|
||||
}
|
||||
|
||||
void ReadWeights(float weights[8]) {
|
||||
const u8 *p = (const u8 *)(data_ + decFmt_.w0off);
|
||||
switch (decFmt_.w0fmt) {
|
||||
case DEC_FLOAT_1: memcpy(weights, data_ + decFmt_.w0off, 4); break;
|
||||
case DEC_FLOAT_2: memcpy(weights, data_ + decFmt_.w0off, 8); break;
|
||||
case DEC_FLOAT_3: memcpy(weights, data_ + decFmt_.w0off, 12); break;
|
||||
case DEC_FLOAT_4: memcpy(weights, data_ + decFmt_.w0off, 16); break;
|
||||
case DEC_U8_1: weights[0] = p[0] / 128.f; break;
|
||||
case DEC_U8_2: for (int i = 0; i < 2; i++) weights[i] = p[i] / 128.f; break;
|
||||
case DEC_U8_3: for (int i = 0; i < 3; i++) weights[i] = p[i] / 128.f; break;
|
||||
case DEC_U8_4: for (int i = 0; i < 4; i++) weights[i] = p[i] / 128.f; break;
|
||||
default:
|
||||
ERROR_LOG(G3D, "Reader: Unsupported W0 Format");
|
||||
break;
|
||||
}
|
||||
|
||||
p = (const u8 *)(data_ + decFmt_.w1off);
|
||||
switch (decFmt_.w1fmt) {
|
||||
case 0:
|
||||
// It's fine for there to be w0 weights but not w1.
|
||||
@ -291,6 +301,10 @@ public:
|
||||
case DEC_FLOAT_2: memcpy(weights + 4, data_ + decFmt_.w1off, 8); break;
|
||||
case DEC_FLOAT_3: memcpy(weights + 4, data_ + decFmt_.w1off, 12); break;
|
||||
case DEC_FLOAT_4: memcpy(weights + 4, data_ + decFmt_.w1off, 16); break;
|
||||
case DEC_U8_1: weights[4] = p[0] / 128.f; break;
|
||||
case DEC_U8_2: for (int i = 0; i < 2; i++) weights[i+4] = p[i] / 128.f; break;
|
||||
case DEC_U8_3: for (int i = 0; i < 3; i++) weights[i+4] = p[i] / 128.f; break;
|
||||
case DEC_U8_4: for (int i = 0; i < 4; i++) weights[i+4] = p[i] / 128.f; break;
|
||||
default:
|
||||
ERROR_LOG(G3D, "Reader: Unsupported W1 Format");
|
||||
break;
|
||||
|
@ -275,7 +275,7 @@ void GenerateVertexShader(int prim, char *buffer) {
|
||||
WRITE(p, " worldnormal += %s * (u_bone%i * vec4(a_normal, 0.0)).xyz;\n", weightAttr, i);
|
||||
}
|
||||
// Finally, multiply by world matrix (yes, we have to).
|
||||
WRITE(p, " worldpos = (u_world * vec4(worldpos, 1.0)).xyz;\n");
|
||||
WRITE(p, " worldpos = (u_world * vec4(worldpos * 2.0, 1.0)).xyz;\n");
|
||||
if (hasNormal)
|
||||
WRITE(p, " worldnormal = (u_world * vec4(worldnormal, 0.0)).xyz;\n");
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user