From e5c6cf965b90ece9e06a7ea2295e00343d8f212b Mon Sep 17 00:00:00 2001 From: Henrik Rydgard Date: Fri, 16 Nov 2012 15:16:14 +0100 Subject: [PATCH] Fixes and optimizations to vertex decoding and lighting. Motorcycles are now visible in MotoGP. --- GPU/GLES/FragmentShaderGenerator.cpp | 1 - GPU/GLES/TextureCache.cpp | 12 -- GPU/GLES/TransformPipeline.cpp | 213 +++++++++++++++++---------- GPU/GLES/TransformPipeline.h | 1 + GPU/GLES/VertexDecoder.cpp | 213 +++++++++++++++++---------- GPU/GLES/VertexDecoder.h | 28 ++-- GPU/GLES/VertexShaderGenerator.cpp | 1 - Globals.h | 18 +++ Windows/PPSSPP.sln | 34 ----- Windows/PPSSPP.vcxproj | 110 +------------- android/jni/Android.mk | 4 +- 11 files changed, 308 insertions(+), 327 deletions(-) diff --git a/GPU/GLES/FragmentShaderGenerator.cpp b/GPU/GLES/FragmentShaderGenerator.cpp index efa8425de..8b1e0b257 100644 --- a/GPU/GLES/FragmentShaderGenerator.cpp +++ b/GPU/GLES/FragmentShaderGenerator.cpp @@ -74,7 +74,6 @@ char *GenerateFragmentShader() #endif int lmode = gstate.lmode & 1; - lmode = 0; /// for now if (gstate.textureMapEnable & 1) WRITE(p, "uniform sampler2D tex;\n"); diff --git a/GPU/GLES/TextureCache.cpp b/GPU/GLES/TextureCache.cpp index 6066af4fb..6cc852b6c 100644 --- a/GPU/GLES/TextureCache.cpp +++ b/GPU/GLES/TextureCache.cpp @@ -400,18 +400,6 @@ struct DXT1Block u16 color2; }; -inline u8 Convert5To8(u8 v) -{ - // Swizzle bits: 00012345 -> 12345123 - return (v << 3) | (v >> 2); -} - -inline u8 Convert6To8(u8 v) -{ - // Swizzle bits: 00123456 -> 12345612 - return (v << 2) | (v >> 4); -} - inline u32 makecol(int r, int g, int b, int a) { return (a << 24)|(r << 16)|(g << 8)|b; diff --git a/GPU/GLES/TransformPipeline.cpp b/GPU/GLES/TransformPipeline.cpp index 15a28e08e..1e749e251 100644 --- a/GPU/GLES/TransformPipeline.cpp +++ b/GPU/GLES/TransformPipeline.cpp @@ -57,72 +57,86 @@ uint16_t indexBuffer[65536]; // Unused // TODO: This should really return 2 colors, one for specular and one for diffuse. -void Light(float colorOut[4], const float colorIn[4], Vec3 pos, Vec3 normal, float dots[4]) -{ - // could cache a lot of stuff, such as ambient, across vertices... +// Convenient way to do precomputation to save the parts of the lighting calculation +// that's common between the many vertices of a draw call. +class Lighter { +public: + Lighter(); + void Light(float colorOut0[4], float colorOut1[4], const float colorIn[4], Vec3 pos, Vec3 normal, float dots[4]); - bool doShadeMapping = (gstate.texmapmode & 0x3) == 2; - if (!doShadeMapping && !(gstate.lightEnable[0]&1) && !(gstate.lightEnable[1]&1) && !(gstate.lightEnable[2]&1) && !(gstate.lightEnable[3]&1)) - { - memcpy(colorOut, colorIn, sizeof(float) * 4); - return; - } - - Color4 emissive; - emissive.GetFromRGB(gstate.materialemissive); +private: + bool disabled_; Color4 globalAmbient; + Color4 materialEmissive; + Color4 materialAmbient; + Color4 materialDiffuse; + Color4 materialSpecular; + float specCoef_; + Vec3 viewer_; + bool doShadeMapping_; + int materialUpdate_; +}; + +Lighter::Lighter() { + disabled_ = false; + doShadeMapping_ = (gstate.texmapmode & 0x3) == 2; + if (!doShadeMapping_ && !(gstate.lightEnable[0]&1) && !(gstate.lightEnable[1]&1) && !(gstate.lightEnable[2]&1) && !(gstate.lightEnable[3]&1)) + { + disabled_ = true; + } + materialEmissive.GetFromRGB(gstate.materialemissive); + materialEmissive.a = 0.0f; globalAmbient.GetFromRGB(gstate.ambientcolor); globalAmbient.GetFromA(gstate.ambientalpha); + materialAmbient.GetFromRGB(gstate.materialambient); + materialAmbient.a = 1.0f; + materialDiffuse.GetFromRGB(gstate.materialdiffuse); + materialDiffuse.a = 1.0f; + materialSpecular.GetFromRGB(gstate.materialspecular); + materialSpecular.a = 1.0f; + specCoef_ = getFloat24(gstate.materialspecularcoef); + viewer_ = Vec3(-gstate.viewMatrix[9], -gstate.viewMatrix[10], -gstate.viewMatrix[11]); + materialUpdate_ = gstate.materialupdate & 7; +} + +void Lighter::Light(float colorOut0[4], float colorOut1[4], const float colorIn[4], Vec3 pos, Vec3 normal, float dots[4]) +{ + if (disabled_) { + memcpy(colorOut0, colorIn, sizeof(float) * 4); + memset(colorOut1, 0, sizeof(float) * 4); + return; + } Vec3 norm = normal.Normalized(); Color4 in(colorIn); - Color4 ambient; - if (gstate.materialupdate & 1) - { - ambient = in; - } + const Color4 *ambient; + if (materialUpdate_ & 1) + ambient = ∈ else - { - ambient.GetFromRGB(gstate.materialambient); - ambient.a=1.0f; - } + ambient = &materialAmbient; - Color4 diffuse; - if (gstate.materialupdate & 2) - { - diffuse = in; - } + const Color4 *diffuse; + if (materialUpdate_ & 2) + diffuse = ∈ else - { - diffuse.GetFromRGB(gstate.materialdiffuse); - diffuse.a=1.0f; - } + diffuse = &materialDiffuse; - Color4 specular; - if (gstate.materialupdate & 4) - { - specular = in; - } + const Color4 *specular; + if (materialUpdate_ & 4) + specular = ∈ else - { - specular.GetFromRGB(gstate.materialspecular); - specular.a=1.0f; - } - - float specCoef = getFloat24(gstate.materialspecularcoef); - - Vec3 viewer(-gstate.viewMatrix[9], -gstate.viewMatrix[10], -gstate.viewMatrix[11]); - - Color4 lightSum = globalAmbient * ambient + emissive; - + specular = &materialSpecular; + + Color4 lightSum0 = globalAmbient * *ambient + materialEmissive; + Color4 lightSum1(0,0,0,0); // Try lights.elf - there's something wrong with the lighting for (int l = 0; l < 4; l++) { // can we skip this light? - if ((gstate.lightEnable[l] & 1) == 0 && !doShadeMapping) + if ((gstate.lightEnable[l] & 1) == 0 && !doShadeMapping_) continue; GELightComputation comp = (GELightComputation)(gstate.ltype[l]&3); @@ -151,10 +165,9 @@ void Light(float colorOut[4], const float colorIn[4], Vec3 pos, Vec3 normal, flo if (dot < 0.0f) dot = 0.0f; if (poweredDiffuse) - dot = powf(dot, specCoef); + dot = powf(dot, specCoef_); - Color4 diff = (gstate.lightColor[1][l] * diffuse) * (dot * lightScale); - Color4 spec(0,0,0,0); + Color4 diff = (gstate.lightColor[1][l] * *diffuse) * (dot * lightScale); // Real PSP specular Vec3 toViewer(0,0,1); @@ -170,20 +183,27 @@ void Light(float colorOut[4], const float colorIn[4], Vec3 pos, Vec3 normal, flo dot = halfVec * norm; if (dot >= 0) { - spec += (gstate.lightColor[2][l] * specular * (powf(dot, specCoef)*lightScale)); + lightSum1 += (gstate.lightColor[2][l] * *specular * (powf(dot, specCoef_)*lightScale)); } } dots[l] = dot; if (gstate.lightEnable[l] & 1) { - lightSum += gstate.lightColor[0][l]*ambient + diff + spec; + lightSum0 += gstate.lightColor[0][l] * *ambient + diff; } } - for (int i = 0; i < 3; i++) - colorOut[i] = lightSum[i]; + // 4? + for (int i = 0; i < 4; i++) { + colorOut0[i] = lightSum0[i]; + colorOut1[i] = lightSum1[i]; + } } +// This is the software transform pipeline, which is necessary for supporting RECT +// primitives correctly. Other primitives are possible to transform and light in hardware +// using vertex shader, which will be way, way faster, especially on mobile. This has +// not yet been implemented though. void TransformAndDrawPrim(void *verts, void *inds, int prim, int vertexCount, LinkedShader *program, float *customUV, int forceIndexType) { // First, decode the verts and apply morphing @@ -234,6 +254,8 @@ void TransformAndDrawPrim(void *verts, void *inds, int prim, int vertexCount, Li vertexCount = 0x10000/3; #endif + Lighter lighter; + for (int i = 0; i < vertexCount; i++) { int indexType = (gstate.vertType & GE_VTYPE_IDX_MASK); @@ -255,9 +277,10 @@ void TransformAndDrawPrim(void *verts, void *inds, int prim, int vertexCount, Li index = i; } - float v[3] = {0,0,0}; - float c[4] = {1,1,1,1}; - float uv[2] = {0,0}; + float v[3] = {0, 0, 0}; + float c0[4] = {1, 1, 1, 1}; + float c1[4] = {0, 0, 0, 0}; + float uv[2] = {0, 0}; if (gstate.vertType & GE_VTYPE_THROUGH_MASK) { @@ -265,8 +288,11 @@ void TransformAndDrawPrim(void *verts, void *inds, int prim, int vertexCount, Li for (int j=0; j<3; j++) v[j] = decoded[index].pos[j]; // TODO : check if has color - for (int j=0; j<4; j++) - c[j] = decoded[index].color[j]; + for (int j=0; j<4; j++) { + c0[j] = decoded[index].color[j] / 255.0f; + c1[j] = 0.0f; + } + // TODO : check if has uv for (int j=0; j<2; j++) uv[j] = decoded[index].uv[j]; @@ -304,30 +330,51 @@ void TransformAndDrawPrim(void *verts, void *inds, int prim, int vertexCount, Li Norm3ByMatrix43(norm, nsum.v, gstate.worldMatrix); } - // Perform lighting here if enabled. don't need to check through, it's checked above. float dots[4] = {0,0,0,0}; if (program->a_color0 != -1) { - //c[1] = norm[1]; - float litColor[4] = {0,0,0,0}; - Light(litColor, decoded[index].color, out, norm, dots); + float unlitColor[4]; + for (int j = 0; j < 4; j++) { + unlitColor[j] = decoded[index].color[j] / 255.0f; + } + float litColor0[4]; + float litColor1[4]; + lighter.Light(litColor0, litColor1, unlitColor, out, norm, dots); + if (gstate.lightingEnable & 1) { - memcpy(c, litColor, sizeof(litColor)); + // TODO: don't ignore gstate.lmode - we should send two colors in that case + if (gstate.lmode & 1) { + // Separate colors + for (int j = 0; j < 4; j++) { + c0[j] = litColor0[j]; + c1[j] = litColor1[j]; + } + } else { + // Summed color into c0 + for (int j = 0; j < 4; j++) { + c0[j] = litColor0[j] + litColor1[j]; + c1[j] = 0.0f; + } + } } else { // no lighting? copy the color. - for (int j = 0; j < 4; j++) - c[j] = decoded[index].color[j]; + for (int j = 0; j < 4; j++) { + c0[j] = unlitColor[j]; + c1[j] = 0.0f; + } } } else { // no color in the fragment program??? - for (int j = 0; j < 4; j++) - c[j] = decoded[index].color[j]; + for (int j = 0; j < 4; j++) { + c0[j] = decoded[index].color[j] / 255.0f; + c1[j] = 0.0f; + } } if (customUV) { @@ -382,11 +429,13 @@ void TransformAndDrawPrim(void *verts, void *inds, int prim, int vertexCount, Li } } - // Transform the coord by the view matrix. Should this be done before or after texcoord generation? + // Transform the coord by the view matrix. + // We only really need to do it here for RECTANGLES drawing. However, + // there's no point in optimizing it out because all other primitives + // will be moved to hardware transform anyway. Vec3ByMatrix43(v, out, gstate.viewMatrix); } - // We need to tesselate axis-aligned rectangles, as they're only specified by two coordinates. if (prim == GE_PRIM_RECTANGLES) { @@ -404,42 +453,48 @@ void TransformAndDrawPrim(void *verts, void *inds, int prim, int vertexCount, Li trans->x = v[0]; trans->y = v[1]; trans->z = v[2]; trans->uv[0] = uv[0]; trans->uv[1] = uv[1]; - memcpy(trans->color, c, 4*sizeof(float)); + memcpy(trans->color0, c0, 4*sizeof(float)); + memcpy(trans->color1, c1, 4*sizeof(float)); trans++; // top right trans->x = v2[0]; trans->y = v[1]; trans->z = v[2]; trans->uv[0] = uv2[0]; trans->uv[1] = uv[1]; - memcpy(trans->color, c, 4*sizeof(float)); + memcpy(trans->color0, c0, 4*sizeof(float)); + memcpy(trans->color1, c1, 4*sizeof(float)); trans++; // bottom right trans->x = v2[0]; trans->y = v2[1]; trans->z = v[2]; trans->uv[0] = uv2[0]; trans->uv[1] = uv2[1]; - memcpy(trans->color, c, 4*sizeof(float)); + memcpy(trans->color0, c0, 4*sizeof(float)); + memcpy(trans->color1, c1, 4*sizeof(float)); trans++; // bottom left trans->x = v[0]; trans->y = v2[1]; trans->z = v[2]; trans->uv[0] = uv[0]; trans->uv[1] = uv2[1]; - memcpy(trans->color, c, 4*sizeof(float)); + memcpy(trans->color0, c0, 4*sizeof(float)); + memcpy(trans->color1, c1, 4*sizeof(float)); trans++; // top left trans->x = v[0]; trans->y = v[1]; trans->z = v[2]; trans->uv[0] = uv[0]; trans->uv[1] = uv[1]; - memcpy(trans->color, c, 4*sizeof(float)); + memcpy(trans->color0, c0, 4*sizeof(float)); + memcpy(trans->color1, c1, 4*sizeof(float)); trans++; // bottom right trans->x = v2[0]; trans->y = v2[1]; trans->z = v[2]; trans->uv[0] = uv2[0]; trans->uv[1] = uv2[1]; - memcpy(trans->color, c, 4*sizeof(float)); + memcpy(trans->color0, c0, 4*sizeof(float)); + memcpy(trans->color1, c1, 4*sizeof(float)); trans++; numTrans += 6; @@ -448,7 +503,8 @@ void TransformAndDrawPrim(void *verts, void *inds, int prim, int vertexCount, Li else { memcpy(&trans->x, v, 3*sizeof(float)); - memcpy(trans->color, c, 4*sizeof(float)); + memcpy(trans->color0, c0, 4*sizeof(float)); + memcpy(trans->color1, c1, 4*sizeof(float)); memcpy(trans->uv, uv, 2*sizeof(float)); trans++; numTrans++; @@ -458,15 +514,18 @@ void TransformAndDrawPrim(void *verts, void *inds, int prim, int vertexCount, Li glEnableVertexAttribArray(program->a_position); if (useTexCoord && program->a_texcoord != -1) glEnableVertexAttribArray(program->a_texcoord); if (program->a_color0 != -1) glEnableVertexAttribArray(program->a_color0); + if (program->a_color1 != -1) glEnableVertexAttribArray(program->a_color1); const int vertexSize = sizeof(*trans); glVertexAttribPointer(program->a_position, 3, GL_FLOAT, GL_FALSE, vertexSize, transformed); if (useTexCoord && program->a_texcoord != -1) glVertexAttribPointer(program->a_texcoord, 2, GL_FLOAT, GL_FALSE, vertexSize, ((uint8_t*)transformed) + 3 * 4); if (program->a_color0 != -1) glVertexAttribPointer(program->a_color0, 4, GL_FLOAT, GL_FALSE, vertexSize, ((uint8_t*)transformed) + 5 * 4); + if (program->a_color1 != -1) glVertexAttribPointer(program->a_color1, 4, GL_FLOAT, GL_FALSE, vertexSize, ((uint8_t*)transformed) + 9 * 4); // NOTICE_LOG(G3D,"DrawPrimitive: %i", numTrans); glDrawArrays(glprim[prim], 0, numTrans); glDisableVertexAttribArray(program->a_position); if (useTexCoord && program->a_texcoord != -1) glDisableVertexAttribArray(program->a_texcoord); if (program->a_color0 != -1) glDisableVertexAttribArray(program->a_color0); + if (program->a_color1 != -1) glDisableVertexAttribArray(program->a_color1); /* if (((gstate.vertType ) & GE_VTYPE_IDX_MASK) == GE_VTYPE_IDX_8BIT) diff --git a/GPU/GLES/TransformPipeline.h b/GPU/GLES/TransformPipeline.h index 7d1c24a22..330921778 100644 --- a/GPU/GLES/TransformPipeline.h +++ b/GPU/GLES/TransformPipeline.h @@ -19,4 +19,5 @@ struct LinkedShader; + void TransformAndDrawPrim(void *verts, void *inds, int prim, int count, LinkedShader *shader, float *customUV = 0, int forceIndexType = -1); diff --git a/GPU/GLES/VertexDecoder.cpp b/GPU/GLES/VertexDecoder.cpp index 163fb373d..44bc7bb9c 100644 --- a/GPU/GLES/VertexDecoder.cpp +++ b/GPU/GLES/VertexDecoder.cpp @@ -46,8 +46,6 @@ inline int align(int n, int align) return (n+(align-1)) & ~(align-1); } -static int onesize; - void VertexDecoder::SetVertexType(u32 fmt) { fmt = fmt; @@ -80,13 +78,13 @@ void VertexDecoder::SetVertexType(u32 fmt) size = align(size, tcalign[tc]); tcoff = size; size += tcsize[tc]; - if (tcalign[tc]>biggest) - biggest=tcalign[tc]; + if (tcalign[tc] > biggest) + biggest = tcalign[tc]; } if (col) { - size = align(size,colalign[col]); + size = align(size, colalign[col]); coloff = size; size += colsize[col]; if (colalign[col] > biggest) @@ -96,26 +94,27 @@ void VertexDecoder::SetVertexType(u32 fmt) { coloff = 0; } + if (nrm) { - size = align(size,nrmalign[nrm]); + size = align(size, nrmalign[nrm]); nrmoff = size; size += nrmsize[nrm]; if (nrmalign[nrm] > biggest) biggest = nrmalign[nrm]; } - //if (pos) + //if (pos) - there's always a position { - size = align(size,posalign[pos]); + size = align(size, posalign[pos]); posoff = size; size += possize[pos]; if (posalign[pos] > biggest) biggest = posalign[pos]; } - size = align(size,biggest); - onesize = size; + size = align(size, biggest); + onesize_ = size; size *= morphcount; DEBUG_LOG(G3D,"SVT : size = %i, aligned to biggest %i", size, biggest); } @@ -127,24 +126,36 @@ void VertexDecoder::DecodeVerts(DecodedVertex *decoded, const void *verts, const char *ptr = (char *)verts; - for (int i = 0; i < count; i++) - { - int index; - if (idx == (GE_VTYPE_IDX_8BIT >> 11)) - { - index = ((u8*)inds)[i]; - } - else if (idx == (GE_VTYPE_IDX_16BIT >> 11)) - { - index = ((u16*)inds)[i]; + // Find index bounds. Could cache this in display lists. + int lowerBound = 0x7FFFFFFF; + int upperBound = 0; + if (idx == (GE_VTYPE_IDX_8BIT >> GE_VTYPE_IDX_SHIFT)) { + const u8 *ind8 = (const u8 *)inds; + for (int i = 0; i < count; i++) { + if (ind8[i] < lowerBound) + lowerBound = ind8[i]; + if (ind8[i] > upperBound) + upperBound = ind8[i]; } - else - { - index = i; + } else if (idx == (GE_VTYPE_IDX_16BIT >> GE_VTYPE_IDX_SHIFT)) { + const u16 *ind16 = (const u16*)inds; + for (int i = 0; i < count; i++) { + if (ind16[i] < lowerBound) + lowerBound = ind16[i]; + if (ind16[i] > upperBound) + upperBound = ind16[i]; } + } else { + lowerBound = 0; + upperBound = count - 1; + } + // Decode the vertices within the found bounds, once each (unlike the previous way..) + for (int index = lowerBound; index <= upperBound; index++) + { ptr = (char*)verts + (index * size); + // TODO: Should weights be morphed? float *wt = decoded[index].weights; switch (weighttype) { @@ -153,29 +164,30 @@ void VertexDecoder::DecodeVerts(DecodedVertex *decoded, const void *verts, const case GE_VTYPE_WEIGHT_8BIT >> 9: { - u8 *wdata = (u8*)(ptr); - for (int j=0; j> 9: { - u16 *wdata = (u16*)(ptr); - for (int j=0; j> 9: { - float *wdata = (float*)(ptr+0); - for (int j=0; j>2: + case GE_VTYPE_COL_4444 >> 2: { u16 cdata = *(u16*)(ptr + coloff); for (int j = 0; j < 4; j++) - c[j] = (float)(cdata>>(j * 4) & 0xF) / 15.0f; + c[j] = Convert4To8((cdata >> (j * 4)) & 0xF); } break; - case GE_VTYPE_COL_565>>2: + case GE_VTYPE_COL_565 >> 2: { u16 cdata = *(u16*)(ptr + coloff); - c[0] = (float)(cdata & 0x1f) / 31.0f; - c[1] = (float)((cdata>>5) & 0x3f) / 63.0f; - c[2] = (float)((cdata>>11) & 0x1f) / 31.0f; + c[0] = Convert5To8(cdata & 0x1f); + c[1] = Convert6To8((cdata>>5) & 0x3f); + c[2] = Convert5To8((cdata>>11) & 0x1f); c[3] = 1.0f; } break; - case GE_VTYPE_COL_5551>>2: + case GE_VTYPE_COL_5551 >> 2: { u16 cdata = *(u16*)(ptr + coloff); - c[0] = (float)(cdata & 0x1f) / 31.0f; - c[1] = (float)((cdata>>5) & 0x1f) / 31.0f; - c[2] = (float)((cdata>>10) & 0x1f) / 31.0f; - c[3] = (float)(cdata>>15); + c[0] = Convert5To8(cdata & 0x1f); + c[1] = Convert5To8((cdata>>5) & 0x1f); + c[2] = Convert5To8((cdata>>10) & 0x1f); + c[3] = (cdata>>15) ? 255 : 0; } break; - case GE_VTYPE_COL_8888>>2: + case GE_VTYPE_COL_8888 >> 2: { + // TODO: speedup u8 *cdata = (u8*)(ptr + coloff); - for (int j=0; j<4; j++) - c[j] = (float)cdata[j] / 255.0f; + for (int j = 0; j < 4; j++) + c[j] = cdata[j]; } break; default: - c[0]=1.0f; c[1]=1.0f; c[2]=1.0f; c[3]=1.0f; + c[0]=255; c[1]=255; c[2]=255; c[3]=255; break; } - float *normal = decoded[index].normal; - memset(normal,0,sizeof(float)*3); - for (int n=0; n>5: + case GE_VTYPE_NRM_FLOAT >> 5: { - float *fv = (float*)(ptr + onesize*n + nrmoff); - for (int j=0; j<3; j++) - normal[j] += fv[j] * gstate.morphWeights[n]; + const float *fv = (const float*)(ptr + onesize_*n + nrmoff); + for (int j = 0; j < 3; j++) + normal[j] += fv[j] * multiplier; } break; - case GE_VTYPE_NRM_16BIT>>5: + case GE_VTYPE_NRM_16BIT >> 5: { - short *sv = (short*)(ptr + onesize*n + nrmoff); - for (int j=0; j<3; j++) - normal[j] += (sv[j]/32767.0f) * gstate.morphWeights[n]; + const short *sv = (const short*)(ptr + onesize_*n + nrmoff); + for (int j = 0; j < 3; j++) + normal[j] += (sv[j]/32767.0f) * multiplier; } break; @@ -294,38 +311,78 @@ void VertexDecoder::DecodeVerts(DecodedVertex *decoded, const void *verts, const } } - if (gstate.reversenormals & 0xFFFFFF) - { - for (int j = 0; j < 3; j++) - normal[j] = -normal[j]; - } - float *v = decoded[index].pos; - memset(v, 0, sizeof(float)*3); - for (int n = 0; n < morphcount; n++) - { + + if (morphcount == 1) { switch (pos) { - case GE_VTYPE_POS_FLOAT>>7: + case GE_VTYPE_POS_FLOAT >> 7: { - float *fv = (float*)(ptr + onesize*n + posoff); - for (int j=0; j<3; j++) - v[j] += fv[j] * gstate.morphWeights[n]; + const float *fv = (const float*)(ptr + posoff); + for (int j = 0; j < 3; j++) + v[j] = fv[j]; } break; - case GE_VTYPE_POS_16BIT>>7: + case GE_VTYPE_POS_16BIT >> 7: { - short *sv = (short*)(ptr + onesize*n + posoff); + float multiplier = 1.0f / 32767.0f; + if (throughmode) multiplier = 1.0f; + const short *sv = (const short*)(ptr + posoff); for (int j = 0; j < 3; j++) - v[j] += sv[j] * gstate.morphWeights[n]; + v[j] = sv[j] * multiplier; + } + break; + + case GE_VTYPE_POS_8BIT >> 7: + { + const s8 *sv = (const s8*)(ptr + posoff); + for (int j = 0; j < 3; j++) + v[j] = sv[j] / 127.f; } break; default: - DEBUG_LOG(G3D,"Unknown position format %i",pos); + ERROR_LOG(G3D,"Unknown position format %i",pos); break; } + } else { + memset(v, 0, sizeof(float) * 3); + for (int n = 0; n < morphcount; n++) + { + switch (pos) + { + case GE_VTYPE_POS_FLOAT >> 7: + { + const float *fv = (const float*)(ptr + posoff); + for (int j = 0; j < 3; j++) + v[j] += fv[j] * gstate.morphWeights[n]; + } + break; + + case GE_VTYPE_POS_16BIT >> 7: + { + float multiplier = 1.0f / 32767.0f; + if (throughmode) multiplier = 1.0f; + const short *sv = (const short*)(ptr + posoff); + for (int j = 0; j < 3; j++) + v[j] += (sv[j] * multiplier) * gstate.morphWeights[n]; + } + break; + + case GE_VTYPE_POS_8BIT >> 7: + { + const s8 *sv = (const s8*)(ptr + posoff); + for (int j = 0; j < 3; j++) + v[j] += (sv[j] / 127.f) * gstate.morphWeights[n]; + } + break; + + default: + ERROR_LOG(G3D,"Unknown position format %i",pos); + break; + } + } } } } diff --git a/GPU/GLES/VertexDecoder.h b/GPU/GLES/VertexDecoder.h index ae77d1c8e..5ac8c379b 100644 --- a/GPU/GLES/VertexDecoder.h +++ b/GPU/GLES/VertexDecoder.h @@ -24,15 +24,16 @@ struct DecodedVertex float pos[3]; // in case of morph, preblend during decode float normal[3]; // in case of morph, preblend during decode float uv[2]; // scaled by uscale, vscale, if there - float color[4]; // unlit - float weights[8]; + u8 color[4]; // unlit + float weights[8]; // ugh, expensive }; struct TransformedVertex { float x, y, z; // in case of morph, preblend during decode float uv[2]; // scaled by uscale, vscale, if there - float color[4]; // prelit + float color0[4]; // prelit + float color1[4]; // prelit }; @@ -44,22 +45,30 @@ struct TransformedVertex // - will compile into lighting fast specialized x86 // - will not bother translating components that can be read directly // by OpenGL ES. Will still have to translate 565 colors, and things -// like that. DecodedVertex will not be a fixed struct. +// like that. DecodedVertex will not be a fixed struct. Will have to +// do morphing here. // // We want 100% perf on 1Ghz even in vertex complex games! class VertexDecoder { +public: + VertexDecoder() : coloff(0), nrmoff(0), posoff(0) {} + ~VertexDecoder() {} + void SetVertexType(u32 fmt); + void DecodeVerts(DecodedVertex *decoded, const void *verts, const void *inds, int prim, int count) const; + +private: u32 fmt; bool throughmode; int biggest; + int size; + int onesize_; int weightoff; int tcoff; int coloff; int nrmoff; int posoff; - int size; - int oneSize; int tc; int col; @@ -70,11 +79,4 @@ class VertexDecoder int morphcount; int nweights; -public: - VertexDecoder() : coloff(0), nrmoff(0), posoff(0) {} - ~VertexDecoder() {} - void SetVertexType(u32 fmt); - void DecodeVerts(DecodedVertex *decoded, const void *verts, const void *inds, int prim, int count) const; - - // void DoGLVertexAttribPointer() }; diff --git a/GPU/GLES/VertexShaderGenerator.cpp b/GPU/GLES/VertexShaderGenerator.cpp index ecac3fe32..d48536f6a 100644 --- a/GPU/GLES/VertexShaderGenerator.cpp +++ b/GPU/GLES/VertexShaderGenerator.cpp @@ -60,7 +60,6 @@ char *GenerateVertexShader() #endif int lmode = gstate.lmode & 1; - lmode = 0; // TODO: support separate specular WRITE("attribute vec4 a_position;"); WRITE("attribute vec2 a_texcoord;"); diff --git a/Globals.h b/Globals.h index 557360114..a07fb6655 100644 --- a/Globals.h +++ b/Globals.h @@ -35,6 +35,24 @@ inline u32 _byteswap_ulong(u32 data) #endif +inline u8 Convert4To8(u8 v) +{ + // Swizzle bits: 00012345 -> 12345123 + return (v << 4) | (v); +} + +inline u8 Convert5To8(u8 v) +{ + // Swizzle bits: 00012345 -> 12345123 + return (v << 3) | (v >> 2); +} + +inline u8 Convert6To8(u8 v) +{ + // Swizzle bits: 00123456 -> 12345612 + return (v << 2) | (v >> 4); +} + #ifndef DISALLOW_COPY_AND_ASSIGN #define DISALLOW_COPY_AND_ASSIGN(t) \ private: \ diff --git a/Windows/PPSSPP.sln b/Windows/PPSSPP.sln index 2647de4ef..0ee6604d9 100644 --- a/Windows/PPSSPP.sln +++ b/Windows/PPSSPP.sln @@ -35,8 +35,6 @@ Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Win32 = Debug|Win32 Debug|x64 = Debug|x64 - DebugFast|Win32 = DebugFast|Win32 - DebugFast|x64 = DebugFast|x64 Release|Win32 = Release|Win32 Release|x64 = Release|x64 EndGlobalSection @@ -45,10 +43,6 @@ Global {567AF8DB-42C1-4D08-96CD-D70A2DFEFC6B}.Debug|Win32.Build.0 = Debug|Win32 {567AF8DB-42C1-4D08-96CD-D70A2DFEFC6B}.Debug|x64.ActiveCfg = Debug|x64 {567AF8DB-42C1-4D08-96CD-D70A2DFEFC6B}.Debug|x64.Build.0 = Debug|x64 - {567AF8DB-42C1-4D08-96CD-D70A2DFEFC6B}.DebugFast|Win32.ActiveCfg = DebugFast|Win32 - {567AF8DB-42C1-4D08-96CD-D70A2DFEFC6B}.DebugFast|Win32.Build.0 = DebugFast|Win32 - {567AF8DB-42C1-4D08-96CD-D70A2DFEFC6B}.DebugFast|x64.ActiveCfg = Debug|x64 - {567AF8DB-42C1-4D08-96CD-D70A2DFEFC6B}.DebugFast|x64.Build.0 = Debug|x64 {567AF8DB-42C1-4D08-96CD-D70A2DFEFC6B}.Release|Win32.ActiveCfg = Release|Win32 {567AF8DB-42C1-4D08-96CD-D70A2DFEFC6B}.Release|Win32.Build.0 = Release|Win32 {567AF8DB-42C1-4D08-96CD-D70A2DFEFC6B}.Release|x64.ActiveCfg = Release|x64 @@ -57,10 +51,6 @@ Global {3FCDBAE2-5103-4350-9A8E-848CE9C73195}.Debug|Win32.Build.0 = Debug|Win32 {3FCDBAE2-5103-4350-9A8E-848CE9C73195}.Debug|x64.ActiveCfg = Debug|x64 {3FCDBAE2-5103-4350-9A8E-848CE9C73195}.Debug|x64.Build.0 = Debug|x64 - {3FCDBAE2-5103-4350-9A8E-848CE9C73195}.DebugFast|Win32.ActiveCfg = Debug|Win32 - {3FCDBAE2-5103-4350-9A8E-848CE9C73195}.DebugFast|Win32.Build.0 = Debug|Win32 - {3FCDBAE2-5103-4350-9A8E-848CE9C73195}.DebugFast|x64.ActiveCfg = Debug|x64 - {3FCDBAE2-5103-4350-9A8E-848CE9C73195}.DebugFast|x64.Build.0 = Debug|x64 {3FCDBAE2-5103-4350-9A8E-848CE9C73195}.Release|Win32.ActiveCfg = Release|Win32 {3FCDBAE2-5103-4350-9A8E-848CE9C73195}.Release|Win32.Build.0 = Release|Win32 {3FCDBAE2-5103-4350-9A8E-848CE9C73195}.Release|x64.ActiveCfg = Release|x64 @@ -69,10 +59,6 @@ Global {F761046E-6C38-4428-A5F1-38391A37BB34}.Debug|Win32.Build.0 = Debug|Win32 {F761046E-6C38-4428-A5F1-38391A37BB34}.Debug|x64.ActiveCfg = Debug|x64 {F761046E-6C38-4428-A5F1-38391A37BB34}.Debug|x64.Build.0 = Debug|x64 - {F761046E-6C38-4428-A5F1-38391A37BB34}.DebugFast|Win32.ActiveCfg = Debug|Win32 - {F761046E-6C38-4428-A5F1-38391A37BB34}.DebugFast|Win32.Build.0 = Debug|Win32 - {F761046E-6C38-4428-A5F1-38391A37BB34}.DebugFast|x64.ActiveCfg = Debug|x64 - {F761046E-6C38-4428-A5F1-38391A37BB34}.DebugFast|x64.Build.0 = Debug|x64 {F761046E-6C38-4428-A5F1-38391A37BB34}.Release|Win32.ActiveCfg = Release|Win32 {F761046E-6C38-4428-A5F1-38391A37BB34}.Release|Win32.Build.0 = Release|Win32 {F761046E-6C38-4428-A5F1-38391A37BB34}.Release|x64.ActiveCfg = Release|x64 @@ -81,10 +67,6 @@ Global {457F45D2-556F-47BC-A31D-AFF0D15BEAED}.Debug|Win32.Build.0 = Debug|Win32 {457F45D2-556F-47BC-A31D-AFF0D15BEAED}.Debug|x64.ActiveCfg = Debug|x64 {457F45D2-556F-47BC-A31D-AFF0D15BEAED}.Debug|x64.Build.0 = Debug|x64 - {457F45D2-556F-47BC-A31D-AFF0D15BEAED}.DebugFast|Win32.ActiveCfg = Debug|Win32 - {457F45D2-556F-47BC-A31D-AFF0D15BEAED}.DebugFast|Win32.Build.0 = Debug|Win32 - {457F45D2-556F-47BC-A31D-AFF0D15BEAED}.DebugFast|x64.ActiveCfg = Debug|x64 - {457F45D2-556F-47BC-A31D-AFF0D15BEAED}.DebugFast|x64.Build.0 = Debug|x64 {457F45D2-556F-47BC-A31D-AFF0D15BEAED}.Release|Win32.ActiveCfg = Release|Win32 {457F45D2-556F-47BC-A31D-AFF0D15BEAED}.Release|Win32.Build.0 = Release|Win32 {457F45D2-556F-47BC-A31D-AFF0D15BEAED}.Release|x64.ActiveCfg = Release|x64 @@ -93,10 +75,6 @@ Global {533F1D30-D04D-47CC-AD71-20F658907E36}.Debug|Win32.Build.0 = Debug|Win32 {533F1D30-D04D-47CC-AD71-20F658907E36}.Debug|x64.ActiveCfg = Debug|x64 {533F1D30-D04D-47CC-AD71-20F658907E36}.Debug|x64.Build.0 = Debug|x64 - {533F1D30-D04D-47CC-AD71-20F658907E36}.DebugFast|Win32.ActiveCfg = Debug|Win32 - {533F1D30-D04D-47CC-AD71-20F658907E36}.DebugFast|Win32.Build.0 = Debug|Win32 - {533F1D30-D04D-47CC-AD71-20F658907E36}.DebugFast|x64.ActiveCfg = Debug|x64 - {533F1D30-D04D-47CC-AD71-20F658907E36}.DebugFast|x64.Build.0 = Debug|x64 {533F1D30-D04D-47CC-AD71-20F658907E36}.Release|Win32.ActiveCfg = Release|Win32 {533F1D30-D04D-47CC-AD71-20F658907E36}.Release|Win32.Build.0 = Release|Win32 {533F1D30-D04D-47CC-AD71-20F658907E36}.Release|x64.ActiveCfg = Release|x64 @@ -105,10 +83,6 @@ Global {E8B58922-9827-493D-81E0-4B6E6BD77171}.Debug|Win32.Build.0 = Debug|Win32 {E8B58922-9827-493D-81E0-4B6E6BD77171}.Debug|x64.ActiveCfg = Debug|x64 {E8B58922-9827-493D-81E0-4B6E6BD77171}.Debug|x64.Build.0 = Debug|x64 - {E8B58922-9827-493D-81E0-4B6E6BD77171}.DebugFast|Win32.ActiveCfg = Debug|Win32 - {E8B58922-9827-493D-81E0-4B6E6BD77171}.DebugFast|Win32.Build.0 = Debug|Win32 - {E8B58922-9827-493D-81E0-4B6E6BD77171}.DebugFast|x64.ActiveCfg = Debug|x64 - {E8B58922-9827-493D-81E0-4B6E6BD77171}.DebugFast|x64.Build.0 = Debug|x64 {E8B58922-9827-493D-81E0-4B6E6BD77171}.Release|Win32.ActiveCfg = Release|Win32 {E8B58922-9827-493D-81E0-4B6E6BD77171}.Release|Win32.Build.0 = Release|Win32 {E8B58922-9827-493D-81E0-4B6E6BD77171}.Release|x64.ActiveCfg = Release|x64 @@ -117,10 +91,6 @@ Global {EE9BD869-CAA3-447D-8328-294D90DE2C1F}.Debug|Win32.Build.0 = Debug|Win32 {EE9BD869-CAA3-447D-8328-294D90DE2C1F}.Debug|x64.ActiveCfg = Debug|x64 {EE9BD869-CAA3-447D-8328-294D90DE2C1F}.Debug|x64.Build.0 = Debug|x64 - {EE9BD869-CAA3-447D-8328-294D90DE2C1F}.DebugFast|Win32.ActiveCfg = Debug|Win32 - {EE9BD869-CAA3-447D-8328-294D90DE2C1F}.DebugFast|Win32.Build.0 = Debug|Win32 - {EE9BD869-CAA3-447D-8328-294D90DE2C1F}.DebugFast|x64.ActiveCfg = Debug|x64 - {EE9BD869-CAA3-447D-8328-294D90DE2C1F}.DebugFast|x64.Build.0 = Debug|x64 {EE9BD869-CAA3-447D-8328-294D90DE2C1F}.Release|Win32.ActiveCfg = Release|Win32 {EE9BD869-CAA3-447D-8328-294D90DE2C1F}.Release|Win32.Build.0 = Release|Win32 {EE9BD869-CAA3-447D-8328-294D90DE2C1F}.Release|x64.ActiveCfg = Release|x64 @@ -129,10 +99,6 @@ Global {3BAAE095-E0AB-4B0E-B5DF-CE39C8AE31DE}.Debug|Win32.Build.0 = Debug|Win32 {3BAAE095-E0AB-4B0E-B5DF-CE39C8AE31DE}.Debug|x64.ActiveCfg = Debug|x64 {3BAAE095-E0AB-4B0E-B5DF-CE39C8AE31DE}.Debug|x64.Build.0 = Debug|x64 - {3BAAE095-E0AB-4B0E-B5DF-CE39C8AE31DE}.DebugFast|Win32.ActiveCfg = Debug|Win32 - {3BAAE095-E0AB-4B0E-B5DF-CE39C8AE31DE}.DebugFast|Win32.Build.0 = Debug|Win32 - {3BAAE095-E0AB-4B0E-B5DF-CE39C8AE31DE}.DebugFast|x64.ActiveCfg = Debug|x64 - {3BAAE095-E0AB-4B0E-B5DF-CE39C8AE31DE}.DebugFast|x64.Build.0 = Debug|x64 {3BAAE095-E0AB-4B0E-B5DF-CE39C8AE31DE}.Release|Win32.ActiveCfg = Release|Win32 {3BAAE095-E0AB-4B0E-B5DF-CE39C8AE31DE}.Release|Win32.Build.0 = Release|Win32 {3BAAE095-E0AB-4B0E-B5DF-CE39C8AE31DE}.Release|x64.ActiveCfg = Release|x64 diff --git a/Windows/PPSSPP.vcxproj b/Windows/PPSSPP.vcxproj index 8d486faac..b5a069f27 100644 --- a/Windows/PPSSPP.vcxproj +++ b/Windows/PPSSPP.vcxproj @@ -1,14 +1,6 @@  - - DebugFast - Win32 - - - DebugFast - x64 - Debug Win32 @@ -33,11 +25,6 @@ PPSSPPWindows - - Application - MultiByte - false - Application MultiByte @@ -47,11 +34,6 @@ Application MultiByte - - Application - MultiByte - false - Application MultiByte @@ -64,10 +46,6 @@ - - - - @@ -76,10 +54,6 @@ - - - - @@ -102,18 +76,6 @@ $(Platform)\$(Configuration)\ $(Platform)\$(Configuration)\ false - ..\ - $(Configuration)\ - false - $(Platform)\$(Configuration)\ - $(Platform)\$(Configuration)\ - false - AllRules.ruleset - - - AllRules.ruleset - - AllRules.ruleset @@ -246,94 +208,34 @@ true - - - WIN32;NDEBUG;_WINDOWS;LOGGING;_DEBUG;%(PreprocessorDefinitions) - Sync - MultiThreadedDebugDLL - false - Use - - - Level3 - ProgramDatabase - ../common;..;../native;../native/ext/glew;../ext/zlib - stdafx.h - - - XInput.lib;Winmm.lib;Ws2_32.lib;opengl32.lib;dsound.lib;glu32.lib;comctl32.lib;%(AdditionalDependencies) - $(OutDir)$(TargetName)$(TargetExt) - %(AdditionalLibraryDirectories) - true - Windows - true - true - MachineX86 - - - - - X64 - - - WIN32;NDEBUG;_WINDOWS;LOGGING;_DEBUG;%(PreprocessorDefinitions) - Sync - MultiThreadedDebugDLL - false - Use - Level3 - ProgramDatabase - ../common;..;../native;../native/ext/glew;../ext/zlib - - - imgdecoder.lib;opengl32.lib;dsound.lib;glu32.lib;comctl32.lib;XInput.lib;%(AdditionalDependencies) - $(OutDir)DaShDebugFast.exe - %(AdditionalLibraryDirectories) - true - Windows - true - true - MachineX64 - - - true true true - true true true - true true true - true true true - true true true - true true true - true true true - true true true - true true true - true true true @@ -344,12 +246,6 @@ NotUsing - NotUsing - - - NotUsing - - NotUsing @@ -368,8 +264,6 @@ - $(IntDir)%(Filename)2.obj - $(IntDir)%(Filename)2.obj $(IntDir)%(Filename)2.obj $(IntDir)%(Filename)2.obj $(IntDir)%(Filename)2.obj @@ -385,8 +279,6 @@ - Create - Create Create Create Create @@ -464,4 +356,4 @@ - + \ No newline at end of file diff --git a/android/jni/Android.mk b/android/jni/Android.mk index 5b0680bcc..025c73a38 100644 --- a/android/jni/Android.mk +++ b/android/jni/Android.mk @@ -5,7 +5,7 @@ LOCAL_PATH := $(call my-dir) include $(CLEAR_VARS) LOCAL_MODULE := native_audio -LOCAL_CFLAGS := -O2 -fsigned-char -Wall -Wno-multichar -Wno-psabi -std=gnu++0x +LOCAL_CFLAGS := -O2 -fsigned-char -ffast-math -Wall -Wno-multichar -Wno-psabi -std=gnu++0x NATIVE := ../../native LOCAL_SRC_FILES := \ $(NATIVE)/android/native-audio-so.cpp @@ -24,7 +24,7 @@ LOCAL_MODULE := ppsspp_jni NATIVE := ../../native SRC := ../.. -LOCAL_CFLAGS := -DUSE_PROFILER -DGL_GLEXT_PROTOTYPES -O2 -fsigned-char -Wall -Wno-multichar -Wno-psabi -std=gnu++0x -Wno-unused-variable -fno-strict-aliasing +LOCAL_CFLAGS := -DUSE_PROFILER -DGL_GLEXT_PROTOTYPES -O2 -fsigned-char -Wall -Wno-multichar -Wno-psabi -std=gnu++0x -Wno-unused-variable -fno-strict-aliasing -ffast-math LOCAL_CPPFLAGS := LOCAL_C_INCLUDES := \ $(LOCAL_PATH)/../../Common \