Merge pull request #17729 from unknownbrackets/softgpu-lighting

softgpu: Reduce some non-SIMD lighting math
This commit is contained in:
Henrik Rydgård 2023-07-16 21:00:38 +02:00 committed by GitHub
commit 89d846ecbe
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 98 additions and 32 deletions

View File

@ -1367,6 +1367,16 @@ inline Vec3<float> Vec3<float>::operator + (const Vec3 &other) const {
return Vec3<float>(_mm_add_ps(SAFE_M128(vec), SAFE_M128(other.vec)));
}
// Vec3<float> specialization of in-place subtraction: subtracts `other` from
// this vector with a single packed single-precision SSE subtract (_mm_sub_ps)
// and stores the result back into `vec`.
// NOTE(review): SAFE_M128 is defined outside this view; presumably it makes the
// __m128 member safe to read on every supported compiler - confirm.
template<>
inline void Vec3<float>::operator -= (const Vec3<float> &other) {
vec = _mm_sub_ps(SAFE_M128(vec), SAFE_M128(other.vec));
}
// Vec3<float> specialization of binary subtraction: returns a new Vec3<float>
// built from the packed single-precision SSE difference (_mm_sub_ps) of this
// vector and `other`; neither operand is modified.
// NOTE(review): SAFE_M128 is defined outside this view; presumably it makes the
// __m128 member safe to read on every supported compiler - confirm.
template<>
inline Vec3<float> Vec3<float>::operator - (const Vec3 &other) const {
return Vec3<float>(_mm_sub_ps(SAFE_M128(vec), SAFE_M128(other.vec)));
}
template<>
inline Vec3<float> Vec3<float>::operator * (const Vec3 &other) const {
return Vec3<float>(_mm_mul_ps(SAFE_M128(vec), SAFE_M128(other.vec)));

View File

@ -86,15 +86,13 @@ void ComputeState(State *state, bool hasColor0) {
bool anyAmbient = false;
bool anyDiffuse = false;
bool anySpecular = false;
bool anyDirectional = false;
bool anyNonDirectional = false;
for (int light = 0; light < 4; ++light) {
auto &lstate = state->lights[light];
lstate.enabled = gstate.isLightChanEnabled(light);
if (!lstate.enabled)
continue;
lstate.spot = gstate.isSpotLight(light);
lstate.directional = gstate.isDirectionalLight(light);
lstate.poweredDiffuse = gstate.isUsingPoweredDiffuseLight(light);
lstate.specular = gstate.isUsingSpecularLight(light);
@ -112,14 +110,22 @@ void ComputeState(State *state, bool hasColor0) {
anySpecular = anySpecular || lstate.specular;
}
lstate.pos = GetLightVec(gstate.lpos, light);
if (lstate.directional) {
lstate.pos.NormalizeOr001();
anyDirectional = true;
} else {
lstate.att = GetLightVec(gstate.latt, light);
// Doesn't actually need to be on if nothing will affect it.
if (!lstate.specular && !lstate.ambient && !lstate.diffuse) {
lstate.enabled = false;
continue;
}
lstate.pos = GetLightVec(gstate.lpos, light);
lstate.directional = gstate.isDirectionalLight(light);
if (lstate.directional) {
lstate.pos.NormalizeOr001();
} else {
lstate.att = GetLightVec(gstate.latt, light);
anyNonDirectional = true;
}
lstate.spot = gstate.isSpotLight(light);
if (lstate.spot) {
lstate.spotDir = GetLightVec(gstate.ldir, light);
lstate.spotDir.Normalize();
@ -177,7 +183,7 @@ void ComputeState(State *state, bool hasColor0) {
state->baseAmbientColorFactor = LightColorFactor(gstate.getAmbientRGBA(), ones);
state->setColor1 = gstate.isUsingSecondaryColor() && anySpecular;
state->addColor1 = !gstate.isUsingSecondaryColor() && anySpecular;
state->usesWorldPos = anyDirectional;
state->usesWorldPos = anyNonDirectional;
state->usesWorldNormal = gstate.getUVGenMode() == GE_TEXMAP_ENVIRONMENT_MAP || anyDiffuse || anySpecular;
}
@ -215,7 +221,7 @@ static inline __m128i LightColorScaleBy512SSE4(__m128i factor, __m128i color, __
__m128i result18 = _mm_madd_epi16(factor, color);
// But now with 18 bits, we need a full multiply.
__m128i multiplied = _mm_mullo_epi32(result18, scale);
return _mm_srai_epi32(multiplied, 19);
return _mm_srai_epi32(multiplied, 10 + 9);
}
#endif
@ -240,9 +246,9 @@ static Vec4<int> LightColorScaleBy512(const Vec4<int> &factor, const Vec4<int> &
return LightColorScaleBy512SSE4(factor.ivec, color.ivec, _mm_set1_epi32(scale));
#elif PPSSPP_ARCH(ARM64_NEON)
int32x4_t multiplied = vmulq_n_s32(vmulq_s32(factor.ivec, color.ivec), scale);
return vshrq_n_s32(multiplied, 19);
return vshrq_n_s32(multiplied, 10 + 9);  // >> 19 total, matching the SSE4 path and the scalar fallback's / (1024 * 512).
#endif
return (factor * color * scale) / (1024 * 512);
return (factor * color * scale) >> (10 + 9);
}
static inline void LightColorSum(Vec4<int> &sum, const Vec4<int> &src) {
@ -296,25 +302,26 @@ static void ProcessSIMD(VertexData &vertex, const WorldCoords &worldpos, const W
// L = vector from vertex to light source
// TODO: Should transfer the light positions to world/view space for these calculations?
Vec3<float> L = lstate.pos;
float att = 1.0f;
float attspot = 1.0f;
if (!lstate.directional) {
L -= worldpos;
// TODO: Should this normalize (0, 0, 0) to (0, 0, 1)?
float d = L.NormalizeOr001();
att = 1.0f / Dot33(lstate.att, Vec3f(1.0f, d, d * d));
float att = 1.0f / Dot33(lstate.att, Vec3f(1.0f, d, d * d));
if (!(att > 0.0f))
att = 0.0f;
else if (att > 1.0f)
att = 1.0f;
attspot = att;
}
float spot = 1.0f;
if (lstate.spot) {
float rawSpot = Dot33(lstate.spotDir, L);
if (std::isnan(rawSpot))
rawSpot = std::signbit(rawSpot) ? 0.0f : 1.0f;
float spot = 1.0f;
if (rawSpot >= lstate.spotCutoff) {
spot = pspLightPow(rawSpot, lstate.spotExp);
if (std::isnan(spot))
@ -322,14 +329,16 @@ static void ProcessSIMD(VertexData &vertex, const WorldCoords &worldpos, const W
} else {
spot = 0.0f;
}
attspot *= spot;
}
// ambient lighting
if (lstate.ambient) {
int attspot = (int)LightCeil<useSSE4>(256 * 2 * att * spot + 1);
if (attspot > 512)
attspot = 512;
Vec4<int> lambient = LightColorScaleBy512<useSSE4>(lstate.ambientColorFactor, mac, attspot);
int attspot512 = (int)LightCeil<useSSE4>(256 * 2 * attspot + 1);
if (attspot512 > 512)
attspot512 = 512;
Vec4<int> lambient = LightColorScaleBy512<useSSE4>(lstate.ambientColorFactor, mac, attspot512);
LightColorSum(final_color, lambient);
}
@ -343,7 +352,7 @@ static void ProcessSIMD(VertexData &vertex, const WorldCoords &worldpos, const W
}
if (lstate.diffuse && diffuse_factor > 0.0f) {
int diffuse_attspot = (int)LightCeil<useSSE4>(256 * 2 * att * spot * diffuse_factor + 1);
int diffuse_attspot = (int)LightCeil<useSSE4>(256 * 2 * attspot * diffuse_factor + 1);
if (diffuse_attspot > 512)
diffuse_attspot = 512;
Vec4<int> mdc = state.colorForDiffuse ? colorFactor : state.material.diffuseColorFactor;
@ -358,7 +367,7 @@ static void ProcessSIMD(VertexData &vertex, const WorldCoords &worldpos, const W
specular_factor = pspLightPow(specular_factor, state.specularExp);
if (specular_factor > 0.0f) {
int specular_attspot = (int)LightCeil<useSSE4>(256 * 2 * att * spot * specular_factor + 1);
int specular_attspot = (int)LightCeil<useSSE4>(256 * 2 * attspot * specular_factor + 1);
if (specular_attspot > 512)
specular_attspot = 512;

View File

@ -104,14 +104,14 @@
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
<ConfigurationType>StaticLibrary</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<WholeProgramOptimization>true</WholeProgramOptimization>
<WholeProgramOptimization>false</WholeProgramOptimization>
<CharacterSet>Unicode</CharacterSet>
<PlatformToolset>$(DefaultPlatformToolset)</PlatformToolset>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|ARM'" Label="Configuration">
<ConfigurationType>StaticLibrary</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<WholeProgramOptimization>true</WholeProgramOptimization>
<WholeProgramOptimization>false</WholeProgramOptimization>
<CharacterSet>Unicode</CharacterSet>
<PlatformToolset>$(DefaultPlatformToolset)</PlatformToolset>
</PropertyGroup>
@ -130,14 +130,14 @@
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
<ConfigurationType>StaticLibrary</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<WholeProgramOptimization>true</WholeProgramOptimization>
<WholeProgramOptimization>false</WholeProgramOptimization>
<CharacterSet>Unicode</CharacterSet>
<PlatformToolset>$(DefaultPlatformToolset)</PlatformToolset>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|ARM64'" Label="Configuration">
<ConfigurationType>StaticLibrary</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<WholeProgramOptimization>true</WholeProgramOptimization>
<WholeProgramOptimization>false</WholeProgramOptimization>
<CharacterSet>Unicode</CharacterSet>
<PlatformToolset>$(DefaultPlatformToolset)</PlatformToolset>
</PropertyGroup>
@ -174,11 +174,15 @@
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<SDLCheck>true</SDLCheck>
<PreprocessorDefinitions>RC_DISABLE_LUA;WIN32;_DEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<ConformanceMode>true</ConformanceMode>
<PrecompiledHeaderFile>pch.h</PrecompiledHeaderFile>
<AdditionalIncludeDirectories>../rcheevos/include</AdditionalIncludeDirectories>
<EnableEnhancedInstructionSet>StreamingSIMDExtensions2</EnableEnhancedInstructionSet>
<FloatingPointModel>Precise</FloatingPointModel>
<BasicRuntimeChecks>Default</BasicRuntimeChecks>
<MultiProcessorCompilation>true</MultiProcessorCompilation>
<MinimalRebuild>false</MinimalRebuild>
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
</ClCompile>
<Link>
@ -190,11 +194,15 @@
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|ARM'">
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<SDLCheck>true</SDLCheck>
<PreprocessorDefinitions>RC_DISABLE_LUA;WIN32;_DEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<ConformanceMode>true</ConformanceMode>
<PrecompiledHeaderFile>pch.h</PrecompiledHeaderFile>
<AdditionalIncludeDirectories>../rcheevos/include</AdditionalIncludeDirectories>
<OmitFramePointers>false</OmitFramePointers>
<EnableEnhancedInstructionSet>NotSet</EnableEnhancedInstructionSet>
<FloatingPointModel>Precise</FloatingPointModel>
<MultiProcessorCompilation>true</MultiProcessorCompilation>
<MinimalRebuild>false</MinimalRebuild>
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
</ClCompile>
<Link>
@ -206,13 +214,19 @@
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<Optimization>MaxSpeed</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<SDLCheck>true</SDLCheck>
<PreprocessorDefinitions>RC_DISABLE_LUA;WIN32;NDEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<ConformanceMode>true</ConformanceMode>
<PrecompiledHeaderFile>pch.h</PrecompiledHeaderFile>
<AdditionalIncludeDirectories>../rcheevos/include</AdditionalIncludeDirectories>
<EnableEnhancedInstructionSet>StreamingSIMDExtensions2</EnableEnhancedInstructionSet>
<BufferSecurityCheck>false</BufferSecurityCheck>
<FloatingPointModel>Precise</FloatingPointModel>
<FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
<InlineFunctionExpansion>AnySuitable</InlineFunctionExpansion>
<MultiProcessorCompilation>true</MultiProcessorCompilation>
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
</ClCompile>
<Link>
@ -226,13 +240,20 @@
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|ARM'">
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<Optimization>MaxSpeed</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<SDLCheck>true</SDLCheck>
<PreprocessorDefinitions>RC_DISABLE_LUA;WIN32;NDEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<ConformanceMode>true</ConformanceMode>
<PrecompiledHeaderFile>pch.h</PrecompiledHeaderFile>
<AdditionalIncludeDirectories>../rcheevos/include</AdditionalIncludeDirectories>
<BufferSecurityCheck>false</BufferSecurityCheck>
<InlineFunctionExpansion>AnySuitable</InlineFunctionExpansion>
<FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
<OmitFramePointers>false</OmitFramePointers>
<EnableEnhancedInstructionSet>NotSet</EnableEnhancedInstructionSet>
<FloatingPointModel>Precise</FloatingPointModel>
<MultiProcessorCompilation>true</MultiProcessorCompilation>
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
</ClCompile>
<Link>
@ -251,6 +272,12 @@
<ConformanceMode>true</ConformanceMode>
<PrecompiledHeaderFile>pch.h</PrecompiledHeaderFile>
<AdditionalIncludeDirectories>../rcheevos/include</AdditionalIncludeDirectories>
<OmitFramePointers>false</OmitFramePointers>
<EnableEnhancedInstructionSet>NotSet</EnableEnhancedInstructionSet>
<FloatingPointModel>Precise</FloatingPointModel>
<BasicRuntimeChecks>Default</BasicRuntimeChecks>
<MultiProcessorCompilation>true</MultiProcessorCompilation>
<MinimalRebuild>false</MinimalRebuild>
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
</ClCompile>
<Link>
@ -267,6 +294,12 @@
<ConformanceMode>true</ConformanceMode>
<PrecompiledHeaderFile>pch.h</PrecompiledHeaderFile>
<AdditionalIncludeDirectories>../rcheevos/include</AdditionalIncludeDirectories>
<OmitFramePointers>false</OmitFramePointers>
<EnableEnhancedInstructionSet>NotSet</EnableEnhancedInstructionSet>
<FloatingPointModel>Precise</FloatingPointModel>
<BasicRuntimeChecks>Default</BasicRuntimeChecks>
<MultiProcessorCompilation>true</MultiProcessorCompilation>
<MinimalRebuild>false</MinimalRebuild>
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
</ClCompile>
<Link>
@ -278,13 +311,20 @@
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<Optimization>MaxSpeed</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<SDLCheck>true</SDLCheck>
<PreprocessorDefinitions>RC_DISABLE_LUA;NDEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<ConformanceMode>true</ConformanceMode>
<PrecompiledHeaderFile>pch.h</PrecompiledHeaderFile>
<AdditionalIncludeDirectories>../rcheevos/include</AdditionalIncludeDirectories>
<BufferSecurityCheck>false</BufferSecurityCheck>
<InlineFunctionExpansion>AnySuitable</InlineFunctionExpansion>
<FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
<OmitFramePointers>false</OmitFramePointers>
<EnableEnhancedInstructionSet>NotSet</EnableEnhancedInstructionSet>
<FloatingPointModel>Precise</FloatingPointModel>
<MultiProcessorCompilation>true</MultiProcessorCompilation>
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
</ClCompile>
<Link>
@ -298,13 +338,20 @@
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|ARM64'">
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<Optimization>MaxSpeed</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<SDLCheck>true</SDLCheck>
<PreprocessorDefinitions>RC_DISABLE_LUA;NDEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<ConformanceMode>true</ConformanceMode>
<PrecompiledHeaderFile>pch.h</PrecompiledHeaderFile>
<AdditionalIncludeDirectories>../rcheevos/include</AdditionalIncludeDirectories>
<BufferSecurityCheck>false</BufferSecurityCheck>
<InlineFunctionExpansion>AnySuitable</InlineFunctionExpansion>
<FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
<OmitFramePointers>false</OmitFramePointers>
<EnableEnhancedInstructionSet>NotSet</EnableEnhancedInstructionSet>
<FloatingPointModel>Precise</FloatingPointModel>
<MultiProcessorCompilation>true</MultiProcessorCompilation>
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
</ClCompile>
<Link>