GS-OGL: Add comment on m_accurate_stq usage

GS: Remove inaccurate stq calculations from GSVertexTrace
They were the same speed or slower than full div on IvyBridge+ and Bulldozer+
2026-01-31 01:15:24 +01:00 · 2021-11-03 01:21:11 +00:00 · 2021-11-03 01:21:11 +00:00 · 2021-11-03 01:21:11 +00:00 · 2021-11-03 01:21:11 +00:00 · 2021-11-03 00:54:29 +00:00
6 changed files with 317 additions and 463 deletions
--- a/bin/GameIndex.yaml
+++ b/bin/GameIndex.yaml
--- a/pcsx2/GS/GSVector4.h
+++ b/pcsx2/GS/GSVector4.h
@@ -205,6 +205,17 @@ public:
 		return m;
 	}

+	/// Makes Clang think that the whole vector is needed, preventing it from changing shuffles around because it thinks we don't need the whole vector
+	/// Useful for e.g. preventing clang from optimizing shuffles that remove possibly-denormal garbage data from vectors before computing with them
+	__forceinline GSVector4 noopt()
+	{
+		// Note: Clang is currently the only compiler that attempts to optimize vector intrinsics, if that changes in the future the implementation should be updated
+#ifdef __clang__
+		__asm__("":"+x"(m)::);
+#endif
+		return *this;
+	}
+
 	__forceinline uint32 rgba32() const
 	{
 		return GSVector4i(*this).rgba32();
--- a/pcsx2/GS/Renderers/Common/GSVertexTrace.cpp
+++ b/pcsx2/GS/Renderers/Common/GSVertexTrace.cpp
@@ -27,8 +27,7 @@ GSVertexTrace::GSVertexTrace(const GSState* state)
 	memset(&m_alpha, 0, sizeof(m_alpha));

 	#define InitUpdate3(P, IIP, TME, FST, COLOR) \
-		m_fmm[0][COLOR][FST][TME][IIP][P] = &GSVertexTrace::FindMinMax<P, IIP, TME, FST, COLOR, 0>; \
-		m_fmm[1][COLOR][FST][TME][IIP][P] = &GSVertexTrace::FindMinMax<P, IIP, TME, FST, COLOR, 1>; \
+		m_fmm[COLOR][FST][TME][IIP][P] = &GSVertexTrace::FindMinMax<P, IIP, TME, FST, COLOR>;

 	#define InitUpdate2(P, IIP, TME) \
 		InitUpdate3(P, IIP, TME, 0, 0) \
@@ -57,7 +56,7 @@ void GSVertexTrace::Update(const void* vertex, const uint32* index, int v_count,
 	uint32 fst = m_state->PRIM->FST;
 	uint32 color = !(m_state->PRIM->TME && m_state->m_context->TEX0.TFX == TFX_DECAL && m_state->m_context->TEX0.TCC);

-	(this->*m_fmm[m_accurate_stq][color][fst][tme][iip][primclass])(vertex, index, i_count);
+	(this->*m_fmm[color][fst][tme][iip][primclass])(vertex, index, i_count);

 	// Potential float overflow detected. Better uses the slower division instead
 	// Note: If Q is too big, 1/Q will end up as 0. 1e30 is a random number
@@ -66,7 +65,6 @@ void GSVertexTrace::Update(const void* vertex, const uint32* index, int v_count,
 	{
 		fprintf(stderr, "Vertex Trace: float overflow detected ! min %e max %e\n", m_min.t.z, m_max.t.z);
 		m_accurate_stq = true;
-		(this->*m_fmm[m_accurate_stq][color][fst][tme][iip][primclass])(vertex, index, i_count);
 	}

 	m_eq.value = (m_min.c == m_max.c).mask() | ((m_min.p == m_max.p).mask() << 16) | ((m_min.t == m_max.t).mask() << 20);
@@ -150,7 +148,7 @@ void GSVertexTrace::Update(const void* vertex, const uint32* index, int v_count,
 	}
 }

-template <GS_PRIM_CLASS primclass, uint32 iip, uint32 tme, uint32 fst, uint32 color, uint32 accurate_stq>
+template <GS_PRIM_CLASS primclass, uint32 iip, uint32 tme, uint32 fst, uint32 color>
 void GSVertexTrace::FindMinMax(const void* vertex, const uint32* index, int count)
 {
 	const GSDrawingContext* context = m_state->m_context;
@@ -181,287 +179,123 @@ void GSVertexTrace::FindMinMax(const void* vertex, const uint32* index, int coun

 	const GSVertex* RESTRICT v = (GSVertex*)vertex;

-	for (int i = 0; i < count; i += n)
+	// Process 2 vertices at a time for increased efficiency
+	auto processVertices = [&](const GSVertex& v0, const GSVertex& v1, bool finalVertex)
 	{
-		if (primclass == GS_POINT_CLASS)
+		if (color)
 		{
-			GSVector4i c(v[index[i]].m[0]);
-
-			if (color)
+			GSVector4i c0 = GSVector4i::load(v0.RGBAQ.u32[0]);
+			GSVector4i c1 = GSVector4i::load(v1.RGBAQ.u32[0]);
+			if (iip || finalVertex)
 			{
-				cmin = cmin.min_u8(c);
-				cmax = cmax.max_u8(c);
+				cmin = cmin.min_u8(c0.min_u8(c1));
+				cmax = cmax.max_u8(c0.max_u8(c1));
 			}
-
-			if (tme)
+			else if (n == 2)
 			{
-				if (!fst)
-				{
-					GSVector4 stq = GSVector4::cast(c);
-
-					GSVector4 q = stq.wwww();
-
-					if (accurate_stq)
-						stq = (stq.xyww() / q).xyww(q);
-					else
-						stq = (stq.xyww() * q.rcpnr()).xyww(q);
-
-					tmin = tmin.min(stq);
-					tmax = tmax.max(stq);
-				}
-				else
-				{
-					GSVector4i uv(v[index[i]].m[1]);
-
-					GSVector4 st = GSVector4(uv.uph16()).xyxy();
-
-					tmin = tmin.min(st);
-					tmax = tmax.max(st);
-				}
+				// For even n, we process v1 and v2 of the same prim
+				// (For odd n, we process one vertex from each of two prims)
+				cmin = cmin.min_u8(c1);
+				cmax = cmax.max_u8(c1);
 			}
-
-			GSVector4i xyzf(v[index[i]].m[1]);
-
-			GSVector4i xy = xyzf.upl16();
-			GSVector4i z = xyzf.yyyy();
-
-			GSVector4i p = xy.blend16<0xf0>(z.uph32(xyzf));
-
-			pmin = pmin.min_u32(p);
-			pmax = pmax.max_u32(p);
 		}
-		else if (primclass == GS_LINE_CLASS)
+
+		if (tme)
 		{
-			GSVector4i c0(v[index[i + 0]].m[0]);
-			GSVector4i c1(v[index[i + 1]].m[0]);
-
-			if (color)
+			if (!fst)
 			{
-				if (iip)
-				{
-					cmin = cmin.min_u8(c0.min_u8(c1));
-					cmax = cmax.max_u8(c0.max_u8(c1));
-				}
-				else
-				{
-					cmin = cmin.min_u8(c1);
-					cmax = cmax.max_u8(c1);
-				}
-			}
+				GSVector4 stq0 = GSVector4::cast(GSVector4i(v0.m[0]));
+				GSVector4 stq1 = GSVector4::cast(GSVector4i(v1.m[0]));

-			if (tme)
+				GSVector4 q;
+				// Sprites always have indices == vertices, so we don't have to look at the index table here
+				if (primclass == GS_SPRITE_CLASS)
+					q = stq1.wwww();
+				else
+					q = stq0.wwww(stq1);
+
+				// Note: If in the future this is changed in a way that causes parts of calculations to go unused,
+				//       make sure to remove the z (rgba) field as it's often denormal.
+				//       Then, use GSVector4::noopt() to prevent clang from optimizing out your "useless" shuffle
+				//       e.g. stq = (stq.xyww() / stq.wwww()).noopt().xyww(stq);
+				GSVector4 st = stq0.xyxy(stq1) / q;
+
+				stq0 = st.xyww(primclass == GS_SPRITE_CLASS ? stq1 : stq0);
+				stq1 = st.zwww(stq1);
+
+				tmin = tmin.min(stq0.min(stq1));
+				tmax = tmax.max(stq0.max(stq1));
+			}
+			else
 			{
-				if (!fst)
-				{
-					GSVector4 stq0 = GSVector4::cast(c0);
-					GSVector4 stq1 = GSVector4::cast(c1);
+				GSVector4i uv0(v0.m[1]);
+				GSVector4i uv1(v1.m[1]);

-					if (accurate_stq)
-					{
-						GSVector4 q = stq0.wwww(stq1);
+				GSVector4 st0 = GSVector4(uv0.uph16()).xyxy();
+				GSVector4 st1 = GSVector4(uv1.uph16()).xyxy();

-						stq0 = (stq0.xyww() / q.xxxx()).xyww(stq0);
-						stq1 = (stq1.xyww() / q.zzzz()).xyww(stq1);
-					}
-					else
-					{
-						GSVector4 q = stq0.wwww(stq1).rcpnr();
-
-						stq0 = (stq0.xyww() * q.xxxx()).xyww(stq0);
-						stq1 = (stq1.xyww() * q.zzzz()).xyww(stq1);
-					}
-
-					tmin = tmin.min(stq0.min(stq1));
-					tmax = tmax.max(stq0.max(stq1));
-				}
-				else
-				{
-					GSVector4i uv0(v[index[i + 0]].m[1]);
-					GSVector4i uv1(v[index[i + 1]].m[1]);
-
-					GSVector4 st0 = GSVector4(uv0.uph16()).xyxy();
-					GSVector4 st1 = GSVector4(uv1.uph16()).xyxy();
-
-					tmin = tmin.min(st0.min(st1));
-					tmax = tmax.max(st0.max(st1));
-				}
+				tmin = tmin.min(st0.min(st1));
+				tmax = tmax.max(st0.max(st1));
 			}
-
-			GSVector4i xyzf0(v[index[i + 0]].m[1]);
-			GSVector4i xyzf1(v[index[i + 1]].m[1]);
-
-			GSVector4i xy0 = xyzf0.upl16();
-			GSVector4i z0 = xyzf0.yyyy();
-			GSVector4i xy1 = xyzf1.upl16();
-			GSVector4i z1 = xyzf1.yyyy();
-
-			GSVector4i p0 = xy0.blend16<0xf0>(z0.uph32(xyzf0));
-			GSVector4i p1 = xy1.blend16<0xf0>(z1.uph32(xyzf1));
-
-			pmin = pmin.min_u32(p0.min_u32(p1));
-			pmax = pmax.max_u32(p0.max_u32(p1));
 		}
-		else if (primclass == GS_TRIANGLE_CLASS)
+
+		GSVector4i xyzf0(v0.m[1]);
+		GSVector4i xyzf1(v1.m[1]);
+
+		GSVector4i xy0 = xyzf0.upl16();
+		GSVector4i z0 = xyzf0.yyyy();
+		GSVector4i xy1 = xyzf1.upl16();
+		GSVector4i z1 = xyzf1.yyyy();
+
+		GSVector4i p0 = xy0.blend16<0xf0>(z0.uph32(primclass == GS_SPRITE_CLASS ? xyzf1 : xyzf0));
+		GSVector4i p1 = xy1.blend16<0xf0>(z1.uph32(xyzf1));
+
+		pmin = pmin.min_u32(p0.min_u32(p1));
+		pmax = pmax.max_u32(p0.max_u32(p1));
+	};
+
+	if (n == 2)
+	{
+		for (int i = 0; i < count; i += 2)
 		{
-			GSVector4i c0(v[index[i + 0]].m[0]);
-			GSVector4i c1(v[index[i + 1]].m[0]);
-			GSVector4i c2(v[index[i + 2]].m[0]);
-
-			if (color)
-			{
-				if (iip)
-				{
-					cmin = cmin.min_u8(c2).min_u8(c0.min_u8(c1));
-					cmax = cmax.max_u8(c2).max_u8(c0.max_u8(c1));
-				}
-				else
-				{
-					cmin = cmin.min_u8(c2);
-					cmax = cmax.max_u8(c2);
-				}
-			}
-
-			if (tme)
-			{
-				if (!fst)
-				{
-					GSVector4 stq0 = GSVector4::cast(c0);
-					GSVector4 stq1 = GSVector4::cast(c1);
-					GSVector4 stq2 = GSVector4::cast(c2);
-
-					if (accurate_stq)
-					{
-						GSVector4 q = stq0.wwww(stq1).xzww(stq2);
-
-						stq0 = (stq0.xyww() / q.xxxx()).xyww(stq0);
-						stq1 = (stq1.xyww() / q.yyyy()).xyww(stq1);
-						stq2 = (stq2.xyww() / q.zzzz()).xyww(stq2);
-					}
-					else
-					{
-						GSVector4 q = stq0.wwww(stq1).xzww(stq2).rcpnr();
-
-						stq0 = (stq0.xyww() * q.xxxx()).xyww(stq0);
-						stq1 = (stq1.xyww() * q.yyyy()).xyww(stq1);
-						stq2 = (stq2.xyww() * q.zzzz()).xyww(stq2);
-					}
-
-					tmin = tmin.min(stq2).min(stq0.min(stq1));
-					tmax = tmax.max(stq2).max(stq0.max(stq1));
-				}
-				else
-				{
-					GSVector4i uv0(v[index[i + 0]].m[1]);
-					GSVector4i uv1(v[index[i + 1]].m[1]);
-					GSVector4i uv2(v[index[i + 2]].m[1]);
-
-					GSVector4 st0 = GSVector4(uv0.uph16()).xyxy();
-					GSVector4 st1 = GSVector4(uv1.uph16()).xyxy();
-					GSVector4 st2 = GSVector4(uv2.uph16()).xyxy();
-
-					tmin = tmin.min(st2).min(st0.min(st1));
-					tmax = tmax.max(st2).max(st0.max(st1));
-				}
-			}
-
-			GSVector4i xyzf0(v[index[i + 0]].m[1]);
-			GSVector4i xyzf1(v[index[i + 1]].m[1]);
-			GSVector4i xyzf2(v[index[i + 2]].m[1]);
-
-			GSVector4i xy0 = xyzf0.upl16();
-			GSVector4i z0 = xyzf0.yyyy();
-			GSVector4i xy1 = xyzf1.upl16();
-			GSVector4i z1 = xyzf1.yyyy();
-			GSVector4i xy2 = xyzf2.upl16();
-			GSVector4i z2 = xyzf2.yyyy();
-
-			GSVector4i p0 = xy0.blend16<0xf0>(z0.uph32(xyzf0));
-			GSVector4i p1 = xy1.blend16<0xf0>(z1.uph32(xyzf1));
-			GSVector4i p2 = xy2.blend16<0xf0>(z2.uph32(xyzf2));
-
-			pmin = pmin.min_u32(p2).min_u32(p0.min_u32(p1));
-			pmax = pmax.max_u32(p2).max_u32(p0.max_u32(p1));
-		}
-		else if (primclass == GS_SPRITE_CLASS)
-		{
-			GSVector4i c0(v[index[i + 0]].m[0]);
-			GSVector4i c1(v[index[i + 1]].m[0]);
-
-			if (color)
-			{
-				if (iip)
-				{
-					cmin = cmin.min_u8(c0.min_u8(c1));
-					cmax = cmax.max_u8(c0.max_u8(c1));
-				}
-				else
-				{
-					cmin = cmin.min_u8(c1);
-					cmax = cmax.max_u8(c1);
-				}
-			}
-
-			if (tme)
-			{
-				if (!fst)
-				{
-					GSVector4 stq0 = GSVector4::cast(c0);
-					GSVector4 stq1 = GSVector4::cast(c1);
-
-					if (accurate_stq)
-					{
-						GSVector4 q = stq1.wwww();
-
-						stq0 = (stq0.xyww() / q).xyww(stq1);
-						stq1 = (stq1.xyww() / q).xyww(stq1);
-					}
-					else
-					{
-						GSVector4 q = stq1.wwww().rcpnr();
-
-						stq0 = (stq0.xyww() * q).xyww(stq1);
-						stq1 = (stq1.xyww() * q).xyww(stq1);
-					}
-
-					tmin = tmin.min(stq0.min(stq1));
-					tmax = tmax.max(stq0.max(stq1));
-				}
-				else
-				{
-					GSVector4i uv0(v[index[i + 0]].m[1]);
-					GSVector4i uv1(v[index[i + 1]].m[1]);
-
-					GSVector4 st0 = GSVector4(uv0.uph16()).xyxy();
-					GSVector4 st1 = GSVector4(uv1.uph16()).xyxy();
-
-					tmin = tmin.min(st0.min(st1));
-					tmax = tmax.max(st0.max(st1));
-				}
-			}
-
-			GSVector4i xyzf0(v[index[i + 0]].m[1]);
-			GSVector4i xyzf1(v[index[i + 1]].m[1]);
-
-			GSVector4i xy0 = xyzf0.upl16();
-			GSVector4i z0 = xyzf0.yyyy();
-			GSVector4i xy1 = xyzf1.upl16();
-			GSVector4i z1 = xyzf1.yyyy();
-
-			GSVector4i p0 = xy0.blend16<0xf0>(z0.uph32(xyzf1));
-			GSVector4i p1 = xy1.blend16<0xf0>(z1.uph32(xyzf1));
-
-			pmin = pmin.min_u32(p0.min_u32(p1));
-			pmax = pmax.max_u32(p0.max_u32(p1));
+			processVertices(v[index[i + 0]], v[index[i + 1]], false);
 		}
 	}
-
-	// FIXME/WARNING. A division by 2 is done on the depth. I suspect to avoid
-	// negative value. However it means that we lost the lsb bit. m_eq.z could
-	// be true if depth isn't constant but close enough. It also imply that
-	// pmin.z & 1 == 0 and pax.z & 1 == 0
-
-	pmin = pmin.blend16<0x30>(pmin.srl32(1));
-	pmax = pmax.blend16<0x30>(pmax.srl32(1));
+	else if (iip || n == 1) // iip means final and non-final vertexes are treated the same
+	{
+		int i = 0;
+		for (; i < (count - 1); i += 2) // 2x loop unroll
+		{
+			processVertices(v[index[i + 0]], v[index[i + 1]], true);
+		}
+		if (count & 1)
+		{
+			// Compiler optimizations go!
+			// (And if they don't, it's only one vertex out of many)
+			processVertices(v[index[i]], v[index[i]], true);
+		}
+	}
+	else if (n == 3)
+	{
+		int i = 0;
+		for (; i < (count - 3); i += 6)
+		{
+			processVertices(v[index[i + 0]], v[index[i + 3]], false);
+			processVertices(v[index[i + 1]], v[index[i + 4]], false);
+			processVertices(v[index[i + 2]], v[index[i + 5]], true);
+		}
+		if (count & 1)
+		{
+			processVertices(v[index[i + 0]], v[index[i + 1]], false);
+			// Compiler optimizations go!
+			// (And if they don't, it's only one vertex out of many)
+			processVertices(v[index[i + 2]], v[index[i + 2]], true);
+		}
+	}
+	else
+	{
+		pxAssertRel(0, "Bad n value");
+	}

 	GSVector4 o(context->XYOFFSET);
 	GSVector4 s(1.0f / 16, 1.0f / 16, 2.0f, 1.0f);
@@ -469,6 +303,10 @@ void GSVertexTrace::FindMinMax(const void* vertex, const uint32* index, int coun
 	m_min.p = (GSVector4(pmin) - o) * s;
 	m_max.p = (GSVector4(pmax) - o) * s;

+	// Fix signed int conversion
+	m_min.p = m_min.p.insert32<0, 2>(GSVector4::load((float)(uint32)pmin.extract32<2>()));
+	m_max.p = m_max.p.insert32<0, 2>(GSVector4::load((float)(uint32)pmax.extract32<2>()));
+
 	if (tme)
 	{
 		if (fst)
@@ -491,8 +329,8 @@ void GSVertexTrace::FindMinMax(const void* vertex, const uint32* index, int coun

 	if (color)
 	{
-		m_min.c = cmin.zzzz().u8to32();
-		m_max.c = cmax.zzzz().u8to32();
+		m_min.c = cmin.u8to32();
+		m_max.c = cmax.u8to32();
 	}
 	else
 	{
--- a/pcsx2/GS/Renderers/Common/GSVertexTrace.h
+++ b/pcsx2/GS/Renderers/Common/GSVertexTrace.h
@@ -47,9 +47,9 @@ protected:

 	typedef void (GSVertexTrace::*FindMinMaxPtr)(const void* vertex, const uint32* index, int count);

-	FindMinMaxPtr m_fmm[2][2][2][2][2][4];
+	FindMinMaxPtr m_fmm[2][2][2][2][4];

-	template <GS_PRIM_CLASS primclass, uint32 iip, uint32 tme, uint32 fst, uint32 color, uint32 accurate_stq>
+	template <GS_PRIM_CLASS primclass, uint32 iip, uint32 tme, uint32 fst, uint32 color>
 	void FindMinMax(const void* vertex, const uint32* index, int count);

 public:
--- a/pcsx2/GS/Renderers/HW/GSTextureCache.cpp
+++ b/pcsx2/GS/Renderers/HW/GSTextureCache.cpp
@@ -1096,7 +1096,8 @@ void GSTextureCache::InvalidateLocalMem(GSOffset* off, const GSVector4i& r)
 				// the game can then draw using 8H format
 				// in the case of silent hill blit 8H -> 8P
 				// this will matter later when the data ends up in GS memory in the wrong format
-				if (t->m_32_bits_fmt)
+				// Be careful to avoid 24 bit textures which are technically 32bit, as you could lose alpha (8H) data.
+				if (t->m_32_bits_fmt && t->m_TEX0.PSM > PSM_PSMCT24)
 					t->m_TEX0.PSM = PSM_PSMCT32;

 				if (GSTextureCache::m_disable_partial_invalidation)
--- a/pcsx2/GS/Renderers/OpenGL/GSRendererOGL.cpp
+++ b/pcsx2/GS/Renderers/OpenGL/GSRendererOGL.cpp
@@ -82,6 +82,9 @@ void GSRendererOGL::SetupIA(const float& sx, const float& sy)
 			//
 			// Note2: Due to MultiThreaded driver, Nvidia suffers less of the previous issue. Still it isn't free
 			// Shadow Heart is 90 fps (gs) vs 113 fps (no gs)
+			//
+			// Note3: Some GPUs (Happens on GT 750m, not on Intel 5200) don't properly divide by large floats (e.g. FLT_MAX/FLT_MAX == 0)
+			// Lines2Sprites predivides by Q, avoiding this issue, so always use it if m_vt.m_accurate_stq

 			// If the draw calls contains few primitives. Geometry Shader gain with be rather small versus
 			// the extra validation cost of the extra stage.
Author	SHA1	Message	Date
tellowkrinkle	60791e4c2b	GS-OGL: Add comment on m_accurate_stq usage	2021-11-03 01:21:11 +00:00
TellowKrinkle	bd8fcc8f81	GS: Remove inaccurate stq calculations from GSVertexTrace. They were the same speed or slower than full div on IvyBridge+ and Bulldozer+.	2021-11-03 01:21:11 +00:00
TellowKrinkle	5d33165fa5	GS: Reduce repeated code in GSVertexTrace::FindMinMax. Why repeat things when you can make the compiler repeat them for you?	2021-11-03 01:21:11 +00:00
TellowKrinkle	2e1d147135	GS: Faster accurate_stq calculations	2021-11-03 01:21:11 +00:00
refractionpcsx2	ee8d24a260	GS: Don't propagate 24bit textures on download	2021-11-03 00:54:29 +00:00
RedDevilus	81ac26c33c	GameDB: Clean-up V2. Some minor typo clean-up and fix Japanese serials.	2021-11-03 00:54:06 +00:00